ADD unsqueeze

fix some bugs

fix some issues in onnx_stub

add Constant operator

Fix ONNX data parsing and the run_onnx API, and add some tests for models.
This commit is contained in:
wangw 2023-04-17 15:07:21 +08:00
parent f2be122efc
commit f021afe199
24 changed files with 926 additions and 43 deletions
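
For context, a minimal sketch of how the reworked run_onnx API is meant to be driven after this commit (illustrative only; the "resnet18.onnx" file name is an assumption, and it mirrors the test added below):

import numpy as np
import onnx
from onnx import TensorProto
from onnx.helper import make_tensor
from pyinfinitensor.onnx import run_onnx

model = onnx.load("resnet18.onnx")  # any exported model
data = np.random.rand(1, 3, 224, 224).astype(np.float32)
name = model.graph.input[0].name  # fill_inputs matches inputs by name
inp = make_tensor(name, TensorProto.FLOAT, data.shape, data.flatten())
outputs = run_onnx(model, [inp])  # run_onnx now takes TensorProto inputs, not a runtime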

View File

@ -80,6 +80,8 @@ class GraphHandlerObj {
const optional<vector<int>> &steps);
Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
const optional<vector<int>> &axes);
Tensor constant(Tensor output);
Tensor unsqueeze(Tensor input, const vector<int> &axes, Tensor output);
//------ modifiers

View File

@ -25,6 +25,7 @@ enum class OpType {
Mul,
Div,
Pow,
Sqrt,
Gather,
ReduceMean,
Reshape,
@ -39,6 +40,8 @@ enum class OpType {
Tanh,
Abs,
Resize,
Constant,
Unsqueeze,
//
MemBound = 300,
};

View File

@ -104,7 +104,7 @@ class TensorObj : public TensorBaseObj {
private:
void printDataFloat() const;
void printDataUint32_t() const;
template <typename T> void printDataInteger() const;
template <typename T>
bool equalDataImpl(const T *a, const T *b, size_t size) const {

View File

@ -0,0 +1,29 @@
#pragma once
#include "core/operator.h"
namespace infini {
class ConstantObj : public OperatorObj {
public:
ConstantObj(GraphObj *graph, Tensor output)
: OperatorObj(OpType::Constant, {}, {output}) {
IT_ASSERT(output);
IT_ASSERT(checkValid(graph));
}
OP_CLONE(ConstantObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override {
return {{outputs[0]->getDims()}};
}
std::string toString() const override;
int numInputs() const override { return 0; }
int numOutputs() const override { return 1; }
void makeConstant() {}
private:
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini

View File

@ -46,4 +46,5 @@ DEFINE_ELEMENT_WISE_OBJ(Sub, OpType::Sub)
DEFINE_ELEMENT_WISE_OBJ(Mul, OpType::Mul)
DEFINE_ELEMENT_WISE_OBJ(Div, OpType::Div)
DEFINE_ELEMENT_WISE_OBJ(Pow, OpType::Pow)
DEFINE_ELEMENT_WISE_OBJ(Sqrt, OpType::Sqrt)
}; // namespace infini

View File

@ -0,0 +1,23 @@
#include "core/operator.h"
namespace infini {
class UnsqueezeObj : public OperatorObj {
set<int> axis;
public:
UnsqueezeObj(GraphObj *graph, Tensor input, const vector<int> &axis,
Tensor output);
OP_CLONE(UnsqueezeObj);
optional<vector<Shape>> inferShape(const TensorVec &inputs) const override;
std::string toString() const override;
int numInputs() const override { return 1; }
int numOutputs() const override { return 1; }
private:
bool parseAxis(const std::vector<int> &index, std::set<int> &axis) const;
vector<int> getWorkloadVector() const override;
vector<int> getOpAttrVector() const override;
};
} // namespace infini
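
The shape rule that parseAxis and inferShape implement, restated as a small Python sketch (illustrative only): negative axes are normalized against the output rank, duplicates are rejected, and a 1 is inserted at each normalized position.

def unsqueeze_shape(dims, axes):
    out_rank = len(dims) + len(axes)
    norm = sorted(a + out_rank if a < 0 else a for a in axes)
    assert len(set(norm)) == len(axes) and all(0 <= a < out_rank for a in norm)
    out = list(dims)
    for a in norm:
        out.insert(a, 1)
    return out

# Matches the shape-inference tests added below:
assert unsqueeze_shape([1, 3, 2, 15], [1, 6, 0]) == [1, 1, 1, 3, 2, 15, 1]
assert unsqueeze_shape([1, 3, 2, 15], [-6, -1, -7]) == [1, 1, 1, 3, 2, 15, 1]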

View File

@ -16,6 +16,8 @@ class DataGenerator {
fill(reinterpret_cast<uint32_t *>(data), size);
else if (dataType == DataType::Float32)
fill(reinterpret_cast<float *>(data), size);
else if (dataType == DataType::Int64)
fill(reinterpret_cast<long long *>(data), size);
else
IT_TODO_HALT();
}

View File

@ -1,4 +1,5 @@
import backend
import numpy as np
from onnx import (
ModelProto,
TensorProto,
@ -24,13 +25,16 @@ from onnx.checker import (
from onnx.shape_inference import infer_shapes
from typing import Dict, List, Any, Tuple, Sequence, Union, Optional
from functools import reduce
import struct
# TODO: do we need runtime here?
runtime = backend.runtime()
class OnnxStub:
inputs: Dict[str, backend.Tensor] = {}
outputs: Dict[str, backend.Tensor] = {}
initializer: Dict[int, TensorProto] = {}
inputs: Dict[str, backend.Tensor] = {}  # only the graph's original inputs, not every layer's inputs
outputs: Dict[str, backend.Tensor] = {}  # only the graph's final outputs
initializer: Dict[int, TensorProto] = {}  # the weights and parameters of every layer
tensors: Dict[int, TensorProto] = {}  # all tensors
handler: backend.GraphHandler
def __init__(self, model: ModelProto, runtime):
@ -39,13 +43,13 @@ class OnnxStub:
tensors: Dict[str, backend.Tensor] = dict()
data: Dict[str, TensorProto] = dict()
for input in model.graph.input:
dims = _take_shape_dim(input.type.tensor_type.shape)
tensors[input.name] = self.handler.tensor(
dims, input.type.tensor_type.elem_type
)
for output in model.graph.output:
dims = _take_shape_dim(output.type.tensor_type.shape)
tensors[output.name] = self.handler.tensor(
@ -54,6 +58,10 @@ class OnnxStub:
for initializer in model.graph.initializer:
data[initializer.name] = initializer
# weights and params are not in "model.graph.input" when .onnx is converted from .pth
tensors[initializer.name] = self.handler.tensor(
initializer.dims, initializer.data_type
)
for node in model.graph.node:
if node.op_type == "Conv":
@ -331,9 +339,30 @@ class OnnxStub:
_parse_data(data[node.input[1]]),
_parse_data(data[node.input[3]]) if len(node.input) > 3 else None,
)
elif node.op_type == "Constant":
attr = next((attr for attr in node.attribute if attr.name == "value"), None)
if attr == None:
raise Exception("no value in constant nodeproto")
if attr.type == 4: #TENSOR
value_tensor = self.handler.tensor(
[self.get_data_len(attr.t)], attr.t.data_type
)
value_tensor.data_malloc()
self.fill_value(value_tensor, attr.t)
tensors[node.output[0]] = self.handler.constant(
value_tensor
)
else :
raise Exception("TODO")
elif node.op_type == "Unsqueeze":
tensors[node.output[0]] = self.handler.unsqueeze(
tensors[node.input[0]],
_parse_data(data[node.input[1]]) if len(node.input) > 1 else next((attr.ints for attr in node.attribute if attr.name == "axes")),
tensors.get(node.output[0]))
else:
print(node)
raise Exception('Unsupported operator "{}"'.format(node.op_type))
self.handler.data_malloc()
for name, obj in tensors.items():
@ -341,19 +370,84 @@ class OnnxStub:
if tensor is None:
if any(input.name == name for input in model.graph.input):
self.inputs[name] = obj
else:
if tensor is not None:
self.initializer[obj.fuid()] = tensor
if tensor.data_type == TensorProto.INT32:
obj.copyin_int32([int(i) for i in tensor.int32_data])
elif tensor.data_type == TensorProto.INT64:
obj.copyin_int64([int(i) for i in tensor.int64_data])
elif tensor.data_type == TensorProto.FLOAT:
obj.copyin_float([int(i) for i in tensor.float_data])
else:
assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
self.fill_value(obj, tensor)
for output in model.graph.output:
self.outputs[output.name] = tensors[output.name]
self.outputs[output.name] = tensors[output.name]
self.tensors = tensors
def get_data_len(self, tensor) -> int:
length = 0
ele_size = 1
if tensor.data_type == TensorProto.INT32:
length = len(tensor.int32_data)
ele_size = 4
elif tensor.data_type == TensorProto.INT64:
length = len(tensor.int64_data)
ele_size = 8
elif tensor.data_type == TensorProto.FLOAT:
length = len(tensor.float_data)
ele_size = 4
else:
assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
if length == 0:
length = len(tensor.raw_data) // ele_size
return length
def fill_inputs(self, inputs: List[TensorProto]):
for tensor in inputs:
obj = self.inputs[tensor.name]
self.fill_value(obj, tensor)
def fill_value(self, obj, tensor):
# Data may be stored in raw_data when the .onnx is converted from .pth
# or when parsing a Constant operator
if tensor.data_type == TensorProto.INT32:
int32_data = tensor.int32_data
if len(int32_data) == 0:
int32_data = self.parse_data(tensor.raw_data, tensor.data_type)
obj.copyin_int32([int(i) for i in int32_data])
elif tensor.data_type == TensorProto.INT64:
int64_data = tensor.int64_data
if len(int64_data) == 0:
int64_data = self.parse_data(tensor.raw_data, tensor.data_type)
# TODO: convert int64 to int32, because the backend does not support int64
obj.copyin_int32([np.int32(i) for i in int64_data])
elif tensor.data_type == TensorProto.FLOAT:
float_data = tensor.float_data
if len(float_data) == 0:
float_data = self.parse_data(tensor.raw_data, tensor.data_type)
obj.copyin_float([i for i in float_data])
else:
assert False, "Unsupported Tensor Type: {}".format(tensor.data_type)
def parse_data(self, raw_data, dtype):
if dtype == TensorProto.FLOAT:
fmt = 'f'
elem_size = 4
elif dtype == TensorProto.FLOAT16:
fmt = 'e'
elem_size = 2
elif dtype == TensorProto.INT32:
fmt = 'i'
elem_size = 4
elif dtype == TensorProto.INT64:
fmt = 'q'
elem_size = 8
else:
raise ValueError('Unsupported data type')
num_elems = len(raw_data) // elem_size
data = []
for i in range(num_elems):
start_idx = i * elem_size
end_idx = start_idx + elem_size
elem_bytes = raw_data[start_idx:end_idx]
elem_value = struct.unpack("<" + fmt, elem_bytes)[0]  # ONNX raw_data is little-endian
data.append(elem_value)
return data
def to_onnx(self, name: str) -> ModelProto:
class Context:
@ -576,9 +670,11 @@ def from_onnx(model: ModelProto, runtime):
stub = OnnxStub(model, runtime)
return stub.inputs, stub.outputs, stub.handler
def run_onnx(model: ModelProto, runtime):
def run_onnx(model: ModelProto, inputs: List[TensorProto]):
stub = OnnxStub(model, runtime)
stub.fill_inputs(inputs)
stub.run()
return stub.outputs
def _parse_attribute(node: NodeProto, attrs: Dict[str, Any] = dict()) -> Dict[str, Any]:
for attr in node.attribute:

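A quick check of the raw_data fallback that fill_value and parse_data now handle (sketch only): it builds an INT64 TensorProto the way torch.onnx.export typically stores axes, then decodes it the same way parse_data does.

import struct
from onnx import TensorProto
from onnx.helper import make_tensor

# INT64 axes stored as little-endian raw bytes rather than in int64_data
t = make_tensor("axes", TensorProto.INT64, [2], struct.pack("<2q", 1, 0), raw=True)
assert len(t.int64_data) == 0 and len(t.raw_data) == 16

# parse_data-style decode: 8 bytes per element, format 'q'
vals = [struct.unpack("<q", t.raw_data[i * 8:(i + 1) * 8])[0]
        for i in range(len(t.raw_data) // 8)]
assert vals == [1, 0]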
View File

@ -0,0 +1,44 @@
import torch
import torchvision
import os
from models.drn import drn_a_50, drn_c_26
def export_model(modelfn, file_name):
files = os.listdir()
file = next(
(name for name in files if name.endswith(file_name)), None
)
if file is None:
# Load the pretrained model
model = modelfn(pretrained=True)
# Set the model to evaluation mode
model.eval()
# Create a sample input tensor
input_tensor = torch.randn(1, 3, 224, 224)
# Export the model to ONNX format
torch.onnx.export(model, input_tensor, file_name)
from transformers import BertTokenizer, BertModel
def export_bert():
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
#output = model(**encoded_input)
#inputs = torch.randn(1, 128, dtype=torch.float32)
input_names = ["input_ids", "attention_mask", "token_type_ids"]
torch.onnx.export(model, tuple(encoded_input.values()),
f="bert.onnx",
input_names=input_names,
output_names=["output"])
# https://huggingface.co/blog/convert-transformers-to-onnx#4-how-can-i-convert-a-transformers-model-bert-to-onnx
#export_model(torchvision.models.resnet18, "resnet18.onnx")
#export_model(torchvision.models.resnet50, "resnet50.onnx")
#export_model(drn_c_26, "drn_c_26.onnx")
export_model(torchvision.models.inception_v3, "inception_v3.onnx")

View File

@ -0,0 +1,416 @@
# source from https://github.com/fyu/drn/blob/master/drn.py
import pdb
import torch.nn as nn
import math
import torch.utils.model_zoo as model_zoo
BatchNorm = nn.BatchNorm2d
# __all__ = ['DRN', 'drn26', 'drn42', 'drn58']
webroot = 'http://dl.yf.io/drn/'
model_urls = {
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'drn-c-26': webroot + 'drn_c_26-ddedf421.pth',
'drn-c-42': webroot + 'drn_c_42-9d336e8c.pth',
'drn-c-58': webroot + 'drn_c_58-0a53a92c.pth',
'drn-d-22': webroot + 'drn_d_22-4bd2f8ea.pth',
'drn-d-38': webroot + 'drn_d_38-eebb45f0.pth',
'drn-d-54': webroot + 'drn_d_54-0e0534ff.pth',
'drn-d-105': webroot + 'drn_d_105-12b40979.pth'
}
def conv3x3(in_planes, out_planes, stride=1, padding=1, dilation=1):
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=padding, bias=False, dilation=dilation)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None,
dilation=(1, 1), residual=True):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride,
padding=dilation[0], dilation=dilation[0])
self.bn1 = BatchNorm(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes,
padding=dilation[1], dilation=dilation[1])
self.bn2 = BatchNorm(planes)
self.downsample = downsample
self.stride = stride
self.residual = residual
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
if self.residual:
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None,
dilation=(1, 1), residual=True):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = BatchNorm(planes)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
padding=dilation[1], bias=False,
dilation=dilation[1])
self.bn2 = BatchNorm(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = BatchNorm(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class DRN(nn.Module):
def __init__(self, block, layers, num_classes=1000,
channels=(16, 32, 64, 128, 256, 512, 512, 512),
out_map=False, out_middle=False, pool_size=28, arch='D'):
super(DRN, self).__init__()
self.inplanes = channels[0]
self.out_map = out_map
self.out_dim = channels[-1]
self.out_middle = out_middle
self.arch = arch
if arch == 'C':
self.conv1 = nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
padding=3, bias=False)
self.bn1 = BatchNorm(channels[0])
self.relu = nn.ReLU(inplace=True)
self.layer1 = self._make_layer(
BasicBlock, channels[0], layers[0], stride=1)
self.layer2 = self._make_layer(
BasicBlock, channels[1], layers[1], stride=2)
elif arch == 'D':
self.layer0 = nn.Sequential(
nn.Conv2d(3, channels[0], kernel_size=7, stride=1, padding=3,
bias=False),
BatchNorm(channels[0]),
nn.ReLU(inplace=True)
)
self.layer1 = self._make_conv_layers(
channels[0], layers[0], stride=1)
self.layer2 = self._make_conv_layers(
channels[1], layers[1], stride=2)
self.layer3 = self._make_layer(block, channels[2], layers[2], stride=2)
self.layer4 = self._make_layer(block, channels[3], layers[3], stride=2)
self.layer5 = self._make_layer(block, channels[4], layers[4],
dilation=2, new_level=False)
self.layer6 = None if layers[5] == 0 else \
self._make_layer(block, channels[5], layers[5], dilation=4,
new_level=False)
if arch == 'C':
self.layer7 = None if layers[6] == 0 else \
self._make_layer(BasicBlock, channels[6], layers[6], dilation=2,
new_level=False, residual=False)
self.layer8 = None if layers[7] == 0 else \
self._make_layer(BasicBlock, channels[7], layers[7], dilation=1,
new_level=False, residual=False)
elif arch == 'D':
self.layer7 = None if layers[6] == 0 else \
self._make_conv_layers(channels[6], layers[6], dilation=2)
self.layer8 = None if layers[7] == 0 else \
self._make_conv_layers(channels[7], layers[7], dilation=1)
if num_classes > 0:
self.avgpool = nn.AvgPool2d(pool_size)
self.fc = nn.Conv2d(self.out_dim, num_classes, kernel_size=1,
stride=1, padding=0, bias=True)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, BatchNorm):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1, dilation=1,
new_level=True, residual=True):
assert dilation == 1 or dilation % 2 == 0
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
BatchNorm(planes * block.expansion),
)
layers = list()
layers.append(block(
self.inplanes, planes, stride, downsample,
dilation=(1, 1) if dilation == 1 else (
dilation // 2 if new_level else dilation, dilation),
residual=residual))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, residual=residual,
dilation=(dilation, dilation)))
return nn.Sequential(*layers)
def _make_conv_layers(self, channels, convs, stride=1, dilation=1):
modules = []
for i in range(convs):
modules.extend([
nn.Conv2d(self.inplanes, channels, kernel_size=3,
stride=stride if i == 0 else 1,
padding=dilation, bias=False, dilation=dilation),
BatchNorm(channels),
nn.ReLU(inplace=True)])
self.inplanes = channels
return nn.Sequential(*modules)
def forward(self, x):
y = list()
if self.arch == 'C':
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
elif self.arch == 'D':
x = self.layer0(x)
x = self.layer1(x)
y.append(x)
x = self.layer2(x)
y.append(x)
x = self.layer3(x)
y.append(x)
x = self.layer4(x)
y.append(x)
x = self.layer5(x)
y.append(x)
if self.layer6 is not None:
x = self.layer6(x)
y.append(x)
if self.layer7 is not None:
x = self.layer7(x)
y.append(x)
if self.layer8 is not None:
x = self.layer8(x)
y.append(x)
if self.out_map:
x = self.fc(x)
else:
x = self.avgpool(x)
x = self.fc(x)
x = x.view(x.size(0), -1)
if self.out_middle:
return x, y
else:
return x
class DRN_A(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 64
super(DRN_A, self).__init__()
self.out_dim = 512 * block.expansion
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=1,
dilation=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=1,
dilation=4)
self.avgpool = nn.AvgPool2d(28, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, BatchNorm):
m.weight.data.fill_(1)
m.bias.data.zero_()
# for m in self.modules():
# if isinstance(m, nn.Conv2d):
# nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
# elif isinstance(m, nn.BatchNorm2d):
# nn.init.constant_(m.weight, 1)
# nn.init.constant_(m.bias, 0)
def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion,
kernel_size=1, stride=stride, bias=False),
nn.BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes,
dilation=(dilation, dilation)))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def drn_a_50(pretrained=False, **kwargs):
model = DRN_A(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
return model
def drn_c_26(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='C', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-c-26']))
return model
def drn_c_42(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-c-42']))
return model
def drn_c_58(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='C', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-c-58']))
return model
def drn_d_22(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-22']))
return model
def drn_d_24(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 2, 2, 2, 2, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-24']))
return model
def drn_d_38(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-38']))
return model
def drn_d_40(pretrained=False, **kwargs):
model = DRN(BasicBlock, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-40']))
return model
def drn_d_54(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-54']))
return model
def drn_d_56(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 6, 3, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-56']))
return model
def drn_d_105(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 1, 1], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-105']))
return model
def drn_d_107(pretrained=False, **kwargs):
model = DRN(Bottleneck, [1, 1, 3, 4, 23, 3, 2, 2], arch='D', **kwargs)
if pretrained:
model.load_state_dict(model_zoo.load_url(model_urls['drn-d-107']))
return model

View File

@ -0,0 +1,82 @@
import os, onnx, unittest
from typing import Dict
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from onnx import TensorProto
from onnx.helper import (
make_model,
make_node,
make_tensor,
make_graph,
make_tensor_value_info,
)
from onnx.checker import check_model
from pyinfinitensor.onnx import from_onnx, backend, run_onnx
import onnxruntime
def pre_process(img):
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
transfn = transforms.Compose([transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
])
#img = np.transpose(img,(1,2,0))
return transfn(img)
def model_run(onnx_file_name, input_data):
dir_path = "./"
files = os.listdir(dir_path)
model_file = next(
(name for name in files if name.endswith(onnx_file_name)), None
)
if model_file is not None:
model_path = os.path.join(dir_path, model_file)
print(
"model: {file}({size:.2f} MiB)".format(
file=model_path, size=os.path.getsize(model_path) / 1024 / 1024
)
)
model = onnx.load(model_path)
check_model(model)
session = onnxruntime.InferenceSession(model.SerializeToString(), None)
input_name = session.get_inputs()[0].name
input_tensor = make_tensor(input_name, TensorProto.FLOAT, input_data.shape, input_data.flatten())
run_onnx(model, [input_tensor])
def get_img_data():
dir_path = "./"
files = os.listdir(dir_path)
img_file = next(
(name for name in files if name.endswith(".jpg")), None
)
if img_file is not None:
img_path = os.path.join(dir_path, img_file)
try:
img = Image.open(img_path)
input_data = pre_process(img)
img.close()
except FileNotFoundError:
print(f"Image not found: {img_file}")
else:
return input_data.numpy().astype(np.float32)
class TestStringMethods(unittest.TestCase):
def test_model_run(self):
#input_data = get_img_data()
input_data = np.random.rand(1,3,224,224).astype(np.float32)
model_run("resnet18.onnx", input_data)
model_run("resnet50.onnx", input_data)
model_run("drn_c_26.onnx", input_data)
if __name__ == "__main__":
unittest.main()

View File

@ -1,4 +1,7 @@
import os, onnx, unittest
from typing import Dict
import numpy as np
import onnxruntime
from onnx import TensorProto
from onnx.helper import (
make_model,
@ -16,20 +19,7 @@ def make_and_import_model(graph: onnx.GraphProto):
check_model(model)
from_onnx(model, runtime)
class TestStringMethods(unittest.TestCase):
#def test_run(self):
# model_file = next(
# (name for name in os.listdir() if name.endswith(".onnx")), None
# )
# if model_file != None:
# print(
# "model: {file}({size:.2f} MiB)".format(
# file=model_file, size=os.path.getsize(model_file) / 1024 / 1024
# )
# )
# run_onnx(onnx.load(model_file), runtime)
def test_load(self):
model_file = next(
(name for name in os.listdir() if name.endswith(".onnx")), None
@ -317,5 +307,6 @@ class TestStringMethods(unittest.TestCase):
handler.reshape(x, y, [3, 2, 1])
if __name__ == "__main__":
unittest.main()

View File

@ -1,6 +1,7 @@
#include "core/graph_handler.h"
#include "operators/batch_norm.h"
#include "operators/concat.h"
#include "operators/constant.h"
#include "operators/conv.h"
#include "operators/element_wise.h"
#include "operators/gather.h"
@ -12,6 +13,7 @@
#include "operators/slice.h"
#include "operators/softmax.h"
#include "operators/unary.h"
#include "operators/unsqueeze.h"
namespace infini {
@ -227,6 +229,24 @@ Tensor GraphHandlerObj::pad(Tensor input, Tensor output,
}
}
Tensor GraphHandlerObj::constant(Tensor output) {
if (output) {
g->addOpWithOutputs<ConstantObj>(output);
return output;
} else
return g->addOp<ConstantObj>(output)->getOutput();
}
Tensor GraphHandlerObj::unsqueeze(Tensor input, const vector<int> &axes,
Tensor output) {
if (output) {
g->addOpWithOutputs<UnsqueezeObj>(std::move(input), axes, output);
return output;
} else
return g->addOp<UnsqueezeObj>(std::move(input), axes, output)
->getOutput();
}
static DataType dtype_repr_convert(int dtype) {
switch ((OnnxDType)dtype) {
case OnnxDType::FLOAT:
@ -243,8 +263,11 @@ static DataType dtype_repr_convert(int dtype) {
return DataType::Int16;
case OnnxDType::INT32:
return DataType::Int32;
// TODO: convert Int64 to Int32, because some backends (e.g. dnnl) do not
// support int64
case OnnxDType::INT64:
return DataType::Int64;
// return DataType::Int64;
return DataType::Int32;
default:
IT_ASSERT(false, "Unsupported data type");
}
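
Because OnnxDType::INT64 now deliberately maps to Int32, 64-bit initializers are narrowed when copied in. A sketch of the assumption this relies on (it uses the same np.int32 narrowing fill_value performs; values must fit in 32 bits):

import numpy as np

int64_vals = [1, 0, 2**31 - 1]
int32_vals = [int(np.int32(v)) for v in int64_vals]  # same narrowing fill_value performs
assert int32_vals == int64_vals  # holds only while every value fits in int32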

View File

@ -62,7 +62,11 @@ void TensorObj::printData() const {
if (dtype == DataType::Float32)
printDataFloat();
else if (dtype == DataType::UInt32)
printDataUint32_t();
printDataInteger<unsigned int>();
else if (dtype == DataType::Int32)
printDataInteger<int>();
else if (dtype == DataType::Int64)
printDataInteger<long long>();
else
IT_TODO_HALT();
}
@ -94,12 +98,12 @@ void TensorObj::printDataFloat() const {
}
}
void TensorObj::printDataUint32_t() const {
template <typename T> void TensorObj::printDataInteger() const {
IT_ASSERT(data != nullptr);
std::cout << "Tensor: " << guid << std::endl;
auto numDims = shape.size();
auto dimSzVec = std::vector<int>(numDims, 1);
auto ptr = data->getPtr<VType *>();
auto ptr = data->getPtr<T *>();
dimSzVec[numDims - 1] = shape[numDims - 1];
for (int i = numDims - 1; i != 0; --i)
dimSzVec[i - 1] = dimSzVec[i] * shape[i - 1];
@ -133,9 +137,15 @@ bool TensorObj::equalData(const Tensor &rhs) const {
if (getDType() == DataType::UInt32)
return equalDataImpl(getRawDataPtr<uint32_t *>(),
rhs->getRawDataPtr<uint32_t *>(), size());
if (getDType() == DataType::Int32)
return equalDataImpl(getRawDataPtr<int32_t *>(),
rhs->getRawDataPtr<int32_t *>(), size());
else if (getDType() == DataType::Float32)
return equalDataImpl(getRawDataPtr<float *>(),
rhs->getRawDataPtr<float *>(), size());
else if (getDType() == DataType::Int64)
return equalDataImpl(getRawDataPtr<long long *>(),
rhs->getRawDataPtr<long long *>(), size());
else
IT_TODO_HALT();
}

View File

@ -195,6 +195,7 @@ void init_graph_builder(py::module &m) {
py::class_<TensorObj, std::shared_ptr<TensorObj>>(m, "Tensor")
.def("fuid", &TensorObj::getFuid, policy::automatic)
.def("shape", &TensorObj::getDims, policy::move)
.def("data_malloc", &TensorObj::dataMalloc, policy::move)
.def("copyin_float", &TensorObj::copyin<float>, policy::move)
.def("copyin_int32", &TensorObj::copyin<int32_t>, policy::move)
.def("copyin_int64", &TensorObj::copyin<int64_t>, policy::move)
@ -236,6 +237,8 @@ void init_graph_builder(py::module &m) {
.def("gather", &Handler::gather, policy::move)
.def("reduce_mean", &Handler::reduceMean, policy::move)
.def("slice", &Handler::slice, policy::move)
.def("constant", &Handler::constant, policy::move)
.def("unsqueeze", &Handler::unsqueeze, policy::move)
.def("pad", &Handler::pad, policy::move)
.def("topo_sort", &Handler::topo_sort, policy::automatic)
.def("operators", &Handler::operators, policy::move)

View File

@ -0,0 +1,15 @@
#include "operators/constant.h"
#include "intelcpu/mkl_kernel_without_config.h"
namespace infini {
class ConstantKernel : public MklKernelWithoutConfig {
void compute(const Operator &_op,
const RuntimeObj *context) const override {
// no-op: a Constant's data is filled in when the graph is parsed
}
};
REGISTER_KERNEL(Device::INTELCPU, OpType::Constant, DataType::Int32,
ConstantKernel, "Constant_Mkl_Int32");
REGISTER_KERNEL(Device::INTELCPU, OpType::Constant, DataType::Float32,
ConstantKernel, "Constant_Mkl_Float32");
} // namespace infini

View File

@ -38,12 +38,12 @@ class MklGather : public MklKernelWithoutConfig {
sycl::queue q(sycl::cpu_selector{});
auto inDevice = sycl::malloc_device<float>(iSize, q);
auto indexDevice = sycl::malloc_device<uint32_t>(idxSize, q);
auto indexDevice = sycl::malloc_device<int32_t>(idxSize, q);
auto outDevice = sycl::malloc_device<float>(oSize, q);
q.memcpy(inDevice, in->getRawDataPtr<float *>(), iSize * sizeof(float));
q.memcpy(indexDevice, index->getRawDataPtr<uint32_t *>(),
idxSize * sizeof(uint32_t));
q.memcpy(indexDevice, index->getRawDataPtr<int32_t *>(),
idxSize * sizeof(int32_t));
q.wait();
q.parallel_for(sycl::range<1>(oSize), [=](sycl::id<1> index) {

View File

@ -53,9 +53,9 @@ template <typename T> class MklDpcppMatmul : public CpuKernelWithoutConfig {
// create execution queue and buffers of matrix data
cl::sycl::queue main_queue(sycl::cpu_selector{}, exception_handler);
cl::sycl::buffer<float, 1> A_buffer(A, op->getInputs(0)->size());
cl::sycl::buffer<float, 1> B_buffer(B, op->getInputs(1)->size());
cl::sycl::buffer<float, 1> C_buffer(C, op->getOutput(0)->size());
cl::sycl::buffer<T, 1> A_buffer(A, op->getInputs(0)->size());
cl::sycl::buffer<T, 1> B_buffer(B, op->getInputs(1)->size());
cl::sycl::buffer<T, 1> C_buffer(C, op->getOutput(0)->size());
// add oneapi::mkl::blas::gemm to execution queue
try {

View File

@ -1,7 +1,7 @@
#include "operators/reshape.h"
#include "intelcpu/mkl_kernel_without_config.h"
#include "intelcpu/mkl_runtime.h"
#include "operators/unsqueeze.h"
namespace infini {
class MklReshape : public MklKernelWithoutConfig {
void compute(const Operator &op,
@ -47,4 +47,6 @@ REGISTER_KERNEL(Device::INTELCPU, OpType::Identity, DataType::Float32,
MklReshape, "Identify_Mkl_Float32");
REGISTER_KERNEL(Device::INTELCPU, OpType::Flatten, DataType::Float32,
MklReshape, "Flatten_Mkl_Float32");
REGISTER_KERNEL(Device::INTELCPU, OpType::Unsqueeze, DataType::Float32,
MklReshape, "Unsqueeze_Mkl_Float32");
}; // namespace infini

src/operators/constant.cc Normal file
View File

@ -0,0 +1,21 @@
#include "operators/constant.h"
namespace infini {
std::string ConstantObj::toString() const {
std::ostringstream os;
os << "Constant[" << getGuid() << "]";
os << "output = " << outputs[0]->getGuid() << ",";
return os.str();
}
vector<int> ConstantObj::getWorkloadVector() const {
vector<int> ret = outputs[0]->getDims();
ret.emplace(ret.begin(), enum_to_underlying(type));
return ret;
}
vector<int> ConstantObj::getOpAttrVector() const {
return {enum_to_underlying(type)};
}
} // namespace infini

View File

@ -28,7 +28,7 @@ optional<vector<Shape>> GatherObj::inferShape(const TensorVec &inputs) const {
vector<DataType> GatherObj::inferDataType(const TensorVec &inputs) const {
IT_ASSERT(inputs.size() == 2);
auto index = inputs[1];
IT_ASSERT(index->getDType() == DataType::UInt32);
IT_ASSERT(index->getDType() == DataType::Int32);
return {inputs[0]->getDType()};
}

View File

@ -0,0 +1,65 @@
#include "operators/unsqueeze.h"
namespace infini {
UnsqueezeObj::UnsqueezeObj(GraphObj *graph, Tensor in,
const std::vector<int> &index, Tensor out)
: OperatorObj(OpType::Unsqueeze, {in}, {out}) {
IT_ASSERT(parseAxis(index, axis));
IT_ASSERT(checkValid(graph));
}
bool UnsqueezeObj::parseAxis(const std::vector<int> &index,
std::set<int> &axis) const {
bool ret = true;
int nDim = inputs[0]->getDims().size() + index.size();
for (size_t i = 0; i < index.size(); ++i) {
int data = index[i];
if (data < 0)
data += nDim;
if (data >= nDim) {
ret = false;
break;
}
if (axis.find(data) != axis.end()) {
ret = false;
break;
}
axis.insert(data);
}
return ret;
}
optional<vector<Shape>>
UnsqueezeObj::inferShape(const TensorVec &inputs) const {
Shape dims = inputs[0]->getDims();
for (int i : axis) {
auto it = dims.begin();
dims.insert(std::next(it, i), 1);
}
return {{dims}};
}
std::string UnsqueezeObj::toString() const {
std::ostringstream os;
os << "Unsqueeze[" << getGuid() << "]";
os << "(";
os << "inputs=";
for (auto i = 0; i < numInputs(); i++)
os << inputs[i]->getGuid() << ",";
os << "output=" << outputs[0]->getGuid() << ",";
os << ")";
return os.str();
}
vector<int> UnsqueezeObj::getWorkloadVector() const {
vector<int> ret = inputs[0]->getDims();
ret.emplace(ret.begin(), enum_to_underlying(type));
for (auto i : axis)
ret.emplace_back(i);
return ret;
}
vector<int> UnsqueezeObj::getOpAttrVector() const {
return {enum_to_underlying(type)};
}
} // namespace infini

View File

@ -0,0 +1,26 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "intelcpu/mkl_runtime.h"
#include "operators/unsqueeze.h"
#include "test.h"
namespace infini {
TEST(Unsqueeze, Mkl) {
Runtime runtime = MklRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
auto input = g->addTensor({2, 3, 3}, DataType::Float32);
vector<int> index{1, 0};
auto op = g->addOp<UnsqueezeObj>(input, index, nullptr);
g->dataMalloc();
input->setData(IncrementalGenerator());
runtime->run(g);
auto o = g->cloneTensor(op->getOutput(0));
// check results
EXPECT_TRUE(o->equalData(input));
}
} // namespace infini

View File

@ -0,0 +1,29 @@
#include "core/graph.h"
#include "core/runtime.h"
#include "operators/unsqueeze.h"
#include "test.h"
namespace infini {
TEST(Unsqueeze, ShapeInfer) {
{
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
auto input = g->addTensor({1, 3, 2, 15}, DataType::Float32);
vector<int> index{1, 6, 0};
auto op = g->addOp<UnsqueezeObj>(input, index, nullptr);
EXPECT_EQ(op->getOutput(0)->getDims(), (Shape{1, 1, 1, 3, 2, 15, 1}));
}
{
Runtime runtime = NativeCpuRuntimeObj::getInstance();
Graph g = make_ref<GraphObj>(runtime);
auto input = g->addTensor({1, 3, 2, 15}, DataType::Float32);
vector<int> index{-6, -1, -7};
auto op = g->addOp<UnsqueezeObj>(input, index, nullptr);
EXPECT_EQ(op->getOutput(0)->getDims(), (Shape{1, 1, 1, 3, 2, 15, 1}));
}
}
} // namespace infini