InfiniTensor/include/core/graph_handler.h

#pragma once

#include "core/graph.h"
#include "core/runtime.h"
#include <cstdint>
#include <iostream>

#ifdef USE_CUDA
#include "cuda/cuda_runtime.h"
#endif

namespace infini {

class GraphHandlerObj {
    Graph g;

  public:
    GraphHandlerObj(Runtime runtime)
        : g(make_ref<GraphObj>(std::move(runtime))) {}

    Tensor tensor(Shape dims, int dtype);

    //------ operators

    inline OpVec operators() { return g->getOperators(); }

    Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
                int sh, int sw, int dh, int dw);
    Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
                            int pw, int sh, int sw, int dh, int dw, int oph,
                            int opw);
    Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
                  Tensor bias, ActType act);
    Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
                              Tensor var, Tensor scale, Tensor bias,
                              float momentum, float eps, bool training);
    Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
                              Tensor bias, float eps, int axis, int stash_type);

    Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
                   int ph, int pw, int sh, int sw, int ceilMode);
    Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
                   int ph, int pw, int sh, int sw, int ceilMode);

    Tensor add(Tensor a, Tensor b, Tensor c);
    Tensor sub(Tensor a, Tensor b, Tensor c);
    Tensor mul(Tensor a, Tensor b, Tensor c);
    Tensor div(Tensor a, Tensor b, Tensor c);
    Tensor pow(Tensor a, Tensor b, Tensor c);
    Tensor min(Tensor a, Tensor b, Tensor c);
    Tensor max(Tensor a, Tensor b, Tensor c);

    Tensor relu(Tensor x, Tensor y);
    Tensor silu(Tensor x, Tensor y);
    Tensor gelu(Tensor x, Tensor y);
    Tensor sigmoid(Tensor x, Tensor y);
    Tensor hardSigmoid(Tensor x, Tensor y);
    Tensor hardSwish(Tensor x, Tensor y);
    Tensor tanh(Tensor x, Tensor y);
    Tensor erf(Tensor x, Tensor y);
    Tensor softmax(Tensor x, Tensor y, int axis);
    Tensor abs(Tensor x, Tensor y);
    Tensor sqrt(Tensor x, Tensor y);
    Tensor neg(Tensor x, Tensor y);
    Tensor shape(Tensor x, Tensor y);
    Tensor identity(Tensor x, Tensor y);
    Tensor flatten(Tensor s, Tensor y, int axis);
    Tensor pRelu(Tensor x, Tensor slope, Tensor y);
    Tensor clip(Tensor x, Tensor y, std::optional<float> min,
                std::optional<float> max);
    Tensor transpose(Tensor data, Tensor transposed, Shape perm);
    Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
    Tensor resize(Tensor input, Tensor output,
                  const std::optional<vector<int>> &axes, Tensor sizes,
                  Tensor scales, Tensor roi, vector<uint32_t> sizes_,
                  vector<float> scales_, vector<float> roi_, string mode,
                  string ratioPolicy, string nearestMode,
                  string coordTransMode);
    Tensor squeeze(Tensor input, Tensor output, Shape axes);
    Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
    Tensor concat(TensorVec inputs, Tensor output, int dim);
    Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
                            Tensor input_q, Tensor input_k, Tensor input_v,
                            Tensor position_id, Tensor output_matmul);
    Tensor RoPE(Tensor pos, Tensor input, Tensor output);
    TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
                    std::variant<int, vector<int>> numOrRatio);
    Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
    Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
    Tensor reduceMean(Tensor data, Tensor reduced,
                      const optional<vector<int>> &axes, bool keepdims);
    Tensor reduceSum(Tensor data, Tensor reduced,
                     const optional<vector<int>> &axes, bool keepdims);
    Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
                 const vector<int> &ends, const optional<vector<int>> &axes,
                 const optional<vector<int>> &steps);
    Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
               const optional<vector<int>> &axes);
    Tensor cast(Tensor input, Tensor output, int to);
    Tensor expand(Tensor input, Tensor output, Shape dims);
    Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
    std::vector<int> getDims(Tensor x) { return x->getDims(); }

    Tensor allReduceSum(Tensor input, Tensor output);
    Tensor allReduceProd(Tensor input, Tensor output);
    Tensor allReduceMin(Tensor input, Tensor output);
    Tensor allReduceMax(Tensor input, Tensor output);
    Tensor allReduceAvg(Tensor input, Tensor output);
    TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
    Tensor broadcast(Tensor input, Tensor output, int root);
    Tensor send(Tensor input, int source, int destination, Tensor output);
    Tensor recv(Tensor output, int source, int destination, Shape dims,
                int outputType, Tensor input);
    Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
                        std::string mode);
    Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
               int size);

    //------ modifiers

    inline bool topo_sort() { return g->topo_sort(); }

    inline void optimize() { g->optimize(); }

    inline void shape_infer() { g->shape_infer(); }

    void change_shape(const vector<int> &shape, int tensorId);
    //------ runtime

    inline void data_malloc(bool useNaiveAllocator = false,
                            size_t memPoolSize = 0) {
        g->dataMalloc(useNaiveAllocator, memPoolSize);
    }

    inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }

    inline void free_heap() { g->freeHeap(); }

    inline void tune() { g->getRuntime()->run(g, true); }

    inline void run() { g->getRuntime()->run(g); }

    inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }

#ifdef USE_CUDA
    inline void run_with_cudagraph() {
        (as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
    }
#endif
};

} // namespace infini
-												feat: 创建 pyinfinitensor 前端

- python 前端项目结构及打包和安装脚本
- 后端编译出 so 改名为 backend，增加 GraphHandler 修改图结构
- ci 支持测试这些功能

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-12 08:23:49 +08:00
+								#pragma once
 								#include "core/graph.h"
 								#include "core/runtime.h"
-												temp: 实现初始值导入，但 resnet 报错

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-23 15:29:16 +08:00
+								#include <cstdint>
 								#include <iostream>
-												feat: 创建 pyinfinitensor 前端

- python 前端项目结构及打包和安装脚本
- 后端编译出 so 改名为 backend，增加 GraphHandler 修改图结构
- ci 支持测试这些功能

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-12 08:23:49 +08:00
-												[feature] add cudagraph support (#215)

* [feature] add cudagraph support

* modify code to pass the cuda_all_reduce test
											
										
										
											2024-02-21 14:00:25 +08:00
+								#ifdef USE_CUDA
 								#include "cuda/cuda_runtime.h"
 								#endif
-												feat: 创建 pyinfinitensor 前端

- python 前端项目结构及打包和安装脚本
- 后端编译出 so 改名为 backend，增加 GraphHandler 修改图结构
- ci 支持测试这些功能

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-12 08:23:49 +08:00
+								namespace infini {
 								class GraphHandlerObj {
 								    Graph g;
 								  public:
 								    GraphHandlerObj(Runtime runtime)
 								        : g(make_ref<GraphObj>(std::move(runtime))) {}
 								    Tensor tensor(Shape dims, int dtype);
-												feat: 导出 OperatorObj

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-17 15:00:34 +08:00
+								    //------ operators
 								    inline OpVec operators() { return g->getOperators(); }
-												feat: 前端支持 Conv 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-22 15:05:44 +08:00
+								    Tensor conv(Tensor input, Tensor weight, Tensor output, int ph, int pw,
 								                int sh, int sw, int dh, int dw);
-												Dev for 202303ddl (#66)

* add activation operatiopn relu, tanh, sigmoid on mlu

* commit for format

* add activation backward operation

* add test for activation_backward

* add test

* add convbpfilter

* fix

* add transpsoe code and test

* add trigon function operation on mlu: sin,cos,tan,asin,sinh,asinh

* add copy operation on mlu

* add ceil operation and floor operation

* add operation clip

* add operation cnnl div, test and test for divdemo bangc kernel

* add divnonan operation and test

* add erf operation

* add exp operation

* add operation fill

* add log operation

* add log1p operation

* add l2loss operation

* add maximum and minimum operation

* add mseloss operation

* add negTensor operation

* add power operation

* add reciprocal operation

* add sqrt and rsqrt operation

* add transform operation

* add addn operation

* add muln operation

* cherrry pick some operation

* add floordiv operation and floordivtrunc operation

* add floormod operation

* add cumsum operation

* add det operation

* add pad operation

* format

* add concat operation

* format

* add split operation

* fix concat and split operation

* add round operation

* add pooling operation

* add square operation

* add squaredDifference operation

* code format fix

* add flip operation

* code format fix

* add hardtanh operation

* add logic operation

* add addcdiv and addcmul operation

* add arange operation

* add bitcompute operation

* add net test

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: rename

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 用 NativeCpuRuntime 替换 CpuRuntime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code

* fix code

* fix code by review suggestion

* remove operation which is not the onnx operation

* fix format

* clang format

* refactor: tensor 的 print 加一层模板的 dataToString

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: onnx 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 增加计算图优化接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add clip operation

* feat: 支持导入 clip

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* test: 导入导出测试加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix batch norm

* feat: 增加 Shape 算子

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入 unsqueeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正 clip 接口

feat: 支持导入 transpose
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add broadcast operation

* fix elementwise-broadcast

* fix elementwise broadcast

* add broadcast for gpu elementsie

* feat: pad 支持 axes 负数

feat: 不支持的 padding 导出为独立的 pad 算子

feat: 支持导入 onnxsim 过的 inception
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正池化的测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 pads，支持 inception 导入导出，已加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 densenet 导入导出，并加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 squeeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix softmax

* feat: 导出 clip 和 transpose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 Conv 的 bias

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv group

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: matmul 的 bias 没有放在输入里，修正

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix exmaple

* fix: 改正 reduce_mean 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* refactor: 修改 slice 实现与 onnx 一致

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: 不导出两个 runtime 函数

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 中文使用指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 补全指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修复导入数据的问题

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 添加 Dropout 基本结构，但不支持两个输出是不同的类型

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 重新导出优化接口

feat: dropout 导入
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* build: BANG 选项加入 Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fxi code, change of test/kernels/bang/test* is use NativeCpuRuntime.
chaneg of include/bang/bang_runtime is for the cntoolkit upgrade.

* feat: 导出 bang runtime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add USE_BANG=1

* fix matmul

* fix reshape

* fix

* fix activation

* fix transpose

* format

* format

* update Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add prelu on mlu

* fix: ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 PRelu

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add convtrans on mlu

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* docs: 更新 README_CN.md

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code by review suggestions

* style

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: Softmax 的 axis 可以用默认值？感觉是 onnx 不标准

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix cuda & intelcpu bugs after merging

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: whjthu <haojie0429@gmail.com>
											
										
										
											2023-04-18 15:10:33 +08:00
+								    Tensor convTransposed2d(Tensor input, Tensor weight, Tensor output, int ph,
 								                            int pw, int sh, int sw, int dh, int dw, int oph,
 								                            int opw);
-												feat: 创建 pyinfinitensor 前端

- python 前端项目结构及打包和安装脚本
- 后端编译出 so 改名为 backend，增加 GraphHandler 修改图结构
- ci 支持测试这些功能

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-12 08:23:49 +08:00
+								    Tensor matmul(Tensor a, Tensor b, Tensor y, bool transA, bool transB,
 								                  Tensor bias, ActType act);
-												refactor(core): 添加新的 `OpType` 定义 (#99)

* feat: 添加新的 OpType 定义

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* refactor: 使用新的 OpType 替换原来的，修改整个项目

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: onnx 导入

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正 cuda 和 bang kernel 的问题

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 过滤 bang test

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 过滤 bang test

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix bang code.

* fix code on bang

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 删除指定文件

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 删两个没用的文件，去掉一个不知道为什么的注释

Signed-off-by: YdrMaster <ydrml@hotmail.com>

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
											
										
										
											2023-08-07 11:17:05 +08:00
+								    Tensor batchNormalization(Tensor input, Tensor output, Tensor mean,
 								                              Tensor var, Tensor scale, Tensor bias,
 								                              float momentum, float eps, bool training);
-												Add layer normalization  (#181)

* - add layernorm kernel

* success:add layernorm kernel and test

* fix: remove unusalble comments

* fix: modify code as reviewer suggested

* debug,modified .cu and test

* optional bias support

* overloading function

* fix bug after merging; remove time constrain in conv test

---------

Co-authored-by: kilinchange <kilinchange@163.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-11-24 15:15:14 +08:00
+								    Tensor layerNormalization(Tensor input, Tensor scale, Tensor output,
 								                              Tensor bias, float eps, int axis, int stash_type);
-												feat: 前端支持 batchNorm（无单元测试）

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-13 17:15:35 +08:00
-												feat: 前端支持 pool 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 16:26:47 +08:00
+								    Tensor maxPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-												Pooling ceil mode (#155)

* add ceil mode for pooling

* do not print debug info for allocator by default

* fix test bugs after introducing pooling ceil mode

* fix onnx import bug
											
										
										
											2023-10-09 20:51:39 +08:00
+								                   int ph, int pw, int sh, int sw, int ceilMode);
-												feat: 前端支持 pool 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 16:26:47 +08:00
+								    Tensor avgPool(Tensor input, Tensor output, int kh, int kw, int dh, int dw,
-												Pooling ceil mode (#155)

* add ceil mode for pooling

* do not print debug info for allocator by default

* fix test bugs after introducing pooling ceil mode

* fix onnx import bug
											
										
										
											2023-10-09 20:51:39 +08:00
+								                   int ph, int pw, int sh, int sw, int ceilMode);
-												feat: 前端支持 pool 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 16:26:47 +08:00
-												feat: 增加 add sub mul div pow 前端

- 添加每个算子的单元测试
- 添加线性回归模型导入测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-13 11:25:54 +08:00
+								    Tensor add(Tensor a, Tensor b, Tensor c);
 								    Tensor sub(Tensor a, Tensor b, Tensor c);
 								    Tensor mul(Tensor a, Tensor b, Tensor c);
 								    Tensor div(Tensor a, Tensor b, Tensor c);
 								    Tensor pow(Tensor a, Tensor b, Tensor c);
-												add CUDNN impl for Min and Max (#118)

* add cudnn impl for Min and Max

* fix onnx _search_shape with output shape
											
										
										
											2023-08-22 16:19:29 +08:00
+								    Tensor min(Tensor a, Tensor b, Tensor c);
 								    Tensor max(Tensor a, Tensor b, Tensor c);
-												feat: 前端支持 relu sigmoid tanh softmax abs 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-13 11:54:54 +08:00
 								    Tensor relu(Tensor x, Tensor y);
-												add rope and silu support

											
										
										
											2024-01-11 15:44:07 +08:00
+								    Tensor silu(Tensor x, Tensor y);
-												【Hackathon No.108】Add Gelu operator, ffi, kernel for cpu and gpu. (#148)

feat: Add Gelu kernel, operator, ffi.
											
										
										
											2023-10-10 15:21:13 +08:00
+								    Tensor gelu(Tensor x, Tensor y);
-												feat: 前端支持 relu sigmoid tanh softmax abs 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-13 11:54:54 +08:00
+								    Tensor sigmoid(Tensor x, Tensor y);
-												Add HardSigmoid and HardSwish (#156)

* Add HardSigmoid and HardSwish

* fix format
											
										
										
											2023-10-10 22:41:06 +08:00
+								    Tensor hardSigmoid(Tensor x, Tensor y);
 								    Tensor hardSwish(Tensor x, Tensor y);
-												feat: 前端支持 relu sigmoid tanh softmax abs 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-13 11:54:54 +08:00
+								    Tensor tanh(Tensor x, Tensor y);
-												框架支持bert/gpt2模型构图 (#94)

* feat: support to sqrt op

* feat: support to erf op

* feat: support to expand op

* feat: support to where op

* fix: gather op index can be int64_t(hard coding)

* fix: some wrong use

* style: fix the format style

* test: add test for change op

* fix: rebase to master

* fix: fix matmul b compute wrong

* add expand and where kernel

* Add int64 support for cuda gather kernel

* add test_where.cc

* add "expand.(cu/cc,test,cuda),modified where.cu"

* Separate initialization of datatypes to avoid compile error

* modify where.(cu/cc/h,test), expand and clip

* Format fix

* Format fix

---------

Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-08-29 16:06:52 +08:00
+								    Tensor erf(Tensor x, Tensor y);
-												Cpu backend2 (#77)

fix review

change Device::MKL to Device::INTELCPU

fix mkl linkage

fix errors according to merge from master

now can call mkl backend

fix softmax/flatten with axis from onnx.

modify README.md

fix memory refree

add env_lotus_intelcpu.sh

fix compile

merge from branch cpu_backend

fix something add gather

fix something

FIX: directory rename from "mkl" to "intelcpu"

ADD: use oneMKL dpcpp interface to implement matmul kernel.

ADD: add dpcpp as compiler for mkl, and fix warnings for clang compiling.
add dpcpp kernel for pow.

ADD: mkl kernel for pad.

ADD: slice mkl kernel.

ADD: reshape/flatten/identity mkl kernel.

ADD: split mkl kernel.

fix compile error

FIX: fix flattenObj with axis.

ADD reduce_mean mkl kernel.

Add concat mkl kernel.

bathNorm for mkl kernel.

sigmoid mkl kernel.

ADD：add mkl kernel for pooling

add more tests for softmax

Now softmax cuda kernel supports any axises.

mkl kernel for softmax

softmax

add axis to softmax operator

add mkl kernel for abs tanh

ADD: relu kernel for mkl

fix binary mkl primitives.

add mkl kernel for binary operators

fix compiler error

move stream to runtime

clang format

add MemoryFormat for tensorObj.

use post_ops for fused conv/deconv

Distinguish mkl  op_timer from cuda op timer.

add act optype to conv and deconv

add operator timer

add mkl kernel for convTransposed

minor fix for group conv

do not use cblas_sgemm_batch

CpuRuntimeObj->NativeCpuRuntimeObj

add  matmul op for mkl
											
										
										
											2023-04-17 12:15:23 +08:00
+								    Tensor softmax(Tensor x, Tensor y, int axis);
-												feat: 前端支持 relu sigmoid tanh softmax abs 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-13 11:54:54 +08:00
+								    Tensor abs(Tensor x, Tensor y);
-												impl sqrt on CUDA (#109)

* impl sqrt on CUDA
fix parser of Gather and ReduceMean

* fix test_gather

* fix test_cuda_gather

* impl sqrt cpu and add sqrt to test_cuda_unary

* cuda_unary supports arbitary shapes

* fix SplitOp with dim=-1

* fix SplitOp with dim=-1
											
										
										
											2023-08-18 12:17:47 +08:00
+								    Tensor sqrt(Tensor x, Tensor y);
-												Add Neg operator and kernel (#152)

* Add Neg operator and kernel

* handle neg in to_onnx

---------

Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-10-10 10:54:56 +08:00
+								    Tensor neg(Tensor x, Tensor y);
-												Dev for 202303ddl (#66)

* add activation operatiopn relu, tanh, sigmoid on mlu

* commit for format

* add activation backward operation

* add test for activation_backward

* add test

* add convbpfilter

* fix

* add transpsoe code and test

* add trigon function operation on mlu: sin,cos,tan,asin,sinh,asinh

* add copy operation on mlu

* add ceil operation and floor operation

* add operation clip

* add operation cnnl div, test and test for divdemo bangc kernel

* add divnonan operation and test

* add erf operation

* add exp operation

* add operation fill

* add log operation

* add log1p operation

* add l2loss operation

* add maximum and minimum operation

* add mseloss operation

* add negTensor operation

* add power operation

* add reciprocal operation

* add sqrt and rsqrt operation

* add transform operation

* add addn operation

* add muln operation

* cherrry pick some operation

* add floordiv operation and floordivtrunc operation

* add floormod operation

* add cumsum operation

* add det operation

* add pad operation

* format

* add concat operation

* format

* add split operation

* fix concat and split operation

* add round operation

* add pooling operation

* add square operation

* add squaredDifference operation

* code format fix

* add flip operation

* code format fix

* add hardtanh operation

* add logic operation

* add addcdiv and addcmul operation

* add arange operation

* add bitcompute operation

* add net test

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: rename

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 用 NativeCpuRuntime 替换 CpuRuntime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code

* fix code

* fix code by review suggestion

* remove operation which is not the onnx operation

* fix format

* clang format

* refactor: tensor 的 print 加一层模板的 dataToString

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: onnx 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 增加计算图优化接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add clip operation

* feat: 支持导入 clip

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* test: 导入导出测试加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix batch norm

* feat: 增加 Shape 算子

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入 unsqueeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正 clip 接口

feat: 支持导入 transpose
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add broadcast operation

* fix elementwise-broadcast

* fix elementwise broadcast

* add broadcast for gpu elementsie

* feat: pad 支持 axes 负数

feat: 不支持的 padding 导出为独立的 pad 算子

feat: 支持导入 onnxsim 过的 inception
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正池化的测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 pads，支持 inception 导入导出，已加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 densenet 导入导出，并加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 squeeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix softmax

* feat: 导出 clip 和 transpose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 Conv 的 bias

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv group

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: matmul 的 bias 没有放在输入里，修正

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix exmaple

* fix: 改正 reduce_mean 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* refactor: 修改 slice 实现与 onnx 一致

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: 不导出两个 runtime 函数

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 中文使用指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 补全指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修复导入数据的问题

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 添加 Dropout 基本结构，但不支持两个输出是不同的类型

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 重新导出优化接口

feat: dropout 导入
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* build: BANG 选项加入 Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fxi code, change of test/kernels/bang/test* is use NativeCpuRuntime.
chaneg of include/bang/bang_runtime is for the cntoolkit upgrade.

* feat: 导出 bang runtime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add USE_BANG=1

* fix matmul

* fix reshape

* fix

* fix activation

* fix transpose

* format

* format

* update Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add prelu on mlu

* fix: ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 PRelu

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add convtrans on mlu

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* docs: 更新 README_CN.md

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code by review suggestions

* style

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: Softmax 的 axis 可以用默认值？感觉是 onnx 不标准

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix cuda & intelcpu bugs after merging

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: whjthu <haojie0429@gmail.com>
											
										
										
											2023-04-18 15:10:33 +08:00
+								    Tensor shape(Tensor x, Tensor y);
-												feat: 前端支持 identity 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-13 12:13:01 +08:00
+								    Tensor identity(Tensor x, Tensor y);
-												Cpu backend2 (#77)

fix review

change Device::MKL to Device::INTELCPU

fix mkl linkage

fix errors according to merge from master

now can call mkl backend

fix softmax/flatten with axis from onnx.

modify README.md

fix memory refree

add env_lotus_intelcpu.sh

fix compile

merge from branch cpu_backend

fix something add gather

fix something

FIX: directory rename from "mkl" to "intelcpu"

ADD: use oneMKL dpcpp interface to implement matmul kernel.

ADD: add dpcpp as compiler for mkl, and fix warnings for clang compiling.
add dpcpp kernel for pow.

ADD: mkl kernel for pad.

ADD: slice mkl kernel.

ADD: reshape/flatten/identity mkl kernel.

ADD: split mkl kernel.

fix compile error

FIX: fix flattenObj with axis.

ADD reduce_mean mkl kernel.

Add concat mkl kernel.

bathNorm for mkl kernel.

sigmoid mkl kernel.

ADD：add mkl kernel for pooling

add more tests for softmax

Now softmax cuda kernel supports any axises.

mkl kernel for softmax

softmax

add axis to softmax operator

add mkl kernel for abs tanh

ADD: relu kernel for mkl

fix binary mkl primitives.

add mkl kernel for binary operators

fix compiler error

move stream to runtime

clang format

add MemoryFormat for tensorObj.

use post_ops for fused conv/deconv

Distinguish mkl  op_timer from cuda op timer.

add act optype to conv and deconv

add operator timer

add mkl kernel for convTransposed

minor fix for group conv

do not use cblas_sgemm_batch

CpuRuntimeObj->NativeCpuRuntimeObj

add  matmul op for mkl
											
										
										
											2023-04-17 12:15:23 +08:00
+								    Tensor flatten(Tensor s, Tensor y, int axis);
-												Dev for 202303ddl (#66)

* add activation operatiopn relu, tanh, sigmoid on mlu

* commit for format

* add activation backward operation

* add test for activation_backward

* add test

* add convbpfilter

* fix

* add transpsoe code and test

* add trigon function operation on mlu: sin,cos,tan,asin,sinh,asinh

* add copy operation on mlu

* add ceil operation and floor operation

* add operation clip

* add operation cnnl div, test and test for divdemo bangc kernel

* add divnonan operation and test

* add erf operation

* add exp operation

* add operation fill

* add log operation

* add log1p operation

* add l2loss operation

* add maximum and minimum operation

* add mseloss operation

* add negTensor operation

* add power operation

* add reciprocal operation

* add sqrt and rsqrt operation

* add transform operation

* add addn operation

* add muln operation

* cherrry pick some operation

* add floordiv operation and floordivtrunc operation

* add floormod operation

* add cumsum operation

* add det operation

* add pad operation

* format

* add concat operation

* format

* add split operation

* fix concat and split operation

* add round operation

* add pooling operation

* add square operation

* add squaredDifference operation

* code format fix

* add flip operation

* code format fix

* add hardtanh operation

* add logic operation

* add addcdiv and addcmul operation

* add arange operation

* add bitcompute operation

* add net test

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: rename

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 用 NativeCpuRuntime 替换 CpuRuntime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code

* fix code

* fix code by review suggestion

* remove operation which is not the onnx operation

* fix format

* clang format

* refactor: tensor 的 print 加一层模板的 dataToString

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: onnx 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 增加计算图优化接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add clip operation

* feat: 支持导入 clip

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* test: 导入导出测试加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix batch norm

* feat: 增加 Shape 算子

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入 unsqueeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正 clip 接口

feat: 支持导入 transpose
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add broadcast operation

* fix elementwise-broadcast

* fix elementwise broadcast

* add broadcast for gpu elementsie

* feat: pad 支持 axes 负数

feat: 不支持的 padding 导出为独立的 pad 算子

feat: 支持导入 onnxsim 过的 inception
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正池化的测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 pads，支持 inception 导入导出，已加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 densenet 导入导出，并加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 squeeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix softmax

* feat: 导出 clip 和 transpose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 Conv 的 bias

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv group

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: matmul 的 bias 没有放在输入里，修正

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix exmaple

* fix: 改正 reduce_mean 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* refactor: 修改 slice 实现与 onnx 一致

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: 不导出两个 runtime 函数

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 中文使用指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 补全指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修复导入数据的问题

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 添加 Dropout 基本结构，但不支持两个输出是不同的类型

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 重新导出优化接口

feat: dropout 导入
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* build: BANG 选项加入 Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fxi code, change of test/kernels/bang/test* is use NativeCpuRuntime.
chaneg of include/bang/bang_runtime is for the cntoolkit upgrade.

* feat: 导出 bang runtime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add USE_BANG=1

* fix matmul

* fix reshape

* fix

* fix activation

* fix transpose

* format

* format

* update Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add prelu on mlu

* fix: ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 PRelu

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add convtrans on mlu

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* docs: 更新 README_CN.md

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code by review suggestions

* style

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: Softmax 的 axis 可以用默认值？感觉是 onnx 不标准

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix cuda & intelcpu bugs after merging

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: whjthu <haojie0429@gmail.com>
											
										
										
											2023-04-18 15:10:33 +08:00
+								    Tensor pRelu(Tensor x, Tensor slope, Tensor y);
 								    Tensor clip(Tensor x, Tensor y, std::optional<float> min,
 								                std::optional<float> max);
 								    Tensor transpose(Tensor data, Tensor transposed, Shape perm);
-												feat: 前端支持 reshape

- 无法测试，因为后端不支持 shape 的 INT64 类型

opt: ReshapeObj 构造改为全部传值并在内部 move
Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 09:50:32 +08:00
+								    Tensor reshape(Tensor data, Tensor reshaped, Shape shape);
-												add frontend resize kernel (#194)

* - add frontend resize kernel

* - fix resize test

* - fix bug
- add onnx test for resize

* fix: modify codes as reviewer suggested

---------

Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-12-29 13:32:56 +08:00
+								    Tensor resize(Tensor input, Tensor output,
 								                  const std::optional<vector<int>> &axes, Tensor sizes,
 								                  Tensor scales, Tensor roi, vector<uint32_t> sizes_,
 								                  vector<float> scales_, vector<float> roi_, string mode,
 								                  string ratioPolicy, string nearestMode,
 								                  string coordTransMode);
-												解除前端对onnx infershape功能的依赖 (#206)

* feat: SqueezeOp lift the dependency of onnx infershape.

* feat: UnsqueezeOp lift the dependency of onnx infershape.

* feat: lift the dependency of onnx infershape

* fix: fix Makefile off nccl
											
										
										
											2024-01-12 14:54:27 +08:00
+								    Tensor squeeze(Tensor input, Tensor output, Shape axes);
 								    Tensor unsqueeze(Tensor input, Tensor output, Shape axes);
-												feat: 前端支持 concat 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 13:42:35 +08:00
+								    Tensor concat(TensorVec inputs, Tensor output, int dim);
-												use workspace to optimize kvcache attention

											
										
										
											2024-01-25 09:08:25 +08:00
+								    Tensor attentionKVCache(Tensor input_k_cache, Tensor input_v_cache,
 								                            Tensor input_q, Tensor input_k, Tensor input_v,
 								                            Tensor position_id, Tensor output_matmul);
-												add rope and silu support

											
										
										
											2024-01-11 15:44:07 +08:00
+								    Tensor RoPE(Tensor pos, Tensor input, Tensor output);
-												Dev for 202303ddl (#66)

* add activation operatiopn relu, tanh, sigmoid on mlu

* commit for format

* add activation backward operation

* add test for activation_backward

* add test

* add convbpfilter

* fix

* add transpsoe code and test

* add trigon function operation on mlu: sin,cos,tan,asin,sinh,asinh

* add copy operation on mlu

* add ceil operation and floor operation

* add operation clip

* add operation cnnl div, test and test for divdemo bangc kernel

* add divnonan operation and test

* add erf operation

* add exp operation

* add operation fill

* add log operation

* add log1p operation

* add l2loss operation

* add maximum and minimum operation

* add mseloss operation

* add negTensor operation

* add power operation

* add reciprocal operation

* add sqrt and rsqrt operation

* add transform operation

* add addn operation

* add muln operation

* cherrry pick some operation

* add floordiv operation and floordivtrunc operation

* add floormod operation

* add cumsum operation

* add det operation

* add pad operation

* format

* add concat operation

* format

* add split operation

* fix concat and split operation

* add round operation

* add pooling operation

* add square operation

* add squaredDifference operation

* code format fix

* add flip operation

* code format fix

* add hardtanh operation

* add logic operation

* add addcdiv and addcmul operation

* add arange operation

* add bitcompute operation

* add net test

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: rename

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 用 NativeCpuRuntime 替换 CpuRuntime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code

* fix code

* fix code by review suggestion

* remove operation which is not the onnx operation

* fix format

* clang format

* refactor: tensor 的 print 加一层模板的 dataToString

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: onnx 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 增加计算图优化接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add clip operation

* feat: 支持导入 clip

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* test: 导入导出测试加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix batch norm

* feat: 增加 Shape 算子

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入 unsqueeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正 clip 接口

feat: 支持导入 transpose
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add broadcast operation

* fix elementwise-broadcast

* fix elementwise broadcast

* add broadcast for gpu elementsie

* feat: pad 支持 axes 负数

feat: 不支持的 padding 导出为独立的 pad 算子

feat: 支持导入 onnxsim 过的 inception
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正池化的测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 pads，支持 inception 导入导出，已加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 densenet 导入导出，并加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 squeeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix softmax

* feat: 导出 clip 和 transpose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 Conv 的 bias

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv group

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: matmul 的 bias 没有放在输入里，修正

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix exmaple

* fix: 改正 reduce_mean 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* refactor: 修改 slice 实现与 onnx 一致

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: 不导出两个 runtime 函数

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 中文使用指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 补全指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修复导入数据的问题

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 添加 Dropout 基本结构，但不支持两个输出是不同的类型

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 重新导出优化接口

feat: dropout 导入
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* build: BANG 选项加入 Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fxi code, change of test/kernels/bang/test* is use NativeCpuRuntime.
chaneg of include/bang/bang_runtime is for the cntoolkit upgrade.

* feat: 导出 bang runtime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add USE_BANG=1

* fix matmul

* fix reshape

* fix

* fix activation

* fix transpose

* format

* format

* update Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add prelu on mlu

* fix: ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 PRelu

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add convtrans on mlu

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* docs: 更新 README_CN.md

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code by review suggestions

* style

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: Softmax 的 axis 可以用默认值？感觉是 onnx 不标准

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix cuda & intelcpu bugs after merging

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: whjthu <haojie0429@gmail.com>
											
										
										
											2023-04-18 15:10:33 +08:00
+								    TensorVec split(Tensor input, std::optional<TensorVec> outputs, int axis,
-												fix Issue 187 split infershape wrong (#197)

* fix: fix splitOp to support unequal portions

* fix: fix as review comment

---------

Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-12-28 21:39:24 +08:00
+								                    std::variant<int, vector<int>> numOrRatio);
-												feat: 前端支持 reduceMean 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 15:35:01 +08:00
+								    Tensor gather(Tensor data, Tensor indices, Tensor output, int axis);
-												Add GatherElements op and cuda kernel (#149)

* Add GatherElements op and cuda kernel

* fix format

* remove print

* remove unused var

* fix spacing

* fix format

---------

Co-authored-by: panzezhong@qiyuanlab.com <panzezhong@zezhongpan>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-10-12 09:18:12 +08:00
+								    Tensor gatherElements(Tensor data, Tensor indices, Tensor output, int axis);
-												feat: 前端支持 reduceMean 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 15:35:01 +08:00
+								    Tensor reduceMean(Tensor data, Tensor reduced,
 								                      const optional<vector<int>> &axes, bool keepdims);
-												Add ReduceSum op and kernel (#160)

* Add reduceSum op and kernel

* fix merge and format

* Reduce: reuse cat macro, add doc string

---------

Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-11-24 09:29:58 +08:00
+								    Tensor reduceSum(Tensor data, Tensor reduced,
 								                     const optional<vector<int>> &axes, bool keepdims);
-												feat: 前端支持 slice 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-14 17:35:18 +08:00
+								    Tensor slice(Tensor input, Tensor output, const vector<int> &starts,
 								                 const vector<int> &ends, const optional<vector<int>> &axes,
 								                 const optional<vector<int>> &steps);
-												feat: 前端支持 pad 及单元测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-15 11:41:06 +08:00
+								    Tensor pad(Tensor input, Tensor output, const vector<int> &pads,
 								               const optional<vector<int>> &axes);
-												support mixed dtype (#102)

* feat: support mixed dtype

* feat: support cast op

* test: add test for cast op

* feat: support datatype BFloat16

* feat: support data convert fp32 <-> bfp16

* fix: fix all op's infershape func

* fix as review comment
											
										
										
											2023-08-16 21:49:43 +08:00
+								    Tensor cast(Tensor input, Tensor output, int to);
-												框架支持bert/gpt2模型构图 (#94)

* feat: support to sqrt op

* feat: support to erf op

* feat: support to expand op

* feat: support to where op

* fix: gather op index can be int64_t(hard coding)

* fix: some wrong use

* style: fix the format style

* test: add test for change op

* fix: rebase to master

* fix: fix matmul b compute wrong

* add expand and where kernel

* Add int64 support for cuda gather kernel

* add test_where.cc

* add "expand.(cu/cc,test,cuda),modified where.cu"

* Separate initialization of datatypes to avoid compile error

* modify where.(cu/cc/h,test), expand and clip

* Format fix

* Format fix

---------

Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-08-29 16:06:52 +08:00
+								    Tensor expand(Tensor input, Tensor output, Shape dims);
 								    Tensor where(Tensor inputX, Tensor inputY, Tensor condition, Tensor output);
-												support Dynamic tensor infer shape and fix memory pool (#176)

* feat: support dynamic tensor part1

* feat: support dynamic-tensor part2

* feat: support dynamic tensor part 3

* fix: fix some ..

* - add kvcache example

* feat: support concat to identity kernel

* add a simple mempory pool for allocator

* fix: rebase to master

* fix bug after merging

* - remove outdated script

* fix: fix as review

---------

Co-authored-by: kilinchange <kilinchange@163.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-11-23 13:11:50 +08:00
+								    std::vector<int> getDims(Tensor x) { return x->getDims(); }
-												feat: 导出分配内存和运行推理的接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-23 11:08:00 +08:00
-												impl distributed launch with NCCL  (#106)

* add cmake bits about NCCL

* move example to examples/NNmodel

* impl NCCL communicator

* add comm related function to Runtime

* export runtime interface

* add launch.py

* use unique name to distingush the the NCCL ID file

* add timeout to communicator init

* expose communicator obj from runtime obj, add unit test for nccl communicator

* reformat files

* Add allReduce operator and cuda nccl allReduce kernel

* impl model parallel for resnet

* add allGather nccl kernel and operator

* Add allreduce allgather operator tests, change allgather kernel to output list of tensor, fix shape infer, handle nullptr output

* fix format of onnx.py

* use concat following AllGather

* get tensor parallel for resnet

* fix format of graph_handler.cc

* change BUILD_DIST default to OFF

* polish code of communicator

* update .gitignore

* Add broadcast operator and cuda kernel

* Add comments for operators

* remove const of class member

* move communicator to CudaRuntimeObj

* Add an empty line at EOF.

---------

Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-09-05 09:47:35 +08:00
+								    Tensor allReduceSum(Tensor input, Tensor output);
 								    Tensor allReduceProd(Tensor input, Tensor output);
 								    Tensor allReduceMin(Tensor input, Tensor output);
 								    Tensor allReduceMax(Tensor input, Tensor output);
 								    Tensor allReduceAvg(Tensor input, Tensor output);
 								    TensorVec allGather(Tensor input, std::optional<TensorVec> outputs, int n);
 								    Tensor broadcast(Tensor input, Tensor output, int root);
-												Add send and recv operators based on NCCL (#182)

* baseline sendrecv, bug

* success sendrecv

* get rank from comm

* set output shape

* successful:set output shape equal to input shape

* shape as attribute

* success:shape as attribute

* success send recv, output 0

* add onnx test

* split send and recv

* success split send and recv

* test-onnx bug

* success test-onnx

* modified onnx.py

* solve review
											
										
										
											2023-12-14 16:38:03 +08:00
+								    Tensor send(Tensor input, int source, int destination, Tensor output);
 								    Tensor recv(Tensor output, int source, int destination, Shape dims,
 								                int outputType, Tensor input);
-												[Kunlun & CUDA & BANG] add depth2space operator (#178)

* add depth2space operator

* fix format

* add depth2space on cambricon bang

* add depth2space on gpu

---------

Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-11-10 17:58:26 +08:00
+								    Tensor depthToSpace(Tensor input, Tensor output, int blocksize,
 								                        std::string mode);
-												Fix bang (#198)

* fix bang batchnorm

* fix pooling test bang

* add test batchnorm

* HIGH PRECISION ACTIVATION

* fix pooling

* fix matmul

* fix test

* add layernorm

* fix softmax

* fix

* better code

* fix

* fix worlflow

* fix workflow

* fix

* fix

* fxi matmul

* add LRN

* fix lrn

* fix lrn

---------

Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: Baoming Li <1508269885@qq.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-12-28 13:44:10 +08:00
+								    Tensor lrn(Tensor input, Tensor output, float alpha, float beta, float bias,
 								               int size);
-												impl distributed launch with NCCL  (#106)

* add cmake bits about NCCL

* move example to examples/NNmodel

* impl NCCL communicator

* add comm related function to Runtime

* export runtime interface

* add launch.py

* use unique name to distingush the the NCCL ID file

* add timeout to communicator init

* expose communicator obj from runtime obj, add unit test for nccl communicator

* reformat files

* Add allReduce operator and cuda nccl allReduce kernel

* impl model parallel for resnet

* add allGather nccl kernel and operator

* Add allreduce allgather operator tests, change allgather kernel to output list of tensor, fix shape infer, handle nullptr output

* fix format of onnx.py

* use concat following AllGather

* get tensor parallel for resnet

* fix format of graph_handler.cc

* change BUILD_DIST default to OFF

* polish code of communicator

* update .gitignore

* Add broadcast operator and cuda kernel

* Add comments for operators

* remove const of class member

* move communicator to CudaRuntimeObj

* Add an empty line at EOF.

---------

Co-authored-by: panzezhong <panzezhong@qiyuanlab.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-09-05 09:47:35 +08:00
-												feat: 导出 OperatorObj

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-17 15:00:34 +08:00
+								    //------ modifiers
 								    inline bool topo_sort() { return g->topo_sort(); }
-												Dev for 202303ddl (#66)

* add activation operatiopn relu, tanh, sigmoid on mlu

* commit for format

* add activation backward operation

* add test for activation_backward

* add test

* add convbpfilter

* fix

* add transpsoe code and test

* add trigon function operation on mlu: sin,cos,tan,asin,sinh,asinh

* add copy operation on mlu

* add ceil operation and floor operation

* add operation clip

* add operation cnnl div, test and test for divdemo bangc kernel

* add divnonan operation and test

* add erf operation

* add exp operation

* add operation fill

* add log operation

* add log1p operation

* add l2loss operation

* add maximum and minimum operation

* add mseloss operation

* add negTensor operation

* add power operation

* add reciprocal operation

* add sqrt and rsqrt operation

* add transform operation

* add addn operation

* add muln operation

* cherrry pick some operation

* add floordiv operation and floordivtrunc operation

* add floormod operation

* add cumsum operation

* add det operation

* add pad operation

* format

* add concat operation

* format

* add split operation

* fix concat and split operation

* add round operation

* add pooling operation

* add square operation

* add squaredDifference operation

* code format fix

* add flip operation

* code format fix

* add hardtanh operation

* add logic operation

* add addcdiv and addcmul operation

* add arange operation

* add bitcompute operation

* add net test

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: rename

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 用 NativeCpuRuntime 替换 CpuRuntime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code

* fix code

* fix code by review suggestion

* remove operation which is not the onnx operation

* fix format

* clang format

* refactor: tensor 的 print 加一层模板的 dataToString

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: onnx 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 增加计算图优化接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add clip operation

* feat: 支持导入 clip

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* test: 导入导出测试加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix batch norm

* feat: 增加 Shape 算子

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入 unsqueeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正 clip 接口

feat: 支持导入 transpose
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add broadcast operation

* fix elementwise-broadcast

* fix elementwise broadcast

* add broadcast for gpu elementsie

* feat: pad 支持 axes 负数

feat: 不支持的 padding 导出为独立的 pad 算子

feat: 支持导入 onnxsim 过的 inception
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修正池化的测试

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 pads，支持 inception 导入导出，已加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 densenet 导入导出，并加入 ci

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 squeeze

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix softmax

* feat: 导出 clip 和 transpose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持 Conv 的 bias

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: bias of conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导入 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 导出 split

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: conv group

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: matmul 的 bias 没有放在输入里，修正

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix exmaple

* fix: 改正 reduce_mean 导出

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* refactor: 修改 slice 实现与 onnx 一致

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* style: 不导出两个 runtime 函数

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 中文使用指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* doc: 补全指南

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: 修复导入数据的问题

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 添加 Dropout 基本结构，但不支持两个输出是不同的类型

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 重新导出优化接口

feat: dropout 导入
Signed-off-by: YdrMaster <ydrml@hotmail.com>

* build: BANG 选项加入 Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fxi code, change of test/kernels/bang/test* is use NativeCpuRuntime.
chaneg of include/bang/bang_runtime is for the cntoolkit upgrade.

* feat: 导出 bang runtime

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add USE_BANG=1

* fix matmul

* fix reshape

* fix

* fix activation

* fix transpose

* format

* format

* update Makefile

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add prelu on mlu

* fix: ConvTranspose

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* feat: 支持导入导出 PRelu

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* add convtrans on mlu

* fmt

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* docs: 更新 README_CN.md

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix code by review suggestions

* style

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix: Softmax 的 axis 可以用默认值？感觉是 onnx 不标准

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix cuda & intelcpu bugs after merging

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: wanghailu <wanghailu0717@163.com>
Co-authored-by: wanghailu <wanghailu@qiyuanlab.com>
Co-authored-by: whjthu <haojie0429@gmail.com>
											
										
										
											2023-04-18 15:10:33 +08:00
+								    inline void optimize() { g->optimize(); }
-												support Dynamic tensor infer shape and fix memory pool (#176)

* feat: support dynamic tensor part1

* feat: support dynamic-tensor part2

* feat: support dynamic tensor part 3

* fix: fix some ..

* - add kvcache example

* feat: support concat to identity kernel

* add a simple mempory pool for allocator

* fix: rebase to master

* fix bug after merging

* - remove outdated script

* fix: fix as review

---------

Co-authored-by: kilinchange <kilinchange@163.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-11-23 13:11:50 +08:00
+								    inline void shape_infer() { g->shape_infer(); }
 								    void change_shape(const vector<int> &shape, int tensorId);
-												feat: 导出分配内存和运行推理的接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-23 11:08:00 +08:00
+								    //------ runtime
-												support Dynamic tensor infer shape and fix memory pool (#176)

* feat: support dynamic tensor part1

* feat: support dynamic-tensor part2

* feat: support dynamic tensor part 3

* fix: fix some ..

* - add kvcache example

* feat: support concat to identity kernel

* add a simple mempory pool for allocator

* fix: rebase to master

* fix bug after merging

* - remove outdated script

* fix: fix as review

---------

Co-authored-by: kilinchange <kilinchange@163.com>
Co-authored-by: Haojie Wang <haojie0429@gmail.com>
											
										
										
											2023-11-23 13:11:50 +08:00
+								    inline void data_malloc(bool useNaiveAllocator = false,
 								                            size_t memPoolSize = 0) {
 								        g->dataMalloc(useNaiveAllocator, memPoolSize);
 								    }
 								    inline Tensor clone_KV(Tensor &tensor) { return g->cloneKV(tensor); }
 								    inline void free_heap() { g->freeHeap(); }
-												feat: 导出分配内存和运行推理的接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-23 11:08:00 +08:00
-												fix ReduceMean and element_wise (#90)

* feat: 导出 getPerfTime 到 python

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix parsing of ReduceMean

* ReduceMean axes defaults to None

* fix ElementWiseCudnn with shape broadcasting

* fix format

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: YdrMaster <ydrml@hotmail.com>
											
										
										
											2023-06-29 07:15:07 +08:00
+								    inline void tune() { g->getRuntime()->run(g, true); }
-												feat: 导出分配内存和运行推理的接口

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-23 11:08:00 +08:00
+								    inline void run() { g->getRuntime()->run(g); }
-												fix ReduceMean and element_wise (#90)

* feat: 导出 getPerfTime 到 python

Signed-off-by: YdrMaster <ydrml@hotmail.com>

* fix parsing of ReduceMean

* ReduceMean axes defaults to None

* fix ElementWiseCudnn with shape broadcasting

* fix format

---------

Signed-off-by: YdrMaster <ydrml@hotmail.com>
Co-authored-by: YdrMaster <ydrml@hotmail.com>
											
										
										
											2023-06-29 07:15:07 +08:00
 								    inline double get_perf_time() { return g->getRuntime()->getPerfTime(g); }
-												[feature] add cudagraph support (#215)

* [feature] add cudagraph support

* modify code to pass the cuda_all_reduce test
											
										
										
											2024-02-21 14:00:25 +08:00
 								#ifdef USE_CUDA
 								    inline void run_with_cudagraph() {
 								        (as<CudaRuntimeObj>(g->getRuntime()))->runWithCudaGraph(g);
 								    }
 								#endif
-												feat: 创建 pyinfinitensor 前端

- python 前端项目结构及打包和安装脚本
- 后端编译出 so 改名为 backend，增加 GraphHandler 修改图结构
- ci 支持测试这些功能

Signed-off-by: YdrMaster <ydrml@hotmail.com>

											
										
										
											2023-02-12 08:23:49 +08:00
+								};
 								} // namespace infini