# InfiniTensor/examples/distributed/parallel.py
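# Added summary, inferred from the code below.
"""Shard an ONNX model's Gemm weights for tensor parallelism.

For each Gemm whose weight (and optional bias) lives in the graph
initializers, parallel_model() keeps only the slice owned by tp_rank and
inserts an AllGather + Concat pair so every rank reassembles the full output.
"""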

import onnx
from onnx import (
    ModelProto,
    TensorProto,
    NodeProto,
    AttributeProto,
)
from onnx import helper, numpy_helper
from typing import Any, Dict, Optional


def parse_attribute(
    node: NodeProto, attrs: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    # Overwrite the caller-supplied defaults with the values actually set on
    # the node; attribute names missing from `attrs` are deliberately ignored.
    if attrs is None:
        attrs = {}
    for attr in node.attribute:
        if attr.name in attrs:
            if attr.type == AttributeProto.INT:
                attrs[attr.name] = attr.i
            elif attr.type == AttributeProto.INTS:
                attrs[attr.name] = attr.ints
            elif attr.type == AttributeProto.FLOAT:
                attrs[attr.name] = attr.f
            elif attr.type == AttributeProto.STRING:
                attrs[attr.name] = attr.s
            elif attr.type == AttributeProto.TENSOR:
                attrs[attr.name] = attr.t
            else:
                raise ValueError(f"Unsupported attribute type: {attr.type}")
    return attrs
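
# Illustrative example (the node `n` is constructed here only for
# illustration): helper.make_node stores extra keyword arguments as node
# attributes, so parse_attribute overwrites the supplied defaults with
# whatever the node actually carries:
#
#     n = helper.make_node("Gemm", ["A", "B"], ["Y"], transB=1)
#     parse_attribute(n, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0})
#     # -> {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 1}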


def parallel_model(model: ModelProto, tp_world_size: int = 1, tp_rank: int = 0):
    """Return the tp_rank-th of tp_world_size shards of `model`.

    Gemm weights held as initializers are split across ranks; the nodes of
    `model` are modified in place.
    """
    data = {init.name: init for init in model.graph.initializer}
    nodes = list(model.graph.node)

    def shard_tensor(tensor: TensorProto, dim: int):
        array = numpy_helper.to_array(tensor)
        # Clamp `dim` for tensors with fewer dimensions (e.g. a 1-D bias).
        if dim >= array.ndim:
            dim = array.ndim - 1
        assert array.shape[dim] % tp_world_size == 0, "tensor is not evenly shardable"
        seg = array.shape[dim] // tp_world_size
        # Keep only this rank's slice along `dim`.
        array = array.take(range(tp_rank * seg, (tp_rank + 1) * seg), axis=dim)
        return numpy_helper.from_array(array, name=tensor.name + f":sharded({dim})")

    def shard_gemm(node: NodeProto):
        attrs = parse_attribute(
            node, {"alpha": 1.0, "beta": 1.0, "transA": 0, "transB": 0}
        )
        trans = [attrs["transA"], attrs["transB"]]
        dim = 0
        for i, (input, t) in enumerate(zip(node.input, trans)):
            if input in data:
                dim = i
                # XOR with the transpose flag selects the non-reduction
                # dimension, so the split never crosses the contraction axis.
                sharded = shard_tensor(data[input], dim ^ t)
                node.input[i] = sharded.name
                # The sharded initializer replaces the original under its old
                # key; only data.values() is used when rebuilding the graph.
                data[input] = sharded
        if len(node.input) > 2:
            # Shard the bias along the same logical dimension.
            input = node.input[2]
            sharded = shard_tensor(data[input], dim)
            node.input[2] = sharded.name
            data[input] = sharded
        node.output[0] += f":sharded({dim})"
        return dim

    for i, node in enumerate(nodes):
        if node.op_type == "Gemm":
            # Remember the original output name before shard_gemm renames it.
            output = node.output[0]
            dim = shard_gemm(node)
            gathered = [node.output[0] + f".{r}" for r in range(tp_world_size)]
            # All-gather every rank's partial result. The inserted nodes are
            # revisited by the enumerate loop but skipped, as they are not Gemm.
            nodes.insert(
                i + 1,
                helper.make_node(
                    op_type="AllGather",
                    inputs=[node.output[0]],
                    outputs=gathered,
                    name=node.name + "/allgather",
                    # domain="infini",  # shape inference fails for a custom domain
                ),
            )
            # Concatenate the gathered pieces along the sharded axis, restoring
            # the original output name for downstream consumers.
            nodes.insert(
                i + 2,
                helper.make_node(
                    op_type="Concat",
                    inputs=gathered,
                    outputs=[output],
                    name=node.name + "/concat",
                    axis=dim,
                ),
            )

    graph = helper.make_graph(
        nodes,
        model.graph.name + f"_{tp_rank}",
        model.graph.input,
        model.graph.output,
        data.values(),
        doc_string=model.graph.doc_string,
        value_info=model.graph.value_info,
    )
    model = helper.make_model(graph)
    # infer_shapes returns a new ModelProto rather than modifying its argument.
    model = onnx.shape_inference.infer_shapes(model)
    return model
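

# A minimal usage sketch, assuming an input file named "model.onnx" (the path
# and world size below are illustrative). Because parallel_model mutates the
# nodes of the model it receives, each rank reloads a fresh copy.
if __name__ == "__main__":
    world_size = 2
    for rank in range(world_size):
        sharded = parallel_model(onnx.load("model.onnx"), world_size, rank)
        onnx.save(sharded, f"model_rank{rank}.onnx")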