from typing import Dict, List, Union

import numpy as np
import onnx.numpy_helper as onph
from google.protobuf.internal.containers import (
    RepeatedCompositeFieldContainer,
    RepeatedScalarFieldContainer,
)
from onnx.onnx_pb import AttributeProto, GraphProto, ModelProto, NodeProto, TensorProto

from nodes.log import logger

from ..ncnn.model import (
    DTYPE_FP16,
    DTYPE_FP32,
    BinaryOpTypes,
    EltwiseOpTypes,
    GruDirectionFlags,
    InterpResizeTypes,
    NcnnLayer,
    NcnnModel,
    NormalizeEpsModes,
    PaddingTypes,
    PadModes,
    PermuteOrderTypes,
    ReductionOpTypes,
    UnaryOpTypes,
)
from ..ncnn.optimizer import NcnnOptimizer
from .tensorproto_utils import *

UOT = UnaryOpTypes
BOT = BinaryOpTypes
EOT = EltwiseOpTypes
GRU = GruDirectionFlags
IRT = InterpResizeTypes
NEM = NormalizeEpsModes
PAM = PadModes
PAT = PaddingTypes
POT = PermuteOrderTypes
ROT = ReductionOpTypes


class Onnx2NcnnConverter:
    def __init__(self, onnx_model: ModelProto):
        self.onnx_graph: GraphProto = onnx_model.graph
        self.mutable_graph_nodes: List[NodeProto] = list(self.onnx_graph.node)
        self.node_count: int = len(self.onnx_graph.node)
        self.weights: Dict[str, TensorProto] = {
            initializer.name: initializer
            for initializer in self.onnx_graph.initializer
        }

        self.producers: Dict[str, None] = {i.name: None for i in self.onnx_graph.input}
        self.node_reference: Dict[str, int] = {}
        self.blob_names: Dict[str, None] = {}

    @staticmethod
    def add_weight(
        layer: NcnnLayer,
        weight_name: str,
        data: Union[float, int, np.ndarray, TensorProto],
        quantize_tag: bytes = b"",
    ) -> int:
        if isinstance(data, TensorProto):
            data = onph.to_array(data)
        return layer.add_weight(weight_name, data, quantize_tag)

    @staticmethod
    def clear_container(
        container: Union[
            RepeatedCompositeFieldContainer, RepeatedScalarFieldContainer
        ],
    ) -> None:
        for _ in range(len(container)):
            container.pop()

    def swap_nodes(self, a: int, b: int) -> None:
        self.mutable_graph_nodes[a], self.mutable_graph_nodes[b] = (
            self.mutable_graph_nodes[b],
            self.mutable_graph_nodes[a],
        )

    def fuse_rewrite_gather(self) -> None:
        for gather in self.mutable_graph_nodes:
            if gather.op_type == "Gather":
                indices = get_node_attr_from_input_ai(self.weights[gather.input[1]])
                if len(indices) == 1:
                    # Reconstruct node connections
                    self.node_reference[gather.input[1]] -= 1
                    origin_inp = gather.input[0]
                    gather.ClearField("input")
                    gather.input.append(origin_inp)

                    # Update axis, starts and ends
                    axis = get_node_attr_i(gather, "axis", 1)

                    gather.op_type = "Crop"
                    gather.ClearField("attribute")

                    index = indices[0]
                    set_node_attr_ai(gather, "starts", np.array([index], np.int32))
                    set_node_attr_ai(gather, "ends", np.array([index + 1], np.int32))
                    set_node_attr_ai(gather, "axis", np.array([axis], np.int32))

    def fuse_weight_reshape(self, reduced_node_count: List[int]) -> None:
        for i in range(self.node_count):
            node = self.mutable_graph_nodes[i]

            if node.op_type == "Reshape":
                if node.input[0] in self.weights:
                    self.weights[node.output[0]] = self.weights[node.input[0]]
                    if len(node.input) == 1:
                        shape = get_node_attr_ai(node, "shape")
                    elif len(node.input) == 2:
                        shape = get_node_attr_from_input_ai(
                            self.weights[node.input[1]]
                        )
                    else:
                        shape = np.empty(0, np.int64)

                    self.clear_container(self.weights[node.output[0]].dims)
                    for dim in shape:
                        self.weights[node.output[0]].dims.append(dim)

                    node.op_type = "noop_reducedncnn"

                    self.node_reference[node.input[0]] -= 1
                    if len(node.input) == 2:
                        self.node_reference[node.input[1]] -= 1

                    reduced_node_count[0] += 1
                    i += 1
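    # Illustrative sketch (not part of the converter): fuse_weight_reshape folds a
    # Reshape whose input is an initializer straight into that initializer, e.g.
    #
    #   w = TensorProto(dims=[64, 3, 3, 3], ...)   # hypothetical weight tensor
    #   Reshape(w, shape=(64, 27))                 # node becomes "noop_reducedncnn"
    #   w.dims -> [64, 27]                         # dims rewritten in place
    #
    # so later passes see the reshaped weight without an extra graph node.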
    def fuse_weight_transpose(self, reduced_node_count: List[int]) -> None:
        for i in range(self.node_count):
            node = self.mutable_graph_nodes[i]

            if node.op_type == "Transpose":
                if (
                    node.input[0] in self.weights
                    and len(self.weights[node.input[0]].dims) == 2
                ):
                    perm = get_node_attr_ai(node, "perm")
                    if perm.size != 2 or perm[0] != 1 or perm[1] != 0:
                        continue

                    self.weights[node.output[0]] = self.weights[node.input[0]]

                    # Permute weight
                    B = self.weights[node.output[0]]
                    h, w = B.dims[:2]
                    permuted_data = onph.to_array(B).T
                    B.dims[:2] = (w, h)

                    if B.raw_data:
                        B.raw_data = permuted_data.tobytes()
                    else:
                        self.clear_container(B.float_data)
                        B.float_data.extend(permuted_data)

                    # Reduce
                    node.op_type = "noop_reducedncnn"
                    self.node_reference[node.input[0]] -= 1

                    reduced_node_count[0] += 1
                    i += 1

    def fuse_shufflechannel(self, reduced_node_count: List[int]) -> None:
        for i in range(self.node_count):
            node = self.mutable_graph_nodes[i]

            # ShuffleChannel <= Reshape - Transpose - Reshape
            # ShuffleChannel <= Reshape - Transpose - Constant - Reshape
            if node.op_type == "Reshape":
                if self.node_reference[node.output[0]] != 1:
                    continue

                if len(node.input) == 1:
                    shape = get_node_attr_ai(node, "shape")
                else:
                    # Skip weight reshape
                    if node.input[1] not in self.weights:
                        continue
                    shape = get_node_attr_from_input_ai(self.weights[node.input[1]])

                # 1 groups channels_per_group, height, width
                # reverse style = channels_per_group, groups, height * width
                if (shape.size != 5 and shape.size != 3) or (
                    shape.size == 5 and shape[0] != 1
                ):
                    continue
                if i + 2 >= self.node_count:
                    continue

                node2 = self.mutable_graph_nodes[i + 1]
                node3 = self.mutable_graph_nodes[i + 2]

                if node3.op_type == "Constant":
                    if i + 3 >= self.node_count:
                        continue
                    node3 = self.mutable_graph_nodes[i + 3]

                if (node2.op_type != "Transpose" or node3.op_type != "Reshape") or (
                    self.node_reference[node2.output[0]] != 1
                ):
                    continue

                # 0 2 1 3 4
                # reverse style = 1 0 2
                perm = get_node_attr_ai(node2, "perm")
                if perm.size != 5 and perm.size != 3:
                    continue
                if perm.size == 5 and (
                    perm[0] != 0
                    or perm[1] != 2
                    or perm[2] != 1
                    or perm[3] != 3
                    or perm[4] != 4
                ):
                    continue
                if perm.size == 3 and (perm[0] != 1 or perm[1] != 0 or perm[2] != 2):
                    continue

                if len(node3.input) == 1:
                    shape3 = get_node_attr_ai(node3, "shape")
                else:
                    if node3.input[1] not in self.weights:
                        continue
                    shape3 = get_node_attr_from_input_ai(self.weights[node3.input[1]])

                # 1, -1, height, width
                # reverse style = group, -1, channels_per_group, height, width
                if shape3.size != 4 and shape3.size != 5:
                    continue
                if shape3.size == 4 and (
                    shape3[0] != 1
                    or (shape3[1] != -1 and shape3[1] != shape[1] * shape[2])
                ):
                    continue
                if shape3.size == 5 and (
                    shape3[0] != shape[1]
                    or shape3[2] != shape[0]
                    or shape3[3] * shape3[4] != shape[2]
                ):
                    continue

                # Reduce
                node.op_type = "noop_reducedncnn"
                node2.op_type = "noop_reducedncnn"

                if len(node.input) == 2:
                    self.node_reference[node.input[1]] -= 1
                self.node_reference[node.output[0]] -= 1
                self.node_reference[node2.output[0]] -= 1
                if len(node3.input) == 2:
                    self.node_reference[node3.input[1]] -= 1

                self.blob_names.pop(node.output[0], None)
                self.blob_names.pop(node2.output[0], None)

                node3.op_type = "ShuffleChannel"
                node3.input[0] = node.input[0]

                attr_group = AttributeProto(name="group", i=shape[1], type=APT.INT)
                node3.attribute.append(attr_group)

                attr_reverse = AttributeProto(
                    name="reverse", i=int(shape.size == 3), type=APT.INT
                )
                node3.attribute.append(attr_reverse)

                reduced_node_count[0] += 2
                i += 2

    def fuse_shufflechannel_split(self, reduced_node_count: List[int]) -> None:
        for i in range(self.node_count):
            node = self.mutable_graph_nodes[i]

            # Split <= ShuffleChannel(reverse type) - Gather(0) - Gather(1)
            if
node.op_type == "ShuffleChannel": # reverse = 1 reverse = get_node_attr_i(node, "reverse") if reverse != 1 or (i + 2 >= self.node_count): continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] if node2.op_type != "Gather" or node3.op_type != "Gather": continue if node2.input[0] != node.output[0] or node3.input[0] != node.output[0]: continue # axis = 0 or indices = 0 gather2_axis = get_node_attr_i(node2, "axis") if gather2_axis != 0 or node2.input[1] not in self.weights: continue gather2_indices = get_node_attr_from_input_ai( self.weights[node2.input[1]] ) if gather2_indices.size != 1 or gather2_indices[0] != 0: continue # axis = 0 or indices = 1 gather3_axis = get_node_attr_i(node3, "axis") if gather3_axis != 0 or node3.input[1] not in self.weights: continue gather3_indices = get_node_attr_from_input_ai( self.weights[node3.input[1]] ) if gather3_indices.size != 1 or gather2_indices[0] != 1: continue # reduce node2.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 2 self.node_reference[node2.input[1]] -= 1 self.node_reference[node3.input[1]] -= 1 node3.op_type = "Split" node3.ClearField("input") node3.input.append(node.output[0]) node3.output.append(node3.output[0]) node3.output[0] = node2.output[0] node3.ClearField("attribute") attr_axis = AttributeProto(name="axis", i=1, type=APT.INT) node3.attribute.append(attr_axis) reduced_node_count[0] += 1 i += 1 def fuse_hardswish(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # HardSwish <= Add(+3) - Clip(0, 6) - Mul(X, ) - Div( / 6) # HardSwish <= Add(+3) - Clip(0, 6) - Mul(X, ) - Mul(*(1 / 6)) # HardSwish <= Add(+3) - Clip(0, 6) - Mul(X, ) - Constant - Div( / 6) # HardSwish <= Add(+3) - Clip(0, 6) - Mul(X, ) - Constant - Mul(*(1 / 6)) # out = x * F.relu6(x + 3, inplace=True) / 6 if node.op_type == "Add": if ( self.node_reference[node.output[0]] != 1 or i + 3 >= self.node_count or node.input[1] not in self.weights ): continue add_three = self.weights[node.input[1]] if ( len(add_three.dims) != 0 or get_tensor_proto_data_size(add_three, add_three.data_type) != 1 ): continue constant_add_three = get_node_attr_from_input_f(add_three) if constant_add_three != 3: continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] node4 = self.mutable_graph_nodes[i + 3] if node4.op_type == "Constant": if i + 4 >= self.node_count: continue node4 = self.mutable_graph_nodes[i + 4] if ( node2.op_type != "Clip" or node3.op_type != "Mul" or (node4.op_type != "Div" and node4.op_type != "Mul") ): continue if self.node_reference[node2.output[0]] != 1: continue if len(node2.input) == 1: relu6_min = get_node_attr_f(node2, "min", -FLOAT32_MAX) relu6_max = get_node_attr_f(node2, "max", FLOAT32_MAX) else: min_tp = self.weights[node2.input[1]] max_tp = self.weights[node2.input[2]] relu6_min = get_node_attr_from_input_f(min_tp) relu6_max = get_node_attr_from_input_f(max_tp) if relu6_min != 0 or relu6_max != 6: continue if self.node_reference[node3.output[0]] != 1: continue if node3.input[0] != node.input[0] or node3.input[1] != node2.output[0]: continue if node4.input[1] not in self.weights: continue div_six = self.weights[node4.input[1]] if ( len(div_six.dims) != 0 or get_tensor_proto_data_size(div_six, div_six.data_type) != 1 ): continue constant_div_six = get_node_attr_from_input_f(div_six) if (node4.op_type == "Div" and constant_div_six != 6) or ( node4.op_type == "Mul" and constant_div_six != 1 / 6 ): continue # reduce 
node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" self.node_reference[node.input[0]] -= 1 self.node_reference[node.input[1]] -= 1 self.node_reference[node.output[0]] -= 1 if len(node2.input) == 3: self.node_reference[node2.input[1]] -= 1 self.node_reference[node2.input[2]] -= 1 self.node_reference[node2.output[0]] -= 1 self.node_reference[node3.output[0]] -= 1 self.node_reference[node4.input[1]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) self.blob_names.pop(node3.output[0], None) node4.op_type = "HardSwish" node4.ClearField("input") node4.input.append(node.input[0]) attr_alpha = AttributeProto(name="alpha", f=1 / 6, type=APT.FLOAT) node4.attribute.append(attr_alpha) attr_beta = AttributeProto(name="beta", f=0.5, type=APT.FLOAT) node4.attribute.append(attr_beta) reduced_node_count[0] += 3 i += 3 for i in range(self.node_count): node = self.mutable_graph_nodes[i] # HardSwish <= HardSigmoid - Mul # out = x * hsigmoid(x) if node.op_type == "HardSigmoid": if self.node_reference[node.output[0]] != 1: continue alpha = get_node_attr_f(node, "alpha", 0.2) beta = get_node_attr_f(node, "beta", 0.5) if i + 1 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] if node2.op_type != "Mul": continue if node2.input[0] != node.input[0] or node2.input[1] != node.output[0]: continue # reduce node.op_type = "noop_reducedncnn" self.node_reference[node.input[0]] -= 1 self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) node2.op_type = "HardSwish" node2.ClearField("input") node2.input.append(node.input[0]) attr_alpha = AttributeProto(name="alpha", f=alpha, type=APT.FLOAT) node2.attribute.append(attr_alpha) attr_beta = AttributeProto(name="beta", f=beta, type=APT.FLOAT) node2.attribute.append(attr_beta) reduced_node_count[0] += 1 i += 1 def fuse_hardsigmoid(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # HardSigmoid <= Add(+3) - Clip(0, 6) - Div( / 6) # HardSigmoid <= Add(+3) - Clip(0, 6) - Mul(*(1 / 6)) # HardSigmoid <= Add(+3) - Clip(0, 6) - Constant - Div( / 6) # HardSigmoid <= Add(+3) - Clip(0, 6) - Constant - Mul(*(1 / 6)) # out = F.relu6(x + 3, inplace=True) / 6 if node.op_type == "Add": if ( self.node_reference[node.output[0]] != 1 or i + 2 >= self.node_count or node.input[1] not in self.weights ): continue add_three = self.weights[node.input[1]] if ( len(add_three.dims) != 0 or get_tensor_proto_data_size(add_three, add_three.data_type) != 1 ): continue constant_add_three = self.weights[node.input[1]] if constant_add_three != 3: continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] if node3.op_type == "Constant": if i + 3 >= self.node_count: continue node3 = self.mutable_graph_nodes[i + 3] if node2.op_type != "Clip" or ( node3.op_type != "Div" and node3.op_type != "Mul" ): continue if self.node_reference[node2.output[0]] != 1: continue if len(node2.input) == 1: relu6_min = get_node_attr_f(node2, "min", -FLOAT32_MAX) relu6_max = get_node_attr_f(node2, "max", FLOAT32_MAX) else: min_tp = self.weights[node2.input[1]] max_tp = self.weights[node2.input[2]] relu6_min = get_node_attr_from_input_f(min_tp) relu6_max = get_node_attr_from_input_f(max_tp) if relu6_min != 0 or relu6_max != 6: continue if node3.input[1] not in self.weights: continue div_six = self.weights[node3.input[1]] if ( len(div_six.dims) != 0 or get_tensor_proto_data_size(div_six, div_six.data_type) != 1 
): continue constant_div_six = get_node_attr_from_input_f(div_six) if (node3.op_type == "Div" and constant_div_six != 6) or ( node3.op_type == "Mul" and constant_div_six != 1 / 6 ): continue # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" self.node_reference[node.input[1]] -= 1 self.node_reference[node.output[0]] -= 1 if len(node2.input) == 3: self.node_reference[node2.input[1]] -= 1 self.node_reference[node2.input[2]] -= 1 self.node_reference[node2.output[0]] -= 1 self.node_reference[node3.input[1]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) node3.op_type = "HardSigmoid" node3.ClearField("input") node3.input.append(node.input[0]) attr_alpha = AttributeProto(name="alpha", f=1 / 6, type=APT.FLOAT) node3.attribute.append(attr_alpha) attr_beta = AttributeProto(name="beta", f=0.5, type=APT.FLOAT) node3.attribute.append(attr_beta) reduced_node_count[0] += 2 i += 2 def fuse_swish(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # Swish <= Sigmoid - Mul # x * torch.sigmoid(x) if node.op_type == "Sigmoid": if self.node_reference[node.output[0]] != 1 or i + 1 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] if node2.op_type != "Mul": continue if node2.input[0] != node.input[0] or node2.input[1] != node.output[0]: continue # reduce node.op_type = "noop_reducedncnn" self.node_reference[node.input[0]] -= 1 self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) node2.op_type = "Swish" node2.ClearField("input") node2.input.append(node.input[0]) reduced_node_count[0] += 1 i += 1 def fuse_batchnorm1d_squeeze_unsqueeze(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # BatchNormalization <= Unsqueeze - BatchNormalization - Squeeze if node.op_type == "Unsqueeze": if self.node_reference[node.output[0]] != 1 or i + 2 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] if node2.op_type != "BatchNormalization" or node3.op_type != "Squeeze": continue if self.node_reference[node2.output[0]] != 1: continue if ( node2.input[0] != node.output[0] or node3.input[0] != node2.output[0] ): continue # reduce node.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 self.node_reference[node2.output[0]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) node2.input[0] = node.input[0] node2.output[0] = node3.output[0] reduced_node_count[0] += 2 i += 2 def fuse_unsqueeze_prelu(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # PReLU <= Unsqueeze - PReLU if node.op_type == "Unsqueeze": # check weight if node.input[0] not in self.weights: continue B = self.weights[node.input[0]] if len(B.dims) != 1: continue if self.node_reference[node.output[0]] != 1: continue # axes = (1, 2) axes = get_node_attr_ai(node, "axes") if axes.size != 2 or axes[0] != 1 or axes[1] != 2: continue if i + 1 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] if node2.op_type != "PRelu" or node2.input[1] != node.output[0]: continue # reduce node.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) node2.input[1] = node.input[0] reduced_node_count[0] += 1 i += 1 def fuse_normalize(self, reduced_node_count: List[int]) -> 
None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # Normalize <= X - ReduceL2 - Clip - Expand - Div # Normalize <= X - ReduceL2 - Clip - Shape - Expand - Div if node.op_type == "ReduceL2": if self.node_reference[node.output[0]] != 1: continue # axes = (1) axes = get_node_attr_ai(node, "axes") if len(axes) != 1 or axes[0] != 1 or i + 3 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] node4 = self.mutable_graph_nodes[i + 3] has_shape_node = node3.op_type == "Shape" node_shape = NodeProto() if has_shape_node: if i + 4 >= self.node_count: continue node_shape = node3 node3 = self.mutable_graph_nodes[i + 3] node4 = self.mutable_graph_nodes[i + 4] if ( node2.op_type != "Clip" or node3.op_type != "Expand" or node4.op_type != "Div" ): continue if ( self.node_reference[node2.output[0]] != 1 or self.node_reference[node3.output[0]] != 1 ): continue if ( node2.input[0] != node.output[0] or node3.input[0] != node2.output[0] or node4.input[0] != node.input[0] or node4.input[1] != node3.output[0] ): continue if has_shape_node and ( node_shape.input[0] != node.input[0] or node3.input[1] != node_shape.output[0] ): continue # +eps if len(node2.input) == 1: clip_min = get_node_attr_f(node2, "min", -FLOAT32_MAX) else: min_tp = self.weights[node2.input[1]] clip_min = get_node_attr_from_input_f(min_tp) # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" if has_shape_node: node_shape.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" self.node_reference[node.input[0]] -= 2 if has_shape_node else 1 self.node_reference[node.output[0]] -= 1 self.node_reference[node2.output[0]] -= 1 if has_shape_node: self.node_reference[node_shape.output[0]] -= 1 self.node_reference[node3.output[0]] -= 1 if len(node3.input) == 2: self.node_reference[node3.input[1]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) if has_shape_node: self.blob_names.pop(node_shape.output[0], None) self.blob_names.pop(node3.output[0], None) node4.op_type = "Normalize" node4.ClearField("input") node4.input.append(node.input[0]) attr_alpha = AttributeProto(name="eps", f=clip_min, type=APT.FLOAT) node4.attribute.append(attr_alpha) reduced_node_count[0] += 4 if has_shape_node else 3 i += 4 if has_shape_node else 3 def fuse_groupnorm(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # GroupNorm <= X - Reshape - InstanceNormalization - Reshape - Mul - Add if node.op_type == "Reshape": if self.node_reference[node.output[0]] != 1: continue if len(node.input) == 1: shape = get_node_attr_ai(node, "shape") else: # Skip weight reshape if node.input[1] not in self.weights: continue shape = get_node_attr_from_input_ai(self.weights[node.input[1]]) # 0, group, -1 if ( shape.size != 3 or shape[0] != 0 or shape[2] != -1 or i + 4 >= self.node_count ): continue groups = shape[1] node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] node4 = self.mutable_graph_nodes[i + 3] node5 = self.mutable_graph_nodes[i + 4] if ( node2.op_type != "InstanceNormalization" or node3.op_type != "Reshape" or node4.op_type != "Mul" or node5.op_type != "Add" ): continue if ( self.node_reference[node2.output[0]] != 1 or self.node_reference[node3.output[0]] != 1 or self.node_reference[node4.output[0]] != 1 ): continue if ( node2.input[0] != node.output[0] or node3.input[0] != node2.output[0] or node4.input[0] != node3.output[0] or node5.input[0] 
!= node4.output[0] ): continue # InstanceNormalization S=1 B=0 S = get_node_attr_from_input_af(self.weights[node2.input[1]]) B = get_node_attr_from_input_af(self.weights[node2.input[2]]) if S.size != groups or B.size != groups: continue if np.any(S != 1) or np.any(B != 0): continue if len(node3.input) == 1: shape2 = get_node_attr_ai(node3, "shape") else: # Skip weight reshape if node3.input[1] not in self.weights: continue shape2 = get_node_attr_from_input_ai(self.weights[node3.input[1]]) # 1, channels, w, h if shape2.size != 4 or shape2[0] != 1: continue channels = shape2[1] # affine affine_S = get_node_attr_from_input_af(self.weights[node4.input[1]]) affine_B = get_node_attr_from_input_af(self.weights[node5.input[1]]) if affine_S.size != channels and affine_B.size != channels: continue # only per-channel affine allowed # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" node4.op_type = "noop_reducedncnn" if len(node.input) == 2: self.node_reference[node.input[1]] -= 1 self.node_reference[node.output[0]] -= 1 self.node_reference[node2.input[1]] -= 1 self.node_reference[node2.input[2]] -= 1 self.node_reference[node2.output[0]] -= 1 if len(node3.input) == 2: self.node_reference[node3.input[1]] -= 1 self.node_reference[node3.output[0]] -= 1 self.node_reference[node4.output[0]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) self.blob_names.pop(node3.output[0], None) self.blob_names.pop(node4.output[0], None) affine_scale = node4.input[1] affine_bias = node5.input[1] node5.op_type = "GroupNorm" node5.ClearField("input") node5.input.append(node.input[0]) node5.input.append(affine_scale) node5.input.append(affine_bias) attr_groups = AttributeProto(name="groups", i=groups, type=APT.INT) node5.attribute.append(attr_groups) attr_channels = AttributeProto( name="channels", i=channels, type=APT.INT ) node5.attribute.append(attr_channels) # +eps eps = get_node_attr_f(node2, "epsilon", 0.00001) attr_eps = AttributeProto(name="epsilon", f=eps, type=APT.FLOAT) node5.attribute.append(attr_eps) attr_affine = AttributeProto(name="affine", i=1, type=APT.INT) node5.attribute.append(attr_affine) reduced_node_count[0] += 4 i += 4 def fuse_layernorm(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div # LayerNorm <= X - ReduceMean - Sub - Pow - ReduceMean - Add - Sqrt - Div - Mul - Add if node.op_type == "ReduceMean": if self.node_reference[node.output[0]] != 1: continue axes = get_node_attr_ai(node, "axes") # -1 # -2 -1 if axes.size != 1 and axes.size != 2: continue if (axes.size == 1 and axes[0] != -1) or ( axes.size == 2 and (axes[0] != -2 or axes[1] != -1) ): continue if i + 6 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] node4 = self.mutable_graph_nodes[i + 3] node5 = self.mutable_graph_nodes[i + 4] node6 = self.mutable_graph_nodes[i + 5] node7 = self.mutable_graph_nodes[i + 6] if node2.op_type != "Sub" or node3.op_type != "Pow": continue if ( self.node_reference[node2.output[0]] != 2 or self.node_reference[node3.output[0]] != 1 or self.node_reference[node4.output[0]] != 1 or self.node_reference[node5.output[0]] != 1 or self.node_reference[node6.output[0]] != 1 ): continue if ( node2.input[0] != node.output[0] or node2.input[1] != node.output[0] or node3.input[0] != node2.output[0] or node4.input[0] != 
node3.output[0] or node5.input[0] != node4.output[0] or node6.input[0] != node5.output[0] or node7.input[0] != node2.output[0] or node7.input[1] != node6.output[0] ): continue if node3.input[1] not in self.weights: continue pow_two = self.weights[node3.input[1]] if ( len(pow_two.dims) != 0 or get_tensor_proto_data_size(pow_two, pow_two.data_type) != 1 ): continue constant_pow_two = get_node_attr_from_input_f(pow_two) if constant_pow_two != 2: continue axes4 = get_node_attr_ai(node4, "axes") # -1 # -2 -1 if axes4.size != axes.size: continue if (axes.size == 1 and axes[4] != -1) or ( axes.size == 2 and (axes4[0] != -2 or axes4[1] != -1) ): continue if node5.input[1] not in self.weights: continue add_eps = self.weights[node5.input[1]] if ( len(add_eps.dims) != 0 or get_tensor_proto_data_size(add_eps, add_eps.data_type) != 1 ): continue eps = get_node_attr_from_input_f(add_eps) affine = 0 while i + 8 < self.node_count: node8 = self.mutable_graph_nodes[i + 7] node9 = self.mutable_graph_nodes[i + 8] if node8.op_type != "Mul" or node9.op_type != "Add": break if ( self.node_reference[node7.output[0]] != 1 or self.node_reference[node8.output[0]] != 1 ): break if ( node8.input[0] != node7.output[0] or node9.input[0] != node8.output[0] ): break # affine affine_S = get_node_attr_from_input_af(self.weights[node8.input[1]]) affine_B = get_node_attr_from_input_af(self.weights[node9.input[1]]) if affine_S.size != affine_B.size: break affine = 1 break # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" node4.op_type = "noop_reducedncnn" node5.op_type = "noop_reducedncnn" node6.op_type = "noop_reducedncnn" self.node_reference[node2.input[0]] -= 1 self.node_reference[node2.input[1]] -= 1 self.node_reference[node3.input[0]] -= 1 self.node_reference[node3.input[1]] -= 1 self.node_reference[node4.input[0]] -= 1 self.node_reference[node5.input[0]] -= 1 self.node_reference[node5.input[1]] -= 1 self.node_reference[node6.input[0]] -= 1 self.node_reference[node7.input[0]] -= 1 self.node_reference[node7.input[1]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) self.blob_names.pop(node3.output[0], None) self.blob_names.pop(node4.output[0], None) self.blob_names.pop(node5.output[0], None) self.blob_names.pop(node6.output[0], None) attr_eps = AttributeProto(name="epsilon", f=eps, type=APT.FLOAT) attr_affine = AttributeProto(name="affine", i=affine, type=APT.INT) if affine == 0: node7.op_type = "LayerNorm" node7.ClearField("input") node7.input.append(node.input[0]) node7.attribute.append(attr_eps) node7.attribute.append(attr_affine) reduced_node_count[0] += 6 i += 6 else: # This is probably unnecessary on their part, but I'm paranoid node8 = self.mutable_graph_nodes[i + 7] node9 = self.mutable_graph_nodes[i + 8] node7.op_type = "noop_reducedncnn" node8.op_type = "noop_reducedncnn" self.node_reference[node8.input[0]] -= 1 self.node_reference[node9.input[0]] -= 1 self.blob_names.pop(node7.output[0], None) self.blob_names.pop(node8.output[0], None) affine_scale = node8.input[1] affine_bias = node9.input[1] node9.op_type = "LayerNorm" node9.ClearField("input") node9.input.append(node.input[0]) node9.input.append(affine_scale) node9.input.append(affine_bias) node9.attribute.append(attr_eps) node9.attribute.append(attr_affine) reduced_node_count[0] += 8 i += 8 def fuse_flatten(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # Flatten <= X - Shape - Gather - 
Constant - Unsqueeze - Unsqueeze - Concat - Reshape if node.op_type == "Shape": if self.node_reference[node.output[0]] != 1: continue if i + 6 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] node4 = self.mutable_graph_nodes[i + 3] node5 = self.mutable_graph_nodes[i + 4] node6 = self.mutable_graph_nodes[i + 5] node7 = self.mutable_graph_nodes[i + 6] if ( node2.op_type != "Gather" or node3.op_type != "Constant" or node4.op_type != "Unsqueeze" or node5.op_type != "Unsqueeze" or node6.op_type != "Concat" or node7.op_type != "Reshape" ): continue if ( self.node_reference[node2.output[0]] != 1 or self.node_reference[node4.output[0]] != 1 or self.node_reference[node5.output[0]] != 1 or self.node_reference[node6.output[0]] != 1 ): continue if ( node2.input[0] != node.output[0] or node4.input[0] != node2.output[0] or node5.input[0] != node3.output[0] or node6.input[0] != node4.output[0] or node6.input[1] != node5.output[0] or node7.input[0] != node.input[0] or node7.input[1] != node6.output[0] ): continue # axis = 0 gather_axis = get_node_attr_i(node2, "axis") if gather_axis != 0: continue # indices = 0 if node2.input[1] not in self.weights: continue gather_indices = get_node_attr_from_input_ai( self.weights[node2.input[1]] ) if gather_indices.size != 1 or gather_indices[0] != 0: continue # axes = (0) unsqueeze_axes = get_node_attr_ai(node4, "axes") if unsqueeze_axes.size != 1 or unsqueeze_axes[0] != 0: continue unsqueeze_axes2 = get_node_attr_ai(node5, "axes") if unsqueeze_axes2.size != 1 or unsqueeze_axes2[0] != 0: continue # data = -1 if node5.input[0] not in self.weights: continue unsqueeze2_data = get_node_attr_from_input_ai( self.weights[node5.input[0]] ) if unsqueeze2_data.size != 1 or unsqueeze2_data[0] != -1: continue # axis = 0 concat_axis = get_node_attr_i(node6, "axis") if concat_axis != 0: continue # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" node4.op_type = "noop_reducedncnn" node5.op_type = "noop_reducedncnn" node6.op_type = "noop_reducedncnn" self.node_reference[node.input[0]] -= 1 self.node_reference[node.output[0]] -= 1 self.node_reference[node2.input[1]] -= 1 self.node_reference[node2.output[0]] -= 1 self.node_reference[node4.output[0]] -= 1 self.node_reference[node5.input[0]] -= 1 self.node_reference[node5.output[0]] -= 1 self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) self.blob_names.pop(node4.output[0], None) self.blob_names.pop(node5.output[0], None) self.blob_names.pop(node6.output[0], None) node7.op_type = "Flatten" node7.ClearField("input") node7.input.append(node.input[0]) reduced_node_count[0] += 5 i += 5 def fuse_pixelshuffle(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # PixelShuffle <= Reshape - Transpose - Reshape # PixelShuffle <= Reshape - Transpose - Constant - Reshape if node.op_type == "Reshape": if self.node_reference[node.output[0]] != 1: continue if len(node.input) == 1: shape = get_node_attr_ai(node, "shape") else: # skip weight reshape if node.input[1] not in self.weights: continue shape = get_node_attr_from_input_ai(self.weights[node.input[1]]) # -1, 3, upscale_factor, upscale_factor, height, width if ( shape.size != 6 or (shape[0] != 1 and shape[0] != -1) or shape[2] != shape[3] or i + 2 >= self.node_count ): continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] if node3.op_type == 
"Constant": if i + 3 >= self.node_count: continue node3 = self.mutable_graph_nodes[i + 3] if node2.op_type != "Transpose" or node3.op_type != "Reshape": continue if self.node_reference[node2.output[0]] != 1: continue # 0 1 4 2 5 3 perm = get_node_attr_ai(node2, "perm") if ( perm.size != 6 or perm[0] != 0 or perm[1] != 1 or perm[2] != 4 or perm[3] != 2 or perm[4] != 5 or perm[5] != 3 ): continue if len(node3.input) == 1: shape3 = get_node_attr_ai(node3, "shape") else: if node3.input[1] not in self.weights: continue shape3 = get_node_attr_from_input_ai(self.weights[node3.input[1]]) # -1, 3, height, width if ( shape3.size != 4 or (shape3[0] != 1 and shape3[0] != -1) or shape3[1] != shape[1] or shape3[2] != shape[2] * shape[4] or shape3[3] != shape[3] * shape[5] ): continue # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" if len(node.input) == 2: self.node_reference[node.input[1]] -= 1 self.node_reference[node.output[0]] -= 1 self.node_reference[node2.output[0]] -= 1 if len(node3.input) == 2: self.node_reference[node3.input[1]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) node3.op_type = "PixelShuffle" node3.input[0] = node.input[0] attr_group = AttributeProto( name="scale_factor", i=shape[2], type=APT.INT ) node3.attribute.append(attr_group) reduced_node_count[0] += 2 i += 2 def fuse_reorg(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # PixelShuffle <= Reshape - Transpose - Reshape # PixelShuffle <= Reshape - Transpose - Constant - Reshape if node.op_type == "Reshape": if self.node_reference[node.output[0]] != 1: continue if len(node.input) == 1: shape = get_node_attr_ai(node, "shape") else: if node.input[1] not in self.weights: continue shape = get_node_attr_from_input_ai(self.weights[node.input[1]]) # -1, 3, out_height, block_size, out_width, block_size if ( shape.size != 6 or (shape[0] != 1 and shape[0] != -1) or shape[3] != shape[5] or i + 2 >= self.node_count ): continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] if node3.op_type == "Constant": if i + 3 >= self.node_count: continue node3 = self.mutable_graph_nodes[i + 3] if node2.op_type != "Transpose" or node3.op_type != "Reshape": continue if self.node_reference[node2.output[0]] != 1: continue # 0 1 3 5 2 4 perm = get_node_attr_ai(node2, "perm") if ( perm.size != 6 or perm[0] != 0 or perm[1] != 1 or perm[2] != 3 or perm[3] != 5 or perm[4] != 2 or perm[5] != 4 ): continue if len(node3.input) == 1: shape3 = get_node_attr_ai(node3, "shape") else: if node3.input[1] not in self.weights: continue shape3 = get_node_attr_from_input_ai(self.weights[node3.input[1]]) # -1, out_channels, out_height, out_width if ( shape3.size != 4 or (shape3[0] != 1 and shape3[0] != -1) or shape3[1] != shape[1] * shape[3] * shape[5] or shape3[2] != shape[2] or shape3[3] != shape[4] ): continue # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" if len(node.input) == 2: self.node_reference[node.input[1]] -= 1 self.node_reference[node.output[0]] -= 1 self.node_reference[node2.output[0]] -= 1 if len(node3.input) == 2: self.node_reference[node3.input[1]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) node3.op_type = "Reorg" node3.input[0] = node.input[0] attr_group = AttributeProto(name="stride", i=shape[3], type=APT.INT) node3.attribute.append(attr_group) reduced_node_count[0] += 2 i += 2 def fuse_expand_broadcast(self, 
reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # Add/Sub/Mul/Div/Min/Max <= Expand - Add/Sub/Mul/Div/Min/Max if node.op_type == "Expand": if self.node_reference[node.output[0]] != 1 or i + 1 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] if node2.op_type not in ["Add", "Sub", "Mul", "Div", "Min", "Max"]: continue if ( node2.input[1] != node.output[0] and node2.input[0] != node.output[0] ): continue # reduce node.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 if len(node.input) == 2: self.node_reference[node.input[1]] -= 1 self.blob_names.pop(node.output[0], None) if node2.input[0] == node.output[0]: node2.input[0] = node.input[0] else: node2.input[1] = node.input[0] reduced_node_count[0] += 1 i += 1 def fuse_lstm_gru_rnn(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # LSTM(bi) <= LSTM(bi) - Transpose - Reshape - Transpose if node.op_type in ["LSTM", "GRU", "RNN"]: if self.node_reference[node.output[0]] != 1 or i + 2 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] if node2.op_type != "Transpose" or node3.op_type != "Reshape": continue if self.node_reference[node2.output[0]] != 1: continue if ( node2.input[0] != node.output[0] or node3.input[0] != node2.output[0] ): continue direction = get_node_attr_s(node, "direction") if direction != "bidirectional": continue # 0 2 1 3 perm = get_node_attr_ai(node2, "perm") if ( perm.size != 4 or perm[0] != 0 or perm[1] != 2 or perm[2] != 1 or perm[3] != 3 ): continue if len(node3.input) == 1: shape = get_node_attr_ai(node3, "shape") else: if node3.input[1] not in self.weights: continue shape = get_node_attr_from_input_ai(self.weights[node3.input[1]]) # 0 0 -1 if shape.size != 3 or shape[0] != 0 or shape[1] != 0 or shape[2] != -1: continue # reduce node2.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 self.node_reference[node2.output[0]] -= 1 if len(node3.input) == 2: self.node_reference[node3.input[1]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) node.output[0] = node3.output[0] reduced_node_count[0] += 2 i += 2 if i + 1 < self.node_count: if self.node_reference[node3.output[0]] != 1: continue node4 = self.mutable_graph_nodes[i + 1] if node4.op_type != "Transpose": continue if node4.input[0] != node.output[0]: continue # 1 0 2 perm4 = get_node_attr_ai(node4, "perm") if ( perm4.size != 3 or perm4[0] != 1 or perm4[1] != 0 or perm4[2] != 2 ): continue # reduce node4.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) node.output[0] = node4.output[0] reduced_node_count[0] += 1 i += 1 for i in range(self.node_count): node = self.mutable_graph_nodes[i] # LSTM(uni) <= LSTM(uni) - Squeeze - Transpose if node.op_type in ["LSTM", "GRU", "RNN"]: if self.node_reference[node.output[0]] != 1 or i + 1 >= self.node_count: continue node2 = self.mutable_graph_nodes[i + 1] if node2.op_type != "Squeeze": continue if node2.input[0] != node.output[0]: continue direction = get_node_attr_s(node, "direction") if direction == "bidirectional": continue axes = get_node_attr_ai(node2, "axes") if axes.size != 1 or axes[0] != 1: continue # reduce node2.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) node.output[0] = node2.output[0] 
reduced_node_count[0] += 1 i += 1 if i + 1 < self.node_count: if self.node_reference[node2.output[0]] != 1: continue node3 = self.mutable_graph_nodes[i + 1] if node3.op_type != "Transpose": continue if node3.input[0] != node.output[0]: continue # 1 0 2 perm4 = get_node_attr_ai(node3, "perm") if ( perm4.size != 3 or perm4[0] != 1 or perm4[1] != 0 or perm4[2] != 2 ): continue # reduce node3.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) node.output[0] = node3.output[0] reduced_node_count[0] += 1 i += 1 for i in range(self.node_count): node = self.mutable_graph_nodes[i] # LSTM <= Transpose - LSTM if node.op_type == "Transpose": if self.node_reference[node.output[0]] != 1: continue # 1 0 2 perm = get_node_attr_ai(node, "perm") if perm.size != 3 or perm[0] != 1 or perm[1] != 0 or perm[2] != 2: continue node2 = self.mutable_graph_nodes[i + 1] if node2.op_type not in ["LSTM", "GRU", "RNN"]: continue if node2.input[0] != node.output[0]: continue # reduce node.op_type = "noop_reducedncnn" self.node_reference[node.output[0]] -= 1 self.blob_names.pop(node.output[0], None) node2.input[0] = node.input[0] reduced_node_count[0] += 1 i += 1 def fuse_multiheadattention(self, reduced_node_count: List[int]) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # MultiHeadAttention <= MatMul(q) - Add # - MatMul(k) - Add # - MatMul(v) - Add # - Mul # - Reshape - Transpose # - Reshape - Reshape - Transpose - Transpose # - Gemm - Softmax - Gemm - Transpose - Reshape - MatMul - Add if node.op_type == "MatMul": if ( self.node_reference[node.output[0]] != 1 or i + 19 >= self.node_count ): continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] node4 = self.mutable_graph_nodes[i + 3] node5 = self.mutable_graph_nodes[i + 4] node6 = self.mutable_graph_nodes[i + 5] node7 = self.mutable_graph_nodes[i + 6] node8 = self.mutable_graph_nodes[i + 7] node9 = self.mutable_graph_nodes[i + 8] node10 = self.mutable_graph_nodes[i + 9] node11 = self.mutable_graph_nodes[i + 10] node12 = self.mutable_graph_nodes[i + 11] node13 = self.mutable_graph_nodes[i + 12] node14 = self.mutable_graph_nodes[i + 13] node15 = self.mutable_graph_nodes[i + 14] node16 = self.mutable_graph_nodes[i + 15] node17 = self.mutable_graph_nodes[i + 16] node18 = self.mutable_graph_nodes[i + 17] node19 = self.mutable_graph_nodes[i + 18] node20 = self.mutable_graph_nodes[i + 19] if ( node2.op_type != "Add" or node3.op_type != "MatMul" or node4.op_type != "Add" or node5.op_type != "MatMul" or node6.op_type != "Add" or node7.op_type != "Mul" or node8.op_type != "Reshape" or node9.op_type != "Transpose" or node10.op_type != "Reshape" or node11.op_type != "Reshape" or node12.op_type != "Transpose" or node13.op_type != "Transpose" or node14.op_type != "MatMul" or node15.op_type != "Softmax" or node16.op_type != "MatMul" or node17.op_type != "Transpose" or node18.op_type != "Reshape" or node19.op_type != "MatMul" or node20.op_type != "Add" ): continue if ( self.node_reference[node2.output[0]] != 1 or self.node_reference[node3.output[0]] != 1 or self.node_reference[node4.output[0]] != 1 or self.node_reference[node5.output[0]] != 1 or self.node_reference[node6.output[0]] != 1 or self.node_reference[node7.output[0]] != 1 or self.node_reference[node8.output[0]] != 1 or self.node_reference[node9.output[0]] != 1 or self.node_reference[node10.output[0]] != 1 or self.node_reference[node11.output[0]] != 1 or self.node_reference[node12.output[0]] != 1 or 
self.node_reference[node13.output[0]] != 1 or self.node_reference[node14.output[0]] != 1 or self.node_reference[node15.output[0]] != 1 or self.node_reference[node16.output[0]] != 1 or self.node_reference[node17.output[0]] != 1 or self.node_reference[node18.output[0]] != 1 or self.node_reference[node19.output[0]] != 1 ): continue if ( node2.input[0] != node.output[0] or node4.input[0] != node3.output[0] or node6.input[0] != node5.output[0] or node7.input[0] != node2.output[0] or node8.input[0] != node7.output[0] or node9.input[0] != node8.output[0] or node10.input[0] != node4.output[0] or node11.input[0] != node6.output[0] or node12.input[0] != node11.output[0] or node13.input[0] != node10.output[0] or node14.input[0] != node9.output[0] or node14.input[1] != node13.output[0] or node15.input[0] != node14.output[0] or node16.input[0] != node15.output[0] or node16.input[1] != node12.output[0] or node17.input[0] != node16.output[0] or node18.input[0] != node17.output[0] or node19.input[0] != node18.output[0] or node20.input[0] != node19.output[0] ): continue q_B = get_node_attr_from_input_af(self.weights[node2.input[1]]) k_B = get_node_attr_from_input_af(self.weights[node4.input[1]]) v_B = get_node_attr_from_input_af(self.weights[node6.input[1]]) o_B = get_node_attr_from_input_af(self.weights[node20.input[1]]) if q_B.size != k_B.size or q_B.size != v_B.size or q_B.size != o_B.size: continue embed_dim = q_B.size # 1 0 2 perm9 = get_node_attr_ai(node9, "perm") perm12 = get_node_attr_ai(node12, "perm") if perm9.size != 3 or perm9[0] != 1 or perm9[1] != 0 or perm9[2] != 2: continue if ( perm12.size != 3 or perm12[0] != 1 or perm12[1] != 0 or perm12[2] != 2 ): continue # 1 2 0 perm13 = get_node_attr_ai(node13, "perm") if ( perm13.size != 3 or perm13[0] != 1 or perm13[1] != 2 or perm13[2] != 0 ): continue # 1 0 2 perm17 = get_node_attr_ai(node17, "perm") if ( perm17.size != 3 or perm17[0] != 1 or perm17[1] != 0 or perm17[2] != 2 ): continue softmax_axis = get_node_attr_i(node15, "axis") if softmax_axis != 2: continue # 1/-1 seqlen * num_heads, embed_dim / num_heads if len(node8.input) == 1: shape8 = get_node_attr_ai(node8, "shape") else: if node8.input[1] not in self.weights: continue shape8 = get_node_attr_from_input_ai(self.weights[node8.input[1]]) if len(node10.input) == 1: shape10 = get_node_attr_ai(node10, "shape") else: if node10.input[1] not in self.weights: continue shape10 = get_node_attr_from_input_ai(self.weights[node10.input[1]]) if len(node11.input) == 1: shape11 = get_node_attr_ai(node11, "shape") else: if node11.input[1] not in self.weights: continue shape11 = get_node_attr_from_input_ai(self.weights[node11.input[1]]) if shape8.size != 3 or shape10.size != 3 or shape11.size != 3: continue if ( shape8[1] != shape10[1] or shape8[1] != shape11[1] or shape8[2] != shape10[2] or shape8[2] != shape11[2] ): continue num_heads = embed_dim / shape8[2] if len(node18.input) == 1: shape18 = get_node_attr_ai(node18, "shape") else: if node18.input[1] not in self.weights: continue shape18 = get_node_attr_from_input_ai(self.weights[node18.input[1]]) if ( shape18.size != 3 or shape18[2] != embed_dim or shape18[1] * num_heads != shape8[1] ): continue node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" node4.op_type = "noop_reducedncnn" node5.op_type = "noop_reducedncnn" node6.op_type = "noop_reducedncnn" node7.op_type = "noop_reducedncnn" node8.op_type = "noop_reducedncnn" node9.op_type = "noop_reducedncnn" node10.op_type = "noop_reducedncnn" 
node11.op_type = "noop_reducedncnn" node12.op_type = "noop_reducedncnn" node13.op_type = "noop_reducedncnn" node14.op_type = "noop_reducedncnn" node15.op_type = "noop_reducedncnn" node16.op_type = "noop_reducedncnn" node17.op_type = "noop_reducedncnn" node18.op_type = "noop_reducedncnn" node19.op_type = "noop_reducedncnn" self.node_reference[node2.input[0]] -= 1 self.node_reference[node4.input[0]] -= 1 self.node_reference[node6.input[0]] -= 1 self.node_reference[node7.input[0]] -= 1 self.node_reference[node7.input[1]] -= 1 self.node_reference[node8.input[0]] -= 1 if len(node8.input) == 2: self.node_reference[node8.input[1]] -= 1 self.node_reference[node9.input[0]] -= 1 self.node_reference[node10.input[0]] -= 1 if len(node10.input) == 2: self.node_reference[node10.input[1]] -= 1 self.node_reference[node11.input[0]] -= 1 if len(node11.input) == 2: self.node_reference[node11.input[1]] -= 1 self.node_reference[node12.input[0]] -= 1 self.node_reference[node13.input[0]] -= 1 self.node_reference[node14.input[0]] -= 1 self.node_reference[node14.input[1]] -= 1 self.node_reference[node15.input[0]] -= 1 self.node_reference[node16.input[0]] -= 1 self.node_reference[node16.input[1]] -= 1 self.node_reference[node17.input[0]] -= 1 self.node_reference[node18.input[0]] -= 1 if len(node18.input) == 2: self.node_reference[node18.input[1]] -= 1 self.node_reference[node19.input[0]] -= 1 self.node_reference[node20.input[0]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) self.blob_names.pop(node3.output[0], None) self.blob_names.pop(node4.output[0], None) self.blob_names.pop(node5.output[0], None) self.blob_names.pop(node6.output[0], None) self.blob_names.pop(node7.output[0], None) self.blob_names.pop(node8.output[0], None) self.blob_names.pop(node9.output[0], None) self.blob_names.pop(node10.output[0], None) self.blob_names.pop(node11.output[0], None) self.blob_names.pop(node12.output[0], None) self.blob_names.pop(node13.output[0], None) self.blob_names.pop(node14.output[0], None) self.blob_names.pop(node15.output[0], None) self.blob_names.pop(node16.output[0], None) self.blob_names.pop(node17.output[0], None) self.blob_names.pop(node18.output[0], None) self.blob_names.pop(node19.output[0], None) qw = node.input[1] qb = node2.input[1] kw = node3.input[1] kb = node4.input[1] vw = node5.input[1] vb = node6.input[1] ow = node19.input[1] ob = node20.input[1] node20.op_type = "MultiHeadAttention" node20.ClearField("input") node20.input.append(node.input[0]) node20.input.append(node3.input[0]) node20.input.append(node5.input[0]) node20.input.append(qw) node20.input.append(qb) node20.input.append(kw) node20.input.append(kb) node20.input.append(vw) node20.input.append(vb) node20.input.append(ow) node20.input.append(ob) attr_embed_dim = AttributeProto( name="embed_dim", i=embed_dim, type=APT.INT ) node20.attribute.append(attr_embed_dim) attr_num_heads = AttributeProto( name="num_heads", i=num_heads, type=APT.INT ) node20.attribute.append(attr_num_heads) reduced_node_count[0] += 19 i += 19 for i in range(self.node_count): node = self.mutable_graph_nodes[i] # MultiHeadAttention <= MatMul(qkv) - Add - Split # - Mul # - Reshape - Transpose # - Reshape - Reshape - Transpose - Transpose # - Gemm - Softmax - Gemm - Transpose - Reshape - MatMul - Add if node.op_type == "MatMul": if ( self.node_reference[node.output[0]] != 1 or i + 16 >= self.node_count ): continue node2 = self.mutable_graph_nodes[i + 1] node3 = self.mutable_graph_nodes[i + 2] node4 = self.mutable_graph_nodes[i + 3] 
node5 = self.mutable_graph_nodes[i + 4] node6 = self.mutable_graph_nodes[i + 5] node7 = self.mutable_graph_nodes[i + 6] node8 = self.mutable_graph_nodes[i + 7] node9 = self.mutable_graph_nodes[i + 8] node10 = self.mutable_graph_nodes[i + 9] node11 = self.mutable_graph_nodes[i + 10] node12 = self.mutable_graph_nodes[i + 11] node13 = self.mutable_graph_nodes[i + 12] node14 = self.mutable_graph_nodes[i + 13] node15 = self.mutable_graph_nodes[i + 14] node16 = self.mutable_graph_nodes[i + 15] node17 = self.mutable_graph_nodes[i + 16] if ( node2.op_type != "Add" or node3.op_type != "Split" or node4.op_type != "Mul" or node5.op_type != "Reshape" or node6.op_type != "Transpose" or node7.op_type != "Reshape" or node8.op_type != "Reshape" or node9.op_type != "Transpose" or node10.op_type != "Transpose" or node11.op_type != "MatMul" or node12.op_type != "Softmax" or node13.op_type != "MatMul" or node14.op_type != "Transpose" or node15.op_type != "Reshape" or node16.op_type != "MatMul" or node17.op_type != "Add" ): continue if ( self.node_reference[node2.output[0]] != 1 or self.node_reference[node3.output[0]] != 1 or self.node_reference[node3.output[1]] != 1 or self.node_reference[node3.output[2]] != 1 or self.node_reference[node4.output[0]] != 1 or self.node_reference[node5.output[0]] != 1 or self.node_reference[node6.output[0]] != 1 or self.node_reference[node7.output[0]] != 1 or self.node_reference[node8.output[0]] != 1 or self.node_reference[node9.output[0]] != 1 or self.node_reference[node10.output[0]] != 1 or self.node_reference[node11.output[0]] != 1 or self.node_reference[node12.output[0]] != 1 or self.node_reference[node13.output[0]] != 1 or self.node_reference[node14.output[0]] != 1 or self.node_reference[node15.output[0]] != 1 or self.node_reference[node16.output[0]] != 1 ): continue if ( node2.input[0] != node.output[0] or node3.input[0] != node2.output[0] or node4.input[0] != node3.output[0] or node5.input[0] != node4.output[0] or node6.input[0] != node5.output[0] or node7.input[0] != node3.output[1] or node8.input[0] != node3.output[2] or node9.input[0] != node8.output[0] or node10.input[0] != node7.output[0] or node11.input[0] != node6.output[0] or node11.input[1] != node10.output[0] or node12.input[0] != node11.output[0] or node13.input[0] != node12.output[0] or node13.input[1] != node9.output[0] or node14.input[0] != node13.output[0] or node15.input[0] != node14.output[0] or node16.input[0] != node15.output[0] or node17.input[0] != node16.output[0] ): continue qkv_B = get_node_attr_from_input_af(self.weights[node2.input[1]]) o_B = get_node_attr_from_input_af(self.weights[node17.input[1]]) if qkv_B.size != o_B.size * 3: continue embed_dim = o_B.size # 1 0 2 perm6 = get_node_attr_ai(node6, "perm") perm9 = get_node_attr_ai(node9, "perm") if perm6.size != 3 or perm6[0] != 1 or perm6[1] != 0 or perm6[2] != 2: continue if perm9.size != 3 or perm9[0] != 1 or perm9[1] != 0 or perm9[2] != 2: continue # 1 2 0 perm10 = get_node_attr_ai(node10, "perm") if ( perm10.size != 3 or perm10[0] != 1 or perm10[1] != 2 or perm10[2] != 0 ): continue # 1 0 2 perm14 = get_node_attr_ai(node14, "perm") if ( perm14.size != 3 or perm14[0] != 1 or perm14[1] != 0 or perm14[2] != 2 ): continue softmax_axis = get_node_attr_i(node12, "axis") if softmax_axis != 2: continue # 1/-1, seqlen * num_heads, embed_dim / num_heads if len(node5.input) == 1: shape5 = get_node_attr_ai(node5, "shape") else: if node5.input[1] not in self.weights: continue shape5 = get_node_attr_from_input_ai(self.weights[node5.input[1]]) if 
len(node7.input) == 1: shape7 = get_node_attr_ai(node7, "shape") else: if node7.input[1] not in self.weights: continue shape7 = get_node_attr_from_input_ai(self.weights[node7.input[1]]) if len(node8.input) == 1: shape8 = get_node_attr_ai(node8, "shape") else: if node8.input[1] not in self.weights: continue shape8 = get_node_attr_from_input_ai(self.weights[node8.input[1]]) if ( shape5[1] != shape7[1] or shape5[1] != shape8[1] or shape5[2] != shape7[2] or shape5[2] != shape8[2] ): continue num_heads = embed_dim / shape5[2] # 1, seqlen, embed_dim if len(node15.input) == 1: shape15 = get_node_attr_ai(node15, "shape") else: if node15.input[1] not in self.weights: continue shape15 = get_node_attr_from_input_ai(self.weights[node15.input[1]]) if ( shape15.size != 3 or shape15[2] != embed_dim or shape15[1] * num_heads != shape8[1] ): continue # reduce node.op_type = "noop_reducedncnn" node2.op_type = "noop_reducedncnn" node3.op_type = "noop_reducedncnn" node4.op_type = "noop_reducedncnn" node5.op_type = "noop_reducedncnn" node6.op_type = "noop_reducedncnn" node7.op_type = "noop_reducedncnn" node8.op_type = "noop_reducedncnn" node9.op_type = "noop_reducedncnn" node10.op_type = "noop_reducedncnn" node11.op_type = "noop_reducedncnn" node12.op_type = "noop_reducedncnn" node13.op_type = "noop_reducedncnn" node14.op_type = "noop_reducedncnn" node15.op_type = "noop_reducedncnn" node16.op_type = "noop_reducedncnn" self.node_reference[node2.input[0]] -= 1 self.node_reference[node3.input[0]] -= 1 self.node_reference[node4.input[0]] -= 1 self.node_reference[node4.input[1]] -= 1 self.node_reference[node5.input[0]] -= 1 if len(node5.input) == 2: self.node_reference[node5.input[1]] -= 1 self.node_reference[node6.input[0]] -= 1 self.node_reference[node7.input[0]] -= 1 if len(node7.input) == 2: self.node_reference[node7.input[1]] -= 1 self.node_reference[node8.input[0]] -= 1 if len(node8.input) == 2: self.node_reference[node8.input[1]] -= 1 self.node_reference[node9.input[0]] -= 1 self.node_reference[node10.input[0]] -= 1 self.node_reference[node11.input[0]] -= 1 self.node_reference[node11.input[1]] -= 1 self.node_reference[node12.input[0]] -= 1 self.node_reference[node13.input[0]] -= 1 self.node_reference[node13.input[1]] -= 1 self.node_reference[node14.input[0]] -= 1 self.node_reference[node15.input[0]] -= 1 if len(node15.input) == 2: self.node_reference[node15.input[1]] -= 1 self.node_reference[node16.input[0]] -= 1 self.node_reference[node17.input[0]] -= 1 self.blob_names.pop(node.output[0], None) self.blob_names.pop(node2.output[0], None) self.blob_names.pop(node3.output[0], None) self.blob_names.pop(node3.output[1], None) self.blob_names.pop(node3.output[2], None) self.blob_names.pop(node4.output[0], None) self.blob_names.pop(node5.output[0], None) self.blob_names.pop(node6.output[0], None) self.blob_names.pop(node7.output[0], None) self.blob_names.pop(node8.output[0], None) self.blob_names.pop(node9.output[0], None) self.blob_names.pop(node10.output[0], None) self.blob_names.pop(node11.output[0], None) self.blob_names.pop(node12.output[0], None) self.blob_names.pop(node13.output[0], None) self.blob_names.pop(node14.output[0], None) self.blob_names.pop(node15.output[0], None) self.blob_names.pop(node16.output[0], None) qkvw = node.input[1] qkvb = node2.input[1] ow = node16.input[1] ob = node17.input[1] node17.op_type = "MultiHeadAttention" node17.ClearField("input") node17.input.append(node.input[0]) node17.input.append(qkvw) node17.input.append(qkvb) node17.input.append(ow) node17.input.append(ob) 
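# Resulting MultiHeadAttention node inputs (packed-QKV variant):
# [input, qkv_weight, qkv_bias, out_weight, out_bias]; embed_dim and num_heads
# are attached as attributes below.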
attr_embed_dim = AttributeProto( name="embed_dim", i=embed_dim, type=APT.INT ) node17.attribute.append(attr_embed_dim) attr_num_heads = AttributeProto( name="num_heads", i=num_heads, type=APT.INT ) node17.attribute.append(attr_num_heads) reduced_node_count[0] += 16 i += 16 def fuse_binaryop_with_scalar(self) -> None: for i in range(self.node_count): node = self.mutable_graph_nodes[i] # Add/Sub/Mul/Div/Min/Max/Pow(a, x) if node.op_type in ["Add", "Sub", "Mul", "Div", "Min", "Max", "Pow"]: if node.input[0] not in self.weights: continue scalar_b = self.weights[node.input[0]] if ( len(scalar_b.dims) != 0 or get_tensor_proto_data_size(scalar_b, scalar_b.data_type) != 1 ): continue if node.op_type == "Sub": node.op_type = "RSub" elif node.op_type == "Div": node.op_type = "RDiv" b = get_node_attr_from_input_f(scalar_b) self.node_reference[node.input[0]] -= 1 node_input = node.input[1] node.ClearField("input") node.input.append(node_input) attr_with_scalar = AttributeProto(name="with_scalar", i=1, type=APT.INT) node.attribute.append(attr_with_scalar) attr_b = AttributeProto(name="b", f=b, type=APT.FLOAT) node.attribute.append(attr_b) for i in range(self.node_count): node = self.mutable_graph_nodes[i] # Add/Sub/Mul/Div/Min/Max/Pow(x, b) if node.op_type in ["Add", "Sub", "Mul", "Div", "Min", "Max", "Pow"]: if node.input[1] not in self.weights: continue scalar_b = self.weights[node.input[1]] if ( len(scalar_b.dims) != 0 or get_tensor_proto_data_size(scalar_b, scalar_b.data_type) != 1 ): continue b = get_node_attr_from_input_f(scalar_b) self.node_reference[node.input[1]] -= 1 node_input = node.input[0] node.ClearField("input") node.input.append(node_input) attr_with_scalar = AttributeProto(name="with_scalar", i=1, type=APT.INT) node.attribute.append(attr_with_scalar) attr_b = AttributeProto(name="b", f=b, type=APT.FLOAT) node.attribute.append(attr_b) def convert(self, is_fp16: bool = False, include_mem_data: bool = True): if is_fp16: logger.debug("NCNN mode: fp16") else: logger.debug("NCNN mode: fp32") # Topological sort i = 0 while i < self.node_count: node = self.mutable_graph_nodes[i] swapnode = False missing_input_name = None for input_name in node.input: if ( input_name and input_name not in self.producers and input_name not in self.weights ): swapnode = True missing_input_name = input_name break # If nothing missing, add outputs to producers and continue # to next node if not swapnode: for output_name in node.output: if output_name: self.producers[output_name] = None i += 1 continue # find node that produces missing_input_name for j, nodeq in enumerate(self.mutable_graph_nodes, i + 1): found = False for output_name in nodeq.output: if output_name == missing_input_name: found = True break if found: break else: raise RuntimeError( f"Cannot find node that produces {missing_input_name}, " f"which is required by node {i} ({node.name})." 
) self.swap_nodes(i, j) # global definition line # [layer count][blob count] for node in self.onnx_graph.node: op = node.op_type if not node.name: node.name = node.output[0] if op == "Constant": self.weights[node.output[0]] = get_node_attr_tensor(node, "value") for input_name in node.input: self.blob_names[input_name] = None if input_name not in self.node_reference: self.node_reference[input_name] = 1 else: self.node_reference[input_name] += 1 if op == "Dropout": output_name = node.output[0] self.blob_names[output_name] = None self.node_reference[output_name] = 0 continue for output_name in node.output: self.blob_names[output_name] = None self.node_reference[output_name] = 0 # include Input node input_node_count = 0 for graph_input in self.onnx_graph.input: input_name = graph_input.name # check weight if input_name not in self.weights: self.blob_names[input_name] = None input_node_count += 1 # op chain fusion reduced_node_count = [0] self.fuse_weight_reshape(reduced_node_count) self.fuse_weight_transpose(reduced_node_count) self.fuse_shufflechannel(reduced_node_count) self.fuse_shufflechannel_split(reduced_node_count) self.fuse_hardsigmoid(reduced_node_count) self.fuse_hardswish(reduced_node_count) self.fuse_swish(reduced_node_count) self.fuse_batchnorm1d_squeeze_unsqueeze(reduced_node_count) self.fuse_unsqueeze_prelu(reduced_node_count) self.fuse_normalize(reduced_node_count) self.fuse_groupnorm(reduced_node_count) self.fuse_layernorm(reduced_node_count) self.fuse_flatten(reduced_node_count) self.fuse_pixelshuffle(reduced_node_count) self.fuse_reorg(reduced_node_count) self.fuse_expand_broadcast(reduced_node_count) self.fuse_lstm_gru_rnn(reduced_node_count) self.fuse_multiheadattention(reduced_node_count) self.fuse_binaryop_with_scalar() self.fuse_rewrite_gather() # reduce common const weight node_reference for node in self.onnx_graph.node: op = node.op_type if op == "BatchNormalization": self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 self.node_reference[node.input[3]] -= 1 self.node_reference[node.input[4]] -= 1 elif op == "BiasGelu": self.node_reference[node.input[1]] -= 1 elif op == "Clip": if len(node.input) == 3: self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 elif op == "Conv": self.node_reference[node.input[1]] -= 1 if len(node.input) == 3: self.node_reference[node.input[2]] -= 1 elif op == "ConvTranspose": self.node_reference[node.input[1]] -= 1 if len(node.input) == 3: self.node_reference[node.input[2]] -= 1 elif op == "EmbedLayerNormalization": self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 self.node_reference[node.input[3]] -= 1 self.node_reference[node.input[4]] -= 1 self.node_reference[node.input[5]] -= 1 self.node_reference[node.input[6]] -= 1 elif op == "Gemm": alpha = get_node_attr_f(node, "alpha", 1) beta = get_node_attr_f(node, "beta", 1) transA = get_node_attr_i(node, "transA", 0) transB = get_node_attr_i(node, "transB", 0) if alpha == 1 and beta == 1 and transA == 0 and transB == 1: # InnerProduct-like A * B + C self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 elif op == "GroupNorm": affine = get_node_attr_i(node, "affine", 1) if affine: self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 elif op == "GRU": for gru_input in node.input: self.node_reference[gru_input] -= 1 elif op == "InstanceNormalization": self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 elif op == "LayerNorm": affine 
= get_node_attr_i(node, "affine", 1) if affine: self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 elif op == "LSTM": for lstm_input in node.input: self.node_reference[lstm_input] -= 1 elif op == "MatMul": if ( node.input[1] in self.weights and len(self.weights[node.input[1]].dims) == 2 ): # InnerProduct self.node_reference[node.input[1]] -= 1 elif op == "MultiHeadAttention": if len(node.input) == 5: self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 self.node_reference[node.input[3]] -= 1 self.node_reference[node.input[4]] -= 1 else: self.node_reference[node.input[3]] -= 1 self.node_reference[node.input[4]] -= 1 self.node_reference[node.input[5]] -= 1 self.node_reference[node.input[6]] -= 1 self.node_reference[node.input[7]] -= 1 self.node_reference[node.input[8]] -= 1 self.node_reference[node.input[9]] -= 1 self.node_reference[node.input[10]] -= 1 elif op == "Pad": if len(node.input) >= 2: self.node_reference[node.input[1]] -= 1 elif op == "PRelu": self.node_reference[node.input[1]] -= 1 elif op == "Reshape": if len(node.input) >= 2: self.node_reference[node.input[1]] -= 1 elif op == "Resize": if len(node.input) == 2: # opset 10 self.node_reference[node.input[1]] -= 1 else: # opset 11+ self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 if len(node.input) >= 4: self.node_reference[node.input[3]] -= 1 elif op == "RNN": for rnn_input in node.input: self.node_reference[rnn_input] -= 1 elif op == "SkipLayerNormalization": self.node_reference[node.input[2]] -= 1 self.node_reference[node.input[3]] -= 1 self.node_reference[node.input[4]] -= 1 elif op == "Slice": if len(node.input) >= 2: self.node_reference[node.input[1]] -= 1 self.node_reference[node.input[2]] -= 1 if len(node.input) >= 4: self.node_reference[node.input[3]] -= 1 if len(node.input) >= 5: self.node_reference[node.input[4]] -= 1 elif op == "Upsample": if len(node.input) >= 2: self.node_reference[node.input[1]] -= 1 elif op == "adaptive_avg_pool2d" or op == "adaptive_max_pool2d": if len(node.input) >= 2: self.node_reference[node.input[1]] -= 1 # count all weight node with zero reference zero_reference_weight_node_count = 0 for input_name in self.weights.keys(): # there may be some weight nodes in initializer but none of the graph nodes use them # add them to blob_names so we could get proper blob count later self.blob_names[input_name] = None refcount = self.node_reference[input_name] if refcount == 0: zero_reference_weight_node_count += 1 # we always treat constant nodes as weights or binaryop_weights # do not count it twice for layer_count constant_node_count_moved_to_weight = 0 for node in self.onnx_graph.node: if node.op_type == "Constant": constant_node_count_moved_to_weight += 1 # some ops may have anonymous input # LSTM sequence_lens self.blob_names.pop("", None) self.node_reference.pop("", None) # remove node_reference entries with references equal to one split_layer_count = 0 splitncnn_blob_count = 0 # split node reference split_node_reference = {} for ref, count in self.node_reference.items(): if count > 1: split_layer_count += 1 splitncnn_blob_count += count split_node_reference[ref] = count ncnn_node_count = ( self.node_count - constant_node_count_moved_to_weight + len(self.weights) - zero_reference_weight_node_count - reduced_node_count[0] + input_node_count + split_layer_count ) ncnn_blob_count = ( len(self.blob_names) - zero_reference_weight_node_count + splitncnn_blob_count ) ncnn_model = NcnnModel(ncnn_node_count, ncnn_blob_count) 
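        # Worked example of the bookkeeping above (all numbers hypothetical): a graph
        # with 10 nodes, 1 Constant moved to weights, 4 initializers of which 3 are
        # consumed directly by layers, 2 nodes fused away, 1 real input, and one blob
        # read by 3 layers (1 Split layer, 3 split blobs, 12 unique blob names) gives
        #   layer count = 10 - 1 + 4 - 3 - 2 + 1 + 1 = 10
        #   blob count  = 12 - 3 + 3 = 12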
logger.debug( f"Node count: {ncnn_model.node_count}, Blob count: {ncnn_model.blob_count}" ) bin_length = 0 for i, graph_input in enumerate(self.onnx_graph.input): input_name = graph_input.name # Make sure input is not in weights if input_name not in self.weights: ncnn_model.add_layer( NcnnLayer("Input", input_name, 0, 1, outputs=[input_name]) ) refcount = self.node_reference[input_name] if refcount > 1: layer_input_list = [ f"{input_name}_splitncnn_{j}" for j in range(refcount) ] ncnn_model.add_layer( NcnnLayer( "Split", f"splitncnn_input{i}", 1, refcount, [input_name], layer_input_list, ) ) # place MemoryData next if it is being included internal_split = 0 if include_mem_data: for input_name, M in self.weights.items(): refcount = self.node_reference[input_name] if refcount != 0: layer = NcnnLayer("MemoryData", input_name, 0, 1, [input_name]) M_dims_size = len(M.dims) if M_dims_size == 0: layer.add_param(0, get_tensor_proto_data_size(M, M.data_type)) elif M_dims_size == 1: layer.add_param(0, M.dims[0]) elif M_dims_size == 2: layer.add_param(0, M.dims[1]) if M.dims[0] != 1: layer.add_param(1, M.dims[0]) elif M_dims_size == 3: layer.add_param(0, M.dims[2]) layer.add_param(1, M.dims[1]) if M.dims[0] != 1: layer.add_param(2, M.dims[0]) elif M_dims_size == 4: layer.add_param(0, M.dims[3]) layer.add_param(1, M.dims[2]) layer.add_param(2, M.dims[1]) bin_length += self.add_weight(layer, "MemoryData", M) ncnn_model.add_layer(layer) if refcount > 1: layer_output_list = [ f"{input_name}_splitncnn_{i}" for i in range(refcount) ] ncnn_model.add_layer( NcnnLayer( "Split", f"splitncnn_{internal_split}", 1, refcount, [input_name], layer_output_list, ) ) internal_split += 1 for node in self.onnx_graph.node: op = node.op_type if op == "noop_reducedncnn": continue name = node.name if not name: name = node.output[0] input_size = len(node.input) output_size = len(node.output) for input_name in node.input: # check weight if not input_name or ( input_name in self.weights and self.node_reference[input_name] == 0 ): input_size -= 1 layer = NcnnLayer() if op in [ "Abs", "Acos", "Asin", "Atan", "Ceil", "Cos", "Exp", "Floor", "Log", "Neg", "Reciprocal", "Sin", "Sqrt", "Tan", "Tanh", ]: layer.op_type = "UnaryOp" elif op in [ "Add", "Div", "Max", "Min", "Mul", "Pow", "RDiv", "RSub", "Sub", ]: layer.op_type = "BinaryOp" elif op == "AveragePool" or op == "MaxPool": kernel_shape = get_node_attr_ai(node, "kernel_shape") if kernel_shape.size == 1: layer.op_type = "Pooling1D" else: layer.op_type = "Pooling" elif op == "BatchNormalization": layer.op_type = "BatchNorm" elif op == "BiasGelu": layer.op_type = "BiasGelu" elif op == "Clip": layer.op_type = "Clip" elif op == "Concat": layer.op_type = "Concat" elif op == "Constant": continue elif op == "Conv": kernel_shape = get_node_attr_ai(node, "kernel_shape") if kernel_shape.size == 1: layer.op_type = "Convolution1D" else: group = get_node_attr_i(node, "group", 1) if group > 1: layer.op_type = "ConvolutionDepthWise" else: layer.op_type = "Convolution" elif op == "ConvTranspose": group = get_node_attr_i(node, "group", 1) if group > 1: layer.op_type = "DeconvolutionDepthWise" else: layer.op_type = "Deconvolution" elif op == "Crop" or op == "Slice": layer.op_type = "Crop" elif op == "DepthToSpace" or op == "PixelShuffle": layer.op_type = "PixelShuffle" elif op == "Dropout": layer.op_type = "Dropout" output_size = 1 elif op == "Elu": layer.op_type = "ELU" elif op == "EmbedLayerNormalization": layer.op_type = "EmbedLayerNormalization" elif op == "Flatten": layer.op_type = "Flatten" 
elif op == "Gelu": layer.op_type = "GELU" elif op == "Gemm": alpha = get_node_attr_f(node, "alpha", 1) beta = get_node_attr_f(node, "beta", 1) transA = get_node_attr_i(node, "transA", 0) transB = get_node_attr_i(node, "transB", 0) if alpha == 1 and beta == 1 and transA == 0 and transB == 1: # InnerProduct-like A * B + C layer.op_type = "InnerProduct" else: layer.op_type = "Gemm" elif op in [ "GlobalAveragePool", "GlobalMaxPool", "adaptive_avg_pool2d", "adaptive_max_pool2d", ]: layer.op_type = "Pooling" elif op == "GroupNorm": layer.op_type = "GroupNorm" elif op == "GRU": layer.op_type = "GRU" elif op == "HardSigmoid": layer.op_type = "HardSigmoid" elif op == "HardSwish": layer.op_type = "HardSwish" elif op == "ImageScaler": layer.op_type = "Scale" elif op == "InstanceNormalization": layer.op_type = "InstanceNorm" elif op == "LayerNorm": layer.op_type = "LayerNorm" elif op == "LeakyRelu" or op == "Relu": layer.op_type = "ReLU" elif op == "LRN": layer.op_type = "LRN" elif op == "LSTM": layer.op_type = "LSTM" elif op == "MatMul": if ( node.input[1] in self.weights and len(self.weights[node.input[1]].dims) == 2 ): layer.op_type = "InnerProduct" else: layer.op_type = "Gemm" elif op == "MultiHeadAttention": layer.op_type = "MultiHeadAttention" elif op == "Normalize": layer.op_type = "Normalize" elif op == "Pad": layer.op_type = "Padding" elif op == "PRelu": layer.op_type = "PReLU" elif op in [ "ReduceMax", "ReduceMin", "ReduceMean", "ReduceProd", "ReduceSum", "ReduceSumSquare", "ReduceL1", "ReduceL2", "ReduceLogSum", "ReduceLogSumExp", ]: layer.op_type = "Reduction" elif op == "Reorg": layer.op_type = "Reorg" elif op == "Reshape": layer.op_type = "Reshape" elif op == "RNN": layer.op_type = "RNN" elif op == "ShuffleChannel": layer.op_type = "ShuffleChannel" elif op == "Sigmoid": layer.op_type = "Sigmoid" elif op == "SkipLayerNormalization": layer.op_type = "SkipLayerNormalization" elif op == "Softmax": layer.op_type = "Softmax" elif op == "Softplus": layer.op_type = "Softplus" elif op == "Split": layer.op_type = "Slice" elif op == "Squeeze": layer.op_type = "Squeeze" elif op == "Sum": layer.op_type = "Eltwise" elif op == "Swish": layer.op_type = "Swish" elif op == "Transpose": layer.op_type = "Permute" elif op == "Upsample" or op == "Resize": layer.op_type = "Interp" elif op == "Unsqueeze": layer.op_type = "ExpandDims" else: error_msg = f"{op} not currently supported by NCNN." 
raise ValueError(error_msg) layer.name = name layer.num_inputs = input_size layer.num_outputs = output_size layer.params.set_op(layer.op_type) for input_name in node.input: # check weight if input_name and not ( input_name in self.weights and self.node_reference[input_name] == 0 ): if input_name in split_node_reference: refidx = split_node_reference[input_name] - 1 split_node_reference[input_name] = refidx input_name = f"{input_name}_splitncnn_{refidx}" layer.inputs.append(input_name) for o in range(output_size): layer.outputs.append(node.output[o]) if op == "Abs": layer.add_param(0, UOT.ABS) elif op == "Acos": layer.add_param(0, UOT.ACOS) elif layer.op_type == "BinaryOp": if op == "Add": layer.add_param(0, BOT.ADD) elif op == "Div": layer.add_param(0, BOT.DIV) elif op == "Max": layer.add_param(0, BOT.MAX) elif op == "Min": layer.add_param(0, BOT.MIN) elif op == "Mul": layer.add_param(0, BOT.MUL) elif op == "Pow": layer.add_param(0, BOT.POW) elif op == "RDiv": layer.add_param(0, BOT.RDIV) elif op == "RSub": layer.add_param(0, BOT.RSUB) elif op == "Sub": layer.add_param(0, BOT.SUB) with_scalar = get_node_attr_i(node, "with_scalar", 0) b = get_node_attr_f(node, "b", 0) if with_scalar: layer.add_param(1, with_scalar) layer.add_param(2, b) elif op == "Asin": layer.add_param(0, UOT.ASIN) elif op == "Atan": layer.add_param(0, UOT.ATAN) elif op == "AveragePool" or op == "MaxPool": auto_pad = get_node_attr_s(node, "auto_pad") ceil_mode = get_node_attr_i(node, "ceil_mode", 0) kernel_shape = get_node_attr_ai(node, "kernel_shape") strides = get_node_attr_ai(node, "strides") pads = get_node_attr_ai(node, "pads") pool = int(op == "AveragePool") if ceil_mode == 1: pad_mode = PAM.FULL elif auto_pad == "SAME_UPPER": pad_mode = PAM.SAMEUPPER elif auto_pad == "SAME_LOWER": pad_mode = PAM.SAMELOWER else: pad_mode = PAM.VALID layer.add_param(0, pool) if kernel_shape.size == 1: layer.add_param(1, int(kernel_shape[0])) elif kernel_shape.size == 2: layer.add_param(1, int(kernel_shape[1])) layer.add_param(11, int(kernel_shape[0])) if strides.size == 1: layer.add_param(2, int(strides[0])) elif strides.size == 2: layer.add_param(2, int(strides[1])) layer.add_param(12, int(strides[0])) if pads.size == 1: layer.add_param(3, int(pads[0])) elif pads.size == 2: layer.add_param(3, int(pads[1])) layer.add_param(13, int(pads[0])) elif pads.size == 4: layer.add_param(3, int(pads[1])) layer.add_param(13, int(pads[0])) layer.add_param(14, int(pads[3])) layer.add_param(15, int(pads[2])) layer.add_param(5, pad_mode) if pool: avgpool_count_include_pad = get_node_attr_i( node, "count_include_pad", 0 ) layer.add_param(6, avgpool_count_include_pad) elif op == "BatchNormalization": epsilon = get_node_attr_f(node, "epsilon", 0.00001) scale = self.weights[node.input[1]] B = self.weights[node.input[2]] mean = self.weights[node.input[3]] var = self.weights[node.input[4]] channels = get_tensor_proto_data_size(scale, scale.data_type) layer.add_param(0, channels) bin_length += self.add_weight(layer, "slope", scale) bin_length += self.add_weight(layer, "mean", mean) # apply epsilon to var v = onph.to_array(var) ve = np.array([v[i] + epsilon for i in range(channels)], np.float32) bin_length += self.add_weight(layer, "variance", ve) bin_length += self.add_weight(layer, "bias", B) elif op == "BiasGelu": B = self.weights[node.input[1]] layer.add_param(0, get_tensor_proto_data_size(B, B.data_type)) bin_length += self.add_weight(layer, "bias", B) elif op == "Ceil": layer.add_param(0, UOT.CEIL) elif op == "Clip": if len(node.input) == 1: minimum = 
get_node_attr_f(node, "min", -FLOAT32_MAX) maximum = get_node_attr_f(node, "max", FLOAT32_MAX) else: minimum = ( get_node_attr_from_input_f(self.weights[node.input[1]]) if node.input[1] in self.weights else -FLOAT32_MAX ) maximum = ( get_node_attr_from_input_f(self.weights[node.input[2]]) if node.input[2] in self.weights else FLOAT32_MAX ) layer.add_param(0, minimum) layer.add_param(1, maximum) elif op == "Concat": axis = get_node_attr_i(node, "axis", 1) layer.add_param(0, axis - 1 if axis > 0 else axis) elif op == "Constant": logger.error("chaiNNer: code should not have reached inside Constant") elif op == "Conv": W = self.weights[node.input[1]] num_filter = W.dims[0] has_bias = int(len(node.input) == 3) auto_pad = get_node_attr_s(node, "auto_pad") kernel_shape = get_node_attr_ai(node, "kernel_shape") dilations = get_node_attr_ai(node, "dilations") strides = get_node_attr_ai(node, "strides") pads = get_node_attr_ai(node, "pads") group = get_node_attr_i(node, "group", 1) layer.add_param(0, num_filter) if kernel_shape.size == 1: layer.add_param(1, int(kernel_shape[0])) elif kernel_shape.size == 2: layer.add_param(1, int(kernel_shape[1])) layer.add_param(11, int(kernel_shape[0])) if dilations.size == 1: layer.add_param(2, int(dilations[0])) elif dilations.size == 2: layer.add_param(2, int(dilations[1])) layer.add_param(12, int(dilations[0])) if strides.size == 1: layer.add_param(3, int(strides[0])) elif strides.size == 2: layer.add_param(3, int(strides[1])) layer.add_param(13, int(strides[0])) if auto_pad == "SAME_UPPER": layer.add_param(4, -233) elif auto_pad == "SAME_LOWER": layer.add_param(4, -234) else: if pads.size == 1: layer.add_param(4, int(pads[0])) elif pads.size == 2: layer.add_param(4, int(pads[1])) layer.add_param(14, int(pads[0])) elif pads.size == 4: layer.add_param(4, int(pads[1])) layer.add_param(14, int(pads[0])) layer.add_param(15, int(pads[3])) layer.add_param(16, int(pads[2])) layer.add_param(5, has_bias) layer.add_param(6, get_tensor_proto_data_size(W, W.data_type)) if group > 1: layer.add_param(7, int(group)) quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 bin_length += self.add_weight(layer, "weight", W, quantize_tag) if has_bias: B = self.weights[node.input[2]] bin_length += self.add_weight(layer, "bias", B) elif op == "ConvTranspose": W = self.weights[node.input[1]] has_bias = int(len(node.input) == 3) auto_pad = get_node_attr_s(node, "auto_pad") kernel_shape = get_node_attr_ai(node, "kernel_shape") dilations = get_node_attr_ai(node, "dilations") strides = get_node_attr_ai(node, "strides") output_padding = get_node_attr_ai(node, "output_padding") output_shape = get_node_attr_ai(node, "output_shape") pads = get_node_attr_ai(node, "pads") group = get_node_attr_i(node, "group", 1) num_filter = W.dims[1] * group layer.add_param(0, num_filter) if kernel_shape.size == 1: layer.add_param(1, int(kernel_shape[0])) elif kernel_shape.size == 2: layer.add_param(1, int(kernel_shape[1])) layer.add_param(11, int(kernel_shape[0])) if dilations.size == 1: layer.add_param(2, int(dilations[0])) elif dilations.size == 2: layer.add_param(2, int(dilations[1])) layer.add_param(12, int(dilations[0])) if strides.size == 1: layer.add_param(3, int(strides[0])) elif strides.size == 2: layer.add_param(3, int(strides[1])) layer.add_param(13, int(strides[0])) if auto_pad == "SAME_UPPER": layer.add_param(4, -233) elif auto_pad == "SAME_LOWER": layer.add_param(4, -234) else: if pads.size == 1: layer.add_param(4, int(pads[0])) elif pads.size == 2: layer.add_param(4, int(pads[1])) 
layer.add_param(14, int(pads[0])) elif pads.size == 4: layer.add_param(4, int(pads[1])) layer.add_param(14, int(pads[0])) layer.add_param(15, int(pads[3])) layer.add_param(16, int(pads[2])) if output_padding.size == 1: layer.add_param(18, int(output_padding[0])) elif output_padding.size == 2: layer.add_param(18, int(output_padding[1])) layer.add_param(19, int(output_padding[0])) if output_shape.size == 1: layer.add_param(20, int(output_shape[0])) elif output_shape == 2: layer.add_param(20, int(output_shape[1])) layer.add_param(21, int(output_shape[0])) layer.add_param(5, has_bias) weight_data_size = get_tensor_proto_data_size(W, W.data_type) layer.add_param(6, weight_data_size) if group > 1: layer.add_param(7, group) quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 weight_data = onph.to_array(W) bin_length += self.add_weight( layer, "weight", weight_data.swapaxes(0, 1), quantize_tag ) if has_bias: B = self.weights[node.input[2]] bin_length += self.add_weight(layer, "bias", B) elif op == "Cos": layer.add_param(0, UOT.COS) elif op == "Crop": starts = get_node_attr_ai(node, "starts") layer.add_param(9, [starts.size, *starts]) ends = get_node_attr_ai(node, "ends") layer.add_param(10, [ends.size, *ends]) axes = get_node_attr_ai(node, "axis") layer.add_param(11, [axes.size, *axes]) elif op == "DepthToSpace": # pixelshuffle scale_factor = get_node_attr_i(node, "blocksize", 1) mode = get_node_attr_s(node, "mode") layer.add_param(0, scale_factor) if mode == "CRD": layer.add_param(1, 0) elif mode == "DCR": layer.add_param(1, 1) elif op == "Dropout": pass elif op == "Elu": alpha = get_node_attr_f(node, "alpha", 1) layer.add_param(0, alpha) elif op == "EmbedLayerNormalization": logger.error(f"chaiNNer: No NCNN documentation for {op} yet, will not function") words = self.weights[node.input[2]] positions = self.weights[node.input[3]] W = self.weights[node.input[5]] B = self.weights[node.input[6]] layer.add_param(0, get_tensor_proto_data_size(B, B.data_type)) layer.add_param(1, get_tensor_proto_data_size(words, words.data_type)) layer.add_param( 2, get_tensor_proto_data_size(positions, positions.data_type) ) quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 bin_length += self.add_weight(layer, "words", words, DTYPE_FP32) bin_length += self.add_weight(layer, "positions", positions, DTYPE_FP32) bin_length += self.add_weight(layer, "weight", W, quantize_tag) bin_length += self.add_weight(layer, "bias", B) elif op == "Exp": layer.add_param(0, UOT.EXP) elif op == "Flatten": axis = get_node_attr_i(node, "axis", 1) if axis != 1: raise ValueError(f"Unsupported Flatten axis {axis}.") elif op == "Floor": layer.add_param(0, UOT.FLOOR) elif op == "Gelu": layer.add_param(0, 1) elif op == "Gemm": alpha = get_node_attr_f(node, "alpha", 1) beta = get_node_attr_f(node, "beta", 1) transA = get_node_attr_i(node, "transA", 0) transB = get_node_attr_i(node, "transB", 0) if alpha == 1 and beta == 1 and transA == 0 and transB == 1: # InnerProduct-like A * B * C B = self.weights[node.input[1]] C = self.weights[node.input[2]] layer.add_param(0, get_tensor_proto_data_size(C, C.data_type)) layer.add_param(1, 1) layer.add_param(2, get_tensor_proto_data_size(B, B.data_type)) quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 bin_length += self.add_weight(layer, "B", B, quantize_tag) bin_length += self.add_weight(layer, "C", C) else: # gemm layer.add_param(0, alpha) layer.add_param(1, beta) layer.add_param(2, transA) layer.add_param(3, transB) elif op == "GlobalAveragePool" or op == "GlobalMaxPool": layer.add_param(0, int(op 
== "GlobalAveragePool")) layer.add_param(4, 1) elif op == "adaptive_avg_pool2d" or op == "adaptive_max_pool2d": out_shape_tp = self.weights[node.input[1]] out_shape = get_node_attr_from_input_ai(out_shape_tp) layer.add_param(0, int(op == "adaptive_avg_pool2d")) layer.add_param(7, 1) if out_shape.size == 1: layer.add_param(8, int(out_shape[0])) elif out_shape.size == 2: layer.add_param(8, int(out_shape[1])) # out_w layer.add_param(18, int(out_shape[0])) # out_h elif op == "GroupNorm": groups = get_node_attr_i(node, "groups", 1) channels = get_node_attr_i(node, "channels", 1) eps = get_node_attr_f(node, "epsilon", 0.00001) affine = get_node_attr_i(node, "affine", 1) if affine: # discard affine-less S=1 B=0 affine_S = get_node_attr_from_input_af(self.weights[node.input[1]]) affine_B = get_node_attr_from_input_af(self.weights[node.input[2]]) if ( affine_S.size == 1 and affine_S[0] == 1 and affine_B.size == 1 and affine_B[0] == 0 ): affine = 0 else: if np.any(affine_S[:channels] != 1) or np.any( affine_B[:channels] != 0 ): affine = 1 else: affine = 0 layer.add_param(0, groups) layer.add_param(1, channels) layer.add_param(2, eps) layer.add_param(3, affine) if affine: scale = self.weights[node.input[1]] B = self.weights[node.input[2]] bin_length += self.add_weight(layer, "scale", scale) bin_length += self.add_weight(layer, "bias", B) elif op == "GRU": # W = self.weights[node.input[1]] # R = self.weights[node.input[2]] # B = self.weights[node.input[3]] # hidden_size = get_node_attr_i(node, "hidden_size", 0) # direction = get_node_attr_s(node, "direction") # if direction == "forward": # direction_type = GRU.FORWARD # elif direction == "reverse": # direction_type = GRU.REVERSE # elif direction == "bidirectional": # direction_type = GRU.BIDIRECTIONAL # weight_data_size = get_tensor_proto_data_size(W) # layer.add_param(0, hidden_size) # layer.add_param(1, weight_data_size) # layer.add_param(2, direction_type) # num_directions = 2 if direction_type == GRU.BIDIRECTIONAL else 1 # reorder num_directions-URN-hidden_size to num_directions-RUN-hidden_size # quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 # logger.error( # "Not sure GRU weight reordering is accurate, " # "docs and code comments appear to give different shape orders" # ) # W_array = onph.to_array(W) # W_array = np.stack( # (W_array[:, 1, :], W_array[:, 0, :], W_array[:, 2, :]), axis=1 # ) # bin_length += self.add_weight(layer, W_array, "weight_xc_data", quantize_tag, is_fp16) # reduce U and R bias except N # reorder num_directions-URN-hidden to num_directions-RUN-hidden # B_array = onph.to_array(B) # bias_data_size_g = B_array.size / 6 / num_directions # for i in range(bias_data_size_g)[1:]: # pass raise RuntimeError( "GRU not implemented yet, please report issue with model used" ) elif op == "HardSigmoid" or op == "Hard Swish": alpha = get_node_attr_f(node, "alpha", 0.2) beta = get_node_attr_f(node, "beta", 0.5) layer.add_param(0, alpha) layer.add_param(1, beta) elif op == "ImageScaler": bias = get_node_attr_af(node, "bias") scale = get_node_attr_f(node, "scale", 1) channels = bias.size layer.add_param(0, channels) layer.add_param(1, 1) bin_length += self.add_weight(layer, "scale", np.array((scale,) * 3)) bin_length += self.add_weight(layer, "bias", bias) elif op == "InstanceNormalization": eps = get_node_attr_f(node, "epsilon", 0.00001) # Discard affine-less S=1 B=0 affine_S = get_node_attr_from_input_af(self.weights[node.input[1]]) affine_B = get_node_attr_from_input_af(self.weights[node.input[2]]) channels = affine_S.size if 
np.any(affine_S[:channels] != 1) or np.any(affine_B[:channels] != 0): affine = 1 else: affine = 0 layer.add_param(0, channels) layer.add_param(1, eps) layer.add_param(2, affine) if affine: scale = self.weights[node.input[1]] B = self.weights[node.input[2]] bin_length += self.add_weight(layer, "scale", scale) bin_length += self.add_weight(layer, "bias", B) elif op == "LayerNorm": eps = get_node_attr_f(node, "epsilon", 0.00001) affine = get_node_attr_i(node, "affine", 1) if affine: # discard affine-less S=1 B=0 affine_S = get_node_attr_from_input_af(self.weights[node.input[1]]) affine_B = get_node_attr_from_input_af(self.weights[node.input[2]]) affine_size = affine_S.size if np.any(affine_S[:affine_size] != 1) or np.any( affine_B[:affine_size] ): affine = 1 else: affine = 0 if affine: layer.add_param(0, affine_size) layer.add_param(1, eps) layer.add_param(2, affine) if affine: scale = self.weights[node.input[1]] B = self.weights[node.input[2]] bin_length += self.add_weight(layer, "scale", scale) bin_length += self.add_weight(layer, "bias", B) elif op == "LeakyRelu": alpha = get_node_attr_f(node, "alpha", 0.01) layer.add_param(0, alpha) elif op == "Log": layer.add_param(0, UOT.LOG) elif op == "LRN": layer.add_param(0, 0) layer.add_param(1, get_node_attr_i(node, "size", 1)) layer.add_param(2, get_node_attr_f(node, "alpha", 1)) layer.add_param(3, get_node_attr_f(node, "beta", 0.5)) layer.add_param(4, get_node_attr_f(node, "bias", 1)) elif op == "LSTM": # W = self.weights[node.input[1]] # R = self.weights[node.input[2]] # B = self.weights[node.input[3]] # hidden_size = get_node_attr_i(node, "hidden_size", 0) # direction = get_node_attr_s(node, "direction") # if direction == "forward": # direction_type = GRU.FORWARD # elif direction == "reverse": # direction_type = GRU.REVERSE # elif direction == "bidirectional": # direction_type = GRU.BIDIRECTIONAL raise RuntimeError( "LSTM not implemented yet, please report issue with model used" ) elif op == "MatMul": if node.input[1] in self.weights: # InnerProduct B = self.weights[node.input[1]] weight_data_size = get_tensor_proto_data_size(B, B.data_type) num_output = B.dims[-1] layer.add_param(0, num_output) layer.add_param(1, 0) layer.add_param(2, weight_data_size) B_array = onph.to_array(B) bin_length += self.add_weight(layer, "bias", B_array.T, DTYPE_FP32) # There is a dead else here, not sure if this was incomplete code elif op == "MultiHeadAttention": # embed_dim = get_node_attr_i(node, "embed_dim", 0) # num_heads = get_node_attr_i(node, "num_heads", 0) # layer.add_param(0, embed_dim) # layer.add_param(1, num_heads) # if len(node.input) == 5: # qkvw = self.weights[node.input[1]] # qkvb = self.weights[node.input[2]] # ow = self.weights[node.input[3]] # ob = self.weights[node.input[4]] # weight_data_size = get_tensor_proto_data_size(ow) # layer.add_param(2, weight_data_size) # quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 raise RuntimeError( "MultiHeadAttention not implemented, please report issue with model used" ) elif op == "Neg": layer.add_param(0, UOT.NEG) elif op == "Normalize": eps = get_node_attr_f(node, "eps", 0) layer.add_param(1, 1) # channel_shared layer.add_param(2, eps) layer.add_param(3, 1) # scale_data_size layer.add_param(9, NEM.PYTORCH) bin_length += self.add_weight(layer, "scale", 1) elif op == "Pad": mode = get_node_attr_s(node, "mode") value = get_node_attr_f(node, "value", 0) if len(node.input) == 1: pads = get_node_attr_ai(node, "pads") else: pads = get_node_attr_from_input_ai(self.weights[node.input[1]]) if mode == "edge": 
ptype = PAT.REPLICATE elif mode == "reflect": ptype = PAT.REFLECT else: ptype = PAT.CONSTANT pad_size = pads.size top = bottom = front = behind = 0 if pad_size == 8: # NCHW top = pads[2] bottom = pads[6] left = pads[3] right = pads[7] front = pads[1] behind = pads[5] elif pad_size == 6: # NHW top = pads[1] bottom = pads[4] left = pads[2] right = pads[5] else: # NW left = pads[1] right = pads[3] layer.add_param(0, int(top)) layer.add_param(1, int(bottom)) layer.add_param(2, int(left)) layer.add_param(3, int(right)) layer.add_param(4, int(ptype)) layer.add_param(5, int(value)) layer.add_param(7, int(front)) layer.add_param(8, int(behind)) elif op == "PixelShuffle": layer.add_param(0, get_node_attr_i(node, "scale_factor", 1)) elif op == "PRelu": slope = self.weights[node.input[1]] num_slope = get_tensor_proto_data_size(slope, slope.data_type) layer.add_param(0, num_slope) bin_length += self.add_weight(layer, "slope", slope) elif op == "Reciprocal": layer.add_param(0, UOT.RECIPROCAL) elif op in [ "ReduceMax", "ReduceMin", "ReduceMean", "ReduceProd", "ReduceSum", "ReduceSumSquare", "ReduceL1", "ReduceL2", "ReduceLogSum", "ReduceLogSumExp", ]: if op == "ReduceSum": op_type = ROT.SUM elif op == "ReduceSumSquare": op_type = ROT.SUMSQ elif op == "ReduceMean": op_type = ROT.MEAN elif op == "ReduceMax": op_type = ROT.MAX elif op == "ReduceMin": op_type = ROT.MIN elif op == "ReduceProd": op_type = ROT.PROD elif op == "ReduceL1": op_type = ROT.L1 elif op == "ReduceL2": op_type = ROT.L2 elif op == "ReduceLogSum": op_type = ROT.LOGSUM elif op == "ReduceLogSumExp": op_type = ROT.LOGSUMEXP else: op_type = -233 layer.add_param(0, op_type) axes = get_node_attr_ai(node, "axes") keepdims = get_node_attr_i(node, "keepdims", 1) if axes.size > 0: # if axes set, reduce according to axes layer.add_param(1, 0) for axis in axes: if axis == 0 or axis > 4 or axis < -3: raise ValueError(f"Unsupported axis {axis} in Reduction") layer.add_param( 3, [axes.size, *[a - 1 if a > 0 else a for a in axes]], ) else: # if axes not set, reduce all axes by default layer.add_param(1, 1) layer.add_param(4, keepdims) logger.error("chaiNNer: No NCNN documentation for Reduction param 5") layer.add_param(5, 1) elif op == "Reorg": layer.add_param(0, get_node_attr_i(node, "stride", 1)) elif op == "Reshape": if len(node.input) == 1: shape = get_node_attr_ai(node, "shape") else: shape = get_node_attr_from_input_ai(self.weights[node.input[1]]) shape_size = shape.size if shape_size == 1: logger.error("chaiNNer: Should never reach shape.size == 1 in Reshape") layer.add_param(0, int(shape[0])) elif shape_size == 2: layer.add_param(0, int(shape[1])) elif shape_size == 3: layer.add_param(0, int(shape[2])) layer.add_param(1, int(shape[1])) elif shape_size == 4: layer.add_param(0, int(shape[3])) layer.add_param(1, int(shape[2])) layer.add_param(2, int(shape[1])) elif shape_size == 5: layer.add_param(0, int(shape[3] * shape[3])) layer.add_param(1, int(shape[2])) layer.add_param(2, int(shape[1])) elif op == "Resize": mode = get_node_attr_s(node, "mode") align = get_node_attr_s(node, "coordinate_transformation_mode") if len(node.input) == 2: # opset 10 scales = get_node_attr_from_input_af(self.weights[node.input[1]]) sizes = np.empty(0, np.int32) else: # opset 11+ scales = get_node_attr_from_input_af(self.weights[node.input[2]]) if len(node.input) >= 4: sizes = get_node_attr_from_input_ai(self.weights[node.input[3]]) else: sizes = np.empty(0, np.int32) if mode == "linear": resize_type = IRT.BILINEAR elif mode == "cubic": resize_type = IRT.BICUBIC else: 
resize_type = IRT.NEAREST if scales.size == 0 and sizes.size == 0: raise ValueError( "Unsupported Resize scales and sizes are all empty." ) if scales.size == 2: h_scale = 1 w_scale = scales[1] elif scales.size == 3: h_scale = scales[1] w_scale = scales[2] elif scales.size == 4: if scales[1] != 1: raise TypeError(f"Unsupported Resize scales {scales}.") h_scale = scales[2] w_scale = scales[3] else: h_scale = 1 w_scale = 1 if sizes.size == 2: output_height = 0 output_width = sizes[1] elif sizes.size == 3: output_height = sizes[1] output_width = sizes[2] elif sizes.size == 4: output_height = sizes[2] output_width = sizes[3] else: output_height = 0 output_width = 0 align_corner = int(align == "align_corners") layer.add_param(0, resize_type) layer.add_param(1, float(h_scale)) layer.add_param(2, float(w_scale)) layer.add_param(3, int(output_height)) layer.add_param(4, int(output_width)) layer.add_param(6, align_corner) elif op == "RNN": W = self.weights[node.input[1]] R = self.weights[node.input[2]] B = self.weights[node.input[3]] hidden_size = get_node_attr_i(node, "hidden_size", 0) direction = get_node_attr_s(node, "direction") if direction == "reverse": direction_type = GRU.REVERSE elif direction == "bidirectional": direction_type = GRU.BIDIRECTIONAL else: direction_type = GRU.FORWARD weight_data_size = get_tensor_proto_data_size(W, W.data_type) layer.add_param(0, hidden_size) layer.add_param(1, weight_data_size) layer.add_param(2, direction_type) quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 bin_length += self.add_weight(layer, "weight", W, quantize_tag) # reduce xc and hc bias reduced_B = np.sum(onph.to_array(B), 1) bin_length += self.add_weight(layer, "bias", reduced_B, quantize_tag) bin_length += self.add_weight(layer, "R", R, quantize_tag) elif op == "ShuffleChannel": layer.add_param(0, get_node_attr_i(node, "group", 1)) layer.add_param(1, get_node_attr_i(node, "reverse", 0)) elif op == "Sigmoid": pass elif op == "Sin": layer.add_param(0, UOT.SIN) elif op == "SkipLayerNormalization": logger.error(f"chaiNNer: No NCNN documentation for {op} yet, will not function") W = self.weights[node.input[2]] B = self.weights[node.input[3]] B2 = self.weights[node.input[4]] layer.add_param(0, get_tensor_proto_data_size(B, B.data_type)) quantize_tag = DTYPE_FP16 if is_fp16 else DTYPE_FP32 bin_length += self.add_weight(layer, "weight", W, quantize_tag) bin_length += self.add_weight(layer, "bias1", B, DTYPE_FP32) bin_length += self.add_weight(layer, "bias2", B2, DTYPE_FP32) elif op == "Slice": input_size = len(node.input) if input_size == 1: starts = get_node_attr_ai(node, "starts") ends = get_node_attr_ai(node, "ends") axes = get_node_attr_ai(node, "axes") steps = get_node_attr_ai(node, "steps") else: starts = get_node_attr_from_input_ai(self.weights[node.input[1]]) ends = get_node_attr_from_input_ai(self.weights[node.input[2]]) if input_size >= 4: axes = get_node_attr_from_input_ai(self.weights[node.input[3]]) else: axes = np.empty(0, np.int32) if input_size >= 5: steps = get_node_attr_from_input_ai(self.weights[node.input[4]]) else: steps = np.empty(0, np.int32) assert np.all(steps != 1), f"Unsupported Slice step {steps}" # Filter out N-dim axis if axes.size: for i, axis in enumerate(axes): if axis == 0: np.delete(starts, i) np.delete(ends, i) np.delete(axes, i) break layer.add_param(9, [starts.size, *list(starts)]) layer.add_param(10, [ends.size, *list(ends)]) if axes.size: assert np.all( axes != 0 and axes <= 3 and axes >= -3 ), f"Unsupported Slice axes {axes}" layer.add_param( 11, [axes.size, 
*[a - 1 if a > 0 else a for a in axes]] ) elif op == "Softmax": axis = get_node_attr_i(node, "axis", 1) layer.add_param(0, axis - 1) layer.add_param(1, 1) elif op == "Split": axis = get_node_attr_i(node, "axis", 0) splits = get_node_attr_ai(node, "split") assert axis >= 1, f"Unsupported axis {axis} in Split" if splits.size: layer.add_param(0, [output_size, *list(splits[:-1]), -233]) else: layer.add_param( 0, [output_size, *[-233 for _ in range(output_size)]] ) layer.add_param(1, axis - 1) elif op == "Sqrt": layer.add_param(0, UOT.SQRT) elif op == "Squeeze": axes = get_node_attr_ai(node, "axes") if axes.size: assert np.all( axes != 0 and axes <= 4 and axes >= -3 ), f"Unsupported Squeeze axes {axes}" layer.add_param( 3, [axes.size, *[a - 1 if a > 0 else a for a in axes]] ) else: layer.add_param(0, 1) layer.add_param(1, 1) layer.add_param(2, 1) elif op == "Sum": layer.add_param(0, EOT.SUM) elif op == "Swish": pass elif op == "Tan": layer.add_param(0, UOT.TAN) elif op == "Tanh": layer.add_param(0, UOT.TANH) elif op == "Transpose": perm = get_node_attr_ai(node, "perm") if perm.size == 3: if (perm[1] == 1 and perm[2] == 2) or ( perm[0] == 1 and perm[1] == 0 and perm[2] == 2 ): layer.add_param(0, POT.WH_WHC_WHDC) elif (perm[1] == 2 and perm[2] == 1) or ( perm[0] == 2 and perm[1] == 0 and perm[2] == 1 ): layer.add_param(0, POT.HW_HWC_HWDC) elif perm.size == 4: if perm[1] == 1 and perm[2] == 2 and perm[3] == 3: layer.add_param(0, POT.WH_WHC_WHDC) elif perm[1] == 1 and perm[2] == 3 and perm[3] == 2: layer.add_param(0, POT.HW_HWC_HWDC) elif perm[1] == 2 and perm[2] == 1 and perm[3] == 3: layer.add_param(0, POT.WCH_WDHC) elif perm[1] == 2 and perm[2] == 3 and perm[3] == 1: layer.add_param(0, POT.CWH_DWHC) elif perm[1] == 3 and perm[2] == 1 and perm[3] == 2: layer.add_param(0, POT.HCW_HDWC) elif perm[1] == 3 and perm[2] == 2 and perm[3] == 1: layer.add_param(0, POT.CHW_DHWC) elif perm.size == 5: if perm[1] == 1 and perm[2] == 2 and perm[3] == 3 and perm[4] == 4: layer.add_param(0, POT.WH_WHC_WHDC) elif ( perm[1] == 1 and perm[2] == 3 and perm[3] == 4 and perm[4] == 2 ): layer.add_param(0, POT.HW_HWC_HWDC) elif ( perm[1] == 2 and perm[2] == 1 and perm[3] == 3 and perm[4] == 4 ): layer.add_param(0, POT.WCH_WDHC) elif ( perm[1] == 2 and perm[2] == 3 and perm[3] == 4 and perm[4] == 1 ): layer.add_param(0, POT.CWH_DWHC) elif ( perm[1] == 3 and perm[2] == 4 and perm[3] == 1 and perm[4] == 2 ): layer.add_param(0, POT.HCW_HDWC) elif ( perm[1] == 3 and perm[2] == 4 and perm[3] == 2 and perm[4] == 1 ): layer.add_param(0, POT.CHW_DHWC) else: error_msg = f"Unsupported Transpose type {perm}" raise ValueError(error_msg) elif op == "Upsample": mode = get_node_attr_s(node, "mode") align = get_node_attr_s(node, "coordinate_transformation_mode") if len(node.input) == 1: scales = get_node_attr_af(node, "scales") else: scales = get_node_attr_from_input_af(self.weights[node.input[1]]) if mode == "bilinear" or mode == "linear": resize_type = IRT.BILINEAR elif mode == "trilinear": raise ValueError("Upsample does not support trilinear mode") else: resize_type = IRT.NEAREST if scales.size == 2: h_scale = 1 w_scale = scales[1] elif scales.size == 3: h_scale = scales[1] w_scale = scales[2] elif scales.size == 4: h_scale = scales[2] w_scale = scales[3] if scales[1] != 1: error_msg = f"Unsupported Upsample scales {scales}" raise ValueError(error_msg) else: error_msg = f"Unsupported Upsample scales {scales}" raise ValueError(error_msg) align_corner = int(align == "align_corners") layer.add_param(0, resize_type) layer.add_param(1, 
                float(h_scale))
                layer.add_param(2, float(w_scale))
                layer.add_param(6, align_corner)
            elif op == "Unsqueeze":
                axes = get_node_attr_ai(node, "axes")
                assert (
                    np.all(axes != 0) and np.all(axes <= 4) and np.all(axes >= -4)
                ), f"Unsupported axes {axes} in Unsqueeze"
                layer.add_param(
                    3, [axes.size, *[axis - 1 if axis > 0 else axis for axis in axes]]
                )
            else:
                # NCNN TODO: op specific param
                # This is presumably to catch anything they haven't written an op for yet
                for attr in node.attribute:
                    if attr.type == 1:
                        error_msg = f"Op {op} does not exist yet; {attr.name}={attr.f}"
                    elif attr.type == 2:
                        error_msg = f"Op {op} does not exist yet; {attr.name}={attr.i}"
                    elif attr.type == 3:
                        error_msg = f"Op {op} does not exist yet; {attr.name}={attr.s}"
                    else:
                        error_msg = (
                            f"Op {op} does not exist yet; {attr.name}={attr.type}"
                        )
                    raise ValueError(error_msg)

            ncnn_model.add_layer(layer)

            for o in range(output_size):
                output_name = node.output[o]
                if output_name in self.node_reference:
                    refcount = self.node_reference[output_name]
                    if refcount > 1:
                        ncnn_model.add_layer(
                            NcnnLayer(
                                "Split",
                                f"splitncnn_{internal_split}",
                                1,
                                refcount,
                                [output_name],
                                [
                                    f"{output_name}_splitncnn_{j}"
                                    for j in range(refcount)
                                ],
                            )
                        )
                        internal_split += 1

        ncnn_model.bin_length = bin_length
        NcnnOptimizer(ncnn_model).optimize()

        return ncnn_model
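

# Illustrative usage sketch (assumption: a local "model.onnx" file exists; the path and
# the fp16 choice are hypothetical). It relies only on Onnx2NcnnConverter.convert as
# defined above and on the NcnnModel fields already used in this module.
if __name__ == "__main__":
    import onnx

    onnx_model = onnx.load("model.onnx")  # hypothetical model path
    converter = Onnx2NcnnConverter(onnx_model)
    ncnn_model = converter.convert(is_fp16=False, include_mem_data=True)
    logger.debug(
        f"Converted: {ncnn_model.node_count} layers, {ncnn_model.blob_count} blobs, "
        f"{ncnn_model.bin_length} weight bytes"
    )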