File size: 5,041 Bytes
c7f0cc1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Model configuration: a 'PSGTr' model built from a ResNet-50 backbone and a
# 'PSGFormerHead' that wraps a 'DualTransformer' (one encoder, two decoders).
# NOTE: no helper variables are introduced on purpose -- config loaders of
# this style typically export every module-level name as a config key.
model = {
    'type': 'PSGTr',
    # ---- backbone -------------------------------------------------------
    # All four stage outputs are requested; init_cfg points at the
    # torchvision ResNet-50 checkpoint.
    'backbone': {
        'type': 'ResNet',
        'depth': 50,
        'num_stages': 4,
        'out_indices': (0, 1, 2, 3),
        'frozen_stages': 1,
        'norm_cfg': {'type': 'BN', 'requires_grad': False},
        'norm_eval': True,
        'style': 'pytorch',
        'init_cfg': {
            'type': 'Pretrained',
            'checkpoint': 'torchvision://resnet50',
        },
    },
    # ---- head -----------------------------------------------------------
    'bbox_head': {
        'type': 'PSGFormerHead',
        'num_classes': 80,
        'num_relations': 117,
        'in_channels': 2048,
        'transformer': {
            'type': 'DualTransformer',
            'encoder': {
                'type': 'DetrTransformerEncoder',
                'num_layers': 6,
                'transformerlayers': {
                    'type': 'BaseTransformerLayer',
                    # The encoder passes attn_cfgs as a one-element list,
                    # whereas both decoders below pass a bare dict.
                    'attn_cfgs': [
                        {
                            'type': 'MultiheadAttention',
                            'embed_dims': 256,
                            'num_heads': 8,
                            'dropout': 0.1,
                        },
                    ],
                    'feedforward_channels': 2048,
                    'ffn_dropout': 0.1,
                    'operation_order': ('self_attn', 'norm', 'ffn', 'norm'),
                },
            },
            # decoder1 and decoder2 carry identical settings but are kept as
            # two separate dict objects.
            'decoder1': {
                'type': 'DetrTransformerDecoder',
                'return_intermediate': True,
                'num_layers': 6,
                'transformerlayers': {
                    'type': 'DetrTransformerDecoderLayer',
                    'attn_cfgs': {
                        'type': 'MultiheadAttention',
                        'embed_dims': 256,
                        'num_heads': 8,
                        'dropout': 0.1,
                    },
                    'feedforward_channels': 2048,
                    'ffn_dropout': 0.1,
                    'operation_order': (
                        'self_attn',
                        'norm',
                        'cross_attn',
                        'norm',
                        'ffn',
                        'norm',
                    ),
                },
            },
            'decoder2': {
                'type': 'DetrTransformerDecoder',
                'return_intermediate': True,
                'num_layers': 6,
                'transformerlayers': {
                    'type': 'DetrTransformerDecoderLayer',
                    'attn_cfgs': {
                        'type': 'MultiheadAttention',
                        'embed_dims': 256,
                        'num_heads': 8,
                        'dropout': 0.1,
                    },
                    'feedforward_channels': 2048,
                    'ffn_dropout': 0.1,
                    'operation_order': (
                        'self_attn',
                        'norm',
                        'cross_attn',
                        'norm',
                        'ffn',
                        'norm',
                    ),
                },
            },
        },
        'positional_encoding': {
            'type': 'SinePositionalEncoding',
            'num_feats': 128,
            'normalize': True,
        },
        # Relation / subject-id / object-id losses.
        'rel_loss_cls': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 2.0,
            'class_weight': 1.0,
        },
        'sub_id_loss': {'type': 'MultilabelCrossEntropy', 'loss_weight': 2.0},
        'obj_id_loss': {'type': 'MultilabelCrossEntropy', 'loss_weight': 2.0},
        # Classification / box losses; their weights (4.0 / 3.0 / 2.0) equal
        # the bbox_assigner cost weights in train_cfg below.
        'loss_cls': {
            'type': 'CrossEntropyLoss',
            'use_sigmoid': False,
            'loss_weight': 4.0,
            'class_weight': 1.0,
        },
        'loss_bbox': {'type': 'L1Loss', 'loss_weight': 3.0},
        'loss_iou': {'type': 'GIoULoss', 'loss_weight': 2.0},
        'focal_loss': {'type': 'BCEFocalLoss', 'loss_weight': 1.0},
        'dice_loss': {'type': 'psgtrDiceLoss', 'loss_weight': 1.0},
    },
    # ---- training and testing settings ---------------------------------
    'train_cfg': {
        # Every id-matching cost term is weighted 1.0.
        'id_assigner': {
            'type': 'IdMatcher',
            'sub_id_cost': {'type': 'ClassificationCost', 'weight': 1.},
            'obj_id_cost': {'type': 'ClassificationCost', 'weight': 1.},
            'r_cls_cost': {'type': 'ClassificationCost', 'weight': 1.},
        },
        'bbox_assigner': {
            'type': 'HungarianAssigner',
            'cls_cost': {'type': 'ClassificationCost', 'weight': 4.0},
            'reg_cost': {'type': 'BBoxL1Cost', 'weight': 3.0},
            'iou_cost': {'type': 'IoUCost', 'iou_mode': 'giou', 'weight': 2.0},
        },
    },
    'test_cfg': {'max_per_img': 100},
}