VGG모델 설명 VGG


VGG 모델을 구현한 torch.nn.Module을 상속한 VGG 클래스

class VGG(nn.Module):
    """VGG-style image classifier (11/13/16/19 weight layers) with batch norm.

    The network is a stack of 3x3 conv -> BatchNorm -> ReLU units separated by
    2x2 max-pooling layers, followed by adaptive average pooling to 7x7 and a
    three-layer fully connected classifier. ``forward`` returns raw class
    scores (logits); no softmax is applied inside the module.

    Args:
        num_layers: Depth of the VGG variant; one of 11, 13, 16 or 19.
        num_classes: Number of output scores produced by the last FC layer.
        init_weights: When true, re-initialize conv / batch-norm / linear
            parameters via ``_initialize_weights``.

    Raises:
        ValueError: If ``num_layers`` is not a supported depth.
    """

    # Layout of the convolutional stack for every supported depth.
    # An int is the out_channels of a conv unit, 'M' is a 2x2 max-pool.
    _CONFIGS = {
        11: (64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'),
        13: (64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'),
        16: (64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
             512, 512, 512, 'M', 512, 512, 512, 'M'),
        19: (64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M',
             512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'),
    }

    def __init__(self, num_layers, num_classes=2, init_weights=True):
        super().__init__()
        # Expected input is (N, 3, H, W); with 224x224 images the five
        # max-pool layers halve h and w each time: 224->112->56->28->14->7.
        self.in_channels = 3

        # Raise instead of print + sys.exit(): a model constructor must not
        # terminate the whole process, and callers can catch ValueError.
        try:
            cfg = self._CONFIGS[num_layers]
        except (KeyError, TypeError):
            supported = ", ".join(str(depth) for depth in sorted(self._CONFIGS))
            raise ValueError(
                f"unavailable number of layers: {num_layers!r} "
                f"(expected one of {supported})"
            ) from None
        # Stored as a fresh list so each instance owns its own copy.
        self.vgg_cfg = list(cfg)

        self.conv_layers = self._make_layers(self.vgg_cfg)
        # Classifier head:
        # adaptive avg-pool -> FC -> ReLU -> Dropout -> FC -> ReLU -> Dropout -> FC
        # (softmax, if needed, is left to the loss function / caller).
        self.adaptive_avgpooling = nn.AdaptiveAvgPool2d(7)
        self.fc_layers = nn.Sequential(
            nn.Linear(512*7*7, 4096),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4096, num_classes)
        )

        # placeholder for the gradients
        self.gradients = None

        if init_weights:
            self._initialize_weights()

    def _make_layers(self, cfg):
        """Build the convolutional stack described by ``cfg``.

        Args:
            cfg: Iterable whose items are either the string ``"M"`` (append a
                2x2, stride-2 max-pool) or an int (append a 3x3 conv with that
                many output channels, followed by BatchNorm2d and ReLU).

        Returns:
            An ``nn.Sequential`` holding the layers in order.
        """
        layers = []
        in_channels = self.in_channels
        for v in cfg:
            if v == "M":
                layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layers.extend([
                    # padding=1 with a 3x3 kernel keeps h and w unchanged.
                    nn.Conv2d(in_channels=in_channels, out_channels=v,
                              kernel_size=3, stride=1, padding=1),
                    nn.BatchNorm2d(v),
                    nn.ReLU(),
                ])
                # The next conv consumes what this one produced.
                in_channels = v
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return class scores of shape (N, num_classes) for an image batch."""
        output = self.conv_layers(x)
        # Adaptive pooling fixes the spatial size at 7x7 regardless of the
        # input resolution, so the first FC layer always sees 512*7*7 values.
        output = self.adaptive_avgpooling(output)
        # Flatten everything but the batch dimension -> (N, 512*7*7).
        output = output.flatten(1)
        output = self.fc_layers(output)
        return output

    def _initialize_weights(self):
        """Re-initialize the parameters of all conv, batch-norm and linear layers."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # He (Kaiming) initialization for ReLU networks.
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                # Identity affine transform: scale 1, shift 0.
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                # Weights drawn from a normal distribution (mean 0, std 0.01).
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)

1. init

def __init__(self, num_layers, num_classes=2, init_weights=True):
        # Excerpt: first half of the constructor - selects the conv-stack layout.
        super(VGG, self).__init__()
        # expected input image size: (N, 3, 224, 224)
        # after each max-pooling layer, h and w are divided by 2 : 224->112->56->28->14->7
        self.in_channels = 3
        # self.vgg_cfg lists, in order, the out_channels of each conv layer
        # and 'M' wherever a max-pooling layer is inserted
        if num_layers==11:
            self.vgg_cfg = [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']
        elif num_layers==13:
            self.vgg_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M']
        elif num_layers==16:
            self.vgg_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M']
        elif num_layers==19:
            self.vgg_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M']
        else: 
            # any other depth is unsupported: report it and terminate the process
            # NOTE(review): sys.exit() ends the whole program; raising ValueError
            # would let callers handle the error - consider changing.
            print("unavailable number of layers")
            sys.exit()

num_layers 파라미터는 VGG의 레이어 수를 뜻합니다. VGG11, 13, 16, 19의 구조를 각각 리스트로 나타냅니다(conv 레이어의 output 채널 수들과, maxpooling을 뜻하는 'M'으로 구성).

        # Excerpt: second half of the constructor - builds the layers.
        # conv part: built from the layout chosen above
        self.conv_layers = self._make_layers(self.vgg_cfg)
        # fc layers part : adaptive average pooling->FC->ReLU->Dropout->FC->ReLU->Dropout->FC
        # (softmax is not part of this module; the last FC layer outputs raw scores)
        self.adaptive_avgpooling = nn.AdaptiveAvgPool2d(7)  # output spatial size fixed to 7x7
        self.fc_layers = nn.Sequential(
            nn.Linear(512*7*7, 4096),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4096, num_classes)
        )

        # placeholder for the gradients
        self.gradients = None

        # optionally re-initialize all conv / batch-norm / linear parameters
        if init_weights:
            self._initialize_weights()

_make_layers(self, cfg) 함수로 conv부분을 만들어주므로 _make_layers(self, cfg)함수를 보겠습니다.

_make_layers(self, cfg)

def _make_layers(self, cfg):
        """Build the convolutional stack described by ``cfg``.

        Each ``"M"`` entry appends a 2x2, stride-2 max-pooling layer; every
        other entry ``v`` appends a 3x3 conv with ``v`` output channels
        followed by BatchNorm2d and ReLU. Returns the layers wrapped in an
        ``nn.Sequential``.
        """
        layers = []
        # channel count fed to the first conv layer
        in_channels = self.in_channels
        for v in cfg:
            if v == "M":
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                # conv -> batch norm -> ReLU unit with v output channels
                layers += [nn.Conv2d(in_channels=in_channels, out_channels=v, kernel_size=3, stride=1, padding=1),
                            nn.BatchNorm2d(v), 
                            nn.ReLU()]
                # the next conv layer takes this layer's output channels as input
                in_channels = v
        # unpack the list so every layer becomes a positional argument
        return nn.Sequential(*layers)

cfg자리에 self.vgg_cfg를 넣어주면 채널수에 맞춰 conv-bn-relu레이어가 쌓이고, M자리에 maxpooling레이어가 쌓입니다.

레이어 리스트를 unpack하여 nn.Sequential에 넣어준 후 return 해줍니다

다시 init으로 돌아가 self.fc_layers를 살펴보면

self.fc_layers

        # classifier head: FC -> ReLU -> Dropout -> FC -> ReLU -> Dropout -> FC
        self.fc_layers = nn.Sequential(
            nn.Linear(512*7*7, 4096),  # input: flattened 512-channel 7x7 feature map
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4096, num_classes)  # one output score per class
        )

마지막 conv 레이어의 output은 (adaptive average pooling을 거치면) 채널이 512개, h와 w가 각각 7이므로, 512\*7\*7(=25088)개의 값이 들어가 4096개가 나오는 fc 레이어 → relu → dropout → 다시 fc + relu + dropout, 그리고 마지막에 클래스 개수만큼을 output 채널로 갖는 fc 레이어를 추가해줍니다.

forward

    def forward(self, x):
        """Run the conv stack, pool to 7x7, flatten, and apply the FC layers.

        Returns the output of the last FC layer (no softmax is applied here).
        """
        output = self.conv_layers(x)
        # adaptive average pooling (output size 7) fixes h and w at 7x7
        output = self.adaptive_avgpooling(output)
        # flatten each sample to a vector of 512*7*7 values for the FC layers
        output = output.view(-1, 512*7*7)        
        output = self.fc_layers(output)
        return output