前言
之前发了一个shufflenet的鸟群分类算法,以finetune的方法来做,效果只有56%,太差了,所以我换了一个模型,又加入了一些微调手段来提升效果,现在把代码分享出来,供各位看官指正,希望能让我的分类结果进一步提升
从这里可以看出,我使用的vision transformer模型是最小的,因为能用的只有一个2080
分享的代码里模型部分是参考了github的。
工程结构如图所示
一、config.py
import argparse


def str2bool(v):
    """Parse a textual boolean from the command line.

    argparse's ``type=bool`` is a well-known trap: ``bool("False")`` is True
    because any non-empty string is truthy, so ``--resume False`` would
    silently keep resuming enabled.  This parser accepts the usual textual
    spellings of true/false instead.
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')


# Training settings.  `metavar` only affects how the placeholder is rendered
# in the generated --help text.
parser = argparse.ArgumentParser(description='PyTorch Example for all')

# ---------------------------------------------------------------------------
# train part
# ---------------------------------------------------------------------------
parser.add_argument('--train-batch-size', type=int, default=32, metavar='N',
                    help='input batch size for training (default: 32)'
                         '一般对于GPU选择2的次方的batch可以更好的发挥性能。所以一般选择32、64、256。')
parser.add_argument('--test-batch-size', type=int, default=64, metavar='N',
                    help='input batch size for testing (default: 64)')
parser.add_argument('--freeze_epoch', type=int, default=0,
                    help="前10个epoch,冻结modeling的backbone")
parser.add_argument('--epochs', type=int, default=200, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                    help='learning rate (default: 0.001)')
parser.add_argument('--is_save', type=str2bool, default=False,
                    help="是否保存")
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--label_smooth', type=float, default=0.01,
                    help='使用标签平滑,一定程度的减小过拟合')
parser.add_argument('--seed', type=int, default=123, metavar='S',
                    help='random seed 设置种子的用意是一旦固定种子,后面依次生成的随机数其实都是固定的,有利于实验结果的产生与比较')
parser.add_argument('--use_cuda', type=str2bool, default=True,
                    help='whether to use cuda to accerlate')
parser.add_argument('--base_data_path', type=str, default='E:/Datasets2/',
                    help="total base data path for training")
parser.add_argument('--resume', type=str2bool, default=True, metavar='R',
                    help="whether to use the pretrained model to start the train")
parser.add_argument('--saved_model', type=str, default="E:/完成工作/trained_model/",
                    help="the path to store the weight")
parser.add_argument('--val_num', type=float, default=0.3,
                    help="perecentage of validate data")
parser.add_argument('--save', type=str2bool, default=True,
                    help="whether to save the model weight")
parser.add_argument('--project_name', type=str, default='transformer based 分类算法实验',
                    help="该项目的工程名称")
parser.add_argument('--use_aug', type=str2bool, default=True,
                    help='使用数据增广,增加数据多样性,目前仅限于水平竖向的翻转')
parser.add_argument('--image_size', type=int, default=224, choices=[224, 384],
                    help='图片大小')
parser.add_argument('--patch_size', type=int, default=16,
                    help="图片分割块的数量")
parser.add_argument('--num_class', type=int, default=200,
                    help="分类数量")
parser.add_argument('--emb_dim', type=int, default=768,
                    help="位置嵌入维度")
parser.add_argument('--mlp_dim', type=int, default=3072)
parser.add_argument('--num_heads', type=int, default=12)
parser.add_argument('--num_layers', type=int, default=12)
parser.add_argument('--attn_dropout_rate', type=float, default=0.0)
parser.add_argument('--dropout_rate', type=float, default=0.1)
parser.add_argument('--weight_decay', type=float, default=1e-4)

# ---------------------------------------------------------------------------
# inference part
# ---------------------------------------------------------------------------
parser.add_argument('--pretrained_weight', type=str, default="E:/完成工作/trained_model/",
                    help="the path to load the pytorch weight(.pth)")
二、datalist.py
仔细阅读代码,可以看到,我在这里加入了mixup的数据增强手段
from random import shuffle
import torch
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from pytorch_vision_transformer_classify.config import parser
args = parser.parse_args()
'''
1. 对图片进行按比例缩放
2. 对图片进行随机位置的截取
3. 对图片进行随机的水平和竖直翻转
4. 对图片进行随机角度的旋转
5. 对图片进行亮度、对比度和颜色的随机变化
数据增强最新的方式
1、Mixup
2、Cutout
3、Cutmix
4、Mosaic
'''
# 自己写Dataset至少需要有这样的格式
class Dataset(Dataset):
    """CUB-200 image/label dataset.

    Each annotation line has the form ``"<image_path>*<label>"`` with a
    1-based label.  ``__getitem__`` returns a half-precision image tensor
    (to match the fp16 model used in training) and a one-hot label vector.
    """

    def __init__(self, lines, type):
        super(Dataset, self).__init__()
        self.base_path = args.base_data_path
        self.annotation_lines = lines
        self.type = type
        self.train_batches = len(self.annotation_lines)
        # Build the transform pipeline once instead of on every __getitem__
        # call (the transforms are stateless callables, so this is safe).
        if self.type == 'train':
            self.transform = transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(30),
                transforms.RandomPerspective(),
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
            ])
        else:
            self.transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
            ])

    def __len__(self):
        return self.train_batches

    def __getitem__(self, index):
        index = index % len(self.annotation_lines)
        img, y = self.collect_image_label(self.annotation_lines[index])
        # Annotation labels are 1-based; shift to 0-based before one-hot.
        temp_y = self.onehot_encode(int(y) - 1)
        temp_img = self.transform(img)
        # .half() keeps the inputs in fp16 to match the fp16 model.
        return temp_img.half(), temp_y

    def collect_image_label(self, line):
        """Split an annotation line "<path>*<label>" and load the RGB image."""
        line = line.split('*')
        image_path = line[0]
        label = line[1]
        image = Image.open(image_path).convert("RGB")
        return image, label

    def rand(self, a=0, b=1):
        """Uniform random float in [a, b)."""
        return np.random.rand() * (b - a) + a

    def img_augment(self, image):
        """Randomly flip the image horizontally and/or vertically.

        NOTE(review): each branch applies a Random*Flip transform that itself
        flips with p=0.5, so the effective flip probability is 0.25 per axis.
        """
        h_flip = self.rand() < 0.5
        v_flip = self.rand() < 0.5
        if h_flip:
            image = transforms.RandomHorizontalFlip()(image)
        if v_flip:
            image = transforms.RandomVerticalFlip()(image)
        return image

    def onehot_encode(self, label, n_class=200):
        """Return a one-hot float vector of length n_class for `label`."""
        diag = torch.eye(n_class)
        oh_vector = diag[label].view(n_class)
        return oh_vector
# # DataLoader中collate_fn使用
# def dataset_collate(batch):
# images = []
# bboxes = []
# for img, box in batch:
# images.append(img)
# bboxes.append(box)
# images = np.array(images)
# bboxes = np.array(bboxes)
# return images, bboxes
class MixupDataset(Dataset):
    """Wrap a dataset whose labels are one-hot tensors and apply mixup.

    With probability 0.01 a sample is blended with a second, randomly chosen
    sample: ``mix * img_a + (1 - mix) * img_b`` with ``mix ~ Beta(0.2, 0.2)``
    folded into [0.5, 1] so the indexed sample dominates; labels are blended
    with the same coefficient.  The wrapped dataset must already return
    one-hot label tensors (as the Dataset class above does).
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.beta_dist = torch.distributions.beta.Beta(0.2, 0.2)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        image_a, label_a = self.get_oneitem(index)
        if self.rand() < 0.01:
            idx_b = np.random.randint(len(self))
            image_b, label_b = self.get_oneitem(idx_b)
            # BUG FIX: labels arriving here are already one-hot tensors.  The
            # original compared them with `==` (element-wise tensor, ambiguous
            # truth value) and re-one-hot-encoded them (float tensor index
            # into torch.eye raises).  Compare with torch.equal and mix the
            # one-hot vectors directly instead.
            if torch.equal(label_a, label_b):
                return image_a, label_a
            mix_rate = self.beta_dist.sample()
            # Fold the Beta sample into [0.5, 1] so sample A dominates.
            if mix_rate < 0.5:
                mix_rate = 1. - mix_rate
            image = mix_rate * image_a + (1. - mix_rate) * image_b
            oh_label = mix_rate * label_a + (1. - mix_rate) * label_b
            return image, oh_label
        return image_a, label_a

    def get_oneitem(self, idx):
        """Fetch (image, one-hot label) from the wrapped dataset."""
        image = self.dataset[idx][0]
        label = self.dataset[idx][1]
        return image, label

    def onehot_encode(self, label, n_class=200):
        """Kept for backward compatibility; labels are already one-hot here."""
        diag = torch.eye(n_class)
        oh_vector = diag[label].view(n_class)
        return oh_vector

    def rand(self, a=0, b=1):
        """Uniform random float in [a, b)."""
        return np.random.rand() * (b - a) + a
if __name__ == "__main__":
    # Smoke test only.  BUG FIX: the original called Dataset() without its
    # required `lines`/`type` arguments, which raises a TypeError.
    Dataset(lines=[], type='train')
1.引入库
代码如下(示例):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
三.model.py
这部分代码源自visiontransformer的github,我作为刚入门的新手,这种级别的代码还写不出来
import torch
import torch.nn as nn
import torch.nn.functional as F
class PositionEmbs(nn.Module):
    """Add a learned position embedding (num_patches + 1 tokens, including
    the class token), optionally followed by dropout."""

    def __init__(self, num_patches, emb_dim, dropout_rate=0.1):
        super(PositionEmbs, self).__init__()
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, emb_dim))
        # Only instantiate dropout when the rate is positive.
        self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None

    def forward(self, x):
        shifted = x + self.pos_embedding
        return self.dropout(shifted) if self.dropout else shifted
class MlpBlock(nn.Module):
    """Transformer feed-forward block: Linear -> GELU -> dropout -> Linear -> dropout."""

    def __init__(self, in_dim, mlp_dim, out_dim, dropout_rate=0.1):
        super(MlpBlock, self).__init__()
        self.fc1 = nn.Linear(in_dim, mlp_dim)
        self.fc2 = nn.Linear(mlp_dim, out_dim)
        # GELU activation, as in the original ViT.
        self.act = nn.GELU()
        if dropout_rate > 0.0:
            self.dropout1 = nn.Dropout(dropout_rate)
            self.dropout2 = nn.Dropout(dropout_rate)
        else:
            self.dropout1 = None
            self.dropout2 = None

    def forward(self, x):
        out = self.fc1(x)
        out = self.act(out)
        if self.dropout1:
            out = self.dropout1(out)
        out = self.fc2(out)
        # BUG FIX: the original called self.dropout2(out) unconditionally,
        # raising "TypeError: 'NoneType' object is not callable" whenever
        # dropout_rate == 0 (dropout2 is None in that case).
        if self.dropout2:
            out = self.dropout2(out)
        return out
class LinearGeneral(nn.Module):
    """Generalized linear layer: tensordot against an arbitrarily shaped
    weight, used to project between (emb,) and (heads, head_dim) layouts."""

    def __init__(self, in_dim=(768,), feat_dim=(12, 64)):
        super(LinearGeneral, self).__init__()
        self.weight = nn.Parameter(torch.randn(*in_dim, *feat_dim))
        self.bias = nn.Parameter(torch.zeros(*feat_dim))

    def forward(self, x, dims):
        projected = torch.tensordot(x, self.weight, dims=dims)
        return projected + self.bias
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention using LinearGeneral
    projections for Q/K/V and the output."""

    def __init__(self, in_dim, heads=8, dropout_rate=0.1):
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.head_dim = in_dim // heads
        # Attention scores are divided by sqrt(head_dim).
        self.scale = self.head_dim ** 0.5
        self.query = LinearGeneral((in_dim,), (self.heads, self.head_dim))
        self.key = LinearGeneral((in_dim,), (self.heads, self.head_dim))
        self.value = LinearGeneral((in_dim,), (self.heads, self.head_dim))
        self.out = LinearGeneral((self.heads, self.head_dim), (in_dim,))
        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate)
        else:
            self.dropout = None

    def forward(self, x):
        b, n, _ = x.shape
        # Project to (b, n, heads, head_dim) then move heads forward.
        q = self.query(x, dims=([2], [0]))
        k = self.key(x, dims=([2], [0]))
        v = self.value(x, dims=([2], [0]))
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        attn_weights = F.softmax(attn_weights, dim=-1)
        # BUG FIX: the attention dropout was constructed in __init__ but
        # never applied, so attn_dropout_rate had no effect.  Apply it to
        # the attention weights, as in the reference ViT implementation.
        if self.dropout is not None:
            attn_weights = self.dropout(attn_weights)
        out = torch.matmul(attn_weights, v)
        out = out.permute(0, 2, 1, 3)
        out = self.out(out, dims=([2, 3], [0, 1]))
        return out
class EncoderBlock(nn.Module):
    """Pre-norm transformer encoder block:
    LN -> self-attention -> dropout -> residual, then LN -> MLP -> residual."""

    def __init__(self, in_dim, mlp_dim, num_heads, dropout_rate=0.1, attn_dropout_rate=0.1):
        super(EncoderBlock, self).__init__()
        self.norm1 = nn.LayerNorm(in_dim)
        self.attn = SelfAttention(in_dim, heads=num_heads, dropout_rate=attn_dropout_rate)
        self.dropout = nn.Dropout(dropout_rate) if dropout_rate > 0 else None
        self.norm2 = nn.LayerNorm(in_dim)
        self.mlp = MlpBlock(in_dim, mlp_dim, in_dim, dropout_rate)

    def forward(self, x):
        attn_out = self.attn(self.norm1(x))
        if self.dropout:
            attn_out = self.dropout(attn_out)
        hidden = x + attn_out
        return hidden + self.mlp(self.norm2(hidden))
class Encoder(nn.Module):
    """Stack of EncoderBlocks preceded by a learned position embedding and
    followed by a final LayerNorm."""

    def __init__(self, num_patches, emb_dim, mlp_dim, num_layers=12, num_heads=12, dropout_rate=0.1,
                 attn_dropout_rate=0.0):
        super(Encoder, self).__init__()
        # Positional embedding over num_patches + 1 tokens (incl. class token).
        self.pos_embedding = PositionEmbs(num_patches, emb_dim, dropout_rate)
        # Identical encoder blocks; ModuleList keeps state_dict keys
        # (encoder_layers.<i>.*) compatible with the original layout.
        self.encoder_layers = nn.ModuleList([
            EncoderBlock(emb_dim, mlp_dim, num_heads, dropout_rate, attn_dropout_rate)
            for _ in range(num_layers)
        ])
        self.norm = nn.LayerNorm(emb_dim)

    def forward(self, x):
        hidden = self.pos_embedding(x)
        for block in self.encoder_layers:
            hidden = block(hidden)
        return self.norm(hidden)
class VisionTransformer(nn.Module):
    """Vision Transformer: patchify with a strided conv, prepend a class
    token, run the transformer encoder, classify from the class token."""

    def __init__(self,
                 image_size=(256, 256),
                 patch_size=(16, 16),
                 emb_dim=768,
                 mlp_dim=3072,
                 num_heads=12,
                 num_layers=12,
                 num_classes=1000,
                 attn_dropout_rate=0.0,
                 dropout_rate=0.1,
                 feat_dim=None):  # unused; kept for interface compatibility
        super(VisionTransformer, self).__init__()
        img_h, img_w = image_size
        patch_h, patch_w = patch_size
        # Number of non-overlapping patches along each axis.
        num_patches = (img_h // patch_h) * (img_w // patch_w)
        # A conv whose kernel and stride equal the patch size is exactly the
        # linear patch-embedding of ViT.
        self.embedding = nn.Conv2d(3, emb_dim, kernel_size=(patch_h, patch_w), stride=(patch_h, patch_w))
        # Learnable class token, prepended to the patch sequence.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, emb_dim))
        self.transformer = Encoder(
            num_patches=num_patches,
            emb_dim=emb_dim,
            mlp_dim=mlp_dim,
            num_layers=num_layers,
            num_heads=num_heads,
            dropout_rate=dropout_rate,
            attn_dropout_rate=attn_dropout_rate)
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, x):
        patches = self.embedding(x)            # (n, emb, gh, gw)
        patches = patches.permute(0, 2, 3, 1)  # (n, gh, gw, emb)
        b, gh, gw, c = patches.shape
        tokens = patches.reshape(b, gh * gw, c)
        # Prepend one class token per batch element.
        cls = self.cls_token.repeat(b, 1, 1)
        tokens = torch.cat([cls, tokens], dim=1)
        encoded = self.transformer(tokens)
        # Classify from the class-token output only.
        return self.classifier(encoded[:, 0])
if __name__ == '__main__':
    # Smoke test: run a tiny 2-layer model on random input and dump the
    # parameter shapes.
    net = VisionTransformer(num_layers=2)
    dummy = torch.randn((2, 3, 256, 256))
    _ = net(dummy)
    for key, value in net.state_dict().items():
        print("{}: {}".format(key, value.shape))
四、utils.py
在这里可以看到,我自己写了一个多分类的交叉熵的loss function。
我发现这种脚本自己写太重要了,因为这样做就可以配合我之前的mixup手段对数据的增强,同时我又加入了label smooth防止过拟合的手段
class bcolors:
    """ANSI escape codes for colored/styled terminal output."""
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # reset all attributes back to the terminal default
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
def Color_print(line):
    """Print *line* to stdout in green."""
    print(f"{bcolors.OKGREEN}{line}{bcolors.ENDC}")
def accuracy(output, target, topk=(1,)):
    """Compute top-k precision (as percentages) for each k in *topk*.

    output: (batch, num_classes) class scores.
    target: (batch,) integer class indices.
    Returns a list of scalar tensors, one per requested k.
    """
    maxk = max(topk)
    batch_size = target.size(0)
    # Indices of the maxk highest-scoring classes, transposed to (maxk, batch).
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    hits = pred.eq(target.view(1, -1).expand_as(pred))
    results = []
    for k in topk:
        # contiguous() before view(): under multi-GPU training the sliced
        # tensor may not be contiguous in memory, and view() would raise.
        top_k_hits = hits[:k].contiguous().view(-1).float().sum(0)
        results.append(top_k_hits / batch_size * 100.0)
    return results
import torch
import torch.nn as nn
from pytorch_vision_transformer_classify.config import parser
'''
自定义的交叉熵(多分类),附带label_smooth
'''
class myLoss(nn.Module):
    """Soft-target cross-entropy with label smoothing.

    Accepts soft/one-hot targets (as produced by the mixup pipeline), so it
    can replace nn.CrossEntropyLoss, which expects hard class indices.
    """

    def __init__(self):
        super(myLoss, self).__init__()
        self.args = parser.parse_args()
        self.smooth = self.args.label_smooth

    def forward(self, x, y):
        # x: (batch, num_classes) raw logits; y: (batch, num_classes) soft targets.
        log_probs = torch.log_softmax(x, dim=1)
        # Standard label smoothing distributes `smooth` mass uniformly over
        # the K classes: y * (1 - eps) + eps / K.
        # BUG FIX: the original divided by x.size(0) (the batch size) instead
        # of x.size(1) (the number of classes).
        y = y * (1 - self.smooth) + self.smooth / x.size(1)
        # Mean over the batch so the loss scale is independent of batch size
        # (the author's own commented-out version divided by x.size(0)).
        loss = -torch.sum(y.mul(log_probs)) / x.size(0)
        return loss
五、train.py
这里是我训练模型的主要部分,我在网上查,通过增加batch-size可以提高模型的训练速度,同时我使用float16降低了模型精度,网上说这样也能提高模型性能,所以我试了一试
import os
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from pytorch_network.vision_transformer import VisionTransformer
from pytorch_vision_transformer_classify.config import parser
from pytorch_vision_transformer_classify.datalist import Dataset, MixupDataset
from pytorch_vision_transformer_classify.utils import Color_print
from pytorch_vision_transformer_classify.utils import myLoss
'''
细度分类
'''
# Best validation accuracy seen so far; read/written by train.test() via `global`.
best_acc = 0
class train(object):
    """Training driver for the ViT bird classifier.

    Construction does everything: parses args, builds the data loaders, model,
    loss/optimizer/scheduler, optionally restores pretrained weights, then
    runs the full train/test loop for args.epochs epochs.
    """
    def __init__(self):
        self.args = parser.parse_args()
        print(f"-----------{self.args.project_name}-----------")
        use_cuda = self.args.use_cuda and torch.cuda.is_available()
        if use_cuda:
            torch.cuda.manual_seed(self.args.seed)  # seed the current GPU's RNG
        else:
            torch.manual_seed(self.args.seed)  # seed the CPU RNG so runs are reproducible
        self.device = torch.device("cuda" if use_cuda else "cpu")
        kwargs = {'num_workers': 0, 'pin_memory': True} if use_cuda else {}  # num_workers > 0 can interfere with debugging
        '''
        构造DataLoader
        '''
        print("Create Dataloader")
        # ToDo
        self.images_path = os.path.join(self.args.base_data_path,
                                        "Caltech-UCSD Birds-200-2011/data/CUB_200_2011/images.txt")
        self.labels_path = os.path.join(self.args.base_data_path,
                                        "Caltech-UCSD Birds-200-2011/data/CUB_200_2011/image_class_labels.txt")
        self.annotation_lines = self.get_image_label()
        np.random.seed(10101)  # fixed seed so the train/val split is reproducible
        np.random.shuffle(self.annotation_lines)
        np.random.seed(None)
        self.num_val = int(len(self.annotation_lines) * self.args.val_num)
        self.num_train = len(self.annotation_lines) - self.num_val
        # NOTE(review): shuffle=False means training batches arrive in the
        # same order every epoch; shuffle=True is the usual choice here.
        self.train_loader = DataLoader(
            MixupDataset(Dataset(self.annotation_lines[:self.num_train], type='train')),
            batch_size=self.args.train_batch_size, shuffle=False, **kwargs)
        # NOTE(review): `num_train + 1` skips one sample entirely — index
        # num_train lands in neither split; `[self.num_train:]` was likely intended.
        self.test_loader = DataLoader(
            Dataset(self.annotation_lines[self.num_train + 1:], type='test'),
            batch_size=self.args.test_batch_size, shuffle=False, **kwargs)
        '''
        定义选择模型
        '''
        print('Create Model')
        # NOTE(review): .half() keeps all weights in fp16; small SGD updates
        # can underflow — torch.cuda.amp mixed precision is the safer option.
        self.model = VisionTransformer(
            image_size=(self.args.image_size, self.args.image_size),
            patch_size=(self.args.patch_size, self.args.patch_size),
            emb_dim=self.args.emb_dim,
            mlp_dim=self.args.mlp_dim,
            num_heads=self.args.num_heads,
            num_layers=self.args.num_layers,
            num_classes=self.args.num_class,
            attn_dropout_rate=self.args.attn_dropout_rate,
            dropout_rate=self.args.dropout_rate
        ).to(self.device).half()
        '''
        根据需要加载与训练的模型权重参数
        '''
        # Restore pretrained ViT weights, keeping only tensors whose shape
        # matches the current model (the 200-class classifier head differs).
        if self.args.resume and self.args.pretrained_weight:
            # if True:
            # try:
            model_dict = self.model.state_dict()
            # checkpoint = torch.load(self.args.pretrained_weight,map_location=self.device)
            # NOTE(review): the checkpoint path is hard-coded here; the
            # commented line above (args.pretrained_weight) looks intended.
            checkpoint = torch.load("E:/PretrainedModel/imagenet21k+imagenet2012_ViT-B_16.pth",
                                    map_location=self.device)
            pretrained_dict = checkpoint['state_dict']
            pretrained_dict = {k: v for k, v in pretrained_dict.items() if np.shape(model_dict[k]) == np.shape(v)}
            model_dict.update(pretrained_dict)
            self.model.load_state_dict(model_dict, strict=True)
            print("Restoring the weight from pretrained-weight file \nFinished to load the weight")
            # except:
            # print("can not load weight \ntrain the model from scratch")
            # self.model.apply(self.weights_init)
        '''
        cuda 加速
        '''
        if use_cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=range(torch.cuda.device_count()))  # parallel use GPU
            cudnn.benchmark = True  # speed up slightly
        '''
        构造loss目标函数
        选择优化器
        学习率变化选择
        '''
        print("Establish the loss, optimizer and learning_rate function")
        # self.criterion = nn.CrossEntropyLoss()
        # self.criterion=nn.LogSoftmax()
        # Custom soft-target cross-entropy: targets are one-hot/mixup vectors.
        self.criterion = myLoss().to(self.device)
        self.optimizer = optim.SGD(params=self.model.parameters(),
                                   lr=self.args.lr,
                                   weight_decay=self.args.weight_decay,  # L2 regularization against overfitting
                                   momentum=self.args.momentum)
        # self.optimizer = optim.Adam(params=self.model.parameters(),
        # lr=self.args.lr,
        # betas=(0.9, 0.999),
        # eps=1e-8,
        # weight_decay=self.args.weight_decay)
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=5, eta_min=1e-5)
        # self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, 2e-3, epochs=self.args.epochs,
        # steps_per_epoch=len(self.train_loader))
        '''
        模型开始训练
        '''
        print("Start training")
        for epoch in range(1, self.args.epochs + 1):
            self.train(epoch)
            self.test(epoch)
            # Release cached GPU memory between epochs.
            torch.cuda.empty_cache()
        Color_print("finish model training")
    '''
    train部分
    '''
    def train(self, epoch):
        # For the first `freeze_epoch` epochs freeze the backbone and train
        # only the classifier head.
        if epoch < self.args.freeze_epoch + 1:
            for name, param in self.model.named_parameters():
                if 'classifier' in name:
                    param.requires_grad = True
                else:
                    param.requires_grad = False
        # Afterwards, train the whole network.
        else:
            for param in self.model.parameters():
                param.requires_grad = True
        self.model.train()
        average_loss = []
        pbar = tqdm(self.train_loader, desc=f'Train Epoch {epoch}/{self.args.epochs}')
        for data, target in pbar:
            data, target = data.to(self.device), target.to(self.device)
            self.optimizer.zero_grad()  # clear gradients accumulated by the previous step
            output = self.model(data)
            loss = self.criterion(output, target)
            loss.backward()
            average_loss.append(loss.item())
            self.optimizer.step()
            pbar.set_description(f'Train Epoch: {epoch}/{self.args.epochs} loss: {np.mean(average_loss)}')
        self.scheduler.step()
    '''
    test部分
    '''
    def test(self, epoch):
        global best_acc
        self.model.eval()
        test_loss = 0
        # NOTE(review): .cuda() assumes a GPU is present; this raises on
        # CPU-only machines even though self.device handles that elsewhere.
        correct = torch.zeros(1).squeeze().cuda()
        total = torch.zeros(1).squeeze().cuda()
        average_loss = []
        pbar = tqdm(self.test_loader, desc=f'Test Epoch{epoch}/{self.args.epochs}', mininterval=0.3)
        with torch.no_grad():
            for data, target in pbar:
                data, target = data.to(self.device), target.to(self.device)
                output = self.model(data)
                test_loss = self.criterion(output, target).item()  # sum up batch loss
                average_loss.append(test_loss)
                # Targets are one-hot vectors, so take the argmax of both
                # predictions and targets before comparing.
                pred = torch.max(output, 1)[1]
                target = torch.max(target, 1)[1]
                correct += (pred == target).sum()
                total += len(target)
                pbar.set_description(
                    f'Test Epoch: {epoch}/{self.args.epochs} ')
        predict_acc = correct / total
        # Save a checkpoint whenever validation accuracy improves.
        if self.args.save and predict_acc > best_acc:
            best_acc = predict_acc
            self.save_model(epoch, average_loss, predict_acc, correct, total)
    def get_image_label(self):
        """Read CUB's images.txt / image_class_labels.txt and return lines of
        the form "<absolute image path>*<label>"."""
        images = []
        labels = []
        with open(self.images_path) as f:
            for line in f.readlines():
                images.append(line.split()[-1])
        with open(self.labels_path) as f:
            for line in f.readlines():
                labels.append(line.split()[-1])
        lines = []
        for image, label in zip(images, labels):
            # ToDo
            # NOTE(review): hard-coded absolute path duplicates args.base_data_path.
            lines.append(
                "E:/Datasets2/Caltech-UCSD Birds-200-2011/data/CUB_200_2011/images/" + str(image) + '*' + str(label))
        return lines
    '''
    模型权重初始化
    '''
    def weights_init(self, m):
        # Xavier initialization for fully-connected layers.
        if isinstance(m, nn.Linear):
            nn.init.xavier_normal_(m.weight)
            nn.init.constant_(m.bias, 0)
        # Kaiming initialization for conv layers (suited to ReLU-family activations).
        elif isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        # BatchNorm: unit scale, zero shift.
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
    '''
    保存模型
    '''
    def save_model(self, epoch, average_loss, predict_acc, correct, total):
        # NOTE(review): the directory is only created when is_save is True,
        # but torch.save below runs unconditionally — with is_save=False and
        # a missing directory this raises FileNotFoundError.
        if not os.path.isdir(self.args.saved_model + self.args.project_name) and self.args.is_save:
            os.mkdir(self.args.saved_model + self.args.project_name)
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': round(np.mean(average_loss), 2)
        },
            self.args.saved_model + self.args.project_name + f'/Epoch-{epoch}-Test_loss-{round(np.mean(average_loss), 4)}.pth')
        percentage = round(predict_acc.item(), 4) * 100
        print(
            f"\n预测准确率:{percentage}% "
            f"预测数量:{correct}/{total},"
            f"保存路径:{self.args.saved_model + self.args.project_name}/Epoch-{epoch}-Test_loss-{round(np.mean(average_loss), 4)}.pth'")
if __name__ == "__main__":
    # Instantiating `train` runs the entire training pipeline (see train.__init__).
    train()
六、总结
深度学习是一件非常有意思的事情,但是run 网上的minst cifar 着实只是给你看个热闹,真正的学习算法是找有难度的数据集,自己纯手写代码,复现效果后,不断修改整理,借鉴别人代码的优点,武装自己的算法。
我在这个算法上研究了3个礼拜,发现数据增强对模型性能的提升有显著的效果(但是也不是很多),尤其是使用了mixup增强手段之后,同时由于使用了mixup,需要对标签进行修改,所以我放弃了nn.crossentropy()这样简单的手段,自己根据公式自己写,增加了自己对公式的理解和记忆。
之前在不使用这些手段时,模型能做到75%,这与mobilenet和shufflenet相比差不多提高了10-20个百分点,之后又用了各种可行的手段,模型效果提高到79%。
由于我目前使用的是vit里最小的一个模型,可能用更大的模型可以做到83%-85%。目前的研究就只有这样。我看别人的csdn上介绍差不多能做到90%,我还要继续研究前进。