# train.py
#!/usr/bin/env python3
"""Train the network with PyTorch.
Junde Wu
"""
import os
import sys
import argparse
import time
from datetime import datetime
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from PIL import Image
from skimage import io
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import KFold
from torch.autograd import Variable
from torch.utils.data import DataLoader, DistributedSampler, ConcatDataset, random_split
from tqdm import tqdm
# from tensorboardX import SummaryWriter
# from models.discriminatorlayer import discriminator

import cfg
from conf import settings
from dataset import *
from utils import *
import function
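
# --- Missing in the original file: `args`, `rank`, and `device` are used below
# but never defined. A minimal sketch of the setup, assuming the script is
# launched via torchrun/torch.distributed.launch and that cfg.parse_args() is
# this repo's argument parser (both are assumptions, not confirmed by the file).
args = cfg.parse_args()
torch.distributed.init_process_group(backend='nccl')
rank = torch.distributed.get_rank()
torch.cuda.set_device(args.local_rank)
device = torch.device('cuda', args.local_rank)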
'''Image preprocessing transforms applied to images during training and testing'''
transform_train = transforms.Compose([
    transforms.Resize((args.image_size, args.image_size)),
    transforms.ToTensor(),
])
# Masks are resized to the network's output resolution rather than the input size.
transform_train_seg = transforms.Compose([
    transforms.Resize((args.out_size, args.out_size)),
    transforms.ToTensor(),
])
random_dataset = RandomDataset('DualModal2019/RGB/Training')  # NOTE: constructed but never used below
if args.net == 'sam':
    prompt = 'click'
elif args.net == 'sam_lite':
    prompt = 'noprompt'
elif args.net == 'sam_self':
    prompt = 'noprompt'
else:
    prompt = 'noprompt'  # fall back so `prompt` is always defined (e.g. for 'sam_self_with_prompt')
train_dataset = HRFRGB(args, data_path="HRFdatabaseslice/train", transform=transform_train, transform_msk=transform_train_seg)
test_dataset = HRFRGB(args, data_path="HRFdatabaseslice/test", transform=transform_train, transform_msk=transform_train_seg)
train_sampler = DistributedSampler(train_dataset)
test_sampler = DistributedSampler(test_dataset)
nice_train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.b, num_workers=4, pin_memory=True)
nice_test_loader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args.b, num_workers=4, pin_memory=True)
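# DistributedSampler handles the per-rank split of the data; per-epoch shuffling
# is driven by train_sampler.set_epoch(epoch) at the top of the training loop below.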
net = get_network(args, args.net, use_gpu=args.gpu, gpu_device=device, distribution=args.distributed)
# Move the model to the GPU and wrap it with DistributedDataParallel
num_gpus = torch.cuda.device_count()
# Freeze the whole image encoder except the Adapter layers
for n, value in net.image_encoder.named_parameters():
    # if "Adapter" not in n and "patch_embed" not in n:
    #     value.requires_grad = False
    if "Adapter" not in n:
        value.requires_grad = False
if num_gpus > 1:
    net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[args.local_rank])
# net = net.to(device=device)
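# Sanity check (an addition, not in the original script): with everything in the
# image encoder frozen except the Adapter layers, only a small fraction of the
# parameters should remain trainable.
n_trainable = sum(p.numel() for p in net.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in net.parameters())
print(f'trainable parameters: {n_trainable:,} / {n_total:,}')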
if args.pretrain:
    weights = torch.load(args.pretrain)  # load the pretrained weights
    # strict=False silently skips missing/mismatched keys; note that if net was
    # wrapped in DDP above, the checkpoint keys need a `module.` prefix (the
    # resume path below handles this explicitly).
    net.load_state_dict(weights, strict=False)
# Set up an Adam optimizer to update the model parameters, with a StepLR
# scheduler that scales the learning rate by 0.3 every 10 epochs.
"""
net.parameters(): the parameters to optimize, i.e. all learnable weights and biases.
lr=args.lr: the learning rate, which sets the size of each parameter update.
betas=(0.9, 0.999): Adam's exponential decay rates for the first- and second-moment estimates of the gradient.
eps=1e-08: a small constant added to the denominator for numerical stability, to avoid division by zero.
weight_decay=0: the L2 regularization coefficient, controlling model complexity.
amsgrad=False: whether to use the AMSGrad variant; usually False.
"""
optimizer = optim.Adam(net.parameters(), lr=args.lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.3)
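# Illustrative note (not in the original script): StepLR gives a piecewise-constant
# schedule, e.g. with lr=1e-4 it yields 1e-4 for epochs 1-10, 3e-5 for 11-20,
# 9e-6 for 21-30; scheduler.get_last_lr() reports the value after each step().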
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True,
#                                                  threshold=0.002, threshold_mode='rel', cooldown=0, min_lr=0,
#                                                  eps=1e-08)
start_epoch = 0
best_tol = 1e4  # default; replaced by the checkpoint value when resuming below
# Resume from a saved checkpoint
if args.weights != 0:
    print(f'=> resuming from {args.weights}')
    checkpoint_file = os.path.join(args.weights)
    print(checkpoint_file)
    assert os.path.exists(checkpoint_file)
    loc = 'cuda:{}'.format(args.gpu_device)
    checkpoint = torch.load(checkpoint_file, map_location=loc)
    start_epoch = checkpoint['epoch']
    best_tol = checkpoint['best_tol']
    state_dict = checkpoint['state_dict']
    if args.distributed != 'none':
        # checkpoints are saved from net.module, so add the `module.` prefix back
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            # name = k[7:]  # (inverse case: strip `module.` when loading a DDP checkpoint into a bare model)
            name = 'module.' + k
            new_state_dict[name] = v
    else:
        new_state_dict = state_dict
    net.load_state_dict(new_state_dict)
    print(f'=> loaded checkpoint {checkpoint_file} (epoch {start_epoch})')
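    # Not in the original file: the checkpoint written by save_checkpoint below
    # also stores the optimizer state, so restoring it keeps Adam's moment
    # estimates consistent across resumes.
    if 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])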
# Create the log directory (rank 0 only)
if rank == 0:
    args.path_helper = set_log_dir('logs', args.exp_name)
    logger = create_logger(args.path_helper['log_path'])
    logger.info(args)
'''tensorboard'''
if rank == 0:
    # create the run directory for tensorboard
    if not os.path.exists(settings.LOG_DIR):
        os.mkdir(settings.LOG_DIR)
    log_dir = os.path.join(settings.LOG_DIR, args.net, settings.TIME_NOW)
    os.makedirs(log_dir)  # makedirs: the intermediate args.net directory may not exist yet
    # writer_t_loss = SummaryWriter(log_dir=os.path.join(log_dir, 'train_loss'))
    # writer_t_iou = SummaryWriter(log_dir=os.path.join(log_dir, 'train_iou'))
    # writer_t_dice = SummaryWriter(log_dir=os.path.join(log_dir, 'train_dice'))
    # writer_v_loss = SummaryWriter(log_dir=os.path.join(log_dir, 'val_loss'))
    # writer_v_iou = SummaryWriter(log_dir=os.path.join(log_dir, 'val_iou'))
    # writer_v_dice = SummaryWriter(log_dir=os.path.join(log_dir, 'val_dice'))
# The writer is unused for now
writer = None
'''begin training'''
best_acc = 0.0
# Load the fake prompt from a local file
fake_prompt = np.load('fake_prompt.npz')
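# np.load on a .npz archive returns a lazy NpzFile mapping of array names to
# arrays; presumably the train/validation functions below index into it by key.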
for epoch in range(start_epoch + 1, start_epoch + settings.EPOCH + 1):
    if args.mod == 'sam_adpt':
        net.train()  # re-enter train mode (net.eval() is called after each validation below)
        train_sampler.set_epoch(epoch)  # reshuffle the distributed sampler each epoch
        time_start = time.time()
        if args.net == 'sam':
            avg_loss, (iou, dice) = function.train_sam(args, net, optimizer, nice_train_loader, epoch, writer, schedulers=scheduler, vis=args.vis, device=device)
        elif args.net in ('sam_lite', 'sam_self', 'sam_self_with_prompt'):
            avg_loss = function.train_mult_sam_lite(args, net, optimizer, nice_train_loader, epoch, writer, schedulers=scheduler, vis=args.vis, device=device, fake_prompt=fake_prompt)
        if rank == 0:
            logger.info(f'average training loss: {avg_loss} || @ epoch {epoch}.')
        scheduler.step()
        # scheduler.step(avg_loss)
        # if rank == 0:
        #     writer_t_loss.add_scalar('loss', avg_loss, epoch)
        #     writer_t_iou.add_scalar('loss', iou, epoch)
        #     writer_t_dice.add_scalar('loss', dice, epoch)
        time_end = time.time()
    net.eval()
    if (epoch and epoch % args.val_freq == 0) or epoch == start_epoch + settings.EPOCH:
        if args.net == 'sam':
            tol, (eiou, edice) = function.validation_sam(args, nice_test_loader, epoch, net, writer)
            if rank == 0:
                logger.info(f'average test loss: {tol}, average IOU: {eiou}, average DICE: {edice} || @ epoch {epoch}.')
        elif args.net in ('sam_lite', 'sam_self', 'sam_self_with_prompt'):
            tol, (eiou1, edice1), (eiou2, edice2), (eiou3, edice3) = function.validation_mult_sam_lite(args, nice_test_loader, epoch, net, writer, fake_prompt=fake_prompt)
            if rank == 0:
                logger.info(f'average test loss: {tol}, average IOU: {eiou1}, {eiou2}, {eiou3}, average DICE: {edice1}, {edice2}, {edice3} || @ epoch {epoch}.')
        # if rank == 0:
        #     writer_v_loss.add_scalar('loss', tol, epoch)
        #     writer_v_iou.add_scalar('loss', eiou, epoch)
        #     writer_v_dice.add_scalar('loss', edice, epoch)
        if args.distributed != 'none':
            sd = net.module.state_dict()  # unwrap DDP so the saved keys have no `module.` prefix
        else:
            sd = net.state_dict()
        if rank == 0 and tol < best_tol:
            best_tol = tol
            is_best = True
            print('saving the best checkpoint')
            save_checkpoint({
                'epoch': epoch + 1,
                'model': args.net,
                'state_dict': sd,
                'optimizer': optimizer.state_dict(),
                'best_tol': best_tol,
                'path_helper': args.path_helper,
            }, is_best, args.path_helper['ckpt_path'], filename="best_checkpoint")
        else:
            is_best = False
# writer.close()
# if rank == 0:
#     writer_t_loss.close()
#     writer_t_iou.close()
#     writer_t_dice.close()
#     writer_v_loss.close()
#     writer_v_iou.close()
#     writer_v_dice.close()
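
# Not in the original file: tear down the process group started above so that a
# torchrun/distributed launch exits cleanly on every rank.
torch.distributed.destroy_process_group()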