From 291535f6f6e111884eeacfd642c52e04a1caff52 Mon Sep 17 00:00:00 2001
From: zhaoyuzhen
Date: Thu, 8 Dec 2022 13:48:55 +0800
Subject: [PATCH] update code

Changes to be committed:
        modified:   application_example/mobilevit/readme.md
        new file:   application_example/mobilevit/src/distribute.sh
        modified:   application_example/mobilevit/src/train.py
---
 application_example/mobilevit/readme.md     |  62 ++++++-----
 .../mobilevit/src/distribute.sh             |  34 ++++++
 application_example/mobilevit/src/train.py  | 113 +++++++++---------
 3 files changed, 118 insertions(+), 91 deletions(-)
 create mode 100644 application_example/mobilevit/src/distribute.sh

diff --git a/application_example/mobilevit/readme.md b/application_example/mobilevit/readme.md
index d7c3567..a6887a9 100644
--- a/application_example/mobilevit/readme.md
+++ b/application_example/mobilevit/readme.md
@@ -6,9 +6,10 @@ Self-attention models, especially vision transformers, are an alternative to Con
 
 Model trained by MindSpore:
 
-| Model         | Parameters | Top-1  | Top-5  | ckpt                                                                           |
-|---------------|------------|--------|--------|--------------------------------------------------------------------------------|
-| MobileViT-XXS | 1.3 M      | 62.184 | 84.292 | [ckpt](https://download.mindspore.cn/vision/cyclegan/apple/mobilevit_xxs.ckpt) |
+| Model         | Parameters | Top-1  | Top-5  | ckpt                                                                       |
+|---------------|------------|--------|--------|----------------------------------------------------------------------------|
+| MobileViT-XXS | 1.3 M      | 66.601 | 87.395 | [ckpt](https://download.mindspore.cn/vision/cyclegan/apple/xx_small.ckpt) |
+| MobileViT-XS  | 2.3 M      | 71.853 | 90.631 | [ckpt](https://download.mindspore.cn/vision/cyclegan/apple/x_small.ckpt)  |
 
 Model trained by PyTorch:
 
@@ -20,24 +21,19 @@ Model trained by PyTorch:
 
 ## Training Parameter description
 
-| Parameter            | Default          | Description                              |
-|:---------------------|:-----------------|:-----------------------------------------|
-| device_target        | GPU              | Device type                              |
-| data_url             | None             | Path of data file                        |
-| num_parallel_workers | 8                | Number of parallel workers               |
-| batch_size           | 64               | Number of batch size                     |
-| num_classes          | 1001             | Number of classification                 |
-| momentum             | 0.9              | Momentum for the moving average          |
-| epoch_size           | 180              | Number of epochs                         |
-| keep_checkpoint_max  | 40               | Max number of checkpoint files           |
-| ckpt_save_dir        | ./mobilevit.ckpt | Location of training outputs             |
-| run_distribute       | True             | Distributed parallel training            |
-| model_type           | xx_small         | Type of model to train                   |
-| decay_epoc           | 150              | Number of decay epochs                   |
-| max_lr               | 0.1              | Number of the maximum learning rate      |
-| min_lr               | 1e-5             | Number of the minimum learning rate      |
-| resize               | 256              | Resize the height and weight of picture  |
-| weight_decay         | 4e-5             | Momentum for the moving average          |
+| Parameter            | Default                         | Description                     |
+|:---------------------|:--------------------------------|:--------------------------------|
+| device_target        | Ascend                          | Device type                     |
+| data_url             | /home/ma-user/work/imagenet2012 | Path of data file               |
+| num_parallel_workers | 8                               | Number of parallel workers      |
+| batch_size           | 64                              | Batch size                      |
+| num_classes          | 1000                            | Number of classes               |
+| momentum             | 0.9                             | Momentum for the moving average |
+| epoch_size           | 300                             | Number of epochs                |
+| keep_checkpoint_max  | 50                              | Max number of checkpoint files  |
+| ckpt_save_dir        | ./Mobilevit_Ckpt                | Location of training outputs    |
+| model_type           | x_small                         | Type of model to train          |
+| decay_epoch          | 250                             | Number of decay epochs          |
 
 ## Example
 
@@ -82,26 +78,22 @@ You can unzip the dataset files into this directory structure and read them by M
 
 ### Data augmentation and Train Model
 
 Training data augmentation methods (RandomResizedCrop --> RandomHorizontalFlip --> ToTensor).
-
 Run the train.py to start to train the model. Through the model_type parameter, you can choose which model you want to train.
-
-Attention, you can set that you want to do distribute parallel training by setting the **run_distribute** parameter in train.py.
-
-Attention, when you change the model_type, you have to change **ckpt_save_dir**, **model_type**the two parameter.
+Attention: when you change the model type, you have to update the default value of **model_type** in train.py, because distribute.sh does not pass it on the command line.
 
 ```shell
-python train.py --data_url "./dataset" --epoch_size 180 --model_type "xx_small"
+bash distribute.sh
 ```
 
 output:
 
 ```text
-Epoch:[128/150], step:[1218/20018], loss:[2.871/3.098], time:78.808ms, lr:0.00522
-Epoch:[128/150], step:[1219/20018], loss:[2.896/3.098], time:75.547ms, lr:0.00522
-Epoch:[128/150], step:[1220/20018], loss:[2.832/3.098], time:78.639ms, lr:0.00522
-Epoch:[128/150], step:[1221/20018], loss:[2.887/3.098], time:77.040ms, lr:0.00522
+Epoch:[299/300], step:[2490/2502], loss:[2.643/2.372], time:211.096ms, lr:0.00001
+Epoch:[299/300], step:[2491/2502], loss:[2.356/2.372], time:216.807ms, lr:0.00001
+Epoch:[299/300], step:[2492/2502], loss:[2.509/2.372], time:217.649ms, lr:0.00001
+Epoch:[299/300], step:[2493/2502], loss:[2.543/2.372], time:216.863ms, lr:0.00001
 
 ......
 
 ```
@@ -109,16 +101,18 @@ Epoch:[128/150], step:[1221/20018], loss:[2.887/3.098], time:77.040ms, lr:0.0052
 
 After training, you can use test set to evaluate the performance of your model. Run eval.py to achieve this. The usage of model_type parameter is same as training process.
+Attention: when you change the model type, you have to update both the **model_type** and **pretrained_model** parameters.
 
 ```shell
-python eval.py --model_type "xx_small" --pretrained_model "mobilevit_xxs.ckpt"
+python eval.py --model_type "x_small" --pretrained_model "x_small.ckpt"
+
 ```
 
 output:
 
 ```text
-{'Top_1_Accuracy':0.62184, 'Top_5_Accuracy':0.84292}
+{'Top_1_Accuracy':0.7185299, 'Top_5_Accuracy':0.9063100}
 ```
 
 ### Infer
@@ -128,7 +122,7 @@ Put your image in the infer folder, then run infer.py to do inference.
 
 ```shell
-python MobuileViT_infer.py --model_type "xx_small" --pretrained_model "mobilevit_xxs.ckpt"
+python infer.py --model_type "x_small" --pretrained_model "x_small.ckpt"
 ```
 
 output:
diff --git a/application_example/mobilevit/src/distribute.sh b/application_example/mobilevit/src/distribute.sh
new file mode 100644
index 0000000..9e9f6cc
--- /dev/null
+++ b/application_example/mobilevit/src/distribute.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+echo "=============================================================================================================="
+echo "Please run the script as: "
+echo "bash distribute.sh DATA_PATH RANK_SIZE"
+echo "For example: bash distribute.sh /path/dataset 8"
+echo "It is better to use the absolute path."
+echo "=============================================================================================================="
+
+export DATA_PATH=${1:-"/home/ma-user/work/imagenet/imagenet2012"}
+RANK_SIZE=${2:-8}
+
+for((i=0;i<${RANK_SIZE};i++))
+do
+    rm -rf device$i
+    mkdir device$i
+    cp ./train.py ./device$i
+    cd ./device$i
+    export DEVICE_ID=$i
+    export RANK_ID=$i
+    echo "start training for device $i"
+    env > env$i.log
+    python ../train.py --data_url $DATA_PATH > train.log$i 2>&1 &
+    cd ../
+done
+
+# Wait for every background training job so the script does not exit early,
+# and report failure if any device's training process returns non-zero.
+for pid in $(jobs -p)
+do
+    wait $pid || { echo "training failed"; exit 2; }
+done
+echo "training success"
+
diff --git a/application_example/mobilevit/src/train.py b/application_example/mobilevit/src/train.py
index 33f7109..4d0ca9f 100644
--- a/application_example/mobilevit/src/train.py
+++ b/application_example/mobilevit/src/train.py
@@ -15,15 +15,18 @@
 # ============================================================================
 """ mobilevit training script. """
 
+import os
 import argparse
 
 import mindspore
 import mindspore.dataset.vision.c_transforms as c_transforms
 import mindspore.dataset.vision.py_transforms as p_transforms
-from mindspore import nn
-from mindspore import context
+
 from mindspore.train import Model
+from mindspore import nn, context
+from mindspore import ParallelMode
 from mindspore.common import set_seed
+from mindspore.communication import init, get_rank, get_group_size
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
 
 from models.mobilevit import MobileViT
@@ -34,30 +37,34 @@ set_seed(1)
 
 
 def mobilevit_train(args_opt):
-    """mobilevit train"""
-
-    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
+    """MobileViT train."""
+    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
+                        device_id=int(os.environ["DEVICE_ID"]))
     context.set_context(enable_graph_kernel=False)
 
+    init()
+
+    rank_id = get_rank()
+    rank_size = get_group_size()
+
+    context.set_auto_parallel_context(parameter_broadcast=True)
+    context.set_auto_parallel_context(dataset_strategy="data_parallel")
+    context.set_auto_parallel_context(device_num=rank_size,
+                                      parallel_mode=ParallelMode.DATA_PARALLEL,
+                                      gradients_mean=True)
+
     # Data preprocessing
-    if args_opt.model_type == 'small':
-        img_transforms = ([
-            c_transforms.Decode(),
-            c_transforms.RandomResizedCrop(256),
-            c_transforms.RandomHorizontalFlip(),
-            c_transforms.AutoAugment(),
-            p_transforms.RandomErasing(prob=0.25),
-            c_transforms.ConvertColor(c_transforms.ConvertMode.COLOR_RGB2BGR),
-            p_transforms.ToTensor(),
-        ])
-    else:
-        img_transforms = ([
-            c_transforms.Decode(),
-            c_transforms.RandomResizedCrop(256),
-            c_transforms.RandomHorizontalFlip(),
-            c_transforms.ConvertColor(c_transforms.ConvertMode.COLOR_RGB2BGR),
-            p_transforms.ToTensor(),
-        ])
+    img_transforms = [
+        c_transforms.Decode(),
+        c_transforms.RandomResizedCrop(256),
+        c_transforms.RandomHorizontalFlip(),
+        c_transforms.ConvertColor(c_transforms.ConvertMode.COLOR_RGB2BGR),
+        p_transforms.ToTensor(),
+    ]
+
+    # Fixed loss scale, shared by the optimizer and the Model below.
+    loss_scale = 1024.0
+    loss_scale_manager = mindspore.FixedLossScaleManager(loss_scale, False)
 
     # dataset pipeline
     dataset = ImageNet(args_opt.data_url,
@@ -65,8 +72,10 @@ def mobilevit_train(args_opt):
                        shuffle=True,
                        transform=img_transforms,
                        num_parallel_workers=args_opt.num_parallel_workers,
-                       resize=args_opt.resize,
-                       batch_size=args_opt.batch_size)
+                       resize=256,
+                       batch_size=args_opt.batch_size,
+                       num_shards=rank_size,
+                       shard_id=rank_id)
 
     dataset_train = dataset.run()
     step_size = dataset_train.get_dataset_size()
@@ -74,20 +83,17 @@ def mobilevit_train(args_opt):
     # Create model.
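+    # model_type selects the MobileViT variant implemented in models/mobilevit.py
+    # (e.g. 'xx_small' or 'x_small'); it is also reused below as the checkpoint prefix.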
     network = MobileViT(model_type=args_opt.model_type, num_classes=args_opt.num_classes)
 
-    # Define the decreasing learning rate
-    lr = nn.cosine_decay_lr(min_lr=args_opt.min_lr,
-                            max_lr=args_opt.max_lr,
+    # Define the cosine-decay learning rate schedule.
+    lr = nn.cosine_decay_lr(min_lr=1e-5,
+                            max_lr=0.1,
                             total_step=args_opt.epoch_size * step_size,
                             step_per_epoch=step_size,
                             decay_epoch=args_opt.decay_epoch)
 
-    # Define loss scale
-    loss_scale = 1024.0
-    loss_scale_manager = mindspore.FixedLossScaleManager(loss_scale, False)
-
     # Define optimizer.
-    network_opt = nn.SGD(network.trainable_params(), lr, momentum=args_opt.momentum, weight_decay=args_opt.weight_decay,
-                         nesterov=False, loss_scale=loss_scale)
+    network_opt = nn.Momentum(network.trainable_params(), lr, args_opt.momentum, weight_decay=4e-5, loss_scale=loss_scale)
 
     # Define loss function.
     network_loss = CrossEntropySmooth(sparse=True,
@@ -95,23 +99,21 @@ def mobilevit_train(args_opt):
                                       reduction="mean",
                                       smooth_factor=0.1,
                                       classes_num=args_opt.num_classes)
 
-    # Define checkpoint
+    # Set up checkpoint saving.
     ckpt_config = CheckpointConfig(save_checkpoint_steps=step_size, keep_checkpoint_max=args_opt.keep_checkpoint_max)
     ckpt_callback = ModelCheckpoint(prefix=args_opt.model_type, directory=args_opt.ckpt_save_dir, config=ckpt_config)
 
     # Define metrics.
     metrics = {'acc', "loss"}
 
-    # Define timer
-    time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())
-
     # Init the model.
-    if args_opt.device_target == "Ascend":
-        model = Model(network, loss_fn=network_loss, optimizer=network_opt, metrics=metrics, amp_level="auto",
-                      loss_scale_manager=loss_scale_manager)
-    else:
-        model = Model(network, loss_fn=network_loss, optimizer=network_opt, metrics=metrics)
+    model = Model(network, loss_fn=network_loss, optimizer=network_opt, metrics=metrics, amp_level="auto",
+                  loss_scale_manager=loss_scale_manager)
+
+    time_cb = TimeMonitor(data_size=dataset_train.get_dataset_size())
 
     # Begin to train.
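+    # TimeMonitor logs the per-step time; ModelCheckpoint saves a checkpoint every
+    # step_size steps (once per epoch) into args_opt.ckpt_save_dir.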
     model.train(args_opt.epoch_size,
@@ -121,21 +121,16 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='MobileViT train.')
-    parser.add_argument('--epoch_size', type=int, default=200, help='Train epoch size.')
-    parser.add_argument('--model_type', default='xx_small', type=str, metavar='model_type')
+    parser.add_argument('--epoch_size', type=int, default=300, help='Train epoch size.')
+    parser.add_argument('--model_type', default='x_small', type=str, metavar='model_type')
     parser.add_argument('--batch_size', type=int, default=64, help='Number of batch size.')
-    parser.add_argument('--decay_epoch', type=int, default=150, help='Number of decay epochs.')
-    parser.add_argument('--num_classes', type=int, default=1001, help='Number of classification.')
-    parser.add_argument('--data_url', default=r"C:\Users\Administrator\Desktop\MobileViT修改版\src\dataset",
-                        help='Location of data.')
+    parser.add_argument('--decay_epoch', type=int, default=250, help='Number of decay epochs.')
+    parser.add_argument('--num_classes', type=int, default=1000, help='Number of classes.')
     parser.add_argument('--momentum', type=float, default=0.9, help='Momentum for the moving average.')
-    parser.add_argument('--device_target', type=str, default="CPU", choices=["Ascend", "GPU", "CPU"])
-    parser.add_argument('--max_lr', type=float, default=0.1, help='Number of the maximum learning rate.')
-    parser.add_argument('--num_parallel_workers', type=int, default=5, help='Number of parallel workers.')
-    parser.add_argument('--min_lr', type=float, default=1e-5, help='Number of the minimum learning rate.')
-    parser.add_argument('--resize', type=int, default=256, help='Resize the height and weight of picture.')
-    parser.add_argument('--weight_decay', type=float, default=4e-5, help='Momentum for the moving average.')
-    parser.add_argument('--keep_checkpoint_max', type=int, default=40, help='Max number of checkpoint files.')
-    parser.add_argument('--ckpt_save_dir', type=str, default="./Mobilevit_Ckpt/6", help='Location of training outputs.')
+    parser.add_argument('--device_target', type=str, default="Ascend", choices=["Ascend", "GPU", "CPU"])
+    parser.add_argument('--num_parallel_workers', type=int, default=8, help='Number of parallel workers.')
+    parser.add_argument('--data_url', default='/home/ma-user/work/imagenet2012', help='Location of data.')
+    parser.add_argument('--keep_checkpoint_max', type=int, default=50, help='Max number of checkpoint files.')
+    parser.add_argument('--ckpt_save_dir', type=str, default="./Mobilevit_Ckpt", help='Location of training outputs.')
     args = parser.parse_known_args()[0]
     mobilevit_train(args)
-- 
Gitee
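
A quick sanity check of the hard-coded learning-rate schedule: train.py now fixes min_lr=1e-5 and max_lr=0.1 and decays over the first 250 of 300 epochs. The sketch below reproduces the usual per-step cosine-annealing formula; holding the rate at min_lr once decay_epoch is reached is an assumption inferred from the training log above (lr stays at 0.00001 through epoch 299), so treat it as an illustration rather than the exact mindspore.nn.cosine_decay_lr implementation.

```python
import math

def cosine_decay_lr(min_lr, max_lr, total_step, step_per_epoch, decay_epoch):
    """Per-step learning rates under cosine annealing (illustrative sketch)."""
    lrs = []
    for i in range(total_step):
        # Assumption: the epoch index is clamped at decay_epoch, so the rate
        # stays at min_lr afterwards (matches the log, where lr is 0.00001
        # from epoch 250 on).
        epoch = min(i // step_per_epoch, decay_epoch)
        lrs.append(min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * epoch / decay_epoch)))
    return lrs

# Values from train.py and the training log: 300 epochs, 2502 steps per epoch.
lr = cosine_decay_lr(min_lr=1e-5, max_lr=0.1, total_step=300 * 2502,
                     step_per_epoch=2502, decay_epoch=250)
print(lr[0])           # 0.1 at the first step
print(lr[250 * 2502])  # 1e-5 once decay_epoch is reached
```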