1 Star 0 Fork 0

zzhxxxx/Megatron core-r0.6.0

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
.gitlab-ci.yml 6.06 KB
一键复制 编辑 原始数据 按行查看 历史
Maanu Grover 提交于 2024-03-25 18:01 . Split unit test jobs
# Pipeline-level gate: these rules decide whether a pipeline is created at all.
# GitLab evaluates them top to bottom and stops at the first match, so the
# "open MR" exclusion must stay ahead of the generic branch rule.
workflow:
  rules:
    # always run MR pipelines
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
    # always run web pipelines
    - if: '$CI_PIPELINE_SOURCE == "web"'
    # do not run branch pipelines if open MR exists
    - if: '$CI_COMMIT_BRANCH && $CI_OPEN_MERGE_REQUESTS'
      when: never
    # run branch pipeline if no open MR
    - if: '$CI_COMMIT_BRANCH'
# Stages used by the jobs in this file (`test`); `jet` is presumably used by
# the jobs pulled in from jet-tests.yml -- confirm there.
stages: [test, jet]
# Pipeline-wide variables. Every value is written as a quoted string so the
# YAML parser cannot retype it before GitLab exports it to the job environment.
variables: &VARS
  SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
  DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
  # This is the image that is run by all nodes on selene for tests
  PYTORCH_IMAGE: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh"
  PYTHON_VIRTUAL_ENV: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate"
  # Can specify levels
  TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS"
  TESTS_TO_RUN_ON_THIS_COMMIT: "unit_tests"
  # Regex syntax follows https://github.com/google/re2/wiki/Syntax
  # (can define regex as in this spec), e.g. /.*gpt3.*/
  TEST_REGEX_ON_THIS_COMMIT: "NONE"
  JET_CUSTOM_FILTER: ""
  # Set to true for new tests to copy the logs for creating golden truth file
  DISPLAY_OUTPUT: "True"
  # Default time limit for all jobs
  TIME_LIMIT: "10:00"
  # Set to 1 to enable grouped gemm for MoE
  MOE_GROUPED_GEMM: "0"
  # Selectable when a pipeline is started manually: `value` is the default,
  # `options` the allowed choices.
  JET_CLUSTER_BRANCH:
    value: "mcore/draco-oci"
    options:
      - "mcore/draco-oci"
      - "mcore/eos"
    description: '"mcore/draco-oci" for OCI-IAD, "mcore/eos" for EOS'
# Additional CI configuration merged into this pipeline from a file in the
# same repository.
include:
  - 'jet-tests.yml'
# Runs the whole tests/unit_tests tree in one job, collecting coverage for
# megatron/core. Only added to merge-request pipelines whose labels match
# /Run tests/ and to pipelines on the default branch.
unit_tests:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  script:
    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
  # Pulls the percentage from the "total" line that the term coverage report
  # prints to the job log.
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    expire_in: 30 days
    paths:
      # NOTE(review): presumably the HTML coverage report directory -- confirm
      # against the project's coverage configuration.
      - coverage
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
# Hidden template (leading dot: GitLab never runs it as a job) holding the
# settings shared by every per-directory unit-test job below.
.unit_tests_split:
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/pytorch-all-tests:23.04-v1
  tags:
    - 8xL40S
  stage: test
  rules:
    # These two conditions are exactly the ones under which the combined
    # `unit_tests` job runs; the split jobs are skipped there and run in every
    # other pipeline.
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ /Run tests/'
      when: never
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
      when: never
    - when: always

# Each job below runs one slice of tests/unit_tests on 8 processes; only the
# pytest target differs between them.
unit_tests-data:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/data

unit_tests-dist-checkpointing:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/dist_checkpointing

unit_tests-fusions:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/fusions

unit_tests-models:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/models

unit_tests-pipeline-parallel:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/pipeline_parallel

unit_tests-tensor-parallel:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/tensor_parallel

unit_tests-transformer:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/transformer

# Test modules that sit directly in tests/unit_tests (the glob is expanded by
# the job's shell).
unit_tests-top-py:
  extends: .unit_tests_split
  script:
    - torchrun --nproc_per_node=8 -m pytest tests/unit_tests/*.py
# Documentation build check: clones the separate documentation repository next
# to this checkout, moves the checkout into it and runs that repository's
# `./repo docs` command.
docs_build_test:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
  tags:
    - os/linux
  # A failure here is reported but does not fail the pipeline.
  allow_failure: true
  # Skipped on the `main` ref.
  # NOTE(review): `except` is the legacy syntax; the other jobs in this file
  # use `rules`. The two do not select the same pipeline types by default, so
  # confirm the intended triggers before converting.
  except:
    - main
  script:
    # Work from the parent directory of the checkout.
    - cd ..
    # Drop any clone left by a previous run, then clone using the job token.
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
    # Assumes the checkout directory is named `megatron-lm` -- TODO confirm.
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs
# Formatting gate for megatron/core: both tools run in check-only mode, so the
# job reports violations instead of rewriting files.
formatting:
  stage: test
  image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/python-format:0.0.1
  tags:
    - os/linux
  script:
    - black megatron/core --check --verbose --diff
    - isort megatron/core --check
  # Added to every pipeline that the workflow rules allow.
  rules:
    - when: always
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/zzhxxxx/megatron-core-r0.6.0.git
[email protected]:zzhxxxx/megatron-core-r0.6.0.git
zzhxxxx
megatron-core-r0.6.0
Megatron core-r0.6.0
core_r0.6.0

搜索帮助