From 6b1573c3bd1efbe7511ca19ada4079cc56ab5904 Mon Sep 17 00:00:00 2001 From: chenjunjie Date: Thu, 17 Oct 2024 21:49:46 +0800 Subject: [PATCH] cjj --- .../csrc/core/npu/NPUCachingAllocator.cpp | 163 +++++++----------- torch_npu/csrc/profiler/npu_profiler.cpp | 2 +- 2 files changed, 66 insertions(+), 99 deletions(-) diff --git a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp index 027c92a0c..8bc710172 100644 --- a/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp +++ b/torch_npu/csrc/core/npu/NPUCachingAllocator.cpp @@ -22,8 +22,8 @@ #include "NPUBlockHandle.h" #include "torch_npu/csrc/core/npu/sys_ctrl/npu_sys_ctrl.h" #include "torch_npu/csrc/core/npu/NPUEvent.h" -#ifndef BUILD_LIBTORCH #include "torch_npu/csrc/profiler/npu_profiler.h" +#ifndef BUILD_LIBTORCH #include "torch_npu/csrc/sanitizer/NPUTrace.h" #endif @@ -907,7 +907,8 @@ class DeviceCachingAllocator { // All public methods (except the above) acquire the allocator mutex. // Thus, do not call a public method from another public method. - Block* malloc(int device, size_t orig_size, aclrtStream stream) { + Block* malloc(int device, size_t orig_size, aclrtStream stream, uint8_t allocator_type = 0) + { // done outside the lock because we don't know what locks the recorder needs // to have... auto context = maybeGatherContext(RecordContext::STATE); @@ -1057,14 +1058,16 @@ class DeviceCachingAllocator { bool split_remainder = should_split(params.block, params.size()); return alloc_found_block( - std::move(params), orig_size, std::move(context), split_remainder); + std::move(params), orig_size, std::move(context), split_remainder, allocator_type); } Block* alloc_found_block( AllocParams params, size_t orig_size, std::shared_ptr context, - bool split_remainder) { + bool split_remainder, + uint8_t allocator_type) + { auto size = params.size(); auto device = params.device(); auto pool = params.pool; @@ -1156,11 +1159,27 @@ class DeviceCachingAllocator { stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); +#ifndef BUILD_LIBTORCH + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + block->device, + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), + allocator_type, + reinterpret_cast(block->ptr), + block->size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream)} + ); +#endif + return block; } - void free(Block* block) { + void free(Block* block, uint8_t allocator_type = 0) + { std::shared_ptr context = maybeGatherContext(RecordContext::ALL); std::lock_guard lock(mutex); @@ -1192,13 +1211,27 @@ class DeviceCachingAllocator { if (!block->stream_uses.empty() && c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { insert_events(block); } else { - free_block(block, context); + free_block(block, context, allocator_type); } ASCEND_LOGD("PTA CachingAllocator free: free = %zu, cached = %lu, allocated = %lu", orig_block_size, stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current); +#ifndef BUILD_LIBTORCH + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + block->device, + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), + allocator_type, + reinterpret_cast(orig_block_ptr), + -orig_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream)} + ); +#endif } void* getBaseAllocation(Block* block, size_t* outSize) { @@ -1396,11 +1429,6 @@ class DeviceCachingAllocator { } } - DeviceStats get_stats() - { - return stats; - } - private: // All private methods do not acquire the allocator mutex. @@ -1561,7 +1589,8 @@ class DeviceCachingAllocator { /** moves a block into a pool of cached free blocks **/ void free_block( Block* block, - const std::shared_ptr& context) + const std::shared_ptr& context, + uint8_t allocator_type = 0) { AT_ASSERT(!block->allocated && block->event_count == 0, PTA_ERROR(ErrCode::VALUE)); @@ -1575,6 +1604,7 @@ class DeviceCachingAllocator { block->context_when_allocated = nullptr; size_t original_block_size = block->size; + auto orig_block_ptr = block->ptr; size_t requested_size = block->requested_size; auto& pool = *block->pool; @@ -1619,6 +1649,20 @@ class DeviceCachingAllocator { stats.requested_bytes[stat_type], -static_cast(requested_size)); }); +#ifndef BUILD_LIBTORCH + torch_npu::profiler::reportMemoryDataToNpuProfiler({ + static_cast(c10::DeviceType::PrivateUse1), + block->device, + static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), + allocator_type, + reinterpret_cast(orig_block_ptr), + -original_block_size, + stats.allocated_bytes[static_cast(StatType::AGGREGATE)].current, + stats.reserved_bytes[static_cast(StatType::AGGREGATE)].current, + stats.active_bytes[static_cast(StatType::AGGREGATE)].current, + reinterpret_cast(block->stream)} + ); +#endif } /** combine previously split blocks. returns the size of the subsumed block, or 0 on failure. **/ @@ -2279,20 +2323,7 @@ class NpuCachingAllocator : public NPUAllocator { "Allocator not initialized for device ", device, ": did you call init?", PTA_ERROR(ErrCode::PARAM)); Block* block = device_allocator[device]->malloc(device, size, stream); -#ifndef BUILD_LIBTORCH - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), - reinterpret_cast(block->ptr), - block->size, - device_allocator[device]->get_stats().allocated_bytes[static_cast(StatType::AGGREGATE)].current, - device_allocator[device]->get_stats().reserved_bytes[static_cast(StatType::AGGREGATE)].current, - device_allocator[device]->get_stats().active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif + add_allocated_block(block); *devPtr = static_cast(block->ptr); #ifndef BUILD_LIBTORCH @@ -2319,35 +2350,9 @@ class NpuCachingAllocator : public NPUAllocator { reinterpret_cast(block->ptr)); } #endif + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; device_allocator[block->device]->free(block); -#ifndef BUILD_LIBTORCH - if (block->stream_uses.empty() || !c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), - reinterpret_cast(block->ptr), - -(block->size), - device_allocator[block->device]->get_stats().allocated_bytes[static_cast(StatType::AGGREGATE)].current, - device_allocator[block->device]->get_stats().reserved_bytes[static_cast(StatType::AGGREGATE)].current, - device_allocator[block->device]->get_stats().active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); - } - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_INNER), - reinterpret_cast(block->ptr), - -(block->size), - device_allocator[block->device]->get_stats().allocated_bytes[static_cast(StatType::AGGREGATE)].current, - device_allocator[block->device]->get_stats().reserved_bytes[static_cast(StatType::AGGREGATE)].current, - device_allocator[block->device]->get_stats().active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif } void setMemoryFraction(double fraction, int device) override @@ -2633,22 +2638,9 @@ void* MallocBlock(size_t size, void *stream, int device) { } AT_ASSERT(caching_allocator.device_allocator[device], PTA_ERROR(ErrCode::NOT_FOUND)); AT_ASSERT(stream, PTA_ERROR(ErrCode::NOT_FOUND)); - auto block = caching_allocator.device_allocator[device]->malloc(device, size, stream); + auto block = caching_allocator.device_allocator[device]->malloc(device, size, stream, + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); AT_ASSERT(block, PTA_ERROR(ErrCode::NOT_FOUND)); -#ifndef BUILD_LIBTORCH - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_MALLOC), - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL), - reinterpret_cast(block->ptr), - block->size, - caching_allocator.device_allocator[device]->get_stats().allocated_bytes[static_cast(StatType::AGGREGATE)].current, - caching_allocator.device_allocator[device]->get_stats().reserved_bytes[static_cast(StatType::AGGREGATE)].current, - caching_allocator.device_allocator[device]->get_stats().active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif return reinterpret_cast(block); } @@ -2657,35 +2649,10 @@ void FreeBlock(void *handle) { AT_ASSERT(block, PTA_ERROR(ErrCode::PTR)); caching_allocator.assertValidDevice(block->device); AT_ASSERT(caching_allocator.device_allocator[block->device], PTA_ERROR(ErrCode::NOT_FOUND)); - caching_allocator.device_allocator[block->device]->free(block); -#ifndef BUILD_LIBTORCH - if (block->stream_uses.empty() || !c10_npu::NpuSysCtrl::GetInstance().GetInitFlag()) { - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_BLOCK_FREE), - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL), - reinterpret_cast(block->ptr), - -(block->size), - caching_allocator.device_allocator[block->device]->get_stats().allocated_bytes[static_cast(StatType::AGGREGATE)].current, - caching_allocator.device_allocator[block->device]->get_stats().reserved_bytes[static_cast(StatType::AGGREGATE)].current, - caching_allocator.device_allocator[block->device]->get_stats().active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); - } - torch_npu::profiler::reportMemoryDataToNpuProfiler({ - static_cast(c10::DeviceType::PrivateUse1), - block->device, - static_cast(torch_npu::profiler::MemoryDataType::MEMORY_FREE), - static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL), - reinterpret_cast(block->ptr), - -(block->size), - caching_allocator.device_allocator[block->device]->get_stats().allocated_bytes[static_cast(StatType::AGGREGATE)].current, - caching_allocator.device_allocator[block->device]->get_stats().reserved_bytes[static_cast(StatType::AGGREGATE)].current, - caching_allocator.device_allocator[block->device]->get_stats().active_bytes[static_cast(StatType::AGGREGATE)].current, - reinterpret_cast(block->stream)} - ); -#endif + auto orig_block_ptr = block->ptr; + auto orig_block_size = block->size; + caching_allocator.device_allocator[block->device]->free(block, + static_cast(torch_npu::profiler::MemoryAllocatorType::ALLOCATOR_EXTERNAL)); } void* GetBlockPtr(const void *handle) { diff --git a/torch_npu/csrc/profiler/npu_profiler.cpp b/torch_npu/csrc/profiler/npu_profiler.cpp index b074e5b1d..d1a073024 100644 --- a/torch_npu/csrc/profiler/npu_profiler.cpp +++ b/torch_npu/csrc/profiler/npu_profiler.cpp @@ -117,7 +117,7 @@ struct NpuProfilerThreadLocalState : public ProfilerStateBase { c10::Device device) { if (config_.profile_memory && ProfilerMgr::GetInstance()->ReportEnable().load(std::memory_order_relaxed)) { - ProfilerMgr::GetInstance()->Upload(std::make_unique( + ProfilerMgr::GetInstance()->UploadWithLock(std::make_unique( reinterpret_cast(ptr), static_cast(Utils::GetClockTime()), alloc_size, -- Gitee