From 90bd4a3705ce1a239178284d001abe91dd05fb91 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 2 Jul 2024 17:10:56 +0800 Subject: [PATCH 01/16] =?UTF-8?q?estimator=E4=B8=ADtrain=E5=88=87=E6=8D=A2?= =?UTF-8?q?=E4=B8=BAeval,=E5=A2=9E=E5=8A=A0=E5=8E=9Fhost=E4=BE=A7train?= =?UTF-8?q?=E7=9A=84=E7=9B=B8=E5=85=B3=E7=8A=B6=E6=80=81=E5=A4=87=E4=BB=BD?= =?UTF-8?q?=EF=BC=8C=E5=9C=A8eval=E5=88=87=E6=8D=A2=E4=B8=BAtrain=E5=90=8E?= =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E8=BF=98=E5=8E=9F=E5=A4=87=E4=BB=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cache_manager/cache_manager.cpp | 27 ++++++++++++++++ .../cache_manager/cache_manager.h | 4 +++ src/AccCTR/src/embedding_cache/limited_set.h | 18 +++++++++++ .../offset_mapper/offset_mapper.h | 32 +++++++++++++++++++ src/AccCTR/src/include/embedding_cache.h | 14 ++++++++ src/core/emb_table/embedding_ddr.cpp | 10 ++++++ src/core/emb_table/embedding_ddr.h | 3 ++ src/core/emb_table/embedding_mgmt.cpp | 14 ++++++++ src/core/emb_table/embedding_mgmt.h | 11 +++++++ src/core/emb_table/embedding_static.cpp | 16 ++++++++-- src/core/emb_table/embedding_static.h | 4 +++ src/core/emb_table/embedding_table.cpp | 8 +++++ src/core/emb_table/embedding_table.h | 5 +++ src/core/hybrid_mgmt/hybrid_mgmt.cpp | 27 ++++++++++++++++ src/core/hybrid_mgmt/hybrid_mgmt.h | 5 +++ .../ock_ctr_common/include/embedding_cache.h | 14 ++++++++ 16 files changed, 210 insertions(+), 2 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index 8a6187a1..452e2fd1 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -317,6 +317,33 @@ int EmbCacheManagerImpl::LoadEmbTableInfos(std::string tableName, const std::vec return H_OK; } +int EmbCacheManagerImpl::BackUpTrainStatus(std:string tableName) +{ + int 
checkTableNameRet = CheckValidTableName(tableName); + if (checkTableNameRet != H_OK) { + return checkTableNameRet; + } + + auto om = offsetMappersBackUp.find(tableName); + if (om != offsetMappersBackUp.end()) { + offsetMappersBackUp[tableName] = offsetMappers[tableName]; + } else{ + offsetMappersBackUp[tableName].Initialize(1000, 1000); + offsetMappersBackUp[tableName] = offsetMappers[tableName]; + } + return H_OK; +} + +int EmbCacheManagerImpl::RecoverTrainStatus(std:string tableName) +{ + int checkTableNameRet = CheckValidTableName(tableName); + if (checkTableNameRet != H_OK) { + return checkTableNameRet; + } + offsetMappers[tableName] = offsetMappersBackUp[tableName]; + return H_OK; +} + void EmbCacheManagerImpl::Destroy() { for (auto it = offsetMappers.begin(); it != offsetMappers.end(); it++) { diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h index 80fbcd46..359e88ad 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h @@ -73,11 +73,15 @@ public: const std::vector>& embeddings, const std::vector>& optimizerSlots) override; + int BackUpTrainStatus(std:string tableName) override; + int RecoverTrainStatus(std::string tableName) override; + uint32_t GetUsage(const std::string& tableName) override; private: std::map embCacheInfos; std::map offsetMappers; + std::map offsetMappersBackUp; std::map embTables; int CheckValidTableName(const std::string& tableName); diff --git a/src/AccCTR/src/embedding_cache/limited_set.h b/src/AccCTR/src/embedding_cache/limited_set.h index 036a6477..d44b615a 100644 --- a/src/AccCTR/src/embedding_cache/limited_set.h +++ b/src/AccCTR/src/embedding_cache/limited_set.h @@ -47,6 +47,24 @@ public: delete tail; } + // 拷贝构造函数 + LimitedSet(const LimitedSet& other): head(new Node(-1)), tail(new Node(-1)) + { + nodes.resize(other.nodes.size()); + for (auto &node: nodes) { + 
node = new Node(-1); + } + + // 初始化头尾节点 + head->next = tail; + tail->prev = head; + + // 遍历原vector的每一个节点并复制 + for (Node* node = other.head->next; node != other.tail; node = node->next) { + insert(node->value); + } + } + void insert(uint64_t value) { if (nodes[value]->value == value) { diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h index f42a0d3f..1ad470c5 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h @@ -35,6 +35,38 @@ public: ~OffsetMapper() = default; + OffsetMapper(const OffsetMapper& other): maxCacheSize(other.maxCacheSize), useLength(other.useLength), + validPos(new LimitedSet(*other.validPos)), + evictPos(new LimitedSet(*other.evictPos)), + pos2Key(other.pos2Key), lastBatchPos(other.lastBatchPos), + evictSize(other.evictSize) + { + } + + OffsetMapper& operator=(const OffsetMapper& other) + { + if (this != &other) { + delete validPos; + validPos = nullptr; + delete evictPos; + evictPos = nullptr; + + if (other.validPos != nullptr) { + validPos = new LimitedSet(*other.validPos); + } + if (other.evictPos != nullptr) { + evictPos = new LimitedSet(*other.evictPos); + } + + maxCacheSize = other.maxCacheSize; + useLength = other.useLength; + pos2Key = other.pos2Key; + lastBatchPos = other.lastBatchPos; + evictSize = other.evictSize; + } + return *this; + } + bool Initialize(uint32_t reserve, uint32_t maxSize = 0) { maxCacheSize = maxSize; diff --git a/src/AccCTR/src/include/embedding_cache.h b/src/AccCTR/src/include/embedding_cache.h index 4adf1fbf..40d9dcbe 100644 --- a/src/AccCTR/src/include/embedding_cache.h +++ b/src/AccCTR/src/include/embedding_cache.h @@ -315,6 +315,20 @@ public: virtual int LoadEmbTableInfos(std::string tableName, const std::vector& keys, const std::vector>& embeddings, const std::vector>& optimizerSlots) = 0; + + /* * + * train通道切换为eval, 备份当前表的offsetMapper对象, 
存储下当前train对应的devices上key的状态 + * @Param tableName: 需要加载信息的table名字 + * @Return errorCode + */ + virtual int BackUpTrainStatus(std::string tableName) = 0; + + /* * + * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态 + * @Param tableName: 需要加载信息的table名字 + * @Return errorCode + */ + virtual int RecoverTrainStatus(std::string tableName) = 0; }; } // namespace EmbCache diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index ca706c73..e4b96eb6 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -376,3 +376,13 @@ void EmbeddingDDR::SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache) { this->embCache = embCache; } + +void EmbeddingDDR::BackUpTrainStatus() +{ + embCache->BackUpTrainStatus(name); +} + +void EmbeddingDDR::RecoverTrainStatus() +{ + embCache->RecoverTrainStatus(name); +} diff --git a/src/core/emb_table/embedding_ddr.h b/src/core/emb_table/embedding_ddr.h index ac5c5878..26d85e60 100644 --- a/src/core/emb_table/embedding_ddr.h +++ b/src/core/emb_table/embedding_ddr.h @@ -73,6 +73,9 @@ public: void SaveEmbAndOptim(const string& savePath); void SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache); + void BackUpTrainStatus(); + void RecoverTrainStatus(); + GTEST_PRIVATE: void EvictDeleteEmb(const vector& keys); diff --git a/src/core/emb_table/embedding_mgmt.cpp b/src/core/emb_table/embedding_mgmt.cpp index 9e7dcbb0..d889cdba 100644 --- a/src/core/emb_table/embedding_mgmt.cpp +++ b/src/core/emb_table/embedding_mgmt.cpp @@ -196,3 +196,17 @@ void EmbeddingMgmt::SetEmbCacheForEmbTable(const ock::ctr::EmbCacheManagerPtr& e table.second->SetEmbCache(embCache); } } + +void EmbeddingMgmt::BackUpTrainStatusBeforeLoad() +{ + for (auto& table: embeddings) { + table.second->BackUpTrainStatus(); + } +} + +void EmbeddingMgmt::RecoverTrainStatus() +{ + for (auto& table: embeddings) { + table.second->RecoverTrainStatus(); + } +} \ No newline at end of file diff --git 
a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h index ef106786..7cd3f782 100644 --- a/src/core/emb_table/embedding_mgmt.h +++ b/src/core/emb_table/embedding_mgmt.h @@ -89,6 +89,17 @@ public: */ void Save(const string& filePath); + /** + * estimator模式下train切换为eval时, 备份所有表train的状态 + */ + void BackUpTrainStatusBeforeLoad(); + + /** + * estimator模式下eval切换为train时, 还原所有表train的状态 + */ + void RecoverTrainStatus(); + + /** * 获取所有表对应的DeviceOffsets,该偏移用于python侧保存embedding时抽取key对应的embedding */ diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index 61874b1f..0db152ed 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -160,11 +160,23 @@ void EmbeddingStatic::LoadKey(const string& savePath) } maxOffset = keyOffsetMap.size(); - free(static_cast(buf)); } vector EmbeddingStatic::GetDeviceOffset() { return deviceOffset; -} \ No newline at end of file +} + +void EmbeddingStatic::BackUpTrainStatus() +{ + keyOffsetMapBackUp = keyOffsetMap; +} + +void EmbeddingStatic::RecoverTrainStatus() +{ + if (keyOffsetMapBackUp.size()!=0) { + keyOffsetMap = keyOffsetMapBackUp; + keyOffsetMapBackUp.clear(); + } +} diff --git a/src/core/emb_table/embedding_static.h b/src/core/emb_table/embedding_static.h index 6515f586..6f772e08 100644 --- a/src/core/emb_table/embedding_static.h +++ b/src/core/emb_table/embedding_static.h @@ -39,6 +39,10 @@ public: void Save(const string& savePath); + void BackUpTrainStatus(); + + void RecoverTrainStatus(); + vector GetDeviceOffset(); GTEST_PRIVATE: diff --git a/src/core/emb_table/embedding_table.cpp b/src/core/emb_table/embedding_table.cpp index b4eb2379..12b0137a 100644 --- a/src/core/emb_table/embedding_table.cpp +++ b/src/core/emb_table/embedding_table.cpp @@ -143,6 +143,14 @@ void EmbeddingTable::Save(const string& filePath) { } +void EmbeddingTable::BackUpTrainStatus() +{ +} + +void EmbeddingTable::RecoverTrainStatus() +{ +} + void 
EmbeddingTable::MakeDir(const string& dirName) { if (fileSystemPtr_ == nullptr) { diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h index cbf15a7a..174cc0fc 100644 --- a/src/core/emb_table/embedding_table.h +++ b/src/core/emb_table/embedding_table.h @@ -76,6 +76,10 @@ public: void MakeDir(const string& dirName); + virtual void BackUpTrainStatus(); + + virtual void RecoverTrainStatus(); + virtual vector GetDeviceOffset(); vector GetLoadOffset(); @@ -96,6 +100,7 @@ public: size_t ssdVocabSize; size_t maxOffset; absl::flat_hash_map keyOffsetMap; + absl::flat_hash_map keyOffsetMapBackUp; std::vector evictDevPos; // 记录HBM内被淘汰的key std::vector evictHostPos; // 记录Host内淘汰列表 diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index fda54d9d..100ed24e 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -221,6 +221,7 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) Checkpoint loadCkpt; vector loadFeatures; SetFeatureTypeForLoad(loadFeatures); + BackUpTrainStatus(); if (warmStartTables.size() == 0) { EmbeddingMgmt::Instance()->Load(loadPath, trainKeysSet); @@ -499,6 +500,8 @@ void HybridMgmt::EvalTask(TaskType type) cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; }); + // 在唤醒train的数据处理进程之前,需要将备份的train状态还原 + RecoverTrainStatus(); hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); LOG_DEBUG("wake TrainTask"); hybridMgmtBlock->DoBlock(channelId); @@ -2230,3 +2233,27 @@ bool HybridMgmt::IsTrainAndEvalCase() } return alreadyTrainOnce && isChannelSwitchCase; } + +void HybridMgmt::BackUpTrainStatus() +{ + int channelID = TRAIN_CHANNEL_ID; + int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[channelID]; + //续训load、predict模式下的load不需要对train的状态进行备份 + if (theTrainBatchId==0) { + return; + } + // train and eval模式下,train切换为eval之后 + // eval的load需要线备份原有的相关状态, HBM非扩容模式需要备份keyOffsetMap, 
DDR模式需要备份offsetMapper对象 + LOG_INFO("On Estimator train and eval mode, start to backup train status, " + "current train batchId: {} .", theTrainBatchId); + EmbeddingMgmt::Instance()->BackUpTrainStatusBeforeLoad(); + isBackUpTrainStatus = true; +} + +void HybridMgmt::RecoverTrainStatus() +{ + if (isBackUpTrainStatus) { + EmbeddingMgmt::Instance()->RecoverTrainStatus(); + } + isBackUpTrainStatus = false; +} \ No newline at end of file diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index 83299da3..fb050e70 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -133,6 +133,10 @@ namespace MxRec { void ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBatchOut); + void BackUpTrainStatus(); + + void RecoverTrainStatus(); + GTEST_PRIVATE: bool mutexDestroy { false }; std::mutex lookUpAndSendBatchIdMtx; @@ -225,6 +229,7 @@ namespace MxRec { bool isLoad { false }; bool isInitialized { false }; bool alreadyTrainOnce = false; // 用于判断是否为predict模式 + bool isBackUpTrainStatus = false; // 用于判断当前是否已经备份了train的状态 map lookUpSwapInAddrsPushId; // 用于处理eos场景,当消费者追上生产者且长时间无上游数据,会触发eos map specialProcessStatus; diff --git a/src/core/ock_ctr_common/include/embedding_cache.h b/src/core/ock_ctr_common/include/embedding_cache.h index f3bc9e23..5e25a718 100644 --- a/src/core/ock_ctr_common/include/embedding_cache.h +++ b/src/core/ock_ctr_common/include/embedding_cache.h @@ -315,6 +315,20 @@ public: virtual int LoadEmbTableInfos(std::string tableName, const std::vector& keys, const std::vector>& embeddings, const std::vector>& optimizerSlots) = 0; + + /* * + * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态 + * @Param tableName: 需要加载信息的table名字 + * @Return errorCode + */ + virtual int BackUpTrainStatus(std::string tableName) = 0; + + /* * + * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态 + * @Param tableName: 需要加载信息的table名字 + * @Return errorCode + */ + virtual int 
RecoverTrainStatus(std::string tableName) = 0; }; } // namespace EmbCache -- Gitee From 1e9e773c32f67ff466893976f5b748ac217947c0 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Mon, 15 Jul 2024 14:20:33 +0800 Subject: [PATCH 02/16] =?UTF-8?q?estimator=E4=B8=ADtrain=E5=88=87=E6=8D=A2?= =?UTF-8?q?=E4=B8=BAeval,=E5=A2=9E=E5=8A=A0=E5=8E=9Fhost=E4=BE=A7train?= =?UTF-8?q?=E7=9A=84=E7=9B=B8=E5=85=B3=E7=8A=B6=E6=80=81=E5=A4=87=E4=BB=BD?= =?UTF-8?q?=EF=BC=8C=E5=9C=A8eval=E5=88=87=E6=8D=A2=E4=B8=BAtrain=E5=90=8E?= =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E8=BF=98=E5=8E=9F=E5=A4=87=E4=BB=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/src/common/util/error_code.h | 1 + .../cache_manager/cache_manager.cpp | 57 ++++++++++++-- .../cache_manager/cache_manager.h | 8 +- src/AccCTR/src/embedding_cache/limited_set.h | 19 +++-- src/AccCTR/src/include/embedding_cache.h | 18 +++-- src/core/emb_table/embedding_ddr.cpp | 11 ++- src/core/emb_table/embedding_mgmt.h | 5 +- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 30 +++++--- src/core/hybrid_mgmt/hybrid_mgmt.h | 2 +- src/core/l3_storage/cache_manager.cpp | 74 ++++++++++++++++++- src/core/l3_storage/cache_manager.h | 11 +++ .../ock_ctr_common/include/embedding_cache.h | 18 +++-- 12 files changed, 200 insertions(+), 54 deletions(-) diff --git a/src/AccCTR/src/common/util/error_code.h b/src/AccCTR/src/common/util/error_code.h index b30bfd83..87c8ffe6 100644 --- a/src/AccCTR/src/common/util/error_code.h +++ b/src/AccCTR/src/common/util/error_code.h @@ -43,6 +43,7 @@ using CTRCode = enum : int { H_TABLE_NAME_EMPTY = 22, H_PREFILL_BUFFER_SIZE_INVALID = 23, H_TABLE_NAME_TOO_LONG = 24, + H_EMB_CACHE_INFO_LOST = 25 }; } } diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index 68351328..52578820 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ 
b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -253,8 +253,7 @@ int EmbCacheManagerImpl::ExportDeviceKeyOffsetPairs(const std::string& tableName if (checkTableNameRet != H_OK) { return checkTableNameRet; } - OffsetMapper& om = offsetMappers[tableName]; - koVec = om.ExportSortedKVPairs(); + koVec = offsetMappers[tableName].ExportSortedKVPairs(); return H_OK; } @@ -318,30 +317,58 @@ int EmbCacheManagerImpl::LoadEmbTableInfos(std::string tableName, const std::vec return H_OK; } -int EmbCacheManagerImpl::BackUpTrainStatus(std:string tableName) +int EmbCacheManagerImpl::BackUpTrainStatus(const std::string& tableName) { int checkTableNameRet = CheckValidTableName(tableName); if (checkTableNameRet != H_OK) { return checkTableNameRet; } + // Back up the key-offset correspondence on the device + kvVecsBackUp[tableName] = offsetMappers[tableName].ExportVec(); + + auto embInfo = embCacheInfos.find(tableName); + if (embInfo == embCacheInfos.end()) { + return H_EMB_CACHE_INFO_LOST; + } + uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO; + uint32_t maxCacheSize = embInfo->second.maxCacheSize; + auto om = offsetMappersBackUp.find(tableName); if (om != offsetMappersBackUp.end()) { - offsetMappersBackUp[tableName] = offsetMappers[tableName]; - } else{ - offsetMappersBackUp[tableName].Initialize(1000, 1000); - offsetMappersBackUp[tableName] = offsetMappers[tableName]; + offsetMappersBackUp[tableName].UnInitialize(); } + offsetMappersBackUp[tableName].Initialize(reserve, maxCacheSize); + offsetMappersBackUp[tableName] = offsetMappers[tableName]; + return H_OK; } -int EmbCacheManagerImpl::RecoverTrainStatus(std:string tableName) +int EmbCacheManagerImpl::RecoverTrainStatus(const std::string& tableName) { int checkTableNameRet = CheckValidTableName(tableName); if (checkTableNameRet != H_OK) { return checkTableNameRet; } + + auto embInfo = embCacheInfos.find(tableName); + if (embInfo == embCacheInfos.end()) { + return H_EMB_CACHE_INFO_LOST; + } + 
uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO; + uint32_t maxCacheSize = embInfo->second.maxCacheSize; + + offsetMappers[tableName].UnInitialize(); + offsetMappers[tableName].Initialize(reserve, maxCacheSize); offsetMappers[tableName] = offsetMappersBackUp[tableName]; + + // Recover the key-offset correspondence on the device + auto kvVecBackUp = kvVecsBackUp[tableName]; + for (const auto& kvPair: kvVecBackUp) { + offsetMappers[tableName].Put(kvPair.first, kvPair.second); + } + + kvVecBackUp.clear(); return H_OK; } @@ -449,3 +476,17 @@ uint32_t EmbCacheManagerImpl::GetUsage(const std::string& tableName) { return embTables[tableName].GetUsage(); } + +int EmbCacheManagerImpl::ResetOffsetMappers() +{ + for (auto it = offsetMappers.begin(); it != offsetMappers.end(); it++) { + auto embInfo = embCacheInfos.find(it->first); + if (embInfo == embCacheInfos.end()) { + return H_EMB_CACHE_INFO_LOST; + } + it->second.UnInitialize(); + uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO; + it->second.Initialize(reserve, embInfo->second.maxCacheSize); + } + return H_OK; +} diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h index 359e88ad..e4a240ae 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h @@ -73,8 +73,11 @@ public: const std::vector>& embeddings, const std::vector>& optimizerSlots) override; - int BackUpTrainStatus(std:string tableName) override; - int RecoverTrainStatus(std::string tableName) override; + int BackUpTrainStatus(const std::string& tableName) override; + + int RecoverTrainStatus(const std::string& tableName) override; + + int ResetOffsetMappers() override; uint32_t GetUsage(const std::string& tableName) override; @@ -83,6 +86,7 @@ private: std::map offsetMappers; std::map offsetMappersBackUp; std::map embTables; + std::map>> kvVecsBackUp; int 
CheckValidTableName(const std::string& tableName); diff --git a/src/AccCTR/src/embedding_cache/limited_set.h b/src/AccCTR/src/embedding_cache/limited_set.h index d44b615a..f7bc2e1e 100644 --- a/src/AccCTR/src/embedding_cache/limited_set.h +++ b/src/AccCTR/src/embedding_cache/limited_set.h @@ -20,19 +20,21 @@ limitations under the License. namespace EmbCache { +static constexpr int64_t NODE_DEFAULT_VALUE = -1; + class LimitedSet { public: struct Node { uint64_t value; Node *prev, *next; - Node(uint64_t val = -1) : value(val), prev(nullptr), next(nullptr) {} + Node(uint64_t val = NODE_DEFAULT_VALUE) : value(val), prev(nullptr), next(nullptr) {} }; - LimitedSet(uint64_t maxRange) : head(new Node(-1)), tail(new Node(-1)) + LimitedSet(uint64_t maxRange) : head(new Node(NODE_DEFAULT_VALUE)), tail(new Node(NODE_DEFAULT_VALUE)) { nodes.resize(maxRange); for (auto &node : nodes) { - node = new Node(-1); + node = new Node(NODE_DEFAULT_VALUE); } head->next = tail; tail->prev = head; @@ -47,19 +49,16 @@ public: delete tail; } - // 拷贝构造函数 - LimitedSet(const LimitedSet& other): head(new Node(-1)), tail(new Node(-1)) + LimitedSet(const LimitedSet& other): head(new Node(NODE_DEFAULT_VALUE)), tail(new Node(NODE_DEFAULT_VALUE)) { nodes.resize(other.nodes.size()); - for (auto &node: nodes) { - node = new Node(-1); + for (auto& node: nodes) { + node = new Node(NODE_DEFAULT_VALUE); } - // 初始化头尾节点 head->next = tail; tail->prev = head; - // 遍历原vector的每一个节点并复制 for (Node* node = other.head->next; node != other.tail; node = node->next) { insert(node->value); } @@ -87,7 +86,7 @@ public: Node *node = nodes[value]; node->prev->next = node->next; node->next->prev = node->prev; - node->value = -1; + node->value = NODE_DEFAULT_VALUE; } bool find(uint64_t value) diff --git a/src/AccCTR/src/include/embedding_cache.h b/src/AccCTR/src/include/embedding_cache.h index 40d9dcbe..c0468549 100644 --- a/src/AccCTR/src/include/embedding_cache.h +++ b/src/AccCTR/src/include/embedding_cache.h @@ -317,18 
+317,24 @@ public: const std::vector>& optimizerSlots) = 0; /* * - * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the channel to eval, backup the current table's offsetMapper object. + * @Param tableName: embedding table name * @Return errorCode */ - virtual int BackUpTrainStatus(std::string tableName) = 0; + virtual int BackUpTrainStatus(const std::string& tableName) = 0; /* * - * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the eval channel back to train, Recover the current table's offsetMapper object to the backup state. + * @Param tableName: embedding table name + * @Return errorCode + */ + virtual int RecoverTrainStatus(const std::string& tableName) = 0; + + /* * + * Reset the offsetMapper object to revert to its initialized state after loading. * @Return errorCode */ - virtual int RecoverTrainStatus(std::string tableName) = 0; + virtual int ResetOffsetMappers() = 0; }; } // namespace EmbCache diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 82ca0b73..d05b3501 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -78,6 +78,11 @@ void EmbeddingDDR::Load(const string& savePath, mapResetOffsetMappers(); + if (rs != 0) { + throw runtime_error("embCache->ResetOffsetMappers failed, err code: " + to_string(rc)); + } } void EmbeddingDDR::LoadKey(const string &savePath, vector &keys) @@ -187,15 +192,13 @@ void EmbeddingDDR::LoadOptimizerSlot(const string &savePath, vector keys; vector> embeddings; vector> optimizerSlots; auto step = GetStepFromPath(savePath); - if (step > 0) { - SyncLatestEmbedding(); - embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots); - } + embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots); SaveKey(savePath, keys); SaveEmbedding(savePath, embeddings); diff --git 
a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h index 7cd3f782..9dd0e363 100644 --- a/src/core/emb_table/embedding_mgmt.h +++ b/src/core/emb_table/embedding_mgmt.h @@ -90,16 +90,15 @@ public: void Save(const string& filePath); /** - * estimator模式下train切换为eval时, 备份所有表train的状态 + * In estimator mode, when switching from train to eval, backup the training state of all tables. */ void BackUpTrainStatusBeforeLoad(); /** - * estimator模式下eval切换为train时, 还原所有表train的状态 + * In estimator mode, when switching from eval to train, recover the training state of all tables. */ void RecoverTrainStatus(); - /** * 获取所有表对应的DeviceOffsets,该偏移用于python侧保存embedding时抽取key对应的embedding */ diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 84195a3c..91750b65 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -206,12 +206,6 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) throw runtime_error("HybridMgmt not initialized. 
Call Initialize first."); } - if (mgmtRankInfo.isDDR && IsTrainAndEvalCase()) { - LOG_INFO("estimator train and eval case, skip loading, " - "host will reuse data in memory while evaluating since is's same as saved data"); - return true; - } - // 数据处理线程上锁 KEY_PROCESS_INSTANCE->LoadSaveLock(); @@ -257,10 +251,15 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) featAdmitNEvict.LoadHistoryRecords(loadData.histRec); } + int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[TRAIN_CHANNEL_ID]; if (isL3StorageEnabled) { LOG_DEBUG(MGMT + "Start host side load: L3Storage key freq map"); auto step = GetStepFromPath(loadPath); - cacheManager->Load(mgmtEmbInfo, step, trainKeysSet); + // When in load and train mode or predict mode, SSD needs to actually execute loading + // When in the train and eval modes, loading before eval should be directly skipped + if (theTrainBatchId == 0) { + cacheManager->Load(mgmtEmbInfo, step, trainKeysSet); + } } LOG_DEBUG(MGMT + "Finish host side load process"); @@ -502,7 +501,7 @@ void HybridMgmt::EvalTask(TaskType type) cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; }); if (hybridMgmtBlock->pythonBatchId[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) { - // 在唤醒train的数据处理进程之前,需要将备份的train状态还原 + // Before waking the data process for training, Recover the backed-up training state RecoverTrainStatus(); hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); } else { @@ -2210,15 +2209,18 @@ void HybridMgmt::BackUpTrainStatus() { int channelID = TRAIN_CHANNEL_ID; int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[channelID]; - //续训load、predict模式下的load不需要对train的状态进行备份 - if (theTrainBatchId==0) { + if (theTrainBatchId == 0) { return; } - // train and eval模式下,train切换为eval之后 - // eval的load需要线备份原有的相关状态, HBM非扩容模式需要备份keyOffsetMap, DDR模式需要备份offsetMapper对象 + LOG_INFO("On Estimator train and eval mode, start to backup train status, " "current train batchId: {} .", 
theTrainBatchId); + // When in the train and eval mode of estimator, backup training states before loading. EmbeddingMgmt::Instance()->BackUpTrainStatusBeforeLoad(); + + if (isL3StorageEnabled) { + cacheManager->BackUpTrainStatus(); + } isBackUpTrainStatus = true; } @@ -2227,5 +2229,9 @@ void HybridMgmt::RecoverTrainStatus() if (isBackUpTrainStatus) { EmbeddingMgmt::Instance()->RecoverTrainStatus(); } + + if (isL3StorageEnabled) { + cacheManager->RecoverTrainStatus(); + } isBackUpTrainStatus = false; } \ No newline at end of file diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index f845efb1..233030b9 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -223,7 +223,7 @@ private: bool isLoad{false}; bool isInitialized{false}; bool alreadyTrainOnce = false; // 用于判断是否为predict模式 - bool isBackUpTrainStatus = false; // 用于判断当前是否已经备份了train的状态 + bool isBackUpTrainStatus = false; // whether the train state has been backed up map lookUpSwapInAddrsPushId; // 用于处理eos场景,当消费者追上生产者且长时间无上游数据,会触发eos map specialProcessStatus; diff --git a/src/core/l3_storage/cache_manager.cpp b/src/core/l3_storage/cache_manager.cpp index ee3d7bc5..7ea68e14 100644 --- a/src/core/l3_storage/cache_manager.cpp +++ b/src/core/l3_storage/cache_manager.cpp @@ -32,10 +32,10 @@ void CacheManager::Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vectorembCache = std::move(embCachePtr); for (auto& emb : mgmtEmbInfo) { - EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false}; + EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false, emb.extEmbeddingSize}; embBaseInfos.emplace(emb.name, baseInfo); preProcessMapper[emb.name].Initialize(emb.name, emb.hostVocabSize, emb.ssdVocabSize); } @@ -293,3 +293,73 @@ void CacheManager::FetchL3StorageEmb2DDR(string tableName, uint32_t extEmbedding embeddingTaskStep++; evictWaitCond.notify_all(); } + +void CacheManager::BackUpTrainStatus() +{ + ddrKeyFreqMapBackUp = ddrKeyFreqMap; + 
excludeDDRKeyCountMapBackUp = excludeDDRKeyCountMap; +} + +void CacheManager::RecoverTrainStatus() +{ + for (const auto& pair: excludeDDRKeyCountMapBackUp) { + auto tableName = pair.first; + + std::vector ssdKeysBeforeEval; + std::vector ssdKeysAfterEval; + std::vector swapInKeys; + std::vector swapOutKeys; + + for (const auto& keyMap : pair.second) { + ssdKeysBeforeEval.push_back(keyMap.first); + } + for (const auto& keyMap : excludeDDRKeyCountMap[tableName]) { + ssdKeysAfterEval.push_back(keyMap.first); + } + + GetSwapInAndSwapOutKeys(ssdKeysBeforeEval, ssdKeysAfterEval, swapInKeys, swapOutKeys); + + // ddr <-> ssd + // ddr-> lookup address, ssd->insert embedding , ddr->remove embedding + vector swapInKeysAddr; + int rc = embCache->EmbeddingLookupAddrs(tableName, swapInKeys, swapInKeysAddr); + if (rc != 0) { + throw runtime_error("EmbeddingLookUpAddrs failed! error code: " + std::to_string(rc)); + } + auto extEmbeddingSize = embBaseInfos[tableName].extEmbeddingSize; + l3Storage->InsertEmbeddingsByAddr(tableName, swapInKeys, swapInKeysAddr, extEmbeddingSize); + rc = embCache->EmbeddingRemove(tableName, swapInKeys); + if (rc != 0) { + throw runtime_error("EmbeddingRemove failed! 
error code: " + std::to_string(rc)); + } + + // ssd->fetch embedding, ddr->EmbeddingUpdate, ssd->delete embedding + auto swapOutEmbeddings = l3Storage->FetchEmbeddings(tableName, swapOutKeys); + vector swapOutFlattenEmbeddings; + for (auto& emb : swapOutEmbeddings) { + swapOutFlattenEmbeddings.insert(swapOutFlattenEmbeddings.cend(), emb.cbegin(), emb.cend()); + } + rc = embCache->EmbeddingUpdate(tableName, swapOutKeys, swapOutFlattenEmbeddings.data()); + l3Storage->DeleteEmbeddings(tableName, swapOutKeys); + } + + ddrKeyFreqMap = ddrKeyFreqMapBackUp; + excludeDDRKeyCountMap = excludeDDRKeyCountMapBackUp; +} + +void CacheManager::GetSwapInAndSwapOutKeys(vector& ssdKeysBeforeEval, + vector& ssdKeysAfterEval, + vector& swapInKeys, vector& swapOutKeys) +{ + std::sort(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end()); + std::sort(ssdKeysAfterEval.begin(), ssdKeysAfterEval.end()); + vector intersectionKeys; + std::set_intersection(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end(), ssdKeysAfterEval.begin(), + ssdKeysAfterEval.end(), std::back_inserter(intersectionKeys)); + + std::set_difference(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end(), intersectionKeys.begin(), + intersectionKeys.end(), std::back_inserter(swapInKeys)); + std::set_difference(ssdKeysAfterEval.begin(), ssdKeysAfterEval.end(), intersectionKeys.begin(), + intersectionKeys.end(), std::back_inserter(swapOutKeys)); +} + diff --git a/src/core/l3_storage/cache_manager.h b/src/core/l3_storage/cache_manager.h index 79335788..34e7f0c2 100644 --- a/src/core/l3_storage/cache_manager.h +++ b/src/core/l3_storage/cache_manager.h @@ -107,10 +107,20 @@ namespace MxRec { int64_t GetTableUsage(const string& tableName); + void BackUpTrainStatus(); + + void RecoverTrainStatus(); + + void GetSwapInAndSwapOutKeys(vector& ssdKeysBeforeEval, + vector& ssdKeysAfterEval, + vector& swapInKeys, vector& swapOutKeys); + // DDR内每个表中emb数据频次缓存;map unordered_map ddrKeyFreqMap; + unordered_map ddrKeyFreqMapBackUp; // 
每张表中非DDR内key的出现次数 unordered_map> excludeDDRKeyCountMap; + unordered_map> excludeDDRKeyCountMapBackUp; // 每一个table对应一个PreProcessMapper,预先推演HBM->DDR的情况 std::unordered_map preProcessMapper; @@ -125,6 +135,7 @@ namespace MxRec { uint64_t maxTableSize; vector savePath; bool isExist; + int extEmbeddingSize; }; void CreateL3StorageTableIfNotExist(const std::string& embTableName); diff --git a/src/core/ock_ctr_common/include/embedding_cache.h b/src/core/ock_ctr_common/include/embedding_cache.h index 5e25a718..ce807f16 100644 --- a/src/core/ock_ctr_common/include/embedding_cache.h +++ b/src/core/ock_ctr_common/include/embedding_cache.h @@ -317,18 +317,24 @@ public: const std::vector>& optimizerSlots) = 0; /* * - * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the channel to eval, backup the current table's offsetMapper object. + * @Param tableName: embedding table name * @Return errorCode */ - virtual int BackUpTrainStatus(std::string tableName) = 0; + virtual int BackUpTrainStatus(const std::string& tableName) = 0; /* * - * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the eval channel back to train, Recover the current table's offsetMapper object to the backup state. + * @Param tableName: embedding table name + * @Return errorCode + */ + virtual int RecoverTrainStatus(const std::string& tableName) = 0; + + /* * + * Reset the offsetMapper object to revert to its initialized state after loading. 
* @Return errorCode */ - virtual int RecoverTrainStatus(std::string tableName) = 0; + virtual int ResetOffsetMappers() = 0; }; } // namespace EmbCache -- Gitee From aaabe4aa37ef1b188d5e112c3a7c99040579c92f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Mon, 22 Jul 2024 22:08:22 +0800 Subject: [PATCH 03/16] =?UTF-8?q?mmoe=20=E6=A8=A1=E5=9E=8B=E6=A1=86?= =?UTF-8?q?=E6=9E=B6=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 136 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 examples/mmoe/model.py diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py new file mode 100644 index 00000000..0046d2fd --- /dev/null +++ b/examples/mmoe/model.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import time +from easydict import EasyDict as edict + +import tensorflow as tf + + +model_cfg = edict() +model_cfg.loss_mode = "batch" +LOSS_OP_NAME = "loss" +LABEL_OP_NAME = "label" +VAR_LIST = "variable" +PRED_OP_NAME = "pred" + + +class MyModel: + def __init__(self, expert_num =8, expert_size=16, tower_size=8, gate_num = 2): + + self.expert_num = expert_num + self.expert_size = expert_size + self.tower_size = tower_size + self.gate_num = gate_num + + + def expert_layer(self, input): + param_expert = [] + for i in range(0, self.expert_num): + expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name = f'expert_payer_{i}', + kernel_initializer = tf.constant_initializer(value=0.1), + bias_initializer = tf.constant_initializer(values = 0.1)) + + param_expert.append(expert_linear) + return param_expert + + + def gate_layer(self, input): + param_gate = [] + for i in range(0, self.gate_num): + gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name = f'gate_payer_{i}', + kernel_initializer = tf.constant_initializer(value=0.1), + bias_initializer = tf.constant_initializer(values = 0.1)) + + param_gate.append(gate_linear) + return param_gate + + + def tower_layer(self, input, layer_name): + tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name = f'tower_payer_{layer_name}', + kernel_initializer = tf.constant_initializer(value=0.1), + bias_initializer = tf.constant_initializer(values = 0.1)) + + tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name = f'tower_payer_out_{layer_name}', + kernel_initializer = tf.constant_initializer(value=0.1), + bias_initializer = tf.constant_initializer(values = 0.1)) + + return tower_linear_out + + + + + def build_model(self, + embedding=None, + dense_feature=None, + label=None, + is_training=True, + seed=None): + + with 
tf.variable_scope("mmoe", reuse=tf.AUTO_REUSE): + + dense_expert = self.expert_layer(dense_feature) + dense_gate = self.gate_layer(dense_feature) + + all_expert = [] + _slice_num = 0 + for i in range(0, self.expert_num): + slice_num_end = _slice_num + self.expert_size + cur_expert = tf.add(dense_expert[i], embedding[:, _slice_num:slice_num_end]) + cur_expert = tf.nn.relu(cur_expert) + all_expert.append(cur_expert) + _slice_num = slice_num_end + + expert_concat = tf.concat(all_expert, axis=1) + expert_concat = tf.reshape(expert_concat, [-1, self.expert_num, self.expert_size]) + + output_layers = [] + out_pred = [] + for i in range(0, self.gate_num): + slice_gate_end = _slice_num + self.expert_num + cur_gate = tf.add(dense_gate[i], embedding[:, _slice_num:slice_gate_end]) + cur_gate = tf.nn.softmax(cur_gate) + + cur_gate = tf.reshape(cur_gate, [-1, self.expert_num, 1]) + + cur_gate_expert = tf.multiply(x=expert_concat, y=cur_gate) + cur_gate_expert = tf.reduce_sum(cur_gate_expert, axis=1) + out = self.tower_layer(cur_gate_expert, i) + output_layers.append(out) + out_pred.append(tf.nn.softmax(out[:, 1])) + _slice_num = slice_num_end + trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mmoe') + + label_income = label[:, 0:1] + label_mat = label[:, 1:] + + pred_income_1 = tf.slice(output_layers[0], [0, 1], [-1, 1]) + pred_marital_1 = tf.slice(output_layers[1], [0, 1], [-1, 1]) + + cost_income = tf.losses.log_loss(labels=tf.cast(label_income, tf.float32), predictions=pred_income_1, + epsilon=1e-4) + cost_marital = tf.losses.log_loss(labels=tf.cast(label_mat, tf.float32), predictions=pred_marital_1, + epsilon=1e-4) + + avg_cost_income = tf.reduce_mean(cost_income) + avg_cost_marital = tf.reduce_mean(cost_marital) + + loss = 0.5 * (avg_cost_income + avg_cost_marital) + + return {LOSS_OP_NAME: loss, + PRED_OP_NAME: out_pred, + LABEL_OP_NAME: label, + VAR_LIST: trainable_variables} -- Gitee From f17973de35900ab90455e1933717c21161fe2a62 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Mon, 22 Jul 2024 22:39:52 +0800 Subject: [PATCH 04/16] cleancode --- examples/mmoe/config.py | 197 ++++++++++++++++++++ examples/mmoe/criteo.py | 273 ++++++++++++++++++++++++++++ examples/mmoe/delay_loss_scale.py | 64 +++++++ examples/mmoe/gradient_descent_w.py | 71 ++++++++ examples/mmoe/mean_auc.py | 40 ++++ examples/mmoe/model.py | 27 +-- examples/mmoe/op_impl_mode.ini | 1 + examples/mmoe/optimizer.py | 35 ++++ 8 files changed, 695 insertions(+), 13 deletions(-) create mode 100644 examples/mmoe/config.py create mode 100644 examples/mmoe/criteo.py create mode 100644 examples/mmoe/delay_loss_scale.py create mode 100644 examples/mmoe/gradient_descent_w.py create mode 100644 examples/mmoe/mean_auc.py create mode 100644 examples/mmoe/op_impl_mode.ini create mode 100644 examples/mmoe/optimizer.py diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py new file mode 100644 index 00000000..d5540908 --- /dev/null +++ b/examples/mmoe/config.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import enum +import os + +import tensorflow as tf +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig +from npu_bridge.estimator.npu.npu_config import NPURunConfig + +from mx_rec.constants.constants import CacheModeEnum + +SSD_DATA_PATH = ["ssd_data"] + + +class LearningRateScheduler: + """ + LR Scheduler combining Polynomial Decay with Warmup at the beginning. + TF-based cond operations necessary for performance in graph mode. + """ + + def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps): + self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32) + self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32) + self.decay_steps = tf.constant(decay_steps) + self.decay_end_step = decay_start_step + decay_steps # 65041 + self.poly_power = 2.0 + self.base_lr_dense = base_lr_dense + self.base_lr_sparse = base_lr_sparse + + def calc(self, global_step): + # used for the warmup stage + warmup_step = tf.cast(1 / self.warmup_steps, tf.float32) + lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step + lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32) + # used for the constant stage + lr_factor_constant = tf.cast(1.0, tf.float32) + + lr_sparse = self.base_lr_sparse * lr_factor_constant + lr_dense = self.base_lr_dense * lr_factor_constant + return lr_dense, lr_sparse + + +class Config: + def __init__(self, ): + self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None + tmp = os.getenv("TRAIN_RANK_SIZE") + if tmp is None: + raise ValueError("please export TRAIN_RANK_SIZE") + self.rank_size = int(tmp) + + self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH") + self.train_file_pattern = "train" + self.test_file_pattern = "test" + + self.batch_size = 4096 + self.line_per_sample = 1 + self.train_epoch = 1 + self.test_epoch = 9 + 
self.perform_shuffle = False + + self.key_type = tf.int64 + self.label_type = tf.float32 + self.value_type = tf.int64 + + self.feat_cnt = 26 + self.__set_emb_table_size() + + self.field_num = 26 + self.send_count = 46000 // self.rank_size + + self.emb_dim = 8 + self.hashtable_threshold = 1 + + self.USE_PIPELINE_TEST = False + + # 动态学习率 + GLOBAL_BATCH_SIZE = 8192 * 8 + LR_SCHEDULE_STEPS = [ + int(2750 * 55296 / GLOBAL_BATCH_SIZE), + int(49315 * 55296 / GLOBAL_BATCH_SIZE), + int(27772 * 55296 / GLOBAL_BATCH_SIZE), + ] + self.global_step = tf.Variable(0, trainable=False) + _lr_scheduler = LearningRateScheduler( + 0.001, + 0.001, + LR_SCHEDULE_STEPS[0], + LR_SCHEDULE_STEPS[1], + LR_SCHEDULE_STEPS[2], + ) + self.learning_rate = _lr_scheduler.calc(self.global_step) + + def __set_emb_table_size(self): + self.cache_mode = os.getenv("CACHE_MODE") + if self.cache_mode is None: + raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") + + if self.cache_mode == CacheModeEnum.HBM.value: + self.dev_vocab_size = 14_000_000 * self.rank_size + self.host_vocab_size = 0 + elif self.cache_mode == CacheModeEnum.DDR.value: + self.dev_vocab_size = 500_000 * self.rank_size + self.host_vocab_size = 24_000_000 * self.rank_size + elif self.cache_mode == CacheModeEnum.SSD.value: + self.dev_vocab_size = 100_000 * self.rank_size + self.host_vocab_size = 2_000_000 * self.rank_size + self.ssd_vocab_size = 24_000_000 * self.rank_size + else: + raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") + + def get_emb_table_cfg(self): + if self.cache_mode == CacheModeEnum.HBM.value: + return {"device_vocabulary_size": self.dev_vocab_size} + elif self.cache_mode == CacheModeEnum.DDR.value: + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size} + elif self.cache_mode == CacheModeEnum.SSD.value: + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": 
self.host_vocab_size, + "ssd_vocabulary_size": self.ssd_vocab_size, + "ssd_data_path": SSD_DATA_PATH} + else: + raise RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation") + + +def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["mix_compile_mode"].b = False + custom_op.parameter_map["use_off_line"].b = True + custom_op.parameter_map["min_group_size"].b = 1 + # 可选配置level0:pairwise;level1:pairwise + custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:fullmesh") + custom_op.parameter_map["enable_data_pre_proc"].b = True + custom_op.parameter_map["iterations_per_loop"].i = 10 + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["hcom_parallel"].b = False + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") + custom_op.parameter_map["op_execute_timeout"].i = 2000 + custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes( + str(13 * 1024 * 1024 * 1024)) # total 31 need 13; + custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(18 * 1024 * 1024 * 1024)) # need 25 + custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes("DNN_VM_AICPU:3,AIcoreEngine:3") + + if dump_data: + custom_op.parameter_map["enable_dump"].b = True + custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(dump_path) + custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes(dump_steps) + custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all") + + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + 
session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + return session_config + + +def get_npu_run_config(): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + run_config = NPURunConfig( + save_summary_steps=1000, + save_checkpoints_steps=100, + keep_checkpoint_max=5, + session_config=session_config, + log_step_count_steps=20, + precision_mode='allow_mix_precision', + enable_data_pre_proc=True, + iterations_per_loop=1, + jit_compile=False, + op_compiler_cache_mode="enable", + HCCL_algorithm="level0:fullmesh;level1:fullmesh" # 可选配置:level0:pairwise;level1:pairwise + ) + return run_config diff --git a/examples/mmoe/criteo.py b/examples/mmoe/criteo.py new file mode 100644 index 00000000..25f1d869 --- /dev/null +++ b/examples/mmoe/criteo.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import stat +import pickle +import argparse +import pandas as pd +import numpy as np +import tensorflow as tf +from tqdm import tqdm + +NAMES = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)] + + +def make_sub_file(lines, head, src_name, sub_dir_name, sub): + """Write sub-data. + + Args: + :param lines: A list. Several pieces of data. + :param head: A string. ['label', 'I1', 'I2', ...]. + :param src_name: A string. The name of data. + :param sub_dir_name: A string. + :param sub: A scalar(Int). Record the current number of sub file. + :return: sub + 1. + """ + root_path, file_path = os.path.split(src_name) + file_name, suffix = file_path.split('.') + split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix + split_file = os.path.join(root_path, sub_dir_name, split_file_name) + if not os.path.exists(os.path.join(root_path, sub_dir_name)): + os.mkdir(os.path.join(root_path, sub_dir_name)) + + modes = stat.S_IWUSR | stat.S_IRUSR + flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT + f = os.fdopen(os.open(split_file, flags, modes), 'w') + try: + f.writelines([head]) + f.writelines(lines) + return sub + 1 + finally: + f.close() + + +def split_byline_count(filename, count, sub_dir_name): + """Split File. + Note: You can specify how many rows of data each sub file contains. + Args: + :param filename: A string. + :param count: A scalar(int). + :param sub_dir_name: A string. 
+ :return: + """ + f = open(filename, 'r') + try: + head = f.readline() + buf = [] + sub = 1 + for line in f: + buf.append(line) + if len(buf) == count: + sub = make_sub_file(buf, head, filename, sub_dir_name, sub) + buf = [] + if len(buf) != 0: + try: + make_sub_file(buf, head, filename, sub_dir_name, sub) + except FileNotFoundError as err: + raise FileNotFoundError("please check the filename of data") from err + finally: + f.close() + + +def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000): + """Get the list of split file path. + Note: Either parent_path or dataset_path must be valid. + If exists dataset_path + "/split", parent_path = dataset_path + "/split". + Args: + :param parent_path: A string. split file's parent path. + :param dataset_path: A string. + :param sample_num: A int. The sample number of every split file. + :return: A list. [file1_path, file2_path, ...] + """ + sub_dir_name = 'split' + if parent_path is None and dataset_path is None: + raise ValueError('Please give parent path or file path.') + if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)): + parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) + elif parent_path is None or not os.path.exists(parent_path): + split_byline_count(dataset_path, sample_num, sub_dir_name) + parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) + split_file_name = os.listdir(parent_path) + split_file_name.sort() + split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt'] + return split_file_list + + +def get_fea_map(fea_map_path=None, split_file_list=None): + """Get feature map. + Note: Either parent_path or dataset_path must be valid. + If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid. 
+ If fea_map_path is None and you want to build the feature map, + the default file path is the parent directory of split file + "fea_map.pkl". + Args: + :param fea_map_path: A string. + :param split_file_list: A list. [file1_path, file2_path, ...] + :return: A dict. {'C1':{}, 'C2':{}, ...} + """ + if fea_map_path is None and split_file_list is None: + raise ValueError('Please give feature map path or split file list.') + if fea_map_path is None and split_file_list is not None: + fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") + if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl': + with open(fea_map_path, 'rb') as f: + fea_map = pickle.load(f) + return fea_map + fea_map = {} + for file_open in tqdm(split_file_list): + f = open(file_open) + for line in f: + row = line.strip('\n').split('\t') + for i in range(14, 40): + if row[i] == '': + continue + name = NAMES[i] + fea_map.setdefault(name, {}) + if fea_map[name].get(row[i]) is None: + fea_map[name][row[i]] = len(fea_map[name]) + for j in range(1, 14): + if row[j] == '': + continue + name = NAMES[j] + fea_map.setdefault(name, {}) + fea_map[name].setdefault('min', float(row[j])) + fea_map[name].setdefault('max', float(row[j])) + fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j])) + fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j])) + f.close() + for i in range(14, 40): + fea_map[NAMES[i]]['-1'] = len(fea_map[NAMES[i]]) + fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") + + + modes = stat.S_IWUSR | stat.S_IRUSR + flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT + with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd: + pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL) + + return fea_map + + +def rec_kbins_discretizer(dat, n_bins, min_max_dict): + """Bin continuous data into intervals. + Note: The strategy is "uniform". + Args: + :param dat: A dataframe. + :param n_bins: A scalar(int). 
+ :param min_max_dict: A dict such as {'min': , 'max': }. + :return: The new dataframe. + """ + features = dat.columns + n_features = len(features) + bin_edges = np.zeros(n_features, dtype=object) + for idx, feature in enumerate(features): + bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1) + rtol = 1.e-5 + atol = 1.e-8 + eps = atol + rtol * np.abs(dat[feature]) + dat[feature] = np.digitize(dat[feature] + eps, bin_edges[idx][1:]) + return dat + + +def convert_input2tfrd(in_file_path, out_file_path): + """ + txt to tfrecords + """ + def make_example(label_list, dense_feat_list, sparse_feat_list): + dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1) + sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1) + label = np.array(label_list, dtype=np.int64).reshape(-1) + feature_dict = { + "dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), + "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), + "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) + } + example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) + + return example + + file_name = out_file_path + in_file_path[-12:-4] + '.tfrecord' + file_writer = tf.io.TFRecordWriter(file_name) + + with open(in_file_path, encoding='utf-8') as file_in: + + for _, line in tqdm(enumerate(file_in)): + + line = line.strip('\n') + items = line.split('\t') + if len(items) != 40: + continue + label = int(items[0]) + dense = items[1:14] + sparse = items[14:] + + ex = make_example(label, dense, sparse) + serialized = ex.SerializeToString() + file_writer.write(serialized) + + file_writer.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Get datasets') + parser.add_argument('--data_path') + parser.add_argument('--output_path') + + args, _ = parser.parse_known_args() + data_path = args.data_path + output_path 
= args.output_path + + # get txt_list + file_split_list = get_split_file_path(dataset_path=data_path) + # get feature_map + feature_map = get_fea_map(split_file_list=file_split_list) + + for file in tqdm(file_split_list): + + # read data + data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES) + # name feature + sparse_features = ['C' + str(i) for i in range(1, 27)] + dense_features = ['I' + str(i) for i in range(1, 14)] + # data processing + data_df[sparse_features] = data_df[sparse_features].fillna('-1') + data_df[dense_features] = data_df[dense_features].fillna(0) + # sparse feature: mapping + for col in sparse_features: + try: + data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) + except KeyError as e: + raise KeyError("Feature {} not found in dataset".format(col)) from e + # dense feature: Bin continuous data into intervals. + data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) + # add offsets + slot_size_array = [ + 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, + 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, + 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573 + ] + offset_size_list = np.cumsum([0] + slot_size_array[:-1]) + for col_index in range(1, len(offset_size_list) + 1): + data_df.iloc[:, col_index] += offset_size_list[col_index - 1] + # save to txt + data_df.to_csv(file, sep='\t', index=False, header=False) + # txt to tfrecords + convert_input2tfrd(in_file_path=file, out_file_path=output_path) + + + + + diff --git a/examples/mmoe/delay_loss_scale.py b/examples/mmoe/delay_loss_scale.py new file mode 100644 index 00000000..f73baf68 --- /dev/null +++ b/examples/mmoe/delay_loss_scale.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf +from tensorflow.python.training import optimizer + +from config import Config + + +class DenseLossScaleOptimizer: + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + if not isinstance(opt, optimizer.Optimizer): + raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + self._optimizer = opt + self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + _update_lr_loss_scale(self._optimizer, loss_scale) + + def compute_gradients(self, loss, var_list=None): + return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) + + def apply_gradients(self, avg_grads): + return self._optimizer.apply_gradients(avg_grads) + + +class SparseLossScaleOptimizer: + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + if not isinstance(opt, optimizer.Optimizer): + raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + self._optimizer = opt + self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + _update_lr_loss_scale(self._optimizer, loss_scale) + + def compute_gradients(self, loss, var_list=None): + return tf.gradients(loss * self._loss_scale, var_list) + + def apply_gradients(self, grads_and_vars): + return self._optimizer.apply_gradients(grads_and_vars) + + +def _update_lr_loss_scale(opt, 
loss_scale): + if loss_scale <= 0: + raise RuntimeError("the loss_scale must be greater than zero.") + loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + if hasattr(opt, "_lr"): + # LazyAdam or Adam optimizer + opt._lr = opt._lr / loss_scale + elif hasattr(opt, "_learning_rate"): + # SGD optimizer + opt._learning_rate = opt._learning_rate / loss_scale + else: + raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") \ No newline at end of file diff --git a/examples/mmoe/gradient_descent_w.py b/examples/mmoe/gradient_descent_w.py new file mode 100644 index 00000000..53adb996 --- /dev/null +++ b/examples/mmoe/gradient_descent_w.py @@ -0,0 +1,71 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import defaultdict + +import tensorflow as tf +from tensorflow.python.ops import math_ops +from tensorflow.python.training import gradient_descent +from mx_rec.optimizers.base import CustomizedOptimizer +from mx_rec.util.log import logger +from mx_rec.util.initialize import ConfigInitializer + + +def create_hash_optimizer(learning_rate, weight_decay=0.0001, use_locking=False, name="GradientDescent"): + optimizer = CustomizedGradientDescentWithWeighDecay(learning_rate=learning_rate, + weight_decay=weight_decay, + use_locking=use_locking, + name=name) + ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer + return optimizer + + +class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOptimizer, CustomizedOptimizer): + name_counter = defaultdict(int) + + def __init__(self, learning_rate, weight_decay, use_locking=False, name="GradientDescent"): + self.optimizer_type = "gradient_descent_with_weight_decay" + self.weight_decay = weight_decay + super(CustomizedGradientDescentWithWeighDecay, self)._get_name(name=name) + super(CustomizedGradientDescentWithWeighDecay, self).__init__( + learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name + ) + self._slot_num = 0 + self._derivative = 1 + + def get_slot_init_values(self): + logger.info("no slot for gradient descent") + return [] + + def _apply_sparse_duplicate_indices(self, grad, var): + logger.debug(">>>> Enter _apply_sparse_duplicate_indices") + nd_indices = tf.expand_dims(grad.indices, 1) + logger.info(f"weigh_decay={self.weight_decay}") + if self.weight_decay is None: + nd_value = grad.values * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) + else: + nd_value = (grad.values + math_ops.cast(self.weight_decay, 
var.dtype.base_dtype) * + tf.gather(var, grad.indices)) * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) + var_update_op = tf.scatter_nd_add(var, nd_indices, -nd_value, use_locking=self._use_locking) + return var_update_op + + def _apply_dense(self, grad, var): + logger.debug(">>>> Enter _apply_dense") + raise NotImplementedError("You are using a wrong type of variable.") diff --git a/examples/mmoe/mean_auc.py b/examples/mmoe/mean_auc.py new file mode 100644 index 00000000..ff57df00 --- /dev/null +++ b/examples/mmoe/mean_auc.py @@ -0,0 +1,40 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+import os
+from glob import glob
+import numpy as np
+
+
+def split_auc(log_input, group_size=8):
+    """Average the AUC values logged in ``log_input`` over consecutive groups.
+
+    Every line containing ``Test`` is parsed as ``...: <auc>; ...`` (the text before the first
+    ``;`` and after the last ``:`` in that part). Values are grouped ``group_size`` at a time in
+    file order; a trailing incomplete group is dropped.
+
+    Args:
+        log_input: path of the log file to read.
+        group_size: number of consecutive values averaged together (default 8, the original
+            hard-coded value).
+
+    Returns:
+        1-D numpy array holding the mean of each complete group.
+    """
+    all_auc = []
+    with open(log_input, 'r') as log:
+        for line in log:
+            if 'Test' in line:
+                all_auc.append(float(line.split(';')[0].split(':')[-1].strip()))
+    usable_len = len(all_auc) - len(all_auc) % group_size
+    return np.mean(np.array(all_auc[:usable_len]).reshape(-1, group_size), axis=-1)
+
+
+log_path_all = 'latest_*.log'
+log_path_list = glob(log_path_all)
+
+for log_path in log_path_list:
+    print(os.path.basename(log_path))
+    print(split_auc(log_path))
+    print('*'*20)
\ No newline at end of file
diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index 0046d2fd..5b1917a3 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -29,7 +29,7 @@ PRED_OP_NAME = "pred"
 
 
 class MyModel:
-    def __init__(self, expert_num =8, expert_size=16, tower_size=8, gate_num = 2):
+    def __init__(self, expert_num=8, expert_size=16, tower_size=8, gate_num=2):
 
         self.expert_num = expert_num
         self.expert_size = expert_size
@@ -40,9 +40,9 @@ class MyModel:
     def expert_layer(self, input):
         param_expert = []
         for i in range(0, self.expert_num):
-            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name = f'expert_payer_{i}',
-                                            kernel_initializer = tf.constant_initializer(value=0.1),
-                                            bias_initializer = tf.constant_initializer(values = 0.1))
+            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}',
+                                            kernel_initializer=tf.constant_initializer(value=0.1),
+                                            bias_initializer=tf.constant_initializer(value=0.1))
             param_expert.append(expert_linear)
 
         return param_expert
@@ -51,22 +51,23 @@ class MyModel:
     def gate_layer(self, input):
         param_gate = []
         for i in range(0, self.gate_num):
-            gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name = f'gate_payer_{i}',
-                                          kernel_initializer = tf.constant_initializer(value=0.1),
-                                          bias_initializer = tf.constant_initializer(values = 0.1))
+            gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}',
+                                          kernel_initializer=tf.constant_initializer(value=0.1),
+                                          bias_initializer=tf.constant_initializer(value=0.1))
             param_gate.append(gate_linear)
 
         return param_gate
 
 
     def tower_layer(self, input, layer_name):
-        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name = f'tower_payer_{layer_name}',
-                                       kernel_initializer = tf.constant_initializer(value=0.1),
-                                       bias_initializer = tf.constant_initializer(values = 0.1))
+        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}',
+                                       kernel_initializer=tf.constant_initializer(value=0.1),
+                                       bias_initializer=tf.constant_initializer(value=0.1))
 
-        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name = f'tower_payer_out_{layer_name}',
-                                           kernel_initializer = tf.constant_initializer(value=0.1),
-                                           bias_initializer = tf.constant_initializer(values = 0.1))
+        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None,
+                                           name=f'tower_payer_out_{layer_name}',
+                                           kernel_initializer=tf.constant_initializer(value=0.1),
+                                           bias_initializer=tf.constant_initializer(value=0.1))
 
         return tower_linear_out
 
diff --git a/examples/mmoe/op_impl_mode.ini b/examples/mmoe/op_impl_mode.ini
new file mode 100644
index 00000000..579dea43
--- /dev/null
+++ b/examples/mmoe/op_impl_mode.ini
@@ -0,0 +1 @@
+ScatterNdAdd=support_out_of_bound_index
\ No newline at end of file
diff --git a/examples/mmoe/optimizer.py b/examples/mmoe/optimizer.py
new file mode 100644
index 00000000..2c7685bb
--- /dev/null
+++ b/examples/mmoe/optimizer.py
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import tensorflow as tf
+from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer
+from mx_rec.util.initialize import ConfigInitializer
+from mx_rec.optimizers.lazy_adam import create_hash_optimizer
+from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address
+
+
+def get_dense_and_sparse_optimizer(cfg):
+    """Create the dense and sparse optimizers, each wrapped in a loss-scale optimizer.
+
+    The dense optimizer is ``tf.train.AdamOptimizer`` with ``cfg.learning_rate[0]``. The sparse
+    optimizer uses ``cfg.learning_rate[1]`` and comes from ``create_hash_optimizer_by_address``
+    when ``use_dynamic_expansion`` is set on the ConfigInitializer, else ``create_hash_optimizer``.
+
+    Returns:
+        Tuple ``(dense_optimizer, sparse_optimizer)``.
+    """
+    dense_optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0])
+    use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion
+    if use_dynamic_expansion:
+        sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1])
+    else:
+        sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1])
+    # Both wrappers receive the same fixed loss scale of 1.
+    loss_scale = 1
+    sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale)
+    dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale)
+
+    return dense_optimizer, sparse_optimizer
-- 
Gitee


From fe7073494d499d161e16ce826175f744a17336eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 22:50:39 +0800
Subject: [PATCH 05/16] =?UTF-8?q?mmoe=20=E5=90=8A=E8=B5=B7=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/main_mxrec.py | 469 ++++++++++++++++++++++++++++++++++++
 1
file changed, 469 insertions(+)
 create mode 100644 examples/mmoe/main_mxrec.py

diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py
new file mode 100644
index 00000000..51ed7c4a
--- /dev/null
+++ b/examples/mmoe/main_mxrec.py
@@ -0,0 +1,469 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import shutil
+import time
+import warnings
+import random
+from glob import glob
+
+import tensorflow as tf
+from sklearn.metrics import roc_auc_score
+import numpy as np
+
+from optimizer import get_dense_and_sparse_optimizer
+from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum
+from model import MyModel
+from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET
+from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func
+from mx_rec.core.asc.manager import start_asc_pipeline
+from mx_rec.core.embedding import create_table, sparse_lookup
+from mx_rec.core.feature_process import EvictHook
+from mx_rec.graph.modifier import modify_graph_and_start_emb_cache, GraphModifierHook
+from mx_rec.constants.constants import ASCEND_TIMESTAMP
+from mx_rec.util.initialize import ConfigInitializer, init, terminate_config_initializer
+from mx_rec.util.ops import import_host_pipeline_ops
+import mx_rec.util as mxrec_util
+from mx_rec.util.variable import get_dense_and_sparse_variable
+from mx_rec.util.log import logger
+from npu_bridge.npu_init import *
+
+npu_plugin.set_device_sat_mode(0)
+
+dense_hashtable_seed = 128
+sparse_hashtable_seed = 128
+shuffle_seed = 128
+random.seed(shuffle_seed)
+
+
+def add_timestamp_func(batch):
+    """Add a ``"timestamp"`` entry to ``batch`` using the host pipeline ``return_timestamp`` op."""
+    timestamp = import_host_pipeline_ops().return_timestamp(tf.cast(batch['label'], dtype=tf.int64))
+    # tf.constant(np.random.randint(1,1688109060,1)), tf.int64))
+    batch["timestamp"] = timestamp
+    return batch
+
+
+def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False):
+    """Build the TFRecord input pipeline and return ``(batch, iterator)``.
+
+    Files are globbed from ``config.data_path`` under the train or test pattern, sharded by rank,
+    shuffled (training only), repeated, parsed, batched with ``drop_remainder=True`` and reshaped.
+    When ``is_use_faae`` is true a timestamp is added; when the module-level ``MODIFY_GRAPH_FLAG``
+    (set in the ``__main__`` block) is false, the asc insert function is mapped over the dataset.
+    """
+    if config.USE_PIPELINE_TEST:
+        num_parallel = 1
+    else:
+        num_parallel = 8
+
+    def extract_fn(data_record):
+        features = {
+            # Extract features using the keys set during creation
+            'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64),
+            'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64),
+            'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32),
+        }
+        sample = tf.compat.v1.parse_single_example(data_record, features)
+        return sample
+
+    def reshape_fn(batch):
+        batch['label'] = tf.reshape(batch['label'], [-1, 1])
+        batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13])
+        batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0)
+        batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26])
+        return batch
+
+    if is_training:
+        files_list = glob(os.path.join(config.data_path, config.train_file_pattern) + '/*.tfrecord')
+    else:
+        files_list = glob(os.path.join(config.data_path, config.test_file_pattern) + '/*.tfrecord')
+    dataset = tf.data.TFRecordDataset(files_list, num_parallel_reads=num_parallel)
+    batch_size = config.batch_size // config.line_per_sample
+
+    dataset = dataset.shard(config.rank_size, config.rank_id)
+    if is_training:
+        dataset = dataset.shuffle(batch_size * 1000, seed=shuffle_seed)
+    if is_training:
+        dataset = dataset.repeat(config.train_epoch)
+    else:
+        dataset = dataset.repeat(config.test_epoch)
+    dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size,
+                                                                             drop_remainder=True)
+    dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel)
+    if is_use_faae:
+        dataset = dataset.map(add_timestamp_func)
+
+    if not MODIFY_GRAPH_FLAG:
+        insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, is_training=is_training, dump_graph=dump_graph)
+        dataset = dataset.map(insert_fn)
+
+    dataset = dataset.prefetch(100)
+
+    iterator = dataset.make_initializable_iterator()
+    batch = iterator.get_next()
+    return batch, iterator
+
+
+def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph):
+    """Look up one embedding per (feature, table) pair, sum them, and build ``MyModel`` on top.
+
+    Reads the module-level ``cfg`` and ``MODIFY_GRAPH_FLAG`` set in the ``__main__`` block.
+    Returns whatever ``MyModel.build_model`` returns.
+
+    Raises:
+        ValueError: if no embedding was produced.
+    """
+    embedding_list = []
+    logger.debug(f"In model_forward function, is_train: {is_train}, feature_list: {len(feature_list)}, "
+                 f"hash_table_list: {len(hash_table_list)}")
+    for feature, hash_table in zip(feature_list, hash_table_list):
+        if MODIFY_GRAPH_FLAG:
+            feature = batch["sparse_feature"]
+        embedding = sparse_lookup(hash_table, feature, cfg.send_count, dim=None, is_train=is_train,
+                                  name="user_embedding_lookup", modify_graph=modify_graph, batch=batch,
+                                  access_and_evict_config=None)
+        embedding_list.append(embedding)
+
+    if len(embedding_list) == 1:
+        emb = embedding_list[0]
+    elif len(embedding_list) > 1:
+        emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False)
+    else:
+        raise ValueError("the length of embedding_list must be greater than or equal to 1.")
+    my_model = MyModel()
+    model_output = my_model.build_model(embedding=emb,
+                                        dense_feature=batch["dense_feature"],
+                                        label=batch["label"],
+                                        is_training=is_train,
+                                        seed=dense_hashtable_seed)
+    return model_output
+
+
+def evaluate():
+    """Run up to ``eval_steps`` evaluation steps and return ``(auc, mean_log_loss)``.
+
+    Uses the module-level ``sess``, ``eval_model``, ``eval_iterator``, ``eval_steps``, ``rank_size``
+    and ``cfg`` created in the ``__main__`` block. Stops early on ``OutOfRangeError``.
+    """
+    print("read_test dataset")
+    if not MODIFY_GRAPH_FLAG:
+        eval_label = eval_model.get("label")
+        sess.run([eval_iterator.initializer])
+    else:
+        # In sess.run mode, running the label taken from the original batch causes a getnext
+        # timeout error, so the batch from the new dataset must be used instead.
+        eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label")
+        sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)])
+    log_loss_list = []
+    pred_list = []
+    label_list = []
+    eval_current_steps = 0
+    finished = False
+    print("eval begin")
+
+    while not finished:
+        try:
+            eval_current_steps += 1
+            eval_start = time.time()
+            eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label])
+            eval_cost = time.time() - eval_start
+            qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size
+            log_loss_list += list(eval_loss.reshape(-1))
+            pred_list += list(pred.reshape(-1))
+            label_list += list(label.reshape(-1))
+            print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}")
+            if eval_current_steps == eval_steps:
+                finished = True
+        except tf.errors.OutOfRangeError:
+            finished = True
+    auc = roc_auc_score(label_list, pred_list)
+    mean_log_loss = np.mean(log_loss_list)
+    return auc, mean_log_loss
+
+
+def evaluate_fix(step):
+    """Evaluate like ``evaluate`` and also dump this rank's labels and predictions to ``.npy`` files.
+
+    Files go under ``./interval_{interval}/numpy_{step}/``. After saving, a ``flag_{rank_id}.txt``
+    file is created and the function polls once per second until a flag file exists for every rank.
+    Returns ``(auc, mean_log_loss)`` for this rank's data.
+    """
+    print("read_test dataset evaluate_fix")
+    if not MODIFY_GRAPH_FLAG:
+        sess.run([eval_iterator.initializer])
+    else:
+        sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)])
+    log_loss_list = []
+    pred_list = []
+    label_list = []
+    eval_current_steps = 0
+    finished = False
+    print("eval begin")
+    while not finished:
+        try:
+            eval_current_steps += 1
+            eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_model.get("label")])
+            log_loss_list += list(eval_loss.reshape(-1))
+            pred_list += list(pred.reshape(-1))
+            label_list += list(label.reshape(-1))
+            print(f"eval current_steps: {eval_current_steps}")
+
+            if eval_current_steps == eval_steps:
+                finished = True
+        except tf.errors.OutOfRangeError:
+            finished = True
+
+    label_numpy = np.array(label_list)
+    pred_numpy = np.array(pred_list)
+    if not os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}"):
+        os.makedirs(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}")
+
+    if os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy"):
+        os.remove(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy")
+    if os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy"):
+        os.remove(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy")
+    if os.path.exists(f"flag_{rank_id}.txt"):
+        os.remove(f"flag_{rank_id}.txt")
+    np.save(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy", label_numpy)
+    np.save(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy", pred_numpy)
+    os.mknod(f"flag_{rank_id}.txt")
+    # NOTE(review): only this rank's own flag is removed above, so flag files left by other ranks
+    # from an earlier call would satisfy this wait immediately -- confirm that is acceptable.
+    while True:
+        file_exists_list = [os.path.exists(f"flag_{i}.txt") for i in range(rank_size)]
+        if sum(file_exists_list) == rank_size:
+            print("All saved!!!!!!!!!!")
+            break
+        else:
+            print("Waitting for saving numpy!!!!!!!!")
+            time.sleep(1)
+            continue
+
+    auc = roc_auc_score(label_list, pred_list)
+    mean_log_loss = np.mean(log_loss_list)
+    return auc, mean_log_loss
+
+
+def create_feature_spec_list(use_timestamp=False):
+    """Return the FeatureSpec list for the ``sparse_embeddings`` table.
+
+    With ``use_timestamp`` the access/eviction thresholds are set to 1000/180 and a timestamp
+    spec is appended; with the module-level ``use_multi_lookup`` a second identical sparse spec
+    is appended.
+    """
+    access_threshold = None
+    eviction_threshold = None
+    if use_timestamp:
+        access_threshold = 1000
+        eviction_threshold = 180
+
+    feature_spec_list = [FeatureSpec("sparse_feature", table_name="sparse_embeddings", batch_size=cfg.batch_size,
+                                     access_threshold=access_threshold, eviction_threshold=eviction_threshold)]
+    if use_multi_lookup:
+        feature_spec_list.append(FeatureSpec("sparse_feature", table_name="sparse_embeddings",
+                                             batch_size=cfg.batch_size,
+                                             access_threshold=access_threshold,
+                                             eviction_threshold=eviction_threshold))
+    if use_timestamp:
+        feature_spec_list.append(FeatureSpec("timestamp", is_timestamp=True))
+    return feature_spec_list
+
+
+def _del_related_dir(del_path: str) -> None:
+    """Remove every directory matching the glob ``del_path`` (relative paths resolve against cwd)."""
+    if not os.path.isabs(del_path):
+        del_path = os.path.join(os.getcwd(), del_path)
+    dirs = glob(del_path)
+    for sub_dir in dirs:
+        shutil.rmtree(sub_dir, ignore_errors=True)
+        logger.info(f"Delete dir:{sub_dir}")
+
+
+def _clear_saved_model() -> None:
+    """Delete logs, kernels, model dirs and op cache from earlier runs; in SSD cache mode also
+    recreate each ``SSD_DATA_PATH`` directory empty."""
+    _del_related_dir("/root/ascend/log/*")
+    _del_related_dir("kernel*")
+    _del_related_dir("model_dir_rank*")
+    _del_related_dir("op_cache")
+
+    if os.getenv("CACHE_MODE", "") != CacheModeEnum.SSD.value:
+        return
+    logger.info("Current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory"
+                " then create empty directory for this use case.")
+    for sub_path in SSD_DATA_PATH:
+        _del_related_dir(sub_path)
+        os.makedirs(sub_path, mode=0o550, exist_ok=True)
+        logger.info(f"Create dir:{sub_path}")
+
+
+if __name__ == "__main__":
+    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+    warnings.filterwarnings("ignore")
+    _clear_saved_model()
+
+    # rank_id and rank_size read here are overwritten further down with the values from hccl_ops.
+    rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None
+    rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None
+    interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None
+    train_steps = 10000
+    eval_steps = 1360
+
+    try:
+        use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0)))
+        use_multi_lookup = bool(int(os.getenv("USE_MULTI_LOOKUP", 0)))
+        MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0)))
+        use_faae = bool(int(os.getenv("USE_FAAE", 0)))
+    except ValueError as err:
+        raise ValueError("please correctly config USE_DYNAMIC_EXPANSION or USE_MULTI_LOOKUP or USE_FAAE "
+                         "or USE_MODIFY_GRAPH only 0 or 1 is supported.") from err
+
+    use_dynamic = bool(int(os.getenv("USE_DYNAMIC", 0)))
+    logger.info(f"USE_DYNAMIC:{use_dynamic}")
+    init(train_steps=train_steps, eval_steps=eval_steps,
+         use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion)
+    IF_LOAD = False
+    rank_id = mxrec_util.communication.hccl_ops.get_rank_id()
+    filelist = glob("./saved-model/sparse-model-0")
+    if filelist:
+        IF_LOAD = True
+    ConfigInitializer.get_instance().if_load = IF_LOAD
+
+    cfg = Config()
+    feature_spec_list_train = None
+    feature_spec_list_eval = None
+    if use_faae:
+        feature_spec_list_train = create_feature_spec_list(use_timestamp=True)
+        feature_spec_list_eval = create_feature_spec_list(use_timestamp=True)
+    else:
+        feature_spec_list_train = create_feature_spec_list(use_timestamp=False)
+        feature_spec_list_eval = create_feature_spec_list(use_timestamp=False)
+
+    train_batch, train_iterator = make_batch_and_iterator(cfg, feature_spec_list_train, is_training=True,
+                                                          dump_graph=True, is_use_faae=use_faae)
+    eval_batch, eval_iterator = make_batch_and_iterator(cfg, feature_spec_list_eval, is_training=False,
+                                                        dump_graph=False, is_use_faae=use_faae)
+    logger.info(f"train_batch: {train_batch}")
+
+    if use_faae:
+        cfg.dev_vocab_size = cfg.dev_vocab_size // 2
+
+    optimizer_list = [get_dense_and_sparse_optimizer(cfg)]
+
+    # note: variance_scaling_initializer only support HBM mode
+    emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \
+        if cfg.cache_mode != "HBM" or use_dynamic_expansion else \
+        tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed)
+    sparse_hashtable = create_table(
+        key_dtype=cfg.key_type,
+        dim=tf.TensorShape([cfg.emb_dim]),
+        name="sparse_embeddings",
+        emb_initializer=emb_initializer,
+        **cfg.get_emb_table_cfg()
+    )
+    if use_faae:
+        tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, train_batch["timestamp"])
+
+    sparse_hashtable_list = [sparse_hashtable, sparse_hashtable] if use_multi_lookup else [sparse_hashtable]
+    train_model = model_forward(feature_spec_list_train, sparse_hashtable_list, train_batch,
+                                is_train=True, modify_graph=MODIFY_GRAPH_FLAG)
+    eval_model = model_forward(feature_spec_list_eval, sparse_hashtable_list, eval_batch,
+                               is_train=False, modify_graph=MODIFY_GRAPH_FLAG)
+
+    dense_variables, sparse_variables = get_dense_and_sparse_variable()
+    trainable_varibles = []
+    trainable_varibles.extend(dense_variables)
+    if use_dynamic_expansion:
+        trainable_varibles.append(tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)[0])
+    else:
+        trainable_varibles.extend(sparse_variables)
+    rank_size = mxrec_util.communication.hccl_ops.get_rank_size()
+    train_ops = []
+    # multi task training
+    for loss, (dense_optimizer, sparse_optimizer) in zip([train_model.get("loss")], optimizer_list):
+        # do dense optimization
+        grads = dense_optimizer.compute_gradients(loss, var_list=trainable_varibles)
+        avg_grads = []
+        # The last entry of grads is handled separately below as the sparse gradient.
+        for grad, var in grads[:-1]:
+            if rank_size > 1:
+                grad = hccl_ops.allreduce(grad, "sum") if grad is not None else None
+            if grad is not None:
+                # NOTE(review): the divisor is the literal 8.0, not rank_size -- presumably this
+                # assumes an 8-rank run; confirm before using another rank count.
+                avg_grads.append((grad / 8.0, var))
+        # apply gradients: update variables
+        train_ops.append(dense_optimizer.apply_gradients(avg_grads))
+
+        if use_dynamic_expansion:
+            train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET)
+            # do sparse optimization by addr
+            sparse_grads = list(grads[-1])  # local_embedding
+            grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)]
+            train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
+        else:
+            # do sparse optimization
+            sparse_grads = list(grads[-1])
+            print("sparse_grads_tensor:", sparse_grads)
+            grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)]
+            train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
+
+    # Dynamic learning-rate update
+    train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]])
+
+    with tf.control_dependencies(train_ops):
+        train_ops = tf.no_op()
+    cfg.learning_rate = [cfg.learning_rate[0], cfg.learning_rate[1]]
+
+    saver = tf.train.Saver()
+    if MODIFY_GRAPH_FLAG:
+        modify_graph_and_start_emb_cache(dump_graph=True)
+    else:
+        start_asc_pipeline()
+
+    hook_list = []
+    if use_faae:
+        hook_evict = EvictHook(evict_enable=True, evict_time_interval=120)
+        hook_list.append(hook_evict)
+    if MODIFY_GRAPH_FLAG:  # in this scenario a hook is added to handle the validation issue
+        hook_list.append(GraphModifierHook(modify_graph=False))
+
+    # with tf.compat.v1.Session(config=sess_config(dump_data=False)) as sess:
+    if use_faae:
+        sess = tf.compat.v1.train.MonitoredTrainingSession(
+            hooks=hook_list,
+            config=sess_config(dump_data=False)
+        )
+        sess.graph._unsafe_unfinalize()
+        if not MODIFY_GRAPH_FLAG:
+            sess.run(train_iterator.initializer)
+        else:
+            sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True))
+    else:
+        sess = tf.compat.v1.Session(config=sess_config(dump_data=False))
+        sess.run(tf.compat.v1.global_variables_initializer())
+        if not MODIFY_GRAPH_FLAG:
+            sess.run(train_iterator.initializer)
+        else:
+            sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True))
+
+    epoch = 0
+    cost_sum = 0
+    qps_sum = 0
+    best_auc = 0
+    iteration_per_loop = 10
+
+    # Pass the named constant so the step/QPS accounting below cannot drift from the loop size.
+    train_ops = util.set_iteration_per_loop(sess, train_ops, iteration_per_loop)
+
+    # for i in range(1, TRAIN_STEPS):
+    i = 0
+    while True:
+        i += 1
+        logger.info(f"################ training at step {i * iteration_per_loop} ################")
+        start_time = time.time()
+
+        try:
+            grad, loss = sess.run([train_ops, train_model.get("loss")])
+            lr = sess.run(cfg.learning_rate)
+            global_step = sess.run(cfg.global_step)
+        except tf.errors.OutOfRangeError:
+            logger.info("Encounter the end of Sequence for training.")
+            break
+
+        end_time = time.time()
+        cost_time = end_time - start_time
+        qps = (1 / cost_time) * rank_size * cfg.batch_size * iteration_per_loop
+        cost_sum += cost_time
+        logger.info(f"step: {i * iteration_per_loop}; training loss: {loss}")
+        logger.info(f"step: {i * iteration_per_loop}; grad: {grad}")
+        logger.info(f"step: {i * iteration_per_loop}; lr: {lr}")
+        logger.info(f"global step: {global_step}")
+        logger.info(f"step: {i * iteration_per_loop}; current sess cost time: {cost_time:.10f}; current QPS: {qps}")
+        logger.info(f"training at step:{i * iteration_per_loop}, table[{sparse_hashtable.table_name}], "
+                    f"table size:{sparse_hashtable.size()}, table capacity:{sparse_hashtable.capacity()}")
+
+        if i % (train_steps // iteration_per_loop) == 0:
+            if interval is not None:
+                test_auc, test_mean_log_loss = evaluate_fix(i * iteration_per_loop)
+            else:
+                test_auc, test_mean_log_loss = evaluate()
+            print("Test auc: {}; log_loss: {} ".format(test_auc, test_mean_log_loss))
+            best_auc = max(best_auc, test_auc)
+            logger.info(f"training step: {i * iteration_per_loop}, best auc: {best_auc}")
+
+    sess.close()
+
+    terminate_config_initializer()
+    logger.info("Demo done!")
-- 
Gitee


From 769164b3b7aff7766e4ffbec81e4766b13d75032 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 23:38:14 +0800
Subject: [PATCH 06/16] =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9=EF=BC=8C=E5=85=A5=E5=8F=A3=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/config.py     |  31 ++--
 examples/mmoe/criteo.py     | 273 ------------------------------------
 examples/mmoe/main_mxrec.py |  59 ++++----
 3 files changed, 51 insertions(+), 312 deletions(-)
 delete mode 100644 examples/mmoe/criteo.py

diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py
index d5540908..b87bc11b 100644
--- a/examples/mmoe/config.py
+++ b/examples/mmoe/config.py
@@ -42,10 +42,6 @@ class LearningRateScheduler:
         self.base_lr_sparse = base_lr_sparse
 
     def calc(self, global_step):
-        # used for the warmup stage
-        warmup_step = tf.cast(1 / self.warmup_steps, tf.float32)
-        lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step
-        lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32)
         # used for the constant stage
         lr_factor_constant = tf.cast(1.0, tf.float32)
 
@@ -66,10 +62,15 @@ class Config:
         self.train_file_pattern = "train"
self.test_file_pattern = "test" - self.batch_size = 4096 + self.batch_size = 32 self.line_per_sample = 1 - self.train_epoch = 1 - self.test_epoch = 9 + self.train_epoch = 100 + self.test_epoch = 100 + self.expert_num = 8 + self.gate_num = 2 + self.expert_size = 16 + self.tower_size = 8 + self.perform_shuffle = False self.key_type = tf.int64 @@ -82,7 +83,7 @@ class Config: self.field_num = 26 self.send_count = 46000 // self.rank_size - self.emb_dim = 8 + self.emb_dim = self.expert_num * self.expert_size + self.gate_num * self.expert_num self.hashtable_threshold = 1 self.USE_PIPELINE_TEST = False @@ -102,7 +103,7 @@ class Config: LR_SCHEDULE_STEPS[1], LR_SCHEDULE_STEPS[2], ) - self.learning_rate = _lr_scheduler.calc(self.global_step) + self.learning_rate = _lr_scheduler.calc() def __set_emb_table_size(self): self.cache_mode = os.getenv("CACHE_MODE") @@ -110,15 +111,15 @@ class Config: raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") if self.cache_mode == CacheModeEnum.HBM.value: - self.dev_vocab_size = 14_000_000 * self.rank_size + self.dev_vocab_size = 1000 * self.rank_size self.host_vocab_size = 0 elif self.cache_mode == CacheModeEnum.DDR.value: - self.dev_vocab_size = 500_000 * self.rank_size - self.host_vocab_size = 24_000_000 * self.rank_size + self.dev_vocab_size = 1000 * self.rank_size + self.host_vocab_size = 1000 * self.rank_size elif self.cache_mode == CacheModeEnum.SSD.value: - self.dev_vocab_size = 100_000 * self.rank_size - self.host_vocab_size = 2_000_000 * self.rank_size - self.ssd_vocab_size = 24_000_000 * self.rank_size + self.dev_vocab_size = 1000 * self.rank_size + self.host_vocab_size = 1000 * self.rank_size + self.ssd_vocab_size = 1000 * self.rank_size else: raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") diff --git a/examples/mmoe/criteo.py b/examples/mmoe/criteo.py deleted file mode 100644 index 25f1d869..00000000 --- a/examples/mmoe/criteo.py +++ /dev/null @@ -1,273 
+0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import os -import stat -import pickle -import argparse -import pandas as pd -import numpy as np -import tensorflow as tf -from tqdm import tqdm - -NAMES = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)] - - -def make_sub_file(lines, head, src_name, sub_dir_name, sub): - """Write sub-data. - - Args: - :param lines: A list. Several pieces of data. - :param head: A string. ['label', 'I1', 'I2', ...]. - :param src_name: A string. The name of data. - :param sub_dir_name: A string. - :param sub: A scalar(Int). Record the current number of sub file. - :return: sub + 1. - """ - root_path, file_path = os.path.split(src_name) - file_name, suffix = file_path.split('.') - split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix - split_file = os.path.join(root_path, sub_dir_name, split_file_name) - if not os.path.exists(os.path.join(root_path, sub_dir_name)): - os.mkdir(os.path.join(root_path, sub_dir_name)) - - modes = stat.S_IWUSR | stat.S_IRUSR - flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT - f = os.fdopen(os.open(split_file, flags, modes), 'w') - try: - f.writelines([head]) - f.writelines(lines) - return sub + 1 - finally: - f.close() - - -def split_byline_count(filename, count, sub_dir_name): - """Split File. 
- Note: You can specify how many rows of data each sub file contains. - Args: - :param filename: A string. - :param count: A scalar(int). - :param sub_dir_name: A string. - :return: - """ - f = open(filename, 'r') - try: - head = f.readline() - buf = [] - sub = 1 - for line in f: - buf.append(line) - if len(buf) == count: - sub = make_sub_file(buf, head, filename, sub_dir_name, sub) - buf = [] - if len(buf) != 0: - try: - make_sub_file(buf, head, filename, sub_dir_name, sub) - except FileNotFoundError as err: - raise FileNotFoundError("please check the filename of data") from err - finally: - f.close() - - -def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000): - """Get the list of split file path. - Note: Either parent_path or dataset_path must be valid. - If exists dataset_path + "/split", parent_path = dataset_path + "/split". - Args: - :param parent_path: A string. split file's parent path. - :param dataset_path: A string. - :param sample_num: A int. The sample number of every split file. - :return: A list. [file1_path, file2_path, ...] - """ - sub_dir_name = 'split' - if parent_path is None and dataset_path is None: - raise ValueError('Please give parent path or file path.') - if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)): - parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) - elif parent_path is None or not os.path.exists(parent_path): - split_byline_count(dataset_path, sample_num, sub_dir_name) - parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) - split_file_name = os.listdir(parent_path) - split_file_name.sort() - split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt'] - return split_file_list - - -def get_fea_map(fea_map_path=None, split_file_list=None): - """Get feature map. - Note: Either parent_path or dataset_path must be valid. 
- If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid. - If fea_map_path is None and you want to build the feature map, - the default file path is the parent directory of split file + "fea_map.pkl". - Args: - :param fea_map_path: A string. - :param split_file_list: A list. [file1_path, file2_path, ...] - :return: A dict. {'C1':{}, 'C2':{}, ...} - """ - if fea_map_path is None and split_file_list is None: - raise ValueError('Please give feature map path or split file list.') - if fea_map_path is None and split_file_list is not None: - fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") - if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl': - with open(fea_map_path, 'rb') as f: - fea_map = pickle.load(f) - return fea_map - fea_map = {} - for file_open in tqdm(split_file_list): - f = open(file_open) - for line in f: - row = line.strip('\n').split('\t') - for i in range(14, 40): - if row[i] == '': - continue - name = NAMES[i] - fea_map.setdefault(name, {}) - if fea_map[name].get(row[i]) is None: - fea_map[name][row[i]] = len(fea_map[name]) - for j in range(1, 14): - if row[j] == '': - continue - name = NAMES[j] - fea_map.setdefault(name, {}) - fea_map[name].setdefault('min', float(row[j])) - fea_map[name].setdefault('max', float(row[j])) - fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j])) - fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j])) - f.close() - for i in range(14, 40): - fea_map[NAMES[i]]['-1'] = len(fea_map[NAMES[i]]) - fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") - - - modes = stat.S_IWUSR | stat.S_IRUSR - flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT - with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd: - pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL) - - return fea_map - - -def rec_kbins_discretizer(dat, n_bins, min_max_dict): - """Bin continuous data into intervals. - Note: The strategy is "uniform". 
- Args: - :param dat: A dataframe. - :param n_bins: A scalar(int). - :param min_max_dict: A dict such as {'min': , 'max': }. - :return: The new dataframe. - """ - features = dat.columns - n_features = len(features) - bin_edges = np.zeros(n_features, dtype=object) - for idx, feature in enumerate(features): - bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1) - rtol = 1.e-5 - atol = 1.e-8 - eps = atol + rtol * np.abs(dat[feature]) - dat[feature] = np.digitize(dat[feature] + eps, bin_edges[idx][1:]) - return dat - - -def convert_input2tfrd(in_file_path, out_file_path): - """ - txt to tfrecords - """ - def make_example(label_list, dense_feat_list, sparse_feat_list): - dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1) - sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1) - label = np.array(label_list, dtype=np.int64).reshape(-1) - feature_dict = { - "dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), - "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), - "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) - } - example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) - - return example - - file_name = out_file_path + in_file_path[-12:-4] + '.tfrecord' - file_writer = tf.io.TFRecordWriter(file_name) - - with open(in_file_path, encoding='utf-8') as file_in: - - for _, line in tqdm(enumerate(file_in)): - - line = line.strip('\n') - items = line.split('\t') - if len(items) != 40: - continue - label = int(items[0]) - dense = items[1:14] - sparse = items[14:] - - ex = make_example(label, dense, sparse) - serialized = ex.SerializeToString() - file_writer.write(serialized) - - file_writer.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Get datasets') - parser.add_argument('--data_path') - parser.add_argument('--output_path') - - args, _ = 
parser.parse_known_args() - data_path = args.data_path - output_path = args.output_path - - # get txt_list - file_split_list = get_split_file_path(dataset_path=data_path) - # get feature_map - feature_map = get_fea_map(split_file_list=file_split_list) - - for file in tqdm(file_split_list): - - # read data - data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES) - # name feature - sparse_features = ['C' + str(i) for i in range(1, 27)] - dense_features = ['I' + str(i) for i in range(1, 14)] - # data processing - data_df[sparse_features] = data_df[sparse_features].fillna('-1') - data_df[dense_features] = data_df[dense_features].fillna(0) - # sparse feature: mapping - for col in sparse_features: - try: - data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) - except KeyError as e: - raise KeyError("Feature {} not found in dataset".format(col)) from e - # dense feature: Bin continuous data into intervals. - data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) - # add offsets - slot_size_array = [ - 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, - 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, - 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573 - ] - offset_size_list = np.cumsum([0] + slot_size_array[:-1]) - for col_index in range(1, len(offset_size_list) + 1): - data_df.iloc[:, col_index] += offset_size_list[col_index - 1] - # save to txt - data_df.to_csv(file, sep='\t', index=False, header=False) - # txt to tfrecords - convert_input2tfrd(in_file_path=file, out_file_path=output_path) - - - - - diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py index 51ed7c4a..e236cd2f 100644 --- a/examples/mmoe/main_mxrec.py +++ b/examples/mmoe/main_mxrec.py @@ -66,18 +66,17 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, def extract_fn(data_record): features = { # Extract features 
using the keys set during creation - 'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64), - 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64), - 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32), + 'label': tf.compat.v1.FixedLenFeature(shape=(2 * config.line_per_sample,), dtype=tf.int64), + 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(29 * config.line_per_sample,), dtype=tf.int64), + 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(11 * config.line_per_sample,), dtype=tf.float32), } sample = tf.compat.v1.parse_single_example(data_record, features) return sample def reshape_fn(batch): - batch['label'] = tf.reshape(batch['label'], [-1, 1]) - batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13]) - batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0) - batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26]) + batch['label'] = tf.reshape(batch['label'], [-1, 2]) + batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 11]) + batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 29]) return batch if is_training: @@ -129,6 +128,7 @@ def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph): emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False) else: raise ValueError("the length of embedding_list must be greater than or equal to 1.") + emb = tf.reduce_sum(emb, axis=1) my_model = MyModel() model_output = my_model.build_model(embedding=emb, dense_feature=batch["dense_feature"], @@ -148,8 +148,10 @@ def evaluate(): eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label") sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) log_loss_list = [] - pred_list = [] - label_list = [] + pred_income_list = [] + pred_mat_list = [] + label_income_list = [] + 
label_mat_list = [] eval_current_steps = 0 finished = False print("eval begin") @@ -162,16 +164,21 @@ def evaluate(): eval_cost = time.time() - eval_start qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size log_loss_list += list(eval_loss.reshape(-1)) - pred_list += list(pred.reshape(-1)) - label_list += list(label.reshape(-1)) + pred_income = pred[0] + pred_mat = pred[1] + pred_income_list += list(pred_income.reshape(-1)) + pred_mat_list += list(pred_mat.reshape(-1)) + label_income_list += list(label[:, 0].reshape(-1)) + label_mat_list += list(label[:, 1].reshape(-1)) print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") if eval_current_steps == eval_steps: finished = True except tf.errors.OutOfRangeError: finished = True - auc = roc_auc_score(label_list, pred_list) + auc_income = roc_auc_score(label_income_list, pred_income_list) + auc_mat = roc_auc_score(label_mat_list, pred_mat_list) mean_log_loss = np.mean(log_loss_list) - return auc, mean_log_loss + return auc_income, auc_mat, mean_log_loss def evaluate_fix(step): @@ -281,8 +288,8 @@ if __name__ == "__main__": rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None - train_steps = 10000 - eval_steps = 1360 + train_steps = 1000 + eval_steps = 1000 try: use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))) @@ -326,9 +333,7 @@ if __name__ == "__main__": optimizer_list = [get_dense_and_sparse_optimizer(cfg)] # note: variance_scaling_initializer only support HBM mode - emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \ - if cfg.cache_mode != "HBM" or use_dynamic_expansion else \ - tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed) + emb_initializer = tf.constant_initializer(value = 0.1) 
sparse_hashtable = create_table( key_dtype=cfg.key_type, dim=tf.TensorShape([cfg.emb_dim]), @@ -422,7 +427,8 @@ if __name__ == "__main__": epoch = 0 cost_sum = 0 qps_sum = 0 - best_auc = 0 + best_income_auc = 0 + best_auc_mat = 0 iteration_per_loop = 10 train_ops = util.set_iteration_per_loop(sess, train_ops, 10) @@ -456,12 +462,17 @@ if __name__ == "__main__": if i % (train_steps // iteration_per_loop) == 0: if interval is not None: - test_auc, test_mean_log_loss = evaluate_fix(i * iteration_per_loop) + test_auc_income, test_auc_mat, test_mean_log_loss = evaluate_fix(i * iteration_per_loop) else: - test_auc, test_mean_log_loss = evaluate() - print("Test auc: {}; log_loss: {} ".format(test_auc, test_mean_log_loss)) - best_auc = max(best_auc, test_auc) - logger.info(f"training step: {i * iteration_per_loop}, best auc: {best_auc}") + test_auc_income, test_auc_mat, test_mean_log_loss = evaluate() + print("Test auc income: {};Test auc mat: {} ;log_loss: {} ".format(test_auc_income, + test_auc_mat,test_mean_log_loss)) + best_auc_income = max(best_auc_income, test_auc_income) + best_auc_mat = max(best_auc_mat, test_auc_mat) + logger.info(f"training step: {i * iteration_per_loop}, + best auc income: {best_auc_income} , + best auc mat: {best_auc_mat}") + sess.close() -- Gitee From c70d9eebb72a4f818b88d1ea19cb1ba9d172d197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 00:09:26 +0800 Subject: [PATCH 07/16] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index 5b1917a3..cf8ca108 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -42,7 +42,7 @@ class MyModel: for i in range(0, self.expert_num): expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, 
name=f'expert_payer_{i}', kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values = 0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) param_expert.append(expert_linear) return param_expert @@ -53,7 +53,7 @@ class MyModel: for i in range(0, self.gate_num): gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}', kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values = 0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) param_gate.append(gate_linear) return param_gate @@ -62,12 +62,12 @@ class MyModel: def tower_layer(self, input, layer_name): tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values = 0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name=f'tower_payer_out_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values=0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) return tower_linear_out -- Gitee From 66a629d05eafaeadb807d25b077e41cf5936f1c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 14:28:16 +0800 Subject: [PATCH 08/16] =?UTF-8?q?=E6=97=A0=E7=94=A8=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=88=A0=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/gradient_descent_w.py | 71 ----------------------------- 1 file changed, 71 deletions(-) delete mode 100644 examples/mmoe/gradient_descent_w.py diff --git a/examples/mmoe/gradient_descent_w.py b/examples/mmoe/gradient_descent_w.py deleted file mode 100644 index 53adb996..00000000 --- 
a/examples/mmoe/gradient_descent_w.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from collections import defaultdict - -import tensorflow as tf -from tensorflow.python.ops import math_ops -from tensorflow.python.training import gradient_descent -from mx_rec.optimizers.base import CustomizedOptimizer -from mx_rec.util.log import logger -from mx_rec.util.initialize import ConfigInitializer - - -def create_hash_optimizer(learning_rate, weight_decay=0.0001, use_locking=False, name="GradientDescent"): - optimizer = CustomizedGradientDescentWithWeighDecay(learning_rate=learning_rate, - weight_decay=weight_decay, - use_locking=use_locking, - name=name) - ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer - return optimizer - - -class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOptimizer, CustomizedOptimizer): - name_counter = defaultdict(int) - - def __init__(self, learning_rate, weight_decay, use_locking=False, name="GradientDescent"): - self.optimizer_type = "gradient_descent_with_weight_decay" - self.weight_decay = weight_decay - super(CustomizedGradientDescentWithWeighDecay, self)._get_name(name=name) 
- super(CustomizedGradientDescentWithWeighDecay, self).__init__( - learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name - ) - self._slot_num = 0 - self._derivative = 1 - - def get_slot_init_values(self): - logger.info("no slot for gradient descent") - return [] - - def _apply_sparse_duplicate_indices(self, grad, var): - logger.debug(">>>> Enter _apply_sparse_duplicate_indices") - nd_indices = tf.expand_dims(grad.indices, 1) - logger.info(f"weigh_decay={self.weight_decay}") - if self.weight_decay is None: - nd_value = grad.values * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) - else: - nd_value = (grad.values + math_ops.cast(self.weight_decay, var.dtype.base_dtype) * - tf.gather(var, grad.indices)) * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) - var_update_op = tf.scatter_nd_add(var, nd_indices, -nd_value, use_locking=self._use_locking) - return var_update_op - - def _apply_dense(self, grad, var): - logger.debug(">>>> Enter _apply_dense") - raise NotImplementedError("You are using a wrong type of variable.") -- Gitee From aac7a3b3f4d613aea3c303d0266987e023daf62c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:20:36 +0800 Subject: [PATCH 09/16] =?UTF-8?q?=E5=90=8A=E8=B5=B7shell=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/run.sh | 99 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 examples/mmoe/run.sh diff --git a/examples/mmoe/run.sh b/examples/mmoe/run.sh new file mode 100644 index 00000000..6c142443 --- /dev/null +++ b/examples/mmoe/run.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +cur_path=$(dirname "$(readlink -f "$0")") + +so_path=$1 +mx_rec_package_path=$2 +hccl_cfg_json=$3 +dlrm_criteo_data_path=$4 +ip=$5 # no ranktable时传入该参数 + +interface="lo" +num_server=1 +local_rank_size=8 +num_process=$((num_server * local_rank_size)) +export TRAIN_RANK_SIZE=$num_process + +################# 参数配置 ###################### +export USE_DYNAMIC=0 # 0:静态shape;1:动态shape +export CACHE_MODE="HBM" # HBM;DDR;SSD +export USE_FAAE=0 # 0:关闭准入淘汰;1:开启准入淘汰 +export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 +export USE_MULTI_LOOKUP=0 # 0:一表一查;1:一表多查 +export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 +################################################ +echo "CACHE_MODE:${CACHE_MODE}" + +export HCCL_CONNECT_TIMEOUT=1200 +export DLRM_CRITEO_DATA_PATH=${dlrm_criteo_data_path} +export PYTHONPATH=${mx_rec_package_path}:${so_path}:$PYTHONPATH +export LD_PRELOAD=/usr/lib64/libgomp.so.1 +export LD_LIBRARY_PATH=${so_path}:/usr/local/lib:$LD_LIBRARY_PATH +export ASCEND_DEVICE_ID=0 +export RANK_ID_START=0 +export JOB_ID=10086 +export CUSTOMIZED_OPS_LIB_PATH=${so_path}/libcust_ops.so # Todo: please config +export MXREC_LOG_LEVEL="INFO" +export TF_CPP_MIN_LOG_LEVEL=3 +export ASCEND_GLOBAL_LOG_LEVEL=3 +#export USE_FAAE=1 +export ENABLE_FORCE_V2_CONTROL=1 + +export PROFILING_OPTIONS='{"output":"/home/yz/profiling", + "training_trace":"on", + "task_trace":"on", + "aicpu":"on", + "fp_point":"", + "bp_point":"", + "aic_metrics":"PipeUtilization"}' + +RANK_ID_START=0 + +export MXREC_MODE="ASC" 
+echo "MXREC_MODE is $MXREC_MODE" +export py=main_mxrec.py +echo "py is $py" + +# 区分ranktable和no ranktable +if [ -n "$ip" ]; then + # no ranktable分支 + echo "Current is no ranktable solution." + echo "Input node ip: $ip, please make sure this ip is available." + export CM_CHIEF_IP=$ip # 主节点ip + export CM_CHIEF_PORT=60001 # 主节点监听端口 + export CM_CHIEF_DEVICE=0 # 主节点device id + export CM_WORKER_IP=$ip # 当前节点ip + export CM_WORKER_SIZE=$num_process # 参与集群训练的device数量 + echo "CM_CHIEF_IP=$CM_CHIEF_IP" + echo "CM_CHIEF_PORT=$CM_CHIEF_PORT" + echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" + echo "CM_WORKER_IP=$CM_WORKER_IP" + echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" +else + # ranktable分支 + echo "Current is ranktable solution, hccl json file:${hccl_cfg_json}" + export RANK_SIZE=$num_process + echo "RANK_SIZE=${RANK_SIZE}, please make sure hccl configuration json file match this parameter" + export RANK_TABLE_FILE=${hccl_cfg_json} +fi + +echo "use horovod to start tasks" +# GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO +mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' + +horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ +python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p.log -- Gitee From 2bad2444eb05428de24da9a99e9f52496fcb4c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:21:16 +0800 Subject: [PATCH 10/16] =?UTF-8?q?=E6=97=A0=E9=9C=80loss=5Fscale=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=EF=BC=8C=E5=8E=BB=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/delay_loss_scale.py | 64 ------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 examples/mmoe/delay_loss_scale.py diff --git 
a/examples/mmoe/delay_loss_scale.py b/examples/mmoe/delay_loss_scale.py deleted file mode 100644 index f73baf68..00000000 --- a/examples/mmoe/delay_loss_scale.py +++ /dev/null @@ -1,64 +0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import tensorflow as tf -from tensorflow.python.training import optimizer - -from config import Config - - -class DenseLossScaleOptimizer: - def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: - if not isinstance(opt, optimizer.Optimizer): - raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) - self._optimizer = opt - self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - _update_lr_loss_scale(self._optimizer, loss_scale) - - def compute_gradients(self, loss, var_list=None): - return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) - - def apply_gradients(self, avg_grads): - return self._optimizer.apply_gradients(avg_grads) - - -class SparseLossScaleOptimizer: - def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: - if not isinstance(opt, optimizer.Optimizer): - raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) - self._optimizer = opt - self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - 
_update_lr_loss_scale(self._optimizer, loss_scale) - - def compute_gradients(self, loss, var_list=None): - return tf.gradients(loss * self._loss_scale, var_list) - - def apply_gradients(self, grads_and_vars): - return self._optimizer.apply_gradients(grads_and_vars) - - -def _update_lr_loss_scale(opt, loss_scale): - if loss_scale <= 0: - raise RuntimeError("the loss_scale must be greater than zero.") - loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - if hasattr(opt, "_lr"): - # LazyAdam or Adam optimizer - opt._lr = opt._lr / loss_scale - elif hasattr(opt, "_learning_rate"): - # SGD optimizer - opt._learning_rate = opt._learning_rate / loss_scale - else: - raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") \ No newline at end of file -- Gitee From 6d08cf2ecb0290eafae0c4639c86f8eb85c43e47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:25:42 +0800 Subject: [PATCH 11/16] =?UTF-8?q?=E6=97=A0=E7=94=A8=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E5=88=A0=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/mean_auc.py | 40 --------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 examples/mmoe/mean_auc.py diff --git a/examples/mmoe/mean_auc.py b/examples/mmoe/mean_auc.py deleted file mode 100644 index ff57df00..00000000 --- a/examples/mmoe/mean_auc.py +++ /dev/null @@ -1,40 +0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import os -from glob import glob -import numpy as np - - -def split_auc(log_input): - with open(log_input, 'r') as log: - all_auc = [] - for line in log.readlines(): - if 'Test' in line: - all_auc.append(float(line.split(';')[0].split(':')[-1].strip())) - all_auc_len = len(all_auc) - all_auc_arr = np.array(all_auc)[:all_auc_len - all_auc_len % 8] - test_auc = np.mean(all_auc_arr.reshape(-1, 8), axis=-1) - return test_auc - - -log_path_all = 'latest_*.log' -log_path_list = glob(log_path_all) - -for log_path in log_path_list: - print(os.path.basename(log_path)) - print(split_auc(log_path)) - print('*'*20) \ No newline at end of file -- Gitee From 9845d170e50cd3087b4869fe070308230967e364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:27:02 +0800 Subject: [PATCH 12/16] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/config.py | 38 ++++++++----------- examples/mmoe/main_mxrec.py | 67 ++++++++++++++-------------------- examples/mmoe/model.py | 15 +++++--- examples/mmoe/op_impl_mode.ini | 1 - examples/mmoe/optimizer.py | 6 +-- 5 files changed, 54 insertions(+), 73 deletions(-) diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py index b87bc11b..67ed7a20 100644 --- a/examples/mmoe/config.py +++ b/examples/mmoe/config.py @@ -32,16 +32,11 @@ class LearningRateScheduler: 
TF-based cond operations necessary for performance in graph mode. """ - def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps): - self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32) - self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32) - self.decay_steps = tf.constant(decay_steps) - self.decay_end_step = decay_start_step + decay_steps # 65041 - self.poly_power = 2.0 + def __init__(self, base_lr_dense, base_lr_sparse): self.base_lr_dense = base_lr_dense self.base_lr_sparse = base_lr_sparse - def calc(self, global_step): + def calc(self): # used for the constant stage lr_factor_constant = tf.cast(1.0, tf.float32) @@ -51,7 +46,7 @@ class LearningRateScheduler: class Config: - def __init__(self, ): + def __init__(self, ) -> None: self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None tmp = os.getenv("TRAIN_RANK_SIZE") if tmp is None: @@ -81,31 +76,30 @@ class Config: self.__set_emb_table_size() self.field_num = 26 - self.send_count = 46000 // self.rank_size + self.send_count = self.get_send_count(self.rank_size) self.emb_dim = self.expert_num * self.expert_size + self.gate_num * self.expert_num self.hashtable_threshold = 1 self.USE_PIPELINE_TEST = False - # 动态学习率 - GLOBAL_BATCH_SIZE = 8192 * 8 - LR_SCHEDULE_STEPS = [ - int(2750 * 55296 / GLOBAL_BATCH_SIZE), - int(49315 * 55296 / GLOBAL_BATCH_SIZE), - int(27772 * 55296 / GLOBAL_BATCH_SIZE), - ] self.global_step = tf.Variable(0, trainable=False) _lr_scheduler = LearningRateScheduler( 0.001, - 0.001, - LR_SCHEDULE_STEPS[0], - LR_SCHEDULE_STEPS[1], - LR_SCHEDULE_STEPS[2], + 0.001 ) self.learning_rate = _lr_scheduler.calc() + + def get_send_count(self, rank_size): + try: + return 46000 // rank_size + except ZeroDivisionError as exp: + raise ZeroDivisionError('Rank size can not be zero.') from exp + + + - def __set_emb_table_size(self): + def __set_emb_table_size(self) -> None: self.cache_mode = 
os.getenv("CACHE_MODE") if self.cache_mode is None: raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") @@ -123,7 +117,7 @@ class Config: else: raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") - def get_emb_table_cfg(self): + def get_emb_table_cfg(self) -> None: if self.cache_mode == CacheModeEnum.HBM.value: return {"device_vocabulary_size": self.dev_vocab_size} elif self.cache_mode == CacheModeEnum.DDR.value: diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py index e236cd2f..0eb127dd 100644 --- a/examples/mmoe/main_mxrec.py +++ b/examples/mmoe/main_mxrec.py @@ -24,10 +24,7 @@ from glob import glob import tensorflow as tf from sklearn.metrics import roc_auc_score import numpy as np - -from optimizer import get_dense_and_sparse_optimizer -from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum -from model import MyModel +from npu_bridge.npu_init import * from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -40,7 +37,9 @@ from mx_rec.util.ops import import_host_pipeline_ops import mx_rec.util as mxrec_util from mx_rec.util.variable import get_dense_and_sparse_variable from mx_rec.util.log import logger -from npu_bridge.npu_init import * +from optimizer import get_dense_and_sparse_optimizer +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum +from model import MyModel npu_plugin.set_device_sat_mode(0) @@ -52,7 +51,6 @@ random.seed(shuffle_seed) def add_timestamp_func(batch): timestamp = import_host_pipeline_ops().return_timestamp(tf.cast(batch['label'], dtype=tf.int64)) - # tf.constant(np.random.randint(1,1688109060,1)), tf.int64)) batch["timestamp"] = timestamp return batch @@ -144,7 +142,8 @@ def evaluate(): eval_label = eval_model.get("label") 
sess.run([eval_iterator.initializer]) else: - # 在sess run模式下,若还是使用原来batch中的label去sess run,则会出现getnext超时报错,需要使用新数据集中的batch + # In sess run mode, if the label from the original batch is still used for sess run, + # a getnext timeout error will occur, and a new batch from the new dataset needs to be used eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label") sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) log_loss_list = [] @@ -157,24 +156,26 @@ def evaluate(): print("eval begin") while not finished: + + eval_current_steps += 1 + eval_start = time.time() try: - eval_current_steps += 1 - eval_start = time.time() eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label]) - eval_cost = time.time() - eval_start - qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size - log_loss_list += list(eval_loss.reshape(-1)) - pred_income = pred[0] - pred_mat = pred[1] - pred_income_list += list(pred_income.reshape(-1)) - pred_mat_list += list(pred_mat.reshape(-1)) - label_income_list += list(label[:, 0].reshape(-1)) - label_mat_list += list(label[:, 1].reshape(-1)) - print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") - if eval_current_steps == eval_steps: - finished = True except tf.errors.OutOfRangeError: + break + eval_cost = time.time() - eval_start + qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size + log_loss_list += list(eval_loss.reshape(-1)) + pred_income = pred[0] + pred_mat = pred[1] + pred_income_list += list(pred_income.reshape(-1)) + pred_mat_list += list(pred_mat.reshape(-1)) + label_income_list += list(label[:, 0].reshape(-1)) + label_mat_list += list(label[:, 1].reshape(-1)) + print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") + if eval_current_steps == eval_steps: finished = True + auc_income = roc_auc_score(label_income_list, pred_income_list) auc_mat = roc_auc_score(label_mat_list, pred_mat_list) 
mean_log_loss = np.mean(log_loss_list) @@ -285,7 +286,6 @@ if __name__ == "__main__": warnings.filterwarnings("ignore") _clear_saved_model() - rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None train_steps = 1000 @@ -304,13 +304,8 @@ if __name__ == "__main__": logger.info(f"USE_DYNAMIC:{use_dynamic}") init(train_steps=train_steps, eval_steps=eval_steps, use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion) - IF_LOAD = False + rank_id = mxrec_util.communication.hccl_ops.get_rank_id() - filelist = glob(f"./saved-model/sparse-model-0") - if filelist: - IF_LOAD = True - ConfigInitializer.get_instance().if_load = IF_LOAD - cfg = Config() feature_spec_list_train = None feature_spec_list_eval = None @@ -385,14 +380,11 @@ if __name__ == "__main__": grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)] train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars)) - # 动态学习率更新 - train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]]) with tf.control_dependencies(train_ops): train_ops = tf.no_op() cfg.learning_rate = [cfg.learning_rate[0], cfg.learning_rate[1]] - saver = tf.train.Saver() if MODIFY_GRAPH_FLAG: modify_graph_and_start_emb_cache(dump_graph=True) else: @@ -405,7 +397,6 @@ if __name__ == "__main__": if MODIFY_GRAPH_FLAG: # 该场景添加hook处理校验问题 hook_list.append(GraphModifierHook(modify_graph=False)) - # with tf.compat.v1.Session(config=sess_config(dump_data=False)) as sess: if use_faae: sess = tf.compat.v1.train.MonitoredTrainingSession( hooks=hook_list, @@ -427,13 +418,12 @@ if __name__ == "__main__": epoch = 0 cost_sum = 0 qps_sum = 0 - best_income_auc = 0 + best_auc_income= 0 best_auc_mat = 0 iteration_per_loop = 10 train_ops = util.set_iteration_per_loop(sess, train_ops, 10) - # 
for i in range(1, TRAIN_STEPS): i = 0 while True: i += 1 @@ -441,9 +431,8 @@ if __name__ == "__main__": start_time = time.time() try: - grad, loss = sess.run([train_ops, train_model.get("loss")]) - lr = sess.run(cfg.learning_rate) - global_step = sess.run(cfg.global_step) + grad, loss, lr, global_step = sess.run([train_ops, train_model.get("loss"), + cfg.learning_rate, cfg.global_step]) except tf.errors.OutOfRangeError: logger.info(f"Encounter the end of Sequence for training.") break @@ -469,9 +458,7 @@ if __name__ == "__main__": test_auc_mat,test_mean_log_loss)) best_auc_income = max(best_auc_income, test_auc_income) best_auc_mat = max(best_auc_mat, test_auc_mat) - logger.info(f"training step: {i * iteration_per_loop}, - best auc income: {best_auc_income} , - best auc mat: {best_auc_mat}") + logger.info(f"training step: {i * iteration_per_loop}, best auc income: {best_auc_income} , best auc mat: {best_auc_mat}") sess.close() diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index cf8ca108..224e8d6d 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -37,10 +37,10 @@ class MyModel: self.gate_num = gate_num - def expert_layer(self, input): + def expert_layer(self, _input): param_expert = [] for i in range(0, self.expert_num): - expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}', + expert_linear = tf.layers.dense(_input, units=self.expert_size, activation=None, name=f'expert_layer_{i}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -48,10 +48,10 @@ class MyModel: return param_expert - def gate_layer(self, input): + def gate_layer(self, _input): param_gate = [] for i in range(0, self.gate_num): - gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}', + gate_linear = tf.layers.dense(_input, units=self.expert_num, activation=None, name=f'gate_layer_{i}', 
kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -59,8 +59,8 @@ class MyModel: return param_gate - def tower_layer(self, input, layer_name): - tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}', + def tower_layer(self, _input, layer_name): + tower_linear = tf.layers.dense(_input, units=self.tower_size, activation=None, name=f'tower_layer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -109,7 +109,10 @@ class MyModel: cur_gate_expert = tf.multiply(x=expert_concat, y=cur_gate) cur_gate_expert = tf.reduce_sum(cur_gate_expert, axis=1) + out = self.tower_layer(cur_gate_expert, i) + out = tf.nn.softmax(out) + out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0-1e-15) output_layers.append(out) out_pred.append(tf.nn.softmax(out[:, 1])) _slice_num = slice_num_end diff --git a/examples/mmoe/op_impl_mode.ini b/examples/mmoe/op_impl_mode.ini index 579dea43..e69de29b 100644 --- a/examples/mmoe/op_impl_mode.ini +++ b/examples/mmoe/op_impl_mode.ini @@ -1 +0,0 @@ -ScatterNdAdd=support_out_of_bound_index \ No newline at end of file diff --git a/examples/mmoe/optimizer.py b/examples/mmoe/optimizer.py index 2c7685bb..5469c705 100644 --- a/examples/mmoe/optimizer.py +++ b/examples/mmoe/optimizer.py @@ -15,12 +15,13 @@ # ============================================================================== import tensorflow as tf -from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer + from mx_rec.util.initialize import ConfigInitializer from mx_rec.optimizers.lazy_adam import create_hash_optimizer from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address + def get_dense_and_sparse_optimizer(cfg): dense_optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0]) use_dynamic_expansion = 
ConfigInitializer.get_instance().use_dynamic_expansion @@ -28,8 +29,5 @@ def get_dense_and_sparse_optimizer(cfg): sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1]) else: sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1]) - loss_scale = 1 - sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale) - dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale) return dense_optimizer, sparse_optimizer -- Gitee From ca2e82248c638e21066a3c6ae779d9409724d122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:28:49 +0800 Subject: [PATCH 13/16] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index 224e8d6d..f18dbff0 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -60,11 +60,11 @@ class MyModel: def tower_layer(self, _input, layer_name): - tower_linear = tf.layers.dense(_input, units=self.tower_size, activation=None, name=f'tower_layer_{layer_name}', + tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', name=f'tower_layer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) - tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, + tower_linear_out = tf.layers.dense(tower_linear, units=2, activation=None, name=f'tower_payer_out_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) -- Gitee From 13f3618364bae56befe067d91b75603f3bae4624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 19:29:53 +0800 Subject: [PATCH 14/16] codecheck --- 
examples/mmoe/config.py | 8 ++++---- examples/mmoe/main_mxrec.py | 12 +++++++----- examples/mmoe/model.py | 5 +++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py index 67ed7a20..b6a83582 100644 --- a/examples/mmoe/config.py +++ b/examples/mmoe/config.py @@ -90,14 +90,14 @@ class Config: ) self.learning_rate = _lr_scheduler.calc() + + @staticmethod def get_send_count(self, rank_size): try: - return 46000 // rank_size + return 46000 // rank_size except ZeroDivisionError as exp: raise ZeroDivisionError('Rank size can not be zero.') from exp - - - + def __set_emb_table_size(self) -> None: self.cache_mode = os.getenv("CACHE_MODE") diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py index 0eb127dd..d02566aa 100644 --- a/examples/mmoe/main_mxrec.py +++ b/examples/mmoe/main_mxrec.py @@ -25,6 +25,7 @@ import tensorflow as tf from sklearn.metrics import roc_auc_score import numpy as np from npu_bridge.npu_init import * +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -38,7 +39,7 @@ import mx_rec.util as mxrec_util from mx_rec.util.variable import get_dense_and_sparse_variable from mx_rec.util.log import logger from optimizer import get_dense_and_sparse_optimizer -from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum + from model import MyModel npu_plugin.set_device_sat_mode(0) @@ -328,7 +329,7 @@ if __name__ == "__main__": optimizer_list = [get_dense_and_sparse_optimizer(cfg)] # note: variance_scaling_initializer only support HBM mode - emb_initializer = tf.constant_initializer(value = 0.1) + emb_initializer = tf.constant_initializer(value=0.1) sparse_hashtable = create_table( key_dtype=cfg.key_type, 
dim=tf.TensorShape([cfg.emb_dim]), @@ -418,7 +419,7 @@ if __name__ == "__main__": epoch = 0 cost_sum = 0 qps_sum = 0 - best_auc_income= 0 + best_auc_income = 0 best_auc_mat = 0 iteration_per_loop = 10 @@ -455,10 +456,11 @@ if __name__ == "__main__": else: test_auc_income, test_auc_mat, test_mean_log_loss = evaluate() print("Test auc income: {};Test auc mat: {} ;log_loss: {} ".format(test_auc_income, - test_auc_mat,test_mean_log_loss)) + test_auc_mat, test_mean_log_loss)) best_auc_income = max(best_auc_income, test_auc_income) best_auc_mat = max(best_auc_mat, test_auc_mat) - logger.info(f"training step: {i * iteration_per_loop}, best auc income: {best_auc_income} , best auc mat: {best_auc_mat}") + logger.info(f"training step: {i * iteration_per_loop}, best auc income: " + f"{best_auc_income} , best auc mat: {best_auc_mat}") sess.close() diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index f18dbff0..f8090373 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -60,7 +60,8 @@ class MyModel: def tower_layer(self, _input, layer_name): - tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', name=f'tower_layer_{layer_name}', + tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', + name=f'tower_layer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -112,7 +113,7 @@ class MyModel: out = self.tower_layer(cur_gate_expert, i) out = tf.nn.softmax(out) - out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0-1e-15) + out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0 - 1e-15) output_layers.append(out) out_pred.append(tf.nn.softmax(out[:, 1])) _slice_num = slice_num_end -- Gitee From e3ffcd9bffabc259852c0af58f43273272d655c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 21:59:32 +0800 Subject: [PATCH 15/16] 
=?UTF-8?q?bug=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index f8090373..8cbb7ba8 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -116,7 +116,7 @@ class MyModel: out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0 - 1e-15) output_layers.append(out) out_pred.append(tf.nn.softmax(out[:, 1])) - _slice_num = slice_num_end + _slice_num = slice_gate_end trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mmoe') label_income = label[:, 0:1] -- Gitee From 8182c3f1f288eaa0c936b330ab8cd4e38dae8bff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Thu, 25 Jul 2024 16:20:15 +0800 Subject: [PATCH 16/16] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9=EF=BC=8C=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E9=9D=99=E6=80=81=E5=87=BD=E6=95=B0=E5=BF=98=E8=AE=B0?= =?UTF-8?q?=E5=88=A0=E9=99=A4self?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py index b6a83582..08cfe9e7 100644 --- a/examples/mmoe/config.py +++ b/examples/mmoe/config.py @@ -92,7 +92,7 @@ class Config: @staticmethod - def get_send_count(self, rank_size): + def get_send_count(rank_size): try: return 46000 // rank_size except ZeroDivisionError as exp: -- Gitee