From 90bd4a3705ce1a239178284d001abe91dd05fb91 Mon Sep 17 00:00:00 2001
From: longfeifei <962977793@qq.com>
Date: Tue, 2 Jul 2024 17:10:56 +0800
Subject: [PATCH 01/16] =?UTF-8?q?estimator=E4=B8=ADtrain=E5=88=87=E6=8D=A2?=
 =?UTF-8?q?=E4=B8=BAeval,=E5=A2=9E=E5=8A=A0=E5=8E=9Fhost=E4=BE=A7train?=
 =?UTF-8?q?=E7=9A=84=E7=9B=B8=E5=85=B3=E7=8A=B6=E6=80=81=E5=A4=87=E4=BB=BD?=
 =?UTF-8?q?=EF=BC=8C=E5=9C=A8eval=E5=88=87=E6=8D=A2=E4=B8=BAtrain=E5=90=8E?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E8=BF=98=E5=8E=9F=E5=A4=87=E4=BB=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../cache_manager/cache_manager.cpp           | 27 ++++++++++++++++
 .../cache_manager/cache_manager.h             |  4 +++
 src/AccCTR/src/embedding_cache/limited_set.h  | 18 +++++++++++
 .../offset_mapper/offset_mapper.h             | 32 +++++++++++++++++++
 src/AccCTR/src/include/embedding_cache.h      | 14 ++++++++
 src/core/emb_table/embedding_ddr.cpp          | 10 ++++++
 src/core/emb_table/embedding_ddr.h            |  3 ++
 src/core/emb_table/embedding_mgmt.cpp         | 14 ++++++++
 src/core/emb_table/embedding_mgmt.h           | 11 +++++++
 src/core/emb_table/embedding_static.cpp       | 16 ++++++++--
 src/core/emb_table/embedding_static.h         |  4 +++
 src/core/emb_table/embedding_table.cpp        |  8 +++++
 src/core/emb_table/embedding_table.h          |  5 +++
 src/core/hybrid_mgmt/hybrid_mgmt.cpp          | 27 ++++++++++++++++
 src/core/hybrid_mgmt/hybrid_mgmt.h            |  5 +++
 .../ock_ctr_common/include/embedding_cache.h  | 14 ++++++++
 16 files changed, 210 insertions(+), 2 deletions(-)
diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
index 8a6187a1..452e2fd1 100644
--- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
+++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
@@ -317,6 +317,33 @@ int EmbCacheManagerImpl::LoadEmbTableInfos(std::string tableName, const std::vec
     return H_OK;
 }
 
+int EmbCacheManagerImpl::BackUpTrainStatus(std:string tableName)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    auto om = offsetMappersBackUp.find(tableName);
+    if (om != offsetMappersBackUp.end()) {
+        offsetMappersBackUp[tableName] = offsetMappers[tableName];
+    } else{
+        offsetMappersBackUp[tableName].Initialize(1000, 1000);
+        offsetMappersBackUp[tableName] = offsetMappers[tableName];
+    }
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::RecoverTrainStatus(std:string tableName)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+    offsetMappers[tableName] = offsetMappersBackUp[tableName];
+    return H_OK;
+}
+
 void EmbCacheManagerImpl::Destroy()
 {
     for (auto it = offsetMappers.begin(); it != offsetMappers.end(); it++) {
diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
index 80fbcd46..359e88ad 100644
--- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
+++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
@@ -73,11 +73,15 @@ public:
                           const std::vector<std::vector<float>>& embeddings,
                           const std::vector<std::vector<float>>& optimizerSlots) override;
 
+    int BackUpTrainStatus(std:string tableName) override;
+    int RecoverTrainStatus(std::string tableName) override;
+
     uint32_t GetUsage(const std::string& tableName) override;
 
 private:
     std::map<std::string, EmbCacheInfo> embCacheInfos;
     std::map<std::string, OffsetMapper> offsetMappers;
+    std::map<std::string, OffsetMapper> offsetMappersBackUp;
     std::map<std::string, EmbLocalTable> embTables;
 
     int CheckValidTableName(const std::string& tableName);
diff --git a/src/AccCTR/src/embedding_cache/limited_set.h b/src/AccCTR/src/embedding_cache/limited_set.h
index 036a6477..d44b615a 100644
--- a/src/AccCTR/src/embedding_cache/limited_set.h
+++ b/src/AccCTR/src/embedding_cache/limited_set.h
@@ -47,6 +47,24 @@ public:
         delete tail;
     }
 
+    // 拷贝构造函数
+    LimitedSet(const LimitedSet& other): head(new Node(-1)), tail(new Node(-1))
+    {
+        nodes.resize(other.nodes.size());
+        for (auto &node: nodes) {
+            node = new Node(-1);
+        }
+
+        // 初始化头尾节点
+        head->next = tail;
+        tail->prev = head;
+
+        // 遍历原vector的每一个节点并复制
+        for (Node* node = other.head->next; node != other.tail; node = node->next) {
+            insert(node->value);
+        }
+    }
+
     void insert(uint64_t value)
     {
         if (nodes[value]->value == value) {
diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h
index f42a0d3f..1ad470c5 100644
--- a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h
+++ b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h
@@ -35,6 +35,38 @@ public:
 
     ~OffsetMapper() = default;
 
+    OffsetMapper(const OffsetMapper& other): maxCacheSize(other.maxCacheSize), useLength(other.useLength),
+                                             validPos(other.validPos ? new LimitedSet(*other.validPos) : nullptr),
+                                             evictPos(other.evictPos ? new LimitedSet(*other.evictPos) : nullptr),
+                                             pos2Key(other.pos2Key), lastBatchPos(other.lastBatchPos),
+                                             evictSize(other.evictSize)
+    {   // Deep copy: each position set is cloned only when the source pointer is non-null, as operator= does.
+    }
+
+    OffsetMapper& operator=(const OffsetMapper& other)  // deep-copy assignment; returns *this
+    {
+        if (this != &other) {  // self-assignment must not reach the deletes below
+            delete validPos;
+            validPos = nullptr;
+            delete evictPos;
+            evictPos = nullptr;
+
+            if (other.validPos != nullptr) {  // clone the source set; a null source leaves this side null
+                validPos = new LimitedSet(*other.validPos);
+            }
+            if (other.evictPos != nullptr) {
+                evictPos = new LimitedSet(*other.evictPos);
+            }
+
+            maxCacheSize = other.maxCacheSize;
+            useLength = other.useLength;
+            pos2Key = other.pos2Key;
+            lastBatchPos = other.lastBatchPos;
+            evictSize = other.evictSize;
+        }
+        return *this;
+    }
+
     bool Initialize(uint32_t reserve, uint32_t maxSize = 0)
     {
         maxCacheSize = maxSize;
diff --git a/src/AccCTR/src/include/embedding_cache.h b/src/AccCTR/src/include/embedding_cache.h
index 4adf1fbf..40d9dcbe 100644
--- a/src/AccCTR/src/include/embedding_cache.h
+++ b/src/AccCTR/src/include/embedding_cache.h
@@ -315,6 +315,20 @@ public:
     virtual int LoadEmbTableInfos(std::string tableName, const std::vector<uint64_t>& keys,
                                   const std::vector<std::vector<float>>& embeddings,
                                   const std::vector<std::vector<float>>& optimizerSlots) = 0;
+
+    /* *
+     * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态
+     * @Param tableName: 需要加载信息的table名字
+     * @Return errorCode
+     */
+     virtual int BackUpTrainStatus(std::string tableName) = 0;
+
+    /* *
+     * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态
+     * @Param tableName: 需要加载信息的table名字
+     * @Return errorCode
+     */
+     virtual int RecoverTrainStatus(std::string tableName) = 0;
 };
 }  // namespace EmbCache
 
diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp
index ca706c73..e4b96eb6 100644
--- a/src/core/emb_table/embedding_ddr.cpp
+++ b/src/core/emb_table/embedding_ddr.cpp
@@ -376,3 +376,13 @@ void EmbeddingDDR::SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache)
 {
     this->embCache = embCache;
 }
+
+void EmbeddingDDR::BackUpTrainStatus()
+{
+    embCache->BackUpTrainStatus(name);
+}
+
+void EmbeddingDDR::RecoverTrainStatus()
+{
+    embCache->RecoverTrainStatus(name);
+}
diff --git a/src/core/emb_table/embedding_ddr.h b/src/core/emb_table/embedding_ddr.h
index ac5c5878..26d85e60 100644
--- a/src/core/emb_table/embedding_ddr.h
+++ b/src/core/emb_table/embedding_ddr.h
@@ -73,6 +73,9 @@ public:
     void SaveEmbAndOptim(const string& savePath);
     void SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache);
 
+    void BackUpTrainStatus();
+    void RecoverTrainStatus();
+
 GTEST_PRIVATE:
 
     void EvictDeleteEmb(const vector<emb_key_t>& keys);
diff --git a/src/core/emb_table/embedding_mgmt.cpp b/src/core/emb_table/embedding_mgmt.cpp
index 9e7dcbb0..d889cdba 100644
--- a/src/core/emb_table/embedding_mgmt.cpp
+++ b/src/core/emb_table/embedding_mgmt.cpp
@@ -196,3 +196,17 @@ void EmbeddingMgmt::SetEmbCacheForEmbTable(const ock::ctr::EmbCacheManagerPtr& e
         table.second->SetEmbCache(embCache);
     }
 }
+
+void EmbeddingMgmt::BackUpTrainStatusBeforeLoad()
+{
+    for (auto& table: embeddings) {
+        table.second->BackUpTrainStatus();
+    }
+}
+
+void EmbeddingMgmt::RecoverTrainStatus()
+{
+    for (auto& table: embeddings) {
+        table.second->RecoverTrainStatus();
+    }
+}
\ No newline at end of file
diff --git a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h
index ef106786..7cd3f782 100644
--- a/src/core/emb_table/embedding_mgmt.h
+++ b/src/core/emb_table/embedding_mgmt.h
@@ -89,6 +89,17 @@ public:
      */
     void Save(const string& filePath);
 
+    /**
+     * estimator模式下train切换为eval时， 备份所有表train的状态
+     */
+    void BackUpTrainStatusBeforeLoad();
+
+    /**
+     * estimator模式下eval切换为train时， 还原所有表train的状态
+     */
+    void RecoverTrainStatus();
+
+
     /**
     * 获取所有表对应的DeviceOffsets，该偏移用于python侧保存embedding时抽取key对应的embedding
     */
diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp
index 61874b1f..0db152ed 100644
--- a/src/core/emb_table/embedding_static.cpp
+++ b/src/core/emb_table/embedding_static.cpp
@@ -160,11 +160,23 @@ void EmbeddingStatic::LoadKey(const string& savePath)
     }
 
     maxOffset = keyOffsetMap.size();
-
     free(static_cast<void*>(buf));
 }
 
 vector<int64_t> EmbeddingStatic::GetDeviceOffset()
 {
     return deviceOffset;
-}
\ No newline at end of file
+}
+
+void EmbeddingStatic::BackUpTrainStatus()
+{   // Keep a full copy of the current key->offset map so RecoverTrainStatus() can put it back later.
+    keyOffsetMapBackUp = keyOffsetMap;
+}
+
+void EmbeddingStatic::RecoverTrainStatus()
+{   // Restore keyOffsetMap from a non-empty backup and then drop the backup; an empty backup is ignored.
+    if (!keyOffsetMapBackUp.empty()) {
+        keyOffsetMap = keyOffsetMapBackUp;
+        keyOffsetMapBackUp.clear();
+    }  // NOTE(review): maxOffset, which LoadKey derives from keyOffsetMap.size(), is not restored here - confirm
+}
diff --git a/src/core/emb_table/embedding_static.h b/src/core/emb_table/embedding_static.h
index 6515f586..6f772e08 100644
--- a/src/core/emb_table/embedding_static.h
+++ b/src/core/emb_table/embedding_static.h
@@ -39,6 +39,10 @@ public:
 
     void Save(const string& savePath);
 
+    void BackUpTrainStatus();
+
+    void RecoverTrainStatus();
+
     vector<int64_t> GetDeviceOffset();
 
 GTEST_PRIVATE:
diff --git a/src/core/emb_table/embedding_table.cpp b/src/core/emb_table/embedding_table.cpp
index b4eb2379..12b0137a 100644
--- a/src/core/emb_table/embedding_table.cpp
+++ b/src/core/emb_table/embedding_table.cpp
@@ -143,6 +143,14 @@ void EmbeddingTable::Save(const string& filePath)
 {
 }
 
+void EmbeddingTable::BackUpTrainStatus()
+{
+}
+
+void EmbeddingTable::RecoverTrainStatus()
+{
+}
+
 void EmbeddingTable::MakeDir(const string& dirName)
 {
     if (fileSystemPtr_ == nullptr) {
diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h
index cbf15a7a..174cc0fc 100644
--- a/src/core/emb_table/embedding_table.h
+++ b/src/core/emb_table/embedding_table.h
@@ -76,6 +76,10 @@ public:
 
     void MakeDir(const string& dirName);
 
+    virtual void BackUpTrainStatus();
+
+    virtual void RecoverTrainStatus();
+
     virtual vector<int64_t> GetDeviceOffset();
 
     vector<int64_t> GetLoadOffset();
@@ -96,6 +100,7 @@ public:
     size_t ssdVocabSize;
     size_t maxOffset;
     absl::flat_hash_map<emb_key_t, int64_t> keyOffsetMap;
+    absl::flat_hash_map<emb_key_t, int64_t> keyOffsetMapBackUp;
     std::vector<int64_t> evictDevPos;     // 记录HBM内被淘汰的key
     std::vector<int64_t> evictHostPos; // 记录Host内淘汰列表
 
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
index fda54d9d..100ed24e 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
@@ -221,6 +221,7 @@ bool HybridMgmt::Load(const string& loadPath, vector<string> warmStartTables)
     Checkpoint loadCkpt;
     vector<CkptFeatureType> loadFeatures;
     SetFeatureTypeForLoad(loadFeatures);
+    BackUpTrainStatus();
 
     if (warmStartTables.size() == 0) {
         EmbeddingMgmt::Instance()->Load(loadPath, trainKeysSet);
@@ -499,6 +500,8 @@ void HybridMgmt::EvalTask(TaskType type)
             cvCheckSave.wait(checkSaveLocker, [this] {
                 return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy;
             });
+            // 在唤醒train的数据处理进程之前，需要将备份的train状态还原
+            RecoverTrainStatus();
             hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID);
             LOG_DEBUG("wake TrainTask");
             hybridMgmtBlock->DoBlock(channelId);
@@ -2230,3 +2233,27 @@ bool HybridMgmt::IsTrainAndEvalCase()
     }
     return alreadyTrainOnce && isChannelSwitchCase;
 }
+
+void HybridMgmt::BackUpTrainStatus()
+{
+    int channelID = TRAIN_CHANNEL_ID;
+    int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[channelID];
+    //续训load、predict模式下的load不需要对train的状态进行备份
+    if (theTrainBatchId==0) {
+        return;
+    }
+    // train and eval模式下，train切换为eval之后
+    // eval的load需要线备份原有的相关状态， HBM非扩容模式需要备份keyOffsetMap, DDR模式需要备份offsetMapper对象
+    LOG_INFO("On Estimator train and eval mode, start to backup train status, "
+             "current train batchId: {} .", theTrainBatchId);
+    EmbeddingMgmt::Instance()->BackUpTrainStatusBeforeLoad();
+    isBackUpTrainStatus = true;
+}
+
+void HybridMgmt::RecoverTrainStatus()
+{
+    if (isBackUpTrainStatus) {
+        EmbeddingMgmt::Instance()->RecoverTrainStatus();
+    }
+    isBackUpTrainStatus = false;
+}
\ No newline at end of file
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h
index 83299da3..fb050e70 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.h
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.h
@@ -133,6 +133,10 @@ namespace MxRec {
 
         void ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBatchOut);
 
+        void BackUpTrainStatus();
+
+        void RecoverTrainStatus();
+
     GTEST_PRIVATE:
         bool mutexDestroy { false };
         std::mutex lookUpAndSendBatchIdMtx;
@@ -225,6 +229,7 @@ namespace MxRec {
         bool isLoad { false };
         bool isInitialized { false };
         bool alreadyTrainOnce = false;  // 用于判断是否为predict模式
+        bool isBackUpTrainStatus = false; // 用于判断当前是否已经备份了train的状态
         map<string, int> lookUpSwapInAddrsPushId;  // 用于处理eos场景，当消费者追上生产者且长时间无上游数据，会触发eos
         map<string, ProcessStatus> specialProcessStatus;
 
diff --git a/src/core/ock_ctr_common/include/embedding_cache.h b/src/core/ock_ctr_common/include/embedding_cache.h
index f3bc9e23..5e25a718 100644
--- a/src/core/ock_ctr_common/include/embedding_cache.h
+++ b/src/core/ock_ctr_common/include/embedding_cache.h
@@ -315,6 +315,20 @@ public:
     virtual int LoadEmbTableInfos(std::string tableName, const std::vector<uint64_t>& keys,
                                   const std::vector<std::vector<float>>& embeddings,
                                   const std::vector<std::vector<float>>& optimizerSlots) = 0;
+
+    /* *
+     * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态
+     * @Param tableName: 需要加载信息的table名字
+     * @Return errorCode
+     */
+     virtual int BackUpTrainStatus(std::string tableName) = 0;
+
+    /* *
+     * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态
+     * @Param tableName: 需要加载信息的table名字
+     * @Return errorCode
+     */
+     virtual int RecoverTrainStatus(std::string tableName) = 0;
 };
 }  // namespace EmbCache
 
-- 
Gitee


From 1e9e773c32f67ff466893976f5b748ac217947c0 Mon Sep 17 00:00:00 2001
From: longfeifei <962977793@qq.com>
Date: Mon, 15 Jul 2024 14:20:33 +0800
Subject: [PATCH 02/16] =?UTF-8?q?estimator=E4=B8=ADtrain=E5=88=87=E6=8D=A2?=
 =?UTF-8?q?=E4=B8=BAeval,=E5=A2=9E=E5=8A=A0=E5=8E=9Fhost=E4=BE=A7train?=
 =?UTF-8?q?=E7=9A=84=E7=9B=B8=E5=85=B3=E7=8A=B6=E6=80=81=E5=A4=87=E4=BB=BD?=
 =?UTF-8?q?=EF=BC=8C=E5=9C=A8eval=E5=88=87=E6=8D=A2=E4=B8=BAtrain=E5=90=8E?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E8=BF=98=E5=8E=9F=E5=A4=87=E4=BB=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/AccCTR/src/common/util/error_code.h       |  1 +
 .../cache_manager/cache_manager.cpp           | 57 ++++++++++++--
 .../cache_manager/cache_manager.h             |  8 +-
 src/AccCTR/src/embedding_cache/limited_set.h  | 19 +++--
 src/AccCTR/src/include/embedding_cache.h      | 18 +++--
 src/core/emb_table/embedding_ddr.cpp          | 11 ++-
 src/core/emb_table/embedding_mgmt.h           |  5 +-
 src/core/hybrid_mgmt/hybrid_mgmt.cpp          | 30 +++++---
 src/core/hybrid_mgmt/hybrid_mgmt.h            |  2 +-
 src/core/l3_storage/cache_manager.cpp         | 74 ++++++++++++++++++-
 src/core/l3_storage/cache_manager.h           | 11 +++
 .../ock_ctr_common/include/embedding_cache.h  | 18 +++--
 12 files changed, 200 insertions(+), 54 deletions(-)

diff --git a/src/AccCTR/src/common/util/error_code.h b/src/AccCTR/src/common/util/error_code.h
index b30bfd83..87c8ffe6 100644
--- a/src/AccCTR/src/common/util/error_code.h
+++ b/src/AccCTR/src/common/util/error_code.h
@@ -43,6 +43,7 @@ using CTRCode = enum : int {
     H_TABLE_NAME_EMPTY = 22,
     H_PREFILL_BUFFER_SIZE_INVALID = 23,
     H_TABLE_NAME_TOO_LONG = 24,
+    H_EMB_CACHE_INFO_LOST = 25
 };
 }
 }
diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
index 68351328..52578820 100644
--- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
+++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
@@ -253,8 +253,7 @@ int EmbCacheManagerImpl::ExportDeviceKeyOffsetPairs(const std::string& tableName
     if (checkTableNameRet != H_OK) {
         return checkTableNameRet;
     }
-    OffsetMapper& om = offsetMappers[tableName];
-    koVec = om.ExportSortedKVPairs();
+    koVec = offsetMappers[tableName].ExportSortedKVPairs();
     return H_OK;
 }
 
@@ -318,30 +317,58 @@ int EmbCacheManagerImpl::LoadEmbTableInfos(std::string tableName, const std::vec
     return H_OK;
 }
 
-int EmbCacheManagerImpl::BackUpTrainStatus(std:string tableName)
+int EmbCacheManagerImpl::BackUpTrainStatus(const std::string& tableName)
 {
     int checkTableNameRet = CheckValidTableName(tableName);
     if (checkTableNameRet != H_OK) {
         return checkTableNameRet;
     }
 
+    // Back up the key-offset correspondence on the device
+    kvVecsBackUp[tableName] = offsetMappers[tableName].ExportVec();
+
+    auto embInfo = embCacheInfos.find(tableName);
+    if (embInfo == embCacheInfos.end()) {
+        return H_EMB_CACHE_INFO_LOST;
+    }
+    uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO;
+    uint32_t maxCacheSize = embInfo->second.maxCacheSize;
+
     auto om = offsetMappersBackUp.find(tableName);
     if (om != offsetMappersBackUp.end()) {
-        offsetMappersBackUp[tableName] = offsetMappers[tableName];
-    } else{
-        offsetMappersBackUp[tableName].Initialize(1000, 1000);
-        offsetMappersBackUp[tableName] = offsetMappers[tableName];
+        offsetMappersBackUp[tableName].UnInitialize();
     }
+    offsetMappersBackUp[tableName].Initialize(reserve, maxCacheSize);
+    offsetMappersBackUp[tableName] = offsetMappers[tableName];
+
     return H_OK;
 }
 
-int EmbCacheManagerImpl::RecoverTrainStatus(std:string tableName)
+int EmbCacheManagerImpl::RecoverTrainStatus(const std::string& tableName)
 {
     int checkTableNameRet = CheckValidTableName(tableName);
     if (checkTableNameRet != H_OK) {
         return checkTableNameRet;
     }
+
+    auto embInfo = embCacheInfos.find(tableName);
+    if (embInfo == embCacheInfos.end()) {
+        return H_EMB_CACHE_INFO_LOST;
+    }
+    uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO;
+    uint32_t maxCacheSize = embInfo->second.maxCacheSize;
+
+    offsetMappers[tableName].UnInitialize();
+    offsetMappers[tableName].Initialize(reserve, maxCacheSize);
     offsetMappers[tableName] = offsetMappersBackUp[tableName];
+
+    // Replay the backed-up key-offset pairs into the restored mapper (reference: no copy, clear() hits the map).
+    auto& kvVecBackUp = kvVecsBackUp[tableName];
+    for (const auto& kvPair: kvVecBackUp) {
+        offsetMappers[tableName].Put(kvPair.first, kvPair.second);
+    }
+
+    kvVecBackUp.clear();  // the backup has been consumed; drop the stored pairs
     return H_OK;
 }
 
@@ -449,3 +476,17 @@ uint32_t EmbCacheManagerImpl::GetUsage(const std::string& tableName)
 {
     return embTables[tableName].GetUsage();
 }
+
+int EmbCacheManagerImpl::ResetOffsetMappers()
+{   // Re-initialize every table's offset mapper from its cache info; tables visited before a failure stay reset.
+    for (auto& mapperEntry : offsetMappers) {
+        const auto infoIt = embCacheInfos.find(mapperEntry.first);
+        if (infoIt == embCacheInfos.end()) {
+            return H_EMB_CACHE_INFO_LOST;
+        }
+        mapperEntry.second.UnInitialize();
+        const uint32_t reserve = infoIt->second.maxCacheSize / VOCAB_CACHE_RATIO;
+        mapperEntry.second.Initialize(reserve, infoIt->second.maxCacheSize);
+    }
+    return H_OK;
+}
diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
index 359e88ad..e4a240ae 100644
--- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
+++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
@@ -73,8 +73,11 @@ public:
                           const std::vector<std::vector<float>>& embeddings,
                           const std::vector<std::vector<float>>& optimizerSlots) override;
 
-    int BackUpTrainStatus(std:string tableName) override;
-    int RecoverTrainStatus(std::string tableName) override;
+    int BackUpTrainStatus(const std::string& tableName) override;
+
+    int RecoverTrainStatus(const std::string& tableName) override;
+
+    int ResetOffsetMappers() override;
 
     uint32_t GetUsage(const std::string& tableName) override;
 
@@ -83,6 +86,7 @@ private:
     std::map<std::string, OffsetMapper> offsetMappers;
     std::map<std::string, OffsetMapper> offsetMappersBackUp;
     std::map<std::string, EmbLocalTable> embTables;
+    std::map<std::string, std::vector<std::pair<uint64_t, uint64_t>>> kvVecsBackUp;
 
     int CheckValidTableName(const std::string& tableName);
 
diff --git a/src/AccCTR/src/embedding_cache/limited_set.h b/src/AccCTR/src/embedding_cache/limited_set.h
index d44b615a..f7bc2e1e 100644
--- a/src/AccCTR/src/embedding_cache/limited_set.h
+++ b/src/AccCTR/src/embedding_cache/limited_set.h
@@ -20,19 +20,21 @@ limitations under the License.
 
 namespace EmbCache {
 
+static constexpr int64_t NODE_DEFAULT_VALUE = -1;
+
 class LimitedSet {
 public:
     struct Node {
         uint64_t value;
         Node *prev, *next;
-        Node(uint64_t val = -1) : value(val), prev(nullptr), next(nullptr) {}
+        Node(uint64_t val = NODE_DEFAULT_VALUE) : value(val), prev(nullptr), next(nullptr) {}
     };
 
-    LimitedSet(uint64_t maxRange) : head(new Node(-1)), tail(new Node(-1))
+    LimitedSet(uint64_t maxRange) : head(new Node(NODE_DEFAULT_VALUE)), tail(new Node(NODE_DEFAULT_VALUE))
     {
         nodes.resize(maxRange);
         for (auto &node : nodes) {
-            node = new Node(-1);
+            node = new Node(NODE_DEFAULT_VALUE);
         }
         head->next = tail;
         tail->prev = head;
@@ -47,19 +49,16 @@ public:
         delete tail;
     }
 
-    // 拷贝构造函数
-    LimitedSet(const LimitedSet& other): head(new Node(-1)), tail(new Node(-1))
+    LimitedSet(const LimitedSet& other): head(new Node(NODE_DEFAULT_VALUE)), tail(new Node(NODE_DEFAULT_VALUE))
     {
         nodes.resize(other.nodes.size());
-        for (auto &node: nodes) {
-            node = new Node(-1);
+        for (auto& node: nodes) {
+            node = new Node(NODE_DEFAULT_VALUE);
         }
 
-        // 初始化头尾节点
         head->next = tail;
         tail->prev = head;
 
-        // 遍历原vector的每一个节点并复制
         for (Node* node = other.head->next; node != other.tail; node = node->next) {
             insert(node->value);
         }
@@ -87,7 +86,7 @@ public:
         Node *node = nodes[value];
         node->prev->next = node->next;
         node->next->prev = node->prev;
-        node->value = -1;
+        node->value = NODE_DEFAULT_VALUE;
     }
 
     bool find(uint64_t value)
diff --git a/src/AccCTR/src/include/embedding_cache.h b/src/AccCTR/src/include/embedding_cache.h
index 40d9dcbe..c0468549 100644
--- a/src/AccCTR/src/include/embedding_cache.h
+++ b/src/AccCTR/src/include/embedding_cache.h
@@ -317,18 +317,24 @@ public:
                                   const std::vector<std::vector<float>>& optimizerSlots) = 0;
 
     /* *
-     * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态
-     * @Param tableName: 需要加载信息的table名字
+     * When the train channel switches to eval, back up the current table's offsetMapper object.
+     * @Param tableName: embedding table name
      * @Return errorCode
      */
-     virtual int BackUpTrainStatus(std::string tableName) = 0;
+    virtual int BackUpTrainStatus(const std::string& tableName) = 0;
 
     /* *
-     * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态
-     * @Param tableName: 需要加载信息的table名字
+     * When the eval channel switches back to train, restore the current table's offsetMapper object from the backup.
+     * @Param tableName: embedding table name
+     * @Return errorCode
+     */
+    virtual int RecoverTrainStatus(const std::string& tableName) = 0;
+
+    /* *
+     * Reset the offsetMapper object to revert to its initialized state after loading.
      * @Return errorCode
      */
-     virtual int RecoverTrainStatus(std::string tableName) = 0;
+    virtual int ResetOffsetMappers() = 0;
 };
 }  // namespace EmbCache
 
diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp
index 82ca0b73..d05b3501 100644
--- a/src/core/emb_table/embedding_ddr.cpp
+++ b/src/core/emb_table/embedding_ddr.cpp
@@ -78,6 +78,11 @@ void EmbeddingDDR::Load(const string& savePath, map<string, unordered_set<emb_ca
     }
 
     trainKeySet[name].insert(keys.cbegin(), keys.cend());
+    // Reset the offsetMapper objects to their initialized state after loading (the call covers every table).
+    auto rs = embCache->ResetOffsetMappers();
+    if (rs != 0) {  // report the code returned by this call, i.e. rs
+        throw runtime_error("embCache->ResetOffsetMappers failed, err code: " + to_string(rs));
+    }
 }
 
 void EmbeddingDDR::LoadKey(const string &savePath, vector<emb_cache_key_t> &keys)
@@ -187,15 +192,13 @@ void EmbeddingDDR::LoadOptimizerSlot(const string &savePath, vector<vector<float
 
 void EmbeddingDDR::Save(const string& savePath)
 {
+    SyncLatestEmbedding();
     vector<emb_cache_key_t> keys;
     vector<vector<float>> embeddings;
     vector<vector<float>> optimizerSlots;
 
     auto step = GetStepFromPath(savePath);
-    if (step > 0) {
-        SyncLatestEmbedding();
-        embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots);
-    }
+    embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots);
 
     SaveKey(savePath, keys);
     SaveEmbedding(savePath, embeddings);
diff --git a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h
index 7cd3f782..9dd0e363 100644
--- a/src/core/emb_table/embedding_mgmt.h
+++ b/src/core/emb_table/embedding_mgmt.h
@@ -90,16 +90,15 @@ public:
     void Save(const string& filePath);
 
     /**
-     * estimator模式下train切换为eval时， 备份所有表train的状态
+     * In estimator mode, when switching from train to eval, backup the training state of all tables.
      */
     void BackUpTrainStatusBeforeLoad();
 
     /**
-     * estimator模式下eval切换为train时， 还原所有表train的状态
+     * In estimator mode, when switching from eval to train, recover the training state of all tables.
      */
     void RecoverTrainStatus();
 
-
     /**
     * 获取所有表对应的DeviceOffsets，该偏移用于python侧保存embedding时抽取key对应的embedding
     */
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
index 84195a3c..91750b65 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
@@ -206,12 +206,6 @@ bool HybridMgmt::Load(const string& loadPath, vector<string> warmStartTables)
         throw runtime_error("HybridMgmt not initialized. Call Initialize first.");
     }
 
-    if (mgmtRankInfo.isDDR && IsTrainAndEvalCase()) {
-        LOG_INFO("estimator train and eval case, skip loading, "
-                 "host will reuse data in memory while evaluating since is's same as saved data");
-        return true;
-    }
-
     // 数据处理线程上锁
     KEY_PROCESS_INSTANCE->LoadSaveLock();
 
@@ -257,10 +251,15 @@ bool HybridMgmt::Load(const string& loadPath, vector<string> warmStartTables)
         featAdmitNEvict.LoadHistoryRecords(loadData.histRec);
     }
 
+    int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[TRAIN_CHANNEL_ID];
     if (isL3StorageEnabled) {
         LOG_DEBUG(MGMT + "Start host side load: L3Storage key freq map");
         auto step = GetStepFromPath(loadPath);
-        cacheManager->Load(mgmtEmbInfo, step, trainKeysSet);
+        // When in load and train mode or predict mode, SSD needs to actually execute loading
+        // When in the train and eval modes, loading before eval should be directly skipped
+        if (theTrainBatchId == 0) {
+            cacheManager->Load(mgmtEmbInfo, step, trainKeysSet);
+        }
     }
 
     LOG_DEBUG(MGMT + "Finish host side load process");
@@ -502,7 +501,7 @@ void HybridMgmt::EvalTask(TaskType type)
             cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; });
 
             if (hybridMgmtBlock->pythonBatchId[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) {
-                // 在唤醒train的数据处理进程之前，需要将备份的train状态还原
+                // Before waking the data process for training, Recover the backed-up training state
                 RecoverTrainStatus();
                 hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID);
             } else {
@@ -2210,15 +2209,18 @@ void HybridMgmt::BackUpTrainStatus()
 {
     int channelID = TRAIN_CHANNEL_ID;
     int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[channelID];
-    //续训load、predict模式下的load不需要对train的状态进行备份
-    if (theTrainBatchId==0) {
+    if (theTrainBatchId == 0) {
         return;
     }
-    // train and eval模式下，train切换为eval之后
-    // eval的load需要线备份原有的相关状态， HBM非扩容模式需要备份keyOffsetMap, DDR模式需要备份offsetMapper对象
+
     LOG_INFO("On Estimator train and eval mode, start to backup train status, "
              "current train batchId: {} .", theTrainBatchId);
+    // When in the train and eval mode of estimator, backup training states before loading.
     EmbeddingMgmt::Instance()->BackUpTrainStatusBeforeLoad();
+
+    if (isL3StorageEnabled) {
+        cacheManager->BackUpTrainStatus();
+    }
     isBackUpTrainStatus = true;
 }
 
@@ -2227,5 +2229,9 @@ void HybridMgmt::RecoverTrainStatus()
     if (isBackUpTrainStatus) {
         EmbeddingMgmt::Instance()->RecoverTrainStatus();
     }
+
+    if (isBackUpTrainStatus && isL3StorageEnabled) {  // restore L3 cache state only if a backup was taken
+        cacheManager->RecoverTrainStatus();
+    }
     isBackUpTrainStatus = false;
 }
\ No newline at end of file
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h
index f845efb1..233030b9 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.h
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.h
@@ -223,7 +223,7 @@ private:
     bool isLoad{false};
     bool isInitialized{false};
     bool alreadyTrainOnce = false;  // 用于判断是否为predict模式
-    bool isBackUpTrainStatus = false; // 用于判断当前是否已经备份了train的状态
+    bool isBackUpTrainStatus = false; // whether the train state has been backed up
     map<string, int> lookUpSwapInAddrsPushId;  // 用于处理eos场景，当消费者追上生产者且长时间无上游数据，会触发eos
     map<string, ProcessStatus> specialProcessStatus;
 
diff --git a/src/core/l3_storage/cache_manager.cpp b/src/core/l3_storage/cache_manager.cpp
index ee3d7bc5..7ea68e14 100644
--- a/src/core/l3_storage/cache_manager.cpp
+++ b/src/core/l3_storage/cache_manager.cpp
@@ -32,10 +32,10 @@ void CacheManager::Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector<EmbInfo
     if (level3Storage == nullptr) {
         throw runtime_error("level3Storage is nullptr");
     }
-    
+
     this->embCache = std::move(embCachePtr);
     for (auto& emb : mgmtEmbInfo) {
-        EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false};
+        EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false, emb.extEmbeddingSize};
         embBaseInfos.emplace(emb.name, baseInfo);
         preProcessMapper[emb.name].Initialize(emb.name, emb.hostVocabSize, emb.ssdVocabSize);
     }
@@ -293,3 +293,73 @@ void CacheManager::FetchL3StorageEmb2DDR(string tableName, uint32_t extEmbedding
     embeddingTaskStep++;
     evictWaitCond.notify_all();
 }
+
+void CacheManager::BackUpTrainStatus()
+{
+    ddrKeyFreqMapBackUp = ddrKeyFreqMap;  // copy of the per-table DDR key-frequency caches
+    excludeDDRKeyCountMapBackUp = excludeDDRKeyCountMap;  // copy of the per-table counts of keys outside DDR
+}
+
+void CacheManager::RecoverTrainStatus()
+{
+    // For every backed-up table, move embeddings between DDR and SSD until the SSD key set matches the
+    // backup again, then restore the backed-up bookkeeping maps. Throws runtime_error if a cache call fails.
+    for (const auto& pair : excludeDDRKeyCountMapBackUp) {
+        const auto& tableName = pair.first;
+        std::vector<emb_cache_key_t> ssdKeysBeforeEval;
+        std::vector<emb_cache_key_t> ssdKeysAfterEval;
+        std::vector<emb_cache_key_t> swapInKeys;
+        std::vector<emb_cache_key_t> swapOutKeys;
+        for (const auto& keyMap : pair.second) {
+            ssdKeysBeforeEval.push_back(keyMap.first);
+        }
+        for (const auto& keyMap : excludeDDRKeyCountMap[tableName]) {
+            ssdKeysAfterEval.push_back(keyMap.first);
+        }
+        GetSwapInAndSwapOutKeys(ssdKeysBeforeEval, ssdKeysAfterEval, swapInKeys, swapOutKeys);
+
+        // swapInKeys (on SSD before eval only): look up DDR addresses, insert into SSD storage, remove from DDR.
+        vector<float*> swapInKeysAddr;
+        int rc = embCache->EmbeddingLookupAddrs(tableName, swapInKeys, swapInKeysAddr);
+        if (rc != 0) {
+            throw runtime_error("EmbeddingLookUpAddrs failed! error code: " + std::to_string(rc));
+        }
+        auto extEmbeddingSize = embBaseInfos[tableName].extEmbeddingSize;
+        l3Storage->InsertEmbeddingsByAddr(tableName, swapInKeys, swapInKeysAddr, extEmbeddingSize);
+        rc = embCache->EmbeddingRemove(tableName, swapInKeys);
+        if (rc != 0) {
+            throw runtime_error("EmbeddingRemove failed! error code: " + std::to_string(rc));
+        }
+
+        // swapOutKeys (on SSD after eval only): fetch from SSD, write into DDR, and only then delete from SSD.
+        auto swapOutEmbeddings = l3Storage->FetchEmbeddings(tableName, swapOutKeys);
+        vector<float> swapOutFlattenEmbeddings;
+        for (auto& emb : swapOutEmbeddings) {
+            swapOutFlattenEmbeddings.insert(swapOutFlattenEmbeddings.cend(), emb.cbegin(), emb.cend());
+        }
+        rc = embCache->EmbeddingUpdate(tableName, swapOutKeys, swapOutFlattenEmbeddings.data());
+        if (rc != 0) {
+            throw runtime_error("EmbeddingUpdate failed! error code: " + std::to_string(rc));
+        }
+        l3Storage->DeleteEmbeddings(tableName, swapOutKeys);
+    }
+    ddrKeyFreqMap = ddrKeyFreqMapBackUp;
+    excludeDDRKeyCountMap = excludeDDRKeyCountMapBackUp;
+}
+
+void CacheManager::GetSwapInAndSwapOutKeys(vector<emb_cache_key_t>& ssdKeysBeforeEval,
+                                           vector<emb_cache_key_t>& ssdKeysAfterEval,
+                                           vector<emb_cache_key_t>& swapInKeys, vector<emb_cache_key_t>& swapOutKeys)
+{
+    // Both inputs are sorted in place; the results are appended to swapInKeys and swapOutKeys.
+    std::sort(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end());
+    std::sort(ssdKeysAfterEval.begin(), ssdKeysAfterEval.end());
+
+    // swapInKeys = before minus after, swapOutKeys = after minus before. Subtracting the other range directly
+    // yields the same keys as subtracting the intersection, without building a temporary intersection vector.
+    std::set_difference(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end(), ssdKeysAfterEval.begin(),
+                        ssdKeysAfterEval.end(), std::back_inserter(swapInKeys));
+    std::set_difference(ssdKeysAfterEval.begin(), ssdKeysAfterEval.end(), ssdKeysBeforeEval.begin(),
+                        ssdKeysBeforeEval.end(), std::back_inserter(swapOutKeys));
+}
+
diff --git a/src/core/l3_storage/cache_manager.h b/src/core/l3_storage/cache_manager.h
index 79335788..34e7f0c2 100644
--- a/src/core/l3_storage/cache_manager.h
+++ b/src/core/l3_storage/cache_manager.h
@@ -107,10 +107,20 @@ namespace MxRec {
 
         int64_t GetTableUsage(const string& tableName);
 
+        void BackUpTrainStatus();
+
+        void RecoverTrainStatus();
+
+        void GetSwapInAndSwapOutKeys(vector<emb_cache_key_t>& ssdKeysBeforeEval,
+                                     vector<emb_cache_key_t>& ssdKeysAfterEval,
+                                     vector<emb_cache_key_t>& swapInKeys, vector<emb_cache_key_t>& swapOutKeys);
+
         // DDR内每个表中emb数据频次缓存；map<embTableName, 频次缓存>
         unordered_map<std::string, LFUCache> ddrKeyFreqMap;
+        unordered_map<std::string, LFUCache> ddrKeyFreqMapBackUp;
         // 每张表中非DDR内key的出现次数
         unordered_map<std::string, unordered_map<emb_cache_key_t, freq_num_t>> excludeDDRKeyCountMap;
+        unordered_map<std::string, unordered_map<emb_cache_key_t, freq_num_t>> excludeDDRKeyCountMapBackUp;
 
         // 每一个table对应一个PreProcessMapper，预先推演HBM->DDR的情况
         std::unordered_map<std::string, PreProcessMapper> preProcessMapper;
@@ -125,6 +135,7 @@ namespace MxRec {
             uint64_t maxTableSize;
             vector<std::string> savePath;
             bool isExist;
+            int extEmbeddingSize;
         };
 
         void CreateL3StorageTableIfNotExist(const std::string& embTableName);
diff --git a/src/core/ock_ctr_common/include/embedding_cache.h b/src/core/ock_ctr_common/include/embedding_cache.h
index 5e25a718..ce807f16 100644
--- a/src/core/ock_ctr_common/include/embedding_cache.h
+++ b/src/core/ock_ctr_common/include/embedding_cache.h
@@ -317,18 +317,24 @@ public:
                                   const std::vector<std::vector<float>>& optimizerSlots) = 0;
 
     /* *
-     * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态
-     * @Param tableName: 需要加载信息的table名字
+     * When switching the train channel to eval, back up the current table's offsetMapper object.
+     * @Param tableName: embedding table name
      * @Return errorCode
      */
-     virtual int BackUpTrainStatus(std::string tableName) = 0;
+    virtual int BackUpTrainStatus(const std::string& tableName) = 0;
 
     /* *
-     * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态
-     * @Param tableName: 需要加载信息的table名字
+     * When switching the eval channel back to train, restore the current table's offsetMapper object from the backup.
+     * @Param tableName: embedding table name
+     * @Return errorCode
+     */
+    virtual int RecoverTrainStatus(const std::string& tableName) = 0;
+
+    /* *
+     * Reset the offsetMapper object to revert to its initialized state after loading.
      * @Return errorCode
      */
-     virtual int RecoverTrainStatus(std::string tableName) = 0;
+    virtual int ResetOffsetMappers() = 0;
 };
 }  // namespace EmbCache
 
-- 
Gitee


From aaabe4aa37ef1b188d5e112c3a7c99040579c92f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 22:08:22 +0800
Subject: [PATCH 03/16] =?UTF-8?q?mmoe=20=E6=A8=A1=E5=9E=8B=E6=A1=86?=
 =?UTF-8?q?=E6=9E=B6=E6=8F=90=E4=BA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/model.py | 136 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 examples/mmoe/model.py

diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
new file mode 100644
index 00000000..0046d2fd
--- /dev/null
+++ b/examples/mmoe/model.py
@@ -0,0 +1,136 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import time
+from easydict import EasyDict as edict
+
+import tensorflow as tf
+
+
+model_cfg = edict()
+model_cfg.loss_mode = "batch"
+LOSS_OP_NAME = "loss"
+LABEL_OP_NAME = "label"
+VAR_LIST = "variable"
+PRED_OP_NAME = "pred"
+
+
+class MyModel:
+    def __init__(self, expert_num =8, expert_size=16, tower_size=8, gate_num = 2):
+
+        self.expert_num = expert_num
+        self.expert_size = expert_size
+        self.tower_size = tower_size
+        self.gate_num = gate_num
+
+    
+    def expert_layer(self, input):
+        param_expert = []
+        for i in range(0, self.expert_num):
+            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name = f'expert_payer_{i}', 
+                                            kernel_initializer = tf.constant_initializer(value=0.1), 
+                                            bias_initializer = tf.constant_initializer(values = 0.1))
+            
+            param_expert.append(expert_linear)
+        return param_expert
+    
+    
+    def gate_layer(self, input):
+        param_gate = []
+        for i in range(0, self.gate_num):
+            gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name = f'gate_payer_{i}', 
+                                            kernel_initializer = tf.constant_initializer(value=0.1), 
+                                            bias_initializer = tf.constant_initializer(values = 0.1))
+            
+            param_gate.append(gate_linear)
+        return param_gate
+    
+    
+    def tower_layer(self, input, layer_name):
+        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name = f'tower_payer_{layer_name}', 
+                                            kernel_initializer = tf.constant_initializer(value=0.1), 
+                                            bias_initializer = tf.constant_initializer(values = 0.1))
+        
+        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name = f'tower_payer_out_{layer_name}', 
+                                            kernel_initializer = tf.constant_initializer(value=0.1), 
+                                            bias_initializer = tf.constant_initializer(values = 0.1))
+        
+        return tower_linear_out
+        
+        
+
+    
+    def build_model(self,
+                    embedding=None,
+                    dense_feature=None,
+                    label=None,
+                    is_training=True,
+                    seed=None):
+        """Build the MMoE graph; return a dict holding the loss, predictions, label and trainable variables."""
+        with tf.variable_scope("mmoe", reuse=tf.AUTO_REUSE):
+
+            dense_expert = self.expert_layer(dense_feature)
+            dense_gate = self.gate_layer(dense_feature)
+
+            all_expert = []
+            _slice_num = 0
+            for i in range(0, self.expert_num):
+                slice_num_end = _slice_num + self.expert_size
+                cur_expert = tf.add(dense_expert[i], embedding[:, _slice_num:slice_num_end])
+                cur_expert = tf.nn.relu(cur_expert)
+                all_expert.append(cur_expert)
+                _slice_num = slice_num_end
+
+            expert_concat = tf.concat(all_expert, axis=1)
+            expert_concat = tf.reshape(expert_concat, [-1, self.expert_num, self.expert_size])
+
+            output_layers = []
+            out_pred = []
+            for i in range(0, self.gate_num):
+                slice_gate_end = _slice_num + self.expert_num
+                cur_gate = tf.add(dense_gate[i], embedding[:, _slice_num:slice_gate_end])
+                cur_gate = tf.nn.softmax(cur_gate)
+
+                cur_gate = tf.reshape(cur_gate, [-1, self.expert_num, 1])
+
+                cur_gate_expert = tf.multiply(x=expert_concat, y=cur_gate)
+                cur_gate_expert = tf.reduce_sum(cur_gate_expert, axis=1)
+                out = self.tower_layer(cur_gate_expert, i)
+                output_layers.append(out)
+                out_pred.append(tf.nn.softmax(out[:, 1]))
+                _slice_num = slice_gate_end  # advance past this gate's columns so the next gate reads its own slice
+            trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mmoe')
+
+            label_income = label[:, 0:1]
+            label_mat = label[:, 1:]
+
+            pred_income_1 = tf.slice(output_layers[0], [0, 1], [-1, 1])
+            pred_marital_1 = tf.slice(output_layers[1], [0, 1], [-1, 1])
+
+            cost_income = tf.losses.log_loss(labels=tf.cast(label_income, tf.float32), predictions=pred_income_1,
+                                             epsilon=1e-4)
+            cost_marital = tf.losses.log_loss(labels=tf.cast(label_mat, tf.float32), predictions=pred_marital_1,
+                                              epsilon=1e-4)
+
+            avg_cost_income = tf.reduce_mean(cost_income)
+            avg_cost_marital = tf.reduce_mean(cost_marital)
+
+            loss = 0.5 * (avg_cost_income + avg_cost_marital)
+            
+            return {LOSS_OP_NAME: loss,
+                    PRED_OP_NAME: out_pred,
+                    LABEL_OP_NAME: label,
+                    VAR_LIST: trainable_variables}
-- 
Gitee


From f17973de35900ab90455e1933717c21161fe2a62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 22:39:52 +0800
Subject: [PATCH 04/16] cleancode

---
 examples/mmoe/config.py             | 197 ++++++++++++++++++++
 examples/mmoe/criteo.py             | 273 ++++++++++++++++++++++++++++
 examples/mmoe/delay_loss_scale.py   |  64 +++++++
 examples/mmoe/gradient_descent_w.py |  71 ++++++++
 examples/mmoe/mean_auc.py           |  40 ++++
 examples/mmoe/model.py              |  27 +--
 examples/mmoe/op_impl_mode.ini      |   1 +
 examples/mmoe/optimizer.py          |  35 ++++
 8 files changed, 695 insertions(+), 13 deletions(-)
 create mode 100644 examples/mmoe/config.py
 create mode 100644 examples/mmoe/criteo.py
 create mode 100644 examples/mmoe/delay_loss_scale.py
 create mode 100644 examples/mmoe/gradient_descent_w.py
 create mode 100644 examples/mmoe/mean_auc.py
 create mode 100644 examples/mmoe/op_impl_mode.ini
 create mode 100644 examples/mmoe/optimizer.py

diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py
new file mode 100644
index 00000000..d5540908
--- /dev/null
+++ b/examples/mmoe/config.py
@@ -0,0 +1,197 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import enum
+import os
+
+import tensorflow as tf
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+
+from mx_rec.constants.constants import CacheModeEnum
+
+SSD_DATA_PATH = ["ssd_data"]
+
+
+class LearningRateScheduler:
+    """
+    LR Scheduler combining Polynomial Decay with Warmup at the beginning.
+    TF-based cond operations necessary for performance in graph mode.
+    """
+
+    def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps):
+        self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32)
+        self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32)
+        self.decay_steps = tf.constant(decay_steps)
+        self.decay_end_step = decay_start_step + decay_steps  # 65041
+        self.poly_power = 2.0
+        self.base_lr_dense = base_lr_dense
+        self.base_lr_sparse = base_lr_sparse
+
+    def calc(self, global_step):
+        # used for the warmup stage
+        warmup_step = tf.cast(1 / self.warmup_steps, tf.float32)
+        lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step
+        lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32)
+        # used for the constant stage
+        lr_factor_constant = tf.cast(1.0, tf.float32)
+        
+        lr_sparse = self.base_lr_sparse * lr_factor_constant
+        lr_dense = self.base_lr_dense * lr_factor_constant
+        return lr_dense, lr_sparse
+
+
+class Config:
+    def __init__(self, ):
+        self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None
+        tmp = os.getenv("TRAIN_RANK_SIZE")
+        if tmp is None:
+            raise ValueError("please export TRAIN_RANK_SIZE")
+        self.rank_size = int(tmp)
+
+        self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH")
+        self.train_file_pattern = "train"
+        self.test_file_pattern = "test"
+
+        self.batch_size = 4096
+        self.line_per_sample = 1
+        self.train_epoch = 1
+        self.test_epoch = 9
+        self.perform_shuffle = False
+
+        self.key_type = tf.int64
+        self.label_type = tf.float32
+        self.value_type = tf.int64
+
+        self.feat_cnt = 26
+        self.__set_emb_table_size()
+
+        self.field_num = 26
+        self.send_count = 46000 // self.rank_size
+
+        self.emb_dim = 8
+        self.hashtable_threshold = 1
+
+        self.USE_PIPELINE_TEST = False
+
+        # 动态学习率
+        GLOBAL_BATCH_SIZE = 8192 * 8
+        LR_SCHEDULE_STEPS = [
+            int(2750 * 55296 / GLOBAL_BATCH_SIZE),
+            int(49315 * 55296 / GLOBAL_BATCH_SIZE),
+            int(27772 * 55296 / GLOBAL_BATCH_SIZE),
+        ]
+        self.global_step = tf.Variable(0, trainable=False)
+        _lr_scheduler = LearningRateScheduler(
+            0.001,
+            0.001,
+            LR_SCHEDULE_STEPS[0],
+            LR_SCHEDULE_STEPS[1],
+            LR_SCHEDULE_STEPS[2],
+        )
+        self.learning_rate = _lr_scheduler.calc(self.global_step)
+
+    def __set_emb_table_size(self):
+        self.cache_mode = os.getenv("CACHE_MODE")
+        if self.cache_mode is None:
+            raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]")
+
+        if self.cache_mode == CacheModeEnum.HBM.value:
+            self.dev_vocab_size = 14_000_000 * self.rank_size
+            self.host_vocab_size = 0
+        elif self.cache_mode == CacheModeEnum.DDR.value:
+            self.dev_vocab_size = 500_000 * self.rank_size
+            self.host_vocab_size = 24_000_000 * self.rank_size
+        elif self.cache_mode == CacheModeEnum.SSD.value:
+            self.dev_vocab_size = 100_000 * self.rank_size
+            self.host_vocab_size = 2_000_000 * self.rank_size
+            self.ssd_vocab_size = 24_000_000 * self.rank_size
+        else:
+            raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]")
+
+    def get_emb_table_cfg(self):
+        if self.cache_mode == CacheModeEnum.HBM.value:
+            return {"device_vocabulary_size": self.dev_vocab_size}
+        elif self.cache_mode == CacheModeEnum.DDR.value:
+            return {"device_vocabulary_size": self.dev_vocab_size,
+                    "host_vocabulary_size": self.host_vocab_size}
+        elif self.cache_mode == CacheModeEnum.SSD.value:
+            return {"device_vocabulary_size": self.dev_vocab_size,
+                    "host_vocabulary_size": self.host_vocab_size,
+                    "ssd_vocabulary_size": self.ssd_vocab_size,
+                    "ssd_data_path": SSD_DATA_PATH}
+        else:
+            raise RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation")
+
+
+def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"):
+    session_config = tf.ConfigProto(allow_soft_placement=False,
+                                    log_device_placement=False)
+    session_config.gpu_options.allow_growth = True
+    custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["mix_compile_mode"].b = False
+    custom_op.parameter_map["use_off_line"].b = True
+    custom_op.parameter_map["min_group_size"].b = 1
+    # 可选配置level0:pairwise;level1:pairwise
+    custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:fullmesh")
+    custom_op.parameter_map["enable_data_pre_proc"].b = True
+    custom_op.parameter_map["iterations_per_loop"].i = 10
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+    custom_op.parameter_map["hcom_parallel"].b = False
+    custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini")
+    custom_op.parameter_map["op_execute_timeout"].i = 2000
+    custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes(
+        str(13 * 1024 * 1024 * 1024))  # total 31 need 13;
+    custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(18 * 1024 * 1024 * 1024))  # need 25
+    custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes("DNN_VM_AICPU:3,AIcoreEngine:3")
+
+    if dump_data:
+        custom_op.parameter_map["enable_dump"].b = True
+        custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(dump_path)
+        custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes(dump_steps)
+        custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all")
+
+    session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+    session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
+
+    return session_config
+
+
+def get_npu_run_config():
+    session_config = tf.ConfigProto(allow_soft_placement=False,
+                                    log_device_placement=False)
+
+    session_config.gpu_options.allow_growth = True
+    custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+    session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF
+
+    run_config = NPURunConfig(
+        save_summary_steps=1000,
+        save_checkpoints_steps=100,
+        keep_checkpoint_max=5,
+        session_config=session_config,
+        log_step_count_steps=20,
+        precision_mode='allow_mix_precision',
+        enable_data_pre_proc=True,
+        iterations_per_loop=1,
+        jit_compile=False,
+        op_compiler_cache_mode="enable",
+        HCCL_algorithm="level0:fullmesh;level1:fullmesh"  # 可选配置：level0:pairwise;level1:pairwise
+    )
+    return run_config
diff --git a/examples/mmoe/criteo.py b/examples/mmoe/criteo.py
new file mode 100644
index 00000000..25f1d869
--- /dev/null
+++ b/examples/mmoe/criteo.py
@@ -0,0 +1,273 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import stat
+import pickle
+import argparse
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+from tqdm import tqdm
+
+NAMES = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)]
+
+
+def make_sub_file(lines, head, src_name, sub_dir_name, sub):
+    """Write sub-data.
+    
+    Args:
+        :param lines: A list. Several pieces of data.
+        :param head: A string. ['label', 'I1', 'I2', ...].
+        :param src_name: A string. The name of data.
+        :param sub_dir_name: A string.
+        :param sub: A scalar(Int). Record the current number of sub file.
+    :return: sub + 1.
+    """
+    root_path, file_path = os.path.split(src_name)
+    file_name, suffix = file_path.split('.')
+    split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix
+    split_file = os.path.join(root_path, sub_dir_name, split_file_name)
+    if not os.path.exists(os.path.join(root_path, sub_dir_name)):
+        os.mkdir(os.path.join(root_path, sub_dir_name))
+
+    modes = stat.S_IWUSR | stat.S_IRUSR
+    flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT
+    f = os.fdopen(os.open(split_file, flags, modes), 'w')
+    try:
+        f.writelines([head])
+        f.writelines(lines)
+        return sub + 1
+    finally:
+        f.close()
+
+
+def split_byline_count(filename, count, sub_dir_name):
+    """Split File.
+    Note: You can specify how many rows of data each sub file contains.
+    Args:
+        :param filename: A string.
+        :param count: A scalar(int).
+        :param sub_dir_name: A string.
+    :return:
+    """
+    f = open(filename, 'r')
+    try:
+        head = f.readline()
+        buf = []
+        sub = 1
+        for line in f:
+            buf.append(line)
+            if len(buf) == count:
+                sub = make_sub_file(buf, head, filename, sub_dir_name, sub)
+                buf = []
+        if len(buf) != 0:
+            try:
+                make_sub_file(buf, head, filename, sub_dir_name, sub)
+            except FileNotFoundError as err:
+                raise FileNotFoundError("please check the filename of data") from err
+    finally:
+        f.close()
+
+
+def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000):
+    """Get the list of split file path.
+    Note: Either parent_path or dataset_path must be valid.
+    If exists dataset_path + "/split", parent_path = dataset_path + "/split".
+    Args:
+        :param parent_path: A string. split file's parent path.
+        :param dataset_path: A string.
+        :param sample_num: A int. The sample number of every split file.
+    :return: A list. [file1_path, file2_path, ...]
+    """
+    sub_dir_name = 'split'
+    if parent_path is None and dataset_path is None:
+        raise ValueError('Please give parent path or file path.')
+    if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)):
+        parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name)
+    elif parent_path is None or not os.path.exists(parent_path):
+        split_byline_count(dataset_path, sample_num, sub_dir_name)
+        parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name)
+    split_file_name = os.listdir(parent_path)
+    split_file_name.sort()
+    split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt']
+    return split_file_list
+
+
+def get_fea_map(fea_map_path=None, split_file_list=None):
+    """Get feature map.
+    Note: Either fea_map_path or split_file_list must be valid.
+    If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid.
+    If fea_map_path is None and you want to build the feature map,
+    the default file path is the parent directory of split file + "fea_map.pkl".
+    Args:
+        :param fea_map_path: A string.
+        :param split_file_list: A list. [file1_path, file2_path, ...]
+    :return: A dict. {'C1':{}, 'C2':{}, ...}
+    """
+    if fea_map_path is None and split_file_list is None:
+        raise ValueError('Please give feature map path or split file list.')
+    if fea_map_path is None and split_file_list is not None:
+        fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl")
+    if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl':
+        # NOTE(review): pickle.load can execute arbitrary code; only load a fea_map.pkl from a trusted source.
+        with open(fea_map_path, 'rb') as f:
+            fea_map = pickle.load(f)
+        return fea_map
+    fea_map = {}
+    for file_open in tqdm(split_file_list):
+        with open(file_open) as f:  # closed even if a malformed row raises below
+            for line in f:
+                row = line.strip('\n').split('\t')
+                for i in range(14, 40):
+                    if row[i] == '':
+                        continue
+                    name = NAMES[i]
+                    fea_map.setdefault(name, {})
+                    if fea_map[name].get(row[i]) is None:
+                        fea_map[name][row[i]] = len(fea_map[name])
+                for j in range(1, 14):
+                    if row[j] == '':
+                        continue
+                    name = NAMES[j]
+                    fea_map.setdefault(name, {})
+                    fea_map[name].setdefault('min', float(row[j]))
+                    fea_map[name].setdefault('max', float(row[j]))
+                    fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j]))
+                    fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j]))
+    for i in range(14, 40):
+        cat_map = fea_map.setdefault(NAMES[i], {})  # a column with no values at all still gets its '-1' bucket
+        cat_map['-1'] = len(cat_map)
+    fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl")
+
+    modes = stat.S_IWUSR | stat.S_IRUSR
+    flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT
+    with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd:
+        pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL)
+
+    return fea_map
+
+
+def rec_kbins_discretizer(dat, n_bins, min_max_dict):
+    """Bin continuous data into intervals.
+    Note: The strategy is "uniform".
+    Args:
+        :param dat: A dataframe.
+        :param n_bins: A scalar(int).
+        :param min_max_dict: A dict such as {'min': , 'max': }.
+    :return: The new  dataframe.
+    """
+    features = dat.columns
+    n_features = len(features)
+    bin_edges = np.zeros(n_features, dtype=object)
+    for idx, feature in enumerate(features):
+        bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1)
+        rtol = 1.e-5
+        atol = 1.e-8
+        eps = atol + rtol * np.abs(dat[feature])
+        dat[feature] = np.digitize(dat[feature] + eps, bin_edges[idx][1:])
+    return dat
+
+
+def convert_input2tfrd(in_file_path, out_file_path):
+    """
+    txt to tfrecords
+    """
+    def make_example(label_list, dense_feat_list, sparse_feat_list):
+        dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1)
+        sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1)
+        label = np.array(label_list, dtype=np.int64).reshape(-1)
+        feature_dict = {
+                    "dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)),
+                    "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)),
+                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label))
+        }
+        example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
+
+        return example
+
+    file_name = out_file_path + in_file_path[-12:-4] + '.tfrecord'
+    file_writer = tf.io.TFRecordWriter(file_name)
+
+    with open(in_file_path, encoding='utf-8') as file_in:
+
+        for _, line in tqdm(enumerate(file_in)):
+
+            line = line.strip('\n')
+            items = line.split('\t')
+            if len(items) != 40:
+                continue
+            label = int(items[0])
+            dense = items[1:14]
+            sparse = items[14:]
+
+            ex = make_example(label, dense, sparse)
+            serialized = ex.SerializeToString()
+            file_writer.write(serialized)
+
+        file_writer.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Get datasets')
+    parser.add_argument('--data_path')
+    parser.add_argument('--output_path')
+
+    args, _ = parser.parse_known_args()
+    data_path = args.data_path
+    output_path = args.output_path
+
+    # get txt_list
+    file_split_list = get_split_file_path(dataset_path=data_path)
+    # get feature_map
+    feature_map = get_fea_map(split_file_list=file_split_list)
+
+    for file in tqdm(file_split_list):
+
+        # read data
+        data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES)
+        # name feature
+        sparse_features = ['C' + str(i) for i in range(1, 27)]
+        dense_features = ['I' + str(i) for i in range(1, 14)]
+        # data processing
+        data_df[sparse_features] = data_df[sparse_features].fillna('-1')
+        data_df[dense_features] = data_df[dense_features].fillna(0)
+        # sparse feature: mapping
+        for col in sparse_features:
+            try:
+                data_df[col] = data_df[col].map(lambda x: feature_map[col][x])
+            except KeyError as e:
+                raise KeyError("Feature {} not found in dataset".format(col)) from e
+        # dense feature: Bin continuous data into intervals.
+        data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map)
+        # add offsets
+        slot_size_array = [
+                        1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001,
+                        1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196,
+                        29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573
+        ]
+        offset_size_list = np.cumsum([0] + slot_size_array[:-1])
+        for col_index in range(1, len(offset_size_list) + 1):
+            data_df.iloc[:, col_index] += offset_size_list[col_index - 1]
+        # save to txt
+        data_df.to_csv(file, sep='\t', index=False, header=False)
+        # txt to tfrecords
+        convert_input2tfrd(in_file_path=file, out_file_path=output_path)
+
+
+
+
+
diff --git a/examples/mmoe/delay_loss_scale.py b/examples/mmoe/delay_loss_scale.py
new file mode 100644
index 00000000..f73baf68
--- /dev/null
+++ b/examples/mmoe/delay_loss_scale.py
@@ -0,0 +1,64 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import tensorflow as tf
+from tensorflow.python.training import optimizer
+
+from config import Config
+
+
+class DenseLossScaleOptimizer:
+    """Scale the loss by `loss_scale` before delegating gradient computation to the wrapped optimizer."""
+    def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None:
+        if not isinstance(opt, optimizer.Optimizer):
+            raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt))
+        self._optimizer, self._loss_scale = opt, tf.convert_to_tensor(loss_scale, tf.float32)
+        _update_lr_loss_scale(opt, loss_scale)  # validates loss_scale and divides the learning rate by it
+
+    def compute_gradients(self, loss, var_list=None):
+        return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list)
+
+    def apply_gradients(self, avg_grads):
+        return self._optimizer.apply_gradients(avg_grads)
+
+
+class SparseLossScaleOptimizer:
+    """Scale the loss by `loss_scale`, take raw `tf.gradients`, and apply them with the wrapped optimizer."""
+    def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None:
+        if not isinstance(opt, optimizer.Optimizer):
+            raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt))
+        self._optimizer, self._loss_scale = opt, tf.convert_to_tensor(loss_scale, tf.float32)
+        _update_lr_loss_scale(opt, loss_scale)  # validates loss_scale and divides the learning rate by it
+
+    def compute_gradients(self, loss, var_list=None):
+        return tf.gradients(loss * self._loss_scale, var_list)
+
+    def apply_gradients(self, grads_and_vars):
+        return self._optimizer.apply_gradients(grads_and_vars)
+
+
+def _update_lr_loss_scale(opt, loss_scale):
+    """Divide the learning rate stored on `opt` by `loss_scale`, in place.
+
+    `_lr` (LazyAdam / Adam) is checked before `_learning_rate` (SGD).
+    Raises RuntimeError if `loss_scale` is not positive or `opt` has neither field."""
+    if loss_scale <= 0:
+        raise RuntimeError("the loss_scale must be greater than zero.")
+    scale = tf.convert_to_tensor(loss_scale, tf.float32)
+    lr_attr = next((name for name in ("_lr", "_learning_rate") if hasattr(opt, name)), None)
+    if lr_attr is None:
+        raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.")
+    setattr(opt, lr_attr, getattr(opt, lr_attr) / scale)
\ No newline at end of file
diff --git a/examples/mmoe/gradient_descent_w.py b/examples/mmoe/gradient_descent_w.py
new file mode 100644
index 00000000..53adb996
--- /dev/null
+++ b/examples/mmoe/gradient_descent_w.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+
+import tensorflow as tf
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import gradient_descent
+from mx_rec.optimizers.base import CustomizedOptimizer
+from mx_rec.util.log import logger
+from mx_rec.util.initialize import ConfigInitializer
+
+
+def create_hash_optimizer(learning_rate, weight_decay=0.0001, use_locking=False, name="GradientDescent"):
+    """Build the weight-decay gradient-descent optimizer and store it on the ConfigInitializer optimizer config."""
+    sgd_with_decay = CustomizedGradientDescentWithWeighDecay(
+        learning_rate=learning_rate, weight_decay=weight_decay, use_locking=use_locking, name=name
+    )
+    ConfigInitializer.get_instance().optimizer_config.optimizer_instance = sgd_with_decay
+    return sgd_with_decay
+
+
+class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOptimizer, CustomizedOptimizer):
+    """Gradient descent for sparse variables; adds `weight_decay * var` rows to the gradient unless it is None."""
+    name_counter = defaultdict(int)
+
+    def __init__(self, learning_rate, weight_decay, use_locking=False, name="GradientDescent"):
+        self.optimizer_type = "gradient_descent_with_weight_decay"
+        self.weight_decay = weight_decay
+        # NOTE(review): `_get_name` presumably sets `self.unique_name` (read below) - confirm in CustomizedOptimizer.
+        super(CustomizedGradientDescentWithWeighDecay, self)._get_name(name=name)
+        super(CustomizedGradientDescentWithWeighDecay, self).__init__(
+            learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name
+        )
+        self._slot_num, self._derivative = 0, 1
+
+    def get_slot_init_values(self):
+        logger.info("no slot for gradient descent")
+        return []
+
+    def _apply_sparse_duplicate_indices(self, grad, var):
+        logger.debug(">>>> Enter _apply_sparse_duplicate_indices")
+        nd_indices = tf.expand_dims(grad.indices, 1)
+        logger.info(f"weight_decay={self.weight_decay}")
+        if self.weight_decay is None:
+            nd_value = grad.values * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
+        else:
+            nd_value = (grad.values + math_ops.cast(self.weight_decay, var.dtype.base_dtype) *
+                        tf.gather(var, grad.indices)) * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
+        return tf.scatter_nd_add(var, nd_indices, -nd_value, use_locking=self._use_locking)
+
+    def _apply_dense(self, grad, var):
+        logger.debug(">>>> Enter _apply_dense")
+        raise NotImplementedError("You are using a wrong type of variable.")
diff --git a/examples/mmoe/mean_auc.py b/examples/mmoe/mean_auc.py
new file mode 100644
index 00000000..ff57df00
--- /dev/null
+++ b/examples/mmoe/mean_auc.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+from glob import glob
+import numpy as np
+
+
+def split_auc(log_input):
+    """Average the values found on the 'Test' lines of `log_input` in consecutive groups of 8.
+
+    The value is the text after the last ':' in the part of a line before its first ';'."""
+    group = 8  # values averaged per output entry; a trailing partial group is dropped
+    with open(log_input, 'r') as log:
+        auc_values = [float(row.split(';')[0].split(':')[-1].strip()) for row in log if 'Test' in row]
+    usable = len(auc_values) - len(auc_values) % group
+    grouped = np.array(auc_values)[:usable].reshape(-1, group)
+    return np.mean(grouped, axis=-1)
+
+
+log_path_all = 'latest_*.log'  # glob pattern, resolved against the current working directory
+log_path_list = glob(log_path_all)
+# For every matching log print its file name, its grouped mean AUC values and a separator line.
+for log_path in log_path_list:
+    print(os.path.basename(log_path))
+    print(split_auc(log_path))
+    print('*'*20)
\ No newline at end of file
diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index 0046d2fd..5b1917a3 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -29,7 +29,7 @@ PRED_OP_NAME = "pred"
 
 
 class MyModel:
-    def __init__(self, expert_num =8, expert_size=16, tower_size=8, gate_num = 2):
+    def __init__(self, expert_num=8, expert_size=16, tower_size=8, gate_num=2):
 
         self.expert_num = expert_num
         self.expert_size = expert_size
@@ -40,9 +40,9 @@ class MyModel:
     def expert_layer(self, input):
         param_expert = []
         for i in range(0, self.expert_num):
-            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name = f'expert_payer_{i}', 
-                                            kernel_initializer = tf.constant_initializer(value=0.1), 
-                                            bias_initializer = tf.constant_initializer(values = 0.1))
+            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}',
+                                            kernel_initializer=tf.constant_initializer(value=0.1),
+                                            bias_initializer=tf.constant_initializer(value=0.1))  # kwarg is `value`
             
             param_expert.append(expert_linear)
         return param_expert
@@ -51,22 +51,23 @@ class MyModel:
     def gate_layer(self, input):
         param_gate = []
         for i in range(0, self.gate_num):
-            gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name = f'gate_payer_{i}', 
-                                            kernel_initializer = tf.constant_initializer(value=0.1), 
-                                            bias_initializer = tf.constant_initializer(values = 0.1))
+            gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}',
+                                            kernel_initializer=tf.constant_initializer(value=0.1),
+                                            bias_initializer=tf.constant_initializer(value=0.1))  # kwarg is `value`
             
             param_gate.append(gate_linear)
         return param_gate
     
     
     def tower_layer(self, input, layer_name):
-        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name = f'tower_payer_{layer_name}', 
-                                            kernel_initializer = tf.constant_initializer(value=0.1), 
-                                            bias_initializer = tf.constant_initializer(values = 0.1))
+        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}',
+                                            kernel_initializer=tf.constant_initializer(value=0.1),
+                                            bias_initializer=tf.constant_initializer(value=0.1))  # kwarg is `value`
         
-        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name = f'tower_payer_out_{layer_name}', 
-                                            kernel_initializer = tf.constant_initializer(value=0.1), 
-                                            bias_initializer = tf.constant_initializer(values = 0.1))
+        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None,
+                                            name=f'tower_payer_out_{layer_name}',
+                                            kernel_initializer=tf.constant_initializer(value=0.1),
+                                            bias_initializer=tf.constant_initializer(value=0.1))  # kwarg is `value`
         
         return tower_linear_out
         
diff --git a/examples/mmoe/op_impl_mode.ini b/examples/mmoe/op_impl_mode.ini
new file mode 100644
index 00000000..579dea43
--- /dev/null
+++ b/examples/mmoe/op_impl_mode.ini
@@ -0,0 +1 @@
+ScatterNdAdd=support_out_of_bound_index
\ No newline at end of file
diff --git a/examples/mmoe/optimizer.py b/examples/mmoe/optimizer.py
new file mode 100644
index 00000000..2c7685bb
--- /dev/null
+++ b/examples/mmoe/optimizer.py
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import tensorflow as tf
+from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer
+from mx_rec.util.initialize import ConfigInitializer
+from mx_rec.optimizers.lazy_adam import create_hash_optimizer
+from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address
+
+
+def get_dense_and_sparse_optimizer(cfg):
+    """Return `(dense, sparse)` optimizers built from `cfg.learning_rate[0]` and `cfg.learning_rate[1]`.
+
+    Both are wrapped in their loss-scale optimizer with a fixed scale of 1.
+    """
+    loss_scale = 1
+    adam = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0])
+    # Dynamic expansion selects the address-based sparse optimizer factory.
+    make_sparse = create_hash_optimizer_by_address \
+        if ConfigInitializer.get_instance().use_dynamic_expansion else create_hash_optimizer
+    sparse_wrapped = SparseLossScaleOptimizer(make_sparse(learning_rate=cfg.learning_rate[1]), loss_scale)
+    return DenseLossScaleOptimizer(adam, loss_scale), sparse_wrapped
-- 
Gitee


From fe7073494d499d161e16ce826175f744a17336eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 22:50:39 +0800
Subject: [PATCH 05/16] =?UTF-8?q?mmoe=20=E5=90=8A=E8=B5=B7=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/main_mxrec.py | 469 ++++++++++++++++++++++++++++++++++++
 1 file changed, 469 insertions(+)
 create mode 100644 examples/mmoe/main_mxrec.py

diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py
new file mode 100644
index 00000000..51ed7c4a
--- /dev/null
+++ b/examples/mmoe/main_mxrec.py
@@ -0,0 +1,469 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+import shutil
+import time
+import warnings
+import random
+from glob import glob
+
+import tensorflow as tf
+from sklearn.metrics import roc_auc_score
+import numpy as np
+
+from optimizer import get_dense_and_sparse_optimizer
+from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum
+from model import MyModel
+from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET
+from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func
+from mx_rec.core.asc.manager import start_asc_pipeline
+from mx_rec.core.embedding import create_table, sparse_lookup
+from mx_rec.core.feature_process import EvictHook
+from mx_rec.graph.modifier import modify_graph_and_start_emb_cache, GraphModifierHook
+from mx_rec.constants.constants import ASCEND_TIMESTAMP
+from mx_rec.util.initialize import ConfigInitializer, init, terminate_config_initializer
+from mx_rec.util.ops import import_host_pipeline_ops
+import mx_rec.util as mxrec_util
+from mx_rec.util.variable import get_dense_and_sparse_variable
+from mx_rec.util.log import logger
+from npu_bridge.npu_init import *
+
+npu_plugin.set_device_sat_mode(0)
+
+dense_hashtable_seed = 128
+sparse_hashtable_seed = 128
+shuffle_seed = 128
+random.seed(shuffle_seed)
+
+
+def add_timestamp_func(batch):
+    """Store the output of the host-pipeline `return_timestamp` op under batch["timestamp"] and return `batch`."""
+    label_as_int64 = tf.cast(batch['label'], dtype=tf.int64)
+    batch["timestamp"] = import_host_pipeline_ops().return_timestamp(label_as_int64)
+    return batch
+
+
+def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False):
+    """Build the TFRecord input pipeline and return `(batch, iterator)`.
+
+    Reads the module-level `MODIFY_GRAPH_FLAG`; when it is falsy the asc insert function is mapped in.
+    """
+    # One parallel reader/mapper for the pipeline test, eight otherwise.
+    num_parallel = 1 if config.USE_PIPELINE_TEST else 8
+    # Batches are counted in records; the shapes below hold `line_per_sample` rows per record.
+    records_per_batch = config.batch_size // config.line_per_sample
+
+    def extract_fn(data_record):
+        features = {
+            # Extract features using the keys set during creation
+            'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64),
+            'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64),
+            'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32),
+        }
+        return tf.compat.v1.parse_single_example(data_record, features)
+
+    def reshape_fn(batch):
+        batch['label'] = tf.reshape(batch['label'], [-1, 1])
+        dense = tf.reshape(batch['dense_feature'], [-1, 13])
+        batch['dense_feature'] = tf.math.log(dense + 3.0)
+        batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26])
+        return batch
+
+    file_pattern = config.train_file_pattern if is_training else config.test_file_pattern
+    files_list = glob(os.path.join(config.data_path, file_pattern) + '/*.tfrecord')
+    dataset = tf.data.TFRecordDataset(files_list, num_parallel_reads=num_parallel)
+    dataset = dataset.shard(config.rank_size, config.rank_id)
+    if is_training:
+        dataset = dataset.shuffle(records_per_batch * 1000, seed=shuffle_seed)
+        dataset = dataset.repeat(config.train_epoch)
+    else:
+        dataset = dataset.repeat(config.test_epoch)
+
+    dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel)
+    dataset = dataset.batch(records_per_batch, drop_remainder=True)
+    dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel)
+    if is_use_faae:
+        dataset = dataset.map(add_timestamp_func)
+
+    if not MODIFY_GRAPH_FLAG:
+        insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, is_training=is_training, dump_graph=dump_graph)
+        dataset = dataset.map(insert_fn)
+
+    # Buffer up to 100 batches ahead of the consumer.
+    dataset = dataset.prefetch(100)
+
+    iterator = dataset.make_initializable_iterator()
+    batch = iterator.get_next()
+    return batch, iterator
+
+
+def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph):
+    """Look up one embedding per (feature spec, table) pair, combine them and build `MyModel` on top.
+
+    Reads the module-level `MODIFY_GRAPH_FLAG`, `cfg` and `dense_hashtable_seed`.
+    """
+    logger.debug(f"In model_forward function, is_train: {is_train}, feature_list: {len(feature_list)}, "
+                 f"hash_table_list: {len(hash_table_list)}")
+    lookups = []
+    for spec, table in zip(feature_list, hash_table_list):
+        # With MODIFY_GRAPH_FLAG set, the batch's sparse tensor is looked up instead of the spec.
+        lookup_key = batch["sparse_feature"] if MODIFY_GRAPH_FLAG else spec
+        lookups.append(sparse_lookup(table, lookup_key, cfg.send_count, dim=None, is_train=is_train,
+                                     name="user_embedding_lookup", modify_graph=modify_graph, batch=batch,
+                                     access_and_evict_config=None))
+
+    if not lookups:
+        raise ValueError("the length of embedding_list must be greater than or equal to 1.")
+    # A single lookup is used as is; several are summed along axis 0.
+    emb = lookups[0] if len(lookups) == 1 else tf.reduce_sum(lookups, axis=0, keepdims=False)
+
+    return MyModel().build_model(embedding=emb,
+                                 dense_feature=batch["dense_feature"],
+                                 label=batch["label"],
+                                 is_training=is_train,
+                                 seed=dense_hashtable_seed)
+
+
+def evaluate():
+    """Run the eval graph for up to `eval_steps` steps and return `(auc, mean_log_loss)`."""
+    print("read_test dataset")
+    if MODIFY_GRAPH_FLAG:
+        # In sess.run mode, fetching the label of the original batch makes getnext time out,
+        # so the label has to be taken from the batch of the new dataset.
+        train_params = ConfigInitializer.get_instance().train_params_config
+        eval_label = train_params.get_target_batch(False).get("label")
+        sess.run([train_params.get_initializer(False)])
+    else:
+        eval_label = eval_model.get("label")
+        sess.run([eval_iterator.initializer])
+    log_loss_list, pred_list, label_list = [], [], []
+    eval_current_steps = 0
+    print("eval begin")
+
+    fetches = [eval_model.get("loss"), eval_model.get("pred"), eval_label]
+    while True:
+        eval_current_steps += 1
+        eval_start = time.time()
+        try:
+            eval_loss, pred, label = sess.run(fetches)
+        except tf.errors.OutOfRangeError:
+            break
+        qps_eval = (1 / (time.time() - eval_start)) * rank_size * cfg.batch_size
+        log_loss_list.extend(eval_loss.reshape(-1))
+        pred_list.extend(pred.reshape(-1))
+        label_list.extend(label.reshape(-1))
+        print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}")
+        if eval_current_steps == eval_steps:
+            break
+    auc = roc_auc_score(label_list, pred_list)
+    mean_log_loss = np.mean(log_loss_list)
+    return auc, mean_log_loss
+
+
+def evaluate_fix(step):
+    """Evaluate for up to `eval_steps` steps, save this rank's arrays and return `(auc, mean_log_loss)`.
+
+    Labels and predictions go to `./interval_{interval}/numpy_{step}/`; afterwards the function polls
+    until a `flag_{i}.txt` file exists in the working directory for every rank.
+    """
+    print("read_test dataset evaluate_fix")
+    if not MODIFY_GRAPH_FLAG:
+        eval_label = eval_model.get("label")
+        sess.run([eval_iterator.initializer])
+    else:
+        # With a modified graph the label must come from the new dataset's batch; fetching the
+        # original batch's label makes getnext time out.
+        train_params = ConfigInitializer.get_instance().train_params_config
+        eval_label = train_params.get_target_batch(False).get("label")
+        sess.run([train_params.get_initializer(False)])
+    log_loss_list = []
+    pred_list = []
+    label_list = []
+    eval_current_steps = 0
+    finished = False
+    print("eval begin")
+    while not finished:
+        try:
+            eval_current_steps += 1
+            eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label])
+            log_loss_list += list(eval_loss.reshape(-1))
+            pred_list += list(pred.reshape(-1))
+            label_list += list(label.reshape(-1))
+            print(f"eval current_steps: {eval_current_steps}")
+            if eval_current_steps == eval_steps:
+                finished = True
+        except tf.errors.OutOfRangeError:
+            finished = True
+
+    out_dir = os.path.join(os.path.abspath("."), f"interval_{interval}", f"numpy_{step}")
+    # exist_ok avoids a FileExistsError when several ranks create the directory at the same time.
+    os.makedirs(out_dir, exist_ok=True)
+    flag_file = f"flag_{rank_id}.txt"
+    if os.path.exists(flag_file):
+        os.remove(flag_file)
+    # np.save overwrites files left over from an earlier run, so no explicit removal is needed.
+    np.save(os.path.join(out_dir, f"label_{rank_id}.npy"), np.array(label_list))
+    np.save(os.path.join(out_dir, f"pred_{rank_id}.npy"), np.array(pred_list))
+    os.mknod(flag_file)
+    # Poll once per second until every rank has written its flag file.
+    while not all(os.path.exists(f"flag_{i}.txt") for i in range(rank_size)):
+        print("Waitting for saving numpy!!!!!!!!")
+        time.sleep(1)
+    print("All saved!!!!!!!!!!")
+
+    auc = roc_auc_score(label_list, pred_list)
+    mean_log_loss = np.mean(log_loss_list)
+    return auc, mean_log_loss
+
+
+def create_feature_spec_list(use_timestamp=False):
+    """Build the FeatureSpec list for the "sparse_embeddings" table.
+
+    The sparse spec is listed twice when the module-level `use_multi_lookup` is set. With
+    `use_timestamp` the sparse specs get access/eviction thresholds and a timestamp spec is appended.
+    """
+    access_threshold, eviction_threshold = (1000, 180) if use_timestamp else (None, None)
+
+    def sparse_spec():
+        return FeatureSpec("sparse_feature", table_name="sparse_embeddings", batch_size=cfg.batch_size,
+                           access_threshold=access_threshold, eviction_threshold=eviction_threshold)
+
+    # One spec per lookup of the shared table.
+    specs = [sparse_spec() for _ in range(2 if use_multi_lookup else 1)]
+    if use_timestamp:
+        specs.append(FeatureSpec("timestamp", is_timestamp=True))
+    return specs
+
+
+def _del_related_dir(del_path: str) -> None:
+    """Remove every directory matching the glob pattern `del_path`; relative patterns are joined to the cwd."""
+    pattern = del_path if os.path.isabs(del_path) else os.path.join(os.getcwd(), del_path)
+    for matched in glob(pattern):
+        # ignore_errors=True: removal is best effort and failures are not raised.
+        shutil.rmtree(matched, ignore_errors=True)
+        logger.info(f"Delete dir:{matched}")
+
+
+def _clear_saved_model() -> None:
+    """Delete directories left by earlier runs; in SSD cache mode also recreate the SSD data directories."""
+    for stale in ("/root/ascend/log/*", "kernel*", "model_dir_rank*", "op_cache"):
+        _del_related_dir(stale)
+
+    ssd_mode = os.getenv("CACHE_MODE", "") == CacheModeEnum.SSD.value
+    if not ssd_mode:
+        return
+    logger.info("Current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory"
+                " then create empty directory for this use case.")
+    for sub_path in SSD_DATA_PATH:
+        _del_related_dir(sub_path)
+        os.makedirs(sub_path, mode=0o550, exist_ok=True)  # NOTE(review): 0o550 has no write bit - confirm
+        logger.info(f"Create dir:{sub_path}")
+
+
+if __name__ == "__main__":
+    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+    warnings.filterwarnings("ignore")
+    _clear_saved_model()
+
+    rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None
+    rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None
+    interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None
+    train_steps = 10000
+    eval_steps = 1360
+
+    try:
+        use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0)))
+        use_multi_lookup = bool(int(os.getenv("USE_MULTI_LOOKUP", 0)))
+        MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0)))
+        use_faae = bool(int(os.getenv("USE_FAAE", 0)))
+    except ValueError as err:
+        raise ValueError("please correctly config USE_DYNAMIC_EXPANSION or USE_MULTI_LOOKUP or USE_FAAE "
+                         "or USE_MODIFY_GRAPH only 0 or 1 is supported.") from err
+
+    use_dynamic = bool(int(os.getenv("USE_DYNAMIC", 0)))
+    logger.info(f"USE_DYNAMIC:{use_dynamic}")
+    init(train_steps=train_steps, eval_steps=eval_steps,
+         use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion)
+    IF_LOAD = False
+    rank_id = mxrec_util.communication.hccl_ops.get_rank_id()
+    filelist = glob(f"./saved-model/sparse-model-0")
+    if filelist:
+        IF_LOAD = True
+    ConfigInitializer.get_instance().if_load = IF_LOAD
+
+    cfg = Config()
+    feature_spec_list_train = None
+    feature_spec_list_eval = None
+    if use_faae:
+        feature_spec_list_train = create_feature_spec_list(use_timestamp=True)
+        feature_spec_list_eval = create_feature_spec_list(use_timestamp=True)
+    else:
+        feature_spec_list_train = create_feature_spec_list(use_timestamp=False)
+        feature_spec_list_eval = create_feature_spec_list(use_timestamp=False)
+
+    train_batch, train_iterator = make_batch_and_iterator(cfg, feature_spec_list_train, is_training=True,
+                                                          dump_graph=True, is_use_faae=use_faae)
+    eval_batch, eval_iterator = make_batch_and_iterator(cfg, feature_spec_list_eval, is_training=False,
+                                                        dump_graph=False, is_use_faae=use_faae)
+    logger.info(f"train_batch: {train_batch}")
+
+    if use_faae:
+        cfg.dev_vocab_size = cfg.dev_vocab_size // 2
+
+    optimizer_list = [get_dense_and_sparse_optimizer(cfg)]
+
+    # note: variance_scaling_initializer only support HBM mode
+    emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \
+        if cfg.cache_mode != "HBM" or use_dynamic_expansion else \
+        tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed)
+    sparse_hashtable = create_table(
+        key_dtype=cfg.key_type,
+        dim=tf.TensorShape([cfg.emb_dim]),
+        name="sparse_embeddings",
+        emb_initializer=emb_initializer,
+        **cfg.get_emb_table_cfg()
+    )
+    if use_faae:
+        tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, train_batch["timestamp"])
+
+    sparse_hashtable_list = [sparse_hashtable, sparse_hashtable] if use_multi_lookup else [sparse_hashtable]
+    train_model = model_forward(feature_spec_list_train, sparse_hashtable_list, train_batch,
+                                is_train=True, modify_graph=MODIFY_GRAPH_FLAG)
+    eval_model = model_forward(feature_spec_list_eval, sparse_hashtable_list, eval_batch,
+                               is_train=False, modify_graph=MODIFY_GRAPH_FLAG)
+
+    dense_variables, sparse_variables = get_dense_and_sparse_variable()
+    trainable_varibles = []
+    trainable_varibles.extend(dense_variables)
+    if use_dynamic_expansion:
+        trainable_varibles.append(tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)[0])
+    else:
+        trainable_varibles.extend(sparse_variables)
+    rank_size = mxrec_util.communication.hccl_ops.get_rank_size()
+    train_ops = []
+    # multi task training
+    for loss, (dense_optimizer, sparse_optimizer) in zip([train_model.get("loss")], optimizer_list):
+        # do dense optimization
+        grads = dense_optimizer.compute_gradients(loss, var_list=trainable_varibles)
+        avg_grads = []
+        for grad, var in grads[:-1]:
+            if rank_size > 1:
+                grad = hccl_ops.allreduce(grad, "sum") if grad is not None else None
+            if grad is not None:
+                avg_grads.append((grad / 8.0, var))
+        # apply gradients: update variables
+        train_ops.append(dense_optimizer.apply_gradients(avg_grads))
+
+        if use_dynamic_expansion:
+            train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET)
+            # do sparse optimization by addr
+            sparse_grads = list(grads[-1])  # local_embedding
+            grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)]
+            train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
+        else:
+            # do sparse optimization
+            sparse_grads = list(grads[-1])
+            print("sparse_grads_tensor:", sparse_grads)
+            grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)]
+            train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
+
+    # 动态学习率更新
+    train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]])
+
+    with tf.control_dependencies(train_ops):
+        train_ops = tf.no_op()
+        cfg.learning_rate = [cfg.learning_rate[0], cfg.learning_rate[1]]
+
+    saver = tf.train.Saver()
+    if MODIFY_GRAPH_FLAG:
+        modify_graph_and_start_emb_cache(dump_graph=True)
+    else:
+        start_asc_pipeline()
+
+    hook_list = []
+    if use_faae:
+        hook_evict = EvictHook(evict_enable=True, evict_time_interval=120)
+        hook_list.append(hook_evict)
+        if MODIFY_GRAPH_FLAG:  # 该场景添加hook处理校验问题
+            hook_list.append(GraphModifierHook(modify_graph=False))
+
+    # with tf.compat.v1.Session(config=sess_config(dump_data=False)) as sess:
+    if use_faae:
+        sess = tf.compat.v1.train.MonitoredTrainingSession(
+            hooks=hook_list,
+            config=sess_config(dump_data=False)
+        )
+        sess.graph._unsafe_unfinalize()
+        if not MODIFY_GRAPH_FLAG:
+            sess.run(train_iterator.initializer)
+        else:
+            sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True))
+    else:
+        sess = tf.compat.v1.Session(config=sess_config(dump_data=False))
+        sess.run(tf.compat.v1.global_variables_initializer())
+        if not MODIFY_GRAPH_FLAG:
+            sess.run(train_iterator.initializer)
+        else:
+            sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True))
+
+    epoch = 0
+    cost_sum = 0
+    qps_sum = 0
+    best_auc = 0
+    iteration_per_loop = 10
+
+    train_ops = util.set_iteration_per_loop(sess, train_ops, 10)
+
+    # for i in range(1, TRAIN_STEPS):
+    i = 0
+    while True:
+        i += 1
+        logger.info(f"################    training at step {i * iteration_per_loop}    ################")
+        start_time = time.time()
+
+        try:
+            grad, loss = sess.run([train_ops, train_model.get("loss")])
+            lr = sess.run(cfg.learning_rate)
+            global_step = sess.run(cfg.global_step)
+        except tf.errors.OutOfRangeError:
+            logger.info(f"Encounter the end of Sequence for training.")
+            break
+
+        end_time = time.time()
+        cost_time = end_time - start_time
+        qps = (1 / cost_time) * rank_size * cfg.batch_size * iteration_per_loop
+        cost_sum += cost_time
+        logger.info(f"step: {i * iteration_per_loop}; training loss: {loss}")
+        logger.info(f"step: {i * iteration_per_loop}; grad: {grad}")
+        logger.info(f"step: {i * iteration_per_loop}; lr: {lr}")
+        logger.info(f"global step: {global_step}")
+        logger.info(f"step: {i * iteration_per_loop}; current sess cost time: {cost_time:.10f}; current QPS: {qps}")
+        logger.info(f"training at step:{i * iteration_per_loop}, table[{sparse_hashtable.table_name}], "
+                    f"table size:{sparse_hashtable.size()}, table capacity:{sparse_hashtable.capacity()}")
+
+        if i % (train_steps // iteration_per_loop) == 0:
+            if interval is not None:
+                test_auc, test_mean_log_loss = evaluate_fix(i * iteration_per_loop)
+            else:
+                test_auc, test_mean_log_loss = evaluate()
+            print("Test auc: {}; log_loss: {} ".format(test_auc, test_mean_log_loss))
+            best_auc = max(best_auc, test_auc)
+            logger.info(f"training step: {i * iteration_per_loop}, best auc: {best_auc}")
+
+    sess.close()
+
+    terminate_config_initializer()
+    logger.info("Demo done!")
-- 
Gitee


From 769164b3b7aff7766e4ffbec81e4766b13d75032 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 23:38:14 +0800
Subject: [PATCH 06/16] =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9=EF=BC=8C=E5=85=A5=E5=8F=A3=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/config.py     |  31 ++--
 examples/mmoe/criteo.py     | 273 ------------------------------------
 examples/mmoe/main_mxrec.py |  59 ++++----
 3 files changed, 51 insertions(+), 312 deletions(-)
 delete mode 100644 examples/mmoe/criteo.py

diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py
index d5540908..b87bc11b 100644
--- a/examples/mmoe/config.py
+++ b/examples/mmoe/config.py
@@ -42,10 +42,6 @@ class LearningRateScheduler:
         self.base_lr_sparse = base_lr_sparse
 
     def calc(self, global_step):
-        # used for the warmup stage
-        warmup_step = tf.cast(1 / self.warmup_steps, tf.float32)
-        lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step
-        lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32)
         # used for the constant stage
         lr_factor_constant = tf.cast(1.0, tf.float32)
         
@@ -66,10 +62,15 @@ class Config:
         self.train_file_pattern = "train"
         self.test_file_pattern = "test"
 
-        self.batch_size = 4096
+        self.batch_size = 32
         self.line_per_sample = 1
-        self.train_epoch = 1
-        self.test_epoch = 9
+        self.train_epoch = 100
+        self.test_epoch = 100
+        self.expert_num = 8
+        self.gate_num = 2
+        self.expert_size = 16
+        self.tower_size = 8
+        
         self.perform_shuffle = False
 
         self.key_type = tf.int64
@@ -82,7 +83,7 @@ class Config:
         self.field_num = 26
         self.send_count = 46000 // self.rank_size
 
-        self.emb_dim = 8
+        self.emb_dim = self.expert_num * self.expert_size + self.gate_num * self.expert_num
         self.hashtable_threshold = 1
 
         self.USE_PIPELINE_TEST = False
@@ -102,7 +103,7 @@ class Config:
             LR_SCHEDULE_STEPS[1],
             LR_SCHEDULE_STEPS[2],
         )
-        self.learning_rate = _lr_scheduler.calc(self.global_step)
+        self.learning_rate = _lr_scheduler.calc(self.global_step)
 
     def __set_emb_table_size(self):
         self.cache_mode = os.getenv("CACHE_MODE")
         self.cache_mode = os.getenv("CACHE_MODE")
@@ -110,15 +111,15 @@ class Config:
             raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]")
 
         if self.cache_mode == CacheModeEnum.HBM.value:
-            self.dev_vocab_size = 14_000_000 * self.rank_size
+            self.dev_vocab_size = 1000 * self.rank_size
             self.host_vocab_size = 0
         elif self.cache_mode == CacheModeEnum.DDR.value:
-            self.dev_vocab_size = 500_000 * self.rank_size
-            self.host_vocab_size = 24_000_000 * self.rank_size
+            self.dev_vocab_size = 1000 * self.rank_size
+            self.host_vocab_size = 1000 * self.rank_size
         elif self.cache_mode == CacheModeEnum.SSD.value:
-            self.dev_vocab_size = 100_000 * self.rank_size
-            self.host_vocab_size = 2_000_000 * self.rank_size
-            self.ssd_vocab_size = 24_000_000 * self.rank_size
+            self.dev_vocab_size = 1000 * self.rank_size
+            self.host_vocab_size = 1000 * self.rank_size
+            self.ssd_vocab_size = 1000 * self.rank_size
         else:
             raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]")
 
diff --git a/examples/mmoe/criteo.py b/examples/mmoe/criteo.py
deleted file mode 100644
index 25f1d869..00000000
--- a/examples/mmoe/criteo.py
+++ /dev/null
@@ -1,273 +0,0 @@
-# coding=utf-8
-# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-import os
-import stat
-import pickle
-import argparse
-import pandas as pd
-import numpy as np
-import tensorflow as tf
-from tqdm import tqdm
-
-NAMES = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)]
-
-
-def make_sub_file(lines, head, src_name, sub_dir_name, sub):
-    """Write sub-data.
-    
-    Args:
-        :param lines: A list. Several pieces of data.
-        :param head: A string. ['label', 'I1', 'I2', ...].
-        :param src_name: A string. The name of data.
-        :param sub_dir_name: A string.
-        :param sub: A scalar(Int). Record the current number of sub file.
-    :return: sub + 1.
-    """
-    root_path, file_path = os.path.split(src_name)
-    file_name, suffix = file_path.split('.')
-    split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix
-    split_file = os.path.join(root_path, sub_dir_name, split_file_name)
-    if not os.path.exists(os.path.join(root_path, sub_dir_name)):
-        os.mkdir(os.path.join(root_path, sub_dir_name))
-
-    modes = stat.S_IWUSR | stat.S_IRUSR
-    flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT
-    f = os.fdopen(os.open(split_file, flags, modes), 'w')
-    try:
-        f.writelines([head])
-        f.writelines(lines)
-        return sub + 1
-    finally:
-        f.close()
-
-
-def split_byline_count(filename, count, sub_dir_name):
-    """Split File.
-    Note: You can specify how many rows of data each sub file contains.
-    Args:
-        :param filename: A string.
-        :param count: A scalar(int).
-        :param sub_dir_name: A string.
-    :return:
-    """
-    f = open(filename, 'r')
-    try:
-        head = f.readline()
-        buf = []
-        sub = 1
-        for line in f:
-            buf.append(line)
-            if len(buf) == count:
-                sub = make_sub_file(buf, head, filename, sub_dir_name, sub)
-                buf = []
-        if len(buf) != 0:
-            try:
-                make_sub_file(buf, head, filename, sub_dir_name, sub)
-            except FileNotFoundError as err:
-                raise FileNotFoundError("please check the filename of data") from err
-    finally:
-        f.close()
-
-
-def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000):
-    """Get the list of split file path.
-    Note: Either parent_path or dataset_path must be valid.
-    If exists dataset_path + "/split", parent_path = dataset_path + "/split".
-    Args:
-        :param parent_path: A string. split file's parent path.
-        :param dataset_path: A string.
-        :param sample_num: A int. The sample number of every split file.
-    :return: A list. [file1_path, file2_path, ...]
-    """
-    sub_dir_name = 'split'
-    if parent_path is None and dataset_path is None:
-        raise ValueError('Please give parent path or file path.')
-    if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)):
-        parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name)
-    elif parent_path is None or not os.path.exists(parent_path):
-        split_byline_count(dataset_path, sample_num, sub_dir_name)
-        parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name)
-    split_file_name = os.listdir(parent_path)
-    split_file_name.sort()
-    split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt']
-    return split_file_list
-
-
-def get_fea_map(fea_map_path=None, split_file_list=None):
-    """Get feature map.
-    Note: Either parent_path or dataset_path must be valid.
-    If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid.
-    If fea_map_path is None and you want to build the feature map,
-    the default file path is the parent directory of split file + "fea_map.pkl".
-    Args:
-        :param fea_map_path: A string.
-        :param split_file_list: A list. [file1_path, file2_path, ...]
-    :return: A dict. {'C1':{}, 'C2':{}, ...}
-    """
-    if fea_map_path is None and split_file_list is None:
-        raise ValueError('Please give feature map path or split file list.')
-    if fea_map_path is None and split_file_list is not None:
-        fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl")
-    if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl':
-        with open(fea_map_path, 'rb') as f:
-            fea_map = pickle.load(f)
-        return fea_map
-    fea_map = {}
-    for file_open in tqdm(split_file_list):
-        f = open(file_open)
-        for line in f:
-            row = line.strip('\n').split('\t')
-            for i in range(14, 40):
-                if row[i] == '':
-                    continue
-                name = NAMES[i]
-                fea_map.setdefault(name, {})
-                if fea_map[name].get(row[i]) is None:
-                    fea_map[name][row[i]] = len(fea_map[name])
-            for j in range(1, 14):
-                if row[j] == '':
-                    continue
-                name = NAMES[j]
-                fea_map.setdefault(name, {})
-                fea_map[name].setdefault('min', float(row[j]))
-                fea_map[name].setdefault('max', float(row[j]))
-                fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j]))
-                fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j]))
-        f.close()
-    for i in range(14, 40):
-        fea_map[NAMES[i]]['-1'] = len(fea_map[NAMES[i]])
-    fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl")
-
-
-    modes = stat.S_IWUSR | stat.S_IRUSR
-    flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT
-    with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd:
-        pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL)
-
-    return fea_map
-
-
-def rec_kbins_discretizer(dat, n_bins, min_max_dict):
-    """Bin continuous data into intervals.
-    Note: The strategy is "uniform".
-    Args:
-        :param dat: A dataframe.
-        :param n_bins: A scalar(int).
-        :param min_max_dict: A dict such as {'min': , 'max': }.
-    :return: The new  dataframe.
-    """
-    features = dat.columns
-    n_features = len(features)
-    bin_edges = np.zeros(n_features, dtype=object)
-    for idx, feature in enumerate(features):
-        bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1)
-        rtol = 1.e-5
-        atol = 1.e-8
-        eps = atol + rtol * np.abs(dat[feature])
-        dat[feature] = np.digitize(dat[feature] + eps, bin_edges[idx][1:])
-    return dat
-
-
-def convert_input2tfrd(in_file_path, out_file_path):
-    """
-    txt to tfrecords
-    """
-    def make_example(label_list, dense_feat_list, sparse_feat_list):
-        dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1)
-        sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1)
-        label = np.array(label_list, dtype=np.int64).reshape(-1)
-        feature_dict = {
-                    "dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)),
-                    "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)),
-                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label))
-        }
-        example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
-
-        return example
-
-    file_name = out_file_path + in_file_path[-12:-4] + '.tfrecord'
-    file_writer = tf.io.TFRecordWriter(file_name)
-
-    with open(in_file_path, encoding='utf-8') as file_in:
-
-        for _, line in tqdm(enumerate(file_in)):
-
-            line = line.strip('\n')
-            items = line.split('\t')
-            if len(items) != 40:
-                continue
-            label = int(items[0])
-            dense = items[1:14]
-            sparse = items[14:]
-
-            ex = make_example(label, dense, sparse)
-            serialized = ex.SerializeToString()
-            file_writer.write(serialized)
-
-        file_writer.close()
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Get datasets')
-    parser.add_argument('--data_path')
-    parser.add_argument('--output_path')
-
-    args, _ = parser.parse_known_args()
-    data_path = args.data_path
-    output_path = args.output_path
-
-    # get txt_list
-    file_split_list = get_split_file_path(dataset_path=data_path)
-    # get feature_map
-    feature_map = get_fea_map(split_file_list=file_split_list)
-
-    for file in tqdm(file_split_list):
-
-        # read data
-        data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES)
-        # name feature
-        sparse_features = ['C' + str(i) for i in range(1, 27)]
-        dense_features = ['I' + str(i) for i in range(1, 14)]
-        # data processing
-        data_df[sparse_features] = data_df[sparse_features].fillna('-1')
-        data_df[dense_features] = data_df[dense_features].fillna(0)
-        # sparse feature: mapping
-        for col in sparse_features:
-            try:
-                data_df[col] = data_df[col].map(lambda x: feature_map[col][x])
-            except KeyError as e:
-                raise KeyError("Feature {} not found in dataset".format(col)) from e
-        # dense feature: Bin continuous data into intervals.
-        data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map)
-        # add offsets
-        slot_size_array = [
-                        1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001,
-                        1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196,
-                        29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573
-        ]
-        offset_size_list = np.cumsum([0] + slot_size_array[:-1])
-        for col_index in range(1, len(offset_size_list) + 1):
-            data_df.iloc[:, col_index] += offset_size_list[col_index - 1]
-        # save to txt
-        data_df.to_csv(file, sep='\t', index=False, header=False)
-        # txt to tfrecords
-        convert_input2tfrd(in_file_path=file, out_file_path=output_path)
-
-
-
-
-
diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py
index 51ed7c4a..e236cd2f 100644
--- a/examples/mmoe/main_mxrec.py
+++ b/examples/mmoe/main_mxrec.py
@@ -66,18 +66,17 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph,
     def extract_fn(data_record):
         features = {
             # Extract features using the keys set during creation
-            'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64),
-            'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64),
-            'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32),
+            'label': tf.compat.v1.FixedLenFeature(shape=(2 * config.line_per_sample,), dtype=tf.int64),
+            'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(29 * config.line_per_sample,), dtype=tf.int64),
+            'dense_feature': tf.compat.v1.FixedLenFeature(shape=(11 * config.line_per_sample,), dtype=tf.float32),
         }
         sample = tf.compat.v1.parse_single_example(data_record, features)
         return sample
 
     def reshape_fn(batch):
-        batch['label'] = tf.reshape(batch['label'], [-1, 1])
-        batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13])
-        batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0)
-        batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26])
+        batch['label'] = tf.reshape(batch['label'], [-1, 2])
+        batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 11])
+        batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 29])
         return batch
 
     if is_training:
@@ -129,6 +128,7 @@ def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph):
         emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False)
     else:
         raise ValueError("the length of embedding_list must be greater than or equal to 1.")
+    emb = tf.reduce_sum(emb, axis=1)
     my_model = MyModel()
     model_output = my_model.build_model(embedding=emb,
                                         dense_feature=batch["dense_feature"],
@@ -148,8 +148,10 @@ def evaluate():
         eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label")
         sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)])
     log_loss_list = []
-    pred_list = []
-    label_list = []
+    pred_income_list = []
+    pred_mat_list = []
+    label_income_list = []
+    label_mat_list = []
     eval_current_steps = 0
     finished = False
     print("eval begin")
@@ -162,16 +164,21 @@ def evaluate():
             eval_cost = time.time() - eval_start
             qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size
             log_loss_list += list(eval_loss.reshape(-1))
-            pred_list += list(pred.reshape(-1))
-            label_list += list(label.reshape(-1))
+            pred_income = pred[0]
+            pred_mat = pred[1]
+            pred_income_list += list(pred_income.reshape(-1))
+            pred_mat_list += list(pred_mat.reshape(-1))
+            label_income_list += list(label[:, 0].reshape(-1))
+            label_mat_list += list(label[:, 1].reshape(-1))
             print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}")
             if eval_current_steps == eval_steps:
                 finished = True
         except tf.errors.OutOfRangeError:
             finished = True
-    auc = roc_auc_score(label_list, pred_list)
+    auc_income = roc_auc_score(label_income_list, pred_income_list)
+    auc_mat = roc_auc_score(label_mat_list, pred_mat_list)
     mean_log_loss = np.mean(log_loss_list)
-    return auc, mean_log_loss
+    return auc_income, auc_mat, mean_log_loss
 
 
 def evaluate_fix(step):
@@ -281,8 +288,8 @@ if __name__ == "__main__":
     rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None
     rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None
     interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None
-    train_steps = 10000
-    eval_steps = 1360
+    train_steps = 1000
+    eval_steps = 1000
 
     try:
         use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0)))
@@ -326,9 +333,7 @@ if __name__ == "__main__":
     optimizer_list = [get_dense_and_sparse_optimizer(cfg)]
 
     # note: variance_scaling_initializer only support HBM mode
-    emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \
-        if cfg.cache_mode != "HBM" or use_dynamic_expansion else \
-        tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed)
+    emb_initializer = tf.constant_initializer(value=0.1)
     sparse_hashtable = create_table(
         key_dtype=cfg.key_type,
         dim=tf.TensorShape([cfg.emb_dim]),
@@ -422,7 +427,8 @@ if __name__ == "__main__":
     epoch = 0
     cost_sum = 0
     qps_sum = 0
-    best_auc = 0
+    best_auc_income = 0
+    best_auc_mat = 0
     iteration_per_loop = 10
 
     train_ops = util.set_iteration_per_loop(sess, train_ops, 10)
@@ -456,12 +462,17 @@ if __name__ == "__main__":
 
         if i % (train_steps // iteration_per_loop) == 0:
             if interval is not None:
-                test_auc, test_mean_log_loss = evaluate_fix(i * iteration_per_loop)
+                test_auc_income, test_auc_mat, test_mean_log_loss = evaluate_fix(i * iteration_per_loop)
             else:
-                test_auc, test_mean_log_loss = evaluate()
-            print("Test auc: {}; log_loss: {} ".format(test_auc, test_mean_log_loss))
-            best_auc = max(best_auc, test_auc)
-            logger.info(f"training step: {i * iteration_per_loop}, best auc: {best_auc}")
+                test_auc_income, test_auc_mat, test_mean_log_loss = evaluate()
+            print("Test auc income: {};Test auc mat: {} ;log_loss: {} ".format(test_auc_income,
+                                                                               test_auc_mat, test_mean_log_loss))
+            best_auc_income = max(best_auc_income, test_auc_income)
+            best_auc_mat = max(best_auc_mat, test_auc_mat)
+            logger.info(f"training step: {i * iteration_per_loop}, "
+                        f"best auc income: {best_auc_income}, "
+                        f"best auc mat: {best_auc_mat}")
+
 
     sess.close()
 
-- 
Gitee


From c70d9eebb72a4f818b88d1ea19cb1ba9d172d197 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 00:09:26 +0800
Subject: [PATCH 07/16] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/model.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index 5b1917a3..cf8ca108 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -42,7 +42,7 @@ class MyModel:
         for i in range(0, self.expert_num):
             expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
-                                            bias_initializer=tf.constant_initializer(values = 0.1))
+                                            bias_initializer=tf.constant_initializer(value=0.1))
             
             param_expert.append(expert_linear)
         return param_expert
@@ -53,7 +53,7 @@ class MyModel:
         for i in range(0, self.gate_num):
             gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
-                                            bias_initializer=tf.constant_initializer(values = 0.1))
+                                            bias_initializer=tf.constant_initializer(value=0.1))
             
             param_gate.append(gate_linear)
         return param_gate
@@ -62,12 +62,12 @@ class MyModel:
     def tower_layer(self, input, layer_name):
         tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
-                                            bias_initializer=tf.constant_initializer(values = 0.1))
+                                            bias_initializer=tf.constant_initializer(value=0.1))
         
         tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, 
                                             name=f'tower_payer_out_{layer_name}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
-                                            bias_initializer=tf.constant_initializer(values=0.1))
+                                            bias_initializer=tf.constant_initializer(value=0.1))
         
         return tower_linear_out
         
-- 
Gitee


From 66a629d05eafaeadb807d25b077e41cf5936f1c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 14:28:16 +0800
Subject: [PATCH 08/16] =?UTF-8?q?=E6=97=A0=E7=94=A8=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E5=88=A0=E9=99=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/gradient_descent_w.py | 71 -----------------------------
 1 file changed, 71 deletions(-)
 delete mode 100644 examples/mmoe/gradient_descent_w.py

diff --git a/examples/mmoe/gradient_descent_w.py b/examples/mmoe/gradient_descent_w.py
deleted file mode 100644
index 53adb996..00000000
--- a/examples/mmoe/gradient_descent_w.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# coding=utf-8
-# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import defaultdict
-
-import tensorflow as tf
-from tensorflow.python.ops import math_ops
-from tensorflow.python.training import gradient_descent
-from mx_rec.optimizers.base import CustomizedOptimizer
-from mx_rec.util.log import logger
-from mx_rec.util.initialize import ConfigInitializer
-
-
-def create_hash_optimizer(learning_rate, weight_decay=0.0001, use_locking=False, name="GradientDescent"):
-    optimizer = CustomizedGradientDescentWithWeighDecay(learning_rate=learning_rate,
-                                                        weight_decay=weight_decay,
-                                                        use_locking=use_locking,
-                                                        name=name)
-    ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer
-    return optimizer
-
-
-class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOptimizer, CustomizedOptimizer):
-    name_counter = defaultdict(int)
-
-    def __init__(self, learning_rate, weight_decay, use_locking=False, name="GradientDescent"):
-        self.optimizer_type = "gradient_descent_with_weight_decay"
-        self.weight_decay = weight_decay
-        super(CustomizedGradientDescentWithWeighDecay, self)._get_name(name=name)
-        super(CustomizedGradientDescentWithWeighDecay, self).__init__(
-            learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name
-        )
-        self._slot_num = 0
-        self._derivative = 1
-
-    def get_slot_init_values(self):
-        logger.info("no slot for gradient descent")
-        return []
-
-    def _apply_sparse_duplicate_indices(self, grad, var):
-        logger.debug(">>>> Enter _apply_sparse_duplicate_indices")
-        nd_indices = tf.expand_dims(grad.indices, 1)
-        logger.info(f"weigh_decay={self.weight_decay}")
-        if self.weight_decay is None:
-            nd_value = grad.values * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
-        else:
-            nd_value = (grad.values + math_ops.cast(self.weight_decay, var.dtype.base_dtype) *
-                        tf.gather(var, grad.indices)) * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
-        var_update_op = tf.scatter_nd_add(var, nd_indices, -nd_value, use_locking=self._use_locking)
-        return var_update_op
-
-    def _apply_dense(self, grad, var):
-        logger.debug(">>>> Enter _apply_dense")
-        raise NotImplementedError("You are using a wrong type of variable.")
-- 
Gitee


From aac7a3b3f4d613aea3c303d0266987e023daf62c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 17:20:36 +0800
Subject: [PATCH 09/16] =?UTF-8?q?=E5=90=8A=E8=B5=B7shell=E6=8F=90=E4=BA=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/run.sh | 99 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 examples/mmoe/run.sh

diff --git a/examples/mmoe/run.sh b/examples/mmoe/run.sh
new file mode 100644
index 00000000..6c142443
--- /dev/null
+++ b/examples/mmoe/run.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+cur_path=$(dirname "$(readlink -f "$0")")
+# Positional arguments: so_path, mx_rec_package_path, hccl_cfg_json, dlrm_criteo_data_path, [ip]
+so_path=$1
+mx_rec_package_path=$2
+hccl_cfg_json=$3
+dlrm_criteo_data_path=$4
+ip=$5  # only passed for the no-ranktable setup
+
+interface="lo"
+num_server=1
+local_rank_size=8
+num_process=$((num_server * local_rank_size))
+export TRAIN_RANK_SIZE=$num_process
+
+################# Parameter configuration ######################
+export USE_DYNAMIC=0            # 0: static shape; 1: dynamic shape
+export CACHE_MODE="HBM"         # HBM; DDR; SSD
+export USE_FAAE=0               # 0: feature admission and eviction off; 1: on
+export USE_DYNAMIC_EXPANSION=0  # 0: dynamic expansion off; 1: on
+export USE_MULTI_LOOKUP=0       # 0: one lookup per table; 1: multiple lookups per table
+export USE_MODIFY_GRAPH=0       # 0: feature spec mode; 1: automatic graph modification mode
+################################################
+echo "CACHE_MODE:${CACHE_MODE}"
+
+export HCCL_CONNECT_TIMEOUT=1200
+export DLRM_CRITEO_DATA_PATH=${dlrm_criteo_data_path}
+export PYTHONPATH=${mx_rec_package_path}:${so_path}:$PYTHONPATH
+export LD_PRELOAD=/usr/lib64/libgomp.so.1
+export LD_LIBRARY_PATH=${so_path}:/usr/local/lib:$LD_LIBRARY_PATH
+export ASCEND_DEVICE_ID=0
+export RANK_ID_START=0
+export JOB_ID=10086
+export CUSTOMIZED_OPS_LIB_PATH=${so_path}/libcust_ops.so # TODO: please configure
+export MXREC_LOG_LEVEL="INFO"
+export TF_CPP_MIN_LOG_LEVEL=3
+export ASCEND_GLOBAL_LOG_LEVEL=3
+#export USE_FAAE=1
+export ENABLE_FORCE_V2_CONTROL=1
+
+export PROFILING_OPTIONS='{"output":"/home/yz/profiling",
+                           "training_trace":"on",
+                           "task_trace":"on",
+                           "aicpu":"on",
+                           "fp_point":"",
+                           "bp_point":"",
+                           "aic_metrics":"PipeUtilization"}'
+
+RANK_ID_START=0
+
+export MXREC_MODE="ASC"
+echo "MXREC_MODE is $MXREC_MODE"
+export py=main_mxrec.py
+echo "py is $py"
+
+# Distinguish the ranktable and no-ranktable setups
+if [ -n "$ip" ]; then
+    # no-ranktable branch
+    echo "Current is no ranktable solution."
+    echo "Input node ip: $ip, please make sure this ip is available."
+    export CM_CHIEF_IP=$ip  # chief node ip
+    export CM_CHIEF_PORT=60001  # chief node listening port
+    export CM_CHIEF_DEVICE=0  # chief node device id
+    export CM_WORKER_IP=$ip  # current node ip
+    export CM_WORKER_SIZE=$num_process  # number of devices taking part in cluster training
+    echo "CM_CHIEF_IP=$CM_CHIEF_IP"
+    echo "CM_CHIEF_PORT=$CM_CHIEF_PORT"
+    echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE"
+    echo "CM_WORKER_IP=$CM_WORKER_IP"
+    echo "CM_WORKER_SIZE=$CM_WORKER_SIZE"
+else
+    # ranktable branch
+    echo "Current is ranktable solution, hccl json file:${hccl_cfg_json}"
+    export RANK_SIZE=$num_process
+    echo "RANK_SIZE=${RANK_SIZE}, please make sure hccl configuration json file match this parameter"
+    export RANK_TABLE_FILE=${hccl_cfg_json}
+fi
+
+echo "use horovod to start tasks"
+# GLOG_stderrthreshold: -2:TRACE -1:DEBUG 0:INFO 1:WARN 2:ERROR; default is INFO
+mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0'
+
+horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \
+python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p.log
-- 
Gitee


From 2bad2444eb05428de24da9a99e9f52496fcb4c67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 17:21:16 +0800
Subject: [PATCH 10/16] =?UTF-8?q?=E6=97=A0=E9=9C=80loss=5Fscale=E5=8A=9F?=
 =?UTF-8?q?=E8=83=BD=EF=BC=8C=E5=8E=BB=E9=99=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/delay_loss_scale.py | 64 -------------------------------
 1 file changed, 64 deletions(-)
 delete mode 100644 examples/mmoe/delay_loss_scale.py

diff --git a/examples/mmoe/delay_loss_scale.py b/examples/mmoe/delay_loss_scale.py
deleted file mode 100644
index f73baf68..00000000
--- a/examples/mmoe/delay_loss_scale.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# coding=utf-8
-# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-import tensorflow as tf
-from tensorflow.python.training import optimizer
-
-from config import Config
-
-
-class DenseLossScaleOptimizer:
-    def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None:
-        if not isinstance(opt, optimizer.Optimizer):
-            raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt))
-        self._optimizer = opt
-        self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32)
-        _update_lr_loss_scale(self._optimizer, loss_scale)
-
-    def compute_gradients(self, loss, var_list=None):
-        return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list)
-
-    def apply_gradients(self, avg_grads):
-        return self._optimizer.apply_gradients(avg_grads)
-
-
-class SparseLossScaleOptimizer:
-    def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None:
-        if not isinstance(opt, optimizer.Optimizer):
-            raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt))
-        self._optimizer = opt
-        self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32)
-        _update_lr_loss_scale(self._optimizer, loss_scale)
-
-    def compute_gradients(self, loss, var_list=None):
-        return tf.gradients(loss * self._loss_scale, var_list)
-
-    def apply_gradients(self, grads_and_vars):
-        return self._optimizer.apply_gradients(grads_and_vars)
-
-
-def _update_lr_loss_scale(opt, loss_scale):
-    if loss_scale <= 0:
-        raise RuntimeError("the loss_scale must be greater than zero.")
-    loss_scale = tf.convert_to_tensor(loss_scale, tf.float32)
-    if hasattr(opt, "_lr"):
-        # LazyAdam or Adam optimizer
-        opt._lr = opt._lr / loss_scale
-    elif hasattr(opt, "_learning_rate"):
-        # SGD optimizer
-        opt._learning_rate = opt._learning_rate / loss_scale
-    else:
-        raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.")
\ No newline at end of file
-- 
Gitee


From 6d08cf2ecb0290eafae0c4639c86f8eb85c43e47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 17:25:42 +0800
Subject: [PATCH 11/16] =?UTF-8?q?=E6=97=A0=E7=94=A8=E8=84=9A=E6=9C=AC?=
 =?UTF-8?q?=E5=88=A0=E9=99=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/mean_auc.py | 40 ---------------------------------------
 1 file changed, 40 deletions(-)
 delete mode 100644 examples/mmoe/mean_auc.py

diff --git a/examples/mmoe/mean_auc.py b/examples/mmoe/mean_auc.py
deleted file mode 100644
index ff57df00..00000000
--- a/examples/mmoe/mean_auc.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# coding=utf-8
-# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-import os
-from glob import glob
-import numpy as np
-
-
-def split_auc(log_input):
-    with open(log_input, 'r') as log:
-        all_auc = []
-        for line in log.readlines():
-            if 'Test' in line:
-                all_auc.append(float(line.split(';')[0].split(':')[-1].strip()))
-    all_auc_len = len(all_auc)
-    all_auc_arr = np.array(all_auc)[:all_auc_len - all_auc_len % 8]
-    test_auc = np.mean(all_auc_arr.reshape(-1, 8), axis=-1)
-    return test_auc
-
-
-log_path_all = 'latest_*.log'
-log_path_list = glob(log_path_all)
-
-for log_path in log_path_list:
-    print(os.path.basename(log_path))
-    print(split_auc(log_path))
-    print('*'*20)
\ No newline at end of file
-- 
Gitee


From 9845d170e50cd3087b4869fe070308230967e364 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 17:27:02 +0800
Subject: [PATCH 12/16] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/config.py        | 38 ++++++++-----------
 examples/mmoe/main_mxrec.py    | 67 ++++++++++++++--------------------
 examples/mmoe/model.py         | 15 +++++---
 examples/mmoe/op_impl_mode.ini |  1 -
 examples/mmoe/optimizer.py     |  6 +--
 5 files changed, 54 insertions(+), 73 deletions(-)

diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py
index b87bc11b..67ed7a20 100644
--- a/examples/mmoe/config.py
+++ b/examples/mmoe/config.py
@@ -32,16 +32,11 @@ class LearningRateScheduler:
     TF-based cond operations necessary for performance in graph mode.
     """
 
-    def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps):
-        self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32)
-        self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32)
-        self.decay_steps = tf.constant(decay_steps)
-        self.decay_end_step = decay_start_step + decay_steps  # 65041
-        self.poly_power = 2.0
+    def __init__(self, base_lr_dense, base_lr_sparse):
         self.base_lr_dense = base_lr_dense
         self.base_lr_sparse = base_lr_sparse
 
-    def calc(self, global_step):
+    def calc(self):
         # used for the constant stage
         lr_factor_constant = tf.cast(1.0, tf.float32)
         
@@ -51,7 +46,7 @@ class LearningRateScheduler:
 
 
 class Config:
-    def __init__(self, ):
+    def __init__(self, ) -> None:
         self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None
         tmp = os.getenv("TRAIN_RANK_SIZE")
         if tmp is None:
@@ -81,31 +76,30 @@ class Config:
         self.__set_emb_table_size()
 
         self.field_num = 26
-        self.send_count = 46000 // self.rank_size
+        self.send_count = self.get_send_count(self.rank_size)
 
         self.emb_dim = self.expert_num * self.expert_size + self.gate_num * self.expert_num
         self.hashtable_threshold = 1
 
         self.USE_PIPELINE_TEST = False
 
-        # 动态学习率
-        GLOBAL_BATCH_SIZE = 8192 * 8
-        LR_SCHEDULE_STEPS = [
-            int(2750 * 55296 / GLOBAL_BATCH_SIZE),
-            int(49315 * 55296 / GLOBAL_BATCH_SIZE),
-            int(27772 * 55296 / GLOBAL_BATCH_SIZE),
-        ]
         self.global_step = tf.Variable(0, trainable=False)
         _lr_scheduler = LearningRateScheduler(
             0.001,
-            0.001,
-            LR_SCHEDULE_STEPS[0],
-            LR_SCHEDULE_STEPS[1],
-            LR_SCHEDULE_STEPS[2],
+            0.001
         )
         self.learning_rate = _lr_scheduler.calc()
+        
+    def get_send_count(self, rank_size):
+        try:
+            return  46000 // rank_size
+        except ZeroDivisionError as exp:
+            raise ZeroDivisionError('Rank size can not be zero.') from exp
+        
+        
+    
 
-    def __set_emb_table_size(self):
+    def __set_emb_table_size(self) -> None:
         self.cache_mode = os.getenv("CACHE_MODE")
         if self.cache_mode is None:
             raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]")
@@ -123,7 +117,7 @@ class Config:
         else:
             raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]")
 
-    def get_emb_table_cfg(self):
+    def get_emb_table_cfg(self) -> None:
         if self.cache_mode == CacheModeEnum.HBM.value:
             return {"device_vocabulary_size": self.dev_vocab_size}
         elif self.cache_mode == CacheModeEnum.DDR.value:
diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py
index e236cd2f..0eb127dd 100644
--- a/examples/mmoe/main_mxrec.py
+++ b/examples/mmoe/main_mxrec.py
@@ -24,10 +24,7 @@ from glob import glob
 import tensorflow as tf
 from sklearn.metrics import roc_auc_score
 import numpy as np
-
-from optimizer import get_dense_and_sparse_optimizer
-from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum
-from model import MyModel
+from npu_bridge.npu_init import *
 from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET
 from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func
 from mx_rec.core.asc.manager import start_asc_pipeline
@@ -40,7 +37,9 @@ from mx_rec.util.ops import import_host_pipeline_ops
 import mx_rec.util as mxrec_util
 from mx_rec.util.variable import get_dense_and_sparse_variable
 from mx_rec.util.log import logger
-from npu_bridge.npu_init import *
+from optimizer import get_dense_and_sparse_optimizer
+from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum
+from model import MyModel
 
 npu_plugin.set_device_sat_mode(0)
 
@@ -52,7 +51,6 @@ random.seed(shuffle_seed)
 
 def add_timestamp_func(batch):
     timestamp = import_host_pipeline_ops().return_timestamp(tf.cast(batch['label'], dtype=tf.int64))
-    # tf.constant(np.random.randint(1,1688109060,1)), tf.int64))
     batch["timestamp"] = timestamp
     return batch
 
@@ -144,7 +142,8 @@ def evaluate():
         eval_label = eval_model.get("label")
         sess.run([eval_iterator.initializer])
     else:
-        # 在sess run模式下，若还是使用原来batch中的label去sess run，则会出现getnext超时报错，需要使用新数据集中的batch
+        # In sess run mode, if the label from the original batch is still used for sess run, 
+        # a getnext timeout error will occur, and a new batch from the new dataset needs to be used
         eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label")
         sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)])
     log_loss_list = []
@@ -157,24 +156,26 @@ def evaluate():
     print("eval begin")
 
     while not finished:
+        
+        eval_current_steps += 1
+        eval_start = time.time()
         try:
-            eval_current_steps += 1
-            eval_start = time.time()
             eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label])
-            eval_cost = time.time() - eval_start
-            qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size
-            log_loss_list += list(eval_loss.reshape(-1))
-            pred_income = pred[0]
-            pred_mat = pred[1]
-            pred_income_list += list(pred_income.reshape(-1))
-            pred_mat_list += list(pred_mat.reshape(-1))
-            label_income_list += list(label[:, 0].reshape(-1))
-            label_mat_list += list(label[:, 1].reshape(-1))
-            print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}")
-            if eval_current_steps == eval_steps:
-                finished = True
         except tf.errors.OutOfRangeError:
+            break
+        eval_cost = time.time() - eval_start
+        qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size
+        log_loss_list += list(eval_loss.reshape(-1))
+        pred_income = pred[0]
+        pred_mat = pred[1]
+        pred_income_list += list(pred_income.reshape(-1))
+        pred_mat_list += list(pred_mat.reshape(-1))
+        label_income_list += list(label[:, 0].reshape(-1))
+        label_mat_list += list(label[:, 1].reshape(-1))
+        print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}")
+        if eval_current_steps == eval_steps:
             finished = True
+        
     auc_income = roc_auc_score(label_income_list, pred_income_list)
     auc_mat = roc_auc_score(label_mat_list, pred_mat_list)
     mean_log_loss = np.mean(log_loss_list)
@@ -285,7 +286,6 @@ if __name__ == "__main__":
     warnings.filterwarnings("ignore")
     _clear_saved_model()
 
-    rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None
     rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None
     interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None
     train_steps = 1000
@@ -304,13 +304,8 @@ if __name__ == "__main__":
     logger.info(f"USE_DYNAMIC:{use_dynamic}")
     init(train_steps=train_steps, eval_steps=eval_steps,
          use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion)
-    IF_LOAD = False
+    
     rank_id = mxrec_util.communication.hccl_ops.get_rank_id()
-    filelist = glob(f"./saved-model/sparse-model-0")
-    if filelist:
-        IF_LOAD = True
-    ConfigInitializer.get_instance().if_load = IF_LOAD
-
     cfg = Config()
     feature_spec_list_train = None
     feature_spec_list_eval = None
@@ -385,14 +380,11 @@ if __name__ == "__main__":
             grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)]
             train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
 
-    # 动态学习率更新
-    train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]])
 
     with tf.control_dependencies(train_ops):
         train_ops = tf.no_op()
         cfg.learning_rate = [cfg.learning_rate[0], cfg.learning_rate[1]]
 
-    saver = tf.train.Saver()
     if MODIFY_GRAPH_FLAG:
         modify_graph_and_start_emb_cache(dump_graph=True)
     else:
@@ -405,7 +397,6 @@ if __name__ == "__main__":
         if MODIFY_GRAPH_FLAG:  # 该场景添加hook处理校验问题
             hook_list.append(GraphModifierHook(modify_graph=False))
 
-    # with tf.compat.v1.Session(config=sess_config(dump_data=False)) as sess:
     if use_faae:
         sess = tf.compat.v1.train.MonitoredTrainingSession(
             hooks=hook_list,
@@ -427,13 +418,12 @@ if __name__ == "__main__":
     epoch = 0
     cost_sum = 0
     qps_sum = 0
-    best_income_auc = 0
+    best_auc_income= 0
     best_auc_mat = 0
     iteration_per_loop = 10
 
     train_ops = util.set_iteration_per_loop(sess, train_ops, 10)
 
-    # for i in range(1, TRAIN_STEPS):
     i = 0
     while True:
         i += 1
@@ -441,9 +431,8 @@ if __name__ == "__main__":
         start_time = time.time()
 
         try:
-            grad, loss = sess.run([train_ops, train_model.get("loss")])
-            lr = sess.run(cfg.learning_rate)
-            global_step = sess.run(cfg.global_step)
+            grad, loss, lr, global_step = sess.run([train_ops, train_model.get("loss"), 
+                                                    cfg.learning_rate, cfg.global_step])
         except tf.errors.OutOfRangeError:
             logger.info(f"Encounter the end of Sequence for training.")
             break
@@ -469,9 +458,7 @@ if __name__ == "__main__":
                                                                                test_auc_mat,test_mean_log_loss))
             best_auc_income = max(best_auc_income, test_auc_income)
             best_auc_mat = max(best_auc_mat, test_auc_mat)
-            logger.info(f"training step: {i * iteration_per_loop}, 
-                        best auc income: {best_auc_income} , 
-                        best auc mat: {best_auc_mat}")
+            logger.info(f"training step: {i * iteration_per_loop}, best auc income: {best_auc_income} , best auc mat: {best_auc_mat}")
 
 
     sess.close()
diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index cf8ca108..224e8d6d 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -37,10 +37,10 @@ class MyModel:
         self.gate_num = gate_num
 
     
-    def expert_layer(self, input):
+    def expert_layer(self, _input):
         param_expert = []
         for i in range(0, self.expert_num):
-            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}', 
+            expert_linear = tf.layers.dense(_input, units=self.expert_size, activation=None, name=f'expert_layer_{i}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
                                             bias_initializer=tf.constant_initializer(value=0.1))
             
@@ -48,10 +48,10 @@ class MyModel:
         return param_expert
     
     
-    def gate_layer(self, input):
+    def gate_layer(self, _input):
         param_gate = []
         for i in range(0, self.gate_num):
-            gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}', 
+            gate_linear = tf.layers.dense(_input, units=self.expert_num, activation=None, name=f'gate_layer_{i}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
                                             bias_initializer=tf.constant_initializer(value=0.1))
             
@@ -59,8 +59,8 @@ class MyModel:
         return param_gate
     
     
-    def tower_layer(self, input, layer_name):
-        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}', 
+    def tower_layer(self, _input, layer_name):
+        tower_linear = tf.layers.dense(_input, units=self.tower_size, activation=None, name=f'tower_layer_{layer_name}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
                                             bias_initializer=tf.constant_initializer(value=0.1))
         
@@ -109,7 +109,10 @@ class MyModel:
 
                 cur_gate_expert = tf.multiply(x=expert_concat, y=cur_gate)
                 cur_gate_expert = tf.reduce_sum(cur_gate_expert, axis=1)
+                
                 out = self.tower_layer(cur_gate_expert, i)
+                out = tf.nn.softmax(out)
+                out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0-1e-15)
                 output_layers.append(out)
                 out_pred.append(tf.nn.softmax(out[:, 1]))
                 _slice_num = slice_num_end
diff --git a/examples/mmoe/op_impl_mode.ini b/examples/mmoe/op_impl_mode.ini
index 579dea43..e69de29b 100644
--- a/examples/mmoe/op_impl_mode.ini
+++ b/examples/mmoe/op_impl_mode.ini
@@ -1 +0,0 @@
-ScatterNdAdd=support_out_of_bound_index
\ No newline at end of file
diff --git a/examples/mmoe/optimizer.py b/examples/mmoe/optimizer.py
index 2c7685bb..5469c705 100644
--- a/examples/mmoe/optimizer.py
+++ b/examples/mmoe/optimizer.py
@@ -15,12 +15,13 @@
 # ==============================================================================
 
 import tensorflow as tf
-from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer
+
 from mx_rec.util.initialize import ConfigInitializer
 from mx_rec.optimizers.lazy_adam import create_hash_optimizer
 from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address
 
 
+
 def get_dense_and_sparse_optimizer(cfg):
     dense_optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0])
     use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion
@@ -28,8 +29,5 @@ def get_dense_and_sparse_optimizer(cfg):
         sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1])
     else:
         sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1])
-    loss_scale = 1
-    sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale)
-    dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale)
 
     return dense_optimizer, sparse_optimizer
-- 
Gitee


From ca2e82248c638e21066a3c6ae779d9409724d122 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 17:28:49 +0800
Subject: [PATCH 13/16] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index 224e8d6d..f18dbff0 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -60,11 +60,11 @@ class MyModel:
     
     
     def tower_layer(self, _input, layer_name):
-        tower_linear = tf.layers.dense(_input, units=self.tower_size, activation=None, name=f'tower_layer_{layer_name}', 
+        tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', name=f'tower_layer_{layer_name}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
                                             bias_initializer=tf.constant_initializer(value=0.1))
         
-        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, 
+        tower_linear_out = tf.layers.dense(tower_linear, units=2, activation=None, 
                                             name=f'tower_payer_out_{layer_name}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
                                             bias_initializer=tf.constant_initializer(value=0.1))
-- 
Gitee


From 13f3618364bae56befe067d91b75603f3bae4624 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 19:29:53 +0800
Subject: [PATCH 14/16] codecheck

---
 examples/mmoe/config.py     |  8 ++++----
 examples/mmoe/main_mxrec.py | 12 +++++++-----
 examples/mmoe/model.py      |  5 +++--
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py
index 67ed7a20..b6a83582 100644
--- a/examples/mmoe/config.py
+++ b/examples/mmoe/config.py
@@ -90,14 +90,14 @@ class Config:
         )
         self.learning_rate = _lr_scheduler.calc()
         
+        
+    @staticmethod
     def get_send_count(self, rank_size):
         try:
-            return  46000 // rank_size
+            return 46000 // rank_size
         except ZeroDivisionError as exp:
             raise ZeroDivisionError('Rank size can not be zero.') from exp
-        
-        
-    
+         
 
     def __set_emb_table_size(self) -> None:
         self.cache_mode = os.getenv("CACHE_MODE")
diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py
index 0eb127dd..d02566aa 100644
--- a/examples/mmoe/main_mxrec.py
+++ b/examples/mmoe/main_mxrec.py
@@ -25,6 +25,7 @@ import tensorflow as tf
 from sklearn.metrics import roc_auc_score
 import numpy as np
 from npu_bridge.npu_init import *
+from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum
 from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET
 from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func
 from mx_rec.core.asc.manager import start_asc_pipeline
@@ -38,7 +39,7 @@ import mx_rec.util as mxrec_util
 from mx_rec.util.variable import get_dense_and_sparse_variable
 from mx_rec.util.log import logger
 from optimizer import get_dense_and_sparse_optimizer
-from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum
+
 from model import MyModel
 
 npu_plugin.set_device_sat_mode(0)
@@ -328,7 +329,7 @@ if __name__ == "__main__":
     optimizer_list = [get_dense_and_sparse_optimizer(cfg)]
 
     # note: variance_scaling_initializer only support HBM mode
-    emb_initializer = tf.constant_initializer(value = 0.1)
+    emb_initializer = tf.constant_initializer(value=0.1)
     sparse_hashtable = create_table(
         key_dtype=cfg.key_type,
         dim=tf.TensorShape([cfg.emb_dim]),
@@ -418,7 +419,7 @@ if __name__ == "__main__":
     epoch = 0
     cost_sum = 0
     qps_sum = 0
-    best_auc_income= 0
+    best_auc_income = 0
     best_auc_mat = 0
     iteration_per_loop = 10
 
@@ -455,10 +456,11 @@ if __name__ == "__main__":
             else:
                 test_auc_income, test_auc_mat, test_mean_log_loss = evaluate()
             print("Test auc income: {};Test auc mat: {} ;log_loss: {} ".format(test_auc_income, 
-                                                                               test_auc_mat,test_mean_log_loss))
+                                                                               test_auc_mat, test_mean_log_loss))
             best_auc_income = max(best_auc_income, test_auc_income)
             best_auc_mat = max(best_auc_mat, test_auc_mat)
-            logger.info(f"training step: {i * iteration_per_loop}, best auc income: {best_auc_income} , best auc mat: {best_auc_mat}")
+            logger.info(f"training step: {i * iteration_per_loop}, best auc income: "
+                        f"{best_auc_income} , best auc mat: {best_auc_mat}")
 
 
     sess.close()
diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index f18dbff0..f8090373 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -60,7 +60,8 @@ class MyModel:
     
     
     def tower_layer(self, _input, layer_name):
-        tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', name=f'tower_layer_{layer_name}', 
+        tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', 
+                                            name=f'tower_layer_{layer_name}', 
                                             kernel_initializer=tf.constant_initializer(value=0.1), 
                                             bias_initializer=tf.constant_initializer(value=0.1))
         
@@ -112,7 +113,7 @@ class MyModel:
                 
                 out = self.tower_layer(cur_gate_expert, i)
                 out = tf.nn.softmax(out)
-                out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0-1e-15)
+                out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0 - 1e-15)
                 output_layers.append(out)
                 out_pred.append(tf.nn.softmax(out[:, 1]))
                 _slice_num = slice_num_end
-- 
Gitee


From e3ffcd9bffabc259852c0af58f43273272d655c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Tue, 23 Jul 2024 21:59:32 +0800
Subject: [PATCH 15/16] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index f8090373..8cbb7ba8 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -116,7 +116,7 @@ class MyModel:
                 out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0 - 1e-15)
                 output_layers.append(out)
                 out_pred.append(tf.nn.softmax(out[:, 1]))
-                _slice_num = slice_num_end
+                _slice_num = slice_gate_end
             trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mmoe')
 
             label_income = label[:, 0:1]
-- 
Gitee


From 8182c3f1f288eaa0c936b330ab8cd4e38dae8bff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Thu, 25 Jul 2024 16:20:15 +0800
Subject: [PATCH 16/16] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9=EF=BC=8C=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=E9=9D=99=E6=80=81=E5=87=BD=E6=95=B0=E5=BF=98=E8=AE=B0?=
 =?UTF-8?q?=E5=88=A0=E9=99=A4self?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py
index b6a83582..08cfe9e7 100644
--- a/examples/mmoe/config.py
+++ b/examples/mmoe/config.py
@@ -92,7 +92,7 @@ class Config:
         
         
     @staticmethod
-    def get_send_count(self, rank_size):
+    def get_send_count(rank_size):
         try:
             return 46000 // rank_size
         except ZeroDivisionError as exp:
-- 
Gitee