From 70f098fbccb4fbdcb1f61fe9bfdcfc54fe545709 Mon Sep 17 00:00:00 2001 From: z30043230 Date: Fri, 5 Sep 2025 15:27:07 +0800 Subject: [PATCH] =?UTF-8?q?nputrace=E5=92=8Cnpu-monitor=E5=88=87=E6=8D=A2?= =?UTF-8?q?=E6=97=B6=E7=9B=B4=E6=8E=A5=E6=8B=A6=E6=88=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- msmonitor/README.md | 2 +- .../dynolog_npu/cli/src/commands/nputrace.rs | 11 +++- .../dynolog/src/rpc/SimpleJsonServerInl.h | 57 +++++++++++-------- 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/msmonitor/README.md b/msmonitor/README.md index f38851159b..4632b8b702 100644 --- a/msmonitor/README.md +++ b/msmonitor/README.md @@ -76,7 +76,7 @@ dyno --certs-dir /home/client_certs nputrace --start-step 10 --iterations 2 --ac ## 📖 特性介绍 ⚠️ 由于底层资源限制,npumonitor功能和nputrace不能同时开启。 -1. 执行 dyno 命令后,响应结果里有一个 ‘response’ 的json字符串。该字符串中的 ‘commandStatus’ 字段用于标识命令是否生效:‘effective’ 表示命令会生效,‘ineffective’ 表示命令无效。其他字段均为 dynolog 的原生字段。 +1. 执行 dyno 命令后,响应结果里有一个 ‘response’ 的json字符串。该字符串中的 ‘commandStatus’ 字段用于标识命令是否生效:‘effective’ 表示命令会生效,‘ineffective’ 表示命令无效。其他字段均为 dynolog 的原生字段(仅状态为‘effective’时存在)。 ### 📈 npumonitor特性 npumonitor特性为用户提供轻量化监控关键指标的能力,npumonitor基于[MSPTI](https://www.hiascend.com/document/detail/zh/mindstudio/81RC1/T&ITools/Profiling/atlasprofiling_16_0021.html)开发,用户可以通过npumonitor查看模型运行时的计算、通信算子执行耗时。 diff --git a/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs b/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs index 66f5d576c8..5badb8c9f9 100644 --- a/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs +++ b/msmonitor/dynolog_npu/cli/src/commands/nputrace.rs @@ -164,7 +164,16 @@ pub fn run_nputrace( println!("response = {}", resp_str); let resp_v: Value = serde_json::from_str(&resp_str)?; - let processes = resp_v["processesMatched"].as_array().unwrap(); + let processes = if let Some(val) = resp_v.get("processesMatched") { + if let Some(arr) = val.as_array() { + arr + } else { + println!("'processesMatched' is not an array"); + return Ok(()); + } + } else { + return Ok(()); + }; if processes.is_empty() { println!("No processes were matched, please check --job-id or --pids flags"); diff --git a/msmonitor/dynolog_npu/dynolog/src/rpc/SimpleJsonServerInl.h b/msmonitor/dynolog_npu/dynolog/src/rpc/SimpleJsonServerInl.h index b51d973e4f..02643d0101 100644 --- a/msmonitor/dynolog_npu/dynolog/src/rpc/SimpleJsonServerInl.h +++ b/msmonitor/dynolog_npu/dynolog/src/rpc/SimpleJsonServerInl.h @@ -21,6 +21,7 @@ class SimpleJsonServer : public SimpleJsonServerBase { ~SimpleJsonServer() {} std::string processOneImpl(const std::string& request) override; + nlohmann::json handleSetKinetOnDemandRequest(const nlohmann::json& request); private: std::shared_ptr handler_; @@ -88,6 +89,38 @@ std::string GetCommandStatus(const std::string& configStr) } } +template +nlohmann::json SimpleJsonServer::handleSetKinetOnDemandRequest(const nlohmann::json& request) { + using json = nlohmann::json; + json response; + if (!request.contains("config") || !request.contains("pids")) { + response["status"] = "failed"; + return response; + } + try { + std::string config = request.value("config", ""); + std::vector pids = request.at("pids").get>(); + std::set pids_set{pids.begin(), pids.end()}; // TODO directly convert? + int job_id = request.value("job_id", 0); + int process_limit = request.value("process_limit", 1000); + auto commandStatus = GetCommandStatus(config); + if (commandStatus == "effective") { + auto result = handler_->setKinetOnDemandRequest(job_id, pids_set, config, process_limit); + response["processesMatched"] = result.processesMatched; + response["eventProfilersTriggered"] = result.eventProfilersTriggered; + response["activityProfilersTriggered"] = result.activityProfilersTriggered; + response["eventProfilersBusy"] = result.eventProfilersBusy; + response["activityProfilersBusy"] = result.activityProfilersBusy; + } + response["commandStatus"] = commandStatus; + } catch (const std::exception& ex) { + LOG(ERROR) << "setKinetOnDemandRequest: parsing exception = " << ex.what(); + response["status"] = fmt::format("failed with exception = {}", ex.what()); + } + return response; +} + + template std::string SimpleJsonServer::processOneImpl( const std::string& request_str) { @@ -105,29 +138,7 @@ std::string SimpleJsonServer::processOneImpl( } else if (request["fn"] == "getVersion") { response["version"] = handler_->getVersion(); } else if (request["fn"] == "setKinetOnDemandRequest") { - if (!request.contains("config") || !request.contains("pids")) { - response["status"] = "failed"; - } else { - try { - std::string config = request.value("config", ""); - std::vector pids = request.at("pids").get>(); - std::set pids_set{pids.begin(), pids.end()}; // TODO directly convert? - - int job_id = request.value("job_id", 0); - int process_limit = request.value("process_limit", 1000); - auto result = handler_->setKinetOnDemandRequest(job_id, pids_set, config, process_limit); - auto commandStatus = GetCommandStatus(config); - response["commandStatus"] = commandStatus; - response["processesMatched"] = result.processesMatched; - response["eventProfilersTriggered"] = result.eventProfilersTriggered; - response["activityProfilersTriggered"] = result.activityProfilersTriggered; - response["eventProfilersBusy"] = result.eventProfilersBusy; - response["activityProfilersBusy"] = result.activityProfilersBusy; - } catch (const std::exception& ex) { - LOG(ERROR) << "setKinetOnDemandRequest: parsing exception = " << ex.what(); - response["status"] = fmt::format("failed with exception = {}", ex.what()); - } - } + response = handleSetKinetOnDemandRequest(request); } else if (request["fn"] == "dcgmProfPause") { if (!request.contains("duration_s")) { response["status"] = "failed"; -- Gitee