From b697a75300b880e76be97ef332576e7d5c2601fc Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Tue, 18 Jun 2024 11:51:41 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91deploy=E7=94=9F=E6=88=90hccl.j?= =?UTF-8?q?son=20bug=E4=BF=AE=E6=94=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?= =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/deploymentworker.go | 4 ++-- pkg/ring-controller/ranktable/v2/ranktable.go | 1 + pkg/ring-controller/ranktable/v2/types.go | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pkg/ring-controller/agent/deploymentworker.go b/pkg/ring-controller/agent/deploymentworker.go index bde8391..69f830e 100644 --- a/pkg/ring-controller/agent/deploymentworker.go +++ b/pkg/ring-controller/agent/deploymentworker.go @@ -62,8 +62,8 @@ func (w *DeployWorker) doWork(pod *apiCoreV1.Pod, podInfo *podIdentifier) (bool, } // scenario check C: if current pod use chip, its' device info may not be ready // check basis: limits + annotations - if (podInfo.eventType == EventAdd || podInfo.eventType == EventUpdate) && !isPodAnnotationsReady(pod, - podInfo.String()) { + if (podInfo.eventType == EventAdd || podInfo.eventType == EventUpdate) && (!isPodAnnotationsReady(pod, + podInfo.String()) || pod.Status.PodIP == "") { return false, false } if w.configmapData.GetStatus() == ConfigmapCompleted { diff --git a/pkg/ring-controller/ranktable/v2/ranktable.go b/pkg/ring-controller/ranktable/v2/ranktable.go index aa31ff7..b99872e 100644 --- a/pkg/ring-controller/ranktable/v2/ranktable.go +++ b/pkg/ring-controller/ranktable/v2/ranktable.go @@ -56,6 +56,7 @@ func (r *RankTable) CachePodInfo(pod *apiCoreV1.Pod, instance ranktablev1.Instan // Build new server-level struct from device info server.ServerID = instance.ServerID server.PodID = pod.UID + server.ContainerIP = pod.Status.PodIP rankFactor := len(instance.Devices) if rankFactor > common.A800MaxChipNum { return fmt.Errorf("get error device num(%d), device num is too big", rankFactor) diff --git a/pkg/ring-controller/ranktable/v2/types.go b/pkg/ring-controller/ranktable/v2/types.go index b8beba6..d17b5c6 100644 --- a/pkg/ring-controller/ranktable/v2/types.go +++ b/pkg/ring-controller/ranktable/v2/types.go @@ -33,9 +33,10 @@ type RankTable struct { // Server to hccl type Server struct { - DeviceList []*Device `json:"device"` // device list in each server - ServerID string `json:"server_id"` // server id, represented by ip address - PodID types.UID `json:"-"` // pod id, equal to the last integer of pod name + DeviceList []*Device `json:"device"` // device list in each server + ServerID string `json:"server_id"` // server id, represented by ip address + PodID types.UID `json:"-"` // pod id, equal to the last integer of pod name + ContainerIP string `json:"container_ip,omitempty"` } // Device to hccl -- Gitee From f91a43494661eda252394cc7307e8c4e0581c072 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Tue, 18 Jun 2024 20:23:05 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91ranktable=E4=B8=AD=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0container=5Fip=E5=AD=97=E6=AE=B5=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/vcjobworker.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/ring-controller/agent/vcjobworker.go b/pkg/ring-controller/agent/vcjobworker.go index 38f1aa1..1e3b878 100644 --- a/pkg/ring-controller/agent/vcjobworker.go +++ b/pkg/ring-controller/agent/vcjobworker.go @@ -249,8 +249,8 @@ func (b *VCJobWorker) doPreCheck(pod *apiCoreV1.Pod, podInfo *podIdentifier) (bo } // scenario check C: if current pod use chip, its' device info may not be ready // check basis: limits + annotations - if (podInfo.eventType == EventAdd || podInfo.eventType == EventUpdate) && !isPodAnnotationsReady(pod, - podInfo.String()) { + if (podInfo.eventType == EventAdd || podInfo.eventType == EventUpdate) && (!isPodAnnotationsReady(pod, + podInfo.String()) || pod.Status.PodIP == "") { return false, false, fmt.Errorf("pod %s doesn't have device info, so no longer dealing with it", podInfo) } -- Gitee