From b065c3c8115e3b06d791e193980c7b519e265412 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Tue, 9 Jul 2024 20:35:32 +0800 Subject: [PATCH 01/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E4=BF=AE=E5=A4=8Ddeployment?= =?UTF-8?q?=E7=94=9F=E6=88=90ranktable=20bug=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?= =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/ranktable/v2/ranktable.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/ring-controller/ranktable/v2/ranktable.go b/pkg/ring-controller/ranktable/v2/ranktable.go index aa31ff7..f270a38 100644 --- a/pkg/ring-controller/ranktable/v2/ranktable.go +++ b/pkg/ring-controller/ranktable/v2/ranktable.go @@ -47,6 +47,10 @@ func (r *RankTable) BeforeUpdate() { // CachePodInfo :Cache pod info to RankTableV2 func (r *RankTable) CachePodInfo(pod *apiCoreV1.Pod, instance ranktablev1.Instance, rankStr string) error { + if _, ok := r.Servers.Load(pod.UID); ok { + return fmt.Errorf("%s/%s already exists in ranktable", pod.Namespace, pod.Name) + } + rankIndex, err := strconv.Atoi(rankStr) if err != nil { return fmt.Errorf("conv rankStr(%s) to int failed, err: %v", rankStr, err) -- Gitee From e86420b25de2ab8ab7789349ac5d3f219931ee97 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 10:15:52 +0800 Subject: [PATCH 02/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E4=BF=AE=E5=A4=8Ddeployment?= =?UTF-8?q?=E7=94=9F=E6=88=90ranktable=20bug=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?= =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/vcjobworker.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/ring-controller/agent/vcjobworker.go b/pkg/ring-controller/agent/vcjobworker.go index 38f1aa1..851d9db 100644 --- a/pkg/ring-controller/agent/vcjobworker.go +++ b/pkg/ring-controller/agent/vcjobworker.go @@ -354,17 +354,17 @@ func (b *WorkerInfo) handleAddUpdateEvent(podInfo *podIdentifier, pod *apiCoreV1 if rankExist { return fmt.Errorf("pod %s/%s already has rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) } + rankIndexStr = strconv.Itoa(int(b.rankIndex)) - err := b.updatePod(pod, func(newPod *apiCoreV1.Pod) { - rankIndexStr = strconv.Itoa(int(b.rankIndex)) - newPod.Annotations[PodRankIndexKey] = rankIndexStr - }) + // Cache device info from the pod + err := b.configmapData.CachePodInfo(pod, instance, rankIndexStr) if err != nil { return err } - // Cache device info from the pod - err = b.configmapData.CachePodInfo(pod, instance, rankIndexStr) + err = b.updatePod(pod, func(newPod *apiCoreV1.Pod) { + newPod.Annotations[PodRankIndexKey] = rankIndexStr + }) if err != nil { return err } -- Gitee From b60303dd7892abbf4b0fc70ebc365b199a2ac090 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 14:45:47 +0800 Subject: [PATCH 03/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= 
=?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/build.sh b/build/build.sh index 32ec5d7..7865742 100644 --- a/build/build.sh +++ b/build/build.sh @@ -20,7 +20,7 @@ cur_dir=$(dirname "$(readlink -f "$0")") top_dir=$(realpath "${cur_dir}"/..) export GO111MODULE="on" ver_file="${top_dir}"/service_config.ini -build_version="v5.0.RC1" +build_version="v6.0.RC3" if [ -f "$ver_file" ]; then line=$(sed -n '1p' "$ver_file" 2>&1) #cut the chars after ':' and add char 'v', the final example is v3.0.0 -- Gitee From 4b998ef24574d31d27dd42315ecd89e98de7d572 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 20:47:04 +0800 Subject: [PATCH 04/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/hccl-controller.yaml | 2 +- main.go | 6 +- pkg/ring-controller/agent/businessagent.go | 347 ++++++++++++++++--- pkg/ring-controller/agent/types.go | 15 +- pkg/ring-controller/agent/vcjobworker.go | 137 ++++---- pkg/ring-controller/controller/controller.go | 2 +- pkg/ring-controller/model/types.go | 4 + 7 files changed, 393 insertions(+), 120 deletions(-) diff --git a/build/hccl-controller.yaml b/build/hccl-controller.yaml index f5db08b..1382db5 100644 --- a/build/hccl-controller.yaml +++ b/build/hccl-controller.yaml @@ -20,7 +20,7 @@ rules: verbs: ["get","list","watch"] - apiGroups: [""] resources: ["configmaps"] - verbs: ["get", "update"] + verbs: ["get", "update", "list", "watch"] --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 diff --git a/main.go b/main.go index af29347..79219da 100644 --- a/main.go +++ b/main.go @@ -95,9 +95,13 @@ func main() { } jobInformer := jobInformerFactory.Batch().V1alpha1().Jobs() deploymentInformer := deploymentFactory.Apps().V1().Deployments() - cacheIndexer := make(map[string]cache.Indexer, 1) + rsInformer := deploymentFactory.Apps().V1().ReplicaSets() + cmInformer := deploymentFactory.Core().V1().ConfigMaps() + cacheIndexer := make(map[string]cache.Indexer) cacheIndexer[model.VCJobType] = jobInformer.Informer().GetIndexer() cacheIndexer[model.DeploymentType] = deploymentInformer.Informer().GetIndexer() + cacheIndexer[model.ReplicaSetType] = rsInformer.Informer().GetIndexer() + cacheIndexer[model.ConfigmapType] = cmInformer.Informer().GetIndexer() control, err := controller.NewEventController(kubeClient, jobClient, newConfig(), controller.InformerInfo{JobInformer: jobInformer, DeployInformer: deploymentInformer, CacheIndexers: cacheIndexer}, stopCh) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index a1c7f5d..c8b5dae 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -17,13 +17,24 @@ package agent import ( "context" + "encoding/json" + "errors" "fmt" + "hccl-controller/pkg/ring-controller/model" + ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" + v2 "hccl-controller/pkg/ring-controller/ranktable/v2" + appsV1 "k8s.io/api/apps/v1" + "k8s.io/client-go/util/retry" "reflect" "strings" + "sync" 
+ "sync/atomic" "time" + "volcano.sh/apis/pkg/apis/batch/v1alpha1" "huawei.com/npu-exporter/v5/common-utils/hwlog" apiCoreV1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -49,7 +60,8 @@ func (p *podIdentifier) String() string { // implemented in the form of worker interface in the agent framework run. // Agent monitors POD events with a specific label and implements the // combination of tasks through different workers at different times. -func NewBusinessAgent(kubeClientSet kubernetes.Interface, recorder record.EventRecorder, config *Config, +func NewBusinessAgent(kubeClientSet kubernetes.Interface, indexers map[string]cache.Indexer, recorder record.EventRecorder, + config *Config, stopCh <-chan struct{}) (*BusinessAgent, error) { // create pod informer factory temp, newErr := labels.NewRequirement(Key910, selection.In, []string{Val910B, Val910}) @@ -69,6 +81,7 @@ func NewBusinessAgent(kubeClientSet kubernetes.Interface, recorder record.EventR informerFactory: podInformerFactory, podInformer: podInformerFactory.Core().V1().Pods().Informer(), PodsIndexer: podInformerFactory.Core().V1().Pods().Informer().GetIndexer(), + Indexers: indexers, Workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter( retryMilliSecond*time.Millisecond, threeMinutes*time.Second), "Pods"), KubeClientSet: kubeClientSet, @@ -168,24 +181,88 @@ func (b *BusinessAgent) doWork(obj interface{}) bool { func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, podKeyInfo *podIdentifier) bool { // Lock to safely obtain worker data in the Map - b.RwMutex.RLock() - defer b.RwMutex.RUnlock() - bsnsWorker, workerExist := b.BusinessWorker[podKeyInfo.jobUid] - hwlog.RunLog.Debugf(" worker : \n %+v", b.BusinessWorker) - if !workerExist { - if !podExist { + //b.RwMutex.RLock() + //defer b.RwMutex.RUnlock() + //bsnsWorker, workerExist := b.BusinessWorker[podKeyInfo.jobUid] + //hwlog.RunLog.Debugf(" worker : \n %+v", b.BusinessWorker) + //if !workerExist { + // if !podExist { + // b.Workqueue.Forget(obj) + // hwlog.RunLog.Infof("syncing '%s' terminated: current obj is no longer exist", podKeyInfo.String()) + // return true + // } + // // if someone create a single 910 pod without a job, how to handle? + // hwlog.RunLog.Debugf("syncing '%s' delayed: corresponding job worker may be uninitialized", + // podKeyInfo.String()) + // return false + //} + labelSeletor := map[string]string{} + replicas := int32(0) + if podKeyInfo.ownerKind == "Job" { + jobIndexer, ok := b.Indexers[model.VCJobType] + if !ok { b.Workqueue.Forget(obj) - hwlog.RunLog.Infof("syncing '%s' terminated: current obj is no longer exist", podKeyInfo.String()) + hwlog.RunLog.Errorf("job indexer not exist") return true } - // if someone create a single 910 pod without a job, how to handle? 
- hwlog.RunLog.Debugf("syncing '%s' delayed: corresponding job worker may be uninitialized", - podKeyInfo.String()) - return false + + jobObj, jobExist, err := jobIndexer.GetByKey(podKeyInfo.namespace + "/" + podKeyInfo.ownerName) + if err != nil { + hwlog.RunLog.Errorf("syncing '%s' failed: failed to get obj from indexer", podKeyInfo) + return false + } + if !jobExist { + if !podExist { + b.RwMutex.Lock() + delete(b.configMapCache, podKeyInfo.ownerUid) + b.RwMutex.Unlock() + b.Workqueue.Forget(obj) + hwlog.RunLog.Infof("syncing '%s' terminated: current obj is no longer exist", podKeyInfo.String()) + return true + } + hwlog.RunLog.Errorf("syncing '%s' failed: corresponding job not cache", podKeyInfo) + return false + } + labelSeletor = map[string]string{ + VolcanoJobNameKey: podKeyInfo.name, + VolcanoJobNamespaceKey: podKeyInfo.namespace, + } + job, ok := jobObj.(*v1alpha1.Job) + if !ok { + b.Workqueue.Forget(obj) + hwlog.RunLog.Errorf("syncing '%s' failed: failed to convert obj to job", podKeyInfo) + return true + } + for _, task := range job.Spec.Tasks { + replicas += task.Replicas + } + + } else if podKeyInfo.ownerKind == "ReplicaSet" { + rsIndexer := b.Indexers[model.ReplicaSetType] + rsObj, exist, err := rsIndexer.GetByKey(podKeyInfo.namespace + "/" + podKeyInfo.ownerName) + if err != nil { + hwlog.RunLog.Errorf("syncing '%s' failed: failed to get obj from api-server", podKeyInfo) + return false + } + if !exist { + if !podExist { + b.RwMutex.Lock() + delete(b.configMapCache, podKeyInfo.ownerUid) + b.RwMutex.Unlock() + b.Workqueue.Forget(obj) + hwlog.RunLog.Infof("syncing '%s' terminated: current obj is no longer exist", podKeyInfo.String()) + return true + } + hwlog.RunLog.Errorf("syncing '%s' failed: corresponding rs not cache", podKeyInfo) + return false + } + labelSeletor = rsObj.(metav1.Object).GetLabels() + replicas = *rsObj.(*appsV1.ReplicaSet).Spec.Replicas } + if podKeyInfo.eventType == EventDelete { b.Workqueue.Forget(obj) - if err := bsnsWorker.handleDeleteEvent(podKeyInfo); err != nil { + if err := b.handleDeleteEvent(podKeyInfo); err != nil { // only logs need to be recorded. 
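// A guarded variant of the ReplicaSet replica count read above, as a sketch;
// the nil check is an editorial assumption (apps/v1 Spec.Replicas is a
// *int32 that the API server defaults to 1, so the patch's direct
// dereference relies on the cache always carrying the defaulted object):
func rsReplicas(rs *appsV1.ReplicaSet) int32 {
	if rs.Spec.Replicas != nil {
		return *rs.Spec.Replicas
	}
	return 1 // API-server default when unset
}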
hwlog.RunLog.Errorf("handleDeleteEvent error, error is %s", err) } @@ -202,14 +279,207 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p return true } - // if worker exist && pod exist, need check some special scenarios - hwlog.RunLog.Debugf("successfully synced '%s'", podKeyInfo) + if err := b.handleAddUpdateEvent(pod, podKeyInfo, labelSeletor, replicas); err != nil { + hwlog.RunLog.Errorf("handleAddUpdateEvent error, error is %s", err) + return false + } - forgetQueue, finished := bsnsWorker.doWork(pod, podKeyInfo) - if forgetQueue { - b.Workqueue.Forget(obj) + return true +} + +func (b *BusinessAgent) handleDeleteEvent(podInfo *podIdentifier) error { + b.RwMutex.Lock() + defer b.RwMutex.Unlock() + rankTable, ok := b.configMapCache[podInfo.ownerUid] + if !ok { + hwlog.RunLog.Debugf("handleDeleteEvent error, ranktable not exist") + return nil } - return finished + status := rankTable.GetStatus() + err := rankTable.RemovePodInfo(podInfo.namespace, podInfo.uid) + if err != nil { + return err + } + hwlog.RunLog.Infof("start to remove data of pod %s/%s", podInfo.namespace, podInfo.name) + + configmapName := "rings-config" + "-" + podInfo.jobName + if status == ConfigmapCompleted { + rankTable.SetStatus(ConfigmapInitializing) + hwlog.RunLog.Infof("pod(%s/%s) is delete, start to update configmap(%s) to initializing", podInfo.namespace, + podInfo.name, configmapName) + err = b.updateConfigMap(rankTable, podInfo.namespace, configmapName) + if err != nil { + rankTable.SetStatus(ConfigmapCompleted) + return err + } + } + + return nil +} + +func (b *BusinessAgent) updateConfigMap(rt ranktablev1.RankTabler, namespace, name string) error { + cmIndexer, ok := b.Indexers[model.ConfigmapType] + obj, exist, err := cmIndexer.GetByKey(namespace + "/" + name) + if err != nil { + return fmt.Errorf("get configmap error: %v", err) + } + if !exist { + return fmt.Errorf("configmap %s/%s not exist", namespace, name) + } + cm := obj.(*apiCoreV1.ConfigMap) + oldCM, ok := cm.Data[ConfigmapKey] + if !ok { + err = fmt.Errorf("old cm ranktable not exists") + hwlog.RunLog.Debug(err) + return err + } + hwlog.RunLog.Debugf("old cm ranktable %#v", oldCM) + label910, exist := (*cm).Labels[Key910] + if !exist || !(label910 == Val910B || label910 == Val910) { + return fmt.Errorf("invalid configmap label: %s", label910) + } + dataByteArray, err := json.Marshal(rt) + if err != nil { + return fmt.Errorf("marshal configmap data error: %v", err) + } + cm.Data[ConfigmapKey] = string(dataByteArray[:]) + + if _, err = b.KubeClientSet.CoreV1().ConfigMaps(namespace).Update(context.TODO(), cm, + metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to update ConfigMap for Job %v", err) + } + hwlog.RunLog.Debugf("new cm ranktable %s", cm.Data[ConfigmapKey]) + return nil +} + +func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIdentifier, + labelSeletor map[string]string, replicas int32) error { + b.RwMutex.Lock() + defer b.RwMutex.Unlock() + rankTable, ok := b.configMapCache[podInfo.ownerUid] + if !ok { + rankTable = &v2.RankTable{ServerCount: "0", ServerList: []*v2.Server(nil), Servers: &sync.Map{}, + RankTableStatus: ranktablev1.RankTableStatus{Status: "initializing"}, Version: "1.0"} + b.configMapCache[podInfo.ownerUid] = rankTable + } + + if rankTable.GetStatus() == ConfigmapCompleted { + hwlog.RunLog.Debugf("ranktable of job <%s/%s> is completed", pod.Namespace, podInfo.jobName) + return nil + } + + _, ok = pod.Annotations[PodDeviceKey] + if !ok { + return 
fmt.Errorf("pod %s/%s has no annotation %s", pod.Namespace, pod.Name, PodDeviceKey) + } + pods, err := b.informerFactory.Core().V1().Pods().Lister().List(labels.SelectorFromSet(labelSeletor)) + if err != nil { + return fmt.Errorf("failed to list pods: %v", err) + } + + running := 0 + for _, p := range pods { + if p.DeletionTimestamp != nil { + return fmt.Errorf("pod %s/%s is deleting", p.Namespace, p.Name) + } + if _, ok = p.Annotations[PodDeviceKey]; ok { + running++ + } + } + if running != int(replicas) { + return fmt.Errorf("ready pods %d is not equal to replicas %d", running, replicas) + } + + errs := int32(0) + + wg := &sync.WaitGroup{} + for _, p := range pods { + wg.Add(1) + go func(pod *apiCoreV1.Pod) { + defer wg.Done() + if err = b.cachePod(rankTable, pod); err != nil { + atomic.AddInt32(&errs, 1) + } + }(p) + } + wg.Wait() + + if errs > 0 { + return fmt.Errorf("cache pod info failed") + } + + rankTable.SetStatus(ConfigmapCompleted) + rankTable.BeforeUpdate() + configmapName := "rings-config" + "-" + podInfo.jobName + hwlog.RunLog.Infof("job is ready, start to update configmap(%s/%s) to completed", pod.Namespace, configmapName) + if err = b.updateConfigMap(rankTable, pod.Namespace, configmapName); err != nil { + hwlog.RunLog.Error("update configmap failed") + return err + } + + return nil + +} + +func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, pod *apiCoreV1.Pod) error { + deviceInfo := pod.Annotations[PodDeviceKey] + + var instance ranktablev1.Instance + if err := json.Unmarshal([]byte(deviceInfo), &instance); err != nil { + return fmt.Errorf("parse annotation of pod %s/%s error: %#v", pod.Namespace, pod.Name, err) + } + if !ranktablev1.CheckDeviceInfo(&instance) { + return fmt.Errorf("deviceInfo failed the validation") + } + + rankIndexStr, err := b.getOrSetPodIndex(pod) + if err != nil { + return fmt.Errorf("error get or set pod index: %s", err) + } + + if err = rt.CachePodInfo(pod, instance, rankIndexStr); err != nil { + return fmt.Errorf("error cache pod info: %s", err) + } + return nil +} + +func (b *BusinessAgent) getOrSetPodIndex(pod *apiCoreV1.Pod) (string, error) { + var rankIndexStr string + + rankIndexStr, rankExist := pod.Annotations[PodRankIndexKey] + + if rankExist { + hwlog.RunLog.Infof("pod(%s/%s) already has rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) + } else { + for _, env := range pod.Spec.Containers[0].Env { + if env.Name == vcPodIndexKey { + rankIndexStr = env.Value + } + } + if rankIndexStr == "" { + return "", errors.New("index env not found in pod") + } + err := b.updatePod(pod, func(newPod *apiCoreV1.Pod) { + newPod.Annotations[PodRankIndexKey] = rankIndexStr + }) + if err != nil { + return "", err + } + hwlog.RunLog.Infof("set pod(%s/%s) rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) + } + return rankIndexStr, nil +} + +func (b *BusinessAgent) updatePod(pod *apiCoreV1.Pod, updateFunc func(*apiCoreV1.Pod)) error { + return retry.RetryOnConflict(retry.DefaultBackoff, func() error { + newPod, err := b.KubeClientSet.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) + if err != nil { + return err + } + updateFunc(newPod) + _, err = b.KubeClientSet.CoreV1().Pods(pod.Namespace).Update(context.TODO(), newPod, metav1.UpdateOptions{}) + return err + }) } // nameGenerationFunc: Generate the objects (Strings) to be put into the queue from POD metadata @@ -219,37 +489,34 @@ func (b *BusinessAgent) nameGenerationFunc(obj interface{}, eventType string) (* return nil, fmt.Errorf("object has no meta: %v", 
err) } labelMaps := metaData.GetLabels() - annotations := metaData.GetAnnotations() - OwnerReferences := metaData.GetOwnerReferences() - if len(OwnerReferences) != 1 { + //annotations := metaData.GetAnnotations() + owner := getControlled(metaData) + if owner == nil { return nil, fmt.Errorf("object has no owner: %v", err) } - jobUID := OwnerReferences[0].UID - if OwnerReferences[0].Kind == replicasetType { - rs, err := b.KubeClientSet.AppsV1().ReplicaSets(metaData.GetNamespace()).Get(context.TODO(), - OwnerReferences[0].Name, metav1.GetOptions{}) - if err != nil { - return nil, err - } - if len(rs.OwnerReferences) != 1 { - return nil, fmt.Errorf("object has no owner: %v", err) - } - hwlog.RunLog.Debugf("get pod(%s/%s) owner deploy uid from ReplicaSet: %s", metaData.GetNamespace(), - metaData.GetName(), rs.OwnerReferences[0].UID) - jobUID = rs.OwnerReferences[0].UID - } - return &podIdentifier{ name: metaData.GetName(), namespace: metaData.GetNamespace(), - rankIndex: annotations[PodRankIndexKey], + ownerKind: owner.Kind, + ownerName: owner.Name, + ownerUid: owner.UID, + //rankIndex: annotations[PodRankIndexKey], jobName: getWorkName(labelMaps), eventType: eventType, - jobUid: jobUID, - uid: metaData.GetUID(), + //jobUid: jobUID, + uid: metaData.GetUID(), }, nil } +func getControlled(obj metav1.Object) *metav1.OwnerReference { + for _, owner := range obj.GetOwnerReferences() { + if *owner.Controller { + return &owner + } + } + return nil +} + func isReferenceJobSameWithBsnsWorker(pod *apiCoreV1.Pod, jobName, bsnsWorkerUID string) bool { sameWorker := false for _, owner := range pod.OwnerReferences { diff --git a/pkg/ring-controller/agent/types.go b/pkg/ring-controller/agent/types.go index 164b805..ba5bdb4 100644 --- a/pkg/ring-controller/agent/types.go +++ b/pkg/ring-controller/agent/types.go @@ -102,12 +102,14 @@ type BusinessAgent struct { podInformer cache.SharedIndexInformer // PodsIndexer to get pod index by namespace&name PodsIndexer cache.Indexer + Indexers map[string]cache.Indexer // KubeClientSet : ClientSet to contact kube apiServer KubeClientSet kubernetes.Interface agentSwitch <-chan struct{} // RwMutex : to lock Agent Resource eg. 
Workqueue & BusinessWorker - RwMutex sync.RWMutex + RwMutex sync.RWMutex + configMapCache map[types.UID]ranktablev1.RankTabler // event recorder recorder record.EventRecorder @@ -135,11 +137,14 @@ type Config struct { type podIdentifier struct { namespace string name string - jobName string + ownerKind string + ownerName string + ownerUid types.UID eventType string - rankIndex string - uid types.UID - jobUid types.UID + jobName string + //rankIndex string + uid types.UID + //jobUid types.UID } // VCJobWorker controller for each volcano job, list/watch corresponding pods and build configmap rank table diff --git a/pkg/ring-controller/agent/vcjobworker.go b/pkg/ring-controller/agent/vcjobworker.go index 851d9db..d8ae6a2 100644 --- a/pkg/ring-controller/agent/vcjobworker.go +++ b/pkg/ring-controller/agent/vcjobworker.go @@ -20,6 +20,7 @@ import ( "encoding/json" "errors" "fmt" + "k8s.io/client-go/kubernetes" "strconv" "sync" "sync/atomic" @@ -431,55 +432,47 @@ func (b *WorkerInfo) updatePod(pod *apiCoreV1.Pod, updateFunc func(*apiCoreV1.Po } func (b *WorkerInfo) handleDeleteEvent(podInfo *podIdentifier) error { - hwlog.RunLog.Infof("current handleDeleteEvent pod is %s", podInfo) - b.cmMu.Lock() - defer b.cmMu.Unlock() - - status := b.configmapData.GetStatus() - - err := b.configmapData.RemovePodInfo(podInfo.namespace, podInfo.uid) - if err != nil { - return err - } - - hwlog.RunLog.Infof("start to remove data of pod %s/%s", podInfo.namespace, podInfo.name) - - if status == ConfigmapCompleted { - b.configmapData.SetStatus(ConfigmapInitializing) - hwlog.RunLog.Infof("pod(%s/%s) is delete, start to update configmap(%s) to initializing", podInfo.namespace, - podInfo.name, b.configmapName) - err = updateConfigMap(b, podInfo.namespace) - if err != nil { - b.configmapData.SetStatus(ConfigmapCompleted) - return err - } - } - - rankIndex := podInfo.rankIndex - if rankIndex != "" { - _, ok := b.cachedIndex.Load(rankIndex) - if !ok { - return fmt.Errorf("cannot find pod(%v) rank index %s", podInfo, rankIndex) - } - b.cachedIndex.Store(rankIndex, false) - } - hwlog.RunLog.Infof("data of pod %s/%s is removed", podInfo.namespace, podInfo.name) - b.cachedPods.Delete(podInfo.uid) - b.configmapData.DeletePod(podInfo.uid) - b.modifyStatistics(-1) + //hwlog.RunLog.Infof("current handleDeleteEvent pod is %s", podInfo) + //b.cmMu.Lock() + //defer b.cmMu.Unlock() + //status := b.configmapData.GetStatus() + //err := b.configmapData.RemovePodInfo(podInfo.namespace, podInfo.uid) + //if err != nil { + // return err + //} + // + //hwlog.RunLog.Infof("start to remove data of pod %s/%s", podInfo.namespace, podInfo.name) + // + //if status == ConfigmapCompleted { + // b.configmapData.SetStatus(ConfigmapInitializing) + // hwlog.RunLog.Infof("pod(%s/%s) is delete, start to update configmap(%s) to initializing", podInfo.namespace, + // podInfo.name, b.configmapName) + // err = updateConfigMap(b, podInfo.namespace) + // if err != nil { + // b.configmapData.SetStatus(ConfigmapCompleted) + // return err + // } + //} + // + //rankIndex := podInfo.rankIndex + //if rankIndex != "" { + // _, ok := b.cachedIndex.Load(rankIndex) + // if !ok { + // return fmt.Errorf("cannot find pod(%v) rank index %s", podInfo, rankIndex) + // } + // b.cachedIndex.Store(rankIndex, false) + //} + //hwlog.RunLog.Infof("data of pod %s/%s is removed", podInfo.namespace, podInfo.name) + //b.cachedPods.Delete(podInfo.uid) + //b.configmapData.DeletePod(podInfo.uid) + //b.modifyStatistics(-1) return nil } func (b *WorkerInfo) 
endRankTableConstruction(namespace string) error { - b.configmapData.SetStatus(ConfigmapCompleted) - b.configmapData.BeforeUpdate() - hwlog.RunLog.Infof("job is ready, start to update configmap(%s/%s) to completed", namespace, b.configmapName) - if err := updateConfigMap(b, namespace); err != nil { - hwlog.RunLog.Error("update configmap failed") - return err - } - return nil + + //return nil } // modifyStatistics statistic about how many pods have already cached @@ -514,33 +507,33 @@ func getWorkName(labels map[string]string) string { return "" } -func updateConfigMap(w *WorkerInfo, namespace string) error { - cm, err := w.kubeclientset.CoreV1().ConfigMaps(namespace).Get(context.TODO(), - w.configmapName, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("get configmap error: %v", err) - } - oldCM, ok := cm.Data[ConfigmapKey] - if !ok { - err = fmt.Errorf("old cm ranktable not exists") - hwlog.RunLog.Debug(err) - return err - } - hwlog.RunLog.Debugf("old cm ranktable %#v", oldCM) - label910, exist := (*cm).Labels[Key910] - if !exist || !(label910 == Val910B || label910 == Val910) { - return fmt.Errorf("invalid configmap label: %s", label910) - } - dataByteArray, err := json.Marshal(w.configmapData) - if err != nil { - return fmt.Errorf("marshal configmap data error: %v", err) - } - cm.Data[ConfigmapKey] = string(dataByteArray[:]) - - if _, err = w.kubeclientset.CoreV1().ConfigMaps(namespace).Update(context.TODO(), cm, - metav1.UpdateOptions{}); err != nil { - return fmt.Errorf("failed to update ConfigMap for Job %v", err) - } - hwlog.RunLog.Debugf("new cm ranktable %s", cm.Data[ConfigmapKey]) +func updateConfigMap(client kubernetes.Interface, rt *ranktablev1.RankTabler, namespace string) error { + //cm, err := client.CoreV1().ConfigMaps(namespace).Get(context.TODO(), + // w.configmapName, metav1.GetOptions{}) + //if err != nil { + // return fmt.Errorf("get configmap error: %v", err) + //} + //oldCM, ok := cm.Data[ConfigmapKey] + //if !ok { + // err = fmt.Errorf("old cm ranktable not exists") + // hwlog.RunLog.Debug(err) + // return err + //} + //hwlog.RunLog.Debugf("old cm ranktable %#v", oldCM) + //label910, exist := (*cm).Labels[Key910] + //if !exist || !(label910 == Val910B || label910 == Val910) { + // return fmt.Errorf("invalid configmap label: %s", label910) + //} + //dataByteArray, err := json.Marshal(w.configmapData) + //if err != nil { + // return fmt.Errorf("marshal configmap data error: %v", err) + //} + //cm.Data[ConfigmapKey] = string(dataByteArray[:]) + // + //if _, err = w.kubeclientset.CoreV1().ConfigMaps(namespace).Update(context.TODO(), cm, + // metav1.UpdateOptions{}); err != nil { + // return fmt.Errorf("failed to update ConfigMap for Job %v", err) + //} + //hwlog.RunLog.Debugf("new cm ranktable %s", cm.Data[ConfigmapKey]) return nil } diff --git a/pkg/ring-controller/controller/controller.go b/pkg/ring-controller/controller/controller.go index 36293a5..948ff7a 100644 --- a/pkg/ring-controller/controller/controller.go +++ b/pkg/ring-controller/controller/controller.go @@ -51,7 +51,7 @@ func NewEventController(kubeclientset kubernetes.Interface, jobclientset version eventBroadcaster.StartLogging(hwlog.RunLog.Infof) eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeclientset.CoreV1().Events("")}) recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: controllerName}) - agents, err := agent.NewBusinessAgent(kubeclientset, recorder, config, stopCh) + agents, err := agent.NewBusinessAgent(kubeclientset, 
informerInfo.CacheIndexers, recorder, config, stopCh) if err != nil { return nil, fmt.Errorf("error creating business agent: %s", err.Error()) } diff --git a/pkg/ring-controller/model/types.go b/pkg/ring-controller/model/types.go index 9299a14..7939459 100644 --- a/pkg/ring-controller/model/types.go +++ b/pkg/ring-controller/model/types.go @@ -30,6 +30,10 @@ const ( VCJobType = "vcjob" // DeploymentType To determine the type of listening:deployment. DeploymentType = "deployment" + // ReplicaSetType To determine the type of listening:replicaset. + ReplicaSetType = "replicaset" + // ConfigmapType To determine the type of listening:configmap. + ConfigmapType = "configmap" // BuildStatInterval 30 * time.Second BuildStatInterval = 30 * time.Second -- Gitee From 2aa3360a745f86d0609da8362906570ecf919e61 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 20:51:50 +0800 Subject: [PATCH 05/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 10 ++++++---- pkg/ring-controller/agent/types.go | 8 ++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index c8b5dae..b93d0bc 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -20,7 +20,9 @@ import ( "encoding/json" "errors" "fmt" - "hccl-controller/pkg/ring-controller/model" + //"hccl-controller/pkg/ring-controller/model" + + //"hccl-controller/pkg/ring-controller/model" ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" v2 "hccl-controller/pkg/ring-controller/ranktable/v2" appsV1 "k8s.io/api/apps/v1" @@ -199,7 +201,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p labelSeletor := map[string]string{} replicas := int32(0) if podKeyInfo.ownerKind == "Job" { - jobIndexer, ok := b.Indexers[model.VCJobType] + jobIndexer, ok := b.Indexers[VCJobType] if !ok { b.Workqueue.Forget(obj) hwlog.RunLog.Errorf("job indexer not exist") @@ -238,7 +240,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p } } else if podKeyInfo.ownerKind == "ReplicaSet" { - rsIndexer := b.Indexers[model.ReplicaSetType] + rsIndexer := b.Indexers[ReplicaSetType] rsObj, exist, err := rsIndexer.GetByKey(podKeyInfo.namespace + "/" + podKeyInfo.ownerName) if err != nil { hwlog.RunLog.Errorf("syncing '%s' failed: failed to get obj from api-server", podKeyInfo) @@ -318,7 +320,7 @@ func (b *BusinessAgent) handleDeleteEvent(podInfo *podIdentifier) error { } func (b *BusinessAgent) updateConfigMap(rt ranktablev1.RankTabler, namespace, name string) error { - cmIndexer, ok := b.Indexers[model.ConfigmapType] + cmIndexer, ok := b.Indexers[ConfigmapType] obj, exist, err := cmIndexer.GetByKey(namespace + "/" + name) if err != nil { return fmt.Errorf("get configmap error: %v", err) diff --git a/pkg/ring-controller/agent/types.go b/pkg/ring-controller/agent/types.go index ba5bdb4..8f82c80 100644 --- a/pkg/ring-controller/agent/types.go +++ b/pkg/ring-controller/agent/types.go @@ -29,6 +29,14 @@ import ( ) const ( + // VCJobType To determine the type of listening:vcjob. 
+ VCJobType = "vcjob" + // DeploymentType To determine the type of listening:deployment. + DeploymentType = "deployment" + // ReplicaSetType To determine the type of listening:replicaset. + ReplicaSetType = "replicaset" + // ConfigmapType To determine the type of listening:configmap. + ConfigmapType = "configmap" // Key910 to get Configmap Key910 = "ring-controller.atlas" // Val910 to get Configmap -- Gitee From 7fa8b6082bfca518a6341d79814bcc22b8da963d Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 20:58:34 +0800 Subject: [PATCH 06/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index b93d0bc..c3e103a 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -84,6 +84,7 @@ func NewBusinessAgent(kubeClientSet kubernetes.Interface, indexers map[string]ca podInformer: podInformerFactory.Core().V1().Pods().Informer(), PodsIndexer: podInformerFactory.Core().V1().Pods().Informer().GetIndexer(), Indexers: indexers, + configMapCache: make(map[types.UID]ranktablev1.RankTabler), Workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter( retryMilliSecond*time.Millisecond, threeMinutes*time.Second), "Pods"), KubeClientSet: kubeClientSet, -- Gitee From eda43f890e27ee99fa473f77dd4fbf756cee0e39 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:04:09 +0800 Subject: [PATCH 07/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index c3e103a..dac1a74 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -382,6 +382,7 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde running := 0 for _, p := range pods { + hwlog.RunLog.Errorf("pod %s/%s status %s, annotations: %v", p.Namespace, p.Name, p.Annotations) if p.DeletionTimestamp != nil { return fmt.Errorf("pod %s/%s is deleting", p.Namespace, p.Name) } -- Gitee From 0796dac2dc452da4e92936bcce8ead2e8a0cee0b Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:04:56 +0800 Subject: [PATCH 08/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 3 ++- 1 
file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index dac1a74..29dd2ae 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -382,10 +382,11 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde running := 0 for _, p := range pods { - hwlog.RunLog.Errorf("pod %s/%s status %s, annotations: %v", p.Namespace, p.Name, p.Annotations) + if p.DeletionTimestamp != nil { return fmt.Errorf("pod %s/%s is deleting", p.Namespace, p.Name) } + hwlog.RunLog.Errorf("pod %s/%s annotations: %v", p.Namespace, p.Name, p.Annotations) if _, ok = p.Annotations[PodDeviceKey]; ok { running++ } -- Gitee From 2befe004a38f6fa56ddc7dddabb8096cfdd4a869 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:08:34 +0800 Subject: [PATCH 09/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index 29dd2ae..28ee50a 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -379,10 +379,9 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde if err != nil { return fmt.Errorf("failed to list pods: %v", err) } - + hwlog.RunLog.Errorf("list pods: %v", pods) running := 0 for _, p := range pods { - if p.DeletionTimestamp != nil { return fmt.Errorf("pod %s/%s is deleting", p.Namespace, p.Name) } -- Gitee From 2cc820bf1429896a0176406a7268b6338a83a46f Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:09:07 +0800 Subject: [PATCH 10/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index 28ee50a..c4cd25e 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -375,6 +375,7 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde if !ok { return fmt.Errorf("pod %s/%s has no annotation %s", pod.Namespace, pod.Name, PodDeviceKey) } + hwlog.RunLog.Infof("label selector: %v", labelSeletor) pods, err := b.informerFactory.Core().V1().Pods().Lister().List(labels.SelectorFromSet(labelSeletor)) if err != nil { return fmt.Errorf("failed to list pods: %v", err) -- Gitee From 090fa384a07c6d9cba042c55a4fe06d8d372e304 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:11:41 +0800 Subject: [PATCH 11/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= 
=?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index c4cd25e..0476d38 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -227,7 +227,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p return false } labelSeletor = map[string]string{ - VolcanoJobNameKey: podKeyInfo.name, + VolcanoJobNameKey: podKeyInfo.ownerName, VolcanoJobNamespaceKey: podKeyInfo.namespace, } job, ok := jobObj.(*v1alpha1.Job) -- Gitee From 4c1d9200172975f501c375237907c3882118a3fb Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:21:43 +0800 Subject: [PATCH 12/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index 0476d38..657193d 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -386,7 +386,6 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde if p.DeletionTimestamp != nil { return fmt.Errorf("pod %s/%s is deleting", p.Namespace, p.Name) } - hwlog.RunLog.Errorf("pod %s/%s annotations: %v", p.Namespace, p.Name, p.Annotations) if _, ok = p.Annotations[PodDeviceKey]; ok { running++ } -- Gitee From 8bb900038602c908396219d312a9ab284877b5fa Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:29:01 +0800 Subject: [PATCH 13/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index 657193d..320fba4 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -386,8 +386,8 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde if p.DeletionTimestamp != nil { return fmt.Errorf("pod %s/%s is deleting", p.Namespace, p.Name) } - if _, ok = p.Annotations[PodDeviceKey]; ok { - running++ + if _, ok = p.Annotations[PodDeviceKey]; !ok { + return fmt.Errorf("pod %s/%s has no annotation %s", p.Namespace, p.Name, PodDeviceKey) } } if running != int(replicas) { -- Gitee From bf06b4648fc92a734a5ba3aac20907179674885a Mon Sep 17 00:00:00 2001 From: shepherd cheung 
<1220798123@qq.com> Date: Wed, 10 Jul 2024 21:29:16 +0800 Subject: [PATCH 14/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index 320fba4..b7a0f2b 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -389,6 +389,7 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde if _, ok = p.Annotations[PodDeviceKey]; !ok { return fmt.Errorf("pod %s/%s has no annotation %s", p.Namespace, p.Name, PodDeviceKey) } + running++ } if running != int(replicas) { return fmt.Errorf("ready pods %d is not equal to replicas %d", running, replicas) -- Gitee From 6a1ad026b3caec71b67100202400c1becd6172f4 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Wed, 10 Jul 2024 21:40:36 +0800 Subject: [PATCH 15/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index b7a0f2b..a8ab297 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -360,10 +360,10 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde b.RwMutex.Lock() defer b.RwMutex.Unlock() rankTable, ok := b.configMapCache[podInfo.ownerUid] + defer func() { b.configMapCache[podInfo.ownerUid] = rankTable }() if !ok { rankTable = &v2.RankTable{ServerCount: "0", ServerList: []*v2.Server(nil), Servers: &sync.Map{}, RankTableStatus: ranktablev1.RankTableStatus{Status: "initializing"}, Version: "1.0"} - b.configMapCache[podInfo.ownerUid] = rankTable } if rankTable.GetStatus() == ConfigmapCompleted { @@ -375,12 +375,12 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde if !ok { return fmt.Errorf("pod %s/%s has no annotation %s", pod.Namespace, pod.Name, PodDeviceKey) } - hwlog.RunLog.Infof("label selector: %v", labelSeletor) + hwlog.RunLog.Debugf("label selector: %v", labelSeletor) pods, err := b.informerFactory.Core().V1().Pods().Lister().List(labels.SelectorFromSet(labelSeletor)) if err != nil { return fmt.Errorf("failed to list pods: %v", err) } - hwlog.RunLog.Errorf("list pods: %v", pods) + hwlog.RunLog.Debugf("list job<%s/%s> pods num: %d", podInfo.namespace, podInfo.ownerName, len(pods)) running := 0 for _, p := range pods { if p.DeletionTimestamp != nil { -- Gitee From 7fe45b5f866dc7d6385de0e8add703a91b23ceb3 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Thu, 11 Jul 2024 09:16:20 +0800 Subject: [PATCH 16/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= 
=?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index a8ab297..dcf326e 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -20,9 +20,7 @@ import ( "encoding/json" "errors" "fmt" - //"hccl-controller/pkg/ring-controller/model" - //"hccl-controller/pkg/ring-controller/model" ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" v2 "hccl-controller/pkg/ring-controller/ranktable/v2" appsV1 "k8s.io/api/apps/v1" @@ -223,7 +221,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p hwlog.RunLog.Infof("syncing '%s' terminated: current obj is no longer exist", podKeyInfo.String()) return true } - hwlog.RunLog.Errorf("syncing '%s' failed: corresponding job not cache", podKeyInfo) + hwlog.RunLog.Warnf("syncing '%s' failed: corresponding job not cache", podKeyInfo) return false } labelSeletor = map[string]string{ @@ -403,6 +401,7 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde go func(pod *apiCoreV1.Pod) { defer wg.Done() if err = b.cachePod(rankTable, pod); err != nil { + hwlog.RunLog.Errorf("cache pod<%s/%s> info failed: %s", pod.Namespace, pod.Name, err) atomic.AddInt32(&errs, 1) } }(p) @@ -431,7 +430,7 @@ func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, pod *apiCoreV1.Pod) var instance ranktablev1.Instance if err := json.Unmarshal([]byte(deviceInfo), &instance); err != nil { - return fmt.Errorf("parse annotation of pod %s/%s error: %#v", pod.Namespace, pod.Name, err) + return fmt.Errorf("parse annotation error: %#v", err) } if !ranktablev1.CheckDeviceInfo(&instance) { return fmt.Errorf("deviceInfo failed the validation") -- Gitee From fcbffaaea16260b9d8fe32a554e3b06e01a60aee Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Thu, 11 Jul 2024 09:33:03 +0800 Subject: [PATCH 17/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 37 +++++++++++++--------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index dcf326e..03cfd33 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -20,6 +20,7 @@ import ( "encoding/json" "errors" "fmt" + "strconv" ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" v2 "hccl-controller/pkg/ring-controller/ranktable/v2" @@ -396,11 +397,11 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde errs := int32(0) wg := &sync.WaitGroup{} - for _, p := range pods { + for index, p := range pods { wg.Add(1) go func(pod *apiCoreV1.Pod) { defer wg.Done() - if err = b.cachePod(rankTable, pod); err != nil { + if err = b.cachePod(rankTable, index, pod); err != nil { hwlog.RunLog.Errorf("cache 
pod<%s/%s> info failed: %s", pod.Namespace, pod.Name, err) atomic.AddInt32(&errs, 1) } @@ -425,7 +426,7 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde } -func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, pod *apiCoreV1.Pod) error { +func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, index int, pod *apiCoreV1.Pod) error { deviceInfo := pod.Annotations[PodDeviceKey] var instance ranktablev1.Instance @@ -436,7 +437,7 @@ func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, pod *apiCoreV1.Pod) return fmt.Errorf("deviceInfo failed the validation") } - rankIndexStr, err := b.getOrSetPodIndex(pod) + rankIndexStr, err := b.getOrSetPodIndex(index, pod) if err != nil { return fmt.Errorf("error get or set pod index: %s", err) } @@ -447,30 +448,36 @@ func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, pod *apiCoreV1.Pod) return nil } -func (b *BusinessAgent) getOrSetPodIndex(pod *apiCoreV1.Pod) (string, error) { +func (b *BusinessAgent) getOrSetPodIndex(index int, pod *apiCoreV1.Pod) (string, error) { var rankIndexStr string rankIndexStr, rankExist := pod.Annotations[PodRankIndexKey] if rankExist { hwlog.RunLog.Infof("pod(%s/%s) already has rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) + return rankIndexStr, nil + } + if owner := getControlled(pod); owner != nil && owner.Kind == "ReplicaSet" { + rankIndexStr = strconv.Itoa(index) } else { for _, env := range pod.Spec.Containers[0].Env { if env.Name == vcPodIndexKey { rankIndexStr = env.Value } } - if rankIndexStr == "" { - return "", errors.New("index env not found in pod") - } - err := b.updatePod(pod, func(newPod *apiCoreV1.Pod) { - newPod.Annotations[PodRankIndexKey] = rankIndexStr - }) - if err != nil { - return "", err - } - hwlog.RunLog.Infof("set pod(%s/%s) rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) } + + if rankIndexStr == "" { + return "", errors.New("index env not found in pod") + } + err := b.updatePod(pod, func(newPod *apiCoreV1.Pod) { + newPod.Annotations[PodRankIndexKey] = rankIndexStr + }) + if err != nil { + return "", err + } + hwlog.RunLog.Infof("set pod(%s/%s) rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) + return rankIndexStr, nil } -- Gitee From 17b689bb04623c0c5172840373099a93a759cee6 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Thu, 11 Jul 2024 09:37:29 +0800 Subject: [PATCH 18/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91debug=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90?= =?UTF-8?q?=E8=AF=84=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/agent/businessagent.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/agent/businessagent.go index 03cfd33..e856240 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/agent/businessagent.go @@ -399,13 +399,13 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde wg := &sync.WaitGroup{} for index, p := range pods { wg.Add(1) - go func(pod *apiCoreV1.Pod) { + go func(i int, pod *apiCoreV1.Pod) { defer wg.Done() - if err = b.cachePod(rankTable, index, pod); err != nil { + if err = b.cachePod(rankTable, i, pod); err != nil { hwlog.RunLog.Errorf("cache pod<%s/%s> info failed: %s", pod.Namespace, pod.Name, err) 
atomic.AddInt32(&errs, 1) } - }(p) + }(index, p) } wg.Wait() -- Gitee From e4bbbbb563382cdfcf0619cd6605b0fb76d74f66 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Thu, 11 Jul 2024 11:07:29 +0800 Subject: [PATCH 19/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?= =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?= =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.go | 123 +--- pkg/ring-controller/agent/deploymentworker.go | 103 ---- .../agent/deploymentworker_test.go | 49 -- pkg/ring-controller/agent/types.go | 221 ------- pkg/ring-controller/agent/vcjobworker.go | 539 ------------------ pkg/ring-controller/agent/vcjobworker_test.go | 442 -------------- pkg/ring-controller/common/constants.go | 59 +- pkg/ring-controller/common/k8sclient.go | 64 +++ pkg/ring-controller/config/configs.go | 16 + .../{agent => controller}/businessagent.go | 255 ++++----- .../businessagent_test.go | 2 +- pkg/ring-controller/controller/controller.go | 215 ------- .../controller/controller_test.go | 184 ------ pkg/ring-controller/controller/types.go | 147 +++-- pkg/ring-controller/model/deployment.go | 119 ---- pkg/ring-controller/model/deployment_test.go | 228 -------- pkg/ring-controller/model/types.go | 65 --- pkg/ring-controller/model/vcjob.go | 281 --------- pkg/ring-controller/model/vcjob_test.go | 407 ------------- 19 files changed, 351 insertions(+), 3168 deletions(-) delete mode 100644 pkg/ring-controller/agent/deploymentworker.go delete mode 100644 pkg/ring-controller/agent/deploymentworker_test.go delete mode 100644 pkg/ring-controller/agent/types.go delete mode 100644 pkg/ring-controller/agent/vcjobworker.go delete mode 100644 pkg/ring-controller/agent/vcjobworker_test.go create mode 100644 pkg/ring-controller/common/k8sclient.go create mode 100644 pkg/ring-controller/config/configs.go rename pkg/ring-controller/{agent => controller}/businessagent.go (68%) rename pkg/ring-controller/{agent => controller}/businessagent_test.go (99%) delete mode 100644 pkg/ring-controller/controller/controller.go delete mode 100644 pkg/ring-controller/controller/controller_test.go delete mode 100644 pkg/ring-controller/model/deployment.go delete mode 100644 pkg/ring-controller/model/deployment_test.go delete mode 100644 pkg/ring-controller/model/types.go delete mode 100644 pkg/ring-controller/model/vcjob.go delete mode 100644 pkg/ring-controller/model/vcjob_test.go diff --git a/main.go b/main.go index 79219da..5c3bba9 100644 --- a/main.go +++ b/main.go @@ -20,24 +20,11 @@ import ( "errors" "flag" "fmt" - "time" - - "huawei.com/npu-exporter/v5/common-utils/hwlog" - "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/selection" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/cache" - "k8s.io/client-go/tools/clientcmd" - "volcano.sh/apis/pkg/client/clientset/versioned" - "volcano.sh/apis/pkg/client/informers/externalversions" - "hccl-controller/pkg/resource-controller/signals" - "hccl-controller/pkg/ring-controller/agent" "hccl-controller/pkg/ring-controller/common" + "hccl-controller/pkg/ring-controller/config" "hccl-controller/pkg/ring-controller/controller" - "hccl-controller/pkg/ring-controller/model" + 
"huawei.com/npu-exporter/v5/common-utils/hwlog" ) var ( @@ -56,14 +43,7 @@ var ( const ( dryRun = false - displayStatistic = false - cmCheckInterval = 2 - cmCheckTimeout = 10 defaultLogFileName = "/var/log/mindx-dl/hccl-controller/hccl-controller.log" - defaultQPS = 200.0 - defaultBurst = 200 - maxQPS = 10000.0 - maxBurst = 10000 ) func main() { @@ -83,97 +63,25 @@ func main() { } // set up signals so we handle the first shutdown signal gracefully stopCh := signals.SetupSignalHandler() - kubeClient, jobClient, err := NewClientK8s() - if err != nil { - hwlog.RunLog.Error(err) - return - } - jobInformerFactory, deploymentFactory, newErr := newInformerFactory(jobClient, kubeClient) - if newErr != nil { - hwlog.RunLog.Error(newErr) - return - } - jobInformer := jobInformerFactory.Batch().V1alpha1().Jobs() - deploymentInformer := deploymentFactory.Apps().V1().Deployments() - rsInformer := deploymentFactory.Apps().V1().ReplicaSets() - cmInformer := deploymentFactory.Core().V1().ConfigMaps() - cacheIndexer := make(map[string]cache.Indexer) - cacheIndexer[model.VCJobType] = jobInformer.Informer().GetIndexer() - cacheIndexer[model.DeploymentType] = deploymentInformer.Informer().GetIndexer() - cacheIndexer[model.ReplicaSetType] = rsInformer.Informer().GetIndexer() - cacheIndexer[model.ConfigmapType] = cmInformer.Informer().GetIndexer() - control, err := controller.NewEventController(kubeClient, jobClient, newConfig(), - controller.InformerInfo{JobInformer: jobInformer, DeployInformer: deploymentInformer, - CacheIndexers: cacheIndexer}, stopCh) + + c, err := controller.NewBusinessAgent(newConfig(), stopCh) if err != nil { - hwlog.RunLog.Error(err) + hwlog.RunLog.Errorf("failed to create business agent: %v", err) return } - go jobInformerFactory.Start(stopCh) - go deploymentFactory.Start(stopCh) - if err = control.Run(jobParallelism, stopCh); err != nil { + if err = c.Run(podParallelism, stopCh); err != nil { hwlog.RunLog.Errorf("Error running controller: %s", err.Error()) } } -// NewClientK8s create k8s client -func NewClientK8s() (*kubernetes.Clientset, *versioned.Clientset, error) { - cfg, err := clientcmd.BuildConfigFromFlags("", "") - if err != nil { - hwlog.RunLog.Errorf("build client config err: %#v", err) - return nil, nil, err - } - if QPS <= 0 || QPS > maxQPS { - hwlog.RunLog.Warnf("kubeApiQps is invalid, require (0, %f) use default value %f", maxQPS, defaultQPS) - QPS = defaultQPS - } - if Burst <= 0 || Burst > maxBurst { - hwlog.RunLog.Warnf("kubeApiBurst is invalid, require (0, %d) use default value %d", maxBurst, defaultBurst) - Burst = defaultBurst - } - cfg.QPS = float32(QPS) - cfg.Burst = Burst - kubeClient, err := kubernetes.NewForConfig(cfg) - if err != nil { - return nil, nil, fmt.Errorf("error building kubernetes clientset: %s", err.Error()) - } - jobClient, err := versioned.NewForConfig(cfg) - if err != nil { - return nil, nil, fmt.Errorf("error building job clientset: %s", err.Error()) - } - return kubeClient, jobClient, nil - -} - -func newConfig() *agent.Config { - config := &agent.Config{ - DryRun: dryRun, - DisplayStatistic: displayStatistic, - PodParallelism: podParallelism, - CmCheckInterval: cmCheckInterval, - CmCheckTimeout: cmCheckTimeout, - } - return config -} - -func newInformerFactory(jobClient *versioned.Clientset, kubeClient *kubernetes.Clientset) ( - externalversions.SharedInformerFactory, informers.SharedInformerFactory, error) { - temp, newErr := labels.NewRequirement(agent.Key910, selection.In, []string{agent.Val910B, agent.Val910}) - if newErr != nil { - 
hwlog.RunLog.Infof("newInformerFactory %s", newErr) - return nil, nil, newErr +func newConfig() *config.Config { + return &config.Config{ + DryRun: dryRun, + PodParallelism: podParallelism, + Qps: QPS, + Burst: Burst, + HcclVersion: hcclVersion, } - labelSelector := temp.String() - jobInformerFactory := externalversions.NewSharedInformerFactoryWithOptions(jobClient, - time.Second*common.InformerInterval, externalversions.WithTweakListOptions(func(options *v1. - ListOptions) { - options.LabelSelector = labelSelector - })) - deploymentFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, - time.Second*common.InformerInterval, informers.WithTweakListOptions(func(options *v1.ListOptions) { - options.LabelSelector = labelSelector - })) - return jobInformerFactory, deploymentFactory, nil } func init() { @@ -195,8 +103,8 @@ func init() { "Query the verison of the program") flag.StringVar(&hcclVersion, "json", "v2", "Select version of hccl json file (v1/v2).") - flag.Float64Var(&QPS, "kubeApiQps", defaultQPS, "QPS to use while talking with kubernetes api-server") - flag.IntVar(&Burst, "kubeApiBurst", defaultBurst, "Burst to use while talking with kubernetes api-server") + flag.Float64Var(&QPS, "kubeApiQps", common.DefaultQPS, "QPS to use while talking with kubernetes api-server") + flag.IntVar(&Burst, "kubeApiBurst", common.DefaultBurst, "Burst to use while talking with kubernetes api-server") } func initHwLogger() error { @@ -210,7 +118,6 @@ func validate() error { if hcclVersion != "v1" && hcclVersion != "v2" { return errors.New("invalid json version value, should be v1/v2") } - agent.SetJSONVersion(hcclVersion) // check the validity of input parameters jobParallelism if jobParallelism <= 0 || jobParallelism > common.MaxJobParallelism { return errors.New("error parsing parameters: job parallelism should be range [1, 32]") diff --git a/pkg/ring-controller/agent/deploymentworker.go b/pkg/ring-controller/agent/deploymentworker.go deleted file mode 100644 index bde8391..0000000 --- a/pkg/ring-controller/agent/deploymentworker.go +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ - -package agent - -import ( - "fmt" - "time" - - "huawei.com/npu-exporter/v5/common-utils/hwlog" - apiCoreV1 "k8s.io/api/core/v1" - - ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" -) - -// NewDeploymentWorker : to create Deployment Worker -func NewDeploymentWorker(agent *BusinessAgent, deploy DeployInfo, ranktable ranktablev1.RankTabler, - replicasTotal int32) *DeployWorker { - return &DeployWorker{ - WorkerInfo: WorkerInfo{ - kubeclientset: agent.KubeClientSet, - informerFactory: agent.informerFactory, - podsIndexer: agent.PodsIndexer, - recorder: agent.recorder, - dryRun: agent.dryRun, - statisticSwitch: make(chan struct{}), - configmapName: fmt.Sprintf("%s-%s", ConfigmapPrefix, deploy.DeployName), - configmapData: ranktable, - statisticStopped: false, - cachedPodNum: 0, - taskReplicasTotal: replicasTotal, - cachedIndex: newCachedIndex(int(replicasTotal)), - }, - DeployInfo: deploy} -} - -func (w *DeployWorker) doWork(pod *apiCoreV1.Pod, podInfo *podIdentifier) (bool, bool) { - // scenario check A: For an identical job, create it immediately after deletion - // check basis: job uid + creationTimestamp - if pod.CreationTimestamp.Before(&w.DeployCreationTimestamp) { - // old pod + new worker - hwlog.RunLog.Infof("syncing '%s' terminated: corresponding job worker is no "+ - "longer exist (basis: job uid + creationTimestamp)", podInfo) - return true, false - } - - // check whether pod has used npu - if used := containerUsedChip(pod); !used { - hwlog.RunLog.Errorf("pod %s doesn't use npu, so no longer dealing with it", podInfo) - return true, true - } - // scenario check C: if current pod use chip, its' device info may not be ready - // check basis: limits + annotations - if (podInfo.eventType == EventAdd || podInfo.eventType == EventUpdate) && !isPodAnnotationsReady(pod, - podInfo.String()) { - return false, false - } - if w.configmapData.GetStatus() == ConfigmapCompleted { - hwlog.RunLog.Infof("syncing '%s' terminated: corresponding rank table is completed", - podInfo) - return true, true - } - - // start to sync current pod - if err := w.syncHandler(pod, podInfo); err != nil { - hwlog.RunLog.Errorf("error syncing '%s': %s", podInfo, err.Error()) - } - return true, true -} - -// Statistic : no need to add lock here, deviation from true value is acceptable -func (w *DeployWorker) Statistic(stopTime time.Duration) { - for { - select { - case c, ok := <-w.statisticSwitch: - if !ok { - hwlog.RunLog.Error(c) - } - return - default: - if w.taskReplicasTotal == w.cachedPodNum { - hwlog.RunLog.Infof("rank table build progress for %s/%s is completed", - w.DeployNamespace, w.DeployName) - w.CloseStatistic() - return - } - hwlog.RunLog.Infof("rank table build progress for %s/%s: pods need to be cached = %d,"+ - "pods already cached = %d", w.DeployNamespace, w.DeployName, w.taskReplicasTotal, w.cachedPodNum) - time.Sleep(stopTime) - } - } -} diff --git a/pkg/ring-controller/agent/deploymentworker_test.go b/pkg/ring-controller/agent/deploymentworker_test.go deleted file mode 100644 index b418915..0000000 --- a/pkg/ring-controller/agent/deploymentworker_test.go +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
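
The Statistic loop deleted above is a plain progress reporter: poll a shared counter until it matches the replica total, and bail out early when the switch channel is closed. A minimal sketch of that shape, with illustrative names rather than the controller's own (reportProgress, stop, and cached are all made up here):

package main

import (
    "fmt"
    "sync/atomic"
    "time"
)

// reportProgress logs caching progress until the target is reached or until
// stop is closed. cached is read atomically because workers update it
// concurrently, so a stale read only delays one poll interval.
func reportProgress(stop <-chan struct{}, cached *int32, total int32, interval time.Duration) {
    for {
        select {
        case <-stop:
            return
        default:
            n := atomic.LoadInt32(cached)
            if n == total {
                fmt.Printf("progress complete: %d/%d pods cached\n", n, total)
                return
            }
            fmt.Printf("progress: %d/%d pods cached\n", n, total)
            time.Sleep(interval)
        }
    }
}
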
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -package agent - -import ( - "testing" - "time" - - "github.com/smartystreets/goconvey/convey" -) - -// TestDeployWorkerStatistic test DeployWorker_Statistic -func TestDeployWorkerStatistic(t *testing.T) { - convey.Convey("agent VCJobWorker_Statistic", t, func() { - d := &DeployWorker{WorkerInfo: WorkerInfo{statisticSwitch: make(chan struct{}), statisticStopped: false}} - const ( - TaskRep = 2 - SleepTime = 3 - ) - - convey.Convey(" chan will return when chan close ", func() { - d.taskReplicasTotal = TaskRep - d.cachedPodNum = 1 - go func() { - time.Sleep(SleepTime * time.Second) - d.CloseStatistic() - }() - d.Statistic(1 * time.Second) - }) - - convey.Convey(" chan will return when taskReplicasTotal==cachedPodNum ", func() { - const CachePod = 2 - d.taskReplicasTotal = TaskRep - d.cachedPodNum = CachePod - d.Statistic(1 * time.Second) - }) - }) -} diff --git a/pkg/ring-controller/agent/types.go b/pkg/ring-controller/agent/types.go deleted file mode 100644 index 8f82c80..0000000 --- a/pkg/ring-controller/agent/types.go +++ /dev/null @@ -1,221 +0,0 @@ -/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package agent - -import ( - "sync" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/cache" - "k8s.io/client-go/tools/record" - "k8s.io/client-go/util/workqueue" - - ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" -) - -const ( - // VCJobType To determine the type of listening:vcjob. - VCJobType = "vcjob" - // DeploymentType To determine the type of listening:deployment. - DeploymentType = "deployment" - // ReplicaSetType To determine the type of listening:replicaset. - ReplicaSetType = "replicaset" - // ConfigmapType To determine the type of listening:configmap. 
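
The deleted test above drives both exit paths of that loop, either by closing the switch channel or by letting the counts match. The same idea in plain `testing`, reusing the hypothetical reportProgress from the earlier sketch:

package main

import (
    "testing"
    "time"
)

// TestReportProgressStops checks the count-reached exit path: with
// cached == total the reporter must return promptly on its own.
func TestReportProgressStops(t *testing.T) {
    var cached int32 = 2
    done := make(chan struct{})
    go func() {
        reportProgress(make(chan struct{}), &cached, 2, time.Millisecond)
        close(done)
    }()
    select {
    case <-done:
    case <-time.After(time.Second):
        t.Fatal("reportProgress did not return when cached == total")
    }
}
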
- ConfigmapType = "configmap" - // Key910 to get Configmap - Key910 = "ring-controller.atlas" - // Val910 to get Configmap - Val910 = "ascend-910" - // Val910B to get Configmap - Val910B = "ascend-910b" - // A910ResourceName resource name for 910 - A910ResourceName = "huawei.com/Ascend910" - // ConfigmapPrefix to get from configmap - ConfigmapPrefix = "rings-config" - // ConfigmapCompleted Staus - ConfigmapCompleted = "completed" - // ConfigmapInitializing status - ConfigmapInitializing = "initializing" - // ConfigmapKey configmap Data Name - ConfigmapKey = "hccl.json" - // VolcanoJobNameKey to get job name - VolcanoJobNameKey = "volcano.sh/job-name" - // VolcanoJobNamespaceKey to get job namespace - VolcanoJobNamespaceKey = "volcano.sh/job-namespace" - // PodJobVersion to get job version - PodJobVersion = "volcano.sh/job-version" - // PodDeviceKey Pod annoation Key - PodDeviceKey = "ascend.kubectl.kubernetes.io/ascend-910-configuration" - // PodRankIndexKey pod rank index - PodRankIndexKey = "hccl/rankIndex" - // DeploymentNameKey pod label - DeploymentNameKey = "deploy-name" - // EventAdd event add - EventAdd = "add" - // EventUpdate event to update - EventUpdate = "update" - // EventDelete event to delete - EventDelete = "delete" - - vcPodIndexKey = "VC_TASK_INDEX" - replicasetType = "ReplicaSet" - - retryMilliSecond = 5 - threeMinutes = 180 - splitNum = 4 - - // InvalidNPUNum invalid NPU num - InvalidNPUNum = -1 -) - -var ( - // jsonVersion of hccl.json - jsonVersion = "v2" -) - -// SetJSONVersion set jsonVersion -func SetJSONVersion(v string) { - jsonVersion = v -} - -// GetJSONVersion get jsonVersion -func GetJSONVersion() string { - return jsonVersion -} - -// BusinessAgent Agent for all businessWorkers, responsibilities: -// - list/watch 910 pods, and assign each pod to corresponding handler -// (each business worker belongs to a volcano job, and contains a handler for building rank table) -type BusinessAgent struct { - // Config Agent configuration file - Config *Config - // business worker for each volcano job - BusinessWorker map[types.UID]Worker - informerFactory informers.SharedInformerFactory - podInformer cache.SharedIndexInformer - // PodsIndexer to get pod index by namespace&name - PodsIndexer cache.Indexer - Indexers map[string]cache.Indexer - // KubeClientSet : ClientSet to contact kube apiServer - KubeClientSet kubernetes.Interface - agentSwitch <-chan struct{} - - // RwMutex : to lock Agent Resource eg. Workqueue & BusinessWorker - RwMutex sync.RWMutex - configMapCache map[types.UID]ranktablev1.RankTabler - - // event recorder - recorder record.EventRecorder - // Workqueue: A queue with a limited rate.This queue is used to put pod event information - Workqueue workqueue.RateLimitingInterface - - // if print only, do not delete anything. 
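
PodDeviceKey above names the annotation that workers unmarshal into a ranktable Instance before caching a pod. A standalone sketch of that parse step; the Device/Instance fields mirror what the deleted tests later in this patch populate, but the JSON tags here are assumptions, not necessarily the project's exact schema:

package main

import (
    "encoding/json"
    "fmt"
)

// Device and Instance stand in for the real types in
// hccl-controller/pkg/ring-controller/ranktable/v1; tags are illustrative.
type Device struct {
    DeviceID string `json:"device_id"`
    DeviceIP string `json:"device_ip"`
}

type Instance struct {
    Devices  []Device `json:"devices"`
    PodName  string   `json:"pod_name"`
    ServerID string   `json:"server_id"`
}

// parseDeviceInfo turns the raw annotation value into an Instance, the step
// that precedes CachePodInfo in the workers above.
func parseDeviceInfo(annotation string) (*Instance, error) {
    var inst Instance
    if err := json.Unmarshal([]byte(annotation), &inst); err != nil {
        return nil, fmt.Errorf("parse device annotation: %w", err)
    }
    return &inst, nil
}
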
- dryRun bool -} - -// Config controller init configure -type Config struct { - // DryRun:Is it a test - DryRun bool - // DisplayStatistic : a flag if starts to report rank table build statistic for job - DisplayStatistic bool - // PodParallelism : how many goroutine to run in the agent - PodParallelism int - // CmCheckInterval: ConfigMap Interval - CmCheckInterval int - // CmCheckTimeout :ConfigMap TimeOut - CmCheckTimeout int -} - -type podIdentifier struct { - namespace string - name string - ownerKind string - ownerName string - ownerUid types.UID - eventType string - jobName string - //rankIndex string - uid types.UID - //jobUid types.UID -} - -// VCJobWorker controller for each volcano job, list/watch corresponding pods and build configmap rank table -type VCJobWorker struct { - // WorkerInfo: normal Worker info - WorkerInfo - // JobInfo: VCJob Worker Info - JobInfo -} - -// JobInfo Job Worker Info -type JobInfo struct { - // JobVersion: When a job restart, JobVersion is needed to identify if a pod is old - // with respect to this job - JobVersion int32 - // JobUID: For an identical job, create it immediately after deletion, new - // vcjob Worker will cache old pod info without a identifier to distinguish - JobUID string - // JobCreationTimestamp: when pod reference job uid is different with uid of VCJobWorker - // creationTimestamp is needed to distinguish cases between: 1. old pod + new worker OR 2. new pod + old worker - JobCreationTimestamp metav1.Time - // JobNamespace: Job namespace - JobNamespace string - // JobName : Job name - JobName string -} - -// DeployWorker for deployment model -type DeployWorker struct { - // WorkerInfo: normal Worker info - WorkerInfo - // DeployInfo: Deployment Worker info - DeployInfo -} - -// WorkerInfo :normal Worker info -type WorkerInfo struct { - kubeclientset kubernetes.Interface - recorder record.EventRecorder - cmMu, statisticMu sync.Mutex - dryRun bool - statisticSwitch chan struct{} - informerFactory informers.SharedInformerFactory - podsIndexer cache.Indexer - - configmapName string - configmapData ranktablev1.RankTabler - - statisticStopped bool - rankIndex int32 - cachedIndex *sync.Map - cachedPods *sync.Map - cachedPodNum int32 - taskReplicasTotal int32 -} - -// DeployInfo : deployment Worker info -type DeployInfo struct { - // DeployCreationTimestamp: when pod reference job uid is different with uid of VCJobWorker - // creationTimestamp is needed to distinguish cases between: 1. old pod + new worker OR 2. new pod + old worker - DeployCreationTimestamp metav1.Time - // DeployNamespace :deployment namespace - DeployNamespace string - // DeployName : deployment name - DeployName string -} diff --git a/pkg/ring-controller/agent/vcjobworker.go b/pkg/ring-controller/agent/vcjobworker.go deleted file mode 100644 index d8ae6a2..0000000 --- a/pkg/ring-controller/agent/vcjobworker.go +++ /dev/null @@ -1,539 +0,0 @@ -/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
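
WorkerInfo above keeps rank-index bookkeeping in a pre-populated sync.Map (see newCachedIndex in the deleted vcjobworker.go that follows). A sketch of that pattern; claimIndex is added here purely for illustration and is not a project function:

package main

import (
    "strconv"
    "sync"
)

// newCachedIndex pre-stores every valid rank index as "not yet claimed",
// matching the deleted helper of the same name.
func newCachedIndex(n int) *sync.Map {
    m := &sync.Map{}
    for i := 0; i < n; i++ {
        m.Store(strconv.Itoa(i), false)
    }
    return m
}

// claimIndex marks a rank index as used; it reports false for indices that
// were never valid for this job, so typos don't silently grow the map.
func claimIndex(m *sync.Map, idx string) bool {
    if _, known := m.Load(idx); !known {
        return false
    }
    m.Store(idx, true)
    return true
}
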
-*/ - -// Package agent for logic -package agent - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "k8s.io/client-go/kubernetes" - "strconv" - "sync" - "sync/atomic" - "time" - - "huawei.com/npu-exporter/v5/common-utils/hwlog" - apiCoreV1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/util/retry" - - "hccl-controller/pkg/ring-controller/common" - ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" -) - -const maxRankIndex = 10000 - -// Worker :The main function of Worker is to get the information of NPU from the generated POD, -// and then assemble it into a complete HCCL.JSON file. -type Worker interface { - doWork(pod *apiCoreV1.Pod, podInfo *podIdentifier) (bool, bool) - Statistic(stopTime time.Duration) - WorkerCommon -} - -// NewVCJobWorker : Generates a Worker that handles the VCJob type -func NewVCJobWorker(agent *BusinessAgent, job JobInfo, ranktable ranktablev1.RankTabler, - replicasTotal int32) *VCJobWorker { - jobWorker := &VCJobWorker{ - WorkerInfo: WorkerInfo{ - kubeclientset: agent.KubeClientSet, - podsIndexer: agent.PodsIndexer, - informerFactory: agent.informerFactory, - recorder: agent.recorder, - dryRun: agent.dryRun, - statisticSwitch: make(chan struct{}), - configmapName: fmt.Sprintf("%s-%s", ConfigmapPrefix, job.JobName), - configmapData: ranktable, - statisticStopped: false, - cachedPodNum: 0, - taskReplicasTotal: replicasTotal, - cachedPods: &sync.Map{}, - cachedIndex: newCachedIndex(int(replicasTotal)), - }, - JobInfo: job, - } - return jobWorker -} - -func newCachedIndex(n int) *sync.Map { - m := &sync.Map{} - for i := 0; i < n; i++ { - m.Store(strconv.Itoa(i), false) - } - return m -} - -func (b *VCJobWorker) doWork(pod *apiCoreV1.Pod, podInfo *podIdentifier) (bool, bool) { - hwlog.RunLog.Debugf("syncing %s", podInfo) - if b.dryRun { - return true, true - } - - defer func() { - if b.configmapData.GetStatus() == ConfigmapCompleted { - return - } - if b.cacheRankTable() { - if err := b.endRankTableConstruction(pod.Namespace); err != nil { - hwlog.RunLog.Errorf("error end rank table construction: %s", err) - } - } - }() - - forgetQueue, finished, err := b.doPreCheck(pod, podInfo) - if err != nil { - hwlog.RunLog.Debugf("error do pre check: %s", err) - return forgetQueue, finished - } - - if _, ok := b.cachedPods.Load(pod.UID); ok { - return true, true - } - b.cachedPods.Store(pod.UID, true) - b.modifyStatistics(1) - return true, true -} - -func (b *VCJobWorker) cacheRankTable() bool { - if !b.tableConstructionFinished() { - hwlog.RunLog.Debugf("job %s/%s rank table construction not finished", b.JobNamespace, b.JobName) - return false - } - pods, err := b.getPodsFromCache() - if err != nil { - hwlog.RunLog.Errorf("error listing pods: %s", err.Error()) - return false - } - if !b.checkJobIsRunning(pods) { - hwlog.RunLog.Debugf("job %s/%s not running", b.JobNamespace, b.JobName) - return false - } - - if err = b.cacheReadyPods(pods); err != nil { - hwlog.RunLog.Errorf("error cache ready pods: %s", err.Error()) - return false - } - - return true -} - -func (b *VCJobWorker) getPodsFromCache() ([]*apiCoreV1.Pod, error) { - return b.informerFactory.Core().V1().Pods().Lister().List(labels.SelectorFromSet( - map[string]string{ - VolcanoJobNameKey: b.JobInfo.JobName, - VolcanoJobNamespaceKey: b.JobInfo.JobNamespace, - })) -} - -func (b *VCJobWorker) checkJobIsRunning(pods []*apiCoreV1.Pod) bool { - readyPods := int32(0) - for _, p := range pods { - if p.GetDeletionTimestamp() == 
nil { - readyPods++ - } - } - if readyPods != b.taskReplicasTotal { - hwlog.RunLog.Infof("job %s/%s ready pods: %d, total pods: %d", b.JobNamespace, b.JobName, readyPods, - b.taskReplicasTotal) - return false - } - return true -} - -func (b *VCJobWorker) cachePod(wg *sync.WaitGroup, pod *apiCoreV1.Pod, errs *sync.Map) { - defer wg.Done() - deviceInfo := pod.Annotations[PodDeviceKey] - - var instance ranktablev1.Instance - if err := json.Unmarshal([]byte(deviceInfo), &instance); err != nil { - errs.Store(pod.Name, fmt.Errorf("parse annotation of pod %s/%s error: %#v", pod.Namespace, pod.Name, - err)) - return - } - if !ranktablev1.CheckDeviceInfo(&instance) { - errs.Store(pod.Name, fmt.Errorf("deviceInfo failed the validation")) - return - } - - rankIndexStr, err := b.getOrSetPodIndex(pod) - if err != nil { - errs.Store(pod.Name, fmt.Errorf("error get or set pod index: %s", err)) - return - } - - if err = b.configmapData.CachePodInfo(pod, instance, rankIndexStr); err != nil { - errs.Store(pod.Name, fmt.Errorf("error cache pod info: %s", err)) - } -} - -func (b *VCJobWorker) cacheReadyPods(pods []*apiCoreV1.Pod) error { - errs := sync.Map{} - - wg := &sync.WaitGroup{} - for _, p := range pods { - wg.Add(1) - go b.cachePod(wg, p, &errs) - } - wg.Wait() - - var err error - errs.Range(func(key, value interface{}) bool { - if value != nil { - errVal, ok := value.(error) - if !ok { - hwlog.RunLog.Error("failed to convert value") - return false - } - err = errVal - return false - } - return true - }) - - return err -} - -func (b *VCJobWorker) doPreCheck(pod *apiCoreV1.Pod, podInfo *podIdentifier) (bool, bool, error) { - // scenario check A: For an identical job, create it immediately after deletion - // check basis: job uid + creationTimestamp - if !isReferenceJobSameWithBsnsWorker(pod, podInfo.jobName, b.JobUID) { - if pod.CreationTimestamp.Before(&b.JobCreationTimestamp) { - // old pod + new worker - hwlog.RunLog.Debugf("syncing '%s' terminated: corresponding job worker is no "+ - "longer exist (basis: job uid + creationTimestamp)", podInfo) - return true, false, errors.New("") - } - // new pod + old worker - hwlog.RunLog.Infof("syncing '%s' delayed: corresponding job worker is "+ - "uninitialized (basis: job uid + creationTimestamp)", podInfo) - return false, false, errors.New("") - } - // scenario check B: job set restart policy, delete pod - // check basis: job version - val, exists := pod.Annotations[PodJobVersion] - if !exists { - return true, true, fmt.Errorf("the key of " + PodJobVersion + " does not exist") - } - version64, err := strconv.ParseInt(val, common.Decimal, common.BitSize32) - if err != nil { - return true, true, fmt.Errorf("syncing '%s' failed, parse pod annotation error: %v", podInfo, err) - } - // job restart action will increase job version number - if version64 < int64(b.JobVersion) { - return true, true, fmt.Errorf("syncing '%s' terminated: corresponding job worker "+ - "is no longer exist (basis: job version number)", podInfo) - } - if version64 != 0 { - if _, ok := pod.Annotations[PodRankIndexKey]; !ok { - return true, false, fmt.Errorf("the %s key of pod(%s/%s) does not exist", PodRankIndexKey, pod.Namespace, pod.Name) - } - } - - // check whether pod has used npu - if used := containerUsedChip(pod); !used { - return true, true, fmt.Errorf("pod %s doesn't use npu, so no longer dealing with it", podInfo) - } - // scenario check C: if current pod use chip, its' device info may not be ready - // check basis: limits + annotations - if (podInfo.eventType == EventAdd || 
podInfo.eventType == EventUpdate) && !isPodAnnotationsReady(pod, - podInfo.String()) { - return false, false, fmt.Errorf("pod %s doesn't have device info, so no longer dealing with it", podInfo) - } - - return true, true, nil -} - -// Statistic : Determine whether CM has been built, process the build completion or change the goroutine exit signal. -// No need to add lock here, deviation from true value is acceptable -func (b *VCJobWorker) Statistic(stopTime time.Duration) { - for { - select { - case c, ok := <-b.statisticSwitch: - if !ok { - hwlog.RunLog.Error(c) - } - return - default: - if b.taskReplicasTotal == b.cachedPodNum { - hwlog.RunLog.Infof("rank table build progress for %s/%s is completed", - b.JobNamespace, b.JobName) - b.CloseStatistic() - return - } - hwlog.RunLog.Infof("rank table build progress for %s/%s: pods need to be cached = %d,"+ - "pods already cached = %d", b.JobNamespace, b.JobName, b.taskReplicasTotal, b.cachedPodNum) - time.Sleep(stopTime) - } - } -} - -// WorkerCommon : The common methods of Worker, these methods have a certain degree of fixedness, -// if the new Worker type does not apply to these methods, they can be overwritten. -type WorkerCommon interface { - handleAddUpdateEvent(podInfo *podIdentifier, pod *apiCoreV1.Pod) error - handleDeleteEvent(podInfo *podIdentifier) error - tableConstructionFinished() bool - endRankTableConstruction(string) error - modifyStatistics(diff int32) - // CloseStatistic : to close statisticSwitch chan - CloseStatistic() - syncHandler(pod *apiCoreV1.Pod, podInfo *podIdentifier) error -} - -func (b *WorkerInfo) syncHandler(pod *apiCoreV1.Pod, podInfo *podIdentifier) error { - hwlog.RunLog.Infof("syncHandler start, current pod is %s", podInfo) - - // if use 0 chip, end pod sync - if b.taskReplicasTotal == 0 && b.tableConstructionFinished() { - hwlog.RunLog.Infof("job %s/%s doesn't use d chip, rank table construction is finished", - podInfo.namespace, podInfo.jobName) - if err := b.endRankTableConstruction(pod.Namespace); err != nil { - return err - } - hwlog.RunLog.Infof("rank table for job %s/%s has finished construction", podInfo.namespace, podInfo.jobName) - return nil // need return directly - } - - // dryRun is for empty running and will not be committed - if b.dryRun { - hwlog.RunLog.Infof("I'am handling %s", podInfo) - return nil - } - - if podInfo.eventType == EventAdd || podInfo.eventType == EventUpdate { - return b.handleAddUpdateEvent(podInfo, pod) - } - hwlog.RunLog.Infof("undefined condition, pod: %s", podInfo) - return nil -} - -func (b *WorkerInfo) tableConstructionFinished() bool { - b.statisticMu.Lock() - defer b.statisticMu.Unlock() - - return b.cachedPodNum == b.taskReplicasTotal -} - -func (b *WorkerInfo) handleAddUpdateEvent(podInfo *podIdentifier, pod *apiCoreV1.Pod) error { - hwlog.RunLog.Debugf("current addUpdate pod is %s", podInfo) - // because this annotation is already used to filter pods in previous step scenario check C - // it can be used to identify if pod use chip here - deviceInfo, exist := pod.Annotations[PodDeviceKey] - if !exist { - return errors.New("the key of " + PodDeviceKey + " does not exist ") - } - var instance ranktablev1.Instance - if err := json.Unmarshal([]byte(deviceInfo), &instance); err != nil { - return fmt.Errorf("parse annotation of pod %s/%s error: %#v", pod.Namespace, pod.Name, err) - } - if !ranktablev1.CheckDeviceInfo(&instance) { - return errors.New("deviceInfo failed the validation") - } - - hwlog.RunLog.Infof("deviceId: (%#v)", deviceInfo) - - b.cmMu.Lock() - defer 
b.cmMu.Unlock() - var rankIndexStr string - // Get rankIndex from pod, use rankIndex if rankIndex exists in pod, use memory if it doesn't. - rankIndexStr, rankExist := pod.Annotations[PodRankIndexKey] - if rankExist { - return fmt.Errorf("pod %s/%s already has rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) - } - rankIndexStr = strconv.Itoa(int(b.rankIndex)) - - // Cache device info from the pod - err := b.configmapData.CachePodInfo(pod, instance, rankIndexStr) - if err != nil { - return err - } - - err = b.updatePod(pod, func(newPod *apiCoreV1.Pod) { - newPod.Annotations[PodRankIndexKey] = rankIndexStr - }) - if err != nil { - return err - } - b.rankIndex++ - - // Cache pod num plus one - b.modifyStatistics(1) - hwlog.RunLog.Infof("rank table build progress for %s/%s: pods need to be cached = %d, "+ - "pods already cached = %d", podInfo.namespace, podInfo.jobName, b.taskReplicasTotal, b.cachedPodNum) - // update configmap if finishing caching all pods' info - errs := updateWithFinish(b, podInfo.namespace) - if errs != nil { - return errs - } - - return nil -} - -func validate(rank int64) error { - if rank < 0 || rank > maxRankIndex { - return fmt.Errorf("rank index from pod is error") - } - return nil -} - -func (b *WorkerInfo) getOrSetPodIndex(pod *apiCoreV1.Pod) (string, error) { - var rankIndexStr string - - rankIndexStr, rankExist := pod.Annotations[PodRankIndexKey] - - if rankExist { - hwlog.RunLog.Infof("pod(%s/%s) already has rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) - } else { - for _, env := range pod.Spec.Containers[0].Env { - if env.Name == vcPodIndexKey { - rankIndexStr = env.Value - } - } - if rankIndexStr == "" { - return "", errors.New("index env not found in pod") - } - err := b.updatePod(pod, func(newPod *apiCoreV1.Pod) { - newPod.Annotations[PodRankIndexKey] = rankIndexStr - }) - if err != nil { - return "", err - } - hwlog.RunLog.Infof("set pod(%s/%s) rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) - } - b.cachedIndex.Store(rankIndexStr, true) - return rankIndexStr, nil -} - -func (b *WorkerInfo) updatePod(pod *apiCoreV1.Pod, updateFunc func(*apiCoreV1.Pod)) error { - return retry.RetryOnConflict(retry.DefaultBackoff, func() error { - newPod, err := b.kubeclientset.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) - if err != nil { - return err - } - updateFunc(newPod) - _, err = b.kubeclientset.CoreV1().Pods(pod.Namespace).Update(context.TODO(), newPod, metav1.UpdateOptions{}) - return err - }) -} - -func (b *WorkerInfo) handleDeleteEvent(podInfo *podIdentifier) error { - //hwlog.RunLog.Infof("current handleDeleteEvent pod is %s", podInfo) - //b.cmMu.Lock() - //defer b.cmMu.Unlock() - //status := b.configmapData.GetStatus() - //err := b.configmapData.RemovePodInfo(podInfo.namespace, podInfo.uid) - //if err != nil { - // return err - //} - // - //hwlog.RunLog.Infof("start to remove data of pod %s/%s", podInfo.namespace, podInfo.name) - // - //if status == ConfigmapCompleted { - // b.configmapData.SetStatus(ConfigmapInitializing) - // hwlog.RunLog.Infof("pod(%s/%s) is delete, start to update configmap(%s) to initializing", podInfo.namespace, - // podInfo.name, b.configmapName) - // err = updateConfigMap(b, podInfo.namespace) - // if err != nil { - // b.configmapData.SetStatus(ConfigmapCompleted) - // return err - // } - //} - // - //rankIndex := podInfo.rankIndex - //if rankIndex != "" { - // _, ok := b.cachedIndex.Load(rankIndex) - // if !ok { - // return fmt.Errorf("cannot find pod(%v) rank index %s", podInfo, 
rankIndex) - // } - // b.cachedIndex.Store(rankIndex, false) - //} - //hwlog.RunLog.Infof("data of pod %s/%s is removed", podInfo.namespace, podInfo.name) - //b.cachedPods.Delete(podInfo.uid) - //b.configmapData.DeletePod(podInfo.uid) - //b.modifyStatistics(-1) - return nil -} - -func (b *WorkerInfo) endRankTableConstruction(namespace string) error { - return nil - - //return nil -} - -// modifyStatistics statistic about how many pods have already cached -func (b *WorkerInfo) modifyStatistics(diff int32) { - atomic.AddInt32(&b.cachedPodNum, diff) -} - -// CloseStatistic : to close statisticSwitch chan -func (b *WorkerInfo) CloseStatistic() { - if !b.statisticStopped { - close(b.statisticSwitch) - b.statisticStopped = true - } -} - -func updateWithFinish(b *WorkerInfo, namespace string) error { - if b.tableConstructionFinished() { - if err := b.endRankTableConstruction(namespace); err != nil { - return err - } - } - return nil -} - -func getWorkName(labels map[string]string) string { - if label, ok := labels[VolcanoJobNameKey]; ok { - return label - } - if label, ok := labels[DeploymentNameKey]; ok { - return label - } - return "" -} - -func updateConfigMap(client kubernetes.Interface, rt *ranktablev1.RankTabler, namespace string) error { - //cm, err := client.CoreV1().ConfigMaps(namespace).Get(context.TODO(), - // w.configmapName, metav1.GetOptions{}) - //if err != nil { - // return fmt.Errorf("get configmap error: %v", err) - //} - //oldCM, ok := cm.Data[ConfigmapKey] - //if !ok { - // err = fmt.Errorf("old cm ranktable not exists") - // hwlog.RunLog.Debug(err) - // return err - //} - //hwlog.RunLog.Debugf("old cm ranktable %#v", oldCM) - //label910, exist := (*cm).Labels[Key910] - //if !exist || !(label910 == Val910B || label910 == Val910) { - // return fmt.Errorf("invalid configmap label: %s", label910) - //} - //dataByteArray, err := json.Marshal(w.configmapData) - //if err != nil { - // return fmt.Errorf("marshal configmap data error: %v", err) - //} - //cm.Data[ConfigmapKey] = string(dataByteArray[:]) - // - //if _, err = w.kubeclientset.CoreV1().ConfigMaps(namespace).Update(context.TODO(), cm, - // metav1.UpdateOptions{}); err != nil { - // return fmt.Errorf("failed to update ConfigMap for Job %v", err) - //} - //hwlog.RunLog.Debugf("new cm ranktable %s", cm.Data[ConfigmapKey]) - return nil -} diff --git a/pkg/ring-controller/agent/vcjobworker_test.go b/pkg/ring-controller/agent/vcjobworker_test.go deleted file mode 100644 index 57237d0..0000000 --- a/pkg/ring-controller/agent/vcjobworker_test.go +++ /dev/null @@ -1,442 +0,0 @@ -/* Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
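
CloseStatistic above guards close() with a plain bool, which is only safe while every caller happens to hold the same lock; sync.Once is the usual race-free way to get an idempotent close. A sketch of that alternative, not the project's code:

package main

import "sync"

// stopper closes its channel exactly once, from any goroutine, with no
// external locking required.
type stopper struct {
    ch   chan struct{}
    once sync.Once
}

func newStopper() *stopper { return &stopper{ch: make(chan struct{})} }

// Stop is safe to call repeatedly and concurrently.
func (s *stopper) Stop() { s.once.Do(func() { close(s.ch) }) }

// Done exposes the channel for select loops like Statistic above.
func (s *stopper) Done() <-chan struct{} { return s.ch }
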
-*/ - -package agent - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "k8s.io/client-go/kubernetes" - "reflect" - "sync" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" - fakecm "k8s.io/client-go/kubernetes/typed/core/v1/fake" - - ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" - v2 "hccl-controller/pkg/ring-controller/ranktable/v2" - _ "hccl-controller/pkg/testtool" -) - -const ( - NameSpace = "namespace" - DataKey = "hccl.json" - DataValue = `{"status":"initializing"}` - CMName = "rings-config-test1" -) - -// TestGetWorkName test GetWorkName -func TestGetWorkName(t *testing.T) { - convey.Convey("agent GetWorkName", t, func() { - labels := make(map[string]string, 1) - - convey.Convey(" return volcano-job when label contains VolcanoJobNameKey ", func() { - labels[VolcanoJobNameKey] = VolcanoJobNameKey - labels[DeploymentNameKey] = DeploymentNameKey - work := getWorkName(labels) - convey.So(work, convey.ShouldEqual, VolcanoJobNameKey) - }) - convey.Convey(" return deployment-name when label contains VolcanoJobNameKey ", func() { - labels[DeploymentNameKey] = DeploymentNameKey - work := getWorkName(labels) - convey.So(work, convey.ShouldEqual, DeploymentNameKey) - }) - }) -} - -// TestUpdateConfigMap test UpdateConfigMap -func TestUpdateConfigMap(t *testing.T) { - convey.Convey("agent updateConfigMap", t, func() { - kube := fake.NewSimpleClientset() - work := &WorkerInfo{kubeclientset: kube, configmapName: CMName} - convey.Convey(" return err != nil when cm not exist ", func() { - err := updateConfigMap(work, NameSpace) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey(" return err != nil when label in cm not exist Key910 ", func() { - data := make(map[string]string, 1) - putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName, - Namespace: NameSpace}, Data: data} - kube.CoreV1().ConfigMaps(NameSpace).Create(context.TODO(), putCM, metav1.CreateOptions{}) - err := updateConfigMap(work, NameSpace) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey(" return err != nil when update cm error ", func() { - updateWhenUpdateCmErr(kube, work) - }) - convey.Convey(" return err == nil when label in cm normal ", func() { - updateWhenCMNormal(kube, work) - }) - }) - -} - -// TestNewCachedIndex test for newCachedIndex -func TestNewCachedIndex(t *testing.T) { - convey.Convey("test newCachedIndex", t, func() { - convey.Convey("return empty map when input is 0", func() { - c := newCachedIndex(0) - convey.ShouldEqual(c, sync.Map{}) - }) - convey.Convey(" map has value when input is 0", func() { - const jobReplicas = 2 - c := newCachedIndex(jobReplicas) - v, exist := c.Load("1") - convey.ShouldEqual(exist, true) - convey.ShouldEqual(v.(bool), false) - }) - }) -} - -func updateWhenCMNormal(kube *fake.Clientset, work *WorkerInfo) { - data := make(map[string]string, 1) - label := make(map[string]string, 1) - data[DataKey] = DataValue - label[Key910] = Val910 - putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName, - Namespace: NameSpace, Labels: label}, Data: data} - kube.CoreV1().ConfigMaps(NameSpace).Create(context.TODO(), putCM, metav1.CreateOptions{}) - work.configmapData = &ranktablev1.RankTable{RankTableStatus: ranktablev1.RankTableStatus{ - Status: "initializing", - }} - work.configmapData.SetStatus(ConfigmapCompleted) - err := 
updateConfigMap(work, NameSpace) - convey.So(err, convey.ShouldEqual, nil) - cm, _ := kube.CoreV1().ConfigMaps(NameSpace).Get(context.TODO(), CMName, - metav1.GetOptions{}) - convey.So(cm.Data[DataKey], convey.ShouldEqual, `{"status":"completed","group_list":null,"group_count":""}`) -} - -func updateWhenUpdateCmErr(kube *fake.Clientset, work *WorkerInfo) { - label := make(map[string]string, 1) - label[Key910] = Val910 - data := make(map[string]string, 1) - data[DataKey] = DataValue - putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName, - Namespace: NameSpace, Labels: label}, Data: data} - kube.CoreV1().ConfigMaps("namespace").Create(context.TODO(), putCM, metav1.CreateOptions{}) - work.configmapData = &ranktablev1.RankTable{RankTableStatus: ranktablev1.RankTableStatus{ - Status: "initializing", - }} - work.configmapData.SetStatus(ConfigmapCompleted) - patch := gomonkey.ApplyMethod(reflect.TypeOf(kube.CoreV1().ConfigMaps(NameSpace)), - "Update", func(_ *fakecm.FakeConfigMaps, _ context.Context, _ *corev1.ConfigMap, - _ metav1.UpdateOptions) (*corev1.ConfigMap, error) { - return nil, fmt.Errorf("update config error") - }) - defer patch.Reset() - err := updateConfigMap(work, NameSpace) - convey.So(err, convey.ShouldNotEqual, nil) - cm, _ := kube.CoreV1().ConfigMaps(NameSpace).Get(context.TODO(), CMName, - metav1.GetOptions{}) - convey.So(cm.Data[DataKey], convey.ShouldEqual, DataValue) -} - -// TestWorkerInfoCloseStatistic test WorkerInfo_CloseStatistic -func TestWorkerInfoCloseStatistic(t *testing.T) { - convey.Convey("agent TestWorkerInfo_CloseStatistic", t, func() { - w := &WorkerInfo{statisticStopped: true, statisticSwitch: make(chan struct{})} - - convey.Convey(" chan not close when statisticStopped is true ", func() { - w.CloseStatistic() - go func() { - w.statisticSwitch <- struct{}{} - }() - _, open := <-w.statisticSwitch - convey.So(open, convey.ShouldEqual, true) - }) - - }) -} - -// TestVCJobWorkerStatistic test VCJobWorker_Statistic -func TestVCJobWorkerStatistic(t *testing.T) { - convey.Convey("agent VCJobWorker_Statistic", t, func() { - vc := &VCJobWorker{WorkerInfo: WorkerInfo{statisticSwitch: make(chan struct{}), statisticStopped: false}} - const ( - TaskRep = 2 - SleepTime = 3 - ) - - convey.Convey(" chan will return when chan close ", func() { - vc.taskReplicasTotal = TaskRep - vc.cachedPodNum = 1 - go func() { - time.Sleep(SleepTime * time.Second) - vc.CloseStatistic() - }() - vc.Statistic(1 * time.Second) - }) - - convey.Convey(" chan will return when taskReplicasTotal==cachedPodNum ", func() { - const CachePod = 2 - vc.taskReplicasTotal = TaskRep - vc.cachedPodNum = CachePod - vc.Statistic(1 * time.Second) - }) - }) -} - -// TestValidateRank validate rank range -func TestValidateRank(t *testing.T) { - convey.Convey("test validate rank", t, func() { - convey.Convey("invalid rank too small", func() { - err := validate(-1) - convey.So(err, convey.ShouldBeError) - }) - convey.Convey("invalid rank too large", func() { - err := validate(maxRankIndex + 1) - convey.So(err, convey.ShouldBeError) - }) - convey.Convey("correct rank", func() { - err := validate(1) - convey.So(err, convey.ShouldBeNil) - }) - }) -} - -func TestGetPodIndex(t *testing.T) { - convey.Convey("test get PodIndex", t, func() { - worker := &WorkerInfo{ - cachedIndex: &sync.Map{}, - taskReplicasTotal: 2, - } - convey.Convey("pod with not digital rankIndex will return err", func() { - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - PodRankIndexKey: "xxx", 
- }, - }, - } - rank, err := worker.getOrSetPodIndex(pod) - convey.ShouldEqual(rank, "") - convey.ShouldNotBeNil(err) - }) - convey.Convey("pod with invalid rankIndex will return err", func() { - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - PodRankIndexKey: "-1", - }, - }, - } - rank, err := worker.getOrSetPodIndex(pod) - convey.ShouldEqual(rank, "") - convey.ShouldNotBeNil(err) - }) - convey.Convey("pod with valid rankIndex will return normal rank and nil", func() { - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{ - PodRankIndexKey: "0", - }, - }, - } - rank, err := worker.getOrSetPodIndex(pod) - convey.ShouldEqual(rank, 0) - convey.ShouldBeNil(err) - }) - }) -} - -func TestSetPodIndex(t *testing.T) { - convey.Convey("test SetPodIndex", t, func() { - worker := &WorkerInfo{ - cachedIndex: &sync.Map{}, - taskReplicasTotal: 2, - } - convey.Convey("pod without rankIndex will return normal rank and nil", func() { - pod := &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Annotations: map[string]string{}, - }, - Spec: corev1.PodSpec{ - Containers: []corev1.Container{ - { - Env: []corev1.EnvVar{ - { - Name: vcPodIndexKey, - Value: "1", - }, - }, - }, - }, - }, - } - patch := gomonkey.ApplyPrivateMethod(new(WorkerInfo), "updatePod", func(_ *corev1.Pod, - _ func(*corev1.Pod)) error { - return nil - }) - defer patch.Reset() - rank, err := worker.getOrSetPodIndex(pod) - convey.ShouldEqual(rank, 1) - convey.ShouldBeNil(err) - }) - }) -} - -func TestCacheRankTable(t *testing.T) { - vc, fakePod1, fakePod2 := prepareForTest() - convey.Convey("test cacheRankTable", t, func() { - - convey.Convey("cache pods not reach replicas will return false", func() { - vc.cachedPodNum = 1 - res := vc.cacheRankTable() - convey.So(res, convey.ShouldBeFalse) - }) - convey.Convey("get pods from cache failed will return false", func() { - patch := gomonkey.ApplyPrivateMethod(new(VCJobWorker), "getPodsFromCache", - func(_ *VCJobWorker) ([]*corev1.Pod, error) { - return nil, errors.New("get pods from cache failed") - }) - defer patch.Reset() - res := vc.cacheRankTable() - convey.So(res, convey.ShouldBeFalse) - }) - convey.Convey("there has pod who has been delete will return false", func() { - patch := gomonkey.ApplyPrivateMethod(new(VCJobWorker), "getPodsFromCache", - func(_ *VCJobWorker) ([]*corev1.Pod, error) { - return []*corev1.Pod{fakePod1, fakePod2}, nil - }) - defer patch.Reset() - res := vc.cacheRankTable() - convey.So(res, convey.ShouldBeFalse) - }) - convey.Convey("cache ready pods failed will return false", func() { - fakePod2.DeletionTimestamp = nil - patch1 := gomonkey.ApplyPrivateMethod(new(VCJobWorker), "getPodsFromCache", - func(_ *VCJobWorker) ([]*corev1.Pod, error) { - return []*corev1.Pod{fakePod1, fakePod2}, nil - }) - defer patch1.Reset() - patch2 := gomonkey.ApplyPrivateMethod(new(VCJobWorker), "cacheReadyPods", func(_ *VCJobWorker, - _ []*corev1.Pod) error { - return errors.New("cacheReadyPods failed") - }) - defer patch2.Reset() - res := vc.cacheRankTable() - convey.So(res, convey.ShouldBeFalse) - }) - }) -} - -func TestCacheReadyPods(t *testing.T) { - const ( - fakeReplicas = 1 - ) - convey.Convey("test cacheReadyPods", t, func() { - vc := getVCJobWorker(fakeReplicas) - instance := ranktablev1.Instance{ - Devices: []ranktablev1.Device{ - { - DeviceID: "1", - DeviceIP: "0.0.0.0", - }, - }, - - PodName: "pod1", - ServerID: "0.0.0.0", - } - device, err := json.Marshal(instance) - if err != nil { - return - } - fakePod := 
&corev1.Pod{ - TypeMeta: metav1.TypeMeta{}, - ObjectMeta: metav1.ObjectMeta{ - Name: "pod", - Namespace: "default", - Annotations: map[string]string{ - PodDeviceKey: string(device), - }, - }, - } - - pods := []*corev1.Pod{fakePod} - convey.Convey("unmarshal configuration failed will return err", func() { - patch := gomonkey.ApplyFunc(json.Unmarshal, func(_ []byte, _ interface{}) error { - return errors.New("unmarshal configuration faild") - }) - defer patch.Reset() - err = vc.cacheReadyPods(pods) - convey.ShouldNotBeNil(err) - }) - convey.Convey("check configuration failed will return err", func() { - patch := gomonkey.ApplyFunc(ranktablev1.CheckDeviceInfo, func(_ *ranktablev1.Instance) bool { - return false - }) - defer patch.Reset() - err = vc.cacheReadyPods(pods) - convey.ShouldNotBeNil(err) - }) - }) -} - -func getVCJobWorker(fakeReplicas int32) *VCJobWorker { - return &VCJobWorker{ - WorkerInfo: WorkerInfo{ - statisticSwitch: make(chan struct{}), - statisticStopped: false, - statisticMu: sync.Mutex{}, - taskReplicasTotal: fakeReplicas, - cachedPodNum: fakeReplicas, - configmapData: &v2.RankTable{ServerCount: "1", ServerList: []*v2.Server(nil), - RankTableStatus: ranktablev1.RankTableStatus{Status: ConfigmapInitializing}, Version: "1.0"}, - cachedPods: &sync.Map{}, - cachedIndex: newCachedIndex(int(fakeReplicas)), - kubeclientset: &kubernetes.Clientset{}, - }, - } -} - -func prepareForTest() (*VCJobWorker, *corev1.Pod, *corev1.Pod) { - const ( - fakeReplicas = 2 - fakeNamespace = "default" - ) - vc := &VCJobWorker{ - WorkerInfo: WorkerInfo{ - statisticSwitch: make(chan struct{}), - statisticStopped: false, - statisticMu: sync.Mutex{}, - taskReplicasTotal: fakeReplicas, - cachedPodNum: fakeReplicas, - }, - } - fakePod1 := &corev1.Pod{ - TypeMeta: metav1.TypeMeta{}, - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Namespace: fakeNamespace, - }, - } - fakePod2 := &corev1.Pod{ - TypeMeta: metav1.TypeMeta{}, - ObjectMeta: metav1.ObjectMeta{ - Name: "pod2", - Namespace: fakeNamespace, - DeletionTimestamp: &metav1.Time{}, - }, - } - return vc, fakePod1, fakePod2 -} diff --git a/pkg/ring-controller/common/constants.go b/pkg/ring-controller/common/constants.go index 7f63ee3..7be6a68 100644 --- a/pkg/ring-controller/common/constants.go +++ b/pkg/ring-controller/common/constants.go @@ -26,19 +26,54 @@ const ( MaxPodParallelism = 32 // InformerInterval informer interval time. 
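
prepareForTest above marks one fake pod as terminating through DeletionTimestamp, which is exactly what the deleted checkJobIsRunning filtered on. That liveness check in isolation (both helper names are made up):

package main

import (
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// countLivePods mirrors the deleted checkJobIsRunning logic: a pod carrying
// a DeletionTimestamp is already terminating and must not be counted.
func countLivePods(pods []*corev1.Pod) int32 {
    var live int32
    for _, p := range pods {
        if p.GetDeletionTimestamp() == nil {
            live++
        }
    }
    return live
}

// terminatingPod builds a fixture like fakePod2 in the deleted test above.
func terminatingPod(name string) *corev1.Pod {
    now := metav1.Now()
    return &corev1.Pod{ObjectMeta: metav1.ObjectMeta{
        Name:              name,
        DeletionTimestamp: &now,
    }}
}
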
InformerInterval = 30 - - // Index0 index 0 - Index0 = 0 - // Index1 index 1 - Index1 = 1 - // Index2 index 2 - Index2 = 2 - // Index3 index 3 - Index3 = 3 - // A800MaxChipNum the max num of Ascend910(B) in Atlas 800 server A800MaxChipNum = 16 +) + +const ( - // PodHasBeenCachedLogPattern log pattern of pod has been cached - PodHasBeenCachedLogPattern = "ANOMALY: pod %s/%s is already cached" + // Key910 to get Configmap + Key910 = "ring-controller.atlas" + // Val910 to get Configmap + Val910 = "ascend-910" + // Val910B to get Configmap + Val910B = "ascend-910b" + // A910ResourceName resource name for 910 + A910ResourceName = "huawei.com/Ascend910" + // ConfigmapPrefix to get from configmap + ConfigmapPrefix = "rings-config-" + // ConfigmapCompleted Staus + ConfigmapCompleted = "completed" + // ConfigmapInitializing status + ConfigmapInitializing = "initializing" + // ConfigmapKey configmap Data Name + ConfigmapKey = "hccl.json" + // VolcanoJobNameKey to get job name + VolcanoJobNameKey = "volcano.sh/job-name" + // VolcanoJobNamespaceKey to get job namespace + VolcanoJobNamespaceKey = "volcano.sh/job-namespace" + // PodDeviceKey Pod annoation Key + PodDeviceKey = "ascend.kubectl.kubernetes.io/ascend-910-configuration" + // PodRankIndexKey pod rank index + PodRankIndexKey = "hccl/rankIndex" + // DeploymentNameKey pod label + DeploymentNameKey = "deploy-name" + // EventAdd event add + EventAdd = "add" + // EventUpdate event to update + EventUpdate = "update" + // EventDelete event to delete + EventDelete = "delete" + + VcPodIndexKey = "VC_TASK_INDEX" + + RetryMilliSecond = 5 + ThreeMinutes = 180 +) + +const ( + DefaultQPS = 200.0 + DefaultBurst = 200 + maxQPS = 10000.0 + maxBurst = 10000 ) diff --git a/pkg/ring-controller/common/k8sclient.go b/pkg/ring-controller/common/k8sclient.go new file mode 100644 index 0000000..5a497da --- /dev/null +++ b/pkg/ring-controller/common/k8sclient.go @@ -0,0 +1,64 @@ +package common + +import ( + "fmt" + "hccl-controller/pkg/ring-controller/config" + "huawei.com/npu-exporter/v5/common-utils/hwlog" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/selection" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + "time" + "volcano.sh/apis/pkg/client/clientset/versioned" + "volcano.sh/apis/pkg/client/informers/externalversions" +) + +// NewClientK8s create k8s client +func NewClientK8s(config *config.Config) (*kubernetes.Clientset, *versioned.Clientset, error) { + cfg, err := clientcmd.BuildConfigFromFlags("", "") + if err != nil { + hwlog.RunLog.Errorf("build client config err: %#v", err) + return nil, nil, err + } + if config.Qps <= 0 || config.Qps > maxQPS { + hwlog.RunLog.Warnf("kubeApiQps is invalid, require (0, %f) use default value %f", maxQPS, DefaultQPS) + config.Qps = DefaultQPS + } + if config.Burst <= 0 || config.Burst > maxBurst { + hwlog.RunLog.Warnf("kubeApiBurst is invalid, require (0, %d) use default value %d", maxBurst, DefaultBurst) + config.Burst = DefaultBurst + } + cfg.QPS = float32(config.Qps) + cfg.Burst = config.Burst + kubeClient, err := kubernetes.NewForConfig(cfg) + if err != nil { + return nil, nil, fmt.Errorf("error building kubernetes clientset: %s", err.Error()) + } + jobClient, err := versioned.NewForConfig(cfg) + if err != nil { + return nil, nil, fmt.Errorf("error building job clientset: %s", err.Error()) + } + return kubeClient, jobClient, nil +} + +func NewInformerFactory(jobClient *versioned.Clientset, kubeClient 
*kubernetes.Clientset) ( + externalversions.SharedInformerFactory, informers.SharedInformerFactory, error) { + temp, newErr := labels.NewRequirement(Key910, selection.In, []string{Val910B, Val910}) + if newErr != nil { + hwlog.RunLog.Infof("newInformerFactory %s", newErr) + return nil, nil, newErr + } + labelSelector := temp.String() + jobInformerFactory := externalversions.NewSharedInformerFactoryWithOptions(jobClient, + time.Second*InformerInterval, externalversions.WithTweakListOptions(func(options *v1. + ListOptions) { + options.LabelSelector = labelSelector + })) + deploymentFactory := informers.NewSharedInformerFactoryWithOptions(kubeClient, + time.Second*InformerInterval, informers.WithTweakListOptions(func(options *v1.ListOptions) { + options.LabelSelector = labelSelector + })) + return jobInformerFactory, deploymentFactory, nil +} diff --git a/pkg/ring-controller/config/configs.go b/pkg/ring-controller/config/configs.go new file mode 100644 index 0000000..2bc941c --- /dev/null +++ b/pkg/ring-controller/config/configs.go @@ -0,0 +1,16 @@ +package config + +type Config struct { + // DryRun:Is it a test + DryRun bool + // DisplayStatistic : a flag if starts to report rank table build statistic for job + DisplayStatistic bool + // PodParallelism : how many goroutine to run in the agent + PodParallelism int + // QPS + Qps float64 + // Burst + Burst int + // HcclVersion + HcclVersion string +} diff --git a/pkg/ring-controller/agent/businessagent.go b/pkg/ring-controller/controller/businessagent.go similarity index 68% rename from pkg/ring-controller/agent/businessagent.go rename to pkg/ring-controller/controller/businessagent.go index e856240..55d6c2c 100644 --- a/pkg/ring-controller/agent/businessagent.go +++ b/pkg/ring-controller/controller/businessagent.go @@ -13,37 +13,38 @@ */ // Package agent for run the logic -package agent +package controller import ( "context" "encoding/json" "errors" "fmt" + "hccl-controller/pkg/ring-controller/config" + "k8s.io/apimachinery/pkg/util/runtime" + typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "strconv" + samplescheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme" ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" v2 "hccl-controller/pkg/ring-controller/ranktable/v2" appsV1 "k8s.io/api/apps/v1" "k8s.io/client-go/util/retry" "reflect" - "strings" "sync" "sync/atomic" "time" "volcano.sh/apis/pkg/apis/batch/v1alpha1" "huawei.com/npu-exporter/v5/common-utils/hwlog" - apiCoreV1 "k8s.io/api/core/v1" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/workqueue" @@ -61,62 +62,76 @@ func (p *podIdentifier) String() string { // implemented in the form of worker interface in the agent framework run. // Agent monitors POD events with a specific label and implements the // combination of tasks through different workers at different times. 
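
NewInformerFactory above narrows every informer to objects labeled ring-controller.atlas in (ascend-910, ascend-910b). The selector it builds can be reproduced standalone:

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/apimachinery/pkg/selection"
)

func main() {
    // Same requirement the factory builds: Key910 in {Val910B, Val910}.
    req, err := labels.NewRequirement(
        "ring-controller.atlas", selection.In, []string{"ascend-910b", "ascend-910"})
    if err != nil {
        panic(err)
    }
    // Prints: ring-controller.atlas in (ascend-910,ascend-910b)
    fmt.Println(labels.NewSelector().Add(*req).String())
}
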
-func NewBusinessAgent(kubeClientSet kubernetes.Interface, indexers map[string]cache.Indexer, recorder record.EventRecorder, - config *Config, - stopCh <-chan struct{}) (*BusinessAgent, error) { - // create pod informer factory - temp, newErr := labels.NewRequirement(Key910, selection.In, []string{Val910B, Val910}) +func NewBusinessAgent(config *config.Config, stopCh <-chan struct{}) (*BusinessAgent, error) { + runtime.Must(samplescheme.AddToScheme(scheme.Scheme)) + + kubeClient, jobClient, err := common.NewClientK8s(config) + if err != nil { + hwlog.RunLog.Error(err) + return nil, err + } + hwlog.RunLog.Info("Creating event broadcaster") + eventBroadcaster := record.NewBroadcaster() + eventBroadcaster.StartLogging(hwlog.RunLog.Infof) + eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) + recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: controllerName}) + + jobInformerFactory, k8sInformerFactory, newErr := common.NewInformerFactory(jobClient, kubeClient) if newErr != nil { - hwlog.RunLog.Infof("NewBusinessAgent %s", newErr) + hwlog.RunLog.Error(newErr) return nil, newErr } - - labelSelector := temp.String() - podInformerFactory := informers.NewSharedInformerFactoryWithOptions(kubeClientSet, - time.Second*common.InformerInterval, informers.WithTweakListOptions(func(options *metav1.ListOptions) { - options.LabelSelector = labelSelector - })) - - // each worker share the same init parameters stored here + jobInformer := jobInformerFactory.Batch().V1alpha1().Jobs().Informer() + rsInformer := k8sInformerFactory.Apps().V1().ReplicaSets().Informer() + cmInformer := k8sInformerFactory.Core().V1().ConfigMaps().Informer() + podInformer := k8sInformerFactory.Core().V1().Pods().Informer() businessAgent := &BusinessAgent{ - informerFactory: podInformerFactory, - podInformer: podInformerFactory.Core().V1().Pods().Informer(), - PodsIndexer: podInformerFactory.Core().V1().Pods().Informer().GetIndexer(), - Indexers: indexers, - configMapCache: make(map[types.UID]ranktablev1.RankTabler), + informerFactory: k8sInformerFactory, + podInformer: podInformer, + PodsIndexer: podInformer.GetIndexer(), + JobsIndexer: jobInformer.GetIndexer(), + ReplicaSetIndexer: rsInformer.GetIndexer(), + ConfigmapsIndexer: cmInformer.GetIndexer(), + configMapCache: make(map[types.UID]ranktablev1.RankTabler), Workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter( - retryMilliSecond*time.Millisecond, threeMinutes*time.Second), "Pods"), - KubeClientSet: kubeClientSet, - BusinessWorker: make(map[types.UID]Worker), - recorder: recorder, - Config: config, - agentSwitch: stopCh, + common.RetryMilliSecond*time.Millisecond, common.ThreeMinutes*time.Second), "Pods"), + KubeClientSet: kubeClient, + recorder: recorder, + Config: config, } // when pod is added, annotation info is ready. No need to listen update event. 
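
The broadcaster and recorder wiring introduced above is the stock client-go pattern; pulled out on its own (clientset construction omitted, component name arbitrary), it amounts to:

package main

import (
    corev1 "k8s.io/api/core/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/kubernetes/scheme"
    typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    "k8s.io/client-go/tools/record"
)

// newRecorder wires an EventBroadcaster to the API server's event sink and
// returns a recorder that stamps events with the given component name.
func newRecorder(client kubernetes.Interface, component string) record.EventRecorder {
    broadcaster := record.NewBroadcaster()
    broadcaster.StartRecordingToSink(
        &typedcorev1.EventSinkImpl{Interface: client.CoreV1().Events("")})
    return broadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: component})
}
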
businessAgent.podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { - businessAgent.enqueuePod(obj, EventAdd) + businessAgent.enqueuePod(obj, common.EventAdd) }, UpdateFunc: func(old, new interface{}) { if !reflect.DeepEqual(old, new) { - businessAgent.enqueuePod(new, EventUpdate) + businessAgent.enqueuePod(new, common.EventUpdate) } }, DeleteFunc: func(obj interface{}) { - businessAgent.enqueuePod(obj, EventDelete) + businessAgent.enqueuePod(obj, common.EventDelete) }, }) hwlog.RunLog.Info("start informer factory") - go podInformerFactory.Start(stopCh) + go k8sInformerFactory.Start(stopCh) + go jobInformerFactory.Start(stopCh) + hwlog.RunLog.Info("waiting for informer caches to sync") - if ok := cache.WaitForCacheSync(stopCh, businessAgent.podInformer.HasSynced); !ok { + if ok := cache.WaitForCacheSync( + stopCh, + podInformer.HasSynced, + jobInformer.HasSynced, + rsInformer.HasSynced, + cmInformer.HasSynced, + ); !ok { hwlog.RunLog.Errorf("caches sync failed") return businessAgent, fmt.Errorf("caches sync failed") } - return businessAgent, businessAgent.run(config.PodParallelism) + return businessAgent, nil } // enqueuePod to through the monitoring of POD time, @@ -130,13 +145,13 @@ func (b *BusinessAgent) enqueuePod(obj interface{}, eventType string) { b.Workqueue.AddRateLimited(podInfo) } -func (b *BusinessAgent) run(threadiness int) error { +func (b *BusinessAgent) Run(threadiness int, stopCh <-chan struct{}) error { hwlog.RunLog.Info("Starting workers") for i := 0; i < threadiness; i++ { - go wait.Until(b.runMasterWorker, time.Second, b.agentSwitch) + go wait.Until(b.runMasterWorker, time.Second, stopCh) } hwlog.RunLog.Info("Started workers") - + <-stopCh return nil } @@ -182,33 +197,10 @@ func (b *BusinessAgent) doWork(obj interface{}) bool { } func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, podKeyInfo *podIdentifier) bool { - // Lock to safely obtain worker data in the Map - //b.RwMutex.RLock() - //defer b.RwMutex.RUnlock() - //bsnsWorker, workerExist := b.BusinessWorker[podKeyInfo.jobUid] - //hwlog.RunLog.Debugf(" worker : \n %+v", b.BusinessWorker) - //if !workerExist { - // if !podExist { - // b.Workqueue.Forget(obj) - // hwlog.RunLog.Infof("syncing '%s' terminated: current obj is no longer exist", podKeyInfo.String()) - // return true - // } - // // if someone create a single 910 pod without a job, how to handle? 
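
Run above fans workers out with wait.Until, and each worker then drains the rate-limited queue. The canonical consume step looks roughly like this; drainOne and its handler are illustrative names, not the controller's:

package main

import (
    "fmt"

    "k8s.io/client-go/util/workqueue"
)

// drainOne pulls one item, processes it, and tells the queue how it went:
// Forget on success, AddRateLimited to retry with backoff on failure.
// It returns false only when the queue is shutting down.
func drainOne(q workqueue.RateLimitingInterface, handle func(interface{}) error) bool {
    item, shutdown := q.Get()
    if shutdown {
        return false
    }
    defer q.Done(item)
    if err := handle(item); err != nil {
        fmt.Printf("retrying %v: %v\n", item, err)
        q.AddRateLimited(item)
        return true
    }
    q.Forget(item)
    return true
}
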
- // hwlog.RunLog.Debugf("syncing '%s' delayed: corresponding job worker may be uninitialized", - // podKeyInfo.String()) - // return false - //} labelSeletor := map[string]string{} replicas := int32(0) if podKeyInfo.ownerKind == "Job" { - jobIndexer, ok := b.Indexers[VCJobType] - if !ok { - b.Workqueue.Forget(obj) - hwlog.RunLog.Errorf("job indexer not exist") - return true - } - - jobObj, jobExist, err := jobIndexer.GetByKey(podKeyInfo.namespace + "/" + podKeyInfo.ownerName) + jobObj, jobExist, err := b.JobsIndexer.GetByKey(podKeyInfo.namespace + "/" + podKeyInfo.ownerName) if err != nil { hwlog.RunLog.Errorf("syncing '%s' failed: failed to get obj from indexer", podKeyInfo) return false @@ -226,8 +218,8 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p return false } labelSeletor = map[string]string{ - VolcanoJobNameKey: podKeyInfo.ownerName, - VolcanoJobNamespaceKey: podKeyInfo.namespace, + common.VolcanoJobNameKey: podKeyInfo.ownerName, + common.VolcanoJobNamespaceKey: podKeyInfo.namespace, } job, ok := jobObj.(*v1alpha1.Job) if !ok { @@ -240,8 +232,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p } } else if podKeyInfo.ownerKind == "ReplicaSet" { - rsIndexer := b.Indexers[ReplicaSetType] - rsObj, exist, err := rsIndexer.GetByKey(podKeyInfo.namespace + "/" + podKeyInfo.ownerName) + rsObj, exist, err := b.ReplicaSetIndexer.GetByKey(podKeyInfo.namespace + "/" + podKeyInfo.ownerName) if err != nil { hwlog.RunLog.Errorf("syncing '%s' failed: failed to get obj from api-server", podKeyInfo) return false @@ -262,7 +253,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p replicas = *rsObj.(*appsV1.ReplicaSet).Spec.Replicas } - if podKeyInfo.eventType == EventDelete { + if podKeyInfo.eventType == common.EventDelete { b.Workqueue.Forget(obj) if err := b.handleDeleteEvent(podKeyInfo); err != nil { // only logs need to be recorded. 
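With run() dropped from the constructor (see the hunks above), starting the processing loop is now the caller's responsibility via the exported Run. A minimal wiring sketch, assuming the caller owns the stop channel; everything except NewBusinessAgent, Run and PodParallelism is illustrative:

	stopCh := make(chan struct{})
	agent, err := NewBusinessAgent(cfg, stopCh)
	if err != nil {
		hwlog.RunLog.Errorf("create business agent failed: %v", err)
		return
	}
	// Run blocks until stopCh is closed, so it is normally the caller's last call.
	if err := agent.Run(cfg.PodParallelism, stopCh); err != nil {
		hwlog.RunLog.Errorf("business agent exited: %v", err)
	}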
@@ -275,7 +266,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p return true } - pod, ok := tmpObj.(*apiCoreV1.Pod) + pod, ok := tmpObj.(*corev1.Pod) if !ok { hwlog.RunLog.Error("pod transform failed") return true @@ -304,14 +295,14 @@ func (b *BusinessAgent) handleDeleteEvent(podInfo *podIdentifier) error { } hwlog.RunLog.Infof("start to remove data of pod %s/%s", podInfo.namespace, podInfo.name) - configmapName := "rings-config" + "-" + podInfo.jobName - if status == ConfigmapCompleted { - rankTable.SetStatus(ConfigmapInitializing) + configmapName := common.ConfigmapPrefix + podInfo.jobName + if status == common.ConfigmapCompleted { + rankTable.SetStatus(common.ConfigmapInitializing) hwlog.RunLog.Infof("pod(%s/%s) is delete, start to update configmap(%s) to initializing", podInfo.namespace, podInfo.name, configmapName) err = b.updateConfigMap(rankTable, podInfo.namespace, configmapName) if err != nil { - rankTable.SetStatus(ConfigmapCompleted) + rankTable.SetStatus(common.ConfigmapCompleted) return err } } @@ -320,41 +311,36 @@ func (b *BusinessAgent) handleDeleteEvent(podInfo *podIdentifier) error { } func (b *BusinessAgent) updateConfigMap(rt ranktablev1.RankTabler, namespace, name string) error { - cmIndexer, ok := b.Indexers[ConfigmapType] - obj, exist, err := cmIndexer.GetByKey(namespace + "/" + name) + obj, exist, err := b.ConfigmapsIndexer.GetByKey(namespace + "/" + name) if err != nil { return fmt.Errorf("get configmap error: %v", err) } if !exist { return fmt.Errorf("configmap %s/%s not exist", namespace, name) } - cm := obj.(*apiCoreV1.ConfigMap) - oldCM, ok := cm.Data[ConfigmapKey] + cm := obj.(*corev1.ConfigMap) + oldCM, ok := cm.Data[common.ConfigmapKey] if !ok { err = fmt.Errorf("old cm ranktable not exists") hwlog.RunLog.Debug(err) return err } hwlog.RunLog.Debugf("old cm ranktable %#v", oldCM) - label910, exist := (*cm).Labels[Key910] - if !exist || !(label910 == Val910B || label910 == Val910) { - return fmt.Errorf("invalid configmap label: %s", label910) - } dataByteArray, err := json.Marshal(rt) if err != nil { return fmt.Errorf("marshal configmap data error: %v", err) } - cm.Data[ConfigmapKey] = string(dataByteArray[:]) + cm.Data[common.ConfigmapKey] = string(dataByteArray[:]) if _, err = b.KubeClientSet.CoreV1().ConfigMaps(namespace).Update(context.TODO(), cm, metav1.UpdateOptions{}); err != nil { return fmt.Errorf("failed to update ConfigMap for Job %v", err) } - hwlog.RunLog.Debugf("new cm ranktable %s", cm.Data[ConfigmapKey]) + hwlog.RunLog.Debugf("new cm ranktable %s", cm.Data[common.ConfigmapKey]) return nil } -func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIdentifier, +func (b *BusinessAgent) handleAddUpdateEvent(pod *corev1.Pod, podInfo *podIdentifier, labelSeletor map[string]string, replicas int32) error { b.RwMutex.Lock() defer b.RwMutex.Unlock() @@ -365,14 +351,14 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde RankTableStatus: ranktablev1.RankTableStatus{Status: "initializing"}, Version: "1.0"} } - if rankTable.GetStatus() == ConfigmapCompleted { + if rankTable.GetStatus() == common.ConfigmapCompleted { hwlog.RunLog.Debugf("ranktable of job <%s/%s> is completed", pod.Namespace, podInfo.jobName) return nil } - _, ok = pod.Annotations[PodDeviceKey] + _, ok = pod.Annotations[common.PodDeviceKey] if !ok { - return fmt.Errorf("pod %s/%s has no annotation %s", pod.Namespace, pod.Name, PodDeviceKey) + return fmt.Errorf("pod %s/%s has no annotation %s", 
pod.Namespace, pod.Name, common.PodDeviceKey) } hwlog.RunLog.Debugf("label selector: %v", labelSeletor) pods, err := b.informerFactory.Core().V1().Pods().Lister().List(labels.SelectorFromSet(labelSeletor)) @@ -385,8 +371,8 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde if p.DeletionTimestamp != nil { return fmt.Errorf("pod %s/%s is deleting", p.Namespace, p.Name) } - if _, ok = p.Annotations[PodDeviceKey]; !ok { - return fmt.Errorf("pod %s/%s has no annotation %s", p.Namespace, p.Name, PodDeviceKey) + if _, ok = p.Annotations[common.PodDeviceKey]; !ok { + return fmt.Errorf("pod %s/%s has no annotation %s", p.Namespace, p.Name, common.PodDeviceKey) } running++ } @@ -399,7 +385,7 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde wg := &sync.WaitGroup{} for index, p := range pods { wg.Add(1) - go func(i int, pod *apiCoreV1.Pod) { + go func(i int, pod *corev1.Pod) { defer wg.Done() if err = b.cachePod(rankTable, i, pod); err != nil { hwlog.RunLog.Errorf("cache pod<%s/%s> info failed: %s", pod.Namespace, pod.Name, err) @@ -413,9 +399,9 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde return fmt.Errorf("cache pod info failed") } - rankTable.SetStatus(ConfigmapCompleted) + rankTable.SetStatus(common.ConfigmapCompleted) rankTable.BeforeUpdate() - configmapName := "rings-config" + "-" + podInfo.jobName + configmapName := common.ConfigmapPrefix + podInfo.jobName hwlog.RunLog.Infof("job is ready, start to update configmap(%s/%s) to completed", pod.Namespace, configmapName) if err = b.updateConfigMap(rankTable, pod.Namespace, configmapName); err != nil { hwlog.RunLog.Error("update configmap failed") @@ -426,8 +412,8 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *apiCoreV1.Pod, podInfo *podIde } -func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, index int, pod *apiCoreV1.Pod) error { - deviceInfo := pod.Annotations[PodDeviceKey] +func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, index int, pod *corev1.Pod) error { + deviceInfo := pod.Annotations[common.PodDeviceKey] var instance ranktablev1.Instance if err := json.Unmarshal([]byte(deviceInfo), &instance); err != nil { @@ -448,10 +434,10 @@ func (b *BusinessAgent) cachePod(rt ranktablev1.RankTabler, index int, pod *apiC return nil } -func (b *BusinessAgent) getOrSetPodIndex(index int, pod *apiCoreV1.Pod) (string, error) { +func (b *BusinessAgent) getOrSetPodIndex(index int, pod *corev1.Pod) (string, error) { var rankIndexStr string - rankIndexStr, rankExist := pod.Annotations[PodRankIndexKey] + rankIndexStr, rankExist := pod.Annotations[common.PodRankIndexKey] if rankExist { hwlog.RunLog.Infof("pod(%s/%s) already has rankIndex: %s", pod.Namespace, pod.Name, rankIndexStr) @@ -461,7 +447,7 @@ func (b *BusinessAgent) getOrSetPodIndex(index int, pod *apiCoreV1.Pod) (string, rankIndexStr = strconv.Itoa(index) } else { for _, env := range pod.Spec.Containers[0].Env { - if env.Name == vcPodIndexKey { + if env.Name == common.VcPodIndexKey { rankIndexStr = env.Value } } @@ -470,8 +456,8 @@ func (b *BusinessAgent) getOrSetPodIndex(index int, pod *apiCoreV1.Pod) (string, if rankIndexStr == "" { return "", errors.New("index env not found in pod") } - err := b.updatePod(pod, func(newPod *apiCoreV1.Pod) { - newPod.Annotations[PodRankIndexKey] = rankIndexStr + err := b.updatePod(pod, func(newPod *corev1.Pod) { + newPod.Annotations[common.PodRankIndexKey] = rankIndexStr }) if err != nil { return "", err @@ -481,7 +467,7 @@ func (b 
*BusinessAgent) getOrSetPodIndex(index int, pod *apiCoreV1.Pod) (string, return rankIndexStr, nil } -func (b *BusinessAgent) updatePod(pod *apiCoreV1.Pod, updateFunc func(*apiCoreV1.Pod)) error { +func (b *BusinessAgent) updatePod(pod *corev1.Pod, updateFunc func(*corev1.Pod)) error { return retry.RetryOnConflict(retry.DefaultBackoff, func() error { newPod, err := b.KubeClientSet.CoreV1().Pods(pod.Namespace).Get(context.TODO(), pod.Name, metav1.GetOptions{}) if err != nil { @@ -519,6 +505,16 @@ func (b *BusinessAgent) nameGenerationFunc(obj interface{}, eventType string) (* }, nil } +func getWorkName(labelMaps map[string]string) string { + if jobName, ok := labelMaps[common.VolcanoJobNameKey]; ok { + return jobName + } + if deployName, ok := labelMaps[common.DeploymentNameKey]; ok { + return deployName + } + return "" +} + func getControlled(obj metav1.Object) *metav1.OwnerReference { for _, owner := range obj.GetOwnerReferences() { if *owner.Controller { @@ -528,7 +524,7 @@ func getControlled(obj metav1.Object) *metav1.OwnerReference { return nil } -func isReferenceJobSameWithBsnsWorker(pod *apiCoreV1.Pod, jobName, bsnsWorkerUID string) bool { +func isReferenceJobSameWithBsnsWorker(pod *corev1.Pod, jobName, bsnsWorkerUID string) bool { sameWorker := false for _, owner := range pod.OwnerReferences { if owner.Name == jobName && string(owner.UID) == bsnsWorkerUID { @@ -538,58 +534,3 @@ func isReferenceJobSameWithBsnsWorker(pod *apiCoreV1.Pod, jobName, bsnsWorkerUID } return sameWorker } - -func isPodAnnotationsReady(pod *apiCoreV1.Pod, identifier string) bool { - _, exist := pod.Annotations[PodDeviceKey] - if !exist { - hwlog.RunLog.Debugf("syncing '%s' delayed: device info is not ready", identifier) - return false - } - return true -} - -func containerUsedChip(pod *apiCoreV1.Pod) bool { - for _, container := range pod.Spec.Containers { - if GetNPUNum(container) > 0 { - return true - } - } - - return false -} - -// GetNPUNum get npu npuNum from container: -// 0 presents not use npu; -// -1 presents got invalid npu num; -// other values present use npu; -func GetNPUNum(c apiCoreV1.Container) int32 { - for name, qtt := range c.Resources.Limits { - if !strings.HasPrefix(string(name), A910ResourceName) { - continue - } - if common.A800MaxChipNum < qtt.Value() || qtt.Value() < 0 { - return InvalidNPUNum - } - return int32(qtt.Value()) - } - return 0 -} - -// DeleteWorker : Delete worker(namespace/name) from BusinessWorker map in agent -func DeleteWorker(uid types.UID, agent *BusinessAgent) { - agent.RwMutex.Lock() - defer agent.RwMutex.Unlock() - hwlog.RunLog.Infof("not exist + delete, current job is %s", uid) - worker, exist := agent.BusinessWorker[uid] - if !exist { - hwlog.RunLog.Infof("failed to delete business worker for %s, it's not exist", uid) - return - } - - if agent.Config.DisplayStatistic { - worker.CloseStatistic() - } - delete(agent.BusinessWorker, uid) - hwlog.RunLog.Infof("business worker for %s is deleted", uid) - return -} diff --git a/pkg/ring-controller/agent/businessagent_test.go b/pkg/ring-controller/controller/businessagent_test.go similarity index 99% rename from pkg/ring-controller/agent/businessagent_test.go rename to pkg/ring-controller/controller/businessagent_test.go index 7ad17e9..a80d865 100644 --- a/pkg/ring-controller/agent/businessagent_test.go +++ b/pkg/ring-controller/controller/businessagent_test.go @@ -13,7 +13,7 @@ Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. 
See the License for the specific language governing permissions and limitations under the License. */ -package agent +package controller import ( "math" diff --git a/pkg/ring-controller/controller/controller.go b/pkg/ring-controller/controller/controller.go deleted file mode 100644 index 948ff7a..0000000 --- a/pkg/ring-controller/controller/controller.go +++ /dev/null @@ -1,215 +0,0 @@ -/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -// Package controller responsibilities:business worker for each job according to job events -package controller - -import ( - "fmt" - "reflect" - "strings" - "time" - - "huawei.com/npu-exporter/v5/common-utils/hwlog" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/kubernetes/scheme" - typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" - "k8s.io/client-go/tools/cache" - "k8s.io/client-go/tools/record" - "k8s.io/client-go/util/workqueue" - "volcano.sh/apis/pkg/client/clientset/versioned" - samplescheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme" - - "hccl-controller/pkg/ring-controller/agent" - "hccl-controller/pkg/ring-controller/common" - "hccl-controller/pkg/ring-controller/model" -) - -// NewEventController returns a new sample controller -func NewEventController(kubeclientset kubernetes.Interface, jobclientset versioned.Interface, config *agent.Config, - informerInfo InformerInfo, stopCh <-chan struct{}) (*EventController, error) { - // Create event broadcaster - // Add ring-controller types to the default Kubernetes Scheme so Events can be - // logged for ring-controller types. - runtime.Must(samplescheme.AddToScheme(scheme.Scheme)) - hwlog.RunLog.Info("Creating event broadcaster") - eventBroadcaster := record.NewBroadcaster() - eventBroadcaster.StartLogging(hwlog.RunLog.Infof) - eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeclientset.CoreV1().Events("")}) - recorder := eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: controllerName}) - agents, err := agent.NewBusinessAgent(kubeclientset, informerInfo.CacheIndexers, recorder, config, stopCh) - if err != nil { - return nil, fmt.Errorf("error creating business agent: %s", err.Error()) - } - c := &EventController{ - kubeclientset: kubeclientset, - jobclientset: jobclientset, - jobsSynced: informerInfo.JobInformer.Informer().HasSynced, - deploySynced: informerInfo.DeployInformer.Informer().HasSynced, - workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "model"), - recorder: recorder, - agent: agents, - cacheIndexers: informerInfo.CacheIndexers, - } - informerInfo.addEventHandle(c) - return c, nil -} - -// Run will set up the event handlers for types we are interested in, as well -// as syncing informer caches and starting workers. 
It will block until stopCh -func (c *EventController) Run(threadiness int, stopCh <-chan struct{}) error { - defer runtime.HandleCrash() - defer c.workqueue.ShutDown() - defer c.agent.Workqueue.ShuttingDown() - - // Wait for the caches to be synced before starting workers - hwlog.RunLog.Debug("Waiting for informer caches to sync") - ok := cache.WaitForCacheSync(stopCh, c.jobsSynced, c.deploySynced) - if !ok { - return fmt.Errorf("failed to wait for caches to sync") - } - - hwlog.RunLog.Debug("Starting workers") - - for i := 0; i < threadiness; i++ { - go wait.Until(c.runMaster, time.Second, stopCh) - } - - hwlog.RunLog.Debug("Started") - if stopCh != nil { - <-stopCh - } - hwlog.RunLog.Debug("Shutting down") - return nil -} - -func (c *EventController) runMaster() { - for c.processNextWork() { - } -} - -func (c *EventController) processNextWork() bool { - hwlog.RunLog.Debug("get workqueue-", c.workqueue.Len()) - obj, shutdown := c.workqueue.Get() - if shutdown { - return false - } - - err := func(obj interface{}) error { - defer c.workqueue.Done(obj) - var mo model.ResourceEventHandler - var ok bool - if mo, ok = obj.(model.ResourceEventHandler); !ok { - c.workqueue.Forget(obj) - return fmt.Errorf("expected ResourceEventHandler in workqueue but got %#v", obj) - } - - if err := c.SyncHandler(mo); err != nil { - c.workqueue.Forget(obj) - return fmt.Errorf("error to syncing '%s': %s", mo.GetModelKey(), err.Error()) - } - - c.workqueue.Forget(obj) - hwlog.RunLog.Debugf("Synced Successfully %+v ", mo) - return nil - }(obj) - - if err != nil { - hwlog.RunLog.Errorf("processNextWork controller, err %v", err) - runtime.HandleError(err) - return true - } - - return true -} - -// enqueueJob takes a Job resource and converts -// it into a namespace/name string which is then put onto the work queue. This method -// should *not* be passed resources of any type other than Job. 
-func (c *EventController) enqueueJob(obj interface{}, eventType string) { - models, err := model.Factory(obj, eventType, c.cacheIndexers) - if err != nil { - runtime.HandleError(err) - return - } - c.workqueue.AddRateLimited(models) -} - -// SyncHandler : to do things from model -func (c *EventController) SyncHandler(model model.ResourceEventHandler) error { - key := model.GetModelKey() - hwlog.RunLog.Infof("SyncHandler start, current key is %v", key) - - var namespace, name, eventType string - parts := strings.Split(key, "/") - switch len(parts) { - case common.Index2: - // name only, no namespace - namespace = "" - name = parts[common.Index0] - eventType = parts[common.Index1] - case common.Index3: - // namespace and name - namespace = parts[common.Index0] - name = parts[common.Index1] - eventType = parts[common.Index2] - default: - return fmt.Errorf("failed to split key, unexpected key format: %q", key) - } - - _, exists, err := model.GetCacheIndex().GetByKey(namespace + "/" + name) - if err != nil { - return fmt.Errorf("failed to get obj from indexer: %s", key) - } - if !exists { - if eventType == agent.EventDelete { - agent.DeleteWorker(model.GetUID(), c.agent) - hwlog.RunLog.Infof("not exist + delete, eventType is %s, current key is %s", eventType, key) - return nil - } - return fmt.Errorf("undefined condition, eventType is %s, current key is %s", eventType, key) - } - - switch eventType { - case agent.EventAdd: - hwlog.RunLog.Infof("exist + add, current job is %s/%s", namespace, name) - return model.EventAdd(c.agent) - case agent.EventUpdate: - // unnecessary to handle - return model.EventUpdate(c.agent) - default: - return fmt.Errorf("undefined condition, eventType is %s, current key is %s", eventType, key) - } -} - -func (in *InformerInfo) addEventHandle(controller *EventController) { - eventHandlerFunc := cache.ResourceEventHandlerFuncs{ - AddFunc: func(obj interface{}) { - controller.enqueueJob(obj, agent.EventAdd) - }, - UpdateFunc: func(old, new interface{}) { - if !reflect.DeepEqual(old, new) { - controller.enqueueJob(new, agent.EventUpdate) - } - }, - DeleteFunc: func(obj interface{}) { - controller.enqueueJob(obj, agent.EventDelete) - }, - } - in.JobInformer.Informer().AddEventHandler(eventHandlerFunc) - in.DeployInformer.Informer().AddEventHandler(eventHandlerFunc) -} diff --git a/pkg/ring-controller/controller/controller_test.go b/pkg/ring-controller/controller/controller_test.go deleted file mode 100644 index 73635c5..0000000 --- a/pkg/ring-controller/controller/controller_test.go +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -package controller - -import ( - "fmt" - "reflect" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/cache" - "volcano.sh/apis/pkg/apis/batch/v1alpha1" - vofake "volcano.sh/apis/pkg/client/clientset/versioned/fake" - "volcano.sh/apis/pkg/client/informers/externalversions" - - "hccl-controller/pkg/ring-controller/agent" - "hccl-controller/pkg/ring-controller/common" - "hccl-controller/pkg/ring-controller/model" - _ "hccl-controller/pkg/testtool" -) - -// TestControllerRun test Controller Run -func TestControllerRun(t *testing.T) { - convey.Convey("controller Controller_Run", t, func() { - ctr := newFakeController() - convey.Convey("err != nil when cache not exist ", func() { - patches := gomonkey.ApplyFunc(cache.WaitForCacheSync, func(_ <-chan struct{}, _ ...cache.InformerSynced) bool { - return false - }) - defer patches.Reset() - err := ctr.Run(1, nil) - convey.So(err, convey.ShouldNotEqual, nil) - }) - - convey.Convey("err == nil when cache exist ", func() { - patches := gomonkey.ApplyFunc(cache.WaitForCacheSync, func(_ <-chan struct{}, _ ...cache.InformerSynced) bool { - return true - }) - defer patches.Reset() - err := ctr.Run(1, nil) - convey.So(err, convey.ShouldEqual, nil) - }) - }) -} - -// TestProcessNextWorkItem test ProcessNextWorkItem -func TestProcessNextWorkItem(t *testing.T) { - convey.Convey("controller ProcessNextWorkItem", t, func() { - ctr := newFakeController() - convey.Convey("res == true when process ", func() { - obj := &v1alpha1.Job{TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{Name: "test1", - GenerateName: "", Namespace: "tt1", SelfLink: "", UID: types.UID("xxxx"), ResourceVersion: "", - Generation: 0, CreationTimestamp: metav1.Now(), DeletionTimestamp: nil, - DeletionGracePeriodSeconds: nil, Labels: nil, Annotations: nil, OwnerReferences: nil, - Finalizers: nil, ManagedFields: nil}, Spec: v1alpha1.JobSpec{}, - Status: v1alpha1.JobStatus{}} - ctr.enqueueJob(obj, agent.EventAdd) - patches := gomonkey.ApplyMethod(reflect.TypeOf(ctr), "SyncHandler", func(_ *EventController, - m model.ResourceEventHandler) error { - return fmt.Errorf("undefined condition, things is %s", m.GetModelKey()) - }) - defer patches.Reset() - res := ctr.processNextWork() - convey.So(res, convey.ShouldEqual, true) - convey.So(ctr.workqueue.Len(), convey.ShouldEqual, 0) - }) - - convey.Convey("err != nil when cache not exist ", func() { - obj := &v1alpha1.Job{TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{Name: "test1", - GenerateName: "", Namespace: "tt1", SelfLink: "", UID: types.UID("xxxx"), ResourceVersion: "", - Generation: 0, CreationTimestamp: metav1.Now(), DeletionTimestamp: nil, - DeletionGracePeriodSeconds: nil, Labels: nil, Annotations: nil, OwnerReferences: nil, - Finalizers: nil, ManagedFields: nil}, Spec: v1alpha1.JobSpec{}, - Status: v1alpha1.JobStatus{}} - ctr.enqueueJob(obj, agent.EventAdd) - patches := gomonkey.ApplyMethod(reflect.TypeOf(ctr), "SyncHandler", func(_ *EventController, - m model.ResourceEventHandler) error { - return nil - }) - defer patches.Reset() - res := ctr.processNextWork() - convey.So(res, convey.ShouldEqual, true) - convey.So(ctr.workqueue.Len(), convey.ShouldEqual, 0) - }) - }) -} - -// TestControllerSyncHandler test Controller SyncHandler -func 
TestControllerSyncHandler(t *testing.T) { - convey.Convey("controller Controller_SyncHandler", t, func() { - ctr := newFakeController() - convey.Convey("err != nil when splitKeyFunc return err ", func() { - obj := &v1alpha1.Job{TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{Name: "test", - GenerateName: "", Namespace: "namespace", SelfLink: "", UID: types.UID("xxxx"), - ResourceVersion: "", Generation: 0, CreationTimestamp: metav1.Now(), DeletionTimestamp: nil, - DeletionGracePeriodSeconds: nil, Labels: nil, Annotations: nil, OwnerReferences: nil, - Finalizers: nil, ManagedFields: nil}, Spec: v1alpha1.JobSpec{}, - Status: v1alpha1.JobStatus{}} - rs, _ := model.Factory(obj, agent.EventAdd, ctr.cacheIndexers) - patches := gomonkey.ApplyMethod(reflect.TypeOf(new(model.VCJobModel)), "GetModelKey", - func(_ *model.VCJobModel) string { - return "" - }) - defer patches.Reset() - err := ctr.SyncHandler(rs) - convey.So(err, convey.ShouldNotEqual, nil) - }) - convey.Convey("err != nil when index getByKey return err ", func() { - obj := &v1alpha1.Job{TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{Name: "test", - GenerateName: "", Namespace: "namespace", SelfLink: "", UID: types.UID("xxxx"), - ResourceVersion: "", Generation: 0, CreationTimestamp: metav1.Now(), DeletionTimestamp: nil, - DeletionGracePeriodSeconds: nil, Labels: nil, Annotations: nil, OwnerReferences: nil, - Finalizers: nil, ManagedFields: nil}, Spec: v1alpha1.JobSpec{}, - Status: v1alpha1.JobStatus{}} - rs, _ := model.Factory(obj, agent.EventAdd, ctr.cacheIndexers) - rs.GetCacheIndex().Add(obj) - patches := gomonkey.ApplyMethod(reflect.TypeOf(rs), "EventAdd", func(_ *model.VCJobModel, - _ *agent.BusinessAgent) error { - return nil - }) - defer patches.Reset() - err := ctr.SyncHandler(rs) - convey.So(err, convey.ShouldEqual, nil) - }) - }) -} - -func newFakeController() *EventController { - config := newTestConfig() - kube := fake.NewSimpleClientset() - volcano := vofake.NewSimpleClientset() - jobInformerFactory := externalversions.NewSharedInformerFactoryWithOptions(volcano, - time.Second*common.InformerInterval, externalversions.WithTweakListOptions(func(options *v1.ListOptions) { - return - })) - deploymentFactory := informers.NewSharedInformerFactoryWithOptions(kube, time.Second*common.InformerInterval, - informers.WithTweakListOptions(func(options *v1.ListOptions) { - return - })) - jobInformer := jobInformerFactory.Batch().V1alpha1().Jobs() - deploymentInformer := deploymentFactory.Apps().V1().Deployments() - cacheIndexer := make(map[string]cache.Indexer, 1) - cacheIndexer[model.VCJobType] = jobInformer.Informer().GetIndexer() - cacheIndexer[model.DeploymentType] = deploymentInformer.Informer().GetIndexer() - c, err := NewEventController(kube, volcano, config, InformerInfo{JobInformer: jobInformer, - DeployInformer: deploymentInformer, CacheIndexers: cacheIndexer}, make(chan struct{})) - if err != nil { - return nil - } - return c -} - -func newTestConfig() *agent.Config { - const ( - PodParalle = 1 - CmCheckIn = 3 - CmCheckTout = 10 - ) - return &agent.Config{ - DryRun: false, - DisplayStatistic: false, - PodParallelism: PodParalle, - CmCheckInterval: CmCheckIn, - CmCheckTimeout: CmCheckTout, - } -} diff --git a/pkg/ring-controller/controller/types.go b/pkg/ring-controller/controller/types.go index 9c2ac57..5543c99 100644 --- a/pkg/ring-controller/controller/types.go +++ b/pkg/ring-controller/controller/types.go @@ -1,4 +1,4 @@ -/* Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. 
+/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved.
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at
@@ -12,57 +12,130 @@
  limitations under the License.
 */
 
-// Package controller for controller
 package controller
 
 import (
-	"k8s.io/client-go/informers/apps/v1"
+	"hccl-controller/pkg/ring-controller/config"
+	"sync"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/client-go/util/workqueue"
-	"volcano.sh/apis/pkg/client/clientset/versioned"
-	"volcano.sh/apis/pkg/client/informers/externalversions/batch/v1alpha1"
 
-	"hccl-controller/pkg/ring-controller/agent"
+	ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1"
 )
 
 const (
 	controllerName = "ring-controller"
 )
 
-// EventController for handling event and initialize business agent
-type EventController struct {
-	// component for recycle resources
-	agent *agent.BusinessAgent
-
-	cacheIndexers map[string]cache.Indexer
-	// kubeclientset is a standard kubernetes clientset
-	kubeclientset kubernetes.Interface
-
-	// jobclientset is a clientset for volcano job
-	jobclientset versioned.Interface
-
-	// component for resource batch/v1alpha1/Job
-	jobsSynced   cache.InformerSynced
-	deploySynced cache.InformerSynced
-	// workqueue is a rate limited work queue. This is used to queue work to be
-	// processed instead of performing it as soon as a change happens. This
-	// means we can ensure we only process a fixed amount of resources at a
-	// time, and makes it easy to ensure we are never processing the same item
-	// simultaneously in two different workers.
-	workqueue workqueue.RateLimitingInterface
-	// recorder is an event recorder for recording Event resources to the
-	// Kubernetes API.
+// BusinessAgent Agent for all business workers, responsibilities:
+// - list/watch 910 pods, and assign each pod to the corresponding handler
+// (each business worker belongs to a volcano job, and contains a handler for building the rank table)
+type BusinessAgent struct {
+	// Config Agent configuration file
+	Config *config.Config
+
+	// KubeClientSet : ClientSet to contact kube apiServer
+	KubeClientSet kubernetes.Interface
+
+	informerFactory informers.SharedInformerFactory
+	podInformer     cache.SharedIndexInformer
+	// PodsIndexer to get pod index by namespace&name
+	PodsIndexer       cache.Indexer
+	JobsIndexer       cache.Indexer
+	ConfigmapsIndexer cache.Indexer
+	ReplicaSetIndexer cache.Indexer
+
+	// RwMutex : to lock agent resources, e.g. Workqueue and configMapCache
+	RwMutex        sync.RWMutex
+	configMapCache map[types.UID]ranktablev1.RankTabler
+
+	// event recorder
 	recorder record.EventRecorder
+	// Workqueue: A queue with a limited rate. This queue is used to put pod event information
+	Workqueue workqueue.RateLimitingInterface
+
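	// (Illustrative note, not part of the original type: the indexer fields
	// above enable cache-local lookups keyed by "namespace/name", e.g.
	//
	//	obj, exists, err := b.ReplicaSetIndexer.GetByKey(ns + "/" + name)
	//	if err == nil && exists {
	//		rs, ok := obj.(*appsV1.ReplicaSet)
	//		_, _ = rs, ok
	//	}
	//
	// which is how doWorkByWorker and updateConfigMap avoid extra API requests.)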
+	// dryRun : if true, only print; do not delete anything.
+	dryRun bool
+}
+
+type podIdentifier struct {
+	namespace string
+	name      string
+	ownerKind string
+	ownerName string
+	ownerUid  types.UID
+	eventType string
+	jobName   string
+	uid       types.UID
+}
+
+// VCJobWorker controller for each volcano job, list/watch corresponding pods and build configmap rank table
+type VCJobWorker struct {
+	// WorkerInfo: normal Worker info
+	WorkerInfo
+	// JobInfo: VCJob Worker Info
+	JobInfo
+}
+
+// JobInfo Job Worker Info
+type JobInfo struct {
+	// JobVersion: when a job restarts, JobVersion is needed to identify whether a pod is stale
+	// with respect to this job
+	JobVersion int32
+	// JobUID: if an identical job is recreated immediately after deletion, the new
+	// vcjob Worker will cache old pod info without an identifier to distinguish them
+	JobUID string
+	// JobCreationTimestamp: when a pod's referenced job uid differs from the uid of the VCJobWorker,
+	// creationTimestamp is needed to distinguish between: 1. old pod + new worker OR 2. new pod + old worker
+	JobCreationTimestamp metav1.Time
+	// JobNamespace: Job namespace
+	JobNamespace string
+	// JobName : Job name
+	JobName string
+}
+
+// DeployWorker for deployment model
+type DeployWorker struct {
+	// WorkerInfo: normal Worker info
+	WorkerInfo
+	// DeployInfo: Deployment Worker info
+	DeployInfo
+}
+
+// WorkerInfo : normal Worker info
+type WorkerInfo struct {
+	kubeclientset     kubernetes.Interface
+	recorder          record.EventRecorder
+	cmMu, statisticMu sync.Mutex
+	dryRun            bool
+	statisticSwitch   chan struct{}
+	informerFactory   informers.SharedInformerFactory
+	podsIndexer       cache.Indexer
+
+	configmapName string
+	configmapData ranktablev1.RankTabler
+
+	statisticStopped  bool
+	rankIndex         int32
+	cachedIndex       *sync.Map
+	cachedPods        *sync.Map
+	cachedPodNum      int32
+	taskReplicasTotal int32
 }
 
-// InformerInfo : Defining what the Controller will use
-type InformerInfo struct {
-	// CacheIndexers : to store different type cache index
-	CacheIndexers map[string]cache.Indexer
-	// JobInformer : vcjob type informer
-	JobInformer v1alpha1.JobInformer
-	// DeployInformer: deployment type informer
-	DeployInformer v1.DeploymentInformer
+// DeployInfo : deployment Worker info
+type DeployInfo struct {
+	// DeployCreationTimestamp: when a pod's referenced job uid differs from the uid of the VCJobWorker,
+	// creationTimestamp is needed to distinguish between: 1. old pod + new worker OR 2. new pod + old worker
+	DeployCreationTimestamp metav1.Time
+	// DeployNamespace : deployment namespace
+	DeployNamespace string
+	// DeployName : deployment name
+	DeployName string
 }
diff --git a/pkg/ring-controller/model/deployment.go b/pkg/ring-controller/model/deployment.go
deleted file mode 100644
index 3624580..0000000
--- a/pkg/ring-controller/model/deployment.go
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-*/ - -// Package model : to handle event in controller logic -package model - -import ( - "errors" - "fmt" - "strconv" - - "huawei.com/npu-exporter/v5/common-utils/hwlog" - - "hccl-controller/pkg/ring-controller/agent" - "hccl-controller/pkg/ring-controller/common" - "hccl-controller/pkg/ring-controller/ranktable/v1" -) - -// GetReplicas : to return the replicas in deployment. -func (deploy *DeployModel) GetReplicas() string { - return strconv.Itoa(int(deploy.replicas)) -} - -// EventAdd : to handle deployment add event -func (deploy *DeployModel) EventAdd(businessAgent *agent.BusinessAgent) error { - // check if job's corresponding configmap is created successfully via volcano controller - cm, err := checkCMCreation(deploy.DeployNamespace, deploy.DeployName, businessAgent.KubeClientSet, - businessAgent.Config) - if err != nil { - return err - } - - // retrieve configmap data - jobStartString, ok := cm.Data[agent.ConfigmapKey] - if !ok { - return errors.New("the key of " + agent.ConfigmapKey + " does not exist") - } - var rst v1.RankTableStatus - if err = rst.UnmarshalToRankTable(jobStartString); err != nil { - return err - } - hwlog.RunLog.Debugf("jobStarting: %#v", jobStartString) - - ranktable, replicasTotal, err := RanktableFactory(deploy, rst, agent.GetJSONVersion()) - if err != nil { - return err - } - deploymentWorker := agent.NewDeploymentWorker(businessAgent, deploy.DeployInfo, ranktable, replicasTotal) - - // create a business worker for current deployment - businessAgent.RwMutex.Lock() - defer businessAgent.RwMutex.Unlock() - - hwlog.RunLog.Infof("create business worker for %s/%s, UID: %s", deploy.DeployNamespace, deploy.DeployName, deploy.uid) - _, exist := businessAgent.BusinessWorker[deploy.uid] - if exist { - hwlog.RunLog.Infof("business worker for %s/%s is already existed", deploy.DeployNamespace, deploy.DeployName) - return nil - } - - // start to report rank table build statistic for current deployment - if businessAgent.Config.DisplayStatistic { - go deploymentWorker.Statistic(BuildStatInterval) - } - - // save current business worker - businessAgent.BusinessWorker[deploy.uid] = deploymentWorker - return nil -} - -// EventUpdate : to handle deployment update event -func (deploy *DeployModel) EventUpdate(businessAgent *agent.BusinessAgent) error { - businessAgent.RwMutex.RLock() - _, exist := businessAgent.BusinessWorker[deploy.uid] - businessAgent.RwMutex.RUnlock() - if !exist { - // for pod update, the version will be incorrect - err := deploy.EventAdd(businessAgent) - if err != nil { - return err - } - } - return nil -} - -// GenerateGrouplist to create GroupList. in ranktable v1 will use it. 
-func (deploy *DeployModel) GenerateGrouplist() ([]*v1.Group, int32, error) { - var deviceTotal int32 - - for _, container := range deploy.containers { - npuNum := agent.GetNPUNum(container) - if npuNum == agent.InvalidNPUNum { - return nil, 0, fmt.Errorf("get wrong npu num(%d) in container", npuNum) - } - deviceTotal += npuNum - } - if deploy.replicas > maxNodeNum { - return nil, 0, errors.New("the number of Replicas in a deployment is too large") - } - deviceTotal *= deploy.replicas - - var instanceList []*v1.Instance - group := v1.Group{GroupName: deploy.DeployName, DeviceCount: strconv.FormatInt(int64(deviceTotal), - common.Decimal), InstanceCount: strconv.FormatInt(int64(deploy.replicas), common.Decimal), - InstanceList: instanceList} - - return []*v1.Group{&group}, deploy.replicas, nil -} diff --git a/pkg/ring-controller/model/deployment_test.go b/pkg/ring-controller/model/deployment_test.go deleted file mode 100644 index 0deab6a..0000000 --- a/pkg/ring-controller/model/deployment_test.go +++ /dev/null @@ -1,228 +0,0 @@ -/* -Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ -package model - -import ( - "errors" - "fmt" - "k8s.io/apimachinery/pkg/types" - "reflect" - "testing" - "time" - - "github.com/agiledragon/gomonkey/v2" - "github.com/smartystreets/goconvey/convey" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/util/workqueue" - - "hccl-controller/pkg/ring-controller/agent" - "hccl-controller/pkg/ring-controller/common" - ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" -) - -// TestDeployModelEventAdd test Dep6loyModel_EventAdd -func TestDeployModelEventAdd(t *testing.T) { - convey.Convey("model DeployModel_EventAdd", t, func() { - model := &DeployModel{DeployInfo: agent.DeployInfo{DeployNamespace: "namespace", DeployName: "test"}} - const ( - CmIntervals = 2 - CmTimeout = 5 - SleepTime = 3 - ) - config := &agent.Config{ - CmCheckInterval: CmIntervals, - CmCheckTimeout: CmTimeout, - DryRun: false, - DisplayStatistic: false, - PodParallelism: 1, - } - ag := &agent.BusinessAgent{ - Workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter( - CmTimeout*time.Millisecond, SleepTime*time.Second), "Pods"), - BusinessWorker: make(map[types.UID]agent.Worker, 1), - Config: config, - } - - convey.Convey("err !=nil& when configmap is not exist ", func() { - eventAddWhenCMNotExist(model, ag) - }) - convey.Convey("err !=nil & when rankTableFactory return nil", func() { - eventAddWhenFacNil(model, ag) - }) - - convey.Convey("err ==nil& when jobStartString is ok and version is v2", func() { - eventAddWhenV2(model, ag) - }) - - convey.Convey("err == nil when BusinessWorker [namespace/name] exist", func() { - eventAddWhenWorkerExist(ag, model) - }) - }) -} - -func eventAddWhenWorkerExist(ag *agent.BusinessAgent, model *DeployModel) { - ag.BusinessWorker["namespace/test"] 
= nil - patches := gomonkey.ApplyFunc(checkCMCreation, func(_, _ string, _ kubernetes.Interface, - _ *agent.Config) (*corev1.ConfigMap, error) { - data := make(map[string]string, 1) - data[DataKey] = DataValue - putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName, - Namespace: "namespace"}, Data: data} - return putCM, nil - }) - defer patches.Reset() - patch := gomonkey.ApplyFunc(RanktableFactory, func(_ ResourceEventHandler, _ ranktablev1.RankTableStatus, - _ string) (ranktablev1.RankTabler, int32, error) { - return nil, int32(1), nil - }) - defer patch.Reset() - const workerNumTwo = 2 - err := model.EventAdd(ag) - convey.So(err, convey.ShouldEqual, nil) - convey.So(len(ag.BusinessWorker), convey.ShouldEqual, workerNumTwo) -} - -func eventAddWhenV2(model *DeployModel, ag *agent.BusinessAgent) { - patches := gomonkey.ApplyFunc(checkCMCreation, func(_, _ string, _ kubernetes.Interface, - _ *agent.Config) (*corev1.ConfigMap, error) { - data := make(map[string]string, 1) - data[DataKey] = DataValue - putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName, - Namespace: NameSpace}, Data: data} - return putCM, nil - }) - defer patches.Reset() - model = &DeployModel{} - patch := gomonkey.ApplyFunc(RanktableFactory, func(_ ResourceEventHandler, _ ranktablev1.RankTableStatus, - _ string) (ranktablev1.RankTabler, int32, error) { - return nil, int32(1), nil - }) - defer patch.Reset() - err := model.EventAdd(ag) - convey.So(err, convey.ShouldEqual, nil) - convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 1) -} - -func eventAddWhenFacNil(model *DeployModel, ag *agent.BusinessAgent) { - patches := gomonkey.ApplyFunc(checkCMCreation, func(_, _ string, _ kubernetes.Interface, - _ *agent.Config) (*corev1.ConfigMap, error) { - data := make(map[string]string, 1) - data[DataKey] = DataValue - putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName, - Namespace: NameSpace}, Data: data} - return putCM, nil - }) - defer patches.Reset() - patches2 := gomonkey.ApplyFunc(RanktableFactory, func(_ ResourceEventHandler, - _ ranktablev1.RankTableStatus, _ string) (ranktablev1.RankTabler, int32, error) { - return nil, int32(0), errors.New("generated group list from job error") - }) - defer patches2.Reset() - err := model.EventAdd(ag) - convey.So(err, convey.ShouldNotEqual, nil) - convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 0) -} - -func eventAddWhenCMNotExist(model *DeployModel, ag *agent.BusinessAgent) { - patches := gomonkey.ApplyFunc(checkCMCreation, func(_, _ string, _ kubernetes.Interface, - _ *agent.Config) (*corev1.ConfigMap, error) { - - return nil, fmt.Errorf(" failed to get configmap for job") - }) - defer patches.Reset() - err := model.EventAdd(ag) - convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 0) - convey.So(err, convey.ShouldNotEqual, nil) -} - -// TestDeployModelEventUpdate test DeployModel_EventUpdate -func TestDeployModelEventUpdate(t *testing.T) { - const ( - WorkLenExpect, CmTimeout, SleepTime = 2, 5, 3 - fakeUID = "xzxsadas" - ) - convey.Convey("model DeployModel_EventUpdate", t, func() { - model := &DeployModel{ - DeployInfo: agent.DeployInfo{ - DeployNamespace: "namespace", - DeployName: "test", - }, - modelCommon: modelCommon{ - uid: fakeUID, - }, - } - ag := &agent.BusinessAgent{ - Workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter( - CmTimeout*time.Millisecond, SleepTime*time.Second), "Pods"), - BusinessWorker: make(map[types.UID]agent.Worker, 1), - } - convey.Convey("err == nil when 
BusinessWorker exist job", func() { - ag.BusinessWorker[fakeUID] = nil - err := model.EventUpdate(ag) - convey.So(err, convey.ShouldEqual, nil) - convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 1) - }) - convey.Convey("err == nil && len(map)==len(map)+1 when BusinessWorker do not exist job", func() { - ag.BusinessWorker["namespace/test1"] = nil - patch := gomonkey.ApplyMethod(reflect.TypeOf(model), "EventAdd", func(dp *DeployModel, - agent *agent.BusinessAgent) error { - agent.BusinessWorker["xxxxx"] = nil - return nil - }) - defer patch.Reset() - err := model.EventUpdate(ag) - convey.So(err, convey.ShouldEqual, nil) - convey.So(len(ag.BusinessWorker), convey.ShouldEqual, WorkLenExpect) - }) - convey.Convey("err != nil when eventAdd has error", func() { - patch := gomonkey.ApplyMethod(reflect.TypeOf(model), "EventAdd", func(_ *DeployModel, - agent *agent.BusinessAgent) error { - return fmt.Errorf("get configmap errors") - }) - err := model.EventUpdate(ag) - defer patch.Reset() - convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 0) - convey.So(err, convey.ShouldNotEqual, nil) - }) - }) -} - -// TestDeployModelGenerateGrouplist test DeployModel_GenerateGrouplis -func TestDeployModelGenerateGrouplist(t *testing.T) { - convey.Convey("model DeployModel_GenerateGrouplist", t, func() { - const ( - WorkLenExpect = 2 - DeployRep = 2 - ) - model := &DeployModel{replicas: DeployRep} - convey.Convey("err == nil & Group is ok ", func() { - resouceList := make(corev1.ResourceList, 1) - resouceList[agent.A910ResourceName] = *resource.NewScaledQuantity(common.Index2, 0) - containers := []corev1.Container{ - {Resources: corev1.ResourceRequirements{Limits: resouceList}}, - {Resources: corev1.ResourceRequirements{Limits: resouceList}}, - } - model.containers = containers - groupList, re, _ := model.GenerateGrouplist() - convey.So(len(groupList), convey.ShouldEqual, 1) - convey.So(groupList[0].DeviceCount, convey.ShouldEqual, "8") - convey.So(re, convey.ShouldEqual, WorkLenExpect) - }) - }) -} diff --git a/pkg/ring-controller/model/types.go b/pkg/ring-controller/model/types.go deleted file mode 100644 index 7939459..0000000 --- a/pkg/ring-controller/model/types.go +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package model - -import ( - "time" - - "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/tools/cache" - "volcano.sh/apis/pkg/apis/batch/v1alpha1" - - "hccl-controller/pkg/ring-controller/agent" -) - -const ( - // VCJobType To determine the type of listening:vcjob. - VCJobType = "vcjob" - // DeploymentType To determine the type of listening:deployment. - DeploymentType = "deployment" - // ReplicaSetType To determine the type of listening:replicaset. - ReplicaSetType = "replicaset" - // ConfigmapType To determine the type of listening:configmap. 
- ConfigmapType = "configmap" - - // BuildStatInterval 30 * time.Second - BuildStatInterval = 30 * time.Second - - maxContainerNum = 2 - maxNodeNum = 2048 -) - -type modelCommon struct { - key string - cacheIndexer cache.Indexer - uid types.UID -} - -// VCJobModel : to handle vcjob type -type VCJobModel struct { - modelCommon - agent.JobInfo - jobPhase string - taskSpec []v1alpha1.TaskSpec -} - -// DeployModel : to handle deployment type -type DeployModel struct { - modelCommon - agent.DeployInfo - replicas int32 - containers []v1.Container -} diff --git a/pkg/ring-controller/model/vcjob.go b/pkg/ring-controller/model/vcjob.go deleted file mode 100644 index d916875..0000000 --- a/pkg/ring-controller/model/vcjob.go +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package model - -import ( - "context" - "errors" - "fmt" - "strconv" - "sync" - "time" - - "huawei.com/npu-exporter/v5/common-utils/hwlog" - appsV1 "k8s.io/api/apps/v1" - apiCoreV1 "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/client-go/kubernetes" - "k8s.io/client-go/tools/cache" - "volcano.sh/apis/pkg/apis/batch/v1alpha1" - - "hccl-controller/pkg/ring-controller/agent" - "hccl-controller/pkg/ring-controller/common" - ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" - "hccl-controller/pkg/ring-controller/ranktable/v2" -) - -// ResourceEventHandler to define same func, controller to use this function to finish some thing. -type ResourceEventHandler interface { - EventAdd(tagentInterface *agent.BusinessAgent) error - EventUpdate(tagentInterface *agent.BusinessAgent) error - GenerateGrouplist() ([]*ranktablev1.Group, int32, error) - GetReplicas() string - GetCacheIndex() cache.Indexer - GetModelKey() string - GetUID() types.UID -} - -// GetModelKey return model key. 
-func (model *modelCommon) GetModelKey() string { - return model.key -} - -// GetCacheIndex return CacheIndex -func (model *modelCommon) GetCacheIndex() cache.Indexer { - return model.cacheIndexer -} - -// GetReplicas : return vcjob replicas -func (job *VCJobModel) GetReplicas() string { - return strconv.Itoa(len(job.taskSpec)) -} - -// GetUID : return vcjob uid -func (model *modelCommon) GetUID() types.UID { - return model.uid -} - -// EventAdd to handle vcjob add event -func (job *VCJobModel) EventAdd(businessAgent *agent.BusinessAgent) error { - - businessAgent.RwMutex.RLock() - hwlog.RunLog.Infof("create business worker for %s/%s", job.JobNamespace, job.JobName) - _, exist := businessAgent.BusinessWorker[job.uid] - businessAgent.RwMutex.RUnlock() - if exist { - hwlog.RunLog.Infof("business worker for %s/%s is already existed", job.JobNamespace, job.JobName) - return nil - } - - // check if job's corresponding configmap is created successfully via volcano controller - cm, err := checkCMCreation(job.JobNamespace, job.JobName, businessAgent.KubeClientSet, businessAgent.Config) - if err != nil { - return err - } - - // retrieve configmap data - jobStartString, ok := cm.Data[agent.ConfigmapKey] - if !ok { - return errors.New("the key of " + agent.ConfigmapKey + " does not exist") - } - var rst ranktablev1.RankTableStatus - if err = rst.UnmarshalToRankTable(jobStartString); err != nil { - return err - } - hwlog.RunLog.Debugf("jobStarting: %#v", jobStartString) - - ranktable, replicasTotal, err := RanktableFactory(job, rst, agent.GetJSONVersion()) - if err != nil { - return err - } - jobWorker := agent.NewVCJobWorker(businessAgent, job.JobInfo, ranktable, replicasTotal) - - // create a business worker for current job - businessAgent.RwMutex.Lock() - defer businessAgent.RwMutex.Unlock() - - // start to report rank table build statistic for current job - if businessAgent.Config.DisplayStatistic { - go jobWorker.Statistic(BuildStatInterval) - } - - // save current business worker - businessAgent.BusinessWorker[job.uid] = jobWorker - return nil -} - -// EventUpdate : to handle vcjob update event -func (job *VCJobModel) EventUpdate(businessAgent *agent.BusinessAgent) error { - businessAgent.RwMutex.RLock() - _, exist := businessAgent.BusinessWorker[job.uid] - businessAgent.RwMutex.RUnlock() - if !exist { - // for job update, if create business worker at job restart phase, the version will be incorrect - err := job.EventAdd(businessAgent) - if err != nil { - return err - } - } - return nil -} - -// GenerateGrouplist : to generate GroupList, ranktable v1 will use it. 
-func (job *VCJobModel) GenerateGrouplist() ([]*ranktablev1.Group, int32, error) {
-	var replicasTotal int32
-	var groupList []*ranktablev1.Group
-	for _, taskSpec := range job.taskSpec {
-		var deviceTotal int32
-
-		if len(taskSpec.Template.Spec.Containers) > maxContainerNum {
-			return nil, 0, errors.New("the number of container in a taskSpec is too large")
-		}
-		for _, container := range taskSpec.Template.Spec.Containers {
-			npuNum := agent.GetNPUNum(container)
-			if npuNum == agent.InvalidNPUNum {
-				return nil, 0, fmt.Errorf("get wrong npu num(%d) in container", npuNum)
-			}
-			deviceTotal += npuNum
-		}
-		if taskSpec.Replicas > maxNodeNum {
-			return nil, 0, errors.New("the number of Replicas in a taskSpec is too large")
-		}
-		deviceTotal *= taskSpec.Replicas
-
-		var instanceList []*ranktablev1.Instance
-		group := ranktablev1.Group{GroupName: taskSpec.Name, DeviceCount: strconv.FormatInt(int64(deviceTotal),
-			common.Decimal), InstanceCount: strconv.FormatInt(int64(taskSpec.Replicas), common.Decimal),
-			InstanceList: instanceList}
-		groupList = append(groupList, &group)
-		replicasTotal += taskSpec.Replicas
-	}
-	return groupList, replicasTotal, nil
-}
-
-// checkCMCreation check configmap
-func checkCMCreation(namespace, name string, kubeClientSet kubernetes.Interface, config *agent.Config) (
-	*apiCoreV1.ConfigMap, error) {
-	var cm *apiCoreV1.ConfigMap
-	err := wait.PollImmediate(time.Duration(config.CmCheckTimeout)*time.Second,
-		time.Duration(config.CmCheckTimeout)*time.Second,
-		func() (bool, error) {
-			var errTmp error
-			cm, errTmp = kubeClientSet.CoreV1().ConfigMaps(namespace).
-				Get(context.TODO(), fmt.Sprintf("%s-%s", agent.ConfigmapPrefix, name), metav1.GetOptions{})
-			if errTmp != nil {
-				if apierrors.IsNotFound(errTmp) {
-					return false, nil
-				}
-				return true, fmt.Errorf("get configmap error: %#v", errTmp)
-			}
-			return true, nil
-		})
-	if err != nil {
-		return nil, fmt.Errorf("failed to get configmap for job %s/%s: %v", namespace, name, err)
-	}
-	label910, exist := (*cm).Labels[agent.Key910]
-	if !exist || !(label910 == agent.Val910B || label910 == agent.Val910) {
-		return nil, fmt.Errorf("invalid configmap label %s", label910)
-	}
-
-	return cm, nil
-}
-
-// Factory : to generate model
-func Factory(obj interface{}, eventType string, indexers map[string]cache.Indexer) (ResourceEventHandler, error) {
-	metaData, err := meta.Accessor(obj)
-	if err != nil {
-		return nil, fmt.Errorf("object has no meta: %v", err)
-	}
-	key := metaData.GetName() + "/" + eventType
-	if len(metaData.GetNamespace()) > 0 {
-		key = metaData.GetNamespace() + "/" + metaData.GetName() + "/" + eventType
-	}
-	if _, ok := indexers[VCJobType]; !ok {
-		return nil, fmt.Errorf("the key does not exist err %v ", ok)
-	}
-	if _, ok := indexers[DeploymentType]; !ok {
-		return nil, fmt.Errorf("the key does not exist err %v ", ok)
-	}
-	var model ResourceEventHandler
-	switch t := obj.(type) {
-	case *v1alpha1.Job:
-		if err = validateVCJob(t); err != nil {
-			return nil, err
-		}
-		model = &VCJobModel{
-			modelCommon: modelCommon{key: key, cacheIndexer: indexers[VCJobType], uid: metaData.GetUID()},
-			JobInfo: agent.JobInfo{
-				JobUID: string(t.UID), JobVersion: t.Status.Version, JobCreationTimestamp: t.CreationTimestamp,
-				JobNamespace: t.Namespace, JobName: t.Name,
-			},
-			jobPhase: string(t.Status.State.Phase), taskSpec: t.Spec.Tasks}
-	case *appsV1.Deployment:
-		if err = validateDeployment(t); err != nil {
-			return nil, err
-		}
-		model = &DeployModel{
-			modelCommon: modelCommon{key: key, cacheIndexer: indexers[DeploymentType], uid: metaData.GetUID()},
-			containers: t.Spec.Template.Spec.Containers, replicas: *t.Spec.Replicas,
-			DeployInfo: agent.DeployInfo{
-				DeployNamespace: t.Namespace, DeployName: t.Name, DeployCreationTimestamp: t.CreationTimestamp,
-			}}
-	default:
-		return nil, fmt.Errorf("job factory err, %s ", key)
-	}
-
-	return model, nil
-}
-
-func validateVCJob(job *v1alpha1.Job) error {
-	// Tasks represents the number of pod with a train task
-	if len(job.Spec.Tasks) > maxNodeNum {
-		return errors.New("the number of Tasks in a train task is too large")
-	}
-	return nil
-}
-
-func validateDeployment(d *appsV1.Deployment) error {
-	// the number of container in one pod
-	if len(d.Spec.Template.Spec.Containers) > maxContainerNum {
-		return errors.New("the number of Containers in deployment is too large")
-	}
-	// pod num with a train task
-	if *d.Spec.Replicas > maxNodeNum {
-		return errors.New("the number of Replicas in a train task is too large")
-	}
-	return nil
-}
-
-// RanktableFactory : return the version type of ranktable according to your input parameters
-func RanktableFactory(model ResourceEventHandler, rst ranktablev1.RankTableStatus,
-	JSONVersion string) (ranktablev1.RankTabler, int32, error) {
-	var ranktable ranktablev1.RankTabler
-	groupList, replicasTotal, err := model.GenerateGrouplist()
-	if err != nil {
-		return nil, 0, fmt.Errorf("generate group list from job error: %v", err)
-	}
-	if JSONVersion == "v1" {
-		ranktable = &ranktablev1.RankTable{RankTableStatus: ranktablev1.RankTableStatus{Status: rst.Status},
-			GroupCount: model.GetReplicas(), GroupList: groupList}
-	} else {
-		ranktable = &v2.RankTable{ServerCount: "0", ServerList: []*v2.Server(nil), Servers: &sync.Map{},
-			RankTableStatus: ranktablev1.RankTableStatus{Status: rst.Status}, Version: "1.0"}
-	}
-	return ranktable, replicasTotal, nil
-}
diff --git a/pkg/ring-controller/model/vcjob_test.go b/pkg/ring-controller/model/vcjob_test.go
deleted file mode 100644
index 34ac02c..0000000
--- a/pkg/ring-controller/model/vcjob_test.go
+++ /dev/null
@@ -1,407 +0,0 @@
-/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-
-package model
-
-import (
-	"context"
-	"fmt"
-	"reflect"
-	"testing"
-	"time"
-
-	"github.com/agiledragon/gomonkey/v2"
-	"github.com/smartystreets/goconvey/convey"
-	appsV1 "k8s.io/api/apps/v1"
-	corev1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/resource"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/types"
-	"k8s.io/client-go/kubernetes"
-	"k8s.io/client-go/kubernetes/fake"
-	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
-	"k8s.io/client-go/util/workqueue"
-	"volcano.sh/apis/pkg/apis/batch/v1alpha1"
-
-	"hccl-controller/pkg/ring-controller/agent"
-	ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1"
-	_ "hccl-controller/pkg/testtool"
-)
-
-const (
-	NameSpace    = "namespace"
-	Name         = "test1"
-	DataKey      = "hccl.json"
-	DataValue    = `{"status":"initializing"}`
-	CMName       = "rings-config-test1"
-	Initializing = "initializing"
-)
-
-// TestFactory test Factory
-func TestFactory(t *testing.T) {
-	convey.Convey("model Factory", t, func() {
-		convey.Convey("err != nil when obj == nil", func() {
-			_, err := Factory(nil, "", nil)
-			convey.So(err, convey.ShouldNotEqual,
-				nil)
-		})
-
-		convey.Convey("err !=nil& when obj is daemonSet ", func() {
-			obj := &appsV1.DaemonSet{TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{Name: "test1",
-				GenerateName: "", Namespace: "tt1", SelfLink: "", UID: types.UID("xxxx"), ResourceVersion: "",
-				Generation: 0, CreationTimestamp: metav1.Now(), DeletionTimestamp: nil,
-				DeletionGracePeriodSeconds: nil, Labels: nil, Annotations: nil, OwnerReferences: nil,
-				Finalizers: nil, ManagedFields: nil}, Spec: appsV1.DaemonSetSpec{},
-				Status: appsV1.DaemonSetStatus{}}
-			_, err := Factory(obj, "add", nil)
-			convey.So(err, convey.ShouldNotEqual, nil)
-		})
-
-		convey.Convey("err ==nil& resourceHandle = jobHandle when obj is job ", func() {
-			obj := &v1alpha1.Job{TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{Name: "test1",
-				GenerateName: "", Namespace: "tt1", SelfLink: "", UID: types.UID("xxxx"), ResourceVersion: "",
-				Generation: 0, CreationTimestamp: metav1.Now(), DeletionTimestamp: nil,
-				DeletionGracePeriodSeconds: nil, Labels: nil, Annotations: nil, OwnerReferences: nil,
-				Finalizers: nil, ManagedFields: nil}, Spec: v1alpha1.JobSpec{},
-				Status: v1alpha1.JobStatus{}}
-			rs, _ := Factory(obj, "add", nil)
-			convey.So(rs, convey.ShouldEqual, nil)
-		})
-
-		convey.Convey("err ==nil& resourceHandle = DeploymentHandle when obj is deployment ", func() {
-			replicas := int32(1)
-			obj := &appsV1.Deployment{TypeMeta: metav1.TypeMeta{}, ObjectMeta: metav1.ObjectMeta{Name: "test1",
-				GenerateName: "", Namespace: "tt1", SelfLink: "", UID: types.UID("xxxx"), ResourceVersion: "",
-				Generation: 0, CreationTimestamp: metav1.Now(), DeletionTimestamp: nil,
-				DeletionGracePeriodSeconds: nil, Labels: nil, Annotations: nil, OwnerReferences: nil,
-				Finalizers: nil, ManagedFields: nil},
-				Spec: appsV1.DeploymentSpec{Replicas: &replicas}, Status: appsV1.DeploymentStatus{}}
-			rs, _ := Factory(obj, "add", nil)
-			convey.So(rs, convey.ShouldEqual, nil)
-		})
-	})
-}
-
-// TestRanktableFactory test RanktableFactory
-func TestRanktableFactory(t *testing.T) {
-	convey.Convey("model RankTableFactory", t, func() {
-		model := &VCJobModel{}
-		convey.Convey("err != nil when obj == nil", func() {
-			patch := gomonkey.ApplyMethod(reflect.TypeOf(model), "GenerateGrouplist", func(_ *VCJobModel) (
-				[]*ranktablev1.Group, int32, error) {
-				return nil, int32(0), fmt.Errorf("test")
-			})
-			defer patch.Reset()
-			_, _, err := RanktableFactory(model, ranktablev1.RankTableStatus{Status: ""}, "")
-			convey.So(err, convey.ShouldNotEqual, nil)
-		})
-
-		convey.Convey("err ==nil& when RankTableStatus is ok and version is v1", func() {
-			model = &VCJobModel{taskSpec: append([]v1alpha1.TaskSpec(nil), v1alpha1.TaskSpec{})}
-			patch := gomonkey.ApplyMethod(reflect.TypeOf(model), "GenerateGrouplist", func(_ *VCJobModel) (
-				[]*ranktablev1.Group, int32, error) {
-				return nil, int32(1), nil
-			})
-			defer patch.Reset()
-			rt, _, err := RanktableFactory(model, ranktablev1.RankTableStatus{Status: Initializing}, "v1")
-			convey.So(err, convey.ShouldEqual, nil)
-			convey.So(rt.GetStatus(), convey.ShouldEqual, "initializing")
-			rv := reflect.ValueOf(rt).Elem()
-			convey.So(rv.FieldByName("GroupCount").String(), convey.ShouldEqual, "1")
-		})
-
-		convey.Convey("err ==nil& when RankTableStatus is ok and version is v2", func() {
-			model = &VCJobModel{taskSpec: append([]v1alpha1.TaskSpec(nil), v1alpha1.TaskSpec{})}
-			pathch := gomonkey.ApplyMethod(reflect.TypeOf(model), "GenerateGrouplist", func(_ *VCJobModel) (
-				[]*ranktablev1.Group, int32, error) {
-				return nil, int32(1), nil
-			})
-			defer pathch.Reset()
-			rt, _, err := RanktableFactory(model, ranktablev1.RankTableStatus{Status: Initializing}, "v2")
-			convey.So(err, convey.ShouldEqual, nil)
-			convey.So(rt.GetStatus(), convey.ShouldEqual, "initializing")
-			rv := reflect.ValueOf(rt).Elem()
-			convey.So(rv.FieldByName("ServerCount").String(), convey.ShouldEqual, "0")
-		})
-	})
-}
-
-// TestCheckCMCreation test CheckCMCreation
-func TestCheckCMCreation(t *testing.T) {
-	const (
-		CmInterval = 2
-		CmTimeout  = 5
-	)
-	config := &agent.Config{
-		DryRun:           false,
-		DisplayStatistic: true,
-		PodParallelism:   1,
-		CmCheckInterval:  CmInterval,
-		CmCheckTimeout:   CmTimeout,
-	}
-	convey.Convey("model checkCMCreation", t, func() {
-		fakeClient := fake.NewSimpleClientset()
-		fakeCoreV1 := fakeClient.CoreV1()
-		cms := fakeCoreV1.ConfigMaps(NameSpace)
-		convey.Convey("err == nil when Normal", func() {
-			checkCmWhenNormal(cms, fakeClient, config)
-		})
-		convey.Convey("err != nil when Label not exist", func() {
-			data := make(map[string]string, 1)
-			label := make(map[string]string, 1)
-			data[DataKey] = DataValue
-			putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName,
-				Namespace: "namespace", Labels: label}, Data: data}
-			cms.Create(context.TODO(), putCM, metav1.CreateOptions{})
-			getCM, err := checkCMCreation(NameSpace, Name, fakeClient, config)
-			convey.So(err, convey.ShouldNotEqual, nil)
-			convey.So(getCM, convey.ShouldEqual, nil)
-		})
-		convey.Convey("err != nil when cm not exist", func() {
-			data := make(map[string]string, 1)
-			label := make(map[string]string, 1)
-			data[DataKey] = DataValue
-			putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: "rings-config-test12",
-				Namespace: "namespace", Labels: label}, Data: data}
-			cms.Create(context.TODO(), putCM, metav1.CreateOptions{})
-			getCM, err := checkCMCreation(NameSpace, Name, fakeClient, config)
-			convey.So(err, convey.ShouldNotEqual, nil)
-			convey.So(getCM, convey.ShouldEqual, nil)
-		})
-	})
-}
-
-func checkCmWhenNormal(cms typedcorev1.ConfigMapInterface, fakeClient *fake.Clientset, config *agent.Config) {
-	data := make(map[string]string, 1)
-	label := make(map[string]string, 1)
-	data[DataKey] = DataValue
-	label[agent.Key910] = agent.Val910B
-	putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName,
-		Namespace: "namespace", Labels: label}, Data: data}
-	cms.Create(context.TODO(), putCM, metav1.CreateOptions{})
-	getCM, err := checkCMCreation(NameSpace, Name, fakeClient, config)
-	convey.So(err, convey.ShouldEqual, nil)
-	convey.So(getCM.String(), convey.ShouldEqual, putCM.String())
-}
-
-// TestVCJobModelEventAdd test VCJobModel_EventAdd
-func TestVCJobModelEventAdd(t *testing.T) {
-	convey.Convey("model VCJobModel_EventAdd", t, func() {
-		model := &VCJobModel{JobInfo: agent.JobInfo{JobNamespace: "namespace", JobName: "test"}}
-		const (
-			CmInterval = 2
-			CmTimeout  = 5
-			TimeSleep  = 3
-		)
-
-		config := &agent.Config{
-			DryRun:           false,
-			DisplayStatistic: false,
-			PodParallelism:   1,
-			CmCheckInterval:  CmInterval,
-			CmCheckTimeout:   CmTimeout,
-		}
-		ag := &agent.BusinessAgent{
-			Workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(
-				CmTimeout*time.Millisecond, TimeSleep*time.Second), "Pods"),
-			KubeClientSet:  fake.NewSimpleClientset(),
-			BusinessWorker: make(map[types.UID]agent.Worker, 1),
-			Config:         config,
-		}
-		convey.Convey("err == nil when BusinessWorker [namespace/name] exist", func() {
-			ag.BusinessWorker["namespace/test"] = nil
-			err := model.EventAdd(ag)
-			convey.ShouldNotBeNil(err)
-			convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 1)
-		})
-		convey.Convey("err !=nil& when configmap is not exist ", func() {
-			patches := gomonkey.ApplyFunc(checkCMCreation, func(_, _ string, _ kubernetes.Interface,
-				_ *agent.Config) (*corev1.ConfigMap, error) {
-				return nil, fmt.Errorf(" failed to get configmap for job")
-			})
-			defer patches.Reset()
-			err := model.EventAdd(ag)
-			convey.So(err, convey.ShouldNotEqual, nil)
-			convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 0)
-		})
-		convey.Convey("err !=nil & when rankTableFactory return nil", func() {
-			eventAddWhenFactNil(model, ag)
-		})
-
-		convey.Convey("err ==nil& when jobStartString is ok and version is v2", func() {
-			eventAddWhenVersionV2(model, ag)
-		})
-	})
-}
-
-func eventAddWhenVersionV2(model *VCJobModel, ag *agent.BusinessAgent) {
-	patches := gomonkey.ApplyFunc(checkCMCreation, func(_, _ string, _ kubernetes.Interface, _ *agent.Config) (
-		*corev1.ConfigMap, error) {
-		data := make(map[string]string, 1)
-		data[DataKey] = DataValue
-		putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName,
-			Namespace: "namespace"}, Data: data}
-		return putCM, nil
-	})
-	defer patches.Reset()
-	model = &VCJobModel{taskSpec: append([]v1alpha1.TaskSpec(nil), v1alpha1.TaskSpec{})}
-	patch := gomonkey.ApplyMethod(reflect.TypeOf(model), "GenerateGrouplist", func(_ *VCJobModel) (
-		[]*ranktablev1.Group, int32, error) {
-		return nil, int32(1), nil
-	})
-	defer patch.Reset()
-	err := model.EventAdd(ag)
-	convey.So(err, convey.ShouldEqual, nil)
-	convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 1)
-}
-
-func eventAddWhenFactNil(model *VCJobModel, ag *agent.BusinessAgent) {
-	patches := gomonkey.ApplyFunc(checkCMCreation, func(_, _ string, _ kubernetes.Interface, _ *agent.Config) (
-		*corev1.ConfigMap, error) {
-		data := make(map[string]string, 1)
-		data[DataKey] = DataValue
-		putCM := &corev1.ConfigMap{ObjectMeta: metav1.ObjectMeta{Name: CMName,
-			Namespace: "namespace"}, Data: data}
-		return putCM, nil
-	})
-	defer patches.Reset()
-	patches2 := gomonkey.ApplyFunc(RanktableFactory, func(_ ResourceEventHandler, _ ranktablev1.RankTableStatus,
-		_ string) (ranktablev1.RankTabler, int32, error) {
-		return nil, int32(0), fmt.Errorf("generate group list from job error")
-	})
-	defer patches2.Reset()
-	err := model.EventAdd(ag)
-	convey.So(err, convey.ShouldNotEqual, nil)
-	convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 0)
-}
-
-// TestVCJobModelEventUpdate test VCJobModel_EventUpdate
-func TestVCJobModelEventUpdate(t *testing.T) {
-	convey.Convey("model VCJobModel_EventUpdate", t, func() {
-		const (
-			CmTimeout     = 5
-			TimeSleep     = 3
-			WorkLenExpect = 2
-			fakeUID       = "sqwqasdsa"
-		)
-		model := &VCJobModel{
-			JobInfo: agent.JobInfo{
-				JobNamespace: "namespace",
-				JobName:      "test",
-			},
-			modelCommon: modelCommon{
-				uid: fakeUID,
-			},
-		}
-		ag := &agent.BusinessAgent{
-			Workqueue: workqueue.NewNamedRateLimitingQueue(workqueue.NewItemExponentialFailureRateLimiter(
-				CmTimeout*time.Millisecond, TimeSleep*time.Second), "Pods"),
-			BusinessWorker: make(map[types.UID]agent.Worker, 1),
-		}
-		convey.Convey("err == nil when BusinessWorker exist job", func() {
-			ag.BusinessWorker[fakeUID] = nil
-			err := model.EventUpdate(ag)
-			convey.So(err, convey.ShouldEqual, nil)
-			convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 1)
-		})
-
-		convey.Convey("err == nil && len(map)==len(map)+1 when BusinessWorker do not exist job", func() {
-			ag.BusinessWorker["namespace/test1"] = nil
-			patch := gomonkey.ApplyMethod(reflect.TypeOf(model), "EventAdd", func(vc *VCJobModel,
-				agent *agent.BusinessAgent) error {
-				agent.BusinessWorker["xxxxxx"] = nil
-				return nil
-			})
-			defer patch.Reset()
-			err := model.EventUpdate(ag)
-			convey.So(err, convey.ShouldEqual, nil)
-			convey.So(len(ag.BusinessWorker), convey.ShouldEqual, WorkLenExpect)
-		})
-		convey.Convey("err != nil when eventAdd has error", func() {
-			updateWhenAddErr(model, ag)
-		})
-	})
-}
-
-func updateWhenAddErr(model *VCJobModel, ag *agent.BusinessAgent) {
-	patch := gomonkey.ApplyMethod(reflect.TypeOf(model), "EventAdd", func(_ *VCJobModel,
-		agent *agent.BusinessAgent) error {
-		return fmt.Errorf("get configmap error")
-	})
-	defer patch.Reset()
-	err := model.EventUpdate(ag)
-	convey.So(err, convey.ShouldNotEqual, nil)
-	convey.So(len(ag.BusinessWorker), convey.ShouldEqual, 0)
-}
-
-// TestVCJobModelGenerateGrouplist test VCJobModel_GenerateGrouplist
-func TestVCJobModelGenerateGrouplist(t *testing.T) {
-	convey.Convey("model VCJobModel_GenerateGrouplist", t, func() {
-		const (
-			TaskRep   = 2
-			RepExpect = 2
-		)
-
-		model := &VCJobModel{JobInfo: agent.JobInfo{JobNamespace: "namespace", JobName: "test"}}
-		convey.Convey("err == nil & Group is ok ", func() {
-			resouceList := make(corev1.ResourceList)
-			resouceList[agent.A910ResourceName] = *resource.NewScaledQuantity(TaskRep, 0)
-			containers := []corev1.Container{
-				{Resources: corev1.ResourceRequirements{Limits: resouceList}},
-				{Resources: corev1.ResourceRequirements{Limits: resouceList}},
-			}
-			model.taskSpec = append(model.taskSpec, v1alpha1.TaskSpec{Replicas: TaskRep,
-				Template: corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: containers}}})
-			groupList, re, _ := model.GenerateGrouplist()
-			convey.So(len(groupList), convey.ShouldEqual, 1)
-			convey.So(groupList[0].DeviceCount, convey.ShouldEqual, "8")
-			convey.So(re, convey.ShouldEqual, RepExpect)
-		})
-	})
-}
-
-// TestValidateDeployment validate resources in deployment
-func TestValidateDeployment(t *testing.T) {
-	convey.Convey("test validateDeployment", t, func() {
-		convey.Convey("container num exceed 2", func() {
-			d := new(appsV1.Deployment)
-			d.Spec.Template.Spec.Containers = append(d.Spec.Template.Spec.Containers, corev1.Container{})
-			d.Spec.Template.Spec.Containers = append(d.Spec.Template.Spec.Containers, corev1.Container{})
-			d.Spec.Template.Spec.Containers = append(d.Spec.Template.Spec.Containers, corev1.Container{})
-			err := validateDeployment(d)
-			convey.So(err, convey.ShouldBeError)
-		})
-		convey.Convey("replicas exceed 256", func() {
-			d := appsV1.Deployment{}
-			r := int32(maxNodeNum + 1)
-			d.Spec.Replicas = &r
-			err := validateDeployment(&d)
-			convey.So(err, convey.ShouldBeError)
-		})
-	})
-}
-
-// TestValidateVCJob validate resources in vcjob
-func TestValidateVCJob(t *testing.T) {
-	convey.Convey("test validateVCJob", t, func() {
-		convey.Convey("vcjob tasks num exceed 256", func() {
-			j := v1alpha1.Job{}
-			for i := 0; i < maxNodeNum+1; i++ {
-				j.Spec.Tasks = append(j.Spec.Tasks, v1alpha1.TaskSpec{})
-			}
-			err := validateVCJob(&j)
-			convey.So(err, convey.ShouldBeError)
-		})
-	})
-}
-- 
Gitee
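A note on the accounting pinned down by the test deleted above: TestVCJobModelGenerateGrouplist
expects a DeviceCount of "8" from two containers limited to two NPUs each and Replicas=2,
because GenerateGrouplist sums the per-container NPU limits first and multiplies by the
replica count afterwards. A self-contained sketch of that rule (the package and helper
names here are illustrative, not taken from the repository):

    package main

    import "fmt"

    // countDevices mirrors the deleted GenerateGrouplist accounting: sum the
    // NPU limit of every container in the pod template, then multiply by the
    // task's replica count.
    func countDevices(npuPerContainer []int32, replicas int32) int32 {
    	var deviceTotal int32
    	for _, n := range npuPerContainer {
    		deviceTotal += n
    	}
    	return deviceTotal * replicas
    }

    func main() {
    	// Two containers with 2 NPUs each and 2 replicas -> (2+2)*2 = 8,
    	// the "8" asserted by the deleted test.
    	fmt.Println(countDevices([]int32{2, 2}, 2))
    }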
From 0f5381dcee57cca1d905e9651dc212d6aea91a03 Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 11:11:52 +0800
Subject: [PATCH 20/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../controller/businessagent.go               |  16 +--
 .../controller/businessagent_test.go          | 107 ------------------
 2 files changed, 1 insertion(+), 122 deletions(-)
 delete mode 100644 pkg/ring-controller/controller/businessagent_test.go

diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go
index 55d6c2c..7050f15 100644
--- a/pkg/ring-controller/controller/businessagent.go
+++ b/pkg/ring-controller/controller/businessagent.go
@@ -486,7 +486,6 @@ func (b *BusinessAgent) nameGenerationFunc(obj interface{}, eventType string) (*
 		return nil, fmt.Errorf("object has no meta: %v", err)
 	}
 	labelMaps := metaData.GetLabels()
-	//annotations := metaData.GetAnnotations()
 	owner := getControlled(metaData)
 	if owner == nil {
 		return nil, fmt.Errorf("object has no owner: %v", err)
@@ -497,11 +496,9 @@ func (b *BusinessAgent) nameGenerationFunc(obj interface{}, eventType string) (*
 		ownerKind: owner.Kind,
 		ownerName: owner.Name,
 		ownerUid:  owner.UID,
-		//rankIndex: annotations[PodRankIndexKey],
 		jobName:   getWorkName(labelMaps),
 		eventType: eventType,
-		//jobUid: jobUID,
-		uid: metaData.GetUID(),
+		uid:       metaData.GetUID(),
 	}, nil
 }
 
@@ -523,14 +520,3 @@ func getControlled(obj metav1.Object) *metav1.OwnerReference {
 	}
 	return nil
 }
-
-func isReferenceJobSameWithBsnsWorker(pod *corev1.Pod, jobName, bsnsWorkerUID string) bool {
-	sameWorker := false
-	for _, owner := range pod.OwnerReferences {
-		if owner.Name == jobName && string(owner.UID) == bsnsWorkerUID {
-			sameWorker = true
-			break
-		}
-	}
-	return sameWorker
-}
diff --git a/pkg/ring-controller/controller/businessagent_test.go b/pkg/ring-controller/controller/businessagent_test.go
deleted file mode 100644
index a80d865..0000000
--- a/pkg/ring-controller/controller/businessagent_test.go
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
-Copyright(C) 2022. Huawei Technologies Co.,Ltd. All rights reserved.
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-*/
-package controller
-
-import (
-	"math"
-	"strconv"
-	"testing"
-
-	"github.com/smartystreets/goconvey/convey"
-	apiCorev1 "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/api/resource"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/types"
-	"k8s.io/client-go/kubernetes/fake"
-
-	_ "hccl-controller/pkg/testtool"
-)
-
-// TestDeleteWorker test DeleteWorker
-func TestDeleteWorker(t *testing.T) {
-	convey.Convey("agent DeleteWorker", t, func() {
-		bus, _ := NewBusinessAgent(fake.NewSimpleClientset(), nil,
-			&Config{PodParallelism: 1}, make(chan struct{}))
-		const fakeJobUID = "asdaadadqasd"
-		convey.Convey("DeleteWorker businessAgent when exist", func() {
-
-			bus.BusinessWorker[fakeJobUID] = new(VCJobWorker)
-			DeleteWorker(fakeJobUID, bus)
-			convey.So(len(bus.BusinessWorker), convey.ShouldEqual, 0)
-		})
-		convey.Convey("DeleteWorker businessAgent when not exist", func() {
-			bus.BusinessWorker[fakeJobUID] = nil
-			DeleteWorker("namespace", bus)
-			convey.So(len(bus.BusinessWorker), convey.ShouldEqual, 1)
-		})
-	})
-}
-
-// TestGetNPUNum test GetNPUNum
-func TestGetNPUNum(t *testing.T) {
-	const a910With2CResourceName = A910ResourceName + "-2c"
-	convey.Convey("Get NPUNum", t, func() {
-		convey.Convey("no npu found", func() {
-			c := apiCorev1.Container{Resources: apiCorev1.ResourceRequirements{}}
-			val := GetNPUNum(c)
-			convey.So(val, convey.ShouldEqual, 0)
-		})
-		convey.Convey("legal npu number", func() {
-			rl := apiCorev1.ResourceList{}
-			rl[a910With2CResourceName] = resource.MustParse("1")
-			c := apiCorev1.Container{Resources: apiCorev1.ResourceRequirements{Limits: rl}}
-			val := GetNPUNum(c)
-			convey.So(val, convey.ShouldEqual, 1)
-		})
-		convey.Convey("illegal npu number, number is too big", func() {
-			rl := apiCorev1.ResourceList{}
-			tooBigNum := math.MaxInt32 + 1
-			rl[a910With2CResourceName] = resource.MustParse(strconv.Itoa(tooBigNum))
-			c := apiCorev1.Container{Resources: apiCorev1.ResourceRequirements{Limits: rl}}
-			val := GetNPUNum(c)
-			convey.So(val, convey.ShouldEqual, InvalidNPUNum)
-		})
-		convey.Convey("illegal npu number, number is too small", func() {
-			rl := apiCorev1.ResourceList{}
-			tooSmallNum := math.MinInt32 - 1
-			rl[a910With2CResourceName] = resource.MustParse(strconv.Itoa(tooSmallNum))
-			c := apiCorev1.Container{Resources: apiCorev1.ResourceRequirements{Limits: rl}}
-			val := GetNPUNum(c)
-			convey.So(val, convey.ShouldEqual, InvalidNPUNum)
-		})
-	})
-}
-
-// TestIsReferenceJobSameWithBsnsWorker test isReferenceJobSameWithBsnsWorker
-func TestIsReferenceJobSameWithBsnsWorker(t *testing.T) {
-	convey.Convey("test isReferenceJobSameWithBsnsWorker", t, func() {
-		uuid := "UID-xxxxxxxxxxxxxxx"
-		name := "test-name"
-		or := []metav1.OwnerReference{
-			{UID: types.UID(uuid), Name: name},
-		}
-		pod := apiCorev1.Pod{}
-		pod.OwnerReferences = or
-		convey.Convey("the same", func() {
-			isSame := isReferenceJobSameWithBsnsWorker(&pod, name, uuid)
-			convey.So(isSame, convey.ShouldEqual, true)
-		})
-		convey.Convey("not same", func() {
-			isSame := isReferenceJobSameWithBsnsWorker(&pod, "podName", uuid)
-			convey.So(isSame, convey.ShouldEqual, false)
-		})
-	})
-}
-- 
Gitee
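The helper deleted in patch 20 matched a pod to its worker by owner name plus owner UID.
The surviving code identifies workers by UID alone (the uid and ownerUid fields that
nameGenerationFunc now fills in), which is sufficient: a job recreated under the same
name receives a fresh UID, so the extra name comparison added nothing. A minimal sketch
of the equivalent check, using only standard Kubernetes API types (the function name is
illustrative):

    package main

    import (
    	"fmt"

    	corev1 "k8s.io/api/core/v1"
    	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    	"k8s.io/apimachinery/pkg/types"
    )

    // ownedBy reports whether the pod carries an owner reference with the
    // given UID. UIDs are unique across time, so no name check is needed.
    func ownedBy(pod *corev1.Pod, uid types.UID) bool {
    	for _, ref := range pod.OwnerReferences {
    		if ref.UID == uid {
    			return true
    		}
    	}
    	return false
    }

    func main() {
    	pod := &corev1.Pod{}
    	pod.OwnerReferences = []metav1.OwnerReference{{Name: "job-a", UID: "uid-1"}}
    	fmt.Println(ownedBy(pod, "uid-1")) // true
    	fmt.Println(ownedBy(pod, "uid-2")) // false
    }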
From c491bda0d195abfa78fa0fd04cbeeb4069a799d5 Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 11:12:40 +0800
Subject: [PATCH 21/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pkg/ring-controller/controller/types.go | 66 -------------------------
 1 file changed, 66 deletions(-)

diff --git a/pkg/ring-controller/controller/types.go b/pkg/ring-controller/controller/types.go
index 5543c99..8ee7896 100644
--- a/pkg/ring-controller/controller/types.go
+++ b/pkg/ring-controller/controller/types.go
@@ -18,7 +18,6 @@ import (
 	"hccl-controller/pkg/ring-controller/config"
 	"sync"
 
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
@@ -74,68 +73,3 @@ type podIdentifier struct {
 	jobName   string
 	uid       types.UID
 }
-
-// VCJobWorker controller for each volcano job, list/watch corresponding pods and build configmap rank table
-type VCJobWorker struct {
-	// WorkerInfo: normal Worker info
-	WorkerInfo
-	// JobInfo: VCJob Worker Info
-	JobInfo
-}
-
-// JobInfo Job Worker Info
-type JobInfo struct {
-	// JobVersion: When a job restart, JobVersion is needed to identify if a pod is old
-	// with respect to this job
-	JobVersion int32
-	// JobUID: For an identical job, create it immediately after deletion, new
-	// vcjob Worker will cache old pod info without a identifier to distinguish
-	JobUID string
-	// JobCreationTimestamp: when pod reference job uid is different with uid of VCJobWorker
-	// creationTimestamp is needed to distinguish cases between: 1. old pod + new worker OR 2. new pod + old worker
-	JobCreationTimestamp metav1.Time
-	// JobNamespace: Job namespace
-	JobNamespace string
-	// JobName : Job name
-	JobName string
-}
-
-// DeployWorker for deployment model
-type DeployWorker struct {
-	// WorkerInfo: normal Worker info
-	WorkerInfo
-	// DeployInfo: Deployment Worker info
-	DeployInfo
-}
-
-// WorkerInfo :normal Worker info
-type WorkerInfo struct {
-	kubeclientset     kubernetes.Interface
-	recorder          record.EventRecorder
-	cmMu, statisticMu sync.Mutex
-	dryRun            bool
-	statisticSwitch   chan struct{}
-	informerFactory   informers.SharedInformerFactory
-	podsIndexer       cache.Indexer
-
-	configmapName string
-	configmapData ranktablev1.RankTabler
-
-	statisticStopped  bool
-	rankIndex         int32
-	cachedIndex       *sync.Map
-	cachedPods        *sync.Map
-	cachedPodNum      int32
-	taskReplicasTotal int32
-}
-
-// DeployInfo : deployment Worker info
-type DeployInfo struct {
-	// DeployCreationTimestamp: when pod reference job uid is different with uid of VCJobWorker
-	// creationTimestamp is needed to distinguish cases between: 1. old pod + new worker OR 2. new pod + old worker
-	DeployCreationTimestamp metav1.Time
-	// DeployNamespace :deployment namespace
-	DeployNamespace string
-	// DeployName : deployment name
-	DeployName string
-}
-- 
Gitee

From 911175d656045ba553fe54673532900f2c46aa3d Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 11:13:28 +0800
Subject: [PATCH 22/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pkg/ring-controller/common/constants.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pkg/ring-controller/common/constants.go b/pkg/ring-controller/common/constants.go
index 7be6a68..0698f73 100644
--- a/pkg/ring-controller/common/constants.go
+++ b/pkg/ring-controller/common/constants.go
@@ -31,7 +31,6 @@ const (
 )
 
 const (
-
 	// Key910 to get Configmap
 	Key910 = "ring-controller.atlas"
 	// Val910 to get Configmap
-- 
Gitee

From d3154a943eb03850d920c38b6afb14e22bd5fc8e Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 11:28:42 +0800
Subject: [PATCH 23/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pkg/ring-controller/controller/businessagent.go | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go
index 7050f15..9883dd5 100644
--- a/pkg/ring-controller/controller/businessagent.go
+++ b/pkg/ring-controller/controller/businessagent.go
@@ -306,6 +306,7 @@ func (b *BusinessAgent) handleDeleteEvent(podInfo *podIdentifier) error {
 			return err
 		}
 	}
+	rankTable.DeletePod(podInfo.uid)
 
 	return nil
 }
-- 
Gitee
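Patch 23 closes a leak in handleDeleteEvent: without the DeletePod call, a deleted pod's
cached device info survived in the rank table and could resurface on the next configmap
write. The DeletePod implementation itself is not among the hunks shown in this series;
given that CachePodInfo (patch 01) keys the v2 Servers sync.Map by pod UID, a plausible
minimal shape is the following sketch (the rankTable type here is a simplified
stand-in, not the repository's struct):

    package main

    import (
    	"fmt"
    	"sync"

    	"k8s.io/apimachinery/pkg/types"
    )

    // rankTable stands in for the v2 RankTable; Servers is keyed by pod UID,
    // as the Servers.Load(pod.UID) guard in CachePodInfo implies.
    type rankTable struct {
    	Servers *sync.Map
    }

    // deletePod drops the cached entry for one pod; the next BeforeUpdate
    // rebuilds ServerList from whatever remains in the map.
    func (r *rankTable) deletePod(uid types.UID) {
    	r.Servers.Delete(uid)
    }

    func main() {
    	rt := &rankTable{Servers: &sync.Map{}}
    	rt.Servers.Store(types.UID("pod-uid-1"), "server entry")
    	rt.deletePod(types.UID("pod-uid-1"))
    	_, ok := rt.Servers.Load(types.UID("pod-uid-1"))
    	fmt.Println(ok) // false: the stale entry is gone
    }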
From cd970a073d4c28eda06ddaf5dfb18fe85f8f066b Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 11:33:50 +0800
Subject: [PATCH 24/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pkg/ring-controller/controller/businessagent.go | 11 +++--------
 pkg/ring-controller/ranktable/v2/ranktable.go   |  1 +
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go
index 9883dd5..1431703 100644
--- a/pkg/ring-controller/controller/businessagent.go
+++ b/pkg/ring-controller/controller/businessagent.go
@@ -289,23 +289,18 @@ func (b *BusinessAgent) handleDeleteEvent(podInfo *podIdentifier) error {
 		return nil
 	}
 	status := rankTable.GetStatus()
-	err := rankTable.RemovePodInfo(podInfo.namespace, podInfo.uid)
-	if err != nil {
-		return err
-	}
-	hwlog.RunLog.Infof("start to remove data of pod %s/%s", podInfo.namespace, podInfo.name)
-
 	configmapName := common.ConfigmapPrefix + podInfo.jobName
 	if status == common.ConfigmapCompleted {
 		rankTable.SetStatus(common.ConfigmapInitializing)
-		hwlog.RunLog.Infof("pod(%s/%s) is delete, start to update configmap(%s) to initializing", podInfo.namespace,
+		hwlog.RunLog.Infof("start to update configmap(%s) to initializing", podInfo.namespace,
 			podInfo.name, configmapName)
-		err = b.updateConfigMap(rankTable, podInfo.namespace, configmapName)
+		err := b.updateConfigMap(rankTable, podInfo.namespace, configmapName)
 		if err != nil {
 			rankTable.SetStatus(common.ConfigmapCompleted)
 			return err
 		}
 	}
+	hwlog.RunLog.Infof("start to remove data of pod %s/%s", podInfo.namespace, podInfo.name)
 	rankTable.DeletePod(podInfo.uid)
 
 	return nil
diff --git a/pkg/ring-controller/ranktable/v2/ranktable.go b/pkg/ring-controller/ranktable/v2/ranktable.go
index f270a38..6b93f30 100644
--- a/pkg/ring-controller/ranktable/v2/ranktable.go
+++ b/pkg/ring-controller/ranktable/v2/ranktable.go
@@ -29,6 +29,7 @@ import (
 
 // BeforeUpdate do prepare
 func (r *RankTable) BeforeUpdate() {
+	r.ServerList = make([]*Server, 0)
 	r.Servers.Range(func(key, value interface{}) bool {
 		r.ServerList = append(r.ServerList, value.(*Server))
 		return true
-- 
Gitee

From f2dfdb2c63d80c9260732e141b895a54a9a9d0ea Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 12:08:27 +0800
Subject: [PATCH 25/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pkg/ring-controller/controller/businessagent.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go
index 1431703..3831c9a 100644
--- a/pkg/ring-controller/controller/businessagent.go
+++ b/pkg/ring-controller/controller/businessagent.go
@@ -273,7 +273,7 @@ func (b *BusinessAgent) doWorkByWorker(tmpObj, obj interface{}, podExist bool, p
 	}
 
 	if err := b.handleAddUpdateEvent(pod, podKeyInfo, labelSeletor, replicas); err != nil {
-		hwlog.RunLog.Errorf("handleAddUpdateEvent error, error is %s", err)
+		hwlog.RunLog.Debugf("handleAddUpdateEvent error, error is %s", err)
 		return false
 	}
 
-- 
Gitee
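The one-line ranktable/v2 change in patch 24 deserves a close look: BeforeUpdate flattens
the Servers sync.Map into ServerList on every configmap write, and before this fix each
write appended the entire map onto the slice again, duplicating server entries in the
generated hccl.json. A self-contained demonstration of the failure mode and the fix
(simplified types, illustrative names):

    package main

    import (
    	"fmt"
    	"sync"
    )

    type server struct{ id string }

    // rebuild mirrors BeforeUpdate: flatten the map into a slice. With
    // reset=false (the old behaviour) repeated calls accumulate duplicates;
    // with reset=true (this patch) the slice is rebuilt from scratch.
    func rebuild(list []*server, m *sync.Map, reset bool) []*server {
    	if reset {
    		list = make([]*server, 0)
    	}
    	m.Range(func(_, v interface{}) bool {
    		list = append(list, v.(*server))
    		return true
    	})
    	return list
    }

    func main() {
    	m := &sync.Map{}
    	m.Store("pod-a", &server{id: "a"})

    	var list []*server
    	list = rebuild(list, m, false)
    	list = rebuild(list, m, false)
    	fmt.Println(len(list)) // 2: one pod listed twice without the reset

    	list = nil
    	list = rebuild(list, m, true)
    	list = rebuild(list, m, true)
    	fmt.Println(len(list)) // 1: stable after the reset added here
    }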
From 3d3b03aac1e680867b7f5d21f5758db614a4f491 Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 14:35:21 +0800
Subject: [PATCH 26/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pkg/ring-controller/controller/businessagent.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go
index 3831c9a..ab153a5 100644
--- a/pkg/ring-controller/controller/businessagent.go
+++ b/pkg/ring-controller/controller/businessagent.go
@@ -292,8 +292,7 @@ func (b *BusinessAgent) handleDeleteEvent(podInfo *podIdentifier) error {
 	configmapName := common.ConfigmapPrefix + podInfo.jobName
 	if status == common.ConfigmapCompleted {
 		rankTable.SetStatus(common.ConfigmapInitializing)
-		hwlog.RunLog.Infof("start to update configmap(%s) to initializing", podInfo.namespace,
-			podInfo.name, configmapName)
+		hwlog.RunLog.Infof("start to update configmap(%s/%s) to initializing", podInfo.namespace, configmapName)
 		err := b.updateConfigMap(rankTable, podInfo.namespace, configmapName)
 		if err != nil {
 			rankTable.SetStatus(common.ConfigmapCompleted)
-- 
Gitee

From 1c113baec4b091f7ca28c9ae0a2eddaefda61409 Mon Sep 17 00:00:00 2001
From: shepherd cheung <1220798123@qq.com>
Date: Thu, 11 Jul 2024 14:38:48 +0800
Subject: [PATCH 27/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?=
 =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?=
 =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pkg/ring-controller/controller/businessagent.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go
index ab153a5..3d3fe34 100644
--- a/pkg/ring-controller/controller/businessagent.go
+++ b/pkg/ring-controller/controller/businessagent.go
@@ -358,7 +358,8 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *corev1.Pod, podInfo *podIdenti
 	hwlog.RunLog.Debugf("label selector: %v", labelSeletor)
 	pods, err := b.informerFactory.Core().V1().Pods().Lister().List(labels.SelectorFromSet(labelSeletor))
 	if err != nil {
-		return fmt.Errorf("failed to list pods: %v", err)
+		hwlog.RunLog.Errorf("list pods failed: %v", err)
+		return err
 	}
 	hwlog.RunLog.Debugf("list job<%s/%s> pods num: %d", podInfo.namespace, podInfo.ownerName, len(pods))
 	running := 0
@@ -372,6 +373,7 @@ func (b *BusinessAgent) handleAddUpdateEvent(pod *corev1.Pod, podInfo *podIdenti
 		running++
 	}
 	if running != int(replicas) {
+		hwlog.RunLog.Errorf("ready pods %d is not equal to replicas %d", running, replicas)
 		return fmt.Errorf("ready pods %d is not equal to replicas %d", running, replicas)
 	}
 
-- 
Gitee
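Patch 27 makes both failure paths in handleAddUpdateEvent visible in the logs. The second
one is the gate that holds back rank-table assembly until every replica of the job is
running; its counting logic reduces to the following sketch (simplified: the real code
lists pods through the informer lister shown in the hunk and applies its own readiness
criteria before incrementing the counter):

    package main

    import "fmt"

    type pod struct{ phase string }

    // readyForRankTable mirrors the replica gate: every pod of the job must
    // be counted as running before the controller fills in rank information.
    func readyForRankTable(pods []pod, replicas int) error {
    	running := 0
    	for _, p := range pods {
    		if p.phase == "Running" {
    			running++
    		}
    	}
    	if running != replicas {
    		return fmt.Errorf("ready pods %d is not equal to replicas %d", running, replicas)
    	}
    	return nil
    }

    func main() {
    	pods := []pod{{phase: "Running"}, {phase: "Pending"}}
    	fmt.Println(readyForRankTable(pods, 2)) // ready pods 1 is not equal to replicas 2
    	pods[1].phase = "Running"
    	fmt.Println(readyForRankTable(pods, 2)) // <nil>
    }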
"hccl-controller/pkg/ring-controller/common" "hccl-controller/pkg/ring-controller/config" "hccl-controller/pkg/ring-controller/controller" - "huawei.com/npu-exporter/v5/common-utils/hwlog" ) var ( diff --git a/pkg/ring-controller/common/k8sclient.go b/pkg/ring-controller/common/k8sclient.go index 5a497da..b0968ef 100644 --- a/pkg/ring-controller/common/k8sclient.go +++ b/pkg/ring-controller/common/k8sclient.go @@ -1,8 +1,24 @@ +/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Package common for common value package common import ( "fmt" - "hccl-controller/pkg/ring-controller/config" + "time" + "huawei.com/npu-exporter/v5/common-utils/hwlog" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -10,9 +26,10 @@ import ( "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" - "time" "volcano.sh/apis/pkg/client/clientset/versioned" "volcano.sh/apis/pkg/client/informers/externalversions" + + "hccl-controller/pkg/ring-controller/config" ) // NewClientK8s create k8s client diff --git a/pkg/ring-controller/config/configs.go b/pkg/ring-controller/config/configs.go index 2bc941c..1e4a2c4 100644 --- a/pkg/ring-controller/config/configs.go +++ b/pkg/ring-controller/config/configs.go @@ -1,3 +1,18 @@ +/* Copyright(C) 2020-2023. Huawei Technologies Co.,Ltd. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +// Package config for controller config package config type Config struct { diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go index 3d3fe34..5e5885e 100644 --- a/pkg/ring-controller/controller/businessagent.go +++ b/pkg/ring-controller/controller/businessagent.go @@ -20,36 +20,34 @@ import ( "encoding/json" "errors" "fmt" - "hccl-controller/pkg/ring-controller/config" - "k8s.io/apimachinery/pkg/util/runtime" - typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" - "strconv" - samplescheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme" - - ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" - v2 "hccl-controller/pkg/ring-controller/ranktable/v2" - appsV1 "k8s.io/api/apps/v1" - "k8s.io/client-go/util/retry" "reflect" + "strconv" "sync" "sync/atomic" "time" - "volcano.sh/apis/pkg/apis/batch/v1alpha1" + ranktablev1 "hccl-controller/pkg/ring-controller/ranktable/v1" "huawei.com/npu-exporter/v5/common-utils/hwlog" + appsV1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes/scheme" + typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" "k8s.io/client-go/util/workqueue" + "volcano.sh/apis/pkg/apis/batch/v1alpha1" + samplescheme "volcano.sh/apis/pkg/client/clientset/versioned/scheme" "hccl-controller/pkg/ring-controller/common" + "hccl-controller/pkg/ring-controller/config" + v2 "hccl-controller/pkg/ring-controller/ranktable/v2" ) // String to return podIdentifier string style : -- Gitee From 534b0a939624e5c6170146510ca45c9271570f79 Mon Sep 17 00:00:00 2001 From: shepherd cheung <1220798123@qq.com> Date: Thu, 11 Jul 2024 19:41:36 +0800 Subject: [PATCH 29/29] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=88=A0=E9=99=A4=E5=86=97?= =?UTF-8?q?=E4=BD=99=E5=86=85=E5=AE=B9=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=BA=BA=20Modifier=E3=80=91Atlas=5Fzxp=20=E3=80=90=E8=AF=84?= =?UTF-8?q?=E5=AE=A1=E4=BA=BA=20Reviewer=E3=80=91Atlas=5Fkfa?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/ring-controller/controller/businessagent.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/ring-controller/controller/businessagent.go b/pkg/ring-controller/controller/businessagent.go index 5e5885e..1f750d3 100644 --- a/pkg/ring-controller/controller/businessagent.go +++ b/pkg/ring-controller/controller/businessagent.go @@ -98,8 +98,12 @@ func NewBusinessAgent(config *config.Config, stopCh <-chan struct{}) (*BusinessA Config: config, } + podInformer.SetWatchErrorHandler(func(r *cache.Reflector, err error) { + + }) + // when pod is added, annotation info is ready. No need to listen update event. - businessAgent.podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + podInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { businessAgent.enqueuePod(obj, common.EventAdd) }, -- Gitee
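One caveat on this final patch: the watch error handler it registers has an empty body,
and the error returned by SetWatchErrorHandler itself is discarded (client-go rejects
the call once the informer has already started). If the intent is to surface watch
failures rather than merely replace client-go's default handler, a fuller registration
would look like the sketch below; the log wording is illustrative, and only
SetWatchErrorHandler's signature and the hwlog logger come from the code shown above.

    // Sketch: register before the informer starts, and log both a failed
    // registration and the watch errors themselves.
    if err := podInformer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) {
    	hwlog.RunLog.Errorf("pod informer watch error: %v", err)
    }); err != nil {
    	hwlog.RunLog.Errorf("failed to set watch error handler: %v", err)
    }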