diff --git a/mindx-dl/upgrade/entry.sh b/mindx-dl/upgrade/entry.sh index 18b234582b341f1c8162113ecdfde197ab7e86e6..d72ee6d74e6701b0254bea7acac0127cb1855a40 100644 --- a/mindx-dl/upgrade/entry.sh +++ b/mindx-dl/upgrade/entry.sh @@ -1,23 +1,172 @@ #!/bin/bash +# Copyright © Huawei Technologies Co., Ltd. 2020. All rights reserved. + +mindxDLComponentConstantArr[1]="volcano" +mindxDLComponentConstantArr[2]="hccl-controller" +mindxDLComponentConstantArr[3]="ascendplugin-310" +mindxDLComponentConstantArr[4]="ascendplugin-volcano" +mindxDLComponentConstantArr[5]="cadvisor" +mindxDLComponentNumber=5 + +# for display update package +mindxDLUpdateVersionForPrintArr=() +# store 1,2,3,4,5 for selected +reqiureUpdateComponentNumberArr=() +# store the update package name which exist in yaml +localComponentsArr=() +# store ansible update component name +updateAnsibleComponentName="" + +function getUpdateYamlPath(){ + # shellcheck disable=SC2006 + updateRootPath=`grep dls_root_dir /etc/ansible/hosts|cut -d '=' -f 2` +} + +getUpdateComponentsFromYaml(){ + local updatePath + local yamlRelativePath + local componentFiles + local index + + getUpdateYamlPath + # ansible not install deal + yamlRelativePath="/upgrade_dependencies/yamls/" + updatePath="$updateRootPath$yamlRelativePath" + componentFiles=$(ls "$updatePath") + index=0; + localComponentsArr=() + for fileName in $componentFiles + do + updateVersion=${fileName%*.yaml} + # shellcheck disable=SC2068 + for i in ${mindxDLComponentConstantArr[@]} + do + if [[ $updateVersion =~ ^$i ]];then + localComponentsArr[$index]=$updateVersion + ((index=index+1)) + fi + done + done + # exclude repeat + localComponentsArr=($(echo ${localComponentsArr[*]} | sed 's/ /\n/g' | sort |uniq)) +} + +function getUpdateVersionInfo(){ + local count + local tmpName + local tmpFileName + + getUpdateComponentsFromYaml + # sort the Components;1:volcano 2:device-plugin 3:hccl 4:cadvisor + count=1 + mindxDLUpdateVersionForPrintArr[0]="All" + # shellcheck disable=SC2068 + for ComponentFileName in ${mindxDLComponentConstantArr[@]} + do + for tmpFileName in ${localComponentsArr[@]} + do + # cut by -v[0-9] + tmpName=(${tmpFileName//-v[0-9]/ }) + if [[ $tmpName == $ComponentFileName ]];then + mindxDLUpdateVersionForPrintArr[$count]=$tmpFileName + fi + done + ((count=count+1)) + done +} + +function getRequireUpdateComponent() { + local inputUpdateComponents + local updateComponents + local flag=0 + + # get input + while :; do + read -r -p "Input the Components which you want to update(can multiple choice by ',')" updateComponents + inputUpdateComponents=(${updateComponents//,/ }) + # shellcheck disable=SC2068 + for i in ${inputUpdateComponents[@]} + do + if [[ $i =~ ^[0-$mindxDLComponentNumber]$ ]];then + reqiureUpdateComponentNumberArr[${#reqiureUpdateComponentNumberArr[@]}]=$i + flag=1 + else + echo "$i out of range,try again" + flag=0 + reqiureUpdateComponentNumberArr=() + break + fi + done + if [[ 1 == $flag ]];then + break + fi + done + # change 0 to 1,2,3,4 + if [[ "${reqiureUpdateComponentNumberArr[*]}" =~ ^"0" ]];then + # find input 0 + # 1. reset reqiureUpdateComponentNumberArr + reqiureUpdateComponentNumberArr=() + # 2. change 0 to exist Component + for a in $(seq 1 $mindxDLComponentNumber) + do + # find exist Component + # shellcheck disable=SC2199 + if [[ ${localComponentsArr[@]} =~ ${mindxDLComponentConstantArr[a]} ]];then + reqiureUpdateComponentNumberArr=("${reqiureUpdateComponentNumberArr[@]}" "$a") + fi + done + fi + + # remove duplication + # shellcheck disable=SC2207 + reqiureUpdateComponentNumberArr=($(echo "${reqiureUpdateComponentNumberArr[*]}" | sed 's/ /\n/g' | sort |uniq)) +} function exportYaml(){ - mkdir -p "${1}" - cd "${1}" - # Collect previous version resource definition of device-plugin. - kubectl get daemonset -n kube-system ascend-device-plugin-daemonset -o yaml > 910-ascend-device-plugin-export.yaml - kubectl get daemonset -n kube-system ascend-device-plugin2-daemonset -o yaml > 310-ascend-device-plugin-export.yaml - # Collect previous version resource definition of hccl-controller. - kubectl get deployment hccl-controller -o yaml > hccl-controller-export.yaml - # Collect previous version resource definition of cadvisor. - kubectl get daemonset -n cadvisor cadvisor -o yaml > cadvisor-export.yaml - cd .. + # key=daemon name, value=namespace + declare -A daemonsetNamespaceArray=(["ascend-device-plugin-daemonset"]="kube-system" ["ascend-device-plugin2-daemonset"]="kube-system" ["cadvisor"]="cadvisor") + # key=daemon name, value=export(Component) name + declare -A daemonsetComponentNameArray=(["ascend-device-plugin-daemonset"]="910-ascend-device-plugin" ["ascend-device-plugin2-daemonset"]="310-ascend-device-plugin" ["cadvisor"]="cadvisor") + # key=deployment name, value=export name + declare -A deploymentNamespaceArray=(["hccl-controller"]="default") + # key=deployment name, value=export(Component) name + declare -A deploymentComponentNameArray=(["hccl-controller"]="hccl-controller") + + local key + + mkdir -p "${1}" + cd "${1}" + # export daemonset;Collect previous version resource definition + # shellcheck disable=SC2068 + for key in ${!daemonsetNamespaceArray[@]} + do + if [[ "$(kubectl get daemonset --all-namespaces | grep -c $key)" -gt 0 ]];then + kubectl get daemonset -n "${daemonsetNamespaceArray[$key]}" "$key" -o yaml > "${daemonsetComponentNameArray[$key]}"-export.yaml + echo -e "export ${daemonsetComponentNameArray[$key]} successfully" | tee ../export_log.txt + else + echo -e "Not find ${daemonsetComponentNameArray[$key]} daemonset!\n" | tee ../export_log.txt + fi + done + + # export daemonset;Collect previous version resource definition + # shellcheck disable=SC2068 + for key in ${!deploymentNamespaceArray[@]} + do + if [[ "$(kubectl get deployment --all-namespaces | grep -c $key)" -gt 0 ]];then + kubectl get deployment -n "${deploymentNamespaceArray[$key]}" "$key" -o yaml > "${deploymentComponentNameArray[$key]}"-export.yaml + echo -e "export ${deploymentComponentNameArray[$key]} successfully" | tee ../export_log.txt + else + echo -e "Not find ${deploymentComponentNameArray[$key]} deployment!\n" | tee ../export_log.txt + fi + done + + cd .. } function printVersion(){ - echo -e "$(kubectl describe pod -n "${1}" "$(kubectl get pods -A | grep "${2}" | awk '{print $2}' | head -n 1)" | grep Image: | awk '{print $2}')" + echo -e "$(kubectl describe pod -n "${1}" "$(kubectl get pods --all-namespaces | grep "${2}" | awk '{print $2}' | head -n 1)" | grep Image: | awk '{print $2}')" } - function saveImageVersion(){ vcAdmission=$(printVersion "volcano-system" "volcano-admission") vcControllers=$(printVersion "volcano-system" "volcano-controllers") @@ -36,39 +185,92 @@ function versionPrint(){ echo -e "\nAscend-device-plugin\n$dp\n" } +# print the update package local have +function updatePackagePrint() { + for i in $(seq 0 $mindxDLComponentNumber) + do + echo -e "$i : ${mindxDLUpdateVersionForPrintArr[$i]}" + done +} + +function getUpdateAnsibleComponentName() { + local component + local updateComponent + local volcanoSuffix="Open" + + # shellcheck disable=SC2068 + for component in ${reqiureUpdateComponentNumberArr[@]} + do + if [[ ${mindxDLComponentConstantArr[$component]} == "volcano" ]];then + updateComponent=${mindxDLComponentConstantArr[$component]}$volcanoSuffix + else + updateComponent=${mindxDLComponentConstantArr[$component]} + fi + updateAnsibleComponentName=${updateAnsibleComponentName[@]}$updateComponent + done +} + +function doUpdateComponent() { + echo -e "\nUpgrade begins.\n" + ansible-playbook -vv ./upgrade.yaml --tags=upgrade --extra-vars "updateComponent=$updateAnsibleComponentName" +} + +function doCheckUpgradeResult() { + echo -e "\nChecking upgrade result..." + ansible-playbook -vv ./upgrade.yaml --tags=check --extra-vars "updateComponent=$updateAnsibleComponentName" | tee ./check_log.txt + echo -e "\nChecking complete." +} + +function rollbackVolcanoComponent() { + # volcano 0.4.0 version need to do special handing + if [ "$(grep -c "0.4.0" ./check_log.txt)" -eq "1" ];then + cd ../volcano-difference + tr -d '\r' < gen-admission-secret.sh > gen-admission-secret-exec.sh + bash -x gen-admission-secret-exec.sh --service volcano-admission-service --namespace volcano-system --secret volcano-admission-secret || true + kubectl apply -f volcano-v*.yaml + else + cd ../volcano-difference + kubectl apply -f volcano-v*.yaml + fi + cd .. +} function upgrade(){ set -e # Save previous version image info saveImageVersion + echo -e "\nBefore Upgrade:" | tee ./pre_check.txt versionPrint | tee ./pre_check.txt + # get update Component information + echo -e "\nThe update package:" | tee ./pre_check.txt + + getUpdateVersionInfo + updatePackagePrint # Pause - local continue - read -r -p "Do you want to continue upgrade?(yes/no)" continue + local continued + read -r -p "Do you want to continue upgrade?(yes/no)" continued - while [ "$continue" != 'yes' ] && [ "$continue" != 'no' ];do - read -r -p "Invalid input. Do you want to continue upgrade?(yes/no)" continue + while [ "$continued" != 'yes' ] && [ "$continued" != 'no' ];do + echo -e "\n" + read -r -p "Invalid input. Do you want to continue upgrade?(yes/no)" continued done - if [ "$continue" == 'no' ];then + if [ "$continued" == 'no' ];then echo -e "\nUpgrade terminated." return 0 fi # Save previous version yamls exportYaml "./Previous_version_info" - + # Get update Component from user input + getRequireUpdateComponent # Upgrade begins - echo -e "\nUpgrade begins.\n" - ansible-playbook -vv ./upgrade.yaml --tags=upgrade - + getUpdateAnsibleComponentName + doUpdateComponent # Checking upgrade result. - echo -e "\nChecking upgrade result..." - ansible-playbook -vv ./upgrade.yaml --tags=check | tee ./check_log.txt - echo -e "\nChecking complete." - + doCheckUpgradeResult # Post-upgrade processing if [ "$(grep -c "failed=1" ./check_log.txt)" -eq "1" ];then echo -e "\nUpgrade failed.\n" @@ -89,8 +291,8 @@ function upgrade(){ kubectl delete deployment hccl-controller || true kubectl delete deployment volcano-admission -n volcano-system || true kubectl delete job volcano-admission-init -n volcano-system || true - # Wait a short period of time til resources deleted. - while [ "$(kubectl get pods -A | grep -c Terminating)" -gt 0 ];do + # Wait a short period of time till resources deleted. + while [ "$(kubectl get pods --all-namespaces | grep -c Terminating)" -gt 0 ];do echo -e "\nWaiting for the pods to terminate, please do not interrupt.\n" sleep 10 done @@ -100,14 +302,12 @@ function upgrade(){ kubectl apply -f 910-ascend-device-plugin-export.yaml kubectl apply -f cadvisor-export.yaml kubectl apply -f hccl-controller-export.yaml - cd ../volcano-difference - tr -d '\r' < gen-admission-secret.sh > gen-admission-secret-exec.sh - bash -x gen-admission-secret-exec.sh --service volcano-admission-service --namespace volcano-system --secret volcano-admission-secret || true - kubectl apply -f volcano-v*.yaml - cd .. - # Wait a short period of time til resources rolled back. + + rollbackVolcanoComponent + + # Wait a short period of time till resources rolled back. sleep 30s - while [ "$(kubectl get pods -A | grep -c Terminating)" -gt 0 ];do + while [ "$(kubectl get pods --all-namespaces | grep -c Terminating)" -gt 0 ];do echo -e "\nWaiting for the pods to terminate, please do not interrupt.\n" sleep 10 done @@ -129,7 +329,8 @@ function upgrade(){ done # Remove old version images. if [ "$remove" == 'yes' ];then - ansible-playbook upgrade.yaml --tags=remove-images --extra-vars "vA=$vcAdmission vC=$vcControllers vS=$vcScheduler hc=$hc ca=$ca dp=$dp" + # shellcheck disable=SC2068 + ansible-playbook upgrade.yaml --tags=remove-images --extra-vars "vA=$vcAdmission vC=$vcControllers vS=$vcScheduler hc=$hc ca=$ca dp=$dp updateComponant=$updateAnsibleComponentName" fi rm -rf ./Previous_version_info @@ -139,8 +340,8 @@ function upgrade(){ versionPrint | tee ./post_check.txt fi rm -f ./check_log.txt -} +} function main(){ upgrade diff --git a/mindx-dl/upgrade/upgrade.yaml b/mindx-dl/upgrade/upgrade.yaml index e62ff42ca8460329d05349b1ad2e90df0bd836a7..f8122837b5169fecd8e42f7e62341b3bac16efce 100644 --- a/mindx-dl/upgrade/upgrade.yaml +++ b/mindx-dl/upgrade/upgrade.yaml @@ -1,3 +1,4 @@ +# Copyright © Huawei Technologies Co., Ltd. 2020. All rights reserved. --- # This playbook is used to upgrade MindX DL components. # Before running the script, ensure that the dependent files are stored in the 'dls_root_dir/upgrade_dependencies' directory defined in /etc/ansible/hosts @@ -12,7 +13,7 @@ - name: scp images and yaml files from master to workers copy: src: "{{ dls_root_dir }}/upgrade_dependencies" - dest: "{{ dls_root_dir }}/" + dest: "{{ dls_root_dir }}" when: ansible_default_ipv4['address'] != master_ip tags: upgrade @@ -22,33 +23,49 @@ remote_user: root tasks: - # load cadvisor & device-plugin images on nodes - - name: load cadvisor & device-plugin images on nodes - arm64 + # load cadvisor images on nodes + - name: load cadvisor images on worker nodes - arm64 shell: chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" cmd: - docker load -i Ascend-K8sDevicePlugin*arm64-Docker.tar.gz; docker load -i huawei-cadvisor*arm64.tar.gz - when: ansible_architecture == "aarch64" + when: ansible_architecture == "aarch64" and "cadvisor" in updateComponent tags: upgrade - - name: load cadvisor & device-plugin images on nodes - x86_64 + # load device-plugin images on nodes + - name: device-plugin images on worker nodes - arm64 + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" + cmd: + docker load -i Ascend-K8sDevicePlugin*arm64-Docker.tar.gz; + when: ansible_architecture == "aarch64" and ('"ascendplugin-volcano" in updateComponent or "ascendplugin-310" in updateComponent') + tags: upgrade + + # load cadvisor images on nodes + - name: load cadvisor images on worker nodes - x86_64 shell: chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" cmd: - docker load -i Ascend-K8sDevicePlugin*amd64-Docker.tar.gz; docker load -i huawei-cadvisor*amd64.tar.gz - when: ansible_architecture == "x86_64" + when: ansible_architecture == "x86_64" and "cadvisor" in updateComponent tags: upgrade + # load device-plugin images on nodes + - name: load device-plugin images on nodes - x86_64 + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" + cmd: + docker load -i Ascend-K8sDevicePlugin*amd64-Docker.tar.gz; + when: ansible_architecture == "x86_64" and ('"ascendplugin-volcano" in updateComponent or "ascendplugin-310" in updateComponent') + tags: upgrade # This play is used to load images to master node. - hosts: localnode, master remote_user: root tasks: - # load volcano & hccl-controller images on nodes - - name: load volcano & hccl-controller images on nodes - arm64 + # load volcano images on nodes + - name: load volcano images on nodes - arm64 shell: chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" cmd: @@ -56,11 +73,20 @@ docker load -i vc-controller-manager*arm64.tar.gz; docker load -i vc-scheduler*arm64.tar.gz; docker load -i vc-webhook-manager-v*arm64.tar.gz; + when: ansible_architecture == "aarch64" and "volcanoOpen" in updateComponent + tags: upgrade + + # load hccl-controller images on nodes + - name: load hccl-controller images on nodes - arm64 + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" + cmd: docker load -i hccl-controller*arm64.tar.gz; - when: ansible_architecture == "aarch64" + when: ansible_architecture == "aarch64" and "hccl-controller" in updateComponent tags: upgrade - - name: load volcano & hccl-controller images on nodes - x86_64 + # load volcano images on nodes + - name: load volcano images on nodes - x86_64 shell: chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" cmd: @@ -68,8 +94,16 @@ docker load -i vc-controller-manager*amd64.tar.gz; docker load -i vc-scheduler*amd64.tar.gz; docker load -i vc-webhook-manager-v*amd64.tar.gz; + when: ansible_architecture == "x86_64" and "volcanoOpen" in updateComponent + tags: upgrade + + # load hccl-controller images on nodes + - name: load hccl-controller images on nodes - x86_64 + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/images" + cmd: docker load -i hccl-controller*amd64.tar.gz; - when: ansible_architecture == "x86_64" + when: ansible_architecture == "x86_64" and "hccl-controller" in updateComponent tags: upgrade # Handle version differences of components. @@ -77,7 +111,9 @@ shell: cmd: kubectl delete daemonset cadvisor -n cadvisor; + when: '"cadvisor" in updateComponent' tags: upgrade + ignore_errors: True - name: Handling volcano version differences shell: @@ -85,19 +121,53 @@ kubectl delete secret volcano-admission-secret -n volcano-system; kubectl delete deployment volcano-admission -n volcano-system; kubectl delete job volcano-admission-init -n volcano-system + when: '"volcanoOpen" in updateComponent' tags: upgrade ignore_errors: True # Upgrade components by use kubectl apply command to change the image currently in use. - - name: Upgrade services + - name: Upgrade volcano shell: chdir: "{{ dls_root_dir }}/upgrade_dependencies/yamls" cmd: - kubectl apply -f hccl-controller*.yaml; kubectl apply -f volcano-*.yaml; - kubectl apply -f cadvisor-*.yaml; + when: '"volcanoOpen" in updateComponent' + tags: upgrade + ignore_errors: True + + - name: Upgrade hccl + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/yamls" + cmd: + kubectl apply -f hccl-controller*.yaml; + when: '"hccl-controller" in updateComponent' + tags: upgrade + ignore_errors: True + + - name: Upgrade ascendplugin-volcano + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/yamls" + cmd: kubectl apply -f ascendplugin-volcano*.yaml; + when: '"ascendplugin-volcano" in updateComponent' + tags: upgrade + ignore_errors: True + + - name: Upgrade ascendplugin-310 + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/yamls" + cmd: kubectl apply -f ascendplugin-310*.yaml; + when: '"ascendplugin-310" in updateComponent' + tags: upgrade + ignore_errors: True + + - name: Upgrade cadvisor + shell: + chdir: "{{ dls_root_dir }}/upgrade_dependencies/yamls" + cmd: + kubectl apply -f cadvisor-*.yaml; + when: '"cadvisor" in updateComponent' tags: upgrade ignore_errors: True @@ -115,6 +185,7 @@ until: "return_value.stdout == '1'" retries: 6 delay: 10 + when: '"hccl-controller" in updateComponent' tags: check - name: Check volcano status @@ -125,34 +196,10 @@ until: "return_value.stdout == '3'" retries: 6 delay: 10 + when: '"volcanoOpen" in updateComponent' tags: check -- hosts: localnode - remote_user: root - - tasks: - # Check status of components. - - name: Check cadvisor status - shell: - cmd: - kubectl get pods --all-namespaces -o wide | grep cadvisor |grep Running | wc -l - register: return_value - until: "return_value.stdout == '1'" - retries: 6 - delay: 10 - tags: check - - - name: Check device plugin status - shell: - cmd: - kubectl get pods --all-namespaces | grep ascend-device-plugin |grep Running | wc -l - register: return_value - until: "return_value.stdout == '1'" - retries: 6 - delay: 10 - tags: check - -- hosts: master +- hosts: master,localnode remote_user: root vars: HOST_COUNT: "{{ groups['workers'] | length }}" @@ -167,6 +214,7 @@ until: "return_value.stdout == HOST_COUNT" retries: 6 delay: 10 + when: '"cadvisor" in updateComponent' tags: check - name: Check device plugin status @@ -177,6 +225,7 @@ until: "return_value.stdout == HOST_COUNT" retries: 6 delay: 10 + when: '"ascendplugin-volcano" in updateComponent or "ascendplugin-310" in updateComponent' tags: check # The following plays are used to remove previous version images after upgrade. @@ -191,6 +240,7 @@ docker rmi {{ vC }} docker rmi {{ vS }} ignore_errors: True + when: '"volcanoOpen" in updateComponent' tags: remove-images - name: Remove hccl-controller image @@ -198,6 +248,7 @@ cmd: docker rmi {{ hc }} ignore_errors: True + when: '"hccl-controller" in updateComponent' tags: remove-images - hosts: localnode, workers @@ -209,6 +260,7 @@ cmd: docker rmi {{ ca }} ignore_errors: True + when: '"cadvisor" in updateComponent ' tags: remove-images - name: Remove device-plugin image @@ -216,4 +268,15 @@ cmd: docker rmi {{ dp }} ignore_errors: True + when: '"ascendplugin-volcano" in updateComponent or "ascendplugin-310" in updateComponent' + tags: remove-images + + # remove the update files + - name: Remove update package files + file: + path: "{{ dls_root_dir }}/upgrade_dependencies" + state: absent + ignore_errors: True + when: + - ansible_hostname not in groups["master"] tags: remove-images \ No newline at end of file