From 01f288baccbafaa233d1180482d43baafb2fee3a Mon Sep 17 00:00:00 2001
From: mabofu
Date: Fri, 26 May 2023 18:00:47 +0800
Subject: [PATCH] fixed bug-I79978: dsms-deploy init cluster failed

---
 dsms_deploy/dsms-deploy.py | 211 ++++++++++++++++++++++++++++++++++---
 1 file changed, 195 insertions(+), 16 deletions(-)

diff --git a/dsms_deploy/dsms-deploy.py b/dsms_deploy/dsms-deploy.py
index 5658cc0..a572496 100755
--- a/dsms_deploy/dsms-deploy.py
+++ b/dsms_deploy/dsms-deploy.py
@@ -26,8 +26,8 @@ ADD_NODE_CONF = 'add-node.conf'
 DEPEND_LIST = 'depend_list'
 LEADER_NODE = 'NODE1'
 SSH_PATH = '/root/.ssh'
-
 INIT_DIR = '/home/my-cluster'
+SERVER_CHECK_RETRY = 10
 
 update_hosts = """
 #!/bin/bash
@@ -96,21 +96,21 @@ logging_config = {
             'class': 'logging.handlers.RotatingFileHandler',
             'formatter': 'dsms-storage',
             'filename': '%s/dsms-deploy.log' % LOG_DIR,
-            'maxBytes': 1024000,
-            'backupCount': 1,
+            'maxBytes': 100 * 1024 * 1024,
+            'backupCount': 10,
         },
         'paramiko_log_file': {
             'level': 'INFO',
             'class': 'logging.handlers.RotatingFileHandler',
             'formatter': 'paramiko',
             'filename': '%s/dsms-deploy.log' % LOG_DIR,
-            'maxBytes': 1024000,
-            'backupCount': 1,
+            'maxBytes': 100 * 1024 * 1024,
+            'backupCount': 10,
         }
     },
     'loggers': {
         '': {
-            'level': 'INFO',
+            'level': 'DEBUG',
            'handlers': ['console', 'log_file'],
         },
         'paramiko': {
@@ -263,7 +263,7 @@ def read_add_conf(conf_path):
 
 
 def run_ssh_command(server, username, password, command):
-    logger.info(f"{server} execute start: `{command}`")
+    logger.debug(f"{server} execute start: `{command}`")
     ssh_client = paramiko.SSHClient()
     ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
     ssh_client.connect(hostname=server, username=username, password=password,
@@ -276,10 +276,10 @@
 
     # Real-time prints execution and writes to a class file object
     for line in stdout:
-        logger.info(line.strip())
+        logger.debug(line.strip())
         output_file.write(line)
     for line in stderr:
-        logger.info(line.strip())
+        logger.debug(line.strip())
         output_file.write(line)
 
     exit_code = stdout.channel.recv_exit_status()
@@ -288,10 +288,10 @@
     ssh_client.close()
 
     if exit_code == 0:
-        logger.info(f"{server} execute end: `{command}` success")
+        logger.debug(f"{server} execute end: `{command}` success")
         return output
     else:
-        logger.error('{}{}{}'.format(termcolor.red, f"{server} execute end: `{command}` failed", termcolor.end))
+        logger.debug('{}{}{}'.format(termcolor.red, f"{server} execute end: `{command}` failed", termcolor.end))
         raise DsmsDeployException(output)
 
 
@@ -502,9 +502,117 @@ def configure_cluster(nodes):
     return result
 
 
+def is_mgr_available(ip, password):
+    # type: (str, str) -> bool
+    """
+    Check whether the mgr is available
+    """
+    try:
+        cluster_status = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph status -f json-pretty')
+        json_result = json.loads(cluster_status)
+        return json_result.get('mgrmap', {}).get('available', False)
+    except Exception as e:
+        logger.debug('status failed: %s' % e)
+        return False
+
+
+def is_mon_available(ip, password):
+    # type: (str, str) -> bool
+    """
+    Check whether the mon is available
+    """
+    try:
+        cluster_status = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph status -f json-pretty')
+        json_result = json.loads(cluster_status)
+        return json_result.get('monmap', {}).get('num_mons', 0) > 0
+    except Exception as e:
+        logger.debug('status failed: %s' % e)
+        return False
+
+
+def is_mds_available(ip, password):
+    # type: (str, str) -> bool
+    """
+    Check whether the mds is available
+    """
+    try:
+        cluster_status = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph status -f json-pretty')
+        json_result = json.loads(cluster_status)
+        return json_result.get('fsmap', {}).get('up:standby', 0) > 0
+    except Exception as e:
+        logger.debug('status failed: %s' % e)
+        return False
+
+
+def is_available(service, ip, passwd, func):
+    """
+    Wait for a service to become available.
+
+    :param service: the name of the service
+    :param ip: ceph node ssh ip
+    :param passwd: ceph node ssh password
+    :param func: the callable object that determines availability
+    """
+    logger.info('waiting for %s...' % service)
+    num = 1
+    while True:
+        if func(ip, passwd):
+            logger.info('%s is available' % service)
+            break
+        elif num > SERVER_CHECK_RETRY:
+            raise DsmsDeployException('%s not available after %s tries' % (service, SERVER_CHECK_RETRY))
+
+        logger.info('%s not available, waiting (%s/%s)...' % (service, num, SERVER_CHECK_RETRY))
+
+        num += 1
+        time.sleep(1)
+
+
+# wait for mgr to restart (after enabling a module)
+def wait_for_mgr_restart(ip, password):
+    # first get latest mgrmap epoch from the mon
+    mgr_dump = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph mgr dump')
+    j = json.loads(mgr_dump)
+    epoch = j['epoch']
+    # wait for mgr to have it
+    logger.info('waiting for the mgr to restart...')
+
+    def mgr_has_latest_epoch(ip, password):
+        # type: (str, str) -> bool
+        try:
+            out = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph tell mgr mgr_status')
+            j = json.loads(out)
+            return j['mgrmap_epoch'] >= epoch
+        except Exception as e:
+            logger.debug('tell mgr mgr_status failed: %s' % e)
+            return False
+
+    is_available('Mgr version %d' % epoch, ip, password, mgr_has_latest_epoch)
+
+
+# remove ceph orch services
+def remove_ceph_orch_services(ip, password):
+    # first get ceph orch services
+    ceph_orch_ls = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph orch ls -f json')
+    orch_services = json.loads(ceph_orch_ls)
+    for service in orch_services:
+        logger.info(f'removing ceph orch service {service["service_name"]}')
+        run_ssh_command(ip, DEPLOY_USERNAME, password, f'ceph orch rm {service["service_name"]}')
+
+
+def get_ceph_fs_id(ip, password):
+    try:
+        cluster_fs_id = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph fsid -f json-pretty')
+        json_result = json.loads(cluster_fs_id)
+        return json_result.get('fsid', '')
+    except Exception as e:
+        logger.debug('get cluster fsid failed: %s' % e)
+        return ''
+
+
 def deploy_node(node):
     if node.ip:
-        logger.info(f'start deploy server: {node.hostname}')
+        logger.info(f'starting to deploy server: {node.hostname} ...')
         run_ssh_command(node.ip, DEPLOY_USERNAME, node.root_passwd, f'mkdir -p {DEPLOY_PACKAGE}')
         depend_list = []
         whl_list = []
@@ -589,6 +697,54 @@ def check_time_difference(nodes):
     logger.info("Check servers time pass")
 
 
+def purge_cluster(nodes):
+    logger.info('starting to purge the old cluster ...')
+    leader = nodes.get(LEADER_NODE)
+
+    try:
+        fsid = get_ceph_fs_id(leader.ip, leader.root_passwd)
+        remove_ceph_orch_services(leader.ip, leader.root_passwd)
+    except Exception as e:
+        logger.debug('remove ceph orch services failed: %s' % e)
+
+    for server in nodes.values():
+        # purge osd
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-osd@*.service')
+        osd_count = run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'df -h | grep /var/lib/ceph/osd/ceph- | wc -l')
+        if int(osd_count.strip()) > 0:
+            run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'umount -f -l /var/lib/ceph/osd/ceph-*')
+
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-osd.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/osd/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-osd/*')
+        # purge mds
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-mds@*.service')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-mds.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/mds/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-mds/*')
+        # purge mgr
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-mgr@*.service')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-mgr.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/mgr/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-mgr/*')
+        # purge mon
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-mon@*.service')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-mon.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/mon/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-mon/*')
+        # purge /etc/ceph
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /etc/ceph/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /run/cephadm/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /run/ceph/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/log/ceph/*')
+        if fsid:
+            run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, f'rm -rf /var/lib/ceph/{fsid}')
+
+        # purge deploy-dir
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, f'rm -rf {INIT_DIR}/*')
+    logger.info('old cluster purged successfully')
+
+
 def command_version(args):
     return VERSION
 
@@ -610,19 +766,41 @@ def command_init(args):
     configure_cluster(nodes)
     leader = nodes.get(LEADER_NODE)
     hostname = ' '.join([node.hostname for node in nodes.values()])
+    # first purge the existing cluster
+    purge_cluster(nodes)
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'mkdir -p {INIT_DIR}')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy new {hostname} --cluster-network={leader.cluster_network} --public-network={leader.public_network}')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy --overwrite-conf config push {hostname}')
 
+    logger.info('Creating mon...')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy mon create-initial')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy admin {hostname}')
+    logger.info('waiting for mon to start...')
+    is_available('mon', leader.ip, leader.root_passwd, is_mon_available)
+
+    logger.info('Creating mgr...')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy mgr create {leader.hostname}')
+    logger.info('waiting for mgr to start...')
+    is_available('mgr', leader.ip, leader.root_passwd, is_mgr_available)
+
+    logger.info('Creating mds...')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy mds create {hostname}')
-    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable cephadm --force')
-    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set mgr mgr/prometheus/scrape_interval 10')
-    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable prometheus --force')
-    time.sleep(20)  # give some room to start
+    logger.info('waiting for mds to start...')
+    is_available('mds', leader.ip, leader.root_passwd, is_mds_available)
+
+    # Configure cluster
+    logger.info('mon, mgr and mds services are started.')
+    logger.info('Starting cluster configuration...')
+    logger.info('enabling cephadm module...')
+    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable cephadm')
+    wait_for_mgr_restart(leader.ip, leader.root_passwd)
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph orch set backend cephadm')
+
+    logger.info('enabling prometheus module...')
+    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set mgr mgr/prometheus/scrape_interval 10')
+    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable prometheus')
+    wait_for_mgr_restart(leader.ip, leader.root_passwd)
+
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph cephadm generate-key')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph cephadm get-pub-key > ~/ceph.pub')
 
@@ -643,6 +821,7 @@ def command_init(args):
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set global mon_warn_on_pool_no_app false')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set mgr mgr/devicehealth/enable_monitoring false')
     time.sleep(5)  # give some room to start
+    command_info('')
     logger.info('init dsms-storage cluster success')
 
 
-- 
Gitee