From 01f288baccbafaa233d1180482d43baafb2fee3a Mon Sep 17 00:00:00 2001
From: mabofu
Date: Fri, 26 May 2023 18:00:47 +0800
Subject: [PATCH] fixed bug-I79978: dsms-deploy init cluster failed

---
 dsms_deploy/dsms-deploy.py | 211 ++++++++++++++++++++++++++++++++++---
 1 file changed, 195 insertions(+), 16 deletions(-)

diff --git a/dsms_deploy/dsms-deploy.py b/dsms_deploy/dsms-deploy.py
index 5658cc0..a572496 100755
--- a/dsms_deploy/dsms-deploy.py
+++ b/dsms_deploy/dsms-deploy.py
@@ -26,8 +26,8 @@ ADD_NODE_CONF = 'add-node.conf'
 DEPEND_LIST = 'depend_list'
 LEADER_NODE = 'NODE1'
 SSH_PATH = '/root/.ssh'
-
 INIT_DIR = '/home/my-cluster'
+SERVER_CHECK_RETRY = 10
 
 update_hosts = """
 #!/bin/bash
@@ -96,21 +96,21 @@ logging_config = {
             'class': 'logging.handlers.RotatingFileHandler',
             'formatter': 'dsms-storage',
             'filename': '%s/dsms-deploy.log' % LOG_DIR,
-            'maxBytes': 1024000,
-            'backupCount': 1,
+            'maxBytes': 100 * 1024 * 1024,
+            'backupCount': 10,
         },
         'paramiko_log_file': {
             'level': 'INFO',
             'class': 'logging.handlers.RotatingFileHandler',
             'formatter': 'paramiko',
             'filename': '%s/dsms-deploy.log' % LOG_DIR,
-            'maxBytes': 1024000,
-            'backupCount': 1,
+            'maxBytes': 100 * 1024 * 1024,
+            'backupCount': 10,
         }
     },
     'loggers': {
         '': {
-            'level': 'INFO',
+            'level': 'DEBUG',
            'handlers': ['console', 'log_file'],
         },
         'paramiko': {
@@ -263,7 +263,7 @@ def read_add_conf(conf_path):
 
 
 def run_ssh_command(server, username, password, command):
-    logger.info(f"{server} execute start: `{command}`")
+    logger.debug(f"{server} execute start: `{command}`")
     ssh_client = paramiko.SSHClient()
     ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
     ssh_client.connect(hostname=server, username=username, password=password,
@@ -276,10 +276,10 @@
 
     # Real-time prints execution and writes to a class file object
     for line in stdout:
-        logger.info(line.strip())
+        logger.debug(line.strip())
         output_file.write(line)
     for line in stderr:
-        logger.info(line.strip())
+        logger.debug(line.strip())
         output_file.write(line)
 
     exit_code = stdout.channel.recv_exit_status()
@@ -288,10 +288,10 @@
     ssh_client.close()
 
     if exit_code == 0:
-        logger.info(f"{server} execute end: `{command}` success")
+        logger.debug(f"{server} execute end: `{command}` success")
         return output
     else:
-        logger.error('{}{}{}'.format(termcolor.red, f"{server} execute end: `{command}` failed", termcolor.end))
+        logger.debug('{}{}{}'.format(termcolor.red, f"{server} execute end: `{command}` failed", termcolor.end))
         raise DsmsDeployException(output)
 
 
@@ -502,9 +502,117 @@ def configure_cluster(nodes):
     return result
 
 
+def is_mgr_available(ip, password):
+    # type: (str, str) -> bool
+    """
+    Check whether the mgr is available
+    """
+    try:
+        cluster_status = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph status -f json-pretty')
+        json_result = json.loads(cluster_status)
+        return json_result.get('mgrmap', {}).get('available', False)
+    except Exception as e:
+        logger.debug('status failed: %s' % e)
+        return False
+
+
+def is_mon_available(ip, password):
+    # type: (str, str) -> bool
+    """
+    Check whether the mon is available
+    """
+    try:
+        cluster_status = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph status -f json-pretty')
+        json_result = json.loads(cluster_status)
+        return json_result.get('monmap', {}).get('num_mons', 0) > 0
+    except Exception as e:
+        logger.debug('status failed: %s' % e)
+        return False
+
+
+def is_mds_available(ip, password):
+    # type: (str, str) -> bool
+    """
+    Check whether the mds is available
+    """
+    try:
+        cluster_status = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph status -f json-pretty')
+        json_result = json.loads(cluster_status)
+        return json_result.get('fsmap', {}).get('up:standby', 0) > 0
+    except Exception as e:
+        logger.debug('status failed: %s' % e)
+        return False
+
+
+def is_available(service, ip, passwd, func):
+    """
+    Wait for a service to become available.
+
+    :param service: the name of the service
+    :param ip: ceph node ssh ip
+    :param passwd: ceph node ssh password
+    :param func: the callable object that determines availability
+    """
+    logger.info('waiting for %s...' % service)
+    num = 1
+    while True:
+        if func(ip, passwd):
+            logger.info('%s is available' % service)
+            break
+        elif num > SERVER_CHECK_RETRY:
+            raise DsmsDeployException('%s not available after %s tries' % (service, SERVER_CHECK_RETRY))
+
+        logger.info('%s not available, waiting (%s/%s)...' % (service, num, SERVER_CHECK_RETRY))
+
+        num += 1
+        time.sleep(1)
+
+
+# wait for mgr to restart (after enabling a module)
+def wait_for_mgr_restart(ip, password):
+    # first get latest mgrmap epoch from the mon
+    mgr_dump = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph mgr dump')
+    j = json.loads(mgr_dump)
+    epoch = j['epoch']
+    # wait for mgr to have it
+    logger.info('waiting for the mgr to restart...')
+
+    def mgr_has_latest_epoch(ip, password):
+        # type: (str, str) -> bool
+        try:
+            out = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph tell mgr mgr_status')
+            j = json.loads(out)
+            return j['mgrmap_epoch'] >= epoch
+        except Exception as e:
+            logger.debug('tell mgr mgr_status failed: %s' % e)
+            return False
+
+    is_available('Mgr version %d' % epoch, ip, password, mgr_has_latest_epoch)
+
+
+# remove ceph orch services
+def remove_ceph_orch_services(ip, password):
+    # first get ceph orch services
+    ceph_orch_ls = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph orch ls -f json')
+    orch_services = json.loads(ceph_orch_ls)
+    for service in orch_services:
+        logger.info(f'removing ceph orch service {service["service_name"]}')
+        run_ssh_command(ip, DEPLOY_USERNAME, password, f'ceph orch rm {service["service_name"]}')
+
+
+def get_ceph_fs_id(ip, password):
+    try:
+        cluster_fs_id = run_ssh_command(ip, DEPLOY_USERNAME, password, 'ceph fsid -f json-pretty')
+        json_result = json.loads(cluster_fs_id)
+        return json_result.get('fsid', '')
+    except Exception as e:
+        logger.debug('get cluster fsid failed: %s' % e)
+        return ''
+
+
 def deploy_node(node):
     if node.ip:
-        logger.info(f'start deploy server: {node.hostname}')
+        logger.info(f'starting to deploy server: {node.hostname} ...')
         run_ssh_command(node.ip, DEPLOY_USERNAME, node.root_passwd, f'mkdir -p {DEPLOY_PACKAGE}')
         depend_list = []
         whl_list = []
@@ -589,6 +697,54 @@ def check_time_difference(nodes):
     logger.info("Check servers time pass")
 
 
+def purge_cluster(nodes):
+    logger.info('starting to purge the old cluster ...')
+    leader = nodes.get(LEADER_NODE)
+
+    try:
+        fsid = get_ceph_fs_id(leader.ip, leader.root_passwd)
+        remove_ceph_orch_services(leader.ip, leader.root_passwd)
+    except Exception as e:
+        logger.debug('remove ceph orch services failed: %s' % e)
+
+    for server in nodes.values():
+        # purge osd
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-osd@*.service')
+        osd_count = run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'df -h | grep /var/lib/ceph/osd/ceph- | wc -l')
+        if int(osd_count.strip()) > 0:
+            run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'umount -f -l /var/lib/ceph/osd/ceph-*')
+
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-osd.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/osd/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-osd/*')
+        # purge mds
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-mds@*.service')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-mds.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/mds/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-mds/*')
+        # purge mgr
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-mgr@*.service')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-mgr.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/mgr/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-mgr/*')
+        # purge mon
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'systemctl stop ceph-mon@*.service')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/run/ceph/ceph-mon.*.asok')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/mon/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/lib/ceph/bootstrap-mon/*')
+        # purge /etc/ceph
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /etc/ceph/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /run/cephadm/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /run/ceph/*')
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, 'rm -rf /var/log/ceph/*')
+        if fsid:
+            run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, f'rm -rf /var/lib/ceph/{fsid}')
+
+        # purge deploy-dir
+        run_ssh_command(server.ip, DEPLOY_USERNAME, server.root_passwd, f'rm -rf {INIT_DIR}/*')
+    logger.info('old cluster purged successfully')
+
+
 def command_version(args):
     return VERSION
 
@@ -610,19 +766,41 @@ def command_init(args):
     configure_cluster(nodes)
     leader = nodes.get(LEADER_NODE)
     hostname = ' '.join([node.hostname for node in nodes.values()])
+    # first purge the existing cluster
+    purge_cluster(nodes)
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'mkdir -p {INIT_DIR}')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy new {hostname} --cluster-network={leader.cluster_network} --public-network={leader.public_network}')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy --overwrite-conf config push {hostname}')
 
+    logger.info('Creating mon...')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy mon create-initial')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy admin {hostname}')
+    logger.info('waiting for mon to start...')
+    is_available('mon', leader.ip, leader.root_passwd, is_mon_available)
+
+    logger.info('Creating mgr...')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy mgr create {leader.hostname}')
+    logger.info('waiting for mgr to start...')
+    is_available('mgr', leader.ip, leader.root_passwd, is_mgr_available)
+
+    logger.info('Creating mds...')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, f'cd {INIT_DIR};ceph-deploy mds create {hostname}')
-    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable cephadm --force')
-    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set mgr mgr/prometheus/scrape_interval 10')
-    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable prometheus --force')
-    time.sleep(20)  # give some room to start
+    logger.info('waiting for mds to start...')
+    is_available('mds', leader.ip, leader.root_passwd, is_mds_available)
+
+    # Configure cluster
+    logger.info('mon, mgr and mds services are started.')
+    logger.info('Starting cluster configuration...')
+    logger.info('enabling cephadm module...')
+    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable cephadm')
+    wait_for_mgr_restart(leader.ip, leader.root_passwd)
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph orch set backend cephadm')
+
+    logger.info('enabling prometheus module...')
+    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set mgr mgr/prometheus/scrape_interval 10')
+    run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph mgr module enable prometheus')
+    wait_for_mgr_restart(leader.ip, leader.root_passwd)
+
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph cephadm generate-key')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph cephadm get-pub-key > ~/ceph.pub')
 
@@ -643,6 +821,7 @@ def command_init(args):
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set global mon_warn_on_pool_no_app false')
     run_ssh_command(leader.ip, DEPLOY_USERNAME, leader.root_passwd, 'ceph config set mgr mgr/devicehealth/enable_monitoring false')
     time.sleep(5)  # give some room to start
+    command_info('')
     logger.info('init dsms-storage cluster success')
 
 
-- 
Gitee