From 25ff10e6e7466997a140a5a79d4fa476e81f429c Mon Sep 17 00:00:00 2001
From: 何霖
Date: Thu, 7 Mar 2024 10:52:56 +0800
Subject: [PATCH 001/302] Adapt the build scripts to the blue-zone CI
 environment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 build/build.sh                     | 12 ------------
 build/build_tf1_with_opensource.sh |  2 ++
 build/build_tf2_with_opensource.sh |  2 ++
 tests/run_python_dt.sh             |  5 +++++
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/build/build.sh b/build/build.sh
index ad7db096..0eb688fd 100644
--- a/build/build.sh
+++ b/build/build.sh
@@ -103,14 +103,8 @@ clean()
 if [ "$(uname -m)" = "x86_64" ]
 then
     echo "-----Build gen tar -----"
-    source /opt/buildtools/tf1_env/bin/activate
-    pip3 install setuptools==65.6.3
     bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh
-    deactivate tf1_env
-    source /opt/buildtools/tf2_env/bin/activate
-    pip3 install setuptools==65.6.3
     bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh
-    deactivate tf2_env
 
     gen_tar_file
     echo "-----Build gen tar finished-----"
@@ -121,14 +115,8 @@ fi
 if [ "$(uname -m)" = "aarch64" ]
 then
     echo "-----Build gen tar -----"
-    source /opt/buildtools/tf1_env/bin/activate
-    pip3 install setuptools==65.6.3
     bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh
-    deactivate tf1_env
-    source /opt/buildtools/tf2_env/bin/activate
-    pip3 install setuptools==65.6.3
     bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh
-    deactivate tf2_env
 
     gen_tar_file
     echo "-----Build gen tar finished-----"
diff --git a/build/build_tf1_with_opensource.sh b/build/build_tf1_with_opensource.sh
index 37cfcf64..ff59571c 100644
--- a/build/build_tf1_with_opensource.sh
+++ b/build/build_tf1_with_opensource.sh
@@ -60,7 +60,9 @@ prepare_pybind
 prepare_securec
 
 # Configure the TF1 path
+source /opt/buildtools/tf1_env/bin/activate
 tf1_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow_core
+deactivate tf1_env
 
 project_output_path="${MxRec_DIR}"/output/
 VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml
diff --git a/build/build_tf2_with_opensource.sh b/build/build_tf2_with_opensource.sh
index bf4a5b03..08aaf164 100644
--- a/build/build_tf2_with_opensource.sh
+++ b/build/build_tf2_with_opensource.sh
@@ -60,7 +60,9 @@ prepare_pybind
 prepare_securec
 
 # Configure the TF2 path
+source /opt/buildtools/tf2_env/bin/activate
 tf2_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow
+deactivate tf2_env
 
 project_output_path="${MxRec_DIR}"/output/
 VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml
diff --git a/tests/run_python_dt.sh b/tests/run_python_dt.sh
index e0d92666..a64a0913 100644
--- a/tests/run_python_dt.sh
+++ b/tests/run_python_dt.sh
@@ -20,6 +20,11 @@ set -e
 CUR_PATH=$(cd "$(dirname "$0")" || { warn "Failed to check path/to/run_python_dt.sh" ; exit ; } ; pwd)
 TOP_PATH="${CUR_PATH}"/../
 
+ARCH="$(uname -m)"
+if [ "$ARCH" = "aarch64" ]; then
+    export LD_PRELOAD=/usr/local/gcc7.3.0/lib64/libgomp.so.1
+fi
+
 # build mxRec and get output directory
 pip3 install setuptools==65.6.3
 bash "$TOP_PATH"/build/build_tf1_with_opensource.sh
-- 
Gitee

From 7bbe353809783d414ff22dfbb756fe77fbce6bbc Mon Sep 17 00:00:00 2001
From: 何霖
Date: Mon, 11 Mar 2024 19:30:29 +0800
Subject: [PATCH 002/302] Sync the tools directory from the master branch to
 develop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/atomic/gen_mt_data_0to1e.py             |  78 ++++
 tools/atomic/model_info.md                    |  19 +
 tools/atomic/sparse.sh                        |  60 +++
 tools/atomic/sparse_lookup.py                 | 266 ++++++++++++
 tools/atomic/sparse_lookup_with_grad.py       | 277 +++++++++++++
 tools/atomic/sparse_ops/__init__.py           |   7 +
 tools/atomic/sparse_ops/config.py             | 111 +++++
 tools/atomic/sparse_ops/ops.py                | 133 ++++++
 tools/atomic/sparse_ops/utils.py              |  23 ++
 ...3\346\236\234-tf1.15-rec0630-cann530.xlsx" | Bin 0 -> 32363 bytes
 tools/feature_admit_tools/get_hist.py         |  16 +
 tools/feature_admit_tools/static_key_count.py |  61 +++
 ...71\346\257\224\346\226\271\346\263\225.md" |  21 +
 tools/model_convert/README.md                 | 119 ++++++
 tools/model_convert/model_convert.py          | 287 +++++++++++++
 tools/model_convert/model_convert_mt_v2.py    | 246 +++++++++++
 tools/mx_rec_perf.sh                          |  71 ++++
 tools/parse_data/data_parser.py               | 133 ++++++
 tools/parse_data/run.sh                       |  11 +
 tools/perf/fast.sh                            | 391 ++++++++++++++++++
 tools/perf/host_set.sh                        |  17 +
 tools/perf/msprof.sh                          |  24 ++
 tools/perf/mt_1207.sh                         |  60 +++
 tools/perf/perf_flame_graph.sh                |  37 ++
 tools/python/images/clip_image002.jpg         | Bin 0 -> 9453 bytes
 tools/python/images/clip_image004.jpg         | Bin 0 -> 8027 bytes
 tools/python/images/clip_image006.jpg         | Bin 0 -> 21733 bytes
 tools/python/images/clip_image008.jpg         | Bin 0 -> 26810 bytes
 tools/python/images/clip_image010.jpg         | Bin 0 -> 24851 bytes
 tools/python/images/clip_image012.jpg         | Bin 0 -> 17452 bytes
 tools/python/images/clip_image014.jpg         | Bin 0 -> 18658 bytes
 tools/python/images/clip_image016.jpg         | Bin 0 -> 6056 bytes
 tools/python/images/clip_image018.gif         | Bin 0 -> 70465 bytes
 tools/python/key_2_emb_formatter.py           | 220 ++++++++++
 tools/python/optimizer_process.py             | 116 ++++++
 tools/python/readme.md                        | 110 +++++
 tools/stat_info/main.py                       | 339 +++++++++++++++
 tools/stat_info/readme.md                     |  45 ++
 38 files changed, 3298 insertions(+)
 create mode 100644 tools/atomic/gen_mt_data_0to1e.py
 create mode 100644 tools/atomic/model_info.md
 create mode 100644 tools/atomic/sparse.sh
 create mode 100644 tools/atomic/sparse_lookup.py
 create mode 100644 tools/atomic/sparse_lookup_with_grad.py
 create mode 100644 tools/atomic/sparse_ops/__init__.py
 create mode 100644 tools/atomic/sparse_ops/config.py
 create mode 100644 tools/atomic/sparse_ops/ops.py
 create mode 100644 tools/atomic/sparse_ops/utils.py
 create mode 100644 "tools/atomic/\345\216\237\345\255\220\346\265\213\350\257\225\347\273\223\346\236\234-tf1.15-rec0630-cann530.xlsx"
 create mode 100644 tools/feature_admit_tools/get_hist.py
 create mode 100644 tools/feature_admit_tools/static_key_count.py
 create mode 100644 "tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md"
 create mode 100644 tools/model_convert/README.md
 create mode 100644 tools/model_convert/model_convert.py
 create mode 100644 tools/model_convert/model_convert_mt_v2.py
 create mode 100644 tools/mx_rec_perf.sh
 create mode 100644 tools/parse_data/data_parser.py
 create mode 100644 tools/parse_data/run.sh
 create mode 100644 tools/perf/fast.sh
 create mode 100644 tools/perf/host_set.sh
 create mode 100644 tools/perf/msprof.sh
 create mode 100644 tools/perf/mt_1207.sh
 create mode 100644 tools/perf/perf_flame_graph.sh
 create mode 100644 tools/python/images/clip_image002.jpg
 create mode 100644 tools/python/images/clip_image004.jpg
 create mode 100644 tools/python/images/clip_image006.jpg
 create mode 100644 tools/python/images/clip_image008.jpg
 create mode 100644 tools/python/images/clip_image010.jpg
 create mode 100644 tools/python/images/clip_image012.jpg
 create mode 100644 tools/python/images/clip_image014.jpg
 create mode 100644 tools/python/images/clip_image016.jpg
 create mode 100644 tools/python/images/clip_image018.gif
 create mode 100644 tools/python/key_2_emb_formatter.py
 create mode 100644 tools/python/optimizer_process.py
 create mode 100644 tools/python/readme.md
 create mode 100644 tools/stat_info/main.py
 create mode 100644 tools/stat_info/readme.md

diff --git a/tools/atomic/gen_mt_data_0to1e.py b/tools/atomic/gen_mt_data_0to1e.py
new file mode 100644
index 00000000..b9c89c65
--- /dev/null
+++ b/tools/atomic/gen_mt_data_0to1e.py
@@ -0,0 +1,78 @@
+import os
+import random
+import sys
+
+import numpy as np
+import tensorflow as tf
+
+np.random.seed(0)
+
+line_per_sample = 10000
+samples_num = 10000 * 800
+sparse_feat_list = ['feat_ids']
+# TODO: keep feature lengths in sync with sparse_feat_list
+sparse_feat_len = [100]
+
+# uniq_ratio = pd.read_csv("./uniq_ratio.csv")
+# uniq_ratio["uniq_num"] = round(uniq_ratio["uniq_ratio"] * 301)
+
+num = 0
+
+# first CLI argument: hot-key (repetition) ratio in tenths, e.g. 5 -> 50% repeated keys
+hot_zhanbi = float(sys.argv[1]) / 10
+print(hot_zhanbi)
+
+tfpath = "/home/insert/data" + str(hot_zhanbi)
+if not os.path.exists(tfpath):
+    os.mkdir(tfpath)
+
+tfpath = "/home/insert/data" + str(hot_zhanbi) + "/tf"
+
+part1 = np.array(random.sample(range(0, 2), 1))
+
+
+def write_records(writer, line_cnt, file_cnt):
+    features = {
+        'label': tf.train.Feature(
+            float_list=tf.train.FloatList(value=np.random.randint(2, size=line_per_sample).tolist()))
+    }
+
+    count = 0
+    for i, sparse_feat in enumerate(sparse_feat_list):
+        np.random.seed(count)
+        print("===sparse=", sparse_feat)
+        # cold keys: unique ids drawn from a window that advances with the file/line index
+        part2 = np.array(random.sample(
+            range(100 * line_per_sample * (10 * file_cnt + line_cnt),
+                  100 * line_per_sample * (10 * file_cnt + line_cnt + 1)),
+            int(100 * line_per_sample * (1 - hot_zhanbi))))
+        # hot keys: one repeated id, replicated to reach the requested hot ratio
+        features[sparse_feat] = tf.train.Feature(
+            int64_list=tf.train.Int64List(
+                value=part1.astype(np.int64).tolist() * int(100 * line_per_sample * hot_zhanbi)
+                      + part2.astype(np.int64).tolist())
+        )
+
+        count += 1
+    features = tf.train.Features(feature=features)
+    example = tf.train.Example(features=features)
+    writer.write(example.SerializeToString())
+
+
+def gen_tfrecords(tfpath):
+    file_cnt = 0
+    line_per_file = 10
+    line_cnt = 0
+    writer = tf.python_io.TFRecordWriter(f"{tfpath}_{file_cnt}.tfrecord")
+    sample_cnt = 0
+    while True:
+        write_records(writer, line_cnt, file_cnt)
+        line_cnt += 1
+        sample_cnt += line_per_sample
+        print(f">>>>>>>>>>>>count {sample_cnt} end.")
+        if sample_cnt == samples_num:
+            break
+        if line_cnt == line_per_file:
+            file_cnt += 1
+            line_cnt = 0
+            writer.close()
+            writer = tf.python_io.TFRecordWriter(f"{tfpath}_{file_cnt}.tfrecord")
+    writer.close()
+
+
+if __name__ == '__main__':
+    gen_tfrecords(tfpath=tfpath)
diff --git a/tools/atomic/model_info.md b/tools/atomic/model_info.md
new file mode 100644
index 00000000..a14533cc
--- /dev/null
+++ b/tools/atomic/model_info.md
@@ -0,0 +1,19 @@
+
+### Business domain / scenario
+Atomic-operation benchmarking
+
+### Model framework
+TF1.15.0 / TF2.6.5
+
+### Usage
+#### Generate the dataset
+python3 gen_mt_data_0to1e.py 5 (the argument 5 means a repetition ratio of 50%)
+The data is written under /home/insert/ by default.
+
+#### Run the test
+sparse.sh must first be adapted to the actual environment.
+To test sparse lookup:
+./sparse.sh 8 (NPU count) sparse_lookup.py 8 (embedding size) 5 (repetition ratio) 1 0 0
+
+
+
diff --git a/tools/atomic/sparse.sh b/tools/atomic/sparse.sh
new file mode 100644
index 00000000..56968da1
--- /dev/null
+++ b/tools/atomic/sparse.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+local_rank_size=$1
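+# Positional arguments, inferred from the assignments below and from model_info.md:
+#   $1 local_rank_size (NPU count)   $2 python script    $3 embedding dim
+#   $4 repetition ratio (in tenths)  $5 all2all switch   $6 "pre" flag   $7 sleep seconds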
+host=localhost
+py=$2
+my_dim=$3
+chongfudu=$4
+all2all=$5
+pre=$6
+slp=$7
+rm -rf /root/atc_data/*
+rm -rf /root/ascend/*
+rm -rf kernel_meta_*
+
+
+export ALL2ALL=$5
+export HOST_PIPELINE_OPS_LIB_PATH=/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/libasc/libasc_ops.so
+export EMPTY_TENSOR=1
+export ENABLE_RUNTIME_V2=0
+mpi_path=/usr/local/openmpi/bin/
+so_path=/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/libasc/
+interface="enp61s0f0"
+ulimit -c 0
+export ASCEND_GLOBAL_LOG_LEVEL=0
+export TF_CPP_MIN_LOG_LEVEL=3
+export ASCEND_INSTALL_PATH=/usr/local/Ascend/latest/
+export ASCEND_HOME_PATH=${ASCEND_INSTALL_PATH}
+export ASCEND_LATEST_INSTALL_PATH=/usr/local/Ascend
+#export ASCEND_HOME_PATH=${ASCEND_INSTALL_PATH}/
+CANN_BIN_PATH=${ASCEND_HOME_PATH}/bin:${ASCEND_HOME_PATH}/compiler/ccec_compiler/bin
+CANN_PYTHONPATH=${ASCEND_HOME_PATH}/python/site-packages:${ASCEND_HOME_PATH}/opp/op_impl/built-in/ai_core/tbe #:${ASCEND_INSTALL_PATH}/tfplugin/latest/python/site-packages
+PYTHON_BIN_PATH=/usr/local/python3.7.5/bin/
+export PATH=${mpi_path}/bin:${PYTHON_BIN_PATH}:${CANN_BIN_PATH}:$PATH
+export PYTHONPATH=${PYTHONPATH}:/usr/local/Ascend/latest/python/site-packages:${so_path}:${CANN_PYTHONPATH}
+export LD_PRELOAD=/lib64/libgomp.so.1
+CANN_LD_PATH=${ASCEND_HOME_PATH}/runtime/lib64:${ASCEND_HOME_PATH}/fwkacllib/lib64:${ASCEND_HOME_PATH}/lib64:${ASCEND_HOME_PATH}/lib64/plugin/opskernel:${ASCEND_HOME_PATH}/lib64/plugin/nnengine
+export LD_LIBRARY_PATH=${so_path}:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/libasc/:/home/insert/src/platform/securec/lib/:${CANN_LD_PATH}:/home/opensource/opensource/hdf5/lib:/usr/local/lib:/usr/local/python3.7.5/lib:$LD_LIBRARY_PATH
+export ASCEND_AICPU_PATH=${ASCEND_HOME_PATH}
+export ASCEND_OPP_PATH=${ASCEND_HOME_PATH}/opp
+export TOOLCHAIN_HOME=${ASCEND_HOME_PATH}/toolkit
+
+export BETTER_EXCEPTIONS=1
+mpi_args='-x BIND_INFO="0:48 48:48 96:48" -x SPDLOG_LEVEL=debug -bind-to none'
+# remove logs
+rm -f *txt >/dev/null
+rm -rf /root/ascend/log/*
+
+# remove shared-memory segments left over from earlier runs
+for i in $(ipcs -m | tail -n +4 | awk '{print $2}'); do
+    ipcrm -m $i
+done
+
+num_process=${local_rank_size}
+host_string=${host//_/:${local_rank_size},node}:${local_rank_size}
+echo run in $host_string
+
+interface="lo"
+
+#python3.7 -c "import tensorflow;print(tensorflow.__path__)"
+horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \
+    python3.7 ${py} --local_rank_size ${local_rank_size} --hccl_json hccl_json_${local_rank_size}p.json --my_dim ${my_dim} --chongfudu $chongfudu --pre $pre --slp $slp | tee temp_${my_dim}_${chongfudu}_${ALL2ALL}_${pre}_${slp}.log
diff --git a/tools/atomic/sparse_lookup.py b/tools/atomic/sparse_lookup.py
new file mode 100644
index 00000000..570c683e
--- /dev/null
+++ b/tools/atomic/sparse_lookup.py
@@ -0,0 +1,266 @@
+import os
+import sys
+import time
+import argparse
+import numpy as np
+import tensorflow as tf
+from mpi4py import MPI  # must be imported before emb_cache and after SparseOps
+import psutil
+from sklearn.metrics import roc_auc_score
+
+from tensorflow.python.ops import math_ops
+from tensorflow.python.framework import ops
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+from npu_bridge.hccl import hccl_ops
+from npu_bridge.estimator import npu_ops
+
+from mx_rec.graph.modifier import modify_graph_and_start_emb_cache
+from mx_rec.core.asc.manager import start_asc_pipeline
+from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func
+from mx_rec.util.initialize import get_rank_size, init, clear_channel, get_rank_id, set_if_load, \
+    terminate_config_initializer
+from mx_rec.constants.constants import MxRecMode
+from mx_rec.core.embedding import create_table, sparse_lookup
+from mx_rec.util.initialize import get_ascend_global_hashtable_collection
+
+from sparse_ops.config import set_ascend_env
+
+USE_PIPELINE_TEST = False
+USE_STATIC = False
+USE_HOT = False
+USE_EXPANSION = False
+
+from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET
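+# Forward-only benchmark: the fetched op below is just one element of the lookup
+# result, so no gradients or optimizer are built; sparse.sh launches one copy per
+# rank (see sparse_lookup_with_grad.py for the variant with a backward pass).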
+
+
+class WideDeep:
+    def __init__(self, input_data, feature_spec_list, hashtable):
+        self.lbl_hldr = input_data["global_labels"][0]
+        self.input_data = input_data
+        self.feature_spec_list = feature_spec_list
+        self.hash_table_list = hashtable
+        self.forward()
+
+    def forward(self):
+        for feature, hash_table in zip(self.feature_spec_list, self.hash_table_list):
+            self.embedding = sparse_lookup(hash_table, feature, 1024 * 1024 // rank_size, dim=None, is_train=True,
+                                           name="merged_embedding_lookup", modify_graph=False, batch=self.input_data)
+
+        # with tf.control_dependencies([self.embedding]):
+        self.op = self.embedding[0][0]
+        return self.op
+
+
+def input_fn_tfrecord(feature_spec_list, rank_id, local_rank_id, rank_size, data_path, file_pattern, total_batch_size,
+                      num_epochs=1, perform_shuffle=False, training=True):
+    line_per_sample = 1024 * 8
+    total_batch_size = int(total_batch_size / line_per_sample)
+    num_parallel = 8
+
+    def extract_fn(data_record):
+        features = {
+            'label': tf.FixedLenFeature(shape=(line_per_sample,), dtype=tf.float32),
+            'feat_ids': tf.FixedLenFeature(shape=(128 * line_per_sample,), dtype=tf.int64)
+        }
+        sample = tf.parse_single_example(data_record, features)
+        return sample
+
+    def reshape_fn(batch):
+        batch['label'] = tf.reshape(batch['label'], [-1, ])
+        batch['feat_ids'] = tf.reshape(batch['feat_ids'], [-1, 128])
+        return batch
+
+    all_files = os.listdir(data_path)
+    files = [os.path.join(data_path, f) for f in all_files if f.startswith(file_pattern)]
+    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=num_parallel)
+    batch_size = total_batch_size // rank_size
+    dataset = dataset.shard(rank_size, rank_id)
+    dataset = dataset.repeat(num_epochs)
+    dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size,
+                                                                             drop_remainder=True)
+    dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel)
+    insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, is_training=True, dump_graph=False)
+    dataset = dataset.map(insert_fn)
+
+    dataset = dataset.prefetch(int(100))
+    return dataset
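+# Note: each serialized Example above packs line_per_sample (8192) logical rows — a
+# flat float label vector plus 128 int64 feature ids per row — which reshape_fn
+# un-flattens again before the asc insert function is mapped over the batch.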
custom_op.name = "NpuOptimizer" + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes('must_keep_origin_dtype') + sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + custom_op.parameter_map["enable_data_pre_proc"].b = True + sess_config.gpu_options.allow_growth = True + custom_op.parameter_map["hcom_parallel"].b = False + custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:pairwise") + + custom_op.parameter_map["iterations_per_loop"].i = 10 + # custom_op.parameter_map["enable_dump"].b = True + # custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("./dump") + # custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes("11|12") + # custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all") + # custom_op.parameter_map["op_debug_level"].i = 0 + custom_op.parameter_map["op_wait_timeout"].i = 500 + custom_op.parameter_map["op_execute_timeout"].i = 500 + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") + custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(30000000000)) + custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes(str(30000000000)) + # custom_op.parameter_map["profiling_mode"].b = True + # custom_op.parameter_map["profiling_options"].s = tf.compat.as_bytes( + # '{"output":"/home","training_trace":"on","task_trace":"on","fp_point":"","bp_point":"","aicpu":"on","aic_metrics":"PipeUtilization"}') + + global_start_time = time.time() + tf.set_random_seed(10086) + np.random.seed(10086) + + my_dim = int(args.my_dim) + print("my_dim=", my_dim) + + hot_zhanbi = args.chongfudu + hot_zhanbi = float(hot_zhanbi) / 10 + + # if hot_zhanbi == 0: + # hot_zhanbi = int(hot_zhanbi) + + config = { + "data_path": "./data1/data" + str(hot_zhanbi) + "_" + str(float(args.new_key)) + "/", + "train_file_pattern": "tf", + "test_file_pattern": "test", + "batch_size": 1024 * 8, + "field_num": 128, + "send_count": 1024 * 1024 // rank_size, # 65536 * 10 > 39(field num) * 16000(bz) + "id_emb_dim": my_dim, + "ext_emb_vec_size": my_dim, + "train_epoch": 1, + "dev_vocab_size": 100000001 + } + + # model run parameter + print_steps = 300 + evaluate_stride = 80000 # eval every 200 steps + eval_steps = -1 # 8 ranks 34 + stop_steps = 95 + # Hybrid step1.1: init cache + emb_name = "wide_deep_emb" + + dev_vocab_size = config["dev_vocab_size"] # 23120 + host_vocab_size = 0 + + init(True, rank_id=rank_id, rank_size=local_rank_size, train_interval=100, eval_steps=-1, + prefetch_batch_number=1, use_dynamic=0, use_hot=1, use_dynamic_expansion=0) + + tf.disable_eager_execution() + ###################################### + feature_spec_list = [ + FeatureSpec("feat_ids", feat_count=128, table_name="merged_sparse_embeddings", batch_size=config["batch_size"])] + with tf.device('/cpu:0'): + train_dataset = input_fn_tfrecord(feature_spec_list=feature_spec_list, + rank_id=rank_id, + local_rank_id=local_rank_id, + rank_size=rank_size, + data_path=config["data_path"], + file_pattern=config["train_file_pattern"], + total_batch_size=int(rank_size * config["batch_size"]), + perform_shuffle=(not USE_PIPELINE_TEST), + num_epochs=config["train_epoch"]) + train_iterator = train_dataset.make_initializable_iterator() + train_next_iter = train_iterator.get_next() + + train_input_data = {"global_labels": train_next_iter["label"], + "feat_ids": train_next_iter["feat_ids"], + } + + sparse_hashtable = create_table(key_dtype=tf.int64, + dim=tf.TensorShape([my_dim]), + 
name="merged_sparse_embeddings", + emb_initializer=tf.variance_scaling_initializer(mode="fan_avg", + distribution='normal', seed=0), + device_vocabulary_size=dev_vocab_size * local_rank_size, + mode=MxRecMode.mapping("ASC")) + + model = WideDeep(train_input_data, feature_spec_list, [sparse_hashtable]) + MODIFY_GRAPH_FLAG = False + if MODIFY_GRAPH_FLAG: + modify_graph_and_start_emb_cache(dump_graph=False) + else: + start_asc_pipeline() + + with tf.Session(config=sess_config) as sess: + sess.run(tf.global_variables_initializer()) + sess.run([train_iterator.initializer]) + # build model + print("start build wdl(single domain) model") + print("=========start============") + # start run loop + total_start_time = time.time() + current_steps = 0 + train_finished = False + time.sleep(int(args.slp)) + while not train_finished: + try: + current_steps += 1 + print("current step =", current_steps) + # + run_dict = { + "adam": model.op, + "lbl_hldr": model.lbl_hldr, + } + if current_steps == 1: + total_start_time = time.time() + start_time = time.time() + print("start sess run") + results = sess.run(fetches=run_dict) + print("start sess run 1") + end_time = time.time() + print(f"current_steps: {current_steps} ,step time:{(end_time - start_time) * 1000}") + if current_steps <= 5: + total_start_time = time.time() + if current_steps % print_steps == 0: + print("----------" * 10) + try: + print( + f"current_steps: {current_steps} ,deep_loss:{results['deep_loss']}," + f"e2etime per step:{(end_time - start_time) * 1000}") + except KeyError: + print(f"current_steps: {current_steps}") + print("----------" * 10) + + if current_steps >= stop_steps: + train_finished = True + # + except tf.errors.OutOfRangeError: + train_finished = True + + # train_finished + # emb_cache.destroy() + # MPI.Finalize() + print( + f"training {current_steps} steps, consume time: {(time.time() - total_start_time) / (current_steps - 5) * 1000} ") + + terminate_config_initializer() + # emb_cache.destroy() + # MPI.Finalize() diff --git a/tools/atomic/sparse_lookup_with_grad.py b/tools/atomic/sparse_lookup_with_grad.py new file mode 100644 index 00000000..3d7d37e5 --- /dev/null +++ b/tools/atomic/sparse_lookup_with_grad.py @@ -0,0 +1,277 @@ +import os +import sys +import time +import argparse +import numpy as np +import tensorflow as tf +from mpi4py import MPI # must before emb_cache after SparseOps +import psutil +import sys +from sklearn.metrics import roc_auc_score + +from tensorflow.python.ops import math_ops +from tensorflow.python.framework import ops +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig +from npu_bridge.hccl import hccl_ops +from npu_bridge.estimator import npu_ops + +from mx_rec.graph.modifier import modify_graph_and_start_emb_cache +from mx_rec.core.asc.manager import start_asc_pipeline +from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func +from mx_rec.util.initialize import get_rank_size, init, clear_channel, get_rank_id, set_if_load, \ + terminate_config_initializer +from mx_rec.constants.constants import MxRecMode +from mx_rec.core.embedding import create_table, sparse_lookup +from mx_rec.util.initialize import get_ascend_global_hashtable_collection +from mx_rec.optimizers.lazy_adam import CustomizedLazyAdam +from sparse_ops.config import set_ascend_env + +USE_PIPELINE_TEST = False +USE_STATIC = False +USE_HOT = False +USE_EXPANSION = False + + +def create_hash_optimizer(): + return CustomizedLazyAdam() + + +def get_sparse_optimizer(): + sparse_optimizer = 
+
+
+class WideDeep:
+    def __init__(self, input_data, feature_spec_list, hashtable):
+        self.lbl_hldr = input_data["global_labels"][0]
+        self.input_data = input_data
+        self.feature_spec_list = feature_spec_list
+        self.hash_table_list = hashtable
+        self.forward()
+
+    def forward(self):
+        for feature, hash_table in zip(self.feature_spec_list, self.hash_table_list):
+            self.embedding = sparse_lookup(hash_table, feature, 1024 * 1024 // rank_size, dim=None, is_train=True,
+                                           name="merged_embedding_lookup", modify_graph=False, batch=self.input_data)
+        self.loss = tf.reduce_mean(self.embedding, axis=0)
+        with tf.control_dependencies([self.loss]):
+            self.op = tf.no_op()
+        return self.op
+
+
+def input_fn_tfrecord(feature_spec_list, rank_id, local_rank_id, rank_size, data_path, file_pattern, total_batch_size,
+                      num_epochs=1, perform_shuffle=False, training=True):
+    line_per_sample = 1024 * 8
+    total_batch_size = int(total_batch_size / line_per_sample)
+    num_parallel = 8
+
+    def extract_fn(data_record):
+        features = {
+            'label': tf.FixedLenFeature(shape=(line_per_sample,), dtype=tf.float32),
+            'feat_ids': tf.FixedLenFeature(shape=(128 * line_per_sample,), dtype=tf.int64)
+        }
+        sample = tf.parse_single_example(data_record, features)
+        return sample
+
+    def reshape_fn(batch):
+        batch['label'] = tf.reshape(batch['label'], [-1, ])
+        batch['feat_ids'] = tf.reshape(batch['feat_ids'], [-1, 128])
+        return batch
+
+    all_files = os.listdir(data_path)
+    files = [os.path.join(data_path, f) for f in all_files if f.startswith(file_pattern)]
+    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=num_parallel)
+    batch_size = total_batch_size // rank_size
+    dataset = dataset.shard(rank_size, rank_id)
+    dataset = dataset.repeat(num_epochs)
+    dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size,
+                                                                             drop_remainder=True)
+    dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel)
+    insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, is_training=True, dump_graph=False)
+    dataset = dataset.map(insert_fn)
+    dataset = dataset.prefetch(int(100))
+    return dataset
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='base')
+    parser.add_argument('--local_rank_size')
+    parser.add_argument('--hosts')
+    parser.add_argument('--hccl_json')
+    parser.add_argument('--my_dim')
+    parser.add_argument('--chongfudu')
+    parser.add_argument('--new_key')
+    parser.add_argument('--slp')
+    args = parser.parse_args()
+    local_rank_size = int(args.local_rank_size)
+    comm = MPI.COMM_WORLD
+    rank_id = comm.Get_rank()
+    rank_size = comm.Get_size()
+    print(f"rank {rank_id}/{rank_size}")
+    local_rank_id = rank_id % local_rank_size
+    set_ascend_env(rank_id, rank_size, local_rank_size, host=args.hosts, file=args.hccl_json)
+
+    # create session
+    sess_config = tf.ConfigProto()
+    custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
+    custom_op.name = "NpuOptimizer"
+    custom_op.parameter_map["use_off_line"].b = True
+    custom_op.parameter_map["mix_compile_mode"].b = True
+    custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes('must_keep_origin_dtype')
+    sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+    custom_op.parameter_map["enable_data_pre_proc"].b = True
+    sess_config.gpu_options.allow_growth = True
+    custom_op.parameter_map["hcom_parallel"].b = False
+    custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:pairwise")
+
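+    # The dump switches below write per-op tensors for steps 1 and 2 under ./dump;
+    # useful for accuracy comparison, but expect a large slowdown in performance runs.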
custom_op.parameter_map["iterations_per_loop"].i = 5 + custom_op.parameter_map["enable_dump"].b = True + custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes("./dump") + custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes("1|2") + custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all") + custom_op.parameter_map["op_wait_timeout"].i = 500 + custom_op.parameter_map["op_execute_timeout"].i = 500 + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") + custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(30000000000)) + custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes(str(30000000000)) + + global_start_time = time.time() + tf.set_random_seed(10086) + np.random.seed(10086) + + my_dim = int(args.my_dim) + print("my_dim=", my_dim) + + hot_zhanbi = args.chongfudu + hot_zhanbi = float(hot_zhanbi) / 10 + + # if hot_zhanbi == 0: + # hot_zhanbi = int(hot_zhanbi) + + config = { + "data_path": "./data1/data" + str(hot_zhanbi) + "_" + str(float(args.new_key)) + "/", + "train_file_pattern": "tf", + "test_file_pattern": "test", + "batch_size": 1024 * 8, + "field_num": 128, + "send_count": 1024 * 1024 // rank_size, # 65536 * 10 > 39(field num) * 16000(bz) + "id_emb_dim": my_dim, + "ext_emb_vec_size": my_dim, + "train_epoch": 1, + "dev_vocab_size": 5000001 + } + + # model run parameter + print_steps = 300 + evaluate_stride = 80000 # eval every 200 steps + eval_steps = -1 # 8 ranks 34 + stop_steps = 5 + # Hybrid step1.1: init cache + emb_name = "wide_deep_emb" + + dev_vocab_size = config["dev_vocab_size"] # 23120 + host_vocab_size = 0 + + init(True, rank_id=rank_id, rank_size=local_rank_size, train_interval=100, eval_steps=-1, + prefetch_batch_number=1, use_dynamic=0, use_hot=1, use_dynamic_expansion=0) + + tf.disable_eager_execution() + ###################################### + feature_spec_list = [ + FeatureSpec("feat_ids", feat_count=128, table_name="merged_sparse_embeddings", batch_size=config["batch_size"])] + with tf.device('/cpu:0'): + train_dataset = input_fn_tfrecord(feature_spec_list=feature_spec_list, + rank_id=rank_id, + local_rank_id=local_rank_id, + rank_size=rank_size, + data_path=config["data_path"], + file_pattern=config["train_file_pattern"], + total_batch_size=int(rank_size * config["batch_size"]), + perform_shuffle=(not USE_PIPELINE_TEST), + num_epochs=config["train_epoch"]) + train_iterator = train_dataset.make_initializable_iterator() + train_next_iter = train_iterator.get_next() + + train_input_data = {"global_labels": train_next_iter["label"], + "feat_ids": train_next_iter["feat_ids"], + } + + sparse_optimizer_list = get_sparse_optimizer() + + sparse_hashtable = create_table(key_dtype=tf.int64, + dim=tf.TensorShape([my_dim]), + name="merged_sparse_embeddings", + emb_initializer=tf.variance_scaling_initializer(mode="fan_avg", + distribution='normal', seed=0), + device_vocabulary_size=dev_vocab_size * local_rank_size, + optimizer_list=sparse_optimizer_list, + mode=MxRecMode.mapping("ASC")) + + sparse_variables = tf.compat.v1.get_collection(get_ascend_global_hashtable_collection()) + model = WideDeep(train_input_data, feature_spec_list, [sparse_hashtable]) + + train_ops = [] + for loss, sparse_optimizer in zip([model.loss], [sparse_optimizer_list]): + sparse_grads = tf.gradients(loss, sparse_variables) + grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)] + train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars)) + + 
+    MODIFY_GRAPH_FLAG = False
+    if MODIFY_GRAPH_FLAG:
+        modify_graph_and_start_emb_cache(dump_graph=False)
+    else:
+        start_asc_pipeline()
+
+    with tf.Session(config=sess_config) as sess:
+        sess.run(tf.global_variables_initializer())
+        sess.run([train_iterator.initializer])
+        # build model
+        print("start build wdl(single domain) model")
+        print("=========start============")
+        # start run loop
+        total_start_time = time.time()
+        current_steps = 0
+        train_finished = False
+        time.sleep(int(args.slp))
+        while not train_finished:
+            try:
+                current_steps += 1
+                print("current step =", current_steps)
+                run_dict = {
+                    "loss": model.op,
+                    "adam": train_ops,
+                    "lbl_hldr": model.lbl_hldr,
+                }
+                if current_steps == 1:
+                    total_start_time = time.time()
+                start_time = time.time()
+                print("start sess run")
+                results = sess.run(fetches=run_dict)
+                print("start sess run 1")
+                end_time = time.time()
+                print(f"current_steps: {current_steps} ,step time:{(end_time - start_time) * 1000}")
+                if current_steps <= 5:
+                    total_start_time = time.time()
+                if current_steps % print_steps == 0:
+                    print("----------" * 10)
+                    try:
+                        print(
+                            f"current_steps: {current_steps} ,deep_loss:{results['deep_loss']},"
+                            f"e2etime per step:{(end_time - start_time) * 1000}")
+                    except KeyError:
+                        print(f"current_steps: {current_steps}")
+                    print("----------" * 10)
+
+                if current_steps >= stop_steps:
+                    train_finished = True
+
+            except tf.errors.OutOfRangeError:
+                train_finished = True
+
+        # training finished; the first five warm-up steps are excluded from the average
+        print(
+            f"training {current_steps} steps, consume time: {(time.time() - total_start_time) / (current_steps - 5) * 1000} ")
+
+    terminate_config_initializer()
+    MPI.Finalize()
\ No newline at end of file
diff --git a/tools/atomic/sparse_ops/__init__.py b/tools/atomic/sparse_ops/__init__.py
new file mode 100644
index 00000000..53640a7e
--- /dev/null
+++ b/tools/atomic/sparse_ops/__init__.py
@@ -0,0 +1,7 @@
+"""
+init
+"""
+from __future__ import absolute_import
+from sparse_ops.config import get_path
+
+__all__ = ["get_path", ]
diff --git a/tools/atomic/sparse_ops/config.py b/tools/atomic/sparse_ops/config.py
new file mode 100644
index 00000000..f10d12fd
--- /dev/null
+++ b/tools/atomic/sparse_ops/config.py
@@ -0,0 +1,111 @@
+"""
+Configuration helpers.
+"""
+from __future__ import absolute_import
+import os
+import json
+import psutil
+
+
+def get_path():
+    """
+    Return the directory containing this file.
+    """
+    return os.path.dirname(__file__)
+
+
+def gen_config(server_str, local_rank_size, path=None):
+    """
+    Generate the HCCL rank-table configuration.
+    """
+
+    def _device(local_rank_id, rank_id, server_id):
+        return {
+            "device_id": f"{local_rank_id}",
+            "device_ip": f'192.{local_rank_id % 4}.{server_id}.{1 + local_rank_id // 4}',
+            "rank_id": f"{rank_id}"
+        }
+
+    def _server(server_id):
+        return {
+            "device": [],
+            "server_id": f"90.91.141.{server_id}"
+        }
+
+    conf = {
+        "server_count": "-1",
+        "server_list": [],
+        "status": "completed",
+        "version": "1.0"
+    }
+    rank_id = 0
+    servers = str(server_str).split('_')
+    conf['server_count'] = str(len(servers))
+    for server in servers:
+        srv = _server(server)
+        for local_rank_id in range(local_rank_size):
+            dev = _device(local_rank_id, rank_id, server)
+            rank_id = rank_id + 1
+            srv["device"].append(dev)
+        conf['server_list'].append(srv)
+
+    conf_str = json.dumps(conf)
+    if path is None:
+        path = '/tmp/hccl.json'
+    with open(path, 'w') as file_handle:
+        file_handle.write(conf_str)
+
+
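+# Example of the JSON gen_config writes for server_str "0" and local_rank_size 2
+# (device_ip and server_id follow the synthetic patterns hard-coded above):
+# {"server_count": "1",
+#  "server_list": [{"device": [{"device_id": "0", "device_ip": "192.0.0.1", "rank_id": "0"},
+#                              {"device_id": "1", "device_ip": "192.1.0.1", "rank_id": "1"}],
+#                   "server_id": "90.91.141.0"}],
+#  "status": "completed", "version": "1.0"}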
+def set_ascend_env(rank, rank_size, local_rank_size, host, file=None, dev_id=-1, dev_index=-1):
+    """
+    Set the Ascend-related parameters and environment variables; generate the HCCL config.
+    """
+    rank = str(rank)
+    rank_size = str(rank_size)
+    local_rank_size = int(local_rank_size)
+    host = str(host)
+
+    os.environ["MOX_USE_NPU"] = "1"
+    os.environ["FUSION_TENSOR_SIZE"] = "2000000000"
+    os.environ["MOX_USE_TF_ESTIMATOR"] = "0"
+    os.environ["MOX_USE_TDT"] = "1"
+    os.environ["HEARTBEAT"] = "1"
+    os.environ["CONITNUE_TRAIN"] = "true"
+
+    os.environ["RANK_ID"] = rank
+    local_rank_id = int(rank) % int(local_rank_size)
+    if dev_id != -1:
+        os.environ["DEVICE_ID"] = str(dev_id)
+        os.environ["ASCEND_DEVICE_ID"] = str(dev_id)
+    else:
+        os.environ["DEVICE_ID"] = str(local_rank_id)
+        os.environ["ASCEND_DEVICE_ID"] = str(local_rank_id)
+    if dev_index != -1:
+        os.environ["DEVICE_INDEX"] = str(dev_index)
+    else:
+        os.environ["DEVICE_INDEX"] = str(local_rank_id)
+
+    os.environ["RANK_SIZE"] = rank_size
+    if file:
+        os.environ["RANK_TABLE_FILE"] = file
+    else:
+        gen_config(host, local_rank_size)
+        os.environ["RANK_TABLE_FILE"] = "/tmp/hccl.json"
+    os.environ["HCCL_CONNECT_TIMEOUT"] = "600"
+
+    os.environ["JOB_ID"] = "10086"
+    os.environ["SOC_VERSION"] = "Ascend910"
+    os.environ["GE_AICPU_FLAG"] = "1"
+    os.environ["NEW_GE_FE_ID"] = "1"
+    os.environ["EXPERIMENTAL_DYNAMIC_PARTITION"] = "1"
+    os.environ["ENABLE_FORCE_V2_CONTROL"] = "1"
+
+
+def bind_cpu():
+    p = psutil.Process()
+    try:
+        bind_start = 48
+        bind_count = 96
+        p.cpu_affinity([bind_start + x for x in range(bind_count)])
+    except IndexError:
+        print("error cpu bind info, skipped.")
diff --git a/tools/atomic/sparse_ops/ops.py b/tools/atomic/sparse_ops/ops.py
new file mode 100644
index 00000000..35fe2462
--- /dev/null
+++ b/tools/atomic/sparse_ops/ops.py
@@ -0,0 +1,133 @@
+"""
+Sparse ops.
+"""
+from __future__ import absolute_import
+import tensorflow as tf
+from npu_bridge.hccl import hccl_ops
+from sparse_ops import utils
+from mpi4py import MPI
+
+MPI.Init_thread(MPI.THREAD_MULTIPLE)  # must run before emb_cache
+utils.init = True
+
+
+class SparseOps:
+    """
+    Embedding-related communication interfaces.
+    """
+
+    def __init__(self, fallback=False):
+        # context
+        self.fallback = fallback
+        self.all2all = hccl_ops.all_to_all_v
+
+    def get_a2a_args(self, lookup_vec_size, mini_bs_w_field, rank_size, send_count, emb_vec_size):
+        """
+        Build the all2all arguments (counts and displacements).
+        """
+        if self.fallback:
+            send_count = tf.cond(lookup_vec_size > send_count * rank_size,
+                                 lambda: mini_bs_w_field // rank_size,
+                                 lambda: send_count)
+        all2all_args = {
+            "sc": tf.cast([send_count * emb_vec_size] * rank_size, tf.int64),
+            "ss": tf.cast([send_count * emb_vec_size * i for i in range(rank_size)], tf.int64)}
+        all2all_args['rc'] = all2all_args['sc']
+        all2all_args['rs'] = all2all_args['ss']
+        return all2all_args, send_count * rank_size
+
+    def forward_alltoall(self, all2all_args, restore_vec, hot_pos, emb_vec, emb_vec_size):
+        """
+        Forward communication of the embeddings.
+        all2all_args: arguments used by all2all
+        restore_vec: restore (gather-back) indices
+        emb_vec: input embeddings
+        """
+        emb_vec = tf.reshape(emb_vec, [-1])
+
+        result = self.all2all(send_data=emb_vec,
+                              send_counts=all2all_args['sc'],
+                              send_displacements=all2all_args['ss'],
+                              recv_counts=all2all_args['rc'],
+                              recv_displacements=all2all_args['rs']
+                              )
+
+        result = tf.reshape(result,
+                            [-1, emb_vec_size],
+                            name="after_all2all_reshape")
+        if hot_pos is not None:
+            result = tf.concat([tf.gather(result, hot_pos, name="hot_pos"), result], axis=0)
+
+        output = tf.gather(result, restore_vec)
+        return output
+
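+    # Buffer layout (our reading of get_a2a_args, not stated in the original sources):
+    # with R ranks, each rank's flat send buffer is [to_rank0 | to_rank1 | ... |
+    # to_rank(R-1)], each slice send_count * emb_vec_size floats, so the
+    # displacements are simply the prefix sums of the counts.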
+    def forward_alltoallc(self, all2all_args, restore_vec, emb_vec, emb_vec_size, rank):
+        """
+        Forward communication of the embeddings (all_to_all_v_c variant).
+        all2all_args: send-count matrix used by all2all
+        restore_vec: restore (gather-back) indices
+        emb_vec: input embeddings
+        """
+        emb_vec = tf.reshape(emb_vec, [-1])
+
+        result = hccl_ops.all_to_all_v_c(send_data=emb_vec,
+                                         send_count_matrix=all2all_args,
+                                         rank=rank
+                                         )
+
+        result = tf.reshape(result,
+                            [-1, emb_vec_size],
+                            name="after_all2all_reshape")
+        output = tf.gather(result, restore_vec)
+        return output
+
+    def backward_alltoall(self, emb_grad, hot_pos, segment_ids, num_segments, all2all_args):
+        """
+        Backward communication of the embedding gradients.
+        emb_grad: raw gradients
+        segment_ids: restore indices
+        num_segments: length after deduplication
+        """
+        # unique_local_grad 2-node shape 37755, same as the rc total and num_segments
+        # unique_local_grad shape is [40052, 80]
+        if hot_pos is not None:
+            unique_local_grad = tf.math.unsorted_segment_sum(emb_grad,
+                                                             segment_ids=segment_ids,
+                                                             num_segments=num_segments + tf.shape(hot_pos)[0],
+                                                             name="backward_combine")
+            hot, cold = tf.split(unique_local_grad,
+                                 [tf.shape(hot_pos)[0], tf.shape(unique_local_grad)[0] - tf.shape(hot_pos)[0]], axis=0)
+            unique_local_grad = tf.tensor_scatter_nd_update(cold, tf.expand_dims(hot_pos, 1), hot)
+        else:
+            unique_local_grad = tf.math.unsorted_segment_sum(emb_grad,
+                                                             segment_ids=segment_ids,
+                                                             num_segments=num_segments, name="backward_combine")
+
+        unique_grad = self.all2all(send_data=unique_local_grad,
+                                   send_counts=all2all_args['rc'],
+                                   send_displacements=all2all_args['rs'],
+                                   recv_counts=all2all_args['sc'],
+                                   recv_displacements=all2all_args['ss']
+                                   )
+        return unique_grad
+
+    def backward_alltoallc(self, emb_grad, segment_ids, num_segments, all2all_args, rank):
+        """
+        Backward communication of the embedding gradients (all_to_all_v_c variant).
+        emb_grad: raw gradients
+        segment_ids: restore indices
+        num_segments: length after deduplication
+        """
+        unique_local_grad = tf.math.unsorted_segment_sum(emb_grad,
+                                                         segment_ids=segment_ids,
+                                                         num_segments=num_segments, name="backward_combine")
+        # unique_local_grad 2-node shape 37755, same as the rc total and num_segments
+        # unique_local_grad shape is [40052, 80]
+        unique_local_grad = tf.reshape(unique_local_grad, [-1])
+
+        # transpose the count matrix so the backward exchange reverses the forward one
+        all2all_args = tf.transpose(all2all_args)
+        unique_grad = hccl_ops.all_to_all_v_c(send_data=unique_local_grad,
+                                              send_count_matrix=all2all_args,
+                                              rank=rank
+                                              )
+        return unique_grad
diff --git a/tools/atomic/sparse_ops/utils.py b/tools/atomic/sparse_ops/utils.py
new file mode 100644
index 00000000..07cf796d
--- /dev/null
+++ b/tools/atomic/sparse_ops/utils.py
@@ -0,0 +1,23 @@
+"""
+utils
+"""
+from __future__ import absolute_import
+import tensorflow as tf
+from mpi4py import rc
+
+tf.get_logger().setLevel("ERROR")
+rc.initialize = False  # if True, MPI_Init runs when "from mpi4py import MPI" is executed
+
+
+def ops():
+    """
+    Load and return the custom embedding ops library.
+    """
+    return tf.load_op_library("libcust_ops.so")
+
+
+def dataset_ops():
+    """
+    Load and return the custom asc dataset ops library.
+    """
+    return tf.load_op_library("libasc_dataset_ops.so")
diff --git "a/tools/atomic/\345\216\237\345\255\220\346\265\213\350\257\225\347\273\223\346\236\234-tf1.15-rec0630-cann530.xlsx" "b/tools/atomic/\345\216\237\345\255\220\346\265\213\350\257\225\347\273\223\346\236\234-tf1.15-rec0630-cann530.xlsx"
new file mode 100644
index 0000000000000000000000000000000000000000..195f0ed29faf77934d67422b1775a106b03db272
GIT binary patch
literal 32363
[base85-encoded binary payload of the .xlsx benchmark-results workbook omitted; truncated in the original excerpt]
z&hupLZ|AR2E(he=u>y}zH!nV!*=0;^!&LYcmTe~VKCm@3t27&K!=3?_g;X( zbXc7I($6~78&dN{as&_&Oh(Yc9-mQiLf8`RQRUXoV6LH8C+ zZ$KL7Dlw4ZMV7&Wweg~3PJE~kW@yCsHmr|UgDArs`l7_Vw$VNV$rWP`gqe<3p>nqg z^RvtN(RElqiNoesXZ<~E%3vChmg!EHiBF!;fgY#8zS#mCOwAB%VH70o+7P6{;b($- z$c4|>`)2WH_VMc86my>~pQ12p%hjqy@`46PWCCFztdo*>*h9*q!*|@B)&XR=Pp}|3h@$J zO(5PRUE$H#u}8;ZMIQ{pvSe=v!j4P{OcV&3Yb5I;8iWP6Ee4;R5%7q*&BMr&DEHyp{&bhw?B6EZM~J991T801DM}v z8=;UvB@YW@{p|CS(Z|?;~Fk8EZmR(q!v&b{+FRuUa!6+Q|%vtwZ@Z8J0aURz%EX&0l%GBrU_~KFBeDHh* zQm)SMqRRNpKgBY@Inrq$cicj#EHybS_QN6T2J3P@W7kM84sfXoqv?D_!>Q4NbR*+& z;TR9-GLGTO2y?EX3)^L!1pUh5eZI{r+YYI6O)ioa8?C& zPOtT11ijG1dEVKzzMWyFbv_PpgI;b(fO|d3#t*JWhI(4ox3eJF-2-DoQ$k(?JVl00 zi=Ma9?GM_vk<9fX$4p4KaO|^dN;%VDvUdEw>k-tjj_*sdYVC;}!RP**o3v?C(=pK9 zr+0bq0q#LIe~*3#>aup6$vMQi2K#J3h;`SG>?Vy%#V3MQP78j4za8_5Z++uuvOuTZ z23PEyhVZ_e*Dg2($=vbQGlDgI<#FcsyN=UT1lF!sLni#T9?2XRo2x5Xd{HF>CL6oB zc*oV$?@+)!n0MOY6L~&A4<@d`Hi=+uI=MSCFEc+nq}rv)bCJ6~BTs}1g<=j)$mb)l zx{@>8E36|}Z}l9gcSla0R%D~O2TurLvcSkoZij-%Z*z#tDVS_|=8!`{MQgHd%cW0O za^QI=dW-9e9qW8A42kp9Nr6{(fY|3xB0T`5gW$JKfP!<)lRMK(OCPoxoNs4V<9F>9 zmlkRe1Lz`_ICF7o_MhNY1P>pBwjq zcQO@V*_(dkf0nk8BS2?5`2B;{^G}gyC0IU{<4POL$^tmMC7zboN*r35UutONqlDc3 zKG-v~mkpj$P_~ed`p@m^s8%Ib!FvR6$Dk9l3I|ZA$B2%I<%uJ91*LQDd`Tr#%i~$1 z@Q`cmn0Sg_ul0uJ?h~?QIEzMP>sUjn3 zs9cf7xMcF$Ub_HTI6R`i5-RD{KISzzRrdJymcMFjT_?26`!yD7_#$Mca>gx?CIsZnyL@UHQkF}J8#dksozpmf~pDFEE$>T z$C)W1uUj$jgk>d36eP3I?vknADqItC$=9<{m9a$!bqmyO@GkN%y2^9dkw33<(V3D{ zk0(nlHJTfJuYLO6vBJcsG5w{HPPo#0gEh4;yl2^VJE^Il=5I#hcI9uofvFMvY-W%> zE;*j@0ISL3Xy9_9kQ!m{59_84wp{HJZ`SeI=XD~?XKBLk^Z6w-8Aqoc(5W^Fl4nEc z#d}Q~v@(+0i4$z`H?Jo3wI<${M0hFcpt5^B_ExO;NT{W)A;->SjkK=nCzP;-AK}ki z)tp{cR`<~uj@PGS_@JV6@tY3#^s3hD=FL(X%0n93r<9^;LC*u;ggqtH=R}hv^>1#p z*2$+|ax@pbdsuf{O1!pp(4MbYhDxRrb9yd`gev=973!(FP{&@n_py2!IE{n(L|JX9tIM?wp2<2B<$AACB=mB!CHzlzmH5z&@ zlc-(d)oWv1`U{NShTgpP9>Pqx6Wr}+>~PcXT0?20Z=53Y(X(goG)1#KpLa>t3b4_x z_*1fd&LerdR4Ac9tpG=wP8@1U6S%T90LW*0U9VJB=nDwe7e_Vd(jGj9Idm)iLLmbG zL_=?!mPnHD460$t=6H2W@swT7oo6t97#vM;_FyAQK zHJa31s)9D~COUdFVQti}Vwv|alyn{@%sAnA5NZ1xFMmSkGm;eN68AT**+zT8S~*^@ zpCLzn$9KDIaddjPy39HJWC>OQLP{3<8;Mvm?W9?x-=}GW<{BQ0{pZ4>)!#E5$5A>y= admit_threshold: + temp[key] = value + sorted_result = dict(sorted(temp.items(), key=lambda x: x[1], reverse=True)) + with open("key_count30.json", "w") as f: + json.dump(sorted_result, f, indent=4) + + print(sorted_result) + + +if __name__ == "__main__": + args = parser.parse_args() + static_key_count(args.file_path) + diff --git "a/tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md" "b/tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md" new file mode 100644 index 00000000..2cee54c6 --- /dev/null +++ "b/tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md" @@ -0,0 +1,21 @@ +## **特征准入准确性对比使用说明** + + +----------------- +### **工具简介**: + +mxRec开启特征准入后,进行准确性比较工具。当前支持模型保存格式SAVE_EASY=False。 + +### **环境依赖** + +该工具在tf1环境上进行测试,环境配置如下,供用户参考: + +> **tf1** +
+tensorflow == 1.15.0 / 1.15.4
+numpy == 1.21.6
+python == 3.7.5
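+
+As a rough picture of the counting step used in the workflow below (a minimal sketch, not the tool itself; the whitespace-separated key file and the threshold of 30 are assumptions):
+
+```python
+import json
+from collections import Counter
+
+def count_keys(key_file, admit_threshold=30):
+    # count how often each sparse key occurs, then keep keys at/above the threshold
+    counter = Counter()
+    with open(key_file) as f:
+        for line in f:
+            counter.update(line.split())
+    kept = {k: v for k, v in counter.items() if v >= admit_threshold}
+    return dict(sorted(kept.items(), key=lambda x: x[1], reverse=True))
+
+print(json.dumps(count_keys("train_keys.txt"), indent=4))
+```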
+
+### **Usage**:
+1) Pick a dataset and run static_key_count.py on it to see how often each key above the given threshold occurs (the counting step is illustrated by the sketch above).
+2) With admission enabled, run the get_hist.py tool on the saved HisRecord to see how often each key occurred during the run.
diff --git a/tools/model_convert/README.md b/tools/model_convert/README.md
new file mode 100644
index 00000000..945b2329
--- /dev/null
+++ b/tools/model_convert/README.md
@@ -0,0 +1,119 @@
+## **Model Conversion Tool Guide**
+
+
+-----------------
+### **Tool overview**:
+
+Converts the NPU-format sparse tables saved from mxRec+NPU training into a format that GPU and CPU jobs can load.
+
+### **Environment dependencies**
+
+The tool was tested in TF1 and TF2 environments; both configurations are listed below for reference:
+
+> **tf1**
+
+tensorflow == 1.15.0 / 1.15.4
+numpy == 1.21.6
+python == 3.7.5
+ + +> **tf2** +
+tensorflow == 2.6.5
+numpy == 1.19.5
+python == 3.7.5
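+
+A quick way to confirm which of the two environments is active before running the tool (a minimal sketch):
+
+```python
+import sys
+
+import numpy as np
+import tensorflow as tf
+
+print("python:", sys.version.split()[0])
+print("numpy:", np.__version__)
+print("tensorflow:", tf.__version__)
+```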
+
+
+### **Usage**:
+
+
+
+**Example:**
+
+An example invocation:
+`python3 model_convert.py --input_path=./saved_model --output_path=./saved-model-out --rank_size=8 --estimator=1 --ddr=1`
+
+The log line `convert model success.` indicates that the model was converted successfully.
+
+**Parameters:**
+
+`input_path`: type `str`. Path where the NPU-format model was saved.
+
+    Notes:
+    1) In estimator mode, set the model save path during NPU+mxRec training to:
+    model_dir = {path}/{get_rank_id()}
+    For example:
+
+    from mx_rec.util.initialize import get_rank_id
+    model_dir = f"{params.model_ckpt_dir}/{get_rank_id()}"
+
+    When converting, pass {param.model_ckpt_dir} as input_path.
+
+    2) In session-run mode, set the model save path during NPU+mxRec training to:
+    path = {model_path}/model-{get_rank_id()}
+    For example:
+
+    from mx_rec.util.initialize import get_rank_id
+    self.saver.save(self.session, f"./saved-model/model-{self.rank_id}", global_step=i)
+
+    When converting, pass ./saved-model as input_path.
+
+`output_path`: type `str`. Output path for the converted CPU/GPU-format model. It can be any user-chosen path; if it does not exist, the directory is created.
+
+`rank_size`: type `int`. Number of devices used for NPU+mxRec training. Range: [1, 16]. (A quick way to infer this value is sketched after this parameter list.)
+
+`estimator`: type `int`. Whether TensorFlow estimator mode was used. Defaults to 0: 0 means estimator mode was not used, 1 means it was.
+
+    Estimator reference:
+    https://www.tensorflow.org/guide/estimator?hl=zh-cn
+
+`ddr`: type `int`. Whether mxRec's DDR mode was used. Defaults to 0: 0 means HBM mode, 1 means DDR mode.
+
+`dynamic_expansion`: type `int`. Whether training used dynamic expansion. Defaults to 0: 0 means dynamic expansion was not used, 1 means it was.
+For mxRec's DDR mode, see the mxRec User Guide.
+
+
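+
+In estimator mode the converter expects one numbered sub-directory per rank under `input_path` (see the `input_path` notes above). A quick way to sanity-check `--rank_size` before converting (a minimal sketch; `infer_rank_size` is an illustrative helper, not part of the tool):
+
+```python
+import os
+
+def infer_rank_size(input_path):
+    # estimator-mode layout: <input_path>/0, <input_path>/1, ... one directory per rank
+    ranks = [d for d in os.listdir(input_path)
+             if d.isdigit() and os.path.isdir(os.path.join(input_path, d))]
+    return len(ranks)
+
+print(infer_rank_size("./saved_model"))  # pass this value as --rank_size
+```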
+
+**Loading the converted model:**
+
+Because the TF1 and TF2 APIs differ, this tool uses different interfaces for each;
+when loading, use the loading interface that matches your TF version. Examples:
+
+- tf1
+```python
+restore_table = tf.contrib.lookup.MutableHashTable(
+    key_dtype=tf.int64,
+    value_dtype=tf.float32,
+    default_value=initialize_value,
+    name=args.table_name,
+    checkpoint=True)
+
+with tf.Session() as sess:
+    saver = tf.train.Saver()
+    saver.restore(sess, args.path + "/model.ckpt-0")
+    lookup_embedding = restore_table.lookup(key)
+```
+
+- tf2
+```python
+restore_table = tf.lookup.experimental.MutableHashTable(
+    key_dtype=tf.int64,
+    value_dtype=tf.float32,
+    default_value=np.zeros((240,)),
+    name="deep_sparse_table")
+
+restore_table1 = tf.lookup.experimental.MutableHashTable(
+    key_dtype=tf.int64,
+    value_dtype=tf.float32,
+    default_value=np.zeros((37,)),
+    name="wide_sparse_table")
+
+# The tables must be restored in the same order they were saved in;
+# the save order is printed during model conversion.
+
+checkpoint = tf.train.Checkpoint(table_list=[restore_table, restore_table1])
+manager = tf.train.CheckpointManager(checkpoint, directory=args.path, max_to_keep=3)
+checkpoint.restore(manager.latest_checkpoint)
+
+lookup_embedding = restore_table1.lookup(key)
+```
\ No newline at end of file
diff --git a/tools/model_convert/model_convert.py b/tools/model_convert/model_convert.py
new file mode 100644
index 00000000..7608917a
--- /dev/null
+++ b/tools/model_convert/model_convert.py
@@ -0,0 +1,287 @@
+import argparse
+import json
+import os
+import re
+from enum import Enum
+
+import tensorflow as tf
+import numpy as np
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input_path', type=str, required=True, help='path of the model file to be converted')
+parser.add_argument('--output_path', type=str, required=True, help='output path of the converted model')
+parser.add_argument('--rank_size', type=int, choices=range(1, 17), default=8, required=False)
+parser.add_argument('--estimator', type=int, choices=[0, 1], default=0, required=False)
+parser.add_argument('--ddr', type=int, choices=[0, 1], default=0, required=False)
+parser.add_argument("--dynamic_expansion", type=int, choices=[0, 1], default=0, required=False)
+
+slice_prefix = "slice_"
+sparse_file_prefix = "sparse-"
+data_suffix = ".data"
+attribute_suffix = ".attribute"
+hbm_prefix_list = ["HashTable", "HBM"]
+ddr_prefix_list = ["HashTable", "DDR"]
+min_file_size = 1
+max_file_size = 1024 * 1024 * 1024 * 1024
+
+
+class DataAttr(Enum):
+    SHAPE = "shape"
+    DARATYPE = "data_type"
+
+
+class ModelConverter:
+    def __init__(self, input_model_path, output_model_path, rank_size, estimator, ddr, dynamic_expansion):
+        self._input_path = input_model_path
+        self._output_path = output_model_path
+        self._rank_size = rank_size
+        self._is_estimator = bool(estimator)
+        self._is_ddr = bool(ddr)
+        self._use_dynamic_expansion = bool(dynamic_expansion)
+        self._load_ckpt_path = None
+        self._input_model_path_list = []
+        self._table_list = []
+        self.table_info_dict = {}
+        self.sparse_file_list = []
+
+        if not os.path.exists(self._input_path):
+            raise FileNotFoundError(f"the input path {self._input_path} does not exist. 
please check it.") + if not os.path.exists(self._output_path): + os.makedirs(self._output_path) + self._build_input_model_list(self._is_estimator) + self._build_sparse_file_list() + self._check_mode() + self._build_table_info_dict() + + def convert(self): + insert_op_list = [] + var_list = [] + hash_table_list = [] + # load old checkpoint and get var list + if not os.path.exists(self._load_ckpt_path): + raise FileNotFoundError(f"the checkpoint path {self._load_ckpt_path} does not exists.") + ckpt = tf.train.load_checkpoint(self._load_ckpt_path) + var_names = ckpt.get_variable_to_shape_map().keys() + var_values = [ckpt.get_tensor(name) for name in var_names] + for i, name in enumerate(var_names): + var = tf.Variable(var_values[i], name=name) + var_list.append(var) + + # get key and embedding from file to insert hashtable + for table_name, emb_size in self.table_info_dict.items(): + initialize_value = np.zeros((emb_size,)) + # create mutable hashtable + if tf.__version__.startswith("2"): + hash_table = tf.lookup.experimental.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.float32, + default_value=initialize_value, name=table_name) + else: + hash_table = tf.contrib.lookup.MutableHashTable(key_dtype=tf.int64, value_dtype=tf.float32, + default_value=initialize_value, name=table_name) + + for rank in range(self._rank_size): + offset, key = self._get_key_and_offset(self.sparse_file_list[rank], table_name) + if self._is_ddr: + emb_data = self._get_embedding_array(self.sparse_file_list[rank], table_name)[list(offset)] + else: + emb_data = self._get_embedding_array(self.sparse_file_list[rank], table_name) + insert_op = hash_table.insert(tf.convert_to_tensor(key), tf.convert_to_tensor(emb_data)) + insert_op_list.append(insert_op) + print("build save table:", table_name) + hash_table_list.append(hash_table) + if tf.__version__.startswith("2"): + checkpoint = tf.train.Checkpoint(table_list = hash_table_list) + manager = tf.train.CheckpointManager(checkpoint, directory=self._output_path, max_to_keep=5) + manager.save() + else: + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + sess.run(insert_op_list) + saver = tf.train.Saver() + saver.save(sess, self._output_path + "/model.ckpt-0") + + def _get_key_and_offset(self, sparse_file_path, table_name): + if self._is_ddr: + upper_dir = generate_upper_dir(sparse_file_path, ddr_prefix_list, table_name, "embedding_hashmap") + else: + upper_dir = generate_upper_dir(sparse_file_path, hbm_prefix_list, table_name, "key") + attribute_data_dir, target_data_dir = get_attribute_and_data_file(upper_dir) + + with open(attribute_data_dir, "r") as fin: + validate_read_file(attribute_data_dir) + attributes = np.fromfile(attribute_data_dir, dtype=np.uint64) + data_shape = attributes[:2] + + with open(target_data_dir, "r") as fin: + validate_read_file(target_data_dir) + key_offset_data = np.fromfile(target_data_dir, dtype=np.int64) + key_offset_data = key_offset_data.reshape(data_shape) + offset = [] + if self._is_ddr: + offset = key_offset_data[:, 1] + key = key_offset_data[:, 0] + return offset, key + + def _get_embedding_array(self, sparse_file_path, table_name): + upper_dir = generate_upper_dir(sparse_file_path, hbm_prefix_list, table_name, "embedding") + attribute_data_dir, target_data_dir = get_attribute_and_data_file(upper_dir) + with open(attribute_data_dir, "r") as fin: + validate_read_file(attribute_data_dir) + if self._use_dynamic_expansion: + attributes = np.fromfile(attribute_data_dir, dtype=np.uint64) + data_shape = attributes[:2] + 
else: + emb_attributes = json.load(fin) + data_shape = emb_attributes.pop(DataAttr.SHAPE.value) + with open(target_data_dir, "r") as fin: + validate_read_file(target_data_dir) + emb_data = np.fromfile(target_data_dir, dtype=np.float32) + + emb_data = emb_data.reshape(data_shape) + + if self._is_ddr: + ddr_upper_dir = generate_upper_dir(sparse_file_path, ddr_prefix_list, table_name, "embedding_data") + attribute_data_dir, target_data_dir = get_attribute_and_data_file(ddr_upper_dir) + with open(attribute_data_dir, "r") as fin: + validate_read_file(attribute_data_dir) + attributes = np.fromfile(attribute_data_dir, dtype=np.uint64) + data_shape = attributes[:2] + with open(target_data_dir, "r") as fin: + validate_read_file(target_data_dir) + ddr_emb_data = np.fromfile(target_data_dir, dtype=np.float32) + ddr_emb_data = ddr_emb_data.reshape(data_shape) + emb_data = np.concatenate((emb_data, ddr_emb_data[:, :self.table_info_dict[table_name]]), axis=0) + return emb_data + + def _build_sparse_file_list(self): + if self._is_estimator: + latest_ckpt = self._get_latest_ckpt_name() + sparse_file_name = sparse_file_prefix + latest_ckpt + for rank in range(self._rank_size): + sparse_file_path = os.path.join(self._input_model_path_list[rank], sparse_file_name) + self.sparse_file_list.append(sparse_file_path) + else: + latest_ckpt = self._get_latest_ckpt_name() + latest_step = latest_ckpt.split("-")[-1] + pattern = re.compile(r"^sparse-.*{}$".format(latest_step)) + for folder_name in os.listdir(self._input_path): + if os.path.isdir(os.path.join(self._input_path, folder_name)) and pattern.match(folder_name): + sparse_file_path = os.path.join(self._input_path, folder_name) + self.sparse_file_list.append(sparse_file_path) + if len(self.sparse_file_list) != self._rank_size: + raise AssertionError( + f"the sparse file num should be {self._rank_size} rather than {len(self.sparse_file_list)}") + + def _build_input_model_list(self, is_estimator): + if is_estimator: + for i in range(self._rank_size): + model_path = os.path.join(self._input_path, str(i)) + self._input_model_path_list.append(model_path) + else: + self._input_model_path_list.append(self._input_path) + self._load_ckpt_path = self._input_model_path_list[0] + + def _get_latest_ckpt_name(self): + ckpt_path = os.path.join(self._load_ckpt_path, "checkpoint") + if not os.path.exists(ckpt_path): + raise FileNotFoundError(f"the input path you provided {ckpt_path} miss checkpoint file.please check it.") + with open(ckpt_path, "r") as fin: + # validate open file + validate_read_file(ckpt_path) + latest_ckpt = fin.readline().rstrip() + latest_ckpt = latest_ckpt.split(":")[1].strip(' ').replace('"','') + latest_ckpt = latest_ckpt.split("/")[-1] + return latest_ckpt + + def _build_table_info_dict(self): + tmp_file_list = [] + table_upper_file = os.path.join(self.sparse_file_list[0], "HashTable", "HBM") + if not os.path.exists(table_upper_file): + raise FileNotFoundError(f"the sparse file path {table_upper_file} does not exists.") + for _, table_name, _ in os.walk(table_upper_file): + tmp_file_list.append(table_name) + + if not tmp_file_list: + raise FileNotFoundError(f"under the sparse file path {table_upper_file}, no file exists.") + self._table_list = tmp_file_list[0] + for table_name in self._table_list: + table_path = os.path.join(table_upper_file, table_name, "embedding") + attribute_file = get_attribute_and_data_file(table_path)[0] + with open(attribute_file, "r") as fin: + validate_read_file(attribute_file) + if self._use_dynamic_expansion: + attributes = 
np.fromfile(attribute_file, dtype=np.uint64) + data_shape = attributes[:2] + else: + emb_attributes = json.load(fin) + data_shape = emb_attributes.pop(DataAttr.SHAPE.value) + self.table_info_dict[table_name] = data_shape[1] + + def _check_mode(self): + check_dir = os.path.join(self.sparse_file_list[0], "HashTable") + model_dirs = [] + for _, dirs, _ in os.walk(check_dir): + model_dirs.append(dirs) + if not self._is_ddr and "DDR" in model_dirs[0]: + raise ValueError(f"wrong mode choose! you choose hbm mode, however ddr dir exists. ") + if self._is_ddr and "DDR" not in model_dirs[0]: + raise ValueError(f"wrong mode choose! you choose ddr mode, however ddr dir not exists. ") + + +def get_attribute_and_data_file(table_path): + if not os.path.exists(table_path): + raise FileNotFoundError(f"the input table path {table_path} does not exists.") + + attribute_file_list = [] + data_file_list = [] + for file_name in os.listdir(table_path): + if file_name.endswith(attribute_suffix): + attribute_file_list.append(file_name) + if file_name.endswith(data_suffix): + data_file_list.append(file_name) + if len(attribute_file_list) != 1: + raise AssertionError(f"under the table path {table_path}, ther must only one attribute file. " + f"In fact, {len(attribute_file_list)} attribute file exists. ") + if len(data_file_list) != 1: + raise AssertionError(f"under the table path {table_path}, ther must only one data file. " + f"In fact, {len(data_file_list)} data file exists. ") + attribute_file = os.path.join(table_path, attribute_file_list[0]) + data_file = os.path.join(table_path, data_file_list[0]) + return attribute_file, data_file + + +def generate_upper_dir(sparse_file, dir_prefix_list, table_name, data_type): + temp_dir = sparse_file + for dir in dir_prefix_list: + temp_dir = os.path.join(temp_dir, dir) + return os.path.join(temp_dir, table_name, data_type) + + +def generate_attribute_dir(sparse_file, dir_prefix_list, table_name, data_type, rank_id): + temp_dir = sparse_file + for dir in dir_prefix_list: + temp_dir = os.path.join(temp_dir, dir) + return os.path.join(temp_dir, table_name, data_type, f"{slice_prefix}{rank_id}{attribute_suffix}") + + +def generate_data_dir(sparse_file, dir_prefix_list, table_name, data_type, rank_id): + temp_dir = sparse_file + for dir in dir_prefix_list: + temp_dir = os.path.join(temp_dir, dir) + return os.path.join(temp_dir, table_name, data_type, f"{slice_prefix}{rank_id}{data_suffix}") + + +def validate_read_file(read_path): + if os.path.islink(read_path): + raise ValueError(f"the path {read_path} to be read is soft link.") + file_stat = tf.io.gfile.stat(read_path) + if not min_file_size < file_stat.length <= max_file_size: + raise ValueError(f"file size: {file_stat.length} is invalid, not in ({min_file_size}, {max_file_size}]") + + +if __name__ == "__main__": + args = parser.parse_args() + convert_instance = ModelConverter(input_model_path=args.input_path, output_model_path=args.output_path, + rank_size=args.rank_size, + estimator=args.estimator, ddr=args.ddr, dynamic_expansion=args.dynamic_expansion) + convert_instance.convert() + print("convert model success.") \ No newline at end of file diff --git a/tools/model_convert/model_convert_mt_v2.py b/tools/model_convert/model_convert_mt_v2.py new file mode 100644 index 00000000..df3fde7d --- /dev/null +++ b/tools/model_convert/model_convert_mt_v2.py @@ -0,0 +1,246 @@ +import argparse +import json +import os +import re +from enum import Enum + +import tensorflow as tf +import numpy as np + +parser = argparse.ArgumentParser() 
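+# Unlike model_convert.py, which rebuilds a TensorFlow MutableHashTable
+# checkpoint, this variant merges the per-rank shards and dumps each table as a
+# {key: embedding} dict to <table_name>_key_embedding.npy via np.save; read it
+# back with np.load(path, allow_pickle=True).item().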
+parser.add_argument('--input_path', type=str, required=True, help='path of the model file to be converted') +parser.add_argument('--output_path', type=str, required=True, help='output path must be local path') +parser.add_argument('--rank_size', type=int, choices=range(1, 17), default=8, required=False) +parser.add_argument('--estimator', type=int, choices=[0, 1], default=1, required=False) +parser.add_argument('--ddr', type=int, choices=[0, 1], default=0, required=False) +parser.add_argument('--save_easy', type=int, choices=[0, 1], default=1, required=False) + +slice_prefix = "slice_" +sparse_file_prefix = "sparse-" +data_suffix = ".data" +attribute_suffix = ".attribute" +hbm_prefix_list = ["HashTable", "HBM"] +ddr_prefix_list = ["HashTable", "DDR"] +min_file_size = 1 +max_file_size = 1024 * 1024 * 1024 * 1024 + + +class DataAttr(Enum): + SHAPE = "shape" + DARATYPE = "data_type" + + +class ModelConverter: + def __init__(self, input_model_path, output_model_path, rank_size, estimator, ddr, save_easy): + self._input_path = input_model_path + self._output_path = output_model_path + self._rank_size = rank_size + self._is_estimator = bool(estimator) + self._is_ddr = bool(ddr) + self._is_save_easy = bool(save_easy) + self._load_ckpt_path = None + self._input_model_path_list = [] + self._table_list = [] + self.table_info_dict = {} + self.sparse_file_list = [] + + if not tf.io.gfile.exists(self._input_path): + raise FileNotFoundError(f"the input path {self._input_path} does not exists. please check it.") + if not tf.io.gfile.exists(self._output_path): + tf.io.gfile.makedirs(self._output_path) + self._build_input_model_list(self._is_estimator) + self._build_sparse_file_list() + self._build_table_info_dict() + + def convert(self): + for table_name, emb_size in self.table_info_dict.items(): + result_key = np.array([]) + result_embedding = np.array([]) + for rank in range(self._rank_size): + if not self._is_save_easy: + offset, key = self._get_key_and_offset(self.sparse_file_list[rank], table_name) + emb_data = self._get_embedding_array(self.sparse_file_list[rank], table_name)[list(offset)] + else: + key = self._get_key_array(self.sparse_file_list[rank], table_name) + emb_data = self._get_embedding_array(self.sparse_file_list[rank], table_name) + + if rank == 0: + result_key = key + result_embedding = emb_data + else: + result_key = np.concatenate((result_key, key), axis=0) + result_embedding = np.concatenate((result_embedding, emb_data), axis=0) + + # save result every table + transformed_data = dict(zip(result_key[:], result_embedding[:])) + save_path = os.path.join(self._output_path, table_name + "_key_embedding" + ".npy") + save_dir = os.path.dirname(save_path) + os.makedirs(save_dir, exist_ok=True) + np.save(save_path, transformed_data) + + def _get_key_and_offset(self, sparse_file_path, table_name): + if self._is_ddr: + upper_dir = generate_upper_dir(sparse_file_path, ddr_prefix_list, table_name, "embedding_hashmap") + else: + upper_dir = generate_upper_dir(sparse_file_path, hbm_prefix_list, table_name, "key_offset_map") + attribute_data_dir, target_data_dir = get_attribute_and_data_file(upper_dir) + + with open(attribute_data_dir, "r") as fin: + attributes = np.fromfile(attribute_data_dir, dtype=np.uint64) + data_shape = attributes[:2] + + with open(target_data_dir, "r") as fin: + key_offset_data = np.fromfile(target_data_dir, dtype=np.int64) + key_offset_data = key_offset_data.reshape(data_shape) + offset = key_offset_data[:, 1] + key = key_offset_data[:, 0] + return offset, key + + def 
_get_key_array(self, sparse_file_path, table_name): + upper_dir = generate_upper_dir(sparse_file_path, hbm_prefix_list, table_name, "key") + attribute_data_dir, target_data_dir = get_attribute_and_data_file(upper_dir) + with tf.io.gfile.GFile(attribute_data_dir, "r") as fin: + emb_attributes = json.load(fin) + with tf.io.gfile.GFile(target_data_dir, "rb") as fin: + key_data = fin.read() + key_data = np.fromstring(key_data, dtype=emb_attributes.pop(DataAttr.DARATYPE.value)) + + data_shape = emb_attributes.pop(DataAttr.SHAPE.value) + key = key_data.reshape(data_shape) + return key + + def _get_embedding_array(self, sparse_file_path, table_name): + upper_dir = generate_upper_dir(sparse_file_path, hbm_prefix_list, table_name, "embedding") + attribute_data_dir, target_data_dir = get_attribute_and_data_file(upper_dir) + with tf.io.gfile.GFile(attribute_data_dir, "r") as fin: + emb_attributes = json.load(fin) + + with tf.io.gfile.GFile(target_data_dir, "rb") as fin: + emb_data = fin.read() + emb_data = np.fromstring(emb_data, dtype=emb_attributes.pop(DataAttr.DARATYPE.value)) + data_shape = emb_attributes.pop(DataAttr.SHAPE.value) + emb_data = emb_data.reshape(data_shape) + + if self._is_ddr: + ddr_upper_dir = generate_upper_dir(sparse_file_path, ddr_prefix_list, table_name, "embedding_data") + attribute_data_dir, target_data_dir = get_attribute_and_data_file(ddr_upper_dir) + with open(attribute_data_dir, "r") as fin: + attributes = np.fromfile(attribute_data_dir, dtype=np.uint64) + data_shape = attributes[:2] + with open(target_data_dir, "r") as fin: + ddr_emb_data = np.fromfile(target_data_dir, dtype=np.float32) + ddr_emb_data = ddr_emb_data.reshape(data_shape) + emb_data = np.concatenate((emb_data, ddr_emb_data[:, :self.table_info_dict[table_name]]), axis=0) + return emb_data + + def _build_sparse_file_list(self): + if self._is_estimator: + latest_ckpt = self._get_latest_ckpt_name() + sparse_file_name = sparse_file_prefix + latest_ckpt + for rank in range(self._rank_size): + sparse_file_path = os.path.join(self._input_model_path_list[rank], sparse_file_name) + self.sparse_file_list.append(sparse_file_path) + else: + pattern = re.compile(r"sparse-.+") + for folder_name in tf.io.gfile.listdir(self._input_path): + if tf.io.gfile.isdir(os.path.join(self._input_path, folder_name)) and pattern.match(folder_name): + sparse_file_path = os.path.join(self._input_path, folder_name) + self.sparse_file_list.append(sparse_file_path) + if len(self.sparse_file_list) != self._rank_size: + raise AssertionError( + f"the sparse file num should be {self._rank_size} rather than {len(self.sparse_file_list)}") + + def _build_input_model_list(self, is_estimator): + if is_estimator: + for i in range(self._rank_size): + # for mt, need two rank id + model_path = os.path.join(self._input_path, str(i)) + self._input_model_path_list.append(model_path) + else: + self._input_model_path_list.append(self._input_path) + self._load_ckpt_path = self._input_model_path_list[0] + + def _get_latest_ckpt_name(self): + ckpt_path = os.path.join(self._load_ckpt_path, "checkpoint") + if not tf.io.gfile.exists(ckpt_path): + raise FileNotFoundError(f"the input path you provided {ckpt_path} miss checkpoint file.please check it.") + with tf.io.gfile.GFile(ckpt_path, "r") as fin: + # validate open file + latest_ckpt = fin.readline().rstrip() + latest_ckpt = latest_ckpt.split(":")[1].strip(' ').replace('"', '') + latest_ckpt = latest_ckpt.split("/")[-1] + return latest_ckpt + + def _build_table_info_dict(self): + tmp_file_list = [] + 
table_upper_file = os.path.join(self.sparse_file_list[0], "HashTable", "HBM") + if not tf.io.gfile.exists(table_upper_file): + raise FileNotFoundError(f"the sparse file path {table_upper_file} does not exists.") + for _, table_name, _ in tf.io.gfile.walk(table_upper_file): + tmp_file_list.append(table_name) + + + if not tmp_file_list: + raise FileNotFoundError(f"under the sparse file path {table_upper_file}, no file exists.") + self._table_list = tmp_file_list[0] + for table_name in self._table_list: + table_name = f"{table_name}/table" + table_path = os.path.join(table_upper_file, table_name, "embedding") + attribute_file = get_attribute_and_data_file(table_path)[0] + with tf.io.gfile.GFile(attribute_file, "r") as fin: + emb_attributes = json.load(fin) + data_shape = emb_attributes.pop(DataAttr.SHAPE.value) + self.table_info_dict[table_name] = data_shape[1] + + +def get_attribute_and_data_file(table_path): + if not tf.io.gfile.exists(table_path): + raise FileNotFoundError(f"the input table path {table_path} does not exists.") + + attribute_file_list = [] + data_file_list = [] + for file_name in tf.io.gfile.listdir(table_path): + if file_name.endswith(attribute_suffix): + attribute_file_list.append(file_name) + if file_name.endswith(data_suffix): + data_file_list.append(file_name) + if len(attribute_file_list) != 1: + raise AssertionError(f"under the table path {table_path}, ther must only one attribute file. " + f"In fact, {len(attribute_file_list)} attribute file exists. ") + if len(data_file_list) != 1: + raise AssertionError(f"under the table path {table_path}, ther must only one data file. " + f"In fact, {len(data_file_list)} data file exists. ") + attribute_file = os.path.join(table_path, attribute_file_list[0]) + data_file = os.path.join(table_path, data_file_list[0]) + return attribute_file, data_file + + +def generate_upper_dir(sparse_file, dir_prefix_list, table_name, data_type): + temp_dir = sparse_file + for dir in dir_prefix_list: + temp_dir = os.path.join(temp_dir, dir) + return os.path.join(temp_dir, table_name, data_type) + + +def generate_attribute_dir(sparse_file, dir_prefix_list, table_name, data_type, rank_id): + temp_dir = sparse_file + for dir in dir_prefix_list: + temp_dir = os.path.join(temp_dir, dir) + return os.path.join(temp_dir, table_name, data_type, f"{slice_prefix}{rank_id}{attribute_suffix}") + + +def generate_data_dir(sparse_file, dir_prefix_list, table_name, data_type, rank_id): + temp_dir = sparse_file + for dir in dir_prefix_list: + temp_dir = os.path.join(temp_dir, dir) + return os.path.join(temp_dir, table_name, data_type, f"{slice_prefix}{rank_id}{data_suffix}") + + +if __name__ == "__main__": + args = parser.parse_args() + convert_instance = ModelConverter(input_model_path=args.input_path, output_model_path=args.output_path, + rank_size=args.rank_size, + estimator=args.estimator, ddr=args.ddr, save_easy=args.save_easy) + convert_instance.convert() + print(f"sparse table has been converted to numpy file. output path is {args.output_path}") + diff --git a/tools/mx_rec_perf.sh b/tools/mx_rec_perf.sh new file mode 100644 index 00000000..fe1ee706 --- /dev/null +++ b/tools/mx_rec_perf.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# Copyright (c) Huawei Technologies Co., Ltd. 
+# Description: MxRec performance analysis script V1.0
+set -e
+
+file="$1" # pass the spdlog log file here
+
+calculate_average() {
+    awk '{
+        sum += $1;
+        count++
+    } END {
+        average = sum / count;
+        print average
+    }'
+}
+perf() {
+    echo "read batch cost"
+    cat ${file} | grep 'read batch cost'|grep -v timeout|tail -n 20| awk 'NR%2==1'
+    echo "===================================="
+    echo "key process cost"
+    cat ${file} | grep 'key process cost'|tail
+    avg=$(cat ${file} | grep -Po '(?<=key process cost:)[^,:]+(?=,)'|tail -n +20 |calculate_average)
+    echo "Average: $avg"
+    echo "===================================="
+    echo "Compare the host and device pipelines: if host key process runs ahead of the training step, the host is not the bottleneck"
+    echo "Enter the training-step marker to match (default: step); press Enter to open the analysis, press q to quit"
+    read step
+    step="${step:-step}"
+    cat ${file} | grep -P "key process cost|${step}"|tail -n100|less
+}
+echo -e "\e[45m\e[1m =========MxRec analysis script V1.0========= \e[0m"
+echo
+
+stuck_check() {
+    echo -e "\e[106m--------Diagnose hangs and GetNext timeouts----------\e[0m"
+    echo -n "Timed-out channel: "
+    cat ${file} | grep -Po "aicpu_getnext.*GetNext"
+    echo
+    echo "Number of lookup sends per device:"
+    for i in {0..7}
+    do
+        line=$(cat ${file} | grep -P "send"|grep "h2d"|grep "1,${i}"|wc -l)
+        echo -n "$line "
+    done
+    echo
+    echo "Check that every device sent the same number of h2d transfers:"
+    for i in {0..7}
+    do
+        line=$(cat ${file} | grep "send"|grep "h2d"|grep "1,${i}"|wc -l)
+        echo -n "$line "
+    done
+    echo
+    echo "Check that every device received the same number of messages:"
+    for i in {0..7}
+    do
+        line=$(cat ${file} | grep "r recv"|grep "1,${i}"|wc -l)
+        echo -n "$line "
+    done
+    echo
+    echo "Last batch received by each device:"
+    cat ${file}|grep "trans emb"|grep "info"|tail
+}
+
+hot_check() {
+    # check the hot-embedding dedup ratio
+    echo "Table name and dedup ratio (deduped/original), expected to be below 0.4:"
+    cat op_summary_*.csv |grep gather_for_restore_vector |awk -F "," '{print $6,$14,$15}'|sed 's/"//g'|sed 's/ [0-9]*;/\//'
+}
+
+perf
diff --git a/tools/parse_data/data_parser.py b/tools/parse_data/data_parser.py
new file mode 100644
index 00000000..53c59fb9
--- /dev/null
+++ b/tools/parse_data/data_parser.py
@@ -0,0 +1,133 @@
+# coding: UTF-8
+
+# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# -----------------------------------------ReadMe Begin--------------------------------------------
+# 1. What it does
+# Measures the cost of the TensorFlow data-parsing stage on its own, to help decide whether
+# parsing is the bottleneck that stalls the whole pipeline.
+# 2. Notes
+# The parsing logic lives in make_dataset, which defaults to the Criteo dataset; to time another
+# dataset's parsing, redefine make_dataset as needed.
+# 3. CPU binding
+# To mimic a real deployment, bind_cpu assumes an 80-core CPU split evenly across 8 workers;
+# if your worker count or core count differs, redefine bind_cpu as needed.
+# 4. How to run
+# 4.1 Single worker:    python3 data_parser.py
+# 4.2 Multiple workers: bash run.sh data_parser.py
+# -----------------------------------------ReadMe End--------------------------------------------
+
+import os
+import sys
+import time
+
+import logging
+import psutil
+
+import tensorflow as tf
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+def make_dataset(data_path, batch_size=102400, line_per_sample=1024):
+    def extract_fn(data_record):
+        features = {
+            # Extract features using the keys set during creation
+            'label': tf.FixedLenFeature(shape=(line_per_sample,), dtype=tf.int64),
+            'sparse_feature': tf.FixedLenFeature(shape=(26 * line_per_sample,), dtype=tf.int64),
+            'dense_feature': tf.FixedLenFeature(shape=(13 * line_per_sample,), dtype=tf.float32),
+        }
+        sample = tf.parse_single_example(data_record, features)
+        return sample
+
+    def feat_cast(feat):
+        for name, tensor in feat.items():
+            if tensor.dtype == tf.int64:
+                feat[name] = tf.cast(tensor, tf.int32)
+        return feat
+
+    def reshape_fn(batch):
+        batch['label'] = tf.reshape(batch['label'], [-1, 1])
+        batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13])
+        batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0)
+        batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26])
+        return batch
+
+    file_list = sorted([os.path.join(data_path, file) for file in os.listdir(data_path)])
+    dataset = tf.data.TFRecordDataset(file_list, num_parallel_reads=4)
+
+    num_parallel = 8
+    dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel)
+
+    line_cnt = batch_size // line_per_sample
+    dataset = dataset.batch(line_cnt, drop_remainder=True)
+
+    dataset = dataset.map(feat_cast, num_parallel_calls=num_parallel)
+    dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel)
+
+    dataset = dataset.prefetch(10)
+    return dataset
+
+
+def bind_cpu(rank_id):
+    process = psutil.Process()
+    cpu_kernels = {
+        0: 0,
+        1: 10,
+        2: 40,
+        3: 50,
+        4: 20,
+        5: 30,
+        6: 60,
+        7: 70
+    }
+    try:
+        process.cpu_affinity([cpu_kernels.get(rank_id) + x for x in range(10)])
+    except IndexError:
+        logging.error("error cpu bind info, skipped.")
+
+
+if __name__ == '__main__':
+    RANK_ID = 0
+    if len(sys.argv) > 1:
+        RANK_ID = int(sys.argv[1])
+    bind_cpu(RANK_ID)
+
+    DATA_PATH = "/media/mxRec/data/criteo_tfrecord_small/train"
+    train_dataset = make_dataset(DATA_PATH)
+    iterator = train_dataset.make_initializable_iterator()
+    next_batch = iterator.get_next()
+
+    input_data = []
+    for example in next_batch:
+        input_data.append(next_batch[example])
+
+    COUNT = 0
+    TOTAL_TIME = 0.0
+
+    with tf.Session() as sess:
+        sess.run(iterator.initializer)
+        while True:
+            try:
+                start_time = time.time()
+                result = sess.run(input_data[0])
+                end_time = time.time()
+
+                COUNT += 1
+
+                if COUNT > 1:
+                    TOTAL_TIME += end_time - start_time
+                # time.time() returns seconds, so convert to milliseconds for the log
+                logging.info("StepId:%d, StepTimeCost(ms):%f", COUNT, (end_time - start_time) * 1000)
+            except tf.errors.OutOfRangeError:
+                logging.error("End of Training Dataset")
+                break
+    logging.info("StepTimeCost avg(ms):%f", TOTAL_TIME / (COUNT - 1) * 1000)
\ No newline at end of file
diff --git a/tools/parse_data/run.sh b/tools/parse_data/run.sh
new file mode 100644
index 00000000..b3ab73bb
--- /dev/null
+++ b/tools/parse_data/run.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) Huawei Technologies Co., Ltd. 2021-2023. All rights reserved.
+# Description: performance analysis tool
+# Author: MindX SDK
+# Create: 2023
+# History: NA
+
+for i in {0..7}
+do
+    nohup python3 data_parser.py $i > rank_$i.log 2>&1 &
+done
\ No newline at end of file
diff --git a/tools/perf/fast.sh b/tools/perf/fast.sh
new file mode 100644
index 00000000..bf916090
--- /dev/null
+++ b/tools/perf/fast.sh
@@ -0,0 +1,391 @@
+#!/bin/bash
+# Copyright (c) Huawei Technologies Co., Ltd. 2021-2023. All rights reserved.
+# Description: performance analysis tool
+# Author: MindX SDK
+# Create: 2023
+# History: NA
+
+# -----------------------------------------ReadMe Begin--------------------------------------------
+# 1. What it does
+# Analyzes how long each pipe of the pipeline takes during model execution, and how long each
+# sub-module (Step) inside a pipe takes, to help locate system bottlenecks.
+# (The idea behind a pipeline: every pipe should take roughly the same time so the pipes can hide
+# each other's latency; only then are stalls and waits minimized and throughput maximized.)
+#
+# 2. Usage
+# bash fast.sh your_log_file.log
+#
+# 3. Notes
+# mxRec emits TimeCost trace logs via spdlog::debug, so before running make sure run.sh sets
+# SPDLOG_LEVEL=debug (if it is not set, this tool exits with a hint).
+#
+# 4. Reading the results
+# (1) Pipeline: the whole pipeline is a series of pipes, and results are reported per pipe,
+#     e.g. Pipe-1/Pipe-2/Pipe-3/Pipe-4.
+# (2) Pipe: each pipe reports an overall time. (Ideally every pipe takes about the same time, so
+#     the pipes overlap one another and the pipeline runs at peak efficiency.)
+# (3) Sub-module (Step): a pipe may consist of several serial sub-modules (Steps), which may in
+#     turn contain their own sub-modules (SubSteps). In the report, times one level down are
+#     prefixed with --, two levels down with ----, and so on; a level's time includes its
+#     sub-levels.
+#
+# 5. Performance tuning
+# The report may reveal:
+# (1) a pipe that takes unusually long;
+# (2) a sub-module that takes unusually long.
+# Each case needs its own analysis, targeted tuning, or deeper optimization.
+# For example: if TensorFlow data parsing (Pipe-1) is slow and starves the pipeline, raise the
+# num_parallel setting on the TensorFlow parsing side; if preprocessing blocks because the CPU is
+# saturated (Pipe 2: Data Preprocess), lower KEY_PROCESS_THREAD_NUM (default 6); if H2D blocks
+# (Pipe 4: H2D Send Tensors (no DDR)), check whether GetNext on the NPU side or DNN training is
+# stalled. Deeper issues may call for heavier work: splitting pipes, turning serial code parallel,
+# lock optimization, or reworking the execution logic.
+# The tool is also useful as a before/after reference when optimizing a sub-module: compare that
+# sub-module's time, plus end-to-end time and throughput, before and after the change.
+#
+# 6. This tool needs to keep evolving with the code; improvements are welcome. Good Luck!
+# -----------------------------------------ReadMe End--------------------------------------------
+#set -x
+
+LOG_INFO() { echo -e "\033[1;4;32m$1\033[0m" ; }
+LOG_NOTICE() { echo -e "\033[1;4;45m$1\033[0m" ; }
+LOG_WARN() { echo -e "\033[1;31m[WARN]$1\033[0m" ; }
+LOG_ERROR() { echo -e "\033[1;31m[Error]$1\033[0m" ; }
+
+logfile=$1
+
+validate_options()
+{
+    if [ $# -ne 1 ]; then
+        LOG_ERROR "NO log_file"
+        echo "[Usage]: bash $0 log_file"
+        exit 1
+    fi
+}
+
+check_spdlog_level()
+{
+    $(grep 'ReadEmbKeyV2Static' $logfile > /dev/null 2>&1)
+    if [ $? != 0 ]; then
+        $(grep 'ReadEmbKeyV2Dynamic' $logfile > /dev/null 2>&1)
+        if [ $? != 0 ]; then
+            LOG_ERROR "No timecost-related logs, please check 'mpi_args' in your run.sh,
+            make sure SPDLOG_LEVEL=debug, and run again!"
+            exit 1
+        fi
+    fi
+}
+
+parse_pipe_1_data_parser()
+{
+    LOG_NOTICE "Pipe-1: Data Parser"
+
+    $(grep 'ReadEmbKeyV2Dynamic' $logfile > /dev/null 2>&1)
+    if [ $?
== 0 ]; then + LOG_INFO "Step-1.x ReadEmbKeyV2 Dynamic" + else + LOG_INFO "Step-1.x ReadEmbKeyV2 Static" + fi + + grep 'read batch cost(ms)' $logfile | cut -d" " -f10| \ + awk -F "[:,]" '{sum+=$2} END {printf "read batch cost: avg=%0.1f\n", sum/NR}' + + grep 'enqueueTC(ms)' $logfile | grep -v 'timeout' | cut -d" " -f14 | \ + awk -F "[:,]" '{sum+=$2} END {printf "--|enqueueTC: avg=%0.1f\n", sum/NR}' + + grep 'elapsed from last(ms)' $logfile | grep -v 'timeout' | cut -d" " -f13 | \ + awk -F "[:,]" '{print $2}' | \ + awk 'BEGIN {sum=0; count=0} {if($1<1000) {sum+=$NF; count++} } END \ + {printf "elapsed from last: avg=%0.1f\n", sum/count}' +} + +parse_pipe_2_key_process() +{ + LOG_NOTICE "Pipe-2: Data Preprocess" + + grep 'getAndProcessTC(ms)' $logfile | cut -d" " -f7 | \ + awk -F"[:,]" '{print $2}' | \ + awk 'BEGIN{count=0; total=0;} {if ($1<2000) {total+=$NF; count++;}} END \ + {printf "getAndProcessTC(filter>2000ms): avg=%0.3f\n", total/count}' + + LOG_INFO "Step-2.1 GetBatchData" + + grep 'getBatchDataTC' $logfile | \ + awk -F":" 'BEGIN { max=0 } { sum+=$NF; if($NF>max) max=$NF } END \ + {printf "--|getBatchDataTC: total=%d, max=%0.1f, avg=%0.1f\n", NR, max, sum/NR}' + + grep 'getBatchDataTC' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF<2000) {sum+=$NF; count++;}} END \ + {printf "--|getBatchDataTC(filter>2000ms): count=%d, avg=%0.1f\n", count, sum/count}' + + grep 'getBatchDataTC' $logfile | \ + awk -F":" 'BEGIN { total=0; none_zero_ms_num=0 } { total++; if($NF>0) none_zero_ms_num++ } END \ + {printf "--|getBatchDataTC: total=%d, none_zero_ms_num=%d, none_zero_ms_rate=%0.3f, zero_ms_rate=%0.3f\n", \ + total, none_zero_ms_num, none_zero_ms_num/total, (1-none_zero_ms_num/total)}' + + LOG_INFO "Step-2.2 KeyProcess" + + grep 'key process cost' $logfile | cut -d" " -f10 | cut -d ":" -f2 | cut -d"," -f1 | grep '^[0-9]' | grep '[0-9]$' | \ + awk 'BEGIN {sum=0; count=0;} {if($NF<2000) {sum+=$NF; count++;}} END \ + {printf "--|key process cost(filter>2000): avg=%0.1f\n", sum/count}' + + # fast-unique related start + $(grep 'ProcessBatchWithFastUnique(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'ProcessBatchWithFastUnique(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|ProcessBatchWithFastUnique: avg=%0.1f\n", sum/NR}' + fi + + $(grep 'FastUniqueCompute(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'FastUniqueCompute(ms)' $logfile | cut -d' ' -f6 | \ + awk -F"[:,]" '{sum+=$2} END {printf "------|FastUniqueCompute: avg=%0.1f\n", sum/NR}' + fi + + $(grep 'GetScAll TimeCost(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'GetScAll TimeCost(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "------|FastUniqueGetScAll: avg=%0.1f\n", sum/NR}' + fi + + $(grep 'all2allTC TimeCost(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'all2allTC TimeCost(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "------|FastUnique_all2allTC: avg=%0.1f\n", sum/NR}' + fi + # fast-unique related end + + $(grep 'uniqueTc(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'uniqueTc(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|UniqueInRankTC: avg=%0.1f\n", sum/NR}' + fi + + $(grep 'processSplitKeysTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'processSplitKeysTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|processSplitKeysTC: avg=%0.1f\n", sum/NR}' + fi + + $(grep 'getScAllTC(ms)' $logfile > /dev/null 2>&1) + if [ $? 
== 0 ]; then + grep 'getScAllTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "------|getScAllTC(AllReduce-AllGather): avg=%0.1f\n", sum/NR}' + fi + + $(grep 'uniqueAll2AllTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'uniqueAll2AllTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "------|uniqueAll2AllTC(All2allv): avg=%0.1f\n", sum/NR}' + fi + + $(grep 'buildRestoreVecTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'buildRestoreVecTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|buildRestoreVecTC: avg=%0.1f\n", sum/NR}' + fi + + # common start + $(grep 'key2OffsetTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'key2OffsetTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|key2OffsetTC: avg=%0.1f\n", sum/NR}' + fi + + $(grep 'featureAdmitAndEvictTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'featureAdmitAndEvictTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|featureAdmitAndEvictTC: avg=%0.1f\n", sum/NR}' + fi + + $(grep 'globalUniqueSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'globalUniqueSyncTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|globalUniqueSyncTC, avg=%0.1f\n", sum/NR}' + fi + + $(grep 'pushResultTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'pushResultTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {printf "----|pushResultTC, avg=%0.1f\n", sum/NR}' + fi + # common end +} + +parse_pipe_3_get_tensors_async_no_ddr() +{ + LOG_NOTICE "Pipe-3: Get Tensors async (no DDR)" + + $(grep 'getTensorsSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'getTensorsSyncTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {print "getTensorsSyncTC, avg=", sum/NR}' + fi +} + +parse_pipe_4_send_tensors_async_no_ddr() +{ + LOG_NOTICE "Pipe-4: H2D Send Tensors async (no DDR)" + + $(grep 'sendAll2AllScSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendAll2AllScSyncTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {print "sendAll2AllScSyncTC, avg=", sum/NR}' + fi + + $(grep 'sendLookupSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendLookupSyncTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {print "--|sendLookupSyncTC, avg=", sum/NR}' + fi + + $(grep 'sendRestoreSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendRestoreSyncTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {print "--|sendRestoreTC, avg=", sum/NR}' + fi +} + +parse_pipe_3_get_and_send_tensors_with_ddr() +{ + LOG_NOTICE "Pipe-3: Get and Send Tensors (with DDR)" + + grep 'parseKeyTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "parseKeyTC TimeCost(ms)(filter>1000ms): avg=%0.1f\n", sum/count}' + + grep 'getAndSendTensorsTC' $logfile | cut -d" " -f11 | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "--getAndSendTensorsTC(filter>1000ms): avg=%0.1f\n", sum/count}' + + grep 'getTensorsTC' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "----getTensorsTC(filter>1000ms): avg=%0.1f\n", sum/count}' + + $(grep 'sendRestoreSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendRestoreSyncTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {print "----|sendRestoreTC, avg=", sum/NR}' + fi + + $(grep 'prepareDDRDataTc(ms)' $logfile > /dev/null 2>&1) + if [ $? 
== 0 ]; then + grep 'prepareDDRDataTc(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {print "----|prepareDDRDataTc, avg=", sum/NR}' + fi + + $(grep 'hostHashMapProcessTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'hostHashMapProcessTC(ms)' $logfile | \ + awk -F":" '{sum+=$NF} END {print "----|hostHashMapProcessTC, avg=", sum/NR}' + fi + + $(grep 'sendUniqueKeysSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendUniqueKeysSyncTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>200) {sum+=$NF; count++;}} END \ + {printf "----|sendUniqueKeysSyncTC(filter>200ms): avg=%0.1f\n", sum/count}' + fi + + $(grep 'sendRestoreVecSecSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendRestoreVecSecSyncTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>200) {sum+=$NF; count++;}} END \ + {printf "----|sendRestoreVecSecSyncTC(filter>200ms): avg=%0.1f\n", sum/count}' + fi + + $(grep 'sendTensorsTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendTensorsTC(ms)' $logfile | cut -d" " -f9 | cut -d ":" -f2 | cut -d"," -f1 | \ + awk '{sum+=$NF} END {printf "----|sendTensorsTC, avg=%0.3f\n", sum/NR}' + fi + + $(grep 'embHDTransWrapTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'embHDTransWrapTC' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "--embHDTransWrapTC(filter>1000ms): avg=%0.1f\n", sum/count}' + fi + + grep 'hostEmbsTC' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "----hostEmbsTC(filter>1000ms): "; if(count==0) print "no match result!\n"; \ + else printf "avg=%0.1f\n", sum/count}' + + grep 'h2dTC' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "------h2dTC(filter>1000ms): avg=%0.1f\n", sum/count}' + + grep 'd2hTC' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "------d2hTC(filter>1000ms): avg=%0.1f\n", sum/count}' +} + +parse_pipe_3_get_and_send_tensors_sync_without_ddr() +{ + LOG_NOTICE "Pipe-3: Get and Send Tensors sync (no DDR)" + + $(grep 'parseKeysTc HBM mode (ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'parseKeysTc HBM mode (ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>2000) {sum+=$NF; count++;}} END \ + {printf "parseKeysTc(filter>2000ms): avg=%0.1f\n", sum/count}' + fi + + grep 'getTensorsSyncTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "--|getTensorsSyncTC(filter>1000ms): avg=%0.1f\n", sum/count}' + + grep 'sendTensorsSyncTC(ms)' $logfile | cut -d" " -f7 | cut -d ":" -f2 | cut -d"," -f1 | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>1000) {sum+=$NF; count++;}} END \ + {printf "--|sendTensorsSyncTC(filter>1000ms): avg=%0.1f\n", sum/count}' + + $(grep 'sendAll2AllScSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendAll2AllScSyncTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>200) {sum+=$NF; count++;}} END \ + {printf "----|sendAll2AllScSyncTC(filter>200ms): avg=%0.1f\n", sum/count}' + fi + + grep 'sendLookupSyncTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>200) {sum+=$NF; count++;}} END \ + {printf "----|sendLookupSyncTC(filter>200ms): avg=%0.1f\n", sum/count}' + + $(grep 'sendUniqueKeysSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? 
== 0 ]; then + grep 'sendUniqueKeysSyncTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>200) {sum+=$NF; count++;}} END \ + {printf "----|sendUniqueKeysSyncTC(filter>200ms): avg=%0.1f\n", sum/count}' + fi + + $(grep 'sendUniqueRestoreVecSyncTC(ms)' $logfile > /dev/null 2>&1) + if [ $? == 0 ]; then + grep 'sendUniqueRestoreVecSyncTC(ms)' $logfile | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF>200) {sum+=$NF; count++;}} END \ + {printf "----|sendUniqueRestoreVecSyncTC(filter>200ms): avg=%0.1f\n", sum/count}' + fi + + grep 'sendRestoreSyncTC(ms)' $logfile | cut -d" " -f6 | cut -d ":" -f2 | cut -d"," -f1 | \ + awk -F":" 'BEGIN {sum=0; count=0;} {if($NF<200) {sum+=$NF; count++;}} END \ + {printf "----|sendRestoreSyncTC(filter>200ms): avg=%0.1f\n", sum/count}' +} + +main() +{ + validate_options $@ + check_spdlog_level + + echo "+----------------------------------------------------------------+" + echo "+ Profile Result +" + echo "+----------------------------------------------------------------+" + + parse_pipe_1_data_parser + parse_pipe_2_key_process + + $(grep 'DDR mode' $logfile > /dev/null 2>&1) + if [ $? -eq 0 ]; then + parse_pipe_3_get_and_send_tensors_with_ddr + else + $(grep 'parseKeysTc HBM mode (ms)' $logfile > /dev/null 2>&1) + if [ $? -eq 0 ]; then + parse_pipe_3_get_and_send_tensors_sync_without_ddr + else + parse_pipe_3_get_tensors_async_no_ddr + parse_pipe_4_send_tensors_async_no_ddr + fi + fi +} + +main $@ \ No newline at end of file diff --git a/tools/perf/host_set.sh b/tools/perf/host_set.sh new file mode 100644 index 00000000..0120ebb9 --- /dev/null +++ b/tools/perf/host_set.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright (c) Huawei Technologies Co., Ltd. 2021-2023. All rights reserved. +# Description: performace analysis tool +# Author: MindX SDK +# Create: 2023 +# History: NA + +# cpu with high-performance +cpupower frequency-set -g performance +cat /proc/cpuinfo|grep MHz + +# clear cache +echo 3 > /proc/sys/vm/drop_caches +free -h + +# swap off +swapoff -a diff --git a/tools/perf/msprof.sh b/tools/perf/msprof.sh new file mode 100644 index 00000000..c1821c83 --- /dev/null +++ b/tools/perf/msprof.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Copyright (c) Huawei Technologies Co., Ltd. 2021-2023. All rights reserved. +# Description: performace analysis tool +# Author: MindX SDK +# Create: 2023 +# History: NA + + +curr_path=$(cd $(dirname $0); pwd) + +# ---------------config start--------------------- +model_run_path=/path/to/model/run +run_cmd="bash run.sh" +# ---------------config end--------------------- + +# ------------------------------+ +# msprof + +# ------------------------------+ +output_path="${model_run_path}"/msprof_out + +cd "${model_run_path}" +rm -rf "${output_path}" + +msprof --application="${run_cmd}" --output="${output_path}" diff --git a/tools/perf/mt_1207.sh b/tools/perf/mt_1207.sh new file mode 100644 index 00000000..fc0af5db --- /dev/null +++ b/tools/perf/mt_1207.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Copyright (c) Huawei Technologies Co., Ltd. 2021-2023. All rights reserved. 
+
+main()
+{
+    validate_options $@
+    print_throughput
+}
+
+main $@
diff --git a/tools/perf/perf_flame_graph.sh b/tools/perf/perf_flame_graph.sh
new file mode 100644
index 00000000..dce91600
--- /dev/null
+++ b/tools/perf/perf_flame_graph.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Copyright (c) Huawei Technologies Co., Ltd. 2021-2023. All rights reserved.
+# Description: performance analysis tool
+# Author: MindX SDK
+# Create: 2023
+# History: NA
+
+#set -x
+
+curr_path=$(cd $(dirname $0); pwd)
+
+LOG_INFO() { echo -e "\033[1;4;32m$1\033[0m" ; }
+LOG_NOTICE() { echo -e "\033[1;4;45m$1\033[0m" ; }
+LOG_WARN() { echo -e "\033[1;31m[WARN]$1\033[0m" ; }
+LOG_ERROR() { echo -e "\033[1;31m[Error]$1\033[0m" ; }
+
+# ---------------config start---------------------
+model_run_path=/path/to/model/run
+run_cmd="bash run.sh"
+flame_graph_path=/home/FlameGraph
+# ---------------config end---------------------
+
+cd "${model_run_path}"
+rm -rf perf*
+
+#---- perf cpu-clock on all workers and build flame graph------------
+perf record -F 99 -a -g -- ${run_cmd}
+wait $!
+
+perf script -i perf.data | \
+    "${flame_graph_path}"/stackcollapse-perf.pl | \
+    "${flame_graph_path}"/flamegraph.pl > perf_mxRec.svg
+wait $!
+
+LOG_INFO "perf_mxRec.svg is created, please check!"
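+
+# Note: assumes Brendan Gregg's FlameGraph scripts are present at
+# ${flame_graph_path}, e.g. obtained via
+#     git clone https://github.com/brendangregg/FlameGraph /home/FlameGraph
+# perf record samples all CPUs at 99 Hz with call stacks (-a -g), perf script
+# dumps the samples, stackcollapse-perf.pl folds them into one line per unique
+# stack, and flamegraph.pl renders the folded stacks as perf_mxRec.svg.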
+
+
diff --git a/tools/python/images/clip_image002.jpg b/tools/python/images/clip_image002.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fd387c49710c1d99303cce3767ff720cad502944
GIT binary patch
literal 9453
[base85-encoded image data omitted]

diff --git a/tools/python/images/clip_image004.jpg b/tools/python/images/clip_image004.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bcb5cff76d9c555e3cb414a2d30b1716a902d31a
GIT binary patch
literal 8027
[base85-encoded image data omitted]

diff --git a/tools/python/images/clip_image006.jpg b/tools/python/images/clip_image006.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8759d287be51d491663043f531f5e3041284811f
GIT binary patch
literal 21733
[base85-encoded image data omitted]

[binary patch for tools/python/images/clip_image008.jpg omitted]

diff --git a/tools/python/images/clip_image010.jpg b/tools/python/images/clip_image010.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..71b63575957f04d9e844f6425eeb562470201840
GIT binary patch
literal 24851
[base85-encoded image data omitted]

diff --git a/tools/python/images/clip_image012.jpg b/tools/python/images/clip_image012.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d6d1169e8ad6fa4bb7e21e764640087b4f0d3ef7
GIT binary patch
literal 17452
[base85-encoded image data omitted]
zy+^N@RTHhD89)mvF4wL77I8tI8&FMM@%{+thd(>zXtt=5>^Iikqz43sKmtE|)k7oX zzmaS^1$--N72jpKYe&C(l8H-Q9Y zst31Ni;SvVw3JS0>^NT|D!+Tf0M=uWjkGcdL)5`8pHRJ;dO=J zNDFdAm80g$;hw;oBhB0s>bc`}zoZmbfh%kh*wRxFS63DWogd+~c9?y=eSWO`&}&Lb zYt4mcJV=ZPl0+k6o|i`l_`bCB{MpSl~vxcx@b2WMzD}q(g;^DzaA&jy^lm?b}1L@V^Vo>rDB657rmL-<@d=@4mE)stcQ&m zx)abHPuL73g8?3CcSU~VNkEQ~VDbkedE^Mb{ySkdTsJwlu8=rIz^+ZnYWyH(v)l;@ zIqP-b&y@rXH3YSVZpP^vS?nTmYQ-&now^MByxDD-AHb}p{ftp!B&*TJ3`DGDFd%@fHhUp zmT)(|{p|PFhz$mOeHlwzvqTp){$wXq%9xXM3yOozWnF_|M*_hG|8&8mG=>``ix_h% z^9nZ`M4{uM5RY^4t_0Tr5Athi3%Nv*Gv0=PMZYcbqSo*w$~kU_Lm(-_dJ4l>!2E*A zVZcU`m!TI;67&sYLqw0kb-Ag%!j{csOP5$1k8!*?M8!q?eff*|t|EAsW05Z}fO;az zo#8;uo49vTq}T8VYD%u@uUrexCFx>65tHZ1Cd(CUSl^hHhNaoxH>$-GoUp!g(t*h- zStsPuy9_Cr4q?9_h7?5rHktX z`6y~I(~vV@+O~imZ|ji$M;03m=~S|}zd2$_vrO00y?@jz5rLeX ziYiO?hSqCiTfy#{({WaE;(S|Z*cL)8_}UNX{C4lc&`Qp@Up`DS;oB%CB0tok~`^+1`uneZuVS+yz7P^y|( z6a}MYR3}75PMnQC*WkmuPXWy-zAQ*U%1p29PSld^rh?O991o2YMkP_(U4OF!$V1`PaVTUBn#AH3C?7noJ(_2)=K9Tj2Pi?hvECvWIY*9sn zkGc~+JYB-_kDRyYz_666T92UuH{KJ*8ERNEOunUzVhB+l`fh=1#H1->qDZ{oAdGfa z{Oog<|LrOioLepHC_P;oK&xWu0!IjF^oM1vL^FQn6yUmM&$929{qM= zN^(N!Zy&kg9Wuof;{D!U|1FYbIcK1*>{w&okt)NwAZeM$6+-#Ro#%ftMNN%o$(X80Y zI{uHVB8d;?5aBmH18W;^HaH8!SW>n`lh%K2^fCljIo^po&GdMt$iy^!Y-_;SM1j#3 zxuFX0w3S#Lce#s_d~cqokJ>b>JbEK$8r4e<`u{u{_M{B8l$wAVlItxDT@FM zdetUXax!?aA!e|T_AI@2LuSiYl35796Ot39&lWOney<`!UuIMVFpEh)-muz{Yf=c*{%1EhnDU{)dunc z(&TIJSoPGbk&3BRfz)=x6>;`PZbh&&KsGy>JL%ec)QTOe1 zn6i%^B(@HPoApDTR(t%sMnuZ$wbf_843e8MB^Nm^eQ~oJrWdZ{s7Zd7xM09*A3|W% zH9pFUbDKl11NMQ7i@WPfA6^|e0le8{H^hil z9J!V@cR|IdQj82Fv5nF>i4!yFjj{mq?$r;Jo@sP=1j=ryxFs$ahNeM_@IqDpf?pM; z&tQkZWGXRP7a4hEDnyY!u`Pk7srBjS?xRR_o?BZ{1!(hFqi|T6u}_FYEi2c8G*^W= z%`0a0x{M0f<{#cKLO%pg9e-=Fi=rML?+!mp^m*Ymy?`Oyx1Iqf-%%9fwoNkUY_|aG zwx3CCj#eEU-Kt`jh}h1zi-qk-7GE9LS=#F zEs~klT$`AxK`RcwSp5(M2|E**c84#U6DeBUe3>G~w{tbO$hV(clb6S>-La=aPh^Lh z*7W(>H~70gdGSbyNnJUQEVVf9+LzpOcYKVoIEFo?uXNI9O4x{yNJ7u7_Qu9lSa8i) z%d}h#e5Tw`m(_!hoO1H^WImOd)07`VVT3i`YyrQv zmvOG)C@PDygna458nfwMb!{L42raf%zKMWP3XpbOqGFLTYiCi?Eg8D4l$zhkm_tw{ zf2OC*ch-}@0#c-j?Y+(r*%5B*-hM9rx;`aR*SsxQJ2qfYH`xXv%6SsXdo_r?iSsWpTC&4m{8kE{c4m@M=SvaQss@?=+E;JDHpcGB0bLG;`Fttl0x#|oD~80F zld(WWas$|{$z#nSmSN~jALyc@sY=abw!~ehOO5RH7f8z~I*X5DyMnCvzVATI?VrxK zQEgSeRA0W%yIk7cMb8ZjY+?=Kt!F+c{8%Q$AD4tHZi7WZdmIf*vasE&?USaVyiE{w7RXXfFD(LN#kH48V;g!@YU{F~SoI|)-vulQ7rvk! 
zs2sDmA^lz?@VMM)3vVYv& z|3#UH8Y;oH#-yCusHlTleOJOK7c!ptUNbbqV?bdLmO-_vyGwN+wG)$3bFq$yi4VNk z!3@fpgIyvf;p`kx)61BppTw;8k>f8DSV>L93tsalZb=kn=%vQQ$Tb z4J{4_iA!55q|89r-B)h+qxTu%F2W4Cb_;9ovk)j(DG@-pr1H@Q1Ts4 zRex=X6RHZI&j?$?Oab)!q{h*J(B)ThC>tM7JJ5;p$m`}k)f$BJ%`firdH;eC{SAH& zQ}S2Vwqhc}Vlj?;OTjct6teatp}V@uNiQ@_82E1M$*6dH^+5rAc{=H^U4;eF!6$V1 zRA8Rb36|2840kQ309n6!S|=#UcY~3!dFg>3rFcpktgv=r7GW{7Z2L7(g_wc^X8U@Y zMI@lS!9mTv^{`5$XRPmzbphwoJW*>c$r=9vc5K3AvWfgI8)YWTfp?S`^fggTN;Ewb z{Tk>ZJR->w*8R+&wPB*%Jbj6E9aU4FtILERWd*l5{maj9lN9GEGx(7Qsw5A9Ml);C zXHRIo5Jr4}bpX9U+L5feLvAi+@UlIJBNU~r%}sK|A6{gqB+jG1IOxgQ1tsjuebygR zhrq}Rz+IohJNQ&L4WW#DJ8a7;g#kf)1=eBmj_8Q@X{Y|IjHcPxdS*UJY}zbIUVfJ% z9lVv1BKn?Mjupu&nl%UU&s6pX*6S8F#8%+J{AEhCPN%tEN3C;fCFU|$6r*4&jXL5J zap^#V1A!!UjUMFMc5`~%MO#!LV}V#t_sFe)Ht}(nczyjBimd2a7noK<{R<(Err&(_;{^T9JfI8$crc zj3Hi=)(}6vviMp9-Eq-5bJLqYAn#=ratsJ2N=5b7V=$XcmU6$gHf6i&h5D-mIxUpQ zHkR17K--vHRfU9nT!c6I6?lb6oQvB$O2lKqE$*Y_uZ~1PvuaM% zZK!4Q*DAvv*DH;-h4GA%eU=WDsRQBSl4$Agl}}_QiatW8qmWZen}gDBTec8169#D- z?mZu)DP0m~jg4A$wsbXfmub&}Uwgm8TZ3m!BJf@N4Fe;AD%J8pr7aBHODi%n)quG5 zr}>@Q7^JBLgA|7#)P8jOnaA*Vi+|Ry-Q=z4G=T}h@}iw|_Egz~zhdnzz0AQ4jozH;e%%Nb9JeCu8~I$(O%$$tgOWtJ zGf^%WpySf9yXx?4uGbOyB-zaPDs5y1p^a?)@$q)2vErt@#*@`X(je384c8Tc7)ROb`I7VhhsntDpO=_b;5nbY@W_FNcfo3JYs z;kmaN*3(VY3S%lnb%Dm)tvWM4$%KIF8!b6?8ds~=h8wcK40IS^AWMY@{}q18EjhO$A~m_7Y;Q6CXQ&u zw3!+wa!mrtf-$NTi$fK3e80Ns-Sob_FMf+6<7xllo2V>FO%G(3&MAX=r9rwEsR%@t zgTpG|FY5CpFM_kwj2D1*c-I_6gCi;StSIz--%UQqT${s<ddQ>S-*#E&Qg)EXMI9p8W`@MU!>xxGO^tu(( zjCC+yU#W1gxcVzJsmg>b3^EVKo16gta2UsX`VK<)e1X3iGRgo)&Tf0FH3GYi(}36y zkB1V77Gp8eOX62)vIfN!KQeYdaK1;qP__-^>0!c#Z822Ku>}la!OEqOmJV4hkx@f#P8G zUY1z-KtQ={jF>)5Zs^16;}@LzYX>LZ&aMPcA)CAoFWWki256rx35y4n60?u%~v$95TAR*uIgkko8t?@R@2y=0OAI z<`2Cr$%{t)lfVC&6XSF@8O%oLuZQDm94A8)f2uv`Far^%t|`ip{?m^$GSkpHHCw` zCLH47-c*z`Nuuh=L0+sMF)>KXS+(1q&9?>POG_U9X?+B+0uhanftK*rcP6e!y#fe@ z5)lW*@y)}cB%Yj`B0=#&2Vvqw@%-ndl^zO}mcFmfzI=S4yQJ9DW$yDPtz9q!w%8q! zTZ+}o>5PnOGXse2}|_O&~>jISup2D_Q9FF>45rs zsyc4FlYUn$Sm(Oj7znxnzkAAjyiwKyk!3s)C9!~Q;WxXClyjWSdz$`@88)>(%^wt!|HHk~_Dak%dQPan&p z{T^%V2T`mE>1`Fv^4yD<-jJ>?EYP8~lq_ksNOjsLVF8M)gv99srtW*8{OueNL(n=x z6^(|D3mLY&SFc}SRvJ#y_)s|qWH9)R4u>>2S{#kJ%0Pb-=Oig9n(T0`)?XtBVkE`l zUBWpZg9*)fvkc;xZmJRfCZmUKNkpfiFpw-31HSu#x=5rhiSlRCYLVq8idZ$ZU_l9{ zdyHa#E_HtZ(AKveOKeEucao)VK6@6$ot#)jgqu$_nXu!P@TT9Jiry+Ql`eyDLU$MMahB%f#JSI;=VHL9%JCQ6hCX7fTXRgFvcB%wlbAw`^BHNU_Td}_O0>!Gwq*dF9Z zD)*fecX6!qp=mD%sWx8hjJS8Z_jZ#Cl}aQcf~cE@#fogB%*uGB^W#)9-z&n#xShGV z1euy`Fs0ngBjwQPUGRo)enhBpMHLSa30B8~#}Nxkv3oiD#BV3#3iZS$UR{m}BzDm! zN8WDphrd`zhspvp-pf}7%K(Gl?s;_#=~k?U^}on?E%EjI4p6@eh?ExM=k60h3WL?M z=3u(o3h~SjB75~!DTY3i`cXoLbWxD4P}P|Aw5VPV!hymf-bhY;>}ORmP0|=x*6-D1 z_1=lVIxZl!rt~!Dbv*B{t;@STMUO?0o~UotGqWt{=lOY%(EB`tUiUJiDzA401e^A%^O#f57n7^+$q`reymi+_5iSg1y)tWuvuBcbRlxQVIe9&xv;H_=ITNl^o}g;% zzgX-QbK~ze&I^88pJ=i3b-rS8mzD`dpvcp;Q6Iv+%ZyiNdVcFvc z;@IC#di;6fe>5`=~u@1wW`t_yEnAqokX*@9CA(&pG&FIeM zBx6o|;meHthjgqxUhCL*OAUKQcqg=lfY*T(?Qj`;-GO{j)^qO62SApJoy4{L`Em^h ztu?cNV}BZ1l5j7Y2$i2UYQ9ju^A{|saz&_<{G869D(BSq%ai*NjEkLUr?QgW`l}h2 zUquQU=hTSo1l>fg5WpxIV(@dpAGPOAq!>3@^A6py78;mO@!!S#vMP-G??^P`spr3n`A`g-8)TX)I^dC>sY z|0JnOXm}DY#vn6?oqXUsO>xfP{w9i0{d5l_21rlw%GsiiYFFSv_)Ct{7@e)v? z%84rMgk{8aq~4;bkYYSH*iRtVB=S-Zzs(1%tTo>H@6Dx?gAjg+AZJ?hm)3dS?G11tvN+3wH3V*({un2}}9 zt4N>SiEJTed*(<0H;I3a1il_N|6V1OS+c9lI1VD1zxmp=U;gdN zky7y-IiAP2_*dR(@9S}RV@pC99@px}TzSWL@k9Y_>((qW>OXKm9@H<)vBc0k@X^(i z=M<#oLxtBML3lf7wWI6%5G}Tm7%B3VgjvE}kuMk*?DL<5Eubx#&PUXRwNS}iZO%4! 
zP{Nf9U(ixpG0gs%<61CNmb%pyq4YC{(~<5zgty0Z5jSyYG;6`nqsmPif?o=l!ci1n z@8ZhuqDXPYnF_8pLFQhqo7Ag{nG5e^xMtNEbUh;#n#Lcscik}a2{V;iBA+y8Db_D` zb@#PiW0+9>4)Z!a7_*{OdpA1r$vr024UfuIk|HW^=VYPRrM{0TO1iXv1tFxjgvz=c zWL*TC6^M0wA{W0g&zQK=e{i5%=^f}?VKD3|>~Ei$&`aI2@>xV$MYBS2>${(MbZTef zBIDVd9lzL&!Uv!N6M>u*qiZ;&pNB*y!1+%{lF5$AO=f%6Q~eS*m(g*cNv}`pCzbYI z;KcbCRn(xq+@c4-4EL@b<1(MZP9eqH0WIcFHA?=@K`gVz*NXcOfUOyg2S6KnP)FIq zb*gGYzIU8vRD9+ajsohpWk%GRlLn5BclOV#Gw3z~-;=8)LV=H^TUW>uu0z|4z=n3G zAN$#G65)*I)f4juV@uv?wS%1;VfKv^1%r6*(DaFMV-FSn#P%%rg8npNUAs$zaE zH53Lq%R`(+&IfyviiWpT5=8^~p@mkV2FxH*VpWRwnu9dJ1{q8h45vm17Z5x| zn@dc#q}$xxZ`dhb-p91D^TET8zHT-t?CA~HJ5N?0;1ZE)Xh!18LUVqt; zx}fKkIBW9DZuq{nWS4{@Zy(I6&ZzNuMG0RA!3U=~+eP&C#B1YgJ<@*XAbU*l4El5_ z$WR5`WZk|0>awDRJwtOntL0nE7=I>?cyDXaHA)b}#C;eI|6RcYV2$90dC!;h`EvMBzT&>C&D#;nXA1}~P+wYe9 z44c45WZ+npq0sDv``|6Qcjp7(4`}4yWs85IidjEWxcxEk zQ_kI>iyiL!VB7DHlrq5p^Z=-24LV2A*UVJ>=s&#oc1cZ$}f`~WU|7Jn5?BnFQ2f%1W;R7I;?*R}_`ycKd$7Bnak8%BI z`yS6WabKgd{N3UT0Xo)U_aFW+zd=&$l`=_cP@Bj<8*lv&jr}h*F8}wQ3_R3rz3-5+ z-k(a`VcjD{JpeF_vhRNG8eb!EU#tJSJMBfH-xvW~xp#4oP#Ta> z@;@*0YvZm<0%C6K65iL1?xytZ7qpyC|6bK-eCZWYRFw?2&+HD-ttcFV>;1Mijx^*e zZPoQZ*_`i9LFvMtOsr!{?e)O6OY+8#uF?L@2O%Y7-72K8a>A>=M@);z3}gG*hX9`r zY;EjIN`xDv8jiErQaJLRzGQpn2J?9DLXB|aTLF&diOuf`n#CRfy%mKM_W_m<0L+Dg!?#JlUD4mP?auz$<^xjauVf#M zm#H|#ecrShif0+P!^&SoFQBUqvg(fj z7RkYMrfT(=^|C?DwyzNkcd9MSi$(plE$f!kt3-`KxQVVF2sqx2YpNUx-_DX2+Ni8= zCdx0+ngZF7d^!9=-}zdkz|A6|RF)VwbCc%90b{mTcPIycu5P6Bug5U@#*gCDw9TtK zWEVQ)RY4RmQl@ln1tziW?&D=b71R3QVy2c&({(&H!Hl6I zjawa&+ZYg%%|4F88MUbHgel}^G0!NK^D^|2Gto2FS_e07dH?wB+R#Q{-q7BoJkF#P zBORFr{2Q3^LYBrm>Bk9b_Y*I*knv~52iMIsf#!n=u6`=pQ&h|IKEr9x#j{~sCz?}T zLH14iNTr1v^?4ax6qXDs?0}Z_MAf<}W>w;vQCEgGmNY|w8+u|NPEy%dp)bArF7hUh zeg&@m#p}{oyu@VnKZdInZin2uQ?HIY>GC0AuYezU~dAv9ZG}g02xoFKRmJ{s&k2d z<0*s&03yZD88sG?wa4=T3}=liThscy}o`U54Qg#!eY=9tChYhk~a1DCKFj^d5?>@O>Of% z!e2K0(jsdeu6tURR+9V{cs)7SSRZ8-Auu&9)nGa91d#||L|+< zM!hPiNrH_4vitq$ntt^Cy*?v=vX;!YF488GWP4r(wc|QGI~{ynP>t`RBA2+1_hN(!?K(x;x3UL*_g91Js5tp` zTh(~Oz@PLs|1{t1@eP2tn)-n=iw^)~k*~{x+;r^iKJ^xw2x7@@GDEk75v}kpGJM1Gd<` zPk@A52+K)D8AAj%k`arV8{WbN(cQKW(mi3U0aDJbM$k#qk=H*}U*=qHCXp7{CovZU z`9lFJ=$Y8?7a~(5BIHe~_s^7CWgY;zdpeJ1se2@s>@9yukiX{>w`uH#cWJzydXLvwkT?f;mWjz-HK9_M)zhreCPaKxXkgeS54 z1vwZ<|<-8V{pr}+e{q1&YdItct}cTwJxp5NP0-I6B~YnzpcXV{)ZF}zkSmO)uu;( z_+E(e=8;o|aNo=HJif3!DEUYe%Ud^X{&sC|{r_tHrJ4WT+OU;aNjY+S45im+&B`hpx z#C89p+2PK+dr8q6;{h*Tly?EvXTR;da(hkNH>0jVgd2>h@2dP}eY%wVw4{u&ysTCG zmTUEqije&?9QJ2e?7zw%PXs(h2x|Y6_lW-t!TvMP?!JC3#`#~h`1^mlql?E4=Kt<- ig0pPm-I+0;h&9%VO|SqavHCzrkAI*|Ua-W&+ zo4mXC?z?;U?!9}T|MShEd6;whbXQeZS65g6iig>UWdNRnjJym02?+_{g}49@a{ws- z1{&HEw8t1v(4SypVqoEt;o;)o;8GKl5Ridr85w}IKze2lq36u3{A~0U^O5;8v0 zLnnX+06;=TQ2U$UzkZM&A)}x^Mnivsfr+S4iwAgwgpB+M1sN3;1qD&tAMqT3f{#i- z$0hmrsfr02y)z+qKx`H|@I_@ik?PnXh{x0=@CgPn2`L#l10xeN3o9=lzkr~Su+&Rw z8Cf}b1vPaIO)YI5T{CkFORKlmHm+{&9-dy_K0zOXLqb1(3X6+RNK8upoRXTIlbiQ7 zzo4+Fs=B7OuD+r1+mDXUuI`@RzW(uv$*Jj?*`KhL)wT6s8=G6(J4eSSr)TFEmsi)n z`9cC9|Anpp?f`Vc7qMBD z?N5L_s)t0TE@K$PAl?;*qu;Fk!P)OBanDjAH&qb6QtcYj)45&w@l{!I zkA)J!U%JKmUaC}v=Q}%kjj^eo23eBGlvMF^rF&)h@u9{}G` zY5vq?&TfN2>x=uWh?WO{y6e(m+zyS+)5jl>$in;aai5vM>+^M^{C&<>)!05aFZB2C zl44UY1}K}r4W8Z7^0%Jcf$t7kAS+094}i?=2f#sqyy;>1J!yVmcdgY``i)nA=A-l#dch*PQCeFI{g*!EZ^_X`&=yBxykNpRN5C?|)Lt5Z+#XY3wCx zyL2LS<@eg3W8G8FJ|}~;?s7$gBQ~gMQcd||Ir#-#DQS2G#ismGqJ*$LRk6Dtdeyul zQS-Q9xWk4iNx{e~a>y-PU+Z_s!SVxO_3iZ{?zP0B*USR|CrWsARj=t2Q^vpm-Ta_$ zYE+ZarB*z|?*XvX{s8D4djMb@J04`-T!JrC$1F+Rvmrs;-VO{+wU;?z`CV)sv~m2b zYdJXCP0#+@*B~&#cMpId+`U1C?aQ1>80lbfDSV=`vJgM2w5?F zeRuKzfS^49o+=rI&Y-)*eap0#xK9iLlDsHodaW>XY$Zd2`Y}gCJ;BkD*WITsD1SE= 
zgrfao*ikmClyJ zGBHhsld=V_nl*--arzRk1x2wXBZdp(;QpQE1EBQ*aA)(6Jz6PVJpdk4PnX}`+^S5$ zw()dfBj`MG4m_KRZ2NJqB3$03Vht%u zA>ic>*c5_6NSJWY3Y=R}5bnTE+ZaaQh1zM$INrlVMtyoGB2zma$Gn%g8piyR1=kZ^rf*Z1qY8xBHmvU2o{#s z_$o?T(uSM<>^2RSMUn(ii+SzpXWr6kK~7Et?tTk;R|R1_h?Z_|hY+Lt0JxsK%K&fQ zlKSelo*W#>8bC}h?Cn9S=o|G3awS@{5hkIlPl}QZ$!n_O7zqrNVz|6ap$2_VDUWkz z;%5ZQT<`JNTCc9s?teFZR`3r@7!D_&++&HA8eLsn=44ZoRt)c-aJ( zy%IrlpvMc}UE^}&l>FsQT~#4wbM_XnOY1n-6A1YlO=!Phnex9RSS;h&Z&@h(A&Zw# z@)|`Q`xY=#jo+0+8Q0Sv_ijzilMY!e{*@70eE%R? z$PZM+oC5PemL352wbXwn4M*%B=VQ_g3pdTrHl%ozZ+fG+g6-UZw3~bkQK3Y^gB(w} zLeLjXWd@#e&SivCH?B5j{{4i>`8!SuZ-gPg<&pS@JT}qBV}gQuY*zjUwaxo`wOwGR zsXV<0-Ujni5&XbBZwZ^yT=`Z}riSRLa(vb3FBA12`W5CiJTY=*=+7$D8~ceMYb+PwQiQ00Hsb*95T z5_DkP&g~CRv~W0@-TO+AfS+r0J|Q?{4|+QP?oK7c!q+|KqroJAw7j70HrN$%7+`={ zr3RNSersi^?BDCr-?J+W&PXqh+M^N{BEZQ=jk|B6DIj#zM%n&G8Qu5KirLGyukiu!7cgiI3mM zIGJGsPYbz{v=;LrcdYZBUQ;yI9=hTzubh~`qa6IZUylontTDZvpBTh;ehI~R{u-cL z)vUPd%=)N#xp2=q={m7xXkxBs!#nnF`AvjIf^_5~`e2n&j5lAnXaRbKXJ62)TaT^~ zh7_S`qc$v@3{m#kR^_cEv6?u})k z%=xgYvlf%npB!K6Bh8^}gLYhPFQwUs4sLL`&Qyc` z{;~OGY5bJt-US}goen38(1sy%gG+JSIYl;WPTcZ5SZL12>dC@5`9&4+ zZ)^o~Lx%OdUSbVW<-W=L9P?aoi|4F3Sr4N30BECgYdx#9yBv_h=z%Y}qcq?(GTf!w{ZTpqo z>!tMaviks@aMW3>|5|=c(jolHu0+*SxvGEBE-E*m!KvuE(58@hqB}2J_^yFdjp?Fd zL1bc4>&s8-MbJysPvBRo`j@);@?=E7 zVpgbPV@u1_KIhW`4zyW&g!y-M(D@!5y`G)nxU#*{T$Pghq4fJ_yN`hN%DLXP`~fM* z3}SUaK0uwghCcw%_BrP-!(Y{Erz&Rrcze5koMVkMa)9?`#eak_`8)lDU3MgNTYCUM z3Cu$ND*xj98>jD^Vf__{_k}rxaa|=}J*&cC7AYw&YK-6aO9)Xyal~tq#7SF6*CV#$ z=9Eg!>#EWMd({g~$((R$gs!}{gtE@+&suK*(xp6I{0A#Hm~aJ&c>Ruz#1JMw9C$EF z@P(-Q`SoT-nPHpZQ!~_DEf&@b>?S5Lu3(~Il0b(q;Yu9mdX2Qz#MQtdcD9(#5Y<+- z7}GYkj1_v_5dt7?OyLa^<@{I(Tj=erF%e9cWH7MM{Z?J>dnQ| zg{&k&pl0F+?e!VD1N#{b&;ae8MZlLLn_8+<^9#IEr+DF#r&lj$h$}@NnVFGU z>=4o4S=Xlcp7^ zeg^$b;|wyUT7o0U0noc*HtAYrbluuS_*ts;O|?B}{6jobk|q;Udoh;?!p!h3 zVv;g9F)95kVVYpuCk#6Kol;$OJl2JczeH$N`3vJca+azEJFY~C`>JSgso{FS} z*ER|=kq}Vgt9&`bQ*aq|&GA-`@A~I_v&c#X z@izZTx<#fFY_X9}#&@E&#ZhCN*MMKKIM13X#T05Qm@{|64^TaL-)5DLCkL0HV*2k| z7I7@M#VnvOp5_saE1PAAw~PJik;vZ4;zaa0U^i^902#a|Kyxzw>}LG|5J_Xw@Bo1R zAr{*gVStev?4H}xKSU$@r}js)-!tp|oYx!T4F|m(sHZJlL#D13=WK03yE}$-m#9b( z9l76afe+&Ec};h3Mt=+0ri&2Ch{R|y=(m_vhD4`#$gPc~*-03bxP7dktiSli%w^;W zxu3VIa-)C2$o5WM`J!XY&AM-a;YC$FBp+UWAo&3B%RDNi4`fEdmnk4!hf}?l|DQUN z`QOv@f33<8<%v8^TpKw*@mkaZtnChD77Yn3K{2vWp%LjYuGc>JmRE$9_!`P$VC>6 zyc&7(IbMfXO^E|n5pr#I(4Pke(uA5Hg8zjT_-7UZD}Fz^;NI=p)&-2O^yN_)^Z%-# z=idbFzEM9x*pC9{M@j#cclq0$@Bj5KLL!e62Ozo6Ca$N4svYVgZcMX}Igkl{&@bjj zOnuPMKmuUS5p{)hC?(A2Uho&H$m4F8h=7( zBWKZH4n`*1Rn@Zos3J`W_;*U=#8FQ83_hS3t-BsmHjkAApQ-bV*#;m@^vmu0?NJv~a3ACACSSZToSAVYB%#r5i z|1r-ak=BJPzDuojELo$mmM8aycHuklDJn!cwU*bOl|xum?L$CR2IB7!_rY<$Ddmqt`(&N!}N$S{Vxj3-pWz zy-v&JcAf>ahZ8?iZSD$aDh=gb-YI{lfeg$SBNASEUGiS$EX=s%hqc#W2UwZ;A@oMQ=-y7GBBI*qVgt zML@v7+HmD@9gx29;jnh9q{2vdvFk3kT9LPu{bd(j44{pf_C0@@5PaBGBeR`b9-~ie zsB{Qs_}Wc;%>ElOfiV#abar{OmS$$1W@f`Mx|7CXhPPlZ_W>Z?7_~Me`%2Ia_T06X zHa=GOTT-E8UR#{Lqh)NTa}RlvH|hBs;c2s=IY?)C)B!De;u0B=EPXfl2$XO?6Bfz2Ie<%wM$86W>XL z@Xwf1eZ9V;HBA6(YrZu^JpjBGE7Lo*0!e*>(yvVmhgK-&0-Qk?qlz3|s|%v9pB(F> z{H#piSBNk-KSLHOED>dK{@VT()3*VuPHl*8kU}L%RUS!lIpPTR;c<=6LiJ9LvUx+8 z6_%MbJ!=)b8=n82M`ibV^PH%qxz!xHJDfo`hp}F^7J-N;HBB?*2R{HR`Iy~p=Vr+` zum(>^mD8TT3tj@Hy8)cxx>|91USE3##t+O+?7v1C4}K`HuzN3yhDKGehf;6ITv?O! 
z(yh@1Bc#FYL)F~6)g~~XGE-rS`*A&YcY8wFFOyzTXZ_oIs#c~qQLt-t_!&5LotGhn zv_O+9eJw^LD6A2uG%LGPQWLJt=bIaK zY{8>6%j6f~vKg>rJ^t3&Mi*XIiQUVeN3@97tgxHOaWaP#?Wn9E9Qs1>0Ivgc9#*7` z-5aPC@ayA;nCx}9*pZ6$B2%3`;k=<-x2>LFnL}8&^U2q}Xv5F|oeD2%^A-{ULy7(D zWwPd=O~s8uWZ(GV`9n0){E_9|b}O;kuUS~u?q)=}a1d&gib9CdXEWMUp}dLf-@7fb zXLsJ@Gw+xxsuN0}&Mz@666qP}7A`!Y3)0$Ce8mQ|dUyGQ9Ii}>7(HYz2Z7E3rzMt$ zF4&rSvD6u&o~YzTwYpEJFbdQvt4H4p3blW|44hFFxCSSS+S5+ztePOrQNGhN(MBF| zh6+#M7hA9DVfJMQ`)zH+X@2Nom)4=u@|Zql@TRWy_;HV|K$1FJEmOto8@9%vgd-*+ zxzT7~s{kEQ%;;)MHfU**FNHDps?fhL0&HTC2!XC=H1c(1ziv;T$K3V4p}pww0aQy9 zzCl8iM*_;-NwMGF@z(rrO*9hWUzS`bq(QtE6`(rl1B1-;KLEBklqN_^v`9ga+`ilG zZ_#?n*H=b&sD(+!X^Iss@5Y!;ilWvBOip%HB-#i%iD5WCskPWM(6rdOdFt#&+i*|$ ziz^G2JkpM*O-oUk3b1jyI3{P{0m`)*@_4PYJmqL1Nz;Mb#d0Qf^>&IaYg$|NVrUedRuJm<$yPR?xKYQ`SZ$eS8#5hrC)E21a! zOV{_`*XkCR7WuHtvWT`4<_7hp*k0VpQ2B(d>U8-RC_j#8nA}P3x~)qj#APNE zK|XQqhG!S(8R_d;&v7{P{~RgSFR70o95ctV8+>o>Aa*${d?JnTKpNN)zcR0=0~oUi zwJKnuoyx-1GM~ZE^{p~H)#sLOx*MA0H|nN%Z6g)VMH0dmkM76Ftf;kNGTQI;kiQER zSJ`msdxfdDOn$AY(_Zx@jFQ4@z~a`st4_ymX%YG&D`oFbwc0yr~IwZ#C3W)JtYt_y1QP;E8^-F`)mbruC(0i7g6PO2<0LUYDg?Q!lo(Stm%jNKj(MxY|3e?}_SWp1rS9w(h=z<0MWSZ$!M zC+bq)yOnir?l#gd?(j7{i>ezj=C@k}Nr2*Y>JOAN*eh3ORGM;&{;J&L9ZWE|>V_0+ zoiGXvMOvBC{h}xd_QB&F;kccB*`^LCc%c8Hl>EomW3yh=dh^4e#qdZr=%<#}=C{8! zAZ=y0u)J9)-$1`^d!t@9W{u2LXtl;BwYrie{I7L*6I3hqIS~Zp_(HtKRipz~4;&B3 z1rSnaAk^mkZy#X6qnmxM$A8zg-o6~4s3`keRfUd*roW0sUYZ#5Z#}6P|37*F&S2oy zHfxON;zo*b`658XuRaMuGsQv@UWRe=37 zhcA(zyNO62i}7S0<`4F*OSe{Msz}y@jN!oXO4$fFxm+T<`(vFAUoZWa4RIPEA{qfI zTqfkGTHbf8wNP{{;u)9u;bVGO5pxXbFpUtM1lo7s2$jK(MDmdN*r1wd1@U3Io`GnE zL28y7uk2&m#?&1hL;SZzC`El2?R^+rvBS>2-L5(?qi_@tEglc&`_d6 z@p?+1*vdfI#?IRxHQhu86(QRVal*$=+)l#3sUl{m;qE-DBz0$1o;3Y3;g&eTJLk)s z=^{xQ!Q;PyJVTV2dzrwuFqOVxn=-xOdQ(dp!XQc_YBuW0Czc#uS{|HMnt1AL#KK>t z5LV2Fx=?>$PJqKn?+cR%c}8C{*e`D>)qVoRL~;On_P2hyN^T_ zolY+PBGc|mANhz#t|JIuu(NY^RZFOGF!zVp|6IPE$944a8x&F*ze~Rk6Cd`MR-JO3 z9#NS{+{_@BVmV%uW@DV>2w{qiFgI&auQ^H#?w1BtEa|!wGquoIf%mRY%iJzgbws6%t0X|H8-0Ble>IBY5jTLE&LwZ<+K9)}rpogG@K>Lv*ZTe~WhD;eE^Tnx z*X=r*smt-J3A!lcH%sCZ(vzeFKH4!5KDUi$_vG?+m5n%hkBEVsy+I}Q{lE?&9(Ri5w&LaX6-&z^hkDDV6WCIE;GOBSIeSw z?gfVJ*pxw?C+mpQsm3;8)`MGP)q-)bIP?Uf=1316o_)I+~nlFjkEDo<<@yg&1M;D22AW3 zb{$5dbrL%uytWj+pBU$m4Z7?NP<)&3Q-50~92|U^u*+-uW?K1@yAvyZ79bMml^3;5KB(VIj`FkR zLx;8G{Y)M}S=w^WD^#(m9_r-?JT9Ga%L_2bscc6Mn!U(=hG!kGJ>*gUmi9e7rh--d z993>uNk(w97GSNS)6q$8GBLE(q+A_(+}FO)deh{bu{b*%!u6b-+96Z z7v^s=stNVw*T^u_bR{#1K?kG)MzbqGMHoCYSLQ-`YDez;{(0C){&ouqhKL@2vai2$CDhHlOf!Vc2)3TvsO0N*W+C^731V3I<3T{u1qT!=PPUb5lDMl*$4`+#L(R`0V#?cQ3UdmzyKKI}5 z166R8kmt?}C+?KL(e1P1s%~mq?9XG4?ff{6vDkT2(Wz>GF5p)Zs}!8+yFZU%iu7M%7|z6| zwj5G24UG*+$E3zh@x$8YE8?of!`jPEJblG#1|8r}2DS2z!1}kV_CjxZ!>-6oU!2)3 zn|#A^p8f`7MjkmmZ>pau7hgL6YzJH2+>_yGCs#|qYB-2)%1kczu?pD`sC@f)eD@9X zSZ?GzIRg~zR~0H!-6m282d17rF(c?as`L1kD0orewo6(wB)r%^Um1qpY2C2C?8N<6 z*Ln+U!w+c7R~|Z4m!wKjPzEA-;sNq+T1ey`5ub#P}}khFZy%1`y= z1&!w`W;Li+muG^?Bw4@hw**fC{zq0#;qp4a!r)`(5m#a25^^2(OrlY^l+)NX8*_?D z(}HE1uub($`>u{xS?U1wj0TT|xmd$^X*Q%ym@3c0rK`%6D!dFkq4L0tKAzSBmTN^u zOWoqTk$bpsp5?vzIF4#GJWUAXm3xvwpCG)VU)jC`prs}UAs~=OUsF7VVvB$LYcWc|{%K9f!b`i(if5l2C}}dh5J0k1 zVOIOR*0Rg(%%ba>*M`p+-v_}86}>s-%<6rBkxW_PYd z3l0hw4SGO=GUBGUpY3P6Nvub{vbAOxUC#?^uRIcny^H?UdY*jV15?6#02H5iUZqt= z*sYzGI^v0!;y_;=m;Zd-|0H!-K!;$7$$41*w9n8vb^9$M4siScpcUQKtS7lvDyE8_ zV;a|hze0to`YrKXq{)kftx>mae51j)h9$sI@A4``6pes(DyXh(W@M=0ln+nTMXLFPK7Z*`Kd%qHZe-@!dad{Ec^PL42pn%W}xZXo(T;>anXVTJ1$G{0!#F{7Bt&K;`jdHM`l>50$ zpELW3Zbh@e8^4+$W(u4~gwEav zqOC}~Cuq3VR*DuE{#}egUYbL;H2==8{627sGlMSd2fRlEs=TRUBW4fY{e^&a6rLs! 
z=*}0r2@oJ(jiL#7y7MLrdg@>0#4NPnKoT9_YDGTxnibR#b)Xr5Wi^b>_8rr9AKt_S zbDWimuz{;ttlB+Jw@__`7;A*ACsH$hMv3ZEw)6V9cWCPyaFl4#C1Z8f?a!qL_li!| z-7^aCU(7r)-gdMGc5MuB(>DQ)kCl<Sjw*-wcER;-Kos*+`bs^Bq0ymIe(TvL0-aHc7HQfPHnEq) zqbxRfJoj-emP})f3@q-X9<}|c>z(hV0D(b=&@wA&u_Z&(dIIU+LaUw6O*3!{CJ6I(Q2Q52sjbfQS z$~?P*w4>d<10OzNuiYx=WjT9BuYDW9oJ8FH>un!aBPF%uMs;);bak4GA$aKX_gZgz z1`oA{4-0`v{pl{^iM)%{{5~3c9-ac6)FNMx#SN-N8y&p%yl8Yo4^CNkgD;p&-a= zJW`~ape)p?VAsRQJkZj@d#pp2#vnK93jx+QwC?E3)v%?4)v#{=s9or0HCQCh?M{uDn(ttR^^oWa|JhdfBS$i4jRVo>OPvu&2 zHCab@hZC+OX#!fTF9bYW-c;V_d}Gt1y2#SbCgAhnfg#IC;tPydm2o)JYj?4|n_;uL zzk_H0s{9*)HFc7R0V_84uk?f&J8Atl!TUBoA^Ri^bzW;3%dQvZuPl^D58ov`mrdUJ z&U3^E&EqAvpGrUfD4Wc%6~&VFUg;Nu^RV-A6qmI7j5tbNYet#l8PyBhHrt!jTpLvI0_j!1JEV zaP_t=nVAiT~N1+FEicpqFO&y>7-CQTIpaOjq5>&pAbRlq^C}j67LQ1e3b!G5ky_C|-Fo z7jd|tUjJ>X9o2NfFGlkh&;t0EN4R1t?P` z2#^nyWE*ylk)_o{oysW7TFqcmYnWg<-mLYykVkfi>{HEZyJ>dV?K08ZN+#CjOeXnh zxY+l#LBRT4Hf>|kuWGCiCBg~~I*GKcu7$~&S-9=NZ>8+pRNox1rvu|xL$~ChNbG|V z5^#HW&rg!TIA+dkrsI@dv7Aff^9Pfnd02zV*l3OM>ya}Sympv((7sH*nU``h$LlRK z^^YRlv~_R4Sl7aL%{4$SG-4;Q_e*WIcUbdysT0AaayKquRj(^MW*W%xj&vD1Z2b%+ zTMlj>YZ3m7ma5fTzL%ErEa&>Rbr6BE+W(nISY4rk42plxno(Apzo;|fi8^U-?=V<3 zFs5G6w|;F^`sVO%Z6JS6*T6bXxB4Z#21pth(njVWBZH1W(qRIGfiov;v; zydl@7&+~;@Y5vd!3Aq*vdvd09z1H^Q^7pqL04^&=Ju!3&7ukjawJu%T4)&A=DZ*@I ziaEJ5b0D3DT%X`(*kEOIbFG2lPCtjjM!$%=tykH!6mTRcChOk5)pJg$OI|#N-Hl3Y z(e~p3M;-~b=o_+6i2;i`XzVzvg4;cL0P9SG?N;-!6^q#0nYZJhPpPWA7sLoBb4T6*>kVHAwr+esBL>|AM?c5O^nP}iQh#O|EO~wmZ01SL6di0HQ+OMOGb ziQ5>)6&D$Lj7X(ajdR8t6zy?V-NXT}f$)9z#<=(9_lc~Kr-afoUJ+R@I)dAU`r5j1 zuWuzGMgdcXUh%Ihzww_8wy|>C?DZ&Yfbl62$4FLl*e6i(ms`ad}{jQ~St6NdA(u!ifq>sMpk;IQA zs&*hy%+|o&eZGLbJVaN01393&b*wdXI%eoGB5FIye-d6Smyh2^8Y^X&=ux#p3mwwXcA3H0oeIhF3oVLkhoSb*{UE-oP61LQf(gNqMcWk525ZNYY z3y^rt?&t>q^~E13*s@o)E>|&`d-lqo64v?+y&QzkN~r`oU8S*h?}CfJZ)a3hJ|hzo zztA-tXKYm~ojWqxuqn}1YOkemw9h)yJiXit{7D=0K7N{y~r1=04%z?cW&)2J8Z5`ZwTgJHA?U{V-J_p;< z4V-Ohy7CO`aI(XXYk4oU>1h)+(Q|Bl{x-EN+Mm3~u8KV?C(Op zgi9s_3}il!#VwrP`+jzB1(9|hS@VCTz>#y?+HG&ZW&EO4*DTS8bm!2(z%SbrB5|O zh=6X<1n(OKH#aWw;HG>5Weah)T2~nigetm}0sr6^{TqIeiniy(lGJJjCzl&)MtcIO zCfD%fu%i3!c36JFh$uB>J@Qwhv8JM=z6S zIKj1bWa&NY26{zy^B&6rz+XMOtO*X~E|hz4OZU{)Ti^VfwJY*<(Ps|;dU!b^59XA= ztQCO{igrWbx*$OPP{dwj-Jtfm^*&@O{&FVg4rdn9qj#v|xdNNzR>B0DXuWnsE-!z1}{8+9oqk}RiQ#lWy72csaD(j4;C17j) z-1%IE_ILua$kr{Z>YVO6p0w5>>_6Qu+`ChkX@+4GnSusBu#n{|0=>|FtY5AIO<@1{ z6V~r>*1j!4X1}s!Ik8^T+|<&f1(O=0T_RIUWL3w)^B&Q)TO_Y(4kMhjX!wrLLvtFU z*5r~%`ySAZYcoZVWzarZULdOa2`>h@5M+*ZMqPt6fQza0?ba^2{%B4*|1QJfWxqL- zw}_JD2cAv&ZxI3@ z@;Dho^oOjqzAT|i*=B6CG58S#16;m)fGnG9qGgT^}8l+y!)86u;QdXYCR;Ht_x1JUcY%X_SaCGCR!AD{mu6W zKpo_o%l2J`C?dW^AAE6F{QwvXSJ7I27pjSNw~=|7yx}+|O=QKa?y5{wBO{-o@)*fd zkTM-3uGmn|Kewxi0ciY7vK@p_D{|%D0?@AmJ9+FMO5?0nTy@>O2luZPwWljdd~1*M zVkj~{dpESTZI;{cx%5FO+khbcs%o}M%=9v0e{2P4vcUP1kwoxL_XnTg=4$lBk6yNd{72p_LUcgZ=SX4~o5y7#S1IhAWKT4; zX9GnjRaO1!%}QmSr-tqDvt2TNv$f#ZWIbSg6MXajoL&^%>&$GD zcfm6boV6G!Lw6ib=HYk~?r69aCXfSV(U}&)52JjpmhNvu14kDhAzW{e*QUqL2gL{U zT&-4f5-9Jn_30rG#k<)Ma!$?zDBnRMSz&B8uJu1V1BJ{UKVR}>i(GmyCXP72QKz$< zn>($)N=wirh90+LH}mghSvF=m8!J8D01}x}>=u8r&%q}54t%}-)mtq}DC;#oyU6RG zh@jO2AhuZAjI-Ejdio?Ltg0k-&GEJt4a4VDP}HCG*RQyFJz_Vl47EUcWMHb_%xto3 zVjjOvh}Y&vxzJxslo77!W+w%kErGDNYboc|AtqaoBI1-ybbK`Opjn`qVuH_4)IU8D zMD%Iy5*4@w9GxslMGS7ye-KADQRxrj~WarAOMEnb;K8rpz128TkHgpkSCW|?& zdkxl{q@~Zkh!gU| zkL?Hq>T4@R3I#M{bM6S&O6vX{WWDuJ$#kY;#`STCA{nf&R?j+6lm(sZLq}a#8?u<} zyR%}(6j$rjdVB8CgwOU?sq?6HD%6Bsv|>Eb@znWY;|PyH2KO3c7ko*13i$ykR01$M*LGB_ngYaC5(Q@Ctm= zwU}HWg%w|#us1GEI*@4279+AI)Ft~z`t~W&G~`fd>(5=eDp&ZjgAoGvyd|j1q*~{n 
zx!L1Hnqq`nkkT|#0N$*L68A_#o2Q>{ac}!@q{r%zqK|O`o1LC3?ipl_xTdd-e{;Ni zX^g-Olp(BYz{u`W-%{-~%}L*yxfGCnc+= zs0^y2rv!z2*%sZ1(FnI~x!t``RF=4kU^=Ea$!P9!v7-%t~M( z@SDd;v<%ZPD7O~dw;IXTnMdJ_0?EvSwCG04yT*4LFI_knEIP;=}{qf3>g zDT_^Rbeas{6_XMl3vw`W9;MPo*!_6cm(lDtq8k@xZjeY<5w*xNDq6nTXvO(cblFV& zGil-L8VirxcISx4w5EB#;k#^fjeSjZ>hZ7ZlgP3y?5oWn!@h1k`6~+F{J1_fw`^H< zZ)geilA@5;N)t<+$JdtMLF5W?QtYujc8EUDZJO~AxcJ2R#;fp0+L?bWf`^GM(K5yL zfHsBTx`e8~qD_jXo+1+P%eNC#hjdl|0a)5>cq$Q5p@;cHcFB|qI`QAEg3itg3-ZmC zMZ;9BH4j|spKM4m0~twJ8gK^D9_QZ(`z(0bt$*y~Deew~>rlZqhcD6J}JDCR$04eQLK+v6i>z$|3 z`qY2m&HuqqbiBOy?I)&yGXQFhj;F!bnq7^L7rDXTSS!cJmVBEw>x4~9yGX;HhsU9n zI+?sx&lh0fAZyX-mIt|ncDsH}DOwP#!E`o%Li}nk9ybly3z&t1CnL+}+$`;G;?4KH zlqi>GtJIyam}M4VoTnd)!0O8CMjITad+I7>*9FQSaGD&ynSZ6tN{mv#v>yeQ)Itm9 zoeo?Tn)V*ts;nO35n(Ot4+i~`5ItIK8pQf|Yvl`KX@}t2FMD{EF%cxkI4FS? z+4%(x_;uX7*faI6Gnt?GTBEAT(7Tmyyjg@F;H}yyG^9V_#%92BVl(sTIl3Iw^6eUR zSnje$VTm#Hd~rvb(bnk)Pki@EP|u1Ka%#nq|N7WZgC%Wg?2Pf(_hqX4k$9k^2t1EE zk<((l%cNtXif@zl#!&-rMiPI?>B`-|Saa&ec>&G9wr>BTt=C~q@WMlvr$(Yua4F{J z+3?zQcJ<+KrkSXVOts(#d@Ps#;h3Oy^xS3Vo{2C(TblZ$JGNAZXFXmAYGNmx-Bd`s z!HDG}4p&g@2o<)xCb8gA<|Lf@+JKiYr}4#rrm$83 z??4WPGMipI`_o;#ZwY!rkIMsqqnJx;(2meGYTd=_Kriz! z(z?~c*YXn#El`l3w?snyCfBu{ItQAtva(Ab%h8kJ{)^|i071=%ebrpbGg787m#He% zD!GySpVMh7L9Z3r7@B;+wQ@r|gb@dG^#So`Mbl8aDZbar(zfdLA+Q~mh{IlaAlr7C9I(GSOZR%czIaAgyki~%X8~2 zcsq-hHr6F*wwv`Kel}{{v4ydf=6V~jYo5ECo0dcfVWF?%%Vg$De%1SI-(m8(sk+kA zLrmw>dMe4;CGveM!PGy=+Ccw(U?ziz4vqe+1Asq2d;G2v;nox){```3Ep0@vZu*H;Lb(c{Q}BDa4u Pp#SXqzb{*~hoAo+yD?5` literal 0 HcmV?d00001 diff --git a/tools/python/images/clip_image016.jpg b/tools/python/images/clip_image016.jpg new file mode 100644 index 0000000000000000000000000000000000000000..05dc68f17f3a7f0554712a39332fa3e21cfac93f GIT binary patch literal 6056 zcmds5c{r49-@X})ElWlrB4e4zUbbRLh9S#Lgf`i-WT(bHrR+NiNn$M7l4J`>L`1S? zEBls|Z7__PZ%94w@jT!0zR$ND$NL@MJ;(LWegEeCo#*+x?(4dKH@TlY3b5%QkO+W^ ziVEHxFfn;? zL8$B*S^1MHYHDgC(gpoUKHa0$$qtK%&|K*GP8epdfPEx-B zQ3(Rn>{K9jDsm&h4**oO6t+8r_bw`G5DhIIn4W==i4su822fLhK-4rKT3Q+!O0+-a zIY7fsyZ?xcIvoef0xakTk-Z(4K@UA$QqPI*TojVCbPr@;JaCYUn@3ng6n0eX_z8Ih zMI~j8Gnxo3ZKTdQBjfWXre+tctZlB@+SxmJc;2{)@$&Y$b2lhBBs45MKH*;C{iNiS z2bs96?3~;udHK&vUzC+sR93xw)6n>~srg+?YgczqZ{Mf>fx)ryiOH$ync2CoOUo;( z__g&7!sd=HDgg9T)^BBh(#1~EMNLBkq5<#dqN2u728f-8_J|DKesvVs!i_^v_BK7_ zbX-PBJp)t@y~t_l-pP1C==hlM(vGxUWq(gt;D1ZmZ()Dv8U&a@RFvX@*a0{&ejBDG4+FJ7R(iTeDoZOum$KGD7ruWo*rFX? z``FHNY)WF%TULnhYL;t~b}}7JYi*RdHt{X9=TggBKqlXeOR;_;rvTp+k_?<0ArX#m zWX5XOLCL^~NJOy3Nn&$xkkQy@tztFZZz{WAH9y!0&-;xL(M* zOAnSI2khuSO`NMAvhr*1!Q}p6_fbh7Bc1j}V>=Ycz$?AOM(Hcn&_g6KB7Sd~x{@<0Yk4N`${N7c(GC zlY#XX++Mi7LdAOO3eFenU8k%vT6?%-kqjL1%Y)CP?kI@)6`r+=615As9=PfYZy*N! 
zk?7Q~_=H^?g>tu=nR%_6gSY__|D;r+C~*b5HQD*wq12(Rc0%W_h{KClVr)0Cv9Eq< z)7p#fa&h^gFoReltps@b*8g;Z@Sivlvy}ef@6y^;`p^9O1`n%K{%OcR3;TE3`mgz8 z|0e&nf6KJ^m;C?F8v8#c7I!bNftt1Q=X>UtbuU5`E`>jSIrj;{zE#%kRr++&S1nD5 zF&8ZarS^9R3;b)yR`cdL`{r#!GT=o9besSPN)s^e?LYA*Q6Q>TQ$Z^wvuAwhm>UC2 zbG#;vV0sP0>jRjRZ)P#z1hcfRhiDQFT4&=!e@{vC$8)~7t313aeE5iwAGKzN2_=H} zhUtD(RUAsXQw@!lF@>xLo4yYdyaBseX#=%iRSizj<`!-y`oa5B4{rA6UcxpvY}=P$ zi!LPl!G<`9_M3HWSLgKk-Cp4~yR~#&x1#X;eNP&7=FaO$u1)B@@R|swUr>mfo;;q# zpBMvH!b2CIIw4FMod8d#Tzh`Yo=GIG%}I2iK6U1h+Y2j2ftCIlK271C0}7vDg5h1m zNb8*aNnCV>2ah}tK&v`ecGP0FPug2;o1h0aSBL-P?S>96k<~hV&C~@D$s9Hf6l95g5=cZ0%WaT=H8&Sz0 zzTh0Togst~fweX=gh?vER4u^FrXJ5gk$9DpCdU}gGiL=Vv-cQ!&$k2D4Qw1YZR+}Q z(Pmk%heg}6-%E>6ZC;f+<*z3ZRkJpSweXB!PPq6zN_4R5>ibBUS=U3g)@TX=TzQ2P zTw8$woHSFJh91*+o>nO;D*ljPz1bo5tp&)Td`ZJbj?2nR_bj42))Z9D&AXH^*X~i7 z{TQrO;qzmwc20Pxs-~`bFg_gYf6S?E4davFAXIgmZGlECX8{MWX>sEPaq>Rcf#S>7 zXEU%u+^Wtpw?92xnKy$Nub5|q=kyk~dykjBOgUA#FwW|jBl~=m8@O~Qp?>SLPmhUs zE+WWEZ)nO@$czQ0u`ideNzUvA+I^WP=w<4glj2TiB{Cv$2z4vLhknD-!`dH|??lT* z8cEq&m&dh;3$P5V7%yI2&(`O7HqcA0>G00u- zwLC(!{S%X8n(Pu&!HE_Qa44 zBOQ~jrqx;1E4ES+F5M*F-qP!OI^$+8&_G#v$xhMxoJzwDJ<*EmbNYcpI;7KYYj)T26l@P0mayXC|Mzz!QE zVnOxihu*O5YQ|eaNU3>S9sjbhZv$uHTDt9R4H@B(wsUc>o!wQC&=~DhBh+w?R7@mD6 z?%}fkQD~^-m{`>CRXL}Ls+?>5doE<7IG!0J&B9&-fk@($o=b7Hgro`5nefwX zfYgk641vAV;PbC3eXdpD+Br!5J)2-}{wuKQWmfCDv;g>|hNjM&<4qe=mm6GLK+sRw zA{KMyXURYf{7tlrz?v@M6GgYDahyZjC8q01(V!cD#uKH93Pou`fWJE?NKdde z*lhWS*k0E4nAMz*RGJV&+SHCwqFr~6w$!+NR=grZW3}9U{cJCM6Rd{B+LU7N;36Uu zW)Z=7(X;4<+fc^LVCpd9-eytR_eA$3_EstDdmWYXsL#pPkBCyBQ#bI1J|ACoN_uYH ze1HkDOU!&Dqa^0)>K6S9pRVl~KS5IEpVy1ncIbWzpBN*(=&$Q5I;#qC3L_ogHlLRE z%w??{L2cXPvK9TCHe=?EEE(3{3UfD~u-Z~y@k6d5*Uu^-PL(Dd#`WD2?=*owmm#b|Z~!ODIC;z{VV2I03YKjoA#m0rjOot2Njjn`+$8#1uxv^TW+IsB z)8VXh$lAM;I2;T`L3yh2O)R5ww89|WmK1c>m*MULvzq=_*)Q4>FpjaXV|6>#)2_If98)jEa`Nxjtuv*&}E-!cSb{U+>KK zDXJ5+;gp!BFeRPoO*QNu=N9P$`h%$us~ez1mwdVQ>M0Ht1i7jjdZ>2|Nz%-kb<#9t&znu~6LOJi1wiIl!9 zPF|p{dw**?59gQo0^Q_gSKfFZcg9gG(K&;XUPr5!(cbc!Fw^&y72uO*6#)1&;)_yY!q6L)&uOdNl#)^-RFMs+ZXKe=m> zYQHa9afVOEw0eTlUP4Km$M*oc-KIbP15$}go>@I@L|~?uabloGGYo-*Cd8kNTg_KM za(*j5C3o?RyyTSCv!p?Vt5nfd`F}?Hn3&duz4Z_7VYbfcm`RuksO51pGn)x4fWVQ5 z)s<$7FF0}GX)A3M@BgmuGbso6Yn|8qdg#J9g%>* zj9g7pnS+s2cKeihN|j3CK&K-8C%begiWF@VYAqY^Rk8OXU=W!^q&|0N_P+}D^`K(Sp8?XTbG^}|ZOEzAR! 
zazGw9^O}7llB1aup%5-V%#`pQM|9wUO|9@0+1}u!z^BO8h|9KYz0sg+0zbX3D zHc)$Vxy|O*u+DXT(z&wp#fEWab~OuJ%5@ zI3OxBto@u>7724)dE~C_8&H;@dGdW_G3z8{^H19MwqWorb6l%m#Wh+}mP1dP$NHW= ztzgE;wsY1*Oh2ze6DFC@pMZ!zeLMxOIXFSOhWq-lEK53zmf1PpnTrrULzw9c#p`-K zvf|wjEOQ$j{Lm++z8RCeWafz~cCCTO1;D56-sXiC7dJ%fUB!&87HTNLQl4_1cZ{oIyt1z;2kQ|-z*RA0}I*Bfa?6C zz_Mp|aACVCaGBBO(*K`{XT>E3(tLner5DMDE2;O@Tte%N?R-TD`d|2pfF%$72QNtS AQvd(} literal 0 HcmV?d00001 diff --git a/tools/python/images/clip_image018.gif b/tools/python/images/clip_image018.gif new file mode 100644 index 0000000000000000000000000000000000000000..3995b48b3d4d0921f5da9eb4320a61e43a507aa2 GIT binary patch literal 70465 zcmXVW2UHW^)Anu>NPq-FPpAn^KtMv1A{atPPJ@rcmtcL~C-hR-L6dOG&+oj?SwIOmxc#K@$a5=j9|B%)B*svNox{XmQzo@9x_V^JvJV2oZ|Uw^7UrfK9UWGc6V}~U6`fPuJuqNrsY@b}Qhg}J^;K?mx^B8eyMVATdNCe_+I_ui zXGf&qv35Q_J^|5oZgzNG{1^b>vBe5_qEAl!n%kD0E|I7}D#qFsV-L(5Ad&E-1KY4+ z^Z2pUwR!qMZ?`2rteZZMT{|fMkB`!AH;-*QD8&aoV-z=^oCCF^>CtX$__ZpxoCB%z z`1&gR79O92kH)vz{jWb7UmuMRz}w;5*4)PMYh&|xdcE6NJ$?*dKjxMi7EPzu*XK|y z-Rj%ge736Gban0A>U}Ip-2?RppSvpHK$og$dX54XwdH0?gEQL<844@ZN+KSikK9=~c1EaR+ z^OUuD3O>g&8o&1cxhY$A^OUxOa_}&I&Bx7cASbMf9h2Ey6~!Yr|t0a!~^7`<(N zZ4HZh-Bx9(8*SHpU{M%7+GjqeI9iupZ%H4sv|Br%c-=8eyaJZ8mAdO=Nt_=W2n&cl zK!;oRmO?k)t$PgBjSsNY?VcZVLq%`Z@2;&4bXOGz_~5&@s#3eRP(&cCyNaTKt+%Wi z3pfy1HHLQ^3pju)oI@Dj)=!d3!z zS+{^&X~J#!R45Zh&$;Wv+ok=?h=a~Un6Nmj2B_3|Lgl*DUX26@5MyhlhGq@a2xNCD zcwN~#Z&`apt1)7{TG1?dE2lp~k=avISZHM)^Xg__GQLoEc(|{Qqd`DR@shQT)6xk^ ztsu1j*Vqd)R}BCirRetY>8lu$XUwi}dolozN~JZB;%|Ta=7L|Z?cC>Js-p)U)h*KynV6SO+u^@ z%YiwX#ER9F30#MEPOZ6_12aE%+0KzcN z3B;1#?J|FbaD#B_Ju|>KmSWm|;b|2hAoF;p3RF2Sdlv1Dg=O3M5(YSI>US1vYGuf= zkj1Om-qN+UkDniJ_nrXRP^=S2xB&-;EJtOCX=tfj_xrZpkMFgj>_?ylB*&pHR*#;f zOXDGSf_ZSC9!M8v9+&I_Xacxl^#p+~^KF>=R#KKKQZz+rAVG*)7kX*ym}&wMw)rI% z`?`CYN*qp%PcJF$X*o!#k2hK2oknO zRYM+aTYxz-N%o<-H}sij9Cd|%0a`~~p5CDtckL5;OsDH8{azeCw*V%IEE5k%`&z-r zVCJ7NF!4WOFhc&$W6NYK<3K#>V_A1*^Xh#;*l_Zf>*20@AE&VKX4t^Sgu}e-5?>=E2Qx!($B_9T3az_yG{FNQjjaWM(dUTWu13;us5<50l z58R0h;fr)O`B!%Q>R%h;e#c!2aG?Ra+H;O*nFMwuKMznd16IAJmUi;_E*1{y3(|1v zHRYMk)CDi#8JB)l@c!79zZD!@;}uR)cnty|n~%DL(e*=j~gnM!?$`wjTKgG`Khb9A)>+?r@97<$we3;-eBZI4 zTH;xhb>2)vVGBH=ZeH*dP;dnX46hZgjFeXfu=;f8VdZGxHV|?gOU1=dqXJr#0WY zr>Y!aa#{Da+;Gw9qSG$*ISkBtwMVSs@{ia|U#@;h}kAzn0#2ky@p{Hxll<2$;1y;yq4V}g`6VwV0j^M2|wUes+6WQ5*oiOCS$s~9m$ z2+PTeSrIJmdwOFnG`r}Suf)raA@_!Fw<`kuDKBPTRtfD?o22;~_C=0eo^QGbPxBv4 ze*GF`f+|S?677}_XXNMZHpYEwNNLQi`aFK#Y%0J&07GqQ|9mNg?O5RNpq3CW!$!+s zFXMP)!+cY;-;trTi{7C*Zx{WvD4T6oaj{dcpXSAOy|=y;mBE<*`s6BXCv7Wc>on)! z=iTnDxveOEu?5u7?xzpdu8|M<7m+U>_HY?&#qr>#r5PB$yNzGc!~QHw@aPQRBZp=c z{aIGnPl6p?d!6MCHE}ThqMp^O?u>u!47D@6{jZj9nWGW4tV0H@x$6 z{Gv+uokM>&^qe2DipIk8KmOS`{@16-Ay^W`xd-vAL*E(!SU!Z1rwMA)gga<>c^W&4M!?aqdo+n^ zh)fBPPWj(vFis(WTZQ zv+7v({5V1qq%=%dou*UFp#~=Ko_XTuMAY|O&;HjutAp-6lQ;gG3Z%5TEZ_BHI$Up^ zin{so#_>t*3wtNF*%EhF6Rvh>|J_cQjMux8uSYev{d*JQyn8?~EcpzBA8Fs$ zecqEZ9Q`sZ>hZ^vby%IVyB9WY{)f#;WaH$}a<7l4AvAlLefXtSMJk{u0 z^eFjEYF}dg?TBUi8ELA=V8M~c1$q-42|p(a8_pNUxfBol^!$Cf_)TZw-~B1ufdZ#x ze~;D3+%b*>_u-P)ucCI&mUPJgd_A61k^zj~A2cHSAEiQ23x`5alom#3F; zE@@oB7Or@Q`$92on9+|YT2*kK{dw_Od;E|4Mecirk2n$#mc?xw(GBMjTnrvp^V@3) zj2qgei%yBp*IbzG@oVgfA|r_lrWIGK%XBzTJunNL+$i@lw{qVrs-6m@o8LZ%bF1Dh zia}K*%G)Ask9-gHJgwlVZG*52Mt2R~{_@@7-7A%mHJ5J#RI^_$Lb8nFvq>tBd0w-w z!<~n2e)rIAvle{idNV?AP!=#_p$ugsyC-=B$Q%lc!pk~A$tKSA!a(wVVl~auISqb4 z{H)(%eqak4XSjZc! 
ztiJu;?m6e3%)$s`x7wcBn$LoDdC2IL+qIc)dC99aFGT7ed}vtyRrF9ga_M_i_Eqk0 z3?5U}ywSV4(f4zsbZgp>)q|X#4<9P92|=eJkr2 z37}>!qbX$A_wP7KEXpR6gTO#y2?i6rf3l z3sF%#40senRG$pTs={U%*Ar&7_s!T88xK86X|aD+dtF z4(D++0+}#A0JfY7;J0B>3sRT$*g04oL^512n8zUp(^ZZnsGds11qBS7B?MS;Fk#9J zq<+O?4Hhbz(GgCgTzb}}frY~xk#|2n;>C7@2kT!)QN^j*8=xxU?lborSzyWu!S(2e z-cqP^wZo9UW`Huw)=E3N0$MeI(#1 zOo$5FtXiy6*rOUbDSFy=RG?&%OJ|$Yh|J#7iaehaAS3fgrWI~W7Lg~zWP;JE)S$ep zJlr&dhbnhRCKkv9wE$GtLXYwna2bMgW+U|=Fyk9AoC%CC!W0(3G!{$;0@ax?Emc@I z0Dlj#Z7(>AazVxrL8OthHW=+S}Z| zOf(M`l?Dmp_8DB5G`PrRSUrTpUldViMHy_r)BxeD<5nph&e5^La%u@`189~Kz&XL^ z-8gQQi4Zdw`?DFxTwUz5yE0OJj9-@ZVl< zGTaJK(#Yg9!dOHSIMm5{;sDVmbn1I@uJoL?HG^=Xz6Rq3+hsH#1;#YHr+aBI7Y`PR zc~PJGl3Sc#)8kH5P1CD^2YWRjkPG|2#Aid z%oAG;1*u+xDsw0Yi>pf zFbcLGqQZinyK9rjyCX4#ZIT8Jtx&c=U?O4}@b}NBG|OP!WO!{R#uorhAi?vkuR#Ln zvj9uOcHY?TP4)2uI?(ce!FQ_D9IS`js&E(qR0Ci$BzWUEV0~6pp9;%F00I!#zdCp= znf=|h#6lY`ELoVx8%(U=SOQSrdeHh<5VHU$Hgddu@Vn0c^9vgUlkyYe^Y3^JCmlYMB(i^|BFH7#GUaPjlIPKs0lg39PmW?&bHVSS z&8eNpY!)vp7@c4vc&Z7e8FDaJIW!1>)Mo%C9*A{>FK=^yd5)H5?Vxh!`50VEfUeCu zIMxRC=CcbJ4EpiFl`@gPO1LNIK!6AKJy<{ijOkW|&s&?+(qK3y`T=3B`yfET;^bj) zoxFr8?BSBg1a7K~3(f6iNdCO>#lwt%2;7Q@+{Qpq;Z<)IoY}S2u)rG(7skS@R75oh zhqRa=cP3mP3o~l`q|8M79-wa=ycqy|7#iDKMK(Z4?9yjAqe{~dR!ac%32j`OBSBk%W}iq;ca|jK`Lzi#ADGG+AFOe=m4E z&Qux%Ez?n|LSki|q=0FT412Gctu0GY}5(tE05FG?B zKd2G}07kOi>Z=^9r*xO^y&~3y7|B6``dXZX+yk|B=nK=77b|u$A3L+|IZ*{C7+8C$ z%39YcRsmQ5ILj$s>szS7Y#l(OmxDF(0Ge^BC!G(S`HWrx3F$=9apip?{uOR)Vp=-o z!TX!H-mh(_PiUU?0%7HZ^oi)*fcd1hE30$gFCQ-&JSAd=KKqk7Y}`g=}%-Wp|WAvICe0q zX$>J@w@v1>&?R^yn_&?HG<**35XdRN&c;p}^fF99$Vtb-g=8$q$zld9*+X)r_ zw^0x-619y2Io+u7Y&>X%i_pD!Vh@P&?I0GAbk~6jydA@kQ#Kj!@n$bGBZHAxg=^uT-J?*;E)l~esIUPx{;ZHle+d^{J&7u1#wcK%Yegw{a<_l zMSjM8WEK6RwYi*xrc6Ayh->bMevnLDE;4`7Bs(LNWz2(sonJsWxtt&aNZ_a18&7YS z3kdgi-N2KJ!}&Cca~A8w1i;0G4U#&J8>)C|US zHR(Lm{qv~dacPS_6A34L8DU#L7w{aseaBIE)wa2wCyFa!OY6+9+^b%y(HIP!v^Yn)%)L?(21x{%T(G>i;G^Tg zL^__22_W>{8$f!2T7+Q+E&?wtjt*T|Wy6M__vQI?mFk5y8m6M?Satz8Djeu0Ar=s% zVd#fb7C`D^vK^f^LOM02IwEZNMsW}>oRPRMo(afkG@5cE{9)jVA)gj?5Kh?Umh4DH zMdHkKEe`>FVikBU(R6`+1Sn%=2qU<70q2=C=M}>kE2<%v%olyWRsj0|3*h^wlH^(; z_R--4xo|T4_D_giE}xfAu8FpXpyC>EaThc|T42@!q+wxru(#$Hi~;a+UuMwd+vyVd z1TB}cMI?Ef45#1CJl)C!k_a673otoy%n)2`frk5IloHipSVAT9TSEZin7Uy*j1@A=%gl2fzUii^#?vTqUEr_WNCS*_+@;9L`lg zb*c1e9wn+$hT-b9x5GlwQw<#t0f)4>YAn&^lZ7)<5{~mfRXQ8r-J<~8YvYNcH&JB_49Ww(6RE9BT~*3 zj29((VFLEi@EVK$ZJFloK^bR-I)OYpm%*==es(BDU;dVqr|z%4hRlZ?fgKW$4UFH3 z$aUox=(6~>zd5ewuQ|8@yI*4#0Wp1WZkZs2JiU2mPLdgIJdoU{^I@_oli>rcy%UVX<@ zJSo^;451`?6fTX0mcDEXbsl=$kR5O(uf_encB1%fNW)_*iKg>&MvKpN>Ml24e7g!I zN=p8H-w>t-aAzaWBz8Nu#mmyN&^Kq;goU159o#Izkg@CA1x@@xM>xz0UT^Qe9xw6O z4*9!z&a8!u7NUyzWiaW4n}UE%KIM3zvt$AW*@%Dc;oBxvx)`M>+Cb`6t)|k$aAgNt zw$%re1i)FdC_FBx9uv$B$1x;LSM0>}6 z3Fe|7|JygrpdZ~cpSC>j_$-2_9zGsd{tU$x&2gW((%hoIownt+#Cme2p<915`GnrW zrd(&crt0a7&PA^uK2-p$HXLF6mqKQ~C^$b`w*Ms-^vZK}EPVRg(6)JQ1paxA{n=Y$ zQ-Sf5{cjY)?Yu|sHXPE^7QSHhA|+o+02IQz)IuKjbQQ$lk&N746rU=r^VO$^35KjZ%9%&i#V+U^zmKnWFff+KjPRTlE&3i_E~F)rY?{>ojqTepKTr zE@6@yF|A_n$<3jib^SAOZEh`dHp=4_h z0nsb1)GI4Sk=5vxqxVXs_KMIctDVe`3v5=bUgecum7U%r2qRVByA@oGZ23s_;zW&O zM*Et^MjAP6n)JTNh`z|iI87@f_2NEVUn6Z}BfV6k}Jwwa>23*s;>cYQ=~X)iA@9|3+1uU*xRqm49H#Q(UX?UcO?0H={z48?BL1 zr&!|cAXgR2vx=?xuprt3Q~Xt3e9wpbM=G&p`q~1 zp@>Seh=cQxRI`}6p_m=BsLG*h0>e@LL(ryKwDE9Uzgdhg?7I6f9c>QPnI+SQ_9Kml z6J^YAY?={ORr~SiqV+_1RJS||$~10XAXwy(pY{+eY(-UaQ2Mq$ck=`+Mip=OH$Duk zZokg5i0;$8p&A=VI9iH+7^scB#P!fkZ=@nzt3+euZlz{v;K;qvgnRv(HE%7d0!NB> zp4P2At*)~?^Nbx%9(gcoQL)p05KOx}YH>z%qc+t0|-NA!f-%V&%o zO>d*l68f{d#qAB1BNeG5wHhbxD?JgHqa*z*Cn}9Q88Mu-GM0~gtKK>OO4ZJ=-)8U_i1j6q6eFkW1s#cf~+xsdIXGM+Y$gA 
zi7&GE8s@c6$tcuy&y4N*zW6!PGQXF&CjjhL5(}DMJo{xnk1~8hIyqXyF@#TVh)Q|V zquQL=R6xA>FY3WuJ~z`AJz-H?ulwkI1cvukTGi=CwbQKMU72SKZFn407gAVt)yaI< z9$=Jh5ie_=p0{!8Ei7ZC&4jX07!~|?*0aL&UOPEd$#dTBmMVBn?Jt{`kK_w`Q^1nJ z=|bh+qphl*@~GiY!w96+WB;lmF?B++w);|_Y=mBpui4n* z=Ek$bSr2=a(E)>X*@j=MJLT&$z&2I3ZY;Zp?gwvK=`D@ydA3 zNtC1Mz1sV=c9RN+OV575EEb#uKNeyFGKMPD$E8{>IvDnwGsZ)>#h7gU3`oi*8KDRh z8v(?6C4?)WctiA`{4xQD;une&^YXFcSSJ?`wmkqMc7O$CP(1)}PB-c=Xn<7E_0=nl zS{w1`M5pt)6-h1PI8d7O0917blL|ITulpmNz(&bO8~`We49Yvz4wt%JD+Y!E0>$=> zv5m7Mntz441X`397KxiP{>#=;D-z@52i*Q1j`(?aACUxb{WRt56U$+iF0g=4uDHwM zv`&TBx&E(#{G$I-oFm%qxRof2`8(A`yhcb1k6IYY{xub+SAhl_+%r4eEtk``I-IjRieIHF7IwT+&DD8apM0*(-RINkkdh)t)zeXI zZzE(mroLE;}M<1LXH`7uWYIT8VZ;Ax;s4< zxQ)!D|MX&TSI2+#_GD!uLh`eCs^hu6Qbb(J=95oL>sRr!;I&yyU=QE{q`+-ZAE4EG zffWo+4@NS7b-WV}5DErD0D3cnhKb<*fqRth$be}jxRDcJP7EPjV)Py$n#3n=nTf#C z*IX0ir{NYT@QN;Ysu|MXOE{bFDd5O~#vN)9@!Vsi;8p2QMKnif_1ten6lHJ9snNuT z7gF-QaAUb%6fAF0yfcf2S|~^VIfR5o1z9`vattS)ZM}YK%%?P*x4laPeXJFjL3J9Qr0C_y}x{r&}*I11!xWWNUbd~Fi}le|NwA!Hr{ z@IUh{_s|`^15tIq=E_)xd1J+oM}01+5RBT_Dt zsmC^G!GYgubVB0bJOD!aADU_kdgXa5=7ByeHp4u+SVSVH)49N|ma@mDSH;qQV0l_U zIN%)2_8USl#p|96q;+eQix3BTW2HVV%ZlY6gog+!|P`Xq${EP zGh(vXOauxg5E4u)cfEu4r;D)QRRo*oc05B=ZI2z*0V1!9T$_{P#z7K5Z+DCFP$-@p%(x_@-5;taINex(MbIK-DDnYKz4osq65HY!cLGtLx5)L zLcndj7BHa3{e|cbTEaS@r9Im2-Q@CqswRfTwLIKNKNe@Vnj z1p}W>-s~?37p=Z}I#T{_CWLyEc19q5pAvNQu}$_LlcJDoRDo?S-@Qt?C+u2YG*2uN zeL>3CW&8UWoQ#F@4@QdqQ}NoQbwv}GQa9&FnFP0y`h3}*SLlz z+;nKK6hvO33EOafGD$GC5sb2cuKF)|dF$fEqWsJu*llQ^O-8(Y=jsw$=9VnqULMu3 z2$3YjQGLn5v)uj-X8zB3#8QoW0yM!_0A@m@ZE)i=o7sbNov1^i*hCyT#*ByfM?|Po z`Bnb+jT)uA;!#`aW6%{XR{vD4J5vFnlA!;jQ zYmUC*PH5tWogNC`JyZ`mj4fnl_c>Q945R2Eo7)k*7 z)IX%U&;Uy$9|J(Xj^E8ojs{S?y7^t4jPnU)NG;t-m;jlH06YzSy? zAhO&$5ZeI8&Ri`4J5{ zHvPoKI|4IQ9!Ej?tsu7b_TLWL>yH9XDNFCbodH>Wf`x>1`M@kyyfv4itX?M-&*O8D zQi{qRCRm`YGi|xh(l)GEj2O^3B$2&Bh3?ql$i`@CyyN7)5sblRctlnEXBypmvUjw{ zzb6xmnqG*^=2@0$G&?f1X zIIah+ED#0lnHuRfC;Rkpv!|gdY+=6NPQ!pk7RVo&wta|0IiK|m{bYt}C@ove%#O!K zKom~GFsp|hkG3^_V{M+jn;v&GG#jC=;nmltaznL4;7~pbhUk%M8NuXOz|1}Mt$rLt zinBm8$}jfMcZQJKw4sIN*H(GRQY{vs+M2N+SXO`(hSP2qAbRt(4uBpwK*|AXs-l&D~BA5H)5)oq$Fj zRoY?hnGZ-PxC&~(0lfRlSst+n6zpqb%2i)7`}|HeT8TUu7z!25=k%ZmEvZitOdy&t zZj4~UeEK1T8BrMcs&Gom1v;kG2&l}1c)z4u2qSq8-&r|GLKSXlye)@x2|g)~huBOA z84MWkByc5OVbscx4I2=EJbzS(Wh`nYAjLgHN%!27_%op^hLTm({b zBmw=Ot?kIpZ7urUt0A!aGy}Snv59 z%oF>w*7C8IsGGtd?y?|B8FozBQX;rvsXVV`@VLYT&Ojh$Q)(FJHS7YyMR+fHHxKm~JfZpc|-`NZ7GW{mpwS2H+byZnj zootK1;*zbll@(jQvBSgPKb-UjW( z#{x^MY|&))X@wMpk5cMgUFZuw2CMm65zkx+ok9x5tH&m4!bTW^&t6_(_-}nUMZd>tITA2=@~vE0AK+_bT!vva$`)g9?_*R%XH!C3 z9(e$L5tt?-? zFJy2`T;1L6c=fJD+YKh?A2D9FS$o^P=5MZ>n}G&Xk(c}e6$#|K%3`^Qx1Umz5caP> zrM?-M;Z3Gm>tc4%gEa;! 
zZ3`af7o6w01v0aG!qrZ_;6JB?yxtp|af$a!XTVzr3!9kGX4tAWs!z7SX;2r!_KyUh;%}Avq)k!kUch}#v}|Lzt4w|1N-dF>aD!ooVUr7=0yA31rC~GFYPBO&7bB+%Sxsc_6f=xB40i| zzo|-0r;3#2kB`n}i+n1aVLM}>Gp7`iVZi3eck-Dsn@RnlCmvo)Mrj3}n4`WyJd0-u2pI96G$sb>7;q z^Z?q6VoN95@}>Av7T**&bt0r<=J?0T#64rmWVK2J3_OZ$Pr&kp?&_BeJ(t{<^)kSZ zM>|R>ULgtWz-W&8@PdD>%T*2INsyH~RH)P$rZp<(?@My+&N zRysJH~cR&=O zH`jqQu(V!(b1u2c;ZMaQResl$Ro5rpY+Yx7nvqiyZc5@Sx6UurbI8s(4Q1(W%{M?vK_xMD^!QZS>65Bzk-XN?um5G*13u?KVluJtoEpj#x z!X&(S3q9mKi6Moy%AmK3+98=NpJ0)p=uQtEr?O!0r%{R$QEPmcL%qVC)^zx!g`$=O zqr|1wPUv{g0(9p3D-V1^-&>slo+ydmv;x-@5%sK@K8q2$%4>C4vtTsrtZXt3Kk6${RN|r*w44Y5uG|HkX`3{#;_?w%|Cv z>i8=DqPrlTEyV+oyl1=lh>N(LO#IDh6Vq`)Uo{!ibl;8p&kkRg za5sp@VB;?mOv;WKA9sJ{nCxLYT{-DWH*~71^N{h5^~<@eSYapQJtD$Ty>MUULpFv{ z`s;o}jj)oYZ^QgXxq+`I=dyKg4JY^g>BSTFB9-+GmUrfjkP!us#BL?zdAa7zwq?&& zpI=XmEuHHux3|H(B8xq!nYtf*;iiV$9nYutW=)1i_h)@fZ#Mb~G_rs6Zhu*4KrfMC zlznh*7Qr2TbodN=?PCk^au`(NR&ufRu;*S$wfFhXuQI8VCik2+OFWZ%4P;G<15l>+ z|Ewjs%FQmMGmr7nI{suI;$-Rt&hPnPI4i~$?SWV$YW0h)M?VY&GU_yH(iQM(i=?saqT4{7pI71w4rF_rc2 zf;JvMO);*#$lL#EOy~qcyMHu9%A~U1#D;usai22b*YMWO$(UU48EmZQ6=X~eS`PX= zQO+LX=KasYt86iT%CK7GDGNOr&$Hh1hkEwY3Xh-=xN4IklUZ;zWp>@8wr#BBxRby6 z`SsgPRWm68NwEbMm&u&L(4AvJFPxMC8>Ve#{j8h7s_!Rm*2Qxi<+o#(J~t=}eKs7I zdj7oK!HOgMz2BAzaM9D3IqOFrciZZ2R5>QXJE^kp_cQZe^AWR64(`zm4c7;r$AaE_ z=Ycj5rEKon-65&@OaEB8*(3IEHiJexKfTnV6w?}G(zgDtCGV=#hko!^>-9PAvX*7p zub8R;8>M<`E$gqmzDJV?Dy;lf&sChT_IG5T%+ja?S|N!)>2gYl>m^L>bznyu;)-D2 zLr{>@Ow6AoIE3PDg|r2x|Ke9YUNb|iP2W~0BD=_+3h64-gWu2)4rhceAuA`@ewrr# zG+)csls)@&nC8NyO)@p<%%hBhG?Ql1WYd`lGHTsW00H61oLuW<6pQ>|8hCDu#E_eK zPZ8M3BLJn&Vw9gtNEwF!Cdat9_}QXOmWi1V`>X_#*wykoGJn>HYji! zE|y`=_4R^GElEb*nA^Yis5^;M9#q?73g1xcuLPu{$q!g$6nRKxilV7!LNQzlB4|NV*X^dM1w8Pf9{TZ02NTmp+fp z5Z*6I+;77)f2bpd$pZA-G7zfB9l^T}X>YU1J%|M+$$WYzniq#qW4Z zeebuOCztLE#QNQjN4zaXxm_;?&`QJm5sm^HNbWky{ROggB~#lROk1H>J~vjD3rhcS z_gH-_aRqkxZGLCBDaV3^^$Le*#%GCj!=IZ>j(j$^R!UDhPFQit4F@6cEdB6jhavK; z6*g&+BEcYY>_O+dOdHIHFVrz}lC=0E9JOK~F8BTucB*s7vISo!;3CMb1PC)3tKwjb z-tNW4q)51^aX?7-WJhF@eQh`svuENLq3jh5ImSUMs)?ASI1Y_oKyN%A%IV>Memou0^9l1Ao z{p;%z`yDEU>(V#REAb6?NcX)uB<#sg6bKHc(v1J3=uF(9`WiTX@9Y?iee8p=Z!^eF zW8Y=Tnl$!3L&{p+v5tL^vNy6MRF;$^jU}PfkVNYg6{S+CwEeu^=ed8tInQ&>J?Hr@ zpY!7s7CM|O!{da}9j>FmM8J?-@f40O*uSOjfQi z3#zAPD|thBj>CqR(y64d7%bYkbEko_U|^t*9JUxPKTb=BK`Vo7L;>M@7aMT&tLS`(j~`)a}FkOFJdPqkZBZj2?ONt zKw83LpG_eT`@*jI%bTkQ4d2SY&C63;%B{6c%iV{~0(ku$m9(A%3`}}Wnx?UncFE|; zZxLM|zIblv`6qCkTI)Tbio3L#u2!35{mQ(cBy0+dAG)Jd1te+J;C?<2B0@@%{sDaK z53bV|)cjlPW3hOMNTcAtV8Z<2FqtU<#}a-D``9mDyNQWd1bg+$Jqx0dEmXnZ>N&Mi zICIaQzAAsP{U%RCp8jVDq3z=Q7tJ$FL9@z|gL;i@`_gRbOPE;;ZVo?F55>qquqi zRPJ89-hBK#@<}(S{_KAAHpCY`^v024U4!(y0q8Sk@s8O@i~yoT>*4~E&zNo8aoMft zxm?eGW;psH`ve{FacK!!E)mdYA$OykNJ1fhD^n#6LbUqx({5wC%lQU@Q+R0GpYYu5 z0}*XWpAx{vEu$iZB&d&|N_XRRj6ct~opy6GHL8n2Ho- z08n+nVJSpgISE%@UJTnBT_pk;EHkM}bV1Qf(_!h|=01Qm3?lyXxVwX@QkitMjTWWG z2*5QDiI*&YB?70Z`=r7p_1mAEmZT{_)gXwJfwck{<|0&rQ&2>dZJybDISTPB3CuNb z1mA%fln@lApS-9Mq~J|^W4M~=MbrXBqEJN8WGn_pDXc0!#93sOcR8i%yvqmUCP<7w@CReU&j> zl5F8{%-(bO&GNh)Uk#a>wcGC-0p%%d%Ej=Rm}gRpIW$(K6DpJp~9 zaRB-+5aOKM2_vzg&6M{Fx#CP77LxKQi`tNfAcSHq`GssM<*s1?$S;Kp5qN!_5*;1E z02e?!68@>0ym5iuVb#_Eic}nTl7Nd7knlOibW~0vzV3DQ#sw+?UhLqh5B~uF~F% zV*gAyg<_=&t0jamI)e{@gP3?!2SL7eUmZ>PGJ<>Og$&w7qmf zuXqWMwba*|B=sc(4RB19V-P+e+;MlZWf7#v2A{**Q+e^coe4l~#3sS}v=mVh)hpx8 zGb4T}Afn*)2pCRqi>8?8C}z;DAjMB_Ku~_7!mlKlc^&K@Or)LCeu|!f$PRvrm+rrg ztqrLz3&a<&6o}0`;F;IaEOL%)ASnb8c8_x`H{VKg;JF3r@@bt~tRjKprfi-A;JMEc zX5{!DN>TX)kB2ZO*2OVj;2i!qWA@g9lSp_C8HO#ff$2M1{dpTDE8T4k>>*k4*@p>) z<8**w7M2Ibf%AD_lm$DG2w!rxxC}4qGs)dFPbo~tOJ_A0zP1p3aQD%^zFF`FT9 
z$$J=h#^$K;ISxhVCXuSmxy{22fgaY}fv>B1>$-B`QU(xWGCI$TL@@%lauKE#A;1sd znxrI~fvK=Xo73QA4h5DIDywhI&i=Iw;4CQia?wzg_&ow(+yjw=XmIrs3a_z0kK|6M zhoha;JTotz&PVak77QH6SZsmu85JhWg=E>&237!$Scu1;Pb!@P-i9)9gl79xD_b5R zZ~gJXxhcly-3TlcUMk3*MSkuW#S$2BrzvPpCg*_D3sBf5h$4KCi6Ed@hz2p+)#FXfr`J#9 z$)pyg%o;5;PS`J7)EKy|Hs)K$vii!a3hWq*SD9*6HEcOt^CAI!TBPjMB|g}J09sJ- z5cWe|s;1=N=iGWzziZ+J1M=%4ZV%6$rhEABYSsGzJ>27y7--V$*GJ6m&Gy0ahvD`E-5JM_FL$*+cT_it{lhN$P$lNsJ=iZR|9e8Xb;3@L_c3BVISsgwq<-V3OjWSRGr&b|E61_r zOkXe1>E$?z_x$o^{eZcl?waz5*?OT)!Z1ThAa5r&<&-$117XL@9BMnPd zXT7GbD~cSs9MsbEFzroK>EXwEt?N&B-ttbIxpc~GHjATmejQ$9rgyjW&*Q`TrUhoa z%G7;{%1#c~uES4ed&-k`S=L|3|3%)I4~zK6UbG@`KaND=pdz zFd<(uLQ!;7XOBf~&L_#tfvNwR2Hji&mL@;th}`Kk;puy{6dAzjH`UdC!#WjPG~4iQ zSX)P^CQ!@ZA%V~T^4b6N4w%b^TOOEw*dJA&rm_0x%+RUdu@xcHnW-ZU`(^9VhX%H9 zACX3R{>i-fBKG?M?T4Y>Wq&?I4g7qg`?p8=;cB^2^5LJ!?vKU<0d%{0hmH{~(dxyg z?~{26BL&ZY>uTRW4@=7V72NrknrO8h&)h+9;M0kJCq5&r1mS}B+tl`Hp-8PK9stKb zb4`29pS*6luli4lMmT)_?_}Ep@z(eMUEZElURg@~qk%K;K|ZG?hM(AnPVz9in)vVU zzP~k>e+PA80xq_daID(ff6v1ZUnvXu5DdqPb57K9O#JN2iRFY^4VN+)YCq!d#7%wt zVNjMlmaXOAE_mm4pPIwga6!b(>tkctKgL3Tye;K_%#3i>6%h3F7MNSsZ~&78RU$t= z#nZwK1aOix;fTa%dKE$ro~bpe6QQ*+`77P8YS{PR4i_`t|qdT@FJ zrnJ~j6%{M~pNPr!p4*rfy}z5QJ5sc+z*tN*?)Q+cEC{npPg{Gn7`i&>vq1?vZBD) zUFg9`;7{Tdd02n>2hYdiP0n$R7$b(9xl&9EHpNnnVuSL}Par;`S>ObG>_m;OUY!L0Ex-Gpt-w5sF z|L*SgX#W9C>gK}d9~uTbhrIl!zj~C51^PS+Jcm*qQPb{!;N^BHhT`mBaM)X9>-+r$ z^c^@s#9L{vUM%AgP#_Q}=o5JJ=-Q->+uldoIPbtcB44*e_m6+~FBN-*-F*7#yOqw@ z7|tGTQ-Yvc+rm6f_)dlW2hva5DjjVhWd7c?gzC;-l%8y;M*jz%TAK*@rFjY)+ zrKWE^K4NaDa_PrZJ}bqo*C2{txyNY4fL6Ys1h-wcBHl^nzc$DDNjzZ$Ocq>88;jdH zOAL*A=xHF4=DC%O&d$p2KzfM7D36h}AKcgoT!Qmj zp&Szg)hKYT9zuzv&8UR!LuuLc-_Jj%XK!D2nC;ZPQU$ryeR>sDYm@)q9i!mgIXor~ zmplyTcz0D}iF7e3PgN#ST15#(vBqK(Sx`8y&^jquHaauEcgA4>(%4-5R4s; z_PvhrRY=uh9SgP&8nzYI0#x1E$f*{LSG@Pt+io({Y{-+OLkTwGMZv;J~P-i zNhlF!J6&nv$+x!5prRHXd27bdfeJ`-2u@=ID2xQFTVibqil(Ad-cyP`q#&lPI^U+8 zNN3HjG#K3R%t}Z212h^H{Q`u$vi1@p(2msSD43HTq1>{v;s^!nN~vo^z#}1KJgf2u z;PhLW7lKO&Q%%*l2A@EyPYNQVr;?x#VSwg$s6mgruJr{2=tOFw~6PWrLY!O-@{wV zN=q{*1wuVh5{zewp%mRFV-a$cQ(n|C+-jH}>%xvtv?+}Z5~!b6Yj;qJ#VMzGtY?Zt zAcQ3>s*Zp-OyNnfDg&&}1B8SSg2{XUZq3oZraGpzFM!V%_H;ir7{bot&^W4-HrZI- z#yGv-AR$@H5rI4BrrItpkp-aAM)8p>&Lk+IM(xZAE8flLWOr!(_rvA*puFT4_$13^ z$fN8S1cO1?NiI^QAl<%Ba)Kf##X`GM;4}r~E?Ps8dnjwp{H(BBlwzF?!Q20RShzHD zNi_p~uEa7QR$@)Z+Om`(cu81Y2mv_)!cL>DiLI?q6nRgC$z(K(rH!Fr7%PG`FL4yX z>s(DC$lf)lbADQgRhy2`}xFlxb|2 zXbNvI1ixYiz77LuTv@p+#3zubCXLu$3+cIfz?Cu-05B(k_pu-nSaO=mF`nRh(Ap!j zCvKe?moj+e5!l?AmF-xfxa$5_lLkl_aEJA|6UwxaT6d+NHir} z|1f?0&nyTiAY(yP%6pTi$Gj<2bjOh7pS6S-H9&9jWQvu3Jb>_0XMWuli~ocAy<}`i zP>%lwbAWEpkH}|?)#*_L*PfaonpGQKdn8kk86h&;?{ZF~S=-}{5C2N?lYiaMMvBW4 zpZG|NWP9xB<(DKA;L)plrMOU5wphwob`8*w6!LE3ndPka>6iUk}6n;Puz%-P+F+M84$=@N_iEBn0!kJfnj(A0obEl&98 zqv1q)mu7|Te)TNyEwaW6=D|Wwz6v!0vNKp`UXPnHn-6JRJ2Y}fUww~xNu@%cWB{C! 
z=^s*xrDF+Bmrlb_&RmB{-B6LR5J8Z4xS`tdS3Jt=M8)?X*oOlH&!;oUL zQ<8&OV<3m((sLWb;=@@-5hBsi7%jr6Y&+Z+QgAOUccz}?)15rPr*Wi3f(akj&DLya zwbxQZor0xE(Itu~!22ll@3yM6W41-yOBbzrb&qx5|EyT0CCGdw8wkmGQN?r!P7_>| z9wgbS7BT6M?D*^XY7LB(dPYs?BTFTPUFA$x&|HyLD9G6waAa zSLq}{0>bhHG-Qdj+HCZ8^jb=GpT!;1NY>t~tKEkwB9kCGmy(-z~YWBX$E_ubOHoqQn4cR`YGFjRj=^7S64Pi~bC} zGDf=3xh?s*WC+1zM?!LN{tBL6XNNHWb+|K{9(ubl3GXPtu~o9FQuNArgJ&!5^C#=Z zzQK7Q3UXw?SQtP(rSi587F@>ONN;zY)!SPhJP_;nUJ*lkHi4xSSuY#Kp#gr;YY9Me z`nykA2b@LF(0@pLr~-vo3Up_)NiDXzl*EKVlI=NUiEyamT2IAcsL~g1`rxfa|B-}- z!8!RoUvxkL$x`eVW{xjFM4HDk*G*v>1z&*1lD&4W;IhdkS==#Rfe9)=d;45850F|i zthd(vzRN+V+!e)=!60^nAgQMZ&POn?Mu^C-WkcxCSC~0?nN}{SBOkkaeEn1d$9aWW z+*jUpYn6bzgzvVOyL=OA+~U9L>PF*I(R3I>aEVIC@?eFmF*pP}8|_Nk3WcFKX+(h_ zXV|y}z6RJ|Z0=>36A#03tg*zDe7Gwc_pz*&`T$hCWb`Q82WPj2ftzcK%f z(|nh+@UJz%aB7Z1nj{O3_`HLMI(RZUn@ZS#(}ZdMOj~-zffXVNLXa3Uwoy`R=~cP^ zdNvhOKKD@MYxC{YfK#8&`_JDsle1QPIwS&Xd*`XYnZ0qFf`t5dR5CDAfFl#AUNOnp`qISljuuKjAhMlu_Hyv<L&H}U3Uxe#puLe;Dp4-Yhc}3M>DQxuo-u*Gw?q|9 zuxWsMz9SvJMHRv#U}gdU$UpxEZb4xkJn7|*$v+MKN)V8L`9&oAYN)VN4S`plX}jgf z;gibU4WbYVJ%^K}zac2tm$Xsr9t9pjz#6bjlO-5<7UOCNpmvWHre@Su%JmDek}_&W z0=J;<2HE|Ezza_?9?tZx_%(xfRECVtxg&2?JwJVVaPkcurK%MT z@)MYl-xWyASIK(E5U^9Sw-O?D!`l3;ug$iToTy9QpoFo zc}g5@VD}(yIj~-6!FBLs<%jS)zWpn_1Sfi^;LEK^Sn88K7CS?EkpX<6qT~_r!pK zua2r@1ox?6uxz8X3)69|Czswhzmm;9*yVXH&Tr+ak@5Oyp})fIo^xf@$l413pyiCt z0eigaxB0bu&eA^Y>s9;MM>bbU3&bBasR;c2i)AakqjG;1hsQHYJzzqwYlkOBPw;Jc zB8`Rt0sxA#L8P8X#CV{4n(_CN)!m_S2EW!e^maxfp1%5?l4dQ3nmwLiDk56@Yb2ID zyA(Vg87=TN%kJ>SGTHNH_dMA8iHkA0f7;*43&zd;Dc#vT@qBpCx7X*K7+odOxnySR(qa4_@jLp@W;8#SloZY)!($%Yl+z-Vmx-b zOWy$fFCzGwuSc5Pi*xkjB45^hdU6)jBWY0a9AG+gb0jV@NO8pV#@zE#)c3xa3=v@u z_Kk%6J#|OwVu!p^tk0-lKJKJ%K0qKmg*VB>So4$b?~gW^y$9&@#78U#w>wpcbzM1Q zs;_n<^TK|+eS2f&q;zcr21eRqn-WBtUjU!X&xV*3>?0(?8~}@5~**q=c-5V8ytBmaq(J%=D+Ua*UVyXYANh-*m%#i9Ga-PztH&h_-oUb zam!{ee#m5|x7WPtO00P{6P5Any^(k9+ehvhkE4FC@%YDnNKEuW{dqQ0aa(`>r@1fS zz3^UgUOG`ANJ?NuZf45JGD^wfdvDVnbK8%`8Ql$_e>gNFh096|wuoInR!;Tfsf@I{*HcBJfD zdbDSTz7SHh<}i9Gf`yLRh|KwABqDa^q+tzR10QXq7TYF7+p89TAu`)~FdMfN&pi_J zVBur~IZMnZ2c?#0ME3l7%}g%(#F=(uGA8EO&s-Hu{%;97A35Kw0}_zPS8&Lt4bqK~ z-U6xg(*8K4vB{s)1u}z@iFn9yNoUz1V!z7%&_{B6zJza79J8`L?j;dW;g{9mnOgfe zd@^rxsGbGwP!yi^F!(3mzW}5wrDFJsy81>*3@5ZZKeW zq3FiD%#J)pL{!YlRNqSm;WcX+Js3aI&lA%Q`@dZ)95gCC`#9smyR1<^NzcV2+h;6 zMc6YJqo7R23D>pim>={Fd9#96Hj*~;vkEI=|t2JXu#2YK~bU|=``-t@g~Up*0nq?#eTNM!F$`1(}%@aR#pXH$xi@m(kLy}b~cU6W@78pMKoLGu>=5)V%1_+^tf5y(8S{m~>2UmFbTY zBsF+?53kR$h~Zt=-RNXuQGWD#u5-FbH~`zoJ(RgAM5#a%m@-;8ZHkcfz(Y5MT^1wG zS^+ZsOhCohs>pIMB;(WoyxT60)8^Yw?0O3tuTR-HVrEgiGn4nMQ0A$#4BCz8r zvE8*5O}PtWX^5JVP_8s^FpyGNOtv{?-xqorr&sq*B02H;z~`x4Ljb1DmXh*_7>Q=u z-#OCBD150b6_#@mv#B8Lb={CHMol{S^P7An8>CyK?)4}S;$UVy68Qv7B_wuJSQ+50 z;A{~*Dr2S$;BGeLlUDFS2p2GKQ)nQm$$^J?_%3?b(3@rCC;ZbR!)g}cD{QU7@4_|; zl4_(KRY?G5L;#I&!pkiEyEZp}au5nMxGEP0V_BZCu6L;9FOgsxcq4+X)PnS2qHF}Q z2QD3i=)yHgva!GNLzQqy86o)aZFW8++4)7-n1Z-l1qwh|JGO5TFNQfJTmTv_Qeao5 zWB14eKB)k8oAvjB$_GM)o_Km>P(X|V>ZPZsF-h2!i*pO%_n0?e&ao`gq^FWZR5Zr4 zaASUPWN~&Fe}?+!5DYsR&etBBi32OVB$DbZ*_x_DI8rGW5cjx+(7O4P2Ahe4H=7~^ zTR~Nb*k7-Z6(J5VZ~l`?QvL?gnF7A|=-N&nNUPeXXN6fI@cv@UG4BjJS zM3Mt-?z&P&Ec)!p9v&}8;g@%&eX}M0ssU?V(jM69($9X2^~O} zOoORFckQw-s{jPDN4T^kb2zQ&a&O*^Imt?BpB)Pvt*uYKko`x6Zd8+XmY(=4k|c=( zZ~b;fab@5%P=Nprck-z0f_Pab=cRx{t0Dl)k<5YxWbfj;VFLgt*zbJ?6(i3hD;9KF z7y$EWu%OVcFbb#*U@U-#gLwI0#`6J{>CPWob;nCvq zH2#Kzb{R~=fNa2vh3JNqTsaGa6wtU?>~9H=QkR$x&O9)LUywT&=B2*TYHVYGlD7bE zk_2dDLSZ;{?Dsh$N7oquH8-_1F2Y^50;f^Has}=a3YZcN)LH<;HU;JB7cKcg%qB_g zpyo{$A4LIE7E(Rxn;2uU0~_ANneZp)c2ni%d4H0G$A#d#DA~JY*rJW-x(g@_V4i^Z 
zR#;?7mygu00b&J9ZqBxDM%UEA!&%D7&8C(LfYkGQUhJ1qV@g-GrLP~q6Dyvp#CmBu z3Wjq-~b%Dak1S(ARQ($QvJ@zWP8#VM)HbM z0f-4+WQ&IqLMqeEGW~j@<_Q=)jbEJV4}Pmzzk1@ORNd@W)t_royY)(23fQx|gIM5T zT+408lAa1;bTsv&GLEqT*C8Pe@~7I4V|iEfp0=6dZlIMQ33$(|qoO4B+^aXa{0ejM zDW;CT*iM?Rr|CV8$f7z*XA_2_JXd4Mb_uB19e<}lg7vh>2GAw=o6%z7aG4&lRFxMe z;ik0FSKmy;&$U@vD2mdXg$20$nPg!~|9a-U>h)I|L4 zc9SXKsDP5CU)7=+|1x~K0+-L~6I#(j+}ku;lP20%fL@-O-?OAYd+4F4{!>x!Gb?~h zHsZ(_tg#6`%(C=($fsf>9&2MkO%Uzbu**6}{iUH2X0zE*3F<+bS)u&uIJiv+CX@zO zD%aHH!eux2UG~sB)&_sEsrJd#TD}Xu`+Sjap6hIu@Wm#e4hNQv3PDtS&3^>c=8nLB z(t+u#V6SNUlAOr8i1V|Y_xhyKKYc->{r*3RPb>*M!quQckLE5ix|FKe_u`SD#~pnS zxx}KQxYbLto5BJ2TLpS~4U*+;H~0N$4Y>4`7X~a|CVlxG`b95Ce-(L*o#fd)%csj0 z2^(wg6c)bi#ka^@Ek4Xo-iOekeUM?v*62u_r2imMHfe3*mmb3vb0-ExrU4QnPY6yW zf4sZ{`&;4)Kd~$A0hK-9cXYJVzn_~@Ei5Vs%fGYVj)HJw9UuKg*TWR-!_S!wBz%N< z$|P&~E1)Ky$Xui?&XIfD8h}Vw!z36!AE}%V!F37XYXB24|BuLqiRtge&R%)Nw^wZM zxT+d}Q;hZidSB&SvdBOg_xQr?c%_uGa!II_gGH257-Cn&l$}7}Q;+}@W(x@c9=te` zHT#C#P_Pp$oXW@pV2+MO3Iw($nhYB@n~MdCal#Y7Ou!HomY6nS)qn%b2h}x=isq^* zkSM>7&yXk=;1o*!v@H4dyED&NMgOAMacY=Pfl#P*Zp{n~A#yg3Uy=z~SINm3YB^>n zm6sGY*w#c#MXkY0?X}QXrmW5>Ujn><9xOq{o>$QIhrw@d_U&Ta%VGXBQ^G!swEZ;4 z5l67%b#hJ9c7cwu3oGh989k%92%R9U+yBLdyhilx0NSB=(k5U#l&YLDJNSZNckvP` zi-Ko)qY0;B_AWx~;`RsiCPw3Gp8JurnR49HJn_)tS8{Glr`?FV)F>Mv6OLz{kx71k zz73#p91Yg(!wMw?Wg1?oNdo$m*3`N+Hg4rp@B7Bz*&-~Ax;tcrl~6&f$}#lDJ+y*? zta$)|m8vnzZ3Nz0jd^Mc%#l168VXblckjVoSwBWuAaJcV~ku4i3_#6sh(Z_cN)$<|r!Mm_k825gb`)r}&x~w3H&_ zwK4uloHhF0ep3OaXv!ul2f(hq(FiAt`U1({2b;~L0ZEF8ATb1`;uJkd-=(ZdDuJKP z+3FCOL3!0A*DVG|NJrwRM!3U0TFjKzyVJ|v7z2qF@gNcrLT^o7xI#9}_?h2;Fuq^^ z*u3WbbM=Gmba?lvwr;S_&*F!z6w&IH;_gyjmBIEXHA_f1=lhon3>THV@#yh!VX}xH z0=am{=nyO#I8J%Jla#Ot;DqB@3W(NBMs&+@e%z1N>sHiAfE>?(GqVm=u(^VfYbIfl zb)?B$+_ALX#BvZbyX1aiPgVm?1rFw3KrEekR0UfD5j=#xykz`d{vRvLVmGb1^U6rwitAqWGlLOg($K5%_9DF-;N zw4;-0Ea#5kWY=x)ixzdM@xam=9?E>qUl0lE+h55>O7#Sc@dAP_{?%qYz(J95aF76q zNJ9|(Ck`GSeQZb|d_|8ys=+7!P^uj6$+%Y;+STCG{px~Nxjq&^;QupCES5qBZ?}g!;v<-tOn;<>nl+ApQGzUShPdiZ{1`6 zrBoj3?Sz{VyFo{w#{Rjg(M9Kb5w9un|AqRfoz`Ukbbv6*YY?T^c_Gw;$x zFR%6?-fttNy!v8!nUOY!`+N$XZHATdP5UeK`6uVc);!t>`^C)diPLEhQ!73x4^;X2Fa1g|=EA%8F^4COB&v z7=3+Ege6zJ4|1P6w!iwY`=_2YfWa_Qf5*& zS{nXc)x2S{(dHajA(4H__pg7zAq6Va-PZRKmf>(DYI)&pTT8{YhQyt@Pl4C0Rmc}* zFv&YQ5-ZJa*H0#tbnh3adOdk{(BIto>jbz3w3s_;z#^2K6clP-$^KL--`;l)oM_!!xJJXb~nD(#!>j~w?)sVBB+SfL= z$}`_W!? 
[GIT binary patch payloads omitted: base85-encoded data for the binary files listed in the diffstat above (the tf1.15/rec0630/cann530 atomic test-result .xlsx and the tools/python/images/clip_image*.jpg/.gif screenshots referenced by tools/python/readme.md).]
diff --git a/tools/python/key_2_emb_formatter.py b/tools/python/key_2_emb_formatter.py
new file mode 100644
index 00000000..617e7f99
--- /dev/null
+++ b/tools/python/key_2_emb_formatter.py
@@ -0,0 +1,220 @@
+# coding: UTF-8
+
+# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import os
+
+import numpy as np
+import tensorflow as tf
+
+
+def str2bool(value):
+    # argparse's type=bool treats any non-empty string (including "False") as True,
+    # so parse the flag explicitly
+    return str(value).lower() in ("true", "1", "yes")
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--path', type=str, required=True, help='path of the root dir of saved file')
+parser.add_argument('--name', type=str, default="key_2_embedding", help='name of output file')
+parser.add_argument('--ddr', type=str2bool, default=False, help='whether the saved data came from ddr mode, default False')
+parser.add_argument('--step', type=int, default=0, help='the step when the data was saved, default 0')
+
+
+class Formatter:
+
+    def __init__(self, saved_file_path, out_file_name, is_ddr_mode, step):
+        self._device_dir_list = ["HashTable", "HBM"]
+        self._host_dir_list = ["HashTable", "DDR"]
+        self._device_emb_dir = "embedding"
+        self._host_emb_dir = "embedding_data"
+        self._device_hashmap_dir = "key_offset_map"
+        self._host_hashmap_dir = "embedding_hashmap"
+        self._attrib_suffix = ".attribute"
+        self._data_suffix = ".data"
+
+        self._saved_file_path = saved_file_path
+        self._out_file_name = out_file_name
+        self._sub_dirs = self._get_sub_dirs(step)
+        self._table_names = None
+        self._father_table_names = None
+        self._step = step
+
+        self._json_attrib_dtype = "data_type"
+        self._json_attrib_shape = "shape"
+        self._host_attrib_dtype = np.uint64
+        self._hashmap_dtype = np.uint64
+        self._raw_key_dtype = np.uint64
+        self._key_dtype = np.int64
+        self._raw_key_offset = np.iinfo(np.uint32).max
+        self._data_dtype = None
+
+        self._is_ddr_mode = is_ddr_mode
+
+    def process(self):
+        dev_dir = self._set_upper_dir_origin(self._sub_dirs[0], self._device_dir_list)
+
+        self._table_names = self._get_table_names(dev_dir)
+        dict_out = {}
+        for table_name in self._table_names:
+            combined_key = None
+            combined_emb = None
+            for sub_dir in self._sub_dirs:
+                dev_dir = self._set_upper_dir(sub_dir, self._device_dir_list, table_name)
+                emb_data = self._data_process(dev_dir)
+                key, offset = self._hashmap_process(dev_dir)
+                emb_data = emb_data[offset]
+                if combined_key is not None:
+                    combined_key = np.append(combined_key, key, axis=0)
+                else:
+                    combined_key = key
+                if combined_emb is not None:
+                    combined_emb = np.append(combined_emb, emb_data, axis=0)
+                else:
+                    combined_emb = emb_data
+            print(f"{table_name} has combined key {combined_key.shape} and combined emb {combined_emb.shape}")
+            transformed_data = dict(zip(combined_key, combined_emb))
+            dict_out[table_name] = transformed_data
+        np.save("./" + self._out_file_name + ".npy", dict_out)
+
+    def fw_weight_process(self):
+        checkpoint_path = self._saved_file_path + "/model-0-" + str(self._step)
+        reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
+        var_to_shape_map = reader.get_variable_to_shape_map()
+        for key in var_to_shape_map:
+            if key == 'dense/fw_weight':
+                np.save('fw_weight.npy', reader.get_tensor(key))
+
+    def _data_process(self, dev_dir):
+        dev_emb_dir = os.path.join(dev_dir, self._device_emb_dir)
+        host_emb_dir = os.path.join(dev_dir, self._host_emb_dir)
+        data_file, attribute_file = self._get_file_names(dev_emb_dir)
+        dev_attribute = self._get_attribute(dev_emb_dir, attribute_file, is_json=True)
+        if not self._data_dtype:
+            self._data_dtype = dev_attribute.pop(self._json_attrib_dtype)
+
+        dev_data_shape = dev_attribute.pop(self._json_attrib_shape)
+        emb_data = self._get_data(dev_emb_dir, data_file, self._data_dtype, dev_data_shape)
+
+        if self._is_ddr_mode:
+            data_file, attribute_file = self._get_file_names(host_emb_dir)
+            host_attribute = self._get_attribute(host_emb_dir, attribute_file, is_json=False)
+            host_data_shape = [host_attribute[0], host_attribute[1]]
+            host_data = self._get_data(host_emb_dir, data_file, self._data_dtype, host_data_shape)
+            host_data = host_data[:, :dev_data_shape[1]]
+            emb_data = np.append(emb_data, host_data, axis=0)
+
+        return emb_data
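+    # How keys are reconstructed (see _hashmap_process below, illustrative values):
+    # each saved key is stored as two uint64 columns holding the high and low halves
+    # of the original 64-bit feature key, and is recombined as
+    #     key = high * np.iinfo(np.uint32).max + low
+    # before the cast to int64; e.g. high=1, low=7 yields 1 * 4294967295 + 7 = 4294967302.
+    # The multiplier is taken from the code as written; whether it should be 2**32
+    # rather than 2**32 - 1 depends on how the producer packed the key.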
+    def _hashmap_process(self, dev_dir):
+        dev_hashmap_dir = os.path.join(dev_dir, self._device_hashmap_dir)
+        host_hashmap_dir = os.path.join(dev_dir, self._host_hashmap_dir)
+        # in ddr mode the hashmap is saved on the host side, otherwise on the device side
+        hashmap_dir = host_hashmap_dir if self._is_ddr_mode else dev_hashmap_dir
+        data_file, attribute_file = self._get_file_names(hashmap_dir)
+
+        attribute = self._get_attribute(hashmap_dir, attribute_file, is_json=False)
+        data_shape = attribute[:2]
+        raw_hashmap = self._get_data(hashmap_dir, data_file, self._hashmap_dtype, data_shape)
+        offset = raw_hashmap[:, -1]
+        raw_key = raw_hashmap[:, :2].astype(self._raw_key_dtype)
+        key = raw_key[:, 0] * self._raw_key_offset + raw_key[:, 1]
+        key = key.astype(self._key_dtype)
+
+        return key, offset
+
+    def _get_sub_dirs(self, step):
+        sub_dirs = []
+        for _, sub_dir, _ in os.walk(self._saved_file_path):
+            sub_dirs.append(sub_dir)
+
+        picked_sub_dirs = []
+        for sub_dir in sub_dirs[0]:
+            if int(sub_dir.split("-")[-1]) == step:
+                picked_sub_dirs.append(sub_dir)
+
+        if len(picked_sub_dirs) == 0:
+            raise FileNotFoundError("There is no sparse checkpoint for the given training step.")
+        return picked_sub_dirs
+
+    def _set_upper_dir(self, sub_dir, dir_list, table_name):
+        # copy before appending so the caller's dir_list is not mutated
+        dir_list_copy = list(dir_list)
+        dir_list_copy.append(table_name)
+        temp_dir = os.path.join(self._saved_file_path, sub_dir)
+        for directory in dir_list_copy:
+            temp_dir = os.path.join(temp_dir, directory)
+        father_table = []
+        for _, i, _ in os.walk(temp_dir):
+            father_table.append(i)
+
+        temp_dir = os.path.join(temp_dir, father_table[0][0])
+        return temp_dir
+
+    def _set_upper_dir_origin(self, sub_dir, dir_list):
+        temp_dir = os.path.join(self._saved_file_path, sub_dir)
+        for directory in dir_list:
+            temp_dir = os.path.join(temp_dir, directory)
+
+        return temp_dir
+
+    def _get_father_table_names(self, directory):
+        return self._get_table_names(directory)
+
+    def _get_table_names(self, directory):
+        if directory:
+            table_names = []
+            for _, table_name, _ in os.walk(directory):
+                table_names.append(table_name)
+            return table_names[0]
+        else:
+            raise ValueError("directory is None, cannot search for table names")
+
+    def _get_file_names(self, directory):
+        files = []
+        data_file = None
+        attribute_file = None
+        for _, _, file in os.walk(directory):
+            files.append(file)
+        for file in files[0]:
+            if file.find(self._data_suffix) != -1:
+                data_file = file
+            elif file.find(self._attrib_suffix) != -1:
+                attribute_file = file
+        return data_file, attribute_file
+
+    def _get_attribute(self, directory, file_name, is_json):
+        file_dir = os.path.join(directory, file_name)
+        if is_json:
+            with open(file_dir, "r") as fin:
+                attributes = json.load(fin)
+            return attributes
+        else:
+            attributes = np.fromfile(file_dir, self._host_attrib_dtype)
+            return attributes
+
+    def _get_data(self, directory, file_name, dtype, shape):
+        file_dir = os.path.join(directory, file_name)
+        data = np.fromfile(file_dir, dtype=dtype)
+        data = data.reshape(shape)
+        return data
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    formatter = Formatter(saved_file_path=args.path, out_file_name=args.name, is_ddr_mode=args.ddr, step=args.step)
+    formatter.process()
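
The converter writes a single dict-of-dicts into the output .npy file, so it can be read back with NumPy; a minimal sketch (the file name, table name and key are placeholders, and `allow_pickle=True` is required because the array holds a Python dict):

```python
import numpy as np

# Load the converted file produced by key_2_emb_formatter.py.
key2emb = np.load("sparse_0.npy", allow_pickle=True).item()

for table_name, table in key2emb.items():
    print(table_name, "holds", len(table), "keys")

# Look up one embedding; table name "one" and the key value are placeholders.
emb = key2emb["one"].get(12345)
```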
diff --git a/tools/python/optimizer_process.py b/tools/python/optimizer_process.py
new file mode 100644
index 00000000..8a658e29
--- /dev/null
+++ b/tools/python/optimizer_process.py
@@ -0,0 +1,116 @@
+# coding: UTF-8
+
+# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+from enum import Enum
+
+import numpy as np
+
+# each card (rank) processes its own saved data
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--path', type=str, required=True, help='path of the model file to be converted')
+parser.add_argument('--step', type=int, required=True)
+
+sparse_file_prefix = "sparse-model.ckpt-"
+optimizer_prefix = "Optimizer"
+data_suffix = ".data"
+attribute_suffix = ".attribute"
+
+
+class DataAttr(Enum):
+    SHAPE = "shape"
+    DATATYPE = "data_type"
+
+
+def get_optimizer_name(sparse_file_path):
+    optimizer_list = []
+    for folder_name in os.listdir(sparse_file_path):
+        optimizer_list.append(folder_name)
+    return optimizer_list
+
+
+def get_table_list(table_upper_path):
+    table_list = []
+    for folder_name in os.listdir(table_upper_path):
+        table_list.append(folder_name + "/table")
+    return table_list
+
+
+def get_optimizer_param_name(table_path):
+    param_list = []
+    for folder_name in os.listdir(table_path):
+        param_list.append(folder_name)
+    return param_list
+
+
+def get_attribute_and_data_file(table_path):
+    if not os.path.exists(table_path):
+        raise FileNotFoundError(f"the input table path {table_path} does not exist.")
+
+    attribute_file_list = []
+    data_file_list = []
+    for file_name in os.listdir(table_path):
+        if file_name.endswith(attribute_suffix):
+            attribute_file_list.append(file_name)
+        if file_name.endswith(data_suffix):
+            data_file_list.append(file_name)
+    if len(attribute_file_list) != 1:
+        raise AssertionError(f"under the table path {table_path}, there must be exactly one attribute file. "
+                             f"In fact, {len(attribute_file_list)} attribute files exist.")
+    if len(data_file_list) != 1:
+        raise AssertionError(f"under the table path {table_path}, there must be exactly one data file. "
+                             f"In fact, {len(data_file_list)} data files exist.")
+    attribute_file = os.path.join(table_path, attribute_file_list[0])
+    data_file = os.path.join(table_path, data_file_list[0])
+    return attribute_file, data_file
+
+
+def process(path, step):
+    save_dict = {}
+    sparse_file_name = sparse_file_prefix + str(step)
+    sparse_file_path = os.path.join(path, sparse_file_name, optimizer_prefix)
+    optimizer_list = get_optimizer_name(sparse_file_path)
+    for optimizer in optimizer_list:
+        table_upper_path = os.path.join(sparse_file_path, optimizer, "HBM")
+        table_list = get_table_list(table_upper_path)
+
+        for table in table_list:
+            table_path = os.path.join(table_upper_path, table)
+            optimizer_param_list = get_optimizer_param_name(table_path)
+            optimizer_dict = {}
+            for param in optimizer_param_list:
+                data_path = os.path.join(table_path, param)
+                attribute_data_dir, target_data_dir = get_attribute_and_data_file(data_path)
+                with open(attribute_data_dir, "r") as fin:
+                    optimizer_attributes = json.load(fin)
+                # .data files are raw binary, so read them directly with np.fromfile
+                optimizer_data = np.fromfile(target_data_dir,
+                                             dtype=optimizer_attributes.pop(DataAttr.DATATYPE.value))
+                data_shape = optimizer_attributes.pop(DataAttr.SHAPE.value)
+                optimizer_data = optimizer_data.reshape(data_shape)
+                optimizer_dict[param] = optimizer_data
+            save_dict[table] = optimizer_dict
+    np.save(os.path.join(path, "optimizer_dict.npy"), save_dict)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    process(args.path, args.step)
\ No newline at end of file
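
The optimizer dump written by process() can be inspected the same way; a sketch assuming the file sits under the path passed via --path (table and parameter names below are placeholders, and note that table keys carry the "/table" suffix added by get_table_list):

```python
import numpy as np

# optimizer_dict.npy maps "<table_name>/table" -> {optimizer_param: ndarray}.
opt_state = np.load("optimizer_dict.npy", allow_pickle=True).item()

for table, params in opt_state.items():
    for param, tensor in params.items():
        print(table, param, tensor.shape, tensor.dtype)
```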
" + f"In fact, {len(data_file_list)} data file exists.") + attribute_file = os.path.join(table_path, attribute_file_list[0]) + data_file = os.path.join(table_path, data_file_list[0]) + return attribute_file, data_file + + +def process(path, step): + save_dict = {} + sparse_file_name = sparse_file_prefix + str(step) + sparse_file_path = os.path.join(path, sparse_file_name,optimizer_prefix) + optimizer_list = get_optimizer_name(sparse_file_path) + for optimizer in optimizer_list: + table_upper_path = os.path.join(sparse_file_path, optimizer, "HBM") + table_list = get_table_list(table_upper_path) + + for table in table_list: + table_path = os.path.join(table_upper_path, table) + optimizer_param_list = get_optimizer_param_name(table_path) + optimizer_dict = {} + for param in optimizer_param_list: + data_path = os.path.join(table_path, param) + attribute_data_dir, target_data_dir = get_attribute_and_data_file(data_path) + with open(attribute_data_dir, "r") as fin: + optimizer_attributes = json.load(fin) + with open(target_data_dir, "r") as fin: + optimizer_data = np.fromfile(target_data_dir, + dtype=optimizer_attributes.pop(DataAttr.DATATYPE.value)) + data_shape = optimizer_attributes.pop(DataAttr.SHAPE.value) + optimizer_data = optimizer_data.reshape(data_shape) + optimizer_dict[param] = optimizer_data + save_dict[table] = optimizer_dict + np.save(path+"/optimizer_dict.npy", save_dict) + + +if __name__ == "__main__": + args = parser.parse_args() + process(args.path, args.step) \ No newline at end of file diff --git a/tools/python/readme.md b/tools/python/readme.md new file mode 100644 index 00000000..3f5e86df --- /dev/null +++ b/tools/python/readme.md @@ -0,0 +1,110 @@ +# 模型数据转换工具(key-value)使用说明 + +### 1. 美团1207模型ckpt保存路径说明 + +#### 1.1 训练时1207模型保存参数设置:(estimator模式) + +![img](./images/clip_image002.jpg) + +![img](./images/clip_image004.jpg) + +#### 1.2 训练后模型保存路径目录展示如下: + +![img](./images/clip_image006.jpg) + +#### 1.3 下面来看单个文件夹下存储的内容,以check_ran0为例: + +![img](./images/clip_image008.jpg) + +我们的模型数据转换工具就是要对该**sparse****文件夹中的数据进行转换**,转换成key-value形式,保存格式是npy文件,详情参考3. 输出文件格式说明。 + +下面介绍**如何使用该模型数据转换工具**。 + + + +### 2. 使用工具demo说明: + +**该转换工具model_data_to_key_value.py一共需要4个参数,path、name、ddr、step** + + + +| **参数名** | **数据类型** | **必选** | **默认值** | **描述** | +| ---------- | ------------ | -------- | ---------- | ---------------------------------- | +| --path | String | 是 | | 保存模型embedding数据的根路径 | +| --name | String | 否 | | 输出文件的名称,最终输出.npy | +| --ddr | Bool | 否 | False | 保存数据是否开启ddr模式 | +| --step | Int | 否 | 0 | 保存数据所属训练步数 | + + + +#### 2.1 参数确定: + +下面是一个选择参数的示例。 + +##### **1)** path路径确定 + +我们选择1207保存下来的0卡模型文件夹下的sparse部分数据进行转换,因此路径选到目录下:/home/lff/model/check_rank0/ + +![img](./images/clip_image010.jpg) + +**--path = /home/lff/model/check_rank0** + +(多卡的目录需要转换多次,一次只能转换一张卡下面sparse的数据) + + + +##### 2) name参数: 输出文件的名字,格式为.npy; + +例如:sparse_0,经过转换后的sparse数据就保存在当前目录下的sparse_0.npy文件中; + +**--name = sparse_0** + +##### 3) ddr参数:美团模型未开启ddr模式,因此选择False + +**--ddr = False** + +##### 4)step参数:在上面1207模型存储的目录下面,存了第0步的模型。 + +**--step=0** + + + +![img](./images/clip_image012.jpg) + +#### **2.2** **执行工具命令** + +python3 model_data_to_key_value.py --path=/home/lff/model/check_rank0 --name=sparse_0 --ddr=False --step=0 + +#### **2.3** **执行结果展示** + + + +![img](./images/clip_image014.jpg) + + + +### 3. 
diff --git a/tools/stat_info/main.py b/tools/stat_info/main.py
new file mode 100644
index 00000000..9c27754a
--- /dev/null
+++ b/tools/stat_info/main.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python3
+# coding: UTF-8
+# Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved.
+from datetime import datetime, timezone
+import logging
+import os
+import stat
+import time
+import threading
+
+
+CURRENT_PATH = os.getcwd()
+FORMATTED_TIME = datetime.now(timezone.utc).strftime("%d_%H_%M_%S")
+TRAIN_LOG_PATH = f"{CURRENT_PATH}/train_{FORMATTED_TIME}.log"
+EVAL_LOG_PATH = f"{CURRENT_PATH}/eval_{FORMATTED_TIME}.log"
+STAT_PREFIX = "[StatInfo]"
+DISPLAY_MODE_PRINT_SCREEN = "print_screen"
+DISPLAY_MODE_SAVE_LOG = "save_log"
+TABLE_NUM_LINE_PREFIX = "current_table_num"
+CHANNEL_LINE_PREFIX = "channel_id"
+VALUE_READ_START = 6
+VALUE_READ_INTERVAL = 2
+LOOP_SLEEP_TIME = 0.003
+
+# run mode to be selected
+HBM_NORMAL = {"key_process_time_cost": "key_process_time_cost",
+              "batch_key_num": "batch_key_num",
+              "unique_key_num": "unique_key_num"}
+HBM_FAAE = {"key_process_time_cost": "key_process_time_cost",
+            "batch_key_num": "batch_key_num",
+            "unique_key_num": "faae_unique_key_num"}
+HBM_HOT = {"key_process_time_cost": "key_process_time_cost",
+           "batch_key_num": "batch_key_num",
+           "unique_key_num": "hot_unique_key_num"}
+HBM_FAST = {"key_process_time_cost": "key_process_time_cost_with_fast_unique",
+            "batch_key_num": "batch_key_num_with_fast_unique",
+            "unique_key_num": "unique_key_num_with_fast_unique"}
+
+DDR_NORMAL = {"key_process_time_cost": "key_process_time_cost",
+              "batch_key_num": "batch_key_num",
+              "unique_key_num": "unique_key_num",
+              "swap_key_size": "swap_key_size",
+              "swap_time_cost": "swap_time_cost"}
+DDR_FAAE = {"key_process_time_cost": "key_process_time_cost",
+            "batch_key_num": "batch_key_num",
+            "unique_key_num": "faae_unique_key_num",
+            "swap_key_size": "swap_key_size",
+            "swap_time_cost": "swap_time_cost"}
+DDR_HOT = {"key_process_time_cost": "key_process_time_cost",
+           "batch_key_num": "batch_key_num",
+           "unique_key_num": "hot_unique_key_num",
+           "swap_key_size": "swap_key_size",
+           "swap_time_cost": "swap_time_cost"}
+DDR_FAST = {"key_process_time_cost": "key_process_time_cost_with_fast_unique",
+            "batch_key_num": "batch_key_num_with_fast_unique",
+            "unique_key_num": "unique_key_num_with_fast_unique",
+            "swap_key_size": "swap_key_size",
+            "swap_time_cost": "swap_time_cost"}
+
+DDR_LIST = [DDR_NORMAL, DDR_FAAE, DDR_HOT, DDR_FAST]
+
+# ====================== Please modify here according to readme before using ======================
+TARGET_REC_LOG_PATH = "/home/example.log"
+RUN_MODE = DDR_FAST
+RANK_SIZE = 8
+DISPLAY_MODE = "save_log"  # can be "save_log" or "print_screen"
+DISPLAY_INTERVAL = 1
+# ==================================================================================================
+
+TRAIN_DICT = dict()
+EVAL_DICT = dict()
+CURRENT_TABLE_NUM = 0
+
+FULL_DICT_LEN = len(RUN_MODE)
+TRAIN_TOTAL_DATA = dict()
+EVAL_TOTAL_DATA = dict()
+
+
+def read_log_by_line_loop(log_add: str):
+    """
+    read log by line continuously
+
+    Arg:
+        log_add: path of the mxRec log file to read
+    """
+    logging.info("============= log reading started =============")
+    with open(log_add, 'r') as log_file:
+        while True:
+            new_line = log_file.readline()  # read one newly appended line
+            check_line_content(new_line)
+
+
+def check_line_content(line):
+    """
+    check line content and record relevant info
+
+    Arg:
+        line: line read from the log file
+    """
+    index = line.find(STAT_PREFIX)
+    if line and index != -1:
+        stat_data = line[index + len(STAT_PREFIX):].split()
+        if stat_data[0] == CHANNEL_LINE_PREFIX:
+            tar_dict = create_data(stat_data)
+            update_data(stat_data, tar_dict)
+        elif stat_data[0] == TABLE_NUM_LINE_PREFIX:
+            global CURRENT_TABLE_NUM
+            CURRENT_TABLE_NUM = int(stat_data[1])
+    else:
+        # no new content yet; sleep briefly before the next read to avoid busy-waiting
+        time.sleep(LOOP_SLEEP_TIME)
+
+
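+# Shape of the [StatInfo] lines consumed above (illustrative; the field names at
+# token positions 2 and 4 stand for whatever mxRec actually emits, since only the
+# values at positions 1, 3 and 5 are read):
+#     [StatInfo]channel_id 0 <step> 12 <rank> 3 batch_key_num 4096 unique_key_num 1024
+# create_data() takes channel/step/rank from tokens 1, 3 and 5; update_data() then
+# walks the remaining "name value" pairs starting at VALUE_READ_START.
+
+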
+def start_display_data(channel: int):
+    """
+    start to display data continuously
+
+    Arg:
+        channel: channel id, 0 for train, 1 for eval
+    """
+    logging.info("============= channel: %d stat display =============", channel)
+    if channel == 0:
+        glob_dict = TRAIN_DICT
+    elif channel == 1:
+        glob_dict = EVAL_DICT
+    else:
+        raise ValueError("channel num can only be 0 or 1")
+    display_per_step(glob_dict, channel)
+
+
+def display_per_step(glob_dict: dict, channel: int):
+    """
+    display stat info according to step num
+
+    Arg:
+        glob_dict: dict used to record stat info, TRAIN_DICT or EVAL_DICT
+        channel: channel id, 0 for train, 1 for eval
+    """
+    step = 0
+    while True:
+        if step not in glob_dict:
+            time.sleep(LOOP_SLEEP_TIME)
+            continue
+        display_per_rank(glob_dict[step], channel, step)
+        del glob_dict[step]
+        step += 1
+
+
+def display_per_rank(current_step_dict: dict, channel: int, step: int):
+    """
+    display stat info of each rank within one step
+
+    Arg:
+        current_step_dict: per-rank stat dicts of the current step
+        channel: channel id, 0 for train, 1 for eval
+        step: current step num
+    """
+    i = 0
+    while i < RANK_SIZE:
+        if i not in current_step_dict:
+            time.sleep(LOOP_SLEEP_TIME)
+            continue
+        if len(current_step_dict[i]) == FULL_DICT_LEN:
+            display_data(current_step_dict[i], channel, step, i)
+            i += 1
+        elif len(current_step_dict[i]) < FULL_DICT_LEN:
+            time.sleep(LOOP_SLEEP_TIME)
+        else:
+            raise ValueError("dict length shall not be bigger than FULL_DICT_LEN")
+
+
+def create_total_dict():
+    """
+    create a dict instance according to the template
+    """
+    template_dict = {
+        "total_batch_key_num": 0,
+        "total_unique_key_num": 0,
+        "total_key_process_time_cost": 0,
+        "total_swap_size": 0,
+        "total_swap_time": 0
+    }
+    return template_dict.copy()
+
+
+def construct_ddr_message(display_dict: dict, target_dict: dict, batch_key_num: int, total_batch_key_num: int):
+    """
+    construct ddr info message to display
+
+    Arg:
+        display_dict: info dict to display
+        target_dict: total dict in which stat info is accumulated
+        batch_key_num: key num of the current batch on the current device
+        total_batch_key_num: total key num on the current device
+    """
+    swap_key_size = display_dict[RUN_MODE["swap_key_size"]]
+    target_dict["total_swap_size"] += swap_key_size
+    total_swap_size = target_dict["total_swap_size"]
+
+    swap_time_cost = display_dict[RUN_MODE["swap_time_cost"]]
+    target_dict["total_swap_time"] += swap_time_cost
+    total_swap_time = target_dict["total_swap_time"]
+
+    swap_speed = 0
+    if swap_time_cost != 0:
+        swap_speed = swap_key_size / swap_time_cost
+
+    total_swap_speed = 0
+    if total_swap_time != 0:
+        total_swap_speed = total_swap_size / total_swap_time
+
+    ddr_message = f"Current Swap Key Num:{swap_key_size} " \
+                  f"\nCurrent Swap Speed:{round(swap_speed, 3)}" \
+                  f"\nCurrent HBM Rate:{round(((batch_key_num - swap_key_size) / batch_key_num), 3)}" \
+                  f"\nTotal Swap Key Num:{total_swap_size} " \
+                  f"\nAverage Swap Speed:{round(total_swap_speed, 3)}" \
+                  f"\nAverage HBM Rate:{round(((total_batch_key_num - total_swap_size) / total_batch_key_num), 3)}\n"
+    return ddr_message
+
+
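+# Worked example for the DDR message above (illustrative numbers): with
+# batch_key_num = 10000 and swap_key_size = 1500, the HBM rate is
+# (10000 - 1500) / 10000 = 0.85, i.e. 85% of the batch's keys were served
+# from HBM and 15% had to be swapped in from DDR.
+
+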
+def display_data(display_dict: dict, channel: int, step: int, rank_id: int):
+    """
+    display stat info messages according to DISPLAY_MODE
+
+    Arg:
+        display_dict: info dict to display
+        channel: channel id, 0 for train, 1 for eval
+        step: current step num
+        rank_id: id of current device rank
+    """
+    if channel == 0:
+        target_dict = TRAIN_TOTAL_DATA
+    else:
+        target_dict = EVAL_TOTAL_DATA
+    if rank_id not in target_dict:
+        target_dict[rank_id] = create_total_dict()
+    target_dict = target_dict[rank_id]
+    batch_key_num = display_dict[RUN_MODE["batch_key_num"]]
+    target_dict["total_batch_key_num"] += batch_key_num
+    total_batch_key_num = target_dict["total_batch_key_num"]
+
+    unique_key_num = display_dict[RUN_MODE["unique_key_num"]]
+    target_dict["total_unique_key_num"] += unique_key_num
+    total_unique_key_num = target_dict["total_unique_key_num"]
+
+    key_process_time_cost = display_dict[RUN_MODE["key_process_time_cost"]]
+    target_dict["total_key_process_time_cost"] += key_process_time_cost
+    total_key_process_time_cost = target_dict["total_key_process_time_cost"]
+
+    key_process_speed = 0
+    if key_process_time_cost != 0:
+        key_process_speed = batch_key_num / key_process_time_cost
+
+    total_key_process_speed = 0
+    if total_key_process_time_cost != 0:
+        total_key_process_speed = total_batch_key_num / total_key_process_time_cost
+
+    message = f"[STATINFO]Channel:{channel} Current Step:{step} RankId:{rank_id} " \
+              f"\nCurrentTableNum:{CURRENT_TABLE_NUM}" \
+              f"\nCurrent Batch Key Num:{batch_key_num} Current Unique Key Num:{unique_key_num}" \
+              f"\nCurrent Deduplication Key Rate:{round((1 - unique_key_num / batch_key_num), 3)}" \
+              f"\nCurrent Key Process Speed:{round(key_process_speed, 3)} / ms" \
+              f"\nTotal Batch Key Num:{total_batch_key_num} Total Unique Key Num:{total_unique_key_num}" \
+              f"\nAverage Deduplication Key Rate: {round((1 - total_unique_key_num / total_batch_key_num), 3)}" \
+              f"\nAverage Key Process Speed:{round(total_key_process_speed, 3)} / ms\n"
+
+    if RUN_MODE in DDR_LIST:
+        ddr_message = construct_ddr_message(display_dict, target_dict, batch_key_num, total_batch_key_num)
+        message = message + ddr_message
+
+    if step % DISPLAY_INTERVAL == 0:
+        if DISPLAY_MODE == DISPLAY_MODE_PRINT_SCREEN:
+            logging.info(message)
+        elif DISPLAY_MODE == DISPLAY_MODE_SAVE_LOG:
+            flags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
+            modes = stat.S_IWUSR | stat.S_IRUSR
+            if channel == 0:
+                log_path = TRAIN_LOG_PATH
+            elif channel == 1:
+                log_path = EVAL_LOG_PATH
+            else:
+                raise ValueError("channel num can only be 0 or 1")
+            with os.fdopen(os.open(log_path, flags, modes), mode='a') as log_out:
+                log_out.write(message + "\n")
+        else:
+            raise ValueError(f"DISPLAY_MODE can only be 'print_screen' or 'save_log' but '{DISPLAY_MODE}' is given")
+
+
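+# In-memory layout built by create_data()/update_data() below (illustrative):
+#     TRAIN_DICT = {step_id: {rank_id: {stat_name: value}}}
+# e.g. TRAIN_DICT[12][0] == {"batch_key_num": 4096, "unique_key_num": 1024, ...}
+# display_per_step()/display_per_rank() drain this structure once a rank's dict
+# reaches FULL_DICT_LEN entries.
+
+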
+def create_data(line_stat_data):
+    """
+    store stat data according to the log file
+
+    Arg:
+        line_stat_data: tokenized [StatInfo] line from file reading
+    """
+    channel_id = int(line_stat_data[1])
+    step_id = int(line_stat_data[3])
+    rank_id = int(line_stat_data[5])
+    if channel_id == 0:
+        global_dict = TRAIN_DICT
+    elif channel_id == 1:
+        global_dict = EVAL_DICT
+    else:
+        raise ValueError("channel num can only be 0 or 1")
+
+    if step_id not in global_dict:
+        global_dict[step_id] = dict()
+    if rank_id not in global_dict[step_id]:
+        global_dict[step_id][rank_id] = dict()
+    target_dict = global_dict[step_id][rank_id]
+    return target_dict
+
+
+def update_data(line_stat_data, target_dict: dict):
+    """
+    update stat data according to the log file
+
+    Arg:
+        line_stat_data: tokenized [StatInfo] line from file reading
+        target_dict: per-step, per-rank dict to fill with "name value" pairs
+    """
+    for i in range(VALUE_READ_START, len(line_stat_data), VALUE_READ_INTERVAL):
+        target_dict[line_stat_data[i]] = int(line_stat_data[i + 1])
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    thread1 = threading.Thread(target=read_log_by_line_loop, args=(TARGET_REC_LOG_PATH,))
+    thread2 = threading.Thread(target=start_display_data, args=(0,))
+    thread3 = threading.Thread(target=start_display_data, args=(1,))
+
+    # start the threads
+    thread1.start()
+    thread2.start()
+    thread3.start()
\ No newline at end of file
diff --git a/tools/stat_info/readme.md b/tools/stat_info/readme.md
new file mode 100644
index 00000000..84cbb5f7
--- /dev/null
+++ b/tools/stat_info/readme.md
@@ -0,0 +1,45 @@
+#### Purpose
+
+This script collects the statistics that mxRec logs on the host side at runtime and outputs the aggregated results.
+
+#### Usage notes:
+
+1. Set the STAT_ON environment variable in the mxRec training script: 0 disables statistics and 1 enables them; mxRec disables statistics by default;
+```
+#example
+export STAT_ON=1
+```
+
+2. Redirect the training log to a fixed location;
+```
+#example
+bash run.sh 2>&1 | tee ${test_mode}"_save".log
+```
+
+3. Configure the following values in the script:
+   TARGET_REC_LOG_PATH: path of the training log produced by mxRec
+   RUN_MODE: run mode to collect statistics for; the supported modes are listed in the script
+   RANK_SIZE: rank size of the training run, i.e. how many cards it uses
+   DISPLAY_MODE: how the statistics are displayed, either written to disk ("save_log") or printed to the screen ("print_screen")
+   DISPLAY_INTERVAL: interval of the statistics output, i.e. one record every this many steps
+
+
+4. The script can run alongside the training process; stop it manually after training finishes to release its resources.
+
+5. Make sure the mxRec log level is info or lower
+```
+#example
+export MXREC_LOG_LEVEL="INFO"
+```
+
+#### Supported RUN_MODE values
+
+HBM_NORMAL: training without the multi-level cache or any other feature
+HBM_FAAE: training without the multi-level cache, with feature admit-and-evict
+HBM_HOT: training without the multi-level cache, with hot embedding
+HBM_FAST: training without the multi-level cache, with fast unique
+
+DDR_NORMAL: training with the DDR cache but no other feature
+DDR_FAAE: training with the DDR cache and feature admit-and-evict
+DDR_HOT: training with the DDR cache and hot embedding
+DDR_FAST: training with the DDR cache and fast unique
\ No newline at end of file
-- 
Gitee


From 3c6fc3726dfa05d43692de4bf71528a182ef9d15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=95=E9=9C=96?=
Date: Wed, 13 Mar 2024 09:30:35 +0800
Subject: [PATCH 003/302] =?UTF-8?q?=E5=88=A0=E9=99=A4python=20dt=E8=84=9A?=
 =?UTF-8?q?=E6=9C=AC=E4=B8=AD=E5=AE=89=E8=A3=85setuptools=E7=9A=84?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/run_python_dt.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/run_python_dt.sh b/tests/run_python_dt.sh
index a64a0913..f29bf7b5 100644
--- a/tests/run_python_dt.sh
+++ b/tests/run_python_dt.sh
@@ -26,7 +26,6 @@ if [ $ARCH == "aarch64" ]; then
 fi
 
 # build mxRec and get output directory
-pip3 install setuptools==65.6.3
 bash "$TOP_PATH"/build/build_tf1_with_opensource.sh
 
 # create libasc directory and copy so files into it
-- 
Gitee


From 8341c095dbab43770abb2813e69083c7c4a2cef4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=95=E9=9C=96?=
Date: Wed, 13 Mar 2024 09:54:39 +0800
Subject: [PATCH 004/302] =?UTF-8?q?=E9=80=82=E9=85=8DCI=E6=9C=BA=E5=99=A8?=
 =?UTF-8?q?=EF=BC=9A=E7=BC=96=E8=AF=91=E6=97=B6=E4=BD=BF=E7=94=A88?=
 =?UTF-8?q?=E4=B8=AAcpu?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/build.sh   | 2 +-
 src/test_ut.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/build.sh b/src/build.sh
index ed55e213..c9fdc330 100644
--- a/src/build.sh
+++ b/src/build.sh
@@ -38,5 +38,5 @@ cmake -DCMAKE_BUILD_TYPE=Release \
-DSECUREC_PATH="$2"/../opensource/securec \ -DCMAKE_INSTALL_PREFIX="$2"/output \ -DBUILD_CUST="$3" .. -make -j +make -j8 make install diff --git a/src/test_ut.sh b/src/test_ut.sh index 0517f809..156db1cc 100644 --- a/src/test_ut.sh +++ b/src/test_ut.sh @@ -140,7 +140,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug \ -DSECUREC_PATH="${ROOT_DIR}"/../opensource/securec \ -DBUILD_TESTS=on -DCOVERAGE=on "$(dirname "${PWD}")" -make -j +make -j8 make install # Run Test -- Gitee From c2b53491efd6e930aa88022d50c912368b9863ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 19 Mar 2024 19:11:26 +0800 Subject: [PATCH 005/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9littl=20demo=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E4=B8=ADCM=5FCHIEF=5FPORT=E7=AB=AF=E5=8F=A3=E5=8F=B7?= =?UTF-8?q?=EF=BC=8C=E5=B0=866000=E6=94=B9=E4=B8=BA60001=EF=BC=8C=E9=81=BF?= =?UTF-8?q?=E5=85=8D=E7=AB=AF=E5=8F=A3=E8=A2=AB=E5=8D=A0=E7=94=A8=E5=AF=BC?= =?UTF-8?q?=E8=87=B4GE=E6=8A=A5=E9=94=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/run.sh | 2 +- examples/demo/little_demo/run.sh | 2 +- examples/demo/little_demo_estimator/run.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/DCNv2/run.sh b/examples/DCNv2/run.sh index 234e5e4b..f30e0ac6 100644 --- a/examples/DCNv2/run.sh +++ b/examples/DCNv2/run.sh @@ -85,7 +85,7 @@ if [ -n "$ip" ]; then # no ranktable echo "Current is no ranktable solution." export CM_CHIEF_IP=$ip # 主节点ip - export CM_CHIEF_PORT=6000 # 主节点监听端口 + export CM_CHIEF_PORT=60001 # 主节点监听端口 export CM_CHIEF_DEVICE=0 # 主节点device id export CM_WORKER_IP=$ip # 当前节点ip export CM_WORKER_SIZE=$num_process # 参与集群训练的device数量 diff --git a/examples/demo/little_demo/run.sh b/examples/demo/little_demo/run.sh index 712f6273..66c27e28 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -141,7 +141,7 @@ else echo "ip: $ip available." echo "The ranktable solution is removed." export CM_CHIEF_IP=$ip # 主节点ip - export CM_CHIEF_PORT=6000 # 主节点监听端口 + export CM_CHIEF_PORT=60001 # 主节点监听端口 export CM_CHIEF_DEVICE=0 # 主节点device id export CM_WORKER_IP=$ip # 当前节点ip export CM_WORKER_SIZE=$num_process # 参与集群训练的device数量 diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index 39e77fc8..33770e59 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -143,7 +143,7 @@ else echo "ip: $ip available." echo "The ranktable solution is removed." 
From 6466e02d0c2b53491efd6e930aa88022d50c912368b9863ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=95=E9=9C=96?=
Date: Thu, 21 Mar 2024 17:00:20 +0800
Subject: [PATCH 006/302] Remove unused files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/feature_admit_tools/get_hist.py         |  16 --
 tools/feature_admit_tools/static_key_count.py |  61 ---
 ...71\346\257\224\346\226\271\346\263\225.md" |  21 --
 tools/perf/mt_1207.sh                         |  60 ---
 tools/python/images/clip_image002.jpg         | Bin 9453 -> 0 bytes
 tools/python/images/clip_image004.jpg         | Bin 8027 -> 0 bytes
 tools/python/images/clip_image006.jpg         | Bin 21733 -> 0 bytes
 tools/python/images/clip_image008.jpg         | Bin 26810 -> 0 bytes
 tools/python/images/clip_image010.jpg         | Bin 24851 -> 0 bytes
 tools/python/images/clip_image012.jpg         | Bin 17452 -> 0 bytes
 tools/python/images/clip_image014.jpg         | Bin 18658 -> 0 bytes
 tools/python/images/clip_image016.jpg         | Bin 6056 -> 0 bytes
 tools/python/images/clip_image018.gif         | Bin 70465 -> 0 bytes
 tools/python/key_2_emb_formatter.py           | 220 ------
 tools/python/optimizer_process.py             | 116 ---
 tools/python/readme.md                        | 110 ---
 16 files changed, 604 deletions(-)
 delete mode 100644 tools/feature_admit_tools/get_hist.py
 delete mode 100644 tools/feature_admit_tools/static_key_count.py
 delete mode 100644 "tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md"
 delete mode 100644 tools/perf/mt_1207.sh
 delete mode 100644 tools/python/images/clip_image002.jpg
 delete mode 100644 tools/python/images/clip_image004.jpg
 delete mode 100644 tools/python/images/clip_image006.jpg
 delete mode 100644 tools/python/images/clip_image008.jpg
 delete mode 100644 tools/python/images/clip_image010.jpg
 delete mode 100644 tools/python/images/clip_image012.jpg
 delete mode 100644 tools/python/images/clip_image014.jpg
 delete mode 100644 tools/python/images/clip_image016.jpg
 delete mode 100644 tools/python/images/clip_image018.gif
 delete mode 100644 tools/python/key_2_emb_formatter.py
 delete mode 100644 tools/python/optimizer_process.py
 delete mode 100644 tools/python/readme.md

diff --git a/tools/feature_admit_tools/get_hist.py b/tools/feature_admit_tools/get_hist.py
deleted file mode 100644
index 1afe061f..00000000
--- a/tools/feature_admit_tools/get_hist.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import json
-
-import numpy as np
-
-file_name = "slice_0.data"
-data = np.fromfile(file_name, dtype=np.int64)
-data = data[1:].reshape(-1, 3)
-result = {}
-
-with open("admit_hist.json", "w") as f:
-    for d in data:
-        key, count, _ = d
-        result[str(key)] = int(count)
-
-    sorted_result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
-    json.dump(sorted_result, f, indent=4)

diff --git a/tools/feature_admit_tools/static_key_count.py b/tools/feature_admit_tools/static_key_count.py
deleted file mode 100644
index 53e5237f..00000000
--- a/tools/feature_admit_tools/static_key_count.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# coding: utf-8
-import argparse
-import json
-
-import tensorflow as tf
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--file_path", type=str, required=True, help='path of the '
-                    'dataset')
-
-
-def static_key_count(file_path):
-    admit_threshold = 30
-    dataset = tf.data.TFRecordDataset(file_path)
-    dataset = dataset.batch(int(1), drop_remainder=False)
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    offset_value = 2**48
-    shift = 1
-    result = {}
-    table_list = ["history_poi_seq_id_list#vector", "poi_id_end2end#vector", "rt_day_click_event_poi_id_list"]
-    with tf.Session() as sess:
-        while True:
-            try:
-                examples = sess.run(next_element)
-                example = tf.train.Example.FromString(examples[0])
-                features = example.features
-                feature = features.feature
-
-                for name, values in feature.items():
-                    num_list = []
-                    if name in table_list:
-                        num_list = values.int64_list.value
-                    if name not in table_list:
-                        continue
-
-                    if len(num_list) == 0:
-                        print("===================")
-                        num_list = [0]
-
-                    for num in num_list:
-                        num = num % offset_value + shift * offset_value
-                        result[num] = result.get(num, 0) + 1
-
-            except tf.errors.OutOfRangeError:
-                print("EOS: OutOfRangeError")
-                break
-    temp = {}
-    for key, value in result.items():
-        if value >= admit_threshold:
-            temp[key] = value
-    sorted_result = dict(sorted(temp.items(), key=lambda x: x[1], reverse=True))
-    with open("key_count30.json", "w") as f:
-        json.dump(sorted_result, f, indent=4)
-
-    print(sorted_result)
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    static_key_count(args.file_path)
-

diff --git "a/tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md" "b/tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md"
deleted file mode 100644
index 2cee54c6..00000000
--- "a/tools/feature_admit_tools/\347\211\271\345\276\201\345\207\206\345\205\245\347\262\276\345\272\246\345\257\271\346\257\224\346\226\271\346\263\225.md"
+++ /dev/null
@@ -1,21 +0,0 @@
-## **Instructions for Comparing Feature-Admission Accuracy**
-
- ------------------
-### **Tool overview**:
-
-A tool for comparing accuracy once feature admission is enabled in mxRec. Currently only the model save format SAVE_EASY=False is supported.
-
-### **Environment dependencies**
-
-The tool was tested in a tf1 environment configured as follows, for reference:
-
-> **tf1** <br>
-tensorflow == 1.15.0 / 1.15.4
-numpy == 1.21.6
-python == 3.7.5
-
-### **Usage**:
-1) Point static_key_count.py at the chosen dataset to see how often each key above the specified threshold appears in the dataset.
-2) With admission enabled, run the get_hist.py tool on the saved HisRecord to see how often each key occurred during the run.
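The two steps above produce one key-count JSON from the dataset side (key_count30.json, written by static_key_count.py) and one from the runtime side (admit_hist.json, written by get_hist.py), which can then be compared. A hypothetical comparison step built on those file names; the dataset path is a placeholder and jq is assumed to be installed:

```
python3 static_key_count.py --file_path /path/to/dataset.tfrecord  # writes key_count30.json
python3 get_hist.py  # reads slice_0.data, writes admit_hist.json

# normalize key order in both files, then compare them
jq -S . key_count30.json > expected.json
jq -S . admit_hist.json > actual.json
diff expected.json actual.json && echo "admission counts match"
```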
diff --git a/tools/perf/mt_1207.sh b/tools/perf/mt_1207.sh
deleted file mode 100644
index fc0af5db..00000000
--- a/tools/perf/mt_1207.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-# Copyright (c) Huawei Technologies Co., Ltd. 2021-2023. All rights reserved.
-# Description: performance analysis tool
-# Author: MindX SDK
-# Create: 2023
-# History: NA
-
-#set -x
-
-LOG_INFO() { echo -e "\033[1;4;32m$1\033[0m" ; }
-LOG_NOTICE() { echo -e "\033[1;4;45m$1\033[0m" ; }
-LOG_WARN() { echo -e "\033[1;31m[WARN]$1\033[0m" ; }
-LOG_ERROR() { echo -e "\033[1;31m[Error]$1\033[0m" ; }
-
-logfile=$1
-
-# ---------------config start---------------------
-batchsize=9600
-parallel=8
-nv_throughput=820000
-# ---------------config end---------------------
-
-validate_options()
-{
-    if [ $# -ne 1 ]; then
-        LOG_ERROR "NO log_file"
-        echo "[Usage]: bash $0 your_file.log"
-        exit 1
-    fi
-}
-
-print_throughput()
-{
-    LOG_INFO "=========Throughput====================="
-    nv_sps=$(awk 'BEGIN{printf "%.2f\n",('${nv_throughput}'/'$batchsize'/'$parallel')}')
-    LOG_NOTICE "batchsize:${batchsize}, parallel:${parallel}"
-    LOG_NOTICE "nv_throughput:${nv_throughput}, nv_sps:${nv_sps}"
-
-    grep 'tensorflow:global_step/sec' $logfile | \
-        awk -F" " '{sum+=$NF} END \
-        {printf "Throughput: avg=%0.3f, xA100:%0.3f\n", \
-        sum/NR, sum/NR/'${nv_sps}'}'
-
-    grep 'tensorflow:global_step/sec' $logfile | \
-        awk -F" " 'BEGIN {sum=0; count=0;} {if ($NF > 3) {sum+=$NF; count++;}} END \
-        {printf "Throughput: after filter(<3), avg=%0.3f, xA100:%0.3f\n", \
-        sum/count, sum/count/'${nv_sps}'}'
-
-    grep 'tensorflow:global_step/sec' $logfile | \
-        awk -F" " 'BEGIN {max=0} {if($2>max) max=$2} END \
-        {printf "Throughput: max=%0.3f, xA100:%0.3f\n", max, max/'${nv_sps}'}'
-}
-
-main()
-{
-    validate_options $@
-    print_throughput
-}
-
-main $@

diff --git a/tools/python/images/clip_image002.jpg b/tools/python/images/clip_image002.jpg
deleted file mode 100644
index fd387c49710c1d99303cce3767ff720cad502944..0000000000000000000000000000000000000000
GIT binary patch
[base85 image payload omitted; not human-readable]

diff --git a/tools/python/images/clip_image004.jpg b/tools/python/images/clip_image004.jpg
deleted file mode 100644
index bcb5cff76d9c555e3cb414a2d30b1716a902d31a..0000000000000000000000000000000000000000
GIT binary patch
[base85 image payload omitted; not human-readable]

diff --git a/tools/python/images/clip_image006.jpg b/tools/python/images/clip_image006.jpg
deleted file mode 100644
index 8759d287be51d491663043f531f5e3041284811f..0000000000000000000000000000000000000000
GIT binary patch
[base85 image payload omitted; not human-readable]

diff --git a/tools/python/images/clip_image010.jpg b/tools/python/images/clip_image010.jpg
deleted file mode 100644
index 71b63575957f04d9e844f6425eeb562470201840..0000000000000000000000000000000000000000
GIT binary patch
[base85 image payload omitted; the binary patches for the remaining deleted images are truncated here]
z#`{tKkgubbDe!r{e)GW@R|#udrTjP0oRL83|9=fA;Z9j627 zQ{!hG)Q9Y-DZg1OZtD1|K6bd^r5FG?UlxFi$ZTM!4U&sw#x`G!5djn|&D*ok5_>hZ zwF(?7?3msWW*tpYtzA9#x_NQ9-EStEMxfo0spd2>s=yFPTTQe7+_dGRp(AN`ul$KOPRn&&;dDhwJf$o<(VePj(cZ%G7E>F3aE3kiS#{6l@{91pp S3f|QI`gi^_CrIOOvwsJY%8-Kq diff --git a/tools/python/images/clip_image012.jpg b/tools/python/images/clip_image012.jpg deleted file mode 100644 index d6d1169e8ad6fa4bb7e21e764640087b4f0d3ef7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17452 zcmeIZ1yr0%wwTDgSOGjylmp2D5D*XmK966( z!#v<6022)j0}T}u104ek3lkfU{0SZ|E*=dDDFHbn9WxV<4#>dDDZ<0bCdAGFV1Sf{grFI`Hv- z05U!b0X?r2Dxtb58iOklUr0hOI#9Z*omgY+n33PiEffQjgp`b&f{B@h^(mWxpwKg6 zk>@XEWaZ=)K#H$5wX}6~_4LgxEUm0h9_7>mL}On4FrPnVo~JuEEzgHn+BSc27>v&Mz*n zu5W(+#)SYt`UBQKk^Ki;_>Z^{k&%&*(SGAXK=gi0NchMo^t`A9QtD`?u7nJHA?QTX z3At767(jlFV`4M6F-#IhfmNoH-_ZVn?0*he=>I2V{{;3IE*JnC3E|OrNcaE=z@;5) zUNGk9lmpJXoo2nfS}}6;kIyq)_W}3?;Yf%50PBz=Wl_;+bq#GRr{n^`mVU;vGiI0G z3jXo@2f#ZE%Jl}T@cPAJH>tS9Z$bzc#5pu;l-CJ`XY{E+Y*Z;E@o;&GD1lG51=04~ z+6w*HjqCW6f%1g}0i){DY?yuhA39Qbr7pD^gXagnVr1l=5p@A5Ma?&kCYaOj37RrB zFt}EA84SWf_;~3{He!ic`4X{CLeOHyF&SruWghKURH2lIXFsg#{ibZiKo)|L91O{f z)@NcpI3YCciTi=#rF`|Q^FCHgF0T?3wCFCInN9%uVhV1@Q7R--{%%B+j?ELT4y?r!!0IcRIy2q67FQSvexQ2#Dp)g8AvJ1V+*|Y zrJmBUs-V*y;|5{YE&<2kvHAvmNe~?y5z%0j;9f%utAhF&4rf7@DrjmzKf!Q3?s)l+ zh;18usXKAuZ!feu0uw4o_TMA-Z?XlWx?lOA&6G1PQKsy!R&ZHBZQrcHyt@Ld==;d8 zc6PKaX2!);_m{2XJ8C$aM-#%6Bc$P_RNz2mr5BC%*6w27iR}bE17&9$ACoyl8&+0m z4S(=csG^A)qVmf#a#1rzrHx6aVa1?`U98gxmpZpRD>0OJ$0n_kN1h>2lp`a-ZJ>*H zZ{Grc+ubZ(`-JnNOOn z629-`pyWlf0{%M*F{|7H22ORg0pk}c=SKDs>!?jsI&slwVK{ce8f=KCzPc6a3gLa1 zW7m%cY)Y)+kItFf0(oZ-(z~y#ih3`n)Ng^4OLV5O~bXH8KU{Nd^~4Asi3I@ z?kb2G*mLFE+8htk!dluO^+FLktVCH|2f{_&Yl$v-N`WtI8FW#)3Z6%3Ep&$5$!{=2 zC{d3W#&tWbX2R)7D?Oh^ntmkZ`ixfvlu)cGvQ4F63h(~b61pKa?eEHNQsSgw9;eY; ztimR6f@NrE$dp_R1BN3-5VG3xg|umjouZdfI5m=#Z+^?r0Abr;B!;AlC6V9LFaJ1^ zByu=c+QUkCmWDDQ{B%9Dk8=vFE9VQ}+ZG`<5~CY4Fz9u5XM(1>Q5A3zd!d@~_8L-r zU3!{eiM&JFgLp5uV!ph&Md@{P=MM5;yLNoDV_ui;1?@9^vY-_jVbW$zHW2eOU94?6 zR;*#_8UF#{4H=v}K6*J?HD6**0Fk=vza(OufuYM9&YI6EnvshU z()GfO-kU{5{KWE>8eMKgo-Za5MhtxXel7_Y5$OMFB5yeo{O~m+iLJYTD;G|Mt&aAd z+wst$J$-3yd=gb*eW4+dv1Y|ZRm{3jmxd{b_aHXx*EdVu@P-5h#?pYJZ61Wt2f&-G z00prRqPe#8wT%`r{olNSHIDPp()PW$>ENe}Hl(LYUIWLhEnR$()%@fRHT9}X^YMTS zL8}JTTU=mYcWRT84xH_7&uA1LJ--4~Py08jZdpxbQUjV|xK zmdcrof4L<$F$etgR3;R-+2@r|H|nuf{Ix8-sHn-1x9@n6)t5wK5lrfe#F(`(>F_s7C>ZJvXhRB}elm1+#7x|VhJLdRM%wY~*;)F;;2ejLa^YgXBaRIA| zBc}>IPSdksW40+F`gRsq+9i=+Pw7vvb)NTEh2Y1_Rr^M$VWAqeeZBw^Gl!SsIL#F+ zKm?FYFv6gB=y*+mu>d=DSt7unNlU3Vbu?4~&WF6%fQYc}Kh;#`0cwE8`G2n8KM)>4 z0m#<-RvgT>jTKmaATjZa@%r4^*~c{=B(}k2+0fHIHCZ8nJr!I%zx(7M4ffQnkZKDV zd!3?EBHCk{iw2dyHu`{u0L^hEpCc!vA1Pg>O)IWLniZ$|O?LG_6~>RQXCg5Gbcee} z7=Qa^x8NQ9mR!o2F~43bB4nL%8)v$W~WU(bjqirjsO}H1X9M^SumFnfHi%*xb(#h)bl5Tbc zSN!DlF2%X_*iv~@Z4InzWSO3~45d%oDHu+rcGoswjPpkK&Cb`I83V_1h$k4tfVaVp z(vc`cTh6YE(`h3k{gNrSPRTn$cI2QSer}XnMUK^&NSADlz9^w^@vAJB(61b) z`H(*HJ=MAs;>?ZncsWLVMhq7&a#uojYg*o)YUoid!gjQ!-S>{Wg;78r7uj=9pF$e(N@9{`OUGu*^ADg> z!7_$b6GA7$trQ?xZ=HHv7uV^HBWfFVcV@z6xI9eM|7-_jO&F^@l(5)!grRGpdE-Un z(wiYHj_?4W7^<7F^3Y8upC*Udq-{uB(0>p*o1i9g*P+m?+dTaxGr#Rzl+RlQ_GJLG z*m@BqTn02(C1hQ15yh2r=)8SxFrw9ac-b-kF|LI-#>92R?o0f_(hi7pj?`a8H^UJV z|5TVUBOXP|8X|5d+8uGgVs*O2T`ejH6nE*2UBumo$7sqWQQrq0Bks>k)e)K^MoNo! 
diff --git a/tools/python/images/clip_image016.jpg b/tools/python/images/clip_image016.jpg
deleted file mode 100644
index 05dc68f17f3a7f0554712a39332fa3e21cfac93f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6056
[... base85 binary patch data omitted ...]
diff --git a/tools/python/images/clip_image018.gif b/tools/python/images/clip_image018.gif
deleted file mode 100644
index 3995b48b3d4d0921f5da9eb4320a61e43a507aa2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 70465
[... base85 binary patch data omitted ...]
zna^d4NV(vtupht>sSU#pZ(7R(NbzNm#zs{!mEI}x5m^F9*lOiZIy}4RYn$h`X@WFZ z;s+IzoQc9F6a_v-#c>>czv>QLJdC8CtSNZ3@+n~i7I_~{*t46snx^!fv(uS@-RA6+ z2LwBtonjoNerP=^7(Jrts2qz@_h+HqSh2zmdvmDrJ9i8E?*oLts^F%O!c@Gv5NfQ{ ztbsHbS-|)3i}V3B@}51w^ZtByLuF z-B2cx0;1HZD%Kpu*$6>T5ryTjMa4Xu%?OH;(!K_ci5Ux($UsFJCdsm)5vs)LEwn5N zJyb?%25f8w^?#RXF{pS`6&Z@HL|)cdVi%To_A7>sHf12yIlAQO|LzIVJ!$IZ?@V+6 zOgvTZwSw_?hSV_U=RuUmSC*&49XqDmNR813I^%HYv{Ea`(J)n$a8Ql{DGCB!xWDuXD_ z_7Ut*Kjn%4pN*`TS|ZoM$|dx+|+!WOszGt8uX z+V{f)A{rb4A`fRkO6U!6$HuhHgJ(wBiW)$Lp$EEUZKz6e$Y#EA4B{Y3R2^xydKhh2 zXF0AHl4xQpi6S5*6_FB(eg$I24AGH;GOFB}@{_UiAx6G8)ey?{Hr!{(6mO++mY6?7 zi^AD)k|*>fAt+(mmIJO=lkg!%#tolFeh)6BX{z1&JgI6wa(;3|as5{7T9t8U^>U%6 zNCXmwc*djBl?-|fOCpMmY@mw7QZ=3r5v|#%NCYyH1X{Dv!e7L0&~~L7Mk3%yt*ftQ zKh`+H62{-c1#*n4K=Tk=3}8!CrlNybLY^#fIv^3!4@Lp_Lr2kZK$-`MsMw5=4iDZC z5U~)KRbYtLrqMmAg7N(r21l#_gxZabj6mOoSqvR;cl?4jeS=71@)}JTWdKELjk;tf zk*9I}TBd#q2>AoZSS|~}1M&jGVQdZ{pZ1H6#v^$^;&4AitPrE%^hLk= zK7W2rSzLbiV*fO@ULuf#R|~np@6m--Y;_8sGr9|#(A<4YtTwnv|ZW_9F>D6Ja8MY7W!T*f+%!C1dMG&zZ zR2WBDdI{;jq#2lw+*nHe&~$V+i>d2jYuSj1kkpe04u!IwT81IwIr7#=jk|pJV#Cn0 zP1bvssNQTFkR#V&D7T(}>0R#qld<7?c?PK$NAL%)CT_lH9iNpIROjKi!z{431Y7C5 z6@`+ROAqo-$8FeFWM@F{wgT6a5IM;)*+>c1Z-E_Nhm4;&q5Z^MIJ`C3Y|kN=`-*(8 zK&@Bx(l+x>!5l4N2G&Cg<>{-1e-9=x&~cSsndDK3UG}~e6T~P_C^uK;CW=uSFvU!R z$N<5X%7YVTn0ry>)o|ru4j94^iz!FuiR}`W=-ynjIqhZDK@*AO$g>>;BVcW1TPqUF zvYY0p0YkHIAy-4oKhJ?cpx{F|R+|;DQIBCTP$Xv+i(HZ}ltxi39LK9ejXHO>!-d#r zZP;*xfwvAvklBbE#ookpobp%Q{x^sLv9zkgz*kO7Ff}`je3dtSb`p?!RgdaWKxuQJ zs`A)6>Gs-g#F`%}-p?V4EwjI7u;*^=eKipjFe+tMDjstpLgSWFFuB0F) zkSbRHvrTZ%>S+;%rni!To$%AaF#8#z@l|p2aHpm)Rrf%hRqQm@RV6D8JIpw5GErx= zEL)|5c_OJ*#hY6M!Pqe1`)xxsptT(dl#_y9hZ>u*|7Jly0>?u0oq9;0X-bs_R+oBz zdb}U;)f8M0UU``B#x4e#ZkEn~VoQ1e2hB3YdtKN54qw)C|N-;n~@R=%XVd9DeDQX@f6gTyb2@zh&ihUb9?am_5$Fk7NU-a3NAjD#?}Z?u&79=~6?-&bX5(IgHBc?yG_WLoZj} zU2}i}m*kuon8;aVK@#fKtihP9gd#$V$c8eGnN`;#0|x#_-=oGk}womG?}S!!f+&OzEmmleIhNnF0&xX^3#-9D1#w{K69v z2QxKHkWHfZ&E8a7b7OloV~wZ7(RhVn2mKfu2`%}|LmkHGSWy*cHgBL0ZG9B=_b?eX zNGyL(qdh7rwyM429k+NDtEqTJfr^5?CauB*heJ{20Ga|wNctDtKXvU02{{z10IffM z{gLwI<;?To{fsp4(nZM}s|5@UNiU82u}Db?Vj;G|e_0s$yP|jh_7V%`e)e)YmwZe~ zCH*JoxJ&AetDthf44O6c-s|2&1><$^fqg@$wMHBxEWrE*UjY7{{|0CHvO$9Fh~`Rj zFRuMb!N2nwU!R7k%+ZA-j2olbJZ2ITMU@L+gQAHEmqYPY{To(13Y&zYQiaK-n$kyT zwj^W&mjfkH6=WdQmWc#<3r0CqvARV@ zhuX8R>}H@6pMCFk&3W}@E4TqA{CwlfX>kRcVv!5pC#jqJr#3A#zP$dk5zH7)ICfgH z+j@Ol)nb@D^dp{U@ZvE%sx(R2R*>EhZAT@(Y*SLC9@0erIYd>oVg>CbAsJ9mQ@L8q z(XN9qTA6zP+q4r4QIN$fj<#I@+(u7_1& z*4>a{B=NB7W58UC`Ucw=}KmkzW(b^)!9bRmh@wbmW7z41!=@%q>c+x1paY6sMjSkFugykZB!n9)*g`u`0!v%+*VtNX| zeynqX(}J6zuThn#FA7-kFmX)bS5)$7@{n*POO(W*#&U%1c-7 zozCp{^9#d7G-iR-OaRf%64fgOo~pP@vjTn2whF&M>MBXHFP_W7{cjK9Ywk`_jSO}f z&^N~@1bvHe)Vf=E-_G+^M$+f%p<<$@FZVDX3)D5WAKicLlvMN#qX)&-J9}zh+xxv5 z&QR;Y9g-D`rY54=A+2XqV)lr96CZ1=+w|0KId&R6z7LR?WZPG~3dI1W()lJa&5vS8 z7i$mRkotVC5Z!!jECl`N#n^4MbjV%QmFRsUS6&S39+h^sr2!LsE?&vH632be#*|Wn z>sKqs;vPVZy4W6lBLxR2c$xfu?6QK~%kJ6X+%W#z)${Evd(S@YHov0yD{jAL`85`f z#<0F&e-%~h)ks8~ia475aBUwu`HXC2qiZCP{|<}HjkxF+Fr)YUJXbws#2RfD)Bmr# z^4-Ip+>5QN4u1Rgt3*Fvt z_NU$tCtuzE?%ZmB9$mBX{1|L+qn+uLEPUNN}Y?O_POe7uT^jwP{C(5)%$`-O6f(pWo{?_?27WN_$kX(zQaxXqLMATFANg z8kXsN;7ckVEaS7-{%2K4<+GdAU5TSsqx=CN!4kAEAIAuF|uX*M^lG~O1 zCdXlCijQAu_jQ3KPh>mGobJipT{o)d;56I5jbKzFWBN;LXx*nUmqK60tUa0Y8KF&` z^bQUPo50uuq>3W*!k7=@M$=WS>AlY`eRQ(PJ2whRT!5R`V%;}oBk|Khp=#n!hYc9!V~43`!ahTB;Bc};$z-*S|04}SCeEXwsrN9*CI zr4eJw<&UqHsf#?b-Ty`<^ZU2(+q@hR%=W6=cB zY5pZ_`>~P$^N2}l?I&4@{{2wVyIlUh$06Y3x^s0w>G~g*9^}o{S5OS{Yxh|<>Y5$< z`cio3%))HU@zPp_qY+l2Zee%&buXCQ__*WFp6=k##Cynhs;;>OyFN3n)$gioPCJ4R 
zbL=fiq9MvsOSB&+r2H(u6IhC-Tx?4JcR5$$TgSeDg}wptug>v4F~x3LyDrKHc38;D zw`7mbOf6GJ&dX@o)>;@HIVF+)s`I0V9ak!7vRAM2fq>c`eYqI(Ycp5(ZfLWwMZEM7 zX{&cidP9&v&2)u{T{~oH?2faL@jvn7`Pt4kxmpS8^PWrdzAEbmU*@N~zjw{{%<42n z>%fCF&1Cf;b*2r2fYnSda|HR*wrJ>uCenq=}RpM40F7G#ed#Qb6du7G%IA+=> z?$LVpBR5MyPR&ucMPkQ4me!7hD^syyTp!fkhgW#1%KJJKJJwk(H&AL%5Oaxjuj}4f zrkamk2>GNdK&lHA{6)4+c_pK8~poL`{bB6 z->G{>H=mAeJvquAlr47DJZ`ZNU3LG$MDNWN*U6oS4qke{@%6*E#HWujNjA^^BbS%| z3;pk2@0~k@yXAYxYbO$avKAcIl-?X)DSZ87azc1=r=wKP+4%`=1rkC!^=YkqMu55p z@8tCj1^Rs}5k-2_bg(8r22Fx3`!1k#U}iByMG{~&K_MdI##Iav1dbD-6qeC(7G}#S zu3HQcB|_nVRR&7z(WdAL5);>j;ieEF1vY5KfrRI zE$T2lZVU0>;>#)qg_99H2TFcZ=kG%ib5rIS5VB;7$;BJ-FfsvZ>@y|Qdm%)eQ)<7` z@o}{S_MVz{97vXo6pBxS=n{U(nNsZZD?*SEX@?z)*OW-TU1BbtC3c*hiGTrzDXcOX zNdn})Nq{f|;3;f|lUVd-nmR{B9t$FfX*`rqDqdV~GaEYzJ~?u5Sv1Myxm=ikW~l_# z$JWGZ33$UsDg*v@OQ|QGiR;D-8;`*4k`R$s9&(tdWd#0Kj$$$~1)DV=X}&qDB4x^) zy@3hegt))YiZgzuspTMq$>>ydbXSy$6$w_|mQQ*Rr0_;XjSagErQ)6rg@jxjj7bkrv-0C zMD7ncAD&U&bFJ{dAO0b8S^TD{Ow*j`{;b60d?ps0d~-&vD)7eJDEvde#dIHOrw{`b zM@Oef|5B|-D~@BwT&)z)Z@MsiN?{NQGzJhq;1S_lS=0j%a%Frye4GSKA`>_hKN z?*_M|DP0)VMpaW7yc#OvIFHp~<8LYgMw4Y~&qZa5Wz|_4?(8zHr&5NS1b`*fQ%XG% zsbet-8FrN!ckvygh=`IQDLPV?AtTkr$8QLw4M3#~(3A~Xv-q=sf=gX0rd@nHGZ1=I zge+`&f+W?)LdbG}uU{%@rKiA6khfysNZM*-gf;+)vJm>z3SV4R&^%wFEqIp%FX9lI zllJAovR5e1Aj(EV7JFzKR=_Pi8Qw@=8?&5f{UC3P^@I?cqU0fTLsKv5{%*kRm z%IkQ@Vk%?+@vKN9LYfE|GX(=gK-qX`XB$SkSd{DiAsxnnqlywHldQJ=uA$4x9*qcb z4&rvd+wW@8&s1$VUOojrEB70GL)MaIRc94L?oz;j0pOM7q1?mxJc;Oq&%$Y%XhyjV z9a00p`?>t0?Z{qsjR^ltDF!;#+4PbzK*nKxA9K)kFDSvsPV zRU2KcNF-McRHy!k2bo=yeVzLhcBri!cNk#~ zVAU9qU>8w`0mk>9Q+z>J3P}7_5%y?%KeXvtjo^Qi?K9SF?p_&Ika@a2S2CQKvqCn4 zlOT0YlP(E(aLwUbwd@ELB z5?27fEoJh0+d7j9JVuo#23k!aDCFXE`e;YVa$`1LDPGW(OrWPDR3;&_O}^@DiX*=2 z!Zd^uwO*jH=1!;#e z!WwfRBDquGS8$F;Yps|2mVn<5aT;w`gs!$bW{>iGtlVPb$wTgO3fb~uJY(nTu7tHSy~$eQE@Nj08X+l1p|nf%HnNk*xi3=MB(QX{63xPW>u(hAlItTv zr7-^UBtMa*!BV&mQ{{%tc6~w?i`w#5ctnk`sK<}GPHdzNkG z9U)RN!0Cm;$cP`sgT2KV;wGOtnP|u^zeJMKrh@c_R+~8$f$PodwUE6T(h85@!LP(F zD5Do+KFRlav&2fmQH zv}P3I*N!1rnG1WV6~Bc4RL!gyki=if|DLOyw@o$qckJit zHA)vWVFr}zOYuDuHD%+AYGofxNxwqdg^VM2Y#JI+(>S-pxuiSYGPl*r>fUrU8Z)p! 
zZ$GEP;`*A`)7JUBWZf`Msy62Z?N5`+mR;SB64(( zmgVxno>Mm!Zi#->A~dyclMpyY<+~9`9YwMzyHApKoK)Wc!3m8y9C4~9@ePH&qR&E-8L!G?7TEn3v`4q+(gJSSNk07j z_6PBupDkDjzxly0rqjP;{SHSU{@nW|P_u+klK+fOk<5A;;nNR&1}>k715W)%0Pa9} z8&aAi+$?F1Yw3>YIU(4`0oMA?*l zT5m8@jk_lLm#-fTMHrqFsNq2L5=pDJE()MOdS?Q?3CdFuC{M+2^|+qRB6VTOm9U6x z1@wV_5dr>ZP*;=V7m&me<%6zcChn!G3E9lqJc%@MW8$C1&Us&-q>whozBK&22WoHt z5(65!q)7iDUi#&j2Iq-uynq55574WkH6#M-mzq2bQWNVcvq1_8NA7Curlp98|-|_gBK=vL_@;KrGpS@JhH`@M;caGkui*I(HLLz96Gal$7*tBEcf&%xBQ1h}15m zJS>8vN?qs|^$^mt^DqjyTHXIu3>&Ig$k_U$4OP4lF)w&}^KqgTX9vvY*I&*)IW4Rw z3tX*La zF#U5by!nh7zgmXf(-_YbfhDt8I6RF6c7-}fq)}Kq(P+BjT|W; zAgLooLTO1IDbgX0{3vxQD5+ra+n@K3Jt=B6OxY*vUW->$0>(O%9s zu70wAp-oX8>=d%VJb+7Jt-#kBXO{LQqhZ=DOtzw|Oz@7-0T9l#uR+wb07g)AfBn=; z+0!h_&fBpuXpD)fXesK*(q_~(OFnak)#;kLTR9)Vm`>Fww-g{<)+chQ*fl4O2ksH27dVO}##0ptzz-P}gbL zO(6P#XCO`qU~q=myW5zBVPGHzM_E4+ge9RXkiYbfBp(p}Ms}YSnLe+$HF(e304q(| zzcG3W`t^4f2Wi5iNpUrS7}i&1To2t^2mW}&&vX|sv&g6KcO=U*Uj$V6>J`tsFRb@0xy+F`XxFIi9lxb{3m&j;BT z=G_nPN$u5CP)Qu>uxxrqh$XbXOD`WY21zjc$sXIS$kvayD&2+-%6_rbkb{I%8AKXt z(&YK(WgCT&6<)m^J(T`Q^9}HAuLdMq=4%xX21~g#tCTe|ye?oBr~pw99>u+R{%hpL zX4}$aEhBi5+R<9uL6+U#buyQ3Golb>o~u+HNja#OHaDfzH=547sCIbSmmliYp1w|7 zlFfZ9qv+fuHtDG4T_adNE8RCou48JbeZDfMO+5sUO1D&I^#V+mQC~Vz`o-CnsFhi{ z)1-Ru+_z<^oJ1It$^sc&_I#7`Qg<@V*I7bernU4b(U3>EPSQEAl`aEh#tU)~(q_Hc z3Ujr;Qd_E2&0eau4`n^w_f^PsaNlvBSNXz7@_B8LYs2<(XQ8*4KRu;6{zy81eLS$u z4nIef!6vh+$Gh{IFDD(01Nz z@X>5bxx&c%d6hclQJxxqL5sTO$FtEbU#gysE!P6RxouzW)mwipsj5$CKgC0pXiT@E zfErtWr_u3?)%HfmuPc-T*^%ehoptWo4XI{-l^c`J>VEBT*>U#vlxppCoxNzHY4zOX zFO+-7ezwl>=%Bp%Ny9~zpBHZ{iq09ORQ)Xev91`IsJ1Q4_~g#~jnli;LvLOc&K_)S z2x5OOUqY(>e0tyI=klHYhY>5!EFPSs)jtumYP{58+LP{m!0hyoY}b`9K4ComRq#~l zb{E^CVPq0J3*v(MnV7e}vHS+B|M1Ee7til{G`eH9=cXkm+DGNvZ$D|ScQ0Ra4v5=^ z2ty`bUYyY#k6rt11%LLbZ~TPSuJWsyk@wLHam_)W-X-qS)uXmIHUk0W5vTUIc43cX zBvOLwIP4NK_PNq?SZ-!MyNw z+ouCOWF(Y8)CTwB8tHkcn==Q(MCM0k+GMrFY2>FC*IaYoJ9~DrJ6^H}DoRC8%O7oq zjr&Ddf>Rp#$o7GhcE`H!`uuL?%xovdJe~}Z0#0e)B?kDHww_5%#;zTndh5Ru^2f@Y zZeQ`%%e$x@v>l56aJ@4`a_AlIipp9Ojdg@kIe9-rJuK=GQ0Hzg#E?>k5p%1-g*fAF!fQO9fRC~@7nV6$cc z(XgrLcz4gX;?|PEO-;!Sgi2Vxmknpr^iKPDwbc8^3*H5f6Wcvyr?t*JE~@T`51vx{ z-Fd4!TY8egN}=R6XD#@Xw*k^f0`jYzTAI~9D*I15)7{;@bM%f*?4r;0*$$W0A32T# z?|pQi$t^42mEz`q7Ni^VCvT%P*m2g*M>nkhgWzWk_D9aj0rO4sr#$7S8vRptj#YQ% z*qB&2S<^Y+D0{p4=P&dU~+yzH_WDO>sXX=E&8q?`AZoKHZZ+nh^S>NV*g zj^f~TJ6Bfw!kQdcDX&I5So+-u=U4q`VO-_ns(MiiB!_ANB66GHEUt^rY3dS0fG8sl zMtrcqOrdweEvnu;k8{H;p|35eBgti`M`3|z4G4h||J;~8)Dqo!DmEuwVm+F7v;SF_ zSA-JC^5DKC_SX54^P_-d@7%T-NIsFstJcaAYH(;Rm=n@aDKtsx+&0*Vud(85JHUlz zJmN}SH(a(J3+|E?e!fjm%KMgcoy+~^pryx{F@2Tx&sPLg>m%Z9_Yjz>u6F(MFsqCiaYh{9H`x(a5KB%^XF2!quHUzj90QYq&! 
z6q2bJDU>UMO7Hz(cv%x0!@SkQFoc&oPBmoES-@e)pAjr#foVo$U2qGXpu-dLfo<1s z8T+6W&>ktEUJ!+@*JZR@pH<7f`pEHO=e%6*a~HT6&;{sjDdHd|6yR*}OI?{!))?L{ zeS``PV1!PJ=wNj@OI`wRu$WA%R%zfTm5M|*Fv5#yFXsaS0Aye`sF?~AK*fp~H?9F} zB;M~t`0};AA8lo127bxN2B8eP*}W;F2UB%dS)F zm>?Qa5QPY$-!clq4}da`K>DHL(wh`*&@n3n1wmC&fhrIi1%#`5$Vy0nh6C;ya1IO@fnvmAiI&qm&h5nZ1I?3~ zcoE-t5)G!&51_DJ0GbHyN@T;m0_gSPHZ6{!SRSJ8@GgMEJW}&6Y|tR}J~a4K3b@1L zLTKsVDXC2}r?DYeBBLZb2ou}N572j^osEe)mik}4L1VWo&KEppCM5CJKd;Uo;D`WB zCP}%F zhLRL!nGw{-vaJm2C*6EfIG?a8p*q4W4^ezjOf!VbP*D{&=+4Sc6 z4@VvY&fx$XK0a9tY;OiYc1Z`Z{0Wwl24be4qsj7L5eG#*-!e&(l9u|%qtKy=Dgjby za{{dpF)Kja*v>lAE&cq?EUi3O2Oobmn(?j~@pBWA289AU<04fcG@3})Z|rF>AIvly z3+2I@iQ`6%7Aa+cKBS|oC94JlEN`jokkrBr~Nn)8y;>w7# zC4eDlFwPjnx7#1D^vvhBOuSo-XJ_qba7S)Lg=RrHJGLA1yWqQKvQRjPcapT%26veR zaRNQ`jRd7hkPi{z(MIa}3YZL$5QnFoMj1rb45A_mU_?VuvIawaJoqJU7dVM^7vu~j z{_dT~3ulDTp;0fWOBL)){pGXhSnbJJ20&*dJpRx2(Bf@kG0?c+_%t<~(R3`;lOYYXZ*sh6akUf%^(k#E$4(A9YqN_-D zLsxV%P)GvI9L-37t+3)ut!2Ud^$Exf5HB7En(R?rBl>0xg<;LBisnQDf&N1fC%y@U zBm3|Ycj3AS9BJp8Xw+V6w`Xn88dg13#devJV^=}BoD z!-uBeL@Wg@~s1NaU z8Qh&HL?LnF64=qR4lrVFJ*XA7m}FCd)hoMZ+B@4&cFm!5EJ=8_db;T(d^&*{?dJzI-NK@#f^oYu*cVGbc4%W?1+!z7xW zFPu5UFCP4Nh_fE>ABab_fkdhI*tQ3z6Cd!RtX>cv6WIsEY{a9Gn-(J5Ad#0C7ZdjN zjTr@So5)EBnU1jzb&f^577r08n?L~z7Q8|vEuQ{M_h97X6{jB@$_Sz`9HjQ`!Y@H{ zdoyN5Nl%YSXTHf;cHo0CJJCp-G<+?A-pv{{==S-%}J8oI_w zi(iEhxh-P^7RZ`=2FcOf0+^T^Z6FSeNm?_jIqQ_uw>Jn`y%GbEBKiss4t|^x1L@cG zSB0+7F7zA7y;kD&hq(IqV?p7J9Rts8jA_cGZ+onP|1QJVKJiOVl4witKg8c2&$k_FQa5BCx;|#` zEH&!O6+0A(9wr-Uo&(7=L{xx8b^!_6t1m=6^CT$X*i0n+waMezFd8D?(z_^c){VIe`3_*Vc*$4@+BhcYo`o}nvLmfMO))@kWAlwp#k3SgzIENr-;B3s~G(C@3 zFTUaVXEZ(W#54jFP#SW-E6q+}ypyV-NRlJxh)HLzeb-V4k7 zAT$>1Ivelp$-syMf$%W4n=$DF|49<2qS}ax#BG)~KyZr~;{>bv;mjdhQQdZxk9xfb zos_h|#xY6Mew`pj1%lR2iToKc!t^nD1)_V}<3dPqFa)52`q2jP@!`QDA$ z-k#uRzC*~s;OOy86upGpZJ)`=zl7H;N zWGn9G-prKXd{-ezK-bqeTY!w;OH4Tn zK_K?4TsSuziuec#KZat@DALhXBq$s}9IL?DNK>3Z>KmAPVuaM>u?R&e^;E5ThyjU- z`vwwv#e3>0Dnu5>j||72kn#|x{}OGnO!7_MX*5tm*zv;i*@E8;Qi! 
zAxP@|k%dWSce42by(s(L`Nk6?&HM=caH68|Dm(7?trc>&$F;d=*1Y>e+UQw+^bsrY zmoqQOcZAq;e?i#mgbN?Xuu3(o>R4ST05h#b$>!G<3x_zoo1sm=w&rJQxy zIvXn(9VksF_{aa1u-UR1jN}dgfpX=AsCXa*Xciv zpRpuNj8^+ELE_BTY4gz;gTG_zm0&nX6bz>+ejL=|(2<40ajv#RC>Wf9&mkH!B6wE2 z`7s|E5F+k@u%XEkeNm9U9z9wZki+HD;)X;4xMV7z4ucP2NW9}*k}PC|Vr48B^dcul zW06!2;z1o0ZBgBjv+PlWsjdZWsSRq}Q)}mUYx?1( zJb!0R-0DK*xkerh2P&~Z8-g6tq5_}oJ76m{=XsB*3En|$w9~@EoBs4Ty0HpN=h+*h z--%~mM(_h4KXDdbNx$F{20)&%1SZ}sL?(tktedy{9lak~1p3x}O>2tI1nqco-tK&_ z&K?d9>9=MOrc9z3*i3T^=-}FJLY@4@c7GIxYRlR4r*NL2 z<1N%PQO-{9b|iARK;#g8&+NTHPESq6d4b!?UyL@157+5@sTCCAd^<>&Qe^(3p>-uw zR4ijFo53^h%oC`q(OSVLM81&!g?^mdgR1wHYryxa)1&l4mO6&V7AnIpuU%&#sEb^Y z=4kT(3jW;U_x<)vYPF>N&Y@FjP~)Oyk>E#(bAc*WA>ivxSB)i$Zg8kf=|JTi%I=kb z9bGLhS#V*VPU&D{4Ms8x=8|gLKe_w1Rxpi&_t=Pk@ERat| zUn~+Fws(B1I4T^~^{^`EB>Z9VYJ2MU8iCH#rrrw=|2W5JmB^OfJ9}|tzfbb4^M2Qv z|9Fl@6eglRjnv#oaO)2n4^u7V=iHXe5bKO6d?WPqR6w!tz;^8Wps(Bernv{j1G^Sl zQhuSU?Efv_8UFA*bwB!6-QPPgz`1Spel5AM=6=*j_s?7Yj?ean*?y)4#n?t&_&MO@ zo}m0n{FAKz_*HwM<}cEZih`-4G@r%^$zNWJZ@9crOW!mmJ9g(kMLId=H%u8_^Jz6P zSKO_s2q7y-rRMRC@#Pa){>xJ{d?K@wYAWR)9Iu7`?eDvwZcz+t4Zd6erA-| zYIT_l4ATKM))HYz3=2DhieT&axF0iIzct~nV?IwL>dkP$W-1FJgDh@XQ7yc)7W(po zkg=bX%N~GajG#U1Gc)~t==#KKKJ`o5!zsbezuW~b*>${KXU)1HyFS-w-GTJ9dT_sd zT}UP@A1D1S;Nn-26=`Lys6~q$aB{`Y@f|h2&=U9i>S#c%#l%E*L2{NL9dpOk8y!QR&q$3kYLIFJN3yBmhY7lzpL(Ks9UR= zWF{+fKi;*hnto)lonlb&%$sINJ!B8kQ>s{F3uvVtp3h0V)g`o~q#=EF>}w71=gXlb zdhm$X!X(3nZ~{GQWh!kc7clT%YBsGYl2RplncZ?!&l7B%&aS^;eI0GzGFQZajZcsh z`lYgjZ)Y;SuhCfrlB>e??bzHGP;uX9Ft+7eJvF~~(SvqCdwO9Or}oUfHV0TZ;$@;T zhnI{lYslnE1gddm{0in061*c4ku>a(zvVdAW;1rT!X&o`I>)fAB1N-$={ev>E@Mg> zop$4J(`t6s%!zeg&P;1x{5x?Y2A|Jqayh@-jQHAjzPC3YnX&nyfa6-5;qN$TH6r;3kMuf`W9t@(dZf9$wZ_e%cRx~=zv zP!{I^5p{iw-{OruW(Q>nXRVE6R~26(=-{|k1kX*LyqCx3AB2?ijbW#0Rgk($sXq0| z7d!Z3b$=(LiO{i2(!6pV5SA{@zP{4ABiB>y*&5j^{V{^LJ*6l@%mbASj@`9VGk5>E zLOeAtsVd@p+io61!WmqHg@KCRk{3g$C7;u_D`Qy+uA~C{Q6$v}NRSxHOZy}w|0BTr zQ5`q)H(xwp35=)Ag)mI^b24q+82!jaExw*Ri!@l%b-yF#6Nu4@aGh%w7La5`@5rm% z%&Z+|Zhlf&57GDv9|O_*$jQ4uBlKoZQ?E5Tk><2PRrAl}Z~GdT{Z%S*Pf{ z^3+Jg@&{8!TQ%NPU!OKSfB*DSN&Wa~_VtucUE`6H6^RCGJGe5CT4;hf2jh1MnDmvi z%K0v5CP8#r$RfH0$0pQ95~lBOYcQ@sm5q&=acdxzCdwHi7RTmY!mJOcFtiNm68mwn z^cdn3m2b?vUTY*Y7aoZLKu{icun8>&rbfiMVF|IU*nUQ#3-9C_+pHvB3P&u|8w8nU792z8MP3mD!D9{q-j~Jp z-RS7%{HXcf%~3`2DP1k<0GUPUiR`{{{Tq1E^xNQ%j%)gRhF5at=i9%_I!Ox-TG&

K{NB~9X*g&9T1=-+_P3Q{OVk7;)LRe^5sz8aBA#!k2D@&gahmgfbv?ePBalgOujkdu!$DdrSNd z%6mw7AK)JUgSvXc4OAFaaAz(P*=}t=5o?%kDZb|M?3ztZVdFdei4DJXLxw|6{#$si z<|~v=>87ZmjkX(-?LBvU6`z6C1Ba{p*BaQCjA!f21%^y=-}q$=w20n3UAHlpax$qf zVxaai_*16g?@|;WCXW3llTTsJAhF^hkuZSwU9W};Q6f5)IfN)o2ma!K`S-Cb{y220 znbA%%3jkA~0a~v@EDm7trGD?G zBH=^`4yVwp=kL|~&Sb>HfH&*>qk;@TRg)o}{-ulb1BYue##Hn%aOZG+W1WHS0}2xW z=r(CbapHxO8`n_YdpJ&^pUBbNtA!#7t>FLyAmi1m0jI-vIJPhJ2Z6vK3t_JX1II*` z`c;4?lJO3J9#S>B^x10l6jVys@`&3=VgMNIe@ciEqaZ>`iP;{jtWbcNhGRS~7Ww<+ z4`?wr!s^_6kAoxg6sdO+=Q-qd-YGbnV-VR#M{{5L@nl_o1zj_hqswwfSE*(MGH+J4 z6ib4`+BUrMDDBBzrTKe5?8M(r^%>;T^M)3A55H*Ji~8Ynda~5St%S#x5{mIh+MhE(rG$-fg?-;icIgTl6RKFuG|a4K`4b746SigaNG)@6*rjE8!{ zd}kGCIM!N#x3-u?rT4s7ul&X!Li-k*+Qu$u*pZ!t#FnzI;JD{5U*7^LYZHYqWfON0 zZ1OH@zOI@oAYLa3XO}*chR8pxSj8!h#}rIe*$0w{?1ofVa}^OzGf@ngq{NEL4Nbo8 z4R;$>?_#A!wYv?LN@B^H3kKz!e;RK`k-QiW;-;n*Gs-P7wY z;j$2)9wQ2?NMHytA5U;}B^DVj1kIE7v2(p|;BKzsDZ)fW)VlzLn6ND9l^iyQt2wf+ zMP1_Qyi+CJ?73Mwg%^8K==HP4&bf|tbNX}du9)z^A|~~~?1SRDjCXS^Oza4E1rj2} zRN1edRBi3S4K9OnX7NkF?6sHvPf+bVw{$!P|JL}Ihz{(KVR>Cf65s2K? zLeOJs#RWf6ZJZpA%92XeKkS|Gdvedo4O~w|mO}>)!ASf!CUTFjX%8St(hvg_h~JP) zYtlt-41_*u9F*Bu&lC!5)b+wMNb<+H#YG}4e1qxFDxTvhbd(Ap<;u9LL-=qC_*m7` zNTM(rpYnx0#^j-Yl4pNMAxaPlO`cvAu!(*yd!8v1@!9$?AOx}p?fdjPHQs4!M+ObO zUQcAh;Y6{kN+Cp7u{f9*l`({hrcmKUIHNxs`KUO>E#RLe@=84&p&+JF;@q}+bl0RT z%@~jeu;TrCK!pUWf-zeTgbgnn5CGx4XH4q-F!Y`=AgZqd;)C=`fD;5y%i@~&_E`J( zOj3y;4e{$mY{zt`SOxgD1Uw*krH2ra;gBl?15qT=MhzAB_ys4@|KM_ty=f z;4ZsWq1FWkxX0rW%SN**2|;)jW==yIXiXYwAHcx&ZHw3{tPUW&ho9^6bvgtjRyoA=X@@_9Ts8*#k`J=Py55GcLrx&4HM3#8Sm%sUB|y zk-y%(P`bMvxtB)%Y??+fwcuhjoHx*?4L;vO-0}*!1YEjwd)VM~tq%>%OR|{00B{Li zZ^Prp12jw)$mIAC@{#QzfT zjw_0yNNKQ7w0-#K3(Rzkbm;jGe2U9s^B>dqvI08qUWde9KQM0<*Rk2b2HqY}i{cs8 z{?P9`Q1v~06IC3b7RBqn?(UaiMr|BBsLHq>y%j7Iws2a%6;v}hwKiQzF!wuN8qjkZ zUU*{JosMir9{UbUJ>w2t@ESv2aT;LJP_-KkMETsA8Rk--rH(FPj4c3rB52Dke|Zdd z0D8c0dBoQD>*q*O!L=m*PR*5^uBGKvW(*zf99L65Ekj(A?Piztxa(lNw@Lg@H1?_O z-GQ%x50sfd(7H{{)21)QzS*qY@lk#n2+vY5jVZH@*lu)^^L4KD zR|Moi1mPiT{GxS2=*jT&?Til^z73(iGP~Nsqko3aL`QK-%pRZ0d(nG-p<15&`1h2! z!#q%(pd8N`(0=B^d9^-SUp41;(%sFZsdF<&Y*UR3P|zy6p_C)GD^*A#@IJkjXBXhK#>u_48;URy4F*InqmOxa6`Cq%@KpHb(Aw z7|StXHhi<5IkcA<{#Uxa@XC0x^^>_PYKd2$92_C5A=!}~t;sC{ub_cBoA2YCPaMR4 z3|e<2ef$A%X6wsIs(ytgTnJ1)dNF$aMS??w+d1vUOX*%d^`f73MtHO;Qc!QEl|`J_*Y zUH!fmE)_r#;(!>&ODHaEJ%uX)N)J<1?XLI7fd%!r)&kx8Q#eG8Lf|-$9!Vi_tGK*9 zcil&lx}i3b;y8miX7*JZNlY@3p~5`u3(u5%^MKyqKwlc=0nQY9A`hbmotw!;>M9*p z1ymBaf6(*)dJN+3sc|eIvjo#ZqcqwYolFn3B+9 z*S~1_%)T-rDf}bD4W!)=&|3A0t&`oZxzU-U2LcLu!(QXG5bF! 
zM{d0Bnr1*E+A^{`E!6fj%#t){)$S9zD?Fwc;Z;jntp18wsyuO%qm*qBWE(xz2X+rZ z^(^DWCa4Q@U(OBo7+YC9&;H04ti$(m#?r9a!c0MyHzj3T2jMQjq4iI8RBAv#S3q8e ziT_Ym{me_P$v4~IX0Ux*P_Ek=z^UB`MwLm4YA7j{wp3NWl#EPjlA3Td)2p*%3Q?)` z1L(p#%XDUFZ$((1}I>b(2AHhN9V8Z|o2Yo+djO>4_FIc81Y^zx}TH5_i% z%2)9yE6Ur7r&M2!9nVr(xF9vk-UMCNlUHB~$V#q**A?7qe0lGS40c*&?1w^s7A)k| znnY88Vy16P&O#nK9$EAFm-ejP&9wc?a={A0Iq!y=myBm3ABp%_MSaploToPsZIZsV z==QGXg5olFMRB-ch-}lw40&GEH7)7cebqiwU^cM+q3Zll+~{i@!u$mRh)h1vTOCwl zeG`k3!Pb6fuu-b}RP^=RrPb5vjk*tNdm~oo)~4S*=q^ANHJ+P}?#%D|6|J^S`q_+#LwpifbmH<^O2; zR`{)InXU}Y+34h5H()j)rCRLmFT#JB#{ z)a&WL@^cuto<};{rlF?z`_$u$LTuy+1uvrbM9lk~+-9ypoJTza&-PVc650$tlvM;g z4OyF+KhWluwv;MA2d`lU{f3qAjeWXyclwDNEwRYa4~3JHvhATYmL;s2MqSdf?Ny6^ zhVBRy6D)jtg%8&4fG`xVUov@Y#MAZBCLP~@YbH&}_$5lP%KXJGt5xbj`(sjiTI`h5 za88 zs&qH%_qt|es?8V)Jo|iV(sFqQ(;|BItm&0VSZ03Ynss|)Cd$C6VCtUZr&7h|NqO$b z?A)#XH!ezPY1n5Ul9U($JUJEowZ8{7#i9GmL;}|kIo`EumFbaQAM}0GwFG-fF5;KB zHERJn!P4%9&#gLZC`kBUnI&`N7*zZJG}Fx zA`Q#2d7!mp8lvhxFWGK4Zmd-3+|`E;hOX{=tCxlod)^AmIq?sS9*)juz#a!CPl>p` zYE`ESjZ0tueQo$Ak)ztK;f5j!#Bv!CZ~eG&(h^#j`uF*?JSJV-StgDTx0!F|swj40 z(lN)~N%9@V1K@$5;>LywjI;9|D>oW8z zN85L7<1d*&MKa|-G((6ACRHiTo#*reA}TjTSbUuXJg*h-Uv5saDm1$7nUI_Cq_a5HEa6}?`0ww> zI-WyKy`3|=F8ldUC$l4twy*!u?x;(BHK|?i?FOIu@Mve_3ngt?O3kpA+td%5^nuTf zyW&T0e)Q#%gT6F))!ORS*~dKAHaD5Pv;J~< zzme#{SCfq|R_X3~S~H~7fFI6co-+ELiL{TIZm-p;i??!7#a`*WgeL`0SY;;@T)q`> z?NT3z9sLPpn-|BSPisqj+19*J% zj+T_ci_BKt>OWRF!Ih<}?X|Yf?+Uzbw&whCOxH%iKF*`(g^Gp{UUNl_x@W8wg@0ew zQGJVuUuacTB+azFF4|i0?^jk%d;7{se{Rz<$?58ov|9yZEJ{os~2^iov-0yImPus?ORz~bv6=jPp~Xv!X)1RItH&3LMcGv@!;e^mA*{y1^Kfzro1`$1}b zG5@Mv(C{NZTPcKWC0necc|(jX1Gt_7@Agqb&}^DDzmgQ>c`7T`y1# zZK?_&&1~-C`4>J$@z~gL-*F-I&K@;1%SI?B=A_PqpBY1c894uH*Z$=&dlaj4!r#WK zOHHQJ2API$byq&saCjx$N3GzkDGuzMSDR0kTmFZfXkq*0m;lk*F|EtUY-K`6H z(dyvjQ*;v_^Ve4+#XR`=)dIJ}_gNhO&Yt|@>(+|;=XYQ}oUj>e@ynsi>G53ae=2Ws z3~oPER}3q92&CU?m$-S-lca9a>aSN~QJC|` z8Y1KqVCj>B_4LnUF*N<0PDV71oBs+nxsQ|UBuHIylsK77IZtFxd)~in1YG$i9bPs1 zHcxUZJ$``C$X3kER7~#e;G5^)JjX($ODfZ(IMN2EH;I}OU$i|B;6nqH=TF_fIX6~j zgu@+>iA(smwaD{9ZG%D|4Hc>n2KkE2FhGJQ%Ij~6oHO!(CjrTEBgF6AycCH)|}WmP(h|Aa)akYre|sA=W~Uy zjzKfMucD9NzQW%+)_$r(WB%Rx?DeYWinM$I2NNPlM*AIHe%vUN@cm(jsHwjcS~T#r zNc~ynr>DGHwFW~H>*ZSDVxTX}CIYPLB4r7(L9S9~KT0#Yp+Ojac>;GSj(<577LAcsPoW%I zJTrPxRyjOiwYP&JKlMzn<6#@7?Ao{amuWm04~?1n7f4BeQ;{da!BQSsKMHD>#%pez z67Ly$uXyi{4j(<9RQmGt&DPUugYQu>H?IFFE&Il}8)IFmNpmxujv`x1NK`1UROfaf zpPF$h%EBt)-<9^ZCALk~OikUJ!S`PENNg|n`0Kq}PD#AYqjDbMc1pq3!=1!(SR&{|uSGFGf zQ~<|A!%I{~x(qp4b`Gp^@#oArNeUy5ijr;CQ!Y1B#6W$wK6ib8*ei1yV3ieu8_~vI zT4hN2S8M)y5YZuGp*HSpFQBN=#6*LSgTB2;>~xHW*^P+tDw1 zvL+WQU`_TM@n?`Rq>g$15b?jqFw1i@tPA5E+F4z8P(HT9wWP2pRb zDwpMrrq30azo$w)-x$l$yrijSvZPsS+9%iG8=UWg;@J~Z-M;}?05F3!SfDP#iE_r0 zf^b}e9b!QJYcLl8yt)q6TY~|@u*eb!9TVoU0q-kGuIh4%>2j2EfFePT4wN%`I1Zo$ zVL_yG#DZ7=Du!VQFF7NIJq7+|9X=1KWyOg&}X$3TaOklh|o-nt-)0`sLn z_|_p}x?nR5D4kw&!-4g1*UTx5zj7{IB>;y-*i?SV<2eLms{UP=l z>g8h4*Jt3y#Wtq&V0bXu`u*TJfuDxA_L#0WTex93leLYX1=6xC;;&|WVJ|VW!F{wq ztrcs0fr_oeEZ4XlDUNakHa!9?o63Lhh&zyY3KQ!AI|2*^YcO9GfG33ehn3zhVNj)t zE@X{P8>8sDGjiRY5rgM<1^myPu>zo9dKld>E=f3!=@M8Tj&qQTa*Kz@S%CxBg)R{2 z0bc1V>W)_dP8R~i?E}b{02dQxFvTIBuRAjW5KRKGx(?B#Ks`kP{2FY!1YxuW8KgiR zseD%IFg<`V(v{lss+CSj{xv_N~&G6+C${ zS1RSdTKf~;IP_da=G zqj?l~?ZJPpz!1P6uFLptUOD*WPD2U1fxXdb$vu6w`)@FMq%wp65urq7`*(b{_Dp1r z0|&ry3v*1E+aFH$dsAKm_O0ZoW3}IPpv5!Pd5K^@#ILr8 z+hwZzd}}JoUFiSfGdoA?Ab*#rNX9gM7LW-lR?9E9OT;h?F7bLb?2kj8qH3z+5PG ziU5pDLy)zfG0ut!3xIcfU|>Sz^+&Ezu1G%NTr5tAZZy+FNRRpC&Z$Ayj$jPBS3F8j z9X_~n56N8;m<}@Dzr22YQTCbX1NV&jolizB8DAjFZAQn{$_+ z?V155tn@eD$9$zB#e82!`z^qNwVDkv49Sb{hO9_KK^GfM8u=|U3OoB`Zh;#&U6cfT 
zT8>i{g$CdoUTqm(SGqd5*1hXGUNM_J$5%Rwypkvf?r)~de9l<%u;eV=qAjGWC&2!d zA$<9HeDw{la)V+@5wa`iY_eB2ZeT7*7|J;@l@oh|2)A8bq5nXj!1c4xO+k$nHMO4? zP~R?E2{4jx%1P3X>lMbY@P^P-nQ0O@By7TlY~BPs`kCpDI;r z{L6U#2kC;^VZyd0Ikj5MAdc_0YjWt+Dix z(3e<^LhXGv<2$z4oB#^jugIt~NogYg4eI~O$2GFWrA~3(jcW0HE8vc3Yf3szy0d9r zllL$3*z{4Q`cM+s^=8rZ9^dC&s@vD@<7GSXmRqd#r=`meP4U)6Y#|9|YE71_;>;ob zZx$mzwmRf2(j;sUN6_fw=wtSa-rK0y&r13LNq3bvYakr!-PB_Zp zET^o@G$)>%lgEv-g%$Q~JsUYw<;pO?t(*F6GPz9gzVDvH{-IOMq`>z`>r;X!IpBfB zsKbY+1hbJm?{k)r`M;*g2FDFCTivdWOJu3_SLk!sr*#GoTN@v=B^FwC!g@mA2BAy> zzObc^7;W~*+@6cDa}_2dHRpZmcv5K^d6H&JhR%ffux(^m!i+Zy@Huh81Z}+zUsvgV8kGqbC*ZYdhP3Omo zBHB-EiZ7)vZrK~DDaJXt{2x^is_(WD*IsJxxx}1qc;VI{fEtM=UwgsrWtn;Rk#`zi zfT<^|5yep#8X&Z zIYiY>OrAyMRZgBYWkpRLb(c2gehiItF8HdUo- z8SSYino5o~sFO)as!^qpW*TLUDyq~^j&Ocu6d}%Wi3KN<1!#}v8xge%zal2=(Uk+9zw>e=&R#i={>0^&>!6lo6MFEhQ zY>j5t*G)A#MjLjmsmNNs*m*YabpqyC*mwpGmoRaJ*+v?|dUcl@|3@YUHr9YJiYDrd z-eqXp#v)eupixpJ)>BH8p~&)z^!*m{ydE7Ipp+wPcwxsK#%Hj4kk#9sg8J@k8)zi) zJk)^AaqLn}C9m}Icr8;j6pj|p%bj&8()?Xx8#2~3U+Sd>wb-!XZ1Bb0VMY>f01YmzaLPksa`C5;KNka1Kw+-$>!LH}uDJ?H%Hd8NBs4kh@`-?Vy$a36@Eg zVGb#@ahBUvm6*$U9p?5)i4|J5a!VE=$I$&RDM)b#8ZJj zuAsRAihQ!aqLt^mrZcaiK(!A4ESd$b{^yk6_IbFTu5LBE|6p+=)$CN^jjxjB%w?bS zNrY9WC7&okTjchI(sbohuNt~lks`$jx5&gVP^w#rm{j*9rOD4=Cb@`JdIyy$afwe> za$W7XGO0(+Dp`SoAeGS9q{NH~R!O1Mp-LB)$UTlnIH=Cu>LROn35-4wid%y^v$uZT zW+>U&%y$+ylD@qSJDa21Zkls9>9p)>2cyq?5+*d;6h?&qV^)KB!kv^AWqNgKRCLy3 zGQ+WqJsOLiqcD;;f_Y7gvw_>A4rD{qXvc@g;h@)=$FsY2k276!W35C6AnpjxC2d=p z+fX!-;7G?_RD(|$&z80JaHchI^qB5KM4tMHXgGyb|4IV<<{s;?XOj~F8eULlwaqEc zZ$X5Ti461?j!jW(jik{{Rz@?r`Rh1P*0{38rfqTWEaf_vGqX3qIR&dCz)RNQ z*2y1u>grJ8!Q+v*M_;T~Uvrugg=gHuTMLl2Cy> zJXJ!^^ha^ku}ls%l9g!UzobZofLyWOMg)_(F>Ont;N;vi-?S!%@+nwX8WlmDWV2S? z51Jc9(slmWLH!vjdi6^WKt~6unnF-5Y{|g8|42zNR2HpQjyce)>c+$QAW>qTLR}si zBu23w%y0&qS-i|Mnds37k#9whYUVV@uxZFWFiOZMACeydR_#F>1Ek3G$e;QQ5orig zB779aA~3>oHjcrXAq^#>@w5^nZd@#2?>3nsnkGbvbxnt&QrKU5=OBqu3|h6_Q%CbihN_t#}tMzEotg zeu7F$LkH6U3nV7=KqSvyiPA#>7N+muCut zq8=A{9pkRAJ$f9Z2xC&_h13f_L}O591q-oIk_>T*#$n$I_UfH6^?DBTXU-U^Tf?50 zI$Z^$izHN8)_Sd#6U#Jf1`Q%3|1z#TnC9hwbaunp_^pgPoON;;O~irO251wabazC! z)!UwpmCpgApbvW7&|HYm^ODZT1lZb1CP!;?$sJ!+|1LFxD!L0;LBi(t8qb1;fpS<_9L}kzvdvwITWrK>!0P2VPV3qfRbofwjI%W- z!+Hp~M@q(d3o64-E@yw=>E2kOYr!Ok9!ilDn?YK%owG_H@x1Kc9Pj){9nT8u;J0y$ zk9Q<+R}zEZjtl-dWU8N1KIfMT*)Qy!u*bH?VlTATzE zY$^RIwMo%Y^NoM37Fc8e{}(IF5f9l`7awYoPj*et6@_fWzDR20RGFB3-ulIEn|nA^ zL@mHFeO9lPY_odFp}TSEq7$v0+NFP^$ElyrZZmr9!9Yv3U^w>BPer625P6g=`H zKH_iCbWg{UU;Nc@!eSDNR8b*TKCx0wu48s46*{2?RK1~J8P`14L_S(KJn#cx)-yZm zB63HU6$AA>2lW&j|5j3Mg(N~}IYGugMnOd-BveF)T^`{@Fo9l6qC5WcWV-??DQHxJ zrBh=8YCna8wsUqG$X5C!W$4sU4#pHabxj|Kh47?f{{mI4#u`-P}TJpwyUxS2=VO4uD**jK_6FtMl`VWJm;v0E?G zU9NaNu62pU|J5<_q&2%me1z72^Or?6A~LFV8>yFr=7S<~V>B*;W}R^?H^W%RND#pW zig{L9pF~$F@^eT7R-0CA)OJVtVUPNCYQgnChh-G|<|C1#0SaP457863<2iGOEo5j` z=%OhixrN_CMg|6ALl+W|LPjOVC?Y|IX{J6LR9SOaDLgb!dE$hm)hE!xa!-+p_v3c! zB4)3mJ1J5hdV)U}fm9B}KL+-H1e6fO(iA-raRdb|{|8hiIZantJ1EqHnPVi|Lp8=T z8nEL$1IA-~mrtRgmI;LrGj>eUlRgNCAHCyWZ&*#zh!i$cbX}-r*fIh;=h$r-0cu$>F-(JQprm~F<{oUsMBG)4BLHsAW+Di2ei5f?#}zR)v4oF; z9w>5rEQnU3my21$Mi8@(#aSM&6kOtXFGrB~5lGP~kLT@@%FsfK) zd099>6CWJIiV2b)`zbtpmI1H;3l;emk4Y`^ah8sPm9XMdF8NM5rIg3Et^? 
z|8qZeL^~6h5OR5S8AvD|h)^qGZwp6cQ1?u7wPq4{f^?Upz>{Rzl8<1PlkqiQaJtblh)Ed9W-z?#U^E$ zh6i?0I%Sd<6i3QZEkOzqwnH1AvZSUqE#Bf#E73h0frXsKhVhj>sL7P|qmk;=VjSqC zgcuZ6#X@hvM(wnC0ToN*xQDDkOJ>tcwYgcVC={d0oO=~$^Y~@GWNq|FLv-|xn&?Wl z6d<)Bpb&Lu%a>>h1bXOcIJyTO;}?M7^*7a59?k_n@CAD@lUb4XAd#0KlJ#-g|7jEy z(;pM^a^p5g&<7nYsauobHhM-P$#Sl$N1MWDieW@BWP^PDk{qwdM$%=BK|>d!(J+`n zMY-9I1vxvUwOC?BAI8@c*H)n3C~ZhuobD!8fkl8aR9m7X0D~7fY7r!sLpj~_UYSa9 z?sODFB~HW9pik1KiqT=+GgNStb^O&OVOde3LMjRcD^$izW0^F4YJI6PPXFd&I!Y}! zxKkL#NzKGMOLikWL12Y~m&r<|%)jAX_{7ptE-SR>$y9Pk0Kuy=mz2YPXlNc*atl-94a zw=&7upXnGj@Hm~yrA&y1nPk%|$ohH{%74Ym7q7K!apj$K12@dqFsYht_r`(~0)4On zHV|X6+1qX_5*fL9Fq38?6tz{c5qyTC6euG?@wSWaTU?EWEC6%B54x;l)t;)vJj0TH zs5%@TOLz_lx6e}`w}WJm@|F>FCohJzx%86Kxu`fLYirwcm2xex|CBCd=45FZmSu51 zO6HW=6D>B}Qc01+bm(JdQiD|K!%SvmaF{D2_Bozn6DRDKd^fq|qqJwoD;q^}?qgp_ zr%rs>I3spqM7m8@QnpqiIvc^2C%CtG!lgq#88A!a6MfW>kr*T@w zVI><9d1F6WImLxIeUEE{f7(y;WyW;~7WLFX^R-eH_`m_sF2(c~v}9;MH$i;5g z*lhopl$h~W8yjt`7Fa(qhoL;Wpz&DOR!E*)y4Q6>>XvAL|Iu5L+J0CSNmLUcQd4=q zOK2Ori|19Z(i};hgjoNJib!c$*GDx3Gp{6Agk6JaTF>#Hqxf5CSyr~ zW+ diff --git a/tools/python/key_2_emb_formatter.py b/tools/python/key_2_emb_formatter.py deleted file mode 100644 index 617e7f99..00000000 --- a/tools/python/key_2_emb_formatter.py +++ /dev/null @@ -1,220 +0,0 @@ -# coding: UTF-8 - -# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import argparse
-import json
-import os
-import sys
-import numpy as np
-import tensorflow as tf
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--path', type=str, required=True, help='path of the root dir of saved file')
-parser.add_argument('--name', type=str, default="key_2_embedding", help='name of output file')
-parser.add_argument('--ddr', type=bool, default=False, help='if saved data was from ddr mode, default False')
-parser.add_argument('--step', type=int, default=0, help='the step when the data was saved, default 0')
-
-
-class Formatter:
-
-    def __init__(self, saved_file_path, out_file_name, is_ddr_mode, step):
-        self._device_dir_list = ["HashTable", "HBM"]
-        self._host_dir_list = ["HashTable", "DDR"]
-        self._device_emb_dir = "embedding"
-        self._host_emb_dir = "embedding_data"
-        self._device_hashmap_dir = "key_offset_map"
-        self._host_hashmap_dir = "embedding_hashmap"
-        self._attrib_suffix = ".attribute"
-        self._data_suffix = ".data"
-
-        self._saved_file_path = saved_file_path
-        self._out_file_name = out_file_name
-        self._sub_dirs = self._get_sub_dirs(step)
-        self._table_names = None
-        self._father_table_names = None
-        self._step = step
-
-        self._json_attrib_dtype = "data_type"
-        self._json_attrib_shape = "shape"
-        self._host_attrib_dtype = np.uint64
-        self._hashmap_dtype = np.uint64
-        self._raw_key_dtype = np.uint64
-        self._key_dtype = np.int64
-        self._raw_key_offset = np.iinfo(np.uint32).max
-        self._data_dtype = None
-
-        self._is_ddr_mode = is_ddr_mode
-
-    def process(self):
-        dev_dir = self._set_upper_dir_origin(self._sub_dirs[0], self._device_dir_list)
-
-        self._table_names = self._get_table_names(dev_dir)
-        dict_out = {}
-        for table_name in self._table_names:
-            combined_key = None
-            combined_emb = None
-            for sub_dir in self._sub_dirs:
-                dev_dir = self._set_upper_dir(sub_dir, ["HashTable", "HBM"], table_name)
-                emb_data = self._data_process(dev_dir)
-                key, offset = self._hashmap_process(dev_dir)
-                emb_data = emb_data[offset]
-                if combined_key is not None:
-                    combined_key = np.append(combined_key, key, axis=0)
-                else:
-                    combined_key = key
-                if combined_emb is not None:
-                    combined_emb = np.append(combined_emb, emb_data, axis=0)
-                else:
-                    combined_emb = emb_data
-            print(f"{table_name} has combined key {combined_key.shape} and combined emb {combined_emb.shape}")
-            transformed_data = dict(zip(combined_key[:], combined_emb[:]))
-            dict_out[table_name] = transformed_data
-        np.save("./" + self._out_file_name + ".npy", dict_out)
-
-    def fw_weight_process(self):
-        checkpoint_path = self._saved_file_path + "/model-0-" + str(self._step)
-        reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
-        var_to_shape_map = reader.get_variable_to_shape_map()
-        for key in var_to_shape_map:
-            if key == 'dense/fw_weight':
-                np.save('fw_weight.npy', reader.get_tensor(key))
-
-    def _data_process(self, dev_dir):
-        dev_emb_dir = os.path.join(dev_dir, self._device_emb_dir)
-        host_emb_dir = os.path.join(dev_dir, self._host_emb_dir)
-        data_file, attribute_file = self._get_file_names(dev_emb_dir)
-        dev_attribute = self._get_attribute(dev_emb_dir, attribute_file, is_json=True)
-        if not self._data_dtype:
-            self._data_dtype = dev_attribute.pop(self._json_attrib_dtype)
-
-        dev_data_shape = dev_attribute.pop(self._json_attrib_shape)
-        emb_data = self._get_data(dev_emb_dir, data_file, self._data_dtype, dev_data_shape)
-
-        if self._is_ddr_mode:
-            data_file, attribute_file = self._get_file_names(host_emb_dir)
-            host_attribute = self._get_attribute(host_emb_dir, attribute_file, is_json=False)
-            host_data_shape = [host_attribute[0], host_attribute[1]]
-            host_data = self._get_data(host_emb_dir, data_file, self._data_dtype, host_data_shape)
-            host_data = host_data[:, :dev_data_shape[1]]
-            emb_data = np.append(emb_data, host_data, axis=0)
-
-        return emb_data
-
-    def _hashmap_process(self, dev_dir):
-        dev_hashmap_dir = os.path.join(dev_dir, self._device_hashmap_dir)
-        host_hashmap_dir = os.path.join(dev_dir, self._host_hashmap_dir)
-        if self._is_ddr_mode:
-            data_file, attribute_file = self._get_file_names(host_hashmap_dir)
-        else:
-            data_file, attribute_file = self._get_file_names(dev_hashmap_dir)
-
-        attribute = self._get_attribute(dev_hashmap_dir, attribute_file, is_json=False)
-        data_shape = attribute[:2]
-        raw_hashmap = self._get_data(dev_hashmap_dir, data_file, self._hashmap_dtype, data_shape)
-        offset = raw_hashmap[:, -1]
-        raw_key = raw_hashmap[:, :2].astype(self._raw_key_dtype)
-        key = raw_key[:, 0] * self._raw_key_offset + raw_key[:, 1]
-        key = key.astype(self._key_dtype)
-
-        return key, offset
-
-    def _get_sub_dirs(self, step):
-        sub_dirs = []
-        for _, sub_dir, _ in os.walk(self._saved_file_path):
-            sub_dirs.append(sub_dir)
-
-        picked_sub_dirs = []
-        for sub_dir in sub_dirs[0]:
-            if int(sub_dir.split("-")[-1]) == step:
-                picked_sub_dirs.append(sub_dir)
-
-        if len(picked_sub_dirs) == 0:
-            raise FileExistsError("There is no sparse checkpoint for the given training step.")
-        return picked_sub_dirs
-
-    def _set_upper_dir(self, sub_dir, dir_list, table_name):
-        # copy so the caller's list is not mutated by the append below
-        dir_list_copy = list(dir_list)
-        dir_list_copy.append(table_name)
-        temp_dir = os.path.join(self._saved_file_path, sub_dir)
-        for directory in dir_list_copy:
-            temp_dir = os.path.join(temp_dir, directory)
-        father_table = []
-        for _, i, _ in os.walk(temp_dir):
-            father_table.append(i)
-
-        temp_dir = os.path.join(temp_dir, father_table[0][0])
-        return temp_dir
-
-    def _set_upper_dir_origin(self, sub_dir, dir_list):
-        temp_dir = os.path.join(self._saved_file_path, sub_dir)
-        for directory in dir_list:
-            temp_dir = os.path.join(temp_dir, directory)
-
-        return temp_dir
-
-    def _get_father_table_names(self, directory):
-        if directory:
-            table_names = []
-            for _, table_name, _ in os.walk(directory):
-                table_names.append(table_name)
-            return table_names[0]
-        else:
-            raise ValueError("directory is None, cannot search for table names")
-
-    def _get_table_names(self, directory):
-        if directory:
-            table_names = []
-            for _, table_name, _ in os.walk(directory):
-                table_names.append(table_name)
-            return table_names[0]
-        else:
-            raise ValueError("directory is None, cannot search for table names")
-
-    def _get_file_names(self, directory):
-        files = []
-        data_file = None
-        attribute_file = None
-        for _, _, file in os.walk(directory):
-            files.append(file)
-        for file in files[0]:
-            if file.find(self._data_suffix) != -1:
-                data_file = file
-            elif file.find(self._attrib_suffix) != -1:
-                attribute_file = file
-        return data_file, attribute_file
-
-    def _get_attribute(self, directory, file_name, is_json):
-        file_dir = os.path.join(directory, file_name)
-        if is_json:
-            with open(file_dir, "r") as fin:
-                attributes = json.load(fin)
-            return attributes
-        else:
-            attributes = np.fromfile(file_dir, self._host_attrib_dtype)
-            return attributes
-
-    def _get_data(self, directory, file_name, dtype, shape):
-        file_dir = os.path.join(directory, file_name)
-        data = np.fromfile(file_dir, dtype=dtype)
-        data = data.reshape(shape)
-        return data
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    formatter = Formatter(saved_file_path=args.path, out_file_name=args.name, is_ddr_mode=args.ddr, step=args.step)
-    formatter.process()
diff --git a/tools/python/optimizer_process.py b/tools/python/optimizer_process.py
deleted file mode 100644
index 8a658e29..00000000
--- a/tools/python/optimizer_process.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# coding: UTF-8
-
-# Copyright (C) 2023. Huawei Technologies Co., Ltd. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import os
-import numpy as np
-import json
-from enum import Enum
-
-# each card (rank) processes its own data
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--path', type=str, required=True, help='path of the model file to be converted')
-parser.add_argument('--step', type=int, required=True)
-
-sparse_file_prefix = "sparse-model.ckpt-"
-optimizer_prefix = "Optimizer"
-data_suffix = ".data"
-attribute_suffix = ".attribute"
-
-
-class DataAttr(Enum):
-    SHAPE = "shape"
-    DATATYPE = "data_type"
-
-
-def get_optimizer_name(sparse_file_path):
-    optimizer_list = []
-    for folder_name in os.listdir(sparse_file_path):
-        optimizer_list.append(folder_name)
-    return optimizer_list
-
-
-def get_table_list(table_upper_path):
-    table_list = []
-    for folder_name in os.listdir(table_upper_path):
-        table_list.append(folder_name + "/table")
-    return table_list
-
-
-def get_optimizer_param_name(table_path):
-    param_list = []
-    for folder_name in os.listdir(table_path):
-        param_list.append(folder_name)
-    return param_list
-
-
-def get_optimizer_data():
-    pass
-
-
-def get_attribute_and_data_file(table_path):
-    if not os.path.exists(table_path):
-        raise FileNotFoundError(f"the input table path {table_path} does not exist.")
-
-    attribute_file_list = []
-    data_file_list = []
-    for file_name in os.listdir(table_path):
-        if file_name.endswith(attribute_suffix):
-            attribute_file_list.append(file_name)
-        if file_name.endswith(data_suffix):
-            data_file_list.append(file_name)
-    if len(attribute_file_list) != 1:
-        raise AssertionError(f"under the table path {table_path}, there must be only one attribute file. "
-                             f"In fact, {len(attribute_file_list)} attribute file(s) exist.")
-    if len(data_file_list) != 1:
-        raise AssertionError(f"under the table path {table_path}, there must be only one data file. "
-                             f"In fact, {len(data_file_list)} data file(s) exist.")
" - f"In fact, {len(data_file_list)} data file exists.") - attribute_file = os.path.join(table_path, attribute_file_list[0]) - data_file = os.path.join(table_path, data_file_list[0]) - return attribute_file, data_file - - -def process(path, step): - save_dict = {} - sparse_file_name = sparse_file_prefix + str(step) - sparse_file_path = os.path.join(path, sparse_file_name,optimizer_prefix) - optimizer_list = get_optimizer_name(sparse_file_path) - for optimizer in optimizer_list: - table_upper_path = os.path.join(sparse_file_path, optimizer, "HBM") - table_list = get_table_list(table_upper_path) - - for table in table_list: - table_path = os.path.join(table_upper_path, table) - optimizer_param_list = get_optimizer_param_name(table_path) - optimizer_dict = {} - for param in optimizer_param_list: - data_path = os.path.join(table_path, param) - attribute_data_dir, target_data_dir = get_attribute_and_data_file(data_path) - with open(attribute_data_dir, "r") as fin: - optimizer_attributes = json.load(fin) - with open(target_data_dir, "r") as fin: - optimizer_data = np.fromfile(target_data_dir, - dtype=optimizer_attributes.pop(DataAttr.DATATYPE.value)) - data_shape = optimizer_attributes.pop(DataAttr.SHAPE.value) - optimizer_data = optimizer_data.reshape(data_shape) - optimizer_dict[param] = optimizer_data - save_dict[table] = optimizer_dict - np.save(path+"/optimizer_dict.npy", save_dict) - - -if __name__ == "__main__": - args = parser.parse_args() - process(args.path, args.step) \ No newline at end of file diff --git a/tools/python/readme.md b/tools/python/readme.md deleted file mode 100644 index 3f5e86df..00000000 --- a/tools/python/readme.md +++ /dev/null @@ -1,110 +0,0 @@ -# 模型数据转换工具(key-value)使用说明 - -### 1. 美团1207模型ckpt保存路径说明 - -#### 1.1 训练时1207模型保存参数设置:(estimator模式) - -![img](./images/clip_image002.jpg) - -![img](./images/clip_image004.jpg) - -#### 1.2 训练后模型保存路径目录展示如下: - -![img](./images/clip_image006.jpg) - -#### 1.3 下面来看单个文件夹下存储的内容,以check_ran0为例: - -![img](./images/clip_image008.jpg) - -我们的模型数据转换工具就是要对该**sparse****文件夹中的数据进行转换**,转换成key-value形式,保存格式是npy文件,详情参考3. 输出文件格式说明。 - -下面介绍**如何使用该模型数据转换工具**。 - - - -### 2. 使用工具demo说明: - -**该转换工具model_data_to_key_value.py一共需要4个参数,path、name、ddr、step** - - - -| **参数名** | **数据类型** | **必选** | **默认值** | **描述** | -| ---------- | ------------ | -------- | ---------- | ---------------------------------- | -| --path | String | 是 | | 保存模型embedding数据的根路径 | -| --name | String | 否 | | 输出文件的名称,最终输出.npy | -| --ddr | Bool | 否 | False | 保存数据是否开启ddr模式 | -| --step | Int | 否 | 0 | 保存数据所属训练步数 | - - - -#### 2.1 参数确定: - -下面是一个选择参数的示例。 - -##### **1)** path路径确定 - -我们选择1207保存下来的0卡模型文件夹下的sparse部分数据进行转换,因此路径选到目录下:/home/lff/model/check_rank0/ - -![img](./images/clip_image010.jpg) - -**--path = /home/lff/model/check_rank0** - -(多卡的目录需要转换多次,一次只能转换一张卡下面sparse的数据) - - - -##### 2) name参数: 输出文件的名字,格式为.npy; - -例如:sparse_0,经过转换后的sparse数据就保存在当前目录下的sparse_0.npy文件中; - -**--name = sparse_0** - -##### 3) ddr参数:美团模型未开启ddr模式,因此选择False - -**--ddr = False** - -##### 4)step参数:在上面1207模型存储的目录下面,存了第0步的模型。 - -**--step=0** - - - -![img](./images/clip_image012.jpg) - -#### **2.2** **执行工具命令** - -python3 model_data_to_key_value.py --path=/home/lff/model/check_rank0 --name=sparse_0 --ddr=False --step=0 - -#### **2.3** **执行结果展示** - - - -![img](./images/clip_image014.jpg) - - - -### 3. 

From 53edea92b2ddab65ed4cbe375395d63aa4e66868 Mon Sep 17 00:00:00 2001
From: gegaojian <14206008+gegaojian2@user.noreply.gitee.com>
Date: Mon, 25 Mar 2024 14:27:53 +0800
Subject: [PATCH 007/302] =?UTF-8?q?dense=E5=B1=82=E5=8F=8D=E5=90=91?=
 =?UTF-8?q?=E9=87=8D=E5=A4=8D=E8=AE=A1=E7=AE=97=E4=BF=AE=E5=A4=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/dlrm/model/main_mxrec.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py
index dd3e8d2d..2d0ee78e 100644
--- a/examples/dlrm/model/main_mxrec.py
+++ b/examples/dlrm/model/main_mxrec.py
@@ -24,6 +24,7 @@ import tensorflow as tf
 from sklearn.metrics import roc_auc_score
 import numpy as np
 
+from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS
 from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func
 from mx_rec.core.asc.manager import start_asc_pipeline
 from mx_rec.core.embedding import create_table, sparse_lookup
@@ -323,15 +324,20 @@ if __name__ == "__main__":
                                  is_train=False,
                                  modify_graph=MODIFY_GRAPH_FLAG)
     dense_variables, sparse_variables = get_dense_and_sparse_variable()
-
+    trainable_variables = []
+    trainable_variables.extend(dense_variables)
+    if use_dynamic_expansion:
+        trainable_variables.append(tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)[0])
+    else:
+        trainable_variables.extend(sparse_variables)
     rank_size = mxrec_util.communication.hccl_ops.get_rank_size()
     train_ops = []
     # multi task training
     for loss, (dense_optimizer, sparse_optimizer) in zip([train_model["loss"]], optimizer_list):
         # do dense optimization
-        grads = dense_optimizer.compute_gradients(loss, var_list=dense_variables)
+        grads = dense_optimizer.compute_gradients(loss, var_list=trainable_variables)
         avg_grads = []
-        for grad, var in grads:
+        for grad, var in grads[:-1]:
             if rank_size > 1:
                 grad = hccl_ops.allreduce(grad, "sum") if grad is not None else None
             if grad is not None:
@@ -340,17 +346,14 @@ if __name__ == "__main__":
         train_ops.append(dense_optimizer.apply_gradients(avg_grads))
 
         if use_dynamic_expansion:
-            from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS
-
             train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS)
-            train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)
             # do sparse optimization by addr
-            sparse_grads = sparse_optimizer.compute_gradients(loss, train_emb_list)  # local_embedding
+            sparse_grads = list(grads[-1])  # local_embedding
             grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)]
             train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
         else:
             # do sparse optimization
-            sparse_grads = sparse_optimizer.compute_gradients(loss, sparse_variables)
+            sparse_grads = list(grads[-1])
             print("sparse_grads_tensor:", sparse_grads)
             grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)]
             train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
-- 
Gitee
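For context on the patch above: the original code called compute_gradients twice, once for the dense variables and once for the sparse ones, so the backward graph was built and run twice. The fix differentiates a single combined variable list once and splits the resulting (gradient, variable) pairs via grads[:-1] / grads[-1]. A minimal TF1-style sketch of that pattern, using stand-in variables rather than the mxRec tables:

    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()

    x = tf.compat.v1.placeholder(tf.float32, [None, 4])
    dense_w = tf.Variable(tf.ones([4, 1]), name="dense_w")        # stand-in for dense_variables
    sparse_emb = tf.Variable(tf.ones([8, 4]), name="sparse_emb")  # stand-in for a sparse table
    loss = tf.reduce_sum(tf.matmul(x * tf.reduce_mean(sparse_emb, axis=0), dense_w))

    opt = tf.compat.v1.train.GradientDescentOptimizer(0.01)

    # one backward pass over dense + sparse instead of two compute_gradients calls
    grads = opt.compute_gradients(loss, var_list=[dense_w, sparse_emb])

    dense_grads, sparse_grads = grads[:-1], [grads[-1]]  # same split as the patch
    train_op = tf.group(opt.apply_gradients(dense_grads),
                        opt.apply_gradients(sparse_grads))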
From 0c890e15fb0c7f989a2a7b4bdc78382c45deb198 Mon Sep 17 00:00:00 2001
From: yxy1684 <2270320041@qq.com>
Date: Fri, 29 Mar 2024 06:15:34 +0000
Subject: [PATCH 008/302] =?UTF-8?q?!66=20=E6=B7=BB=E5=8A=A0=E9=80=9A?=
 =?UTF-8?q?=E4=BF=A1=E7=9F=A9=E9=98=B5excel=20*=20=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E9=80=9A=E4=BF=A1=E7=9F=A9=E9=98=B5excel=20*=20=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=E9=80=9A=E4=BF=A1=E7=9F=A9=E9=98=B5excel?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...32\344\277\241\347\237\251\351\230\265.xlsx" | Bin 0 -> 31412 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 "docs/MindX 6.0.RC1 \351\200\232\344\277\241\347\237\251\351\230\265.xlsx"

diff --git "a/docs/MindX 6.0.RC1 \351\200\232\344\277\241\347\237\251\351\230\265.xlsx" "b/docs/MindX 6.0.RC1 \351\200\232\344\277\241\347\237\251\351\230\265.xlsx"
new file mode 100644
index 0000000000000000000000000000000000000000..5224de2b2f5ed98b0b0d0b0c15e65ebf98ad993e
GIT binary patch
literal 31412
[base85-encoded binary contents of the .xlsx omitted; not human-readable]
zI-9&nD@Zl$FLnL|fNrFT_eNKewv2>29?EPO9{oL8s`e~3`A+gEYss=B({krr>y@WX zvICaQ`vZ{iag&?WwoE2qy?eUnV07Pf1vHqgj}dO&5sq-2q^Pko(sMC0+FHNypVkK> z(k;a`ap8O$H}zxb6+wjdSpI>03wuP^g%jl;P2LtGNV+xjBdeDw?>%f;Yv7!gb)(|? zW>_37YK9Qckd|~V0KPb~?hQPA`?75bNy*lW((!afEpDc~XwEn@X--VI^;Hddk0WH{ zfTjrCD8`Za<8!mhA93FVSSz3o4ycv7hpJPc5Hz$zm&D^MU$C=p-?`eZJ~+;&Ib#ZU~Zmfg+7kVv=Puzg<=jaay-E!BMPnEr0B!jk!uAbTVe z^C#%#ANI8~@*P6}>aL*vtpjWwA;V5Kxm={ZlPhTbOie*ntK+0sLW1D%jFx#DkK<{2l8^ zQoXIiHo0%5($vXxKLKUK+J(jFv4wu(hK8`~VYxpMi_SO$W&|8f~ zyKz>m7MWFI_R9GwxqCgXl+rWHOF-U4ee_M813;sD`MchK>^##NjX| z6T98zeOKJLvU-(Ky@kk-6l)O_Jy!L>0FEoGt{(um6n300xQB;UP3gd(Jgko zt}Lj=swXCV!Cd&lmNK|@+Y_})u|=1Whj)U0dP?&?@vzo%tiJrCkZZml`c~Y#$l{bT z4Kh5ruV{c#SQuSfmRfQir(+V)cNAfCfL=~Z9^Kgncxp;mPnNHMmHf!sQGp3;kKufR zsNvX)VbA-^gXy*D52zx3B0j(TToz7Ztcg#GdEXS+SX;r8%XKIir*yltSqix2+Adh1 z;HqsDap6u#eYo)&s4Y5>FTmjyE7+_$h(F@7%JmY-pGs4{cTqtVF8yq6iM7^@u3$-9 zw_Z*kD%joaa)4Yo-oZy0mWn8J|FTwPFWIK7U-;P?MY&bqFieKyazcg4J*r3Uks$z}N6KT3@muXE7SET+mh(EuBh@@q3~*~hezaFOgKTV8ZipsQ-2sC}h4HeN30$fs` z`9xk(0=S#)RP6eXrN4e4QBe}?UZM~4ZiI2<4Q~2nZ(q1Kydsp6BQe6ku&xo8;c63$ z9R={onQ28klY)Wa8*y?K9yoHB4=w)knh9A-KR8)EVqrtS)sZo!M_-vV;#t^)uUNuI)7hO$n_OhxYheMRiy1eT{@{Xa?7MTF0*30R62 zxF`zTHt#=5|Fvu0Z$Sr-v`0X&A;GLPt_UP7WDI+t#qD_L;53ETtyH_Pb97s(r_x@X z<2+|z+-HOTb@ypeq@(ySc~+fsRD+8)D%y)w22y0}Xc7l2@@;blU3LHs`+nm_w$OK* zT*ja4m_^x=+^pfBGN$&ak&+nX@diw@h>8=~WB7I4t0{L~4eVX+Y(+1O3nlEyNUvZf zex=AC-*Y1a+%KgLaaj9F(dQPmY4tmN1{3OfxnpQynv6VZQwtXm8?4WI-YkNoH0USM zP&EG^eE&`X_&49cJOCiy-XD*QTny9CS>?mW74X-&r4gG6WaJcI4QR_-F8k$u#(@!# z7hh)K3iVE&MCS00fZ^6eM##xo>KSYMb$V6~*=_R5-aNu$B>*YZt=Y=Yp++sq<2YPX zm4Yx?^4m*mH%yoT>}OGC>?$Z)@+mSI#f@|5XEh}vu6|ruz8sp;I3Yu;%{1& zI#-BE`k26TZqPq2c8Gt+*gR4UkF+@nGigH%F}K!B^cJgq>vvsx3lci)eFBm5e=+_$ zs2qfG$l63XhC0 z*EJ+(HMWMX^;|}E9Fx5Gv}+Rq{02@+em3KRu%Y(r!|3mBFh4G)1h2l(Uq&=5sbC`M z7DbDio>EXhf)+ZxI&A$8VpJn_5ybyrjQ;3;nA_bZ2k z6&y)!%=z1^QgFK}GhbePZ{HnXw_46_kH-jPW@#Ecxy!up%~RQz{OBZ!LBeVUW-#e5 zoTI@$#wnIT@h9^8j~RSB;C;2oTE zVs}tK?)zuxI2B5Qc&{3Rg&-7J&Cdb{HX(+X`IN5U`RQTSiA*3ojni_X&XK3y87w3T?e##a5{-*ORcL<$69V#d~1@Lo;rO%73#~FP&2du}tgi|l) zIG5gF4#}OoGI)y(R=&OLhOgFp9m)V1mA*fE`a5Ii&kFgM=l-{-*FWjTl2opH1gMib zpnfPHb@Lw-Ptmf!mGmXH&kB+aN%ScA4lpJ(rHJw|!NUnYSYP4_ykm;drOBw?^7z7) zHW&%YYe#i{{9R3B6K)}WL{4R6?7DDtU-b`B}wAn8nEVMqzeQS zJE^;yt+VAvGy~TbB0{0ZM$nnJO(Un8F-02vkhqY#IoExC2F;VzxIfC(uRkbrU!|2W zM)|$_QgYvDxw5*v$n#I1V^=XAs~#wec(}(%9X$zqI~U-YCWacG4ra~<4}Ql`l7clM z8(Pm%fM2N8^lIWsR?w;|ikug{92wndq5P}TvX>U7J$L8$bdK9F)2LGk`fg)Z=69`Q zf^IS|6s!xOB`21+_Nz!L5YkWO8!bEigl_TlD-hl}$m1Pl88e8waostwd+9>AMHCZr z7c0^EY!R6uY8f3s^3J7PT1aJ^r5}^6D=G$jKnA|;PMHG~013WjKeu71P3+WM8$b1w zQkf1H?oD4*Xmm=#Nh`#`C2SeAjM+I^pZtqb^|SOvAQv4Ekbbk3tHsD62m7G~st zI13Y8J8E6QcsjG|Z@4U8_qj4gSL}?;qyaMtV#6uAZ+>~oA)`%LlJrN_k{=nNsR-2h zA?Gz9nUkpB;T#@sSOmPY`~G#-Dx*|e0koI3#338B720_rz}$LsI)0am>;^7tGOB><+-45?n%^vRMd!I30WlU40$8QW;YzvNRIyH>dHo8hh^G zqoF*mXZ=}BL)tl!p<~nL%0`UI4TOpUI0V-38Lh{|0zq{R?EYZjU}V5g3p+5dKWP3D z;LnfSgWvFYJy^X4rvLU3Kr03HBOYiTD-HOI=J#Xvzbbod!UO3;Wp8l)ZaDuhx`)z_ z*TY?b+285@Jv#hTMxX}z1NCEbr~g3xPjelVbx@=C32Wr_f41ZS>&XNL#R9b>pRjt( z|BUryRf1xHnr2T}?G}H=dNR~Nu|PKff5Q6w4=w#ADNq2=WalS<8(=EglQ;F3ct8O_ zlWd;=x_LkWe#-2bd4^*H==0in~Qfm4;*?*XvpbUc^YxaZ_|JdGt9CG%* z8vl?W`glEX8zT6-@qbGf{il{e6Gk5>AKPCBLJ5*S`oEg+5R(3QJv8CH>>s?Z|3&`b zu0384 Date: Tue, 2 Apr 2024 16:53:23 +0800 Subject: [PATCH 009/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=BA=90=E7=A0=81?= =?UTF-8?q?=E7=BC=96=E8=AF=91=E5=AE=89=E8=A3=85mxRec=E7=9A=84README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 
diff --git a/README.md b/README.md
index 4a2c310b..3efbe7d8 100644
--- a/README.md
+++ b/README.md
@@ -58,23 +58,17 @@ bash run.sh
 - CMake 3.20.6
 
 开源依赖:
-- pybind11 v2.10.3
-- securec
-- openmpi 4.1.1: 请参考软件文档在编译环境完成安装
+- [pybind11 v2.10.3](https://github.com/pybind/pybind11/archive/refs/tags/v2.10.3.zip)
+- [securec](https://github.com/huaweicloud/huaweicloud-sdk-c-obs/archive/refs/tags/v3.23.9.zip)
+- [openmpi 4.1.5](https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz): 请参考软件文档在编译环境完成安装
 - tensorflow 1.15/2.6.5:根据实际需求选择对应版本
 
-pybind11的压缩包放在与MxRec代码同级的opensource/opensource目录下,如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource/opensource目录。然后将pybind11的压缩包放在opensource/opensource目录下。解压压缩包,并且将解压之后的压缩包改名为pybind11。
-
-securec是华为开源的安全函数库。下载后:
-1. 将platform下的eSDK_LogAPI_V2.1.10文件夹删除
-2. 将platform下的huaweisecurec改名为securec
-3. 在securec文件夹下,有src、lib和include三个文件夹,删除lib文件夹下的所有文件
-4. 将platform文件夹放到MxRec代码目录下
+将pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。
 
 为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法:
-- build/build.sh:执行脚本完成tf1和tf2版本whl包的构建和打包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。
-- build/build_tf1_with_opensource.sh:执行脚本完成tf1版本whl包的构建,构建成功后,whl包在tf1_whl子目录下。执行脚本前,创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。
-- build/build_tf2_with_opensource.sh:执行脚本完成tf2版本whl包的构建,构建成功后,whl包在tf2_whl子目录下。执行脚本前,创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。
+- build/build.sh:执行脚本完成tf1和tf2版本whl包的构建和打包。执行脚本前,请参考build/build_tf1_with_opensource.sh、build/build_tf2_with_opensource.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。
+- build/build_tf1_with_opensource.sh:执行脚本完成tf1版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。
+- build/build_tf2_with_opensource.sh:执行脚本完成tf2版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。
 
 如需使用动态扩容功能,进入“./cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。
 ```shell
--
Gitee

From d3c5f66b0e94533df66baaae3da51f9fbde8c1b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=95=E9=9C=96?=
Date: Wed, 3 Apr 2024 10:23:31 +0800
Subject: [PATCH 010/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=BA=90=E7=A0=81?=
 =?UTF-8?q?=E7=BC=96=E8=AF=91=E5=AE=89=E8=A3=85mxRec=E7=9A=84README?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3efbe7d8..fccc0244 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ bash run.sh
 - [openmpi 4.1.5](https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz): 请参考软件文档在编译环境完成安装
 - tensorflow 1.15/2.6.5:根据实际需求选择对应版本
 
-将pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。
+将pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,并且将其分别更名为pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。
 
 为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法:
@@ -75,6 +75,46 @@ bash run.sh
 ```
 
+## 测试用例
+
+### Python侧测试用例
+
+运行Python测试用例所需依赖:
+
+- pytest 7.1.1
+- pytest-cov 4.1.0
+- pytest-html
+
+如需使用python测试用例,需要先安装上述依赖以及能够在tf1环境下进行源码编译,然后进入tests目录中。参考以下命令执行python侧测试用例:
+```shell
+bash run_python_dt.sh
+```
+
+### C++侧测试用例
+
+运行C++侧测试用例所需依赖:
+
+- [googletest 1.8.1](https://github.com/google/googletest/archive/refs/tags/release-1.8.1.zip)
+- [emock 0.9.0](https://github.com/ez8-co/emock/archive/refs/tags/v0.9.0.zip)
+- [pybind11 v2.10.3](https://github.com/pybind/pybind11/archive/refs/tags/v2.10.3.zip)
+- [securec](https://github.com/huaweicloud/huaweicloud-sdk-c-obs/archive/refs/tags/v3.23.9.zip)
+
+将googletest、emock、pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,并且将其分别更名为googletest-release-1.8.1.zip、
+emock-0.9.0.zip、pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录,
+然后将前述几个压缩包放在opensource目录下。
+
+如需使用C++测试用例,需要按照上述描述准备需要的依赖,准备好之后,进入src目录中。参考以下命令执行C++测试用例:
+
+tf1环境下使用如下命令:
+```shell
+bash test_ut.sh tf1
+```
+
+tf2环境下使用如下命令:
+```shell
+bash test_ut.sh tf2
+```
+
 ## 使用指导
 
 mxRec所支持的使用环境、功能特性、API接口与使用样例请参考昇腾开源社区MindX SDK产品文档。
--
Gitee

From 0b8faaac6b4c30fc46c232db67be5780e98fe72d Mon Sep 17 00:00:00 2001
From: yangzhen_BIG
Date: Sun, 7 Apr 2024 01:11:54 +0000
Subject: [PATCH 011/302] =?UTF-8?q?!73=20=E4=BF=AE=E5=A4=8Dint=E7=B1=BB?=
 =?UTF-8?q?=E5=9E=8B=E5=8F=82=E6=95=B0=E6=A0=A1=E9=AA=8C=20*=20=E4=BF=AE?=
 =?UTF-8?q?=E5=A4=8Dint=E5=8F=82=E6=95=B0=E6=A0=A1=E9=AA=8C=20*=20?=
 =?UTF-8?q?=E4=BF=AE=E5=A4=8Dint=E7=B1=BB=E5=9E=8B=E5=8F=82=E6=95=B0?=
 =?UTF-8?q?=E6=A0=A1=E9=AA=8C?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/validator/validator.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mx_rec/validator/validator.py b/mx_rec/validator/validator.py
index c9abde87..013fe565 100644
--- a/mx_rec/validator/validator.py
+++ b/mx_rec/validator/validator.py
@@ -437,7 +437,14 @@ class IntValidator(NumValidator):
     def __init__(self, name: str, value: int, min_value: int = None, max_value: int = None,
                  invalid_options: List = None, constrained_options: List = None, msg: str = ""):
         super(IntValidator, self).__init__(name, value, min_value, max_value, invalid_options,
                                            constrained_options, msg)
-        self.register_checker(lambda: isinstance(self.value, int), msg if msg else f"type of '{name}' is not int")
+
+        def check_type():
+            if isinstance(self.value, bool):
+                # bool is subclass of int
+                return False
+            return isinstance(self.value, int)
+
+        self.register_checker(check_type, msg if msg else f"type of '{name}' is not int")
 
 
 class OptionalIntValidator(IntValidator):
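
Reviewer's note: the fix in PATCH 011/302 works around a Python quirk — `bool` is a subclass of `int`, so the old one-line `isinstance(self.value, int)` check silently accepted `True`/`False` where an integer parameter was required. A minimal, self-contained sketch of the behaviour the patched `check_type()` enforces (`is_strict_int` is an invented name for illustration, not code from the repository):

```python
def is_strict_int(value) -> bool:
    """Mirror the patched check: reject bool first, then accept int."""
    if isinstance(value, bool):  # bool subclasses int, so test it first
        return False
    return isinstance(value, int)

assert isinstance(True, int)       # the surprising built-in behaviour
assert is_strict_int(3)            # plain ints still validate
assert not is_strict_int(True)     # bools are now rejected
```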
--
Gitee

From de72afa620e5abe701f1cdf4fefa5d5811f5b2ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Sun, 7 Apr 2024 11:49:40 +0800
Subject: [PATCH 012/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91test=20first=20time?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/core/hybrid_mgmt/hybrid_mgmt.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
index 894dc230..d5563ce4 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
@@ -663,6 +663,7 @@ void HybridMgmt::EvalTask(TaskType type)
 /// \param channelId 通道索引(训练/推理)
 /// \param batchId 已处理的batch数
 /// \return
+// lqklqk
 bool HybridMgmt::ParseKeysHBM(int channelId, int& batchId)
 {
     LOG_INFO(MGMT + "nBatch:{} channelId:{} batchId:{}, ParseKeys with HBM mode start.",
--
Gitee

From fb3c55c3a417188c6d50b38c6031784b63e8aaed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Mon, 8 Apr 2024 11:26:33 +0800
Subject: [PATCH 013/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?=
 =?UTF-8?q?=E9=87=8Dcpp=E6=B5=8B=E6=94=B9=E5=8A=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/core/asc/manager.py                    |  6 +++++-
 mx_rec/optimizers/adagrad.py                  | 10 ++++++++++
 mx_rec/optimizers/ftrl.py                     |  5 +++++
 mx_rec/optimizers/gradient_descent.py         |  5 +++++
 mx_rec/optimizers/gradient_descent_by_addr.py |  5 +++++
 mx_rec/optimizers/lazy_adam.py                |  5 +++++
 mx_rec/optimizers/lazy_adam_by_addr.py        |  5 +++++
 src/core/hybrid_mgmt/hybrid_mgmt.cpp          |  7 ++-----
 src/core/key_process/key_process.cpp          |  3 +--
 src/core/utils/common.cpp                     |  1 +
 src/core/utils/common.h                       |  8 +++++---
 src/core/utils/config.cpp                     | 15 +--------------
 src/core/utils/config.h                       |  7 -------
 src/pybind/module_main.cpp                    |  2 ++
 src/tests/utils/config_test.cpp               |  4 ----
 15 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py
index 2829ab98..4d822b37 100644
--- a/mx_rec/core/asc/manager.py
+++ b/mx_rec/core/asc/manager.py
@@ -18,7 +18,7 @@
 import tensorflow as tf
 
 from mxrec_pybind import InitializeInfo, ConstantInitializerInfo, NormalInitializerInfo, EmbInfo, EmbInfoParams, \
-    ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_HOT, USE_DYNAMIC_EXPANSION
+    ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_HOT, USE_DYNAMIC_EXPANSION, USE_SUM_SAME_ID_GRADIENTS
 from mx_rec.util.communication.hccl_ops import get_rank_id, get_device_id, get_rank_size
 from mx_rec.util.initialize import ConfigInitializer
@@ -205,6 +205,10 @@ def initialize_emb_cache(table_info_list, threshold_list):
     if ConfigInitializer.get_instance().use_dynamic_expansion:
         option = option | USE_DYNAMIC_EXPANSION
 
+    optimizer = ConfigInitializer.get_instance().optimizer_config.optimizer_instance
+    if optimizer.derivative == 1:
+        option = option | USE_SUM_SAME_ID_GRADIENTS
+
     # [train_steps, eval_steps, save_steps] pass step information to HybridMgmt for data process loop
     rank_info = RankInfo(rank_id, device_id, rank_size, option, [train_steps, eval_steps, save_steps])
 
diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py
index d99be3b3..a5fa7975 100644
--- a/mx_rec/optimizers/adagrad.py
+++ b/mx_rec/optimizers/adagrad.py
@@ -76,6 +76,16 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer):
                                                 initial_accumulator_value=initial_accumulator_value,
                                                 use_locking=use_locking,
                                                 name=self.unique_name)
+        self._slot_num = 1
+        self._derivative = 2
+
+    @property
+    def slot_num(self):
+        return self._slot_num
+
+    @property
+    def derivative(self):
+        return self._derivative
 
     def initialize_slots(self, var, table_instance):
         # Create slots for the first and second moments.
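
Reviewer's note: the pieces of PATCH 013/302 meet in the option bitmask — manager.py composes it on the Python side and the C++ `RankInfo` constructor unpacks each flag with a bitwise AND. A small sketch of the round trip, assuming the flag values mirror `HybridOption` in src/core/utils/common.h after this patch (`build_option` is an invented helper; note this commit still gates on `derivative == 1`, and PATCH 014 corrects the gate to `== 2`):

```python
USE_STATIC = 0x0001
USE_HOT = 0x0001 << 1
USE_DYNAMIC_EXPANSION = 0x0001 << 2
USE_SUM_SAME_ID_GRADIENTS = 0x0001 << 3

def build_option(use_static: bool, use_hot: bool,
                 use_dynamic_expansion: bool, derivative: int) -> int:
    option = 0
    if use_static:
        option |= USE_STATIC
    if use_hot:
        option |= USE_HOT
    if use_dynamic_expansion:
        option |= USE_DYNAMIC_EXPANSION
    if derivative == 1:  # as committed here; changed to == 2 in PATCH 014
        option |= USE_SUM_SAME_ID_GRADIENTS
    return option

# The C++ side recovers each flag the same way RankInfo does, e.g.
# useSumSameIdGradients = option & USE_SUM_SAME_ID_GRADIENTS.
assert build_option(True, False, False, 1) == 0b1001
```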
diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py index 5c68b929..d6ddb093 100644 --- a/mx_rec/optimizers/ftrl.py +++ b/mx_rec/optimizers/ftrl.py @@ -80,11 +80,16 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): l2_shrinkage_regularization_strength=kwargs.get("l2_shrinkage_regularization_strength", 0.0) ) self._slot_num = 2 + self._derivative = 2 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): val = constant_op.constant( self._initial_accumulator_value, dtype=var.dtype, shape=var.get_shape()) diff --git a/mx_rec/optimizers/gradient_descent.py b/mx_rec/optimizers/gradient_descent.py index 6881d6ad..2ba72789 100644 --- a/mx_rec/optimizers/gradient_descent.py +++ b/mx_rec/optimizers/gradient_descent.py @@ -55,11 +55,16 @@ class CustomizedGradientDescent(gradient_descent.GradientDescentOptimizer, Custo super(CustomizedGradientDescent, self).__init__(learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name) self._slot_num = 0 + self._derivative = 1 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/gradient_descent_by_addr.py b/mx_rec/optimizers/gradient_descent_by_addr.py index 22b33852..11a9fda6 100644 --- a/mx_rec/optimizers/gradient_descent_by_addr.py +++ b/mx_rec/optimizers/gradient_descent_by_addr.py @@ -60,11 +60,16 @@ class CustomizedGradientDescentByAddr(gradient_descent.GradientDescentOptimizer, name=self.unique_name) self._slot_num = 0 + self._derivative = 1 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index d79b6d23..1ed68556 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -72,11 +72,16 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): super(CustomizedLazyAdam, self).__init__(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, use_locking=use_locking, name=self.unique_name) self._slot_num = 2 + self._derivative = 2 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. def creat_one_single_slot(var, op_name): diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index 92252824..e147c7bf 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -73,11 +73,16 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): name=self.unique_name) self._slot_num = 2 + self._derivative = 2 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def get_slot_init_values(self): # return state value list of adam that needs to initialize in ASC DDR. 
initial_momentum_value = 0.0 diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index d5563ce4..eb618f40 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -663,7 +663,6 @@ void HybridMgmt::EvalTask(TaskType type) /// \param channelId 通道索引(训练/推理) /// \param batchId 已处理的batch数 /// \return -// lqklqk bool HybridMgmt::ParseKeysHBM(int channelId, int& batchId) { LOG_INFO(MGMT + "nBatch:{} channelId:{} batchId:{}, ParseKeys with HBM mode start.", @@ -705,8 +704,7 @@ bool HybridMgmt::ParseKeysHBM(int channelId, int& batchId) LOG_DEBUG("channelId:{} batchId:{}, sendLookupSyncTC(ms):{}", channelId, batchId, sendLookupSyncTC.ElapsedMS()); // 训练时,使用全局去重聚合梯度,发送全局去重的key和对应的恢复向量 - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && - channelId == TRAIN_CHANNEL_ID) { + if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID) { SendUniqKeysAndRestoreVecHBM(channelId, batchId, embInfo, infoVecs); } @@ -865,8 +863,7 @@ bool HybridMgmt::ProcessEmbInfo(const std::string& embName, int batchId, int cha LOG_DEBUG("channelId:{} batchId:{}, hostHashMapProcessTC(ms):{}", channelId, batchId, hostHashMapProcessTC.ElapsedMS()); - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && - channelId == TRAIN_CHANNEL_ID && remainBatchOut) { + if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID && remainBatchOut) { SendUniqKeysAndRestoreVecDDR(embName, batchId, channelId, ddrParam); } diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index f76f6907..8ab030a8 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -470,8 +470,7 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) { - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && - channel == TRAIN_CHANNEL_ID) { + if (rankInfo.useSumSameIdGradients && channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; diff --git a/src/core/utils/common.cpp b/src/core/utils/common.cpp index 38e64444..9512b181 100644 --- a/src/core/utils/common.cpp +++ b/src/core/utils/common.cpp @@ -47,6 +47,7 @@ namespace MxRec { useStatic = static_cast(option) bitand HybridOption::USE_STATIC; useHot = static_cast(option) bitand HybridOption::USE_HOT; useDynamicExpansion = static_cast(option) bitand HybridOption::USE_DYNAMIC_EXPANSION; + useSumSameIdGradients = static_cast(option) bitand HybridOption::USE_SUM_SAME_ID_GRADIENTS; } RankInfo::RankInfo(int localRankSize, int option, const vector& maxStep) diff --git a/src/core/utils/common.h b/src/core/utils/common.h index f6c3de3f..9706a699 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -115,9 +115,10 @@ namespace MxRec { using TensorInfoT = std::tuple>>::iterator>; namespace HybridOption { - const unsigned int USE_STATIC = 0x001; - const unsigned int USE_HOT = 0x001 << 1; - const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 2; + const unsigned int USE_STATIC = 0x0001; + const unsigned int USE_HOT = 0x0001 << 1; + const unsigned int USE_DYNAMIC_EXPANSION = 0x0001 << 2; + const unsigned int USE_SUM_SAME_ID_GRADIENTS = 0x0001 << 3; }; string GetChipName(int devID); @@ -226,6 +227,7 @@ namespace MxRec { bool isDDR { false }; bool isSSDEnabled 
{ false }; bool useDynamicExpansion {false}; + bool useSumSameIdGradients {true}; std::vector ctrlSteps; // 包含三个步数: train_steps, eval_steps, save_steps }; diff --git a/src/core/utils/config.cpp b/src/core/utils/config.cpp index 9cfec739..57478553 100644 --- a/src/core/utils/config.cpp +++ b/src/core/utils/config.cpp @@ -20,13 +20,7 @@ See the License for the specific language governing permissions and using namespace std; namespace MxRec { - namespace ApplyGradientsStrategyOptions { - const std::string DIRECT_APPLY = "direct_apply"; - const std::string SUM_SAME_ID_GRADIENTS_AND_APPLY = "sum_same_id_gradients_and_apply"; - }; - // 设置环境变量默认值 - string GlobalEnv::applyGradientsStrategy = ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY; int GlobalEnv::aclTimeout = -1; // 默认阻塞方式,一直等待直到数据接收完成。 int GlobalEnv::hdChannelSize = 40; // 默认通道深度40 int GlobalEnv::keyProcessThreadNum = 6; // 默认6个线程 @@ -42,12 +36,6 @@ namespace MxRec { /// 配置环境变量,Python侧已经做了变量值校验,CPP侧直接使用即可;bool类型,1代表true,0代表false void ConfigGlobalEnv() { - // 设置梯度策略 - const char *envStrategy = getenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY); - if (envStrategy != nullptr) { - GlobalEnv::applyGradientsStrategy = envStrategy; - } - // 设置ACL超时时间 const char *envAclTimeout = getenv(RecEnvNames::ACL_TIMEOUT); if (envAclTimeout != nullptr) { @@ -117,9 +105,8 @@ namespace MxRec { void LogGlobalEnv() { - LOG_DEBUG("Environment variables are: [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], " + LOG_DEBUG("Environment variables are: [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], " "[{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}]", - RecEnvNames::APPLY_GRADIENTS_STRATEGY, GlobalEnv::applyGradientsStrategy, RecEnvNames::ACL_TIMEOUT, GlobalEnv::aclTimeout, RecEnvNames::HD_CHANNEL_SIZE, GlobalEnv::hdChannelSize, RecEnvNames::KEY_PROCESS_THREAD_NUM, GlobalEnv::keyProcessThreadNum, diff --git a/src/core/utils/config.h b/src/core/utils/config.h index 4c56c0d4..3ecb4c36 100644 --- a/src/core/utils/config.h +++ b/src/core/utils/config.h @@ -20,7 +20,6 @@ See the License for the specific language governing permissions and namespace MxRec { namespace RecEnvNames { - const char *const APPLY_GRADIENTS_STRATEGY = "APPLY_GRADIENTS_STRATEGY"; const char *const ACL_TIMEOUT = "AclTimeout"; const char *const HD_CHANNEL_SIZE = "HD_CHANNEL_SIZE"; const char *const KEY_PROCESS_THREAD_NUM = "KEY_PROCESS_THREAD_NUM"; @@ -34,13 +33,7 @@ namespace MxRec { const char *const RECORD_KEY_COUNT = "RECORD_KEY_COUNT"; }; - namespace ApplyGradientsStrategyOptions { - extern const std::string DIRECT_APPLY; - extern const std::string SUM_SAME_ID_GRADIENTS_AND_APPLY; - }; - struct GlobalEnv { - static std::string applyGradientsStrategy; static int aclTimeout; static int hdChannelSize; static int keyProcessThreadNum; diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index 403692fb..b0249ca6 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -69,6 +69,8 @@ namespace { m.attr("USE_DYNAMIC_EXPANSION") = py::int_(HybridOption::USE_DYNAMIC_EXPANSION); + m.attr("USE_SUM_SAME_ID_GRADIENTS") = py::int_(HybridOption::USE_SUM_SAME_ID_GRADIENTS); + GetRankInfo(m); GetEmbInfoParams(m); diff --git a/src/tests/utils/config_test.cpp b/src/tests/utils/config_test.cpp index d7e51b57..54e0ec67 100644 --- a/src/tests/utils/config_test.cpp +++ b/src/tests/utils/config_test.cpp @@ -24,7 +24,6 @@ using namespace MxRec; void SetEnvironmentVariables() { - setenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY, 
"sum_same_id_gradients_and_apply", 1); setenv(RecEnvNames::ACL_TIMEOUT, "100", 1); setenv(RecEnvNames::HD_CHANNEL_SIZE, "50", 1); setenv(RecEnvNames::KEY_PROCESS_THREAD_NUM, "8", 1); @@ -40,7 +39,6 @@ void SetEnvironmentVariables() void UnsetEnvironmentVariables() { - unsetenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY); unsetenv(RecEnvNames::ACL_TIMEOUT); unsetenv(RecEnvNames::HD_CHANNEL_SIZE); unsetenv(RecEnvNames::KEY_PROCESS_THREAD_NUM); @@ -56,7 +54,6 @@ void UnsetEnvironmentVariables() TEST(GlobalEnv, DefaultValues) { - ASSERT_EQ(GlobalEnv::applyGradientsStrategy, ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY); ASSERT_EQ(GlobalEnv::aclTimeout, -1); ASSERT_EQ(GlobalEnv::hdChannelSize, 40); ASSERT_EQ(GlobalEnv::keyProcessThreadNum, 6); @@ -77,7 +74,6 @@ TEST(GlobalEnv, ConfigGlobalEnv) ConfigGlobalEnv(); // 验证环境变量是否已经被正确配置 - ASSERT_EQ(GlobalEnv::applyGradientsStrategy, "sum_same_id_gradients_and_apply"); ASSERT_EQ(GlobalEnv::aclTimeout, 100); ASSERT_EQ(GlobalEnv::hdChannelSize, 50); ASSERT_EQ(GlobalEnv::keyProcessThreadNum, 8); -- Gitee From 63a8f1b259325a43152e2939ac699ca3f7297997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 8 Apr 2024 11:51:05 +0800 Subject: [PATCH 014/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8Dcpp=E6=B5=8B=E6=94=B9=E5=8A=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 1 - mx_rec/core/asc/manager.py | 2 +- tests/mx_rec/core/test_build_graph.py | 4 ---- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index 03fa28b4..a57297fa 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -166,7 +166,6 @@ class ASCAnchorAttr(Enum): MOCK_LOOKUP_RESULT = "mock_lookup_result" RESTORE_VECTOR_SECOND = "restore_vector_second" UNIQUE_KEYS = "unique_keys" - GRADIENTS_STRATEGY = "gradients_strategy" IS_GRAD = "is_grad" diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index 4d822b37..f50037ea 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -206,7 +206,7 @@ def initialize_emb_cache(table_info_list, threshold_list): option = option | USE_DYNAMIC_EXPANSION optimizer = ConfigInitializer.get_instance().optimizer_config.optimizer_instance - if optimizer.derivative == 1: + if optimizer.derivative == 2: option = option | USE_SUM_SAME_ID_GRADIENTS # [train_steps, eval_steps, save_steps] pass step information to HybridMgmt for data process loop diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index c15d851f..0b90b790 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -346,7 +346,6 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, use_hot=True, use_dynamic_expansion=True) - global_env.apply_gradients_strategy = "direct_apply" @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), @@ -363,7 +362,6 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc - global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" 
with tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer(use_static=True) build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) @@ -388,7 +386,6 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc - global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer() build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) @@ -413,7 +410,6 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc - global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer() build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) -- Gitee From 5b0cb455ad570189dacc5e1b00942af04d74f810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 8 Apr 2024 15:45:54 +0800 Subject: [PATCH 015/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo/run_mode.py | 4 +- mx_rec/constants/constants.py | 1 - mx_rec/core/asc/build_graph.py | 50 ----------------- mx_rec/core/emb/dynamic_sparse_embedding.py | 10 ++-- mx_rec/core/emb/sparse_embedding.py | 7 +-- mx_rec/optimizers/base.py | 61 +++++++++++++++++++++ mx_rec/optimizers/lazy_adam.py | 10 ++-- 7 files changed, 75 insertions(+), 68 deletions(-) diff --git a/examples/demo/little_demo/run_mode.py b/examples/demo/little_demo/run_mode.py index 0f7a8cc4..305d9f64 100644 --- a/examples/demo/little_demo/run_mode.py +++ b/examples/demo/little_demo/run_mode.py @@ -95,11 +95,11 @@ class RunMode: self.train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))): - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) # do sparse optimization by addr local_grads = tf.gradients(loss, train_emb_list) # local_embedding diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index a57297fa..2c2cd2fe 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -22,7 +22,6 @@ ASCEND_GLOBAL_HASHTABLE_COLLECTION = "ASCEND_GLOBAL_HASHTABLE_COLLECTION" ASCEND_CUTTING_POINT_INITIALIZER = "ASCEND_CUTTING_POINT_INITIALIZER" ASCEND_SPARSE_LOOKUP_ENTRANCE = "ASCEND_SPARSE_LOOKUP_ENTRANCE" ASCEND_SPARSE_LOOKUP_ID_OFFSET = "ASCEND_SPARSE_LOOKUP_ID_OFFSET" -ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS = "ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS" ASCEND_TIMESTAMP = "ASCEND_TIMESTAMP" ASCEND_SPARSE_LOOKUP_LOCAL_EMB = "ASCEND_SPARSE_LOOKUP_LOCAL_EMB" EMPTY_STR = "" diff --git a/mx_rec/core/asc/build_graph.py b/mx_rec/core/asc/build_graph.py index 13ddad4a..2bb72621 100644 --- 
a/mx_rec/core/asc/build_graph.py +++ b/mx_rec/core/asc/build_graph.py @@ -22,7 +22,6 @@ import tensorflow as tf import mxrec_pybind from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.tf_version_adapter import npu_ops -from mx_rec.constants.constants import TRAIN_CHANNEL_ID from mx_rec.util.log import logger @@ -81,46 +80,6 @@ def get_id_offsets(max_lookup_vec_size, config): return id_offsets, swap_pos, swap_len -def get_restore_vector_second(max_lookup_vec_size: int, config: dict) -> tf.Tensor: - """ - Get restore vector which is calculated after the second all2all - :param max_lookup_vec_size: the size of restore_vector_second - :param config: embedding config - :return: the restore vector calculated after the second all2all - """ - logger.debug('Channel %s_restore_second_%s was built for getnext', - config.get("table_name"), config.get("channel_id")) - with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): - restore_vector_second = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_restore_second_{config.get("channel_id")}')[0] - return restore_vector_second - - -def get_unique_keys(max_lookup_vec_size: int, config: dict) -> tf.Tensor: - """ - Get the global unique keys which is calculated after the second all2all - :param max_lookup_vec_size: the size of global unique keys - :param config: embedding config - :return: the global unique keys calculated after the second all2all - """ - logger.debug('Channel %s_uniquekeys_%s was built for getnext', config.get("table_name"), config.get("channel_id")) - with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): - if config.get("use_dynamic_expansion"): - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int64], - output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_uniquekeys_{config.get("channel_id")}')[0] - return unique_keys - - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_uniquekeys_{config.get("channel_id")}')[0] - return unique_keys - - def get_all2all_args(use_static: bool, config: dict) -> Optional[list]: """ Get all2all parameters for dynamic condition @@ -211,13 +170,4 @@ def get_preprocessed_tensor_for_asc(table, config): 'all2all_args': all2all_args, } - if config.get("channel_id") != TRAIN_CHANNEL_ID: - return result - - with tf.compat.v1.variable_scope("restore_vector_second"): - restore_vector_second = get_restore_vector_second(max_lookup_vec_size, config) - - with tf.compat.v1.variable_scope("unique_keys"): - unique_keys = get_unique_keys(max_lookup_vec_size, config) - result.update({'restore_vector_second': restore_vector_second, 'unique_keys': unique_keys}) return result diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py index 194b2795..c2e8d9e5 100644 --- a/mx_rec/core/emb/dynamic_sparse_embedding.py +++ b/mx_rec/core/emb/dynamic_sparse_embedding.py @@ -6,10 +6,9 @@ import abc from typing import Optional, Union, Callable import tensorflow as tf -from tensorflow.python.ops import array_ops from mx_rec.constants.constants import ASCEND_TABLE_NAME_MUST_CONTAIN, ASCEND_SPARSE_LOOKUP_LOCAL_EMB, \ - ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.build_graph 
import get_preprocessed_tensor_for_asc
 from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding
@@ -51,9 +50,7 @@ class DynamicSparseEmbedding(BaseSparseEmbedding):
 
     def _get_update_grad(self, local_grad: tf.Tensor, result: dict,
                          table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]:
-        return tf.compat.v1.unsorted_segment_sum(local_grad,
-                                                 result.get("restore_vector_second"),
-                                                 array_ops.shape(result.get("unique_keys"))[0])
+        return local_grad
 
     def _get_local_embeddings(self, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict,
                               feature_spec: FeatureSpec, **kwargs) -> tf.Tensor:
@@ -72,7 +69,8 @@ class DynamicSparseEmbedding(BaseSparseEmbedding):
             return sparse_forward_fn(local_embeddings)
 
         tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings)
-        tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS, result.get("unique_keys"))
+        tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get("id_offsets"))
+        # tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS, result.get("unique_keys"))
 
         return sparse_forward_fn(local_embeddings)
 
diff --git a/mx_rec/core/emb/sparse_embedding.py b/mx_rec/core/emb/sparse_embedding.py
index d8ce63b1..938f917d 100644
--- a/mx_rec/core/emb/sparse_embedding.py
+++ b/mx_rec/core/emb/sparse_embedding.py
@@ -53,11 +53,8 @@ class SparseEmbedding(BaseSparseEmbedding):
 
     def _get_update_grad(self, local_grad: tf.Tensor, result: dict,
                          table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]:
-        unique_local_grad = tf.compat.v1.unsorted_segment_sum(local_grad,
-                                                              result.get("restore_vector_second"),
-                                                              array_ops.shape(result.get("unique_keys"))[0])
-        return ops.IndexedSlices(values=unique_local_grad,
-                                 indices=result.get("unique_keys"),
+        return ops.IndexedSlices(values=local_grad,
+                                 indices=result.get("id_offsets"),
                                  dense_shape=tf.shape(table))
 
     def _get_local_embeddings(self, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict,
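
Reviewer's note: after this change, `SparseEmbedding._get_update_grad` no longer pre-aggregates duplicate IDs; it hands the optimizer a raw `tf.IndexedSlices` keyed by `id_offsets` (the dynamic-expansion variant returns the raw gradient tensor directly), and the summation moves into `CustomizedOptimizer.sum_same_id_gradients` in the next diff. A toy sketch of what such a sparse gradient carries — all tensor values below are invented:

```python
import tensorflow as tf

local_grad = tf.constant([[0.1, 0.1], [0.2, 0.2], [0.3, 0.3]])
id_offsets = tf.constant([4, 7, 4])        # row 4 was looked up twice
table_shape = tf.constant([10, 2])         # hypothetical 10-row table, dim 2

sparse_grad = tf.IndexedSlices(values=local_grad,
                               indices=id_offsets,
                               dense_shape=table_shape)
# Densifying sums repeated indices, which is why deferring the
# aggregation is safe: row 4 still ends up with 0.1 + 0.3 per column.
dense_grad = tf.scatter_nd(tf.expand_dims(id_offsets, 1),
                           local_grad, table_shape)
```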
diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py
index a5d68a70..b4115bce 100644
--- a/mx_rec/optimizers/base.py
+++ b/mx_rec/optimizers/base.py
@@ -21,9 +21,13 @@ from __future__ import print_function
 
 from collections import defaultdict
 
+import tensorflow as tf
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.training.optimizer import _TensorProcessor
 
+from mx_rec.util.tf_version_adapter import npu_ops
+from mx_rec.util.initialize import ConfigInitializer
 from mx_rec.util.log import logger
 
@@ -54,6 +58,63 @@ class CustomizedOptimizer:
         self.unique_name = name + "_" + str(count)
         self.base_name = name
 
+    def get_restore_vector_second(table_name) -> tf.Tensor:
+        """
+        Get restore vector which is calculated after the second all2all
+        :param table_name: embedding table_name
+        :return: the restore vector calculated after the second all2all
+        """
+        channel_id = 0
+        logger.debug('Channel %s_restore_second_%s was built for getnext',
+                     table_name, channel_id)
+        with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE):
+            restore_vector_second = npu_ops.gen_npu_ops.get_next(
+                output_types=[tf.int32],
+                output_shapes=[[None]],
+                channel_name=f'{table_name}_restore_second_{channel_id}')[0]
+        return restore_vector_second
+
+    def get_unique_keys(table_name, is_expansion) -> tf.Tensor:
+        """
+        Get the global unique keys which is calculated after the second all2all
+        :param table_name: embedding table_name
+        :param is_expansion: use dynamic expansion
+        :return: the global unique keys calculated after the second all2all
+        """
+        channel_id = 0
+        logger.debug('Channel %s_uniquekeys_%s was built for getnext', table_name, channel_id)
+        with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE):
+            if is_expansion:
+                unique_keys = npu_ops.gen_npu_ops.get_next(
+                    output_types=[tf.int64],
+                    output_shapes=[[None]],
+                    channel_name=f'{table_name}_uniquekeys_{channel_id}')[0]
+                return unique_keys
+
+            unique_keys = npu_ops.gen_npu_ops.get_next(
+                output_types=[tf.int32],
+                output_shapes=[[None]],
+                channel_name=f'{table_name}_uniquekeys_{channel_id}')[0]
+        return unique_keys
+
+    def sum_same_id_gradients(self, grad, var, is_expansion):
+        table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var)
+        table_name = table_instance.table_name
+        with tf.compat.v1.variable_scope("restore_vector_second"):
+            restore_vector_second = self.get_restore_vector_second(table_name)
+
+        with tf.compat.v1.variable_scope("unique_keys"):
+            unique_keys = self.get_unique_keys(table_name, is_expansion)
+
+        unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad,
+                                                              restore_vector_second,
+                                                              array_ops.shape(unique_keys)[0])
+        if is_expansion:
+            unique_local_grad = ops.IndexedSlices(values=unique_local_grad,
+                                                  indices=unique_keys,
+                                                  dense_shape=tf.shape(var))
+        return unique_local_grad, unique_keys
+
     def custom_update_op(self, opt, grad):
 
         if isinstance(grad, ops.Tensor):
diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py
index 1ed68556..70549702 100644
--- a/mx_rec/optimizers/lazy_adam.py
+++ b/mx_rec/optimizers/lazy_adam.py
@@ -156,6 +156,8 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer):
                                  lambda x, i, v: tf.compat.v1.scatter_nd_add(x, i, v))
 
     def _apply_sparse_shared(self, grad, var, indices, scatter_nd_add):
+        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad, var=var, is_expansion=False)
+
         power_b1, power_b2 = self._get_beta_accumulators()
         power_b1 = math_ops.cast(power_b1, var.dtype.base_dtype)
         power_b2 = math_ops.cast(power_b2, var.dtype.base_dtype)
@@ -166,17 +168,17 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer):
         temp_epsilon = temp.get("temp_epsilon")
         learning_rate = tf.divide(temp_lr * math_ops.sqrt(1 - power_b2), (1 - power_b1))
 
-        abs_indices = tf.math.maximum(indices, 0)
-        nd_indices = tf.expand_dims(indices, 1)
+        abs_indices = tf.math.maximum(unique_keys, 0)
+        nd_indices = tf.expand_dims(unique_keys, 1)
 
         momentum = self.get_slot(var, "m")
         old_m_slice = tf.gather(momentum, abs_indices)
-        m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * grad
+        m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * unique_local_grad
         m_update_op = scatter_nd_add(momentum, nd_indices, m_t_slice - old_m_slice)
 
         velocity = self.get_slot(var, "v")
         old_v_slice = tf.gather(velocity, abs_indices)
-        v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad)
+        v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(unique_local_grad)
         v_update_op = scatter_nd_add(velocity, nd_indices, v_t_slice - old_v_slice)
 
         denominator_slice = math_ops.sqrt(v_t_slice) + temp_epsilon
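
Reviewer's note: the numeric core of `sum_same_id_gradients` above is `unsorted_segment_sum` — the restore vector maps every lookup row to its slot in the globally deduplicated key list, so gradients of repeated IDs collapse into one row, and the lazy-Adam update that follows touches each embedding row exactly once. A toy sketch with invented values:

```python
import tensorflow as tf

grad = tf.constant([[1.0, 1.0], [2.0, 2.0], [4.0, 4.0]])
restore_vector_second = tf.constant([0, 1, 0])  # rows 0 and 2 share a key
unique_keys = tf.constant([11, 42])             # deduplicated key list

unique_grad = tf.math.unsorted_segment_sum(
    grad, restore_vector_second, tf.shape(unique_keys)[0])
# unique_grad -> [[5.0, 5.0], [2.0, 2.0]]: one aggregated row per unique key
```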
--
Gitee

From f9de15aae3ba106aa216729f37505220d584aa96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Mon, 8 Apr 2024 16:07:10 +0800
Subject: [PATCH 016/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?=
 =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/core/emb/dynamic_sparse_embedding.py | 1 -
 mx_rec/optimizers/base.py                   | 4 ++--
 mx_rec/optimizers/lazy_adam_by_addr.py      | 2 ++
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py
index c2e8d9e5..671c593e 100644
--- a/mx_rec/core/emb/dynamic_sparse_embedding.py
+++ b/mx_rec/core/emb/dynamic_sparse_embedding.py
@@ -70,7 +70,6 @@ class DynamicSparseEmbedding(BaseSparseEmbedding):
 
         tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings)
         tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get("id_offsets"))
-        # tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS, result.get("unique_keys"))
 
         return sparse_forward_fn(local_embeddings)
 
diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py
index b4115bce..77067b58 100644
--- a/mx_rec/optimizers/base.py
+++ b/mx_rec/optimizers/base.py
@@ -58,7 +58,7 @@ class CustomizedOptimizer:
         self.unique_name = name + "_" + str(count)
         self.base_name = name
 
-    def get_restore_vector_second(table_name) -> tf.Tensor:
+    def get_restore_vector_second(self, table_name: str) -> tf.Tensor:
         """
         Get restore vector which is calculated after the second all2all
         :param table_name: embedding table_name
@@ -74,7 +74,7 @@ class CustomizedOptimizer:
                 channel_name=f'{table_name}_restore_second_{channel_id}')[0]
         return restore_vector_second
 
-    def get_unique_keys(table_name, is_expansion) -> tf.Tensor:
+    def get_unique_keys(self, table_name: str, is_expansion: bool) -> tf.Tensor:
         """
         Get the global unique keys which is calculated after the second all2all
         :param table_name: embedding table_name
diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py
index e147c7bf..ad9d6ca8 100644
--- a/mx_rec/optimizers/lazy_adam_by_addr.py
+++ b/mx_rec/optimizers/lazy_adam_by_addr.py
@@ -119,6 +119,8 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer):
                                         addr)
 
     def _apply_sparse_shared(self, grad, addr):
+        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad, var=addr, is_expansion=True)
+
         power_b1, power_b2 = self._get_beta_accumulators()
         power_b1 = math_ops.cast(power_b1, grad.dtype.base_dtype)
         power_b2 = math_ops.cast(power_b2, grad.dtype.base_dtype)
--
Gitee

From 278118aa8ec96d788f090d656172fa68aeaa86f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Mon, 8 Apr 2024 16:07:10 +0800
Subject: [PATCH 017/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?=
 =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/core/emb/dynamic_sparse_embedding.py |  1 -
 mx_rec/optimizers/base.py                   | 14 +++++++++-----
 mx_rec/optimizers/lazy_adam_by_addr.py      | 18 ++++++++++--------
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py
index c2e8d9e5..671c593e 100644
--- a/mx_rec/core/emb/dynamic_sparse_embedding.py
+++ b/mx_rec/core/emb/dynamic_sparse_embedding.py
@@ -70,7 +70,6 @@ class DynamicSparseEmbedding(BaseSparseEmbedding):
 
         tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings)
tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get("id_offsets")) - # tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS, result.get("unique_keys")) return sparse_forward_fn(local_embeddings) diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index b4115bce..91c72d52 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -58,7 +58,7 @@ class CustomizedOptimizer: self.unique_name = name + "_" + str(count) self.base_name = name - def get_restore_vector_second(table_name) -> tf.Tensor: + def get_restore_vector_second(self, table_name: str) -> tf.Tensor: """ Get restore vector which is calculated after the second all2all :param table_name: embedding table_name @@ -74,7 +74,7 @@ class CustomizedOptimizer: channel_name=f'{table_name}_restore_second_{channel_id}')[0] return restore_vector_second - def get_unique_keys(table_name, is_expansion) -> tf.Tensor: + def get_unique_keys(self, table_name: str, is_expansion: bool) -> tf.Tensor: """ Get the global unique keys which is calculated after the second all2all :param table_name: embedding table_name @@ -98,8 +98,12 @@ class CustomizedOptimizer: return unique_keys def sum_same_id_gradients(self, grad, var, is_expansion): - table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) - table_name = table_instance.table_name + if isinstance(var, ops.Tensor): + # 扩容模式从scope获取表名 + table_name = var.op.name.split('/')[0] + else: + table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) + table_name = table_instance.table_name with tf.compat.v1.variable_scope("restore_vector_second"): restore_vector_second = self.get_restore_vector_second(table_name) @@ -109,7 +113,7 @@ class CustomizedOptimizer: unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad, restore_vector_second, array_ops.shape(unique_keys)[0]) - if is_expansion: + if not is_expansion: unique_local_grad = ops.IndexedSlices(values=unique_local_grad, indices=unique_keys, dense_shape=tf.shape(var)) diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index e147c7bf..0f7d7139 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -119,10 +119,12 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): addr) def _apply_sparse_shared(self, grad, addr): + unique_local_grad, unique_addr = self.sum_same_id_gradients(grad=grad, var=addr, is_expansion=True) + power_b1, power_b2 = self._get_beta_accumulators() - power_b1 = math_ops.cast(power_b1, grad.dtype.base_dtype) - power_b2 = math_ops.cast(power_b2, grad.dtype.base_dtype) - temp = self._cast_to_base_type(grad) + power_b1 = math_ops.cast(power_b1, unique_local_grad.dtype.base_dtype) + power_b2 = math_ops.cast(power_b2, unique_local_grad.dtype.base_dtype) + temp = self._cast_to_base_type(unique_local_grad) temp_lr = temp.get("temp_lr") temp_b1 = temp.get("temp_b1") temp_b2 = temp.get("temp_b2") @@ -130,23 +132,23 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): learning_rate = tf.divide(temp_lr * math_ops.sqrt(1 - power_b2), (1 - power_b1)) host_pipeline_ops = import_host_pipeline_ops() - dim = grad.shape.as_list()[-1] + dim = unique_local_grad.shape.as_list()[-1] combined_tensor = \ - host_pipeline_ops.embedding_lookup_by_address(addr, embedding_dim=3 * dim, embedding_type=1) + host_pipeline_ops.embedding_lookup_by_address(unique_addr, embedding_dim=3 * dim, 
embedding_type=1) split_length = [dim] + [dim] + [dim] split_tensors = tf.split(combined_tensor, split_length, axis=1) old_m_slice = split_tensors[1] - m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * grad + m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * unique_local_grad old_v_slice = split_tensors[2] - v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad) + v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(unique_local_grad) denominator_slice = math_ops.sqrt(v_t_slice) + temp_epsilon update_list = [tf.divide(-learning_rate * m_t_slice, denominator_slice)] + [m_t_slice - old_m_slice] + \ [v_t_slice - old_v_slice] update_tensor = tf.concat(update_list, axis=1) - var_update_op = host_pipeline_ops.embedding_update_by_address(addr, update_tensor, update_type=0) + var_update_op = host_pipeline_ops.embedding_update_by_address(unique_addr, update_tensor, update_type=0) return var_update_op -- Gitee From b85136384f035fcea249b4452d9a119cc633c253 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 8 Apr 2024 19:26:38 +0800 Subject: [PATCH 018/302] =?UTF-8?q?mxrec=E6=9E=84=E5=BB=BA=E4=BC=98?= =?UTF-8?q?=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdis?= =?UTF-8?q?t=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...ld_tf1_with_opensource.sh => build_tf1.sh} | 47 ++------- ...ld_tf2_with_opensource.sh => build_tf2.sh} | 47 ++------- build/{build.sh => gen_mxrec_tar_pkg.sh} | 96 +++++-------------- build/gen_tf1_tar_pkg.sh | 33 +++++++ build/gen_tf2_tar_pkg.sh | 33 +++++++ setup.py | 83 +++++----------- setup_tf1.py | 88 +++++++++++++++++ setup_tf2.py | 88 +++++++++++++++++ 8 files changed, 304 insertions(+), 211 deletions(-) rename build/{build_tf1_with_opensource.sh => build_tf1.sh} (75%) rename build/{build_tf2_with_opensource.sh => build_tf2.sh} (75%) rename build/{build.sh => gen_mxrec_tar_pkg.sh} (44%) create mode 100644 build/gen_tf1_tar_pkg.sh create mode 100644 build/gen_tf2_tar_pkg.sh create mode 100644 setup_tf1.py create mode 100644 setup_tf2.py diff --git a/build/build_tf1_with_opensource.sh b/build/build_tf1.sh similarity index 75% rename from build/build_tf1_with_opensource.sh rename to build/build_tf1.sh index ff59571c..fe2a78be 100644 --- a/build/build_tf1_with_opensource.sh +++ b/build/build_tf1.sh @@ -15,7 +15,7 @@ # ============================================================================== ################################################################## -# build_tf1_with_opensource.sh 编译MxRec和动态扩容算子 +# build_tf1.sh 编译MxRec和动态扩容算子 # 编译环境:Python3.7.5 GCC 7.3.0 CMake 3.20.6 # 代码主要分为四部分: # 1、准备编译MxRec所需依赖:pybind11(v2.10.3) securec @@ -64,33 +64,6 @@ source /opt/buildtools/tf1_env/bin/activate tf1_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow_core deactivate tf1_env -project_output_path="${MxRec_DIR}"/output/ -VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml - -function get_version() { - if [ -f "$VERSION_FILE" ]; then - VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") - if [[ "$VERSION" == *.[b/B]* ]] && [[ "$VERSION" != *.[RC/rc]* ]]; then - VERSION=${VERSION%.*} - fi - else - VERSION="5.0.0" - fi -} - -rm -rf "${project_output_path}" -rm -rf "${SCRIPT_DIR}/lib" - -# 获取MxRec版本信息 -get_version -export VERSION -echo "MindX SDK MxRec: ${VERSION}" >> ./version.info - -pkg_dir=mindxsdk-mxrec -rm -rf "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info 
"${pkg_dir}" - # 配置MxRec C++代码路径和AccCTR路径 src_path="${MxRec_DIR}"/src acc_ctr_path="${MxRec_DIR}"/src/AccCTR @@ -142,11 +115,6 @@ function gen_wheel_file() touch "${src_path}"/libasc/__init__.py rm -rf "${MxRec_DIR}"/mx_rec/libasc mv "${src_path}"/libasc "${MxRec_DIR}"/mx_rec - python3.7 setup.py bdist_wheel --plat-name=linux_$(arch) - mkdir -p "$1" - echo "moving whl file $1" - mv dist/mx_rec*.whl "$1" - rm -rf "${MxRec_DIR}"/mx_rec/libasc } # start to build MxRec @@ -158,13 +126,12 @@ echo "---------------- compile MxRec so files ----------------" compile_so_file "${tf1_path}" echo "---------------- collect so files and mv them to libasc ----------------" collect_so_file -echo "---------------- generate MxRec wheel package ----------------" -gen_wheel_file "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl +gen_wheel_file echo "---------------- compile MxRec success!!!! ----------------" # start to compile cust op -echo "---------------- start to compile cust op ----------------" -cd "${MxRec_DIR}"/cust_op/cust_op_by_addr -chmod u+x run.sh -./run.sh -echo "---------------- compile cust op success!!!! ----------------" \ No newline at end of file +#echo "---------------- start to compile cust op ----------------" +#cd "${MxRec_DIR}"/cust_op/cust_op_by_addr +#chmod u+x run.sh +#./run.sh +#echo "---------------- compile cust op success!!!! ----------------" \ No newline at end of file diff --git a/build/build_tf2_with_opensource.sh b/build/build_tf2.sh similarity index 75% rename from build/build_tf2_with_opensource.sh rename to build/build_tf2.sh index 08aaf164..50a6c5a0 100644 --- a/build/build_tf2_with_opensource.sh +++ b/build/build_tf2.sh @@ -15,7 +15,7 @@ # ============================================================================== ################################################################## -# build_tf2_with_opensource.sh 编译MxRec和动态扩容算子 +# build_tf2.sh 编译MxRec和动态扩容算子 # 编译环境:Python3.7.5 GCC 7.3.0 CMake 3.20.6 # 代码主要分为四部分: # 1、准备编译MxRec所需依赖:pybind11(v2.10.3) securec @@ -64,33 +64,6 @@ source /opt/buildtools/tf2_env/bin/activate tf2_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow deactivate tf2_env -project_output_path="${MxRec_DIR}"/output/ -VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml - -function get_version() { - if [ -f "$VERSION_FILE" ]; then - VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") - if [[ "$VERSION" == *.[b/B]* ]] && [[ "$VERSION" != *.[RC/rc]* ]]; then - VERSION=${VERSION%.*} - fi - else - VERSION="5.0.0" - fi -} - -rm -rf "${project_output_path}" -rm -rf "${SCRIPT_DIR}/lib" - -# 获取MxRec版本信息 -get_version -export VERSION -echo "MindX SDK MxRec: ${VERSION}" >> ./version.info - -pkg_dir=mindxsdk-mxrec -rm -rf "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - # 配置MxRec C++代码路径和AccCTR路径 src_path="${MxRec_DIR}"/src acc_ctr_path="${MxRec_DIR}"/src/AccCTR @@ -142,11 +115,6 @@ function gen_wheel_file() touch "${src_path}"/libasc/__init__.py rm -rf "${MxRec_DIR}"/mx_rec/libasc mv "${src_path}"/libasc "${MxRec_DIR}"/mx_rec - python3.7 setup.py bdist_wheel --plat-name=linux_$(arch) - mkdir -p "$1" - echo "moving whl file $1" - mv dist/mx_rec*.whl "$1" - rm -rf "${MxRec_DIR}"/mx_rec/libasc } # start to build MxRec @@ -158,13 +126,12 @@ echo "---------------- compile MxRec so files ----------------" compile_so_file "${tf2_path}" echo "---------------- collect so files and mv them to libasc ----------------" collect_so_file -echo "---------------- generate MxRec wheel package ----------------" -gen_wheel_file 
"$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl +gen_wheel_file echo "---------------- compile MxRec success!!!! ----------------" # start to compile cust op -echo "---------------- start to compile cust op ----------------" -cd "${MxRec_DIR}"/cust_op/cust_op_by_addr -chmod u+x run.sh -./run.sh -echo "---------------- compile cust op success!!!! ----------------" \ No newline at end of file +#echo "---------------- start to compile cust op ----------------" +#cd "${MxRec_DIR}"/cust_op/cust_op_by_addr +#chmod u+x run.sh +#./run.sh +#echo "---------------- compile cust op success!!!! ----------------" \ No newline at end of file diff --git a/build/build.sh b/build/gen_mxrec_tar_pkg.sh similarity index 44% rename from build/build.sh rename to build/gen_mxrec_tar_pkg.sh index 0eb688fd..2a53285f 100644 --- a/build/build.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -18,11 +18,9 @@ set -e warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } ARCH="$(uname -m)" SCRIPT_DIR=$(dirname "$(readlink -f "$0")") -ROOT_DIR=$(dirname "${SCRIPT_DIR}") -cd "$SCRIPT_DIR" +MxRec_DIR=$(dirname "${SCRIPT_DIR}") - -VERSION_FILE="${ROOT_DIR}"/../mindxsdk/build/conf/config.yaml +VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml get_version() { if [ -f "$VERSION_FILE" ]; then VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") @@ -30,96 +28,54 @@ get_version() { VERSION=${VERSION%.*} fi else - VERSION="5.0.0" - fi -} - -remove() -{ - if [ -d "$1" ]; then - rm -rf "$1" - elif [ -f "$1" ]; then - rm -f "$1" + VERSION="6.0.RC2" fi } -project_output_path="${ROOT_DIR}"/output/ -remove "${project_output_path}" -remove "${SCRIPT_DIR}/lib" get_version -export VERSION echo "MindX SDK mxrec: ${VERSION}" >> ./version.info pkg_dir=mindxsdk-mxrec -remove "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - -src_path="${ROOT_DIR}"/src -cd "${ROOT_DIR}" - release_tar=Ascend-"${pkg_dir}"_"${VERSION}"_linux-"${ARCH}".tar.gz +mv version.info "${SCRIPT_DIR}"/"${pkg_dir}" -gen_tar_file() +function gen_tar_file() { - cd "${src_path}" - cp -r "${src_path}"/../cust_op ../build/"${pkg_dir}" - cp -r "${src_path}"/../examples ../build/"${pkg_dir}" + cd "${MxRec_DIR}" + cp -r ./cust_op ./build/"${pkg_dir}" + cp -r ./examples ./build/"${pkg_dir}" # change dirs and files 's permission - chmod 550 ../build/"${pkg_dir}"/tf1_whl - chmod 550 ../build/"${pkg_dir}"/tf1_whl/mx_rec*.whl - chmod 550 ../build/"${pkg_dir}"/tf2_whl - chmod 550 ../build/"${pkg_dir}"/tf2_whl/mx_rec*.whl - chmod 550 ../build/"${pkg_dir}"/cust_op/ - chmod 550 ../build/"${pkg_dir}"/cust_op/cust_op_by_addr - cd ../build/"${pkg_dir}"/cust_op/cust_op_by_addr + chmod 550 ./build/"${pkg_dir}"/tf1_whl + chmod 550 ./build/"${pkg_dir}"/tf1_whl/mx_rec*.whl + chmod 550 ./build/"${pkg_dir}"/tf2_whl + chmod 550 ./build/"${pkg_dir}"/tf2_whl/mx_rec*.whl + chmod 550 ./build/"${pkg_dir}"/cust_op/ + chmod 550 ./build/"${pkg_dir}"/cust_op/cust_op_by_addr + cd ./build/"${pkg_dir}"/cust_op/cust_op_by_addr chmod 550 *.sh chmod 640 *.json chmod 550 op_host op_kernel op_host/* op_kernel/* cd - - cd ../build + cd ./build tar -zvcf "${release_tar}" "${pkg_dir}" || { warn "compression failed, packages might be broken" } - mv "${release_tar}" "${SCRIPT_DIR}"/../output/ + mv "${release_tar}" ../output/ } -clean() +function clean() { - remove "${ROOT_DIR}"/dist - remove "${ROOT_DIR}"/install - remove "${ROOT_DIR}"/mx_rec.egg-info - remove "${ROOT_DIR}"/src/build - remove "${ROOT_DIR}"/build/bdist.linux-"$(arch)" - remove "${ROOT_DIR}"/build/tf2_env - remove 
"${ROOT_DIR}"/build/tf1_env - remove "${ROOT_DIR}"/build/lib - remove "${ROOT_DIR}"/build/mindxsdk-mxrec + rm -rf "${MxRec_DIR}"/dist + rm -rf "${MxRec_DIR}"/mx_rec.egg-info + rm -rf "${MxRec_DIR}"/src/build + rm -rf "${MxRec_DIR}"/mx_rec/libasc + rm -rf "${MxRec_DIR}"/build/lib + rm -rf "${MxRec_DIR}"/build/bdist.linux-${ARCH} } +gen_tar_file -if [ "$(uname -m)" = "x86_64" ] -then - echo "-----Build gen tar -----" - bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh - bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh - gen_tar_file - echo "-----Build gen tar finished-----" - - # clean - echo "-----Done-----" -fi - -if [ "$(uname -m)" = "aarch64" ] -then - echo "-----Build gen tar -----" - bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh - bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh - gen_tar_file - echo "-----Build gen tar finished-----" +clean - # clean - echo "-----Done-----" -fi \ No newline at end of file diff --git a/build/gen_tf1_tar_pkg.sh b/build/gen_tf1_tar_pkg.sh new file mode 100644 index 00000000..0464597b --- /dev/null +++ b/build/gen_tf1_tar_pkg.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e +warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } +ARCH="$(uname -m)" +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +MxRec_DIR=$(dirname "${SCRIPT_DIR}") +pkg_dir=mindxsdk-mxrec + +function move_whl_pkg() { + mkdir -p "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl + mv ${MxRec_DIR}/dist/mx_rec*.whl "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl + cd "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl + whl_file=$(ls .) + mv "$whl_file" "${whl_file/any/linux_${ARCH}}" + cd - +} + +move_whl_pkg \ No newline at end of file diff --git a/build/gen_tf2_tar_pkg.sh b/build/gen_tf2_tar_pkg.sh new file mode 100644 index 00000000..e9d71f48 --- /dev/null +++ b/build/gen_tf2_tar_pkg.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +set -e +warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } +ARCH="$(uname -m)" +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +MxRec_DIR=$(dirname "${SCRIPT_DIR}") +pkg_dir=mindxsdk-mxrec + +function move_whl_pkg() { + mkdir -p "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl + mv ${MxRec_DIR}/dist/mx_rec*.whl "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl + cd "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl + whl_file=$(ls .) + mv "$whl_file" "${whl_file/any/linux_${ARCH}}" + cd - +} + +move_whl_pkg \ No newline at end of file diff --git a/setup.py b/setup.py index efb4c994..ead4083f 100644 --- a/setup.py +++ b/setup.py @@ -16,64 +16,25 @@ # ============================================================================== import os -import stat -from setuptools import setup, find_packages -import pkg_resources -from setuptools.extern.packaging import version as packaging_version - - -# Patch Version class to preserve original version string -class NoNormalizeVersion(packaging_version.Version): - def __init__(self, version): - self._orig_version = version - super().__init__(version) - - def __str__(self): - return self._orig_version - - -packaging_version.Version = NoNormalizeVersion -# Patch safe_version() to prevent version normalization -pkg_resources.safe_version = lambda v: v - -try: - with open("README.md") as file: - LONG_DESCRIPTION = file.read() -except IOError: - LONG_DESCRIPTION = "" - -env_version = os.getenv("VERSION") -VERSION = env_version if env_version is not None else '5.0.rc3' - -INIT_FILE = "mx_rec/__init__.py" -with open(INIT_FILE, 'r') as file: - lines = file.readlines() - -for idx, line in enumerate(lines): - if "__version__ = " not in line: - continue - lines[idx] = f"__version__ = '{VERSION}'\n" - break - -FLAG = os.O_WRONLY | os.O_TRUNC -MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH -with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: - out.writelines(lines) - -setup( - name='mx_rec', - version=VERSION, - author='HUAWEI Inc', - description='MindX SDK Recommend', - long_description=LONG_DESCRIPTION, - # include mx_rec - packages=find_packages( - where='.', - include=["mx_rec*"] - ), - package_dir={}, - # other file - package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, - # dependency - python_requires='>=3.7.5' -) +import shutil +import subprocess + +# clean pkg_dir existed +pkg_dir = "./build/mindxsdk-mxrec" +if os.path.exists(pkg_dir): + shutil.rmtree(pkg_dir) + +# build tf1's wheel file +res = subprocess.run(["python3.7", "setup_tf1.py", "bdist_wheel"], shell=False) +if res.returncode: + raise RuntimeError(f"build tf1's wheel file failed!") + +# build tf2's wheel file +res = subprocess.run(["python3.7", "setup_tf2.py", "bdist_wheel"], shell=False) +if res.returncode: + raise RuntimeError(f"build tf2's wheel file failed!") + +# copy cust_op, examples files, etc. Then gen mxrec's tar pkg +res = subprocess.run(["bash", "./build/gen_mxrec_tar_pkg.sh"], shell=False) +if res.returncode: + raise RuntimeError(f"gen mxrec's tar pkg failed!") diff --git a/setup_tf1.py b/setup_tf1.py new file mode 100644 index 00000000..4ad4cf20 --- /dev/null +++ b/setup_tf1.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import stat +from setuptools import setup, find_packages +import pkg_resources +from setuptools.extern.packaging import version as packaging_version +import subprocess + + +# Patch Version class to preserve original version string +class NoNormalizeVersion(packaging_version.Version): + def __init__(self, version): + self._orig_version = version + super().__init__(version) + + def __str__(self): + return self._orig_version + + +packaging_version.Version = NoNormalizeVersion +# Patch safe_version() to prevent version normalization +pkg_resources.safe_version = lambda v: v + +try: + with open("README.md") as file: + LONG_DESCRIPTION = file.read() +except IOError: + LONG_DESCRIPTION = "" + +env_version = os.getenv("VERSION") +VERSION = env_version if env_version is not None else '6.0.RC2' + +INIT_FILE = "mx_rec/__init__.py" +with open(INIT_FILE, 'r') as file: + lines = file.readlines() + +for idx, line in enumerate(lines): + if "__version__ = " not in line: + continue + lines[idx] = f"__version__ = '{VERSION}'\n" + break + +FLAG = os.O_WRONLY | os.O_TRUNC +MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH +with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: + out.writelines(lines) + +# compile so files +res = subprocess.run(["bash", f"./build/build_tf1.sh"], shell=False) +if res.returncode: + raise RuntimeError("compile so files failed!") + +setup( + name='mx_rec', + version=VERSION, + author='HUAWEI Inc', + description='MindX SDK Recommend', + long_description=LONG_DESCRIPTION, + # include mx_rec + packages=find_packages( + where='.', + include=["mx_rec*"] + ), + # other file + package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, + # dependency + python_requires='>=3.7.5' +) + +res = subprocess.run(["bash", f"./build/gen_tf1_tar_pkg.sh"], shell=False) +if res.returncode: + raise RuntimeError(f"gen tf1 tar pkg failed!") diff --git a/setup_tf2.py b/setup_tf2.py new file mode 100644 index 00000000..3bb52ffd --- /dev/null +++ b/setup_tf2.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import stat +from setuptools import setup, find_packages +import pkg_resources +from setuptools.extern.packaging import version as packaging_version +import subprocess + + +# Patch Version class to preserve original version string +class NoNormalizeVersion(packaging_version.Version): + def __init__(self, version): + self._orig_version = version + super().__init__(version) + + def __str__(self): + return self._orig_version + + +packaging_version.Version = NoNormalizeVersion +# Patch safe_version() to prevent version normalization +pkg_resources.safe_version = lambda v: v + +try: + with open("README.md") as file: + LONG_DESCRIPTION = file.read() +except IOError: + LONG_DESCRIPTION = "" + +env_version = os.getenv("VERSION") +VERSION = env_version if env_version is not None else '6.0.RC2' + +INIT_FILE = "mx_rec/__init__.py" +with open(INIT_FILE, 'r') as file: + lines = file.readlines() + +for idx, line in enumerate(lines): + if "__version__ = " not in line: + continue + lines[idx] = f"__version__ = '{VERSION}'\n" + break + +FLAG = os.O_WRONLY | os.O_TRUNC +MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH +with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: + out.writelines(lines) + +# compile so files +res = subprocess.run(["bash", f"./build/build_tf2.sh"], shell=False) +if res.returncode: + raise RuntimeError("compile so files failed!") + +setup( + name='mx_rec', + version=VERSION, + author='HUAWEI Inc', + description='MindX SDK Recommend', + long_description=LONG_DESCRIPTION, + # include mx_rec + packages=find_packages( + where='.', + include=["mx_rec*"] + ), + # other file + package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, + # dependency + python_requires='>=3.7.5' +) + +res = subprocess.run(["bash", f"./build/gen_tf2_tar_pkg.sh"], shell=False) +if res.returncode: + raise RuntimeError(f"gen tf2 tar pkg failed!") -- Gitee From 3e47771661fe9eccf47d2d01fad6d4b4364cea3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 8 Apr 2024 19:26:38 +0800 Subject: [PATCH 019/302] =?UTF-8?q?mxrec=E6=9E=84=E5=BB=BA=E4=BC=98?= =?UTF-8?q?=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdis?= =?UTF-8?q?t=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...ld_tf1_with_opensource.sh => build_tf1.sh} | 44 +------- ...ld_tf2_with_opensource.sh => build_tf2.sh} | 44 +------- build/{build.sh => gen_mxrec_tar_pkg.sh} | 102 ++++++------------ build/gen_tf1_tar_pkg.sh | 33 ++++++ build/gen_tf2_tar_pkg.sh | 33 ++++++ setup.py | 83 ++++---------- setup_tf1.py | 88 +++++++++++++++ setup_tf2.py | 88 +++++++++++++++ 8 files changed, 300 insertions(+), 215 deletions(-) rename build/{build_tf1_with_opensource.sh => build_tf1.sh} (75%) rename build/{build_tf2_with_opensource.sh => build_tf2.sh} (75%) rename build/{build.sh => gen_mxrec_tar_pkg.sh} (44%) create mode 100644 build/gen_tf1_tar_pkg.sh create mode 100644 build/gen_tf2_tar_pkg.sh create mode 100644 setup_tf1.py create mode 100644 setup_tf2.py diff --git a/build/build_tf1_with_opensource.sh b/build/build_tf1.sh similarity index 75% rename from build/build_tf1_with_opensource.sh rename to build/build_tf1.sh index ff59571c..f59c13fa 100644 --- a/build/build_tf1_with_opensource.sh +++ b/build/build_tf1.sh @@ -15,7 +15,7 @@ # ============================================================================== 
################################################################## -# build_tf1_with_opensource.sh 编译MxRec和动态扩容算子 +# build_tf1.sh 编译MxRec和动态扩容算子 # 编译环境:Python3.7.5 GCC 7.3.0 CMake 3.20.6 # 代码主要分为四部分: # 1、准备编译MxRec所需依赖:pybind11(v2.10.3) securec @@ -64,33 +64,6 @@ source /opt/buildtools/tf1_env/bin/activate tf1_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow_core deactivate tf1_env -project_output_path="${MxRec_DIR}"/output/ -VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml - -function get_version() { - if [ -f "$VERSION_FILE" ]; then - VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") - if [[ "$VERSION" == *.[b/B]* ]] && [[ "$VERSION" != *.[RC/rc]* ]]; then - VERSION=${VERSION%.*} - fi - else - VERSION="5.0.0" - fi -} - -rm -rf "${project_output_path}" -rm -rf "${SCRIPT_DIR}/lib" - -# 获取MxRec版本信息 -get_version -export VERSION -echo "MindX SDK MxRec: ${VERSION}" >> ./version.info - -pkg_dir=mindxsdk-mxrec -rm -rf "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - # 配置MxRec C++代码路径和AccCTR路径 src_path="${MxRec_DIR}"/src acc_ctr_path="${MxRec_DIR}"/src/AccCTR @@ -142,11 +115,6 @@ function gen_wheel_file() touch "${src_path}"/libasc/__init__.py rm -rf "${MxRec_DIR}"/mx_rec/libasc mv "${src_path}"/libasc "${MxRec_DIR}"/mx_rec - python3.7 setup.py bdist_wheel --plat-name=linux_$(arch) - mkdir -p "$1" - echo "moving whl file $1" - mv dist/mx_rec*.whl "$1" - rm -rf "${MxRec_DIR}"/mx_rec/libasc } # start to build MxRec @@ -158,13 +126,5 @@ echo "---------------- compile MxRec so files ----------------" compile_so_file "${tf1_path}" echo "---------------- collect so files and mv them to libasc ----------------" collect_so_file -echo "---------------- generate MxRec wheel package ----------------" -gen_wheel_file "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl +gen_wheel_file echo "---------------- compile MxRec success!!!! ----------------" - -# start to compile cust op -echo "---------------- start to compile cust op ----------------" -cd "${MxRec_DIR}"/cust_op/cust_op_by_addr -chmod u+x run.sh -./run.sh -echo "---------------- compile cust op success!!!! 
----------------" \ No newline at end of file diff --git a/build/build_tf2_with_opensource.sh b/build/build_tf2.sh similarity index 75% rename from build/build_tf2_with_opensource.sh rename to build/build_tf2.sh index 08aaf164..70acb99f 100644 --- a/build/build_tf2_with_opensource.sh +++ b/build/build_tf2.sh @@ -15,7 +15,7 @@ # ============================================================================== ################################################################## -# build_tf2_with_opensource.sh 编译MxRec和动态扩容算子 +# build_tf2.sh 编译MxRec和动态扩容算子 # 编译环境:Python3.7.5 GCC 7.3.0 CMake 3.20.6 # 代码主要分为四部分: # 1、准备编译MxRec所需依赖:pybind11(v2.10.3) securec @@ -64,33 +64,6 @@ source /opt/buildtools/tf2_env/bin/activate tf2_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow deactivate tf2_env -project_output_path="${MxRec_DIR}"/output/ -VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml - -function get_version() { - if [ -f "$VERSION_FILE" ]; then - VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") - if [[ "$VERSION" == *.[b/B]* ]] && [[ "$VERSION" != *.[RC/rc]* ]]; then - VERSION=${VERSION%.*} - fi - else - VERSION="5.0.0" - fi -} - -rm -rf "${project_output_path}" -rm -rf "${SCRIPT_DIR}/lib" - -# 获取MxRec版本信息 -get_version -export VERSION -echo "MindX SDK MxRec: ${VERSION}" >> ./version.info - -pkg_dir=mindxsdk-mxrec -rm -rf "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - # 配置MxRec C++代码路径和AccCTR路径 src_path="${MxRec_DIR}"/src acc_ctr_path="${MxRec_DIR}"/src/AccCTR @@ -142,11 +115,6 @@ function gen_wheel_file() touch "${src_path}"/libasc/__init__.py rm -rf "${MxRec_DIR}"/mx_rec/libasc mv "${src_path}"/libasc "${MxRec_DIR}"/mx_rec - python3.7 setup.py bdist_wheel --plat-name=linux_$(arch) - mkdir -p "$1" - echo "moving whl file $1" - mv dist/mx_rec*.whl "$1" - rm -rf "${MxRec_DIR}"/mx_rec/libasc } # start to build MxRec @@ -158,13 +126,5 @@ echo "---------------- compile MxRec so files ----------------" compile_so_file "${tf2_path}" echo "---------------- collect so files and mv them to libasc ----------------" collect_so_file -echo "---------------- generate MxRec wheel package ----------------" -gen_wheel_file "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl +gen_wheel_file echo "---------------- compile MxRec success!!!! ----------------" - -# start to compile cust op -echo "---------------- start to compile cust op ----------------" -cd "${MxRec_DIR}"/cust_op/cust_op_by_addr -chmod u+x run.sh -./run.sh -echo "---------------- compile cust op success!!!! 
----------------" \ No newline at end of file diff --git a/build/build.sh b/build/gen_mxrec_tar_pkg.sh similarity index 44% rename from build/build.sh rename to build/gen_mxrec_tar_pkg.sh index 0eb688fd..72ccfe49 100644 --- a/build/build.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -18,11 +18,9 @@ set -e warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } ARCH="$(uname -m)" SCRIPT_DIR=$(dirname "$(readlink -f "$0")") -ROOT_DIR=$(dirname "${SCRIPT_DIR}") -cd "$SCRIPT_DIR" +MxRec_DIR=$(dirname "${SCRIPT_DIR}") - -VERSION_FILE="${ROOT_DIR}"/../mindxsdk/build/conf/config.yaml +VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml get_version() { if [ -f "$VERSION_FILE" ]; then VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") @@ -30,96 +28,60 @@ get_version() { VERSION=${VERSION%.*} fi else - VERSION="5.0.0" - fi -} - -remove() -{ - if [ -d "$1" ]; then - rm -rf "$1" - elif [ -f "$1" ]; then - rm -f "$1" + VERSION="6.0.RC2" fi } -project_output_path="${ROOT_DIR}"/output/ -remove "${project_output_path}" -remove "${SCRIPT_DIR}/lib" get_version -export VERSION echo "MindX SDK mxrec: ${VERSION}" >> ./version.info pkg_dir=mindxsdk-mxrec -remove "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - -src_path="${ROOT_DIR}"/src -cd "${ROOT_DIR}" - release_tar=Ascend-"${pkg_dir}"_"${VERSION}"_linux-"${ARCH}".tar.gz +mv version.info "${SCRIPT_DIR}"/"${pkg_dir}" -gen_tar_file() +function gen_tar_file() { - cd "${src_path}" - cp -r "${src_path}"/../cust_op ../build/"${pkg_dir}" - cp -r "${src_path}"/../examples ../build/"${pkg_dir}" + cd "${MxRec_DIR}" + cp -r ./cust_op ./build/"${pkg_dir}" + cp -r ./examples ./build/"${pkg_dir}" # change dirs and files 's permission - chmod 550 ../build/"${pkg_dir}"/tf1_whl - chmod 550 ../build/"${pkg_dir}"/tf1_whl/mx_rec*.whl - chmod 550 ../build/"${pkg_dir}"/tf2_whl - chmod 550 ../build/"${pkg_dir}"/tf2_whl/mx_rec*.whl - chmod 550 ../build/"${pkg_dir}"/cust_op/ - chmod 550 ../build/"${pkg_dir}"/cust_op/cust_op_by_addr - cd ../build/"${pkg_dir}"/cust_op/cust_op_by_addr + chmod 550 ./build/"${pkg_dir}"/tf1_whl + chmod 550 ./build/"${pkg_dir}"/tf1_whl/mx_rec*.whl + chmod 550 ./build/"${pkg_dir}"/tf2_whl + chmod 550 ./build/"${pkg_dir}"/tf2_whl/mx_rec*.whl + chmod 550 ./build/"${pkg_dir}"/cust_op/ + chmod 550 ./build/"${pkg_dir}"/cust_op/cust_op_by_addr + cd ./build/"${pkg_dir}"/cust_op/cust_op_by_addr chmod 550 *.sh chmod 640 *.json chmod 550 op_host op_kernel op_host/* op_kernel/* cd - - cd ../build + cd ./build tar -zvcf "${release_tar}" "${pkg_dir}" || { warn "compression failed, packages might be broken" } - mv "${release_tar}" "${SCRIPT_DIR}"/../output/ + mv "${release_tar}" ../output/ } -clean() +function clean() { - remove "${ROOT_DIR}"/dist - remove "${ROOT_DIR}"/install - remove "${ROOT_DIR}"/mx_rec.egg-info - remove "${ROOT_DIR}"/src/build - remove "${ROOT_DIR}"/build/bdist.linux-"$(arch)" - remove "${ROOT_DIR}"/build/tf2_env - remove "${ROOT_DIR}"/build/tf1_env - remove "${ROOT_DIR}"/build/lib - remove "${ROOT_DIR}"/build/mindxsdk-mxrec + rm -rf "${MxRec_DIR}"/dist + rm -rf "${MxRec_DIR}"/mx_rec.egg-info + rm -rf "${MxRec_DIR}"/src/build + rm -rf "${MxRec_DIR}"/mx_rec/libasc + rm -rf "${MxRec_DIR}"/build/lib + rm -rf "${MxRec_DIR}"/build/bdist.linux-${ARCH} } +gen_tar_file -if [ "$(uname -m)" = "x86_64" ] -then - echo "-----Build gen tar -----" - bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh - bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh - gen_tar_file - echo "-----Build gen tar finished-----" - - # clean - 
echo "-----Done-----" -fi - -if [ "$(uname -m)" = "aarch64" ] -then - echo "-----Build gen tar -----" - bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh - bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh - gen_tar_file - echo "-----Build gen tar finished-----" +clean - # clean - echo "-----Done-----" -fi \ No newline at end of file +# compile cust op +echo "---------------- start to compile cust op ----------------" +cd "${MxRec_DIR}"/cust_op/cust_op_by_addr +chmod u+x run.sh +./run.sh +echo "---------------- compile cust op success!!!! ----------------" \ No newline at end of file diff --git a/build/gen_tf1_tar_pkg.sh b/build/gen_tf1_tar_pkg.sh new file mode 100644 index 00000000..0464597b --- /dev/null +++ b/build/gen_tf1_tar_pkg.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e +warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } +ARCH="$(uname -m)" +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +MxRec_DIR=$(dirname "${SCRIPT_DIR}") +pkg_dir=mindxsdk-mxrec + +function move_whl_pkg() { + mkdir -p "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl + mv ${MxRec_DIR}/dist/mx_rec*.whl "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl + cd "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl + whl_file=$(ls .) + mv "$whl_file" "${whl_file/any/linux_${ARCH}}" + cd - +} + +move_whl_pkg \ No newline at end of file diff --git a/build/gen_tf2_tar_pkg.sh b/build/gen_tf2_tar_pkg.sh new file mode 100644 index 00000000..e9d71f48 --- /dev/null +++ b/build/gen_tf2_tar_pkg.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e +warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } +ARCH="$(uname -m)" +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +MxRec_DIR=$(dirname "${SCRIPT_DIR}") +pkg_dir=mindxsdk-mxrec + +function move_whl_pkg() { + mkdir -p "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl + mv ${MxRec_DIR}/dist/mx_rec*.whl "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl + cd "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl + whl_file=$(ls .) 
+ mv "$whl_file" "${whl_file/any/linux_${ARCH}}" + cd - +} + +move_whl_pkg \ No newline at end of file diff --git a/setup.py b/setup.py index efb4c994..ead4083f 100644 --- a/setup.py +++ b/setup.py @@ -16,64 +16,25 @@ # ============================================================================== import os -import stat -from setuptools import setup, find_packages -import pkg_resources -from setuptools.extern.packaging import version as packaging_version - - -# Patch Version class to preserve original version string -class NoNormalizeVersion(packaging_version.Version): - def __init__(self, version): - self._orig_version = version - super().__init__(version) - - def __str__(self): - return self._orig_version - - -packaging_version.Version = NoNormalizeVersion -# Patch safe_version() to prevent version normalization -pkg_resources.safe_version = lambda v: v - -try: - with open("README.md") as file: - LONG_DESCRIPTION = file.read() -except IOError: - LONG_DESCRIPTION = "" - -env_version = os.getenv("VERSION") -VERSION = env_version if env_version is not None else '5.0.rc3' - -INIT_FILE = "mx_rec/__init__.py" -with open(INIT_FILE, 'r') as file: - lines = file.readlines() - -for idx, line in enumerate(lines): - if "__version__ = " not in line: - continue - lines[idx] = f"__version__ = '{VERSION}'\n" - break - -FLAG = os.O_WRONLY | os.O_TRUNC -MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH -with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: - out.writelines(lines) - -setup( - name='mx_rec', - version=VERSION, - author='HUAWEI Inc', - description='MindX SDK Recommend', - long_description=LONG_DESCRIPTION, - # include mx_rec - packages=find_packages( - where='.', - include=["mx_rec*"] - ), - package_dir={}, - # other file - package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, - # dependency - python_requires='>=3.7.5' -) +import shutil +import subprocess + +# clean pkg_dir existed +pkg_dir = "./build/mindxsdk-mxrec" +if os.path.exists(pkg_dir): + shutil.rmtree(pkg_dir) + +# build tf1's wheel file +res = subprocess.run(["python3.7", "setup_tf1.py", "bdist_wheel"], shell=False) +if res.returncode: + raise RuntimeError(f"build tf1's wheel file failed!") + +# build tf2's wheel file +res = subprocess.run(["python3.7", "setup_tf2.py", "bdist_wheel"], shell=False) +if res.returncode: + raise RuntimeError(f"build tf2's wheel file failed!") + +# copy cust_op, examples files, etc. Then gen mxrec's tar pkg +res = subprocess.run(["bash", "./build/gen_mxrec_tar_pkg.sh"], shell=False) +if res.returncode: + raise RuntimeError(f"gen mxrec's tar pkg failed!") diff --git a/setup_tf1.py b/setup_tf1.py new file mode 100644 index 00000000..4ad4cf20 --- /dev/null +++ b/setup_tf1.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import stat +from setuptools import setup, find_packages +import pkg_resources +from setuptools.extern.packaging import version as packaging_version +import subprocess + + +# Patch Version class to preserve original version string +class NoNormalizeVersion(packaging_version.Version): + def __init__(self, version): + self._orig_version = version + super().__init__(version) + + def __str__(self): + return self._orig_version + + +packaging_version.Version = NoNormalizeVersion +# Patch safe_version() to prevent version normalization +pkg_resources.safe_version = lambda v: v + +try: + with open("README.md") as file: + LONG_DESCRIPTION = file.read() +except IOError: + LONG_DESCRIPTION = "" + +env_version = os.getenv("VERSION") +VERSION = env_version if env_version is not None else '6.0.RC2' + +INIT_FILE = "mx_rec/__init__.py" +with open(INIT_FILE, 'r') as file: + lines = file.readlines() + +for idx, line in enumerate(lines): + if "__version__ = " not in line: + continue + lines[idx] = f"__version__ = '{VERSION}'\n" + break + +FLAG = os.O_WRONLY | os.O_TRUNC +MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH +with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: + out.writelines(lines) + +# compile so files +res = subprocess.run(["bash", f"./build/build_tf1.sh"], shell=False) +if res.returncode: + raise RuntimeError("compile so files failed!") + +setup( + name='mx_rec', + version=VERSION, + author='HUAWEI Inc', + description='MindX SDK Recommend', + long_description=LONG_DESCRIPTION, + # include mx_rec + packages=find_packages( + where='.', + include=["mx_rec*"] + ), + # other file + package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, + # dependency + python_requires='>=3.7.5' +) + +res = subprocess.run(["bash", f"./build/gen_tf1_tar_pkg.sh"], shell=False) +if res.returncode: + raise RuntimeError(f"gen tf1 tar pkg failed!") diff --git a/setup_tf2.py b/setup_tf2.py new file mode 100644 index 00000000..3bb52ffd --- /dev/null +++ b/setup_tf2.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import stat +from setuptools import setup, find_packages +import pkg_resources +from setuptools.extern.packaging import version as packaging_version +import subprocess + + +# Patch Version class to preserve original version string +class NoNormalizeVersion(packaging_version.Version): + def __init__(self, version): + self._orig_version = version + super().__init__(version) + + def __str__(self): + return self._orig_version + + +packaging_version.Version = NoNormalizeVersion +# Patch safe_version() to prevent version normalization +pkg_resources.safe_version = lambda v: v + +try: + with open("README.md") as file: + LONG_DESCRIPTION = file.read() +except IOError: + LONG_DESCRIPTION = "" + +env_version = os.getenv("VERSION") +VERSION = env_version if env_version is not None else '6.0.RC2' + +INIT_FILE = "mx_rec/__init__.py" +with open(INIT_FILE, 'r') as file: + lines = file.readlines() + +for idx, line in enumerate(lines): + if "__version__ = " not in line: + continue + lines[idx] = f"__version__ = '{VERSION}'\n" + break + +FLAG = os.O_WRONLY | os.O_TRUNC +MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH +with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: + out.writelines(lines) + +# compile so files +res = subprocess.run(["bash", f"./build/build_tf2.sh"], shell=False) +if res.returncode: + raise RuntimeError("compile so files failed!") + +setup( + name='mx_rec', + version=VERSION, + author='HUAWEI Inc', + description='MindX SDK Recommend', + long_description=LONG_DESCRIPTION, + # include mx_rec + packages=find_packages( + where='.', + include=["mx_rec*"] + ), + # other file + package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, + # dependency + python_requires='>=3.7.5' +) + +res = subprocess.run(["bash", f"./build/gen_tf2_tar_pkg.sh"], shell=False) +if res.returncode: + raise RuntimeError(f"gen tf2 tar pkg failed!") -- Gitee From 67ca37a888d4d5f059b4d8bfeaa51d10b332d060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 8 Apr 2024 22:21:55 +0800 Subject: [PATCH 020/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/optimizers/base.py | 4 ---- mx_rec/optimizers/lazy_adam_by_addr.py | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index 91c72d52..c5c0e601 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -113,10 +113,6 @@ class CustomizedOptimizer: unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad, restore_vector_second, array_ops.shape(unique_keys)[0]) - if not is_expansion: - unique_local_grad = ops.IndexedSlices(values=unique_local_grad, - indices=unique_keys, - dense_shape=tf.shape(var)) return unique_local_grad, unique_keys diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index 0f7d7139..22b8af33 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -122,9 +122,9 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): unique_local_grad, unique_addr = self.sum_same_id_gradients(grad=grad, var=addr, is_expansion=True) power_b1, power_b2 = 
self._get_beta_accumulators() - power_b1 = math_ops.cast(power_b1, unique_local_grad.dtype.base_dtype) - power_b2 = math_ops.cast(power_b2, unique_local_grad.dtype.base_dtype) - temp = self._cast_to_base_type(unique_local_grad) + power_b1 = math_ops.cast(power_b1, grad.dtype.base_dtype) + power_b2 = math_ops.cast(power_b2, grad.dtype.base_dtype) + temp = self._cast_to_base_type(grad) temp_lr = temp.get("temp_lr") temp_b1 = temp.get("temp_b1") temp_b2 = temp.get("temp_b2") -- Gitee From 8f6ff1ba4fa6eb332ad1a2bbf60fb0f1a735176f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 8 Apr 2024 22:23:50 +0800 Subject: [PATCH 021/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/key_process/key_process.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 8ab030a8..b72f3c8e 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -470,6 +470,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) { + LOG_INFO(KEY_PROCESS "rank:{}, channel:{}, useSumSameIdGradients:{} ...", + rankInfo.rankId, channel, rankInfo.useSumSameIdGradients); if (rankInfo.useSumSameIdGradients && channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; -- Gitee From 9c253c4d1a36b017190da6cec3be27b3d99786f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 9 Apr 2024 10:54:44 +0800 Subject: [PATCH 022/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-=E5=85=A8=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/optimizers/base.py | 12 ++++++------ mx_rec/optimizers/ftrl.py | 10 ++++++---- mx_rec/optimizers/lazy_adam.py | 15 +++++++-------- mx_rec/optimizers/lazy_adam_by_addr.py | 17 ++++++++--------- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index c5c0e601..395e60eb 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -58,7 +58,7 @@ class CustomizedOptimizer: self.unique_name = name + "_" + str(count) self.base_name = name - def get_restore_vector_second(self, table_name: str) -> tf.Tensor: + def _get_restore_vector_second(self, table_name: str) -> tf.Tensor: """ Get restore vector which is calculated after the second all2all :param table_name: embedding table_name @@ -74,7 +74,7 @@ class CustomizedOptimizer: channel_name=f'{table_name}_restore_second_{channel_id}')[0] return restore_vector_second - def get_unique_keys(self, table_name: str, is_expansion: bool) -> tf.Tensor: + def _get_unique_keys(self, table_name: str, is_expansion: bool) -> tf.Tensor: """ Get the global unique keys which is calculated after the second all2all :param table_name: embedding table_name @@ -99,16 +99,16 @@ class CustomizedOptimizer: def sum_same_id_gradients(self, grad, var, is_expansion): if isinstance(var, ops.Tensor): - # 扩容模式从scope获取表名 - table_name = var.op.name.split('/')[0] + # 扩容模式从scope获取表名,偏移是-2 + 
table_name = var.op.name.split('/')[-2] else: table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) table_name = table_instance.table_name with tf.compat.v1.variable_scope("restore_vector_second"): - restore_vector_second = self.get_restore_vector_second(table_name) + restore_vector_second = self._get_restore_vector_second(table_name) with tf.compat.v1.variable_scope("unique_keys"): - unique_keys = self.get_unique_keys(table_name, is_expansion) + unique_keys = self._get_unique_keys(table_name, is_expansion) unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad, restore_vector_second, diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py index d6ddb093..3659ffcd 100644 --- a/mx_rec/optimizers/ftrl.py +++ b/mx_rec/optimizers/ftrl.py @@ -140,17 +140,19 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): self._resource_scatter_nd_update) def _apply_sparse(self, grad, var): + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) + if self._l2_shrinkage_regularization_strength <= 0.0: return self._apply_sparse_shared( - grad.values, + unique_local_grad, var, - grad.indices, + unique_keys, lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v)) else: return self._apply_sparse_shared_v2( - grad.values, + unique_local_grad, var, - grad.indices, + unique_keys, lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v)) def _apply_sparse_shared(self, grad, var, indices, scatter_nd_update): diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 70549702..bab8245f 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -149,15 +149,14 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): self._resource_scatter_nd_add) def _apply_sparse(self, grad, var): + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) return self._apply_sparse_shared( - grad.values, + unique_local_grad, var, - grad.indices, + unique_keys, lambda x, i, v: tf.compat.v1.scatter_nd_add(x, i, v)) def _apply_sparse_shared(self, grad, var, indices, scatter_nd_add): - unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad, var=var, is_expansion=False) - power_b1, power_b2 = self._get_beta_accumulators() power_b1 = math_ops.cast(power_b1, var.dtype.base_dtype) power_b2 = math_ops.cast(power_b2, var.dtype.base_dtype) @@ -168,17 +167,17 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): temp_epsilon = temp.get("temp_epsilon") learning_rate = tf.divide(temp_lr * math_ops.sqrt(1 - power_b2), (1 - power_b1)) - abs_indices = tf.math.maximum(unique_keys, 0) - nd_indices = tf.expand_dims(unique_keys, 1) + abs_indices = tf.math.maximum(indices, 0) + nd_indices = tf.expand_dims(indices, 1) momentum = self.get_slot(var, "m") old_m_slice = tf.gather(momentum, abs_indices) - m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * unique_local_grad + m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * grad m_update_op = scatter_nd_add(momentum, nd_indices, m_t_slice - old_m_slice) velocity = self.get_slot(var, "v") old_v_slice = tf.gather(velocity, abs_indices) - v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(unique_local_grad) + v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad) v_update_op = scatter_nd_add(velocity, nd_indices, v_t_slice - old_v_slice) denominator_slice = math_ops.sqrt(v_t_slice) + temp_epsilon diff --git 
a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index 22b8af33..cd4ee878 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -114,13 +114,12 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): return temp def _apply_sparse(self, grad, addr): + unique_local_grad, unique_addr = self.sum_same_id_gradients(grad=grad, var=addr, is_expansion=True) return self._apply_sparse_shared( - grad, - addr) + unique_local_grad, + unique_addr) def _apply_sparse_shared(self, grad, addr): - unique_local_grad, unique_addr = self.sum_same_id_gradients(grad=grad, var=addr, is_expansion=True) - power_b1, power_b2 = self._get_beta_accumulators() power_b1 = math_ops.cast(power_b1, grad.dtype.base_dtype) power_b2 = math_ops.cast(power_b2, grad.dtype.base_dtype) @@ -132,23 +131,23 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): learning_rate = tf.divide(temp_lr * math_ops.sqrt(1 - power_b2), (1 - power_b1)) host_pipeline_ops = import_host_pipeline_ops() - dim = unique_local_grad.shape.as_list()[-1] + dim = grad.shape.as_list()[-1] combined_tensor = \ - host_pipeline_ops.embedding_lookup_by_address(unique_addr, embedding_dim=3 * dim, embedding_type=1) + host_pipeline_ops.embedding_lookup_by_address(addr, embedding_dim=3 * dim, embedding_type=1) split_length = [dim] + [dim] + [dim] split_tensors = tf.split(combined_tensor, split_length, axis=1) old_m_slice = split_tensors[1] - m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * unique_local_grad + m_t_slice = temp_b1 * old_m_slice + (1 - temp_b1) * grad old_v_slice = split_tensors[2] - v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(unique_local_grad) + v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad) denominator_slice = math_ops.sqrt(v_t_slice) + temp_epsilon update_list = [tf.divide(-learning_rate * m_t_slice, denominator_slice)] + [m_t_slice - old_m_slice] + \ [v_t_slice - old_v_slice] update_tensor = tf.concat(update_list, axis=1) - var_update_op = host_pipeline_ops.embedding_update_by_address(unique_addr, update_tensor, update_type=0) + var_update_op = host_pipeline_ops.embedding_update_by_address(addr, update_tensor, update_type=0) return var_update_op -- Gitee From fcc359ff92c87f95a5e746745662e012a71c9b6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 9 Apr 2024 11:45:15 +0800 Subject: [PATCH 023/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-=E6=A8=A1=E5=9E=8B=E9=80=82?= =?UTF-8?q?=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 4 ++-- examples/demo/little_demo_estimator/nn_optim.py | 4 ++-- examples/dlrm/model/gradient_descent_w.py | 10 ++++++++++ examples/dlrm/model/main_mxrec.py | 4 ++-- mx_rec/optimizers/adagrad.py | 5 +++-- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index 540445e8..a47590c2 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -336,9 +336,9 @@ if __name__ == "__main__": train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if use_dynamic_expansion: - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + from mx_rec.constants.constants import 
ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) # do sparse optimization by addr sparse_grads = sparse_optimizer.compute_gradients(loss, train_emb_list) # local_embedding diff --git a/examples/demo/little_demo_estimator/nn_optim.py b/examples/demo/little_demo_estimator/nn_optim.py index 4438627d..3be3c7ed 100644 --- a/examples/demo/little_demo_estimator/nn_optim.py +++ b/examples/demo/little_demo_estimator/nn_optim.py @@ -73,11 +73,11 @@ def get_train_op_list(losses, learning_rate): # do sparse optimization if use_dynamic_expansion: - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) local_grads = tf.gradients(loss, train_emb_list) # local_embedding grads_and_vars = [(grad, address) for grad, address in zip(local_grads, train_address_list)] diff --git a/examples/dlrm/model/gradient_descent_w.py b/examples/dlrm/model/gradient_descent_w.py index f3ae78d7..6c34b726 100644 --- a/examples/dlrm/model/gradient_descent_w.py +++ b/examples/dlrm/model/gradient_descent_w.py @@ -47,6 +47,16 @@ class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOp super(CustomizedGradientDescentWithWeighDecay, self).__init__( learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name ) + self._slot_num = 0 + self._derivative = 1 + + @property + def slot_num(self): + return self._slot_num + + @property + def derivative(self): + return self._derivative def initialize_slots(self, var, table_instance): logger.info("no slot for gradient descent") diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index dd3e8d2d..627b6c8f 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -340,9 +340,9 @@ if __name__ == "__main__": train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if use_dynamic_expansion: - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) # do sparse optimization by addr sparse_grads = sparse_optimizer.compute_gradients(loss, train_emb_list) # local_embedding diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py index a5fa7975..4ba444a6 100644 --- a/mx_rec/optimizers/adagrad.py +++ b/mx_rec/optimizers/adagrad.py @@ -131,10 +131,11 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): def _apply_sparse(self, grad, var): acc = self.get_slot(var, "acc") + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) return training_ops.sparse_apply_adagrad( var, acc, 
math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), - grad.values, - grad.indices, + unique_local_grad, + unique_keys, use_locking=self._use_locking) def _resource_apply_sparse(self, grad, var, indices): -- Gitee From dd259e7f25a7251e901c8e6a4b0e9e9a2f0b8d3f Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Wed, 10 Apr 2024 10:09:50 +0800 Subject: [PATCH 024/302] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=85=AC=E7=BD=91?= =?UTF-8?q?=E5=9C=B0=E5=9D=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...56\347\256\261\345\234\260\345\235\200.xlsx" | Bin 0 -> 19596 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 "docs/MindX\342\200\242SDK\342\200\2426.0.RC1\342\200\242mxRec\342\200\242\345\205\254\347\275\221\345\234\260\345\235\200\345\222\214\351\202\256\347\256\261\345\234\260\345\235\200.xlsx" diff --git "a/docs/MindX\342\200\242SDK\342\200\2426.0.RC1\342\200\242mxRec\342\200\242\345\205\254\347\275\221\345\234\260\345\235\200\345\222\214\351\202\256\347\256\261\345\234\260\345\235\200.xlsx" "b/docs/MindX\342\200\242SDK\342\200\2426.0.RC1\342\200\242mxRec\342\200\242\345\205\254\347\275\221\345\234\260\345\235\200\345\222\214\351\202\256\347\256\261\345\234\260\345\235\200.xlsx" new file mode 100644 index 0000000000000000000000000000000000000000..2fa2165b6f608106df678d1afd31562884a3029d GIT binary patch literal 19596 zcmZ^KV|XA@lXYy{p4fIKwkEc1+s?$cF|lpT#I|kQ$(PKF-FLs;{?WQsbx)mh`}Wh- z-Exw^AW#5bONh+&_pj~07Rb*XLmLA*dmCE^I=N3Xl+P0of6Z!Zz!ts(0szmkDktidgkDzLBnD*zOppIU8SbK)bsYrsSXi~=*D|F50S^)wq zF@y(TSRqJ*>Q&C6%vRVdFS`m$lvuL6(riWms2|PL_SMBEl@gL|ioF<+) z2Hh`&%nv}TY&HD^PC^%&^lk{hu}urEVa!Ny(!d7qbI!v>j!Q<%wDa+GcB}`9vO{}& z%nw@Em)*r;u=o+&2RQMMAVzrgw*0|)6;2Wh=>mvu?S>Wnxw1VvFHE@B2;D5GrJDS{ zCkt3OUIo53;*r5OxcFd#sPms}-~av%s`$ya$tT;0|DA0E8+)TKx+CNGW%}q5gdfCK z39or;;?Y2q<0O1r^N^5)37nI7q-l`IYPFVefcqT{+)cun_)GpLR4yy?0B~M8CS2#>!QxmuA7n&CJC%Hj(>3cka05qE;}; zeEqU@D=*H9I;F-cIC5CfGA&@nl9}^TM%Y8c?x;KbEoyq>5t%JkbR?l=oNHd~XN@aW z53(-$uNp41&Y^Iyy$&^|Z2kOhI9eN{#pID<%=bf!@Ix7aj*cRbZq`|R$2R)TLHhXH z+5Mq$(43t(2=Ko3V_vT;(>DJpLEbxACgrCDh@TQ5|Az!l4vscfUm_I7N=x>AM+m;~ z{~$c&S&Al=ZX}@?6XoYtt-hz!t1|;$TB~j=LA&Z|D^$MP6pm)PxxO5gYV|}7aM@kH zl`sYXhj7(ooV&Nr_A5SHmi0BUBwjI=i!<5mhgIEgVmlHki?vvB*x?9d#$;FJq9tHH z?G`owVoS1~rl7$(k?061J#qg-^_#S(FbE`g{u@=T(y671^UBPj z93oEdVlA^LwAW`;g{VhXpkgN&wy}UmHg;+~b*)y)|4u_O48TOWP7nMtt zQ~ggpPX^@!(BJjpCWgX4={3>G#kWgEDm6s)v=kpPcTl_s6kO*XiziPh9X}p7?=8uU zf0p1c<1X;MLWGA#i*IIOZMuc-Y*=nSsVD2~<9|{6zfU>9pVYcq{+(?8cLTV87`WKj zTR4~+89DyXiOII?^~@Or03habR>JuIZ2!72{JmmQB;(WB5qvK+8b18C@#q)-MAv49 zkKI%&l~kHw5nX9XxQ)tcxiX*Rvld)l9Pd zAh}ILd&qyk;CnBwsGO75)ZB?c+x_)H(D43rdw?I3q+E3u5F-K%E$lUmFRgGcT_=M_ z5^WKXRN$M1U2)y)w{UDdZzQ};r83MkqfL#&6g?w=j&?tHv6$1Ok%o{=I+1}AN&h33 z+PeGWIX0=G_TKxA#$$ofnoxOfqFY$y*73d^^_osLK79Um9s_brdJ8Mc-sD`wZXc47 zIli2`QagK(j9yc~md-4RmCv<#m~i^mdAX5gxt29fz@0%Xrh!4+!EGLPY^^!$ZpLq- zdtsOxx01pzR=Ptuc18X}|BQHLJ~!Z}r=++Bd=?Aq0bN38ZPcOrEcz(MkXUbR)$hsn zSm+_fYD1eG>vv@3EljDj%(1Py+KU^`Cv`Iw`=8lr4!KJYx69m7&oU~fHp{^z9eh~i zvxPOrBFJfeppR916lFSEBRekZjRGD=80ig_-?+WarsC2B29h`@C2%c_EcxWrCBB&^ zPG9HyDgOXpmFL9ntMxg{Bu>mxl4#uW%mHaRF!NQW@H)gTcc=uenD%4viDgScuLP+0 z@o2U##UR@bq>7jYhib*5nnSPW$w>(5gjr|RzkHEXiZuTO>hAY_UH6Ygj!p6J9?yVl z@q&IeF<)mgy~uC0s=auU&8qqwNtf0A=G0pTVRx!T-ALoAMEywVH9{mL<$BF4GW~%G z&Tc{aSwm#`qH9~Jj;CiNp>@aS@hvCiH;^s5kQR)@l?Y3h49XTBW<9ml3**+HJ_Z)+ z<%fUlPJi1YweaCQ8dV|zed{UhwdyAph3cgU1*yjAe1NkEI2(1XV1O#KnQC!M5;{#h}7N#_E@9t?}lG!$mJ~uyf*VqHV%$Vk812 zA|!GDwGg1xo9rDs04P|Z?!n)KeS-t!-Q@k`KTJsx8dJA9Yj^RuRKcNOfn$I0*w-Y4 
zhkw2ad)wC!$^s8u?j6PmD2=iM;GPFw94J^ii8yOxw8~C#p@vNQ`OhN)GizIIhDz6h zDN!ec410_aGLW390T$h*Bv{KZ^3&`dKKk=WPCeKGSUlSo336^Qn#*AebcI{2Bav?A z#Ya+U?(8iYxSICic4y=hBgm-evbk+1Bf;hT@BuuJ2`hVTER9Vr8~nZ)_yG{e?t(+i zCmN|1ip=xa;0r9QYdgT-hQP$dR2o?7NMd5Lvh>7$<2gHPCkXEEl7qEg)mZQ%4>fWK zgF({FbL^+!stZP&ElR`~yU^S;aVZTCE~o<8c1}mv<>i*hVr5?;r;1fyo}W)OF2~@Q z43zr1W5fOxKv{qv&e}#W2B~(BllQF42 z@iJRjy2qjX$0O*82FC=M;#a_LV1aqSz7h&1a76=tYh^BHib=aa>x7rp0_#@to&heD z#805Gw3}q*+71lb4+w>A?IuMYp#XfjOu0r(>c%31GaY%!iknNywbxoAuMF zy&9K>IUBfUY^WRYeAfPw;1-JJw44*dh6A#Aw(1bsSnC$Po z=T~s>|0sWb{h6$9V`K64X+xr-md!Ffir43ldogP5{>Z!%{DDwdc}nJGP^xV_gz>As zN&``r3crfp9<)NDi%ytp$PTnOJjPoFoWWGuG+pU(U{x4N`1{E4289YWD+R8t@D}L8 zOsL~2kR=4fw0mO8whm+iDyAnS3$u7YVn|6WEPnD9lBx-gO9`S9*pVr4nxF}GPi91Dne#U`GG~)IX`<9MVuohs7Fq-FCw;x2<$NvYE`<(1rRerS& z$LRv(3WUiy&K}xKOdX??ru`dd|4h1m`xcOk2Thmw;U)8Kz@KjEmq9&?^=sPOM$IDg z7t}{{tEpb=rk|?RHLsximY8u!ValzN&~g7m-2JZ1YSxpwz8%>rohK37ytEqT%e(xd?Cm5bgK+ zN2M4^8h5Lu(KD1uRqvZYo$JTy3125K|3bU9u&dz7K`nBr{}2pxPzj9+z*DWX;6K=g z&!VIn56O!MKWiy;>_mSY%enz}zMo&Tzhg}c**$1>;JDZ5Fz&zwx9h;YeSLQIoT_xm zYo0p-o4PAN{@~O;$`l;sT|LPlQ^!nP63Me~?V85d7PS>0vDS6YznX#t_o(k2`4e(V zk$Q#jPe zdz5S=8s)Ys_y@OxC&WN}@xaIGCENe~O(xjqP}8wDvizSvzk9Gdw|{)LZF~+%=zrVL zepyz;O-S_9BmBAWzd+dUly*TyTymr%I+Zr`@=cNYm?TAxQ#)X>*8rbBJf1+$(;D?%uh`UW5YxZ7ct_&$S#IpOs|%%?rpo^~vqne7OK$o1^!%I;wZG z6h-w?wlfzypEuTi5;AdaY0d=Ep5~gxH}(sBb!B6-JYQ65clv#~SNoc^ zX}Y|wF|9L#yY4)V*m{m zD(y_;J&Hc*t}e$NmeBRB*n(5x$2-a2z+k)rJe5ESNCOAAA3oPClIlK7XV2 zU-OJw-P@VYoD(-|w&vXa!rWq8ClGBk>*eA4`v<#sv^~i4F>9LX_EozaKFs@e)R*PPhI==T>6G_bV4`Iq;n&bozo*v1-!iXEIoRm-{XqahvRuqq|JI2A;EDxiTjnwmq&m0Z__`m zc3gaZec6j2U9HP*vFY3co@%0U$SmXpTiRQX4#)WR$|%vc_OQ6(MG~)7e;+q}8)$@*`|{FYAu3$PY54h>N;tfkD0f@%ihFGsxzJ(GnhW(%NzZ){ip?EekH3? zG4PlSSzrjs+EpTc@1PI|MqbWa=;eiQ7dD16HomV(PY= zrW^EV7+v~SLOJ;>US~rY>St!o5Wh7v-V_!{@YC~JzG|p zT<)p$iL?5yb5Nm2XsYRijiL(DyzAGqO`gi^v*K35u`Yo-%+EErFE9naK5)S&Z?9Nm zTivO-6O0X6@4YX3zhjG91Lxd;>5b_2IXp{` zhw|OnQW@u^(7SUPN6?Dsumx6qDhJzz>TKjwy>G8rrc|wR%{=$uxUSQ!+EtnNzpk6OYrn>w3-139HU7DJ{B8a32OkUN z>wt4Mj7%cn`5DORJ?OCnj3@?11_dB#?}jzbAO>JRkpom1K>+9=pqb8yq6yVf^uh_G z2V&`7xE^q3!e<#Mw?Y8OG7vyIpa|7I^dpKuiXrx?BWr^mScD!#=tu4#bWa5)Z@8tf z19;H2qaPQ8&@KTiOc2`PhsWUaG*AZ=fIM9cJlzCX65-Q;8cI)HkcuvJhoYBpAd4WO z#N&#j4Wq_1(yBW}(jNbf17@+3}wih|r@HrZEstp8yVt5K9hC!O)XWXE=9Q~9VkX}fv z9l+3wIP(VG`RM0QAWX>trXXOjA-Mo-3ivb%bW_mx>my8Q;=yVPc=vvQH!HZ{5C&Eg MlYwr2><-cg0N2vEpa1{> literal 0 HcmV?d00001 -- Gitee From 1c02c0909830740a7b1c75ebb8a6ca3f0f98fac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Wed, 10 Apr 2024 10:28:32 +0800 Subject: [PATCH 025/302] =?UTF-8?q?mxrec=E6=9E=84=E5=BB=BA=E4=BC=98?= =?UTF-8?q?=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdis?= =?UTF-8?q?t=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 +++++--- build/move_whl_file_2_pkg_dir.sh | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fccc0244..eb9bbb2a 100644 --- a/README.md +++ b/README.md @@ -66,9 +66,11 @@ bash run.sh 将pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,并且将其分别更名为pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。 为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法: -- build/build.sh:执行脚本完成tf1和tf2版本whl包的构建和打包。执行脚本前,请参考build/build_tf1_with_opensource.sh、build/build_tf2_with_opensource.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 -- 
build/build_tf1_with_opensource.sh:执行脚本完成tf1版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 -- build/build_tf2_with_opensource.sh:执行脚本完成tf2版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 + +进入mxrec代码目录: +- setup.py:执行脚本setup.py,比如:**python3.7 setup.py**完成tf1和tf2版本whl包的构建和打包,构建成功后,whl包在/build/mindxsdk-mxrec/目录下,其中tf1_whl和tf2_whl目录下存在对应的whl包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 +- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel**完成tf1版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,请参考build/build_tf1.sh创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 +- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel**完成tf2版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,请参考build/build_tf2.sh创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 如需使用动态扩容功能,进入“./cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。 ```shell diff --git a/build/move_whl_file_2_pkg_dir.sh b/build/move_whl_file_2_pkg_dir.sh index 824ac52a..d489c2fb 100644 --- a/build/move_whl_file_2_pkg_dir.sh +++ b/build/move_whl_file_2_pkg_dir.sh @@ -24,6 +24,7 @@ tf_version=$1 function move_whl_file_2_pkg_dir() { mkdir -p "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl + rm -rf "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl/* mv ${MxRec_DIR}/dist/mx_rec*.whl "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl cd "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl whl_file=$(ls .) -- Gitee From 63c1be723d0e076af36e121c4e69711de5cb78f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 10 Apr 2024 11:34:38 +0800 Subject: [PATCH 026/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-=E6=A8=A1=E5=9E=8B=E9=80=82?= =?UTF-8?q?=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/optimizers/base.py | 117 +++++++++++++------------- tests/mx_rec/core/mock_class.py | 1 + tests/mx_rec/core/test_build_graph.py | 95 +-------------------- tests/mx_rec/core/test_manager.py | 8 ++ 4 files changed, 72 insertions(+), 149 deletions(-) diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index 395e60eb..49594d40 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -31,6 +31,47 @@ from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger +def get_restore_vector_second(table_name: str) -> tf.Tensor: + """ + Get restore vector which is calculated after the second all2all + :param table_name: embedding table_name + :return: the restore vector calculated after the second all2all + """ + channel_id = 0 + logger.debug('Channel %s_restore_second_%s was built for getnext', + table_name, channel_id) + with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): + restore_vector_second = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32], + output_shapes=[[None]], + channel_name=f'{table_name}_restore_second_{channel_id}')[0] + return restore_vector_second + + +def get_unique_keys(table_name: str, is_expansion: bool) -> tf.Tensor: + """ + Get the global unique keys which is calculated after the second all2all + :param table_name: embedding table_name + :param is_expansion: use dynamic expansion + :return: the global unique keys calculated after the second 
all2all + """ + channel_id = 0 + logger.debug('Channel %s_uniquekeys_%s was built for getnext', table_name, channel_id) + with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): + if is_expansion: + unique_keys = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int64], + output_shapes=[[None]], + channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] + return unique_keys + + unique_keys = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32], + output_shapes=[[None]], + channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] + return unique_keys + + class CustomizedOptimizer: name_counter = defaultdict(int) @@ -39,6 +80,25 @@ class CustomizedOptimizer: self.unique_name = "" self.base_name = "" + @staticmethod + def sum_same_id_gradients(grad, var, is_expansion): + if isinstance(var, ops.Tensor): + # 扩容模式从scope获取表名,偏移是-2 + table_name = var.op.name.split('/')[-2] + else: + table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) + table_name = table_instance.table_name + with tf.compat.v1.variable_scope("restore_vector_second"): + restore_vector_second = get_restore_vector_second(table_name) + + with tf.compat.v1.variable_scope("unique_keys"): + unique_keys = get_unique_keys(table_name, is_expansion) + + unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad, + restore_vector_second, + array_ops.shape(unique_keys)[0]) + return unique_local_grad, unique_keys + def initialize_slots(self, var, table_instance): raise NotImplementedError(f"Please define a specific realization on {self.__class__.__name__}") @@ -58,63 +118,6 @@ class CustomizedOptimizer: self.unique_name = name + "_" + str(count) self.base_name = name - def _get_restore_vector_second(self, table_name: str) -> tf.Tensor: - """ - Get restore vector which is calculated after the second all2all - :param table_name: embedding table_name - :return: the restore vector calculated after the second all2all - """ - channel_id = 0 - logger.debug('Channel %s_restore_second_%s was built for getnext', - table_name, channel_id) - with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): - restore_vector_second = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[None]], - channel_name=f'{table_name}_restore_second_{channel_id}')[0] - return restore_vector_second - - def _get_unique_keys(self, table_name: str, is_expansion: bool) -> tf.Tensor: - """ - Get the global unique keys which is calculated after the second all2all - :param table_name: embedding table_name - :param is_expansion: use dynamic expansion - :return: the global unique keys calculated after the second all2all - """ - channel_id = 0 - logger.debug('Channel %s_uniquekeys_%s was built for getnext', table_name, channel_id) - with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): - if is_expansion: - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int64], - output_shapes=[[None]], - channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] - return unique_keys - - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[None]], - channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] - return unique_keys - - def sum_same_id_gradients(self, grad, var, is_expansion): - if isinstance(var, ops.Tensor): - # 扩容模式从scope获取表名,偏移是-2 - table_name = var.op.name.split('/')[-2] - else: - table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) - table_name = table_instance.table_name - 
with tf.compat.v1.variable_scope("restore_vector_second"): - restore_vector_second = self._get_restore_vector_second(table_name) - - with tf.compat.v1.variable_scope("unique_keys"): - unique_keys = self._get_unique_keys(table_name, is_expansion) - - unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad, - restore_vector_second, - array_ops.shape(unique_keys)[0]) - return unique_local_grad, unique_keys - def custom_update_op(self, opt, grad): if isinstance(grad, ops.Tensor): diff --git a/tests/mx_rec/core/mock_class.py b/tests/mx_rec/core/mock_class.py index 7566aa1a..04c9ae56 100644 --- a/tests/mx_rec/core/mock_class.py +++ b/tests/mx_rec/core/mock_class.py @@ -208,6 +208,7 @@ class MockOptimizer: def __init__(self): self.slot_num = 2 + self.derivative = 2 def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index 0b90b790..14913cf7 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -156,84 +156,6 @@ class TestGetIdOffsetsFunc(unittest.TestCase): self.assertEqual(swap_len, 0) -class TestGetRestoreVectorSecondFunc(unittest.TestCase): - """ - Test for 'mx_rec.core.asc.build_graph.get_restore_vector_second'. - """ - - def setUp(self): - # 默认动态扩容、hot emb、HBM - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") - - def tearDown(self): - # 恢复config - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - - @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") - def test_get_restore_vector_second(self, mock_get_next): - """ - case: test get_restore_vector_second - """ - - from mx_rec.core.asc.build_graph import get_restore_vector_second - - with tf.Graph().as_default(): - mock_get_next.return_value = [0] - restore_vector_second = get_restore_vector_second(self.max_lookup_vec_size, self.config) - self.assertEqual(restore_vector_second, 0) - - -class TestGetUniqueKeysFunc(unittest.TestCase): - """ - Test for 'mx_rec.core.asc.build_graph.get_unique_keys'. 
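An aside on PATCH 026's core change, shown above in mx_rec/optimizers/base.py: `sum_same_id_gradients` becomes a static helper on CustomizedOptimizer, the per-table getnext channels move into module-level `get_restore_vector_second`/`get_unique_keys`, and the reduction itself is a single `unsorted_segment_sum` keyed by the second-all2all restore vector. A self-contained illustration with hand-made tensors (the restore vector normally arrives over the `<table>_restore_second_0` channel, so the values here are invented):

```python
import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # no-op on TF1, needed under TF2

# Four gradient rows produced by four lookups that hit only three unique
# keys: lookups 0 and 2 both touched unique key 0.
grad = tf.constant([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])
restore_vector_second = tf.constant([0, 1, 0, 2])
num_unique_keys = 3  # in base.py this is array_ops.shape(unique_keys)[0]

unique_local_grad = tf.compat.v1.unsorted_segment_sum(
    grad, restore_vector_second, num_unique_keys)

with tf.compat.v1.Session() as sess:
    print(sess.run(unique_local_grad))
# [[4. 4.]  <- rows 0 and 2 merged into unique key 0
#  [2. 2.]
#  [4. 4.]]
```

The test deletions that continue below follow directly from this move: the channel helpers no longer live in build_graph, so their build_graph test classes go away.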
- """ - - def setUp(self): - # 默认动态扩容、hot emb、HBM - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") - - def tearDown(self): - # 恢复config - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - - @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") - def test_get_unique_keys_case1(self, mock_get_next): - """ - case1: 动态扩容 - """ - - from mx_rec.core.asc.build_graph import get_unique_keys - - with tf.Graph().as_default(): - mock_get_next.return_value = [0] - unique_keys = get_unique_keys(self.max_lookup_vec_size, self.config) - self.assertEqual(unique_keys, 0) - - @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") - def test_get_unique_keys_case2(self, mock_get_next): - """ - case2: 非动态扩容 - """ - - from mx_rec.core.asc.build_graph import get_unique_keys - - with tf.Graph().as_default(): - self.config["use_dynamic_expansion"] = False - mock_get_next.return_value = [1] - unique_keys = get_unique_keys(self.max_lookup_vec_size, self.config) - self.assertEqual(unique_keys, 1) - - class TestGetAll2allArgsFunc(unittest.TestCase): """ Test for 'mx_rec.core.asc.build_graph.get_all2all_args'. @@ -351,9 +273,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0), - get_restore_vector_second=mock.MagicMock(return_value=0), - get_unique_keys=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case1(self, build_graph_config_initializer): """ @@ -368,16 +288,12 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) - self.assertIsNotNone(result.get("restore_vector_second")) - self.assertIsNotNone(result.get("unique_keys")) @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0), - get_restore_vector_second=mock.MagicMock(return_value=0), - get_unique_keys=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case2(self, build_graph_config_initializer): """ @@ -392,16 +308,12 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) - self.assertIsNotNone(result.get("restore_vector_second")) - self.assertIsNotNone(result.get("unique_keys")) @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - 
get_swap_info=mock.MagicMock(return_value=0), - get_restore_vector_second=mock.MagicMock(return_value=0), - get_unique_keys=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case3(self, build_graph_config_initializer): """ @@ -417,7 +329,6 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): self.config["channel_id"] = 1 result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) - self.assertIsNone(result.get("restore_vector_second")) if __name__ == '__main__': diff --git a/tests/mx_rec/core/test_manager.py b/tests/mx_rec/core/test_manager.py index 815ad843..ffa8b09e 100644 --- a/tests/mx_rec/core/test_manager.py +++ b/tests/mx_rec/core/test_manager.py @@ -385,6 +385,7 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): USE_STATIC=mock.MagicMock(return_value=0), USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), + USE_SUM_SAME_ID_GRADIENTS=mock.MagicMock(return_value=4), RankInfo=mock.MagicMock(return_value="mock_info"), HybridMgmt=mock.MagicMock(return_value=MockHybridMgmt(is_initialized=False))) @mock.patch("mx_rec.core.asc.manager.ConfigInitializer") @@ -398,6 +399,9 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): mock_config_initializer = MockConfigInitializer(use_static=True, use_dynamic_expansion=True) manager_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) + mock_opt = MockOptimizer() + manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt + with self.assertRaises(RuntimeError): initialize_emb_cache([], []) @@ -408,6 +412,7 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): USE_STATIC=mock.MagicMock(return_value=0), USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), + USE_SUM_SAME_ID_GRADIENTS=mock.MagicMock(return_value=4), RankInfo=mock.MagicMock(return_value="mock_info")) @mock.patch("mx_rec.core.asc.manager.ConfigInitializer") @mock.patch("mx_rec.core.asc.manager.HybridMgmt") @@ -421,6 +426,9 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): mock_config_initializer = MockConfigInitializer(use_static=True, use_dynamic_expansion=True) manager_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) + mock_opt = MockOptimizer() + manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt + mock_mgmt = MockHybridMgmt(is_initialized=True) mock_hybrid_mgmt.return_value = mock_mgmt initialize_emb_cache([], []) -- Gitee From 29683ecef60b80c034f19dcfc833f1583c7ea7bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Wed, 10 Apr 2024 10:28:32 +0800 Subject: [PATCH 027/302] =?UTF-8?q?mxrec=E6=9E=84=E5=BB=BA=E4=BC=98?= =?UTF-8?q?=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdis?= =?UTF-8?q?t=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 20 +++++++++++--------- build/move_whl_file_2_pkg_dir.sh | 1 + 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index fccc0244..ae3ec0a9 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ mxRec作为面向互联网市场搜索推荐广告的应用使能SDK产品,对 ## 安装方式 -安装前,请参考《CANN 软件安装指南CANN 软件安装指南》安装CANN开发套件软件包和TensorFlow适配昇腾插件。 +安装前,请参考《CANN 软件安装指南》安装CANN开发套件软件包和TensorFlow适配昇腾插件。 
CANN软件提供进程级环境变量设置脚本,供用户在进程中引用,以自动完成环境变量设置。用户进程结束后自动失效。可在程序启动的Shell脚本中使用如下命令设置CANN的相关环境变量,也可通过命令行执行如下命令(以root用户默认安装路径“/usr/local/Ascend”为例): ```shell @@ -63,12 +63,14 @@ bash run.sh - [openmpi 4.1.5](https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz): 请参考软件文档在编译环境完成安装 - tensorflow 1.15/2.6.5:根据实际需求选择对应版本 -将pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,并且将其分别更名为pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。 +将pybind11和securec的压缩包放在与mxRec代码同级的opensource目录下,并且将其分别更名为pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在mxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。 为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法: -- build/build.sh:执行脚本完成tf1和tf2版本whl包的构建和打包。执行脚本前,请参考build/build_tf1_with_opensource.sh、build/build_tf2_with_opensource.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 -- build/build_tf1_with_opensource.sh:执行脚本完成tf1版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 -- build/build_tf2_with_opensource.sh:执行脚本完成tf2版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 + +进入mxRec代码目录: +- setup.py:执行脚本setup.py,比如:**python3.7 setup.py**完成tf1和tf2版本whl包的构建和打包,构建成功后,whl包在/build/mindxsdk-mxrec/目录下,其中tf1_whl和tf2_whl目录下存在对应的whl包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 +- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel**完成tf1版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,请参考build/build_tf1.sh创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 +- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel**完成tf2版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,请参考build/build_tf2.sh创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 如需使用动态扩容功能,进入“./cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。 ```shell @@ -99,8 +101,8 @@ bash run_python_dt.sh - [pybind11 v2.10.3](https://github.com/pybind/pybind11/archive/refs/tags/v2.10.3.zip) - [securec](https://github.com/huaweicloud/huaweicloud-sdk-c-obs/archive/refs/tags/v3.23.9.zip) -将googletest、emock、pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,并且将其分别更名为googletest-release-1.8.1.zip、 -emock-0.9.0.zip、pybind11-2.10.3.zip、 huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录, +将googletest、emock、pybind11和securec的压缩包放在与mxRec代码同级的opensource目录下,并且将其分别更名为googletest-release-1.8.1.zip、 +emock-0.9.0.zip、pybind11-2.10.3.zip、 huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在mxRec同级的目录下手动创建opensource目录, 然后将前述几个压缩包放在opensource目录下。 如需使用C++测试用例,需要按照上述描述准备需要的依赖,准备好之后,进入src目录中。参考以下命令执行C++测试用例: @@ -117,11 +119,11 @@ bash test_ut.sh tf2 ## 使用指导 -mxRec所支持的使用环境、功能特性、API接口与使用样例请参考昇腾开源社区MindX SDK产品文档。 +mxRec所支持的使用环境、功能特性、API接口与使用样例请参考mxRec用户指南。 ## 参考设计 -mxrec框架基础镜像,基于TensorFlow 1.15.0、tensorflow2.6.5制作的基础镜像,安装mxrec后即可开始训练,以及样例使用介绍。 +mxRec框架基础镜像,基于TensorFlow 1.15.0、tensorflow2.6.5制作的基础镜像,安装mxRec后即可开始训练,以及样例使用介绍。 1. 
https://ascendhub.huawei.com/#/detail/mxrec-tf1 diff --git a/build/move_whl_file_2_pkg_dir.sh b/build/move_whl_file_2_pkg_dir.sh index 824ac52a..d489c2fb 100644 --- a/build/move_whl_file_2_pkg_dir.sh +++ b/build/move_whl_file_2_pkg_dir.sh @@ -24,6 +24,7 @@ tf_version=$1 function move_whl_file_2_pkg_dir() { mkdir -p "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl + rm -rf "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl/* mv ${MxRec_DIR}/dist/mx_rec*.whl "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl cd "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl whl_file=$(ls .) -- Gitee From 7259761ec4ea349ca9a54212bf85b8eea934d961 Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Thu, 11 Apr 2024 11:30:56 +0000 Subject: [PATCH 028/302] =?UTF-8?q?!80=20=E4=BF=AE=E6=94=B9=E4=B8=80?= =?UTF-8?q?=E4=BA=9B=E6=97=A5=E5=BF=97=E6=8B=BC=E5=86=99=E9=94=99=E8=AF=AF?= =?UTF-8?q?=E5=A6=82deivce=E5=8F=8A=E6=89=93=E5=8D=B0f"",raise=E9=94=99?= =?UTF-8?q?=E8=AF=AF=E6=97=A5=E5=BF=97=E5=BC=80=E5=A4=B4=E5=B0=8F=E5=86=99?= =?UTF-8?q?=EF=BC=8Clogger=E6=97=A5=E5=BF=97=E5=BC=80=E5=A4=B4=E5=A4=A7?= =?UTF-8?q?=E5=86=99=20*=20=E4=BF=AE=E6=94=B9=E4=B8=80=E4=BA=9B=E6=97=A5?= =?UTF-8?q?=E5=BF=97=E6=8B=BC=E5=86=99=E9=94=99=E8=AF=AF=E5=A6=82deivce?= =?UTF-8?q?=E5=8F=8A=E6=89=93=E5=8D=B0f""=20*=20=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=B8=80=E4=BA=9B=E6=8B=BC=E5=86=99=E5=8F=8A=E6=89=93=E5=8D=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 10 +++++----- examples/demo/little_demo/main.py | 8 ++++---- examples/demo/little_demo/run_mode.py | 8 ++++---- examples/demo/little_demo_estimator/main.py | 8 ++++---- .../demo/little_demo_estimator/nn_model_input.py | 6 +++--- examples/dlrm/model/main_mxrec.py | 6 +++--- mx_rec/core/asc/helper.py | 2 +- mx_rec/core/feature_process.py | 4 ++-- mx_rec/graph/acg_push_ops.py | 14 +++++++------- mx_rec/graph/merge_lookup.py | 2 +- mx_rec/graph/modifier.py | 16 ++++++++-------- mx_rec/optimizers/emb_optimizer.py | 4 ++-- mx_rec/saver/patch.py | 2 +- mx_rec/util/config_utils/feature_spec_utils.py | 2 +- mx_rec/util/cpu.py | 6 +++--- mx_rec/util/normalization.py | 4 ++-- mx_rec/util/perf.py | 2 +- tests/mx_rec/saver/sparse_embedding_mock.py | 2 +- tools/model_convert/model_convert.py | 4 ++-- 19 files changed, 55 insertions(+), 55 deletions(-) diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index 540445e8..d5a51312 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -125,7 +125,7 @@ def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph): elif len(embedding_list) > 1: emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False) else: - raise ValueError("The length of embedding_list must be greater than or equal to 1.") + raise ValueError("the length of embedding_list must be greater than or equal to 1.") my_model = MyModel() model_output = my_model.build_model(embedding=emb, dense_feature=batch["dense_feature"], @@ -261,8 +261,8 @@ if __name__ == "__main__": MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0))) use_faae = bool(int(os.getenv("USE_FAAE", 0))) except ValueError as err: - raise ValueError(f"please correctly config USE_DYNAMIC_EXPANSION or USE_MULTI_LOOKUP or USE_FAAE " - f"or USE_MODIFY_GRAPH only 0 or 1 is supported.") from err + raise ValueError("please correctly config USE_DYNAMIC_EXPANSION or USE_MULTI_LOOKUP or USE_FAAE " + "or USE_MODIFY_GRAPH only 0 or 1 is supported.") from err use_dynamic = 
bool(int(os.getenv("USE_DYNAMIC", 0))) logger.info(f"USE_DYNAMIC: {use_dynamic}") @@ -270,7 +270,7 @@ if __name__ == "__main__": use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion) IF_LOAD = False rank_id = mxrec_util.communication.hccl_ops.get_rank_id() - filelist = glob(f"./saved-model/sparse-model-0") + filelist = glob("./saved-model/sparse-model-0") if filelist: IF_LOAD = True ConfigInitializer.get_instance().if_load = IF_LOAD @@ -409,7 +409,7 @@ if __name__ == "__main__": lr = sess.run(cfg.learning_rate) global_step = sess.run(cfg.global_step) except tf.errors.OutOfRangeError: - logger.info(f"Encounter the end of Sequence for training.") + logger.info("Encounter the end of Sequence for training.") break end_time = time.time() diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index 8813de44..05d6896f 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -194,14 +194,14 @@ if __name__ == "__main__": USE_TIMESTAMP = bool(int(os.getenv("USE_TIMESTAMP", 0))) USE_ONE_SHOT = bool(int(os.getenv("USE_ONE_SHOT", 0))) except ValueError as err: - raise ValueError(f"please correctly config USE_MPI or USE_DYNAMIC or USE_HOT or USE_DYNAMIC_EXPANSION or " - f"USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " - f"only 0 or 1 is supported.") from err + raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_HOT or USE_DYNAMIC_EXPANSION or " + "USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " + "only 0 or 1 is supported.") from err try: MULTI_LOOKUP_TIMES = int(os.getenv("MULTI_LOOKUP_TIMES", 2)) except ValueError as err: - raise ValueError(f"please correctly config MULTI_LOOKUP_TIMES only int is supported.") from err + raise ValueError("please correctly config MULTI_LOOKUP_TIMES only int is supported.") from err if_load = False save_path = "./saved-model" diff --git a/examples/demo/little_demo/run_mode.py b/examples/demo/little_demo/run_mode.py index 0f7a8cc4..e750ceb5 100644 --- a/examples/demo/little_demo/run_mode.py +++ b/examples/demo/little_demo/run_mode.py @@ -75,7 +75,7 @@ class RunMode: try: self.session.run(self.eval_model.loss_list) except tf.errors.OutOfRangeError: - logger.info(f"Encounter the end of Sequence for eval.") + logger.info("Encounter the end of Sequence for eval.") break def set_train_ops(self): @@ -140,7 +140,7 @@ class RunMode: try: self.session.run([self.train_ops, self.train_model.loss_list]) except tf.errors.OutOfRangeError: - logger.info(f"Encounter the end of Sequence for training.") + logger.info("Encounter the end of Sequence for training.") break else: for t in self.table_list: @@ -170,14 +170,14 @@ class RunMode: self.epoch += 1 def predict(self, model_file: List[str]): - logger.info(f"############### start predict ################") + logger.info("############### start predict ################") # get the latest model latest_step = get_load_step(model_file) self.saver = tf.compat.v1.train.Saver() self.saver.restore(self.session, f"./saved-model/model-{latest_step}") self._infer() - logger.info(f"############### predict end ################") + logger.info("############### predict end ################") def change_threshold(self): thres_tensor = tf.constant(60, dtype=tf.int32) diff --git a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py index 901bf23a..5c3c94d1 100644 --- a/examples/demo/little_demo_estimator/main.py +++ b/examples/demo/little_demo_estimator/main.py @@ -166,14 
+166,14 @@ if __name__ == '__main__': args.use_one_shot = bool(int(os.getenv("USE_ONE_SHOT", 0))) args.enable_push_ops_test = bool(int(os.getenv("ENABLE_PUSH_OPS_TEST", 0))) except ValueError as err: - raise ValueError(f"please correctly config USE_MPI or USE_DYNAMIC or USE_HOT or USE_DYNAMIC_EXPANSION or " - f"USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " - f"only 0 or 1 is supported.") from err + raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_HOT or USE_DYNAMIC_EXPANSION or " + "USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " + "only 0 or 1 is supported.") from err try: MULTI_LOOKUP_TIMES = int(os.getenv("MULTI_LOOKUP_TIMES", 2)) except ValueError as err: - raise ValueError(f"please correctly config MULTI_LOOKUP_TIMES only int is supported.") from err + raise ValueError("please correctly config MULTI_LOOKUP_TIMES only int is supported.") from err if args.run_mode == 'train': args.train_steps = -1 diff --git a/examples/demo/little_demo_estimator/nn_model_input.py b/examples/demo/little_demo_estimator/nn_model_input.py index 2ce70d41..d763c058 100644 --- a/examples/demo/little_demo_estimator/nn_model_input.py +++ b/examples/demo/little_demo_estimator/nn_model_input.py @@ -39,19 +39,19 @@ def get_model_fn(create_fs_params, cfg, access_and_evict_config_dict=None): loss_dict = {} if mode == tf.estimator.ModeKeys.TRAIN: - logger.info(f"use estimator train mode") + logger.info("Use estimator train mode") loss_dict['loss'] = [['train_loss', loss]] return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=get_train_op(params, loss_dict.get('loss'))) if mode == tf.estimator.ModeKeys.EVAL: - logger.info("use estimator eval mode") + logger.info("Use estimator eval mode") return tf.estimator.EstimatorSpec(mode=mode, loss=loss) if mode == tf.estimator.ModeKeys.PREDICT: - logger.info("use estimator predict mode") + logger.info("Use estimator predict mode") loss_dict['task_1'] = prediction[0] loss_dict['task_2'] = prediction[1] diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index 2d0ee78e..4bbd16de 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -129,7 +129,7 @@ def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph): elif len(embedding_list) > 1: emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False) else: - raise ValueError("The length of embedding_list must be greater than or equal to 1.") + raise ValueError("the length of embedding_list must be greater than or equal to 1.") my_model = MyModel() model_output = my_model.build_model(embedding=emb, dense_feature=batch["dense_feature"], @@ -266,8 +266,8 @@ if __name__ == "__main__": MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0))) use_faae = bool(int(os.getenv("USE_FAAE", 0))) except ValueError as err: - raise ValueError(f"please correctly config USE_DYNAMIC_EXPANSION or USE_MULTI_LOOKUP or USE_FAAE " - f"or USE_MODIFY_GRAPH only 0 or 1 is supported.") from err + raise ValueError("please correctly config USE_DYNAMIC_EXPANSION or USE_MULTI_LOOKUP or USE_FAAE " + "or USE_MODIFY_GRAPH only 0 or 1 is supported.") from err use_dynamic = bool(int(os.getenv("USE_DYNAMIC", 0))) logger.info(f"USE_DYNAMIC:{use_dynamic}") diff --git a/mx_rec/core/asc/helper.py b/mx_rec/core/asc/helper.py index 771f359f..aaa97017 100644 --- a/mx_rec/core/asc/helper.py +++ b/mx_rec/core/asc/helper.py @@ -281,7 +281,7 @@ def do_insert(args, insert_tensors, splits, table_names, 
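A note on the logging convention PATCH 028 enforces, visible in the EvictHook hunk being rewritten here: dropping the `f` prefix and passing `%`-style arguments defers string formatting to the logging framework, so records filtered out by the level cost nothing. A standalone sketch of the two styles:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("mx_rec_demo")

interval = 60

# f-string: the message is formatted eagerly, even if INFO were disabled.
logger.info(f"evict_time_interval: {interval}")

# %-style: formatting happens lazily, only when the record is handled.
# This is the form the patch converges on.
logger.info("evict_time_interval: %d", interval)
```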
input_dict): def export_read_emb_key_v2_op(args, pipeline_op): origin_batch = list(args) if len(origin_batch) < 1: - raise ValueError("The length of args is less than 1.") + raise ValueError("the length of args is less than 1.") if isinstance(origin_batch[0], dict): output_batch = origin_batch[0] valid_key = get_valid_op_key(output_batch) diff --git a/mx_rec/core/feature_process.py b/mx_rec/core/feature_process.py index 3963f6d5..7a90e78b 100644 --- a/mx_rec/core/feature_process.py +++ b/mx_rec/core/feature_process.py @@ -50,9 +50,9 @@ class EvictHook(tf.compat.v1.train.SessionRunHook): self._global_step_tensor = None if evict_step_interval is None: - logger.info(f"_EvictHook - > evict_time_interval: %d", self._evict_time_interval) + logger.info("_EvictHook - > evict_time_interval: %d", self._evict_time_interval) else: - logger.info(f"_EvictHook - > evict_time_interval: %d, evict_step_interval: %d", + logger.info("_EvictHook - > evict_time_interval: %d, evict_step_interval: %d", self._evict_time_interval, self._evict_step_interval) def begin(self): diff --git a/mx_rec/graph/acg_push_ops.py b/mx_rec/graph/acg_push_ops.py index 625ef92f..ed3e18e6 100644 --- a/mx_rec/graph/acg_push_ops.py +++ b/mx_rec/graph/acg_push_ops.py @@ -71,7 +71,7 @@ class ACGPushOpsToDatasetHook(tf.estimator.SessionRunHook): def after_create_session(self, session, coord): logger.info("[ACGPushOpsToDatasetHook] Trigger after create session!") initializers = tf.compat.v1.get_collection(_ACG_NEW_INITIALIZER) - logger.info(f"[ACGPushOpsToDatasetHook] Got new initialzers: %s.", initializers) + logger.info("[ACGPushOpsToDatasetHook] Got new initialzers: %s.", initializers) session.run(initializers) def end(self, session): @@ -185,12 +185,12 @@ def _find_op_from_base_op(base_ops: tf.Operation, target_op_type: str) -> tf.Ope for base_op in base_ops: parent_ops.extend(modifier.find_parent_op(base_op)) if not parent_ops: - raise ValueError(f"Op {target_op_type} was not found.") + raise ValueError(f"op {target_op_type} was not found.") def _get_dataset_op(graph: tf.Graph, get_next_op: Operation) -> Operation: if get_next_op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - raise TypeError("Op '{get_next_op}' must be one instance of IteratorGetNext.") + raise TypeError(f"op '{get_next_op}' must be one instance of IteratorGetNext.") # looking for the MakeIterator operator which corresponds to given batch_tensor base_op = modifier.find_make_iterator_op(get_next_op.outputs[0]) # looking for the op which is the one before OptimizeDataset operator @@ -198,9 +198,9 @@ def _get_dataset_op(graph: tf.Graph, get_next_op: Operation) -> Operation: optimize_dataset_op = _find_op_from_base_op(base_op, "ModelDataset") target_op = modifier.find_parent_op(optimize_dataset_op) if not target_op: - raise RuntimeError(f"The parent op for 'ModelDataset' op was not found.") + raise RuntimeError("the parent op for 'ModelDataset' op was not found.") if target_op[0].type != "OptimizeDataset": - raise TypeError(f"Op OptimizeDataset was not found.") + raise TypeError("op OptimizeDataset was not found.") target_op = target_op[0] else: # 'OptimizeDataset' is not available in TensorFlow2.X @@ -225,7 +225,7 @@ def _add_sorted_additional_tensors(addition_funcgraph_output_tensor, k_inputs, n def _get_tensor_consumers_unsafe(tensor: tf.Tensor) -> List[tf.Operation]: if isinstance(tensor, tf.Operation): - raise RuntimeError("not support type: {node}") + raise RuntimeError(f"not support type: {node}") from tensorflow.python import pywrap_tensorflow as c_api @@ 
-502,7 +502,7 @@ def _update_iterator_getnext( subgraph_to_push: Set[tf.Operation], ): if not get_next_op.outputs: - raise RuntimeError("There is no tensor in the dataset. Please check the dataset and data processing.") + raise RuntimeError("there is no tensor in the dataset. Please check the dataset and data processing.") iterator_type = "" if get_next_op.inputs: iterator_type = get_next_op.inputs[0].op.type diff --git a/mx_rec/graph/merge_lookup.py b/mx_rec/graph/merge_lookup.py index 8a11e515..b28872e4 100644 --- a/mx_rec/graph/merge_lookup.py +++ b/mx_rec/graph/merge_lookup.py @@ -50,7 +50,7 @@ def do_merge_lookup(is_train: bool = True): # get anchor ids cutting_point_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE) if not cutting_point_list: - raise RuntimeError("The sparse table does not have sparse lookup.") + raise RuntimeError("the sparse table does not have sparse lookup.") check_cutting_points(cutting_point_list) # get lookup info diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index a5843e02..8338e870 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -174,7 +174,7 @@ def find_make_iterator_op(batch_tensor: Tensor) -> Operation: logger.debug("Op MakeIterator '%s' was found.", each_op.name) return each_op - raise ValueError(f"Op MakeIterator was not found.") + raise ValueError(f"op MakeIterator was not found.") @performance("find_target_dataset_op") @@ -198,7 +198,7 @@ def find_target_dataset_op(base_ops: Operation, op_type: str) -> Operation: parent_ops.extend(find_parent_op(base_op)) if not parent_ops: - raise ValueError(f"Op {op_type} was not found.") + raise ValueError(f"op {op_type} was not found.") def get_dataset_op(get_next_op: Operation) -> Operation: @@ -214,7 +214,7 @@ def get_dataset_op(get_next_op: Operation) -> Operation: """ if get_next_op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - raise TypeError("Op '{get_next_op}' must be one instance of IteratorGetNext.") + raise TypeError(f"op '{get_next_op}' must be one instance of IteratorGetNext.") # looking for the MakeIterator operator which corresponds to given batch_tensor base_op = find_make_iterator_op(get_next_op.outputs[0]) @@ -223,9 +223,9 @@ def get_dataset_op(get_next_op: Operation) -> Operation: optimize_dataset_op = find_target_dataset_op(base_op, AnchorDatasetOp.MODEL_DATASET.value) target_op = find_parent_op(optimize_dataset_op) if not target_op: - raise RuntimeError(f"The parent op for 'ModelDataset' op was not found.") + raise RuntimeError("the parent op for 'ModelDataset' op was not found.") if target_op[0].type != AnchorDatasetOp.OPTIMIZE_DATASET.value: - raise TypeError(f"Op OptimizeDataset was not found.") + raise TypeError("op OptimizeDataset was not found.") target_op = target_op[0] else: # 'OptimizeDataset' is not available in TensorFlow2.X @@ -283,7 +283,7 @@ def find_target_instance_dataset(variant_tensor: Tensor) -> DatasetV1Adapter: if not isinstance(ins.element_spec, dict) and not ( isinstance(ins.element_spec, (list, tuple)) and len(ins.element_spec) == 2 and isinstance( ins.element_spec[0], dict)): - raise NotImplementedError("The found dataset does not return a valid layout.") + raise NotImplementedError("the found dataset does not return a valid layout.") return ins @@ -517,7 +517,7 @@ def update_iterator_getnext(get_next_op: Operation, """ if not get_next_op.outputs: - raise RuntimeError("There is no tensor in the dataset. Please check the dataset and data processing.") + raise RuntimeError("there is no tensor in the dataset. 
Please check the dataset and data processing.") iterator_type = "" if get_next_op.outputs[0].op.inputs: iterator_type = get_next_op.outputs[0].op.inputs[0].op.type @@ -640,7 +640,7 @@ class GraphModifierHook(tf.estimator.SessionRunHook): self._iterator_type = ConfigInitializer.get_instance().train_params_config.iterator_type if self._modify_graph and self._iterator_type not in (AnchorIteratorOp.MAKE_ITERATOR.value, AnchorIteratorOp.ONE_SHOT_ITERATOR.value): - raise ValueError("The value of iterator type should be like `MakeIterator` or `OneShotIterator`.") + raise ValueError("the value of iterator type should be like `MakeIterator` or `OneShotIterator`.") logger.debug("In GraphModifierHook, iterator type is `%s`.", self._iterator_type) def after_create_session(self, session, coord): diff --git a/mx_rec/optimizers/emb_optimizer.py b/mx_rec/optimizers/emb_optimizer.py index c7f1b64a..9e6a80e1 100644 --- a/mx_rec/optimizers/emb_optimizer.py +++ b/mx_rec/optimizers/emb_optimizer.py @@ -57,7 +57,7 @@ class EmbOptimizer: Returns: None """ if key in self._optimizer: - raise ValueError(f"Optimizer {key} has been set for hash table {table_name}.") + raise ValueError(f"optimizer {key} has been set for hash table {table_name}.") self._optimizer[key] = state_dict def check_optimizer_instance_list(self): @@ -73,4 +73,4 @@ class EmbOptimizer: optimizer_instance = getattr(optimizer_instance, '_opt') if not isinstance(optimizer_instance, CustomizedOptimizer): - raise TypeError("The optimizer instance must be an instance of CustomizedOptimizer.") + raise TypeError("the optimizer instance must be an instance of CustomizedOptimizer.") diff --git a/mx_rec/saver/patch.py b/mx_rec/saver/patch.py index fcf1134f..6cffcc18 100644 --- a/mx_rec/saver/patch.py +++ b/mx_rec/saver/patch.py @@ -289,7 +289,7 @@ def restore(self, sess, save_path): if self._is_empty: return if not checkpoint_management.checkpoint_exists_internal(checkpoint_prefix): - raise ValueError("The passed save_path is not a valid checkpoint: " + + raise ValueError("the passed save_path is not a valid checkpoint: " + checkpoint_prefix) tf_logging.info("Restoring parameters from %s", checkpoint_prefix) diff --git a/mx_rec/util/config_utils/feature_spec_utils.py b/mx_rec/util/config_utils/feature_spec_utils.py index 4c40996c..f244bb39 100644 --- a/mx_rec/util/config_utils/feature_spec_utils.py +++ b/mx_rec/util/config_utils/feature_spec_utils.py @@ -25,7 +25,7 @@ class FeatureSpecConfig: def clear_same_table_feature_spec(self, table_name: Optional[str], is_training: bool) -> None: if self.table_name_to_feature_spec.get(table_name) is None or \ self.table_name_to_feature_spec.get(table_name).get(is_training) is None: - raise KeyError("The table name `%s` does not exist in table_name_to_feature_spec, " + raise KeyError("the table name `%s` does not exist in table_name_to_feature_spec, " "please check whether the insert_feature_spec(...) 
is invoked.", table_name) self.table_name_to_feature_spec.get(table_name)[is_training] = [] logger.debug("The feature spec of the table name `%s` has been cleared.", table_name) diff --git a/mx_rec/util/cpu.py b/mx_rec/util/cpu.py index f4d299ed..69700262 100644 --- a/mx_rec/util/cpu.py +++ b/mx_rec/util/cpu.py @@ -26,7 +26,7 @@ class PcieInfo(ctypes.Structure): ] -def get_card_and_deivce(logic_id): +def get_card_and_device(logic_id): """ 通过芯片逻辑id获取芯片的卡id和device id 一张卡可能有多个芯片,对应多个device_id,但每个芯片的逻辑ID @@ -52,7 +52,7 @@ def get_pcie_id(card_id, device_id): dev = ctypes.c_int(device_id) ret = g_dcmi.dcmi_get_device_pcie_info_v2(card, dev, ctypes.pointer(info)) if ret != 0: - raise OSError("cant get pcie info of device {card_id}:{deivce_id}") + raise OSError(f"cant get pcie info of device {card_id}:{device_id}") pcie_id = f'{info.domain:04X}:{info.bdf_busid:02x}:' pcie_id += f'{info.bdf_deviceid:02x}.{info.bdf_funcid}' return pcie_id @@ -87,7 +87,7 @@ def bind_cpu_by_device_logic_id(logic_id): logger.error(e) return False try: - card_id, device_id = get_card_and_deivce(logic_id) + card_id, device_id = get_card_and_device(logic_id) pcie_id = get_pcie_id(card_id, device_id) numa = get_numa_by_pcie(pcie_id) cpu_list = get_cpu_list_by_numa(numa) diff --git a/mx_rec/util/normalization.py b/mx_rec/util/normalization.py index dc9dd2c1..a9b25132 100644 --- a/mx_rec/util/normalization.py +++ b/mx_rec/util/normalization.py @@ -33,6 +33,6 @@ def fix_invalid_table_name(name): if not fix_name: raise ValueError(f"The table name '{name}' doesn't contain valid character, " f"according to the rule '{pattern}'") - logger.warning(f"The table name '%s' contains invalid characters. The system automatically " - f"remove invalid characters. The table name was changed to '%s'", name, fix_name) + logger.warning("The table name '%s' contains invalid characters. The system automatically " + "remove invalid characters. The table name was changed to '%s'", name, fix_name) return fix_name diff --git a/mx_rec/util/perf.py b/mx_rec/util/perf.py index 3feb7332..81089f63 100644 --- a/mx_rec/util/perf.py +++ b/mx_rec/util/perf.py @@ -26,7 +26,7 @@ def performance(method_name): start = time.perf_counter() result = func(*args, **kwargs) span = time.perf_counter() - start - logger.debug(f"%s method consume %s (s).", method_name, round(span, 6)) + logger.debug("%s method consume %s (s).", method_name, round(span, 6)) return result return wrapper return decorator diff --git a/tests/mx_rec/saver/sparse_embedding_mock.py b/tests/mx_rec/saver/sparse_embedding_mock.py index 03df1466..83507e63 100644 --- a/tests/mx_rec/saver/sparse_embedding_mock.py +++ b/tests/mx_rec/saver/sparse_embedding_mock.py @@ -34,6 +34,6 @@ class SparseEmbeddingMock: def set_optimizer(self, key, state_dict): if key in self.optimizer: - raise ValueError(f"Optimizer {key} has been set for hash table {self.table_name}") + raise ValueError(f"optimizer {key} has been set for hash table {self.table_name}") self.optimizer[key] = state_dict diff --git a/tools/model_convert/model_convert.py b/tools/model_convert/model_convert.py index 7608917a..eb2432db 100644 --- a/tools/model_convert/model_convert.py +++ b/tools/model_convert/model_convert.py @@ -222,9 +222,9 @@ class ModelConverter: for _, dirs, _ in os.walk(check_dir): model_dirs.append(dirs) if not self._is_ddr and "DDR" in model_dirs[0]: - raise ValueError(f"wrong mode choose! you choose hbm mode, however ddr dir exists. ") + raise ValueError("wrong mode choose! you choose hbm mode, however ddr dir exists. 
") if self._is_ddr and "DDR" not in model_dirs[0]: - raise ValueError(f"wrong mode choose! you choose ddr mode, however ddr dir not exists. ") + raise ValueError("wrong mode choose! you choose ddr mode, however ddr dir not exists. ") def get_attribute_and_data_file(table_path): -- Gitee From 7d1fdf6042854ac79cab9a4cc33282ea39100437 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Thu, 11 Apr 2024 11:40:33 +0000 Subject: [PATCH 029/302] =?UTF-8?q?!78=20=E5=88=86=E5=B8=83=E5=BC=8F?= =?UTF-8?q?=E8=AE=AD=E7=BB=83=E8=B5=84=E6=BA=90=E9=85=8D=E7=BD=AE=E6=96=B9?= =?UTF-8?q?=E6=A1=88=E9=80=82=E9=85=8D=20*=20=E9=9B=86=E5=90=88=E9=80=9A?= =?UTF-8?q?=E4=BF=A1=E4=B8=8E=E5=88=86=E5=B8=83=E5=BC=8F=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E8=B5=84=E6=BA=90=E9=85=8D=E7=BD=AE=E6=96=B9=E6=A1=88=E9=80=82?= =?UTF-8?q?=E9=85=8D=20*=20=E9=9B=86=E5=90=88=E9=80=9A=E4=BF=A1=E4=B8=8E?= =?UTF-8?q?=E5=88=86=E5=B8=83=E5=BC=8F=E8=AE=AD=E7=BB=83=E8=B5=84=E6=BA=90?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=96=B9=E6=A1=88=E9=80=82=E9=85=8D=20*=20?= =?UTF-8?q?=E9=9B=86=E5=90=88=E9=80=9A=E4=BF=A1=E4=B8=8E=E5=88=86=E5=B8=83?= =?UTF-8?q?=E5=BC=8F=E8=AE=AD=E7=BB=83=E8=B5=84=E6=BA=90=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E6=96=B9=E6=A1=88=E9=80=82=E9=85=8D=20*=20=E9=9B=86=E5=90=88?= =?UTF-8?q?=E9=80=9A=E4=BF=A1=E4=B8=8E=E5=88=86=E5=B8=83=E5=BC=8F=E8=AE=AD?= =?UTF-8?q?=E7=BB=83=E8=B5=84=E6=BA=90=E9=85=8D=E7=BD=AE=E6=96=B9=E6=A1=88?= =?UTF-8?q?=E9=80=82=E9=85=8D=20*=20=E9=9B=86=E5=90=88=E9=80=9A=E4=BF=A1?= =?UTF-8?q?=E4=B8=8E=E5=88=86=E5=B8=83=E5=BC=8F=E8=AE=AD=E7=BB=83=E8=B5=84?= =?UTF-8?q?=E6=BA=90=E9=85=8D=E7=BD=AE=E6=96=B9=E6=A1=88=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/util/communication/hccl_mgmt.py | 95 +++++++------------ mx_rec/util/communication/hccl_ops.py | 4 +- mx_rec/util/framework_npu_env/tfa_env.py | 4 +- src/pybind/module_main.cpp | 12 +++ .../util/communication/test_hccl_mgmt.py | 10 -- 5 files changed, 50 insertions(+), 75 deletions(-) diff --git a/mx_rec/util/communication/hccl_mgmt.py b/mx_rec/util/communication/hccl_mgmt.py index 6eb5a70f..2f50e832 100644 --- a/mx_rec/util/communication/hccl_mgmt.py +++ b/mx_rec/util/communication/hccl_mgmt.py @@ -16,19 +16,20 @@ # ============================================================================== import json -import os -import re +from typing import Dict, List -from mx_rec.constants.constants import VALID_DEVICE_ID_LIST, MIN_SIZE, MAX_CONFIG_SIZE, MAX_DEVICE_ID, \ - MIN_RANK_SIZE, MAX_RANK_SIZE -from mx_rec.validator.validator import FileValidator, para_checker_decorator, StringValidator, \ - Convert2intValidator +from mx_rec.constants.constants import MIN_SIZE, MAX_CONFIG_SIZE, MAX_DEVICE_ID +from mx_rec.validator.validator import FileValidator from mx_rec.util.global_env_conf import global_env -def parse_hccl_json(): +def parse_hccl_json() -> Dict[int, int]: + """ + Used for rank table file configured training situation. + :return: rank_id to logic_id mapping dictionary. 
+ """ rank_table_path = global_env.rank_table_file - with open(rank_table_path, "r", encoding="utf-8"): + with open(rank_table_path, "r", encoding="utf-8") as file: # check whether json file is valid file_validator = FileValidator("RANK_TABLE_FILE", rank_table_path) # 1.check whether rank_table_path is soft link @@ -37,14 +38,13 @@ def parse_hccl_json(): file_validator.check_file_size(MAX_CONFIG_SIZE, MIN_SIZE) file_validator.check() - rank_table_path = os.path.realpath(global_env.rank_table_file) - with open(rank_table_path, "r", encoding="utf-8") as file: try: table_hccl = json.load(file) except FileNotFoundError as e: raise ValueError("rank table file not found") from e except json.JSONDecodeError as e: raise ValueError("rank table file is unable to parse as json") from e + if "server_list" not in table_hccl: raise AttributeError(f"Lack of attribute server_list.") if not table_hccl.get("server_list"): @@ -62,76 +62,51 @@ def parse_hccl_json(): if "rank_id" not in device or not device.get("rank_id").isdigit(): raise ValueError(f"hccl_json rank_id wrong.") rank_id = int(device.get("rank_id")) + if "device_id" not in device or not device.get("device_id").isdigit(): raise ValueError(f"hccl_json device_id wrong.") import mxrec_pybind - res = mxrec_pybind.get_logic_id(int(device.get("device_id"))) - if res < 0: - raise RuntimeError( - f"get logic id from physic id fail, error code is {res}, please check if dsmi api is functional.") - if res > MAX_DEVICE_ID: + logic_id = mxrec_pybind.get_logic_id(int(device.get("device_id"))) + if logic_id < 0: + raise RuntimeError(f"get logic id from physic id fail, error code is {logic_id}, " + f"please check if dsmi api is functional.") + if logic_id > MAX_DEVICE_ID: raise ValueError(f"get logic id from physic id fail, the device id is invalid.") - rank_to_device_dict[rank_id] = res - + rank_to_device_dict[rank_id] = logic_id return rank_to_device_dict -def set_hccl_info_without_json() -> dict: +def set_hccl_info_without_json() -> Dict[int, int]: """ Used for no rank table file configured training situation. - :return: device_id and logic_id mapping. + :return: rank_id to logic_id mapping dictionary. """ - visible_devices = global_env.ascend_visible_devices rank_size = global_env.cm_worker_size chief_device = global_env.cm_chief_device - device_list = get_device_list(visible_devices) + device_list = get_device_list() chief_device = int(chief_device) rank_size = int(rank_size) - sorted_device_list = sorted(device_list) - - if chief_device not in sorted_device_list: + if chief_device not in device_list: raise ValueError(f"The environment variable CM_CHIEF_DEVICE {chief_device} is not in the local device list. ") rank_to_device_dict = {} - chief_index = sorted_device_list.index(chief_device) - sorted_device_list = sorted_device_list[chief_index:] + sorted_device_list[0: chief_index] - sorted_device_list = sorted_device_list[:rank_size] - - for device_idx in sorted_device_list: - import mxrec_pybind - res = mxrec_pybind.get_logic_id(int(device_idx)) - if res < 0: - raise RuntimeError( - f"get logic id from physic id fail, error code is {res}, please check if dsmi api is functional.") - - if res > MAX_DEVICE_ID: - raise ValueError(f"get logic id from physic id fail. 
res: {res}, chief_device: {chief_device}, " - f"device_idx: {device_idx}") - index = sorted_device_list.index(device_idx) - rank_to_device_dict[index] = res + chief_index = device_list.index(chief_device) + device_list = device_list[chief_index:] + device_list[:chief_index] + device_list = device_list[:rank_size] + + for rank_id, device_id in enumerate(device_list): + rank_to_device_dict[rank_id] = device_id return rank_to_device_dict -def get_device_list(ascend_visible_devices): - device_list = [] - try: - nums = re.findall(r'\d+', ascend_visible_devices) - # eg1:4-11, 则nums=['4', '11'] eg2:0-3,8-11 则nums['0', '3', '8', '11'] - if not all(int(i) <= MAX_DEVICE_ID for i in nums): - raise ValueError("invalid env variable ascend_visible_devices.") - ranges = re.findall(r'\d+-\d+', ascend_visible_devices) - # eg1:4-11, 则ranges=['4-11'] eg2:0-3,8-11 则ranges['0-3', '8-11'] - for r in ranges: - start, end = map(int, r.split('-')) # '4-11', 则start 4, end 11. ['0-3', '8-11'] - if start >= end: - raise ValueError("invalid env variable ascend_visible_devices.") - nums.extend(range(start, end + 1)) - device_list = sorted(list(set(map(int, nums)))) - except ValueError as error: - raise ValueError("Invalid env variable ascend_visible_devices, no valid device id is configured.") from error - - if not device_list: - raise ValueError("No device is available in the environment.") +def get_device_list() -> List[int]: + """ + Obtain the logic ids of all visible Ascend devices in the environment. + :return: the logic id list of visible Ascend devices. + """ + import mxrec_pybind + device_count = mxrec_pybind.get_device_count() + device_list = [i for i in range(device_count)] return device_list \ No newline at end of file diff --git a/mx_rec/util/communication/hccl_ops.py b/mx_rec/util/communication/hccl_ops.py index 52fbf74c..d4ea6136 100644 --- a/mx_rec/util/communication/hccl_ops.py +++ b/mx_rec/util/communication/hccl_ops.py @@ -29,9 +29,9 @@ def get_rank_id() -> Optional[int]: def get_device_id() -> Optional[int]: """ - Get the device id of the calling process + Get the device logic id of the calling process Note: this method should be used after mpi init - :return: int or None, the device id of the calling process + :return: int or None, the device logic id of the calling process """ if global_env.rank_table_file: rank_to_device_dict = parse_hccl_json() diff --git a/mx_rec/util/framework_npu_env/tfa_env.py b/mx_rec/util/framework_npu_env/tfa_env.py index a00fd7ce..bcd0b0ee 100644 --- a/mx_rec/util/framework_npu_env/tfa_env.py +++ b/mx_rec/util/framework_npu_env/tfa_env.py @@ -13,14 +13,12 @@ def set_ascend_env(): 配置昇腾相关的参数和环境变量 """ logger.debug("Ascend env set start.") - os.environ["RANK_ID"] = str(get_rank_id()) device_id = str(get_device_id()) - os.environ["DEVICE_ID"] = device_id os.environ["ASCEND_DEVICE_ID"] = device_id - os.environ["DEVICE_INDEX"] = device_id if global_env.rank_table_file: + os.environ["RANK_ID"] = str(get_rank_id()) rank_size = get_rank_size() os.environ["RANK_SIZE"] = str(rank_size) diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index 403692fb..4a08f992 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -57,12 +57,24 @@ namespace { return logicId; } + uint32_t GetDeviceCount() + { + uint32_t count; + aclError ec = aclrtGetDeviceCount(&count); + if (ec != 0) { + throw runtime_error("failed to get device count. 
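The rotation in set_hccl_info_without_json() above can be traced with concrete values; a sketch assuming eight visible logic ids, CM_CHIEF_DEVICE=2 and CM_WORKER_SIZE=4:

    device_list = list(range(8))   # what get_device_list() would report
    chief_device, rank_size = 2, 4

    chief_index = device_list.index(chief_device)
    # Rotate so the chief device maps to rank 0, then keep rank_size devices.
    device_list = device_list[chief_index:] + device_list[:chief_index]
    device_list = device_list[:rank_size]

    rank_to_device_dict = dict(enumerate(device_list))
    print(rank_to_device_dict)  # {0: 2, 1: 3, 2: 4, 3: 5}
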
"); + } + return count; + } + PYBIND11_MODULE(mxrec_pybind, m) { m.def("get_ub_hot_size", &GetUBHotSize, py::arg("device_id")); m.def("get_logic_id", &GetLogicID, py::arg("physic_id")); + m.def("get_device_count", &GetDeviceCount); + m.attr("USE_STATIC") = py::int_(HybridOption::USE_STATIC); m.attr("USE_HOT") = py::int_(HybridOption::USE_HOT); diff --git a/tests/mx_rec/util/communication/test_hccl_mgmt.py b/tests/mx_rec/util/communication/test_hccl_mgmt.py index f0257022..870f8a3a 100644 --- a/tests/mx_rec/util/communication/test_hccl_mgmt.py +++ b/tests/mx_rec/util/communication/test_hccl_mgmt.py @@ -104,16 +104,6 @@ class HCCLMGMTTest(unittest.TestCase): with self.assertRaises(ValueError): rank_to_device_dict, local_rank_size = parse_hccl_json() - def test_get_device_list(self): - device_list = get_device_list("0-7") - self.assertEqual([0, 1, 2, 3, 4, 5, 6, 7], device_list) - device_list = get_device_list("0-3, 8-11") - self.assertEqual([0, 1, 2, 3, 8, 9, 10, 11], device_list) - with self.assertRaises(ValueError): - device_list = get_device_list("7-5, 9, 10") - with self.assertRaises(ValueError): - device_list = get_device_list("17") - if __name__ == '__main__': unittest.main() -- Gitee From ff3bb82d00a67b56aa91d4a017b33787b49ac585 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E6=B4=AA=E5=8F=91?= <1660398197@qq.com> Date: Thu, 11 Apr 2024 21:33:43 +0800 Subject: [PATCH 030/302] =?UTF-8?q?=E6=89=80=E6=9C=89=E5=88=A4=E6=96=ADHot?= =?UTF-8?q?=20embed=E7=9A=84=E4=BB=A3=E7=A0=81=EF=BC=8C=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/key_process/key_process.cpp | 36 +++++++++------------- src/tests/key_process/key_process_test.cpp | 3 -- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index f76f6907..eebd70a3 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -45,17 +45,15 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos int seed) { this->rankInfo = rInfo; - if (rankInfo.useHot) { - SetupHotEmbUpdateStep(); - } + + SetupHotEmbUpdateStep(); + map scInfo; for (const auto& info: eInfos) { embInfos[info.name] = info; scInfo[info.name] = info.sendCount; - if (rankInfo.useHot) { - InitHotEmbTotCount(info, rInfo); - } + InitHotEmbTotCount(info, rInfo); if (rankInfo.useDynamicExpansion) { // 动态扩容 embeddingTableMap[info.name].Init(info, rInfo, seed); @@ -89,8 +87,8 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos } } - LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}, useHot:{}", - MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic, rInfo.useHot); + LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}", + MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); #ifndef GTEST Start(); #endif @@ -342,11 +340,7 @@ void KeyProcess::HashSplitHelper(const unique_ptr & batch, vector name] != SingleEmbTableStatus::SETS_NONE) { tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 } else { - if (rankInfo.useHot) { - tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 - } else { - tie(splitKeys, restore) = HashSplit(batch); // 按存储dev id切分并去重 - } + tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 } LOG_DEBUG("uniqueTc(ms):{}", uniqueTc.ElapsedMS()); } @@ 
-387,10 +381,10 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch auto tensors = make_unique>(); tensors->push_back(Vec2TensorI32(uniqueInfo.restore)); - if (rankInfo.useHot) { - uniqueInfo.hotPos.resize(hotEmbTotCount[batch->name], -1); - tensors->push_back(Vec2TensorI32(uniqueInfo.hotPos)); - } + + uniqueInfo.hotPos.resize(hotEmbTotCount[batch->name], -1); + tensors->push_back(Vec2TensorI32(uniqueInfo.hotPos)); + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); @@ -449,10 +443,10 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, TimeCost pushResultTC; auto tensors = make_unique>(); tensors->push_back(Vec2TensorI32(restore)); - if (rankInfo.useHot) { - hotPos.resize(hotEmbTotCount[batch->name], 0); - tensors->push_back(Vec2TensorI32(hotPos)); - } + + hotPos.resize(hotEmbTotCount[batch->name], 0); + tensors->push_back(Vec2TensorI32(hotPos)); + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index a5e618cd..6b06dc30 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -76,7 +76,6 @@ protected: rankInfo.isDDR = false; rankInfo.useDynamicExpansion = false; rankInfo.ctrlSteps = { 1, -1 }; - rankInfo.useHot = false; // 初始化emb信息 GenEmbInfos(embNum, embInfos, fieldNums); splits = fieldNums; @@ -639,7 +638,6 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelper) { rankInfo.isDDR = false; rankInfo.useStatic = false; - rankInfo.useHot = false; rankInfo.useDynamicExpansion = false; EmbeddingMgmt::Instance()->Init(rankInfo, embInfos); ASSERT_EQ(process.Initialize(rankInfo, embInfos), true); @@ -688,7 +686,6 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelperDDR) { rankInfo.isDDR = true; rankInfo.useStatic = true; - rankInfo.useHot = false; rankInfo.useDynamicExpansion = false; EmbeddingMgmt::Instance()->Init(rankInfo, embInfos); ASSERT_EQ(process.Initialize(rankInfo, embInfos), true); -- Gitee From d1b1a871fe9f1dc9336db45bfbe6e2bd2ceb637d Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Thu, 11 Apr 2024 22:11:46 +0800 Subject: [PATCH 031/302] =?UTF-8?q?=E6=89=80=E6=9C=89=E5=88=A4=E6=96=ADHot?= =?UTF-8?q?=20embed=E7=9A=84=E4=BB=A3=E7=A0=81=EF=BC=8C=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/asc/manager.py | 2 +- src/core/key_process/key_process.cpp | 20 +++++++++----------- src/core/utils/common.cpp | 2 -- src/core/utils/common.h | 2 -- src/pybind/module_main.cpp | 2 -- 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index 2829ab98..5f8eeb5d 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -18,7 +18,7 @@ import tensorflow as tf from mxrec_pybind import InitializeInfo, ConstantInitializerInfo, NormalInitializerInfo, EmbInfo, EmbInfoParams, \ - ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_HOT, USE_DYNAMIC_EXPANSION + ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_DYNAMIC_EXPANSION from mx_rec.util.communication.hccl_ops import get_rank_id, get_device_id, get_rank_size from mx_rec.util.initialize import ConfigInitializer diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index eebd70a3..1d922cee 100644 --- a/src/core/key_process/key_process.cpp +++ 
b/src/core/key_process/key_process.cpp @@ -645,17 +645,15 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, Uniqu absl::flat_hash_map hotMap = hotKey[batch->name]; lock.unlock(); - if (rankInfo.useHot) { - int hotOffset = 0; - uniqueInfoOut.hotPos.resize(hotEmbTotCount[batch->name]); - hotOffset = hotEmbTotCount[batch->name]; - - TimeCost computeHotTc; - ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); - LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); - UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, - hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); - } + int hotOffset = 0; + uniqueInfoOut.hotPos.resize(hotEmbTotCount[batch->name]); + hotOffset = hotEmbTotCount[batch->name]; + + TimeCost computeHotTc; + ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); + LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); + UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, + hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); if (rankInfo.useStatic) { sc.resize(rankInfo.rankSize, embInfos[batch->name].sendCount); diff --git a/src/core/utils/common.cpp b/src/core/utils/common.cpp index 38e64444..839d3790 100644 --- a/src/core/utils/common.cpp +++ b/src/core/utils/common.cpp @@ -45,7 +45,6 @@ namespace MxRec { localRankId = rankId % localRankSize; } useStatic = static_cast(option) bitand HybridOption::USE_STATIC; - useHot = static_cast(option) bitand HybridOption::USE_HOT; useDynamicExpansion = static_cast(option) bitand HybridOption::USE_DYNAMIC_EXPANSION; } @@ -58,7 +57,6 @@ namespace MxRec { localRankId = rankId % localRankSize; } useStatic = static_cast(option) & HybridOption::USE_STATIC; - useHot = static_cast(option) & HybridOption::USE_HOT; } RandomInfo::RandomInfo(int start, int len, float constantVal, float randomMin, float randomMax) diff --git a/src/core/utils/common.h b/src/core/utils/common.h index f6c3de3f..9ce80073 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -116,7 +116,6 @@ namespace MxRec { namespace HybridOption { const unsigned int USE_STATIC = 0x001; - const unsigned int USE_HOT = 0x001 << 1; const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 2; }; @@ -220,7 +219,6 @@ namespace MxRec { int localRankId {}; int localRankSize {}; bool useStatic { false }; - bool useHot {}; uint32_t option {}; int nBatch {}; bool isDDR { false }; diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index 4a08f992..0df47092 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -77,8 +77,6 @@ namespace { m.attr("USE_STATIC") = py::int_(HybridOption::USE_STATIC); - m.attr("USE_HOT") = py::int_(HybridOption::USE_HOT); - m.attr("USE_DYNAMIC_EXPANSION") = py::int_(HybridOption::USE_DYNAMIC_EXPANSION); GetRankInfo(m); -- Gitee From 1976ba3979a7040010755053852e83fdf9caa2ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Fri, 12 Apr 2024 01:44:23 +0000 Subject: [PATCH 032/302] =?UTF-8?q?!75=20mxrec=20=E9=9C=80=E6=B1=82?= =?UTF-8?q?=EF=BC=9A=E6=A0=B9=E6=8D=AE=E4=BC=98=E5=8C=96=E5=99=A8=E7=B1=BB?= =?UTF-8?q?=E5=9E=8B=E8=87=AA=E5=8A=A8=E5=88=A4=E6=96=AD=E6=98=AF=E5=90=A6?= =?UTF-8?q?=E5=BC=80=E5=90=AF=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8D=E7=89=B9?= =?UTF-8?q?=E6=80=A7=E3=80=82=20*=20Merge=20remote-tracking=20branch=20'up?= =?UTF-8?q?stream/develop'=20into=20develop-global-unique=20*=20Merge=20re?= 
=?UTF-8?q?mote-tracking=20branch=20'upstream/develop'=20into=20develop-gl?= =?UTF-8?q?obal-unique=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8D?= =?UTF-8?q?=E4=BC=98=E5=8C=96-=E6=A8=A1=E5=9E=8B=E9=80=82=E9=85=8D=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8D=E4=BC=98=E5=8C=96?= =?UTF-8?q?-=E6=A8=A1=E5=9E=8B=E9=80=82=E9=85=8D=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E5=85=A8?= =?UTF-8?q?=E5=B1=80=E5=8E=BB=E9=87=8D=E4=BC=98=E5=8C=96-=E5=85=A8?= =?UTF-8?q?=E9=80=82=E9=85=8D=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8D=E4=BC=98=E5=8C=96?= =?UTF-8?q?-lazyAdam=E9=80=82=E9=85=8D=20*=20Merge=20remote-tracking=20bra?= =?UTF-8?q?nch=20'origin/develop-global-unique'=20into=20devel=E2=80=A6=20?= =?UTF-8?q?*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modificatio?= =?UTF-8?q?n=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8D=E4=BC=98?= =?UTF-8?q?=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E5=85=A8?= =?UTF-8?q?=E5=B1=80=E5=8E=BB=E9=87=8D=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82?= =?UTF-8?q?=E9=85=8D=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8D?= =?UTF-8?q?=E4=BC=98=E5=8C=96-lazyAdam=E9=80=82=E9=85=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91?= =?UTF-8?q?=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8Dcpp=E6=B5=8B=E6=94=B9?= =?UTF-8?q?=E5=8A=A8=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=20Modification=E3=80=91=E5=85=A8=E5=B1=80=E5=8E=BB=E9=87=8Dcpp?= =?UTF-8?q?=E6=B5=8B=E6=94=B9=E5=8A=A8=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91test=20first=20time?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 4 +- examples/demo/little_demo/run_mode.py | 4 +- .../demo/little_demo_estimator/nn_optim.py | 4 +- examples/dlrm/model/gradient_descent_w.py | 10 ++ examples/dlrm/model/main_mxrec.py | 4 +- mx_rec/constants/constants.py | 2 - mx_rec/core/asc/build_graph.py | 50 ---------- mx_rec/core/asc/manager.py | 6 +- mx_rec/core/emb/dynamic_sparse_embedding.py | 9 +- mx_rec/core/emb/sparse_embedding.py | 7 +- mx_rec/optimizers/adagrad.py | 15 ++- mx_rec/optimizers/base.py | 64 ++++++++++++ mx_rec/optimizers/ftrl.py | 15 ++- mx_rec/optimizers/gradient_descent.py | 5 + mx_rec/optimizers/gradient_descent_by_addr.py | 5 + mx_rec/optimizers/lazy_adam.py | 10 +- mx_rec/optimizers/lazy_adam_by_addr.py | 10 +- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 6 +- src/core/key_process/key_process.cpp | 5 +- src/core/utils/common.cpp | 1 + src/core/utils/common.h | 8 +- src/core/utils/config.cpp | 15 +-- src/core/utils/config.h | 7 -- src/pybind/module_main.cpp | 2 + src/tests/utils/config_test.cpp | 4 - tests/mx_rec/core/mock_class.py | 1 + tests/mx_rec/core/test_build_graph.py | 99 +------------------ tests/mx_rec/core/test_manager.py | 8 ++ 28 files changed, 168 insertions(+), 212 deletions(-) diff --git 
a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index d5a51312..0a9462bc 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -336,9 +336,9 @@ if __name__ == "__main__": train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if use_dynamic_expansion: - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) # do sparse optimization by addr sparse_grads = sparse_optimizer.compute_gradients(loss, train_emb_list) # local_embedding diff --git a/examples/demo/little_demo/run_mode.py b/examples/demo/little_demo/run_mode.py index e750ceb5..6a3301c4 100644 --- a/examples/demo/little_demo/run_mode.py +++ b/examples/demo/little_demo/run_mode.py @@ -95,11 +95,11 @@ class RunMode: self.train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))): - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) # do sparse optimization by addr local_grads = tf.gradients(loss, train_emb_list) # local_embedding diff --git a/examples/demo/little_demo_estimator/nn_optim.py b/examples/demo/little_demo_estimator/nn_optim.py index 4438627d..3be3c7ed 100644 --- a/examples/demo/little_demo_estimator/nn_optim.py +++ b/examples/demo/little_demo_estimator/nn_optim.py @@ -73,11 +73,11 @@ def get_train_op_list(losses, learning_rate): # do sparse optimization if use_dynamic_expansion: - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) local_grads = tf.gradients(loss, train_emb_list) # local_embedding grads_and_vars = [(grad, address) for grad, address in zip(local_grads, train_address_list)] diff --git a/examples/dlrm/model/gradient_descent_w.py b/examples/dlrm/model/gradient_descent_w.py index f3ae78d7..6c34b726 100644 --- a/examples/dlrm/model/gradient_descent_w.py +++ b/examples/dlrm/model/gradient_descent_w.py @@ -47,6 +47,16 @@ class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOp super(CustomizedGradientDescentWithWeighDecay, self).__init__( learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name ) + self._slot_num = 0 + self._derivative = 1 + + @property + def slot_num(self): + return self._slot_num + + @property + def derivative(self): + return self._derivative def initialize_slots(self, var, table_instance): logger.info("no slot for gradient descent") diff --git a/examples/dlrm/model/main_mxrec.py 
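The example updates in this commit all follow the same pattern: the lookup registers its local embeddings and id offsets in graph collections, and the training script pairs gradients with those offsets for the by-address sparse optimizer. A runnable sketch of that handshake with stand-in tensors, assuming TF1-style graph mode as used in these examples:

    import tensorflow as tf

    ASCEND_SPARSE_LOOKUP_LOCAL_EMB = "ASCEND_SPARSE_LOOKUP_LOCAL_EMB"
    ASCEND_SPARSE_LOOKUP_ID_OFFSET = "ASCEND_SPARSE_LOOKUP_ID_OFFSET"

    with tf.Graph().as_default():
        # Stand-ins for what sparse_lookup() registers while building the graph.
        local_emb = tf.compat.v1.get_variable("local_emb_stub", shape=[4, 8])
        id_offset = tf.constant([0, 1, 2, 3], dtype=tf.int64)
        tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_emb)
        tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, id_offset)

        # Training-script side: fetch both lists and zip gradients with the
        # id offsets (addresses), as the DCNv2/demo/dlrm mains do.
        train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)
        train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET)
        loss = tf.reduce_sum(train_emb_list[0])
        local_grads = tf.gradients(loss, train_emb_list)
        grads_and_vars = list(zip(local_grads, train_address_list))
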
b/examples/dlrm/model/main_mxrec.py index 4bbd16de..ab2eb04c 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -24,7 +24,7 @@ import tensorflow as tf from sklearn.metrics import roc_auc_score import numpy as np -from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS +from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline from mx_rec.core.embedding import create_table, sparse_lookup @@ -346,7 +346,7 @@ if __name__ == "__main__": train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if use_dynamic_expansion: - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) # do sparse optimization by addr sparse_grads = list(grads[-1]) # local_embedding grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)] diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index 03fa28b4..2c2cd2fe 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -22,7 +22,6 @@ ASCEND_GLOBAL_HASHTABLE_COLLECTION = "ASCEND_GLOBAL_HASHTABLE_COLLECTION" ASCEND_CUTTING_POINT_INITIALIZER = "ASCEND_CUTTING_POINT_INITIALIZER" ASCEND_SPARSE_LOOKUP_ENTRANCE = "ASCEND_SPARSE_LOOKUP_ENTRANCE" ASCEND_SPARSE_LOOKUP_ID_OFFSET = "ASCEND_SPARSE_LOOKUP_ID_OFFSET" -ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS = "ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS" ASCEND_TIMESTAMP = "ASCEND_TIMESTAMP" ASCEND_SPARSE_LOOKUP_LOCAL_EMB = "ASCEND_SPARSE_LOOKUP_LOCAL_EMB" EMPTY_STR = "" @@ -166,7 +165,6 @@ class ASCAnchorAttr(Enum): MOCK_LOOKUP_RESULT = "mock_lookup_result" RESTORE_VECTOR_SECOND = "restore_vector_second" UNIQUE_KEYS = "unique_keys" - GRADIENTS_STRATEGY = "gradients_strategy" IS_GRAD = "is_grad" diff --git a/mx_rec/core/asc/build_graph.py b/mx_rec/core/asc/build_graph.py index 13ddad4a..2bb72621 100644 --- a/mx_rec/core/asc/build_graph.py +++ b/mx_rec/core/asc/build_graph.py @@ -22,7 +22,6 @@ import tensorflow as tf import mxrec_pybind from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.tf_version_adapter import npu_ops -from mx_rec.constants.constants import TRAIN_CHANNEL_ID from mx_rec.util.log import logger @@ -81,46 +80,6 @@ def get_id_offsets(max_lookup_vec_size, config): return id_offsets, swap_pos, swap_len -def get_restore_vector_second(max_lookup_vec_size: int, config: dict) -> tf.Tensor: - """ - Get restore vector which is calculated after the second all2all - :param max_lookup_vec_size: the size of restore_vector_second - :param config: embedding config - :return: the restore vector calculated after the second all2all - """ - logger.debug('Channel %s_restore_second_%s was built for getnext', - config.get("table_name"), config.get("channel_id")) - with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): - restore_vector_second = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_restore_second_{config.get("channel_id")}')[0] - return restore_vector_second - - -def get_unique_keys(max_lookup_vec_size: int, config: dict) -> tf.Tensor: - """ - Get the global unique keys which is calculated after the second all2all - :param max_lookup_vec_size: the size of 
global unique keys - :param config: embedding config - :return: the global unique keys calculated after the second all2all - """ - logger.debug('Channel %s_uniquekeys_%s was built for getnext', config.get("table_name"), config.get("channel_id")) - with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): - if config.get("use_dynamic_expansion"): - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int64], - output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_uniquekeys_{config.get("channel_id")}')[0] - return unique_keys - - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_uniquekeys_{config.get("channel_id")}')[0] - return unique_keys - - def get_all2all_args(use_static: bool, config: dict) -> Optional[list]: """ Get all2all parameters for dynamic condition @@ -211,13 +170,4 @@ def get_preprocessed_tensor_for_asc(table, config): 'all2all_args': all2all_args, } - if config.get("channel_id") != TRAIN_CHANNEL_ID: - return result - - with tf.compat.v1.variable_scope("restore_vector_second"): - restore_vector_second = get_restore_vector_second(max_lookup_vec_size, config) - - with tf.compat.v1.variable_scope("unique_keys"): - unique_keys = get_unique_keys(max_lookup_vec_size, config) - result.update({'restore_vector_second': restore_vector_second, 'unique_keys': unique_keys}) return result diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index 2829ab98..f50037ea 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -18,7 +18,7 @@ import tensorflow as tf from mxrec_pybind import InitializeInfo, ConstantInitializerInfo, NormalInitializerInfo, EmbInfo, EmbInfoParams, \ - ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_HOT, USE_DYNAMIC_EXPANSION + ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_HOT, USE_DYNAMIC_EXPANSION, USE_SUM_SAME_ID_GRADIENTS from mx_rec.util.communication.hccl_ops import get_rank_id, get_device_id, get_rank_size from mx_rec.util.initialize import ConfigInitializer @@ -205,6 +205,10 @@ def initialize_emb_cache(table_info_list, threshold_list): if ConfigInitializer.get_instance().use_dynamic_expansion: option = option | USE_DYNAMIC_EXPANSION + optimizer = ConfigInitializer.get_instance().optimizer_config.optimizer_instance + if optimizer.derivative == 2: + option = option | USE_SUM_SAME_ID_GRADIENTS + # [train_steps, eval_steps, save_steps] pass step information to HybridMgmt for data process loop rank_info = RankInfo(rank_id, device_id, rank_size, option, [train_steps, eval_steps, save_steps]) diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py index 194b2795..671c593e 100644 --- a/mx_rec/core/emb/dynamic_sparse_embedding.py +++ b/mx_rec/core/emb/dynamic_sparse_embedding.py @@ -6,10 +6,9 @@ import abc from typing import Optional, Union, Callable import tensorflow as tf -from tensorflow.python.ops import array_ops from mx_rec.constants.constants import ASCEND_TABLE_NAME_MUST_CONTAIN, ASCEND_SPARSE_LOOKUP_LOCAL_EMB, \ - ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS + ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding @@ -51,9 +50,7 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): def _get_update_grad(self, local_grad: tf.Tensor, 
result: dict, table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]: - return tf.compat.v1.unsorted_segment_sum(local_grad, - result.get("restore_vector_second"), - array_ops.shape(result.get("unique_keys"))[0]) + return local_grad def _get_local_embeddings(self, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict, feature_spec: FeatureSpec, **kwargs) -> tf.Tensor: @@ -72,7 +69,7 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): return sparse_forward_fn(local_embeddings) tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings) - tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS, result.get("unique_keys")) + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get("id_offsets")) return sparse_forward_fn(local_embeddings) diff --git a/mx_rec/core/emb/sparse_embedding.py b/mx_rec/core/emb/sparse_embedding.py index d8ce63b1..938f917d 100644 --- a/mx_rec/core/emb/sparse_embedding.py +++ b/mx_rec/core/emb/sparse_embedding.py @@ -53,11 +53,8 @@ class SparseEmbedding(BaseSparseEmbedding): def _get_update_grad(self, local_grad: tf.Tensor, result: dict, table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]: - unique_local_grad = tf.compat.v1.unsorted_segment_sum(local_grad, - result.get("restore_vector_second"), - array_ops.shape(result.get("unique_keys"))[0]) - return ops.IndexedSlices(values=unique_local_grad, - indices=result.get("unique_keys"), + return ops.IndexedSlices(values=local_grad, + indices=result.get("id_offsets"), dense_shape=tf.shape(table)) def _get_local_embeddings(self, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict, diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py index d99be3b3..4ba444a6 100644 --- a/mx_rec/optimizers/adagrad.py +++ b/mx_rec/optimizers/adagrad.py @@ -76,6 +76,16 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): initial_accumulator_value=initial_accumulator_value, use_locking=use_locking, name=self.unique_name) + self._slot_num = 1 + self._derivative = 2 + + @property + def slot_num(self): + return self._slot_num + + @property + def derivative(self): + return self._derivative def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. 
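manager.py now inspects the optimizer's derivative property (1 for plain gradient descent, 2 for the slot-based optimizers in this series) to decide whether to OR USE_SUM_SAME_ID_GRADIENTS into the option word. A small sketch of the composition and the RankInfo-style decoding; the flag values mirror HybridOption in src/core/utils/common.h for this change set:

    USE_STATIC = 0x0001
    USE_HOT = 0x0001 << 1
    USE_DYNAMIC_EXPANSION = 0x0001 << 2
    USE_SUM_SAME_ID_GRADIENTS = 0x0001 << 3

    class StubOptimizer:
        derivative = 2  # e.g. adagrad/ftrl/lazy adam in this patch

    option = USE_STATIC | USE_DYNAMIC_EXPANSION
    if StubOptimizer.derivative == 2:
        option |= USE_SUM_SAME_ID_GRADIENTS

    # RankInfo decodes the word with bitwise AND (`option bitand FLAG` in C++).
    print(bool(option & USE_STATIC))                 # True
    print(bool(option & USE_HOT))                    # False
    print(bool(option & USE_SUM_SAME_ID_GRADIENTS))  # True
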
@@ -121,10 +131,11 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): def _apply_sparse(self, grad, var): acc = self.get_slot(var, "acc") + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) return training_ops.sparse_apply_adagrad( var, acc, math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), - grad.values, - grad.indices, + unique_local_grad, + unique_keys, use_locking=self._use_locking) def _resource_apply_sparse(self, grad, var, indices): diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index a5d68a70..49594d40 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -21,12 +21,57 @@ from __future__ import print_function from collections import defaultdict +import tensorflow as tf from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops from tensorflow.python.training.optimizer import _TensorProcessor +from mx_rec.util.tf_version_adapter import npu_ops +from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger +def get_restore_vector_second(table_name: str) -> tf.Tensor: + """ + Get restore vector which is calculated after the second all2all + :param table_name: embedding table_name + :return: the restore vector calculated after the second all2all + """ + channel_id = 0 + logger.debug('Channel %s_restore_second_%s was built for getnext', + table_name, channel_id) + with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): + restore_vector_second = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32], + output_shapes=[[None]], + channel_name=f'{table_name}_restore_second_{channel_id}')[0] + return restore_vector_second + + +def get_unique_keys(table_name: str, is_expansion: bool) -> tf.Tensor: + """ + Get the global unique keys which is calculated after the second all2all + :param table_name: embedding table_name + :param is_expansion: use dynamic expansion + :return: the global unique keys calculated after the second all2all + """ + channel_id = 0 + logger.debug('Channel %s_uniquekeys_%s was built for getnext', table_name, channel_id) + with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): + if is_expansion: + unique_keys = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int64], + output_shapes=[[None]], + channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] + return unique_keys + + unique_keys = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32], + output_shapes=[[None]], + channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] + return unique_keys + + class CustomizedOptimizer: name_counter = defaultdict(int) @@ -35,6 +80,25 @@ class CustomizedOptimizer: self.unique_name = "" self.base_name = "" + @staticmethod + def sum_same_id_gradients(grad, var, is_expansion): + if isinstance(var, ops.Tensor): + # 扩容模式从scope获取表名,偏移是-2 + table_name = var.op.name.split('/')[-2] + else: + table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) + table_name = table_instance.table_name + with tf.compat.v1.variable_scope("restore_vector_second"): + restore_vector_second = get_restore_vector_second(table_name) + + with tf.compat.v1.variable_scope("unique_keys"): + unique_keys = get_unique_keys(table_name, is_expansion) + + unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad, + restore_vector_second, + array_ops.shape(unique_keys)[0]) + return unique_local_grad, unique_keys + def initialize_slots(self, var, 
table_instance): raise NotImplementedError(f"Please define a specific realization on {self.__class__.__name__}") diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py index 5c68b929..3659ffcd 100644 --- a/mx_rec/optimizers/ftrl.py +++ b/mx_rec/optimizers/ftrl.py @@ -80,11 +80,16 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): l2_shrinkage_regularization_strength=kwargs.get("l2_shrinkage_regularization_strength", 0.0) ) self._slot_num = 2 + self._derivative = 2 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): val = constant_op.constant( self._initial_accumulator_value, dtype=var.dtype, shape=var.get_shape()) @@ -135,17 +140,19 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): self._resource_scatter_nd_update) def _apply_sparse(self, grad, var): + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) + if self._l2_shrinkage_regularization_strength <= 0.0: return self._apply_sparse_shared( - grad.values, + unique_local_grad, var, - grad.indices, + unique_keys, lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v)) else: return self._apply_sparse_shared_v2( - grad.values, + unique_local_grad, var, - grad.indices, + unique_keys, lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v)) def _apply_sparse_shared(self, grad, var, indices, scatter_nd_update): diff --git a/mx_rec/optimizers/gradient_descent.py b/mx_rec/optimizers/gradient_descent.py index 6881d6ad..2ba72789 100644 --- a/mx_rec/optimizers/gradient_descent.py +++ b/mx_rec/optimizers/gradient_descent.py @@ -55,11 +55,16 @@ class CustomizedGradientDescent(gradient_descent.GradientDescentOptimizer, Custo super(CustomizedGradientDescent, self).__init__(learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name) self._slot_num = 0 + self._derivative = 1 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/gradient_descent_by_addr.py b/mx_rec/optimizers/gradient_descent_by_addr.py index 22b33852..11a9fda6 100644 --- a/mx_rec/optimizers/gradient_descent_by_addr.py +++ b/mx_rec/optimizers/gradient_descent_by_addr.py @@ -60,11 +60,16 @@ class CustomizedGradientDescentByAddr(gradient_descent.GradientDescentOptimizer, name=self.unique_name) self._slot_num = 0 + self._derivative = 1 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index d79b6d23..bab8245f 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -72,11 +72,16 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): super(CustomizedLazyAdam, self).__init__(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, use_locking=use_locking, name=self.unique_name) self._slot_num = 2 + self._derivative = 2 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. 
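The _apply_sparse overrides in this commit funnel their gradients through sum_same_id_gradients() from base.py, whose core is the unsorted_segment_sum call. A numeric sketch with invented values, where restore_vector_second maps each incoming gradient row to its slot in the globally unique key list:

    import tensorflow as tf

    with tf.Graph().as_default():
        grad = tf.constant([[1.0], [2.0], [4.0]])       # rows for keys [7, 9, 7]
        restore_vector_second = tf.constant([0, 1, 0])  # unique slot per row
        unique_keys = tf.constant([7, 9])

        # Duplicate-id rows collapse into one row per unique key.
        unique_local_grad = tf.compat.v1.unsorted_segment_sum(
            grad, restore_vector_second, tf.shape(unique_keys)[0])

        with tf.compat.v1.Session() as sess:
            print(sess.run(unique_local_grad))  # [[5.] [2.]]
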
def creat_one_single_slot(var, op_name): @@ -144,10 +149,11 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): self._resource_scatter_nd_add) def _apply_sparse(self, grad, var): + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) return self._apply_sparse_shared( - grad.values, + unique_local_grad, var, - grad.indices, + unique_keys, lambda x, i, v: tf.compat.v1.scatter_nd_add(x, i, v)) def _apply_sparse_shared(self, grad, var, indices, scatter_nd_add): diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index 92252824..cd4ee878 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -73,11 +73,16 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): name=self.unique_name) self._slot_num = 2 + self._derivative = 2 @property def slot_num(self): return self._slot_num + @property + def derivative(self): + return self._derivative + def get_slot_init_values(self): # return state value list of adam that needs to initialize in ASC DDR. initial_momentum_value = 0.0 @@ -109,9 +114,10 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): return temp def _apply_sparse(self, grad, addr): + unique_local_grad, unique_addr = self.sum_same_id_gradients(grad=grad, var=addr, is_expansion=True) return self._apply_sparse_shared( - grad, - addr) + unique_local_grad, + unique_addr) def _apply_sparse_shared(self, grad, addr): power_b1, power_b2 = self._get_beta_accumulators() diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 894dc230..eb618f40 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -704,8 +704,7 @@ bool HybridMgmt::ParseKeysHBM(int channelId, int& batchId) LOG_DEBUG("channelId:{} batchId:{}, sendLookupSyncTC(ms):{}", channelId, batchId, sendLookupSyncTC.ElapsedMS()); // 训练时,使用全局去重聚合梯度,发送全局去重的key和对应的恢复向量 - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && - channelId == TRAIN_CHANNEL_ID) { + if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID) { SendUniqKeysAndRestoreVecHBM(channelId, batchId, embInfo, infoVecs); } @@ -864,8 +863,7 @@ bool HybridMgmt::ProcessEmbInfo(const std::string& embName, int batchId, int cha LOG_DEBUG("channelId:{} batchId:{}, hostHashMapProcessTC(ms):{}", channelId, batchId, hostHashMapProcessTC.ElapsedMS()); - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && - channelId == TRAIN_CHANNEL_ID && remainBatchOut) { + if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID && remainBatchOut) { SendUniqKeysAndRestoreVecDDR(embName, batchId, channelId, ddrParam); } diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index f76f6907..b72f3c8e 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -470,8 +470,9 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) { - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && - channel == TRAIN_CHANNEL_ID) { + LOG_INFO(KEY_PROCESS "rank:{}, channel:{}, useSumSameIdGradients:{} ...", + rankInfo.rankId, channel, rankInfo.useSumSameIdGradients); + if 
(rankInfo.useSumSameIdGradients && channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; diff --git a/src/core/utils/common.cpp b/src/core/utils/common.cpp index 38e64444..9512b181 100644 --- a/src/core/utils/common.cpp +++ b/src/core/utils/common.cpp @@ -47,6 +47,7 @@ namespace MxRec { useStatic = static_cast(option) bitand HybridOption::USE_STATIC; useHot = static_cast(option) bitand HybridOption::USE_HOT; useDynamicExpansion = static_cast(option) bitand HybridOption::USE_DYNAMIC_EXPANSION; + useSumSameIdGradients = static_cast(option) bitand HybridOption::USE_SUM_SAME_ID_GRADIENTS; } RankInfo::RankInfo(int localRankSize, int option, const vector& maxStep) diff --git a/src/core/utils/common.h b/src/core/utils/common.h index f6c3de3f..9706a699 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -115,9 +115,10 @@ namespace MxRec { using TensorInfoT = std::tuple>>::iterator>; namespace HybridOption { - const unsigned int USE_STATIC = 0x001; - const unsigned int USE_HOT = 0x001 << 1; - const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 2; + const unsigned int USE_STATIC = 0x0001; + const unsigned int USE_HOT = 0x0001 << 1; + const unsigned int USE_DYNAMIC_EXPANSION = 0x0001 << 2; + const unsigned int USE_SUM_SAME_ID_GRADIENTS = 0x0001 << 3; }; string GetChipName(int devID); @@ -226,6 +227,7 @@ namespace MxRec { bool isDDR { false }; bool isSSDEnabled { false }; bool useDynamicExpansion {false}; + bool useSumSameIdGradients {true}; std::vector ctrlSteps; // 包含三个步数: train_steps, eval_steps, save_steps }; diff --git a/src/core/utils/config.cpp b/src/core/utils/config.cpp index 9cfec739..57478553 100644 --- a/src/core/utils/config.cpp +++ b/src/core/utils/config.cpp @@ -20,13 +20,7 @@ See the License for the specific language governing permissions and using namespace std; namespace MxRec { - namespace ApplyGradientsStrategyOptions { - const std::string DIRECT_APPLY = "direct_apply"; - const std::string SUM_SAME_ID_GRADIENTS_AND_APPLY = "sum_same_id_gradients_and_apply"; - }; - // 设置环境变量默认值 - string GlobalEnv::applyGradientsStrategy = ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY; int GlobalEnv::aclTimeout = -1; // 默认阻塞方式,一直等待直到数据接收完成。 int GlobalEnv::hdChannelSize = 40; // 默认通道深度40 int GlobalEnv::keyProcessThreadNum = 6; // 默认6个线程 @@ -42,12 +36,6 @@ namespace MxRec { /// 配置环境变量,Python侧已经做了变量值校验,CPP侧直接使用即可;bool类型,1代表true,0代表false void ConfigGlobalEnv() { - // 设置梯度策略 - const char *envStrategy = getenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY); - if (envStrategy != nullptr) { - GlobalEnv::applyGradientsStrategy = envStrategy; - } - // 设置ACL超时时间 const char *envAclTimeout = getenv(RecEnvNames::ACL_TIMEOUT); if (envAclTimeout != nullptr) { @@ -117,9 +105,8 @@ namespace MxRec { void LogGlobalEnv() { - LOG_DEBUG("Environment variables are: [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], " + LOG_DEBUG("Environment variables are: [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], " "[{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}]", - RecEnvNames::APPLY_GRADIENTS_STRATEGY, GlobalEnv::applyGradientsStrategy, RecEnvNames::ACL_TIMEOUT, GlobalEnv::aclTimeout, RecEnvNames::HD_CHANNEL_SIZE, GlobalEnv::hdChannelSize, RecEnvNames::KEY_PROCESS_THREAD_NUM, GlobalEnv::keyProcessThreadNum, diff --git a/src/core/utils/config.h b/src/core/utils/config.h index 4c56c0d4..3ecb4c36 100644 --- a/src/core/utils/config.h +++ b/src/core/utils/config.h @@ -20,7 +20,6 @@ See the License for the specific language governing permissions and namespace 
MxRec { namespace RecEnvNames { - const char *const APPLY_GRADIENTS_STRATEGY = "APPLY_GRADIENTS_STRATEGY"; const char *const ACL_TIMEOUT = "AclTimeout"; const char *const HD_CHANNEL_SIZE = "HD_CHANNEL_SIZE"; const char *const KEY_PROCESS_THREAD_NUM = "KEY_PROCESS_THREAD_NUM"; @@ -34,13 +33,7 @@ namespace MxRec { const char *const RECORD_KEY_COUNT = "RECORD_KEY_COUNT"; }; - namespace ApplyGradientsStrategyOptions { - extern const std::string DIRECT_APPLY; - extern const std::string SUM_SAME_ID_GRADIENTS_AND_APPLY; - }; - struct GlobalEnv { - static std::string applyGradientsStrategy; static int aclTimeout; static int hdChannelSize; static int keyProcessThreadNum; diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index 4a08f992..cb128a15 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -81,6 +81,8 @@ namespace { m.attr("USE_DYNAMIC_EXPANSION") = py::int_(HybridOption::USE_DYNAMIC_EXPANSION); + m.attr("USE_SUM_SAME_ID_GRADIENTS") = py::int_(HybridOption::USE_SUM_SAME_ID_GRADIENTS); + GetRankInfo(m); GetEmbInfoParams(m); diff --git a/src/tests/utils/config_test.cpp b/src/tests/utils/config_test.cpp index d7e51b57..54e0ec67 100644 --- a/src/tests/utils/config_test.cpp +++ b/src/tests/utils/config_test.cpp @@ -24,7 +24,6 @@ using namespace MxRec; void SetEnvironmentVariables() { - setenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY, "sum_same_id_gradients_and_apply", 1); setenv(RecEnvNames::ACL_TIMEOUT, "100", 1); setenv(RecEnvNames::HD_CHANNEL_SIZE, "50", 1); setenv(RecEnvNames::KEY_PROCESS_THREAD_NUM, "8", 1); @@ -40,7 +39,6 @@ void SetEnvironmentVariables() void UnsetEnvironmentVariables() { - unsetenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY); unsetenv(RecEnvNames::ACL_TIMEOUT); unsetenv(RecEnvNames::HD_CHANNEL_SIZE); unsetenv(RecEnvNames::KEY_PROCESS_THREAD_NUM); @@ -56,7 +54,6 @@ void UnsetEnvironmentVariables() TEST(GlobalEnv, DefaultValues) { - ASSERT_EQ(GlobalEnv::applyGradientsStrategy, ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY); ASSERT_EQ(GlobalEnv::aclTimeout, -1); ASSERT_EQ(GlobalEnv::hdChannelSize, 40); ASSERT_EQ(GlobalEnv::keyProcessThreadNum, 6); @@ -77,7 +74,6 @@ TEST(GlobalEnv, ConfigGlobalEnv) ConfigGlobalEnv(); // 验证环境变量是否已经被正确配置 - ASSERT_EQ(GlobalEnv::applyGradientsStrategy, "sum_same_id_gradients_and_apply"); ASSERT_EQ(GlobalEnv::aclTimeout, 100); ASSERT_EQ(GlobalEnv::hdChannelSize, 50); ASSERT_EQ(GlobalEnv::keyProcessThreadNum, 8); diff --git a/tests/mx_rec/core/mock_class.py b/tests/mx_rec/core/mock_class.py index 7566aa1a..04c9ae56 100644 --- a/tests/mx_rec/core/mock_class.py +++ b/tests/mx_rec/core/mock_class.py @@ -208,6 +208,7 @@ class MockOptimizer: def __init__(self): self.slot_num = 2 + self.derivative = 2 def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index c15d851f..14913cf7 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -156,84 +156,6 @@ class TestGetIdOffsetsFunc(unittest.TestCase): self.assertEqual(swap_len, 0) -class TestGetRestoreVectorSecondFunc(unittest.TestCase): - """ - Test for 'mx_rec.core.asc.build_graph.get_restore_vector_second'. 
- """ - - def setUp(self): - # 默认动态扩容、hot emb、HBM - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") - - def tearDown(self): - # 恢复config - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - - @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") - def test_get_restore_vector_second(self, mock_get_next): - """ - case: test get_restore_vector_second - """ - - from mx_rec.core.asc.build_graph import get_restore_vector_second - - with tf.Graph().as_default(): - mock_get_next.return_value = [0] - restore_vector_second = get_restore_vector_second(self.max_lookup_vec_size, self.config) - self.assertEqual(restore_vector_second, 0) - - -class TestGetUniqueKeysFunc(unittest.TestCase): - """ - Test for 'mx_rec.core.asc.build_graph.get_unique_keys'. - """ - - def setUp(self): - # 默认动态扩容、hot emb、HBM - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") - - def tearDown(self): - # 恢复config - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) - - @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") - def test_get_unique_keys_case1(self, mock_get_next): - """ - case1: 动态扩容 - """ - - from mx_rec.core.asc.build_graph import get_unique_keys - - with tf.Graph().as_default(): - mock_get_next.return_value = [0] - unique_keys = get_unique_keys(self.max_lookup_vec_size, self.config) - self.assertEqual(unique_keys, 0) - - @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") - def test_get_unique_keys_case2(self, mock_get_next): - """ - case2: 非动态扩容 - """ - - from mx_rec.core.asc.build_graph import get_unique_keys - - with tf.Graph().as_default(): - self.config["use_dynamic_expansion"] = False - mock_get_next.return_value = [1] - unique_keys = get_unique_keys(self.max_lookup_vec_size, self.config) - self.assertEqual(unique_keys, 1) - - class TestGetAll2allArgsFunc(unittest.TestCase): """ Test for 'mx_rec.core.asc.build_graph.get_all2all_args'. 
@@ -346,15 +268,12 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, use_hot=True, use_dynamic_expansion=True) - global_env.apply_gradients_strategy = "direct_apply" @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0), - get_restore_vector_second=mock.MagicMock(return_value=0), - get_unique_keys=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case1(self, build_graph_config_initializer): """ @@ -363,23 +282,18 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc - global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer(use_static=True) build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) - self.assertIsNotNone(result.get("restore_vector_second")) - self.assertIsNotNone(result.get("unique_keys")) @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0), - get_restore_vector_second=mock.MagicMock(return_value=0), - get_unique_keys=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case2(self, build_graph_config_initializer): """ @@ -388,23 +302,18 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc - global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer() build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) - self.assertIsNotNone(result.get("restore_vector_second")) - self.assertIsNotNone(result.get("unique_keys")) @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0), - get_restore_vector_second=mock.MagicMock(return_value=0), - get_unique_keys=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case3(self, build_graph_config_initializer): """ @@ -413,7 +322,6 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc - global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with 
tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer() build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) @@ -421,7 +329,6 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): self.config["channel_id"] = 1 result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) - self.assertIsNone(result.get("restore_vector_second")) if __name__ == '__main__': diff --git a/tests/mx_rec/core/test_manager.py b/tests/mx_rec/core/test_manager.py index 815ad843..ffa8b09e 100644 --- a/tests/mx_rec/core/test_manager.py +++ b/tests/mx_rec/core/test_manager.py @@ -385,6 +385,7 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): USE_STATIC=mock.MagicMock(return_value=0), USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), + USE_SUM_SAME_ID_GRADIENTS=mock.MagicMock(return_value=4), RankInfo=mock.MagicMock(return_value="mock_info"), HybridMgmt=mock.MagicMock(return_value=MockHybridMgmt(is_initialized=False))) @mock.patch("mx_rec.core.asc.manager.ConfigInitializer") @@ -398,6 +399,9 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): mock_config_initializer = MockConfigInitializer(use_static=True, use_dynamic_expansion=True) manager_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) + mock_opt = MockOptimizer() + manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt + with self.assertRaises(RuntimeError): initialize_emb_cache([], []) @@ -408,6 +412,7 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): USE_STATIC=mock.MagicMock(return_value=0), USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), + USE_SUM_SAME_ID_GRADIENTS=mock.MagicMock(return_value=4), RankInfo=mock.MagicMock(return_value="mock_info")) @mock.patch("mx_rec.core.asc.manager.ConfigInitializer") @mock.patch("mx_rec.core.asc.manager.HybridMgmt") @@ -421,6 +426,9 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): mock_config_initializer = MockConfigInitializer(use_static=True, use_dynamic_expansion=True) manager_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) + mock_opt = MockOptimizer() + manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt + mock_mgmt = MockHybridMgmt(is_initialized=True) mock_hybrid_mgmt.return_value = mock_mgmt initialize_emb_cache([], []) -- Gitee From 05b163e6bd6c3a6ee8c8e2c8dad88537b215b8dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Fri, 12 Apr 2024 02:31:16 +0000 Subject: [PATCH 033/302] =?UTF-8?q?!74=20mxrec=E6=9E=84=E5=BB=BA=E4=BC=98?= =?UTF-8?q?=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdis?= =?UTF-8?q?t=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA=20*=20mxrec?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=E4=BC=98=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8?= =?UTF-8?q?python3.7=20setup.py=20bdist=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84?= =?UTF-8?q?=E5=BB=BA=20*=20mxrec=E6=9E=84=E5=BB=BA=E4=BC=98=E5=8C=96?= =?UTF-8?q?=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdist=5Fwheel?= =?UTF-8?q?=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA=20*=20mxrec=E6=9E=84?= =?UTF-8?q?=E5=BB=BA=E4=BC=98=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7?= =?UTF-8?q?=20setup.py=20bdist=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA?= =?UTF-8?q?=20*=20mxrec=E6=9E=84=E5=BB=BA=E4=BC=98=E5=8C=96=EF=BC=9A?= =?UTF-8?q?=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdist=5Fwheel=E6=96=B9?= 
=?UTF-8?q?=E5=BC=8F=E6=9E=84=E5=BB=BA=20*=20mxrec=E6=9E=84=E5=BB=BA?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.?= =?UTF-8?q?py=20bdist=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA=20*=20mx?= =?UTF-8?q?rec=E6=9E=84=E5=BB=BA=E4=BC=98=E5=8C=96=EF=BC=9A=E4=BD=BF?= =?UTF-8?q?=E7=94=A8python3.7=20setup.py=20bdist=5Fwheel=E6=96=B9=E5=BC=8F?= =?UTF-8?q?=E6=9E=84=E5=BB=BA=20*=20Merge=20remote-tracking=20branch=20'or?= =?UTF-8?q?igin/develop'=20into=20develop=20*=20Merge=20remote-tracking=20?= =?UTF-8?q?branch=20'origin/develop'=20into=20develop=20*=20Merge=20remote?= =?UTF-8?q?-tracking=20branch=20'origin/develop'=20into=20develop=20*=20Me?= =?UTF-8?q?rge=20remote-tracking=20branch=20'origin/develop'=20into=20deve?= =?UTF-8?q?lop=20*=20Merge=20remote-tracking=20branch=20'origin/develop'?= =?UTF-8?q?=20into=20develop=20*=20Merge=20remote-tracking=20branch=20'ori?= =?UTF-8?q?gin/develop'=20into=20develop=20*=20Merge=20remote-tracking=20b?= =?UTF-8?q?ranch=20'origin/develop'=20into=20develop=20*=20mxrec=E6=9E=84?= =?UTF-8?q?=E5=BB=BA=E4=BC=98=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7?= =?UTF-8?q?=20setup.py=20bdist=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA?= =?UTF-8?q?=20*=20mxrec=E6=9E=84=E5=BB=BA=E4=BC=98=E5=8C=96=EF=BC=9A?= =?UTF-8?q?=E4=BD=BF=E7=94=A8python3.7=20setup.py=20bdist=5Fwheel=E6=96=B9?= =?UTF-8?q?=E5=BC=8F=E6=9E=84=E5=BB=BA=20*=20mxrec=E6=9E=84=E5=BB=BA?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=9A=E4=BD=BF=E7=94=A8python3.7=20setup.?= =?UTF-8?q?py=20bdist=5Fwheel=E6=96=B9=E5=BC=8F=E6=9E=84=E5=BB=BA=20*=20mx?= =?UTF-8?q?rec=E6=9E=84=E5=BB=BA=E4=BC=98=E5=8C=96=EF=BC=9A=E4=BD=BF?= =?UTF-8?q?=E7=94=A8python3.7=20setup.py=20bdist=5Fwheel=E6=96=B9=E5=BC=8F?= =?UTF-8?q?=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 20 ++-- ...ld_tf1_with_opensource.sh => build_tf1.sh} | 51 +-------- ...ld_tf2_with_opensource.sh => build_tf2.sh} | 51 +-------- build/{build.sh => gen_mxrec_tar_pkg.sh} | 102 ++++++------------ build/move_whl_file_2_pkg_dir.sh | 35 ++++++ setup.py | 95 ++++++---------- setup_tf1.py | 96 +++++++++++++++++ setup_tf2.py | 96 +++++++++++++++++ tests/run_python_dt.sh | 2 +- 9 files changed, 310 insertions(+), 238 deletions(-) rename build/{build_tf1_with_opensource.sh => build_tf1.sh} (71%) rename build/{build_tf2_with_opensource.sh => build_tf2.sh} (71%) rename build/{build.sh => gen_mxrec_tar_pkg.sh} (44%) create mode 100644 build/move_whl_file_2_pkg_dir.sh create mode 100644 setup_tf1.py create mode 100644 setup_tf2.py diff --git a/README.md b/README.md index fccc0244..6f49f4ba 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ mxRec作为面向互联网市场搜索推荐广告的应用使能SDK产品,对 ## 安装方式 -安装前,请参考《CANN 软件安装指南CANN 软件安装指南》安装CANN开发套件软件包和TensorFlow适配昇腾插件。 +安装前,请参考《CANN 软件安装指南》安装CANN开发套件软件包和TensorFlow适配昇腾插件。 CANN软件提供进程级环境变量设置脚本,供用户在进程中引用,以自动完成环境变量设置。用户进程结束后自动失效。可在程序启动的Shell脚本中使用如下命令设置CANN的相关环境变量,也可通过命令行执行如下命令(以root用户默认安装路径“/usr/local/Ascend”为例): ```shell @@ -63,12 +63,14 @@ bash run.sh - [openmpi 4.1.5](https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz): 请参考软件文档在编译环境完成安装 - tensorflow 1.15/2.6.5:根据实际需求选择对应版本 -将pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,并且将其分别更名为pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。 
+将pybind11和securec的压缩包放在与mxRec代码同级的opensource目录下,并且将其分别更名为pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在mxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。 为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法: -- build/build.sh:执行脚本完成tf1和tf2版本whl包的构建和打包。执行脚本前,请参考build/build_tf1_with_opensource.sh、build/build_tf2_with_opensource.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 -- build/build_tf1_with_opensource.sh:执行脚本完成tf1版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 -- build/build_tf2_with_opensource.sh:执行脚本完成tf2版本whl包的构建,构建成功后,whl包在/build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 + +进入mxRec代码目录: +- setup.py:执行脚本setup.py,比如:**python3.7 setup.py**完成tf1和tf2版本whl包的构建和打包,构建成功后,whl包在build/mindxsdk-mxrec/目录下,其中tf1_whl和tf2_whl目录下存在对应的whl包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 +- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel**完成tf1版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,请参考build/build_tf1.sh创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 +- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel**完成tf2版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,请参考build/build_tf2.sh创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 如需使用动态扩容功能,进入“./cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。 ```shell @@ -99,8 +101,8 @@ bash run_python_dt.sh - [pybind11 v2.10.3](https://github.com/pybind/pybind11/archive/refs/tags/v2.10.3.zip) - [securec](https://github.com/huaweicloud/huaweicloud-sdk-c-obs/archive/refs/tags/v3.23.9.zip) -将googletest、emock、pybind11和securec的压缩包放在与MxRec代码同级的opensource目录下,并且将其分别更名为googletest-release-1.8.1.zip、 -emock-0.9.0.zip、pybind11-2.10.3.zip、 huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在MxRec同级的目录下手动创建opensource目录, +将googletest、emock、pybind11和securec的压缩包放在与mxRec代码同级的opensource目录下,并且将其分别更名为googletest-release-1.8.1.zip、 +emock-0.9.0.zip、pybind11-2.10.3.zip、 huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在mxRec同级的目录下手动创建opensource目录, 然后将前述几个压缩包放在opensource目录下。 如需使用C++测试用例,需要按照上述描述准备需要的依赖,准备好之后,进入src目录中。参考以下命令执行C++测试用例: @@ -117,11 +119,11 @@ bash test_ut.sh tf2 ## 使用指导 -mxRec所支持的使用环境、功能特性、API接口与使用样例请参考昇腾开源社区MindX SDK产品文档。 +mxRec所支持的使用环境、功能特性、API接口与使用样例请参考mxRec用户指南。 ## 参考设计 -mxrec框架基础镜像,基于TensorFlow 1.15.0、tensorflow2.6.5制作的基础镜像,安装mxrec后即可开始训练,以及样例使用介绍。 +mxRec框架基础镜像,基于TensorFlow 1.15.0、tensorflow2.6.5制作的基础镜像,安装mxRec后即可开始训练,以及样例使用介绍。 1. 
https://ascendhub.huawei.com/#/detail/mxrec-tf1 diff --git a/build/build_tf1_with_opensource.sh b/build/build_tf1.sh similarity index 71% rename from build/build_tf1_with_opensource.sh rename to build/build_tf1.sh index ff59571c..5d6632d6 100644 --- a/build/build_tf1_with_opensource.sh +++ b/build/build_tf1.sh @@ -15,13 +15,11 @@ # ============================================================================== ################################################################## -# build_tf1_with_opensource.sh 编译MxRec和动态扩容算子 +# build_tf1.sh 编译MxRec # 编译环境:Python3.7.5 GCC 7.3.0 CMake 3.20.6 -# 代码主要分为四部分: +# 代码主要分为两部分: # 1、准备编译MxRec所需依赖:pybind11(v2.10.3) securec # 2、编译securec、AccCTR以及MxRec -# 3、生成MxRec Wheel包,生成的whl包在当前目录下的mindxsdk-mxrec/tf1_whl -# 4、编译动态扩容算子 ################################################################## set -e @@ -64,33 +62,6 @@ source /opt/buildtools/tf1_env/bin/activate tf1_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow_core deactivate tf1_env -project_output_path="${MxRec_DIR}"/output/ -VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml - -function get_version() { - if [ -f "$VERSION_FILE" ]; then - VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") - if [[ "$VERSION" == *.[b/B]* ]] && [[ "$VERSION" != *.[RC/rc]* ]]; then - VERSION=${VERSION%.*} - fi - else - VERSION="5.0.0" - fi -} - -rm -rf "${project_output_path}" -rm -rf "${SCRIPT_DIR}/lib" - -# 获取MxRec版本信息 -get_version -export VERSION -echo "MindX SDK MxRec: ${VERSION}" >> ./version.info - -pkg_dir=mindxsdk-mxrec -rm -rf "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - # 配置MxRec C++代码路径和AccCTR路径 src_path="${MxRec_DIR}"/src acc_ctr_path="${MxRec_DIR}"/src/AccCTR @@ -134,19 +105,10 @@ function collect_so_file() cp ${acc_ctr_path}/output/ock_ctr_common/lib/* libasc cp -df "${MxRec_DIR}"/output/*.so* libasc cp "${opensource_path}"/securec/lib/libsecurec.so libasc -} - -function gen_wheel_file() -{ cd "${MxRec_DIR}" touch "${src_path}"/libasc/__init__.py rm -rf "${MxRec_DIR}"/mx_rec/libasc mv "${src_path}"/libasc "${MxRec_DIR}"/mx_rec - python3.7 setup.py bdist_wheel --plat-name=linux_$(arch) - mkdir -p "$1" - echo "moving whl file $1" - mv dist/mx_rec*.whl "$1" - rm -rf "${MxRec_DIR}"/mx_rec/libasc } # start to build MxRec @@ -158,13 +120,4 @@ echo "---------------- compile MxRec so files ----------------" compile_so_file "${tf1_path}" echo "---------------- collect so files and mv them to libasc ----------------" collect_so_file -echo "---------------- generate MxRec wheel package ----------------" -gen_wheel_file "$SCRIPT_DIR"/"${pkg_dir}"/tf1_whl echo "---------------- compile MxRec success!!!! ----------------" - -# start to compile cust op -echo "---------------- start to compile cust op ----------------" -cd "${MxRec_DIR}"/cust_op/cust_op_by_addr -chmod u+x run.sh -./run.sh -echo "---------------- compile cust op success!!!! 
----------------" \ No newline at end of file diff --git a/build/build_tf2_with_opensource.sh b/build/build_tf2.sh similarity index 71% rename from build/build_tf2_with_opensource.sh rename to build/build_tf2.sh index 08aaf164..639024ff 100644 --- a/build/build_tf2_with_opensource.sh +++ b/build/build_tf2.sh @@ -15,13 +15,11 @@ # ============================================================================== ################################################################## -# build_tf2_with_opensource.sh 编译MxRec和动态扩容算子 +# build_tf2.sh 编译MxRec # 编译环境:Python3.7.5 GCC 7.3.0 CMake 3.20.6 -# 代码主要分为四部分: +# 代码主要分为两部分: # 1、准备编译MxRec所需依赖:pybind11(v2.10.3) securec # 2、编译securec、AccCTR以及MxRec -# 3、生成MxRec Wheel包,生成的whl包在当前目录下的mindxsdk-mxrec/tf2_whl -# 4、编译动态扩容算子 ################################################################## set -e @@ -64,33 +62,6 @@ source /opt/buildtools/tf2_env/bin/activate tf2_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow deactivate tf2_env -project_output_path="${MxRec_DIR}"/output/ -VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml - -function get_version() { - if [ -f "$VERSION_FILE" ]; then - VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") - if [[ "$VERSION" == *.[b/B]* ]] && [[ "$VERSION" != *.[RC/rc]* ]]; then - VERSION=${VERSION%.*} - fi - else - VERSION="5.0.0" - fi -} - -rm -rf "${project_output_path}" -rm -rf "${SCRIPT_DIR}/lib" - -# 获取MxRec版本信息 -get_version -export VERSION -echo "MindX SDK MxRec: ${VERSION}" >> ./version.info - -pkg_dir=mindxsdk-mxrec -rm -rf "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - # 配置MxRec C++代码路径和AccCTR路径 src_path="${MxRec_DIR}"/src acc_ctr_path="${MxRec_DIR}"/src/AccCTR @@ -134,19 +105,10 @@ function collect_so_file() cp ${acc_ctr_path}/output/ock_ctr_common/lib/* libasc cp -df "${MxRec_DIR}"/output/*.so* libasc cp "${opensource_path}"/securec/lib/libsecurec.so libasc -} - -function gen_wheel_file() -{ cd "${MxRec_DIR}" touch "${src_path}"/libasc/__init__.py rm -rf "${MxRec_DIR}"/mx_rec/libasc mv "${src_path}"/libasc "${MxRec_DIR}"/mx_rec - python3.7 setup.py bdist_wheel --plat-name=linux_$(arch) - mkdir -p "$1" - echo "moving whl file $1" - mv dist/mx_rec*.whl "$1" - rm -rf "${MxRec_DIR}"/mx_rec/libasc } # start to build MxRec @@ -158,13 +120,4 @@ echo "---------------- compile MxRec so files ----------------" compile_so_file "${tf2_path}" echo "---------------- collect so files and mv them to libasc ----------------" collect_so_file -echo "---------------- generate MxRec wheel package ----------------" -gen_wheel_file "$SCRIPT_DIR"/"${pkg_dir}"/tf2_whl echo "---------------- compile MxRec success!!!! ----------------" - -# start to compile cust op -echo "---------------- start to compile cust op ----------------" -cd "${MxRec_DIR}"/cust_op/cust_op_by_addr -chmod u+x run.sh -./run.sh -echo "---------------- compile cust op success!!!! 
----------------" \ No newline at end of file diff --git a/build/build.sh b/build/gen_mxrec_tar_pkg.sh similarity index 44% rename from build/build.sh rename to build/gen_mxrec_tar_pkg.sh index 0eb688fd..72ccfe49 100644 --- a/build/build.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -18,11 +18,9 @@ set -e warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } ARCH="$(uname -m)" SCRIPT_DIR=$(dirname "$(readlink -f "$0")") -ROOT_DIR=$(dirname "${SCRIPT_DIR}") -cd "$SCRIPT_DIR" +MxRec_DIR=$(dirname "${SCRIPT_DIR}") - -VERSION_FILE="${ROOT_DIR}"/../mindxsdk/build/conf/config.yaml +VERSION_FILE="${MxRec_DIR}"/../mindxsdk/build/conf/config.yaml get_version() { if [ -f "$VERSION_FILE" ]; then VERSION=$(sed '/.*mindxsdk:/!d;s/.*: //' "$VERSION_FILE") @@ -30,96 +28,60 @@ get_version() { VERSION=${VERSION%.*} fi else - VERSION="5.0.0" - fi -} - -remove() -{ - if [ -d "$1" ]; then - rm -rf "$1" - elif [ -f "$1" ]; then - rm -f "$1" + VERSION="6.0.RC2" fi } -project_output_path="${ROOT_DIR}"/output/ -remove "${project_output_path}" -remove "${SCRIPT_DIR}/lib" get_version -export VERSION echo "MindX SDK mxrec: ${VERSION}" >> ./version.info pkg_dir=mindxsdk-mxrec -remove "${pkg_dir}" -mkdir "${pkg_dir}" -mv version.info "${pkg_dir}" - -src_path="${ROOT_DIR}"/src -cd "${ROOT_DIR}" - release_tar=Ascend-"${pkg_dir}"_"${VERSION}"_linux-"${ARCH}".tar.gz +mv version.info "${SCRIPT_DIR}"/"${pkg_dir}" -gen_tar_file() +function gen_tar_file() { - cd "${src_path}" - cp -r "${src_path}"/../cust_op ../build/"${pkg_dir}" - cp -r "${src_path}"/../examples ../build/"${pkg_dir}" + cd "${MxRec_DIR}" + cp -r ./cust_op ./build/"${pkg_dir}" + cp -r ./examples ./build/"${pkg_dir}" # change dirs and files 's permission - chmod 550 ../build/"${pkg_dir}"/tf1_whl - chmod 550 ../build/"${pkg_dir}"/tf1_whl/mx_rec*.whl - chmod 550 ../build/"${pkg_dir}"/tf2_whl - chmod 550 ../build/"${pkg_dir}"/tf2_whl/mx_rec*.whl - chmod 550 ../build/"${pkg_dir}"/cust_op/ - chmod 550 ../build/"${pkg_dir}"/cust_op/cust_op_by_addr - cd ../build/"${pkg_dir}"/cust_op/cust_op_by_addr + chmod 550 ./build/"${pkg_dir}"/tf1_whl + chmod 550 ./build/"${pkg_dir}"/tf1_whl/mx_rec*.whl + chmod 550 ./build/"${pkg_dir}"/tf2_whl + chmod 550 ./build/"${pkg_dir}"/tf2_whl/mx_rec*.whl + chmod 550 ./build/"${pkg_dir}"/cust_op/ + chmod 550 ./build/"${pkg_dir}"/cust_op/cust_op_by_addr + cd ./build/"${pkg_dir}"/cust_op/cust_op_by_addr chmod 550 *.sh chmod 640 *.json chmod 550 op_host op_kernel op_host/* op_kernel/* cd - - cd ../build + cd ./build tar -zvcf "${release_tar}" "${pkg_dir}" || { warn "compression failed, packages might be broken" } - mv "${release_tar}" "${SCRIPT_DIR}"/../output/ + mv "${release_tar}" ../output/ } -clean() +function clean() { - remove "${ROOT_DIR}"/dist - remove "${ROOT_DIR}"/install - remove "${ROOT_DIR}"/mx_rec.egg-info - remove "${ROOT_DIR}"/src/build - remove "${ROOT_DIR}"/build/bdist.linux-"$(arch)" - remove "${ROOT_DIR}"/build/tf2_env - remove "${ROOT_DIR}"/build/tf1_env - remove "${ROOT_DIR}"/build/lib - remove "${ROOT_DIR}"/build/mindxsdk-mxrec + rm -rf "${MxRec_DIR}"/dist + rm -rf "${MxRec_DIR}"/mx_rec.egg-info + rm -rf "${MxRec_DIR}"/src/build + rm -rf "${MxRec_DIR}"/mx_rec/libasc + rm -rf "${MxRec_DIR}"/build/lib + rm -rf "${MxRec_DIR}"/build/bdist.linux-${ARCH} } +gen_tar_file -if [ "$(uname -m)" = "x86_64" ] -then - echo "-----Build gen tar -----" - bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh - bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh - gen_tar_file - echo "-----Build gen tar finished-----" - - # clean - 
echo "-----Done-----" -fi - -if [ "$(uname -m)" = "aarch64" ] -then - echo "-----Build gen tar -----" - bash ${ROOT_DIR}/build/build_tf1_with_opensource.sh - bash ${ROOT_DIR}/build/build_tf2_with_opensource.sh - gen_tar_file - echo "-----Build gen tar finished-----" +clean - # clean - echo "-----Done-----" -fi \ No newline at end of file +# compile cust op +echo "---------------- start to compile cust op ----------------" +cd "${MxRec_DIR}"/cust_op/cust_op_by_addr +chmod u+x run.sh +./run.sh +echo "---------------- compile cust op success!!!! ----------------" \ No newline at end of file diff --git a/build/move_whl_file_2_pkg_dir.sh b/build/move_whl_file_2_pkg_dir.sh new file mode 100644 index 00000000..d489c2fb --- /dev/null +++ b/build/move_whl_file_2_pkg_dir.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e +warn() { echo >&2 -e "\033[1;31m[WARN ][Depend ] $1\033[1;37m" ; } +ARCH="$(uname -m)" +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") +MxRec_DIR=$(dirname "${SCRIPT_DIR}") +pkg_dir=mindxsdk-mxrec +tf_version=$1 + +function move_whl_file_2_pkg_dir() { + mkdir -p "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl + rm -rf "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl/* + mv ${MxRec_DIR}/dist/mx_rec*.whl "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl + cd "$SCRIPT_DIR"/"${pkg_dir}"/"${tf_version}"_whl + whl_file=$(ls .) 
+ mv "$whl_file" "${whl_file/any/linux_${ARCH}}" + cd - +} + +move_whl_file_2_pkg_dir \ No newline at end of file diff --git a/setup.py b/setup.py index efb4c994..87454130 100644 --- a/setup.py +++ b/setup.py @@ -16,64 +16,39 @@ # ============================================================================== import os +import glob import stat -from setuptools import setup, find_packages -import pkg_resources -from setuptools.extern.packaging import version as packaging_version - - -# Patch Version class to preserve original version string -class NoNormalizeVersion(packaging_version.Version): - def __init__(self, version): - self._orig_version = version - super().__init__(version) - - def __str__(self): - return self._orig_version - - -packaging_version.Version = NoNormalizeVersion -# Patch safe_version() to prevent version normalization -pkg_resources.safe_version = lambda v: v - -try: - with open("README.md") as file: - LONG_DESCRIPTION = file.read() -except IOError: - LONG_DESCRIPTION = "" - -env_version = os.getenv("VERSION") -VERSION = env_version if env_version is not None else '5.0.rc3' - -INIT_FILE = "mx_rec/__init__.py" -with open(INIT_FILE, 'r') as file: - lines = file.readlines() - -for idx, line in enumerate(lines): - if "__version__ = " not in line: - continue - lines[idx] = f"__version__ = '{VERSION}'\n" - break - -FLAG = os.O_WRONLY | os.O_TRUNC -MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH -with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: - out.writelines(lines) - -setup( - name='mx_rec', - version=VERSION, - author='HUAWEI Inc', - description='MindX SDK Recommend', - long_description=LONG_DESCRIPTION, - # include mx_rec - packages=find_packages( - where='.', - include=["mx_rec*"] - ), - package_dir={}, - # other file - package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, - # dependency - python_requires='>=3.7.5' -) +import shutil +import subprocess + +# get the absolute path of the Python 3.7 program +res = subprocess.run(["/usr/bin/which", "python3.7"], stdout=subprocess.PIPE, text=True, shell=False) +if res.returncode: + raise RuntimeError("get the absolute path of the Python 3.7 program failed!") +python37_path = res.stdout.strip() + +# add execution permission to the file with the .sh suffix +scripts = glob.glob(os.path.join(os.getcwd(), "build/*.sh")) +for script in scripts: + if os.path.isfile(script): + os.chmod(script, os.stat(script).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + +# clean pkg_dir existed +PKG_DIR = "./build/mindxsdk-mxrec" +if os.path.exists(PKG_DIR): + shutil.rmtree(PKG_DIR) + +# build tf1's wheel file +res = subprocess.run([python37_path, "setup_tf1.py", "bdist_wheel"], shell=False) +if res.returncode: + raise RuntimeError(f"build tf1's wheel file failed!") + +# build tf2's wheel file +res = subprocess.run([python37_path, "setup_tf2.py", "bdist_wheel"], shell=False) +if res.returncode: + raise RuntimeError(f"build tf2's wheel file failed!") + +# copy cust_op, examples files, etc. Then gen mxrec's tar pkg +res = subprocess.run(["./build/gen_mxrec_tar_pkg.sh"], shell=False) +if res.returncode: + raise RuntimeError(f"gen mxrec's tar pkg failed!") diff --git a/setup_tf1.py b/setup_tf1.py new file mode 100644 index 00000000..df8c731e --- /dev/null +++ b/setup_tf1.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import stat +import subprocess +from setuptools import setup, find_packages +import pkg_resources +from setuptools.extern.packaging import version as packaging_version + +script_path = os.getcwd() + + +# Patch Version class to preserve original version string +class NoNormalizeVersion(packaging_version.Version): + def __init__(self, version): + self._orig_version = version + super().__init__(version) + + def __str__(self): + return self._orig_version + + +def safe_version(v): + return v + + +packaging_version.Version = NoNormalizeVersion +# Patch safe_version() to prevent version normalization +pkg_resources.safe_version = safe_version + +try: + with open("README.md") as file: + LONG_DESCRIPTION = file.read() +except IOError: + LONG_DESCRIPTION = "" + +env_version = os.getenv("VERSION") +VERSION = env_version if env_version is not None else '6.0.RC2' + +INIT_FILE = "mx_rec/__init__.py" +with open(INIT_FILE, 'r') as file: + lines = file.readlines() + +for idx, line in enumerate(lines): + if "__version__ = " not in line: + continue + lines[idx] = f"__version__ = '{VERSION}'\n" + break + +FLAG = os.O_WRONLY | os.O_TRUNC +MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH +with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: + out.writelines(lines) + +# compile so files +tf1_script = os.path.join(script_path, "./build/build_tf1.sh") +res = subprocess.run([tf1_script], shell=False) +if res.returncode: + raise RuntimeError("compile so files failed!") + +setup( + name='mx_rec', + version=VERSION, + author='HUAWEI Inc', + description='MindX SDK Recommend', + long_description=LONG_DESCRIPTION, + # include mx_rec + packages=find_packages( + where='.', + include=["mx_rec*"] + ), + # other file + package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, + # dependency + python_requires='>=3.7.5' +) + +move_whl_script = os.path.join(script_path, "./build/move_whl_file_2_pkg_dir.sh") +res = subprocess.run([move_whl_script, "tf1"], shell=False) +if res.returncode: + raise RuntimeError(f"move tf1 whl file to pkg dir failed!") diff --git a/setup_tf2.py b/setup_tf2.py new file mode 100644 index 00000000..31e61a99 --- /dev/null +++ b/setup_tf2.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import stat +import subprocess +from setuptools import setup, find_packages +import pkg_resources +from setuptools.extern.packaging import version as packaging_version + +script_path = os.getcwd() + + +# Patch Version class to preserve original version string +class NoNormalizeVersion(packaging_version.Version): + def __init__(self, version): + self._orig_version = version + super().__init__(version) + + def __str__(self): + return self._orig_version + + +def safe_version(v): + return v + + +packaging_version.Version = NoNormalizeVersion +# Patch safe_version() to prevent version normalization +pkg_resources.safe_version = safe_version + +try: + with open("README.md") as file: + LONG_DESCRIPTION = file.read() +except IOError: + LONG_DESCRIPTION = "" + +env_version = os.getenv("VERSION") +VERSION = env_version if env_version is not None else '6.0.RC2' + +INIT_FILE = "mx_rec/__init__.py" +with open(INIT_FILE, 'r') as file: + lines = file.readlines() + +for idx, line in enumerate(lines): + if "__version__ = " not in line: + continue + lines[idx] = f"__version__ = '{VERSION}'\n" + break + +FLAG = os.O_WRONLY | os.O_TRUNC +MODE = stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH +with os.fdopen(os.open(INIT_FILE, FLAG, MODE), 'w') as out: + out.writelines(lines) + +# compile so files +tf2_script = os.path.join(script_path, "./build/build_tf2.sh") +res = subprocess.run([tf2_script], shell=False) +if res.returncode: + raise RuntimeError("compile so files failed!") + +setup( + name='mx_rec', + version=VERSION, + author='HUAWEI Inc', + description='MindX SDK Recommend', + long_description=LONG_DESCRIPTION, + # include mx_rec + packages=find_packages( + where='.', + include=["mx_rec*"] + ), + # other file + package_data={'': ['tools/*', 'tools/*/*', '*.yml', '*.sh', '*.so*']}, + # dependency + python_requires='>=3.7.5' +) + +move_whl_script = os.path.join(script_path, "./build/move_whl_file_2_pkg_dir.sh") +res = subprocess.run([move_whl_script, "tf2"], shell=False) +if res.returncode: + raise RuntimeError(f"move tf2 whl file to pkg dir failed!") diff --git a/tests/run_python_dt.sh b/tests/run_python_dt.sh index f29bf7b5..139e7ff7 100644 --- a/tests/run_python_dt.sh +++ b/tests/run_python_dt.sh @@ -26,7 +26,7 @@ if [ $ARCH == "aarch64" ]; then fi # build mxRec and get output directory -bash "$TOP_PATH"/build/build_tf1_with_opensource.sh +bash "$TOP_PATH"/build/build_tf1.sh # create libasc directory and copy so files into it cd "$TOP_PATH"/mx_rec -- Gitee From 27252d274752efc6d96268d8b8476934b8861aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Sat, 13 Apr 2024 19:22:39 +0800 Subject: [PATCH 034/302] =?UTF-8?q?=E5=86=92=E7=83=9F=E5=A4=B1=E8=B4=A5?= =?UTF-8?q?=EF=BC=8C=E5=9B=9E=E9=80=80=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 4 +- examples/demo/little_demo/run_mode.py | 4 +- .../demo/little_demo_estimator/nn_optim.py | 4 +- examples/dlrm/model/gradient_descent_w.py | 10 -- examples/dlrm/model/main_mxrec.py | 4 +- mx_rec/constants/constants.py | 2 + mx_rec/core/asc/build_graph.py | 51 ++++++++++ mx_rec/core/asc/manager.py | 6 +- mx_rec/core/emb/dynamic_sparse_embedding.py | 9 +- mx_rec/core/emb/sparse_embedding.py | 7 +- mx_rec/optimizers/adagrad.py | 15 +-- mx_rec/optimizers/base.py | 64 ------------ mx_rec/optimizers/ftrl.py | 15 +-- mx_rec/optimizers/gradient_descent.py | 
5 - mx_rec/optimizers/gradient_descent_by_addr.py | 5 - mx_rec/optimizers/lazy_adam.py | 10 +- mx_rec/optimizers/lazy_adam_by_addr.py | 10 +- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 6 +- src/core/key_process/key_process.cpp | 5 +- src/core/utils/common.cpp | 1 - src/core/utils/common.h | 8 +- src/core/utils/config.cpp | 15 ++- src/core/utils/config.h | 7 ++ src/pybind/module_main.cpp | 2 - src/tests/utils/config_test.cpp | 4 + tests/mx_rec/core/mock_class.py | 1 - tests/mx_rec/core/test_build_graph.py | 99 ++++++++++++++++++- tests/mx_rec/core/test_manager.py | 8 -- 28 files changed, 213 insertions(+), 168 deletions(-) diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index 0a9462bc..d5a51312 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -336,9 +336,9 @@ if __name__ == "__main__": train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if use_dynamic_expansion: - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) # do sparse optimization by addr sparse_grads = sparse_optimizer.compute_gradients(loss, train_emb_list) # local_embedding diff --git a/examples/demo/little_demo/run_mode.py b/examples/demo/little_demo/run_mode.py index 6a3301c4..e750ceb5 100644 --- a/examples/demo/little_demo/run_mode.py +++ b/examples/demo/little_demo/run_mode.py @@ -95,11 +95,11 @@ class RunMode: self.train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))): - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) # do sparse optimization by addr local_grads = tf.gradients(loss, train_emb_list) # local_embedding diff --git a/examples/demo/little_demo_estimator/nn_optim.py b/examples/demo/little_demo_estimator/nn_optim.py index 3be3c7ed..4438627d 100644 --- a/examples/demo/little_demo_estimator/nn_optim.py +++ b/examples/demo/little_demo_estimator/nn_optim.py @@ -73,11 +73,11 @@ def get_train_op_list(losses, learning_rate): # do sparse optimization if use_dynamic_expansion: - from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET + from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) local_grads = tf.gradients(loss, train_emb_list) # local_embedding grads_and_vars = [(grad, address) for grad, address in zip(local_grads, train_address_list)] diff --git a/examples/dlrm/model/gradient_descent_w.py b/examples/dlrm/model/gradient_descent_w.py index 6c34b726..f3ae78d7 100644 --- 
a/examples/dlrm/model/gradient_descent_w.py +++ b/examples/dlrm/model/gradient_descent_w.py @@ -47,16 +47,6 @@ class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOp super(CustomizedGradientDescentWithWeighDecay, self).__init__( learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name ) - self._slot_num = 0 - self._derivative = 1 - - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative def initialize_slots(self, var, table_instance): logger.info("no slot for gradient descent") diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index ab2eb04c..4bbd16de 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -24,7 +24,7 @@ import tensorflow as tf from sklearn.metrics import roc_auc_score import numpy as np -from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET +from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline from mx_rec.core.embedding import create_table, sparse_lookup @@ -346,7 +346,7 @@ if __name__ == "__main__": train_ops.append(dense_optimizer.apply_gradients(avg_grads)) if use_dynamic_expansion: - train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS) # do sparse optimization by addr sparse_grads = list(grads[-1]) # local_embedding grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)] diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index 2c2cd2fe..03fa28b4 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -22,6 +22,7 @@ ASCEND_GLOBAL_HASHTABLE_COLLECTION = "ASCEND_GLOBAL_HASHTABLE_COLLECTION" ASCEND_CUTTING_POINT_INITIALIZER = "ASCEND_CUTTING_POINT_INITIALIZER" ASCEND_SPARSE_LOOKUP_ENTRANCE = "ASCEND_SPARSE_LOOKUP_ENTRANCE" ASCEND_SPARSE_LOOKUP_ID_OFFSET = "ASCEND_SPARSE_LOOKUP_ID_OFFSET" +ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS = "ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS" ASCEND_TIMESTAMP = "ASCEND_TIMESTAMP" ASCEND_SPARSE_LOOKUP_LOCAL_EMB = "ASCEND_SPARSE_LOOKUP_LOCAL_EMB" EMPTY_STR = "" @@ -165,6 +166,7 @@ class ASCAnchorAttr(Enum): MOCK_LOOKUP_RESULT = "mock_lookup_result" RESTORE_VECTOR_SECOND = "restore_vector_second" UNIQUE_KEYS = "unique_keys" + GRADIENTS_STRATEGY = "gradients_strategy" IS_GRAD = "is_grad" diff --git a/mx_rec/core/asc/build_graph.py b/mx_rec/core/asc/build_graph.py index 2bb72621..5e9fea58 100644 --- a/mx_rec/core/asc/build_graph.py +++ b/mx_rec/core/asc/build_graph.py @@ -22,6 +22,7 @@ import tensorflow as tf import mxrec_pybind from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.tf_version_adapter import npu_ops +from mx_rec.constants.constants import TRAIN_CHANNEL_ID from mx_rec.util.log import logger @@ -80,6 +81,46 @@ def get_id_offsets(max_lookup_vec_size, config): return id_offsets, swap_pos, swap_len +def get_restore_vector_second(max_lookup_vec_size: int, config: dict) -> tf.Tensor: + """ + Get restore vector which is calculated after the second all2all + :param max_lookup_vec_size: the size of restore_vector_second + :param config: embedding config + :return: the restore vector calculated after the second all2all + """ + logger.debug('Channel 
%s_restore_second_%s was built for getnext', + config.get("table_name"), config.get("channel_id")) + with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): + restore_vector_second = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32], + output_shapes=[[max_lookup_vec_size]], + channel_name=f'{config.get("table_name")}_restore_second_{config.get("channel_id")}')[0] + return restore_vector_second + + +def get_unique_keys(max_lookup_vec_size: int, config: dict) -> tf.Tensor: + """ + Get the global unique keys which is calculated after the second all2all + :param max_lookup_vec_size: the size of global unique keys + :param config: embedding config + :return: the global unique keys calculated after the second all2all + """ + logger.debug('Channel %s_uniquekeys_%s was built for getnext', config.get("table_name"), config.get("channel_id")) + with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): + if config.get("use_dynamic_expansion"): + unique_keys = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int64], + output_shapes=[[max_lookup_vec_size]], + channel_name=f'{config.get("table_name")}_uniquekeys_{config.get("channel_id")}')[0] + return unique_keys + + unique_keys = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32], + output_shapes=[[max_lookup_vec_size]], + channel_name=f'{config.get("table_name")}_uniquekeys_{config.get("channel_id")}')[0] + return unique_keys + + def get_all2all_args(use_static: bool, config: dict) -> Optional[list]: """ Get all2all parameters for dynamic condition @@ -170,4 +211,14 @@ def get_preprocessed_tensor_for_asc(table, config): 'all2all_args': all2all_args, } + if config.get("channel_id") != TRAIN_CHANNEL_ID: + return result + + with tf.compat.v1.variable_scope("restore_vector_second"): + restore_vector_second = get_restore_vector_second(max_lookup_vec_size, config) + + with tf.compat.v1.variable_scope("unique_keys"): + unique_keys = get_unique_keys(max_lookup_vec_size, config) + result.update({'restore_vector_second': restore_vector_second, 'unique_keys': unique_keys}) + return result diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index f50037ea..2829ab98 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -18,7 +18,7 @@ import tensorflow as tf from mxrec_pybind import InitializeInfo, ConstantInitializerInfo, NormalInitializerInfo, EmbInfo, EmbInfoParams, \ - ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_HOT, USE_DYNAMIC_EXPANSION, USE_SUM_SAME_ID_GRADIENTS + ThresholdValue, HybridMgmt, RankInfo, USE_STATIC, USE_HOT, USE_DYNAMIC_EXPANSION from mx_rec.util.communication.hccl_ops import get_rank_id, get_device_id, get_rank_size from mx_rec.util.initialize import ConfigInitializer @@ -205,10 +205,6 @@ def initialize_emb_cache(table_info_list, threshold_list): if ConfigInitializer.get_instance().use_dynamic_expansion: option = option | USE_DYNAMIC_EXPANSION - optimizer = ConfigInitializer.get_instance().optimizer_config.optimizer_instance - if optimizer.derivative == 2: - option = option | USE_SUM_SAME_ID_GRADIENTS - # [train_steps, eval_steps, save_steps] pass step information to HybridMgmt for data process loop rank_info = RankInfo(rank_id, device_id, rank_size, option, [train_steps, eval_steps, save_steps]) diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py index 671c593e..bf1c6569 100644 --- a/mx_rec/core/emb/dynamic_sparse_embedding.py +++ 
b/mx_rec/core/emb/dynamic_sparse_embedding.py @@ -6,9 +6,10 @@ import abc from typing import Optional, Union, Callable import tensorflow as tf +from tensorflow.python.ops import array_ops from mx_rec.constants.constants import ASCEND_TABLE_NAME_MUST_CONTAIN, ASCEND_SPARSE_LOOKUP_LOCAL_EMB, \ - ASCEND_SPARSE_LOOKUP_ID_OFFSET + ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding @@ -50,7 +51,9 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): def _get_update_grad(self, local_grad: tf.Tensor, result: dict, table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]: - return local_grad + return tf.compat.v1.unsorted_segment_sum(local_grad, + result.get("restore_vector_second"), + array_ops.shape(result.get("unique_keys"))[0]) def _get_local_embeddings(self, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict, feature_spec: FeatureSpec, **kwargs) -> tf.Tensor: @@ -69,7 +72,7 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): return sparse_forward_fn(local_embeddings) tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings) - tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get("id_offsets")) + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_UNIQUE_KEYS, result.get("unique_keys")) return sparse_forward_fn(local_embeddings) diff --git a/mx_rec/core/emb/sparse_embedding.py b/mx_rec/core/emb/sparse_embedding.py index 938f917d..d8ce63b1 100644 --- a/mx_rec/core/emb/sparse_embedding.py +++ b/mx_rec/core/emb/sparse_embedding.py @@ -53,8 +53,11 @@ class SparseEmbedding(BaseSparseEmbedding): def _get_update_grad(self, local_grad: tf.Tensor, result: dict, table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]: - return ops.IndexedSlices(values=local_grad, - indices=result.get("id_offsets"), + unique_local_grad = tf.compat.v1.unsorted_segment_sum(local_grad, + result.get("restore_vector_second"), + array_ops.shape(result.get("unique_keys"))[0]) + return ops.IndexedSlices(values=unique_local_grad, + indices=result.get("unique_keys"), dense_shape=tf.shape(table)) def _get_local_embeddings(self, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict, diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py index 4ba444a6..d99be3b3 100644 --- a/mx_rec/optimizers/adagrad.py +++ b/mx_rec/optimizers/adagrad.py @@ -76,16 +76,6 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): initial_accumulator_value=initial_accumulator_value, use_locking=use_locking, name=self.unique_name) - self._slot_num = 1 - self._derivative = 2 - - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. 
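With this revert, duplicate-ID gradients are summed inside the embedding layer again (the unsorted_segment_sum hunks above), so the optimizer's _apply_sparse consumes grad.values and grad.indices directly, as the next hunk shows. A minimal sketch of that hand-off, with hypothetical shapes:

```python
import tensorflow as tf

# Hypothetical values: one gradient row per unique key, already summed.
unique_grads = tf.constant([[4.0], [7.0], [4.0]])
unique_keys = tf.constant([7, 3, 9])      # row indices into the embedding table
table = tf.zeros([16, 1])

# sparse_embedding._get_update_grad packages the update like this; the
# optimizer then reads it back as grad.values / grad.indices.
sparse_grad = tf.IndexedSlices(values=unique_grads,
                               indices=unique_keys,
                               dense_shape=tf.shape(table))
```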
@@ -131,11 +121,10 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): def _apply_sparse(self, grad, var): acc = self.get_slot(var, "acc") - unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) return training_ops.sparse_apply_adagrad( var, acc, math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), - unique_local_grad, - unique_keys, + grad.values, + grad.indices, use_locking=self._use_locking) def _resource_apply_sparse(self, grad, var, indices): diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index 49594d40..a5d68a70 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -21,57 +21,12 @@ from __future__ import print_function from collections import defaultdict -import tensorflow as tf from tensorflow.python.framework import ops -from tensorflow.python.ops import array_ops from tensorflow.python.training.optimizer import _TensorProcessor -from mx_rec.util.tf_version_adapter import npu_ops -from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger -def get_restore_vector_second(table_name: str) -> tf.Tensor: - """ - Get restore vector which is calculated after the second all2all - :param table_name: embedding table_name - :return: the restore vector calculated after the second all2all - """ - channel_id = 0 - logger.debug('Channel %s_restore_second_%s was built for getnext', - table_name, channel_id) - with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): - restore_vector_second = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[None]], - channel_name=f'{table_name}_restore_second_{channel_id}')[0] - return restore_vector_second - - -def get_unique_keys(table_name: str, is_expansion: bool) -> tf.Tensor: - """ - Get the global unique keys which is calculated after the second all2all - :param table_name: embedding table_name - :param is_expansion: use dynamic expansion - :return: the global unique keys calculated after the second all2all - """ - channel_id = 0 - logger.debug('Channel %s_uniquekeys_%s was built for getnext', table_name, channel_id) - with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE): - if is_expansion: - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int64], - output_shapes=[[None]], - channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] - return unique_keys - - unique_keys = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32], - output_shapes=[[None]], - channel_name=f'{table_name}_uniquekeys_{channel_id}')[0] - return unique_keys - - class CustomizedOptimizer: name_counter = defaultdict(int) @@ -80,25 +35,6 @@ class CustomizedOptimizer: self.unique_name = "" self.base_name = "" - @staticmethod - def sum_same_id_gradients(grad, var, is_expansion): - if isinstance(var, ops.Tensor): - # 扩容模式从scope获取表名,偏移是-2 - table_name = var.op.name.split('/')[-2] - else: - table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) - table_name = table_instance.table_name - with tf.compat.v1.variable_scope("restore_vector_second"): - restore_vector_second = get_restore_vector_second(table_name) - - with tf.compat.v1.variable_scope("unique_keys"): - unique_keys = get_unique_keys(table_name, is_expansion) - - unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad, - restore_vector_second, - array_ops.shape(unique_keys)[0]) - return unique_local_grad, unique_keys - def initialize_slots(self, var, 
table_instance): raise NotImplementedError(f"Please define a specific realization on {self.__class__.__name__}") diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py index 3659ffcd..5c68b929 100644 --- a/mx_rec/optimizers/ftrl.py +++ b/mx_rec/optimizers/ftrl.py @@ -80,16 +80,11 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): l2_shrinkage_regularization_strength=kwargs.get("l2_shrinkage_regularization_strength", 0.0) ) self._slot_num = 2 - self._derivative = 2 @property def slot_num(self): return self._slot_num - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): val = constant_op.constant( self._initial_accumulator_value, dtype=var.dtype, shape=var.get_shape()) @@ -140,19 +135,17 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): self._resource_scatter_nd_update) def _apply_sparse(self, grad, var): - unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) - if self._l2_shrinkage_regularization_strength <= 0.0: return self._apply_sparse_shared( - unique_local_grad, + grad.values, var, - unique_keys, + grad.indices, lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v)) else: return self._apply_sparse_shared_v2( - unique_local_grad, + grad.values, var, - unique_keys, + grad.indices, lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v)) def _apply_sparse_shared(self, grad, var, indices, scatter_nd_update): diff --git a/mx_rec/optimizers/gradient_descent.py b/mx_rec/optimizers/gradient_descent.py index 2ba72789..6881d6ad 100644 --- a/mx_rec/optimizers/gradient_descent.py +++ b/mx_rec/optimizers/gradient_descent.py @@ -55,16 +55,11 @@ class CustomizedGradientDescent(gradient_descent.GradientDescentOptimizer, Custo super(CustomizedGradientDescent, self).__init__(learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name) self._slot_num = 0 - self._derivative = 1 @property def slot_num(self): return self._slot_num - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/gradient_descent_by_addr.py b/mx_rec/optimizers/gradient_descent_by_addr.py index 11a9fda6..22b33852 100644 --- a/mx_rec/optimizers/gradient_descent_by_addr.py +++ b/mx_rec/optimizers/gradient_descent_by_addr.py @@ -60,16 +60,11 @@ class CustomizedGradientDescentByAddr(gradient_descent.GradientDescentOptimizer, name=self.unique_name) self._slot_num = 0 - self._derivative = 1 @property def slot_num(self): return self._slot_num - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index bab8245f..d79b6d23 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -72,16 +72,11 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): super(CustomizedLazyAdam, self).__init__(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, use_locking=use_locking, name=self.unique_name) self._slot_num = 2 - self._derivative = 2 @property def slot_num(self): return self._slot_num - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. 
def creat_one_single_slot(var, op_name): @@ -149,11 +144,10 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): self._resource_scatter_nd_add) def _apply_sparse(self, grad, var): - unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) return self._apply_sparse_shared( - unique_local_grad, + grad.values, var, - unique_keys, + grad.indices, lambda x, i, v: tf.compat.v1.scatter_nd_add(x, i, v)) def _apply_sparse_shared(self, grad, var, indices, scatter_nd_add): diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index cd4ee878..92252824 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -73,16 +73,11 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): name=self.unique_name) self._slot_num = 2 - self._derivative = 2 @property def slot_num(self): return self._slot_num - @property - def derivative(self): - return self._derivative - def get_slot_init_values(self): # return state value list of adam that needs to initialize in ASC DDR. initial_momentum_value = 0.0 @@ -114,10 +109,9 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): return temp def _apply_sparse(self, grad, addr): - unique_local_grad, unique_addr = self.sum_same_id_gradients(grad=grad, var=addr, is_expansion=True) return self._apply_sparse_shared( - unique_local_grad, - unique_addr) + grad, + addr) def _apply_sparse_shared(self, grad, addr): power_b1, power_b2 = self._get_beta_accumulators() diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index eb618f40..894dc230 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -704,7 +704,8 @@ bool HybridMgmt::ParseKeysHBM(int channelId, int& batchId) LOG_DEBUG("channelId:{} batchId:{}, sendLookupSyncTC(ms):{}", channelId, batchId, sendLookupSyncTC.ElapsedMS()); // 训练时,使用全局去重聚合梯度,发送全局去重的key和对应的恢复向量 - if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID) { + if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && + channelId == TRAIN_CHANNEL_ID) { SendUniqKeysAndRestoreVecHBM(channelId, batchId, embInfo, infoVecs); } @@ -863,7 +864,8 @@ bool HybridMgmt::ProcessEmbInfo(const std::string& embName, int batchId, int cha LOG_DEBUG("channelId:{} batchId:{}, hostHashMapProcessTC(ms):{}", channelId, batchId, hostHashMapProcessTC.ElapsedMS()); - if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID && remainBatchOut) { + if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && + channelId == TRAIN_CHANNEL_ID && remainBatchOut) { SendUniqKeysAndRestoreVecDDR(embName, batchId, channelId, ddrParam); } diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index b72f3c8e..f76f6907 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -470,9 +470,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) { - LOG_INFO(KEY_PROCESS "rank:{}, channel:{}, useSumSameIdGradients:{} ...", - rankInfo.rankId, channel, rankInfo.useSumSameIdGradients); - if (rankInfo.useSumSameIdGradients && channel == TRAIN_CHANNEL_ID) { + if (GlobalEnv::applyGradientsStrategy == 
ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && + channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; diff --git a/src/core/utils/common.cpp b/src/core/utils/common.cpp index 9512b181..38e64444 100644 --- a/src/core/utils/common.cpp +++ b/src/core/utils/common.cpp @@ -47,7 +47,6 @@ namespace MxRec { useStatic = static_cast(option) bitand HybridOption::USE_STATIC; useHot = static_cast(option) bitand HybridOption::USE_HOT; useDynamicExpansion = static_cast(option) bitand HybridOption::USE_DYNAMIC_EXPANSION; - useSumSameIdGradients = static_cast(option) bitand HybridOption::USE_SUM_SAME_ID_GRADIENTS; } RankInfo::RankInfo(int localRankSize, int option, const vector& maxStep) diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 9706a699..f6c3de3f 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -115,10 +115,9 @@ namespace MxRec { using TensorInfoT = std::tuple>>::iterator>; namespace HybridOption { - const unsigned int USE_STATIC = 0x0001; - const unsigned int USE_HOT = 0x0001 << 1; - const unsigned int USE_DYNAMIC_EXPANSION = 0x0001 << 2; - const unsigned int USE_SUM_SAME_ID_GRADIENTS = 0x0001 << 3; + const unsigned int USE_STATIC = 0x001; + const unsigned int USE_HOT = 0x001 << 1; + const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 2; }; string GetChipName(int devID); @@ -227,7 +226,6 @@ namespace MxRec { bool isDDR { false }; bool isSSDEnabled { false }; bool useDynamicExpansion {false}; - bool useSumSameIdGradients {true}; std::vector ctrlSteps; // 包含三个步数: train_steps, eval_steps, save_steps }; diff --git a/src/core/utils/config.cpp b/src/core/utils/config.cpp index 57478553..9cfec739 100644 --- a/src/core/utils/config.cpp +++ b/src/core/utils/config.cpp @@ -20,7 +20,13 @@ See the License for the specific language governing permissions and using namespace std; namespace MxRec { + namespace ApplyGradientsStrategyOptions { + const std::string DIRECT_APPLY = "direct_apply"; + const std::string SUM_SAME_ID_GRADIENTS_AND_APPLY = "sum_same_id_gradients_and_apply"; + }; + // 设置环境变量默认值 + string GlobalEnv::applyGradientsStrategy = ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY; int GlobalEnv::aclTimeout = -1; // 默认阻塞方式,一直等待直到数据接收完成。 int GlobalEnv::hdChannelSize = 40; // 默认通道深度40 int GlobalEnv::keyProcessThreadNum = 6; // 默认6个线程 @@ -36,6 +42,12 @@ namespace MxRec { /// 配置环境变量,Python侧已经做了变量值校验,CPP侧直接使用即可;bool类型,1代表true,0代表false void ConfigGlobalEnv() { + // 设置梯度策略 + const char *envStrategy = getenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY); + if (envStrategy != nullptr) { + GlobalEnv::applyGradientsStrategy = envStrategy; + } + // 设置ACL超时时间 const char *envAclTimeout = getenv(RecEnvNames::ACL_TIMEOUT); if (envAclTimeout != nullptr) { @@ -105,8 +117,9 @@ namespace MxRec { void LogGlobalEnv() { - LOG_DEBUG("Environment variables are: [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], " + LOG_DEBUG("Environment variables are: [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], " "[{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}], [{}: {}]", + RecEnvNames::APPLY_GRADIENTS_STRATEGY, GlobalEnv::applyGradientsStrategy, RecEnvNames::ACL_TIMEOUT, GlobalEnv::aclTimeout, RecEnvNames::HD_CHANNEL_SIZE, GlobalEnv::hdChannelSize, RecEnvNames::KEY_PROCESS_THREAD_NUM, GlobalEnv::keyProcessThreadNum, diff --git a/src/core/utils/config.h b/src/core/utils/config.h index 3ecb4c36..4c56c0d4 100644 --- a/src/core/utils/config.h +++ b/src/core/utils/config.h @@ -20,6 +20,7 @@ See the License for the specific language 
governing permissions and namespace MxRec { namespace RecEnvNames { + const char *const APPLY_GRADIENTS_STRATEGY = "APPLY_GRADIENTS_STRATEGY"; const char *const ACL_TIMEOUT = "AclTimeout"; const char *const HD_CHANNEL_SIZE = "HD_CHANNEL_SIZE"; const char *const KEY_PROCESS_THREAD_NUM = "KEY_PROCESS_THREAD_NUM"; @@ -33,7 +34,13 @@ namespace MxRec { const char *const RECORD_KEY_COUNT = "RECORD_KEY_COUNT"; }; + namespace ApplyGradientsStrategyOptions { + extern const std::string DIRECT_APPLY; + extern const std::string SUM_SAME_ID_GRADIENTS_AND_APPLY; + }; + struct GlobalEnv { + static std::string applyGradientsStrategy; static int aclTimeout; static int hdChannelSize; static int keyProcessThreadNum; diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index cb128a15..4a08f992 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -81,8 +81,6 @@ namespace { m.attr("USE_DYNAMIC_EXPANSION") = py::int_(HybridOption::USE_DYNAMIC_EXPANSION); - m.attr("USE_SUM_SAME_ID_GRADIENTS") = py::int_(HybridOption::USE_SUM_SAME_ID_GRADIENTS); - GetRankInfo(m); GetEmbInfoParams(m); diff --git a/src/tests/utils/config_test.cpp b/src/tests/utils/config_test.cpp index 54e0ec67..d7e51b57 100644 --- a/src/tests/utils/config_test.cpp +++ b/src/tests/utils/config_test.cpp @@ -24,6 +24,7 @@ using namespace MxRec; void SetEnvironmentVariables() { + setenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY, "sum_same_id_gradients_and_apply", 1); setenv(RecEnvNames::ACL_TIMEOUT, "100", 1); setenv(RecEnvNames::HD_CHANNEL_SIZE, "50", 1); setenv(RecEnvNames::KEY_PROCESS_THREAD_NUM, "8", 1); @@ -39,6 +40,7 @@ void SetEnvironmentVariables() void UnsetEnvironmentVariables() { + unsetenv(RecEnvNames::APPLY_GRADIENTS_STRATEGY); unsetenv(RecEnvNames::ACL_TIMEOUT); unsetenv(RecEnvNames::HD_CHANNEL_SIZE); unsetenv(RecEnvNames::KEY_PROCESS_THREAD_NUM); @@ -54,6 +56,7 @@ void UnsetEnvironmentVariables() TEST(GlobalEnv, DefaultValues) { + ASSERT_EQ(GlobalEnv::applyGradientsStrategy, ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY); ASSERT_EQ(GlobalEnv::aclTimeout, -1); ASSERT_EQ(GlobalEnv::hdChannelSize, 40); ASSERT_EQ(GlobalEnv::keyProcessThreadNum, 6); @@ -74,6 +77,7 @@ TEST(GlobalEnv, ConfigGlobalEnv) ConfigGlobalEnv(); // 验证环境变量是否已经被正确配置 + ASSERT_EQ(GlobalEnv::applyGradientsStrategy, "sum_same_id_gradients_and_apply"); ASSERT_EQ(GlobalEnv::aclTimeout, 100); ASSERT_EQ(GlobalEnv::hdChannelSize, 50); ASSERT_EQ(GlobalEnv::keyProcessThreadNum, 8); diff --git a/tests/mx_rec/core/mock_class.py b/tests/mx_rec/core/mock_class.py index 04c9ae56..7566aa1a 100644 --- a/tests/mx_rec/core/mock_class.py +++ b/tests/mx_rec/core/mock_class.py @@ -208,7 +208,6 @@ class MockOptimizer: def __init__(self): self.slot_num = 2 - self.derivative = 2 def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index 14913cf7..c15d851f 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -156,6 +156,84 @@ class TestGetIdOffsetsFunc(unittest.TestCase): self.assertEqual(swap_len, 0) +class TestGetRestoreVectorSecondFunc(unittest.TestCase): + """ + Test for 'mx_rec.core.asc.build_graph.get_restore_vector_second'. 
+ """ + + def setUp(self): + # 默认动态扩容、hot emb、HBM + self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_hot=True, use_dynamic_expansion=True) + self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") + + def tearDown(self): + # 恢复config + self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_hot=True, use_dynamic_expansion=True) + + @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") + def test_get_restore_vector_second(self, mock_get_next): + """ + case: test get_restore_vector_second + """ + + from mx_rec.core.asc.build_graph import get_restore_vector_second + + with tf.Graph().as_default(): + mock_get_next.return_value = [0] + restore_vector_second = get_restore_vector_second(self.max_lookup_vec_size, self.config) + self.assertEqual(restore_vector_second, 0) + + +class TestGetUniqueKeysFunc(unittest.TestCase): + """ + Test for 'mx_rec.core.asc.build_graph.get_unique_keys'. + """ + + def setUp(self): + # 默认动态扩容、hot emb、HBM + self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_hot=True, use_dynamic_expansion=True) + self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") + + def tearDown(self): + # 恢复config + self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_hot=True, use_dynamic_expansion=True) + + @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") + def test_get_unique_keys_case1(self, mock_get_next): + """ + case1: 动态扩容 + """ + + from mx_rec.core.asc.build_graph import get_unique_keys + + with tf.Graph().as_default(): + mock_get_next.return_value = [0] + unique_keys = get_unique_keys(self.max_lookup_vec_size, self.config) + self.assertEqual(unique_keys, 0) + + @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") + def test_get_unique_keys_case2(self, mock_get_next): + """ + case2: 非动态扩容 + """ + + from mx_rec.core.asc.build_graph import get_unique_keys + + with tf.Graph().as_default(): + self.config["use_dynamic_expansion"] = False + mock_get_next.return_value = [1] + unique_keys = get_unique_keys(self.max_lookup_vec_size, self.config) + self.assertEqual(unique_keys, 1) + + class TestGetAll2allArgsFunc(unittest.TestCase): """ Test for 'mx_rec.core.asc.build_graph.get_all2all_args'. 
@@ -268,12 +346,15 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, use_hot=True, use_dynamic_expansion=True) + global_env.apply_gradients_strategy = "direct_apply" @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0), + get_restore_vector_second=mock.MagicMock(return_value=0), + get_unique_keys=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case1(self, build_graph_config_initializer): """ @@ -282,18 +363,23 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc + global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer(use_static=True) build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) + self.assertIsNotNone(result.get("restore_vector_second")) + self.assertIsNotNone(result.get("unique_keys")) @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0), + get_restore_vector_second=mock.MagicMock(return_value=0), + get_unique_keys=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case2(self, build_graph_config_initializer): """ @@ -302,18 +388,23 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc + global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer() build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) + self.assertIsNotNone(result.get("restore_vector_second")) + self.assertIsNotNone(result.get("unique_keys")) @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0)) + get_swap_info=mock.MagicMock(return_value=0), + get_restore_vector_second=mock.MagicMock(return_value=0), + get_unique_keys=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case3(self, build_graph_config_initializer): """ @@ -322,6 +413,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc + global_env.apply_gradients_strategy = "sum_same_id_gradients_and_apply" with 
tf.Graph().as_default(): mock_config_initializer = MockConfigInitializer() build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) @@ -329,6 +421,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): self.config["channel_id"] = 1 result = get_preprocessed_tensor_for_asc(None, self.config) self.assertIsNotNone(result.get("restore_vector")) + self.assertIsNone(result.get("restore_vector_second")) if __name__ == '__main__': diff --git a/tests/mx_rec/core/test_manager.py b/tests/mx_rec/core/test_manager.py index ffa8b09e..815ad843 100644 --- a/tests/mx_rec/core/test_manager.py +++ b/tests/mx_rec/core/test_manager.py @@ -385,7 +385,6 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): USE_STATIC=mock.MagicMock(return_value=0), USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), - USE_SUM_SAME_ID_GRADIENTS=mock.MagicMock(return_value=4), RankInfo=mock.MagicMock(return_value="mock_info"), HybridMgmt=mock.MagicMock(return_value=MockHybridMgmt(is_initialized=False))) @mock.patch("mx_rec.core.asc.manager.ConfigInitializer") @@ -399,9 +398,6 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): mock_config_initializer = MockConfigInitializer(use_static=True, use_dynamic_expansion=True) manager_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) - mock_opt = MockOptimizer() - manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt - with self.assertRaises(RuntimeError): initialize_emb_cache([], []) @@ -412,7 +408,6 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): USE_STATIC=mock.MagicMock(return_value=0), USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), - USE_SUM_SAME_ID_GRADIENTS=mock.MagicMock(return_value=4), RankInfo=mock.MagicMock(return_value="mock_info")) @mock.patch("mx_rec.core.asc.manager.ConfigInitializer") @mock.patch("mx_rec.core.asc.manager.HybridMgmt") @@ -426,9 +421,6 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): mock_config_initializer = MockConfigInitializer(use_static=True, use_dynamic_expansion=True) manager_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) - mock_opt = MockOptimizer() - manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt - mock_mgmt = MockHybridMgmt(is_initialized=True) mock_hybrid_mgmt.return_value = mock_mgmt initialize_emb_cache([], []) -- Gitee From 1dffba4b0fd93598d9b0cd903b3dcacc009e732a Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Tue, 16 Apr 2024 15:11:35 +0800 Subject: [PATCH 035/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/key_process/key_process.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 1d922cee..b6e4d5fb 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -48,7 +48,6 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos SetupHotEmbUpdateStep(); - map scInfo; for (const auto& info: eInfos) { embInfos[info.name] = info; @@ -385,7 +384,6 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch 
uniqueInfo.hotPos.resize(hotEmbTotCount[batch->name], -1); tensors->push_back(Vec2TensorI32(uniqueInfo.hotPos)); - if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) : @@ -447,7 +445,6 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, hotPos.resize(hotEmbTotCount[batch->name], 0); tensors->push_back(Vec2TensorI32(hotPos)); - if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(lookupKeys) : Vec2TensorI32(lookupKeys)); -- Gitee From 0f38f2118d4fcafe2f9707b4e288cf849ba3c256 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Tue, 16 Apr 2024 15:29:15 +0800 Subject: [PATCH 036/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/key_process/key_process.cpp | 2 +- src/core/utils/common.h | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index b6e4d5fb..98df97ed 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -650,7 +650,7 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, Uniqu ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, - hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); + hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); if (rankInfo.useStatic) { sc.resize(rankInfo.rankSize, embInfos[batch->name].sendCount); diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 9ce80073..99184fed 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -148,7 +148,11 @@ namespace MxRec { {"910B2", UBSize::ASCEND910_B2}, {"910B3", UBSize::ASCEND910_B3}, {"910B4", UBSize::ASCEND910_B4}, - {"910B2C", UBSize::ASCEND910_B2C}}; + {"910B2C", UBSize::ASCEND910_B2C}, + {"910C1", UBSize::ASCEND910_C1}, + {"910C2", UBSize::ASCEND910_C1}, + {"910C3", UBSize::ASCEND910_C3} + }; auto it = chipUbSizeList.find(GetChipName(devID)); if (it != chipUbSizeList.end()) { return it->second; -- Gitee From 660c945582872750c98b99b494834d83155bb914 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Tue, 16 Apr 2024 20:23:36 +0800 Subject: [PATCH 037/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo/main.py | 4 +-- examples/demo/little_demo/run.sh | 1 - examples/demo/little_demo_estimator/main.py | 4 +-- examples/demo/little_demo_estimator/run.sh | 1 - src/core/utils/common.h | 3 ++- src/tests/key_process/key_process_test.cpp | 8 +++++- tests/mx_rec/core/mock_class.py | 1 - tests/mx_rec/core/test_build_graph.py | 28 ++++++++++----------- tests/mx_rec/core/test_manager.py | 2 -- tools/atomic/sparse_lookup.py | 3 +-- tools/atomic/sparse_lookup_with_grad.py | 3 +-- 11 files changed, 27 
insertions(+), 31 deletions(-) diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index 05d6896f..5d5e151e 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -187,14 +187,13 @@ if __name__ == "__main__": # get init configuration try: use_dynamic = bool(int(os.getenv("USE_DYNAMIC", 0))) - use_hot = bool(int(os.getenv("USE_HOT", 0))) use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))) use_multi_lookup = bool(int(os.getenv("USE_MULTI_LOOKUP", 1))) MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0))) USE_TIMESTAMP = bool(int(os.getenv("USE_TIMESTAMP", 0))) USE_ONE_SHOT = bool(int(os.getenv("USE_ONE_SHOT", 0))) except ValueError as err: - raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_HOT or USE_DYNAMIC_EXPANSION or " + raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_DYNAMIC_EXPANSION or " "USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " "only 0 or 1 is supported.") from err @@ -218,7 +217,6 @@ if __name__ == "__main__": eval_steps=EVAL_STEPS, save_steps=SAVING_INTERVAL, use_dynamic=use_dynamic, - use_hot=use_hot, use_dynamic_expansion=use_dynamic_expansion, if_load=if_load) diff --git a/examples/demo/little_demo/run.sh b/examples/demo/little_demo/run.sh index ab74adb2..e0d1766b 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -104,7 +104,6 @@ export USE_MPI=1 ################# 参数配置 ###################### export USE_DYNAMIC=1 # 0:静态shape;1:动态shape -export USE_HOT=0 # 0:关闭hot emb;1: 开启hot emb export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 export USE_MULTI_LOOKUP=1 # 0:一表一查;1:一表多查 export MULTI_LOOKUP_TIMES=2 # 一表多查次数:默认2,上限127(因为一表已经有一查);仅当export USE_MULTI_LOOKUP=1时生效 diff --git a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py index 5c3c94d1..8df1420c 100644 --- a/examples/demo/little_demo_estimator/main.py +++ b/examples/demo/little_demo_estimator/main.py @@ -158,7 +158,6 @@ if __name__ == '__main__': # get init configuration try: use_dynamic = bool(int(os.getenv("USE_DYNAMIC", 0))) - use_hot = bool(int(os.getenv("USE_HOT", 0))) use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))) use_multi_lookup = bool(int(os.getenv("USE_MULTI_LOOKUP", 1))) MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0))) @@ -166,7 +165,7 @@ if __name__ == '__main__': args.use_one_shot = bool(int(os.getenv("USE_ONE_SHOT", 0))) args.enable_push_ops_test = bool(int(os.getenv("ENABLE_PUSH_OPS_TEST", 0))) except ValueError as err: - raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_HOT or USE_DYNAMIC_EXPANSION or " + raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_DYNAMIC_EXPANSION or " "USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " "only 0 or 1 is supported.") from err @@ -187,7 +186,6 @@ if __name__ == '__main__': init(train_steps=args.train_steps, eval_steps=args.eval_steps, use_dynamic=use_dynamic, - use_hot=use_hot, use_dynamic_expansion=use_dynamic_expansion) args.model_dir = f"{args.model_ckpt_dir}_rank" diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index 33770e59..373b3535 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -93,7 +93,6 @@ fi ################# 参数配置 ###################### export USE_DYNAMIC=1 # 0:静态shape;1:动态shape -export USE_HOT=1 # 
0:关闭hot emb;1: 开启hot emb export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 export USE_MULTI_LOOKUP=1 # 0:一表一查;1:一表多查 export MULTI_LOOKUP_TIMES=2 # 一表多查次数:默认2,上限127(因为一表已经有一查);仅当export USE_MULTI_LOOKUP=1时生效 diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 99184fed..0861cdfc 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -116,7 +116,8 @@ namespace MxRec { namespace HybridOption { const unsigned int USE_STATIC = 0x001; - const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 2; + const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 1 + ; }; string GetChipName(int devID); diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index 6b06dc30..e2d289f4 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -658,6 +658,11 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelper) ASSERT_EQ(CheckMatrixTensor(*all2all, allExpectAll2all), true); ASSERT_EQ(CheckFlatTensor({infoVecs->back()}, allExpectOffset[worldRank]), true); infoVecs->pop_back(); + int64_t hotPosition = process.hotEmbTotCount[batch->name]; + vector expectRestore(allExpectRestore[worldRank].size()); + for(int i=0; ibatchId); // 测试batchId错误 @@ -711,9 +716,10 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelperDDR) auto tmpTensor = (*infoVecs).at(0); auto tmpData = tmpTensor.flat(); + int64_t hotPosition = process.hotEmbTotCount[batch->name]; vector actualGetRestore(col); for (int j = 0; j < col; j++) { - actualGetRestore[j] = tmpData(j); + actualGetRestore[j] = tmpData(j)-hotPosition; } LOG_INFO("KeyProcessTaskHelperDDR, rankid: {}, batchid: {}, Restore: {}", rankInfo.rankId, batch->batchId, VectorToString(actualGetRestore)); diff --git a/tests/mx_rec/core/mock_class.py b/tests/mx_rec/core/mock_class.py index 7566aa1a..1e3e7ba1 100644 --- a/tests/mx_rec/core/mock_class.py +++ b/tests/mx_rec/core/mock_class.py @@ -121,7 +121,6 @@ class MockConfigInitializer: def __init__(self, **kwargs): self.use_dynamic_expansion = kwargs.get("use_dynamic_expansion", False) self.use_static = kwargs.get("use_static", False) - self.use_hot = kwargs.get("use_static", True) self.modify_graph = kwargs.get("modify_graph", True) self.max_steps = kwargs.get("max_steps", -1) self.train_steps = kwargs.get("get_train_steps", -1) diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index c15d851f..08b66c55 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -33,13 +33,13 @@ class TestGetRestoreVectorFunc(unittest.TestCase): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) def test_get_restore_vector_case1(self): """ @@ -115,14 +115,14 @@ class TestGetIdOffsetsFunc(unittest.TestCase): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) self.max_lookup_vec_size = 
self.config.get("send_count") * self.config.get("rank_size") def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") def test_get_id_offsets_case1(self, mock_get_next): @@ -165,14 +165,14 @@ class TestGetRestoreVectorSecondFunc(unittest.TestCase): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") def test_get_restore_vector_second(self, mock_get_next): @@ -197,14 +197,14 @@ class TestGetUniqueKeysFunc(unittest.TestCase): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") def test_get_unique_keys_case1(self, mock_get_next): @@ -243,13 +243,13 @@ class TestGetAll2allArgsFunc(unittest.TestCase): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) def test_get_all2all_args_case1(self): """ @@ -285,13 +285,13 @@ class TestGetSwapInfoFunc(unittest.TestCase): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_swap_info_case1(self, build_graph_config_initializer): @@ -339,13 +339,13 @@ class 
TestGetPreProcessedTensorForAscFunc(unittest.TestCase): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_hot=True, use_dynamic_expansion=True) + use_dynamic_expansion=True) global_env.apply_gradients_strategy = "direct_apply" @mock.patch.multiple("mx_rec.core.asc.build_graph", diff --git a/tests/mx_rec/core/test_manager.py b/tests/mx_rec/core/test_manager.py index 815ad843..a14db367 100644 --- a/tests/mx_rec/core/test_manager.py +++ b/tests/mx_rec/core/test_manager.py @@ -383,7 +383,6 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): get_device_id=mock.MagicMock(return_value=0), get_rank_size=mock.MagicMock(return_value=0), USE_STATIC=mock.MagicMock(return_value=0), - USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), RankInfo=mock.MagicMock(return_value="mock_info"), HybridMgmt=mock.MagicMock(return_value=MockHybridMgmt(is_initialized=False))) @@ -406,7 +405,6 @@ class TestInitializeEmbCacheFunc(unittest.TestCase): get_device_id=mock.MagicMock(return_value=0), get_rank_size=mock.MagicMock(return_value=0), USE_STATIC=mock.MagicMock(return_value=0), - USE_HOT=mock.MagicMock(return_value=1), USE_DYNAMIC_EXPANSION=mock.MagicMock(return_value=2), RankInfo=mock.MagicMock(return_value="mock_info")) @mock.patch("mx_rec.core.asc.manager.ConfigInitializer") diff --git a/tools/atomic/sparse_lookup.py b/tools/atomic/sparse_lookup.py index 570c683e..73ff7f33 100644 --- a/tools/atomic/sparse_lookup.py +++ b/tools/atomic/sparse_lookup.py @@ -28,7 +28,6 @@ from sparse_ops.config import set_ascend_env USE_PIPELINE_TEST = False USE_STATIC = False -USE_HOT = False USE_EXPANSION = False from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET @@ -171,7 +170,7 @@ if __name__ == '__main__': host_vocab_size = 0 init(True, rank_id=rank_id, rank_size=local_rank_size, train_interval=100, eval_steps=-1, - prefetch_batch_number=1, use_dynamic=0, use_hot=1, use_dynamic_expansion=0) + prefetch_batch_number=1, use_dynamic=0, use_dynamic_expansion=0) tf.disable_eager_execution() ###################################### diff --git a/tools/atomic/sparse_lookup_with_grad.py b/tools/atomic/sparse_lookup_with_grad.py index 3d7d37e5..26633abe 100644 --- a/tools/atomic/sparse_lookup_with_grad.py +++ b/tools/atomic/sparse_lookup_with_grad.py @@ -28,7 +28,6 @@ from sparse_ops.config import set_ascend_env USE_PIPELINE_TEST = False USE_STATIC = False -USE_HOT = False USE_EXPANSION = False @@ -173,7 +172,7 @@ if __name__ == '__main__': host_vocab_size = 0 init(True, rank_id=rank_id, rank_size=local_rank_size, train_interval=100, eval_steps=-1, - prefetch_batch_number=1, use_dynamic=0, use_hot=1, use_dynamic_expansion=0) + prefetch_batch_number=1, use_dynamic=0, use_dynamic_expansion=0) tf.disable_eager_execution() ###################################### -- Gitee From e99ae38e3a0503b0720a31e6d3490fa9e4a2e827 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Tue, 16 Apr 2024 20:27:23 +0800 Subject: [PATCH 038/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= 
=?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/key_process/key_process_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index e2d289f4..5cc4b90b 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -663,7 +663,7 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelper) for(int i=0; ibatchId); // 测试batchId错误 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); -- Gitee From bc26150ce17f739c7479e2f0b55fa84464fb7247 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Tue, 16 Apr 2024 20:32:24 +0800 Subject: [PATCH 039/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/key_process/key_process_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index 5cc4b90b..f84dfba9 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -660,8 +660,8 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelper) infoVecs->pop_back(); int64_t hotPosition = process.hotEmbTotCount[batch->name]; vector expectRestore(allExpectRestore[worldRank].size()); - for(int i=0; ibatchId); -- Gitee From a19a1e699423da0bc3ade7cc2a6594d31bfd0103 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Tue, 16 Apr 2024 20:54:23 +0800 Subject: [PATCH 040/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/key_process/key_process_test.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index f84dfba9..8bb21dcd 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -23,6 +23,7 @@ See the License for the specific language governing permissions and #include "ock_ctr_common/include/unique.h" #include "ock_ctr_common/include/error_code.h" #include "emb_table/embedding_mgmt.h" +#include "emock/emock.hpp" using namespace std; using namespace MxRec; @@ -60,6 +61,9 @@ class KeyProcessTest : public testing::Test { protected: void SetUp() { + int defaultUBSize = 196608; + EMOCK(GetUBSize).stubs().with(any()).will(returnValue(defaultUBSize)); + int claimed; MPI_Query_thread(&claimed); ASSERT_EQ(claimed, MPI_THREAD_MULTIPLE); @@ -660,7 +664,7 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelper) infoVecs->pop_back(); int64_t hotPosition = process.hotEmbTotCount[batch->name]; vector expectRestore(allExpectRestore[worldRank].size()); - for(int i=0; i Date: Tue, 16 Apr 2024 21:05:41 +0800 Subject: [PATCH 041/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/key_process/key_process_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index 8bb21dcd..86ec3f80 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -321,6 +321,7 @@ protected: void TearDown() { // delete + GlobalMockObject::reset(); } }; @@ -664,7 +665,7 @@ TEST_F(KeyProcessTest, KeyProcessTaskHelper) infoVecs->pop_back(); int64_t hotPosition = process.hotEmbTotCount[batch->name]; vector expectRestore(allExpectRestore[worldRank].size()); - for(int i = 0; i < expectRestore.size(); i++) { + for (int i = 0; i < expectRestore.size(); i++) { expectRestore[i] = allExpectRestore[worldRank][i] + hotPosition; } ASSERT_EQ(CheckFlatTensor(*infoVecs, expectRestore), true); -- Gitee From 9239843a278b5d34bf8f457f259239b7feae6b75 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Wed, 17 Apr 2024 12:53:29 +0800 Subject: [PATCH 042/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/asc/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index 5f8eeb5d..f1b2df69 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -201,7 +201,7 @@ def initialize_emb_cache(table_info_list, threshold_list): if ConfigInitializer.get_instance().use_static: option = option | USE_STATIC # use hot always True - option = option | USE_STATIC << 1 + # option = option | USE_STATIC << 1 if ConfigInitializer.get_instance().use_dynamic_expansion: option = option | USE_DYNAMIC_EXPANSION -- Gitee From e4b6f672ca6596b362929fd68c05025cfb210c69 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Wed, 17 Apr 2024 14:33:21 +0800 Subject: [PATCH 043/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/asc/manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index f1b2df69..c006f645 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -200,8 +200,6 @@ def initialize_emb_cache(table_info_list, threshold_list): option = 0 if ConfigInitializer.get_instance().use_static: option = option | USE_STATIC - # use hot always True - # option = option | USE_STATIC << 1 if ConfigInitializer.get_instance().use_dynamic_expansion: option = option | USE_DYNAMIC_EXPANSION -- Gitee From 5798ba29a7fbb995c69d8f7eba1b40f786fce438 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Thu, 18 Apr 2024 09:04:15 +0800 Subject: [PATCH 044/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/utils/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/utils/common.h 
b/src/core/utils/common.h index 0861cdfc..3ef0bc65 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -153,7 +153,7 @@ namespace MxRec { {"910C1", UBSize::ASCEND910_C1}, {"910C2", UBSize::ASCEND910_C1}, {"910C3", UBSize::ASCEND910_C3} - }; + }; auto it = chipUbSizeList.find(GetChipName(devID)); if (it != chipUbSizeList.end()) { return it->second; -- Gitee From 8d95edca3bbef48e368ba766538d798f6bb35be1 Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Thu, 18 Apr 2024 14:11:01 +0800 Subject: [PATCH 045/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=80=E6=9C=89?= =?UTF-8?q?=E5=88=A4=E6=96=ADHot=20embed=E7=9A=84=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E9=BB=98=E8=AE=A4=E5=BC=80=E5=90=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/utils/common.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 3ef0bc65..95a76ca5 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -116,8 +116,7 @@ namespace MxRec { namespace HybridOption { const unsigned int USE_STATIC = 0x001; - const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 1 - ; + const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 1; }; string GetChipName(int devID); -- Gitee From 42400d51205c3c50492333b8ad25e3843b0bd989 Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Wed, 17 Apr 2024 07:21:35 +0000 Subject: [PATCH 046/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E7=A4=BA=E4=BE=8B?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=9A=84=20USE=5FMPI=20=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E9=80=89=E9=A1=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: sihaixianyu --- examples/DCNv2/run.sh | 33 +++----------------- examples/demo/little_demo/run.sh | 1 - examples/demo/little_demo_estimator/run.sh | 1 - examples/dlrm/model/run.sh | 36 ++++------------------ 4 files changed, 11 insertions(+), 60 deletions(-) diff --git a/examples/DCNv2/run.sh b/examples/DCNv2/run.sh index f30e0ac6..1709959c 100644 --- a/examples/DCNv2/run.sh +++ b/examples/DCNv2/run.sh @@ -75,8 +75,6 @@ RANK_ID_START=0 export MXREC_MODE="ASC" echo "MXREC_MODE is $MXREC_MODE" -export USE_MPI=1 -echo "USE_MPI is $USE_MPI" export py=main_mxrec.py echo "py is $py" @@ -103,30 +101,9 @@ else export RANK_TABLE_FILE=${hccl_cfg_json} fi -if [ $USE_MPI -eq 0 ]; then - echo "use for loop to start tasks" - for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); - do - #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID - if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then - rm -rf $cur_path/output/${ASCEND_DEVICE_ID} - mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} - else - mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} - fi - nohup python3 ${py} > $cur_path/output/$ASCEND_DEVICE_ID/test_$ASCEND_DEVICE_ID.log 2>&1 & - done -else - echo "use horovod to start tasks" - # GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO - mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' - - horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ - python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p_$(date +%Y%m%d_%H%M%S).log -fi - +echo "use horovod to start tasks" +# GLOG_stderrthreshold -2:TRACE 
-1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO +mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' +horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ +python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p_$(date +%Y%m%d_%H%M%S).log diff --git a/examples/demo/little_demo/run.sh b/examples/demo/little_demo/run.sh index ab74adb2..394ce1f6 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -100,7 +100,6 @@ export TF_CPP_MIN_LOG_LEVEL=3 # tensorflow日志级别,3对应FATAL # 设置应用类日志的全局日志级别及各模块日志级别,具体请参考昇腾官网CANN文档 export ASCEND_GLOBAL_LOG_LEVEL=3 # “设置日志级别”章节0:debug, 1:info, 2:warning, 3:error, 4:NULL export MXREC_MODE="ASC" -export USE_MPI=1 ################# 参数配置 ###################### export USE_DYNAMIC=1 # 0:静态shape;1:动态shape diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index 33770e59..30b5e0c9 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -83,7 +83,6 @@ export TF_CPP_MIN_LOG_LEVEL=3 # tensorflow日志级别,3对应FATAL # 设置应用类日志的全局日志级别及各模块日志级别,具体请参考昇腾官网CANN文档 export ASCEND_GLOBAL_LOG_LEVEL=3 # “设置日志级别”章节0:debug, 1:info, 2:warning, 3:error, 4:NULL export MXREC_MODE="ASC" -export USE_MPI=1 export USE_MODE="train_and_evaluate" # 支持[train, predict, train_and_evaluate] if [ $USE_MODE = "train" ] || [ $USE_MODE = "train_and_evaluate" ];then diff --git a/examples/dlrm/model/run.sh b/examples/dlrm/model/run.sh index 919f0f98..f5cb4449 100644 --- a/examples/dlrm/model/run.sh +++ b/examples/dlrm/model/run.sh @@ -75,37 +75,13 @@ RANK_ID_START=0 export MXREC_MODE="ASC" echo "MXREC_MODE is $MXREC_MODE" -export USE_MPI=1 -echo "USE_MPI is $USE_MPI" export py=main_mxrec.py echo "py is $py" +echo "use horovod to start tasks" +# GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO +mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' +interface="lo" -if [ $USE_MPI -eq 0 ]; then - echo "use for loop to start tasks" - for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++)); - do - #设置环境变量,不需要修改 - echo "Device ID: $RANK_ID" - export RANK_ID=$RANK_ID - export ASCEND_DEVICE_ID=$RANK_ID - ASCEND_DEVICE_ID=$RANK_ID - if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then - rm -rf $cur_path/output/${ASCEND_DEVICE_ID} - mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} - else - mkdir -p $cur_path/output/${ASCEND_DEVICE_ID} - fi - nohup python3 ${py} > $cur_path/output/$ASCEND_DEVICE_ID/test_$ASCEND_DEVICE_ID.log 2>&1 & - done -else - echo "use horovod to start tasks" - # GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO - mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' - interface="lo" - - horovodrun --network-interface ${interface} -np ${RANK_SIZE} --mpi-args "${mpi_args}" --mpi -H localhost:${RANK_SIZE} \ - python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${RANK_SIZE}p.log -fi - - +horovodrun --network-interface ${interface} -np ${RANK_SIZE} --mpi-args "${mpi_args}" --mpi -H localhost:${RANK_SIZE} \ +python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${RANK_SIZE}p.log -- Gitee From 
d47551445874f9208cb722a5e516701ba4635eea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=95=E9=9C=96?=
Date: Fri, 19 Apr 2024 14:32:28 +0800
Subject: [PATCH 047/302] =?UTF-8?q?README=E4=B8=AD=E6=B7=BB=E5=8A=A0mxRec?=
 =?UTF-8?q?=E7=94=A8=E6=88=B7=E6=8C=87=E5=8D=97=E7=A4=BE=E5=8C=BA=E9=93=BE?=
 =?UTF-8?q?=E6=8E=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6f49f4ba..fd3b0691 100644
--- a/README.md
+++ b/README.md
@@ -119,7 +119,7 @@ bash test_ut.sh tf2

 ## 使用指导

-mxRec所支持的使用环境、功能特性、API接口与使用样例请参考mxRec用户指南。
+mxRec所支持的使用环境、功能特性、API接口与使用样例请参考[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0001.html)。

 ## 参考设计

--
Gitee

From 3a07cefc4a4ee873a07a408389a383e361034b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=95=E9=9C=96?=
Date: Fri, 19 Apr 2024 08:39:53 +0000
Subject: [PATCH 048/302] =?UTF-8?q?!89=20README=E4=B8=AD=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?mxRec=E7=94=A8=E6=88=B7=E6=8C=87=E5=8D=97=E7=A4=BE=E5=8C=BA?=
 =?UTF-8?q?=E9=93=BE=E6=8E=A5=20*=20Merge=20remote-tracking=20branch=20'or?=
 =?UTF-8?q?igin/develop'=20into=20develop=20*=20README=E4=B8=AD=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0mxRec=E7=94=A8=E6=88=B7=E6=8C=87=E5=8D=97=E7=A4=BE?=
 =?UTF-8?q?=E5=8C=BA=E9=93=BE=E6=8E=A5=E4=BB=A5=E5=8F=8A=E6=9B=B4=E6=96=B0?=
 =?UTF-8?q?=E5=85=AC=E7=BD=91=E5=9C=B0=E5=9D=80=20*=20README=E4=B8=AD?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0mxRec=E7=94=A8=E6=88=B7=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?=E7=A4=BE=E5=8C=BA=E9=93=BE=E6=8E=A5=20*=20README=E4=B8=AD?=
 =?UTF-8?q?=E6=B7=BB=E5=8A=A0mxRec=E7=94=A8=E6=88=B7=E6=8C=87=E5=8D=97?=
 =?UTF-8?q?=E7=A4=BE=E5=8C=BA=E9=93=BE=E6=8E=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                     |   2 +-
 ...\347\256\261\345\234\260\345\235\200.xlsx" | Bin 19596 -> 14736 bytes
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6f49f4ba..fd3b0691 100644
--- a/README.md
+++ b/README.md
@@ -119,7 +119,7 @@ bash test_ut.sh tf2

 ## 使用指导

-mxRec所支持的使用环境、功能特性、API接口与使用样例请参考mxRec用户指南。
+mxRec所支持的使用环境、功能特性、API接口与使用样例请参考[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0001.html)。

 ## 参考设计

diff --git "a/docs/MindX\342\200\242SDK\342\200\2426.0.RC1\342\200\242mxRec\342\200\242\345\205\254\347\275\221\345\234\260\345\235\200\345\222\214\351\202\256\347\256\261\345\234\260\345\235\200.xlsx" "b/docs/MindX\342\200\242SDK\342\200\2426.0.RC1\342\200\242mxRec\342\200\242\345\205\254\347\275\221\345\234\260\345\235\200\345\222\214\351\202\256\347\256\261\345\234\260\345\235\200.xlsx"
index 2fa2165b6f608106df678d1afd31562884a3029d..de085d900879882e548b9fa5cd539bb2f698cc0f 100644
GIT binary patch
literal 14736
[base85-encoded binary data for the updated .xlsx omitted]

literal 19596
[base85-encoded binary data for the previous .xlsx omitted]
zgWjNJ#_*FFLJ=qTL!SeX-1z@(4T_cC4srp|Br}|ZOeHE>wLNbKu;D|7N5(k`e2S16 zL{a8-Bcx5~It+y2Bvk0nz@DYJvF8R@UquAVfMEKG$?AA*&)xSL#EA%&X`7K6o0^)u z1895lWzF<+iK{$5c?WZ7L-po-SpdVa*UnfZ%0xQ~yt@LK@i1_7bnt3)tJBK=)yd(@iQ@~7!XGKjU4`>K*MFunJ$SYH z8JHm zfW?vJ!GQ;`0Ezj@>p%VeqL9LzNYq1f<;XI4-^LRCFTYQH822w2pud>jXP^UfqWsL_ z`!;j5+QMImOl;RBm!F-ND2yiN|4nCg(7*TJ95@g|_iCm7e4zvs%iQMhEdPfE`vufz zjxp>#<4k&!kOQgM2U=U7Ng@?<%zgng#+%sKay!^_^?UmHHwQIUwavdre|i0xLeapzk%dhF z5-0Wkli8{N?nhwPd-nySj=7TNzvQQmv>5rXkuiVaQuvAB@*e=bZ7BXN(5F|=SCd#q zk{KETSmq$RNL2DC8UnjvgGxuW?_WbM>0hVR|Ge`29bD8q6Rd+*)1O%v$e0-;{dF@= z`gansC5UN-=(~Vo1%_BN=j=YX51km7Q-XgNK7dS zP=xgYB1IxZqaDof08+Z#iEWT12MOWN-v*zb}ZUEhsS)-ywp?J9=&2{virH8VWE+Rj#NZ0AOe*w35YS#v>U+* zGWqKTK7quJK6*A>*75-S1NN;teK6CZ(Tl$jX;?ugiJSVz@rD_Oy4zAG zL2SfdoN<_L4Y!}}=dDCvWY%JMe!}dqg#T_1>LyH3PXw&C!LH-Jga@$k5BG=DkcR78 zucuf`G6eNlKX;ek!OGf=bmAm^XW&4&sMv#IQ-#TAfMO=q3xm^InUI0Xh6E?^qtl_#EJd+ ztDe{e_A)Tlnw5j~Qawm{`RxOwh#y8k|7m=xJ75);vf)@h=8t|D!Xu|O5|$eQZ5L=h z!(bE!L9tQZtP=FwR-;hk4zvvGGp6vAnq2*-_%t`Ys+C|Z69!+);3Dj0tVoZYgUVpdb`eVBhAg zwd6Q0cJsX9DB~Y7=CtzDgR(G>fD^S1MyDd=OhpK4p4A*3=2(b(B1?LnC3dXs%X~KQ zsI_X9Reo9d}% zb|s9w$U@w*cB00S2t%PC9HUx6ili=;X(VEwo3Bqj^MOdMJC1$3m{ri$*I5zKsZ*^f`0=bXT^yy@-S}U#Xi7+vyiy?ky zVV*qYW$D5yZi~$H#qP94r?~mPHn7+UB^NOLdKjjFe#6b7ccTUq(rSV6mq)?F-n`z} zDy=f%Suu$jWRNikQrVi=LqkVetg*6j@RiJ0R!3s1QS(V+aS}P%!SekPn!x4r#C9+0 z8r#6e7#CkDV8!&GSzCp{r|A9V(p+%*W<$$_r1=W#ySiQ&Cg!lY=@hQ(LIkMGtMut( zY7%ol5))&>+KbYKNAsk70@w=zla-zE|8o5F$}(jGD|5;Rk}4>`MP-bgh~HDrJ{4s z?ykUuJl9?rw#$s2SJVF!%-A_kY;9sGS+D@0|Ibtt1y&m8tSWFBe7?Y z(sbz#FwNoeO7)mX!vM!*pk61I}}n4Nu1cE-Om$jmJ~s84=hMu7+G~W3UaPBqT!onla6G-@g|sZ~u5t3nC)_ z@d283Hqz=dgi)vH9+s5z65+B{CLm8Djy32OJYWh@Z$&5p zmH{Gah$P7ZzT7wXY#kh|ZzU#;9!f}-Y8i-c3<0HAayNH#q0Qri)$abj`?^(k6T+9p z=kEA^37$WK$Sri#kLLL_R_x<>Cx7`eQ>DZEz8a0r`kwq*7k?_Ac>PYES13%^>3V%q zM@HBAy4z%f@39Gv18eD;O`;0IcBy9j^V?=RT!aM&-sM94;Xo)m5;@o!-W)C_S2wnU zkXJLQbAU^T;KwCk)z~>#=G?V5+VG<|;8iGNO&x zv*9L!8HJ-5(%{Id2|>Yf*w|HO&d{%k6INcQSUH4H_lOIFyGhci{ z%$JDMv8TyDPJw|^vh+s$CV@ehL#w-+Rf)GJlf18Hd__0?!uH}tyqTh*4A@J?Gs+Qh zqakOp%#Ahpp%ii`pkz|dn**a22zPL9-;e${ZV)btKirGhVsjQ-O~(-&k3@LbyTLMr zbaimx5|_8ZI64*Ukddq{w~xVGEISi(Z}w|&hfs~VBcJ;$=B=0}W$FG3zx#Hq72>^+ z+t<`7Xj8i9(@96C!Qg1tfXTdK)$t_r>T$u4TTga?f)Cm8hF1L?(Mta-@RlDL9lVA* zSXAjx&RUN&JEb8h(E>9f{~r(>_`b-}S#VBC>!^W!SG7`=d{YKL-niaC_tr4p5?aNi z(J+U6ZG*fxKuPETU1Egp`&`7N6A?Ta@X+m8Muz0-x9V)2nPS&q=xo}AGFz07=;lJx znmF6E<)dOVTQC>y>P-Zbo*v!`8hY~N(p+U2eNl4;?uTq8oJ zjf+_Fnsv2rl>3aor)zU_LW4Q}u_4Scr=Xle2@w_QzUsmP%$$YRR8gXGyQ2%ku_e7g z!hFzA=t=BLiV8ACBNo(3(HAsACp}_To%b)(R;jPD$g<-uLV#%I2&C!--V04bIENR# zw5RalmbX&&Xt-=D&(OS68O7cTYIoXK;`Vh|EWIe&USHhqy)vRLz5CH1CVXUXsI5(g zdOw#L4!~Y4U}OU1FQ4G+cQS~+Z#fc>o}8f?5T*`gg~fTIEpHJFi@h*#CNPZM{17KH zWt3<Ap5>QJJE_wLe;YNKI8jbCiNQvZc?k*6Tb{DjceJ*Hpon0kYif z8X>~|z|Ug415AA-iBB|F(azg!(LPn0Nb3yboS*{Km@&#At=0}vV6<#M%VM=fVA}&* zWwG3NHXuP_4&6Xy_i|zdxF}apy4Zr!VR17Kx~r@PEx7ahUbas7qeAxsv>WUpccgx6 z*h!?29`}2|{RhZ@mh?n>>*uRJOM0|6005txuN#Gfqno9X!&e<|P0J!0TLtqys^15L z!yg`E0MElS)zPJ|(nYisB{vi#5QH3SWoHf1Aesx0tW7u($P5Anv;+!)JdwOY;PHE- z975zW=^Il`-AhuiQ#g3THiYgx~gIzqeKMlZrw%>FaFs#tv zG+kUv6tSG(B070nP2nQ}U#IIbJI{MnOuCq{ID^-55%W5gN)txze(11)gGToidXquN zcb>mGa#K5>zRS6cHJYWKBkn21JZDN)UBFw4+Gyk)=jMZDY%%t>+BoBQ7(T52>@GEY za5o+ZH6>q=jIx^{S(k^_zz(p}IcXy#^f(S%A_pa;*O{o+i8uAp>A9<=%MhDgJv$h^ z9w{ZEXp(qYs7*bwIACqNN~a^k`XhA@Up~p5ZWHfmu8HaX0mMkc+p&q&{oWU}C zJeLM`K3idWE9jgMDdO>B(wP`-4sM>*1ay$Nuj+@cLQ5^N5@{*FdG9)qJIX zx+ONi>RscZ_^r1M(n3Z_i=(6^=TUY$oP$$MvWaNfEHBUJgSO)AE)VPZac_^hoSrVm zS7OcmgM7P)XwO)ooFXG6aa2ioiSTYx2utHShWzjH<>Tj*5xvWcnbKIcFqh_}vcqbC 
z*7|0@IG^@en6Hz?cil$IXqMYqS)*H{M$jCbNy%V2W+tQ^l9GIfOO@xa?t+3~Q}=fQ zAuQalpOw3uC+!=f556y>fU^V>d8$fA#G%fh^ zs@<)wlAgBm{KTw!bO<$XfKZV#w~Jk0#v38`l_L&bDN&{i3;+wCKE@`hPJx@ zBo2IculytZ-)87*zwi5CPRRI+y}#*+T&o}%llXko)Bb@Dkd&s3_4h#JT8ClO5dt>g zzVXaaPwU6^!tyRDOiG7`26obvgWlwE$_x1JOhbqvghd-|`L30fqqNrs-XWT15>WVN zyj-t8X}Oj)+A6TB2h6jt2>Z7mp*vX;fLIy)fO0S;ye9i4r$qBK^DcxBioe?d ziUy)xHeR2KDZ{x_)RVx{1ehDhf#l7X>YgbRx1Mf64vOa*3Div{31E)=mx@ah=0*fW z-f>oBf?i{Jzl~F2QvAq6LX8d1B^Or44q)#k8*lO>^vbu0deP&A72WSY2mJ}5p&caU zO%7YQDrcV9dT>)-3(VCl(({0(5jlz_AQVdz7)oFOf%u!-(vZqb%}NMm$x;Xe^K)y` zP-79Thb|W?$`==!OAuB(Ep#UJ4N)~sB7SZ@fs!IA->h2RuaYUNP8-SsN@ZilYCoJT z^~7Z}hi{RA`ULB!`K=n=849(Q;-osnS-j_+r8XegvY#lqx^_p$9rydi$UK;3$`bx< zB;e@?1J1%}j1~{>qq(bw9h?V~C2H#hMPyY0e8_w(*ma5H1@^D*&a+7wnwL)8!yaob zC(DJyW=B44%u%koQb*`H^;47@dN znL092lu<3$QlU>{80RlY86+U|tWgsVUjmOQG05y+F99*4*;9_bRi-NFVL6O32V=Id z%%s#K=j{T#*dqIl7=`=>N}kPKJrToV$i>ec5|aQJxi!d$qTmQuu!HQNPN*}S-h8$W z5&_&(I%FnocIxpZ;ZfG18Y~r~8hk68foiyESEjl{H6J0&Cf4bJFS}9FG$8MThg3Jy z)KaA%-cIAg^d9^f^{Xd95^Zfap|Oxb49g_a4@a4Xe#C=dR=3h^)bei&@`H&<7|EK*nHn@Lq+BZM+o1=IM_ab38;#&w zarJ^6V?wGUuZ=tq$%&uqzq{<)PgWk)NuMj*{+pG1U7F9;X6QlFS&#aD(-}Wv)#tiL zz1QhoV~|qF2ey9HW+i~Q^SZG(Q01`W&b?j{eqwvHyhKoVxcb>+k?SIjXFK|p8M?){ zi%ny%VolJF(`9e0VCXh_DOylTghsgQ+8!Nq}{vxIO2j-Hy~JVrP@J8CJvcHK18b|Lj)YxDfh2gJ$9A8^>-S8B|h z>1}&)+s6ln;+TjL3$f;N{YWfl?f$|*7g@Q+xSWQ7pT$mbN-iEDCuTwKOfUzBu;oFG zJV+FT`Z823>f`zPQBSPb00;~YDDFtKso2Mjx#iJCWasKG2>s;E#2QQTPL}`&wKuQ{q2YTd2EuSc`e|Ki5nmal;909ht$>(bOC_W}| zKtMP&Xb3$!^YsQ!*YkRJ>nP+R`}G+aOoN>#yiQy-jB3wDOqe1THX1dh3`Iyl+@x=( zn6h7%ClfZBBIOL(l$f5ZJQ)>FRf(`#zXPc@SD_KcU>z~BE|GSgDgri|i(OPoSEgJA z1G`z^vLbv7$0OY>_NdF6f1xkyh3|{N~n&AB|&aK^H zcnjFg8|;DaG|+ zESKBvQRG}bR|1a2L&5YdH_Ro}!e8)F7$KNRfd(uFH6Ygk4h`&?LS$MZ%qRUET($tI z46;6^N|)U957g|%s<-@`kG1~AdOOhI6TP*VABJOn>g4QBh5acx@p9E4T%DpxIZu2SB3Cdqb=Sk4frPiQM9pi87x7e551ApVX;}JHEK}PUC=&~P{sOr zOBREy3J)p!EY;yNixnZd86=+khZHC8sxGayO~FQ4{dZa(rLd8WOI5QL9ox@6d&#vj zGb8B=#SQ-&5GMNxf41%JAuL)RiMM$_D@;XBj4jx%!$XG3SHO0{VcSlV4XB?p3>KPp z3&GNe&CIw&1BvIU9&XoqkzIVo{6hHZ$D=g!>J;JJ>xCRhuU@O%Zb5+T0H3WoLvnqLf# z%SHXa1~(Z-^V7Fks>@fLoeId9ot>VW zo45S1?f~5+p^IyCB&Ry?P}5MODOrc!(=E;|*Tho0lwcc>ZTAN1_;YTBHknMdOD=9x z0B0T`?1xw=|CrZS@d5mwPj(;?h_qu!008x3003WKhkrfUnd;da87es1n^~LueZc#z zY2|=D;N`QC>J6SEL9JiV2K@a}Toy%Ip3y==UuX~>k>zm98W6`$wsj2{A}m8bgFI=5 z+}y;l;l4mRqlPDIw&uDh%&O%s1d?C8>KXNUMGF8Vn*H*--*E&8-nsKf(?`e~AKmz< z6}xk(43N3o`BF`J8m1)T!ll$mMFwu0cJr^%h(huiErlkYUnoGF6#5XapAL;^>R3p`Ql^c^?3MuAo~u^)a!m= z-XvRheq0pZ#ruludw0S-`bF2{+1f$Zlf}uF!8@n#^Ch1P9`9<)3ktDEQZU;o_ahcl z7GKZeL-xj7izXIYkRNFDk@JM9Q(qCb{hUwROXv4`xo;GN))eUadPPwWFnn3Hrj!NE zBiu08SQNoiF-rMq`Uv1@*TWQfJJ{c78&Z_WO2CqNMt+Np(gYA5|G=E#k#b14pBMKn zpr9=UObOz^DNRv<;}-S&*DIM=y<=vX# zBo43!eD*2#yTb|m$5m4spT*h-k1JIu#lSV;#=A_^^81!Wckc}}2>ar1>frYL(TS$> zkCXeE35{$Xu+907E=xYI)#tsA`GpO8!p8$T^#{JBS4*C!PN$2T%$I0F=ZTNo-#7TY zuJ?O)#!wB8<%{F9xJ z_W`TPBteJ!cTFPC6vhmB=)J<78~P7X|fk%5ok&sS)cjOQxAnQ^z8?$!#$ zD2!G`Y&_|F_bVH0Z^12U%+&aKsq6hRVP3|HVe)+nqX#?Fo_RZ0Y+*y`IQs6$oOBpI z$nK-iX-6Eqqy#*`musd;z$0@Nx-(AXV3$GG>p6+(flg?Now2PH{rV?=mK4|q3Ue&+ zWGDPs*XR)9M~6~r=$lNwCs0oCz8x<;L}$Ofn^cf>^_$!>xRB)B7uH1ao}R+hY~{mK zF!cJOPyqaDqUrk8(D`2NbRlWhSztc%YCL2=W~m%xkgIHs{;<0ERz#yYZ()eZoKLx; zJH@sZgTWT+k-m69(^QV6+);t0%8u+ge`&w9k0!^f(-`S| zXaGXb!&1275JLkFLVkpQbWt9Pns`xoh;_Er+Vxw5kmiKG`MqZlmD1d0hD!Btl>THO zIeJWn#45T4Ud*BAbO0Csfy(CQr?R>F=&)}rv*CVIPyC?Wa7`2v(2J9vA z-7$X<46{5p*CD ztO7AD;QUr)cygYilz4!KXyKq#pq`av>0DKa9OdzFCRfVr=|;jllP4nPrq4pizm4GU zU)KXLOz?*rP~QW!Aza8@UnsUwkw(=q?4VYK^7mOmET>6taS|{5#C`9UJPq6N4GeuV zVk1F0F>fYO=x(({d8g=QUs#c=fppr0&`MP{K*LJi(|8Bl?Omv`o2iOQ|JYPJhf*;F7R=_AvBL*V 
zg^?GFlowQHCtuMP90WASO|aX`k=`pNtz%luy42HThldSv3m3;I#d=hLq6+`T{e8)R zn9L({G6*S$yxSHX1JB;KzH6WG7T{bU2IDvs32_VRFO4#$js`QpiN1^(U4S2!7isJ2-lp*341xzLl+5Sh$5-%Fp48_0L&g$ zJSt55x8nEktr*ZU5Zv7G8Zi>VhTB}g-B~jyfdDhzEQ2-DI9)MK2N7 zN>pytZ89s+5+umJ1c*4yAY*|8i;eE;M1uI@t~F3xk(qtkRebINxitE|bN?x&0}@WAw@1UOK+*p_f77u;cA7Ge#XW3hSBA<_ zJ#&Rs)10@tY_BfWGon1gd(3IAGUr{lv7t*s&_@T;lmLu=D2-4W{PQmi$fp9eGe%91 z3j9NZk(Lq)BM!FApFQ-~mHcT0T##Bk92FeoWZ=@E`bvG{F-n4kvx==3Ay!OQ+J)iB zrSfq~hbzw060oM4Z+X!dvv7Qz;X+V?wAra%9lbIi1`P9WKV!;8yXER0tfKWyaxx1% zmKdDkQ;$GOY0~aiu^q`}U?B!B?RO2Wz|n0%w8LxTvh4642UWDDOM(Vpbkz`KN(?Xu zm)wj<>-lOS2!Fp{vuu$GcqEDYf{_@Ch!d=~a>`eWPx_#wD3V|{Cl2_wvE$A&E@Lk@kGpp6@I)02f*aW9e%C= z;mpS$wr{ik24k>IC~_ zi%_%j*Wx92(eL_H3J1~!QiD@oiOx}8xgA^lr+rIsTS!(?_i+SxyQm>4kZ1=8O43I; z#-~?ZLX4+Yex?n9xAQ$u(OmGu)rvZ_x2yhk{0syiSZpKZ8;~DT{eRBz5A10l5@j4u z5StxLV-NC{oCI30Zobmg1IWVNrZdeiIS@%nEvSxWK=ZLz^~s3$01d^kQm~RBe#%*Wb1oQ7 zwZv_&CsMeKJ!&&++cGc=UIVH)c)a9wJtfshH9%t?u4v6J3}(UM`Ua0^cb!>7hGpJ1 ziIP}p@S}t(-2Zy^171B(4;r_)Pp@gc`VP!Q0$W-bMF$)W&I1L}QG-Nbl$g81SInrz z)=5sg0~|uO_!JGUKk<)3XC{=h7Cf~)pIS5b$GpDX)|$)DvY;m_^!+NW7#j@I+ahfY z3?W5jH_~Nz$+ZH_2Z5)lm79^|N*b*eIZuzP<>To&DuIwBYqHVFGx=~2=|nN`#(<*^ z*aCX6_AMDCqQ+`x6b%D2r@LqUnrc@!Be%}#P{z8`SIKGd1tm8R1>FuW7%a{n1J-^7 zMgl=Et0w2^z9-e>TU3OfD(Xi5EIp0!QS_}H56;;U-A{?JGS>;;v(KXf%mdf|%pB^+thK#SmxFnRV(BFahavf|_9=0Mwyl`12 zRePS(y8N`ogc9N_IDM1ih5UscwQIE)zqbr7((&;;)y9N>!zF`^BO0A+x^UnDFIl8o z=AjMHz7}8&6h7>6NF?Ju)Ck2q5Zv#mbELrZX8(4?(6yAEx@|bs8IOp_dd!QQ!5@ur zVTw*fTbU59BX3?H-XKYpoRD~8LvBOUSxl`(K}}0lc0o?HP^sPADW@8Qu}`YVEz`K^ zUDXGvek4KWsuy8Gb`a17 z@sY97-J%=hfWFJYQp(_5q*mkrff8kDD`-DO5~l^Z{oJlXQRKQhMuXwY%Rm< zhkw2ad)wC!$^s8u?j6PmD2=iM;GPFw94J^ii8yOxw8~C#p@vNQ`OhN)GizIIhDz6h zDN!ec410_aGLW390T$h*Bv{KZ^3&`dKKk=WPCeKGSUlSo336^Qn#*AebcI{2Bav?A z#Ya+U?(8iYxSICic4y=hBgm-evbk+1Bf;hT@BuuJ2`hVTER9Vr8~nZ)_yG{e?t(+i zCmN|1ip=xa;0r9QYdgT-hQP$dR2o?7NMd5Lvh>7$<2gHPCkXEEl7qEg)mZQ%4>fWK zgF({FbL^+!stZP&ElR`~yU^S;aVZTCE~o<8c1}mv<>i*hVr5?;r;1fyo}W)OF2~@Q z43zr1W5fOxKv{qv&e}#W2B~(BllQF42 z@iJRjy2qjX$0O*82FC=M;#a_LV1aqSz7h&1a76=tYh^BHib=aa>x7rp0_#@to&heD z#805Gw3}q*+71lb4+w>A?IuMYp#XfjOu0r(>c%31GaY%!iknNywbxoAuMF zy&9K>IUBfUY^WRYeAfPw;1-JJw44*dh6A#Aw(1bsSnC$Po z=T~s>|0sWb{h6$9V`K64X+xr-md!Ffir43ldogP5{>Z!%{DDwdc}nJGP^xV_gz>As zN&``r3crfp9<)NDi%ytp$PTnOJjPoFoWWGuG+pU(U{x4N`1{E4289YWD+R8t@D}L8 zOsL~2kR=4fw0mO8whm+iDyAnS3$u7YVn|6WEPnD9lBx-gO9`S9*pVr4nxF}GPi91Dne#U`GG~)IX`<9MVuohs7Fq-FCw;x2<$NvYE`<(1rRerS& z$LRv(3WUiy&K}xKOdX??ru`dd|4h1m`xcOk2Thmw;U)8Kz@KjEmq9&?^=sPOM$IDg z7t}{{tEpb=rk|?RHLsximY8u!ValzN&~g7m-2JZ1YSxpwz8%>rohK37ytEqT%e(xd?Cm5bgK+ zN2M4^8h5Lu(KD1uRqvZYo$JTy3125K|3bU9u&dz7K`nBr{}2pxPzj9+z*DWX;6K=g z&!VIn56O!MKWiy;>_mSY%enz}zMo&Tzhg}c**$1>;JDZ5Fz&zwx9h;YeSLQIoT_xm zYo0p-o4PAN{@~O;$`l;sT|LPlQ^!nP63Me~?V85d7PS>0vDS6YznX#t_o(k2`4e(V zk$Q#jPe zdz5S=8s)Ys_y@OxC&WN}@xaIGCENe~O(xjqP}8wDvizSvzk9Gdw|{)LZF~+%=zrVL zepyz;O-S_9BmBAWzd+dUly*TyTymr%I+Zr`@=cNYm?TAxQ#)X>*8rbBJf1+$(;D?%uh`UW5YxZ7ct_&$S#IpOs|%%?rpo^~vqne7OK$o1^!%I;wZG z6h-w?wlfzypEuTi5;AdaY0d=Ep5~gxH}(sBb!B6-JYQ65clv#~SNoc^ zX}Y|wF|9L#yY4)V*m{m zD(y_;J&Hc*t}e$NmeBRB*n(5x$2-a2z+k)rJe5ESNCOAAA3oPClIlK7XV2 zU-OJw-P@VYoD(-|w&vXa!rWq8ClGBk>*eA4`v<#sv^~i4F>9LX_EozaKFs@e)R*PPhI==T>6G_bV4`Iq;n&bozo*v1-!iXEIoRm-{XqahvRuqq|JI2A;EDxiTjnwmq&m0Z__`m zc3gaZec6j2U9HP*vFY3co@%0U$SmXpTiRQX4#)WR$|%vc_OQ6(MG~)7e;+q}8)$@*`|{FYAu3$PY54h>N;tfkD0f@%ihFGsxzJ(GnhW(%NzZ){ip?EekH3? 
zG4PlSSzrjs+EpTc@1PI|MqbWa=;eiQ7dD16HomV(PY= zrW^EV7+v~SLOJ;>US~rY>St!o5Wh7v-V_!{@YC~JzG|p zT<)p$iL?5yb5Nm2XsYRijiL(DyzAGqO`gi^v*K35u`Yo-%+EErFE9naK5)S&Z?9Nm zTivO-6O0X6@4YX3zhjG91Lxd;>5b_2IXp{` zhw|OnQW@u^(7SUPN6?Dsumx6qDhJzz>TKjwy>G8rrc|wR%{=$uxUSQ!+EtnNzpk6OYrn>w3-139HU7DJ{B8a32OkUN z>wt4Mj7%cn`5DORJ?OCnj3@?11_dB#?}jzbAO>JRkpom1K>+9=pqb8yq6yVf^uh_G z2V&`7xE^q3!e<#Mw?Y8OG7vyIpa|7I^dpKuiXrx?BWr^mScD!#=tu4#bWa5)Z@8tf z19;H2qaPQ8&@KTiOc2`PhsWUaG*AZ=fIM9cJlzCX65-Q;8cI)HkcuvJhoYBpAd4WO z#N&#j4Wq_1(yBW}(jNbf17@+3}wih|r@HrZEstp8yVt5K9hC!O)XWXE=9Q~9VkX}fv z9l+3wIP(VG`RM0QAWX>trXXOjA-Mo-3ivb%bW_mx>my8Q;=yVPc=vvQH!HZ{5C&Eg MlYwr2><-cg0N2vEpa1{> -- Gitee From 76a83b83526607fdb2325dd9271b4ee84522ab72 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Thu, 18 Apr 2024 15:46:10 +0800 Subject: [PATCH 049/302] =?UTF-8?q?warm=20start=E5=8A=9F=E8=83=BD=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=EF=BC=8C=E5=AE=9E=E7=8E=B0=E4=BB=8E=E5=A4=9A=E4=B8=AA?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E8=B7=AF=E5=BE=84=E5=8A=A0=E8=BD=BD=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E5=8F=82=E6=95=B0=E3=80=81=E7=A8=80=E7=96=8F=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo_estimator/main.py | 1 + mx_rec/__init__.py | 2 + mx_rec/saver/saver.py | 43 +++- mx_rec/saver/warm_start.py | 272 ++++++++++++++++++++ 4 files changed, 307 insertions(+), 11 deletions(-) create mode 100644 mx_rec/saver/warm_start.py diff --git a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py index 901bf23a..d8e801b0 100644 --- a/examples/demo/little_demo_estimator/main.py +++ b/examples/demo/little_demo_estimator/main.py @@ -68,6 +68,7 @@ def main(params, cfg): config_for_item_table = dict(access_threshold=cfg.access_threshold, eviction_threshold=cfg.eviction_threshold) access_and_evict = dict(user_table=config_for_user_table, item_table=config_for_item_table) + evict_hook = EvictHook(evict_enable=True, evict_time_interval=10) hooks_list.append(evict_hook) create_fs_params = dict(cfg=cfg, use_timestamp=params.use_timestamp, diff --git a/mx_rec/__init__.py b/mx_rec/__init__.py index bdb85131..d7f4ae82 100644 --- a/mx_rec/__init__.py +++ b/mx_rec/__init__.py @@ -24,6 +24,7 @@ from mx_rec.graph.patch import patch_for_dataset, patch_for_chief_session_creato patch_for_assert_eval_spec, patch_for_scale_loss, patch_for_session from mx_rec.data.patch import patch_for_dataset_eos_map from mx_rec.optimizers.base import patch_for_optimizer +from mx_rec.saver.warm_start import patch_for_warm_start patch_for_saver() patch_for_dataset() @@ -34,6 +35,7 @@ patch_for_assert_eval_spec() patch_for_bool_gauge() patch_for_optimizer() patch_for_session() +patch_for_warm_start() __version__ = "5.0.RC2" diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index d776b699..dc545822 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -67,7 +67,7 @@ class Saver(object): ("prefix_name", ClassValidator, {"classes": (str, type(None))}), ("prefix_name", OptionalStringValidator, {"min_len": 1, "max_len": 50}, ["check_string_length"]), ]) - def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint"): + def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint", warm_start_tables = None): self.max_to_keep = max_to_keep self._prefix_name = prefix_name self.var_list = var_list @@ -75,11 +75,12 @@ class Saver(object): self.local_rank_size = get_local_rank_size() self.local_rank_id = self.rank_id % self.local_rank_size self.save_op_dict 
diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py
index d776b699..dc545822 100644
--- a/mx_rec/saver/saver.py
+++ b/mx_rec/saver/saver.py
@@ -67,7 +67,7 @@ class Saver(object):
         ("prefix_name", ClassValidator, {"classes": (str, type(None))}),
         ("prefix_name", OptionalStringValidator, {"min_len": 1, "max_len": 50}, ["check_string_length"]),
     ])
-    def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint"):
+    def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint", warm_start_tables=None):
         self.max_to_keep = max_to_keep
         self._prefix_name = prefix_name
         self.var_list = var_list
         self.rank_id = get_rank_id()
         self.local_rank_size = get_local_rank_size()
         self.local_rank_id = self.rank_id % self.local_rank_size
         self.save_op_dict = defaultdict(dict)
-        self.restore_fetch_list = []
+        self.restore_fetch_dict = defaultdict()
         self.placeholder_dict = defaultdict(dict)
         self._last_checkponts = []
         self.config_instance = ConfigInitializer.get_instance()
         self.build()
+        self.warm_start_tables = warm_start_tables

     def build(self):
         if self.var_list is None:
@@ -175,7 +176,7 @@ class Saver(object):
         logger.info("======== Saving finished for rank id %s ========", self.rank_id)

     @performance("Restore")
-    def restore(self, sess, reading_path):
+    def restore(self, sess, reading_path, warm_start_tables=None):
         logger.debug("======== Start restoring ========")
         if not check_file_system_is_valid(reading_path):
             raise ValueError("the path to save sparse embedding table data belong to invalid file system, "
@@ -185,11 +186,10 @@ class Saver(object):
         ckpt_name = f"sparse-{base_name}"
         reading_path = os.path.join(directory, ckpt_name)

-        self.config_instance.train_params_config.sparse_dir = reading_path
         if not tf.io.gfile.exists(reading_path):
             raise FileExistsError(f"Given dir {reading_path} does not exist, please double check.")

-        self._restore(sess, reading_path)
+        self._restore(sess, reading_path, warm_start_tables)
         logger.info("sparse model was restored from dir '%s' .", reading_path)
         logger.debug("======== Restoring finished ========")
@@ -283,6 +283,7 @@ class Saver(object):
                 sub_dict["optimizer"] = optimizer

     def _build_restore(self):
+        # The op construction below is unchanged; restore ops are now grouped per table name.
         for var in self.var_list:
             if global_env.tf_device == TFDevice.NPU.value and "merged" not in var.name:
                 continue
@@ -294,7 +295,7 @@ class Saver(object):
                                                 table_instance.emb_size],
                                          name=DataName.EMBEDDING.value)
             assign_op = var.assign(variable)
-            self.restore_fetch_list.append(assign_op)
+            self.restore_fetch_dict[table_instance.table_name] = [assign_op]
             optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(
                 table_instance.table_name)
             if optimizer:
@@ -313,10 +314,30 @@ class Saver(object):
                     if sub_optimizer_placeholder_dict.get(key_state).graph is not state.graph:
                         continue
                     assign_op = state.assign(sub_optimizer_placeholder_dict.get(key_state))
-                    self.restore_fetch_list.append(assign_op)
+                    self.restore_fetch_dict[table_instance.table_name].append(assign_op)
+
+    def get_warm_start_dict(self, table_list):
+        placeholder_dict = defaultdict(dict)
+        restore_fetch_list = []
+        for table_name, v in self.placeholder_dict.items():
+            if table_name in table_list:
+                placeholder_dict[table_name] = v
+                restore_fetch_list.append(self.restore_fetch_dict.get(table_name))
+
+        if not restore_fetch_list:
+            logger.warning("no tables can be warm start restored.")
+        return placeholder_dict, restore_fetch_list
+
+    def _restore(self, sess, reading_path, warm_start_tables=None):
+        # When warm_start_tables is given, restore only those tables; otherwise restore all of them.
+        if warm_start_tables:
+            placeholder_dict, restore_fetch_list = self.get_warm_start_dict(warm_start_tables)
+        else:
+            placeholder_dict, restore_fetch_list = self.placeholder_dict, self.restore_fetch_dict

-    def _restore(self, sess, reading_path):
-        for table_name in self.placeholder_dict:
+        for table_name in placeholder_dict:
             optimizer_instance = ConfigInitializer.get_instance().optimizer_config.optimizer_instance
             if optimizer_instance:
                 set_optimizer_info(optimizer_instance, table_name)

         restore_feed_dict = defaultdict(dict)

-        for table_name, sub_placeholder_dict in self.placeholder_dict.items():
+        for table_name, sub_placeholder_dict in placeholder_dict.items():
             load_offset = self.config_instance.hybrid_manager_config.get_load_offset(table_name)
             fill_placeholder(reading_path, sub_placeholder_dict, restore_feed_dict,
                              NameDescriptor(table_name, DataName.EMBEDDING.value), load_offset)
@@ -341,7 +362,7 @@ class Saver(object):
             _fill_placeholder_for_optimizer(optimizer_state_placeholder_dict_group, reading_path, restore_feed_dict,
                                             table_name, load_offset)

-        sess.run(self.restore_fetch_list, feed_dict=restore_feed_dict)
+        sess.run(restore_fetch_list, feed_dict=restore_feed_dict)


 class NameDescriptor:
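With warm_start_tables in place, a restore can be scoped to a subset of tables instead of the whole model. A minimal sketch of the extended interface, assuming an already-built mx_rec graph, an open session, and illustrative checkpoint paths (direct Saver use outside the estimator hooks is shown here only to make the new parameter concrete):

    saver = Saver()
    with tf.compat.v1.Session() as sess:
        # Full restore, exactly as before this patch.
        saver.restore(sess, "/ckpt/model_a/model.ckpt-1000")
        # Scoped restore: only the named sparse tables are fed and assigned.
        saver.restore(sess, "/ckpt/model_b/model.ckpt-500", warm_start_tables=["item_table"])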
diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py
new file mode 100644
index 00000000..53324b06
--- /dev/null
+++ b/mx_rec/saver/warm_start.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+import os
+import re
+from typing import List
+
+import six
+import tensorflow as tf
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.training import warm_starting_util
+
+from mx_rec.util.log import logger
+from mx_rec.saver.saver import Saver
+
+if tf.__version__.startswith("1"):
+    from npu_bridge.npu_init import NPUEstimator
+else:
+    from npu_device.compat.v1.npu_init import NPUEstimator
+
+
+class WarmStartController:
+    _instance = None  # class attribute holding the unique singleton instance
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(WarmStartController, cls).__new__(cls)
+            cls._instance._warm_start_dict = {}
+            cls._instance.table_name_to_prev_table_name = {}
+        return cls._instance
+
+    def __init__(self):
+        logging.info("start to build WarmStartController.")
+
+    def add_element(self, path: str, table_list: List[str]):
+        """Register the sparse tables to be restored from a checkpoint path."""
+        if path not in self._warm_start_dict:
+            self._warm_start_dict[path] = table_list
+        else:
+            self._warm_start_dict[path] += table_list
+
+    def add_table_to_prev_table(self, table: str, prev_table: str):
+        self.table_name_to_prev_table_name[table] = prev_table
+
+    def get_elements(self):
+        """Return all registered path-to-table-list mappings."""
+        return self._warm_start_dict
+
+
+def patch_for_warm_start():
+    estimator_lib.Estimator.__init__ = patch_estimator_init(estimator_lib.Estimator.__init__)
+    warm_starting_util.warm_start = patch_for_func_warm_start(warm_starting_util.warm_start)
+    NPUEstimator.train = patch_for_estimator_train(NPUEstimator.train)
+
+
+def patch_estimator_init(func):
+    def wrapper(*args, **kwargs):
+        warm_start_from = kwargs.get('warm_start_from', None)
+        if warm_start_from:
+            kwargs['warm_start_from'] = warm_settings_filter(warm_start_from)
+        return func(*args, **kwargs)
+    return wrapper
+
+
+def patch_for_func_warm_start(func):
+    def wrapper(*args, **kwargs):
+        ckpt_to_initialize_from = args[0]
+        if isinstance(ckpt_to_initialize_from, (list, tuple)):
+            # Pop the per-path lists so they are not passed twice to the wrapped function.
+            vars_to_warm_start_list = kwargs.pop('vars_to_warm_start')
+            var_name_to_prev_var_name_list = kwargs.pop('var_name_to_prev_var_name')
+            results = []
+            for i in range(len(ckpt_to_initialize_from)):
+                results.append(
+                    func(ckpt_to_initialize_from[i], vars_to_warm_start_list[i],
+                         var_name_to_prev_var_name_list[i], *args[3:], **kwargs))
+            return results
+        else:
+            return func(*args, **kwargs)
+    return wrapper
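WarmStartController is a process-wide singleton: every construction returns the same object, and add_element accumulates table lists per checkpoint path, so the filters below can register tables from several WarmStartSettings before the restore hook consumes them. A short sketch of that accumulation, with illustrative path and table names:

    controller_a = WarmStartController()
    controller_b = WarmStartController()
    assert controller_a is controller_b  # same singleton instance

    controller_a.add_element("/ckpt/model_a", ["user_table"])
    controller_b.add_element("/ckpt/model_a", ["item_table"])  # merged under the same path
    print(controller_b.get_elements())
    # {'/ckpt/model_a': ['user_table', 'item_table']}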
+
+def patch_for_estimator_train(func):
+    def wrapper(*args, **kwargs):
+        hooks = kwargs.get('hooks') or []
+        if WarmStartController().get_elements():
+            hooks.append(SparseRestoreHook())
+            kwargs['hooks'] = hooks
+        return func(*args, **kwargs)
+    return wrapper
+
+
+def warm_settings_filter(warm_start_from):
+    # condition 1: the caller passed a WarmStartSettings object
+    if isinstance(warm_start_from, estimator_lib.WarmStartSettings):
+        # mx_rec-specific usage: a list of checkpoint paths, which needs a different
+        # filtering strategy from the native single-path warm start.
+        if isinstance(warm_start_from.ckpt_to_initialize_from, (list, tuple)):
+            out_setting_list = []
+            logger.info("According to warm_start_settings, warm start will load from more than one checkpoint path.")
+            warm_start_settings_list = _build_warm_settings_list(warm_start_from)
+            for setting in warm_start_settings_list:
+                filter_setting = _warm_settings_filter(setting)
+                if filter_setting:
+                    out_setting_list.append(filter_setting)
+            # The surviving settings must be recovered into one WarmStartSettings before returning.
+            if out_setting_list:
+                warm_start_from = recover_warm_settings(out_setting_list)
+                return warm_start_from
+        # native single-path usage
+        elif isinstance(warm_start_from.ckpt_to_initialize_from, (six.string_types, six.binary_type)):
+            logger.info("According to warm_start_settings, warm start will load from only one checkpoint path.")
+            filter_setting = _warm_settings_filter(warm_start_from)
+            if filter_setting:
+                return filter_setting
+        return None
+    # condition 2: the caller passed a plain checkpoint path string
+    elif isinstance(warm_start_from, (six.string_types, six.binary_type)):
+        # A plain string can be passed through unchanged, but the controller still has
+        # to record the sparse checkpoint path and its table names.
+        table_name_list = get_table_name_set_by_ckpt_path(warm_start_from)
+        WarmStartController().add_element(warm_start_from, table_name_list)
+        return warm_start_from
+    else:
+        return None
+
+
+def recover_warm_settings(setting_list):
+    ckpt_to_initialize_from_list = []
+    vars_to_warm_start_list = []
+    var_name_to_prev_var_name_list = []
+    for setting in setting_list:
+        ckpt_to_initialize_from_list.append(setting.ckpt_to_initialize_from)
+        vars_to_warm_start_list.append(setting.vars_to_warm_start)
+        var_name_to_prev_var_name_list.append(setting.var_name_to_prev_var_name)
+
+    return estimator_lib.WarmStartSettings(
+        ckpt_to_initialize_from=ckpt_to_initialize_from_list,
+        vars_to_warm_start=vars_to_warm_start_list,
+        var_name_to_prev_var_name=var_name_to_prev_var_name_list)
+
+
+# Validate the customized (list-based) warm settings and split them into per-path settings.
+def _build_warm_settings_list(warm_start_from):
+    ckpt_to_initialize_from = warm_start_from.ckpt_to_initialize_from
+    vars_to_warm_start = warm_start_from.vars_to_warm_start
+    var_name_to_prev_var_name = warm_start_from.var_name_to_prev_var_name
+    # type check
+    for params in [vars_to_warm_start, var_name_to_prev_var_name]:
+        if not isinstance(params, (list, tuple)):
+            raise ValueError("If you choose to load from multiple model paths through the warm start option, "
+                             "then the parameter type in the warm settings should be a list.")
+    # length check
+    if not (len(ckpt_to_initialize_from) == len(vars_to_warm_start) == len(var_name_to_prev_var_name)):
+        raise ValueError("If you choose to load from multiple model paths through the warm start option, "
+                         "then the parameter lists should all have the same length.")
+    warm_start_settings_count = len(ckpt_to_initialize_from)
+
+    warm_start_settings_list = []
+    for i in range(warm_start_settings_count):
+        tmp_settings = estimator_lib.WarmStartSettings(
+            ckpt_to_initialize_from=ckpt_to_initialize_from[i],
+            vars_to_warm_start=vars_to_warm_start[i],
+            var_name_to_prev_var_name=var_name_to_prev_var_name[i])
+        warm_start_settings_list.append(tmp_settings)
+    return warm_start_settings_list
+
+
+def _warm_settings_filter(warm_start_setting):
+    # Split the sparse tables out of the settings; table names may map to previous names.
+    vars_to_warm_start = warm_start_setting.vars_to_warm_start
+    var_name_to_prev_var_name = warm_start_setting.var_name_to_prev_var_name
+    vars_to_warm_start_res = []
+    # The candidate table names are read from the sparse checkpoint path.
+    ckpt_path = warm_start_setting.ckpt_to_initialize_from
+    table_name_list = get_table_name_set_by_ckpt_path(ckpt_path)
+    # For sparse tables, vars_to_warm_start supports: 1. str (regex or table name); 2. list of str.
+    if isinstance(vars_to_warm_start, str):
+        # condition 1: vars_to_warm_start is a str (regex or table name)
+        matching_tables = [table for table in table_name_list if re.match(vars_to_warm_start, table)]
+        # If any sparse table matches, this WarmStartSettings no longer applies to the dense part.
+        if matching_tables:
+            warm_start_setting = None
+            # register the sparse path and its matched table names with the controller
+            WarmStartController().add_element(ckpt_path, matching_tables)
+            if vars_to_warm_start != ".*":
+                return None
+        return warm_start_setting
+    elif all(isinstance(v, str) for v in vars_to_warm_start):
+        sparse_vars = []
+        for v in vars_to_warm_start:
+            matching_tables = [table for table in table_name_list if re.match(v, table)]
+            if matching_tables:
+                sparse_vars.append(v)
+                WarmStartController().add_element(ckpt_path, matching_tables)
+        vars_to_warm_start_res = [v for v in vars_to_warm_start if v not in sparse_vars]
+        if not vars_to_warm_start_res:
+            warm_start_setting = None
+        else:
+            # WarmStartSettings is a namedtuple, so build a modified copy instead of assigning.
+            warm_start_setting = warm_start_setting._replace(vars_to_warm_start=vars_to_warm_start_res)
+        return warm_start_setting
+    else:
+        raise ValueError("vars_to_warm_start must be list or str!")
+
+
+def get_table_name_set_by_ckpt_path(warm_start_path: str) -> List[str]:
+    '''
+    Get the list of sparse table names saved under the path 'warm_start_path'.
+    '''
+    table_name_list = []
+    if tf.io.gfile.isdir(warm_start_path):
+        restore_path = get_latest_ckpt(warm_start_path)
+    else:
+        restore_path = warm_start_path
+    directory, base_name = os.path.split(restore_path)
+    ckpt_name = f"sparse-{base_name}"
+    sparse_path = os.path.join(directory, ckpt_name)
+    # If sparse_path does not exist, this may be a GPU checkpoint; do not raise an error,
+    # just return an empty table name list.
+    if not tf.io.gfile.isdir(sparse_path):
+        logger.info(f"under the warm start path {warm_start_path}, sparse directory {sparse_path} does not exist.")
+    else:
+        for dirname in tf.io.gfile.listdir(sparse_path):
+            table_name_list.append(dirname)
+    return table_name_list
+def get_latest_ckpt(warm_start_path) -> str:
+    ckpt_path = os.path.join(warm_start_path, "checkpoint")
+    if not tf.io.gfile.exists(ckpt_path):
+        raise FileNotFoundError(f"Checkpoint file is missing under the warm start model path {warm_start_path}")
+    with tf.io.gfile.GFile(ckpt_path, "r") as f:
+        latest_ckpt = f.readline().rstrip()
+        latest_ckpt = latest_ckpt.split(":")[1].strip(' ').replace('"', '')
+        latest_ckpt = latest_ckpt.split("/")[-1]
+
+    path = os.path.join(warm_start_path, latest_ckpt)
+    return path
+
+
+class SparseRestoreHook(tf.estimator.SessionRunHook):
+    def __init__(self):
+        logging.info("In warm start mode, SparseRestoreHook has been initialized.")
+
+    def begin(self):
+        self._saver = Saver()
+        logging.info("In warm start mode, begin SparseRestoreHook.")
+
+    def after_create_session(self, session, coord):
+        # mx_rec adapts the new restore interface here: restore is called once per
+        # registered checkpoint path, scoped to that path's tables.
+        self._warm_start_dict = WarmStartController().get_elements()
+        for path, restore_tables in self._warm_start_dict.items():
+            restore_path = get_latest_ckpt(path)
+            self._saver.restore(session, restore_path, restore_tables)
--
Gitee

From d7ed2aa49e8c464e6dc61c3e6216eb18f4e8ae42 Mon Sep 17 00:00:00 2001
From: steepcurve
Date: Mon, 22 Apr 2024 14:32:49 +0800
Subject: [PATCH 050/302] add .clang-format

---
 .clang-format                        |  49 +++
 src/core/key_process/key_process.cpp | 535 +++++++++++++++------------
 src/core/key_process/key_process.h   | 382 +++++++++----------
 3 files changed, 550 insertions(+), 416 deletions(-)
 create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..f1f5b0d0
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,49 @@
+Language: Cpp
+BasedOnStyle: Google
+AccessModifierOffset: -4
+ColumnLimit: 100
+IndentWidth: 4
+UseTab: Never
+AlignOperands: Align
+AlignAfterOpenBracket: Align
+AlignTrailingComments: true
+DerivePointerAlignment: false
+PointerAlignment: Left
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortEnumsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AllowShortLambdasOnASingleLine: Inline
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+BinPackArguments: true
+BinPackParameters: true
+BreakBeforeBraces: Custom
+BraceWrapping:
+    AfterClass: false
+    AfterControlStatement: false
+    AfterEnum: false
+    AfterFunction: true
+    AfterNamespace: false
+    AfterStruct: false
+    AfterUnion: false
+    AfterExternBlock: false
+    BeforeCatch: false
+    BeforeElse: false
+    IndentBraces: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeColon
+BreakStringLiterals: true
+CompactNamespaces: false
+PackConstructorInitializers: CurrentLine
+ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DisableFormat: false +FixNamespaceComments: true +IndentWrappedFunctionNames: false +Standard: Latest diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index f76f6907..58312ca1 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -15,19 +15,21 @@ See the License for the specific language governing permissions and #include "key_process.h" +#include + #include #include -#include + +#include "emb_table/embedding_mgmt.h" +#include "hd_transfer/hd_transfer.h" +#include "host_emb/host_emb.h" +#include "ock_ctr_common/include/error_code.h" #include "utils/common.h" +#include "utils/config.h" #include "utils/logger.h" #include "utils/safe_queue.h" #include "utils/singleton.h" #include "utils/time_cost.h" -#include "utils/config.h" -#include "host_emb/host_emb.h" -#include "emb_table/embedding_mgmt.h" -#include "hd_transfer/hd_transfer.h" -#include "ock_ctr_common/include/error_code.h" using namespace std; using namespace chrono; @@ -41,8 +43,7 @@ void KeyProcess::SetupHotEmbUpdateStep() } bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues, - int seed) + const vector& thresholdValues, int seed) { this->rankInfo = rInfo; if (rankInfo.useHot) { @@ -50,7 +51,7 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos } map scInfo; - for (const auto& info: eInfos) { + for (const auto& info : eInfos) { embInfos[info.name] = info; scInfo[info.name] = info.sendCount; if (rankInfo.useHot) { @@ -66,8 +67,8 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos LOG_INFO(KEY_PROCESS "hot emb count info:{}", MapToString(hotEmbTotCount)); MPI_Group worldGroup; MPI_Comm_group(MPI_COMM_WORLD, &worldGroup); - for (auto& i: comm) { - for (auto& j: i) { + for (auto& i : comm) { + for (auto& j : i) { MPI_Comm_create(MPI_COMM_WORLD, worldGroup, &j); } } @@ -85,12 +86,14 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos if (GlobalEnv::fastUnique) { int result = ock::ctr::Factory::Create(factory); if (result != 0) { - throw runtime_error(Logger::Format("create fast factory failed, error code:{}", result)); + throw runtime_error( + Logger::Format("create fast factory failed, error code:{}", result)); } } LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}, useHot:{}", - MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic, rInfo.useHot); + MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic, + rInfo.useHot); #ifndef GTEST Start(); #endif @@ -103,8 +106,9 @@ int KeyProcess::Start() // bind like: // 0 1 2 3 4 5 0 1 2 3 4 5 // | rank0 | | rank1 | - // each rank creates KEY_PROCESS_THREAD threads, each thread process one batchdata - LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 + // each rank creates KEY_PROCESS_THREAD threads, each thread process one + // batchdata + LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 auto fn = [this](int channel, int threadId) { #ifndef GTEST auto ret = aclrtSetDevice(static_cast(rankInfo.deviceId)); @@ -118,7 +122,7 @@ int KeyProcess::Start() } else { KeyProcessTask(channel, threadId); } - }; // for clean code + }; // for clean code int threadNum = GetThreadNumEnv(); for (int channel = 0; channel < MAX_CHANNEL_NUM; ++channel) { LOG_INFO(KEY_PROCESS "key process thread num: {}", threadNum); 
@@ -136,8 +140,9 @@ void KeyProcess::InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo) if (rankInfo.useDynamicExpansion) { embeddingSize = info.embeddingSize; } - hotEmbTotCount[info.name] = static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * - HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); + hotEmbTotCount[info.name] = + static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * + HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); } OffsetMemT KeyProcess::GetMaxOffset() @@ -182,7 +187,7 @@ void KeyProcess::Destroy() { isRunning = false; LOG_INFO(KEY_PROCESS "rankId:{} KeyProcess begin destroy.", rankInfo.rankId); - for (auto& i: procThreads) { + for (auto& i : procThreads) { i->join(); } procThreads.clear(); @@ -192,8 +197,8 @@ void KeyProcess::Destroy() /// 每个数据通道的所有数据处理线程上锁 void KeyProcess::LoadSaveLock() { - for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].lock(); } } @@ -202,8 +207,8 @@ void KeyProcess::LoadSaveLock() /// 每个数据通道的所有数据处理线程释放锁 void KeyProcess::LoadSaveUnlock() { - for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].unlock(); } } @@ -229,8 +234,9 @@ void KeyProcess::GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf) uniqueConf.maxThreadNum = GlobalEnv::maxUniqueThreadNum; } -void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, - const unique_ptr & batch, ock::ctr::UniquePtr& unique) +void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, + bool& uniqueInitialize, const unique_ptr& batch, + ock::ctr::UniquePtr& unique) { uniqueConf.desiredSize = static_cast(batch->Size()); if (preBatchSize != batch->Size()) { @@ -272,7 +278,8 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = + GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -285,7 +292,8 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) break; } LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process with fast unique cost:{}," - " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", + " get data time(ms):{}, batch name:{}, channelId:{}, " + "threadId:{}, batchId:{}", getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); @@ -293,14 +301,13 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) batchQueue->PutDirty(move(batch)); } unique->UnInitialize(); - } catch (const EndRunExit &e) { + } catch (const EndRunExit& e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } LOG_INFO(KEY_PROCESS 
"KeyProcessTaskWithFastUnique exit. rank:{} channelId:{}, threadId:{}", - rankInfo.rankId, channel, threadId); + rankInfo.rankId, channel, threadId); } - void KeyProcess::KeyProcessTask(int channel, int threadId) { unique_ptr batch; @@ -308,7 +315,8 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = + GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -320,43 +328,46 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) break; } LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process cost:{}," - " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", + " get data time(ms):{}, batch name:{}, " + "channelId:{}, threadId:{}, batchId:{}", getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); auto batchQueue = SingletonQueue::GetInstances(queueIndex); batchQueue->PutDirty(move(batch)); } - } catch (const EndRunExit &e) { + } catch (const EndRunExit& e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } - LOG_INFO(KEY_PROCESS "KeyProcessTask exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, channel, threadId); + LOG_INFO(KEY_PROCESS "KeyProcessTask exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, + channel, threadId); } -void KeyProcess::HashSplitHelper(const unique_ptr & batch, vector & splitKeys, - vector & restore, vector & hotPos, - vector >& keyCount) +void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& splitKeys, + vector& restore, vector& hotPos, + vector>& keyCount) { TimeCost uniqueTc; if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { - tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 } else { if (rankInfo.useHot) { - tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 } else { - tie(splitKeys, restore) = HashSplit(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore) = HashSplit(batch); // 按存储dev id切分并去重 } } LOG_DEBUG("uniqueTc(ms):{}", uniqueTc.ElapsedMS()); } -bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, ock::ctr::UniquePtr& unique, - int channel, int threadId) +bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, + ock::ctr::UniquePtr& unique, int channel, + int threadId) { // tuple for keyRec restore hotPos scAll countRecv isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; TimeCost totalTimeCost = TimeCost(); TimeCost fastUniqueTC; UniqueInfo uniqueInfo; @@ -365,11 +376,11 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch // 特征准入&淘汰 if (isWithFAAE && - (m_featureAdmitAndEvict.FeatureAdmit( - channel, batch, uniqueInfo.all2AllInfo.keyRecv, uniqueInfo.all2AllInfo.countRecv) == - 
FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, uniqueInfo.all2AllInfo.keyRecv, + uniqueInfo.all2AllInfo.countRecv) == + FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", - rankInfo.rankId, threadId, channel); + rankInfo.rankId, threadId, channel); return false; } std::lock_guard lock(loadSaveMut[channel][threadId]); @@ -383,7 +394,9 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch LOG_DEBUG("key2OffsetTC(ms):{}", key2OffsetTC.ElapsedMS()); } // Static all2all,need send count - if (!rankInfo.useStatic) { SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); } + if (!rankInfo.useStatic) { + SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); + } auto tensors = make_unique>(); tensors->push_back(Vec2TensorI32(uniqueInfo.restore)); @@ -394,15 +407,17 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) : - Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); + tensors->push_back(rankInfo.useDynamicExpansion + ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) + : Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); } TimeCost pushResultTC; PushResult(batch, move(tensors), uniqueInfo.all2AllInfo.keyRecv); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", - channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} " + "key_process_time_cost_with_fast_unique {}", + channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); return true; @@ -430,8 +445,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, // 特征准入&淘汰 if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE && - (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, - countRecv) == FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) == + FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, threadId, channel); return false; @@ -444,7 +459,9 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, } // Static all2all,need send count - if (!rankInfo.useStatic) { SendA2A(scAll, batch->name, batch->channel, batch->batchId); } + if (!rankInfo.useStatic) { + SendA2A(scAll, batch->name, batch->channel, batch->batchId); + } TimeCost pushResultTC; auto tensors = make_unique>(); @@ -456,21 +473,24 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(lookupKeys) : Vec2TensorI32(lookupKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? 
Vec2TensorI64(lookupKeys) + : Vec2TensorI32(lookupKeys)); } PushResult(batch, move(tensors), lookupKeys); LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", - channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", channel, + batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } return true; } -void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) +void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, + KeysT& lookupKeys, int channel) { - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && + if (GlobalEnv::applyGradientsStrategy == + ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; @@ -479,36 +499,39 @@ void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tenso GlobalUnique(lookupKeys, uniqueKeys, restoreVecSec); LOG_DEBUG("globalUniqueSyncTC(ms):{}", globalUniqueSyncTC.ElapsedMS()); tensors->push_back(Vec2TensorI32(restoreVecSec)); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) : Vec2TensorI32(uniqueKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) + : Vec2TensorI32(uniqueKeys)); } } vector KeyProcess::GetCountRecv(const unique_ptr& batch, int id, - vector>& keyCount, vector scAll, vector ss) + vector>& keyCount, vector scAll, + vector ss) { TimeCost getCountRecvTC; if (rankInfo.useStatic) { - for (auto& cnt: keyCount) { + for (auto& cnt : keyCount) { cnt.resize(embInfos[batch->name].sendCount, 0); } } vector countSend; - for (auto& cnt: keyCount) { + for (auto& cnt : keyCount) { countSend.insert(countSend.cend(), cnt.cbegin(), cnt.cend()); } vector sc; for (int i = 0; i < rankInfo.rankSize; ++i) { sc.push_back(scAll.at(rankInfo.rankSize * rankInfo.rankId + i)); } - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 vector countRecv; countRecv.resize(rs.back() + rc.back()); - int retCode = MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), - rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); + int retCode = + MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), + rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -521,16 +544,19 @@ void KeyProcess::PushResult(unique_ptr& batch, unique_ptr lockGuard(mut); storage.push_front(move(tensors)); - infoList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, storage.begin())); + infoList[batch->name][batch->channel].push( + make_tuple(batch->batchId, batch->name, storage.begin())); if (rankInfo.isDDR) { - lookupKeysList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, move(lookupKeys))); + lookupKeysList[batch->name][batch->channel].push( + make_tuple(batch->batchId, batch->name, move(lookupKeys))); } lockGuard.unlock(); } /* - * 
从共享队列SingletonQueue中读取batch数据并返回。batch数据由 ReadEmbKeyV2 写入。 - * commID为线程标识[0, KEY_PROCESS_THREAD-1],不同线程、训练或推理数据用不同的共享队列通信 + * 从共享队列SingletonQueue中读取batch数据并返回。batch数据由 + * ReadEmbKeyV2 写入。 commID为线程标识[0, + * KEY_PROCESS_THREAD-1],不同线程、训练或推理数据用不同的共享队列通信 */ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const { @@ -551,32 +577,37 @@ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const this_thread::sleep_for(100us); if (tc.ElapsedSec() > GET_BATCH_TIMEOUT) { if (commId == 0) { - LOG_WARN(KEY_PROCESS "getting batch timeout! 1. check last 'read batch cost' print. " - "channel[{}] commId[{}]", channel, commId); + LOG_WARN(KEY_PROCESS + "getting batch timeout! 1. check last 'read batch cost' print. " + "channel[{}] commId[{}]", + channel, commId); } this_thread::sleep_for(seconds(1)); tc = TimeCost(); } if (!isRunning) { - LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, commId); + LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, + commId); throw EndRunExit("GetBatchData end run."); } } EASY_END_BLOCK - LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, get batch data done, batchName:{}. bs:{} sample:[{}]", + LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, get batch data " + "done, batchName:{}. bs:{} sample:[{}]", batch->channel, commId, batch->batchId, batch->name, batch->Size(), batch->UnParse()); #if defined(PROFILING) && defined(BUILD_WITH_EASY_PROFILER) if (batch->batchId == PROFILING_START_BATCH_ID) { EASY_PROFILER_ENABLE } else if (batch->batchId == PROFILING_END_BATCH_ID) { - ::profiler::dumpBlocksToFile(StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); + ::profiler::dumpBlocksToFile( + StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); } #endif return batch; } -size_t KeyProcess::GetKeySize(const unique_ptr &batch) +size_t KeyProcess::GetKeySize(const unique_ptr& batch) { size_t size = rankInfo.rankSize * embInfos[batch->name].sendCount; if (!rankInfo.useStatic) { @@ -585,8 +616,9 @@ size_t KeyProcess::GetKeySize(const unique_ptr &batch) return size; } -void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniquePtr& unique, - int id, UniqueInfo& uniqueInfoOut) +void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, + ock::ctr::UniquePtr& unique, int id, + UniqueInfo& uniqueInfoOut) { EASY_FUNCTION(profiler::colors::Purple) EASY_VALUE("batchId", batch->batchId) @@ -605,10 +637,10 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniqueIn uniqueIn; uniqueIn.inputIdCnt = static_cast(batch->Size()); - uniqueIn.inputId = reinterpret_cast(batch->sample.data()); + uniqueIn.inputId = reinterpret_cast(batch->sample.data()); ock::ctr::EnhancedUniqueOut uniqueOut; - uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); + uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); uniqueOut.index = reinterpret_cast(uniqueInfoOut.restore.data()); if (rankInfo.useStatic) { uniqueOut.idCnt = idCount.data(); @@ -617,7 +649,7 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, uniqueOut.idCnt = keySendInfo.keyCount.data(); } uniqueOut.uniqueIdCntInBucket = splitSize.data(); - uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); + uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); uniqueOut.uniqueIdCnt = 0; int ret = unique->DoEnhancedUnique(uniqueIn, uniqueOut); @@ -633,19 +665,21 @@ void 
KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, All2All(sc, id, batch, keySendInfo, uniqueInfoOut.all2AllInfo); LOG_DEBUG(KEY_PROCESS "ProcessBatchWithFastUnique get batchId:{}, batchSize:{}," - " channel:{}, name:{}, restore:{}, keyCount:{}", - batch->batchId, batch->Size(), batch->channel, batch->name, - uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); + " channel:{}, name:{}, restore:{}, keyCount:{}", + batch->batchId, batch->Size(), batch->channel, batch->name, + uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); if (GlogConfig::gStatOn) { LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} " - "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueOut.uniqueIdCnt); + "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), + uniqueOut.uniqueIdCnt); } } -void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, - KeySendInfo& keySendInfo, vector& sc, vector& splitSize) +void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, + UniqueInfo& uniqueInfoOut, KeySendInfo& keySendInfo, + vector& sc, vector& splitSize) { std::shared_lock lock(g_smut); absl::flat_hash_map hotMap = hotKey[batch->name]; @@ -659,8 +693,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, Uniqu TimeCost computeHotTc; ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); - UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, - hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); + UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, hotOffset, + batch->batchId % hotEmbUpdateStep == 0, batch->name); } if (rankInfo.useStatic) { @@ -673,8 +707,9 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, Uniqu } } -void KeyProcess::ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, - vector &hotPos, vector &restore, const int hotOffset) const +void KeyProcess::ComputeHotPos(const unique_ptr& batch, + absl::flat_hash_map& hotMap, vector& hotPos, + vector& restore, const int hotOffset) const { emb_key_t* inputData = batch->sample.data(); size_t miniBs = batch->Size(); @@ -697,48 +732,52 @@ void KeyProcess::ComputeHotPos(const unique_ptr &batch, absl::flat_ha } } -void KeyProcess::All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, - All2AllInfo& all2AllInfoOut) +void KeyProcess::All2All(vector& sc, int id, const unique_ptr& batch, + KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut) { TimeCost getScAllTC; int channel = batch->channel; - GetScAllForUnique(sc, id, batch, all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) + GetScAllForUnique(sc, id, batch, + all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) LOG_DEBUG("GetScAll TimeCost(ms):{}", getScAllTC.ElapsedMS()); TimeCost all2allTC; - vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc(rankInfo.rankSize); // receive count + vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 + vector rc(rankInfo.rankSize); // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc[i] = all2AllInfoOut.scAll.at(i * rankInfo.rankSize + rankInfo.rankId); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = 
Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 all2AllInfoOut.keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") int retCode = MPI_Alltoallv(keySendInfo.keySend.data(), sc.data(), ss.data(), MPI_INT64_T, - all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), - MPI_INT64_T, comm[channel][id]); + all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, + comm[channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, + batch->batchId); all2AllInfoOut.countRecv.resize(rs.back() + rc.back()); if (isWithFAAE) { retCode = MPI_Alltoallv(keySendInfo.keyCount.data(), sc.data(), ss.data(), MPI_UINT32_T, - all2AllInfoOut.countRecv.data(), rc.data(), - rs.data(), MPI_UINT32_T, comm[channel][id]); + all2AllInfoOut.countRecv.data(), rc.data(), rs.data(), MPI_UINT32_T, + comm[channel][id]); if (retCode != MPI_SUCCESS) { - LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", - channel, id, batch->batchId, retCode); + LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", channel, id, + batch->batchId, retCode); } } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC TimeCost(ms):{}", + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC " + "TimeCost(ms):{}", channel, id, batch->batchId, all2allTC.ElapsedMS()); EASY_END_BLOCK } auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, - vector& splitKeys) -> tuple, vector> + vector& splitKeys) + -> tuple, vector> { TimeCost processSplitKeysTC; EASY_FUNCTION(profiler::colors::Purple) @@ -746,44 +785,47 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, LOG_INFO(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, ProcessSplitKeys start.", batch->channel, id, batch->batchId); - // 使用静态all2all通信:发送或接受量为预置固定值 scInfo[batch->name] = 65536 / rankSize 经验值 - if (rankInfo.useStatic) { // maybe move after all2all - for (KeysT& i: splitKeys) { + // 使用静态all2all通信:发送或接受量为预置固定值 scInfo[batch->name] = 65536 / + // rankSize 经验值 + if (rankInfo.useStatic) { // maybe move after all2all + for (KeysT& i : splitKeys) { if (static_cast(i.size()) > embInfos[batch->name].sendCount) { - LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", - batch->name, batch->channel, batch->batchId, i.size()); + LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", batch->name, + batch->channel, batch->batchId, i.size()); throw runtime_error( StringFormat("%s[%d]:%d overflow! 
set send count bigger than %d", - batch->name.c_str(), batch->channel, batch->batchId, i.size()).c_str()); + batch->name.c_str(), batch->channel, batch->batchId, i.size()) + .c_str()); } i.resize(embInfos[batch->name].sendCount, -1); } } KeysT keySend; - vector sc; // send count - for (const auto& i: splitKeys) { + vector sc; // send count + for (const auto& i : splitKeys) { sc.push_back(static_cast(i.size())); keySend.insert(keySend.cend(), i.cbegin(), i.cend()); } KeysT keyRecv; TimeCost getScAllTC; - vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 + vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread + // id的)线程间通信量矩阵 LOG_DEBUG("getScAllTC(ms)(AllReduce-AllGather):{}", getScAllTC.ElapsedMS()); vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") TimeCost uniqueAll2AllTC; - int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, - keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, keyRecv.data(), + rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -792,8 +834,8 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, EASY_END_BLOCK LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, batchName:{}, MPI_Alltoallv finish." 
" processSplitKeysTC(ms):{}", - batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); - return { keyRecv, scAll, ss }; + batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); + return {keyRecv, scAll, ss}; } /* @@ -801,15 +843,16 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, * splitKeys返回:将数据的key切分到其所在dev id对应的桶中,并去重。 * restore返回:去重后key在桶内偏移量(用于计算恢复向量) */ -tuple, vector> KeyProcess::HashSplit(const unique_ptr& batch) const +tuple, vector> KeyProcess::HashSplit( + const unique_ptr& batch) const { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -817,9 +860,10 @@ tuple, vector> KeyProcess::HashSplit(const unique_ptrsecond; } } @@ -832,10 +876,11 @@ tuple, vector> KeyProcess::HashSplit(const unique_ptrchannel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} " + "unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return { splitKeys, restore }; + return {splitKeys, restore}; } void KeyProcess::PaddingAlltoallVC(vector& splitKeys) const @@ -857,10 +902,10 @@ tuple, vector, vector>> KeyProcess::Hash emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); - vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 + vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map> uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map> uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -868,10 +913,11 @@ tuple, vector, vector>> KeyProcess::Hash auto result = uKey.find(key); if (result == uKey.end()) { splitKeys[devId].push_back(key); - restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) + restore[i] = + hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) uKey[key].first = restore[i]; uKey[key].second = 1; - } else { // 去重 + } else { // 去重 restore[i] = result->second.first; uKey[key].second++; } @@ -897,20 +943,22 @@ tuple, vector, vector>> KeyProcess::Hash for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} faae_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} " + "faae_unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return { splitKeys, restore, keyCount }; + return {splitKeys, restore, keyCount}; } -tuple, vector, vector> KeyProcess::HotHashSplit(const unique_ptr& batch) +tuple, vector, vector> KeyProcess::HotHashSplit( + const unique_ptr& batch) { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = 
batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - absl::flat_hash_map uKey; // 用于去重查询 + absl::flat_hash_map uKey; // 用于去重查询 absl::flat_hash_map keyCountMapByEmbName; std::shared_lock lock(g_smut); auto hotMap = hotKey[batch->name]; @@ -919,31 +967,31 @@ tuple, vector, vector> KeyProcess::HotHashSplit(cons vector hotPosDev(hotEmbTotCount[batch->name]); int hotCount = 0; int hotOffset = hotEmbTotCount[batch->name]; - for (size_t i = 0; i < miniBs; i++) { // for mini batch + for (size_t i = 0; i < miniBs; i++) { // for mini batch const emb_key_t& key = batchData[i]; if (batch->batchId % hotEmbUpdateStep == 0) { keyCountMapByEmbName[key]++; } emb_key_t devId = abs(key % static_cast(rankInfo.rankSize)); auto result = uKey.find(key); - if (result != uKey.end()) { // // already in splitKeys + if (result != uKey.end()) { // // already in splitKeys restore[i] = result->second; continue; } // new key in current batch - splitKeys[devId].push_back(key); // push to bucket + splitKeys[devId].push_back(key); // push to bucket auto hot = hotMap.find(key); - if (hot != hotMap.end()) { // is hot key - if (hot->second == -1) { // is new hot key in this batch + if (hot != hotMap.end()) { // is hot key + if (hot->second == -1) { // is new hot key in this batch // pos in lookup vec (need add ss) for hot-gather hotPos[hotCount] = static_cast(splitKeys[devId].size()) - 1; - hotPosDev[hotCount] = devId; // which dev, for get ss + hotPosDev[hotCount] = devId; // which dev, for get ss hot->second = hotCount; - restore[i] = hotCount++; // get pos of hot emb + restore[i] = hotCount++; // get pos of hot emb } else { restore[i] = hot->second; } - } else { // is not hot key + } else { // is not hot key // restore记录去重后key在桶内偏移量(用于计算恢复向量) restore[i] = static_cast(splitKeys[devId].size() + (hotOffset - 1)); } @@ -955,22 +1003,25 @@ tuple, vector, vector> KeyProcess::HotHashSplit(cons for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} hot_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} " + "hot_unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], batch->batchId % hotEmbUpdateStep == 0, - batch->name); + UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], + batch->batchId % hotEmbUpdateStep == 0, batch->name); AddCountStartToHotPos(splitKeys, hotPos, hotPosDev, batch); - return { splitKeys, restore, hotPos }; + return {splitKeys, restore, hotPos}; } -void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, +void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, + const vector& hotPosDev, const unique_ptr& batch) { vector splitKeysSize; - for (auto& splitKey: splitKeys) { - int tmp = rankInfo.useStatic ? embInfos[batch->name].sendCount : static_cast(splitKey.size()); + for (auto& splitKey : splitKeys) { + int tmp = rankInfo.useStatic ? 
embInfos[batch->name].sendCount + : static_cast(splitKey.size()); splitKeysSize.push_back(tmp); } @@ -980,13 +1031,13 @@ void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& ho } } -void KeyProcess::UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, +void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, uint32_t count, bool refresh, const string& embName) { auto& hotMap = hotKey[embName]; if (refresh) { priority_queue> pq; - for (size_t i = 0;i < keySend.size(); ++i) { + for (size_t i = 0; i < keySend.size(); ++i) { if (keySend[i] == -1) { continue; } @@ -1005,15 +1056,15 @@ void KeyProcess::UpdateHotMapForUnique(const KeysT &keySend, const vector& keyCountMapByEmbName, uint32_t count, bool refresh, - const string& embName) +void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, + uint32_t count, bool refresh, const string& embName) { if (!refresh) { return; } auto& hotMap = hotKey[embName]; - priority_queue> pq; // top k key - for (auto& p: keyCountMapByEmbName) { + priority_queue> pq; // top k key + for (auto& p : keyCountMapByEmbName) { pq.push(pair(-p.second, p.first)); if (pq.size() > count) { pq.pop(); @@ -1029,43 +1080,46 @@ void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapBy } /* - * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread id的)线程间的通信量矩阵 + * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread + * id的)线程间的通信量矩阵 * scAll返回:所有线程间的通信量矩阵(按行平铺的一维向量) */ -vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch) +vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, + const unique_ptr& batch) { EASY_FUNCTION() vector scAll; scAll.resize(rankInfo.rankSize * rankInfo.rankSize); - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, + batch->batchId); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, - scAll.data(), rankInfo.rankSize, MPI_INT, - comm[batch->channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAll.data(), + rankInfo.rankSize, MPI_INT, comm[batch->channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {} commId {}, MPI_Allgather failed:{}", rankInfo.rankId, commId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, " + "key scAll matrix:\n{}", batch->channel, commId, batch->batchId, VectorToString(scAll)); return scAll; } -void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, - vector &scAllOut) +void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, + const unique_ptr& batch, vector& scAllOut) { EASY_FUNCTION() int channel = batch->channel; scAllOut.resize(rankInfo.rankSize * rankInfo.rankSize); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, - scAllOut.data(), rankInfo.rankSize, MPI_INT, - comm[channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAllOut.data(), + rankInfo.rankSize, MPI_INT, comm[channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Allgather 
failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key " + "scAllOut matrix:\n{}", channel, commId, batch->batchId, VectorToString(scAllOut)); } @@ -1073,9 +1127,9 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; + auto& maxOffsetTmp = maxOffset[embName]; auto& evictPos = evictPosMap[embName]; for (long& key : splitKey) { if (key == -1) { @@ -1088,8 +1142,9 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe size_t offset; // 新值, emb有pos可复用 offset = evictPos.back(); - LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse offset [{}], evictSize [{}]!!!", - embName, key, offset, evictPos.size()); + LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse " + "offset [{}], evictSize [{}]!!!", + embName, key, offset, evictPos.size()); key2Offset[key] = offset; key = offset; evictPos.pop_back(); @@ -1107,18 +1162,18 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe LOG_ERROR("dev cache overflow {} > {}", maxOffsetTmp, embInfos[embName].devVocabSize); throw std::runtime_error("dev cache overflow!"); } - LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", - embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", embName, maxOffsetTmp, + embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel) { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; - auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion + auto& maxOffsetTmp = maxOffset[embName]; + auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion for (long& key : splitKey) { if (key == -1) { key = 0; @@ -1141,8 +1196,8 @@ void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& spli key = 0; } } - LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", - embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", embName, maxOffsetTmp, + embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } /* @@ -1150,7 +1205,8 @@ void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& spli * 输入接收到emb块的偏移blockOffset,batch内每个key在块内的偏移restoreVec * 输出恢复向量restoreVec,即batch到keySend(平铺的splitKeys)的映射 * 实现方案2:用map记录keySend中key和表内index/offset的映射,在恢复emb时直接根据batch的key查询该map即可找到receive - * emb中的 位置,时间复杂度:O(map构建keySend.size + map查询),空间复杂度:O(map) + * emb中的 位置,时间复杂度:O(map构建keySend.size + + * map查询),空间复杂度:O(map) */ void KeyProcess::BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, vector& restoreVec, int hotPosSize) const @@ -1167,11 +1223,11 @@ void KeyProcess::BuildRestoreVec(const unique_ptr& 
batch, const vecto hotNum += 1; } } - LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", - hotNum, batch->Size(), buildRestoreVecTC.ElapsedMS()); + LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", hotNum, batch->Size(), + buildRestoreVecTC.ElapsedMS()); } -template +template T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, int channel) { std::lock_guard lockGuard(mut); @@ -1181,7 +1237,8 @@ T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, in } auto topBatch = get(list[embName][channel].top()); if (topBatch < batch) { - LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, batch, channel); + LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, + batch, channel); this_thread::sleep_for(1s); } if (topBatch != batch) { @@ -1201,7 +1258,8 @@ T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, in KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) { TimeCost tc = TimeCost(); - // 循环尝试获取list中的数据;如果key process线程退出或者处理数据超时,返回空vector + // 循环尝试获取list中的数据;如果key + // process线程退出或者处理数据超时,返回空vector while (true) { if (!isRunning) { return {}; @@ -1209,8 +1267,9 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", - embName, channel, batch); + LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, " + "exiting the loop! {}[{}]:{}", + embName, channel, batch); return {}; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1231,8 +1290,9 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) SendEos(batch, channel); return {}; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", - embName, channel, batch, readEmbKeyBatchId); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: " + "{}, readEmbKey batchId: {}.", + embName, channel, batch, readEmbKeyBatchId); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1256,22 +1316,28 @@ void KeyProcess::SendEos(int batchId, int channel) vector tensors; bool isNeedResend = true; - for (const auto& emb: as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, batchId, emb.first); + for (const auto& emb : + as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos " + "start.", + channel, batchId, emb.first); if (!isRunning) { throw EndRunExit("SendEos end run, isRunning is false after lock destroyMutex."); } for (const string& transName : usedChannelNames) { - string sendName = StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); + string sendName = + StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); size_t channelSize = 0; - + acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] Before send eos, {} contains {}.", sendName, channelSize); - SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, 
isNeedResend); + SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, + isNeedResend); acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] After send eos, {} contains {}.", sendName, channelSize); } - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, batchId, emb.first); + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, + batchId, emb.first); } LOG_INFO("channelId:{} batchId:{}, SendEos end.", channel, batchId); @@ -1285,7 +1351,8 @@ void KeyProcess::SendEos(int batchId, int channel) /// \param channel 通道索引(训练/推理) /// \param type 数据类型 /// \return -unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type) +unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, + ProcessedInfo type) { TimeCost tc = TimeCost(); info_list_t* list; @@ -1302,7 +1369,8 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa throw std::invalid_argument("Invalid ProcessedInfo Type."); } - // 循环尝试获取list中的数据;如果key process线程退出或者处理数据超时,返回空指针 + // 循环尝试获取list中的数据;如果key + // process线程退出或者处理数据超时,返回空指针 while (true) { if (!isRunning) { return nullptr; @@ -1310,8 +1378,9 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", - embName, channel, batch); + LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, " + "exiting the loop! {}[{}]:{}", + embName, channel, batch); return nullptr; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1328,15 +1397,18 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa return uTensor; } catch (EmptyList&) { unique_lock lockEosGuard(eosMutex); - // 避免eos在keyProcess还未处理完数据时插队到通道前面, readEmbKey真实的次数是readEmbedBatchId减1 - if (isNeedSendEos[channel] && (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { + // 避免eos在keyProcess还未处理完数据时插队到通道前面, + // readEmbKey真实的次数是readEmbedBatchId减1 + if (isNeedSendEos[channel] && + (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { LOG_INFO("channelId:{} batchId:{}, GetInfoVec eos.", channel, batch); unique_lock lockDestroyGuard(destroyMutex); SendEos(batch, channel); return nullptr; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", - embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: " + "{}, readEmbKey batchId: {}.", + embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1349,7 +1421,7 @@ void KeyProcess::SendA2A(const vector& a2aInfo, const string& embName, int { // 数据放到队列里,在mgmt里面发送(检查发送数据量) auto tensors = make_unique>(); - Tensor tmpTensor(tensorflow::DT_INT64, { rankInfo.rankSize, rankInfo.rankSize }); + Tensor tmpTensor(tensorflow::DT_INT64, {rankInfo.rankSize, rankInfo.rankSize}); auto tmpData = tmpTensor.matrix(); for (int i = 0; i < rankInfo.rankSize; ++i) { for (int j = 0; j < rankInfo.rankSize; ++j) { @@ -1369,13 +1441,14 @@ int KeyProcess::GetMaxStep(int channelId) const return 
rankInfo.ctrlSteps.at(channelId); } -void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm +void KeyProcess::EvictKeys(const string& embName, + const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm funEvictCall: [{}]! keySize:{}", embName, keys.size()); EmbeddingMgmt::Instance()->EvictKeys(embName, keys); } -void KeyProcess::EvictKeysCombine(const vector& keys) // hbm +void KeyProcess::EvictKeysCombine(const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm combine funEvictCall, keySize:{}", keys.size()); EmbeddingMgmt::Instance()->EvictKeysCombine(keys); @@ -1384,7 +1457,7 @@ void KeyProcess::EvictKeysCombine(const vector& keys) // hbm void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector& keys) { EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD size_t keySize = keys.size(); auto& devHashMap = keyOffsetMap.at(embName); @@ -1398,7 +1471,7 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vectorsecond; @@ -1406,24 +1479,26 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector offset) { if (offset.size() > embInfos[embName].devVocabSize) { - LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize); - throw runtime_error( - Logger::Format("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize - ).c_str()); + LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than " + "dev vocabSize {}", + embName, offset.size(), embInfos[embName].devVocabSize); + throw runtime_error(Logger::Format("{} overflow! init evict dev, evictOffset size {} " + "bigger than dev vocabSize {}", + embName, offset.size(), embInfos[embName].devVocabSize) + .c_str()); } vector tmpDataOut; Tensor tmpData = Vec2TensorI32(offset); tmpDataOut.emplace_back(tmpData); - tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); + tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, {1})); auto evictLen = tmpDataOut.back().flat(); int evictSize = static_cast(offset.size()); @@ -1433,15 +1508,16 @@ void KeyProcess::EvictInitDeviceEmb(const string& embName, vector offset auto trans = Singleton::GetInstance(); trans->Send(TransferChannel::EVICT, tmpDataOut, TRAIN_CHANNEL_ID, embName); - LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! send offsetSize:{}", embName, offset.size()); + LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! 
send offsetSize:{}", embName, + offset.size()); } -string KeyProcess::DumpSplitKeys(vector> &splitKeys) const +string KeyProcess::DumpSplitKeys(vector>& splitKeys) const { stringstream ssTrace; for (int devId = 0; devId < rankInfo.rankSize; ++devId) { ssTrace << '|' << devId << ":"; - for (auto key: splitKeys[devId]) { + for (auto key : splitKeys[devId]) { ssTrace << key << ','; } ssTrace << '|'; @@ -1480,7 +1556,8 @@ void KeyProcess::RecordKeyCountMap(const unique_ptr& batch) void KeyProcess::SetEos(int status, int channelId) { unique_lock lockGuard(eosMutex); - LOG_INFO("isNeedSendEos status is changed, before status:[{}], input status:{}, channel:[{}], ", + LOG_INFO("isNeedSendEos status is changed, before status:[{}], input " + "status:{}, channel:[{}], ", isNeedSendEos[channelId], status, channelId); isNeedSendEos[channelId] = (status == 1); } diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h index 8bd7b8d0..d6a0b80b 100644 --- a/src/core/key_process/key_process.h +++ b/src/core/key_process/key_process.h @@ -16,283 +16,291 @@ See the License for the specific language governing permissions and #ifndef MX_REC_KEY_PROCESS_H #define MX_REC_KEY_PROCESS_H -#include +#include +#include + #include #include +#include #include #include -#include - -#include -#include -#include "ock_ctr_common/include/factory.h" +#include -#include "utils/common.h" #include "emb_table/emb_table.h" #include "feature_admit_and_evict.h" #include "hybrid_mgmt/hybrid_mgmt_block.h" +#include "ock_ctr_common/include/factory.h" +#include "utils/common.h" #include "utils/singleton.h" namespace MxRec { - using namespace std; +using namespace std; - template - struct Cmp { - bool operator()(const T& a, const T& b) const - { - return get(a) > get(b); // batch id order - } - }; +template +struct Cmp { + bool operator()(const T& a, const T& b) const + { + return get(a) > get(b); // batch id order + } +}; - template - using heap_t = priority_queue, Cmp>; +template +using heap_t = priority_queue, Cmp>; - template - using info_list_t = map, MAX_QUEUE_NUM>>; +template +using info_list_t = map, MAX_QUEUE_NUM>>; - enum class ProcessedInfo { - RESTORE, - ALL2ALL, - INVALID - }; +enum class ProcessedInfo { + RESTORE, + ALL2ALL, + INVALID +}; - class EndRunExit : public std::exception { - public: - explicit EndRunExit(const char* message) : errorMessage(message) {} +class EndRunExit : public std::exception { +public: + explicit EndRunExit(const char* message) : errorMessage(message) {} - const char* what() const noexcept override - { - return errorMessage; - } + const char* what() const noexcept override + { + return errorMessage; + } - private: - const char* errorMessage; - }; +private: + const char* errorMessage; +}; - constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 - constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 +constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 +constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 - class EmptyList : public std::exception { - }; +class EmptyList : public std::exception {}; - class WrongListTop : public std::exception { - }; +class WrongListTop : public std::exception {}; - class KeyProcess { - public: - bool Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues = {}, int seed = 0); +class KeyProcess { +public: + bool Initialize(const RankInfo& rInfo, const vector& eInfos, + const vector& thresholdValues = {}, int seed = 0); - unique_ptr> GetInfoVec(int batch, const string& embName, int channel, 
ProcessedInfo type); + unique_ptr> GetInfoVec(int batch, const string& embName, int channel, + ProcessedInfo type); - KeysT GetLookupKeys(int batch, const string& embName, int channel); + KeysT GetLookupKeys(int batch, const string& embName, int channel); - int GetMaxStep(int channelId) const; + int GetMaxStep(int channelId) const; - OffsetMemT GetMaxOffset(); + OffsetMemT GetMaxOffset(); - KeyOffsetMemT GetKeyOffsetMap(); + KeyOffsetMemT GetKeyOffsetMap(); - KeyCountMemT GetKeyCountMap(); + KeyCountMemT GetKeyCountMap(); - FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); + FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); - void LoadMaxOffset(OffsetMemT& loadData); + void LoadMaxOffset(OffsetMemT& loadData); - void LoadKeyOffsetMap(KeyOffsetMemT& loadData); + void LoadKeyOffsetMap(KeyOffsetMemT& loadData); - void LoadKeyCountMap(KeyCountMemT& loadData); + void LoadKeyCountMap(KeyCountMemT& loadData); - void Destroy(); + void Destroy(); - void LoadSaveLock(); + void LoadSaveLock(); - void LoadSaveUnlock(); + void LoadSaveUnlock(); - void EvictKeys(const string& embName, const vector& keys); + void EvictKeys(const string& embName, const vector& keys); - void EvictKeysCombine(const vector& keys); + void EvictKeysCombine(const vector& keys); - void SetupHotEmbUpdateStep(); + void SetupHotEmbUpdateStep(); - int64_t GetExpansionTableSize(const string& embName); + int64_t GetExpansionTableSize(const string& embName); - int64_t GetExpansionTableCapacity(const string& embName); + int64_t GetExpansionTableCapacity(const string& embName); - void RecordKeyCountMap(const unique_ptr& batch); + void RecordKeyCountMap(const unique_ptr& batch); - template - void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) - { - absl::flat_hash_map umap; - restoreVecSec.resize(lookupKeys.size(), -1); - int32_t length = 0; + template + void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) + { + absl::flat_hash_map umap; + restoreVecSec.resize(lookupKeys.size(), -1); + int32_t length = 0; - for (size_t i = 0; i < lookupKeys.size(); ++i) { - int64_t key = lookupKeys[i]; - if (rankInfo.useStatic && ( - (!rankInfo.useDynamicExpansion && key == -1) || (rankInfo.useDynamicExpansion && key == 0))) { - continue; - } + for (size_t i = 0; i < lookupKeys.size(); ++i) { + int64_t key = lookupKeys[i]; + if (rankInfo.useStatic && ((!rankInfo.useDynamicExpansion && key == -1) || + (rankInfo.useDynamicExpansion && key == 0))) { + continue; + } - auto result = umap.find(key); - if (result == umap.end()) { - uniqueKeys.push_back(lookupKeys[i]); - umap[key] = length; - restoreVecSec[i] = length; - length++; - } else { - restoreVecSec[i] = result->second; - } + auto result = umap.find(key); + if (result == umap.end()) { + uniqueKeys.push_back(lookupKeys[i]); + umap[key] = length; + restoreVecSec[i] = length; + length++; + } else { + restoreVecSec[i] = result->second; } + } - if (rankInfo.useStatic) { - if (rankInfo.useDynamicExpansion) { - uniqueKeys.resize(lookupKeys.size(), 0); - } else { - uniqueKeys.resize(lookupKeys.size(), -1); - } + if (rankInfo.useStatic) { + if (rankInfo.useDynamicExpansion) { + uniqueKeys.resize(lookupKeys.size(), 0); + } else { + uniqueKeys.resize(lookupKeys.size(), -1); } } + } + + void SetEos(int status, int channelId); - void SetEos(int status, int channelId); + void SendEos(int batchId, int channel); - void SendEos(int batchId, int channel); + bool isRunning{false}; - bool isRunning { false }; + std::mutex destroyMutex; + std::mutex eosMutex; + inline bool HasEmbName(const string& 
embName) + { + return embInfos.find(embName) != embInfos.end(); + }; + GTEST_PRIVATE : - std::mutex destroyMutex; - std::mutex eosMutex; - inline bool HasEmbName(const string& embName) - { - return embInfos.find(embName) != embInfos.end(); - }; - GTEST_PRIVATE: + int + Start(); - int Start(); + template + T GetInfo(info_list_t& list, int batch, const string& embName, int channel); - template - T GetInfo(info_list_t& list, int batch, const string& embName, int channel); + RankInfo rankInfo; + map embInfos; + MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; + std::mutex mut{}; + vector> procThreads{}; + std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]{}; + info_list_t lookupKeysList; + list>> storage; + info_list_t infoList; + info_list_t all2AllList; + map maxOffset{}; + map> keyOffsetMap{}; + map> keyCountMap{}; + FeatureAdmitAndEvict m_featureAdmitAndEvict{}; + map> evictPosMap{}; + map> hotKey{}; + map hotEmbTotCount; + map embeddingTableMap{}; + ock::ctr::FactoryPtr factory{}; + int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; + bool isWithFAAE; + bool isNeedSendEos[2] = {0, 0}; // 分别代表通道0、1的eos状态 - RankInfo rankInfo; - map embInfos; - MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; - std::mutex mut {}; - vector> procThreads {}; - std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD] {}; - info_list_t lookupKeysList; - list>> storage; - info_list_t infoList; - info_list_t all2AllList; - map maxOffset {}; - map> keyOffsetMap {}; - map> keyCountMap {}; - FeatureAdmitAndEvict m_featureAdmitAndEvict {}; - map> evictPosMap {}; - map> hotKey {}; - map hotEmbTotCount; - map embeddingTableMap {}; - ock::ctr::FactoryPtr factory {}; - int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; - bool isWithFAAE; - bool isNeedSendEos[2] = { 0, 0 }; // 分别代表通道0、1的eos状态 + void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); - void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); + void KeyProcessTask(int channel, int threadId); - void KeyProcessTask(int channel, int threadId); + void KeyProcessTaskWithFastUnique(int channel, int threadId); - void KeyProcessTaskWithFastUnique(int channel, int threadId); + bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); - bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); + bool KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, + ock::ctr::UniquePtr& unique, int channel, int threadId); - bool KeyProcessTaskHelperWithFastUnique(unique_ptr &batch, ock::ctr::UniquePtr& unique, - int channel, int threadId); + tuple, vector> ProcessSplitKeys(const unique_ptr& batch, + int id, vector& splitKeys); - tuple, vector> ProcessSplitKeys(const unique_ptr& batch, - int id, vector& splitKeys); + void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); - void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); + void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, + bool& uniqueInitialize, const unique_ptr& batch, + ock::ctr::UniquePtr& unique); - void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, - const unique_ptr & batch, ock::ctr::UniquePtr& unique); + void ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, + int id, UniqueInfo& uniqueInfoOut); - void ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniquePtr& unique, - int id, UniqueInfo& uniqueInfoOut); + size_t GetKeySize(const unique_ptr& batch); - size_t GetKeySize(const unique_ptr &batch); + void 
All2All(vector& sc, int id, const unique_ptr& batch, + KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut); - void All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, - All2AllInfo& all2AllInfoOut); + auto HashSplit(const unique_ptr& batch) const + -> tuple, vector>; - auto HashSplit(const unique_ptr& batch) const -> tuple, vector>; + auto HotHashSplit(const unique_ptr& batch) + -> tuple, vector, vector>; - auto HotHashSplit(const unique_ptr& batch) -> tuple, vector, vector>; + void PaddingAlltoallVC(vector& splitKeys) const; - void PaddingAlltoallVC(vector& splitKeys) const; + tuple, vector, vector>> HashSplitWithFAAE( + const unique_ptr& batch) const; - tuple, vector, vector>> - HashSplitWithFAAE(const unique_ptr& batch) const; + vector GetScAll(const vector& keyScLocal, int commId, + const unique_ptr& batch); - vector GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch); + void GetScAllForUnique(const vector& keyScLocal, int commId, + const unique_ptr& batch, vector& scAllOut); - void GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, - vector &scAllOut); + void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); - void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); + void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); - void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); + unique_ptr GetBatchData(int channel, int commId) const; - unique_ptr GetBatchData(int channel, int commId) const; + void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, + vector& restoreVec, int hotPosSize = 0) const; - void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, - vector& restoreVec, int hotPosSize = 0) const; - - void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); + void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); - void EvictDeleteDeviceEmb(const string& embName, const vector& keys); + void EvictDeleteDeviceEmb(const string& embName, const vector& keys); - void EvictInitDeviceEmb(const string& embName, vector offset); + void EvictInitDeviceEmb(const string& embName, vector offset); - void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, - const string& embName); + void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, + bool refresh, const string& embName); - void UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, - uint32_t count, bool refresh, const string& embName); + void UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, + uint32_t count, bool refresh, const string& embName); - void HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, - KeySendInfo& keySendInfo, vector& sc, vector& splitSize); + void HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, + KeySendInfo& keySendInfo, vector& sc, vector& splitSize); - void PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys); + void PushResult(unique_ptr& batch, unique_ptr> tensors, + KeysT& lookupKeys); - void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel); + void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, + int channel); - void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, - const unique_ptr& batch); + void 
AddCountStartToHotPos(vector& splitKeys, vector& hotPos, + const vector& hotPosDev, const unique_ptr& batch); - void ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, - vector &hotPos, vector &restore, const int hotOffset) const; + void ComputeHotPos(const unique_ptr& batch, + absl::flat_hash_map& hotMap, vector& hotPos, + vector& restore, const int hotOffset) const; - vector GetCountRecv(const unique_ptr& batch, int id, - vector>& keyCount, vector scAll, vector ss); + vector GetCountRecv(const unique_ptr& batch, int id, + vector>& keyCount, vector scAll, + vector ss); - void HashSplitHelper(const unique_ptr & batch, vector & splitKeys, - vector & restore, vector & hotPos, - vector >& keyCount); + void HashSplitHelper(const unique_ptr& batch, vector& splitKeys, + vector& restore, vector& hotPos, + vector>& keyCount); - template - inline vector Count2Start(const vector& count) const - { - vector start = { 0 }; - for (size_t i = 0; i < count.size() - 1; ++i) { - start.push_back(count[i] + start.back()); - } - return start; + template + inline vector Count2Start(const vector& count) const + { + vector start = {0}; + for (size_t i = 0; i < count.size() - 1; ++i) { + start.push_back(count[i] + start.back()); } + return start; + } - string DumpSplitKeys(vector>& splitKeys) const; - }; + string DumpSplitKeys(vector>& splitKeys) const; +}; #define KEY_PROCESS_INSTANCE Singleton::GetInstance() -} // end namespace MxRec +} // end namespace MxRec -#endif // MX_REC_KEY_PROCESS_H +#endif // MX_REC_KEY_PROCESS_H -- Gitee From fa9bb8d73b5a87972bd4dfcd846941eb9e23a8a3 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 22 Apr 2024 14:50:22 +0800 Subject: [PATCH 051/302] Revert "add .clang-format" This reverts commit d7ed2aa49e8c464e6dc61c3e6216eb18f4e8ae42. 
--- src/core/key_process/key_process.cpp | 535 ++++++++++++--------------- src/core/key_process/key_process.h | 382 ++++++++++--------- 2 files changed, 416 insertions(+), 501 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 58312ca1..f76f6907 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -15,21 +15,19 @@ See the License for the specific language governing permissions and #include "key_process.h" -#include - #include #include - -#include "emb_table/embedding_mgmt.h" -#include "hd_transfer/hd_transfer.h" -#include "host_emb/host_emb.h" -#include "ock_ctr_common/include/error_code.h" +#include #include "utils/common.h" -#include "utils/config.h" #include "utils/logger.h" #include "utils/safe_queue.h" #include "utils/singleton.h" #include "utils/time_cost.h" +#include "utils/config.h" +#include "host_emb/host_emb.h" +#include "emb_table/embedding_mgmt.h" +#include "hd_transfer/hd_transfer.h" +#include "ock_ctr_common/include/error_code.h" using namespace std; using namespace chrono; @@ -43,7 +41,8 @@ void KeyProcess::SetupHotEmbUpdateStep() } bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues, int seed) + const vector& thresholdValues, + int seed) { this->rankInfo = rInfo; if (rankInfo.useHot) { @@ -51,7 +50,7 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos } map scInfo; - for (const auto& info : eInfos) { + for (const auto& info: eInfos) { embInfos[info.name] = info; scInfo[info.name] = info.sendCount; if (rankInfo.useHot) { @@ -67,8 +66,8 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos LOG_INFO(KEY_PROCESS "hot emb count info:{}", MapToString(hotEmbTotCount)); MPI_Group worldGroup; MPI_Comm_group(MPI_COMM_WORLD, &worldGroup); - for (auto& i : comm) { - for (auto& j : i) { + for (auto& i: comm) { + for (auto& j: i) { MPI_Comm_create(MPI_COMM_WORLD, worldGroup, &j); } } @@ -86,14 +85,12 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos if (GlobalEnv::fastUnique) { int result = ock::ctr::Factory::Create(factory); if (result != 0) { - throw runtime_error( - Logger::Format("create fast factory failed, error code:{}", result)); + throw runtime_error(Logger::Format("create fast factory failed, error code:{}", result)); } } LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}, useHot:{}", - MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic, - rInfo.useHot); + MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic, rInfo.useHot); #ifndef GTEST Start(); #endif @@ -106,9 +103,8 @@ int KeyProcess::Start() // bind like: // 0 1 2 3 4 5 0 1 2 3 4 5 // | rank0 | | rank1 | - // each rank creates KEY_PROCESS_THREAD threads, each thread process one - // batchdata - LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 + // each rank creates KEY_PROCESS_THREAD threads, each thread process one batchdata + LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 auto fn = [this](int channel, int threadId) { #ifndef GTEST auto ret = aclrtSetDevice(static_cast(rankInfo.deviceId)); @@ -122,7 +118,7 @@ int KeyProcess::Start() } else { KeyProcessTask(channel, threadId); } - }; // for clean code + }; // for clean code int threadNum = GetThreadNumEnv(); for (int channel = 0; channel < MAX_CHANNEL_NUM; ++channel) { LOG_INFO(KEY_PROCESS "key process thread num: {}", threadNum); @@ 
-140,9 +136,8 @@ void KeyProcess::InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo) if (rankInfo.useDynamicExpansion) { embeddingSize = info.embeddingSize; } - hotEmbTotCount[info.name] = - static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * - HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); + hotEmbTotCount[info.name] = static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * + HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); } OffsetMemT KeyProcess::GetMaxOffset() @@ -187,7 +182,7 @@ void KeyProcess::Destroy() { isRunning = false; LOG_INFO(KEY_PROCESS "rankId:{} KeyProcess begin destroy.", rankInfo.rankId); - for (auto& i : procThreads) { + for (auto& i: procThreads) { i->join(); } procThreads.clear(); @@ -197,8 +192,8 @@ void KeyProcess::Destroy() /// 每个数据通道的所有数据处理线程上锁 void KeyProcess::LoadSaveLock() { - for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].lock(); } } @@ -207,8 +202,8 @@ void KeyProcess::LoadSaveLock() /// 每个数据通道的所有数据处理线程释放锁 void KeyProcess::LoadSaveUnlock() { - for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].unlock(); } } @@ -234,9 +229,8 @@ void KeyProcess::GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf) uniqueConf.maxThreadNum = GlobalEnv::maxUniqueThreadNum; } -void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, - bool& uniqueInitialize, const unique_ptr& batch, - ock::ctr::UniquePtr& unique) +void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, + const unique_ptr & batch, ock::ctr::UniquePtr& unique) { uniqueConf.desiredSize = static_cast(batch->Size()); if (preBatchSize != batch->Size()) { @@ -278,8 +272,7 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = - GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -292,8 +285,7 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) break; } LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process with fast unique cost:{}," - " get data time(ms):{}, batch name:{}, channelId:{}, " - "threadId:{}, batchId:{}", + " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); @@ -301,13 +293,14 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) batchQueue->PutDirty(move(batch)); } unique->UnInitialize(); - } catch (const EndRunExit& e) { + } catch (const EndRunExit &e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } LOG_INFO(KEY_PROCESS 
"KeyProcessTaskWithFastUnique exit. rank:{} channelId:{}, threadId:{}", - rankInfo.rankId, channel, threadId); + rankInfo.rankId, channel, threadId); } + void KeyProcess::KeyProcessTask(int channel, int threadId) { unique_ptr batch; @@ -315,8 +308,7 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = - GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -328,46 +320,43 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) break; } LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process cost:{}," - " get data time(ms):{}, batch name:{}, " - "channelId:{}, threadId:{}, batchId:{}", + " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); auto batchQueue = SingletonQueue::GetInstances(queueIndex); batchQueue->PutDirty(move(batch)); } - } catch (const EndRunExit& e) { + } catch (const EndRunExit &e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } - LOG_INFO(KEY_PROCESS "KeyProcessTask exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, - channel, threadId); + LOG_INFO(KEY_PROCESS "KeyProcessTask exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, channel, threadId); } -void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& splitKeys, - vector& restore, vector& hotPos, - vector>& keyCount) +void KeyProcess::HashSplitHelper(const unique_ptr & batch, vector & splitKeys, + vector & restore, vector & hotPos, + vector >& keyCount) { TimeCost uniqueTc; if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { - tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 } else { if (rankInfo.useHot) { - tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 } else { - tie(splitKeys, restore) = HashSplit(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore) = HashSplit(batch); // 按存储dev id切分并去重 } } LOG_DEBUG("uniqueTc(ms):{}", uniqueTc.ElapsedMS()); } -bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, - ock::ctr::UniquePtr& unique, int channel, - int threadId) +bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, ock::ctr::UniquePtr& unique, + int channel, int threadId) { // tuple for keyRec restore hotPos scAll countRecv isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; TimeCost totalTimeCost = TimeCost(); TimeCost fastUniqueTC; UniqueInfo uniqueInfo; @@ -376,11 +365,11 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch // 特征准入&淘汰 if (isWithFAAE && - (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, uniqueInfo.all2AllInfo.keyRecv, - uniqueInfo.all2AllInfo.countRecv) == - 
FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + (m_featureAdmitAndEvict.FeatureAdmit( + channel, batch, uniqueInfo.all2AllInfo.keyRecv, uniqueInfo.all2AllInfo.countRecv) == + FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", - rankInfo.rankId, threadId, channel); + rankInfo.rankId, threadId, channel); return false; } std::lock_guard lock(loadSaveMut[channel][threadId]); @@ -394,9 +383,7 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch LOG_DEBUG("key2OffsetTC(ms):{}", key2OffsetTC.ElapsedMS()); } // Static all2all,need send count - if (!rankInfo.useStatic) { - SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); - } + if (!rankInfo.useStatic) { SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); } auto tensors = make_unique>(); tensors->push_back(Vec2TensorI32(uniqueInfo.restore)); @@ -407,17 +394,15 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); - tensors->push_back(rankInfo.useDynamicExpansion - ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) - : Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); + tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) : + Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); } TimeCost pushResultTC; PushResult(batch, move(tensors), uniqueInfo.all2AllInfo.keyRecv); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} " - "key_process_time_cost_with_fast_unique {}", - channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", + channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); return true; @@ -445,8 +430,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, // 特征准入&淘汰 if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE && - (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) == - FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, + countRecv) == FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, threadId, channel); return false; @@ -459,9 +444,7 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, } // Static all2all,need send count - if (!rankInfo.useStatic) { - SendA2A(scAll, batch->name, batch->channel, batch->batchId); - } + if (!rankInfo.useStatic) { SendA2A(scAll, batch->name, batch->channel, batch->batchId); } TimeCost pushResultTC; auto tensors = make_unique>(); @@ -473,24 +456,21 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(lookupKeys) - : Vec2TensorI32(lookupKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? 
Vec2TensorI64(lookupKeys) : Vec2TensorI32(lookupKeys)); } PushResult(batch, move(tensors), lookupKeys); LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", channel, - batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", + channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } return true; } -void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, - KeysT& lookupKeys, int channel) +void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) { - if (GlobalEnv::applyGradientsStrategy == - ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && + if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; @@ -499,39 +479,36 @@ void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tenso GlobalUnique(lookupKeys, uniqueKeys, restoreVecSec); LOG_DEBUG("globalUniqueSyncTC(ms):{}", globalUniqueSyncTC.ElapsedMS()); tensors->push_back(Vec2TensorI32(restoreVecSec)); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) - : Vec2TensorI32(uniqueKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) : Vec2TensorI32(uniqueKeys)); } } vector KeyProcess::GetCountRecv(const unique_ptr& batch, int id, - vector>& keyCount, vector scAll, - vector ss) + vector>& keyCount, vector scAll, vector ss) { TimeCost getCountRecvTC; if (rankInfo.useStatic) { - for (auto& cnt : keyCount) { + for (auto& cnt: keyCount) { cnt.resize(embInfos[batch->name].sendCount, 0); } } vector countSend; - for (auto& cnt : keyCount) { + for (auto& cnt: keyCount) { countSend.insert(countSend.cend(), cnt.cbegin(), cnt.cend()); } vector sc; for (int i = 0; i < rankInfo.rankSize; ++i) { sc.push_back(scAll.at(rankInfo.rankSize * rankInfo.rankId + i)); } - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 vector countRecv; countRecv.resize(rs.back() + rc.back()); - int retCode = - MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), - rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), + rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -544,19 +521,16 @@ void KeyProcess::PushResult(unique_ptr& batch, unique_ptr lockGuard(mut); storage.push_front(move(tensors)); - infoList[batch->name][batch->channel].push( - make_tuple(batch->batchId, batch->name, storage.begin())); + infoList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, storage.begin())); if (rankInfo.isDDR) { - lookupKeysList[batch->name][batch->channel].push( - make_tuple(batch->batchId, batch->name, move(lookupKeys))); + lookupKeysList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, move(lookupKeys))); } lockGuard.unlock(); } /* - * 
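// GetCountRecv above reads one column of the flattened rankSize x rankSize
// traffic matrix to learn how much each peer sends to this rank, then turns
// counts into displacements. A self-contained sketch of those two steps,
// assuming the row-major layout scAll[i * rankSize + j] = "rank i sends to
// rank j" used throughout this file.
#include <cstddef>
#include <vector>

std::vector<int> ReceiveCounts(const std::vector<int>& scAll, int rankSize, int rankId)
{
    std::vector<int> rc(rankSize);
    for (int i = 0; i < rankSize; ++i) {
        rc[i] = scAll.at(i * rankSize + rankId);  // column rankId = inbound traffic
    }
    return rc;
}

std::vector<int> ExclusivePrefixSum(const std::vector<int>& count)  // cf. Count2Start
{
    std::vector<int> start = {0};
    for (std::size_t i = 0; i + 1 < count.size(); ++i) {
        start.push_back(count[i] + start.back());
    }
    return start;
}
// With rankSize = 2, rankId = 1 and scAll = {3, 1, 2, 4}: rc = {1, 4},
// rs = {0, 1}, and the receive buffer needs rs.back() + rc.back() = 5 slots.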
从共享队列SingletonQueue中读取batch数据并返回。batch数据由 - * ReadEmbKeyV2 写入。 commID为线程标识[0, - * KEY_PROCESS_THREAD-1],不同线程、训练或推理数据用不同的共享队列通信 + * 从共享队列SingletonQueue中读取batch数据并返回。batch数据由 ReadEmbKeyV2 写入。 + * commID为线程标识[0, KEY_PROCESS_THREAD-1],不同线程、训练或推理数据用不同的共享队列通信 */ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const { @@ -577,37 +551,32 @@ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const this_thread::sleep_for(100us); if (tc.ElapsedSec() > GET_BATCH_TIMEOUT) { if (commId == 0) { - LOG_WARN(KEY_PROCESS - "getting batch timeout! 1. check last 'read batch cost' print. " - "channel[{}] commId[{}]", - channel, commId); + LOG_WARN(KEY_PROCESS "getting batch timeout! 1. check last 'read batch cost' print. " + "channel[{}] commId[{}]", channel, commId); } this_thread::sleep_for(seconds(1)); tc = TimeCost(); } if (!isRunning) { - LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, - commId); + LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, commId); throw EndRunExit("GetBatchData end run."); } } EASY_END_BLOCK - LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, get batch data " - "done, batchName:{}. bs:{} sample:[{}]", + LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, get batch data done, batchName:{}. bs:{} sample:[{}]", batch->channel, commId, batch->batchId, batch->name, batch->Size(), batch->UnParse()); #if defined(PROFILING) && defined(BUILD_WITH_EASY_PROFILER) if (batch->batchId == PROFILING_START_BATCH_ID) { EASY_PROFILER_ENABLE } else if (batch->batchId == PROFILING_END_BATCH_ID) { - ::profiler::dumpBlocksToFile( - StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); + ::profiler::dumpBlocksToFile(StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); } #endif return batch; } -size_t KeyProcess::GetKeySize(const unique_ptr& batch) +size_t KeyProcess::GetKeySize(const unique_ptr &batch) { size_t size = rankInfo.rankSize * embInfos[batch->name].sendCount; if (!rankInfo.useStatic) { @@ -616,9 +585,8 @@ size_t KeyProcess::GetKeySize(const unique_ptr& batch) return size; } -void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, - ock::ctr::UniquePtr& unique, int id, - UniqueInfo& uniqueInfoOut) +void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniquePtr& unique, + int id, UniqueInfo& uniqueInfoOut) { EASY_FUNCTION(profiler::colors::Purple) EASY_VALUE("batchId", batch->batchId) @@ -637,10 +605,10 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniqueIn uniqueIn; uniqueIn.inputIdCnt = static_cast(batch->Size()); - uniqueIn.inputId = reinterpret_cast(batch->sample.data()); + uniqueIn.inputId = reinterpret_cast(batch->sample.data()); ock::ctr::EnhancedUniqueOut uniqueOut; - uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); + uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); uniqueOut.index = reinterpret_cast(uniqueInfoOut.restore.data()); if (rankInfo.useStatic) { uniqueOut.idCnt = idCount.data(); @@ -649,7 +617,7 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, uniqueOut.idCnt = keySendInfo.keyCount.data(); } uniqueOut.uniqueIdCntInBucket = splitSize.data(); - uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); + uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); uniqueOut.uniqueIdCnt = 0; int ret = unique->DoEnhancedUnique(uniqueIn, uniqueOut); @@ -665,21 +633,19 @@ void 
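// The wait in GetBatchData above is a poll-with-watchdog: sleep briefly
// between queue checks, warn every GET_BATCH_TIMEOUT seconds while no data
// arrives, and abort via an exception once the processor stops. A sketch of
// that shape; TryGetBatch() is a hypothetical non-blocking accessor standing
// in for the SingletonQueue API, and plain runtime_error stands in for
// EndRunExit.
#include <atomic>
#include <chrono>
#include <functional>
#include <memory>
#include <stdexcept>
#include <thread>

struct Batch {
    int id;
};

std::unique_ptr<Batch> WaitForBatch(const std::function<std::unique_ptr<Batch>()>& tryGetBatch,
                                    const std::atomic<bool>& isRunning,
                                    std::chrono::seconds timeout)
{
    auto deadline = std::chrono::steady_clock::now() + timeout;
    while (true) {
        if (auto batch = tryGetBatch()) {
            return batch;
        }
        std::this_thread::sleep_for(std::chrono::microseconds(100));
        if (std::chrono::steady_clock::now() > deadline) {
            // here the original logs "getting batch timeout" and keeps waiting
            deadline = std::chrono::steady_clock::now() + timeout;
        }
        if (!isRunning) {
            throw std::runtime_error("GetBatchData end run.");
        }
    }
}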
KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, All2All(sc, id, batch, keySendInfo, uniqueInfoOut.all2AllInfo); LOG_DEBUG(KEY_PROCESS "ProcessBatchWithFastUnique get batchId:{}, batchSize:{}," - " channel:{}, name:{}, restore:{}, keyCount:{}", - batch->batchId, batch->Size(), batch->channel, batch->name, - uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); + " channel:{}, name:{}, restore:{}, keyCount:{}", + batch->batchId, batch->Size(), batch->channel, batch->name, + uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); if (GlogConfig::gStatOn) { LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} " - "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), - uniqueOut.uniqueIdCnt); + "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueOut.uniqueIdCnt); } } -void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, - UniqueInfo& uniqueInfoOut, KeySendInfo& keySendInfo, - vector& sc, vector& splitSize) +void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, + KeySendInfo& keySendInfo, vector& sc, vector& splitSize) { std::shared_lock lock(g_smut); absl::flat_hash_map hotMap = hotKey[batch->name]; @@ -693,8 +659,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, TimeCost computeHotTc; ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); - UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, hotOffset, - batch->batchId % hotEmbUpdateStep == 0, batch->name); + UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, + hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); } if (rankInfo.useStatic) { @@ -707,9 +673,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, } } -void KeyProcess::ComputeHotPos(const unique_ptr& batch, - absl::flat_hash_map& hotMap, vector& hotPos, - vector& restore, const int hotOffset) const +void KeyProcess::ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, + vector &hotPos, vector &restore, const int hotOffset) const { emb_key_t* inputData = batch->sample.data(); size_t miniBs = batch->Size(); @@ -732,52 +697,48 @@ void KeyProcess::ComputeHotPos(const unique_ptr& batch, } } -void KeyProcess::All2All(vector& sc, int id, const unique_ptr& batch, - KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut) +void KeyProcess::All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, + All2AllInfo& all2AllInfoOut) { TimeCost getScAllTC; int channel = batch->channel; - GetScAllForUnique(sc, id, batch, - all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) + GetScAllForUnique(sc, id, batch, all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) LOG_DEBUG("GetScAll TimeCost(ms):{}", getScAllTC.ElapsedMS()); TimeCost all2allTC; - vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc(rankInfo.rankSize); // receive count + vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 + vector rc(rankInfo.rankSize); // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc[i] = all2AllInfoOut.scAll.at(i * rankInfo.rankSize + rankInfo.rankId); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive 
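// The exchange in All2All above is the standard variable-count MPI pattern:
// allgather the per-peer send counts, derive receive counts and displacements
// from the gathered matrix, then move the keys with one MPI_Alltoallv. A
// minimal sketch of that pattern, assuming MPI is initialized and all ranks
// call it collectively.
#include <mpi.h>

#include <cstdint>
#include <vector>

std::vector<int64_t> ExchangeKeys(const std::vector<std::vector<int64_t>>& buckets, MPI_Comm comm)
{
    int rankSize = 0;
    int rankId = 0;
    MPI_Comm_size(comm, &rankSize);
    MPI_Comm_rank(comm, &rankId);

    std::vector<int> sc(rankSize);  // send count per peer
    std::vector<int64_t> keySend;   // buckets flattened in peer order
    for (int i = 0; i < rankSize; ++i) {
        sc[i] = static_cast<int>(buckets[i].size());
        keySend.insert(keySend.end(), buckets[i].begin(), buckets[i].end());
    }

    std::vector<int> scAll(rankSize * rankSize);  // full traffic matrix
    MPI_Allgather(sc.data(), rankSize, MPI_INT, scAll.data(), rankSize, MPI_INT, comm);

    std::vector<int> ss(rankSize, 0), rc(rankSize, 0), rs(rankSize, 0);
    for (int i = 1; i < rankSize; ++i) {
        ss[i] = ss[i - 1] + sc[i - 1];         // send displacements
    }
    for (int i = 0; i < rankSize; ++i) {
        rc[i] = scAll[i * rankSize + rankId];  // my column = receive counts
    }
    for (int i = 1; i < rankSize; ++i) {
        rs[i] = rs[i - 1] + rc[i - 1];         // receive displacements
    }

    std::vector<int64_t> keyRecv(rs.back() + rc.back());
    MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T,
                  keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm);
    return keyRecv;
}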
displays/offset 接受数据的起始偏移量 all2AllInfoOut.keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") int retCode = MPI_Alltoallv(keySendInfo.keySend.data(), sc.data(), ss.data(), MPI_INT64_T, - all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, - comm[channel][id]); + all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), + MPI_INT64_T, comm[channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, - batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, batch->batchId); all2AllInfoOut.countRecv.resize(rs.back() + rc.back()); if (isWithFAAE) { retCode = MPI_Alltoallv(keySendInfo.keyCount.data(), sc.data(), ss.data(), MPI_UINT32_T, - all2AllInfoOut.countRecv.data(), rc.data(), rs.data(), MPI_UINT32_T, - comm[channel][id]); + all2AllInfoOut.countRecv.data(), rc.data(), + rs.data(), MPI_UINT32_T, comm[channel][id]); if (retCode != MPI_SUCCESS) { - LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", channel, id, - batch->batchId, retCode); + LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", + channel, id, batch->batchId, retCode); } } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC " - "TimeCost(ms):{}", + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC TimeCost(ms):{}", channel, id, batch->batchId, all2allTC.ElapsedMS()); EASY_END_BLOCK } auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, - vector& splitKeys) - -> tuple, vector> + vector& splitKeys) -> tuple, vector> { TimeCost processSplitKeysTC; EASY_FUNCTION(profiler::colors::Purple) @@ -785,47 +746,44 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, LOG_INFO(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, ProcessSplitKeys start.", batch->channel, id, batch->batchId); - // 使用静态all2all通信:发送或接受量为预置固定值 scInfo[batch->name] = 65536 / - // rankSize 经验值 - if (rankInfo.useStatic) { // maybe move after all2all - for (KeysT& i : splitKeys) { + // 使用静态all2all通信:发送或接受量为预置固定值 scInfo[batch->name] = 65536 / rankSize 经验值 + if (rankInfo.useStatic) { // maybe move after all2all + for (KeysT& i: splitKeys) { if (static_cast(i.size()) > embInfos[batch->name].sendCount) { - LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", batch->name, - batch->channel, batch->batchId, i.size()); + LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", + batch->name, batch->channel, batch->batchId, i.size()); throw runtime_error( StringFormat("%s[%d]:%d overflow! 
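// Under useStatic, ProcessSplitKeys above pads every bucket to the preset
// sendCount with the sentinel -1, so the communication plan (sc/ss/rc/rs)
// never changes between steps, and a bucket larger than sendCount is a hard
// error. A small sketch of that invariant.
#include <cstdint>
#include <stdexcept>
#include <vector>

void PadBucketsForStaticA2A(std::vector<std::vector<int64_t>>& buckets, int sendCount)
{
    for (auto& bucket : buckets) {
        if (static_cast<int>(bucket.size()) > sendCount) {
            throw std::runtime_error("bucket overflow: set send count bigger");
        }
        bucket.resize(sendCount, -1);  // -1 slots are ignored downstream
    }
}
// The trade-off: fixed-size exchanges spend bandwidth on padding, but counts
// and displacements can be computed once instead of per batch.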
set send count bigger than %d", - batch->name.c_str(), batch->channel, batch->batchId, i.size()) - .c_str()); + batch->name.c_str(), batch->channel, batch->batchId, i.size()).c_str()); } i.resize(embInfos[batch->name].sendCount, -1); } } KeysT keySend; - vector sc; // send count - for (const auto& i : splitKeys) { + vector sc; // send count + for (const auto& i: splitKeys) { sc.push_back(static_cast(i.size())); keySend.insert(keySend.cend(), i.cbegin(), i.cend()); } KeysT keyRecv; TimeCost getScAllTC; - vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread - // id的)线程间通信量矩阵 + vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 LOG_DEBUG("getScAllTC(ms)(AllReduce-AllGather):{}", getScAllTC.ElapsedMS()); vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") TimeCost uniqueAll2AllTC; - int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, keyRecv.data(), - rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, + keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -834,8 +792,8 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, EASY_END_BLOCK LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, batchName:{}, MPI_Alltoallv finish." 
" processSplitKeysTC(ms):{}", - batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); - return {keyRecv, scAll, ss}; + batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); + return { keyRecv, scAll, ss }; } /* @@ -843,16 +801,15 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, * splitKeys返回:将数据的key切分到其所在dev id对应的桶中,并去重。 * restore返回:去重后key在桶内偏移量(用于计算恢复向量) */ -tuple, vector> KeyProcess::HashSplit( - const unique_ptr& batch) const +tuple, vector> KeyProcess::HashSplit(const unique_ptr& batch) const { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -860,10 +817,9 @@ tuple, vector> KeyProcess::HashSplit( auto result = uKey.find(key); if (result == uKey.end()) { splitKeys[devId].push_back(key); - restore[i] = - hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) + restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) uKey[key] = restore[i]; - } else { // 去重 + } else { // 去重 restore[i] = result->second; } } @@ -876,11 +832,10 @@ tuple, vector> KeyProcess::HashSplit( for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} " - "unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return {splitKeys, restore}; + return { splitKeys, restore }; } void KeyProcess::PaddingAlltoallVC(vector& splitKeys) const @@ -902,10 +857,10 @@ tuple, vector, vector>> KeyProcess::Hash emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); - vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 + vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map> uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map> uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -913,11 +868,10 @@ tuple, vector, vector>> KeyProcess::Hash auto result = uKey.find(key); if (result == uKey.end()) { splitKeys[devId].push_back(key); - restore[i] = - hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) + restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) uKey[key].first = restore[i]; uKey[key].second = 1; - } else { // 去重 + } else { // 去重 restore[i] = result->second.first; uKey[key].second++; } @@ -943,22 +897,20 @@ tuple, vector, vector>> KeyProcess::Hash for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} " - "faae_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + 
LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} faae_unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return {splitKeys, restore, keyCount}; + return { splitKeys, restore, keyCount }; } -tuple, vector, vector> KeyProcess::HotHashSplit( - const unique_ptr& batch) +tuple, vector, vector> KeyProcess::HotHashSplit(const unique_ptr& batch) { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - absl::flat_hash_map uKey; // 用于去重查询 + absl::flat_hash_map uKey; // 用于去重查询 absl::flat_hash_map keyCountMapByEmbName; std::shared_lock lock(g_smut); auto hotMap = hotKey[batch->name]; @@ -967,31 +919,31 @@ tuple, vector, vector> KeyProcess::HotHashSplit( vector hotPosDev(hotEmbTotCount[batch->name]); int hotCount = 0; int hotOffset = hotEmbTotCount[batch->name]; - for (size_t i = 0; i < miniBs; i++) { // for mini batch + for (size_t i = 0; i < miniBs; i++) { // for mini batch const emb_key_t& key = batchData[i]; if (batch->batchId % hotEmbUpdateStep == 0) { keyCountMapByEmbName[key]++; } emb_key_t devId = abs(key % static_cast(rankInfo.rankSize)); auto result = uKey.find(key); - if (result != uKey.end()) { // // already in splitKeys + if (result != uKey.end()) { // // already in splitKeys restore[i] = result->second; continue; } // new key in current batch - splitKeys[devId].push_back(key); // push to bucket + splitKeys[devId].push_back(key); // push to bucket auto hot = hotMap.find(key); - if (hot != hotMap.end()) { // is hot key - if (hot->second == -1) { // is new hot key in this batch + if (hot != hotMap.end()) { // is hot key + if (hot->second == -1) { // is new hot key in this batch // pos in lookup vec (need add ss) for hot-gather hotPos[hotCount] = static_cast(splitKeys[devId].size()) - 1; - hotPosDev[hotCount] = devId; // which dev, for get ss + hotPosDev[hotCount] = devId; // which dev, for get ss hot->second = hotCount; - restore[i] = hotCount++; // get pos of hot emb + restore[i] = hotCount++; // get pos of hot emb } else { restore[i] = hot->second; } - } else { // is not hot key + } else { // is not hot key // restore记录去重后key在桶内偏移量(用于计算恢复向量) restore[i] = static_cast(splitKeys[devId].size() + (hotOffset - 1)); } @@ -1003,25 +955,22 @@ tuple, vector, vector> KeyProcess::HotHashSplit( for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} " - "hot_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} hot_unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], - batch->batchId % hotEmbUpdateStep == 0, batch->name); + UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], batch->batchId % hotEmbUpdateStep == 0, + batch->name); AddCountStartToHotPos(splitKeys, hotPos, hotPosDev, batch); - return {splitKeys, restore, hotPos}; + return { splitKeys, restore, hotPos }; } -void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, - const vector& hotPosDev, +void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, const unique_ptr& batch) { vector splitKeysSize; - for (auto& 
splitKey : splitKeys) { - int tmp = rankInfo.useStatic ? embInfos[batch->name].sendCount - : static_cast(splitKey.size()); + for (auto& splitKey: splitKeys) { + int tmp = rankInfo.useStatic ? embInfos[batch->name].sendCount : static_cast(splitKey.size()); splitKeysSize.push_back(tmp); } @@ -1031,13 +980,13 @@ void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& ho } } -void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, +void KeyProcess::UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, uint32_t count, bool refresh, const string& embName) { auto& hotMap = hotKey[embName]; if (refresh) { priority_queue> pq; - for (size_t i = 0; i < keySend.size(); ++i) { + for (size_t i = 0;i < keySend.size(); ++i) { if (keySend[i] == -1) { continue; } @@ -1056,15 +1005,15 @@ void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCountMapByEmbName, - uint32_t count, bool refresh, const string& embName) +void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, + const string& embName) { if (!refresh) { return; } auto& hotMap = hotKey[embName]; - priority_queue> pq; // top k key - for (auto& p : keyCountMapByEmbName) { + priority_queue> pq; // top k key + for (auto& p: keyCountMapByEmbName) { pq.push(pair(-p.second, p.first)); if (pq.size() > count) { pq.pop(); @@ -1080,46 +1029,43 @@ void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapBy } /* - * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread - * id的)线程间的通信量矩阵 + * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread id的)线程间的通信量矩阵 * scAll返回:所有线程间的通信量矩阵(按行平铺的一维向量) */ -vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, - const unique_ptr& batch) +vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch) { EASY_FUNCTION() vector scAll; scAll.resize(rankInfo.rankSize * rankInfo.rankSize); - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, - batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, batch->batchId); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAll.data(), - rankInfo.rankSize, MPI_INT, comm[batch->channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, + scAll.data(), rankInfo.rankSize, MPI_INT, + comm[batch->channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {} commId {}, MPI_Allgather failed:{}", rankInfo.rankId, commId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, " - "key scAll matrix:\n{}", + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", batch->channel, commId, batch->batchId, VectorToString(scAll)); return scAll; } -void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, - const unique_ptr& batch, vector& scAllOut) +void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, + vector &scAllOut) { EASY_FUNCTION() int channel = batch->channel; scAllOut.resize(rankInfo.rankSize * rankInfo.rankSize); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAllOut.data(), - rankInfo.rankSize, MPI_INT, comm[channel][commId]); + auto retCode = 
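// Both UpdateHotMap variants above keep the k most frequent keys with a
// bounded priority_queue: counts are pushed negated, so the queue's largest
// element is the *least* frequent survivor and is popped first once the size
// exceeds k. A minimal sketch of that selection step.
#include <cstdint>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>

std::vector<int64_t> TopKKeys(const std::unordered_map<int64_t, uint32_t>& freq, std::size_t k)
{
    // pair ordering compares .first, i.e. the negated count
    std::priority_queue<std::pair<int64_t, int64_t>> pq;
    for (const auto& p : freq) {
        pq.push({-static_cast<int64_t>(p.second), p.first});
        if (pq.size() > k) {
            pq.pop();  // evict the currently least frequent key
        }
    }
    std::vector<int64_t> hot;
    while (!pq.empty()) {
        hot.push_back(pq.top().second);
        pq.pop();
    }
    return hot;  // ascending frequency; order is irrelevant for a hot set
}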
MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, + scAllOut.data(), rankInfo.rankSize, MPI_INT, + comm[channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Allgather failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key " - "scAllOut matrix:\n{}", + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", channel, commId, batch->batchId, VectorToString(scAllOut)); } @@ -1127,9 +1073,9 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; + auto& maxOffsetTmp = maxOffset[embName]; auto& evictPos = evictPosMap[embName]; for (long& key : splitKey) { if (key == -1) { @@ -1142,9 +1088,8 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe size_t offset; // 新值, emb有pos可复用 offset = evictPos.back(); - LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse " - "offset [{}], evictSize [{}]!!!", - embName, key, offset, evictPos.size()); + LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse offset [{}], evictSize [{}]!!!", + embName, key, offset, evictPos.size()); key2Offset[key] = offset; key = offset; evictPos.pop_back(); @@ -1162,18 +1107,18 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe LOG_ERROR("dev cache overflow {} > {}", maxOffsetTmp, embInfos[embName].devVocabSize); throw std::runtime_error("dev cache overflow!"); } - LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", embName, maxOffsetTmp, - embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", + embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel) { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; - auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion + auto& maxOffsetTmp = maxOffset[embName]; + auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion for (long& key : splitKey) { if (key == -1) { key = 0; @@ -1196,8 +1141,8 @@ void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& spli key = 0; } } - LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", embName, maxOffsetTmp, - embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", + embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } /* @@ -1205,8 +1150,7 @@ void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& spli * 输入接收到emb块的偏移blockOffset,batch内每个key在块内的偏移restoreVec * 输出恢复向量restoreVec,即batch到keySend(平铺的splitKeys)的映射 * 实现方案2:用map记录keySend中key和表内index/offset的映射,在恢复emb时直接根据batch的key查询该map即可找到receive - * emb中的 位置,时间复杂度:O(map构建keySend.size + - * map查询),空间复杂度:O(map) + * emb中的 位置,时间复杂度:O(map构建keySend.size + map查询),空间复杂度:O(map) */ 
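// Key2Offset above rewrites each key in place into its embedding-table row:
// known keys map through key2Offset, new keys first reuse a slot freed by
// eviction (evictPos) and only then grow maxOffset, and exceeding devVocabSize
// is fatal. A compact sketch of that allocation policy (restructured into a
// standalone struct; the original works on per-table maps under a mutex).
#include <cstdint>
#include <stdexcept>
#include <unordered_map>
#include <vector>

struct OffsetAllocator {
    std::unordered_map<int64_t, std::size_t> key2Offset;
    std::vector<std::size_t> evictPos;  // row indices freed by eviction
    std::size_t maxOffset = 0;
    std::size_t devVocabSize = 0;

    std::size_t Lookup(int64_t key)
    {
        auto it = key2Offset.find(key);
        if (it != key2Offset.end()) {
            return it->second;          // existing key: stable row
        }
        std::size_t offset;
        if (!evictPos.empty()) {
            offset = evictPos.back();   // prefer recycled rows
            evictPos.pop_back();
        } else {
            offset = maxOffset++;       // otherwise grow the table tail
        }
        if (maxOffset > devVocabSize) {
            throw std::runtime_error("dev cache overflow");
        }
        key2Offset.emplace(key, offset);
        return offset;
    }
};
// Usage: with devVocabSize = 2, Lookup(7) -> 0, Lookup(9) -> 1, Lookup(7) -> 0,
// and a third distinct key throws.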
void KeyProcess::BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, vector& restoreVec, int hotPosSize) const @@ -1223,11 +1167,11 @@ void KeyProcess::BuildRestoreVec(const unique_ptr& batch, const vecto hotNum += 1; } } - LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", hotNum, batch->Size(), - buildRestoreVecTC.ElapsedMS()); + LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", + hotNum, batch->Size(), buildRestoreVecTC.ElapsedMS()); } -template +template T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, int channel) { std::lock_guard lockGuard(mut); @@ -1237,8 +1181,7 @@ T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, in } auto topBatch = get(list[embName][channel].top()); if (topBatch < batch) { - LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, - batch, channel); + LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, batch, channel); this_thread::sleep_for(1s); } if (topBatch != batch) { @@ -1258,8 +1201,7 @@ T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, in KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) { TimeCost tc = TimeCost(); - // 循环尝试获取list中的数据;如果key - // process线程退出或者处理数据超时,返回空vector + // 循环尝试获取list中的数据;如果key process线程退出或者处理数据超时,返回空vector while (true) { if (!isRunning) { return {}; @@ -1267,9 +1209,8 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, " - "exiting the loop! {}[{}]:{}", - embName, channel, batch); + LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! 
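// BuildRestoreVec, per the comment above it, turns per-key in-block offsets
// into positions in the received embedding block by adding each block's start
// offset. A hedged sketch of the cold-key composition only, assuming
// restore[i] arrives holding the in-bucket offset produced by HashSplit (the
// original also threads hot-cache positions through hotPosSize).
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <vector>

void ComposeRestore(const std::vector<int64_t>& batchKeys, const std::vector<int>& blockOffset,
                    int rankSize, std::vector<int>& restore)
{
    for (std::size_t i = 0; i < batchKeys.size(); ++i) {
        int devId = static_cast<int>(std::abs(batchKeys[i] % rankSize));
        restore[i] += blockOffset[devId];  // block start + in-bucket offset
    }
}
// Continuing the HashSplit example (buckets {{4}, {7, 9}}, blockOffset {0, 1}):
// restore {0, 0, 0, 1} becomes {0, 1, 0, 2}, i.e. indices into the
// concatenated lookup result.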
{}[{}]:{}", + embName, channel, batch); return {}; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1290,9 +1231,8 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) SendEos(batch, channel); return {}; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: " - "{}, readEmbKey batchId: {}.", - embName, channel, batch, readEmbKeyBatchId); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", + embName, channel, batch, readEmbKeyBatchId); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1316,28 +1256,22 @@ void KeyProcess::SendEos(int batchId, int channel) vector tensors; bool isNeedResend = true; - for (const auto& emb : - as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos " - "start.", - channel, batchId, emb.first); + for (const auto& emb: as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, batchId, emb.first); if (!isRunning) { throw EndRunExit("SendEos end run, isRunning is false after lock destroyMutex."); } for (const string& transName : usedChannelNames) { - string sendName = - StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); + string sendName = StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); size_t channelSize = 0; - + acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] Before send eos, {} contains {}.", sendName, channelSize); - SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, - isNeedResend); + SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, isNeedResend); acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] After send eos, {} contains {}.", sendName, channelSize); } - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, - batchId, emb.first); + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, batchId, emb.first); } LOG_INFO("channelId:{} batchId:{}, SendEos end.", channel, batchId); @@ -1351,8 +1285,7 @@ void KeyProcess::SendEos(int batchId, int channel) /// \param channel 通道索引(训练/推理) /// \param type 数据类型 /// \return -unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, - ProcessedInfo type) +unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type) { TimeCost tc = TimeCost(); info_list_t* list; @@ -1369,8 +1302,7 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa throw std::invalid_argument("Invalid ProcessedInfo Type."); } - // 循环尝试获取list中的数据;如果key - // process线程退出或者处理数据超时,返回空指针 + // 循环尝试获取list中的数据;如果key process线程退出或者处理数据超时,返回空指针 while (true) { if (!isRunning) { return nullptr; @@ -1378,9 +1310,8 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, " - "exiting the loop! {}[{}]:{}", - embName, channel, batch); + LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! 
{}[{}]:{}", + embName, channel, batch); return nullptr; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1397,18 +1328,15 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa return uTensor; } catch (EmptyList&) { unique_lock lockEosGuard(eosMutex); - // 避免eos在keyProcess还未处理完数据时插队到通道前面, - // readEmbKey真实的次数是readEmbedBatchId减1 - if (isNeedSendEos[channel] && - (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { + // 避免eos在keyProcess还未处理完数据时插队到通道前面, readEmbKey真实的次数是readEmbedBatchId减1 + if (isNeedSendEos[channel] && (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { LOG_INFO("channelId:{} batchId:{}, GetInfoVec eos.", channel, batch); unique_lock lockDestroyGuard(destroyMutex); SendEos(batch, channel); return nullptr; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: " - "{}, readEmbKey batchId: {}.", - embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", + embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1421,7 +1349,7 @@ void KeyProcess::SendA2A(const vector& a2aInfo, const string& embName, int { // 数据放到队列里,在mgmt里面发送(检查发送数据量) auto tensors = make_unique>(); - Tensor tmpTensor(tensorflow::DT_INT64, {rankInfo.rankSize, rankInfo.rankSize}); + Tensor tmpTensor(tensorflow::DT_INT64, { rankInfo.rankSize, rankInfo.rankSize }); auto tmpData = tmpTensor.matrix(); for (int i = 0; i < rankInfo.rankSize; ++i) { for (int j = 0; j < rankInfo.rankSize; ++j) { @@ -1441,14 +1369,13 @@ int KeyProcess::GetMaxStep(int channelId) const return rankInfo.ctrlSteps.at(channelId); } -void KeyProcess::EvictKeys(const string& embName, - const vector& keys) // hbm +void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm funEvictCall: [{}]! keySize:{}", embName, keys.size()); EmbeddingMgmt::Instance()->EvictKeys(embName, keys); } -void KeyProcess::EvictKeysCombine(const vector& keys) // hbm +void KeyProcess::EvictKeysCombine(const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm combine funEvictCall, keySize:{}", keys.size()); EmbeddingMgmt::Instance()->EvictKeysCombine(keys); @@ -1457,7 +1384,7 @@ void KeyProcess::EvictKeysCombine(const vector& keys) // hbm void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector& keys) { EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD size_t keySize = keys.size(); auto& devHashMap = keyOffsetMap.at(embName); @@ -1471,7 +1398,7 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vectorsecond; @@ -1479,26 +1406,24 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector offset) { if (offset.size() > embInfos[embName].devVocabSize) { - LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than " - "dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize); - throw runtime_error(Logger::Format("{} overflow! init evict dev, evictOffset size {} " - "bigger than dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize) - .c_str()); + LOG_ERROR("{} overflow! 
init evict dev, evictOffset size {} bigger than dev vocabSize {}", + embName, offset.size(), embInfos[embName].devVocabSize); + throw runtime_error( + Logger::Format("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", + embName, offset.size(), embInfos[embName].devVocabSize + ).c_str()); } vector tmpDataOut; Tensor tmpData = Vec2TensorI32(offset); tmpDataOut.emplace_back(tmpData); - tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, {1})); + tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); auto evictLen = tmpDataOut.back().flat(); int evictSize = static_cast(offset.size()); @@ -1508,16 +1433,15 @@ void KeyProcess::EvictInitDeviceEmb(const string& embName, vector offset auto trans = Singleton::GetInstance(); trans->Send(TransferChannel::EVICT, tmpDataOut, TRAIN_CHANNEL_ID, embName); - LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! send offsetSize:{}", embName, - offset.size()); + LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! send offsetSize:{}", embName, offset.size()); } -string KeyProcess::DumpSplitKeys(vector>& splitKeys) const +string KeyProcess::DumpSplitKeys(vector> &splitKeys) const { stringstream ssTrace; for (int devId = 0; devId < rankInfo.rankSize; ++devId) { ssTrace << '|' << devId << ":"; - for (auto key : splitKeys[devId]) { + for (auto key: splitKeys[devId]) { ssTrace << key << ','; } ssTrace << '|'; @@ -1556,8 +1480,7 @@ void KeyProcess::RecordKeyCountMap(const unique_ptr& batch) void KeyProcess::SetEos(int status, int channelId) { unique_lock lockGuard(eosMutex); - LOG_INFO("isNeedSendEos status is changed, before status:[{}], input " - "status:{}, channel:[{}], ", + LOG_INFO("isNeedSendEos status is changed, before status:[{}], input status:{}, channel:[{}], ", isNeedSendEos[channelId], status, channelId); isNeedSendEos[channelId] = (status == 1); } diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h index d6a0b80b..8bd7b8d0 100644 --- a/src/core/key_process/key_process.h +++ b/src/core/key_process/key_process.h @@ -16,291 +16,283 @@ See the License for the specific language governing permissions and #ifndef MX_REC_KEY_PROCESS_H #define MX_REC_KEY_PROCESS_H -#include -#include - +#include #include #include -#include #include #include -#include +#include +#include +#include +#include "ock_ctr_common/include/factory.h" + +#include "utils/common.h" #include "emb_table/emb_table.h" #include "feature_admit_and_evict.h" #include "hybrid_mgmt/hybrid_mgmt_block.h" -#include "ock_ctr_common/include/factory.h" -#include "utils/common.h" #include "utils/singleton.h" namespace MxRec { -using namespace std; + using namespace std; -template -struct Cmp { - bool operator()(const T& a, const T& b) const - { - return get(a) > get(b); // batch id order - } -}; + template + struct Cmp { + bool operator()(const T& a, const T& b) const + { + return get(a) > get(b); // batch id order + } + }; -template -using heap_t = priority_queue, Cmp>; + template + using heap_t = priority_queue, Cmp>; -template -using info_list_t = map, MAX_QUEUE_NUM>>; + template + using info_list_t = map, MAX_QUEUE_NUM>>; -enum class ProcessedInfo { - RESTORE, - ALL2ALL, - INVALID -}; + enum class ProcessedInfo { + RESTORE, + ALL2ALL, + INVALID + }; -class EndRunExit : public std::exception { -public: - explicit EndRunExit(const char* message) : errorMessage(message) {} + class EndRunExit : public std::exception { + public: + explicit EndRunExit(const char* message) : errorMessage(message) {} - const char* what() const noexcept 
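// The Cmp functor declared in this header inverts priority_queue's default
// max-heap into a min-heap on the tuple's batch-id field, so consumers always
// pop the oldest pending batch first. A stand-alone sketch of the same idiom.
#include <queue>
#include <string>
#include <tuple>
#include <vector>

using Info = std::tuple<int, std::string>;  // (batchId, payload)

struct CmpByBatchId {
    bool operator()(const Info& a, const Info& b) const
    {
        return std::get<0>(a) > std::get<0>(b);  // ">" turns the heap into a min-heap
    }
};

using InfoHeap = std::priority_queue<Info, std::vector<Info>, CmpByBatchId>;

// InfoHeap h; h.push({7, "late"}); h.push({3, "early"});
// h.top() is {3, "early"}: the smallest batch id surfaces first.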
override - { - return errorMessage; - } + const char* what() const noexcept override + { + return errorMessage; + } -private: - const char* errorMessage; -}; + private: + const char* errorMessage; + }; -constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 -constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 + constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 + constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 -class EmptyList : public std::exception {}; + class EmptyList : public std::exception { + }; -class WrongListTop : public std::exception {}; + class WrongListTop : public std::exception { + }; -class KeyProcess { -public: - bool Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues = {}, int seed = 0); + class KeyProcess { + public: + bool Initialize(const RankInfo& rInfo, const vector& eInfos, + const vector& thresholdValues = {}, int seed = 0); - unique_ptr> GetInfoVec(int batch, const string& embName, int channel, - ProcessedInfo type); + unique_ptr> GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type); - KeysT GetLookupKeys(int batch, const string& embName, int channel); + KeysT GetLookupKeys(int batch, const string& embName, int channel); - int GetMaxStep(int channelId) const; + int GetMaxStep(int channelId) const; - OffsetMemT GetMaxOffset(); + OffsetMemT GetMaxOffset(); - KeyOffsetMemT GetKeyOffsetMap(); + KeyOffsetMemT GetKeyOffsetMap(); - KeyCountMemT GetKeyCountMap(); + KeyCountMemT GetKeyCountMap(); - FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); + FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); - void LoadMaxOffset(OffsetMemT& loadData); + void LoadMaxOffset(OffsetMemT& loadData); - void LoadKeyOffsetMap(KeyOffsetMemT& loadData); + void LoadKeyOffsetMap(KeyOffsetMemT& loadData); - void LoadKeyCountMap(KeyCountMemT& loadData); + void LoadKeyCountMap(KeyCountMemT& loadData); - void Destroy(); + void Destroy(); - void LoadSaveLock(); + void LoadSaveLock(); - void LoadSaveUnlock(); + void LoadSaveUnlock(); - void EvictKeys(const string& embName, const vector& keys); + void EvictKeys(const string& embName, const vector& keys); - void EvictKeysCombine(const vector& keys); + void EvictKeysCombine(const vector& keys); - void SetupHotEmbUpdateStep(); + void SetupHotEmbUpdateStep(); - int64_t GetExpansionTableSize(const string& embName); + int64_t GetExpansionTableSize(const string& embName); - int64_t GetExpansionTableCapacity(const string& embName); + int64_t GetExpansionTableCapacity(const string& embName); - void RecordKeyCountMap(const unique_ptr& batch); + void RecordKeyCountMap(const unique_ptr& batch); - template - void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) - { - absl::flat_hash_map umap; - restoreVecSec.resize(lookupKeys.size(), -1); - int32_t length = 0; + template + void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) + { + absl::flat_hash_map umap; + restoreVecSec.resize(lookupKeys.size(), -1); + int32_t length = 0; - for (size_t i = 0; i < lookupKeys.size(); ++i) { - int64_t key = lookupKeys[i]; - if (rankInfo.useStatic && ((!rankInfo.useDynamicExpansion && key == -1) || - (rankInfo.useDynamicExpansion && key == 0))) { - continue; - } + for (size_t i = 0; i < lookupKeys.size(); ++i) { + int64_t key = lookupKeys[i]; + if (rankInfo.useStatic && ( + (!rankInfo.useDynamicExpansion && key == -1) || (rankInfo.useDynamicExpansion && key == 0))) { + continue; + } - auto result = umap.find(key); - if (result == umap.end()) { - 
uniqueKeys.push_back(lookupKeys[i]); - umap[key] = length; - restoreVecSec[i] = length; - length++; - } else { - restoreVecSec[i] = result->second; + auto result = umap.find(key); + if (result == umap.end()) { + uniqueKeys.push_back(lookupKeys[i]); + umap[key] = length; + restoreVecSec[i] = length; + length++; + } else { + restoreVecSec[i] = result->second; + } } - } - if (rankInfo.useStatic) { - if (rankInfo.useDynamicExpansion) { - uniqueKeys.resize(lookupKeys.size(), 0); - } else { - uniqueKeys.resize(lookupKeys.size(), -1); + if (rankInfo.useStatic) { + if (rankInfo.useDynamicExpansion) { + uniqueKeys.resize(lookupKeys.size(), 0); + } else { + uniqueKeys.resize(lookupKeys.size(), -1); + } } } - } - - void SetEos(int status, int channelId); - void SendEos(int batchId, int channel); + void SetEos(int status, int channelId); - bool isRunning{false}; + void SendEos(int batchId, int channel); - std::mutex destroyMutex; - std::mutex eosMutex; - inline bool HasEmbName(const string& embName) - { - return embInfos.find(embName) != embInfos.end(); - }; - GTEST_PRIVATE : + bool isRunning { false }; - int - Start(); + std::mutex destroyMutex; + std::mutex eosMutex; + inline bool HasEmbName(const string& embName) + { + return embInfos.find(embName) != embInfos.end(); + }; + GTEST_PRIVATE: - template - T GetInfo(info_list_t& list, int batch, const string& embName, int channel); + int Start(); - RankInfo rankInfo; - map embInfos; - MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; - std::mutex mut{}; - vector> procThreads{}; - std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]{}; - info_list_t lookupKeysList; - list>> storage; - info_list_t infoList; - info_list_t all2AllList; - map maxOffset{}; - map> keyOffsetMap{}; - map> keyCountMap{}; - FeatureAdmitAndEvict m_featureAdmitAndEvict{}; - map> evictPosMap{}; - map> hotKey{}; - map hotEmbTotCount; - map embeddingTableMap{}; - ock::ctr::FactoryPtr factory{}; - int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; - bool isWithFAAE; - bool isNeedSendEos[2] = {0, 0}; // 分别代表通道0、1的eos状态 + template + T GetInfo(info_list_t& list, int batch, const string& embName, int channel); - void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); + RankInfo rankInfo; + map embInfos; + MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; + std::mutex mut {}; + vector> procThreads {}; + std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD] {}; + info_list_t lookupKeysList; + list>> storage; + info_list_t infoList; + info_list_t all2AllList; + map maxOffset {}; + map> keyOffsetMap {}; + map> keyCountMap {}; + FeatureAdmitAndEvict m_featureAdmitAndEvict {}; + map> evictPosMap {}; + map> hotKey {}; + map hotEmbTotCount; + map embeddingTableMap {}; + ock::ctr::FactoryPtr factory {}; + int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; + bool isWithFAAE; + bool isNeedSendEos[2] = { 0, 0 }; // 分别代表通道0、1的eos状态 - void KeyProcessTask(int channel, int threadId); + void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); - void KeyProcessTaskWithFastUnique(int channel, int threadId); + void KeyProcessTask(int channel, int threadId); - bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); + void KeyProcessTaskWithFastUnique(int channel, int threadId); - bool KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, - ock::ctr::UniquePtr& unique, int channel, int threadId); + bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); - tuple, vector> ProcessSplitKeys(const unique_ptr& batch, - int 
id, vector& splitKeys); + bool KeyProcessTaskHelperWithFastUnique(unique_ptr &batch, ock::ctr::UniquePtr& unique, + int channel, int threadId); - void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); + tuple, vector> ProcessSplitKeys(const unique_ptr& batch, + int id, vector& splitKeys); - void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, - bool& uniqueInitialize, const unique_ptr& batch, - ock::ctr::UniquePtr& unique); + void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); - void ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, - int id, UniqueInfo& uniqueInfoOut); + void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, + const unique_ptr & batch, ock::ctr::UniquePtr& unique); - size_t GetKeySize(const unique_ptr& batch); + void ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniquePtr& unique, + int id, UniqueInfo& uniqueInfoOut); - void All2All(vector& sc, int id, const unique_ptr& batch, - KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut); + size_t GetKeySize(const unique_ptr &batch); - auto HashSplit(const unique_ptr& batch) const - -> tuple, vector>; + void All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, + All2AllInfo& all2AllInfoOut); - auto HotHashSplit(const unique_ptr& batch) - -> tuple, vector, vector>; + auto HashSplit(const unique_ptr& batch) const -> tuple, vector>; - void PaddingAlltoallVC(vector& splitKeys) const; + auto HotHashSplit(const unique_ptr& batch) -> tuple, vector, vector>; - tuple, vector, vector>> HashSplitWithFAAE( - const unique_ptr& batch) const; + void PaddingAlltoallVC(vector& splitKeys) const; - vector GetScAll(const vector& keyScLocal, int commId, - const unique_ptr& batch); + tuple, vector, vector>> + HashSplitWithFAAE(const unique_ptr& batch) const; - void GetScAllForUnique(const vector& keyScLocal, int commId, - const unique_ptr& batch, vector& scAllOut); + vector GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch); - void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); + void GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, + vector &scAllOut); - void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); + void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); - unique_ptr GetBatchData(int channel, int commId) const; + void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); - void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, - vector& restoreVec, int hotPosSize = 0) const; + unique_ptr GetBatchData(int channel, int commId) const; - void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); + void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, + vector& restoreVec, int hotPosSize = 0) const; + + void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); - void EvictDeleteDeviceEmb(const string& embName, const vector& keys); + void EvictDeleteDeviceEmb(const string& embName, const vector& keys); - void EvictInitDeviceEmb(const string& embName, vector offset); + void EvictInitDeviceEmb(const string& embName, vector offset); - void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, - bool refresh, const string& embName); + void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, + const string& 
embName); - void UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, - uint32_t count, bool refresh, const string& embName); + void UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, + uint32_t count, bool refresh, const string& embName); - void HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, - KeySendInfo& keySendInfo, vector& sc, vector& splitSize); + void HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, + KeySendInfo& keySendInfo, vector& sc, vector& splitSize); - void PushResult(unique_ptr& batch, unique_ptr> tensors, - KeysT& lookupKeys); + void PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys); - void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, - int channel); + void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel); - void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, - const vector& hotPosDev, const unique_ptr& batch); + void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, + const unique_ptr& batch); - void ComputeHotPos(const unique_ptr& batch, - absl::flat_hash_map& hotMap, vector& hotPos, - vector& restore, const int hotOffset) const; + void ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, + vector &hotPos, vector &restore, const int hotOffset) const; - vector GetCountRecv(const unique_ptr& batch, int id, - vector>& keyCount, vector scAll, - vector ss); + vector GetCountRecv(const unique_ptr& batch, int id, + vector>& keyCount, vector scAll, vector ss); - void HashSplitHelper(const unique_ptr& batch, vector& splitKeys, - vector& restore, vector& hotPos, - vector>& keyCount); + void HashSplitHelper(const unique_ptr & batch, vector & splitKeys, + vector & restore, vector & hotPos, + vector >& keyCount); - template - inline vector Count2Start(const vector& count) const - { - vector start = {0}; - for (size_t i = 0; i < count.size() - 1; ++i) { - start.push_back(count[i] + start.back()); + template + inline vector Count2Start(const vector& count) const + { + vector start = { 0 }; + for (size_t i = 0; i < count.size() - 1; ++i) { + start.push_back(count[i] + start.back()); + } + return start; } - return start; - } - string DumpSplitKeys(vector>& splitKeys) const; -}; + string DumpSplitKeys(vector>& splitKeys) const; + }; #define KEY_PROCESS_INSTANCE Singleton::GetInstance() -} // end namespace MxRec +} // end namespace MxRec -#endif // MX_REC_KEY_PROCESS_H +#endif // MX_REC_KEY_PROCESS_H -- Gitee From f3db56ec0161daa8159ba38d0bdf7949d81ba993 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 22 Apr 2024 14:57:24 +0800 Subject: [PATCH 052/302] =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E5=8C=96cpp=E6=BA=90?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/key_process/key_process.cpp | 520 +++++++++++++++------------ src/core/key_process/key_process.h | 382 ++++++++++---------- 2 files changed, 489 insertions(+), 413 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 98df97ed..85b17bbb 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -15,19 +15,21 @@ See the License for the specific language governing permissions and #include "key_process.h" +#include + #include #include -#include + +#include "emb_table/embedding_mgmt.h" +#include "hd_transfer/hd_transfer.h" +#include 
"host_emb/host_emb.h" +#include "ock_ctr_common/include/error_code.h" #include "utils/common.h" +#include "utils/config.h" #include "utils/logger.h" #include "utils/safe_queue.h" #include "utils/singleton.h" #include "utils/time_cost.h" -#include "utils/config.h" -#include "host_emb/host_emb.h" -#include "emb_table/embedding_mgmt.h" -#include "hd_transfer/hd_transfer.h" -#include "ock_ctr_common/include/error_code.h" using namespace std; using namespace chrono; @@ -41,15 +43,14 @@ void KeyProcess::SetupHotEmbUpdateStep() } bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues, - int seed) + const vector& thresholdValues, int seed) { this->rankInfo = rInfo; - + SetupHotEmbUpdateStep(); - + map scInfo; - for (const auto& info: eInfos) { + for (const auto& info : eInfos) { embInfos[info.name] = info; scInfo[info.name] = info.sendCount; InitHotEmbTotCount(info, rInfo); @@ -63,8 +64,8 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos LOG_INFO(KEY_PROCESS "hot emb count info:{}", MapToString(hotEmbTotCount)); MPI_Group worldGroup; MPI_Comm_group(MPI_COMM_WORLD, &worldGroup); - for (auto& i: comm) { - for (auto& j: i) { + for (auto& i : comm) { + for (auto& j : i) { MPI_Comm_create(MPI_COMM_WORLD, worldGroup, &j); } } @@ -82,12 +83,13 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos if (GlobalEnv::fastUnique) { int result = ock::ctr::Factory::Create(factory); if (result != 0) { - throw runtime_error(Logger::Format("create fast factory failed, error code:{}", result)); + throw runtime_error( + Logger::Format("create fast factory failed, error code:{}", result)); } } LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}", - MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); + MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); #ifndef GTEST Start(); #endif @@ -101,7 +103,7 @@ int KeyProcess::Start() // 0 1 2 3 4 5 0 1 2 3 4 5 // | rank0 | | rank1 | // each rank creates KEY_PROCESS_THREAD threads, each thread process one batchdata - LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 + LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 auto fn = [this](int channel, int threadId) { #ifndef GTEST auto ret = aclrtSetDevice(static_cast(rankInfo.deviceId)); @@ -115,7 +117,7 @@ int KeyProcess::Start() } else { KeyProcessTask(channel, threadId); } - }; // for clean code + }; // for clean code int threadNum = GetThreadNumEnv(); for (int channel = 0; channel < MAX_CHANNEL_NUM; ++channel) { LOG_INFO(KEY_PROCESS "key process thread num: {}", threadNum); @@ -133,8 +135,9 @@ void KeyProcess::InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo) if (rankInfo.useDynamicExpansion) { embeddingSize = info.embeddingSize; } - hotEmbTotCount[info.name] = static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * - HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); + hotEmbTotCount[info.name] = + static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * + HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); } OffsetMemT KeyProcess::GetMaxOffset() @@ -179,7 +182,7 @@ void KeyProcess::Destroy() { isRunning = false; LOG_INFO(KEY_PROCESS "rankId:{} KeyProcess begin destroy.", rankInfo.rankId); - for (auto& i: procThreads) { + for (auto& i : procThreads) { i->join(); } procThreads.clear(); @@ -189,8 +192,8 @@ void KeyProcess::Destroy() /// 每个数据通道的所有数据处理线程上锁 void KeyProcess::LoadSaveLock() { 
- for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].lock(); } } @@ -199,8 +202,8 @@ void KeyProcess::LoadSaveLock() /// 每个数据通道的所有数据处理线程释放锁 void KeyProcess::LoadSaveUnlock() { - for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].unlock(); } } @@ -226,8 +229,9 @@ void KeyProcess::GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf) uniqueConf.maxThreadNum = GlobalEnv::maxUniqueThreadNum; } -void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, - const unique_ptr & batch, ock::ctr::UniquePtr& unique) +void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, + bool& uniqueInitialize, const unique_ptr& batch, + ock::ctr::UniquePtr& unique) { uniqueConf.desiredSize = static_cast(batch->Size()); if (preBatchSize != batch->Size()) { @@ -269,7 +273,8 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = + GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -281,8 +286,9 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) if (!KeyProcessTaskHelperWithFastUnique(batch, unique, channel, threadId)) { break; } - LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process with fast unique cost:{}," - " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", + LOG_INFO(KEY_PROCESS + "getAndProcessTC(ms):{}, key process with fast unique cost:{}," + " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); @@ -290,14 +296,13 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) batchQueue->PutDirty(move(batch)); } unique->UnInitialize(); - } catch (const EndRunExit &e) { + } catch (const EndRunExit& e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } LOG_INFO(KEY_PROCESS "KeyProcessTaskWithFastUnique exit. 
rank:{} channelId:{}, threadId:{}", - rankInfo.rankId, channel, threadId); + rankInfo.rankId, channel, threadId); } - void KeyProcess::KeyProcessTask(int channel, int threadId) { unique_ptr batch; @@ -305,7 +310,8 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = + GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -316,40 +322,43 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) if (!KeyProcessTaskHelper(batch, channel, threadId)) { break; } - LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process cost:{}," - " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", + LOG_INFO(KEY_PROCESS + "getAndProcessTC(ms):{}, key process cost:{}," + " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); auto batchQueue = SingletonQueue::GetInstances(queueIndex); batchQueue->PutDirty(move(batch)); } - } catch (const EndRunExit &e) { + } catch (const EndRunExit& e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } - LOG_INFO(KEY_PROCESS "KeyProcessTask exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, channel, threadId); + LOG_INFO(KEY_PROCESS "KeyProcessTask exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, + channel, threadId); } -void KeyProcess::HashSplitHelper(const unique_ptr & batch, vector & splitKeys, - vector & restore, vector & hotPos, - vector >& keyCount) +void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& splitKeys, + vector& restore, vector& hotPos, + vector>& keyCount) { TimeCost uniqueTc; if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { - tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 } else { - tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 } LOG_DEBUG("uniqueTc(ms):{}", uniqueTc.ElapsedMS()); } -bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, ock::ctr::UniquePtr& unique, - int channel, int threadId) +bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, + ock::ctr::UniquePtr& unique, int channel, + int threadId) { // tuple for keyRec restore hotPos scAll countRecv isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; TimeCost totalTimeCost = TimeCost(); TimeCost fastUniqueTC; UniqueInfo uniqueInfo; @@ -358,11 +367,11 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch // 特征准入&淘汰 if (isWithFAAE && - (m_featureAdmitAndEvict.FeatureAdmit( - channel, batch, uniqueInfo.all2AllInfo.keyRecv, uniqueInfo.all2AllInfo.countRecv) == - FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, 
uniqueInfo.all2AllInfo.keyRecv, + uniqueInfo.all2AllInfo.countRecv) == + FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", - rankInfo.rankId, threadId, channel); + rankInfo.rankId, threadId, channel); return false; } std::lock_guard lock(loadSaveMut[channel][threadId]); @@ -376,25 +385,29 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch LOG_DEBUG("key2OffsetTC(ms):{}", key2OffsetTC.ElapsedMS()); } // Static all2all,need send count - if (!rankInfo.useStatic) { SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); } + if (!rankInfo.useStatic) { + SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); + } auto tensors = make_unique>(); tensors->push_back(Vec2TensorI32(uniqueInfo.restore)); uniqueInfo.hotPos.resize(hotEmbTotCount[batch->name], -1); tensors->push_back(Vec2TensorI32(uniqueInfo.hotPos)); - + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) : - Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); + tensors->push_back(rankInfo.useDynamicExpansion + ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) + : Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); } TimeCost pushResultTC; PushResult(batch, move(tensors), uniqueInfo.all2AllInfo.keyRecv); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", - channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO + "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", + channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); return true; @@ -422,8 +435,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, // 特征准入&淘汰 if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE && - (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, - countRecv) == FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) == + FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, threadId, channel); return false; @@ -436,7 +449,9 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, } // Static all2all,need send count - if (!rankInfo.useStatic) { SendA2A(scAll, batch->name, batch->channel, batch->batchId); } + if (!rankInfo.useStatic) { + SendA2A(scAll, batch->name, batch->channel, batch->batchId); + } TimeCost pushResultTC; auto tensors = make_unique>(); @@ -444,24 +459,27 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, hotPos.resize(hotEmbTotCount[batch->name], 0); tensors->push_back(Vec2TensorI32(hotPos)); - + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(lookupKeys) : Vec2TensorI32(lookupKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? 
Vec2TensorI64(lookupKeys) + : Vec2TensorI32(lookupKeys)); } PushResult(batch, move(tensors), lookupKeys); LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", - channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", channel, + batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } return true; } -void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) +void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, + KeysT& lookupKeys, int channel) { - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && + if (GlobalEnv::applyGradientsStrategy == + ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; @@ -470,36 +488,39 @@ void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tenso GlobalUnique(lookupKeys, uniqueKeys, restoreVecSec); LOG_DEBUG("globalUniqueSyncTC(ms):{}", globalUniqueSyncTC.ElapsedMS()); tensors->push_back(Vec2TensorI32(restoreVecSec)); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) : Vec2TensorI32(uniqueKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) + : Vec2TensorI32(uniqueKeys)); } } vector KeyProcess::GetCountRecv(const unique_ptr& batch, int id, - vector>& keyCount, vector scAll, vector ss) + vector>& keyCount, vector scAll, + vector ss) { TimeCost getCountRecvTC; if (rankInfo.useStatic) { - for (auto& cnt: keyCount) { + for (auto& cnt : keyCount) { cnt.resize(embInfos[batch->name].sendCount, 0); } } vector countSend; - for (auto& cnt: keyCount) { + for (auto& cnt : keyCount) { countSend.insert(countSend.cend(), cnt.cbegin(), cnt.cend()); } vector sc; for (int i = 0; i < rankInfo.rankSize; ++i) { sc.push_back(scAll.at(rankInfo.rankSize * rankInfo.rankId + i)); } - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 vector countRecv; countRecv.resize(rs.back() + rc.back()); - int retCode = MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), - rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); + int retCode = + MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), + rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -512,9 +533,11 @@ void KeyProcess::PushResult(unique_ptr& batch, unique_ptr lockGuard(mut); storage.push_front(move(tensors)); - infoList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, storage.begin())); + infoList[batch->name][batch->channel].push( + make_tuple(batch->batchId, batch->name, storage.begin())); if (rankInfo.isDDR) { - lookupKeysList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, move(lookupKeys))); + lookupKeysList[batch->name][batch->channel].push( + make_tuple(batch->batchId, batch->name, move(lookupKeys))); } lockGuard.unlock(); } @@ -542,32 +565,38 @@ 
unique_ptr KeyProcess::GetBatchData(int channel, int commId) const this_thread::sleep_for(100us); if (tc.ElapsedSec() > GET_BATCH_TIMEOUT) { if (commId == 0) { - LOG_WARN(KEY_PROCESS "getting batch timeout! 1. check last 'read batch cost' print. " - "channel[{}] commId[{}]", channel, commId); + LOG_WARN(KEY_PROCESS + "getting batch timeout! 1. check last 'read batch cost' print. " + "channel[{}] commId[{}]", + channel, commId); } this_thread::sleep_for(seconds(1)); tc = TimeCost(); } if (!isRunning) { - LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, commId); + LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, + commId); throw EndRunExit("GetBatchData end run."); } } EASY_END_BLOCK - LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, get batch data done, batchName:{}. bs:{} sample:[{}]", - batch->channel, commId, batch->batchId, batch->name, batch->Size(), batch->UnParse()); + LOG_DEBUG( + KEY_PROCESS + "channelId:{} threadId:{} batchId:{}, get batch data done, batchName:{}. bs:{} sample:[{}]", + batch->channel, commId, batch->batchId, batch->name, batch->Size(), batch->UnParse()); #if defined(PROFILING) && defined(BUILD_WITH_EASY_PROFILER) if (batch->batchId == PROFILING_START_BATCH_ID) { EASY_PROFILER_ENABLE } else if (batch->batchId == PROFILING_END_BATCH_ID) { - ::profiler::dumpBlocksToFile(StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); + ::profiler::dumpBlocksToFile( + StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); } #endif return batch; } -size_t KeyProcess::GetKeySize(const unique_ptr &batch) +size_t KeyProcess::GetKeySize(const unique_ptr& batch) { size_t size = rankInfo.rankSize * embInfos[batch->name].sendCount; if (!rankInfo.useStatic) { @@ -576,8 +605,9 @@ size_t KeyProcess::GetKeySize(const unique_ptr &batch) return size; } -void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniquePtr& unique, - int id, UniqueInfo& uniqueInfoOut) +void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, + ock::ctr::UniquePtr& unique, int id, + UniqueInfo& uniqueInfoOut) { EASY_FUNCTION(profiler::colors::Purple) EASY_VALUE("batchId", batch->batchId) @@ -596,10 +626,10 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniqueIn uniqueIn; uniqueIn.inputIdCnt = static_cast(batch->Size()); - uniqueIn.inputId = reinterpret_cast(batch->sample.data()); + uniqueIn.inputId = reinterpret_cast(batch->sample.data()); ock::ctr::EnhancedUniqueOut uniqueOut; - uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); + uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); uniqueOut.index = reinterpret_cast(uniqueInfoOut.restore.data()); if (rankInfo.useStatic) { uniqueOut.idCnt = idCount.data(); @@ -608,7 +638,7 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, uniqueOut.idCnt = keySendInfo.keyCount.data(); } uniqueOut.uniqueIdCntInBucket = splitSize.data(); - uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); + uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); uniqueOut.uniqueIdCnt = 0; int ret = unique->DoEnhancedUnique(uniqueIn, uniqueOut); @@ -624,19 +654,21 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, All2All(sc, id, batch, keySendInfo, uniqueInfoOut.all2AllInfo); LOG_DEBUG(KEY_PROCESS "ProcessBatchWithFastUnique get batchId:{}, batchSize:{}," - " channel:{}, name:{}, restore:{}, keyCount:{}", - 
batch->batchId, batch->Size(), batch->channel, batch->name, - uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); + " channel:{}, name:{}, restore:{}, keyCount:{}", + batch->batchId, batch->Size(), batch->channel, batch->name, + uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); if (GlogConfig::gStatOn) { LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} " - "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueOut.uniqueIdCnt); + "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), + uniqueOut.uniqueIdCnt); } } -void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, - KeySendInfo& keySendInfo, vector& sc, vector& splitSize) +void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, + UniqueInfo& uniqueInfoOut, KeySendInfo& keySendInfo, + vector& sc, vector& splitSize) { std::shared_lock lock(g_smut); absl::flat_hash_map hotMap = hotKey[batch->name]; @@ -649,8 +681,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, Uniqu TimeCost computeHotTc; ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); - UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, - hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); + UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, hotOffset, + batch->batchId % hotEmbUpdateStep == 0, batch->name); if (rankInfo.useStatic) { sc.resize(rankInfo.rankSize, embInfos[batch->name].sendCount); @@ -662,8 +694,9 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, Uniqu } } -void KeyProcess::ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, - vector &hotPos, vector &restore, const int hotOffset) const +void KeyProcess::ComputeHotPos(const unique_ptr& batch, + absl::flat_hash_map& hotMap, vector& hotPos, + vector& restore, const int hotOffset) const { emb_key_t* inputData = batch->sample.data(); size_t miniBs = batch->Size(); @@ -686,39 +719,41 @@ void KeyProcess::ComputeHotPos(const unique_ptr &batch, absl::flat_ha } } -void KeyProcess::All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, - All2AllInfo& all2AllInfoOut) +void KeyProcess::All2All(vector& sc, int id, const unique_ptr& batch, + KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut) { TimeCost getScAllTC; int channel = batch->channel; - GetScAllForUnique(sc, id, batch, all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) + GetScAllForUnique(sc, id, batch, + all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) LOG_DEBUG("GetScAll TimeCost(ms):{}", getScAllTC.ElapsedMS()); TimeCost all2allTC; - vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc(rankInfo.rankSize); // receive count + vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 + vector rc(rankInfo.rankSize); // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc[i] = all2AllInfoOut.scAll.at(i * rankInfo.rankSize + rankInfo.rankId); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 all2AllInfoOut.keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") int retCode = MPI_Alltoallv(keySendInfo.keySend.data(), sc.data(), 
ss.data(), MPI_INT64_T, - all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), - MPI_INT64_T, comm[channel][id]); + all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, + comm[channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, + batch->batchId); all2AllInfoOut.countRecv.resize(rs.back() + rc.back()); if (isWithFAAE) { retCode = MPI_Alltoallv(keySendInfo.keyCount.data(), sc.data(), ss.data(), MPI_UINT32_T, - all2AllInfoOut.countRecv.data(), rc.data(), - rs.data(), MPI_UINT32_T, comm[channel][id]); + all2AllInfoOut.countRecv.data(), rc.data(), rs.data(), MPI_UINT32_T, + comm[channel][id]); if (retCode != MPI_SUCCESS) { - LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", - channel, id, batch->batchId, retCode); + LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", channel, id, + batch->batchId, retCode); } } LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC TimeCost(ms):{}", @@ -727,7 +762,8 @@ void KeyProcess::All2All(vector& sc, int id, const unique_ptr &b } auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, - vector& splitKeys) -> tuple, vector> + vector& splitKeys) + -> tuple, vector> { TimeCost processSplitKeysTC; EASY_FUNCTION(profiler::colors::Purple) @@ -736,43 +772,45 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, batch->channel, id, batch->batchId); // 使用静态all2all通信:发送或接受量为预置固定值 scInfo[batch->name] = 65536 / rankSize 经验值 - if (rankInfo.useStatic) { // maybe move after all2all - for (KeysT& i: splitKeys) { + if (rankInfo.useStatic) { // maybe move after all2all + for (KeysT& i : splitKeys) { if (static_cast(i.size()) > embInfos[batch->name].sendCount) { - LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", - batch->name, batch->channel, batch->batchId, i.size()); + LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", batch->name, + batch->channel, batch->batchId, i.size()); throw runtime_error( StringFormat("%s[%d]:%d overflow! 
set send count bigger than %d", - batch->name.c_str(), batch->channel, batch->batchId, i.size()).c_str()); + batch->name.c_str(), batch->channel, batch->batchId, i.size()) + .c_str()); } i.resize(embInfos[batch->name].sendCount, -1); } } KeysT keySend; - vector sc; // send count - for (const auto& i: splitKeys) { + vector sc; // send count + for (const auto& i : splitKeys) { sc.push_back(static_cast(i.size())); keySend.insert(keySend.cend(), i.cbegin(), i.cend()); } KeysT keyRecv; TimeCost getScAllTC; - vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 + vector scAll = GetScAll( + sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 LOG_DEBUG("getScAllTC(ms)(AllReduce-AllGather):{}", getScAllTC.ElapsedMS()); vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") TimeCost uniqueAll2AllTC; - int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, - keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, keyRecv.data(), + rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -781,8 +819,8 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, EASY_END_BLOCK LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, batchName:{}, MPI_Alltoallv finish." 
" processSplitKeysTC(ms):{}", - batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); - return { keyRecv, scAll, ss }; + batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); + return {keyRecv, scAll, ss}; } /* @@ -790,15 +828,16 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, * splitKeys返回:将数据的key切分到其所在dev id对应的桶中,并去重。 * restore返回:去重后key在桶内偏移量(用于计算恢复向量) */ -tuple, vector> KeyProcess::HashSplit(const unique_ptr& batch) const +tuple, vector> KeyProcess::HashSplit( + const unique_ptr& batch) const { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -806,9 +845,10 @@ tuple, vector> KeyProcess::HashSplit(const unique_ptrsecond; } } @@ -821,10 +861,11 @@ tuple, vector> KeyProcess::HashSplit(const unique_ptrchannel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO + "channel_id {} batch_id {} rank_id {} batch_key_num {} unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return { splitKeys, restore }; + return {splitKeys, restore}; } void KeyProcess::PaddingAlltoallVC(vector& splitKeys) const @@ -846,10 +887,10 @@ tuple, vector, vector>> KeyProcess::Hash emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); - vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 + vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map> uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map> uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -857,10 +898,11 @@ tuple, vector, vector>> KeyProcess::Hash auto result = uKey.find(key); if (result == uKey.end()) { splitKeys[devId].push_back(key); - restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) + restore[i] = + hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) uKey[key].first = restore[i]; uKey[key].second = 1; - } else { // 去重 + } else { // 去重 restore[i] = result->second.first; uKey[key].second++; } @@ -886,20 +928,22 @@ tuple, vector, vector>> KeyProcess::Hash for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} faae_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO + "channel_id {} batch_id {} rank_id {} batch_key_num {} faae_unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return { splitKeys, restore, keyCount }; + return {splitKeys, restore, keyCount}; } -tuple, vector, vector> KeyProcess::HotHashSplit(const unique_ptr& batch) +tuple, vector, vector> KeyProcess::HotHashSplit( + const unique_ptr& batch) { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = batch->sample.data(); 
size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - absl::flat_hash_map uKey; // 用于去重查询 + absl::flat_hash_map uKey; // 用于去重查询 absl::flat_hash_map keyCountMapByEmbName; std::shared_lock lock(g_smut); auto hotMap = hotKey[batch->name]; @@ -908,31 +952,31 @@ tuple, vector, vector> KeyProcess::HotHashSplit(cons vector hotPosDev(hotEmbTotCount[batch->name]); int hotCount = 0; int hotOffset = hotEmbTotCount[batch->name]; - for (size_t i = 0; i < miniBs; i++) { // for mini batch + for (size_t i = 0; i < miniBs; i++) { // for mini batch const emb_key_t& key = batchData[i]; if (batch->batchId % hotEmbUpdateStep == 0) { keyCountMapByEmbName[key]++; } emb_key_t devId = abs(key % static_cast(rankInfo.rankSize)); auto result = uKey.find(key); - if (result != uKey.end()) { // // already in splitKeys + if (result != uKey.end()) { // // already in splitKeys restore[i] = result->second; continue; } // new key in current batch - splitKeys[devId].push_back(key); // push to bucket + splitKeys[devId].push_back(key); // push to bucket auto hot = hotMap.find(key); - if (hot != hotMap.end()) { // is hot key - if (hot->second == -1) { // is new hot key in this batch + if (hot != hotMap.end()) { // is hot key + if (hot->second == -1) { // is new hot key in this batch // pos in lookup vec (need add ss) for hot-gather hotPos[hotCount] = static_cast(splitKeys[devId].size()) - 1; - hotPosDev[hotCount] = devId; // which dev, for get ss + hotPosDev[hotCount] = devId; // which dev, for get ss hot->second = hotCount; - restore[i] = hotCount++; // get pos of hot emb + restore[i] = hotCount++; // get pos of hot emb } else { restore[i] = hot->second; } - } else { // is not hot key + } else { // is not hot key // restore记录去重后key在桶内偏移量(用于计算恢复向量) restore[i] = static_cast(splitKeys[devId].size() + (hotOffset - 1)); } @@ -944,22 +988,25 @@ tuple, vector, vector> KeyProcess::HotHashSplit(cons for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} hot_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO + "channel_id {} batch_id {} rank_id {} batch_key_num {} hot_unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], batch->batchId % hotEmbUpdateStep == 0, - batch->name); + UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], + batch->batchId % hotEmbUpdateStep == 0, batch->name); AddCountStartToHotPos(splitKeys, hotPos, hotPosDev, batch); - return { splitKeys, restore, hotPos }; + return {splitKeys, restore, hotPos}; } -void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, +void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, + const vector& hotPosDev, const unique_ptr& batch) { vector splitKeysSize; - for (auto& splitKey: splitKeys) { - int tmp = rankInfo.useStatic ? embInfos[batch->name].sendCount : static_cast(splitKey.size()); + for (auto& splitKey : splitKeys) { + int tmp = rankInfo.useStatic ? 
embInfos[batch->name].sendCount + : static_cast(splitKey.size()); splitKeysSize.push_back(tmp); } @@ -969,13 +1016,13 @@ void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& ho } } -void KeyProcess::UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, +void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, uint32_t count, bool refresh, const string& embName) { auto& hotMap = hotKey[embName]; if (refresh) { priority_queue> pq; - for (size_t i = 0;i < keySend.size(); ++i) { + for (size_t i = 0; i < keySend.size(); ++i) { if (keySend[i] == -1) { continue; } @@ -994,15 +1041,15 @@ void KeyProcess::UpdateHotMapForUnique(const KeysT &keySend, const vector& keyCountMapByEmbName, uint32_t count, bool refresh, - const string& embName) +void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, + uint32_t count, bool refresh, const string& embName) { if (!refresh) { return; } auto& hotMap = hotKey[embName]; - priority_queue> pq; // top k key - for (auto& p: keyCountMapByEmbName) { + priority_queue> pq; // top k key + for (auto& p : keyCountMapByEmbName) { pq.push(pair(-p.second, p.first)); if (pq.size() > count) { pq.pop(); @@ -1018,53 +1065,55 @@ void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapBy } /* - * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread id的)线程间的通信量矩阵 - * scAll返回:所有线程间的通信量矩阵(按行平铺的一维向量) + * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread + * id的)线程间的通信量矩阵 scAll返回:所有线程间的通信量矩阵(按行平铺的一维向量) */ -vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch) +vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, + const unique_ptr& batch) { EASY_FUNCTION() vector scAll; scAll.resize(rankInfo.rankSize * rankInfo.rankSize); - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, + batch->batchId); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, - scAll.data(), rankInfo.rankSize, MPI_INT, - comm[batch->channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAll.data(), + rankInfo.rankSize, MPI_INT, comm[batch->channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {} commId {}, MPI_Allgather failed:{}", rankInfo.rankId, commId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", - batch->channel, commId, batch->batchId, VectorToString(scAll)); + LOG_DEBUG( + "channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", + batch->channel, commId, batch->batchId, VectorToString(scAll)); return scAll; } -void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, - vector &scAllOut) +void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, + const unique_ptr& batch, vector& scAllOut) { EASY_FUNCTION() int channel = batch->channel; scAllOut.resize(rankInfo.rankSize * rankInfo.rankSize); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, - scAllOut.data(), rankInfo.rankSize, MPI_INT, - comm[channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAllOut.data(), + rankInfo.rankSize, MPI_INT, 
comm[channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Allgather failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", - channel, commId, batch->batchId, VectorToString(scAllOut)); + LOG_DEBUG( + "channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", + channel, commId, batch->batchId, VectorToString(scAllOut)); } void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel) { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; + auto& maxOffsetTmp = maxOffset[embName]; auto& evictPos = evictPosMap[embName]; for (long& key : splitKey) { if (key == -1) { @@ -1077,8 +1126,9 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe size_t offset; // 新值, emb有pos可复用 offset = evictPos.back(); - LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse offset [{}], evictSize [{}]!!!", - embName, key, offset, evictPos.size()); + LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse offset [{}], " + "evictSize [{}]!!!", + embName, key, offset, evictPos.size()); key2Offset[key] = offset; key = offset; evictPos.pop_back(); @@ -1096,18 +1146,18 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe LOG_ERROR("dev cache overflow {} > {}", maxOffsetTmp, embInfos[embName].devVocabSize); throw std::runtime_error("dev cache overflow!"); } - LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", - embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", embName, maxOffsetTmp, + embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel) { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; - auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion + auto& maxOffsetTmp = maxOffset[embName]; + auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion for (long& key : splitKey) { if (key == -1) { key = 0; @@ -1130,8 +1180,8 @@ void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& spli key = 0; } } - LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", - embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", embName, maxOffsetTmp, + embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } /* @@ -1156,11 +1206,11 @@ void KeyProcess::BuildRestoreVec(const unique_ptr& batch, const vecto hotNum += 1; } } - LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", - hotNum, batch->Size(), buildRestoreVecTC.ElapsedMS()); + LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", hotNum, batch->Size(), + buildRestoreVecTC.ElapsedMS()); } -template +template T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, int channel) { std::lock_guard lockGuard(mut); 
@@ -1170,7 +1220,8 @@ T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, in } auto topBatch = get(list[embName][channel].top()); if (topBatch < batch) { - LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, batch, channel); + LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, + batch, channel); this_thread::sleep_for(1s); } if (topBatch != batch) { @@ -1198,8 +1249,10 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", - embName, channel, batch); + LOG_DEBUG( + KEY_PROCESS + "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", + embName, channel, batch); return {}; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1220,8 +1273,9 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) SendEos(batch, channel); return {}; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", - embName, channel, batch, readEmbKeyBatchId); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey " + "batchId: {}.", + embName, channel, batch, readEmbKeyBatchId); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1245,22 +1299,27 @@ void KeyProcess::SendEos(int batchId, int channel) vector tensors; bool isNeedResend = true; - for (const auto& emb: as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, batchId, emb.first); + for (const auto& emb : + as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, + batchId, emb.first); if (!isRunning) { throw EndRunExit("SendEos end run, isRunning is false after lock destroyMutex."); } for (const string& transName : usedChannelNames) { - string sendName = StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); + string sendName = + StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); size_t channelSize = 0; - + acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] Before send eos, {} contains {}.", sendName, channelSize); - SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, isNeedResend); + SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, + isNeedResend); acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] After send eos, {} contains {}.", sendName, channelSize); } - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, batchId, emb.first); + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, + batchId, emb.first); } LOG_INFO("channelId:{} batchId:{}, SendEos end.", channel, batchId); @@ -1274,7 +1333,8 @@ void KeyProcess::SendEos(int batchId, int channel) /// \param channel 通道索引(训练/推理) /// \param type 数据类型 /// \return -unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type) 
+unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, + ProcessedInfo type) { TimeCost tc = TimeCost(); info_list_t* list; @@ -1299,7 +1359,9 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", + LOG_DEBUG( + KEY_PROCESS + "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", embName, channel, batch); return nullptr; } @@ -1317,15 +1379,18 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa return uTensor; } catch (EmptyList&) { unique_lock lockEosGuard(eosMutex); - // 避免eos在keyProcess还未处理完数据时插队到通道前面, readEmbKey真实的次数是readEmbedBatchId减1 - if (isNeedSendEos[channel] && (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { + // 避免eos在keyProcess还未处理完数据时插队到通道前面, + // readEmbKey真实的次数是readEmbedBatchId减1 + if (isNeedSendEos[channel] && + (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { LOG_INFO("channelId:{} batchId:{}, GetInfoVec eos.", channel, batch); unique_lock lockDestroyGuard(destroyMutex); SendEos(batch, channel); return nullptr; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", - embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey " + "batchId: {}.", + embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1338,7 +1403,7 @@ void KeyProcess::SendA2A(const vector& a2aInfo, const string& embName, int { // 数据放到队列里,在mgmt里面发送(检查发送数据量) auto tensors = make_unique>(); - Tensor tmpTensor(tensorflow::DT_INT64, { rankInfo.rankSize, rankInfo.rankSize }); + Tensor tmpTensor(tensorflow::DT_INT64, {rankInfo.rankSize, rankInfo.rankSize}); auto tmpData = tmpTensor.matrix(); for (int i = 0; i < rankInfo.rankSize; ++i) { for (int j = 0; j < rankInfo.rankSize; ++j) { @@ -1358,13 +1423,13 @@ int KeyProcess::GetMaxStep(int channelId) const return rankInfo.ctrlSteps.at(channelId); } -void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm +void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm funEvictCall: [{}]! 
keySize:{}", embName, keys.size()); EmbeddingMgmt::Instance()->EvictKeys(embName, keys); } -void KeyProcess::EvictKeysCombine(const vector& keys) // hbm +void KeyProcess::EvictKeysCombine(const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm combine funEvictCall, keySize:{}", keys.size()); EmbeddingMgmt::Instance()->EvictKeysCombine(keys); @@ -1373,7 +1438,7 @@ void KeyProcess::EvictKeysCombine(const vector& keys) // hbm void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector& keys) { EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD size_t keySize = keys.size(); auto& devHashMap = keyOffsetMap.at(embName); @@ -1387,7 +1452,7 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vectorsecond; @@ -1395,24 +1460,26 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector offset) { if (offset.size() > embInfos[embName].devVocabSize) { LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize); + embName, offset.size(), embInfos[embName].devVocabSize); throw runtime_error( - Logger::Format("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize - ).c_str()); + Logger::Format( + "{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", + embName, offset.size(), embInfos[embName].devVocabSize) + .c_str()); } vector tmpDataOut; Tensor tmpData = Vec2TensorI32(offset); tmpDataOut.emplace_back(tmpData); - tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); + tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, {1})); auto evictLen = tmpDataOut.back().flat(); int evictSize = static_cast(offset.size()); @@ -1422,15 +1489,16 @@ void KeyProcess::EvictInitDeviceEmb(const string& embName, vector offset auto trans = Singleton::GetInstance(); trans->Send(TransferChannel::EVICT, tmpDataOut, TRAIN_CHANNEL_ID, embName); - LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! send offsetSize:{}", embName, offset.size()); + LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! 
send offsetSize:{}", embName, + offset.size()); } -string KeyProcess::DumpSplitKeys(vector> &splitKeys) const +string KeyProcess::DumpSplitKeys(vector>& splitKeys) const { stringstream ssTrace; for (int devId = 0; devId < rankInfo.rankSize; ++devId) { ssTrace << '|' << devId << ":"; - for (auto key: splitKeys[devId]) { + for (auto key : splitKeys[devId]) { ssTrace << key << ','; } ssTrace << '|'; diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h index 8bd7b8d0..d6a0b80b 100644 --- a/src/core/key_process/key_process.h +++ b/src/core/key_process/key_process.h @@ -16,283 +16,291 @@ See the License for the specific language governing permissions and #ifndef MX_REC_KEY_PROCESS_H #define MX_REC_KEY_PROCESS_H -#include +#include +#include + #include #include +#include #include #include -#include - -#include -#include -#include "ock_ctr_common/include/factory.h" +#include -#include "utils/common.h" #include "emb_table/emb_table.h" #include "feature_admit_and_evict.h" #include "hybrid_mgmt/hybrid_mgmt_block.h" +#include "ock_ctr_common/include/factory.h" +#include "utils/common.h" #include "utils/singleton.h" namespace MxRec { - using namespace std; +using namespace std; - template - struct Cmp { - bool operator()(const T& a, const T& b) const - { - return get(a) > get(b); // batch id order - } - }; +template +struct Cmp { + bool operator()(const T& a, const T& b) const + { + return get(a) > get(b); // batch id order + } +}; - template - using heap_t = priority_queue, Cmp>; +template +using heap_t = priority_queue, Cmp>; - template - using info_list_t = map, MAX_QUEUE_NUM>>; +template +using info_list_t = map, MAX_QUEUE_NUM>>; - enum class ProcessedInfo { - RESTORE, - ALL2ALL, - INVALID - }; +enum class ProcessedInfo { + RESTORE, + ALL2ALL, + INVALID +}; - class EndRunExit : public std::exception { - public: - explicit EndRunExit(const char* message) : errorMessage(message) {} +class EndRunExit : public std::exception { +public: + explicit EndRunExit(const char* message) : errorMessage(message) {} - const char* what() const noexcept override - { - return errorMessage; - } + const char* what() const noexcept override + { + return errorMessage; + } - private: - const char* errorMessage; - }; +private: + const char* errorMessage; +}; - constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 - constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 +constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 +constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 - class EmptyList : public std::exception { - }; +class EmptyList : public std::exception {}; - class WrongListTop : public std::exception { - }; +class WrongListTop : public std::exception {}; - class KeyProcess { - public: - bool Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues = {}, int seed = 0); +class KeyProcess { +public: + bool Initialize(const RankInfo& rInfo, const vector& eInfos, + const vector& thresholdValues = {}, int seed = 0); - unique_ptr> GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type); + unique_ptr> GetInfoVec(int batch, const string& embName, int channel, + ProcessedInfo type); - KeysT GetLookupKeys(int batch, const string& embName, int channel); + KeysT GetLookupKeys(int batch, const string& embName, int channel); - int GetMaxStep(int channelId) const; + int GetMaxStep(int channelId) const; - OffsetMemT GetMaxOffset(); + OffsetMemT GetMaxOffset(); - KeyOffsetMemT GetKeyOffsetMap(); + KeyOffsetMemT GetKeyOffsetMap(); - 
KeyCountMemT GetKeyCountMap(); + KeyCountMemT GetKeyCountMap(); - FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); + FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); - void LoadMaxOffset(OffsetMemT& loadData); + void LoadMaxOffset(OffsetMemT& loadData); - void LoadKeyOffsetMap(KeyOffsetMemT& loadData); + void LoadKeyOffsetMap(KeyOffsetMemT& loadData); - void LoadKeyCountMap(KeyCountMemT& loadData); + void LoadKeyCountMap(KeyCountMemT& loadData); - void Destroy(); + void Destroy(); - void LoadSaveLock(); + void LoadSaveLock(); - void LoadSaveUnlock(); + void LoadSaveUnlock(); - void EvictKeys(const string& embName, const vector& keys); + void EvictKeys(const string& embName, const vector& keys); - void EvictKeysCombine(const vector& keys); + void EvictKeysCombine(const vector& keys); - void SetupHotEmbUpdateStep(); + void SetupHotEmbUpdateStep(); - int64_t GetExpansionTableSize(const string& embName); + int64_t GetExpansionTableSize(const string& embName); - int64_t GetExpansionTableCapacity(const string& embName); + int64_t GetExpansionTableCapacity(const string& embName); - void RecordKeyCountMap(const unique_ptr& batch); + void RecordKeyCountMap(const unique_ptr& batch); - template - void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) - { - absl::flat_hash_map umap; - restoreVecSec.resize(lookupKeys.size(), -1); - int32_t length = 0; + template + void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) + { + absl::flat_hash_map umap; + restoreVecSec.resize(lookupKeys.size(), -1); + int32_t length = 0; - for (size_t i = 0; i < lookupKeys.size(); ++i) { - int64_t key = lookupKeys[i]; - if (rankInfo.useStatic && ( - (!rankInfo.useDynamicExpansion && key == -1) || (rankInfo.useDynamicExpansion && key == 0))) { - continue; - } + for (size_t i = 0; i < lookupKeys.size(); ++i) { + int64_t key = lookupKeys[i]; + if (rankInfo.useStatic && ((!rankInfo.useDynamicExpansion && key == -1) || + (rankInfo.useDynamicExpansion && key == 0))) { + continue; + } - auto result = umap.find(key); - if (result == umap.end()) { - uniqueKeys.push_back(lookupKeys[i]); - umap[key] = length; - restoreVecSec[i] = length; - length++; - } else { - restoreVecSec[i] = result->second; - } + auto result = umap.find(key); + if (result == umap.end()) { + uniqueKeys.push_back(lookupKeys[i]); + umap[key] = length; + restoreVecSec[i] = length; + length++; + } else { + restoreVecSec[i] = result->second; } + } - if (rankInfo.useStatic) { - if (rankInfo.useDynamicExpansion) { - uniqueKeys.resize(lookupKeys.size(), 0); - } else { - uniqueKeys.resize(lookupKeys.size(), -1); - } + if (rankInfo.useStatic) { + if (rankInfo.useDynamicExpansion) { + uniqueKeys.resize(lookupKeys.size(), 0); + } else { + uniqueKeys.resize(lookupKeys.size(), -1); } } + } + + void SetEos(int status, int channelId); - void SetEos(int status, int channelId); + void SendEos(int batchId, int channel); - void SendEos(int batchId, int channel); + bool isRunning{false}; - bool isRunning { false }; + std::mutex destroyMutex; + std::mutex eosMutex; + inline bool HasEmbName(const string& embName) + { + return embInfos.find(embName) != embInfos.end(); + }; + GTEST_PRIVATE : - std::mutex destroyMutex; - std::mutex eosMutex; - inline bool HasEmbName(const string& embName) - { - return embInfos.find(embName) != embInfos.end(); - }; - GTEST_PRIVATE: + int + Start(); - int Start(); + template + T GetInfo(info_list_t& list, int batch, const string& embName, int channel); - template - T GetInfo(info_list_t& list, int batch, const string& embName, int 
channel); + RankInfo rankInfo; + map embInfos; + MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; + std::mutex mut{}; + vector> procThreads{}; + std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]{}; + info_list_t lookupKeysList; + list>> storage; + info_list_t infoList; + info_list_t all2AllList; + map maxOffset{}; + map> keyOffsetMap{}; + map> keyCountMap{}; + FeatureAdmitAndEvict m_featureAdmitAndEvict{}; + map> evictPosMap{}; + map> hotKey{}; + map hotEmbTotCount; + map embeddingTableMap{}; + ock::ctr::FactoryPtr factory{}; + int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; + bool isWithFAAE; + bool isNeedSendEos[2] = {0, 0}; // 分别代表通道0、1的eos状态 - RankInfo rankInfo; - map embInfos; - MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; - std::mutex mut {}; - vector> procThreads {}; - std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD] {}; - info_list_t lookupKeysList; - list>> storage; - info_list_t infoList; - info_list_t all2AllList; - map maxOffset {}; - map> keyOffsetMap {}; - map> keyCountMap {}; - FeatureAdmitAndEvict m_featureAdmitAndEvict {}; - map> evictPosMap {}; - map> hotKey {}; - map hotEmbTotCount; - map embeddingTableMap {}; - ock::ctr::FactoryPtr factory {}; - int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; - bool isWithFAAE; - bool isNeedSendEos[2] = { 0, 0 }; // 分别代表通道0、1的eos状态 + void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); - void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); + void KeyProcessTask(int channel, int threadId); - void KeyProcessTask(int channel, int threadId); + void KeyProcessTaskWithFastUnique(int channel, int threadId); - void KeyProcessTaskWithFastUnique(int channel, int threadId); + bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); - bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); + bool KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, + ock::ctr::UniquePtr& unique, int channel, int threadId); - bool KeyProcessTaskHelperWithFastUnique(unique_ptr &batch, ock::ctr::UniquePtr& unique, - int channel, int threadId); + tuple, vector> ProcessSplitKeys(const unique_ptr& batch, + int id, vector& splitKeys); - tuple, vector> ProcessSplitKeys(const unique_ptr& batch, - int id, vector& splitKeys); + void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); - void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); + void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, + bool& uniqueInitialize, const unique_ptr& batch, + ock::ctr::UniquePtr& unique); - void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, - const unique_ptr & batch, ock::ctr::UniquePtr& unique); + void ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, + int id, UniqueInfo& uniqueInfoOut); - void ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniquePtr& unique, - int id, UniqueInfo& uniqueInfoOut); + size_t GetKeySize(const unique_ptr& batch); - size_t GetKeySize(const unique_ptr &batch); + void All2All(vector& sc, int id, const unique_ptr& batch, + KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut); - void All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, - All2AllInfo& all2AllInfoOut); + auto HashSplit(const unique_ptr& batch) const + -> tuple, vector>; - auto HashSplit(const unique_ptr& batch) const -> tuple, vector>; + auto HotHashSplit(const unique_ptr& batch) + -> tuple, vector, vector>; - auto 
HotHashSplit(const unique_ptr& batch) -> tuple, vector, vector>; + void PaddingAlltoallVC(vector& splitKeys) const; - void PaddingAlltoallVC(vector& splitKeys) const; + tuple, vector, vector>> HashSplitWithFAAE( + const unique_ptr& batch) const; - tuple, vector, vector>> - HashSplitWithFAAE(const unique_ptr& batch) const; + vector GetScAll(const vector& keyScLocal, int commId, + const unique_ptr& batch); - vector GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch); + void GetScAllForUnique(const vector& keyScLocal, int commId, + const unique_ptr& batch, vector& scAllOut); - void GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, - vector &scAllOut); + void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); - void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); + void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); - void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); + unique_ptr GetBatchData(int channel, int commId) const; - unique_ptr GetBatchData(int channel, int commId) const; + void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, + vector& restoreVec, int hotPosSize = 0) const; - void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, - vector& restoreVec, int hotPosSize = 0) const; - - void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); + void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); - void EvictDeleteDeviceEmb(const string& embName, const vector& keys); + void EvictDeleteDeviceEmb(const string& embName, const vector& keys); - void EvictInitDeviceEmb(const string& embName, vector offset); + void EvictInitDeviceEmb(const string& embName, vector offset); - void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, - const string& embName); + void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, + bool refresh, const string& embName); - void UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, - uint32_t count, bool refresh, const string& embName); + void UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, + uint32_t count, bool refresh, const string& embName); - void HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, - KeySendInfo& keySendInfo, vector& sc, vector& splitSize); + void HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, + KeySendInfo& keySendInfo, vector& sc, vector& splitSize); - void PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys); + void PushResult(unique_ptr& batch, unique_ptr> tensors, + KeysT& lookupKeys); - void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel); + void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, + int channel); - void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, - const unique_ptr& batch); + void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, + const vector& hotPosDev, const unique_ptr& batch); - void ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, - vector &hotPos, vector &restore, const int hotOffset) const; + void ComputeHotPos(const unique_ptr& batch, + absl::flat_hash_map& hotMap, vector& hotPos, + vector& restore, const int hotOffset) const; - vector GetCountRecv(const unique_ptr& batch, int id, - vector>& 
                                keyCount, vector scAll, vector ss);
+    vector GetCountRecv(const unique_ptr& batch, int id,
+                        vector>& keyCount, vector scAll,
+                        vector ss);

-    void HashSplitHelper(const unique_ptr & batch, vector & splitKeys,
-                         vector & restore, vector & hotPos,
-                         vector >& keyCount);
+    void HashSplitHelper(const unique_ptr& batch, vector& splitKeys,
+                         vector& restore, vector& hotPos,
+                         vector>& keyCount);

-    template
-    inline vector Count2Start(const vector& count) const
-    {
-        vector start = { 0 };
-        for (size_t i = 0; i < count.size() - 1; ++i) {
-            start.push_back(count[i] + start.back());
-        }
-        return start;
+    template
+    inline vector Count2Start(const vector& count) const
+    {
+        vector start = {0};
+        for (size_t i = 0; i < count.size() - 1; ++i) {
+            start.push_back(count[i] + start.back());
        }
+        return start;
+    }

-    string DumpSplitKeys(vector>& splitKeys) const;
-    };
+    string DumpSplitKeys(vector>& splitKeys) const;
+};

 #define KEY_PROCESS_INSTANCE Singleton::GetInstance()
-} // end namespace MxRec
+} // end namespace MxRec

-#endif // MX_REC_KEY_PROCESS_H
+#endif // MX_REC_KEY_PROCESS_H
--
Gitee

From d6db1b2256f2a7d70d67652b20735dd52b35f822 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Mon, 22 Apr 2024 16:22:54 +0800
Subject: [PATCH 053/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=A0=B9=E6=8D=AE=E4=BC=98?=
 =?UTF-8?q?=E5=8C=96=E5=99=A8=E7=B1=BB=E5=9E=8B=E8=87=AA=E5=8A=A8=E5=88=A4?=
 =?UTF-8?q?=E6=96=AD=E6=98=AF=E5=90=A6=E5=BC=80=E5=90=AF=E5=85=A8=E5=B1=80?=
 =?UTF-8?q?=E5=8E=BB=E9=87=8D=E7=89=B9=E6=80=A7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/optimizers/adagrad.py   | 20 +++++++++++++++++---
 mx_rec/optimizers/ftrl.py      | 22 ++++++++++++++--------
 mx_rec/optimizers/lazy_adam.py | 15 ++++++++++-----
 3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py
index 4ba444a6..fe8a0a2d 100644
--- a/mx_rec/optimizers/adagrad.py
+++ b/mx_rec/optimizers/adagrad.py
@@ -21,6 +21,7 @@ from __future__ import print_function

 from collections import defaultdict

+from tensorflow.python.framework import ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.training import adagrad, training_ops
@@ -129,13 +130,26 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer):
             self._get_or_make_slot_with_initializer(var, init, var.get_shape(), dtype, "acc",
                                                     acc_state_name)

+    def _apply_sparse_duplicate_indices(self, grad, var):
+        # The _apply_sparse_duplicate_indices method includes tf.unique and unsorted_segment_sum operations, which
+        # may introduce dynamic shape problems; if you encounter that, uncomment the method below.
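[Editor's note, illustration only, not part of the patch: sum_same_id_gradients is this project's replacement for TensorFlow's stock duplicate-index handling, which leans on tf.unique and unsorted_segment_sum and therefore produces dynamic shapes. The aggregation itself just sums the gradient rows that share a lookup id. A minimal NumPy sketch of that behaviour; the toy ids and values are invented, and this is not the library's actual implementation:

    import numpy as np

    indices = np.array([3, 7, 3, 1])                         # id 3 appears twice
    grads = np.array([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])

    # Deduplicate the ids and accumulate the gradient rows of each duplicate.
    unique_keys, inverse = np.unique(indices, return_inverse=True)
    summed = np.zeros((unique_keys.size, grads.shape[1]))
    np.add.at(summed, inverse, grads)

    # unique_keys -> [1 3 7]; the row for id 3 is [4., 4.] (rows 0 and 2 summed).

The override then wraps the deduplicated (values, indices) pair in an ops.IndexedSlices and hands it to _apply_sparse, as the added lines below show.]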
+        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False)
+        gradient_no_duplicate_indices = ops.IndexedSlices(
+            indices=unique_keys,
+            values=unique_local_grad,
+            dense_shape=grad.dense_shape)
+        return self._apply_sparse(gradient_no_duplicate_indices, var)
+
+    def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices):
+        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad, var=handle, is_expansion=False)
+        return self._resource_apply_sparse(unique_local_grad, handle, unique_keys)
+
     def _apply_sparse(self, grad, var):
         acc = self.get_slot(var, "acc")
-        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False)
         return training_ops.sparse_apply_adagrad(
             var, acc, math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
-            unique_local_grad,
-            unique_keys,
+            grad.values,
+            grad.indices,
             use_locking=self._use_locking)

     def _resource_apply_sparse(self, grad, var, indices):
diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py
index 3659ffcd..855fa9c4 100644
--- a/mx_rec/optimizers/ftrl.py
+++ b/mx_rec/optimizers/ftrl.py
@@ -120,10 +120,18 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer):
         return [self._initial_accumulator_value, initial_linear_value]

     def _apply_sparse_duplicate_indices(self, grad, var):
-        return self._apply_sparse(grad, var)
+        # The _apply_sparse_duplicate_indices method includes tf.unique and unsorted_segment_sum operations, which
+        # may introduce dynamic shape problems; if you encounter that, uncomment the method below.
+        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False)
+        gradient_no_duplicate_indices = ops.IndexedSlices(
+            indices=unique_keys,
+            values=unique_local_grad,
+            dense_shape=grad.dense_shape)
+        return self._apply_sparse(gradient_no_duplicate_indices, var)

     def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices):
-        return self._resource_apply_sparse(grad, handle, indices)
+        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad, var=handle, is_expansion=False)
+        return self._resource_apply_sparse(unique_local_grad, handle, unique_keys)

     def _resource_apply_sparse(self, grad, handle, indices):
         if self._l2_shrinkage_regularization_strength <= 0.0:
@@ -140,19 +148,17 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer):
                                              self._resource_scatter_nd_update)

     def _apply_sparse(self, grad, var):
-        unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False)
-
         if self._l2_shrinkage_regularization_strength <= 0.0:
             return self._apply_sparse_shared(
-                unique_local_grad,
+                grad.values,
                 var,
-                unique_keys,
+                grad.indices,
                 lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v))
         else:
             return self._apply_sparse_shared_v2(
-                unique_local_grad,
+                grad.values,
                 var,
-                unique_keys,
+                grad.indices,
                 lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v))

     def _apply_sparse_shared(self, grad, var, indices, scatter_nd_update):
diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py
index bab8245f..6ac7e844 100644
--- a/mx_rec/optimizers/lazy_adam.py
+++ b/mx_rec/optimizers/lazy_adam.py
@@ -119,10 +119,16 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer):

     def _apply_sparse_duplicate_indices(self, grad, var):
         # _apply_sparse_duplicate_indices method include tf.unique and unsorted_segment_sum operations which may
         # introduce dynamic shape problem,
if encounter that, please de-annotation the method below. - return self._apply_sparse(grad, var) + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) + gradient_no_duplicate_indices = ops.IndexedSlices( + indices=unique_keys, + values=unique_local_grad, + dense_shape=grad.dense_shape) + return self._apply_sparse(gradient_no_duplicate_indices, var) def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices): - return self._resource_apply_sparse(grad, handle, indices) + unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad, var=handle, is_expansion=False) + return self._resource_apply_sparse(unique_local_grad, handle, unique_keys) def _apply_dense(self, grad, var): raise NotImplementedError("You are using a wrong type of variable.") @@ -149,11 +155,10 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): self._resource_scatter_nd_add) def _apply_sparse(self, grad, var): - unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad.values, var=var, is_expansion=False) return self._apply_sparse_shared( - unique_local_grad, + grad.values, var, - unique_keys, + grad.indices, lambda x, i, v: tf.compat.v1.scatter_nd_add(x, i, v)) def _apply_sparse_shared(self, grad, var, indices, scatter_nd_add): -- Gitee From ba920189251739b7654296b881cdb9501f49eef3 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 22 Apr 2024 16:48:57 +0800 Subject: [PATCH 054/302] =?UTF-8?q?=E6=9B=B4=E6=94=B9=E8=A1=8C=E5=AE=BD?= =?UTF-8?q?=E5=92=8C=E4=BA=8C=E5=85=83=E8=BF=90=E7=AE=97=E7=AC=A6=E9=85=8D?= =?UTF-8?q?=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .clang-format | 4 +- src/core/key_process/key_process.cpp | 351 +++++++++++---------------- src/core/key_process/key_process.h | 82 +++---- 3 files changed, 181 insertions(+), 256 deletions(-) diff --git a/.clang-format b/.clang-format index f1f5b0d0..ee9f3a3c 100644 --- a/.clang-format +++ b/.clang-format @@ -1,7 +1,7 @@ Language: Cpp BasedOnStyle: Google AccessModifierOffset: -4 -ColumnLimit: 100 +ColumnLimit: 120 IndentWidth: 4 UseTab: Never AlignOperands: Align @@ -34,7 +34,7 @@ BraceWrapping: BeforeCatch: false BeforeElse: false IndentBraces: false -BreakBeforeBinaryOperators: None +BreakBeforeBinaryOperators: NonAssignment BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakStringLiterals: true diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 85b17bbb..9751e268 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -83,13 +83,12 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos if (GlobalEnv::fastUnique) { int result = ock::ctr::Factory::Create(factory); if (result != 0) { - throw runtime_error( - Logger::Format("create fast factory failed, error code:{}", result)); + throw runtime_error(Logger::Format("create fast factory failed, error code:{}", result)); } } - LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}", - MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); + LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}", MapToString(scInfo), + rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); #ifndef GTEST Start(); #endif @@ -135,9 +134,8 @@ void KeyProcess::InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo) if (rankInfo.useDynamicExpansion) { embeddingSize = 
info.embeddingSize; } - hotEmbTotCount[info.name] = - static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * - HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); + hotEmbTotCount[info.name] = static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) + * HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); } OffsetMemT KeyProcess::GetMaxOffset() @@ -229,9 +227,8 @@ void KeyProcess::GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf) uniqueConf.maxThreadNum = GlobalEnv::maxUniqueThreadNum; } -void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, - bool& uniqueInitialize, const unique_ptr& batch, - ock::ctr::UniquePtr& unique) +void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, + const unique_ptr& batch, ock::ctr::UniquePtr& unique) { uniqueConf.desiredSize = static_cast(batch->Size()); if (preBatchSize != batch->Size()) { @@ -273,8 +270,7 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = - GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -286,11 +282,10 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) if (!KeyProcessTaskHelperWithFastUnique(batch, unique, channel, threadId)) { break; } - LOG_INFO(KEY_PROCESS - "getAndProcessTC(ms):{}, key process with fast unique cost:{}," - " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", - getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, - batch->name, batch->channel, threadId, batch->batchId); + LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process with fast unique cost:{}," + " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", + getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, + batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); auto batchQueue = SingletonQueue::GetInstances(queueIndex); batchQueue->PutDirty(move(batch)); @@ -299,8 +294,8 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) } catch (const EndRunExit& e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } - LOG_INFO(KEY_PROCESS "KeyProcessTaskWithFastUnique exit. rank:{} channelId:{}, threadId:{}", - rankInfo.rankId, channel, threadId); + LOG_INFO(KEY_PROCESS "KeyProcessTaskWithFastUnique exit. 
rank:{} channelId:{}, threadId:{}", rankInfo.rankId, + channel, threadId); } void KeyProcess::KeyProcessTask(int channel, int threadId) @@ -310,8 +305,7 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = - GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -322,11 +316,10 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) if (!KeyProcessTaskHelper(batch, channel, threadId)) { break; } - LOG_INFO(KEY_PROCESS - "getAndProcessTC(ms):{}, key process cost:{}," - " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", - getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, - batch->name, batch->channel, threadId, batch->batchId); + LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process cost:{}," + " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", + getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, + batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); auto batchQueue = SingletonQueue::GetInstances(queueIndex); batchQueue->PutDirty(move(batch)); @@ -334,17 +327,15 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) } catch (const EndRunExit& e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } - LOG_INFO(KEY_PROCESS "KeyProcessTask exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, - channel, threadId); + LOG_INFO(KEY_PROCESS "KeyProcessTask exit. 
rank:{} channelId:{}, threadId:{}", rankInfo.rankId, channel, threadId); } -void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& splitKeys, - vector& restore, vector& hotPos, - vector>& keyCount) +void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& splitKeys, vector& restore, + vector& hotPos, vector>& keyCount) { TimeCost uniqueTc; - if (m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { + if (m_featureAdmitAndEvict.GetFunctionSwitch() + && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 } else { tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 @@ -352,13 +343,12 @@ void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& batch, - ock::ctr::UniquePtr& unique, int channel, - int threadId) +bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, ock::ctr::UniquePtr& unique, + int channel, int threadId) { // tuple for keyRec restore hotPos scAll countRecv - isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; + isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() + && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; TimeCost totalTimeCost = TimeCost(); TimeCost fastUniqueTC; UniqueInfo uniqueInfo; @@ -366,12 +356,12 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch LOG_DEBUG("ProcessBatchWithFastUnique(ms):{}", fastUniqueTC.ElapsedMS()); // 特征准入&淘汰 - if (isWithFAAE && - (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, uniqueInfo.all2AllInfo.keyRecv, - uniqueInfo.all2AllInfo.countRecv) == - FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { - LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", - rankInfo.rankId, threadId, channel); + if (isWithFAAE + && (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, uniqueInfo.all2AllInfo.keyRecv, + uniqueInfo.all2AllInfo.countRecv) + == FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, + threadId, channel); return false; } std::lock_guard lock(loadSaveMut[channel][threadId]); @@ -397,17 +387,15 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); - tensors->push_back(rankInfo.useDynamicExpansion - ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) - : Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); + tensors->push_back(rankInfo.useDynamicExpansion ? 
Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) + : Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); } TimeCost pushResultTC; PushResult(batch, move(tensors), uniqueInfo.all2AllInfo.keyRecv); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO - "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", - channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", channel, + batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); return true; @@ -424,8 +412,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, auto [lookupKeys, scAll, ss] = ProcessSplitKeys(batch, threadId, splitKeys); vector countRecv; - if (m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { + if (m_featureAdmitAndEvict.GetFunctionSwitch() + && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { countRecv = GetCountRecv(batch, threadId, keyCount, scAll, ss); } std::lock_guard lock(loadSaveMut[channel][threadId]); @@ -433,12 +421,12 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, BuildRestoreVec(batch, ss, restore, static_cast(hotPos.size())); // 特征准入&淘汰 - if (m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE && - (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) == - FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { - LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", - rankInfo.rankId, threadId, channel); + if (m_featureAdmitAndEvict.GetFunctionSwitch() + && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE + && (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) + == FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, + threadId, channel); return false; } @@ -462,25 +450,22 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(lookupKeys) - : Vec2TensorI32(lookupKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? 
Vec2TensorI64(lookupKeys) : Vec2TensorI32(lookupKeys)); } PushResult(batch, move(tensors), lookupKeys); LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", channel, - batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", channel, batch->batchId, + rankInfo.rankId, totalTimeCost.ElapsedMS()); } return true; } -void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, - KeysT& lookupKeys, int channel) +void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) { - if (GlobalEnv::applyGradientsStrategy == - ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && - channel == TRAIN_CHANNEL_ID) { + if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY + && channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; @@ -488,14 +473,12 @@ void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tenso GlobalUnique(lookupKeys, uniqueKeys, restoreVecSec); LOG_DEBUG("globalUniqueSyncTC(ms):{}", globalUniqueSyncTC.ElapsedMS()); tensors->push_back(Vec2TensorI32(restoreVecSec)); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) - : Vec2TensorI32(uniqueKeys)); + tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) : Vec2TensorI32(uniqueKeys)); } } vector KeyProcess::GetCountRecv(const unique_ptr& batch, int id, - vector>& keyCount, vector scAll, - vector ss) + vector>& keyCount, vector scAll, vector ss) { TimeCost getCountRecvTC; if (rankInfo.useStatic) { @@ -518,9 +501,8 @@ vector KeyProcess::GetCountRecv(const unique_ptr& batch, in vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 vector countRecv; countRecv.resize(rs.back() + rc.back()); - int retCode = - MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), - rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), rc.data(), + rs.data(), MPI_UINT32_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -528,16 +510,13 @@ vector KeyProcess::GetCountRecv(const unique_ptr& batch, in return countRecv; } -void KeyProcess::PushResult(unique_ptr& batch, unique_ptr> tensors, - KeysT& lookupKeys) +void KeyProcess::PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys) { std::unique_lock lockGuard(mut); storage.push_front(move(tensors)); - infoList[batch->name][batch->channel].push( - make_tuple(batch->batchId, batch->name, storage.begin())); + infoList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, storage.begin())); if (rankInfo.isDDR) { - lookupKeysList[batch->name][batch->channel].push( - make_tuple(batch->batchId, batch->name, move(lookupKeys))); + lookupKeysList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, move(lookupKeys))); } lockGuard.unlock(); } @@ -565,9 +544,8 @@ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const this_thread::sleep_for(100us); if (tc.ElapsedSec() > GET_BATCH_TIMEOUT) { if (commId == 0) { - LOG_WARN(KEY_PROCESS - "getting batch timeout! 1. check last 'read batch cost' print. 
" - "channel[{}] commId[{}]", + LOG_WARN(KEY_PROCESS "getting batch timeout! 1. check last 'read batch cost' print. " + "channel[{}] commId[{}]", channel, commId); } this_thread::sleep_for(seconds(1)); @@ -575,22 +553,18 @@ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const } if (!isRunning) { - LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, - commId); + LOG_WARN("channelId:{} threadId:{}, isRunning is false when GetBatchData", channel, commId); throw EndRunExit("GetBatchData end run."); } } EASY_END_BLOCK - LOG_DEBUG( - KEY_PROCESS - "channelId:{} threadId:{} batchId:{}, get batch data done, batchName:{}. bs:{} sample:[{}]", - batch->channel, commId, batch->batchId, batch->name, batch->Size(), batch->UnParse()); + LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, get batch data done, batchName:{}. bs:{} sample:[{}]", + batch->channel, commId, batch->batchId, batch->name, batch->Size(), batch->UnParse()); #if defined(PROFILING) && defined(BUILD_WITH_EASY_PROFILER) if (batch->batchId == PROFILING_START_BATCH_ID) { EASY_PROFILER_ENABLE } else if (batch->batchId == PROFILING_END_BATCH_ID) { - ::profiler::dumpBlocksToFile( - StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); + ::profiler::dumpBlocksToFile(StringFormat("/home/MX_REC-profile-%d.prof", rankInfo.rankId).c_str()); } #endif return batch; @@ -605,8 +579,7 @@ size_t KeyProcess::GetKeySize(const unique_ptr& batch) return size; } -void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, - ock::ctr::UniquePtr& unique, int id, +void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, int id, UniqueInfo& uniqueInfoOut) { EASY_FUNCTION(profiler::colors::Purple) @@ -655,20 +628,18 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, LOG_DEBUG(KEY_PROCESS "ProcessBatchWithFastUnique get batchId:{}, batchSize:{}," " channel:{}, name:{}, restore:{}, keyCount:{}", - batch->batchId, batch->Size(), batch->channel, batch->name, - uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); + batch->batchId, batch->Size(), batch->channel, batch->name, uniqueInfoOut.restore.size(), + keySendInfo.keyCount.size()); if (GlogConfig::gStatOn) { LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} " "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), - uniqueOut.uniqueIdCnt); + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueOut.uniqueIdCnt); } } -void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, - UniqueInfo& uniqueInfoOut, KeySendInfo& keySendInfo, - vector& sc, vector& splitSize) +void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, + KeySendInfo& keySendInfo, vector& sc, vector& splitSize) { std::shared_lock lock(g_smut); absl::flat_hash_map hotMap = hotKey[batch->name]; @@ -681,8 +652,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, TimeCost computeHotTc; ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); - UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, hotOffset, - batch->batchId % hotEmbUpdateStep == 0, batch->name); + UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, hotOffset, batch->batchId % hotEmbUpdateStep == 0, + batch->name); if (rankInfo.useStatic) { 
sc.resize(rankInfo.rankSize, embInfos[batch->name].sendCount); @@ -694,9 +665,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, } } -void KeyProcess::ComputeHotPos(const unique_ptr& batch, - absl::flat_hash_map& hotMap, vector& hotPos, - vector& restore, const int hotOffset) const +void KeyProcess::ComputeHotPos(const unique_ptr& batch, absl::flat_hash_map& hotMap, + vector& hotPos, vector& restore, const int hotOffset) const { emb_key_t* inputData = batch->sample.data(); size_t miniBs = batch->Size(); @@ -719,8 +689,8 @@ void KeyProcess::ComputeHotPos(const unique_ptr& batch, } } -void KeyProcess::All2All(vector& sc, int id, const unique_ptr& batch, - KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut) +void KeyProcess::All2All(vector& sc, int id, const unique_ptr& batch, KeySendInfo& keySendInfo, + All2AllInfo& all2AllInfoOut) { TimeCost getScAllTC; int channel = batch->channel; @@ -739,48 +709,43 @@ void KeyProcess::All2All(vector& sc, int id, const unique_ptr& b all2AllInfoOut.keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") int retCode = MPI_Alltoallv(keySendInfo.keySend.data(), sc.data(), ss.data(), MPI_INT64_T, - all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, - comm[channel][id]); + all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm[channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, - batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All MPI_Alltoallv end.", channel, id, batch->batchId); all2AllInfoOut.countRecv.resize(rs.back() + rc.back()); if (isWithFAAE) { retCode = MPI_Alltoallv(keySendInfo.keyCount.data(), sc.data(), ss.data(), MPI_UINT32_T, - all2AllInfoOut.countRecv.data(), rc.data(), rs.data(), MPI_UINT32_T, - comm[channel][id]); + all2AllInfoOut.countRecv.data(), rc.data(), rs.data(), MPI_UINT32_T, comm[channel][id]); if (retCode != MPI_SUCCESS) { - LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", channel, id, - batch->batchId, retCode); + LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", channel, id, batch->batchId, + retCode); } } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC TimeCost(ms):{}", - channel, id, batch->batchId, all2allTC.ElapsedMS()); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC TimeCost(ms):{}", channel, id, + batch->batchId, all2allTC.ElapsedMS()); EASY_END_BLOCK } -auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, - vector& splitKeys) +auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, vector& splitKeys) -> tuple, vector> { TimeCost processSplitKeysTC; EASY_FUNCTION(profiler::colors::Purple) EASY_VALUE("batchId", batch->batchId) - LOG_INFO(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, ProcessSplitKeys start.", - batch->channel, id, batch->batchId); + LOG_INFO(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, ProcessSplitKeys start.", batch->channel, id, + batch->batchId); // 使用静态all2all通信:发送或接受量为预置固定值 scInfo[batch->name] = 65536 / rankSize 经验值 if (rankInfo.useStatic) { // maybe move after all2all for (KeysT& i : splitKeys) { if (static_cast(i.size()) > embInfos[batch->name].sendCount) { - LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", batch->name, - batch->channel, batch->batchId, i.size()); - throw runtime_error( - StringFormat("%s[%d]:%d overflow! 
set send count bigger than %d", - batch->name.c_str(), batch->channel, batch->batchId, i.size()) - .c_str()); + LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", batch->name, batch->channel, + batch->batchId, i.size()); + throw runtime_error(StringFormat("%s[%d]:%d overflow! set send count bigger than %d", + batch->name.c_str(), batch->channel, batch->batchId, i.size()) + .c_str()); } i.resize(embInfos[batch->name].sendCount, -1); } @@ -794,8 +759,7 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, KeysT keyRecv; TimeCost getScAllTC; - vector scAll = GetScAll( - sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 + vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 LOG_DEBUG("getScAllTC(ms)(AllReduce-AllGather):{}", getScAllTC.ElapsedMS()); vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 @@ -809,8 +773,8 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, EASY_BLOCK("all2all") TimeCost uniqueAll2AllTC; - int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, keyRecv.data(), - rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, keyRecv.data(), rc.data(), rs.data(), + MPI_INT64_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -828,8 +792,7 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, * splitKeys返回:将数据的key切分到其所在dev id对应的桶中,并去重。 * restore返回:去重后key在桶内偏移量(用于计算恢复向量) */ -tuple, vector> KeyProcess::HashSplit( - const unique_ptr& batch) const +tuple, vector> KeyProcess::HashSplit(const unique_ptr& batch) const { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = batch->sample.data(); @@ -845,8 +808,7 @@ tuple, vector> KeyProcess::HashSplit( auto result = uKey.find(key); if (result == uKey.end()) { splitKeys[devId].push_back(key); - restore[i] = - hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) + restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) uKey[key] = restore[i]; } else { // 去重 restore[i] = result->second; @@ -861,9 +823,8 @@ tuple, vector> KeyProcess::HashSplit( for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO - "channel_id {} batch_id {} rank_id {} batch_key_num {} unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} unique_key_num {}", batch->channel, + batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } return {splitKeys, restore}; } @@ -889,7 +850,7 @@ tuple, vector, vector>> KeyProcess::Hash vector splitKeys(rankInfo.rankSize); vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 absl::flat_hash_map> uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { @@ -898,8 +859,7 @@ tuple, vector, vector>> KeyProcess::Hash auto result = uKey.find(key); if (result == uKey.end()) { splitKeys[devId].push_back(key); - restore[i] = - hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) + restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) uKey[key].first = restore[i]; uKey[key].second = 1; } else { // 
去重 @@ -928,15 +888,13 @@ tuple, vector, vector>> KeyProcess::Hash for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO - "channel_id {} batch_id {} rank_id {} batch_key_num {} faae_unique_key_num {}", + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} faae_unique_key_num {}", batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } return {splitKeys, restore, keyCount}; } -tuple, vector, vector> KeyProcess::HotHashSplit( - const unique_ptr& batch) +tuple, vector, vector> KeyProcess::HotHashSplit(const unique_ptr& batch) { EASY_FUNCTION(profiler::colors::Gold) emb_key_t* batchData = batch->sample.data(); @@ -988,25 +946,22 @@ tuple, vector, vector> KeyProcess::HotHashSplit( for (int devId = 0; devId < rankInfo.rankSize; ++devId) { uniqueKeyNum += splitKeys[devId].size(); } - LOG_INFO(STAT_INFO - "channel_id {} batch_id {} rank_id {} batch_key_num {} hot_unique_key_num {}", + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} hot_unique_key_num {}", batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], - batch->batchId % hotEmbUpdateStep == 0, batch->name); + UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], batch->batchId % hotEmbUpdateStep == 0, + batch->name); AddCountStartToHotPos(splitKeys, hotPos, hotPosDev, batch); return {splitKeys, restore, hotPos}; } -void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, - const vector& hotPosDev, +void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, const unique_ptr& batch) { vector splitKeysSize; for (auto& splitKey : splitKeys) { - int tmp = rankInfo.useStatic ? embInfos[batch->name].sendCount - : static_cast(splitKey.size()); + int tmp = rankInfo.useStatic ? 
embInfos[batch->name].sendCount : static_cast(splitKey.size()); splitKeysSize.push_back(tmp); } @@ -1016,8 +971,8 @@ void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& ho } } -void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, - uint32_t count, bool refresh, const string& embName) +void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, uint32_t count, + bool refresh, const string& embName) { auto& hotMap = hotKey[embName]; if (refresh) { @@ -1041,8 +996,8 @@ void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCountMapByEmbName, - uint32_t count, bool refresh, const string& embName) +void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, + const string& embName) { if (!refresh) { return; @@ -1068,43 +1023,39 @@ void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapBy * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread * id的)线程间的通信量矩阵 scAll返回:所有线程间的通信量矩阵(按行平铺的一维向量) */ -vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, - const unique_ptr& batch) +vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch) { EASY_FUNCTION() vector scAll; scAll.resize(rankInfo.rankSize * rankInfo.rankSize); - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, - batch->batchId); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, batch->batchId); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAll.data(), - rankInfo.rankSize, MPI_INT, comm[batch->channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAll.data(), rankInfo.rankSize, + MPI_INT, comm[batch->channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {} commId {}, MPI_Allgather failed:{}", rankInfo.rankId, commId, retCode); } - LOG_DEBUG( - "channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", - batch->channel, commId, batch->batchId, VectorToString(scAll)); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", batch->channel, + commId, batch->batchId, VectorToString(scAll)); return scAll; } -void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, - const unique_ptr& batch, vector& scAllOut) +void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr& batch, + vector& scAllOut) { EASY_FUNCTION() int channel = batch->channel; scAllOut.resize(rankInfo.rankSize * rankInfo.rankSize); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAllOut.data(), - rankInfo.rankSize, MPI_INT, comm[channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAllOut.data(), rankInfo.rankSize, + MPI_INT, comm[channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Allgather failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG( - "channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", - channel, commId, batch->batchId, VectorToString(scAllOut)); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", channel, commId, + batch->batchId, VectorToString(scAllOut)); } void 
KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel) @@ -1206,8 +1157,7 @@ void KeyProcess::BuildRestoreVec(const unique_ptr& batch, const vecto hotNum += 1; } } - LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", hotNum, batch->Size(), - buildRestoreVecTC.ElapsedMS()); + LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", hotNum, batch->Size(), buildRestoreVecTC.ElapsedMS()); } template @@ -1220,8 +1170,7 @@ T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, in } auto topBatch = get(list[embName][channel].top()); if (topBatch < batch) { - LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, - batch, channel); + LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, batch, channel); this_thread::sleep_for(1s); } if (topBatch != batch) { @@ -1249,10 +1198,8 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG( - KEY_PROCESS - "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", - embName, channel, batch); + LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", + embName, channel, batch); return {}; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1299,27 +1246,22 @@ void KeyProcess::SendEos(int batchId, int channel) vector tensors; bool isNeedResend = true; - for (const auto& emb : - as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, - batchId, emb.first); + for (const auto& emb : as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, batchId, emb.first); if (!isRunning) { throw EndRunExit("SendEos end run, isRunning is false after lock destroyMutex."); } for (const string& transName : usedChannelNames) { - string sendName = - StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); + string sendName = StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); size_t channelSize = 0; acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] Before send eos, {} contains {}.", sendName, channelSize); - SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, - isNeedResend); + SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, isNeedResend); acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] After send eos, {} contains {}.", sendName, channelSize); } - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, - batchId, emb.first); + LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, batchId, emb.first); } LOG_INFO("channelId:{} batchId:{}, SendEos end.", channel, batchId); @@ -1333,8 +1275,7 @@ void KeyProcess::SendEos(int batchId, int channel) /// \param channel 通道索引(训练/推理) /// \param type 数据类型 /// \return -unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, - ProcessedInfo type) +unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type) { TimeCost tc = TimeCost(); info_list_t* list; 
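[Editor's note, illustration only, not part of the patch: several hunks above reformat one recurring communication pattern: each rank Allgathers its per-peer send counts (GetScAll), turns counts into displacements with an exclusive prefix sum (the Count2Start helper), and then exchanges keys with MPI_Alltoallv. Below is a rough mpi4py rendering of that pattern with invented toy counts and keys; run it under mpirun -np <N>. It sketches the idea only and is not the project's code.

    from mpi4py import MPI
    import numpy as np

    comm = MPI.COMM_WORLD
    size, rank = comm.Get_size(), comm.Get_rank()

    # Per-peer send counts for this rank (a stand-in for keyScLocal).
    sc = np.array([rank + peer + 1 for peer in range(size)], dtype=np.int32)

    # Allgather the full size x size count matrix, as GetScAll does.
    sc_all = np.empty(size * size, dtype=np.int32)
    comm.Allgather(sc, sc_all)

    # Receive counts are this rank's column of that matrix; displacements are
    # exclusive prefix sums, which is exactly what Count2Start computes.
    rc = np.ascontiguousarray(sc_all.reshape(size, size)[:, rank])
    ss = np.concatenate(([0], np.cumsum(sc)[:-1])).astype(np.int32)
    rs = np.concatenate(([0], np.cumsum(rc)[:-1])).astype(np.int32)

    send = np.full(sc.sum(), rank, dtype=np.int64)   # fake keys tagged by sender
    recv = np.empty(rc.sum(), dtype=np.int64)
    comm.Alltoallv([send, sc, ss, MPI.INT64_T], [recv, rc, rs, MPI.INT64_T])

After the exchange, recv holds rc[p] keys from peer p starting at offset rs[p], the same layout KeyProcess assumes when it assembles keyRecv.]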
@@ -1359,10 +1300,8 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa // 判断此时的batch id是否已经过期,即通道已经刷新 HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { - LOG_DEBUG( - KEY_PROCESS - "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", - embName, channel, batch); + LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}", + embName, channel, batch); return nullptr; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1381,8 +1320,7 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa unique_lock lockEosGuard(eosMutex); // 避免eos在keyProcess还未处理完数据时插队到通道前面, // readEmbKey真实的次数是readEmbedBatchId减1 - if (isNeedSendEos[channel] && - (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { + if (isNeedSendEos[channel] && (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { LOG_INFO("channelId:{} batchId:{}, GetInfoVec eos.", channel, batch); unique_lock lockDestroyGuard(destroyMutex); SendEos(batch, channel); @@ -1460,19 +1398,17 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector offset) { if (offset.size() > embInfos[embName].devVocabSize) { - LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize); + LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", embName, + offset.size(), embInfos[embName].devVocabSize); throw runtime_error( - Logger::Format( - "{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", - embName, offset.size(), embInfos[embName].devVocabSize) + Logger::Format("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", embName, + offset.size(), embInfos[embName].devVocabSize) .c_str()); } @@ -1489,8 +1425,7 @@ void KeyProcess::EvictInitDeviceEmb(const string& embName, vector offset auto trans = Singleton::GetInstance(); trans->Send(TransferChannel::EVICT, tmpDataOut, TRAIN_CHANNEL_ID, embName); - LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! send offsetSize:{}", embName, - offset.size()); + LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! 
send offsetSize:{}", embName, offset.size()); } string KeyProcess::DumpSplitKeys(vector>& splitKeys) const diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h index d6a0b80b..4dafc07f 100644 --- a/src/core/key_process/key_process.h +++ b/src/core/key_process/key_process.h @@ -81,8 +81,7 @@ public: bool Initialize(const RankInfo& rInfo, const vector& eInfos, const vector& thresholdValues = {}, int seed = 0); - unique_ptr> GetInfoVec(int batch, const string& embName, int channel, - ProcessedInfo type); + unique_ptr> GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type); KeysT GetLookupKeys(int batch, const string& embName, int channel); @@ -129,8 +128,8 @@ public: for (size_t i = 0; i < lookupKeys.size(); ++i) { int64_t key = lookupKeys[i]; - if (rankInfo.useStatic && ((!rankInfo.useDynamicExpansion && key == -1) || - (rankInfo.useDynamicExpansion && key == 0))) { + if (rankInfo.useStatic + && ((!rankInfo.useDynamicExpansion && key == -1) || (rankInfo.useDynamicExpansion && key == 0))) { continue; } @@ -205,42 +204,38 @@ public: bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); - bool KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, - ock::ctr::UniquePtr& unique, int channel, int threadId); + bool KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, ock::ctr::UniquePtr& unique, int channel, + int threadId); - tuple, vector> ProcessSplitKeys(const unique_ptr& batch, - int id, vector& splitKeys); + tuple, vector> ProcessSplitKeys(const unique_ptr& batch, int id, + vector& splitKeys); void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); - void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, - bool& uniqueInitialize, const unique_ptr& batch, - ock::ctr::UniquePtr& unique); + void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, + const unique_ptr& batch, ock::ctr::UniquePtr& unique); - void ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, - int id, UniqueInfo& uniqueInfoOut); + void ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, int id, + UniqueInfo& uniqueInfoOut); size_t GetKeySize(const unique_ptr& batch); - void All2All(vector& sc, int id, const unique_ptr& batch, - KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut); + void All2All(vector& sc, int id, const unique_ptr& batch, KeySendInfo& keySendInfo, + All2AllInfo& all2AllInfoOut); - auto HashSplit(const unique_ptr& batch) const - -> tuple, vector>; + auto HashSplit(const unique_ptr& batch) const -> tuple, vector>; - auto HotHashSplit(const unique_ptr& batch) - -> tuple, vector, vector>; + auto HotHashSplit(const unique_ptr& batch) -> tuple, vector, vector>; void PaddingAlltoallVC(vector& splitKeys) const; tuple, vector, vector>> HashSplitWithFAAE( const unique_ptr& batch) const; - vector GetScAll(const vector& keyScLocal, int commId, - const unique_ptr& batch); + vector GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch); - void GetScAllForUnique(const vector& keyScLocal, int commId, - const unique_ptr& batch, vector& scAllOut); + void GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr& batch, + vector& scAllOut); void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); @@ -248,8 +243,8 @@ public: unique_ptr GetBatchData(int channel, int commId) const; - void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, - vector& restoreVec, int 
hotPosSize = 0) const; + void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, vector& restoreVec, + int hotPosSize = 0) const; void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); @@ -257,35 +252,30 @@ public: void EvictInitDeviceEmb(const string& embName, vector offset); - void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, - bool refresh, const string& embName); + void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, + const string& embName); - void UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, - uint32_t count, bool refresh, const string& embName); + void UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, uint32_t count, bool refresh, + const string& embName); - void HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, - KeySendInfo& keySendInfo, vector& sc, vector& splitSize); + void HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, KeySendInfo& keySendInfo, + vector& sc, vector& splitSize); - void PushResult(unique_ptr& batch, unique_ptr> tensors, - KeysT& lookupKeys); + void PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys); - void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, - int channel); + void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel); - void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, - const vector& hotPosDev, const unique_ptr& batch); + void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, + const unique_ptr& batch); - void ComputeHotPos(const unique_ptr& batch, - absl::flat_hash_map& hotMap, vector& hotPos, - vector& restore, const int hotOffset) const; + void ComputeHotPos(const unique_ptr& batch, absl::flat_hash_map& hotMap, + vector& hotPos, vector& restore, const int hotOffset) const; - vector GetCountRecv(const unique_ptr& batch, int id, - vector>& keyCount, vector scAll, - vector ss); + vector GetCountRecv(const unique_ptr& batch, int id, vector>& keyCount, + vector scAll, vector ss); - void HashSplitHelper(const unique_ptr& batch, vector& splitKeys, - vector& restore, vector& hotPos, - vector>& keyCount); + void HashSplitHelper(const unique_ptr& batch, vector& splitKeys, vector& restore, + vector& hotPos, vector>& keyCount); template inline vector Count2Start(const vector& count) const -- Gitee From b2a422158e202b4185d556e11fb6368f5f7d5932 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 22 Apr 2024 17:04:31 +0800 Subject: [PATCH 055/302] clean code --- .clang-format | 3 ++- src/core/key_process/key_process.cpp | 35 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.clang-format b/.clang-format index ee9f3a3c..c1bb9720 100644 --- a/.clang-format +++ b/.clang-format @@ -10,6 +10,7 @@ AlignTrailingComments: true DerivePointerAlignment: false PointerAlignment: Left AllowAllParametersOfDeclarationOnNextLine: false +AllowAllArgumentsOnNextLine: false AllowShortBlocksOnASingleLine: Empty AllowShortCaseLabelsOnASingleLine: false AllowShortEnumsOnASingleLine: false @@ -34,7 +35,7 @@ BraceWrapping: BeforeCatch: false BeforeElse: false IndentBraces: false -BreakBeforeBinaryOperators: NonAssignment +BreakBeforeBinaryOperators: None BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakStringLiterals: true diff --git a/src/core/key_process/key_process.cpp 
b/src/core/key_process/key_process.cpp index 9751e268..c5ec9204 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -134,8 +134,8 @@ void KeyProcess::InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo) if (rankInfo.useDynamicExpansion) { embeddingSize = info.embeddingSize; } - hotEmbTotCount[info.name] = static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) - * HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); + hotEmbTotCount[info.name] = static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * + HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); } OffsetMemT KeyProcess::GetMaxOffset() @@ -334,8 +334,8 @@ void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& hotPos, vector>& keyCount) { TimeCost uniqueTc; - if (m_featureAdmitAndEvict.GetFunctionSwitch() - && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { + if (m_featureAdmitAndEvict.GetFunctionSwitch() && + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 } else { tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 @@ -347,8 +347,8 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch int channel, int threadId) { // tuple for keyRec restore hotPos scAll countRecv - isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() - && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; + isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() && + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; TimeCost totalTimeCost = TimeCost(); TimeCost fastUniqueTC; UniqueInfo uniqueInfo; @@ -356,10 +356,9 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch LOG_DEBUG("ProcessBatchWithFastUnique(ms):{}", fastUniqueTC.ElapsedMS()); // 特征准入&淘汰 - if (isWithFAAE - && (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, uniqueInfo.all2AllInfo.keyRecv, - uniqueInfo.all2AllInfo.countRecv) - == FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + const auto errStatus = FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR; + if (isWithFAAE && (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, uniqueInfo.all2AllInfo.keyRecv, + uniqueInfo.all2AllInfo.countRecv) == errStatus)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, threadId, channel); return false; @@ -412,8 +411,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, auto [lookupKeys, scAll, ss] = ProcessSplitKeys(batch, threadId, splitKeys); vector countRecv; - if (m_featureAdmitAndEvict.GetFunctionSwitch() - && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { + if (m_featureAdmitAndEvict.GetFunctionSwitch() && + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { countRecv = GetCountRecv(batch, threadId, keyCount, scAll, ss); } std::lock_guard lock(loadSaveMut[channel][threadId]); @@ -421,10 +420,10 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, BuildRestoreVec(batch, ss, restore, static_cast(hotPos.size())); // 特征准入&淘汰 - if (m_featureAdmitAndEvict.GetFunctionSwitch() - && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE - && (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) - == 
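Editor's note: the key_process.cpp hunks in this commit are mechanical reflows driven by the `.clang-format` change above: `BreakBeforeBinaryOperators: None` places the break *after* a binary operator, whereas the previous `NonAssignment` setting broke *before* non-assignment operators. A compilable illustration with invented condition names:

// Old (.clang-format: BreakBeforeBinaryOperators: NonAssignment):
//     return switchEnabled
//            && statusKnown;
// New (.clang-format: BreakBeforeBinaryOperators: None):
bool BothHold(bool switchEnabled, bool statusKnown)
{
    return switchEnabled &&
           statusKnown;
}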
FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + if (m_featureAdmitAndEvict.GetFunctionSwitch() && + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE && + (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) == + FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, threadId, channel); return false; @@ -464,8 +463,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, void KeyProcess::PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel) { - if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY - && channel == TRAIN_CHANNEL_ID) { + if (GlobalEnv::applyGradientsStrategy == ApplyGradientsStrategyOptions::SUM_SAME_ID_GRADIENTS_AND_APPLY && + channel == TRAIN_CHANNEL_ID) { KeysT uniqueKeys; vector restoreVecSec; -- Gitee From 622cde53968d0c31535f50a0257442ef6f996a7c Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 23 Apr 2024 15:42:53 +0800 Subject: [PATCH 056/302] =?UTF-8?q?warm=20start=E5=8A=9F=E8=83=BD=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=EF=BC=8C=E5=AE=9E=E7=8E=B0=E4=BB=8E=E5=A4=9A=E4=B8=AA?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E8=B7=AF=E5=BE=84=E5=8A=A0=E8=BD=BD=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E5=8F=82=E6=95=B0=E3=80=81=E7=A8=80=E7=96=8F=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/warm_start.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index 53324b06..31a5e358 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -80,21 +80,19 @@ def patch_for_func_warm_start(func): def wrapper(*args, **kwargs): ckpt_to_initialize_from = args[0] if isinstance(ckpt_to_initialize_from, (list, tuple)): - vars_to_warm_start_list = kwargs.get('vars_to_warm_start') - var_name_to_prev_var_name_list = kwargs.get('var_name_to_prev_var_name') - results = [] + vars_to_warm_start_list = args[1] + var_name_to_prev_var_name_list = args[3] for i in range(len(ckpt_to_initialize_from)): - results.append( - func(ckpt_to_initialize_from[i], vars_to_warm_start_list[i], var_name_to_prev_var_name_list[i], - args[3:], **kwargs)) - return results + f = func(ckpt_to_initialize_from[i], vars_to_warm_start_list[i], var_name_to_prev_var_name_list[i], + args[3:], **kwargs) + return f else: return func(*args, **kwargs) return wrapper def patch_for_estimator_train(func): def warpper(*args, **kwargs): - hooks = kwargs.get('hook', []) + hooks = kwargs.get('hooks', []) if WarmStartController().get_elements(): hooks.append(SparseRestoreHook()) return func(*args, *kwargs) @@ -193,11 +191,10 @@ def _warm_settings_filter(warm_start_setting): # 如果匹配到了,那么这个warm_start_settings对于dense部分就是无效的 # add WarmStartController(path:table_name) if matching_tables: - warm_start_setting = None #add controller to set sparse WarmStartController().add_element(vars_to_warm_start.ckpt_to_initialize_from, matching_tables) - if vars_to_warm_start != ".*": - return None + if vars_to_warm_start != ".*": + return None # path: embedding_table_name return warm_start_setting elif all(isinstance(v, str) for v in vars_to_warm_start): -- Gitee From 32685509402c9cf3a2fffc3a5c762c146788fe1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 23 Apr 2024 16:08:27 +0800 Subject: 
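Editor's note: in the warm-start wrapper committed above, `f = func(...)` is followed by `return f` at what appears to be loop depth, so only the first checkpoint in the list would ever be warm-started; the deleted `results` list was what allowed every checkpoint to be processed. A sketch of a multi-checkpoint fan-out that keeps all results, assuming the positional layout used in the diff (args[0] checkpoint paths, args[1] variable specs, args[3] name maps); forwarding the trailing positionals as *args[4:] is also an assumption, since the diff passes args[3:] as a single argument:

def patch_for_func_warm_start(func):
    def wrapper(*args, **kwargs):
        ckpt_paths = args[0]
        if not isinstance(ckpt_paths, (list, tuple)):
            return func(*args, **kwargs)
        results = []
        for ckpt, var_spec, name_map in zip(ckpt_paths, args[1], args[3]):
            # one warm start per checkpoint; collect instead of returning early
            results.append(func(ckpt, var_spec, name_map, *args[4:], **kwargs))
        return results
    return wrapper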
[PATCH 057/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=A0=B9=E6=8D=AE=E4=BC=98?=
 =?UTF-8?q?=E5=8C=96=E5=99=A8=E7=B1=BB=E5=9E=8B=E8=87=AA=E5=8A=A8=E5=88=A4?=
 =?UTF-8?q?=E6=96=AD=E6=98=AF=E5=90=A6=E5=BC=80=E5=90=AF=E5=85=A8=E5=B1=80?=
 =?UTF-8?q?=E5=8E=BB=E9=87=8D=E7=89=B9=E6=80=A7?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 mx_rec/core/asc/manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py
index ef4597b2..64611295 100644
--- a/mx_rec/core/asc/manager.py
+++ b/mx_rec/core/asc/manager.py
@@ -204,7 +204,7 @@ def initialize_emb_cache(table_info_list, threshold_list):
         option = option | USE_DYNAMIC_EXPANSION
     optimizer = ConfigInitializer.get_instance().optimizer_config.optimizer_instance
-    if optimizer.derivative == 2:
+    if optimizer and optimizer.derivative == 2:
         option = option | USE_SUM_SAME_ID_GRADIENTS
     # [train_steps, eval_steps, save_steps] pass step information to HybridMgmt for data process loop
-- Gitee

From c13c7a6e9267e7b87cc77ce1798ffa1f179f0f69 Mon Sep 17 00:00:00 2001
From: longfeifei <962977793@qq.com>
Date: Wed, 24 Apr 2024 16:53:18 +0800
Subject: [PATCH 058/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E9=80=9A=E8=AE=AF?=
 =?UTF-8?q?=E7=9F=A9=E9=98=B5=EF=BC=8C=E4=BC=98=E5=8C=96=E6=8F=8F=E8=BF=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 ...\344\277\241\347\237\251\351\230\265.xlsx" | Bin 31412 -> 31424 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git "a/docs/MindX 6.0.RC1 \351\200\232\344\277\241\347\237\251\351\230\265.xlsx" "b/docs/MindX 6.0.RC1 \351\200\232\344\277\241\347\237\251\351\230\265.xlsx"
index 5224de2b2f5ed98b0b0d0b0c15e65ebf98ad993e..9e14cd61ae280c9144d8def5fec2024625260f48 100644
GIT binary patch
delta 8975
[base85-encoded binary delta omitted: opaque xlsx payload, not reproducible as text]
delta 8989
[base85-encoded binary delta omitted: opaque xlsx payload, not reproducible as text]
-- Gitee

From 54b16a1e0f01ef660f27c66423a6b5e34294d33e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Wed, 24 Apr 2024 17:07:32 +0800
Subject: [PATCH 059/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91slot=E5=92=8Cderivative?=
 =?UTF-8?q?=E7=A7=BB=E8=87=B3=E4=B8=8A=E5=B1=82base?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 examples/dlrm/model/gradient_descent_w.py     |  8 --------
 mx_rec/optimizers/adagrad.py                  |  8 --------
 mx_rec/optimizers/base.py                     | 10 ++++++++++
 mx_rec/optimizers/ftrl.py                     |  8 --------
 mx_rec/optimizers/gradient_descent.py         |  8 --------
 mx_rec/optimizers/gradient_descent_by_addr.py |  8 --------
 mx_rec/optimizers/lazy_adam.py                |  8 --------
 mx_rec/optimizers/lazy_adam_by_addr.py        |  8 --------
 8
files changed, 10 insertions(+), 56 deletions(-) diff --git a/examples/dlrm/model/gradient_descent_w.py b/examples/dlrm/model/gradient_descent_w.py index 6c34b726..a2a5635a 100644 --- a/examples/dlrm/model/gradient_descent_w.py +++ b/examples/dlrm/model/gradient_descent_w.py @@ -50,14 +50,6 @@ class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOp self._slot_num = 0 self._derivative = 1 - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): logger.info("no slot for gradient descent") return [] diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py index fe8a0a2d..125346b9 100644 --- a/mx_rec/optimizers/adagrad.py +++ b/mx_rec/optimizers/adagrad.py @@ -80,14 +80,6 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): self._slot_num = 1 self._derivative = 2 - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. def creat_one_single_slot(var, op_name): diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index 49594d40..ed765539 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -79,6 +79,16 @@ class CustomizedOptimizer: def __init__(self): self.unique_name = "" self.base_name = "" + self._slot_num = 0 # 优化器对应slot的个数 + self._derivative = 1 # 优化器阶数,如果不做全局去重可以数学等价,则为1阶,其余2阶 + + @property + def slot_num(self): + return self._slot_num + + @property + def derivative(self): + return self._derivative @staticmethod def sum_same_id_gradients(grad, var, is_expansion): diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py index 855fa9c4..ef617c2d 100644 --- a/mx_rec/optimizers/ftrl.py +++ b/mx_rec/optimizers/ftrl.py @@ -82,14 +82,6 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): self._slot_num = 2 self._derivative = 2 - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): val = constant_op.constant( self._initial_accumulator_value, dtype=var.dtype, shape=var.get_shape()) diff --git a/mx_rec/optimizers/gradient_descent.py b/mx_rec/optimizers/gradient_descent.py index 2ba72789..d021f69f 100644 --- a/mx_rec/optimizers/gradient_descent.py +++ b/mx_rec/optimizers/gradient_descent.py @@ -57,14 +57,6 @@ class CustomizedGradientDescent(gradient_descent.GradientDescentOptimizer, Custo self._slot_num = 0 self._derivative = 1 - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/gradient_descent_by_addr.py b/mx_rec/optimizers/gradient_descent_by_addr.py index 11a9fda6..9db7c2ae 100644 --- a/mx_rec/optimizers/gradient_descent_by_addr.py +++ b/mx_rec/optimizers/gradient_descent_by_addr.py @@ -62,14 +62,6 @@ class CustomizedGradientDescentByAddr(gradient_descent.GradientDescentOptimizer, self._slot_num = 0 self._derivative = 1 - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): return [] diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 6ac7e844..1f491d14 100644 --- 
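Editor's note: this commit de-duplicates the `slot_num`/`derivative` properties into the `CustomizedOptimizer` base class, so each optimizer now only sets the two backing fields. The resulting shape, reduced to the essentials (the real subclasses also inherit from the corresponding TensorFlow optimizer; comments translate the Chinese field comments in base.py):

class CustomizedOptimizer:
    def __init__(self):
        self.unique_name = ""
        self.base_name = ""
        self._slot_num = 0    # number of optimizer slots per variable
        self._derivative = 1  # 1 if global dedup is mathematically equivalent, else 2

    @property
    def slot_num(self):
        return self._slot_num

    @property
    def derivative(self):
        return self._derivative


class CustomizedLazyAdam(CustomizedOptimizer):
    def __init__(self):
        super().__init__()
        self._slot_num = 2    # first- and second-moment slots
        self._derivative = 2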
a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -74,14 +74,6 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): self._slot_num = 2 self._derivative = 2 - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative - def initialize_slots(self, var, table_instance): # Create slots for the first and second moments. def creat_one_single_slot(var, op_name): diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index cd4ee878..f1f8a2df 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -75,14 +75,6 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): self._slot_num = 2 self._derivative = 2 - @property - def slot_num(self): - return self._slot_num - - @property - def derivative(self): - return self._derivative - def get_slot_init_values(self): # return state value list of adam that needs to initialize in ASC DDR. initial_momentum_value = 0.0 -- Gitee From b40e8d32057f45840e66bb361691e5420d6a2785 Mon Sep 17 00:00:00 2001 From: rome_zhouyang <9538256+rome_sky@user.noreply.gitee.com> Date: Wed, 24 Apr 2024 20:35:53 +0800 Subject: [PATCH 060/302] add FasterKV --- src/core/utils/MapperFast.cpp | 262 ++++++++++++++++++++++++++++++++++ src/core/utils/MapperFast.h | 97 +++++++++++++ 2 files changed, 359 insertions(+) create mode 100644 src/core/utils/MapperFast.cpp create mode 100644 src/core/utils/MapperFast.h diff --git a/src/core/utils/MapperFast.cpp b/src/core/utils/MapperFast.cpp new file mode 100644 index 00000000..3ed25102 --- /dev/null +++ b/src/core/utils/MapperFast.cpp @@ -0,0 +1,262 @@ +// +// Created by z00576261 on 2024/4/15. +// + +#include "MapperFast.h" +#include +#include +#include +#include + +RecMapper::BuckStatus RecMapper::InnerBuck::Insert(uint64_t key, uint64_t& value, std::function ValueSet) +{ + for (int i = 0; i < BUCKCAPACITY; ++i){ + uint64_t old_key = 0; + if (keys_[i].load(std::memory_order_relaxed) == 0 && keys_[i].compare_exchange_strong(old_key, key)){ + bool ret = ValueSet(); + if (!ret){ + keys_[i].store(0); + return BuckStatus::BUCK_ERROR; + } + values_[i] = value; + return BuckStatus::BUCK_NOEXIST; + } + } + return BuckStatus::BUCK_ERROR; +} + +RecMapper::BuckStatus RecMapper::InnerBuck::Find(uint64_t key, uint64_t& value) +{ + for (int i = 0; i < BUCKCAPACITY; ++i){ + if (keys_[i].load(std::memory_order_relaxed) == key){ + value = values_[i]; + return BuckStatus::BUCK_EXIST; + } + } + return BuckStatus::BUCK_NOEXIST; +} + +RecMapper::BuckStatus RecMapper::InnerBuck::Remove(uint64_t key) +{ + for (int i = 0; i < BUCKCAPACITY; ++i) { + uint64_t oldkey = key; + if (keys_[i].load(std::memory_order_relaxed) == key){ + if (keys_[i].compare_exchange_strong(oldkey, 0)){ + values_[i] = 0; + return BuckStatus::BUCK_EXIST; + } + } + } + return BUCK_ERROR; +} + +bool RecMapper::MapperFast::InitializeBuck() +{ + uint16_t i = 0; + + while(i <= prime_max){ + if (pow(2, i) < reserve_){ + i++; + continue; + } + break; + } + buck_count_ = i < 7 ? 
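Editor's note: `InnerBuck::Insert` above claims a slot lock-free: key 0 marks an empty slot, and `compare_exchange_strong` installs the new key only if the slot is still empty, so two threads racing for the same slot cannot both win. The claim step in isolation (a sketch; slot count and names are illustrative):

#include <atomic>
#include <cstdint>

// Try to claim one of N slots for `key`; returns the slot index or -1.
// Assumes key != 0, because 0 doubles as the "empty" marker.
template <int N>
int ClaimSlot(std::atomic<std::uint64_t> (&slots)[N], std::uint64_t key)
{
    for (int i = 0; i < N; ++i) {
        std::uint64_t expected = 0;
        if (slots[i].load(std::memory_order_relaxed) == 0 &&
            slots[i].compare_exchange_strong(expected, key)) {
            return i; // this thread owns the slot now
        }
    }
    return -1; // bucket full; the caller chains a new bucket
}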
128 : pow(2, i); + + for(auto &buck_map : buck_maps_){ + InnerBuck* buck_map_temp = new (std::nothrow) InnerBuck[buck_count_]; + if (buck_map_temp == nullptr) { + FreeBuckMaps(); + return false; + } + memset(buck_map_temp, 0, sizeof(InnerBuck) * buck_count_); + buck_map = buck_map_temp; + } + return true; +} + +void RecMapper::MapperFast::UnInitializeBuck() +{ + FreeBuckExpend(); + FreeBuckMaps(); +} + +void RecMapper::MapperFast::FreeBuckMaps() +{ + for (auto &buck_map : buck_maps_){ + if (buck_map != nullptr){ + delete[] buck_map; + buck_map = nullptr; + } + } +} + +void RecMapper::MapperFast::FreeBuckExpend() +{ + for (auto &buck_map : buck_maps_ ){ + if (buck_map == nullptr){ + continue; + } + for (uint32_t i = 0; i < buck_count_; ++i){ + InnerBuck* buck_attch = buck_map[i].next_; + while (buck_attch != nullptr){ + InnerBuck* buck_attch_temp = buck_attch->next_; + delete buck_attch; + buck_attch = buck_attch_temp; + } + } + } +} + +RecMapper::MapperStatus RecMapper::MapperFast::Put(uint64_t key, uint64_t& value) +{ + if (size_.load() > capacity_){ + return MapperStatus::MAPPER_ERROR; + } + + if(key == 0){ + if (spec_buck != nullptr) { + spec_buck->spin.lock(); + spec_buck->Find(key, value); + spec_buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + spec_buck = new (std::nothrow) InnerBuck; + memset(spec_buck, 0, sizeof(InnerBuck)); + spec_buck->spin.lock(); + spec_buck->keys_[0].store(key); + spec_buck->values_[0] = offset_.fetch_add(1) + 1; + size_.fetch_add(1); + spec_buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); + //first,find key if exist in buck + while(buck != nullptr){ + buck->spin.lock(); + if(buck->Find(key, value) == BuckStatus::BUCK_EXIST){ + buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + buck->spin.unlock(); + if(buck->next_ != nullptr){ + buck = buck->next_; + } else{ + break; + } + } + + //if not find, + for (int i = 0; i < 8192; ++i){ + // insert exist buck + while(buck != nullptr){ + buck->spin.lock(); + auto value_func = [&]() ->bool { + value = offset_.fetch_add(1); + return true;}; + BuckStatus ret = buck->Insert(key, value, value_func); + + buck->spin.unlock(); + if (ret == BuckStatus::BUCK_ERROR) { + return MapperStatus::MAPPER_ERROR; + } else if (ret == BuckStatus::BUCK_NOEXIST) { + size_.fetch_add(1); + return MapperStatus::MAPPER_OK; + } + if (buck->next_ != nullptr) { + buck = buck->next_; + } else { + break; + } + } + + //insert not exist buck + auto& old_spin = buck->spin; + old_spin.lock(); + if (buck->next_ != nullptr) { + buck = buck->next_; + old_spin.unlock(); + continue; + } + + InnerBuck* new_buck = new (std::nothrow) InnerBuck; + memset(new_buck, 0, sizeof(InnerBuck)); + buck->next_ = new_buck; + buck = new_buck; + old_spin.unlock(); + } + return MapperStatus::MAPPER_ERROR; +} + +RecMapper::MapperStatus RecMapper::MapperFast::Find(uint64_t key, uint64_t& value) { + if(key == 0) { + if (spec_buck != nullptr) { + spec_buck->spin.lock(); + value = spec_buck->values_[0]; + spec_buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + return MapperStatus::MAPPER_INVALID; + } + InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); + if (buck == nullptr) { + return MapperStatus::MAPPER_ERROR; + } + if (buck->Find(key,value) == BuckStatus::BUCK_NOEXIST) { + return MapperStatus::MAPPER_INVALID; + } + return MapperStatus::MAPPER_OK; +} + +RecMapper::MapperStatus RecMapper::MapperFast::Remove(uint64_t key) +{ + if(key == 0) { + if 
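Editor's note: `InitializeBuck` above sizes the bucket array to the smallest power of two >= `reserve_`, with a floor of 128 (the `i < 7` case). The same rounding can be done with shifts, avoiding the floating-point round-trip through `pow`; a sketch assuming the reserve fits in 32 bits, as the original's `prime_max = 32` implies:

#include <cstdint>

// Smallest power of two >= n, clamped below at 128.
std::uint32_t BucketCount(std::uint32_t n)
{
    std::uint32_t count = 128;
    while (count < n) {
        count <<= 1;
    }
    return count;
}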
(spec_buck != nullptr) { + delete spec_buck; + spec_buck = nullptr; + size_.fetch_sub(1); + return MapperStatus::MAPPER_OK; + } + return MapperStatus::MAPPER_INVALID; + } + InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); + while(buck != nullptr) { + uint64_t value; + if (buck->Find(key, value) == BuckStatus::BUCK_NOEXIST) { + return MapperStatus::MAPPER_INVALID; + } + + buck->spin.lock(); + if (buck->Remove(key) == BuckStatus::BUCK_EXIST){ + size_.fetch_sub(1); + return MapperStatus::MAPPER_OK; + } + buck = buck->next_; + } + return MapperStatus::MAPPER_INVALID; +} + +RecMapper::MapperStatus RecMapper::MapperFast::ToVector(std::vector>& vec) +{ + if (spec_buck != nullptr) { + vec.push_back(std::make_pair(spec_buck->keys_[0], spec_buck->values_[0])); + } + for (auto& sub_map : buck_maps_){ + if (sub_map == nullptr){ + continue; + } + for(int i = 0; i < buck_count_; ++i){ + InnerBuck* buck = &sub_map[i]; + while(buck) { + for (int j = 0; j < BUCKCAPACITY; ++j){ + if (buck->keys_[j] == 0) { + continue; + } + vec.push_back(std::make_pair(buck->keys_[j], buck->values_[j])); + } + buck = buck->next_; + } + } + } + return MapperStatus::MAPPER_OK; +} \ No newline at end of file diff --git a/src/core/utils/MapperFast.h b/src/core/utils/MapperFast.h new file mode 100644 index 00000000..0ad73d5b --- /dev/null +++ b/src/core/utils/MapperFast.h @@ -0,0 +1,97 @@ +// +// Created by z00576261 on 2024/4/15. +// + +#ifndef FAST_MAPPERFAST_H +#define FAST_MAPPERFAST_H + +#include +#include +#include +#include +#include + +namespace RecMapper { + constexpr int BUCKCAPACITY = 3; + enum BuckStatus{ + BUCK_EXIST, + BUCK_NOEXIST, + BUCK_ERROR + }; + + enum MapperStatus{ + MAPPER_ERROR, + MAPPER_INVALID, + MAPPER_OK + }; + + class SpinLock { + public: + SpinLock() = default; + SpinLock(const SpinLock&) = delete; + SpinLock& operator=(const SpinLock) = delete; + + void lock() { + while(f.test_and_set(std::memory_order_acquire)); + } + + void unlock() { + f.clear(std::memory_order_release); + } + + private: + std::atomic_flag f; + }; + + struct InnerBuck{ + std::atomic keys_[BUCKCAPACITY]{}; + int64_t values_[BUCKCAPACITY]{}; + InnerBuck* next_ = nullptr; + SpinLock spin; + + BuckStatus Insert(uint64_t, uint64_t&, std::function); + BuckStatus Find(uint64_t, uint64_t&); + BuckStatus Remove(uint64_t); + + }; + + class MapperFast { + public: + MapperFast(uint64_t cap, uint64_t res) : capacity_(cap), reserve_(res) {}; + + ~MapperFast() = default; + + bool InitializeBuck(); + void UnInitializeBuck(); + + MapperStatus Put(uint64_t key, uint64_t& value); + + MapperStatus Find(uint64_t key, uint64_t& value); + + MapperStatus Remove(uint64_t key); + + MapperStatus ToVector(std::vector>& vec); + + uint64_t Size() { + return size_.load(); + } + + private: + void FreeBuckMaps(); + void FreeBuckExpend(); + + std::atomic size_{ 0 }; + std::atomic offset_{ 0 }; + uint64_t capacity_; + uint64_t reserve_; + uint32_t buck_count_; + + static constexpr uint32_t sub_map_count = 5; + static constexpr uint32_t prime_max = 32; + + InnerBuck* buck_maps_[sub_map_count] {}; + InnerBuck* spec_buck = nullptr; + }; +} + +#endif //FAST_MAPPERFAST_H -- Gitee From b2125d0db79021a5f1142c161fdcea90395a7cfd Mon Sep 17 00:00:00 2001 From: rome_zhouyang <9538256+rome_sky@user.noreply.gitee.com> Date: Wed, 24 Apr 2024 20:43:14 +0800 Subject: [PATCH 061/302] add FasterKV fix --- src/core/utils/mapper_fast.cpp | 262 +++++++++++++++++++++++++++++++++ src/core/utils/mapper_fast.h | 97 ++++++++++++ 2 files changed, 359 insertions(+) 
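Editor's note: the `SpinLock` in MapperFast.h is the textbook `std::atomic_flag` spinlock: `test_and_set(acquire)` loops until the flag was previously clear, and `clear(release)` publishes the critical section's writes to the next acquirer. One detail worth flagging: the committed class leaves the flag default-initialized, which is only guaranteed to start clear from C++20 on; pre-C++20 code normally writes `ATOMIC_FLAG_INIT`, as in this sketch:

#include <atomic>

class SpinLock {
public:
    void lock()
    {
        // spins while another thread holds the flag
        while (flag_.test_and_set(std::memory_order_acquire)) {
        }
    }
    void unlock()
    {
        flag_.clear(std::memory_order_release);
    }

private:
    std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
};

// usage: lk.lock(); ...critical section...; lk.unlock();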
create mode 100644 src/core/utils/mapper_fast.cpp create mode 100644 src/core/utils/mapper_fast.h diff --git a/src/core/utils/mapper_fast.cpp b/src/core/utils/mapper_fast.cpp new file mode 100644 index 00000000..3ed25102 --- /dev/null +++ b/src/core/utils/mapper_fast.cpp @@ -0,0 +1,262 @@ +// +// Created by z00576261 on 2024/4/15. +// + +#include "MapperFast.h" +#include +#include +#include +#include + +RecMapper::BuckStatus RecMapper::InnerBuck::Insert(uint64_t key, uint64_t& value, std::function ValueSet) +{ + for (int i = 0; i < BUCKCAPACITY; ++i){ + uint64_t old_key = 0; + if (keys_[i].load(std::memory_order_relaxed) == 0 && keys_[i].compare_exchange_strong(old_key, key)){ + bool ret = ValueSet(); + if (!ret){ + keys_[i].store(0); + return BuckStatus::BUCK_ERROR; + } + values_[i] = value; + return BuckStatus::BUCK_NOEXIST; + } + } + return BuckStatus::BUCK_ERROR; +} + +RecMapper::BuckStatus RecMapper::InnerBuck::Find(uint64_t key, uint64_t& value) +{ + for (int i = 0; i < BUCKCAPACITY; ++i){ + if (keys_[i].load(std::memory_order_relaxed) == key){ + value = values_[i]; + return BuckStatus::BUCK_EXIST; + } + } + return BuckStatus::BUCK_NOEXIST; +} + +RecMapper::BuckStatus RecMapper::InnerBuck::Remove(uint64_t key) +{ + for (int i = 0; i < BUCKCAPACITY; ++i) { + uint64_t oldkey = key; + if (keys_[i].load(std::memory_order_relaxed) == key){ + if (keys_[i].compare_exchange_strong(oldkey, 0)){ + values_[i] = 0; + return BuckStatus::BUCK_EXIST; + } + } + } + return BUCK_ERROR; +} + +bool RecMapper::MapperFast::InitializeBuck() +{ + uint16_t i = 0; + + while(i <= prime_max){ + if (pow(2, i) < reserve_){ + i++; + continue; + } + break; + } + buck_count_ = i < 7 ? 128 : pow(2, i); + + for(auto &buck_map : buck_maps_){ + InnerBuck* buck_map_temp = new (std::nothrow) InnerBuck[buck_count_]; + if (buck_map_temp == nullptr) { + FreeBuckMaps(); + return false; + } + memset(buck_map_temp, 0, sizeof(InnerBuck) * buck_count_); + buck_map = buck_map_temp; + } + return true; +} + +void RecMapper::MapperFast::UnInitializeBuck() +{ + FreeBuckExpend(); + FreeBuckMaps(); +} + +void RecMapper::MapperFast::FreeBuckMaps() +{ + for (auto &buck_map : buck_maps_){ + if (buck_map != nullptr){ + delete[] buck_map; + buck_map = nullptr; + } + } +} + +void RecMapper::MapperFast::FreeBuckExpend() +{ + for (auto &buck_map : buck_maps_ ){ + if (buck_map == nullptr){ + continue; + } + for (uint32_t i = 0; i < buck_count_; ++i){ + InnerBuck* buck_attch = buck_map[i].next_; + while (buck_attch != nullptr){ + InnerBuck* buck_attch_temp = buck_attch->next_; + delete buck_attch; + buck_attch = buck_attch_temp; + } + } + } +} + +RecMapper::MapperStatus RecMapper::MapperFast::Put(uint64_t key, uint64_t& value) +{ + if (size_.load() > capacity_){ + return MapperStatus::MAPPER_ERROR; + } + + if(key == 0){ + if (spec_buck != nullptr) { + spec_buck->spin.lock(); + spec_buck->Find(key, value); + spec_buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + spec_buck = new (std::nothrow) InnerBuck; + memset(spec_buck, 0, sizeof(InnerBuck)); + spec_buck->spin.lock(); + spec_buck->keys_[0].store(key); + spec_buck->values_[0] = offset_.fetch_add(1) + 1; + size_.fetch_add(1); + spec_buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); + //first,find key if exist in buck + while(buck != nullptr){ + buck->spin.lock(); + if(buck->Find(key, value) == BuckStatus::BUCK_EXIST){ + buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + buck->spin.unlock(); + 
if(buck->next_ != nullptr){ + buck = buck->next_; + } else{ + break; + } + } + + //if not find, + for (int i = 0; i < 8192; ++i){ + // insert exist buck + while(buck != nullptr){ + buck->spin.lock(); + auto value_func = [&]() ->bool { + value = offset_.fetch_add(1); + return true;}; + BuckStatus ret = buck->Insert(key, value, value_func); + + buck->spin.unlock(); + if (ret == BuckStatus::BUCK_ERROR) { + return MapperStatus::MAPPER_ERROR; + } else if (ret == BuckStatus::BUCK_NOEXIST) { + size_.fetch_add(1); + return MapperStatus::MAPPER_OK; + } + if (buck->next_ != nullptr) { + buck = buck->next_; + } else { + break; + } + } + + //insert not exist buck + auto& old_spin = buck->spin; + old_spin.lock(); + if (buck->next_ != nullptr) { + buck = buck->next_; + old_spin.unlock(); + continue; + } + + InnerBuck* new_buck = new (std::nothrow) InnerBuck; + memset(new_buck, 0, sizeof(InnerBuck)); + buck->next_ = new_buck; + buck = new_buck; + old_spin.unlock(); + } + return MapperStatus::MAPPER_ERROR; +} + +RecMapper::MapperStatus RecMapper::MapperFast::Find(uint64_t key, uint64_t& value) { + if(key == 0) { + if (spec_buck != nullptr) { + spec_buck->spin.lock(); + value = spec_buck->values_[0]; + spec_buck->spin.unlock(); + return MapperStatus::MAPPER_OK; + } + return MapperStatus::MAPPER_INVALID; + } + InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); + if (buck == nullptr) { + return MapperStatus::MAPPER_ERROR; + } + if (buck->Find(key,value) == BuckStatus::BUCK_NOEXIST) { + return MapperStatus::MAPPER_INVALID; + } + return MapperStatus::MAPPER_OK; +} + +RecMapper::MapperStatus RecMapper::MapperFast::Remove(uint64_t key) +{ + if(key == 0) { + if (spec_buck != nullptr) { + delete spec_buck; + spec_buck = nullptr; + size_.fetch_sub(1); + return MapperStatus::MAPPER_OK; + } + return MapperStatus::MAPPER_INVALID; + } + InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); + while(buck != nullptr) { + uint64_t value; + if (buck->Find(key, value) == BuckStatus::BUCK_NOEXIST) { + return MapperStatus::MAPPER_INVALID; + } + + buck->spin.lock(); + if (buck->Remove(key) == BuckStatus::BUCK_EXIST){ + size_.fetch_sub(1); + return MapperStatus::MAPPER_OK; + } + buck = buck->next_; + } + return MapperStatus::MAPPER_INVALID; +} + +RecMapper::MapperStatus RecMapper::MapperFast::ToVector(std::vector>& vec) +{ + if (spec_buck != nullptr) { + vec.push_back(std::make_pair(spec_buck->keys_[0], spec_buck->values_[0])); + } + for (auto& sub_map : buck_maps_){ + if (sub_map == nullptr){ + continue; + } + for(int i = 0; i < buck_count_; ++i){ + InnerBuck* buck = &sub_map[i]; + while(buck) { + for (int j = 0; j < BUCKCAPACITY; ++j){ + if (buck->keys_[j] == 0) { + continue; + } + vec.push_back(std::make_pair(buck->keys_[j], buck->values_[j])); + } + buck = buck->next_; + } + } + } + return MapperStatus::MAPPER_OK; +} \ No newline at end of file diff --git a/src/core/utils/mapper_fast.h b/src/core/utils/mapper_fast.h new file mode 100644 index 00000000..0ad73d5b --- /dev/null +++ b/src/core/utils/mapper_fast.h @@ -0,0 +1,97 @@ +// +// Created by z00576261 on 2024/4/15. 
+// + +#ifndef FAST_MAPPERFAST_H +#define FAST_MAPPERFAST_H + +#include +#include +#include +#include +#include + +namespace RecMapper { + constexpr int BUCKCAPACITY = 3; + enum BuckStatus{ + BUCK_EXIST, + BUCK_NOEXIST, + BUCK_ERROR + }; + + enum MapperStatus{ + MAPPER_ERROR, + MAPPER_INVALID, + MAPPER_OK + }; + + class SpinLock { + public: + SpinLock() = default; + SpinLock(const SpinLock&) = delete; + SpinLock& operator=(const SpinLock) = delete; + + void lock() { + while(f.test_and_set(std::memory_order_acquire)); + } + + void unlock() { + f.clear(std::memory_order_release); + } + + private: + std::atomic_flag f; + }; + + struct InnerBuck{ + std::atomic keys_[BUCKCAPACITY]{}; + int64_t values_[BUCKCAPACITY]{}; + InnerBuck* next_ = nullptr; + SpinLock spin; + + BuckStatus Insert(uint64_t, uint64_t&, std::function); + BuckStatus Find(uint64_t, uint64_t&); + BuckStatus Remove(uint64_t); + + }; + + class MapperFast { + public: + MapperFast(uint64_t cap, uint64_t res) : capacity_(cap), reserve_(res) {}; + + ~MapperFast() = default; + + bool InitializeBuck(); + void UnInitializeBuck(); + + MapperStatus Put(uint64_t key, uint64_t& value); + + MapperStatus Find(uint64_t key, uint64_t& value); + + MapperStatus Remove(uint64_t key); + + MapperStatus ToVector(std::vector>& vec); + + uint64_t Size() { + return size_.load(); + } + + private: + void FreeBuckMaps(); + void FreeBuckExpend(); + + std::atomic size_{ 0 }; + std::atomic offset_{ 0 }; + uint64_t capacity_; + uint64_t reserve_; + uint32_t buck_count_; + + static constexpr uint32_t sub_map_count = 5; + static constexpr uint32_t prime_max = 32; + + InnerBuck* buck_maps_[sub_map_count] {}; + InnerBuck* spec_buck = nullptr; + }; +} + +#endif //FAST_MAPPERFAST_H -- Gitee From 54212c205f72f1347a9d9fc53ead982cec6217b4 Mon Sep 17 00:00:00 2001 From: rome_zhouyang <9538256+rome_sky@user.noreply.gitee.com> Date: Wed, 24 Apr 2024 20:46:17 +0800 Subject: [PATCH 062/302] add FasterKV fix1 --- src/core/utils/MapperFast.cpp | 262 --------------------------------- src/core/utils/MapperFast.h | 97 ------------ src/core/utils/mapper_fast.cpp | 2 +- 3 files changed, 1 insertion(+), 360 deletions(-) delete mode 100644 src/core/utils/MapperFast.cpp delete mode 100644 src/core/utils/MapperFast.h diff --git a/src/core/utils/MapperFast.cpp b/src/core/utils/MapperFast.cpp deleted file mode 100644 index 3ed25102..00000000 --- a/src/core/utils/MapperFast.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// -// Created by z00576261 on 2024/4/15. 
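Editor's note: taken together, patches 060-063 add this map under two file names and then remove it again, so the class never survives in the tree; for reading the diffs, the intended call pattern per the header above is roughly the following. This compiles only against that header, and the capacity/reserve values are invented:

#include <cstdint>
// #include "mapper_fast.h"  // the header introduced above and deleted again by patch 063

void MapperFastDemo()
{
    RecMapper::MapperFast map(/*cap=*/1u << 20, /*res=*/1u << 10);
    if (!map.InitializeBuck()) {
        return; // bucket allocation failed
    }
    std::uint64_t offset = 0;
    map.Put(42, offset);  // assigns key 42 a monotonically increasing offset
    if (map.Find(42, offset) == RecMapper::MAPPER_OK) {
        // offset now holds the value recorded by Put
    }
    map.Remove(42);
    map.UnInitializeBuck();
}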
-// - -#include "MapperFast.h" -#include -#include -#include -#include - -RecMapper::BuckStatus RecMapper::InnerBuck::Insert(uint64_t key, uint64_t& value, std::function ValueSet) -{ - for (int i = 0; i < BUCKCAPACITY; ++i){ - uint64_t old_key = 0; - if (keys_[i].load(std::memory_order_relaxed) == 0 && keys_[i].compare_exchange_strong(old_key, key)){ - bool ret = ValueSet(); - if (!ret){ - keys_[i].store(0); - return BuckStatus::BUCK_ERROR; - } - values_[i] = value; - return BuckStatus::BUCK_NOEXIST; - } - } - return BuckStatus::BUCK_ERROR; -} - -RecMapper::BuckStatus RecMapper::InnerBuck::Find(uint64_t key, uint64_t& value) -{ - for (int i = 0; i < BUCKCAPACITY; ++i){ - if (keys_[i].load(std::memory_order_relaxed) == key){ - value = values_[i]; - return BuckStatus::BUCK_EXIST; - } - } - return BuckStatus::BUCK_NOEXIST; -} - -RecMapper::BuckStatus RecMapper::InnerBuck::Remove(uint64_t key) -{ - for (int i = 0; i < BUCKCAPACITY; ++i) { - uint64_t oldkey = key; - if (keys_[i].load(std::memory_order_relaxed) == key){ - if (keys_[i].compare_exchange_strong(oldkey, 0)){ - values_[i] = 0; - return BuckStatus::BUCK_EXIST; - } - } - } - return BUCK_ERROR; -} - -bool RecMapper::MapperFast::InitializeBuck() -{ - uint16_t i = 0; - - while(i <= prime_max){ - if (pow(2, i) < reserve_){ - i++; - continue; - } - break; - } - buck_count_ = i < 7 ? 128 : pow(2, i); - - for(auto &buck_map : buck_maps_){ - InnerBuck* buck_map_temp = new (std::nothrow) InnerBuck[buck_count_]; - if (buck_map_temp == nullptr) { - FreeBuckMaps(); - return false; - } - memset(buck_map_temp, 0, sizeof(InnerBuck) * buck_count_); - buck_map = buck_map_temp; - } - return true; -} - -void RecMapper::MapperFast::UnInitializeBuck() -{ - FreeBuckExpend(); - FreeBuckMaps(); -} - -void RecMapper::MapperFast::FreeBuckMaps() -{ - for (auto &buck_map : buck_maps_){ - if (buck_map != nullptr){ - delete[] buck_map; - buck_map = nullptr; - } - } -} - -void RecMapper::MapperFast::FreeBuckExpend() -{ - for (auto &buck_map : buck_maps_ ){ - if (buck_map == nullptr){ - continue; - } - for (uint32_t i = 0; i < buck_count_; ++i){ - InnerBuck* buck_attch = buck_map[i].next_; - while (buck_attch != nullptr){ - InnerBuck* buck_attch_temp = buck_attch->next_; - delete buck_attch; - buck_attch = buck_attch_temp; - } - } - } -} - -RecMapper::MapperStatus RecMapper::MapperFast::Put(uint64_t key, uint64_t& value) -{ - if (size_.load() > capacity_){ - return MapperStatus::MAPPER_ERROR; - } - - if(key == 0){ - if (spec_buck != nullptr) { - spec_buck->spin.lock(); - spec_buck->Find(key, value); - spec_buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - spec_buck = new (std::nothrow) InnerBuck; - memset(spec_buck, 0, sizeof(InnerBuck)); - spec_buck->spin.lock(); - spec_buck->keys_[0].store(key); - spec_buck->values_[0] = offset_.fetch_add(1) + 1; - size_.fetch_add(1); - spec_buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); - //first,find key if exist in buck - while(buck != nullptr){ - buck->spin.lock(); - if(buck->Find(key, value) == BuckStatus::BUCK_EXIST){ - buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - buck->spin.unlock(); - if(buck->next_ != nullptr){ - buck = buck->next_; - } else{ - break; - } - } - - //if not find, - for (int i = 0; i < 8192; ++i){ - // insert exist buck - while(buck != nullptr){ - buck->spin.lock(); - auto value_func = [&]() ->bool { - value = offset_.fetch_add(1); - return true;}; - BuckStatus ret = buck->Insert(key, value, 
value_func); - - buck->spin.unlock(); - if (ret == BuckStatus::BUCK_ERROR) { - return MapperStatus::MAPPER_ERROR; - } else if (ret == BuckStatus::BUCK_NOEXIST) { - size_.fetch_add(1); - return MapperStatus::MAPPER_OK; - } - if (buck->next_ != nullptr) { - buck = buck->next_; - } else { - break; - } - } - - //insert not exist buck - auto& old_spin = buck->spin; - old_spin.lock(); - if (buck->next_ != nullptr) { - buck = buck->next_; - old_spin.unlock(); - continue; - } - - InnerBuck* new_buck = new (std::nothrow) InnerBuck; - memset(new_buck, 0, sizeof(InnerBuck)); - buck->next_ = new_buck; - buck = new_buck; - old_spin.unlock(); - } - return MapperStatus::MAPPER_ERROR; -} - -RecMapper::MapperStatus RecMapper::MapperFast::Find(uint64_t key, uint64_t& value) { - if(key == 0) { - if (spec_buck != nullptr) { - spec_buck->spin.lock(); - value = spec_buck->values_[0]; - spec_buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - return MapperStatus::MAPPER_INVALID; - } - InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); - if (buck == nullptr) { - return MapperStatus::MAPPER_ERROR; - } - if (buck->Find(key,value) == BuckStatus::BUCK_NOEXIST) { - return MapperStatus::MAPPER_INVALID; - } - return MapperStatus::MAPPER_OK; -} - -RecMapper::MapperStatus RecMapper::MapperFast::Remove(uint64_t key) -{ - if(key == 0) { - if (spec_buck != nullptr) { - delete spec_buck; - spec_buck = nullptr; - size_.fetch_sub(1); - return MapperStatus::MAPPER_OK; - } - return MapperStatus::MAPPER_INVALID; - } - InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); - while(buck != nullptr) { - uint64_t value; - if (buck->Find(key, value) == BuckStatus::BUCK_NOEXIST) { - return MapperStatus::MAPPER_INVALID; - } - - buck->spin.lock(); - if (buck->Remove(key) == BuckStatus::BUCK_EXIST){ - size_.fetch_sub(1); - return MapperStatus::MAPPER_OK; - } - buck = buck->next_; - } - return MapperStatus::MAPPER_INVALID; -} - -RecMapper::MapperStatus RecMapper::MapperFast::ToVector(std::vector>& vec) -{ - if (spec_buck != nullptr) { - vec.push_back(std::make_pair(spec_buck->keys_[0], spec_buck->values_[0])); - } - for (auto& sub_map : buck_maps_){ - if (sub_map == nullptr){ - continue; - } - for(int i = 0; i < buck_count_; ++i){ - InnerBuck* buck = &sub_map[i]; - while(buck) { - for (int j = 0; j < BUCKCAPACITY; ++j){ - if (buck->keys_[j] == 0) { - continue; - } - vec.push_back(std::make_pair(buck->keys_[j], buck->values_[j])); - } - buck = buck->next_; - } - } - } - return MapperStatus::MAPPER_OK; -} \ No newline at end of file diff --git a/src/core/utils/MapperFast.h b/src/core/utils/MapperFast.h deleted file mode 100644 index 0ad73d5b..00000000 --- a/src/core/utils/MapperFast.h +++ /dev/null @@ -1,97 +0,0 @@ -// -// Created by z00576261 on 2024/4/15. 
-// - -#ifndef FAST_MAPPERFAST_H -#define FAST_MAPPERFAST_H - -#include -#include -#include -#include -#include - -namespace RecMapper { - constexpr int BUCKCAPACITY = 3; - enum BuckStatus{ - BUCK_EXIST, - BUCK_NOEXIST, - BUCK_ERROR - }; - - enum MapperStatus{ - MAPPER_ERROR, - MAPPER_INVALID, - MAPPER_OK - }; - - class SpinLock { - public: - SpinLock() = default; - SpinLock(const SpinLock&) = delete; - SpinLock& operator=(const SpinLock) = delete; - - void lock() { - while(f.test_and_set(std::memory_order_acquire)); - } - - void unlock() { - f.clear(std::memory_order_release); - } - - private: - std::atomic_flag f; - }; - - struct InnerBuck{ - std::atomic keys_[BUCKCAPACITY]{}; - int64_t values_[BUCKCAPACITY]{}; - InnerBuck* next_ = nullptr; - SpinLock spin; - - BuckStatus Insert(uint64_t, uint64_t&, std::function); - BuckStatus Find(uint64_t, uint64_t&); - BuckStatus Remove(uint64_t); - - }; - - class MapperFast { - public: - MapperFast(uint64_t cap, uint64_t res) : capacity_(cap), reserve_(res) {}; - - ~MapperFast() = default; - - bool InitializeBuck(); - void UnInitializeBuck(); - - MapperStatus Put(uint64_t key, uint64_t& value); - - MapperStatus Find(uint64_t key, uint64_t& value); - - MapperStatus Remove(uint64_t key); - - MapperStatus ToVector(std::vector>& vec); - - uint64_t Size() { - return size_.load(); - } - - private: - void FreeBuckMaps(); - void FreeBuckExpend(); - - std::atomic size_{ 0 }; - std::atomic offset_{ 0 }; - uint64_t capacity_; - uint64_t reserve_; - uint32_t buck_count_; - - static constexpr uint32_t sub_map_count = 5; - static constexpr uint32_t prime_max = 32; - - InnerBuck* buck_maps_[sub_map_count] {}; - InnerBuck* spec_buck = nullptr; - }; -} - -#endif //FAST_MAPPERFAST_H diff --git a/src/core/utils/mapper_fast.cpp b/src/core/utils/mapper_fast.cpp index 3ed25102..021daaca 100644 --- a/src/core/utils/mapper_fast.cpp +++ b/src/core/utils/mapper_fast.cpp @@ -2,7 +2,7 @@ // Created by z00576261 on 2024/4/15. // -#include "MapperFast.h" +#include "mapper_fast.h" #include #include #include -- Gitee From 85cabc7f4f8feae3e10325bb4a56094bba5d7708 Mon Sep 17 00:00:00 2001 From: rome_zhouyang <9538256+rome_sky@user.noreply.gitee.com> Date: Thu, 25 Apr 2024 09:04:11 +0800 Subject: [PATCH 063/302] delete fasterKV --- src/core/utils/mapper_fast.cpp | 262 --------------------------------- src/core/utils/mapper_fast.h | 97 ------------ 2 files changed, 359 deletions(-) delete mode 100644 src/core/utils/mapper_fast.cpp delete mode 100644 src/core/utils/mapper_fast.h diff --git a/src/core/utils/mapper_fast.cpp b/src/core/utils/mapper_fast.cpp deleted file mode 100644 index 021daaca..00000000 --- a/src/core/utils/mapper_fast.cpp +++ /dev/null @@ -1,262 +0,0 @@ -// -// Created by z00576261 on 2024/4/15. 
-// - -#include "mapper_fast.h" -#include -#include -#include -#include - -RecMapper::BuckStatus RecMapper::InnerBuck::Insert(uint64_t key, uint64_t& value, std::function ValueSet) -{ - for (int i = 0; i < BUCKCAPACITY; ++i){ - uint64_t old_key = 0; - if (keys_[i].load(std::memory_order_relaxed) == 0 && keys_[i].compare_exchange_strong(old_key, key)){ - bool ret = ValueSet(); - if (!ret){ - keys_[i].store(0); - return BuckStatus::BUCK_ERROR; - } - values_[i] = value; - return BuckStatus::BUCK_NOEXIST; - } - } - return BuckStatus::BUCK_ERROR; -} - -RecMapper::BuckStatus RecMapper::InnerBuck::Find(uint64_t key, uint64_t& value) -{ - for (int i = 0; i < BUCKCAPACITY; ++i){ - if (keys_[i].load(std::memory_order_relaxed) == key){ - value = values_[i]; - return BuckStatus::BUCK_EXIST; - } - } - return BuckStatus::BUCK_NOEXIST; -} - -RecMapper::BuckStatus RecMapper::InnerBuck::Remove(uint64_t key) -{ - for (int i = 0; i < BUCKCAPACITY; ++i) { - uint64_t oldkey = key; - if (keys_[i].load(std::memory_order_relaxed) == key){ - if (keys_[i].compare_exchange_strong(oldkey, 0)){ - values_[i] = 0; - return BuckStatus::BUCK_EXIST; - } - } - } - return BUCK_ERROR; -} - -bool RecMapper::MapperFast::InitializeBuck() -{ - uint16_t i = 0; - - while(i <= prime_max){ - if (pow(2, i) < reserve_){ - i++; - continue; - } - break; - } - buck_count_ = i < 7 ? 128 : pow(2, i); - - for(auto &buck_map : buck_maps_){ - InnerBuck* buck_map_temp = new (std::nothrow) InnerBuck[buck_count_]; - if (buck_map_temp == nullptr) { - FreeBuckMaps(); - return false; - } - memset(buck_map_temp, 0, sizeof(InnerBuck) * buck_count_); - buck_map = buck_map_temp; - } - return true; -} - -void RecMapper::MapperFast::UnInitializeBuck() -{ - FreeBuckExpend(); - FreeBuckMaps(); -} - -void RecMapper::MapperFast::FreeBuckMaps() -{ - for (auto &buck_map : buck_maps_){ - if (buck_map != nullptr){ - delete[] buck_map; - buck_map = nullptr; - } - } -} - -void RecMapper::MapperFast::FreeBuckExpend() -{ - for (auto &buck_map : buck_maps_ ){ - if (buck_map == nullptr){ - continue; - } - for (uint32_t i = 0; i < buck_count_; ++i){ - InnerBuck* buck_attch = buck_map[i].next_; - while (buck_attch != nullptr){ - InnerBuck* buck_attch_temp = buck_attch->next_; - delete buck_attch; - buck_attch = buck_attch_temp; - } - } - } -} - -RecMapper::MapperStatus RecMapper::MapperFast::Put(uint64_t key, uint64_t& value) -{ - if (size_.load() > capacity_){ - return MapperStatus::MAPPER_ERROR; - } - - if(key == 0){ - if (spec_buck != nullptr) { - spec_buck->spin.lock(); - spec_buck->Find(key, value); - spec_buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - spec_buck = new (std::nothrow) InnerBuck; - memset(spec_buck, 0, sizeof(InnerBuck)); - spec_buck->spin.lock(); - spec_buck->keys_[0].store(key); - spec_buck->values_[0] = offset_.fetch_add(1) + 1; - size_.fetch_add(1); - spec_buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); - //first,find key if exist in buck - while(buck != nullptr){ - buck->spin.lock(); - if(buck->Find(key, value) == BuckStatus::BUCK_EXIST){ - buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - buck->spin.unlock(); - if(buck->next_ != nullptr){ - buck = buck->next_; - } else{ - break; - } - } - - //if not find, - for (int i = 0; i < 8192; ++i){ - // insert exist buck - while(buck != nullptr){ - buck->spin.lock(); - auto value_func = [&]() ->bool { - value = offset_.fetch_add(1); - return true;}; - BuckStatus ret = buck->Insert(key, value, 
value_func); - - buck->spin.unlock(); - if (ret == BuckStatus::BUCK_ERROR) { - return MapperStatus::MAPPER_ERROR; - } else if (ret == BuckStatus::BUCK_NOEXIST) { - size_.fetch_add(1); - return MapperStatus::MAPPER_OK; - } - if (buck->next_ != nullptr) { - buck = buck->next_; - } else { - break; - } - } - - //insert not exist buck - auto& old_spin = buck->spin; - old_spin.lock(); - if (buck->next_ != nullptr) { - buck = buck->next_; - old_spin.unlock(); - continue; - } - - InnerBuck* new_buck = new (std::nothrow) InnerBuck; - memset(new_buck, 0, sizeof(InnerBuck)); - buck->next_ = new_buck; - buck = new_buck; - old_spin.unlock(); - } - return MapperStatus::MAPPER_ERROR; -} - -RecMapper::MapperStatus RecMapper::MapperFast::Find(uint64_t key, uint64_t& value) { - if(key == 0) { - if (spec_buck != nullptr) { - spec_buck->spin.lock(); - value = spec_buck->values_[0]; - spec_buck->spin.unlock(); - return MapperStatus::MAPPER_OK; - } - return MapperStatus::MAPPER_INVALID; - } - InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); - if (buck == nullptr) { - return MapperStatus::MAPPER_ERROR; - } - if (buck->Find(key,value) == BuckStatus::BUCK_NOEXIST) { - return MapperStatus::MAPPER_INVALID; - } - return MapperStatus::MAPPER_OK; -} - -RecMapper::MapperStatus RecMapper::MapperFast::Remove(uint64_t key) -{ - if(key == 0) { - if (spec_buck != nullptr) { - delete spec_buck; - spec_buck = nullptr; - size_.fetch_sub(1); - return MapperStatus::MAPPER_OK; - } - return MapperStatus::MAPPER_INVALID; - } - InnerBuck* buck = &(buck_maps_[key % sub_map_count][key % buck_count_]); - while(buck != nullptr) { - uint64_t value; - if (buck->Find(key, value) == BuckStatus::BUCK_NOEXIST) { - return MapperStatus::MAPPER_INVALID; - } - - buck->spin.lock(); - if (buck->Remove(key) == BuckStatus::BUCK_EXIST){ - size_.fetch_sub(1); - return MapperStatus::MAPPER_OK; - } - buck = buck->next_; - } - return MapperStatus::MAPPER_INVALID; -} - -RecMapper::MapperStatus RecMapper::MapperFast::ToVector(std::vector>& vec) -{ - if (spec_buck != nullptr) { - vec.push_back(std::make_pair(spec_buck->keys_[0], spec_buck->values_[0])); - } - for (auto& sub_map : buck_maps_){ - if (sub_map == nullptr){ - continue; - } - for(int i = 0; i < buck_count_; ++i){ - InnerBuck* buck = &sub_map[i]; - while(buck) { - for (int j = 0; j < BUCKCAPACITY; ++j){ - if (buck->keys_[j] == 0) { - continue; - } - vec.push_back(std::make_pair(buck->keys_[j], buck->values_[j])); - } - buck = buck->next_; - } - } - } - return MapperStatus::MAPPER_OK; -} \ No newline at end of file diff --git a/src/core/utils/mapper_fast.h b/src/core/utils/mapper_fast.h deleted file mode 100644 index 0ad73d5b..00000000 --- a/src/core/utils/mapper_fast.h +++ /dev/null @@ -1,97 +0,0 @@ -// -// Created by z00576261 on 2024/4/15. 
-// - -#ifndef FAST_MAPPERFAST_H -#define FAST_MAPPERFAST_H - -#include -#include -#include -#include -#include - -namespace RecMapper { - constexpr int BUCKCAPACITY = 3; - enum BuckStatus{ - BUCK_EXIST, - BUCK_NOEXIST, - BUCK_ERROR - }; - - enum MapperStatus{ - MAPPER_ERROR, - MAPPER_INVALID, - MAPPER_OK - }; - - class SpinLock { - public: - SpinLock() = default; - SpinLock(const SpinLock&) = delete; - SpinLock& operator=(const SpinLock) = delete; - - void lock() { - while(f.test_and_set(std::memory_order_acquire)); - } - - void unlock() { - f.clear(std::memory_order_release); - } - - private: - std::atomic_flag f; - }; - - struct InnerBuck{ - std::atomic keys_[BUCKCAPACITY]{}; - int64_t values_[BUCKCAPACITY]{}; - InnerBuck* next_ = nullptr; - SpinLock spin; - - BuckStatus Insert(uint64_t, uint64_t&, std::function); - BuckStatus Find(uint64_t, uint64_t&); - BuckStatus Remove(uint64_t); - - }; - - class MapperFast { - public: - MapperFast(uint64_t cap, uint64_t res) : capacity_(cap), reserve_(res) {}; - - ~MapperFast() = default; - - bool InitializeBuck(); - void UnInitializeBuck(); - - MapperStatus Put(uint64_t key, uint64_t& value); - - MapperStatus Find(uint64_t key, uint64_t& value); - - MapperStatus Remove(uint64_t key); - - MapperStatus ToVector(std::vector>& vec); - - uint64_t Size() { - return size_.load(); - } - - private: - void FreeBuckMaps(); - void FreeBuckExpend(); - - std::atomic size_{ 0 }; - std::atomic offset_{ 0 }; - uint64_t capacity_; - uint64_t reserve_; - uint32_t buck_count_; - - static constexpr uint32_t sub_map_count = 5; - static constexpr uint32_t prime_max = 32; - - InnerBuck* buck_maps_[sub_map_count] {}; - InnerBuck* spec_buck = nullptr; - }; -} - -#endif //FAST_MAPPERFAST_H -- Gitee From a7dd3ad107ca6b2e5a199c73f7fb01f52b0ae0cc Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Thu, 25 Apr 2024 01:33:14 +0000 Subject: [PATCH 064/302] =?UTF-8?q?!90=20=E6=B8=85=E7=90=86cleancode?= =?UTF-8?q?=E5=91=8A=E8=AD=A6=EF=BC=88=E6=9C=80=E5=B0=8F=E9=9B=86=E2=80=94?= =?UTF-8?q?=E2=80=94=E4=B8=A5=E9=87=8D=EF=BC=89=20*=20cleancode=E5=91=8A?= =?UTF-8?q?=E8=AD=A6=E6=B8=85=E7=90=86=20*=20cleancode=E5=91=8A=E8=AD=A6?= =?UTF-8?q?=E6=B8=85=E7=90=86=20*=20cleancode=E5=91=8A=E8=AD=A6=E6=B8=85?= =?UTF-8?q?=E7=90=86=20*=20cleancode=E5=91=8A=E8=AD=A6=E6=B8=85=E7=90=86?= =?UTF-8?q?=20*=20cleancode=E5=91=8A=E8=AD=A6=E6=B8=85=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 24 +++--- examples/demo/little_demo_estimator/main.py | 20 ++--- examples/dlrm/criteo_tb/gen_ttf.py | 83 +++++++++++---------- examples/dlrm/model/config.py | 8 +- examples/dlrm/model/main_mxrec.py | 31 ++++---- mx_rec/util/communication/hccl_mgmt.py | 8 +- src/AccCTR/tests/ut/src/unique_test.cpp | 21 +++--- 7 files changed, 97 insertions(+), 98 deletions(-) diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index d5a51312..205b0f67 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -53,8 +53,8 @@ def add_timestamp_func(batch): return batch -def make_batch_and_iterator(cfg, feature_spec_list, is_training, dump_graph, use_faae=False): - if cfg.USE_PIPELINE_TEST: +def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False): + if config.USE_PIPELINE_TEST: num_parallel = 1 else: num_parallel = 8 @@ -62,9 +62,9 @@ def make_batch_and_iterator(cfg, feature_spec_list, is_training, dump_graph, use def extract_fn(data_record): features = { # Extract 
features using the keys set during creation - 'label': tf.compat.v1.FixedLenFeature(shape=(cfg.line_per_sample,), dtype=tf.int64), - 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * cfg.line_per_sample,), dtype=tf.int64), - 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * cfg.line_per_sample,), dtype=tf.float32), + 'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64), + 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64), + 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32), } sample = tf.compat.v1.parse_single_example(data_record, features) return sample @@ -77,24 +77,24 @@ def make_batch_and_iterator(cfg, feature_spec_list, is_training, dump_graph, use return batch if is_training: - files_list = glob(os.path.join(cfg.data_path, cfg.train_file_pattern) + '/*.tfrecord') + files_list = glob(os.path.join(config.data_path, config.train_file_pattern) + '/*.tfrecord') else: - files_list = glob(os.path.join(cfg.data_path, cfg.test_file_pattern) + '/*.tfrecord') + files_list = glob(os.path.join(config.data_path, config.test_file_pattern) + '/*.tfrecord') dataset = tf.data.TFRecordDataset(files_list, num_parallel_reads=num_parallel) - batch_size = cfg.batch_size // cfg.line_per_sample + batch_size = config.batch_size // config.line_per_sample - dataset = dataset.shard(cfg.rank_size, cfg.rank_id) + dataset = dataset.shard(config.rank_size, config.rank_id) if is_training: dataset = dataset.shuffle(batch_size * 1000, seed=SHUFFLE_SEED) if is_training: - dataset = dataset.repeat(cfg.train_epoch) + dataset = dataset.repeat(config.train_epoch) else: - dataset = dataset.repeat(cfg.test_epoch) + dataset = dataset.repeat(config.test_epoch) dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size, drop_remainder=True) dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel) - if use_faae: + if is_use_faae: dataset = dataset.map(add_timestamp_func) if not MODIFY_GRAPH_FLAG: diff --git a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py index 8df1420c..de0b6c86 100644 --- a/examples/demo/little_demo_estimator/main.py +++ b/examples/demo/little_demo_estimator/main.py @@ -37,7 +37,7 @@ from utils import FeatureSpecIns tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) -def main(params, cfg): +def main(params, config): mg_session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False) run_config = NPURunConfig( model_dir=params.model_dir, @@ -64,27 +64,29 @@ def main(params, cfg): hooks_list = [ACGPushOpsToDatasetHook(dump_graph=True), GraphModifierHook(modify_graph=params.modify_graph)] if params.use_timestamp: - config_for_user_table = dict(access_threshold=cfg.access_threshold, eviction_threshold=cfg.eviction_threshold) - config_for_item_table = dict(access_threshold=cfg.access_threshold, eviction_threshold=cfg.eviction_threshold) + config_for_user_table = dict(access_threshold=config.access_threshold, + eviction_threshold=config.eviction_threshold) + config_for_item_table = dict(access_threshold=config.access_threshold, + eviction_threshold=config.eviction_threshold) access_and_evict = dict(user_table=config_for_user_table, item_table=config_for_item_table) evict_hook = EvictHook(evict_enable=True, evict_time_interval=10) hooks_list.append(evict_hook) - create_fs_params = dict(cfg=cfg, use_timestamp=params.use_timestamp, + create_fs_params 
= dict(cfg=config, use_timestamp=params.use_timestamp, use_multi_lookup=use_multi_lookup, multi_lookup_times=MULTI_LOOKUP_TIMES) est = NPUEstimator( - model_fn=get_model_fn(create_fs_params, cfg, access_and_evict), + model_fn=get_model_fn(create_fs_params, config, access_and_evict), params=params, model_dir=params.model_dir, config=run_config ) if params.run_mode == 'train': - est.train(input_fn=lambda: input_fn(params, create_fs_params, cfg), max_steps=params.max_steps, + est.train(input_fn=lambda: input_fn(params, create_fs_params, config), max_steps=params.max_steps, hooks=npu_hooks_append(hooks_list)) elif params.run_mode == 'train_and_evaluate': - train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(params, create_fs_params, cfg, + train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(params, create_fs_params, config, use_one_shot=args.use_one_shot), max_steps=params.max_steps, hooks=npu_hooks_append(hooks_list)) # 在开启evict时,eval时不支持淘汰,所以无需加入evict hook @@ -95,14 +97,14 @@ def main(params, cfg): eval_hook_list = [ACGPushOpsToDatasetHook(dump_graph=True), GraphModifierHook(modify_graph=params.modify_graph)] - eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(params, create_fs_params, cfg, is_eval=True, + eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(params, create_fs_params, config, is_eval=True, use_one_shot=args.use_one_shot), steps=params.eval_steps, hooks=npu_hooks_append(eval_hook_list), throttle_secs=0) tf.estimator.train_and_evaluate(est, train_spec=train_spec, eval_spec=eval_spec) elif params.run_mode == 'predict': - results = est.predict(input_fn=lambda: input_fn(params, create_fs_params, cfg), + results = est.predict(input_fn=lambda: input_fn(params, create_fs_params, config), hooks=npu_hooks_append(hooks_list=hooks_list), yield_single_examples=False) output_pred1 = [] output_pred2 = [] diff --git a/examples/dlrm/criteo_tb/gen_ttf.py b/examples/dlrm/criteo_tb/gen_ttf.py index 92fabb3d..04b7b767 100644 --- a/examples/dlrm/criteo_tb/gen_ttf.py +++ b/examples/dlrm/criteo_tb/gen_ttf.py @@ -50,11 +50,11 @@ class Logger(object): self.logger.addHandler(sh) # 把对象加到logger里 self.logger.addHandler(th) - def info(self, *args): - if len(args) == 1: - self.logger.info(*args) + def info(self, *prams): + if len(prams) == 1: + self.logger.info(*prams) else: - self.logger.info([*args]) + self.logger.info([*prams]) class CriteoStatsDict(): @@ -89,12 +89,11 @@ class CriteoStatsDict(): for i, cat in enumerate(cat_list): map_cat_count(i, cat) - # - def save_dict(self, output_path, hist_map, prefix=""): - with open(os.path.join(output_path, "{}hist_map.pkl".format(prefix)), "wb") as file_wrt: + @staticmethod + def save_dict(output_file_path, hist_map, prefix=""): + with open(os.path.join(output_file_path, "{}hist_map.pkl".format(prefix)), "wb") as file_wrt: pickle.dump(hist_map, file_wrt) - # def load_dict(self, dict_path, prefix=""): with open(os.path.join(dict_path, "{}hist_map.pkl".format(prefix)), "rb") as file_wrt: self.hist_map = pickle.load(file_wrt) @@ -128,13 +127,14 @@ class CriteoStatsDict(): return dense_list, cat_list -def statsdata_multiprocess(process_num, process_id, data_file_path, output_path, criteo_stats): + +def statsdata_multiprocess(proc_num, proc_id, data_file_path, output_file_path, criteo_stats_data): start_time = time.time() with open(data_file_path, encoding="utf-8") as file_in: errorline_list = [] count = 0 for i, line in enumerate(file_in): - if i % process_num != process_id: + if i % proc_num != proc_id: continue count += 1 line 
= line.strip("\n") @@ -146,26 +146,26 @@ def statsdata_multiprocess(process_num, process_id, data_file_path, output_path, if count % 1000000 == 0: print("Have handle {}w lines.".format(count // 10000)) cats = items[14:] - criteo_stats.stats_cats(cats) - criteo_stats.save_dict(output_path) + criteo_stats_data.stats_cats(cats) + criteo_stats_data.save_dict(output_file_path) print('statsdata time cost: {:.2f}s'.format(time.time() - start_time)) -def get_unique_id_multiprocess(process_num, process_id, data_file_path, output_path, criteo_stats): - if os.path.exists(os.path.join(output_path, "unique_id.pkl")): +def get_unique_id_multiprocess(proc_num, proc_id, data_file_path, output_file_path, criteo_stats_data): + if os.path.exists(os.path.join(output_file_path, "unique_id.pkl")): return start_time = time.time() - cat_sets = [OrderedDict() for col in criteo_stats.cat_cols] - cat_global_id_nums = [0 for col in criteo_stats.cat_cols] - hash_bucket = criteo_stats.hash_bucket + cat_sets = [OrderedDict() for col in criteo_stats_data.cat_cols] + cat_global_id_nums = [0 for col in criteo_stats_data.cat_cols] + hash_bucket = criteo_stats_data.hash_bucket line_num = 0 with open(data_file_path, encoding="utf-8") as file_in: errorline_list = [] for i, line in enumerate(file_in): line_num += 1 - start_line = process_id * ((line_num + process_num) // process_num) - end_line = (process_id + 1) * ((line_num + process_num) // process_num) + start_line = proc_id * ((line_num + proc_num) // proc_num) + end_line = (proc_id + 1) * ((line_num + proc_num) // proc_num) with open(data_file_path, encoding="utf-8") as file_in: errorline_list = [] count = 0 @@ -183,21 +183,17 @@ def get_unique_id_multiprocess(process_num, process_id, data_file_path, output_p print("Have handle {}w lines.".format(count // 10000)) sys.stdout.flush() cats = items[14:] - # criteo_stats.stats_cats(cats) - # def map_cat_count(i, cat): for k, cat in enumerate(cats): - # map_cat_count(i, cat) capped_value = int(cat, 16) % hash_bucket if cat else hash_bucket - # if capped_value not in self.hist_map[key_col]: if capped_value not in cat_sets: cat_sets[k][capped_value] = cat_global_id_nums[k] cat_global_id_nums[k] += 1 - with open(os.path.join(output_path, "unique_id.pkl"), "wb") as file_wrt: + with open(os.path.join(output_file_path, "unique_id.pkl"), "wb") as file_wrt: pickle.dump(cat_sets, file_wrt) print('statsdata time cost: {:.2f}s'.format(time.time() - start_time)) -def merge_stats_count(stats_dir, criteo_stats): +def merge_stats_count(stats_dir, criteo_stats_data): if os.path.exists(f'{stats_dir}/hist_map.pkl'): return stats_sub_dirs = sorted(glob(f'{stats_dir}/*[0-9]')) @@ -207,15 +203,15 @@ def merge_stats_count(stats_dir, criteo_stats): for i in tqdm(range(1, len(stats_sub_dirs))): with open(f'{stats_sub_dirs[i]}/unique_id.pkl', 'rb') as f: others_count = pickle.load(f) - for k, _ in enumerate(criteo_stats.cat_cols): + for k, _ in enumerate(criteo_stats_data.cat_cols): all_count_1, others_count_1 = all_hist_map[k], others_count[k] all_count_1.update(others_count_1) all_hist_map[k] = all_count_1 hist_map = {} - for i, col in enumerate(criteo_stats.cat_cols): + for i, col in enumerate(criteo_stats_data.cat_cols): hist_map[col] = dict(zip(list(all_hist_map[i].keys()), range(len(all_hist_map[i])))) - criteo_stats.save_dict(stats_dir, hist_map) + criteo_stats_data.save_dict(stats_dir, hist_map) def mkdir_path(file_path): @@ -235,13 +231,14 @@ def make_example(label_list, dense_feat_list, sparse_feat_list): return example -def 
convert_input2tfrd_multiprocess(process_num, process_id, in_file_path, output_path, criteo_stats, line_per_sample=1024,
-                                    part_rows=2000000, mode="train_"):
+
+def convert_input2tfrd_multiprocess(proc_num, proc_id, in_file_path, output_file_path, criteo_stats_dict,
+                                    line_per_sample=1024, part_rows=2000000):
     start_time = time.time()
     print("----------" * 10 + "\n" * 2)
 
     part_number = 0
-    file_name = output_path + "part_{:0>8d}.tfrecord"
+    file_name = output_file_path + "part_{:0>8d}.tfrecord"
     file_writer = tf.python_io.TFRecordWriter(file_name.format(part_number))
 
     sample_count = 0
@@ -253,8 +250,8 @@ def convert_input2tfrd_multiprocess(process_num, process_id, in_file_path, outpu
         for i, line in tqdm(enumerate(file_in)):
             line_num += 1
     print(f'line_num: {line_num}')
-    start_line = process_id * ((line_num + process_num) // process_num)
-    end_line = (process_id + 1) * ((line_num + process_num) // process_num)
+    start_line = proc_id * ((line_num + proc_num) // proc_num)
+    end_line = (proc_id + 1) * ((line_num + proc_num) // proc_num)
     dense_res_list = []
     cat_res_list = []
     label_res_list = []
@@ -276,9 +273,11 @@ def convert_input2tfrd_multiprocess(process_num, process_id, in_file_path, outpu
             label = int(items[0])
            values = items[1:14]
            cats = items[14:]
-            assert len(values) == 13, "values.size: {}".format(len(values))
-            assert len(cats) == 26, "cats.size: {}".format(len(cats))
-            val_list, cat_list = criteo_stats.map_cat2id(values, cats)
+            if len(values) != 13:
+                raise ValueError("values.size: {}".format(len(values)))
+            if len(cats) != 26:
+                raise ValueError("cats.size: {}".format(len(cats)))
+            val_list, cat_list = criteo_stats_dict.map_cat2id(values, cats)
             dense_res_list.append(val_list)
             cat_res_list.append(cat_list)
             label_res_list.append(label)
@@ -362,8 +361,10 @@ if __name__ == "__main__":
     mkdir_path(save_tfrecord_path)
     processs = []
     process_num = args.train_process_num
-    assert process_num % len(train_data_files) == 0, print(
-        f'process_num {process_num} must exact div length of data_files {len(data_files)}')
+    if len(train_data_files) == 0:
+        raise ValueError(f'file not exist in train_data_dir:{train_data_dir}')
+    if process_num % len(train_data_files) != 0:
+        raise ValueError(f'process_num {process_num} must be exactly divisible by the number of train_data_files {len(train_data_files)}')
 
     for process_id in range(process_num):
         sub_process_num = process_num // len(train_data_files)
@@ -384,8 +385,10 @@ if __name__ == "__main__":
     mkdir_path(save_tfrecord_path)
     processs = []
     process_num = args.test_process_num
-    assert process_num % len(test_data_files) == 0, print(
-        f'process_num {process_num} must exact div length of data_files {len(data_files)}')
+    if len(test_data_files) == 0:
+        raise ValueError(f'file not exist in test_data_dir:{test_data_dir}')
+    if process_num % len(test_data_files) != 0:
+        raise ValueError(f'process_num {process_num} must be exactly divisible by the number of test_data_files {len(test_data_files)}')
 
     for process_id in range(process_num):
         sub_process_num = process_num // len(test_data_files)
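Note: the converters above shard one input file across workers in contiguous blocks of
lines. A standalone sketch of that arithmetic follows; the line and worker counts are
illustrative, and end_line is clamped here only for printing (the patch bounds it with
the loop condition instead):

    # Worker proc_id of proc_num handles lines [start_line, end_line) of the file.
    line_num, proc_num = 10, 4                  # assumed: 10 lines, 4 workers
    chunk = (line_num + proc_num) // proc_num   # rounded-up block size, here 3
    for proc_id in range(proc_num):
        start_line = proc_id * chunk            # same formula as the patch
        end_line = min((proc_id + 1) * chunk, line_num)
        print(proc_id, start_line, end_line)    # 0 0 3 / 1 3 6 / 2 6 9 / 3 9 10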
diff --git a/examples/dlrm/model/config.py b/examples/dlrm/model/config.py
index 452b2a7f..23b042c2 100644
--- a/examples/dlrm/model/config.py
+++ b/examples/dlrm/model/config.py
@@ -40,7 +40,6 @@ class LearningRateScheduler:
         # used for the warmup stage
         warmup_step = tf.cast(1 / self.warmup_steps, tf.float32)
         lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step
-        # lr_factor_warmup = tf.cast(global_step, tf.float32) / tf.cast(self.warmup_steps, tf.float32) #hx
         lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32)
         # used for the constant stage
         lr_factor_constant = tf.cast(1.0, tf.float32)
@@ -55,7 +54,6 @@ class LearningRateScheduler:
             global_step < self.decay_end_step,
             lambda: lr_factor_decay,
             lambda: sparse_after_decay,
-            # lambda: 0.000 #hx
         )
 
         lr_factor_decay_dense = tf.cond(
@@ -119,7 +117,6 @@ class Config:
         self.emb_dim = 128
         self.hashtable_threshold = 1
 
-        # self.learning_rate = 0.01
 
         self.USE_PIPELINE_TEST = False
 
@@ -182,8 +179,8 @@ def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"):
     custom_op.parameter_map["mix_compile_mode"].b = False
     custom_op.parameter_map["use_off_line"].b = True
     custom_op.parameter_map["min_group_size"].b = 1
+    # optional setting: level0:pairwise;level1:pairwise
     custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:fullmesh")
-    # custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:pairwise;level1:pairwise")
     custom_op.parameter_map["enable_data_pre_proc"].b = True
     custom_op.parameter_map["iterations_per_loop"].i = 10
     custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
@@ -228,7 +225,6 @@ def get_npu_run_config():
         iterations_per_loop=1,
         jit_compile=False,
         op_compiler_cache_mode="enable",
-        HCCL_algorithm="level0:fullmesh;level1:fullmesh"
-        # HCCL_algorithm="level0:pairwise;level1:pairwise"
+        HCCL_algorithm="level0:fullmesh;level1:fullmesh"  # optional setting: level0:pairwise;level1:pairwise
     )
     return run_config
diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py
index 4bbd16de..8c98238b 100644
--- a/examples/dlrm/model/main_mxrec.py
+++ b/examples/dlrm/model/main_mxrec.py
@@ -57,8 +57,8 @@ def add_timestamp_func(batch):
     return batch
 
 
-def make_batch_and_iterator(cfg, feature_spec_list, is_training, dump_graph, use_faae=False):
-    if cfg.USE_PIPELINE_TEST:
+def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False):
+    if config.USE_PIPELINE_TEST:
         num_parallel = 1
     else:
         num_parallel = 8
@@ -66,9 +66,9 @@ def make_batch_and_iterator(cfg, feature_spec_list, is_training, dump_graph, use
     def extract_fn(data_record):
         features = {
             # Extract features using the keys set during creation
-            'label': tf.compat.v1.FixedLenFeature(shape=(cfg.line_per_sample,), dtype=tf.int64),
-            'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * cfg.line_per_sample,), dtype=tf.int64),
-            'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * cfg.line_per_sample,), dtype=tf.float32),
+            'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64),
+            'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64),
+            'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32),
         }
         sample = tf.compat.v1.parse_single_example(data_record, features)
         return sample
@@ -81,24 +81,23 @@ def make_batch_and_iterator(cfg, feature_spec_list, is_training, dump_graph, use
         return batch
 
     if is_training:
-        files_list = glob(os.path.join(cfg.data_path, cfg.train_file_pattern) + '/*.tfrecord')
+        files_list = glob(os.path.join(config.data_path, config.train_file_pattern) + '/*.tfrecord')
     else:
-        files_list = glob(os.path.join(cfg.data_path, cfg.test_file_pattern) + '/*.tfrecord')
+        files_list = glob(os.path.join(config.data_path, config.test_file_pattern) + '/*.tfrecord')
     dataset = tf.data.TFRecordDataset(files_list, num_parallel_reads=num_parallel)
-    batch_size = cfg.batch_size // cfg.line_per_sample
+    batch_size = 
config.batch_size // config.line_per_sample - dataset = dataset.shard(cfg.rank_size, cfg.rank_id) + dataset = dataset.shard(config.rank_size, config.rank_id) if is_training: dataset = dataset.shuffle(batch_size * 1000, seed=shuffle_seed) if is_training: - dataset = dataset.repeat(cfg.train_epoch) + dataset = dataset.repeat(config.train_epoch) else: - dataset = dataset.repeat(cfg.test_epoch) - # dataset = dataset.repeat(cfg.num_epochs) + dataset = dataset.repeat(config.test_epoch) dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size, drop_remainder=True) dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel) - if use_faae: + if is_use_faae: dataset = dataset.map(add_timestamp_func) if not MODIFY_GRAPH_FLAG: @@ -161,11 +160,11 @@ def evaluate(): eval_start = time.time() eval_loss, pred, label = sess.run([eval_model["loss"], eval_model["pred"], eval_label]) eval_cost = time.time() - eval_start - qps = (1 / eval_cost) * rank_size * cfg.batch_size + qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size log_loss_list += list(eval_loss.reshape(-1)) pred_list += list(pred.reshape(-1)) label_list += list(label.reshape(-1)) - print(f"eval current_steps: {eval_current_steps}, qps: {qps}") + print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") if eval_current_steps == eval_steps: finished = True except tf.errors.OutOfRangeError: @@ -217,7 +216,6 @@ def evaluate_fix(step): os.mknod(f"flag_{rank_id}.txt") while True: file_exists_list = [os.path.exists(f"flag_{i}.txt") for i in range(rank_size)] - # print(file_exists_list) if sum(file_exists_list) == rank_size: print("All saved!!!!!!!!!!") break @@ -424,7 +422,6 @@ if __name__ == "__main__": cost_time = end_time - start_time qps = (1 / cost_time) * rank_size * cfg.batch_size * iteration_per_loop cost_sum += cost_time - # qps_sum += qps logger.info(f"step: {i * iteration_per_loop}; training loss: {loss}") logger.info(f"step: {i * iteration_per_loop}; grad: {grad}") logger.info(f"step: {i * iteration_per_loop}; lr: {lr}") diff --git a/mx_rec/util/communication/hccl_mgmt.py b/mx_rec/util/communication/hccl_mgmt.py index 2f50e832..43042d6b 100644 --- a/mx_rec/util/communication/hccl_mgmt.py +++ b/mx_rec/util/communication/hccl_mgmt.py @@ -82,11 +82,11 @@ def set_hccl_info_without_json() -> Dict[int, int]: Used for no rank table file configured training situation. :return: rank_id to logic_id mapping dictionary. """ - rank_size = global_env.cm_worker_size - chief_device = global_env.cm_chief_device + env_rank_size = global_env.cm_worker_size + env_chief_device = global_env.cm_chief_device device_list = get_device_list() - chief_device = int(chief_device) - rank_size = int(rank_size) + chief_device = int(env_chief_device) + rank_size = int(env_rank_size) if chief_device not in device_list: raise ValueError(f"The environment variable CM_CHIEF_DEVICE {chief_device} is not in the local device list. 
") diff --git a/src/AccCTR/tests/ut/src/unique_test.cpp b/src/AccCTR/tests/ut/src/unique_test.cpp index ef6846f8..f971bb91 100644 --- a/src/AccCTR/tests/ut/src/unique_test.cpp +++ b/src/AccCTR/tests/ut/src/unique_test.cpp @@ -1162,14 +1162,14 @@ TEST_F(UniqueTest, DoUniqueShardMultipleTimes) unordered_set uniqueIdSet; map expectedIdCntMap; - for (size_t i = 0; i < uniqueIn.inputIdCnt; i++) { - restoreIds[i] = uniqueId[index[i]]; - expectedIdCntMap[inputId[i]]++; - if (uniqueIdSet.find(inputId[i]) != uniqueIdSet.end()) { + for (size_t j = 0; j < uniqueIn.inputIdCnt; j++) { + restoreIds[j] = uniqueId[index[j]]; + expectedIdCntMap[inputId[j]]++; + if (uniqueIdSet.find(inputId[j]) != uniqueIdSet.end()) { continue; } else { - uniqueIdSet.insert(inputId[i]); - expectedUniqueIdCnt[inputId[i] % conf.shardingNum]++; + uniqueIdSet.insert(inputId[j]); + expectedUniqueIdCnt[inputId[j] % conf.shardingNum]++; } } @@ -1177,13 +1177,14 @@ TEST_F(UniqueTest, DoUniqueShardMultipleTimes) int uniqueSum = 0; - for (int i = 0; i < conf.shardingNum; i++) { - uniqueSum += uniqueIdCntInBucket[i]; + for (int j = 0; j < conf.shardingNum; j++) { + uniqueSum += uniqueIdCntInBucket[j]; } vector expectedIdCnt(uniqueSum); - for (int i = 0; i < uniqueSum; i++) { - expectedIdCnt[i] = expectedIdCntMap[uniqueId[i]]; + + for (int j = 0; j < uniqueSum; j++) { + expectedIdCnt[j] = expectedIdCntMap[uniqueId[j]]; } expectedIdCnt.resize(uniqueIn.inputIdCnt); -- Gitee From 8a3e5af57410974ca8d7850655f05d6d034cf562 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 23 Apr 2024 15:42:53 +0800 Subject: [PATCH 065/302] =?UTF-8?q?warm=20start=E5=8A=9F=E8=83=BD=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=EF=BC=8C=E5=AE=9E=E7=8E=B0=E4=BB=8E=E5=A4=9A=E4=B8=AA?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E8=B7=AF=E5=BE=84=E5=8A=A0=E8=BD=BD=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E5=8F=82=E6=95=B0=E3=80=81=E7=A8=80=E7=96=8F=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/warm_start.py | 57 ++++++++++++-------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index 31a5e358..520c3df3 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -33,8 +33,9 @@ if tf.__version__.startswith("1"): else: from npu_device.compat.v1.npu_init import NPUEstimator + class WarmStartController: - _instance = None # 类属性,用于存储唯一的实例 + _instance = None def __new__(cls): if cls._instance is None: @@ -47,7 +48,6 @@ class WarmStartController: logging.info("start to build WarmStartController.") def add_element(self, path: str, table_list: List[str]): - """添加 path, table list""" if path not in self._warm_start_dict: self._warm_start_dict[path] = table_list else: @@ -57,7 +57,6 @@ class WarmStartController: self.table_name_to_prev_table_name[table] = prev_table def get_elements(self): - """返回dict中的所有元素""" return self._warm_start_dict @@ -83,26 +82,25 @@ def patch_for_func_warm_start(func): vars_to_warm_start_list = args[1] var_name_to_prev_var_name_list = args[3] for i in range(len(ckpt_to_initialize_from)): - f = func(ckpt_to_initialize_from[i], vars_to_warm_start_list[i], var_name_to_prev_var_name_list[i], - args[3:], **kwargs) + f = func(ckpt_to_initialize_from[i], vars_to_warm_start_list[i], args[2], + var_name_to_prev_var_name_list[i], **kwargs) return f else: return func(*args, **kwargs) return wrapper + def patch_for_estimator_train(func): - def warpper(*args, **kwargs): + def wrapper(*args, **kwargs): 
hooks = kwargs.get('hooks', []) if WarmStartController().get_elements(): hooks.append(SparseRestoreHook()) return func(*args, *kwargs) - return warpper + return wrapper def warm_settings_filter(warm_start_from): - # condition 1: 原始入参为settings if isinstance(warm_start_from, estimator_lib.WarmStartSettings): - # mx_rec 定制 warm start的写法, 定制写法的策略应该和原始warm start的过滤策略不一样 if isinstance(warm_start_from.ckpt_to_initialize_from, (list, tuple)): out_setting_list = [] logger.info("According to warm_start_settings, warm start will load from more than one checkpoint path.") @@ -111,21 +109,16 @@ def warm_settings_filter(warm_start_from): filter_setting = _warm_settings_filter(setting) if filter_setting: out_setting_list.append(filter_setting) - # 这里out setting list 必须要revcover成warm_start_settings再返回 if out_setting_list: warm_start_from = recover_warm_settings(out_setting_list) return warm_start_from - # 原始写法 elif isinstance(warm_start_from.ckpt_to_initialize_from, (six.string_types, six.binary_type)): logger.info("According to warm_start_settings, warm start will load from only one checkpoint path.") filter_setting = _warm_settings_filter(warm_start_from) if filter_setting: return filter_setting return None - # condition 2: 原始入参为str elif isinstance(warm_start_from, (six.string_types, six.binary_type)): - # 这里还有一种类型是:str 这种类型相对比较简单,传递就好。但是在这里要调用以下controller来指定一下sparse的地址和表名, - # 这里可以单独写函数 table_name_list = get_table_name_set_by_ckpt_path(warm_start_from) WarmStartController().add_element(warm_start_from, table_name_list) return warm_start_from @@ -148,9 +141,7 @@ def recover_warm_settings(setting_list): var_name_to_prev_var_name=var_name_to_prev_var_name_list) -# 处理定制的warm settings, 将warm_start_from进行校验 def _build_warm_settings_list(warm_start_from): - # 这里可以修改一下传参,用参数解包来做,更加简洁高效 ckpt_to_initialize_from = warm_start_from.ckpt_to_initialize_from vars_to_warm_start = warm_start_from.vars_to_warm_start var_name_to_prev_var_name = warm_start_from.var_name_to_prev_var_name @@ -176,26 +167,16 @@ def _build_warm_settings_list(warm_start_from): def _warm_settings_filter(warm_start_setting): - # 将settings里面的稀疏摘出来 - # 要考虑名字有对应的场景 vars_to_warm_start = warm_start_setting.vars_to_warm_start var_name_to_prev_var_name = warm_start_setting.var_name_to_prev_var_name vars_to_warm_start_res = [] - # table_name_set从路径里面去获取 table_name_list = get_table_name_set_by_ckpt_path(warm_start_setting.ckpt_to_initialize_from) - # 稀疏支持以下格式: 1.str(支持表名) ; 2. 
list[str];
     if isinstance(vars_to_warm_start, str):
-        # condition 1: vars_to_warm_start is a str (a regex or a table name)
-        # table name
         matching_tables = [table for table in table_name_list if re.match(vars_to_warm_start, table)]
-        # if anything matches, this warm_start_settings no longer applies to the dense part
-        # add WarmStartController(path:table_name)
         if matching_tables:
-            # add controller to set sparse
-            WarmStartController().add_element(vars_to_warm_start.ckpt_to_initialize_from, matching_tables)
+            WarmStartController().add_element(warm_start_setting.ckpt_to_initialize_from, matching_tables)
             if vars_to_warm_start != ".*":
                 return None
-        # path: embedding_table_name
         return warm_start_setting
     elif all(isinstance(v, str) for v in vars_to_warm_start):
         sparse_vars = []
@@ -203,7 +184,7 @@ def _warm_settings_filter(warm_start_setting):
             matching_tables = [table for table in table_name_list if re.match(v, table)]
             if matching_tables:
                 sparse_vars.append(v)
-                WarmStartController().add_element(vars_to_warm_start.ckpt_to_initialize_from, matching_tables)
+                WarmStartController().add_element(warm_start_setting.ckpt_to_initialize_from, matching_tables)
         vars_to_warm_start_res = [v for v in vars_to_warm_start if v not in sparse_vars]
         if not vars_to_warm_start_res:
             warm_start_setting = None
@@ -219,14 +200,13 @@ def get_table_name_set_by_ckpt_path(warm_start_path: str) -> List[str]:
     Get the list of sparse table names saved under the path 'warm_start_path'.
     '''
     table_name_list = []
-    if tf.io.gfile.idsir(warm_start_path):
+    if tf.io.gfile.isdir(warm_start_path):
         restore_path = get_latest_ckpt(warm_start_path)
     else:
         restore_path = warm_start_path
     directory, base_name = os.path.split(restore_path)
     ckpt_name = f"sparse-{base_name}"
     sparse_path = os.path.join(directory, ckpt_name)
-    # if sparse_path does not exist this may be a GPU checkpoint path; do not raise, just return an empty table name list
     if not tf.io.gfile.isdir(sparse_path):
         logger.info(f"under the warm start path {warm_start_path}, sparse directory {sparse_path} not exists.")
     else:
@@ -248,22 +228,19 @@ def get_latest_ckpt(warm_start_path) -> str:
     return path
 
 
-
-
-
 class SparseRestoreHook(tf.estimator.SessionRunHook):
     def __init__(self):
         logging.info("In warm start mode, SparseRestoreHook has been initialized.")
-        pass
+        self._is_warm_start = False
 
     def begin(self):
         self._saver = Saver()
         logging.info("In warm start mode, begin SparseRestoreHook.")
 
     def after_create_session(self, session, coord):
-        # mxrec needs to adapt the new restore interface here; the strategy is to call restore once per path
-        self._warm_start_dict = WarmStartController().get_elements()
-        for path, restore_tables in self._warm_start_dict.items():
-            restore_path = get_latest_ckpt(path)
-            self._saver.restore(session, restore_path, restore_tables)
-
+        if not self._is_warm_start:
+            self._warm_start_dict = WarmStartController().get_elements()
+            for path, restore_tables in self._warm_start_dict.items():
+                restore_path = get_latest_ckpt(path)
+                self._saver.restore(session, restore_path, restore_tables)
+            self._is_warm_start = True  # mark done so the restore runs only once
-- 
Gitee
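Note: a minimal usage sketch for the multi-checkpoint warm start implemented by the
patch above. The checkpoint paths, the "user_table" pattern and model_fn/input_fn are
illustrative assumptions, and the list-valued WarmStartSettings fields only work with
mx_rec's patched warm-start path, not with stock TensorFlow:

    import tensorflow as tf

    ws = tf.estimator.WarmStartSettings(
        ckpt_to_initialize_from=["/ckpt/model_a", "/ckpt/model_b"],  # one source model per entry
        vars_to_warm_start=["dense.*", "user_table"],                # per-path variable patterns
        var_name_to_prev_var_name=[{}, {}])                          # per-path rename maps
    # Patterns matching sparse table names are filtered out by warm_settings_filter()
    # and restored by SparseRestoreHook after session creation; the remaining dense
    # patterns go through TensorFlow's normal warm start.
    est = tf.estimator.Estimator(model_fn=model_fn, warm_start_from=ws)
    est.train(input_fn=input_fn, max_steps=100)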
From 744c293ea213020e4a4dce8c8f5615a4fdf5a1c3 Mon Sep 17 00:00:00 2001
From: yangzhen
Date: Fri, 26 Apr 2024 09:03:14 +0800
Subject: [PATCH 066/302] =?UTF-8?q?=E4=BF=AE=E5=A4=8DdcnV2=E5=8F=82?=
 =?UTF-8?q?=E6=95=B0=E5=90=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/DCNv2/main_mxrec.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py
index 1721370e..6fd235ba 100644
--- a/examples/DCNv2/main_mxrec.py
+++ b/examples/DCNv2/main_mxrec.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 import time
 import warnings
 import random
@@ -286,9 +285,9 @@ if __name__ == "__main__":
     feature_spec_list_eval = create_feature_spec_list(use_timestamp=False)
 
     train_batch, train_iterator = make_batch_and_iterator(cfg, feature_spec_list_train, is_training=True,
-                                                          dump_graph=True, use_faae=use_faae)
+                                                          dump_graph=True, is_use_faae=use_faae)
     eval_batch, eval_iterator = make_batch_and_iterator(cfg, feature_spec_list_eval, is_training=False,
-                                                        dump_graph=False, use_faae=use_faae)
+                                                        dump_graph=False, is_use_faae=use_faae)
     logger.info(f"train_batch: {train_batch}")
 
     if use_faae:
-- 
Gitee

From ae3aff4c793206f684ac44baf05d972db0fd859b Mon Sep 17 00:00:00 2001
From: yangzhen
Date: Fri, 26 Apr 2024 10:15:19 +0800
Subject: [PATCH 067/302] =?UTF-8?q?=E4=BF=AE=E5=A4=8Ddlrm=E5=8F=82?=
 =?UTF-8?q?=E6=95=B0=E5=90=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/dlrm/model/main_mxrec.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py
index 05369038..b6036804 100644
--- a/examples/dlrm/model/main_mxrec.py
+++ b/examples/dlrm/model/main_mxrec.py
@@ -289,9 +289,9 @@ if __name__ == "__main__":
     feature_spec_list_eval = create_feature_spec_list(use_timestamp=False)
 
     train_batch, train_iterator = make_batch_and_iterator(cfg, feature_spec_list_train, is_training=True,
-                                                          dump_graph=True, use_faae=use_faae)
+                                                          dump_graph=True, is_use_faae=use_faae)
     eval_batch, eval_iterator = make_batch_and_iterator(cfg, feature_spec_list_eval, is_training=False,
-                                                        dump_graph=False, use_faae=use_faae)
+                                                        dump_graph=False, is_use_faae=use_faae)
     logger.info(f"train_batch: {train_batch}")
 
     if use_faae:
-- 
Gitee

From 407cb4adf00a42ee91962f432b5967cbf2991dd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Fri, 26 Apr 2024 09:36:25 +0000
Subject: [PATCH 068/302] =?UTF-8?q?!106=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E8=A7=A3=E5=86=B3?=
 =?UTF-8?q?=E5=85=A8=E5=B1=80unique=E5=AF=BC=E8=87=B4=E9=9D=99=E6=80=81sha?=
 =?UTF-8?q?pe=E6=80=A7=E8=83=BD=E4=B8=8B=E9=99=8D=E9=97=AE=E9=A2=98=20*=20?=
 =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?=
 =?UTF-8?q?=E3=80=91=E8=A7=A3=E5=86=B3=E5=85=A8=E5=B1=80unique=E5=AF=BC?=
 =?UTF-8?q?=E8=87=B4=E9=9D=99=E6=80=81shape=E6=80=A7=E8=83=BD=E4=B8=8B?=
 =?UTF-8?q?=E9=99=8D=E9=97=AE=E9=A2=98=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E8=A7=A3=E5=86=B3?=
 =?UTF-8?q?=E5=85=A8=E5=B1=80unique=E5=AF=BC=E8=87=B4=E9=9D=99=E6=80=81sha?=
 =?UTF-8?q?pe=E6=80=A7=E8=83=BD=E4=B8=8B=E9=99=8D=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/optimizers/base.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py
index ed765539..696406f8 100644
--- a/mx_rec/optimizers/base.py
+++ b/mx_rec/optimizers/base.py
@@ -26,15 +26,18 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.training.optimizer import _TensorProcessor
 
+from mx_rec.constants.constants import ASCAnchorAttr
 from mx_rec.util.tf_version_adapter import npu_ops
 from mx_rec.util.initialize import ConfigInitializer
 from mx_rec.util.log import logger
+from mx_rec.util.communication.hccl_ops import get_rank_size
 
 
-def get_restore_vector_second(table_name: str) -> tf.Tensor:
+def get_restore_vector_second(table_name: str, max_lookup_vec_size: int) -> tf.Tensor:
     """
     Get restore vector which is calculated after the second all2all
     :param table_name: embedding table_name
+    :param max_lookup_vec_size: static shape
     :return: the restore vector calculated after the second all2all
     """
     channel_id = 0
@@ -43,15 +46,16 @@ def get_restore_vector_second(table_name: str) -> tf.Tensor:
     with tf.compat.v1.variable_scope(table_name, reuse=tf.compat.v1.AUTO_REUSE):
         restore_vector_second = npu_ops.gen_npu_ops.get_next(
             output_types=[tf.int32],
-            output_shapes=[[None]],
+            output_shapes=[[max_lookup_vec_size]],
             channel_name=f'{table_name}_restore_second_{channel_id}')[0]
     return restore_vector_second
 
 
-def get_unique_keys(table_name: str, is_expansion: bool) -> tf.Tensor:
+def get_unique_keys(table_name: str, max_lookup_vec_size: int, is_expansion: bool) -> tf.Tensor:
     """
     Get the global unique keys which is calculated after the second all2all
     :param table_name: embedding table_name
+    :param max_lookup_vec_size: static shape
     :param is_expansion: use dynamic expansion
     :return: the global unique keys calculated after the second all2all
     """
@@ -61,13 +65,13 @@ def get_unique_keys(table_name: str, is_expansion: bool) -> tf.Tensor:
     if is_expansion:
         unique_keys = npu_ops.gen_npu_ops.get_next(
             output_types=[tf.int64],
-            output_shapes=[[None]],
+            output_shapes=[[max_lookup_vec_size]],
             channel_name=f'{table_name}_uniquekeys_{channel_id}')[0]
         return unique_keys
 
     unique_keys = npu_ops.gen_npu_ops.get_next(
         output_types=[tf.int32],
-        output_shapes=[[None]],
+        output_shapes=[[max_lookup_vec_size]],
         channel_name=f'{table_name}_uniquekeys_{channel_id}')[0]
     return unique_keys
 
@@ -95,14 +99,23 @@ class CustomizedOptimizer:
         if isinstance(var, ops.Tensor):
             # in expansion mode the table name comes from the scope; the offset is -2
             table_name = var.op.name.split('/')[-2]
+            table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance_by_name(table_name)
         else:
             table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var)
             table_name = table_instance.table_name
 
-        with tf.compat.v1.variable_scope("restore_vector_second"):
-            restore_vector_second = get_restore_vector_second(table_name)
-
-        with tf.compat.v1.variable_scope("unique_keys"):
-            unique_keys = get_unique_keys(table_name, is_expansion)
+        max_lookup_vec_size = None
+        use_static = ConfigInitializer.get_instance().use_static
+        if use_static:
+            send_count = table_instance.send_count
+            rank_size = get_rank_size()
+            max_lookup_vec_size = send_count * rank_size if send_count > 0 else None
+
+        with tf.compat.v1.variable_scope(str(ASCAnchorAttr.RESTORE_VECTOR_SECOND)):
+            restore_vector_second = get_restore_vector_second(table_name, max_lookup_vec_size)
+
+        with tf.compat.v1.variable_scope(str(ASCAnchorAttr.UNIQUE_KEYS)):
+            unique_keys = get_unique_keys(table_name, max_lookup_vec_size, is_expansion)
 
         unique_local_grad = tf.compat.v1.unsorted_segment_sum(grad,
                                                               restore_vector_second,
-- 
Gitee
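Note: the patch above replaces the dynamic [[None]] get_next shapes with a fixed upper
bound, so the compiled graph keeps a single static shape instead of recompiling per
step. The sizing rule it uses, restated with assumed illustrative numbers:

    # After the second all2all every rank holds at most send_count unique keys from
    # each of the rank_size ranks, so the padded lookup vector has a fixed length.
    send_count = 4096    # assumed per-rank upper bound configured on the table
    rank_size = 8        # assumed number of NPU ranks in the job
    max_lookup_vec_size = send_count * rank_size if send_count > 0 else None
    print(max_lookup_vec_size)   # 32768; None falls back to dynamic shapes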
From 8a3e5af57410974ca8d7850655f05d6d034cf562 Mon Sep 17 00:00:00 2001
From: chenhangcal <1764252734@qq.com>
Date: Mon, 29 Apr 2024 01:24:07 +0000
Subject: [PATCH 069/302] =?UTF-8?q?!98=20little-demo=E7=A1=AE=E5=AE=9A?=
 =?UTF-8?q?=E6=80=A7=E8=AE=A1=E7=AE=97loss=E7=94=A8=E4=BE=8B=20*=20add=20e?=
=?UTF-8?q?d=20examples/demo/little=5Fdemo/deterministic=5Floss/loss2.=20*?= =?UTF-8?q?=20update=20examples/demo/little=5Fdemo/run=5Fmode.py.=20*=20up?= =?UTF-8?q?date=20examples/demo/little=5Fdemo/config.py.=20*=20update=20ex?= =?UTF-8?q?amples/demo/little=5Fdemo/main.py.=20*=20update=20examples/demo?= =?UTF-8?q?/little=5Fdemo/run=5Fmode.py.=20*=20update=20examples/demo/litt?= =?UTF-8?q?le=5Fdemo/config.py.=20*=20update=20examples/demo/little=5Fdemo?= =?UTF-8?q?/main.py.=20*=20update=20examples/demo/little=5Fdemo/run=5Fdete?= =?UTF-8?q?rministic.sh.=20*=20rename=20*=20rename=20*=20update=20examples?= =?UTF-8?q?/demo/little=5Fdemo/run=5Fdeterministic.sh.=20*=20update=20exam?= =?UTF-8?q?ples/demo/little=5Fdemo/run=5Fdeterministic.sh.=20*=20update=20?= =?UTF-8?q?examples/demo/little=5Fdemo/run=5Fdeterministic.sh.=20*=20add?= =?UTF-8?q?=20examples/demo/little=5Fdemo/deterministic=5Floss/Ascend910B3?= =?UTF-8?q?.=20*=20add=20examples/demo/little=5Fdemo/deterministic=5Floss/?= =?UTF-8?q?Ascend910B.=20*=20add=20examples/demo/little=5Fdemo/run=5Fdeter?= =?UTF-8?q?ministic.sh.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo/config.py | 8 +- .../demo/little_demo/deterministic_loss/loss | 200 ++++++++++++++++++ .../demo/little_demo/deterministic_loss/loss1 | 200 ++++++++++++++++++ .../demo/little_demo/deterministic_loss/loss2 | 200 ++++++++++++++++++ .../demo/little_demo/deterministic_loss/loss3 | 200 ++++++++++++++++++ examples/demo/little_demo/main.py | 18 +- .../demo/little_demo/run_deterministic.sh | 45 ++++ examples/demo/little_demo/run_mode.py | 8 +- 8 files changed, 870 insertions(+), 9 deletions(-) create mode 100644 examples/demo/little_demo/deterministic_loss/loss create mode 100644 examples/demo/little_demo/deterministic_loss/loss1 create mode 100644 examples/demo/little_demo/deterministic_loss/loss2 create mode 100644 examples/demo/little_demo/deterministic_loss/loss3 create mode 100644 examples/demo/little_demo/run_deterministic.sh diff --git a/examples/demo/little_demo/config.py b/examples/demo/little_demo/config.py index 2cc48216..a0912ac5 100644 --- a/examples/demo/little_demo/config.py +++ b/examples/demo/little_demo/config.py @@ -95,7 +95,7 @@ class Config: self.learning_rate = 0.01 -def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): +def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2", use_deterministic=0): session_config = tf.compat.v1.ConfigProto(allow_soft_placement=False, log_device_placement=False) @@ -108,7 +108,11 @@ def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:pairwise;level1:pairwise") custom_op.parameter_map["enable_data_pre_proc"].b = True custom_op.parameter_map["iterations_per_loop"].i = 1 - custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + if use_deterministic: + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("must_keep_origin_dtype") + custom_op.parameter_map["deterministic"].i = 1 + else: + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") custom_op.parameter_map["hcom_parallel"].b = False custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") custom_op.parameter_map["op_execute_timeout"].i = 2000 diff --git a/examples/demo/little_demo/deterministic_loss/loss 
b/examples/demo/little_demo/deterministic_loss/loss new file mode 100644 index 00000000..3bd00f80 --- /dev/null +++ b/examples/demo/little_demo/deterministic_loss/loss @@ -0,0 +1,200 @@ +0.6931473016738892 +0.6930400133132935 +0.6931400895118713 +0.69315505027771 +0.6931849122047424 +0.6931070685386658 +0.6931337714195251 +0.6931014657020569 +0.6931450963020325 +0.6931362152099609 +0.6930745244026184 +0.6931930184364319 +0.693183958530426 +0.6931136846542358 +0.6932246088981628 +0.69315105676651 +0.6931785941123962 +0.6931335926055908 +0.6931543946266174 +0.6931360960006714 +0.6931753158569336 +0.6931651830673218 +0.6931512951850891 +0.6931533217430115 +0.6931378841400146 +0.6931486129760742 +0.6931435465812683 +0.6931432485580444 +0.6930928230285645 +0.6931749582290649 +0.693172037601471 +0.6931487917900085 +0.6931713819503784 +0.6931683421134949 +0.6931532621383667 +0.6931494474411011 +0.6932084560394287 +0.6930452585220337 +0.6931130886077881 +0.6932073831558228 +0.6931206583976746 +0.6931828856468201 +0.6931034922599792 +0.6931605935096741 +0.6931373476982117 +0.6931723952293396 +0.6931106448173523 +0.6931154131889343 +0.6931938529014587 +0.6932826638221741 +0.6932423114776611 +0.6931906342506409 +0.6931505799293518 +0.6931438446044922 +0.6931610107421875 +0.6931508779525757 +0.6931482553482056 +0.693139910697937 +0.693148136138916 +0.6931435465812683 +0.6930944323539734 +0.693130373954773 +0.6931836009025574 +0.6930789947509766 +0.6932032108306885 +0.693130373954773 +0.6933913230895996 +0.6931992173194885 +0.6931376457214355 +0.6931767463684082 +0.6931583881378174 +0.6931485533714294 +0.693138837814331 +0.6931250095367432 +0.693103015422821 +0.6931023597717285 +0.6932260990142822 +0.6931752562522888 +0.6930729150772095 +0.6929311156272888 +0.693302571773529 +0.6932254433631897 +0.69317626953125 +0.693097710609436 +0.6930376291275024 +0.6931532621383667 +0.6931279301643372 +0.6931777596473694 +0.6931577324867249 +0.6931435465812683 +0.6931730508804321 +0.693141520023346 +0.6931696534156799 +0.6931543350219727 +0.6931476593017578 +0.6931471824645996 +0.6931589245796204 +0.693145751953125 +0.6931431293487549 +0.6931287050247192 +0.6931427717208862 +0.6931363344192505 +0.6931345462799072 +0.6931136250495911 +0.6930984258651733 +0.6931260228157043 +0.6932109594345093 +0.6931638121604919 +0.6931529641151428 +0.6931443214416504 +0.6931478381156921 +0.6931700110435486 +0.69312983751297 +0.6932106614112854 +0.6930972933769226 +0.6931629776954651 +0.6931963562965393 +0.6932249665260315 +0.6932281851768494 +0.6932195425033569 +0.6931582093238831 +0.6931502819061279 +0.693153440952301 +0.6930547952651978 +0.6932091116905212 +0.6930832862854004 +0.69318687915802 +0.693234384059906 +0.6931787133216858 +0.6931472420692444 +0.6931833624839783 +0.6931379437446594 +0.6931558847427368 +0.693196713924408 +0.6931143999099731 +0.693136990070343 +0.6931957602500916 +0.6931578516960144 +0.6931463479995728 +0.6931509375572205 +0.6931226253509521 +0.6931785941123962 +0.6931405663490295 +0.6931736469268799 +0.6931595206260681 +0.6931319236755371 +0.6931323409080505 +0.6931301355361938 +0.6931783556938171 +0.6931540966033936 +0.6930714249610901 +0.693152904510498 +0.6931881904602051 +0.6931595206260681 +0.6931363940238953 +0.6931393146514893 +0.6931549310684204 +0.6931518316268921 +0.6931600570678711 +0.6931359767913818 +0.693086564540863 +0.6930826306343079 +0.693168044090271 +0.6931942105293274 +0.6932410001754761 +0.693097710609436 +0.693099856376648 +0.69315505027771 +0.693153977394104 +0.6931472420692444 
+0.6931328177452087 +0.6931746602058411 +0.6931381821632385 +0.6931582689285278 +0.6933059692382812 +0.6930915117263794 +0.6931243538856506 +0.6934514045715332 +0.6933988928794861 +0.6932798624038696 +0.6931632161140442 +0.6931505799293518 +0.6931473016738892 +0.6931563019752502 +0.6931017637252808 +0.6932226419448853 +0.6932034492492676 +0.6931058764457703 +0.6932246088981628 +0.6930988430976868 +0.6931736469268799 +0.6931524276733398 +0.6931332945823669 +0.6931236386299133 +0.6931801438331604 +0.6931136250495911 +0.6931392550468445 +0.6931288838386536 +0.6931090950965881 +0.6931648254394531 \ No newline at end of file diff --git a/examples/demo/little_demo/deterministic_loss/loss1 b/examples/demo/little_demo/deterministic_loss/loss1 new file mode 100644 index 00000000..cfe29fc9 --- /dev/null +++ b/examples/demo/little_demo/deterministic_loss/loss1 @@ -0,0 +1,200 @@ +0.6931475400924683 +0.6930400133132935 +0.693139910697937 +0.6931551098823547 +0.6931850910186768 +0.6931071877479553 +0.6931338310241699 +0.6931014060974121 +0.6931450963020325 +0.69313645362854 +0.6930742263793945 +0.6931931376457214 +0.6931841373443604 +0.6931138038635254 +0.6932246685028076 +0.6931509971618652 +0.6931785941123962 +0.693133533000946 +0.6931544542312622 +0.6931360363960266 +0.6931753158569336 +0.6931651830673218 +0.6931511163711548 +0.6931532621383667 +0.6931378245353699 +0.6931488513946533 +0.6931437253952026 +0.6931431889533997 +0.693092942237854 +0.6931750178337097 +0.693172037601471 +0.6931487917900085 +0.6931712627410889 +0.6931683421134949 +0.6931533813476562 +0.6931492686271667 +0.6932083964347839 +0.6930453181266785 +0.6931129693984985 +0.6932074427604675 +0.6931206583976746 +0.6931827068328857 +0.6931033730506897 +0.6931606531143188 +0.6931372880935669 +0.69317227602005 +0.6931107044219971 +0.6931154727935791 +0.6931938529014587 +0.6932826638221741 +0.6932423710823059 +0.6931905746459961 +0.6931506395339966 +0.6931438446044922 +0.6931609511375427 +0.69315105676651 +0.6931482553482056 +0.6931400895118713 +0.6931483149528503 +0.6931435465812683 +0.6930944919586182 +0.6931304931640625 +0.6931834816932678 +0.6930789947509766 +0.6932030916213989 +0.693130373954773 +0.6933913826942444 +0.6931991577148438 +0.6931377649307251 +0.6931768655776978 +0.6931586861610413 +0.6931484341621399 +0.6931391358375549 +0.6931250691413879 +0.6931028366088867 +0.6931021213531494 +0.6932262182235718 +0.6931752562522888 +0.6930727362632751 +0.6929311156272888 +0.6933025121688843 +0.6932255625724792 +0.6931764483451843 +0.6930979490280151 +0.6930376887321472 +0.6931535005569458 +0.6931277513504028 +0.6931778788566589 +0.6931575536727905 +0.6931436657905579 +0.6931729316711426 +0.6931415796279907 +0.6931697726249695 +0.6931543946266174 +0.6931476593017578 +0.6931473016738892 +0.6931586861610413 +0.6931456923484802 +0.6931430697441101 +0.6931284070014954 +0.693142831325531 +0.6931363940238953 +0.6931345462799072 +0.6931135058403015 +0.6930984258651733 +0.6931260228157043 +0.6932108998298645 +0.6931638717651367 +0.6931529641151428 +0.6931443810462952 +0.6931477785110474 +0.6931700110435486 +0.6931299567222595 +0.6932107210159302 +0.6930974125862122 +0.6931627988815308 +0.6931964159011841 +0.6932250261306763 +0.6932283043861389 +0.6932194828987122 +0.6931582093238831 +0.6931501626968384 +0.693153440952301 +0.6930548548698425 +0.6932091116905212 +0.6930834650993347 +0.6931867599487305 +0.6932343244552612 +0.6931787133216858 +0.6931471824645996 +0.6931833028793335 +0.6931377649307251 +0.6931559443473816 +0.693196713924408 
+0.6931144595146179 +0.6931368708610535 +0.6931958198547363 +0.6931577920913696 +0.6931461691856384 +0.6931511163711548 +0.6931224465370178 +0.693178653717041 +0.6931405663490295 +0.6931737661361694 +0.6931594014167786 +0.6931319236755371 +0.6931324005126953 +0.6931299567222595 +0.6931784152984619 +0.6931542754173279 +0.6930714845657349 +0.693152666091919 +0.6931881308555603 +0.6931596994400024 +0.6931365132331848 +0.6931394338607788 +0.6931548714637756 +0.6931518316268921 +0.6931599974632263 +0.6931360363960266 +0.6930868029594421 +0.6930827498435974 +0.6931679844856262 +0.6931941509246826 +0.6932410001754761 +0.693097710609436 +0.693099856376648 +0.6931549906730652 +0.6931538581848145 +0.6931471824645996 +0.693132758140564 +0.6931745409965515 +0.6931381225585938 +0.6931583881378174 +0.6933057904243469 +0.693091630935669 +0.6931243538856506 +0.6934512853622437 +0.6933985948562622 +0.6932798624038696 +0.6931629180908203 +0.6931505799293518 +0.6931473612785339 +0.6931563019752502 +0.6931016445159912 +0.6932225227355957 +0.6932035088539124 +0.693105936050415 +0.6932247877120972 +0.6930989027023315 +0.6931736469268799 +0.6931525468826294 +0.6931331753730774 +0.6931236982345581 +0.69318026304245 +0.6931138038635254 +0.6931390762329102 +0.6931287050247192 +0.6931091547012329 +0.6931648850440979 \ No newline at end of file diff --git a/examples/demo/little_demo/deterministic_loss/loss2 b/examples/demo/little_demo/deterministic_loss/loss2 new file mode 100644 index 00000000..cfe29fc9 --- /dev/null +++ b/examples/demo/little_demo/deterministic_loss/loss2 @@ -0,0 +1,200 @@ +0.6931475400924683 +0.6930400133132935 +0.693139910697937 +0.6931551098823547 +0.6931850910186768 +0.6931071877479553 +0.6931338310241699 +0.6931014060974121 +0.6931450963020325 +0.69313645362854 +0.6930742263793945 +0.6931931376457214 +0.6931841373443604 +0.6931138038635254 +0.6932246685028076 +0.6931509971618652 +0.6931785941123962 +0.693133533000946 +0.6931544542312622 +0.6931360363960266 +0.6931753158569336 +0.6931651830673218 +0.6931511163711548 +0.6931532621383667 +0.6931378245353699 +0.6931488513946533 +0.6931437253952026 +0.6931431889533997 +0.693092942237854 +0.6931750178337097 +0.693172037601471 +0.6931487917900085 +0.6931712627410889 +0.6931683421134949 +0.6931533813476562 +0.6931492686271667 +0.6932083964347839 +0.6930453181266785 +0.6931129693984985 +0.6932074427604675 +0.6931206583976746 +0.6931827068328857 +0.6931033730506897 +0.6931606531143188 +0.6931372880935669 +0.69317227602005 +0.6931107044219971 +0.6931154727935791 +0.6931938529014587 +0.6932826638221741 +0.6932423710823059 +0.6931905746459961 +0.6931506395339966 +0.6931438446044922 +0.6931609511375427 +0.69315105676651 +0.6931482553482056 +0.6931400895118713 +0.6931483149528503 +0.6931435465812683 +0.6930944919586182 +0.6931304931640625 +0.6931834816932678 +0.6930789947509766 +0.6932030916213989 +0.693130373954773 +0.6933913826942444 +0.6931991577148438 +0.6931377649307251 +0.6931768655776978 +0.6931586861610413 +0.6931484341621399 +0.6931391358375549 +0.6931250691413879 +0.6931028366088867 +0.6931021213531494 +0.6932262182235718 +0.6931752562522888 +0.6930727362632751 +0.6929311156272888 +0.6933025121688843 +0.6932255625724792 +0.6931764483451843 +0.6930979490280151 +0.6930376887321472 +0.6931535005569458 +0.6931277513504028 +0.6931778788566589 +0.6931575536727905 +0.6931436657905579 +0.6931729316711426 +0.6931415796279907 +0.6931697726249695 +0.6931543946266174 +0.6931476593017578 +0.6931473016738892 +0.6931586861610413 +0.6931456923484802 
+0.6931430697441101 +0.6931284070014954 +0.693142831325531 +0.6931363940238953 +0.6931345462799072 +0.6931135058403015 +0.6930984258651733 +0.6931260228157043 +0.6932108998298645 +0.6931638717651367 +0.6931529641151428 +0.6931443810462952 +0.6931477785110474 +0.6931700110435486 +0.6931299567222595 +0.6932107210159302 +0.6930974125862122 +0.6931627988815308 +0.6931964159011841 +0.6932250261306763 +0.6932283043861389 +0.6932194828987122 +0.6931582093238831 +0.6931501626968384 +0.693153440952301 +0.6930548548698425 +0.6932091116905212 +0.6930834650993347 +0.6931867599487305 +0.6932343244552612 +0.6931787133216858 +0.6931471824645996 +0.6931833028793335 +0.6931377649307251 +0.6931559443473816 +0.693196713924408 +0.6931144595146179 +0.6931368708610535 +0.6931958198547363 +0.6931577920913696 +0.6931461691856384 +0.6931511163711548 +0.6931224465370178 +0.693178653717041 +0.6931405663490295 +0.6931737661361694 +0.6931594014167786 +0.6931319236755371 +0.6931324005126953 +0.6931299567222595 +0.6931784152984619 +0.6931542754173279 +0.6930714845657349 +0.693152666091919 +0.6931881308555603 +0.6931596994400024 +0.6931365132331848 +0.6931394338607788 +0.6931548714637756 +0.6931518316268921 +0.6931599974632263 +0.6931360363960266 +0.6930868029594421 +0.6930827498435974 +0.6931679844856262 +0.6931941509246826 +0.6932410001754761 +0.693097710609436 +0.693099856376648 +0.6931549906730652 +0.6931538581848145 +0.6931471824645996 +0.693132758140564 +0.6931745409965515 +0.6931381225585938 +0.6931583881378174 +0.6933057904243469 +0.693091630935669 +0.6931243538856506 +0.6934512853622437 +0.6933985948562622 +0.6932798624038696 +0.6931629180908203 +0.6931505799293518 +0.6931473612785339 +0.6931563019752502 +0.6931016445159912 +0.6932225227355957 +0.6932035088539124 +0.693105936050415 +0.6932247877120972 +0.6930989027023315 +0.6931736469268799 +0.6931525468826294 +0.6931331753730774 +0.6931236982345581 +0.69318026304245 +0.6931138038635254 +0.6931390762329102 +0.6931287050247192 +0.6931091547012329 +0.6931648850440979 \ No newline at end of file diff --git a/examples/demo/little_demo/deterministic_loss/loss3 b/examples/demo/little_demo/deterministic_loss/loss3 new file mode 100644 index 00000000..a38ce81c --- /dev/null +++ b/examples/demo/little_demo/deterministic_loss/loss3 @@ -0,0 +1,200 @@ +0.6931473016738892 +0.6930400729179382 +0.6931402087211609 +0.69315505027771 +0.6931849122047424 +0.6931070685386658 +0.6931337118148804 +0.6931014060974121 +0.693144679069519 +0.6931362748146057 +0.6930745840072632 +0.6931930780410767 +0.6931840777397156 +0.6931135654449463 +0.6932245492935181 +0.6931509375572205 +0.6931784152984619 +0.6931337714195251 +0.693154513835907 +0.6931360363960266 +0.6931752562522888 +0.6931653022766113 +0.6931512355804443 +0.6931530833244324 +0.6931378841400146 +0.6931486129760742 +0.6931437253952026 +0.6931434273719788 +0.6930927634239197 +0.6931749582290649 +0.6931719779968262 +0.6931490302085876 +0.6931714415550232 +0.6931683421134949 +0.6931533217430115 +0.6931492686271667 +0.6932084560394287 +0.6930454969406128 +0.6931130290031433 +0.6932073831558228 +0.6931207776069641 +0.6931827664375305 +0.693103551864624 +0.6931607127189636 +0.6931374669075012 +0.69317227602005 +0.6931108832359314 +0.6931152939796448 +0.6931939125061035 +0.6932826638221741 +0.6932422518730164 +0.6931905746459961 +0.693150520324707 +0.6931438446044922 +0.693160891532898 +0.6931508779525757 +0.693148136138916 +0.6931400299072266 +0.6931481957435608 +0.6931434869766235 +0.6930946111679077 +0.6931304335594177 
+0.693183422088623 +0.6930789947509766 +0.6932030320167542 +0.6931302547454834 +0.6933913826942444 +0.6931991577148438 +0.6931378841400146 +0.6931770443916321 +0.6931586265563965 +0.6931484937667847 +0.6931388974189758 +0.6931250095367432 +0.693103015422821 +0.6931023001670837 +0.6932262182235718 +0.6931753754615784 +0.6930726170539856 +0.6929312944412231 +0.693302571773529 +0.6932252645492554 +0.6931763291358948 +0.693097710609436 +0.6930376291275024 +0.6931533217430115 +0.6931278705596924 +0.6931778788566589 +0.6931576728820801 +0.6931436657905579 +0.6931729912757874 +0.6931415796279907 +0.6931697130203247 +0.6931542754173279 +0.6931475400924683 +0.6931473016738892 +0.6931589245796204 +0.6931456923484802 +0.6931431293487549 +0.6931285858154297 +0.693142831325531 +0.6931363344192505 +0.6931344270706177 +0.6931136250495911 +0.6930983066558838 +0.6931259632110596 +0.693211019039154 +0.6931636929512024 +0.6931530237197876 +0.6931443214416504 +0.6931476593017578 +0.6931700706481934 +0.69312983751297 +0.6932106614112854 +0.6930974125862122 +0.6931630373001099 +0.6931962370872498 +0.6932251453399658 +0.6932281851768494 +0.6932194828987122 +0.6931582093238831 +0.6931502819061279 +0.693153440952301 +0.6930545568466187 +0.693209171295166 +0.6930832862854004 +0.6931869387626648 +0.6932346224784851 +0.693178653717041 +0.6931472420692444 +0.6931833624839783 +0.6931377649307251 +0.6931559443473816 +0.6931968331336975 +0.6931143999099731 +0.6931371092796326 +0.6931959390640259 +0.6931577324867249 +0.6931463479995728 +0.6931511759757996 +0.6931225061416626 +0.6931787133216858 +0.6931406259536743 +0.6931735873222351 +0.6931595206260681 +0.6931317448616028 +0.6931322813034058 +0.6931302547454834 +0.6931782960891724 +0.6931543946266174 +0.6930716037750244 +0.6931528449058533 +0.6931881904602051 +0.6931595802307129 +0.6931363940238953 +0.6931394934654236 +0.6931549906730652 +0.6931518912315369 +0.6931601762771606 +0.6931359767913818 +0.6930863261222839 +0.6930826902389526 +0.693168044090271 +0.6931941509246826 +0.6932411193847656 +0.6930977702140808 +0.6930997967720032 +0.6931549906730652 +0.6931539177894592 +0.6931472420692444 +0.6931326985359192 +0.6931745409965515 +0.6931379437446594 +0.6931582689285278 +0.6933060884475708 +0.693091630935669 +0.6931243538856506 +0.6934512853622437 +0.693398654460907 +0.6932798624038696 +0.6931633353233337 +0.6931505799293518 +0.6931473612785339 +0.6931564211845398 +0.693101704120636 +0.6932228207588196 +0.6932032704353333 +0.6931060552597046 +0.6932246685028076 +0.6930988430976868 +0.6931737065315247 +0.6931525468826294 +0.6931332945823669 +0.6931236386299133 +0.6931802034378052 +0.6931136846542358 +0.6931393146514893 +0.6931288242340088 +0.6931090950965881 +0.6931647658348083 \ No newline at end of file diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index 5d5e151e..14b2e065 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -21,6 +21,7 @@ import shutil import warnings from glob import glob +import numpy as np import tensorflow as tf from mx_rec.constants.constants import ASCEND_TIMESTAMP @@ -192,9 +193,10 @@ if __name__ == "__main__": MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0))) USE_TIMESTAMP = bool(int(os.getenv("USE_TIMESTAMP", 0))) USE_ONE_SHOT = bool(int(os.getenv("USE_ONE_SHOT", 0))) + USE_DETERMINISTIC = bool(int(os.getenv("USE_DETERMINISTIC", 0))) except ValueError as err: raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_DYNAMIC_EXPANSION or " - 
"USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " + "USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT or USE_DETERMINISTIC" "only 0 or 1 is supported.") from err try: @@ -202,6 +204,10 @@ if __name__ == "__main__": except ValueError as err: raise ValueError("please correctly config MULTI_LOOKUP_TIMES only int is supported.") from err + if USE_DETERMINISTIC: + np.random.seed(128) + tf.random.set_random_seed(128) + if_load = False save_path = "./saved-model" model_file = [] @@ -261,11 +267,12 @@ if __name__ == "__main__": raise ValueError(f"cache mode must in {list(cache_mode_dict.keys())}, get:{cache_mode}") if cache_mode in ["DDR", "SSD"] and not use_dynamic: logger.warning("when cache_mode in [DDR, SSD], suggest use_dynamic=true to avoid tuning size parameter") - + emb_initializer = tf.compat.v1.constant_initializer(0) if USE_DETERMINISTIC \ + else tf.compat.v1.truncated_normal_initializer() user_hashtable = create_table(key_dtype=tf.int64, dim=tf.TensorShape([cfg.user_hashtable_dim]), name='user_table', - emb_initializer=tf.compat.v1.truncated_normal_initializer(), + emb_initializer=emb_initializer, optimizer_list=sparse_optimizer_list, all2all_gradients_op="sum_gradients_and_div_by_ranksize", **cache_mode_dict[cache_mode]) @@ -273,7 +280,7 @@ if __name__ == "__main__": item_hashtable = create_table(key_dtype=tf.int64, dim=tf.TensorShape([cfg.item_hashtable_dim]), name='item_table', - emb_initializer=tf.compat.v1.truncated_normal_initializer(), + emb_initializer=emb_initializer, optimizer_list=sparse_optimizer_list, **cache_mode_dict[cache_mode]) @@ -293,7 +300,8 @@ if __name__ == "__main__": batch_number=MAX_DATASET_GENERATE * get_rank_size()) dense_variables, sparse_variables = get_dense_and_sparse_variable() - params = {"train_batch": train_batch, "eval_batch": eval_batch, "use_one_shot": USE_ONE_SHOT} + params = {"train_batch": train_batch, "eval_batch": eval_batch, "use_one_shot": USE_ONE_SHOT, + "use_deterministic": USE_DETERMINISTIC} run_mode = RunMode( MODIFY_GRAPH_FLAG, USE_TIMESTAMP, table_list, optimizer_list, train_model, eval_model, train_iterator, eval_iterator, MAX_TRAIN_STEPS, EVAL_STEPS, params diff --git a/examples/demo/little_demo/run_deterministic.sh b/examples/demo/little_demo/run_deterministic.sh new file mode 100644 index 00000000..fbb4342d --- /dev/null +++ b/examples/demo/little_demo/run_deterministic.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +export USE_DETERMINISTIC=1 + +sh run.sh main.py | tee log + +grep -rn "loss" log | grep "1,0" | awk '{print $NF}'> loss + +rm -f log + +soc_name=`python3 -c 'import acl;print(acl.get_soc_name())'` +echo "soc_name: $soc_name" + +loss_file=deterministic_loss/loss${soc_name:10:1} + +if [ ! 
-e $loss_file ];then + echo "$loss_file file does not exist" + rm -f loss + exit 1 +fi + + +diff $loss_file loss + +if [ $? -eq 0 ]; then + echo "deterministic loss check passed" +else + echo "deterministic loss check failed" +fi + +rm -f loss diff --git a/examples/demo/little_demo/run_mode.py b/examples/demo/little_demo/run_mode.py index 6a3301c4..f164322a 100644 --- a/examples/demo/little_demo/run_mode.py +++ b/examples/demo/little_demo/run_mode.py @@ -44,7 +44,9 @@ class RunMode: eval_model, train_iterator, eval_iterator, max_train_steps: int, infer_steps: int, params: dict): self.is_modify_graph = is_modify_graph self.is_faae = is_faae - self.session = tf.compat.v1.Session(config=sess_config(dump_data=False)) + self.use_deterministic = params.get("use_deterministic") + self.session = tf.compat.v1.Session( + config=sess_config(dump_data=False, use_deterministic=self.use_deterministic)) self.train_model = train_model self.train_iterator = train_iterator self.eval_model = eval_model @@ -138,7 +140,9 @@ class RunMode: for i in range(start_step, start_step + self.max_train_steps): logger.info("################ training at step %d ################", i) try: - self.session.run([self.train_ops, self.train_model.loss_list]) + _, loss = self.session.run([self.train_ops, self.train_model.loss_list]) + if self.use_deterministic: + logger.info("train_loss: %s", loss[0]) except tf.errors.OutOfRangeError: logger.info("Encounter the end of Sequence for training.") break -- Gitee From d566330b9910877827beabbc87ad10b436986a1a Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Mon, 29 Apr 2024 06:08:04 +0000 Subject: [PATCH 070/302] !97 cleancode * Merge branch 'develop' of gitee.com:ascend/mxrec into develop_cleancode * Merge branch 'develop' of gitee.com:ascend/mxrec into develop_cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * Merge branch 'develop' of gitee.com:ascend/mxrec into develop_cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode * cleancode --- .../aclnn_op_test/inc/op_runner.h | 8 +- .../op_host/embedding_lookup_by_address.cpp | 26 ++- .../op_host/embedding_update_by_address.cpp | 23 ++- .../op_kernel/embedding_lookup_by_address.cpp | 150 ++++++++------- .../op_kernel/embedding_update_by_address.cpp | 172 ++++++++---------- examples/DCNv2/main_mxrec.py | 8 +- .../little_demo_estimator/nn_model_build.py | 11 +- .../little_demo_estimator/nn_model_input.py | 2 +- .../demo/little_demo_estimator/nn_optim.py | 4 +- examples/dlrm/criteo_tb/gen_ttf.py | 23 ++- examples/dlrm/model/main_mxrec.py | 14 +- examples/dlrm/model/mean_auc.py | 4 +- mx_rec/__init__.py | 2 +- mx_rec/core/asc/merge_table.py | 8 +- mx_rec/util/cpu.py | 1 - src/AccCTR/src/unique/unique_func.cpp | 7 +- src/AccCTR/src/unique/unique_func.h | 34 ++-- .../local_file_system/local_file_system.h | 2 - src/core/hd_transfer/hd_transfer.cpp | 4 +- src/core/hd_transfer/hd_transfer.h | 4 +- .../random_normal_initializer.cpp | 7 +- .../random_normal_initializer.h | 1 + .../truncated_normal_initializer.cpp | 4 +- src/core/utils/common.h | 22 +-- src/dataset_tf/eos_dataset_op.cc | 79 +++++--- 25 files changed, 318 insertions(+), 302 deletions(-) diff --git a/cust_op/cust_op_by_addr/aclnn_op_test/inc/op_runner.h b/cust_op/cust_op_by_addr/aclnn_op_test/inc/op_runner.h index bf923d7e..e41e3596 100644 ---
a/cust_op/cust_op_by_addr/aclnn_op_test/inc/op_runner.h +++ b/cust_op/cust_op_by_addr/aclnn_op_test/inc/op_runner.h @@ -140,16 +140,16 @@ public: /** * @brief Print readable input by index * @param [in] index: input index - * @param [in] elementsPerRow: number of elements per row + * @param [in] numElementsPerRow: number of elements per row */ - void PrintInput(size_t index, size_t elementsPerRow = 16); + void PrintInput(size_t index, size_t numElementsPerRow = 16); /** * @brief Print readable output by index * @param [in] index: output index - * @param [in] elementsPerRow: number of elements per row + * @param [in] numElementsPerRow: number of elements per row */ - void PrintOutput(size_t index, size_t elementsPerRow = 16); + void PrintOutput(size_t index, size_t numElementsPerRow = 16); /** * @brief Compile static op diff --git a/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp b/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp index 45681773..41a5b33a 100644 --- a/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp @@ -16,6 +16,12 @@ See the License for the specific language governing permissions and #include "embedding_lookup_by_address_tiling.h" #include "register/op_def_registry.h" +namespace { + constexpr int32_t EMBEDDING_TYPE_FLOAT16 = 2; + constexpr int32_t EMBEDDING_TYPE_INT32 = 0; + constexpr int32_t EMBEDDING_TYPE_FLOAT32 = 1; +} + namespace optiling { constexpr int32_t BLOCK_DIM = 48; // 910b一张卡48个vector核 @@ -81,7 +87,7 @@ namespace optiling int32_t inputShape = inputTensor->GetShapeSize(); int32_t typeSize = SIZE_OF_FLOAT_OR_INT; - if (embeddingType == 2) { + if (embeddingType == EMBEDDING_TYPE_FLOAT16) { typeSize = SIZE_OF_HALF; } // shape需要对齐到的最小单位, MIN_BLOCK_SIZE=32 @@ -92,7 +98,8 @@ namespace optiling int32_t occupyAddressBytesNum = sizeof(int64_t) + typeSize * embeddingDimAligned * PING_PONG_NUM * 2; // 一轮计算中最多计算多少个addr,由于地址也要搬到ub,所以需要对齐32, - int32_t addrPerLoop = (UB_LIMIT / occupyAddressBytesNum) & (~3); // & (~3),保证地址数是4的倍数 + int32_t addrPerLoop = static_cast((UB_LIMIT / + static_cast(occupyAddressBytesNum)) & (~3u)); // & (~3u),保证地址数是4的倍数 if (addrPerLoop <= 0) { return ge::GRAPH_FAILED; } @@ -116,6 +123,7 @@ namespace optiling namespace ge { + constexpr int OUTPUT_DIMENSION = 2; static ge::graphStatus InferShape1(gert::InferShapeContext *context) { @@ -140,8 +148,12 @@ namespace ge int64_t updateDim = *attr0Value; - int64_t inputShape = context->GetInputTensor(0)->GetShapeSize(); - yShape->SetDimNum(2); + auto *inputTensor2 = context->GetInputTensor(0); + if (optiling::CheckNullPointer(inputTensor2, "inputTensor2") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + int64_t inputShape = inputTensor2->GetShapeSize(); + yShape->SetDimNum(OUTPUT_DIMENSION); yShape->SetDim(0, inputShape); yShape->SetDim(1, updateDim); return GRAPH_SUCCESS; @@ -165,15 +177,15 @@ namespace ge } embbedingType = *attr1Value; - if (embbedingType == 0) + if (embbedingType == EMBEDDING_TYPE_INT32) { context->SetOutputDataType(0, ge::DataType(DT_INT32)); } - else if (embbedingType == 1) + else if (embbedingType == EMBEDDING_TYPE_FLOAT32) { context->SetOutputDataType(0, ge::DataType(DT_FLOAT)); } - else if (embbedingType == 2) + else if (embbedingType == EMBEDDING_TYPE_FLOAT16) { context->SetOutputDataType(0, ge::DataType(DT_FLOAT16)); diff --git a/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp 
index 5c45e2ab..d0e4b778 100644 --- a/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp @@ -16,13 +16,19 @@ See the License for the specific language governing permissions and #include "embedding_update_by_address_tiling.h" #include "register/op_def_registry.h" +namespace { + constexpr int32_t EMBEDDING_TYPE_FLOAT16 = 2; + constexpr int32_t EMBEDDING_TYPE_INT32 = 0; + constexpr int32_t EMBEDDING_TYPE_FLOAT32 = 1; +} + namespace optiling { constexpr int32_t BLOCK_DIM = 48; // 910b一张卡48个vector核 constexpr int32_t SIZE_OF_HALF = 2; constexpr int32_t SIZE_OF_FLOAT_OR_INT = 4; constexpr int32_t MIN_BLOCK_SIZE = 32; // ub空间的数据都要按照32对齐 - constexpr int32_t UB_LIMIT = 175 * 1024; + constexpr uint32_t UB_LIMIT = 175 * 1024; constexpr int32_t USR_SIZE = 256; constexpr int32_t SYS_WORKSPACE_SIZE = 16 * 1024 * 1024; constexpr int32_t PING_PONG_NUM = 1; @@ -67,7 +73,7 @@ namespace optiling return ge::GRAPH_FAILED; } - int32_t inputShape = inputTensor->GetShapeSize(); + int64_t inputShape = static_cast(inputTensor->GetShapeSize()); if (CheckPositiveInt(inputShape, "inputShape") != ge::GRAPH_SUCCESS) { return ge::GRAPH_FAILED; } @@ -78,7 +84,7 @@ namespace optiling } const int32_t inputShapeTmp = (inputShape > 0) ? inputShape : 1; - int32_t inputDim = inputTensor1->GetShapeSize() / inputShapeTmp; + int64_t inputDim = static_cast(inputTensor1->GetShapeSize() / inputShapeTmp); if (CheckPositiveInt(inputDim, "inputDim") != ge::GRAPH_SUCCESS) { return ge::GRAPH_FAILED; } @@ -97,15 +103,15 @@ namespace optiling ge::DataType inputDatatype = inputTensor1->GetDataType(); int32_t embeddingType; if (inputDatatype == ge::DT_FLOAT16) { - embeddingType = 2; + embeddingType = EMBEDDING_TYPE_FLOAT16; } else if (inputDatatype == ge::DT_INT32) { - embeddingType = 0; + embeddingType = EMBEDDING_TYPE_INT32; } else { - embeddingType = 1; + embeddingType = EMBEDDING_TYPE_FLOAT32; } int32_t typeSize = SIZE_OF_FLOAT_OR_INT; - if (embeddingType == 2) { + if (embeddingType == EMBEDDING_TYPE_FLOAT16) { typeSize = SIZE_OF_HALF; } int32_t alignNum = MIN_BLOCK_SIZE / typeSize; @@ -116,7 +122,8 @@ namespace optiling int32_t occupyAddressBytesNum = sizeof(int64_t) + typeSize * inputDimAligned * PING_PONG_NUM * 2; // 一轮计算中最多计算多少个addr,由于地址也要搬到ub,所以需要对齐32 - int32_t addrPerLoop = (UB_LIMIT / occupyAddressBytesNum) & (~3); // & (~3),保证地址数是4的倍数 + int32_t addrPerLoop = static_cast((UB_LIMIT / + occupyAddressBytesNum) & (~3U)); // & (~3U),保证地址数是4的倍数 if (CheckPositiveInt(addrPerLoop, "addrPerLoop") != ge::GRAPH_SUCCESS) { return ge::GRAPH_FAILED; } diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp index 1a58768c..3fded632 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp @@ -32,7 +32,7 @@ public: needComputeAddrLen = singleCoreAddrLen; if (block_idx == block_num - 1) // 最后一个core,需要多计算的addr长度 { - needComputeAddrLen = addrNums * sizeof(int64_t) - singleCoreAddrLen * (block_num - 1); + needComputeAddrLen = addrNums * sizeof(int64_t) - singleCoreAddrLen * (block_num - 1); } loopCount = needComputeAddrLen / (addrNumPerLoop * sizeof(int64_t)); // 可能为0 @@ -73,105 +73,99 @@ public: if (loopCount > 0) { - for (int32_t i = 0; i < loopCount; i++) - { - DataCopy(srcAddrLocal, srcAddrGlobal[i * addrNumPerLoop], addrNumPerLoop); - MoveProcess(srcAddrLocal, i, addrNumPerLoop); - } + 
for (int32_t i = 0; i < loopCount; i++) { + DataCopy(srcAddrLocal, srcAddrGlobal[i * addrNumPerLoop], addrNumPerLoop); + MoveProcess(srcAddrLocal, i, addrNumPerLoop); + } } // 处理最后一张卡剩下的addr int unProcess = (needComputeAddrLen / sizeof(int64_t)) % addrNumPerLoop; if (unProcess) { - int unProcessAligned = (unProcess + 3) & (~3); // 处理 addressList 不对齐32b的情况 - // 地址列表访问越界,对齐考虑无问题,会自动多申请一部分,兼容 - DataCopy(srcAddrLocal, srcAddrGlobal[loopCount * addrNumPerLoop], unProcessAligned); - MoveProcess(srcAddrLocal, loopCount, unProcess); + int unProcessAligned = static_cast + ((static_cast(unProcess) + 3) & (~3U)); // 处理 addressList 不对齐32b的情况 + // 地址列表访问越界,对齐考虑无问题,会自动多申请一部分,兼容 + DataCopy(srcAddrLocal, srcAddrGlobal[loopCount * addrNumPerLoop], unProcessAligned); + MoveProcess(srcAddrLocal, loopCount, unProcess); } } private: - __aicore__ inline void MoveProcess(const LocalTensor srcAddrLocal, const int turns, int addrNum) - { - set_flag(PIPE_MTE2, PIPE_S, 0); - wait_flag(PIPE_MTE2, PIPE_S, 0); - LocalTensor dataLocal = inQueue.AllocTensor(); // Queue的大小可以容下一个循环的所有emb - bool isFull = false; - int nums = 0; - int outIndex = 0; - int times = embDimAligned >> 3; // >>3位运算:除以8。 embDimAligned一定是8的倍数,若地址无效时,每次填充8个0 - int tmpCache = cache - 1; // 设计初是一次cache执行多次copyin、一次compute和一次copyout,现状是一次loop就只对应一次cache - - for (int i = 0; i < addrNum; i++) + __aicore__ inline void MoveProcess(const LocalTensor srcAddrLocal, const int turns, int addrNum) { - // 多次copyIn, 对应一次compute和copyOut,由cache决定 - dataLocal = isFull ? inQueue.AllocTensor() : dataLocal; - int64_t address = srcAddrLocal.GetValue(i); - - if (address != 0) - { - srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(address), embDimAligned); - DataCopy(dataLocal[embDimAligned * nums], srcDataBufferGm, embDimAligned); - } - else - { - for (int j = 0; j < times; j++) + set_flag(PIPE_MTE2, PIPE_S, 0); + wait_flag(PIPE_MTE2, PIPE_S, 0); + LocalTensor dataLocal = inQueue.AllocTensor(); // Queue的大小可以容下一个循环的所有emb + bool isFull = false; + int nums = 0; + int outIndex = 0; + int times = embDimAligned >> 3; // >>3位运算:除以8。 embDimAligned一定是8的倍数,若地址无效时,每次填充8个0 + int tmpCache = cache - 1; // 设计初是一次cache执行多次copyin、一次compute和一次copyout,现状是一次loop就只对应一次cache + + for (int i = 0; i < addrNum; i++) { - Duplicate(dataLocal[embDimAligned * nums + j * PADDING_ZERO_NUM_PER_TIME], (T)0, PADDING_ZERO_NUM_PER_TIME); + // 多次copyIn, 对应一次compute和copyOut,由cache决定 + dataLocal = isFull ? 
inQueue.AllocTensor() : dataLocal; + int64_t address = srcAddrLocal.GetValue(i); + + if (address != 0) { + srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(address), embDimAligned); + DataCopy(dataLocal[embDimAligned * nums], srcDataBufferGm, embDimAligned); + } else { + for (int j = 0; j < times; j++) { + Duplicate(dataLocal[embDimAligned * nums + j * PADDING_ZERO_NUM_PER_TIME], + (T)0, PADDING_ZERO_NUM_PER_TIME); + } + } + + nums++; + isFull = (i == tmpCache || i == addrNum - 1); // cache满了,或者最后一个地址 + if (isFull) { + inQueue.EnQue(dataLocal); + Compute(nums); + CopyOut(outIndex, turns, nums); + nums = 0; + outIndex = i + 1; + tmpCache += cache; + } } - } - - nums++; - isFull = (i == tmpCache || i == addrNum - 1); // cache满了,或者最后一个地址 - if (isFull) - { - inQueue.EnQue(dataLocal); - Compute(nums); - CopyOut(outIndex, turns, nums); - nums = 0; - outIndex = i + 1; - tmpCache += cache; - } } - } - __aicore__ inline void Compute(const int nums) - { - // deque input tensors from VECIN queue - LocalTensor srcLocal = inQueue.DeQue(); - LocalTensor dstLocal = outQueue.AllocTensor(); + __aicore__ inline void Compute(const int nums) + { + // deque input tensors from VECIN queue + LocalTensor srcLocal = inQueue.DeQue(); + LocalTensor dstLocal = outQueue.AllocTensor(); - DataCopyParams copyParams; - copyParams.blockCount = 1; - copyParams.blockLen = (embDimAligned * sizeof(T) * nums) >> 5; // >> 5, 除以32,ub空间对齐 - DataCopy(dstLocal, srcLocal, copyParams); + DataCopyParams copyParams; + copyParams.blockCount = 1; + copyParams.blockLen = (embDimAligned * sizeof(T) * nums) >> 5; // >> 5, 除以32,ub空间对齐 + DataCopy(dstLocal, srcLocal, copyParams); - outQueue.EnQue(dstLocal); - inQueue.FreeTensor(srcLocal); - } + outQueue.EnQue(dstLocal); + inQueue.FreeTensor(srcLocal); + } - __aicore__ inline void CopyOut(const int index, const int turns, const int nums) - { - LocalTensor dstLocal = outQueue.DeQue(); + __aicore__ inline void CopyOut(const int index, const int turns, const int nums) + { + LocalTensor dstLocal = outQueue.DeQue(); - int offset = block_idx * dim * singleCoreAddrLen / sizeof(int64_t) + (turns * addrNumPerLoop * dim) + dim * index; + int offset = block_idx * dim * singleCoreAddrLen / + sizeof(int64_t) + (turns * addrNumPerLoop * dim) + dim * index; #if defined(__DAV_C220_VEC__) - if (typeSize == SIZE_OF_FLOAT_OR_INT) - { - copy_ubuf_to_gm_align_b32((__gm__ T *)dstDataGm[offset].GetPhyAddr(), (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, - nums, dim * sizeof(T), 0, 0, 0, 0); - } - else if (typeSize == SIZE_OF_HALF) - { - copy_ubuf_to_gm_align_b16((__gm__ T *)dstDataGm[offset].GetPhyAddr(), (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, - nums, dim * sizeof(T), 0, 0, 0, 0); - } + if (typeSize == SIZE_OF_FLOAT_OR_INT) { + copy_ubuf_to_gm_align_b32((__gm__ T *)dstDataGm[offset].GetPhyAddr(), + (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, nums, dim * sizeof(T), 0, 0, 0, 0); + } else if (typeSize == SIZE_OF_HALF) { + copy_ubuf_to_gm_align_b16((__gm__ T *)dstDataGm[offset].GetPhyAddr(), + (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, nums, dim * sizeof(T), 0, 0, 0, 0); + } #else - DataCopy(dstDataGm[offset], dstLocal, embDimAligned * nums); + DataCopy(dstDataGm[offset], dstLocal, embDimAligned * nums); #endif - outQueue.FreeTensor(dstLocal); - } + outQueue.FreeTensor(dstLocal); + } public: int32_t addrNumPerLoop, loopCount, singleCoreAddrLen, needComputeAddrLen, veclen, dim, pingpongNum, cache; diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp 
b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp index 98847260..4a13c3eb 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp @@ -31,7 +31,7 @@ public: needComputeAddrLen = singleCoreAddrLen; if (block_idx == block_num - 1) { - needComputeAddrLen = addrNums * sizeof(int64_t) - singleCoreAddrLen * (block_num - 1); + needComputeAddrLen = addrNums * sizeof(int64_t) - singleCoreAddrLen * (block_num - 1); } loopCount = needComputeAddrLen / (addrNumPerLoop * sizeof(int64_t)); @@ -41,7 +41,8 @@ public: // get start index for current core, core parallel block_indx block_dim srcAddrGlobal.SetGlobalBuffer((__gm__ int64_t *)(address + block_idx * singleCoreAddrLen)); - srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(embedding + block_idx * singleCoreAddrLen / sizeof(int64_t) * sizeof(T) * dim)); + srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(embedding + block_idx * singleCoreAddrLen + / sizeof(int64_t) * sizeof(T) * dim)); outDataGm.SetGlobalBuffer((__gm__ T *)(y)); } @@ -72,120 +73,105 @@ public: if (loopCount > 0) { - for (int32_t i = 0; i < loopCount; i++) - { - DataCopy(srcAddrLocal, srcAddrGlobal[i * addrNumPerLoop], addrNumPerLoop); - MoveProcess(srcAddrLocal, i, addrNumPerLoop); - } + for (int32_t i = 0; i < loopCount; i++) { + DataCopy(srcAddrLocal, srcAddrGlobal[i * addrNumPerLoop], addrNumPerLoop); + MoveProcess(srcAddrLocal, i, addrNumPerLoop); + } } int unProcess = (needComputeAddrLen / sizeof(int64_t)) % addrNumPerLoop; if (unProcess) { - int unProcessAligned = (unProcess + 3) & (~3); // 处理 addressList 不对齐32b的情况 - DataCopy(srcAddrLocal, srcAddrGlobal[loopCount * addrNumPerLoop], unProcessAligned); - MoveProcess(srcAddrLocal, loopCount, unProcess); + int unProcessAligned = (static_cast(unProcess) + 3) & (~3U); // 处理 addressList 不对齐32b的情况 + DataCopy(srcAddrLocal, srcAddrGlobal[loopCount * addrNumPerLoop], unProcessAligned); + MoveProcess(srcAddrLocal, loopCount, unProcess); } } private: - __aicore__ inline void MoveProcess(const LocalTensor srcAddrLocal, const int turns, int addrNum) - { - set_flag(PIPE_MTE2, PIPE_S, 0); - wait_flag(PIPE_MTE2, PIPE_S, 0); - LocalTensor dataLocal; - - int64_t address = 0; - if (dim == inputDimAligned) // copyIn 和 compute一次,copyOut多次 + __aicore__ inline void MoveProcess(const LocalTensor srcAddrLocal, const int turns, int addrNum) { - dataLocal = inQueue.AllocTensor(); - DataCopy(dataLocal, srcDataBufferGm[turns * addrNumPerLoop * dim], addrNum * inputDimAligned); - inQueue.EnQue(dataLocal); - - Compute(addrNum); // 只有copyOut的管道支持拷贝到gm上 - - LocalTensor dstLocal = outQueue.DeQue(); - if (updateType == 0) - { - SetAtomicAdd(); - } - for (int i = 0; i < addrNum; i++) - { - address = srcAddrLocal.GetValue(i); - if (address != 0) + set_flag(PIPE_MTE2, PIPE_S, 0); + wait_flag(PIPE_MTE2, PIPE_S, 0); + LocalTensor dataLocal; + + int64_t address = 0; + if (dim == inputDimAligned) // copyIn 和 compute一次,copyOut多次 { - dstDataGm.SetGlobalBuffer((__gm__ T*)(address)); - DataCopy(dstDataGm, dstLocal[i * inputDimAligned], inputDimAligned); + dataLocal = inQueue.AllocTensor(); + DataCopy(dataLocal, srcDataBufferGm[turns * addrNumPerLoop * dim], addrNum * inputDimAligned); + inQueue.EnQue(dataLocal); + + Compute(addrNum); // 只有copyOut的管道支持拷贝到gm上 + + LocalTensor dstLocal = outQueue.DeQue(); + if (updateType == 0) { + SetAtomicAdd(); + } + for (int i = 0; i < addrNum; i++) { + address = srcAddrLocal.GetValue(i); + if (address != 0) { + 
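+                        // Clarifying note (inferred from this kernel, not part of the original
+                        // change): a zero address marks an invalid slot, so the write-back is
+                        // skipped for it; for valid rows the DataCopy below moves this row's UB
+                        // tile to the table row at `address` in GM, with atomic accumulation
+                        // active when updateType == 0 (SetAtomicAdd above).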
dstDataGm.SetGlobalBuffer((__gm__ T*)(address)); + DataCopy(dstDataGm, dstLocal[i * inputDimAligned], inputDimAligned); + } + } + if (updateType == 0) { + SetAtomicNone(); + } + outQueue.FreeTensor(dstLocal); + } else { + for (int i = 0; i < addrNum; i++) { + dataLocal = inQueue.AllocTensor(); + DataCopy(dataLocal, srcDataBufferGm[i * dim + turns * addrNumPerLoop * dim], inputDimAligned); + inQueue.EnQue(dataLocal); + Compute(1); + address = srcAddrLocal.GetValue(i); + CopyOut(address, turns, i); + } } - } - if (updateType == 0) - { - SetAtomicNone(); - } - outQueue.FreeTensor(dstLocal); } - else + + __aicore__ inline void Compute(const int nums) { - for (int i = 0; i < addrNum; i++) - { - dataLocal = inQueue.AllocTensor(); - DataCopy(dataLocal, srcDataBufferGm[i * dim + turns * addrNumPerLoop * dim], inputDimAligned); - inQueue.EnQue(dataLocal); - Compute(1); - address = srcAddrLocal.GetValue(i); - CopyOut(address, turns, i); - } + // deque input tensors from VECIN queue + LocalTensor srcLocal = inQueue.DeQue(); + LocalTensor dstLocal = outQueue.AllocTensor(); + DataCopyParams copyparams; + copyparams.blockCount = 1; + copyparams.blockLen = (inputDimAligned * sizeof(T) * nums) >> 5; // >> 5, 除以32,ub空间对齐 + DataCopy(dstLocal, srcLocal, copyparams); + outQueue.EnQue(dstLocal); + inQueue.FreeTensor(srcLocal); } - } - - __aicore__ inline void Compute(const int nums) - { - // deque input tensors from VECIN queue - LocalTensor srcLocal = inQueue.DeQue(); - LocalTensor dstLocal = outQueue.AllocTensor(); - DataCopyParams copyparams; - copyparams.blockCount = 1; - copyparams.blockLen = (inputDimAligned * sizeof(T) * nums) >> 5; // >> 5, 除以32,ub空间对齐 - DataCopy(dstLocal, srcLocal, copyparams); - outQueue.EnQue(dstLocal); - inQueue.FreeTensor(srcLocal); - } - __aicore__ inline void CopyOut(const int64_t address, const int64_t turns, const int64_t index) - { - LocalTensor dstLocal = outQueue.DeQue(); - - if (address != 0) + __aicore__ inline void CopyOut(const int64_t address, const int64_t turns, const int64_t index) { - dstDataGm.SetGlobalBuffer((__gm__ T *)(address)); + LocalTensor dstLocal = outQueue.DeQue(); - if (updateType == 0) - { - SetAtomicAdd(); - } + if (address != 0) { + dstDataGm.SetGlobalBuffer((__gm__ T *)(address)); + + if (updateType == 0) { + SetAtomicAdd(); + } #if defined(__DAV_C220_VEC__) - if (typeSize == SIZE_OF_FLOAT_OR_INT) - { - - copy_ubuf_to_gm_align_b32((__gm__ T *)dstDataGm.GetPhyAddr(), (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, - 1, dim * sizeof(T), 0, 0, 0, 0); - } - else if (typeSize == SIZE_OF_HALF) - { - copy_ubuf_to_gm_align_b16((__gm__ T *)dstDataGm.GetPhyAddr(), (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, - 1, dim * sizeof(T), 0, 0, 0, 0); - } + if (typeSize == SIZE_OF_FLOAT_OR_INT) { + copy_ubuf_to_gm_align_b32((__gm__ T *)dstDataGm.GetPhyAddr(), (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, + 1, dim * sizeof(T), 0, 0, 0, 0); + } else if (typeSize == SIZE_OF_HALF) { + copy_ubuf_to_gm_align_b16((__gm__ T *)dstDataGm.GetPhyAddr(), (__ubuf__ T *)dstLocal.GetPhyAddr(), 0, + 1, dim * sizeof(T), 0, 0, 0, 0); + } #else - DataCopy(dstDataGm, dstLocal, inputDimAligned); + DataCopy(dstDataGm, dstLocal, inputDimAligned); #endif + } + if (updateType == 0) { + SetAtomicNone(); + } + outQueue.FreeTensor(dstLocal); } - if (updateType == 0) - { - SetAtomicNone(); - } - outQueue.FreeTensor(dstLocal); - } public: int32_t addrNumPerLoop, loopCount, singleCoreAddrLen, needComputeAddrLen, addrNums, cache, veclen, dim, pingpongNum; diff --git a/examples/DCNv2/main_mxrec.py 
b/examples/DCNv2/main_mxrec.py index 6fd235ba..5e4efe02 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -154,7 +154,7 @@ def evaluate(): try: eval_current_steps += 1 eval_start = time.time() - eval_loss, pred, label = sess.run([eval_model["loss"], eval_model["pred"], eval_label]) + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label]) eval_cost = time.time() - eval_start eval_qps = (1 / eval_cost) * rank_size * cfg.batch_size log_loss_list += list(eval_loss.reshape(-1)) @@ -185,7 +185,7 @@ def evaluate_fix(step): while not finished: try: eval_current_steps += 1 - eval_loss, pred, label = sess.run([eval_model["loss"], eval_model["pred"], eval_model["label"]]) + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_model.get("label")]) log_loss_list += list(eval_loss.reshape(-1)) pred_list += list(pred.reshape(-1)) label_list += list(label.reshape(-1)) @@ -322,7 +322,7 @@ if __name__ == "__main__": rank_size = mxrec_util.communication.hccl_ops.get_rank_size() train_ops = [] # multi task training - for loss, (dense_optimizer, sparse_optimizer) in zip([train_model["loss"]], optimizer_list): + for loss, (dense_optimizer, sparse_optimizer) in zip([train_model.get("loss")], optimizer_list): # do dense optimization grads = dense_optimizer.compute_gradients(loss, var_list=dense_variables) avg_grads = [] @@ -404,7 +404,7 @@ if __name__ == "__main__": start_time = time.time() try: - grad, loss = sess.run([train_ops, train_model["loss"]]) + grad, loss = sess.run([train_ops, train_model.get("loss")]) lr = sess.run(cfg.learning_rate) global_step = sess.run(cfg.global_step) except tf.errors.OutOfRangeError: diff --git a/examples/demo/little_demo_estimator/nn_model_build.py b/examples/demo/little_demo_estimator/nn_model_build.py index e715f930..67820d04 100644 --- a/examples/demo/little_demo_estimator/nn_model_build.py +++ b/examples/demo/little_demo_estimator/nn_model_build.py @@ -207,15 +207,16 @@ class LittleModel: return embedding_list -def _make_ids_with_const_ops(input: Tensor) -> Tensor: - const_ids = tf.constant(1, shape=input.shape, dtype=input.dtype) +def _make_ids_with_const_ops(input_tensor: Tensor) -> Tensor: + const_ids = tf.constant(1, shape=input_tensor.shape, dtype=input_tensor.dtype) const_ids = tf.compat.v1.add(const_ids, 1) const_ids = tf.compat.v1.subtract(const_ids, 1) return const_ids -def _make_ids_with_str_ops(input: Tensor) -> Tensor: - str_ids = tf.compat.v1.strings.as_string(input) + +def _make_ids_with_str_ops(input_tensor: Tensor) -> Tensor: + str_ids = tf.compat.v1.strings.as_string(input_tensor) str_ids = tf.compat.v1.strings.to_number(str_ids) - + return str_ids diff --git a/examples/demo/little_demo_estimator/nn_model_input.py b/examples/demo/little_demo_estimator/nn_model_input.py index d763c058..d6ebb529 100644 --- a/examples/demo/little_demo_estimator/nn_model_input.py +++ b/examples/demo/little_demo_estimator/nn_model_input.py @@ -17,10 +17,10 @@ import tensorflow as tf from mx_rec.constants.constants import ASCEND_TIMESTAMP +from mx_rec.util.log import logger from nn_model_build import LittleModel from nn_optim import get_train_op -from mx_rec.util.log import logger def get_model_fn(create_fs_params, cfg, access_and_evict_config_dict=None): diff --git a/examples/demo/little_demo_estimator/nn_optim.py b/examples/demo/little_demo_estimator/nn_optim.py index 3be3c7ed..415c5ff2 100644 --- a/examples/demo/little_demo_estimator/nn_optim.py +++ 
b/examples/demo/little_demo_estimator/nn_optim.py @@ -55,9 +55,7 @@ def get_train_op_list(losses, learning_rate): dense_variables, sparse_variables = get_dense_and_sparse_variable() trainable_variables = [dense_variables] - for i in range(len(losses)): - name = losses[i][0] - loss = losses[i][1] + for i, (name, loss) in enumerate(losses): with tf.control_dependencies(update_ops): # do dense grad grads = dense_optimizer.compute_gradients(loss, var_list=trainable_variables) diff --git a/examples/dlrm/criteo_tb/gen_ttf.py b/examples/dlrm/criteo_tb/gen_ttf.py index 04b7b767..8715f048 100644 --- a/examples/dlrm/criteo_tb/gen_ttf.py +++ b/examples/dlrm/criteo_tb/gen_ttf.py @@ -19,12 +19,12 @@ import collections import logging import argparse from multiprocessing import Process -import numpy as np +import sys import time +import numpy as np from tqdm import tqdm from glob import glob from collections import Counter, OrderedDict -import sys import tensorflow as tf @@ -91,7 +91,7 @@ class CriteoStatsDict(): @staticmethod def save_dict(output_file_path, hist_map, prefix=""): - with open(os.path.join(output_file_path, "{}hist_map.pkl".format(prefix)), "wb") as file_wrt: + with os.fdopen(os.open(os.path.join(output_file_path, "{}hist_map.pkl".format(prefix)), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o640), "wb") as file_wrt:  # os.fdopen needs a descriptor from os.open, not a path + pickle.dump(hist_map, file_wrt) def load_dict(self, dict_path, prefix=""): @@ -188,7 +188,7 @@ def get_unique_id_multiprocess(proc_num, proc_id, data_file_path, output_file_pa if capped_value not in cat_sets: cat_sets[k][capped_value] = cat_global_id_nums[k] cat_global_id_nums[k] += 1 - with open(os.path.join(output_file_path, "unique_id.pkl"), "wb") as file_wrt: + with os.fdopen(os.open(os.path.join(output_file_path, "unique_id.pkl"), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o640), "wb") as file_wrt: pickle.dump(cat_sets, file_wrt) print('statsdata time cost: {:.2f}s'.format(time.time() - start_time)) @@ -247,7 +247,7 @@ def convert_input2tfrd_multiprocess(proc_num, proc_id, in_file_path, output_file with open(in_file_path, encoding="utf-8") as file_in: errorline_list = [] - for i, line in tqdm(enumerate(file_in)): + for _ in tqdm(file_in): line_num += 1 print(f'line_num: {line_num}') start_line = proc_id * ((line_num + proc_num) // proc_num) @@ -370,9 +370,9 @@ if __name__ == "__main__": sub_process_num = process_num // len(train_data_files) data_file = train_data_files[process_id // sub_process_num] output_path = f'{save_tfrecord_path}/{process_id:04}_' - p = Process(target=convert_input2tfrd_multiprocess, args=(sub_process_num, process_id%sub_process_num, data_file, output_path, - criteo_stats, spe_num, - 5000000)) + p = Process(target=convert_input2tfrd_multiprocess, args=(sub_process_num, process_id % sub_process_num, + data_file, output_path, criteo_stats, spe_num, + 5000000)) processs.append(p) for p in processs: p.start() @@ -394,10 +394,9 @@ if __name__ == "__main__": sub_process_num = process_num // len(test_data_files) data_file = test_data_files[process_id // sub_process_num] output_path = f'{save_tfrecord_path}/{process_id:04}_' - p = Process(target=convert_input2tfrd_multiprocess, args=(sub_process_num, process_id%sub_process_num, data_file, output_path, - criteo_stats, spe_num, - 5000000)) - + p = Process(target=convert_input2tfrd_multiprocess, args=(sub_process_num, process_id % sub_process_num, + data_file, output_path, criteo_stats, spe_num, + 5000000)) processs.append(p) for p in processs: p.start() diff --git a/examples/dlrm/model/main_mxrec.py index b6036804..8c4cdd7e 100644 --- a/examples/dlrm/model/main_mxrec.py +++
b/examples/dlrm/model/main_mxrec.py @@ -24,6 +24,9 @@ import tensorflow as tf from sklearn.metrics import roc_auc_score import numpy as np +from optimizer import get_dense_and_sparse_optimizer +from config import sess_config, Config +from model import MyModel from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -38,9 +41,6 @@ from mx_rec.util.variable import get_dense_and_sparse_variable from mx_rec.util.log import logger from npu_bridge.npu_init import * -from model import MyModel -from config import sess_config, Config -from optimizer import get_dense_and_sparse_optimizer npu_plugin.set_device_sat_mode(0) @@ -158,7 +158,7 @@ def evaluate(): try: eval_current_steps += 1 eval_start = time.time() - eval_loss, pred, label = sess.run([eval_model["loss"], eval_model["pred"], eval_label]) + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label]) eval_cost = time.time() - eval_start qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size log_loss_list += list(eval_loss.reshape(-1)) @@ -189,7 +189,7 @@ def evaluate_fix(step): while not finished: try: eval_current_steps += 1 - eval_loss, pred, label = sess.run([eval_model["loss"], eval_model["pred"], eval_model["label"]]) + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_model.get("label")]) log_loss_list += list(eval_loss.reshape(-1)) pred_list += list(pred.reshape(-1)) label_list += list(label.reshape(-1)) @@ -331,7 +331,7 @@ if __name__ == "__main__": rank_size = mxrec_util.communication.hccl_ops.get_rank_size() train_ops = [] # multi task training - for loss, (dense_optimizer, sparse_optimizer) in zip([train_model["loss"]], optimizer_list): + for loss, (dense_optimizer, sparse_optimizer) in zip([train_model.get("loss")], optimizer_list): # do dense optimization grads = dense_optimizer.compute_gradients(loss, var_list=trainable_varibles) avg_grads = [] @@ -411,7 +411,7 @@ if __name__ == "__main__": start_time = time.time() try: - grad, loss = sess.run([train_ops, train_model["loss"]]) + grad, loss = sess.run([train_ops, train_model.get("loss")]) lr = sess.run(cfg.learning_rate) global_step = sess.run(cfg.global_step) except tf.errors.OutOfRangeError: diff --git a/examples/dlrm/model/mean_auc.py b/examples/dlrm/model/mean_auc.py index 1116ebd5..ff57df00 100644 --- a/examples/dlrm/model/mean_auc.py +++ b/examples/dlrm/model/mean_auc.py @@ -15,8 +15,8 @@ # ============================================================================== import os -import numpy as np from glob import glob +import numpy as np def split_auc(log_input): @@ -26,7 +26,7 @@ def split_auc(log_input): if 'Test' in line: all_auc.append(float(line.split(';')[0].split(':')[-1].strip())) all_auc_len = len(all_auc) - all_auc_arr = np.array(all_auc)[:all_auc_len - all_auc_len%8] + all_auc_arr = np.array(all_auc)[:all_auc_len - all_auc_len % 8] test_auc = np.mean(all_auc_arr.reshape(-1, 8), axis=-1) return test_auc diff --git a/mx_rec/__init__.py b/mx_rec/__init__.py index bdb85131..64cdcc16 100644 --- a/mx_rec/__init__.py +++ b/mx_rec/__init__.py @@ -15,6 +15,7 @@ # limitations under the License. 
# ============================================================================== +__version__ = "5.0.RC2" __all__ = ["version", "__version__"] from mx_rec.constants.constants import ASCEND_GLOBAL_HASHTABLE_COLLECTION @@ -34,7 +35,6 @@ patch_for_assert_eval_spec() patch_for_bool_gauge() patch_for_optimizer() patch_for_session() -__version__ = "5.0.RC2" def version(): diff --git a/mx_rec/core/asc/merge_table.py b/mx_rec/core/asc/merge_table.py index 776a72c4..fb993032 100644 --- a/mx_rec/core/asc/merge_table.py +++ b/mx_rec/core/asc/merge_table.py @@ -196,7 +196,9 @@ def check_dangling_table(): config_instance = ConfigInitializer.get_instance() dangling_table = config_instance.sparse_embed_config.dangling_table if not dangling_table: - dangling_table = find_dangling_table([table_instance.table_name - for _, table_instance in - config_instance.sparse_embed_config.table_instance_dict.items()]) + table_names = [] + for _, table_instance in config_instance.sparse_embed_config.table_instance_dict.items(): + table_names.append(table_instance.table_name) + dangling_table = find_dangling_table(table_names) + return dangling_table diff --git a/mx_rec/util/cpu.py b/mx_rec/util/cpu.py index 69700262..a7848d7f 100644 --- a/mx_rec/util/cpu.py +++ b/mx_rec/util/cpu.py @@ -3,7 +3,6 @@ # Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. import ctypes -from ctypes import * import psutil from mx_rec.util.log import logger diff --git a/src/AccCTR/src/unique/unique_func.cpp b/src/AccCTR/src/unique/unique_func.cpp index 64ad6d52..717d8890 100644 --- a/src/AccCTR/src/unique/unique_func.cpp +++ b/src/AccCTR/src/unique/unique_func.cpp @@ -119,10 +119,11 @@ void Dedup::NewParameter() // Time to check the proper size of sharded tables for performance // sake. 
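// Worked example for the guard below (illustrative numbers only, not taken from the
// code): with n = 8, groupCount_ = 48 and newBucketCountPowerOf2 = 1 << 20, the quotient
// std::numeric_limits<uint64_t>::max() / n / groupCount_ is roughly 4.8e16, far above
// 1 << 20, so the normal product (1 << 20) * 8 * 48 = 402653184 is used. Dividing the
// limit first is what keeps the comparison itself free of uint64_t wrap-around, which a
// direct multiply-then-compare could silently hit.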
uint64_t shardedTableSize = 0; - if (std::numeric_limits::max() / n / groupCount_ < newBucketCountPowerOf2) { - shardedTableSize = std::numeric_limits::max(); + if (std::numeric_limits::max() / static_cast(n) / static_cast(groupCount_) + < newBucketCountPowerOf2) { + shardedTableSize = static_cast(std::numeric_limits::max()); } else { - shardedTableSize = newBucketCountPowerOf2 * n * groupCount_; + shardedTableSize = newBucketCountPowerOf2 * n * static_cast(groupCount_); } int largeCount = 0; diff --git a/src/AccCTR/src/unique/unique_func.h b/src/AccCTR/src/unique/unique_func.h index 39e5a6b3..46718bde 100644 --- a/src/AccCTR/src/unique/unique_func.h +++ b/src/AccCTR/src/unique/unique_func.h @@ -171,7 +171,7 @@ public: if (idCountEnable_) { idCount[total] = bucket->idCount[j]; } - out[total++] = bucket->data[j]; + out[total++] = static_cast::type>(bucket->data[j]); } replaceOffset += bucket->count; } @@ -179,7 +179,7 @@ public: int32_t totalOverflow = 0; while (it != overflow_.end()) { if (idCountEnable_) { - idCount[total] = idCountOverflow_[it->first]; + idCount[total] = static_cast(idCountOverflow_[it->first]); } out[total++] = it->first; it->second = replaceOffset++; @@ -189,7 +189,7 @@ public: // set total overflow count stats_.totalUniques = static_cast(total - priorTotal); - stats_.totalOverflowUniques = totalOverflow; + stats_.totalOverflowUniques = static_cast(totalOverflow); return total - priorTotal; } @@ -244,18 +244,20 @@ public: { const int numOfGroupsInShard = groupMethod_.GroupCount(); uint32_t totalSize = conf.desiredSize + (conf.desiredSize >> 1); - while (bucketCountPower2_ * K_BUCKET_WIDTH * numOfGroupsInShard * estimatedDuplicateRatio < totalSize) { + while (bucketCountPower2_ * static_cast(K_BUCKET_WIDTH) * + static_cast(numOfGroupsInShard) * static_cast(estimatedDuplicateRatio) < totalSize) { bucketCountPower2_ <<= 1; } idCountEnable_ = (conf.outputType == OutputType::ENHANCED) && conf.useIdCount; - for (int32_t i = 0; i < numOfGroupsInShard; ++i) { - auto obj = new DedupT(bucketCountPower2_, numOfGroupsInShard, idCountEnable_); - if (obj == nullptr) { - ExternalLogger::PrintLog(LogLevel::ERROR, "creat object error"); - throw NullptrError(); + try { + for (int32_t i = 0; i < numOfGroupsInShard; ++i) { + auto obj = new DedupT(bucketCountPower2_, numOfGroupsInShard, idCountEnable_); + dedupShards_.emplace_back(obj); } - dedupShards_.emplace_back(obj); + } catch (const std::bad_alloc& e) { + ExternalLogger::PrintLog(LogLevel::ERROR, "Memory allocation failed during loop: " + std::string(e.what())); + throw; } } @@ -302,7 +304,7 @@ public: if (conf.outputType == OutputType::ENHANCED) { int totalNumber = 0; for (int i = 0; i < conf.shardingNum; i++) { - totalUniqueSize[i] = totalNumber; + totalUniqueSize[i] = static_cast(totalNumber); if (conf.useSharding) { totalNumber += uniqueOut.uniqueIdCntInBucket[i]; } @@ -365,14 +367,14 @@ private: if (conf.useSharding && conf.useIdCount) { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueIdInBucket, total, uniqueOut.idCnt); // 特征计数使能和shard同时使能 - uniqueOut.uniqueIdCntInBucket[j] = inGroupTotal; + uniqueOut.uniqueIdCntInBucket[j] = static_cast(inGroupTotal); } else if (!conf.useSharding && conf.useIdCount) { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueId, total, uniqueOut.idCnt); // 特征计数使能和shard不使能 } else if (conf.useSharding && !conf.useIdCount) { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueIdInBucket, total, nullptr); // 特征计数使能和shard不使能 - uniqueOut.uniqueIdCntInBucket[j] = inGroupTotal; + 
uniqueOut.uniqueIdCntInBucket[j] = static_cast(inGroupTotal); } else { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueId, total, nullptr); // 特征计数不使能和shard不使能,跟普通unique对等 @@ -380,7 +382,7 @@ private: } else { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueId, total, nullptr); } - total += inGroupTotal; + total += static_cast(inGroupTotal); } uniqueOut.uniqueIdCnt = total; } @@ -523,8 +525,8 @@ private: uint32_t *beginPtr = uniqueOut.index; uint32_t *finishPtr = beginPtr + uniqueIn.inputIdCnt; uint32_t *partBeginPtr = beginPtr; - auto *partEndPtr = - reinterpret_cast(CacheLineAlign(reinterpret_cast(partBeginPtr + partSize))); + auto alignedAddress = CacheLineAlign(reinterpret_cast(partBeginPtr + partSize)); + auto *partEndPtr = reinterpret_cast(alignedAddress); std::vector> tasks; auto val = TypeTrans(uniqueIn.inputId); while (partBeginPtr < finishPtr) { diff --git a/src/core/file_system/local_file_system/local_file_system.h b/src/core/file_system/local_file_system/local_file_system.h index d137f158..f8eefd5b 100644 --- a/src/core/file_system/local_file_system/local_file_system.h +++ b/src/core/file_system/local_file_system/local_file_system.h @@ -46,8 +46,6 @@ namespace MxRec { void WriterFn(BufferQueue& queue, int fd, ssize_t& writerBytesNum); void FillToBuffer(BufferQueue& queue, const char* data, size_t dataSize); void CalculateMapSize(off_t fileSize, size_t& mapByteSize, size_t& mapRowNum, size_t onceReadByteSize) const; - void HandleMappedData(char* mappedData, size_t mapRowNum, size_t onceReadByteSize, - vector>& dst, size_t cnt) const; private: const mode_t dirMode; diff --git a/src/core/hd_transfer/hd_transfer.cpp b/src/core/hd_transfer/hd_transfer.cpp index 7bd083ab..a32ddf28 100644 --- a/src/core/hd_transfer/hd_transfer.cpp +++ b/src/core/hd_transfer/hd_transfer.cpp @@ -101,9 +101,9 @@ void HDTransfer::CreateChannel(const uint32_t localRankId, const string& embName TransferChannel2Str(channel) == "uniquekeys" || TransferChannel2Str(channel) == "evict" /* for noDDR */ ) { - transferChannels[sendName] = tdtCreateChannel(localRankId, sendName.c_str(), channelSize); + transferChannels[sendName] = TDT_CREATE_CHANNEL(localRankId, sendName.c_str(), channelSize); } else { - transferChannels[sendName] = tdtCreateChannel(localRankId, sendName.c_str(), PING_PONG_SIZE); + transferChannels[sendName] = TDT_CREATE_CHANNEL(localRankId, sendName.c_str(), PING_PONG_SIZE); } LOG_INFO("create channel:{} {}", sendName, static_cast(transferChannels[sendName])); } diff --git a/src/core/hd_transfer/hd_transfer.h b/src/core/hd_transfer/hd_transfer.h index 0ff29e1b..f9528578 100644 --- a/src/core/hd_transfer/hd_transfer.h +++ b/src/core/hd_transfer/hd_transfer.h @@ -24,8 +24,8 @@ See the License for the specific language governing permissions and #include "utils/common.h" #include "utils/config.h" -#ifndef tdtCreateChannel -#define tdtCreateChannel acltdtCreateChannelWithCapacity +#ifndef TDT_CREATE_CHANNEL +#define TDT_CREATE_CHANNEL acltdtCreateChannelWithCapacity #endif namespace MxRec { diff --git a/src/core/initializer/random_normal_initializer/random_normal_initializer.cpp b/src/core/initializer/random_normal_initializer/random_normal_initializer.cpp index 1ea0084f..addc4647 100644 --- a/src/core/initializer/random_normal_initializer/random_normal_initializer.cpp +++ b/src/core/initializer/random_normal_initializer/random_normal_initializer.cpp @@ -20,11 +20,10 @@ See the License for the specific language governing permissions and using namespace MxRec; 
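// Note on the member-initializer-list change below (general C++ rule, stated here for
// clarity): members are constructed in the order they are declared in the header, not
// in the order they appear in the initializer list, so generator(seed) and
// distribution(mean, stddev) only read members (seed, mean, stddev) that are declared,
// and therefore constructed, before them.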
RandomNormalInitializer::RandomNormalInitializer(int start, int len, NormalInitializerInfo& initInfo) - : start(start), len(len), mean(initInfo.mean), stddev(initInfo.stddev), seed(initInfo.seed) + : start(start), len(len), mean(initInfo.mean), stddev(initInfo.stddev), seed(initInfo.seed), + initParam(initInfo.initK), generator(std::default_random_engine(seed)), + distribution(std::normal_distribution(mean, stddev)) { - initParam = initInfo.initK; - generator = std::default_random_engine(seed); - distribution = std::normal_distribution(mean, stddev); } void RandomNormalInitializer::GenerateData(float* const emb, const int embSize) diff --git a/src/core/initializer/random_normal_initializer/random_normal_initializer.h b/src/core/initializer/random_normal_initializer/random_normal_initializer.h index 9d5f9942..e342f75f 100644 --- a/src/core/initializer/random_normal_initializer/random_normal_initializer.h +++ b/src/core/initializer/random_normal_initializer/random_normal_initializer.h @@ -37,6 +37,7 @@ namespace MxRec { float mean; float stddev; int seed; + float initParam; std::default_random_engine generator; std::normal_distribution distribution; diff --git a/src/core/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp b/src/core/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp index d50a7a97..e011cfc7 100644 --- a/src/core/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp +++ b/src/core/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp @@ -20,7 +20,8 @@ See the License for the specific language governing permissions and using namespace MxRec; TruncatedNormalInitializer::TruncatedNormalInitializer(int start, int len, NormalInitializerInfo& initInfo) - : start(start), len(len), seed(initInfo.seed) + : start(start), len(len), seed(initInfo.seed), generator(std::default_random_engine(initInfo.seed)), + distribution(std::normal_distribution(initInfo.mean, initInfo.stddev)) { initParam = initInfo.initK; // 校验stddev mean值范围 @@ -43,7 +44,6 @@ TruncatedNormalInitializer::TruncatedNormalInitializer(int start, int len, Norma stddev = initInfo.stddev; } - generator = std::default_random_engine(seed); distribution = std::normal_distribution(mean, stddev); minBound = initParam * (mean - static_cast(boundNum) * stddev); maxBound = initParam * (mean + static_cast(boundNum) * stddev); diff --git a/src/core/utils/common.h b/src/core/utils/common.h index f833b759..b761a1ef 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -269,15 +269,11 @@ namespace MxRec { }; struct EmbeddingSizeInfo { + size_t embeddingSize = 0; + size_t extendEmbSize = 0; EmbeddingSizeInfo() = default; EmbeddingSizeInfo(size_t embSize, size_t extendSize) - { - embeddingSize = embSize; - extendEmbSize = extendSize; - } - - size_t embeddingSize; - size_t extendEmbSize; + : embeddingSize(embSize), extendEmbSize(extendSize) {} }; struct OptimizerInfo { @@ -417,6 +413,12 @@ namespace MxRec { } struct EmbInfoParams { + std::string name; + int sendCount; + int embeddingSize; + int extEmbeddingSize; + bool isSave; + bool isGrad; EmbInfoParams() = default; EmbInfoParams(const std::string& name, @@ -433,12 +435,6 @@ namespace MxRec { isGrad(isGrad) { } - std::string name; - int sendCount; - int embeddingSize; - int extEmbeddingSize; - bool isSave; - bool isGrad; }; struct EmbInfo { diff --git a/src/dataset_tf/eos_dataset_op.cc b/src/dataset_tf/eos_dataset_op.cc index 85b8e1d0..afc3fe3a 100644 --- a/src/dataset_tf/eos_dataset_op.cc 
+++ b/src/dataset_tf/eos_dataset_op.cc @@ -74,15 +74,15 @@ int CheckCommFinished(MPI_Request& req, int channelId) // Defines the immutable dataset; this class's MakeIterator() method tells TensorFlow how to create an iterator object over the dataset. class EosDatasetOp::Dataset : public DatasetBase { public: - explicit Dataset(OpKernelContext *ctx, const DatasetBase *input, int32_t channelId, int32_t maxTrainSteps, - int32_t maxEvalSteps) + explicit Dataset(OpKernelContext *ctx, const DatasetBase *input, int32_t channelId, + int32_t maxTrainSteps, + int32_t maxEvalSteps) : DatasetBase(DatasetContext(ctx)), input_(input), channelId_(channelId), maxTrainSteps_(maxTrainSteps), maxEvalSteps_(maxEvalSteps), - id_(g_datasetId[channelId]) - { + id_(g_datasetId[channelId]) { input_->Ref(); auto os_input = input->output_shapes(); output_shapes_ = os_input; @@ -93,12 +93,13 @@ public: MPI_Comm_size(g_comm[channelId], &g_rankSize); LOG_DEBUG("EosDataset: {} was born for channel: {}, maxTrainSteps: {}, maxEvalSteps: {}.", - g_datasetId[channelId], channelId, maxTrainSteps, maxEvalSteps); + g_datasetId[channelId], channelId, maxTrainSteps, maxEvalSteps); g_datasetId[channelId] += 1; } - Dataset(const Dataset&) = delete; - Dataset& operator=(const Dataset&) = delete; + Dataset(const Dataset &) = delete; + + Dataset &operator=(const Dataset &) = delete; ~Dataset() override { @@ -147,8 +148,10 @@ public: } protected: - Status AsGraphDefInternal(SerializationContext *ctx, DatasetGraphDefBuilder *b, Node **output) const override - { + Status + AsGraphDefInternal(SerializationContext *ctx, DatasetGraphDefBuilder *b, + Node **output) const override + { Node *input_graph = nullptr; TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph)); Node *channel_id_x = nullptr; @@ -158,7 +161,8 @@ protected: Node *max_eval_steps_x = nullptr; TF_RETURN_IF_ERROR(b->AddScalar(maxEvalSteps_, &max_eval_steps_x)); TF_RETURN_IF_ERROR( - b->AddDataset(this, { input_graph, channel_id_x, max_train_steps_x, max_eval_steps_x }, output)); + b->AddDataset(this, {input_graph, channel_id_x, max_train_steps_x, max_eval_steps_x}, + output)); return Status::OK(); } @@ -166,20 +170,27 @@ private: // Represents the mutable iterator state over a specific dataset; this class's GetNextInternal() method tells TensorFlow how to fetch the iterator's next element. class Iterator : public DatasetIterator { public: - explicit Iterator(const Params &params) : DatasetIterator(params), i_(0), iter_times_(0) {} + explicit Iterator(const Params &params) : DatasetIterator(params), i_(0), + iter_times_(0) {} + #if defined(TF_VERSION_TF2) Status Initialize(IteratorContext* ctx) override { return dataset()->input_->MakeIterator(ctx, this, prefix(), &input_impl_); } #else + Status Initialize(IteratorContext *ctx) override { return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); } + #endif - Status GetNextInternal(IteratorContext *ctx, std::vector *out_tensors, bool *end_of_sequence) override - { + + Status + GetNextInternal(IteratorContext *ctx, std::vector *out_tensors, + bool *end_of_sequence) override + { mutex_lock l(mu_); if (!input_impl_) { *end_of_sequence = true; @@ -202,12 +213,14 @@ private: getNextStatus = GET_NEXT_TERMINATE; MPI_Request req; - MPI_Iallreduce(MPI_IN_PLACE, &getNextStatus, 1, MPI_INT, MPI_SUM, g_comm[channelId], &req); + MPI_Iallreduce(MPI_IN_PLACE, &getNextStatus, 1, MPI_INT, MPI_SUM, g_comm[channelId], + &req); CheckCommFinished(req, channelId); keyProcess->SetEos(1, dataset()->channelId_); - LOG_DEBUG("[ACTIVE] GetNext eos was triggered actively, channel: {}, iter: {}", dataset()->channelId_, - iter_times_); + LOG_DEBUG("[ACTIVE] GetNext eos was triggered actively, 
channel: {}, iter: {}", + dataset()->channelId_, + iter_times_); input_impl_.reset(); return Status::OK(); @@ -220,7 +233,8 @@ private: if (getNextStatus < g_rankSize) { *end_of_sequence = true; keyProcess->SetEos(1, dataset()->channelId_); - LOG_DEBUG("[PASSIVE] GetNext eos was triggered passively, channel: {}, iter: {}, sum: {}", + LOG_DEBUG( + "[PASSIVE] GetNext eos was triggered passively, channel: {}, iter: {}, sum: {}", dataset()->channelId_, iter_times_, getNextStatus); input_impl_.reset(); @@ -232,11 +246,12 @@ private: } protected: - std::shared_ptr CreateNode( - IteratorContext* ctx, model::Node::Args args) const override - { - return model::MakeKnownRatioNode(std::move(args), /* ratio= */ 1); + std::shared_ptr CreateNode( + IteratorContext *ctx, model::Node::Args args) const override + { + return model::MakeKnownRatioNode(std::move(args), 1); // ratio = 1 } + #if defined(TF_VERSION_TF2) Status SaveInternal(SerializationContext* ctx, IteratorStateWriter* writer) override { @@ -244,15 +259,18 @@ private: return Status::OK(); } #else - Status SaveInternal(IteratorStateWriter* writer) override + + Status SaveInternal(IteratorStateWriter *writer) override { TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); return Status::OK(); } + #endif - Status RestoreInternal(IteratorContext* ctx, - IteratorStateReader* reader) override - { + + Status RestoreInternal(IteratorContext *ctx, + IteratorStateReader *reader) override + { mutex_lock l(mu_); TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); return Status::OK(); @@ -261,11 +279,14 @@ private: private: static constexpr int GET_NEXT_CONTINUE = 1; static constexpr int GET_NEXT_TERMINATE = 0; - + tensorflow::mutex mu_; - int64 i_ GUARDED_BY(mu_); - int64 iter_times_ GUARDED_BY(mu_); - std::unique_ptr input_impl_ GUARDED_BY(mu_); + int64 i_ + GUARDED_BY(mu_); + int64 iter_times_ + GUARDED_BY(mu_); + std::unique_ptr input_impl_ + GUARDED_BY(mu_); }; const DatasetBase *input_; -- Gitee From 97cd35bf918697852cc5991f03650b470c22e9cb Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Mon, 29 Apr 2024 06:12:30 +0000 Subject: [PATCH 071/302] =?UTF-8?q?!92=20=E3=80=90=E5=86=92=E7=83=9F?= =?UTF-8?q?=E5=B7=B2=E8=BF=87=E3=80=91=E5=88=87=E5=9B=BE=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=E5=A2=9E=E5=BC=BA=20*=20Slicer=E5=8A=9F=E8=83=BD=E5=A2=9E?= =?UTF-8?q?=E5=BC=BA=EF=BC=8C=E5=85=BC=E5=AE=B9TF1=E3=80=81TF2=EF=BC=8C?= =?UTF-8?q?=E6=94=AF=E6=8C=81Summary=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 13 - mx_rec/core/embedding.py | 4 +- mx_rec/graph/__init__.py | 9 +- mx_rec/graph/acg_push_ops.py | 641 ----------------- mx_rec/graph/constants.py | 37 + mx_rec/graph/graph_typing.py | 35 - mx_rec/graph/modifier.py | 39 +- mx_rec/graph/slicers.py | 879 ++++++++++++++++++++++++ mx_rec/graph/utils.py | 36 +- tests/mx_rec/graph/test_acg_push_ops.py | 514 -------------- tests/mx_rec/graph/test_modifier.py | 2 +- 11 files changed, 975 insertions(+), 1234 deletions(-) delete mode 100644 mx_rec/graph/acg_push_ops.py create mode 100644 mx_rec/graph/constants.py delete mode 100644 mx_rec/graph/graph_typing.py create mode 100644 mx_rec/graph/slicers.py delete mode 100644 tests/mx_rec/graph/test_acg_push_ops.py diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index 2c2cd2fe..a5f055ab 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -212,16 +212,3 @@ class TFDevice(Enum): class Flag(Enum): TRUE = "1" FALSE = "0" - - -class 
AnchorDatasetOp(Enum): - MODEL_DATASET = "ModelDataset" - OPTIMIZE_DATASET = "OptimizeDataset" - PREFETCH_DATASET = "PrefetchDataset" - - -class AnchorIteratorOp(Enum): - ITERATOR_GET_NEXT = "IteratorGetNext" - MAKE_ITERATOR = "MakeIterator" - ONE_SHOT_ITERATOR = "OneShotIterator" - diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index b38c486b..f90efcf6 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -26,8 +26,8 @@ from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.core.emb.emb_factory import HBMDynamicSparseEmbeddingFactory, HBMSparseEmbeddingFactory, \ ExternalStorageSparseEmbeddingFactory -from mx_rec.graph.utils import tag_orphan_ids from mx_rec.constants.constants import MAX_INT32, All2allGradientsOp, MAX_VOCABULARY_SIZE, MAX_DEVICE_VOCABULARY_SIZE +from mx_rec.graph.utils import mark_orphan_lookup_key from mx_rec.util.initialize import ConfigInitializer from mx_rec.validator.validator import ClassValidator, StringValidator, SSDFeatureValidator, \ para_checker_decorator, IntValidator, NumValidator, OptionValidator, OptionalIntValidator, \ @@ -172,7 +172,7 @@ def sparse_lookup(hashtable: BaseSparseEmbedding, # Orphan ids whose upward search finds no IteratorGetNext need to be tagged, so the subsequent ACGPushOpsToDataset step can work if isinstance(ids, tf.Tensor): - ids = tag_orphan_ids(ids) + ids = mark_orphan_lookup_key(ids) with tf.compat.v1.variable_scope("{0}//{1}".format(hashtable.table_name, kwargs.get("name"))): if isinstance(ids, FeatureSpec): diff --git a/mx_rec/graph/__init__.py b/mx_rec/graph/__init__.py index f4d2642c..b91d2a49 100644 --- a/mx_rec/graph/__init__.py +++ b/mx_rec/graph/__init__.py @@ -15,8 +15,13 @@ # limitations under the License. # ============================================================================== -__all__ = ["modify_graph_and_start_emb_cache", "GraphModifierHook", "run", "ACGPushOpsToDatasetHook"] +__all__ = [ + "modify_graph_and_start_emb_cache", + "GraphModifierHook", + "run", + "LookupSubgraphSlicerHook", + "OrphanLookupKeySlicerHook", +] from mx_rec.graph.modifier import GraphModifierHook, modify_graph_and_start_emb_cache from mx_rec.graph.patch import run -from mx_rec.graph.acg_push_ops import ACGPushOpsToDatasetHook diff --git a/mx_rec/graph/acg_push_ops.py b/mx_rec/graph/acg_push_ops.py deleted file mode 100644 index ed3e18e6..00000000 --- a/mx_rec/graph/acg_push_ops.py +++ /dev/null @@ -1,641 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
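The module removed below implemented ACGPushOpsToDatasetHook, a tf.estimator.SessionRunHook that rewrote the graph during begin() and later ran the iterator initializers it had stashed in a graph collection once the session existed; the new slicer hooks keep the same pattern. A minimal sketch of that hook pattern, with an illustrative collection key (not the project's real key):

import tensorflow as tf

_NEW_INITIALIZER_COLLECTION = "EXAMPLE_NEW_INITIALIZER"  # hypothetical key


class CollectedInitializerHook(tf.estimator.SessionRunHook):
    """Runs initializer ops that a graph rewrite registered in a collection."""

    def begin(self):
        # A real hook rewrites the graph here; any iterator it rebuilds
        # registers its initializer, e.g.:
        #   tf.compat.v1.add_to_collection(_NEW_INITIALIZER_COLLECTION,
        #                                  iterator.initializer)
        pass

    def after_create_session(self, session, coord):
        # The session exists now, so the collected initializers can be run.
        for init_op in tf.compat.v1.get_collection(_NEW_INITIALIZER_COLLECTION):
            session.run(init_op)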
-# ============================================================================== - -from typing import Dict, Tuple, List, Set - -import tensorflow as tf -from tensorflow.python.data.ops.dataset_ops import DatasetV1Adapter -from tensorflow.python.framework.ops import Operation -from tensorflow.python.util import nest as tf_nest -from tensorflow.core.framework import node_def_pb2 -from tensorflow.core.framework import attr_value_pb2 -from tensorflow.python.framework import tensor_util - -from mx_rec.graph import modifier -from mx_rec.util.log import logger -from mx_rec.graph.utils import export_pb_graph -from mx_rec.graph.graph_typing import SubgraphInfo -from mx_rec.constants.constants import ASCEND_TIMESTAMP, ANCHOR_DATASET_NAME, MAX_WHILE_SIZE, AnchorIteratorOp -from mx_rec.validator.validator import para_checker_decorator, ClassValidator - -tf.compat.v1.disable_eager_execution() - -_ACG_NEW_NODE_PREFIX = "ACG_" -_ACG_NEW_ITERATOR = "ACG_NEW_ITERATOR" -_ACG_NEW_INITIALIZER = "ACG_NEW_INITIALIZER" - -_OP_TYPE_TO_PUSH = frozenset(["StringSplit", "StringToNumber"]) -_OP_TYPE_TO_IGNORE = frozenset([AnchorIteratorOp.ITERATOR_GET_NEXT]) -_OP_TYPE_CONTAIN_STRING_TO_IGNORE = frozenset(["Dataset", "Summary"]) -_OP_NAME_CONTAIN_STRING_TO_IGNORE = frozenset(["save", "report_", "loss"]) -_OP_NAME_CONTAIN_STRING_TO_PUSH = frozenset(["ACG_PUSH_NODE"]) - -_TENSOR_TYPE_TO_IGNORE = frozenset([tf.variant, tf.resource]) - -_VARIABLE_TYPES = frozenset(["Variable", "VariableV2", "VarHandleOp"]) -_IGNORE_REPLACE_NODE = frozenset(["Assign", "SaveV2"]) - - -class ACGPushOpsToDatasetHook(tf.estimator.SessionRunHook): - @para_checker_decorator( - check_option_list=[ - ("dump_graph", ClassValidator, {"classes": (bool,)}), - ] - ) - def __init__(self, dump_graph: bool = False) -> None: - super().__init__() - self._dump_graph = dump_graph - - modifier.get_src_dataset = _patched_get_src_dataset - logger.info("[ACGPushOpsToDatasetHook] The function `get_src_dataset` of modifier has been replaced!") - - def begin(self): - logger.info("[ACGPushOpsToDataset] Trigger at beginning!") - graph = tf.compat.v1.get_default_graph() - _find_ops_to_be_pushed(graph=graph, dump_graph=self._dump_graph) - - def after_create_session(self, session, coord): - logger.info("[ACGPushOpsToDatasetHook] Trigger after create session!") - initializers = tf.compat.v1.get_collection(_ACG_NEW_INITIALIZER) - logger.info("[ACGPushOpsToDatasetHook] Got new initialzers: %s.", initializers) - session.run(initializers) - - def end(self, session): - logger.info("[ACGPushOpsToDatasetHook] Trigger in the end!") - - -def _find_ops_to_be_pushed(graph: tf.Graph, dump_graph: bool = False): - export_pb_graph("before_push_graph.pbtxt", dump_graph, graph_def=graph.as_graph_def()) - op_nodes = graph.get_operations() - nodes_to_push = set() - - for op_node in op_nodes: - if op_node.type in _OP_TYPE_TO_IGNORE: - continue - - pushable = False - if op_node.type in _OP_TYPE_TO_PUSH: - pushable = True - - for ignore_type in _OP_TYPE_CONTAIN_STRING_TO_IGNORE: - if ignore_type in op_node.type: - pushable = False - if not pushable: - continue - for ignore_name in _OP_NAME_CONTAIN_STRING_TO_IGNORE: - if ignore_name in op_node.name: - pushable = False - if not pushable: - continue - for each_tensor in list(op_node.outputs) + list(op_node.inputs): - if each_tensor.dtype in _TENSOR_TYPE_TO_IGNORE: - pushable = False - if not pushable: - continue - - for push_name in _OP_NAME_CONTAIN_STRING_TO_PUSH: - if push_name in op_node.name: - pushable = True - break - - if pushable: - 
nodes_to_push.add(op_node) - - if not nodes_to_push: - logger.info("No target op has to be pushed to dataset map func!") - return - - logger.info("Found operations should be pushed: %s.", nodes_to_push) - subgraph_nodes = _find_subgraph_nodes( - graph, nodes_to_push, tgt_op_type=AnchorIteratorOp.ITERATOR_GET_NEXT.value, exclude_tgt_op=True - ) - _push_subgraph_to_dataset(graph, subgraph_nodes, dump_graph) - export_pb_graph("after_push_graph.pbtxt", dump_graph, graph_def=graph.as_graph_def()) - - -def _find_subgraph_nodes( - graph: tf.Graph, - base_nodes: Set[tf.Operation], - tgt_op_type: str, - exclude_tgt_op: bool = True, -) -> Set[tf.Operation]: - subgraph_nodes = set() - visited_nodes = base_nodes - found_nodes = base_nodes - all_nodes = graph.get_operations() - logger.info("Got base_nodes: %s.", base_nodes) - - loop_cnt = 0 - while len(found_nodes) > 0: - loop_cnt += 1 - if loop_cnt > MAX_WHILE_SIZE: - raise RuntimeError(f"In bfs_lookup function, the maximum cycle depth is greater than {MAX_WHILE_SIZE}.") - - base_nodes = set() - for parent_node in found_nodes: - if (not exclude_tgt_op) and parent_node.type == tgt_op_type: - continue - base_nodes.add(parent_node) - found_nodes = set() - for base_node in base_nodes: - tmp_nodes = [x.op for x in base_node.inputs] + base_node.control_inputs - _warn_for_var_scope_nodes(all_nodes, base_node) - - tmp_nodes = set(tmp_nodes) - visited_nodes - if exclude_tgt_op: - tmp_nodes = set(filter(lambda node: node.type != tgt_op_type, tmp_nodes)) - found_nodes.update(tmp_nodes) - visited_nodes.update(tmp_nodes) - - subgraph_nodes.update(visited_nodes) - logger.info("Found subgraph from nodes_to_push: %s.", subgraph_nodes) - return subgraph_nodes - - -def _warn_for_var_scope_nodes(all_nodes: List[tf.Operation], base_node: tf.Operation): - if base_node.type in _VARIABLE_TYPES: - for x in base_node.outputs: - varable_scope_node = [x for x in all_nodes if x.name.startswith(f"{base_node.name}/")] - logger.warning("Got base_node: %s and varable_scope_node: %s.", base_node, varable_scope_node) - - -def _find_op_from_base_op(base_ops: tf.Operation, target_op_type: str) -> tf.Operation: - base_ops = modifier.check_input_list(base_ops, tf.Operation) - parent_ops = base_ops - while True: - for parent_op in parent_ops: - if parent_op.type == target_op_type: - return parent_op - base_ops = parent_ops - parent_ops = [] - for base_op in base_ops: - parent_ops.extend(modifier.find_parent_op(base_op)) - if not parent_ops: - raise ValueError(f"op {target_op_type} was not found.") - - -def _get_dataset_op(graph: tf.Graph, get_next_op: Operation) -> Operation: - if get_next_op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - raise TypeError(f"op '{get_next_op}' must be one instance of IteratorGetNext.") - # looking for the MakeIterator operator which corresponds to given batch_tensor - base_op = modifier.find_make_iterator_op(get_next_op.outputs[0]) - # looking for the op which is the one before OptimizeDataset operator - if tf.__version__.startswith("1"): - optimize_dataset_op = _find_op_from_base_op(base_op, "ModelDataset") - target_op = modifier.find_parent_op(optimize_dataset_op) - if not target_op: - raise RuntimeError("the parent op for 'ModelDataset' op was not found.") - if target_op[0].type != "OptimizeDataset": - raise TypeError("op OptimizeDataset was not found.") - target_op = target_op[0] - else: - # 'OptimizeDataset' is not available in TensorFlow2.X - raise RuntimeError("Not supoprt tf2") - return target_op - - -def 
_ordered_output_from_subgraph(subgraph_out: Dict[tf.Operation, Set[tf.Operation]]) -> List[tf.Tensor]: - addition_funcgraph_output_tensor = [] - for k, v in sorted(subgraph_out.items(), key=lambda x: x[0].name): - k_inputs = set(k.inputs) - for node in v: - _add_sorted_additional_tensors(addition_funcgraph_output_tensor, k_inputs, node) - return addition_funcgraph_output_tensor - - -def _add_sorted_additional_tensors(addition_funcgraph_output_tensor, k_inputs, node): - for each_tensor in sorted(node.outputs, key=lambda x: x.name): - if each_tensor in k_inputs: - addition_funcgraph_output_tensor.append(each_tensor) - - -def _get_tensor_consumers_unsafe(tensor: tf.Tensor) -> List[tf.Operation]: - if isinstance(tensor, tf.Operation): - raise RuntimeError(f"not support type: {node}") - - from tensorflow.python import pywrap_tensorflow as c_api - - consumer_names = c_api.TF_OperationOutputConsumers_wrapper(tensor._as_tf_output()) - graph = tensor.graph - result = [] - for name in consumer_names: - with graph._lock: - if name in graph._nodes_by_name: # ignore deleted node - result.append(graph._nodes_by_name[name]) - - return result - - -def _push_subgraph_to_dataset(graph: tf.Graph, subgraph_to_push: Set[tf.Operation], dump_graph: bool = False): - subgraph_in, subgraph_out = _find_subgraph_in_out(subgraph_to_push) - logger.info("Got input tensor of extracted subgraph: %s", subgraph_in) - logger.info("Got output tensor of extracted subgraph: %s", subgraph_out) - - get_next_node = graph.get_operation_by_name(AnchorIteratorOp.ITERATOR_GET_NEXT.value) - src_dataset = _get_src_dataset(graph, get_next_node) - - def acg_func(*x): # pragma: no cover - old_x = x - logger.debug("Got old batch layout: %s", x) - - x = tf_nest.flatten(x) - for each_tensor in x: - if not isinstance(each_tensor, tf.Tensor): - raise RuntimeError(f"Expected tensor as input of mapfunc. 
but got: {x}!") - - funcgraph = tf.compat.v1.get_default_graph() - subgraph_info = SubgraphInfo(subgraph_in, subgraph_out, subgraph_to_push) - new_batch = _clone_subgraph_into_funcgraph( - funcgraph, - graph, - subgraph_info, - x, - old_x, - ) - - logger.debug("Got new batch layout: %s.", new_batch) - export_pb_graph("map_func_graph.pbtxt", dump_graph, graph_def=funcgraph.as_graph_def()) - return new_batch - - tgt_dataset = src_dataset.map(acg_func) - tgt_dataset = tgt_dataset.prefetch(0) - _update_iterator_getnext( - graph=graph, - get_next_op=get_next_node, - tgt_dataset=tgt_dataset, - subgraph_out=subgraph_out, - subgraph_to_push=subgraph_to_push, - ) - - -def _find_subgraph_in_out( - sub_graph_nodes: Set[tf.Operation], -) -> Tuple[Dict[tf.Operation, Set[tf.Operation]], Dict[tf.Operation, Set[tf.Operation]]]: - relay_input_nodes = set() - relay_output_nodes = set() - input_to_subnodes = dict() - output_to_subnodes = dict() - - for base_node in sub_graph_nodes: - _update_subgraph_in(base_node, input_to_subnodes, relay_input_nodes, sub_graph_nodes) - _update_subgraph_out(base_node, output_to_subnodes, relay_output_nodes, sub_graph_nodes) - - return input_to_subnodes, output_to_subnodes - - -def _update_subgraph_in( - base_node: tf.Operation, - input_to_subnodes: Dict[tf.Operation, Set[tf.Operation]], - relay_input_nodes: Set[tf.Operation], - sub_graph_nodes: Set[tf.Operation], -): - for input_tensor in base_node.inputs: - input_node = input_tensor.op - if input_node not in sub_graph_nodes: - relay_input_nodes.add(input_node) - res = input_to_subnodes.get(input_node, set()) - res.add(base_node) - input_to_subnodes[input_node] = res - - -def _update_subgraph_out( - base_node: tf.Operation, - output_to_subnodes: Dict[tf.Operation, Set[tf.Operation]], - relay_output_nodes: Set[tf.Operation], - sub_graph_nodes: Set[tf.Operation], -): - for output_tensor in base_node.outputs: - for output_consumer in output_tensor.consumers(): - if output_consumer not in sub_graph_nodes: - relay_output_nodes.add(output_consumer) - res = output_to_subnodes.get(output_consumer, set()) - res.add(base_node) - output_to_subnodes[output_consumer] = res - - -def _get_src_dataset(graph: tf.Graph, get_next_op: Operation) -> DatasetV1Adapter: - try: - target_op = _get_dataset_op(graph, get_next_op) - except (ValueError, TypeError, RuntimeError) as err: - logger.warning("The dataset op was not found, the error is %s. Start to traverse the operations.", err) - dataset_op_list = [op for op in graph.get_operations() if ANCHOR_DATASET_NAME in op.name] - if len(dataset_op_list) != 1: - raise RuntimeError( - f"The `{ANCHOR_DATASET_NAME}` was not found from the operations, dataset_op_list: " - f"{dataset_op_list}." 
- ) from err - target_op = dataset_op_list[0] - except Exception as err: - raise RuntimeError(f"The dataset was not found, the error is `{err}`.") from err - if not target_op.outputs: - raise ValueError(f"The length of the outputs of target op `{target_op}` is 0.") - logger.info("Find target op `%s`, and output is `%s`.", target_op.name, target_op.outputs) - src_dataset = modifier.find_target_instance_dataset(target_op.outputs[0]) - return src_dataset - - -def _clone_subgraph_into_funcgraph( - funcgraph: tf.Graph, - defaultgraph: tf.Graph, - subgraph_info: SubgraphInfo, - x: List[tf.Tensor], - old_x: Tuple[Dict[str, tf.Tensor]], -) -> Dict[str, tf.Tensor]: - topo_subgraph_list = _topo_subgraph(subgraph_info.subgraph_to_push) # node - tensor_mapping = {} # subgraph-tensor -> funcgraph-tensor - node_mapping = {} # subgraph-node -> funcgraph-node - for k, v in subgraph_info.subgraph_in.items(): - _get_mapping_for_subgraph_in(k, v, x, tensor_mapping) - for old_node in topo_subgraph_list: - _get_mapping_for_subgraph(funcgraph, defaultgraph, node_mapping, old_node, tensor_mapping) - - logger.info("Got node_mapping: %s", node_mapping) - logger.info("Got tensor_mapping: %s", tensor_mapping) - - ordered_output_subgraph_tensors = _ordered_output_from_subgraph(subgraph_info.subgraph_out) - addition_funcgraph_output_tensor = _get_mapping_tensor(tensor_mapping, ordered_output_subgraph_tensors) - new_funcgraph_output_tensor = list(x) + addition_funcgraph_output_tensor - logger.info("Got new_funcgraph_output_tensor: %s", new_funcgraph_output_tensor) - - new_x = old_x[0] - for tensor in addition_funcgraph_output_tensor: - last_key = f"{sorted(new_x)[-1]}_last_key" - new_x[last_key] = tensor - - return new_x - - -def _get_mapping_for_subgraph_in( - from_node: tf.Operation, to_nodes: Set[tf.Operation], x: List[tf.Tensor], tensor_mapping -): - if from_node.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - raise RuntimeError(f"Expect IteratorGetNext for input tensor of subgraph, but got {from_node}") - for node in to_nodes: - for each_tensor in node.inputs: - if each_tensor.op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - continue - old_tensor_name = each_tensor.name - x_index = int(old_tensor_name.split(":")[-1]) - tensor_mapping[each_tensor] = x[x_index] - - -def _get_mapping_for_subgraph( - funcgraph: tf.Graph, - defaultgraph: tf.Graph, - node_mapping: Dict[tf.Operation, tf.Operation], - old_node: tf.Operation, - tensor_mapping: Dict[tf.Tensor, tf.Tensor], -): - logger.debug("old_node: %s \n old_node_inputs: %s", old_node, [x for x in old_node.inputs]) - node_def = old_node.node_def - for each_tensor in old_node.inputs: - if each_tensor not in tensor_mapping: - raise RuntimeError( - f"each_tensor(input) {each_tensor} need by {old_node.name} not in tensor_mapping.{tensor_mapping}" - ) - new_inputs = _get_mapping_tensor(tensor_mapping, old_node.inputs) - if old_node.type in _VARIABLE_TYPES: - node_def = _frozen_variable_node_to_func_const_node_def( - variable_node=old_node, funcgraph=funcgraph, defaultgraph=defaultgraph - ) - node_def.name = _ACG_NEW_NODE_PREFIX + node_def.name - new_node = tf.Operation(node_def=node_def, g=funcgraph, inputs=new_inputs) - node_mapping[old_node] = new_node - for old_out_tensor, new_out_tensor in zip(old_node.outputs, new_node.outputs): - tensor_mapping[old_out_tensor] = new_out_tensor - - -def _frozen_variable_node_to_func_const_node_def( - variable_node: tf.Operation, funcgraph: tf.Graph, defaultgraph: tf.Graph -) -> node_def_pb2.NodeDef: - def 
create_const_node_def(node_name, dtype, data, data_shape=None): - """Creates a Const op.""" - output_node = node_def_pb2.NodeDef() - output_node.op = "Const" - output_node.name = node_name - output_node.attr["dtype"].CopyFrom(dtype) - output_node.attr["value"].CopyFrom( - attr_value_pb2.AttrValue(tensor=tensor_util.make_tensor_proto(data, dtype=dtype.type, shape=data_shape)) - ) - return output_node - - # NOTE: Variable node type is readonly in funcgraph, all nodes of this type have to be fronzen. - variable_name = variable_node.name - if variable_node.type == "VarHandleOp": - variable_name = f"{variable_name}/Read/ReadVariableOp:0" - else: - variable_name = f"{variable_name}:0" - initializer = defaultgraph.get_operation_by_name(f"{variable_node.name}/Assign") - logger.info(f"VariableV2: {variable_node.name}, initializer: {initializer.name} ") - defaultsession = tf.compat.v1.Session(graph=defaultgraph) - _ = defaultsession.run([initializer]) - logger.info(f"Start run variables data: {variable_name}") - returned_variable_data = defaultsession.run(variable_name) - logger.info(f"Start froze variables: {variable_name} {returned_variable_data}") - new_const_node = create_const_node_def( - variable_node.name, variable_node.node_def.attr["dtype"], returned_variable_data, returned_variable_data.shape - ) - return new_const_node - - -def _get_mapping_tensor(tsr2tsr: Dict[tf.Tensor, tf.Tensor], keys: List[tf.Tensor]) -> List[tf.Tensor]: - tensors = [] - for k in keys: - if k not in tsr2tsr: - raise KeyError(f"Failed to find key tensor: {k} from tensor map: {tsr2tsr}.") - tensors.append(tsr2tsr[k]) - return tensors - - -def _topo_subgraph(subgraph: Set[tf.Operation]) -> List[tf.Operation]: - topo_subgraph_list = [] - topo_subgraph_set = set() - start_nodes = set() - [start_nodes.add(x) for x in subgraph] - logger.info("Got topo_subgraph start nodes: %s", start_nodes) - - def topo_subgraph_dfs(curr_node, output_list, output_set): - if not isinstance(curr_node, tf.Operation): - raise RuntimeError(f"topo_subgraph_dfs input should be node(aka. tf.Operator). {curr_node}") - curr_inputs = curr_node.inputs - logger.debug("Got topo_dfs: %s <- %s", curr_node.name, [x.name for x in curr_inputs]) - current_control_inputs = curr_node.control_inputs - if len(current_control_inputs) > 0: - raise RuntimeError( - f"Control input are not supported: {curr_node.name}, control_inputs: {current_control_inputs}" - ) - if curr_node in output_set: - return - output_set.add(curr_node) - for tensor in curr_inputs: - node = tensor.op - if node.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value and node not in output_set: - topo_subgraph_dfs(node, output_list, output_set) - output_list.append(curr_node) - - [topo_subgraph_dfs(x, topo_subgraph_list, topo_subgraph_set) for x in start_nodes] - if len(topo_subgraph_list) != len(topo_subgraph_set): - raise RuntimeError(f"Got duplicated topo node: {sorted(topo_subgraph_list, key=lambda x: x.name)}.") - logger.info("Got topo_subgraph: %s", topo_subgraph_list) - return topo_subgraph_list - - -def _update_iterator_getnext( - graph: tf.Graph, - get_next_op: Operation, - tgt_dataset: DatasetV1Adapter, - subgraph_out: Dict[tf.Operation, Set[tf.Operation]], - subgraph_to_push: Set[tf.Operation], -): - if not get_next_op.outputs: - raise RuntimeError("there is no tensor in the dataset. 
Please check the dataset and data processing.") - iterator_type = "" - if get_next_op.inputs: - iterator_type = get_next_op.inputs[0].op.type - if iterator_type == "IteratorV2": - iterator_type = modifier.find_make_iterator_op(get_next_op.outputs[0]).type - if iterator_type not in (AnchorIteratorOp.MAKE_ITERATOR.value, AnchorIteratorOp.ONE_SHOT_ITERATOR.value): - raise RuntimeError( - f"Only iterators `MakeIterator` and `OneShotIterator` are supported in `graph modify` mode, " - f"but the current iterator is `{iterator_type}`." - ) - logger.info("The iterator type of dataset is %s.", iterator_type) - if iterator_type == AnchorIteratorOp.MAKE_ITERATOR.value: - new_iterator = tgt_dataset.make_initializable_iterator() - logger.info("Got new_iterator: %s, new_iterator.initializer: %s.", new_iterator, new_iterator.initializer) - graph.add_to_collection(_ACG_NEW_INITIALIZER, new_iterator.initializer) - else: - new_iterator = tgt_dataset.make_one_shot_iterator() - new_batch = new_iterator.get_next(_ACG_NEW_ITERATOR) - if "timestamp" in new_batch.keys(): - tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, new_batch["timestamp"]) - try: - new_batch_tensor = new_batch - while not isinstance(new_batch_tensor, tf.Tensor): - if isinstance(new_batch_tensor, tuple): - new_batch_tensor = new_batch_tensor[0] - elif isinstance(new_batch_tensor, dict): - new_batch_tensor = list(new_batch_tensor.values()) - elif isinstance(new_batch_tensor, list): - new_batch_tensor = new_batch_tensor[0] - elif isinstance(new_batch_tensor, tf.Tensor): - break - else: - raise RuntimeError( - f"Need to support new_batch_tensor{new_batch_tensor}, type: {type(new_batch_tensor)}" - ) - except IndexError as err: - raise IndexError("Cannot find a tensor from given batch.") from err - new_get_next_op = _find_op_from_base_op(new_batch_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value) - logger.info("Got new_get_next_op: %s.", new_get_next_op) - _replace_get_next_op(graph, get_next_op, new_get_next_op, subgraph_out, subgraph_to_push) - - -def _replace_get_next_op( - graph: tf.Graph, - old_get_next_op: tf.Operation, - new_get_next_op: tf.Operation, - subgraph_out: Dict[tf.Operation, Set[tf.Operation]], - subgraph_to_push: Set[tf.Operation], -): - for output_tensor in old_get_next_op.outputs: - _update_old_consumer(graph, new_get_next_op, output_tensor, subgraph_to_push) - - old_get_next_op_output_size = len(old_get_next_op.outputs) - ordered_output_tensor = _ordered_output_from_subgraph(subgraph_out) - - for i, output_tensor in enumerate(ordered_output_tensor): - offset = old_get_next_op_output_size + i - _update_subgraph_out_consumer(graph, new_get_next_op, offset, output_tensor) - - -def _update_old_consumer( - graph: tf.Graph, new_get_next_op: tf.Operation, output_tensor: tf.Tensor, subgraph_to_push: List[tf.Operation] -): - old_tensor_name = output_tensor.name - output_index = old_tensor_name.split(":")[-1] - new_tensor_name = f"{new_get_next_op.name}:{output_index}" - logger.info("Replace old_tensor_name: %s to new_tensor_name: %s", old_tensor_name, new_tensor_name) - new_tensor = graph.get_tensor_by_name(new_tensor_name) - for output_consumer in _get_tensor_consumers_unsafe(output_tensor): - if output_consumer in subgraph_to_push: - logger.info( - "Ignore consumer in old subgraph %s, not let it connect to new IteratorGetNext.", output_consumer - ) - continue - for i, consumer_input in enumerate(output_consumer.inputs): - if consumer_input != output_tensor: - logger.debug("Not replace output_consumer: %s consumer_input: %s.", 
output_consumer, consumer_input) - continue - logger.info( - "Success replace output_consumer: %s type: %s from consumer_input: %s to new_tensor: %s", - output_consumer.name, - output_consumer.type, - consumer_input, - new_tensor, - ) - output_consumer._update_input(i, new_tensor) - - -def _update_subgraph_out_consumer( - graph: tf.Graph, new_get_next_op: tf.Operation, offset: int, output_tensor: tf.Tensor -): - new_tensor_name = f"{new_get_next_op.name}:{offset}" - logger.info("Replace old_tensor_name: %s to new_tensor_name: %s.", output_tensor.name, new_tensor_name) - new_tensor = graph.get_tensor_by_name(new_tensor_name) - for output_consumer in _get_tensor_consumers_unsafe(output_tensor): - if output_consumer.type in _IGNORE_REPLACE_NODE: - logger.info("Ignore replace output_consumer: %s, it's of type: %s.", output_consumer, output_consumer.type) - continue - for j, consumer_input in enumerate(output_consumer.inputs): - if consumer_input != output_tensor: - logger.debug("Not replace output_consumer: %s consumer_input: %s.", output_consumer, consumer_input) - continue - logger.info( - "Success replace output_consumer: %s type: %s from consumer_input: %s to new_tensor: %s", - output_consumer.name, - output_consumer.type, - consumer_input, - new_tensor, - ) - output_consumer._update_input(j, new_tensor) - - -def _patched_get_src_dataset(get_next_op: Operation, is_training: bool) -> DatasetV1Adapter: - try: - target_op = modifier.get_dataset_op(get_next_op) - except (ValueError, TypeError, RuntimeError) as err: - logger.debug("In `OneShotIterator` mode, find `PrefetchDataset` from all ops in graph.") - graph = tf.compat.v1.get_default_graph() - dataset_op_list = [op for op in graph.get_operations() if ANCHOR_DATASET_NAME in op.name] - dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) - logger.debug("Got sorted dataset_op_list: %s.", dataset_op_list) - if len(dataset_op_list) != 2: - raise RuntimeError( - f"Expect two `PrefetchDataset` ops in dataset_op_list, but got: {dataset_op_list}." - ) from err - target_op = dataset_op_list[1] - except Exception as err: - raise RuntimeError(f"The source dataset can't be found, got error: {err}.") from err - - if not target_op.outputs: - raise ValueError(f"The length of the outputs of target op `{target_op}` is 0.") - - logger.debug("Find target dataset op: %s, and output is %s.", target_op, target_op.outputs) - src_dataset = modifier.find_target_instance_dataset(target_op.outputs[0]) - - return src_dataset diff --git a/mx_rec/graph/constants.py b/mx_rec/graph/constants.py new file mode 100644 index 00000000..077405d6 --- /dev/null +++ b/mx_rec/graph/constants.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + + +from enum import Enum + + +class DeprecatedOp(Enum): + DEPRECATED_ITERATOR_GET_NEXT = "DEPRECATED_ITERATOR_GET_NEXT" + DEPRECATED_PREFETCH_DATASET = "DEPRECATED_PREFETCH_DATASET" + + +class AnchorDatasetOp(Enum): + MODEL_DATASET = "ModelDataset" + OPTIMIZE_DATASET = "OptimizeDataset" + PREFETCH_DATASET = "PrefetchDataset" + + +class AnchorIteratorOp(Enum): + ITERATOR_GET_NEXT = "IteratorGetNext" + ITERATOR_V2 = "IteratorV2" + MAKE_ITERATOR = "MakeIterator" + ONE_SHOT_ITERATOR = "OneShotIterator" diff --git a/mx_rec/graph/graph_typing.py b/mx_rec/graph/graph_typing.py deleted file mode 100644 index c11bd4c0..00000000 --- a/mx_rec/graph/graph_typing.py +++ /dev/null @@ -1,35 +0,0 @@ -# !/usr/bin/env python3 -# -- coding: utf-8 -- -# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved. - -import dataclasses -from typing import Dict, DefaultDict, List, Tuple, Set - -from tensorflow import Operation, Tensor -from tensorflow.core.framework.graph_pb2 import GraphDef - - -# DefaultDict: -# Key: Tensor => Represent output tensor of `IteratorGetNext` operation. -# Val: List[Tuple[int, Operation]] => Contains target operation of output tensor and it's corresponding index. -ReplacementSpec = DefaultDict[Tensor, List[Tuple[int, Operation]]] - - -@dataclasses.dataclass -class AnchorRecord: - replacement_spec: ReplacementSpec - passing_tensors: List[Tensor] - batch_tensor_indexs: List[int] - sub_cutting_points: List[Tensor] - sub_graph_def: GraphDef - input_names: List[str] - output_names: List[str] - is_training: bool - input_indexs: List[int] = None - - -@dataclasses.dataclass -class SubgraphInfo: - subgraph_in: Dict[Operation, Set[Operation]] - subgraph_out: Dict[Operation, Set[Operation]] - subgraph_to_push: Set[Operation] diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 8338e870..e0b4bdeb 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -15,9 +15,10 @@ # limitations under the License. 
# ============================================================================== +import dataclasses from collections import defaultdict from collections.abc import Callable -from typing import Any, List, Dict, Tuple +from typing import Any, List, Dict, Tuple, DefaultDict import tensorflow as tf from tensorflow import Operation, Tensor @@ -26,16 +27,15 @@ from tensorflow.python.data.ops.dataset_ops import DatasetV1Adapter from tensorflow.python.framework.errors_impl import InvalidArgumentError from mx_rec.constants.constants import ASCEND_CUTTING_POINT_INITIALIZER, ASCEND_SPARSE_LOOKUP_ENTRANCE, \ - ASCAnchorAttr, ASCEND_TIMESTAMP, MAX_WHILE_SIZE, LIBREC_EOS_OPS_SO, AnchorDatasetOp, \ - AnchorIteratorOp + ASCAnchorAttr, ASCEND_TIMESTAMP, MAX_WHILE_SIZE, LIBREC_EOS_OPS_SO from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.helper import get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.graph.merge_lookup import do_merge_lookup -from mx_rec.graph.utils import check_input_list, find_parent_op, check_cutting_points, record_ops_to_replace, \ - export_pb_graph, make_sorted_key_to_tensor_list -from mx_rec.graph.graph_typing import AnchorRecord, ReplacementSpec +from mx_rec.graph.utils import check_input_list, find_parent_op, check_cutting_points, \ +record_ops_to_replace, export_pb_graph, make_sorted_key_to_tensor_list +from mx_rec.graph.constants import DeprecatedOp, AnchorDatasetOp, AnchorIteratorOp from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger from mx_rec.util.ops import import_host_pipeline_ops @@ -43,6 +43,19 @@ from mx_rec.util.perf import performance from mx_rec.validator.validator import para_checker_decorator, ClassValidator +@dataclasses.dataclass +class AnchorRecord: + replacement_spec: DefaultDict[Tensor, List[Tuple[int, Operation]]] + passing_tensors: List[Tensor] + batch_tensor_indexs: List[int] + sub_cutting_points: List[Tensor] + sub_graph_def: GraphDef + input_names: List[str] + output_names: List[str] + is_training: bool + input_indexs: List[int] = None + + def get_preprocessing_map_func( graph_def: GraphDef, input_names: List[str], @@ -142,7 +155,7 @@ def parse_batch(data_args: Any, data_batch: dict, key: str = None): def get_input_index_list( cutting_point_list: List[Tensor], - replacement_specs: ReplacementSpec, + replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], mapping_name_list: List[str], base_count: int, timestamp_index: int = None @@ -319,7 +332,7 @@ def get_sub_graph( return sub_graph_def, input_name_list, output_name_list -def update_input_tensor_with_new_batch(replacement_specs: ReplacementSpec, +def update_input_tensor_with_new_batch(replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], new_get_next_op_name: str, new_batch: Dict[str, Tensor]): """ @@ -428,6 +441,14 @@ def get_src_dataset(get_next_op: Operation, is_training: bool) -> DatasetV1Adapt logger.warning("The dataset op was not found, the error is `%s`. Start to traverse the operations.", err) graph = tf.compat.v1.get_default_graph() dataset_op_list = [op for op in graph.get_operations() if AnchorDatasetOp.PREFETCH_DATASET.value in op.name] + + # WARN: Couple with NoGradSubgraphSlicer::_find_old_dataset. 
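+        # Context: NoGradSubgraphSlicer._find_old_dataset registers each PrefetchDataset
+        # it has already consumed in this collection, so filtering against the
+        # collection keeps this re-scan from picking the same dataset op up twice.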
+ dataset_op_list = list( + filter(lambda op: op not in tf.compat.v1.get_collection(DeprecatedOp.DEPRECATED_PREFETCH_DATASET), + dataset_op_list) + ) + dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) + logger.debug("In get_src_dataset function, current mode(train: True, eval: False): %s, dataset_op_list: %s.", is_training, dataset_op_list) @@ -440,7 +461,7 @@ def get_src_dataset(get_next_op: Operation, is_training: bool) -> DatasetV1Adapt prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) target_op = prefetch_dataset_op_list[1] else: - raise RuntimeError(f"`{AnchorDatasetOp.PREFETCH_DATASET.value}` not found, got dataset_op_list: " + raise RuntimeError(f"'{AnchorDatasetOp.PREFETCH_DATASET.value}' not found, got transformation datasets: " f"{dataset_op_list}.") from err except Exception as err: raise RuntimeError(f"The dataset was not found, the error is `{err}`.") from err diff --git a/mx_rec/graph/slicers.py b/mx_rec/graph/slicers.py new file mode 100644 index 00000000..d22af868 --- /dev/null +++ b/mx_rec/graph/slicers.py @@ -0,0 +1,879 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import abc +from typing import List, Dict, Set, Tuple, Union + +import pandas as pd +import tensorflow as tf +from tensorflow import Operation, Tensor, SparseTensor, Graph, variant, resource +from tensorflow.python.data.ops.dataset_ops import DatasetV1Adapter + +from mx_rec.graph import utils, modifier +from mx_rec.util.log import logger +from mx_rec.validator.validator import ClassValidator, para_checker_decorator +from mx_rec.constants.constants import ( + ASCEND_TIMESTAMP, + MAX_WHILE_SIZE, + ASCAnchorAttr, + ASCEND_SPARSE_LOOKUP_ENTRANCE, +) +from mx_rec.graph.constants import DeprecatedOp, AnchorDatasetOp, AnchorIteratorOp +from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding + + +class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): + _SLICED_OP_NAME_PREFIX = "sliced" + + _SLICING_SUMMARY_NAME = "slicing_summary.csv" + _UNSLICED_FULL_GRAPH_NAME = "unsliced_full_graph.pbtxt" + _SLICED_SUB_GRAPH_NAME = "sliced_sub_graph.pbtxt" + _SLICED_FULL_GRAPH_NAME = "sliced_full_graph.pbtxt" + + _INVALID_STR_IN_OP_TYPE = ("Dataset", "Summary") + _INVALID_STR_IN_OP_NAME = ("save", "report_", "loss") + _INVALID_CONSUMER_OP_TYPE = ("Assign", "SaveV2") + + _VALID_TENSOR_CLASS = (Tensor, SparseTensor) + _INVALID_TENSOR_DTYPE = (variant, resource) + + def __init__(self, full_graph: Graph = None, info_dir: str = "slicing") -> None: + if not full_graph: + full_graph = tf.compat.v1.get_default_graph() + self._full_graph = full_graph + + if not os.path.exists(info_dir): + os.makedirs(info_dir) + self._info_dir = info_dir + + @abc.abstractmethod + def summarize(self) -> None: + pass + + @abc.abstractmethod + def slice(self) -> None: + pass + + def _slice_ops(self, 
sliceable_ops: Set[Operation], is_training: bool) -> None: + sliced_ops = self._find_min_dep_ops(sliceable_ops) + in_op_to_edge_ops, out_op_to_edge_ops = self._find_subgraph_in_and_out(sliced_ops) + + old_get_next = self._find_old_get_next(sliceable_ops) + old_dataset = self._find_old_dataset(old_get_next, is_training) + + new_dataset = self._make_new_dataset(old_dataset, sliced_ops, in_op_to_edge_ops, out_op_to_edge_ops) + new_dataset = new_dataset.prefetch(0) + + new_get_next = self._make_new_get_next(old_get_next, new_dataset) + self._replace_get_next(old_get_next, new_get_next, out_op_to_edge_ops, sliced_ops) + + def _make_new_dataset( + self, + old_dataset: DatasetV1Adapter, + sliced_ops: Set[Operation], + in_op_to_edge_ops: Dict[Operation, Set[Operation]], + out_op_to_edge_ops: Dict[Operation, Set[Operation]], + ) -> DatasetV1Adapter: + def slice_map_func(*batch): # pragma: no cover + logger.debug("The layout of old batch: %s.", batch) + + funcgraph = tf.compat.v1.get_default_graph() + flatten_batch = tf.nest.flatten(batch) + + for t in flatten_batch: + if isinstance(t, NoGradSubgraphSlicer._VALID_TENSOR_CLASS): + continue + raise RuntimeError(f"expected 'tf.Tensor' or 'tf.SparseTensor' in batch, but got {t}.") + + new_batch = self._clone_subgraph_into_funcgraph(sliced_ops, in_op_to_edge_ops, out_op_to_edge_ops, batch) + utils.export_pb_graph( + file_name=NoGradSubgraphSlicer._SLICED_SUB_GRAPH_NAME, + dump_graph=True, + graph_def=funcgraph.as_graph_def(), + export_path=self._info_dir, + ) + + return new_batch + + return old_dataset.map(slice_map_func) + + def _find_subgraph_in_and_out( + self, + sub_graph_ops: Set[Operation], + ) -> Tuple[Dict[Operation, Set[Operation]], Dict[Operation, Set[Operation]]]: + in_op_to_edge_ops = dict() + out_op_to_edge_ops = dict() + + for base_node in sub_graph_ops: + self._update_subgraph_in(base_node, in_op_to_edge_ops, sub_graph_ops) + self._update_subgraph_out(base_node, out_op_to_edge_ops, sub_graph_ops) + + logger.info("Got input relationship of extracted subgraph: %s", in_op_to_edge_ops) + logger.info("Got output relationship of extracted subgraph: %s", out_op_to_edge_ops) + return in_op_to_edge_ops, out_op_to_edge_ops + + def _find_old_get_next(self, sliceable_ops: Set[Operation]) -> Operation: + old_get_next = self._upward_bfs_op(sliceable_ops, AnchorIteratorOp.ITERATOR_GET_NEXT.value) + + tf.compat.v1.add_to_collection(DeprecatedOp.DEPRECATED_ITERATOR_GET_NEXT, old_get_next) + logger.info("Old 'IteratorGetNext' operation has been deprecated now.") + + return old_get_next + + def _find_old_dataset(self, get_next: Operation, is_training: bool) -> DatasetV1Adapter: + tgt_trans_dataset = None + try: + tgt_trans_dataset = self._find_trans_dataset(get_next) + except (ValueError, TypeError, RuntimeError) as err: + trans_datasets = [ + op for op in self._full_graph.get_operations() if AnchorDatasetOp.PREFETCH_DATASET.value in op.name + ] + trans_datasets = list( + filter( + lambda op: op not in tf.compat.v1.get_collection(DeprecatedOp.DEPRECATED_PREFETCH_DATASET), + trans_datasets, + ) + ) + sorted_datasets = sorted(trans_datasets, key=lambda op: op.name) + + if len(trans_datasets) == 1: + tgt_trans_dataset = sorted_datasets[0] + elif is_training and len(sorted_datasets) == 2: + tgt_trans_dataset = sorted_datasets[0] + elif not is_training and len(sorted_datasets) == 2: + tgt_trans_dataset = sorted_datasets[0] + else: + raise RuntimeError(f"target transformation dataset not found, got datasets: {trans_datasets}.") from err + except Exception as 
err: + raise RuntimeError(f"the dataset was not found, the error is `{err}`.") from err + + if not tgt_trans_dataset.outputs: + raise ValueError(f"the length of the outputs of target op `{tgt_trans_dataset}` is 0.") + logger.info("Find target op `%s`, and output is `%s`.", tgt_trans_dataset.name, tgt_trans_dataset.outputs) + + # WARN: Couple with modifier module, global collection used for filtering deprecated prefetch dataset. + self._full_graph.add_to_collection(DeprecatedOp.DEPRECATED_PREFETCH_DATASET, tgt_trans_dataset) + old_dataset = modifier.find_target_instance_dataset(tgt_trans_dataset.outputs[0]) + + return old_dataset + + def _find_trans_dataset(self, get_next: Operation) -> Operation: + if get_next.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: + raise TypeError(f"operation '{get_next}' must be an instance of 'IteratorGetNext'.") + + make_iter = modifier.find_make_iterator_op(get_next.outputs[0]) + + trans_dataset = None + if tf.__version__.startswith("1"): + optimize_dataset_op = self._upward_bfs_op(make_iter, AnchorDatasetOp.MODEL_DATASET.value) + trans_dataset = utils.find_parent_op(optimize_dataset_op) + if not trans_dataset: + raise RuntimeError("parent operation of 'ModelDataset' was not found.") + if trans_dataset[0].type != AnchorDatasetOp.OPTIMIZE_DATASET.value: + raise TypeError("operation 'OptimizeDataset' was not found.") + trans_dataset = trans_dataset[0] + else: + trans_dataset = self._upward_bfs_op(make_iter, AnchorDatasetOp.PREFETCH_DATASET.value) + + return trans_dataset + + def _clone_subgraph_into_funcgraph( + self, + sliced_ops: Set[Operation], + in_op_to_edge_ops: Dict[Operation, Set[Operation]], + out_op_to_edge_ops: Dict[Operation, Set[Operation]], + batch: Tuple[Dict[str, Union[Tensor, SparseTensor, Dict]]], + ) -> Dict[str, Union[Tensor, SparseTensor, Dict]]: + """Clone the sliced subgraph into a new funcgraph. + + Args: + sliced_ops: The operation set that has been sliced. + in_op_to_edge_ops: The input relationship of sliced subgraph. + out_op_to_edge_ops: The output relationship of sliced subgraph. + batch: The original batch layout of old dataset. + + Returns: + new_batch: The new batch layout of new dataset. + """ + + topo_subgraph_list = self._topo_sort_sliced_ops(sliced_ops) + + node_mapping = {} # subgraph-node -> funcgraph-node + tensor_mapping = {} # subgraph-tensor -> funcgraph-tensor + for in_op, edge_ops in in_op_to_edge_ops.items(): + self._get_mapping_for_subgraph_in(in_op, edge_ops, tensor_mapping) + for old_op in topo_subgraph_list: + self._get_mapping_for_subgraph(old_op, node_mapping, tensor_mapping) + + logger.info("Got node_mapping: %s", node_mapping) + logger.info("Got tensor_mapping: %s", tensor_mapping) + + ordered_output_tensors = self._sort_sliced_graph_outputs(out_op_to_edge_ops) + extra_output_tensor = self._get_mapped_tensor(tensor_mapping, ordered_output_tensors) + + if not isinstance(batch, tuple): + batch = (batch,) + + new_batch = batch[0] + for tensor in extra_output_tensor: + next_last_key = f"{sorted(new_batch)[-1]}_" + new_batch[next_last_key] = tensor + + logger.debug("Got new batch layout: %s.", new_batch) + return new_batch + + def _make_new_get_next( + self, + old_get_next: Operation, + new_dataset: DatasetV1Adapter, + ) -> Operation: + """Make new 'IteratorGetNext' operation. + + 1. This func will automatically detect the iterator type of the old dataset, and then make 'IteratorGetNext' + from the corresponding iterator. + 2. Only 'MakeIterator' and 'OneShotIterator' are available now. 
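+
+        A sketch of the resulting choice (mirroring the body below):
+            'MakeIterator'    -> tf.compat.v1.data.make_initializable_iterator(new_dataset)
+            'OneShotIterator' -> tf.compat.v1.data.make_one_shot_iterator(new_dataset)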
+ + Args: + old_get_next: The old 'IteratorGetNext' operation. + new_dataset: The new dataset which contains sliced subgraph and corresponding additional outputs. + + Returns: + new_get_next: The new 'IteratorGetNext' operation. + """ + + if not old_get_next.outputs: + raise RuntimeError("no available tensor in the dataset. Please check the dataset and data processing.") + + iter_type = None + if old_get_next.inputs: + iter_type = old_get_next.inputs[0].op.type + if iter_type == AnchorIteratorOp.ITERATOR_V2.value: + iter_type = modifier.find_make_iterator_op(old_get_next.outputs[0]).type + if iter_type not in (AnchorIteratorOp.MAKE_ITERATOR.value, AnchorIteratorOp.ONE_SHOT_ITERATOR.value): + raise RuntimeError( + f"only iterators `MakeIterator` and `OneShotIterator` are supported in `graph modify` mode, " + f"but the current iterator is `{iter_type}`." + ) + logger.info("The iterator type of old dataset is %s.", iter_type) + + if iter_type == AnchorIteratorOp.MAKE_ITERATOR.value: + new_iterator = tf.compat.v1.data.make_initializable_iterator(new_dataset) + else: + new_iterator = tf.compat.v1.data.make_one_shot_iterator(new_dataset) + logger.info("Got new iterator: %s from dataset %s.", new_iterator, new_dataset) + + new_batch_name = "{}/{}".format( + NoGradSubgraphSlicer._SLICED_OP_NAME_PREFIX, AnchorIteratorOp.ITERATOR_GET_NEXT.value + ) + new_batch = new_iterator.get_next(name=new_batch_name) + + # WARN: Couple with user model, this collection has been added manually. + if "timestamp" in new_batch.keys(): + tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, new_batch["timestamp"]) + + try: + new_batch_tensor = new_batch + while not isinstance(new_batch_tensor, NoGradSubgraphSlicer._VALID_TENSOR_CLASS): + if isinstance(new_batch_tensor, tuple): + new_batch_tensor = new_batch_tensor[0] + elif isinstance(new_batch_tensor, dict): + new_batch_tensor = list(new_batch_tensor.values()) + elif isinstance(new_batch_tensor, list): + new_batch_tensor = new_batch_tensor[0] + elif isinstance(new_batch_tensor, NoGradSubgraphSlicer._VALID_TENSOR_CLASS): + break + else: + raise RuntimeError(f"batch value {new_batch_tensor} of {type(new_batch_tensor)} is not supported.") + except IndexError as err: + raise IndexError("cannot find a tensor from given batch.") from err + + new_get_next = self._upward_bfs_op(new_batch_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value) + + logger.info("Got new_get_next: %s.", new_get_next) + return new_get_next + + def _replace_get_next( + self, + old_get_next: Operation, + new_get_next: Operation, + out_op_to_edge_ops: Dict[Operation, Set[Operation]], + sliced_ops: Set[Operation], + ) -> None: + """Replace the old 'IteratorGetNext' operation with the new one. + + 1. This func will update the consumer of the old 'IteratorGetNext' operation to the new one. + 2. This func will update the consumer of the output tensors of the sliced subgraph to the new one. + + Args: + old_get_next: The old 'IteratorGetNext' operation. + new_get_next: The new 'IteratorGetNext' operation. + out_op_to_edge_ops: The output relationship of sliced subgraph. + sliced_ops: The operation set that has been sliced. 
+ """ + + for t in old_get_next.outputs: + self._update_old_get_next_consumer(t, new_get_next, sliced_ops) + + next_offset = len(old_get_next.outputs) - 1 + sorted_outputs = self._sort_sliced_graph_outputs(out_op_to_edge_ops) + + for t in sorted_outputs: + next_offset += 1 + self._update_sliced_graph_consumer(t, new_get_next, next_offset) + + def _update_old_get_next_consumer( + self, old_get_next_output: Tensor, new_get_next: Operation, sliced_ops: Set[Operation] + ) -> None: + """Update the consumer of the old 'IteratorGetNext' operation to the new one. + + Args: + old_get_next_output: The output tensor of the old 'IteratorGetNext' operation. + new_get_next: The new 'IteratorGetNext' operation. + sliced_ops: The operation set that has been sliced. + """ + + old_tensor_name = old_get_next_output.name + output_index = old_tensor_name.split(":")[-1] + new_tensor_name = f"{new_get_next.name}:{output_index}" + new_tensor = self._full_graph.get_tensor_by_name(new_tensor_name) + + old_tensor_consumers = self._get_tensor_consumers(old_get_next_output) + for consumer in old_tensor_consumers: + if consumer in sliced_ops: + logger.debug("Ignore consumer: %s in sliced operations.", consumer.name) + continue + for i, t in enumerate(consumer.inputs): + if t != old_get_next_output: + logger.debug( + "Ignore input %s of consumer %s, cause it not output of 'IteratorGetNext'.", + t.name, + consumer.name, + ) + continue + consumer._update_input(i, new_tensor) + logger.debug( + "Succeed replace old input %s of consumer %s to new input %s.", + old_tensor_name, + consumer.name, + new_tensor, + ) + + def _update_sliced_graph_consumer( + self, sliced_graph_output: Tensor, new_get_next: Operation, next_offset: int + ) -> None: + """Update the consumer of the output tensors of the sliced subgraph to the new one. + + The outputs of the sliced subgraph are not the original outputs of 'IteratorGetNext'. Thus, next offset should + trace the last index of outputs of new 'IteratorGetNext'. + + Args: + sliced_graph_output: The output tensor of the sliced subgraph. + new_get_next: The new 'IteratorGetNext' operation. + next_offset: The last offset of the new 'IteratorGetNext' operation. 
+ """ + + new_tensor_name = f"{new_get_next.name}:{next_offset}" + new_tensor = self._full_graph.get_tensor_by_name(new_tensor_name) + + old_tensor_consumers = self._get_tensor_consumers(sliced_graph_output) + for consumer in old_tensor_consumers: + if consumer.type in NoGradSubgraphSlicer._INVALID_CONSUMER_OP_TYPE: + logger.debug("Ignore invalid consumer: %s.", consumer.name) + continue + for i, t in enumerate(consumer.inputs): + if t != sliced_graph_output: + logger.debug( + "Ignore input %s of consumer %s, cause it not output of sliced graph.", + t.name, + consumer.name, + ) + continue + consumer._update_input(i, new_tensor) + logger.debug( + "Succeed replace old input %s of consumer %s to new input %s.", + sliced_graph_output, + consumer.name, + new_tensor, + ) + + @staticmethod + def _find_min_dep_ops( + tgt_ops: Set[Operation], + ) -> Set[Operation]: + logger.debug("Search from base nodes: %s.", tgt_ops) + base_ops = tgt_ops.copy() + visited_ops = base_ops + + loop_cnt = 0 + while base_ops: + loop_cnt += 1 + if loop_cnt > MAX_WHILE_SIZE: + raise RuntimeError(f"maximum loop times exceed limit: {MAX_WHILE_SIZE}.") + + parent_ops = set() + for base_node in base_ops: + if len(base_node.control_inputs) != 0: + raise ValueError("control dependencies are not supported.") + + parent_ops.update( + tensor_in.op + for tensor_in in base_node.inputs + if tensor_in.op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value + ) + + new_ops = parent_ops - visited_ops + base_ops = parent_ops + visited_ops.update(new_ops) + + logger.debug("Found minimum dependency graph nodes: %s.", visited_ops) + return visited_ops + + @staticmethod + def _validate_op(op: Operation) -> bool: + op_type = op.type + op_name = op.name + op_inputs = op.inputs + op_outputs = op.outputs + + for s in NoGradSubgraphSlicer._INVALID_STR_IN_OP_TYPE: + if s in op_type: + logger.warning("Invalid operation type: %s which contains str: %s.", op_type, s) + return False + for s in NoGradSubgraphSlicer._INVALID_STR_IN_OP_NAME: + if s in op_name: + logger.warning("Invalid operation name: %s which contains str: %s.", op_name, s) + return False + for t in op_inputs: + if t.dtype in NoGradSubgraphSlicer._INVALID_TENSOR_DTYPE: + logger.warning("Invalid operation input tensor of operation: %s whose type is %s.", t, t.dtype) + return False + for t in op_outputs: + if t.dtype in NoGradSubgraphSlicer._INVALID_TENSOR_DTYPE: + logger.warning("Invalid operation output tensor of operation: %s whose type is %s.", t, t.dtype) + return False + + return True + + @staticmethod + def _update_subgraph_in( + base_ops: Operation, + input_to_edge_ops: Dict[Operation, Set[Operation]], + sub_graph_ops: Set[Operation], + ) -> None: + for input_tensor in base_ops.inputs: + input_node = input_tensor.op + if input_node not in sub_graph_ops: + res = input_to_edge_ops.get(input_node, set()) + res.add(base_ops) + input_to_edge_ops[input_node] = res + + @staticmethod + def _update_subgraph_out( + base_ops: Operation, + out_op_to_edge_ops: Dict[Operation, Set[Operation]], + sub_graph_ops: Set[Operation], + ) -> None: + for output_tensor in base_ops.outputs: + for output_consumer in output_tensor.consumers(): + if output_consumer not in sub_graph_ops: + res = out_op_to_edge_ops.get(output_consumer, set()) + res.add(base_ops) + out_op_to_edge_ops[output_consumer] = res + + @staticmethod + def _upward_bfs_op(base_ops: Union[Operation, Set[Operation], List[Operation]], tgt_op_type: str) -> Operation: + if not isinstance(base_ops, (set, list)): + base_ops = [base_ops] + + 
parent_ops = base_ops
+        while True:
+            for parent_op in parent_ops:
+                if parent_op.type == tgt_op_type:
+                    return parent_op
+            base_ops = parent_ops
+            parent_ops = []
+            for base_op in base_ops:
+                parent_ops.extend(utils.find_parent_op(base_op))
+            if not parent_ops:
+                raise ValueError(f"target operation '{tgt_op_type}' was not found.")
+
+    @staticmethod
+    def _topo_sort_sliced_ops(sliced_ops: Set[Operation]) -> List[Operation]:
+        topo_subgraph_list = []
+        topo_subgraph_set = set()
+        start_nodes = set(sliced_ops)
+        logger.info("Got topo_subgraph start nodes: %s", start_nodes)
+
+        def topo_sort_helper(curr_op, output_list, output_set):
+            if not isinstance(curr_op, Operation):
+                raise RuntimeError(f"topological sort expects a node (tf.Operation), but got: {curr_op}.")
+            curr_inputs = curr_op.inputs
+            logger.debug("Got topo_dfs: %s <- %s", curr_op.name, [x.name for x in curr_inputs])
+            current_control_inputs = curr_op.control_inputs
+            if len(current_control_inputs) > 0:
+                raise RuntimeError(
+                    f"control inputs are not supported: {curr_op.name}, control_inputs: {current_control_inputs}"
+                )
+            if curr_op in output_set:
+                return
+            output_set.add(curr_op)
+            for tensor in curr_inputs:
+                node = tensor.op
+                if node.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value and node not in output_set:
+                    topo_sort_helper(node, output_list, output_set)
+            output_list.append(curr_op)
+
+        for start_node in start_nodes:
+            topo_sort_helper(start_node, topo_subgraph_list, topo_subgraph_set)
+        if len(topo_subgraph_list) != len(topo_subgraph_set):
+            raise RuntimeError(f"got duplicated topological nodes: {sorted(topo_subgraph_list, key=lambda x: x.name)}.")
+        logger.info("Got topo_subgraph: %s", topo_subgraph_list)
+        return topo_subgraph_list
+
+    @staticmethod
+    def _get_mapping_for_subgraph_in(
+        from_op: Operation,
+        to_ops: Set[Operation],
+        tensor_mapping: Union[Dict[Tensor, Tensor], Dict[SparseTensor, SparseTensor]],
+    ) -> None:
+        if from_op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value:
+            raise RuntimeError(f"expected 'IteratorGetNext' as the subgraph input, but got {from_op}.")
+        for node in to_ops:
+            for each_tensor in node.inputs:
+                if each_tensor.op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value:
+                    continue
+                old_tensor_name = each_tensor.name
+                x_index = int(old_tensor_name.split(":")[-1])
+                g = tf.compat.v1.get_default_graph()
+                arg_tensor = g.get_tensor_by_name("args_%d:0" % x_index)
+                tensor_mapping[each_tensor] = arg_tensor
+
+    @staticmethod
+    def _get_mapping_for_subgraph(
+        old_op: Operation,
+        node_mapping: Dict[Operation, Operation],
+        tensor_mapping: Dict[Tensor, Tensor],
+    ) -> None:
+        logger.debug("old operation name: %s\nold operation inputs: %s\n", old_op.name, list(old_op.inputs))
+
+        for each_tensor in old_op.inputs:
+            if each_tensor not in tensor_mapping:
+                raise RuntimeError(
+                    f"input tensor {each_tensor} needed by {old_op.name} is not in tensor_mapping: {tensor_mapping}."
+                )
+        new_inputs = NoGradSubgraphSlicer._get_mapped_tensor(tensor_mapping, old_op.inputs)
+
+        node_def = old_op.node_def
+        node_def.name = "{}/{}".format(NoGradSubgraphSlicer._SLICED_OP_NAME_PREFIX, node_def.name)
+        new_node = tf.Operation(node_def=node_def, g=tf.compat.v1.get_default_graph(), inputs=new_inputs)
+
+        node_mapping[old_op] = new_node
+        for old_out_tensor, new_out_tensor in zip(old_op.outputs, new_node.outputs):
+            tensor_mapping[old_out_tensor] = new_out_tensor
+
+    @staticmethod
+    def _get_mapped_tensor(tensor2tensor: Dict[Tensor, Tensor], keys: List[Tensor]) -> List[Tensor]:
+        tensors = []
+        for 
k in keys: + if k not in tensor2tensor: + raise KeyError(f"failed to find key tensor: {k} from tensor map: {tensor2tensor}.") + tensors.append(tensor2tensor[k]) + return tensors + + @staticmethod + def _sort_sliced_graph_outputs(subgraph_out: Dict[Operation, Set[Operation]]) -> List[Tensor]: + extra_outputs = [] + sorted_outputs = sorted(subgraph_out.items(), key=lambda x: x[0].name) + for outside_op, edge_ops in sorted_outputs: + outside_op_inputs = set(outside_op.inputs) + for edge_op in edge_ops: + NoGradSubgraphSlicer._add_sorted_additional_tensors(extra_outputs, outside_op_inputs, edge_op) + return extra_outputs + + @staticmethod + def _add_sorted_additional_tensors(extra_outputs, outside_op_inputs, edge_op) -> None: + for each_tensor in sorted(edge_op.outputs, key=lambda x: x.name): + if each_tensor not in outside_op_inputs: + continue + if each_tensor in extra_outputs: + continue + extra_outputs.append(each_tensor) + + @staticmethod + def _get_tensor_consumers(tensor: Tensor) -> List[Operation]: + if not isinstance(tensor, NoGradSubgraphSlicer._VALID_TENSOR_CLASS): + raise RuntimeError(f"expected 'tf.Tensor' or 'tf.SparseTensor', but got: {tensor}") + + graph = tensor.graph + consumers = [] + consumer_names = [op.name for op in tensor.consumers()] + + with graph._lock: + for name in consumer_names: + if name not in graph._nodes_by_name: # ignore deleted node + continue + consumers.append(graph._nodes_by_name[name]) + + return consumers + + +@para_checker_decorator( + check_option_list=[ + ("op_types", ClassValidator, {"classes": (list,)}), + ("full_graph", ClassValidator, {"classes": (Graph, type(None))}), + ("info_dir", ClassValidator, {"classes": (str,)}), + ] +) +class LookupSubgraphSlicer(NoGradSubgraphSlicer): + def __init__(self, op_types: List[str], full_graph: Graph = None, info_dir: str = "lookup_slicing") -> None: + """Initialize LookupSubgraphSlicer. + Args: + op_types: The list of operation types to be sliced in lookup subgraph. + full_graph: The full graph to be sliced. If None, the default graph will be used. + info_dir: The directory to save the slicing information. Defaults to "lookup_slicing". 
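+
+        Raises:
+            ValueError: If 'op_types' is empty.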
+ """ + super().__init__(full_graph, info_dir) + if not op_types: + raise ValueError("no slicing operation types specified!") + self._op_types = set(op_types) + + def summarize(self) -> None: # pragma: no cover + all_tgt_ops = self._find_all_tgt_ops() + (train_sliceable_tgt_ops, eval_sliceable_tgt_ops) = self._find_sliceable_tgt_ops() + all_sliceable_tgt_ops = train_sliceable_tgt_ops | eval_sliceable_tgt_ops + + result = {"Operation Type": [], "Total Num": [], "Sliceable Num": [], "Sliceable Ratio": []} + + for op_type in self._op_types: + tgt_ops = set(filter(lambda op: op.type == op_type, all_tgt_ops)) + sliceable_tgt_ops = set(filter(lambda op: op.type == op_type, all_sliceable_tgt_ops)) + + total_num = len(tgt_ops) + sliceable_num = len(sliceable_tgt_ops) + + try: + sliceable_ratio = sliceable_num / total_num + except ZeroDivisionError: + logger.warning("No target operaiton types '%s' found in given graph.", self._op_types) + + result["Operation Type"].append(op_type) + result["Total Num"].append(total_num) + result["Sliceable Num"].append(sliceable_num) + result["Sliceable Ratio"].append(sliceable_ratio) + + result_df = pd.DataFrame(data=result) + file = "{}/{}".format(self._info_dir, NoGradSubgraphSlicer._SLICING_SUMMARY_NAME) + result_df.to_csv(file, sep=",") + + logger.info("Summary of slicing:\n%s", result_df) + + def slice(self) -> None: + utils.export_pb_graph( + file_name=NoGradSubgraphSlicer._UNSLICED_FULL_GRAPH_NAME, + dump_graph=True, + graph_def=self._full_graph.as_graph_def(), + export_path=self._info_dir, + ) + + (train_sliceable_ops, eval_sliceable_ops) = self._find_sliceable_tgt_ops() + + if train_sliceable_ops: + logger.info("Start to slice training lookup subgraph.") + self._slice_ops(train_sliceable_ops, is_training=True) + + if eval_sliceable_ops: + logger.info("Start to slice evaluation lookup subgraph.") + self._slice_ops(eval_sliceable_ops, is_training=False) + + utils.export_pb_graph( + file_name=NoGradSubgraphSlicer._SLICED_FULL_GRAPH_NAME, + dump_graph=True, + graph_def=self._full_graph.as_graph_def(), + export_path=self._info_dir, + ) + + def _find_all_tgt_ops(self) -> Set[Operation]: + """Found all operations of specific types in full graph.""" + all_tgt_ops = set() + all_ops = self._full_graph.get_operations() + + for op in all_ops: + if op.type not in self._op_types: + continue + all_tgt_ops.add(op) + + return all_tgt_ops + + def _find_sliceable_tgt_ops(self) -> Tuple[Set[Operation], Set[Operation]]: + """Found sliceable operations of given types in lookup subgraph.""" + + # WARN: Couple with mx_rec::core::embedding module. 
+ lookup_keys = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE) + + train_base_ops = set() + eval_base_ops = set() + for t in lookup_keys: + if BaseSparseEmbedding.get_anchor_attribute(t, ASCAnchorAttr.IS_TRAINING): + train_base_ops.add(t.op) + else: + eval_base_ops.add(t.op) + + def find_sliceable_ops(base_ops): + min_dep_ops = self._find_min_dep_ops(base_ops) + + sliceable_ops = set() + for op in min_dep_ops: + if not self._validate_op(op): + continue + if op.type not in self._op_types: + continue + sliceable_ops.add(op) + + return sliceable_ops + + train_sliceable_ops = find_sliceable_ops(train_base_ops) + eval_sliceable_ops = find_sliceable_ops(eval_base_ops) + + logger.debug("Found sliceable operations in training lookup subgraph: %s.", train_sliceable_ops) + logger.debug("Found sliceable operations in evaluation lookup subgraph: %s.", eval_sliceable_ops) + return (train_sliceable_ops, eval_sliceable_ops) + + +@para_checker_decorator( + check_option_list=[ + ("full_graph", ClassValidator, {"classes": (Graph, type(None))}), + ("info_dir", ClassValidator, {"classes": (str,)}), + ] +) +class OrphanLookupKeySlicer(NoGradSubgraphSlicer): + SLICEABLE_ORPHAN_LOOKUP_KEY_PREFIX = "orphan" + + def __init__(self, full_graph: Graph = None, info_dir: str = "orphan_slicing") -> None: + """Initialize OrphanLookupKeySlicer. + Args: + full_graph: The full graph to be sliced. If None, the default graph will be used. + info_dir: The directory to save the slicing information. Defaults to "orphan_slicing". + """ + super().__init__(full_graph, info_dir) + + def summarize(self) -> None: # pragma: no cover + (train_sliceable_ops, _) = self._find_sliceable_tgt_ops() + + if len(train_sliceable_ops) == 0: + return + + result = {"Operation Type": [], "Operation Name": []} + for op in train_sliceable_ops: + result["Operation Type"].append(op.type) + result["Operation Name"].append(op.name) + + result_df = pd.DataFrame(data=result) + file = "{}/{}".format(self._info_dir, NoGradSubgraphSlicer._SLICING_SUMMARY_NAME) + result_df.to_csv(file, sep=",") + + logger.info("Summary of slicing:\n%s", result_df) + + def slice(self) -> None: + utils.export_pb_graph( + file_name=NoGradSubgraphSlicer._UNSLICED_FULL_GRAPH_NAME, + dump_graph=True, + graph_def=self._full_graph.as_graph_def(), + export_path=self._info_dir, + ) + + (train_sliceable_ops, eval_sliceable_ops) = self._find_sliceable_tgt_ops() + + if train_sliceable_ops: + logger.info("Start to slice training lookup subgraph.") + self._slice_ops(train_sliceable_ops, is_training=True) + + if eval_sliceable_ops: + logger.info("Start to slice evaluation lookup subgraph.") + self._slice_ops(eval_sliceable_ops, is_training=False) + + utils.export_pb_graph( + file_name=NoGradSubgraphSlicer._SLICED_FULL_GRAPH_NAME, + dump_graph=True, + graph_def=self._full_graph.as_graph_def(), + export_path=self._info_dir, + ) + + def _slice_ops(self, sliceable_ops: Set[Operation], is_training: bool) -> None: + """Override the '_slice_ops' protected method of super class.""" + + sliced_ops = self._find_min_dep_ops(sliceable_ops) + in_op_to_edge_ops, out_op_to_edge_ops = self._find_subgraph_in_and_out(sliced_ops) + + all_get_nexts = [ + op for op in self._full_graph.get_operations() if op.type == AnchorIteratorOp.ITERATOR_GET_NEXT.value + ] + alive_get_nexts = list( + filter( + lambda op: op not in tf.compat.v1.get_collection(DeprecatedOp.DEPRECATED_ITERATOR_GET_NEXT), + all_get_nexts, + ) + ) + alive_get_nexts = sorted(alive_get_nexts, key=lambda op: op.name) + + old_get_next = None 
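+        # A single live 'IteratorGetNext' serves both phases; with two, the name-sorted first
+        # one belongs to the training graph and the second to the evaluation graph.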
+        if len(alive_get_nexts) == 1:
+            old_get_next = alive_get_nexts[0]
+        else:
+            old_get_next = alive_get_nexts[0] if is_training else alive_get_nexts[1]
+
+        old_dataset = self._find_old_dataset(old_get_next, is_training)
+
+        new_dataset = self._make_new_dataset(old_dataset, sliced_ops, in_op_to_edge_ops, out_op_to_edge_ops)
+        new_dataset = new_dataset.prefetch(0)
+
+        new_get_next = self._make_new_get_next(old_get_next, new_dataset)
+        self._replace_get_next(old_get_next, new_get_next, out_op_to_edge_ops, sliced_ops)
+
+    def _find_sliceable_tgt_ops(self) -> Tuple[Set[Operation], Set[Operation]]:
+        """Find orphan keys' additional identity operations in the lookup subgraph."""
+
+        # WARN: Coupled with the mx_rec::core::embedding module.
+        lookup_keys = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE)
+
+        train_base_ops = set()
+        eval_base_ops = set()
+        for t in lookup_keys:
+            if BaseSparseEmbedding.get_anchor_attribute(t, ASCAnchorAttr.IS_TRAINING):
+                train_base_ops.add(t.op)
+            else:
+                eval_base_ops.add(t.op)
+
+        def find_sliceable_ops(base_ops):
+            min_dep_ops = self._find_min_dep_ops(base_ops)
+
+            sliceable_ops = set()
+            for op in min_dep_ops:
+                if not self._validate_op(op):
+                    continue
+                if OrphanLookupKeySlicer.SLICEABLE_ORPHAN_LOOKUP_KEY_PREFIX not in op.name:
+                    continue
+                sliceable_ops.add(op)
+
+            return sliceable_ops
+
+        train_sliceable_ops = find_sliceable_ops(train_base_ops)
+        eval_sliceable_ops = find_sliceable_ops(eval_base_ops)
+
+        logger.debug("Found sliceable operations in training lookup subgraph: %s.", train_sliceable_ops)
+        logger.debug("Found sliceable operations in evaluation lookup subgraph: %s.", eval_sliceable_ops)
+        return (train_sliceable_ops, eval_sliceable_ops)
diff --git a/mx_rec/graph/utils.py b/mx_rec/graph/utils.py
index c010d80d..8ffc8bc6 100644
--- a/mx_rec/graph/utils.py
+++ b/mx_rec/graph/utils.py
@@ -17,16 +17,17 @@
 import os
 from collections import defaultdict
-from typing import List, Dict, Union
+from typing import List, Dict, Union, DefaultDict, Tuple
 
 import tensorflow as tf
 from tensorflow import Operation, Tensor
 from tensorflow.core.framework.graph_pb2 import GraphDef
 from tensorflow.python.framework.errors_impl import InvalidArgumentError
 
+from mx_rec.graph.slicers import OrphanLookupKeySlicer
+from mx_rec.graph.constants import AnchorIteratorOp
 from mx_rec.constants.constants import ASCAnchorAttr, DUMP_MIDIFY_GRAPH_FILE_MODE
 from mx_rec.core.embedding import BaseSparseEmbedding
-from mx_rec.graph.graph_typing import ReplacementSpec
 from mx_rec.util.log import logger
 
 
@@ -46,21 +47,21 @@ def find_parent_op(operator: Operation) -> List[Operation]:
     parent_ops = []
     for input_tensor in operator.inputs:
         parent_op = input_tensor.op
-        if isinstance(parent_op, tf.Operation):
+        if isinstance(parent_op, Operation):
             parent_ops.append(parent_op)
 
     return parent_ops
 
 
 def check_cutting_points(cutting_point_list: List[Tensor]):
     for tensor in cutting_point_list:
-        if not isinstance(tensor, tf.Tensor):
+        if not isinstance(tensor, Tensor):
             raise TypeError(f"Collection ASCEND_CUTTING_POINT can only contain Tensors, but '{tensor}' was found.")
         if tensor.op.type != "Identity":
             raise ValueError(f"Cutting point can only be the output of an Operator 'Identity'.")
 
 
-def record_ops_to_replace(src_op: Operation) -> ReplacementSpec:
+def record_ops_to_replace(src_op: Operation) -> DefaultDict[Tensor, List[Tuple[int, Operation]]]:
     replacement_specs = defaultdict(list)
     output_list = src_op.outputs
     op_list = tf.compat.v1.get_default_graph().get_operations()
@@ -73,7 +74,7 @@ def 
record_ops_to_replace(src_op: Operation) -> ReplacementSpec: return replacement_specs -def replace_anchor(replacement_specs: ReplacementSpec, new_tensor_list: List[Tensor]): +def replace_anchor(replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], new_tensor_list: List[Tensor]): if len(replacement_specs) != len(new_tensor_list): raise ValueError(f"Given replacement_specs and new_tensor_list must have the same length. " f"replacement_specs: {replacement_specs}, new_tensor_list: {new_tensor_list}") @@ -93,7 +94,7 @@ def export_pb_graph(file_name: str, dump_graph: bool = False, graph_def: GraphDef = None, export_path: str = "./export_graph", - as_text: bool = False): + as_text: bool = True): """ Save tensorflow graph before and after modifier graph :param file_name: FileName of the graph @@ -164,15 +165,16 @@ def replace_anchor_vec(cutting_point: Tensor, attribute: ASCAnchorAttr, anchor: replace_anchor(replacement_specs_for_anchor_vec, [anchor]) -def tag_orphan_ids(ids: tf.Tensor) -> tf.Tensor: - """ - 将孤儿ids使用identity操作创建ACG_PUSH_NODE前缀命名的标记节点,以便在PushOps时能找到。 - """ +def mark_orphan_lookup_key(lookup_key: Tensor) -> Tensor: graph_def = tf.compat.v1.get_default_graph().as_graph_def() - subgraph = tf.compat.v1.graph_util.extract_sub_graph(graph_def, [ids.op.name]) + subgraph = tf.compat.v1.graph_util.extract_sub_graph(graph_def, [lookup_key.op.name]) + for node in subgraph.node: - if node.op == 'IteratorGetNext': - return ids - new_ids = tf.identity(ids, name=f"ACG_PUSH_NODE_{ids.op.name}") - logger.info('Tag orphan op node: %s with %s.', ids, new_ids) - return new_ids + if node.op == AnchorIteratorOp.ITERATOR_GET_NEXT.value: + return lookup_key + + name_prefix = OrphanLookupKeySlicer.SLICEABLE_ORPHAN_LOOKUP_KEY_PREFIX + marked_lookup_key = tf.identity(lookup_key, name="{}/{}".format(name_prefix, lookup_key.op.name)) + + logger.info('Mark orphan lookup key %s as %s.', lookup_key, marked_lookup_key) + return marked_lookup_key diff --git a/tests/mx_rec/graph/test_acg_push_ops.py b/tests/mx_rec/graph/test_acg_push_ops.py deleted file mode 100644 index 129b773f..00000000 --- a/tests/mx_rec/graph/test_acg_push_ops.py +++ /dev/null @@ -1,514 +0,0 @@ -#!/usr/bin/env python3 -# coding: UTF-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from unittest import TestCase -from unittest.mock import patch, Mock - -import tensorflow as tf -from tensorflow.core.framework import node_def_pb2 -from tensorflow.python.data.ops.dataset_ops import DatasetV1 -from mx_rec.graph.acg_push_ops import ( - ACGPushOpsToDatasetHook, - SubgraphInfo, - _OP_NAME_CONTAIN_STRING_TO_PUSH, - _ACG_NEW_INITIALIZER, - _find_ops_to_be_pushed, - _find_op_from_base_op, - _find_subgraph_nodes, - _get_mapping_tensor, - _topo_subgraph, - _get_dataset_op, - _clone_subgraph_into_funcgraph, - _update_subgraph_out_consumer, - _get_src_dataset, - _update_iterator_getnext, - _find_subgraph_in_out, - _push_subgraph_to_dataset, - _warn_for_var_scope_nodes, - _frozen_variable_node_to_func_const_node_def, - _update_old_consumer, - _get_mapping_for_subgraph, - _get_mapping_for_subgraph_in, - _ordered_output_from_subgraph, - _replace_get_next_op, - _patched_get_src_dataset, -) -from tests.mx_rec.core.mock_class import MockConfigInitializer -from tests.mx_rec.graph.mock_dataset import gen_mock_dataset - - -@patch.multiple( - "mx_rec.graph.patch", - ConfigInitializer=Mock(return_value=MockConfigInitializer(modify_graph=True, is_graph_modify_hook_running=True)), -) -@patch.multiple( - "tensorflow.compat.v1.train.Saver", - __init__=Mock(return_value=None), - build=Mock(), -) -@patch.multiple("mx_rec.graph.acg_push_ops", _find_ops_to_be_pushed=Mock()) -class ACGPushOpsToDatasetHookTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_cutting_point = tf.identity(mock_ids) - - mock_new_iterator = mock_dataset.make_initializable_iterator() - tf.compat.v1.add_to_collection(_ACG_NEW_INITIALIZER, mock_new_iterator.initializer) - - with tf.compat.v1.train.MonitoredSession(hooks=[ACGPushOpsToDatasetHook()]) as sess: - sess.run(mock_iterator.initializer) - sess.run(mock_cutting_point) - - -@patch.multiple( - "mx_rec.graph.acg_push_ops", - _find_subgraph_nodes=Mock(return_value=set()), - _push_subgraph_to_dataset=Mock(), -) -class FindOpsToBePushedTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok_op_contain_str_to_push(self): - tensor = tf.constant(value=[1, 2, 3], name="MOCK" + list(_OP_NAME_CONTAIN_STRING_TO_PUSH)[0]) - mock_graph = tf.compat.v1.get_default_graph() - _find_ops_to_be_pushed(mock_graph) - - def test_ok_op_type_to_push(self): - const_tensor = tf.constant(value=[1, 2, 3], dtype=tf.int32) - str_tensor = tf.compat.v1.as_string(const_tensor) - num_tensor = tf.compat.v1.string_to_number(str_tensor) - mock_graph = tf.compat.v1.get_default_graph() - _find_ops_to_be_pushed(mock_graph) - - def test_ok_no_node_to_push(self): - mock_graph = tf.compat.v1.get_default_graph() - _find_ops_to_be_pushed(mock_graph) - - -class FindSubgraphNodesTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - - tensor_in_subgraph = tf.identity(mock_ids) - tensor_out_subgraph = tf.identity(tensor_in_subgraph) - mock_base_nodes = {tensor_out_subgraph.op} - - subgraph_nodes = _find_subgraph_nodes( - 
tf.compat.v1.get_default_graph(), mock_base_nodes, tgt_op_type="IteratorGetNext" - ) - self.assertEqual(subgraph_nodes, {tensor_in_subgraph.op, tensor_out_subgraph.op}) - - -class WarnForVarScopeNodesTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - with tf.compat.v1.variable_scope("mock_var_scope"): - var1 = tf.compat.v1.get_variable("var", shape=(3, 3), initializer=tf.random_normal_initializer()) - - mock_all_nodes = tf.compat.v1.get_default_graph().get_operations() - mock_base_node = var1.op - _warn_for_var_scope_nodes(mock_all_nodes, mock_base_node) - - -class FindOpFromBaseOpTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_err_no_tgt_op_type(self): - parent_tensor = tf.ones(shape=(3, 3)) - child_tensor = tf.identity(parent_tensor) - with self.assertRaises(ValueError): - _find_op_from_base_op(child_tensor.op, "IteratorGetNext") - - -class GetDatasetOpTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_prefetch_dataset = mock_dataset.prefetch(buffer_size=10) - mock_iterator = mock_prefetch_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - mock_graph = tf.compat.v1.get_default_graph() - expected = mock_graph.get_operation_by_name("OptimizeDataset") - - tgt_dataset_op = _get_dataset_op(mock_graph, mock_get_next_op) - self.assertEqual(tgt_dataset_op, expected) - - def test_err_invalid_get_next_op_type(self): - mock_get_next_op = tf.zeros(shape=(3,)).op - mock_graph = tf.compat.v1.get_default_graph() - - with self.assertRaises(TypeError): - _get_dataset_op(mock_graph, mock_get_next_op) - - @patch.multiple("mx_rec.graph.acg_push_ops", _find_op_from_base_op=Mock(return_value=None)) - @patch.multiple("mx_rec.graph.acg_push_ops.modifier", find_parent_op=Mock(return_value=None)) - def test_err_no_tgt_op_found(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - mock_graph = tf.compat.v1.get_default_graph() - - with self.assertRaises(RuntimeError): - _get_dataset_op(mock_graph, mock_get_next_op) - - -class OrderedOutputFromSubgraphTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next(name="IteratorGetNext") - mock_ids = mock_batch.get("mock_ids") - - mock_subgraph_out = {tf.identity(mock_ids).op: {mock_ids.op}} - - addition_funcgraph_output_tensor = _ordered_output_from_subgraph(mock_subgraph_out) - self.assertEqual(addition_funcgraph_output_tensor, [mock_ids]) - - -class PushSubgraphToDatasetTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - - tensor_in_subgraph = tf.identity(mock_ids) - tensor_out_subgraph = tf.identity(tensor_in_subgraph) - mock_subgraph_to_push = {tensor_in_subgraph.op} - _push_subgraph_to_dataset(tf.compat.v1.get_default_graph(), mock_subgraph_to_push) - - -class FindSubgraphInOutTest(TestCase): 
- def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - - tensor_in_subgraph = tf.identity(mock_ids) - tensor_out_subgraph = tf.identity(tensor_in_subgraph) - mock_subgraph_nodes = {tensor_in_subgraph.op} - - ( - subgraph_in, - subgraph_out, - ) = _find_subgraph_in_out(mock_subgraph_nodes) - self.assertEqual(subgraph_in, {mock_ids.op: {tensor_in_subgraph.op}}) - self.assertEqual(subgraph_out, {tensor_out_subgraph.op: {tensor_in_subgraph.op}}) - - -class GetSrcDatasetTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok_make_iterator(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - src_dataset = _get_src_dataset(tf.compat.v1.get_default_graph(), mock_get_next_op) - self.assertEqual(src_dataset, mock_dataset) - - def test_ok_one_shot_iterator(self): - mock_dataset = gen_mock_dataset() - mock_prefetch_dataset = mock_dataset.prefetch(10) - mock_iterator = mock_prefetch_dataset.make_one_shot_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - src_dataset = _get_src_dataset(tf.compat.v1.get_default_graph(), mock_get_next_op) - self.assertEqual(src_dataset, mock_dataset) - - def test_err_no_anchor_dataset(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_one_shot_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - with self.assertRaises(RuntimeError): - _get_src_dataset(tf.compat.v1.get_default_graph(), mock_get_next_op) - - -class CloneSubgraphIntoFuncgraphTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - - mock_subgraph_in = {mock_ids.op: {tf.identity(mock_ids).op}} - mock_subgraph_out = {tf.identity(mock_ids).op: {mock_ids.op}} - mock_subgraph_to_push = set() - mock_subgraph_info = SubgraphInfo(mock_subgraph_in, mock_subgraph_out, mock_subgraph_to_push) - - mock_new_ids = tf.ones_like(mock_ids) - mock_x = [mock_new_ids] - mock_old_x = ({"mock_new_ids": mock_new_ids},) - - mock_defaultgraph = tf.compat.v1.get_default_graph() - with tf.Graph().as_default(): - mock_funcgraph = tf.compat.v1.get_default_graph() - _clone_subgraph_into_funcgraph(mock_funcgraph, mock_defaultgraph, mock_subgraph_info, mock_x, mock_old_x) - - -class GetMappingForSubgraphInTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_prefetch_dataset = mock_dataset.prefetch(10) - mock_iterator = mock_prefetch_dataset.make_one_shot_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - - mock_from_node = mock_ids.op - mock_to_nodes = {tf.identity(mock_ids).op} - mock_new_ids = tf.zeros_like(mock_ids) - mock_x = [mock_new_ids] - tensor_mapping = dict() - - _get_mapping_for_subgraph_in(mock_from_node, mock_to_nodes, mock_x, tensor_mapping) - 
self.assertEqual(tensor_mapping, {mock_ids: mock_new_ids}) - - -class GetMappingForSubgraphTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_defaultgraph = tf.compat.v1.get_default_graph() - - # NOTE: Simulate independent graph environment while executing `dataset.map()` method. - with tf.Graph().as_default(): - key_tensor = tf.zeros(shape=(1)) - val_tensor = tf.zeros(shape=(1)) - mock_tensor_mapping = {key_tensor: val_tensor} - - mock_node_mapping = dict() - mock_old_node = tf.identity(key_tensor).op - mock_funcgraph = tf.compat.v1.get_default_graph() - - _get_mapping_for_subgraph( - mock_funcgraph, mock_defaultgraph, mock_node_mapping, mock_old_node, mock_tensor_mapping - ) - - self.assertEqual(len(mock_node_mapping), 1) - self.assertEqual(len(mock_tensor_mapping), 2) - - -@patch.multiple( - "mx_rec.graph.patch", - ConfigInitializer=Mock(return_value=MockConfigInitializer(modify_graph=True, is_graph_modify_hook_running=True)), -) -class FrozenVariableNodeToFuncConstNodeDefTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - var_tensor = tf.Variable(initial_value=[1], shape=(1,)) - tf.compat.v1.assign(ref=var_tensor, value=[1]) - - mock_funcgraph = tf.Graph() - mock_defaultgraph = tf.compat.v1.get_default_graph() - new_const_node: node_def_pb2.NodeDef = _frozen_variable_node_to_func_const_node_def( - var_tensor.op, mock_funcgraph, mock_defaultgraph - ) - self.assertEqual(new_const_node.op, "Const") - - -class GetMappingTensorTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - key_tensor = tf.zeros(shape=(3, 3)) - val_tensor = tf.ones(shape=(3, 3)) - tsr2tsr = {key_tensor: val_tensor} - keys = [key_tensor] - - mapped_tensors = _get_mapping_tensor(tsr2tsr, keys) - self.assertEqual(mapped_tensors, [val_tensor]) - - def test_err_key_tensor_not_exist(self): - tsr2tsr = {tf.zeros(shape=(3, 3)): tf.ones(shape=(3, 3))} - keys = [tf.ones(shape=(3, 3))] - - with self.assertRaises(KeyError): - _get_mapping_tensor(tsr2tsr, keys) - - -class TopoSubgraphTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_prefetch_dataset = mock_dataset.prefetch(10) - mock_iterator = mock_prefetch_dataset.make_one_shot_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - tensor1 = tf.identity(mock_ids) - tensor2 = tf.add(tensor1, 1) - mock_subgraph = {tensor1.op, tensor2.op} - - const_op_for_add = None - for tensor in tensor2.op.inputs: - if tensor.op.name != "Add/y": - continue - const_op_for_add = tensor.op - - if not const_op_for_add: - self.fail( - f"Failed to find input of add operation, input tensor of add op: {[x.op for x in tensor2.op.inputs]}" - ) - - topo_subgraph_list = _topo_subgraph(mock_subgraph) - self.assertEqual(topo_subgraph_list, [tensor1.op, const_op_for_add, tensor2.op]) - - -class UpdateIteratorGetNextTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_old_dataset = gen_mock_dataset() - mock_old_iterator = mock_old_dataset.make_initializable_iterator() - mock_old_batch = mock_old_iterator.get_next(name="OldIteratorGetNext") - mock_old_ids = mock_old_batch.get("mock_ids") - mock_old_get_next_op = mock_old_ids.op - - mock_new_dataset: DatasetV1 = mock_old_dataset.map(lambda x: x) - mock_subgraph_out = 
{tf.identity(mock_old_ids).op: {mock_old_ids.op}} - - _update_iterator_getnext( - graph=tf.compat.v1.get_default_graph(), - get_next_op=mock_old_get_next_op, - tgt_dataset=mock_new_dataset, - subgraph_out=mock_subgraph_out, - subgraph_to_push=set(), - ) - - -class UpdateOldConsumerTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next(name="NewIteratorGetNext") - mock_ids = mock_batch.get("mock_ids") - mock_new_get_next_op = mock_ids.op - mock_output_tensor = tf.identity(mock_ids) - - _update_old_consumer( - graph=tf.compat.v1.get_default_graph(), - new_get_next_op=mock_new_get_next_op, - output_tensor=mock_ids, - subgraph_to_push=set(), - ) - - -class UpdateSubgraphOutConsumerTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next(name="NewIteratorGetNext") - mock_ids = mock_batch.get("mock_ids") - mock_new_get_next_op = mock_ids.op - mock_output_tensor = tf.identity(mock_ids) - - _update_subgraph_out_consumer( - graph=tf.compat.v1.get_default_graph(), - new_get_next_op=mock_new_get_next_op, - offset=0, - output_tensor=mock_ids, - ) - - -class PatchedGetSrcDatasetTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_prefetch_dataset = mock_dataset.prefetch(10) - mock_double_prefetch_dataset = mock_prefetch_dataset.prefetch(10) - mock_iterator = mock_prefetch_dataset.make_one_shot_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - src_dataset = _patched_get_src_dataset(mock_get_next_op, is_training=True) - self.assertEqual(src_dataset, mock_prefetch_dataset) - - def test_err_single_prefetch_dataset(self): - mock_dataset = gen_mock_dataset() - mock_prefetch_dataset = mock_dataset.prefetch(10) - mock_iterator = mock_prefetch_dataset.make_one_shot_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - with self.assertRaises(RuntimeError): - _patched_get_src_dataset(mock_get_next_op, is_training=True) diff --git a/tests/mx_rec/graph/test_modifier.py b/tests/mx_rec/graph/test_modifier.py index 14b87617..2a9af10d 100644 --- a/tests/mx_rec/graph/test_modifier.py +++ b/tests/mx_rec/graph/test_modifier.py @@ -31,9 +31,9 @@ from mx_rec.constants.constants import ( ASCAnchorAttr, ) from mx_rec.core.asc import FeatureSpec -from mx_rec.graph.graph_typing import AnchorRecord from mx_rec.graph.modifier import ( GraphModifierHook, + AnchorRecord, find_make_iterator_op, find_target_dataset_op, find_target_instance_dataset, -- Gitee From d0367f93d3458535409a7f1d3b96d75ec6b9678b Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Mon, 29 Apr 2024 06:13:07 +0000 Subject: [PATCH 072/302] =?UTF-8?q?!107=20=E5=8E=BB=E9=99=A4ascend=5Fvisib?= =?UTF-8?q?le=5Fdevices=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=EF=BC=8C?= =?UTF-8?q?=E5=A2=9E=E6=B7=BBCM=5FWORKER=5FSIZE=E7=9A=84=E8=8C=83=E5=9B=B4?= =?UTF-8?q?=E6=A0=A1=E9=AA=8C=20*=20=E5=8E=BB=E9=99=A4ascend=5Fvisible=5Fd?= =?UTF-8?q?evices=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F=EF=BC=8C=E5=A2=9E?= 
=?UTF-8?q?=E6=B7=BBCM=5FWORKER=5FSIZE=E7=9A=84=E8=8C=83=E5=9B=B4=E6=A0=A1?= =?UTF-8?q?=E9=AA=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/run.sh | 1 - examples/demo/little_demo/run.sh | 1 - examples/demo/little_demo_estimator/run.sh | 1 - mx_rec/constants/constants.py | 6 +++++- mx_rec/util/global_env_conf.py | 16 +++++----------- tests/mx_rec/util/test_variable.py | 3 --- 6 files changed, 10 insertions(+), 18 deletions(-) diff --git a/examples/DCNv2/run.sh b/examples/DCNv2/run.sh index 1709959c..860ff53f 100644 --- a/examples/DCNv2/run.sh +++ b/examples/DCNv2/run.sh @@ -92,7 +92,6 @@ if [ -n "$ip" ]; then echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" echo "CM_WORKER_IP=$CM_WORKER_IP" echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" - echo "ASCEND_VISIBLE_DEVICES=$ASCEND_VISIBLE_DEVICES" else # ranktable echo "Current is ranktable solution, hccl json file:${hccl_cfg_json}" diff --git a/examples/demo/little_demo/run.sh b/examples/demo/little_demo/run.sh index de7fd806..9462a0cb 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -160,7 +160,6 @@ else echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" echo "CM_WORKER_IP=$CM_WORKER_IP" echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" - echo "ASCEND_VISIBLE_DEVICES=$ASCEND_VISIBLE_DEVICES" ######################################################### else echo "ip: $ip not available!" # 使用ranktable方案 diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index 2c78166f..6534fb21 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -150,7 +150,6 @@ else echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" echo "CM_WORKER_IP=$CM_WORKER_IP" echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" - echo "ASCEND_VISIBLE_DEVICES=$ASCEND_VISIBLE_DEVICES" ######################################################### else echo "ip: $ip not available!" 
# 使用ranktable方案 diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index a5f055ab..f69f32c8 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -43,6 +43,11 @@ DEFAULT_HD_CHANNEL_SIZE = 40 MAX_HD_CHANNEL_SIZE = 8192 MIN_HD_CHANNEL_SIZE = 2 +# CM_WORKER_SIZE集群节点数 +DEFAULT_CM_WORKER_SIZE = 0 +MAX_CM_WORKER_SIZE = 512 +MIN_CM_WORKER_SIZE = 0 + # key process线程数 DEFAULT_KP_THREAD_NUM = 6 MIN_KP_THREAD_NUM = 1 @@ -116,7 +121,6 @@ class BaseEnum(Enum): class EnvOption(Enum): MXREC_LOG_LEVEL = "MXREC_LOG_LEVEL" RANK_TABLE_FILE = "RANK_TABLE_FILE" - ASCEND_VISIBLE_DEVICES = "ASCEND_VISIBLE_DEVICES" CM_CHIEF_DEVICE = "CM_CHIEF_DEVICE" CM_WORKER_SIZE = "CM_WORKER_SIZE" TF_DEVICE = "TF_DEVICE" diff --git a/mx_rec/util/global_env_conf.py b/mx_rec/util/global_env_conf.py index 52b5af46..313f1693 100644 --- a/mx_rec/util/global_env_conf.py +++ b/mx_rec/util/global_env_conf.py @@ -22,7 +22,7 @@ from mx_rec.constants.constants import EnvOption, RecPyLogLevel, Flag, EMPTY_STR DEFAULT_HD_CHANNEL_SIZE, DEFAULT_KP_THREAD_NUM, DEFAULT_FAST_UNIQUE_THREAD_NUM, RecCPPLogLevel, MAX_INT32, \ MIN_HD_CHANNEL_SIZE, MAX_HD_CHANNEL_SIZE, MIN_KP_THREAD_NUM, MAX_KP_THREAD_NUM, \ MIN_FAST_UNIQUE_THREAD_NUM, MAX_FAST_UNIQUE_THREAD_NUM, DEFAULT_HOT_EMB_UPDATE_STEP, MIN_HOT_EMB_UPDATE_STEP, \ - MAX_HOT_EMB_UPDATE_STEP, TFDevice + MAX_HOT_EMB_UPDATE_STEP, TFDevice, MAX_CM_WORKER_SIZE, MIN_CM_WORKER_SIZE, DEFAULT_CM_WORKER_SIZE from mx_rec.validator.validator import para_checker_decorator, OptionValidator, DirectoryValidator, Convert2intValidator @@ -30,7 +30,6 @@ from mx_rec.validator.validator import para_checker_decorator, OptionValidator, class RecEnv: mxrec_log_level: str rank_table_file: str - ascend_visible_devices: str cm_chief_device: str cm_worker_size: str tf_device: str @@ -45,9 +44,6 @@ class RecEnv: use_combine_faae: str stat_on: str record_key_count: str - rank_id_env: str - rank_size_env: str - local_rank_size_env: str def get_global_env_conf() -> RecEnv: @@ -58,9 +54,8 @@ def get_global_env_conf() -> RecEnv: rec_env = RecEnv( mxrec_log_level=os.getenv(EnvOption.MXREC_LOG_LEVEL.value, RecPyLogLevel.INFO.value), rank_table_file=os.getenv(EnvOption.RANK_TABLE_FILE.value, EMPTY_STR), - ascend_visible_devices=os.getenv(EnvOption.ASCEND_VISIBLE_DEVICES.value), cm_chief_device=os.getenv(EnvOption.CM_CHIEF_DEVICE.value), - cm_worker_size=os.getenv(EnvOption.CM_WORKER_SIZE.value), + cm_worker_size=os.getenv(EnvOption.CM_WORKER_SIZE.value, DEFAULT_CM_WORKER_SIZE), tf_device=os.getenv(EnvOption.TF_DEVICE.value, TFDevice.NONE.value), acl_timeout=os.getenv(EnvOption.ACL_TIMEOUT.value, "-1"), hd_channel_size=os.getenv(EnvOption.HD_CHANNEL_SIZE.value, DEFAULT_HD_CHANNEL_SIZE), @@ -72,10 +67,7 @@ def get_global_env_conf() -> RecEnv: glog_stderrthreahold=os.getenv(EnvOption.GLOG_STDERRTHREAHOLD.value, RecCPPLogLevel.INFO.value), use_combine_faae=os.getenv(EnvOption.USE_COMBINE_FAAE.value, Flag.FALSE.value), stat_on=os.getenv(EnvOption.STAT_ON.value, Flag.FALSE.value), - record_key_count=os.getenv(EnvOption.RECORD_KEY_COUNT.value, Flag.FALSE.value), - rank_id_env=os.getenv(EnvOption.OMPI_COMM_WORLD_RANK.value), - rank_size_env=os.getenv(EnvOption.OMPI_COMM_WORLD_LOCAL_SIZE.value), - local_rank_size_env=os.getenv(EnvOption.OMPI_COMM_WORLD_LOCAL_SIZE.value), + record_key_count=os.getenv(EnvOption.RECORD_KEY_COUNT.value, Flag.FALSE.value) ) return rec_env @@ -84,6 +76,8 @@ def get_global_env_conf() -> RecEnv: @para_checker_decorator(check_option_list=[ ("mxrec_log_level", 
OptionValidator, {"options": [i.value for i in list(RecPyLogLevel)]}), ("rank_table_file", DirectoryValidator, {}, ["check_exists_if_not_empty"]), + ("cm_worker_size", Convert2intValidator, {"min_value": MIN_CM_WORKER_SIZE, "max_value": MAX_CM_WORKER_SIZE}, + ["check_value"]), ("tf_device", OptionValidator, {"options": [i.value for i in list(TFDevice)]}), ("acl_timeout", Convert2intValidator, {"min_value": -1, "max_value": MAX_INT32}, ["check_value"]), ("hd_channel_size", Convert2intValidator, diff --git a/tests/mx_rec/util/test_variable.py b/tests/mx_rec/util/test_variable.py index c72ed9dc..f8cd2725 100644 --- a/tests/mx_rec/util/test_variable.py +++ b/tests/mx_rec/util/test_variable.py @@ -44,10 +44,8 @@ class VariableTest(unittest.TestCase): """ self.cm_worker_size = global_env.cm_worker_size self.cm_chief_device = global_env.cm_chief_device - self.ascend_visible_devices = global_env.ascend_visible_devices global_env.cm_worker_size = "8" global_env.cm_chief_device = "0" - global_env.ascend_visible_devices = "0-7" def tearDown(self): """ @@ -56,7 +54,6 @@ class VariableTest(unittest.TestCase): """ global_env.cm_worker_size = self.cm_worker_size global_env.cm_chief_device = self.cm_chief_device - global_env.ascend_visible_devices = self.ascend_visible_devices @mock.patch("mx_rec.util.variable.ConfigInitializer") def test_get_dense_and_sparse_variable(self, variable_config_initializer): -- Gitee From e27d5206ab32673d50720b8d62be289167a253ac Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 29 Apr 2024 14:50:10 +0800 Subject: [PATCH 073/302] add clang-format comment --- .clang-format | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/.clang-format b/.clang-format index c1bb9720..1595fa33 100644 --- a/.clang-format +++ b/.clang-format @@ -1,13 +1,39 @@ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +--- +# 详细配置说明 https://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- Language: Cpp BasedOnStyle: Google +# public等标识符不缩进 AccessModifierOffset: -4 +# 限制行宽120字符 ColumnLimit: 120 +# 4空格缩进 IndentWidth: 4 +# 不使用tab UseTab: Never +# 二元运算符换行时对齐 AlignOperands: Align +# 参数换行时对齐 AlignAfterOpenBracket: Align +# 行末注释对齐 AlignTrailingComments: true DerivePointerAlignment: false +# 引用和指针左对齐 PointerAlignment: Left AllowAllParametersOfDeclarationOnNextLine: false AllowAllArgumentsOnNextLine: false @@ -18,8 +44,10 @@ AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AllowShortLambdasOnASingleLine: Inline +# Break after return type automatically AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakBeforeMultilineStrings: false +# 允许参数部分换行 BinPackArguments: true BinPackParameters: true BreakBeforeBraces: Custom @@ -27,6 +55,7 @@ BraceWrapping: AfterClass: false AfterControlStatement: false AfterEnum: false + # 只有函数括号另起一行 AfterFunction: true AfterNamespace: false AfterStruct: false @@ -35,16 +64,22 @@ BraceWrapping: BeforeCatch: false BeforeElse: false IndentBraces: false +# 二元运算符换行时 运算符在第一行末尾 BreakBeforeBinaryOperators: None +# 三元运算符换行时 运算符在下一行 BreakBeforeTernaryOperators: true +# 构造函数初始化列表冒号在换行后 逗号在换行前 BreakConstructorInitializers: BeforeColon BreakStringLiterals: true CompactNamespaces: false +# 初始化要么一行 要么每个一行 PackConstructorInitializers: CurrentLine ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 +# 使用cpp11统一初始化风格 Cpp11BracedListStyle: true DisableFormat: false FixNamespaceComments: true +# 返回值类型声明后换行时不缩进 IndentWrappedFunctionNames: false Standard: Latest -- Gitee From 309366a9969f231f1528d87c1610e2e5a968cef2 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 29 Apr 2024 15:21:39 +0800 Subject: [PATCH 074/302] format comment --- src/core/key_process/key_process.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index c5ec9204..47f9b719 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -1019,8 +1019,8 @@ void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapBy } /* - * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread - * id的)线程间的通信量矩阵 scAll返回:所有线程间的通信量矩阵(按行平铺的一维向量) + * 将本地(rank)batch要发送的key数据量进行Allgather通信,获取所有(不同rank相同thread id的)线程间的通信量矩阵 + * scAll返回:所有线程间的通信量矩阵(按行平铺的一维向量) */ vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch) { -- Gitee From d8e72c5a5532da62f136ade434d416031f5f8028 Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Mon, 29 Apr 2024 10:50:23 +0000 Subject: [PATCH 075/302] =?UTF-8?q?!108=20Slicer=E8=A1=A5=E5=85=85?= =?UTF-8?q?=E6=B3=A8=E9=87=8A=E5=92=8C=E5=8D=95=E6=B5=8B=E3=80=82=20*=20Sl?= =?UTF-8?q?icer=E8=A1=A5=E5=85=85=E6=B3=A8=E9=87=8A=E5=92=8C=E5=8D=95?= =?UTF-8?q?=E6=B5=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/graph/__init__.py | 1 + mx_rec/graph/hooks.py | 63 ++++++ mx_rec/graph/slicers.py | 63 ++++++ tests/mx_rec/graph/test_slicers.py | 304 +++++++++++++++++++++++++++++ 4 files changed, 431 insertions(+) create mode 100644 mx_rec/graph/hooks.py create mode 100644 tests/mx_rec/graph/test_slicers.py diff --git a/mx_rec/graph/__init__.py b/mx_rec/graph/__init__.py index b91d2a49..687e78ff 100644 --- a/mx_rec/graph/__init__.py +++ 
b/mx_rec/graph/__init__.py
@@ -25,3 +25,4 @@ __all__ = [
 
 from mx_rec.graph.modifier import GraphModifierHook, modify_graph_and_start_emb_cache
 from mx_rec.graph.patch import run
+from mx_rec.graph.hooks import LookupSubgraphSlicerHook, OrphanLookupKeySlicerHook
diff --git a/mx_rec/graph/hooks.py b/mx_rec/graph/hooks.py
new file mode 100644
index 00000000..5cf64b15
--- /dev/null
+++ b/mx_rec/graph/hooks.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import List
+
+import tensorflow as tf
+from tensorflow import Graph
+
+from mx_rec.util.log import logger
+from mx_rec.graph.slicers import LookupSubgraphSlicer, OrphanLookupKeySlicer
+from mx_rec.validator.validator import ClassValidator, para_checker_decorator
+
+
+@para_checker_decorator(
+    check_option_list=[
+        ("op_types", ClassValidator, {"classes": (list,)}),
+        ("full_graph", ClassValidator, {"classes": (Graph, type(None))}),
+    ]
+)
+class LookupSubgraphSlicerHook(tf.estimator.SessionRunHook):
+    def __init__(self, op_types: List[str], full_graph: Graph = None) -> None:
+        super().__init__()
+        self._op_types = op_types
+        self._full_graph = full_graph
+
+    def begin(self) -> None:
+        slicer = LookupSubgraphSlicer(self._op_types, self._full_graph)
+
+        logger.info("Start to summarize sliceable operations in the lookup subgraph.")
+        slicer.summarize()
+
+        logger.info("Start to slice operations and their corresponding minimum dependency graphs.")
+        slicer.slice()
+
+
+@para_checker_decorator(check_option_list=[("full_graph", ClassValidator, {"classes": (Graph, type(None))})])
+class OrphanLookupKeySlicerHook(tf.estimator.SessionRunHook):
+    def __init__(self, full_graph: Graph = None) -> None:
+        super().__init__()
+        self._full_graph = full_graph
+
+    def begin(self) -> None:
+        slicer = OrphanLookupKeySlicer(self._full_graph)
+
+        logger.info("Start to summarize sliceable orphan lookup keys.")
+        slicer.summarize()
+
+        logger.info("Start to slice orphan lookup keys and their corresponding minimum dependency graphs.")
+        slicer.slice()
diff --git a/mx_rec/graph/slicers.py b/mx_rec/graph/slicers.py
index d22af868..3204af4e 100644
--- a/mx_rec/graph/slicers.py
+++ b/mx_rec/graph/slicers.py
@@ -70,6 +70,13 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta):
         pass
 
     def _slice_ops(self, sliceable_ops: Set[Operation], is_training: bool) -> None:
+        """Slice the minimum dependency graph of the given operation set.
+
+        Args:
+            sliceable_ops (Set[Operation]): The operation set that can be sliced.
+            is_training (bool): Whether the slicing targets the training graph.
+ """ + sliced_ops = self._find_min_dep_ops(sliceable_ops) in_op_to_edge_ops, out_op_to_edge_ops = self._find_subgraph_in_and_out(sliced_ops) @@ -89,6 +96,18 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): in_op_to_edge_ops: Dict[Operation, Set[Operation]], out_op_to_edge_ops: Dict[Operation, Set[Operation]], ) -> DatasetV1Adapter: + """Make a new dataset which clones the sliced subgraph by mapfunc. + + Args: + old_dataset: The old dataset that needs to be mapped. + sliced_ops: The operation set that has been sliced. + in_op_to_edge_ops: The input relationship of sliced subgraph. + out_op_to_edge_ops: The output relationship of sliced subgraph. + + Returns: + DatasetV1Adapter: The new dataset that has cloned the sliced subgraph. + """ + def slice_map_func(*batch): # pragma: no cover logger.debug("The layout of old batch: %s.", batch) @@ -116,6 +135,16 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): self, sub_graph_ops: Set[Operation], ) -> Tuple[Dict[Operation, Set[Operation]], Dict[Operation, Set[Operation]]]: + """Find the input and output relationship of sliced subgraph. + + Args: + sub_graph_ops: The operation set that has been sliced. + + Returns: + in_op_to_edge_ops: The input relationship of sliced subgraph. + out_op_to_edge_ops: The output relationship of sliced subgraph. + """ + in_op_to_edge_ops = dict() out_op_to_edge_ops = dict() @@ -128,6 +157,15 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): return in_op_to_edge_ops, out_op_to_edge_ops def _find_old_get_next(self, sliceable_ops: Set[Operation]) -> Operation: + """Find the old 'IteratorGetNext' operation. + + Args: + sliceable_ops: The operation set that can be sliced. + + Returns: + old_get_next: The old 'IteratorGetNext' operation. + """ + old_get_next = self._upward_bfs_op(sliceable_ops, AnchorIteratorOp.ITERATOR_GET_NEXT.value) tf.compat.v1.add_to_collection(DeprecatedOp.DEPRECATED_ITERATOR_GET_NEXT, old_get_next) @@ -136,6 +174,22 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): return old_get_next def _find_old_dataset(self, get_next: Operation, is_training: bool) -> DatasetV1Adapter: + """Find the old dataset that needs to be mapped. + + Due to the different iterator types, the search method is different. + 1. If the iterator type is 'MakeIterator', this func will exec upward bfs search through get_next. + 2. If the iterator type is 'OneShotIterator', this func will fetch all operation in 'self._full_graph', then + filter out the 'PrefetchDataset' operation. This diff is caused by the isolation of 'OneShotIterator' and the + 'PrefetchDataset'. + + Args: + get_next: The old 'IteratorGetNext' operation. + is_training: Whether the slicing is for training graph or not. + + Returns: + old_dataset: The old dataset that needs to be mapped. + """ + tgt_trans_dataset = None try: tgt_trans_dataset = self._find_trans_dataset(get_next) @@ -173,6 +227,15 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): return old_dataset def _find_trans_dataset(self, get_next: Operation) -> Operation: + """Find the transformation dataset through 'get_next'. + + Args: + get_next: The old 'IteratorGetNext' operation. + + Returns: + trans_dataset: The target transformation dataset. 
+ """ + if get_next.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: raise TypeError(f"operation '{get_next}' must be one instance of 'IteratorGetNext'.") diff --git a/tests/mx_rec/graph/test_slicers.py b/tests/mx_rec/graph/test_slicers.py new file mode 100644 index 00000000..b6d9cad9 --- /dev/null +++ b/tests/mx_rec/graph/test_slicers.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +# coding: UTF-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import unittest +from unittest.mock import patch, Mock + +import tensorflow as tf +from tensorflow import Graph + +from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_ENTRANCE +from mx_rec.graph.constants import AnchorDatasetOp +from mx_rec.graph.slicers import NoGradSubgraphSlicer, LookupSubgraphSlicer, OrphanLookupKeySlicer +from tests.mx_rec.graph.mock_dataset import gen_mock_dataset + + +class MockNoGradSubgraphSlicer(NoGradSubgraphSlicer): + def __init__(self, full_graph: Graph = None, info_dir: str = "slicing") -> None: + super().__init__(full_graph, info_dir) + + def summarize(self) -> None: + pass + + def slice(self) -> None: + pass + + +class NoGradSubgraphSlicerTestCase(unittest.TestCase): + def test_ok_slice_ops(self): + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + prefetch_dataset = dataset.prefetch(0) + + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + + mock_ids = batch["mock_ids"] + mock_labels = batch["mock_labels"] + + inner_tensor = tf.identity(mock_ids) + inner_op = inner_tensor.op + + tf.identity(inner_tensor) + tf.identity(mock_labels) + + sliced_ops = {inner_op} + MockNoGradSubgraphSlicer()._slice_ops(sliced_ops, is_training=True) + + g = tf.compat.v1.get_default_graph() + prefetch_datasets = [op for op in g.get_operations() if AnchorDatasetOp.PREFETCH_DATASET.value in op.name] + self.assertEqual(len(prefetch_datasets), 2) + + def test_ok_find_min_dep_ops(self): + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + iterator = dataset.make_initializable_iterator() + batch = iterator.get_next() + ids = batch["mock_ids"] + + subgraph_in = tf.identity(ids) + subgraph_out = tf.identity(subgraph_in) + base_ops = {subgraph_out.op} + + min_dep_ops = NoGradSubgraphSlicer._find_min_dep_ops(base_ops) + self.assertEqual(min_dep_ops, {subgraph_in.op, subgraph_out.op}) + + def test_ok_validate_op(self): + with tf.compat.v1.Graph().as_default(): + t = tf.constant(0) + t = tf.add(t, 1) + t = tf.subtract(t, 1) + op = t.op + + is_valid = NoGradSubgraphSlicer._validate_op(op) + self.assertTrue(is_valid, True) + + def test_ok_find_subgraph_in_and_out(self): + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + iterator = dataset.make_initializable_iterator() + batch = iterator.get_next() + ids = batch.get("mock_ids") + + input_tensor = tf.identity(ids) + 
inner_tensor = tf.identity(input_tensor) + output_tensor = tf.identity(inner_tensor) + subgraph_ops = {inner_tensor.op} + + (subgraph_in, subgraph_out) = MockNoGradSubgraphSlicer()._find_subgraph_in_and_out(subgraph_ops) + self.assertEqual(subgraph_in, {input_tensor.op: {inner_tensor.op}}) + self.assertEqual(subgraph_out, {output_tensor.op: {inner_tensor.op}}) + + def test_ok_find_old_dataset(self): + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + iterator = tf.compat.v1.data.make_initializable_iterator(dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + get_next = ids.op + + old_dataset = MockNoGradSubgraphSlicer()._find_old_dataset(get_next, is_training=True) + self.assertEqual(old_dataset, dataset) + + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + prefetch_dataset = dataset.prefetch(0) + iterator = tf.compat.v1.data.make_one_shot_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + get_next = ids.op + + old_dataset = MockNoGradSubgraphSlicer()._find_old_dataset(get_next, is_training=True) + self.assertEqual(old_dataset, dataset) + + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + prefetch_dataset = dataset.prefetch(0) + gen_mock_dataset().prefetch(0) + + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + get_next = ids.op + + old_dataset = MockNoGradSubgraphSlicer()._find_old_dataset(get_next, is_training=True) + self.assertEqual(old_dataset, dataset) + + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + prefetch_dataset = dataset.prefetch(0) + gen_mock_dataset().prefetch(0) + + iterator = tf.compat.v1.data.make_one_shot_iterator(dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + get_next = ids.op + + old_dataset = MockNoGradSubgraphSlicer()._find_old_dataset(get_next, is_training=False) + self.assertEqual(old_dataset, dataset) + + def test_ok_make_new_dataset(self): + with tf.compat.v1.Graph().as_default(): + dataset = gen_mock_dataset() + prefetch_dataset = dataset.prefetch(0) + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + in_op = ids.op + inner_tensor = tf.identity(ids) + inner_op = inner_tensor.op + out_op = tf.identity(inner_tensor).op + + sliced_ops = {inner_op} + in_op_to_edge_ops = {in_op: {inner_op}} + out_op_to_edge_ops = {out_op: {inner_op}} + + new_dataset = MockNoGradSubgraphSlicer()._make_new_dataset( + dataset, sliced_ops, in_op_to_edge_ops, out_op_to_edge_ops + ) + new_prefetch_dataset = new_dataset + new_iter = tf.compat.v1.data.make_initializable_iterator(new_prefetch_dataset) + new_batch = new_iter.get_next() + self.assertEqual(len(new_batch), 4) + + def test_ok_topo_sort_sliced_ops(self): + with tf.compat.v1.Graph().as_default(): + t1 = tf.constant(0) + t2 = tf.identity(t1) + t3 = tf.identity(t2) + ops = {t3.op, t2.op, t1.op} + + topo_sorted_ops = NoGradSubgraphSlicer._topo_sort_sliced_ops(ops) + self.assertEqual(topo_sorted_ops, [t1.op, t2.op, t3.op]) + + def test_ok_clone_subgraph_into_funcgraph(self): + with tf.compat.v1.Graph().as_default(): + prefetch_dataset = gen_mock_dataset().prefetch(0) + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + in_op = ids.op + inner_tensor = tf.identity(ids) + inner_op = inner_tensor.op + out_op = tf.identity(inner_tensor).op + + 
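+            # Boundary maps for the single sliced op: 'in_op' feeds 'inner_op',
+            # whose result is consumed by 'out_op'; cloning into the funcgraph
+            # must preserve both edges.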
sliced_ops = {inner_op} + in_op_to_edge_ops = {in_op: {inner_op}} + out_op_to_edge_ops = {out_op: {inner_op}} + + with patch.object(tf.compat.v1.Graph, "get_tensor_by_name", return_value=tf.identity(inner_tensor)): + new_batch = MockNoGradSubgraphSlicer()._clone_subgraph_into_funcgraph( + sliced_ops, in_op_to_edge_ops, out_op_to_edge_ops, batch + ) + self.assertEqual(len(new_batch), 4) + + def test_ok_make_new_get_next(self): + with tf.compat.v1.Graph().as_default(): + prefetch_dataset = gen_mock_dataset().prefetch(0) + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + old_get_next = ids.op + new_dataset = gen_mock_dataset().prefetch(0) + + new_get_next = MockNoGradSubgraphSlicer()._make_new_get_next(old_get_next, new_dataset) + self.assertIsNotNone(new_get_next) + + with tf.compat.v1.Graph().as_default(): + prefetch_dataset = gen_mock_dataset().prefetch(0) + iterator = tf.compat.v1.data.make_one_shot_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + old_get_next = ids.op + new_dataset = gen_mock_dataset().prefetch(0) + + new_get_next = MockNoGradSubgraphSlicer()._make_new_get_next(old_get_next, new_dataset) + self.assertIsNotNone(new_get_next) + + +class LookupSubGraphSlicerTestCase(unittest.TestCase): + def test_ok_find_all_tgt_ops(self): + with tf.compat.v1.Graph().as_default(): + prefetch_dataset = gen_mock_dataset().prefetch(0) + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + inner_tensor = tf.identity(ids) + tf.identity(inner_tensor) + + all_tgt_ops = LookupSubgraphSlicer(op_types=["Identity"])._find_all_tgt_ops() + self.assertEqual(len(all_tgt_ops), 2) + + @patch.multiple( + "mx_rec.core.emb.base_sparse_embedding.BaseSparseEmbedding", get_anchor_attribute=Mock(return_value=True) + ) + def test_ok_find_sliceable_tgt_ops(self): + with tf.compat.v1.Graph().as_default(): + prefetch_dataset = gen_mock_dataset().prefetch(0) + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + inner_tensor = tf.identity(ids) + lookup_key = tf.identity(inner_tensor) + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, lookup_key) + + all_tgt_ops = LookupSubgraphSlicer(op_types=["Identity"])._find_sliceable_tgt_ops() + self.assertEqual(len(all_tgt_ops), 2) + + +class OrphanLookupKeySlicerTestCase(unittest.TestCase): + @patch.multiple("mx_rec.graph.slicers.utils", export_pb_graph=Mock(return_value=None)) + def test_ok_slice_ops(self): + with tf.compat.v1.Graph().as_default(): + prefetch_dataset = gen_mock_dataset().prefetch(0) + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + inner_tensor = tf.constant(0, dtype=ids.dtype, shape=ids.shape) + lookup_key = tf.identity(inner_tensor) + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, lookup_key) + + sliceable_ops = {inner_tensor.op} + OrphanLookupKeySlicer()._slice_ops(sliceable_ops, is_training=False) + + g = tf.compat.v1.get_default_graph() + prefetch_datasets = [op for op in g.get_operations() if AnchorDatasetOp.PREFETCH_DATASET.value in op.name] + self.assertEqual(len(prefetch_datasets), 2) + + @patch.multiple( + "mx_rec.core.emb.base_sparse_embedding.BaseSparseEmbedding", get_anchor_attribute=Mock(return_value=True) + ) + def 
test_ok_find_sliceable_tgt_ops(self): + with tf.compat.v1.Graph().as_default(): + prefetch_dataset = gen_mock_dataset().prefetch(0) + iterator = tf.compat.v1.data.make_initializable_iterator(prefetch_dataset) + batch = iterator.get_next() + ids = batch["mock_ids"] + + inner_tensor = tf.constant(0, dtype=ids.dtype, shape=ids.shape) + lookup_key = tf.identity(inner_tensor) + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, lookup_key) + + all_tgt_ops = OrphanLookupKeySlicer()._find_sliceable_tgt_ops() + self.assertEqual(len(all_tgt_ops), 2) -- Gitee From 07a159c2dbf87f9348f7871f7130aba53f069476 Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Mon, 29 Apr 2024 19:30:02 +0800 Subject: [PATCH 076/302] cleancode --- .../op_host/embedding_update_by_address.cpp | 13 +++++++------ .../op_kernel/embedding_update_by_address.cpp | 10 +++++++--- examples/demo/little_demo_estimator/nn_optim.py | 2 +- src/AccCTR/src/unique/unique_func.cpp | 5 +++-- src/AccCTR/src/unique/unique_func.h | 2 +- src/core/emb_table/embedding_ddr.cpp | 11 ----------- 6 files changed, 19 insertions(+), 24 deletions(-) diff --git a/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp index d0e4b778..5f823889 100644 --- a/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp @@ -73,7 +73,7 @@ namespace optiling return ge::GRAPH_FAILED; } - int64_t inputShape = static_cast(inputTensor->GetShapeSize()); + int32_t inputShape = static_cast(inputTensor->GetShapeSize()); if (CheckPositiveInt(inputShape, "inputShape") != ge::GRAPH_SUCCESS) { return ge::GRAPH_FAILED; } @@ -84,7 +84,7 @@ namespace optiling } const int32_t inputShapeTmp = (inputShape > 0) ? 
inputShape : 1; - int64_t inputDim = static_cast(inputTensor1->GetShapeSize() / inputShapeTmp); + int32_t inputDim = static_cast(inputTensor1->GetShapeSize()) / inputShapeTmp; if (CheckPositiveInt(inputDim, "inputDim") != ge::GRAPH_SUCCESS) { return ge::GRAPH_FAILED; } @@ -122,8 +122,9 @@ namespace optiling int32_t occupyAddressBytesNum = sizeof(int64_t) + typeSize * inputDimAligned * PING_PONG_NUM * 2; // 一轮计算中最多计算多少个addr,由于地址也要搬到ub,所以需要对齐32 - int32_t addrPerLoop = static_cast((UB_LIMIT / - occupyAddressBytesNum) & (~3U)); // & (~3U),保证地址数是4的倍数 + int32_t addrPerLoop = static_cast( + UB_LIMIT / static_cast(occupyAddressBytesNum) & (~3U)); // & (~3U),保证地址数是4的倍数 + if (CheckPositiveInt(addrPerLoop, "addrPerLoop") != ge::GRAPH_SUCCESS) { return ge::GRAPH_FAILED; } @@ -132,8 +133,8 @@ namespace optiling tiling.set_update_type(updateType); tiling.set_embedding_type(embeddingType); - tiling.set_update_dim(inputDim); - tiling.set_addr_nums(inputShape); + tiling.set_update_dim(static_cast(inputDim)); + tiling.set_addr_nums(static_cast(inputShape)); tiling.set_addr_per_loop(addrPerLoop); tiling.set_type_size(typeSize); tiling.set_input_dim_aligned(inputDimAligned); diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp index 4a13c3eb..d129947b 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp @@ -20,6 +20,9 @@ constexpr int32_t SIZE_OF_HALF = 2; constexpr int32_t SIZE_OF_FLOAT_OR_INT = 4; template + +namespace AscendKernel { + class KernelEimtable_update { public: @@ -185,6 +188,7 @@ private: GlobalTensor srcDataBufferGm, dstDataGm, outDataGm; GlobalTensor srcAddrGlobal; }; +} extern "C" __global__ __aicore__ void embedding_update_by_address(GM_ADDR address, GM_ADDR embedding, GM_ADDR y, GM_ADDR usrWorkspace, GM_ADDR tiling) @@ -197,7 +201,7 @@ extern "C" __global__ __aicore__ void embedding_update_by_address(GM_ADDR addres { case 0: { - KernelEimtable_update op; + AscendKernel::KernelEimtable_update op; op.Init_param(tiling); op.Init(address, embedding, y); op.Process(); @@ -205,7 +209,7 @@ extern "C" __global__ __aicore__ void embedding_update_by_address(GM_ADDR addres break; case 2: { - KernelEimtable_update op; + AscendKernel::KernelEimtable_update op; op.Init_param(tiling); op.Init(address, embedding, y); op.Process(); @@ -213,7 +217,7 @@ extern "C" __global__ __aicore__ void embedding_update_by_address(GM_ADDR addres break; default: { - KernelEimtable_update op; + AscendKernel::KernelEimtable_update op; op.Init_param(tiling); op.Init(address, embedding, y); op.Process(); diff --git a/examples/demo/little_demo_estimator/nn_optim.py b/examples/demo/little_demo_estimator/nn_optim.py index 415c5ff2..4d519366 100644 --- a/examples/demo/little_demo_estimator/nn_optim.py +++ b/examples/demo/little_demo_estimator/nn_optim.py @@ -55,7 +55,7 @@ def get_train_op_list(losses, learning_rate): dense_variables, sparse_variables = get_dense_and_sparse_variable() trainable_variables = [dense_variables] - for i, (name, loss) in enumerate(losses): + for _, (name, loss) in enumerate(losses): with tf.control_dependencies(update_ops): # do dense grad grads = dense_optimizer.compute_gradients(loss, var_list=trainable_variables) diff --git a/src/AccCTR/src/unique/unique_func.cpp b/src/AccCTR/src/unique/unique_func.cpp index 717d8890..d208eac9 100644 --- a/src/AccCTR/src/unique/unique_func.cpp +++ 
b/src/AccCTR/src/unique/unique_func.cpp @@ -119,8 +119,9 @@ void Dedup::NewParameter() // Time to check the proper size of sharded tables for performance // sake. uint64_t shardedTableSize = 0; - if (std::numeric_limits::max() / static_cast(n) / static_cast(groupCount_) - < newBucketCountPowerOf2) { + if (std::numeric_limits::max() / static_cast(n) / + static_cast(groupCount_) < + newBucketCountPowerOf2) { shardedTableSize = static_cast(std::numeric_limits::max()); } else { shardedTableSize = newBucketCountPowerOf2 * n * static_cast(groupCount_); diff --git a/src/AccCTR/src/unique/unique_func.h b/src/AccCTR/src/unique/unique_func.h index 46718bde..07c8ebb7 100644 --- a/src/AccCTR/src/unique/unique_func.h +++ b/src/AccCTR/src/unique/unique_func.h @@ -526,7 +526,7 @@ private: uint32_t *finishPtr = beginPtr + uniqueIn.inputIdCnt; uint32_t *partBeginPtr = beginPtr; auto alignedAddress = CacheLineAlign(reinterpret_cast(partBeginPtr + partSize)); - auto *partEndPtr = reinterpret_cast(alignedAddress); + auto *partEndPtr = reinterpret_cast(static_cast(alignedAddress)); std::vector> tasks; auto val = TypeTrans(uniqueIn.inputId); while (partBeginPtr < finishPtr) { diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 02d7c116..8f529646 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -24,17 +24,6 @@ See the License for the specific language governing permissions and using namespace MxRec; -constexpr int ELEMENT_NUM = 4; -constexpr int CURRENT_UPDATE_IDX = 0; -constexpr int HOST_VOCAB_SIZE_IDX = 1; -constexpr int DEV_VOCAB_SIZE_IDX = 2; -constexpr int MAX_OFFSET_IDX = 3; - -constexpr int EMB_INFO_ELEMENT_NUM = 3; -constexpr int EMB_INFO_EXT_SIZE_IDX = 0; -constexpr int EMB_INFO_DEV_VOCAB_SIZE_IDX = 1; -constexpr int EMB_INFO_HOST_VOCAB_SIZE_IDX = 2; - EmbeddingDDR::EmbeddingDDR() { } -- Gitee From 19966c0c38372304f373c097e20bf76413084b8e Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Mon, 29 Apr 2024 20:07:24 +0800 Subject: [PATCH 077/302] cleancode --- .../op_kernel/embedding_update_by_address.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp index d129947b..828d7fbe 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp @@ -16,13 +16,11 @@ See the License for the specific language governing permissions and #include "kernel_operator.h" using namespace AscendC; +namespace KernelOps { constexpr int32_t SIZE_OF_HALF = 2; constexpr int32_t SIZE_OF_FLOAT_OR_INT = 4; template - -namespace AscendKernel { - class KernelEimtable_update { public: @@ -201,7 +199,7 @@ extern "C" __global__ __aicore__ void embedding_update_by_address(GM_ADDR addres { case 0: { - AscendKernel::KernelEimtable_update op; + KernelOps::KernelEimtable_update op; op.Init_param(tiling); op.Init(address, embedding, y); op.Process(); @@ -209,7 +207,7 @@ extern "C" __global__ __aicore__ void embedding_update_by_address(GM_ADDR addres break; case 2: { - AscendKernel::KernelEimtable_update op; + KernelOps::KernelEimtable_update op; op.Init_param(tiling); op.Init(address, embedding, y); op.Process(); @@ -217,7 +215,7 @@ extern "C" __global__ __aicore__ void embedding_update_by_address(GM_ADDR addres break; default: { - AscendKernel::KernelEimtable_update op; + 
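+                // Fallback: any other type code follows the same
+                // Init_param -> Init -> Process sequence as the typed cases above.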
KernelOps::KernelEimtable_update op; op.Init_param(tiling); op.Init(address, embedding, y); op.Process(); -- Gitee From b6ff564e63a54c47571eeda75b3a950a4259e5f2 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 29 Apr 2024 20:25:02 +0800 Subject: [PATCH 078/302] =?UTF-8?q?LazyAdam=E8=9E=8D=E5=90=88=E7=AE=97?= =?UTF-8?q?=E5=AD=90-aclnn=E9=83=A8=E5=88=86=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/inc/common.h | 51 ++++ .../aclnn_lazy_adam_test/inc/op_runner.h | 188 +++++++++++++++ .../aclnn_lazy_adam_test/inc/operator_desc.h | 66 +++++ .../aclnn_lazy_adam_test/input/.keep | 0 .../aclnn_lazy_adam_test/output/.keep | 0 .../aclnn_lazy_adam_test/run.sh | 106 ++++++++ .../aclnn_lazy_adam_test/scripts/gen_data.py | 145 +++++++++++ .../scripts/verify_result.py | 50 ++++ .../aclnn_lazy_adam_test/src/CMakeLists.txt | 67 ++++++ .../aclnn_lazy_adam_test/src/common.cpp | 85 +++++++ .../aclnn_lazy_adam_test/src/main.cpp | 226 ++++++++++++++++++ 11 files changed, 984 insertions(+) create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/input/.keep create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/output/.keep create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h new file mode 100644 index 00000000..ba754761 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h @@ -0,0 +1,51 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h new file mode 100644 index 00000000..ed432a1e --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h @@ -0,0 +1,188 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by 
index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] numElementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t numElementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] numElementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t numElementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h new file mode 100644 index 00000000..0c76260b --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h @@ -0,0 +1,66 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ + +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; + double beta1; + double beta2; + double epsilon; +}; + +#endif // OPERATOR_DESC_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/input/.keep b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/input/.keep new file mode 100644 index 00000000..e69de29b diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/output/.keep b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/output/.keep new file mode 100644 index 00000000..e69de29b diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh new file mode 100644 index 00000000..3d4af97c --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=0 + +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +# 导出环境变量 +SHORT=v:, +LONG=dtype:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + # float16, float, int32 + (-v | --dtype) + DTYPE="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +if [ ! $ASCEND_HOME_DIR ]; then + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + else + export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $ASCEND_HOME_DIR/bin/setenv.bash + +export DDK_PATH=$ASCEND_HOME_DIR +arch=$(uname -m) +export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64 + +function main { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm ./input/*.bin + rm ./output/*.bin + + # 2. 生成输入数据和真值数据 + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? 
-ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + # 3. 编译acl可执行文件 + cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build + cmake ../src + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + cd $CURRENT_DIR/output + echo "INFO: execute op!" + ./execute_op + + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 比较真值文件 + cd $CURRENT_DIR + python3 scripts/verify_result.py +} + +main diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py new file mode 100644 index 00000000..6e07f836 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import copy +import os +import numpy as np + +# 获取项目路径 +_CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_PATH = os.path.dirname(_CURRENT_PATH) +_INPUT_PATH = os.path.join(_PROJECT_PATH, "./input") +_OUTPUT_PATH = os.path.join(_PROJECT_PATH, "./output") + +_DIM_0 = 2000000 +_DIM_1 = 564096 +_DIM_2 = 32 + + +def _gather(input_data, indices): + out = np.zeros((len(indices), input_data.shape[1])) + for i, index_ in enumerate(indices): + # 跳过index小于0的数据 + if index_[0] < 0: + continue + out[i] = input_data[index_[0]] + return out + + +def _scatter_nd_update(momentum, indices, update_value): + out = copy.deepcopy(momentum) + for i, index_ in enumerate(indices): + if index_[0] < 0: + continue + else: + out[index_[0]] = update_value[i] + return out + + +def _scatter_nd_add(momentum, indices, update_value): + out = copy.deepcopy(momentum) + for i, index_ in enumerate(indices): + if index_[0] < 0: + continue + else: + out[indices[i][0]] = out[index_[0]] + update_value[i] + return out + + +def _gen_input_data(): + range_start = 1 + range_end = 2 + + dtype_chose = np.float32 + shape0 = (_DIM_0, _DIM_2) + indices_shape = (_DIM_1, 1) + grad_shape = (_DIM_1, _DIM_2) + + input_var = np.random.uniform(range_start, range_end, size=shape0).astype(dtype_chose) # shape [2000000,32] + input_m = np.random.uniform(range_start, range_end, size=shape0).astype(dtype_chose) # shape [2000000,32] + input_v = np.random.uniform(range_start, range_end, size=shape0).astype(dtype_chose) # shape [2000000,32] + + # indices shape [564096,1] + indices = np.random.permutation(np.arange(_DIM_0)).astype(np.int32)[:indices_shape[0]].reshape(-1, 1) + # gradient shape [564096,32] + gradient = np.random.uniform(range_start, range_end, 
size=grad_shape).astype(dtype_chose) + + if not os.path.exists(_INPUT_PATH): + os.makedirs(_INPUT_PATH) + indices.tofile(os.path.join(_INPUT_PATH, "indices.bin")) + gradient.tofile(os.path.join(_INPUT_PATH, "gradient.bin")) + input_m.tofile(os.path.join(_INPUT_PATH, "inputM.bin")) + input_v.tofile(os.path.join(_INPUT_PATH, "inputV.bin")) + input_var.tofile(os.path.join(_INPUT_PATH, "inputVar.bin")) + + +def _gen_golden_data(): + beta1 = 0.9 + beta2 = 0.999 + lr = 0.001 + epsilon = 1e-7 + + lr = np.array(lr).astype(np.float32) + beta1 = np.array(beta1).astype(np.float32) + beta2 = np.array(beta2).astype(np.float32) + epsilon = np.array(epsilon).astype(np.float32) + + lr.tofile(os.path.join(_INPUT_PATH, "learningRate.bin")) + + indices = np.fromfile(os.path.join(_INPUT_PATH, "indices.bin"), dtype=np.int32).reshape( + (_DIM_1, 1)) # shape (564096,1) + gradient = np.fromfile(os.path.join(_INPUT_PATH, "gradient.bin"), dtype=np.float32).reshape( + (_DIM_1, _DIM_2)) # shape (564096,32) + input_m = np.fromfile(os.path.join(_INPUT_PATH, "inputM.bin"), dtype=np.float32).reshape( + (_DIM_0, _DIM_2)) # shape (2000000,32) + input_v = np.fromfile(os.path.join(_INPUT_PATH, "inputV.bin"), dtype=np.float32).reshape( + (_DIM_0, _DIM_2)) # shape (2000000,32) + input_var = np.fromfile(os.path.join(_INPUT_PATH, "inputVar.bin"), dtype=np.float32).reshape( + (_DIM_0, _DIM_2)) # shape (2000000,32) + + old_m_slice = _gather(input_m, indices) # shape(564096,32) + old_m_slice = np.array(old_m_slice).astype(np.float32) # + update_m = beta1 * old_m_slice + (1 - beta1) * gradient + out_m = _scatter_nd_update(input_m, indices, update_m) + + old_v_slice = _gather(input_v, indices) + old_v_slice = np.array(old_v_slice).astype(np.float32) + update_v = beta2 * old_v_slice + (1 - beta2) * np.square(gradient) + out_v = _scatter_nd_update(input_v, indices, update_v) + + denominator_slice = np.sqrt(update_v) + epsilon + update_var = np.divide(-lr * update_m, denominator_slice) + out_var = _scatter_nd_add(input_var, indices, update_var) + + return out_m, out_v, out_var + + +def _gen_input_and_golden_data(): + # 产生输入数据 + _gen_input_data() + + # 产生真值数据 + out_m, out_v, out_var = _gen_golden_data() + if not os.path.exists(_OUTPUT_PATH): + os.makedirs(_OUTPUT_PATH) + out_m.tofile(os.path.join(_OUTPUT_PATH, "goldenOutputM.bin")) + out_v.tofile(os.path.join(_OUTPUT_PATH, "goldenOutputV.bin")) + out_var.tofile(os.path.join(_OUTPUT_PATH, "goldenOutputVar.bin")) + + +if __name__ == "__main__": + _gen_input_and_golden_data() diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py new file mode 100644 index 00000000..1cc516db --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import logging +import numpy as np + +_LOSS_THRESHOLD = 1e-6 # 容忍偏差,一般fp16要求绝对误差和相对误差均不超过万分之一 +_MINIMUM = 10e-10 + +logging.getLogger().setLevel(logging.INFO) + + +def verify_result(real_result, golden): + real_result = np.fromfile(real_result, dtype=np.float32) # 从bin文件读取实际运算结果 + golden = np.fromfile(golden, dtype=np.float32) # 从bin文件读取预期运算结果 + result = np.abs(real_result - golden) # 计算运算结果和预期结果偏差 + deno = np.maximum(np.abs(real_result), np.abs(golden)) # 获取最大值并组成新数组 + result_atol = np.less_equal(result, _LOSS_THRESHOLD) # 计算绝对误差 + result_rtol = np.less_equal(result / np.add(deno, _MINIMUM), _LOSS_THRESHOLD) # 计算相对误差 + if not result_rtol.all() and not result_atol.all(): + # 误差超出预期时返回打印错误,返回对比失败 + if np.sum(result_rtol == False) > real_result.size * _LOSS_THRESHOLD \ + and np.sum(result_atol == False) > real_result.size * _LOSS_THRESHOLD: + logging.error("[ERROR] output verify result error.") + return False + logging.info("output verify pass.") + return True + + +if __name__ == '__main__': + logging.info("start verify outputM.") + verify_result("output/outputM.bin", "output/goldenOutputM.bin") + logging.info("start verify outputV.") + verify_result("output/outputV.bin", "output/goldenOutputV.bin") + logging.info("start verify outputVar.") + verify_result("output/outputVar.bin", "output/goldenOutputVar.bin") diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt new file mode 100644 index 00000000..1642e3ca --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt @@ -0,0 +1,67 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_lazy_adam) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize_lazy_adam/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +set(AUTO_GEN_PATH "../../lazy_adam/build_out/autogen") +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../inc + ${CUST_PKG_PATH}/include + ${AUTO_GEN_PATH} +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_op + main.cpp + common.cpp +) + +target_link_libraries(execute_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp new file mode 100644 index 00000000..1c295bfc --- /dev/null +++ 
b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp @@ -0,0 +1,85 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include + +#include "common.h" + +extern bool g_isDevice; + +bool ReadFile(const std::string& filePath, size_t fileSize, void* buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf* buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string& filePath, const void* buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp new file mode 100644 index 00000000..f32efcaa --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp @@ -0,0 +1,226 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "aclnn_lazy_adam.h" +#include "common.h" +#include "op_runner.h" + +bool g_isDevice = false; +int g_deviceId = 0; +constexpr int DIM0 = 2000000; // inputM inputV inputVar 的行数 +constexpr int DIM1 = 564096; // indices长度 +constexpr int DIM2 = 32; // inputM inputV inputVar gradient等每行的数据个数 +constexpr int INPUT_M_INDEX = 2; +constexpr int INPUT_V_INDEX = 3; +constexpr int INPUT_VAR_INDEX = 4; +constexpr int LEARNING_RATE_INDEX = 5; +constexpr int OUTPUT_M_INDEX = 0; +constexpr int OUTPUT_V_INDEX = 1; +constexpr int OUTPUT_VAR_INDEX = 2; +constexpr float LEARNING_RATE = 0.001; +constexpr float BETA1 = 0.9; +constexpr float BETA2 = 0.999; +constexpr float EPSILON = 1e-7; +const char* READ_ERROR_INFO = "read input file error, please check whether file exist and access rights is correct"; +const char* WRITE_ERROR_INFO = "write output file error, please check access rights is correct"; + +OperatorDesc CreateOpDesc() +{ + std::vector indicesShape{DIM1, 1}; + std::vector gradientShape{DIM1, DIM2}; + std::vector inputMShape{DIM0, DIM2}; // inputM inputV inputVar 的shape相同 + std::vector learningRateShape{1}; + aclDataType dataType = ACL_FLOAT; + aclDataType indexDataType = ACL_INT32; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(dataType, gradientShape.size(), gradientShape.data(), format); + opDesc.AddInputTensorDesc(indexDataType, indicesShape.size(), indicesShape.data(), format); + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputM + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputV + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputVar + opDesc.AddInputTensorDesc(dataType, learningRateShape.size(), learningRateShape.data(), format); // learningRate + opDesc.beta1 = BETA1; + opDesc.beta2 = BETA2; + opDesc.epsilon = EPSILON; + return opDesc; +} + +bool SetInputData(OpRunner& runner) +{ + size_t fileSize = 0; + if (!ReadFile("../input/gradient.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/indices.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputM.bin", fileSize, runner.GetInputBuffer(INPUT_M_INDEX), + runner.GetInputSize(INPUT_M_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputV.bin", fileSize, runner.GetInputBuffer(INPUT_V_INDEX), + runner.GetInputSize(INPUT_V_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputVar.bin", fileSize, runner.GetInputBuffer(INPUT_VAR_INDEX), + runner.GetInputSize(INPUT_VAR_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/learningRate.bin", fileSize, runner.GetInputBuffer(LEARNING_RATE_INDEX), + runner.GetInputSize(LEARNING_RATE_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner& runner) +{ + // 保存输出数据 由于输出仅有hostOutputs_数据,未设置outputDesc,因此数据size从inputTensor获取 + if (!WriteFile("../output/outputM.bin", runner.GetOutputBuffer(OUTPUT_M_INDEX), + runner.GetInputSize(INPUT_M_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + 
} + if (!WriteFile("../output/outputV.bin", runner.GetOutputBuffer(OUTPUT_V_INDEX), + runner.GetInputSize(INPUT_V_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + if (!WriteFile("../output/outputVar.bin", runner.GetOutputBuffer(OUTPUT_VAR_INDEX), + runner.GetInputSize(INPUT_VAR_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + INFO_LOG("Write output success"); + return true; +} + +void DestroyResource() +{ + bool flag = false; + if (aclrtResetDevice(g_deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", g_deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destroy resource failed"); + } else { + INFO_LOG("Destroy resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(g_deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. g_deviceId is %d", g_deviceId); + (void) aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", g_deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestroyResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char** argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestroyResource(); + return FAILED; + } + + DestroyResource(); + + return SUCCESS; +} -- Gitee From f27dd548deda4eb170eee12d862e54b516ff54df Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Tue, 30 Apr 2024 02:59:28 +0000 Subject: [PATCH 079/302] =?UTF-8?q?!112=20=E9=80=82=E9=85=8D=E5=88=87?= =?UTF-8?q?=E5=9B=BE=E5=8A=9F=E8=83=BD=E7=9A=84LittleDemo?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=94=A8=E4=BE=8B=20*=20=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E5=88=87=E5=9B=BE=E5=8A=9F=E8=83=BD=E7=9A=84LittleDemo=E6=A8=A1=E5=9E=8B=E7=94=A8=E4=BE=8B=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo_estimator/main.py | 22 ++++++++++--------- .../little_demo_estimator/nn_model_build.py | 2 +- .../little_demo_estimator/nn_model_input.py | 2 +- examples/demo/little_demo_estimator/run.sh | 3 +-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git 
a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py index de0b6c86..cca5a7a5 100644 --- a/examples/demo/little_demo_estimator/main.py +++ b/examples/demo/little_demo_estimator/main.py @@ -19,16 +19,14 @@ import argparse import os import tensorflow as tf - from mx_rec.util.initialize import init, terminate_config_initializer -from mx_rec.util.communication.hccl_ops import get_rank_id from mx_rec.core.asc.helper import FeatureSpec from mx_rec.graph.modifier import GraphModifierHook -from mx_rec.graph.acg_push_ops import ACGPushOpsToDatasetHook +from mx_rec.graph.hooks import OrphanLookupKeySlicerHook, LookupSubgraphSlicerHook from mx_rec.core.feature_process import EvictHook from mx_rec.util.log import logger -from tf_adapter import NPURunConfig, NPUEstimator, npu_hooks_append, DumpConfig +from tf_adapter import NPURunConfig, NPUEstimator, npu_hooks_append from nn_reader import input_fn from nn_model_input import get_model_fn from config import Config @@ -58,10 +56,12 @@ def main(params, config): # access_threshold unit counts; eviction_threshold unit seconds access_and_evict = None - if not params.enable_push_ops_test: + if not params.enable_slicer_test: hooks_list = [GraphModifierHook(modify_graph=params.modify_graph)] else: - hooks_list = [ACGPushOpsToDatasetHook(dump_graph=True), GraphModifierHook(modify_graph=params.modify_graph)] + orphan_slicer_hook = OrphanLookupKeySlicerHook() + lookup_slicer_hook = LookupSubgraphSlicerHook(op_types=["StringToNumber"]) + hooks_list = [orphan_slicer_hook, lookup_slicer_hook, GraphModifierHook(modify_graph=params.modify_graph)] if params.use_timestamp: config_for_user_table = dict(access_threshold=config.access_threshold, @@ -89,12 +89,14 @@ def main(params, config): train_spec = tf.estimator.TrainSpec(input_fn=lambda: input_fn(params, create_fs_params, config, use_one_shot=args.use_one_shot), max_steps=params.max_steps, hooks=npu_hooks_append(hooks_list)) - # 在开启evict时,eval时不支持淘汰,所以无需加入evict hook - if not params.enable_push_ops_test: + if not params.enable_slicer_test: + # 在开启evict时,eval时不支持淘汰,所以无需加入evict hook eval_hook_list = [GraphModifierHook(modify_graph=params.modify_graph)] else: - eval_hook_list = [ACGPushOpsToDatasetHook(dump_graph=True), + orphan_slicer_hook = OrphanLookupKeySlicerHook() + lookup_slicer_hook = LookupSubgraphSlicerHook(op_types=["StringToNumber"]) + eval_hook_list = [orphan_slicer_hook, lookup_slicer_hook, GraphModifierHook(modify_graph=params.modify_graph)] eval_spec = tf.estimator.EvalSpec(input_fn=lambda: input_fn(params, create_fs_params, config, is_eval=True, @@ -165,7 +167,7 @@ if __name__ == '__main__': MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0))) USE_TIMESTAMP = bool(int(os.getenv("USE_TIMESTAMP", 0))) args.use_one_shot = bool(int(os.getenv("USE_ONE_SHOT", 0))) - args.enable_push_ops_test = bool(int(os.getenv("ENABLE_PUSH_OPS_TEST", 0))) + args.enable_slicer_test = bool(int(os.getenv("ENABLE_SLICER_TEST", 0))) except ValueError as err: raise ValueError("please correctly config USE_MPI or USE_DYNAMIC or USE_DYNAMIC_EXPANSION or " "USE_MULTI_LOOKUP or USE_MODIFY_GRAPH or USE_TIMESTAMP or USE_ONE_SHOT " diff --git a/examples/demo/little_demo_estimator/nn_model_build.py b/examples/demo/little_demo_estimator/nn_model_build.py index 67820d04..11faadf1 100644 --- a/examples/demo/little_demo_estimator/nn_model_build.py +++ b/examples/demo/little_demo_estimator/nn_model_build.py @@ -155,7 +155,7 @@ class LittleModel: optimizer_list=sparse_optimizer_list) if 
diff --git a/examples/demo/little_demo_estimator/nn_model_input.py b/examples/demo/little_demo_estimator/nn_model_input.py
index d6ebb529..973a457c 100644
--- a/examples/demo/little_demo_estimator/nn_model_input.py
+++ b/examples/demo/little_demo_estimator/nn_model_input.py
@@ -29,7 +29,7 @@ def get_model_fn(create_fs_params, cfg, access_and_evict_config_dict=None):
         if params.use_timestamp:
             model = LittleModel(params, cfg, mode, features, create_fs_params,
                                 access_and_evict_config_dict=access_and_evict_config_dict)
-            tf.add_to_collection(ASCEND_TIMESTAMP, features["timestamp"])
+            tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, features["timestamp"])
         else:
             model = LittleModel(params, cfg, mode, features, create_fs_params)
     else:
diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh
index 6534fb21..f3d34c82 100644
--- a/examples/demo/little_demo_estimator/run.sh
+++ b/examples/demo/little_demo_estimator/run.sh
@@ -104,8 +104,7 @@ export KEY_PROCESS_THREAD_NUM=6 #default 6, max 10
 export FAST_UNIQUE=0 #if use fast unique
 export MGMT_HBM_TASK_MODE=0 #if async h2d (get and send tensors)
 ################## Test settings #####################
-# NOTE: enable only when testing constant/string related ops as sparse table inputs; the current version supports TF1 only.
-export ENABLE_PUSH_OPS_TEST=0
+export ENABLE_SLICER_TEST=0
 
 # Help message, no modification needed
 if [[ $1 == --help || $1 == -h ]];then
--
Gitee

From 564f2c2ad7334a7938614bc8a884701a0c0e7fbe Mon Sep 17 00:00:00 2001
From: yxy1684 <2270320041@qq.com>
Date: Tue, 30 Apr 2024 11:47:08 +0800
Subject: [PATCH 080/302] cleancode

---
 .../op_host/embedding_update_by_address.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp
index 5f823889..43d7a886 100644
--- a/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp
+++ b/cust_op/cust_op_by_addr/op_host/embedding_update_by_address.cpp
@@ -28,7 +28,7 @@ namespace optiling
     constexpr int32_t SIZE_OF_HALF = 2;
     constexpr int32_t SIZE_OF_FLOAT_OR_INT = 4;
     constexpr int32_t MIN_BLOCK_SIZE = 32; // data in UB memory must be 32-byte aligned
-    constexpr uint32_t UB_LIMIT = 175 * 1024;
+    constexpr uint64_t UB_LIMIT = 175 * 1024;
    constexpr int32_t USR_SIZE = 256;
    constexpr int32_t SYS_WORKSPACE_SIZE = 16 * 1024 * 1024;
    constexpr int32_t PING_PONG_NUM = 1;
@@ -44,7 +44,7 @@ namespace optiling
         return ge::GRAPH_SUCCESS;
     }
 
-    static ge::graphStatus CheckPositiveInt(int32_t value, const char *errorMessage)
+    static ge::graphStatus CheckPositiveInt(int64_t value, const char *errorMessage)
     {
         if (value < 0) {
             printf("%s can not be smaller than 0\n", errorMessage);
@@ -73,7 +73,7 @@ namespace optiling
             return ge::GRAPH_FAILED;
         }
 
-        int32_t inputShape = static_cast<int32_t>(inputTensor->GetShapeSize());
+        int64_t inputShape = inputTensor->GetShapeSize();
         if (CheckPositiveInt(inputShape, "inputShape") != ge::GRAPH_SUCCESS) {
             return ge::GRAPH_FAILED;
         }
@@ -83,8 +83,8 @@ namespace optiling
             return ge::GRAPH_FAILED;
         }
 
-        const int32_t inputShapeTmp = (inputShape > 0) ? inputShape : 1;
-        int32_t inputDim = static_cast<int32_t>(inputTensor1->GetShapeSize()) / inputShapeTmp;
+        const int64_t inputShapeTmp = (inputShape > 0) ? inputShape : 1;
+        int64_t inputDim = inputTensor1->GetShapeSize() / inputShapeTmp;
         if (CheckPositiveInt(inputDim, "inputDim") != ge::GRAPH_SUCCESS) {
             return ge::GRAPH_FAILED;
         }
@@ -122,8 +122,8 @@ namespace optiling
         int32_t occupyAddressBytesNum = sizeof(int64_t) + typeSize * inputDimAligned * PING_PONG_NUM * 2;
 
        // max number of addresses handled per loop; the addresses themselves are also moved into UB, so align to 32
-        int32_t addrPerLoop = static_cast<int32_t>(
-            UB_LIMIT / static_cast<uint32_t>(occupyAddressBytesNum) & (~3U)); // & (~3U) keeps the address count a multiple of 4
+        int64_t addrPerLoop = static_cast<int64_t>(
+            UB_LIMIT / static_cast<uint64_t>(occupyAddressBytesNum) & (~3U)); // & (~3U) keeps the address count a multiple of 4
 
         if (CheckPositiveInt(addrPerLoop, "addrPerLoop") != ge::GRAPH_SUCCESS) {
             return ge::GRAPH_FAILED;
--
Gitee
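The addrPerLoop bound above divides the UB byte budget by the bytes one
address occupies (an 8-byte address plus the ping-pong buffered row data),
then clears the low two bits so the count is a multiple of 4. A worked check
of that arithmetic, assuming sample values typeSize=4 and inputDimAligned=32
(these two are illustrative and not fixed by the patch):

    UB_LIMIT = 175 * 1024                 # UB byte budget from the hunk above
    PING_PONG_NUM = 1
    type_size, input_dim_aligned = 4, 32  # assumed: float rows of 32 elements
    occupy = 8 + type_size * input_dim_aligned * PING_PONG_NUM * 2  # sizeof(int64_t) == 8
    addr_per_loop = (UB_LIMIT // occupy) & ~3  # & ~3 rounds down to a multiple of 4
    assert addr_per_loop == 676 and addr_per_loop % 4 == 0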
From 19f0fa3308abe6eddc2c7b4aaed07317874289e0 Mon Sep 17 00:00:00 2001
From: penghuiyang <1060916628@qq.com>
Date: Tue, 30 Apr 2024 15:41:39 +0800
Subject: [PATCH 081/302] Fix aclnn test CI gate issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../aclnn_lazy_adam_test/inc/common.h         |  35 +-
 .../aclnn_lazy_adam_test/inc/op_runner.h      | 321 +++++++++---------
 .../aclnn_lazy_adam_test/inc/operator_desc.h  |  73 ++--
 .../aclnn_lazy_adam_test/src/common.cpp       | 107 +++---
 .../aclnn_lazy_adam_test/src/main.cpp         |  39 +--
 5 files changed, 290 insertions(+), 285 deletions(-)

diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h
index ba754761..601a2617 100644
--- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h
+++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h
@@ -24,6 +24,7 @@ See the License for the specific language governing permissions and
 
 #include "acl/acl.h"
 
+namespace AclnnLazyAdam {
 #define SUCCESS 0
 #define FAILED 1
 
@@ -31,21 +32,21 @@
 #define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args)
 #define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args)
 #define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) -/** - * @brief Read data from file - * @param [in] filePath: file path - * @param [out] fileSize: file size - * @return read result - */ -bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); - -/** - * @brief Write data to file - * @param [in] filePath: file path - * @param [in] buffer: data to write to file - * @param [in] size: size to write - * @return write result - */ -bool WriteFile(const std::string &filePath, const void *buffer, size_t size); - + /** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ + bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + + /** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ + bool WriteFile(const std::string &filePath, const void *buffer, size_t size); +} #endif // COMMON_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h index ed432a1e..cfb6a1b7 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h @@ -21,168 +21,173 @@ See the License for the specific language governing permissions and #include "common.h" #include "operator_desc.h" -/** - * Op Runner - */ -class OpRunner { -public: +namespace AclnnLazyAdam { /** - * @brief Constructor - * @param [in] opDesc: op description + * Op Runner */ - explicit OpRunner(OperatorDesc *opDesc); - - /** - * @brief Destructor - */ - virtual ~OpRunner(); - - /** - * @brief Init op runner - */ - bool Init(); - - /** - * @brief Get number of inputs - * @return number of inputs - */ - const size_t NumInputs(); - - /** - * @brief Get number of outputs - * @return number of outputs - */ - const size_t NumOutputs(); - - /** - * @brief Get input size by index - * @param [in] index: input index - * @return size of the input - */ - const size_t GetInputSize(size_t index) const; - const size_t GetInputNumDims(size_t index) const; - aclDataType GetInputDataType(size_t index) const; - aclFormat GetInputFormat(size_t index) const; - - /** - * @brief Get output size by index - * @param [in] index: output index - * @return size of the output - */ - size_t GetOutputSize(size_t index) const; - const size_t GetOutputNumDims(size_t index) const; - aclDataType GetOutputDataType(size_t index) const; - aclFormat GetOutputFormat(size_t index) const; - - /** - * @brief Get input element count by index - * @param i[in] ndex: input index - * @return element count of the input - */ - size_t GetInputElementCount(size_t index) const; - - /** - * @brief Get output element count by index - * @param [in] index: output index - * @return element count of the output - */ - size_t GetOutputElementCount(size_t index) const; - - /** - * @brief Get input shape by index - * @param [in] index: input index - * @return shape of the output - */ - std::vector GetInputShape(size_t index) const; - - /** - * @brief Get output shape by index - * @param [in] index: output index - * @return shape of the output - */ - std::vector GetOutputShape(size_t index) const; - - /** - * @brief Get input buffer(host memory) by index - * @tparam T: data type - * @param [in] index: input index - * @return host address of the input - */ - template - T 
*GetInputBuffer(size_t index) - { - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return nullptr; + class OpRunner { + public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + + const size_t GetInputNumDims(size_t index) const; + + aclDataType GetInputDataType(size_t index) const; + + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + + const size_t GetOutputNumDims(size_t index) const; + + aclDataType GetOutputDataType(size_t index) const; + + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); } - return reinterpret_cast(hostInputs_[index]); - } - /** - * @brief Get output buffer(host memory) by index - * @tparam T: data type - * @param [in] index: output index - * @return host address of the output - */ - template - const T *GetOutputBuffer(size_t index) - { - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return nullptr; + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); } - return reinterpret_cast(hostOutputs_[index]); - } - - /** - * @brief Print readable input by index - * @param [in] index: input index - * @param [in] numElementsPerRow: number of elements per row - */ - void PrintInput(size_t index, size_t numElementsPerRow = 16); - - /** - * @brief Print readable output by index - * @param [in] index: output index - * @param [in] numElementsPerRow: number of elements per row - */ - void PrintOutput(size_t index, size_t numElementsPerRow = 16); - - /** - * @brief Compile static op - * @return compile result - */ - bool CompileStaticOp(); - - /** - * @brief Compile dynamic op - * @return compile result - */ - bool CompileDynamicOp(); - - /** - * @brief Run op - * @return run result - */ - bool RunOp(); - -private: - size_t numInputs_; - size_t numOutputs_; - - std::vector inputBuffers_; - std::vector outputBuffers_; - - std::vector devInputs_; - std::vector devOutputs_; - - std::vector hostInputs_; - std::vector hostOutputs_; - - std::vector inputTensor_; - std::vector outputTensor_; - OperatorDesc *opDesc_; -}; - + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] numElementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t numElementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] numElementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t numElementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + + private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; + }; +} #endif // OP_RUNNER_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h index 0c76260b..ddd3b3a9 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h @@ -21,46 +21,47 @@ See the License for the specific language governing permissions and #include "acl/acl.h" -/** - * Op description - */ -struct OperatorDesc { +namespace AclnnLazyAdam { /** - * Constructor + * Op description */ - explicit OperatorDesc(); + struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); - /** - * Destructor - */ - virtual ~OperatorDesc(); + /** + * Destructor + */ + virtual ~OperatorDesc(); - /** - * Add an input tensor description - * @param [in] dataType: data type - * @param [in] numDims: number of dims - * @param [in] dims: dims - * @param [in] format: format - * @return OperatorDesc - */ - OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); - - /** - * Add an output tensor description - * @param [in] dataType: data type - * @param [in] numDims: number of dims - * @param [in] dims: dims - * @param [in] format: format - * @return OperatorDesc - */ - OperatorDesc 
&AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); - std::string opType; - std::vector inputDesc; - std::vector outputDesc; - double beta1; - double beta2; - double epsilon; -}; + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + std::string opType; + std::vector inputDesc; + std::vector outputDesc; + double beta1; + double beta2; + double epsilon; + }; +} #endif // OPERATOR_DESC_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp index 1c295bfc..1f353b68 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp @@ -20,66 +20,63 @@ See the License for the specific language governing permissions and #include "common.h" -extern bool g_isDevice; +namespace AclnnLazyAdam { + bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) { + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } -bool ReadFile(const std::string& filePath, size_t fileSize, void* buffer, size_t bufferSize) -{ - struct stat sBuf; - int fileStatus = stat(filePath.data(), &sBuf); - if (fileStatus == -1) { - ERROR_LOG("failed to get file %s", filePath.c_str()); - return false; - } - if (S_ISREG(sBuf.st_mode) == 0) { - ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); - return false; - } + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } - std::ifstream file; - file.open(filePath, std::ios::binary); - if (!file.is_open()) { - ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); - return false; - } - - std::filebuf* buf = file.rdbuf(); - size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); - if (size == 0) { - ERROR_LOG("file size is 0"); + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; file.close(); - return false; + return true; } - if (size > bufferSize) { - ERROR_LOG("file size is larger than buffer size"); - file.close(); - return false; - } - buf->pubseekpos(0, std::ios::in); - buf->sgetn(static_cast(buffer), size); - fileSize = size; - file.close(); - return true; -} -bool WriteFile(const std::string& filePath, const void* buffer, size_t size) -{ - if (buffer == nullptr) { - ERROR_LOG("Write file failed. buffer is nullptr"); - return false; - } + bool WriteFile(const std::string &filePath, const void *buffer, size_t size) { + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } - int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); - if (fd < 0) { - ERROR_LOG("Open file failed. path = %s", filePath.c_str()); - return false; - } + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } - auto writeSize = write(fd, buffer, size); - (void) close(fd); - if (writeSize != size) { - ERROR_LOG("Write file Failed."); - return false; + return true; } - - return true; -} +} \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp index f32efcaa..526da630 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp @@ -25,24 +25,28 @@ See the License for the specific language governing permissions and #include "common.h" #include "op_runner.h" +using namespace AclnnLazyAdam; + bool g_isDevice = false; int g_deviceId = 0; -constexpr int DIM0 = 2000000; // inputM inputV inputVar 的行数 -constexpr int DIM1 = 564096; // indices长度 -constexpr int DIM2 = 32; // inputM inputV inputVar gradient等每行的数据个数 -constexpr int INPUT_M_INDEX = 2; -constexpr int INPUT_V_INDEX = 3; -constexpr int INPUT_VAR_INDEX = 4; -constexpr int LEARNING_RATE_INDEX = 5; -constexpr int OUTPUT_M_INDEX = 0; -constexpr int OUTPUT_V_INDEX = 1; -constexpr int OUTPUT_VAR_INDEX = 2; -constexpr float LEARNING_RATE = 0.001; -constexpr float BETA1 = 0.9; -constexpr float BETA2 = 0.999; -constexpr float EPSILON = 1e-7; -const char* READ_ERROR_INFO = "read input file error, please check whether file exist and access rights is correct"; -const char* WRITE_ERROR_INFO = "write output file error, please check access rights is correct"; +namespace { + constexpr int DIM0 = 2000000; // inputM inputV inputVar 的行数 + constexpr int DIM1 = 564096; // indices长度 + constexpr int DIM2 = 32; // inputM inputV inputVar gradient等每行的数据个数 + constexpr int INPUT_M_INDEX = 2; + constexpr int INPUT_V_INDEX = 3; + constexpr int INPUT_VAR_INDEX = 4; + 
constexpr int LEARNING_RATE_INDEX = 5; + constexpr int OUTPUT_M_INDEX = 0; + constexpr int OUTPUT_V_INDEX = 1; + constexpr int OUTPUT_VAR_INDEX = 2; + constexpr float LEARNING_RATE = 0.001; + constexpr float BETA1 = 0.9; + constexpr float BETA2 = 0.999; + constexpr float EPSILON = 1e-7; + const char* READ_ERROR_INFO = "read input file error, please check whether file exist and access rights is correct"; + const char* WRITE_ERROR_INFO = "write output file error, please check access rights is correct"; +} OperatorDesc CreateOpDesc() { @@ -202,7 +206,6 @@ bool RunOp() ERROR_LOG("Process output data failed"); return false; } - INFO_LOG("Run op success"); return true; } @@ -219,8 +222,6 @@ int main(int argc, char** argv) DestroyResource(); return FAILED; } - DestroyResource(); - return SUCCESS; } -- Gitee From a1f85f8ff7cade87aab728915de571fbd76ebf17 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 30 Apr 2024 16:21:10 +0800 Subject: [PATCH 082/302] =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h | 6 ++++-- cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h index cfb6a1b7..6f91f905 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h @@ -116,7 +116,8 @@ namespace AclnnLazyAdam { * @return host address of the input */ template - T *GetInputBuffer(size_t index) { + T *GetInputBuffer(size_t index) + { if (index >= numInputs_) { ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return nullptr; @@ -131,7 +132,8 @@ namespace AclnnLazyAdam { * @return host address of the output */ template - const T *GetOutputBuffer(size_t index) { + const T *GetOutputBuffer(size_t index) + { if (index >= numOutputs_) { ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); return nullptr; diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp index 1f353b68..e2cd6865 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp @@ -21,7 +21,8 @@ See the License for the specific language governing permissions and #include "common.h" namespace AclnnLazyAdam { - bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) { + bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) + { struct stat sBuf; int fileStatus = stat(filePath.data(), &sBuf); if (fileStatus == -1) { @@ -59,7 +60,8 @@ namespace AclnnLazyAdam { return true; } - bool WriteFile(const std::string &filePath, const void *buffer, size_t size) { + bool WriteFile(const std::string &filePath, const void *buffer, size_t size) + { if (buffer == nullptr) { ERROR_LOG("Write file failed. 
buffer is nullptr"); return false; -- Gitee From fb0eacdee9bd361babe62083f91b1175abddb7b1 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 30 Apr 2024 16:30:45 +0800 Subject: [PATCH 083/302] =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B93?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/src/main.cpp | 289 +++++++++--------- 1 file changed, 145 insertions(+), 144 deletions(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp index 526da630..c4253996 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp @@ -46,168 +46,169 @@ namespace { constexpr float EPSILON = 1e-7; const char* READ_ERROR_INFO = "read input file error, please check whether file exist and access rights is correct"; const char* WRITE_ERROR_INFO = "write output file error, please check access rights is correct"; -} - -OperatorDesc CreateOpDesc() -{ - std::vector indicesShape{DIM1, 1}; - std::vector gradientShape{DIM1, DIM2}; - std::vector inputMShape{DIM0, DIM2}; // inputM inputV inputVar 的shape相同 - std::vector learningRateShape{1}; - aclDataType dataType = ACL_FLOAT; - aclDataType indexDataType = ACL_INT32; - aclFormat format = ACL_FORMAT_ND; - OperatorDesc opDesc; - opDesc.AddInputTensorDesc(dataType, gradientShape.size(), gradientShape.data(), format); - opDesc.AddInputTensorDesc(indexDataType, indicesShape.size(), indicesShape.data(), format); - opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputM - opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputV - opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputVar - opDesc.AddInputTensorDesc(dataType, learningRateShape.size(), learningRateShape.data(), format); // learningRate - opDesc.beta1 = BETA1; - opDesc.beta2 = BETA2; - opDesc.epsilon = EPSILON; - return opDesc; -} -bool SetInputData(OpRunner& runner) -{ - size_t fileSize = 0; - if (!ReadFile("../input/gradient.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0))) { - throw std::runtime_error(READ_ERROR_INFO); - } - if (!ReadFile("../input/indices.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1))) { - throw std::runtime_error(READ_ERROR_INFO); - } - if (!ReadFile("../input/inputM.bin", fileSize, runner.GetInputBuffer(INPUT_M_INDEX), - runner.GetInputSize(INPUT_M_INDEX))) { - throw std::runtime_error(READ_ERROR_INFO); - } - if (!ReadFile("../input/inputV.bin", fileSize, runner.GetInputBuffer(INPUT_V_INDEX), - runner.GetInputSize(INPUT_V_INDEX))) { - throw std::runtime_error(READ_ERROR_INFO); - } - if (!ReadFile("../input/inputVar.bin", fileSize, runner.GetInputBuffer(INPUT_VAR_INDEX), - runner.GetInputSize(INPUT_VAR_INDEX))) { - throw std::runtime_error(READ_ERROR_INFO); - } - if (!ReadFile("../input/learningRate.bin", fileSize, runner.GetInputBuffer(LEARNING_RATE_INDEX), - runner.GetInputSize(LEARNING_RATE_INDEX))) { - throw std::runtime_error(READ_ERROR_INFO); - } - INFO_LOG("Set input success"); - return true; -} - -bool ProcessOutputData(OpRunner& runner) -{ - // 保存输出数据 由于输出仅有hostOutputs_数据,未设置outputDesc,因此数据size从inputTensor获取 - if (!WriteFile("../output/outputM.bin", runner.GetOutputBuffer(OUTPUT_M_INDEX), - runner.GetInputSize(INPUT_M_INDEX))) { - throw std::runtime_error(WRITE_ERROR_INFO); - } - if 
(!WriteFile("../output/outputV.bin", runner.GetOutputBuffer(OUTPUT_V_INDEX), - runner.GetInputSize(INPUT_V_INDEX))) { - throw std::runtime_error(WRITE_ERROR_INFO); - } - if (!WriteFile("../output/outputVar.bin", runner.GetOutputBuffer(OUTPUT_VAR_INDEX), - runner.GetInputSize(INPUT_VAR_INDEX))) { - throw std::runtime_error(WRITE_ERROR_INFO); + OperatorDesc CreateOpDesc() + { + std::vector indicesShape{DIM1, 1}; + std::vector gradientShape{DIM1, DIM2}; + std::vector inputMShape{DIM0, DIM2}; // inputM inputV inputVar 的shape相同 + std::vector learningRateShape{1}; + aclDataType dataType = ACL_FLOAT; + aclDataType indexDataType = ACL_INT32; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(dataType, gradientShape.size(), gradientShape.data(), format); + opDesc.AddInputTensorDesc(indexDataType, indicesShape.size(), indicesShape.data(), format); + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputM + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputV + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputVar + opDesc.AddInputTensorDesc(dataType, learningRateShape.size(), learningRateShape.data(), + format); // learningRate + opDesc.beta1 = BETA1; + opDesc.beta2 = BETA2; + opDesc.epsilon = EPSILON; + return opDesc; + } + + bool SetInputData(OpRunner& runner) + { + size_t fileSize = 0; + if (!ReadFile("../input/gradient.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/indices.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputM.bin", fileSize, runner.GetInputBuffer(INPUT_M_INDEX), + runner.GetInputSize(INPUT_M_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputV.bin", fileSize, runner.GetInputBuffer(INPUT_V_INDEX), + runner.GetInputSize(INPUT_V_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputVar.bin", fileSize, runner.GetInputBuffer(INPUT_VAR_INDEX), + runner.GetInputSize(INPUT_VAR_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/learningRate.bin", fileSize, runner.GetInputBuffer(LEARNING_RATE_INDEX), + runner.GetInputSize(LEARNING_RATE_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + INFO_LOG("Set input success"); + return true; } - INFO_LOG("Write output success"); - return true; -} -void DestroyResource() -{ - bool flag = false; - if (aclrtResetDevice(g_deviceId) != ACL_SUCCESS) { - ERROR_LOG("Reset device %d failed", g_deviceId); - flag = true; - } - INFO_LOG("Reset Device success"); - if (aclFinalize() != ACL_SUCCESS) { - ERROR_LOG("Finalize acl failed"); - flag = true; - } - if (flag) { - ERROR_LOG("Destroy resource failed"); - } else { - INFO_LOG("Destroy resource success"); + bool ProcessOutputData(OpRunner& runner) + { + // 保存输出数据 由于输出仅有hostOutputs_数据,未设置outputDesc,因此数据size从inputTensor获取 + if (!WriteFile("../output/outputM.bin", runner.GetOutputBuffer(OUTPUT_M_INDEX), + runner.GetInputSize(INPUT_M_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + if (!WriteFile("../output/outputV.bin", runner.GetOutputBuffer(OUTPUT_V_INDEX), + runner.GetInputSize(INPUT_V_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + if (!WriteFile("../output/outputVar.bin", runner.GetOutputBuffer(OUTPUT_VAR_INDEX), 
+ runner.GetInputSize(INPUT_VAR_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + INFO_LOG("Write output success"); + return true; } -} -bool InitResource() -{ - std::string output = "../output"; - if (access(output.c_str(), 0) == -1) { - int ret = mkdir(output.c_str(), 0700); - if (ret == 0) { - INFO_LOG("Make output directory successfully"); + void DestroyResource() + { + bool flag = false; + if (aclrtResetDevice(g_deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", g_deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destroy resource failed"); } else { - ERROR_LOG("Make output directory fail"); - return false; + INFO_LOG("Destroy resource success"); } } - // acl.json is dump or profiling config file - if (aclInit(NULL) != ACL_SUCCESS) { - ERROR_LOG("acl init failed"); - return false; - } + bool InitResource() + { + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } else { + ERROR_LOG("Make output directory fail"); + return false; + } + } - if (aclrtSetDevice(g_deviceId) != ACL_SUCCESS) { - ERROR_LOG("Set device failed. g_deviceId is %d", g_deviceId); - (void) aclFinalize(); - return false; - } - INFO_LOG("Set device[%d] success", g_deviceId); + // acl.json is dump or profiling config file + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } - // runMode is ACL_HOST which represents app is running in host - // runMode is ACL_DEVICE which represents app is running in device - aclrtRunMode runMode; - if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { - ERROR_LOG("Get run mode failed"); - DestroyResource(); - return false; + if (aclrtSetDevice(g_deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. 
g_deviceId is %d", g_deviceId); + (void) aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", g_deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestroyResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; } - g_isDevice = (runMode == ACL_DEVICE); - INFO_LOG("Get RunMode[%d] success", runMode); - return true; -} + bool RunOp() + { + // create op desc + OperatorDesc opDesc = CreateOpDesc(); -bool RunOp() -{ - // create op desc - OperatorDesc opDesc = CreateOpDesc(); - - // create Runner - OpRunner opRunner(&opDesc); - if (!opRunner.Init()) { - ERROR_LOG("Init OpRunner failed"); - return false; - } + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } - // Load inputs - if (!SetInputData(opRunner)) { - ERROR_LOG("Set input data failed"); - return false; - } + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } - // Run op - if (!opRunner.RunOp()) { - ERROR_LOG("Run op failed"); - return false; - } + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } - // process output data - if (!ProcessOutputData(opRunner)) { - ERROR_LOG("Process output data failed"); - return false; + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + INFO_LOG("Run op success"); + return true; } - INFO_LOG("Run op success"); - return true; } int main(int argc, char** argv) -- Gitee From f9500b2688d13b7ef428fc1b2ffb5bce53a3e39a Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Mon, 6 May 2024 01:37:03 +0000 Subject: [PATCH 084/302] =?UTF-8?q?!114=20CleanCode=E6=B8=85=E7=90=86=20*?= =?UTF-8?q?=20CleanCode=E6=B8=85=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/graph/slicers.py | 440 ++++++++++++++++++++-------------------- 1 file changed, 220 insertions(+), 220 deletions(-) diff --git a/mx_rec/graph/slicers.py b/mx_rec/graph/slicers.py index 3204af4e..3999cdd4 100644 --- a/mx_rec/graph/slicers.py +++ b/mx_rec/graph/slicers.py @@ -69,6 +69,226 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): def slice(self) -> None: pass + @staticmethod + def _find_min_dep_ops( + tgt_ops: Set[Operation], + ) -> Set[Operation]: + logger.debug("Search from base nodes: %s.", tgt_ops) + base_ops = tgt_ops.copy() + visited_ops = base_ops + + loop_cnt = 0 + while base_ops: + loop_cnt += 1 + if loop_cnt > MAX_WHILE_SIZE: + raise RuntimeError(f"maximum loop times exceed limit: {MAX_WHILE_SIZE}.") + + parent_ops = set() + for base_node in base_ops: + if len(base_node.control_inputs) != 0: + raise ValueError("control dependencies are not supported.") + + parent_ops.update( + tensor_in.op + for tensor_in in base_node.inputs + if tensor_in.op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value + ) + + new_ops = parent_ops - visited_ops + base_ops = parent_ops + visited_ops.update(new_ops) + + logger.debug("Found minimum dependency graph nodes: %s.", visited_ops) + return visited_ops + + @staticmethod + def _validate_op(op: Operation) -> bool: + op_type = op.type + op_name = op.name + op_inputs = op.inputs + op_outputs = op.outputs + + for s in 
NoGradSubgraphSlicer._INVALID_STR_IN_OP_TYPE: + if s in op_type: + logger.warning("Invalid operation type: %s which contains str: %s.", op_type, s) + return False + for s in NoGradSubgraphSlicer._INVALID_STR_IN_OP_NAME: + if s in op_name: + logger.warning("Invalid operation name: %s which contains str: %s.", op_name, s) + return False + for t in op_inputs: + if t.dtype in NoGradSubgraphSlicer._INVALID_TENSOR_DTYPE: + logger.warning("Invalid operation input tensor of operation: %s whose type is %s.", t, t.dtype) + return False + for t in op_outputs: + if t.dtype in NoGradSubgraphSlicer._INVALID_TENSOR_DTYPE: + logger.warning("Invalid operation output tensor of operation: %s whose type is %s.", t, t.dtype) + return False + + return True + + @staticmethod + def _update_subgraph_in( + base_ops: Operation, + input_to_edge_ops: Dict[Operation, Set[Operation]], + sub_graph_ops: Set[Operation], + ) -> None: + for input_tensor in base_ops.inputs: + input_node = input_tensor.op + if input_node not in sub_graph_ops: + res = input_to_edge_ops.get(input_node, set()) + res.add(base_ops) + input_to_edge_ops[input_node] = res + + @staticmethod + def _update_subgraph_out( + base_ops: Operation, + out_op_to_edge_ops: Dict[Operation, Set[Operation]], + sub_graph_ops: Set[Operation], + ) -> None: + for output_tensor in base_ops.outputs: + for output_consumer in output_tensor.consumers(): + if output_consumer not in sub_graph_ops: + res = out_op_to_edge_ops.get(output_consumer, set()) + res.add(base_ops) + out_op_to_edge_ops[output_consumer] = res + + @staticmethod + def _upward_bfs_op(base_ops: Union[Operation, Set[Operation], List[Operation]], tgt_op_type: str) -> Operation: + if not isinstance(base_ops, (set, list)): + base_ops = [base_ops] + + parent_ops = base_ops + while True: + for parent_op in parent_ops: + if parent_op.type == tgt_op_type: + return parent_op + base_ops = parent_ops + parent_ops = [] + for base_op in base_ops: + parent_ops.extend(utils.find_parent_op(base_op)) + if not parent_ops: + raise ValueError(f"target operation '{tgt_op_type}'' was not found.") + + @staticmethod + def _topo_sort_sliced_ops(sliced_ops: Set[Operation]) -> List[Operation]: + topo_subgraph_list = [] + topo_subgraph_set = set() + start_nodes = set() + [start_nodes.add(x) for x in sliced_ops] + logger.info("Got topo_subgraph start nodes: %s", start_nodes) + + def topo_sort_helper(curr_op, output_list, output_set): + if not isinstance(curr_op, Operation): + raise RuntimeError(f"topo_subgraph_dfs input should be node(aka. tf.Operator). 
{curr_op}") + curr_inputs = curr_op.inputs + logger.debug("Got topo_dfs: %s <- %s", curr_op.name, [x.name for x in curr_inputs]) + current_control_inputs = curr_op.control_inputs + if len(current_control_inputs) > 0: + raise RuntimeError( + f"control input are not supported: {curr_op.name}, control_inputs: {current_control_inputs}" + ) + if curr_op in output_set: + return + output_set.add(curr_op) + for tensor in curr_inputs: + node = tensor.op + if node.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value and node not in output_set: + topo_sort_helper(node, output_list, output_set) + output_list.append(curr_op) + + [topo_sort_helper(x, topo_subgraph_list, topo_subgraph_set) for x in start_nodes] + if len(topo_subgraph_list) != len(topo_subgraph_set): + raise RuntimeError(f"got duplicated topo node: {sorted(topo_subgraph_list, key=lambda x: x.name)}.") + logger.info("Got topo_subgraph: %s", topo_subgraph_list) + return topo_subgraph_list + + @staticmethod + def _get_mapping_for_subgraph_in( + from_op: Operation, + to_ops: Set[Operation], + tensor_mapping: Union[Dict[Tensor, Tensor], Dict[SparseTensor, SparseTensor]], + ) -> None: + if from_op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: + raise RuntimeError(f"expect IteratorGetNext for input tensor of subgraph, but got {from_op}") + for node in to_ops: + for each_tensor in node.inputs: + if each_tensor.op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: + continue + old_tensor_name = each_tensor.name + x_index = int(old_tensor_name.split(":")[-1]) + g = tf.compat.v1.get_default_graph() + arg_tensor = g.get_tensor_by_name("args_%d:0" % x_index) + tensor_mapping[each_tensor] = arg_tensor + + @staticmethod + def _get_mapping_for_subgraph( + old_op: Operation, + node_mapping: Dict[Operation, Operation], + tensor_mapping: Dict[Tensor, Tensor], + ) -> None: + logger.debug("old operation name: %s\nold operation inputs: %s\n", old_op.name, [x for x in old_op.inputs]) + + for each_tensor in old_op.inputs: + if each_tensor not in tensor_mapping: + raise RuntimeError( + f"each_tensor(input) {each_tensor} need by {old_op.name} not in tensor_mapping.{tensor_mapping}" + ) + new_inputs = NoGradSubgraphSlicer._get_mapped_tensor(tensor_mapping, old_op.inputs) + + node_def = old_op.node_def + node_def.name = "{}/{}".format(NoGradSubgraphSlicer._SLICED_OP_NAME_PREFIX, node_def.name) + new_node = tf.Operation(node_def=node_def, g=tf.compat.v1.get_default_graph(), inputs=new_inputs) + + node_mapping[old_op] = new_node + for old_out_tensor, new_out_tensor in zip(old_op.outputs, new_node.outputs): + tensor_mapping[old_out_tensor] = new_out_tensor + + @staticmethod + def _get_mapped_tensor(tensor2tensor: Dict[Tensor, Tensor], keys: List[Tensor]) -> List[Tensor]: + tensors = [] + for k in keys: + if k not in tensor2tensor: + raise KeyError(f"failed to find key tensor: {k} from tensor map: {tensor2tensor}.") + tensors.append(tensor2tensor[k]) + return tensors + + @staticmethod + def _sort_sliced_graph_outputs(subgraph_out: Dict[Operation, Set[Operation]]) -> List[Tensor]: + extra_outputs = [] + sorted_outputs = sorted(subgraph_out.items(), key=lambda x: x[0].name) + for outside_op, edge_ops in sorted_outputs: + outside_op_inputs = set(outside_op.inputs) + for edge_op in edge_ops: + NoGradSubgraphSlicer._add_sorted_additional_tensors(extra_outputs, outside_op_inputs, edge_op) + return extra_outputs + + @staticmethod + def _add_sorted_additional_tensors(extra_outputs, outside_op_inputs, edge_op) -> None: + for each_tensor in sorted(edge_op.outputs, key=lambda x: 
x.name): + if each_tensor not in outside_op_inputs: + continue + if each_tensor in extra_outputs: + continue + extra_outputs.append(each_tensor) + + @staticmethod + def _get_tensor_consumers(tensor: Tensor) -> List[Operation]: + if not isinstance(tensor, NoGradSubgraphSlicer._VALID_TENSOR_CLASS): + raise RuntimeError(f"expected 'tf.Tensor' or 'tf.SparseTensor', but got: {tensor}") + + graph = tensor.graph + consumers = [] + consumer_names = [op.name for op in tensor.consumers()] + + with graph._lock: + for name in consumer_names: + if name not in graph._nodes_by_name: # ignore deleted node + continue + consumers.append(graph._nodes_by_name[name]) + + return consumers + def _slice_ops(self, sliceable_ops: Set[Operation], is_training: bool) -> None: """Slice the minimum dependency graph of given operation set. @@ -474,226 +694,6 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): new_tensor, ) - @staticmethod - def _find_min_dep_ops( - tgt_ops: Set[Operation], - ) -> Set[Operation]: - logger.debug("Search from base nodes: %s.", tgt_ops) - base_ops = tgt_ops.copy() - visited_ops = base_ops - - loop_cnt = 0 - while base_ops: - loop_cnt += 1 - if loop_cnt > MAX_WHILE_SIZE: - raise RuntimeError(f"maximum loop times exceed limit: {MAX_WHILE_SIZE}.") - - parent_ops = set() - for base_node in base_ops: - if len(base_node.control_inputs) != 0: - raise ValueError("control dependencies are not supported.") - - parent_ops.update( - tensor_in.op - for tensor_in in base_node.inputs - if tensor_in.op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value - ) - - new_ops = parent_ops - visited_ops - base_ops = parent_ops - visited_ops.update(new_ops) - - logger.debug("Found minimum dependency graph nodes: %s.", visited_ops) - return visited_ops - - @staticmethod - def _validate_op(op: Operation) -> bool: - op_type = op.type - op_name = op.name - op_inputs = op.inputs - op_outputs = op.outputs - - for s in NoGradSubgraphSlicer._INVALID_STR_IN_OP_TYPE: - if s in op_type: - logger.warning("Invalid operation type: %s which contains str: %s.", op_type, s) - return False - for s in NoGradSubgraphSlicer._INVALID_STR_IN_OP_NAME: - if s in op_name: - logger.warning("Invalid operation name: %s which contains str: %s.", op_name, s) - return False - for t in op_inputs: - if t.dtype in NoGradSubgraphSlicer._INVALID_TENSOR_DTYPE: - logger.warning("Invalid operation input tensor of operation: %s whose type is %s.", t, t.dtype) - return False - for t in op_outputs: - if t.dtype in NoGradSubgraphSlicer._INVALID_TENSOR_DTYPE: - logger.warning("Invalid operation output tensor of operation: %s whose type is %s.", t, t.dtype) - return False - - return True - - @staticmethod - def _update_subgraph_in( - base_ops: Operation, - input_to_edge_ops: Dict[Operation, Set[Operation]], - sub_graph_ops: Set[Operation], - ) -> None: - for input_tensor in base_ops.inputs: - input_node = input_tensor.op - if input_node not in sub_graph_ops: - res = input_to_edge_ops.get(input_node, set()) - res.add(base_ops) - input_to_edge_ops[input_node] = res - - @staticmethod - def _update_subgraph_out( - base_ops: Operation, - out_op_to_edge_ops: Dict[Operation, Set[Operation]], - sub_graph_ops: Set[Operation], - ) -> None: - for output_tensor in base_ops.outputs: - for output_consumer in output_tensor.consumers(): - if output_consumer not in sub_graph_ops: - res = out_op_to_edge_ops.get(output_consumer, set()) - res.add(base_ops) - out_op_to_edge_ops[output_consumer] = res - - @staticmethod - def _upward_bfs_op(base_ops: Union[Operation, Set[Operation], 
List[Operation]], tgt_op_type: str) -> Operation: - if not isinstance(base_ops, (set, list)): - base_ops = [base_ops] - - parent_ops = base_ops - while True: - for parent_op in parent_ops: - if parent_op.type == tgt_op_type: - return parent_op - base_ops = parent_ops - parent_ops = [] - for base_op in base_ops: - parent_ops.extend(utils.find_parent_op(base_op)) - if not parent_ops: - raise ValueError(f"target operation '{tgt_op_type}'' was not found.") - - @staticmethod - def _topo_sort_sliced_ops(sliced_ops: Set[Operation]) -> List[Operation]: - topo_subgraph_list = [] - topo_subgraph_set = set() - start_nodes = set() - [start_nodes.add(x) for x in sliced_ops] - logger.info("Got topo_subgraph start nodes: %s", start_nodes) - - def topo_sort_helper(curr_op, output_list, output_set): - if not isinstance(curr_op, Operation): - raise RuntimeError(f"topo_subgraph_dfs input should be node(aka. tf.Operator). {curr_op}") - curr_inputs = curr_op.inputs - logger.debug("Got topo_dfs: %s <- %s", curr_op.name, [x.name for x in curr_inputs]) - current_control_inputs = curr_op.control_inputs - if len(current_control_inputs) > 0: - raise RuntimeError( - f"control input are not supported: {curr_op.name}, control_inputs: {current_control_inputs}" - ) - if curr_op in output_set: - return - output_set.add(curr_op) - for tensor in curr_inputs: - node = tensor.op - if node.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value and node not in output_set: - topo_sort_helper(node, output_list, output_set) - output_list.append(curr_op) - - [topo_sort_helper(x, topo_subgraph_list, topo_subgraph_set) for x in start_nodes] - if len(topo_subgraph_list) != len(topo_subgraph_set): - raise RuntimeError(f"got duplicated topo node: {sorted(topo_subgraph_list, key=lambda x: x.name)}.") - logger.info("Got topo_subgraph: %s", topo_subgraph_list) - return topo_subgraph_list - - @staticmethod - def _get_mapping_for_subgraph_in( - from_op: Operation, - to_ops: Set[Operation], - tensor_mapping: Union[Dict[Tensor, Tensor], Dict[SparseTensor, SparseTensor]], - ) -> None: - if from_op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - raise RuntimeError(f"expect IteratorGetNext for input tensor of subgraph, but got {from_op}") - for node in to_ops: - for each_tensor in node.inputs: - if each_tensor.op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - continue - old_tensor_name = each_tensor.name - x_index = int(old_tensor_name.split(":")[-1]) - g = tf.compat.v1.get_default_graph() - arg_tensor = g.get_tensor_by_name("args_%d:0" % x_index) - tensor_mapping[each_tensor] = arg_tensor - - @staticmethod - def _get_mapping_for_subgraph( - old_op: Operation, - node_mapping: Dict[Operation, Operation], - tensor_mapping: Dict[Tensor, Tensor], - ) -> None: - logger.debug("old operation name: %s\nold operation inputs: %s\n", old_op.name, [x for x in old_op.inputs]) - - for each_tensor in old_op.inputs: - if each_tensor not in tensor_mapping: - raise RuntimeError( - f"each_tensor(input) {each_tensor} need by {old_op.name} not in tensor_mapping.{tensor_mapping}" - ) - new_inputs = NoGradSubgraphSlicer._get_mapped_tensor(tensor_mapping, old_op.inputs) - - node_def = old_op.node_def - node_def.name = "{}/{}".format(NoGradSubgraphSlicer._SLICED_OP_NAME_PREFIX, node_def.name) - new_node = tf.Operation(node_def=node_def, g=tf.compat.v1.get_default_graph(), inputs=new_inputs) - - node_mapping[old_op] = new_node - for old_out_tensor, new_out_tensor in zip(old_op.outputs, new_node.outputs): - tensor_mapping[old_out_tensor] = new_out_tensor - - 
@staticmethod - def _get_mapped_tensor(tensor2tensor: Dict[Tensor, Tensor], keys: List[Tensor]) -> List[Tensor]: - tensors = [] - for k in keys: - if k not in tensor2tensor: - raise KeyError(f"failed to find key tensor: {k} from tensor map: {tensor2tensor}.") - tensors.append(tensor2tensor[k]) - return tensors - - @staticmethod - def _sort_sliced_graph_outputs(subgraph_out: Dict[Operation, Set[Operation]]) -> List[Tensor]: - extra_outputs = [] - sorted_outputs = sorted(subgraph_out.items(), key=lambda x: x[0].name) - for outside_op, edge_ops in sorted_outputs: - outside_op_inputs = set(outside_op.inputs) - for edge_op in edge_ops: - NoGradSubgraphSlicer._add_sorted_additional_tensors(extra_outputs, outside_op_inputs, edge_op) - return extra_outputs - - @staticmethod - def _add_sorted_additional_tensors(extra_outputs, outside_op_inputs, edge_op) -> None: - for each_tensor in sorted(edge_op.outputs, key=lambda x: x.name): - if each_tensor not in outside_op_inputs: - continue - if each_tensor in extra_outputs: - continue - extra_outputs.append(each_tensor) - - @staticmethod - def _get_tensor_consumers(tensor: Tensor) -> List[Operation]: - if not isinstance(tensor, NoGradSubgraphSlicer._VALID_TENSOR_CLASS): - raise RuntimeError(f"expected 'tf.Tensor' or 'tf.SparseTensor', but got: {tensor}") - - graph = tensor.graph - consumers = [] - consumer_names = [op.name for op in tensor.consumers()] - - with graph._lock: - for name in consumer_names: - if name not in graph._nodes_by_name: # ignore deleted node - continue - consumers.append(graph._nodes_by_name[name]) - - return consumers - @para_checker_decorator( check_option_list=[ -- Gitee From 9266d4b823e83a591ce76b31c5bef9bec6aed329 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 6 May 2024 11:23:24 +0800 Subject: [PATCH 085/302] reset key_process --- src/core/key_process/key_process.cpp | 396 +++++++++++++-------------- src/core/key_process/key_process.h | 372 ++++++++++++------------- 2 files changed, 384 insertions(+), 384 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 7ba9106d..b2dfab04 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -15,21 +15,19 @@ See the License for the specific language governing permissions and #include "key_process.h" -#include - #include #include - -#include "emb_table/embedding_mgmt.h" -#include "hd_transfer/hd_transfer.h" -#include "host_emb/host_emb.h" -#include "ock_ctr_common/include/error_code.h" +#include #include "utils/common.h" -#include "utils/config.h" #include "utils/logger.h" #include "utils/safe_queue.h" #include "utils/singleton.h" #include "utils/time_cost.h" +#include "utils/config.h" +#include "host_emb/host_emb.h" +#include "emb_table/embedding_mgmt.h" +#include "hd_transfer/hd_transfer.h" +#include "ock_ctr_common/include/error_code.h" using namespace std; using namespace chrono; @@ -43,14 +41,15 @@ void KeyProcess::SetupHotEmbUpdateStep() } bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues, int seed) + const vector& thresholdValues, + int seed) { this->rankInfo = rInfo; - + SetupHotEmbUpdateStep(); - + map scInfo; - for (const auto& info : eInfos) { + for (const auto& info: eInfos) { embInfos[info.name] = info; scInfo[info.name] = info.sendCount; InitHotEmbTotCount(info, rInfo); @@ -64,8 +63,8 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos LOG_INFO(KEY_PROCESS "hot emb count info:{}", 
MapToString(hotEmbTotCount)); MPI_Group worldGroup; MPI_Comm_group(MPI_COMM_WORLD, &worldGroup); - for (auto& i : comm) { - for (auto& j : i) { + for (auto& i: comm) { + for (auto& j: i) { MPI_Comm_create(MPI_COMM_WORLD, worldGroup, &j); } } @@ -87,8 +86,8 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos } } - LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}", MapToString(scInfo), - rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); + LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}", + MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); #ifndef GTEST Start(); #endif @@ -102,7 +101,7 @@ int KeyProcess::Start() // 0 1 2 3 4 5 0 1 2 3 4 5 // | rank0 | | rank1 | // each rank creates KEY_PROCESS_THREAD threads, each thread process one batchdata - LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 + LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 auto fn = [this](int channel, int threadId) { #ifndef GTEST auto ret = aclrtSetDevice(static_cast(rankInfo.deviceId)); @@ -116,7 +115,7 @@ int KeyProcess::Start() } else { KeyProcessTask(channel, threadId); } - }; // for clean code + }; // for clean code int threadNum = GetThreadNumEnv(); for (int channel = 0; channel < MAX_CHANNEL_NUM; ++channel) { LOG_INFO(KEY_PROCESS "key process thread num: {}", threadNum); @@ -180,7 +179,7 @@ void KeyProcess::Destroy() { isRunning = false; LOG_INFO(KEY_PROCESS "rankId:{} KeyProcess begin destroy.", rankInfo.rankId); - for (auto& i : procThreads) { + for (auto& i: procThreads) { i->join(); } procThreads.clear(); @@ -190,8 +189,8 @@ void KeyProcess::Destroy() /// 每个数据通道的所有数据处理线程上锁 void KeyProcess::LoadSaveLock() { - for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].lock(); } } @@ -200,8 +199,8 @@ void KeyProcess::LoadSaveLock() /// 每个数据通道的所有数据处理线程释放锁 void KeyProcess::LoadSaveUnlock() { - for (int channelId{0}; channelId < MAX_CHANNEL_NUM; ++channelId) { - for (int threadId{0}; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { + for (int channelId { 0 }; channelId < MAX_CHANNEL_NUM; ++channelId) { + for (int threadId { 0 }; threadId < MAX_KEY_PROCESS_THREAD; ++threadId) { loadSaveMut[channelId][threadId].unlock(); } } @@ -228,7 +227,7 @@ void KeyProcess::GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf) } void KeyProcess::InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, - const unique_ptr& batch, ock::ctr::UniquePtr& unique) + const unique_ptr & batch, ock::ctr::UniquePtr& unique) { uniqueConf.desiredSize = static_cast(batch->Size()); if (preBatchSize != batch->Size()) { @@ -270,7 +269,7 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -284,20 +283,21 @@ void KeyProcess::KeyProcessTaskWithFastUnique(int channel, int threadId) } LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process with fast unique 
cost:{}," " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", - getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, - batch->channel, threadId, batch->batchId); + getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, + batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); auto batchQueue = SingletonQueue::GetInstances(queueIndex); batchQueue->PutDirty(move(batch)); } unique->UnInitialize(); - } catch (const EndRunExit& e) { + } catch (const EndRunExit &e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } - LOG_INFO(KEY_PROCESS "KeyProcessTaskWithFastUnique exit. rank:{} channelId:{}, threadId:{}", rankInfo.rankId, - channel, threadId); + LOG_INFO(KEY_PROCESS "KeyProcessTaskWithFastUnique exit. rank:{} channelId:{}, threadId:{}", + rankInfo.rankId, channel, threadId); } + void KeyProcess::KeyProcessTask(int channel, int threadId) { unique_ptr batch; @@ -305,7 +305,7 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) while (true) { TimeCost getAndProcessTC; TimeCost getBatchDataTC; - batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue + batch = GetBatchData(channel, threadId); // get batch data from SingletonQueue LOG_DEBUG("getBatchDataTC(ms):{}", getBatchDataTC.ElapsedMS()); if (batch == nullptr) { break; @@ -318,27 +318,28 @@ void KeyProcess::KeyProcessTask(int channel, int threadId) } LOG_INFO(KEY_PROCESS "getAndProcessTC(ms):{}, key process cost:{}," " get data time(ms):{}, batch name:{}, channelId:{}, threadId:{}, batchId:{}", - getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, batch->name, - batch->channel, threadId, batch->batchId); + getAndProcessTC.ElapsedMS(), processDataTime.ElapsedMS(), getBatchTime, + batch->name, batch->channel, threadId, batch->batchId); int queueIndex = threadId + (MAX_KEY_PROCESS_THREAD * batch->channel); auto batchQueue = SingletonQueue::GetInstances(queueIndex); batchQueue->PutDirty(move(batch)); } - } catch (const EndRunExit& e) { + } catch (const EndRunExit &e) { LOG_INFO(KEY_PROCESS "channel: {}, thread: {}, abort run: {}", channel, threadId, e.what()); } LOG_INFO(KEY_PROCESS "KeyProcessTask exit. 
rank:{} channelId:{}, threadId:{}", rankInfo.rankId, channel, threadId); } -void KeyProcess::HashSplitHelper(const unique_ptr& batch, vector& splitKeys, vector& restore, - vector& hotPos, vector>& keyCount) +void KeyProcess::HashSplitHelper(const unique_ptr & batch, vector & splitKeys, + vector & restore, vector & hotPos, + vector >& keyCount) { TimeCost uniqueTc; if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE) { - tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, keyCount) = HashSplitWithFAAE(batch); // 按存储dev id切分并去重 } else { - tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 + tie(splitKeys, restore, hotPos) = HotHashSplit(batch); // 按存储dev id切分并去重 } LOG_DEBUG("uniqueTc(ms):{}", uniqueTc.ElapsedMS()); } @@ -348,7 +349,7 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch { // tuple for keyRec restore hotPos scAll countRecv isWithFAAE = m_featureAdmitAndEvict.GetFunctionSwitch() && - FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; + FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE; TimeCost totalTimeCost = TimeCost(); TimeCost fastUniqueTC; UniqueInfo uniqueInfo; @@ -356,11 +357,12 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch LOG_DEBUG("ProcessBatchWithFastUnique(ms):{}", fastUniqueTC.ElapsedMS()); // 特征准入&淘汰 - const auto errStatus = FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR; - if (isWithFAAE && (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, uniqueInfo.all2AllInfo.keyRecv, - uniqueInfo.all2AllInfo.countRecv) == errStatus)) { - LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, - threadId, channel); + if (isWithFAAE && + (m_featureAdmitAndEvict.FeatureAdmit( + channel, batch, uniqueInfo.all2AllInfo.keyRecv, uniqueInfo.all2AllInfo.countRecv) == + FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", + rankInfo.rankId, threadId, channel); return false; } std::lock_guard lock(loadSaveMut[channel][threadId]); @@ -374,27 +376,25 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch LOG_DEBUG("key2OffsetTC(ms):{}", key2OffsetTC.ElapsedMS()); } // Static all2all,need send count - if (!rankInfo.useStatic) { - SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); - } + if (!rankInfo.useStatic) { SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); } auto tensors = make_unique>(); tensors->push_back(Vec2TensorI32(uniqueInfo.restore)); uniqueInfo.hotPos.resize(hotEmbTotCount[batch->name], -1); tensors->push_back(Vec2TensorI32(uniqueInfo.hotPos)); - + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); - tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) - : Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); + tensors->push_back(rankInfo.useDynamicExpansion ? 
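// FeatureAdmit above gates keys before they may allocate embedding rows. The exact policy
// lives in FeatureAdmitAndEvict; the sketch below shows only the count-threshold idea it
// is built around (admit a key once its cumulative frequency reaches the table's
// threshold), using std::unordered_map in place of the real state:
#include <cstdint>
#include <unordered_map>
#include <vector>

std::vector<bool> AdmitByCount(const std::vector<int64_t>& keys,
                               const std::vector<uint32_t>& counts,
                               std::unordered_map<int64_t, uint64_t>& cumulative,
                               uint64_t threshold)
{
    std::vector<bool> admitted(keys.size(), false);
    for (size_t i = 0; i < keys.size(); ++i) {
        cumulative[keys[i]] += counts[i];          // counts come from the all2all exchange
        admitted[i] = cumulative[keys[i]] >= threshold;
    }
    return admitted;
}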
Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) : + Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); } TimeCost pushResultTC; PushResult(batch, move(tensors), uniqueInfo.all2AllInfo.keyRecv); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", channel, - batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", + channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); return true; @@ -422,10 +422,10 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, // 特征准入&淘汰 if (m_featureAdmitAndEvict.GetFunctionSwitch() && FeatureAdmitAndEvict::m_embStatus[batch->name] != SingleEmbTableStatus::SETS_NONE && - (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, countRecv) == - FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { - LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", rankInfo.rankId, - threadId, channel); + (m_featureAdmitAndEvict.FeatureAdmit(channel, batch, lookupKeys, + countRecv) == FeatureAdmitReturnType::FEATURE_ADMIT_RETURN_ERROR)) { + LOG_ERROR(KEY_PROCESS "rank:{} thread:{}, channel:{}, Feature-admit-and-evict error ...", + rankInfo.rankId, threadId, channel); return false; } @@ -436,9 +436,7 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, } // Static all2all,need send count - if (!rankInfo.useStatic) { - SendA2A(scAll, batch->name, batch->channel, batch->batchId); - } + if (!rankInfo.useStatic) { SendA2A(scAll, batch->name, batch->channel, batch->batchId); } TimeCost pushResultTC; auto tensors = make_unique>(); @@ -446,7 +444,7 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, hotPos.resize(hotEmbTotCount[batch->name], 0); tensors->push_back(Vec2TensorI32(hotPos)); - + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); tensors->push_back(rankInfo.useDynamicExpansion ? 
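// Vec2TensorI32 / Vec2TensorI64 (defined elsewhere in the repo) wrap host vectors into
// tensorflow::Tensor objects for the restore and hot-position outputs. One plausible shape
// of that helper, assuming the TensorFlow C++ API; the real implementation may differ:
#include <cstdint>
#include <vector>
#include "tensorflow/core/framework/tensor.h"

tensorflow::Tensor Vec2TensorI32Sketch(const std::vector<int32_t>& values)
{
    tensorflow::Tensor tensor(tensorflow::DT_INT32,
                              tensorflow::TensorShape({static_cast<int64_t>(values.size())}));
    auto flat = tensor.flat<int32_t>();
    for (size_t i = 0; i < values.size(); ++i) {
        flat(i) = values[i];                     // copy element-wise into the tensor buffer
    }
    return tensor;
}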
Vec2TensorI64(lookupKeys) : Vec2TensorI32(lookupKeys)); @@ -455,8 +453,8 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, PushResult(batch, move(tensors), lookupKeys); LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", channel, batch->batchId, - rankInfo.rankId, totalTimeCost.ElapsedMS()); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}", + channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } return true; } @@ -482,27 +480,27 @@ vector KeyProcess::GetCountRecv(const unique_ptr& batch, in { TimeCost getCountRecvTC; if (rankInfo.useStatic) { - for (auto& cnt : keyCount) { + for (auto& cnt: keyCount) { cnt.resize(embInfos[batch->name].sendCount, 0); } } vector countSend; - for (auto& cnt : keyCount) { + for (auto& cnt: keyCount) { countSend.insert(countSend.cend(), cnt.cbegin(), cnt.cend()); } vector sc; for (int i = 0; i < rankInfo.rankSize; ++i) { sc.push_back(scAll.at(rankInfo.rankSize * rankInfo.rankId + i)); } - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 vector countRecv; countRecv.resize(rs.back() + rc.back()); - int retCode = MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), rc.data(), - rs.data(), MPI_UINT32_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(countSend.data(), sc.data(), ss.data(), MPI_UINT32_T, countRecv.data(), + rc.data(), rs.data(), MPI_UINT32_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -510,7 +508,8 @@ vector KeyProcess::GetCountRecv(const unique_ptr& batch, in return countRecv; } -void KeyProcess::PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys) +void KeyProcess::PushResult(unique_ptr& batch, unique_ptr> tensors, + KeysT& lookupKeys) { std::unique_lock lockGuard(mut); storage.push_front(move(tensors)); @@ -545,8 +544,7 @@ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const if (tc.ElapsedSec() > GET_BATCH_TIMEOUT) { if (commId == 0) { LOG_WARN(KEY_PROCESS "getting batch timeout! 1. check last 'read batch cost' print. 
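// GetCountRecv above reads its receive counts out of scAll, the gathered rankSize x
// rankSize communication matrix: entry [i * rankSize + j] is how many items rank i sends
// to rank j, so a rank's send counts are its row and its receive counts are its column.
// That extraction in isolation:
#include <vector>

std::vector<int> ReceiveCounts(const std::vector<int>& scAll, int rankSize, int myRank)
{
    std::vector<int> rc(rankSize);
    for (int sender = 0; sender < rankSize; ++sender) {
        rc[sender] = scAll[sender * rankSize + myRank];  // column myRank
    }
    return rc;
}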
" - "channel[{}] commId[{}]", - channel, commId); + "channel[{}] commId[{}]", channel, commId); } this_thread::sleep_for(seconds(1)); tc = TimeCost(); @@ -570,7 +568,7 @@ unique_ptr KeyProcess::GetBatchData(int channel, int commId) const return batch; } -size_t KeyProcess::GetKeySize(const unique_ptr& batch) +size_t KeyProcess::GetKeySize(const unique_ptr &batch) { size_t size = rankInfo.rankSize * embInfos[batch->name].sendCount; if (!rankInfo.useStatic) { @@ -579,8 +577,8 @@ size_t KeyProcess::GetKeySize(const unique_ptr& batch) return size; } -void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, int id, - UniqueInfo& uniqueInfoOut) +void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr &batch, ock::ctr::UniquePtr& unique, + int id, UniqueInfo& uniqueInfoOut) { EASY_FUNCTION(profiler::colors::Purple) EASY_VALUE("batchId", batch->batchId) @@ -599,10 +597,10 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniqueIn uniqueIn; uniqueIn.inputIdCnt = static_cast(batch->Size()); - uniqueIn.inputId = reinterpret_cast(batch->sample.data()); + uniqueIn.inputId = reinterpret_cast(batch->sample.data()); ock::ctr::EnhancedUniqueOut uniqueOut; - uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); + uniqueOut.uniqueId = reinterpret_cast(keySendInfo.keySend.data()); uniqueOut.index = reinterpret_cast(uniqueInfoOut.restore.data()); if (rankInfo.useStatic) { uniqueOut.idCnt = idCount.data(); @@ -611,7 +609,7 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, uniqueOut.idCnt = keySendInfo.keyCount.data(); } uniqueOut.uniqueIdCntInBucket = splitSize.data(); - uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); + uniqueOut.uniqueIdInBucket = reinterpret_cast(uniqueVector.data()); uniqueOut.uniqueIdCnt = 0; int ret = unique->DoEnhancedUnique(uniqueIn, uniqueOut); @@ -627,18 +625,18 @@ void KeyProcess::ProcessBatchWithFastUnique(const unique_ptr& batch, All2All(sc, id, batch, keySendInfo, uniqueInfoOut.all2AllInfo); LOG_DEBUG(KEY_PROCESS "ProcessBatchWithFastUnique get batchId:{}, batchSize:{}," - " channel:{}, name:{}, restore:{}, keyCount:{}", - batch->batchId, batch->Size(), batch->channel, batch->name, uniqueInfoOut.restore.size(), - keySendInfo.keyCount.size()); + " channel:{}, name:{}, restore:{}, keyCount:{}", + batch->batchId, batch->Size(), batch->channel, batch->name, + uniqueInfoOut.restore.size(), keySendInfo.keyCount.size()); if (GlogConfig::gStatOn) { LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} " - "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueOut.uniqueIdCnt); + "batch_key_num_with_fast_unique {} unique_key_num_with_fast_unique {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueOut.uniqueIdCnt); } } -void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, +void KeyProcess::HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, KeySendInfo& keySendInfo, vector& sc, vector& splitSize) { std::shared_lock lock(g_smut); @@ -652,8 +650,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, Uniqu TimeCost computeHotTc; ComputeHotPos(batch, hotMap, uniqueInfoOut.hotPos, uniqueInfoOut.restore, hotOffset); LOG_DEBUG("ComputeHot TimeCost(ms):{}", computeHotTc.ElapsedMS()); - UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, hotOffset, batch->batchId % 
hotEmbUpdateStep == 0, - batch->name); + UpdateHotMapForUnique(keySendInfo.keySend, keySendInfo.keyCount, + hotOffset, batch->batchId % hotEmbUpdateStep == 0, batch->name); if (rankInfo.useStatic) { sc.resize(rankInfo.rankSize, embInfos[batch->name].sendCount); @@ -665,8 +663,8 @@ void KeyProcess::HandleHotAndSendCount(const unique_ptr& batch, Uniqu } } -void KeyProcess::ComputeHotPos(const unique_ptr& batch, absl::flat_hash_map& hotMap, - vector& hotPos, vector& restore, const int hotOffset) const +void KeyProcess::ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, + vector &hotPos, vector &restore, const int hotOffset) const { emb_key_t* inputData = batch->sample.data(); size_t miniBs = batch->Size(); @@ -689,27 +687,27 @@ void KeyProcess::ComputeHotPos(const unique_ptr& batch, absl::flat_ha } } -void KeyProcess::All2All(vector& sc, int id, const unique_ptr& batch, KeySendInfo& keySendInfo, +void KeyProcess::All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, All2AllInfo& all2AllInfoOut) { TimeCost getScAllTC; int channel = batch->channel; - GetScAllForUnique(sc, id, batch, - all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) + GetScAllForUnique(sc, id, batch, all2AllInfoOut.scAll); // Allgather通信获取所有(不同rank相同thread id的) LOG_DEBUG("GetScAll TimeCost(ms):{}", getScAllTC.ElapsedMS()); TimeCost all2allTC; - vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc(rankInfo.rankSize); // receive count + vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 + vector rc(rankInfo.rankSize); // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc[i] = all2AllInfoOut.scAll.at(i * rankInfo.rankSize + rankInfo.rankId); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 all2AllInfoOut.keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") int retCode = MPI_Alltoallv(keySendInfo.keySend.data(), sc.data(), ss.data(), MPI_INT64_T, - all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm[channel][id]); + all2AllInfoOut.keyRecv.data(), rc.data(), rs.data(), + MPI_INT64_T, comm[channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -717,64 +715,65 @@ void KeyProcess::All2All(vector& sc, int id, const unique_ptr& b all2AllInfoOut.countRecv.resize(rs.back() + rc.back()); if (isWithFAAE) { retCode = MPI_Alltoallv(keySendInfo.keyCount.data(), sc.data(), ss.data(), MPI_UINT32_T, - all2AllInfoOut.countRecv.data(), rc.data(), rs.data(), MPI_UINT32_T, comm[channel][id]); + all2AllInfoOut.countRecv.data(), rc.data(), + rs.data(), MPI_UINT32_T, comm[channel][id]); if (retCode != MPI_SUCCESS) { - LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", channel, id, batch->batchId, - retCode); + LOG_ERROR("channelId:{} threadId:{} batchId:{}, MPI_Alltoallv failed:{}", + channel, id, batch->batchId, retCode); } } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC TimeCost(ms):{}", channel, id, - batch->batchId, all2allTC.ElapsedMS()); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, All2All end, all2allTC TimeCost(ms):{}", + channel, id, batch->batchId, all2allTC.ElapsedMS()); EASY_END_BLOCK } -auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, vector& splitKeys) - -> tuple, vector> +auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, + vector& 
splitKeys) -> tuple, vector> { TimeCost processSplitKeysTC; EASY_FUNCTION(profiler::colors::Purple) EASY_VALUE("batchId", batch->batchId) - LOG_INFO(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, ProcessSplitKeys start.", batch->channel, id, - batch->batchId); + LOG_INFO(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, ProcessSplitKeys start.", + batch->channel, id, batch->batchId); // 使用静态all2all通信:发送或接受量为预置固定值 scInfo[batch->name] = 65536 / rankSize 经验值 - if (rankInfo.useStatic) { // maybe move after all2all - for (KeysT& i : splitKeys) { + if (rankInfo.useStatic) { // maybe move after all2all + for (KeysT& i: splitKeys) { if (static_cast(i.size()) > embInfos[batch->name].sendCount) { - LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", batch->name, batch->channel, - batch->batchId, i.size()); - throw runtime_error(StringFormat("%s[%d]:%d overflow! set send count bigger than %d", - batch->name.c_str(), batch->channel, batch->batchId, i.size()) - .c_str()); + LOG_ERROR("{}[{}]:{} overflow! set send count bigger than {}", + batch->name, batch->channel, batch->batchId, i.size()); + throw runtime_error( + StringFormat("%s[%d]:%d overflow! set send count bigger than %d", + batch->name.c_str(), batch->channel, batch->batchId, i.size()).c_str()); } i.resize(embInfos[batch->name].sendCount, -1); } } KeysT keySend; - vector sc; // send count - for (const auto& i : splitKeys) { + vector sc; // send count + for (const auto& i: splitKeys) { sc.push_back(static_cast(i.size())); keySend.insert(keySend.cend(), i.cbegin(), i.cend()); } KeysT keyRecv; TimeCost getScAllTC; - vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 + vector scAll = GetScAll(sc, id, batch); // Allgather通信获取所有(不同rank相同thread id的)线程间通信量矩阵 LOG_DEBUG("getScAllTC(ms)(AllReduce-AllGather):{}", getScAllTC.ElapsedMS()); vector ss = Count2Start(sc); // send displays/offset 发送数据的起始偏移量 - vector rc; // receive count + vector rc; // receive count for (int i = 0; i < rankInfo.rankSize; ++i) { // 通信量矩阵某一列的和即为本地要从其他设备接受的key数据量 rc.push_back(scAll.at(i * rankInfo.rankSize + rankInfo.rankId)); } - vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 + vector rs = Count2Start(rc); // receive displays/offset 接受数据的起始偏移量 keyRecv.resize(rs.back() + rc.back()); EASY_BLOCK("all2all") TimeCost uniqueAll2AllTC; - int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, keyRecv.data(), rc.data(), rs.data(), - MPI_INT64_T, comm[batch->channel][id]); + int retCode = MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T, + keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm[batch->channel][id]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Alltoallv failed:{}", rankInfo.rankId, retCode); } @@ -783,8 +782,8 @@ auto KeyProcess::ProcessSplitKeys(const unique_ptr& batch, int id, ve EASY_END_BLOCK LOG_DEBUG(KEY_PROCESS "channelId:{} threadId:{} batchId:{}, batchName:{}, MPI_Alltoallv finish." 
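// ProcessSplitKeys and All2All above both use the classic counts-then-payload exchange:
// publish per-peer send counts with MPI_Allgather, turn counts into displacements via an
// exclusive prefix sum (what Count2Start does), then move the keys with MPI_Alltoallv.
// A self-contained sketch of one such round:
#include <mpi.h>
#include <cstdint>
#include <vector>

std::vector<int64_t> ExchangeKeys(const std::vector<int64_t>& keySend,
                                  const std::vector<int>& sc, int rankSize, int myRank,
                                  MPI_Comm comm)
{
    std::vector<int> scAll(rankSize * rankSize);
    MPI_Allgather(sc.data(), rankSize, MPI_INT, scAll.data(), rankSize, MPI_INT, comm);

    std::vector<int> ss(rankSize, 0);             // send displacements
    std::vector<int> rc(rankSize), rs(rankSize, 0);
    for (int i = 0; i < rankSize; ++i) {
        rc[i] = scAll[i * rankSize + myRank];     // what every peer sends to me
    }
    for (int i = 1; i < rankSize; ++i) {
        ss[i] = ss[i - 1] + sc[i - 1];            // exclusive prefix sums
        rs[i] = rs[i - 1] + rc[i - 1];
    }
    std::vector<int64_t> keyRecv(rs.back() + rc.back());
    MPI_Alltoallv(keySend.data(), sc.data(), ss.data(), MPI_INT64_T,
                  keyRecv.data(), rc.data(), rs.data(), MPI_INT64_T, comm);
    return keyRecv;
}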
" processSplitKeysTC(ms):{}", - batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); - return {keyRecv, scAll, ss}; + batch->channel, id, batch->batchId, batch->name, processSplitKeysTC.ElapsedMS()); + return { keyRecv, scAll, ss }; } /* @@ -799,8 +798,8 @@ tuple, vector> KeyProcess::HashSplit(const unique_ptrSize(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -808,9 +807,9 @@ tuple, vector> KeyProcess::HashSplit(const unique_ptrsecond; } } @@ -823,10 +822,10 @@ tuple, vector> KeyProcess::HashSplit(const unique_ptrchannel, - batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} unique_key_num {}", + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return {splitKeys, restore}; + return { splitKeys, restore }; } void KeyProcess::PaddingAlltoallVC(vector& splitKeys) const @@ -848,10 +847,10 @@ tuple, vector, vector>> KeyProcess::Hash emb_key_t* batchData = batch->sample.data(); size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); - vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 + vector> keyCount(rankInfo.rankSize); // splitKeys在原始batch中对应的频次 vector restore(batch->Size()); - vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 - absl::flat_hash_map> uKey; // 用于去重查询 + vector hashSplitLens(rankInfo.rankSize); // 初始化全0,记录每个桶的长度 + absl::flat_hash_map> uKey; // 用于去重查询 EASY_BLOCK("split push back") for (size_t i = 0; i < miniBs; i++) { const emb_key_t& key = batchData[i]; @@ -859,10 +858,10 @@ tuple, vector, vector>> KeyProcess::Hash auto result = uKey.find(key); if (result == uKey.end()) { splitKeys[devId].push_back(key); - restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) + restore[i] = hashSplitLens[devId]++; // restore记录去重后key在桶内偏移量(用于计算恢复向量) uKey[key].first = restore[i]; uKey[key].second = 1; - } else { // 去重 + } else { // 去重 restore[i] = result->second.first; uKey[key].second++; } @@ -889,9 +888,9 @@ tuple, vector, vector>> KeyProcess::Hash uniqueKeyNum += splitKeys[devId].size(); } LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} faae_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } - return {splitKeys, restore, keyCount}; + return { splitKeys, restore, keyCount }; } tuple, vector, vector> KeyProcess::HotHashSplit(const unique_ptr& batch) @@ -901,7 +900,7 @@ tuple, vector, vector> KeyProcess::HotHashSplit(cons size_t miniBs = batch->Size(); vector splitKeys(rankInfo.rankSize); vector restore(batch->Size()); - absl::flat_hash_map uKey; // 用于去重查询 + absl::flat_hash_map uKey; // 用于去重查询 absl::flat_hash_map keyCountMapByEmbName; std::shared_lock lock(g_smut); auto hotMap = hotKey[batch->name]; @@ -910,31 +909,31 @@ tuple, vector, vector> KeyProcess::HotHashSplit(cons vector hotPosDev(hotEmbTotCount[batch->name]); int hotCount = 0; int hotOffset = hotEmbTotCount[batch->name]; - for (size_t i = 0; i < miniBs; i++) { // for mini batch + for (size_t i = 0; i < miniBs; i++) { // for mini batch const emb_key_t& key = 
batchData[i]; if (batch->batchId % hotEmbUpdateStep == 0) { keyCountMapByEmbName[key]++; } emb_key_t devId = abs(key % static_cast(rankInfo.rankSize)); auto result = uKey.find(key); - if (result != uKey.end()) { // // already in splitKeys + if (result != uKey.end()) { // // already in splitKeys restore[i] = result->second; continue; } // new key in current batch - splitKeys[devId].push_back(key); // push to bucket + splitKeys[devId].push_back(key); // push to bucket auto hot = hotMap.find(key); - if (hot != hotMap.end()) { // is hot key - if (hot->second == -1) { // is new hot key in this batch + if (hot != hotMap.end()) { // is hot key + if (hot->second == -1) { // is new hot key in this batch // pos in lookup vec (need add ss) for hot-gather hotPos[hotCount] = static_cast(splitKeys[devId].size()) - 1; - hotPosDev[hotCount] = devId; // which dev, for get ss + hotPosDev[hotCount] = devId; // which dev, for get ss hot->second = hotCount; - restore[i] = hotCount++; // get pos of hot emb + restore[i] = hotCount++; // get pos of hot emb } else { restore[i] = hot->second; } - } else { // is not hot key + } else { // is not hot key // restore记录去重后key在桶内偏移量(用于计算恢复向量) restore[i] = static_cast(splitKeys[devId].size() + (hotOffset - 1)); } @@ -947,20 +946,20 @@ tuple, vector, vector> KeyProcess::HotHashSplit(cons uniqueKeyNum += splitKeys[devId].size(); } LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} batch_key_num {} hot_unique_key_num {}", - batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); + batch->channel, batch->batchId, rankInfo.rankId, batch->Size(), uniqueKeyNum); } UpdateHotMap(keyCountMapByEmbName, hotEmbTotCount[batch->name], batch->batchId % hotEmbUpdateStep == 0, batch->name); AddCountStartToHotPos(splitKeys, hotPos, hotPosDev, batch); - return {splitKeys, restore, hotPos}; + return { splitKeys, restore, hotPos }; } void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, const unique_ptr& batch) { vector splitKeysSize; - for (auto& splitKey : splitKeys) { + for (auto& splitKey: splitKeys) { int tmp = rankInfo.useStatic ? 
embInfos[batch->name].sendCount : static_cast(splitKey.size()); splitKeysSize.push_back(tmp); } @@ -971,13 +970,13 @@ void KeyProcess::AddCountStartToHotPos(vector& splitKeys, vector& ho } } -void KeyProcess::UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, uint32_t count, - bool refresh, const string& embName) +void KeyProcess::UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, + uint32_t count, bool refresh, const string& embName) { auto& hotMap = hotKey[embName]; if (refresh) { priority_queue> pq; - for (size_t i = 0; i < keySend.size(); ++i) { + for (size_t i = 0;i < keySend.size(); ++i) { if (keySend[i] == -1) { continue; } @@ -1003,8 +1002,8 @@ void KeyProcess::UpdateHotMap(absl::flat_hash_map& keyCountMapBy return; } auto& hotMap = hotKey[embName]; - priority_queue> pq; // top k key - for (auto& p : keyCountMapByEmbName) { + priority_queue> pq; // top k key + for (auto& p: keyCountMapByEmbName) { pq.push(pair(-p.second, p.first)); if (pq.size() > count) { pq.pop(); @@ -1031,40 +1030,42 @@ vector KeyProcess::GetScAll(const vector& keyScLocal, int commId, cons LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll start.", batch->channel, commId, batch->batchId); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAll.data(), rankInfo.rankSize, - MPI_INT, comm[batch->channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, + scAll.data(), rankInfo.rankSize, MPI_INT, + comm[batch->channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {} commId {}, MPI_Allgather failed:{}", rankInfo.rankId, commId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", batch->channel, - commId, batch->batchId, VectorToString(scAll)); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAll MPI_Allgather end, key scAll matrix:\n{}", + batch->channel, commId, batch->batchId, VectorToString(scAll)); return scAll; } -void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr& batch, - vector& scAllOut) +void KeyProcess::GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, + vector &scAllOut) { EASY_FUNCTION() int channel = batch->channel; scAllOut.resize(rankInfo.rankSize * rankInfo.rankSize); // allgather keyScLocal(key all2all keyScLocal = device all2all rc) - auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, scAllOut.data(), rankInfo.rankSize, - MPI_INT, comm[channel][commId]); + auto retCode = MPI_Allgather(keyScLocal.data(), rankInfo.rankSize, MPI_INT, + scAllOut.data(), rankInfo.rankSize, MPI_INT, + comm[channel][commId]); if (retCode != MPI_SUCCESS) { LOG_ERROR("rank {}, MPI_Allgather failed:{}", rankInfo.rankId, retCode); } - LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", channel, commId, - batch->batchId, VectorToString(scAllOut)); + LOG_DEBUG("channelId:{} threadId:{} batchId:{}, GetScAllForUnique end, key scAllOut matrix:\n{}", + channel, commId, batch->batchId, VectorToString(scAllOut)); } void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel) { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = 
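// UpdateHotMap above keeps the `count` hottest keys with a bounded heap: counts are pushed
// negated, so the top of std::priority_queue (a max-heap) is always the coldest candidate
// and can be popped once the heap exceeds its budget. The same trick in isolation:
#include <cstdint>
#include <queue>
#include <unordered_map>
#include <utility>
#include <vector>

std::vector<int64_t> TopHotKeys(const std::unordered_map<int64_t, int64_t>& keyCount,
                                size_t budget)
{
    std::priority_queue<std::pair<int64_t, int64_t>> pq;  // (negated count, key)
    for (const auto& p : keyCount) {
        pq.push({-p.second, p.first});
        if (pq.size() > budget) {
            pq.pop();                                     // drop the coldest candidate
        }
    }
    std::vector<int64_t> hot;
    while (!pq.empty()) {
        hot.push_back(pq.top().second);
        pq.pop();
    }
    return hot;                                           // order is irrelevant for a set
}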
maxOffset[embName]; + auto& maxOffsetTmp = maxOffset[embName]; auto& evictPos = evictPosMap[embName]; for (long& key : splitKey) { if (key == -1) { @@ -1077,9 +1078,8 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe size_t offset; // 新值, emb有pos可复用 offset = evictPos.back(); - LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse offset [{}], " - "evictSize [{}]!!!", - embName, key, offset, evictPos.size()); + LOG_TRACE("HBM mode, evictPos is not null, name[{}] key [{}] reuse offset [{}], evictSize [{}]!!!", + embName, key, offset, evictPos.size()); key2Offset[key] = offset; key = offset; evictPos.pop_back(); @@ -1097,18 +1097,18 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe LOG_ERROR("dev cache overflow {} > {}", maxOffsetTmp, embInfos[embName].devVocabSize); throw std::runtime_error("dev cache overflow!"); } - LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", embName, maxOffsetTmp, - embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current hbm emb:{}, usage:{}/{} key2OffsetTC({} ms)", + embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel) { TimeCost key2OffsetTC; EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; - auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion + auto& maxOffsetTmp = maxOffset[embName]; + auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion for (long& key : splitKey) { if (key == -1) { key = 0; @@ -1131,8 +1131,8 @@ void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& spli key = 0; } } - LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", embName, maxOffsetTmp, - embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); + LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", + embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } /* @@ -1157,10 +1157,11 @@ void KeyProcess::BuildRestoreVec(const unique_ptr& batch, const vecto hotNum += 1; } } - LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", hotNum, batch->Size(), buildRestoreVecTC.ElapsedMS()); + LOG_DEBUG("hot num in all:{}/{} buildRestoreVecTC(ms):{}", + hotNum, batch->Size(), buildRestoreVecTC.ElapsedMS()); } -template +template T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, int channel) { std::lock_guard lockGuard(mut); @@ -1199,7 +1200,7 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! 
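// Key2Offset above maps each key to a row of the device-side table: known keys reuse their
// row, new keys first recycle slots freed by eviction (evictPos) and only then grow
// maxOffset, and allocation fails hard once the vocabulary is exhausted. That policy in
// miniature:
#include <cstdint>
#include <stdexcept>
#include <unordered_map>
#include <vector>

size_t AssignOffset(int64_t key, std::unordered_map<int64_t, size_t>& key2Offset,
                    std::vector<size_t>& freeSlots, size_t& maxOffset, size_t vocabSize)
{
    auto it = key2Offset.find(key);
    if (it != key2Offset.end()) {
        return it->second;             // known key: reuse its row
    }
    size_t offset;
    if (!freeSlots.empty()) {
        offset = freeSlots.back();     // prefer recycling an evicted row
        freeSlots.pop_back();
    } else if (maxOffset < vocabSize) {
        offset = maxOffset++;          // otherwise take a fresh row
    } else {
        throw std::runtime_error("dev cache overflow!");
    }
    key2Offset.emplace(key, offset);
    return offset;
}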
{}[{}]:{}", - embName, channel, batch); + embName, channel, batch); return {}; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1220,9 +1221,8 @@ KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel) SendEos(batch, channel); return {}; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey " - "batchId: {}.", - embName, channel, batch, readEmbKeyBatchId); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", + embName, channel, batch, readEmbKeyBatchId); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1246,7 +1246,7 @@ void KeyProcess::SendEos(int batchId, int channel) vector tensors; bool isNeedResend = true; - for (const auto& emb : as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 + for (const auto& emb: as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, batchId, emb.first); if (!isRunning) { throw EndRunExit("SendEos end run, isRunning is false after lock destroyMutex."); @@ -1254,7 +1254,7 @@ void KeyProcess::SendEos(int batchId, int channel) for (const string& transName : usedChannelNames) { string sendName = StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); size_t channelSize = 0; - + acltdtQueryChannelSize(transChannels[sendName], &channelSize); LOG_INFO("[EOS] Before send eos, {} contains {}.", sendName, channelSize); SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, isNeedResend); @@ -1301,7 +1301,7 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); if (batch != hybridMgmtBlock->hybridBatchId[channel]) { LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! 
{}[{}]:{}", - embName, channel, batch); + embName, channel, batch); return nullptr; } if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { @@ -1318,17 +1318,15 @@ unique_ptr> KeyProcess::GetInfoVec(int batch, const string& embNa return uTensor; } catch (EmptyList&) { unique_lock lockEosGuard(eosMutex); - // 避免eos在keyProcess还未处理完数据时插队到通道前面, - // readEmbKey真实的次数是readEmbedBatchId减1 + // 避免eos在keyProcess还未处理完数据时插队到通道前面, readEmbKey真实的次数是readEmbedBatchId减1 if (isNeedSendEos[channel] && (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { LOG_INFO("channelId:{} batchId:{}, GetInfoVec eos.", channel, batch); unique_lock lockDestroyGuard(destroyMutex); SendEos(batch, channel); return nullptr; } - LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey " - "batchId: {}.", - embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); + LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", + embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); this_thread::sleep_for(1ms); } catch (WrongListTop&) { LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); @@ -1341,7 +1339,7 @@ void KeyProcess::SendA2A(const vector& a2aInfo, const string& embName, int { // 数据放到队列里,在mgmt里面发送(检查发送数据量) auto tensors = make_unique>(); - Tensor tmpTensor(tensorflow::DT_INT64, {rankInfo.rankSize, rankInfo.rankSize}); + Tensor tmpTensor(tensorflow::DT_INT64, { rankInfo.rankSize, rankInfo.rankSize }); auto tmpData = tmpTensor.matrix(); for (int i = 0; i < rankInfo.rankSize; ++i) { for (int j = 0; j < rankInfo.rankSize; ++j) { @@ -1361,13 +1359,13 @@ int KeyProcess::GetMaxStep(int channelId) const return rankInfo.ctrlSteps.at(channelId); } -void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm +void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm funEvictCall: [{}]! keySize:{}", embName, keys.size()); EmbeddingMgmt::Instance()->EvictKeys(embName, keys); } -void KeyProcess::EvictKeysCombine(const vector& keys) // hbm +void KeyProcess::EvictKeysCombine(const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm combine funEvictCall, keySize:{}", keys.size()); EmbeddingMgmt::Instance()->EvictKeysCombine(keys); @@ -1376,7 +1374,7 @@ void KeyProcess::EvictKeysCombine(const vector& keys) // hbm void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector& keys) { EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD + std::lock_guard lk(mut); // lock for PROCESS_THREAD size_t keySize = keys.size(); auto& devHashMap = keyOffsetMap.at(embName); @@ -1390,7 +1388,7 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vectorsecond; @@ -1404,18 +1402,18 @@ void KeyProcess::EvictDeleteDeviceEmb(const string& embName, const vector offset) { if (offset.size() > embInfos[embName].devVocabSize) { - LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", embName, - offset.size(), embInfos[embName].devVocabSize); + LOG_ERROR("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", + embName, offset.size(), embInfos[embName].devVocabSize); throw runtime_error( - Logger::Format("{} overflow! init evict dev, evictOffset size {} bigger than dev vocabSize {}", embName, - offset.size(), embInfos[embName].devVocabSize) - .c_str()); + Logger::Format("{} overflow! 
init evict dev, evictOffset size {} bigger than dev vocabSize {}", + embName, offset.size(), embInfos[embName].devVocabSize + ).c_str()); } vector tmpDataOut; Tensor tmpData = Vec2TensorI32(offset); tmpDataOut.emplace_back(tmpData); - tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, {1})); + tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); auto evictLen = tmpDataOut.back().flat(); int evictSize = static_cast(offset.size()); @@ -1428,12 +1426,12 @@ void KeyProcess::EvictInitDeviceEmb(const string& embName, vector offset LOG_INFO(KEY_PROCESS "hbm EvictInitDeviceEmb: [{}]! send offsetSize:{}", embName, offset.size()); } -string KeyProcess::DumpSplitKeys(vector>& splitKeys) const +string KeyProcess::DumpSplitKeys(vector> &splitKeys) const { stringstream ssTrace; for (int devId = 0; devId < rankInfo.rankSize; ++devId) { ssTrace << '|' << devId << ":"; - for (auto key : splitKeys[devId]) { + for (auto key: splitKeys[devId]) { ssTrace << key << ','; } ssTrace << '|'; diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h index 4dafc07f..8bd7b8d0 100644 --- a/src/core/key_process/key_process.h +++ b/src/core/key_process/key_process.h @@ -16,281 +16,283 @@ See the License for the specific language governing permissions and #ifndef MX_REC_KEY_PROCESS_H #define MX_REC_KEY_PROCESS_H -#include -#include - +#include #include #include -#include #include #include -#include +#include +#include +#include +#include "ock_ctr_common/include/factory.h" + +#include "utils/common.h" #include "emb_table/emb_table.h" #include "feature_admit_and_evict.h" #include "hybrid_mgmt/hybrid_mgmt_block.h" -#include "ock_ctr_common/include/factory.h" -#include "utils/common.h" #include "utils/singleton.h" namespace MxRec { -using namespace std; + using namespace std; -template -struct Cmp { - bool operator()(const T& a, const T& b) const - { - return get(a) > get(b); // batch id order - } -}; + template + struct Cmp { + bool operator()(const T& a, const T& b) const + { + return get(a) > get(b); // batch id order + } + }; -template -using heap_t = priority_queue, Cmp>; + template + using heap_t = priority_queue, Cmp>; -template -using info_list_t = map, MAX_QUEUE_NUM>>; + template + using info_list_t = map, MAX_QUEUE_NUM>>; -enum class ProcessedInfo { - RESTORE, - ALL2ALL, - INVALID -}; + enum class ProcessedInfo { + RESTORE, + ALL2ALL, + INVALID + }; -class EndRunExit : public std::exception { -public: - explicit EndRunExit(const char* message) : errorMessage(message) {} + class EndRunExit : public std::exception { + public: + explicit EndRunExit(const char* message) : errorMessage(message) {} - const char* what() const noexcept override - { - return errorMessage; - } + const char* what() const noexcept override + { + return errorMessage; + } -private: - const char* errorMessage; -}; + private: + const char* errorMessage; + }; -constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 -constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 + constexpr int MPI_ABNORMAL_SEND_VALUE = 0; // MPI异常通信时发送0 + constexpr int MPI_NORMAL_SEND_VALUE = 1; // MPI正常通信时发送1 -class EmptyList : public std::exception {}; + class EmptyList : public std::exception { + }; -class WrongListTop : public std::exception {}; + class WrongListTop : public std::exception { + }; -class KeyProcess { -public: - bool Initialize(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues = {}, int seed = 0); + class KeyProcess { + public: + bool Initialize(const RankInfo& rInfo, const vector& 
eInfos, + const vector& thresholdValues = {}, int seed = 0); - unique_ptr> GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type); + unique_ptr> GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type); - KeysT GetLookupKeys(int batch, const string& embName, int channel); + KeysT GetLookupKeys(int batch, const string& embName, int channel); - int GetMaxStep(int channelId) const; + int GetMaxStep(int channelId) const; - OffsetMemT GetMaxOffset(); + OffsetMemT GetMaxOffset(); - KeyOffsetMemT GetKeyOffsetMap(); + KeyOffsetMemT GetKeyOffsetMap(); - KeyCountMemT GetKeyCountMap(); + KeyCountMemT GetKeyCountMap(); - FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); + FeatureAdmitAndEvict& GetFeatAdmitAndEvict(); - void LoadMaxOffset(OffsetMemT& loadData); + void LoadMaxOffset(OffsetMemT& loadData); - void LoadKeyOffsetMap(KeyOffsetMemT& loadData); + void LoadKeyOffsetMap(KeyOffsetMemT& loadData); - void LoadKeyCountMap(KeyCountMemT& loadData); + void LoadKeyCountMap(KeyCountMemT& loadData); - void Destroy(); + void Destroy(); - void LoadSaveLock(); + void LoadSaveLock(); - void LoadSaveUnlock(); + void LoadSaveUnlock(); - void EvictKeys(const string& embName, const vector& keys); + void EvictKeys(const string& embName, const vector& keys); - void EvictKeysCombine(const vector& keys); + void EvictKeysCombine(const vector& keys); - void SetupHotEmbUpdateStep(); + void SetupHotEmbUpdateStep(); - int64_t GetExpansionTableSize(const string& embName); + int64_t GetExpansionTableSize(const string& embName); - int64_t GetExpansionTableCapacity(const string& embName); + int64_t GetExpansionTableCapacity(const string& embName); - void RecordKeyCountMap(const unique_ptr& batch); + void RecordKeyCountMap(const unique_ptr& batch); - template - void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) - { - absl::flat_hash_map umap; - restoreVecSec.resize(lookupKeys.size(), -1); - int32_t length = 0; + template + void GlobalUnique(T& lookupKeys, T& uniqueKeys, vector& restoreVecSec) + { + absl::flat_hash_map umap; + restoreVecSec.resize(lookupKeys.size(), -1); + int32_t length = 0; - for (size_t i = 0; i < lookupKeys.size(); ++i) { - int64_t key = lookupKeys[i]; - if (rankInfo.useStatic - && ((!rankInfo.useDynamicExpansion && key == -1) || (rankInfo.useDynamicExpansion && key == 0))) { - continue; - } + for (size_t i = 0; i < lookupKeys.size(); ++i) { + int64_t key = lookupKeys[i]; + if (rankInfo.useStatic && ( + (!rankInfo.useDynamicExpansion && key == -1) || (rankInfo.useDynamicExpansion && key == 0))) { + continue; + } - auto result = umap.find(key); - if (result == umap.end()) { - uniqueKeys.push_back(lookupKeys[i]); - umap[key] = length; - restoreVecSec[i] = length; - length++; - } else { - restoreVecSec[i] = result->second; + auto result = umap.find(key); + if (result == umap.end()) { + uniqueKeys.push_back(lookupKeys[i]); + umap[key] = length; + restoreVecSec[i] = length; + length++; + } else { + restoreVecSec[i] = result->second; + } } - } - if (rankInfo.useStatic) { - if (rankInfo.useDynamicExpansion) { - uniqueKeys.resize(lookupKeys.size(), 0); - } else { - uniqueKeys.resize(lookupKeys.size(), -1); + if (rankInfo.useStatic) { + if (rankInfo.useDynamicExpansion) { + uniqueKeys.resize(lookupKeys.size(), 0); + } else { + uniqueKeys.resize(lookupKeys.size(), -1); + } } } - } - - void SetEos(int status, int channelId); - void SendEos(int batchId, int channel); + void SetEos(int status, int channelId); - bool isRunning{false}; + void SendEos(int batchId, 
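// The contract of GlobalUnique above, checked on a concrete input: first occurrences keep
// their arrival order in uniqueKeys, and restoreVecSec maps every original position to the
// index of its unique key (padding values for the static-shape path are skipped there).
// A standalone illustration of that contract:
#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

int main()
{
    const std::vector<int64_t> lookup = {5, 7, 5, 9, 7};
    std::vector<int64_t> unique;
    std::vector<int32_t> restore(lookup.size(), -1);
    std::unordered_map<int64_t, int32_t> seen;
    for (size_t i = 0; i < lookup.size(); ++i) {
        auto it = seen.find(lookup[i]);
        if (it == seen.end()) {
            seen.emplace(lookup[i], static_cast<int32_t>(unique.size()));
            restore[i] = static_cast<int32_t>(unique.size());
            unique.push_back(lookup[i]);
        } else {
            restore[i] = it->second;
        }
    }
    assert((unique == std::vector<int64_t>{5, 7, 9}));
    assert((restore == std::vector<int32_t>{0, 1, 0, 2, 1}));
    return 0;
}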
int channel); - std::mutex destroyMutex; - std::mutex eosMutex; - inline bool HasEmbName(const string& embName) - { - return embInfos.find(embName) != embInfos.end(); - }; - GTEST_PRIVATE : + bool isRunning { false }; - int - Start(); + std::mutex destroyMutex; + std::mutex eosMutex; + inline bool HasEmbName(const string& embName) + { + return embInfos.find(embName) != embInfos.end(); + }; + GTEST_PRIVATE: - template - T GetInfo(info_list_t& list, int batch, const string& embName, int channel); + int Start(); - RankInfo rankInfo; - map embInfos; - MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; - std::mutex mut{}; - vector> procThreads{}; - std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]{}; - info_list_t lookupKeysList; - list>> storage; - info_list_t infoList; - info_list_t all2AllList; - map maxOffset{}; - map> keyOffsetMap{}; - map> keyCountMap{}; - FeatureAdmitAndEvict m_featureAdmitAndEvict{}; - map> evictPosMap{}; - map> hotKey{}; - map hotEmbTotCount; - map embeddingTableMap{}; - ock::ctr::FactoryPtr factory{}; - int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; - bool isWithFAAE; - bool isNeedSendEos[2] = {0, 0}; // 分别代表通道0、1的eos状态 + template + T GetInfo(info_list_t& list, int batch, const string& embName, int channel); - void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); + RankInfo rankInfo; + map embInfos; + MPI_Comm comm[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD]; + std::mutex mut {}; + vector> procThreads {}; + std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD] {}; + info_list_t lookupKeysList; + list>> storage; + info_list_t infoList; + info_list_t all2AllList; + map maxOffset {}; + map> keyOffsetMap {}; + map> keyCountMap {}; + FeatureAdmitAndEvict m_featureAdmitAndEvict {}; + map> evictPosMap {}; + map> hotKey {}; + map hotEmbTotCount; + map embeddingTableMap {}; + ock::ctr::FactoryPtr factory {}; + int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; + bool isWithFAAE; + bool isNeedSendEos[2] = { 0, 0 }; // 分别代表通道0、1的eos状态 - void KeyProcessTask(int channel, int threadId); + void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); - void KeyProcessTaskWithFastUnique(int channel, int threadId); + void KeyProcessTask(int channel, int threadId); - bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); + void KeyProcessTaskWithFastUnique(int channel, int threadId); - bool KeyProcessTaskHelperWithFastUnique(unique_ptr& batch, ock::ctr::UniquePtr& unique, int channel, - int threadId); + bool KeyProcessTaskHelper(unique_ptr& batch, int channel, int threadId); - tuple, vector> ProcessSplitKeys(const unique_ptr& batch, int id, - vector& splitKeys); + bool KeyProcessTaskHelperWithFastUnique(unique_ptr &batch, ock::ctr::UniquePtr& unique, + int channel, int threadId); - void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); + tuple, vector> ProcessSplitKeys(const unique_ptr& batch, + int id, vector& splitKeys); - void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, - const unique_ptr& batch, ock::ctr::UniquePtr& unique); + void GetUniqueConfig(ock::ctr::UniqueConf& uniqueConf); - void ProcessBatchWithFastUnique(const unique_ptr& batch, ock::ctr::UniquePtr& unique, int id, - UniqueInfo& uniqueInfoOut); + void InitializeUnique(ock::ctr::UniqueConf& uniqueConf, size_t& preBatchSize, bool& uniqueInitialize, + const unique_ptr & batch, ock::ctr::UniquePtr& unique); - size_t GetKeySize(const unique_ptr& batch); + void ProcessBatchWithFastUnique(const 
unique_ptr &batch, ock::ctr::UniquePtr& unique, + int id, UniqueInfo& uniqueInfoOut); - void All2All(vector& sc, int id, const unique_ptr& batch, KeySendInfo& keySendInfo, - All2AllInfo& all2AllInfoOut); + size_t GetKeySize(const unique_ptr &batch); - auto HashSplit(const unique_ptr& batch) const -> tuple, vector>; + void All2All(vector& sc, int id, const unique_ptr &batch, KeySendInfo& keySendInfo, + All2AllInfo& all2AllInfoOut); - auto HotHashSplit(const unique_ptr& batch) -> tuple, vector, vector>; + auto HashSplit(const unique_ptr& batch) const -> tuple, vector>; - void PaddingAlltoallVC(vector& splitKeys) const; + auto HotHashSplit(const unique_ptr& batch) -> tuple, vector, vector>; - tuple, vector, vector>> HashSplitWithFAAE( - const unique_ptr& batch) const; + void PaddingAlltoallVC(vector& splitKeys) const; - vector GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch); + tuple, vector, vector>> + HashSplitWithFAAE(const unique_ptr& batch) const; - void GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr& batch, - vector& scAllOut); + vector GetScAll(const vector& keyScLocal, int commId, const unique_ptr& batch); - void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); + void GetScAllForUnique(const vector& keyScLocal, int commId, const unique_ptr &batch, + vector &scAllOut); - void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); + void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); - unique_ptr GetBatchData(int channel, int commId) const; + void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); - void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, vector& restoreVec, - int hotPosSize = 0) const; + unique_ptr GetBatchData(int channel, int commId) const; - void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); + void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, + vector& restoreVec, int hotPosSize = 0) const; + + void SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch); - void EvictDeleteDeviceEmb(const string& embName, const vector& keys); + void EvictDeleteDeviceEmb(const string& embName, const vector& keys); - void EvictInitDeviceEmb(const string& embName, vector offset); + void EvictInitDeviceEmb(const string& embName, vector offset); - void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, - const string& embName); + void UpdateHotMap(absl::flat_hash_map& keyCountMapByEmbName, uint32_t count, bool refresh, + const string& embName); - void UpdateHotMapForUnique(const KeysT& keySend, const vector& keyCount, uint32_t count, bool refresh, - const string& embName); + void UpdateHotMapForUnique(const KeysT &keySend, const vector &keyCount, + uint32_t count, bool refresh, const string& embName); - void HandleHotAndSendCount(const unique_ptr& batch, UniqueInfo& uniqueInfoOut, KeySendInfo& keySendInfo, - vector& sc, vector& splitSize); + void HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut, + KeySendInfo& keySendInfo, vector& sc, vector& splitSize); - void PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys); + void PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys); - void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel); + void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel); - 
void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, - const unique_ptr& batch); + void AddCountStartToHotPos(vector& splitKeys, vector& hotPos, const vector& hotPosDev, + const unique_ptr& batch); - void ComputeHotPos(const unique_ptr& batch, absl::flat_hash_map& hotMap, - vector& hotPos, vector& restore, const int hotOffset) const; + void ComputeHotPos(const unique_ptr &batch, absl::flat_hash_map &hotMap, + vector &hotPos, vector &restore, const int hotOffset) const; - vector GetCountRecv(const unique_ptr& batch, int id, vector>& keyCount, - vector scAll, vector ss); + vector GetCountRecv(const unique_ptr& batch, int id, + vector>& keyCount, vector scAll, vector ss); - void HashSplitHelper(const unique_ptr& batch, vector& splitKeys, vector& restore, - vector& hotPos, vector>& keyCount); + void HashSplitHelper(const unique_ptr & batch, vector & splitKeys, + vector & restore, vector & hotPos, + vector >& keyCount); - template - inline vector Count2Start(const vector& count) const - { - vector start = {0}; - for (size_t i = 0; i < count.size() - 1; ++i) { - start.push_back(count[i] + start.back()); + template + inline vector Count2Start(const vector& count) const + { + vector start = { 0 }; + for (size_t i = 0; i < count.size() - 1; ++i) { + start.push_back(count[i] + start.back()); + } + return start; } - return start; - } - string DumpSplitKeys(vector>& splitKeys) const; -}; + string DumpSplitKeys(vector>& splitKeys) const; + }; #define KEY_PROCESS_INSTANCE Singleton::GetInstance() -} // end namespace MxRec +} // end namespace MxRec -#endif // MX_REC_KEY_PROCESS_H +#endif // MX_REC_KEY_PROCESS_H -- Gitee From d52aca711795d11361f102c41ca72fc23e17d6d6 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 06:12:35 +0000 Subject: [PATCH 086/302] =?UTF-8?q?!111=20LazyAdam=E8=9E=8D=E5=90=88?= =?UTF-8?q?=E7=AE=97=E5=AD=90-aclnn=E9=83=A8=E5=88=86=E6=8F=90=E4=BA=A4=20?= =?UTF-8?q?*=20=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B93=20*=20=E9=97=A8?= =?UTF-8?q?=E7=A6=81=E4=BF=AE=E6=94=B92=20*=20aclnn=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B9=20*=20LazyAdam=E8=9E=8D?= =?UTF-8?q?=E5=90=88=E7=AE=97=E5=AD=90-aclnn=E9=83=A8=E5=88=86=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/inc/common.h | 52 ++++ .../aclnn_lazy_adam_test/inc/op_runner.h | 195 +++++++++++++++ .../aclnn_lazy_adam_test/inc/operator_desc.h | 67 +++++ .../aclnn_lazy_adam_test/input/.keep | 0 .../aclnn_lazy_adam_test/output/.keep | 0 .../aclnn_lazy_adam_test/run.sh | 106 ++++++++ .../aclnn_lazy_adam_test/scripts/gen_data.py | 145 +++++++++++ .../scripts/verify_result.py | 50 ++++ .../aclnn_lazy_adam_test/src/CMakeLists.txt | 67 +++++ .../aclnn_lazy_adam_test/src/common.cpp | 84 +++++++ .../aclnn_lazy_adam_test/src/main.cpp | 228 ++++++++++++++++++ 11 files changed, 994 insertions(+) create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/input/.keep create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/output/.keep create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py create mode 100644 
cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h new file mode 100644 index 00000000..601a2617 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/common.h @@ -0,0 +1,52 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +namespace AclnnLazyAdam { +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR] " fmt "\n", ##args) + + /** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ + bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + + /** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ + bool WriteFile(const std::string &filePath, const void *buffer, size_t size); +} +#endif // COMMON_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h new file mode 100644 index 00000000..6f91f905 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h @@ -0,0 +1,195 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ + +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +namespace AclnnLazyAdam { + /** + * Op Runner + */ + class OpRunner { + public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + + const size_t GetInputNumDims(size_t index) const; + + aclDataType GetInputDataType(size_t index) const; + + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + + const size_t GetOutputNumDims(size_t index) const; + + aclDataType GetOutputDataType(size_t index) const; + + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] numElementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t numElementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] numElementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t numElementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + + private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; + }; +} +#endif // OP_RUNNER_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h new file mode 100644 index 00000000..ddd3b3a9 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/operator_desc.h @@ -0,0 +1,67 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
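OpRunner sizes every host and device allocation from these descriptor queries (GetInputSize, GetInputElementCount), so it is worth seeing how large the buffers in this test actually are. A back-of-the-envelope check, assuming the shapes the test scripts use later in this patch (2,000,000 x 32 fp32 state tensors, 564,096 x 1 int32 indices):

DIM0, DIM1, DIM2 = 2_000_000, 564_096, 32  # table rows, touched rows, row width
FP32 = INT32 = 4                           # bytes per element

sizes = {
    "gradient":     DIM1 * DIM2 * FP32,    # ~72 MB
    "indices":      DIM1 * 1 * INT32,      # ~2.3 MB
    "inputM":       DIM0 * DIM2 * FP32,    # ~256 MB; inputV/inputVar identical
    "learningRate": 1 * FP32,
}
for name, nbytes in sizes.items():
    print(f"{name}: {nbytes / 1e6:.1f} MB")
# Each of the three state tensors is mirrored host- and device-side by
# OpRunner::Init, so peak footprint is well over 1.5 GB at these shapes.
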
+==============================================================================*/ + +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +namespace AclnnLazyAdam { + /** + * Op description + */ + struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; + double beta1; + double beta2; + double epsilon; + }; +} +#endif // OPERATOR_DESC_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/input/.keep b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/input/.keep new file mode 100644 index 00000000..e69de29b diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/output/.keep b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/output/.keep new file mode 100644 index 00000000..e69de29b diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh new file mode 100644 index 00000000..3d4af97c --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=0 + +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +# 导出环境变量 +SHORT=v:, +LONG=dtype:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + # float16, float, int32 + (-v | --dtype) + DTYPE="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +if [ ! $ASCEND_HOME_DIR ]; then + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + else + export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $ASCEND_HOME_DIR/bin/setenv.bash + +export DDK_PATH=$ASCEND_HOME_DIR +arch=$(uname -m) +export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64 + +function main { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm ./input/*.bin + rm ./output/*.bin + + # 2. 生成输入数据和真值数据 + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? 
-ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + # 3. 编译acl可执行文件 + cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build + cmake ../src + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + cd $CURRENT_DIR/output + echo "INFO: execute op!" + ./execute_op + + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 比较真值文件 + cd $CURRENT_DIR + python3 scripts/verify_result.py +} + +main diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py new file mode 100644 index 00000000..6e07f836 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import copy +import os +import numpy as np + +# 获取项目路径 +_CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) +_PROJECT_PATH = os.path.dirname(_CURRENT_PATH) +_INPUT_PATH = os.path.join(_PROJECT_PATH, "./input") +_OUTPUT_PATH = os.path.join(_PROJECT_PATH, "./output") + +_DIM_0 = 2000000 +_DIM_1 = 564096 +_DIM_2 = 32 + + +def _gather(input_data, indices): + out = np.zeros((len(indices), input_data.shape[1])) + for i, index_ in enumerate(indices): + # 跳过index小于0的数据 + if index_[0] < 0: + continue + out[i] = input_data[index_[0]] + return out + + +def _scatter_nd_update(momentum, indices, update_value): + out = copy.deepcopy(momentum) + for i, index_ in enumerate(indices): + if index_[0] < 0: + continue + else: + out[index_[0]] = update_value[i] + return out + + +def _scatter_nd_add(momentum, indices, update_value): + out = copy.deepcopy(momentum) + for i, index_ in enumerate(indices): + if index_[0] < 0: + continue + else: + out[indices[i][0]] = out[index_[0]] + update_value[i] + return out + + +def _gen_input_data(): + range_start = 1 + range_end = 2 + + dtype_chose = np.float32 + shape0 = (_DIM_0, _DIM_2) + indices_shape = (_DIM_1, 1) + grad_shape = (_DIM_1, _DIM_2) + + input_var = np.random.uniform(range_start, range_end, size=shape0).astype(dtype_chose) # shape [2000000,32] + input_m = np.random.uniform(range_start, range_end, size=shape0).astype(dtype_chose) # shape [2000000,32] + input_v = np.random.uniform(range_start, range_end, size=shape0).astype(dtype_chose) # shape [2000000,32] + + # indices shape [564096,1] + indices = np.random.permutation(np.arange(_DIM_0)).astype(np.int32)[:indices_shape[0]].reshape(-1, 1) + # gradient shape [564096,32] + gradient = np.random.uniform(range_start, range_end, 
size=grad_shape).astype(dtype_chose) + + if not os.path.exists(_INPUT_PATH): + os.makedirs(_INPUT_PATH) + indices.tofile(os.path.join(_INPUT_PATH, "indices.bin")) + gradient.tofile(os.path.join(_INPUT_PATH, "gradient.bin")) + input_m.tofile(os.path.join(_INPUT_PATH, "inputM.bin")) + input_v.tofile(os.path.join(_INPUT_PATH, "inputV.bin")) + input_var.tofile(os.path.join(_INPUT_PATH, "inputVar.bin")) + + +def _gen_golden_data(): + beta1 = 0.9 + beta2 = 0.999 + lr = 0.001 + epsilon = 1e-7 + + lr = np.array(lr).astype(np.float32) + beta1 = np.array(beta1).astype(np.float32) + beta2 = np.array(beta2).astype(np.float32) + epsilon = np.array(epsilon).astype(np.float32) + + lr.tofile(os.path.join(_INPUT_PATH, "learningRate.bin")) + + indices = np.fromfile(os.path.join(_INPUT_PATH, "indices.bin"), dtype=np.int32).reshape( + (_DIM_1, 1)) # shape (564096,1) + gradient = np.fromfile(os.path.join(_INPUT_PATH, "gradient.bin"), dtype=np.float32).reshape( + (_DIM_1, _DIM_2)) # shape (564096,32) + input_m = np.fromfile(os.path.join(_INPUT_PATH, "inputM.bin"), dtype=np.float32).reshape( + (_DIM_0, _DIM_2)) # shape (2000000,32) + input_v = np.fromfile(os.path.join(_INPUT_PATH, "inputV.bin"), dtype=np.float32).reshape( + (_DIM_0, _DIM_2)) # shape (2000000,32) + input_var = np.fromfile(os.path.join(_INPUT_PATH, "inputVar.bin"), dtype=np.float32).reshape( + (_DIM_0, _DIM_2)) # shape (2000000,32) + + old_m_slice = _gather(input_m, indices) # shape(564096,32) + old_m_slice = np.array(old_m_slice).astype(np.float32) # + update_m = beta1 * old_m_slice + (1 - beta1) * gradient + out_m = _scatter_nd_update(input_m, indices, update_m) + + old_v_slice = _gather(input_v, indices) + old_v_slice = np.array(old_v_slice).astype(np.float32) + update_v = beta2 * old_v_slice + (1 - beta2) * np.square(gradient) + out_v = _scatter_nd_update(input_v, indices, update_v) + + denominator_slice = np.sqrt(update_v) + epsilon + update_var = np.divide(-lr * update_m, denominator_slice) + out_var = _scatter_nd_add(input_var, indices, update_var) + + return out_m, out_v, out_var + + +def _gen_input_and_golden_data(): + # 产生输入数据 + _gen_input_data() + + # 产生真值数据 + out_m, out_v, out_var = _gen_golden_data() + if not os.path.exists(_OUTPUT_PATH): + os.makedirs(_OUTPUT_PATH) + out_m.tofile(os.path.join(_OUTPUT_PATH, "goldenOutputM.bin")) + out_v.tofile(os.path.join(_OUTPUT_PATH, "goldenOutputV.bin")) + out_var.tofile(os.path.join(_OUTPUT_PATH, "goldenOutputVar.bin")) + + +if __name__ == "__main__": + _gen_input_and_golden_data() diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py new file mode 100644 index 00000000..1cc516db --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/verify_result.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
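_gen_golden_data above is a direct restatement of the lazy (sparse) Adam step: only the rows named by indices are touched, m and v are scatter-updated with the usual first/second-moment moving averages, and var receives a scatter-add of -lr * m / (sqrt(v) + epsilon). The same math replayed on a toy table, assuming unique non-negative indices (the real script additionally skips negative indices):

import numpy as np

beta1, beta2, lr, eps = 0.9, 0.999, 0.001, 1e-7
var = np.ones((4, 2), np.float32)    # parameter table
m = np.zeros((4, 2), np.float32)     # first moment
v = np.zeros((4, 2), np.float32)     # second moment

rows = np.array([0, 2])                        # rows to update
grad = np.full((2, 2), 0.5, np.float32)

m[rows] = beta1 * m[rows] + (1 - beta1) * grad        # _scatter_nd_update
v[rows] = beta2 * v[rows] + (1 - beta2) * grad ** 2   # _scatter_nd_update
var[rows] += -lr * m[rows] / (np.sqrt(v[rows]) + eps)  # _scatter_nd_add

print(var)  # rows 1 and 3 are still exactly 1.0 -- that is the "lazy" part
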
+# ============================================================================== + +import logging +import numpy as np + +_LOSS_THRESHOLD = 1e-6 # 容忍偏差,一般fp16要求绝对误差和相对误差均不超过万分之一 +_MINIMUM = 10e-10 + +logging.getLogger().setLevel(logging.INFO) + + +def verify_result(real_result, golden): + real_result = np.fromfile(real_result, dtype=np.float32) # 从bin文件读取实际运算结果 + golden = np.fromfile(golden, dtype=np.float32) # 从bin文件读取预期运算结果 + result = np.abs(real_result - golden) # 计算运算结果和预期结果偏差 + deno = np.maximum(np.abs(real_result), np.abs(golden)) # 获取最大值并组成新数组 + result_atol = np.less_equal(result, _LOSS_THRESHOLD) # 计算绝对误差 + result_rtol = np.less_equal(result / np.add(deno, _MINIMUM), _LOSS_THRESHOLD) # 计算相对误差 + if not result_rtol.all() and not result_atol.all(): + # 误差超出预期时返回打印错误,返回对比失败 + if np.sum(result_rtol == False) > real_result.size * _LOSS_THRESHOLD \ + and np.sum(result_atol == False) > real_result.size * _LOSS_THRESHOLD: + logging.error("[ERROR] output verify result error.") + return False + logging.info("output verify pass.") + return True + + +if __name__ == '__main__': + logging.info("start verify outputM.") + verify_result("output/outputM.bin", "output/goldenOutputM.bin") + logging.info("start verify outputV.") + verify_result("output/outputV.bin", "output/goldenOutputV.bin") + logging.info("start verify outputVar.") + verify_result("output/outputVar.bin", "output/goldenOutputVar.bin") diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt new file mode 100644 index 00000000..1642e3ca --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt @@ -0,0 +1,67 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_lazy_adam) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize_lazy_adam/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +set(AUTO_GEN_PATH "../../lazy_adam/build_out/autogen") +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../inc + ${CUST_PKG_PATH}/include + ${AUTO_GEN_PATH} +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_op + main.cpp + common.cpp +) + +target_link_libraries(execute_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp new file mode 100644 index 00000000..e2cd6865 --- /dev/null +++ 
b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/common.cpp @@ -0,0 +1,84 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include + +#include "common.h" + +namespace AclnnLazyAdam { + bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) + { + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; + } + + bool WriteFile(const std::string &filePath, const void *buffer, size_t size) + { + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; + } +} \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp new file mode 100644 index 00000000..c4253996 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/main.cpp @@ -0,0 +1,228 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
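verify_result.py above accepts an output when elements satisfy an absolute-or-relative error bound of 1e-6, and only reports failure when the fraction of offending elements itself exceeds that threshold. A slightly tightened standalone restatement of the check (the function name is mine; the thresholds are the script's):

import numpy as np

def within_tolerance(actual, golden, tol=1e-6, minimum=1e-9):
    diff = np.abs(actual - golden)
    denom = np.maximum(np.abs(actual), np.abs(golden)) + minimum
    ok = (diff <= tol) | (diff / denom <= tol)   # absolute OR relative bound
    # tolerate a tiny fraction of stragglers, as the script does
    return np.count_nonzero(~ok) <= ok.size * tol

a = np.arange(3, dtype=np.float32) + 1
print(within_tolerance(a, a + 5e-7))   # True: within absolute tolerance
print(within_tolerance(a, a + 1e-2))   # False: both bounds violated
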
+==============================================================================*/ + +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "aclnn_lazy_adam.h" +#include "common.h" +#include "op_runner.h" + +using namespace AclnnLazyAdam; + +bool g_isDevice = false; +int g_deviceId = 0; +namespace { + constexpr int DIM0 = 2000000; // inputM inputV inputVar 的行数 + constexpr int DIM1 = 564096; // indices长度 + constexpr int DIM2 = 32; // inputM inputV inputVar gradient等每行的数据个数 + constexpr int INPUT_M_INDEX = 2; + constexpr int INPUT_V_INDEX = 3; + constexpr int INPUT_VAR_INDEX = 4; + constexpr int LEARNING_RATE_INDEX = 5; + constexpr int OUTPUT_M_INDEX = 0; + constexpr int OUTPUT_V_INDEX = 1; + constexpr int OUTPUT_VAR_INDEX = 2; + constexpr float LEARNING_RATE = 0.001; + constexpr float BETA1 = 0.9; + constexpr float BETA2 = 0.999; + constexpr float EPSILON = 1e-7; + const char* READ_ERROR_INFO = "read input file error, please check whether file exist and access rights is correct"; + const char* WRITE_ERROR_INFO = "write output file error, please check access rights is correct"; + + OperatorDesc CreateOpDesc() + { + std::vector indicesShape{DIM1, 1}; + std::vector gradientShape{DIM1, DIM2}; + std::vector inputMShape{DIM0, DIM2}; // inputM inputV inputVar 的shape相同 + std::vector learningRateShape{1}; + aclDataType dataType = ACL_FLOAT; + aclDataType indexDataType = ACL_INT32; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(dataType, gradientShape.size(), gradientShape.data(), format); + opDesc.AddInputTensorDesc(indexDataType, indicesShape.size(), indicesShape.data(), format); + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputM + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputV + opDesc.AddInputTensorDesc(dataType, inputMShape.size(), inputMShape.data(), format); // inputVar + opDesc.AddInputTensorDesc(dataType, learningRateShape.size(), learningRateShape.data(), + format); // learningRate + opDesc.beta1 = BETA1; + opDesc.beta2 = BETA2; + opDesc.epsilon = EPSILON; + return opDesc; + } + + bool SetInputData(OpRunner& runner) + { + size_t fileSize = 0; + if (!ReadFile("../input/gradient.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/indices.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputM.bin", fileSize, runner.GetInputBuffer(INPUT_M_INDEX), + runner.GetInputSize(INPUT_M_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputV.bin", fileSize, runner.GetInputBuffer(INPUT_V_INDEX), + runner.GetInputSize(INPUT_V_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/inputVar.bin", fileSize, runner.GetInputBuffer(INPUT_VAR_INDEX), + runner.GetInputSize(INPUT_VAR_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + if (!ReadFile("../input/learningRate.bin", fileSize, runner.GetInputBuffer(LEARNING_RATE_INDEX), + runner.GetInputSize(LEARNING_RATE_INDEX))) { + throw std::runtime_error(READ_ERROR_INFO); + } + INFO_LOG("Set input success"); + return true; + } + + bool ProcessOutputData(OpRunner& runner) + { + // 保存输出数据 由于输出仅有hostOutputs_数据,未设置outputDesc,因此数据size从inputTensor获取 + if (!WriteFile("../output/outputM.bin", runner.GetOutputBuffer(OUTPUT_M_INDEX), + 
runner.GetInputSize(INPUT_M_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + if (!WriteFile("../output/outputV.bin", runner.GetOutputBuffer(OUTPUT_V_INDEX), + runner.GetInputSize(INPUT_V_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + if (!WriteFile("../output/outputVar.bin", runner.GetOutputBuffer(OUTPUT_VAR_INDEX), + runner.GetInputSize(INPUT_VAR_INDEX))) { + throw std::runtime_error(WRITE_ERROR_INFO); + } + INFO_LOG("Write output success"); + return true; + } + + void DestroyResource() + { + bool flag = false; + if (aclrtResetDevice(g_deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", g_deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destroy resource failed"); + } else { + INFO_LOG("Destroy resource success"); + } + } + + bool InitResource() + { + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(g_deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. g_deviceId is %d", g_deviceId); + (void) aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", g_deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestroyResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; + } + + bool RunOp() + { + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + INFO_LOG("Run op success"); + return true; + } +} + +int main(int argc, char** argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestroyResource(); + return FAILED; + } + DestroyResource(); + return SUCCESS; +} -- Gitee From 646f6224bd493247f4eb157f8bfbafca55659b55 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 14:30:18 +0800 Subject: [PATCH 087/302] =?UTF-8?q?=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90acl?= =?UTF-8?q?nn=E9=AA=8C=E8=AF=81-part2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/gen_mxrec_tar_pkg.sh | 3 + .../aclnn_lazy_adam_test/src/op_runner.cpp | 423 ++++++++++++++++++ .../src/operator_desc.cpp | 53 +++ cust_op/fused_lazy_adam/lazy_adam.json | 117 +++++ cust_op/fused_lazy_adam/op_host/lazy_adam.cpp | 223 +++++++++ .../op_host/lazy_adam_tiling.h | 41 ++ 6 files changed, 860 insertions(+) create mode 100644 
cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp create mode 100644 cust_op/fused_lazy_adam/lazy_adam.json create mode 100644 cust_op/fused_lazy_adam/op_host/lazy_adam.cpp create mode 100644 cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h diff --git a/build/gen_mxrec_tar_pkg.sh b/build/gen_mxrec_tar_pkg.sh index 72ccfe49..1f9045b3 100644 --- a/build/gen_mxrec_tar_pkg.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -51,11 +51,14 @@ function gen_tar_file() chmod 550 ./build/"${pkg_dir}"/tf2_whl/mx_rec*.whl chmod 550 ./build/"${pkg_dir}"/cust_op/ chmod 550 ./build/"${pkg_dir}"/cust_op/cust_op_by_addr + chmod 550 ./build/"${pkg_dir}"/cust_op/fused_lazy_adam cd ./build/"${pkg_dir}"/cust_op/cust_op_by_addr chmod 550 *.sh chmod 640 *.json chmod 550 op_host op_kernel op_host/* op_kernel/* cd - + cd ./build/"${pkg_dir}"/cust_op/fused_lazy_adam + cd ./build tar -zvcf "${release_tar}" "${pkg_dir}" || { warn "compression failed, packages might be broken" diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp new file mode 100644 index 00000000..fb2ccd19 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp @@ -0,0 +1,423 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "acl/acl_op_compiler.h" +#include "aclnn_lazy_adam.h" +#include "common.h" +#include "op_runner.h" + +extern bool g_isDevice; + +namespace AclnnLazyAdam { + using namespace std; + constexpr int PRINT_OUT_WIDTH = 10; + constexpr int PRINT_OUT_PRECISION = 4; + constexpr int STREAM_TIMEOUT = 5000; // 等待Stream任务完成,超时时间单位:ms + constexpr int OUTPUT_SIZE = 3; + constexpr int INPUT_TENSOR_OFFSET = 2; + + OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) { + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); + } + + OpRunner::~OpRunner() { + for (size_t i = 0; i < numInputs_; ++i) { + (void) aclDestroyTensor(inputTensor_[i]); + (void) aclDestroyDataBuffer(inputBuffers_[i]); + (void) aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void) aclrtFree(hostInputs_[i]); + } else { + (void) aclrtFreeHost(hostInputs_[i]); + } + } + for (size_t i = 0; i < numOutputs_; ++i) { + if (g_isDevice) { + (void) aclrtFree(hostOutputs_[i]); + } else { + (void) aclrtFreeHost(hostOutputs_[i]); + } + } + } + + bool OpRunner::Init() { + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), + nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), + GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + // 手动修改输出数据实现,仅申请host上的输出数据空间,析构出需同时适配 + numOutputs_ = OUTPUT_SIZE; + for (size_t i = 0; i < numOutputs_; ++i) { + int inputTensorIndex = i + INPUT_TENSOR_OFFSET; + auto size = GetInputSize(inputTensorIndex); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + } + return true; + } + + const size_t OpRunner::NumInputs() { + return numInputs_; + } + + const size_t OpRunner::NumOutputs() { + return numOutputs_; + } + + const size_t OpRunner::GetInputSize(size_t index) const { + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + return aclGetTensorDescSize(opDesc_->inputDesc[index]); + } + + const size_t OpRunner::GetInputNumDims(size_t index) const { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); + } + + aclDataType OpRunner::GetInputDataType(size_t index) const { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + return aclGetTensorDescType(opDesc_->inputDesc[index]); + } + + aclFormat OpRunner::GetInputFormat(size_t index) const { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); + } + + std::vector OpRunner::GetInputShape(size_t index) const { + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; + } + + size_t OpRunner::GetOutputSize(size_t index) const { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + return aclGetTensorDescSize(opDesc_->outputDesc[index]); + } + + const size_t OpRunner::GetOutputNumDims(size_t index) const { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); + } + + aclDataType OpRunner::GetOutputDataType(size_t index) const { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + return aclGetTensorDescType(opDesc_->outputDesc[index]); + } + + + aclFormat OpRunner::GetOutputFormat(size_t index) const { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); + } + + std::vector OpRunner::GetOutputShape(size_t index) const { + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; + } + + size_t OpRunner::GetInputElementCount(size_t index) const { + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); + } + + size_t OpRunner::GetOutputElementCount(size_t index) const { + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); + } + + bool OpRunner::RunOp() { + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnLazyAdamGetWorkspaceSize(inputTensor_[0], inputTensor_[1], + inputTensor_[2], inputTensor_[3], inputTensor_[4], inputTensor_[5], + opDesc_->beta1, opDesc_->beta2, opDesc_->epsilon, + &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void) aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); + + void *workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnLazyAdam(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void) aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustom success"); + + ret = aclrtSynchronizeStreamWithTimeout(stream, STREAM_TIMEOUT); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); + (void) aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + // 把输入数据:inputM inputV inputVar 作为输出数据拷贝出来 + for (size_t i = 0; i < OUTPUT_SIZE; ++i) { + int inputTensorIndex = i + INPUT_TENSOR_OFFSET; // 加上输入tensor偏移值 + auto size = GetInputSize(inputTensorIndex); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devInputs_[inputTensorIndex], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void) aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void) aclrtDestroyStream(stream); + return true; + } + + + template + void DoPrintData(const T *data, size_t count, size_t elementsPerRow) { + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(PRINT_OUT_WIDTH) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } + } + + void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) { + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(PRINT_OUT_WIDTH) << std::setprecision(PRINT_OUT_PRECISION) + << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } + } + + void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) { + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + } + + void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); + } + + void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); + } +} \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp new file mode 100644 index 00000000..826de46a --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp @@ -0,0 +1,53 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
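PrintData above dispatches on aclDataType to render the host buffers; the same inspection can be done offline on the dumped .bin files with a dtype map. A sketch, where the ACL-to-numpy pairing is assumed for the types this harness touches and the formatting mirrors PRINT_OUT_WIDTH and PRINT_OUT_PRECISION:

import numpy as np

# Assumed mapping for the types used in this test; extend as needed.
ACL_TO_NUMPY = {
    "ACL_FLOAT": np.float32,
    "ACL_FLOAT16": np.float16,
    "ACL_INT32": np.int32,
    "ACL_INT64": np.int64,
}

def dump_bin(path, acl_dtype, per_row=16):
    """Offline equivalent of OpRunner::PrintOutput for a dumped buffer."""
    data = np.fromfile(path, dtype=ACL_TO_NUMPY[acl_dtype])
    fmt = "{:10.4f}" if data.dtype.kind == "f" else "{:10d}"
    for i in range(0, data.size, per_row):
        print(" ".join(fmt.format(x) for x in data[i:i + per_row]))
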
+==============================================================================*/ + +#include "common.h" +#include "operator_desc.h" +namespace AclnnLazyAdam { + using namespace std; + + OperatorDesc::OperatorDesc() {} + + OperatorDesc::~OperatorDesc() { + for (auto *desc: inputDesc) { + aclDestroyTensorDesc(desc); + } + for (auto *desc: outputDesc) { + aclDestroyTensorDesc(desc); + } + } + + OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, + aclFormat format) { + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; + } + + OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, + const int64_t *dims, aclFormat format) { + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + outputDesc.emplace_back(desc); + return *this; + } +} \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/lazy_adam.json b/cust_op/fused_lazy_adam/lazy_adam.json new file mode 100644 index 00000000..e6fc2c00 --- /dev/null +++ b/cust_op/fused_lazy_adam/lazy_adam.json @@ -0,0 +1,117 @@ +[ + { + "op": "LazyAdam", + "language": "cpp", + "input_desc": [ + { + "name": "gradient", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + }, + { + "name": "indices", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "int32" + ] + }, + { + "name": "inputM", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + }, + { + "name": "inputV", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + }, + { + "name": "inputVar", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + }, + { + "name": "lr", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + } + ], + "output_desc": [ + { + "name": "inputM", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + }, + { + "name": "inputV", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + }, + { + "name": "inputVar", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "fp32" + ] + } + ], + "attr": [ + { + "name": "beta1", + "param_type": "required", + "type": "float" + }, + { + "name": "beta2", + "param_type": "required", + "type": "float" + }, + { + "name": "epsilon", + "param_type": "required", + "type": "float" + } + ] + } +] \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp new file mode 100644 index 00000000..1a147912 --- /dev/null +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp @@ -0,0 +1,223 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
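lazy_adam.json above is the single source of truth for the operator signature the op generator consumes: six required ND inputs, three outputs aliasing the optimizer state, and three float attributes. A small sanity check that a local copy still matches that contract (the path is assumed relative to the working directory):

import json

with open("lazy_adam.json") as f:
    (op,) = json.load(f)   # the file holds a one-element list

assert op["op"] == "LazyAdam" and op["language"] == "cpp"
assert [d["name"] for d in op["input_desc"]] == [
    "gradient", "indices", "inputM", "inputV", "inputVar", "lr"]
assert [d["name"] for d in op["output_desc"]] == ["inputM", "inputV", "inputVar"]
assert [a["name"] for a in op["attr"]] == ["beta1", "beta2", "epsilon"]
assert all(d["param_type"] == "required"
           for d in op["input_desc"] + op["output_desc"])
print("lazy_adam.json matches the expected LazyAdam signature")
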
+==============================================================================*/ + +#include "lazy_adam_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { + constexpr int BLOCK_SIZE = 32; + constexpr int RESERVE_UB_SIZE = 20 * 1024; + constexpr int DATA_NUM_PER_COMPUTE = 8; + constexpr int32_t USR_SIZE = 256; + constexpr int32_t SYS_WORKSPACE_SIZE = 16 * 1024 * 1024; + + template + static ge::graphStatus CheckNullPointer(T* pointer, const char* errorMessage) + { + if (pointer == nullptr) { + printf("%s nullptr\n", errorMessage); + return ge::GRAPH_FAILED; + } + + return ge::GRAPH_SUCCESS; + } + + static ge::graphStatus LazyAdamTilingFunc(gert::TilingContext* context) + { + size_t* currentWorkspace = context->GetWorkspaceSizes(1); + if (CheckNullPointer(currentWorkspace, "currentWorkspace") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + currentWorkspace[0] = SYS_WORKSPACE_SIZE + USR_SIZE; + + LazyAdamTilingData tiling; + const gert::StorageShape* indicesShape = context->GetInputShape(1); + const gert::StorageShape* inputMShape = context->GetInputShape(2); + uint64_t dim0 = inputMShape->GetStorageShape().GetDim(0); + uint64_t dim1 = indicesShape->GetStorageShape().GetDim(0); + uint64_t dim2 = inputMShape->GetStorageShape().GetDim(1); + ge::DataType inputMDtype = context->GetInputDesc(2)->GetDataType(); + int inputMDtypeSize = ge::GetSizeByDataType(inputMDtype); + ge::DataType indicesDtype = context->GetInputDesc(1)->GetDataType(); + int indicesDtypeSize = ge::GetSizeByDataType(indicesDtype); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + auto attrs = context->GetAttrs(); + + float beta1 = *attrs->GetAttrPointer(0); + float beta2 = *attrs->GetAttrPointer(1); + float epsilon = *attrs->GetAttrPointer(2); + + auto platformInfo = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t coreNum = platformInfo.GetCoreNum(); + uint64_t ub; + platformInfo.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub); + ub = ub - RESERVE_UB_SIZE; + // ub大小除以每行的数据大小,得到每次处理的行数 + uint64_t row = ub / (dim2 * inputMDtypeSize * DATA_NUM_PER_COMPUTE + 1 * indicesDtypeSize); + if (row > dim1) { + row = dim1; + } + + // 保证申请的内存是32的倍数并且向上取整 计算方式:(num+31)/32*32 + uint64_t indicesAllocSize = (row * indicesDtypeSize + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; + uint64_t otherAllocSize = (row * inputMDtypeSize * dim2 + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; + // 前 CORE_NUM - 1 个核分配的任务量 + uint64_t batch = dim1 / coreNum; + // 实际使用的核数 + context->SetBlockDim(coreNum); + uint64_t loopCount = batch / row; // CORE_NUM - 1 个核的任务量,除以UB每一次能处理的数据,得到处理次数 + uint64_t rowLeft = batch - row * loopCount; // UB处理 loopCount 那么多次后,分给当前core剩下的数据量 + + // 最后一个核分配的任务量 + uint64_t batchTail = dim1 - batch * (coreNum - 1); // phy 该写法适配了dim1刚好整除coreNum的情况 + uint64_t loopCountTail = batchTail / row; + uint64_t rowLeftTail = batchTail - row * loopCountTail; + + tiling.set_beta1(beta1); + tiling.set_beta2(beta2); + tiling.set_epsilon(epsilon); + tiling.set_dim0(dim0); + tiling.set_dim1(dim1); + tiling.set_dim2(dim2); + tiling.set_row(row); // 每个ai core一次能分配的数据行数 + tiling.set_indicesAllocSize(indicesAllocSize); // indices大小,用于申请空间 + tiling.set_otherAllocSize(otherAllocSize); // 入参中非indices要申请的空间大小 + tiling.set_batch(batch); // 前CORE_NUM - 1个核分配的任务量 + tiling.set_loopCount(loopCount); // 前CORE_NUM - 1 个核内循环处理次数 + 
tiling.set_rowLeft(rowLeft); // 前CORE_NUM - 1 个核, 核内处理 loopCount 次后,分给当前core剩下的数据量 + tiling.set_loopCountTail(loopCountTail); // 最后一个核,核内循环次数 + tiling.set_rowLeftTail(rowLeftTail); // 最后一个核,核内循环loopCountTail次后,剩余数据量 + tiling.set_coreNum(coreNum); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + + return ge::GRAPH_SUCCESS; + } +} + +namespace ge { + static ge::graphStatus LazyAdamInferShape(gert::InferShapeContext* context) + { + if (optiling::CheckNullPointer(context, "context") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + + gert::Shape* outputMShape = context->GetOutputShape(0); + if (optiling::CheckNullPointer(outputMShape, "outputMShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + const gert::Shape* inputMShape = context->GetInputShape(2); + if (optiling::CheckNullPointer(inputMShape, "inputMShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + *outputMShape = *inputMShape; + + gert::Shape* outputVShape = context->GetOutputShape(1); + if (optiling::CheckNullPointer(outputVShape, "outputVShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + const gert::Shape* inputVShape = context->GetInputShape(3); + if (optiling::CheckNullPointer(inputVShape, "inputVShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + *outputVShape = *inputVShape; + + gert::Shape* outputVarShape = context->GetOutputShape(2); + if (optiling::CheckNullPointer(outputVarShape, "outputVarShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + const gert::Shape* inputVarShape = context->GetInputShape(4); + if (optiling::CheckNullPointer(inputVarShape, "inputVarShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + *outputVarShape = *inputVarShape; + + return GRAPH_SUCCESS; + } + + static ge::graphStatus LazyAdamInferDataType(gert::InferDataTypeContext* context) + { + return GRAPH_SUCCESS; + } +} + + +namespace ops { + class LazyAdam : public OpDef { + public: + explicit LazyAdam(const char* name) : OpDef(name) + { + this->Input("gradient") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("indices") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputM") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputV") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputVar") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("lr") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputM") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputV") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputVar") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Attr("beta1").Float(); + this->Attr("beta2").Float(); + this->Attr("epsilon").Float(); + this->SetInferShape(ge::LazyAdamInferShape) + 
.SetInferDataType(ge::LazyAdamInferDataType); + + this->AICore().SetTiling(optiling::LazyAdamTilingFunc); + this->AICore().AddConfig("ascend910b"); + } + }; + + OP_ADD(LazyAdam); +} diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h b/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h new file mode 100644 index 00000000..10b11a9a --- /dev/null +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h @@ -0,0 +1,41 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#ifndef LAZY_ADAM_TILING_H +#define LAZY_ADAM_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(LazyAdamTilingData) + TILING_DATA_FIELD_DEF(float, beta1); + TILING_DATA_FIELD_DEF(float, beta2); + TILING_DATA_FIELD_DEF(float, epsilon); + TILING_DATA_FIELD_DEF(int32_t, dim0); + TILING_DATA_FIELD_DEF(int32_t, dim1); + TILING_DATA_FIELD_DEF(int32_t, dim2); + TILING_DATA_FIELD_DEF(int32_t, row); + TILING_DATA_FIELD_DEF(int32_t, indicesAllocSize); + TILING_DATA_FIELD_DEF(int32_t, otherAllocSize); + TILING_DATA_FIELD_DEF(int32_t, batch); + TILING_DATA_FIELD_DEF(int32_t, loopCount); + TILING_DATA_FIELD_DEF(int32_t, rowLeft); + TILING_DATA_FIELD_DEF(int32_t, loopCountTail); + TILING_DATA_FIELD_DEF(int32_t, rowLeftTail); + TILING_DATA_FIELD_DEF(int32_t, coreNum); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(LazyAdam, LazyAdamTilingData) +} +#endif // LAZY_ADAM_TILING_H \ No newline at end of file -- Gitee From 3f57fbbc9877bbfbe22e55d0b53314dab87a2f38 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 14:41:04 +0800 Subject: [PATCH 088/302] =?UTF-8?q?=E5=87=BA=E5=8C=85=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E8=BF=98=E5=8E=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/gen_mxrec_tar_pkg.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/build/gen_mxrec_tar_pkg.sh b/build/gen_mxrec_tar_pkg.sh index 1f9045b3..72ccfe49 100644 --- a/build/gen_mxrec_tar_pkg.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -51,14 +51,11 @@ function gen_tar_file() chmod 550 ./build/"${pkg_dir}"/tf2_whl/mx_rec*.whl chmod 550 ./build/"${pkg_dir}"/cust_op/ chmod 550 ./build/"${pkg_dir}"/cust_op/cust_op_by_addr - chmod 550 ./build/"${pkg_dir}"/cust_op/fused_lazy_adam cd ./build/"${pkg_dir}"/cust_op/cust_op_by_addr chmod 550 *.sh chmod 640 *.json chmod 550 op_host op_kernel op_host/* op_kernel/* cd - - cd ./build/"${pkg_dir}"/cust_op/fused_lazy_adam - cd ./build tar -zvcf "${release_tar}" "${pkg_dir}" || { warn "compression failed, packages might be broken" -- Gitee From d5cdcf92b4039531285e5575a82ed32b3448aeb9 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 14:54:26 +0800 Subject: [PATCH 089/302] =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
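For reference while reviewing the gate fixes below: the UB-driven tiling in patch 087's LazyAdamTilingFunc reduces to a few integer divisions. Usable UB divided by the per-row footprint (dim2 fp32 values times DATA_NUM_PER_COMPUTE, plus one int32 index) gives the rows one pass can hold; dim1 divided by the core count gives the per-core batch; the last core absorbs the remainder. Replayed on the test shapes — the 40-core count and 192 KB UB are assumptions here, since the real tiling queries both from the platform:

DIM1, DIM2 = 564_096, 32             # indices count, row width
FP32 = INT32 = 4
CORE_NUM = 40                        # assumed; GetCoreNum() at runtime
UB = 192 * 1024 - 20 * 1024          # assumed UB minus RESERVE_UB_SIZE

row = min(UB // (DIM2 * FP32 * 8 + 1 * INT32), DIM1)   # rows per UB pass
indices_alloc = (row * INT32 + 31) // 32 * 32          # 32-byte aligned
batch = DIM1 // CORE_NUM                        # load on cores 0..N-2
loop_count, row_left = divmod(batch, row)
batch_tail = DIM1 - batch * (CORE_NUM - 1)      # last core takes the rest
loop_tail, row_left_tail = divmod(batch_tail, row)
print(row, batch, loop_count, row_left, batch_tail, loop_tail, row_left_tail)
# -> 171 14102 82 80 14118 82 96 under these assumptions
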
.../aclnn_lazy_adam_test/src/op_runner.cpp | 79 +++++++++++++------ .../src/operator_desc.cpp | 25 +++--- cust_op/fused_lazy_adam/op_host/lazy_adam.cpp | 3 + 3 files changed, 71 insertions(+), 36 deletions(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp index fb2ccd19..c1a732e1 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include #include +#include #include "acl/acl_op_compiler.h" #include "aclnn_lazy_adam.h" @@ -31,12 +31,14 @@ namespace AclnnLazyAdam { constexpr int OUTPUT_SIZE = 3; constexpr int INPUT_TENSOR_OFFSET = 2; - OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) { + OpRunner::OpRunner(OperatorDesc* opDesc) : opDesc_(opDesc) + { numInputs_ = opDesc->inputDesc.size(); numOutputs_ = opDesc->outputDesc.size(); } - OpRunner::~OpRunner() { + OpRunner::~OpRunner() + { for (size_t i = 0; i < numInputs_; ++i) { (void) aclDestroyTensor(inputTensor_[i]); (void) aclDestroyDataBuffer(inputBuffers_[i]); @@ -56,7 +58,8 @@ namespace AclnnLazyAdam { } } - bool OpRunner::Init() { + bool OpRunner::Init() + { for (size_t i = 0; i < numInputs_; ++i) { auto size = GetInputSize(i); void *devMem = nullptr; @@ -122,15 +125,18 @@ namespace AclnnLazyAdam { return true; } - const size_t OpRunner::NumInputs() { + const size_t OpRunner::NumInputs() + { return numInputs_; } - const size_t OpRunner::NumOutputs() { + const size_t OpRunner::NumOutputs() + { return numOutputs_; } - const size_t OpRunner::GetInputSize(size_t index) const { + const size_t OpRunner::GetInputSize(size_t index) const + { if (index >= numInputs_) { ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return 0; @@ -138,7 +144,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescSize(opDesc_->inputDesc[index]); } - const size_t OpRunner::GetInputNumDims(size_t index) const { + const size_t OpRunner::GetInputNumDims(size_t index) const + { if (index >= numInputs_) { ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return 0; @@ -146,7 +153,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); } - aclDataType OpRunner::GetInputDataType(size_t index) const { + aclDataType OpRunner::GetInputDataType(size_t index) const + { if (index >= numInputs_) { ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return ACL_DT_UNDEFINED; @@ -154,7 +162,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescType(opDesc_->inputDesc[index]); } - aclFormat OpRunner::GetInputFormat(size_t index) const { + aclFormat OpRunner::GetInputFormat(size_t index) const + { if (index >= numInputs_) { ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return ACL_FORMAT_UNDEFINED; @@ -162,7 +171,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescFormat(opDesc_->inputDesc[index]); } - std::vector OpRunner::GetInputShape(size_t index) const { + std::vector OpRunner::GetInputShape(size_t index) const + { std::vector ret; if (index >= numInputs_) { ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); @@ -182,7 +192,8 @@ namespace AclnnLazyAdam { return ret; } - size_t OpRunner::GetOutputSize(size_t index) const { + size_t OpRunner::GetOutputSize(size_t index) const + { if (index >= numOutputs_) { ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); return 0; @@ -190,7 +201,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescSize(opDesc_->outputDesc[index]); } - const size_t OpRunner::GetOutputNumDims(size_t index) const { + const size_t OpRunner::GetOutputNumDims(size_t index) const + { if (index >= numOutputs_) { ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); return 0; @@ -198,7 +210,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); } - aclDataType OpRunner::GetOutputDataType(size_t index) const { + aclDataType OpRunner::GetOutputDataType(size_t index) const + { if (index >= numOutputs_) { ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); return ACL_DT_UNDEFINED; @@ -207,7 +220,8 @@ namespace AclnnLazyAdam { } - aclFormat OpRunner::GetOutputFormat(size_t index) const { + aclFormat OpRunner::GetOutputFormat(size_t index) const + { if (index >= numOutputs_) { ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); return ACL_FORMAT_UNDEFINED; @@ -216,7 +230,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescFormat(opDesc_->outputDesc[index]); } - std::vector OpRunner::GetOutputShape(size_t index) const { + std::vector OpRunner::GetOutputShape(size_t index) const + { std::vector ret; if (index >= numOutputs_) { ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); @@ -236,7 +251,8 @@ namespace AclnnLazyAdam { return ret; } - size_t OpRunner::GetInputElementCount(size_t index) const { + size_t OpRunner::GetInputElementCount(size_t index) const + { if (index >= opDesc_->inputDesc.size()) { ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return 0; @@ -245,7 +261,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); } - size_t OpRunner::GetOutputElementCount(size_t index) const { + size_t OpRunner::GetOutputElementCount(size_t index) const + { if (index >= opDesc_->outputDesc.size()) { ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); return 0; @@ -253,7 +270,8 @@ namespace AclnnLazyAdam { return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); } - bool OpRunner::RunOp() { + bool OpRunner::RunOp() + { for (size_t i = 0; i < numInputs_; ++i) { auto size = GetInputSize(i); aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; @@ -332,8 +350,11 @@ namespace AclnnLazyAdam { template - void DoPrintData(const T *data, size_t count, size_t elementsPerRow) { - assert(elementsPerRow != 0); + void DoPrintData(const T *data, size_t count, size_t elementsPerRow) + { + if (elementsPerRow == 0) { + throw std::runtime_error("value must not be zero."); + } for (size_t i = 0; i < count; ++i) { std::cout << std::setw(PRINT_OUT_WIDTH) << data[i]; if (i % elementsPerRow == elementsPerRow - 1) { @@ -342,8 +363,11 @@ namespace AclnnLazyAdam { } } - void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) { - assert(elementsPerRow != 0); + void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) + { + if (elementsPerRow == 0) { + throw std::runtime_error("value must not be zero."); + } for (size_t i = 0; i < count; ++i) { std::cout << std::setw(PRINT_OUT_WIDTH) << std::setprecision(PRINT_OUT_PRECISION) << aclFloat16ToFloat(data[i]); @@ -353,7 +377,8 @@ namespace AclnnLazyAdam { } } - void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) { + void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) + { if (data == nullptr) { ERROR_LOG("Print data failed. data is nullptr"); return; @@ -401,7 +426,8 @@ namespace AclnnLazyAdam { } } - void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) { + void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) + { if (index >= numInputs_) { ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); return; @@ -411,7 +437,8 @@ namespace AclnnLazyAdam { PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); } - void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) { + void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) + { if (index >= numOutputs_) { ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); return; diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp index 826de46a..dad4ab0f 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp @@ -15,23 +15,27 @@ See the License for the specific language governing permissions and #include "common.h" #include "operator_desc.h" + namespace AclnnLazyAdam { using namespace std; - OperatorDesc::OperatorDesc() {} + OperatorDesc::OperatorDesc() + {} - OperatorDesc::~OperatorDesc() { - for (auto *desc: inputDesc) { + OperatorDesc::~OperatorDesc() + { + for (auto* desc: inputDesc) { aclDestroyTensorDesc(desc); } - for (auto *desc: outputDesc) { + for (auto* desc: outputDesc) { aclDestroyTensorDesc(desc); } } - OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, - aclFormat format) { - aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + OperatorDesc& OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t* dims, + aclFormat format) + { + aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format); if (desc == nullptr) { ERROR_LOG("create tensor failed"); return *this; @@ -40,9 +44,10 @@ namespace AclnnLazyAdam { return *this; } - OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, - const int64_t *dims, aclFormat format) { - aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + OperatorDesc& OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, + const int64_t* dims, aclFormat format) + { + aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format); if (desc == nullptr) { ERROR_LOG("create tensor failed"); return *this; diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp index 1a147912..b93fc0d2 100644 --- a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp @@ -64,6 +64,9 @@ namespace optiling { auto platformInfo = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); uint32_t coreNum = platformInfo.GetCoreNum(); + if (coreNum == 0) { + return ge::GRAPH_FAILED; + } uint64_t ub; platformInfo.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub); ub = ub - RESERVE_UB_SIZE; -- Gitee From 0c781001539ec2c9253c35f57e7d9aa2dc5d8f49 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 15:30:51 +0800 Subject: [PATCH 090/302] =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/inc/op_runner.h | 5 ++ .../aclnn_lazy_adam_test/src/op_runner.cpp | 55 ++++++++++--------- cust_op/fused_lazy_adam/op_host/lazy_adam.cpp | 4 +- 3 files changed, 36 insertions(+), 28 deletions(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h index 6f91f905..77f0aee5 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h @@ -43,6 +43,11 @@ namespace AclnnLazyAdam { */ bool Init(); + /** + * @brief Init op runner output info + */ + bool InitOutputInfo(); + /** * @brief Get number of inputs * @return number of inputs 
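The op_runner.cpp hunk below completes this refactor: the output-buffer allocation moves out of Init() into the InitOutputInfo() declared above, presumably to satisfy the gate's function-length check. The moved block repeats one host/device allocation branch per buffer; a helper of the following shape would fold that repetition (a sketch only, not part of the patch: MallocHostVisible is a made-up name, while g_isDevice, aclrtMalloc and aclrtMallocHost are the flag and ACL calls this harness already uses):

    #include "acl/acl.h"

    extern bool g_isDevice;

    // Allocate memory the host-side test can touch: device memory when the
    // harness runs on-device, pinned host memory otherwise; false on failure.
    static bool MallocHostVisible(void** ptr, size_t size)
    {
        aclError rc = g_isDevice ? aclrtMalloc(ptr, size, ACL_MEM_MALLOC_NORMAL_ONLY)
                                 : aclrtMallocHost(ptr, size);
        return rc == ACL_SUCCESS && *ptr != nullptr;
    }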
diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp index c1a732e1..0f126212 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp @@ -58,6 +58,35 @@ namespace AclnnLazyAdam { } } + bool OpRunner::InitOutputInfo() + { + // 手动修改输出数据实现,仅申请host上的输出数据空间,析构出需同时适配 + numOutputs_ = OUTPUT_SIZE; + for (size_t i = 0; i < numOutputs_; ++i) { + int inputTensorIndex = i + INPUT_TENSOR_OFFSET; + auto size = GetInputSize(inputTensorIndex); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + } + return true; + } + bool OpRunner::Init() { for (size_t i = 0; i < numInputs_; ++i) { @@ -98,31 +127,7 @@ namespace AclnnLazyAdam { inputTensor_.emplace_back(inputTensor); } - // 手动修改输出数据实现,仅申请host上的输出数据空间,析构出需同时适配 - numOutputs_ = OUTPUT_SIZE; - for (size_t i = 0; i < numOutputs_; ++i) { - int inputTensorIndex = i + INPUT_TENSOR_OFFSET; - auto size = GetInputSize(inputTensorIndex); - - void *hostOutput = nullptr; - if (g_isDevice) { - if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); - return false; - } - } else { - if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); - return false; - } - } - if (hostOutput == nullptr) { - ERROR_LOG("Malloc host memory for output[%zu] failed", i); - return false; - } - hostOutputs_.emplace_back(hostOutput); - } - return true; + return InitOutputInfo(); } const size_t OpRunner::NumInputs() diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp index b93fc0d2..34fc9c7e 100644 --- a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp @@ -214,9 +214,7 @@ namespace ops { this->Attr("beta1").Float(); this->Attr("beta2").Float(); this->Attr("epsilon").Float(); - this->SetInferShape(ge::LazyAdamInferShape) - .SetInferDataType(ge::LazyAdamInferDataType); - + this->SetInferShape(ge::LazyAdamInferShape).SetInferDataType(ge::LazyAdamInferDataType); this->AICore().SetTiling(optiling::LazyAdamTilingFunc); this->AICore().AddConfig("ascend910b"); } -- Gitee From 2881bae3a7b1275596f5ec3a6a94a9d526d8278f Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 16:00:19 +0800 Subject: [PATCH 091/302] =?UTF-8?q?clang-format=E6=96=87=E4=BB=B6=E6=A0=BC?= =?UTF-8?q?=E5=BC=8F=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/src/op_runner.cpp | 721 +++++++++--------- .../src/operator_desc.cpp | 67 +- cust_op/fused_lazy_adam/op_host/lazy_adam.cpp | 381 +++++---- .../op_host/lazy_adam_tiling.h | 34 +- 4 files changed, 599 insertions(+), 604 deletions(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp index 
0f126212..3d737564 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp @@ -13,443 +13,440 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "op_runner.h" + #include #include #include "acl/acl_op_compiler.h" #include "aclnn_lazy_adam.h" #include "common.h" -#include "op_runner.h" extern bool g_isDevice; namespace AclnnLazyAdam { - using namespace std; - constexpr int PRINT_OUT_WIDTH = 10; - constexpr int PRINT_OUT_PRECISION = 4; - constexpr int STREAM_TIMEOUT = 5000; // 等待Stream任务完成,超时时间单位:ms - constexpr int OUTPUT_SIZE = 3; - constexpr int INPUT_TENSOR_OFFSET = 2; - - OpRunner::OpRunner(OperatorDesc* opDesc) : opDesc_(opDesc) - { - numInputs_ = opDesc->inputDesc.size(); - numOutputs_ = opDesc->outputDesc.size(); - } - - OpRunner::~OpRunner() - { - for (size_t i = 0; i < numInputs_; ++i) { - (void) aclDestroyTensor(inputTensor_[i]); - (void) aclDestroyDataBuffer(inputBuffers_[i]); - (void) aclrtFree(devInputs_[i]); - if (g_isDevice) { - (void) aclrtFree(hostInputs_[i]); - } else { - (void) aclrtFreeHost(hostInputs_[i]); - } +using namespace std; +constexpr int PRINT_OUT_WIDTH = 10; +constexpr int PRINT_OUT_PRECISION = 4; +constexpr int STREAM_TIMEOUT = 5000; // 等待Stream任务完成,超时时间单位:ms +constexpr int OUTPUT_SIZE = 3; +constexpr int INPUT_TENSOR_OFFSET = 2; + +OpRunner::OpRunner(OperatorDesc* opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); } - for (size_t i = 0; i < numOutputs_; ++i) { - if (g_isDevice) { - (void) aclrtFree(hostOutputs_[i]); - } else { - (void) aclrtFreeHost(hostOutputs_[i]); - } + } + for (size_t i = 0; i < numOutputs_; ++i) { + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); } } - - bool OpRunner::InitOutputInfo() - { - // 手动修改输出数据实现,仅申请host上的输出数据空间,析构出需同时适配 - numOutputs_ = OUTPUT_SIZE; - for (size_t i = 0; i < numOutputs_; ++i) { - int inputTensorIndex = i + INPUT_TENSOR_OFFSET; - auto size = GetInputSize(inputTensorIndex); - - void *hostOutput = nullptr; - if (g_isDevice) { - if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); - return false; - } - } else { - if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); - return false; - } +} + +bool OpRunner::InitOutputInfo() +{ + // 手动修改输出数据实现,仅申请host上的输出数据空间,析构出需同时适配 + numOutputs_ = OUTPUT_SIZE; + for (size_t i = 0; i < numOutputs_; ++i) { + int inputTensorIndex = i + INPUT_TENSOR_OFFSET; + auto size = GetInputSize(inputTensorIndex); + + void* hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; } - if (hostOutput == nullptr) { - ERROR_LOG("Malloc host memory for output[%zu] failed", i); + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + 
ERROR_LOG("Malloc device memory for output[%zu] failed", i); return false; } - hostOutputs_.emplace_back(hostOutput); } - return true; + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); } + return true; +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void* devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); - bool OpRunner::Init() - { - for (size_t i = 0; i < numInputs_; ++i) { - auto size = GetInputSize(i); - void *devMem = nullptr; - if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + void* hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { ERROR_LOG("Malloc device memory for input[%zu] failed", i); return false; } - devInputs_.emplace_back(devMem); - inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); - - void *hostInput = nullptr; - if (g_isDevice) { - if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for input[%zu] failed", i); - return false; - } - } else { - if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for input[%zu] failed", i); - return false; - } - } - if (hostInput == nullptr) { - ERROR_LOG("Malloc memory for input[%zu] failed", i); - return false; - } - hostInputs_.emplace_back(hostInput); - - aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), - nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), - GetInputNumDims(i), devInputs_[i]); - if (inputTensor == nullptr) { - ERROR_LOG("Create Tensor for input[%zu] failed", i); + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); return false; } - inputTensor_.emplace_back(inputTensor); } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); - return InitOutputInfo(); + aclTensor* inputTensor = + aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0, + GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); } - const size_t OpRunner::NumInputs() - { - return numInputs_; - } + return InitOutputInfo(); +} - const size_t OpRunner::NumOutputs() - { - return numOutputs_; - } +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} - const size_t OpRunner::GetInputSize(size_t index) const - { - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return 0; - } - return aclGetTensorDescSize(opDesc_->inputDesc[index]); - } +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} - const size_t OpRunner::GetInputNumDims(size_t index) const - { - if (index >= numInputs_) { - ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); - return 0; - } - return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; } - - aclDataType OpRunner::GetInputDataType(size_t index) const - { - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return ACL_DT_UNDEFINED; - } - return aclGetTensorDescType(opDesc_->inputDesc[index]); + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; } - - aclFormat OpRunner::GetInputFormat(size_t index) const - { - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return ACL_FORMAT_UNDEFINED; - } - return aclGetTensorDescFormat(opDesc_->inputDesc[index]); + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; } - - std::vector OpRunner::GetInputShape(size_t index) const - { - std::vector ret; - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return ret; - } - - auto desc = opDesc_->inputDesc[index]; - for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { - int64_t dimSize; - if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { - ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); - ret.clear(); - return ret; - } - ret.emplace_back(dimSize); - } + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return ret; } - size_t OpRunner::GetOutputSize(size_t index) const - { - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return 0; + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; } - return aclGetTensorDescSize(opDesc_->outputDesc[index]); + ret.emplace_back(dimSize); } - - const size_t OpRunner::GetOutputNumDims(size_t index) const - { - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return 0; - } - return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; } - - aclDataType OpRunner::GetOutputDataType(size_t index) const - { - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return ACL_DT_UNDEFINED; - } - return aclGetTensorDescType(opDesc_->outputDesc[index]); + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; } + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} - aclFormat OpRunner::GetOutputFormat(size_t index) const - { - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return ACL_FORMAT_UNDEFINED; - } - - return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; } - std::vector OpRunner::GetOutputShape(size_t index) const - { - std::vector ret; - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); return ret; } - - auto desc = opDesc_->outputDesc[index]; - for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { - int64_t dimSize; - if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { - ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); - ret.clear(); - return ret; - } - ret.emplace_back(dimSize); - } - return ret; + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; } - size_t OpRunner::GetInputElementCount(size_t index) const - { - if (index >= opDesc_->inputDesc.size()) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return 0; - } + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} - return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; } - - size_t OpRunner::GetOutputElementCount(size_t index) const - { - if (index >= opDesc_->outputDesc.size()) { - ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); - return 0; + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; } - return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); + INFO_LOG("Copy input[%zu] success", i); } - bool OpRunner::RunOp() - { - for (size_t i = 0; i < numInputs_; ++i) { - auto size = GetInputSize(i); - aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; - if (g_isDevice) { - kind = ACL_MEMCPY_DEVICE_TO_DEVICE; - } - if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { - ERROR_LOG("Copy input[%zu] failed", i); - return false; - } - INFO_LOG("Copy input[%zu] success", i); - } + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor* handle = nullptr; + auto ret = aclnnLazyAdamGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], + inputTensor_[4], inputTensor_[5], opDesc_->beta1, opDesc_->beta2, + opDesc_->epsilon, &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); - aclrtStream stream = nullptr; - if (aclrtCreateStream(&stream) != ACL_SUCCESS) { - ERROR_LOG("Create stream failed"); - return false; + void* workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); } - INFO_LOG("Create stream success"); - - size_t workspaceSize = 0; - aclOpExecutor *handle = nullptr; - auto ret = aclnnLazyAdamGetWorkspaceSize(inputTensor_[0], inputTensor_[1], - inputTensor_[2], inputTensor_[3], inputTensor_[4], inputTensor_[5], - opDesc_->beta1, opDesc_->beta2, opDesc_->epsilon, - &workspaceSize, &handle); - if (ret != ACL_SUCCESS) { - (void) aclrtDestroyStream(stream); - ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); - return false; - } - INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); + } - void *workspace = nullptr; - if (workspaceSize != 0) { - if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory failed"); - } - } + ret = aclnnLazyAdam(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustom success"); - ret = aclnnLazyAdam(workspace, workspaceSize, handle, stream); - if (ret != ACL_SUCCESS) { - (void) aclrtDestroyStream(stream); - ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); - return false; - } - INFO_LOG("Execute aclnnAddCustom success"); + ret = aclrtSynchronizeStreamWithTimeout(stream, STREAM_TIMEOUT); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. 
error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); - ret = aclrtSynchronizeStreamWithTimeout(stream, STREAM_TIMEOUT); - if (ret != SUCCESS) { - ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); - (void) aclrtDestroyStream(stream); - return false; + // 把输入数据:inputM inputV inputVar 作为输出数据拷贝出来 + for (size_t i = 0; i < OUTPUT_SIZE; ++i) { + int inputTensorIndex = i + INPUT_TENSOR_OFFSET; // 加上输入tensor偏移值 + auto size = GetInputSize(inputTensorIndex); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; } - INFO_LOG("Synchronize stream success"); - - // 把输入数据:inputM inputV inputVar 作为输出数据拷贝出来 - for (size_t i = 0; i < OUTPUT_SIZE; ++i) { - int inputTensorIndex = i + INPUT_TENSOR_OFFSET; // 加上输入tensor偏移值 - auto size = GetInputSize(inputTensorIndex); - aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; - if (g_isDevice) { - kind = ACL_MEMCPY_DEVICE_TO_DEVICE; - } - if (aclrtMemcpy(hostOutputs_[i], size, devInputs_[inputTensorIndex], size, kind) != ACL_SUCCESS) { - INFO_LOG("Copy output[%zu] success", i); - (void) aclrtDestroyStream(stream); - return false; - } + if (aclrtMemcpy(hostOutputs_[i], size, devInputs_[inputTensorIndex], size, kind) != ACL_SUCCESS) { INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; } - - (void) aclrtDestroyStream(stream); - return true; + INFO_LOG("Copy output[%zu] success", i); } + (void)aclrtDestroyStream(stream); + return true; +} - template - void DoPrintData(const T *data, size_t count, size_t elementsPerRow) - { - if (elementsPerRow == 0) { - throw std::runtime_error("value must not be zero."); - } - for (size_t i = 0; i < count; ++i) { - std::cout << std::setw(PRINT_OUT_WIDTH) << data[i]; - if (i % elementsPerRow == elementsPerRow - 1) { - std::cout << std::endl; - } - } +template +void DoPrintData(const T* data, size_t count, size_t elementsPerRow) +{ + if (elementsPerRow == 0) { + throw std::runtime_error("value must not be zero."); } - - void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) - { - if (elementsPerRow == 0) { - throw std::runtime_error("value must not be zero."); - } - for (size_t i = 0; i < count; ++i) { - std::cout << std::setw(PRINT_OUT_WIDTH) << std::setprecision(PRINT_OUT_PRECISION) - << aclFloat16ToFloat(data[i]); - if (i % elementsPerRow == elementsPerRow - 1) { - std::cout << std::endl; - } + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(PRINT_OUT_WIDTH) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; } } +} - void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) - { - if (data == nullptr) { - ERROR_LOG("Print data failed. 
data is nullptr"); - return; +void DoPrintFp16Data(const aclFloat16* data, size_t count, size_t elementsPerRow) +{ + if (elementsPerRow == 0) { + throw std::runtime_error("value must not be zero."); + } + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(PRINT_OUT_WIDTH) << std::setprecision(PRINT_OUT_PRECISION) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; } + } +} - switch (dataType) { - case ACL_BOOL: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT8: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT8: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT16: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT16: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT32: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT32: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT64: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT64: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_FLOAT16: - DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_FLOAT: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_DOUBLE: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - default: - ERROR_LOG("Unsupported type: %d", dataType); - } +void PrintData(const void* data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; } - void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) - { - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); - return; - } + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} - auto desc = opDesc_->inputDesc[index]; - PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numInputs_); + return; } - void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) - { - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return; - } + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} - auto desc = opDesc_->outputDesc[index]; - PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return; } -} \ No newline at end of file + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} +} // namespace AclnnLazyAdam \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp index dad4ab0f..13602e17 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp @@ -13,46 +13,45 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "common.h" #include "operator_desc.h" +#include "common.h" + namespace AclnnLazyAdam { - using namespace std; - - OperatorDesc::OperatorDesc() - {} - - OperatorDesc::~OperatorDesc() - { - for (auto* desc: inputDesc) { - aclDestroyTensorDesc(desc); - } - for (auto* desc: outputDesc) { - aclDestroyTensorDesc(desc); - } +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto* desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + for (auto* desc : outputDesc) { + aclDestroyTensorDesc(desc); } +} - OperatorDesc& OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t* dims, - aclFormat format) - { - aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format); - if (desc == nullptr) { - ERROR_LOG("create tensor failed"); - return *this; - } - inputDesc.emplace_back(desc); +OperatorDesc& OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t* dims, aclFormat format) +{ + aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); return *this; } - - OperatorDesc& OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, - const int64_t* dims, aclFormat format) - { - aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format); - if (desc == nullptr) { - ERROR_LOG("create tensor failed"); - return *this; - } - outputDesc.emplace_back(desc); + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc& OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t* dims, + aclFormat format) +{ + aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); return *this; } -} \ No newline at end of file + outputDesc.emplace_back(desc); + return *this; +} +} // namespace AclnnLazyAdam \ No newline at end of file diff --git 
a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp index 34fc9c7e..77826029 100644 --- a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp @@ -18,207 +18,206 @@ See the License for the specific language governing permissions and #include "tiling/platform/platform_ascendc.h" namespace optiling { - constexpr int BLOCK_SIZE = 32; - constexpr int RESERVE_UB_SIZE = 20 * 1024; - constexpr int DATA_NUM_PER_COMPUTE = 8; - constexpr int32_t USR_SIZE = 256; - constexpr int32_t SYS_WORKSPACE_SIZE = 16 * 1024 * 1024; - - template - static ge::graphStatus CheckNullPointer(T* pointer, const char* errorMessage) - { - if (pointer == nullptr) { - printf("%s nullptr\n", errorMessage); - return ge::GRAPH_FAILED; - } - - return ge::GRAPH_SUCCESS; +constexpr int BLOCK_SIZE = 32; +constexpr int RESERVE_UB_SIZE = 20 * 1024; +constexpr int DATA_NUM_PER_COMPUTE = 8; +constexpr int32_t USR_SIZE = 256; +constexpr int32_t SYS_WORKSPACE_SIZE = 16 * 1024 * 1024; + +template +static ge::graphStatus CheckNullPointer(T* pointer, const char* errorMessage) +{ + if (pointer == nullptr) { + printf("%s nullptr\n", errorMessage); + return ge::GRAPH_FAILED; } - static ge::graphStatus LazyAdamTilingFunc(gert::TilingContext* context) - { - size_t* currentWorkspace = context->GetWorkspaceSizes(1); - if (CheckNullPointer(currentWorkspace, "currentWorkspace") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - currentWorkspace[0] = SYS_WORKSPACE_SIZE + USR_SIZE; - - LazyAdamTilingData tiling; - const gert::StorageShape* indicesShape = context->GetInputShape(1); - const gert::StorageShape* inputMShape = context->GetInputShape(2); - uint64_t dim0 = inputMShape->GetStorageShape().GetDim(0); - uint64_t dim1 = indicesShape->GetStorageShape().GetDim(0); - uint64_t dim2 = inputMShape->GetStorageShape().GetDim(1); - ge::DataType inputMDtype = context->GetInputDesc(2)->GetDataType(); - int inputMDtypeSize = ge::GetSizeByDataType(inputMDtype); - ge::DataType indicesDtype = context->GetInputDesc(1)->GetDataType(); - int indicesDtypeSize = ge::GetSizeByDataType(indicesDtype); - - tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); - context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); - auto attrs = context->GetAttrs(); - - float beta1 = *attrs->GetAttrPointer(0); - float beta2 = *attrs->GetAttrPointer(1); - float epsilon = *attrs->GetAttrPointer(2); - - auto platformInfo = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); - uint32_t coreNum = platformInfo.GetCoreNum(); - if (coreNum == 0) { - return ge::GRAPH_FAILED; - } - uint64_t ub; - platformInfo.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub); - ub = ub - RESERVE_UB_SIZE; - // ub大小除以每行的数据大小,得到每次处理的行数 - uint64_t row = ub / (dim2 * inputMDtypeSize * DATA_NUM_PER_COMPUTE + 1 * indicesDtypeSize); - if (row > dim1) { - row = dim1; - } - - // 保证申请的内存是32的倍数并且向上取整 计算方式:(num+31)/32*32 - uint64_t indicesAllocSize = (row * indicesDtypeSize + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; - uint64_t otherAllocSize = (row * inputMDtypeSize * dim2 + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; - // 前 CORE_NUM - 1 个核分配的任务量 - uint64_t batch = dim1 / coreNum; - // 实际使用的核数 - context->SetBlockDim(coreNum); - uint64_t loopCount = batch / row; // CORE_NUM - 1 个核的任务量,除以UB每一次能处理的数据,得到处理次数 - uint64_t rowLeft = batch - row * loopCount; // UB处理 loopCount 那么多次后,分给当前core剩下的数据量 - - // 最后一个核分配的任务量 - uint64_t batchTail = dim1 - batch * (coreNum - 
1); // phy 该写法适配了dim1刚好整除coreNum的情况 - uint64_t loopCountTail = batchTail / row; - uint64_t rowLeftTail = batchTail - row * loopCountTail; - - tiling.set_beta1(beta1); - tiling.set_beta2(beta2); - tiling.set_epsilon(epsilon); - tiling.set_dim0(dim0); - tiling.set_dim1(dim1); - tiling.set_dim2(dim2); - tiling.set_row(row); // 每个ai core一次能分配的数据行数 - tiling.set_indicesAllocSize(indicesAllocSize); // indices大小,用于申请空间 - tiling.set_otherAllocSize(otherAllocSize); // 入参中非indices要申请的空间大小 - tiling.set_batch(batch); // 前CORE_NUM - 1个核分配的任务量 - tiling.set_loopCount(loopCount); // 前CORE_NUM - 1 个核内循环处理次数 - tiling.set_rowLeft(rowLeft); // 前CORE_NUM - 1 个核, 核内处理 loopCount 次后,分给当前core剩下的数据量 - tiling.set_loopCountTail(loopCountTail); // 最后一个核,核内循环次数 - tiling.set_rowLeftTail(rowLeftTail); // 最后一个核,核内循环loopCountTail次后,剩余数据量 - tiling.set_coreNum(coreNum); - - tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); - context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); - - return ge::GRAPH_SUCCESS; + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus LazyAdamTilingFunc(gert::TilingContext* context) +{ + size_t* currentWorkspace = context->GetWorkspaceSizes(1); + if (CheckNullPointer(currentWorkspace, "currentWorkspace") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + currentWorkspace[0] = SYS_WORKSPACE_SIZE + USR_SIZE; + + LazyAdamTilingData tiling; + const gert::StorageShape* indicesShape = context->GetInputShape(1); + const gert::StorageShape* inputMShape = context->GetInputShape(2); + uint64_t dim0 = inputMShape->GetStorageShape().GetDim(0); + uint64_t dim1 = indicesShape->GetStorageShape().GetDim(0); + uint64_t dim2 = inputMShape->GetStorageShape().GetDim(1); + ge::DataType inputMDtype = context->GetInputDesc(2)->GetDataType(); + int inputMDtypeSize = ge::GetSizeByDataType(inputMDtype); + ge::DataType indicesDtype = context->GetInputDesc(1)->GetDataType(); + int indicesDtypeSize = ge::GetSizeByDataType(indicesDtype); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + auto attrs = context->GetAttrs(); + + float beta1 = *attrs->GetAttrPointer(0); + float beta2 = *attrs->GetAttrPointer(1); + float epsilon = *attrs->GetAttrPointer(2); + + auto platformInfo = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t coreNum = platformInfo.GetCoreNum(); + if (coreNum == 0) { + return ge::GRAPH_FAILED; } + uint64_t ub; + platformInfo.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub); + ub = ub - RESERVE_UB_SIZE; + // ub大小除以每行的数据大小,得到每次处理的行数 + uint64_t row = ub / (dim2 * inputMDtypeSize * DATA_NUM_PER_COMPUTE + 1 * indicesDtypeSize); + if (row > dim1) { + row = dim1; + } + + // 保证申请的内存是32的倍数并且向上取整 计算方式:(num+31)/32*32 + uint64_t indicesAllocSize = (row * indicesDtypeSize + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; + uint64_t otherAllocSize = (row * inputMDtypeSize * dim2 + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; + // 前 CORE_NUM - 1 个核分配的任务量 + uint64_t batch = dim1 / coreNum; + // 实际使用的核数 + context->SetBlockDim(coreNum); + uint64_t loopCount = batch / row; // CORE_NUM - 1 个核的任务量,除以UB每一次能处理的数据,得到处理次数 + uint64_t rowLeft = batch - row * loopCount; // UB处理 loopCount 那么多次后,分给当前core剩下的数据量 + + // 最后一个核分配的任务量 + uint64_t batchTail = dim1 - batch * (coreNum - 1); // phy 该写法适配了dim1刚好整除coreNum的情况 + uint64_t loopCountTail = batchTail / row; + uint64_t rowLeftTail = batchTail - row * loopCountTail; + + 
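+    // Worked example of the split above (illustrative numbers, not from a real run):
+    // dim1 = 1000 indices, coreNum = 8, row = 64 rows per UB pass gives
+    //   batch         = 1000 / 8       = 125  rows on each leading core
+    //   loopCount     = 125 / 64       = 1,   rowLeft     = 125 - 64 = 61
+    //   batchTail     = 1000 - 125 * 7 = 125  rows on the last core
+    //   loopCountTail = 125 / 64       = 1,   rowLeftTail = 125 - 64 = 61
+    // i.e. every core makes one full 64-row pass plus one 61-row remainder pass.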
tiling.set_beta1(beta1); + tiling.set_beta2(beta2); + tiling.set_epsilon(epsilon); + tiling.set_dim0(dim0); + tiling.set_dim1(dim1); + tiling.set_dim2(dim2); + tiling.set_row(row); // 每个ai core一次能分配的数据行数 + tiling.set_indicesAllocSize(indicesAllocSize); // indices大小,用于申请空间 + tiling.set_otherAllocSize(otherAllocSize); // 入参中非indices要申请的空间大小 + tiling.set_batch(batch); // 前CORE_NUM - 1个核分配的任务量 + tiling.set_loopCount(loopCount); // 前CORE_NUM - 1 个核内循环处理次数 + tiling.set_rowLeft(rowLeft); // 前CORE_NUM - 1 个核, 核内处理 loopCount 次后,分给当前core剩下的数据量 + tiling.set_loopCountTail(loopCountTail); // 最后一个核,核内循环次数 + tiling.set_rowLeftTail(rowLeftTail); // 最后一个核,核内循环loopCountTail次后,剩余数据量 + tiling.set_coreNum(coreNum); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + + return ge::GRAPH_SUCCESS; } +} // namespace optiling namespace ge { - static ge::graphStatus LazyAdamInferShape(gert::InferShapeContext* context) - { - if (optiling::CheckNullPointer(context, "context") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - - gert::Shape* outputMShape = context->GetOutputShape(0); - if (optiling::CheckNullPointer(outputMShape, "outputMShape") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - const gert::Shape* inputMShape = context->GetInputShape(2); - if (optiling::CheckNullPointer(inputMShape, "inputMShape") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - *outputMShape = *inputMShape; - - gert::Shape* outputVShape = context->GetOutputShape(1); - if (optiling::CheckNullPointer(outputVShape, "outputVShape") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - const gert::Shape* inputVShape = context->GetInputShape(3); - if (optiling::CheckNullPointer(inputVShape, "inputVShape") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - *outputVShape = *inputVShape; - - gert::Shape* outputVarShape = context->GetOutputShape(2); - if (optiling::CheckNullPointer(outputVarShape, "outputVarShape") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - const gert::Shape* inputVarShape = context->GetInputShape(4); - if (optiling::CheckNullPointer(inputVarShape, "inputVarShape") != ge::GRAPH_SUCCESS) { - return ge::GRAPH_FAILED; - } - *outputVarShape = *inputVarShape; - - return GRAPH_SUCCESS; +static ge::graphStatus LazyAdamInferShape(gert::InferShapeContext* context) +{ + if (optiling::CheckNullPointer(context, "context") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; } - static ge::graphStatus LazyAdamInferDataType(gert::InferDataTypeContext* context) - { - return GRAPH_SUCCESS; + gert::Shape* outputMShape = context->GetOutputShape(0); + if (optiling::CheckNullPointer(outputMShape, "outputMShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + const gert::Shape* inputMShape = context->GetInputShape(2); + if (optiling::CheckNullPointer(inputMShape, "inputMShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; } + *outputMShape = *inputMShape; + + gert::Shape* outputVShape = context->GetOutputShape(1); + if (optiling::CheckNullPointer(outputVShape, "outputVShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + const gert::Shape* inputVShape = context->GetInputShape(3); + if (optiling::CheckNullPointer(inputVShape, "inputVShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + *outputVShape = *inputVShape; + + gert::Shape* outputVarShape = context->GetOutputShape(2); + if (optiling::CheckNullPointer(outputVarShape, "outputVarShape") != 
ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + const gert::Shape* inputVarShape = context->GetInputShape(4); + if (optiling::CheckNullPointer(inputVarShape, "inputVarShape") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + *outputVarShape = *inputVarShape; + + return GRAPH_SUCCESS; } +static ge::graphStatus LazyAdamInferDataType(gert::InferDataTypeContext* context) +{ + return GRAPH_SUCCESS; +} +} // namespace ge namespace ops { - class LazyAdam : public OpDef { - public: - explicit LazyAdam(const char* name) : OpDef(name) - { - this->Input("gradient") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Input("indices") - .ParamType(REQUIRED) - .DataType({ge::DT_INT32}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Input("inputM") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Input("inputV") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Input("inputVar") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Input("lr") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Output("inputM") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Output("inputV") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Output("inputVar") - .ParamType(REQUIRED) - .DataType({ge::DT_FLOAT}) - .Format({ge::FORMAT_ND}) - .UnknownShapeFormat({ge::FORMAT_ND}); - this->Attr("beta1").Float(); - this->Attr("beta2").Float(); - this->Attr("epsilon").Float(); - this->SetInferShape(ge::LazyAdamInferShape).SetInferDataType(ge::LazyAdamInferDataType); - this->AICore().SetTiling(optiling::LazyAdamTilingFunc); - this->AICore().AddConfig("ascend910b"); - } - }; - - OP_ADD(LazyAdam); -} +class LazyAdam : public OpDef { +public: + explicit LazyAdam(const char* name) : OpDef(name) + { + this->Input("gradient") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("indices") + .ParamType(REQUIRED) + .DataType({ge::DT_INT32}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputM") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputV") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputVar") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("lr") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputM") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputV") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputVar") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + 
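+        // NOTE: inputM/inputV/inputVar are declared as both inputs and outputs
+        // under the same names, so the optimizer state is updated in place rather
+        // than materialized into fresh tensors; this is also why the aclnn test
+        // copies its results back out of the input device buffers.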
this->Attr("beta1").Float(); + this->Attr("beta2").Float(); + this->Attr("epsilon").Float(); + this->SetInferShape(ge::LazyAdamInferShape).SetInferDataType(ge::LazyAdamInferDataType); + this->AICore().SetTiling(optiling::LazyAdamTilingFunc); + this->AICore().AddConfig("ascend910b"); + } +}; + +OP_ADD(LazyAdam); +} // namespace ops diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h b/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h index 10b11a9a..4f1534a4 100644 --- a/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h @@ -19,23 +19,23 @@ See the License for the specific language governing permissions and namespace optiling { BEGIN_TILING_DATA_DEF(LazyAdamTilingData) - TILING_DATA_FIELD_DEF(float, beta1); - TILING_DATA_FIELD_DEF(float, beta2); - TILING_DATA_FIELD_DEF(float, epsilon); - TILING_DATA_FIELD_DEF(int32_t, dim0); - TILING_DATA_FIELD_DEF(int32_t, dim1); - TILING_DATA_FIELD_DEF(int32_t, dim2); - TILING_DATA_FIELD_DEF(int32_t, row); - TILING_DATA_FIELD_DEF(int32_t, indicesAllocSize); - TILING_DATA_FIELD_DEF(int32_t, otherAllocSize); - TILING_DATA_FIELD_DEF(int32_t, batch); - TILING_DATA_FIELD_DEF(int32_t, loopCount); - TILING_DATA_FIELD_DEF(int32_t, rowLeft); - TILING_DATA_FIELD_DEF(int32_t, loopCountTail); - TILING_DATA_FIELD_DEF(int32_t, rowLeftTail); - TILING_DATA_FIELD_DEF(int32_t, coreNum); +TILING_DATA_FIELD_DEF(float, beta1); +TILING_DATA_FIELD_DEF(float, beta2); +TILING_DATA_FIELD_DEF(float, epsilon); +TILING_DATA_FIELD_DEF(int32_t, dim0); +TILING_DATA_FIELD_DEF(int32_t, dim1); +TILING_DATA_FIELD_DEF(int32_t, dim2); +TILING_DATA_FIELD_DEF(int32_t, row); +TILING_DATA_FIELD_DEF(int32_t, indicesAllocSize); +TILING_DATA_FIELD_DEF(int32_t, otherAllocSize); +TILING_DATA_FIELD_DEF(int32_t, batch); +TILING_DATA_FIELD_DEF(int32_t, loopCount); +TILING_DATA_FIELD_DEF(int32_t, rowLeft); +TILING_DATA_FIELD_DEF(int32_t, loopCountTail); +TILING_DATA_FIELD_DEF(int32_t, rowLeftTail); +TILING_DATA_FIELD_DEF(int32_t, coreNum); END_TILING_DATA_DEF; REGISTER_TILING_DATA_CLASS(LazyAdam, LazyAdamTilingData) -} -#endif // LAZY_ADAM_TILING_H \ No newline at end of file +} // namespace optiling +#endif // LAZY_ADAM_TILING_H \ No newline at end of file -- Gitee From cdd048be9cfb7133001fb8251dee42adea8a531e Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 16:05:50 +0800 Subject: [PATCH 092/302] =?UTF-8?q?=E7=AE=97=E5=AD=90=E6=B3=A8=E5=86=8C?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=B7=BB=E5=8A=A0910c?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/op_host/lazy_adam.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp index 77826029..fb7f86b3 100644 --- a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp @@ -216,6 +216,7 @@ public: this->SetInferShape(ge::LazyAdamInferShape).SetInferDataType(ge::LazyAdamInferDataType); this->AICore().SetTiling(optiling::LazyAdamTilingFunc); this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910c"); } }; -- Gitee From f1538624fb81c35472a4da4a9f340c668aa849cc Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 09:04:49 +0000 Subject: [PATCH 093/302] =?UTF-8?q?!117=20=E8=9E=8D=E5=90=88=E7=AE=97?= =?UTF-8?q?=E5=AD=90aclnn=E9=AA=8C=E8=AF=81=EF=BC=8C=E7=AE=97=E5=AD=90host?= 
=?UTF-8?q?=E4=BE=A7=E5=AE=9E=E7=8E=B0-part2=20*=20=E7=AE=97=E5=AD=90?= =?UTF-8?q?=E6=B3=A8=E5=86=8C=E9=85=8D=E7=BD=AE=E6=B7=BB=E5=8A=A0910c=20*?= =?UTF-8?q?=20clang-format=E6=96=87=E4=BB=B6=E6=A0=BC=E5=BC=8F=E5=8C=96=20?= =?UTF-8?q?*=20=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B92=20*=20=E9=97=A8?= =?UTF-8?q?=E7=A6=81=E4=BF=AE=E6=94=B91=20*=20=E5=87=BA=E5=8C=85=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E8=BF=98=E5=8E=9F=20*=20Merge=20branch=20'develop'=20?= =?UTF-8?q?of=20gitee.com:ascend/mxrec=20into=20develop=20*=20=E8=9E=8D?= =?UTF-8?q?=E5=90=88=E7=AE=97=E5=AD=90aclnn=E9=AA=8C=E8=AF=81-part2=20*=20?= =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B93=20*=20=E9=97=A8=E7=A6=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B92=20*=20aclnn=E6=B5=8B=E8=AF=95=E9=97=A8?= =?UTF-8?q?=E7=A6=81=E4=BF=AE=E6=94=B9=20*=20LazyAdam=E8=9E=8D=E5=90=88?= =?UTF-8?q?=E7=AE=97=E5=AD=90-aclnn=E9=83=A8=E5=88=86=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/inc/op_runner.h | 5 + .../aclnn_lazy_adam_test/src/op_runner.cpp | 452 ++++++++++++++++++ .../src/operator_desc.cpp | 57 +++ cust_op/fused_lazy_adam/lazy_adam.json | 117 +++++ cust_op/fused_lazy_adam/op_host/lazy_adam.cpp | 224 +++++++++ .../op_host/lazy_adam_tiling.h | 41 ++ 6 files changed, 896 insertions(+) create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp create mode 100644 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp create mode 100644 cust_op/fused_lazy_adam/lazy_adam.json create mode 100644 cust_op/fused_lazy_adam/op_host/lazy_adam.cpp create mode 100644 cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h index 6f91f905..77f0aee5 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h @@ -43,6 +43,11 @@ namespace AclnnLazyAdam { */ bool Init(); + /** + * @brief Init op runner output info + */ + bool InitOutputInfo(); + /** * @brief Get number of inputs * @return number of inputs diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp new file mode 100644 index 00000000..3d737564 --- /dev/null +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp @@ -0,0 +1,452 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/
+
+#include "op_runner.h"
+
+#include <iomanip>
+#include <iostream>
+
+#include "acl/acl_op_compiler.h"
+#include "aclnn_lazy_adam.h"
+#include "common.h"
+
+extern bool g_isDevice;
+
+namespace AclnnLazyAdam {
+using namespace std;
+constexpr int PRINT_OUT_WIDTH = 10;
+constexpr int PRINT_OUT_PRECISION = 4;
+constexpr int STREAM_TIMEOUT = 5000;  // timeout (ms) when waiting for stream tasks to finish
+constexpr int OUTPUT_SIZE = 3;
+constexpr int INPUT_TENSOR_OFFSET = 2;
+
+OpRunner::OpRunner(OperatorDesc* opDesc) : opDesc_(opDesc)
+{
+    numInputs_ = opDesc->inputDesc.size();
+    numOutputs_ = opDesc->outputDesc.size();
+}
+
+OpRunner::~OpRunner()
+{
+    for (size_t i = 0; i < numInputs_; ++i) {
+        (void)aclDestroyTensor(inputTensor_[i]);
+        (void)aclDestroyDataBuffer(inputBuffers_[i]);
+        (void)aclrtFree(devInputs_[i]);
+        if (g_isDevice) {
+            (void)aclrtFree(hostInputs_[i]);
+        } else {
+            (void)aclrtFreeHost(hostInputs_[i]);
+        }
+    }
+    for (size_t i = 0; i < numOutputs_; ++i) {
+        if (g_isDevice) {
+            (void)aclrtFree(hostOutputs_[i]);
+        } else {
+            (void)aclrtFreeHost(hostOutputs_[i]);
+        }
+    }
+}
+
+bool OpRunner::InitOutputInfo()
+{
+    // Hand-rolled output handling: only host-side output buffers are allocated
+    // (the op writes its results into the input device buffers); the destructor
+    // must stay in sync with this.
+    numOutputs_ = OUTPUT_SIZE;
+    for (size_t i = 0; i < numOutputs_; ++i) {
+        int inputTensorIndex = i + INPUT_TENSOR_OFFSET;
+        auto size = GetInputSize(inputTensorIndex);
+
+        void* hostOutput = nullptr;
+        if (g_isDevice) {
+            if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc device memory for output[%zu] failed", i);
+                return false;
+            }
+        } else {
+            if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc host memory for output[%zu] failed", i);
+                return false;
+            }
+        }
+        if (hostOutput == nullptr) {
+            ERROR_LOG("Malloc host memory for output[%zu] failed", i);
+            return false;
+        }
+        hostOutputs_.emplace_back(hostOutput);
+    }
+    return true;
+}
+
+bool OpRunner::Init()
+{
+    for (size_t i = 0; i < numInputs_; ++i) {
+        auto size = GetInputSize(i);
+        void* devMem = nullptr;
+        if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory for input[%zu] failed", i);
+            return false;
+        }
+        devInputs_.emplace_back(devMem);
+        inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size));
+
+        void* hostInput = nullptr;
+        if (g_isDevice) {
+            if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc device memory for input[%zu] failed", i);
+                return false;
+            }
+        } else {
+            if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) {
+                ERROR_LOG("Malloc host memory for input[%zu] failed", i);
+                return false;
+            }
+        }
+        if (hostInput == nullptr) {
+            ERROR_LOG("Malloc memory for input[%zu] failed", i);
+            return false;
+        }
+        hostInputs_.emplace_back(hostInput);
+
+        aclTensor* inputTensor =
+            aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0,
+                            GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]);
+        if (inputTensor == nullptr) {
+            ERROR_LOG("Create Tensor for input[%zu] failed", i);
+            return false;
+        }
+        inputTensor_.emplace_back(inputTensor);
+    }
+
+    return InitOutputInfo();
+}
+
+const size_t OpRunner::NumInputs()
+{
+    return numInputs_;
+}
+
+const size_t OpRunner::NumOutputs()
+{
+    return numOutputs_;
+}
+
+const size_t OpRunner::GetInputSize(size_t index) const
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return 0;
+    }
+    return aclGetTensorDescSize(opDesc_->inputDesc[index]);
+}
+
+const size_t OpRunner::GetInputNumDims(size_t index) const
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return 0;
+    }
+    return aclGetTensorDescNumDims(opDesc_->inputDesc[index]);
+}
+
+aclDataType OpRunner::GetInputDataType(size_t index) const
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return ACL_DT_UNDEFINED;
+    }
+    return aclGetTensorDescType(opDesc_->inputDesc[index]);
+}
+
+aclFormat OpRunner::GetInputFormat(size_t index) const
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return ACL_FORMAT_UNDEFINED;
+    }
+    return aclGetTensorDescFormat(opDesc_->inputDesc[index]);
+}
+
+std::vector<int64_t> OpRunner::GetInputShape(size_t index) const
+{
+    std::vector<int64_t> ret;
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return ret;
+    }
+
+    auto desc = opDesc_->inputDesc[index];
+    for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) {
+        int64_t dimSize;
+        if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) {
+            ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i);
+            ret.clear();
+            return ret;
+        }
+        ret.emplace_back(dimSize);
+    }
+    return ret;
+}
+
+size_t OpRunner::GetOutputSize(size_t index) const
+{
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return 0;
+    }
+    return aclGetTensorDescSize(opDesc_->outputDesc[index]);
+}
+
+const size_t OpRunner::GetOutputNumDims(size_t index) const
+{
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return 0;
+    }
+    return aclGetTensorDescNumDims(opDesc_->outputDesc[index]);
+}
+
+aclDataType OpRunner::GetOutputDataType(size_t index) const
+{
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return ACL_DT_UNDEFINED;
+    }
+    return aclGetTensorDescType(opDesc_->outputDesc[index]);
+}
+
+aclFormat OpRunner::GetOutputFormat(size_t index) const
+{
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return ACL_FORMAT_UNDEFINED;
+    }
+
+    return aclGetTensorDescFormat(opDesc_->outputDesc[index]);
+}
+
+std::vector<int64_t> OpRunner::GetOutputShape(size_t index) const
+{
+    std::vector<int64_t> ret;
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return ret;
+    }
+
+    auto desc = opDesc_->outputDesc[index];
+    for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) {
+        int64_t dimSize;
+        if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) {
+            ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i);
+            ret.clear();
+            return ret;
+        }
+        ret.emplace_back(dimSize);
+    }
+    return ret;
+}
+
+size_t OpRunner::GetInputElementCount(size_t index) const
+{
+    if (index >= opDesc_->inputDesc.size()) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return 0;
+    }
+
+    return aclGetTensorDescElementCount(opDesc_->inputDesc[index]);
+}
+
+size_t OpRunner::GetOutputElementCount(size_t index) const
+{
+    if (index >= opDesc_->outputDesc.size()) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return 0;
+    }
+    return aclGetTensorDescElementCount(opDesc_->outputDesc[index]);
+}
+
+bool OpRunner::RunOp()
+{
+    for (size_t i = 0; i < numInputs_; ++i) {
+        auto size = GetInputSize(i);
+        aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE;
+        if (g_isDevice) {
+            kind = ACL_MEMCPY_DEVICE_TO_DEVICE;
+        }
+        if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) {
+            ERROR_LOG("Copy input[%zu] failed", i);
+            return false;
+        }
+        INFO_LOG("Copy input[%zu] success", i);
+    }
+
+    aclrtStream stream = nullptr;
+    if (aclrtCreateStream(&stream) != ACL_SUCCESS) {
+        ERROR_LOG("Create stream failed");
+        return false;
+    }
+    INFO_LOG("Create stream success");
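+
+    // aclnn execution is two-phase: first query the workspace size and obtain
+    // an executor handle, then launch the op on a stream with that handle.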
+    size_t workspaceSize = 0;
+    aclOpExecutor* handle = nullptr;
+    auto ret = aclnnLazyAdamGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3],
+                                             inputTensor_[4], inputTensor_[5], opDesc_->beta1, opDesc_->beta2,
+                                             opDesc_->epsilon, &workspaceSize, &handle);
+    if (ret != ACL_SUCCESS) {
+        (void)aclrtDestroyStream(stream);
+        ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast<int>(ret));
+        return false;
+    }
+    INFO_LOG("Execute aclnnLazyAdamGetWorkspaceSize success, workspace size %lu", workspaceSize);
+
+    void* workspace = nullptr;
+    if (workspaceSize != 0) {
+        if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) {
+            ERROR_LOG("Malloc device memory failed");
+            (void)aclrtDestroyStream(stream);
+            return false;
+        }
+    }
+
+    ret = aclnnLazyAdam(workspace, workspaceSize, handle, stream);
+    if (ret != ACL_SUCCESS) {
+        (void)aclrtDestroyStream(stream);
+        ERROR_LOG("Execute Operator failed. error code is %d", static_cast<int>(ret));
+        return false;
+    }
+    INFO_LOG("Execute aclnnLazyAdam success");
+
+    ret = aclrtSynchronizeStreamWithTimeout(stream, STREAM_TIMEOUT);
+    if (ret != SUCCESS) {
+        ERROR_LOG("Synchronize stream failed. error code is %d", static_cast<int>(ret));
+        (void)aclrtDestroyStream(stream);
+        return false;
+    }
+    INFO_LOG("Synchronize stream success");
+
+    // Copy the in-place results (inputM, inputV, inputVar) back out as the op outputs
+    for (size_t i = 0; i < OUTPUT_SIZE; ++i) {
+        int inputTensorIndex = i + INPUT_TENSOR_OFFSET;  // skip the gradient and indices tensors
+        auto size = GetInputSize(inputTensorIndex);
+        aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST;
+        if (g_isDevice) {
+            kind = ACL_MEMCPY_DEVICE_TO_DEVICE;
+        }
+        if (aclrtMemcpy(hostOutputs_[i], size, devInputs_[inputTensorIndex], size, kind) != ACL_SUCCESS) {
+            ERROR_LOG("Copy output[%zu] failed", i);
+            (void)aclrtDestroyStream(stream);
+            return false;
+        }
+        INFO_LOG("Copy output[%zu] success", i);
+    }
+
+    (void)aclrtDestroyStream(stream);
+    return true;
+}
+
+template <typename T>
+void DoPrintData(const T* data, size_t count, size_t elementsPerRow)
+{
+    if (elementsPerRow == 0) {
+        throw std::runtime_error("value must not be zero.");
+    }
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(PRINT_OUT_WIDTH) << data[i];
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void DoPrintFp16Data(const aclFloat16* data, size_t count, size_t elementsPerRow)
+{
+    if (elementsPerRow == 0) {
+        throw std::runtime_error("value must not be zero.");
+    }
+    for (size_t i = 0; i < count; ++i) {
+        std::cout << std::setw(PRINT_OUT_WIDTH) << std::setprecision(PRINT_OUT_PRECISION) << aclFloat16ToFloat(data[i]);
+        if (i % elementsPerRow == elementsPerRow - 1) {
+            std::cout << std::endl;
+        }
+    }
+}
+
+void PrintData(const void* data, size_t count, aclDataType dataType, size_t elementsPerRow)
+{
+    if (data == nullptr) {
+        ERROR_LOG("Print data failed. data is nullptr");
+        return;
+    }
+
+    switch (dataType) {
+        case ACL_BOOL:
+            DoPrintData(reinterpret_cast<const bool*>(data), count, elementsPerRow);
+            break;
+        case ACL_INT8:
+            DoPrintData(reinterpret_cast<const int8_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT8:
+            DoPrintData(reinterpret_cast<const uint8_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_INT16:
+            DoPrintData(reinterpret_cast<const int16_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT16:
+            DoPrintData(reinterpret_cast<const uint16_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_INT32:
+            DoPrintData(reinterpret_cast<const int32_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT32:
+            DoPrintData(reinterpret_cast<const uint32_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_INT64:
+            DoPrintData(reinterpret_cast<const int64_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_UINT64:
+            DoPrintData(reinterpret_cast<const uint64_t*>(data), count, elementsPerRow);
+            break;
+        case ACL_FLOAT16:
+            DoPrintFp16Data(reinterpret_cast<const aclFloat16*>(data), count, elementsPerRow);
+            break;
+        case ACL_FLOAT:
+            DoPrintData(reinterpret_cast<const float*>(data), count, elementsPerRow);
+            break;
+        case ACL_DOUBLE:
+            DoPrintData(reinterpret_cast<const double*>(data), count, elementsPerRow);
+            break;
+        default:
+            ERROR_LOG("Unsupported type: %d", dataType);
+    }
+}
+
+void OpRunner::PrintInput(size_t index, size_t numElementsPerRow)
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return;
+    }
+
+    auto desc = opDesc_->inputDesc[index];
+    PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow);
+}
+
+void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow)
+{
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return;
+    }
+
+    auto desc = opDesc_->outputDesc[index];
+    PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow);
+}
+} // namespace AclnnLazyAdam
\ No newline at end of file
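Taken together, op_runner.cpp and the operator_desc.cpp below provide everything the test's main.cpp needs. As an orientation aside, a minimal driver sketch follows; it is illustrative rather than part of the patch: the shapes, attribute values and the `RunTinyLazyAdam` wrapper are made up, and the `beta1`/`beta2`/`epsilon` members of OperatorDesc are assumed from their use in op_runner.cpp above. The real main.cpp builds its descriptors in CreateOpDesc and loads inputs from the generated bin files.

```cpp
// Illustrative driver (sketch): exercises the LazyAdam aclnn test classes
// with tiny made-up shapes. Requires the headers from aclnn_lazy_adam_test.
#include "op_runner.h"
#include "operator_desc.h"

namespace AclnnLazyAdam {
bool RunTinyLazyAdam()
{
    // dim1 = 4 rows to update, dim0 = 16 table rows, dim2 = 8
    // (the embedding dim must be a multiple of 8 per the op constraints).
    const int64_t gradDims[] = {4, 8};
    const int64_t idxDims[] = {4};
    const int64_t stateDims[] = {16, 8};
    const int64_t lrDims[] = {1};

    OperatorDesc desc;
    desc.AddInputTensorDesc(ACL_FLOAT, 2, gradDims, ACL_FORMAT_ND)   // gradient
        .AddInputTensorDesc(ACL_INT32, 1, idxDims, ACL_FORMAT_ND)    // indices
        .AddInputTensorDesc(ACL_FLOAT, 2, stateDims, ACL_FORMAT_ND)  // inputM
        .AddInputTensorDesc(ACL_FLOAT, 2, stateDims, ACL_FORMAT_ND)  // inputV
        .AddInputTensorDesc(ACL_FLOAT, 2, stateDims, ACL_FORMAT_ND)  // inputVar
        .AddInputTensorDesc(ACL_FLOAT, 1, lrDims, ACL_FORMAT_ND);    // lr
    desc.beta1 = 0.9f;
    desc.beta2 = 0.999f;
    desc.epsilon = 1e-8f;

    OpRunner runner(&desc);
    if (!runner.Init()) {          // allocates host and device buffers
        return false;
    }
    runner.GetInputBuffer<float>(5)[0] = 0.001f;  // learning rate
    // ... fill the other five host input buffers here ...
    return runner.RunOp();  // results land in GetOutputBuffer<float>(0..2)
}
} // namespace AclnnLazyAdam
```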
diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp
new file mode 100644
index 00000000..13602e17
--- /dev/null
+++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/operator_desc.cpp
@@ -0,0 +1,57 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "operator_desc.h"
+
+#include "common.h"
+
+namespace AclnnLazyAdam {
+using namespace std;
+
+OperatorDesc::OperatorDesc() {}
+
+OperatorDesc::~OperatorDesc()
+{
+    for (auto* desc : inputDesc) {
+        aclDestroyTensorDesc(desc);
+    }
+    for (auto* desc : outputDesc) {
+        aclDestroyTensorDesc(desc);
+    }
+}
+
+OperatorDesc& OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t* dims, aclFormat format)
+{
+    aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (desc == nullptr) {
+        ERROR_LOG("create tensor failed");
+        return *this;
+    }
+    inputDesc.emplace_back(desc);
+    return *this;
+}
+
+OperatorDesc& OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t* dims,
+                                                aclFormat format)
+{
+    aclTensorDesc* desc = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (desc == nullptr) {
+        ERROR_LOG("create tensor failed");
+        return *this;
+    }
+    outputDesc.emplace_back(desc);
+    return *this;
+}
+} // namespace AclnnLazyAdam
\ No newline at end of file
"format": [ + "ND" + ], + "type": [ + "fp32" + ] + } + ], + "attr": [ + { + "name": "beta1", + "param_type": "required", + "type": "float" + }, + { + "name": "beta2", + "param_type": "required", + "type": "float" + }, + { + "name": "epsilon", + "param_type": "required", + "type": "float" + } + ] + } +] \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp new file mode 100644 index 00000000..fb7f86b3 --- /dev/null +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp @@ -0,0 +1,224 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include "lazy_adam_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { +constexpr int BLOCK_SIZE = 32; +constexpr int RESERVE_UB_SIZE = 20 * 1024; +constexpr int DATA_NUM_PER_COMPUTE = 8; +constexpr int32_t USR_SIZE = 256; +constexpr int32_t SYS_WORKSPACE_SIZE = 16 * 1024 * 1024; + +template +static ge::graphStatus CheckNullPointer(T* pointer, const char* errorMessage) +{ + if (pointer == nullptr) { + printf("%s nullptr\n", errorMessage); + return ge::GRAPH_FAILED; + } + + return ge::GRAPH_SUCCESS; +} + +static ge::graphStatus LazyAdamTilingFunc(gert::TilingContext* context) +{ + size_t* currentWorkspace = context->GetWorkspaceSizes(1); + if (CheckNullPointer(currentWorkspace, "currentWorkspace") != ge::GRAPH_SUCCESS) { + return ge::GRAPH_FAILED; + } + currentWorkspace[0] = SYS_WORKSPACE_SIZE + USR_SIZE; + + LazyAdamTilingData tiling; + const gert::StorageShape* indicesShape = context->GetInputShape(1); + const gert::StorageShape* inputMShape = context->GetInputShape(2); + uint64_t dim0 = inputMShape->GetStorageShape().GetDim(0); + uint64_t dim1 = indicesShape->GetStorageShape().GetDim(0); + uint64_t dim2 = inputMShape->GetStorageShape().GetDim(1); + ge::DataType inputMDtype = context->GetInputDesc(2)->GetDataType(); + int inputMDtypeSize = ge::GetSizeByDataType(inputMDtype); + ge::DataType indicesDtype = context->GetInputDesc(1)->GetDataType(); + int indicesDtypeSize = ge::GetSizeByDataType(indicesDtype); + + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + auto attrs = context->GetAttrs(); + + float beta1 = *attrs->GetAttrPointer(0); + float beta2 = *attrs->GetAttrPointer(1); + float epsilon = *attrs->GetAttrPointer(2); + + auto platformInfo = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); + uint32_t coreNum = platformInfo.GetCoreNum(); + if (coreNum == 0) { + return ge::GRAPH_FAILED; + } + uint64_t ub; + platformInfo.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ub); + ub = ub - RESERVE_UB_SIZE; + // ub大小除以每行的数据大小,得到每次处理的行数 + uint64_t row = ub / (dim2 * inputMDtypeSize * DATA_NUM_PER_COMPUTE + 1 * indicesDtypeSize); + if (row > dim1) { + row = 
+
+namespace ge {
+static ge::graphStatus LazyAdamInferShape(gert::InferShapeContext* context)
+{
+    if (optiling::CheckNullPointer(context, "context") != ge::GRAPH_SUCCESS) {
+        return ge::GRAPH_FAILED;
+    }
+
+    gert::Shape* outputMShape = context->GetOutputShape(0);
+    if (optiling::CheckNullPointer(outputMShape, "outputMShape") != ge::GRAPH_SUCCESS) {
+        return ge::GRAPH_FAILED;
+    }
+    const gert::Shape* inputMShape = context->GetInputShape(2);
+    if (optiling::CheckNullPointer(inputMShape, "inputMShape") != ge::GRAPH_SUCCESS) {
+        return ge::GRAPH_FAILED;
+    }
+    *outputMShape = *inputMShape;
+
+    gert::Shape* outputVShape = context->GetOutputShape(1);
+    if (optiling::CheckNullPointer(outputVShape, "outputVShape") != ge::GRAPH_SUCCESS) {
+        return ge::GRAPH_FAILED;
+    }
+    const gert::Shape* inputVShape = context->GetInputShape(3);
+    if (optiling::CheckNullPointer(inputVShape, "inputVShape") != ge::GRAPH_SUCCESS) {
+        return ge::GRAPH_FAILED;
+    }
+    *outputVShape = *inputVShape;
+
+    gert::Shape* outputVarShape = context->GetOutputShape(2);
+    if (optiling::CheckNullPointer(outputVarShape, "outputVarShape") != ge::GRAPH_SUCCESS) {
+        return ge::GRAPH_FAILED;
+    }
+    const gert::Shape* inputVarShape = context->GetInputShape(4);
+    if (optiling::CheckNullPointer(inputVarShape, "inputVarShape") != ge::GRAPH_SUCCESS) {
+        return ge::GRAPH_FAILED;
+    }
+    *outputVarShape = *inputVarShape;
+
+    return GRAPH_SUCCESS;
+}
+
+static ge::graphStatus LazyAdamInferDataType(gert::InferDataTypeContext* context)
+{
+    return GRAPH_SUCCESS;
+}
+} // namespace ge
+
+namespace ops {
+class LazyAdam : public OpDef {
+public:
+    explicit LazyAdam(const char* name) : OpDef(name)
+    {
+        this->Input("gradient")
+            .ParamType(REQUIRED)
+            .DataType({ge::DT_FLOAT})
+            .Format({ge::FORMAT_ND})
+            .UnknownShapeFormat({ge::FORMAT_ND});
+        this->Input("indices")
+            .ParamType(REQUIRED)
.DataType({ge::DT_INT32}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputM") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputV") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("inputVar") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Input("lr") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputM") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputV") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Output("inputVar") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT}) + .Format({ge::FORMAT_ND}) + .UnknownShapeFormat({ge::FORMAT_ND}); + this->Attr("beta1").Float(); + this->Attr("beta2").Float(); + this->Attr("epsilon").Float(); + this->SetInferShape(ge::LazyAdamInferShape).SetInferDataType(ge::LazyAdamInferDataType); + this->AICore().SetTiling(optiling::LazyAdamTilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend910c"); + } +}; + +OP_ADD(LazyAdam); +} // namespace ops diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h b/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h new file mode 100644 index 00000000..4f1534a4 --- /dev/null +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam_tiling.h @@ -0,0 +1,41 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ + +#ifndef LAZY_ADAM_TILING_H +#define LAZY_ADAM_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(LazyAdamTilingData) +TILING_DATA_FIELD_DEF(float, beta1); +TILING_DATA_FIELD_DEF(float, beta2); +TILING_DATA_FIELD_DEF(float, epsilon); +TILING_DATA_FIELD_DEF(int32_t, dim0); +TILING_DATA_FIELD_DEF(int32_t, dim1); +TILING_DATA_FIELD_DEF(int32_t, dim2); +TILING_DATA_FIELD_DEF(int32_t, row); +TILING_DATA_FIELD_DEF(int32_t, indicesAllocSize); +TILING_DATA_FIELD_DEF(int32_t, otherAllocSize); +TILING_DATA_FIELD_DEF(int32_t, batch); +TILING_DATA_FIELD_DEF(int32_t, loopCount); +TILING_DATA_FIELD_DEF(int32_t, rowLeft); +TILING_DATA_FIELD_DEF(int32_t, loopCountTail); +TILING_DATA_FIELD_DEF(int32_t, rowLeftTail); +TILING_DATA_FIELD_DEF(int32_t, coreNum); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(LazyAdam, LazyAdamTilingData) +} // namespace optiling +#endif // LAZY_ADAM_TILING_H \ No newline at end of file -- Gitee From a16bd07b5553d59befe30ac51ea8a611e5ef09d8 Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Mon, 6 May 2024 09:23:27 +0000 Subject: [PATCH 094/302] !115 cleancode * cleancode * cleancode * cleancode * cleancode --- .../op_host/embedding_lookup_by_address.cpp | 2 +- .../op_kernel/embedding_lookup_by_address.cpp | 9 ++++++--- src/core/emb_table/embedding_ddr.cpp | 6 +++++- src/core/emb_table/embedding_static.cpp | 1 + src/core/utils/config.h | 2 -- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp b/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp index 41a5b33a..722914d3 100644 --- a/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_host/embedding_lookup_by_address.cpp @@ -28,7 +28,7 @@ namespace optiling constexpr int32_t SIZE_OF_HALF = 2; constexpr int32_t SIZE_OF_FLOAT_OR_INT = 4; constexpr int32_t MIN_BLOCK_SIZE = 32; // ub空间的数据都要按照32对齐 - constexpr int32_t UB_LIMIT = 175 * 1024; + constexpr uint32_t UB_LIMIT = 175 * 1024; constexpr int32_t USR_SIZE = 256; constexpr int32_t SYS_WORKSPACE_SIZE = 16 * 1024 * 1024; constexpr int32_t PING_PONG_NUM = 1; diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp index 3fded632..cc45c5be 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp @@ -16,6 +16,8 @@ See the License for the specific language governing permissions and #include "kernel_operator.h" using namespace AscendC; +namespace AscendC { + constexpr int32_t SIZE_OF_HALF = 2; constexpr int32_t SIZE_OF_FLOAT_OR_INT = 4; constexpr int32_t PADDING_ZERO_NUM_PER_TIME = 8; @@ -180,6 +182,7 @@ private: GlobalTensor srcDataBufferGm, dstDataGm; GlobalTensor srcAddrGlobal; }; +} extern "C" __global__ __aicore__ void embedding_lookup_by_address(GM_ADDR address, GM_ADDR y, GM_ADDR usrWorkspace, GM_ADDR tiling) @@ -192,7 +195,7 @@ extern "C" __global__ __aicore__ void embedding_lookup_by_address(GM_ADDR addres { case 0: { - KernelEimtable op; + AscendC::KernelEimtable op; op.Init_param(tiling); op.Init(address, y); op.Process(); @@ -200,7 +203,7 @@ extern "C" __global__ __aicore__ void embedding_lookup_by_address(GM_ADDR addres break; case 2: { - KernelEimtable op; + AscendC::KernelEimtable op; op.Init_param(tiling); 
 op.Init(address, y);
 op.Process();
@@ -208,7 +211,7 @@ extern "C" __global__ __aicore__ void embedding_lookup_by_address(GM_ADDR addres
     break;
     default:
     {
-        KernelEimtable op;
+        AscendC::KernelEimtable op;
         op.Init_param(tiling);
         op.Init(address, y);
         op.Process();
diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp
index 8f529646..24aa07a7 100644
--- a/src/core/emb_table/embedding_ddr.cpp
+++ b/src/core/emb_table/embedding_ddr.cpp
@@ -345,7 +345,11 @@ int EmbeddingDDR::LoadHashMap(const string& savePath)
         LOG_ERROR("malloc failed: {}", strerror(errno));
         return -1;
     }
-    fileSystemPtr->Read(ss.str(), reinterpret_cast<char *>(buf), fileSize);
+    ssize_t result = fileSystemPtr->Read(ss.str(), reinterpret_cast<char *>(buf), fileSize);
+    if (result == -1) {
+        free(static_cast<void *>(buf));
+        return -1;
+    }
 
     size_t loadKeySize = fileSize / sizeof(int64_t);
 
diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp
index 225c90c9..dab8a195 100644
--- a/src/core/emb_table/embedding_static.cpp
+++ b/src/core/emb_table/embedding_static.cpp
@@ -153,6 +153,7 @@ int EmbeddingStatic::LoadKey(const string &savePath)
 
     if (loadOffset.size() > devVocabSize) {
         LOG_ERROR("load key size exceeds device vocab size: {}", strerror(errno));
+        free(static_cast<void *>(buf));
         return -1;
     }
 
diff --git a/src/core/utils/config.h b/src/core/utils/config.h
index 3ecb4c36..fc5536f6 100644
--- a/src/core/utils/config.h
+++ b/src/core/utils/config.h
@@ -16,8 +16,6 @@ See the License for the specific language governing permissions and
 #ifndef MXREC_CONFIG_H
 #define MXREC_CONFIG_H
 
-#include <string>
-
 namespace MxRec {
 namespace RecEnvNames {
 const char *const ACL_TIMEOUT = "AclTimeout";
-- 
Gitee

From 30bd34ebdd69e48b37e9a2c48f6d4755280c8a69 Mon Sep 17 00:00:00 2001
From: penghuiyang <1060916628@qq.com>
Date: Mon, 6 May 2024 17:44:05 +0800
Subject: [PATCH 095/302] =?UTF-8?q?LazyAdam=E8=9E=8D=E5=90=88=E7=AE=97?=
 =?UTF-8?q?=E5=AD=90-part3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cust_op/fused_lazy_adam/README.md           | 129 +++++++++
 cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp | 245 ++++++++++++++
 cust_op/fused_lazy_adam/run.sh              |  53 ++++
 src/ops_tf/hybrid_dataset_ops.cpp           |  18 ++
 4 files changed, 445 insertions(+)
 create mode 100644 cust_op/fused_lazy_adam/README.md
 create mode 100644 cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp
 create mode 100644 cust_op/fused_lazy_adam/run.sh

diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md
new file mode 100644
index 00000000..e0e64d23
--- /dev/null
+++ b/cust_op/fused_lazy_adam/README.md
@@ -0,0 +1,129 @@
+# LazyAdam fused optimizer op and sample notes
+
+## File layout of the fused LazyAdam op
+```shell
+├── aclnn_lazy_adam_test   # standalone single-op test case
+├── lazy_adam.json         # op prototype definition
+├── op_host                # host-side implementation of the fused LazyAdam op
+├── op_kernel              # kernel-side implementation of the fused LazyAdam op
+├── README.md              # this document
+└── run.sh                 # build-and-install script for the fused LazyAdam op
+```
+
+## Ascend C reference material
+For more detail see the official CANN guide: [Ascend C operator development](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0001.html).
+
+## The fused op lazy_adam (same name as the lazy_adam optimizer)
+
+1. Op analysis
+
+a) The op implements the m, v and variable updates performed by the lazy_adam optimizer during the backward pass;
+b) Parameters:
+* gradient: the gradient the optimizer consumes;
+* indices: indices of the rows to compute and update;
+* inputM: the optimizer's first-moment estimate; updated in place;
+* inputV: the optimizer's second-moment estimate; updated in place;
+* inputVar: the variable data of the embedding table; updated in place;
+c) Constraints:
+* supported hardware: Atlas A2 series;
+* supported input data type: float32;
+* the dim of the embedding table must be a multiple of 8;
+
+2. Host-side implementation
+
+The host side lives under fused_lazy_adam/op_host: lazy_adam.cpp and lazy_adam_tiling.h.
+
+a) Tiling
+
+LazyAdamTilingFunc in namespace optiling reads the call information (input pointers, shape info) from the context and validates it, computes the data-split parameters the kernel needs (row, loopCount, batch and so on; see the comments in the tiling file), sets the block dim, and passes the attributes through TilingData.
+
+b) Shape inference
+
+Because the results are written back into the inputs in place, the InferShape and InferDataType bodies in namespace ge are empty.
+
+c) Prototype registration
+
+The LazyAdam class in namespace ops defines the op prototype and registers the op with GE.
+
+3. Kernel-side implementation
+
+The kernel side lives under fused_lazy_adam/op_kernel: lazy_adam.cpp.
+
+a) Kernel entry point: extern "C" __global__ __aicore__ void lazy_adam
+
+b) Tiling parameters: GET_TILING_DATA(tilingData, tiling) fetches the data passed in from the host side
+
+c) Init sets up the data the op runs on;
+
+d) Process moves the data in and computes, then writes the results back into the corresponding inputs;
+
+## AclNN single-op test reference
+
+For more detail see the official CANN overview: [Ascend C single-op invocation](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0036.html).
+
+A single op can be invoked in two ways: through the single-op API or through model execution. mxRec provides the single-op API flow for reference.
+
+The test case lives under fused_lazy_adam/aclnn_lazy_adam_test, where:
+* inc holds the headers
+* scripts holds the Python scripts that generate and verify the data
+* input holds the bin files with the op inputs
+* output holds the generated executable execute_op, the op output bin files and the golden bin files used for verification
+* src holds the shared helpers (common), the input/output description class (operator_desc), the single-op invocation flow (op_runner) and the entry file (main)
+
+Run the single-op test with:
+```shell
+bash run.sh
+```
+
+### Prerequisites
+
+1. Create the op project per [creating an op project with msopgen](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0023.html), and prepare the kernel- and host-side implementations per [kernel-side implementation](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0024.html) and [host-side implementation](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0026.html).
+2. Build and deploy the op per [op build and deployment](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0031.html) with binary compilation enabled: in the project's CMakePresets.json set ENABLE_BINARY_PACKAGE to True. Deploying the op binary into the current environment makes later invocation easier.
+3. Check that the headers and libraries needed by API execution were generated: for mxRec, cust_op/fused_lazy_adam/lazy_adam/build_out/autogen should contain aclnn_lazy_adam.cpp, aclnn_lazy_adam.h and friends.
+
+Note: cust_op/fused_lazy_adam/run.sh deletes the build directory after installing the op. To run the single-op test, comment out its `rm -rf ./lazy_adam` step so prerequisite 3 still holds.
+
+### The fused op lazy_adam
+For the lazy_adam op, in src/main.cpp:
+
+1. InitResource: initializes AscendCL and claims runtime resources; no changes needed
+2. RunLookupOp runs the op:
+
+a) CreateOpDesc builds the op's input/output descriptions; the OperatorDesc class (inc/operator_desc.h) stores the op arguments as member variables so op_runner can use them later;
+
+b) create an OpRunner object and call, in order:
+* opRunner.Init(): allocate memory for the op's input and output data
+* SetInputData(): load the input bin files into the OpRunner buffers for the op to consume
+* opRunner.RunOp(): run the op — copy the input data over, create a stream, execute on the stream, copy the output data back, release the stream
+* ProcessOutputData(): post-process the op outputs and dump them to files for the later golden comparison
+
+3. DestroyResource: frees memory; no changes needed
+
+### The run script
+run.sh performs, in order:
+1. remove leftover generated files and logs
+2. generate the input data and the golden data
+3. build the acl executable
+4. run the executable
+5. compare against the golden files
+
+### The scripts directory
+* gen_data.py: generates the lazy_adam input data and the golden data used for accuracy checking; the dims used for testing can be changed as needed.
+* verify_result.py: compares the op outputs against the golden data and reports the result. The tolerance is loss = 1e-4, checked on:
+
+a) absolute error
+b) relative error
+c) the number of out-of-tolerance elements
+
+The op fails the accuracy bar only when all of the following hold: the absolute errors are not all below loss, the relative errors are not all below loss, and the counts of absolute and of relative errors above loss each exceed 1/10000 of the total element count (the "double one-in-ten-thousand" rule). In every other case the op passes.
+
+The tolerance loss can be adjusted as needed.
\ No newline at end of file
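Before the kernel source below, the update it computes can be written out as a small host-side reference. This is an illustrative sketch, not part of the patch: the function name is made up, it assumes float32 data, and it assumes the indices within one batch are unique (the kernel gathers, updates, then scatters, so duplicate indices would read stale values). A reference like this can serve as a starting point for a golden implementation alongside gen_data.py.

```cpp
// Reference sparse lazy-Adam update (sketch): rows with a negative index are
// skipped, matching the kernel's `index >= 0` check.
#include <cmath>
#include <cstdint>
#include <vector>

void LazyAdamReference(const std::vector<float>& grad,       // [dim1, dim2]
                       const std::vector<int32_t>& indices,  // [dim1]
                       std::vector<float>& m,                // [dim0, dim2]
                       std::vector<float>& v,                // [dim0, dim2]
                       std::vector<float>& var,              // [dim0, dim2]
                       float lr, float beta1, float beta2, float epsilon,
                       int64_t dim1, int64_t dim2)
{
    for (int64_t i = 0; i < dim1; ++i) {
        if (indices[i] < 0) {
            continue;  // padded / invalid row
        }
        const int64_t base = static_cast<int64_t>(indices[i]) * dim2;
        for (int64_t j = 0; j < dim2; ++j) {
            const float g = grad[i * dim2 + j];
            m[base + j] = beta1 * m[base + j] + (1.0f - beta1) * g;
            v[base + j] = beta2 * v[base + j] + (1.0f - beta2) * g * g;
            var[base + j] -= lr * m[base + j] / (std::sqrt(v[base + j]) + epsilon);
        }
    }
}
```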
diff --git a/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp
new file mode 100644
index 00000000..815e6567
--- /dev/null
+++ b/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp
@@ -0,0 +1,245 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "kernel_operator.h"
+
+using namespace AscendC;
+
+template <typename T>
+class LazyAdam {
+public:
+    __aicore__ inline LazyAdam()
+    {}
+
+    // Initialization: binds the GM buffers and fills in the tiling-derived members
+    __aicore__ inline void Init(GM_ADDR gradient, GM_ADDR indices,
+                                GM_ADDR inputM, GM_ADDR inputV, GM_ADDR inputVar, GM_ADDR lr,
+                                GM_ADDR inputMRef, GM_ADDR inputVRef, GM_ADDR inputVarRef,
+                                float beta1, float beta2, float epsilon,
+                                int32_t dim0, int32_t dim1, int32_t dim2,
+                                int32_t row, int32_t indicesAllocSize, int32_t otherAllocSize,
+                                int32_t batch, int32_t loopCount, int32_t rowLeft,
+                                int32_t loopCountTail, int32_t rowLeftTail, int32_t coreNum)
+    {
+        ASSERT(GetBlockNum() != 0 && "block dim can not be zero!");
+        // attributes
+        this->beta1 = beta1;
+        this->beta2 = beta2;
+        this->epsilon = epsilon;
+        // tiling data
+        this->dim0 = dim0;
+        this->dim1 = dim1;
+        this->dim2 = dim2;
+        this->row = row;
+        this->batch = batch;
+        this->loopCount = loopCount;
+        this->rowLeft = rowLeft;
+        this->loopCountTail = loopCountTail;
+        this->rowLeftTail = rowLeftTail;
+        this->coreNum = coreNum;
+        // GM shapes of the inputs
+        int32_t shape = this->dim0 * this->dim2;
+        int32_t shapeIndices = this->dim1 * 1;
+        int32_t shapeGradient = this->dim1 * this->dim2;
+        this->gmGradient.SetGlobalBuffer((__gm__ T *)gradient + this->batch * this->dim2 * get_block_idx(),
+                                         shapeGradient);
+        this->gmIndices.SetGlobalBuffer((__gm__ int32_t *)indices + this->batch * get_block_idx(), shapeIndices);
+
+        this->gmInputM.SetGlobalBuffer((__gm__ T *)inputM, shape);
+        this->gmInputV.SetGlobalBuffer((__gm__ T *)inputV, shape);
+        this->gmInputVar.SetGlobalBuffer((__gm__ T *)inputVar, shape);
+
+        this->gmLearningRate.SetGlobalBuffer((__gm__ T *)lr, sizeof(float));
+        this->lr = this->gmLearningRate.GetValue(0);
+
+        // alias the outputs to the inputs: results are written back in place
+        inputMRef = inputM;
+        inputVRef = inputV;
+        inputVarRef = inputVar;
+
+        // UB allocated per pass; sizes arrive from tiling already 32-byte aligned
+        this->pipe.InitBuffer(this->inQueGradient, 1, otherAllocSize);
+        this->pipe.InitBuffer(this->inQueIndices, 1, indicesAllocSize);
+        this->pipe.InitBuffer(this->queMSlice, 1, otherAllocSize);
+        this->pipe.InitBuffer(this->queVSlice, 1, otherAllocSize);
+        this->pipe.InitBuffer(this->queVarSlice, 1, otherAllocSize);
+
+        this->pipe.InitBuffer(this->calcBufM, otherAllocSize);
+        this->updateM = this->calcBufM.template Get<T>();
+
+        this->pipe.InitBuffer(this->calcBufV, otherAllocSize);
+        this->updateV = this->calcBufV.template Get<T>();
+
+        this->pipe.InitBuffer(this->calcBufVar, otherAllocSize);
+        this->updateVar = this->calcBufVar.template Get<T>();
+
+        this->pipe.InitBuffer(this->calcBuf, otherAllocSize);
+        this->temp = this->calcBuf.template Get<T>();
+    }
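+
+    // Work split (from tiling): cores 0..coreNum-2 each handle `batch` rows;
+    // the last core handles the remainder, batchTail = dim1 - batch * (coreNum - 1).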
+    // Core logic: drives the CopyIn and Compute stages of the vector pipeline
+    __aicore__ inline void Process()
+    {
+        if (get_block_idx() == this->coreNum - 1) {
+            for (int32_t i = 0; i < this->loopCountTail; i++) {
+                CopyIn(i, this->row);
+                Compute(i, this->row);
+            }
+            // tail rows on the last core
+            if (this->rowLeftTail > 0) {
+                CopyIn(this->loopCountTail, this->rowLeftTail);
+                Compute(this->loopCountTail, this->rowLeftTail);
+            }
+        } else {
+            for (int32_t i = 0; i < this->loopCount; i++) {
+                CopyIn(i, this->row);
+                Compute(i, this->row);
+            }
+            // tail rows
+            if (this->rowLeft > 0) {
+                CopyIn(this->loopCount, this->rowLeft);
+                Compute(this->loopCount, this->rowLeft);
+            }
+        }
+    }
+
+private:
+    // CopyIn stage: move one pass worth of gradient rows and indices into UB
+    __aicore__ inline void CopyIn(int32_t progress, int32_t row)
+    {
+        LocalTensor<T> localGradient = this->inQueGradient.template AllocTensor<T>();
+        uint32_t gradientDataLen = row * this->dim2 * sizeof(T);
+        // {1, len, 0, 0, 0}: one contiguous block of len bytes (unaligned copy); strides are reserved fields
+        DataCopyExtParams gradientParams{1, gradientDataLen, 0, 0, 0};
+        // padding parameters for the copy
+        DataCopyPadExtParams<T> gradientPadParams{true, 0, 2, 0};
+        DataCopyPad(localGradient, this->gmGradient[progress * this->row * this->dim2], gradientParams,
+                    gradientPadParams);
+
+        LocalTensor<int32_t> localIndices = this->inQueIndices.template AllocTensor<int32_t>();
+        uint32_t indicesDataLen = row * sizeof(int32_t);
+        DataCopyExtParams indicesParams{1, indicesDataLen, 0, 0, 0};
+        DataCopyPadExtParams<int32_t> indicesPadParams{true, 0, 2, 0};
+        DataCopyPad(localIndices, this->gmIndices[progress * this->row], indicesParams, indicesPadParams);
+
+        this->inQueGradient.EnQue(localGradient);
+        this->inQueIndices.EnQue(localIndices);
+    }
+
+    // Compute stage: gather the selected m/v/var rows, update them, scatter back
+    __aicore__ inline void Compute(int32_t progress, int32_t row)
+    {
+        LocalTensor<T> localGradient = this->inQueGradient.template DeQue<T>();
+        LocalTensor<int32_t> localIndices = this->inQueIndices.template DeQue<int32_t>();
+        Muls(localIndices, localIndices, this->dim2, row);  // turn row indices into element offsets
+        // gather the m/v/var slices selected by indices
+        LocalTensor<T> localMSlice = this->queMSlice.template AllocTensor<T>();
+        LocalTensor<T> localVSlice = this->queVSlice.template AllocTensor<T>();
+        LocalTensor<T> localVarSlice = this->queVarSlice.template AllocTensor<T>();
+
+        pipe_barrier(PIPE_ALL);
+
+        int32_t index = 0;
+        for (int32_t i = 0; i < row; i++) {
+            index = localIndices.GetValue(i);
+            if (index >= 0) {
+                DataCopy(localMSlice[i * this->dim2], gmInputM[index], this->dim2);
+                DataCopy(localVSlice[i * this->dim2], gmInputV[index], this->dim2);
+                DataCopy(localVarSlice[i * this->dim2], gmInputVar[index], this->dim2);
+            }
+        }
+
+        this->queMSlice.EnQue(localMSlice);
+        this->queVSlice.EnQue(localVSlice);
+        this->queVarSlice.EnQue(localVarSlice);
+        localMSlice = this->queMSlice.template DeQue<T>();
+        localVSlice = this->queVSlice.template DeQue<T>();
+        localVarSlice = this->queVarSlice.template DeQue<T>();
+
+        // update m: m = beta1 * m + (1 - beta1) * g
+        Muls(localMSlice, localMSlice, this->beta1, row * this->dim2);
+        Muls(this->updateM, localGradient, (1 - this->beta1), row * this->dim2);
+        this->updateM = localMSlice + this->updateM;
+
+        // update v: v = beta2 * v + (1 - beta2) * g * g
+        Muls(localVSlice, localVSlice, this->beta2, row * this->dim2);
+        Mul(this->updateV, localGradient, localGradient, row * this->dim2);
+        Muls(this->updateV, this->updateV, (1 - this->beta2), row * this->dim2);
+        this->updateV = localVSlice + this->updateV;
+
+        // update var: var -= lr * m / (sqrt(v) + epsilon)
+        Sqrt(this->updateVar, this->updateV, row * this->dim2);
+        Adds(this->updateVar, this->updateVar, this->epsilon, row * this->dim2);
+        Muls(this->temp, this->updateM, -this->lr, row * this->dim2);
+        Div(this->updateVar, this->temp, this->updateVar, row * this->dim2);
+        Add(this->updateVar, this->updateVar, localVarSlice, row * this->dim2);
+
+        pipe_barrier(PIPE_ALL);
+
+        // scatter the results back into the input tensors (in-place update)
+        for (int32_t i = 0; i < row; i++) {
+            index = localIndices.GetValue(i);
+            if (index >= 0) {
+                // the __GET_CODE_CHANNEL__ guard keeps these copies from being
+                // misidentified as matmul, which would raise an error
+#ifndef __GET_CODE_CHANNEL__
+                DataCopy(this->gmInputM[index], this->updateM[i * this->dim2], this->dim2);
+                DataCopy(this->gmInputV[index], this->updateV[i * this->dim2], this->dim2);
+                DataCopy(this->gmInputVar[index], this->updateVar[i * this->dim2], this->dim2);
+#endif
+            }
+        }
+        pipe_barrier(PIPE_ALL);
+
+        this->inQueGradient.FreeTensor(localGradient);
+        this->queMSlice.FreeTensor(localMSlice);
+        this->queVSlice.FreeTensor(localVSlice);
+        this->queVarSlice.FreeTensor(localVarSlice);
+        this->inQueIndices.FreeTensor(localIndices);
+    }
+
+private:
+    float lr, beta1, beta2, epsilon;
+    int32_t dim0, dim1, dim2, row, batch, loopCount, rowLeft, loopCountTail, rowLeftTail, coreNum;
+    LocalTensor<T> updateM, updateV, updateVar, temp;
+    LocalTensor<int32_t> localIndices;
+    GlobalTensor<T> gmGradient, gmInputM, gmInputV, gmInputVar;
+    GlobalTensor<int32_t> gmIndices;
+    GlobalTensor<T> gmLearningRate;
+    TPipe pipe;
+    TQue<QuePosition::VECIN, 1> inQueGradient, inQueIndices;
+    TQue<QuePosition::VECIN, 1> queMSlice, queVSlice, queVarSlice;
+    TBuf<TPosition::VECCALC> calcBufM;
+    TBuf<TPosition::VECCALC> calcBufV;
+    TBuf<TPosition::VECCALC> calcBufVar;
+    TBuf<TPosition::VECCALC> calcBuf;
+};
+
+extern "C" __global__ __aicore__ void lazy_adam(GM_ADDR gradient, GM_ADDR indices,
+                                                GM_ADDR inputM, GM_ADDR inputV, GM_ADDR inputVar, GM_ADDR lr,
+                                                GM_ADDR inputMRef, GM_ADDR inputVRef, GM_ADDR inputVarRef,
+                                                GM_ADDR workspace, GM_ADDR tiling)
+{
+    GET_TILING_DATA(tiling_data, tiling);
+    LazyAdam<float> op32;
+    op32.Init(gradient, indices,
+              inputM, inputV, inputVar, lr,
+              inputMRef, inputVRef, inputVarRef,
+              tiling_data.beta1, tiling_data.beta2, tiling_data.epsilon,
+              tiling_data.dim0, tiling_data.dim1, tiling_data.dim2,
+              tiling_data.row, tiling_data.indicesAllocSize, tiling_data.otherAllocSize,
+              tiling_data.batch, tiling_data.loopCount, tiling_data.rowLeft,
+              tiling_data.loopCountTail, tiling_data.rowLeftTail, tiling_data.coreNum);
+    op32.Process();
}
\ No newline at end of file
diff --git a/cust_op/fused_lazy_adam/run.sh b/cust_op/fused_lazy_adam/run.sh
new file mode 100644
index 00000000..c1e80ce5
--- /dev/null
+++ b/cust_op/fused_lazy_adam/run.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================

+source /etc/profile
+
+# locate msopgen and add its directory to PATH
+msopgen_path=$(find /usr/local/Ascend/ -name msopgen | grep bin)
+parent_dir=$(dirname "$msopgen_path")
+export PATH=$parent_dir:$PATH
+
+# generate the buildable op project with msopgen
+rm -rf ./lazy_adam
+msopgen gen -i lazy_adam.json -f tf -c ai_core-Ascend910B1 -lan cpp -out ./lazy_adam -m 0 -op LazyAdam
+
+cp -rf op_kernel lazy_adam/
+cp -rf op_host lazy_adam/
+
+cd lazy_adam
+
+# make sure CMakePresets.json exists in the generated project
+if [ ! -f "CMakePresets.json" ]; then
+    echo "ERROR: CMakePresets.json file does not exist."
+ exit 1 +fi + +# 禁止生成CRC校验和 +sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake + +# 修改cann安装路径 +sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json +# 修改vendor_name 防止覆盖之前vendor_name为customize的算子 +sed -i 's:"customize":"customize_lazy_adam":g' CMakePresets.json + +bash build.sh + +# 安装编译成功的算子包 +bash ./build_out/custom_opp*.run + +cd .. +rm -rf ./lazy_adam diff --git a/src/ops_tf/hybrid_dataset_ops.cpp b/src/ops_tf/hybrid_dataset_ops.cpp index c3687e8a..2eee8531 100644 --- a/src/ops_tf/hybrid_dataset_ops.cpp +++ b/src/ops_tf/hybrid_dataset_ops.cpp @@ -640,4 +640,22 @@ namespace tensorflow { }); REGISTER_KERNEL_BUILDER(Name("EmbeddingUpdateByAddress").Device(DEVICE_CPU), MxRec::CustOps); + + // ######################## tf注册LazyAdam融合算子同名算子 ######################## + REGISTER_OP("LazyAdam") + .Input("gradient: float32") + .Input("indices: int32") + .Input("input_m: float32") + .Input("input_v: float32") + .Input("input_var: float32") + .Input("lr: float32") + .Attr("beta1: float") + .Attr("beta2: float") + .Attr("epsilon: float") + .Output("output_m: float32") + .Output("output_v: float32") + .Output("output_var: float32") + .SetIsStateful() + .SetShapeFn(::tensorflow::shape_inference::UnknownShape); + REGISTER_KERNEL_BUILDER(Name("LazyAdam").Device(DEVICE_CPU), MxRec::CustOps); } \ No newline at end of file -- Gitee From b84fd8fbb628fd42138186575e872e92b6a8f3ce Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 19:30:55 +0800 Subject: [PATCH 096/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=9C=AA=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E5=88=B0=E7=9A=84Print=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/inc/op_runner.h | 42 +- .../aclnn_lazy_adam_test/src/op_runner.cpp | 632 ++++++++---------- 2 files changed, 281 insertions(+), 393 deletions(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h index 77f0aee5..2e25341f 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/inc/op_runner.h @@ -31,7 +31,7 @@ namespace AclnnLazyAdam { * @brief Constructor * @param [in] opDesc: op description */ - explicit OpRunner(OperatorDesc *opDesc); + explicit OpRunner(OperatorDesc* opDesc); /** * @brief Destructor @@ -121,13 +121,13 @@ namespace AclnnLazyAdam { * @return host address of the input */ template - T *GetInputBuffer(size_t index) + T* GetInputBuffer(size_t index) { if (index >= numInputs_) { ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); return nullptr; } - return reinterpret_cast(hostInputs_[index]); + return reinterpret_cast(hostInputs_[index]); } /** @@ -137,30 +137,16 @@ namespace AclnnLazyAdam { * @return host address of the output */ template - const T *GetOutputBuffer(size_t index) + const T* GetOutputBuffer(size_t index) { if (index >= numOutputs_) { ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); return nullptr; } - return reinterpret_cast(hostOutputs_[index]); + return reinterpret_cast(hostOutputs_[index]); } - /** - * @brief Print readable input by index - * @param [in] index: input index - * @param [in] numElementsPerRow: number of elements per row - */ - void PrintInput(size_t index, size_t numElementsPerRow = 16); - - /** - * @brief Print readable output by index - * @param [in] index: output index - * @param [in] numElementsPerRow: number of elements per row - */ - void PrintOutput(size_t index, size_t numElementsPerRow = 16); - /** * @brief Compile static op * @return compile result @@ -183,18 +169,18 @@ namespace AclnnLazyAdam { size_t numInputs_; size_t numOutputs_; - std::vector inputBuffers_; - std::vector outputBuffers_; + std::vector inputBuffers_; + std::vector outputBuffers_; - std::vector devInputs_; - std::vector devOutputs_; + std::vector devInputs_; + std::vector devOutputs_; - std::vector hostInputs_; - std::vector hostOutputs_; + std::vector hostInputs_; + std::vector hostOutputs_; - std::vector inputTensor_; - std::vector outputTensor_; - OperatorDesc *opDesc_; + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc* opDesc_; }; } #endif // OP_RUNNER_H diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp index 3d737564..e9711379 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp @@ -25,428 +25,330 @@ See the License for the specific language governing permissions and extern bool g_isDevice; namespace AclnnLazyAdam { -using namespace std; -constexpr int PRINT_OUT_WIDTH = 10; -constexpr int PRINT_OUT_PRECISION = 4; -constexpr int STREAM_TIMEOUT = 5000; // 等待Stream任务完成,超时时间单位:ms -constexpr int OUTPUT_SIZE = 3; -constexpr int INPUT_TENSOR_OFFSET = 2; - -OpRunner::OpRunner(OperatorDesc* opDesc) : opDesc_(opDesc) -{ - numInputs_ = opDesc->inputDesc.size(); - numOutputs_ = opDesc->outputDesc.size(); -} - -OpRunner::~OpRunner() -{ - for (size_t i = 0; i < numInputs_; ++i) { - (void)aclDestroyTensor(inputTensor_[i]); - (void)aclDestroyDataBuffer(inputBuffers_[i]); - (void)aclrtFree(devInputs_[i]); - if (g_isDevice) { - (void)aclrtFree(hostInputs_[i]); - } else { - (void)aclrtFreeHost(hostInputs_[i]); + using namespace std; + constexpr int PRINT_OUT_WIDTH = 10; + constexpr int PRINT_OUT_PRECISION = 4; + constexpr int STREAM_TIMEOUT = 5000; // 等待Stream任务完成,超时时间单位:ms + constexpr int OUTPUT_SIZE = 3; + constexpr int INPUT_TENSOR_OFFSET = 2; + + OpRunner::OpRunner(OperatorDesc* opDesc) : opDesc_(opDesc) + { + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); + } + + OpRunner::~OpRunner() + { + for (size_t i = 0; i < numInputs_; ++i) { + (void) aclDestroyTensor(inputTensor_[i]); + (void) aclDestroyDataBuffer(inputBuffers_[i]); + (void) aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void) aclrtFree(hostInputs_[i]); + } else { + (void) aclrtFreeHost(hostInputs_[i]); + } } - } - for (size_t i = 0; i < numOutputs_; ++i) { - if (g_isDevice) { - (void)aclrtFree(hostOutputs_[i]); - } else { - (void)aclrtFreeHost(hostOutputs_[i]); + for (size_t i = 0; i < numOutputs_; ++i) { + if (g_isDevice) { + (void) aclrtFree(hostOutputs_[i]); + } else { + (void) aclrtFreeHost(hostOutputs_[i]); + } } } -} - -bool OpRunner::InitOutputInfo() -{ - // 手动修改输出数据实现,仅申请host上的输出数据空间,析构出需同时适配 - numOutputs_ = OUTPUT_SIZE; - 
for (size_t i = 0; i < numOutputs_; ++i) { - int inputTensorIndex = i + INPUT_TENSOR_OFFSET; - auto size = GetInputSize(inputTensorIndex); - - void* hostOutput = nullptr; - if (g_isDevice) { - if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); - return false; + + bool OpRunner::InitOutputInfo() + { + // 手动修改输出数据实现,仅申请host上的输出数据空间,析构出需同时适配 + numOutputs_ = OUTPUT_SIZE; + for (size_t i = 0; i < numOutputs_; ++i) { + int inputTensorIndex = i + INPUT_TENSOR_OFFSET; + auto size = GetInputSize(inputTensorIndex); + + void* hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } } - } else { - if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for output[%zu] failed", i); + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); return false; } + hostOutputs_.emplace_back(hostOutput); } - if (hostOutput == nullptr) { - ERROR_LOG("Malloc host memory for output[%zu] failed", i); - return false; - } - hostOutputs_.emplace_back(hostOutput); + return true; } - return true; -} - -bool OpRunner::Init() -{ - for (size_t i = 0; i < numInputs_; ++i) { - auto size = GetInputSize(i); - void* devMem = nullptr; - if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for input[%zu] failed", i); - return false; - } - devInputs_.emplace_back(devMem); - inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); - void* hostInput = nullptr; - if (g_isDevice) { - if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + bool OpRunner::Init() + { + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void* devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { ERROR_LOG("Malloc device memory for input[%zu] failed", i); return false; } - } else { - if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory for input[%zu] failed", i); + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void* hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); return false; } - } - if (hostInput == nullptr) { - ERROR_LOG("Malloc memory for input[%zu] failed", i); - return false; - } - hostInputs_.emplace_back(hostInput); + hostInputs_.emplace_back(hostInput); - aclTensor* inputTensor = - aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0, - GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); - if (inputTensor == nullptr) { - ERROR_LOG("Create Tensor for input[%zu] failed", i); - return false; + aclTensor* inputTensor = + aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), 
GetInputDataType(i), nullptr, 0, + GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); } - inputTensor_.emplace_back(inputTensor); - } - - return InitOutputInfo(); -} - -const size_t OpRunner::NumInputs() -{ - return numInputs_; -} -const size_t OpRunner::NumOutputs() -{ - return numOutputs_; -} - -const size_t OpRunner::GetInputSize(size_t index) const -{ - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return 0; - } - return aclGetTensorDescSize(opDesc_->inputDesc[index]); -} - -const size_t OpRunner::GetInputNumDims(size_t index) const -{ - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return 0; - } - return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); -} - -aclDataType OpRunner::GetInputDataType(size_t index) const -{ - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return ACL_DT_UNDEFINED; + return InitOutputInfo(); } - return aclGetTensorDescType(opDesc_->inputDesc[index]); -} - -aclFormat OpRunner::GetInputFormat(size_t index) const -{ - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return ACL_FORMAT_UNDEFINED; + + const size_t OpRunner::NumInputs() + { + return numInputs_; } - return aclGetTensorDescFormat(opDesc_->inputDesc[index]); -} - -std::vector OpRunner::GetInputShape(size_t index) const -{ - std::vector ret; - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return ret; + + const size_t OpRunner::NumOutputs() + { + return numOutputs_; } - auto desc = opDesc_->inputDesc[index]; - for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { - int64_t dimSize; - if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { - ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); - ret.clear(); - return ret; + const size_t OpRunner::GetInputSize(size_t index) const + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; } - ret.emplace_back(dimSize); - } - return ret; -} - -size_t OpRunner::GetOutputSize(size_t index) const -{ - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return 0; + return aclGetTensorDescSize(opDesc_->inputDesc[index]); } - return aclGetTensorDescSize(opDesc_->outputDesc[index]); -} - -const size_t OpRunner::GetOutputNumDims(size_t index) const -{ - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return 0; + + const size_t OpRunner::GetInputNumDims(size_t index) const + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); } - return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); -} - -aclDataType OpRunner::GetOutputDataType(size_t index) const -{ - if (index >= numOutputs_) { - ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); - return ACL_DT_UNDEFINED; + + aclDataType OpRunner::GetInputDataType(size_t index) const + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + return aclGetTensorDescType(opDesc_->inputDesc[index]); } - return aclGetTensorDescType(opDesc_->outputDesc[index]); -} - -aclFormat OpRunner::GetOutputFormat(size_t index) const -{ - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return ACL_FORMAT_UNDEFINED; + + aclFormat OpRunner::GetInputFormat(size_t index) const + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); } - return aclGetTensorDescFormat(opDesc_->outputDesc[index]); -} + std::vector OpRunner::GetInputShape(size_t index) const + { + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } -std::vector OpRunner::GetOutputShape(size_t index) const -{ - std::vector ret; - if (index >= numOutputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } return ret; } - auto desc = opDesc_->outputDesc[index]; - for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { - int64_t dimSize; - if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { - ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); - ret.clear(); - return ret; + size_t OpRunner::GetOutputSize(size_t index) const + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; } - ret.emplace_back(dimSize); - } - return ret; -} - -size_t OpRunner::GetInputElementCount(size_t index) const -{ - if (index >= opDesc_->inputDesc.size()) { - ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); - return 0; + return aclGetTensorDescSize(opDesc_->outputDesc[index]); } - return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); -} - -size_t OpRunner::GetOutputElementCount(size_t index) const -{ - if (index >= opDesc_->outputDesc.size()) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); - return 0; - } - return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); -} - -bool OpRunner::RunOp() -{ - for (size_t i = 0; i < numInputs_; ++i) { - auto size = GetInputSize(i); - aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; - if (g_isDevice) { - kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + const size_t OpRunner::GetOutputNumDims(size_t index) const + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
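The accessors in this file repeat the same bounds check and `ERROR_LOG` for every queried property. One way to factor the duplication is a checked descriptor lookup; the helper below is hypothetical (it is not declared in op_runner.h) and only illustrates the refactoring.

```cpp
// Hypothetical helper: returns the descriptor, or nullptr after logging.
const aclTensorDesc* OpRunner::InputDescAt(size_t index) const
{
    if (index >= numInputs_) {
        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
        return nullptr;
    }
    return opDesc_->inputDesc[index];
}

// GetInputSize() would then reduce to:
//   const aclTensorDesc* desc = InputDescAt(index);
//   return desc != nullptr ? aclGetTensorDescSize(desc) : 0;
```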
index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; } - if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { - ERROR_LOG("Copy input[%zu] failed", i); - return false; - } - INFO_LOG("Copy input[%zu] success", i); + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); } - aclrtStream stream = nullptr; - if (aclrtCreateStream(&stream) != ACL_SUCCESS) { - ERROR_LOG("Create stream failed"); - return false; - } - INFO_LOG("Create stream success"); - - size_t workspaceSize = 0; - aclOpExecutor* handle = nullptr; - auto ret = aclnnLazyAdamGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], - inputTensor_[4], inputTensor_[5], opDesc_->beta1, opDesc_->beta2, - opDesc_->epsilon, &workspaceSize, &handle); - if (ret != ACL_SUCCESS) { - (void)aclrtDestroyStream(stream); - ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); - return false; + aclDataType OpRunner::GetOutputDataType(size_t index) const + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + return aclGetTensorDescType(opDesc_->outputDesc[index]); } - INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); - void* workspace = nullptr; - if (workspaceSize != 0) { - if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { - ERROR_LOG("Malloc device memory failed"); + aclFormat OpRunner::GetOutputFormat(size_t index) const + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; } - } - ret = aclnnLazyAdam(workspace, workspaceSize, handle, stream); - if (ret != ACL_SUCCESS) { - (void)aclrtDestroyStream(stream); - ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); - return false; + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); } - INFO_LOG("Execute aclnnAddCustom success"); - ret = aclrtSynchronizeStreamWithTimeout(stream, STREAM_TIMEOUT); - if (ret != SUCCESS) { - ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); - (void)aclrtDestroyStream(stream); - return false; - } - INFO_LOG("Synchronize stream success"); - - // 把输入数据:inputM inputV inputVar 作为输出数据拷贝出来 - for (size_t i = 0; i < OUTPUT_SIZE; ++i) { - int inputTensorIndex = i + INPUT_TENSOR_OFFSET; // 加上输入tensor偏移值 - auto size = GetInputSize(inputTensorIndex); - aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; - if (g_isDevice) { - kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + std::vector OpRunner::GetOutputShape(size_t index) const + { + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; } - if (aclrtMemcpy(hostOutputs_[i], size, devInputs_[inputTensorIndex], size, kind) != ACL_SUCCESS) { - INFO_LOG("Copy output[%zu] success", i); - (void)aclrtDestroyStream(stream); - return false; + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. 
dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); } - INFO_LOG("Copy output[%zu] success", i); + return ret; } - (void)aclrtDestroyStream(stream); - return true; -} + size_t OpRunner::GetInputElementCount(size_t index) const + { + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } -template -void DoPrintData(const T* data, size_t count, size_t elementsPerRow) -{ - if (elementsPerRow == 0) { - throw std::runtime_error("value must not be zero."); + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); } - for (size_t i = 0; i < count; ++i) { - std::cout << std::setw(PRINT_OUT_WIDTH) << data[i]; - if (i % elementsPerRow == elementsPerRow - 1) { - std::cout << std::endl; + + size_t OpRunner::GetOutputElementCount(size_t index) const + { + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; } + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); } -} -void DoPrintFp16Data(const aclFloat16* data, size_t count, size_t elementsPerRow) -{ - if (elementsPerRow == 0) { - throw std::runtime_error("value must not be zero."); - } - for (size_t i = 0; i < count; ++i) { - std::cout << std::setw(PRINT_OUT_WIDTH) << std::setprecision(PRINT_OUT_PRECISION) << aclFloat16ToFloat(data[i]); - if (i % elementsPerRow == elementsPerRow - 1) { - std::cout << std::endl; + bool OpRunner::RunOp() + { + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); } - } -} -void PrintData(const void* data, size_t count, aclDataType dataType, size_t elementsPerRow) -{ - if (data == nullptr) { - ERROR_LOG("Print data failed. data is nullptr"); - return; - } + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor* handle = nullptr; + auto ret = aclnnLazyAdamGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], + inputTensor_[4], inputTensor_[5], opDesc_->beta1, opDesc_->beta2, + opDesc_->epsilon, &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void) aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. 
error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); - switch (dataType) { - case ACL_BOOL: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT8: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT8: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT16: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT16: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT32: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT32: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_INT64: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_UINT64: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_FLOAT16: - DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_FLOAT: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case ACL_DOUBLE: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - default: - ERROR_LOG("Unsupported type: %d", dataType); - } -} + void* workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } -void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) -{ - if (index >= numInputs_) { - ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); - return; - } + ret = aclnnLazyAdam(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void) aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustom success"); - auto desc = opDesc_->inputDesc[index]; - PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); -} + ret = aclrtSynchronizeStreamWithTimeout(stream, STREAM_TIMEOUT); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); + (void) aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + // 把输入数据:inputM inputV inputVar 作为输出数据拷贝出来 + for (size_t i = 0; i < OUTPUT_SIZE; ++i) { + int inputTensorIndex = i + INPUT_TENSOR_OFFSET; // 加上输入tensor偏移值 + auto size = GetInputSize(inputTensorIndex); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devInputs_[inputTensorIndex], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void) aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } -void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) -{ - if (index >= numOutputs_) { - ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); - return; + (void) aclrtDestroyStream(stream); + return true; } - - auto desc = opDesc_->outputDesc[index]; - PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); -} } // namespace AclnnLazyAdam \ No newline at end of file -- Gitee From e86cc03685b2f4ff29ce297fd3b21ab8fcdcac4f Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 19:56:29 +0800 Subject: [PATCH 097/302] =?UTF-8?q?clang=5Fformat=E6=A0=BC=E5=BC=8F?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../fused_lazy_adam/op_kernel/lazy_adam.cpp | 86 +++++++++---------- 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp index 815e6567..76164e50 100644 --- a/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp +++ b/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp @@ -17,21 +17,17 @@ See the License for the specific language governing permissions and using namespace AscendC; -template +template class LazyAdam { public: - __aicore__ inline LazyAdam() - {} + __aicore__ inline LazyAdam() {} // 初始化函数,完成内存初始化相关操作 - __aicore__ inline void Init(GM_ADDR gradient, GM_ADDR indices, - GM_ADDR inputM, GM_ADDR inputV, GM_ADDR inputVar, GM_ADDR lr, - GM_ADDR inputMRef, GM_ADDR inputVRef, GM_ADDR inputVarRef, - float beta1, float beta2, float epsilon, - int32_t dim0, int32_t dim1, int32_t dim2, - int32_t row, int32_t indicesAllocSize, int32_t otherAllocSize, - int32_t batch, int32_t loopCount, int32_t rowLeft, - int32_t loopCountTail, int32_t rowLeftTail, int32_t coreNum) + __aicore__ inline void Init(GM_ADDR gradient, GM_ADDR indices, GM_ADDR inputM, GM_ADDR inputV, GM_ADDR inputVar, + GM_ADDR lr, GM_ADDR inputMRef, GM_ADDR inputVRef, GM_ADDR inputVarRef, float beta1, + float beta2, float epsilon, int32_t dim0, int32_t dim1, int32_t dim2, int32_t row, + int32_t indicesAllocSize, int32_t otherAllocSize, int32_t batch, int32_t loopCount, + int32_t rowLeft, int32_t loopCountTail, int32_t rowLeftTail, int32_t coreNum) { ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); // 属性赋值 @@ -53,15 +49,15 @@ public: int32_t shape = this->dim0 * this->dim2; int32_t shapeIndices = this->dim1 * 1; int32_t shapeGradient = this->dim1 * this->dim2; - this->gmGradient.SetGlobalBuffer((__gm__ T *)gradient + this->batch * this->dim2 * get_block_idx(), + this->gmGradient.SetGlobalBuffer((__gm__ T*)gradient + this->batch * this->dim2 * get_block_idx(), shapeGradient); - this->gmIndices.SetGlobalBuffer((__gm__ int32_t *)indices + this->batch * get_block_idx(), shapeIndices); + this->gmIndices.SetGlobalBuffer((__gm__ int32_t*)indices + this->batch * get_block_idx(), shapeIndices); - this->gmInputM.SetGlobalBuffer((__gm__ T *)inputM, shape); - this->gmInputV.SetGlobalBuffer((__gm__ T *)inputV, shape); - this->gmInputVar.SetGlobalBuffer((__gm__ T *)inputVar, shape); + this->gmInputM.SetGlobalBuffer((__gm__ T*)inputM, shape); + this->gmInputV.SetGlobalBuffer((__gm__ T*)inputV, shape); + this->gmInputVar.SetGlobalBuffer((__gm__ T*)inputVar, shape); - this->gmLearningRate.SetGlobalBuffer((__gm__ T *)lr, sizeof(float)); + this->gmLearningRate.SetGlobalBuffer((__gm__ T*)lr, sizeof(float)); this->lr = this->gmLearningRate.GetValue(0); // 将输出地址指向输入地址 @@ -119,19 +115,19 @@ private: // 搬入函数,完成CopyIn阶段的处理,被核心Process函数调用 __aicore__ inline void CopyIn(int32_t progress, 
int32_t row) { - LocalTensor localGradient = this->inQueGradient.template AllocTensor(); + LocalTensor localGradient = this->inQueGradient.template AllocTensor(); uint32_t gradientDataLen = row * this->dim2 * sizeof(T); // 连续传输数据块个数;len:连续传输数据块长度,Byte,非对齐搬运;0, 0, 0:源/目标数据块间隔,保留字段 DataCopyExtParams gradientParams{1, gradientDataLen, 0, 0, 0}; // 搬运填充参数 - DataCopyPadExtParams gradientPadParams{true, 0, 2, 0}; + DataCopyPadExtParams gradientPadParams{true, 0, 2, 0}; DataCopyPad(localGradient, this->gmGradient[progress * this->row * this->dim2], gradientParams, gradientPadParams); - LocalTensor localIndices = this->inQueIndices.template AllocTensor(); + LocalTensor localIndices = this->inQueIndices.template AllocTensor(); uint32_t indicesDataLen = row * sizeof(int32_t); DataCopyExtParams indicesParams{1, indicesDataLen, 0, 0, 0}; - DataCopyPadExtParams indicesPadParams{true, 0, 2, 0}; + DataCopyPadExtParams indicesPadParams{true, 0, 2, 0}; DataCopyPad(localIndices, this->gmIndices[progress * this->row], indicesParams, indicesPadParams); this->inQueGradient.EnQue(localGradient); @@ -141,13 +137,13 @@ private: // 计算函数,完成Compute阶段的处理,被核心Process函数调用 __aicore__ inline void Compute(int32_t progress, int32_t row) { - LocalTensor localGradient = this->inQueGradient.template DeQue(); - LocalTensor localIndices = this->inQueIndices.template DeQue(); + LocalTensor localGradient = this->inQueGradient.template DeQue(); + LocalTensor localIndices = this->inQueIndices.template DeQue(); Muls(localIndices, localIndices, this->dim2, row); // 根据 indices 从 inputM 中切分出来 m_slice - LocalTensor localMSlice = this->queMSlice.template AllocTensor(); - LocalTensor localVSlice = this->queVSlice.template AllocTensor(); - LocalTensor localVarSlice = this->queVarSlice.template AllocTensor(); + LocalTensor localMSlice = this->queMSlice.template AllocTensor(); + LocalTensor localVSlice = this->queVSlice.template AllocTensor(); + LocalTensor localVarSlice = this->queVarSlice.template AllocTensor(); pipe_barrier(PIPE_ALL); @@ -212,34 +208,30 @@ private: private: float lr, beta1, beta2, epsilon; int32_t dim0, dim1, dim2, row, batch, loopCount, rowLeft, loopCountTail, rowLeftTail, coreNum; - LocalTensor updateM, updateV, updateVar, temp; - LocalTensor localIndices; - GlobalTensor gmGradient, gmInputM, gmInputV, gmInputVar; - GlobalTensor gmIndices; - GlobalTensor gmLearningRate; + LocalTensor updateM, updateV, updateVar, temp; + LocalTensor localIndices; + GlobalTensor gmGradient, gmInputM, gmInputV, gmInputVar; + GlobalTensor gmIndices; + GlobalTensor gmLearningRate; TPipe pipe; TQue inQueGradient, inQueIndices; TQue queMSlice, queVSlice, queVarSlice; - TBuf calcBufM; - TBuf calcBufV; - TBuf calcBufVar; - TBuf calcBuf; + TBuf calcBufM; + TBuf calcBufV; + TBuf calcBufVar; + TBuf calcBuf; }; -extern "C" __global__ __aicore__ void lazy_adam(GM_ADDR gradient, GM_ADDR indices, - GM_ADDR inputM, GM_ADDR inputV, GM_ADDR inputVar, GM_ADDR lr, - GM_ADDR inputMRef, GM_ADDR inputVRef, GM_ADDR inputVarRef, - GM_ADDR workspace, GM_ADDR tiling) +extern "C" __global__ __aicore__ void lazy_adam(GM_ADDR gradient, GM_ADDR indices, GM_ADDR inputM, GM_ADDR inputV, + GM_ADDR inputVar, GM_ADDR lr, GM_ADDR inputMRef, GM_ADDR inputVRef, + GM_ADDR inputVarRef, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tiling_data, tiling); LazyAdam op32; - op32.Init(gradient, indices, - inputM, inputV, inputVar, lr, - inputMRef, inputVRef, inputVarRef, - tiling_data.beta1, tiling_data.beta2, tiling_data.epsilon, - tiling_data.dim0, tiling_data.dim1, 
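For readers cross-checking the Compute stage against the golden data, a host-side scalar reference of one common lazy-Adam row update is given below; "lazy" means only the rows named by `indices` are touched. This is a sketch under assumptions: the authoritative formula is whatever scripts/gen_data.py produces, and details such as bias correction are not visible in this hunk.

```cpp
#include <cmath>

// Reference update for one embedding row of width `dim` (no bias correction).
void LazyAdamRowUpdate(float* m, float* v, float* var, const float* grad,
                       int dim, float lr, float beta1, float beta2, float epsilon)
{
    for (int j = 0; j < dim; ++j) {
        m[j] = beta1 * m[j] + (1.0f - beta1) * grad[j];
        v[j] = beta2 * v[j] + (1.0f - beta2) * grad[j] * grad[j];
        var[j] -= lr * m[j] / (std::sqrt(v[j]) + epsilon);
    }
}
```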
tiling_data.dim2, - tiling_data.row, tiling_data.indicesAllocSize, tiling_data.otherAllocSize, - tiling_data.batch, tiling_data.loopCount, tiling_data.rowLeft, - tiling_data.loopCountTail, tiling_data.rowLeftTail, tiling_data.coreNum); + op32.Init(gradient, indices, inputM, inputV, inputVar, lr, inputMRef, inputVRef, inputVarRef, tiling_data.beta1, + tiling_data.beta2, tiling_data.epsilon, tiling_data.dim0, tiling_data.dim1, tiling_data.dim2, + tiling_data.row, tiling_data.indicesAllocSize, tiling_data.otherAllocSize, tiling_data.batch, + tiling_data.loopCount, tiling_data.rowLeft, tiling_data.loopCountTail, tiling_data.rowLeftTail, + tiling_data.coreNum); op32.Process(); } \ No newline at end of file -- Gitee From 161c2f4595f09d0989ea4b5cfaad8d4ef9fd8cf9 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 20:09:47 +0800 Subject: [PATCH 098/302] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 52 +++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index e0e64d23..e97ef46f 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -1,6 +1,7 @@ # LazyAdam优化器融合算子及样例说明 ## LazyAdam融合算子文件结构 + ```shell ├── aclnn_lazy_adam_test # 单算子测试用例 ├── lazy_adam.json # 算子原型配置 @@ -11,33 +12,36 @@ ``` ## Ascend C参考设计 -更多详情可以参考CANN官方的Ascend C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0001.html)。 +更多详情可以参考CANN官方的Ascend +C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0001.html)。 ## lazy_adam优化器同名融合算子lazy_adam 1. 算子分析 a) 算子的主要功能是实现lazy_adam优化器反向更新时m、v、variable三项数据的计算和更新; -b) 算子参数说明: +b) 算子参数说明: + * gradient: lazy_adam优化器计算时使用的梯度; * indices: 参与计算/更新的数据索引; * inputM: lazy_adam优化器一阶矩估计;计算结果原地更新; * inputV: lazy_adam优化器二阶矩估计;计算结果原地更新; * inputVar: embedding表对应的variable数据;计算结果原地更新; -c) 算子约束说明: + c) 算子约束说明: * 支持的型号:Atlas A2系列产品; -* 支持的输入数据类型:float32; -* embedding表的dim值需要时8的倍数; +* 支持的输入数据类型:float32; +* embedding表的dim值需要是8的倍数; 2. Host侧算子实现 Host侧算子实现在目录 fused_lazy_adam/op_host下,其中包括:lazy_adam.cpp和 lazy_adam_tiling.h。 -a) Tiling实现 +a) Tiling实现 -namespace optiling域中的LazyAdamTilingFunc函数,主要实现从context中获取外部入参信息(输入参数指针、shape信息),及校验有效性; +namespace +optiling域中的LazyAdamTilingFunc函数,主要实现从context中获取外部入参信息(输入参数指针、shape信息),及校验有效性; 并计算kernel侧需要的数据切分相关参数,包括row、loopCount、batch等(详情见tiling文件注释),设置BlockDim,最后通过TilingData传递属性信息。 b) Shape推导 @@ -67,6 +71,7 @@ d) Process方法,进行数据搬入和计算,并且计算完成后将计算 单算子调用分为两种方式:单算子API执行和模型执行。mxRec提供单算子API执行供参考。 单算子测试用例在目录fused_lazy_adam/aclnn_lazy_adam_test下,其中: + * inc是头文件目录 * scripts存放生成数据和验证数据的python脚本 * input是存放算子入参的bin文件 @@ -74,23 +79,32 @@ d) Process方法,进行数据搬入和计算,并且计算完成后将计算 * src是存放公共函数common、构造算子输入输出描述类oprator_desc、单算子调用主体流程实现op_runner文件和入口main文件 执行单算子测试: + ```shell bash run.sh ``` ### 前置条件 -1. 
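The single-op API execution described in this README follows the standard two-phase aclnn pattern. A minimal sketch mirroring the `aclnnLazyAdam` calls in op_runner.cpp is shown below; `grad`, `indices`, `m`, `v`, `var`, `lr` are pre-built `aclTensor*` handles and `stream` is an existing `aclrtStream`, all assumed for illustration.

```cpp
size_t workspaceSize = 0;
aclOpExecutor* handle = nullptr;
// Phase 1: query the workspace size and obtain an executor handle.
if (aclnnLazyAdamGetWorkspaceSize(grad, indices, m, v, var, lr,
                                  beta1, beta2, epsilon,
                                  &workspaceSize, &handle) != ACL_SUCCESS) { /* bail out */ }

void* workspace = nullptr;
if (workspaceSize != 0 &&
    aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_NORMAL_ONLY) != ACL_SUCCESS) { /* bail out */ }

// Phase 2: launch, then block until the stream drains (timeout in ms).
if (aclnnLazyAdam(workspace, workspaceSize, handle, stream) != ACL_SUCCESS) { /* bail out */ }
if (aclrtSynchronizeStreamWithTimeout(stream, 5000) != ACL_SUCCESS) { /* bail out */ }
```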
参考[基于msopgen工具创建算子工程](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0023.html)完成算子工程的创建, -参考[kernel侧算子实现](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0024.html)完成kernel侧实现的相关准备, -参考[host侧算子实现](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0026.html)完成host侧实现相关准备。 -2. 参考[算子编译部署](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0031.html)完成算子的编译部署,编译部署时需要开启算子的二进制编译功能:修改算子工程中的编译配置项文件CMakePresets.json,将 +1. +参考[基于msopgen工具创建算子工程](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0023.html) +完成算子工程的创建, +参考[kernel侧算子实现](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0024.html) +完成kernel侧实现的相关准备, +参考[host侧算子实现](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0026.html) +完成host侧实现相关准备。 +2. +参考[算子编译部署](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0031.html) +完成算子的编译部署,编译部署时需要开启算子的二进制编译功能:修改算子工程中的编译配置项文件CMakePresets.json,将 ENABLE_BINARY_PACKAGE设置为True。编译部署时可将算子的二进制部署到当前环境,便于后续算子的调用。 3. 检查API执行需要的头文件和库文件是否自动生成,针对mxRec,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 -aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 + aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 -注意:对于cust_op/fused_lazy_adam/run.sh脚本,安装算子后会删除构建目录。运行单算子测试时,需要屏蔽掉删除rm rf ./lazy_adam这一步,以确保前置条件3。 +注意:对于cust_op/fused_lazy_adam/run.sh脚本,安装算子后会删除构建目录。运行单算子测试时,需要屏蔽掉删除rm rf +./lazy_adam这一步,以确保前置条件3。 ### 融合算子 lazy_adam + 针对lazy_adam算子,入口src/main.cpp中: 1. InitResource函数:初始化AscendCL并运行管理资源申请,不用修改 @@ -100,6 +114,7 @@ a) 创建算子输入输出描述CreateOpDesc,OperatorDesc对象定义(inc/ope op_runner中使用; b) 创建OpRunner的对象,并依次执行: + * opRunner.Init():申请内存存放执行算子的输入输出数据 * SetInputData():加载数据输入bin文件并传输给OpRunner的Buffer供后续算子执行使用 * opRunner.RunOp():算子执行,主要流程为:入参数据拷贝,创建Stream,执行Stream,输出数据拷贝,释放Stream资源 @@ -108,14 +123,17 @@ b) 创建OpRunner的对象,并依次执行: 3. DestroyResource函数:释放内存,不用修改 ### 运行脚本 + run.sh脚本依次执行: -1. 清除遗留生成文件和日志文件 -2. 生成输入数据和真值数据 -3. 编译acl可执行文件 -4. 运行可执行文件 + +1. 清除遗留生成文件和日志文件 +2. 生成输入数据和真值数据 +3. 编译acl可执行文件 +4. 运行可执行文件 5. 
比较真值文件 ### scripts脚本 + * gen_data.py:生成lazy_adam算子的输入数据和用于精度校验的golden数据,可自行修改测试相关dim参数。 * verify_result.py:将算子的输出和脚本生成的golden数据进行精度比对,并输出比较结果。比对规则为:允许误差精度loss:1e-4 -- Gitee From 69e2d86f47dfcfae9e2d7d876250dd3a702398b4 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 20:52:28 +0800 Subject: [PATCH 099/302] =?UTF-8?q?=E7=AE=97=E5=AD=90vendor=5Fname?= =?UTF-8?q?=E5=90=8D=E7=A7=B0=E4=BF=AE=E6=94=B9=EF=BC=8C=E8=A7=A3=E5=86=B3?= =?UTF-8?q?=E5=A4=9A=E7=AE=97=E5=AD=90=E5=9C=BA=E6=99=AF=E4=B8=8B=E7=AE=97?= =?UTF-8?q?=E5=AD=90=E8=A6=86=E7=9B=96=E9=97=AE=E9=A2=98=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/run.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cust_op/fused_lazy_adam/run.sh b/cust_op/fused_lazy_adam/run.sh index c1e80ce5..6f51d7a7 100644 --- a/cust_op/fused_lazy_adam/run.sh +++ b/cust_op/fused_lazy_adam/run.sh @@ -41,8 +41,10 @@ sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake # 修改cann安装路径 sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json -# 修改vendor_name 防止覆盖之前vendor_name为customize的算子 -sed -i 's:"customize":"customize_lazy_adam":g' CMakePresets.json +# 修改vendor_name 防止覆盖之前vendor_name为customize的算子; +# vendor_name需要和aclnn中的CMakeLists.txt中的CUST_PKG_PATH值同步,不同步aclnn会调用失败; +# vendor_name字段值不能包含customize;包含会导致多算子部署场景CANN的vendors路径下config.ini文件内容截取错误,部署工具bug; +sed -i 's:"customize":"mxrec_fused_lazy_adam":g' CMakePresets.json bash build.sh -- Gitee From 554b60add9f01e6b3a307824c4d0869c6ff2cf78 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 20:54:09 +0800 Subject: [PATCH 100/302] =?UTF-8?q?aclnn=E4=B8=ADvendor=5Fname=E5=90=8D?= =?UTF-8?q?=E7=A7=B0=E4=BF=AE=E6=94=B9=EF=BC=8C=E8=A7=A3=E5=86=B3=E5=A4=9A?= =?UTF-8?q?=E7=AE=97=E5=AD=90=E5=9C=BA=E6=99=AF=E4=B8=8B=E7=AE=97=E5=AD=90?= =?UTF-8?q?=E8=A6=86=E7=9B=96=E9=97=AE=E9=A2=98=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/src/CMakeLists.txt | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt index 1642e3ca..c4a727bf 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt @@ -19,9 +19,9 @@ if (NOT DEFINED ENV{DDK_PATH}) message(STATUS "set default INC_PATH: ${INC_PATH}") else () message(STATUS "env INC_PATH: ${INC_PATH}") -endif() +endif () -set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize_lazy_adam/op_api") +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/mxrec_fused_lazy_adam/op_api") set(LIB_PATH $ENV{NPU_HOST_LIB}) @@ -32,23 +32,23 @@ if (NOT DEFINED ENV{NPU_HOST_LIB}) message(STATUS "set default LIB_PATH: ${LIB_PATH}") else () message(STATUS "env LIB_PATH: ${LIB_PATH}") -endif() +endif () set(AUTO_GEN_PATH "../../lazy_adam/build_out/autogen") # Header path include_directories( - ${INC_PATH}/runtime/include - ${INC_PATH}/atc/include - ../inc - ${CUST_PKG_PATH}/include - ${AUTO_GEN_PATH} + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../inc + ${CUST_PKG_PATH}/include + ${AUTO_GEN_PATH} ) # add host lib path link_directories( - ${LIB_PATH} - ${LIB_PATH1} - ${CUST_PKG_PATH}/lib + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib ) add_executable(execute_op @@ 
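The verify_result.py comparison described above admits a loss of 1e-4. A sketch of an absolute-tolerance check is given for reference; the authoritative rule (absolute vs. relative error, element counting) lives in scripts/verify_result.py, so treat this as illustrative only.

```cpp
#include <cmath>
#include <cstddef>

// Sketch of an absolute-tolerance check mirroring the documented 1e-4 rule.
bool AllClose(const float* actual, const float* golden, size_t n, float tol = 1e-4f)
{
    for (size_t i = 0; i < n; ++i) {
        if (std::fabs(actual[i] - golden[i]) > tol) {
            return false;
        }
    }
    return true;
}
```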
-57,11 +57,11 @@ add_executable(execute_op ) target_link_libraries(execute_op - ascendcl - cust_opapi - acl_op_compiler - nnopbase - stdc++ + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ ) install(TARGETS execute_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) -- Gitee From aa732bccd3d2f52c3ea0da4d73d1a632ec18d1f1 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 21:30:52 +0800 Subject: [PATCH 101/302] =?UTF-8?q?=E7=AE=97=E5=AD=90run.sh=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cust_op/fused_lazy_adam/run.sh b/cust_op/fused_lazy_adam/run.sh index 6f51d7a7..ff604cea 100644 --- a/cust_op/fused_lazy_adam/run.sh +++ b/cust_op/fused_lazy_adam/run.sh @@ -43,7 +43,7 @@ sed -i 's/--nomd5/--nomd5 --nocrc/g' ./cmake/makeself.cmake sed -i 's:"/usr/local/Ascend/latest":"/usr/local/Ascend/ascend-toolkit/latest":g' CMakePresets.json # 修改vendor_name 防止覆盖之前vendor_name为customize的算子; # vendor_name需要和aclnn中的CMakeLists.txt中的CUST_PKG_PATH值同步,不同步aclnn会调用失败; -# vendor_name字段值不能包含customize;包含会导致多算子部署场景CANN的vendors路径下config.ini文件内容截取错误,部署工具bug; +# vendor_name字段值不能包含customize;包含会导致多算子部署场景CANN的vendors路径下config.ini文件内容截取错误 sed -i 's:"customize":"mxrec_fused_lazy_adam":g' CMakePresets.json bash build.sh -- Gitee From d66828edaa2dd649a1785125b3713c17aead80b6 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 6 May 2024 23:14:11 +0800 Subject: [PATCH 102/302] =?UTF-8?q?readme=E8=84=9A=E6=9C=AC=E6=9B=B4?= =?UTF-8?q?=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index e97ef46f..42f5bfc9 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -16,6 +16,16 @@ 更多详情可以参考CANN官方的Ascend C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0001.html)。 +## lazy_adam融合算子使用 + +1. 进入当前目录,执行指令进行编译和部署lazy_adam融合算子 + +``` +bash run.sh +``` + +2. 模型py脚本中导入mxRec中的lazy_adam优化器。lazy_adam优化器使用知道参考mxRec用户指南。 + ## lazy_adam优化器同名融合算子lazy_adam 1. 算子分析 @@ -87,16 +97,20 @@ bash run.sh ### 前置条件 1. + 参考[基于msopgen工具创建算子工程](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0023.html) 完成算子工程的创建, 参考[kernel侧算子实现](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0024.html) 完成kernel侧实现的相关准备, 参考[host侧算子实现](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0026.html) 完成host侧实现相关准备。 + 2. + 参考[算子编译部署](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0031.html) 完成算子的编译部署,编译部署时需要开启算子的二进制编译功能:修改算子工程中的编译配置项文件CMakePresets.json,将 ENABLE_BINARY_PACKAGE设置为True。编译部署时可将算子的二进制部署到当前环境,便于后续算子的调用。 + 3. 
检查API执行需要的头文件和库文件是否自动生成,针对mxRec,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 -- Gitee From 25d94cd481cf5bdcd9a523be2d75ccd897fe643d Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Tue, 7 May 2024 00:58:57 +0000 Subject: [PATCH 103/302] =?UTF-8?q?!119=20=E3=80=90CleanCode=E3=80=91?= =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=8A=BD=E8=B1=A1=E6=96=B9=E6=B3=95=E5=92=8C?= =?UTF-8?q?=E9=9D=99=E6=80=81=E6=96=B9=E6=B3=95=E7=9A=84=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=E3=80=82=20*=20=E3=80=90CleanCode=E3=80=91=E8=B0=83=E6=95=B4?= =?UTF-8?q?=E6=8A=BD=E8=B1=A1=E6=96=B9=E6=B3=95=E5=92=8C=E9=9D=99=E6=80=81?= =?UTF-8?q?=E6=96=B9=E6=B3=95=E7=9A=84=E9=A1=BA=E5=BA=8F=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/graph/slicers.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mx_rec/graph/slicers.py b/mx_rec/graph/slicers.py index 3999cdd4..a4014195 100644 --- a/mx_rec/graph/slicers.py +++ b/mx_rec/graph/slicers.py @@ -61,14 +61,6 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): os.makedirs(info_dir) self._info_dir = info_dir - @abc.abstractmethod - def summarize(self) -> None: - pass - - @abc.abstractmethod - def slice(self) -> None: - pass - @staticmethod def _find_min_dep_ops( tgt_ops: Set[Operation], @@ -289,6 +281,14 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): return consumers + @abc.abstractmethod + def summarize(self) -> None: + pass + + @abc.abstractmethod + def slice(self) -> None: + pass + def _slice_ops(self, sliceable_ops: Set[Operation], is_training: bool) -> None: """Slice the minimum dependency graph of given operation set. -- Gitee From 99c68d2f16fd91c3fb4a40579cba5f1fbcf161df Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Tue, 7 May 2024 00:59:37 +0000 Subject: [PATCH 104/302] !120 cleancode * cleancode --- src/core/emb_table/embedding_ddr.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 24aa07a7..be5fab22 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -363,6 +363,7 @@ int EmbeddingDDR::LoadHashMap(const string& savePath) } if (keyCount > devVocabSize + hostVocabSize) { LOG_ERROR("load key size exceeds the sum of device vocab size and host vocab size: {}", strerror(errno)); + free(static_cast(buf)); return -1; } else if (keyCount < devVocabSize) { loadOffset.push_back(i); -- Gitee From 7ae9e65543a3a5bae9406a37f23102280c844b91 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 7 May 2024 09:48:47 +0800 Subject: [PATCH 105/302] =?UTF-8?q?hdfs=E4=B8=ADread=E3=80=81write?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E5=8A=A0=E5=9B=BA=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E4=BF=9D=E5=AD=98=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 12 +- src/core/checkpoint/checkpoint.cpp | 18 ++- src/core/emb_table/embedding_ddr.cpp | 120 +++++++++----- src/core/emb_table/embedding_ddr.h | 4 +- src/core/emb_table/embedding_dynamic.cpp | 54 +++---- src/core/emb_table/embedding_dynamic.h | 4 +- src/core/emb_table/embedding_static.cpp | 54 +++---- src/core/emb_table/embedding_static.h | 4 +- src/core/file_system/file_system.h | 6 + .../hdfs_file_system/hdfs_file_system.cpp | 152 +++++++++--------- .../hdfs_file_system/hdfs_wrapper.h | 25 ++- src/core/utils/common.h | 3 + 12 files changed, 
259 insertions(+), 197 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index d776b699..e2e58340 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -233,7 +233,9 @@ class Saver(object): attribute = attribute.astype(np.int64) attribute_dir = os.path.join(upper_dir, "slice.attribute") - attribute.tofile(attribute_dir) + with tf.io.gfile.GFile(attribute_dir, "wb") as file: + attribute = attribute.tostring() + file.write(attribute) @performance("_save") def _save(self, sess, root_dir): @@ -445,8 +447,9 @@ def write_binary_data(writing_path, suffix, data, attributes=None): raise RuntimeError(f"make dir {writing_path} for writing data failed!") from err data_file, attribute_file = generate_file_name(suffix) target_data_dir = os.path.join(writing_path, data_file) - - with tf.io.gfile.GFile(target_data_dir, "ab") as file: + # append mode of hdfs system supports not well when the file not exists. + file_mode = "wb" if not tf.io.gfile.exists(target_data_dir) else "ab" + with tf.io.gfile.GFile(target_data_dir, file_mode) as file: data = data.tostring() file.write(data) @@ -470,7 +473,8 @@ def read_binary_data(reading_path: str, data_name: str, table_name: str, load_of with tf.io.gfile.GFile(target_attribute_dir, "rb") as fin: validate_read_file(target_attribute_dir) - attributes = np.fromfile(target_attribute_dir, dtype=np.int64) + attributes = fin.read() + attributes = np.fromstring(attributes, dtype=np.int64) with tf.io.gfile.GFile(target_data_dir, "rb") as file: validate_read_file(target_data_dir) diff --git a/src/core/checkpoint/checkpoint.cpp b/src/core/checkpoint/checkpoint.cpp index 673c7ce3..b4ce187e 100644 --- a/src/core/checkpoint/checkpoint.cpp +++ b/src/core/checkpoint/checkpoint.cpp @@ -210,8 +210,13 @@ void Checkpoint::WriteStream(CkptTransData& transData, const string& dataDir, si } if (writeBytesNum == -1) { - LOG_ERROR("error happened when writing data to file."); - throw runtime_error("error happened when writing data to file."); + throw runtime_error("Error: Save data failed. data type: {} .An error occurred while writing file: {}.", + dataType, dataDir); + } + if (writeBytesNum != dataSize) { + throw runtime_error( + "Error: Save data failed. data type: {} .Expected to write {} bytes, but actually write {} bytes to file {}.", + dataType, dataSize, writeBytesNum, dataDir); } } @@ -330,8 +335,13 @@ void Checkpoint::ReadStream(CkptTransData& transData, } if (readBytesNum == -1) { - LOG_ERROR("error happened when reading data from file."); - throw runtime_error("error happened when reading data from file."); + throw runtime_error("Error: Load data failed. data type: {} .An error occurred while reading file: {}.", + dataType, dataDir); + } + if (readBytesNum != datasetSize) { + throw runtime_error( + "Error: Load data failed. 
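A note on the saver.py hunks in this commit: routing the attribute and data I/O through `tf.io.gfile.GFile` is what makes `hdfs://` paths work at all, since `numpy.tofile`/`numpy.fromfile` only operate on local files. Separately, `ndarray.tostring` and `np.fromstring` are deprecated aliases in NumPy (since 1.19 and 1.14 respectively); `tobytes()` and `np.frombuffer` are the supported spellings and behave identically here. The checkpoint.cpp changes below complement this by distinguishing a -1 I/O error from a short read or write, so truncated checkpoints now fail loudly instead of silently.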
data type: {} .Expected to read {} bytes, but actually read {} bytes to file {}.", + dataType, datasetSize, readBytesNum, dataDir); } } diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 02d7c116..f8820a7d 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -316,47 +316,42 @@ void EmbeddingDDR::SetStartCount() freeSize_ = devVocabSize; } -void EmbeddingDDR::Load(const string& savePath) -{ - int res = LoadHashMap(savePath); - if (res == -1) { - throw std::runtime_error("load key failed!"); - } +void EmbeddingDDR::Load(const string& savePath) { + LoadKey(savePath); LoadEmbAndOptim(savePath); } -void EmbeddingDDR::Save(const string& savePath) -{ +void EmbeddingDDR::Save(const string& savePath) { SaveKey(savePath); SaveEmbAndOptim(savePath); } -int EmbeddingDDR::LoadHashMap(const string& savePath) -{ +void EmbeddingDDR::LoadKey(const string& savePath) { stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - size_t fileSize = 0; - try { - fileSize = fileSystemPtr->GetFileSize(ss.str()); - } catch (exception& e) { - LOG_ERROR("open file {} failed:{}", ss.str(), strerror(errno)); - return -1; - } + size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - LOG_ERROR("file {} size = {} is too big", ss.str(), fileSize); - return -1; + throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - LOG_ERROR("malloc failed: {}", strerror(errno)); - return -1; + throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + } + + ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); + if (res == -1) { + throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + } + if (res != fileSize) { + throw runtime_error( + "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", + fileSize, res, ss.str()); } - fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); size_t loadKeySize = fileSize / sizeof(int64_t); @@ -369,8 +364,9 @@ int EmbeddingDDR::LoadHashMap(const string& savePath) continue; } if (keyCount > devVocabSize + hostVocabSize) { - LOG_ERROR("load key size exceeds the sum of device vocab size and host vocab size: {}", strerror(errno)); - return -1; + throw runtime_error( + "Error: Load keys failed. Load key size :{} exceeds the sum of device vocab size and host vocab size: {}.", + keyCount, devVocabSize + hostVocabSize); } else if (keyCount < devVocabSize) { loadOffset.push_back(i); devOffset2Key[keyCount] = buf[i]; @@ -381,9 +377,7 @@ int EmbeddingDDR::LoadHashMap(const string& savePath) keyCount++; } maxOffset = keyOffsetMap.size(); - free(static_cast(buf)); - return 0; } void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) @@ -404,21 +398,41 @@ void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) // 读embedding stringstream embedStream; embedStream << ss.str() << "/" << "embedding/slice.data"; + + size_t readSize = hostLoadOffset.size() * embSize_ * sizeof(float); ssize_t res = fileSystemPtr->Read(embedStream.str(), table.embData, 0, hostLoadOffset, embSize_); + if (res == -1) { + throw runtime_error("Error: Load embeddings failed. 
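The throws added here (and in checkpoint.cpp above) pass fmt-style `{}` placeholders plus extra arguments straight to `runtime_error`. `std::runtime_error` has no such constructor, so this only compiles if `runtime_error` resolves to a project-local formatting wrapper; if it does not, each site needs to format the message first. One option is the `StringFormat` helper this codebase already uses in embedding_dynamic.cpp; its exact signature is an assumption here.

```cpp
// Assumes StringFormat(fmt, ...) -> std::string with printf-style placeholders,
// as in the existing "aclrtMalloc failed, ret=%d" call site.
throw std::runtime_error(
    StringFormat("Error: Load keys failed. file %s size %zu is too big.",
                 ss.str().c_str(), fileSize));
```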
An error occurred while reading file: {}.", + embedStream.str()); + } + if (res != readSize) { + throw runtime_error( + "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", + readSize, res, embedStream.str()); + } // 读optim int64_t optimIndex = 1; for (const auto ¶m: optimParams) { stringstream paramStream; paramStream << ss.str() << "/" << optimName + "_" + param << "/slice.data"; + ssize_t res = fileSystemPtr->Read(paramStream.str(), table.embData, optimIndex, hostLoadOffset, embSize_); - optimIndex ++; + if (res == -1) { + throw runtime_error("Error: Load optimizers failed. An error occurred while reading file: {}.", + paramStream.str()); + } + if (res != readSize) { + throw runtime_error( + "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", + readSize, res, paramStream.str()); + } + optimIndex++; } } -int EmbeddingDDR::SaveKey(const string& savePath) -{ +void EmbeddingDDR::SaveKey(const string& savePath) { stringstream ss; ss << savePath << "/" << name << "/key/"; MakeDir(ss.str()); @@ -442,19 +456,17 @@ int EmbeddingDDR::SaveKey(const string& savePath) } } - ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(hostKey.data()), - static_cast(hostKey.size() * sizeof(int64_t))); + hostKey.insert(hostKey.end(), deviceKey.begin(), deviceKey.end()); + size_t writeSize = static_cast(hostKey.size() * sizeof(int64_t)); + ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(hostKey.data()), writeSize); if (res == -1) { - return -1; + throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); } - ssize_t res2 = fileSystemPtr->Write( - ss.str(), reinterpret_cast(deviceKey.data()), - static_cast(deviceKey.size() * sizeof(int64_t)) - ); - if (res2 == -1) { - return -1; + if (res != writeSize) { + throw runtime_error( + "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", + writeSize, res, ss.str()); } - return 0; } void EmbeddingDDR::SaveEmbData(const string& savePath) @@ -466,8 +478,17 @@ void EmbeddingDDR::SaveEmbData(const string& savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - vector attribute; - fileSystemPtr->Write(ss.str(), embContent, embSize_ * sizeof(float)); + + size_t writeSize = embSize_ * sizeof(float) * embContent.size(); + ssize_t res = fileSystemPtr->Write(ss.str(), embContent, embSize_ * sizeof(float)); + if (res == -1) { + throw runtime_error("Error: Save embeddings failed. An error occurred while writing file: {}.", ss.str()); + } + if (res != writeSize) { + throw runtime_error( + "Error: Save embeddings failed. Expected to write {} bytes, but actually write {} bytes to file {}.", + writeSize, res, ss.str()); + } } void EmbeddingDDR::SaveOptimData(const string& savePath) @@ -480,8 +501,18 @@ void EmbeddingDDR::SaveOptimData(const string& savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - vector attribute; - fileSystemPtr->Write(ss.str(), content.second, embSize_ * sizeof(float)); + + size_t writeSize = embSize_ * sizeof(float) * content.second.size(); + ssize_t res = fileSystemPtr->Write(ss.str(), content.second, embSize_ * sizeof(float)); + + if (res == -1) { + throw runtime_error("Error: Save optimizers failed. 
An error occurred while writing file: {}.", ss.str()); + } + if (res != writeSize) { + throw runtime_error( + "Error: Save optimizers failed. Expected to write {} bytes, but actually write {} bytes to file {}.", + writeSize, res, ss.str()); + } } } @@ -501,7 +532,7 @@ void EmbeddingDDR::SaveEmbAndOptim(const string& savePath) int optim_param_count = 1; for (const string ¶m: optimParams) { optimContentMap[param].push_back(table.embData[offset - devVocabSize].data() + - sizeof(float) * embSize_ * optim_param_count); + sizeof(float) * embSize_ * optim_param_count); optim_param_count++; } } @@ -523,6 +554,7 @@ void EmbeddingDDR::SetOptimizerInfo(OptimizerInfo& optimizerInfo) optimContentMap[param] = vector{}; } } + void EmbeddingDDR::SetCacheManager(CacheManager *cm) { cacheManager_ = cm; diff --git a/src/core/emb_table/embedding_ddr.h b/src/core/emb_table/embedding_ddr.h index b2a461d8..ab7cc3fb 100644 --- a/src/core/emb_table/embedding_ddr.h +++ b/src/core/emb_table/embedding_ddr.h @@ -74,10 +74,10 @@ public: GTEST_PRIVATE: - int LoadHashMap(const string& savePath); + void LoadKey(const string& savePath); void LoadEmbAndOptim(const string& savePath); - int SaveKey(const string& savePath); + void SaveKey(const string& savePath); void SaveEmbData(const string &savePath); void SaveOptimData(const string& savePath); void SaveEmbAndOptim(const string& savePath); diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index 9fd26546..f81f2ab7 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -128,14 +128,11 @@ void EmbeddingDynamic::RandomInit(void* addr, size_t embNum) void EmbeddingDynamic::Save(const string& savePath) { - int res = SaveKey(savePath); - if (res == -1) { - throw std::runtime_error("save key failed!"); - } + SaveKey(savePath); SaveEmbAndOptim(savePath); } -int EmbeddingDynamic::SaveKey(const string& savePath) +void EmbeddingDynamic::SaveKey(const string& savePath) { stringstream ss; ss << savePath << "/" << name << "/key/"; @@ -153,12 +150,16 @@ int EmbeddingDynamic::SaveKey(const string& savePath) embAddress.push_back(it.second); } - ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), - static_cast(deviceKey.size() * sizeof(int64_t))); + size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); + ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - return -1; + throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); + } + if (res != writeSize) { + throw runtime_error( + "Error: Save keys failed. 
Expected to write {} bytes, but actually write {} bytes to file {}.", + writeSize, res, ss.str()); } - return 0; } void EmbeddingDynamic::SaveEmbAndOptim(const string& savePath) @@ -215,10 +216,7 @@ void EmbeddingDynamic::SaveOptimData(const string &savePath) void EmbeddingDynamic::Load(const string& savePath) { - int res = LoadKey(savePath); - if (res == -1) { - throw std::runtime_error("load key failed!"); - } + LoadKey(savePath); LoadEmbAndOptim(savePath); } @@ -234,7 +232,7 @@ void EmbeddingDynamic::LoadEmbAndOptim(const string& savePath) stringstream embedStream; embedStream << ss.str() << "/" << "embedding/slice.data"; EmbeddingSizeInfo embeddingSizeInfo = {embSize_, extEmbSize_}; - fileSystemPtr->ReadEmbedding(savePath, embeddingSizeInfo, firstAddress, rankId_, loadOffset); + fileSystemPtr->ReadEmbedding(embedStream.str(), embeddingSizeInfo, firstAddress, rankId_, loadOffset); // 读optim int optimIndex = 1; @@ -255,24 +253,25 @@ int EmbeddingDynamic::LoadKey(const string& savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - size_t fileSize = 0; - try { - fileSize = fileSystemPtr->GetFileSize(ss.str()); - } catch (exception& e) { - LOG_ERROR("open file {} failed:{}", ss.str(), strerror(errno)); - return -1; - } + size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - LOG_ERROR("file {} size = {} is too big", ss.str(), fileSize); - return -1; + throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - LOG_ERROR("malloc failed: {}", strerror(errno)); - return -1; + throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + } + + ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); + if (res == -1) { + throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + } + if (res != fileSize) { + throw runtime_error( + "Error: Load keys failed. 
Expected to read {} bytes, but actually read {} bytes to file {}.", + fileSize, res, ss.str()); } - fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); size_t loadKeySize = fileSize / sizeof(int64_t); @@ -289,7 +288,7 @@ int EmbeddingDynamic::LoadKey(const string& savePath) void *newBlock = nullptr; aclError ret = aclrtMalloc(&newBlock, static_cast(datasetSize), ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { - throw runtime_error(StringFormat("aclrtMalloc failed, ret=%d", ret).c_str()); + throw runtime_error("Error: in dynamic expansion mode, aclrtMalloc failed, malloc size: {}.", datasetSize); } // 此处的 newBlock -> first address; // 对key_offset map 进行一个恢复操作 @@ -303,5 +302,4 @@ int EmbeddingDynamic::LoadKey(const string& savePath) maxOffset = keyOffsetMap.size(); free(static_cast(buf)); - return 0; } diff --git a/src/core/emb_table/embedding_dynamic.h b/src/core/emb_table/embedding_dynamic.h index 2c867530..59418229 100644 --- a/src/core/emb_table/embedding_dynamic.h +++ b/src/core/emb_table/embedding_dynamic.h @@ -48,13 +48,13 @@ private: void MallocEmbeddingBlock(int embNum); - int SaveKey(const string& savePath); + void SaveKey(const string& savePath); void SaveEmbAndOptim(const string& savePath); void SetOptimizerInfo(OptimizerInfo& optimizerInfo); - int LoadKey(const string& savePath); + void LoadKey(const string& savePath); void LoadEmbAndOptim(const string& savePath); diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index 225c90c9..2ff5a49e 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -73,10 +73,7 @@ int64_t EmbeddingStatic::capacity() const void EmbeddingStatic::Save(const string& savePath) { - int res = SaveKey(savePath); - if (res == -1) { - throw std::runtime_error("save embedding table failed!"); - } + SaveKey(savePath); } int EmbeddingStatic::SaveKey(const string& savePath) @@ -97,23 +94,24 @@ int EmbeddingStatic::SaveKey(const string& savePath) deviceOffset.push_back(it.second); } - ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), - static_cast(deviceKey.size() * sizeof(int64_t))); + size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); + ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - return -1; + throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); + } + if (res != writeSize) { + throw runtime_error( + "Error: Save keys failed. 
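All of the load/save paths above obtain their backend through a `FileSystemHandler` that creates a `FileSystem` from the target path, so the same code serves POSIX and `hdfs://` checkpoints. The factory itself is not shown in this patch; the sketch below only illustrates the scheme-based dispatch it implies, and `LocalFileSystem` is a placeholder name, not a class confirmed by this diff.

```cpp
#include <memory>
#include <string>

// Sketch only: pick a backend from the URI scheme.
std::unique_ptr<FileSystem> FileSystemHandler::Create(const std::string& path)
{
    if (path.rfind("hdfs://", 0) == 0) {
        return std::make_unique<HdfsFileSystem>();
    }
    return std::make_unique<LocalFileSystem>(); // hypothetical local backend
}
```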
Expected to write {} bytes, but actually write {} bytes to file {}.", + writeSize, res, ss.str()); } - return 0; } void EmbeddingStatic::Load(const string& savePath) { - int res = LoadKey(savePath); - if (res == -1) { - throw std::runtime_error("load embedding table failed!"); - } + LoadKey(savePath); } -int EmbeddingStatic::LoadKey(const string &savePath) +void EmbeddingStatic::LoadKey(const string &savePath) { stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; @@ -121,24 +119,25 @@ int EmbeddingStatic::LoadKey(const string &savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - size_t fileSize = 0; - try { - fileSize = fileSystemPtr->GetFileSize(ss.str()); - } catch (exception &e) { - LOG_ERROR("open file {} failed:{}", ss.str(), strerror(errno)); - return -1; - } + size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - LOG_ERROR("file {} size = {} is too big", ss.str(), fileSize); - return -1; + throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - LOG_ERROR("malloc failed: {}", strerror(errno)); - return -1; + throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + } + + ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); + if (res == -1) { + throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + } + if (res != fileSize) { + throw runtime_error( + "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", + fileSize, res, ss.str()); } - fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); size_t loadKeySize = fileSize / sizeof(int64_t); loadOffset.clear(); @@ -152,14 +151,13 @@ int EmbeddingStatic::LoadKey(const string &savePath) } if (loadOffset.size() > devVocabSize) { - LOG_ERROR("load key size exceeds device vocab size: {}", strerror(errno)); - return -1; + throw runtime_error("Error: Load keys failed. Load key size :{} exceeds device vocab size: {}.", + loadOffset.size(), devVocabSize); } maxOffset = keyOffsetMap.size(); free(static_cast(buf)); - return 0; } vector EmbeddingStatic::GetDeviceOffset() diff --git a/src/core/emb_table/embedding_static.h b/src/core/emb_table/embedding_static.h index 06e24efa..965bce0e 100644 --- a/src/core/emb_table/embedding_static.h +++ b/src/core/emb_table/embedding_static.h @@ -42,9 +42,9 @@ public: vector GetDeviceOffset(); GTEST_PRIVATE: - int SaveKey(const string& savePath); + void SaveKey(const string& savePath); - int LoadKey(const string& savePath); + void LoadKey(const string& savePath); vector deviceKey; vector deviceOffset; diff --git a/src/core/file_system/file_system.h b/src/core/file_system/file_system.h index 2f7d3b62..66c142db 100644 --- a/src/core/file_system/file_system.h +++ b/src/core/file_system/file_system.h @@ -32,12 +32,18 @@ namespace MxRec { virtual ssize_t Write(const string& filePath, const char* fileContent, size_t dataSize) = 0; virtual ssize_t Write(const string& filePath, vector fileContent, size_t dataSize) = 0; + + // In the dynamic expansion mode, embedding is transported to the host side from the device side + // and written into a file. 
virtual void WriteEmbedding(const string& filePath, const int& embeddingSize, const vector& addressArr, int deviceId) = 0; virtual ssize_t Read(const string& filePath, char* fileContent, size_t datasetSize) = 0; virtual ssize_t Read(const string& filePath, vector>& fileContent, int64_t contentOffset, vector offsetArr, const size_t& embeddingSize) = 0; + + // In the dynamic expansion mode, embedding is read from the file + // and transported from the host side to the device side. virtual void ReadEmbedding(const string& filePath, EmbeddingSizeInfo& embedSizeInfo, int64_t firstAddress, int deviceId, vector offsetArr) = 0; diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp index 999f2fa9..ec9e9bac 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp @@ -60,7 +60,7 @@ size_t HdfsFileSystem::GetFileSize(const string& filePath) hdfsFileInfo* fileInfo = hdfs->GetPathInfo(fs, filePath.c_str()); hdfs->Disconnect(fs); if (fileInfo == nullptr) { - return 0; + throw runtime_error("Error: Unable to get hdfs file info : {}.", filePath.c_str()); } auto fileSize = static_cast(fileInfo->mSize); return fileSize; @@ -69,35 +69,25 @@ size_t HdfsFileSystem::GetFileSize(const string& filePath) ssize_t HdfsFileSystem::Write(const string& filePath, const char* fileContent, size_t dataSize) { hdfsFS fs = ConnectHdfs(); - - hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); + int flag = O_WRONLY | O_CREAT; + hdfsFileInfo* fileInfo = hdfs->GetPathInfo(fs, filePath.c_str()); + if (fileInfo) { + flag = O_WRONLY | O_APPEND; + } + hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), flag, 0, 0, 0); if (!file) { hdfs->Disconnect(fs); - throw runtime_error("Error writing to hdfs file."); + throw runtime_error("Error: Unable to open hdfs file : {}.", filePath.c_str()); } - size_t dataCol = dataSize; - size_t writeSize = 0; - size_t idx = 0; tSize writeBytesNum = 0; - - while (dataCol != 0) { - if (dataCol > oneTimeReadWriteLen) { - writeSize = oneTimeReadWriteLen; - } else { - writeSize = dataCol; - } - - tSize res = hdfs->Write(fs, file, fileContent + idx, writeSize); - if (res == -1) { - hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); - return static_cast(res); - } - dataCol -= writeSize; - idx += writeSize; - writeBytesNum += res; + tSize res = hdfs->Write(fs, file, fileContent, dataSize, sizeof(char)); + if (res == -1) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + return static_cast(res); } + writeBytesNum += res; hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); @@ -111,31 +101,19 @@ ssize_t HdfsFileSystem::Write(const string& filePath, vector fileContent hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { hdfs->Disconnect(fs); - throw runtime_error("Error writing to hdfs file."); + throw runtime_error("Error: Unable to open hdfs file : {}.", filePath.c_str()); } tSize writeBytesNum = 0; size_t loops = fileContent.size(); for (size_t i = 0; i < loops; i++) { - size_t dataCol = dataSize; - size_t writeSize = 0; - size_t idx = 0; - while (dataCol != 0) { - if (dataCol > oneTimeReadWriteLen) { - writeSize = oneTimeReadWriteLen; - } else { - writeSize = dataCol; - } - tSize res = hdfs->Write(fs, file, fileContent[i] + idx, writeSize); - if (res == -1) { - hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); - return static_cast(res); - } - dataCol -= writeSize; - idx += 
writeSize; - writeBytesNum += res; + tSize res = hdfs->Write(fs, file, fileContent[i], dataSize, sizeof(float)); + if (res == -1) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + return static_cast(res); } + writeBytesNum += res; } hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); @@ -156,11 +134,10 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { hdfs->Disconnect(fs); - throw runtime_error("Error writing to hdfs file."); + throw runtime_error("Error: Unable to open hdfs file : {}.", filePath.c_str()); } #ifndef GTEST - for (size_t i = 0; i < addressArr.size(); i += embHashNum) { vector row(embeddingSize); int64_t address = addressArr.at(i); @@ -172,14 +149,21 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding if (ret != ACL_SUCCESS) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); - throw runtime_error("aclrtMemcpy failed"); + throw runtime_error("Error: Execute aclrtmemcpy from device to host failed."); } - auto numBytesWritten = hdfs->Write(fs, file, row.data(), embeddingSize * sizeof(float)); - if (numBytesWritten != embeddingSize * sizeof(float)) { + tSize res = hdfs->Write(fs, file, row.data(), embeddingSize * sizeof(float), sizeof(float)); + if (res == -1) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); - throw runtime_error("Error writing to hdfs file."); + throw runtime_error("Error: An error occurred while writing file: {}.", filePath.c_str()); + } + + if (res != embeddingSize * sizeof(float)) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + throw runtime_error("Error: Expected to write {} bytes, but actually write {} bytes to file {}.", + embeddingSize * sizeof(float), res, filePath.c_str()); } } #endif @@ -194,29 +178,17 @@ ssize_t HdfsFileSystem::Read(const string& filePath, char* fileContent, size_t d hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { hdfs->Disconnect(fs); - throw runtime_error("open hdfs file failed."); + throw runtime_error("Error: Unable to open hdfs file : {}.", filePath.c_str()); } - size_t dataCol = datasetSize; - size_t idx = 0; - size_t readSize = 0; tSize readBytesNum = 0; - while (dataCol != 0) { - if (dataCol > oneTimeReadWriteLen) { - readSize = oneTimeReadWriteLen; - } else { - readSize = dataCol; - } - tSize res = hdfs->Read(fs, file, fileContent + idx, readSize); - if (res == -1) { - hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); - return static_cast(res); - } - dataCol -= readSize; - idx += readSize; - readBytesNum += res; + tSize res = hdfs->Read(fs, file, fileContent, datasetSize); + if (res == -1) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + return static_cast(res); } + readBytesNum += res; hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); @@ -231,7 +203,7 @@ ssize_t HdfsFileSystem::Read(const string& filePath, vector>& file hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { hdfs->Disconnect(fs); - throw runtime_error("open hdfs file failed."); + throw runtime_error("Error: Unable to open hdfs file : {}.", filePath.c_str()); } ssize_t readBytesNum = 0; @@ -241,9 +213,13 @@ ssize_t HdfsFileSystem::Read(const string& filePath, vector>& file tSize res = hdfs->Read(fs, file, fileContent[embeddingCount].data() + contentOffset * embeddingSize, embeddingSize * sizeof(float)); - + if (res == -1) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + return static_cast(res); + } 
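+        // A short read (res < embeddingSize * sizeof(float)) is not treated as an error here;
+        // callers compare the accumulated byte count with the expected size and throw on mismatch.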
embeddingCount++; - readBytesNum += embeddingSize * sizeof(float); + readBytesNum += res; } hdfs->CloseFile(fs, file); @@ -266,26 +242,44 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { hdfs->Disconnect(fs); - throw runtime_error("open hdfs file failed."); + throw runtime_error("Error: Unable to open hdfs file : {}.", filePath.c_str()); } float* floatPtr = reinterpret_cast(firstAddress); auto i = 0; for (const auto& offset: offsetArr) { vector row(embedSizeInfo.embeddingSize); - hdfs->Seek(fs, file, offset * embedSizeInfo.embeddingSize * sizeof(float)); + int seekRes = hdfs->Seek(fs, file, offset * embedSizeInfo.embeddingSize * sizeof(float)); + if (seekRes == -1) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + throw runtime_error("Error: hdfsSeek failed with error. file offset: {}", + offset * embedSizeInfo.embeddingSize * sizeof(float)); + } + tSize res = hdfs->Read(fs, file, row.data(), embedSizeInfo.embeddingSize * sizeof(float)); - try { - aclrtMemcpy(floatPtr + i * embedSizeInfo.extendEmbSize, embedSizeInfo.embeddingSize * sizeof(float), - row.data(), embedSizeInfo.embeddingSize * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE); - } catch (std::exception& e) { + if (res == -1) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + throw runtime_error("Error: An error occurred while reading file: {}.", filePath.c_str()); + } + if (res != embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); - throw runtime_error(StringFormat("error happen when acl memory copy from host to device: %s", e.what())); + throw runtime_error("Error: Expected to read {} bytes, but actually read {} bytes from file {}.", + embeddingSize * sizeof(float), res, filePath.c_str()); + } + + aclError ret = aclrtMemcpy(floatPtr + i * embedSizeInfo.extendEmbSize, + embedSizeInfo.embeddingSize * sizeof(float), + row.data(), embedSizeInfo.embeddingSize * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE); + if (ret != ACL_SUCCESS) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + throw runtime_error("Error: Execute aclrtmemcpy from host to device failed."); } i++; } - hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); #endif diff --git a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h index 0f33934f..144f0a3a 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h +++ b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h @@ -134,20 +134,37 @@ namespace MxRec { return hdfsCloseFile(fs, file); } - tSize Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length) const + tSize Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length, tSize typeSize) const { if (hdfsRead == nullptr) { throw runtime_error("Failed to obtain the pointer of the function hdfsRead from the libhdfs."); } - return hdfsRead(fs, file, buffer, length); + return WrapperHdfsRead(fs, file, buffer, length, typeSize); } - tSize Write(hdfsFS fs, hdfsFile file, const void* buffer, tSize length) const + tSize WrapperHdfsRead(hdfsFS fs, hdfsFile file, void *buffer, tSize length, tSize typeSize) { + tSize reTryCount = 0; + tSize unReadLength = length; + tSize readBytes = 0; + + while (unReadLength != 0 && reTryCount < RETRY_COUNT) { + tSize offset = buffer + (length - unReadLength) / typeSize; + tSize res = hdfsRead(fs, file, buffer + offset, unReadLength); + if (res == -1) { + return res; + } + unReadLength -= res; + readBytes += res; + } + 
return readBytes; + } + + tSize Write(hdfsFS fs, hdfsFile file, const void* buffer, tSize length, tSize typeSize) const { if (hdfsWrite == nullptr) { throw runtime_error("Failed to obtain the pointer of the function hdfsWrite from the libhdfs."); } - return hdfsWrite(fs, file, buffer, length); + return WrapperHdfsWrite(fs, file, buffer, length, typeSize); } int Seek(hdfsFS fs, hdfsFile file, tOffset desiredPos) const diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 95a76ca5..3839b725 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -77,6 +77,9 @@ namespace MxRec { constexpr int GLOG_TIME_WIDTH_6 = 6; constexpr char GLOG_STAT_FLAG[] = "statOn"; + // for file system + constexpr int RETRY_COUNT = 100; + // unique related config constexpr int UNIQUE_BUCKET = 6; constexpr int MIN_UNIQUE_THREAD_NUM = 1; -- Gitee From 36ca59798c7c9346596c4e79f7e660711436af92 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 10:26:39 +0800 Subject: [PATCH 106/302] =?UTF-8?q?1=E3=80=81readme=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=202=E3=80=81lazy=5Fadam=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=99=A8=E5=AE=9E=E7=8E=B0=E9=80=82=E9=85=8D=E8=9E=8D=E5=90=88?= =?UTF-8?q?=E7=AE=97=E5=AD=90=203=E3=80=81=E6=89=93=E5=8C=85=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/gen_mxrec_tar_pkg.sh | 4 ++++ cust_op/fused_lazy_adam/README.md | 14 +++++++++++--- mx_rec/optimizers/lazy_adam.py | 32 +++++++++++++++++++++++++------ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/build/gen_mxrec_tar_pkg.sh b/build/gen_mxrec_tar_pkg.sh index 72ccfe49..3b6a9713 100644 --- a/build/gen_mxrec_tar_pkg.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -56,6 +56,10 @@ function gen_tar_file() chmod 640 *.json chmod 550 op_host op_kernel op_host/* op_kernel/* cd - + cd ./build/"${pkg_dir}"/cust_op/ + chmod 550 -R fused_lazy_adam + chmod 640 fused_lazy_adam/*.json + cd - cd ./build tar -zvcf "${release_tar}" "${pkg_dir}" || { warn "compression failed, packages might be broken" diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 42f5bfc9..32167b43 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -18,13 +18,21 @@ C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/deta ## lazy_adam融合算子使用 -1. 进入当前目录,执行指令进行编译和部署lazy_adam融合算子 +1. 上次fused_lazy_adam文件夹到目标环境,并进入当前目录,执行指令进行编译和部署lazy_adam融合算子 -``` +```shell bash run.sh ``` -2. 模型py脚本中导入mxRec中的lazy_adam优化器。lazy_adam优化器使用知道参考mxRec用户指南。 +2. 
模型脚本中創建lazy_adam优化器并指定使用融合算子。创建使用融合算子的lazy_adam优化器示例: + +```python +from mx_rec.optimizers.lazy_adam import create_hash_optimizer + +# 创建lazy_adam优化器时增加"use_fusion_optim=True"参数,表示使用融合算子实现。use_fusion_optim参数默认值为False。 +# lazy_adam优化器详细使用指导请参考mxRec用户指南 +sparse_optimizer = create_hash_optimizer(learning_rate=0.001, use_fusion_optim=True) +``` ## lazy_adam优化器同名融合算子lazy_adam diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 1f491d14..81c8ecba 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -32,7 +32,8 @@ from tensorflow.python.training import slot_creator from mx_rec.optimizers.base import CustomizedOptimizer from mx_rec.util.initialize import ConfigInitializer -from mx_rec.validator.validator import para_checker_decorator, StringValidator, FloatValidator +from mx_rec.util.ops import import_host_pipeline_ops +from mx_rec.validator.validator import para_checker_decorator, StringValidator, FloatValidator, ClassValidator @para_checker_decorator(check_option_list=[ @@ -40,9 +41,11 @@ from mx_rec.validator.validator import para_checker_decorator, StringValidator, ("beta1", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value_for_open_interval"]), ("beta2", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value"]), ("epsilon", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value_for_left_open_interval"]), - ("name", StringValidator, {"min_len": 1, "max_len": 200}, ["check_string_length"]) + ("name", StringValidator, {"min_len": 1, "max_len": 200}, ["check_string_length"]), + ("use_fusion_optim", ClassValidator, {"classes": (bool, type(None))}), ]) -def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, name="LazyAdam"): +def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, name="LazyAdam", + use_fusion_optim=False): """ Args: learning_rate: learning rate @@ -50,13 +53,14 @@ def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1 beta2: epsilon: name: - + use_fusion_optim: if use fused optimizer Returns: a customized optimizer instance """ if ConfigInitializer.get_instance().use_dynamic_expansion: raise ValueError("dynamic expansion mode is not compatible with the optimizer, please config dynamic " "expansion mode and optimizer correctly") - optimizer = CustomizedLazyAdam(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, name=name) + optimizer = CustomizedLazyAdam(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, name=name, + use_fusion_optim=use_fusion_optim) ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer return optimizer @@ -64,10 +68,16 @@ def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1 class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): name_counter = defaultdict(int) - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, use_locking=False, name="LazyAdam"): + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, use_locking=False, name="LazyAdam", + use_fusion_optim=False): self.optimizer_type = "LazyAdam" self.optim_param_list = ["momentum", "velocity"] self.config_instance = ConfigInitializer.get_instance() + self.use_fusion_optim = use_fusion_optim + if self.use_fusion_optim: + self._custom_initial_beta1 = beta1 + self._custom_initial_beta2 = beta2 + self._custom_initial_epsilon = epsilon super(CustomizedLazyAdam, 
self)._get_name(name=name) super(CustomizedLazyAdam, self).__init__(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, use_locking=use_locking, name=self.unique_name) @@ -164,6 +174,16 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): temp_epsilon = temp.get("temp_epsilon") learning_rate = tf.divide(temp_lr * math_ops.sqrt(1 - power_b2), (1 - power_b1)) + if self.use_fusion_optim: + nd_indices = tf.expand_dims(indices, 1) + slot_m = self.get_slot(var, "m") + slot_v = self.get_slot(var, "v") + output_m, output_v, output_var =\ + import_host_pipeline_ops().lazy_adam(grad, nd_indices, slot_m, slot_v, var, learning_rate, + self._custom_initial_beta1, self._custom_initial_beta2, + self._custom_initial_epsilon) + return control_flow_ops.group(output_m, output_v, output_var) + abs_indices = tf.math.maximum(indices, 0) nd_indices = tf.expand_dims(indices, 1) -- Gitee From d0f34e40bb9aa774de0c16a33dad9c29a6958f9f Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 10:32:45 +0800 Subject: [PATCH 107/302] =?UTF-8?q?1=E3=80=81readme=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 32167b43..3c30a40a 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -18,19 +18,19 @@ C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/deta ## lazy_adam融合算子使用 -1. 上次fused_lazy_adam文件夹到目标环境,并进入当前目录,执行指令进行编译和部署lazy_adam融合算子 +1. 上传fused_lazy_adam文件夹到目标环境,并进入当前目录,执行指令对lazy_adam融合算子进行编译和部署 ```shell bash run.sh ``` -2. 模型脚本中創建lazy_adam优化器并指定使用融合算子。创建使用融合算子的lazy_adam优化器示例: +2. 
模型脚本中创建lazy_adam优化器并指定使用融合算子。代码示例: ```python from mx_rec.optimizers.lazy_adam import create_hash_optimizer # 创建lazy_adam优化器时增加"use_fusion_optim=True"参数,表示使用融合算子实现。use_fusion_optim参数默认值为False。 -# lazy_adam优化器详细使用指导请参考mxRec用户指南 +# lazy_adam优化器详细使用指导请参考mxRec用户指南。 sparse_optimizer = create_hash_optimizer(learning_rate=0.001, use_fusion_optim=True) ``` -- Gitee From 6584ea407ba256523d815c653a7c72b1bf5a05e7 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 7 May 2024 11:03:32 +0800 Subject: [PATCH 108/302] =?UTF-8?q?hdfs=E4=B8=ADread=E3=80=81write?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E5=8A=A0=E5=9B=BA=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E4=BF=9D=E5=AD=98=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hdfs_file_system/hdfs_wrapper.h | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h index 144f0a3a..78699b01 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h +++ b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h @@ -142,7 +142,8 @@ namespace MxRec { return WrapperHdfsRead(fs, file, buffer, length, typeSize); } - tSize WrapperHdfsRead(hdfsFS fs, hdfsFile file, void *buffer, tSize length, tSize typeSize) { + tSize WrapperHdfsRead(hdfsFS fs, hdfsFile file, void *buffer, tSize length, tSize typeSize) const + { tSize reTryCount = 0; tSize unReadLength = length; tSize readBytes = 0; @@ -167,6 +168,24 @@ namespace MxRec { return WrapperHdfsWrite(fs, file, buffer, length, typeSize); } + tSize WrapperHdfsWrite(hdfsFS fs, hdfsFile file, const void *buffer, tSize length, tSize typeSize) const + { + tSize reTryCount = 0; + tSize unWriteLength = length; + tSize writeBytes = 0; + + while (unWriteLength != 0 && reTryCount < RETRY_COUNT) { + tSize offset = buffer + (length - unWriteLength) / typeSize; + tSize res = hdfsWrite(fs, file, buffer + offset, unWriteLength); + if (res == -1) { + return res; + } + unWriteLength -= res; + writeBytes += res; + } + return writeBytes; + } + int Seek(hdfsFS fs, hdfsFile file, tOffset desiredPos) const { if (hdfsSeek == nullptr) { -- Gitee From ffbc239527973d61032d6d52dea59a469a4d182d Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 7 May 2024 11:03:32 +0800 Subject: [PATCH 109/302] =?UTF-8?q?hdfs=E4=B8=ADread=E3=80=81write?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E5=8A=A0=E5=9B=BA=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E4=BF=9D=E5=AD=98=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hdfs_file_system/hdfs_file_system.cpp | 6 +-- .../hdfs_file_system/hdfs_wrapper.h | 50 ++++++++++++++++--- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp index ec9e9bac..7fde1a22 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp @@ -81,7 +81,7 @@ ssize_t HdfsFileSystem::Write(const string& filePath, const char* fileContent, s } tSize writeBytesNum = 0; - tSize res = hdfs->Write(fs, file, fileContent, dataSize, sizeof(char)); + tSize res = hdfs->Write(fs, file, fileContent, dataSize); if (res == -1) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); @@ -107,7 +107,7 @@ ssize_t HdfsFileSystem::Write(const string& 
filePath, vector fileContent tSize writeBytesNum = 0; size_t loops = fileContent.size(); for (size_t i = 0; i < loops; i++) { - tSize res = hdfs->Write(fs, file, fileContent[i], dataSize, sizeof(float)); + tSize res = hdfs->Write(fs, file, fileContent[i], dataSize); if (res == -1) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); @@ -152,7 +152,7 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding throw runtime_error("Error: Execute aclrtmemcpy from device to host failed."); } - tSize res = hdfs->Write(fs, file, row.data(), embeddingSize * sizeof(float), sizeof(float)); + tSize res = hdfs->Write(fs, file, row.data(), embeddingSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); diff --git a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h index 78699b01..33f2738b 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h +++ b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h @@ -134,22 +134,40 @@ namespace MxRec { return hdfsCloseFile(fs, file); } - tSize Read(hdfsFS fs, hdfsFile file, void* buffer, tSize length, tSize typeSize) const + tSize Read(hdfsFS fs, hdfsFile file, char* buffer, tSize length) const { if (hdfsRead == nullptr) { throw runtime_error("Failed to obtain the pointer of the function hdfsRead from the libhdfs."); } - return WrapperHdfsRead(fs, file, buffer, length, typeSize); + + tSize reTryCount = 0; + tSize unReadLength = length; + tSize readBytes = 0; + + while (unReadLength != 0 && reTryCount < RETRY_COUNT) { + tSize offset = buffer + (length - unReadLength) / sizeof(char); + tSize res = hdfsRead(fs, file, buffer + offset, unReadLength); + if (res == -1) { + return res; + } + unReadLength -= res; + readBytes += res; + } + return readBytes; } - tSize WrapperHdfsRead(hdfsFS fs, hdfsFile file, void *buffer, tSize length, tSize typeSize) const + tSize Read(hdfsFS fs, hdfsFile file, float* buffer, tSize length) const { + if (hdfsRead == nullptr) { + throw runtime_error("Failed to obtain the pointer of the function hdfsRead from the libhdfs."); + } + tSize reTryCount = 0; tSize unReadLength = length; tSize readBytes = 0; while (unReadLength != 0 && reTryCount < RETRY_COUNT) { - tSize offset = buffer + (length - unReadLength) / typeSize; + tSize offset = buffer + (length - unReadLength) / sizeof(float); tSize res = hdfsRead(fs, file, buffer + offset, unReadLength); if (res == -1) { return res; @@ -160,22 +178,38 @@ namespace MxRec { return readBytes; } - tSize Write(hdfsFS fs, hdfsFile file, const void* buffer, tSize length, tSize typeSize) const + tSize Write(hdfsFS fs, hdfsFile file, const char* buffer, tSize length, tSize typeSize) const { if (hdfsWrite == nullptr) { throw runtime_error("Failed to obtain the pointer of the function hdfsWrite from the libhdfs."); } - return WrapperHdfsWrite(fs, file, buffer, length, typeSize); + tSize reTryCount = 0; + tSize unWriteLength = length; + tSize writeBytes = 0; + + while (unWriteLength != 0 && reTryCount < RETRY_COUNT) { + tSize offset = buffer + (length - unWriteLength) / sizeof(char); + tSize res = hdfsWrite(fs, file, buffer + offset, unWriteLength); + if (res == -1) { + return res; + } + unWriteLength -= res; + writeBytes += res; + } + return writeBytes; } - tSize WrapperHdfsWrite(hdfsFS fs, hdfsFile file, const void *buffer, tSize length, tSize typeSize) const + tSize Write(hdfsFS fs, hdfsFile file, const float* buffer, tSize length, tSize typeSize) const { + if (hdfsWrite == nullptr) { + 
throw runtime_error("Failed to obtain the pointer of the function hdfsWrite from the libhdfs."); + } tSize reTryCount = 0; tSize unWriteLength = length; tSize writeBytes = 0; while (unWriteLength != 0 && reTryCount < RETRY_COUNT) { - tSize offset = buffer + (length - unWriteLength) / typeSize; + tSize offset = buffer + (length - unWriteLength) / sizeof(float); tSize res = hdfsWrite(fs, file, buffer + offset, unWriteLength); if (res == -1) { return res; -- Gitee From a505ea1b0d403d263a82f9dfddf8b5abdc853565 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 13:06:22 +0800 Subject: [PATCH 110/302] =?UTF-8?q?1=E3=80=81aclnn=20cmake=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt index c4a727bf..112c0a8c 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt @@ -52,6 +52,8 @@ link_directories( ) add_executable(execute_op + operator_desc.cpp + op_runner.cpp main.cpp common.cpp ) -- Gitee From 836e97bec2cc3beb902769601672333e34732efb Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 14:47:00 +0800 Subject: [PATCH 111/302] =?UTF-8?q?1=E3=80=81readme=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 3c30a40a..fb92806d 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -24,7 +24,7 @@ C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/deta bash run.sh ``` -2. 模型脚本中创建lazy_adam优化器并指定使用融合算子。代码示例: +2. 模型脚本中创建lazy_adam优化器并指定使用融合算子实现。代码示例: ```python from mx_rec.optimizers.lazy_adam import create_hash_optimizer @@ -34,7 +34,7 @@ from mx_rec.optimizers.lazy_adam import create_hash_optimizer sparse_optimizer = create_hash_optimizer(learning_rate=0.001, use_fusion_optim=True) ``` -## lazy_adam优化器同名融合算子lazy_adam +## LazyAdam融合算子 1. 算子分析 @@ -119,15 +119,17 @@ bash run.sh 完成算子的编译部署,编译部署时需要开启算子的二进制编译功能:修改算子工程中的编译配置项文件CMakePresets.json,将 ENABLE_BINARY_PACKAGE设置为True。编译部署时可将算子的二进制部署到当前环境,便于后续算子的调用。 -3. 检查API执行需要的头文件和库文件是否自动生成,针对mxRec,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 - aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 +3. + +检查API执行需要的头文件和库文件是否自动生成,针对融合算子,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 +aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 注意:对于cust_op/fused_lazy_adam/run.sh脚本,安装算子后会删除构建目录。运行单算子测试时,需要屏蔽掉删除rm rf ./lazy_adam这一步,以确保前置条件3。 -### 融合算子 lazy_adam +### LazyAdam融合算子de AclNN调用实现 -针对lazy_adam算子,入口src/main.cpp中: +针对LazyAdam融合算子,入口src/main.cpp中: 1. InitResource函数:初始化AscendCL并运行管理资源申请,不用修改 2. 
RunLookupOp运行算子: @@ -156,14 +158,14 @@ run.sh脚本依次执行: ### scripts脚本 -* gen_data.py:生成lazy_adam算子的输入数据和用于精度校验的golden数据,可自行修改测试相关dim参数。 -* verify_result.py:将算子的输出和脚本生成的golden数据进行精度比对,并输出比较结果。比对规则为:允许误差精度loss:1e-4 +* gen_data.py:生成LazyAdam融合算子的输入数据和用于精度校验的golden数据,可自行修改测试相关dim参数。 +* verify_result.py:将算子的输出和脚本生成的golden数据进行精度比对,并输出比较结果。比对规则为:允许误差精度loss:1e-6 a) 绝对误差 b) 相对误差 c) 误差相对个数 同时满足绝对误差不全小于loss,相对误差不全小于loss,且绝对误差和相对误差大于loss的个数都超过总数的1/loss,也就是 -1/10000(双万分之一),即认为算子精度不达标。其余情况均认为算子达标。 +1/1000000(百万分之一),即认为算子精度不达标。其余情况均认为算子达标。 用户可自行修改允许精度误差范围loss。 \ No newline at end of file -- Gitee From b55b2586c884d5c79fc07ec727bc6050d9e11bb3 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 15:10:52 +0800 Subject: [PATCH 112/302] =?UTF-8?q?1=E3=80=81=E8=9E=8D=E5=90=88=E7=AE=97?= =?UTF-8?q?=E5=AD=90readme=E8=84=9A=E6=9C=AC=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index fb92806d..136a50e7 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -16,7 +16,7 @@ 更多详情可以参考CANN官方的Ascend C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0001.html)。 -## lazy_adam融合算子使用 +## LazyAdam融合算子使用 1. 上传fused_lazy_adam文件夹到目标环境,并进入当前目录,执行指令对lazy_adam融合算子进行编译和部署 @@ -24,6 +24,12 @@ C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/deta bash run.sh ``` +注:需先环境中设置CANN相关环境变量,再执行算子编译和安装指令。使用默认路径安装CANN时设置环境变量指令如下: + +```shell +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + 2. 模型脚本中创建lazy_adam优化器并指定使用融合算子实现。代码示例: ```python @@ -34,7 +40,7 @@ from mx_rec.optimizers.lazy_adam import create_hash_optimizer sparse_optimizer = create_hash_optimizer(learning_rate=0.001, use_fusion_optim=True) ``` -## LazyAdam融合算子 +## LazyAdam融合算子介绍 1. 
算子分析 -- Gitee From 18cdbad6d302af3d35360d170cb0bb41ea2a071f Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 15:14:25 +0800 Subject: [PATCH 113/302] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt index 112c0a8c..c2366f4a 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt @@ -52,9 +52,9 @@ link_directories( ) add_executable(execute_op + main.cpp operator_desc.cpp op_runner.cpp - main.cpp common.cpp ) -- Gitee From e842f98fe595d6b16ee9d15385cd8685a64fbd31 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 7 May 2024 15:26:26 +0800 Subject: [PATCH 114/302] =?UTF-8?q?hdfs=E4=B8=ADread=E3=80=81write?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E5=8A=A0=E5=9B=BA=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E4=BF=9D=E5=AD=98=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/emb_table/embedding_ddr.cpp | 55 +++++++++++++----------- src/core/emb_table/embedding_dynamic.cpp | 24 ++++++----- src/core/emb_table/embedding_static.cpp | 24 ++++++----- 3 files changed, 57 insertions(+), 46 deletions(-) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 7226f849..ca48230b 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -324,24 +324,26 @@ void EmbeddingDDR::LoadKey(const string& savePath) { size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + throw runtime_error( + StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { free(static_cast(buf)); - throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { free(static_cast(buf)); - throw runtime_error( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - fileSize, res, ss.str()); + throw runtime_error(StringFormat( + "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, + res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -356,9 +358,9 @@ void EmbeddingDDR::LoadKey(const string& savePath) { } if (keyCount > devVocabSize + hostVocabSize) { free(static_cast(buf)); - throw runtime_error( + throw runtime_error(StringFormat( "Error: Load keys failed. 
Load key size :{} exceeds the sum of device vocab size and host vocab size: {}.", - keyCount, devVocabSize + hostVocabSize); + keyCount, devVocabSize + hostVocabSize)); } else if (keyCount < devVocabSize) { loadOffset.push_back(i); devOffset2Key[keyCount] = buf[i]; @@ -394,13 +396,13 @@ void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) size_t readSize = hostLoadOffset.size() * embSize_ * sizeof(float); ssize_t res = fileSystemPtr->Read(embedStream.str(), table.embData, 0, hostLoadOffset, embSize_); if (res == -1) { - throw runtime_error("Error: Load embeddings failed. An error occurred while reading file: {}.", - embedStream.str()); + throw runtime_error(StringFormat("Error: Load embeddings failed. An error occurred while reading file: {}.", + embedStream.str())); } if (res != readSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - readSize, res, embedStream.str()); + readSize, res, embedStream.str())); } // 读optim @@ -411,13 +413,13 @@ void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) ssize_t res = fileSystemPtr->Read(paramStream.str(), table.embData, optimIndex, hostLoadOffset, embSize_); if (res == -1) { - throw runtime_error("Error: Load optimizers failed. An error occurred while reading file: {}.", - paramStream.str()); + throw runtime_error(StringFormat("Error: Load optimizers failed. An error occurred while reading file: {}.", + paramStream.str())); } if (res != readSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - readSize, res, paramStream.str()); + readSize, res, paramStream.str())); } optimIndex++; } @@ -452,12 +454,13 @@ void EmbeddingDDR::SaveKey(const string& savePath) { size_t writeSize = static_cast(hostKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(hostKey.data()), writeSize); if (res == -1) { - throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -474,12 +477,13 @@ void EmbeddingDDR::SaveEmbData(const string& savePath) size_t writeSize = embSize_ * sizeof(float) * embContent.size(); ssize_t res = fileSystemPtr->Write(ss.str(), embContent, embSize_ * sizeof(float)); if (res == -1) { - throw runtime_error("Error: Save embeddings failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save embeddings failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save embeddings failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -498,12 +502,13 @@ void EmbeddingDDR::SaveOptimData(const string& savePath) ssize_t res = fileSystemPtr->Write(ss.str(), content.second, embSize_ * sizeof(float)); if (res == -1) { - throw runtime_error("Error: Save optimizers failed. 
An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save optimizers failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save optimizers failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } } diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index f81f2ab7..a4562d10 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -153,12 +153,13 @@ void EmbeddingDynamic::SaveKey(const string& savePath) size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -255,22 +256,24 @@ int EmbeddingDynamic::LoadKey(const string& savePath) size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + throw runtime_error( + StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { - throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { - throw runtime_error( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - fileSize, res, ss.str()); + throw runtime_error(StringFormat( + "Error: Load keys failed. 
Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, + res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -288,7 +291,8 @@ int EmbeddingDynamic::LoadKey(const string& savePath) void *newBlock = nullptr; aclError ret = aclrtMalloc(&newBlock, static_cast(datasetSize), ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { - throw runtime_error("Error: in dynamic expansion mode, aclrtMalloc failed, malloc size: {}.", datasetSize); + throw runtime_error( + StringFormat("Error: in dynamic expansion mode, aclrtMalloc failed, malloc size: {}.", datasetSize)); } // 此处的 newBlock -> first address; // 对key_offset map 进行一个恢复操作 diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index 3c741e46..caf15e7c 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -97,12 +97,13 @@ int EmbeddingStatic::SaveKey(const string& savePath) size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -121,22 +122,23 @@ void EmbeddingStatic::LoadKey(const string &savePath) size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { - throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { - throw runtime_error( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - fileSize, res, ss.str()); + throw runtime_error(StringFormat( + "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, + res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -152,8 +154,8 @@ void EmbeddingStatic::LoadKey(const string &savePath) if (loadOffset.size() > devVocabSize) { free(static_cast(buf)); - throw runtime_error("Error: Load keys failed. Load key size :{} exceeds device vocab size: {}.", - loadOffset.size(), devVocabSize); + throw runtime_error(StringFormat("Error: Load keys failed. 
Load key size :{} exceeds device vocab size: {}.", + loadOffset.size(), devVocabSize)); } maxOffset = keyOffsetMap.size(); -- Gitee From 741a8b70eaa547e407be3e49615f61a6666f196c Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 7 May 2024 15:26:26 +0800 Subject: [PATCH 115/302] =?UTF-8?q?hdfs=E4=B8=ADread=E3=80=81write?= =?UTF-8?q?=E5=87=BD=E6=95=B0=E5=8A=A0=E5=9B=BA=EF=BC=8C=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E4=BF=9D=E5=AD=98=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/emb_table/embedding_ddr.cpp | 55 ++++++++++--------- src/core/emb_table/embedding_dynamic.cpp | 24 ++++---- src/core/emb_table/embedding_static.cpp | 24 ++++---- .../hdfs_file_system/hdfs_wrapper.h | 8 +-- 4 files changed, 61 insertions(+), 50 deletions(-) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 7226f849..ca48230b 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -324,24 +324,26 @@ void EmbeddingDDR::LoadKey(const string& savePath) { size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + throw runtime_error( + StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { free(static_cast(buf)); - throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { free(static_cast(buf)); - throw runtime_error( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - fileSize, res, ss.str()); + throw runtime_error(StringFormat( + "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, + res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -356,9 +358,9 @@ void EmbeddingDDR::LoadKey(const string& savePath) { } if (keyCount > devVocabSize + hostVocabSize) { free(static_cast(buf)); - throw runtime_error( + throw runtime_error(StringFormat( "Error: Load keys failed. Load key size :{} exceeds the sum of device vocab size and host vocab size: {}.", - keyCount, devVocabSize + hostVocabSize); + keyCount, devVocabSize + hostVocabSize)); } else if (keyCount < devVocabSize) { loadOffset.push_back(i); devOffset2Key[keyCount] = buf[i]; @@ -394,13 +396,13 @@ void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) size_t readSize = hostLoadOffset.size() * embSize_ * sizeof(float); ssize_t res = fileSystemPtr->Read(embedStream.str(), table.embData, 0, hostLoadOffset, embSize_); if (res == -1) { - throw runtime_error("Error: Load embeddings failed. An error occurred while reading file: {}.", - embedStream.str()); + throw runtime_error(StringFormat("Error: Load embeddings failed. 
An error occurred while reading file: {}.", + embedStream.str())); } if (res != readSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - readSize, res, embedStream.str()); + readSize, res, embedStream.str())); } // 读optim @@ -411,13 +413,13 @@ void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) ssize_t res = fileSystemPtr->Read(paramStream.str(), table.embData, optimIndex, hostLoadOffset, embSize_); if (res == -1) { - throw runtime_error("Error: Load optimizers failed. An error occurred while reading file: {}.", - paramStream.str()); + throw runtime_error(StringFormat("Error: Load optimizers failed. An error occurred while reading file: {}.", + paramStream.str())); } if (res != readSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - readSize, res, paramStream.str()); + readSize, res, paramStream.str())); } optimIndex++; } @@ -452,12 +454,13 @@ void EmbeddingDDR::SaveKey(const string& savePath) { size_t writeSize = static_cast(hostKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(hostKey.data()), writeSize); if (res == -1) { - throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -474,12 +477,13 @@ void EmbeddingDDR::SaveEmbData(const string& savePath) size_t writeSize = embSize_ * sizeof(float) * embContent.size(); ssize_t res = fileSystemPtr->Write(ss.str(), embContent, embSize_ * sizeof(float)); if (res == -1) { - throw runtime_error("Error: Save embeddings failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save embeddings failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save embeddings failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -498,12 +502,13 @@ void EmbeddingDDR::SaveOptimData(const string& savePath) ssize_t res = fileSystemPtr->Write(ss.str(), content.second, embSize_ * sizeof(float)); if (res == -1) { - throw runtime_error("Error: Save optimizers failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save optimizers failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save optimizers failed. 
Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } } diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index f81f2ab7..a4562d10 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -153,12 +153,13 @@ void EmbeddingDynamic::SaveKey(const string& savePath) size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - throw runtime_error("Error: Save keys failed. An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -255,22 +256,24 @@ int EmbeddingDynamic::LoadKey(const string& savePath) size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + throw runtime_error( + StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { - throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { - throw runtime_error( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - fileSize, res, ss.str()); + throw runtime_error(StringFormat( + "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, + res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -288,7 +291,8 @@ int EmbeddingDynamic::LoadKey(const string& savePath) void *newBlock = nullptr; aclError ret = aclrtMalloc(&newBlock, static_cast(datasetSize), ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { - throw runtime_error("Error: in dynamic expansion mode, aclrtMalloc failed, malloc size: {}.", datasetSize); + throw runtime_error( + StringFormat("Error: in dynamic expansion mode, aclrtMalloc failed, malloc size: {}.", datasetSize)); } // 此处的 newBlock -> first address; // 对key_offset map 进行一个恢复操作 diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index 3c741e46..caf15e7c 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -97,12 +97,13 @@ int EmbeddingStatic::SaveKey(const string& savePath) size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - throw runtime_error("Error: Save keys failed. 
An error occurred while writing file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error( + throw runtime_error(StringFormat( "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str()); + writeSize, res, ss.str())); } } @@ -121,22 +122,23 @@ void EmbeddingStatic::LoadKey(const string &savePath) size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize); + throw runtime_error(StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { - throw runtime_error("Error: Load keys failed. An error occurred while reading file: {}.", ss.str()); + throw runtime_error( + StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { - throw runtime_error( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - fileSize, res, ss.str()); + throw runtime_error(StringFormat( + "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, + res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -152,8 +154,8 @@ void EmbeddingStatic::LoadKey(const string &savePath) if (loadOffset.size() > devVocabSize) { free(static_cast(buf)); - throw runtime_error("Error: Load keys failed. Load key size :{} exceeds device vocab size: {}.", - loadOffset.size(), devVocabSize); + throw runtime_error(StringFormat("Error: Load keys failed. 
Load key size :{} exceeds device vocab size: {}.", + loadOffset.size(), devVocabSize)); } maxOffset = keyOffsetMap.size(); diff --git a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h index 33f2738b..205e5365 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h +++ b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h @@ -145,7 +145,7 @@ namespace MxRec { tSize readBytes = 0; while (unReadLength != 0 && reTryCount < RETRY_COUNT) { - tSize offset = buffer + (length - unReadLength) / sizeof(char); + tSize offset = (length - unReadLength) / sizeof(char); tSize res = hdfsRead(fs, file, buffer + offset, unReadLength); if (res == -1) { return res; @@ -167,7 +167,7 @@ namespace MxRec { tSize readBytes = 0; while (unReadLength != 0 && reTryCount < RETRY_COUNT) { - tSize offset = buffer + (length - unReadLength) / sizeof(float); + tSize offset = (length - unReadLength) / sizeof(float); tSize res = hdfsRead(fs, file, buffer + offset, unReadLength); if (res == -1) { return res; @@ -188,7 +188,7 @@ namespace MxRec { tSize writeBytes = 0; while (unWriteLength != 0 && reTryCount < RETRY_COUNT) { - tSize offset = buffer + (length - unWriteLength) / sizeof(char); + tSize offset = (length - unWriteLength) / sizeof(char); tSize res = hdfsWrite(fs, file, buffer + offset, unWriteLength); if (res == -1) { return res; @@ -209,7 +209,7 @@ namespace MxRec { tSize writeBytes = 0; while (unWriteLength != 0 && reTryCount < RETRY_COUNT) { - tSize offset = buffer + (length - unWriteLength) / sizeof(float); + tSize offset = (length - unWriteLength) / sizeof(float); tSize res = hdfsWrite(fs, file, buffer + offset, unWriteLength); if (res == -1) { return res; -- Gitee From 61525ff68b3798bf86f46fae48b3b4a964acffea Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 15:38:52 +0800 Subject: [PATCH 116/302] =?UTF-8?q?=E5=8F=82=E6=95=B0=E6=A0=A1=E9=AA=8C?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/optimizers/lazy_adam.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 81c8ecba..875f350f 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -42,7 +42,7 @@ from mx_rec.validator.validator import para_checker_decorator, StringValidator, ("beta2", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value"]), ("epsilon", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value_for_left_open_interval"]), ("name", StringValidator, {"min_len": 1, "max_len": 200}, ["check_string_length"]), - ("use_fusion_optim", ClassValidator, {"classes": (bool, type(None))}), + ("use_fusion_optim", ClassValidator, {"classes": (bool,)}), ]) def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, name="LazyAdam", use_fusion_optim=False): @@ -178,7 +178,7 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): nd_indices = tf.expand_dims(indices, 1) slot_m = self.get_slot(var, "m") slot_v = self.get_slot(var, "v") - output_m, output_v, output_var =\ + output_m, output_v, output_var = \ import_host_pipeline_ops().lazy_adam(grad, nd_indices, slot_m, slot_v, var, learning_rate, self._custom_initial_beta1, self._custom_initial_beta2, self._custom_initial_epsilon) -- Gitee From 199e06dd2506248e9830d81edb3d0720c080be64 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> 
Date: Tue, 7 May 2024 15:44:19 +0800 Subject: [PATCH 117/302] =?UTF-8?q?readme=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 136a50e7..c42d1bfe 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -127,15 +127,15 @@ ENABLE_BINARY_PACKAGE设置为True。编译部署时可将算子的二进制部 3. -检查API执行需要的头文件和库文件是否自动生成,针对融合算子,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 +检查API执行需要的头文件和库文件是否自动生成,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 注意:对于cust_op/fused_lazy_adam/run.sh脚本,安装算子后会删除构建目录。运行单算子测试时,需要屏蔽掉删除rm rf ./lazy_adam这一步,以确保前置条件3。 -### LazyAdam融合算子de AclNN调用实现 +### LazyAdam融合算子的AclNN调用实现 -针对LazyAdam融合算子,入口src/main.cpp中: +调用入口在src/main.cpp中: 1. InitResource函数:初始化AscendCL并运行管理资源申请,不用修改 2. RunLookupOp运行算子: -- Gitee From fb792fec233d602971287ccbd7f5c06ff5ea0139 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 7 May 2024 08:05:17 +0000 Subject: [PATCH 118/302] =?UTF-8?q?!121=20=E8=9E=8D=E5=90=88=E7=AE=97?= =?UTF-8?q?=E5=AD=90=E9=80=82=E9=85=8D=20*=20readme=E4=BF=AE=E6=94=B9=20*?= =?UTF-8?q?=20=E5=8F=82=E6=95=B0=E6=A0=A1=E9=AA=8C=E4=BF=AE=E6=94=B9=20*?= =?UTF-8?q?=20=E4=BB=A3=E7=A0=81=E6=A3=80=E8=A7=86=E4=BF=AE=E6=94=B9=20*?= =?UTF-8?q?=201=E3=80=81=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90readme?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=E6=9B=B4=E6=96=B0=20*=201=E3=80=81readme?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=E6=9B=B4=E6=96=B0=20*=201=E3=80=81aclnn=20cm?= =?UTF-8?q?ake=E4=BF=AE=E6=94=B9=20*=201=E3=80=81readme=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=20*=201=E3=80=81readme=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/gen_mxrec_tar_pkg.sh | 4 ++ cust_op/fused_lazy_adam/README.md | 40 +++++++++++++------ .../aclnn_lazy_adam_test/src/CMakeLists.txt | 2 + mx_rec/optimizers/lazy_adam.py | 32 ++++++++++++--- 4 files changed, 60 insertions(+), 18 deletions(-) diff --git a/build/gen_mxrec_tar_pkg.sh b/build/gen_mxrec_tar_pkg.sh index 72ccfe49..3b6a9713 100644 --- a/build/gen_mxrec_tar_pkg.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -56,6 +56,10 @@ function gen_tar_file() chmod 640 *.json chmod 550 op_host op_kernel op_host/* op_kernel/* cd - + cd ./build/"${pkg_dir}"/cust_op/ + chmod 550 -R fused_lazy_adam + chmod 640 fused_lazy_adam/*.json + cd - cd ./build tar -zvcf "${release_tar}" "${pkg_dir}" || { warn "compression failed, packages might be broken" diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 42f5bfc9..c42d1bfe 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -16,17 +16,31 @@ 更多详情可以参考CANN官方的Ascend C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/detail/zh/canncommercial/70RC1/operatordev/Ascendcopdevg/atlas_ascendc_10_0001.html)。 -## lazy_adam融合算子使用 +## LazyAdam融合算子使用 -1. 进入当前目录,执行指令进行编译和部署lazy_adam融合算子 +1. 上传fused_lazy_adam文件夹到目标环境,并进入当前目录,执行指令对lazy_adam融合算子进行编译和部署 -``` +```shell bash run.sh ``` -2. 模型py脚本中导入mxRec中的lazy_adam优化器。lazy_adam优化器使用知道参考mxRec用户指南。 +注:需先环境中设置CANN相关环境变量,再执行算子编译和安装指令。使用默认路径安装CANN时设置环境变量指令如下: + +```shell +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +2. 
模型脚本中创建lazy_adam优化器并指定使用融合算子实现。代码示例: + +```python +from mx_rec.optimizers.lazy_adam import create_hash_optimizer -## lazy_adam优化器同名融合算子lazy_adam +# 创建lazy_adam优化器时增加"use_fusion_optim=True"参数,表示使用融合算子实现。use_fusion_optim参数默认值为False。 +# lazy_adam优化器详细使用指导请参考mxRec用户指南。 +sparse_optimizer = create_hash_optimizer(learning_rate=0.001, use_fusion_optim=True) +``` + +## LazyAdam融合算子介绍 1. 算子分析 @@ -111,15 +125,17 @@ bash run.sh 完成算子的编译部署,编译部署时需要开启算子的二进制编译功能:修改算子工程中的编译配置项文件CMakePresets.json,将 ENABLE_BINARY_PACKAGE设置为True。编译部署时可将算子的二进制部署到当前环境,便于后续算子的调用。 -3. 检查API执行需要的头文件和库文件是否自动生成,针对mxRec,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 - aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 +3. + +检查API执行需要的头文件和库文件是否自动生成,检查cust_op/fused_lazy_adam/lazy_adam/build_out/autogen目录下,是否有 +aclnn_lazy_adam.cpp和aclnn_lazy_adam.h等。 注意:对于cust_op/fused_lazy_adam/run.sh脚本,安装算子后会删除构建目录。运行单算子测试时,需要屏蔽掉删除rm rf ./lazy_adam这一步,以确保前置条件3。 -### 融合算子 lazy_adam +### LazyAdam融合算子的AclNN调用实现 -针对lazy_adam算子,入口src/main.cpp中: +调用入口在src/main.cpp中: 1. InitResource函数:初始化AscendCL并运行管理资源申请,不用修改 2. RunLookupOp运行算子: @@ -148,14 +164,14 @@ run.sh脚本依次执行: ### scripts脚本 -* gen_data.py:生成lazy_adam算子的输入数据和用于精度校验的golden数据,可自行修改测试相关dim参数。 -* verify_result.py:将算子的输出和脚本生成的golden数据进行精度比对,并输出比较结果。比对规则为:允许误差精度loss:1e-4 +* gen_data.py:生成LazyAdam融合算子的输入数据和用于精度校验的golden数据,可自行修改测试相关dim参数。 +* verify_result.py:将算子的输出和脚本生成的golden数据进行精度比对,并输出比较结果。比对规则为:允许误差精度loss:1e-6 a) 绝对误差 b) 相对误差 c) 误差相对个数 同时满足绝对误差不全小于loss,相对误差不全小于loss,且绝对误差和相对误差大于loss的个数都超过总数的1/loss,也就是 -1/10000(双万分之一),即认为算子精度不达标。其余情况均认为算子达标。 +1/1000000(百万分之一),即认为算子精度不达标。其余情况均认为算子达标。 用户可自行修改允许精度误差范围loss。 \ No newline at end of file diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt index c4a727bf..c2366f4a 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/CMakeLists.txt @@ -53,6 +53,8 @@ link_directories( add_executable(execute_op main.cpp + operator_desc.cpp + op_runner.cpp common.cpp ) diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 1f491d14..875f350f 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -32,7 +32,8 @@ from tensorflow.python.training import slot_creator from mx_rec.optimizers.base import CustomizedOptimizer from mx_rec.util.initialize import ConfigInitializer -from mx_rec.validator.validator import para_checker_decorator, StringValidator, FloatValidator +from mx_rec.util.ops import import_host_pipeline_ops +from mx_rec.validator.validator import para_checker_decorator, StringValidator, FloatValidator, ClassValidator @para_checker_decorator(check_option_list=[ @@ -40,9 +41,11 @@ from mx_rec.validator.validator import para_checker_decorator, StringValidator, ("beta1", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value_for_open_interval"]), ("beta2", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value"]), ("epsilon", FloatValidator, {"min_value": 0.0, "max_value": 1.0}, ["check_value_for_left_open_interval"]), - ("name", StringValidator, {"min_len": 1, "max_len": 200}, ["check_string_length"]) + ("name", StringValidator, {"min_len": 1, "max_len": 200}, ["check_string_length"]), + ("use_fusion_optim", ClassValidator, {"classes": (bool,)}), ]) -def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, name="LazyAdam"): +def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, 
epsilon=1e-8, name="LazyAdam", + use_fusion_optim=False): """ Args: learning_rate: learning rate @@ -50,13 +53,14 @@ def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1 beta2: epsilon: name: - + use_fusion_optim: if use fused optimizer Returns: a customized optimizer instance """ if ConfigInitializer.get_instance().use_dynamic_expansion: raise ValueError("dynamic expansion mode is not compatible with the optimizer, please config dynamic " "expansion mode and optimizer correctly") - optimizer = CustomizedLazyAdam(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, name=name) + optimizer = CustomizedLazyAdam(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, name=name, + use_fusion_optim=use_fusion_optim) ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer return optimizer @@ -64,10 +68,16 @@ def create_hash_optimizer(learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1 class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): name_counter = defaultdict(int) - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, use_locking=False, name="LazyAdam"): + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, use_locking=False, name="LazyAdam", + use_fusion_optim=False): self.optimizer_type = "LazyAdam" self.optim_param_list = ["momentum", "velocity"] self.config_instance = ConfigInitializer.get_instance() + self.use_fusion_optim = use_fusion_optim + if self.use_fusion_optim: + self._custom_initial_beta1 = beta1 + self._custom_initial_beta2 = beta2 + self._custom_initial_epsilon = epsilon super(CustomizedLazyAdam, self)._get_name(name=name) super(CustomizedLazyAdam, self).__init__(learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon, use_locking=use_locking, name=self.unique_name) @@ -164,6 +174,16 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): temp_epsilon = temp.get("temp_epsilon") learning_rate = tf.divide(temp_lr * math_ops.sqrt(1 - power_b2), (1 - power_b1)) + if self.use_fusion_optim: + nd_indices = tf.expand_dims(indices, 1) + slot_m = self.get_slot(var, "m") + slot_v = self.get_slot(var, "v") + output_m, output_v, output_var = \ + import_host_pipeline_ops().lazy_adam(grad, nd_indices, slot_m, slot_v, var, learning_rate, + self._custom_initial_beta1, self._custom_initial_beta2, + self._custom_initial_epsilon) + return control_flow_ops.group(output_m, output_v, output_var) + abs_indices = tf.math.maximum(indices, 0) nd_indices = tf.expand_dims(indices, 1) -- Gitee From 5e6bd96e69ca5893cf07511522b6a3f68ce4c59f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 7 May 2024 13:46:16 +0000 Subject: [PATCH 119/302] =?UTF-8?q?!81=20create=5Ftable=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E4=B8=8E=E4=BC=98=E5=8C=96=E5=99=A8=E5=88=9B=E5=BB=BA=E8=A7=A3?= =?UTF-8?q?=E8=80=A6=EF=BC=88=E4=B8=8D=E4=BC=A0=E5=85=A5=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=99=A8=E5=8F=82=E6=95=B0=EF=BC=89=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91clean=20code?= =?UTF-8?q?=E5=92=8C=E8=85=BE=E8=AE=AFeval=E9=83=A8=E5=88=86=E6=94=B9?= =?UTF-8?q?=E5=9B=BE=E7=9A=84=E4=BF=AE=E6=94=B9=20*=20Merge=20remote-track?= =?UTF-8?q?ing=20branch=20'upstream/develop'=20into=20develop-ddr-witho?= =?UTF-8?q?=E2=80=A6=20*=20Merge=20remote-tracking=20branch=20'upstream/de?= =?UTF-8?q?velop'=20into=20develop-ddr-witho=E2=80=A6=20*=20=E3=80=90?= 
=?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91slo?= =?UTF-8?q?t=E5=92=8Cderivative=E7=A7=BB=E8=87=B3=E4=B8=8A=E5=B1=82base=20?= =?UTF-8?q?*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modificatio?= =?UTF-8?q?n=E3=80=91create=5Ftable=E6=8E=A5=E5=8F=A3=E4=B8=8E=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E5=99=A8=E5=88=9B=E5=BB=BA=E8=A7=A3=E8=80=A6=20*=20Me?= =?UTF-8?q?rge=20remote-tracking=20branch=20'origin/develop-global-unique'?= =?UTF-8?q?=20into=20devel=E2=80=A6=20*=20Merge=20remote-tracking=20branch?= =?UTF-8?q?=20'upstream/develop'=20into=20develop-ddr-witho=E2=80=A6=20*?= =?UTF-8?q?=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr?= =?UTF-8?q?=20without=20optimizer=20for=20fp=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20withou?= =?UTF-8?q?t=20optimizer=20for=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20without=20optim?= =?UTF-8?q?izer=20for=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91ddr=20without=20optimizer=20fo?= =?UTF-8?q?r=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Mo?= =?UTF-8?q?dification=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20Merge=20rem?= =?UTF-8?q?ote-tracking=20branch=20'upstream/develop'=20into=20develop-ddr?= =?UTF-8?q?-witho=E2=80=A6=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91ddr=20without=20optimizer=20fo?= =?UTF-8?q?r=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Mo?= =?UTF-8?q?dification=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr?= =?UTF-8?q?=20without=20optimizer=20for=20fp=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20withou?= =?UTF-8?q?t=20optimizer=20for=20fp=20*=20Merge=20remote-tracking=20branch?= =?UTF-8?q?=20'upstream/develop'=20into=20develop-ddr-witho=E2=80=A6=20*?= =?UTF-8?q?=20Merge=20remote-tracking=20branch=20'origin/develop-ddr-witho?= =?UTF-8?q?ut-optimizer'=20in=E2=80=A6=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20without=20optim?= =?UTF-8?q?izer=20for=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91ddr=20without=20optimizer=20fo?= =?UTF-8?q?r=20fp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 2 - examples/demo/little_demo/main.py | 5 +- examples/demo/little_demo/run.sh | 2 +- .../little_demo_estimator/nn_model_build.py | 9 +- .../demo/little_demo_estimator/nn_optim.py | 12 -- examples/dlrm/model/gradient_descent_w.py | 8 -- examples/dlrm/model/main_mxrec.py | 2 - mx_rec/core/asc/build_graph.py | 58 ++------ mx_rec/core/asc/manager.py | 11 +- mx_rec/core/asc/swap_args.py | 57 ++++++++ mx_rec/core/emb/base_sparse_embedding.py | 61 ++++---- mx_rec/core/emb/dynamic_sparse_embedding.py | 35 ----- mx_rec/core/emb/sparse_embedding.py | 76 ---------- 
mx_rec/core/embedding.py | 6 +- mx_rec/graph/modifier.py | 131 +++++++++++++++++- mx_rec/graph/utils.py | 49 +++++++ mx_rec/optimizers/adagrad.py | 25 ---- mx_rec/optimizers/base.py | 6 - mx_rec/optimizers/emb_optimizer.py | 76 ---------- mx_rec/optimizers/ftrl.py | 25 ---- mx_rec/optimizers/gradient_descent.py | 3 - mx_rec/optimizers/gradient_descent_by_addr.py | 3 - mx_rec/optimizers/lazy_adam.py | 29 ---- mx_rec/util/variable.py | 7 +- tests/mx_rec/core/mock_class.py | 19 --- tests/mx_rec/core/test_build_graph.py | 88 ++---------- tests/mx_rec/core/test_embedding.py | 25 +--- tests/mx_rec/core/test_manager.py | 4 - tests/mx_rec/graph/test_modifier.py | 21 ++- tests/mx_rec/saver/sparse_embedding_mock.py | 7 - tests/mx_rec/saver/test_saver.py | 5 +- tests/mx_rec/util/test_variable.py | 11 +- tools/atomic/sparse_lookup_with_grad.py | 1 - 33 files changed, 318 insertions(+), 561 deletions(-) create mode 100644 mx_rec/core/asc/swap_args.py delete mode 100644 mx_rec/optimizers/emb_optimizer.py diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index 5e4efe02..eb1d91ea 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -294,7 +294,6 @@ if __name__ == "__main__": cfg.dev_vocab_size = cfg.dev_vocab_size // 2 optimizer_list = [get_dense_and_sparse_optimizer(cfg)] - sparse_optimizer_list = [sparse_optimizer for dense_optimizer, sparse_optimizer in optimizer_list] # note: variance_scaling_initializer only support HBM mode emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=SPARSE_HASHTABLE_SEED) \ @@ -305,7 +304,6 @@ if __name__ == "__main__": dim=tf.TensorShape([cfg.emb_dim]), name="sparse_embeddings", emb_initializer=emb_initializer, - optimizer_list=[sparse_optimizer_list[0]._optimizer], **cfg.get_emb_table_cfg() ) if use_faae: diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index 14b2e065..a6ef96fc 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -246,7 +246,6 @@ if __name__ == "__main__": eval_feature_spec_list = create_feature_spec_list(use_timestamp=USE_TIMESTAMP) optimizer_list = [create_dense_and_sparse_optimizer(cfg)] - sparse_optimizer_list = [sparse_optimizer for dense_optimizer, sparse_optimizer in optimizer_list] # 如需验证DDR模式,请按照key数量、batch unique数量合理设置device与host表大小。 # 验证DDR的配置参考:建议跑dynamic避免调参。数据集key总量大于device表,小于device+host;一个batch的unique key数量小于device表。 @@ -273,7 +272,6 @@ if __name__ == "__main__": dim=tf.TensorShape([cfg.user_hashtable_dim]), name='user_table', emb_initializer=emb_initializer, - optimizer_list=sparse_optimizer_list, all2all_gradients_op="sum_gradients_and_div_by_ranksize", **cache_mode_dict[cache_mode]) @@ -281,7 +279,6 @@ if __name__ == "__main__": dim=tf.TensorShape([cfg.item_hashtable_dim]), name='item_table', emb_initializer=emb_initializer, - optimizer_list=sparse_optimizer_list, **cache_mode_dict[cache_mode]) # 在predict的场景下,train model不需要被执行 @@ -300,7 +297,7 @@ if __name__ == "__main__": batch_number=MAX_DATASET_GENERATE * get_rank_size()) dense_variables, sparse_variables = get_dense_and_sparse_variable() - params = {"train_batch": train_batch, "eval_batch": eval_batch, "use_one_shot": USE_ONE_SHOT, + params = {"train_batch": train_batch, "eval_batch": eval_batch, "use_one_shot": USE_ONE_SHOT, "use_deterministic": USE_DETERMINISTIC} run_mode = RunMode( MODIFY_GRAPH_FLAG, USE_TIMESTAMP, table_list, optimizer_list, train_model, eval_model, train_iterator, diff --git a/examples/demo/little_demo/run.sh 
b/examples/demo/little_demo/run.sh index 9462a0cb..d585be02 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -106,7 +106,7 @@ export USE_DYNAMIC=1 # 0:静态shape;1:动态shape export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 export USE_MULTI_LOOKUP=1 # 0:一表一查;1:一表多查 export MULTI_LOOKUP_TIMES=2 # 一表多查次数:默认2,上限127(因为一表已经有一查);仅当export USE_MULTI_LOOKUP=1时生效 -export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 +export USE_MODIFY_GRAPH=1 # 0:feature spec模式;1:自动改图模式 export USE_TIMESTAMP=0 # 0:关闭特征准入淘汰;1:开启特征准入淘汰 export USE_ONE_SHOT=0 # 0:MakeIterator;1:OneShotIterator export UpdateEmb_V2=1 # 0: UpdateEmb同步更新;1:UpdateEmb_V2异步更新 diff --git a/examples/demo/little_demo_estimator/nn_model_build.py b/examples/demo/little_demo_estimator/nn_model_build.py index 11faadf1..aeeab8f8 100644 --- a/examples/demo/little_demo_estimator/nn_model_build.py +++ b/examples/demo/little_demo_estimator/nn_model_build.py @@ -21,7 +21,6 @@ from mx_rec.util.tf_version_adapter import npu_ops from mx_rec.core.embedding import create_table, sparse_lookup from mx_rec.constants.constants import ASCEND_TIMESTAMP -from nn_optim import get_dense_and_sparse_optimizer from utils import FeatureSpecIns @@ -137,22 +136,18 @@ class LittleModel: return logit_list def _get_embedding_list(self): - optimizer_list = [get_dense_and_sparse_optimizer(self.cfg)] - sparse_optimizer_list = [sparse_optimizer for dense_optimizer, sparse_optimizer in optimizer_list] user_hashtable = create_table(key_dtype=tf.int64, dim=tf.TensorShape([self.cfg.user_hashtable_dim]), name='user_table', emb_initializer=tf.compat.v1.truncated_normal_initializer(), device_vocabulary_size=self.cfg.user_vocab_size * 10, - host_vocabulary_size=self.cfg.user_vocab_size * 0, - optimizer_list=sparse_optimizer_list) + host_vocabulary_size=self.cfg.user_vocab_size * 0) item_hashtable = create_table(key_dtype=tf.int64, dim=tf.TensorShape([self.cfg.item_hashtable_dim]), name='item_table', emb_initializer=tf.compat.v1.truncated_normal_initializer(), device_vocabulary_size=self.cfg.item_vocab_size * 10, - host_vocabulary_size=self.cfg.item_vocab_size * 0, - optimizer_list=sparse_optimizer_list) + host_vocabulary_size=self.cfg.item_vocab_size * 0) if self.params.modify_graph: if not self.params.enable_slicer_test: diff --git a/examples/demo/little_demo_estimator/nn_optim.py b/examples/demo/little_demo_estimator/nn_optim.py index 4d519366..d07556a6 100644 --- a/examples/demo/little_demo_estimator/nn_optim.py +++ b/examples/demo/little_demo_estimator/nn_optim.py @@ -28,18 +28,6 @@ from mx_rec.optimizers.gradient_descent_by_addr import create_hash_optimizer_by_ from mx_rec.util.log import logger -def get_dense_and_sparse_optimizer(cfg): - dense_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=cfg.learning_rate) - if ConfigInitializer.get_instance().use_dynamic_expansion: - sparse_optimizer = create_hash_optimizer_by_addr(learning_rate=cfg.learning_rate) - logger.info("optimizer create_hash_optimizer_by_addr") - else: - sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate) - logger.info("optimizer create_hash_optimizer") - - return dense_optimizer, sparse_optimizer - - def get_train_op_list(losses, learning_rate): train_ops_list = [] update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) diff --git a/examples/dlrm/model/gradient_descent_w.py b/examples/dlrm/model/gradient_descent_w.py index a2a5635a..53adb996 100644 --- a/examples/dlrm/model/gradient_descent_w.py +++ 
b/examples/dlrm/model/gradient_descent_w.py @@ -50,14 +50,6 @@ class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOp self._slot_num = 0 self._derivative = 1 - def initialize_slots(self, var, table_instance): - logger.info("no slot for gradient descent") - return [] - - def insert_slot(self, slot, named_slots_key, slot_name): - logger.info("no slot for gradient descent") - return dict() - def get_slot_init_values(self): logger.info("no slot for gradient descent") return [] diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index 8c4cdd7e..3464f84e 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -298,7 +298,6 @@ if __name__ == "__main__": cfg.dev_vocab_size = cfg.dev_vocab_size // 2 optimizer_list = [get_dense_and_sparse_optimizer(cfg)] - sparse_optimizer_list = [sparse_optimizer for dense_optimizer, sparse_optimizer in optimizer_list] # note: variance_scaling_initializer only support HBM mode emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \ @@ -309,7 +308,6 @@ if __name__ == "__main__": dim=tf.TensorShape([cfg.emb_dim]), name="sparse_embeddings", emb_initializer=emb_initializer, - optimizer_list=[sparse_optimizer_list[0]._optimizer], **cfg.get_emb_table_cfg() ) if use_faae: diff --git a/mx_rec/core/asc/build_graph.py b/mx_rec/core/asc/build_graph.py index 2bb72621..82e40b29 100644 --- a/mx_rec/core/asc/build_graph.py +++ b/mx_rec/core/asc/build_graph.py @@ -23,6 +23,7 @@ import mxrec_pybind from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.tf_version_adapter import npu_ops from mx_rec.util.log import logger +from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType def get_restore_vector(config): @@ -38,7 +39,7 @@ def get_restore_vector(config): raise TypeError("ext_emb_size must be a int") if config.get("ext_emb_size") < 1: raise ValueError("ext_emb_size is less than 1") - emb_size = config.get("ext_emb_size") + emb_size = None if ConfigInitializer.get_instance().use_static: restore_size = config.get("batch_size") * config.get("feat_cnt") @@ -46,8 +47,7 @@ def get_restore_vector(config): restore_size = None with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): - device_id = int(config.get("device_id")) - hot_size = int(mxrec_pybind.get_ub_hot_size(device_id) / emb_size) + hot_size = None restore_vector, hot_pos = npu_ops.gen_npu_ops.get_next( output_types=[tf.int32, tf.int32], output_shapes=[restore_size, [hot_size]], @@ -103,49 +103,6 @@ def get_all2all_args(use_static: bool, config: dict) -> Optional[list]: return all2all_args -def get_swap_info(config: dict, swap_len: int, swap_pos: list, table: tf.Variable) -> list: - """ - Get swap info if threshold is configured. 
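A note on the get_restore_vector changes above: because the optimizer is now bound to a table only at apply-gradients/modify-graph time (see the later hunks in this patch), the extended row width and UB hot size cannot be resolved while the lookup graph is first built, so those shapes are left dynamic. A small sketch of the remaining shape contract, with illustrative numbers:

```python
def restore_vector_shape(use_static: bool, batch_size: int, feat_cnt: int):
    # Static graphs pin the restore vector to batch_size * feat_cnt entries;
    # dynamic graphs pass None and let the channel deliver variable lengths.
    return batch_size * feat_cnt if use_static else None

assert restore_vector_shape(True, 1024, 26) == 26624
assert restore_vector_shape(False, 1024, 26) is None
```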
- :param config: training job config - :param swap_len: swap length - :param swap_pos: swap position - :param table: the instance to do swap - :return: swap info - """ - use_static = ConfigInitializer.get_instance().use_static - max_lookup_vec_size = None - if use_static: - max_lookup_vec_size = config.get("send_count") * config.get("rank_size") - - if config.get("is_hbm"): - swap_in = [tf.no_op()] - else: - with tf.compat.v1.variable_scope("h2d_emb"): - logger.debug('Channel %s_h2d_%s was built for getnext', config.get("table_name"), config.get("channel_id")) - h2d_emb = npu_ops.gen_npu_ops.get_next( - output_types=[tf.float32], - output_shapes=[[max_lookup_vec_size, config.get("ext_emb_size")]], - channel_name=f'{config.get("table_name")}_h2d_{config.get("channel_id")}')[0] - logger.debug("h2d_emb shape: %s", h2d_emb) - if not isinstance(table, list): - raise RuntimeError("When enable emb_transfer, optimizer should have slots") - if use_static: - swap_pos = swap_pos[0:swap_len] - h2d_emb = h2d_emb[0:swap_len, :] - swap_outs = [tf.gather(one_table, swap_pos) for one_table in table] - swap_out = tf.concat(swap_outs, axis=1) - logger.debug('Channel %s_d2h_%s was built for op outfeed.', config.get("table_name"), config.get("channel_id")) - swap_out_op = npu_ops.outfeed_enqueue_op( - channel_name=f'{config.get("table_name")}_d2h_{config.get("channel_id")}', inputs=[swap_out]) - with tf.control_dependencies([swap_out_op]): - nd_swap_pos = tf.expand_dims(swap_pos, 1) - table_num = len(table) - h2d_emb_split = tf.split(h2d_emb, table_num, axis=1) - swap_in = [tf.compat.v1.scatter_nd_update(table[i], nd_swap_pos, h2d_emb_split[i]) - for i in range(len(table))] - return swap_in - - def get_preprocessed_tensor_for_asc(table, config): use_static = ConfigInitializer.get_instance().use_static max_lookup_vec_size = None @@ -158,15 +115,18 @@ def get_preprocessed_tensor_for_asc(table, config): with tf.compat.v1.variable_scope("id_offsets"): id_offsets, swap_pos, swap_len = get_id_offsets(max_lookup_vec_size, config) - all2all_args = get_all2all_args(use_static, config) + if not config.get("is_hbm"): + # 一表多查时,会多次进入get_preprocessed_tensor_for_asc,最后一次大查询替换map的key-value即可 + swap_args = SwapArgs() + swap_args.set_data(SwapDataType.CONFIG.value, var_name=config.get("table_name"), + var_channel=config.get("channel_id"), config=config, swap_pos=swap_pos, swap_len=swap_len) - swap_in = get_swap_info(config, swap_len, swap_pos, table) + all2all_args = get_all2all_args(use_static, config) result = { 'restore_vector': restore_vector, 'hot_pos': hot_pos, 'id_offsets': id_offsets, - 'swap_in': swap_in, 'all2all_args': all2all_args, } diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index 64611295..8b62b66b 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -37,16 +37,10 @@ def generate_table_info_list(): raise ValueError(f"The DDR mode of all tables must be used or not used at the same time. 
However, is_hbm " f"of each table `{table_instance_dict.keys()}` is `{is_hbm_list}`.") - optimizer = ConfigInitializer.get_instance().optimizer_config.optimizer_instance # generate table info dangling_table = check_dangling_table() for _, table_instance in ConfigInitializer.get_instance().sparse_embed_config.table_instance_dict.items(): - # When dynamic expansion mode, ext_emb_size is set by optimizer - if ConfigInitializer.get_instance().use_dynamic_expansion and optimizer: - table_instance.ext_emb_size = table_instance.emb_size * (1 + optimizer.slot_num) - logger.debug("ext_emb_size is reset to be %s for EmbInfo", table_instance.ext_emb_size) - skip = should_skip(table_instance.table_name) if table_instance.table_name in dangling_table or skip: logger.info("skip table %s: %s which does not need to be provided to the EmbInfo.", @@ -158,9 +152,8 @@ def matched_opt_slot_initializers(table_instance): slot_initializers.append(slot_initializer) start_index += table_instance.emb_size - logger.debug("matched_opt_slot_initializers, ext emb size:%s, optimizer_instance_list size:%s, " - "slot_initializers size:%s", table_instance.ext_emb_size, len(table_instance.optimizer_instance_list), - len(slot_initializers)) + logger.debug("matched_opt_slot_initializers, ext emb size:%s, slot_initializers size:%s", + table_instance.ext_emb_size, len(slot_initializers)) return slot_initializers diff --git a/mx_rec/core/asc/swap_args.py b/mx_rec/core/asc/swap_args.py new file mode 100644 index 00000000..4494cc26 --- /dev/null +++ b/mx_rec/core/asc/swap_args.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +import functools +from collections import defaultdict +from enum import Enum + + +class SwapDataType(Enum): + CONFIG = "config" + CONTROL = "control" + + +def singleton(cls): + _instance = {} + + def inner(): + if cls not in _instance: + _instance[cls] = cls() + return _instance[cls] + + return inner + + +@singleton +class SwapArgs: + def __init__(self): + self.swap_config_dict = defaultdict(dict) + self.swap_control_dict = defaultdict(dict) + + def set_data(self, data_type: str, **kwargs): + if "var_name" not in kwargs: + raise ValueError("Missing Required key: var_name") + if "var_channel" not in kwargs: + raise ValueError("Missing Required key: var_channel") + var_name = kwargs.pop("var_name") + var_channel = kwargs.pop("var_channel") + + if data_type == SwapDataType.CONFIG.value: + self.swap_config_dict[var_name][var_channel] = kwargs + elif data_type == SwapDataType.CONTROL.value: + self.swap_control_dict[var_name][var_channel] = kwargs + else: + raise ValueError(f"Error data type in swap args: {data_type}") diff --git a/mx_rec/core/emb/base_sparse_embedding.py b/mx_rec/core/emb/base_sparse_embedding.py index 07dc70f7..2a52b3a6 100644 --- a/mx_rec/core/emb/base_sparse_embedding.py +++ b/mx_rec/core/emb/base_sparse_embedding.py @@ -10,7 +10,9 @@ import tensorflow as tf from tensorflow.python.ops import array_ops from mx_rec.constants.constants import All2allGradientsOp, ASCEND_SPARSE_LOOKUP_ENTRANCE, ASCAnchorAttr +from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc from mx_rec.core.asc.feature_spec import set_temporary_feature_spec_attribute, get_feature_spec, FeatureSpec +from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType from mx_rec.util.communication.hccl_ops import get_rank_size, get_rank_id, get_device_id from mx_rec.util.tf_version_adapter import hccl_ops from mx_rec.util.initialize import ConfigInitializer @@ -81,14 +83,6 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): ConfigInitializer.get_instance().train_params_config.ascend_global_hashtable_collection, self._variable) self._set_ext_emb_size() - @property - def optimizer_instance_list(self): - return [] - - @property - def optimizer(self): - return dict() - @property def embedding_size(self): return self._embedding_size @@ -117,6 +111,10 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): def send_count(self): return self._send_count + @property + def rank_size(self): + return self._rank_size + @property def slice_device_vocabulary_size(self): return self._slice_device_vocabulary_size @@ -201,35 +199,11 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): """ pass - @abc.abstractmethod - def set_optimizer(self, key: str, state_dict: dict): - """ - 设置optimizer state. 
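The new swap_args.py above is a process-wide registry: lookup construction records per-table swap metadata keyed by (table name, channel id), and the graph modifier reads it back later. A hedged usage sketch, assuming the mx_rec package from this patch is importable; the table name and values are made up:

```python
from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType

args = SwapArgs()  # singleton: every call returns the same instance
args.set_data(SwapDataType.CONFIG.value,
              var_name="user_table", var_channel=0,
              swap_pos=[0, 3, 7], swap_len=3)

# Later, at modify-graph time, the same instance yields the recorded entry.
entry = SwapArgs().swap_config_dict["user_table"][0]
assert entry["swap_len"] == 3
```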
- - Args: - key: 优化器名字 - state_dict: optimizer state - - Returns: None - """ - pass @abc.abstractmethod def _set_slice_vocab_size(self): pass - @abc.abstractmethod - def _set_ext_emb_size(self): - pass - - @abc.abstractmethod - def _build_optimizer_states(self): - pass - - @abc.abstractmethod - def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, is_training: bool, send_count: Optional[int]) -> dict: - pass - @abc.abstractmethod def _get_update_grad(self, local_grad: tf.Tensor, result: dict, table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]: @@ -289,6 +263,19 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): """ self._multi_lookup_times[is_training] = self._multi_lookup_times.get(is_training) + 1 + def _set_ext_emb_size(self): + # 初始设置_ext_emb_size等于_emb_size,改图阶段会根据优化器的不同而exchange该值 + self._ext_emb_size = self._emb_size * self._ext_coefficient + logger.debug("Init table, ext_emb_size is set to be %s.", self._ext_emb_size) + + def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, channel_id: int, send_count: Optional[int]) -> dict: + config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, + rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, + is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, emb_size=self._emb_size, + use_dynamic_expansion=ConfigInitializer.get_instance().use_dynamic_expansion) + + return get_preprocessed_tensor_for_asc(self._variable, config) + def lookup(self, ids: tf.Tensor, send_count: Optional[int], **kwargs) -> tf.Tensor: """ 稀疏表的lookup,自动改图模式. @@ -409,7 +396,8 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): check_emb_lookup_params(hashtable_params, feature_spec, send_count, is_training) if ConfigInitializer.get_instance().use_static: self._send_count = send_count - result = self._get_preprocessed_tensor(feature_spec, is_training, send_count) + channel_id = ConfigInitializer.get_instance().train_params_config.get_training_mode_channel_id(is_training) + result = self._get_preprocessed_tensor(feature_spec, channel_id, send_count) @tf.custom_gradient def sparse_forward(table): @@ -469,7 +457,11 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): return array_ops.reshape(embeddings, dest_shape), grad - with tf.control_dependencies(result.get("swap_in")): + ddr_control_ops = tf.no_op(name="place_holder_swap_op") + swap_args = SwapArgs() + swap_args.set_data(SwapDataType.CONTROL.value, var_name=self._table_name, var_channel=channel_id, + control_ops=ddr_control_ops) + with tf.control_dependencies([ddr_control_ops]): return self._get_sparse_forward_result(sparse_forward, self._variable, result, is_training) def __initialize_variables(self): @@ -481,7 +473,6 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): ConfigInitializer.get_instance().sparse_embed_config.insert_removing_var_list(self._variable.name) self.__record() - self._build_optimizer_states() def __record(self, eval_flag=False): ConfigInitializer.get_instance().sparse_embed_config.insert_table_instance( diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py index 671c593e..49979261 100644 --- a/mx_rec/core/emb/dynamic_sparse_embedding.py +++ b/mx_rec/core/emb/dynamic_sparse_embedding.py @@ -10,7 +10,6 @@ import tensorflow as tf from mx_rec.constants.constants import ASCEND_TABLE_NAME_MUST_CONTAIN, ASCEND_SPARSE_LOOKUP_LOCAL_EMB, \ ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.feature_spec import FeatureSpec -from 
mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger @@ -28,26 +27,10 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): def capacity(self) -> int: return ConfigInitializer.get_instance().hybrid_manager_config.asc_manager.get_table_capacity(self._table_name) - @abc.abstractmethod - def set_optimizer(self, key: str, state_dict: dict): - pass - - @abc.abstractmethod - def _build_optimizer_states(self): - pass - - @abc.abstractmethod - def _set_ext_emb_size(self): - pass - @abc.abstractmethod def _set_slice_vocab_size(self): pass - @abc.abstractmethod - def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, is_training: bool, send_count: Optional[int]) -> dict: - pass - def _get_update_grad(self, local_grad: tf.Tensor, result: dict, table: Union[tf.compat.v1.Variable, tf.Tensor]) -> Union[tf.IndexedSlices, tf.Tensor]: return local_grad @@ -81,25 +64,7 @@ class HBMDynamicSparseEmbedding(DynamicSparseEmbedding): def __init__(self, config: dict): super(DynamicSparseEmbedding, self).__init__(config) - def set_optimizer(self, key: str, state_dict: dict): - pass - - def _build_optimizer_states(self): - pass - - def _set_ext_emb_size(self): - self._ext_emb_size = self._emb_size * self._ext_coefficient - logger.debug("init table, ext_emb_size is set to be %s.", self._ext_emb_size) - def _set_slice_vocab_size(self): # 动态扩容模式下,保留device侧variable,大小设置为1 self._slice_device_vocabulary_size = 1 - def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, is_training: bool, send_count: Optional[int]) -> dict: - channel_id = ConfigInitializer.get_instance().train_params_config.get_training_mode_channel_id(is_training) - config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, - rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, - is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, - emb_size=self._emb_size, device_id=self._device_id, use_dynamic_expansion=True) - - return get_preprocessed_tensor_for_asc(self._variable, config) diff --git a/mx_rec/core/emb/sparse_embedding.py b/mx_rec/core/emb/sparse_embedding.py index 938f917d..071f4506 100644 --- a/mx_rec/core/emb/sparse_embedding.py +++ b/mx_rec/core/emb/sparse_embedding.py @@ -11,10 +11,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from mx_rec.core.asc.feature_spec import FeatureSpec -from mx_rec.core.asc.build_graph import get_preprocessed_tensor_for_asc from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding -from mx_rec.optimizers.emb_optimizer import EmbOptimizer -from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger @@ -30,22 +27,6 @@ class SparseEmbedding(BaseSparseEmbedding): def capacity(self) -> int: pass - @abc.abstractmethod - def set_optimizer(self, key: str, state_dict: dict): - pass - - @abc.abstractmethod - def _set_ext_emb_size(self): - pass - - @abc.abstractmethod - def _build_optimizer_states(self): - pass - - @abc.abstractmethod - def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, is_training: bool, send_count: Optional[int]) -> dict: - pass - def _set_slice_vocab_size(self): self._slice_device_vocabulary_size = math.ceil(self._device_vocabulary_size / self._rank_size) self._slice_host_vocabulary_size = math.ceil(self._host_vocabulary_size / self._rank_size) @@ -84,25 +65,6 
@@ class HBMSparseEmbedding(SparseEmbedding): def capacity(self) -> int: return self._device_vocabulary_size - def set_optimizer(self, key: str, state_dict: dict): - pass - - def _build_optimizer_states(self): - pass - - def _set_ext_emb_size(self): - self._ext_emb_size = self._emb_size * self._ext_coefficient - logger.debug("Init table, ext_emb_size is set to be %s.", self._ext_emb_size) - - def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, is_training: bool, send_count: Optional[int]) -> dict: - channel_id = ConfigInitializer.get_instance().train_params_config.get_training_mode_channel_id(is_training) - config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, - rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, - is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, - emb_size=self._emb_size, device_id=self._device_id) - - return get_preprocessed_tensor_for_asc(self._variable, config) - class ExternalStorageSparseEmbedding(SparseEmbedding): """ @@ -110,19 +72,8 @@ class ExternalStorageSparseEmbedding(SparseEmbedding): """ def __init__(self, config: dict): - self.emb_optimizer = EmbOptimizer(config.get("optimizer_list")) - self.emb_optimizer.check_optimizer_instance_list() - super(ExternalStorageSparseEmbedding, self).__init__(config) - @property - def optimizer(self): - return self.emb_optimizer.optimizer - - @property - def optimizer_instance_list(self): - return self.emb_optimizer.optimizer_instance_list - def capacity(self) -> int: # DDR if not self._ssd_vocabulary_size: @@ -130,33 +81,6 @@ class ExternalStorageSparseEmbedding(SparseEmbedding): # SSD return self._device_vocabulary_size + self._host_vocabulary_size + self._ssd_vocabulary_size - def set_optimizer(self, key: str, state_dict: dict): - self.emb_optimizer.set_optimizer(key, state_dict, self._table_name) - - def _set_ext_emb_size(self): - self._ext_coefficient += len(self.emb_optimizer.optimizer_slot_info_list) - self._ext_emb_size = self._emb_size * self._ext_coefficient - logger.debug("Init table, ext_emb_size is set to be %s.", self._ext_emb_size) - - def _build_optimizer_states(self): - for sparse_optimizer_instance in self.emb_optimizer.optimizer_instance_list: - slot_info_list = sparse_optimizer_instance.initialize_slots(self._variable, self) - self.emb_optimizer.optimizer_slot_info_list.extend(slot_info_list) - - for slot_info in self.emb_optimizer.optimizer_slot_info_list: - self.emb_optimizer.set_optimizer_slot(slot_info) - - def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, is_training: bool, send_count: Optional[int]) -> dict: - channel_id = ConfigInitializer.get_instance().train_params_config.get_training_mode_channel_id(is_training) - config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, - rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, - is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, - emb_size=self._emb_size, device_id=self._device_id) - - variable_list = [self._variable] + \ - [slot_info.get("slot") for slot_info in self.emb_optimizer.optimizer_slot_info_list] - return get_preprocessed_tensor_for_asc(variable_list, config) - def _set_specific_value_for_non_valid_key(id_offsets: Optional[tf.Tensor], embeddings: Optional[tf.Tensor], diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index f90efcf6..16f19d04 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -43,7 +43,6 @@ from 
mx_rec.util.log import logger ("dim", NumValidator, {"min_value": 1, "max_value": 8192}, ["check_value"]), ("name", StringValidator, {"min_len": 1, "max_len": 100}, ["check_string_length", "check_whitelist"]), ("emb_initializer", ClassValidator, {"classes": (InitializerV1, InitializerV2)}), - ("optimizer_list", ClassValidator, {"classes": (list, type(None))}), (["ssd_vocabulary_size", "ssd_data_path", "host_vocabulary_size"], SSDFeatureValidator), ("device_vocabulary_size", IntValidator, {"min_value": 1, "max_value": MAX_DEVICE_VOCABULARY_SIZE}, ["check_value"]), @@ -59,7 +58,6 @@ from mx_rec.util.log import logger ("hashtable_threshold", IntValidator, {"min_value": 0, "max_value": MAX_INT32}, ["check_value"]) ]) def create_table(key_dtype, dim, name, emb_initializer, - optimizer_list: Optional[list] = None, device_vocabulary_size=1, host_vocabulary_size=0, ssd_vocabulary_size=0, @@ -77,7 +75,6 @@ def create_table(key_dtype, dim, name, emb_initializer, dim: embedding vector size name: hash table name emb_initializer: the initializer for embedding values - optimizer_list: specify the optimizers to use for current hash table device_vocabulary_size: embedding vector numbers on device host_vocabulary_size: embedding vector numbers on ddr ssd_vocabulary_size: embedding vector numbers on ssd @@ -95,8 +92,7 @@ def create_table(key_dtype, dim, name, emb_initializer, config = dict(key_dtype=key_dtype, embedding_size=dim, table_name=name, emb_initializer=emb_initializer, device_vocabulary_size=device_vocabulary_size, host_vocabulary_size=host_vocabulary_size, ssd_vocabulary_size=ssd_vocabulary_size, ssd_data_path=ssd_data_path, - optimizer_list=optimizer_list, init_param=init_param, is_save=is_save, - all2all_gradients_op=all2all_gradients_op) + init_param=init_param, is_save=is_save, all2all_gradients_op=all2all_gradients_op) # 动态扩容 if ConfigInitializer.get_instance().use_dynamic_expansion: return HBMDynamicSparseEmbeddingFactory().create_embedding(config) diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index e0b4bdeb..72772c5f 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -31,15 +31,17 @@ from mx_rec.constants.constants import ASCEND_CUTTING_POINT_INITIALIZER, ASCEND_ from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.helper import get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline +from mx_rec.core.asc.swap_args import SwapArgs from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.graph.merge_lookup import do_merge_lookup -from mx_rec.graph.utils import check_input_list, find_parent_op, check_cutting_points, \ -record_ops_to_replace, export_pb_graph, make_sorted_key_to_tensor_list +from mx_rec.graph.utils import check_input_list, find_parent_op, check_cutting_points, record_ops_to_replace, \ + export_pb_graph, make_sorted_key_to_tensor_list, replace_anchor_control from mx_rec.graph.constants import DeprecatedOp, AnchorDatasetOp, AnchorIteratorOp from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger from mx_rec.util.ops import import_host_pipeline_ops from mx_rec.util.perf import performance +from mx_rec.util.tf_version_adapter import hccl_ops, npu_ops from mx_rec.validator.validator import para_checker_decorator, ClassValidator @@ -381,6 +383,14 @@ def get_dataset_tensor_count(dataset: DatasetV1Adapter) -> int: return len(src_sorted_keys) +def change_ext_emb_size_by_opt(optimizer): + for _, table_instance in 
ConfigInitializer.get_instance().sparse_embed_config.table_instance_dict.items(): + # When dynamic expansion mode, ext_emb_size is set by optimizer + if ConfigInitializer.get_instance().use_dynamic_expansion or not table_instance.is_hbm: + table_instance.ext_emb_size = table_instance.emb_size * (1 + optimizer.slot_num) + logger.debug("ext_emb_size is reset to be %s for EmbInfo", table_instance.ext_emb_size) + + @para_checker_decorator( check_option_list=[("dump_graph", ClassValidator, {"classes": (bool,)})] ) @@ -457,6 +467,9 @@ def get_src_dataset(get_next_op: Operation, is_training: bool) -> DatasetV1Adapt elif is_training and len(dataset_op_list) == 2: prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) target_op = prefetch_dataset_op_list[0] + elif not is_training and len(dataset_op_list) == 2: + prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) + target_op = prefetch_dataset_op_list[1] elif not is_training and len(dataset_op_list) == 3: prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) target_op = prefetch_dataset_op_list[1] @@ -567,6 +580,118 @@ def update_iterator_getnext(get_next_op: Operation, update_input_tensor_with_new_batch(record.replacement_spec, new_get_next_op_name, new_batch) +def get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: list, swap_len: int, swap_pos: list, + channel_id: int) -> list: + """ + Get swap info if threshold is configured. + :param table_instance: BaseSparseEmbedding + :param variable_and_slot_list: [var + slots] + :param swap_len: swap length + :param swap_pos: swap position + :param channel_id: train or predict + :return: swap info + """ + use_static = ConfigInitializer.get_instance().use_static + max_lookup_vec_size = None + if use_static: + max_lookup_vec_size = table_instance.send_count * table_instance.rank_size + + if table_instance.is_hbm: + swap_in = [tf.no_op()] + else: + with tf.compat.v1.variable_scope("h2d_emb"): + logger.debug('Channel %s_h2d_%s was built for getnext', table_instance.table_name, channel_id) + h2d_emb = npu_ops.gen_npu_ops.get_next( + output_types=[tf.float32], + output_shapes=[[max_lookup_vec_size, table_instance.ext_emb_size]], + channel_name=f'{table_instance.table_name}_h2d_{channel_id}')[0] + logger.debug("h2d_emb shape: %s", h2d_emb) + if not isinstance(variable_and_slot_list, list): + raise RuntimeError("When enable emb_transfer, optimizer should have slots") + if use_static: + swap_pos = swap_pos[0:swap_len] + h2d_emb = h2d_emb[0:swap_len, :] + swap_outs = [tf.gather(one_table, swap_pos) for one_table in variable_and_slot_list] + swap_out = tf.concat(swap_outs, axis=1) + logger.debug('Channel %s_d2h_%s was built for op outfeed.', table_instance.table_name, channel_id) + swap_out_op = npu_ops.outfeed_enqueue_op( + channel_name=f'{table_instance.table_name}_d2h_{channel_id}', inputs=[swap_out]) + with tf.control_dependencies([swap_out_op]): + nd_swap_pos = tf.expand_dims(swap_pos, 1) + table_num = len(variable_and_slot_list) + h2d_emb_split = tf.split(h2d_emb, table_num, axis=1) + optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name( + table_instance.table_name) + if optimizer is None and channel_id == 1: + swap_in = [tf.compat.v1.scatter_nd_update(variable_and_slot_list[0], nd_swap_pos, h2d_emb_split[0])] + else: + swap_in = [tf.compat.v1.scatter_nd_update(variable_and_slot_list[i], nd_swap_pos, h2d_emb_split[i]) + for i in range(len(variable_and_slot_list))] + return swap_in + + 
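get_swap_info now lives in the graph modifier: the lookup only emits a placeholder tf.no_op and registers it through SwapArgs, and modify_graph_for_ddr (defined just below) later rewires every control edge that pointed at the placeholder onto the real swap-in ops via replace_anchor_control. A condensed sketch of that two-phase handshake in graph mode; the table name and channel id are illustrative:

```python
import tensorflow as tf
from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType

with tf.Graph().as_default():
    # Phase 1 (lookup build): a no-op stands in for the future swap ops.
    placeholder = tf.no_op(name="place_holder_swap_op")
    SwapArgs().set_data(SwapDataType.CONTROL.value,
                        var_name="user_table", var_channel=0,
                        control_ops=placeholder)

    # Phase 2 (modify_graph_for_ddr): once the optimizer and its slot count
    # are known, the recorded placeholder is fetched and its control edges
    # are rewired onto the real swap-in ops.
    recorded = SwapArgs().swap_control_dict["user_table"][0]["control_ops"]
    assert recorded is placeholder
```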
+def get_variable_and_slot_list(each_var, slot_num, table_name, channel_id): + variable_and_slot_list = [each_var] + if slot_num == 0: + return variable_and_slot_list + + # 通过apply_gradients创建optimizer + optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(table_name) + if optimizer is None and channel_id == 0: + raise RuntimeError("In training mode, table_instance should have been set_optimizer_for_table " + "before modify_graph, please check whether apply_gradients is performed") + + # predict不需要传优化器,但是如果客户创建了优化器,ddr模式加载的是维度ext_size的emb用作换入换出,所以需要给slot零值占位 + if optimizer is None and channel_id == 1: + slot_place_holder = tf.zeros_like(each_var) + for i in range(slot_num): + variable_and_slot_list.append(slot_place_holder) + else: + # opt name to slot dict + for slot_dict in optimizer.values(): + for slot_val in slot_dict.values(): + variable_and_slot_list.append(slot_val) + + return variable_and_slot_list + + +def modify_graph_for_ddr(get_next_op_map): + # 通过create_hash_optimizer创建optimizer_instance + optimizer_instance = ConfigInitializer.get_instance().optimizer_config.optimizer_instance + # predict + if optimizer_instance is None: + slot_num = 0 + else: + # ddr和扩容需要在获取优化器后重置ext + change_ext_emb_size_by_opt(optimizer_instance) + slot_num = optimizer_instance.slot_num + + for _, record in get_next_op_map.items(): + is_training = record.is_training + channel_id = 0 if is_training else 1 + + swap_args = SwapArgs() + sparse_variables = tf.compat.v1.get_collection( + ConfigInitializer.get_instance().train_params_config.ascend_global_hashtable_collection) + + for each_var in sparse_variables: + table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(each_var) + if table_instance.is_hbm: + continue + swap_args_dict = swap_args.swap_config_dict[table_instance.table_name][channel_id] + swap_pos = swap_args_dict['swap_pos'] + swap_len = swap_args_dict['swap_len'] + variable_and_slot_list = get_variable_and_slot_list(each_var, slot_num, table_instance.table_name, + channel_id) + + swap_op = get_swap_info(table_instance, variable_and_slot_list, swap_len, swap_pos, channel_id) + swap_control_dict = swap_args.swap_control_dict[table_instance.table_name][channel_id] + if "control_ops" not in swap_control_dict: + raise ValueError("Missing Required key in modify_graph_for_asc: control_ops") + control_ops = swap_control_dict['control_ops'] + replace_anchor_control(control_ops, swap_op) + + @performance("graph_modifier") def modify_graph_for_asc(dump_graph: bool = False, prefetch: int = 10): cutting_point_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE) @@ -612,6 +737,8 @@ def modify_graph_for_asc(dump_graph: bool = False, prefetch: int = 10): if is_training and not ConfigInitializer.get_instance().train_params_config.get_merged_multi_lookup(True): raise RuntimeError("In training mode, `do_merge_lookup` should have been executed in compute gradients " "phase. 
Please check whether compute gradients is performed.") + # ddr + modify_graph_for_ddr(get_next_op_map) logger.info("Graph has been revised.") export_pb_graph("new_graph.pb", dump_graph) diff --git a/mx_rec/graph/utils.py b/mx_rec/graph/utils.py index 8ffc8bc6..ca328ae3 100644 --- a/mx_rec/graph/utils.py +++ b/mx_rec/graph/utils.py @@ -23,11 +23,13 @@ import tensorflow as tf from tensorflow import Operation, Tensor from tensorflow.core.framework.graph_pb2 import GraphDef from tensorflow.python.framework.errors_impl import InvalidArgumentError +from tensorflow.python.ops import control_flow_ops from mx_rec.graph.slicers import OrphanLookupKeySlicer from mx_rec.graph.constants import AnchorIteratorOp from mx_rec.constants.constants import ASCAnchorAttr, DUMP_MIDIFY_GRAPH_FILE_MODE from mx_rec.core.embedding import BaseSparseEmbedding +from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType from mx_rec.util.log import logger @@ -90,6 +92,32 @@ def replace_anchor(replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operat f"new tensor: {new_tensor_list[tensor_idx]}.") from err +def record_control_to_replace(src_op: Operation) -> DefaultDict[Tensor, List[Tuple[int, Operation]]]: + replacement_specs = defaultdict(list) + op_list = tf.compat.v1.get_default_graph().get_operations() + for operator in op_list: + if src_op in operator.control_inputs: + input_index = operator.control_inputs.index(src_op) + replacement_specs[src_op].append((input_index, operator)) + + return replacement_specs + + +def replace_control_anchor(replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], + new_tensor_list: List[Tensor]): + + for tensor_idx, (old_tensor, items) in enumerate(replacement_specs.items()): + for _, operator in items: + try: + control_op = control_flow_ops.group(new_tensor_list) + operator._add_control_input(control_op) + except InvalidArgumentError as err: + logger.info("The replacement control specs keys (old batch) is: %s. \n\t\t The new_tensor_list is: %s.", + replacement_specs.keys(), new_tensor_list) + raise RuntimeError(f"Cannot update edge, old tensor: {old_tensor}, " + f"new tensor: {new_tensor_list[tensor_idx]}.") from err + + def export_pb_graph(file_name: str, dump_graph: bool = False, graph_def: GraphDef = None, @@ -165,6 +193,27 @@ def replace_anchor_vec(cutting_point: Tensor, attribute: ASCAnchorAttr, anchor: replace_anchor(replacement_specs_for_anchor_vec, [anchor]) +def replace_anchor_control(place_holder_control: tf.Operation, real_anchor: Tensor): + """ + 将place_holder_control替换为入参real_anchor. + + Args: + place_holder_control: control op + real_anchor: 用来替换打桩节点的tensor + + Returns: None + + """ + + if place_holder_control is None: + raise RuntimeError(f"Node place_holder_control does not exist. 
Check whether the sparse lookup interface " + f"is correctly invoked.") + # find the op with stub node as the input + replacement_specs_for_anchor_vec = record_control_to_replace(place_holder_control) + # replace anchor_vec with anchor + replace_control_anchor(replacement_specs_for_anchor_vec, real_anchor) + + def mark_orphan_lookup_key(lookup_key: Tensor) -> Tensor: graph_def = tf.compat.v1.get_default_graph().as_graph_def() subgraph = tf.compat.v1.graph_util.extract_sub_graph(graph_def, [lookup_key.op.name]) diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py index 125346b9..9998ec1f 100644 --- a/mx_rec/optimizers/adagrad.py +++ b/mx_rec/optimizers/adagrad.py @@ -25,7 +25,6 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import adagrad, training_ops -from tensorflow.python.training import slot_creator from mx_rec.optimizers.base import CustomizedOptimizer from mx_rec.util.initialize import ConfigInitializer @@ -80,30 +79,6 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): self._slot_num = 1 self._derivative = 2 - def initialize_slots(self, var, table_instance): - # Create slots for the first and second moments. - def creat_one_single_slot(var, op_name): - new_slot_variable = slot_creator.create_zeros_slot(var, op_name) - # make sure sparse optimizer statements will not be saved and restored within tf checkpoint. - return new_slot_variable - - accumulator = creat_one_single_slot(var, self._name + "/" + "accumulator") - ConfigInitializer.get_instance().sparse_embed_config.insert_removing_var_list(accumulator.name) - named_slot_key = (var.op.graph, var.op.name) - table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) - ConfigInitializer.get_instance().optimizer_config.set_optimizer_for_table(table_instance.table_name, - self.optimizer_type, - {"accumulator": accumulator}) - return [{"slot": accumulator, "named_slot_key": named_slot_key, "slot_name": "acc", "optimizer": self}] - - def insert_slot(self, slot, named_slots_key, slot_name): - named_slots = self._slot_dict(slot_name) - if named_slots_key in named_slots: - raise EnvironmentError(f"named_slots_key should be global unique, but it has been in use now, " - f"please double check.") - - named_slots[named_slots_key] = slot - def get_slot_init_values(self): # return state value list of adagrad that needs to initialize in ASC DDR. 
initial_accumulator_value = 0.0 diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index 696406f8..fbc63193 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -122,12 +122,6 @@ class CustomizedOptimizer: array_ops.shape(unique_keys)[0]) return unique_local_grad, unique_keys - def initialize_slots(self, var, table_instance): - raise NotImplementedError(f"Please define a specific realization on {self.__class__.__name__}") - - def insert_slot(self, slot, named_slots_key, slot_name): - raise NotImplementedError(f"Please define a specific realization on {self.__class__.__name__}") - def get_slot_init_values(self): raise NotImplementedError(f"Please define a specific realization on {self.__class__.__name__}") diff --git a/mx_rec/optimizers/emb_optimizer.py b/mx_rec/optimizers/emb_optimizer.py deleted file mode 100644 index 9e6a80e1..00000000 --- a/mx_rec/optimizers/emb_optimizer.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved. - -from mx_rec.optimizers.base import CustomizedOptimizer -from mx_rec.util.tf_version_adapter import NPULossScaleOptimizer - - -class EmbOptimizer: - """ - 稀疏表的优化器. - """ - - def __init__(self, optimizer_list): - self._optimizer_instance_list = optimizer_list - self._optimizer_slot_info_list = [] - self._optimizer = dict() - - @property - def optimizer_instance_list(self): - return self._optimizer_instance_list - - @property - def optimizer_slot_info_list(self): - return self._optimizer_slot_info_list - - @property - def optimizer(self): - return self._optimizer - - @staticmethod - def set_optimizer_slot(slot_info: dict): - """ - 设置稀疏表优化器的slot信息. - - Args: - slot_info: 优化器slot信息 - - Returns: None - """ - slot = slot_info.get("slot") - slot_name = slot_info.get("slot_name") - optimizer = slot_info.get("optimizer") - named_slot_key = slot_info.get("named_slot_key") - - optimizer.insert_slot(slot, named_slot_key, slot_name) - - def set_optimizer(self, key: str, state_dict: dict, table_name: str): - """ - 设置optimizer state. - - Args: - key: 优化器名字 - state_dict: optimizer state - table_name: 稀疏表名 - - Returns: None - """ - if key in self._optimizer: - raise ValueError(f"optimizer {key} has been set for hash table {table_name}.") - self._optimizer[key] = state_dict - - def check_optimizer_instance_list(self): - """ - 校验优化器实例列表. 
- """ - if not self._optimizer_instance_list: - raise ValueError("External storage mode should config optimizers before instantiating sparse table, " - "but nothing was configured.") - - for optimizer_instance in self._optimizer_instance_list: - if isinstance(optimizer_instance, NPULossScaleOptimizer): - optimizer_instance = getattr(optimizer_instance, '_opt') - - if not isinstance(optimizer_instance, CustomizedOptimizer): - raise TypeError("the optimizer instance must be an instance of CustomizedOptimizer.") diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py index ef617c2d..30287abd 100644 --- a/mx_rec/optimizers/ftrl.py +++ b/mx_rec/optimizers/ftrl.py @@ -29,7 +29,6 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import gen_state_ops from tensorflow.python.training import ftrl -from tensorflow.python.training import slot_creator from mx_rec.optimizers.base import CustomizedOptimizer from mx_rec.util.initialize import ConfigInitializer @@ -82,30 +81,6 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): self._slot_num = 2 self._derivative = 2 - def initialize_slots(self, var, table_instance): - val = constant_op.constant( - self._initial_accumulator_value, dtype=var.dtype, shape=var.get_shape()) - - accum = slot_creator.create_slot(var, val, self._name + "/" + "accum") - linear = slot_creator.create_zeros_slot(var, self._name + "/" + "linear") - ConfigInitializer.get_instance().sparse_embed_config.insert_removing_var_list(accum.name) - ConfigInitializer.get_instance().sparse_embed_config.insert_removing_var_list(linear.name) - named_slot_key = (var.op.graph, var.op.name) - table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) - ConfigInitializer.get_instance().optimizer_config.set_optimizer_for_table(table_instance.table_name, - self.optimizer_type, - {"accum": accum, "linear": linear}) - return [{"slot": accum, "named_slot_key": named_slot_key, "slot_name": "accum", "optimizer": self}, - {"slot": linear, "named_slot_key": named_slot_key, "slot_name": "linear", "optimizer": self}] - - def insert_slot(self, slot, named_slots_key, slot_name): - named_slots = self._slot_dict(slot_name) - if named_slots_key in named_slots: - raise EnvironmentError(f"named_slots_key should be global unique, but it has been in use now, " - f"please double check.") - - named_slots[named_slots_key] = slot - def get_slot_init_values(self): # return state value list of ftrl that needs to initialize in ASC DDR. 
initial_linear_value = 0.0 diff --git a/mx_rec/optimizers/gradient_descent.py b/mx_rec/optimizers/gradient_descent.py index d021f69f..89d67d89 100644 --- a/mx_rec/optimizers/gradient_descent.py +++ b/mx_rec/optimizers/gradient_descent.py @@ -57,9 +57,6 @@ class CustomizedGradientDescent(gradient_descent.GradientDescentOptimizer, Custo self._slot_num = 0 self._derivative = 1 - def initialize_slots(self, var, table_instance): - return [] - def get_slot_init_values(self): return [] diff --git a/mx_rec/optimizers/gradient_descent_by_addr.py b/mx_rec/optimizers/gradient_descent_by_addr.py index 9db7c2ae..8cf9257e 100644 --- a/mx_rec/optimizers/gradient_descent_by_addr.py +++ b/mx_rec/optimizers/gradient_descent_by_addr.py @@ -62,9 +62,6 @@ class CustomizedGradientDescentByAddr(gradient_descent.GradientDescentOptimizer, self._slot_num = 0 self._derivative = 1 - def initialize_slots(self, var, table_instance): - return [] - def get_slot_init_values(self): return [] diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 875f350f..9aee0204 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -28,7 +28,6 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_state_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import adam -from tensorflow.python.training import slot_creator from mx_rec.optimizers.base import CustomizedOptimizer from mx_rec.util.initialize import ConfigInitializer @@ -84,34 +83,6 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): self._slot_num = 2 self._derivative = 2 - def initialize_slots(self, var, table_instance): - # Create slots for the first and second moments. - def creat_one_single_slot(var, op_name): - new_slot_variable = slot_creator.create_zeros_slot(var, op_name) - # make sure sparse optimizer statements will not be saved and restored within tf checkpoint. - return new_slot_variable - - momentum = creat_one_single_slot(var, self._name + "/" + "momentum") - velocity = creat_one_single_slot(var, self._name + "/" + "velocity") - self.config_instance.sparse_embed_config.insert_removing_var_list(momentum.name) - self.config_instance.sparse_embed_config.insert_removing_var_list(velocity.name) - named_slot_key = (var.op.graph, var.op.name) - table_instance = self.config_instance.sparse_embed_config.get_table_instance(var) - ConfigInitializer.get_instance().optimizer_config.set_optimizer_for_table(table_instance.table_name, - self.optimizer_type, - {"momentum": momentum, - "velocity": velocity}) - return [{"slot": momentum, "named_slot_key": named_slot_key, "slot_name": "m", "optimizer": self}, - {"slot": velocity, "named_slot_key": named_slot_key, "slot_name": "v", "optimizer": self}] - - def insert_slot(self, slot, named_slots_key, slot_name): - named_slots = self._slot_dict(slot_name) - if named_slots_key in named_slots: - raise EnvironmentError(f"named_slots_key should be global unique, but it has been in use now, " - f"please double check.") - - named_slots[named_slots_key] = slot - def get_slot_init_values(self): # return state value list of adam that needs to initialize in ASC DDR. 
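# LazyAdam reports two zero-initialized slots (_slot_num = 2): the first
# moment "momentum" (m) and the second moment "velocity" (v), presumably in
# that order. Storing both moments next to each embedding row is consistent
# with the ext_emb_size = 24 used for an 8-dim table in the unit tests later
# in this series, i.e. emb_size * (1 + _slot_num).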
initial_momentum_value = 0.0 diff --git a/mx_rec/util/variable.py b/mx_rec/util/variable.py index 2c9f49a9..0040e2b5 100644 --- a/mx_rec/util/variable.py +++ b/mx_rec/util/variable.py @@ -27,11 +27,6 @@ def get_dense_and_sparse_variable(): return dense_variables, sparse_variables -def check_and_get_config_via_var(variable, optimizer_type: str): +def get_config_via_var(variable): table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(variable) - - if not table_instance.is_hbm and not table_instance.optimizer: - raise EnvironmentError(f"When ASC with DDR, you must pass the '{optimizer_type}' optimizer instances to the" - f" init method of SparseEmbedding.") - return table_instance diff --git a/tests/mx_rec/core/mock_class.py b/tests/mx_rec/core/mock_class.py index 6fb2ef38..e02f6257 100644 --- a/tests/mx_rec/core/mock_class.py +++ b/tests/mx_rec/core/mock_class.py @@ -20,8 +20,6 @@ import tensorflow as tf from tensorflow_core.python.training import slot_creator from mx_rec import ASCEND_GLOBAL_HASHTABLE_COLLECTION -from mx_rec.optimizers.lazy_adam import CustomizedLazyAdam -from mx_rec.util.config_utils.embedding_utils import SparseEmbedConfig from mx_rec.util.config_utils.feature_spec_utils import FeatureSpecConfig from mx_rec.util.config_utils.optimizer_utils import OptimizerConfig @@ -209,23 +207,6 @@ class MockOptimizer: self.slot_num = 2 self.derivative = 2 - def initialize_slots(self, var, table_instance): - # Create slots for the first and second moments. - def creat_one_single_slot(var, op_name): - new_slot_variable = slot_creator.create_zeros_slot(var, op_name) - return new_slot_variable - - momentum = creat_one_single_slot(var, self._name + "/" + "momentum") - velocity = creat_one_single_slot(var, self._name + "/" + "velocity") - named_slot_key = (var.op.graph, var.op.name) - - table_instance.set_optimizer(self._name, {"momentum": momentum, "velocity": velocity}) - return [{"slot": momentum, "named_slot_key": named_slot_key, "slot_name": "m", "optimizer": self}, - {"slot": velocity, "named_slot_key": named_slot_key, "slot_name": "v", "optimizer": self}] - - def insert_slot(self, slot, named_slots_key, slot_name): - pass - def get_slot_init_values(self): initial_momentum_value = 0.0 initial_velocity_value = 0.0 diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index dd17afec..c5766179 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -32,14 +32,12 @@ class TestGetRestoreVectorFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) def test_get_restore_vector_case1(self): """ @@ -114,15 +112,13 @@ class TestGetIdOffsetsFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - 
use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") def test_get_id_offsets_case1(self, mock_get_next): @@ -164,14 +160,12 @@ class TestGetAll2allArgsFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) def test_get_all2all_args_case1(self): """ @@ -198,60 +192,6 @@ class TestGetAll2allArgsFunc(unittest.TestCase): self.assertEqual(all2all_args, 0) -class TestGetSwapInfoFunc(unittest.TestCase): - """ - Test for 'mx_rec.core.asc.build_graph.get_swap_info'. - """ - - def setUp(self): - # 默认动态扩容、hot emb、HBM - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) - - def tearDown(self): - # 恢复config - self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) - - @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") - def test_get_swap_info_case1(self, build_graph_config_initializer): - """ - case1: 静态shape,HBM - """ - - from mx_rec.core.asc.build_graph import get_swap_info - - with tf.Graph().as_default(): - mock_config_initializer = MockConfigInitializer(use_static=True) - build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) - - swap_in = get_swap_info(self.config, None, None, None) - self.assertIsInstance(swap_in[0], type(tf.no_op())) - - @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") - @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") - def test_get_swap_info_case2(self, mock_get_next, build_graph_config_initializer): - """ - case2: 静态shape,非HBM,table传入非list,抛出异常 - """ - - from mx_rec.core.asc.build_graph import get_swap_info - - with tf.Graph().as_default(): - mock_config_initializer = MockConfigInitializer(use_static=True) - build_graph_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) - - mock_get_next.return_value = tf.ones(shape=[8, 8], dtype=tf.float32) - swap_pos = tf.constant([8, 9], dtype=tf.int32) - swap_len = tf.constant(2, dtype=tf.int32) - table = tf.compat.v1.get_variable("test_table", shape=[10, 8], initializer=tf.ones_initializer()) - self.config["is_hbm"] = False - with self.assertRaises(RuntimeError): - get_swap_info(self.config, swap_len, swap_pos, table) - - class 
TestGetPreProcessedTensorForAscFunc(unittest.TestCase): """ Test for 'mx_rec.core.asc.build_graph.get_preprocessed_tensor_for_asc'. @@ -260,21 +200,17 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, - use_dynamic_expansion=True) - global_env.apply_gradients_strategy = "direct_apply" + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), - get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0)) + get_all2all_args=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case1(self, build_graph_config_initializer): """ @@ -293,8 +229,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), - get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0)) + get_all2all_args=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case2(self, build_graph_config_initializer): """ @@ -313,8 +248,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), - get_all2all_args=mock.MagicMock(return_value=0), - get_swap_info=mock.MagicMock(return_value=0)) + get_all2all_args=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case3(self, build_graph_config_initializer): """ diff --git a/tests/mx_rec/core/test_embedding.py b/tests/mx_rec/core/test_embedding.py index bf7d9240..5bc762f4 100644 --- a/tests/mx_rec/core/test_embedding.py +++ b/tests/mx_rec/core/test_embedding.py @@ -120,8 +120,7 @@ class TestCreateTableFunc(unittest.TestCase): dim=8, name='test_table', emb_initializer=tf.compat.v1.truncated_normal_initializer(), - host_vocabulary_size=8, - optimizer_list=[create_hash_optimizer(learning_rate=0.01)]) + host_vocabulary_size=8) self.assertIsInstance(test_table, ExternalStorageSparseEmbedding) @@ -134,12 +133,11 @@ class TestSparseLookupFunc(unittest.TestCase): get_rank_size=mock.MagicMock(return_value=8), get_rank_id=mock.MagicMock(return_value=0), get_device_id=mock.MagicMock(return_value=0)) - @mock.patch("mx_rec.core.emb.sparse_embedding.get_preprocessed_tensor_for_asc") + @mock.patch("mx_rec.core.emb.base_sparse_embedding.get_preprocessed_tensor_for_asc") @mock.patch("mx_rec.core.embedding.ConfigInitializer") @mock.patch("mx_rec.core.emb.base_sparse_embedding.ConfigInitializer") 
@mock.patch("mx_rec.validator.emb_validator.ConfigInitializer") - @mock.patch("mx_rec.core.emb.sparse_embedding.ConfigInitializer") - def test_sparse_lookup_case1(self, embedding_config_initializer, base_sparse_embedding_config_initializer, + def test_sparse_lookup_case1(self, base_sparse_embedding_config_initializer, emb_validator_config_initializer, sparse_embedding_config_initializer, mock_get_preprocessed_tensor_for_asc): """ @@ -154,7 +152,6 @@ class TestSparseLookupFunc(unittest.TestCase): # mock mock_config_initializer = MockConfigInitializer(use_dynamic_expansion=False) - embedding_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) base_sparse_embedding_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) emb_validator_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) sparse_embedding_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) @@ -166,12 +163,9 @@ class TestSparseLookupFunc(unittest.TestCase): batch = {"case1_feat": tf.ones(shape=[8, 8], dtype=tf.int64)} mock_get_preprocessed_tensor_for_asc.return_value = { "restore_vector": tf.ones(shape=[8, 8], dtype=tf.int64), - "restore_vector_second": tf.ones(shape=[8, ], dtype=tf.int64), - "unique_keys": tf.ones(shape=[8, ], dtype=tf.int64), "hot_pos": tf.ones(shape=[8, ], dtype=tf.int64), "id_offsets": tf.ones(shape=[8, ], dtype=tf.int64), - "all2all_args": tf.ones(shape=[8, 8], dtype=tf.int64), - "swap_in": [tf.no_op()] + "all2all_args": tf.ones(shape=[8, 8], dtype=tf.int64) } # test @@ -190,12 +184,11 @@ class TestSparseLookupFunc(unittest.TestCase): get_rank_id=mock.MagicMock(return_value=0), get_device_id=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.feature_spec.ConfigInitializer") - @mock.patch("mx_rec.core.emb.sparse_embedding.get_preprocessed_tensor_for_asc") + @mock.patch("mx_rec.core.emb.base_sparse_embedding.get_preprocessed_tensor_for_asc") @mock.patch("mx_rec.core.embedding.ConfigInitializer") @mock.patch("mx_rec.core.emb.base_sparse_embedding.ConfigInitializer") @mock.patch("mx_rec.validator.emb_validator.ConfigInitializer") - @mock.patch("mx_rec.core.emb.sparse_embedding.ConfigInitializer") - def test_sparse_lookup_case2(self, embedding_config_initializer, base_sparse_embedding_config_initializer, + def test_sparse_lookup_case2(self, base_sparse_embedding_config_initializer, emb_validator_config_initializer, sparse_embedding_config_initializer, mock_get_preprocessed_tensor_for_asc, feature_spec_config_initializer): """ @@ -210,7 +203,6 @@ class TestSparseLookupFunc(unittest.TestCase): # mock mock_config_initializer = MockConfigInitializer(use_dynamic_expansion=False) - embedding_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) base_sparse_embedding_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) emb_validator_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) sparse_embedding_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) @@ -219,12 +211,9 @@ class TestSparseLookupFunc(unittest.TestCase): case2_feat = tf.ones(shape=[8, 8], dtype=tf.int64) mock_get_preprocessed_tensor_for_asc.return_value = { "restore_vector": tf.ones(shape=[8, 8], dtype=tf.int64), - "restore_vector_second": tf.ones(shape=[8, ], dtype=tf.int64), - "unique_keys": tf.ones(shape=[8, ], dtype=tf.int64), "hot_pos": tf.ones(shape=[8, ], dtype=tf.int64), "id_offsets": tf.ones(shape=[8, ], 
dtype=tf.int64), - "all2all_args": tf.ones(shape=[8, 8], dtype=tf.int64), - "swap_in": [tf.no_op()] + "all2all_args": tf.ones(shape=[8, 8], dtype=tf.int64) } # test diff --git a/tests/mx_rec/core/test_manager.py b/tests/mx_rec/core/test_manager.py index b08a6a6f..70c2f150 100644 --- a/tests/mx_rec/core/test_manager.py +++ b/tests/mx_rec/core/test_manager.py @@ -74,7 +74,6 @@ class TestGenerateTableInfoListFunc(unittest.TestCase): mock_opt = MockOptimizer() manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt - test_table.optimizer_instance_list = [mock_opt] table_info_list = generate_table_info_list() self.assertListEqual(table_info_list, []) @@ -100,7 +99,6 @@ class TestGenerateTableInfoListFunc(unittest.TestCase): mock_opt = MockOptimizer() manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt - test_table.optimizer_instance_list = [mock_opt] table_info_list = generate_table_info_list() self.assertListEqual(table_info_list, []) @@ -139,7 +137,6 @@ class TestGenerateTableInfoListFunc(unittest.TestCase): mock_opt = MockOptimizer() manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt - test_table.optimizer_instance_list = [mock_opt] table_info_list = generate_table_info_list() self.assertListEqual(table_info_list, ["test_table_info"]) @@ -338,7 +335,6 @@ class TestMatchedOptSlotInitializersFunc(unittest.TestCase): table_instance.ext_emb_size = 24 mock_opt = MockOptimizer() manager_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt - table_instance.optimizer_instance_list = [mock_opt] slot_initializers = matched_opt_slot_initializers(table_instance) self.assertListEqual(slot_initializers, ["slot_initializer", "slot_initializer"]) diff --git a/tests/mx_rec/graph/test_modifier.py b/tests/mx_rec/graph/test_modifier.py index 2a9af10d..ff9a6664 100644 --- a/tests/mx_rec/graph/test_modifier.py +++ b/tests/mx_rec/graph/test_modifier.py @@ -18,7 +18,7 @@ import unittest from collections import defaultdict from unittest import TestCase -from unittest.mock import patch, Mock +from unittest.mock import patch, Mock, MagicMock from typing import Union, Callable import tensorflow as tf @@ -47,7 +47,7 @@ from mx_rec.graph.modifier import ( get_timestamp_index, modify_graph_for_asc, ) -from tests.mx_rec.core.mock_class import MockConfigInitializer +from tests.mx_rec.core.mock_class import MockConfigInitializer, MockSparseEmbedding, MockOptimizer from tests.mx_rec.graph.mock_dataset import gen_mock_dataset @@ -257,6 +257,9 @@ class ModifyGraphForAscTest(TestCase): get_asc_insert_func=Mock(return_value=lambda x, y: x), ) @patch.multiple("mx_rec.graph.modifier.BaseSparseEmbedding", get_anchor_attribute=_gen_mock_get_anchor_attribute()) + @patch.multiple("mx_rec.core.asc.manager", + should_skip=MagicMock(return_value=True), + check_dangling_table=MagicMock(return_value=["test_table"])) @patch("mx_rec.graph.modifier.ConfigInitializer") def test_ok_train_mode(self, modifier_config_initializer): mock_config_initializer = MockConfigInitializer(modify_graph=True, merged_multi_lookup=True) @@ -268,6 +271,13 @@ class ModifyGraphForAscTest(TestCase): mock_ids = mock_batch.get("mock_ids") mock_cutting_point = tf.identity(mock_ids) + test_table = MockSparseEmbedding("test_table") + test_table.is_hbm = True + mock_config_initializer.get_instance().sparse_embed_config.table_instance_dict = dict(test_table=test_table) + + mock_opt = MockOptimizer() + 
modifier_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, mock_cutting_point) modify_graph_for_asc() @@ -293,6 +303,13 @@ class ModifyGraphForAscTest(TestCase): mock_ids = mock_batch.get("mock_ids") mock_cutting_point = tf.identity(mock_ids) + test_table = MockSparseEmbedding("test_table") + test_table.is_hbm = True + mock_config_initializer.get_instance().sparse_embed_config.table_instance_dict = dict(test_table=test_table) + + mock_opt = MockOptimizer() + modifier_config_initializer.get_instance().optimizer_config.optimizer_instance = mock_opt + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, mock_cutting_point) modify_graph_for_asc() diff --git a/tests/mx_rec/saver/sparse_embedding_mock.py b/tests/mx_rec/saver/sparse_embedding_mock.py index 83507e63..7f7d437d 100644 --- a/tests/mx_rec/saver/sparse_embedding_mock.py +++ b/tests/mx_rec/saver/sparse_embedding_mock.py @@ -29,11 +29,4 @@ class SparseEmbeddingMock: self.emb_size = 4 self.is_hbm = host_vocab_size == 0 self.host_vocabulary_size = host_vocab_size - self.optimizer = dict() self.use_dynamic_expansion = False - - def set_optimizer(self, key, state_dict): - if key in self.optimizer: - raise ValueError(f"optimizer {key} has been set for hash table {self.table_name}") - - self.optimizer[key] = state_dict diff --git a/tests/mx_rec/saver/test_saver.py b/tests/mx_rec/saver/test_saver.py index 60c40a21..c0436a72 100644 --- a/tests/mx_rec/saver/test_saver.py +++ b/tests/mx_rec/saver/test_saver.py @@ -23,6 +23,7 @@ import tensorflow as tf from mx_rec.saver.saver import Saver from mx_rec.constants.constants import ASCEND_GLOBAL_HASHTABLE_COLLECTION +from mx_rec.util.initialize import ConfigInitializer from tests.mx_rec.core.mock_class import MockConfigInitializer from tests.mx_rec.saver.sparse_embedding_mock import SparseEmbeddingMock @@ -40,8 +41,7 @@ class TestSaver(unittest.TestCase): @mock.patch.multiple("mx_rec.saver.saver", get_rank_id=mock.MagicMock(return_value=0), - get_local_rank_size=mock.MagicMock(return_value=1), - set_optimizer_info=mock.MagicMock(return_value=None)) + get_local_rank_size=mock.MagicMock(return_value=1)) @mock.patch("mx_rec.saver.saver.ConfigInitializer") def test_save_and_load_is_consistent(self, saver_config_initializer): mock_config_initializer = \ @@ -86,7 +86,6 @@ class TestSaver(unittest.TestCase): optim_v_tensor = emb_initializer(self.shape) self.optimizer_v = tf.compat.v1.get_variable(self.optim_v_name, trainable=False, initializer=optim_v_tensor) - table_instance.set_optimizer("LazyAdam", {"momentum": self.optimizer_m, "velocity": self.optimizer_v}) tf.compat.v1.add_to_collection(ASCEND_GLOBAL_HASHTABLE_COLLECTION, self.var) return self.graph diff --git a/tests/mx_rec/util/test_variable.py b/tests/mx_rec/util/test_variable.py index f8cd2725..a3370e84 100644 --- a/tests/mx_rec/util/test_variable.py +++ b/tests/mx_rec/util/test_variable.py @@ -21,7 +21,7 @@ from unittest.mock import patch import tensorflow as tf from mx_rec.util.global_env_conf import global_env -from mx_rec.util.variable import check_and_get_config_via_var +from mx_rec.util.variable import get_config_via_var from mx_rec.util.variable import get_dense_and_sparse_variable from tests.mx_rec.core.mock_class import MockConfigInitializer @@ -29,7 +29,6 @@ from tests.mx_rec.core.mock_class import MockConfigInitializer class MockTableInstance: def __init__(self): self.is_hbm = False - self.optimizer = False @patch.multiple( @@ -73,14 
+72,6 @@ class VariableTest(unittest.TestCase): self.assertTrue(result_run) tf.reset_default_graph() - @mock.patch("mx_rec.util.variable.ConfigInitializer") - def test_check_and_get_config_via_var_when_environment_error(self, variable_config_initializer): - mock_config_initializer = MockConfigInitializer(var=MockTableInstance()) - variable_config_initializer.get_instance = mock.Mock(return_value=mock_config_initializer) - - with self.assertRaises(EnvironmentError): - self.assertEqual(MockTableInstance(), check_and_get_config_via_var("1", "optimize")) - if __name__ == '__main__': unittest.main() diff --git a/tools/atomic/sparse_lookup_with_grad.py b/tools/atomic/sparse_lookup_with_grad.py index 26633abe..ea80bce3 100644 --- a/tools/atomic/sparse_lookup_with_grad.py +++ b/tools/atomic/sparse_lookup_with_grad.py @@ -203,7 +203,6 @@ if __name__ == '__main__': emb_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=0), device_vocabulary_size=dev_vocab_size * local_rank_size, - optimizer_list=sparse_optimizer_list, mode=MxRecMode.mapping("ASC")) sparse_variables = tf.compat.v1.get_collection(get_ascend_global_hashtable_collection()) -- Gitee From d2676a117aaba5fed51519e5999c50e951978456 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 7 May 2024 21:53:54 +0800 Subject: [PATCH 120/302] Merge remote-tracking branch 'origin/hdfs_dev_dts' into hdfs_dev_dts --- src/core/file_system/hdfs_file_system/hdfs_file_system.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp index 715107d3..704a89b5 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp @@ -267,7 +267,7 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em if (res != embedSizeInfo.embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); - throw runtime_error(StringFormat( + throw runtime_error( StringFormat("Error: Expected to read {} bytes, but actually read {} bytes from file {}.", embedSizeInfo.embeddingSize * sizeof(float), res, filePath.c_str())); } -- Gitee From cfd97d0f41f6cd1e21164c3187e4cd713f619d13 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 7 May 2024 22:11:02 +0800 Subject: [PATCH 121/302] Merge remote-tracking branch 'origin/hdfs_dev_dts' into hdfs_dev_dts --- src/core/emb_table/embedding_ddr.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index ca48230b..2a8f1548 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -450,7 +450,6 @@ void EmbeddingDDR::SaveKey(const string& savePath) { } } - hostKey.insert(hostKey.end(), deviceKey.begin(), deviceKey.end()); size_t writeSize = static_cast(hostKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(hostKey.data()), writeSize); if (res == -1) { @@ -462,6 +461,20 @@ void EmbeddingDDR::SaveKey(const string& savePath) { "Error: Save keys failed. 
Expected to write {} bytes, but actually write {} bytes to file {}.", writeSize, res, ss.str())); } + + writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); + res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); + if (res == -1) { + throw runtime_error( + StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); + } + if (res != writeSize) { + throw runtime_error(StringFormat( + "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", + writeSize, res, ss.str())); + } + + } void EmbeddingDDR::SaveEmbData(const string& savePath) -- Gitee From 1c468f72a5acf5c4e4bd814f6c27d7c41b81a7f8 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Wed, 8 May 2024 11:31:46 +0800 Subject: [PATCH 122/302] =?UTF-8?q?=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90rea?= =?UTF-8?q?dme=E5=92=8Crun=E8=84=9A=E6=9C=AC=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 2 +- cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh | 8 ++++---- cust_op/fused_lazy_adam/run.sh | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index c42d1bfe..994a7153 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -24,7 +24,7 @@ C算子开发手册[Ascend C算子开发](https://www.hiascend.com/document/deta bash run.sh ``` -注:需先环境中设置CANN相关环境变量,再执行算子编译和安装指令。使用默认路径安装CANN时设置环境变量指令如下: +注:需先在环境中设置CANN相关环境变量,再执行算子编译和安装指令。使用默认路径安装CANN时设置环境变量指令如下: ```shell source /usr/local/Ascend/ascend-toolkit/set_env.sh diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh index 3d4af97c..37b00b42 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh @@ -59,9 +59,9 @@ export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64 function main { # 1. 清除遗留生成文件和日志文件 - rm -rf $HOME/ascend/log/* - rm ./input/*.bin - rm ./output/*.bin + rm -rf $HOME/ascend/log/* > /dev/null 2>&1 + rm ./input/*.bin > /dev/null 2>&1 + rm ./output/*.bin > /dev/null 2>&1 # 2. 生成输入数据和真值数据 cd $CURRENT_DIR @@ -76,7 +76,7 @@ function main { cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build cmake ../src if [ $? -ne 0 ]; then - echo "ERROR: cmake failed!" + echo "ERROR: cmake f ailed!" return 1 fi echo "INFO: cmake success!" diff --git a/cust_op/fused_lazy_adam/run.sh b/cust_op/fused_lazy_adam/run.sh index ff604cea..63bf7af4 100644 --- a/cust_op/fused_lazy_adam/run.sh +++ b/cust_op/fused_lazy_adam/run.sh @@ -14,6 +14,8 @@ # limitations under the License. 
# ============================================================================== +set -e + source /etc/profile # 查找msopgen的路径,加入到环境变量PATH中 -- Gitee From 56be32ba3fe81b86a73b3cc181986af9bd9c6ecd Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Wed, 8 May 2024 10:16:07 +0800 Subject: [PATCH 123/302] =?UTF-8?q?hdfs=E4=B8=AD=E7=9A=84read=E3=80=81writ?= =?UTF-8?q?e=E5=87=BD=E6=95=B0=E5=8A=A0=E5=9B=BA=EF=BC=8C=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E5=8A=A0=E8=BD=BD=E4=BF=9D=E5=AD=98=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 15 ++-- src/core/checkpoint/checkpoint.cpp | 22 +++-- src/core/emb_table/embedding_ddr.cpp | 84 +++++++++---------- src/core/emb_table/embedding_dynamic.cpp | 26 +++--- src/core/emb_table/embedding_static.cpp | 25 +++--- .../hdfs_file_system/hdfs_file_system.cpp | 12 +-- .../hdfs_file_system/hdfs_wrapper.h | 4 + .../file_system/hdfs_file_system_test.cpp | 22 ----- 8 files changed, 92 insertions(+), 118 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index e2e58340..d6c1d9e4 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -395,7 +395,7 @@ def save_embedding_data(root_dir, table_name, dump_data_dict, suffix): attribute = dict() attribute[DataAttr.DATATYPE.value] = data_to_write.dtype.name attribute[DataAttr.SHAPE.value] = data_to_write.shape - write_binary_data(target_path, suffix, data_to_write, attributes=attribute) + write_binary_data(target_path, suffix, data_to_write) def save_feature_mapping_data(root_dir, table_name, dump_data_dict, suffix): @@ -407,7 +407,7 @@ def save_feature_mapping_data(root_dir, table_name, dump_data_dict, suffix): attribute = dict() attribute[DataAttr.DATATYPE.value] = data_to_write.dtype.name attribute[DataName.THRESHOLD.value] = int(dump_data_dict.get(DataName.THRESHOLD.value)) - write_binary_data(target_path, suffix, data_to_write, attributes=attribute) + write_binary_data(target_path, suffix, data_to_write) def save_offset_data(root_dir, table_name, dump_data_dict, suffix): @@ -418,7 +418,7 @@ def save_offset_data(root_dir, table_name, dump_data_dict, suffix): attribute = dict() attribute[DataAttr.DATATYPE.value] = data_to_write.dtype.name - write_binary_data(target_path, suffix, data_to_write, attributes=attribute) + write_binary_data(target_path, suffix, data_to_write) def save_optimizer_state_data(root_dir, table_name, optimizer_name, dump_optimizer_data, suffix): @@ -429,7 +429,7 @@ def save_optimizer_state_data(root_dir, table_name, optimizer_name, dump_optimiz attribute = dict() attribute[DataAttr.DATATYPE.value] = data_to_write.dtype.name attribute[DataAttr.SHAPE.value] = data_to_write.shape - write_binary_data(target_path, suffix, data_to_write, attributes=attribute) + write_binary_data(target_path, suffix, data_to_write) def generate_path(*args): @@ -440,7 +440,7 @@ def generate_file_name(suffix): return "slice_%d.data" % suffix, "slice_%d.attribute" % suffix -def write_binary_data(writing_path, suffix, data, attributes=None): +def write_binary_data(writing_path: str, suffix: int, data: np.ndarray): try: tf.io.gfile.makedirs(writing_path) except Exception as err: @@ -474,7 +474,10 @@ def read_binary_data(reading_path: str, data_name: str, table_name: str, load_of with tf.io.gfile.GFile(target_attribute_dir, "rb") as fin: validate_read_file(target_attribute_dir) attributes = fin.read() - attributes = np.fromstring(attributes, dtype=np.int64) + try: + attributes = np.fromstring(attributes, 
dtype=np.int64) + except ValueError as err: + raise RuntimeError(f"Failed to parse attributes from file {target_attribute_dir}.") from err with tf.io.gfile.GFile(target_data_dir, "rb") as file: validate_read_file(target_data_dir) diff --git a/src/core/checkpoint/checkpoint.cpp b/src/core/checkpoint/checkpoint.cpp index bbb1fd6c..0fc03feb 100644 --- a/src/core/checkpoint/checkpoint.cpp +++ b/src/core/checkpoint/checkpoint.cpp @@ -210,14 +210,13 @@ void Checkpoint::WriteStream(CkptTransData& transData, const string& dataDir, si } if (writeBytesNum == -1) { - throw runtime_error( - StringFormat("Error: Save data failed. data type: {} .An error occurred while writing file: {}.", - dataType, dataDir)); + throw runtime_error(StringFormat("Error: Save data failed. data type: {}. " + "An error occurred while writing file: {}.", dataType, dataDir)); } if (writeBytesNum != dataSize) { - throw runtime_error(StringFormat( - "Error: Save data failed. data type: {} .Expected to write {} bytes, but actually write {} bytes to file {}.", - dataType, dataSize, writeBytesNum, dataDir)); + throw runtime_error(StringFormat("Error: Save data failed. data type: {}. " + "Expected to write {} bytes, but actually write {} bytes to file {}.", + dataType, dataSize, writeBytesNum, dataDir)); } } @@ -336,14 +335,13 @@ void Checkpoint::ReadStream(CkptTransData& transData, } if (readBytesNum == -1) { - throw runtime_error( - StringFormat("Error: Load data failed. data type: {} .An error occurred while reading file: {}.", - dataType, dataDir)); + throw runtime_error(StringFormat("Error: Load data failed. data type: {}. " + "An error occurred while reading file: {}.", dataType, dataDir)); } if (readBytesNum != datasetSize) { - throw runtime_error(StringFormat( - "Error: Load data failed. data type: {} .Expected to read {} bytes, but actually read {} bytes to file {}.", - dataType, datasetSize, readBytesNum, dataDir)); + throw runtime_error(StringFormat("Error: Load data failed. data type: {}. " + "Expected to read {} bytes, but actually read {} bytes from file {}.", + dataType, datasetSize, readBytesNum, dataDir)); } } diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 2a8f1548..3d2b77e7 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -305,17 +305,20 @@ void EmbeddingDDR::SetStartCount() freeSize_ = devVocabSize; } -void EmbeddingDDR::Load(const string& savePath) { +void EmbeddingDDR::Load(const string& savePath) +{ LoadKey(savePath); LoadEmbAndOptim(savePath); } -void EmbeddingDDR::Save(const string& savePath) { +void EmbeddingDDR::Save(const string& savePath) +{ SaveKey(savePath); SaveEmbAndOptim(savePath); } -void EmbeddingDDR::LoadKey(const string& savePath) { +void EmbeddingDDR::LoadKey(const string& savePath) +{ stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; @@ -324,26 +327,25 @@ void EmbeddingDDR::LoadKey(const string& savePath) { size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); + throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error( - StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); + throw runtime_error(StringFormat("Error: Load keys failed. 
" + "failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { free(static_cast(buf)); - throw runtime_error( - StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. " + "An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { free(static_cast(buf)); - throw runtime_error(StringFormat( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, - res, ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. Expected to read {} bytes, " + "but actually read {} bytes to file {}.", fileSize, res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -358,9 +360,9 @@ void EmbeddingDDR::LoadKey(const string& savePath) { } if (keyCount > devVocabSize + hostVocabSize) { free(static_cast(buf)); - throw runtime_error(StringFormat( - "Error: Load keys failed. Load key size :{} exceeds the sum of device vocab size and host vocab size: {}.", - keyCount, devVocabSize + hostVocabSize)); + throw runtime_error(StringFormat("Error: Load keys failed. Load key size :{} , " + "exceeds the sum of device vocab size and host vocab size: {}.", + keyCount, devVocabSize + hostVocabSize)); } else if (keyCount < devVocabSize) { loadOffset.push_back(i); devOffset2Key[keyCount] = buf[i]; @@ -400,9 +402,8 @@ void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) embedStream.str())); } if (res != readSize) { - throw runtime_error(StringFormat( - "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - readSize, res, embedStream.str())); + throw runtime_error(StringFormat("Error: Load embeddings failed. Expected to read {} bytes, " + "but actually read {} bytes to file {}.", readSize, res, embedStream.str())); } // 读optim @@ -417,16 +418,16 @@ void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) paramStream.str())); } if (res != readSize) { - throw runtime_error(StringFormat( - "Error: Load embeddings failed. Expected to read {} bytes, but actually read {} bytes to file {}.", - readSize, res, paramStream.str())); + throw runtime_error(StringFormat("Error: Load embeddings failed. Expected to read {} bytes, " + "but actually read {} bytes to file {}.", + readSize, res, paramStream.str())); } optimIndex++; } } - -void EmbeddingDDR::SaveKey(const string& savePath) { +void EmbeddingDDR::SaveKey(const string& savePath) +{ stringstream ss; ss << savePath << "/" << name << "/key/"; MakeDir(ss.str()); @@ -453,28 +454,24 @@ void EmbeddingDDR::SaveKey(const string& savePath) { size_t writeSize = static_cast(hostKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(hostKey.data()), writeSize); if (res == -1) { - throw runtime_error( - StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. " + "An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error(StringFormat( - "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. 
Expected to write {} bytes, " + "but actually write {} bytes to file {}.", writeSize, res, ss.str())); } writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - throw runtime_error( - StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. " + "An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error(StringFormat( - "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. Expected to write {} bytes, " + "but actually write {} bytes to file {}.", writeSize, res, ss.str())); } - - } void EmbeddingDDR::SaveEmbData(const string& savePath) @@ -490,13 +487,12 @@ void EmbeddingDDR::SaveEmbData(const string& savePath) size_t writeSize = embSize_ * sizeof(float) * embContent.size(); ssize_t res = fileSystemPtr->Write(ss.str(), embContent, embSize_ * sizeof(float)); if (res == -1) { - throw runtime_error( - StringFormat("Error: Save embeddings failed. An error occurred while writing file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Save embeddings failed. " + "An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error(StringFormat( - "Error: Save embeddings failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save embeddings failed. Expected to write {} bytes, " + "but actually write {} bytes to file {}.", writeSize, res, ss.str())); } } @@ -513,15 +509,13 @@ void EmbeddingDDR::SaveOptimData(const string& savePath) size_t writeSize = embSize_ * sizeof(float) * content.second.size(); ssize_t res = fileSystemPtr->Write(ss.str(), content.second, embSize_ * sizeof(float)); - if (res == -1) { - throw runtime_error( - StringFormat("Error: Save optimizers failed. An error occurred while writing file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Save optimizers failed. " + "An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error(StringFormat( - "Error: Save optimizers failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save optimizers failed. Expected to write {} bytes, " + "but actually write {} bytes to file {}.", writeSize, res, ss.str())); } } } diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index 706f399e..bca77178 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -153,13 +153,12 @@ void EmbeddingDynamic::SaveKey(const string& savePath) size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - throw runtime_error( - StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. " + "An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error(StringFormat( - "Error: Save keys failed. 
Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. Expected to write {} bytes, " + "but actually write {} bytes to file {}.", writeSize, res, ss.str())); } } @@ -261,19 +260,18 @@ void EmbeddingDynamic::LoadKey(const string& savePath) int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error( - StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); + throw runtime_error(StringFormat("Error: Load keys failed. " + "failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { - throw runtime_error( - StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. " + "An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { - throw runtime_error(StringFormat( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, - res, ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. Expected to read {} bytes, " + "but actually read {} bytes to file {}.", fileSize, res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -291,8 +289,8 @@ void EmbeddingDynamic::LoadKey(const string& savePath) void *newBlock = nullptr; aclError ret = aclrtMalloc(&newBlock, static_cast(datasetSize), ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { - throw runtime_error( - StringFormat("Error: in dynamic expansion mode, aclrtMalloc failed, malloc size: {}.", datasetSize)); + throw runtime_error(StringFormat("Error: in dynamic expansion mode, " + "aclrtMalloc failed, malloc size: {}.", datasetSize)); } // 此处的 newBlock -> first address; // 对key_offset map 进行一个恢复操作 diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index f80f076a..312b8a77 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -97,13 +97,12 @@ void EmbeddingStatic::SaveKey(const string& savePath) size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { - throw runtime_error( - StringFormat("Error: Save keys failed. An error occurred while writing file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. " + "An error occurred while writing file: {}.", ss.str())); } if (res != writeSize) { - throw runtime_error(StringFormat( - "Error: Save keys failed. Expected to write {} bytes, but actually write {} bytes to file {}.", - writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. Expected to write {} bytes, " + "but actually write {} bytes to file {}.", writeSize, res, ss.str())); } } @@ -112,7 +111,7 @@ void EmbeddingStatic::Load(const string& savePath) LoadKey(savePath); } -void EmbeddingStatic::LoadKey(const string &savePath) +void EmbeddingStatic::LoadKey(const string& savePath) { stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; @@ -125,20 +124,20 @@ void EmbeddingStatic::LoadKey(const string &savePath) throw runtime_error(StringFormat("Error: Load keys failed. 
file {} size {} is too big.", ss.str(), fileSize)); } - int64_t* buf = static_cast(malloc(fileSize)); + int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error(StringFormat("Error: Load keys failed. failed to allocate {} bytes using malloc.", fileSize)); + throw runtime_error(StringFormat("Error: Load keys failed. " + "failed to allocate {} bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { - throw runtime_error( - StringFormat("Error: Load keys failed. An error occurred while reading file: {}.", ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. " + "An error occurred while reading file: {}.", ss.str())); } if (res != fileSize) { - throw runtime_error(StringFormat( - "Error: Load keys failed. Expected to read {} bytes, but actually read {} bytes to file {}.", fileSize, - res, ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. Expected to read {} bytes, " + "but actually read {} bytes to file {}.", fileSize, res, ss.str())); } size_t loadKeySize = fileSize / sizeof(int64_t); diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp index 704a89b5..2c463115 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp @@ -162,9 +162,9 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding if (res != embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); - throw runtime_error( - StringFormat("Error: Expected to write {} bytes, but actually write {} bytes to file {}.", - embeddingSize * sizeof(float), res, filePath.c_str())); + throw runtime_error(StringFormat("Error: Expected to write {} bytes, " + "but actually write {} bytes to file {}.", + embeddingSize * sizeof(float), res, filePath.c_str())); } } #endif @@ -267,9 +267,9 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em if (res != embedSizeInfo.embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); - throw runtime_error( - StringFormat("Error: Expected to read {} bytes, but actually read {} bytes from file {}.", - embedSizeInfo.embeddingSize * sizeof(float), res, filePath.c_str())); + throw runtime_error(StringFormat("Error: Expected to read {} bytes, " + "but actually read {} bytes from file {}.", + embedSizeInfo.embeddingSize * sizeof(float), res, filePath.c_str())); } aclError ret = aclrtMemcpy(floatPtr + i * embedSizeInfo.extendEmbSize, diff --git a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h index 6ba0d7bb..6b9fe19c 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h +++ b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h @@ -152,6 +152,7 @@ namespace MxRec { } unReadLength -= res; readBytes += res; + reTryCount++; } return readBytes; } @@ -174,6 +175,7 @@ namespace MxRec { } unReadLength -= res; readBytes += res; + reTryCount++; } return readBytes; } @@ -195,6 +197,7 @@ namespace MxRec { } unWriteLength -= res; writeBytes += res; + reTryCount++; } return writeBytes; } @@ -216,6 +219,7 @@ namespace MxRec { } unWriteLength -= res; writeBytes += res; + reTryCount++; } return writeBytes; } diff --git a/src/tests/file_system/hdfs_file_system_test.cpp b/src/tests/file_system/hdfs_file_system_test.cpp index a8c8bbf5..3794d14d 100644 --- 
a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -38,8 +38,6 @@ void MockHdfs() EMOCK(&HdfsWrapper::FreeFileInfo).stubs().will(ignoreReturnValue()); EMOCK(&HdfsWrapper::OpenFile).stubs().will(returnValue(hdfsFileHandler)); EMOCK(&HdfsWrapper::CloseFile).stubs().will(returnValue(1)); - EMOCK(&HdfsWrapper::Write).stubs().will(returnValue(1)); - EMOCK(&HdfsWrapper::Read).stubs().will(returnValue(1)); EMOCK(&HdfsWrapper::Seek).stubs().will(returnValue(1)); } @@ -86,23 +84,3 @@ TEST_F(HdfsFileSystemTest, GetFileSize) EXPECT_NO_THROW(fileSystemPtr->GetFileSize(filePath)); } -TEST_F(HdfsFileSystemTest, testCase) -{ - string filePath = "hdfs://master:9000/test_dir/"; - auto fileSystemHandler = make_unique(); - auto fileSystemPtr = fileSystemHandler->Create(filePath); - - vector dirs; - dirs = fileSystemPtr->ListDir(filePath); - EXPECT_EQ(dirs.size(), 0); - - vector writeData = {0, 1, 2, 3, 4, 5}; - size_t testDataSize = writeData.size() * sizeof(int64_t); - EXPECT_NO_THROW(fileSystemPtr->Write(filePath, reinterpret_cast(writeData.data()), testDataSize)); - float p[5] = {1.1, 2.2, 3.3, 4.4, 5.5}; - vector writeData1 = {p, p+1, p+2, p+3, p+4}; - EXPECT_NO_THROW(fileSystemPtr->Write(filePath, writeData1, sizeof(float))); - - vector readData = {}; - EXPECT_NO_THROW(fileSystemPtr->Read(filePath, reinterpret_cast(readData.data()), 1)); -} \ No newline at end of file -- Gitee From 4b7de4c286ad018e5b0fa830b93ee3b47928b0ca Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Wed, 8 May 2024 17:52:50 +0800 Subject: [PATCH 124/302] =?UTF-8?q?=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90rea?= =?UTF-8?q?dme=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 994a7153..7aa64218 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -54,6 +54,7 @@ b) 算子参数说明: * inputVar: embedding表对应的variable数据;计算结果原地更新; c) 算子约束说明: * 支持的型号:Atlas A2系列产品; +* 支持的CANN版本:8.0.RC1及之后版本; * 支持的输入数据类型:float32; * embedding表的dim值需要是8的倍数; -- Gitee From 57116797d207dd8a730a13f07f643268dbfc9abb Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Wed, 8 May 2024 17:54:43 +0800 Subject: [PATCH 125/302] =?UTF-8?q?=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90rea?= =?UTF-8?q?dme=E4=BF=AE=E6=94=B92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh index 37b00b42..b44855df 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/run.sh @@ -76,7 +76,7 @@ function main { cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build cmake ../src if [ $? -ne 0 ]; then - echo "ERROR: cmake f ailed!" + echo "ERROR: cmake failed!" return 1 fi echo "INFO: cmake success!" 
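# A minimal end-to-end invocation of this test, assuming the default CANN
# install path documented in the fused_lazy_adam README (adjust if the
# toolkit lives elsewhere):
#     source /usr/local/Ascend/ascend-toolkit/set_env.sh
#     bash run.sh
# The numbered steps above then clean stale logs and .bin files, regenerate
# the input and golden data, and rebuild the aclnn test via cmake.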
-- Gitee From 0989313f37ad2fb2dfb95df19bbfd77d8069e1a2 Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Wed, 8 May 2024 19:51:07 +0800 Subject: [PATCH 126/302] cleancode --- mx_rec/core/asc/swap_args.py | 1 + mx_rec/core/emb/base_sparse_embedding.py | 26 ++++++++++++------------ mx_rec/graph/modifier.py | 2 +- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/mx_rec/core/asc/swap_args.py b/mx_rec/core/asc/swap_args.py index 4494cc26..5bcba234 100644 --- a/mx_rec/core/asc/swap_args.py +++ b/mx_rec/core/asc/swap_args.py @@ -27,6 +27,7 @@ class SwapDataType(Enum): def singleton(cls): _instance = {} + @functools.wraps(cls) def inner(): if cls not in _instance: _instance[cls] = cls() diff --git a/mx_rec/core/emb/base_sparse_embedding.py b/mx_rec/core/emb/base_sparse_embedding.py index 2a52b3a6..2c29f9c8 100644 --- a/mx_rec/core/emb/base_sparse_embedding.py +++ b/mx_rec/core/emb/base_sparse_embedding.py @@ -263,19 +263,6 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): """ self._multi_lookup_times[is_training] = self._multi_lookup_times.get(is_training) + 1 - def _set_ext_emb_size(self): - # 初始设置_ext_emb_size等于_emb_size,改图阶段会根据优化器的不同而exchange该值 - self._ext_emb_size = self._emb_size * self._ext_coefficient - logger.debug("Init table, ext_emb_size is set to be %s.", self._ext_emb_size) - - def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, channel_id: int, send_count: Optional[int]) -> dict: - config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, - rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, - is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, emb_size=self._emb_size, - use_dynamic_expansion=ConfigInitializer.get_instance().use_dynamic_expansion) - - return get_preprocessed_tensor_for_asc(self._variable, config) - def lookup(self, ids: tf.Tensor, send_count: Optional[int], **kwargs) -> tf.Tensor: """ 稀疏表的lookup,自动改图模式. 
@@ -388,6 +375,19 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): return tf.stop_gradient(self._lookup_result.get(spec_name).get(is_training), name="stop_grad_lookup_res") return self._lookup_result.get(spec_name).get(is_training) + def _set_ext_emb_size(self): + # 初始设置_ext_emb_size等于_emb_size,改图阶段会根据优化器的不同而exchange该值 + self._ext_emb_size = self._emb_size * self._ext_coefficient + logger.debug("Init table, ext_emb_size is set to be %s.", self._ext_emb_size) + + def _get_preprocessed_tensor(self, feature_spec: FeatureSpec, channel_id: int, send_count: Optional[int]) -> dict: + config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, + rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, + is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, emb_size=self._emb_size, + use_dynamic_expansion=ConfigInitializer.get_instance().use_dynamic_expansion) + + return get_preprocessed_tensor_for_asc(self._variable, config) + def _lookup_forward(self, feature_spec: FeatureSpec, send_count: Optional[int], **kwargs) -> tf.Tensor: is_training = kwargs.get("is_train") hashtable_params = dict(slice_device_vocabulary_size=self._slice_device_vocabulary_size, diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 72772c5f..33c4b958 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -644,7 +644,7 @@ def get_variable_and_slot_list(each_var, slot_num, table_name, channel_id): # predict不需要传优化器,但是如果客户创建了优化器,ddr模式加载的是维度ext_size的emb用作换入换出,所以需要给slot零值占位 if optimizer is None and channel_id == 1: slot_place_holder = tf.zeros_like(each_var) - for i in range(slot_num): + for _ in range(slot_num): variable_and_slot_list.append(slot_place_holder) else: # opt name to slot dict -- Gitee From 2c6746af79208e34df9848f74f1b621acb22758c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 8 May 2024 13:42:28 +0000 Subject: [PATCH 127/302] =?UTF-8?q?!127=20hot=E6=A8=A1=E5=BC=8F=E7=9A=84?= =?UTF-8?q?=E9=9D=99=E6=80=81shape=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91hot?= =?UTF-8?q?=20size=E9=9D=99=E6=80=81=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91hot?= =?UTF-8?q?=20size=E9=9D=99=E6=80=81=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91hot?= =?UTF-8?q?=20size=E9=9D=99=E6=80=81=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91hot?= =?UTF-8?q?=20size=E9=9D=99=E6=80=81=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91hot?= =?UTF-8?q?=20size=E9=9D=99=E6=80=81=E4=BF=AE=E5=A4=8D=20*=20Merge=20remot?= =?UTF-8?q?e-tracking=20branch=20'upstream/develop'=20into=20develop-ddr-w?= =?UTF-8?q?itho=E2=80=A6=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91hot=20size=E9=9D=99=E6=80=81?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91clean=20code=E5=92=8C=E8=85=BE?= =?UTF-8?q?=E8=AE=AFeval=E9=83=A8=E5=88=86=E6=94=B9=E5=9B=BE=E7=9A=84?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=20*=20Merge=20remote-tracking=20branch=20'up?= =?UTF-8?q?stream/develop'=20into=20develop-ddr-witho=E2=80=A6=20*=20Merge?= =?UTF-8?q?=20remote-tracking=20branch=20'upstream/develop'=20into=20devel?= 
=?UTF-8?q?op-ddr-witho=E2=80=A6=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91slot=E5=92=8Cderivati?= =?UTF-8?q?ve=E7=A7=BB=E8=87=B3=E4=B8=8A=E5=B1=82base=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91cre?= =?UTF-8?q?ate=5Ftable=E6=8E=A5=E5=8F=A3=E4=B8=8E=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E5=99=A8=E5=88=9B=E5=BB=BA=E8=A7=A3=E8=80=A6=20*=20Merge=20rem?= =?UTF-8?q?ote-tracking=20branch=20'origin/develop-global-unique'=20into?= =?UTF-8?q?=20devel=E2=80=A6=20*=20Merge=20remote-tracking=20branch=20'ups?= =?UTF-8?q?tream/develop'=20into=20develop-ddr-witho=E2=80=A6=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr?= =?UTF-8?q?=20without=20optimizer=20for=20fp=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20withou?= =?UTF-8?q?t=20optimizer=20for=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20without=20optim?= =?UTF-8?q?izer=20for=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91ddr=20without=20optimizer=20fo?= =?UTF-8?q?r=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Mo?= =?UTF-8?q?dification=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20Merge=20rem?= =?UTF-8?q?ote-tracking=20branch=20'upstream/develop'=20into=20develop-ddr?= =?UTF-8?q?-witho=E2=80=A6=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91ddr=20without=20optimizer=20fo?= =?UTF-8?q?r=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Mo?= =?UTF-8?q?dification=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91ddr=20without=20optimizer=20for=20fp=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr?= =?UTF-8?q?=20without=20optimizer=20for=20fp=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20withou?= =?UTF-8?q?t=20optimizer=20for=20fp=20*=20Merge=20remote-tracking=20branch?= =?UTF-8?q?=20'upstream/develop'=20into=20develop-ddr-witho=E2=80=A6=20*?= =?UTF-8?q?=20Merge=20remote-tracking=20branch=20'origin/develop-ddr-witho?= =?UTF-8?q?ut-optimizer'=20in=E2=80=A6=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91ddr=20without=20optim?= =?UTF-8?q?izer=20for=20fp=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91ddr=20without=20optimizer=20fo?= =?UTF-8?q?r=20fp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/asc/build_graph.py | 6 +- mx_rec/core/emb/base_sparse_embedding.py | 3 +- src/core/key_process/key_process.cpp | 6 +- src/tests/key_process/key_process_test.cpp | 174 --------------------- tests/mx_rec/core/test_build_graph.py | 24 ++- 5 files changed, 23 insertions(+), 190 deletions(-) diff --git a/mx_rec/core/asc/build_graph.py b/mx_rec/core/asc/build_graph.py index 82e40b29..46dbf193 100644 --- a/mx_rec/core/asc/build_graph.py +++ b/mx_rec/core/asc/build_graph.py @@ -39,15 +39,17 @@ def get_restore_vector(config): raise 
TypeError("ext_emb_size must be a int") if config.get("ext_emb_size") < 1: raise ValueError("ext_emb_size is less than 1") - emb_size = None + emb_size = config.get("emb_size") if ConfigInitializer.get_instance().use_static: restore_size = config.get("batch_size") * config.get("feat_cnt") + device_id = int(config.get("device_id")) + hot_size = int(mxrec_pybind.get_ub_hot_size(device_id) / emb_size) else: restore_size = None + hot_size = None with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): - hot_size = None restore_vector, hot_pos = npu_ops.gen_npu_ops.get_next( output_types=[tf.int32, tf.int32], output_shapes=[restore_size, [hot_size]], diff --git a/mx_rec/core/emb/base_sparse_embedding.py b/mx_rec/core/emb/base_sparse_embedding.py index 2a52b3a6..a654629a 100644 --- a/mx_rec/core/emb/base_sparse_embedding.py +++ b/mx_rec/core/emb/base_sparse_embedding.py @@ -272,7 +272,8 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, emb_size=self._emb_size, - use_dynamic_expansion=ConfigInitializer.get_instance().use_dynamic_expansion) + use_dynamic_expansion=ConfigInitializer.get_instance().use_dynamic_expansion, + device_id=self._device_id) return get_preprocessed_tensor_for_asc(self._variable, config) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index b2dfab04..22148581 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -129,12 +129,8 @@ int KeyProcess::Start() void KeyProcess::InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo) { - int embeddingSize = info.extEmbeddingSize; - if (rankInfo.useDynamicExpansion) { - embeddingSize = info.embeddingSize; - } hotEmbTotCount[info.name] = static_cast(static_cast(GetUBSize(rInfo.deviceId) / sizeof(float)) * - HOT_EMB_CACHE_PCT / static_cast(embeddingSize)); + HOT_EMB_CACHE_PCT / static_cast(info.embeddingSize)); } OffsetMemT KeyProcess::GetMaxOffset() diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index 86ec3f80..a68f4787 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -428,34 +428,6 @@ TEST_F(KeyProcessTest, PaddingHashSplitWithFAAE) } } -TEST_F(KeyProcessTest, HotHashSplit) -{ - PrepareBatch(); - ASSERT_EQ(process.Initialize(rankInfo, embInfos), true); - LOG_INFO("CPU Core Num: %{}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 - - auto fn = [this](int channel, int id) { - auto embName = embInfos[0].name; - process.hotEmbTotCount[embName] = 10; - vector splitKeys; - vector restore; - vector hotPos; - unique_ptr batch; - batch = process.GetBatchData(channel, id); // get batch data from SingletonQueue - LOG_INFO("rankid :{},batchid: {}", rankInfo.rankId, batch->batchId); - tie(splitKeys, restore, hotPos) = process.HotHashSplit(batch); - LOG_INFO("rankid :{},batchid: {}, hotPos {}", rankInfo.rankId, batch->batchId, VectorToString(hotPos)); - }; // for clean code - for (int channel = 0; channel < 1; ++channel) { - for (int id = 0; id < 1; ++id) { - // use lambda expression initialize thread - process.procThreads.emplace_back(std::make_unique(fn, channel, id)); - } - } - this_thread::sleep_for(10s); - process.Destroy(); -} - TEST_F(KeyProcessTest, GetScAll) { vector 
keyScLocal(worldSize, worldRank + 1); // 用worldRank+1初始化发送数据量 @@ -527,38 +499,6 @@ TEST_F(KeyProcessTest, BuildRestoreVec_4cpu) ASSERT_THAT(restore, ElementsAreArray(allExpectRestore[worldRank])); } -// hot模式,batch随机数,ProcessSplitKeys后人为校验lookupKeys、scAll、restore -TEST_F(KeyProcessTest, BuildRestoreVec_rebuilt) -{ - PrepareBatch(); - ASSERT_EQ(process.Initialize(rankInfo, embInfos), true); - LOG_INFO("CPU Core Num: {}", sysconf(_SC_NPROCESSORS_CONF)); // 查看CPU核数 - - auto fn = [this](int channel, int id) { - auto embName = embInfos[0].name; - vector splitKeys; - vector restore; - vector hotPos; - unique_ptr batch; - batch = process.GetBatchData(channel, id); // get batch data from SingletonQueue - LOG_INFO("rankid :{}, batchid: {}", rankInfo.rankId, batch->batchId); - tie(splitKeys, restore, hotPos) = process.HotHashSplit(batch); - auto [lookupKeys, scAll, ss] = process.ProcessSplitKeys(batch, id, splitKeys); - process.BuildRestoreVec(batch, ss, restore, hotPos.size()); - LOG_INFO("rankid :{}, batchid: {}, lookupKeys: {}, scAll: {}, restore after build {}", - rankInfo.rankId, batch->batchId, VectorToString(lookupKeys), - VectorToString(scAll), VectorToString(restore)); - }; // for clean code - for (int channel = 0; channel < 1; ++channel) { - for (int id = 0; id < KEY_PROCESS_THREAD; ++id) { - // use lambda expression initialize thread - process.procThreads.emplace_back(std::make_unique(fn, channel, id)); - } - } - this_thread::sleep_for(10s); - process.Destroy(); -} - // 准入模式,batch随机数,ProcessSplitKeys后人为校验lookupKeys、scAll、count TEST_F(KeyProcessTest, GetCountRecv) { @@ -638,120 +578,6 @@ TEST_F(KeyProcessTest, GetUniqueConfig) process.GetUniqueConfig(uniqueConf); } -// HBM端到端测试,动态shape,固定batch输入 -TEST_F(KeyProcessTest, KeyProcessTaskHelper) -{ - rankInfo.isDDR = false; - rankInfo.useStatic = false; - rankInfo.useDynamicExpansion = false; - EmbeddingMgmt::Instance()->Init(rankInfo, embInfos); - ASSERT_EQ(process.Initialize(rankInfo, embInfos), true); - ASSERT_EQ(process.isRunning, true); - int batchId = 0; - int channelId = 0; - auto batch = GenBatch(embInfos[0].name, batchId, channelId); // 测试一个表 - - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, batchSize: {}", - rankInfo.rankId, batch->batchId, batch->sample.size()); - - ASSERT_EQ(process.KeyProcessTaskHelper(batch, channelId, 0), true); // threadId = 0 - auto infoVecs = process.GetInfoVec(batchId, embInfos[0].name, channelId, ProcessedInfo::RESTORE); - ASSERT_NE(infoVecs, nullptr); - auto all2all = process.GetInfoVec(batchId, embInfos[0].name, channelId, ProcessedInfo::ALL2ALL); - ASSERT_NE(all2all, nullptr); - - ASSERT_EQ(CheckMatrixTensor(*all2all, allExpectAll2all), true); - ASSERT_EQ(CheckFlatTensor({infoVecs->back()}, allExpectOffset[worldRank]), true); - infoVecs->pop_back(); - int64_t hotPosition = process.hotEmbTotCount[batch->name]; - vector expectRestore(allExpectRestore[worldRank].size()); - for (int i = 0; i < expectRestore.size(); i++) { - expectRestore[i] = allExpectRestore[worldRank][i] + hotPosition; - } - ASSERT_EQ(CheckFlatTensor(*infoVecs, expectRestore), true); - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, normal status success", rankInfo.rankId, batch->batchId); - // 测试batchId错误 - HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); - hybridMgmtBlock->hybridBatchId[0] = 1; - ASSERT_EQ(process.GetInfoVec(batchId, embInfos[0].name, channelId, ProcessedInfo::RESTORE), nullptr); - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, batchId exception success", - rankInfo.rankId, 
batch->batchId); - // 测试empty场景 - hybridMgmtBlock->pythonBatchId[1] = 1; - hybridMgmtBlock->hybridBatchId[1] = 1; - hybridMgmtBlock->readEmbedBatchId[1] = 1; - hybridMgmtBlock->loop[1] = 1; - ASSERT_EQ(process.GetInfoVec(batchId + 1, embInfos[0].name, channelId + 1, ProcessedInfo::RESTORE), nullptr); - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, batch empty success", rankInfo.rankId, batch->batchId); - // eos - process.SetEos(1, 1); - ASSERT_EQ(process.GetInfoVec(batchId + 1, embInfos[0].name, channelId + 1, ProcessedInfo::RESTORE), nullptr); - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, eos status success", rankInfo.rankId, batch->batchId); - this_thread::sleep_for(10s); - process.Destroy(); -} - -// DDR端到端测试,静态shape,固定batch输入 -TEST_F(KeyProcessTest, KeyProcessTaskHelperDDR) -{ - rankInfo.isDDR = true; - rankInfo.useStatic = true; - rankInfo.useDynamicExpansion = false; - EmbeddingMgmt::Instance()->Init(rankInfo, embInfos); - ASSERT_EQ(process.Initialize(rankInfo, embInfos), true); - ASSERT_EQ(process.isRunning, true); - int batchId = 0; - int channelId = 0; - auto batch = GenBatch(embInfos[0].name, batchId, channelId); // 测试第一个表 - HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); - hybridMgmtBlock->hybridBatchId[0] = 0; - LOG_INFO("KeyProcessTaskHelperDDR, rankid: {}, batchid: {}", rankInfo.rankId, batch->batchId); - - ASSERT_EQ(process.KeyProcessTaskHelper(batch, channelId, 0), true); // threadId = 0 - - auto lookupKeys = process.GetLookupKeys(batchId, embInfos[0].name, channelId); // lookup list返回的不是tensor - ASSERT_EQ(lookupKeys.size(), sendCount * worldSize); - LOG_INFO("KeyProcessTaskHelperDDR, rankid: {}, batchid: {}, lookupKeys: {}", - rankInfo.rankId, batch->batchId, VectorToString(lookupKeys)); - ASSERT_EQ(CheckPaddingVec(lookupKeys, allExpectLookupKeys[worldRank]), true); - - auto infoVecs = process.GetInfoVec(batchId, embInfos[0].name, channelId, ProcessedInfo::RESTORE); - ASSERT_NE(infoVecs, nullptr); - int col = allExpectRestore[worldRank].size(); - auto tmpTensor = (*infoVecs).at(0); - auto tmpData = tmpTensor.flat(); - - int64_t hotPosition = process.hotEmbTotCount[batch->name]; - vector actualGetRestore(col); - for (int j = 0; j < col; j++) { - actualGetRestore[j] = tmpData(j)-hotPosition; - } - LOG_INFO("KeyProcessTaskHelperDDR, rankid: {}, batchid: {}, Restore: {}", - rankInfo.rankId, batch->batchId, VectorToString(actualGetRestore)); - ASSERT_THAT(actualGetRestore, ElementsAreArray(allExpectRestoreStatic[worldRank])); - LOG_INFO("KeyProcessTaskHelperDDR, rankid: {}, batchid: {}, normal status success", - rankInfo.rankId, batch->batchId); - - // 测试batchId错误 - hybridMgmtBlock->hybridBatchId[0] = 1; - ASSERT_EQ(process.GetLookupKeys(batchId, embInfos[0].name, channelId).empty(), true); - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, batchId exception success", - rankInfo.rankId, batch->batchId); - // 测试empty场景 - hybridMgmtBlock->pythonBatchId[1] = 1; - hybridMgmtBlock->hybridBatchId[1] = 1; - hybridMgmtBlock->readEmbedBatchId[1] = 1; - hybridMgmtBlock->loop[1] = 1; - ASSERT_EQ(process.GetLookupKeys(batchId + 1, embInfos[0].name, channelId + 1).empty(), true); - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, batch empty success", rankInfo.rankId, batch->batchId); - // eos - process.SetEos(1, 1); - ASSERT_EQ(process.GetLookupKeys(batchId + 1, embInfos[0].name, channelId + 1).empty(), true); - LOG_INFO("KeyProcessTaskHelper, rankid: {}, batchid: {}, eos status success", rankInfo.rankId, batch->batchId); - 
this_thread::sleep_for(10s); - process.Destroy(); -} - TEST_F(KeyProcessTest, InitializeUnique) { ASSERT_EQ(ock::ctr::Factory::Create(factory), -1); diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index c5766179..5360f908 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -32,12 +32,14 @@ class TestGetRestoreVectorFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) def test_get_restore_vector_case1(self): """ @@ -112,13 +114,15 @@ class TestGetIdOffsetsFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) self.max_lookup_vec_size = self.config.get("send_count") * self.config.get("rank_size") def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") def test_get_id_offsets_case1(self, mock_get_next): @@ -160,12 +164,14 @@ class TestGetAll2allArgsFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) def test_get_all2all_args_case1(self): """ @@ -200,12 +206,14 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): def setUp(self): # 默认动态扩容、hot emb、HBM self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) def tearDown(self): # 恢复config self.config = dict(table_name="test_table", channel_id=0, is_hbm=True, emb_size=8, ext_emb_size=8, - feat_cnt=8, batch_size=32, rank_size=8, send_count=1, use_dynamic_expansion=True) + feat_cnt=8, batch_size=32, rank_size=8, send_count=1, device_id=0, + use_dynamic_expansion=True) @mock.patch.multiple("mx_rec.core.asc.build_graph", 
get_restore_vector=mock.MagicMock(return_value=[0, 0]), -- Gitee From c8355ad1fde2e9828da0ed8edb85aaa99f154b86 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 9 May 2024 15:34:19 +0800 Subject: [PATCH 128/302] =?UTF-8?q?dockerfile=E5=8F=96=E6=B6=88=E8=AE=BE?= =?UTF-8?q?=E7=BD=AECC=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/build_mxRec_images/centos_build/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/build_mxRec_images/centos_build/Dockerfile b/docs/build_mxRec_images/centos_build/Dockerfile index ee1d98e8..4e21166c 100644 --- a/docs/build_mxRec_images/centos_build/Dockerfile +++ b/docs/build_mxRec_images/centos_build/Dockerfile @@ -130,6 +130,9 @@ RUN pip3.7 install -U pip && \ pip3.7 install h5py==3.1.0 && \ rm -rf /root/.cache/pip +# 安装mpi4py时使用该环境变了,安装完成后取消 +RUN unset CC + # 10.设置驱动路径环境变量 ARG ASCEND_BASE=/usr/local/Ascend ENV LD_LIBRARY_PATH=$ASCEND_BASE/driver/lib64:$ASCEND_BASE/driver/lib64/common:$ASCEND_BASE/driver/lib64/driver:$LD_LIBRARY_PATH -- Gitee From c9561815ded4b64c83863c09e247f1b966aa39e5 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 9 May 2024 15:38:45 +0800 Subject: [PATCH 129/302] =?UTF-8?q?dockerfile=E5=8F=96=E6=B6=88=E8=AE=BE?= =?UTF-8?q?=E7=BD=AECC=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F-=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/build_mxRec_images/centos_build/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/build_mxRec_images/centos_build/Dockerfile b/docs/build_mxRec_images/centos_build/Dockerfile index 4e21166c..3e93704e 100644 --- a/docs/build_mxRec_images/centos_build/Dockerfile +++ b/docs/build_mxRec_images/centos_build/Dockerfile @@ -130,7 +130,7 @@ RUN pip3.7 install -U pip && \ pip3.7 install h5py==3.1.0 && \ rm -rf /root/.cache/pip -# 安装mpi4py时使用该环境变了,安装完成后取消 +# 安装mpi4py时使用该环境变,安装完成后取消 RUN unset CC # 10.设置驱动路径环境变量 -- Gitee From de83703090bfd967a0dc09a29f95052195aa4aa9 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 9 May 2024 15:51:57 +0800 Subject: [PATCH 130/302] =?UTF-8?q?dockerfile=E5=8F=96=E6=B6=88=E8=AE=BE?= =?UTF-8?q?=E7=BD=AECC=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F-=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/build_mxRec_images/centos_build/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/build_mxRec_images/centos_build/Dockerfile b/docs/build_mxRec_images/centos_build/Dockerfile index 3e93704e..16fd0688 100644 --- a/docs/build_mxRec_images/centos_build/Dockerfile +++ b/docs/build_mxRec_images/centos_build/Dockerfile @@ -130,7 +130,7 @@ RUN pip3.7 install -U pip && \ pip3.7 install h5py==3.1.0 && \ rm -rf /root/.cache/pip -# 安装mpi4py时使用该环境变,安装完成后取消 +# 安装mpi4py时使用该环境变量,安装完成后取消 RUN unset CC # 10.设置驱动路径环境变量 @@ -139,6 +139,8 @@ ENV LD_LIBRARY_PATH=$ASCEND_BASE/driver/lib64:$ASCEND_BASE/driver/lib64/common:$ # 11.CANN相关参数 ARG TOOLKIT_PKG=Ascend-cann-toolkit*.run + + ARG TOOLKIT_PATH=$ASCEND_BASE/ascend-toolkit/latest # 12.TF相关 -- Gitee From 3df4015a148ec9544d0e13ef1cd4fd32dee48ba5 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 9 May 2024 15:52:52 +0800 Subject: [PATCH 131/302] =?UTF-8?q?dockerfile=E5=8F=96=E6=B6=88=E8=AE=BE?= 
=?UTF-8?q?=E7=BD=AECC=E7=8E=AF=E5=A2=83=E5=8F=98=E9=87=8F-=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/build_mxRec_images/centos_build/Dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/build_mxRec_images/centos_build/Dockerfile b/docs/build_mxRec_images/centos_build/Dockerfile index 16fd0688..2d2b3579 100644 --- a/docs/build_mxRec_images/centos_build/Dockerfile +++ b/docs/build_mxRec_images/centos_build/Dockerfile @@ -139,8 +139,6 @@ ENV LD_LIBRARY_PATH=$ASCEND_BASE/driver/lib64:$ASCEND_BASE/driver/lib64/common:$ # 11.CANN相关参数 ARG TOOLKIT_PKG=Ascend-cann-toolkit*.run - - ARG TOOLKIT_PATH=$ASCEND_BASE/ascend-toolkit/latest # 12.TF相关 -- Gitee From 14af0e7af544001db46f5612e599f044480bea5a Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Fri, 10 May 2024 09:17:19 +0800 Subject: [PATCH 132/302] bugfix --- mx_rec/core/emb/base_sparse_embedding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mx_rec/core/emb/base_sparse_embedding.py b/mx_rec/core/emb/base_sparse_embedding.py index 2c29f9c8..f984697c 100644 --- a/mx_rec/core/emb/base_sparse_embedding.py +++ b/mx_rec/core/emb/base_sparse_embedding.py @@ -384,7 +384,8 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): config = dict(batch_size=feature_spec.batch_size, feat_cnt=feature_spec.feat_cnt, send_count=send_count, rank_size=self._rank_size, channel_id=channel_id, table_name=self._table_name, is_hbm=self._is_hbm, ext_emb_size=self._ext_emb_size, emb_size=self._emb_size, - use_dynamic_expansion=ConfigInitializer.get_instance().use_dynamic_expansion) + use_dynamic_expansion=ConfigInitializer.get_instance().use_dynamic_expansion, + device_id=self._device_id) return get_preprocessed_tensor_for_asc(self._variable, config) -- Gitee From b85aaa314050ce38b48b1c2cb0b7facf4ec02c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Fri, 10 May 2024 08:59:39 +0000 Subject: [PATCH 133/302] =?UTF-8?q?!133=20=E5=85=A8=E5=B1=80=E5=8E=BB?= =?UTF-8?q?=E9=87=8D+=E6=89=A9=E5=AE=B9=E6=A8=A1=E5=BC=8F=EF=BC=8C?= =?UTF-8?q?=E8=A1=A8=E5=90=8D=E5=B8=A6=E6=9C=89=E2=80=9C/=E2=80=9D?= =?UTF-8?q?=E5=AD=97=E6=A0=B7=E9=9A=90=E6=82=A3=E4=BF=AE=E5=A4=8D=20*=20?= =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification?= =?UTF-8?q?=E3=80=91=E5=85=A8=E5=B1=80unique=E5=8A=9F=E8=83=BD=E5=9C=A8?= =?UTF-8?q?=E6=89=A9=E5=AE=B9=E6=A8=A1=E5=BC=8F=E4=B8=8B=EF=BC=8C=E8=A1=A8?= =?UTF-8?q?=E5=90=8D=E5=AD=97=E2=80=9C/=E2=80=9D=E9=9A=90=E6=82=A3?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80unique?= =?UTF-8?q?=E5=8A=9F=E8=83=BD=E5=9C=A8=E6=89=A9=E5=AE=B9=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E4=B8=8B=EF=BC=8C=E8=A1=A8=E5=90=8D=E5=AD=97=E2=80=9C/?= =?UTF-8?q?=E2=80=9D=E9=9A=90=E6=82=A3=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91?= =?UTF-8?q?=E5=85=A8=E5=B1=80unique=E5=8A=9F=E8=83=BD=E5=9C=A8=E6=89=A9?= =?UTF-8?q?=E5=AE=B9=E6=A8=A1=E5=BC=8F=E4=B8=8B=EF=BC=8C=E8=A1=A8=E5=90=8D?= =?UTF-8?q?=E5=AD=97=E2=80=9C/=E2=80=9D=E9=9A=90=E6=82=A3=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/asc/manager.py | 6 ++++++ mx_rec/core/emb/dynamic_sparse_embedding.py | 4 +++- mx_rec/graph/modifier.py | 2 +- mx_rec/optimizers/base.py | 5 ++--- 
mx_rec/util/config_utils/embedding_utils.py | 14 ++++++++++++++ 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index 8b62b66b..97a71a4d 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -37,10 +37,16 @@ def generate_table_info_list(): raise ValueError(f"The DDR mode of all tables must be used or not used at the same time. However, is_hbm " f"of each table `{table_instance_dict.keys()}` is `{is_hbm_list}`.") + # 通过create_hash_optimizer创建optimizer_instance + optimizer_instance = ConfigInitializer.get_instance().optimizer_config.optimizer_instance # generate table info dangling_table = check_dangling_table() for _, table_instance in ConfigInitializer.get_instance().sparse_embed_config.table_instance_dict.items(): + # FS模式扩容场景 + if ConfigInitializer.get_instance().use_dynamic_expansion and optimizer_instance: + table_instance.ext_emb_size = table_instance.emb_size * (1 + optimizer_instance.slot_num) + logger.info("ext_emb_size is reset to be %s in generate_table_info_list.", table_instance.ext_emb_size) skip = should_skip(table_instance.table_name) if table_instance.table_name in dangling_table or skip: logger.info("skip table %s: %s which does not need to be provided to the EmbInfo.", diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py index 49979261..a7616991 100644 --- a/mx_rec/core/emb/dynamic_sparse_embedding.py +++ b/mx_rec/core/emb/dynamic_sparse_embedding.py @@ -50,7 +50,9 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): self._table_name, ASCEND_TABLE_NAME_MUST_CONTAIN) if not add_collection_condition: return sparse_forward_fn(local_embeddings) - + # 创建扩容查询tensor和table_instance的映射关系,以便优化器中使用 + ConfigInitializer.get_instance().sparse_embed_config.insert_table_instance_to_tensor_dict( + result.get("id_offsets"), self) tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings) tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get("id_offsets")) return sparse_forward_fn(local_embeddings) diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 33c4b958..6b6013d8 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -388,7 +388,7 @@ def change_ext_emb_size_by_opt(optimizer): # When dynamic expansion mode, ext_emb_size is set by optimizer if ConfigInitializer.get_instance().use_dynamic_expansion or not table_instance.is_hbm: table_instance.ext_emb_size = table_instance.emb_size * (1 + optimizer.slot_num) - logger.debug("ext_emb_size is reset to be %s for EmbInfo", table_instance.ext_emb_size) + logger.info("ext_emb_size is reset to be %s in change_ext_emb_size_by_opt", table_instance.ext_emb_size) @para_checker_decorator( diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index fbc63193..f74e9778 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -97,9 +97,8 @@ class CustomizedOptimizer: @staticmethod def sum_same_id_gradients(grad, var, is_expansion): if isinstance(var, ops.Tensor): - # 扩容模式从scope获取表名,偏移是-2 - table_name = var.op.name.split('/')[-2] - table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance_by_name(table_name) + table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance_by_tensor(var) + table_name = table_instance.table_name else: table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(var) table_name = 
table_instance.table_name diff --git a/mx_rec/util/config_utils/embedding_utils.py b/mx_rec/util/config_utils/embedding_utils.py index 68ceef3a..e13d9d51 100644 --- a/mx_rec/util/config_utils/embedding_utils.py +++ b/mx_rec/util/config_utils/embedding_utils.py @@ -3,6 +3,7 @@ # Copyright (c) Huawei Technologies Co., Ltd. 2023. All rights reserved. from typing import Optional +from tensorflow.python.framework import ops from tensorflow import Variable from mx_rec.util.log import logger @@ -18,6 +19,7 @@ class SparseEmbedConfig: self._table_name_set = set() self._removing_var_list = [] self._name_to_var_dict = dict() + self._tensor_to_table_instance_dict = dict() @property def table_instance_dict(self): @@ -45,6 +47,12 @@ class SparseEmbedConfig: return self._table_instance_dict.get(key) + def get_table_instance_by_tensor(self, tensor) -> object: + if tensor not in self._tensor_to_table_instance_dict: + raise KeyError(f"Given tensor does not exist.") + + return self._tensor_to_table_instance_dict.get(tensor) + def get_table_instance_by_name(self, table_name: Optional[str]) -> object: if table_name not in self._name_to_var_dict: raise KeyError(f"Given table name does not exist.") @@ -74,5 +82,11 @@ class SparseEmbedConfig: self._name_to_var_dict[name] = key self._table_instance_dict[key] = instance + def insert_table_instance_to_tensor_dict(self, tensor: ops.Tensor, instance: object) -> None: + if tensor in self._tensor_to_table_instance_dict: + raise KeyError(f"Given tensor {tensor} has been used.") + logger.debug("Record one hash table for expansion mode, with tensor: %s.", tensor) + self._tensor_to_table_instance_dict[tensor] = instance + def export_table_num(self) -> int: return len(self.table_instance_dict) if self.table_instance_dict else 0 -- Gitee From 0bd44e8ea30d2450de60aedd35cf50cc4dc68524 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Sat, 11 May 2024 10:45:52 +0800 Subject: [PATCH 134/302] =?UTF-8?q?=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90rea?= =?UTF-8?q?dme=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 7aa64218..13ed6994 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -52,7 +52,13 @@ b) 算子参数说明: * inputM: lazy_adam优化器一阶矩估计;计算结果原地更新; * inputV: lazy_adam优化器二阶矩估计;计算结果原地更新; * inputVar: embedding表对应的variable数据;计算结果原地更新; - c) 算子约束说明: +* lr: 学习率; +* beta1: 一阶矩估计的指数衰减率; +* beta2: 二阶矩估计的指数衰减率; +* epsilon: 极小值; + +c) 算子约束说明: + * 支持的型号:Atlas A2系列产品; * 支持的CANN版本:8.0.RC1及之后版本; * 支持的输入数据类型:float32; -- Gitee From 7ea23a49d9d6691517d7079c62b9b1ba20260d28 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Wed, 15 May 2024 10:53:14 +0800 Subject: [PATCH 135/302] =?UTF-8?q?warm=20start=20=E5=BC=80=E5=8F=91?= =?UTF-8?q?=E8=A1=A5=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 2 +- mx_rec/saver/warm_start.py | 4 ++-- mx_rec/util/config_utils/hybrid_mgmt_utils.py | 4 ++-- src/core/emb_table/embedding_mgmt.cpp | 6 ++++++ src/core/emb_table/embedding_mgmt.h | 5 +++++ src/core/hybrid_mgmt/hybrid_mgmt.cpp | 11 +++++++++-- src/core/hybrid_mgmt/hybrid_mgmt.h | 2 +- src/pybind/module_main.cpp | 2 +- 8 files changed, 27 insertions(+), 9 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py 
index dc545822..4f789a30 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -343,7 +343,7 @@ class Saver(object): set_optimizer_info(optimizer_instance, table_name) if self.config_instance.hybrid_manager_config.asc_manager: - self.config_instance.hybrid_manager_config.restore_host_data(reading_path) + self.config_instance.hybrid_manager_config.restore_host_data(reading_path, warm_start_tables) logger.info("host data was restored.") if self.config_instance.use_dynamic_expansion: diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index 520c3df3..b5df5887 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -95,7 +95,7 @@ def patch_for_estimator_train(func): hooks = kwargs.get('hooks', []) if WarmStartController().get_elements(): hooks.append(SparseRestoreHook()) - return func(*args, *kwargs) + return func(*args, **kwargs) return wrapper @@ -243,4 +243,4 @@ class SparseRestoreHook(tf.estimator.SessionRunHook): for path, restore_tables in self._warm_start_dict.items(): restore_path = get_latest_ckpt(path) self._saver.restore(session, restore_path, restore_tables) - self._is_warm_start = False + self._is_warm_start = True diff --git a/mx_rec/util/config_utils/hybrid_mgmt_utils.py b/mx_rec/util/config_utils/hybrid_mgmt_utils.py index 737ce7cb..89ba16cf 100644 --- a/mx_rec/util/config_utils/hybrid_mgmt_utils.py +++ b/mx_rec/util/config_utils/hybrid_mgmt_utils.py @@ -83,11 +83,11 @@ class HybridManagerConfig: self.asc_manager.save(root_dir) logger.debug("Data from host pipeline has been saved.") - def restore_host_data(self, root_dir: Optional[str]) -> None: + def restore_host_data(self, root_dir: Optional[str], warm_start_tables=None) -> None: if self.asc_manager is None: raise RuntimeError("ASC manager does not exist.") - if not self.asc_manager.load(root_dir): + if not self.asc_manager.load(root_dir, warm_start_tables): raise TypeError("Asc load data does not match usr setups, \ please re-consider if you want to restore from this dir") logger.debug("Data from host pipeline has been restored.") diff --git a/src/core/emb_table/embedding_mgmt.cpp b/src/core/emb_table/embedding_mgmt.cpp index 2c2f9e39..f850e254 100644 --- a/src/core/emb_table/embedding_mgmt.cpp +++ b/src/core/emb_table/embedding_mgmt.cpp @@ -142,6 +142,12 @@ std::shared_ptr EmbeddingMgmt::GetTable(const string& name) { return std::dynamic_pointer_cast(it->second); } +void EmbeddingMgmt::Load(const string& name, const string& filePath) +{ + return embeddings[name]->Load(filePath); +} + + void EmbeddingMgmt::Load(const string& filePath) { for (auto& tablePair: embeddings) { diff --git a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h index 11ed2325..d091bdef 100644 --- a/src/core/emb_table/embedding_mgmt.h +++ b/src/core/emb_table/embedding_mgmt.h @@ -83,6 +83,11 @@ public: std::shared_ptr GetTable(const string& name); + /** * 加载单个表 */ + void Load(const string& name, const string& filePath); + /** * 加载所有表 */ diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 894dc230..be12dd53 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -279,7 +279,7 @@ bool HybridMgmt::Save(const string savePath) /// 加载模型 /// \param loadPath /// \return -bool HybridMgmt::Load(const string& loadPath) +bool HybridMgmt::Load(const string& loadPath, vector<string> warmStartTables) { #ifndef GTEST if (!isInitialized) { @@ -296,7 +296,14 @@ vector
loadFeatures; SetFeatureTypeForLoad(loadFeatures); - EmbeddingMgmt::Instance()->Load(loadPath); + if(warmStartTables.size() == 0) { + EmbeddingMgmt::Instance()->Load(loadPath); + } else { + for (auto& tableName: warmStartTables) { + EmbeddingMgmt::Instance()->Load(tableName, loadPath); + } + } + loadOffsetToSend = EmbeddingMgmt::Instance()->GetLoadOffsets(); // 执行加载操作 diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index 0251eb91..a7bdcee6 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -61,7 +61,7 @@ namespace MxRec { bool Save(const string savePath); - bool Load(const string& loadPath); + bool Load(const string& loadPath, vector warmStartTables); OffsetT SendHostMap(const string tableName); diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index 0df47092..acb914f2 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -214,7 +214,7 @@ namespace { py::arg("seed") = DEFAULT_RANDOM_SEED, py::arg("threshold_values") = vector {}, py::arg("if_load") = false) .def("save", &MxRec::HybridMgmt::Save, py::arg("save_path") = "") - .def("load", &MxRec::HybridMgmt::Load, py::arg("load_path") = "") + .def("load", &MxRec::HybridMgmt::Load, py::arg("load_path") = "", py::arg("warm_start_tables") = vector {}) .def("destroy", &MxRec::HybridMgmt::Destroy) .def("evict", &MxRec::HybridMgmt::Evict) .def("send", &MxRec::HybridMgmt::SendHostMap, py::arg("table_name") = "") -- Gitee From ca6369f6718e213efc4e8474cb5089da94f60242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 16 May 2024 06:29:15 +0000 Subject: [PATCH 136/302] =?UTF-8?q?!136=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E5=BC=82=E5=B8=B8=E6=8D=95=E8=8E=B7=EF=BC=8C=E9=9D=9Ehbm?= =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E4=B8=8B=E5=BF=85=E9=A1=BB=E4=BD=BF=E7=94=A8?= =?UTF-8?q?=E6=94=B9=E5=9B=BE=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=A2=9E=E5=8A=A0=E5=BC=82?= =?UTF-8?q?=E5=B8=B8=E6=8D=95=E8=8E=B7=EF=BC=8C=E9=9D=9Ehbm=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E4=B8=8B=E5=BF=85=E9=A1=BB=E4=BD=BF=E7=94=A8=E6=94=B9?= =?UTF-8?q?=E5=9B=BE=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=20Modification=E3=80=91=E5=A2=9E=E5=8A=A0=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E6=8D=95=E8=8E=B7=EF=BC=8C=E9=9D=9Ehbm=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E4=B8=8B=E5=BF=85=E9=A1=BB=E4=BD=BF=E7=94=A8=E6=94=B9=E5=9B=BE?= =?UTF-8?q?=20*=20Merge=20remote-tracking=20branch=20'upstream/develop'=20?= =?UTF-8?q?into=20develop-bugfix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80uni?= =?UTF-8?q?que=E5=8A=9F=E8=83=BD=E5=9C=A8=E6=89=A9=E5=AE=B9=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E4=B8=8B=EF=BC=8C=E8=A1=A8=E5=90=8D=E5=AD=97=E2=80=9C?= =?UTF-8?q?/=E2=80=9D=E9=9A=90=E6=82=A3=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91?= =?UTF-8?q?=E5=85=A8=E5=B1=80unique=E5=8A=9F=E8=83=BD=E5=9C=A8=E6=89=A9?= =?UTF-8?q?=E5=AE=B9=E6=A8=A1=E5=BC=8F=E4=B8=8B=EF=BC=8C=E8=A1=A8=E5=90=8D?= =?UTF-8?q?=E5=AD=97=E2=80=9C/=E2=80=9D=E9=9A=90=E6=82=A3=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=20Modification=E3=80=91=E5=85=A8=E5=B1=80unique=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=E5=9C=A8=E6=89=A9=E5=AE=B9=E6=A8=A1=E5=BC=8F=E4=B8=8B?= =?UTF-8?q?=EF=BC=8C=E8=A1=A8=E5=90=8D=E5=AD=97=E2=80=9C/=E2=80=9D?= 
=?UTF-8?q?=E9=9A=90=E6=82=A3=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/emb/base_sparse_embedding.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mx_rec/core/emb/base_sparse_embedding.py b/mx_rec/core/emb/base_sparse_embedding.py index f984697c..1a59bd24 100644 --- a/mx_rec/core/emb/base_sparse_embedding.py +++ b/mx_rec/core/emb/base_sparse_embedding.py @@ -296,6 +296,8 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): # set modify graph self._modify_graph = kwargs.get("modify_graph", True) + if not self._modify_graph and not self._is_hbm: + raise RuntimeError("when the 'ddr or ssd' mode are used, the 'modify graph' is required") # return the stub tensor of the lookup result if not self._use_static: @@ -328,7 +330,9 @@ class BaseSparseEmbedding(metaclass=abc.ABCMeta): return lookup_result if not self._use_static and not self._modify_graph and kwargs.get("batch") is None: - raise RuntimeError("When the 'feature spec' mode and 'dynamic shape' are used, the 'batch' is required.") + raise RuntimeError("when the 'feature spec' mode and 'dynamic shape' are used, the 'batch' is required") + if not self._modify_graph and not self._is_hbm: + raise RuntimeError("when the 'ddr or ssd' mode are used, the 'modify graph' is required") table_name = feature_spec.table_name same_table_feature_spec = \ ConfigInitializer.get_instance().feature_spec_config.table_name_to_feature_spec[table_name][is_training] -- Gitee From 84324b354023eb6f05051ae9c2749c071832e21d Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 16 May 2024 15:15:25 +0800 Subject: [PATCH 137/302] =?UTF-8?q?=E9=80=82=E9=85=8Dno=20ranktable?= =?UTF-8?q?=E5=90=AF=E5=8A=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 2 +- examples/dlrm/model/config.py | 6 +-- examples/dlrm/model/main_mxrec.py | 3 +- examples/dlrm/model/run.sh | 63 +++++++++++++++++++++---------- 4 files changed, 48 insertions(+), 26 deletions(-) diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index eb1d91ea..a1e38897 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -249,7 +249,7 @@ if __name__ == "__main__": warnings.filterwarnings("ignore") rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None - rank_size = int(os.getenv("RANK_SIZE")) if os.getenv("RANK_SIZE") else None + rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None train_steps = 10000 eval_steps = 1360 diff --git a/examples/dlrm/model/config.py b/examples/dlrm/model/config.py index 23b042c2..fd38276d 100644 --- a/examples/dlrm/model/config.py +++ b/examples/dlrm/model/config.py @@ -89,10 +89,10 @@ class LearningRateScheduler: class Config: def __init__(self, ): - self.rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None - tmp = os.getenv("RANK_SIZE") + self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None + tmp = os.getenv("TRAIN_RANK_SIZE") if tmp is None: - raise ValueError("please export RANK_SIZE") + raise ValueError("please export TRAIN_RANK_SIZE") self.rank_size = int(tmp) self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH") diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index 3464f84e..6fda4f0a 100644 --- 
a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -41,7 +41,6 @@ from mx_rec.util.variable import get_dense_and_sparse_variable from mx_rec.util.log import logger from npu_bridge.npu_init import * - npu_plugin.set_device_sat_mode(0) dense_hashtable_seed = 128 @@ -253,7 +252,7 @@ if __name__ == "__main__": warnings.filterwarnings("ignore") rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None - rank_size = int(os.getenv("RANK_SIZE")) if os.getenv("RANK_SIZE") else None + rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None train_steps = 10000 eval_steps = 1360 diff --git a/examples/dlrm/model/run.sh b/examples/dlrm/model/run.sh index f5cb4449..be509608 100644 --- a/examples/dlrm/model/run.sh +++ b/examples/dlrm/model/run.sh @@ -20,10 +20,25 @@ so_path=$1 mx_rec_package_path=$2 hccl_cfg_json=$3 dlrm_criteo_data_path=$4 +ip=$5 # no ranktable时传入该参数 -export RANK_SIZE=8 -echo "RANK_SIZE=${RANK_SIZE}, please make sure hccl configuration json file match this parameter" -export RANK_TABLE_FILE=${hccl_cfg_json} +interface="lo" +num_server=1 +local_rank_size=8 +num_process=$((num_server * local_rank_size)) +export TRAIN_RANK_SIZE=$num_process + +# 删除数据 +echo "CACHE_MODE:${CACHE_MODE}" +if [ ${CACHE_MODE} = "SSD" ]; then + echo "SSD train mode not allow file exist before training, + deleting dir ${cur_path}/ssd_data then create for SSD use case" + rm -rf ssd_data + mkdir ssd_data +fi +rm -rf kernel* +rm -rf /root/ascend/log/* +rm -rf model_dir_rank* op_cache ################# 参数配置 ###################### export USE_DYNAMIC=0 # 0:静态shape;1:动态shape @@ -34,25 +49,11 @@ export USE_MULTI_LOOKUP=0 # 0:一表一查;1:一表多查 export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 ################################################ -echo "CACHE_MODE:${CACHE_MODE}" -if [ ${CACHE_MODE} = "SSD" ]; then - echo "SSD train mode not allow file exist before training, - deleting dir ${cur_path}/ssd_data then create for SSD use case" - rm -rf ssd_data - mkdir ssd_data -fi - export HCCL_CONNECT_TIMEOUT=1200 - export DLRM_CRITEO_DATA_PATH=${dlrm_criteo_data_path} export PYTHONPATH=${mx_rec_package_path}:${so_path}:$PYTHONPATH export LD_PRELOAD=/usr/lib64/libgomp.so.1 export LD_LIBRARY_PATH=${so_path}:/usr/local/lib:$LD_LIBRARY_PATH - -rm -rf kernel* -rm -rf /root/ascend/log/* -rm -rf model_dir_rank* op_cache - export ASCEND_DEVICE_ID=0 export RANK_ID_START=0 export JOB_ID=10086 @@ -78,10 +79,32 @@ echo "MXREC_MODE is $MXREC_MODE" export py=main_mxrec.py echo "py is $py" +# 区分ranktable和no ranktable +if [ -n "$ip" ]; then + # no ranktable分支 + echo "Current is no ranktable solution." + echo "Input node ip: $ip, please make sure this ip is available." 
+ export CM_CHIEF_IP=$ip # 主节点ip + export CM_CHIEF_PORT=60001 # 主节点监听端口 + export CM_CHIEF_DEVICE=0 # 主节点device id + export CM_WORKER_IP=$ip # 当前节点ip + export CM_WORKER_SIZE=$num_process # 参与集群训练的device数量 + echo "CM_CHIEF_IP=$CM_CHIEF_IP" + echo "CM_CHIEF_PORT=$CM_CHIEF_PORT" + echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" + echo "CM_WORKER_IP=$CM_WORKER_IP" + echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" +else + # ranktable分支 + echo "Current is ranktable solution, hccl json file:${hccl_cfg_json}" + export RANK_SIZE=$num_process + echo "RANK_SIZE=${RANK_SIZE}, please make sure hccl configuration json file match this parameter" + export RANK_TABLE_FILE=${hccl_cfg_json} +fi + echo "use horovod to start tasks" # GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' -interface="lo" -horovodrun --network-interface ${interface} -np ${RANK_SIZE} --mpi-args "${mpi_args}" --mpi -H localhost:${RANK_SIZE} \ -python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${RANK_SIZE}p.log +horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ +python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p.log -- Gitee From ae23ba5eed96ec82762ba14be9890e2bb6ab401b Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 16 May 2024 15:21:16 +0800 Subject: [PATCH 138/302] =?UTF-8?q?=E5=A2=9E=E5=8A=A0pandas=E6=A8=A1?= =?UTF-8?q?=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/build_mxRec_images/centos_build/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/build_mxRec_images/centos_build/Dockerfile b/docs/build_mxRec_images/centos_build/Dockerfile index 2d2b3579..190ec21b 100644 --- a/docs/build_mxRec_images/centos_build/Dockerfile +++ b/docs/build_mxRec_images/centos_build/Dockerfile @@ -114,6 +114,7 @@ RUN pip3.7 install -U pip && \ pip3.7 install cffi==1.12.3 && \ pip3.7 install pyyaml && \ pip3.7 install pathlib2 && \ + pip3.7 install pandas && \ pip3.7 install grpcio && \ pip3.7 install grpcio-tools && \ pip3.7 install protobuf==3.20.0 && \ -- Gitee From c0a1b74b3146047143d9de2ff4085fff7e367e7c Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 16 May 2024 15:24:34 +0800 Subject: [PATCH 139/302] =?UTF-8?q?=E5=A2=9E=E5=8A=A0pandas=E6=A8=A1?= =?UTF-8?q?=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fd3b0691..44481bee 100644 --- a/README.md +++ b/README.md @@ -17,17 +17,20 @@ mxRec作为面向互联网市场搜索推荐广告的应用使能SDK产品,对 安装前,请参考《CANN 软件安装指南》安装CANN开发套件软件包和TensorFlow适配昇腾插件。 CANN软件提供进程级环境变量设置脚本,供用户在进程中引用,以自动完成环境变量设置。用户进程结束后自动失效。可在程序启动的Shell脚本中使用如下命令设置CANN的相关环境变量,也可通过命令行执行如下命令(以root用户默认安装路径“/usr/local/Ascend”为例): + ```shell source /usr/local/Ascend/ascend-toolkit/set_env.sh source /usr/local/Ascend/tfplugin/set_env.sh ``` 安装依赖,若未构建镜像,直接在物理机上进行开发,则须安装以下Python依赖 + ```shell -pip3 install numpy decorator sympy==1.4 cffi==1.12.3 pyyaml pathlib2 grpcio grpcio-tools protobuf==3.20.0 scipy requests mpi4py easydict scikit-learn==0.20.0 attrs +pip3 install numpy decorator sympy==1.4 cffi==1.12.3 pyyaml pathlib2 pandas grpcio grpcio-tools protobuf==3.20.0 scipy requests mpi4py easydict 
scikit-learn==0.20.0 attrs ``` horovod依赖安装前需配置“HOROVOD_WITH_MPI”、“HOROVOD_WITH_TENSORFLOW”,依赖安装命令参考如下。 + ```shell HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 pip3.7 install horovod --no-cache-dir ``` @@ -35,6 +38,7 @@ HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 pip3.7 install horovod --no-cache-d ### 二进制包安装 从昇腾开源社区直接获取编译打包后的产品包。解压后包含tf1和tf2两个版本的whl安装包,使用pip命令安装whl包(请根据实际需求,选取对应TensorFlow版本匹配的Wheel包): + ```shell pip3 install mx_rec-{version}-py3-none-linux_{arch}.whl ``` @@ -46,6 +50,7 @@ export PYTHONPATH={mxrec_install_path}:{mxrec_install_path}/mxRec:$PYTHONPATH ``` 如需使用动态扩容功能,进入已解压的mxRec软件包“mindxsdk-mxrec/cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。 + ```shell bash run.sh ``` @@ -53,11 +58,13 @@ bash run.sh ### 源码编译安装 编译环境依赖: + - Python3.7.5 - GCC 7.3.0 - CMake 3.20.6 开源依赖: + - [pybind11 v2.10.3](https://github.com/pybind/pybind11/archive/refs/tags/v2.10.3.zip) - [securec](https://github.com/huaweicloud/huaweicloud-sdk-c-obs/archive/refs/tags/v3.23.9.zip) - [openmpi 4.1.5](https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz): 请参考软件文档在编译环境完成安装 @@ -68,11 +75,18 @@ bash run.sh 为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法: 进入mxRec代码目录: -- setup.py:执行脚本setup.py,比如:**python3.7 setup.py**完成tf1和tf2版本whl包的构建和打包,构建成功后,whl包在build/mindxsdk-mxrec/目录下,其中tf1_whl和tf2_whl目录下存在对应的whl包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 -- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel**完成tf1版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,请参考build/build_tf1.sh创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 -- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel**完成tf2版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,请参考build/build_tf2.sh创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 + +- setup.py:执行脚本setup.py,比如:**python3.7 setup.py** + 完成tf1和tf2版本whl包的构建和打包,构建成功后,whl包在build/mindxsdk-mxrec/目录下,其中tf1_whl和tf2_whl目录下存在对应的whl包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 +- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel** + 完成tf1版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,请参考build/build_tf1.sh创建tf1虚拟环境,在虚拟环境中完成tensorflow + 1.15.0版本的安装,并修改对应的激活命令。 +- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel** + 完成tf2版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,请参考build/build_tf2.sh创建tf2虚拟环境,在虚拟环境中完成tensorflow + 2.6.5版本的安装,并修改对应的激活命令。 如需使用动态扩容功能,进入“./cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。 + ```shell bash run.sh ``` @@ -88,6 +102,7 @@ bash run.sh - pytest-html 如需使用python测试用例,需要先安装上述依赖以及能够在tf1环境下进行源码编译,然后进入tests目录中。参考以下命令执行python侧测试用例: + ```shell bash run_python_dt.sh ``` @@ -108,11 +123,13 @@ emock-0.9.0.zip、pybind11-2.10.3.zip、 huaweicloud-sdk-c-obs-3.23.9.zip。如 如需使用C++测试用例,需要按照上述描述准备需要的依赖,准备好之后,进入src目录中。参考以下命令执行C++测试用例: tf1环境下使用如下命令: + ```shell bash test_ut.sh tf1 ``` tf2环境下使用如下命令: + ```shell bash test_ut.sh tf2 ``` -- Gitee From 80cc0503448850b5d0c3c9a289480647f3cb0df3 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 16 May 2024 15:42:59 +0800 Subject: [PATCH 140/302] =?UTF-8?q?readme=E5=A2=9E=E5=8A=A0pandas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 44481bee..17d38fcd 100644 --- a/README.md +++ 
b/README.md @@ -17,20 +17,17 @@ mxRec作为面向互联网市场搜索推荐广告的应用使能SDK产品,对 安装前,请参考《CANN 软件安装指南》安装CANN开发套件软件包和TensorFlow适配昇腾插件。 CANN软件提供进程级环境变量设置脚本,供用户在进程中引用,以自动完成环境变量设置。用户进程结束后自动失效。可在程序启动的Shell脚本中使用如下命令设置CANN的相关环境变量,也可通过命令行执行如下命令(以root用户默认安装路径“/usr/local/Ascend”为例): - ```shell source /usr/local/Ascend/ascend-toolkit/set_env.sh source /usr/local/Ascend/tfplugin/set_env.sh ``` 安装依赖,若未构建镜像,直接在物理机上进行开发,则须安装以下Python依赖 - ```shell pip3 install numpy decorator sympy==1.4 cffi==1.12.3 pyyaml pathlib2 pandas grpcio grpcio-tools protobuf==3.20.0 scipy requests mpi4py easydict scikit-learn==0.20.0 attrs ``` horovod依赖安装前需配置“HOROVOD_WITH_MPI”、“HOROVOD_WITH_TENSORFLOW”,依赖安装命令参考如下。 - ```shell HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 pip3.7 install horovod --no-cache-dir ``` @@ -38,7 +35,6 @@ HOROVOD_WITH_MPI=1 HOROVOD_WITH_TENSORFLOW=1 pip3.7 install horovod --no-cache-d ### 二进制包安装 从昇腾开源社区直接获取编译打包后的产品包。解压后包含tf1和tf2两个版本的whl安装包,使用pip命令安装whl包(请根据实际需求,选取对应TensorFlow版本匹配的Wheel包): - ```shell pip3 install mx_rec-{version}-py3-none-linux_{arch}.whl ``` @@ -50,7 +46,6 @@ export PYTHONPATH={mxrec_install_path}:{mxrec_install_path}/mxRec:$PYTHONPATH ``` 如需使用动态扩容功能,进入已解压的mxRec软件包“mindxsdk-mxrec/cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。 - ```shell bash run.sh ``` @@ -58,13 +53,11 @@ bash run.sh ### 源码编译安装 编译环境依赖: - - Python3.7.5 - GCC 7.3.0 - CMake 3.20.6 开源依赖: - - [pybind11 v2.10.3](https://github.com/pybind/pybind11/archive/refs/tags/v2.10.3.zip) - [securec](https://github.com/huaweicloud/huaweicloud-sdk-c-obs/archive/refs/tags/v3.23.9.zip) - [openmpi 4.1.5](https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.5.tar.gz): 请参考软件文档在编译环境完成安装 @@ -75,18 +68,11 @@ bash run.sh 为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法: 进入mxRec代码目录: - -- setup.py:执行脚本setup.py,比如:**python3.7 setup.py** - 完成tf1和tf2版本whl包的构建和打包,构建成功后,whl包在build/mindxsdk-mxrec/目录下,其中tf1_whl和tf2_whl目录下存在对应的whl包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 -- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel** - 完成tf1版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,请参考build/build_tf1.sh创建tf1虚拟环境,在虚拟环境中完成tensorflow - 1.15.0版本的安装,并修改对应的激活命令。 -- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel** - 完成tf2版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,请参考build/build_tf2.sh创建tf2虚拟环境,在虚拟环境中完成tensorflow - 2.6.5版本的安装,并修改对应的激活命令。 +- setup.py:执行脚本setup.py,比如:**python3.7 setup.py**完成tf1和tf2版本whl包的构建和打包,构建成功后,whl包在build/mindxsdk-mxrec/目录下,其中tf1_whl和tf2_whl目录下存在对应的whl包。执行脚本前,请参考build/build_tf1.sh、build/build_tf2.sh创建对应的虚拟环境,在虚拟环境中完成对应tensorflow版本的安装,并修改对应的激活命令。 +- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel**完成tf1版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf1_whl子目录下。执行脚本前,请参考build/build_tf1.sh创建tf1虚拟环境,在虚拟环境中完成tensorflow 1.15.0版本的安装,并修改对应的激活命令。 +- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel**完成tf2版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf2_whl子目录下。执行脚本前,请参考build/build_tf2.sh创建tf2虚拟环境,在虚拟环境中完成tensorflow 2.6.5版本的安装,并修改对应的激活命令。 如需使用动态扩容功能,进入“./cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。 - ```shell bash run.sh ``` @@ -102,7 +88,6 @@ bash run.sh - pytest-html 如需使用python测试用例,需要先安装上述依赖以及能够在tf1环境下进行源码编译,然后进入tests目录中。参考以下命令执行python侧测试用例: - ```shell bash run_python_dt.sh ``` @@ -123,13 +108,11 @@ emock-0.9.0.zip、pybind11-2.10.3.zip、 huaweicloud-sdk-c-obs-3.23.9.zip。如 如需使用C++测试用例,需要按照上述描述准备需要的依赖,准备好之后,进入src目录中。参考以下命令执行C++测试用例: tf1环境下使用如下命令: - ```shell bash 
test_ut.sh tf1 ``` tf2环境下使用如下命令: - ```shell bash test_ut.sh tf2 ``` -- Gitee From 75539d487923064b20dce016beeffa4a92f1cdad Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 16 May 2024 21:37:35 +0800 Subject: [PATCH 141/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9main=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E5=86=85=E5=88=A0=E9=99=A4=E5=B7=B2=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 23 +++++++++-- examples/demo/little_demo/main.py | 42 +++++++++++++-------- examples/demo/little_demo/run.sh | 16 +------- examples/demo/little_demo_estimator/main.py | 24 ++++++++++++ examples/demo/little_demo_estimator/run.sh | 7 +--- 5 files changed, 72 insertions(+), 40 deletions(-) diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index a1e38897..fb2efdee 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -13,13 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +import numpy as np +import os +import random +import shutil import time import warnings -import random from glob import glob - from sklearn.metrics import roc_auc_score -import numpy as np + from npu_bridge.npu_init import * from model import MyModel @@ -244,11 +246,24 @@ def create_feature_spec_list(use_timestamp=False): return feature_spec_list +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + + if __name__ == "__main__": tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) warnings.filterwarnings("ignore") + _clear_saved_model() - rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None train_steps = 10000 diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index a6ef96fc..426eb64b 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -148,23 +148,34 @@ def create_feature_spec_list(use_timestamp=False): return feature_spec_list -def clear_saved_model(): +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + _del_related_dir("kernel*") + _del_related_dir("export_graph") + mode = UseMode.mapping(os.getenv("USE_MODE")) - if mode == UseMode.TRAIN: - logger.info("current mode is train, will delete previous saved model data if exist.") - save_model_path = os.path.join(os.getcwd(), "saved-model") - shutil.rmtree(save_model_path, ignore_errors=True) - if not (os.getenv("CACHE_MODE", "") == CacheModeEnum.SSD.value and mode == UseMode.TRAIN): + if mode != UseMode.TRAIN: return + logger.info("current mode is train, will delete previous saved model data if exist.") + _del_related_dir("saved-model") - # ssd not 
allow overwrite file, should clear it before training - logger.info("current cache mode is SSD, will delete previous saved ssd data if exist.") - for part_path in _SSD_SAVE_PATH: - if "/" not in part_path and "\\" not in part_path: - part_path = os.path.join(os.getcwd(), part_path) - shutil.rmtree(part_path, ignore_errors=True) + if not (os.getenv("CACHE_MODE", "") == CacheModeEnum.SSD.value): + return + logger.info("current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + " then create empty directory for this use case.") + for sub_path in _SSD_SAVE_PATH: + _del_related_dir(sub_path) try: - os.mkdir(part_path) + os.mkdir(sub_path) except OSError: logger.warning("ssd path has exist") # 多进程并行,忽略异常 @@ -172,6 +183,7 @@ def clear_saved_model(): if __name__ == "__main__": tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) warnings.filterwarnings("ignore") + _clear_saved_model() use_mode = UseMode.mapping(os.getenv("USE_MODE")) # 最大数据集生成数量 @@ -217,7 +229,7 @@ if __name__ == "__main__": if len(model_file) == 0: raise ValueError(f"get USE_MODE:{use_mode}, but no model file exist at:{load_path_pattern}") if_load = True - + # nbatch function needs to be used together with the prefetch and host_vocabulary_size != 0 init(train_steps=TRAIN_STEPS, eval_steps=EVAL_STEPS, @@ -267,7 +279,7 @@ if __name__ == "__main__": if cache_mode in ["DDR", "SSD"] and not use_dynamic: logger.warning("when cache_mode in [DDR, SSD], suggest use_dynamic=true to avoid tuning size parameter") emb_initializer = tf.compat.v1.constant_initializer(0) if USE_DETERMINISTIC \ - else tf.compat.v1.truncated_normal_initializer() + else tf.compat.v1.truncated_normal_initializer() user_hashtable = create_table(key_dtype=tf.int64, dim=tf.TensorShape([cfg.user_hashtable_dim]), name='user_table', diff --git a/examples/demo/little_demo/run.sh b/examples/demo/little_demo/run.sh index d585be02..5b45af84 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -15,26 +15,12 @@ # ============================================================================== kill -9 `ps -ef | grep python | grep -v grep | awk '{print $2}'` > /dev/null 2>&1 -rm -rf /root/ascend/log/* -rm -rf ./kernel* -rm -rf ./export_graph/* # 支持[train, load_and_train, predict] -export USE_MODE="train" -if [ $USE_MODE = "train" ]; then - echo "train mode: saved-model will be deleted" - rm -rf ./saved-model -fi +export USE_MODE="train" # if train mode, will remove dir ./saved-model # cache mode support: HBM, DDR, SSD export CACHE_MODE="HBM" -if [ $CACHE_MODE = "SSD" ] && [ $USE_MODE = "train" ]; then - echo "SSD train mode not allow file exist in directory when training a model from stratch in case overwrite, - deleting directory ssd_data then create for this use case" - rm -rf ssd_data - mkdir ssd_data -fi - # 获取输入参数:py、ip if [ $# -ge 1 ]; then diff --git a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py index cca5a7a5..20b7381d 100644 --- a/examples/demo/little_demo_estimator/main.py +++ b/examples/demo/little_demo_estimator/main.py @@ -17,6 +17,8 @@ import argparse import os +import shutil +from glob import glob import tensorflow as tf from mx_rec.util.initialize import init, terminate_config_initializer @@ -142,6 +144,27 @@ def create_feature_spec_list(use_timestamp=False): return feature_spec_list +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) 
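+    # glob() returns an empty list when nothing matches, so rerunning the
+    # cleanup on an already-clean workspace is a harmless no-op.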
+ for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + _del_related_dir("kernel*") + _del_related_dir("export_graph") + + mode = args.run_mode + if not mode.startswith("train"): + return + logger.info("current mode contains train, will delete previous saved model data if exist.") + _del_related_dir("_rank*") + + if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--run_mode', type=str, default='train_and_evaluate') # 运行模式,在run.sh中进行配置 @@ -185,6 +208,7 @@ if __name__ == '__main__': args.eval_steps = -1 elif args.run_mode == 'train_and_evaluate': args.save_checkpoints_steps = args.train_steps + _clear_saved_model() # set init init(train_steps=args.train_steps, diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index f3d34c82..79fdc3f4 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -83,12 +83,7 @@ export TF_CPP_MIN_LOG_LEVEL=3 # tensorflow日志级别,3对应FATAL # 设置应用类日志的全局日志级别及各模块日志级别,具体请参考昇腾官网CANN文档 export ASCEND_GLOBAL_LOG_LEVEL=3 # “设置日志级别”章节0:debug, 1:info, 2:warning, 3:error, 4:NULL export MXREC_MODE="ASC" -export USE_MODE="train_and_evaluate" # 支持[train, predict, train_and_evaluate] - -if [ $USE_MODE = "train" ] || [ $USE_MODE = "train_and_evaluate" ];then - echo "train mode: saved-model will be deleted" - rm -rf ./_rank* -fi +export USE_MODE="train_and_evaluate" # 支持[train, predict, train_and_evaluate],train相关模式将删除./_rank*目录 ################# 参数配置 ###################### export USE_DYNAMIC=1 # 0:静态shape;1:动态shape -- Gitee From 639e33ad4eab81762d607ae00368406e9dc97df7 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 16 May 2024 21:47:23 +0800 Subject: [PATCH 142/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9main=E8=84=9A?= =?UTF-8?q?=E6=9C=AC=E5=86=85=E5=88=A0=E9=99=A4=E5=B7=B2=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index 426eb64b..d8dd851a 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -176,6 +176,7 @@ def _clear_saved_model() -> None: _del_related_dir(sub_path) try: os.mkdir(sub_path) + logger.info(f"mkdir dir:{sub_path}") except OSError: logger.warning("ssd path has exist") # 多进程并行,忽略异常 -- Gitee From d9e866bb86c9654e24294fdb0a9e27202aaba036 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 16 May 2024 22:02:51 +0800 Subject: [PATCH 143/302] =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/main_mxrec.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index fb2efdee..f789b2c5 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -import numpy as np + import os import random import shutil @@ -22,6 +22,8 @@ import warnings from glob import glob from sklearn.metrics import roc_auc_score +import numpy as np + from npu_bridge.npu_init import * from model import MyModel -- Gitee From b05c404ddf42e4593d83d57c72a762ad9fe95c88 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Fri, 17 May 2024 10:34:53 +0800 Subject: [PATCH 144/302] =?UTF-8?q?=E5=88=A0=E9=99=A4run=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E5=86=97=E4=BD=99=E6=8C=87=E4=BB=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo_estimator/run.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index 79fdc3f4..8bb43b19 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -15,9 +15,6 @@ # ============================================================================== kill -9 `ps -ef | grep python | grep -v grep | awk '{print $2}'` > /dev/null 2>&1 -rm -rf /root/ascend/log/* -rm -rf ./kernel* -rm -rf ./export_graph/* # 获取输入参数:py、ip if [ $# -ge 1 ]; then -- Gitee From 1cef4c2bdfc0431d9027702071ef4d3a94028273 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Fri, 17 May 2024 15:40:35 +0800 Subject: [PATCH 145/302] =?UTF-8?q?=E5=BC=95=E7=94=A8=E5=BD=93=E5=89=8D?= =?UTF-8?q?=E7=9B=AE=E5=BD=95=E4=B8=8Bconfig=E6=96=87=E4=BB=B6=E5=92=8Cdlr?= =?UTF-8?q?m=E8=A7=A3=E9=99=A4=E8=80=A6=E5=90=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/config.py | 230 +++++++++++++++++++++++++++++++++++ examples/DCNv2/main_mxrec.py | 2 +- 2 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 examples/DCNv2/config.py diff --git a/examples/DCNv2/config.py b/examples/DCNv2/config.py new file mode 100644 index 00000000..fd38276d --- /dev/null +++ b/examples/DCNv2/config.py @@ -0,0 +1,230 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os + +import tensorflow as tf +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig +from npu_bridge.estimator.npu.npu_config import NPURunConfig + + +class LearningRateScheduler: + """ + LR Scheduler combining Polynomial Decay with Warmup at the beginning. + TF-based cond operations necessary for performance in graph mode. 
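+
+    Schedule, as a function of the global step:
+      1. warmup:   the factor ramps linearly from 0 to 1 over `warmup_steps`;
+      2. constant: the factor holds at 1.0 until `decay_start_step`;
+      3. decay:    the factor follows
+                   ((decay_end_step - step) / decay_steps) ** poly_power with
+                   poly_power = 2.0, settling at 1 / decay_steps once the
+                   decay window has passed.
+    The same factor scales both base rates, so `calc` returns the pair
+    (lr_dense, lr_sparse).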
+ """ + + def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps): + self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32) + self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32) + self.decay_steps = tf.constant(decay_steps) + self.decay_end_step = decay_start_step + decay_steps # 65041 + self.poly_power = 2.0 + self.base_lr_dense = base_lr_dense + self.base_lr_sparse = base_lr_sparse + + def calc(self, global_step): + # used for the warmup stage + warmup_step = tf.cast(1 / self.warmup_steps, tf.float32) + lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step + lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32) + # used for the constant stage + lr_factor_constant = tf.cast(1.0, tf.float32) + + # used for the decay stage + lr_factor_decay = (self.decay_end_step - global_step) / self.decay_steps + lr_factor_decay = tf.math.pow(lr_factor_decay, self.poly_power) + lr_factor_decay = tf.cast(lr_factor_decay, tf.float32) + sparse_after_decay = tf.cast(1 / self.decay_steps, tf.float32) + + lr_factor_decay_sparse = tf.cond( + global_step < self.decay_end_step, + lambda: lr_factor_decay, + lambda: sparse_after_decay, + ) + + lr_factor_decay_dense = tf.cond( + global_step < self.decay_end_step, + lambda: lr_factor_decay, + lambda: sparse_after_decay, + ) + + poly_schedule_sparse = tf.cond( + global_step < self.decay_start_step, + lambda: lr_factor_constant, + lambda: lr_factor_decay_sparse, + ) + + poly_schedule_dense = tf.cond( + global_step < self.decay_start_step, + lambda: lr_factor_constant, + lambda: lr_factor_decay_dense, + ) + + lr_factor_sparse = tf.cond( + global_step < self.warmup_steps, lambda: lr_factor_warmup, lambda: poly_schedule_sparse + ) + + lr_factor_dense = tf.cond( + global_step < self.warmup_steps, lambda: lr_factor_warmup, lambda: poly_schedule_dense + ) + + lr_sparse = self.base_lr_sparse * lr_factor_sparse + lr_dense = self.base_lr_dense * lr_factor_dense + return lr_dense, lr_sparse + + +class Config: + def __init__(self, ): + self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None + tmp = os.getenv("TRAIN_RANK_SIZE") + if tmp is None: + raise ValueError("please export TRAIN_RANK_SIZE") + self.rank_size = int(tmp) + + self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH") + self.train_file_pattern = "train" + self.test_file_pattern = "test" + + self.batch_size = 8192 + self.line_per_sample = 1024 + self.train_epoch = 3 + self.test_epoch = 1 + self.perform_shuffle = False + + self.key_type = tf.int64 + self.label_type = tf.float32 + self.value_type = tf.int64 + + self.feat_cnt = 26 + self.__set_emb_table_size() + + self.field_num = 26 + self.send_count = 46000 // self.rank_size + + self.emb_dim = 128 + self.hashtable_threshold = 1 + + self.USE_PIPELINE_TEST = False + + # 动态学习率 + GLOBAL_BATCH_SIZE = 8192 * 8 + LR_SCHEDULE_STEPS = [ + int(2750 * 55296 / GLOBAL_BATCH_SIZE), + int(49315 * 55296 / GLOBAL_BATCH_SIZE), + int(27772 * 55296 / GLOBAL_BATCH_SIZE), + ] + self.global_step = tf.Variable(0, trainable=False) + _lr_scheduler = LearningRateScheduler( + 28.443, + 33.71193, + LR_SCHEDULE_STEPS[0], + LR_SCHEDULE_STEPS[1], + LR_SCHEDULE_STEPS[2], + ) + self.learning_rate = _lr_scheduler.calc(self.global_step) + + def __set_emb_table_size(self): + self.cache_mode = os.getenv("CACHE_MODE") + if self.cache_mode is None: + raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") + + if self.cache_mode == 
"HBM": + self.dev_vocab_size = 24_000_000 * self.rank_size + self.host_vocab_size = 0 + elif self.cache_mode == "DDR": + self.dev_vocab_size = 500_000 * self.rank_size + self.host_vocab_size = 24_000_000 * self.rank_size + elif self.cache_mode == "SSD": + self.dev_vocab_size = 100_000 * self.rank_size + self.host_vocab_size = 2_000_000 * self.rank_size + self.ssd_vocab_size = 24_000_000 * self.rank_size + else: + raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") + + def get_emb_table_cfg(self) -> dict: + if self.cache_mode == "HBM": + return {"device_vocabulary_size": self.dev_vocab_size} + elif self.cache_mode == "DDR": + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size} + elif self.cache_mode == "SSD": + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size, + "ssd_vocabulary_size": self.ssd_vocab_size, + "ssd_data_path": ["ssd_data"]} + else: + raise RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation") + + +def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["mix_compile_mode"].b = False + custom_op.parameter_map["use_off_line"].b = True + custom_op.parameter_map["min_group_size"].b = 1 + # 可选配置level0:pairwise;level1:pairwise + custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:fullmesh") + custom_op.parameter_map["enable_data_pre_proc"].b = True + custom_op.parameter_map["iterations_per_loop"].i = 10 + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["hcom_parallel"].b = False + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") + custom_op.parameter_map["op_execute_timeout"].i = 2000 + custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes( + str(13 * 1024 * 1024 * 1024)) # total 31 need 13; + custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(18 * 1024 * 1024 * 1024)) # need 25 + custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes("DNN_VM_AICPU:3,AIcoreEngine:3") + + if dump_data: + custom_op.parameter_map["enable_dump"].b = True + custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(dump_path) + custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes(dump_steps) + custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all") + + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + return session_config + + +def get_npu_run_config(): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + run_config = NPURunConfig( + save_summary_steps=1000, + save_checkpoints_steps=100, + keep_checkpoint_max=5, + 
session_config=session_config, + log_step_count_steps=20, + precision_mode='allow_mix_precision', + enable_data_pre_proc=True, + iterations_per_loop=1, + jit_compile=False, + op_compiler_cache_mode="enable", + HCCL_algorithm="level0:fullmesh;level1:fullmesh" # 可选配置:level0:pairwise;level1:pairwise + ) + return run_config diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index f789b2c5..18ab273e 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -27,7 +27,7 @@ import numpy as np from npu_bridge.npu_init import * from model import MyModel -from dlrm.model.config import sess_config, Config +from config import sess_config, Config from optimizer import get_dense_and_sparse_optimizer from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline -- Gitee From 7a77b33512a726a4dff6e70b5aa6cd2f6dbef67f Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Fri, 17 May 2024 15:47:37 +0800 Subject: [PATCH 146/302] =?UTF-8?q?=E5=88=A0=E9=99=A4config=E6=96=87?= =?UTF-8?q?=E4=BB=B6=E4=B8=AD=E5=86=97=E4=BD=99=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/config.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/examples/DCNv2/config.py b/examples/DCNv2/config.py index fd38276d..73ab2797 100644 --- a/examples/DCNv2/config.py +++ b/examples/DCNv2/config.py @@ -18,7 +18,6 @@ import os import tensorflow as tf from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig -from npu_bridge.estimator.npu.npu_config import NPURunConfig class LearningRateScheduler: @@ -202,29 +201,3 @@ def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF return session_config - - -def get_npu_run_config(): - session_config = tf.ConfigProto(allow_soft_placement=False, - log_device_placement=False) - - session_config.gpu_options.allow_growth = True - custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() - custom_op.name = "NpuOptimizer" - session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF - session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF - - run_config = NPURunConfig( - save_summary_steps=1000, - save_checkpoints_steps=100, - keep_checkpoint_max=5, - session_config=session_config, - log_step_count_steps=20, - precision_mode='allow_mix_precision', - enable_data_pre_proc=True, - iterations_per_loop=1, - jit_compile=False, - op_compiler_cache_mode="enable", - HCCL_algorithm="level0:fullmesh;level1:fullmesh" # 可选配置:level0:pairwise;level1:pairwise - ) - return run_config -- Gitee From a610ddbf53660ac128bb1122a43941013a44d4f5 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Fri, 17 May 2024 08:35:38 +0000 Subject: [PATCH 147/302] =?UTF-8?q?!138=20=E3=80=90=E9=9C=80=E6=B1=82?= =?UTF-8?q?=E3=80=91=E6=96=B0=E5=A2=9E=E5=8A=A8=E6=80=81=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E5=9C=BA=E6=99=AFadagrad=E5=AE=9E=E7=8E=B0=20*=20update=20mx?= =?UTF-8?q?=5Frec/optimizers/adagrad=5Fby=5Faddr.py.=20*=20update=20mx=5Fr?= =?UTF-8?q?ec/optimizers/adagrad=5Fby=5Faddr.py.=20*=20add=20mx=5Frec/opti?= =?UTF-8?q?mizers/adagrad=5Fby=5Faddr.py.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/optimizers/adagrad_by_addr.py | 125 +++++++++++++++++++++++++++ 1 file changed, 125 
insertions(+) create mode 100644 mx_rec/optimizers/adagrad_by_addr.py diff --git a/mx_rec/optimizers/adagrad_by_addr.py b/mx_rec/optimizers/adagrad_by_addr.py new file mode 100644 index 00000000..72f1d86e --- /dev/null +++ b/mx_rec/optimizers/adagrad_by_addr.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import, division, print_function + +from typing import List + +import tensorflow as tf +from tensorflow.python.ops import math_ops +from tensorflow.python.training import adagrad +from tensorflow.python.training.optimizer import Optimizer + +from mx_rec.optimizers.base import CustomizedOptimizer +from mx_rec.util.initialize import ConfigInitializer +from mx_rec.util.ops import import_host_pipeline_ops +from mx_rec.validator.validator import ( + FloatValidator, + StringValidator, + para_checker_decorator, +) + + +@para_checker_decorator( + check_option_list=[ + ("learning_rate", FloatValidator, {"min_value": 0.0, "max_value": 10.0}, ["check_value"]), + ( + "initial_accumulator_value", + FloatValidator, + {"min_value": 0.0, "max_value": 1.0}, + ["check_value_for_left_open_interval"], + ), + ("name", StringValidator, {"min_len": 1, "max_len": 200}, ["check_string_length"]), + ] +) +def create_hash_optimizer_by_address(learning_rate=0.001, initial_accumulator_value=0.9, name="Adagrad") -> Optimizer: + """Create an instance of adagrad hash optimizer. + + Args: + learning_rate: A `Tensor` or a floating point value. The learning rate. + initial_accumulator_value: A floating point value. Starting value for the accumulators, must be positive. + name: Optional name prefix for the operations created when applying gradients. Defaults to "Adagrad". + + Returns: + Adagrad hash optimizer instance + + Raises: + ValueError: If `use_dynamic_expansion` was not set. 
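+
+    Note:
+        Each table row stores the embedding weights followed by their Adagrad
+        accumulator, so `_apply_sparse` looks up 2 * dim values per address,
+        computes s_t = s_{t-1} + grad ** 2, and writes back the combined delta
+        [-learning_rate * grad / sqrt(s_t + epsilon), s_t - s_{t-1}] in a
+        single embedding_update_by_address call (epsilon = 1e-7).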
+ """ + if not ConfigInitializer.get_instance().use_dynamic_expansion: + raise ValueError( + "dynamic expansion mode is not compatible with the optimizer, please config dynamic " + "expansion mode and optimizer correctly" + ) + optimizer = CustomizedAdagradByAddress( + learning_rate=learning_rate, + initial_accumulator_value=initial_accumulator_value, + name=name, + ) + ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer + return optimizer + + +class CustomizedAdagradByAddress(adagrad.AdagradOptimizer, CustomizedOptimizer): + def __init__( + self, + learning_rate: float, + initial_accumulator_value: float, + name="Adagrad", + ): + self.optimizer_type = "Adagrad" + self.optim_param_list = ["accumulator"] + super(CustomizedAdagradByAddress, self)._get_name(name=name) + super(CustomizedAdagradByAddress, self).__init__( + learning_rate=learning_rate, + initial_accumulator_value=initial_accumulator_value, + name=self.unique_name, + ) + self._epsilon = 1e-7 + self._slot_num = 1 + self._derivative = 2 + + def get_slot_init_values(self) -> List[float]: + # return state value list of adagrad that needs to initialize in ASC DDR. + return [self._initial_accumulator_value] + + def _apply_sparse(self, grad: tf.Tensor, var: tf.Tensor) -> tf.Operation: + grad, var = self.sum_same_id_gradients(grad=grad, var=var, is_expansion=True) + learning_rate_tensor = math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype) + epsilon = math_ops.cast(self._epsilon, grad.dtype.base_dtype) + + host_pipeline_ops = import_host_pipeline_ops() + dim = grad.shape.as_list()[-1] + + combined_tensor = host_pipeline_ops.embedding_lookup_by_address(var, embedding_dim=2 * dim, embedding_type=1) + split_length = [dim] + [dim] + split_tensors = tf.split(combined_tensor, split_length, axis=1) + + old_s_slice = split_tensors[1] + s_t_slice = old_s_slice + math_ops.square(grad) + + denominator_slice = math_ops.sqrt(s_t_slice + epsilon) + + update_list = [tf.divide(-learning_rate_tensor * grad, denominator_slice)] + [s_t_slice - old_s_slice] + update_tensor = tf.concat(update_list, axis=1) + var_update_op = host_pipeline_ops.embedding_update_by_address(var, update_tensor, update_type=0) + + return var_update_op + + def _create_slots(self, var_list: List[tf.Variable]): + # slot变量由lookup算子控制 跳过父类的实现 + pass -- Gitee From ddbfce3d7bffa17f05ccb56dad65550ac8b80618 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 20 May 2024 08:53:07 +0000 Subject: [PATCH 148/302] =?UTF-8?q?!142=20=E6=94=AF=E6=8C=81no=20ranktable?= =?UTF-8?q?=EF=BC=8Cmain=E8=84=9A=E6=9C=AC=E5=86=85=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E5=B7=B2=E4=BF=9D=E5=AD=98=E6=95=B0=E6=8D=AE=20*=20=E5=88=A0?= =?UTF-8?q?=E9=99=A4config=E6=96=87=E4=BB=B6=E4=B8=AD=E5=86=97=E4=BD=99?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=20*=20=E5=BC=95=E7=94=A8=E5=BD=93=E5=89=8D?= =?UTF-8?q?=E7=9B=AE=E5=BD=95=E4=B8=8Bconfig=E6=96=87=E4=BB=B6=E5=92=8Cdlr?= =?UTF-8?q?m=E8=A7=A3=E9=99=A4=E8=80=A6=E5=90=88=20*=20=E5=88=A0=E9=99=A4r?= =?UTF-8?q?un=E8=84=9A=E6=9C=AC=E5=86=97=E4=BD=99=E6=8C=87=E4=BB=A4=20*=20?= =?UTF-8?q?=E9=97=A8=E7=A6=81=E4=BF=AE=E6=94=B9=20*=20=E4=BF=AE=E6=94=B9ma?= =?UTF-8?q?in=E8=84=9A=E6=9C=AC=E5=86=85=E5=88=A0=E9=99=A4=E5=B7=B2?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E6=96=87=E4=BB=B6=20*=20=E4=BF=AE=E6=94=B9ma?= =?UTF-8?q?in=E8=84=9A=E6=9C=AC=E5=86=85=E5=88=A0=E9=99=A4=E5=B7=B2?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E6=96=87=E4=BB=B6=20*=20=E9=80=82=E9=85=8Dno?= =?UTF-8?q?=20ranktable=E5=90=AF=E5=8A=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- examples/DCNv2/config.py | 203 ++++++++++++++++++++ examples/DCNv2/main_mxrec.py | 27 ++- examples/demo/little_demo/main.py | 43 +++-- examples/demo/little_demo/run.sh | 16 +- examples/demo/little_demo_estimator/main.py | 24 +++ examples/demo/little_demo_estimator/run.sh | 10 +- examples/dlrm/model/config.py | 6 +- examples/dlrm/model/main_mxrec.py | 3 +- examples/dlrm/model/run.sh | 63 ++++-- 9 files changed, 326 insertions(+), 69 deletions(-) create mode 100644 examples/DCNv2/config.py diff --git a/examples/DCNv2/config.py b/examples/DCNv2/config.py new file mode 100644 index 00000000..73ab2797 --- /dev/null +++ b/examples/DCNv2/config.py @@ -0,0 +1,203 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os + +import tensorflow as tf +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig + + +class LearningRateScheduler: + """ + LR Scheduler combining Polynomial Decay with Warmup at the beginning. + TF-based cond operations necessary for performance in graph mode. + """ + + def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps): + self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32) + self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32) + self.decay_steps = tf.constant(decay_steps) + self.decay_end_step = decay_start_step + decay_steps # 65041 + self.poly_power = 2.0 + self.base_lr_dense = base_lr_dense + self.base_lr_sparse = base_lr_sparse + + def calc(self, global_step): + # used for the warmup stage + warmup_step = tf.cast(1 / self.warmup_steps, tf.float32) + lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step + lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32) + # used for the constant stage + lr_factor_constant = tf.cast(1.0, tf.float32) + + # used for the decay stage + lr_factor_decay = (self.decay_end_step - global_step) / self.decay_steps + lr_factor_decay = tf.math.pow(lr_factor_decay, self.poly_power) + lr_factor_decay = tf.cast(lr_factor_decay, tf.float32) + sparse_after_decay = tf.cast(1 / self.decay_steps, tf.float32) + + lr_factor_decay_sparse = tf.cond( + global_step < self.decay_end_step, + lambda: lr_factor_decay, + lambda: sparse_after_decay, + ) + + lr_factor_decay_dense = tf.cond( + global_step < self.decay_end_step, + lambda: lr_factor_decay, + lambda: sparse_after_decay, + ) + + poly_schedule_sparse = tf.cond( + global_step < self.decay_start_step, + lambda: lr_factor_constant, + lambda: lr_factor_decay_sparse, + ) + + poly_schedule_dense = tf.cond( + global_step < self.decay_start_step, + lambda: lr_factor_constant, + lambda: lr_factor_decay_dense, + ) + + lr_factor_sparse = tf.cond( + global_step < self.warmup_steps, lambda: lr_factor_warmup, lambda: poly_schedule_sparse + ) + + lr_factor_dense = tf.cond( + global_step < 
self.warmup_steps, lambda: lr_factor_warmup, lambda: poly_schedule_dense + ) + + lr_sparse = self.base_lr_sparse * lr_factor_sparse + lr_dense = self.base_lr_dense * lr_factor_dense + return lr_dense, lr_sparse + + +class Config: + def __init__(self, ): + self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None + tmp = os.getenv("TRAIN_RANK_SIZE") + if tmp is None: + raise ValueError("please export TRAIN_RANK_SIZE") + self.rank_size = int(tmp) + + self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH") + self.train_file_pattern = "train" + self.test_file_pattern = "test" + + self.batch_size = 8192 + self.line_per_sample = 1024 + self.train_epoch = 3 + self.test_epoch = 1 + self.perform_shuffle = False + + self.key_type = tf.int64 + self.label_type = tf.float32 + self.value_type = tf.int64 + + self.feat_cnt = 26 + self.__set_emb_table_size() + + self.field_num = 26 + self.send_count = 46000 // self.rank_size + + self.emb_dim = 128 + self.hashtable_threshold = 1 + + self.USE_PIPELINE_TEST = False + + # 动态学习率 + GLOBAL_BATCH_SIZE = 8192 * 8 + LR_SCHEDULE_STEPS = [ + int(2750 * 55296 / GLOBAL_BATCH_SIZE), + int(49315 * 55296 / GLOBAL_BATCH_SIZE), + int(27772 * 55296 / GLOBAL_BATCH_SIZE), + ] + self.global_step = tf.Variable(0, trainable=False) + _lr_scheduler = LearningRateScheduler( + 28.443, + 33.71193, + LR_SCHEDULE_STEPS[0], + LR_SCHEDULE_STEPS[1], + LR_SCHEDULE_STEPS[2], + ) + self.learning_rate = _lr_scheduler.calc(self.global_step) + + def __set_emb_table_size(self): + self.cache_mode = os.getenv("CACHE_MODE") + if self.cache_mode is None: + raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") + + if self.cache_mode == "HBM": + self.dev_vocab_size = 24_000_000 * self.rank_size + self.host_vocab_size = 0 + elif self.cache_mode == "DDR": + self.dev_vocab_size = 500_000 * self.rank_size + self.host_vocab_size = 24_000_000 * self.rank_size + elif self.cache_mode == "SSD": + self.dev_vocab_size = 100_000 * self.rank_size + self.host_vocab_size = 2_000_000 * self.rank_size + self.ssd_vocab_size = 24_000_000 * self.rank_size + else: + raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") + + def get_emb_table_cfg(self) -> dict: + if self.cache_mode == "HBM": + return {"device_vocabulary_size": self.dev_vocab_size} + elif self.cache_mode == "DDR": + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size} + elif self.cache_mode == "SSD": + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size, + "ssd_vocabulary_size": self.ssd_vocab_size, + "ssd_data_path": ["ssd_data"]} + else: + raise RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation") + + +def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["mix_compile_mode"].b = False + custom_op.parameter_map["use_off_line"].b = True + custom_op.parameter_map["min_group_size"].b = 1 + # 可选配置level0:pairwise;level1:pairwise + custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:fullmesh") + custom_op.parameter_map["enable_data_pre_proc"].b = True + 
custom_op.parameter_map["iterations_per_loop"].i = 10 + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["hcom_parallel"].b = False + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") + custom_op.parameter_map["op_execute_timeout"].i = 2000 + custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes( + str(13 * 1024 * 1024 * 1024)) # total 31 need 13; + custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(18 * 1024 * 1024 * 1024)) # need 25 + custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes("DNN_VM_AICPU:3,AIcoreEngine:3") + + if dump_data: + custom_op.parameter_map["enable_dump"].b = True + custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(dump_path) + custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes(dump_steps) + custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all") + + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + return session_config diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index eb1d91ea..18ab273e 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -13,17 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== + +import os +import random +import shutil import time import warnings -import random from glob import glob - from sklearn.metrics import roc_auc_score + import numpy as np + from npu_bridge.npu_init import * from model import MyModel -from dlrm.model.config import sess_config, Config +from config import sess_config, Config from optimizer import get_dense_and_sparse_optimizer from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -244,12 +248,25 @@ def create_feature_spec_list(use_timestamp=False): return feature_spec_list +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + + if __name__ == "__main__": tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) warnings.filterwarnings("ignore") + _clear_saved_model() - rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None - rank_size = int(os.getenv("RANK_SIZE")) if os.getenv("RANK_SIZE") else None + rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None train_steps = 10000 eval_steps = 1360 diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index a6ef96fc..d8dd851a 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -148,23 +148,35 @@ def create_feature_spec_list(use_timestamp=False): return feature_spec_list -def clear_saved_model(): +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + 
logger.info(f"delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + _del_related_dir("kernel*") + _del_related_dir("export_graph") + mode = UseMode.mapping(os.getenv("USE_MODE")) - if mode == UseMode.TRAIN: - logger.info("current mode is train, will delete previous saved model data if exist.") - save_model_path = os.path.join(os.getcwd(), "saved-model") - shutil.rmtree(save_model_path, ignore_errors=True) - if not (os.getenv("CACHE_MODE", "") == CacheModeEnum.SSD.value and mode == UseMode.TRAIN): + if mode != UseMode.TRAIN: return + logger.info("current mode is train, will delete previous saved model data if exist.") + _del_related_dir("saved-model") - # ssd not allow overwrite file, should clear it before training - logger.info("current cache mode is SSD, will delete previous saved ssd data if exist.") - for part_path in _SSD_SAVE_PATH: - if "/" not in part_path and "\\" not in part_path: - part_path = os.path.join(os.getcwd(), part_path) - shutil.rmtree(part_path, ignore_errors=True) + if not (os.getenv("CACHE_MODE", "") == CacheModeEnum.SSD.value): + return + logger.info("current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + " then create empty directory for this use case.") + for sub_path in _SSD_SAVE_PATH: + _del_related_dir(sub_path) try: - os.mkdir(part_path) + os.mkdir(sub_path) + logger.info(f"mkdir dir:{sub_path}") except OSError: logger.warning("ssd path has exist") # 多进程并行,忽略异常 @@ -172,6 +184,7 @@ def clear_saved_model(): if __name__ == "__main__": tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) warnings.filterwarnings("ignore") + _clear_saved_model() use_mode = UseMode.mapping(os.getenv("USE_MODE")) # 最大数据集生成数量 @@ -217,7 +230,7 @@ if __name__ == "__main__": if len(model_file) == 0: raise ValueError(f"get USE_MODE:{use_mode}, but no model file exist at:{load_path_pattern}") if_load = True - + # nbatch function needs to be used together with the prefetch and host_vocabulary_size != 0 init(train_steps=TRAIN_STEPS, eval_steps=EVAL_STEPS, @@ -267,7 +280,7 @@ if __name__ == "__main__": if cache_mode in ["DDR", "SSD"] and not use_dynamic: logger.warning("when cache_mode in [DDR, SSD], suggest use_dynamic=true to avoid tuning size parameter") emb_initializer = tf.compat.v1.constant_initializer(0) if USE_DETERMINISTIC \ - else tf.compat.v1.truncated_normal_initializer() + else tf.compat.v1.truncated_normal_initializer() user_hashtable = create_table(key_dtype=tf.int64, dim=tf.TensorShape([cfg.user_hashtable_dim]), name='user_table', diff --git a/examples/demo/little_demo/run.sh b/examples/demo/little_demo/run.sh index d585be02..5b45af84 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -15,26 +15,12 @@ # ============================================================================== kill -9 `ps -ef | grep python | grep -v grep | awk '{print $2}'` > /dev/null 2>&1 -rm -rf /root/ascend/log/* -rm -rf ./kernel* -rm -rf ./export_graph/* # 支持[train, load_and_train, predict] -export USE_MODE="train" -if [ $USE_MODE = "train" ]; then - echo "train mode: saved-model will be deleted" - rm -rf ./saved-model -fi +export USE_MODE="train" # if train mode, will remove dir ./saved-model # cache mode support: HBM, DDR, SSD export CACHE_MODE="HBM" -if [ $CACHE_MODE = "SSD" ] && [ $USE_MODE = "train" ]; then - echo "SSD train mode not allow file exist in directory when training a model from stratch in case overwrite, - deleting directory ssd_data then 
create for this use case" - rm -rf ssd_data - mkdir ssd_data -fi - # 获取输入参数:py、ip if [ $# -ge 1 ]; then diff --git a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py index cca5a7a5..20b7381d 100644 --- a/examples/demo/little_demo_estimator/main.py +++ b/examples/demo/little_demo_estimator/main.py @@ -17,6 +17,8 @@ import argparse import os +import shutil +from glob import glob import tensorflow as tf from mx_rec.util.initialize import init, terminate_config_initializer @@ -142,6 +144,27 @@ def create_feature_spec_list(use_timestamp=False): return feature_spec_list +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + _del_related_dir("kernel*") + _del_related_dir("export_graph") + + mode = args.run_mode + if not mode.startswith("train"): + return + logger.info("current mode contains train, will delete previous saved model data if exist.") + _del_related_dir("_rank*") + + if __name__ == '__main__': parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--run_mode', type=str, default='train_and_evaluate') # 运行模式,在run.sh中进行配置 @@ -185,6 +208,7 @@ if __name__ == '__main__': args.eval_steps = -1 elif args.run_mode == 'train_and_evaluate': args.save_checkpoints_steps = args.train_steps + _clear_saved_model() # set init init(train_steps=args.train_steps, diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index f3d34c82..8bb43b19 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -15,9 +15,6 @@ # ============================================================================== kill -9 `ps -ef | grep python | grep -v grep | awk '{print $2}'` > /dev/null 2>&1 -rm -rf /root/ascend/log/* -rm -rf ./kernel* -rm -rf ./export_graph/* # 获取输入参数:py、ip if [ $# -ge 1 ]; then @@ -83,12 +80,7 @@ export TF_CPP_MIN_LOG_LEVEL=3 # tensorflow日志级别,3对应FATAL # 设置应用类日志的全局日志级别及各模块日志级别,具体请参考昇腾官网CANN文档 export ASCEND_GLOBAL_LOG_LEVEL=3 # “设置日志级别”章节0:debug, 1:info, 2:warning, 3:error, 4:NULL export MXREC_MODE="ASC" -export USE_MODE="train_and_evaluate" # 支持[train, predict, train_and_evaluate] - -if [ $USE_MODE = "train" ] || [ $USE_MODE = "train_and_evaluate" ];then - echo "train mode: saved-model will be deleted" - rm -rf ./_rank* -fi +export USE_MODE="train_and_evaluate" # 支持[train, predict, train_and_evaluate],train相关模式将删除./_rank*目录 ################# 参数配置 ###################### export USE_DYNAMIC=1 # 0:静态shape;1:动态shape diff --git a/examples/dlrm/model/config.py b/examples/dlrm/model/config.py index 23b042c2..fd38276d 100644 --- a/examples/dlrm/model/config.py +++ b/examples/dlrm/model/config.py @@ -89,10 +89,10 @@ class LearningRateScheduler: class Config: def __init__(self, ): - self.rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None - tmp = os.getenv("RANK_SIZE") + self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None + tmp = os.getenv("TRAIN_RANK_SIZE") if tmp is None: - raise ValueError("please export RANK_SIZE") + raise ValueError("please export TRAIN_RANK_SIZE") self.rank_size = int(tmp) self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH") diff --git 
a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index 3464f84e..6fda4f0a 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -41,7 +41,6 @@ from mx_rec.util.variable import get_dense_and_sparse_variable from mx_rec.util.log import logger from npu_bridge.npu_init import * - npu_plugin.set_device_sat_mode(0) dense_hashtable_seed = 128 @@ -253,7 +252,7 @@ if __name__ == "__main__": warnings.filterwarnings("ignore") rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None - rank_size = int(os.getenv("RANK_SIZE")) if os.getenv("RANK_SIZE") else None + rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None train_steps = 10000 eval_steps = 1360 diff --git a/examples/dlrm/model/run.sh b/examples/dlrm/model/run.sh index f5cb4449..be509608 100644 --- a/examples/dlrm/model/run.sh +++ b/examples/dlrm/model/run.sh @@ -20,10 +20,25 @@ so_path=$1 mx_rec_package_path=$2 hccl_cfg_json=$3 dlrm_criteo_data_path=$4 +ip=$5 # no ranktable时传入该参数 -export RANK_SIZE=8 -echo "RANK_SIZE=${RANK_SIZE}, please make sure hccl configuration json file match this parameter" -export RANK_TABLE_FILE=${hccl_cfg_json} +interface="lo" +num_server=1 +local_rank_size=8 +num_process=$((num_server * local_rank_size)) +export TRAIN_RANK_SIZE=$num_process + +# 删除数据 +echo "CACHE_MODE:${CACHE_MODE}" +if [ ${CACHE_MODE} = "SSD" ]; then + echo "SSD train mode not allow file exist before training, + deleting dir ${cur_path}/ssd_data then create for SSD use case" + rm -rf ssd_data + mkdir ssd_data +fi +rm -rf kernel* +rm -rf /root/ascend/log/* +rm -rf model_dir_rank* op_cache ################# 参数配置 ###################### export USE_DYNAMIC=0 # 0:静态shape;1:动态shape @@ -34,25 +49,11 @@ export USE_MULTI_LOOKUP=0 # 0:一表一查;1:一表多查 export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 ################################################ -echo "CACHE_MODE:${CACHE_MODE}" -if [ ${CACHE_MODE} = "SSD" ]; then - echo "SSD train mode not allow file exist before training, - deleting dir ${cur_path}/ssd_data then create for SSD use case" - rm -rf ssd_data - mkdir ssd_data -fi - export HCCL_CONNECT_TIMEOUT=1200 - export DLRM_CRITEO_DATA_PATH=${dlrm_criteo_data_path} export PYTHONPATH=${mx_rec_package_path}:${so_path}:$PYTHONPATH export LD_PRELOAD=/usr/lib64/libgomp.so.1 export LD_LIBRARY_PATH=${so_path}:/usr/local/lib:$LD_LIBRARY_PATH - -rm -rf kernel* -rm -rf /root/ascend/log/* -rm -rf model_dir_rank* op_cache - export ASCEND_DEVICE_ID=0 export RANK_ID_START=0 export JOB_ID=10086 @@ -78,10 +79,32 @@ echo "MXREC_MODE is $MXREC_MODE" export py=main_mxrec.py echo "py is $py" +# 区分ranktable和no ranktable +if [ -n "$ip" ]; then + # no ranktable分支 + echo "Current is no ranktable solution." + echo "Input node ip: $ip, please make sure this ip is available." 
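+    # Without a ranktable json, the cluster is discovered from the CM_*
+    # variables below: one chief endpoint (ip, listen port, device id) plus
+    # this node's worker ip and the total number of participating devices.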
+ export CM_CHIEF_IP=$ip # 主节点ip + export CM_CHIEF_PORT=60001 # 主节点监听端口 + export CM_CHIEF_DEVICE=0 # 主节点device id + export CM_WORKER_IP=$ip # 当前节点ip + export CM_WORKER_SIZE=$num_process # 参与集群训练的device数量 + echo "CM_CHIEF_IP=$CM_CHIEF_IP" + echo "CM_CHIEF_PORT=$CM_CHIEF_PORT" + echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" + echo "CM_WORKER_IP=$CM_WORKER_IP" + echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" +else + # ranktable分支 + echo "Current is ranktable solution, hccl json file:${hccl_cfg_json}" + export RANK_SIZE=$num_process + echo "RANK_SIZE=${RANK_SIZE}, please make sure hccl configuration json file match this parameter" + export RANK_TABLE_FILE=${hccl_cfg_json} +fi + echo "use horovod to start tasks" # GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' -interface="lo" -horovodrun --network-interface ${interface} -np ${RANK_SIZE} --mpi-args "${mpi_args}" --mpi -H localhost:${RANK_SIZE} \ -python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${RANK_SIZE}p.log +horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ +python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p.log -- Gitee From ff62ae878e8738095b0cf0808686f2475df9509e Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Mon, 20 May 2024 19:22:47 +0800 Subject: [PATCH 149/302] =?UTF-8?q?warm=20start=20=E5=BC=80=E5=8F=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 7 +++---- mx_rec/saver/warm_start.py | 22 ++++++++++------------ src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 +- src/pybind/module_main.cpp | 3 ++- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index 0dc28a99..f1ce6ea3 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -67,7 +67,7 @@ class Saver(object): ("prefix_name", ClassValidator, {"classes": (str, type(None))}), ("prefix_name", OptionalStringValidator, {"min_len": 1, "max_len": 50}, ["check_string_length"]), ]) - def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint", warm_start_tables = None): + def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint", warm_start_tables=None): self.max_to_keep = max_to_keep self._prefix_name = prefix_name self.var_list = var_list @@ -297,7 +297,7 @@ class Saver(object): table_instance.emb_size], name=DataName.EMBEDDING.value) assign_op = var.assign(variable) - self.restore_fetch_dict[table_instance.table_name]= [assign_op] + self.restore_fetch_dict[table_instance.table_name] = [assign_op] optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name( table_instance.table_name) if optimizer: @@ -330,8 +330,7 @@ class Saver(object): logger.warning("no tables can be warm start restored.") return placeholder_dict, restore_fetch_list - def _restore(self, sess, reading_path , warm_start_tables=None): - # todo:这里增加新的参数,table_list + def _restore(self, sess, reading_path, warm_start_tables=None): # 根据table_list去改造 if warm_start_tables: placeholder_dict, restore_fetch_list = self.get_warm_start_dict(warm_start_tables) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index b5df5887..c6040316 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -14,25 +14,23 @@ # See the License 
for the specific language governing permissions and # limitations under the License. # ============================================================================== +import os import logging - -import six import re -import os from typing import List +import six import tensorflow as tf from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.training import warm_starting_util - -from mx_rec.util.log import logger -from mx_rec.saver.saver import Saver - if tf.__version__.startswith("1"): from npu_bridge.npu_init import NPUEstimator else: from npu_device.compat.v1.npu_init import NPUEstimator +from mx_rec.util.log import logger +from mx_rec.saver.saver import Saver + class WarmStartController: _instance = None @@ -81,7 +79,8 @@ def patch_for_func_warm_start(func): if isinstance(ckpt_to_initialize_from, (list, tuple)): vars_to_warm_start_list = args[1] var_name_to_prev_var_name_list = args[3] - for i in range(len(ckpt_to_initialize_from)): + warm_start_num = len(ckpt_to_initialize_from) + for i in range(warm_start_num): f = func(ckpt_to_initialize_from[i], vars_to_warm_start_list[i], args[2], var_name_to_prev_var_name_list[i], **kwargs) return f @@ -117,13 +116,10 @@ def warm_settings_filter(warm_start_from): filter_setting = _warm_settings_filter(warm_start_from) if filter_setting: return filter_setting - return None elif isinstance(warm_start_from, (six.string_types, six.binary_type)): table_name_list = get_table_name_set_by_ckpt_path(warm_start_from) WarmStartController().add_element(warm_start_from, table_name_list) return warm_start_from - else: - pass def recover_warm_settings(setting_list): @@ -176,7 +172,7 @@ def _warm_settings_filter(warm_start_setting): if matching_tables: WarmStartController().add_element(warm_start_setting.ckpt_to_initialize_from, matching_tables) if vars_to_warm_start != ".*": - return None + return return warm_start_setting elif all(isinstance(v, str) for v in vars_to_warm_start): sparse_vars = [] @@ -232,6 +228,8 @@ class SparseRestoreHook(tf.estimator.SessionRunHook): def __init__(self): logging.info("In warm start mode, SparseRestoreHook has been initialized.") self._is_warm_start = False + self._saver = None + self._warm_start_dict = {} def begin(self): self._saver = Saver() diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 8eca48ba..78621829 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -296,7 +296,7 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) vector loadFeatures; SetFeatureTypeForLoad(loadFeatures); - if(warmStartTables.size() == 0) { + if (warmStartTables.size() == 0) { EmbeddingMgmt::Instance()->Load(loadPath); } else { for (auto& tableName: warmStartTables) { diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index 4de10fc8..351d19a4 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -216,7 +216,8 @@ namespace { py::arg("seed") = DEFAULT_RANDOM_SEED, py::arg("threshold_values") = vector {}, py::arg("if_load") = false) .def("save", &MxRec::HybridMgmt::Save, py::arg("save_path") = "") - .def("load", &MxRec::HybridMgmt::Load, py::arg("load_path") = "", py::arg("warm_start_tables") = vector {}) + .def("load", &MxRec::HybridMgmt::Load, py::arg("load_path") = "", + py::arg("warm_start_tables") = vector {}) .def("destroy", &MxRec::HybridMgmt::Destroy) .def("evict", &MxRec::HybridMgmt::Evict) .def("send", &MxRec::HybridMgmt::SendHostMap, py::arg("table_name") = "") 
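Taken together, this patch threads a table whitelist from Python down to C++: `Saver` accepts a `warm_start_tables` argument, `_restore` branches into `get_warm_start_dict` when the list is non-empty, and the pybind `load` binding forwards the list to `HybridMgmt::Load`, which loads only the named tables instead of every table in the checkpoint. A minimal usage sketch, assuming hypothetical table names ("user_emb", "item_emb" are placeholders for names from your model definition):

    from mx_rec.saver.saver import Saver

    # Restore only the listed embedding tables; passing None (the default)
    # keeps the original behaviour of loading every table in the checkpoint.
    saver = Saver(max_to_keep=3, prefix_name="checkpoint",
                  warm_start_tables=["user_emb", "item_emb"])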
-- Gitee From 3878e4cffc106ec5aff6b28b7b895017a7e18365 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Mon, 20 May 2024 19:22:47 +0800 Subject: [PATCH 150/302] =?UTF-8?q?warm=20start=20=E5=BC=80=E5=8F=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 7 +++--- mx_rec/saver/warm_start.py | 34 ++++++++++++++-------------- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 +- src/pybind/module_main.cpp | 3 ++- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index 0dc28a99..f1ce6ea3 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -67,7 +67,7 @@ class Saver(object): ("prefix_name", ClassValidator, {"classes": (str, type(None))}), ("prefix_name", OptionalStringValidator, {"min_len": 1, "max_len": 50}, ["check_string_length"]), ]) - def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint", warm_start_tables = None): + def __init__(self, var_list=None, max_to_keep=3, prefix_name="checkpoint", warm_start_tables=None): self.max_to_keep = max_to_keep self._prefix_name = prefix_name self.var_list = var_list @@ -297,7 +297,7 @@ class Saver(object): table_instance.emb_size], name=DataName.EMBEDDING.value) assign_op = var.assign(variable) - self.restore_fetch_dict[table_instance.table_name]= [assign_op] + self.restore_fetch_dict[table_instance.table_name] = [assign_op] optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name( table_instance.table_name) if optimizer: @@ -330,8 +330,7 @@ class Saver(object): logger.warning("no tables can be warm start restored.") return placeholder_dict, restore_fetch_list - def _restore(self, sess, reading_path , warm_start_tables=None): - # todo:这里增加新的参数,table_list + def _restore(self, sess, reading_path, warm_start_tables=None): # 根据table_list去改造 if warm_start_tables: placeholder_dict, restore_fetch_list = self.get_warm_start_dict(warm_start_tables) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index b5df5887..6f1e637b 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -14,25 +14,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== +import os import logging - -import six import re -import os from typing import List +import six import tensorflow as tf from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.training import warm_starting_util - -from mx_rec.util.log import logger -from mx_rec.saver.saver import Saver - if tf.__version__.startswith("1"): from npu_bridge.npu_init import NPUEstimator else: from npu_device.compat.v1.npu_init import NPUEstimator +from mx_rec.util.log import logger +from mx_rec.saver.saver import Saver + class WarmStartController: _instance = None @@ -81,7 +79,8 @@ def patch_for_func_warm_start(func): if isinstance(ckpt_to_initialize_from, (list, tuple)): vars_to_warm_start_list = args[1] var_name_to_prev_var_name_list = args[3] - for i in range(len(ckpt_to_initialize_from)): + warm_start_num = len(ckpt_to_initialize_from) + for i in range(warm_start_num): f = func(ckpt_to_initialize_from[i], vars_to_warm_start_list[i], args[2], var_name_to_prev_var_name_list[i], **kwargs) return f @@ -100,6 +99,7 @@ def patch_for_estimator_train(func): def warm_settings_filter(warm_start_from): + warm_start_from_res = None if isinstance(warm_start_from, estimator_lib.WarmStartSettings): if isinstance(warm_start_from.ckpt_to_initialize_from, (list, tuple)): out_setting_list = [] @@ -110,20 +110,19 @@ def warm_settings_filter(warm_start_from): if filter_setting: out_setting_list.append(filter_setting) if out_setting_list: - warm_start_from = recover_warm_settings(out_setting_list) - return warm_start_from + warm_start_from_res = recover_warm_settings(out_setting_list) elif isinstance(warm_start_from.ckpt_to_initialize_from, (six.string_types, six.binary_type)): logger.info("According to warm_start_settings, warm start will load from only one checkpoint path.") filter_setting = _warm_settings_filter(warm_start_from) if filter_setting: - return filter_setting - return None + warm_start_from_res = filter_setting elif isinstance(warm_start_from, (six.string_types, six.binary_type)): table_name_list = get_table_name_set_by_ckpt_path(warm_start_from) WarmStartController().add_element(warm_start_from, table_name_list) - return warm_start_from + warm_start_from_res = warm_start_from else: - pass + raise ValueError("Invalid parameter: warm_start_from. 
") + return warm_start_from_res def recover_warm_settings(setting_list): @@ -176,7 +175,7 @@ def _warm_settings_filter(warm_start_setting): if matching_tables: WarmStartController().add_element(warm_start_setting.ckpt_to_initialize_from, matching_tables) if vars_to_warm_start != ".*": - return None + return return warm_start_setting elif all(isinstance(v, str) for v in vars_to_warm_start): sparse_vars = [] @@ -215,7 +214,7 @@ def get_table_name_set_by_ckpt_path(warm_start_path: str) -> List[str]: return table_name_list -def get_latest_ckpt(warm_start_path) -> str: +def get_latest_ckpt(warm_start_path: str) -> str: ckpt_path = os.path.join(warm_start_path, "checkpoint") if not tf.io.gfile.exists(ckpt_path): raise FileNotFoundError(f"Checkpoint file is missing under the warm start model path {warm_start_path}") @@ -223,7 +222,6 @@ def get_latest_ckpt(warm_start_path) -> str: latest_ckpt = f.readline().rstrip() latest_ckpt = latest_ckpt.split(":")[1].strip(' ').replace('"', '') latest_ckpt = latest_ckpt.split("/")[-1] - path = os.path.join(warm_start_path, latest_ckpt) return path @@ -232,6 +230,8 @@ class SparseRestoreHook(tf.estimator.SessionRunHook): def __init__(self): logging.info("In warm start mode, SparseRestoreHook has been initialized.") self._is_warm_start = False + self._saver = None + self._warm_start_dict = {} def begin(self): self._saver = Saver() diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 8eca48ba..78621829 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -296,7 +296,7 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) vector loadFeatures; SetFeatureTypeForLoad(loadFeatures); - if(warmStartTables.size() == 0) { + if (warmStartTables.size() == 0) { EmbeddingMgmt::Instance()->Load(loadPath); } else { for (auto& tableName: warmStartTables) { diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp index 4de10fc8..351d19a4 100644 --- a/src/pybind/module_main.cpp +++ b/src/pybind/module_main.cpp @@ -216,7 +216,8 @@ namespace { py::arg("seed") = DEFAULT_RANDOM_SEED, py::arg("threshold_values") = vector {}, py::arg("if_load") = false) .def("save", &MxRec::HybridMgmt::Save, py::arg("save_path") = "") - .def("load", &MxRec::HybridMgmt::Load, py::arg("load_path") = "", py::arg("warm_start_tables") = vector {}) + .def("load", &MxRec::HybridMgmt::Load, py::arg("load_path") = "", + py::arg("warm_start_tables") = vector {}) .def("destroy", &MxRec::HybridMgmt::Destroy) .def("evict", &MxRec::HybridMgmt::Evict) .def("send", &MxRec::HybridMgmt::SendHostMap, py::arg("table_name") = "") -- Gitee From 85419a98b260a950a2a0e6c37302cf022bec8c78 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 21 May 2024 10:54:45 +0800 Subject: [PATCH 151/302] =?UTF-8?q?warm=20start=20=E8=A1=A5=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/warm_start.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index 9e2b2ba9..a29dc4ba 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -186,11 +186,11 @@ def _warm_settings_filter(warm_start_setting): sparse_vars.append(v) WarmStartController().add_element(warm_start_setting.ckpt_to_initialize_from, matching_tables) vars_to_warm_start_res = [v for v in vars_to_warm_start if v not in sparse_vars] - if not vars_to_warm_start_res: - 
warm_start_setting = None - else: - warm_start_setting.vars_to_warm_start = vars_to_warm_start_res - warm_start_setting_res = warm_start_setting + if vars_to_warm_start_res: + warm_start_setting_res = estimator_lib.WarmStartSettings( + ckpt_to_initialize_from=warm_start_setting.ckpt_to_initialize_from, + vars_to_warm_start=vars_to_warm_start_res, + var_name_to_prev_var_name=warm_start_setting.var_name_to_prev_var_name) else: raise ValueError("vars_to_warm_start must be list or str!") return warm_start_setting_res -- Gitee From e8674ed6b2527eeeec3d635257c177ab52c17978 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 21 May 2024 03:49:04 +0000 Subject: [PATCH 152/302] =?UTF-8?q?!147=20cleancode=EF=BC=8C=E4=BD=BF?= =?UTF-8?q?=E7=94=A8SCAnchorAttr.ID=5FOFFSETS=20*=20=E3=80=90=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91cleancode=20?= =?UTF-8?q?*=20Merge=20remote-tracking=20branch=20'upstream/develop'=20int?= =?UTF-8?q?o=20develop-bugfix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E5=A2=9E=E5=8A=A0=E5=BC=82?= =?UTF-8?q?=E5=B8=B8=E6=8D=95=E8=8E=B7=EF=BC=8C=E9=9D=9Ehbm=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E4=B8=8B=E5=BF=85=E9=A1=BB=E4=BD=BF=E7=94=A8=E6=94=B9?= =?UTF-8?q?=E5=9B=BE=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=20Modification=E3=80=91=E5=A2=9E=E5=8A=A0=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E6=8D=95=E8=8E=B7=EF=BC=8C=E9=9D=9Ehbm=E6=A8=A1=E5=BC=8F?= =?UTF-8?q?=E4=B8=8B=E5=BF=85=E9=A1=BB=E4=BD=BF=E7=94=A8=E6=94=B9=E5=9B=BE?= =?UTF-8?q?=20*=20Merge=20remote-tracking=20branch=20'upstream/develop'=20?= =?UTF-8?q?into=20develop-bugfix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E5=85=A8=E5=B1=80uni?= =?UTF-8?q?que=E5=8A=9F=E8=83=BD=E5=9C=A8=E6=89=A9=E5=AE=B9=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E4=B8=8B=EF=BC=8C=E8=A1=A8=E5=90=8D=E5=AD=97=E2=80=9C?= =?UTF-8?q?/=E2=80=9D=E9=9A=90=E6=82=A3=E4=BF=AE=E5=A4=8D=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91?= =?UTF-8?q?=E5=85=A8=E5=B1=80unique=E5=8A=9F=E8=83=BD=E5=9C=A8=E6=89=A9?= =?UTF-8?q?=E5=AE=B9=E6=A8=A1=E5=BC=8F=E4=B8=8B=EF=BC=8C=E8=A1=A8=E5=90=8D?= =?UTF-8?q?=E5=AD=97=E2=80=9C/=E2=80=9D=E9=9A=90=E6=82=A3=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=20*=20=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=20Modification=E3=80=91=E5=85=A8=E5=B1=80unique=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=E5=9C=A8=E6=89=A9=E5=AE=B9=E6=A8=A1=E5=BC=8F=E4=B8=8B?= =?UTF-8?q?=EF=BC=8C=E8=A1=A8=E5=90=8D=E5=AD=97=E2=80=9C/=E2=80=9D?= =?UTF-8?q?=E9=9A=90=E6=82=A3=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/emb/dynamic_sparse_embedding.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py index a7616991..4781491c 100644 --- a/mx_rec/core/emb/dynamic_sparse_embedding.py +++ b/mx_rec/core/emb/dynamic_sparse_embedding.py @@ -8,7 +8,7 @@ from typing import Optional, Union, Callable import tensorflow as tf from mx_rec.constants.constants import ASCEND_TABLE_NAME_MUST_CONTAIN, ASCEND_SPARSE_LOOKUP_LOCAL_EMB, \ - ASCEND_SPARSE_LOOKUP_ID_OFFSET + ASCEND_SPARSE_LOOKUP_ID_OFFSET, ASCAnchorAttr from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.util.initialize import ConfigInitializer @@ -42,7 +42,7 @@ class 
DynamicSparseEmbedding(BaseSparseEmbedding): def _get_sparse_forward_result(self, sparse_forward_fn: Callable, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict, is_training: bool) -> tf.Tensor: local_embeddings = import_host_pipeline_ops().embedding_lookup_by_address( - result.get("id_offsets"), embedding_dim=self._emb_size, embedding_type=1) + result.get(str(ASCAnchorAttr.ID_OFFSETS)), embedding_dim=self._emb_size, embedding_type=1) add_collection_condition = is_training and ( ASCEND_TABLE_NAME_MUST_CONTAIN is None or ASCEND_TABLE_NAME_MUST_CONTAIN in self._table_name) @@ -52,9 +52,9 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): return sparse_forward_fn(local_embeddings) # 创建扩容查询tensor和table_instance的映射关系,以便优化器中使用 ConfigInitializer.get_instance().sparse_embed_config.insert_table_instance_to_tensor_dict( - result.get("id_offsets"), self) + result.get(str(ASCAnchorAttr.ID_OFFSETS)), self) tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings) - tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get("id_offsets")) + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get(str(ASCAnchorAttr.ID_OFFSETS))) return sparse_forward_fn(local_embeddings) -- Gitee From 436b753b27260440560c9400e3fcfb3407b73b43 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 21 May 2024 15:58:19 +0800 Subject: [PATCH 153/302] =?UTF-8?q?warm=20start=20=E8=A1=A5=E5=85=85typing?= =?UTF-8?q?=E5=92=8C=E5=87=BD=E6=95=B0=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/warm_start.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index a29dc4ba..baf01ce8 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -125,7 +125,10 @@ def warm_settings_filter(warm_start_from): return warm_start_from_res -def recover_warm_settings(setting_list): +def recover_warm_settings(setting_list: List[tf.estimator.WarmStartSettings]) -> tf.estimator.WarmStartSettings: + """ + Recover WarmStartSettings from a list of custom-defined WarmStartSettings. + """ ckpt_to_initialize_from_list = [] vars_to_warm_start_list = [] var_name_to_prev_var_name_list = [] @@ -140,7 +143,10 @@ def recover_warm_settings(setting_list): var_name_to_prev_var_name=var_name_to_prev_var_name_list) -def _build_warm_settings_list(warm_start_from): +def _build_warm_settings_list(warm_start_from: tf.estimator.WarmStartSettings) -> List[tf.estimator.WarmStartSettings]: + """ + Converts custom-defined WarmStartSettings into a list of TensorFlow-native WarmStartSettings. + """ ckpt_to_initialize_from = warm_start_from.ckpt_to_initialize_from vars_to_warm_start = warm_start_from.vars_to_warm_start var_name_to_prev_var_name = warm_start_from.var_name_to_prev_var_name @@ -165,7 +171,10 @@ def _build_warm_settings_list(warm_start_from): return warm_start_settings_list -def _warm_settings_filter(warm_start_setting): +def _warm_settings_filter(warm_start_setting: tf.estimator.WarmStartSettings) -> tf.estimator.WarmStartSettings: + """ + Filter the vars_to_warm_start parameter to remove sparse table parameters. 
+ """ vars_to_warm_start = warm_start_setting.vars_to_warm_start var_name_to_prev_var_name = warm_start_setting.var_name_to_prev_var_name vars_to_warm_start_res = [] @@ -175,8 +184,8 @@ def _warm_settings_filter(warm_start_setting): matching_tables = [table for table in table_name_list if re.match(vars_to_warm_start, table)] if matching_tables: WarmStartController().add_element(warm_start_setting.ckpt_to_initialize_from, matching_tables) - if vars_to_warm_start != ".*": - return warm_start_setting_res + if vars_to_warm_start != ".*": + return warm_start_setting_res warm_start_setting_res = warm_start_setting elif all(isinstance(v, str) for v in vars_to_warm_start): sparse_vars = [] -- Gitee From ea4c5f0a7aae9f68810398d11591809ada52a5b0 Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Tue, 21 May 2024 11:10:48 +0000 Subject: [PATCH 154/302] =?UTF-8?q?!144=20=E6=94=B9=E5=9B=BE=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E4=BC=A0=E5=85=A5TF=E5=9B=BE=E5=AE=9E=E4=BE=8B=20*=20?= =?UTF-8?q?Adapt=20unit=20test=20for=20modifier.=20*=20Add=20inference=20m?= =?UTF-8?q?ode.=20*=20Add=20matrix=20factorization=20model.=20*=20Init=20f?= =?UTF-8?q?eat=20branch.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 4 +- mx_rec/core/embedding.py | 19 +- mx_rec/graph/__init__.py | 4 +- mx_rec/graph/constants.py | 1 - mx_rec/graph/hooks.py | 12 +- mx_rec/graph/merge_lookup.py | 3 +- mx_rec/graph/modifier.py | 1110 +++++++++++++-------------- mx_rec/graph/slicers.py | 68 +- mx_rec/graph/utils.py | 293 ++++--- tests/mx_rec/graph/test_modifier.py | 158 ++-- tests/mx_rec/graph/test_utils.py | 132 +++- 11 files changed, 923 insertions(+), 881 deletions(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index f69f32c8..13b3d583 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -29,8 +29,8 @@ EMPTY_STR = "" # 获取ConfigInitializer对象实例失败提示信息 GET_CONFIG_INSTANCE_ERR_MSG = "Please init the environment for mx_rec at first." -# 自动改图模式下从计算图中寻找dataset的锚点名称 -ANCHOR_DATASET_NAME = "PrefetchDataset" +# Used for slicer finding the orphan lookup key. 
+ORPHAN_LOOKUP_KEY_PREFIX = "orphan" # the name of the embedding table merged by third party ASCEND_TABLE_NAME_MUST_CONTAIN = None diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index 16f19d04..348ab9d6 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -19,15 +19,17 @@ import os from typing import Optional, Union import tensorflow as tf +from tensorflow import Tensor from tensorflow.python.ops.init_ops import Initializer as InitializerV1 from tensorflow.python.ops.init_ops_v2 import Initializer as InitializerV2 +from mx_rec.constants import constants from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.core.emb.emb_factory import HBMDynamicSparseEmbeddingFactory, HBMSparseEmbeddingFactory, \ ExternalStorageSparseEmbeddingFactory from mx_rec.constants.constants import MAX_INT32, All2allGradientsOp, MAX_VOCABULARY_SIZE, MAX_DEVICE_VOCABULARY_SIZE -from mx_rec.graph.utils import mark_orphan_lookup_key +from mx_rec.graph.constants import AnchorIteratorOp from mx_rec.util.initialize import ConfigInitializer from mx_rec.validator.validator import ClassValidator, StringValidator, SSDFeatureValidator, \ para_checker_decorator, IntValidator, NumValidator, OptionValidator, OptionalIntValidator, \ @@ -184,3 +186,18 @@ def sparse_lookup(hashtable: BaseSparseEmbedding, ConfigInitializer.get_instance().modify_graph = modify_graph return hashtable.lookup(ids, send_count, **kwargs) + + +def mark_orphan_lookup_key(lookup_key: Tensor) -> Tensor: + graph_def = tf.compat.v1.get_default_graph().as_graph_def() + subgraph = tf.compat.v1.graph_util.extract_sub_graph(graph_def, [lookup_key.op.name]) + + for node in subgraph.node: + if node.op == AnchorIteratorOp.ITERATOR_GET_NEXT.value: + return lookup_key + + name_prefix = constants.ORPHAN_LOOKUP_KEY_PREFIX + marked_lookup_key = tf.identity(lookup_key, name="{}/{}".format(name_prefix, lookup_key.op.name)) + + logger.info('Mark orphan lookup key %s as %s.', lookup_key, marked_lookup_key) + return marked_lookup_key diff --git a/mx_rec/graph/__init__.py b/mx_rec/graph/__init__.py index 687e78ff..f1465971 100644 --- a/mx_rec/graph/__init__.py +++ b/mx_rec/graph/__init__.py @@ -16,13 +16,11 @@ # ============================================================================== __all__ = [ - "modify_graph_and_start_emb_cache", "GraphModifierHook", - "run", "LookupSubgraphSlicerHook", "OrphanLookupKeySlicerHook", + "modify_graph_and_start_emb_cache", ] from mx_rec.graph.modifier import GraphModifierHook, modify_graph_and_start_emb_cache -from mx_rec.graph.patch import run from mx_rec.graph.hooks import LookupSubgraphSlicerHook, OrphanLookupKeySlicerHook diff --git a/mx_rec/graph/constants.py b/mx_rec/graph/constants.py index 077405d6..6c67b201 100644 --- a/mx_rec/graph/constants.py +++ b/mx_rec/graph/constants.py @@ -15,7 +15,6 @@ # limitations under the License. 
# ============================================================================== - from enum import Enum diff --git a/mx_rec/graph/hooks.py b/mx_rec/graph/hooks.py index 5cf64b15..c97ae299 100644 --- a/mx_rec/graph/hooks.py +++ b/mx_rec/graph/hooks.py @@ -28,17 +28,15 @@ from mx_rec.validator.validator import ClassValidator, para_checker_decorator @para_checker_decorator( check_option_list=[ ("op_types", ClassValidator, {"classes": (list)}), - ("full_graph", ClassValidator, {"classes": (Graph, type(None))}), ] ) class LookupSubgraphSlicerHook(tf.estimator.SessionRunHook): - def __init__(self, op_types: List[Operation], full_graph: Graph = None) -> None: + def __init__(self, op_types: List[Operation]) -> None: super().__init__() self._op_types = op_types - self._full_graph = full_graph def begin(self) -> None: - slicer = LookupSubgraphSlicer(self._op_types, self._full_graph) + slicer = LookupSubgraphSlicer(self._op_types) logger.info("Starts to summarize sliceable specific operations in lookup subgraph!") slicer.summarize() @@ -47,14 +45,12 @@ class LookupSubgraphSlicerHook(tf.estimator.SessionRunHook): slicer.slice() -@para_checker_decorator(check_option_list=[("full_graph", ClassValidator, {"classes": (Graph, type(None))})]) class OrphanLookupKeySlicerHook(tf.estimator.SessionRunHook): - def __init__(self, full_graph: Graph = None) -> None: + def __init__(self) -> None: super().__init__() - self._full_graph = full_graph def begin(self) -> None: - slicer = OrphanLookupKeySlicer(self._full_graph) + slicer = OrphanLookupKeySlicer() logger.info("Starts to summarize sliceable orphan lookup keys!") slicer.summarize() diff --git a/mx_rec/graph/merge_lookup.py b/mx_rec/graph/merge_lookup.py index b28872e4..0b646cab 100644 --- a/mx_rec/graph/merge_lookup.py +++ b/mx_rec/graph/merge_lookup.py @@ -91,7 +91,8 @@ def do_merge_lookup(is_train: bool = True): if not ConfigInitializer.get_instance().use_static: kwargs["feature_spec_name_ids_dict"] = feature_spec_name_ids_dict lookup_result = table_instance.lookup_for_feat_spec(feature_spec, send_count, **kwargs) - replace_anchor_vec(cutting_point, ASCAnchorAttr.MOCK_LOOKUP_RESULT, lookup_result) + graph = tf.compat.v1.get_default_graph() + replace_anchor_vec(graph, cutting_point, ASCAnchorAttr.MOCK_LOOKUP_RESULT, lookup_result) logger.debug("The mock lookup result of %s for %s was replaced.", feature_spec.name, table_instance.table_name) # records whether the current mode has been merged or restored lookup diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 6b6013d8..179de09f 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -21,32 +21,71 @@ from collections.abc import Callable from typing import Any, List, Dict, Tuple, DefaultDict import tensorflow as tf -from tensorflow import Operation, Tensor +from tensorflow import Operation, Tensor, Graph from tensorflow.core.framework.graph_pb2 import GraphDef from tensorflow.python.data.ops.dataset_ops import DatasetV1Adapter from tensorflow.python.framework.errors_impl import InvalidArgumentError -from mx_rec.constants.constants import ASCEND_CUTTING_POINT_INITIALIZER, ASCEND_SPARSE_LOOKUP_ENTRANCE, \ - ASCAnchorAttr, ASCEND_TIMESTAMP, MAX_WHILE_SIZE, LIBREC_EOS_OPS_SO +from mx_rec.graph import utils +from mx_rec.constants.constants import ( + ASCEND_CUTTING_POINT_INITIALIZER, + ASCEND_SPARSE_LOOKUP_ENTRANCE, + ASCAnchorAttr, + ASCEND_TIMESTAMP, + MAX_WHILE_SIZE, + LIBREC_EOS_OPS_SO, +) from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.helper import 
get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline from mx_rec.core.asc.swap_args import SwapArgs from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.graph.merge_lookup import do_merge_lookup -from mx_rec.graph.utils import check_input_list, find_parent_op, check_cutting_points, record_ops_to_replace, \ - export_pb_graph, make_sorted_key_to_tensor_list, replace_anchor_control +from mx_rec.graph.utils import check_and_force_list, export_pb_graph from mx_rec.graph.constants import DeprecatedOp, AnchorDatasetOp, AnchorIteratorOp from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.log import logger from mx_rec.util.ops import import_host_pipeline_ops from mx_rec.util.perf import performance -from mx_rec.util.tf_version_adapter import hccl_ops, npu_ops +from mx_rec.util.tf_version_adapter import npu_ops from mx_rec.validator.validator import para_checker_decorator, ClassValidator +class GraphModifierHook(tf.estimator.SessionRunHook): + @para_checker_decorator( + check_option_list=[ + ("dump_graph", ClassValidator, {"classes": (bool,)}), + ("modify_graph", ClassValidator, {"classes": (bool,)}), + ] + ) + def __init__(self, dump_graph: bool = False, modify_graph: bool = True): + self._dump_graph = dump_graph + self._modify_graph = modify_graph + self._iterator_type = None + + ConfigInitializer.get_instance().train_params_config.is_graph_modify_hook_running = True + + def begin(self): + if self._modify_graph: + modify_graph_and_start_emb_cache(dump_graph=self._dump_graph) + else: + start_asc_pipeline() + + self._iterator_type = ConfigInitializer.get_instance().train_params_config.iterator_type + if self._modify_graph and self._iterator_type not in ( + AnchorIteratorOp.MAKE_ITERATOR.value, + AnchorIteratorOp.ONE_SHOT_ITERATOR.value, + ): + raise ValueError("the value of iterator type should be like `MakeIterator` or `OneShotIterator`.") + logger.debug("In GraphModifierHook, iterator type is `%s`.", self._iterator_type) + + def after_create_session(self, session, coord): + if self._modify_graph and self._iterator_type == AnchorIteratorOp.MAKE_ITERATOR.value: + session.run(tf.compat.v1.get_collection(ASCEND_CUTTING_POINT_INITIALIZER)) + + @dataclasses.dataclass -class AnchorRecord: +class _AnchorRecord: replacement_spec: DefaultDict[Tensor, List[Tuple[int, Operation]]] passing_tensors: List[Tensor] batch_tensor_indexs: List[int] @@ -58,56 +97,437 @@ class AnchorRecord: input_indexs: List[int] = None -def get_preprocessing_map_func( +class _GraphModifier: + @para_checker_decorator( + check_option_list=[ + ("dump_graph", ClassValidator, {"classes": (bool,)}), + ("modify_graph", ClassValidator, {"classes": (bool,)}), + ] + ) + def __init__(self, full_graph: Graph = None, dump_graph: bool = False): + if not full_graph: + full_graph = tf.compat.v1.get_default_graph() + self._full_graph = full_graph + self._dump_graph = dump_graph + + @staticmethod + def _get_preprocessing_map_func( graph_def: GraphDef, input_names: List[str], output_names: List[str], batch_tensor_names: List[str] = None, - pipeline_input_indexes: List[int] = None -) -> Callable: - input_names = check_input_list(input_names, str) - output_names = check_input_list(output_names, str) - batch_tensor_names = check_input_list(batch_tensor_names, str) - pipeline_input_indexes = check_input_list(pipeline_input_indexes, int) - both_is_none = batch_tensor_names is None and pipeline_input_indexes is None - both_not_none = batch_tensor_names is not None and pipeline_input_indexes 
is not None - if both_is_none or both_not_none: - raise ValueError("It is legal when and only when one of the parameters 'batch_tensor_names' and " - "'pipeline_input_indexes' was given.") - - def map_func(*args): - logger.debug("In get_preprocessing_map_func, the old batch is: %s.", args) - batch = dict() - parse_batch(args, batch, key=None) - logger.debug("In get_preprocessing_map_func, the parse batch is: %s.", batch) - - input_tensors = [] - if batch_tensor_names is not None: - for tensor_name in batch_tensor_names: - tensor = batch.get(tensor_name) - if tensor is None: - raise ValueError(f"Given input_tensor_name '{tensor_name}' is invalid.") - - input_tensors.append(tensor) + pipeline_input_indexes: List[int] = None, + ) -> Callable: + input_names = check_and_force_list(input_names, str) + output_names = check_and_force_list(output_names, str) + batch_tensor_names = check_and_force_list(batch_tensor_names, str) + pipeline_input_indexes = check_and_force_list(pipeline_input_indexes, int) + both_is_none = batch_tensor_names is None and pipeline_input_indexes is None + both_not_none = batch_tensor_names is not None and pipeline_input_indexes is not None + if both_is_none or both_not_none: + raise ValueError( + "It is legal when and only when one of the parameters 'batch_tensor_names' and " + "'pipeline_input_indexes' was given." + ) + + def map_func(*args): + logger.debug("In get_preprocessing_map_func, the old batch is: %s.", args) + batch = dict() + _parse_batch(args, batch, key=None) + logger.debug("In get_preprocessing_map_func, the parse batch is: %s.", batch) + + input_tensors = [] + if batch_tensor_names is not None: + for tensor_name in batch_tensor_names: + tensor = batch.get(tensor_name) + if tensor is None: + raise ValueError(f"Given input_tensor_name '{tensor_name}' is invalid.") + + input_tensors.append(tensor) + + else: + graph = tf.compat.v1.get_default_graph() + for index in pipeline_input_indexes: + tensor = graph.get_tensor_by_name("args_%d:0" % index) + input_tensors.append(tensor) + + # 以tf.import_graph_def()作为read emb key的输入,保证数据读取到传入lookup的ids过程中的特征处理关系能够保留在子图中。 + output_list = tf.import_graph_def( + graph_def, input_map=dict(zip(input_names, input_tensors)), return_elements=output_names + ) + + output_batch = [batch, tuple(output_list)] + logger.debug("In get_preprocessing_map_func, the output batch is: %s.", output_batch) + return tuple(output_batch) + + return map_func + + @performance("graph_modifier") + def modify_graph_for_asc(self, prefetch: int = 10): + cutting_point_list = self._full_graph.get_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE) + utils.check_cutting_points(cutting_point_list) + if not cutting_point_list: + logger.warning("Nothing to revise.") + return + + export_pb_graph("old_graph.pbtxt", self._dump_graph, graph_def=self._full_graph.as_graph_def()) + get_next_op_map = self._generate_get_next_op_specs(cutting_point_list) + logger.debug( + "In modify_graph_for_asc function, get_next_op_map.len: %d, get_next_op_map.key: %s.", + len(get_next_op_map), + get_next_op_map.keys(), + ) + + for get_next_op, record in get_next_op_map.items(): + is_training = record.is_training + + # get source dataset + src_dataset = self._get_src_dataset(get_next_op, is_training) + + # generate target dataset + timestamp_index = _get_timestamp_index(self._full_graph, get_next_op, is_training) + original_batch_tensor_count = _get_dataset_tensor_count(src_dataset) + sub_cutting_points = record.sub_cutting_points + input_index_list = _get_input_index_list( + sub_cutting_points, 
+ record.replacement_spec, + record.output_names, + original_batch_tensor_count, + timestamp_index=timestamp_index, + ) + record.input_indexs = input_index_list + + with self._full_graph.as_default(): + tgt_dataset = self._get_tgt_dataset(src_dataset, sub_cutting_points, record, prefetch=prefetch) + self._update_iterator_getnext(get_next_op, tgt_dataset, is_training, record) + + # In eval mode, backward is not required. In addition, compute gradients is not executed when + # only eval is used. Therefore, `do_merge_lookup` needs to be invoked during modify graph. + if not is_training: + with self._full_graph.as_default(): + do_merge_lookup(is_train=False) + if "evaluate" in ConfigInitializer.get_instance().train_params_config.bool_gauge_set: + logger.debug("In estimator mode, eval re-creates graph each time, so the flag needs to be cleared.") + ConfigInitializer.get_instance().train_params_config.insert_merged_multi_lookup(is_training, False) + # In training mode, `do_merge_lookup` should have been executed in compute gradients phase. + if is_training and not ConfigInitializer.get_instance().train_params_config.get_merged_multi_lookup(True): + raise RuntimeError( + "In training mode, `do_merge_lookup` should have been executed in compute gradients " + "phase. Please check whether compute gradients is performed." + ) + + self._modify_graph_for_ddr(get_next_op_map) + + logger.info("Graph has been revised.") + export_pb_graph("new_graph.pbtxt", self._dump_graph, graph_def=self._full_graph.as_graph_def()) + + def _modify_graph_for_ddr(self, get_next_op_map: Dict[Tensor, _AnchorRecord]): + # 通过create_hash_optimizer创建optimizer_instance + optimizer_instance = ConfigInitializer.get_instance().optimizer_config.optimizer_instance + # Predict mode + if optimizer_instance is None: + slot_num = 0 + else: + # DDR和扩容需要在获取优化器后重置ext + _change_ext_emb_size_by_opt(optimizer_instance) + slot_num = optimizer_instance.slot_num + + for _, record in get_next_op_map.items(): + is_training = record.is_training + channel_id = 0 if is_training else 1 + + swap_args = SwapArgs() + sparse_variables = self._full_graph.get_collection( + ConfigInitializer.get_instance().train_params_config.ascend_global_hashtable_collection + ) + + for each_var in sparse_variables: + table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(each_var) + if table_instance.is_hbm: + continue + swap_args_dict = swap_args.swap_config_dict[table_instance.table_name][channel_id] + swap_pos = swap_args_dict["swap_pos"] + swap_len = swap_args_dict["swap_len"] + variable_and_slot_list = _get_variable_and_slot_list( + each_var, slot_num, table_instance.table_name, channel_id + ) + + swap_op = _get_swap_info(table_instance, variable_and_slot_list, swap_len, swap_pos, channel_id) + swap_control_dict = swap_args.swap_control_dict[table_instance.table_name][channel_id] + if "control_ops" not in swap_control_dict: + raise ValueError("Missing Required key in modify_graph_for_asc: control_ops") + control_ops = swap_control_dict["control_ops"] + utils.replace_anchor_control(self._full_graph, control_ops, swap_op) + + def _generate_get_next_op_specs(self, cutting_point_list: List[Tensor]) -> Dict[Tensor, _AnchorRecord]: + get_next_op_map = defaultdict(dict) + + for input_tensor in cutting_point_list: + get_next_op = utils.upward_bfs_op(input_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value) + if get_next_op not in get_next_op_map: + logger.debug("find a new get_next_op named '%s'", get_next_op.name) + + replacement_specs = 
utils.record_ops_to_replace(self._full_graph, get_next_op) + passing_tensors, batch_tensor_indexs, sub_cutting_points = _get_passing_tensor_list( + cutting_point_list, get_next_op + ) + sub_graph_def, input_names, output_names = self._get_sub_graph(passing_tensors, sub_cutting_points) + is_training = BaseSparseEmbedding.get_anchor_attribute(input_tensor, ASCAnchorAttr.IS_TRAINING) + + record = _AnchorRecord( + replacement_specs, + passing_tensors, + batch_tensor_indexs, + sub_cutting_points, + sub_graph_def, + input_names, + output_names, + is_training, + ) + get_next_op_map[get_next_op] = record + + export_pb_graph(f"cut_graph_{get_next_op.name}.pbtxt", self._dump_graph, graph_def=sub_graph_def) + + return get_next_op_map + + def _get_sub_graph( + self, input_tensors: List[Tensor], output_tensors: List[Tensor] + ) -> Tuple[GraphDef, List[str], List[str]]: + input_tensors = check_and_force_list(input_tensors, tf.Tensor) + output_tensors = check_and_force_list(output_tensors, tf.Tensor) + input_op_name_list = [tensor.op.name for tensor in input_tensors] + output_op_name_list = [tensor.op.name for tensor in output_tensors] + + graph_def = self._full_graph.as_graph_def() + cut_graph_input = tf.compat.v1.graph_util.extract_sub_graph(graph_def, input_op_name_list) + cut_graph_output = tf.compat.v1.graph_util.extract_sub_graph(graph_def, output_op_name_list) + + node_list = [] + node_list_input = cut_graph_input.node + node_list_output = cut_graph_output.node + for node in node_list_output: + if node not in node_list_input: + node_list.append(node) + + sub_graph_def = tf.compat.v1.GraphDef() + sub_graph_def.node.extend(node_list) + + input_name_list = [tensor.name for tensor in input_tensors] + output_name_list = [tensor.name for tensor in output_tensors] + + return sub_graph_def, input_name_list, output_name_list + + def _get_src_dataset(self, get_next_op: Operation, is_training: bool) -> DatasetV1Adapter: + """ + 根据`IteratorGetNext`算子在计算图中找出原始dataset. + + Args: + get_next_op: `IteratorGetNext`算子 + is_training: 当前是否为训练模式,训练模式为True,否则为False + + Returns: 原始数据集 + + """ + + try: + target_op = utils.find_trans_dataset(self._full_graph, get_next_op) + except (ValueError, TypeError, RuntimeError) as err: + logger.warning("The dataset op was not found, the error is `%s`. Start to traverse the operations.", err) + graph = self._full_graph + dataset_op_list = [op for op in graph.get_operations() if AnchorDatasetOp.PREFETCH_DATASET.value in op.name] + + # WARN: Couple with NoGradSubGraphSlicer::_find_old_dataset. + dataset_op_list = list( + filter( + lambda op: op not in self._full_graph.get_collection(DeprecatedOp.DEPRECATED_PREFETCH_DATASET), + dataset_op_list, + ) + ) + dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) + + logger.debug( + "In get_src_dataset function, current mode(train: True, eval: False): %s, dataset_op_list: %s.", + is_training, + dataset_op_list, + ) + + if len(dataset_op_list) == 1: + target_op = dataset_op_list[0] + elif is_training and len(dataset_op_list) == 2: + prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) + target_op = prefetch_dataset_op_list[0] + elif not is_training and len(dataset_op_list) == 3: + prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) + target_op = prefetch_dataset_op_list[1] + else: + raise RuntimeError( + f"'{AnchorDatasetOp.PREFETCH_DATASET.value}' not found, got transformation datasets: " + f"{dataset_op_list}." 
+ ) from err + except Exception as err: + raise RuntimeError(f"The dataset was not found, the error is `{err}`.") from err + + if not target_op.outputs: + raise ValueError(f"The length of the outputs of target op `{target_op}` is 0.") + logger.debug("Find target op `%s`, and output is `%s`.", target_op.name, target_op.outputs) + src_dataset = utils.find_target_instance_dataset(self._full_graph, target_op.outputs[0]) + return src_dataset + + def _get_tgt_dataset( + self, + src_dataset: DatasetV1Adapter, + sub_cutting_point_list: List[Tensor], + record: _AnchorRecord, + prefetch: int = 10, + ) -> DatasetV1Adapter: + """ + 根据原始数据集生成新的数据集实例. + + Args: + src_dataset: 原始数据集实例 + sub_cutting_point_list: 打桩的lookup ids列表 + records: 记录被打桩ids对应输入/输出算子、子图关系等信息的字典 + dump_graph: 是否dump计算图,默认为False + prefetch: dataset预取数据量,默认为10 + + Returns: 新数据集实例 + + """ + + librec = import_host_pipeline_ops(LIBREC_EOS_OPS_SO) + channel_id = ConfigInitializer.get_instance().train_params_config.get_training_mode_channel_id( + record.is_training + ) + # 在数据读取完时,通过EosDataset向acl数据通道发送end_of_sequence + max_train_steps = ConfigInitializer.get_instance().max_steps + max_eval_steps = ConfigInitializer.get_instance().eval_steps + src_dataset = src_dataset.eos_map(librec, channel_id, max_train_steps, max_eval_steps) + + tgt_dataset = src_dataset.map( + self._get_preprocessing_map_func( + record.sub_graph_def, + record.input_names, + record.output_names, + pipeline_input_indexes=record.batch_tensor_indexs, + ) + ) + + feature_numbers = [ + BaseSparseEmbedding.get_anchor_attribute(cutting_point, ASCAnchorAttr.FEATURE_SPEC).feat_cnt + for cutting_point in sub_cutting_point_list + ] + table_names = [ + BaseSparseEmbedding.get_anchor_attribute(cutting_point, ASCAnchorAttr.FEATURE_SPEC).table_name + for cutting_point in sub_cutting_point_list + ] + tgt_dataset = tgt_dataset.map( + get_asc_insert_func( + feature_numbers=feature_numbers, + table_names=table_names, + args_index_list=record.input_indexs, + is_training=record.is_training, + dump_graph=self._dump_graph, + ) + ) + + tgt_dataset = tgt_dataset.prefetch(prefetch) + return tgt_dataset + + def _update_iterator_getnext( + self, get_next_op: Operation, tgt_dataset: DatasetV1Adapter, is_training: bool, record: _AnchorRecord + ) -> None: + """ + 用新数据集中的`IteratorGetNext`算子替换计算图中原始数据集的`IteratorGetNext`算子,即用新数据集的batch替换原始数据集的batch. + Args: + get_next_op: `IteratorGetNext`算子 + tgt_dataset: 新数据集 + is_training: 当前是否为训练模式,训练模式为True,否则为False + records: 记录被打桩ids对应输入/输出算子、子图关系等信息的字典 + + Returns: None + + """ + if not get_next_op.outputs: + raise RuntimeError("there is no tensor in the dataset. Please check the dataset and data processing.") + iterator_type = "" + if get_next_op.outputs[0].op.inputs: + iterator_type = get_next_op.outputs[0].op.inputs[0].op.type + if iterator_type == "IteratorV2": + iterator_type = utils.find_make_iterator_op(self._full_graph, get_next_op.outputs[0]).type + if iterator_type not in (AnchorIteratorOp.MAKE_ITERATOR.value, AnchorIteratorOp.ONE_SHOT_ITERATOR.value): + raise RuntimeError( + f"Only iterators `MakeIterator` and `OneShotIterator` are supported in `graph modify` mode, " + f"but the current iterator is `{iterator_type}`." 
+ ) + ConfigInitializer.get_instance().train_params_config.iterator_type = iterator_type + logger.info("The iterator type of dataset is `%s`.", iterator_type) + + if iterator_type == AnchorIteratorOp.MAKE_ITERATOR.value: + new_iterator = tgt_dataset.make_initializable_iterator() + tf.compat.v1.add_to_collection(ASCEND_CUTTING_POINT_INITIALIZER, new_iterator.initializer) + ConfigInitializer.get_instance().train_params_config.set_initializer(is_training, new_iterator.initializer) else: - graph = tf.compat.v1.get_default_graph() - for index in pipeline_input_indexes: - tensor = graph.get_tensor_by_name("args_%d:0" % index) - input_tensors.append(tensor) + new_iterator = tgt_dataset.make_one_shot_iterator() + new_batch = new_iterator.get_next() + ConfigInitializer.get_instance().train_params_config.set_target_batch(is_training, new_batch) + + try: + new_batch_tensor = list(new_batch.values())[0] + except IndexError as err: + raise IndexError("Cannot find a tensor from given batch.") from err + new_get_next_op_name = utils.upward_bfs_op(new_batch_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value).name + self._update_input_tensor_with_new_batch(record.replacement_spec, new_get_next_op_name, new_batch) + + def _update_input_tensor_with_new_batch( + self, + replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], + new_get_next_op_name: str, + new_batch: Dict[str, Tensor], + ) -> None: + """ + 用新batch中的IteratorGetNext替换计算图中老batch的IteratorGetNext. - # 以tf.import_graph_def()作为read emb key的输入,保证数据读取到传入lookup的ids过程中的特征处理关系能够保留在子图中。 - output_list = tf.import_graph_def(graph_def, input_map=dict(zip(input_names, input_tensors)), - return_elements=output_names) + Args: + replacement_specs: 记录待替换算子的dict,key为老batch的IteratorGetNext,value为以老batch作为输入的算子 + new_get_next_op_name: 新数据集的get_next算子名称 + new_batch: 新数据集的batch + + Returns: None + + """ - output_batch = [batch, tuple(output_list)] - logger.debug("In get_preprocessing_map_func, the output batch is: %s.", output_batch) - return tuple(output_batch) + for old_tensor, item in replacement_specs.items(): + for idx, operator in item: + old_tensor_name = old_tensor.name + output_index = old_tensor_name.split(":")[-1] + new_tensor_name = f"{new_get_next_op_name}:{output_index}" + new_tensor = self._full_graph.get_tensor_by_name(new_tensor_name) + try: + operator._update_input(idx, new_tensor) + except InvalidArgumentError as err: + logger.info( + "The replacement specs keys (old batch) is: %s. \n\t\t The new batch is: %s.", + replacement_specs.keys(), + new_batch, + ) + raise RuntimeError( + f"Cannot update edge, old tensor: {old_tensor}, new tensor: {new_tensor}." + ) from err - return map_func + +@para_checker_decorator( + check_option_list=[ + ("dump_graph", ClassValidator, {"classes": (bool,)}), + ] +) +def modify_graph_and_start_emb_cache(full_graph: Graph = None, dump_graph: bool = False): + modifier = _GraphModifier(full_graph=full_graph, dump_graph=dump_graph) + modifier.modify_graph_for_asc() + start_asc_pipeline() -def parse_batch(data_args: Any, data_batch: dict, key: str = None): +def _parse_batch(data_args: Any, data_batch: dict, key: str = None): """ 解析原始数据集中的batch,并将非dict格式的batch转为dict格式. 
Args: @@ -131,7 +551,7 @@ def parse_batch(data_args: Any, data_batch: dict, key: str = None): """ - if key is not None: + if key: data_batch[key] = data_tensor return @@ -141,11 +561,11 @@ def parse_batch(data_args: Any, data_batch: dict, key: str = None): # 开始解析old batch if isinstance(data_args, dict): for key, data_tensor in data_args.items(): - parse_batch(data_tensor, data_batch, key) + _parse_batch(data_tensor, data_batch, key) return if isinstance(data_args, (list, tuple)): for data_arg in data_args: - parse_batch(data_arg, data_batch, key) + _parse_batch(data_arg, data_batch, key) return if isinstance(data_args, Tensor): # 将old batch中的tensor加入到dict中 @@ -155,12 +575,12 @@ def parse_batch(data_args: Any, data_batch: dict, key: str = None): raise ValueError(f"Invalid batch type, expected: (dict, list, tuple, Tensor), got: {type(data_args)}.") -def get_input_index_list( - cutting_point_list: List[Tensor], - replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], - mapping_name_list: List[str], - base_count: int, - timestamp_index: int = None +def _get_input_index_list( + cutting_point_list: List[Tensor], + replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], + mapping_name_list: List[str], + base_count: int, + timestamp_index: int = None, ) -> List[int]: input_index_list = [] for cutting_point in cutting_point_list: @@ -179,78 +599,8 @@ def get_input_index_list( return input_index_list -def find_make_iterator_op(batch_tensor: Tensor) -> Operation: - graph = tf.compat.v1.get_default_graph() - operations = graph.get_operations() - for each_op in operations: - for input_tensor in batch_tensor.op.inputs: - if input_tensor.op.outputs and input_tensor.op.outputs[0] in list( - each_op.inputs) and each_op.type == AnchorIteratorOp.MAKE_ITERATOR.value: - logger.debug("Op MakeIterator '%s' was found.", each_op.name) - return each_op - - raise ValueError(f"op MakeIterator was not found.") - - -@performance("find_target_dataset_op") -def find_target_dataset_op(base_ops: Operation, op_type: str) -> Operation: - base_ops = check_input_list(base_ops, tf.Operation) - parent_ops = base_ops - - while_num = 0 - while True: - while_num += 1 - if while_num > MAX_WHILE_SIZE: - raise RuntimeError(f"In find_target_dataset_op function, the maximum cycle depth is greater " - f"than {MAX_WHILE_SIZE}.") - for parent_op in parent_ops: - if parent_op.type == op_type: - return parent_op - - base_ops = parent_ops - parent_ops = [] - for base_op in base_ops: - parent_ops.extend(find_parent_op(base_op)) - - if not parent_ops: - raise ValueError(f"op {op_type} was not found.") - - -def get_dataset_op(get_next_op: Operation) -> Operation: - """ - 根据`IteratorGetNext`算子从图中找到`OptimizeDataset`的dataset op. - 注: TF2没有`OptimizeDataset`,则找的是dataset的默认锚点. 
- - Args: - get_next_op: `IteratorGetNext`算子 - - Returns: TF1返回`OptimizeDataset`算子,TF2返回dataset默认锚点的算子 - - """ - - if get_next_op.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - raise TypeError(f"op '{get_next_op}' must be one instance of IteratorGetNext.") - - # looking for the MakeIterator operator which corresponds to given batch_tensor - base_op = find_make_iterator_op(get_next_op.outputs[0]) - # looking for the op which is the one before OptimizeDataset operator - if tf.__version__.startswith("1"): - optimize_dataset_op = find_target_dataset_op(base_op, AnchorDatasetOp.MODEL_DATASET.value) - target_op = find_parent_op(optimize_dataset_op) - if not target_op: - raise RuntimeError("the parent op for 'ModelDataset' op was not found.") - if target_op[0].type != AnchorDatasetOp.OPTIMIZE_DATASET.value: - raise TypeError("op OptimizeDataset was not found.") - target_op = target_op[0] - else: - # 'OptimizeDataset' is not available in TensorFlow2.X - target_op = find_target_dataset_op(base_op, AnchorDatasetOp.PREFETCH_DATASET.value) - return target_op - - -def get_passing_tensor_list( - src_tensors: List[Tensor], - target_op: Operation +def _get_passing_tensor_list( + src_tensors: List[Tensor], target_op: Operation ) -> Tuple[List[Tensor], List[int], List[Tensor]]: def get_passing_tensors(src_tensor): passing_tensors = [] @@ -259,8 +609,9 @@ def get_passing_tensor_list( while tensor_list: while_num += 1 if while_num > MAX_WHILE_SIZE: - raise RuntimeError(f"In get_passing_tensors function, the maximum cycle depth is greater " - f"than {MAX_WHILE_SIZE}.") + raise RuntimeError( + f"In get_passing_tensors function, the maximum cycle depth is greater " f"than {MAX_WHILE_SIZE}." + ) last_tensor = tensor_list.pop() if last_tensor.op is target_op: passing_tensors.append(last_tensor) @@ -269,7 +620,7 @@ def get_passing_tensor_list( return passing_tensors - src_tensors = check_input_list(src_tensors, Tensor) + src_tensors = check_and_force_list(src_tensors, Tensor) passing_tensor_list = [] sub_src_tensors = [] for tensor in src_tensors: @@ -288,83 +639,7 @@ def get_passing_tensor_list( return passing_tensor_list, output_index_list, sub_src_tensors -def find_target_instance_dataset(variant_tensor: Tensor) -> DatasetV1Adapter: - dataset_instance_list = tf.compat.v1.get_collection("dataset_group") - for ins in dataset_instance_list: - if ins._variant_tensor == variant_tensor: - if not isinstance(ins, DatasetV1Adapter): - ins = ins._input_dataset - logger.debug("Find target instance '%s', whose variant_tensor is '%s'.", ins, variant_tensor) - if not isinstance(ins.element_spec, dict) and not ( - isinstance(ins.element_spec, (list, tuple)) and len(ins.element_spec) == 2 and isinstance( - ins.element_spec[0], dict)): - raise NotImplementedError("the found dataset does not return a valid layout.") - - return ins - - raise LookupError(f"Can not find target instance, whose variant_tensor is '{variant_tensor}' respectively.") - - -def get_sub_graph( - input_tensors: List[Tensor], - output_tensors: List[Tensor] -) -> Tuple[GraphDef, List[str], List[str]]: - input_tensors = check_input_list(input_tensors, tf.Tensor) - output_tensors = check_input_list(output_tensors, tf.Tensor) - input_op_name_list = [tensor.op.name for tensor in input_tensors] - output_op_name_list = [tensor.op.name for tensor in output_tensors] - - graph_def = tf.compat.v1.get_default_graph().as_graph_def() - cut_graph_input = tf.compat.v1.graph_util.extract_sub_graph(graph_def, input_op_name_list) - cut_graph_output = 
tf.compat.v1.graph_util.extract_sub_graph(graph_def, output_op_name_list) - - node_list = [] - node_list_input = cut_graph_input.node - node_list_output = cut_graph_output.node - for node in node_list_output: - if node not in node_list_input: - node_list.append(node) - - sub_graph_def = tf.compat.v1.GraphDef() - sub_graph_def.node.extend(node_list) - - input_name_list = [tensor.name for tensor in input_tensors] - output_name_list = [tensor.name for tensor in output_tensors] - - return sub_graph_def, input_name_list, output_name_list - - -def update_input_tensor_with_new_batch(replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], - new_get_next_op_name: str, - new_batch: Dict[str, Tensor]): - """ - 用新batch中的IteratorGetNext替换计算图中老batch的IteratorGetNext. - - Args: - replacement_specs: 记录待替换算子的dict,key为老batch的IteratorGetNext,value为以老batch作为输入的算子 - new_get_next_op_name: 新数据集的get_next算子名称 - new_batch: 新数据集的batch - - Returns: None - - """ - - graph = tf.compat.v1.get_default_graph() - for old_tensor, item in replacement_specs.items(): - for idx, operator in item: - old_tensor_name = old_tensor.name - output_index = old_tensor_name.split(":")[-1] - new_tensor_name = f"{new_get_next_op_name}:{output_index}" - new_tensor = graph.get_tensor_by_name(new_tensor_name) - try: - operator._update_input(idx, new_tensor) - except InvalidArgumentError as err: - logger.info("The replacement specs keys (old batch) is: %s. \n\t\t The new batch is: %s.", - replacement_specs.keys(), new_batch) - raise RuntimeError(f"Cannot update edge, old tensor: {old_tensor}, new tensor: {new_tensor}.") from err - - -def get_dataset_tensor_count(dataset: DatasetV1Adapter) -> int: +def _get_dataset_tensor_count(dataset: DatasetV1Adapter) -> int: """ 获取数据集中batch的tensor数量. @@ -378,12 +653,37 @@ def get_dataset_tensor_count(dataset: DatasetV1Adapter) -> int: src_element_spec = dataset.element_spec if not isinstance(src_element_spec, (list, tuple)): src_element_spec = [src_element_spec] - src_sorted_keys = make_sorted_key_to_tensor_list(src_element_spec, []) + src_sorted_keys = utils.make_sorted_key_to_tensor_list(src_element_spec, []) return len(src_sorted_keys) -def change_ext_emb_size_by_opt(optimizer): +def _get_timestamp_index(graph: Graph, get_next_op: Operation, is_training: bool) -> int: + timestamp_tensor_list = graph.get_collection(ASCEND_TIMESTAMP) + timestamp_index = None + for timestamp in timestamp_tensor_list: + if timestamp in get_next_op.outputs: + timestamp_index = int(timestamp.name.split(":")[1]) + timestamp_feature_spec = ConfigInitializer.get_instance().feature_spec_config.get_feature_spec("timestamp") + if timestamp_feature_spec is None: + timestamp_feature_spec = FeatureSpec("timestamp", index_key=timestamp_index, is_timestamp=True) + timestamp_feature_spec.include_timestamp(is_training) + ConfigInitializer.get_instance().feature_spec_config.insert_feature_spec( + timestamp_feature_spec, is_training + ) + break + + if timestamp_feature_spec.index_key != timestamp_index: + raise ValueError( + f"Given timestamp_index, which is {timestamp_index}, does not match index " + f"key. Please double check." 
+ ) + timestamp_feature_spec.include_timestamp(is_training) + break + return timestamp_index + + +def _change_ext_emb_size_by_opt(optimizer): for _, table_instance in ConfigInitializer.get_instance().sparse_embed_config.table_instance_dict.items(): # When dynamic expansion mode, ext_emb_size is set by optimizer if ConfigInitializer.get_instance().use_dynamic_expansion or not table_instance.is_hbm: @@ -391,197 +691,36 @@ def change_ext_emb_size_by_opt(optimizer): logger.info("ext_emb_size is reset to be %s in change_ext_emb_size_by_opt", table_instance.ext_emb_size) -@para_checker_decorator( - check_option_list=[("dump_graph", ClassValidator, {"classes": (bool,)})] -) -def modify_graph_and_start_emb_cache(dump_graph: bool = False): - modify_graph_for_asc(dump_graph=dump_graph) - start_asc_pipeline() - - -def generate_get_next_op_specs( - cutting_point_list: List[Tensor], - dump_graph: bool = False -) -> Dict[Tensor, AnchorRecord]: - get_next_op_map = defaultdict(dict) - - for input_tensor in cutting_point_list: - get_next_op = find_target_dataset_op(input_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value) - if get_next_op not in get_next_op_map: - logger.debug("find a new get_next_op named '%s'", get_next_op.name) - - replacement_specs = record_ops_to_replace(get_next_op) - passing_tensors, batch_tensor_indexs, sub_cutting_points = \ - get_passing_tensor_list(cutting_point_list, get_next_op) - sub_graph_def, input_names, output_names = get_sub_graph(passing_tensors, sub_cutting_points) - is_training = BaseSparseEmbedding.get_anchor_attribute(input_tensor, ASCAnchorAttr.IS_TRAINING) - - record = AnchorRecord( - replacement_specs, - passing_tensors, - batch_tensor_indexs, - sub_cutting_points, - sub_graph_def, - input_names, - output_names, - is_training - ) - get_next_op_map[get_next_op] = record - - export_pb_graph(f"cut_graph_{get_next_op.name}.pb", dump_graph, graph_def=sub_graph_def) - - return get_next_op_map - - -def get_src_dataset(get_next_op: Operation, is_training: bool) -> DatasetV1Adapter: - """ - 根据`IteratorGetNext`算子在计算图中找出原始dataset. - - Args: - get_next_op: `IteratorGetNext`算子 - is_training: 当前是否为训练模式,训练模式为True,否则为False - - Returns: 原始数据集 - - """ +def _get_variable_and_slot_list(each_var, slot_num, table_name, channel_id): + variable_and_slot_list = [each_var] + if slot_num == 0: + return variable_and_slot_list - try: - target_op = get_dataset_op(get_next_op) - except (ValueError, TypeError, RuntimeError) as err: - logger.warning("The dataset op was not found, the error is `%s`. Start to traverse the operations.", err) - graph = tf.compat.v1.get_default_graph() - dataset_op_list = [op for op in graph.get_operations() if AnchorDatasetOp.PREFETCH_DATASET.value in op.name] - - # WARN: Couple with NoGradSubGraphSlicer::_find_old_dataset. 
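# (The fallback below scans every op whose name contains 'PrefetchDataset',
# drops the ones already deprecated by the slicer, and sorts the survivors by
# name. The selection rule is then, roughly: one candidate, take it; two
# candidates, take the first in training mode and the second in eval mode;
# three candidates in eval mode, take the middle one; anything else raises
# RuntimeError.)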
- dataset_op_list = list( - filter(lambda op: op not in tf.compat.v1.get_collection(DeprecatedOp.DEPRECATED_PREFETCH_DATASET), - dataset_op_list) + # 通过apply_gradients创建optimizer + optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(table_name) + if optimizer is None and channel_id == 0: + raise RuntimeError( + "In training mode, table_instance should have been set_optimizer_for_table " + "before modify_graph, please check whether apply_gradients is performed" ) - dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) - - logger.debug("In get_src_dataset function, current mode(train: True, eval: False): %s, dataset_op_list: %s.", - is_training, dataset_op_list) - - if len(dataset_op_list) == 1: - target_op = dataset_op_list[0] - elif is_training and len(dataset_op_list) == 2: - prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) - target_op = prefetch_dataset_op_list[0] - elif not is_training and len(dataset_op_list) == 2: - prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) - target_op = prefetch_dataset_op_list[1] - elif not is_training and len(dataset_op_list) == 3: - prefetch_dataset_op_list = sorted(dataset_op_list, key=lambda op: op.name) - target_op = prefetch_dataset_op_list[1] - else: - raise RuntimeError(f"'{AnchorDatasetOp.PREFETCH_DATASET.value}' not found, got transformation datasets: " - f"{dataset_op_list}.") from err - except Exception as err: - raise RuntimeError(f"The dataset was not found, the error is `{err}`.") from err - - if not target_op.outputs: - raise ValueError(f"The length of the outputs of target op `{target_op}` is 0.") - logger.debug("Find target op `%s`, and output is `%s`.", target_op.name, target_op.outputs) - src_dataset = find_target_instance_dataset(target_op.outputs[0]) - return src_dataset - - -def get_tgt_dataset( - src_dataset: DatasetV1Adapter, - sub_cutting_point_list: List[Tensor], - record: AnchorRecord, - dump_graph: bool = False, - prefetch: int = 10 -) -> DatasetV1Adapter: - """ - 根据原始数据集生成新的数据集实例. 
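    (That is: build the new dataset instance from the original one. The body
    below chains, in order, an eos_map that sends end_of_sequence to the ACL
    data channel once the data is exhausted, the sliced preprocessing subgraph
    applied as a map function, the ASC insert function for the stubbed lookup
    ids, and a trailing prefetch.)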
- - Args: - src_dataset: 原始数据集实例 - sub_cutting_point_list: 打桩的lookup ids列表 - records: 记录被打桩ids对应输入/输出算子、子图关系等信息的字典 - dump_graph: 是否dump计算图,默认为False - prefetch: dataset预取数据量,默认为10 - Returns: 新数据集实例 - - """ - - librec = import_host_pipeline_ops(LIBREC_EOS_OPS_SO) - channel_id = ConfigInitializer.get_instance().train_params_config.get_training_mode_channel_id( - record.is_training) - # 在数据读取完时,通过EosDataset向acl数据通道发送end_of_sequence - max_train_steps = ConfigInitializer.get_instance().max_steps - max_eval_steps = ConfigInitializer.get_instance().eval_steps - src_dataset = src_dataset.eos_map(librec, channel_id, max_train_steps, max_eval_steps) - - tgt_dataset = src_dataset.map(get_preprocessing_map_func(record.sub_graph_def, - record.input_names, - record.output_names, - pipeline_input_indexes=record.batch_tensor_indexs)) - - feature_numbers = [BaseSparseEmbedding.get_anchor_attribute(cutting_point, ASCAnchorAttr.FEATURE_SPEC).feat_cnt for - cutting_point in sub_cutting_point_list] - table_names = [BaseSparseEmbedding.get_anchor_attribute(cutting_point, ASCAnchorAttr.FEATURE_SPEC).table_name for - cutting_point in sub_cutting_point_list] - tgt_dataset = tgt_dataset.map(get_asc_insert_func(feature_numbers=feature_numbers, - table_names=table_names, - args_index_list=record.input_indexs, - is_training=record.is_training, - dump_graph=dump_graph)) - - tgt_dataset = tgt_dataset.prefetch(prefetch) - return tgt_dataset - - -def update_iterator_getnext(get_next_op: Operation, - tgt_dataset: DatasetV1Adapter, - is_training: bool, - record: AnchorRecord): - """ - 用新数据集中的`IteratorGetNext`算子替换计算图中原始数据集的`IteratorGetNext`算子,即用新数据集的batch替换原始数据集的batch. - - Args: - get_next_op: `IteratorGetNext`算子 - tgt_dataset: 新数据集 - is_training: 当前是否为训练模式,训练模式为True,否则为False - records: 记录被打桩ids对应输入/输出算子、子图关系等信息的字典 - - Returns: None - - """ - if not get_next_op.outputs: - raise RuntimeError("there is no tensor in the dataset. 
Please check the dataset and data processing.") - iterator_type = "" - if get_next_op.outputs[0].op.inputs: - iterator_type = get_next_op.outputs[0].op.inputs[0].op.type - if iterator_type == "IteratorV2": - iterator_type = find_make_iterator_op(get_next_op.outputs[0]).type - if iterator_type not in (AnchorIteratorOp.MAKE_ITERATOR.value, AnchorIteratorOp.ONE_SHOT_ITERATOR.value): - raise RuntimeError(f"Only iterators `MakeIterator` and `OneShotIterator` are supported in `graph modify` mode, " - f"but the current iterator is `{iterator_type}`.") - ConfigInitializer.get_instance().train_params_config.iterator_type = iterator_type - logger.info("The iterator type of dataset is `%s`.", iterator_type) - - if iterator_type == AnchorIteratorOp.MAKE_ITERATOR.value: - new_iterator = tgt_dataset.make_initializable_iterator() - tf.compat.v1.add_to_collection(ASCEND_CUTTING_POINT_INITIALIZER, new_iterator.initializer) - ConfigInitializer.get_instance().train_params_config.set_initializer(is_training, new_iterator.initializer) + # predict不需要传优化器,但是如果客户创建了优化器,ddr模式加载的是维度ext_size的emb用作换入换出,所以需要给slot零值占位 + if optimizer is None and channel_id == 1: + slot_place_holder = tf.zeros_like(each_var) + for _ in range(slot_num): + variable_and_slot_list.append(slot_place_holder) else: - new_iterator = tgt_dataset.make_one_shot_iterator() - new_batch = new_iterator.get_next() - ConfigInitializer.get_instance().train_params_config.set_target_batch(is_training, new_batch) + # opt name to slot dict + for slot_dict in optimizer.values(): + for slot_val in slot_dict.values(): + variable_and_slot_list.append(slot_val) - try: - new_batch_tensor = list(new_batch.values())[0] - except IndexError as err: - raise IndexError("Cannot find a tensor from given batch.") from err - new_get_next_op_name = find_target_dataset_op(new_batch_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value).name - update_input_tensor_with_new_batch(record.replacement_spec, new_get_next_op_name, new_batch) + return variable_and_slot_list -def get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: list, swap_len: int, swap_pos: list, - channel_id: int) -> list: +def _get_swap_info( + table_instance: BaseSparseEmbedding, variable_and_slot_list: list, swap_len: int, swap_pos: list, channel_id: int +) -> list: """ Get swap info if threshold is configured. 
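    (Rough flow of the swap path built below: host-side embedding rows arrive
    through the '{table}_h2d_{channel}' get_next channel and are truncated to
    swap_len rows where necessary; the current device rows at swap_pos are
    gathered from the table and every optimizer slot, concatenated, and pushed
    back through the '{table}_d2h_{channel}' outfeed; the incoming rows are
    then split per table/slot and written in place with scatter_nd_update.)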
:param table_instance: BaseSparseEmbedding @@ -600,11 +739,12 @@ def get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: l swap_in = [tf.no_op()] else: with tf.compat.v1.variable_scope("h2d_emb"): - logger.debug('Channel %s_h2d_%s was built for getnext', table_instance.table_name, channel_id) + logger.debug("Channel %s_h2d_%s was built for getnext", table_instance.table_name, channel_id) h2d_emb = npu_ops.gen_npu_ops.get_next( output_types=[tf.float32], output_shapes=[[max_lookup_vec_size, table_instance.ext_emb_size]], - channel_name=f'{table_instance.table_name}_h2d_{channel_id}')[0] + channel_name=f"{table_instance.table_name}_h2d_{channel_id}", + )[0] logger.debug("h2d_emb shape: %s", h2d_emb) if not isinstance(variable_and_slot_list, list): raise RuntimeError("When enable emb_transfer, optimizer should have slots") @@ -613,184 +753,22 @@ def get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: l h2d_emb = h2d_emb[0:swap_len, :] swap_outs = [tf.gather(one_table, swap_pos) for one_table in variable_and_slot_list] swap_out = tf.concat(swap_outs, axis=1) - logger.debug('Channel %s_d2h_%s was built for op outfeed.', table_instance.table_name, channel_id) + logger.debug("Channel %s_d2h_%s was built for op outfeed.", table_instance.table_name, channel_id) swap_out_op = npu_ops.outfeed_enqueue_op( - channel_name=f'{table_instance.table_name}_d2h_{channel_id}', inputs=[swap_out]) + channel_name=f"{table_instance.table_name}_d2h_{channel_id}", inputs=[swap_out] + ) with tf.control_dependencies([swap_out_op]): nd_swap_pos = tf.expand_dims(swap_pos, 1) table_num = len(variable_and_slot_list) h2d_emb_split = tf.split(h2d_emb, table_num, axis=1) optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name( - table_instance.table_name) + table_instance.table_name + ) if optimizer is None and channel_id == 1: swap_in = [tf.compat.v1.scatter_nd_update(variable_and_slot_list[0], nd_swap_pos, h2d_emb_split[0])] else: - swap_in = [tf.compat.v1.scatter_nd_update(variable_and_slot_list[i], nd_swap_pos, h2d_emb_split[i]) - for i in range(len(variable_and_slot_list))] + swap_in = [ + tf.compat.v1.scatter_nd_update(variable_and_slot_list[i], nd_swap_pos, h2d_emb_split[i]) + for i in range(len(variable_and_slot_list)) + ] return swap_in - - -def get_variable_and_slot_list(each_var, slot_num, table_name, channel_id): - variable_and_slot_list = [each_var] - if slot_num == 0: - return variable_and_slot_list - - # 通过apply_gradients创建optimizer - optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(table_name) - if optimizer is None and channel_id == 0: - raise RuntimeError("In training mode, table_instance should have been set_optimizer_for_table " - "before modify_graph, please check whether apply_gradients is performed") - - # predict不需要传优化器,但是如果客户创建了优化器,ddr模式加载的是维度ext_size的emb用作换入换出,所以需要给slot零值占位 - if optimizer is None and channel_id == 1: - slot_place_holder = tf.zeros_like(each_var) - for _ in range(slot_num): - variable_and_slot_list.append(slot_place_holder) - else: - # opt name to slot dict - for slot_dict in optimizer.values(): - for slot_val in slot_dict.values(): - variable_and_slot_list.append(slot_val) - - return variable_and_slot_list - - -def modify_graph_for_ddr(get_next_op_map): - # 通过create_hash_optimizer创建optimizer_instance - optimizer_instance = ConfigInitializer.get_instance().optimizer_config.optimizer_instance - # predict - if optimizer_instance is None: - slot_num = 0 - else: - # 
ddr和扩容需要在获取优化器后重置ext - change_ext_emb_size_by_opt(optimizer_instance) - slot_num = optimizer_instance.slot_num - - for _, record in get_next_op_map.items(): - is_training = record.is_training - channel_id = 0 if is_training else 1 - - swap_args = SwapArgs() - sparse_variables = tf.compat.v1.get_collection( - ConfigInitializer.get_instance().train_params_config.ascend_global_hashtable_collection) - - for each_var in sparse_variables: - table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(each_var) - if table_instance.is_hbm: - continue - swap_args_dict = swap_args.swap_config_dict[table_instance.table_name][channel_id] - swap_pos = swap_args_dict['swap_pos'] - swap_len = swap_args_dict['swap_len'] - variable_and_slot_list = get_variable_and_slot_list(each_var, slot_num, table_instance.table_name, - channel_id) - - swap_op = get_swap_info(table_instance, variable_and_slot_list, swap_len, swap_pos, channel_id) - swap_control_dict = swap_args.swap_control_dict[table_instance.table_name][channel_id] - if "control_ops" not in swap_control_dict: - raise ValueError("Missing Required key in modify_graph_for_asc: control_ops") - control_ops = swap_control_dict['control_ops'] - replace_anchor_control(control_ops, swap_op) - - -@performance("graph_modifier") -def modify_graph_for_asc(dump_graph: bool = False, prefetch: int = 10): - cutting_point_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE) - check_cutting_points(cutting_point_list) - if not cutting_point_list: - logger.warning("Nothing to revise.") - return - - export_pb_graph("old_graph.pb", dump_graph) - get_next_op_map = generate_get_next_op_specs(cutting_point_list, dump_graph) - logger.debug("In modify_graph_for_asc function, get_next_op_map.len: %d, get_next_op_map.key: %s.", - len(get_next_op_map), get_next_op_map.keys()) - - for get_next_op, record in get_next_op_map.items(): - is_training = record.is_training - - # get source dataset - src_dataset = get_src_dataset(get_next_op, is_training) - - # generate target dataset - timestamp_index = get_timestamp_index(get_next_op, is_training) - original_batch_tensor_count = get_dataset_tensor_count(src_dataset) - sub_cutting_points = record.sub_cutting_points - input_index_list = get_input_index_list(sub_cutting_points, - record.replacement_spec, - record.output_names, - original_batch_tensor_count, timestamp_index=timestamp_index) - record.input_indexs = input_index_list - tgt_dataset = get_tgt_dataset(src_dataset, sub_cutting_points, record, - dump_graph=dump_graph, prefetch=prefetch) - - # update the batch of dataset - update_iterator_getnext(get_next_op, tgt_dataset, is_training, record) - - # In eval mode, backward is not required. In addition, compute gradients is not executed when - # only eval is used. Therefore, `do_merge_lookup` needs to be invoked during modify graph. - if not is_training: - do_merge_lookup(is_train=False) - if 'evaluate' in ConfigInitializer.get_instance().train_params_config.bool_gauge_set: - logger.debug("In estimator mode, eval re-creates graph each time, so the flag needs to be cleared.") - ConfigInitializer.get_instance().train_params_config.insert_merged_multi_lookup(is_training, False) - # In training mode, `do_merge_lookup` should have been executed in compute gradients phase. - if is_training and not ConfigInitializer.get_instance().train_params_config.get_merged_multi_lookup(True): - raise RuntimeError("In training mode, `do_merge_lookup` should have been executed in compute gradients " - "phase. 
Please check whether compute gradients is performed.") - # ddr - modify_graph_for_ddr(get_next_op_map) - - logger.info("Graph has been revised.") - export_pb_graph("new_graph.pb", dump_graph) - - -def get_timestamp_index(get_next_op: Operation, is_training: bool) -> int: - timestamp_tensor_list = tf.compat.v1.get_collection(ASCEND_TIMESTAMP) - timestamp_index = None - for timestamp in timestamp_tensor_list: - if timestamp in get_next_op.outputs: - timestamp_index = int(timestamp.name.split(":")[1]) - timestamp_feature_spec = ConfigInitializer.get_instance().feature_spec_config.get_feature_spec("timestamp") - if timestamp_feature_spec is None: - timestamp_feature_spec = FeatureSpec("timestamp", index_key=timestamp_index, is_timestamp=True) - timestamp_feature_spec.include_timestamp(is_training) - ConfigInitializer.get_instance().feature_spec_config.insert_feature_spec(timestamp_feature_spec, - is_training) - break - - if timestamp_feature_spec.index_key != timestamp_index: - raise ValueError(f"Given timestamp_index, which is {timestamp_index}, does not match index " - f"key. Please double check.") - timestamp_feature_spec.include_timestamp(is_training) - break - return timestamp_index - - -class GraphModifierHook(tf.estimator.SessionRunHook): - @para_checker_decorator( - check_option_list=[ - ("dump_graph", ClassValidator, {"classes": (bool,)}), - ("modify_graph", ClassValidator, {"classes": (bool,)}) - ] - ) - def __init__(self, dump_graph=False, modify_graph=True): - self._dump_graph = dump_graph - self._modify_graph = modify_graph - self._iterator_type = "" - ConfigInitializer.get_instance().train_params_config.is_graph_modify_hook_running = True - - def begin(self): - if self._modify_graph: - modify_graph_and_start_emb_cache(dump_graph=self._dump_graph) - else: - start_asc_pipeline() - - self._iterator_type = ConfigInitializer.get_instance().train_params_config.iterator_type - if self._modify_graph and self._iterator_type not in (AnchorIteratorOp.MAKE_ITERATOR.value, - AnchorIteratorOp.ONE_SHOT_ITERATOR.value): - raise ValueError("the value of iterator type should be like `MakeIterator` or `OneShotIterator`.") - logger.debug("In GraphModifierHook, iterator type is `%s`.", self._iterator_type) - - def after_create_session(self, session, coord): - if self._modify_graph and self._iterator_type == AnchorIteratorOp.MAKE_ITERATOR.value: - session.run(tf.compat.v1.get_collection(ASCEND_CUTTING_POINT_INITIALIZER)) diff --git a/mx_rec/graph/slicers.py b/mx_rec/graph/slicers.py index a4014195..c86e60f1 100644 --- a/mx_rec/graph/slicers.py +++ b/mx_rec/graph/slicers.py @@ -24,14 +24,15 @@ import tensorflow as tf from tensorflow import Operation, Tensor, SparseTensor, Graph, variant, resource from tensorflow.python.data.ops.dataset_ops import DatasetV1Adapter -from mx_rec.graph import utils, modifier +from mx_rec.graph import utils from mx_rec.util.log import logger from mx_rec.validator.validator import ClassValidator, para_checker_decorator from mx_rec.constants.constants import ( + ASCAnchorAttr, ASCEND_TIMESTAMP, MAX_WHILE_SIZE, - ASCAnchorAttr, ASCEND_SPARSE_LOOKUP_ENTRANCE, + ORPHAN_LOOKUP_KEY_PREFIX ) from mx_rec.graph.constants import DeprecatedOp, AnchorDatasetOp, AnchorIteratorOp from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding @@ -145,22 +146,6 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): res.add(base_ops) out_op_to_edge_ops[output_consumer] = res - @staticmethod - def _upward_bfs_op(base_ops: Union[Operation, Set[Operation], List[Operation]], tgt_op_type: 
str) -> Operation: - if not isinstance(base_ops, (set, list)): - base_ops = [base_ops] - - parent_ops = base_ops - while True: - for parent_op in parent_ops: - if parent_op.type == tgt_op_type: - return parent_op - base_ops = parent_ops - parent_ops = [] - for base_op in base_ops: - parent_ops.extend(utils.find_parent_op(base_op)) - if not parent_ops: - raise ValueError(f"target operation '{tgt_op_type}'' was not found.") @staticmethod def _topo_sort_sliced_ops(sliced_ops: Set[Operation]) -> List[Operation]: @@ -386,9 +371,9 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): old_get_next: The old 'IteratorGetNext' operation. """ - old_get_next = self._upward_bfs_op(sliceable_ops, AnchorIteratorOp.ITERATOR_GET_NEXT.value) + old_get_next = utils.upward_bfs_op(sliceable_ops, AnchorIteratorOp.ITERATOR_GET_NEXT.value) - tf.compat.v1.add_to_collection(DeprecatedOp.DEPRECATED_ITERATOR_GET_NEXT, old_get_next) + self._full_graph.add_to_collection(DeprecatedOp.DEPRECATED_ITERATOR_GET_NEXT, old_get_next) logger.info("Old 'IteratorGetNext' operation has been deprecated now.") return old_get_next @@ -412,7 +397,7 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): tgt_trans_dataset = None try: - tgt_trans_dataset = self._find_trans_dataset(get_next) + tgt_trans_dataset = utils.find_trans_dataset(self._full_graph, get_next) except (ValueError, TypeError, RuntimeError) as err: trans_datasets = [ op for op in self._full_graph.get_operations() if AnchorDatasetOp.PREFETCH_DATASET.value in op.name @@ -442,39 +427,10 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): # WARN: Couple with modifier module, global collection used for filtering deprecated prefetch dataset. self._full_graph.add_to_collection(DeprecatedOp.DEPRECATED_PREFETCH_DATASET, tgt_trans_dataset) - old_dataset = modifier.find_target_instance_dataset(tgt_trans_dataset.outputs[0]) + old_dataset = utils.find_target_instance_dataset(self._full_graph, tgt_trans_dataset.outputs[0]) return old_dataset - def _find_trans_dataset(self, get_next: Operation) -> Operation: - """Find the transformation dataset through 'get_next'. - - Args: - get_next: The old 'IteratorGetNext' operation. - - Returns: - trans_dataset: The target transformation dataset. 
- """ - - if get_next.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value: - raise TypeError(f"operation '{get_next}' must be one instance of 'IteratorGetNext'.") - - make_iter = modifier.find_make_iterator_op(get_next.outputs[0]) - - trans_dataset = None - if tf.__version__.startswith("1"): - optimize_dataset_op = self._upward_bfs_op(make_iter, AnchorDatasetOp.MODEL_DATASET.value) - trans_dataset = utils.find_parent_op(optimize_dataset_op) - if not trans_dataset: - raise RuntimeError("parent operation of 'ModelDataset' was not found.") - if trans_dataset[0].type != AnchorDatasetOp.OPTIMIZE_DATASET.value: - raise TypeError(f"operation 'OptimizeDataset' was not found.") - trans_dataset = trans_dataset[0] - else: - trans_dataset = self._upward_bfs_op(make_iter, AnchorDatasetOp.PREFETCH_DATASET.value) - - return trans_dataset - def _clone_subgraph_into_funcgraph( self, sliced_ops: Set[Operation], @@ -546,7 +502,7 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): if old_get_next.inputs: iter_type = old_get_next.inputs[0].op.type if iter_type == AnchorIteratorOp.ITERATOR_V2.value: - iter_type = modifier.find_make_iterator_op(old_get_next.outputs[0]).type + iter_type = utils.find_make_iterator_op(self._full_graph, old_get_next.outputs[0]).type if iter_type not in (AnchorIteratorOp.MAKE_ITERATOR.value, AnchorIteratorOp.ONE_SHOT_ITERATOR.value): raise RuntimeError( f"only iterators `MakeIterator` and `OneShotIterator` are supported in `graph modify` mode, " @@ -585,7 +541,7 @@ class NoGradSubgraphSlicer(metaclass=abc.ABCMeta): except IndexError as err: raise IndexError("cannot find a tensor from given batch.") from err - new_get_next = self._upward_bfs_op(new_batch_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value) + new_get_next = utils.upward_bfs_op(new_batch_tensor.op, AnchorIteratorOp.ITERATOR_GET_NEXT.value) logger.info("Got old_new_get_next: %s.", new_get_next) return new_get_next @@ -824,8 +780,6 @@ class LookupSubgraphSlicer(NoGradSubgraphSlicer): ] ) class OrphanLookupKeySlicer(NoGradSubgraphSlicer): - SLICEABLE_ORPHAN_LOOKUP_KEY_PREFIX = "orphan" - def __init__(self, full_graph: Graph = None, info_dir: str = "orphan_slicing") -> None: """Initialize OrphanLookupKeySlicer. 
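        (An "orphan" lookup key is a lookup-ids tensor that is not produced by
        'IteratorGetNext'; such tensors are marked with the orphan name prefix
        when created, and this slicer collects sliceable ops by matching that
        prefix, now taken from the shared ORPHAN_LOOKUP_KEY_PREFIX constant.)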
Args: @@ -887,7 +841,7 @@ class OrphanLookupKeySlicer(NoGradSubgraphSlicer): ] alive_get_nexts = list( filter( - lambda op: op not in tf.compat.v1.get_collection(DeprecatedOp.DEPRECATED_ITERATOR_GET_NEXT), + lambda op: op not in self._full_graph.get_collection(DeprecatedOp.DEPRECATED_ITERATOR_GET_NEXT), all_get_nexts, ) ) @@ -928,7 +882,7 @@ class OrphanLookupKeySlicer(NoGradSubgraphSlicer): for op in min_dep_ops: if not self._validate_op(op): continue - if OrphanLookupKeySlicer.SLICEABLE_ORPHAN_LOOKUP_KEY_PREFIX not in op.name: + if ORPHAN_LOOKUP_KEY_PREFIX not in op.name: continue sliceable_ops.add(op) diff --git a/mx_rec/graph/utils.py b/mx_rec/graph/utils.py index ca328ae3..17f071ac 100644 --- a/mx_rec/graph/utils.py +++ b/mx_rec/graph/utils.py @@ -17,32 +17,64 @@ import os from collections import defaultdict -from typing import List, Dict, Union, DefaultDict, Tuple +from typing import List, Dict, Set, Union, DefaultDict, Tuple import tensorflow as tf -from tensorflow import Operation, Tensor +from tensorflow import Operation, Tensor, Graph from tensorflow.core.framework.graph_pb2 import GraphDef +from tensorflow.python.data.ops.dataset_ops import DatasetV1Adapter from tensorflow.python.framework.errors_impl import InvalidArgumentError from tensorflow.python.ops import control_flow_ops -from mx_rec.graph.slicers import OrphanLookupKeySlicer -from mx_rec.graph.constants import AnchorIteratorOp +from mx_rec.graph.constants import AnchorDatasetOp, AnchorIteratorOp from mx_rec.constants.constants import ASCAnchorAttr, DUMP_MIDIFY_GRAPH_FILE_MODE from mx_rec.core.embedding import BaseSparseEmbedding -from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType from mx_rec.util.log import logger -def check_input_list(objs: Union[object, List[object]], obj_type: type) -> Union[object, List[object]]: - if isinstance(objs, obj_type): - objs = [objs] +def find_trans_dataset(graph: Graph, get_next: Operation) -> Operation: + """Find the transformation dataset through 'get_next'. - if isinstance(objs, list): - for tensor in objs: - if not isinstance(tensor, obj_type): - raise ValueError(f"Given input parameter must be a {obj_type} or a list of {obj_type}") + Args: + get_next: The old 'IteratorGetNext' operation. + + Returns: + trans_dataset: The target transformation dataset. 
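+
+        Note: the lookup is TF-version dependent. On TF1 the walk goes upward
+        from the MakeIterator op to the 'ModelDataset' node and returns that
+        node's 'OptimizeDataset' parent; on TF2, where 'OptimizeDataset' does
+        not exist, the nearest 'PrefetchDataset' ancestor is returned instead.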
+    """
+
+    if get_next.type != AnchorIteratorOp.ITERATOR_GET_NEXT.value:
+        raise TypeError(f"operation '{get_next}' must be one instance of 'IteratorGetNext'.")
+
+    make_iter = find_make_iterator_op(graph, get_next.outputs[0])
+
+    trans_dataset = None
+    if tf.__version__.startswith("1"):
+        optimize_dataset_op = upward_bfs_op(make_iter, AnchorDatasetOp.MODEL_DATASET.value)
+        trans_dataset = find_parent_op(optimize_dataset_op)
+        if not trans_dataset:
+            raise RuntimeError("parent operation of 'ModelDataset' was not found.")
+        if trans_dataset[0].type != AnchorDatasetOp.OPTIMIZE_DATASET.value:
+            raise TypeError("operation 'OptimizeDataset' was not found.")
+        trans_dataset = trans_dataset[0]
+    else:
+        trans_dataset = upward_bfs_op(make_iter, AnchorDatasetOp.PREFETCH_DATASET.value)

-    return objs
+    return trans_dataset
+
+
+def find_make_iterator_op(graph: Graph, batch_tensor: Tensor) -> Operation:
+    operations = graph.get_operations()
+    for each_op in operations:
+        for input_tensor in batch_tensor.op.inputs:
+            if (
+                input_tensor.op.outputs
+                and input_tensor.op.outputs[0] in list(each_op.inputs)
+                and each_op.type == AnchorIteratorOp.MAKE_ITERATOR.value
+            ):
+                logger.debug("Op MakeIterator '%s' was found.", each_op.name)
+                return each_op
+
+    raise ValueError("operation `MakeIterator` cannot be found.")


 def find_parent_op(operator: Operation) -> List[Operation]:
@@ -54,6 +86,54 @@ def find_parent_op(operator: Operation) -> List[Operation]:
     return parent_ops


+def upward_bfs_op(base_ops: Union[Operation, Set[Operation], List[Operation]], tgt_op_type: str) -> Operation:
+    if not isinstance(base_ops, (set, list)):
+        base_ops = [base_ops]
+
+    parent_ops = base_ops
+    while True:
+        for parent_op in parent_ops:
+            if parent_op.type == tgt_op_type:
+                return parent_op
+        base_ops = parent_ops
+        parent_ops = []
+        for base_op in base_ops:
+            parent_ops.extend(find_parent_op(base_op))
+        if not parent_ops:
+            raise ValueError(f"target operation '{tgt_op_type}' was not found.")
+
+
+def find_target_instance_dataset(graph: Graph, variant_tensor: Tensor) -> DatasetV1Adapter:
+    dataset_instance_list = graph.get_collection("dataset_group")
+    for ins in dataset_instance_list:
+        if ins._variant_tensor == variant_tensor:
+            if not isinstance(ins, DatasetV1Adapter):
+                ins = ins._input_dataset
+            logger.debug("Find target instance '%s', whose variant_tensor is '%s'.", ins, variant_tensor)
+            if not isinstance(ins.element_spec, dict) and not (
+                isinstance(ins.element_spec, (list, tuple))
+                and len(ins.element_spec) == 2
+                and isinstance(ins.element_spec[0], dict)
+            ):
+                raise NotImplementedError("the found dataset does not return a valid layout.")
+
+            return ins
+
+    raise LookupError(f"Cannot find target instance whose variant_tensor is '{variant_tensor}'.")
+
+
+def check_and_force_list(obj: Union[object, List[object]], obj_type: type) -> Union[object, List[object]]:
+    if isinstance(obj, obj_type):
+        obj = [obj]
+
+    if isinstance(obj, list):
+        for tensor in obj:
+            if not isinstance(tensor, obj_type):
+                raise ValueError(f"Given input parameter must be a {obj_type} or a list of {obj_type}")
+
+    return obj
+
+
 def check_cutting_points(cutting_point_list: List[Tensor]):
     for tensor in cutting_point_list:
         if not isinstance(tensor, Tensor):
@@ -63,10 +143,10 @@ def check_cutting_points(cutting_point_list: List[Tensor]):
         raise ValueError(f"Cutting point can only be the output of an Operator 'Identity'.")


-def record_ops_to_replace(src_op: Operation) -> DefaultDict[Tensor, List[Tuple[int, Operation]]]:
+def 
record_ops_to_replace(graph: Graph, src_op: Operation) -> DefaultDict[Tensor, List[Tuple[int, Operation]]]: replacement_specs = defaultdict(list) output_list = src_op.outputs - op_list = tf.compat.v1.get_default_graph().get_operations() + op_list = graph.get_operations() for tensor in output_list: for operator in op_list: if tensor in operator.inputs: @@ -78,23 +158,52 @@ def record_ops_to_replace(src_op: Operation) -> DefaultDict[Tensor, List[Tuple[i def replace_anchor(replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], new_tensor_list: List[Tensor]): if len(replacement_specs) != len(new_tensor_list): - raise ValueError(f"Given replacement_specs and new_tensor_list must have the same length. " - f"replacement_specs: {replacement_specs}, new_tensor_list: {new_tensor_list}") + raise ValueError( + f"Given replacement_specs and new_tensor_list must have the same length. " + f"replacement_specs: {replacement_specs}, new_tensor_list: {new_tensor_list}" + ) for tensor_idx, (old_tensor, items) in enumerate(replacement_specs.items()): for input_idx, operator in items: try: operator._update_input(input_idx, new_tensor_list[tensor_idx]) except InvalidArgumentError as err: - logger.info("The replacement specs keys (old batch) is: %s. \n\t\t The new_tensor_list is: %s.", - replacement_specs.keys(), new_tensor_list) - raise RuntimeError(f"Cannot update edge, old tensor: {old_tensor}, " - f"new tensor: {new_tensor_list[tensor_idx]}.") from err + logger.info( + "The replacement specs keys (old batch) is: %s. \n\t\t The new_tensor_list is: %s.", + replacement_specs.keys(), + new_tensor_list, + ) + raise RuntimeError( + f"Cannot update edge, old tensor: {old_tensor}, " f"new tensor: {new_tensor_list[tensor_idx]}." + ) from err + + +def replace_anchor_control(graph: Graph, place_holder_control: tf.Operation, real_anchor: Tensor): + """ + 将place_holder_control替换为入参real_anchor. + + Args: + place_holder_control: control op + real_anchor: 用来替换打桩节点的tensor + Returns: None -def record_control_to_replace(src_op: Operation) -> DefaultDict[Tensor, List[Tuple[int, Operation]]]: + """ + + if place_holder_control is None: + raise RuntimeError( + f"Node place_holder_control does not exist. Check whether the sparse lookup interface " + f"is correctly invoked." 
+ ) + # find the op with stub node as the input + replacement_specs_for_anchor_vec = record_control_to_replace(graph, place_holder_control) + # replace anchor_vec with anchor + replace_control_anchor(replacement_specs_for_anchor_vec, real_anchor) + + +def record_control_to_replace(graph: Graph, src_op: Operation) -> DefaultDict[Tensor, List[Tuple[int, Operation]]]: replacement_specs = defaultdict(list) - op_list = tf.compat.v1.get_default_graph().get_operations() + op_list = graph.get_operations() for operator in op_list: if src_op in operator.control_inputs: input_index = operator.control_inputs.index(src_op) @@ -103,8 +212,9 @@ def record_control_to_replace(src_op: Operation) -> DefaultDict[Tensor, List[Tup return replacement_specs -def replace_control_anchor(replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], - new_tensor_list: List[Tensor]): +def replace_control_anchor( + replacement_specs: DefaultDict[Tensor, List[Tuple[int, Operation]]], new_tensor_list: List[Tensor] +): for tensor_idx, (old_tensor, items) in enumerate(replacement_specs.items()): for _, operator in items: @@ -112,43 +222,48 @@ def replace_control_anchor(replacement_specs: DefaultDict[Tensor, List[Tuple[int control_op = control_flow_ops.group(new_tensor_list) operator._add_control_input(control_op) except InvalidArgumentError as err: - logger.info("The replacement control specs keys (old batch) is: %s. \n\t\t The new_tensor_list is: %s.", - replacement_specs.keys(), new_tensor_list) - raise RuntimeError(f"Cannot update edge, old tensor: {old_tensor}, " - f"new tensor: {new_tensor_list[tensor_idx]}.") from err + logger.info( + "The replacement control specs keys (old batch) is: %s. \n\t\t The new_tensor_list is: %s.", + replacement_specs.keys(), + new_tensor_list, + ) + raise RuntimeError( + f"Cannot update edge, old tensor: {old_tensor}, " f"new tensor: {new_tensor_list[tensor_idx]}." + ) from err -def export_pb_graph(file_name: str, - dump_graph: bool = False, - graph_def: GraphDef = None, - export_path: str = "./export_graph", - as_text: bool = True): +def replace_anchor_vec(graph: Graph, cutting_point: Tensor, attribute: ASCAnchorAttr, anchor: Tensor): """ - Save tensorflow graph before and after modifier graph - :param file_name: FileName of the graph - :param dump_graph: Is serialize graph or not - :param graph_def: A Graph or a GraphDef protocol buffer. - :param export_path: Directory where to write the graph. - This can refer to remote filesystems, such as Google Cloud Storage (GCS). - :param as_text: If True, writes the graph as an ASCII proto - :return: None + 根据打桩节点的名字找到以此为输入的op,并将该op的输入替换为入参anchor. + + Args: + cutting_point: sparse lookup查询的ids + attribute: 被替换的打桩节点的名字 + anchor: 用来替换打桩节点的tensor + + Returns: None + """ - if dump_graph: - dir_path = os.path.dirname(os.path.join(export_path, file_name)) - os.makedirs(dir_path, mode=DUMP_MIDIFY_GRAPH_FILE_MODE, exist_ok=True) - graph_def = graph_def if graph_def else tf.compat.v1.get_default_graph().as_graph_def() - tf.io.write_graph(graph_def, export_path, file_name, as_text) + + # get stub node + anchor_vec = BaseSparseEmbedding.get_anchor_attribute(cutting_point, attribute) + if anchor_vec is None: + raise RuntimeError( + f"Node `{attribute.value}` does not exist. Check whether the sparse lookup interface " + f"is correctly invoked." 
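+            # Rewiring mechanics (see the helpers above): record_ops_to_replace
+            # collects, per output tensor of the stub op, every
+            # (input_index, consumer_op) pair in the graph; replace_anchor then
+            # calls consumer._update_input(index, new_tensor) for each record,
+            # splicing the stub out without rebuilding the graph.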
+ ) + # find the op with stub node as the input + replacement_specs_for_anchor_vec = record_ops_to_replace(graph, anchor_vec.op) + # replace anchor_vec with anchor + replace_anchor(replacement_specs_for_anchor_vec, [anchor]) def make_sorted_key_to_tensor_list( - element_spec: List[Dict[str, Tensor]], - sorted_keys: List[str], - prefix: str = "" + element_spec: List[Dict[str, Tensor]], sorted_keys: List[str], prefix: str = "" ) -> List[str]: if isinstance(element_spec, tf.TensorSpec): sorted_keys.append(prefix) return sorted_keys - elif isinstance(element_spec, dict): for key, item in element_spec.items(): if not isinstance(key, str): @@ -169,61 +284,25 @@ def make_sorted_key_to_tensor_list( raise TypeError(f"Given element_spec, whose type is {type(element_spec)}, is invalid.") -def replace_anchor_vec(cutting_point: Tensor, attribute: ASCAnchorAttr, anchor: Tensor): +def export_pb_graph( + file_name: str, + dump_graph: bool = False, + graph_def: GraphDef = None, + export_path: str = "./export_graph", + as_text: bool = True, +): """ - 根据打桩节点的名字找到以此为输入的op,并将该op的输入替换为入参anchor. - - Args: - cutting_point: sparse lookup查询的ids - attribute: 被替换的打桩节点的名字 - anchor: 用来替换打桩节点的tensor - - Returns: None - - """ - - # get stub node - anchor_vec = BaseSparseEmbedding.get_anchor_attribute(cutting_point, attribute) - if anchor_vec is None: - raise RuntimeError(f"Node `{attribute.value}` does not exist. Check whether the sparse lookup interface " - f"is correctly invoked.") - # find the op with stub node as the input - replacement_specs_for_anchor_vec = record_ops_to_replace(anchor_vec.op) - # replace anchor_vec with anchor - replace_anchor(replacement_specs_for_anchor_vec, [anchor]) - - -def replace_anchor_control(place_holder_control: tf.Operation, real_anchor: Tensor): - """ - 将place_holder_control替换为入参real_anchor. - - Args: - place_holder_control: control op - real_anchor: 用来替换打桩节点的tensor - - Returns: None - + Save tensorflow graph before and after modifier graph + :param file_name: FileName of the graph + :param dump_graph: Is serialize graph or not + :param graph_def: A Graph or a GraphDef protocol buffer. + :param export_path: Directory where to write the graph. + This can refer to remote filesystems, such as Google Cloud Storage (GCS). + :param as_text: If True, writes the graph as an ASCII proto + :return: None """ - - if place_holder_control is None: - raise RuntimeError(f"Node place_holder_control does not exist. 
Check whether the sparse lookup interface " - f"is correctly invoked.") - # find the op with stub node as the input - replacement_specs_for_anchor_vec = record_control_to_replace(place_holder_control) - # replace anchor_vec with anchor - replace_control_anchor(replacement_specs_for_anchor_vec, real_anchor) - - -def mark_orphan_lookup_key(lookup_key: Tensor) -> Tensor: - graph_def = tf.compat.v1.get_default_graph().as_graph_def() - subgraph = tf.compat.v1.graph_util.extract_sub_graph(graph_def, [lookup_key.op.name]) - - for node in subgraph.node: - if node.op == AnchorIteratorOp.ITERATOR_GET_NEXT.value: - return lookup_key - - name_prefix = OrphanLookupKeySlicer.SLICEABLE_ORPHAN_LOOKUP_KEY_PREFIX - marked_lookup_key = tf.identity(lookup_key, name="{}/{}".format(name_prefix, lookup_key.op.name)) - - logger.info('Mark orphan lookup key %s as %s.', lookup_key, marked_lookup_key) - return marked_lookup_key + if dump_graph: + dir_path = os.path.dirname(os.path.join(export_path, file_name)) + os.makedirs(dir_path, mode=DUMP_MIDIFY_GRAPH_FILE_MODE, exist_ok=True) + graph_def = graph_def if graph_def else tf.compat.v1.get_default_graph().as_graph_def() + tf.io.write_graph(graph_def, export_path, file_name, as_text) diff --git a/tests/mx_rec/graph/test_modifier.py b/tests/mx_rec/graph/test_modifier.py index ff9a6664..25caf429 100644 --- a/tests/mx_rec/graph/test_modifier.py +++ b/tests/mx_rec/graph/test_modifier.py @@ -30,22 +30,13 @@ from mx_rec.constants.constants import ( ASCEND_TIMESTAMP, ASCAnchorAttr, ) -from mx_rec.core.asc import FeatureSpec from mx_rec.graph.modifier import ( GraphModifierHook, - AnchorRecord, - find_make_iterator_op, - find_target_dataset_op, - find_target_instance_dataset, - generate_get_next_op_specs, - get_dataset_op, - get_input_index_list, - get_passing_tensor_list, - get_preprocessing_map_func, - get_src_dataset, - get_tgt_dataset, - get_timestamp_index, - modify_graph_for_asc, + _GraphModifier, + _AnchorRecord, + _get_input_index_list, + _get_passing_tensor_list, + _get_timestamp_index, ) from tests.mx_rec.core.mock_class import MockConfigInitializer, MockSparseEmbedding, MockOptimizer from tests.mx_rec.graph.mock_dataset import gen_mock_dataset @@ -70,16 +61,19 @@ def _gen_mock_get_anchor_attribute(is_training: bool = True) -> Callable: class GetPreprocessingMapFuncTest(TestCase): + def setUp(self) -> None: + self._modifier = _GraphModifier() + def tearDown(self) -> None: tf.compat.v1.reset_default_graph() def test_err_none_names_and_indexes(self): - mock_graph_def = tf.compat.v1.GraphDef() + mock_graph_def = self._modifier._full_graph.as_graph_def() mock_input_names = [] mock_output_names = [] with self.assertRaises(ValueError): - get_preprocessing_map_func(mock_graph_def, mock_input_names, mock_output_names) + _GraphModifier._get_preprocessing_map_func(mock_graph_def, mock_input_names, mock_output_names) class GetInputIndexListTest(TestCase): @@ -93,70 +87,11 @@ class GetInputIndexListTest(TestCase): mock_base_count = 0 with self.assertRaises(ValueError): - get_input_index_list( + _get_input_index_list( mock_cutting_point_list, mock_replace_ment_specs, mock_mapping_name_list, mock_base_count ) -class FindMakeIteratorOpTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - - found_iter_op = find_make_iterator_op(mock_ids) - 
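# (These lookup-helper tests are not dropped outright: graph-aware
# replacements appear in tests/mx_rec/graph/test_utils.py later in this patch,
# exercising the relocated helpers with calls such as
# find_make_iterator_op(self._graph, mock_ids).)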
self.assertEqual(found_iter_op.type, "MakeIterator") - - def test_err_no_tgt_dataset_op(self): - mock_ids = tf.zeros(shape=(4096, 8)) - with self.assertRaises(ValueError): - find_make_iterator_op(mock_ids) - - -class FindTargetDatasetOpTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_base_op = tf.identity(mock_ids).op - - found_tgt_dataset_op = find_target_dataset_op(base_ops=mock_base_op, op_type="IteratorGetNext") - self.assertEqual(found_tgt_dataset_op, mock_ids.op) - - def test_err_no_tgt_op_type(self): - mock_ids = tf.zeros(shape=(4096, 8)) - mock_base_op = mock_ids.op - with self.assertRaises(ValueError): - find_target_dataset_op(mock_base_op, "IteratorGetNext") - - -class GetDatasetOpTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_ok(self): - mock_dataset = gen_mock_dataset() - mock_iterator = mock_dataset.make_initializable_iterator() - mock_batch = mock_iterator.get_next() - mock_ids = mock_batch.get("mock_ids") - mock_get_next_op = mock_ids.op - - found_dataset_op = get_dataset_op(mock_get_next_op) - self.assertEqual(found_dataset_op.type, "OptimizeDataset") - - def test_err_invalid_op_type(self): - mock_get_next_op = tf.zeros(shape=(4096, 8)).op - with self.assertRaises(TypeError): - get_dataset_op(mock_get_next_op) - class GetPassingTensorList(TestCase): def tearDown(self) -> None: @@ -176,7 +111,7 @@ class GetPassingTensorList(TestCase): "output_index_list": [0], "sub_src_tensors": mock_cutting_point_list, } - passing_tensor_list, output_index_list, sub_src_tensors = get_passing_tensor_list( + passing_tensor_list, output_index_list, sub_src_tensors = _get_passing_tensor_list( mock_cutting_point_list, mock_tgt_op ) self.assertEqual(passing_tensor_list, expected["passing_tensor_list"]) @@ -184,29 +119,23 @@ class GetPassingTensorList(TestCase): self.assertEqual(sub_src_tensors, expected["sub_src_tensors"]) -class FindTargetInstanceDatasetTest(TestCase): - def tearDown(self) -> None: - tf.compat.v1.reset_default_graph() - - def test_err_no_target_dataset_instance(self): - with self.assertRaises(LookupError): - find_target_instance_dataset(None) - class GetSrcDatasetTest(TestCase): + def setUp(self) -> None: + self._modifier = _GraphModifier() + def tearDown(self) -> None: tf.compat.v1.reset_default_graph() def test_ok_one_shot(self): mock_dataset = gen_mock_dataset() mock_prefetch_dataset = mock_dataset.prefetch(10) - mock_double_prefetch_dataset = mock_prefetch_dataset.prefetch(10) mock_iterator = mock_prefetch_dataset.make_one_shot_iterator() mock_batch = mock_iterator.get_next() mock_ids = mock_batch.get("mock_ids") mock_get_next_op = mock_ids.op - src_dataset = get_src_dataset(mock_get_next_op, is_training=True) + src_dataset = self._modifier._get_src_dataset(mock_get_next_op, is_training=True) self.assertEqual(src_dataset, mock_dataset) @@ -215,6 +144,9 @@ class GetSrcDatasetTest(TestCase): ConfigInitializer=Mock(return_value=MockConfigInitializer()), ) class GetTgtDatasetTest(TestCase): + def setUp(self) -> None: + self._modifier = _GraphModifier() + def tearDown(self) -> None: tf.compat.v1.reset_default_graph() @@ -233,22 +165,16 @@ class GetTgtDatasetTest(TestCase): mock_batch = mock_iterator.get_next() mock_ids = mock_batch.get("mock_ids") mock_sub_cutting_point_list = [mock_ids] - 
mock_records = AnchorRecord( - defaultdict(), - [], - [], - [], - tf.compat.v1.GraphDef(), - [], - [], - True - ) + mock_records = _AnchorRecord(defaultdict(), [], [], [], tf.compat.v1.GraphDef(), [], [], True) - tgt_dataset = get_tgt_dataset(mock_dataset, mock_sub_cutting_point_list, mock_records) + tgt_dataset = self._modifier._get_tgt_dataset(mock_dataset, mock_sub_cutting_point_list, mock_records) self.assertIsNotNone(tgt_dataset) class ModifyGraphForAscTest(TestCase): + def setUp(self) -> None: + self._modifier = _GraphModifier() + def tearDown(self) -> None: tf.compat.v1.reset_default_graph() @@ -257,9 +183,11 @@ class ModifyGraphForAscTest(TestCase): get_asc_insert_func=Mock(return_value=lambda x, y: x), ) @patch.multiple("mx_rec.graph.modifier.BaseSparseEmbedding", get_anchor_attribute=_gen_mock_get_anchor_attribute()) - @patch.multiple("mx_rec.core.asc.manager", - should_skip=MagicMock(return_value=True), - check_dangling_table=MagicMock(return_value=["test_table"])) + @patch.multiple( + "mx_rec.core.asc.manager", + should_skip=MagicMock(return_value=True), + check_dangling_table=MagicMock(return_value=["test_table"]), + ) @patch("mx_rec.graph.modifier.ConfigInitializer") def test_ok_train_mode(self, modifier_config_initializer): mock_config_initializer = MockConfigInitializer(modify_graph=True, merged_multi_lookup=True) @@ -280,7 +208,7 @@ class ModifyGraphForAscTest(TestCase): tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, mock_cutting_point) - modify_graph_for_asc() + self._modifier.modify_graph_for_asc() @patch.multiple( "mx_rec.graph.modifier", @@ -289,12 +217,13 @@ class ModifyGraphForAscTest(TestCase): ) @patch.multiple( "mx_rec.graph.modifier.BaseSparseEmbedding", - get_anchor_attribute=_gen_mock_get_anchor_attribute(is_training=False) + get_anchor_attribute=_gen_mock_get_anchor_attribute(is_training=False), ) @patch("mx_rec.graph.modifier.ConfigInitializer") def test_ok_eval_mode(self, modifier_config_initializer): - mock_config_initializer = MockConfigInitializer(modify_graph=True, merged_multi_lookup=True, - bool_gauge_set={"evaluate"}) + mock_config_initializer = MockConfigInitializer( + modify_graph=True, merged_multi_lookup=True, bool_gauge_set={"evaluate"} + ) modifier_config_initializer.get_instance = Mock(return_value=mock_config_initializer) mock_dataset = gen_mock_dataset() @@ -312,7 +241,7 @@ class ModifyGraphForAscTest(TestCase): tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, mock_cutting_point) - modify_graph_for_asc() + self._modifier.modify_graph_for_asc() @patch.multiple( "mx_rec.graph.modifier", @@ -333,10 +262,13 @@ class ModifyGraphForAscTest(TestCase): tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ENTRANCE, mock_cutting_point) with self.assertRaises(RuntimeError): - modify_graph_for_asc() + self._modifier.modify_graph_for_asc() class GetTimestampIndexTest(TestCase): + def setUp(self) -> None: + self._graph = tf.compat.v1.get_default_graph() + def tearDown(self) -> None: tf.compat.v1.reset_default_graph() @@ -358,7 +290,7 @@ class GetTimestampIndexTest(TestCase): tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, mock_timestamp) - timestamp_index = get_timestamp_index(mock_get_next_op, is_training=True) + timestamp_index = _get_timestamp_index(self._graph, mock_get_next_op, is_training=True) self.assertEqual(timestamp_index, 2) @@ -382,8 +314,9 @@ class GraphModifierHookTest(TestCase): ) @patch("mx_rec.graph.modifier.ConfigInitializer") def test_ok(self, modifier_config_initializer): - mock_config_initializer = 
MockConfigInitializer(modify_graph=True, is_graph_modify_hook_running=True, - iterator_type="MakeIterator") + mock_config_initializer = MockConfigInitializer( + modify_graph=True, is_graph_modify_hook_running=True, iterator_type="MakeIterator" + ) modifier_config_initializer.get_instance = Mock(return_value=mock_config_initializer) mock_dataset = gen_mock_dataset() @@ -406,8 +339,9 @@ class GraphModifierHookTest(TestCase): ) @patch("mx_rec.graph.modifier.ConfigInitializer") def test_err_invalid_iterator_type(self, modifier_config_initializer): - mock_config_initializer = MockConfigInitializer(modify_graph=True, is_graph_modify_hook_running=True, - iterator_type="InvalidIterator") + mock_config_initializer = MockConfigInitializer( + modify_graph=True, is_graph_modify_hook_running=True, iterator_type="InvalidIterator" + ) modifier_config_initializer.get_instance = Mock(return_value=mock_config_initializer) mock_dataset = gen_mock_dataset() diff --git a/tests/mx_rec/graph/test_utils.py b/tests/mx_rec/graph/test_utils.py index 5a4efffc..7aead90e 100644 --- a/tests/mx_rec/graph/test_utils.py +++ b/tests/mx_rec/graph/test_utils.py @@ -15,7 +15,6 @@ # limitations under the License. # ============================================================================== -import sys import os import pathlib import shutil @@ -24,42 +23,45 @@ from unittest import TestCase import tensorflow as tf from tensorflow import Tensor, TensorSpec + from mx_rec.constants.constants import ASCAnchorAttr from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.graph.utils import ( - check_input_list, + find_trans_dataset, find_parent_op, + find_make_iterator_op, + find_target_instance_dataset, + upward_bfs_op, + check_and_force_list, check_cutting_points, export_pb_graph, make_sorted_key_to_tensor_list, replace_anchor_vec, ) +from tests.mx_rec.graph.mock_dataset import gen_mock_dataset -class CheckInputListTest(TestCase): - def tearDown(self): - tf.compat.v1.reset_default_graph() - - def test_ok_single_object(self): - mock_obj = "obj" - obj_type = str - - checked_objs = check_input_list(mock_obj, obj_type) - self.assertEqual([mock_obj], checked_objs) +class FindTransDatasetTest(TestCase): + def setUp(self) -> None: + self._graph = tf.compat.v1.get_default_graph() - def test_ok_object_list(self): - mock_objs = ["obj1", "obj2", "ojb3"] - obj_type = str + def tearDown(self) -> None: + tf.compat.v1.reset_default_graph() - checked_cutting_points = check_input_list(mock_objs, obj_type) - self.assertEqual(mock_objs, checked_cutting_points) + def test_ok(self): + mock_dataset = gen_mock_dataset() + mock_iterator = mock_dataset.make_initializable_iterator() + mock_batch = mock_iterator.get_next() + mock_ids = mock_batch.get("mock_ids") + mock_get_next_op = mock_ids.op - def test_err_inconsistent_object_and_type(self): - mock_objs = ["obj1", "obj2", "ojb3"] - obj_type = Tensor + found_dataset_op = find_trans_dataset(self._graph, mock_get_next_op) + self.assertEqual(found_dataset_op.type, "OptimizeDataset") - with self.assertRaises(ValueError): - check_input_list(mock_objs, obj_type) + def test_err_invalid_op_type(self): + mock_get_next_op = tf.zeros(shape=(4096, 8)).op + with self.assertRaises(TypeError): + find_trans_dataset(self._graph, mock_get_next_op) class FindParentOpTest(TestCase): @@ -76,6 +78,64 @@ class FindParentOpTest(TestCase): self.assertEqual([mock_parent_op], parent_op) +class FindMakeIteratorOpTest(TestCase): + def setUp(self) -> None: + self._graph = tf.compat.v1.get_default_graph() + + 
def tearDown(self) -> None: + tf.compat.v1.reset_default_graph() + + def test_ok(self): + mock_dataset = gen_mock_dataset() + mock_iterator = mock_dataset.make_initializable_iterator() + mock_batch = mock_iterator.get_next() + mock_ids = mock_batch.get("mock_ids") + + found_iter_op = find_make_iterator_op(self._graph, mock_ids) + self.assertEqual(found_iter_op.type, "MakeIterator") + + def test_err_no_tgt_dataset_op(self): + mock_ids = tf.zeros(shape=(4096, 8)) + with self.assertRaises(ValueError): + find_make_iterator_op(self._graph, mock_ids) + + +class FindTargetInstanceDatasetTest(TestCase): + def setUp(self) -> None: + self._graph = tf.compat.v1.get_default_graph() + + def tearDown(self) -> None: + tf.compat.v1.reset_default_graph() + + def test_err_no_target_dataset_instance(self): + with self.assertRaises(LookupError): + find_target_instance_dataset(self._graph, None) + + +class UpwardBFSOpTest(TestCase): + def setUp(self) -> None: + self._graph = tf.compat.v1.get_default_graph() + + def tearDown(self) -> None: + tf.compat.v1.reset_default_graph() + + def test_ok(self): + mock_dataset = gen_mock_dataset() + mock_iterator = mock_dataset.make_initializable_iterator() + mock_batch = mock_iterator.get_next() + mock_ids = mock_batch.get("mock_ids") + mock_base_op = tf.identity(mock_ids).op + + found_tgt_dataset_op = upward_bfs_op(base_ops=mock_base_op, tgt_op_type="IteratorGetNext") + self.assertEqual(found_tgt_dataset_op, mock_ids.op) + + def test_err_no_tgt_op_type(self): + mock_ids = tf.zeros(shape=(4096, 8)) + mock_base_op = mock_ids.op + with self.assertRaises(ValueError): + upward_bfs_op(base_ops=mock_base_op, tgt_op_type="IteratorGetNext") + + class CheckCuttingPointsTest(TestCase): def setUp(self): self._generator_iter_times = 3 @@ -98,6 +158,32 @@ class CheckCuttingPointsTest(TestCase): check_cutting_points(mock_cutting_point_list) +class CheckAndForceListTest(TestCase): + def tearDown(self): + tf.compat.v1.reset_default_graph() + + def test_ok_single_object(self): + mock_obj = "obj" + obj_type = str + + checked_objs = check_and_force_list(mock_obj, obj_type) + self.assertEqual([mock_obj], checked_objs) + + def test_ok_object_list(self): + mock_objs = ["obj1", "obj2", "ojb3"] + obj_type = str + + checked_cutting_points = check_and_force_list(mock_objs, obj_type) + self.assertEqual(mock_objs, checked_cutting_points) + + def test_err_inconsistent_object_and_type(self): + mock_objs = ["obj1", "obj2", "ojb3"] + obj_type = Tensor + + with self.assertRaises(ValueError): + check_and_force_list(mock_objs, obj_type) + + class ExportPBGraphTest(TestCase): def setUp(self) -> None: self._dir_name = "./export_graph" @@ -162,7 +248,7 @@ class ReplaceAnchorVecTest(TestCase): anchor_vec_output = tf.identity(anchor_vec, name="anchor_vec_output") BaseSparseEmbedding.anchor_tensor_specs[mock_cutting_point][mock_attribute] = anchor_vec - replace_anchor_vec(mock_cutting_point, mock_attribute, mock_anchor) + replace_anchor_vec(tf.compat.v1.get_default_graph(), mock_cutting_point, mock_attribute, mock_anchor) self.assertEqual(anchor_vec_output.op.inputs[0], mock_anchor) -- Gitee From 61a3be346f100c067e06dcb2b70dd739d45c31a3 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 21 May 2024 19:25:34 +0800 Subject: [PATCH 155/302] =?UTF-8?q?warm=20start=20=E4=BF=AE=E6=94=B9DT?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/mx_rec/core/test_feature_process.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/tests/mx_rec/core/test_feature_process.py b/tests/mx_rec/core/test_feature_process.py index b8bb0742..787648f4 100644 --- a/tests/mx_rec/core/test_feature_process.py +++ b/tests/mx_rec/core/test_feature_process.py @@ -78,13 +78,13 @@ class TestAfterRunFuncOfEvictHookClass(TestEvictHookClass): mock_get_next.return_value = [tf.constant([8, 9], dtype=tf.int32), tf.constant(2, dtype=tf.int32)] - evict_hook = EvictHook(evict_enable=True, evict_time_interval=1) + evict_hook = EvictHook(evict_enable=True, evict_time_interval=10) with tf.compat.v1.train.MonitoredSession(hooks=[evict_hook]) as sess: sess.graph._unsafe_unfinalize() sess.run(tf.compat.v1.global_variables_initializer()) # sleep 1s 等待淘汰时间evict_time_interval - time.sleep(1) + time.sleep(10) # 获取原variable,淘汰会发生在此session run之后 ori_variable = sess.run(test_table.variable) -- Gitee From 0b3043d3dbf6852aa7409a8326455e1d188de0ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 21 May 2024 12:05:51 +0000 Subject: [PATCH 156/302] =?UTF-8?q?!150=20cleancode=20bug=20*=20=E3=80=90?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=B4=E6=98=8E=20Modification=E3=80=91cle?= =?UTF-8?q?ancode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/emb/dynamic_sparse_embedding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mx_rec/core/emb/dynamic_sparse_embedding.py b/mx_rec/core/emb/dynamic_sparse_embedding.py index 4781491c..8dfe504c 100644 --- a/mx_rec/core/emb/dynamic_sparse_embedding.py +++ b/mx_rec/core/emb/dynamic_sparse_embedding.py @@ -42,7 +42,7 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): def _get_sparse_forward_result(self, sparse_forward_fn: Callable, table: Union[tf.compat.v1.Variable, tf.Tensor], result: dict, is_training: bool) -> tf.Tensor: local_embeddings = import_host_pipeline_ops().embedding_lookup_by_address( - result.get(str(ASCAnchorAttr.ID_OFFSETS)), embedding_dim=self._emb_size, embedding_type=1) + result.get(str(ASCAnchorAttr.ID_OFFSETS.value)), embedding_dim=self._emb_size, embedding_type=1) add_collection_condition = is_training and ( ASCEND_TABLE_NAME_MUST_CONTAIN is None or ASCEND_TABLE_NAME_MUST_CONTAIN in self._table_name) @@ -52,9 +52,9 @@ class DynamicSparseEmbedding(BaseSparseEmbedding): return sparse_forward_fn(local_embeddings) # 创建扩容查询tensor和table_instance的映射关系,以便优化器中使用 ConfigInitializer.get_instance().sparse_embed_config.insert_table_instance_to_tensor_dict( - result.get(str(ASCAnchorAttr.ID_OFFSETS)), self) + result.get(str(ASCAnchorAttr.ID_OFFSETS.value)), self) tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB, local_embeddings) - tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get(str(ASCAnchorAttr.ID_OFFSETS))) + tf.compat.v1.add_to_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET, result.get(str(ASCAnchorAttr.ID_OFFSETS.value))) return sparse_forward_fn(local_embeddings) -- Gitee From 906acf0c55b5bdafc91d6ce1f7204489a06d2d86 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 21 May 2024 21:15:51 +0800 Subject: [PATCH 157/302] =?UTF-8?q?warm=20start=20=E5=AF=B9=E5=8E=9F?= =?UTF-8?q?=E7=94=9FEstimator=E6=89=93patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/warm_start.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index baf01ce8..b3d08ef4 100644 --- a/mx_rec/saver/warm_start.py +++ 
b/mx_rec/saver/warm_start.py @@ -23,10 +23,6 @@ import six import tensorflow as tf from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.training import warm_starting_util -if tf.__version__.startswith("1"): - from npu_bridge.npu_init import NPUEstimator -else: - from npu_device.compat.v1.npu_init import NPUEstimator from mx_rec.util.log import logger from mx_rec.saver.saver import Saver @@ -61,7 +57,7 @@ class WarmStartController: def patch_for_warm_start(): estimator_lib.Estimator.__init__ = patch_estimator_init(estimator_lib.Estimator.__init__) warm_starting_util.warm_start = patch_for_func_warm_start(warm_starting_util.warm_start) - NPUEstimator.train = patch_for_estimator_train(NPUEstimator.train) + estimator_lib.Estimator.train = patch_for_estimator_train(estimator_lib.Estimator.train) def patch_estimator_init(func): -- Gitee From 76c3993f67ba17fbd75a5df68c479afa25a1c40a Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Wed, 22 May 2024 10:32:04 +0800 Subject: [PATCH 158/302] =?UTF-8?q?dcnv2=EF=BC=8Cdlrm=E6=A8=A1=E5=9E=8Bmai?= =?UTF-8?q?n=E8=84=9A=E6=9C=AC=E5=86=85=E5=88=AA=E9=99=A4=E4=BB=A5?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E6=95=B0=E6=8D=AE=EF=BC=8C=E9=80=82=E9=85=8D?= =?UTF-8?q?=E5=A4=9A=E6=9C=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/config.py | 23 ++++++++++++++++------- examples/DCNv2/main_mxrec.py | 10 +++++++++- examples/demo/little_demo/main.py | 7 ++----- examples/dlrm/model/config.py | 23 ++++++++++++++++------- examples/dlrm/model/main_mxrec.py | 29 ++++++++++++++++++++++++++++- examples/dlrm/model/run.sh | 13 +------------ 6 files changed, 72 insertions(+), 33 deletions(-) diff --git a/examples/DCNv2/config.py b/examples/DCNv2/config.py index 73ab2797..fab17d32 100644 --- a/examples/DCNv2/config.py +++ b/examples/DCNv2/config.py @@ -14,11 +14,14 @@ # limitations under the License. 
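[Editor's sketch] The warm-start patching above follows the usual capture-and-wrap pattern: keep a reference to the original bound function, replace the class attribute with a wrapper, and delegate after injecting the extra work. A minimal sketch under that assumption (run_warm_start_restore is a hypothetical placeholder, not the project's real function):

    def run_warm_start_restore():
        pass  # hypothetical placeholder for restoring warm-started sparse tables

    def patch_for_estimator_train_sketch(original_train):
        def wrapper(self, *args, **kwargs):
            run_warm_start_restore()  # injected step before delegating
            return original_train(self, *args, **kwargs)
        return wrapper

    # Applied once, as patch_for_warm_start() does for the native Estimator:
    # estimator_lib.Estimator.train = patch_for_estimator_train_sketch(estimator_lib.Estimator.train)

Patching the native estimator_lib.Estimator.train instead of NPUEstimator.train also lets the TF1/TF2-specific npu imports be dropped, since NPUEstimator derives from the native Estimator.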
# ============================================================================== +import enum import os import tensorflow as tf from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig +SSD_DATA_PATH = ["ssd_data"] + class LearningRateScheduler: """ @@ -86,6 +89,12 @@ class LearningRateScheduler: return lr_dense, lr_sparse +class CacheModeEnum(enum.Enum): + HBM = "HBM" + DDR = "DDR" + SSD = "SSD" + + class Config: def __init__(self, ): self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None @@ -141,13 +150,13 @@ class Config: if self.cache_mode is None: raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") - if self.cache_mode == "HBM": + if self.cache_mode == CacheModeEnum.HBM.value: self.dev_vocab_size = 24_000_000 * self.rank_size self.host_vocab_size = 0 - elif self.cache_mode == "DDR": + elif self.cache_mode == CacheModeEnum.DDR.value: self.dev_vocab_size = 500_000 * self.rank_size self.host_vocab_size = 24_000_000 * self.rank_size - elif self.cache_mode == "SSD": + elif self.cache_mode == CacheModeEnum.SSD.value: self.dev_vocab_size = 100_000 * self.rank_size self.host_vocab_size = 2_000_000 * self.rank_size self.ssd_vocab_size = 24_000_000 * self.rank_size @@ -155,16 +164,16 @@ class Config: raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") def get_emb_table_cfg(self) -> dict: - if self.cache_mode == "HBM": + if self.cache_mode == CacheModeEnum.HBM.value: return {"device_vocabulary_size": self.dev_vocab_size} - elif self.cache_mode == "DDR": + elif self.cache_mode == CacheModeEnum.DDR.value: return {"device_vocabulary_size": self.dev_vocab_size, "host_vocabulary_size": self.host_vocab_size} - elif self.cache_mode == "SSD": + elif self.cache_mode == CacheModeEnum.SSD.value: return {"device_vocabulary_size": self.dev_vocab_size, "host_vocabulary_size": self.host_vocab_size, "ssd_vocabulary_size": self.ssd_vocab_size, - "ssd_data_path": ["ssd_data"]} + "ssd_data_path": SSD_DATA_PATH} else: raise RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation") diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index 18ab273e..12cf9428 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -27,7 +27,7 @@ import numpy as np from npu_bridge.npu_init import * from model import MyModel -from config import sess_config, Config +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum from optimizer import get_dense_and_sparse_optimizer from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -259,6 +259,14 @@ def _del_related_dir(del_path: str) -> None: def _clear_saved_model() -> None: _del_related_dir("/root/ascend/log/*") + if os.getenv("CACHE_MODE", "") != CacheModeEnum.SSD.value: + return + logger.info("current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + " then create empty directory for this use case.") + for sub_path in SSD_DATA_PATH: + _del_related_dir(sub_path) + os.makedirs(sub_path, mode=0o550, exist_ok=True) + logger.info(f"mkdir dir:{sub_path}") if __name__ == "__main__": diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index d8dd851a..15478aa3 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -174,11 +174,8 @@ def _clear_saved_model() -> None: " then create empty directory 
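[Editor's sketch] The CacheModeEnum dispatch introduced above decides how vocabulary capacity is split between device HBM, host DDR and SSD. A condensed, self-contained version of the same logic, using the sizes from this commit, to see the three layouts side by side:

    import enum

    class CacheMode(enum.Enum):
        HBM = "HBM"
        DDR = "DDR"
        SSD = "SSD"

    def emb_table_cfg(mode: CacheMode, rank_size: int = 8) -> dict:
        if mode is CacheMode.HBM:
            return {"device_vocabulary_size": 24_000_000 * rank_size}
        if mode is CacheMode.DDR:
            return {"device_vocabulary_size": 500_000 * rank_size,
                    "host_vocabulary_size": 24_000_000 * rank_size}
        return {"device_vocabulary_size": 100_000 * rank_size,
                "host_vocabulary_size": 2_000_000 * rank_size,
                "ssd_vocabulary_size": 24_000_000 * rank_size,
                "ssd_data_path": ["ssd_data"]}

    print(emb_table_cfg(CacheMode.DDR))

Routing the comparisons through the enum keeps the three mode literals in one place instead of scattering "HBM"/"DDR"/"SSD" strings through the branches.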
for this use case.") for sub_path in _SSD_SAVE_PATH: _del_related_dir(sub_path) - try: - os.mkdir(sub_path) - logger.info(f"mkdir dir:{sub_path}") - except OSError: - logger.warning("ssd path has exist") # 多进程并行,忽略异常 + os.makedirs(sub_path, mode=0o550, exist_ok=True) + logger.info(f"mkdir dir:{sub_path}") if __name__ == "__main__": diff --git a/examples/dlrm/model/config.py b/examples/dlrm/model/config.py index fd38276d..d6259eb0 100644 --- a/examples/dlrm/model/config.py +++ b/examples/dlrm/model/config.py @@ -14,12 +14,15 @@ # limitations under the License. # ============================================================================== +import enum import os import tensorflow as tf from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig from npu_bridge.estimator.npu.npu_config import NPURunConfig +SSD_DATA_PATH = ["ssd_data"] + class LearningRateScheduler: """ @@ -87,6 +90,12 @@ class LearningRateScheduler: return lr_dense, lr_sparse +class CacheModeEnum(enum.Enum): + HBM = "HBM" + DDR = "DDR" + SSD = "SSD" + + class Config: def __init__(self, ): self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None @@ -142,13 +151,13 @@ class Config: if self.cache_mode is None: raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") - if self.cache_mode == "HBM": + if self.cache_mode == CacheModeEnum.HBM.value: self.dev_vocab_size = 24_000_000 * self.rank_size self.host_vocab_size = 0 - elif self.cache_mode == "DDR": + elif self.cache_mode == CacheModeEnum.DDR.value: self.dev_vocab_size = 500_000 * self.rank_size self.host_vocab_size = 24_000_000 * self.rank_size - elif self.cache_mode == "SSD": + elif self.cache_mode == CacheModeEnum.SSD.value: self.dev_vocab_size = 100_000 * self.rank_size self.host_vocab_size = 2_000_000 * self.rank_size self.ssd_vocab_size = 24_000_000 * self.rank_size @@ -156,16 +165,16 @@ class Config: raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") def get_emb_table_cfg(self) -> dict: - if self.cache_mode == "HBM": + if self.cache_mode == CacheModeEnum.HBM.value: return {"device_vocabulary_size": self.dev_vocab_size} - elif self.cache_mode == "DDR": + elif self.cache_mode == CacheModeEnum.DDR.value: return {"device_vocabulary_size": self.dev_vocab_size, "host_vocabulary_size": self.host_vocab_size} - elif self.cache_mode == "SSD": + elif self.cache_mode == CacheModeEnum.SSD.value: return {"device_vocabulary_size": self.dev_vocab_size, "host_vocabulary_size": self.host_vocab_size, "ssd_vocabulary_size": self.ssd_vocab_size, - "ssd_data_path": ["ssd_data"]} + "ssd_data_path": SSD_DATA_PATH} else: raise RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation") diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index 6fda4f0a..a630813a 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -15,6 +15,7 @@ # ============================================================================== import os +import shutil import time import warnings import random @@ -25,7 +26,7 @@ from sklearn.metrics import roc_auc_score import numpy as np from optimizer import get_dense_and_sparse_optimizer -from config import sess_config, Config +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum from model import MyModel from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.helper import 
FeatureSpec, get_asc_insert_func @@ -247,9 +248,35 @@ def create_feature_spec_list(use_timestamp=False): return feature_spec_list +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + _del_related_dir("kernel*") + _del_related_dir("model_dir_rank*") + _del_related_dir("op_cache") + + if os.getenv("CACHE_MODE", "") != CacheModeEnum.SSD.value: + return + logger.info("current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + " then create empty directory for this use case.") + for sub_path in SSD_DATA_PATH: + _del_related_dir(sub_path) + os.makedirs(sub_path, mode=0o550, exist_ok=True) + logger.info(f"mkdir dir:{sub_path}") + + if __name__ == "__main__": tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) warnings.filterwarnings("ignore") + _clear_saved_model() rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None diff --git a/examples/dlrm/model/run.sh b/examples/dlrm/model/run.sh index be509608..6c142443 100644 --- a/examples/dlrm/model/run.sh +++ b/examples/dlrm/model/run.sh @@ -28,18 +28,6 @@ local_rank_size=8 num_process=$((num_server * local_rank_size)) export TRAIN_RANK_SIZE=$num_process -# 删除数据 -echo "CACHE_MODE:${CACHE_MODE}" -if [ ${CACHE_MODE} = "SSD" ]; then - echo "SSD train mode not allow file exist before training, - deleting dir ${cur_path}/ssd_data then create for SSD use case" - rm -rf ssd_data - mkdir ssd_data -fi -rm -rf kernel* -rm -rf /root/ascend/log/* -rm -rf model_dir_rank* op_cache - ################# 参数配置 ###################### export USE_DYNAMIC=0 # 0:静态shape;1:动态shape export CACHE_MODE="HBM" # HBM;DDR;SSD @@ -48,6 +36,7 @@ export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 export USE_MULTI_LOOKUP=0 # 0:一表一查;1:一表多查 export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 ################################################ +echo "CACHE_MODE:${CACHE_MODE}" export HCCL_CONNECT_TIMEOUT=1200 export DLRM_CRITEO_DATA_PATH=${dlrm_criteo_data_path} -- Gitee From 2a13ae80f882183a4a65fd93cd5277d9e02155e8 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Wed, 22 May 2024 06:44:00 +0000 Subject: [PATCH 159/302] =?UTF-8?q?!153=20cleancode=E6=B8=85=E7=90=86=20*?= =?UTF-8?q?=20clean=20code=E6=B8=85=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 46 +++++++++++++++++++------------------- mx_rec/saver/warm_start.py | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index 45033b4f..9f34cca3 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -50,17 +50,6 @@ class SaveModelThread(threading.Thread): class Saver(object): - @staticmethod - def _make_table_name_dir(root_dir, table_instance, table_name): - if not table_instance.is_hbm: - table_dir = os.path.join(root_dir, "HashTable", "DDR", table_name) - else: - table_dir = os.path.join(root_dir, "HashTable", "HBM", table_name) - try: - tf.io.gfile.makedirs(table_dir) - except Exception as err: - raise RuntimeError(f"make dir {table_dir} for saving sparse table failed!") from err - @para_checker_decorator(check_option_list=[ 
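[Editor's sketch] The cleanup moved from run.sh into _clear_saved_model above follows an idempotent-reset pattern: expand a glob, remove whatever matches, then recreate the directories that must exist but be empty. Self-contained sketch (paths are examples only):

    import glob
    import os
    import shutil

    def clear_matching(pattern: str) -> None:
        for path in glob.glob(pattern):
            shutil.rmtree(path, ignore_errors=True)  # tolerate races between parallel ranks

    def recreate_empty(path: str, mode: int = 0o550) -> None:
        clear_matching(path)
        os.makedirs(path, mode=mode, exist_ok=True)  # exist_ok replaces the old try/except OSError

    clear_matching("kernel*")
    recreate_empty("ssd_data")

Doing this in Python rather than in the launch script keeps the SSD no-overwrite rule next to the code that reads CACHE_MODE, so multi-node runs cannot skip it.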
("var_list", ClassValidator, {"classes": (list, type(None))}), ("max_to_keep", IntValidator, {"min_value": 0, "max_value": MAX_INT32}, ["check_value"]), @@ -82,6 +71,17 @@ class Saver(object): self.build() self.warm_start_tables = warm_start_tables + @staticmethod + def _make_table_name_dir(root_dir, table_instance, table_name): + if not table_instance.is_hbm: + table_dir = os.path.join(root_dir, "HashTable", "DDR", table_name) + else: + table_dir = os.path.join(root_dir, "HashTable", "HBM", table_name) + try: + tf.io.gfile.makedirs(table_dir) + except Exception as err: + raise RuntimeError(f"make dir {table_dir} for saving sparse table failed!") from err + def build(self): if self.var_list is None: self.var_list = [] @@ -237,6 +237,18 @@ class Saver(object): attribute = attribute.tostring() file.write(attribute) + def get_warm_start_dict(self, table_list): + placeholder_dict = defaultdict(dict) + restore_fetch_list = [] + for table_name, v in self.placeholder_dict.items(): + if table_name in table_list: + placeholder_dict[table_name] = v + restore_fetch_list.append(self.restore_fetch_dict.get(table_name)) + + if not restore_fetch_list: + logger.warning("no tables can be warm start restored.") + return placeholder_dict, restore_fetch_list + @performance("_save") def _save(self, sess, root_dir): for table_name in self.save_op_dict: @@ -317,18 +329,6 @@ class Saver(object): assign_op = state.assign(sub_optimizer_placeholder_dict.get(key_state)) self.restore_fetch_dict[table_instance.table_name].append(assign_op) - def get_warm_start_dict(self, table_list): - placeholder_dict = defaultdict(dict) - restore_fetch_list = [] - for table_name, v in self.placeholder_dict.items(): - if table_name in table_list: - placeholder_dict[table_name] = v - restore_fetch_list.append(self.restore_fetch_dict.get(table_name)) - - if not restore_fetch_list: - logger.warning("no tables can be warm start restored.") - return placeholder_dict, restore_fetch_list - def _restore(self, sess, reading_path, warm_start_tables=None): # 根据table_list去改造 if warm_start_tables: diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index b3d08ef4..22e2be43 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -214,7 +214,7 @@ def get_table_name_set_by_ckpt_path(warm_start_path: str) -> List[str]: ckpt_name = f"sparse-{base_name}" sparse_path = os.path.join(directory, ckpt_name) if not tf.io.gfile.isdir(sparse_path): - logger.info(f"under the warm start path {warm_start_path}, sparse directory {sparse_path} not exists.") + logger.info("under the warm start path %s, sparse directory %s not exists.", warm_start_path, sparse_path) else: for dirname in tf.io.gfile.listdir(sparse_path): table_name_list.append(dirname) -- Gitee From 56951145d397fb8bd6c6638dcb1472c5296bacd2 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Wed, 22 May 2024 16:59:22 +0800 Subject: [PATCH 160/302] =?UTF-8?q?=20run=E8=84=9A=E6=9C=AC=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo/run.sh | 2 +- examples/demo/little_demo_estimator/run.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/demo/little_demo/run.sh b/examples/demo/little_demo/run.sh index 5b45af84..5c5d9d1d 100644 --- a/examples/demo/little_demo/run.sh +++ b/examples/demo/little_demo/run.sh @@ -160,5 +160,5 @@ fi echo "use horovod to start tasks" DATE=$(date +%Y-%m-%d-%H-%M-%S) horovodrun --network-interface 
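[Editor's sketch] The logging tweak above (f-string to %-style arguments) enables lazy formatting: with an f-string the message is interpolated even when INFO is filtered out, whereas passing arguments lets the logging module skip formatting entirely. Illustration with a throwaway logger:

    import logging

    logger = logging.getLogger("mx_rec_demo")  # demo logger, not the project's
    logger.setLevel(logging.WARNING)

    path = "/tmp/ckpt/sparse-model-100"
    logger.info(f"sparse directory {path} does not exist.")   # string built regardless of level
    logger.info("sparse directory %s does not exist.", path)  # built only if the record is emitted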
${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ -python3.7 ${py} 2>&1 | tee "temp_${local_rank_size}p_${KEY_PROCESS_THREAD_NUM}t_${USE_MODE}_${CACHE_MODE}_${DATE}.log" +python3.7 ${py} 2>&1 | tee "temp_${num_process}p_${KEY_PROCESS_THREAD_NUM}t_${USE_MODE}_${CACHE_MODE}_${DATE}.log" diff --git a/examples/demo/little_demo_estimator/run.sh b/examples/demo/little_demo_estimator/run.sh index 8bb43b19..011f0001 100644 --- a/examples/demo/little_demo_estimator/run.sh +++ b/examples/demo/little_demo_estimator/run.sh @@ -157,4 +157,4 @@ DATE=$(date +%Y-%m-%d-%H-%M-%S) horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ python3.7 ${py} \ --run_mode=$USE_MODE \ -2>&1 | tee "temp_${local_rank_size}p_${KEY_PROCESS_THREAD_NUM}t_${DATE}.log" +2>&1 | tee "temp_${num_process}p_${KEY_PROCESS_THREAD_NUM}t_${DATE}.log" -- Gitee From 256ee5b4da42dae7cf586fa6799fb184d50a4df7 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 23 May 2024 14:52:45 +0800 Subject: [PATCH 161/302] =?UTF-8?q?=E6=97=A5=E5=BF=97=E6=89=93=E5=8D=B0?= =?UTF-8?q?=E7=BB=9F=E4=B8=80=E5=A4=A7=E5=86=99=E5=BC=80=E5=A4=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/config.py | 2 +- examples/DCNv2/main_mxrec.py | 4 ++-- examples/demo/little_demo/main.py | 2 +- examples/dlrm/model/config.py | 2 +- examples/dlrm/model/main_mxrec.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/DCNv2/config.py b/examples/DCNv2/config.py index fab17d32..463f9aa1 100644 --- a/examples/DCNv2/config.py +++ b/examples/DCNv2/config.py @@ -163,7 +163,7 @@ class Config: else: raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") - def get_emb_table_cfg(self) -> dict: + def get_emb_table_cfg(self): if self.cache_mode == CacheModeEnum.HBM.value: return {"device_vocabulary_size": self.dev_vocab_size} elif self.cache_mode == CacheModeEnum.DDR.value: diff --git a/examples/DCNv2/main_mxrec.py b/examples/DCNv2/main_mxrec.py index 12cf9428..a04e1c47 100644 --- a/examples/DCNv2/main_mxrec.py +++ b/examples/DCNv2/main_mxrec.py @@ -261,12 +261,12 @@ def _clear_saved_model() -> None: _del_related_dir("/root/ascend/log/*") if os.getenv("CACHE_MODE", "") != CacheModeEnum.SSD.value: return - logger.info("current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + logger.info("Current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" " then create empty directory for this use case.") for sub_path in SSD_DATA_PATH: _del_related_dir(sub_path) os.makedirs(sub_path, mode=0o550, exist_ok=True) - logger.info(f"mkdir dir:{sub_path}") + logger.info(f"Create dir:{sub_path}") if __name__ == "__main__": diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index 15478aa3..80940e86 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -175,7 +175,7 @@ def _clear_saved_model() -> None: for sub_path in _SSD_SAVE_PATH: _del_related_dir(sub_path) os.makedirs(sub_path, mode=0o550, exist_ok=True) - logger.info(f"mkdir dir:{sub_path}") + logger.info(f"Create dir:{sub_path}") if __name__ == "__main__": diff --git a/examples/dlrm/model/config.py b/examples/dlrm/model/config.py index d6259eb0..45e8af40 100644 --- a/examples/dlrm/model/config.py +++ b/examples/dlrm/model/config.py @@ -164,7 
+164,7 @@ class Config: else: raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") - def get_emb_table_cfg(self) -> dict: + def get_emb_table_cfg(self): if self.cache_mode == CacheModeEnum.HBM.value: return {"device_vocabulary_size": self.dev_vocab_size} elif self.cache_mode == CacheModeEnum.DDR.value: diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index a630813a..767eeb2f 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -265,12 +265,12 @@ def _clear_saved_model() -> None: if os.getenv("CACHE_MODE", "") != CacheModeEnum.SSD.value: return - logger.info("current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + logger.info("Current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" " then create empty directory for this use case.") for sub_path in SSD_DATA_PATH: _del_related_dir(sub_path) os.makedirs(sub_path, mode=0o550, exist_ok=True) - logger.info(f"mkdir dir:{sub_path}") + logger.info(f"Create dir:{sub_path}") if __name__ == "__main__": -- Gitee From c01f45a1110251356f7c1bdab75d1c1f9d73d1a7 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 23 May 2024 14:55:23 +0800 Subject: [PATCH 162/302] =?UTF-8?q?=E6=97=A5=E5=BF=97=E6=89=93=E5=8D=B0?= =?UTF-8?q?=E7=BB=9F=E4=B8=80=E5=A4=A7=E5=86=99=E5=BC=80=E5=A4=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/dlrm/model/main_mxrec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dlrm/model/main_mxrec.py b/examples/dlrm/model/main_mxrec.py index 767eeb2f..51ed7c4a 100644 --- a/examples/dlrm/model/main_mxrec.py +++ b/examples/dlrm/model/main_mxrec.py @@ -254,7 +254,7 @@ def _del_related_dir(del_path: str) -> None: dirs = glob(del_path) for sub_dir in dirs: shutil.rmtree(sub_dir, ignore_errors=True) - logger.info(f"delete dir:{sub_dir}") + logger.info(f"Delete dir:{sub_dir}") def _clear_saved_model() -> None: -- Gitee From 99f126e6b492c2c7e8fb7e4b644dfe08681b7962 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Thu, 23 May 2024 07:41:42 +0000 Subject: [PATCH 163/302] =?UTF-8?q?!157=20warm=20start=E5=8E=BB=E9=99=A4?= =?UTF-8?q?=E5=86=97=E4=BD=99=E5=88=A4=E6=96=AD=20*=20Merge=20remote-track?= =?UTF-8?q?ing=20branch=20'upstream/develop'=20into=20warm=5Fstart=5Fdev?= =?UTF-8?q?=20*=20warm=20start=E5=8E=BB=E9=99=A4=E5=86=97=E4=BD=99?= =?UTF-8?q?=E5=88=A4=E6=96=AD=E9=80=BB=E8=BE=91=20*=20clean=20code?= =?UTF-8?q?=E6=B8=85=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/warm_start.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mx_rec/saver/warm_start.py b/mx_rec/saver/warm_start.py index 22e2be43..7ceb14c1 100644 --- a/mx_rec/saver/warm_start.py +++ b/mx_rec/saver/warm_start.py @@ -180,8 +180,6 @@ def _warm_settings_filter(warm_start_setting: tf.estimator.WarmStartSettings) -> matching_tables = [table for table in table_name_list if re.match(vars_to_warm_start, table)] if matching_tables: WarmStartController().add_element(warm_start_setting.ckpt_to_initialize_from, matching_tables) - if vars_to_warm_start != ".*": - return warm_start_setting_res warm_start_setting_res = warm_start_setting elif all(isinstance(v, str) for v in vars_to_warm_start): sparse_vars = [] -- Gitee From 332d9ce17fa4cdafb4b8a215056a3359280905be Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Sat, 
25 May 2024 09:35:23 +0000 Subject: [PATCH 164/302] =?UTF-8?q?!151=20=E5=BC=95=E5=85=A5embCache?= =?UTF-8?q?=E7=89=B9=E6=80=A7=20*=20increase=20send=20eos=20wait=20time=20?= =?UTF-8?q?*=20fix=20save=20bug;=20simplify=20ddr=20process=20logic=20*=20?= =?UTF-8?q?add=20init=20specialProcessStatus=20*=20=E4=BF=AE=E5=A4=8Dstep\?= =?UTF-8?q?interval=E5=85=A8=E4=B8=BA1=E4=B8=94=E5=A4=9A=E8=BD=AE=E5=88=87?= =?UTF-8?q?=E6=8D=A2=E5=9C=BA=E6=99=AF=20*=20adapt=20merge=20change=20*=20?= =?UTF-8?q?=E5=90=8C=E6=AD=A5pr143=20*=20!148=20=E5=BC=95=E5=85=A5embCache?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=A4=84=E7=90=86=E9=80=BB=E8=BE=91=20*=20!1?= =?UTF-8?q?45=20=E5=AE=9E=E7=8E=B0embCache=E4=BF=9D=E5=AD=98=E5=8A=A0?= =?UTF-8?q?=E8=BD=BD=E5=8A=9F=E8=83=BD=20*=20!140=20=E5=90=8C=E6=AD=A5AccC?= =?UTF-8?q?TR=E4=BF=9D=E5=AD=98=E5=8A=A0=E8=BD=BD=E4=BB=A3=E7=A0=81=20*=20?= =?UTF-8?q?!141=20SSD=E6=96=B0=E5=A2=9Eswap=E9=80=BB=E8=BE=91=EF=BC=8C?= =?UTF-8?q?=E8=B0=83=E6=95=B4key=E6=95=B0=E6=8D=AE=E7=B1=BB=E5=9E=8B=20*?= =?UTF-8?q?=20!135=20=E5=A2=9E=E5=8A=A0embCache=E5=A4=B4=E6=96=87=E4=BB=B6?= =?UTF-8?q?=EF=BC=9B=E9=80=82=E9=85=8DInitializer=EF=BC=9B=E9=80=82?= =?UTF-8?q?=E9=85=8Dtest=20*=20!139=20little=20demo=E4=BF=AE=E6=AD=A3step?= =?UTF-8?q?=E4=B8=BA-1=E6=88=96=E9=9D=9E=E6=95=B4=E6=95=B0=E6=97=B6?= =?UTF-8?q?=E4=B8=8D=E7=AC=A6=E5=90=88=E9=A2=84=E6=9C=9F=E8=A1=8C=E4=B8=BA?= =?UTF-8?q?=EF=BC=9Bvocab=20size=E9=80=82=E9=85=8D=20*=20!134=20=E6=96=B0?= =?UTF-8?q?=E5=A2=9E=E4=BF=9D=E5=AD=98channel=EF=BC=8C=E5=BC=95=E5=85=A5?= =?UTF-8?q?=E5=A4=9A=E7=BA=BF=E7=A8=8B=EF=BC=9B=E6=96=B0=E5=A2=9Eblock?= =?UTF-8?q?=E5=88=A4=E6=96=AD=E6=8E=A5=E5=8F=A3=20*=20!130=20=E5=90=8C?= =?UTF-8?q?=E6=AD=A5AccCTR=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/demo/little_demo/main.py | 26 +- examples/demo/little_demo/run_mode.py | 21 +- mx_rec/core/asc/build_graph.py | 41 +- mx_rec/core/asc/manager.py | 5 +- mx_rec/core/feature_process.py | 6 +- mx_rec/graph/modifier.py | 97 +- mx_rec/saver/saver.py | 69 +- mx_rec/util/config_utils/hybrid_mgmt_utils.py | 6 + src/AccCTR/3rdparty/CMakeLists.txt | 14 + src/AccCTR/CMakeLists.txt | 16 +- src/AccCTR/README.md | 4 +- src/AccCTR/src/CMakeLists.txt | 9 +- src/AccCTR/src/common/util/error_code.h | 15 +- .../src/common/util/external_threader.h | 70 + src/AccCTR/src/embedding_cache/CMakeLists.txt | 27 + .../cache_manager/cache_manager.cpp | 421 ++++ .../cache_manager/cache_manager.h | 95 + src/AccCTR/src/embedding_cache/common.h | 65 + .../embedding_local_table/emb_local_table.cpp | 475 ++++ .../embedding_local_table/emb_local_table.h | 84 + .../constant_initializer.cpp | 62 + .../initializer/initializer.cpp | 56 + .../random_normal_initializer.cpp | 78 + .../truncated_normal_initializer.cpp | 94 + src/AccCTR/src/embedding_cache/limited_set.h | 118 + .../offset_mapper/address_mapper.h | 308 +++ .../offset_mapper/mapper_base.h | 810 +++++++ .../offset_mapper/offset_mapper.h | 248 ++ src/AccCTR/src/factory_impl.cpp | 11 + src/AccCTR/src/factory_impl.h | 2 + src/AccCTR/src/include/CMakeLists.txt | 2 +- src/AccCTR/src/include/embedding_cache.h | 321 +++ src/AccCTR/src/include/factory.h | 5 +- src/AccCTR/src/include/ock_ctr_common_def.h | 2 +- src/AccCTR/src/include/unique.h | 1 + src/AccCTR/src/unique/unique_func.cpp | 63 +- src/AccCTR/src/unique/unique_func.h | 237 +- src/AccCTR/src/unique/unique_impl.cpp | 8 + src/AccCTR/src/unique/unique_impl.h | 2 +- src/AccCTR/tests/tools/create_fake_id.py | 6 - 
src/AccCTR/tests/ut/conf/toolchain.cmake | 24 + src/AccCTR/tests/ut/src/CMakeLists.txt | 26 +- src/AccCTR/tests/ut/src/common.h | 64 + src/AccCTR/tests/ut/src/emb_cache_test.cpp | 1999 ++++++++++++++++ src/AccCTR/tests/ut/src/emb_cache_test.h | 62 + src/AccCTR/tests/ut/src/unique_test.cpp | 53 +- src/AccCTR/tests/ut/src/unique_test.h | 16 - src/core/CMakeLists.txt | 10 +- src/core/checkpoint/checkpoint.cpp | 15 +- src/core/checkpoint/checkpoint.h | 3 - .../ckpt_data_handler/ckpt_data_handler.cpp | 5 +- .../ckpt_data_handler/ckpt_data_handler.h | 2 - .../feat_admit_n_evict_ckpt.cpp | 2 +- src/core/emb_hashmap/emb_hashmap.cpp | 477 ---- src/core/emb_hashmap/emb_hashmap.h | 81 - src/core/emb_table/emb_table.cpp | 4 +- src/core/emb_table/embedding_ddr.cpp | 689 ++---- src/core/emb_table/embedding_ddr.h | 61 +- src/core/emb_table/embedding_dynamic.cpp | 10 +- src/core/emb_table/embedding_dynamic.h | 3 +- src/core/emb_table/embedding_mgmt.cpp | 95 +- src/core/emb_table/embedding_mgmt.h | 34 +- src/core/emb_table/embedding_static.cpp | 2 +- src/core/emb_table/embedding_static.h | 2 +- src/core/emb_table/embedding_table.cpp | 68 +- src/core/emb_table/embedding_table.h | 34 +- src/core/file_system/file_system.h | 5 +- .../hdfs_file_system/hdfs_file_system.cpp | 16 +- .../hdfs_file_system/hdfs_file_system.h | 2 +- .../local_file_system/local_file_system.cpp | 45 +- .../local_file_system/local_file_system.h | 2 +- src/core/hd_transfer/hd_transfer.cpp | 84 +- src/core/hd_transfer/hd_transfer.h | 11 +- src/core/host_emb/host_emb.cpp | 278 --- src/core/host_emb/host_emb.h | 76 - src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2046 ++++++++++++----- src/core/hybrid_mgmt/hybrid_mgmt.h | 233 +- src/core/hybrid_mgmt/hybrid_mgmt_block.cpp | 40 +- src/core/hybrid_mgmt/hybrid_mgmt_block.h | 15 +- .../key_process/feature_admit_and_evict.cpp | 4 +- .../key_process/feature_admit_and_evict.h | 5 +- src/core/key_process/key_process.cpp | 393 +++- src/core/key_process/key_process.h | 39 +- .../ock_ctr_common/include/embedding_cache.h | 321 +++ src/core/ock_ctr_common/include/factory.h | 23 +- .../include/ock_ctr_common_def.h | 18 +- src/core/ock_ctr_common/include/unique.h | 1 + src/core/ssd_cache/cache_manager.cpp | 578 ++--- src/core/ssd_cache/cache_manager.h | 99 +- src/core/ssd_cache/lfu_cache.cpp | 37 +- src/core/ssd_cache/lfu_cache.h | 24 +- src/core/ssd_cache/preprocess_mapper.h | 108 + src/core/ssd_engine/file.cpp | 59 +- src/core/ssd_engine/file.h | 26 +- src/core/ssd_engine/ssd_engine.cpp | 36 +- src/core/ssd_engine/ssd_engine.h | 14 +- src/core/ssd_engine/table.cpp | 71 +- src/core/ssd_engine/table.h | 28 +- src/core/utils/common.cpp | 2 + src/core/utils/common.h | 97 +- src/core/utils/task_queue.h | 110 + src/pybind/module_main.cpp | 49 +- src/tests/checkpoint/checkpoint_test.cpp | 8 +- src/tests/emb_hashmap/emb_hashmap_test.cpp | 185 -- src/tests/emb_mgmt/emb_mgmt_test.cpp | 82 - src/tests/emb_table/embedding_ddr_test.cpp | 76 +- src/tests/emb_table/embedding_mgmt_test.cpp | 6 +- src/tests/emb_table/embedding_static_test.cpp | 5 +- .../file_system/hdfs_file_system_test.cpp | 1 - .../file_system/local_file_system_test.cpp | 7 +- src/tests/host_emb/host_emb_test.cpp | 107 - .../feature_admit_and_evict_test.cpp | 4 +- src/tests/key_process/key_process_test.cpp | 14 +- src/tests/ssd_cache/cache_manager_test.cpp | 269 +-- src/tests/ssd_cache/lfu_cache_test.cpp | 16 +- src/tests/ssd_engine/engine_test.cpp | 12 +- src/tests/ssd_engine/file_test.cpp | 43 +- src/tests/ssd_engine/table_test.cpp | 12 +- 
src/tests/utils/common_h_test.cpp | 6 - tests/mx_rec/core/test_build_graph.py | 23 +- tests/mx_rec/saver/test_saver.py | 1 + tests/run_python_dt.sh | 2 +- 122 files changed, 9634 insertions(+), 4081 deletions(-) create mode 100644 src/AccCTR/src/embedding_cache/CMakeLists.txt create mode 100644 src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp create mode 100644 src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h create mode 100644 src/AccCTR/src/embedding_cache/common.h create mode 100644 src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.cpp create mode 100644 src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.h create mode 100644 src/AccCTR/src/embedding_cache/initializer/constant_initializer/constant_initializer.cpp create mode 100644 src/AccCTR/src/embedding_cache/initializer/initializer.cpp create mode 100644 src/AccCTR/src/embedding_cache/initializer/random_normal_initializer/random_normal_initializer.cpp create mode 100644 src/AccCTR/src/embedding_cache/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp create mode 100644 src/AccCTR/src/embedding_cache/limited_set.h create mode 100644 src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h create mode 100644 src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h create mode 100644 src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h create mode 100644 src/AccCTR/src/include/embedding_cache.h create mode 100644 src/AccCTR/tests/ut/conf/toolchain.cmake create mode 100644 src/AccCTR/tests/ut/src/common.h create mode 100644 src/AccCTR/tests/ut/src/emb_cache_test.cpp create mode 100644 src/AccCTR/tests/ut/src/emb_cache_test.h delete mode 100644 src/core/emb_hashmap/emb_hashmap.cpp delete mode 100644 src/core/emb_hashmap/emb_hashmap.h delete mode 100644 src/core/host_emb/host_emb.cpp delete mode 100644 src/core/host_emb/host_emb.h create mode 100644 src/core/ock_ctr_common/include/embedding_cache.h create mode 100644 src/core/ssd_cache/preprocess_mapper.h create mode 100644 src/core/utils/task_queue.h delete mode 100644 src/tests/emb_hashmap/emb_hashmap_test.cpp delete mode 100644 src/tests/host_emb/host_emb_test.cpp mode change 100644 => 100755 tests/run_python_dt.sh diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index 80940e86..ff09bc50 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -185,7 +185,8 @@ if __name__ == "__main__": use_mode = UseMode.mapping(os.getenv("USE_MODE")) # 最大数据集生成数量 - MAX_DATASET_GENERATE = 200 + MAX_DATASET_GENERATE_TRAIN = 200 + MAX_DATASET_GENERATE_EVAL = 10 # 最大训练的步数 MAX_TRAIN_STEPS = 200 # 训练多少步切换为评估 @@ -232,6 +233,7 @@ if __name__ == "__main__": init(train_steps=TRAIN_STEPS, eval_steps=EVAL_STEPS, save_steps=SAVING_INTERVAL, + max_steps=MAX_TRAIN_STEPS, use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion, if_load=if_load) @@ -261,12 +263,12 @@ if __name__ == "__main__": # 验证DDR的配置参考:建议跑dynamic避免调参。数据集key总量大于device表,小于device+host;一个batch的unique key数量小于device表。 # 验证SSD的配置参考:建议跑dynamic避免调参。数据集key总量大于device+host;一个batch的unique key数量小于device表。 hbm_test_cfg = {"device_vocabulary_size": cfg.user_vocab_size, "host_vocabulary_size": 0} - ddr_test_cfg = {"device_vocabulary_size": int(cfg.user_vocab_size * 0.2), - "host_vocabulary_size": int(cfg.user_vocab_size * 0.8)} + ddr_test_cfg = {"device_vocabulary_size": int(cfg.user_vocab_size * 0.4), + "host_vocabulary_size": int(cfg.user_vocab_size * 1.0)} ssd_test_cfg = { - 
"device_vocabulary_size": int(cfg.user_vocab_size * 0.1), - "host_vocabulary_size": int(cfg.user_vocab_size * 0.1), - "ssd_vocabulary_size": int(cfg.user_vocab_size * 0.8), "ssd_data_path": _SSD_SAVE_PATH + "device_vocabulary_size": int(cfg.user_vocab_size * 0.4), + "host_vocabulary_size": int(cfg.user_vocab_size * 0.8), + "ssd_vocabulary_size": int(cfg.user_vocab_size * 1.8), "ssd_data_path": _SSD_SAVE_PATH } cache_mode_dict = {CacheModeEnum.HBM.value: hbm_test_cfg, CacheModeEnum.DDR.value: ddr_test_cfg, CacheModeEnum.SSD.value: ssd_test_cfg} @@ -297,14 +299,16 @@ if __name__ == "__main__": train_batch = None table_list = [user_hashtable, item_hashtable] if use_mode in [UseMode.TRAIN, UseMode.LOAD_AND_TRAIN]: - train_iterator, train_model, train_batch = build_graph(table_list, is_train=True, - feature_spec_list=train_feature_spec_list, - config_dict=ACCESS_AND_EVICT, - batch_number=MAX_DATASET_GENERATE * get_rank_size()) + train_iterator, train_model, train_batch = build_graph( + table_list, is_train=True, + feature_spec_list=train_feature_spec_list, + config_dict=ACCESS_AND_EVICT, + batch_number=MAX_DATASET_GENERATE_TRAIN * get_rank_size() + ) eval_iterator, eval_model, eval_batch = build_graph(table_list, is_train=False, feature_spec_list=eval_feature_spec_list, config_dict=ACCESS_AND_EVICT, - batch_number=MAX_DATASET_GENERATE * get_rank_size()) + batch_number=MAX_DATASET_GENERATE_EVAL * get_rank_size()) dense_variables, sparse_variables = get_dense_and_sparse_variable() params = {"train_batch": train_batch, "eval_batch": eval_batch, "use_one_shot": USE_ONE_SHOT, diff --git a/examples/demo/little_demo/run_mode.py b/examples/demo/little_demo/run_mode.py index f164322a..1a15fcc6 100644 --- a/examples/demo/little_demo/run_mode.py +++ b/examples/demo/little_demo/run_mode.py @@ -16,6 +16,7 @@ # ============================================================================== import os +import sys from typing import List import tensorflow as tf @@ -72,6 +73,8 @@ class RunMode: channel_id = ConfigInitializer.get_instance().train_params_config.get_training_mode_channel_id(False) import_host_pipeline_ops().clear_channel(channel_id) + if self.infer_steps == -1: + self.infer_steps = sys.maxsize # 消耗全部数据 for i in range(1, self.infer_steps + 1): logger.info("############### infer at step %d ################", i) try: @@ -126,17 +129,19 @@ class RunMode: self.session.run(initializer) else: logger.debug(f"use one shot iterator and modify graph is `{self.is_modify_graph}`.") - self.saver = tf.compat.v1.train.Saver() - start_step = 1 + latest_ckpt_step = 0 + start_step = 1 if if_load: - latest_step = get_load_step(model_file) - start_step = latest_step + 1 - self.saver.restore(self.session, f"./saved-model/model-{latest_step}") + latest_ckpt_step = get_load_step(model_file) + start_step = latest_ckpt_step + 1 + self.saver.restore(self.session, f"./saved-model/model-{latest_ckpt_step}") else: self.session.run(tf.compat.v1.global_variables_initializer()) + if self.max_train_steps == -1: + self.max_train_steps = sys.maxsize # 消耗全部数据 for i in range(start_step, start_step + self.max_train_steps): logger.info("################ training at step %d ################", i) try: @@ -151,13 +156,13 @@ class RunMode: logger.info(f"training at step:{i}, table[{t.table_name}], table size:{t.size()}, " f"table capacity:{t.capacity()}") - if i % train_interval == 0: + if train_interval != -1 and (i - latest_ckpt_step) % train_interval == 0: self.evaluate() - if i % saving_interval == 0: + if saving_interval != -1 and (i - 
latest_ckpt_step) % saving_interval == 0: self.saver.save(self.session, f"./saved-model/model", global_step=i) - if self.is_faae and i == train_interval // 2: + if train_interval != -1 and self.is_faae and i == train_interval // 2: logger.info("############### set_threshold at step:%d ################", i) self.change_threshold() diff --git a/mx_rec/core/asc/build_graph.py b/mx_rec/core/asc/build_graph.py index 46dbf193..0ddf313e 100644 --- a/mx_rec/core/asc/build_graph.py +++ b/mx_rec/core/asc/build_graph.py @@ -15,7 +15,8 @@ # limitations under the License. # ============================================================================== -from typing import Optional +from dataclasses import dataclass, field +from typing import Optional, List, Dict, Union, Tuple import tensorflow as tf @@ -26,6 +27,14 @@ from mx_rec.util.log import logger from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType +@dataclass +class SwapInfo: + swap_in_len: int = 0 + swap_in_pos: List[tf.Tensor] = field(default_factory=lambda: []) + swap_out_len: int = 0 + swap_out_pos: List[tf.Tensor] = field(default_factory=lambda: []) + + def get_restore_vector(config): logger.debug('Channel %s_restore_%s was built for getnext', config.get("table_name"), config.get("channel_id")) if config.get("is_hbm"): @@ -58,28 +67,37 @@ def get_restore_vector(config): return restore_vector, hot_pos -def get_id_offsets(max_lookup_vec_size, config): +def get_id_offsets(max_lookup_vec_size: int, config: dict) -> Tuple[int, SwapInfo]: logger.debug('Channel %s_lookup_%s was built for getnext', config.get("table_name"), config.get("channel_id")) # 自动扩容当前只支持HBM模式,默认没有换入换出 + swap_info = SwapInfo() + with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): if config.get("use_dynamic_expansion"): [id_offsets] = npu_ops.gen_npu_ops.get_next( output_types=[tf.int64], output_shapes=[[max_lookup_vec_size]], channel_name=f'{config.get("table_name")}_lookup_{config.get("channel_id")}') - return id_offsets, [], 0 + return id_offsets, swap_info [id_offsets] = npu_ops.gen_npu_ops.get_next( output_types=[tf.int32], output_shapes=[[max_lookup_vec_size]], channel_name=f'{config.get("table_name")}_lookup_{config.get("channel_id")}') if config.get("is_hbm"): - return id_offsets, [], 0 - swap_pos, swap_len = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32, tf.int32], - output_shapes=[[max_lookup_vec_size], []], - channel_name=f'{config.get("table_name")}_swap_{config.get("channel_id")}') - return id_offsets, swap_pos, swap_len + return id_offsets, swap_info + ( + swap_info.swap_in_pos, + swap_info.swap_out_pos, + swap_info.swap_in_len, + swap_info.swap_out_len, + ) = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32, tf.int32, tf.int32, tf.int32], + output_shapes=[[max_lookup_vec_size], [max_lookup_vec_size], [], []], + channel_name=f'{config.get("table_name")}_swap_all', + ) + logger.debug('Channel %s_swap_all was built for getnext', config.get("table_name")) + return id_offsets, swap_info def get_all2all_args(use_static: bool, config: dict) -> Optional[list]: @@ -115,13 +133,14 @@ def get_preprocessed_tensor_for_asc(table, config): restore_vector, hot_pos = get_restore_vector(config) with tf.compat.v1.variable_scope("id_offsets"): - id_offsets, swap_pos, swap_len = get_id_offsets(max_lookup_vec_size, config) + id_offsets, swap_info = get_id_offsets(max_lookup_vec_size, config) if not config.get("is_hbm"): # 一表多查时,会多次进入get_preprocessed_tensor_for_asc,最后一次大查询替换map的key-value即可 swap_args = SwapArgs() + 
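[Editor's sketch] Treating -1 as "consume the whole dataset" by substituting sys.maxsize, as run_mode.py now does for both infer_steps and max_train_steps, lets a single range() drive bounded and unbounded runs alike; the loop then ends via the dataset's OutOfRangeError rather than the counter. Minimal sketch of the sentinel handling:

    import sys

    def effective_steps(requested: int) -> int:
        """-1 means 'run until the data is exhausted'."""
        return sys.maxsize if requested == -1 else requested

    assert effective_steps(200) == 200
    assert effective_steps(-1) == sys.maxsize

The companion change of offsetting intervals by latest_ckpt_step makes evaluation and checkpointing fire relative to the restored step instead of the absolute counter, so a resumed run keeps the same cadence.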
swap_args.set_data(SwapDataType.CONFIG.value, var_name=config.get("table_name"), - var_channel=config.get("channel_id"), config=config, swap_pos=swap_pos, swap_len=swap_len) + var_channel=config.get("channel_id"), config=config, swap_info=swap_info) all2all_args = get_all2all_args(use_static, config) diff --git a/mx_rec/core/asc/manager.py b/mx_rec/core/asc/manager.py index 97a71a4d..3a24b3d7 100644 --- a/mx_rec/core/asc/manager.py +++ b/mx_rec/core/asc/manager.py @@ -194,6 +194,7 @@ def initialize_emb_cache(table_info_list, threshold_list): train_steps = ConfigInitializer.get_instance().train_steps eval_steps = ConfigInitializer.get_instance().eval_steps save_steps = ConfigInitializer.get_instance().save_steps + max_train_steps = ConfigInitializer.get_instance().max_steps if_load = ConfigInitializer.get_instance().if_load option = 0 @@ -206,8 +207,8 @@ def initialize_emb_cache(table_info_list, threshold_list): if optimizer and optimizer.derivative == 2: option = option | USE_SUM_SAME_ID_GRADIENTS - # [train_steps, eval_steps, save_steps] pass step information to HybridMgmt for data process loop - rank_info = RankInfo(rank_id, device_id, rank_size, option, [train_steps, eval_steps, save_steps]) + # pass step information to HybridMgmt for data process loop + rank_info = RankInfo(rank_id, device_id, rank_size, option, [train_steps, eval_steps, save_steps, max_train_steps]) emb_cache = HybridMgmt() diff --git a/mx_rec/core/feature_process.py b/mx_rec/core/feature_process.py index 7a90e78b..a2161d02 100644 --- a/mx_rec/core/feature_process.py +++ b/mx_rec/core/feature_process.py @@ -61,6 +61,8 @@ class EvictHook(tf.compat.v1.train.SessionRunHook): raise RuntimeError("Global step should be created to use _EvictHook.") self.check_name_and_get_hashtable() for name, instance in self._hash_table_instance.items(): + if not instance.is_hbm: + continue scope_name = f"{instance.table_name}//evict" with tf.compat.v1.variable_scope(scope_name): logger.debug('Channel %s_evict_%d was built for op getnext', instance.table_name, TRAIN_CHANNEL_ID) @@ -99,7 +101,9 @@ class EvictHook(tf.compat.v1.train.SessionRunHook): if not ConfigInitializer.get_instance().hybrid_manager_config.trigger_evict(): return self._start_time = cur_time - for name in self._hash_table_instance.keys(): + for name, instance in self._hash_table_instance.items(): + if not instance.is_hbm: + continue run_context.session.run(self._evict_op.get(name)) def check_name_and_get_hashtable(self): diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 179de09f..01aeda94 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -39,6 +39,7 @@ from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.helper import get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline from mx_rec.core.asc.swap_args import SwapArgs +from mx_rec.core.asc.build_graph import SwapInfo from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.graph.merge_lookup import do_merge_lookup from mx_rec.graph.utils import check_and_force_list, export_pb_graph @@ -245,14 +246,13 @@ class _GraphModifier: table_instance = ConfigInitializer.get_instance().sparse_embed_config.get_table_instance(each_var) if table_instance.is_hbm: continue - swap_args_dict = swap_args.swap_config_dict[table_instance.table_name][channel_id] - swap_pos = swap_args_dict["swap_pos"] - swap_len = swap_args_dict["swap_len"] variable_and_slot_list = _get_variable_and_slot_list( each_var, slot_num, 
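[Editor's sketch] The SwapInfo dataclass introduced above declares its list fields with field(default_factory=...): a dataclass rejects a bare mutable default (list/dict/set) outright, and default_factory guarantees each instance gets a fresh list. Quick demonstration with a stand-in mirroring SwapInfo:

    from dataclasses import dataclass, field
    from typing import List

    @dataclass
    class SwapInfoDemo:  # stand-in; the real SwapInfo carries in/out lengths and positions
        swap_in_len: int = 0
        swap_in_pos: List[int] = field(default_factory=list)

    a, b = SwapInfoDemo(), SwapInfoDemo()
    a.swap_in_pos.append(7)
    assert b.swap_in_pos == []  # no sharing between instances

(default_factory=list is equivalent to the lambda used in the patch.)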
table_instance.table_name, channel_id ) - swap_op = _get_swap_info(table_instance, variable_and_slot_list, swap_len, swap_pos, channel_id) + swap_args_dict = swap_args.swap_config_dict[table_instance.table_name][channel_id] + swap_op = _get_swap_info( + table_instance, variable_and_slot_list, swap_args_dict["swap_info"], channel_id) swap_control_dict = swap_args.swap_control_dict[table_instance.table_name][channel_id] if "control_ops" not in swap_control_dict: raise ValueError("Missing Required key in modify_graph_for_asc: control_ops") @@ -518,6 +518,7 @@ class _GraphModifier: @para_checker_decorator( check_option_list=[ + ("full_graph", ClassValidator, {"classes": (Graph, type(None))}), ("dump_graph", ClassValidator, {"classes": (bool,)}), ] ) @@ -718,57 +719,57 @@ def _get_variable_and_slot_list(each_var, slot_num, table_name, channel_id): return variable_and_slot_list -def _get_swap_info( - table_instance: BaseSparseEmbedding, variable_and_slot_list: list, swap_len: int, swap_pos: list, channel_id: int -) -> list: +def _get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: list, + swap_info: SwapInfo, channel_id: int) -> list: """ - Get swap info if threshold is configured. + Get swap op. :param table_instance: BaseSparseEmbedding :param variable_and_slot_list: [var + slots] - :param swap_len: swap length - :param swap_pos: swap position + :param swap_info: swap in/out length and position :param channel_id: train or predict - :return: swap info + :return: swap op """ + if table_instance.is_hbm: + return [tf.no_op()] + + if len(variable_and_slot_list) == 0: + raise RuntimeError("When enable emb_transfer, optimizer should have slots") + use_static = ConfigInitializer.get_instance().use_static max_lookup_vec_size = None if use_static: max_lookup_vec_size = table_instance.send_count * table_instance.rank_size - if table_instance.is_hbm: - swap_in = [tf.no_op()] - else: - with tf.compat.v1.variable_scope("h2d_emb"): - logger.debug("Channel %s_h2d_%s was built for getnext", table_instance.table_name, channel_id) - h2d_emb = npu_ops.gen_npu_ops.get_next( - output_types=[tf.float32], - output_shapes=[[max_lookup_vec_size, table_instance.ext_emb_size]], - channel_name=f"{table_instance.table_name}_h2d_{channel_id}", - )[0] - logger.debug("h2d_emb shape: %s", h2d_emb) - if not isinstance(variable_and_slot_list, list): - raise RuntimeError("When enable emb_transfer, optimizer should have slots") - if use_static: - swap_pos = swap_pos[0:swap_len] - h2d_emb = h2d_emb[0:swap_len, :] - swap_outs = [tf.gather(one_table, swap_pos) for one_table in variable_and_slot_list] - swap_out = tf.concat(swap_outs, axis=1) - logger.debug("Channel %s_d2h_%s was built for op outfeed.", table_instance.table_name, channel_id) - swap_out_op = npu_ops.outfeed_enqueue_op( - channel_name=f"{table_instance.table_name}_d2h_{channel_id}", inputs=[swap_out] - ) - with tf.control_dependencies([swap_out_op]): - nd_swap_pos = tf.expand_dims(swap_pos, 1) - table_num = len(variable_and_slot_list) - h2d_emb_split = tf.split(h2d_emb, table_num, axis=1) - optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name( - table_instance.table_name - ) - if optimizer is None and channel_id == 1: - swap_in = [tf.compat.v1.scatter_nd_update(variable_and_slot_list[0], nd_swap_pos, h2d_emb_split[0])] - else: - swap_in = [ - tf.compat.v1.scatter_nd_update(variable_and_slot_list[i], nd_swap_pos, h2d_emb_split[i]) - for i in range(len(variable_and_slot_list)) - ] - return swap_in + with 
tf.compat.v1.variable_scope("h2d_emb"): + logger.debug('Channel %s_h2d_%s was built for getnext', table_instance.table_name, channel_id) + h2d_emb = npu_ops.gen_npu_ops.get_next( + output_types=[tf.float32], + output_shapes=[[max_lookup_vec_size, table_instance.ext_emb_size]], + channel_name=f'{table_instance.table_name}_h2d_all')[0] + logger.debug("h2d_emb shape: %s", h2d_emb) + + swap_out_pos = swap_info.swap_out_pos + swap_in_pos = swap_info.swap_in_pos + if use_static: + swap_out_pos = swap_out_pos[:swap_info.swap_out_len] + h2d_emb = h2d_emb[:swap_info.swap_in_len, :] + swap_in_pos = swap_in_pos[:swap_info.swap_in_len] + swap_outs = [tf.gather(one_table, swap_out_pos) for one_table in variable_and_slot_list] + swap_out = tf.concat(swap_outs, axis=1) + logger.debug('Channel %s_d2h_all was built for op outfeed.', table_instance.table_name) + + swap_out_op = npu_ops.outfeed_enqueue_op( + channel_name=f'{table_instance.table_name}_d2h_all', inputs=[swap_out]) + with tf.control_dependencies([swap_out_op]): + nd_swap_pos = tf.expand_dims(swap_in_pos, 1) + var_num = len(variable_and_slot_list) + h2d_emb_split = tf.split(h2d_emb, var_num, axis=1) + + optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name( + table_instance.table_name) + if optimizer is None and channel_id == 1: + swap_in_op = [tf.compat.v1.scatter_nd_update(variable_and_slot_list[0], nd_swap_pos, h2d_emb_split[0])] + else: + swap_in_op = [tf.compat.v1.scatter_nd_update(variable_and_slot_list[i], nd_swap_pos, h2d_emb_split[i]) + for i in range(var_num)] + return swap_in_op diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index 9f34cca3..a91599bc 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -24,7 +24,7 @@ import tensorflow as tf from tensorflow.python.util import compat from mx_rec.constants.constants import DataName, DataAttr, MIN_SIZE, MAX_FILE_SIZE, Flag, TFDevice, \ - MAX_INT32, HDFS_FILE_PREFIX + MAX_INT32, HDFS_FILE_PREFIX, TRAIN_CHANNEL_ID from mx_rec.util.communication.hccl_ops import get_rank_id, get_rank_size, get_local_rank_size from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.perf import performance @@ -33,6 +33,7 @@ from mx_rec.validator.validator import DirectoryValidator, FileValidator, para_c from mx_rec.util.global_env_conf import global_env from mx_rec.util.log import logger from mx_rec.optimizers.base import CustomizedOptimizer +from mx_rec.util.tf_version_adapter import npu_ops # define save model thread @@ -63,6 +64,7 @@ class Saver(object): self.rank_id = get_rank_id() self.local_rank_size = get_local_rank_size() self.local_rank_id = self.rank_id % self.local_rank_size + self.rank_size = get_rank_size() self.save_op_dict = defaultdict(dict) self.restore_fetch_dict = defaultdict() self.placeholder_dict = defaultdict(dict) @@ -256,25 +258,54 @@ class Saver(object): if optimizer_instance: set_optimizer_info(optimizer_instance, table_name) - if self.config_instance.hybrid_manager_config.asc_manager: + table_instance0 = self.config_instance.sparse_embed_config.get_table_instance(self.var_list[0]) + if table_instance0.is_hbm: self.config_instance.hybrid_manager_config.save_host_data(root_dir) - logger.debug(f"host data was saved.") + if self.config_instance.use_dynamic_expansion: + # Data related to dynamic expansion needs to be saved only on the host side. + return - if self.config_instance.use_dynamic_expansion: - # Data related to dynamic expansion needs to be saved only on the host side. 
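[Editor's sketch] Numeric intuition for the swap wiring above: rows leaving the device cache are gathered out at swap_out_pos and sent host-ward through the outfeed, while refreshed host rows arriving on the h2d channel are scattered back in at swap_in_pos. A TF1-style toy sketch of just the gather/scatter pair, with made-up shapes and no channels:

    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()

    table = tf.compat.v1.get_variable("toy_table", initializer=tf.zeros([8, 4]))
    swap_out_pos = tf.constant([1, 5])
    swap_in_pos = tf.constant([2, 6])
    h2d_emb = tf.ones([2, 4])  # rows arriving from the host side

    swap_out = tf.gather(table, swap_out_pos)  # rows to ship device -> host
    swap_in = tf.compat.v1.scatter_nd_update(
        table, tf.expand_dims(swap_in_pos, 1), h2d_emb)

    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        out_rows, _ = sess.run([swap_out, swap_in])

The control dependency in the real code matters for the same reason it would here: the swap-in must not overwrite rows before the swap-out has read them.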
- return + result = self.save_op_dict + threads = [] + for table_name in result.keys(): + thread = SaveModelThread(self, sess, result, root_dir, table_name) + threads.append(thread) - result = self.save_op_dict - threads = [] - for table_name in result.keys(): - thread = SaveModelThread(self, sess, result, root_dir, table_name) - threads.append(thread) + for thread in threads: + thread.start() - for thread in threads: - thread.start() - - for thread in threads: - thread.join() + for thread in threads: + thread.join() + else: + # 接受host侧传来的需要swap_out的offset用于更新host侧并保存 + self.config_instance.hybrid_manager_config.fetch_device_emb() + for var in self.var_list: + table_instance = self.config_instance.sparse_embed_config.get_table_instance(var) + table_name = table_instance.table_name + + use_static = ConfigInitializer.get_instance().use_static + max_lookup_vec_size = None + if use_static: + max_lookup_vec_size = table_instance.send_count * self.rank_size + swap_out_pos, swap_out_len = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32, tf.int32], + output_shapes=[[max_lookup_vec_size], []], + channel_name=f'{table_name}_save_h2d_{TRAIN_CHANNEL_ID}') + if use_static: + swap_out_pos = swap_out_pos[:swap_out_len] + + optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(table_name) + table = [var] + [slot_var for slots in optimizer.values() for slot_var in slots.values()] + + swap_outs = [tf.gather(one_table, swap_out_pos) for one_table in table] + swap_out = tf.concat(swap_outs, axis=1) + channel_name = f'{table_name}_save_d2h_{TRAIN_CHANNEL_ID}' + logger.debug('channel %s was built for op swap_out_op.', channel_name) + swap_out_op = npu_ops.outfeed_enqueue_op(channel_name=channel_name, inputs=[swap_out]) + # 发送host需要的embedding + sess.run(swap_out_op) + self.config_instance.hybrid_manager_config.save_host_data(root_dir) + logger.debug(f"host data was saved.") def _get_valid_dict_data(self, dump_data_dict, table_name): host_data = self.config_instance.hybrid_manager_config.get_host_data(table_name) @@ -346,6 +377,10 @@ class Saver(object): self.config_instance.hybrid_manager_config.restore_host_data(reading_path, warm_start_tables) logger.info("host data was restored.") + table_instance0 = self.config_instance.sparse_embed_config.get_table_instance(self.var_list[0]) + if not table_instance0.is_hbm: + return + if self.config_instance.use_dynamic_expansion: # Data related to dynamic expansion needs to be restored only on the host side. 
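[Editor's sketch] In the HBM branch above each table is saved on its own thread. The same fan-out/join shape with the standard library, where failures surface on join (save_table is a placeholder for the per-table save ops that SaveModelThread runs):

    from concurrent.futures import ThreadPoolExecutor

    def save_table(table_name: str) -> None:
        """Placeholder for running the per-table save ops."""

    with ThreadPoolExecutor() as pool:
        futures = {name: pool.submit(save_table, name)
                   for name in ("user_table", "item_table")}  # example table names
        for name, fut in futures.items():
            fut.result()  # unlike a bare Thread.join, result() re-raises per-table errors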
return @@ -355,7 +390,7 @@ class Saver(object): for table_name, sub_placeholder_dict in placeholder_dict.items(): load_offset = self.config_instance.hybrid_manager_config.get_load_offset(table_name) fill_placeholder(reading_path, sub_placeholder_dict, restore_feed_dict, - NameDescriptor(table_name, DataName.EMBEDDING.value), load_offset) + NameDescriptor(table_name, DataName.EMBEDDING.value), load_offset) if "optimizer" in sub_placeholder_dict: optimizer_state_placeholder_dict_group = sub_placeholder_dict.get("optimizer") diff --git a/mx_rec/util/config_utils/hybrid_mgmt_utils.py b/mx_rec/util/config_utils/hybrid_mgmt_utils.py index b2ad0efd..26624461 100644 --- a/mx_rec/util/config_utils/hybrid_mgmt_utils.py +++ b/mx_rec/util/config_utils/hybrid_mgmt_utils.py @@ -92,3 +92,9 @@ class HybridManagerConfig: raise TypeError("Asc load data does not match usr setups, \ please re-consider if you want to restore from this dir") logger.debug("Data from host pipeline has been restored.") + + def fetch_device_emb(self): + if self.asc_manager is None: + raise RuntimeError("ASC manager not exist.") + self.asc_manager.fetch_device_emb() + logger.debug("request of fetching embedding from device to host for saving has been send") diff --git a/src/AccCTR/3rdparty/CMakeLists.txt b/src/AccCTR/3rdparty/CMakeLists.txt index a17e472c..3a05f585 100644 --- a/src/AccCTR/3rdparty/CMakeLists.txt +++ b/src/AccCTR/3rdparty/CMakeLists.txt @@ -1,3 +1,17 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2022-2024. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + message("build mode " ${BUILD_MODE}) set(PLATFORM_UTILITIES_3RDPARTY_SOURCE_DIR ${PROJECT_SOURCE_DIR}/../../../opensource) diff --git a/src/AccCTR/CMakeLists.txt b/src/AccCTR/CMakeLists.txt index 0cb63176..60e2d638 100644 --- a/src/AccCTR/CMakeLists.txt +++ b/src/AccCTR/CMakeLists.txt @@ -23,8 +23,6 @@ if (${BUILD_MODE} MATCHES "release") -Wall -fPIC -fms-extensions - -Wno-unused-parameter - -Wno-unused-function -Wunused-variable -Wunused-value -Wcast-align @@ -47,8 +45,6 @@ elseif (${BUILD_MODE} MATCHES "debug") -Wall -fPIC -fms-extensions - -Wno-unused-parameter - -Wno-unused-function -Wunused-variable -Wunused-value -Winvalid-pch @@ -67,8 +63,6 @@ elseif (${BUILD_MODE} MATCHES "ut") -Wall -fPIC -fms-extensions - -Wno-unused-parameter - -Wno-unused-function -Wunused-variable -Wunused-value -Winvalid-pch @@ -79,10 +73,6 @@ elseif (${BUILD_MODE} MATCHES "ut") -Wfloat-equal -Wextra -std=c++17 - #-fsanitize=address - #-fno-omit-frame-pointer - #-fstack-protector-all - #-fstack-protector-strong ) else () message(FATAL_ERROR "======BUILD_MODE not found") @@ -100,7 +90,6 @@ elseif (${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "x86_64") ${CXX_FLAGS} -msse2 -mavx - #-w ) else () message(FATAL_ERROR "don't support ${CMAKE_HOST_SYSTEM_PROCESSOR}") @@ -110,6 +99,11 @@ set(OCK_CTR_PLATFORM_UTIL_DIR ${PROJECT_SOURCE_DIR}/../../../opensource) message(===============${OCK_CTR_PLATFORM_UTIL_DIR}) include_directories(${OCK_CTR_PLATFORM_UTIL_DIR}/securec/include) +include_directories( + ${PROJECT_SOURCE_DIR}/src + ${PROJECT_SOURCE_DIR}/src/embedding_cache +) + add_subdirectory(3rdparty) add_subdirectory(src) diff --git a/src/AccCTR/README.md b/src/AccCTR/README.md index 1a394699..1b25534d 100644 --- a/src/AccCTR/README.md +++ b/src/AccCTR/README.md @@ -6,4 +6,6 @@ 2、bash build.sh debug //编译debug -3、bash build.sh ut //编译并运行ut,覆盖率在tests/build/cov/gen目录下 +3、编译和运行UT: + (1)bash build.sh ut //编译ut,覆盖率在tests/build/cov/gen目录下 + (2)cd build && bash build_test.sh ut //进入到build目录下并运行ut \ No newline at end of file diff --git a/src/AccCTR/src/CMakeLists.txt b/src/AccCTR/src/CMakeLists.txt index 09da4670..1f4d9269 100644 --- a/src/AccCTR/src/CMakeLists.txt +++ b/src/AccCTR/src/CMakeLists.txt @@ -23,12 +23,17 @@ set(OUTPUT ${PROJECT_SOURCE_DIR}/output) set(OCK_CTR_PLATFORM_UTIL_DIR ${PROJECT_SOURCE_DIR}/../../../opensource) set(OCK_CTR_UTIL_INSTALL_DIR ${PROJECT_SOURCE_DIR}/install) - if (${BUILD_MODE} MATCHES "ut") add_compile_options(-ftest-coverage -fprofile-arcs) link_libraries(gcov) +else() + add_compile_options(-D_GLIBCXX_USE_CXX11_ABI=0) # must set this option otherwise pybind will not find embCache symbol endif (${BUILD_MODE} MATCHES "ut") +if (${BUILD_MODE} MATCHES "fuzz") + add_compile_options(-ftest-coverage -fprofile-arcs -fdump-rtl-expand) + link_libraries(gcov asan) +endif (${BUILD_MODE} MATCHES "fuzz") message("include : " ${OCK_CTR_SRC_INCLUDE_DIR}) @@ -37,6 +42,7 @@ set(LIB_HW_SECURE ${OCK_CTR_PLATFORM_UTIL_DIR}/securec/lib/libsecurec.so) add_subdirectory(include) add_subdirectory(common) add_subdirectory(unique) +add_subdirectory(embedding_cache) file(GLOB_RECURSE CTR_SRC factory_impl.cpp) @@ -52,6 +58,7 @@ target_include_directories(_ock_ctr_common target_link_libraries(_ock_ctr_common PUBLIC -Wl,--start-group unique + embedding_cache dl utils ${LIB_HW_SECURE} diff --git a/src/AccCTR/src/common/util/error_code.h b/src/AccCTR/src/common/util/error_code.h index 04d26a57..b30bfd83 100644 --- 
+++ b/src/AccCTR/src/common/util/error_code.h
@@ -29,7 +29,20 @@ using CTRCode = enum : int {
     H_OUTPUT_TYPE_ERROR = 8,
     H_SCENE_ERROR = 9,
     H_MEMORY_ALLOC_ERROR = 10,
-    H_UNIQUE_UNINITIALIZED_ERROR = 11
+    H_UNIQUE_UNINITIALIZED_ERROR = 11,
+    H_TABLE_NOT_EXIST = 12,
+    H_LOAD_ERROR = 13,
+    H_INITIALIZER_INVALID = 14,
+    H_EXT_EMBEDDING_SIZE_INVALID = 15,
+    H_MAX_CACHESIZE_TOO_SMALL = 16,
+    H_HOST_VOCAB_SIZE_TOO_SMALL = 17,
+    H_THREAD_NUM_ERROR = 18,
+    H_TABLE_CREATE_DUPLICATE = 19,
+    H_ARG_NOT_EMPTY = 20,
+    H_SIZE_ZERO = 21,
+    H_TABLE_NAME_EMPTY = 22,
+    H_PREFILL_BUFFER_SIZE_INVALID = 23,
+    H_TABLE_NAME_TOO_LONG = 24,
 };
 }
 }
diff --git a/src/AccCTR/src/common/util/external_threader.h b/src/AccCTR/src/common/util/external_threader.h
index 5a1132af..5f7c500f 100644
--- a/src/AccCTR/src/common/util/external_threader.h
+++ b/src/AccCTR/src/common/util/external_threader.h
@@ -20,11 +20,81 @@ limitations under the License.
 #include <functional>
 #include <thread>
 #include <vector>
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <queue>
 
 #include "singleton.h"
 
 using ExternalThread = void (*)(const std::vector<std::function<void()>> &tasks);
 
 namespace ock {
+class ThreadPoolAsync {
+public:
+    ThreadPoolAsync() : stop(false) {}
+
+    ~ThreadPoolAsync()
+    {
+        {
+            std::lock_guard<std::mutex> lock(taskMutex);
+            stop = true;
+        }
+        taskCv.notify_all();
+        for (auto &t : workerThreads) {
+            t.join();
+        }
+    }
+
+    void SetNumThreads(int n)
+    {
+        if (n < 1) {
+            return;
+        }
+
+        for (int i = 0; i < n; ++i) {
+            workerThreads.emplace_back(std::bind(&ThreadPoolAsync::WorkerThread, this));
+        }
+    }
+
+    template <typename F> std::future<void> AddTask(F &&f)
+    {
+        std::lock_guard<std::mutex> lock(taskMutex);
+
+        auto pt = std::make_unique<std::packaged_task<void()>>(std::forward<F>(f));
+        auto fut = pt->get_future();
+        tasks.emplace(std::move(pt));
+        taskCv.notify_one();
+        return fut;
+    }
+
+private:
+    std::vector<std::thread> workerThreads;
+    std::queue<std::unique_ptr<std::packaged_task<void()>>> tasks;
+    std::mutex taskMutex;
+    std::condition_variable taskCv;
+    std::atomic<bool> stop = false;
+
+    void WorkerThread()
+    {
+        while (true) {
+            std::unique_ptr<std::packaged_task<void()>> task;
+            {
+                std::unique_lock<std::mutex> lock(taskMutex);
+                while (tasks.empty() && !stop) {
+                    taskCv.wait(lock);
+                }
+                if (stop) {
+                    break;
+                }
+                task = std::move(tasks.front());
+                tasks.pop();
+            }
+            (*task)();
+        }
+    }
+};
+
+
 class SimpleThreadPool {
 public:
     static void SyncRun(const std::vector<std::function<void()>> &tasks)
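The ThreadPoolAsync added above differs from the existing SimpleThreadPool in that AddTask wraps each submitted callable in a std::packaged_task and hands back a std::future, so a caller can block on a single task's completion rather than on a whole batch. A minimal usage sketch (only the pool itself comes from the hunk above; everything else here is illustrative):

    #include <iostream>
    #include "external_threader.h"

    int main()
    {
        ock::ThreadPoolAsync pool;
        pool.SetNumThreads(4);  // spawn four worker threads
        auto fut = pool.AddTask([] { std::cout << "refill done" << std::endl; });
        fut.get();              // block until this one task has executed
        return 0;               // ~ThreadPoolAsync() stops and joins the workers
    }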
diff --git a/src/AccCTR/src/embedding_cache/CMakeLists.txt b/src/AccCTR/src/embedding_cache/CMakeLists.txt
new file mode 100644
index 00000000..e0278a6e
--- /dev/null
+++ b/src/AccCTR/src/embedding_cache/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+file(GLOB_RECURSE SRCS *.cpp *.h)
+
+add_library(embedding_cache OBJECT ${SRCS})
+
+target_link_libraries(embedding_cache
+        -Wl,--start-group
+        -Wl,--end-group
+        )
+
+target_include_directories(embedding_cache
+        PUBLIC
+        ${PROJECT_SOURCE_DIR}/src/common/util
+        ${PROJECT_SOURCE_DIR}/src/include)
\ No newline at end of file
diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
new file mode 100644
index 00000000..3620c5d0
--- /dev/null
+++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
@@ -0,0 +1,421 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/
+
+#include "cache_manager.h"
+
+#include <algorithm>
+
+#include "external_logger.h"
+
+using namespace EmbCache;
+using namespace ock;
+using namespace ock::ctr;
+
+int64_t EmbCache::INVALID_KEY = -1;
+
+int EmbCacheManagerImpl::CreateCacheForTable(const EmbCacheInfo& embCacheInfo,
+    const std::vector<InitializerInfo>& initializerInfos, int64_t invalidKey,
+    uint64_t prefillBufferSize, uint32_t refillThreadNum)
+{
+    int checkTableNameRet = CheckCreateTableName(embCacheInfo.tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    if (embCacheInfo.extEmbeddingSize == 0 || embCacheInfo.embeddingSize == 0 || embCacheInfo.vocabSize == 0 ||
+        embCacheInfo.maxCacheSize == 0) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "size must be positive");
+        return H_SIZE_ZERO;
+    }
+
+    if (embCacheInfo.vocabSize < embCacheInfo.maxCacheSize) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "vocabSize must be greater than or equal to maxCacheSize");
+        return H_HOST_VOCAB_SIZE_TOO_SMALL;
+    }
+
+    auto om = offsetMappers.find(embCacheInfo.tableName);
+    auto embTable = embTables.find(embCacheInfo.tableName);
+    if (om != offsetMappers.end() || embTable != embTables.end()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "This table has already been created");
+        return H_TABLE_CREATE_DUPLICATE;
+    }
+
+    if (embCacheInfo.extEmbeddingSize % embCacheInfo.embeddingSize != 0) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "extEmbeddingSize = embeddingSize + optimizerSize, "
+            "and must be divisible by embeddingSize");
+        return H_EXT_EMBEDDING_SIZE_INVALID;
+    }
+
+    if (!CheckInitializer(embCacheInfo.extEmbeddingSize, initializerInfos)) {
+        return H_INITIALIZER_INVALID;
+    }
+
+    if ((prefillBufferSize < 1) || (prefillBufferSize > embCacheInfo.vocabSize)) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "prefillBufferSize has to be within [1, hostVocabSize]");
+        return H_PREFILL_BUFFER_SIZE_INVALID;
+    }
+
+    if (!CheckValidThreadNum(refillThreadNum)) {
+        return H_THREAD_NUM_ERROR;
+    }
+
+    uint32_t reserve = embCacheInfo.vocabSize / VOCAB_CACHE_RATIO;
+    if (!offsetMappers[embCacheInfo.tableName].Initialize(reserve, embCacheInfo.maxCacheSize)) {
+        offsetMappers[embCacheInfo.tableName].UnInitialize();
+        offsetMappers.erase(embCacheInfo.tableName);
+        return H_MEMORY_ALLOC_ERROR;
+    }
+
+    EmbPoolParam embPoolParam{prefillBufferSize, refillThreadNum};
+
+    if (!embTables[embCacheInfo.tableName].Initialize(embCacheInfo, reserve, initializerInfos, embPoolParam)) {
+        offsetMappers.erase(embCacheInfo.tableName);
+        embTables.erase(embCacheInfo.tableName);
+        return H_MEMORY_ALLOC_ERROR;
+    }
+
+    embCacheInfos.insert({embCacheInfo.tableName, embCacheInfo});
+    INVALID_KEY = invalidKey;
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::GetSwapPairsAndKey2Offset(const std::string& tableName, std::vector<uint64_t>& keys,
+    KeyOffsetPair& swapInKoPair, KeyOffsetPair& swapOutKoPair)
+{
+    int checkRet = CheckGetSwapPairsAndKey2Offset(tableName, swapInKoPair, swapOutKoPair);
+    if (checkRet != H_OK) {
+        return checkRet;
+    }
+    return offsetMappers[tableName].GetSwapPairsAndKey2Offset(keys, swapInKoPair, swapOutKoPair);
+}
+
+int EmbCacheManagerImpl::EmbeddingLookup(const std::string& tableName, const std::vector<uint64_t>& keys,
+    float* embAddr, uint32_t threadNum)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    if (!CheckValidThreadNum(threadNum)) {
+        return H_THREAD_NUM_ERROR;
+    }
+
+    if (keys.empty()) {
+        return H_OK;
+    }
+
+    if (embAddr == nullptr) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "embAddr is nullptr");
+        return H_ADDRESS_NULL;
+    }
+
+    return embTables[tableName].Gather(reinterpret_cast<uint64_t>(embAddr), keys, threadNum);
+}
+
+int EmbCacheManagerImpl::EmbeddingLookupAddrs(const std::string& tableName, const std::vector<uint64_t>& keys,
+    std::vector<float *>& addrs, uint32_t threadNum)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    if (!CheckValidThreadNum(threadNum)) {
+        return H_THREAD_NUM_ERROR;
+    }
+
+    if (keys.empty()) {
+        return H_OK;
+    }
+
+    return embTables[tableName].GatherAddrs(keys, addrs, threadNum);
+}
+
+// If called from multiple threads, the caller must strictly guarantee that keys do not repeat
+// across threads (unique keys); otherwise the result is undefined
+int EmbCacheManagerImpl::EmbeddingLookupAndRemove(const std::string& tableName, const std::vector<uint64_t>& keys,
+    float* embAddr, uint32_t threadNum)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    if (!CheckValidThreadNum(threadNum)) {
+        return H_THREAD_NUM_ERROR;
+    }
+
+    if (keys.empty()) {
+        return H_OK;
+    }
+
+    if (embAddr == nullptr) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "embAddr is nullptr");
+        return H_ADDRESS_NULL;
+    }
+
+    return embTables[tableName].GatherAndRemove(reinterpret_cast<uint64_t>(embAddr), keys, threadNum);
+}
+
+int EmbCacheManagerImpl::EmbeddingUpdate(const std::string& tableName, const std::vector<uint64_t>& keys,
+    float* embAddr, uint32_t threadNum)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    if (!CheckValidThreadNum(threadNum)) { // check that threadNum does not exceed the number of CPU cores
+        return H_THREAD_NUM_ERROR;
+    }
+
+    if (keys.empty()) {
+        return H_OK;
+    }
+
+    if (embAddr == nullptr) { // check that embAddr is not a null pointer
+        ExternalLogger::PrintLog(LogLevel::ERROR, "embAddr is nullptr");
+        return H_ADDRESS_NULL;
+    }
+
+    return embTables[tableName].Scatter(reinterpret_cast<uint64_t>(embAddr), keys, threadNum);
+}
+
+int EmbCacheManagerImpl::EmbeddingRemove(const std::string& tableName, const std::vector<uint64_t>& keys,
+    uint32_t threadNum)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    if (!CheckValidThreadNum(threadNum)) { // check that threadNum does not exceed the number of CPU cores
+        return H_THREAD_NUM_ERROR;
+    }
+
+    if (keys.empty()) {
+        return H_OK;
+    }
+
+    return embTables[tableName].RemoveByKeys(keys, threadNum);
+}
+
+int EmbCacheManagerImpl::RemoveEmbsByKeys(const std::string& tableName, const std::vector<uint64_t>& keys)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+    const auto& om = offsetMappers.find(tableName);
+    const auto& embTable = embTables.find(tableName);
+    for (auto key : keys) {
+        if (key == static_cast<uint64_t>(INVALID_KEY)) {
+            ExternalLogger::PrintLog(LogLevel::WARN, "Try to evict invalid key");
+            continue;
+        }
+        om->second.Remove(key);
+        embTable->second.Remove(key);
+    }
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::GetEmbTableNames(std::vector<std::string>& allTableNames)
+{
+    if (!allTableNames.empty()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "allTableNames should be empty");
+        return H_ARG_NOT_EMPTY;
+    }
+    allTableNames.reserve(embTables.size());
+    for (auto& embTable : embTables) {
+        allTableNames.emplace_back(embTable.first);
+    }
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::ExportDeviceKeyOffsetPairs(const std::string& tableName,
+    std::vector<std::pair<uint64_t, uint32_t>>& koVec)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+    OffsetMapper& om = offsetMappers[tableName];
+    koVec = om.ExportSortedKVPairs();
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::Serialize(const std::string& tableName, std::vector<char>& buffer)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+    buffer = embTables[tableName].Serialize();
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::Deserialize(const std::string& tableName, const std::vector<char>& buffer)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+    if (!embTables[tableName].Deserialize(buffer)) {
+        return H_LOAD_ERROR;
+    }
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::GetEmbTableInfos(std::string tableName, std::vector<uint64_t>& keys,
+    std::vector<std::vector<float>>& embeddings,
+    std::vector<std::vector<float>>& optimizerSlots)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+    if (!keys.empty()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "keys should be empty");
+        return H_ARG_NOT_EMPTY;
+    }
+    if (!embeddings.empty()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "embeddings should be empty");
+        return H_ARG_NOT_EMPTY;
+    }
+    if (!optimizerSlots.empty()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "optimizerSlots should be empty");
+        return H_ARG_NOT_EMPTY;
+    }
+    embTables[tableName].GetEmbTableInfos(keys, embeddings, optimizerSlots);
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::LoadEmbTableInfos(std::string tableName, const std::vector<uint64_t>& keys,
+    const std::vector<std::vector<float>>& embeddings,
+    const std::vector<std::vector<float>>& optimizerSlots)
+{
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+    if (!embTables[tableName].LoadEmbTableInfos(keys, embeddings, optimizerSlots)) {
+        return H_LOAD_ERROR;
+    }
+    return H_OK;
+}
+
+void EmbCacheManagerImpl::Destroy()
+{
+    for (auto it = offsetMappers.begin(); it != offsetMappers.end(); it++) {
+        it->second.UnInitialize();
+    }
+    for (auto it = embTables.begin(); it != embTables.end(); it++) {
+        it->second.UnInitialize();
+    }
+    embCacheInfos.clear();
+    offsetMappers.clear();
+    embTables.clear();
+}
+
+int EmbCacheManagerImpl::CheckValidTableName(const std::string& tableName)
+{
+    if (tableName.size() > TABLE_NAME_MAX_SIZE) {
+        ExternalLogger::PrintLog(LogLevel::ERROR,
+            "tableName size can not be larger than " + std::to_string(TABLE_NAME_MAX_SIZE));
+        return H_TABLE_NAME_TOO_LONG;
+    }
+    auto om = offsetMappers.find(tableName);
+    auto embTable = embTables.find(tableName);
+    if (om == offsetMappers.end() || embTable == embTables.end()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "can not find table");
+        return H_TABLE_NOT_EXIST;
+    }
+    return H_OK;
+}
+
+bool EmbCacheManagerImpl::CheckInitializer(uint32_t extEmbSize, std::vector<InitializerInfo> initializerInfos)
+{
+    std::sort(initializerInfos.begin(), initializerInfos.end(),
+        [](const auto& u, const auto& v) { return u.start < v.start; });
+    uint32_t cur_pos = 0;
+    for (const auto& info : initializerInfos) {
+        if (info.initializer == nullptr) {
+            ExternalLogger::PrintLog(LogLevel::ERROR, "initializer is nullptr");
+            return false;
+        }
+        if (info.start != cur_pos) {
+            ExternalLogger::PrintLog(LogLevel::ERROR, "Initializers have coverage problems");
+            return false;
+        }
+        cur_pos += info.len;
+    }
+    // finally, the segments must cover exactly [0, extEmbSize)
+    if (cur_pos != extEmbSize) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "Initializers have coverage problems");
+        return false;
+    }
+    return true;
+}
+
+bool EmbCacheManagerImpl::CheckValidThreadNum(uint32_t threadNum)
+{
+    uint32_t processCoreNum = std::thread::hardware_concurrency();
+    if (threadNum > processCoreNum) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "ThreadNum can not be larger than the cpu core num");
+        return false;
+    }
+
+    if (threadNum == 0) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "ThreadNum can not be zero");
+        return false;
+    }
+    return true;
+}
+
+int EmbCacheManagerImpl::CheckGetSwapPairsAndKey2Offset(const std::string& tableName, const KeyOffsetPair& swapInKoPair,
+    const KeyOffsetPair& swapOutKoPair)
+{
+    if (!swapInKoPair.first.empty() || !swapInKoPair.second.empty() || !swapOutKoPair.first.empty() ||
+        !swapOutKoPair.second.empty()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "koPair should be empty");
+        return H_ARG_NOT_EMPTY;
+    }
+
+    int checkTableNameRet = CheckValidTableName(tableName);
+    if (checkTableNameRet != H_OK) {
+        return checkTableNameRet;
+    }
+
+    return H_OK;
+}
+
+int EmbCacheManagerImpl::CheckCreateTableName(const std::string& tableName)
+{
+    if (tableName.empty()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "tableName can not be empty");
+        return H_TABLE_NAME_EMPTY;
+    }
+
+    if (tableName.size() > TABLE_NAME_MAX_SIZE) {
+        ExternalLogger::PrintLog(LogLevel::ERROR,
+            "tableName size can not be larger than " + std::to_string(TABLE_NAME_MAX_SIZE));
+        return H_TABLE_NAME_TOO_LONG;
+    }
+    return H_OK;
+}
+
+uint32_t EmbCacheManagerImpl::GetUsage(const std::string& tableName)
+{
+    return embTables[tableName].GetUsage();
+}
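Taken together, the checks above admit a table only when every size is positive, vocabSize >= maxCacheSize, extEmbeddingSize is a multiple of embeddingSize, the initializers cover [0, extEmbeddingSize) exactly, prefillBufferSize lies in [1, vocabSize], and the thread count fits the machine. A minimal host-side round trip under those rules — the field names follow the code above, but EmbCacheInfo's exact layout, the error-code namespace, and direct construction of EmbCacheManagerImpl are assumptions, since embedding_cache.h is not part of this patch:

    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <vector>
    #include "cache_manager.h"

    int main()
    {
        EmbCache::EmbCacheManagerImpl mgr;

        EmbCache::EmbCacheInfo info;     // assumed aggregate-style struct
        info.tableName = "user_emb";
        info.embeddingSize = 8;          // raw embedding width
        info.extEmbeddingSize = 16;      // embedding + optimizer slots, a multiple of 8
        info.maxCacheSize = 1024;        // device-side cache capacity
        info.vocabSize = 4096;           // host capacity, >= maxCacheSize

        // initializers must cover [0, extEmbeddingSize) with no gap or overlap
        std::string name = "constant_initializer";
        std::vector<EmbCache::InitializerInfo> inits = {
            {name, 0, 16, EmbCache::ConstantInitializerInfo(0.0F, 1.0F)}
        };

        // invalidKey = -1, prefillBufferSize within [1, vocabSize], one refill thread
        assert(mgr.CreateCacheForTable(info, inits, -1, 256, 1) == ock::ctr::H_OK);

        std::vector<uint64_t> keys = {3, 5, 7};
        std::vector<float> out(keys.size() * info.extEmbeddingSize);
        assert(mgr.EmbeddingLookup("user_emb", keys, out.data(), 1) == ock::ctr::H_OK);

        mgr.Destroy();
        return 0;
    }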
diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
new file mode 100644
index 00000000..80fbcd46
--- /dev/null
+++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h
@@ -0,0 +1,95 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/
+
+#ifndef EMBEDDING_CACHE_MANAGER_H
+#define EMBEDDING_CACHE_MANAGER_H
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "embedding_cache.h"
+#include "embedding_local_table/emb_local_table.h"
+#include "error_code.h"
+#include "offset_mapper/offset_mapper.h"
+
+namespace EmbCache {
+class EmbCacheManagerImpl : public EmbCacheManager {
+public:
+    EmbCacheManagerImpl() = default;
+
+    ~EmbCacheManagerImpl() override = default;
+
+    int CreateCacheForTable(const EmbCacheInfo& embCacheInfo, const std::vector<InitializerInfo>& initializerInfos,
+        int64_t invalidKey, uint64_t prefillBufferSize, uint32_t refillThreadNum) override;
+
+    int GetSwapPairsAndKey2Offset(const std::string& tableName, std::vector<uint64_t>& keys,
+        KeyOffsetPair& swapInKoPair, KeyOffsetPair& swapOutKoPair) override;
+
+    int EmbeddingLookup(const std::string& tableName, const std::vector<uint64_t>& keys, float* embAddr,
+        uint32_t threadNum) override;
+
+    int EmbeddingLookupAddrs(const std::string& tableName, const std::vector<uint64_t>& keys,
+        std::vector<float *>& addrs, uint32_t threadNum) override;
+
+    int EmbeddingUpdate(const std::string& tableName, const std::vector<uint64_t>& keys, float* embAddr,
+        uint32_t threadNum) override;
+
+    int EmbeddingRemove(const std::string& tableName, const std::vector<uint64_t>& keys, uint32_t threadNum) override;
+
+    int EmbeddingLookupAndRemove(const std::string& tableName, const std::vector<uint64_t>& keys, float* embAddr,
+        uint32_t threadNum) override;
+
+    int RemoveEmbsByKeys(const std::string& tableName, const std::vector<uint64_t>& keys) override;
+
+    int GetEmbTableNames(std::vector<std::string>& allTableNames) override;
+
+    int ExportDeviceKeyOffsetPairs(const std::string& tableName,
+        std::vector<std::pair<uint64_t, uint32_t>>& koVec) override;
+
+    int Serialize(const std::string& tableName, std::vector<char>& buffer) override;
+
+    int Deserialize(const std::string& tableName, const std::vector<char>& buffer) override;
+
+    void Destroy() override;
+
+    int GetEmbTableInfos(std::string tableName, std::vector<uint64_t>& keys,
+        std::vector<std::vector<float>>& embeddings,
+        std::vector<std::vector<float>>& optimizerSlots) override;
+
+    int LoadEmbTableInfos(std::string tableName, const std::vector<uint64_t>& keys,
+        const std::vector<std::vector<float>>& embeddings,
+        const std::vector<std::vector<float>>& optimizerSlots) override;
+
+    uint32_t GetUsage(const std::string& tableName) override;
+
+private:
+    std::map<std::string, EmbCacheInfo> embCacheInfos;
+    std::map<std::string, OffsetMapper> offsetMappers;
+    std::map<std::string, EmbLocalTable> embTables;
+
+    int CheckValidTableName(const std::string& tableName);
+
+    bool CheckInitializer(uint32_t extEmbSize, std::vector<InitializerInfo> initializerInfos);
+
+    bool CheckValidThreadNum(uint32_t threadNum);
+
+    int CheckGetSwapPairsAndKey2Offset(const std::string& tableName, const KeyOffsetPair& swapInKoPair,
+        const KeyOffsetPair& swapOutKoPair);
+
+    int CheckCreateTableName(const std::string& tableName);
+};
+} // namespace EmbCache
+#endif // EMBEDDING_CACHE_MANAGER_H
diff --git a/src/AccCTR/src/embedding_cache/common.h b/src/AccCTR/src/embedding_cache/common.h
new file mode 100644
index 00000000..72433332
--- /dev/null
+++ b/src/AccCTR/src/embedding_cache/common.h
@@ -0,0 +1,65 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/
+
+#ifndef MXREC_COMMON_H
+#define MXREC_COMMON_H
+
+#include "limited_set.h"
+
+#ifndef HM_UNLIKELY
+#define HM_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#endif
+
+#ifndef HM_LIKELY
+#define HM_LIKELY(x) __builtin_expect(!!(x), 1)
+#endif
+
+namespace EmbCache {
+
+
+enum class FkvState {
+    FKV_EXIST,
+    FKV_NOT_EXIST,
+    FKV_KEY_CONFLICT,
+    FKV_BEFORE_PUT_FUNC_FAIL,
+    FKV_BEFORE_REMOVE_FUNC_FAIL,
+    FKV_NO_SPACE,
+    FKV_FAIL,
+};
+
+enum class BeforePutFuncState {
+    BEFORE_SUCCESS,
+    BEFORE_NO_SPACE,
+    BEFORE_FAIL,
+};
+
+enum class BeforeRemoveFuncState {
+    BEFORE_SUCCESS,
+    BEFORE_FAIL,
+};
+
+extern int64_t INVALID_KEY;
+constexpr uint64_t TABLE_NAME_MAX_SIZE = 1024;
+const uint32_t VOCAB_CACHE_RATIO = 15;
+constexpr float NORMAL_MEAN_MAX = 1e9;
+constexpr float NORMAL_MEAN_MIN = -1e9;
+constexpr float NORMAL_STDDEV_MAX = 100;
+constexpr float NORMAL_STDDEV_MIN = 0;
+constexpr float CONSTANT_VALUE_MAX = 1e9;
+constexpr float CONSTANT_VALUE_MIN = -1e9;
+constexpr float INIT_K_MAX = 10000;
+constexpr float INIT_K_MIN = -10000;
+const int INVALID_EMB_SIZE = -1;
+}
+#endif // MXREC_COMMON_H
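All multi-threaded paths in emb_local_table.cpp below share one partitioning scheme: thread t handles the keys in [start[t], start[t+1]), where the first n % threadNum threads take ceil(n / threadNum) keys and the rest take floor(n / threadNum). A standalone check of that arithmetic (illustrative only):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main()
    {
        uint64_t n = 10;        // number of keys
        uint32_t threadNum = 3;
        auto m = static_cast<uint32_t>(n % threadNum);
        std::vector<uint64_t> start(threadNum + 1);
        for (uint32_t t = 0; t < m; t++) {           // first m threads: ceiling share
            start[t] = ((n + threadNum - 1) / threadNum) * t;
        }
        for (uint32_t t = m; t <= threadNum; t++) {  // remaining threads: floor share
            start[t] = (n / threadNum) * t + m;
        }
        for (uint32_t t = 0; t < threadNum; t++) {
            std::cout << "[" << start[t] << ", " << start[t + 1] << ")\n";  // [0,4) [4,7) [7,10)
        }
        return 0;
    }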
diff --git a/src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.cpp b/src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.cpp
new file mode 100644
index 00000000..dc59a303
--- /dev/null
+++ b/src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.cpp
@@ -0,0 +1,475 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/
+
+#include "emb_local_table.h"
+
+#include <future>
+#include <thread>
+
+#include "error_code.h"
+#include "securec.h"
+
+using namespace std;
+using namespace EmbCache;
+using namespace ock;
+using namespace ock::ctr;
+
+bool EmbLocalTable::Initialize(const EmbCacheInfo& embCacheInfo, uint64_t reserve,
+    const std::vector<InitializerInfo>& initializerInfos, const EmbPoolParam& embPoolParam)
+{
+    emExpendMemInfo = make_shared<AutoRefillEmbeddingMemoryPool>(embPoolParam.prefillBufferSize, initializerInfos,
+        embCacheInfo.extEmbeddingSize, embCacheInfo.vocabSize,
+        embPoolParam.refillThreadNum);
+    embeddingSize = embCacheInfo.embeddingSize;
+    extEmbeddingSize = embCacheInfo.extEmbeddingSize;
+    return embMap.Initialize(reserve, embCacheInfo.vocabSize, emExpendMemInfo);
+}
+
+void EmbLocalTable::UnInitialize()
+{
+    embMap.UnInitialize();
+}
+
+int EmbLocalTable::FindAndPutIfNotFound(uint64_t key, uint64_t& value)
+{
+    FkvState ret = embMap.FindAndPutIfNotFound(key, value);
+    if (ret == FkvState::FKV_FAIL) {
+        return H_ERROR;
+    }
+    if (ret == FkvState::FKV_BEFORE_PUT_FUNC_FAIL) {
+        return H_MEMORY_ALLOC_ERROR;
+    }
+    if (ret == FkvState::FKV_NO_SPACE) {
+        return H_HOST_VOCAB_SIZE_TOO_SMALL;
+    }
+    return H_OK;
+}
+
+bool EmbLocalTable::Remove(uint64_t key)
+{
+    return embMap.Remove(key) != FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL;
+}
+
+int EmbLocalTable::RemoveByKeys(const std::vector<uint64_t>& keys, uint32_t threadNum)
+{
+    if (threadNum == 1) {
+        for (uint64_t key : keys) {
+            if (!Remove(key)) {
+                return H_ERROR;
+            }
+        }
+        return H_OK;
+    }
+    // each thread handles the keys in the range [start[threadId], start[threadId + 1])
+    uint32_t m = keys.size() % threadNum;
+    vector<uint64_t> start(threadNum + 1);
+    // the first keys.size() % threadNum threads take the ceiling share
+    for (uint32_t threadId = 0; threadId < m; threadId++) {
+        start[threadId] = ((keys.size() + threadNum - 1) / threadNum) * threadId;
+    }
+    // the remaining threads take the floor share
+    for (uint32_t threadId = m; threadId <= threadNum; threadId++) {
+        start[threadId] = (keys.size() / threadNum) * threadId + m;
+    }
+
+    vector<future<int>> threads(threadNum);
+    for (uint32_t threadId = 0; threadId < threadNum; threadId++) {
+        threads[threadId] = std::async(std::launch::async, [&, threadId]() {
+            for (uint64_t i = start[threadId]; i < start[threadId + 1]; i++) {
+                if (!Remove(keys[i])) {
+                    return H_ERROR;
+                }
+            }
+            return H_OK;
+        });
+    }
+    for (auto& t : threads) {
+        auto res = t.get();
+        if (res != H_OK) {
+            return res;
+        }
+    }
+    return H_OK;
+}
+
+int EmbLocalTable::OneThreadHandle(uint64_t startAddr, const std::vector<uint64_t>& keys, bool isGather)
+{
+    for (uint64_t i = 0; i < keys.size(); i++) {
+        uint64_t embAddr;
+        int ret = FindAndPutIfNotFound(keys[i], embAddr);
+        if (ret != H_OK) {
+            return ret;
+        }
+        uint64_t memSize = emExpendMemInfo->extEmbeddingSize * sizeof(float);
+        auto addr = startAddr + i * memSize;
+        if (isGather) {
+            auto rc = memcpy_s(reinterpret_cast<void *>(addr), memSize, reinterpret_cast<void *>(embAddr), memSize);
+            if (rc != 0) {
+                ExternalLogger::PrintLog(LogLevel::ERROR,
+                    "gather memcpy_s failed... dstSize: " + std::to_string(memSize));
+                return H_COPY_ERROR;
+            }
+        } else {
+            auto rc = memcpy_s(reinterpret_cast<void *>(embAddr), memSize, // copy the new embeddings into their addresses in order
+                reinterpret_cast<void *>(addr), memSize);
+            if (rc != 0) {
+                ExternalLogger::PrintLog(LogLevel::ERROR,
+                    "scatter memcpy_s failed... dstSize: " + std::to_string(memSize));
+                return H_COPY_ERROR;
+            }
+        }
+    }
+
+    return H_OK;
+}
+
+int EmbLocalTable::Gather(uint64_t startAddr, const vector<uint64_t>& keys, uint32_t threadNum)
+{
+    if (threadNum == 1) {
+        return OneThreadHandle(startAddr, keys, true);
+    }
+
+    // each thread handles the keys in the range [start[threadId], start[threadId + 1])
+    uint32_t m = keys.size() % threadNum;
+    vector<uint64_t> start(threadNum + 1);
+    // the first keys.size() % threadNum threads take the ceiling share
+    for (uint32_t threadId = 0; threadId < m; threadId++) {
+        start[threadId] = ((keys.size() + threadNum - 1) / threadNum) * threadId;
+    }
+    // the remaining threads take the floor share
+    for (uint32_t threadId = m; threadId <= threadNum; threadId++) {
+        start[threadId] = (keys.size() / threadNum) * threadId + m;
+    }
+
+    vector<thread> threads(threadNum);
+    int ret = H_OK;
+    for (uint32_t threadId = 0; threadId < threadNum; threadId++) {
+        threads[threadId] = thread([&, threadId] {
+            for (uint64_t i = start[threadId]; i < start[threadId + 1]; i++) {
+                uint64_t embAddr;
+                int temp_ret = FindAndPutIfNotFound(keys[i], embAddr);
+                if (temp_ret != H_OK) {
+                    ret = temp_ret;
+                    return;
+                }
+                uint64_t memSize = emExpendMemInfo->extEmbeddingSize * sizeof(float);
+                auto addr = startAddr + i * memSize;
+                auto rc = memcpy_s(reinterpret_cast<void *>(addr), memSize, reinterpret_cast<void *>(embAddr), memSize);
+                if (rc != 0) {
+                    ExternalLogger::PrintLog(LogLevel::ERROR, "memcpy_s failed... dstSize: " + std::to_string(memSize));
+                    ret = H_COPY_ERROR;
+                    return;
+                }
+            }
+        });
+    }
+    for (auto& t : threads) {
+        t.join();
+    }
+    return ret;
+}
+
+int EmbLocalTable::GatherAddrs(const std::vector<uint64_t>& keys, std::vector<float *>& addrs, uint32_t threadNum)
+{
+    if (threadNum == 1) {
+        addrs.resize(keys.size());
+        for (uint64_t i = 0; i < keys.size(); i++) {
+            int temp_ret = FindAndPutIfNotFound(keys[i], reinterpret_cast<uint64_t&>(addrs[i]));
+            if (temp_ret != H_OK) {
+                return temp_ret;
+            }
+        }
+        return H_OK;
+    }
+    // each thread handles the keys in the range [start[threadId], start[threadId + 1])
+    uint32_t m = keys.size() % threadNum;
+    vector<uint64_t> start(threadNum + 1);
+    // the first keys.size() % threadNum threads take the ceiling share
+    for (uint32_t threadId = 0; threadId < m; threadId++) {
+        start[threadId] = ((keys.size() + threadNum - 1) / threadNum) * threadId;
+    }
+    // the remaining threads take the floor share
+    for (uint32_t threadId = m; threadId <= threadNum; threadId++) {
+        start[threadId] = (keys.size() / threadNum) * threadId + m;
+    }
+    addrs.resize(keys.size());
+
+    vector<thread> threads(threadNum);
+    int ret = H_OK;
+    for (uint32_t threadId = 0; threadId < threadNum; threadId++) {
+        threads[threadId] = thread([&, threadId] {
+            for (uint64_t i = start[threadId]; i < start[threadId + 1]; i++) {
+                int temp_ret = FindAndPutIfNotFound(keys[i], reinterpret_cast<uint64_t&>(addrs[i]));
+                if (temp_ret != H_OK) {
+                    ret = temp_ret;
+                    return;
+                }
+            }
+        });
+    }
+    for (auto& t : threads) {
+        t.join();
+    }
+    return ret;
+}
+
+// If called from multiple threads, the caller must strictly guarantee that keys do not repeat
+// across threads (unique keys); otherwise the result is undefined
+int EmbLocalTable::GatherAndRemove(uint64_t startAddr, const vector<uint64_t>& keys, uint32_t threadNum)
+{
+    if (threadNum == 1) {
+        for (uint64_t i = 0; i < keys.size(); i++) {
+            uint64_t memSize = emExpendMemInfo->extEmbeddingSize * sizeof(float);
+            auto addr = startAddr + i * memSize;
+            auto ret = embMap.FindAndRemoveIfFound(keys[i], addr); // if the key is found, copy the embedding out and remove the key
+            if (ret == FkvState::FKV_NOT_EXIST) { // key not found: produce a freshly initialized value without storing the key
+                auto* embAddr = reinterpret_cast<float *>(addr);
+                for (const auto& initializerInfo : emExpendMemInfo->initializerInfos) {
+                    initializerInfo.initializer->GenerateData(embAddr, INVALID_EMB_SIZE);
+                }
+            } else if (ret == FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL) {
+                ExternalLogger::PrintLog(LogLevel::ERROR, "memcpy_s failed... dstSize: " + std::to_string(memSize));
+                return H_COPY_ERROR;
+            }
+        }
+        return H_OK;
+    }
+
+    // each thread handles the keys in the range [start[threadId], start[threadId + 1])
+    uint32_t m = keys.size() % threadNum;
+    vector<uint64_t> start(threadNum + 1);
+    // the first keys.size() % threadNum threads take the ceiling share
+    for (uint32_t threadId = 0; threadId < m; threadId++) {
+        start[threadId] = ((keys.size() + threadNum - 1) / threadNum) * threadId;
+    }
+    // the remaining threads take the floor share
+    for (uint32_t threadId = m; threadId <= threadNum; threadId++) {
+        start[threadId] = (keys.size() / threadNum) * threadId + m;
+    }
+
+    vector<thread> threads(threadNum);
+    int retVal = H_OK;
+    for (uint32_t threadId = 0; threadId < threadNum; threadId++) {
+        threads[threadId] = thread([&, threadId] {
+            for (uint64_t i = start[threadId]; i < start[threadId + 1]; i++) {
+                uint64_t memSize = emExpendMemInfo->extEmbeddingSize * sizeof(float);
+                auto addr = startAddr + i * memSize;
+                auto ret = embMap.FindAndRemoveIfFound(keys[i], addr); // if the key is found, copy the embedding out and remove the key
+                if (ret == FkvState::FKV_NOT_EXIST) { // key not found: produce a freshly initialized value without storing the key
+                    auto* embAddr = reinterpret_cast<float *>(addr);
+                    for (const auto& initializerInfo : emExpendMemInfo->initializerInfos) {
+                        initializerInfo.initializer->GenerateData(embAddr, INVALID_EMB_SIZE);
+                    }
+                } else if (ret == FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL) {
+                    ExternalLogger::PrintLog(LogLevel::ERROR, "memcpy_s failed... dstSize: " + std::to_string(memSize));
+                    retVal = H_COPY_ERROR;
+                    return;
+                }
+            }
+        });
+    }
+    for (auto& t : threads) {
+        t.join();
+    }
+    return retVal;
+}
+
+int EmbLocalTable::Scatter(const uint64_t startAddr, const vector<uint64_t>& keys, uint32_t threadNum)
+{
+    if (threadNum == 1) { // single-threaded version
+        return OneThreadHandle(startAddr, keys, false);
+    }
+
+    // multi-threaded version
+    // each thread handles the keys in the range [start[threadId], start[threadId + 1])
+    uint32_t m = keys.size() % threadNum;
+    vector<uint64_t> start(threadNum + 1);
+    // the first keys.size() % threadNum threads take the ceiling share
+    for (uint32_t threadId = 0; threadId < m; threadId++) {
+        start[threadId] = ((keys.size() + threadNum - 1) / threadNum) * threadId;
+    }
+    // the remaining threads take the floor share
+    for (uint32_t threadId = m; threadId <= threadNum; threadId++) {
+        start[threadId] = (keys.size() / threadNum) * threadId + m;
+    }
+
+    vector<thread> threads(threadNum);
+    int ret = H_OK;
+    for (uint32_t threadId = 0; threadId < threadNum; threadId++) {
+        threads[threadId] = thread([&, threadId] {
+            for (uint64_t i = start[threadId]; i < start[threadId + 1]; i++) {
+                uint64_t embAddr;
+                int temp_ret = FindAndPutIfNotFound(keys[i], embAddr); // get the start address of each key's embedding
+                if (temp_ret != H_OK) {
+                    ret = temp_ret;
+                    return;
+                }
+                uint64_t memSize = emExpendMemInfo->extEmbeddingSize * sizeof(float);
+                auto addr = startAddr + i * memSize;
+                auto rc = memcpy_s(reinterpret_cast<void *>(embAddr), memSize, // copy the new embeddings into their addresses in order
+                    reinterpret_cast<void *>(addr), memSize);
+                if (rc != 0) {
+                    ExternalLogger::PrintLog(LogLevel::ERROR, "memcpy_s failed... dstSize: " + std::to_string(memSize));
+                    ret = H_COPY_ERROR;
+                    return;
+                }
+            }
+        });
+    }
+    for (auto& t : threads) {
+        t.join();
+    }
+    return ret;
+}
+
+// export all stored kv pairs
+vector<pair<uint64_t, uint64_t>> EmbLocalTable::ExportVec()
+{
+    return embMap.ExportVec();
+}
+
+template <typename T>
+void EmbLocalTable::insertData(vector<char>& buffer, T& data)
+{
+    buffer.insert(buffer.end(), (char*)&data, (char*)&data + sizeof(data));
+}
+
+template <typename T>
+bool EmbLocalTable::getData(const vector<char>& buffer, T& data, uint64_t& i)
+{
+    if (i + sizeof(T) > buffer.size()) {
+        return false;
+    }
+    data = *reinterpret_cast<const T *>(&buffer[i]);
+    i += sizeof(T);
+    return true;
+}
+
+// serialize all stored key-embedding data
+vector<char> EmbLocalTable::Serialize()
+{
+    vector<char> buffer;
+    vector<pair<uint64_t, uint64_t>> kvVec = ExportVec();
+
+    for (auto& p : kvVec) {
+        uint64_t key = p.first;
+        uint64_t value = p.second;
+        insertData(buffer, key);
+        auto* addr = reinterpret_cast<float *>(value);
+        buffer.insert(buffer.end(), reinterpret_cast<char *>(addr),
+            reinterpret_cast<char *>((addr + emExpendMemInfo->extEmbeddingSize)));
+    }
+    return buffer;
+}
+
+// deserialize key-embedding data and store it into the map
+bool EmbLocalTable::Deserialize(const vector<char>& buffer)
+{
+    uint64_t i = 0;
+    while (i < buffer.size()) {
+        uint64_t key;
+        if (!getData(buffer, key, i)) {
+            ExternalLogger::PrintLog(LogLevel::ERROR, "get data failed!");
+            return false;
+        }
+        uint64_t value = 0;
+        if (FindAndPutIfNotFound(key, value) != H_OK) {
+            ExternalLogger::PrintLog(LogLevel::ERROR, "FindAndPutIfNotFound failed!");
+            return false;
+        }
+
+        auto* addr = reinterpret_cast<float *>(value);
+        for (uint32_t j = 0; j < emExpendMemInfo->extEmbeddingSize; j++) {
+            if (!getData(buffer, addr[j], i)) {
+                ExternalLogger::PrintLog(LogLevel::ERROR, "get data failed!");
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+uint32_t EmbLocalTable::GetUsage()
+{
+    return embMap.current_size;
+}
+
+void EmbLocalTable::GetEmbTableInfos(std::vector<uint64_t>& keys, std::vector<std::vector<float>>& embeddings,
+    std::vector<std::vector<float>>& optimizerSlots)
+{
+    vector<pair<uint64_t, uint64_t>> kvVec = ExportVec();
+
+    for (auto& p : kvVec) {
+        std::vector<float> curEmbedding;
+        keys.emplace_back(p.first);
+        auto* addr = reinterpret_cast<float *>(p.second);
+        curEmbedding.insert(curEmbedding.end(), addr, addr + embeddingSize);
+        embeddings.emplace_back(curEmbedding);
+        if (extEmbeddingSize > embeddingSize) {
+            std::vector<float> curOptimizerSlot;
+            curOptimizerSlot.insert(curOptimizerSlot.end(), addr + embeddingSize, addr + extEmbeddingSize);
+            optimizerSlots.emplace_back(curOptimizerSlot);
+        }
+    }
+}
+
+bool EmbLocalTable::LoadEmbTableInfos(const std::vector<uint64_t>& keys,
+    const std::vector<std::vector<float>>& embeddings,
+    const std::vector<std::vector<float>>& optimizerSlots)
+{
+    if (keys.size() != embeddings.size()) {
+        ExternalLogger::PrintLog(LogLevel::ERROR, "the size of keys and embeddings should be the same!");
+        return false;
+    }
+    uint32_t optimizerSlotSize = extEmbeddingSize - embeddingSize;
+    if (optimizerSlotSize > 0) {
+        if (keys.size() != optimizerSlots.size()) {
+            ExternalLogger::PrintLog(LogLevel::ERROR, "the size of keys and optimizerSlots should be the same!");
+            return false;
+        }
+    }
+    for (uint64_t i = 0; i < keys.size(); i++) {
+        uint64_t value = 0;
+        if (FindAndPutIfNotFound(keys[i], value) != H_OK) {
+            ExternalLogger::PrintLog(LogLevel::ERROR, "FindAndPutIfNotFound failed!");
+            return false;
+        }
+        if (embeddings[i].size() != embeddingSize) {
+            ExternalLogger::PrintLog(LogLevel::ERROR,
+                "The size of the entering embedding does not equal embeddingSize");
+            return false;
+        }
+        auto* addr = reinterpret_cast<float *>(value);
+        auto rc = memcpy_s(addr, embeddingSize * sizeof(float), embeddings[i].data(), embeddingSize * sizeof(float));
+        if (rc != 0) {
+            ExternalLogger::PrintLog(LogLevel::ERROR, "embedding memcpy_s failed... ");
+            return false;
+        }
+        if (optimizerSlotSize > 0) {
+            if (optimizerSlots[i].size() != optimizerSlotSize) {
+                ExternalLogger::PrintLog(
+                    LogLevel::ERROR,
+                    "The size of the entering optimizerSlot does not equal extEmbeddingSize - embeddingSize");
+                return false;
+            }
+            auto rc2 = memcpy_s(reinterpret_cast<void *>(addr + embeddingSize), optimizerSlotSize * sizeof(float),
+                optimizerSlots[i].data(), optimizerSlotSize * sizeof(float));
+            if (rc2 != 0) {
+                ExternalLogger::PrintLog(LogLevel::ERROR, "optimizerSlot memcpy_s failed... ");
+                return false;
+            }
+        }
+    }
+    return true;
+}
\ No newline at end of file
diff --git a/src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.h b/src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.h
new file mode 100644
index 00000000..ee93bb91
--- /dev/null
+++ b/src/AccCTR/src/embedding_cache/embedding_local_table/emb_local_table.h
@@ -0,0 +1,84 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/
+
+#ifndef EMB_LOCAL_TABLE_H
+#define EMB_LOCAL_TABLE_H
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "offset_mapper/address_mapper.h"
+
+namespace EmbCache {
+struct EmbPoolParam {
+    uint64_t prefillBufferSize;
+    uint32_t refillThreadNum;
+};
+
+class EmbLocalTable {
+public:
+    EmbLocalTable() = default;
+
+    ~EmbLocalTable() = default;
+
+    bool Initialize(const EmbCacheInfo& embCacheInfo, uint64_t reserve,
+        const std::vector<InitializerInfo>& initializerInfos, const EmbPoolParam& embPoolParam);
+
+    void UnInitialize();
+
+    int FindAndPutIfNotFound(uint64_t key, uint64_t& value);
+
+    bool Remove(uint64_t key);
+
+    int RemoveByKeys(const std::vector<uint64_t>& keys, uint32_t threadNum);
+
+    int Gather(uint64_t startAddr, const std::vector<uint64_t>& keys, uint32_t threadNum);
+
+    int GatherAddrs(const std::vector<uint64_t>& keys, std::vector<float *>& addrs, uint32_t threadNum);
+
+    int Scatter(uint64_t startAddr, const std::vector<uint64_t>& keys, uint32_t threadNum);
+
+    int OneThreadHandle(uint64_t startAddr, const std::vector<uint64_t>& keys, bool isGather);
+
+    int GatherAndRemove(uint64_t startAddr, const std::vector<uint64_t>& keys, uint32_t threadNum);
+
+    std::vector<std::pair<uint64_t, uint64_t>> ExportVec();
+
+    std::vector<char> Serialize();
+
+    bool Deserialize(const std::vector<char>& buffer);
+
+    uint32_t GetUsage();
+
+    void GetEmbTableInfos(std::vector<uint64_t>& keys, std::vector<std::vector<float>>& embeddings,
+        std::vector<std::vector<float>>& optimizerSlots);
+
+    bool LoadEmbTableInfos(const std::vector<uint64_t>& keys, const std::vector<std::vector<float>>& embeddings,
+        const std::vector<std::vector<float>>& optimizerSlots);
+
+private:
+    std::shared_ptr<AutoRefillEmbeddingMemoryPool> emExpendMemInfo;
+    AddressMapper embMap;
+    uint32_t embeddingSize;
+    uint32_t extEmbeddingSize;
+
+    template <typename T>
+    void insertData(std::vector<char>& buffer, T& data);
+
+    template <typename T>
+    bool getData(const std::vector<char>& buffer, T& data, uint64_t& i);
+};
+} // namespace EmbCache
+#endif // EMB_LOCAL_TABLE_H
diff --git 
a/src/AccCTR/src/embedding_cache/initializer/constant_initializer/constant_initializer.cpp b/src/AccCTR/src/embedding_cache/initializer/constant_initializer/constant_initializer.cpp new file mode 100644 index 00000000..0e0ecb0d --- /dev/null +++ b/src/AccCTR/src/embedding_cache/initializer/constant_initializer/constant_initializer.cpp @@ -0,0 +1,62 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + ==============================================================================*/ + +#include "embedding_cache.h" +#include "embedding_cache/common.h" +#include "external_logger.h" + +using namespace std; +using namespace EmbCache; +using namespace ock; + +ConstantInitializer::ConstantInitializer(uint32_t start, uint32_t len, float value, float initK) + : start(start), len(len) +{ + if (value > CONSTANT_VALUE_MAX) { + ExternalLogger::PrintLog(LogLevel::WARN, "constant value is greater than " + + std::to_string(CONSTANT_VALUE_MAX) + ", and will use " + std::to_string(CONSTANT_VALUE_MAX) + "."); + constantValue = CONSTANT_VALUE_MAX; + } else if (value < CONSTANT_VALUE_MIN) { + ExternalLogger::PrintLog(LogLevel::WARN, "constant value is less than " + std::to_string(CONSTANT_VALUE_MIN) + + ", and will use " + std::to_string(CONSTANT_VALUE_MIN) + "."); + constantValue = CONSTANT_VALUE_MIN; + } else { + constantValue = value; + } + if (initK > INIT_K_MAX) { + ExternalLogger::PrintLog(LogLevel::WARN, "constant initK is greater than " + std::to_string(INIT_K_MAX) + + ", and will use " + std::to_string(INIT_K_MAX) + "."); + initParam = INIT_K_MAX; + } else if (initK < INIT_K_MIN) { + ExternalLogger::PrintLog(LogLevel::WARN, "constant initK is less than " + std::to_string(INIT_K_MIN) + + ", and will use " + std::to_string(INIT_K_MIN) + "."); + initParam = INIT_K_MIN; + } else { + initParam = initK; + } +} + +void ConstantInitializer::GenerateData(float* emb, int embSize) +{ + if (len == 0) { + return; + } + if (embSize != INVALID_EMB_SIZE && embSize < static_cast(start + len)) { + ExternalLogger::PrintLog(LogLevel::WARN, + "InitializeInfo start " + std::to_string(start) + " + len " + std::to_string(len) + + " is larger than embedding size " + std::to_string(embSize)); + return; + } + std::fill_n(emb + start, len, initParam * constantValue); +} diff --git a/src/AccCTR/src/embedding_cache/initializer/initializer.cpp b/src/AccCTR/src/embedding_cache/initializer/initializer.cpp new file mode 100644 index 00000000..887aaee0 --- /dev/null +++ b/src/AccCTR/src/embedding_cache/initializer/initializer.cpp @@ -0,0 +1,56 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
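The constant initializer above clamps value into [CONSTANT_VALUE_MIN, CONSTANT_VALUE_MAX] and initK into [INIT_K_MIN, INIT_K_MAX], then GenerateData fills its [start, start + len) slice with initK * constantValue; an embSize of INVALID_EMB_SIZE (-1) skips the bounds check, which is how the refill pool invokes it. A minimal sketch of that contract (the class itself is declared in embedding_cache.h, which this patch does not show):

    #include <vector>
    #include "embedding_cache.h"

    int main()
    {
        // fills floats [4, 8) with 0.5F * 2.0F = 1.0F; the rest stays untouched
        EmbCache::ConstantInitializer init(4, 4, 2.0F, 0.5F);
        std::vector<float> emb(8, 0.0F);
        init.GenerateData(emb.data(), static_cast<int>(emb.size()));
        return 0;
    }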
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + ==============================================================================*/ + +#include + +#include "external_logger.h" +#include "embedding_cache.h" + +using namespace EmbCache; + +ConstantInitializerInfo::ConstantInitializerInfo(float constantValue, float initK) + : constantValue(constantValue), initK(initK) +{} + +NormalInitializerInfo::NormalInitializerInfo(float mean, float stddev, uint32_t seed, float initK) + : mean(mean), stddev(stddev), seed(seed), initK(initK) +{} + +InitializerInfo::InitializerInfo(std::string &name, uint32_t start, uint32_t len, + ConstantInitializerInfo constantInitializerInfo) + : name(name), start(start), len(len), constantInitializerInfo(constantInitializerInfo) +{ + if (name == "constant_initializer") { + initializerType = InitializerType::CONSTANT; + initializer = std::make_shared(start, len, constantInitializerInfo.constantValue, + constantInitializerInfo.initK); + } else { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "Invalid Initializer Type."); + } +} + +InitializerInfo::InitializerInfo(std::string &name, uint32_t start, uint32_t len, + NormalInitializerInfo normalInitializerInfo) + : name(name), start(start), len(len), normalInitializerInfo(normalInitializerInfo) +{ + if (name == "truncated_normal_initializer") { + initializerType = InitializerType::TRUNCATED_NORMAL; + initializer = std::make_shared(start, len, normalInitializerInfo); + } else if (name == "random_normal_initializer") { + initializerType = InitializerType::RANDOM_NORMAL; + initializer = std::make_shared(start, len, normalInitializerInfo); + } else { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "Invalid Initializer Type."); + } +} diff --git a/src/AccCTR/src/embedding_cache/initializer/random_normal_initializer/random_normal_initializer.cpp b/src/AccCTR/src/embedding_cache/initializer/random_normal_initializer/random_normal_initializer.cpp new file mode 100644 index 00000000..c4b01062 --- /dev/null +++ b/src/AccCTR/src/embedding_cache/initializer/random_normal_initializer/random_normal_initializer.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ ==============================================================================*/
+
+#include <random>
+#include <string>
+#include "embedding_cache.h"
+#include "embedding_cache/common.h"
+#include "external_logger.h"
+
+using namespace EmbCache;
+using namespace ock;
+
+RandomNormalInitializer::RandomNormalInitializer(uint32_t start, uint32_t len, NormalInitializerInfo &initInfo)
+    : start(start), len(len), mean(initInfo.mean), stddev(initInfo.stddev), seed(initInfo.seed)
+{
+    // validate the value ranges of stddev, mean and initK
+    if (initInfo.mean > NORMAL_MEAN_MAX) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "random normal mean param is greater than " +
+            std::to_string(NORMAL_MEAN_MAX) + ", and will use " + std::to_string(NORMAL_MEAN_MAX) + ".");
+        mean = NORMAL_MEAN_MAX;
+    } else if (initInfo.mean < NORMAL_MEAN_MIN) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "random normal mean param is less than " +
+            std::to_string(NORMAL_MEAN_MIN) + ", and will use " + std::to_string(NORMAL_MEAN_MIN) + ".");
+        mean = NORMAL_MEAN_MIN;
+    } else {
+        mean = initInfo.mean;
+    }
+    if (initInfo.stddev > NORMAL_STDDEV_MAX) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "random normal stddev param is greater than " +
+            std::to_string(NORMAL_STDDEV_MAX) + ", and will use " + std::to_string(NORMAL_STDDEV_MAX) + ".");
+        stddev = NORMAL_STDDEV_MAX;
+    } else if (initInfo.stddev < NORMAL_STDDEV_MIN) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "random normal stddev param is less than " +
+            std::to_string(NORMAL_STDDEV_MIN) + ", and will use " + std::to_string(NORMAL_STDDEV_MIN) + ".");
+        stddev = NORMAL_STDDEV_MIN;
+    } else {
+        stddev = initInfo.stddev;
+    }
+    if (initInfo.initK > INIT_K_MAX) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "random normal initK is greater than " + std::to_string(INIT_K_MAX) +
+            ", and will use " + std::to_string(INIT_K_MAX) + ".");
+        initParam = INIT_K_MAX;
+    } else if (initInfo.initK < INIT_K_MIN) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "random normal initK is less than " + std::to_string(INIT_K_MIN) +
+            ", and will use " + std::to_string(INIT_K_MIN) + ".");
+        initParam = INIT_K_MIN;
+    } else {
+        initParam = initInfo.initK;
+    }
+
+    generator = std::default_random_engine(seed);
+    distribution = std::normal_distribution<float>(mean, stddev);
+}
+
+void RandomNormalInitializer::GenerateData(float* emb, int embSize)
+{
+    if (len == 0) {
+        return;
+    }
+    if (embSize != INVALID_EMB_SIZE && embSize < static_cast<int>(start + len)) {
+        ExternalLogger::PrintLog(LogLevel::WARN,
+            "InitializeInfo start " + std::to_string(start) + " + len " + std::to_string(len) +
+            " is larger than embedding size " + std::to_string(embSize));
+        return;
+    }
+    std::generate_n(emb + start, len, [this]() { return initParam * distribution(generator); });
+}
\ No newline at end of file
diff --git a/src/AccCTR/src/embedding_cache/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp b/src/AccCTR/src/embedding_cache/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp
new file mode 100644
index 00000000..95e09757
--- /dev/null
+++ b/src/AccCTR/src/embedding_cache/initializer/truncated_normal_initializer/truncated_normal_initializer.cpp
@@ -0,0 +1,94 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/
+
+#include <limits>
+#include "embedding_cache.h"
+#include "embedding_cache/common.h"
+#include "external_logger.h"
+
+using namespace EmbCache;
+using namespace ock;
+
+TruncatedNormalInitializer::TruncatedNormalInitializer(uint32_t start, uint32_t len, NormalInitializerInfo &initInfo)
+    : start(start), len(len), mean(initInfo.mean), stddev(initInfo.stddev), seed(initInfo.seed)
+{
+    // validate the value ranges of stddev, mean and initK
+    if (initInfo.mean > NORMAL_MEAN_MAX) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "truncated normal mean param is greater than " +
+            std::to_string(NORMAL_MEAN_MAX) + ", and will use " + std::to_string(NORMAL_MEAN_MAX) + ".");
+        mean = NORMAL_MEAN_MAX;
+    } else if (initInfo.mean < NORMAL_MEAN_MIN) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "truncated normal mean param is less than " +
+            std::to_string(NORMAL_MEAN_MIN) + ", and will use " + std::to_string(NORMAL_MEAN_MIN) + ".");
+        mean = NORMAL_MEAN_MIN;
+    } else {
+        mean = initInfo.mean;
+    }
+
+    if (initInfo.stddev > NORMAL_STDDEV_MAX) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "truncated normal stddev param is greater than " +
+            std::to_string(NORMAL_STDDEV_MAX) + ", and will use " + std::to_string(NORMAL_STDDEV_MAX) + ".");
+        stddev = NORMAL_STDDEV_MAX;
+    } else if (initInfo.stddev < NORMAL_STDDEV_MIN) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "truncated normal stddev param is less than " +
+            std::to_string(NORMAL_STDDEV_MIN) + ", and will use " + std::to_string(NORMAL_STDDEV_MIN) + ".");
+        stddev = NORMAL_STDDEV_MIN;
+    } else {
+        stddev = initInfo.stddev;
+    }
+
+    if (abs(stddev) < std::numeric_limits<float>::epsilon()) {
+        ExternalLogger::PrintLog(
+            LogLevel::WARN,
+            "truncated normal stddev param is zero, initialization can be slow, suggest using constant initializer");
+    }
+
+    if (initInfo.initK > INIT_K_MAX) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "truncated normal initK is greater than " +
+            std::to_string(INIT_K_MAX) + ", and will use " + std::to_string(INIT_K_MAX) + ".");
+        initParam = INIT_K_MAX;
+    } else if (initInfo.initK < INIT_K_MIN) {
+        ExternalLogger::PrintLog(LogLevel::WARN, "truncated normal initK is less than " + std::to_string(INIT_K_MIN) +
+            ", and will use " + std::to_string(INIT_K_MIN) + ".");
+        initParam = INIT_K_MIN;
+    } else {
+        initParam = initInfo.initK;
+    }
+
+    generator = std::default_random_engine(seed);
+    distribution = std::normal_distribution<float>(mean, stddev);
+    minBound = initParam * (mean - static_cast<float>(boundNum) * stddev);
+    maxBound = initParam * (mean + static_cast<float>(boundNum) * stddev);
+}
+
+
+void TruncatedNormalInitializer::GenerateData(float* emb, int embSize)
+{
+    if (len == 0) {
+        return;
+    }
+    if (embSize != INVALID_EMB_SIZE && embSize < static_cast<int>(start + len)) {
+        ExternalLogger::PrintLog(LogLevel::WARN,
+            "InitializeInfo start " + std::to_string(start) + " + len " + std::to_string(len) +
+            " is larger than embedding size " + std::to_string(embSize));
+        return;
+    }
+    std::generate_n(emb + start, len, [this]() {
+        float tmp = 
initParam * distribution(generator); + } + return tmp; + }); +} diff --git a/src/AccCTR/src/embedding_cache/limited_set.h b/src/AccCTR/src/embedding_cache/limited_set.h new file mode 100644 index 00000000..036a6477 --- /dev/null +++ b/src/AccCTR/src/embedding_cache/limited_set.h @@ -0,0 +1,118 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + ==============================================================================*/ + +#ifndef MXREC_LIMITED_SET_H +#define MXREC_LIMITED_SET_H + +#include +#include + +namespace EmbCache { + +class LimitedSet { +public: + struct Node { + uint64_t value; + Node *prev, *next; + Node(uint64_t val = -1) : value(val), prev(nullptr), next(nullptr) {} + }; + + LimitedSet(uint64_t maxRange) : head(new Node(-1)), tail(new Node(-1)) + { + nodes.resize(maxRange); + for (auto &node : nodes) { + node = new Node(-1); + } + head->next = tail; + tail->prev = head; + } + + ~LimitedSet() + { + for (auto &node : nodes) { + delete node; + } + delete head; + delete tail; + } + + void insert(uint64_t value) + { + if (nodes[value]->value == value) { + return; + } + Node *node = nodes[value]; + node->value = value; + Node *next = head->next; + node->next = next; + node->prev = head; + head->next = node; + next->prev = node; + } + + void remove(uint64_t value) + { + if (nodes[value]->value != value) { + return; + } + Node *node = nodes[value]; + node->prev->next = node->next; + node->next->prev = node->prev; + node->value = -1; + } + + bool find(uint64_t value) + { + return nodes[value]->value == value; + } + + class Iterator { + public: + Iterator(Node *node) : current(node) {} + bool operator != (const Iterator &other) const + { + return current != other.current; + } + const uint64_t &operator*() const + { + return current->value; + } + Iterator &operator ++ () + { + current = current->next; + return *this; + } + + private: + Node *current; + }; + + Iterator begin() + { + return { head->next }; + } + + Iterator end() + { + return { tail }; + } + +private: + Node *head; + Node *tail; + std::vector nodes; +}; + +} +#endif // MXREC_LIMITED_SET_H diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h new file mode 100644 index 00000000..649b2d8a --- /dev/null +++ b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h @@ -0,0 +1,308 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
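LimitedSet above is an intrusive doubly linked list over a preallocated node array: insert, remove, and find are all O(1) for ids in [0, maxRange), and iteration visits live ids newest-first. A short usage sketch, assuming limited_set.h is on the include path:

    #include <cstdint>
    #include <iostream>
    #include "limited_set.h"

    int main()
    {
        EmbCache::LimitedSet set(16);  // ids must be < 16
        set.insert(3);
        set.insert(7);
        set.remove(3);
        std::cout << set.find(7) << "\n";  // prints 1
        for (uint64_t v : set) {           // visits live ids, newest first
            std::cout << v << "\n";        // prints 7
        }
        return 0;
    }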
+ ==============================================================================*/ + +#ifndef MXREC_FASTER_QUERY_H +#define MXREC_FASTER_QUERY_H + +#include +#include +#include +#include +#include +#include +#include + +#include "embedding_cache.h" +#include "offset_mapper/mapper_base.h" +#include "securec.h" + +namespace EmbCache { +using EmExpandMemUint = struct em_expand_memory_uint_ { + uint64_t address = 0; + uint64_t capacity = 0; + uint64_t leftCapacity = 0; + + em_expand_memory_uint_() = default; + + em_expand_memory_uint_(uint64_t a, uint64_t c) : address(a), capacity(c), leftCapacity(c) {} +}; + +template +class QWithLock { +public: + bool pop(T& ele) + { + std::lock_guard lk(mut); + if (dataQ.empty()) { + return false; + } + ele = dataQ.front(); + dataQ.pop(); + return true; + } + + void push(const T& ele) + { + std::lock_guard lk(mut); + dataQ.push(ele); + } + + uint64_t GetLength() + { + std::lock_guard lk(mut); + return dataQ.size(); + } + +private: + std::mutex mut; + std::queue dataQ; +}; + +class AutoRefillEmbeddingMemoryPool { +public: + std::vector expandedMemory; + uint32_t extEmbeddingSize; + std::vector initializerInfos; + + AutoRefillEmbeddingMemoryPool(uint64_t bufferSize, std::vector initInfos, uint32_t extEmbSize, + uint64_t hostVocabSize, uint32_t refillThreadNum = 1) + : extEmbeddingSize(extEmbSize), + initializerInfos(std::move(initInfos)), + maxBufferSize(bufferSize), + totalLeftVocabSize(hostVocabSize), + numThreads(refillThreadNum) + { + itemSize = extEmbeddingSize * sizeof(float); + maxExpandSize = maxBufferSize * itemSize; + for (uint32_t i = 0; i < numThreads; i++) { + producerThreads.emplace_back([this] { ProducerWorker(); }); + } + } + + ~AutoRefillEmbeddingMemoryPool() + { + { + std::lock_guard lock(producerMutex); + stop = true; + } + producerCv.notify_all(); + fullCv.notify_all(); + for (auto& t : producerThreads) { + t.join(); + } + } + + void Stop() + { + std::lock_guard lock(producerMutex); + stop = true; + producerCv.notify_all(); + fullCv.notify_all(); + } + + BeforePutFuncState GetNewValueToBeInserted(uint64_t& value, uint32_t maxRetry = 1000) + { + for (uint32_t i = 0; i < maxRetry; i++) { + if (BufferBin.pop(value)) { + producerCv.notify_one(); + return BeforePutFuncState::BEFORE_SUCCESS; + }; + producerCv.notify_one(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + ock::ExternalLogger::PrintLog( + ock::LogLevel::ERROR, + "Failed to get new address for embedding, it is likely due to refill thread memory allocation failure " + "or max retry has been reached. 
Please check for memory alloc error or increase refill thread num!"); + return BeforePutFuncState::BEFORE_FAIL; + } + + void GetValueToBeRecycled(uint64_t value) + { + std::lock_guard lock(producerMutex); + recycleBin.push(value); + full = false; + fullCv.notify_one(); + } + +private: + uint64_t maxBufferSize; + uint64_t totalLeftVocabSize; + uint32_t numThreads; + std::atomic currBufferSize{0}; + volatile bool stop = false; + volatile std::atomic full = false; + std::mutex producerMutex; + std::mutex getAddrMutex; + std::condition_variable producerCv; + std::condition_variable fullCv; + QWithLock BufferBin; + QWithLock recycleBin; + std::vector producerThreads; + EmExpandMemUint currentMemoryUint{}; + uint64_t dynamicExpandRatio = 2; + uint64_t maxExpandSize; + uint64_t itemSize; + + bool GetNewAddr(uint64_t& newAddr) + { + std::lock_guard lg(getAddrMutex); + if (HM_UNLIKELY(currentMemoryUint.leftCapacity <= 0)) { + /* need to expand memory */ + uint64_t maxSize = std::min(maxExpandSize, totalLeftVocabSize * itemSize); + uint64_t newSize = currentMemoryUint.capacity + ? std::min(currentMemoryUint.capacity * dynamicExpandRatio, maxSize) + : itemSize; + if (newSize == 0) { + if (recycleBin.GetLength() == 0) { + full = true; + } + return false; + } + auto newAddress = (uint64_t)malloc(newSize); + if (newAddress == 0) { + ock::ExternalLogger::PrintLog(ock::LogLevel::WARN, "Refill thread allocate memory failed!"); + return false; + } + expandedMemory.emplace_back(newAddress, newSize); + currentMemoryUint.address = newAddress; + currentMemoryUint.capacity = newSize; + currentMemoryUint.leftCapacity = newSize; + totalLeftVocabSize -= newSize / itemSize; + } + newAddr = currentMemoryUint.address + currentMemoryUint.capacity - currentMemoryUint.leftCapacity; + currentMemoryUint.leftCapacity -= itemSize; + return true; + } + + void Produce() + { + uint64_t newAddr; + if (!recycleBin.pop(newAddr)) { + if (!GetNewAddr(newAddr)) { + return; + } + } + GenerateData(newAddr); + BufferBin.push(newAddr); + } + + void GenerateData(const uint64_t& addr) + { + auto* embAddr = reinterpret_cast(addr); + for (const auto& initializerInfo : initializerInfos) { + initializerInfo.initializer->GenerateData(embAddr, INVALID_EMB_SIZE); + } + } + + void ProducerWorker() + { + std::unique_lock lock(producerMutex); + while (!stop) { + if (full) { + fullCv.wait(lock); + continue; + } + if (BufferBin.GetLength() < maxBufferSize) { + Produce(); + continue; + } + producerCv.wait(lock); + } + } +}; + +class AddressMapper : public MapperBase { +public: + AddressMapper() = default; + + ~AddressMapper() = default; + + bool Initialize(uint32_t reserve, uint32_t vocabSize, std::shared_ptr expendInfoPtr) + { + hostVocabSize = vocabSize; + emExpendMemInfoPtr = expendInfoPtr; + return MapperBase::Initialize(reserve); + } + + void UnInitialize() override + { + emExpendMemInfoPtr->Stop(); + FreeExpandedMemory(); + MapperBase::UnInitialize(); + } + + FkvState Remove(uint64_t key) + { + return MapperBase::Remove(key, [&](uint64_t value) { + emExpendMemInfoPtr->GetValueToBeRecycled(value); + return BeforeRemoveFuncState::BEFORE_SUCCESS; + }); + } + + FkvState FindAndPutIfNotFound(uint64_t key, uint64_t& value) + { + FkvState ret = MapperBase::FindAndPutIfNotFound(key, value, [&]() { + if (HM_UNLIKELY(current_size.load() >= hostVocabSize)) { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "host does not have enough space"); + return BeforePutFuncState::BEFORE_NO_SPACE; + } + return emExpendMemInfoPtr->GetNewValueToBeInserted(value); + 
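            /*
             * Contract of the hook returned to MapperBase at this point: the key
             * slot is published first and beforePutFunc runs second. On
             * BEFORE_SUCCESS the value written above is stored under the key; on
             * BEFORE_FAIL or BEFORE_NO_SPACE the bucket rolls the key slot back
             * to 0 and FindAndPutIfNotFound surfaces FKV_BEFORE_PUT_FUNC_FAIL or
             * FKV_NO_SPACE, so a pool address is never leaked into the table.
             * The key == 0 fast path in MapperBase is the one exception: it
             * leaves zeroInside set when the hook fails.
             */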
}); + if (ret == FkvState::FKV_FAIL) { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "FindAndPutIfNotFound failed!"); + return ret; + } + if (ret == FkvState::FKV_BEFORE_PUT_FUNC_FAIL) { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "malloc failed"); + return ret; + } + return ret; + } + + // 如果多线程使用,严格保证传入的key线程间不会重复(unique key),否则可能出现未定义结果 + FkvState FindAndRemoveIfFound(uint64_t key, const uint64_t startAddr) + { + return MapperBase::Remove(key, [&](uint64_t value) { + uint64_t memSize = emExpendMemInfoPtr->extEmbeddingSize * sizeof(float); + auto rc = memcpy_s(reinterpret_cast(startAddr), memSize, reinterpret_cast(value), memSize); + if (rc != 0) { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, + "memcpy_s failed... dstSize: " + std::to_string(memSize)); + return BeforeRemoveFuncState::BEFORE_FAIL; + } + emExpendMemInfoPtr->GetValueToBeRecycled(value); + return BeforeRemoveFuncState::BEFORE_SUCCESS; + }); + } + + uint32_t GetUsage() + { + return MapperBase::current_size; + } + +private: + void FreeExpandedMemory() + { + for (auto& memUint : emExpendMemInfoPtr->expandedMemory) { + free(reinterpret_cast(memUint.address)); + } + } + +private: + uint32_t hostVocabSize; + std::shared_ptr emExpendMemInfoPtr; +}; +} // namespace EmbCache +#endif // MXREC_FASTER_QUERY_H diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h new file mode 100644 index 00000000..969845ee --- /dev/null +++ b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h @@ -0,0 +1,810 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
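AutoRefillEmbeddingMemoryPool above couples one or more producer threads with two queues: BufferBin holds pre-initialized embedding slots ready to hand out, while recycleBin feeds evicted slots back to the producers before any new memory is grown (each expansion doubles the previous one, capped by maxExpandSize and the remaining host vocab). A consumer-side sketch; Demo, the include path and the numeric arguments are illustrative, each InitializerInfo.initializer is assumed to be non-null, and BeforePutFuncState is assumed to come from embedding_cache/common.h:

    #include <cstdint>
    #include <utility>
    #include <vector>
    #include "embedding_cache/offset_mapper/address_mapper.h"

    void Demo(std::vector<EmbCache::InitializerInfo> inits)
    {
        EmbCache::AutoRefillEmbeddingMemoryPool pool(/*bufferSize=*/1024, std::move(inits),
                                                     /*extEmbSize=*/32, /*hostVocabSize=*/1000000);
        uint64_t addr = 0;
        if (pool.GetNewValueToBeInserted(addr) == EmbCache::BeforePutFuncState::BEFORE_SUCCESS) {
            // addr points at extEmbSize pre-initialized floats owned by the pool;
            // AddressMapper stores it as the "value" for a newly admitted key.
            pool.GetValueToBeRecycled(addr);   // on eviction the slot is reused, not freed
        }
    }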
+ ==============================================================================*/ + +#ifndef MXREC_MAPPER_BASE_H +#define MXREC_MAPPER_BASE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "securec.h" +#include "embedding_cache/common.h" +#include "external_logger.h" + +namespace EmbCache { +/* + * @brief Allocator template, for extend memory allocation for overflowed buckets + */ + +static constexpr size_t K_ALIGNMENT = 64; +static constexpr size_t K_KVNUMINBUCKET = 3; + +enum BucketIdx { + FIRST, + SECOND, + THIRD +}; + +class NetHeapAllocator { +public: + void *Allocate(uint32_t size) + { + return calloc(1, size); + } + + void Free(void *p) + { + if (HM_LIKELY(p != nullptr)) { + free(p); + p = nullptr; + } + } +}; + +/* + * @brief Spin lock entry in bucket + * used for alloc overflowed buckets + */ + +struct NetHashLockEntry { + uint64_t lock = 0; + + /* + * @brief Spin lock + */ + void Lock() + { + while (!__sync_bool_compare_and_swap(&lock, 0, 1)) { + } + } + + /* + * @brief Unlock + */ + void UnLock() + { + __atomic_store_n(&lock, 0, __ATOMIC_SEQ_CST); + } +} __attribute__((packed)); + +/* + * @brief Store the key/value into a linked array with 6 items, + * because 64bytes is one cache line + */ + +struct alignas(K_ALIGNMENT)NetHashBucket { + std::atomic keys[K_KVNUMINBUCKET]{}; + uint64_t values[K_KVNUMINBUCKET]{}; + NetHashBucket *next = nullptr; + NetHashLockEntry spinLock{}; + + FkvState Put(uint64_t key, uint64_t &value, const std::function &beforePutFunc) + { + /* don't put them into loop, flat code is faster than loop */ + uint64_t oldKey = 0; + if (keys[BucketIdx::FIRST].load(std::memory_order_relaxed) == 0 && + keys[BucketIdx::FIRST].compare_exchange_strong(oldKey, key)) { + BeforePutFuncState ret = beforePutFunc(); + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_FAIL)) { + keys[BucketIdx::FIRST] = 0; + return FkvState::FKV_BEFORE_PUT_FUNC_FAIL; + } + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_NO_SPACE)) { + keys[BucketIdx::FIRST] = 0; + return FkvState::FKV_NO_SPACE; + } + values[BucketIdx::FIRST] = value; + return FkvState::FKV_NOT_EXIST; + } + + if (HM_UNLIKELY(oldKey == key)) { + return FkvState::FKV_KEY_CONFLICT; + } + + oldKey = 0; + if (keys[BucketIdx::SECOND].load(std::memory_order_relaxed) == 0 && + keys[BucketIdx::SECOND].compare_exchange_strong(oldKey, key)) { + BeforePutFuncState ret = beforePutFunc(); + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_FAIL)) { + keys[BucketIdx::SECOND] = 0; + return FkvState::FKV_BEFORE_PUT_FUNC_FAIL; + } + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_NO_SPACE)) { + keys[BucketIdx::SECOND] = 0; + return FkvState::FKV_NO_SPACE; + } + values[BucketIdx::SECOND] = value; + return FkvState::FKV_NOT_EXIST; + } + + if (HM_UNLIKELY(oldKey == key)) { + return FkvState::FKV_KEY_CONFLICT; + } + + oldKey = 0; + if (keys[BucketIdx::THIRD].load(std::memory_order_relaxed) == 0 && + keys[BucketIdx::THIRD].compare_exchange_strong(oldKey, key)) { + BeforePutFuncState ret = beforePutFunc(); + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_FAIL)) { + keys[BucketIdx::THIRD] = 0; + return FkvState::FKV_BEFORE_PUT_FUNC_FAIL; + } + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_NO_SPACE)) { + keys[BucketIdx::THIRD] = 0; + return FkvState::FKV_NO_SPACE; + } + values[BucketIdx::THIRD] = value; + return FkvState::FKV_NOT_EXIST; + } + + if (HM_UNLIKELY(oldKey == key)) { + return FkvState::FKV_KEY_CONFLICT; + } + + return FkvState::FKV_FAIL; + } + + /* + * @brief Remove 
the address from the bucket and get size + */ + bool Find(const uint64_t key, uint64_t &value) + { + /* + * expand the loop, instead of put them into a for/while loop for performance + */ + if (key == keys[BucketIdx::FIRST].load(std::memory_order_relaxed)) { + value = values[BucketIdx::FIRST]; + return true; + } + + if (key == keys[BucketIdx::SECOND].load(std::memory_order_relaxed)) { + value = values[BucketIdx::SECOND]; + return true; + } + + if (key == keys[BucketIdx::THIRD].load(std::memory_order_relaxed)) { + value = values[BucketIdx::THIRD]; + return true; + } + + return false; + } + + FkvState Remove(uint64_t key) + { + /* don't put them into loop, flat code is faster than loop */ + uint64_t oldValue = key; + if (keys[BucketIdx::FIRST].load(std::memory_order_relaxed) == key && + keys[BucketIdx::FIRST].compare_exchange_strong(oldValue, 0)) { + values[BucketIdx::FIRST] = 0; + return FkvState::FKV_EXIST; + } + if (HM_UNLIKELY(oldValue == 0)) { + return FkvState::FKV_EXIST; + } + oldValue = key; + + if (keys[BucketIdx::SECOND].load(std::memory_order_relaxed) == key && + keys[BucketIdx::SECOND].compare_exchange_strong(oldValue, 0)) { + values[BucketIdx::SECOND] = 0; + return FkvState::FKV_EXIST; + } + if (HM_UNLIKELY(oldValue == 0)) { + return FkvState::FKV_EXIST; + } + oldValue = key; + + if (keys[BucketIdx::THIRD].load(std::memory_order_relaxed) == key && + keys[BucketIdx::THIRD].compare_exchange_strong(oldValue, 0)) { + values[BucketIdx::THIRD] = 0; + return FkvState::FKV_EXIST; + } + if (HM_UNLIKELY(oldValue == 0)) { + return FkvState::FKV_EXIST; + } + + return FkvState::FKV_NOT_EXIST; + } + + FkvState Remove(uint64_t key, const std::function &beforeRemoveFunc) + { + /* don't put them into loop, flat code is faster than loop */ + uint64_t oldValue = key; + if (keys[BucketIdx::FIRST].load(std::memory_order_relaxed) == key && + keys[BucketIdx::FIRST].compare_exchange_strong(oldValue, 0)) { + if (HM_UNLIKELY(beforeRemoveFunc(values[BucketIdx::FIRST]) == BeforeRemoveFuncState::BEFORE_FAIL)) { + return FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL; + } + + values[BucketIdx::FIRST] = 0; + return FkvState::FKV_EXIST; + } + if (HM_UNLIKELY(oldValue == 0)) { + return FkvState::FKV_EXIST; + } + oldValue = key; + + if (keys[BucketIdx::SECOND].load(std::memory_order_relaxed) == key && + keys[BucketIdx::SECOND].compare_exchange_strong(oldValue, 0)) { + if (HM_UNLIKELY(beforeRemoveFunc(values[BucketIdx::SECOND]) == BeforeRemoveFuncState::BEFORE_FAIL)) { + return FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL; + } + + values[BucketIdx::SECOND] = 0; + return FkvState::FKV_EXIST; + } + if (HM_UNLIKELY(oldValue == 0)) { + return FkvState::FKV_EXIST; + } + oldValue = key; + + if (keys[BucketIdx::THIRD].load(std::memory_order_relaxed) == key && + keys[BucketIdx::THIRD].compare_exchange_strong(oldValue, 0)) { + if (HM_UNLIKELY(beforeRemoveFunc(values[BucketIdx::THIRD]) == BeforeRemoveFuncState::BEFORE_FAIL)) { + return FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL; + } + + values[BucketIdx::THIRD] = 0; + return FkvState::FKV_EXIST; + } + if (HM_UNLIKELY(oldValue == 0)) { + return FkvState::FKV_EXIST; + } + + return FkvState::FKV_NOT_EXIST; + } +}; + + +class MapperBase { +public: + // DEFINE_RDMA_REF_COUNT_FUNCTIONS + std::atomic current_size{ 0 }; + + MapperBase() = default; + + ~MapperBase() = default; + + bool Initialize(uint32_t reserve) + { + /* already initialized */ + if (mOverflowEntryAlloc != nullptr) { + return true; + } + + /* get proper bucket count */ + uint32_t bucketCount = std::max(reserve, uint32_t(128)); + if 
(bucketCount > gPrimes[gPrimesCount - 1]) { + bucketCount = gPrimes[gPrimesCount - 1]; + } else { + uint32_t i = 0; + while (i < gPrimesCount && gPrimes[i] < bucketCount) { + i++; + } + bucketCount = gPrimes[i]; + } + + /* allocate buckets for sub-maps */ + for (auto &mSubMap : mSubMaps) { + auto tmp = new (std::nothrow) NetHashBucket[bucketCount]; + if (HM_UNLIKELY(tmp == nullptr)) { + FreeSubMaps(); + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, + "Failed to new hash bucket, probably out of memory"); + return false; + } + + /* make physical page and set to zero */ + auto ret = memset_s(tmp, sizeof(NetHashBucket) * bucketCount, 0, sizeof(NetHashBucket) * bucketCount); + if (ret != 0) { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, + "memset_s failed... size: " + std::to_string(sizeof(NetHashBucket) * bucketCount)); + return false; + } + + mSubMap = tmp; + } + + /* create overflow entry allocator */ + mOverflowEntryAlloc = new (std::nothrow) NetHeapAllocator(); + if (HM_UNLIKELY(mOverflowEntryAlloc == nullptr)) { + FreeSubMaps(); + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, + "Failed to new overflow entry allocator, probably out of memory"); + return false; + } + + /* set bucket count */ + mBucketCount = bucketCount; + ock::ExternalLogger::PrintLog(ock::LogLevel::INFO, + "fastKV inited, mBucketCount: " + std::to_string(mBucketCount)); + return true; + } + + virtual void UnInitialize() + { + if (mOverflowEntryAlloc == nullptr) { + return; + } + + /* free overflowed entries firstly */ + FreeOverFlowedEntries(); + + /* free sub map secondly */ + FreeSubMaps(); + + /* free overflow entry at last */ + delete mOverflowEntryAlloc; + mOverflowEntryAlloc = nullptr; + mBucketCount = 0; + } + + FkvState FindAndPutIfNotFound(uint64_t key, uint64_t &value, + const std::function &beforePutFunc) + { + if (HM_UNLIKELY(key == 0)) { + if (zeroInside) { + value = zeroValue; + return FkvState::FKV_EXIST; + } + if (__sync_bool_compare_and_swap(&zeroInside, false, true)) { + BeforePutFuncState ret = beforePutFunc(); + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_FAIL)) { + return FkvState::FKV_BEFORE_PUT_FUNC_FAIL; + } + if (HM_UNLIKELY(ret == BeforePutFuncState::BEFORE_NO_SPACE)) { + return FkvState::FKV_NO_SPACE; + } + zeroValue = value; + current_size++; + return FkvState::FKV_NOT_EXIST; + } + return FkvState::FKV_KEY_CONFLICT; + } + + /* get bucket */ + auto buck = &(mSubMaps[key % gSubMapCount][key % mBucketCount]); + + /* loop all buckets linked */ + while (buck != nullptr) { + buck->spinLock.Lock(); + if (buck->Find(key, value)) { + buck->spinLock.UnLock(); + return FkvState::FKV_EXIST; + } + buck->spinLock.UnLock(); + + if (buck->next != nullptr) { + buck = buck->next; + } else { + break; + } + } + + // did not find, now do put. 
continue from the last bucket in find + return PutKeyValue(key, value, buck, beforePutFunc); + } + + FkvState Remove(uint64_t key) + { + if (HM_UNLIKELY(key == 0)) { + if (zeroInside) { + if (__sync_bool_compare_and_swap(&zeroInside, true, false)) { + zeroValue = 0; + current_size--; + } + return FkvState::FKV_EXIST; + } + return FkvState::FKV_NOT_EXIST; + } + + /* get bucket */ + auto buck = &(mSubMaps[key % gSubMapCount][key % mBucketCount]); + + /* loop all buckets linked */ + uint64_t value; + while (buck != nullptr) { + if (buck->Find(key, value)) { + buck->Remove(key); + current_size--; + return FkvState::FKV_EXIST; + } + + buck = buck->next; + } + + return FkvState::FKV_NOT_EXIST; + } + + FkvState Remove(uint64_t key, const std::function &beforeRemoveFunc) + { + if (HM_UNLIKELY(key == 0)) { + if (!zeroInside) { + return FkvState::FKV_NOT_EXIST; + } + if (__sync_bool_compare_and_swap(&zeroInside, true, false)) { + auto ret = beforeRemoveFunc(zeroValue); + if (HM_UNLIKELY(ret == BeforeRemoveFuncState::BEFORE_FAIL)) { + return FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL; + } + zeroValue = 0; + current_size--; + } + return FkvState::FKV_EXIST; + } + + /* get bucket */ + auto buck = &(mSubMaps[key % gSubMapCount][key % mBucketCount]); + + /* loop all buckets linked */ + uint64_t value; + while (buck != nullptr) { + if (buck->Find(key, value)) { + auto ret = buck->Remove(key, beforeRemoveFunc); + if (HM_UNLIKELY(ret == FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL)) { + return FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL; + } + + current_size--; + return FkvState::FKV_EXIST; + } + + buck = buck->next; + } + + return FkvState::FKV_NOT_EXIST; + } + + FkvState Put(uint64_t key, uint64_t value) + { + if (HM_UNLIKELY(key == 0)) { + if (__sync_bool_compare_and_swap(&zeroInside, false, true)) { + zeroValue = value; + current_size++; + return FkvState::FKV_NOT_EXIST; + } + return FkvState::FKV_KEY_CONFLICT; + } + + /* get bucket */ + auto buck = &(mSubMaps[key % gSubMapCount][key % mBucketCount]); + /* loop all buckets linked */ + while (buck != nullptr) { + if (buck->next != nullptr) { + buck = buck->next; + } else { + break; + } + } + + // did not find, now do put. continue from the last bucket in find + /* try 8192 times */ + for (uint16_t i = 0; i < 8192; i++) { + /* loop all buckets linked */ + while (buck != nullptr) { + /* if there is an entry to put, just break */ + FkvState putRet = buck->Put(key, value, []() -> BeforePutFuncState { return {}; }); + if (putRet == FkvState::FKV_NOT_EXIST) { + current_size++; + return FkvState::FKV_NOT_EXIST; + } + + if (HM_UNLIKELY(putRet == FkvState::FKV_KEY_CONFLICT)) { + return FkvState::FKV_KEY_CONFLICT; + } + /* + * if no next bucket exist, just for break, + * else move to next bucket linked + */ + if (buck->next == nullptr) { + break; + } else { + buck = buck->next; + } + } + + /* + * if not put successfully in existing buckets, allocate a new one + * + * NOTES: just allocate memory, don't access new bucket in the spin lock scope, + * if access new bucket, which could trigger physical memory allocation which + * could trigger page fault, that is quite slow. 
In this case, spin lock + * could occupy too much CPU + */ + auto &lock = buck->spinLock; + lock.Lock(); + /* if other thread allocated new buck already, unlock and continue */ + if (buck->next != nullptr) { + buck = buck->next; + lock.UnLock(); + continue; + } + + /* firstly entered thread allocate new bucket */ + auto newBuck = static_cast(mOverflowEntryAlloc->Allocate(sizeof(NetHashBucket))); + if (HM_UNLIKELY(newBuck == nullptr)) { + lock.UnLock(); + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "Failed to allocate new bucket"); + return FkvState::FKV_FAIL; + } + /* link to current buck, set buck to new buck */ + buck->next = newBuck; + buck = newBuck; + + /* unlock */ + lock.UnLock(); + } + return FkvState::FKV_FAIL; + } + + bool Find(const uint64_t key, uint64_t &value) + { + if (HM_UNLIKELY(key == 0)) { + if (zeroInside) { + value = zeroValue; + return true; + } + return false; + } + /* get bucket */ + auto buck = &(mSubMaps[key % gSubMapCount][key % mBucketCount]); + + /* loop all buckets linked */ + while (buck != nullptr) { + if (buck->Find(key, value)) { + return true; + } + + buck = buck->next; + } + + return false; + } + + /* When used in muti thread, this function can only be used when keys are uniqued */ + FkvState FindAndDeleteIfFound(const uint64_t key, uint64_t &value, + const std::function &beforeRemoveFunc) + { + if (HM_UNLIKELY(key == 0)) { + if (!zeroInside) { + return FkvState::FKV_NOT_EXIST; + } + value = zeroValue; + if (__sync_bool_compare_and_swap(&zeroInside, true, false)) { + auto ret = beforeRemoveFunc(zeroValue); + if (HM_UNLIKELY(ret == BeforeRemoveFuncState::BEFORE_FAIL)) { + return FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL; + } + zeroValue = 0; + current_size--; + } + + return FkvState::FKV_EXIST; + } + /* get bucket */ + auto buck = &(mSubMaps[key % gSubMapCount][key % mBucketCount]); + + while (buck != nullptr) { + if (buck->Find(key, value)) { + auto ret = buck->Remove(key, beforeRemoveFunc); + if (HM_UNLIKELY(ret == FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL)) { + return FkvState::FKV_BEFORE_REMOVE_FUNC_FAIL; + } + current_size--; + return FkvState::FKV_EXIST; + } + + buck = buck->next; + } + + return FkvState::FKV_NOT_EXIST; + } + + std::vector> ExportVec() + { + std::vector> kvVec; + if (zeroInside) { + kvVec.emplace_back(0, zeroValue); + } + for (auto &mSubMap : mSubMaps) { + for (uint32_t j = 0; j < mBucketCount; j++) { + auto buck = &mSubMap[j]; + ExtractKeyValInBuck(buck, kvVec); + } + } + return kvVec; + } + +protected: + static constexpr uint16_t gSubMapCount = 5; /* count of sub map */ + static constexpr uint32_t gPrimesCount = 256; + + /* make sure the size of this class is 64 bytes, fit into one cache line */ + NetHeapAllocator *mOverflowEntryAlloc = nullptr; /* allocate overflowed entry in one bucket */ + NetHashBucket *mSubMaps[gSubMapCount]{}; /* sub map */ + uint32_t mBucketCount = 0; /* bucket count of each sub map */ + uint32_t mBaseSize = 4096; /* base size */ + bool zeroInside = false; + uint64_t zeroValue = 0; + + const uint32_t gPrimes[gPrimesCount] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, + 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, + 97, 103, 109, 113, 127, 137, 139, 149, 157, 167, + 179, 193, 199, 211, 227, 241, 257, 277, 293, 313, + 337, 359, 383, 409, 439, 467, 503, 541, 577, 619, + 661, 709, 761, 823, 887, 953, 1031, 1109, 1193, 1289, + 1381, 1493, 1613, 1741, 1879, 2029, 2179, 2357, 2549, + 2753, 2971, 3209, 3469, 3739, 4027, 4349, 4703, 5087, + 5503, 5953, 6427, 6949, 7517, 8123, 8783, 9497, 10273, + 11113, 12011, 
12983, 14033, 15173, 16411, 17749, 19183, + 20753, 22447, 24281, 26267, 28411, 30727, 33223, 35933, + 38873, 42043, 45481, 49201, 53201, 57557, 62233, 67307, + 72817, 78779, 85229, 92203, 99733, 107897, 116731, 126271, + 136607, 147793, 159871, 172933, 187091, 202409, 218971, 236897, + 256279, 277261, 299951, 324503, 351061, 379787, 410857, 444487, + 480881, 520241, 562841, 608903, 658753, 712697, 771049, 834181, + 902483, 976369, 1056323, 1142821, 1236397, 1337629, 1447153, + 1565659, 1693859, 1832561, 1982627, 2144977, 2320627, 2510653, + 2716249, 2938679, 3179303, 3439651, 3721303, 4026031, 4355707, + 4712381, 5098259, 5515729, 5967347, 6456007, 6984629, 7556579, + 8175383, 8844859, 9569143, 10352717, 11200489, 12117689, + 13109983, 14183539, 15345007, 16601593, 17961079, 19431899, + 21023161, 22744717, 24607243, 26622317, 28802401, 31160981, + 33712729, 36473443, 39460231, 42691603, 46187573, 49969847, + 54061849, 58488943, 63278561, 68460391, 74066549, 80131819, + 86693767, 93793069, 101473717, 109783337, 118773397, 128499677, + 139022417, 150406843, 162723577, 176048909, 190465427, + 206062531, 222936881, 241193053, 260944219, 282312799, + 305431229, 330442829, 357502601, 386778277, 418451333, + 452718089, 489790921, 529899637, 573292817, 620239453, + 671030513, 725980837, 785430967, 849749479, 919334987, + 994618837, 1076067617, 1164186217, 1259520799, 1362662261, + 1474249943, 1594975441, 1725587117, 1866894511, 2019773507, + 2185171673, 2364114217, 2557710269, 2767159799, 2993761039, + 3238918481, 3504151727, 3791104843, 4101556399, 4294967291}; + +private: + void FreeSubMaps() + { + /* free all sub maps */ + for (auto &mSubMap : mSubMaps) { + if (mSubMap != nullptr) { + delete[] mSubMap; + mSubMap = nullptr; + } + } + } + + void FreeOverFlowedEntries() + { + for (auto &mSubMap : mSubMaps) { + if (mSubMap == nullptr) { + continue; + } + + /* free overflow entries in one sub map */ + for (uint32_t buckIndex = 0; buckIndex < mBucketCount; ++buckIndex) { + auto curBuck = mSubMap[buckIndex].next; + NetHashBucket *nextOverflowEntryBuck = nullptr; + + /* exit loop when curBuck is null */ + while (curBuck != nullptr) { + /* assign next overflow buck to tmp variable */ + nextOverflowEntryBuck = curBuck->next; + + /* free this overflow bucket */ + mOverflowEntryAlloc->Free(curBuck); + + /* assign next to current */ + curBuck = nextOverflowEntryBuck; + } + } + } + } + + FkvState PutKeyValue(uint64_t key, uint64_t& value, EmbCache::NetHashBucket *buck, + const std::function& beforePutFunc) + { + /* try 8192 times */ + for (uint16_t i = 0; i < 8192; i++) { + /* loop all buckets linked */ + while (buck != nullptr) { + /* if there is an entry to put, just break */ + buck->spinLock.Lock(); + FkvState putRet = buck->Put(key, value, beforePutFunc); + buck->spinLock.UnLock(); + if (putRet == FkvState::FKV_NOT_EXIST) { + current_size++; + return FkvState::FKV_NOT_EXIST; + } + + if (HM_UNLIKELY(putRet == FkvState::FKV_KEY_CONFLICT)) { + return FkvState::FKV_KEY_CONFLICT; + } + + if (HM_UNLIKELY(putRet == FkvState::FKV_BEFORE_PUT_FUNC_FAIL)) { + return FkvState::FKV_BEFORE_PUT_FUNC_FAIL; + } + + if (HM_UNLIKELY(putRet == FkvState::FKV_NO_SPACE)) { + return FkvState::FKV_NO_SPACE; + } + + /* + * if no next bucket exist, just for break, + * else move to next bucket linked + */ + if (buck->next == nullptr) { + break; + } else { + buck = buck->next; + } + } + + /* + * if not put successfully in existing buckets, allocate a new one + * + * NOTES: just allocate memory, don't access new bucket in the spin 
lock scope, + * if access new bucket, which could trigger physical memory allocation which + * could trigger page fault, that is quite slow. In this case, spin lock + * could occupy too much CPU + */ + auto &lock = buck->spinLock; + lock.Lock(); + /* if other thread allocated new buck already, unlock and continue */ + if (buck->next != nullptr) { + buck = buck->next; + lock.UnLock(); + continue; + } + + /* firstly entered thread allocate new bucket */ + auto newBuck = static_cast(mOverflowEntryAlloc->Allocate(sizeof(NetHashBucket))); + if (HM_UNLIKELY(newBuck == nullptr)) { + lock.UnLock(); + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "Failed to allocate new bucket"); + return FkvState::FKV_FAIL; + } + /* link to current buck, set buck to new buck */ + buck->next = newBuck; + buck = newBuck; + + /* unlock */ + lock.UnLock(); + } + return FkvState::FKV_FAIL; + } + + void ExtractKeyValInBuck(EmbCache::NetHashBucket *buck, std::vector>& kvVec) + { + while (buck) { + for (size_t k = 0; k < K_KVNUMINBUCKET; k++) { + if (buck->keys[k] == 0) { + continue; + } + kvVec.emplace_back(buck->keys[k].load(), buck->values[k]); + } + buck = buck->next; + } + } +}; +} +#endif // MXREC_MAPPER_BASE_H diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h new file mode 100644 index 00000000..80170989 --- /dev/null +++ b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h @@ -0,0 +1,248 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
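mapper_base.h above is the workhorse: five sub-maps of 64-byte buckets, three CAS-published key/value slots per bucket, spin-locked overflow chaining, a prime bucket count picked from gPrimes, and a dedicated side slot for key 0 (which the buckets use as their empty marker). A single-threaded sketch of the find-or-insert path; main, the include path and the toy allocator variable next are illustrative, while FkvState and BeforePutFuncState come from embedding_cache/common.h as included above:

    #include <cstdint>
    #include "embedding_cache/offset_mapper/mapper_base.h"

    int main()
    {
        EmbCache::MapperBase map;
        if (!map.Initialize(/*reserve=*/1024)) {        // rounded up to a prime bucket count
            return 1;
        }
        uint64_t next = 0;                              // toy offset allocator (illustrative)
        uint64_t off = 0;
        auto st = map.FindAndPutIfNotFound(42, off, [&off, &next]() {
            off = next++;                               // runs only when the key is new
            return EmbCache::BeforePutFuncState::BEFORE_SUCCESS;
        });
        // First call: st == FKV_NOT_EXIST and off was freshly assigned; later calls
        // return FKV_EXIST with off loaded from the map.
        map.UnInitialize();                             // the destructor does not clean up
        return (st == EmbCache::FkvState::FKV_NOT_EXIST) ? 0 : 1;
    }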
+ ==============================================================================*/ + +#ifndef MXREC_OFFSET_MAPPER_H +#define MXREC_OFFSET_MAPPER_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mapper_base.h" + +namespace EmbCache { +class OffsetMapper : public MapperBase { +public: + OffsetMapper() = default; + + ~OffsetMapper() = default; + + bool Initialize(uint32_t reserve, uint32_t maxSize = 0) + { + maxCacheSize = maxSize; + useLength = 0; + pos2Key.resize(maxSize); + std::fill(pos2Key.begin(), pos2Key.end(), INVALID_KEY); + try { + validPos = new LimitedSet(maxSize); + evictPos = new LimitedSet(maxSize); + } catch (const std::bad_alloc &e) { + return false; + } + return MapperBase::Initialize(reserve); + } + + void UnInitialize() override + { + delete validPos; + delete evictPos; + validPos = nullptr; + evictPos = nullptr; + MapperBase::UnInitialize(); + } + + FkvState Remove(uint64_t key) + { + return MapperBase::Remove(key, [&](uint64_t value) { + validPos->remove(value); + auto pos = std::find(lastBatchPos.begin(), lastBatchPos.end(), value); + if (pos != lastBatchPos.end()) { + lastBatchPos.erase(pos); + } + evictPos->insert(value); + evictSize++; + return BeforeRemoveFuncState::BEFORE_SUCCESS; + }); + } + + std::vector> ExportSortedKVPairs() + { + auto koVec = ExportVec(); + std::sort(koVec.begin(), koVec.end(), [](const auto &u, const auto &v) { return u.second < v.second; }); + return koVec; + } + + uint64_t GetFreeLength() + { + return maxCacheSize - useLength + evictSize; + } + + int GetSwapPairsAndKey2Offset(std::vector& keys, KeyOffsetPair& swapInKoPair, + KeyOffsetPair& swapOutKoPair) + { + std::vector swapInKeysID = FilterKeys(keys, swapInKoPair); + + uint64_t swapInCnt = 0; + int ret = FindInUsedPos(keys, swapInCnt, swapInKeysID, swapInKoPair, swapOutKoPair); + if (ret != ock::ctr::H_OK) { + return ret; + } + + // 剩下的Key从om中分配位置 + ret = FindInOffsetMapper(keys, swapInKoPair, swapInCnt, swapInKeysID); + if (ret != ock::ctr::H_OK) { + return ret; + } + + // 上个batch中的pos可被换出,加入validPos中 + for (uint64_t pos : lastBatchPos) { + if (HM_UNLIKELY(pos == static_cast(INVALID_KEY))) { + continue; + } + validPos->insert(pos); + } + + // 这里keys都已被替换成offset,这个batch使用的pos在下个batch不能被换出,移出validPos + for (uint64_t pos : keys) { + if (HM_UNLIKELY(pos == static_cast(INVALID_KEY))) { + continue; + } + validPos->remove(pos); + evictPos->remove(pos); + } + + lastBatchPos = keys; + return ock::ctr::H_OK; + } + + uint32_t GetUsage() + { + return useLength - evictSize; + } + + uint64_t FindInUsedPos(std::vector& keys, uint64_t& swapInCnt, std::vector& swapInKeysID, + KeyOffsetPair& swapInKoPair, KeyOffsetPair& swapOutKoPair) + { + std::vector &swapInKeys = swapInKoPair.first; + std::vector &swapInPos = swapInKoPair.second; + std::vector &swapOutKeys = swapOutKoPair.first; + std::vector &swapOutPos = swapOutKoPair.second; + + // 换出量 = 换入量 - 剩余空间 + uint64_t swapOutNum = swapInKeys.size() <= GetFreeLength() ? 
0 : swapInKeys.size() - GetFreeLength(); + swapOutKeys.resize(swapOutNum); + swapOutPos.resize(swapOutNum); + + // 空间不足,前swapOutNum个Key从evictPos中拿可换出位置 + for (uint64_t pos : *evictPos) { + if (swapInCnt == swapInKeys.size()) { + break; + } + // 记录swapInPos + swapInPos[swapInCnt] = pos; + // key->offset + keys[swapInKeysID[swapInCnt]] = pos; + // 放入新key-pos + Put(swapInKeys[swapInCnt], pos); + // 更新pos2Key + pos2Key[pos] = swapInKeys[swapInCnt]; + swapInCnt++; + evictSize--; + } + + uint64_t swapOutCnt = 0; + // 空间不足,前swapOutNum个Key从validPos中拿可换出位置 + for (uint64_t pos : *validPos) { + if (swapOutCnt == swapOutNum) { + break; + } + // 记录swapInPos + swapInPos[swapInCnt] = pos; + // key->offset + keys[swapInKeysID[swapInCnt]] = pos; + // 删除原key-pos,放入新key-pos + uint64_t key = pos2Key[pos]; + MapperBase::Remove(key); + Put(swapInKeys[swapInCnt], pos); + // 记录swapOutKoPair + swapOutKeys[swapOutCnt] = key; + swapOutPos[swapOutCnt] = pos; + // 更新pos2Key + pos2Key[pos] = swapInKeys[swapInCnt]; + swapInCnt++; + swapOutCnt++; + } + + if (swapOutCnt < swapOutNum) { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "max cache size is too small"); + return ock::ctr::H_MAX_CACHESIZE_TOO_SMALL; + } + + return ock::ctr::H_OK; + } + + int FindInOffsetMapper(std::vector& keys, KeyOffsetPair& swapInKoPair, uint64_t swapInCnt, + std::vector& swapInKeysID) + { + std::vector &swapInKeys = swapInKoPair.first; + std::vector &swapInPos = swapInKoPair.second; + + for (uint64_t i = swapInCnt; i < swapInKeys.size(); i++) { + swapInPos[i] = useLength++; + if (HM_UNLIKELY(swapInPos[i] >= maxCacheSize)) { + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "max cache size is too small"); + return ock::ctr::H_MAX_CACHESIZE_TOO_SMALL; + } + // 放入新key-pos + Put(swapInKeys[i], swapInPos[i]); + // 更新pos2Key + pos2Key[swapInPos[i]] = swapInKeys[i]; + // key->offset + keys[swapInKeysID[i]] = swapInPos[i]; + } + return ock::ctr::H_OK; + } + + std::vector FilterKeys(std::vector& keys, KeyOffsetPair &swapInKoPair) + { + std::vector &swapInKeys = swapInKoPair.first; + std::vector &swapInPos = swapInKoPair.second; + + std::vector swapInKeysID; + for (uint64_t i = 0; i < keys.size(); i++) { + // Invalid key 不考虑 + if (HM_UNLIKELY(keys[i] == static_cast(INVALID_KEY))) { + continue; + } + // 在HBM中的key, 原地替换为pos后从validPos中移除 + // 不在HBM中的key,加入swapInKeys,并记录在keys中的下标,用于后续key->offset + if (Find(keys[i], keys[i])) { + validPos->remove(keys[i]); + } else { + swapInKeys.push_back(keys[i]); + swapInKeysID.push_back(i); + } + } + swapInPos.resize(swapInKeys.size()); + return swapInKeysID; + } + +private: + uint32_t maxCacheSize{}; // HBM可容纳embedding条数 + uint32_t useLength{}; // HBM存储的embedding条数 + LimitedSet *validPos{}; // HBM中可被换出的位置 + LimitedSet *evictPos{}; // 淘汰出的位置 + std::vector pos2Key; // HBM中每个位置对应的key + std::vector lastBatchPos; // 上个batch的keys在HBM中占用的pos + uint64_t evictSize; // evictPos的长度 +}; +} +#endif // MXREC_OFFSET_MAPPER_H diff --git a/src/AccCTR/src/factory_impl.cpp b/src/AccCTR/src/factory_impl.cpp index f0f5cdac..654e1d76 100644 --- a/src/AccCTR/src/factory_impl.cpp +++ b/src/AccCTR/src/factory_impl.cpp @@ -54,6 +54,17 @@ int FactoryImpl::CreateUnique(std::shared_ptr &out) return H_OK; } +int FactoryImpl::CreateEmbCacheManager(std::shared_ptr &out) +{ + auto tmp = new (std::nothrow) EmbCache::EmbCacheManagerImpl(); + if (tmp == nullptr) { + return H_NEW_OBJECT_FAILED; + } + + out.reset(dynamic_cast(tmp)); + return H_OK; +} + int FactoryImpl::SetExternalLogFuncInner(ExternalLog logFunc) { auto logger = 
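To make the swap protocol above concrete: GetSwapPairsAndKey2Offset rewrites keys in place into device offsets, and positions used by the immediately preceding batch stay pinned (they only enter validPos one call later), so eviction candidates always come from at least two batches back. A sketch with a 4-slot device cache; main and the include path are illustrative, and a value-initialized evictSize member (uint64_t evictSize{}) is assumed, since the header as posted never initializes it before GetFreeLength reads it:

    #include <cstdint>
    #include <vector>
    #include "embedding_cache/offset_mapper/offset_mapper.h"

    int main()
    {
        EmbCache::OffsetMapper om;
        if (!om.Initialize(/*reserve=*/128, /*maxSize=*/4)) {
            return 1;
        }
        EmbCache::KeyOffsetPair in, out;

        std::vector<uint64_t> b1 = {10, 11, 12, 13};     // cold start: fills all 4 slots;
        om.GetSwapPairsAndKey2Offset(b1, in, out);       // b1 now holds offsets, out is empty

        std::vector<uint64_t> b2 = {10, 11};             // pure hits: no swap traffic
        in = {}; out = {};
        om.GetSwapPairsAndKey2Offset(b2, in, out);

        std::vector<uint64_t> b3 = {14, 15};             // misses: 12 and 13 are swapped out,
        in = {}; out = {};                               // since their slots left b2's pin set
        om.GetSwapPairsAndKey2Offset(b3, in, out);
        // in  = (keys, offsets) to copy host -> device; out = (keys, offsets) device -> host
        om.UnInitialize();
        return 0;
    }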
ExternalLogger::Instance();
diff --git a/src/AccCTR/src/factory_impl.h b/src/AccCTR/src/factory_impl.h
index cc1c025a..aa5cd211 100644
--- a/src/AccCTR/src/factory_impl.h
+++ b/src/AccCTR/src/factory_impl.h
@@ -17,6 +17,7 @@ limitations under the License.
 #include "include/factory.h"
 #include "unique/unique_impl.h"
+#include "embedding_cache/cache_manager/cache_manager.h"
 namespace ock {
 namespace ctr {
@@ -27,6 +28,7 @@ public:
 public:
     int CreateUnique(std::shared_ptr<Unique> &out) override;
+    int CreateEmbCacheManager(std::shared_ptr<EmbCache::EmbCacheManager> &out) override;
     int SetExternalLogFuncInner(ExternalLog logFunc) override;
 public:
diff --git a/src/AccCTR/src/include/CMakeLists.txt b/src/AccCTR/src/include/CMakeLists.txt
index c9d2b215..7f8b2b6d 100644
--- a/src/AccCTR/src/include/CMakeLists.txt
+++ b/src/AccCTR/src/include/CMakeLists.txt
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
-set(INCLUDE_HEADERS factory.h ock_ctr_common_def.h unique.h)
+set(INCLUDE_HEADERS factory.h ock_ctr_common_def.h unique.h embedding_cache.h)
 set(TARGET_INSTALL_INCLUDE ${OUTPUT}/ock_ctr_common/include)
diff --git a/src/AccCTR/src/include/embedding_cache.h b/src/AccCTR/src/include/embedding_cache.h
new file mode 100644
index 00000000..4adf1fbf
--- /dev/null
+++ b/src/AccCTR/src/include/embedding_cache.h
@@ -0,0 +1,321 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2022-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
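Consumers reach the new cache manager through the factory: the hunks further down rename CreatFactory to CreateFactory and add CreateEmbCacheManager alongside CreateUnique. A bootstrap sketch; BuildCache is an illustrative name, and the static Factory::Create wrapper is inferred from the out.reset(...) body shown in include/factory.h below, not confirmed by this patch:

    #include "factory.h"

    int BuildCache(ock::ctr::EmbCacheManagerPtr &cacheMgr)
    {
        ock::ctr::FactoryPtr factory;
        int ret = ock::ctr::Factory::Create(factory);     // dlopen-backed CreateFactory underneath
        if (ret != 0) {
            return ret;
        }
        return factory->CreateEmbCacheManager(cacheMgr);  // H_OK on success
    }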
+ ==============================================================================*/ + +#ifndef EMBEDDING_CACHE_H +#define EMBEDDING_CACHE_H + +#include +#include +#include +#include + +namespace EmbCache { +using KeyOffsetPair = std::pair, std::vector>; + +class Initializer { +public: + Initializer() = default; + virtual ~Initializer() = default; + + /* * + * 生成随机数 + * @Param emb embedding的首地址 + */ + virtual void GenerateData(float* emb, int embSize) = 0; + uint32_t start{}; // 起始位置 + uint32_t len{}; // 初始化的长度 + float initParam = 1.0; // 初始化器生成的初始值均需要乘以initParam +}; + +enum class InitializerType { + INVALID, + CONSTANT, + TRUNCATED_NORMAL, + RANDOM_NORMAL +}; + +struct ConstantInitializerInfo { + ConstantInitializerInfo() = default; + + ConstantInitializerInfo(float constantValue, float initK); + + float constantValue = 0; // 常量值 + float initK = 1.0; // 初始化出来的值需乘以initK +}; + +struct NormalInitializerInfo { + NormalInitializerInfo() = default; + + NormalInitializerInfo(float mean, float stddev, uint32_t seed, float initK); + + float mean = 0; // 平均值 + float stddev = 0; // 标准差 + uint32_t seed = 0; // 随机数种子 + float initK = 1.0; // 初始化出来的值需乘以initK +}; + +class ConstantInitializer : public Initializer { +public: + ConstantInitializer() = default; + + ConstantInitializer(uint32_t start, uint32_t len, float value, float initK); + + ~ConstantInitializer() override = default; + + void GenerateData(float* emb, int embSize) override; + + uint32_t start = 0; // 起始位置 + uint32_t len = 0; // 初始化的长度 + float constantValue = 0; // 常量值 +}; + +class RandomNormalInitializer : public Initializer { +public: + RandomNormalInitializer() = default; + RandomNormalInitializer(uint32_t start, uint32_t len, NormalInitializerInfo& initInfo); + + ~RandomNormalInitializer() override = default; + + void GenerateData(float* emb, int embSize) override; + + uint32_t start = 0; // 起始位置 + uint32_t len = 0; // 初始化的长度 + float mean = 0; // 平均值 + float stddev = 0; // 标准差 + uint32_t seed = 0; // 随机数种子 + + std::default_random_engine generator; // 随机数生成器 + std::normal_distribution distribution; // 正态分布 +}; + +class TruncatedNormalInitializer : public Initializer { +public: + TruncatedNormalInitializer() = default; + + TruncatedNormalInitializer(uint32_t start, uint32_t len, NormalInitializerInfo& initInfo); + + ~TruncatedNormalInitializer() override = default; + + void GenerateData(float* emb, int embSize) override; + + int boundNum = 2; + + uint32_t start = 0; // 起始位置 + uint32_t len = 0; // 初始化的长度 + float mean = 0; // 平均值 + float stddev = 0; // 标准差 + uint32_t seed = 0; // 随机数种子 + + std::default_random_engine generator; // 随机数生成器 + std::normal_distribution distribution; + float minBound = 0; // 下界 + float maxBound = 0; // 上界 +}; + +struct InitializerInfo { + InitializerInfo() = default; + + InitializerInfo(std::string& name, uint32_t start, uint32_t len, ConstantInitializerInfo constantInitializerInfo); + + InitializerInfo(std::string& name, uint32_t start, uint32_t len, NormalInitializerInfo normalInitializerInfo); + + std::string name = ""; // 初始化器的名称 + uint32_t start = 0; // 初始化开始的位置 + uint32_t len = 0; // 待初始化的长度 + InitializerType initializerType = InitializerType::INVALID; + + ConstantInitializerInfo constantInitializerInfo; + NormalInitializerInfo normalInitializerInfo; + + std::shared_ptr initializer; +}; + +struct EmbCacheInfo { + EmbCacheInfo(std::string tableName, uint32_t vocabSize, uint32_t embeddingSize, uint32_t extEmbeddingSize, + uint32_t maxCacheSize) + : tableName(tableName), + vocabSize(vocabSize), + 
embeddingSize(embeddingSize), + extEmbeddingSize(extEmbeddingSize), + maxCacheSize(maxCacheSize) + { + } + std::string tableName = ""; + uint32_t vocabSize = 0; // host侧的容量(能存多少条embedding) + uint32_t embeddingSize = 0; + uint32_t extEmbeddingSize = 0; // 包含embedding和优化器信息的embedding长度 + uint32_t maxCacheSize = 0; // device侧的容量(能存多少条embedding) +}; + +class EmbCacheManager { +public: + virtual ~EmbCacheManager() = default; + + /* * + * 对当前embInfo对应的table在cache_manager中进行table初始化 + * @Param EmbCacheInfo: embedding cache的初始化信息 + * @Param std::vector 初始化器的信息 + * @Param uint64_t prefillBufferSize emb内存池恒定可用大小 + * @Param uint32_t refillThreadNum emb内存池自动填充线程数 + * @Return errorCode + */ + virtual int CreateCacheForTable(const EmbCacheInfo& embCacheInfo, + const std::vector& initializerInfos, int64_t invalidKey = -1, + uint64_t prefillBufferSize = 500000, uint32_t refillThreadNum = 1) = 0; + + /* * + * 查找当前keys对应的offsets并将本不存在与offsetMapper中的keys插入到offsetMapper中并得到其偏移值offsets, + * 并且当offsetMapper可存放空间不足时,释放可swapOut的keys,获取当前需要被换入换出的keys和offsets的pair + * @Param tableName: 表名 + * @Param keys: 当前batch所有unique的keys + * @Param swapInKoPair: 输出参数,需要换入的Key-offset pair + * @Param swapOutKoPair: 输出参数,需要换出的Key-offset pair + * @Return errorCode + */ + virtual int GetSwapPairsAndKey2Offset(const std::string& tableName, std::vector& keys, + KeyOffsetPair& swapInKoPair, KeyOffsetPair& swapOutKoPair) = 0; + + /* * + * 查询Embedding + * @Param tableName: 表名 + * @Param keys: 待查询的keys + * @Param embAddr: 申请出来存放embedding的空间首地址 + * @Param threadNum: 线程数 + * @Return errorCode + */ + virtual int EmbeddingLookup(const std::string& tableName, const std::vector& keys, float* embAddr, + uint32_t threadNum = 4) = 0; + + /* * + * 查询Embedding的地址 + * @Param tableName: 表名 + * @Param keys: 待查询的keys + * @Param addrs: keys对应的申请出来存放embedding的空间首地址 + * @Param threadNum: 线程数 + * @Return errorCode + */ + virtual int EmbeddingLookupAddrs(const std::string& tableName, const std::vector& keys, + std::vector& addrs, uint32_t threadNum = 4) = 0; + + /* * + * 查询Embedding并且在查询完成之后删除embedding对应的key。如果多线程使用,严格保证传入的key线程间不会重复(unique + * key),否则可能出现未定义结果 + * @Param tableName: 表名 + * @Param keys: 待查询的keys + * @Param embAddr: 申请出来存放embedding的空间首地址 + * @Param threadNum: 线程数 + * @Return errorCode + */ + virtual int EmbeddingLookupAndRemove(const std::string& tableName, const std::vector& keys, + float* embAddr, uint32_t threadNum = 4) = 0; + + /* * + * 更新Embedding + * @Param tableName: 表名 + * @Param keys: 待更新的keys,用于查询出每个key在DDR上存放的地址 + * @Param embAddr: 待更新到DDR上的embedding的首地址 + * @Param threadNum: 线程数 + * @Return errorCode + */ + virtual int EmbeddingUpdate(const std::string& tableName, const std::vector& keys, float* embAddr, + uint32_t threadNum = 4) = 0; + + /* * + * 在EmbLocalTable中移除keys,并将存储其embedding的内存位置记为可复用 + * @Param tableName: 表名 + * @Param keys: 待移除的keys + * @Return errorCode + */ + virtual int EmbeddingRemove(const std::string& tableName, const std::vector& keys, + uint32_t threadNum = 4) = 0; + + /* * + * 将需要被淘汰的keys从offsetMapper的记录中移除,同时也在EmbLocalTable中移除,并将存储其embedding的内存位置记为可复用 + * @Param tableName: 表名 + * @Param keys: 待淘汰的keys + * @Return errorCode + */ + virtual int RemoveEmbsByKeys(const std::string& tableName, const std::vector& keys) = 0; + + /* * + * 获取所有table names + * @Param allTableNames: 输出参数,用于存放所有的table names + * @Return errorCode + */ + virtual int GetEmbTableNames(std::vector& allTableNames) = 0; + + /* * + * 获取以values为增序排列的当前记录在offsetMapper中所有的keys和values的pairs + * @Param tableName: 表名 + * koVec: 输出参数 + * @Return 
errorCode + */ + virtual int ExportDeviceKeyOffsetPairs(const std::string& tableName, + std::vector>& koVec) = 0; + + /* * + * 获取当前table的序列化信息 + * @Param tableName: 要序列化的表 + * @Param buffer: 输出参数,存储序列化之后的信息 + * @Return errorCode + */ + virtual int Serialize(const std::string& tableName, std::vector& buffer) = 0; + + /* * + * 将当前table的序列化信息进行反序列化 + * @Param tableName: 要反序列化的表 + * @Param buffer: 输入参数,将buffer中的内容进行反序列化 + * @Return errorCode + */ + virtual int Deserialize(const std::string& tableName, const std::vector& buffer) = 0; + + /* * + * 析构所有embCache,释放内存 + */ + virtual void Destroy() = 0; + + /* * + * 查询表的使用量 + * @Param tableName: 要查询的表 + * @Return 当前表的使用量 + */ + virtual uint32_t GetUsage(const std::string& tableName) = 0; + + /* * + * 获取当前host侧所存储的所有keys及其对应的embeddings和优化器参数 + * @Param tableName: 需要获取信息的table名字 + * @Param keys: 输入参数,输入空vector,获取的存储的所有keys会赋到该vector中 + * @Param embeddings: 输入参数,输入空vector,获取的存储的所有embeddings会赋到该vector中 + * @Param optimizerSlots: 输入参数,输入空vector,获取的存储的所有optimizerSlots会赋到该vector中 + * @Return errorCode + */ + virtual int GetEmbTableInfos(std::string tableName, std::vector& keys, + std::vector>& embeddings, + std::vector>& optimizerSlots) = 0; + + /* * + * 将所需存储的keys及其对应的embeddings和优化器参数传入,来装载LocalEmbeddingTable + * @Param tableName: 需要加载信息的table名字 + * @Param keys: 输入参数,需要加载的所有keys + * @Param embeddings: 输入参数,需要加载的所有embeddings + * @Param optimizerSlots: 输入参数,需要加载的所有optimizerSlots + * @Return errorCode + */ + virtual int LoadEmbTableInfos(std::string tableName, const std::vector& keys, + const std::vector>& embeddings, + const std::vector>& optimizerSlots) = 0; +}; +} // namespace EmbCache + +#endif // EMBEDDING_CACHE_H diff --git a/src/AccCTR/src/include/factory.h b/src/AccCTR/src/include/factory.h index 14732cf9..69e8217a 100644 --- a/src/AccCTR/src/include/factory.h +++ b/src/AccCTR/src/include/factory.h @@ -19,6 +19,7 @@ limitations under the License. 
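Pulling the interface together, an end-to-end sketch of one training step against EmbCacheManager. RunOneBatch and the numeric arguments are illustrative, the per-key lookup width is assumed to be extEmbeddingSize floats, the manager instance is assumed to come from CreateEmbCacheManager, and which InitializerType a given initializer name selects is an implementation detail not visible in this header:

    #include <cstdint>
    #include <string>
    #include <vector>
    #include "embedding_cache.h"

    int RunOneBatch(EmbCache::EmbCacheManager &cacheMgr)
    {
        std::string table = "user_emb";
        std::string initName = "trunc_normal";
        std::vector<EmbCache::InitializerInfo> inits;
        inits.emplace_back(initName, /*start=*/0, /*len=*/16,
                           EmbCache::NormalInitializerInfo(/*mean=*/0.0f, /*stddev=*/0.05f,
                                                           /*seed=*/2024, /*initK=*/1.0f));
        EmbCache::EmbCacheInfo info(table, /*vocabSize=*/1000000, /*embeddingSize=*/16,
                                    /*extEmbeddingSize=*/32, /*maxCacheSize=*/100000);
        int ret = cacheMgr.CreateCacheForTable(info, inits);
        if (ret != 0) {
            return ret;
        }

        std::vector<uint64_t> keys = {7, 42, 99};                // already unique, as required
        EmbCache::KeyOffsetPair swapIn, swapOut;
        ret = cacheMgr.GetSwapPairsAndKey2Offset(table, keys, swapIn, swapOut);
        if (ret != 0) {
            return ret;
        }

        std::vector<float> hostBuf(swapIn.first.size() * 32);    // 32 == extEmbeddingSize
        ret = cacheMgr.EmbeddingLookup(table, swapIn.first, hostBuf.data());
        // ... copy hostBuf to the device at offsets swapIn.second, move swapOut rows the
        // other way, run the step, then persist updated rows via EmbeddingUpdate(...) ...
        return ret;
    }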
#include #include #include "unique.h" +#include "embedding_cache.h" #ifdef __cplusplus @@ -39,11 +40,13 @@ class Factory; using FactoryPtr = std::shared_ptr; using UniquePtr = std::shared_ptr; +using EmbCacheManagerPtr = std::shared_ptr; class Factory { public: virtual ~Factory() = default; virtual int CreateUnique(UniquePtr &out) = 0; + virtual int CreateEmbCacheManager(EmbCacheManagerPtr &out) = 0; virtual int SetExternalLogFuncInner(ExternalLog logFunc) = 0; public: @@ -52,7 +55,7 @@ public: int result = 0; uintptr_t factory = 0; /* dynamic load function */ - if ((result = OckCtrCommonDef::CreatFactory(&factory)) == 0) { + if ((result = OckCtrCommonDef::CreateFactory(&factory)) == 0) { out.reset(reinterpret_cast(factory)); } return result; diff --git a/src/AccCTR/src/include/ock_ctr_common_def.h b/src/AccCTR/src/include/ock_ctr_common_def.h index ed955996..75e7e9cb 100644 --- a/src/AccCTR/src/include/ock_ctr_common_def.h +++ b/src/AccCTR/src/include/ock_ctr_common_def.h @@ -25,7 +25,7 @@ namespace ock { namespace ctr { class OckCtrCommonDef { public: - static int CreatFactory(uintptr_t *factory) + static int CreateFactory(uintptr_t *factory) { static void *handle = nullptr; static std::mutex m; diff --git a/src/AccCTR/src/include/unique.h b/src/AccCTR/src/include/unique.h index 3154a784..1f58f8a4 100644 --- a/src/AccCTR/src/include/unique.h +++ b/src/AccCTR/src/include/unique.h @@ -58,6 +58,7 @@ using UniqueConf = struct UniqueConfCTR { uint32_t maxThreadNum = 8; // 最大工作线程数 int64_t maxIdVal = 0; // 最大id值 bool trace = false; // 是否开启性能检测,需要配合外部日志输出 + bool performance = false; // 是否开启增强接口,增强接口shardingNum必须是2的幂次方,默认用取模分桶 } __attribute__((packed)); using UniqueIn = struct UniqueInCTR { diff --git a/src/AccCTR/src/unique/unique_func.cpp b/src/AccCTR/src/unique/unique_func.cpp index d208eac9..45ac768a 100644 --- a/src/AccCTR/src/unique/unique_func.cpp +++ b/src/AccCTR/src/unique/unique_func.cpp @@ -27,7 +27,6 @@ void Dedup::Insert(uint64_t val) for (int8_t i = 0; i < count; ++i) { if (bucket->data[totalCount] == val) { - TryIncreaseIdCount(bucket->idCount[totalCount]); // found one return; } @@ -38,7 +37,6 @@ void Dedup::Insert(uint64_t val) std::lock_guard lg(bucket->lock); for (int8_t j = totalCount; j < bucket->count; ++j) { if (bucket->data[totalCount] == val) { - TryIncreaseIdCount(bucket->idCount[totalCount]); // found one return; } @@ -47,7 +45,6 @@ void Dedup::Insert(uint64_t val) if (totalCount < n) { bucket->data[totalCount] = val; bucket->count++; - TryIncreaseIdCount(bucket->idCount[totalCount]); return; } } @@ -55,13 +52,6 @@ void Dedup::Insert(uint64_t val) InsertOverflow(val); } -inline void Dedup::TryIncreaseIdCount(std::atomic &val) -{ - if (idCountEnable_) { - val++; - } -} - int32_t Dedup::GetReplaceOffsetUnsafe(uint64_t val) { auto h = static_cast(Hash(val) & bucketCountMask_); @@ -108,7 +98,6 @@ void Dedup::Clear(uint64_t newBucketCountPowerOf2) } bzero(table_, sizeof(Meta) * bucketCount_); overflow_.clear(); - idCountOverflow_.clear(); } void Dedup::NewParameter() @@ -168,6 +157,58 @@ int32_t ShardedDedup::GetFillOffset(const std::vector &totalUniqueSize, } } +void ShardedDedup::GetIndexAndStart(const int32_t *uniqueSizeInBucket, bool usePadding, int shardingNumber, int &start, + int &index) +{ + if (shardingNumber > 0) { + index += uniqueSizeInBucket[shardingNumber - 1]; + } + + if (usePadding) { + start = shardingNumber * conf.paddingSize; + } else { + start = index; + } +} + +int ShardedDedup::PrintMemCpyLog(int rc, const uint32_t dstSize, const std::string &logMsg) +{ 
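One behavioural detail from the UniqueConf change above deserves a worked example: with performance set, ComputeInsert (further down) shards by value & (conf.shardingNum - 1) instead of the GroupMethod modulo, and the two agree only when shardingNum is a power of two, hence the constraint stated in the new config comment. A self-contained sketch; ShardFast is an illustrative name, not a function in this patch:

    #include <cassert>
    #include <cstdint>

    // Fast sharding path used when UniqueConf.performance is set (sketch).
    inline uint32_t ShardFast(uint64_t value, uint32_t shardingNum)
    {
        assert(shardingNum != 0 && (shardingNum & (shardingNum - 1)) == 0);  // 2^k only
        return static_cast<uint32_t>(value & (shardingNum - 1));
    }

    int main()
    {
        for (uint64_t v : {0ULL, 7ULL, 64ULL, 1234567ULL}) {
            assert(ShardFast(v, 8) == v % 8);   // identical to modulo for 2^k shard counts
        }
        return 0;
    }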
+ if (rc != 0) { + std::stringstream ssm; + ssm << "[" << logMsg << "] memcpy_s failed... dstSize: " << dstSize; + ExternalLogger::PrintLog(LogLevel::ERROR, ssm.str()); + return H_COPY_ERROR; + } else { + return H_OK; + } +} + +int ShardedDedup::HandleIdCountFill(std::vector> &idCount, UniqueOutSelf &uniqueOut) +{ + if (conf.usePadding) { + uint32_t memSize = idCount.size() * sizeof(int32_t); + auto rc = memcpy_s(uniqueOut.idCntFill, memSize, (int32_t *)(idCount.data()), memSize); + if (rc != 0) { + return rc; + } + int ret = PrintMemCpyLog(rc, memSize, "[TileAndFill/idCntFill]"); + if (ret != 0) { + return ret; + } + } else { + uint32_t memSize = idCount.size() * sizeof(int32_t); + auto rc = memcpy_s(uniqueOut.idCnt, memSize, (int32_t *)(idCount.data()), memSize); + if (rc != 0) { + return rc; + } + + int ret = PrintMemCpyLog(rc, memSize, "[TileAndFill/idCnt]"); + if (ret != 0) { + return ret; + } + } + return H_OK; +} size_t ShardedDedup::CalThreadNum() const { diff --git a/src/AccCTR/src/unique/unique_func.h b/src/AccCTR/src/unique/unique_func.h index 07c8ebb7..4812f74c 100644 --- a/src/AccCTR/src/unique/unique_func.h +++ b/src/AccCTR/src/unique/unique_func.h @@ -30,6 +30,7 @@ limitations under the License. #include #include #include +#include #include "securec.h" #include "common_includes.h" @@ -37,6 +38,14 @@ limitations under the License. namespace ock { namespace ctr { +#ifndef LIKELY +#define LIKELY(x) __builtin_expect(!!(x), 1) +#endif + +#ifndef UNLIKELY +#define UNLIKELY(x) __builtin_expect(!!(x), 0) +#endif + using UniqueOutSelf = struct UniqueSelf { void *uniqueId = nullptr; // 去重分桶填充之后最终的的ids(需要用户申请)必选 uint32_t *index = nullptr; // 去重后id的索引位置(需要用户申请)必选 @@ -47,7 +56,7 @@ using UniqueOutSelf = struct UniqueSelf { int uniqueIdCnt = 0; // 每个桶去重后的id个数(需要用户申请) }; -constexpr int UNIQUE_MAX_BUCKET_WIDTH = 5; +constexpr int UNIQUE_MAX_BUCKET_WIDTH = 6; template struct Map {}; template <> struct Map { @@ -111,7 +120,7 @@ class Dedup { static constexpr uint32_t K_MINIMAL_WORKLOAD_PER_WORKER = 1 << 12; static constexpr size_t K_ALIGNMENT = 64; static const int kDefaultBucketCount = 1 << 24; - static const int8_t n = 4; + static const int8_t n = UNIQUE_MAX_BUCKET_WIDTH; template struct Meta { static_assert(M <= UNIQUE_MAX_BUCKET_WIDTH, "should be no larger than max bucket width"); @@ -119,7 +128,6 @@ class Dedup { volatile int8_t count {}; uint32_t replaceBase {}; volatile uint64_t data[M] {}; - std::atomic idCount[M] {}; } __attribute__((__aligned__(64))); struct Statistics { @@ -152,11 +160,10 @@ public: void Insert(uint64_t val); int32_t GetReplaceOffsetUnsafe(uint64_t val); void InitTable(); - void TryIncreaseIdCount(std::atomic &val); void Clear(uint64_t newBucketCountPowerOf2); void NewParameter(); - template uint32_t UniqueRaw(void *output, uint32_t priorTotal, int32_t *idCount) + template uint32_t UniqueRaw(void *output, uint32_t priorTotal) { uint32_t total = priorTotal; uint32_t replaceOffset = priorTotal; @@ -168,19 +175,13 @@ public: } bucket->replaceBase = replaceOffset; for (int j = 0; j < bucket->count; ++j) { - if (idCountEnable_) { - idCount[total] = bucket->idCount[j]; - } - out[total++] = static_cast::type>(bucket->data[j]); + out[total++] = bucket->data[j]; } replaceOffset += bucket->count; } auto it = overflow_.begin(); int32_t totalOverflow = 0; while (it != overflow_.end()) { - if (idCountEnable_) { - idCount[total] = static_cast(idCountOverflow_[it->first]); - } out[total++] = it->first; it->second = replaceOffset++; ++it; @@ -189,7 +190,7 @@ public: // set total 
overflow count stats_.totalUniques = static_cast(total - priorTotal); - stats_.totalOverflowUniques = static_cast(totalOverflow); + stats_.totalOverflowUniques = totalOverflow; return total - priorTotal; } @@ -200,14 +201,13 @@ private: int largeCount_ { 0 }; Meta *table_ {}; std::unordered_map overflow_; - std::unordered_map idCountOverflow_; SpinLockG overflowMutex_; Statistics stats_; bool idCountEnable_ { false }; static inline uint64_t Hash(uint64_t val) { - return val ^ (val >> HASH_L_L) ^ (val >> HASH_L_L) ^ (val >> HASH_H); + return val ^ (val >> HASH_L_L) ^ (val >> HASH_L) ^ (val >> HASH_H); } void InsertOverflow(uint64_t val) @@ -217,10 +217,6 @@ private: if (it == overflow_.end()) { overflow_[val] = 0; } - - if (idCountEnable_) { - idCountOverflow_[val]++; - } } int32_t GetReplaceOffsetFromOverflowUnsafe(uint64_t val) @@ -234,6 +230,7 @@ class ShardedDedup { static constexpr uint32_t K_MINIMAL_WORKLOAD_PER_WORKER = 1 << 13; static constexpr int K_DEFAULT_DUPLICATE_RATIO = 4; static constexpr int K_BUCKET_WIDTH = 4; + static constexpr int CLEAR_WAIT_TIME = 10; public: using DedupT = Dedup; @@ -244,44 +241,45 @@ public: { const int numOfGroupsInShard = groupMethod_.GroupCount(); uint32_t totalSize = conf.desiredSize + (conf.desiredSize >> 1); - while (bucketCountPower2_ * static_cast(K_BUCKET_WIDTH) * - static_cast(numOfGroupsInShard) * static_cast(estimatedDuplicateRatio) < totalSize) { + while (bucketCountPower2_ * K_BUCKET_WIDTH * numOfGroupsInShard * estimatedDuplicateRatio < totalSize) { bucketCountPower2_ <<= 1; } idCountEnable_ = (conf.outputType == OutputType::ENHANCED) && conf.useIdCount; - try { - for (int32_t i = 0; i < numOfGroupsInShard; ++i) { - auto obj = new DedupT(bucketCountPower2_, numOfGroupsInShard, idCountEnable_); - dedupShards_.emplace_back(obj); + for (int32_t i = 0; i < numOfGroupsInShard; ++i) { + auto obj = new DedupT(bucketCountPower2_, numOfGroupsInShard, idCountEnable_); + if (obj == nullptr) { + ExternalLogger::PrintLog(LogLevel::ERROR, "creat object error"); + throw NullptrError(); } - } catch (const std::bad_alloc& e) { - ExternalLogger::PrintLog(LogLevel::ERROR, "Memory allocation failed during loop: " + std::string(e.what())); - throw; + dedupShards_.emplace_back(obj); } } ~ShardedDedup() = default; - void StartNewRound() + int StartNewRound() { for (auto &s : dedupShards_) { s->NewParameter(); } + clearFinish_ = true; + return 0; } public: template int Compute(UniqueIn &uniqueIn, UniqueOutSelf &uniqueOut) { - try { - if (!firstEnterFlag_) { - StartNewRound(); - } - } catch (AllocError &) { - ExternalLogger::PrintLog(LogLevel::ERROR, "memory alloc error"); - return H_MEMORY_ALLOC_ERROR; + if (firstEnter_) { + pool_.SetNumThreads(1); + firstEnter_ = false; } - firstEnterFlag_ = false; + + while (!clearFinish_) { + usleep(CLEAR_WAIT_TIME); + } + + clearFinish_ = false; size_t threadNum = CalThreadNum(); partSize = (uniqueIn.inputIdCnt + threadNum - 1) / threadNum; @@ -304,23 +302,29 @@ public: if (conf.outputType == OutputType::ENHANCED) { int totalNumber = 0; for (int i = 0; i < conf.shardingNum; i++) { - totalUniqueSize[i] = static_cast(totalNumber); + totalUniqueSize[i] = totalNumber; if (conf.useSharding) { totalNumber += uniqueOut.uniqueIdCntInBucket[i]; } } } - ret = CalUniqueOut(uniqueIn, uniqueOut, totalUniqueSize); + int size = 1; + if (conf.useIdCount) { + size = conf.usePadding ? 
 
public:
     template <typename T> int Compute(UniqueIn &uniqueIn, UniqueOutSelf &uniqueOut)
     {
-        try {
-            if (!firstEnterFlag_) {
-                StartNewRound();
-            }
-        } catch (AllocError &) {
-            ExternalLogger::PrintLog(LogLevel::ERROR, "memory alloc error");
-            return H_MEMORY_ALLOC_ERROR;
-        }
-        firstEnterFlag_ = false;
+        if (firstEnter_) {
+            pool_.SetNumThreads(1); // one worker is enough: it only ever runs StartNewRound()
+            firstEnter_ = false;
+        }
+
+        // Wait until the deferred clear of the previous round has finished before reusing tables.
+        while (!clearFinish_) {
+            usleep(CLEAR_WAIT_TIME);
+        }
+
+        clearFinish_ = false;
 
         size_t threadNum = CalThreadNum();
         partSize = (uniqueIn.inputIdCnt + threadNum - 1) / threadNum;
@@ -304,23 +302,29 @@ public:
         if (conf.outputType == OutputType::ENHANCED) {
             int totalNumber = 0;
             for (int i = 0; i < conf.shardingNum; i++) {
-                totalUniqueSize[i] = static_cast<int32_t>(totalNumber);
+                totalUniqueSize[i] = totalNumber;
                 if (conf.useSharding) {
                     totalNumber += uniqueOut.uniqueIdCntInBucket[i];
                 }
             }
         }
 
-        ret = CalUniqueOut(uniqueIn, uniqueOut, totalUniqueSize);
+        // Stage per-id counters: padded layout (paddingSize per shard) or compact (uniqueIdCnt).
+        int size = 1;
+        if (conf.useIdCount) {
+            size = conf.usePadding ? conf.paddingSize * conf.shardingNum : uniqueOut.uniqueIdCnt;
+        }
+        std::vector<std::atomic<int32_t>> idCount(size);
+        ret = CalUniqueOut<T>(uniqueIn, uniqueOut, totalUniqueSize, idCount);
         if (ret != H_OK) {
             ExternalLogger::PrintLog(LogLevel::ERROR, "CalUniqueOut ERROR");
             return ret;
         }
 
         if (conf.outputType == OutputType::ENHANCED) {
-            HandleTileAndFill(uniqueIn, uniqueOut);
+            ret = HandleTileAndFill<T>(uniqueOut, idCount); // propagate failures instead of dropping them
+            if (ret != H_OK) {
+                return ret;
+            }
         }
+        pool_.AddTask([this]() { return StartNewRound(); });
         return H_OK;
     }
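+    // Sizing example for the idCount staging buffer in Compute(): with conf.shardingNum = 8 and
+    // conf.paddingSize = 1024 (illustrative values only), the padded layout holds 8 * 1024 = 8192
+    // atomic counters; without padding it holds exactly uniqueOut.uniqueIdCnt entries.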
@@ -336,17 +340,22 @@ private:
     int32_t GetFillOffset(const std::vector<int32_t> &totalUniqueSize, int64_t val, int32_t group);
 
-    template <typename T> int HandleTileAndFill(UniqueIn &uniqueIn, UniqueOutSelf &uniqueOut)
+    void GetIndexAndStart(const int32_t *uniqueSizeInBucket, bool usePadding, int shardingNumber, int &start,
+        int &index);
+
+    int PrintMemCpyLog(int rc, const uint32_t dstSize, const std::string &logMsg);
+
+    int HandleIdCountFill(std::vector<std::atomic<int32_t>> &idCount, UniqueOutSelf &uniqueOut);
+
+    template <typename T> int HandleTileAndFill(UniqueOutSelf &uniqueOut, std::vector<std::atomic<int32_t>> &idCount)
     {
         int ret = H_OK;
         if (conf.useSharding) { // sharding enabled
-            ret = TileAndFill(uniqueOut.uniqueIdInBucket, uniqueOut.uniqueIdCntInBucket, uniqueOut.uniqueId,
-                uniqueOut.idCnt, uniqueOut.idCntFill);
+            ret = TileAndFill<T>(uniqueOut, uniqueOut.uniqueIdCntInBucket, idCount);
         } else if (!conf.useSharding && conf.useIdCount) { // sharding disabled, id count enabled
             std::vector<int32_t> count;
             count.emplace_back(uniqueOut.uniqueIdCnt); // record the number of deduped ids
-            ret = TileAndFill(uniqueOut.uniqueId, count.data(), uniqueOut.uniqueId, uniqueOut.idCnt,
-                uniqueOut.idCntFill);
+            ret = TileAndFill<T>(uniqueOut, count.data(), idCount);
         }
 
         if (ret != H_OK) {
@@ -365,37 +374,37 @@ private:
             uint64_t inGroupTotal;
             if (conf.outputType == OutputType::ENHANCED) {
                 if (conf.useSharding && conf.useIdCount) {
-                    inGroupTotal = dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueIdInBucket, total,
-                        uniqueOut.idCnt); // id count and sharding both enabled
-                    uniqueOut.uniqueIdCntInBucket[j] = static_cast<int32_t>(inGroupTotal);
+                    inGroupTotal =
+                        dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueIdInBucket, total); // id count and sharding both enabled
+                    uniqueOut.uniqueIdCntInBucket[j] = inGroupTotal;
                 } else if (!conf.useSharding && conf.useIdCount) {
-                    inGroupTotal = dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueId, total,
-                        uniqueOut.idCnt); // id count enabled, sharding disabled
+                    inGroupTotal =
+                        dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueId, total); // id count enabled, sharding disabled
                 } else if (conf.useSharding && !conf.useIdCount) {
-                    inGroupTotal = dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueIdInBucket, total,
-                        nullptr); // id count enabled, sharding disabled
-                    uniqueOut.uniqueIdCntInBucket[j] = static_cast<int32_t>(inGroupTotal);
+                    inGroupTotal =
+                        dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueIdInBucket, total); // sharding enabled, id count disabled
+                    uniqueOut.uniqueIdCntInBucket[j] = inGroupTotal;
                 } else {
-                    inGroupTotal = dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueId, total,
-                        nullptr); // both disabled; equivalent to a plain unique
+                    inGroupTotal = dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueId,
+                        total); // both disabled; equivalent to a plain unique
                 }
             } else {
-                inGroupTotal = dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueId, total, nullptr);
+                inGroupTotal = dedupShards_[j]->UniqueRaw<T>(uniqueOut.uniqueId, total);
             }
-            total += static_cast<uint32_t>(inGroupTotal);
+            total += inGroupTotal;
         }
         uniqueOut.uniqueIdCnt = total;
     }
 
     template <typename T>
-    int TileAndFill(void *uniqueIdInBucket, const int32_t *uniqueSizeInBucket, void *uniqueIds, const int32_t *idCnt,
-        int32_t *idCntFill)
+    int TileAndFill(UniqueOutSelf &uniqueOut, const int32_t *uniqueSizeInBucket,
+        std::vector<std::atomic<int32_t>> &idCount)
     {
         int start = 0;
         int index = 0;
-        auto uIdInBucket = TypeTrans<T>(uniqueIdInBucket);
-        auto uIds = TypeTrans<T>(uniqueIds);
+        auto uIdInBucket = TypeTrans<T>(conf.useSharding ? uniqueOut.uniqueIdInBucket : uniqueOut.uniqueId);
+        auto uIds = TypeTrans<T>(uniqueOut.uniqueId);
 
         for (int i = 0; i < conf.shardingNum; i++) {
             GetIndexAndStart(uniqueSizeInBucket, conf.usePadding, i, start, index);
@@ -419,35 +428,31 @@ private:
             if (conf.useIdCount && conf.usePadding) {
                 memSize = uniqueSizeInBucket[i] * sizeof(int32_t);
-                rc = memcpy_s(idCntFill + start, memSize, idCnt + index, memSize);
-                ret = PrintMemCpyLog(rc, memSize, "[TileAndFill/idCntFill]");
+                rc = memcpy_s(uniqueOut.idCnt + index, memSize, (int32_t *)(idCount.data()) + start,
+                    memSize); // compact the padded per-id counts into idCnt
+                ret = PrintMemCpyLog(rc, memSize, "[TileAndFill/idCnt]");
+            }
+
+            if (ret != 0) {
+                return ret;
+            }
+        }
+
+        if (conf.useIdCount) {
+            int ret = HandleIdCountFill(idCount, uniqueOut);
             if (ret != 0) {
                 return ret;
             }
         }
 
         if (conf.usePadding) {
-            HandleFill(uIds, uniqueSizeInBucket, idCntFill);
+            HandleFill<T>(uIds, uniqueSizeInBucket);
         }
 
         return H_OK;
     }
 
-    int PrintMemCpyLog(int rc, const uint32_t dstSize, const std::string &logMsg)
-    {
-        if (rc != 0) {
-            std::stringstream ssm;
-            ssm << "[" << logMsg << "] memcpy_s failed... dstSize: " << dstSize;
-            ExternalLogger::PrintLog(LogLevel::ERROR, ssm.str());
-            return H_COPY_ERROR;
-        } else {
-            return H_OK;
-        }
-    }
-
-    template <typename T>
-    void HandleFill(typename Map<T>::type *uIds, const int32_t *uniqueSizeInBucket, int32_t *idCntFill)
+    template <typename T> void HandleFill(typename Map<T>::type *uIds, const int32_t *uniqueSizeInBucket)
     {
         int start = 0;
         int index = 0;
@@ -459,26 +464,6 @@ private:
             for (int j = 0; j < fillLen; j++) {
                 uIds[start + uniqueSizeInBucket[i] + j] = conf.paddingVal; // padding fill
             }
-
-            if (idCntFill != nullptr) {
-                for (int y = 0; y < fillLen; y++) {
-                    idCntFill[start + uniqueSizeInBucket[i] + y] = 0; // id count fill
-                }
-            }
-        }
-    }
-
-    void GetIndexAndStart(const int32_t *uniqueSizeInBucket, bool usePadding, int shardingNumber, int &start,
-        int &index)
-    {
-        if (shardingNumber > 0) {
-            index += uniqueSizeInBucket[shardingNumber - 1];
-        }
-
-        if (usePadding) {
-            start = shardingNumber * conf.paddingSize;
-        } else {
-            start = index;
         }
     }
 
@@ -493,13 +478,18 @@ private:
             tasks.push_back([this, val, start, end, &ret]() {
                 for (uint64_t j = start; j < end; ++j) {
                     auto value = val[j];
-                    if (value > conf.maxIdVal) {
+                    if (UNLIKELY(value > conf.maxIdVal)) {
                         ExternalLogger::PrintLog(LogLevel::ERROR, "id val is larger than maxIdVal");
                         ret = H_ID_LARGE;
                         break;
                     }
-                    auto group = groupMethod_.GroupId(value);
-                    dedupShards_[group]->Insert(value);
+
+                    if (conf.performance) {
+                        // Fast path: shardingNum is validated to be a power of two, so the shard
+                        // can be picked with a mask instead of the generic GroupId().
+                        dedupShards_[value & (conf.shardingNum - 1)]->Insert(value);
+                    } else {
+                        auto group = groupMethod_.GroupId(value);
+                        dedupShards_[group]->Insert(value);
+                    }
                 }
             });
         }
@@ -520,31 +510,46 @@ private:
     template <typename T>
-    int CalUniqueOut(UniqueIn &uniqueIn, UniqueOutSelf &uniqueOut, std::vector<int32_t> &totalUniqueSize)
+    int CalUniqueOut(UniqueIn &uniqueIn, UniqueOutSelf &uniqueOut, std::vector<int32_t> &totalUniqueSize,
+        std::vector<std::atomic<int32_t>> &idCount)
     {
         uint32_t *beginPtr = uniqueOut.index;
         uint32_t *finishPtr = beginPtr + uniqueIn.inputIdCnt;
         uint32_t *partBeginPtr = beginPtr;
-        auto alignedAddress = CacheLineAlign(reinterpret_cast<uintptr_t>(partBeginPtr + partSize));
-        auto *partEndPtr = reinterpret_cast<uint32_t *>(static_cast<uintptr_t>(alignedAddress));
+        auto *partEndPtr =
+            reinterpret_cast<uint32_t *>(CacheLineAlign(reinterpret_cast<uintptr_t>(partBeginPtr + partSize)));
         std::vector<std::function<void()>> tasks;
         auto val = TypeTrans<T>(uniqueIn.inputId);
         while (partBeginPtr < finishPtr) {
             if (partEndPtr > finishPtr) {
                 partEndPtr = finishPtr;
             }
-            if (partBeginPtr < partEndPtr) {
-                // Due to cacheline alignment computation, the actual number of
-                // threads created here may not match threadNum exactly but
-                // should be +/-1 off.
-                tasks.push_back([this, val, beginPtr, partBeginPtr, partEndPtr, totalUniqueSize]() {
-                    for (uint32_t *ptr = partBeginPtr; ptr < partEndPtr; ++ptr) {
+
+            if (partBeginPtr >= partEndPtr) {
+                partBeginPtr = partEndPtr;
+                partEndPtr += partSize;
+                continue;
+            }
+
+            // Due to cacheline alignment computation, the actual number of
+            // threads created here may not match threadNum exactly but
+            // should be +/-1 off.
+            tasks.push_back([this, val, beginPtr, partBeginPtr, partEndPtr, totalUniqueSize, &idCount]() {
+                for (uint32_t *ptr = partBeginPtr; ptr < partEndPtr; ++ptr) {
+                    int32_t fillOffset;
+                    if (conf.performance) {
+                        fillOffset = GetFillOffset(totalUniqueSize, val[ptr - beginPtr],
+                            val[ptr - beginPtr] & (conf.shardingNum - 1));
+                    } else {
                         auto group = groupMethod_.GroupId(val[ptr - beginPtr]);
-                        int32_t fillOffset = GetFillOffset(totalUniqueSize, val[ptr - beginPtr], group);
-                        *ptr = fillOffset;
+                        fillOffset = GetFillOffset(totalUniqueSize, val[ptr - beginPtr], group);
                     }
-                });
-            }
+                    *ptr = fillOffset;
+                    if (LIKELY(conf.useIdCount)) {
+                        idCount[fillOffset]++;
+                    }
+                }
+            });
             partBeginPtr = partEndPtr;
             partEndPtr += partSize;
         }
@@ -569,8 +574,10 @@ private:
     UniqueConf conf;
     std::vector<std::unique_ptr<DedupT>> dedupShards_ {};
     uint32_t partSize;
-    bool firstEnterFlag_ = false;
+    // Written by the pool_ thread in StartNewRound() and polled in Compute(), so it must be atomic.
+    std::atomic<bool> clearFinish_ { true };
     bool idCountEnable_ { false };
+    ThreadPoolAsync pool_;
+    bool firstEnter_ = true;
 };
 }
 }
diff --git a/src/AccCTR/src/unique/unique_impl.cpp b/src/AccCTR/src/unique/unique_impl.cpp
index 77113214..800f21de 100644
--- a/src/AccCTR/src/unique/unique_impl.cpp
+++ b/src/AccCTR/src/unique/unique_impl.cpp
@@ -228,6 +228,14 @@ int UniqueImpl::CheckEnhancedUniqueConf(const UniqueConf &conf)
         if (CheckInputZero(conf.shardingNum, "shardingNum")) {
             return H_NUM_SMALL;
         }
+        if (conf.performance) {
+            bool isExponentOfTwo =
+                (conf.shardingNum > 0) && ((conf.shardingNum & (conf.shardingNum - 1)) == 0); // power-of-two check
+            if (!isExponentOfTwo) {
+                ExternalLogger::PrintLog(LogLevel::ERROR, "if performance is true, shardingNum must be 2^N");
+                return H_ERROR;
+            }
+        }
     }
 
     return H_OK;
diff --git a/src/AccCTR/src/unique/unique_impl.h b/src/AccCTR/src/unique/unique_impl.h
index f4c45fde..e37a58db 100644
--- a/src/AccCTR/src/unique/unique_impl.h
+++ b/src/AccCTR/src/unique/unique_impl.h
@@ -43,7 +43,7 @@ private:
 
 private:
     ShardedDedup *unique = nullptr;
-    UniqueConf uniqueConf {};
+    UniqueConf uniqueConf{};
 };
 }
 }
diff --git a/src/AccCTR/tests/tools/create_fake_id.py b/src/AccCTR/tests/tools/create_fake_id.py
index fc0f1f8e..aa42f071 100644
--- a/src/AccCTR/tests/tools/create_fake_id.py
+++ b/src/AccCTR/tests/tools/create_fake_id.py
@@ -68,12 +68,6 @@ def write_data(file_name, x, y, dup):
 
 
 def main():
-    # 3,000,000 ids with a 20% unique ratio:
-    # 6x + y = 300
-    # x + y = 60
-    # => x = 48, y = 12 (in units of 10,000)
-    write_data('data20.txt', 48*10000, 12*10000, 6)
-
     # 3,000,000 ids with a 30% unique ratio:
     # 6x + y = 300
     # x + y = 90
diff --git a/src/AccCTR/tests/ut/conf/toolchain.cmake b/src/AccCTR/tests/ut/conf/toolchain.cmake
new file mode 100644
index 00000000..bd6617e4
--- /dev/null
+++ b/src/AccCTR/tests/ut/conf/toolchain.cmake
@@ -0,0 +1,24 @@
+# Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Compile options
+option(USE32BIT "Use 32-Bit" OFF)
+if(USE32BIT)
+    add_compile_options(-m32)
+    add_link_options(-m32)
+endif()
+
+add_compile_options(-Wall)
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 11)
\ No newline at end of file
diff --git a/src/AccCTR/tests/ut/src/CMakeLists.txt b/src/AccCTR/tests/ut/src/CMakeLists.txt
index a4c631e8..3da58244 100644
--- a/src/AccCTR/tests/ut/src/CMakeLists.txt
+++ b/src/AccCTR/tests/ut/src/CMakeLists.txt
@@ -19,6 +19,11 @@ set(OCK_CTR_UTIL_INSTALL_DIR ${PROJECT_SOURCE_DIR}/install)
 set(OCK_CTR_SRC_DIR ${PROJECT_SOURCE_DIR}/src)
 message("src" ${OCK_CTR_SRC_DIR})
 
+# Pull in the shared toolchain settings for all components
+include("${CMAKE_CURRENT_SOURCE_DIR}/../conf/toolchain.cmake")
+set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../src)
+set(TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../)
+
 file(GLOB_RECURSE TEST_UNIQUE_FILES *.cpp *.h)
 add_executable(test_unique_files ${TEST_UNIQUE_FILES})
 include_directories(${OCK_CTR_UTIL_INSTALL_DIR}/googletest-release-1.8.1/include)
@@ -29,17 +34,36 @@
 SET(LIB_3RD_GTEST ${OCK_CTR_UTIL_INSTALL_DIR}/googletest-release-1.8.1/lib64/lib
 
 message(${OCK_CTR_SRC_DIR}/include)
 
+# Library search paths
+target_link_directories(test_unique_files
+        PUBLIC
+        ${PROJECT_SOURCE_DIR}/output/ock_ctr_common/lib
+        )
+# Header search paths
 target_include_directories(test_unique_files
     PUBLIC
-    ${OCK_CTR_SRC_DIR}/include)
+    ${OCK_CTR_SRC_DIR}/include
+    ${PROJECT_SOURCE_DIR}
+    ${OCK_CTR_SRC_DIR}/common/util
+    )
 
+# Libraries to link
 target_link_libraries(test_unique_files PUBLIC -Wl,--start-group
+        _ock_ctr_common
         pthread
         dl
         ${LIB_3RD_GTEST}
         ${LIB_3RD_GMOCK}
         -Wl,--end-group)
+
+# Print the resolved build options
+get_target_property(COMPILE_FLAGS test_unique_files COMPILE_OPTIONS)
+get_target_property(LINK_FLAGS test_unique_files LINK_OPTIONS)
+message(STATUS "Compiler id: ${CMAKE_CXX_COMPILER_ID}")
+message(STATUS "Compile flags: ${COMPILE_FLAGS}")
+message(STATUS "Link flags: ${LINK_FLAGS}")
+message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}")
+
diff --git a/src/AccCTR/tests/ut/src/common.h b/src/AccCTR/tests/ut/src/common.h
new file mode 100644
index 00000000..7302d10c
--- /dev/null
+++ b/src/AccCTR/tests/ut/src/common.h
@@ -0,0 +1,64 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/
+
+#ifndef CTR_COMMON_H
+#define CTR_COMMON_H
+#include 
+
+#include "factory.h"
+
+extern ock::ctr::FactoryPtr factory;
+
+enum CTRLogLevel {
+    DEBUG = 0,
+    INFO,
+    WARN,
+    ERROR,
+};
+
+class SimpleThreadPool {
+public:
+    static void SyncRun(const std::vector<std::function<void()>> &tasks)
+    {
+        std::vector<std::future<void>> futs;
+        for (auto &task : tasks) {
+            futs.push_back(std::async(task));
+        }
+        for (auto &fut : futs) {
+            fut.wait();
+        }
+    }
+};
+
+static void CTRLog(int level, const char *msg)
+{
+    switch (level) {
+        case CTRLogLevel::DEBUG:
+            std::cout << "DEBUG:" << msg << std::endl;
+            break;
+        case CTRLogLevel::INFO:
+            std::cout << "INFO:" << msg << std::endl;
+            break;
+        case CTRLogLevel::WARN:
+            std::cout << "WARN:" << msg << std::endl;
+            break;
+        case CTRLogLevel::ERROR:
+            std::cout << "ERROR:" << msg << std::endl;
+            break;
+        default:
+            break;
+    }
+}
+
+#endif // CTR_COMMON_H
diff --git a/src/AccCTR/tests/ut/src/emb_cache_test.cpp b/src/AccCTR/tests/ut/src/emb_cache_test.cpp
new file mode 100644
index 00000000..dda5423c
--- /dev/null
+++ b/src/AccCTR/tests/ut/src/emb_cache_test.cpp
@@ -0,0 +1,1999 @@
+/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+ ==============================================================================*/ + +#include +#include + +#include "common/util/error_code.h" +#include "emb_cache_test.h" +#include "common.h" + +using namespace std; +using namespace ock::ctr; + +FactoryPtr factory; +EmbCacheManagerPtr embCache = nullptr; + +std::vector GenKeys(uint64_t n, uint32_t seed = 0, uint64_t min = 0, uint64_t max = UINT64_MAX) +{ + std::mt19937 generator(seed); + std::uniform_int_distribution distribution(min, max); + std::vector data(n); + for (uint64_t &x : data) { + x = distribution(generator); + } + sort(data.begin(), data.end()); + data.erase(unique(data.begin(), data.end()), data.end()); + return data; +} + +std::vector GenUniqueKeys(uint64_t n) +{ + std::vector data(n); + for (uint64_t i = 0; i < n; i++) { + data[i] = i; + } + return data; +} + +EmbCacheManagerPtr EmbCacheTest::SimpleCreateTable(std::string tableName, uint32_t hostVocabSize, + uint32_t embeddingSize, uint32_t extEmbeddingSize, uint32_t devVocabSize, pair normalPara, + float constPara) +{ + factory->CreateEmbCacheManager(embCache); + EmbCache::EmbCacheInfo embCacheInfo(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + EmbCache::NormalInitializerInfo normalInitializerInfo(normalPara.first, normalPara.second, 0, 1.0); + std::string normalInitializeName = "random_normal_initializer"; + EmbCache::InitializerInfo normalInitializeInfo(normalInitializeName, 0, embeddingSize, normalInitializerInfo); + + EmbCache::ConstantInitializerInfo constantInitializerInfo(constPara, 1.0); + std::string constantInitializeName = "constant_initializer"; + + std::vector initializeInfos(extEmbeddingSize / embeddingSize); + initializeInfos[0] = normalInitializeInfo; + for (uint64_t i = 1; i < initializeInfos.size(); i++) { + initializeInfos[i] = EmbCache::InitializerInfo(constantInitializeName, embeddingSize * i, embeddingSize, + constantInitializerInfo); + } + int ret = embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize, 1); + if (ret != H_OK) { + string msg = "CreateCacheForTable Failed. ret: " + std::to_string(ret); + CTRLog(CTRLogLevel::ERROR, msg.c_str()); + return nullptr; + } + return embCache; +} + +EmbCacheManagerPtr EmbCacheTest::ConstZeroCreateTable(std::string tableName, uint32_t hostVocabSize, + uint32_t embeddingSize, uint32_t extEmbeddingSize, uint32_t devVocabSize, uint64_t prefillBufferSize, + uint8_t prefillThreadNum) +{ + factory->CreateEmbCacheManager(embCache); + EmbCache::EmbCacheInfo embCacheInfo(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + EmbCache::ConstantInitializerInfo constantInitializerInfo(0.0, 1.0); + std::string constantInitializeName = "constant_initializer"; + + std::vector initializeInfos = { EmbCache::InitializerInfo(constantInitializeName, 0, + extEmbeddingSize, constantInitializerInfo) }; + int ret = embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, prefillBufferSize, prefillThreadNum); + if (ret != H_OK) { + string msg = "CreateCacheForTable Failed. 
ret: " + std::to_string(ret); + CTRLog(CTRLogLevel::ERROR, msg.c_str()); + return nullptr; + } + return embCache; +} + +void EmbCacheTest::SetUpTestCase() +{ + Factory::Create(factory); + factory->SetExternalLogFuncInner(CTRLog); +} + +void EmbCacheTest::TearDownTestCase() {} + +void EmbCacheTest::SetUp() {} + +void EmbCacheTest::TearDown() +{ + if (embCache != nullptr) { + embCache->Destroy(); + embCache = nullptr; + } +} + +TEST_F(EmbCacheTest, ConstantInitializerInfo) +{ + CTRLog(CTRLogLevel::INFO, "===========ConstantInitializerInfo start============="); + + // 正确初始化ConstantInitializerInfo结构体,无日志信息反馈 + EmbCache::ConstantInitializerInfo constantInitializerInfo(0.233, 1.0); + CTRLog(CTRLogLevel::INFO, "===========ConstantInitializerInfo end============="); +} + +TEST_F(EmbCacheTest, NormalInitializerInfo) +{ + CTRLog(CTRLogLevel::INFO, "===========NormalInitializerInfo start============="); + // 正确初始化NormalInitializerInfo结构体,无日志信息反馈 + EmbCache::NormalInitializerInfo normalInitializerInfo(0, 0.05, 0, 1.0); + // 标准差负值数学意义不明,传入负值问题用户自己承担 + EmbCache::NormalInitializerInfo normalInitializerInfo_ne_dev(0, -0.05, 0, 1.0); + CTRLog(CTRLogLevel::INFO, "===========NormalInitializerInfo end============="); +} + +TEST_F(EmbCacheTest, InitializerInfo) +{ + CTRLog(CTRLogLevel::INFO, "===========InitializerInfo start============="); + uint32_t embeddingSize = 13; + + EmbCache::NormalInitializerInfo normalInitializerInfo(0, 0.05, 0, 1.0); + EmbCache::ConstantInitializerInfo constantInitializerInfo(0.233, 1.0); + + // 传入的std::string不为"constant_initializer" 日志打印"Invalid Initializer Type." + std::string not_a_initializer_name = "not_a_initializer_name"; + EmbCache::InitializerInfo constantInitializeInfo = + EmbCache::InitializerInfo(not_a_initializer_name, embeddingSize, embeddingSize, constantInitializerInfo); + + // 传入的std::string不为"constant_initializer" 日志打印"Invalid Initializer Type." + not_a_initializer_name = ""; + constantInitializeInfo = + EmbCache::InitializerInfo(not_a_initializer_name, embeddingSize, embeddingSize, constantInitializerInfo); + + // 正确初始化InitializeInfo结构体,无日志信息反馈 + std::string constantInitializeName = "constant_initializer"; + constantInitializeInfo = + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize + 1, constantInitializerInfo); + + // 传入的std::string不为"random_normal_initializer"或truncated_normal_initializer 日志打印"Invalid Initializer + // Type." + not_a_initializer_name = "not_a_initializer_name"; + EmbCache::InitializerInfo normalInitializeInfo = + EmbCache::InitializerInfo(not_a_initializer_name, embeddingSize, embeddingSize, normalInitializerInfo); + + // 传入的std::string不为"random_normal_initializer"或truncated_normal_initializer 日志打印"Invalid Initializer + // Type." 
+ not_a_initializer_name = ""; + normalInitializeInfo = + EmbCache::InitializerInfo(not_a_initializer_name, embeddingSize, embeddingSize, normalInitializerInfo); + + // 正确初始化InitializeInfo结构体,无日志信息反馈 + std::string normalInitializeName = "random_normal_initializer"; + normalInitializeInfo = EmbCache::InitializerInfo(normalInitializeName, 0, embeddingSize, normalInitializerInfo); + + // 正确初始化InitializeInfo结构体,无日志信息反馈 + std::string truncatedNormalInitializeName = "truncated_normal_initializer"; + EmbCache::InitializerInfo truncatedNormalInitializeInfo = + EmbCache::InitializerInfo(truncatedNormalInitializeName, 0, embeddingSize, normalInitializerInfo); + + CTRLog(CTRLogLevel::INFO, "===========InitializerInfo end============="); +} + +TEST_F(EmbCacheTest, EmbCacheInfo) +{ + CTRLog(CTRLogLevel::INFO, "===========EmbCacheInfo start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + // 正确初始化EmbCacheInfo结构体,无日志信息反馈 + EmbCache::EmbCacheInfo embCacheInfo(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + CTRLog(CTRLogLevel::INFO, "===========EmbCacheInfo end============="); +} + +TEST_F(EmbCacheTest, CreateCacheForTable) +{ + factory->CreateEmbCacheManager(embCache); + CTRLog(CTRLogLevel::INFO, "===========CreateCacheForTable start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + EmbCache::EmbCacheInfo embCacheInfo(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, {}, -1, hostVocabSize), H_INITIALIZER_INVALID); + + EmbCache::NormalInitializerInfo normalInitializerInfo(0, 0.05, 0, 1.0); + std::string normalInitializeName = "random_normal_initializer"; + EmbCache::InitializerInfo normalInitializeInfo(normalInitializeName, 0, embeddingSize, normalInitializerInfo); + + // 空initializer 日志打印出"Initializer is nullptr" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, { {}, {} }, -1, hostVocabSize), H_INITIALIZER_INVALID); + + normalInitializeInfo.initializer = nullptr; + // 空initializer 日志打印出"Initializer is nullptr" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, { normalInitializeInfo }, -1, hostVocabSize), + H_INITIALIZER_INVALID); + + normalInitializeInfo = EmbCache::InitializerInfo(normalInitializeName, 0, embeddingSize, normalInitializerInfo); + EmbCache::ConstantInitializerInfo constantInitializerInfo(0.233, 1.0); + std::string constantInitializeName = "constant_initializer"; + EmbCache::InitializerInfo constantInitializeInfo(constantInitializeName, embeddingSize, embeddingSize + 1, + constantInitializerInfo); + std::vector initializeInfos = { normalInitializeInfo, constantInitializeInfo }; + + // initializerInfos的区间之间有重叠或者遗漏 日志打印出"Initializers got coverage problems" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_INITIALIZER_INVALID); + + constantInitializeInfo = + EmbCache::InitializerInfo(constantInitializeName, embeddingSize + 1, embeddingSize, constantInitializerInfo); + initializeInfos = { normalInitializeInfo, constantInitializeInfo }; + // initializerInfos的区间之间有重叠或者遗漏 日志打印出"Initializers got coverage problems" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_INITIALIZER_INVALID); + + + embCacheInfo.extEmbeddingSize = 
extEmbeddingSize; + std::string not_a_initializer_name = "not_a_initializer_name"; + constantInitializeInfo = + EmbCache::InitializerInfo(not_a_initializer_name, embeddingSize, embeddingSize, constantInitializerInfo); + initializeInfos = { normalInitializeInfo, constantInitializeInfo }; + + // 传入的Initializer的name不符要求 日志打印出"Invalid Initializer Type.\nInitializer is nullptr" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_INITIALIZER_INVALID); + + constantInitializeInfo = + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize, constantInitializerInfo); + initializeInfos = { normalInitializeInfo, constantInitializeInfo }; + + embCacheInfo.extEmbeddingSize++; + + // 传入的embInfo中的传入的extEmbeddingSize并非embeddingSize的整数倍 日志打印出"extEmbeddingSize = embeddingSize + + // optimizerSize, which is divisible by embeddingSize" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), + H_EXT_EMBEDDING_SIZE_INVALID); + + embCacheInfo.maxCacheSize = 100; + // maxCacheSize>vocabSize 日志打印出"vocabSize must be greater than or equal to maxCacheSize" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), + H_HOST_VOCAB_SIZE_TOO_SMALL); + embCacheInfo.maxCacheSize = devVocabSize; + + embCacheInfo.extEmbeddingSize = 0; + // extEmbeddingSize为0 日志打印出"size must be positive" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_SIZE_ZERO); + embCacheInfo.extEmbeddingSize = extEmbeddingSize; + + embCacheInfo.embeddingSize = 0; + // embeddingSize为0 日志打印出"size must be positive" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_SIZE_ZERO); + embCacheInfo.embeddingSize = embeddingSize; + + embCacheInfo.vocabSize = 0; + // vocabSize为0 日志打印出"size must be positive" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_SIZE_ZERO); + embCacheInfo.vocabSize = hostVocabSize; + + embCacheInfo.maxCacheSize = 0; + // maxCacheSize为0 日志打印出"size must be positive" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_SIZE_ZERO); + embCacheInfo.maxCacheSize = devVocabSize; + + embCacheInfo.tableName = ""; + // 传入的tableName空 日志打印出"tableName can not be empty" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_TABLE_NAME_EMPTY); + + embCacheInfo.tableName = + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + 
"0000000001000000000100000000010001"; + // 传入的tableName长度正好为长度上限1024 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_OK); + + embCacheInfo.tableName = + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001" + "00000000010000000001000000000100012"; + // 传入的tableName长度为1025超过了长度上限 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_TABLE_NAME_TOO_LONG); + embCacheInfo.tableName = tableName; + + // 正常创建 日志中不会打印异常信息 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_OK); + + // 重复创建同名Table 日志打印出"This table has already been created" + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), + H_TABLE_CREATE_DUPLICATE); + embCache->Destroy(); + + // Destroy后仍能正常创建 日志中不会打印异常信息 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize), H_OK); + embCache->Destroy(); + + // prefill单线程 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, 3, 1), H_OK); + embCache->Destroy(); + + // prefill多线程 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, 3, 3), H_OK); + embCache->Destroy(); + + // prefill多线程 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, 3, 0), H_THREAD_NUM_ERROR); + embCache->Destroy(); + + // prefill过多线程 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, 3, 10000), H_THREAD_NUM_ERROR); + embCache->Destroy(); + + // prefill 正常buffersize + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, 3, 1), H_OK); + embCache->Destroy(); + + // prefill 超大buffersize + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, 10, 1), H_PREFILL_BUFFER_SIZE_INVALID); + embCache->Destroy(); + + // prefill 0buffersize + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, 0, 1), H_PREFILL_BUFFER_SIZE_INVALID); + CTRLog(CTRLogLevel::INFO, "===========CreateCacheForTable end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_LOOKUP_ADDRS) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_ADDRS start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::vector lookupKeys; + std::vector addrs; + + lookupKeys = { 0, 1, 2, 3, 4 }; + 
ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs), H_OK); + + // lookupkeys 为空 + lookupKeys = {}; + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs), H_OK); + + lookupKeys = { 0 }; + ASSERT_EQ(embCache->EmbeddingLookupAddrs("not_a_table", lookupKeys, addrs), H_TABLE_NOT_EXIST); + + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tooLongTableName, lookupKeys, addrs), H_TABLE_NAME_TOO_LONG); + + lookupKeys = { 5 }; + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs), H_HOST_VOCAB_SIZE_TOO_SMALL); + + lookupKeys = { 5 }; + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs, 1), H_HOST_VOCAB_SIZE_TOO_SMALL); + + lookupKeys = { 0, 1, 4 }; + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs), H_OK); + + lookupKeys = { 0, 1, 4 }; + uint32_t threadNum = std::thread::hardware_concurrency(); + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs, threadNum + 1), H_THREAD_NUM_ERROR); + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs, threadNum), H_OK); + // 单线程lookup + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs, 1), H_OK); + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs, 0), H_THREAD_NUM_ERROR); + embCache->Destroy(); +} + +TEST_F(EmbCacheTest, EMBEDDING_LOOKUP_ADDRS_DATA) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_ADDRS_DATA start============="); + factory->CreateEmbCacheManager(embCache); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 3000000; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 39; + uint32_t devVocabSize = 100000; + EmbCache::EmbCacheInfo embCacheInfo(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::string normalInitializeName = "random_normal_initializer"; + std::string constantInitializeName = "constant_initializer"; + EmbCache::NormalInitializerInfo normalInitializerInfo(0, 0.05, 0, 1.0); + EmbCache::ConstantInitializerInfo constantInitializerInfo(0.233, 1.0); + + std::string truncatedNormalInitializeName = "truncated_normal_initializer"; + // 加入所有初始化器的所有分支 + std::vector initializeInfos = { + EmbCache::InitializerInfo(normalInitializeName, 0, embeddingSize, normalInitializerInfo), + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, 0, normalInitializerInfo), + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize, constantInitializerInfo), + EmbCache::InitializerInfo(constantInitializeName, 2 * embeddingSize, 0, constantInitializerInfo), + EmbCache::InitializerInfo(truncatedNormalInitializeName, 2 * embeddingSize, embeddingSize, + normalInitializerInfo), + EmbCache::InitializerInfo(truncatedNormalInitializeName, 3 * embeddingSize, 0, normalInitializerInfo), + }; + // 正确创建 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos), H_OK); + std::vector lookupKeys; + std::vector addrs; + lookupKeys = GenKeys(hostVocabSize, 123321); + ASSERT_EQ(embCache->EmbeddingLookupAddrs(tableName, lookupKeys, addrs), H_OK); + + long double sum = 0.0; + long double cnt = 0.0; + long double accum = 0.0; + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + // normalInitializer 生成数据 + for (uint32_t j = 0; j < embeddingSize; j++) { + sum += addrs[i][j]; + cnt++; + } + + // constantInitializer 生成数据 + for (uint32_t j = embeddingSize; j < 2 * embeddingSize; j++) { + ASSERT_LE(std::abs(addrs[i][j] - 0.233), 1e-6f); + } + // truncatedNormalInitializer 生成数据 + for (uint32_t j = 2 * 
embeddingSize; j < 3 * embeddingSize; j++) { + // 在[-2*stddev, 2*stddev]范围中 + ASSERT_LE(std::abs(addrs[i][j]), 0.1f + 1e-6f); + } + } + + long double mean = sum / cnt; + for (uint32_t i = 0; i < lookupKeys.size(); ++i) { + for (uint32_t j = 0; j < embeddingSize; j++) { + accum += (addrs[i][j] - mean) * (addrs[i][j] - mean); + } + } + long double stdev = sqrt(accum / cnt); + ASSERT_LE(std::abs(mean), 5e-6f); + ASSERT_LE(std::abs(stdev - 0.05), 5e-6f); + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_ADDRS_DATA end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_LOOKUP_300W) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_300W start============="); + factory->CreateEmbCacheManager(embCache); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 3000000; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 39; + uint32_t devVocabSize = 100000; + EmbCache::EmbCacheInfo embCacheInfo(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::string normalInitializeName = "random_normal_initializer"; + std::string constantInitializeName = "constant_initializer"; + EmbCache::NormalInitializerInfo normalInitializerInfo(0, 0.05, 0, 1.0); + EmbCache::ConstantInitializerInfo constantInitializerInfo(0.233, 1.0); + + std::string truncatedNormalInitializeName = "truncated_normal_initializer"; + // 加入所有初始化器的所有分支 + std::vector initializeInfos = { + EmbCache::InitializerInfo(normalInitializeName, 0, embeddingSize, normalInitializerInfo), + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, 0, normalInitializerInfo), + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize, constantInitializerInfo), + EmbCache::InitializerInfo(constantInitializeName, 2 * embeddingSize, 0, constantInitializerInfo), + EmbCache::InitializerInfo(truncatedNormalInitializeName, 2 * embeddingSize, embeddingSize, + normalInitializerInfo), + EmbCache::InitializerInfo(truncatedNormalInitializeName, 3 * embeddingSize, 0, normalInitializerInfo), + }; + // 正确创建 + ASSERT_EQ(embCache->CreateCacheForTable(embCacheInfo, initializeInfos), H_OK); + std::vector lookupKeys; + float *addr; + lookupKeys = GenKeys(hostVocabSize, 123321); + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + + long double sum = 0.0; + long double cnt = 0.0; + long double accum = 0.0; + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + // normalInitializer 生成数据 + for (uint32_t j = 0; j < embeddingSize; j++) { + sum += addr[i * extEmbeddingSize + j]; + cnt++; + } + + // constantInitializer 生成数据 + for (uint32_t j = embeddingSize; j < 2 * embeddingSize; j++) { + ASSERT_LE(std::abs(addr[i * extEmbeddingSize + j] - 0.233), 1e-6f); + } + // truncatedNormalInitializer 生成数据 + for (uint32_t j = 2 * embeddingSize; j < 3 * embeddingSize; j++) { + // 在[-2*stddev, 2*stddev]范围中 + ASSERT_LE(std::abs(addr[i * extEmbeddingSize + j]), 0.1f + 1e-6f); + } + } + + long double mean = sum / cnt; + for (uint32_t i = 0; i < lookupKeys.size(); ++i) { + for (uint32_t j = 0; j < embeddingSize; j++) { + accum += (addr[i * extEmbeddingSize + j] - mean) * (addr[i * extEmbeddingSize + j] - mean); + } + } + long double stdev = sqrt(accum / cnt); + ASSERT_LE(std::abs(mean), 5e-6f); + ASSERT_LE(std::abs(stdev - 0.05), 5e-6f); + free(addr); + CTRLog(CTRLogLevel::INFO, "===========GenerateData end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_LOOKUP_AND_REMOVE) +{ + 
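+    // Semantics under test: EmbeddingLookupAndRemove returns the current embeddings and then
+    // evicts the keys from the host table, so a later lookup re-initializes them (see the
+    // const-zero verification in EMBEDDING_LOOKUP_AND_REMOVE_300W below).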
CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_AND_REMOVE start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::vector lookupKeys; + float *addr; + + lookupKeys = { 0, 1, 2, 3, 4 }; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr), H_OK); + free(addr); + + // lookupkeys 为空 + lookupKeys = {}; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr), H_OK); + free(addr); + + lookupKeys = { 0 }; + addr = nullptr; + ASSERT_EQ(embCache->EmbeddingLookupAndRemove("not_a_table", lookupKeys, addr), H_TABLE_NOT_EXIST); + + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tooLongTableName, lookupKeys, addr), H_TABLE_NAME_TOO_LONG); + + lookupKeys = { 0 }; + addr = nullptr; + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr), H_ADDRESS_NULL); + + lookupKeys = { 0, 1, 4 }; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + uint32_t threadNum = std::thread::hardware_concurrency(); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr, threadNum + 1), H_THREAD_NUM_ERROR); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr, threadNum), H_OK); + // 单线程lookup + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr, 1), H_OK); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr, 0), H_THREAD_NUM_ERROR); + free(addr); + embCache->Destroy(); + + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_AND_REMOVE end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_LOOKUP_AND_REMOVE_2) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_AND_REMOVE_2 start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 200; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::vector lookupKeys; + float *addr; + + for (int i = 0; i < 100; i++) { + for (int j = 0; j < 2; j++) { + lookupKeys.emplace_back(i); + } + } + + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr, 1), H_OK); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr), H_OK); + free(addr); + embCache->Destroy(); + + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_AND_REMOVE_2 end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_LOOKUP) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::vector lookupKeys; + float *addr; + + lookupKeys = { 0, 1, 2, 3, 4 }; + addr = (float *)malloc(lookupKeys.size() * 
extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + free(addr); + + // lookupkeys 为空 + lookupKeys = {}; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + free(addr); + + lookupKeys = { 0 }; + addr = nullptr; + ASSERT_EQ(embCache->EmbeddingLookup("not_a_table", lookupKeys, addr), H_TABLE_NOT_EXIST); + + ASSERT_EQ(embCache->EmbeddingLookup(tooLongTableName, lookupKeys, addr), H_TABLE_NAME_TOO_LONG); + + lookupKeys = { 5 }; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_HOST_VOCAB_SIZE_TOO_SMALL); + free(addr); + + lookupKeys = { 0 }; + addr = nullptr; + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_ADDRESS_NULL); + + lookupKeys = { 0, 1, 4 }; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + free(addr); + + lookupKeys = { 0, 1, 4 }; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + uint32_t threadNum = std::thread::hardware_concurrency(); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr, threadNum + 1), H_THREAD_NUM_ERROR); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr, threadNum), H_OK); + // 单线程lookup + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr, 1), H_OK); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr, 0), H_THREAD_NUM_ERROR); + free(addr); + embCache->Destroy(); + + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_LOOKUP_AND_REMOVE_300W) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_AND_REMOVE_300W start============="); + std::string tableName = "test_table"; + std::vector lookupKeys; + float *newEmb; + + // 300w个key + uint32_t hostVocabSize = 3000000; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 100000; + embCache = ConstZeroCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + lookupKeys = GenUniqueKeys(hostVocabSize); + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + for (uint32_t j = 0; j < extEmbeddingSize; j++) { + newEmb[i * extEmbeddingSize + j] = i + 0.01f * j; // 生成特殊数据 + } + } + CTRLog(CTRLogLevel::INFO, "gen done"); + // 把特殊数据放到表中 + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + CTRLog(CTRLogLevel::INFO, "EmbeddingUpdate done"); + + float *addr; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + // 查询特殊数据 + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + CTRLog(CTRLogLevel::INFO, "EmbeddingLookup done"); + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + for (uint32_t j = 0; j < extEmbeddingSize; j++) { + // 验证表中数据正确性 + ASSERT_LE(std::abs(addr[i * extEmbeddingSize + j] - (i + 0.01f * j)), 1e-6f); + } + } + free(addr); + addr = nullptr; + + // Remove之后再Lookup,观察这些embedding是不是被正确remove + // 首先确认EmbeddingLookupAndRemove不会报错 + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookupAndRemove(tableName, lookupKeys, addr, 4), H_OK); + for (uint32_t i = 0; i < lookupKeys.size(); 
i++) { + for (uint32_t j = 0; j < extEmbeddingSize; j++) { + // 验证表中数据正确性 + ASSERT_LE(std::abs(addr[i * extEmbeddingSize + j] - (i + 0.01f * j)), 1e-6f); + } + } + free(addr); + addr = nullptr; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + // 然后再lookup,并确保lookup不会报错 + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + // 因为用const zero初始化, EmbeddingLookupAndRemove之后再lookup,结果应该全是0 + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + for (uint32_t j = 0; j < extEmbeddingSize; j++) { + // 验证表中数据正确性 + ASSERT_LE(std::abs(addr[i * extEmbeddingSize + j] - 0), 1e-6f); + } + } + free(addr); + + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_LOOKUP_AND_REMOVE_300W end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_UPDATE_300W) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_UPDATE_300W start============="); + std::string tableName = "test_table"; + std::vector lookupKeys; + float *newEmb; + + // 300w个key + uint32_t hostVocabSize = 3000000; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 100000; + embCache = ConstZeroCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize, 50000, 6); + lookupKeys = GenKeys(hostVocabSize, 123321); + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + for (uint32_t j = 0; j < extEmbeddingSize; j++) { + newEmb[i * extEmbeddingSize + j] = i + 0.01f * j; // 生成特殊数据 + } + } + CTRLog(CTRLogLevel::INFO, "gen done"); + // 把特殊数据放到表中 + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + CTRLog(CTRLogLevel::INFO, "EmbeddingUpdate done"); + + float *addr; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + // 查询特殊数据 + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + CTRLog(CTRLogLevel::INFO, "EmbeddingLookup done"); + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + for (uint32_t j = 0; j < extEmbeddingSize; j++) { + // 验证表中数据正确性 + ASSERT_LE(std::abs(addr[i * extEmbeddingSize + j] - (i + 0.01f * j)), 1e-6f); + } + } + // Remove之后再Lookup,观察这些embedding是不是被正确remove + // 首先确认remove不会报错 + ASSERT_EQ(embCache->RemoveEmbsByKeys(tableName, lookupKeys), H_OK); + // 然后再lookup,并确保lookup不会报错 + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + // 因为用const zero初始化, 删除之后再lookup,结果应该全是0 + for (uint32_t i = 0; i < lookupKeys.size(); i++) { + for (uint32_t j = 0; j < extEmbeddingSize; j++) { + // 验证表中数据正确性 + ASSERT_LE(std::abs(addr[i * extEmbeddingSize + j] - 0), 1e-6f); + } + } + free(addr); + + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_UPDATE_300W end============="); +} + +TEST_F(EmbCacheTest, EMBEDDING_UPDATE) +{ + CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_UPDATE start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::vector lookupKeys; + float *newEmb; + + lookupKeys = { 0, 1, 2, 3, 4 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + + // 更新存在的table,应当正常更新 + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + + lookupKeys = 
{ 0 }; + newEmb = nullptr; + // 更新不存在的table + ASSERT_EQ(embCache->EmbeddingUpdate("not_a_table", lookupKeys, newEmb), H_TABLE_NOT_EXIST); + + // 表名超过上限 + ASSERT_EQ(embCache->EmbeddingUpdate(tooLongTableName, lookupKeys, newEmb), H_TABLE_NAME_TOO_LONG); + + lookupKeys = { 5 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + + // 当前embLocalTable中存储的key已达到hostVocabSize上限,并继续添加新key + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_HOST_VOCAB_SIZE_TOO_SMALL); + free(newEmb); + + lookupKeys = { 0 }; + newEmb = nullptr; + // 传入embAddr为空指针 + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_ADDRESS_NULL); + + // 更新存在于table的keys, 传入embAddr不为空指针 + lookupKeys = { 0, 1, 4 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + + // 线程数未超过核数 + lookupKeys = { 0, 1, 4 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb, 4), H_OK); + free(newEmb); + + // 线程数等于核数 + uint32_t processCoreNum = std::thread::hardware_concurrency(); + lookupKeys = { 0, 1, 4 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb, processCoreNum), H_OK); + free(newEmb); + + // 线程数大于核数 + processCoreNum = std::thread::hardware_concurrency(); + lookupKeys = { 0, 1, 4 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb, processCoreNum + 1), H_THREAD_NUM_ERROR); + free(newEmb); + + // 线程数为0 + processCoreNum = std::thread::hardware_concurrency(); + lookupKeys = { 0, 1, 4 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb, 0), H_THREAD_NUM_ERROR); + free(newEmb); + + // 线程数为1 + lookupKeys = { 0, 1, 4 }; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb, 1), H_OK); + free(newEmb); + + // lookupkeys为空 + lookupKeys = {}; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb, 1), H_OK); + free(newEmb); + + TearDown(); + + // 更新不存在于table的key,且当前embLocalTable中存储的key未达到hostVocabSize上限,继续添加新key + tableName = "test_table_one"; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + lookupKeys = { 0, 1 }; + newEmb = 
(float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float));
+    for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) {
+        newEmb[i] = 0.01f * i;
+    }
+    embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb);
+    free(newEmb);
+    lookupKeys = { 2, 3 };
+    newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float));
+    for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) {
+        newEmb[i] = 0.01f * i;
+    }
+    ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK);
+    free(newEmb);
+
+    CTRLog(CTRLogLevel::INFO, "===========EMBEDDING_UPDATE end=============");
+}
+
+TEST_F(EmbCacheTest, GetSwapPairsAndKey2Offset)
+{
+    CTRLog(CTRLogLevel::INFO, "===========GetSwapPairsAndKey2Offset start=============");
+    std::string tableName = "test_table";
+    uint32_t hostVocabSize = 100;
+    uint32_t embeddingSize = 13;
+    uint32_t extEmbeddingSize = 26;
+    uint32_t devVocabSize = 10;
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+    std::vector<uint64_t> insertKeys;
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair, swapOutKoPair;
+
+    // use a table that does not exist
+    insertKeys = { 0, 1, 2, 3, 4 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset("not_a_table", insertKeys, swapInKoPair, swapOutKoPair),
+        H_TABLE_NOT_EXIST);
+
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tooLongTableName, insertKeys, swapInKoPair, swapOutKoPair),
+        H_TABLE_NAME_TOO_LONG);
+
+    // normal lookup of keys that do not exist yet
+    insertKeys = { 0, 1, 2, 3, 4 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair, swapOutKoPair), H_OK);
+    bool ret1 = true;
+    for (uint64_t i = 0; i < swapInKoPair.first.size(); i++) {
+        if (swapInKoPair.first[i] != i) {
+            string msg = "the " + std::to_string(i) + "th has key " + std::to_string(swapInKoPair.first[i]) +
+                ", but expect " + std::to_string(i);
+            CTRLog(CTRLogLevel::INFO, msg.c_str());
+            ret1 = false;
+        }
+    }
+    ASSERT_EQ(ret1, true);
+
+    // normal lookup of keys that already exist
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair2, swapOutKoPair2;
+    insertKeys = { 1, 2, 3 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair2, swapOutKoPair2), H_OK);
+    uint64_t uint_zero = 0;
+    ASSERT_EQ(swapInKoPair2.first.size(), uint_zero);
+
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair3, swapOutKoPair3;
+    insertKeys = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+    // pass koPair arguments that are not empty
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair, swapOutKoPair3),
+        H_ARG_NOT_EMPTY);
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair3, swapInKoPair), H_ARG_NOT_EMPTY);
+    // inserted keys exactly reach the maxCacheSize limit
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair3, swapOutKoPair3), H_OK);
+
+    // inserted keys just exceed the maxCacheSize limit
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair4, swapOutKoPair4;
+    insertKeys = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair4, swapOutKoPair4),
+        H_MAX_CACHESIZE_TOO_SMALL);
+
+    embCache->Destroy();
+    // a single insert of keys exceeds the maxCacheSize limit
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair5, swapOutKoPair5;
+    insertKeys = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair5, swapOutKoPair5),
+        H_MAX_CACHESIZE_TOO_SMALL);
+
+    embCache->Destroy();
+    // after a single insert exactly reaches the limit, look up the existing keys again
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair6, swapOutKoPair6;
+    insertKeys = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair6, swapOutKoPair6), H_OK);
+
+    embCache->Destroy();
+    // two consecutive inserts stay below the limit; the third insert reaches it
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair7, swapOutKoPair7;
+    insertKeys = { 0, 1, 2, 3, 4 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair7, swapOutKoPair7), H_OK);
+
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair8, swapOutKoPair8;
+    insertKeys = { 5, 6, 7, 8, 9 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair8, swapOutKoPair8), H_OK);
+
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair9, swapOutKoPair9;
+    insertKeys = { 10, 11, 12, 13, 14 };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair9, swapOutKoPair9), H_OK);
+
+    embCache->Destroy();
+    // look up INVALID_KEY
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair10, swapOutKoPair10;
+    uint64_t neg_one = static_cast<uint64_t>(-1);
+    insertKeys = { neg_one, neg_one, neg_one, neg_one, neg_one };
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair10, swapOutKoPair10), H_OK);
+    ASSERT_EQ(swapInKoPair10.first.empty(), true);
+    ASSERT_EQ(swapInKoPair10.second.empty(), true);
+    ASSERT_EQ(swapOutKoPair10.first.empty(), true);
+    ASSERT_EQ(swapOutKoPair10.second.empty(), true);
+
+    // look up an empty key list
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair11, swapOutKoPair11;
+    insertKeys = {};
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, insertKeys, swapInKoPair11, swapOutKoPair11), H_OK);
+    ASSERT_EQ(swapInKoPair11.first.empty(), true);
+    ASSERT_EQ(swapInKoPair11.second.empty(), true);
+    ASSERT_EQ(swapOutKoPair11.first.empty(), true);
+    ASSERT_EQ(swapOutKoPair11.second.empty(), true);
+    CTRLog(CTRLogLevel::INFO, "===========GetSwapPairsAndKey2Offset end=============");
+}
+
+bool checkKeys(std::set<uint64_t> &keySet, std::vector<std::set<uint64_t>> &historyKeyVec,
+    const std::vector<uint64_t> &keys, const std::vector<uint64_t> &swapInKeys,
+    const std::vector<uint64_t> &swapOutKeys, uint32_t maxCacheSize)
+{
+    std::set<uint64_t> newKeys;
+    for (auto key : keys) {
+        if (keySet.find(key) == keySet.end()) {
+            newKeys.insert(key);
+        }
+        keySet.insert(key);
+    }
+    for (auto key : swapInKeys) {
+        if (newKeys.find(key) == newKeys.end()) {
+            CTRLog(CTRLogLevel::ERROR, "swapIn key error1");
+            return false;
+        }
+    }
+    if (swapInKeys.size() != newKeys.size()) {
+        CTRLog(CTRLogLevel::ERROR, "swapIn key error2");
+        return false;
+    }
+    historyKeyVec.insert(historyKeyVec.begin(), { keys.begin(), keys.end() });
+    if (historyKeyVec.size() > 2) {
+        historyKeyVec.pop_back();
+    }
+    for (auto key : swapOutKeys) {
+        // guard the second history slot: on the first call only one batch has been recorded
+        if (historyKeyVec[0].find(key) != historyKeyVec[0].end() ||
+            (historyKeyVec.size() > 1 && historyKeyVec[1].find(key) != historyKeyVec[1].end())) {
+            CTRLog(CTRLogLevel::ERROR, "swapOut key error1");
+            return false;
+        }
+    }
+    for (auto key : swapOutKeys) {
+        if (keySet.find(key) == keySet.end()) {
+            CTRLog(CTRLogLevel::ERROR, "swapOut key error2");
+            return false;
+        }
+    }
+    for (auto key : swapOutKeys) {
+        keySet.erase(key);
+    }
+    if (keySet.size() > maxCacheSize) {
+        CTRLog(CTRLogLevel::ERROR, "total key size error");
+        return false;
+    }
+    return true;
+}
+
+bool checkOffsets(std::set<uint32_t> &offsetSet, const std::vector<uint32_t> &swapInOffsets,
+    const std::vector<uint32_t> &swapOutOffset)
+{
+    for (auto offset : swapOutOffset) {
+        if (offsetSet.find(offset) == offsetSet.end()) {
+            CTRLog(CTRLogLevel::ERROR, "swapOut offset error1");
+            return false;
+        }
+    }
+
+    for (auto offset : swapOutOffset) {
+        offsetSet.erase(offset);
+    }
+
+    for (auto offset : swapInOffsets) {
+        if (offsetSet.find(offset) != offsetSet.end()) {
+            CTRLog(CTRLogLevel::ERROR, "swapIn offset error");
+            return false;
+        }
+        offsetSet.insert(offset);
+    }
+
+    return true;
+}
+
+TEST_F(EmbCacheTest, DEVICE_COMBINE_TEST)
+{
+    CTRLog(CTRLogLevel::INFO, "===========DEVICE_COMBINE_TEST start=============");
+    std::string tableName = "test_table";
+    uint32_t hostVocabSize = 4000000;
+    uint32_t embeddingSize = 13;
+    uint32_t extEmbeddingSize = 26;
+    uint32_t devVocabSize = 30000;
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+    std::set<uint64_t> keySet;
+    std::set<uint32_t> offsetSet;
+    std::vector<std::set<uint64_t>> historyKeyVec;
+    std::vector<std::set<uint32_t>> historyOffsetVec;
+    std::vector<uint64_t> lookupKeys;
+    std::vector<uint64_t> check_keys;
+    for (uint32_t i = 0; i < 50; i++) {
+        lookupKeys = GenKeys(10000, 123 + i, 0, 100000);
+        check_keys = lookupKeys;
+        std::pair<std::vector<uint64_t>, std::vector<uint32_t>> koPair1;
+        std::pair<std::vector<uint64_t>, std::vector<uint32_t>> koPair2;
+        ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, lookupKeys, koPair1, koPair2), H_OK);
+        bool retKey1 = checkKeys(keySet, historyKeyVec, check_keys, koPair1.first, koPair2.first, devVocabSize);
+        bool retOffset1 = checkOffsets(offsetSet, koPair1.second, koPair2.second);
+        ASSERT_EQ(retKey1, true);
+        ASSERT_EQ(retOffset1, true);
+    }
+
+    CTRLog(CTRLogLevel::INFO, "===========DEVICE_COMBINE_TEST end=============");
+}
+
+TEST_F(EmbCacheTest, REMOVE_KEYS)
+{
+    CTRLog(CTRLogLevel::INFO, "===========REMOVE_KEYS start=============");
+    std::string tableName = "test_table";
+    uint32_t hostVocabSize = 100;
+    uint32_t embeddingSize = 13;
+    uint32_t extEmbeddingSize = 26;
+    uint32_t devVocabSize = 10;
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+    std::vector<uint64_t> lookupKeys;
+    std::vector<uint64_t> removeKeys;
+    float *addr;
+    float *newEmb;
+
+    for (uint32_t i = 0; i < hostVocabSize - 1; i++) {
+        lookupKeys.emplace_back(i);
+        for (uint32_t j = 0; j < hostVocabSize - 1; j++) {
+            removeKeys.emplace_back(i + j);
+        }
+    }
+    addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float));
+    ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK);
+    free(addr);
+
+    // table exists
+    ASSERT_EQ(embCache->RemoveEmbsByKeys(tableName, lookupKeys), H_OK);
+
+    // table does not exist
+    ASSERT_EQ(embCache->RemoveEmbsByKeys("not_a_table", lookupKeys), H_TABLE_NOT_EXIST);
+
+    // table name exceeds the length limit
+    ASSERT_EQ(embCache->RemoveEmbsByKeys(tooLongTableName, lookupKeys), H_TABLE_NAME_TOO_LONG);
+
+    // remove INVALID_KEY
+    uint64_t neg_one = static_cast<uint64_t>(-1);
+    lookupKeys = { neg_one, neg_one, neg_one, neg_one, neg_one };
+    ASSERT_EQ(embCache->RemoveEmbsByKeys(tableName, lookupKeys), H_OK);
+
+    // verify that embLocalTable removed the stored records
+    lookupKeys = { 0, 1, 4 };
+    addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float));
+    ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK);
+    free(addr);
+
+    newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float));
+    for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) {
+        newEmb[i] = 999.99f;
+    }
+    ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK);
+    free(newEmb);
+
+    addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float));
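+    // write-remove-recheck pattern: the keys were set to the 999.99f sentinel above; after
+    // RemoveEmbsByKeys, a fresh lookup must return re-initialized values, not the sentinel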
+    ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK);
+    bool ret1 = true;
+    for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) {
+        if (fabs(addr[i] - 999.99f) > 0.0000001) {
+            ret1 = false;
+        }
+    }
+    free(addr);
+    ASSERT_EQ(ret1, true);
+
+    ASSERT_EQ(embCache->RemoveEmbsByKeys(tableName, lookupKeys), H_OK);
+
+    addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float));
+    ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK);
+    bool ret2 = true;
+    for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) {
+        if (fabs(addr[i] - 999.99f) <= 0.0000001) {
+            ret2 = false;
+        }
+    }
+    free(addr);
+    ASSERT_EQ(ret2, true);
+
+    // verify that offsetMapper removed the mapping records
+    lookupKeys = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair, swapOutKoPair;
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, lookupKeys, swapInKoPair, swapOutKoPair), H_OK);
+    removeKeys = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+    ASSERT_EQ(embCache->RemoveEmbsByKeys(tableName, removeKeys), H_OK);
+    std::vector<std::pair<uint64_t, uint32_t>> koVec;
+    ASSERT_EQ(embCache->ExportDeviceKeyOffsetPairs(tableName, koVec), H_OK);
+    bool ret3 = true;
+    for (uint32_t i = 0; i < koVec.size(); i++) {
+        if (std::find(removeKeys.begin(), removeKeys.end(), koVec[i].first) != removeKeys.end()) {
+            ret3 = false;
+        }
+    }
+    ASSERT_EQ(ret3, true);
+    // verify that keys can be added again after removal
+    lookupKeys = { 9, 10, 11, 12, 13 };
+    std::vector<uint64_t> oldKeys = lookupKeys;
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair2, swapOutKoPair2;
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, lookupKeys, swapInKoPair2, swapOutKoPair2), H_OK);
+    bool ret4 = true;
+    for (uint32_t i = 0; i < 5; i++) {
+        if (oldKeys[i] != swapInKoPair2.first[i]) {
+            ret4 = false;
+        }
+    }
+    bool ret5 = true;
+    for (uint32_t i = 0; i < 5; i++) {
+        if (lookupKeys[i] != swapInKoPair2.second[i]) {
+            ret5 = false;
+        }
+    }
+    ASSERT_EQ(ret4, true);
+    ASSERT_EQ(ret5, true);
+    ASSERT_EQ(swapInKoPair2.first.size(), 5ull);
+    ASSERT_EQ(swapInKoPair2.second.size(), 5ull);
+    ASSERT_EQ(swapOutKoPair2.first.empty(), true);
+    ASSERT_EQ(swapOutKoPair2.second.empty(), true);
+
+    removeKeys = { 9, 10, 11, 3 };
+    ASSERT_EQ(embCache->RemoveEmbsByKeys(tableName, removeKeys), H_OK);
+    std::vector<std::pair<uint64_t, uint32_t>> koVec2;
+    ASSERT_EQ(embCache->ExportDeviceKeyOffsetPairs(tableName, koVec2), H_OK);
+    bool ret6 = true;
+    for (uint32_t i = 0; i < koVec2.size(); i++) {
+        if (std::find(removeKeys.begin(), removeKeys.end(), koVec2[i].first) != removeKeys.end()) {
+            ret6 = false;
+        }
+    }
+    ASSERT_EQ(ret6, true);
+
+    // verify that keys can be added again after removal
+    lookupKeys = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    std::vector<uint64_t> oldKeys2 = lookupKeys;
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair3, swapOutKoPair3;
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, lookupKeys, swapInKoPair3, swapOutKoPair3), H_OK);
+    bool ret7 = true;
+    for (uint32_t i = 0; i < 8; i++) {
+        if (oldKeys2[i] != swapInKoPair3.first[i]) {
+            ret7 = false;
+        }
+    }
+    bool ret8 = true;
+    for (uint32_t i = 0; i < 8; i++) {
+        if (lookupKeys[i] != swapInKoPair3.second[i]) {
+            ret8 = false;
+        }
+    }
+    ASSERT_EQ(ret7, true);
+    ASSERT_EQ(ret8, true);
+    ASSERT_EQ(swapInKoPair3.first.size(), 8ull);
+    ASSERT_EQ(swapInKoPair3.second.size(), 8ull);
+    ASSERT_EQ(swapOutKoPair3.first.empty(), true);
+    ASSERT_EQ(swapOutKoPair3.second.empty(), true);
+
+    lookupKeys = { 15 };
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair4, swapOutKoPair4;
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, lookupKeys, swapInKoPair4, swapOutKoPair4), H_OK);
+
+    CTRLog(CTRLogLevel::INFO, "===========REMOVE_KEYS end=============");
+}
+
+TEST_F(EmbCacheTest, ExportDeviceKeyOffsetPairs)
+{
+    CTRLog(CTRLogLevel::INFO, "===========ExportDeviceKeyOffsetPairs start=============");
+    std::string tableName = "test_table";
+    uint32_t hostVocabSize = 10;
+    uint32_t embeddingSize = 13;
+    uint32_t extEmbeddingSize = 26;
+    uint32_t devVocabSize = 8;
+    embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+
+    // use a table name that does not exist
+    std::vector<std::pair<uint64_t, uint32_t>> koVec;
+    ASSERT_EQ(embCache->ExportDeviceKeyOffsetPairs("not_a_table", koVec), H_TABLE_NOT_EXIST);
+
+    ASSERT_EQ(embCache->ExportDeviceKeyOffsetPairs(tooLongTableName, koVec), H_TABLE_NAME_TOO_LONG);
+
+    // normal export of the key-offset pairs
+    std::vector<uint64_t> lookupKeys;
+    std::vector<uint64_t> checkKeys;
+    lookupKeys = { 6, 0, 8, 1, 3, 4 };
+    checkKeys = lookupKeys;
+    std::pair<std::vector<uint64_t>, std::vector<uint32_t>> swapInKoPair, swapOutKoPair;
+    ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, lookupKeys, swapInKoPair, swapOutKoPair), H_OK);
+    std::vector<std::pair<uint64_t, uint32_t>> koVec2;
+    ASSERT_EQ(embCache->ExportDeviceKeyOffsetPairs(tableName, koVec2), H_OK);
+    ASSERT_EQ(koVec2.size(), lookupKeys.size());
+    bool ret1 = true;
+    for (uint32_t i = 0; i < koVec2.size(); i++) {
+        if (koVec2[i].first != checkKeys[i] || koVec2[i].second != lookupKeys[i]) {
+            ret1 = false;
+        }
+    }
+    ASSERT_EQ(ret1, true);
+
+    CTRLog(CTRLogLevel::INFO, "===========ExportDeviceKeyOffsetPairs end=============");
+}
+
+TEST_F(EmbCacheTest, GetEmbTableNames)
+{
+    CTRLog(CTRLogLevel::INFO, "===========GetEmbTableNames start=============");
+    factory->CreateEmbCacheManager(embCache);
+    uint32_t hostVocabSize = 10;
+    uint32_t embeddingSize = 13;
+    uint32_t extEmbeddingSize = 26;
+    uint32_t devVocabSize = 8;
+    std::vector<std::string> tableNameVec;
+    tableNameVec.emplace_back("table1");
+    tableNameVec.emplace_back("table2");
+    tableNameVec.emplace_back("table3");
+    for (const std::string &tableName : tableNameVec) {
+        EmbCache::EmbCacheInfo embCacheInfo(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize);
+
+        EmbCache::NormalInitializerInfo normalInitializerInfo(0, 0.5, 0, 1.0);
+        std::string normalInitializeName = "random_normal_initializer";
+        EmbCache::InitializerInfo normalInitializeInfo(normalInitializeName, 0, embeddingSize, normalInitializerInfo);
+
+        EmbCache::ConstantInitializerInfo constantInitializerInfo(0.233, 1.0);
+        std::string constantInitializeName = "constant_initializer";
+        EmbCache::InitializerInfo constantInitializeInfo(constantInitializeName, embeddingSize, embeddingSize,
+            constantInitializerInfo);
+
+        std::vector<EmbCache::InitializerInfo> initializeInfos(extEmbeddingSize / embeddingSize);
+        initializeInfos[0] = normalInitializeInfo;
+        for (uint64_t i = 1; i < initializeInfos.size(); i++) {
+            initializeInfos[i] = constantInitializeInfo;
+        }
+        embCache->CreateCacheForTable(embCacheInfo, initializeInfos, -1, hostVocabSize);
+    }
+    std::vector<std::string> allTableNames;
+    std::vector<std::string> notEmptyVector = { "123" };
+    ASSERT_EQ(embCache->GetEmbTableNames(notEmptyVector), H_ARG_NOT_EMPTY);
+
+    ASSERT_EQ(embCache->GetEmbTableNames(allTableNames), H_OK);
+    bool ret1 = true;
+    for (auto tableName : allTableNames) {
+        if (std::find(tableNameVec.begin(), tableNameVec.end(), tableName) == tableNameVec.end()) {
+            ret1 = false;
+        }
+    }
+    for (auto tableName : tableNameVec) {
+        if (std::find(allTableNames.begin(), allTableNames.end(), tableName) == allTableNames.end()) {
+            ret1 = false;
+        }
+    }
+    ASSERT_EQ(ret1, true);
+
+    CTRLog(CTRLogLevel::INFO, "===========GetEmbTableNames end=============");
+}
+
+TEST_F(EmbCacheTest, 
SERIALIZE) +{ + CTRLog(CTRLogLevel::INFO, "===========SERIALIZE start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + std::vector lookupKeys; + + lookupKeys = { 0 }; + std::vector buffer; + ASSERT_EQ(embCache->Serialize("not_a_table", buffer), H_TABLE_NOT_EXIST); + // 表名超过上限 + ASSERT_EQ(embCache->Serialize(tooLongTableName, buffer), H_TABLE_NAME_TOO_LONG); + CTRLog(CTRLogLevel::INFO, "===========SERIALIZE end============="); +} + +TEST_F(EmbCacheTest, DESERIALIZE) +{ + CTRLog(CTRLogLevel::INFO, "===========DESERIALIZE start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + std::vector lookupKeys; + + lookupKeys = { 0 }; + std::vector buffer = { 'A', 'B', '1', '2' }; + ASSERT_EQ(embCache->Deserialize("not_a_table", buffer), H_TABLE_NOT_EXIST); + + ASSERT_EQ(embCache->Deserialize(tooLongTableName, buffer), H_TABLE_NAME_TOO_LONG); + + ASSERT_EQ(embCache->Deserialize(tableName, buffer), H_LOAD_ERROR); + + lookupKeys = { 0, 1, 2, 3, 4 }; + float *newEmb; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + std::vector buffer1; + ASSERT_EQ(embCache->Serialize(tableName, buffer1), H_OK); + buffer1.erase(buffer1.begin() + buffer1.size() / 2, buffer1.end()); + ASSERT_EQ(embCache->Deserialize(tableName, buffer1), H_LOAD_ERROR); + + CTRLog(CTRLogLevel::INFO, "===========DESERIALIZE end============="); +} + +TEST_F(EmbCacheTest, SERIALIZE_DESERIALIZE) +{ + CTRLog(CTRLogLevel::INFO, "===========SERIALIZE_DESERIALIZE start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + std::vector lookupKeys; + lookupKeys = { 0, 1, 2, 3, 4 }; + float *newEmb; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + + std::vector buffer1; + std::vector buffer2; + + ASSERT_EQ(embCache->Serialize(tableName, buffer1), H_OK); + ASSERT_EQ(embCache->Deserialize(tableName, buffer1), H_OK); + ASSERT_EQ(embCache->Serialize(tableName, buffer2), H_OK); + ASSERT_EQ(buffer1.size(), buffer2.size()); + for (uint64_t i = 0; i < buffer1.size(); i++) { + ASSERT_EQ(buffer1[i], buffer2[i]); + } + ASSERT_EQ(buffer1, buffer2); + CTRLog(CTRLogLevel::INFO, "===========SERIALIZE_DESERIALIZE end============="); +} + +TEST_F(EmbCacheTest, ERROR_INITIALIZER) +{ + CTRLog(CTRLogLevel::INFO, "===========ERROR_INITIALIZER start============="); + uint32_t embeddingSize = 13; + /* 对ConstantInitializerInfo的constValue和initK的校验 */ + std::string constantInitializeName = "constant_initializer"; + // 日志打印"constant 
value is less than -1000000000, and will use -1000000000.",并正常初始化InitializerInfo + EmbCache::ConstantInitializerInfo constantInitializerInfo1(-1e9 - 1e8, 1.0); + EmbCache::InitializerInfo constantInitializeInfo = + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize + 1, constantInitializerInfo1); + + // 日志打印"constant value is greater than 1000000000, and will use 1000000000.",并正常初始化InitializerInfo + EmbCache::ConstantInitializerInfo constantInitializerInfo2(1e9 + 1e8, 1.0); + constantInitializeInfo = + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize + 1, constantInitializerInfo2); + + // 日志打印"constant initK is greater than 10000, and will use 10000.",并正常初始化InitializerInfo + EmbCache::ConstantInitializerInfo constantInitializerInfo3(0.233, 10001); + constantInitializeInfo = + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize + 1, constantInitializerInfo3); + + // 日志打印"constant initK is less than -10000, and will use -10000.",并正常初始化InitializerInfo + EmbCache::ConstantInitializerInfo constantInitializerInfo4(0.233, -10001); + constantInitializeInfo = + EmbCache::InitializerInfo(constantInitializeName, embeddingSize, embeddingSize + 1, constantInitializerInfo4); + + /* 对NormalIntializerInfo的mean、stdev和initK的校验 */ + std::string normalInitializeName = "random_normal_initializer"; + // 日志打印"random normal mean param is greater than 1000000000, and will use + // 1000000000.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo1(1e9 + 1e8, 0.05, 0, 1.0); + EmbCache::InitializerInfo normalInitializeInfo = + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo1); + + // 日志打印"random normal mean param is less than -1000000000, and will use + // -1000000000.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo2(-1e9 - 1e8, 0.05, 0, 1.0); + normalInitializeInfo = + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo2); + + // 日志打印"random normal stddev param is greater than 100, and will use 100.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo3(0, 101, 0, 1.0); + normalInitializeInfo = + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo3); + + // 日志打印"random normal stddev param is less than 0, and will use 0.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo4(0, -1, 0, 1.0); + normalInitializeInfo = + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo4); + // 日志打印"random normal initK is greater than 10000, and will use 10000.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo5(0, 0.05, 0, 10001); + normalInitializeInfo = + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo5); + // 日志打印"random normal initK is less than -10000, and will use -10000.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo6(0, 0.05, 0, -10001); + normalInitializeInfo = + EmbCache::InitializerInfo(normalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo6); + + /* 对TruncatedNormalInitializer的mean、stdev以及initK的校验 */ + std::string truncatedNormalInitializeName = "truncated_normal_initializer"; + // 日志打印"truncated normal mean param is greater than 1000000000, and will use + // 1000000000.",并正常初始化InitializerInfo + 
EmbCache::NormalInitializerInfo normalInitializerInfo7(1e9 + 1e8, 0.05, 0, 1.0); + EmbCache::InitializerInfo truncatedNormalInitializeInfo = + EmbCache::InitializerInfo(truncatedNormalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo7); + + // 日志打印"truncated normal mean param is less than -1000000000, and will use + // -1000000000.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo8(-1e9 - 1e8, 0.05, 0, 1.0); + truncatedNormalInitializeInfo = + EmbCache::InitializerInfo(truncatedNormalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo8); + + // 日志打印"truncated normal stddev param is greater than 100, and will use 100.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo9(0, 101, 0, 1.0); + truncatedNormalInitializeInfo = + EmbCache::InitializerInfo(truncatedNormalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo9); + + // 日志打印"truncated normal stddev param is less than 0.000000, and will use 0.000000."并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo10(0, -1, 0, 1.0); + truncatedNormalInitializeInfo = + EmbCache::InitializerInfo(truncatedNormalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo10); + // 日志打印"truncated normal initK is greater than 10000, and will use 10000.",并正常初始化InitializerInfo + EmbCache::NormalInitializerInfo normalInitializerInfo11(0, 0.05, 0, 10001); + truncatedNormalInitializeInfo = + EmbCache::InitializerInfo(truncatedNormalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo11); + // 日志打印"truncated normal initK is less than -10000, and will use -10000." + EmbCache::NormalInitializerInfo normalInitializerInfo12(0, 0.05, 0, -10001); + truncatedNormalInitializeInfo = + EmbCache::InitializerInfo(truncatedNormalInitializeName, embeddingSize, embeddingSize, normalInitializerInfo12); + CTRLog(CTRLogLevel::INFO, "===========ERROR_INITIALIZER end============="); +} + + +TEST_F(EmbCacheTest, EmbeddingRemove) +{ + CTRLog(CTRLogLevel::INFO, "===========EmbeddingRemove start============="); + std::string tableName = "test_table"; + uint32_t hostVocabSize = 100; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint32_t devVocabSize = 100; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::vector lookupKeys; + std::vector removeKeys; + float *addr; + float *newEmb; + + for (uint32_t i = 0; i < hostVocabSize - 1; i++) { + lookupKeys.emplace_back(i); + for (uint32_t j = 0; j < hostVocabSize - 1; j++) { + removeKeys.emplace_back(i + j); + } + } + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + // 表存在 + ASSERT_EQ(embCache->EmbeddingRemove(tableName, lookupKeys), H_OK); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + // 单线程 + ASSERT_EQ(embCache->EmbeddingRemove(tableName, lookupKeys, 1), H_OK); + + free(addr); + // REMOVE空keys + std::vector emptyRemoveKeys; + ASSERT_EQ(embCache->EmbeddingRemove(tableName, emptyRemoveKeys), H_OK); + + // 表不存在 + ASSERT_EQ(embCache->EmbeddingRemove("not_a_table", lookupKeys), H_TABLE_NOT_EXIST); + // 表名超过上限 + ASSERT_EQ(embCache->EmbeddingRemove(tooLongTableName, lookupKeys), H_TABLE_NAME_TOO_LONG); + + // remove INVALID_KEY + uint64_t neg_one = -1; + lookupKeys = { neg_one, neg_one, neg_one, neg_one, neg_one }; + ASSERT_EQ(embCache->EmbeddingRemove(tableName, lookupKeys), H_OK); + 
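+    // removing entries keyed by INVALID_KEY (the all-ones value (uint64_t)-1) is expected
+    // to be a no-op that still returns H_OK, mirroring the lookup behavior for such keys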
+ // 判断embLocalTable是否remove掉记录信息 + lookupKeys = { 0, 1, 4 }; + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + free(addr); + + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 999.99f; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + bool ret1 = true; + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + if (fabs(addr[i] - 999.99f) > 0.0000001) { + ret1 = false; + } + } + free(addr); + ASSERT_EQ(ret1, true); + + ASSERT_EQ(embCache->EmbeddingRemove(tableName, lookupKeys), H_OK); + + addr = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + ASSERT_EQ(embCache->EmbeddingLookup(tableName, lookupKeys, addr), H_OK); + bool ret2 = true; + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + if (fabs(addr[i] - 999.99f) <= 0.0000001) { + ret2 = false; + } + } + free(addr); + ASSERT_EQ(ret2, true); + + // 判断offsetMapper是否remove掉记录信息 + lookupKeys = { 6, 0, 8, 1, 3, 4 }; + std::pair, std::vector> swapInKoPair, swapOutKoPair; + ASSERT_EQ(embCache->GetSwapPairsAndKey2Offset(tableName, lookupKeys, swapInKoPair, swapOutKoPair), H_OK); + removeKeys = { 0, 1, 4 }; + ASSERT_EQ(embCache->EmbeddingRemove(tableName, removeKeys), H_OK); + + CTRLog(CTRLogLevel::INFO, "===========EmbeddingRemove end============="); +} + +TEST_F(EmbCacheTest, GET_EMB_TABLE_INFO) +{ + CTRLog(CTRLogLevel::INFO, "===========GET_EMB_TABLE_INFO start============="); + std::string tableName = "test_table"; + uint64_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint64_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + std::vector lookupKeys; + lookupKeys = { 0, 1, 2, 3, 4 }; + float *newEmb; + newEmb = (float *)malloc(lookupKeys.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys.size() * extEmbeddingSize; i++) { + newEmb[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys, newEmb), H_OK); + free(newEmb); + + std::vector keys; + std::vector> embeddings; + std::vector> optimizerSlots; + + ASSERT_EQ(embCache->GetEmbTableInfos("Invalid_table_name", keys, embeddings, optimizerSlots), H_TABLE_NOT_EXIST); + ASSERT_EQ(embCache->GetEmbTableInfos(tooLongTableName, keys, embeddings, optimizerSlots), H_TABLE_NAME_TOO_LONG); + ASSERT_EQ(embCache->GetEmbTableInfos(tableName, keys, embeddings, optimizerSlots), H_OK); + bool ret = true; + if (keys.size() != 5) { + ret = false; + } + uint32_t optimizerSlotSize = extEmbeddingSize - embeddingSize; + for (auto key : keys) { + auto it = std::find(lookupKeys.begin(), lookupKeys.end(), key); + if (it == lookupKeys.end()) { + ret = false; + break; + } + uint32_t index = it - lookupKeys.begin(); + for (uint32_t i = 0; i < embeddingSize; i++) { + if (fabs(embeddings[index][i] - 0.01f * (i + index * extEmbeddingSize)) > 0.0000001) { + ret = false; + } + } + for (uint32_t i = 0; i < optimizerSlotSize; i++) { + if (fabs(optimizerSlots[index][i] - 0.01f * (i + index * extEmbeddingSize + embeddingSize)) > 0.0000001) { + ret = false; + } + } + } + 
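+    // each exported row is laid out as
+    // [ embeddingSize embedding values | (extEmbeddingSize - embeddingSize) optimizer-slot values ],
+    // which is exactly what the index arithmetic in the loops above verifies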
ASSERT_EQ(ret, true); + + std::vector keys2 = { 1, 2, 3 }; + std::vector> embeddings2; + std::vector> optimizerSlots2; + ASSERT_EQ(embCache->GetEmbTableInfos(tableName, keys2, embeddings2, optimizerSlots2), H_ARG_NOT_EMPTY); + + std::vector keys3; + std::vector> embeddings3; + std::vector> optimizerSlots3; + embeddings3.emplace_back(std::vector({ 0.1f, 0.2f })); + ASSERT_EQ(embCache->GetEmbTableInfos(tableName, keys3, embeddings3, optimizerSlots3), H_ARG_NOT_EMPTY); + + std::vector keys4; + std::vector> embeddings4; + std::vector> optimizerSlots4; + optimizerSlots4.emplace_back(std::vector({ 0.1f, 0.2f })); + ASSERT_EQ(embCache->GetEmbTableInfos(tableName, keys4, embeddings4, optimizerSlots4), H_ARG_NOT_EMPTY); + embCache->Destroy(); + + hostVocabSize = 5; + embeddingSize = 13; + extEmbeddingSize = 13; + devVocabSize = 2; + + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + std::vector lookupKeys2; + lookupKeys2 = { 0, 1, 2, 3, 4 }; + float *newEmb2; + newEmb2 = (float *)malloc(lookupKeys2.size() * extEmbeddingSize * sizeof(float)); + for (uint32_t i = 0; i < lookupKeys2.size() * extEmbeddingSize; i++) { + newEmb2[i] = 0.01f * i; + } + ASSERT_EQ(embCache->EmbeddingUpdate(tableName, lookupKeys2, newEmb2), H_OK); + free(newEmb2); + + std::vector keys5; + std::vector> embeddings5; + std::vector> optimizerSlots5; + + ASSERT_EQ(embCache->GetEmbTableInfos(tableName, keys5, embeddings5, optimizerSlots5), H_OK); + bool ret2 = true; + if (keys.size() != 5) { + ret2 = false; + } + for (auto key : keys) { + auto it = std::find(lookupKeys2.begin(), lookupKeys2.end(), key); + if (it == lookupKeys2.end()) { + ret2 = false; + break; + } + uint32_t index = it - lookupKeys2.begin(); + for (uint32_t i = 0; i < embeddingSize; i++) { + if (fabs(embeddings5[index][i] - 0.01f * (i + index * extEmbeddingSize)) > 0.0000001) { + ret2 = false; + } + } + } + if (!optimizerSlots5.empty()) { + ret2 = false; + } + ASSERT_EQ(ret2, true); + + CTRLog(CTRLogLevel::INFO, "===========GET_EMB_TABLE_INFO end============="); +} + +TEST_F(EmbCacheTest, LOAD_EMB_TABLE_INFO) +{ + CTRLog(CTRLogLevel::INFO, "===========LOAD_EMB_TABLE_INFO start============="); + std::string tableName = "test_table"; + uint64_t hostVocabSize = 5; + uint32_t embeddingSize = 13; + uint32_t extEmbeddingSize = 26; + uint64_t devVocabSize = 2; + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + std::vector keys; + std::vector> embeddings; + std::vector> optimizerSlots; + + keys = { 0, 1, 2, 3, 4 }; + for (uint64_t i = 0; i < keys.size(); i++) { + std::vector curEmbedding; + for (uint64_t j = 0; j < embeddingSize; j++) { + curEmbedding.emplace_back(0.01f * (i * extEmbeddingSize + j)); + } + embeddings.emplace_back(curEmbedding); + } + uint32_t optimizerSlotSize = extEmbeddingSize - embeddingSize; + for (uint64_t i = 0; i < keys.size(); i++) { + std::vector curOptimizerSlot; + for (uint64_t j = 0; j < optimizerSlotSize; j++) { + curOptimizerSlot.emplace_back(0.01f * (i * extEmbeddingSize + embeddingSize + j)); + } + optimizerSlots.emplace_back(curOptimizerSlot); + } + ASSERT_EQ(embCache->LoadEmbTableInfos("Invalid_table_name", keys, embeddings, optimizerSlots), H_TABLE_NOT_EXIST); + ASSERT_EQ(embCache->LoadEmbTableInfos(tooLongTableName, keys, embeddings, optimizerSlots), H_TABLE_NAME_TOO_LONG); + ASSERT_EQ(embCache->LoadEmbTableInfos(tableName, keys, embeddings, optimizerSlots), H_OK); + + std::vector keys2; + std::vector> embeddings2; + 
std::vector> optimizerSlots2; + ASSERT_EQ(embCache->GetEmbTableInfos(tableName, keys2, embeddings2, optimizerSlots2), H_OK); + + bool ret = true; + if (keys2.size() != 5) { + ret = false; + } + for (auto key : keys2) { + auto it = std::find(keys.begin(), keys.end(), key); + if (it == keys.end()) { + ret = false; + break; + } + uint32_t index = it - keys.begin(); + for (uint32_t i = 0; i < embeddingSize; i++) { + if (fabs(embeddings2[index][i] - 0.01f * (i + index * extEmbeddingSize)) > 0.0000001) { + ret = false; + } + } + for (uint32_t i = 0; i < optimizerSlotSize; i++) { + if (fabs(optimizerSlots2[index][i] - 0.01f * (i + index * extEmbeddingSize + embeddingSize)) > 0.0000001) { + ret = false; + } + } + } + ASSERT_EQ(ret, true); + + std::vector keys3; + std::vector> embeddings3; + std::vector> optimizerSlots3; + + keys3 = { 0, 1, 2, 3, 4 }; + for (uint64_t i = 0; i < keys3.size() - 1; i++) { + std::vector curEmbedding; + for (uint64_t j = 0; j < embeddingSize; j++) { + curEmbedding.emplace_back(0.01f * (i * extEmbeddingSize + j)); + } + embeddings3.emplace_back(curEmbedding); + } + for (uint64_t i = 0; i < keys3.size(); i++) { + std::vector curOptimizerSlot; + for (uint64_t j = 0; j < optimizerSlotSize; j++) { + curOptimizerSlot.emplace_back(0.01f * (i * extEmbeddingSize + embeddingSize + j)); + } + optimizerSlots3.emplace_back(curOptimizerSlot); + } + // keys num != embeddings num + ASSERT_EQ(embCache->LoadEmbTableInfos(tableName, keys3, embeddings3, optimizerSlots3), H_LOAD_ERROR); + + std::vector keys4; + std::vector> embeddings4; + std::vector> optimizerSlots4; + + keys4 = { 0, 1, 2, 3, 4 }; + for (uint64_t i = 0; i < keys4.size(); i++) { + std::vector curEmbedding; + for (uint64_t j = 0; j < embeddingSize; j++) { + curEmbedding.emplace_back(0.01f * (i * extEmbeddingSize + j)); + } + embeddings4.emplace_back(curEmbedding); + } + for (uint64_t i = 0; i < keys4.size() - 1; i++) { + std::vector curOptimizerSlot; + for (uint64_t j = 0; j < optimizerSlotSize; j++) { + curOptimizerSlot.emplace_back(0.01f * (i * extEmbeddingSize + embeddingSize + j)); + } + optimizerSlots4.emplace_back(curOptimizerSlot); + } + // keys num != optimizerSlots num + ASSERT_EQ(embCache->LoadEmbTableInfos(tableName, keys4, embeddings4, optimizerSlots4), H_LOAD_ERROR); + + std::vector keys5; + std::vector> embeddings5; + std::vector> optimizerSlots5; + + keys5 = { 0, 1, 2, 3, 4, 5 }; + for (uint64_t i = 0; i < keys5.size(); i++) { + std::vector curEmbedding; + for (uint64_t j = 0; j < embeddingSize; j++) { + curEmbedding.emplace_back(0.01f * (i * extEmbeddingSize + j)); + } + embeddings5.emplace_back(curEmbedding); + } + for (uint64_t i = 0; i < keys5.size(); i++) { + std::vector curOptimizerSlot; + for (uint64_t j = 0; j < optimizerSlotSize; j++) { + curOptimizerSlot.emplace_back(0.01f * (i * extEmbeddingSize + embeddingSize + j)); + } + optimizerSlots5.emplace_back(curOptimizerSlot); + } + // loadKeys num > hostVocabSize + ASSERT_EQ(embCache->LoadEmbTableInfos(tableName, keys5, embeddings5, optimizerSlots5), H_LOAD_ERROR); + + std::vector keys6; + std::vector> embeddings6; + std::vector> optimizerSlots6; + + keys6 = { 0, 1, 2, 3, 4 }; + for (uint64_t i = 0; i < keys6.size(); i++) { + std::vector curEmbedding; + for (uint64_t j = 0; j < embeddingSize - 1; j++) { + curEmbedding.emplace_back(0.01f * (i * extEmbeddingSize + j)); + } + embeddings6.emplace_back(curEmbedding); + } + for (uint64_t i = 0; i < keys6.size(); i++) { + std::vector curOptimizerSlot; + for (uint64_t j = 0; j < optimizerSlotSize; j++) { + 
curOptimizerSlot.emplace_back(0.01f * (i * extEmbeddingSize + embeddingSize + j)); + } + optimizerSlots6.emplace_back(curOptimizerSlot); + } + // entering embeddingSize != table embeddingSize + ASSERT_EQ(embCache->LoadEmbTableInfos(tableName, keys6, embeddings6, optimizerSlots6), H_LOAD_ERROR); + + std::vector keys7; + std::vector> embeddings7; + std::vector> optimizerSlots7; + + keys7 = { 0, 1, 2, 3, 4 }; + for (uint64_t i = 0; i < keys7.size(); i++) { + std::vector curEmbedding; + for (uint64_t j = 0; j < embeddingSize; j++) { + curEmbedding.emplace_back(0.01f * (i * extEmbeddingSize + j)); + } + embeddings7.emplace_back(curEmbedding); + } + for (uint64_t i = 0; i < keys7.size(); i++) { + std::vector curOptimizerSlot; + for (uint64_t j = 0; j < optimizerSlotSize - 1; j++) { + curOptimizerSlot.emplace_back(0.01f * (i * extEmbeddingSize + embeddingSize + j)); + } + optimizerSlots7.emplace_back(curOptimizerSlot); + } + // entering optimizerSlotSize != table optimizerSlotSize + ASSERT_EQ(embCache->LoadEmbTableInfos(tableName, keys7, embeddings7, optimizerSlots7), H_LOAD_ERROR); + embCache->Destroy(); + + hostVocabSize = 5; + embeddingSize = 13; + extEmbeddingSize = 13; + devVocabSize = 2; + + embCache = SimpleCreateTable(tableName, hostVocabSize, embeddingSize, extEmbeddingSize, devVocabSize); + + std::vector keys8; + std::vector> embeddings8; + std::vector> optimizerSlots8; + + keys8 = { 0, 1, 2, 3, 4 }; + for (uint64_t i = 0; i < keys8.size(); i++) { + std::vector curEmbedding; + for (uint64_t j = 0; j < embeddingSize; j++) { + curEmbedding.emplace_back(0.01f * (i * extEmbeddingSize + j)); + } + embeddings8.emplace_back(curEmbedding); + } + + ASSERT_EQ(embCache->LoadEmbTableInfos(tableName, keys8, embeddings8, optimizerSlots8), H_OK); + + std::vector keys9; + std::vector> embeddings9; + std::vector> optimizerSlots9; + ASSERT_EQ(embCache->GetEmbTableInfos(tableName, keys9, embeddings9, optimizerSlots9), H_OK); + + double eps = 0.0000001; + bool ret2 = true; + if (keys9.size() != 5) { + ret2 = false; + } + for (auto key : keys9) { + auto it = std::find(keys9.begin(), keys9.end(), key); + if (it == keys9.end()) { + ret2 = false; + break; + } + uint32_t index = it - keys9.begin(); + for (uint32_t i = 0; i < embeddingSize; i++) { + if (fabs(embeddings9[index][i] - 0.01f * (i + index * extEmbeddingSize)) > eps) { + ret2 = false; + } + } + } + if (!optimizerSlots9.empty()) { + ret2 = false; + } + ASSERT_EQ(ret2, true); + + CTRLog(CTRLogLevel::INFO, "===========LOAD_EMB_TABLE_INFO end============="); +} diff --git a/src/AccCTR/tests/ut/src/emb_cache_test.h b/src/AccCTR/tests/ut/src/emb_cache_test.h new file mode 100644 index 00000000..5c87237b --- /dev/null +++ b/src/AccCTR/tests/ut/src/emb_cache_test.h @@ -0,0 +1,62 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ ==============================================================================*/
+
+#ifndef CTR_EMB_CACHE_TEST_H
+#define CTR_EMB_CACHE_TEST_H
+
+#include <cstdint>
+#include <string>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
+#include "factory.h"
+#include "embedding_cache.h"
+
+class EmbCacheTest : public testing::Test {
+protected:
+    EmbCacheTest() {}
+    ~EmbCacheTest() {}
+    static void SetUpTestCase();
+    static void TearDownTestCase();
+
+    void SetUp() override;
+
+    void TearDown() override;
+
+    static ock::ctr::EmbCacheManagerPtr SimpleCreateTable(std::string tableName, uint32_t hostVocabSize,
+        uint32_t embeddingSize, uint32_t extEmbeddingSize, uint32_t devVocabSize,
+        std::pair<float, float> normalPara = { 0, 0.05 }, float constPara = 0.233);
+
+    static ock::ctr::EmbCacheManagerPtr ConstZeroCreateTable(std::string tableName, uint32_t hostVocabSize,
+        uint32_t embeddingSize, uint32_t extEmbeddingSize, uint32_t devVocabSize, uint64_t prefillBufferSize = 50000,
+        uint8_t prefillThreadNum = 1);
+
+    std::string tooLongTableName =
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100000000010000000001000000000100000000010000000001000000000100000000010000000001"
+        "00000000010000000001000000000100012";
+};
+
+#endif // CTR_EMB_CACHE_TEST_H
diff --git a/src/AccCTR/tests/ut/src/unique_test.cpp b/src/AccCTR/tests/ut/src/unique_test.cpp
index f971bb91..a94ebaf7 100644
--- a/src/AccCTR/tests/ut/src/unique_test.cpp
+++ b/src/AccCTR/tests/ut/src/unique_test.cpp
@@ -15,8 +15,7 @@ limitations under the License.
#include #include #include "unique_test.h" - -FactoryPtr factory; +#include "common.h" void UniqueTest::SetUpTestCase() { @@ -144,7 +143,10 @@ TEST_F(UniqueTest, DoUniqueNormal) std::string input_path(path); std::cout << "input_path:" + input_path + "/data30.txt" << std::endl; std::ifstream input(input_path + "/data30.txt"); - + if(!input.good()) { + std::cout << "Failed to open file:" + input_path + "/data30.txt" << std::endl; + return; + } std::vector numbers; std::string line; while (std::getline(input, line, ',')) { @@ -156,6 +158,8 @@ TEST_F(UniqueTest, DoUniqueNormal) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.trace = true; conf.desiredSize = numbers.size(); @@ -213,6 +217,8 @@ TEST_F(UniqueTest, UseErrOutputTypeEnhanced) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -253,6 +259,8 @@ TEST_F(UniqueTest, UseErrOutputTypeNormal) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -292,6 +300,8 @@ TEST_F(UniqueTest, DoEnhancedUnique) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -340,6 +350,8 @@ TEST_F(UniqueTest, DoEnhancedUniqueErr) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -402,6 +414,8 @@ TEST_F(UniqueTest, DoEnhancedUnique_UniqueIdSize) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -449,6 +463,8 @@ TEST_F(UniqueTest, idCntIsNull) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -488,6 +504,8 @@ TEST_F(UniqueTest, idCntIsNullSharding) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -537,6 +555,8 @@ TEST_F(UniqueTest, DoUniqueShard) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.useSharding = true; conf.useIdCount = true; @@ -612,6 +632,8 @@ TEST_F(UniqueTest, DoUniqueOnlyShard) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.useSharding = true; conf.desiredSize = 6; @@ -675,6 +697,8 @@ TEST_F(UniqueTest, DoUniquePadding) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.usePadding = true; conf.useSharding = true; @@ -755,6 +779,8 @@ TEST_F(UniqueTest, DoUniqueNoThreadPool) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 20; // 配置空间大于实际输入数组长度,验证正常运行 conf.dataType = DataType::INT64; @@ -817,6 +843,8 @@ TEST_F(UniqueTest, DoUniqueShardNumberOversize) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + 
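+    // as in the other test cases touched by this patch, register the test logger so
+    // AccCTR's internal log output is routed through CTRLog: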
factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.useSharding = true; conf.desiredSize = 6; @@ -895,6 +923,7 @@ TEST_F(UniqueTest, DoUniqueSpecial) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); int count = 1000000; UniqueConf conf; @@ -963,6 +992,8 @@ TEST_F(UniqueTest, IdLarge) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -999,6 +1030,8 @@ TEST_F(UniqueTest, DoUniqueNormalInt32) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.useSharding = true; conf.desiredSize = 6; @@ -1122,6 +1155,8 @@ TEST_F(UniqueTest, DoUniqueShardMultipleTimes) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.useSharding = true; conf.desiredSize = 6; @@ -1286,6 +1321,8 @@ TEST_F(UniqueTest, IdCntSmall) UniquePtr unique; ASSERT_EQ(factory->CreateUnique(unique), 0); + factory->SetExternalLogFuncInner(CTRLog); + UniqueConf conf; conf.desiredSize = 6; conf.dataType = DataType::INT64; @@ -1321,7 +1358,10 @@ TEST_F(UniqueTest, DoUniqueLotsDataFunction) std::string input_path(path); std::cout << "input_path:" + input_path + "/data40.txt" << std::endl; std::ifstream input(input_path + "/data40.txt"); - + if(!input.good()) { + std::cout << "Failed to open file:" + input_path + "/data40.txt" << std::endl; + return; + } std::vector numbers; std::string line; while (std::getline(input, line, ',')) { @@ -1423,7 +1463,10 @@ TEST_F(UniqueTest, DoUniqueLotsDataPaddingFunction) std::string input_path(path); std::cout << "input_path:" + input_path + "/data30.txt" << std::endl; std::ifstream input(input_path + "/data30.txt"); - + if(!input.good()) { + std::cout << "Failed to open file:" + input_path + "/data30.txt" << std::endl; + return; + } std::vector numbers; std::string line; while (std::getline(input, line, ',')) { diff --git a/src/AccCTR/tests/ut/src/unique_test.h b/src/AccCTR/tests/ut/src/unique_test.h index 0243f262..c3bc64f3 100644 --- a/src/AccCTR/tests/ut/src/unique_test.h +++ b/src/AccCTR/tests/ut/src/unique_test.h @@ -19,7 +19,6 @@ limitations under the License. 
#include #include #include -#include "factory.h" #include "gtest/gtest.h" #include "gmock/gmock.h" #include "unique.h" @@ -28,21 +27,6 @@ using namespace std; using namespace ock::ctr; -class SimpleThreadPool { -public: - static void SyncRun(const std::vector> &tasks) - { - std::vector> futs; - for (auto &task : tasks) { - futs.push_back(std::async(task)); - } - for (auto &fut : futs) { - fut.wait(); - } - } -}; - - class UniqueTest : public testing::Test { protected: UniqueTest() {}; diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index dd1052f2..64a076b9 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -27,6 +27,11 @@ if(NOT SECUREC_PATH) endif() message("SECUREC_PATH: " ${SECUREC_PATH}) +if(NOT ACCCTR_PATH) + set(ACCCTR_PATH ${PROJECT_SOURCE_DIR}/AccCTR) +endif() +message("ACCCTR_PATH: " ${ACCCTR_PATH}) + include_directories(${ABSEIL_PATH}/include) link_directories(${ABSEIL_PATH}/lib) @@ -38,7 +43,7 @@ endif() link_libraries(stdc++fs) -file(GLOB_RECURSE MXREC_SRC ./*.cpp) +file(GLOB_RECURSE MXREC_SRC ./*.cpp ./*.h) add_library(ASC SHARED ${MXREC_SRC}) target_include_directories(ASC @@ -55,10 +60,11 @@ target_link_directories(ASC ${HDF5_PATH}/lib ${SECUREC_PATH}/lib ${ASCEND_DRIVER_PATH}/lib64/driver + ${ACCCTR_PATH}/output/ock_ctr_common/lib ) target_link_libraries(ASC PUBLIC ascendcl msprofiler ge_executor gert runtime ge_common register graph ascend_protobuf - profapi opt_feature error_manager exe_graph acl_tdt_channel acl_tdt_queue securec drvdsmi_host) + profapi opt_feature error_manager exe_graph acl_tdt_channel acl_tdt_queue securec drvdsmi_host _ock_ctr_common) target_link_libraries(ASC PUBLIC -l:_tf_adapter.so OpenMP::OpenMP_CXX ${MPI_CXX_LIBRARIES} diff --git a/src/core/checkpoint/checkpoint.cpp b/src/core/checkpoint/checkpoint.cpp index 0fc03feb..8a6750d5 100644 --- a/src/core/checkpoint/checkpoint.cpp +++ b/src/core/checkpoint/checkpoint.cpp @@ -196,9 +196,7 @@ void Checkpoint::WriteStream(CkptTransData& transData, const string& dataDir, si } ssize_t writeBytesNum; - if (floatTransSet.find(dataType) != floatTransSet.end()) { - writeBytesNum = fileSystemPtr->Write(dataDir, transData.floatArr, dataSize); - } else if (int32TransSet.find(dataType) != int32TransSet.end()) { + if (int32TransSet.find(dataType) != int32TransSet.end()) { writeBytesNum = fileSystemPtr->Write(dataDir, reinterpret_cast(transData.int32Arr.data()), dataSize); } else if (int64TransSet.find(dataType) != int64TransSet.end()) { @@ -207,6 +205,8 @@ void Checkpoint::WriteStream(CkptTransData& transData, const string& dataDir, si } else if (dataType == CkptDataType::ATTRIBUTE) { writeBytesNum = fileSystemPtr->Write(dataDir, reinterpret_cast(transData.attribute.data()), dataSize); + } else { + throw runtime_error("unknown CkptDataType"); } if (writeBytesNum == -1) { @@ -276,7 +276,6 @@ void Checkpoint::LoadDataset(const vector& embNames, auto attributeDir { datasetPath + dirSeparator + "slice" + attribFileType }; CkptTransData transData; - LOG_DEBUG("====Start reading data from: {}", attributeDir); auto dataElmtBytes { dataHandler->GetDataElmtBytes(CkptDataType::ATTRIBUTE) }; ReadStream(transData, attributeDir, CkptDataType::ATTRIBUTE, dataElmtBytes); @@ -328,10 +327,10 @@ void Checkpoint::ReadStream(CkptTransData& transData, readBytesNum = fileSystemPtr->Read(dataDir, reinterpret_cast(transData.int32Arr.data()), datasetSize); } else if (int64TransSet.find(dataType) != int64TransSet.end()) { readBytesNum = fileSystemPtr->Read(dataDir, reinterpret_cast(transData.int64Arr.data()), 
datasetSize); - } else if (floatTransSet.find(dataType) != floatTransSet.end()) { - readBytesNum = fileSystemPtr->Read(dataDir, reinterpret_cast(transData.floatArr.data()), datasetSize); } else if (dataType == CkptDataType::ATTRIBUTE) { readBytesNum = fileSystemPtr->Read(dataDir, reinterpret_cast(transData.attribute.data()), datasetSize); + } else { + throw runtime_error("unknown CkptDataType"); } if (readBytesNum == -1) { @@ -383,9 +382,9 @@ void Checkpoint::SetTransDataSize(CkptTransData& transData, size_t datasetSize, transData.int32Arr.resize(datasetSize); } else if (int64TransSet.find(dataType) != int64TransSet.end()) { transData.int64Arr.resize(datasetSize); - } else if (floatTransSet.find(dataType) != floatTransSet.end()) { - transData.floatArr.resize(datasetSize); } else if (dataType == CkptDataType::ATTRIBUTE) { transData.attribute.resize(datasetSize); + } else { + throw runtime_error("unknown CkptDataType"); } } diff --git a/src/core/checkpoint/checkpoint.h b/src/core/checkpoint/checkpoint.h index 362881b2..625660ff 100644 --- a/src/core/checkpoint/checkpoint.h +++ b/src/core/checkpoint/checkpoint.h @@ -63,9 +63,6 @@ namespace MxRec { CkptDataType::KEY_COUNT_MAP, CkptDataType::EVICT_POS }; - const set floatTransSet{ - CkptDataType::EMB_DATA - }; vector> dataHandlers; string processPath; diff --git a/src/core/ckpt_data_handler/ckpt_data_handler.cpp b/src/core/ckpt_data_handler/ckpt_data_handler.cpp index 18f1a090..04feb4b3 100644 --- a/src/core/ckpt_data_handler/ckpt_data_handler.cpp +++ b/src/core/ckpt_data_handler/ckpt_data_handler.cpp @@ -33,7 +33,6 @@ void CkptDataHandler::CleanTransfer() { transferData.int64Arr.clear(); transferData.int32Arr.clear(); - transferData.floatArr.clear(); transferData.attribute.clear(); transferData.datasetSize = 0; transferData.attributeSize = 0; @@ -42,7 +41,7 @@ void CkptDataHandler::CleanTransfer() void CkptDataHandler::SetDatasetForLoadEmb(CkptDataType dataType, string embName, CkptTransData& loadedData, CkptData& ckptData) { - LOG_ERROR("Load host emb failed. dataType:{}, embName:{}, loadedData:{}, ckptData:{}", - dataType, embName, loadedData.datasetSize, ckptData.embHashMaps.empty()); + LOG_ERROR("Load host emb failed. 
dataType:{}, embName:{}, loadedData:{}", + dataType, embName, loadedData.datasetSize); throw runtime_error("only EMB_INFO and EMB_DATA supported for load host emb"); } \ No newline at end of file diff --git a/src/core/ckpt_data_handler/ckpt_data_handler.h b/src/core/ckpt_data_handler/ckpt_data_handler.h index 383317d9..0ca33294 100644 --- a/src/core/ckpt_data_handler/ckpt_data_handler.h +++ b/src/core/ckpt_data_handler/ckpt_data_handler.h @@ -18,8 +18,6 @@ See the License for the specific language governing permissions and #include -#include "emb_hashmap/emb_hashmap.h" -#include "host_emb/host_emb.h" #include "utils/common.h" namespace MxRec { diff --git a/src/core/ckpt_data_handler/feat_admit_n_evict_ckpt/feat_admit_n_evict_ckpt.cpp b/src/core/ckpt_data_handler/feat_admit_n_evict_ckpt/feat_admit_n_evict_ckpt.cpp index be35044b..140b9c77 100644 --- a/src/core/ckpt_data_handler/feat_admit_n_evict_ckpt/feat_admit_n_evict_ckpt.cpp +++ b/src/core/ckpt_data_handler/feat_admit_n_evict_ckpt/feat_admit_n_evict_ckpt.cpp @@ -157,7 +157,7 @@ void FeatAdmitNEvictCkpt::SetHistRec(string embName) for (size_t i = featItemInfoOffset; i < featItemInfoTotalSize + featItemInfoOffset; i += featItemInfoSaveNum) { process = i % printPerStep; if (process == 1) { - LOG_DEBUG("====in SetHistRec, process : %f", i / featItemInfoTotalSize); + LOG_TRACE("====in SetHistRec, process : {}", i / featItemInfoTotalSize); } auto featureId = transArr[i + featureIdIdxOffset]; auto count = transArr[i + countIdxOffset]; diff --git a/src/core/emb_hashmap/emb_hashmap.cpp b/src/core/emb_hashmap/emb_hashmap.cpp deleted file mode 100644 index 977b2c0b..00000000 --- a/src/core/emb_hashmap/emb_hashmap.cpp +++ /dev/null @@ -1,477 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. 
-==============================================================================*/ - -#include "emb_hashmap.h" -#include -#include - -#include "hybrid_mgmt/hybrid_mgmt_block.h" -#include "utils/common.h" -#include "emb_table/embedding_mgmt.h" - -using namespace MxRec; - -void EmbHashMap::Init(const RankInfo& ri, const vector& embInfos, bool ifLoad) -{ - this->rankInfo = ri; - if (!ifLoad) { - EmbHashMapInfo embHashMapInfo; - LOG_INFO("init emb hash map from scratch"); - for (const auto& embInfo: embInfos) { - embHashMapInfo.devOffset2Batch.resize(embInfo.devVocabSize); - embHashMapInfo.devOffset2Key.resize(embInfo.devVocabSize); - embHashMapInfo.hostVocabSize = embInfo.hostVocabSize; - embHashMapInfo.devVocabSize = embInfo.devVocabSize; - embHashMapInfo.currentUpdatePos = 0; - fill(embHashMapInfo.devOffset2Batch.begin(), embHashMapInfo.devOffset2Batch.end(), -1); - fill(embHashMapInfo.devOffset2Key.begin(), embHashMapInfo.devOffset2Key.end(), -1); - embHashMaps[embInfo.name] = embHashMapInfo; - - LOG_TRACE("devOffset2Key, {}", VectorToString(embHashMaps.at(embInfo.name).devOffset2Key)); - LOG_TRACE("devOffset2Batch, {}", VectorToString(embHashMaps.at(embInfo.name).devOffset2Batch)); - } - } -} - -void EmbHashMap::ClearLookupAndSwapOffset(EmbHashMapInfo& embHashMap) const -{ - embHashMap.swapPos.clear(); - embHashMap.lookUpVec.clear(); - embHashMap.ddr2HbmKeys.clear(); -} - -/// DDR模型下处理特征的offset、swap信息等 -/// \param embName 表名 -/// \param keys 查询向量 -/// \param DDRParam 临时向量 -/// \param channelId 通道索引(训练/推理) -void EmbHashMap::Process(const string& embName, vector& keys, DDRParam& ddrParam, int channelId) -{ -#ifndef GTEST - EASY_FUNCTION(profiler::colors::Pink) - TimeCost swapTimeCost; - std::shared_ptr table = EmbeddingMgmt::Instance()->GetTable(embName); - - int32_t keepBatch = swapId; // 处理batch的次数,多个预取一起处理算一次 - vector swapPos; - vector lookUpVec = table->FindOffset(keys, swapId, channelId, swapPos); - - table->RefreshFreqInfoWithSwap(); - - EASY_BLOCK("hostHashMaps->tdt") - - std::copy(lookUpVec.begin(), lookUpVec.end(), std::back_inserter(ddrParam.offsetsOut)); - - // 构造查询向量tensor - int lookUpVecSize = static_cast(lookUpVec.size()); - ddrParam.tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { lookUpVecSize })); - - auto lookupTensorData = ddrParam.tmpDataOut.back().flat(); - for (int i = 0; i < lookUpVecSize; i++) { - lookupTensorData(i) = static_cast(lookUpVec[i]); - } - LOG_TRACE("lookupTensor, {}", VectorToString(lookUpVec)); - - // 构造交换向量tensor - int swapSize = static_cast(swapPos.size()); - ddrParam.tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { swapSize })); - - auto swapTensorData = ddrParam.tmpDataOut.back().flat(); - for (int i = 0; i < swapSize; i++) { - swapTensorData(i) = static_cast(swapPos[i]); - } - if (swapSize > 0) { - LOG_DEBUG("swap num: {}", swapSize); - } - - LOG_TRACE("swapTensor, {}", VectorToString(swapPos)); - // 清空本次记录的查询偏移和交换偏移 - table->ClearLookupAndSwapOffset(); - - LOG_INFO("current ddr emb:{}, usage:{}/[{}+{}]", embName, table->GetMaxOffset(), - table->GetDevVocabSize(), table->GetHostVocabSize()); - - ddrParam.tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); - auto swapLen = ddrParam.tmpDataOut.back().flat(); - swapLen(0) = swapSize; - - if (GlogConfig::gStatOn) { - LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} swap_key_size {} swap_time_cost {}", - channelId, swapId, rankInfo.rankId, swapSize, swapTimeCost.ElapsedMS()); - } - - swapId++; - EASY_END_BLOCK -#endif -} - - -auto EmbHashMap::GetHashMaps() -> 
absl::flat_hash_map -{ - LOG_DEBUG(HYBRID_BLOCKING + " start GetHashMaps"); - HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); - auto embHashMapsOld = embHashMaps; - int checkResult = hybridMgmtBlock->CheckSaveEmbMapValid(); - if (checkResult == 0) { - // 检查是否需要回退 - return embHashMapsOld; - } - if (checkResult == 1) { - // 回退一步 - for (auto& temp: embHashMapsOld) { - auto &embHashMap = temp.second; - for (auto &swapKeys: embHashMap.oldSwap) { - emb_key_t oldKey = swapKeys.first; - emb_key_t key = swapKeys.second; - int tempOffset = static_cast(embHashMap.hostHashMap[key]); - embHashMap.hostHashMap[key] = embHashMap.hostHashMap[oldKey]; - embHashMap.hostHashMap[oldKey] = static_cast(tempOffset); - } - embHashMap.maxOffset = embHashMap.maxOffsetOld; - for (auto &offset2Key: embHashMap.devOffset2KeyOld) { - embHashMap.devOffset2Key[offset2Key.first] = offset2Key.second; - } - } - return embHashMapsOld; - } - // 此时需要回退2步,无法满足此条件,保存的东西错误,直接回退 - if (rankInfo.isDDR) { - throw HybridMgmtBlockingException("EmbHashMap::GetHashMaps() "); - } - return embHashMapsOld; -} - -void EmbHashMap::LoadHashMap(EmbHashMemT& loadData) -{ - embHashMaps = std::move(loadData); -} - -/// 对HBM剩余空间和更新位置进行初始化 -void EmbHashMapInfo::SetStartCount() -{ - currentUpdatePosStart = currentUpdatePos; - freeSize = devVocabSize; -} - -/// 判断HBM是否有剩余空间 -/// \param i 查询向量的大小 -/// \return -bool EmbHashMapInfo::HasFree(size_t i) const -{ - return freeSize < i; -} - -/* -* 删除淘汰key的映射关系,并将其offset更新到evictPos,待后续复用 -*/ -void EmbHashMap::EvictDeleteEmb(const string& embName, const vector& keys) -{ - EASY_FUNCTION() - size_t keySize = keys.size(); - auto& embHashMap = embHashMaps.at(embName); - vector evictHBMKeys; - vector evictDDRKeys; - for (size_t i = 0; i < keySize; i++) { - size_t offset; - auto key = keys[i]; - if (key == -1) { - LOG_WARN("evict key equal -1!"); - continue; - } - const auto& iter = embHashMap.hostHashMap.find(key); - if (iter != embHashMap.hostHashMap.end()) { - offset = iter->second; - embHashMap.hostHashMap.erase(iter); - LOG_TRACE("evict embName {}, offset {}", embName, offset); - } else { - // 淘汰依据keyProcess中的history,hashmap映射关系创建于ParseKey;两者异步,造成淘汰的值在hashmap里可能未创建 - continue; - } - - if (offset < embHashMap.devVocabSize) { - embHashMap.devOffset2Batch[offset] = -1; - embHashMap.devOffset2KeyOld.emplace_back(offset, embHashMap.devOffset2Key[offset]); - embHashMap.devOffset2Key[offset] = -1; - embHashMap.evictDevPos.emplace_back(offset); - evictHBMKeys.emplace_back(key); - } else { - embHashMap.evictPos.emplace_back(offset); - evictDDRKeys.emplace_back(key); - } - } - if (isSSDEnabled) { - cacheManager->RefreshFreqInfoCommon(embName, evictHBMKeys, TransferType::HBM_2_EVICT); - cacheManager->RefreshFreqInfoCommon(embName, evictDDRKeys, TransferType::DDR_2_EVICT); - } - - LOG_INFO("ddr EvictDeleteEmb, emb: [{}], hostEvictSize: {}, devEvictSize: {}", - embName, embHashMap.evictPos.size(), embHashMap.evictDevPos.size()); - LOG_TRACE("hostHashMap, {}", MapToString(embHashMaps[embName].hostHashMap)); -} - - -/// 从embHashMaps获取key对应的位置,构造查询向量;更新devOffset2Batch;记录dev与host需要交换的偏移 -/// \param embName 表名 -/// \param keys 查询向量 -/// \param currentBatchId 已处理的batch数 -/// \param keepBatchId 处理batch的次数,多个预取一起处理算一次 -/// \param channelId 通道索引(训练/推理) -void EmbHashMap::FindOffset(const string& embName, const vector& keys, - size_t currentBatchId, size_t keepBatchId, int channelId) -{ - EASY_FUNCTION() - size_t keySize = keys.size(); - auto it = embHashMaps.find(embName); - if (it == embHashMaps.end()) { - throw 
runtime_error("table not exist in embHashMaps"); - } - auto &embHashMap = it->second; - UpdateBatchId(keys, currentBatchId, keySize, embHashMap); - for (size_t i = 0; i < keySize; i++) { - auto key = keys[i]; - if (key == -1) { - embHashMap.lookUpVec.emplace_back(INVALID_KEY_VALUE); - continue; - } - size_t offset; - auto isOffsetValid = FindOffsetHelper(key, embHashMap, channelId, offset); - if (!isOffsetValid) { - embHashMap.lookUpVec.emplace_back(INVALID_KEY_VALUE); - continue; - } - AddKeyFreqInfo(embName, key, RecordType::NOT_DDR); - if (offset < embHashMap.devVocabSize) { - // 偏移小于等于HBM容量:直接放入查询向量;更新偏移之前关联的key和当前关联的key - embHashMap.lookUpVec.emplace_back(offset); - embHashMap.devOffset2KeyOld.emplace_back(offset, static_cast(embHashMap.devOffset2Key[offset])); - embHashMap.devOffset2Key[offset] = key; - } else { - // 偏移大于HBM容量:记录在host emb上的偏移;找到需要交换的HBM偏移 - embHashMap.missingKeysHostPos.emplace_back(offset - embHashMap.devVocabSize); - FindSwapPosOld(embName, key, offset, currentBatchId, keepBatchId); - } - } - if (currentBatchId == 0) { - LOG_INFO("max offset {}", embHashMap.maxOffset); - } - LOG_TRACE("hostHashMap, {}", MapToString(embHashMaps[embName].hostHashMap)); -} - - -/// 查找key对应的偏移;1. 已在hash map中,直接返回对应的offset;2. 开启淘汰的情况下,复用淘汰的位置;3. 没有则新分配 -/// \param key 输入特征 -/// \param embHashMap hash map实例 -/// \param channelId 通道索引(训练/推理) -/// \param offset 未初始化变量,用于记录 -/// \return -bool EmbHashMap::FindOffsetHelper(const emb_key_t& key, EmbHashMapInfo& embHashMap, int channelId, size_t& offset) const - -{ - const auto& iter = embHashMap.hostHashMap.find(key); - if (iter != embHashMap.hostHashMap.end()) { - offset = iter->second; - LOG_TRACE("devVocabSize, {} , offset , {}", embHashMap.devVocabSize, offset); - if (isSSDEnabled && offset >= embHashMap.devVocabSize) { - embHashMap.ddr2HbmKeys.emplace_back(key); - } - } else if (embHashMap.evictDevPos.size() != 0 && channelId == TRAIN_CHANNEL_ID) { // 优先复用hbm表 - offset = embHashMap.evictDevPos.back(); - embHashMap.hostHashMap[key] = offset; - LOG_TRACE("ddr mode, dev evictPos is not null, key [{}] reuse offset [{}], evictSize [{}]", - key, offset, embHashMap.evictDevPos.size()); - embHashMap.evictDevPos.pop_back(); - } else if (embHashMap.evictPos.size() != 0 && channelId == TRAIN_CHANNEL_ID) { // hbm不足,再复用ddr表 - offset = embHashMap.evictPos.back(); - embHashMap.hostHashMap[key] = offset; - LOG_TRACE("ddr mode, host evictPos is not null, key [{}] reuse offset [{}], evictSize [{}]", - key, offset, embHashMap.evictPos.size()); - embHashMap.evictPos.pop_back(); - } else { - if (channelId == TRAIN_CHANNEL_ID) { - embHashMap.hostHashMap[key] = embHashMap.maxOffset; - offset = embHashMap.maxOffset; - embHashMap.maxOffset++; - if (embHashMap.maxOffset == embHashMap.devVocabSize) { - LOG_INFO("start using host vocab!"); - } - if (embHashMap.maxOffset > embHashMap.hostVocabSize + embHashMap.devVocabSize) { - LOG_ERROR("hostVocabSize too small! 
dev:{} host:{}", embHashMap.devVocabSize, embHashMap.hostVocabSize); - throw runtime_error("hostVocabSize too small"); - } - } else { - return false; - } - } - return true; -} - -/// 更新HBM中的key相应offset最近出现的batch步数,用于跟踪哪些offset是最近在使用的 -/// \param keys 查询向量 -/// \param currentBatchId 已处理的batch数 -/// \param keySize 查询向量长度 -/// \param embHashMap hash map实例 -void EmbHashMap::UpdateBatchId(const vector& keys, size_t currentBatchId, size_t keySize, - EmbHashMapInfo& embHashMap) const -{ - for (size_t i = 0; i < keySize; i++) { - size_t offset; - auto key = keys[i]; - if (key == -1) { - continue; - } - const auto& iter = embHashMap.hostHashMap.find(key); - if (iter != embHashMap.hostHashMap.end()) { - offset = iter->second; - - LOG_TRACE("key will be used, {} , offset , {}", key, offset); - if (offset < embHashMap.devVocabSize) { - // devOffset2Batch size equal to devVocabSize, unnecessary to check index boundary - embHashMap.devOffset2Batch[offset] = static_cast(currentBatchId); - } - } - } -} - -/// 利用devOffset2Batch上key最近使用的batchId,来选择需要淘汰的key,记录淘汰位置和device侧所需的keys -/// \param embName 表名 -/// \param key 输入特征 -/// \param hostOffset 全局偏移 -/// \param currentBatchId 已处理的batch数 -/// \param keepBatchId 处理batch的次数,多个预取一起处理算一次 -/// \return 是否找到需要交换的位置 -bool EmbHashMap::FindSwapPosOld(const string& embName, emb_key_t key, size_t hostOffset, size_t currentBatchId, - size_t keepBatchId) -{ - bool notFind = true; - auto it = embHashMaps.find(embName); - if (it == embHashMaps.end()) { - throw runtime_error("table not exist in embHashMaps"); - } - auto &embHashMap = it->second; - while (notFind) { - // 找到本次预取之前的偏移(保证所有预取batch的key都在HBM中) - if (embHashMap.currentUpdatePos >= embHashMap.devOffset2Batch.size()) { - throw runtime_error("currentUpdatePos out of range"); - } - - if (embHashMap.devOffset2Batch[embHashMap.currentUpdatePos] < static_cast(keepBatchId)) { - embHashMap.devOffset2Batch[embHashMap.currentUpdatePos] = static_cast(currentBatchId); - embHashMap.swapPos.emplace_back(embHashMap.currentUpdatePos); // 记录需要被换出的HBM偏移 - embHashMap.lookUpVec.emplace_back(embHashMap.currentUpdatePos); // 交换的位置就是该key查询的偏移 - embHashMap.hostHashMap[key] = embHashMap.currentUpdatePos; // 更新key对应的HBM偏移 - // 记录HBM偏移之前的key - embHashMap.devOffset2KeyOld.emplace_back(embHashMap.currentUpdatePos, - embHashMap.devOffset2Key[embHashMap.currentUpdatePos]); - auto& oldKey = embHashMap.devOffset2Key[embHashMap.currentUpdatePos]; - embHashMap.oldSwap.emplace_back(oldKey, key); // 记录交换的两个key oldKey:HBM->DDR key:DDR->HBM - embHashMap.hostHashMap[oldKey] = hostOffset; // 更新被替换的key的偏移 - oldKey = key; - notFind = false; - } - embHashMap.currentUpdatePos++; // 查找位置+1 - embHashMap.freeSize--; // HBM可用空间-1 - - // 遍历完一遍整个HBM表后,从头开始遍历 - if (embHashMap.currentUpdatePos == embHashMap.devVocabSize) { - embHashMap.currentUpdatePos = 0; - } - - // 已经找完整个HBM空间,且没找到可用位置,表示HBM空间不足以放下整个batch(预取batch数)的key,无法正常执行训练,故运行时错误退出 - if (embHashMap.currentUpdatePos == embHashMap.currentUpdatePosStart && notFind) { - LOG_ERROR("devVocabSize is too small"); - throw runtime_error("devVocabSize is too small"); - } - } - return true; -} - -/// HBM-DDR换入换出时刷新频次信息 -/// \param embName emb表名 -/// \param embHashMap emb hash map -void EmbHashMap::RefreshFreqInfoWithSwap(const string& embName, EmbHashMapInfo& embHashMap) const -{ - if (!isSSDEnabled) { - return; - } - // 换入换出key列表,元素为pair: pair oldKey为从HBM移出的key, key为从DDR移出的key - auto& oldSwap = embHashMap.oldSwap; - LOG_DEBUG("RefreshFreqInfoWithSwap:oldSwap Size:{}", oldSwap.size()); - vector enterDDRKeys; - for (auto 
keyPair : oldSwap) { - enterDDRKeys.emplace_back(keyPair.first); - } - cacheManager->RefreshFreqInfoCommon(embName, enterDDRKeys, TransferType::HBM_2_DDR); - cacheManager->RefreshFreqInfoCommon(embName, embHashMap.ddr2HbmKeys, TransferType::DDR_2_HBM); - - AddCacheManagerTraceLog(embName, embHashMap); -} - -/// 记录日志:HBM和DDR换入换出后,比较hostHashMap中DDR内key和表对应的lfuCache对象中的key内容 -void EmbHashMap::AddCacheManagerTraceLog(const string& embTableName, const EmbHashMapInfo& embHashMap) const -{ - if (Logger::GetLevel() != Logger::TRACE) { - return; - } - auto& hostMap = embHashMap.hostHashMap; - auto& devSize = embHashMap.devVocabSize; - auto iter = cacheManager->ddrKeyFreqMap.find(embTableName); - if (iter == cacheManager->ddrKeyFreqMap.end()) { - throw runtime_error("table not in ddrKeyFreqMap"); - } - auto &lfu = iter->second; - const auto& lfuTab = lfu.GetFreqTable(); - if (lfuTab.empty()) { - return; - } - size_t tableKeyInDdr = 0; - vector ddrKeys; // 获取hostHashMap中保存在DDR的key - for (const auto& item : hostMap) { - if (item.second < devSize) { - continue; - } - ddrKeys.emplace_back(item.first); - ++tableKeyInDdr; - } - vector lfuKeys; - for (const auto& it : lfuTab) { - lfuKeys.emplace_back(it.first); - } - std::sort(ddrKeys.begin(), ddrKeys.end()); - std::sort(lfuKeys.begin(), lfuKeys.end()); - std::string ddrKeysString = VectorToString(ddrKeys); - std::string lfuKeysString = VectorToString(lfuKeys); - if (ddrKeysString != lfuKeysString) { - LOG_ERROR("swap HBM with DDR step error, key string not equal, ddrKeysString:{}, lfuKeysString:{}", - ddrKeysString, lfuKeysString); - } else { - LOG_INFO("swap HBM with DDR step OK, table:{}, ddrKeysString == lfuKeysString, string length:{}", - embTableName, lfuKeysString.length()); - } - - LOG_INFO("swap HBM with DDR step end, table:{}, tableKeyInDdr:{}, tableKeyInLfu:{}", - embTableName, tableKeyInDdr, lfu.keyTable.size()); -} - -/// 记录key频次数据 -/// \param embTableName emb表名 -/// \param key key -/// \param type 记录类型枚举 -void EmbHashMap::AddKeyFreqInfo(const string& embTableName, const emb_key_t& key, RecordType type) const -{ - if (!isSSDEnabled) { - return; - } - cacheManager->PutKey(embTableName, key, type); -} diff --git a/src/core/emb_hashmap/emb_hashmap.h b/src/core/emb_hashmap/emb_hashmap.h deleted file mode 100644 index 96a75e54..00000000 --- a/src/core/emb_hashmap/emb_hashmap.h +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. 
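For context on the frequency bookkeeping deleted above (AddKeyFreqInfo, RefreshFreqInfoCommon, ddrKeyFreqMap): in SSD mode the cache manager tracks per-table key hit counts so the coldest keys can be demoted when DDR fills. A minimal, self-contained sketch of that idea — ToyFreqTracker and its members are illustrative simplifications, not the real CacheManager API:

#include <cstdint>
#include <limits>
#include <string>
#include <unordered_map>

using EmbKey = int64_t;  // assumption: emb_key_t is a 64-bit integer key

struct ToyFreqTracker {
    // table name -> (key -> hit count), the shape of a per-table LFU record
    std::unordered_map<std::string, std::unordered_map<EmbKey, uint64_t>> freq;

    void PutKey(const std::string& table, EmbKey key)
    {
        ++freq[table][key];  // every lookup bumps the key's frequency
    }

    // The least-frequently-used key is the demotion candidate (DDR -> SSD).
    EmbKey ColdestKey(const std::string& table) const
    {
        EmbKey coldest = -1;
        uint64_t best = std::numeric_limits<uint64_t>::max();
        for (const auto& kv : freq.at(table)) {
            if (kv.second < best) {
                best = kv.second;
                coldest = kv.first;
            }
        }
        return coldest;
    }
};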
-==============================================================================*/ - -#ifndef MX_REC_EMB_HASHMAP_H -#define MX_REC_EMB_HASHMAP_H - -#include -#include -#include -#include "absl/container/flat_hash_map.h" -#include "host_emb/host_emb.h" -#include "ssd_cache/cache_manager.h" -#include "utils/common.h" -#include "utils/time_cost.h" - -namespace MxRec { - using namespace std; - - class EmbHashMap { - public: - EmbHashMap() = default; - - void Init(const RankInfo& ri, const vector& embInfos, bool ifLoad = false); - - void Process(const string& embName, std::vector& keys, DDRParam& ddrParam, int channelId); - - auto GetHashMaps() -> absl::flat_hash_map; - - void LoadHashMap(absl::flat_hash_map& loadData); - - void EvictDeleteEmb(const string& embName, const vector& keys); - - absl::flat_hash_map embHashMaps; - - bool FindOffsetHelper(const emb_key_t& key, EmbHashMapInfo& embHashMap, int channelId, size_t& offset) const; - - void UpdateBatchId(const vector& keys, size_t currentBatchId, size_t keySize, - EmbHashMapInfo& embHashMap) const; - - bool FindSwapPosOld(const string& embName, emb_key_t key, size_t hostOffset, size_t currentBatchId, - size_t keepBatchId); - - std::vector& GetEvictPos(const string& embName) - { - return embHashMaps.at(embName).evictPos; - } - - bool isSSDEnabled { false }; - CacheManager* cacheManager; - - GTEST_PRIVATE: - - void FindOffset(const string& embName, const vector& keys, - size_t currentBatchId, size_t keepBatchId, int channelId); - - void AddCacheManagerTraceLog(const string& embTableName, const EmbHashMapInfo& embHashMap) const; - - void AddKeyFreqInfo(const string& embTableName, const emb_key_t& key, RecordType type) const; - - void ClearLookupAndSwapOffset(EmbHashMapInfo& embHashMap) const; - - void RefreshFreqInfoWithSwap(const string& embName, EmbHashMapInfo& embHashMap) const; - - RankInfo rankInfo; - int swapId { 0 }; - }; -} - -#endif // MX_REC_EMB_HASHMAP_H diff --git a/src/core/emb_table/emb_table.cpp b/src/core/emb_table/emb_table.cpp index 1c24eb2b..914cf535 100644 --- a/src/core/emb_table/emb_table.cpp +++ b/src/core/emb_table/emb_table.cpp @@ -78,6 +78,7 @@ EmbTable::~EmbTable() // 从embeddingList获取一个可用的emb地址 int64_t EmbTable::GetEmbAddress() { + int64_t ret = -1; #ifndef GTEST if (embeddingList.empty()) { PrintStatus(); @@ -97,8 +98,9 @@ int64_t EmbTable::GetEmbAddress() float *embAddr = embeddingList.front(); embeddingList.pop_front(); usedCapacity++; - return reinterpret_cast(embAddr); + ret = reinterpret_cast(embAddr); #endif + return ret; } void EmbTable::RandomInit(void* newBlock) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 3d2b77e7..caec0229 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
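The EmbTable::GetEmbAddress hunk above gives the GTEST build a defined return value by initializing ret to -1 and falling through to a single return. The allocation pattern underneath is a plain free-list pop; a sketch under simplified assumptions — ToyEmbPool is hypothetical and omits the random init, logging, and GTEST guard of the real class:

#include <cstddef>
#include <cstdint>
#include <deque>
#include <stdexcept>
#include <utility>

class ToyEmbPool {
public:
    explicit ToyEmbPool(std::deque<float*> freeRows) : freeRows_(std::move(freeRows)) {}

    // Pop one pre-allocated embedding row; addresses travel through this
    // code base as int64_t, hence the reinterpret_cast.
    int64_t GetEmbAddress()
    {
        if (freeRows_.empty()) {
            throw std::runtime_error("embedding pool exhausted");
        }
        float* row = freeRows_.front();
        freeRows_.pop_front();
        ++usedCapacity_;
        return reinterpret_cast<int64_t>(row);
    }

private:
    std::deque<float*> freeRows_;
    std::size_t usedCapacity_ = 0;
};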
==============================================================================*/ - #include "emb_table/embedding_ddr.h" + #include + #include "utils/logger.h" #include "utils/singleton.h" -#include "host_emb/host_emb.h" #include "file_system/file_system_handler.h" #include "ssd_cache/cache_manager.h" -#include "emb_table/embedding_mgmt.h" +#include "ock_ctr_common/include/error_code.h" using namespace MxRec; @@ -29,18 +29,15 @@ EmbeddingDDR::EmbeddingDDR() } EmbeddingDDR::EmbeddingDDR(const EmbInfo& info, const RankInfo& rankInfo, int inSeed) - : EmbeddingTable(info, rankInfo, inSeed) + : EmbeddingTable(info, rankInfo, inSeed), deviceId(rankInfo.deviceId) { - LOG_INFO("Init DDR table [{}] devVocabSize = {} hostVocabSize = {}", name, devVocabSize, hostVocabSize); - currentUpdatePos = 0; - devOffset2Key.resize(devVocabSize); - devOffset2Batch.resize(devVocabSize); - std::fill(devOffset2Batch.begin(), devOffset2Batch.end(), -1); - std::fill(devOffset2Key.begin(), devOffset2Key.end(), -1); + LOG_INFO("Init DDR table:{}, devVocabSize:{}, hostVocabSize:{}", name, devVocabSize, hostVocabSize); } EmbeddingDDR::~EmbeddingDDR() { + hdTransfer = nullptr; + embCache = nullptr; } void EmbeddingDDR::Key2Offset(std::vector& splitKey, int channel) @@ -52,214 +49,11 @@ int64_t EmbeddingDDR::capacity() const return capacity_; } -std::vector EmbeddingDDR::FindOffset(const vector& keys, - size_t batchId, int channelId, - std::vector& swapPos) -{ - devOffset2KeyOld.clear(); - oldSwap.clear(); - maxOffsetOld = maxOffset; - - UpdateBatchId(keys, batchId); - std::vector lookUpVec; - for (size_t i = 0; i < keys.size(); i++) { - emb_key_t key = keys[i]; - if (key == INVALID_KEY_VALUE) { - lookUpVec.emplace_back(INVALID_KEY_VALUE); - continue; - } - emb_key_t offset = FindOffsetHelper(key, channelId); - if (offset == INVALID_KEY_VALUE) { - lookUpVec.emplace_back(INVALID_KEY_VALUE); - continue; - } - AddKeyFreqInfo(key, RecordType::NOT_DDR); - if (offset < devVocabSize) { - // 偏移小于等于HBM容量:直接放入查询向量;更新偏移之前关联的key和当前关联的key - lookUpVec.push_back(offset); - devOffset2KeyOld.emplace_back(offset, static_cast(devOffset2Key[offset])); - devOffset2Key[offset] = key; - } else { - // 偏移大于HBM容量:记录在host emb上的偏移;找到需要交换的HBM偏移 - missingKeysHostPos_.emplace_back(offset - devVocabSize); - offset = FindSwapPosOld(key, offset, batchId, swapPos); - lookUpVec.emplace_back(offset); - } - } - if (batchId == 0) { - LOG_INFO("max offset {}", maxOffset); - } - LOG_TRACE("keyOffsetMap, {}", MapToString(keyOffsetMap)); - return lookUpVec; -} - -emb_key_t EmbeddingDDR::FindOffsetHelper(const emb_key_t& key, int channelId) -{ - const auto& iter = keyOffsetMap.find(key); - emb_key_t offset = INVALID_KEY_VALUE; - if (iter != keyOffsetMap.end()) { - offset = iter->second; - LOG_TRACE("devVocabSize, {} , offset , {}", devVocabSize, offset); - if (isSSDEnabled_ && offset >= devVocabSize) { - ddr2HbmKeys.emplace_back(key); - } - return offset; - } - if (channelId != TRAIN_CHANNEL_ID) { - return offset; - } - if (evictDevPos.size() != 0) { // 优先复用hbm表 - offset = evictDevPos.back(); - keyOffsetMap[key] = offset; - LOG_TRACE("ddr mode, dev evictDevPos is not null, key [{}] reuse offset [{}], evictSize [{}]", - key, offset, evictDevPos.size()); - evictDevPos.pop_back(); - LOG_ERROR("dev evicted offset = {}", offset); - return offset; - } - - if (evictHostPos.size() != 0) { // hbm不足,再复用host/ddr表 - offset = evictHostPos.back(); - keyOffsetMap[key] = offset; - LOG_TRACE("ddr mode, host evictPos is not null, key [{}] reuse offset [{}], evictSize [{}]", - key, 
offset, evictHostPos.size()); - evictHostPos.pop_back(); - LOG_TRACE("host evicted offset = {}", offset); - return offset; - } - keyOffsetMap[key] = maxOffset; - offset = maxOffset; - maxOffset++; - if (maxOffset == devVocabSize) { - LOG_INFO("start using host vocab!"); - } - if (maxOffset > (hostVocabSize + devVocabSize)) { - LOG_ERROR("hostVocabSize too small! dev:{} host:{}", devVocabSize, hostVocabSize); - throw runtime_error("hostVocabSize too small"); - } - return offset; -} - -void EmbeddingDDR::UpdateBatchId(const vector& keys, size_t currentBatchId) -{ - for (size_t i = 0; i < keys.size(); i++) { - size_t offset; - emb_key_t key = keys[i]; - if (key == -1) { - continue; - } - const auto& iter = keyOffsetMap.find(key); - if (iter != keyOffsetMap.end()) { - offset = iter->second; - - LOG_TRACE("key will be used, {} , offset , {}", key, offset); - if (offset < devVocabSize) { - // devOffset2Batch size equal to devVocabSize, unnecessary to check index boundary - devOffset2Batch[offset] = static_cast(currentBatchId); - } - } - } -} - -/// 利用devOffset2Batch上key最近使用的batchId,来选择需要淘汰的key,记录淘汰位置和device侧所需的keys -/// \param embName 表名 -/// \param key 输入特征 -/// \param hostOffset 全局偏移 -/// \param currentBatchId 已处理的batch数 -/// \param keepBatchId 处理batch的次数,多个预取一起处理算一次 -/// \return 是否找到需要交换的位置 -emb_key_t EmbeddingDDR::FindSwapPosOld(emb_key_t key, size_t hostOffset, size_t batchId, - std::vector& swapPos) -{ - bool notFind = true; - emb_key_t offset = INVALID_KEY_VALUE; - while (notFind) { - // 找到本次预取之前的偏移(保证所有预取batch的key都在HBM中) - if (currentUpdatePos >= devOffset2Batch.size()) { - LOG_ERROR("outofrange {} >= {}", currentUpdatePos, devOffset2Batch.size()); - throw runtime_error("currentUpdatePos out of range"); - } - - if (devOffset2Batch[currentUpdatePos] < static_cast(batchId)) { - devOffset2Batch[currentUpdatePos] = static_cast(batchId); - swapPos.emplace_back(currentUpdatePos); // 记录需要被换出的HBM偏移 - offset = currentUpdatePos; - keyOffsetMap[key] = currentUpdatePos; // 更新key对应的HBM偏移 - // 记录HBM偏移之前的key - devOffset2KeyOld.emplace_back(currentUpdatePos, devOffset2Key[currentUpdatePos]); - auto& oldKey = devOffset2Key[currentUpdatePos]; - oldSwap.emplace_back(oldKey, key); // 记录交换的两个key oldKey:HBM->DDR key:DDR->HBM - keyOffsetMap[oldKey] = hostOffset; // 更新被替换的key的偏移 - oldKey = key; - notFind = false; - } - currentUpdatePos++; // 查找位置+1 - freeSize_--; // HBM可用空间-1 - - // 遍历完一遍整个HBM表后,从头开始遍历 - if (currentUpdatePos == devVocabSize) { - currentUpdatePos = 0; - } - - /** - * currentUpdatePos已经绕了HBM一圈 - * 已经找完整个HBM空间,且没找到可用位置,表示HBM空间不足以放下整个batch(预取batch数)的key, - * 无法正常执行训练,故运行时错误退出 - */ - if (currentUpdatePos == currentUpdatePosStart && notFind) { - LOG_ERROR("devVocabSize is too small"); - throw runtime_error("devVocabSize is too small"); - } - } - return offset; -} - /* * 删除淘汰key的映射关系,并将其offset更新到evictPos,待后续复用 */ void EmbeddingDDR::EvictDeleteEmb(const vector& keys) { - EASY_FUNCTION() - size_t keySize = keys.size(); - vector evictHBMKeys; - vector evictDDRKeys; - for (size_t i = 0; i < keySize; ++i) { - size_t offset; - emb_key_t key = keys[i]; - if (key == INVALID_KEY_VALUE) { - LOG_WARN("evict key equal -1!"); - continue; - } - const auto& iter = keyOffsetMap.find(key); - if (iter == keyOffsetMap.end()) { - // 淘汰依据keyProcess中的history,hashmap映射关系创建于ParseKey;两者异步,造成淘汰的值在hashmap里可能未创建 - continue; - } - offset = iter->second; - keyOffsetMap.erase(iter); - LOG_TRACE("evict embName {}, offset {}", name, offset); - - if (offset < devVocabSize) { - // offset 在device中 - devOffset2Batch[offset] = -1; - 
devOffset2KeyOld.emplace_back(offset, devOffset2Key[offset]); - devOffset2Key[offset] = -1; - evictDevPos.emplace_back(offset); - evictHBMKeys.emplace_back(key); - } else { - // offset 在Host - evictHostPos.emplace_back(offset); - evictDDRKeys.emplace_back(key); // 删除映射表、初始化host表、发送dev淘汰位置 - } - } - if (isSSDEnabled_) { - cacheManager_->RefreshFreqInfoCommon(name, evictHBMKeys, TransferType::HBM_2_EVICT); - cacheManager_->RefreshFreqInfoCommon(name, evictDDRKeys, TransferType::DDR_2_EVICT); - } - - LOG_INFO("ddr EvictDeleteEmb, emb: [{}], hostEvictSize: {}, devEvictSize: {}", - name, evictHostPos.size(), evictDevPos.size()); - LOG_TRACE("keyOffsetMap, {}", MapToString(keyOffsetMap)); } /// DDR模式下的淘汰:删除映射表、初始化host表、发送dev淘汰位置 @@ -267,57 +61,27 @@ void EmbeddingDDR::EvictDeleteEmb(const vector& keys) /// \param keys void EmbeddingDDR::EvictKeys(const vector& keys) { - EASY_FUNCTION() - for (const emb_key_t& key : keys) { - size_t offset; - if (key == INVALID_KEY_VALUE) { - LOG_WARN("evict key equal -1!"); - continue; - } - const auto& iter = keyOffsetMap.find(key); - if (iter == keyOffsetMap.end()) { - continue; - } - // 淘汰依据keyProcess中的history,hashmap映射关系创建于ParseKey;两者异步,造成淘汰的值在hashmap里可能未创建 - offset = iter->second; - keyOffsetMap.erase(iter); - LOG_TRACE("evict embName {}, offset {}", name, offset); - - if (offset < devVocabSize) { - devOffset2Batch[offset] = INVALID_KEY_VALUE; - devOffset2KeyOld.emplace_back(offset, devOffset2Key[offset]); - devOffset2Key[offset] = INVALID_KEY_VALUE; - evictDevPos.emplace_back(offset); - } else { - evictHostPos.emplace_back(offset); - } - } } -void EmbeddingDDR::ClearLookupAndSwapOffset() +void EmbeddingDDR::Load(const string& savePath, map>& trainKeySet) { - ddr2HbmKeys.clear(); -} + vector keys; + vector> embeddings; + vector> optimizerSlots; -void EmbeddingDDR::SetStartCount() -{ - currentUpdatePosStart = currentUpdatePos; - freeSize_ = devVocabSize; -} + LoadKey(savePath, keys); + LoadEmbedding(savePath, embeddings); + LoadOptimizerSlot(savePath, optimizerSlots); -void EmbeddingDDR::Load(const string& savePath) -{ - LoadKey(savePath); - LoadEmbAndOptim(savePath); -} + auto rc = embCache->LoadEmbTableInfos(name, keys, embeddings, optimizerSlots); + if (rc != 0) { + throw runtime_error("embCache->LoadEmbTableInfos failed, err code:" + to_string(rc)); + } -void EmbeddingDDR::Save(const string& savePath) -{ - SaveKey(savePath); - SaveEmbAndOptim(savePath); + trainKeySet[name].insert(keys.cbegin(), keys.cend()); } -void EmbeddingDDR::LoadKey(const string& savePath) +void EmbeddingDDR::LoadKey(const string &savePath, vector &keys) { stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; @@ -325,108 +89,170 @@ void EmbeddingDDR::LoadKey(const string& savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); + size_t fileSize = 0; + try { + fileSize = fileSystemPtr->GetFileSize(ss.str()); + } catch (exception& e) { + string errMsg = StringFormat("open file failed:%s, error code:%d", ss.str().c_str(), strerror(errno)); + throw runtime_error(errMsg); + } if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error(StringFormat("Error: Load keys failed. 
file {} size {} is too big.", ss.str(), fileSize)); + string errMsg = StringFormat("file:%s, size:%d is too big", ss.str().c_str(), fileSize); + throw runtime_error(errMsg); } - int64_t* buf = static_cast(malloc(fileSize)); + // 暂时向HBM兼容,转成int64_t,后续再归一key类型为uint64_t + auto buf = static_cast(malloc(fileSize)); if (buf == nullptr) { - throw runtime_error(StringFormat("Error: Load keys failed. " - "failed to allocate {} bytes using malloc.", fileSize)); + string errMsg = StringFormat("malloc buffer failed, error code:%d", strerror(errno)); + throw runtime_error(errMsg); } - - ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); - if (res == -1) { + ssize_t result = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); + if (result == -1) { free(static_cast(buf)); - throw runtime_error(StringFormat("Error: Load keys failed. " - "An error occurred while reading file: {}.", ss.str())); + string errMsg = StringFormat("read buffer failed, error code:%d", strerror(errno)); + throw runtime_error(errMsg); } - if (res != fileSize) { + if (result != fileSize) { free(static_cast(buf)); throw runtime_error(StringFormat("Error: Load keys failed. Expected to read {} bytes, " - "but actually read {} bytes to file {}.", fileSize, res, ss.str())); + "but actually read {} bytes to file {}.", fileSize, result, ss.str())); } - size_t loadKeySize = fileSize / sizeof(int64_t); - - // key优先加载至device - loadOffset.clear(); hostLoadOffset.clear(); - int keyCount = 0; - for (int i = 0; i < loadKeySize; i = i + 1) { + size_t loadKeySize = fileSize / sizeof(int64_t); + for (size_t i = 0; i < loadKeySize; i++) { + // 分配到不同的卡 if (buf[i] % rankSize_ != rankId_) { continue; } - if (keyCount > devVocabSize + hostVocabSize) { - free(static_cast(buf)); - throw runtime_error(StringFormat("Error: Load keys failed. Load key size :{} , " - "exceeds the sum of device vocab size and host vocab size: {}.", - keyCount, devVocabSize + hostVocabSize)); - } else if (keyCount < devVocabSize) { - loadOffset.push_back(i); - devOffset2Key[keyCount] = buf[i]; - } else { - hostLoadOffset.push_back(i); - } - keyOffsetMap[buf[i]] = keyCount; - keyCount++; + hostLoadOffset.emplace_back(i); + keys.emplace_back(static_cast(buf[i])); } - maxOffset = keyOffsetMap.size(); + free(static_cast(buf)); + LOG_DEBUG("load key done, table:{}", name); } -void EmbeddingDDR::LoadEmbAndOptim(const string& savePath) +void EmbeddingDDR::LoadEmbedding(const string &savePath, vector> &embeddings) { + // must init first + for (size_t i = 0; i < hostLoadOffset.size(); i++) { + vector tmp(embSize_); + embeddings.emplace_back(tmp); + } + stringstream ss; ss << savePath << "/" << name; unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - HostEmb *hostEmbs = Singleton::GetInstance(); - HostEmbTable &table = hostEmbs->GetEmb(name); - if (table.embData.empty()) { - LOG_ERROR("hostEmb data is empty"); - return; - } - - // 读embedding stringstream embedStream; embedStream << ss.str() << "/" << "embedding/slice.data"; + ssize_t res = fileSystemPtr->Read(embedStream.str(), embeddings, 0, hostLoadOffset, embSize_); + LOG_DEBUG("load embedding done, table:{}, read bytes:{}", name, res); +} - size_t readSize = hostLoadOffset.size() * embSize_ * sizeof(float); - ssize_t res = fileSystemPtr->Read(embedStream.str(), table.embData, 0, hostLoadOffset, embSize_); - if (res == -1) { - throw runtime_error(StringFormat("Error: Load embeddings failed. 
An error occurred while reading file: {}.", - embedStream.str())); - } - if (res != readSize) { - throw runtime_error(StringFormat("Error: Load embeddings failed. Expected to read {} bytes, " - "but actually read {} bytes to file {}.", readSize, res, embedStream.str())); +void EmbeddingDDR::LoadOptimizerSlot(const string &savePath, vector> &optimizerSlots) +{ + // must init first + for (size_t i = 0; i < hostLoadOffset.size(); i++) { + vector tmp(extEmbSize_ - embSize_); + optimizerSlots.emplace_back(tmp); } - // 读optim - int64_t optimIndex = 1; + stringstream ss; + ss << savePath << "/" << name; + + unique_ptr fileSystemHandler = make_unique(); + unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); + + int64_t slotIdx = 0; for (const auto ¶m: optimParams) { stringstream paramStream; paramStream << ss.str() << "/" << optimName + "_" + param << "/slice.data"; + ssize_t res = fileSystemPtr->Read(paramStream.str(), optimizerSlots, slotIdx, hostLoadOffset, embSize_); + slotIdx++; + LOG_DEBUG("load optimizer slot, table:{}, slot:{}, read bytes:{}", name, param, res); + } + + LOG_DEBUG("load optimizer slot done, table:{}", name); +} + +void EmbeddingDDR::Save(const string& savePath) +{ + SyncLatestEmbedding(); + + vector keys; + vector> embeddings; + vector> optimizerSlots; + embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots); + + SaveKey(savePath, keys); + SaveEmbedding(savePath, embeddings); + SaveOptimizerSlot(savePath, optimizerSlots, keys.size()); +} - ssize_t res = fileSystemPtr->Read(paramStream.str(), table.embData, optimIndex, hostLoadOffset, embSize_); - if (res == -1) { - throw runtime_error(StringFormat("Error: Load optimizers failed. An error occurred while reading file: {}.", - paramStream.str())); +void EmbeddingDDR::SyncLatestEmbedding() +{ + // 导出host记录的存在于npu的embedding + std::vector> koVec; + int rc = embCache->ExportDeviceKeyOffsetPairs(name, koVec); + if (rc != ock::ctr::H_OK) { + string errMsg = StringFormat("ExportDeviceKeyOffsetPairs failed, table:%s, error code:%d", name.c_str(), rc); + throw std::invalid_argument(errMsg); + } + std::vector swapOutKeys; + for (const auto& p : koVec) { + swapOutKeys.push_back(p.first); + } + LOG_DEBUG("save swapOutKeys.size:{}, table:{}", swapOutKeys.size(), name); + + // 接收python save接口发送的卡内embedding + auto size = hdTransfer->RecvAcl(TransferChannel::SAVE_D2H, TRAIN_CHANNEL_ID, name, 0, -1); + LOG_DEBUG("save acltdtGetDatasetSize, size: {}, table:{}", size, name); + auto aclData = acltdtGetDataItem(hdTransfer->aclDatasets[name][0], 0); + if (aclData == nullptr) { + throw runtime_error("Acl get tensor data from dataset failed."); + } + auto* ptr = reinterpret_cast(acltdtGetDataAddrFromItem(aclData)); + + if (ssdVocabSize == 0) { + // 在保存之前先更新host的embedding + rc = embCache->EmbeddingUpdate(name, swapOutKeys, ptr); + if (rc != ock::ctr::H_OK) { + string errMsg = StringFormat("EmbeddingUpdate failed, table:%s, error code:%d", name.c_str(), rc); + throw std::invalid_argument(errMsg); } - if (res != readSize) { - throw runtime_error(StringFormat("Error: Load embeddings failed. 
Expected to read {} bytes, " - "but actually read {} bytes to file {}.", - readSize, res, paramStream.str())); + } else { + // 在保存之前先更新ddr和ssd的embedding + SwapOutInfo info; + cacheManager_->ProcessSwapOutKeys(name, swapOutKeys, info); + vector swapOutAddrs; + rc = embCache->EmbeddingLookupAddrs(name, info.swapOutDDRKeys, swapOutAddrs); + if (rc != ock::ctr::H_OK) { + string errMsg = StringFormat("EmbeddingLookupAddrs failed, table:%s, error code:%d", name.c_str(), rc); + throw std::invalid_argument(errMsg); + } + uint32_t extEmbeddingSize = embInfo_.extEmbeddingSize; + uint32_t memSize = extEmbeddingSize * sizeof(float); + // DDR更新 +#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ + shared(swapOutAddrs, info, ptr, extEmbeddingSize, memSize) + for (uint64_t i = 0; i < swapOutAddrs.size(); i++) { + int errCode = memcpy_s( + swapOutAddrs[i], memSize, ptr + info.swapOutDDRAddrOffs[i] * extEmbeddingSize, memSize); + if (errCode != 0) { + string errMsg = StringFormat("memcpy_s failed, table:%s, error code:%d", name.c_str(), errCode); + throw std::invalid_argument(errMsg); + } } - optimIndex++; + cacheManager_->UpdateSSDEmb(name, ptr, embInfo_.extEmbeddingSize, info.swapOutSSDKeys, info.swapOutSSDAddrOffs); } } -void EmbeddingDDR::SaveKey(const string& savePath) +void EmbeddingDDR::SaveKey(const string& savePath, vector& keys) { stringstream ss; ss << savePath << "/" << name << "/key/"; @@ -436,45 +262,17 @@ void EmbeddingDDR::SaveKey(const string& savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - hostKey.clear(); - hostOffset.clear(); - deviceKey.clear(); - deviceOffset.clear(); - - for (const auto& it: keyOffsetMap) { - if (it.second >= devVocabSize) { - hostKey.push_back(it.first); - hostOffset.push_back(it.second); - } else { - deviceKey.push_back(it.first); - deviceOffset.push_back(it.second); - } - } + // 暂时向HBM兼容,转成int64_t,后续再归一key类型为uint64_t + vector keysCompat(keys.cbegin(), keys.cend()); - size_t writeSize = static_cast(hostKey.size() * sizeof(int64_t)); - ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(hostKey.data()), writeSize); + ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast(keysCompat.data()), + static_cast(keys.size() * sizeof(int64_t))); if (res == -1) { - throw runtime_error(StringFormat("Error: Save keys failed. " - "An error occurred while writing file: {}.", ss.str())); - } - if (res != writeSize) { - throw runtime_error(StringFormat("Error: Save keys failed. Expected to write {} bytes, " - "but actually write {} bytes to file {}.", writeSize, res, ss.str())); - } - - writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); - res = fileSystemPtr->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); - if (res == -1) { - throw runtime_error(StringFormat("Error: Save keys failed. " - "An error occurred while writing file: {}.", ss.str())); - } - if (res != writeSize) { - throw runtime_error(StringFormat("Error: Save keys failed. 
Expected to write {} bytes, " - "but actually write {} bytes to file {}.", writeSize, res, ss.str())); + throw runtime_error("save key failed!"); } } -void EmbeddingDDR::SaveEmbData(const string& savePath) +void EmbeddingDDR::SaveEmbedding(const string& savePath, vector>& embeddings) { stringstream ss; ss << savePath << "/" << name << "/embedding/"; @@ -484,157 +282,68 @@ void EmbeddingDDR::SaveEmbData(const string& savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - size_t writeSize = embSize_ * sizeof(float) * embContent.size(); - ssize_t res = fileSystemPtr->Write(ss.str(), embContent, embSize_ * sizeof(float)); - if (res == -1) { - throw runtime_error(StringFormat("Error: Save embeddings failed. " - "An error occurred while writing file: {}.", ss.str())); - } - if (res != writeSize) { - throw runtime_error(StringFormat("Error: Save embeddings failed. Expected to write {} bytes, " - "but actually write {} bytes to file {}.", writeSize, res, ss.str())); + ssize_t writeBytesNum = fileSystemPtr->Write(ss.str(), embeddings, embSize_); + ssize_t expectWriteBytes = embeddings.size() * embSize_ * sizeof(float); + if (writeBytesNum != expectWriteBytes) { + string errMsg = StringFormat("save embedding failed, write expect:%d, actual:%d, path:%s", + expectWriteBytes, writeBytesNum, savePath.c_str()); + throw runtime_error(errMsg); } } -void EmbeddingDDR::SaveOptimData(const string& savePath) +void EmbeddingDDR::SaveOptimizerSlot(const string& savePath, vector>& optimizerSlots, size_t keySize) { - for (const auto &content: optimContentMap) { + if (optimizerSlots.size() != keySize) { + string errMsg = StringFormat("optimizer slot data size not equal to key size, " + "optimizerSlots.size:%d, keySize:%d", + optimizerSlots.size(), keySize); + throw runtime_error(errMsg); + } + + size_t slotIdx = 0; + for (const auto &slotName: optimParams) { stringstream ss; - ss << savePath << "/" << name << "/" << optimName + "_" + content.first << "/"; + ss << savePath << "/" << name << "/" << optimName + "_" + slotName << "/"; MakeDir(ss.str()); ss << "slice_" << rankId_ << ".data"; unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - size_t writeSize = embSize_ * sizeof(float) * content.second.size(); - ssize_t res = fileSystemPtr->Write(ss.str(), content.second, embSize_ * sizeof(float)); - if (res == -1) { - throw runtime_error(StringFormat("Error: Save optimizers failed. " - "An error occurred while writing file: {}.", ss.str())); + vector> slotData; + for (const auto &data: optimizerSlots) { + vector tmp(data.cbegin() + slotIdx * embSize_, data.cbegin() + (slotIdx+1) * embSize_); + slotData.emplace_back(tmp); } - if (res != writeSize) { - throw runtime_error(StringFormat("Error: Save optimizers failed. 
Expected to write {} bytes, " - "but actually write {} bytes to file {}.", writeSize, res, ss.str())); + ssize_t writeBytesNum = fileSystemPtr->Write(ss.str(), slotData, embSize_); + ssize_t expectWriteBytes = slotData.size() * embSize_ * sizeof(float); + if (writeBytesNum != expectWriteBytes) { + string errMsg = StringFormat("save optimizer slot failed, write expect:%d, actual:%d, path:%s", + expectWriteBytes, writeBytesNum, savePath.c_str()); + throw runtime_error(errMsg); } - } -} -void EmbeddingDDR::SaveEmbAndOptim(const string& savePath) -{ - HostEmb *hostEmbs = Singleton::GetInstance(); - HostEmbTable &table = hostEmbs->GetEmb(name); - if (table.embData.empty()) { - LOG_ERROR("host embedding data is empty"); + slotIdx++; } - embContent.clear(); - for (const string ¶m: optimParams) { - optimContentMap[param].clear(); - } - for (int64_t &offset: hostOffset) { - embContent.push_back(table.embData[offset - devVocabSize].data()); - int optim_param_count = 1; - for (const string ¶m: optimParams) { - optimContentMap[param].push_back(table.embData[offset - devVocabSize].data() + - sizeof(float) * embSize_ * optim_param_count); - optim_param_count++; - } - } - SaveEmbData(savePath); - SaveOptimData(savePath); } - vector EmbeddingDDR::GetDeviceOffset() { - return deviceOffset; + throw runtime_error("GetDeviceOffset deprecated in ddr/ssd mode"); } void EmbeddingDDR::SetOptimizerInfo(OptimizerInfo& optimizerInfo) { optimName = optimizerInfo.optimName; optimParams = optimizerInfo.optimParams; - for (const string ¶m: optimParams) { - optimContentMap[param] = vector{}; - } } void EmbeddingDDR::SetCacheManager(CacheManager *cm) { + LOG_DEBUG("set CacheManager"); cacheManager_ = cm; } -void EmbeddingDDR::AddKeyFreqInfo(const emb_key_t& key, RecordType type) -{ - if (!isSSDEnabled_) { - return; - } - cacheManager_->PutKey(name, key, type); -} - -void EmbeddingDDR::RefreshFreqInfoWithSwap() -{ - if (!isSSDEnabled_) { - return; - } - // 换入换出key列表,元素为pair: pair oldKey为从HBM移出的key, key为从DDR移出的key - LOG_DEBUG("RefreshFreqInfoWithSwap, table:{}, oldSwap Size:{}", name, oldSwap.size()); - vector enterDDRKeys; - for (auto keyPair : oldSwap) { - enterDDRKeys.emplace_back(keyPair.first); - } - cacheManager_->RefreshFreqInfoCommon(name, enterDDRKeys, TransferType::HBM_2_DDR); - cacheManager_->RefreshFreqInfoCommon(name, ddr2HbmKeys, TransferType::DDR_2_HBM); - - AddCacheManagerTraceLog(); -} - -/// 记录日志:HBM和DDR换入换出后,比较hostHashMap中DDR内key和表对应的lfuCache对象中的key内容 -void EmbeddingDDR::AddCacheManagerTraceLog() const -{ - if (Logger::GetLevel() != Logger::TRACE) { - return; - } - auto& hostMap = keyOffsetMap; - auto& devSize = devVocabSize; - auto iter = cacheManager_->ddrKeyFreqMap.find(name); - if (iter == cacheManager_->ddrKeyFreqMap.end()) { - throw runtime_error("table not in ddrKeyFreqMap"); - } - auto &lfu = iter->second; - const auto& lfuTab = lfu.GetFreqTable(); - if (lfuTab.empty()) { - return; - } - size_t tableKeyInDdr = 0; - vector ddrKeys; // 获取hostHashMap中保存在DDR的key - for (const auto& item : hostMap) { - if (item.second < devSize) { - continue; - } - ddrKeys.emplace_back(item.first); - ++tableKeyInDdr; - } - vector lfuKeys; - for (const auto& it : lfuTab) { - lfuKeys.emplace_back(it.first); - } - std::sort(ddrKeys.begin(), ddrKeys.end()); - std::sort(lfuKeys.begin(), lfuKeys.end()); - std::string ddrKeysString = VectorToString(ddrKeys); - std::string lfuKeysString = VectorToString(lfuKeys); - if (ddrKeysString != lfuKeysString) { - LOG_ERROR("swap HBM with DDR step error, key string not equal, table:{}, 
ddrKeysString:{}, lfuKeysString:{}", - name, ddrKeysString, lfuKeysString); - } else { - LOG_INFO("swap HBM with DDR step OK, table:{}, ddrKeysString == lfuKeysString, string length:{}", - name, lfuKeysString.length()); - } - - LOG_INFO("swap HBM with DDR step end, table:{}, tableKeyInDdr:{}, tableKeyInLfu:{}", - name, tableKeyInDdr, lfu.keyTable.size()); -} - TableInfo EmbeddingDDR::GetTableInfo() { TableInfo ti = { @@ -643,42 +352,16 @@ TableInfo EmbeddingDDR::GetTableInfo() .devVocabSize=devVocabSize, .maxOffset=maxOffset, .keyOffsetMap=keyOffsetMap, - .evictDevPos=evictDevPos, - .evictHostPos=evictHostPos, }; return ti; } -void EmbeddingDDR::RefreshFreqInfoAfterLoad() +void EmbeddingDDR::SetHDTransfer(HDTransfer *hdTransfer) { - vector h2d; - vector d2h; - - for (const auto& it: cacheManager_->ddrKeyFreqMap[name].keyTable) { - auto key = it.first; - auto iter = keyOffsetMap.find(key); - if (iter == keyOffsetMap.end()) { - throw runtime_error("ddrKeyFreqMap key not in keyOffsetMap"); - } - auto offset = iter->second; - if (offset < devVocabSize) { - d2h.emplace_back(key); - } - } - for (const auto& it: cacheManager_->excludeDDRKeyCountMap[name]) { - auto key = it.first; - auto iter = keyOffsetMap.find(key); - if (iter == keyOffsetMap.end()) { - continue; - } - auto offset = iter->second; - if (offset >= devVocabSize) { - h2d.emplace_back(key); - } - } - - cacheManager_->RefreshFreqInfoCommon(name, h2d, TransferType::HBM_2_DDR); - cacheManager_->RefreshFreqInfoCommon(name, d2h, TransferType::DDR_2_HBM); + this->hdTransfer = hdTransfer; +} - LOG_DEBUG("RefreshFreqInfoAfterLoad done"); +void EmbeddingDDR::SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache) +{ + this->embCache = embCache; } diff --git a/src/core/emb_table/embedding_ddr.h b/src/core/emb_table/embedding_ddr.h index ab7cc3fb..ac5c5878 100644 --- a/src/core/emb_table/embedding_ddr.h +++ b/src/core/emb_table/embedding_ddr.h @@ -34,45 +34,35 @@ public: virtual int64_t capacity() const; - virtual std::vector FindOffset(const vector& keys, - size_t batchId, int channelId, - std::vector& swapPos); + virtual void EvictKeys(const vector& keys); - emb_key_t FindOffsetHelper(const emb_key_t& key, int channelId); + void Load(const string& savePath, map>& trainKeySet); - void UpdateBatchId(const vector& keys, size_t currentBatchId); + void LoadKey(const string& savePath, vector& keys); - emb_key_t FindSwapPosOld(emb_key_t key, size_t hostOffset, size_t batchId, std::vector& swapPos); + void LoadEmbedding(const string& savePath, vector>& embeddings); - virtual void EvictKeys(const vector& keys); + void LoadOptimizerSlot(const string& savePath, vector>& optimizerSlots); -// std::vector lookUpVec; // 查询结果 + void Save(const string& savePath); - virtual void ClearLookupAndSwapOffset(); + void SyncLatestEmbedding(); - void SetStartCount(); + void SaveKey(const string& savePath, vector& keys); - void Load(const string& savePath); + void SaveEmbedding(const string& savePath, vector>& embeddings); - void Save(const string& savePath); + void SaveOptimizerSlot(const string& savePath, vector>& optimizerSlots, size_t keySize); vector GetDeviceOffset(); void SetOptimizerInfo(OptimizerInfo& optimizerInfo); - void RefreshFreqInfoWithSwap(); - - void AddKeyFreqInfo(const emb_key_t& key, RecordType type); - void SetCacheManager(CacheManager *cm); - void AddCacheManagerTraceLog() const; - TableInfo GetTableInfo(); - void RefreshFreqInfoAfterLoad(); - -GTEST_PRIVATE: + void SetHDTransfer(HDTransfer* hdTransfer); void LoadKey(const string& savePath); void 
LoadEmbAndOptim(const string& savePath); @@ -81,10 +71,11 @@ GTEST_PRIVATE: void SaveEmbData(const string &savePath); void SaveOptimData(const string& savePath); void SaveEmbAndOptim(const string& savePath); + void SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache); - void EvictDeleteEmb(const vector& keys); +GTEST_PRIVATE: - std::vector devOffset2Key; + void EvictDeleteEmb(const vector& keys); size_t maxOffsetOld { 0 }; std::vector evictPosChange; @@ -92,32 +83,16 @@ GTEST_PRIVATE: std::vector> devOffset2KeyOld; std::vector> oldSwap; // (old on dev, old on host) - /* - * HBM与DDR换入换出时,已存在于DDR且要转移到HBM的key(不包含新key); 用于SSD模式 - * (区别于oldSwap: pair.second为已存在于DDR key + 换入换出前映射到DDR的新key) - */ - std::vector ddr2HbmKeys; - std::vector devOffset2Batch; // has -1 - - /** - * 记录HBM上查找空位的当前位置 - * 值域为[0, devVocabSize] - **/ - size_t currentUpdatePos; - size_t currentUpdatePosStart; // 记录HBM上查找空位的起始位置 - - vector hostKey; - vector hostOffset; - vector deviceKey; - vector deviceOffset; - vector embContent; std::string optimName; std::vector optimParams; - std::map> optimContentMap; vector hostLoadOffset; + + HDTransfer *hdTransfer = nullptr; + ock::ctr::EmbCacheManagerPtr embCache = nullptr; + int deviceId = -1; }; } diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index bca77178..78c94862 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -27,7 +27,7 @@ EmbeddingDynamic::EmbeddingDynamic() } EmbeddingDynamic::EmbeddingDynamic(const EmbInfo& info, const RankInfo& rankInfo, int inSeed) - : EmbeddingTable(info, rankInfo, inSeed) + : EmbeddingTable(info, rankInfo, inSeed), deviceId(rankInfo.deviceId) { if (isDynamic_) { auto ret = aclrtSetDevice(static_cast(rankInfo.deviceId)); @@ -197,7 +197,7 @@ void EmbeddingDynamic::SaveEmbData(const string& savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - fileSystemPtr->WriteEmbedding(ss.str(), embSize_, embAddress, rankId_); + fileSystemPtr->WriteEmbedding(ss.str(), embSize_, embAddress, deviceId); } void EmbeddingDynamic::SaveOptimData(const string &savePath) @@ -210,11 +210,11 @@ void EmbeddingDynamic::SaveOptimData(const string &savePath) unique_ptr fileSystemHandler = make_unique(); unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - fileSystemPtr->WriteEmbedding(ss.str(), embSize_, content.second, rankId_); + fileSystemPtr->WriteEmbedding(ss.str(), embSize_, content.second, deviceId); } } -void EmbeddingDynamic::Load(const string& savePath) +void EmbeddingDynamic::Load(const string& savePath, map>& trainKeySet) { LoadKey(savePath); LoadEmbAndOptim(savePath); @@ -240,7 +240,7 @@ void EmbeddingDynamic::LoadEmbAndOptim(const string& savePath) stringstream paramStream; paramStream << ss.str() << "/" << optimName + "_" + param << "/slice.data"; fileSystemPtr->ReadEmbedding(paramStream.str(), embeddingSizeInfo, - firstAddress + optimIndex * embSize_ * sizeof(float), rankId_, loadOffset); + firstAddress + optimIndex * embSize_ * sizeof(float), deviceId, loadOffset); optimIndex++; } } diff --git a/src/core/emb_table/embedding_dynamic.h b/src/core/emb_table/embedding_dynamic.h index 59418229..5cf49718 100644 --- a/src/core/emb_table/embedding_dynamic.h +++ b/src/core/emb_table/embedding_dynamic.h @@ -35,7 +35,7 @@ public: virtual int64_t capacity() const; - void Load(const string& savePath); + void Load(const string& savePath, map>& trainKeySet); void Save(const string& savePath); @@ 
-74,6 +74,7 @@ private: std::string optimName; std::vector optimParams; std::map> optimAddressMap; + int deviceId = -1; int64_t firstAddress; }; diff --git a/src/core/emb_table/embedding_mgmt.cpp b/src/core/emb_table/embedding_mgmt.cpp index f850e254..33e1c671 100644 --- a/src/core/emb_table/embedding_mgmt.cpp +++ b/src/core/emb_table/embedding_mgmt.cpp @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ - #include "emb_table/embedding_mgmt.h" + +#include + #include "emb_table/embedding_static.h" #include "emb_table/embedding_dynamic.h" #include "emb_table/embedding_ddr.h" @@ -25,8 +27,7 @@ EmbeddingMgmt::EmbeddingMgmt() { } -void EmbeddingMgmt::Init(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues, int seed) +void EmbeddingMgmt::Init(const RankInfo& rInfo, const vector& eInfos, int seed) { for (size_t i = 0; i < eInfos.size(); ++i) { if (rInfo.isDDR) { @@ -54,17 +55,7 @@ void EmbeddingMgmt::Key2Offset(const std::string& name, std::vector& size_t EmbeddingMgmt::GetMaxOffset(const std::string& name) { - embeddings[name]->GetMaxOffset(); -} - -void EmbeddingMgmt::LoadMaxOffset(OffsetMemT& loadData) -{ - LOG_ERROR("load max offset"); -} - -void EmbeddingMgmt::LoadKeyOffsetMap(KeyOffsetMemT& loadData) -{ - LOG_ERROR("load key offset"); + return embeddings[name]->GetMaxOffset(); } std::map EmbeddingMgmt::GetMaxOffset() @@ -85,7 +76,7 @@ KeyOffsetMemT EmbeddingMgmt::GetKeyOffsetMap() return keyOffsetMap; } -void EmbeddingMgmt::EvictKeys(const string& name, const vector& keys) +void EmbeddingMgmt::EvictKeys(const string& name, const vector& keys) { LOG_ERROR("evict keys for {}", name); if (keys.size() != 0) { @@ -94,7 +85,7 @@ void EmbeddingMgmt::EvictKeys(const string& name, const vector& keys) embeddings[name]->EvictInitDeviceEmb(); } -void EmbeddingMgmt::EvictKeysCombine(const vector& keys) +void EmbeddingMgmt::EvictKeysCombine(const vector& keys) { if (keys.size() != 0) { for (auto& table: embeddings) { @@ -117,41 +108,16 @@ int64_t EmbeddingMgmt::GetCapacity(const std::string &name) return embeddings[name]->capacity(); } -void EmbeddingMgmt::FindOffset(const std::string& name, const vector& keys, - size_t currentBatchId, size_t keepBatchId, int channel) +void EmbeddingMgmt::Load(const string& name, const string& filePath, + map>& trainKeySet) { - return embeddings[name]->FindOffset(keys, currentBatchId, keepBatchId, channel); + return embeddings[name]->Load(filePath, trainKeySet); } -const std::vector& EmbeddingMgmt::GetMissingKeys(const std::string& name) -{ - return embeddings[name]->GetMissingKeys(); -} - -void EmbeddingMgmt::ClearMissingKeys(const std::string& name) -{ - return embeddings[name]->ClearMissingKeys(); -} - -std::shared_ptr EmbeddingMgmt::GetTable(const string& name) -{ - auto it = embeddings.find(name); - if (it == embeddings.end()) { - LOG_ERROR("table not found"); - } - return std::dynamic_pointer_cast(it->second); -} - -void EmbeddingMgmt::Load(const string& name, const string& filePath) -{ - return embeddings[name]->Load(filePath); -} - - -void EmbeddingMgmt::Load(const string& filePath) +void EmbeddingMgmt::Load(const string& filePath, map>& trainKeySet) { for (auto& tablePair: embeddings) { - tablePair.second->Load(filePath); + tablePair.second->Load(filePath, trainKeySet); } } @@ -162,8 +128,14 @@ void 
EmbeddingMgmt::Save(const string& name, const string& filePath) void EmbeddingMgmt::Save(const string& filePath) { + // use multi-thread to prevent receiving save_d2h blocked when table order different between cpp and python + vector> futures; for (auto& tablePair: embeddings) { - tablePair.second->Save(filePath); + futures.emplace_back( + std::async(std::launch::async, [table = tablePair.second, filePath] { table->Save(filePath); })); + } + for (auto& f: futures) { + f.get(); // get() will repost exception if happened } } @@ -181,18 +153,6 @@ void EmbeddingMgmt::SetOptimizerInfo(const string& name, OptimizerInfo& optimize embeddings[name]->SetOptimizerInfo(optimizerInfo); } -EmbHashMemT EmbeddingMgmt::GetEmbHashMaps() -{ - EmbHashMemT EmbHashMaps; - for (auto& tablePair: embeddings) { - EmbHashMaps[tablePair.first].hostHashMap = tablePair.second ->GetKeyOffsetMap(); - EmbHashMaps[tablePair.first].devVocabSize = tablePair.second ->GetDevVocabSize(); - EmbHashMaps[tablePair.first].hostVocabSize = tablePair.second ->GetHostVocabSize(); - EmbHashMaps[tablePair.first].maxOffset = tablePair.second ->GetMaxOffset(); - } - return EmbHashMaps; -} - OffsetMapT EmbeddingMgmt::GetLoadOffsets() { OffsetMapT AllLoadOffsets; @@ -209,25 +169,16 @@ void EmbeddingMgmt::SetCacheManagerForEmbTable(CacheManager* cacheManager) } } -void EmbeddingMgmt::EnableSSD() +void EmbeddingMgmt::SetHDTransferForEmbTable(HDTransfer* hdTransfer) { for (auto& table: embeddings) { - table.second->EnableSSD(); + table.second->SetHDTransfer(hdTransfer); } } -void EmbeddingMgmt::LockSave() +void EmbeddingMgmt::SetEmbCacheForEmbTable(const ock::ctr::EmbCacheManagerPtr& embCache) { for (auto& table: embeddings) { - table.second->mutSave_.lock(); + table.second->SetEmbCache(embCache); } - LOG_DEBUG("LockSave"); } - -void EmbeddingMgmt::UnLockSave() -{ - for (auto& table: embeddings) { - table.second->mutSave_.unlock(); - } - LOG_DEBUG("UnLockSave"); -} \ No newline at end of file diff --git a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h index d091bdef..ef106786 100644 --- a/src/core/emb_table/embedding_mgmt.h +++ b/src/core/emb_table/embedding_mgmt.h @@ -34,8 +34,7 @@ public: * @param[in] rInfo 从python侧传过了的rank信息 * @param[in] eInfos 从python侧传过了的embedding表信息 */ - void Init(const RankInfo& rInfo, const vector& eInfos, - const vector& thresholdValues = {}, int seed = 0); + void Init(const RankInfo& rInfo, const vector& eInfos, int seed = 0); /** * 从embedding表中查批量查找key @@ -45,29 +44,18 @@ public: */ void Key2Offset(const std::string& name, std::vector& keys, int channel); - void FindOffset(const std::string& name, const vector& keys, - size_t currentBatchId, size_t keepBatchId, int channel); - /** * 在指定的embedding表中淘汰key * @param[in] name embedding表名 * @param[in] keys 待淘汰的key */ - void EvictKeys(const std::string& name, const vector& keys); + void EvictKeys(const std::string& name, const vector& keys); /** * 在全部的embedding表中淘汰key * @param[in] keys 待淘汰的key */ - void EvictKeysCombine(const vector& keys); - - const std::vector& GetMissingKeys(const std::string& name); - - void ClearMissingKeys(const std::string& name); - - void LoadMaxOffset(OffsetMemT& loadData); - - void LoadKeyOffsetMap(KeyOffsetMemT& loadData); + void EvictKeysCombine(const vector& keys); size_t GetMaxOffset(const std::string& name); @@ -81,17 +69,15 @@ public: static EmbeddingMgmt* Instance(); - std::shared_ptr GetTable(const string& name); - - /** + /** * 加载单个表 */ - void Load(const string& name, const string& filePath); + void Load(const 
string& name, const string& filePath, map>& trainKeySet); /** * 加载所有表 */ - void Load(const string& filePath); + void Load(const string& filePath, map>& trainKeySet); /** * 保存单个表 @@ -113,8 +99,6 @@ public: */ OffsetMapT GetLoadOffsets(); - EmbHashMemT GetEmbHashMaps(); - /** * 设置某张表的优化器信息 */ @@ -122,11 +106,9 @@ public: void SetCacheManagerForEmbTable(CacheManager* cacheManager); - void EnableSSD(); - - void LockSave(); + void SetHDTransferForEmbTable(HDTransfer* hdTransfer); - void UnLockSave(); + void SetEmbCacheForEmbTable(const ock::ctr::EmbCacheManagerPtr& embCache); private: EmbeddingMgmt(); diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index 312b8a77..fdda5ede 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -106,7 +106,7 @@ void EmbeddingStatic::SaveKey(const string& savePath) } } -void EmbeddingStatic::Load(const string& savePath) +void EmbeddingStatic::Load(const string& savePath, map>& trainKeySet) { LoadKey(savePath); } diff --git a/src/core/emb_table/embedding_static.h b/src/core/emb_table/embedding_static.h index 965bce0e..6515f586 100644 --- a/src/core/emb_table/embedding_static.h +++ b/src/core/emb_table/embedding_static.h @@ -35,7 +35,7 @@ public: virtual int64_t capacity() const; - void Load(const string& savePath); + void Load(const string& savePath, map>& trainKeySet); void Save(const string& savePath); diff --git a/src/core/emb_table/embedding_table.cpp b/src/core/emb_table/embedding_table.cpp index 7cfc125e..1579282f 100644 --- a/src/core/emb_table/embedding_table.cpp +++ b/src/core/emb_table/embedding_table.cpp @@ -27,7 +27,7 @@ EmbeddingTable::EmbeddingTable() EmbeddingTable::EmbeddingTable(const EmbInfo& info, const RankInfo& rankInfo, int inSeed) : name(info.name), hostVocabSize(info.hostVocabSize), devVocabSize(info.devVocabSize), - freeSize_(0), maxOffset(0), isDynamic_(rankInfo.useDynamicExpansion), + ssdVocabSize(info.ssdVocabSize), freeSize_(0), maxOffset(0), isDynamic_(rankInfo.useDynamicExpansion), embSize_(info.embeddingSize), extEmbSize_(info.extEmbeddingSize), embInfo_(info), seed_(inSeed), rankId_(rankInfo.rankId), rankSize_(rankInfo.rankSize) { @@ -43,19 +43,6 @@ void EmbeddingTable::Key2Offset(std::vector& keys, int channel) return; } -void EmbeddingTable::FindOffset(const vector& keys, - size_t currentBatchId, size_t keepBatchId, int channelId) -{ - return; -} - -std::vector EmbeddingTable::FindOffset(const vector& keys, - size_t batchId, int channelId, - std::vector& swapPos) -{ - return {}; -} - size_t EmbeddingTable::GetMaxOffset() { return maxOffset; @@ -71,7 +58,7 @@ size_t EmbeddingTable::size() const return maxOffset; } -void EmbeddingTable::EvictKeys(const std::vector& keys) +void EmbeddingTable::EvictKeys(const std::vector& keys) { std::lock_guard lk(mut_); // lock for PROCESS_THREAD size_t keySize = keys.size(); @@ -132,40 +119,12 @@ absl::flat_hash_map EmbeddingTable::GetKeyOffsetMap() return keyOffsetMap; } -void EmbeddingTable::ClearMissingKeys() -{ - missingKeysHostPos_.clear(); -} - -const std::vector& EmbeddingTable::GetMissingKeys() -{ - return missingKeysHostPos_; -} - -void EmbeddingTable::SetStartCount() -{ -} - -void EmbeddingTable::ClearLookupAndSwapOffset() -{ -} - -size_t EmbeddingTable::GetDevVocabSize() -{ - return devVocabSize; -} - -size_t EmbeddingTable::GetHostVocabSize() -{ - return hostVocabSize; -} - vector EmbeddingTable::GetLoadOffset() { return loadOffset; } -void EmbeddingTable::Load(const string& filePath) +void 
EmbeddingTable::Load(const string& filePath, map>& trainKeySet) { } @@ -184,15 +143,6 @@ void EmbeddingTable::SetCacheManager(CacheManager *cm) { } -void EmbeddingTable::EnableSSD() -{ - isSSDEnabled_ = true; -} - -void EmbeddingTable::RefreshFreqInfoWithSwap() -{ -} - TableInfo EmbeddingTable::GetTableInfo() { TableInfo ti = { @@ -201,8 +151,6 @@ TableInfo EmbeddingTable::GetTableInfo() .devVocabSize=devVocabSize, .maxOffset=maxOffset, .keyOffsetMap=keyOffsetMap, - .evictDevPos=evictDevPos, - .evictHostPos=evictHostPos, }; return ti; } @@ -214,4 +162,12 @@ vector EmbeddingTable::GetDeviceOffset() void EmbeddingTable::SetOptimizerInfo(OptimizerInfo& optimizerInfo) { -} \ No newline at end of file +} + +void EmbeddingTable::SetHDTransfer(HDTransfer *hdTransfer) +{ +} + +void EmbeddingTable::SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache) +{ +} diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h index 0c05a0a0..1fa9008b 100644 --- a/src/core/emb_table/embedding_table.h +++ b/src/core/emb_table/embedding_table.h @@ -37,21 +37,11 @@ public: */ virtual void Key2Offset(std::vector& keys, int channel); - /** - * DDR模式使用 - */ - virtual void FindOffset(const vector& keys, - size_t currentBatchId, size_t keepBatchId, int channelId); - - virtual std::vector FindOffset(const vector& keys, - size_t batchId, int channelId, - std::vector& swapPos); - /** * 淘汰key, 配合GetEvictedKeys一起使用GetEvictedKeys * EvictKeys执行,通过GetEvictedKeys, GetEvictedKeys拿结果 */ - virtual void EvictKeys(const std::vector& keys); + virtual void EvictKeys(const std::vector& keys); /** * 获取设备侧淘汰的key的偏移或者地址 @@ -73,24 +63,12 @@ public: virtual size_t size() const; - void ClearMissingKeys(); - - virtual const std::vector& GetMissingKeys(); - absl::flat_hash_map GetKeyOffsetMap(); - virtual void SetStartCount(); - - virtual void ClearLookupAndSwapOffset(); - - virtual void Load(const string& savePath); + virtual void Load(const string& savePath, map>& trainKeySet); virtual void Save(const string& savePath); - size_t GetDevVocabSize(); - - size_t GetHostVocabSize(); - static void MakeDir(const string& dirName); virtual vector GetDeviceOffset(); @@ -101,20 +79,20 @@ public: virtual void SetCacheManager(CacheManager* cacheManager); - void EnableSSD(); + virtual TableInfo GetTableInfo(); - virtual void RefreshFreqInfoWithSwap(); + virtual void SetHDTransfer(HDTransfer *hdTransfer); - virtual TableInfo GetTableInfo(); + virtual void SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache); std::string name; size_t hostVocabSize; size_t devVocabSize; + size_t ssdVocabSize; size_t maxOffset; absl::flat_hash_map keyOffsetMap; std::vector evictDevPos; // 记录HBM内被淘汰的key std::vector evictHostPos; // 记录Host内淘汰列表 - std::mutex mutSave_; // 用于保存时锁住KeyOffsetMap #ifdef NDEBUG protected: diff --git a/src/core/file_system/file_system.h b/src/core/file_system/file_system.h index 66c142db..5546c691 100644 --- a/src/core/file_system/file_system.h +++ b/src/core/file_system/file_system.h @@ -31,10 +31,7 @@ namespace MxRec { virtual size_t GetFileSize(const string& filePath) = 0; virtual ssize_t Write(const string& filePath, const char* fileContent, size_t dataSize) = 0; - virtual ssize_t Write(const string& filePath, vector fileContent, size_t dataSize) = 0; - - // In the dynamic expansion mode, embedding is transported to the host side from the device side - // and written into a file. 
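// The Write overload introduced in this hunk takes the row buffer by
// reference (the template arguments are garbled in this dump; from the
// LocalFileSystem change below they are presumably vector<vector<float>>),
// so Save no longer copies every embedding row into the writer. A minimal
// sketch of that row-wise contract under those assumptions, with dataSize
// taken as the byte count of one row; names here are illustrative only.
#include <sys/types.h>
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

ssize_t WriteRows(const std::string& filePath,
                  const std::vector<std::vector<float>>& fileContent,
                  size_t dataSize)
{
    FILE* fp = std::fopen(filePath.c_str(), "wb");
    if (fp == nullptr) {
        throw std::runtime_error("open file to write failed: " + filePath);
    }
    size_t written = 0;
    for (const auto& row : fileContent) {
        written += std::fwrite(row.data(), 1, dataSize, fp);  // one row per call
    }
    std::fclose(fp);
    return static_cast<ssize_t>(written);
}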
+ virtual ssize_t Write(const string& filePath, vector>& fileContent, size_t dataSize) = 0; virtual void WriteEmbedding(const string& filePath, const int& embeddingSize, const vector& addressArr, int deviceId) = 0; diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp index 2c463115..68fc47a8 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp @@ -94,7 +94,7 @@ ssize_t HdfsFileSystem::Write(const string& filePath, const char* fileContent, s return static_cast(writeBytesNum); } -ssize_t HdfsFileSystem::Write(const string& filePath, vector fileContent, size_t dataSize) +ssize_t HdfsFileSystem::Write(const string& filePath, vector>& fileContent, size_t dataSize) { hdfsFS fs = ConnectHdfs(); @@ -107,7 +107,7 @@ ssize_t HdfsFileSystem::Write(const string& filePath, vector fileContent tSize writeBytesNum = 0; size_t loops = fileContent.size(); for (size_t i = 0; i < loops; i++) { - tSize res = hdfs->Write(fs, file, fileContent[i], dataSize); + tSize res = hdfs->Write(fs, file, reinterpret_cast(&fileContent[i]), dataSize); if (res == -1) { hdfs->CloseFile(fs, file); hdfs->Disconnect(fs); @@ -138,6 +138,13 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding } #ifndef GTEST + auto res = aclrtSetDevice(static_cast(deviceId)); + if (res != ACL_ERROR_NONE) { + hdfs->CloseFile(fs, file); + hdfs->Disconnect(fs); + throw runtime_error(StringFormat("Set device failed, device_id:%d", deviceId).c_str()); + } + for (size_t i = 0; i < addressArr.size(); i += embHashNum) { vector row(embeddingSize); int64_t address = addressArr.at(i); @@ -246,6 +253,11 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); } + auto res = aclrtSetDevice(static_cast(deviceId)); + if (res != ACL_ERROR_NONE) { + throw runtime_error(StringFormat("Set device failed, device_id:%d", deviceId).c_str()); + } + float* floatPtr = reinterpret_cast(firstAddress); auto i = 0; for (const auto& offset: offsetArr) { diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.h b/src/core/file_system/hdfs_file_system/hdfs_file_system.h index 8d436d3d..f6c6a489 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.h +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.h @@ -35,7 +35,7 @@ namespace MxRec { size_t GetFileSize(const string& filePath) override; ssize_t Write(const string& filePath, const char* fileContent, size_t dataSize) override; - ssize_t Write(const string& filePath, vector fileContent, size_t dataSize) override; + ssize_t Write(const string& filePath, vector>& fileContent, size_t dataSize) override; void WriteEmbedding(const string& filePath, const int& embeddingSize, const vector& addressArr, int deviceId) override; diff --git a/src/core/file_system/local_file_system/local_file_system.cpp b/src/core/file_system/local_file_system/local_file_system.cpp index 43cd0033..6215d2ac 100644 --- a/src/core/file_system/local_file_system/local_file_system.cpp +++ b/src/core/file_system/local_file_system/local_file_system.cpp @@ -112,44 +112,23 @@ ssize_t LocalFileSystem::Write(const string& filePath, const char* fileContent, return writeBytesNum; } -ssize_t LocalFileSystem::Write(const string& filePath, vector fileContent, size_t dataSize) +ssize_t LocalFileSystem::Write(const string& filePath, 
vector>& fileContent, size_t dataSize) { int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, fileMode); if (fd == -1) { throw runtime_error(StringFormat("open file %s to write failed.", filePath.c_str())); } - buffer.reserve(BUFFER_SIZE); - BufferQueue queue; - ssize_t writeBytesNum = 0; - std::thread writer(&LocalFileSystem::WriterFn, this, std::ref(queue), fd, std::ref(writeBytesNum)); - - size_t loops = fileContent.size(); - for (size_t i = 0; i < loops; i++) { - size_t idx = 0; - size_t writeSize = 0; - size_t dataCol = dataSize; - while (dataCol != 0) { - if (dataCol > oneTimeReadWriteLen) { - writeSize = oneTimeReadWriteLen; - } else { - writeSize = dataCol; - } - FillToBuffer(queue, reinterpret_cast(fileContent[i]) + idx, writeSize); - dataCol -= writeSize; - idx += writeSize; - } + vector flattenContent; + for (auto& vec : fileContent) { + flattenContent.insert(flattenContent.cend(), vec.cbegin(), vec.cend()); } - // After all data has been processed, check if there is any data left in the buffer - if (!buffer.empty()) { - queue.Push(std::move(buffer)); - buffer.clear(); - } + ssize_t writeBytesNum = + write(fd, reinterpret_cast(flattenContent.data()), flattenContent.size() * sizeof(float)); - queue.Push(std::vector()); - writer.join(); close(fd); + return writeBytesNum; } @@ -168,6 +147,12 @@ void LocalFileSystem::WriteEmbedding(const string& filePath, const int& embeddin } #ifndef GTEST + auto res = aclrtSetDevice(static_cast(deviceId)); + if (res != ACL_ERROR_NONE) { + close(fd); + throw runtime_error(StringFormat("Set device failed, device_id:%d", deviceId).c_str()); + } + for (size_t i = 0; i < addressArr.size(); i += keyAddrElem) { vector row(embeddingSize); int64_t address = addressArr.at(i); @@ -271,6 +256,10 @@ void LocalFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& e if (fp == nullptr) { throw runtime_error(StringFormat("Failed to open read file: %s", filePath.c_str())); } + auto res = aclrtSetDevice(static_cast(deviceId)); + if (res != ACL_ERROR_NONE) { + throw runtime_error(StringFormat("Set device failed, device_id:%d", deviceId).c_str()); + } float* floatPtr = reinterpret_cast(firstAddress); auto i = 0; diff --git a/src/core/file_system/local_file_system/local_file_system.h b/src/core/file_system/local_file_system/local_file_system.h index f8eefd5b..9b09f34d 100644 --- a/src/core/file_system/local_file_system/local_file_system.h +++ b/src/core/file_system/local_file_system/local_file_system.h @@ -33,7 +33,7 @@ namespace MxRec { size_t GetFileSize(const string& filePath) override; ssize_t Write(const string& filePath, const char* fileContent, size_t dataSize) override; - ssize_t Write(const string& filePath, vector fileContent, size_t dataSize) override; + ssize_t Write(const string& filePath, vector>& fileContent, size_t dataSize) override; void WriteEmbedding(const string& filePath, const int& embeddingSize, const vector& addressArr, int deviceId) override; diff --git a/src/core/hd_transfer/hd_transfer.cpp b/src/core/hd_transfer/hd_transfer.cpp index a32ddf28..8fc2a282 100644 --- a/src/core/hd_transfer/hd_transfer.cpp +++ b/src/core/hd_transfer/hd_transfer.cpp @@ -50,7 +50,14 @@ int HDTransfer::Init(const vector& embInfos, uint32_t localRankId) CreateChannel(localRankId, embInfo.name, i); } // 创建acltdtDataset类型的数据,对等一个Vector。同步接口。 - aclDatasets[embInfo.name] = acltdtCreateDataset(); + for (int j = 0; j < EMBEDDING_THREAD_NUM; j++) { + acltdtDataset* dataset = acltdtCreateDataset(); + if (dataset == nullptr) { + LOG_ERROR("create 
acltdtDataset failed, table:{}, threadId:{}", embName, j); + throw runtime_error("create acltdtDataset failed"); + } + aclDatasets[embInfo.name][j] = dataset; + } } running = true; LOG(INFO) << "hd_transfer init"; @@ -71,9 +78,11 @@ void HDTransfer::Destroy() } LOG_INFO(HD + "destroy channel:{}", c.first); } - for (auto& d: aclDatasets) { - if (acltdtDestroyDataset(d.second) != ACL_ERROR_NONE) { - throw runtime_error("Acl destroy tensor dataset failed."); + for (auto& datasetMap: aclDatasets) { + for (auto &d: datasetMap.second) { + if (acltdtDestroyDataset(d.second) != ACL_ERROR_NONE) { + throw runtime_error("Acl destroy tensor dataset failed."); + } } } aclFinalize(); @@ -90,16 +99,26 @@ void HDTransfer::CreateChannel(const uint32_t localRankId, const string& embName int channelSize = GlobalEnv::hdChannelSize; LOG_INFO("user config all2all restore lookup channel size:{}", channelSize); for (int c = static_cast(TransferChannel::D2H); c != static_cast(TransferChannel::INVALID); c++) { + if ((c == static_cast(TransferChannel::SWAP) || c == static_cast(TransferChannel::D2H) || + c == static_cast(TransferChannel::H2D)) && channelNum == EVAL_CHANNEL_ID) { + continue; + } + auto channel = static_cast(c); - string sendName = StringFormat( - "%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelNum - ); + std::string sendName; + if (c == static_cast(TransferChannel::SWAP) || c == static_cast(TransferChannel::D2H) || + c == static_cast(TransferChannel::H2D)) { + sendName = StringFormat("%s_%s_all", embName.c_str(), TransferChannel2Str(channel).c_str()); + } else { + sendName = StringFormat("%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelNum); + } if (TransferChannel2Str(channel) == "all2all" || TransferChannel2Str(channel) == "restore" || TransferChannel2Str(channel) == "lookup" || TransferChannel2Str(channel) == "restore_second" || TransferChannel2Str(channel) == "uniquekeys" || - TransferChannel2Str(channel) == "evict" /* for noDDR */ + TransferChannel2Str(channel) == "evict" || + TransferChannel2Str(channel) == "swap" ) { transferChannels[sendName] = TDT_CREATE_CHANNEL(localRankId, sendName.c_str(), channelSize); } else { @@ -128,10 +147,16 @@ void HDTransfer::Send(TransferChannel channel, const vector &tensors, in for (auto& t: tensors) { sizes.push_back(t.NumElements()); } - string sendName = StringFormat("%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelId); - LOG_INFO(HD + "hd transfer send {}, send count is {}, size list:{}", - sendName, sizes.size(), VectorToString(sizes)); + string sendName; + if (channel == TransferChannel::SWAP || channel == TransferChannel::D2H || channel == TransferChannel::H2D) { + sendName = StringFormat("%s_%s_all", embName.c_str(), TransferChannel2Str(channel).c_str()); + } else { + sendName = StringFormat("%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelId); + } + + LOG_INFO(HD + "hd transfer send:{}, batchId:{}, send count:{}, size list:{}", + sendName, batchId, sizes.size(), VectorToString(sizes)); if (sizes.size() == 0) { LOG_WARN("tensors num can not be zero"); @@ -171,9 +196,15 @@ void HDTransfer::Send(TransferChannel channel, const vector &tensors, in vector HDTransfer::Recv(TransferChannel channel, int channelId, const string& embName) { EASY_FUNCTION() + vector tensors; #ifndef GTEST - std::vector tensors; - string recvName = StringFormat("%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelId); + string recvName; + if (channel == 
TransferChannel::SWAP || channel == TransferChannel::D2H || channel == TransferChannel::H2D) { + recvName = StringFormat("%s_%s_all", embName.c_str(), TransferChannel2Str(channel).c_str()); + } else { + recvName = StringFormat("%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelId); + } + LOG_DEBUG("hd transfer try recv:{}", recvName); TimeCost tc = TimeCost(); tensorflow::Status status = tensorflow::RecvTensorByAcl(transferChannels[recvName], tensors); @@ -190,8 +221,8 @@ vector HDTransfer::Recv(TransferChannel channel, int channel sizes.push_back(t.NumElements()); } LOG_INFO("hd transfer recv:{}, size:{} cost:{}ms", recvName, VectorToString(sizes), tc.ElapsedMS()); - return tensors; #endif + return tensors; } /// 接收从device发送过来的数据(D2H), updateEmbV2函数使用;使用原生的aclTDT接口 @@ -199,27 +230,36 @@ vector HDTransfer::Recv(TransferChannel channel, int channel /// \param channelId 通道索引(训练/推理) /// \param embName 表名 /// \return -size_t HDTransfer::RecvAcl(TransferChannel channel, int channelId, const string& embName) +size_t HDTransfer::RecvAcl(TransferChannel channel, int channelId, const string& embName, + int embeddingThreadId, int batchId) { EASY_FUNCTION() + size_t ret = 0; #ifndef GTEST - std::vector tensors; - string recvName = StringFormat("%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelId); - LOG_DEBUG("hd transfer try recv:{}", recvName); + string recvName; + if (channel == TransferChannel::SWAP || channel == TransferChannel::D2H || channel == TransferChannel::H2D) { + recvName = StringFormat("%s_%s_all", embName.c_str(), TransferChannel2Str(channel).c_str()); + } else { + recvName = StringFormat("%s_%s_%d", embName.c_str(), TransferChannel2Str(channel).c_str(), channelId); + } + + LOG_DEBUG("hd transfer try recv:{}, batchId:{}", recvName, batchId); TimeCost tc = TimeCost(); - if (aclDatasets[embName] == nullptr) { + if (aclDatasets[embName][embeddingThreadId] == nullptr) { throw runtime_error(StringFormat("Failed recv:%s.", recvName.c_str()).c_str()); } - auto aclStatus = acltdtReceiveTensor(transferChannels[recvName], aclDatasets[embName], GlobalEnv::aclTimeout); + auto aclStatus = acltdtReceiveTensor( + transferChannels[recvName], aclDatasets[embName][embeddingThreadId], GlobalEnv::aclTimeout); if (!running) { return 0; } if (aclStatus != ACL_ERROR_NONE && aclStatus != ACL_ERROR_RT_QUEUE_EMPTY) { throw runtime_error(StringFormat("Failed receive data from acl channel, acl status:%d", aclStatus).c_str()); } - LOG_INFO("hd transfer recv:{} cost:{}ms", recvName, tc.ElapsedMS()); - return acltdtGetDatasetSize(aclDatasets[embName]); + LOG_INFO("hd transfer recv:{}, batchId:{}, cost:{}ms", recvName, batchId, tc.ElapsedMS()); + ret = acltdtGetDatasetSize(aclDatasets[embName][embeddingThreadId]); #endif + return ret; } std::unordered_map HDTransfer::GetTransChannel() diff --git a/src/core/hd_transfer/hd_transfer.h b/src/core/hd_transfer/hd_transfer.h index f9528578..58c48067 100644 --- a/src/core/hd_transfer/hd_transfer.h +++ b/src/core/hd_transfer/hd_transfer.h @@ -45,6 +45,8 @@ namespace MxRec { EVICT, H2D, SWAP, + SAVE_D2H, + SAVE_H2D, INVALID }; @@ -69,6 +71,10 @@ namespace MxRec { return "h2d"; case TransferChannel::SWAP: return "swap"; + case TransferChannel::SAVE_D2H: + return "save_d2h"; + case TransferChannel::SAVE_H2D: + return "save_h2d"; default: throw std::invalid_argument("Invalid TransferChannel"); } @@ -76,7 +82,7 @@ namespace MxRec { class HDTransfer { public: - std::unordered_map aclDatasets; + std::unordered_map> aclDatasets; 
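// The aclDatasets member just above became a two-level map (reconstructed
// here as unordered_map<string, unordered_map<int, acltdtDataset*>> -- the
// real template arguments were lost in extraction), giving every
// (table, embedding thread) pair a private dataset so concurrent RecvAcl
// calls never share one. Minimal sketch with a stand-in dataset type:
#include <string>
#include <unordered_map>

struct DatasetStub {};  // stand-in for the real acltdtDataset

using DatasetMap =
    std::unordered_map<std::string, std::unordered_map<int, DatasetStub*>>;

DatasetStub* DatasetFor(DatasetMap& aclDatasets, const std::string& embName, int threadId)
{
    // After Init fills the map, each embedding thread only reads its own
    // slot, so no extra locking is needed on this path.
    return aclDatasets[embName][threadId];
}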
HDTransfer() = default; @@ -87,7 +93,8 @@ namespace MxRec { vector Recv(TransferChannel channel, int channelId, const string& embName); - size_t RecvAcl(TransferChannel channel, int channelId, const string& embName); + size_t RecvAcl(TransferChannel channel, int channelId, const string& embName, + int embeddingThreadId, int batchId); void Destroy(); diff --git a/src/core/host_emb/host_emb.cpp b/src/core/host_emb/host_emb.cpp deleted file mode 100644 index ce0e0a78..00000000 --- a/src/core/host_emb/host_emb.cpp +++ /dev/null @@ -1,278 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. -==============================================================================*/ - -#include "host_emb.h" -#include -#include "hd_transfer/hd_transfer.h" -#include "checkpoint/checkpoint.h" -#include "initializer/initializer.h" -#include "utils/time_cost.h" - -using namespace MxRec; -using namespace std; -using namespace chrono; - -/// 初始化host emb -/// \param embInfos 表信息列表 -/// \param seed 随机种子 -/// \return -void HostEmb::Initialize(const vector& embInfos, int seed) -{ - for (const auto& embInfo: embInfos) { - HostEmbTable hostEmb; - hostEmb.hostEmbInfo = embInfo; - EmbDataGenerator(embInfo.initializeInfos, seed, static_cast(embInfo.hostVocabSize), - embInfo.extEmbeddingSize, hostEmb.embData); - hostEmbs[embInfo.name] = move(hostEmb); - LOG_INFO(HOSTEMB + "HostEmb Initialize End"); - } -} - -/// 根据指定的初始化器对emb进行初始化 -/// \param initializeInfos emb初始化信息列表 -/// \param seed 随机种子 -/// \param vocabSize host表大小 -/// \param embeddingSize emb维度 -/// \param embData emb数据 -void HostEmb::EmbDataGenerator(const vector &initializeInfos, int seed, int vocabSize, - int embeddingSize, vector> &embData) const -{ -#ifndef GTEST - LOG_INFO(HOSTEMB + "GenerateEmbData Start, seed:{}, initializer num: {}", seed, initializeInfos.size()); - embData.clear(); - embData.resize(vocabSize, vector(embeddingSize)); - - for (auto initializeInfo: initializeInfos) { - LOG_INFO("Device GenerateEmbData ing. 
name {}", initializeInfo.name); - for (int i = 0; i < vocabSize; i++) { - initializeInfo.initializer->GenerateData(embData.at(i).data(), embeddingSize); - } - } - LOG_INFO(HOSTEMB + "GenerateEmbData End, seed:{}", seed); -#endif -} - -/// 停止用于异步更新D2H emb的线程 -/// \param channelId 通道索引(训练/推理) -void HostEmb::Join(int channelId) -{ - TimeCost tc = TimeCost(); - switch (channelId) { - case TRAIN_CHANNEL_ID: - LOG_DEBUG(HOSTEMB + "start join, channelId:{}, procThreadsForTrain num:{}", - channelId, procThreadsForTrain.size()); - for (auto& t: procThreadsForTrain) { - t->join(); - } - procThreadsForTrain.clear(); - LOG_DEBUG(HOSTEMB + "end join, channelId:{}, cost:{}ms", channelId, tc.ElapsedMS()); - break; - case EVAL_CHANNEL_ID: - LOG_DEBUG(HOSTEMB + "start join, channelId:{}, procThreadsForEval num:{}", - channelId, procThreadsForEval.size()); - for (auto& t: procThreadsForEval) { - t->join(); - } - procThreadsForEval.clear(); - LOG_DEBUG(HOSTEMB + "end join, channelId:{}, cost:{}ms", channelId, tc.ElapsedMS()); - break; - default: - throw invalid_argument("channelId not in [TRAIN_CHANNEL_ID, EVAL_CHANNEL_ID]"); - } -} - -#ifndef GTEST -/// 从hdTransfer获取device侧返回的emb信息,并在host侧表的对应位置插入。 -/// missingKeysHostPos为host侧需要发送的emb的位置,也就是淘汰的emb的插入位置 -/// \param missingKeysHostPos 当前batch在host上需要换出的偏移 -/// \param channelId 通道索引(训练/推理) -/// \param embName 表名 -void HostEmb::UpdateEmb(const vector& missingKeysHostPos, int channelId, const string& embName) -{ - LOG_INFO(HOSTEMB + "UpdateEmb, channelId:{}, embName:{}", channelId, embName); - EASY_FUNCTION(profiler::colors::Purple); - TimeCost tc = TimeCost(); - auto hdTransfer = Singleton::GetInstance(); - TransferChannel transferName = TransferChannel::D2H; - LOG_INFO(HOSTEMB + "wait D2H embs, channelId:{}", channelId); - const auto tensors = hdTransfer->Recv(transferName, channelId, embName); - if (tensors.empty()) { - LOG_WARN(HOSTEMB + "recv empty data"); - return; - } - const Tensor& d2hEmb = tensors[0]; - EASY_BLOCK("Update") - const float* tensorPtr = d2hEmb.flat().data(); - auto embeddingSize = hostEmbs[embName].hostEmbInfo.extEmbeddingSize; - auto& embData = hostEmbs[embName].embData; - - LOG_DEBUG(HOSTEMB + "embName:{}, UpdateEmb missingKeys len = {}, embeddingSize = {}, " - "embData.size = {} {}", embName, missingKeysHostPos.size(), embeddingSize, embData.size(), tensorPtr); - -#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ - shared(missingKeysHostPos, tensorPtr, embData, embeddingSize) - for (size_t i = 0; i < missingKeysHostPos.size(); i++) { - auto& dst = embData[missingKeysHostPos[i]]; -#pragma omp simd - for (int j = 0; j < embeddingSize; j++) { - dst[j] = tensorPtr[j + embeddingSize * i]; - } - } - LOG_INFO(HOSTEMB + "update emb end cost: {}ms", tc.ElapsedMS()); - EASY_END_BLOCK -} - -/// 用从device获取的数据更新host的emb(使用aclTDT原生接口) -/// \param missingKeysHostPos 当前batch在host上需要换出的偏移 -/// \param channelId 通道索引(训练/推理) -/// \param embName 表名 -void HostEmb::UpdateEmbV2(const vector& missingKeysHostPos, int channelId, const string& embName) -{ - LOG_INFO(HOSTEMB + "UpdateEmbV2, channelId:{}, embName:{}", channelId, embName); - EASY_FUNCTION(profiler::colors::Purple) - auto updateThread = - [this, missingKeysHostPos, channelId, embName] { - auto hdTransfer = Singleton::GetInstance(); - TransferChannel transferName = TransferChannel::D2H; - LOG_INFO(HOSTEMB + "wait D2H embs, channelId:{}", channelId); - auto size = hdTransfer->RecvAcl(transferName, channelId, embName); - if (size == 0) { - LOG_WARN(HOSTEMB + "recv empty data"); - 
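// The HostEmb::UpdateEmbV2 body being deleted here boils down to this
// scatter-copy: the device returns the swapped-out rows as one flat float
// buffer, and each row is copied back into the host table at its recorded
// position. Illustrative sketch only (names assumed); the real code obtained
// the buffer and its sizes through the acltdt accessors shown above.
#include <vector>

void ScatterRows(const float* src, const std::vector<size_t>& hostPos,
                 std::vector<std::vector<float>>& embData, int embeddingSize)
{
#pragma omp parallel for  // mirrors the deleted omp pragma; optional here
    for (long i = 0; i < static_cast<long>(hostPos.size()); ++i) {
        std::vector<float>& dst = embData[hostPos[i]];
        for (int j = 0; j < embeddingSize; ++j) {
            dst[j] = src[static_cast<size_t>(i) * embeddingSize + j];
        }
    }
}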
return; - } - TimeCost tc = TimeCost(); - - EASY_BLOCK("Update") - auto& embData = hostEmbs[embName].embData; - auto embeddingSize = hostEmbs[embName].hostEmbInfo.extEmbeddingSize; - auto aclData = acltdtGetDataItem(hdTransfer->aclDatasets[embName], 0); - if (aclData == nullptr) { - throw runtime_error("Acl get tensor data from dataset failed."); - } - float* ptr = static_cast(acltdtGetDataAddrFromItem(aclData)); - if (ptr == nullptr || missingKeysHostPos.size() == 0) { - return; - } - size_t elementSize = acltdtGetDataSizeFromItem(aclData); - size_t dimNum = acltdtGetDimNumFromItem(aclData); - LOG_DEBUG(HOSTEMB + "embName:{}, UpdateEmb missingKeys len = {}, embeddingSize = {}," - " embData.size = {}, RecvAcl = {}, elementSize = {}, dimNum = {}", - embName, missingKeysHostPos.size(), embeddingSize, embData.size(), size, elementSize, dimNum); -#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) shared(ptr, embData, embeddingSize) - for (size_t j = 0; j < missingKeysHostPos.size(); j++) { - auto& dst = embData[missingKeysHostPos[j]]; -#pragma omp simd - for (int k = 0; k < embeddingSize; k++) { - dst[k] = ptr[k + embeddingSize * j]; - } - } - LOG_INFO(HOSTEMB + "update emb end cost: {}ms", tc.ElapsedMS()); - }; - - switch (channelId) { - case TRAIN_CHANNEL_ID: - procThreadsForTrain.emplace_back(make_unique(updateThread)); - break; - case EVAL_CHANNEL_ID: - procThreadsForEval.emplace_back(make_unique(updateThread)); - break; - default: - throw invalid_argument("channelId not in [TRAIN_CHANNEL_ID, EVAL_CHANNEL_ID]"); - } -} - -/// 查找host侧需要发送给device的emb数据。 -/// \param missingKeysHostPos 当前batch在host上需要换出的偏移 -/// \param embName -/// \param h2dEmbOut -void HostEmb::GetH2DEmb(const vector& missingKeysHostPos, const string& embName, - vector& h2dEmbOut) -{ - EASY_FUNCTION() - TimeCost tc = TimeCost(); - const auto& emb = hostEmbs[embName]; - const int embeddingSize = emb.hostEmbInfo.extEmbeddingSize; - h2dEmbOut.emplace_back(Tensor(tensorflow::DT_FLOAT, { - int(missingKeysHostPos.size()), embeddingSize - })); - auto& tmpTensor = h2dEmbOut.back(); - auto tmpData = tmpTensor.flat(); -#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) shared(missingKeysHostPos, emb, tmpData) - for (size_t i = 0; i < missingKeysHostPos.size(); ++i) { - const auto& src = emb.embData[missingKeysHostPos[i]]; -#pragma omp simd - for (int j = 0; j < embeddingSize; j++) { - tmpData(j + i * embeddingSize) = src[j]; - } - } - LOG_INFO("GetH2DEmb end, missingKeys count:{} cost:{}ms", missingKeysHostPos.size(), tc.ElapsedMS()); -} - -/// 获取hostEmbs的指针 -/// \return -auto HostEmb::GetHostEmbs() -> absl::flat_hash_map* -{ - return &hostEmbs; -} - -/// 对指定offset的emb进行初始化 -/// \param initializeInfos emb初始化信息列表 -/// \param embData emb数据 -/// \param offset 偏移列表 -void HostEmb::EmbPartGenerator(const vector &initializeInfos, vector> &embData, - const vector& offset) const -{ - for (auto initializeInfo: initializeInfos) { - LOG_INFO("Device GenerateEmbData ing. name {}", initializeInfo.name); - for (size_t i = 0; i < offset.size(); ++i) { - initializeInfo.initializer->GenerateData(embData.at(offset.at(i)).data(), - static_cast(embData[0].size())); - } - } -} - -void HostEmb::EmbPartGenerator(const vector &initializeInfos, vector> &embData, - const vector& offset) const -{ - for (auto initializeInfo: initializeInfos) { - LOG_INFO("Device GenerateEmbData ing. 
name {}", initializeInfo.name); - for (size_t i = 0; i < offset.size(); ++i) { - initializeInfo.initializer->GenerateData(embData.at(offset.at(i)).data(), - static_cast(embData[0].size())); - } - } -} -#endif - -/// 利用initializer初始化emb淘汰的位置 -/// \param embName 表名 -/// \param offset 淘汰的偏移列表 -void HostEmb::EvictInitEmb(const string& embName, const vector& offset) -{ -#ifndef GTEST - auto& hostEmb = GetEmb(embName); - EmbPartGenerator(hostEmb.hostEmbInfo.initializeInfos, hostEmb.embData, offset); - LOG_INFO(HOSTEMB + "ddr EvictInitEmb!host embName {}, init offsets size: {}", embName, offset.size()); -#endif -} - -void HostEmb::EvictInitEmb(const string& embName, const vector& offset) -{ -#ifndef GTEST - auto& hostEmb = GetEmb(embName); - EmbPartGenerator(hostEmb.hostEmbInfo.initializeInfos, hostEmb.embData, offset); - LOG_INFO(HOSTEMB + "ddr EvictInitEmb!host embName {}, init offsets size: {}", embName, offset.size()); -#endif -} \ No newline at end of file diff --git a/src/core/host_emb/host_emb.h b/src/core/host_emb/host_emb.h deleted file mode 100644 index a9ff3786..00000000 --- a/src/core/host_emb/host_emb.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. -==============================================================================*/ - -#ifndef MX_REC_HOSTEMB_H -#define MX_REC_HOSTEMB_H - -#include -#include -#include -#include -#include "absl/container/flat_hash_map.h" -#include "utils/common.h" -#include "utils/singleton.h" -#include "tensorflow/core/framework/tensor.h" - -namespace MxRec { - using namespace std; - using namespace tensorflow; - - class HostEmb { - public: - HostEmb() = default; - - ~HostEmb() - {}; - - void Initialize(const vector& embInfos, int seed); - - void Join(int channelId); - - void UpdateEmb(const vector& missingKeysHostPos, int channelId, const string& embName); - - void UpdateEmbV2(const vector& missingKeysHostPos, int channelId, const string& embName); - - void GetH2DEmb(const vector& missingKeysHostPos, const string& embName, - vector& h2dEmbOut); - auto GetHostEmbs() -> absl::flat_hash_map*; - - void EvictInitEmb(const string& embName, const vector& offset); - - void EvictInitEmb(const string& embName, const vector& offset); - - HostEmbTable& GetEmb(const string& embName) - { - return hostEmbs.at(embName); - } - - GTEST_PRIVATE: - absl::flat_hash_map hostEmbs; - - std::vector> procThreadsForTrain; - std::vector> procThreadsForEval; - - void EmbDataGenerator(const vector& initializeInfos, int seed, int vocabSize, int embeddingSize, - vector>& embData) const; - void EmbPartGenerator(const vector &initializeInfos, vector> &embData, - const vector& offset) const; - - void EmbPartGenerator(const vector &initializeInfos, vector> &embData, - const vector& offset) const; - }; -} - -#endif // MX_REC_HOSTEMB_H diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 78621829..123b2c79 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ 
b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -20,6 +20,7 @@ See the License for the specific language governing permissions and #include #include #include +#include #include "hd_transfer/hd_transfer.h" #include "hybrid_mgmt/hybrid_mgmt_block.h" @@ -30,12 +31,11 @@ See the License for the specific language governing permissions and #include "key_process/key_process.h" #include "key_process/feature_admit_and_evict.h" #include "emb_table/embedding_mgmt.h" -#include "emb_table/embedding_ddr.h" using namespace MxRec; using namespace std; - +using namespace ock::ctr; /// Openmpi通信域进程数设置、计算所有表host特征数量总数、设置训练模式(HBM/DDR) /// \param rankInfo @@ -89,8 +89,13 @@ bool HybridMgmt::Initialize(RankInfo rankInfo, const vector& embInfos, return true; } + // create factory for fastUnique and embeddingCache + int result = ock::ctr::Factory::Create(factory); + if (result != 0) { + throw runtime_error(Logger::Format("create fast factory failed, error code:{}", result)); + } + InitRankInfo(rankInfo, embInfos); - EmbeddingMgmt::Instance()->Init(rankInfo, embInfos, thresholdValues, seed); GlogConfig::gStatOn = GlobalEnv::statOn; LOG_INFO(MGMT + "begin initialize, localRankSize:{}, localRankId:{}, rank:{}", @@ -110,25 +115,17 @@ bool HybridMgmt::Initialize(RankInfo rankInfo, const vector& embInfos, KEY_PROCESS_INSTANCE->Initialize(rankInfo, embInfos, thresholdValues, seed); isRunning = true; + isSSDEnabled = rankInfo.isSSDEnabled; + EmbeddingMgmt::Instance()->Init(rankInfo, embInfos, seed); - // DDR模式,初始化hashmap和host emb if (rankInfo.isDDR) { - hostEmbs = Singleton::GetInstance(); - hostHashMaps = make_unique(); - hostEmbs->Initialize(embInfos, seed); - hostHashMaps->Init(rankInfo, embInfos, ifLoad); + InitEmbeddingCache(embInfos); } - // 非断点续训模式,启动数据传输 - isSSDEnabled = rankInfo.isSSDEnabled; if (isSSDEnabled) { cacheManager = Singleton::GetInstance(); - cacheManager->Init(hostEmbs, mgmtEmbInfo); - hostHashMaps->isSSDEnabled = this->isSSDEnabled; - hostHashMaps->cacheManager = this->cacheManager; - // 启用SSD时,EmbeddingDDR依赖cacheManager - EmbeddingMgmt::Instance()->EnableSSD(); - EmbeddingMgmt::Instance()->SetCacheManagerForEmbTable(this->cacheManager); + cacheManager->Init(embCache, mgmtEmbInfo); + EmbeddingMgmt::Instance()->SetCacheManagerForEmbTable(cacheManager); } isLoad = ifLoad; if (!isLoad) { @@ -136,99 +133,24 @@ bool HybridMgmt::Initialize(RankInfo rankInfo, const vector& embInfos, } for (const auto& info: embInfos) { - LOG_INFO(MGMT + "emb[{}] vocab size {}+{} sc:{}", + LOG_INFO(MGMT + "table:{}, vocab size dev+host:{}+{}, send count:{}", info.name, info.devVocabSize, info.hostVocabSize, info.sendCount); } - LOG_INFO(MGMT + "end initialize, isDDR:{}, maxStep:[{}, {}], rank:{}", rankInfo.isDDR, - rankInfo.ctrlSteps.at(TRAIN_CHANNEL_ID), rankInfo.ctrlSteps.at(EVAL_CHANNEL_ID), rankInfo.rankId); + LOG_INFO(MGMT + "end initialize, rankId:{}, isDDR:{}, " + "step[train_interval, eval_interval, save_interval, max_train_step]:[{}, {}, {}, {}]", + rankInfo.rankId, rankInfo.isDDR, + rankInfo.ctrlSteps.at(TRAIN_CHANNEL_ID), rankInfo.ctrlSteps.at(EVAL_CHANNEL_ID), + rankInfo.ctrlSteps.at(SAVE_STEP_INDEX), rankInfo.ctrlSteps.at(MAX_TRAIN_STEP_INDEX)); #endif isInitialized = true; return true; } -// 比较hostHashMap和cacheManager的数据是否一致 -void HybridMgmt::AddCacheManagerTraceLog(CkptData& saveData) -{ - if (Logger::GetLevel() != Logger::TRACE) { - return; - } - auto& embHashMaps = saveData.embHashMaps; - auto& ddrKeyFreqMap = saveData.ddrKeyFreqMaps; - for (auto& it : embHashMaps) { - string embTableName = it.first; - auto& 
hostMap = EmbeddingMgmt::Instance()->GetTable(embTableName)->keyOffsetMap; - auto& devSize = it.second.devVocabSize; - auto& lfu = ddrKeyFreqMap[embTableName]; - size_t tableKeyInDdr = 0; - for (const auto& item : hostMap) { - if (item.second < devSize) { - continue; - } - ++tableKeyInDdr; - auto cuKey = item.first; - if (lfu.find(cuKey) == lfu.end()) { - LOG_ERROR("save step error, ddr key:{}, not exist in lfu, hostHashMap offset:", - cuKey, item.second); - } - } - LOG_INFO("save step end, table:{}, tableKeyInDdr:{}, tableKeyInLfu:{}", - embTableName, tableKeyInDdr, lfu.size()); - } -} - -/// 保存CacheManager时恢复数据(与恢复hostHashMap类似,仅恢复保存数据,不修改源数据) -/// \param saveData 保存数据 -void HybridMgmt::RestoreFreq4Save(CkptData& saveData) const -{ - // 仅在差异1步时执行恢复操作 - int checkResult = hybridMgmtBlock->CheckSaveEmbMapValid(); - if (checkResult != 1) { - return; - } - auto& ddrKeyFreqMaps = saveData.ddrKeyFreqMaps; - auto& excludeDDRKeyFreqMaps = saveData.excludeDDRKeyFreqMaps; - - for (const auto& it : saveData.embHashMaps) { - auto& embTableName = it.first; - auto& embHashMap = it.second; - vector hbm2DdrKeys; - vector ddr2HbmKeys; - LOG_INFO("restore freq info for save step, table:{}, embHashMap.oldSwap size:{}", - embTableName, embHashMap.oldSwap.size()); - LOG_INFO("before, ddr key table size:{}, exclude ddr key table size:{}", - ddrKeyFreqMaps[embTableName].size(), excludeDDRKeyFreqMaps[embTableName].size()); - for (const auto& swapKeys : embHashMap.oldSwap) { - hbm2DdrKeys.emplace_back(swapKeys.second); - ddr2HbmKeys.emplace_back(swapKeys.first); - } - int hbm2DdrKeysNotInExcludeMapCount = 0; - int ddr2HbmKeysNotInDDRMapCount = 0; - for (auto& key : hbm2DdrKeys) { - if (excludeDDRKeyFreqMaps[embTableName].find(key) == excludeDDRKeyFreqMaps[embTableName].end()) { - ++hbm2DdrKeysNotInExcludeMapCount; - } - ddrKeyFreqMaps[embTableName][key] = excludeDDRKeyFreqMaps[embTableName][key]; - excludeDDRKeyFreqMaps[embTableName].erase(key); - } - for (auto& key : ddr2HbmKeys) { - if (ddrKeyFreqMaps[embTableName].find(key) == ddrKeyFreqMaps[embTableName].end()) { - ++ddr2HbmKeysNotInDDRMapCount; - } - excludeDDRKeyFreqMaps[embTableName][key] = ddrKeyFreqMaps[embTableName][key]; - ddrKeyFreqMaps[embTableName].erase(key); - } - LOG_INFO("hbm2DdrKeysNotInExcludeMapCount:{}, ddr2HbmKeysNotInDDRMapCount:{}", - hbm2DdrKeysNotInExcludeMapCount, ddr2HbmKeysNotInDDRMapCount); - LOG_INFO("after, ddr key table size:{}, exclude ddr key table size:{}", - ddrKeyFreqMaps[embTableName].size(), excludeDDRKeyFreqMaps[embTableName].size()); - } -} - /// 保存模型 /// \param savePath 保存路径 /// \return -bool HybridMgmt::Save(const string savePath) +void HybridMgmt::Save(const string& savePath) { #ifndef GTEST if (!isInitialized) { @@ -242,22 +164,17 @@ bool HybridMgmt::Save(const string savePath) Checkpoint saveCkpt; saveData.keyCountMap = KEY_PROCESS_INSTANCE->GetKeyCountMap(); - EmbeddingMgmt::Instance()->LockSave(); // acquire lock here to prevent HybridMgmt modify keyOffsetMap EmbeddingMgmt::Instance()->Save(savePath); - offsetMapToSend = EmbeddingMgmt::Instance()->GetDeviceOffsets(); + if (!mgmtRankInfo.isDDR) { + // hbm模式只保存必要的offset对应的内容 + offsetMapToSend = EmbeddingMgmt::Instance()->GetDeviceOffsets(); + } if (isSSDEnabled) { - LOG_DEBUG(MGMT + "Start host side save: ssd mode hashmap"); - for (auto& it : cacheManager->ddrKeyFreqMap) { - saveData.ddrKeyFreqMaps[it.first] = it.second.GetFreqTable(); - } - saveData.excludeDDRKeyFreqMaps = cacheManager->excludeDDRKeyCountMap; - RestoreFreq4Save(saveData); - 
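// Conceptual sketch of what the deleted RestoreFreq4Save did: for the save
// snapshot only, undo the most recent HBM<->DDR swap by moving each swapped
// key's count back to the map it came from (pair = {ddr2HbmKey, hbm2DdrKey},
// matching how embHashMap.oldSwap is unpacked above). Types and names are
// illustrative, not the project's own.
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

using FreqMap = std::unordered_map<int64_t, uint64_t>;

void UndoLastSwapForSave(const std::vector<std::pair<int64_t, int64_t>>& oldSwap,
                         FreqMap& ddrKeyFreq, FreqMap& excludeDdrKeyFreq)
{
    for (const auto& [ddr2HbmKey, hbm2DdrKey] : oldSwap) {
        // key that just left HBM for DDR: its count belongs in the DDR map again
        ddrKeyFreq[hbm2DdrKey] = excludeDdrKeyFreq[hbm2DdrKey];
        excludeDdrKeyFreq.erase(hbm2DdrKey);
        // key that just left DDR for HBM: opposite move
        excludeDdrKeyFreq[ddr2HbmKey] = ddrKeyFreq[ddr2HbmKey];
        ddrKeyFreq.erase(ddr2HbmKey);
    }
}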
AddCacheManagerTraceLog(saveData); + LOG_DEBUG(MGMT + "start save SSD data"); auto step = GetStepFromPath(savePath); cacheManager->SaveSSDEngine(step); } - EmbeddingMgmt::Instance()->UnLockSave(); // 保存特征准入淘汰相关的数据 FeatureAdmitAndEvict& featAdmitNEvict = KEY_PROCESS_INSTANCE->GetFeatAdmitAndEvict(); @@ -272,8 +189,9 @@ bool HybridMgmt::Save(const string savePath) saveCkpt.SaveModel(savePath, saveData, mgmtRankInfo, mgmtEmbInfo); // 数据处理线程释放锁 KEY_PROCESS_INSTANCE->LoadSaveUnlock(); + hybridMgmtBlock->FinishSave(); + cvCheckSave.notify_all(); #endif - return true; } /// 加载模型 @@ -297,26 +215,23 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) SetFeatureTypeForLoad(loadFeatures); if (warmStartTables.size() == 0) { - EmbeddingMgmt::Instance()->Load(loadPath); + EmbeddingMgmt::Instance()->Load(loadPath, trainKeysSet); } else { for (auto& tableName: warmStartTables) { - EmbeddingMgmt::Instance()->Load(tableName, loadPath); + EmbeddingMgmt::Instance()->Load(tableName, loadPath, trainKeysSet); } } - loadOffsetToSend = EmbeddingMgmt::Instance()->GetLoadOffsets(); + if (!mgmtRankInfo.isDDR) { + // hbm模式只保存必要的offset对应的内容 + loadOffsetToSend = EmbeddingMgmt::Instance()->GetLoadOffsets(); + } // 执行加载操作 loadCkpt.LoadModel(loadPath, loadData, mgmtRankInfo, mgmtEmbInfo, loadFeatures); KEY_PROCESS_INSTANCE->LoadKeyCountMap(loadData.keyCountMap); - if (mgmtRankInfo.isDDR) { - // DDR模式 将加载的hash map进行赋值 - LOG_DEBUG(MGMT + "Start host side load: ddr mode hashmap"); - auto GetEmbHashMaps = EmbeddingMgmt::Instance()->GetEmbHashMaps(); - LOG_DEBUG(MGMT + "over over Start host side load: ddr mode hashmap"); - hostHashMaps->LoadHashMap(GetEmbHashMaps); - } else { + if (!mgmtRankInfo.isDDR) { // HBM模式 将加载的最大偏移(真正使用了多少vocab容量)、特征到偏移的映射,进行赋值 LOG_DEBUG(MGMT + "Start host side load: no ddr mode hashmap"); auto keyOffsetMap = EmbeddingMgmt::Instance()->GetKeyOffsetMap(); @@ -336,13 +251,7 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) if (isSSDEnabled) { LOG_DEBUG(MGMT + "Start host side load: ssd key freq map"); auto step = GetStepFromPath(loadPath); - cacheManager->Load(loadData.ddrKeyFreqMaps, loadData.excludeDDRKeyFreqMaps, - step, mgmtRankInfo.rankSize, mgmtRankInfo.rankId); - for (auto info: mgmtEmbInfo) { - auto tb = EmbeddingMgmt::Instance()->GetTable(info.name); - auto tbCast = reinterpret_pointer_cast(tb); - tbCast->RefreshFreqInfoAfterLoad(); - } + cacheManager->Load(mgmtEmbInfo, step, trainKeysSet); } LOG_DEBUG(MGMT + "Finish host side load process"); @@ -368,10 +277,6 @@ void HybridMgmt::SetFeatureTypeForLoad(vector& loadFeatures) if (featAdmitNEvict.GetFunctionSwitch()) { loadFeatures.push_back(CkptFeatureType::FEAT_ADMIT_N_EVICT); } - - if (isSSDEnabled) { - loadFeatures.push_back(CkptFeatureType::DDR_KEY_FREQ_MAP); - } } /// 获取key对应的offset,python侧调用 @@ -444,76 +349,6 @@ void HybridMgmt::ReceiveHostMap(AllKeyOffsetMapT receiveKeyOffsetMap) #endif } -/// 对加载的数据和训练配置进行一致性校验 -/// \param loadHostEmbs -/// \param setupHostEmbs -/// \param embTableCount -/// \return -bool HybridMgmt::IsLoadDataMatches(const EmbMemT& loadHostEmbs, - const EmbInfo& setupHostEmbs, - size_t& embTableCount) const -{ - bool loadDataMatches = { true }; - const auto& loadEmbTable { loadHostEmbs.find(setupHostEmbs.name) }; - if (loadEmbTable != loadHostEmbs.end()) { - embTableCount++; - - const auto& loadEmbInfo { loadEmbTable->second.hostEmbInfo }; - if (setupHostEmbs.sendCount != loadEmbInfo.sendCount) { - LOG_ERROR(MGMT + "Load data sendCount {} for table {} does not match setup sendCount 
{}", - setupHostEmbs.sendCount, setupHostEmbs.name, loadEmbInfo.sendCount); - loadDataMatches = false; - } - if (setupHostEmbs.extEmbeddingSize != loadEmbInfo.extEmbeddingSize) { - LOG_ERROR(MGMT + "Load data extEmbeddingSize {} for table {} does not match setup extEmbeddingSize {}", - setupHostEmbs.extEmbeddingSize, setupHostEmbs.name, loadEmbInfo.extEmbeddingSize); - loadDataMatches = false; - } - if (setupHostEmbs.devVocabSize != loadEmbInfo.devVocabSize) { - LOG_ERROR(MGMT + "Load data devVocabSize {} for table {} does not match setup devVocabSize {}", - setupHostEmbs.devVocabSize, setupHostEmbs.name, loadEmbInfo.devVocabSize); - loadDataMatches = false; - } - if (setupHostEmbs.hostVocabSize != loadEmbInfo.hostVocabSize) { - LOG_ERROR(MGMT + "Load data hostVocabSize {} for table {} does not match setup hostVocabSize {}", - setupHostEmbs.hostVocabSize, setupHostEmbs.name, loadEmbInfo.hostVocabSize); - loadDataMatches = false; - } - if (!loadDataMatches) { - return false; - } - } else { - LOG_ERROR(MGMT + "Load data does not contain table with table name: {}", setupHostEmbs.name); - return false; - } - return true; -} - -/// 对DDR模式保存的模型和训练配置进行一致性校验 -/// \param loadData -/// \return 是否一致 -bool HybridMgmt::LoadMatchesDDRSetup(const CkptData& loadData) -{ - size_t embTableCount { 0 }; - auto loadHostEmbs { loadData.hostEmbs }; - if (loadHostEmbs == nullptr) { - LOG_ERROR(MGMT + "Host Embedding of load checkpoint data is nullptr!"); - return false; - } - for (EmbInfo setupHostEmbs : mgmtEmbInfo) { - if (!IsLoadDataMatches(*loadHostEmbs, setupHostEmbs, embTableCount)) { - return false; - } - } - - if (embTableCount < loadHostEmbs->size()) { - LOG_ERROR(MGMT + "Load data has {} tables more than setup table num {}", - loadHostEmbs->size(), embTableCount); - return false; - } - return true; -} - /// 根据HBM/DDR模式,启动数据处理线程 void HybridMgmt::Start() { @@ -558,6 +393,12 @@ void HybridMgmt::StartThreadForDDR() LOG_INFO("parseKeysTaskForEval done"); }; procThreads.emplace_back(std::make_unique(parseKeysTaskForEval)); + + auto embeddingProcessTask = [this]() { + EmbeddingTask(); + LOG_INFO("embeddingProcessTask done"); + }; + procThreads.emplace_back(std::make_unique(embeddingProcessTask)); #endif } @@ -574,6 +415,16 @@ void HybridMgmt::Destroy() // 先发送停止信号mgmt,先停止新lookup查询, 解除queue的限制防止卡住 isRunning = false; + mutexDestroy = true; + for (const auto& embInfo: mgmtEmbInfo) { + for (int index = 0; index < EMBEDDING_THREAD_NUM; index++) { + cvLastUpdateFinishMap[embInfo.name][index].notify_all(); + cvLastLookUpFinishMap[embInfo.name][index].notify_all(); + cvLastSendFinishMap[embInfo.name][index].notify_all(); + cvLastRecvFinishMap[embInfo.name][index].notify_all(); + } + } + { // 获取锁 避免KeyProcess中手动发送结束信息时通道关闭 std::unique_lock lockGuard(KEY_PROCESS_INSTANCE->destroyMutex); @@ -591,22 +442,18 @@ void HybridMgmt::Destroy() if (cacheManager != nullptr) { cacheManager = nullptr; } - if (hostEmbs != nullptr) { - hostEmbs->Join(TRAIN_CHANNEL_ID); - hostEmbs->Join(EVAL_CHANNEL_ID); - hostEmbs = nullptr; - } + JoinEmbeddingCacheThread(); procThreads.clear(); // 停止预处理 KEY_PROCESS_INSTANCE->Destroy(); LOG_DEBUG(MGMT + "Destroy hybrid_mgmt module end."); -}; +} -#ifndef GTEST /// 启动hybrid处理任务 /// \param type void HybridMgmt::TrainTask(TaskType type) { +#ifndef GTEST int channelId = TRAIN_CHANNEL_ID; int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[channelId]; do { @@ -619,19 +466,9 @@ void HybridMgmt::TrainTask(TaskType type) } LOG_INFO(HYBRID_BLOCKING + "hybrid start task channel {} batch {}", channelId, 
theTrainBatchId); - switch (type) { - case TaskType::HBM: - ParseKeysHBM(TRAIN_CHANNEL_ID, theTrainBatchId); - LOG_INFO(MGMT + "ParseKeysHBMBatchId = {}", theTrainBatchId); - break; - case TaskType::DDR: - ParseKeys(TRAIN_CHANNEL_ID, theTrainBatchId); - LOG_INFO(MGMT + "parseKeysBatchId = {}", theTrainBatchId); - break; - default: - throw std::invalid_argument("Invalid TaskType Type."); - } + ParseKeys(TRAIN_CHANNEL_ID, theTrainBatchId, type); } while (true); +#endif } /// 推理数据处理:数据处理状态正常,处理的batch数小于用户预设值或者设为-1时,会循环处理; @@ -639,11 +476,20 @@ void HybridMgmt::TrainTask(TaskType type) /// \return void HybridMgmt::EvalTask(TaskType type) { +#ifndef GTEST int channelId = EVAL_CHANNEL_ID; int& evalBatchId = hybridMgmtBlock->hybridBatchId[channelId]; do { hybridMgmtBlock->CheckAndSetBlock(channelId); if (hybridMgmtBlock->GetBlockStatus(channelId)) { + LOG_DEBUG("eval channel block at batchId:{}, needWaitSave:{}", + evalBatchId, hybridMgmtBlock->IsNeedWaitSave()); + std::unique_lock checkSaveLocker(saveMutex); + cvCheckSave.wait(checkSaveLocker, [this] { + return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; + }); + hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); + LOG_DEBUG("wake TrainTask"); hybridMgmtBlock->DoBlock(channelId); } if (!isRunning) { @@ -651,326 +497,250 @@ void HybridMgmt::EvalTask(TaskType type) } LOG_INFO(HYBRID_BLOCKING + "hybrid start task channel {} batch {}", channelId, evalBatchId); - switch (type) { - case TaskType::HBM: - ParseKeysHBM(EVAL_CHANNEL_ID, evalBatchId); - LOG_INFO(MGMT + "HBM evalBatchId = {}", evalBatchId); - break; - case TaskType::DDR: - ParseKeys(EVAL_CHANNEL_ID, evalBatchId); - LOG_INFO(MGMT + "DDR evalBatchId = {}", evalBatchId); - break; - default: - throw std::invalid_argument("Invalid TaskType Type."); - } + ParseKeys(EVAL_CHANNEL_ID, evalBatchId, type); } while (true); +#endif } -/// HBM模式下,发送key process线程已处理好的各类型向量到指定通道中 -/// \param channelId 通道索引(训练/推理) -/// \param batchId 已处理的batch数 -/// \return -bool HybridMgmt::ParseKeysHBM(int channelId, int& batchId) -{ - LOG_INFO(MGMT + "nBatch:{} channelId:{} batchId:{}, ParseKeys with HBM mode start.", - mgmtRankInfo.nBatch, channelId, batchId); - - // 循环处理每个表的数据 - for (const auto& embInfo: mgmtEmbInfo) { - TimeCost parseKeysTc; - // 获取各类向量,如果为空指针,退出当前函数 - auto infoVecs = KEY_PROCESS_INSTANCE->GetInfoVec(batchId, embInfo.name, channelId, ProcessedInfo::RESTORE); - if (infoVecs == nullptr) { - LOG_INFO(MGMT + "channelId:{} batchId:{}, ParseKeys infoVecs empty !", channelId, batchId); - return false; - } - LOG_DEBUG("channelId:{} batchId:{}, ParseKeysHBM GetInfoVec end.", channelId, batchId); - // 动态shape场景下,获取all2all向量(通信量矩阵) - TimeCost sendTensorsSyncTC; - unique_ptr> all2all = nullptr; - if (!mgmtRankInfo.useStatic) { - TimeCost getTensorsSyncTC; - all2all = KEY_PROCESS_INSTANCE->GetInfoVec(batchId, embInfo.name, channelId, ProcessedInfo::ALL2ALL); - LOG_DEBUG("channelId:{} batchId:{}, getTensorsSyncTC(ms):{}", - channelId, batchId, getTensorsSyncTC.ElapsedMS()); - if (all2all == nullptr) { - LOG_ERROR("Information vector is nullptr!"); - return false; - } - sendTensorsSyncTC = TimeCost(); // 重新初始化,不计算getTensors耗时 - TimeCost sendAll2AllScSyncTC; - hdTransfer->Send(TransferChannel::ALL2ALL, *all2all, channelId, embInfo.name); - LOG_DEBUG("channelId:{} batchId:{}, sendAll2AllScSyncTC(ms):{}", - channelId, batchId, sendAll2AllScSyncTC.ElapsedMS()); - } - - // 发送查询向量 - TimeCost sendLookupSyncTC; - hdTransfer->Send(TransferChannel::LOOKUP, { infoVecs->back() }, channelId, embInfo.name); - infoVecs->pop_back(); - 
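// Sketch of the back()/pop_back() convention these senders rely on:
// KeyProcess packs the tensor that must be sent first at the *end* of
// infoVecs, so each send peels it off the back before the remainder goes out
// as the restore vectors. Stand-in Tensor type; illustrative only.
#include <utility>
#include <vector>

using Tensor = std::vector<int>;  // stand-in for tensorflow::Tensor

Tensor TakeBack(std::vector<Tensor>& infoVecs)
{
    Tensor t = std::move(infoVecs.back());  // last-packed tensor, sent first
    infoVecs.pop_back();
    return t;
}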
LOG_DEBUG("channelId:{} batchId:{}, sendLookupSyncTC(ms):{}", channelId, batchId, sendLookupSyncTC.ElapsedMS()); - - // 训练时,使用全局去重聚合梯度,发送全局去重的key和对应的恢复向量 - if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID) { - SendUniqKeysAndRestoreVecHBM(channelId, batchId, embInfo, infoVecs); - } - - // 发送恢复向量 - TimeCost sendRestoreSyncTC; - hdTransfer->Send(TransferChannel::RESTORE, *infoVecs, channelId, embInfo.name); - LOG_DEBUG("sendRestoreSyncTC(ms):{}, sendTensorsSyncTC(ms):{}, parseKeysTc HBM mode (ms):{}", - sendRestoreSyncTC.ElapsedMS(), sendTensorsSyncTC.ElapsedMS(), parseKeysTc.ElapsedMS()); - LOG_INFO(MGMT + "channelId:{} batchId:{}, embName:{}, ParseKeys with HBM mode end.", - channelId, batchId, embInfo.name); - } - batchId++; - return true; -} - -void HybridMgmt::SendUniqKeysAndRestoreVecHBM(int channelId, int &batchId, const EmbInfo &embInfo, - const unique_ptr> &infoVecs) const +void HybridMgmt::SendUniqKeysAndRestoreVecHBM(const EmbBaseInfo &info, + const unique_ptr> &infoVecs, bool isGrad) const { TimeCost sendUniqueKeysSyncTC; LOG_DEBUG("channelId:{} batchId:{}, global unique, table name: {}, is grad: {}", - channelId, batchId, embInfo.name, embInfo.isGrad); - if (embInfo.isGrad) { - hdTransfer->Send(TransferChannel::UNIQKEYS, {infoVecs->back()}, channelId, embInfo.name); + info.channelId, info.batchId, info.name, isGrad); + if (isGrad) { + hdTransfer->Send(TransferChannel::UNIQKEYS, {infoVecs->back()}, info.channelId, info.name); } infoVecs->pop_back(); LOG_DEBUG("channelId:{} batchId:{}, sendUniqueKeysSyncTC(ms):{}", - channelId, batchId, sendUniqueKeysSyncTC.ElapsedMS()); + info.channelId, info.batchId, sendUniqueKeysSyncTC.ElapsedMS()); TimeCost sendUniqueRestoreVecSyncTC; - if (embInfo.isGrad) { - hdTransfer->Send(TransferChannel::RESTORE_SECOND, {infoVecs->back()}, channelId, embInfo.name); + if (isGrad) { + hdTransfer->Send(TransferChannel::RESTORE_SECOND, {infoVecs->back()}, info.channelId, info.name); } infoVecs->pop_back(); LOG_DEBUG("channelId:{} batchId:{}, sendUniqueRestoreVecSyncTC(ms):{}", - channelId, batchId, sendUniqueRestoreVecSyncTC.ElapsedMS()); + info.channelId, info.batchId, sendUniqueRestoreVecSyncTC.ElapsedMS()); } -#endif -/// 当前处理的batch是否是最后一个batch +/// 当前处理的batch是否是最后一个batch,涵盖train切换eval、save场景 /// \param batchId 已处理的batch数 -/// \param channelId 通道索引(训练/推理) /// \return -bool HybridMgmt::EndBatch(int batchId, int channelId) const +bool HybridMgmt::IsTrainEndBatch(int batchId) const +{ + // case 1:需要切eval + // case 2:需要save时,补发pos后被阻塞,等待save完成,避免embCache状态发送变化 + // batchId是从0开始的,所以要+1对上step + bool isNeedSwitchToEval = mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID] != -1 && + (batchId + 1) % mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID] == 0; + bool isNeedSave = mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] != -1 && + mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] != 0 && + (batchId + 1) % mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] == 0; + LOG_DEBUG("mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID]:{}, batchId:{}", + mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID], batchId); + LOG_DEBUG("isNeedSwitchToEval:{}, isNeedSave:{}", isNeedSwitchToEval, isNeedSave); + return isNeedSwitchToEval || isNeedSave; +} + +bool HybridMgmt::IsEvalEndBatch(int batchId) const { - return (batchId % mgmtRankInfo.ctrlSteps[channelId] == 0 && mgmtRankInfo.ctrlSteps[channelId] != -1); + // batchId是从0开始的,所以要+1对上step,表示当前step之后要结束eval了 + return (batchId + 1) == hybridMgmtBlock->stepsInterval[EVAL_CHANNEL_ID]; } /// DDR模式下,发送key process线程已处理好的各类型向量到指定通道中 /// \param channelId 通道索引(训练/推理) /// \param batchId 
已处理的batch数 /// \return -bool HybridMgmt::ParseKeys(int channelId, int& batchId) +bool HybridMgmt::ParseKeys(int channelId, int& batchId, TaskType type) { #ifndef GTEST LOG_INFO(MGMT + "channelId:{} batchId:{}, DDR mode, ParseKeys start.", channelId, batchId); TimeCost parseKeyTC; - int start = batchId; bool remainBatch = true; // 是否从通道获取了数据 + vector parseKeyThreadPool; for (const auto& embInfo : mgmtEmbInfo) { - ProcessEmbInfo(embInfo.name, batchId, channelId, remainBatch); - // 通道数据已空 - if (!remainBatch) { - LOG_DEBUG("last batch ending"); - return false; + EmbBaseInfo info = {.batchId=batchId, .channelId=channelId, .name=embInfo.name}; + switch (type) { + case TaskType::HBM: + parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { + ProcessEmbInfoHBM(info, remainBatch, embInfo.isGrad); + }); + break; + case TaskType::DDR: + if (!isSSDEnabled) { + parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { + ProcessEmbInfoDDR(info, remainBatch); + }); + } else { + parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { + ProcessEmbInfoSSD(info, remainBatch); + }); + } + break; + default: + throw std::invalid_argument("Invalid TaskType Type."); } } - batchId++; + for (auto& t : parseKeyThreadPool) { + t.join(); + } + // 通道数据已空 + if (!remainBatch) { + LOG_DEBUG("last batch ending"); + return false; + } if (!isRunning) { return false; } - EmbHDTransWrap(channelId, batchId - 1, start); LOG_DEBUG(MGMT + "channelId:{} batchId:{}, ParseKeys end, parseKeyTC(ms):{}", channelId, batchId, parseKeyTC.ElapsedMS()); + batchId++; #endif return true; } -void HybridMgmt::HandlePrepareDDRDataRet(TransferRet prepareSSDRet) const +void HybridMgmt::ProcessEmbInfoHBM(const EmbBaseInfo &info, bool& remainBatchOut, bool isGrad) { - LOG_ERROR("Transfer embedding with DDR and SSD error."); - if (prepareSSDRet == TransferRet::SSD_SPACE_NOT_ENOUGH) { - LOG_ERROR("PrepareDDRData: SSD available space is not enough."); - throw runtime_error("ssdVocabSize too small"); + TimeCost parseKeysTc; + LOG_DEBUG("ProcessEmbInfoHBM table:{}, batchId:{}, channel:{}", info.name, info.batchId, info.channelId); + + // 获取各类向量,如果为空指针,退出当前函数 + bool isEos = false; + auto infoVecs = KEY_PROCESS_INSTANCE->GetInfoVec(info, ProcessedInfo::RESTORE, isEos); + if (isEos) { + HandleEosCaseHBM(info.name, info.batchId, info.channelId, remainBatchOut); + return; + } + if (infoVecs == nullptr) { + LOG_INFO(MGMT + "table:{}, channelId:{} batchId:{}, ParseKeys infoVecs empty !", + info.name, info.channelId, info.batchId); + remainBatchOut = false; + return; + } + LOG_DEBUG("table:{}, channelId:{} batchId:{}, ParseKeysHBM GetInfoVec end", + info.name, info.channelId, info.batchId); + + // 动态shape场景下,获取all2all向量(通信量矩阵) + SendAll2AllVec(info, remainBatchOut); + if (!remainBatchOut) { + return; + } + + // 发送查询向量 + TimeCost sendLookupSyncTC; + hdTransfer->Send(TransferChannel::LOOKUP, { infoVecs->back() }, info.channelId, info.name); + infoVecs->pop_back(); + LOG_DEBUG("table:{}, channelId:{} batchId:{}, sendLookupSyncTC(ms):{}", + info.name, info.channelId, info.batchId, sendLookupSyncTC.ElapsedMS()); + + // 训练时,使用全局去重聚合梯度,发送全局去重的key和对应的恢复向量 + if (mgmtRankInfo.useSumSameIdGradients && info.channelId == TRAIN_CHANNEL_ID) { + SendUniqKeysAndRestoreVecHBM(info, infoVecs, isGrad); } - if (prepareSSDRet == TransferRet::DDR_SPACE_NOT_ENOUGH) { - LOG_ERROR("PrepareDDRData: DDR available space is not enough."); - throw runtime_error("ddrVocabSize too small"); + + // 发送恢复向量 + TimeCost sendRestoreSyncTC; + 
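// The reworked ParseKeys above fans out one worker per embedding table
// (parseKeyThreadPool) and joins them all before batchId advances. A
// distilled sketch of that fan-out/join pattern, with illustrative names:
#include <functional>
#include <string>
#include <thread>
#include <vector>

void ForEachTableParallel(const std::vector<std::string>& tables,
                          const std::function<void(const std::string&)>& work)
{
    std::vector<std::thread> pool;
    pool.reserve(tables.size());
    for (const auto& name : tables) {
        pool.emplace_back([&work, name] { work(name); });  // one worker per table
    }
    for (auto& t : pool) {
        t.join();  // only move to the next batch once every table is done
    }
}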
hdTransfer->Send(TransferChannel::RESTORE, *infoVecs, info.channelId, info.name); + LOG_DEBUG("table:{}, sendRestoreSyncTC(ms):{}, parseKeysTc HBM mode (ms):{}", + info.name, sendRestoreSyncTC.ElapsedMS(), parseKeysTc.ElapsedMS()); + + LOG_INFO(MGMT + "table:{}, channelId:{} batchId:{}, embName:{}, ParseKeys with HBM mode end.", + info.name, info.channelId, info.batchId, info.name); + + if (info.channelId == TRAIN_CHANNEL_ID) { + alreadyTrainOnce = true; } - throw runtime_error("Transfer embedding with DDR and SSD error."); } -#ifndef GTEST /// 构造训练所需的各种向量数据 /// \param embName 表名 /// \param batchId 已处理的batch数 /// \param channelId 通道索引(训练/推理) /// \param remainBatchOut 是否从通道获取了数据 -/// \return HBM是否还有剩余空间 -bool HybridMgmt::ProcessEmbInfo(const std::string& embName, int batchId, int channelId, bool& remainBatchOut) +void HybridMgmt::ProcessEmbInfoDDR(const EmbBaseInfo& info, bool& remainBatchOut) { +#ifndef GTEST TimeCost getAndSendTensorsTC; - TimeCost getTensorsTC; + LOG_DEBUG("ProcessEmbInfoDDR start, table:{}, channel:{}, batchId:{}", info.name, info.channelId, info.batchId); - if (hostHashMaps->embHashMaps.find(embName) == hostHashMaps->embHashMaps.end()) { - LOG_ERROR("Failed to get embedding hash map with given name: {}", embName); - return false; + if (info.channelId == TRAIN_CHANNEL_ID && info.batchId == hybridMgmtBlock->maxTrainStep) { + HandleReachMaxStepCase(info, remainBatchOut); + return; } - auto& embHashMap = hostHashMaps->embHashMaps.at(embName); - // 计数初始化 - std::shared_ptr table = EmbeddingMgmt::Instance()->GetTable(embName); - table->SetStartCount(); - - // 获取查询向量 - auto lookupKeys = KEY_PROCESS_INSTANCE->GetLookupKeys(batchId, embName, channelId); - if (lookupKeys.empty()) { - remainBatchOut = false; - LOG_WARN("channelId:{} batchId:{}, embName:{}, GetLookupKeys result is empty.", channelId, batchId, embName); - return false; - } - LOG_DEBUG("channelId:{} batchId:{}, embName:{}, GetLookupKeys end.", channelId, batchId, embName); - // 获取各类向量,如果为空指针,退出当前函数 - unique_ptr> infoVecs = KEY_PROCESS_INSTANCE->GetInfoVec(batchId, embName, channelId, - ProcessedInfo::RESTORE); - if (infoVecs == nullptr) { - LOG_ERROR("Information vector is nullptr!"); - return false; + // 只有在每次GetUniqueKeys的时候才知道上游是否已经EOS + // 注意GetUniqueKeys与EOS关联,需要在ProcessEmbInfoDDR最先调用,如需调整位置,请参考并适配其他函数 + // 获取GlobalUnique向量 + auto uniqueKeys = GetUniqueKeys(info, remainBatchOut); + if (uniqueKeys.empty()) { + return; } - LOG_DEBUG("channelId:{} batchId:{}, GetInfoVec end, getTensorsTC(ms):{}", - channelId, batchId, getTensorsTC.ElapsedMS()); - - TimeCost sendRestoreSyncTC; - hdTransfer->Send(TransferChannel::RESTORE, *infoVecs, channelId, embName); - LOG_DEBUG("channelId:{} batchId:{}, send restore end, sendRestoreSyncTC(ms):{}", - channelId, batchId, sendRestoreSyncTC.ElapsedMS()); - - // 调用SSD cache缓存处理流程,获取锁避免保存时修改keyOffsetMap - table->mutSave_.lock(); - LOG_DEBUG("acquire save lock, table:{}", table->name); - PrepareDDRData(table, lookupKeys, channelId, batchId); - // 计算查询向量;记录需要被换出的HBM偏移 - vector tmpData; - vector offsetsOut; - DDRParam ddrParam(tmpData, offsetsOut); - TimeCost hostHashMapProcessTC; - - hostHashMaps->Process(embName, lookupKeys, ddrParam, channelId); - table->mutSave_.unlock(); - LOG_DEBUG("release save lock, table:{}", table->name); - - LOG_DEBUG("channelId:{} batchId:{}, hostHashMapProcessTC(ms):{}", - channelId, batchId, hostHashMapProcessTC.ElapsedMS()); - - if (mgmtRankInfo.useSumSameIdGradients && channelId == TRAIN_CHANNEL_ID && remainBatchOut) { - 
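// GlobalUnique-style dedup, sketched: produce the unique key list plus a
// restore vector mapping every original position to its slot in the unique
// list, so gradients for the same id can be summed once on device (the pair
// later shipped over UNIQKEYS and RESTORE_SECOND). Stand-alone illustrative
// version; the real GlobalUnique lives in KeyProcess.
#include <cstdint>
#include <unordered_map>
#include <vector>

void GlobalUniqueSketch(const std::vector<int64_t>& keys,
                        std::vector<int64_t>& uniqueKeys,
                        std::vector<int32_t>& restoreVec)
{
    std::unordered_map<int64_t, int32_t> firstSlot;
    firstSlot.reserve(keys.size());
    uniqueKeys.clear();
    restoreVec.resize(keys.size());
    for (size_t i = 0; i < keys.size(); ++i) {
        auto [it, inserted] =
            firstSlot.try_emplace(keys[i], static_cast<int32_t>(uniqueKeys.size()));
        if (inserted) {
            uniqueKeys.push_back(keys[i]);  // first occurrence of this id
        }
        restoreVec[i] = it->second;         // slot of the id in uniqueKeys
    }
}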
SendUniqKeysAndRestoreVecDDR(embName, batchId, channelId, ddrParam); + // 获取GlobalUnique对应的restoreVectorSec + auto restoreVecSec = GetRestoreVecSec(info, remainBatchOut); + if (restoreVecSec.empty()) { + return; } - TimeCost sendTensorsTC; - hdTransfer->Send(TransferChannel::LOOKUP, { ddrParam.tmpDataOut.front() }, channelId, embName); - ddrParam.tmpDataOut.erase(ddrParam.tmpDataOut.cbegin()); - hdTransfer->Send(TransferChannel::SWAP, ddrParam.tmpDataOut, channelId, embName); - if (!mgmtRankInfo.useStatic) { - unique_ptr> all2all = KEY_PROCESS_INSTANCE->GetInfoVec(batchId, embName, - channelId, ProcessedInfo::ALL2ALL); - if (all2all == nullptr) { - LOG_ERROR("Information vector is nullptr!"); - return false; - } - hdTransfer->Send(TransferChannel::ALL2ALL, *all2all, channelId, embName); + SendAll2AllVec(info, remainBatchOut); + if (!remainBatchOut) { + return; } - LOG_DEBUG("channelId:{} batchId:{}, ProcessEmbInfo end, sendTensorsTC(ms):{}, getAndSendTensorsTC(ms):{}", - channelId, batchId, sendTensorsTC.ElapsedMS(), getAndSendTensorsTC.ElapsedMS()); - if (!isSSDEnabled && embHashMap.HasFree(lookupKeys.size())) { // check free > next one batch - LOG_WARN(MGMT + "channelId:{} batchId:{}, embName:{}, freeSize not enough:{}", - channelId, batchId, embName, lookupKeys.size()); - return false; + SendRestoreVec(info, remainBatchOut); + if (!remainBatchOut) { + return; } - return true; -} -void HybridMgmt::SendUniqKeysAndRestoreVecDDR(const string &embName, int &batchId, int &channelId, DDRParam &ddrParam) -{ - LOG_DEBUG("channelId:{} batchId:{}, embName:{}, SendUniqKeysAndRestoreVecDDR start.", channelId, batchId, embName); - vector uniqueKeys; - vector restoreVecSec; - KEY_PROCESS_INSTANCE->GlobalUnique(ddrParam.offsetsOut, uniqueKeys, restoreVecSec); + std::pair, vector> swapInKoPair; + std::pair, vector> swapOutKoPair; + GetSwapPairsAndKey2Offset(info, uniqueKeys, swapInKoPair, swapOutKoPair); - TimeCost sendUniqueKeysSyncTC; - hdTransfer->Send(TransferChannel::UNIQKEYS, {mgmtRankInfo.useDynamicExpansion ? 
Vec2TensorI64(uniqueKeys) : - Vec2TensorI32(uniqueKeys) }, channelId, embName); - LOG_DEBUG("channelId:{} batchId:{}, sendUniqueKeysSyncTC(ms):{}", - channelId, batchId, sendUniqueKeysSyncTC.ElapsedMS()); + SendLookupOffsets(info, uniqueKeys, restoreVecSec); - TimeCost sendRestoreVecSecSyncTC; - hdTransfer->Send(TransferChannel::RESTORE_SECOND, {Vec2TensorI32(restoreVecSec) }, channelId, embName); - LOG_DEBUG("channelId:{} batchId:{}, sendRestoreVecSecSyncTC(ms):{}", - channelId, batchId, sendRestoreVecSecSyncTC.ElapsedMS()); -} + SendGlobalUniqueVec(info, uniqueKeys, restoreVecSec); -/// 发送H2D和接收D2H向量 -/// \param channelId 通道索引(训练/推理) -/// \param batchId 已处理的batch数 -/// \param start -void HybridMgmt::EmbHDTransWrap(int channelId, const int& batchId, int start) -{ - LOG_INFO(MGMT + "start:{} channelId:{} batchId:{}, EmbHDTransWrap start.", start, channelId, batchId); - TimeCost embHDTransWrapTC; - TimeCost hostEmbsTC; - hostEmbs->Join(channelId); - LOG_DEBUG("channelId:{} batchId:{}, hostEmbs Join end, hostEmbsTC(ms):{}", - channelId, batchId, hostEmbsTC.ElapsedMS()); - if (!isRunning) { + auto isNeedReturn = HandleSpecialProcessStatusDDR(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); + if (isNeedReturn) { return; } - EmbHDTrans(channelId, batchId); - LOG_DEBUG("channelId:{} batchId:{}, EmbHDTransWrap end, embHDTransWrapTC(ms):{}", - channelId, batchId, embHDTransWrapTC.ElapsedMS()); -} -/// 发送H2D和接收D2H向量,并更新host emb -/// \param channelId 通道索引(训练/推理) -/// \param batchId 已处理的batch数 -void HybridMgmt::EmbHDTrans(const int channelId, const int batchId) -{ - EASY_FUNCTION(profiler::colors::Blue) - EASY_VALUE("mgmtProcess", batchId) - LOG_DEBUG(MGMT + "channelId:{} batchId:{}, EmbHDTrans start.", channelId, batchId); - TimeCost h2dTC; - // 发送host需要换出的emb - for (const auto& embInfo: mgmtEmbInfo) { - const auto& missingKeys = EmbeddingMgmt::Instance()->GetMissingKeys(embInfo.name); - vector h2dEmb; - hostEmbs->GetH2DEmb(missingKeys, embInfo.name, h2dEmb); // order! - hdTransfer->Send(TransferChannel::H2D, h2dEmb, channelId, embInfo.name, batchId); + TimeCost swapProcessTC; + EnqueueSwapInfo(info, swapInKoPair, swapOutKoPair); + + auto &swapInPos = swapInKoPair.second; + auto &swapOutPos = swapOutKoPair.second; + auto lastSwapInPos = lastSwapInPosMap[info.name]; + lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 + + // 下发swaptensor + if (info.batchId != 0) { + SendTensorForSwap(info, lastSwapInPos, swapOutPos); } - LOG_DEBUG("channelId:{} batchId:{}, EmbHDTrans h2d end, h2dTC(ms):{}", channelId, batchId, h2dTC.ElapsedMS()); - TimeCost d2hTC; - // 接收device换出的emb,并更新到host上 - for (const auto& embInfo: mgmtEmbInfo) { - const auto& missingKeys = EmbeddingMgmt::Instance()->GetMissingKeys(embInfo.name); - hostEmbs->UpdateEmbV2(missingKeys, channelId, embInfo.name); // order! - EmbeddingMgmt::Instance()->ClearMissingKeys(embInfo.name); + HandleEndBatchCase(info, swapInPos); + + if (info.channelId == TRAIN_CHANNEL_ID) { + alreadyTrainOnce = true; } - LOG_DEBUG("channelId:{} batchId:{}, EmbHDTrans d2h end, d2hTC(ms):{}", channelId, batchId, d2hTC.ElapsedMS()); -} + + LOG_DEBUG("ProcessEmbInfoDDR end, table:{}, channel:{}, batchId:{} swapProcessTC(ms):{} getAndSendTensorsTC(ms):{}", + info.name, info.channelId, info.batchId, swapProcessTC.ElapsedMS(), getAndSendTensorsTC.ElapsedMS()); #endif +} /// hook通过时间或者step数触发淘汰 /// \return bool HybridMgmt::Evict() { #ifndef GTEST + std::lock_guard lk(evictMut); if (!isInitialized) { throw runtime_error("HybridMgmt not initialized. 
Call Initialize first."); } @@ -1001,8 +771,15 @@ bool HybridMgmt::Evict() } } else { if (GlobalEnv::useCombineFaae) { - for (auto& map : hostHashMaps->embHashMaps) { - EmbeddingMgmt::Instance()->EvictKeys(map.first, evictKeyMap[COMBINE_HISTORY_NAME]); + vector allTableNames; + int retCode = embCache->GetEmbTableNames(allTableNames); + if (retCode != H_OK) { + LOG_ERROR("GetEmbTableNames failed!"); + return false; + } + for (const string& embName : allTableNames) { + EvictKeys(embName, evictKeyMap[COMBINE_HISTORY_NAME]); + EvictSSDKeys(embName, evictKeyMap[COMBINE_HISTORY_NAME]); } } else { for (const auto& evict : as_const(evictKeyMap)) { @@ -1019,68 +796,24 @@ bool HybridMgmt::Evict() /// DDR模式下的淘汰:删除映射表、初始化host表、发送dev淘汰位置 /// \param embName /// \param keys -void HybridMgmt::EvictKeys(const string& embName, const vector& keys) -{ - std::shared_ptr table = EmbeddingMgmt::Instance()->GetTable(embName); - - table->EvictKeys(keys); - - const vector& evictOffsetDev = table->GetEvictedKeys(); - const vector& evictOffsetHost = table->GetHostEvictedKeys(); - - vector evictOffsetHostx(evictOffsetHost); - - size_t devVocabSize = table->GetDevVocabSize(); - for (int64_t& key: evictOffsetHostx) { - key -= static_cast(devVocabSize); - }; - - /* 淘汰Host侧 */ - if (!evictOffsetHost.empty()) { - hostEmbs->EvictInitEmb(embName, evictOffsetHost); - } - - vector tmpDataOut; - Tensor tmpData = Vec2TensorI32(evictOffsetDev); - tmpDataOut.emplace_back(tmpData); - tmpDataOut.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); - - auto evictLen = tmpDataOut.back().flat(); - auto evictSize = static_cast(evictOffsetDev.size()); - evictLen(0) = evictSize; - - hdTransfer->Send(TransferChannel::EVICT, tmpDataOut, TRAIN_CHANNEL_ID, embName); -} - -inline void HybridMgmt::PrepareDDRData(std::shared_ptr table, - const vector& keys, int channelId, int batchId) const +void HybridMgmt::EvictKeys(const string& embName, const vector& keys) { - if (!isSSDEnabled) { + if (keys.empty()) { return; } - LOG_DEBUG("channelId:{} batchId:{}, embTableName:{}, PrepareDDRData start.", channelId, batchId, table->name); - TimeCost prepareDDRDataTc; - TableInfo ti = table->GetTableInfo(); - TransferRet ret = cacheManager->TransferDDREmbWithSSD(ti, keys, channelId); - if (ret != TransferRet::TRANSFER_OK) { - HandlePrepareDDRDataRet(ret); + int retCode = embCache->RemoveEmbsByKeys(embName, keys); + if (retCode != H_OK) { + LOG_ERROR("RemoveEmbsByKeys failed!"); + return; } - LOG_DEBUG("channelId:{} batchId:{}, embTableName:{}, PrepareDDRData end, prepareDDRDataTc(ms):{}", - channelId, batchId, table->name, prepareDDRDataTc.ElapsedMS()); } -void HybridMgmt::EvictSSDKeys(const string& embName, const vector& keys) const +void HybridMgmt::EvictSSDKeys(const string& embName, const vector& keys) const { if (!isSSDEnabled) { return; } - vector ssdKeys; - for (auto& key : keys) { - if (cacheManager->IsKeyInSSD(embName, key)) { - ssdKeys.emplace_back(key); - } - } - cacheManager->EvictSSDEmbedding(embName, ssdKeys); + cacheManager->EvictSSDEmbedding(embName, keys); } int HybridMgmt::GetStepFromPath(const string& loadPath) const @@ -1134,19 +867,20 @@ void HybridMgmt::CountStepBySessionRun(int channelID, int steps) const /// \return 表使用大小 int64_t HybridMgmt::GetTableSize(const string& embName) const { + int64_t size = -1; #ifndef GTEST if (!isInitialized) { throw runtime_error("HybridMgmt not initialized. 
Call Initialize first."); } if (mgmtRankInfo.useDynamicExpansion) { - int64_t size = EmbeddingMgmt::Instance()->GetSize(embName); + size = EmbeddingMgmt::Instance()->GetSize(embName); LOG_INFO(MGMT + "dynamic expansion mode, get emb:[{}] size:{}", embName, size); return size; } if (!mgmtRankInfo.isDDR) { size_t maxOffset = EmbeddingMgmt::Instance()->GetMaxOffset(embName); - int64_t size = static_cast(maxOffset); + size = static_cast(maxOffset); LOG_INFO(MGMT + "HBM mode, get emb:[{}] size:{}", embName, size); return size; } @@ -1155,17 +889,11 @@ int64_t HybridMgmt::GetTableSize(const string& embName) const ssdSize = cacheManager->GetTableEmbeddingSize(embName); } - const auto& iter = hostHashMaps->embHashMaps.find(embName); - if (iter == hostHashMaps->embHashMaps.end()) { - LOG_ERROR(MGMT + "get maxOffset, wrong embName:{} ", embName); - return -1; - } - auto maxOffset = hostHashMaps->embHashMaps.at(embName).maxOffset; - int64_t size = static_cast(maxOffset) + ssdSize; - + uint32_t ddrSize = embCache->GetUsage(embName); + size = static_cast(ddrSize) + ssdSize; LOG_INFO(MGMT + "DDR/SSD mode, get emb:[{}] size:{}", embName, size); - return size; #endif + return size; } /// 获取table表容量大小 @@ -1184,8 +912,8 @@ int64_t HybridMgmt::GetTableCapacity(const string& embName) const return capacity; } LOG_WARN(MGMT + "no dynamic expansion mode, get emb:[{}] capacity failed", embName); - return -1; #endif + return -1; } /// 设置表的优化器信息 @@ -1199,3 +927,1281 @@ void HybridMgmt::SetOptimizerInfo(const string& embName, OptimizerInfo optimInfo } EmbeddingMgmt::Instance()->SetOptimizerInfo(embName, optimInfo); } + +void HybridMgmt::LookUpAddrs(const string &embName, int extEmbeddingSize) +{ + int id = 0; + uint64_t memSize = extEmbeddingSize * sizeof(float); + const std::string hbmSwapKeyQueName = "HBMSwapKeyQue"; + const std::string ddrSwapKeyQueName = "DDRSwapKeyQue"; + auto lookUpFunc = [this, memSize, embName, id]( + std::map>> &fromQue, + std::map>> &toQue, + const string &swapStr, const string &fromQueName + ) { + std::vector keys = fromQue[embName + swapStr].WaitAndPop(); + if (!isRunning) { + return; + } + std::vector addrs; + TimeCost lookupAddrsTC; + int rc = embCache->EmbeddingLookupAddrs(embName, keys, addrs); + if (rc != H_OK) { + lookupAddrSuccess = false; + LOG_ERROR("lookUpAddrs, table:{}, fromQue: {}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}", + embName, fromQueName, swapStr, keys.size(), addrs.size(), id); + throw runtime_error("EmbeddingLookupAddrs failed! error code:" + std::to_string(rc)); + } + if (&fromQue == &DDRSwapKeyQue && swapStr == SWAP_OUT_STR) { + for (auto &addr : addrs) { + auto *newAddr = (float*)malloc(memSize); + rc = memcpy_s(newAddr, memSize, addr, memSize); + if (rc != 0) { + lookupAddrSuccess = false; + throw runtime_error("memcpy_s failed! error code:" + std::to_string(rc)); + } + addr = newAddr; + } + rc = embCache->EmbeddingRemove(embName, keys); + if (rc != H_OK) { + lookupAddrSuccess = false; + throw runtime_error("EmbeddingRemove failed! 
error code:" + std::to_string(rc)); + } + } + LOG_DEBUG("table:{}, fromQue:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsTC(ms):{}", + embName, fromQueName, swapStr, keys.size(), addrs.size(), id, lookupAddrsTC.ElapsedMS()); + toQue[embName + swapStr].Pushv(addrs); + }; + while (isRunning && lookupAddrSuccess) { + lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_OUT_STR, ddrSwapKeyQueName); + lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_IN_STR, ddrSwapKeyQueName); + lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_IN_STR, hbmSwapKeyQueName); + lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_OUT_STR, hbmSwapKeyQueName); + id++; + lookUpSwapInAddrsPushId[embName]++; + } +} + +void HybridMgmt::LookUpSwapAddrs(const string &embName, const string &swapStr) +{ + int id = 0; + std::string swapName = embName + swapStr; + while (isRunning && lookupAddrSuccess) { + std::vector keys = HBMSwapKeyQue[swapName].WaitAndPop(); + if (!isRunning) { + return; + } + vector addrs; + TimeCost lookupAddrsTC; + int rc = embCache->EmbeddingLookupAddrs(embName, keys, addrs); + if (rc != H_OK) { + lookupAddrSuccess = false; + throw runtime_error("EmbeddingLookupAddrs failed! error code: " + std::to_string(rc)); + } + LOG_DEBUG( + "table:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsTC(ms):{}", + embName, swapStr, keys.size(), addrs.size(), id, lookupAddrsTC.ElapsedMS()); + tableToQueueLookup[swapName].Pushv(addrs); + if (swapStr==SWAP_IN_STR) { + lookUpSwapInAddrsPushId[embName]++; + LOG_DEBUG("LookUpSwapAddrs, table:{}, pushId:{}, lookUpSwapInAddrsPushId:{}", + embName, id, lookUpSwapInAddrsPushId[embName]); + } + id++; + } +} + +/// 导出npu的embedding +void HybridMgmt::FetchDeviceEmb() +{ + // 数据处理线程上锁 + KEY_PROCESS_INSTANCE->LoadSaveLock(); + + if (mgmtRankInfo.isDDR) { + // DDR模式保存host的emb表以及hashmap + LOG_DEBUG(MGMT + "start host side save: ddr mode"); + for (const auto &embInfo: mgmtEmbInfo) { + std::vector> koVec; + embCache->ExportDeviceKeyOffsetPairs(embInfo.name, koVec); + std::vector swapOutPos; + for (const auto &p : koVec) { + swapOutPos.push_back(p.second); + } + + vector swapTensor; + swapTensor.emplace_back(Vec2TensorI32(swapOutPos)); + swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, {1})); + auto swapOutLen = swapTensor.back().flat(); + swapOutLen(0) = swapOutPos.size(); + LOG_DEBUG(MGMT + "save swapOutPos size:{}", swapOutPos.size()); + // 发送SwapOutPos信息 + hdTransfer->Send(TransferChannel::SAVE_H2D, swapTensor, TRAIN_CHANNEL_ID, embInfo.name); + } + } + KEY_PROCESS_INSTANCE->LoadSaveUnlock(); +} + +// 这里就是新增的embedding处理线程 +void HybridMgmt::EmbeddingTask() +{ + for (const auto& embInfo: mgmtEmbInfo) { + lastUpdateFinishStepMap[embInfo.name] = 0; + lastLookUpFinishStepMap[embInfo.name] = 0; + lastSendFinishStepMap[embInfo.name] = 0; + lastRecvFinishStepMap[embInfo.name] = 0; + } + + TimeCost embHDTransTC; + MultiThreadEmbHDTransWrap(); + LOG_DEBUG("embHDTransTC(ms):{}", embHDTransTC.ElapsedMS()); +} + +void HybridMgmt::MultiThreadEmbHDTransWrap() +{ + for (int index = 0; index < EMBEDDING_THREAD_NUM; index++) { + for (const auto& embInfo: mgmtEmbInfo) { + CreateEmbeddingLookUpAndSendThread(index, embInfo); + CreateEmbeddingReceiveAndUpdateThread(index, embInfo); + } + } +} + +void HybridMgmt::EmbeddingLookUpAndSendDDR(int batchId, int index, const EmbInfo& embInfo) +{ + int cvNotifyIndex = 0; + if (index + 1 != EMBEDDING_THREAD_NUM) { + cvNotifyIndex = index + 1; + } + + EmbTaskInfo info = { + .batchId=batchId, + .threadIdx=index, + 
.cvNotifyIndex=cvNotifyIndex, + .extEmbeddingSize=embInfo.extEmbeddingSize, + .name=embInfo.name + }; + vector h2dEmb; + + auto isSuccess = EmbeddingLookUpDDR(info, h2dEmb); + if (!isSuccess) { + LOG_INFO("HybridMgmt is not running"); + return; + } + + EmbeddingSendDDR(info, h2dEmb); +} + +void HybridMgmt::EmbeddingReceiveAndUpdateDDR(int batchId, int index, const EmbInfo& embInfo) +{ + int cvNotifyIndex = 0; + if (index + 1 != EMBEDDING_THREAD_NUM) { + cvNotifyIndex = index + 1; + } + + EmbTaskInfo info = { + .batchId=batchId, + .threadIdx=index, + .cvNotifyIndex=cvNotifyIndex, + .extEmbeddingSize=embInfo.extEmbeddingSize, + .name=embInfo.name + }; + + float* ptr = nullptr; + vector swapOutAddrs; + auto isSuccess = EmbeddingReceiveDDR(info, ptr, swapOutAddrs); + if (!isSuccess) { + LOG_INFO("HybridMgmt is not running"); + return; + } + + EmbeddingUpdateDDR(info, ptr, swapOutAddrs); +} + +void HybridMgmt::EmbeddingLookUpAndSendSSD(int batchId, int index, const EmbInfo& embInfo) +{ + int cvNotifyIndex = 0; + if (index + 1 != EMBEDDING_THREAD_NUM) { + cvNotifyIndex = index + 1; + } + + EmbTaskInfo info = { + .batchId=batchId, + .threadIdx=index, + .cvNotifyIndex=cvNotifyIndex, + .extEmbeddingSize=embInfo.extEmbeddingSize, + .name=embInfo.name + }; + vector h2dEmb; + + auto isSuccess = EmbeddingLookUpSSD(info, h2dEmb); + if (!isSuccess) { + LOG_INFO("HybridMgmt is not running"); + return; + } + + EmbeddingSendSSD(info, h2dEmb); +} + +void HybridMgmt::EmbeddingReceiveAndUpdateSSD(int batchId, int index, const EmbInfo& embInfo) +{ + int cvNotifyIndex = 0; + if (index + 1 != EMBEDDING_THREAD_NUM) { + cvNotifyIndex = index + 1; + } + + EmbTaskInfo info = { + .batchId=batchId, + .threadIdx=index, + .cvNotifyIndex=cvNotifyIndex, + .extEmbeddingSize=embInfo.extEmbeddingSize, + .name=embInfo.name + }; + float* ptr = nullptr; + vector swapOutAddrs; + int64_t dims0 = 0; + EmbeddingReceiveSSD(info, ptr, swapOutAddrs, dims0); + + EmbeddingUpdateSSD(info, ptr, swapOutAddrs, dims0); +} + + +/// 构造训练所需的各种向量数据 +/// \param embName 表名 +/// \param batchId 已处理的batch数 +/// \param channelId 通道索引(训练/推理) +/// \param remainBatchOut 是否从通道获取了数据 +/// \return 是否处理成功 +void HybridMgmt::ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut) +{ +#ifndef GTEST + TimeCost getAndSendTensorsTC; + LOG_DEBUG("ProcessEmbInfoSSD table:{}, channel:{}, batchId:{}", info.name, info.channelId, info.batchId); + + if (info.channelId == TRAIN_CHANNEL_ID && info.batchId == hybridMgmtBlock->maxTrainStep) { + HandleReachMaxStepCase(info, remainBatchOut); + return; + } + + // 只有在每次GetUniqueKeys的时候才知道上游是否已经EOS + // 注意GetUniqueKeys与EOS关联,需要在ProcessEmbInfoSSD最先调用,如需调整位置,请参考并适配其他函数 + // 获取GlobalUnique向量 + auto uniqueKeys = GetUniqueKeys(info, remainBatchOut); + if (uniqueKeys.empty()) { + return; + } + + // 获取GlobalUnique对应的restoreVectorSec + auto restoreVecSec = GetRestoreVecSec(info, remainBatchOut); + if (restoreVecSec.empty()) { + return; + } + + SendAll2AllVec(info, remainBatchOut); + if (!remainBatchOut) { + return; + } + + SendRestoreVec(info, remainBatchOut); + if (!remainBatchOut) { + return; + } + + std::pair, vector> swapInKoPair; + std::pair, vector> swapOutKoPair; + GetSwapPairsAndKey2Offset(info, uniqueKeys, swapInKoPair, swapOutKoPair); + + SendLookupOffsets(info, uniqueKeys, restoreVecSec); + + SendGlobalUniqueVec(info, uniqueKeys, restoreVecSec); + + auto isNeedReturn = HandleSpecialProcessStatusSSD(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); + if (isNeedReturn) { + return; + } + + TimeCost swapProcessTC; + 
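
The receive/update workers defined above (EmbeddingReceiveAndUpdateDDR and its SSD twin) keep per-table batches in order through a ring of condition variables: the worker that owns batch b blocks until lastRecvFinishStepMap[name] equals b, does its work, increments the counter, and notifies cvNotifyIndex, i.e. the worker that owns batch b+1 (the real predicates also wake on mutexDestroy for shutdown). A minimal self-contained sketch of that hand-off, using one shared mutex for brevity where the patch shards mutexes per table and per thread; StepRing and every name in it are illustrative, not part of the patch:

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

// One table's step ring: batch b belongs to worker b % threadNum; each worker
// finishes its batch, bumps the shared step counter, then wakes the worker
// that owns the next batch (the role cvNotifyIndex plays above).
class StepRing {
public:
    explicit StepRing(int threadNum) : cv_(threadNum), threadNum_(threadNum) {}

    void RunStep(int batchId, int threadIdx)
    {
        const int notifyIdx = (threadIdx + 1) % threadNum_;   // cvNotifyIndex analogue
        std::unique_lock<std::mutex> lk(mut_);
        cv_[threadIdx].wait(lk, [&] { return lastFinishedStep_ == batchId; });
        std::printf("batch %d handled by worker %d\n", batchId, threadIdx);
        ++lastFinishedStep_;            // analogous to lastRecvFinishStepMap[name]++
        cv_[notifyIdx].notify_all();    // analogous to cvLastRecvFinishMap[name][cvNotifyIndex]
    }

private:
    std::mutex mut_;
    std::vector<std::condition_variable> cv_;
    int threadNum_;
    int lastFinishedStep_ = 0;
};

int main()
{
    constexpr int kThreads = 3;
    constexpr int kBatches = 9;
    StepRing ring(kThreads);
    std::vector<std::thread> pool;
    for (int t = 0; t < kThreads; ++t) {
        pool.emplace_back([&ring, t] {
            for (int b = t; b < kBatches; b += kThreads) {
                ring.RunStep(b, t);     // round-robin ownership, as in the thread pools above
            }
        });
    }
    for (auto& th : pool) {
        th.join();
    }
    return 0;
}

With three workers the batches print in strictly ascending order even though ownership is round-robin, which is exactly the guarantee the per-step maps provide.
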
auto &swapInKeys = swapInKoPair.first; + auto &swapInPos = swapInKoPair.second; + auto &swapOutKeys = swapOutKoPair.first; + auto &swapOutPos = swapOutKoPair.second; + + HandleDataSwapForSSD(info, swapInKeys, swapOutKeys); + + auto lastSwapInPos = lastSwapInPosMap[info.name]; + lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 + + // 下发swaptensor + if (info.batchId != 0) { + SendTensorForSwap(info, lastSwapInPos, swapOutPos); + } + + HandleEndBatchCase(info, swapInPos); + + CheckLookupAddrSuccessSSD(); + + if (info.channelId == TRAIN_CHANNEL_ID) { + alreadyTrainOnce = true; + } + + LOG_DEBUG("ProcessEmbInfoSSD end, table:{}, batchId:{}, swapProcessTC(ms):{}, getAndSendTensorsTC(ms):{}", + info.name, info.batchId, swapProcessTC.ElapsedMS(), getAndSendTensorsTC.ElapsedMS()); +#endif +} + +void HybridMgmt::SendTensorForSwap(const EmbBaseInfo& info, + const vector &swapInPosUint, + const vector &swapOutPosUint) +{ +#ifndef GTEST + vector swapTensor; + swapTensor.emplace_back(Vec2TensorI32(swapInPosUint)); + swapTensor.emplace_back(Vec2TensorI32(swapOutPosUint)); + swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); + auto swapInLen = swapTensor.back().flat(); + swapInLen(0) = swapInPosUint.size(); + swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); + auto swapOutLen = swapTensor.back().flat(); + swapOutLen(0) = swapOutPosUint.size(); + + hdTransfer->Send(TransferChannel::SWAP, swapTensor, info.channelId, info.name, info.batchId); +#endif +} + +void HybridMgmt::InitDataPipelineForDDR(const string &embName) +{ + // 初始化公共队列 + HBMSwapKeyQue[embName+SWAP_IN_STR]; + HBMSwapKeyQue[embName+SWAP_OUT_STR]; + tableToQueueLookup[embName+SWAP_IN_STR]; + tableToQueueLookup[embName+SWAP_OUT_STR]; + + // 初始化lookup线程 + lookUpSwapInAddrsPushId[embName]; // 此处初始化,避免多线程竞争导致计数错误 + lookUpSwapInAddrsThreads.emplace_back( + std::async(std::launch::async, [=] { LookUpSwapAddrs(embName, SWAP_IN_STR); })); + lookUpSwapOutAddrsThreads.emplace_back( + std::async(std::launch::async, [=] { LookUpSwapAddrs(embName, SWAP_OUT_STR); })); + + LOG_DEBUG("data pipeline for ddr init"); +} + +void HybridMgmt::InitDataPipelineForSSD(const string &embName, int extEmbeddingSize) +{ + // 初始化公共队列 + HBMSwapKeyQue[embName+SWAP_IN_STR]; + HBMSwapKeyQue[embName+SWAP_OUT_STR]; + tableToQueueLookup[embName+SWAP_IN_STR]; + tableToQueueLookup[embName+SWAP_OUT_STR]; + + HBMSwapKeyQue[embName + ADDR_STR]; + SwapOut2SSDKeyQue[embName + SWAP_IN_STR]; + SwapOut2SSDKeyQue[embName + ADDR_STR]; + SwapOut2SSDKeyQue[embName + SWAP_OUT_STR]; + + DDRSwapKeyQue[embName + SWAP_OUT_STR]; + DDRSwapKeyQue[embName + SWAP_IN_STR]; + DDRSwapKeyForSSDQue[embName + SWAP_OUT_STR]; + DDRSwapKeyForSSDQue[embName + SWAP_IN_STR]; + DDRSwapAddrsQue[embName + SWAP_OUT_STR]; + DDRSwapAddrsQue[embName + SWAP_IN_STR]; + + // 初始化lookup线程 + lookUpThreads.emplace_back( + std::async(std::launch::async, [=] { LookUpAddrs(embName, extEmbeddingSize); })); + LOG_DEBUG("data pipeline for ssd init"); +} + +void HybridMgmt::InitEmbeddingCache(const vector& embInfos) +{ + factory->SetExternalLogFuncInner(CTRLog); + factory->CreateEmbCacheManager(embCache); + EmbeddingMgmt::Instance()->SetEmbCacheForEmbTable(embCache); + EmbeddingMgmt::Instance()->SetHDTransferForEmbTable(hdTransfer); + + for (auto embInfo: embInfos) { + if (isSSDEnabled) { + InitDataPipelineForSSD(embInfo.name, embInfo.extEmbeddingSize); + } else { + InitDataPipelineForDDR(embInfo.name); + } + + specialProcessStatus[embInfo.name] = ProcessStatus::NORMAL; + + // 初始化embedding cache + 
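
InitDataPipelineForDDR and InitDataPipelineForSSD above pre-create blocking queues keyed by table name plus a direction suffix (default-constructing them through operator[], as the bare statements show) and start async threads that block in WaitAndPop. JoinEmbeddingCacheThread later calls DestroyQueue on every queue, after which a blocked WaitAndPop returns an empty vector; that is why the workers re-check isRunning after each pop. A rough stand-in for that queue contract, written under those assumptions rather than against the real TaskQueue:

#include <condition_variable>
#include <cstdio>
#include <map>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <vector>

// Sketch: WaitAndPop blocks until Pushv delivers a whole vector or the queue
// is destroyed, in which case it hands back an empty vector.
template <typename T>
class BlockingVecQueue {
public:
    std::vector<T> WaitAndPop()
    {
        std::unique_lock<std::mutex> lk(mut_);
        cv_.wait(lk, [&] { return !items_.empty() || destroyed_; });
        if (items_.empty()) {
            return {};  // destroyed: wake the consumer with nothing, caller checks isRunning
        }
        std::vector<T> out = std::move(items_.front());
        items_.pop();
        return out;
    }

    void Pushv(std::vector<T> v)
    {
        {
            std::lock_guard<std::mutex> lk(mut_);
            items_.push(std::move(v));
        }
        cv_.notify_one();
    }

    void DestroyQueue()
    {
        {
            std::lock_guard<std::mutex> lk(mut_);
            destroyed_ = true;
        }
        cv_.notify_all();
    }

private:
    std::mutex mut_;
    std::condition_variable cv_;
    std::queue<std::vector<T>> items_;
    bool destroyed_ = false;
};

int main()
{
    // queues keyed by table name + direction suffix, mirroring
    // HBMSwapKeyQue[embName + SWAP_IN_STR]; in InitDataPipelineForDDR
    std::map<std::string, BlockingVecQueue<long long>> swapKeyQue;
    auto& inQue = swapKeyQue["emb1_swap_in"];
    std::thread consumer([&] {
        auto keys = inQue.WaitAndPop();
        std::printf("popped %zu keys\n", keys.size());
    });
    inQue.Pushv({ 1, 2, 3 });
    consumer.join();
    inQue.DestroyQueue();
    return 0;
}
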
LOG_INFO("create cache for table:{}, hostVocabSize:{}, embSize:{}, maxCacheSize:{}", + embInfo.name, embInfo.hostVocabSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); + EmbCache::EmbCacheInfo embCacheInfo(embInfo.name, embInfo.hostVocabSize, embInfo.embeddingSize, + embInfo.extEmbeddingSize, embInfo.devVocabSize); + int ret = embCache->CreateCacheForTable( + embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, embInfo.hostVocabSize, EMBEDDING_THREAD_NUM); + if (ret != H_OK) { + throw runtime_error(embInfo.name + "create cache for table failed, error code: " + std::to_string(ret)); + } + } +} + +void HybridMgmt::JoinEmbeddingCacheThread() +{ + for (auto &p : tableToQueueLookup) { + p.second.DestroyQueue(); + } + for (auto &p : HBMSwapKeyQue) { + p.second.DestroyQueue(); + } + for (auto &p : SwapOut2SSDKeyQue) { + p.second.DestroyQueue(); + } + for (auto &p : DDRSwapKeyQue) { + p.second.DestroyQueue(); + } + for (auto &p : DDRSwapKeyForSSDQue) { + p.second.DestroyQueue(); + } + for (auto &p : DDRSwapAddrsQue) { + p.second.DestroyQueue(); + } + for (auto& t : EmbeddingLookUpAndSendThreadPool) { + t.join(); + } + for (auto& t : EmbeddingReceiveAndUpdateThreadPool) { + t.join(); + } + for (auto& t : lookUpThreads) { + t.wait(); + } + for (auto& t : lookUpSwapInAddrsThreads) { + t.wait(); + } + for (auto& t : lookUpSwapOutAddrsThreads) { + t.wait(); + } +} + +void HybridMgmt::HandleReachMaxStepCase(const EmbBaseInfo& info, bool& remainBatchOut) +{ + // 1. 如果没有切换过,即状态normal,就该send以结束step n-1 + // 2. 如果切换过: + // a. eval场景跑完,不用send,外面自然退出 + // b. save场景,能触发,说明期望的train step已经跑完(由IsTrainEndBatch判定send),当前step也不用send + LOG_DEBUG("table:{}, batchId:{}, ProcessStatus:{}, reach maxTrainStep", + info.name, info.batchId, ProcessStatus2Str(ProcessStatus::NORMAL)); + if (specialProcessStatus[info.name] == ProcessStatus::NORMAL) { + LOG_DEBUG("table:{}, batchId:{}, need send swap tensor" + " for last step to finish train", info.name, info.batchId); + std::vector emptySwapOutPos; + SendTensorForSwap(info, lastSwapInPosMap[info.name], emptySwapOutPos); + } else { + LOG_DEBUG("table:{}, batchId:{}, switch from eval or save, unnecessary to send emptySwapOutPos", + info.name, info.batchId); + } + remainBatchOut = false; + hybridMgmtBlock->SetBlockStatus(TRAIN_CHANNEL_ID, true); +} + +void HybridMgmt::HandleEosCase(const EmbBaseInfo& info, bool &remainBatchOut) +{ + LOG_INFO("GetUniqueKeys get eos, handle final batch for current epoch, table:{}, channel:{}, batchId:{}", + info.name, info.channelId, info.batchId); + bool sendAllChannel = false; + if (info.channelId == TRAIN_CHANNEL_ID) { + vector emptySwapOutPos; + SendTensorForSwap(info, lastSwapInPosMap[info.name], emptySwapOutPos); + LOG_INFO("GetUniqueKeys get eos, send pos for train channel, table:{}, batchId:{}", info.name, info.batchId); + KEY_PROCESS_INSTANCE->SendEos(info.name, info.batchId, info.channelId, sendAllChannel); + remainBatchOut = false; + return; + } + + if (!alreadyTrainOnce) { + // predict场景 + LOG_INFO("ProcessEmbInfoDDR first run in eval channel, assume as predict mode, start handle eos"); + std::vector emptySwapOutPos; + SendTensorForSwap(info, lastSwapInPosMap[info.name], emptySwapOutPos); + sendAllChannel = true; + } else { + hybridMgmtBlock->SetBlockStatus(EVAL_CHANNEL_ID, true); + LOG_INFO("GetUniqueKeys get eos from eval channel, SetBlockStatus=true"); + if (hybridMgmtBlock->IsNeedWaitSave()) { + // train+eval+save场景 + // 当前step n之后需要save,涉及save到train的状态切换。需要: + // 1. 补发pos以启动eval step n-1并完成。 + // 2. 
eval step n遇到eos结束 + // 3. 开始save,完成后唤醒train的ProcessEmbInfoDDR,所以需要在此之前改变specialProcessStatus + LOG_DEBUG("eval encounter eos and need save after this step" + "send pos change specialProcessStatus, current status:{}, modify to status:{}", + ProcessStatus2Str(specialProcessStatus[info.name]), + ProcessStatus2Str(ProcessStatus::AFTER_SWITCH_FIRST_BATCH)); + vector emptySwapOutPos; + SendTensorForSwap(info, lastSwapInPosMap[info.name], emptySwapOutPos); + specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_FIRST_BATCH; + } else { + // train+eval+train场景 + // 交给train的ProcessEmbInfoDDR启动最后n-1步eval + // train发送pos让eval step n-1跑完,到eval step n时各channel遇到eos后结束(train、eval共享的channel除外) + LOG_INFO("GetUniqueKeys get eos, skip send pos for eval channel, table:{}, batchId:{}", + info.name, info.batchId); + } + } + KEY_PROCESS_INSTANCE->SendEos(info.name, info.batchId, info.channelId, sendAllChannel); + remainBatchOut = false; +} + +bool HybridMgmt::EmbeddingReceiveDDR(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs) +{ + std::unique_lock lastRecvFinishLocker(lastRecvFinishMutexMap[info.name][info.threadIdx]); + cvLastRecvFinishMap[info.name][info.threadIdx].wait(lastRecvFinishLocker, [info, this] { + return (lastRecvFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + if (!isRunning) { + return false; + } + TimeCost EmbeddingRecvTC = TimeCost(); + + swapOutAddrs = tableToQueueLookup[info.name+SWAP_OUT_STR].WaitAndPop(); + if (!isRunning) { + return false; + } + // 等待图执行发送d2h embedding过来 + if (info.batchId != 0) { + TransferChannel transferName = TransferChannel::D2H; + auto size = hdTransfer->RecvAcl(transferName, TRAIN_CHANNEL_ID, info.name, info.threadIdx, info.batchId); + if (size == 0) { + LOG_WARN(HOSTEMB + "recv empty data"); + return false; + } + + auto aclData = acltdtGetDataItem(hdTransfer->aclDatasets[info.name][info.threadIdx], 0); + if (aclData == nullptr) { + throw runtime_error("Acl get tensor data from dataset failed."); + } + ptr = reinterpret_cast(acltdtGetDataAddrFromItem(aclData)); + + // 判断拿到的embedding个数是否与swapOutKeys个数相等 + size_t dimNum = acltdtGetDimNumFromItem(aclData); + int64_t dims[dimNum]; + acltdtGetDimsFromItem(aclData, dims, dimNum); + + LOG_DEBUG("table:{}, batchId:{}, dims[0]:{}, swapOutAddrs size:{}", + info.name, info.batchId, dims[0], swapOutAddrs.size()); + + if (dims[0] != static_cast(swapOutAddrs.size())) { + throw runtime_error("data dims[0] != swapOutKeys.size()"); + } + } + LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingRecvTC(ms):{}", + info.name, info.batchId, info.threadIdx, EmbeddingRecvTC.ElapsedMS()); + lastRecvFinishStepMap[info.name]++; + cvLastRecvFinishMap[info.name][info.cvNotifyIndex].notify_all(); + + return true; +} + +void HybridMgmt::EmbeddingUpdateDDR(const EmbTaskInfo& info, const float* embPtr, vector& swapOutAddrs) +{ + std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); + cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { + return (lastUpdateFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + TimeCost EmbeddingUpdateTC = TimeCost(); + + uint64_t memSize = info.extEmbeddingSize * sizeof(float); + uint64_t extEmbeddingSize = info.extEmbeddingSize; +# pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ + shared(swapOutAddrs, embPtr, extEmbeddingSize, memSize) + for (uint64_t i = 0; i < swapOutAddrs.size(); i++) { + auto rc = memcpy_s(swapOutAddrs[i], memSize, embPtr + i * 
extEmbeddingSize, memSize); + if (rc != 0) { + throw runtime_error("memcpy_s failed, error code:" + to_string(rc)); + } + } + LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingUpdateTC(ms):{}", + info.name, info.batchId, info.threadIdx, EmbeddingUpdateTC.ElapsedMS()); + + lastUpdateFinishStepMap[info.name]++; + cvLastUpdateFinishMap[info.name][info.cvNotifyIndex].notify_all(); +} + +bool HybridMgmt::EmbeddingLookUpDDR(const EmbTaskInfo &info, vector& h2dEmb) +{ + std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); + cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { + return (lastUpdateFinishStepMap[info.name] >= info.batchId) || mutexDestroy; + }); + if (!isRunning) { + return false; + } + + std::unique_lock lastLookUpFinishLocker(lastLookUpFinishMutexMap[info.name][info.threadIdx]); + cvLastLookUpFinishMap[info.name][info.threadIdx].wait(lastLookUpFinishLocker, [info, this] { + return (lastLookUpFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + if (!isRunning) { + return false; + } + + bool isSuccess = BuildH2DEmbedding(info, h2dEmb); + if (!isSuccess) { + return false; + } + + lastLookUpFinishStepMap[info.name]++; + cvLastLookUpFinishMap[info.name][info.cvNotifyIndex].notify_all(); + + return true; +} + +void HybridMgmt::EmbeddingSendDDR(const EmbTaskInfo &info, vector& h2dEmb) +{ + std::unique_lock lastSendFinishLocker(lastSendFinishMutexMap[info.name][info.threadIdx]); + cvLastSendFinishMap[info.name][info.threadIdx].wait(lastSendFinishLocker, [info, this] { + return (lastSendFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + TimeCost SendTC = TimeCost(); + hdTransfer->Send(TransferChannel::H2D, h2dEmb, TRAIN_CHANNEL_ID, info.name, info.batchId); + lastSendFinishStepMap[info.name]++; + cvLastSendFinishMap[info.name][info.cvNotifyIndex].notify_all(); + LOG_DEBUG("table:{}, batchId:{}, thread:{}, SendH2DEmbTC(ms):{}", + info.name, info.batchId, info.threadIdx, SendTC.ElapsedMS()); + + // 对于end of sequence场景,key process需要基于h2dNextBatchId等待每个table都完成了最后1个step发送,才能发EOS至各channel + hybridMgmtBlock->h2dNextBatchId[info.name]++; + LOG_DEBUG("h2dNextBatchId, table:{}, next batchId:{}", info.name, hybridMgmtBlock->h2dNextBatchId[info.name]); +} + +void HybridMgmt::CreateEmbeddingLookUpAndSendThread(int index, const EmbInfo& embInfo) +{ + EmbeddingLookUpAndSendThreadPool.emplace_back([index, embInfo, this]() { + while (true) { + lookUpAndSendBatchIdMtx.lock(); + if (lookUpAndSendTableBatchMap[embInfo.name] % EMBEDDING_THREAD_NUM == index) { + int cur_batch_id = lookUpAndSendTableBatchMap[embInfo.name]; + lookUpAndSendTableBatchMap[embInfo.name]++; + lookUpAndSendBatchIdMtx.unlock(); + if (!isSSDEnabled) { + EmbeddingLookUpAndSendDDR(cur_batch_id, index, embInfo); + } else { + EmbeddingLookUpAndSendSSD(cur_batch_id, index, embInfo); + } + } else { + lookUpAndSendBatchIdMtx.unlock(); + } + if (!isRunning) { + return; + } + } + }); +} + +void HybridMgmt::CreateEmbeddingReceiveAndUpdateThread(int index, const EmbInfo& embInfo) +{ + EmbeddingReceiveAndUpdateThreadPool.emplace_back([index, embInfo, this]() { + while (true) { + receiveAndUpdateBatchIdMtx.lock(); + if (receiveAndUpdateTableBatchMap[embInfo.name] % EMBEDDING_THREAD_NUM == index) { + int cur_batch_id = receiveAndUpdateTableBatchMap[embInfo.name]; + receiveAndUpdateTableBatchMap[embInfo.name]++; + receiveAndUpdateBatchIdMtx.unlock(); + if (!isSSDEnabled) { + EmbeddingReceiveAndUpdateDDR(cur_batch_id, index, embInfo); + } 
else { + EmbeddingReceiveAndUpdateSSD(cur_batch_id, index, embInfo); + } + } else { + receiveAndUpdateBatchIdMtx.unlock(); + } + if (!isRunning) { + return; + } + } + }); +} + +bool HybridMgmt::EmbeddingReceiveSSD(const EmbTaskInfo &info, float *&ptr, + vector &swapOutAddrs, int64_t& dims0) +{ + std::unique_lock lastRecvFinishLocker(lastRecvFinishMutexMap[info.name][info.threadIdx]); + cvLastRecvFinishMap[info.name][info.threadIdx].wait(lastRecvFinishLocker, [info, this] { + return (lastRecvFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + if (!isRunning) { + return false; + } + TimeCost EmbeddingRecvTC = TimeCost(); + // finish时会pop空vector,因此需要额外判定isRunning + swapOutAddrs = tableToQueueLookup[info.name+SWAP_OUT_STR].WaitAndPop(); + if (!isRunning) { + return false; + } + // 等待图执行发送d2h embedding过来 + if (info.batchId != 0) { + TransferChannel transferName = TransferChannel::D2H; + auto size = hdTransfer->RecvAcl(transferName, TRAIN_CHANNEL_ID, info.name, info.threadIdx, info.batchId); + if (size == 0) { + LOG_WARN(HOSTEMB + "recv empty data"); + return false; + } + + auto aclData = acltdtGetDataItem(hdTransfer->aclDatasets[info.name][info.threadIdx], 0); + if (aclData == nullptr) { + throw runtime_error("Acl get tensor data from dataset failed."); + } + ptr = reinterpret_cast(acltdtGetDataAddrFromItem(aclData)); + + // 判断拿到的embedding个数是否与swapOutKeys个数相等 + size_t dimNum = acltdtGetDimNumFromItem(aclData); + int64_t dims[dimNum]; + acltdtGetDimsFromItem(aclData, dims, dimNum); + + LOG_DEBUG("table:{}, batchId:{}, recv d2h, dims[0]:{}, swapOutAddrs.size:{}", + info.name, info.batchId, dims[0], swapOutAddrs.size()); + dims0 = dims[0]; + } + LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingRecvTC(ms):{}", + info.name.c_str(), info.batchId, info.threadIdx, EmbeddingRecvTC.ElapsedMS()); + lastRecvFinishStepMap[info.name]++; + cvLastRecvFinishMap[info.name][info.cvNotifyIndex].notify_all(); + return true; +} + +void HybridMgmt::EmbeddingUpdateSSD(const EmbTaskInfo& info, float *embPtr, + vector& swapOutAddrs, int64_t& dims0) +{ + std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); + cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { + return (lastUpdateFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + + TimeCost EmbeddingUpdateTC = TimeCost(); + std::vector swapOutDDRAddrOffs = HBMSwapKeyQue[info.name + ADDR_STR].WaitAndPop(); + if (!isRunning) { + return; + } + uint64_t memSize = info.extEmbeddingSize * sizeof(float); + uint64_t extEmbeddingSize = info.extEmbeddingSize; + // DDR更新 +# pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ + shared(swapOutAddrs, swapOutDDRAddrOffs, embPtr, extEmbeddingSize, memSize) + for (uint64_t i = 0; i < swapOutAddrs.size(); i++) { + auto rc = memcpy_s(swapOutAddrs[i], memSize, embPtr + swapOutDDRAddrOffs[i] * extEmbeddingSize, memSize); + if (rc != 0) { + throw runtime_error("memcpy_s failed, error code:" + to_string(rc)); + } + } + LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingUpdateTC(ms):{}", + info.name.c_str(), info.batchId, info.threadIdx, EmbeddingUpdateTC.ElapsedMS()); + + // SSD更新 + TimeCost SSDUpdateTC = TimeCost(); + std::vector swapOutSSDAddrOffs = SwapOut2SSDKeyQue[info.name + ADDR_STR].WaitAndPop(); + std::vector swapOutSSDKeys = SwapOut2SSDKeyQue[info.name + SWAP_OUT_STR].WaitAndPop(); + if (!isRunning) { + return; + } + + if (dims0 != static_cast(swapOutAddrs.size() + swapOutSSDKeys.size())) { + throw 
runtime_error("data dims[0] != swapOutKeys.size"); + } + cacheManager->UpdateSSDEmb(info.name, embPtr, extEmbeddingSize, swapOutSSDKeys, swapOutSSDAddrOffs); + LOG_DEBUG("table:{}, batchId:{}, thread{}, SSDUpdateTC(ms):{}", + info.name.c_str(), info.batchId, info.threadIdx, SSDUpdateTC.ElapsedMS()); + + lastUpdateFinishStepMap[info.name]++; + cvLastUpdateFinishMap[info.name][info.cvNotifyIndex].notify_all(); +} + +bool HybridMgmt::EmbeddingLookUpSSD(const EmbTaskInfo& info, vector& h2dEmb) +{ + std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); + cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { + return (lastUpdateFinishStepMap[info.name] >= info.batchId) || mutexDestroy; + }); + if (!isRunning) { + return false; + } + + std::unique_lock lastLookUpFinishLocker(lastLookUpFinishMutexMap[info.name][info.threadIdx]); + cvLastLookUpFinishMap[info.name][info.threadIdx].wait(lastLookUpFinishLocker, [info, this] { + return (lastLookUpFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + if (!isRunning) { + return false; + } + + TimeCost transferDDR2SSDTC = TimeCost(); + // DDR腾空间 + std::vector DDR2SSDKeys = DDRSwapKeyForSSDQue[info.name + SWAP_OUT_STR].WaitAndPop(); + std::vector DDR2SSDAddrs = DDRSwapAddrsQue[info.name + SWAP_OUT_STR].WaitAndPop(); + if (!isRunning) { + return false; + } + cacheManager->TransferDDR2SSD(info.name, info.extEmbeddingSize, DDR2SSDKeys, DDR2SSDAddrs); + LOG_DEBUG("table:{}, thread:{}, transferDDR2SSDTC(ms):{}", + info.name.c_str(), info.threadIdx, transferDDR2SSDTC.ElapsedMS()); + + TimeCost fetchSSDEmb2DDRTC = TimeCost(); + // swapInKeys中在SSD的到DDR + std::vector SSD2DDRKeys = DDRSwapKeyForSSDQue[info.name + SWAP_IN_STR].WaitAndPop(); + std::vector SSD2DDRAddrs = DDRSwapAddrsQue[info.name + SWAP_IN_STR].WaitAndPop(); + if (!isRunning) { + return false; + } + cacheManager->FetchSSDEmb2DDR(info.name, info.extEmbeddingSize, SSD2DDRKeys, SSD2DDRAddrs); + LOG_DEBUG("table:{}, thread:{}, fetchSSDEmb2DDRTC(ms):{}", + info.name.c_str(), info.threadIdx, fetchSSDEmb2DDRTC.ElapsedMS()); + + bool isSuccess = BuildH2DEmbedding(info, h2dEmb); + if (!isSuccess) { + return false; + } + + lastLookUpFinishStepMap[info.name]++; + cvLastLookUpFinishMap[info.name][info.cvNotifyIndex].notify_all(); + + return true; +} + +void HybridMgmt::EmbeddingSendSSD(const EmbTaskInfo& info, vector& h2dEmb) +{ + std::unique_lock lastSendFinishLocker(lastSendFinishMutexMap[info.name][info.threadIdx]); + cvLastSendFinishMap[info.name][info.threadIdx].wait(lastSendFinishLocker, [info, this] { + return (lastSendFinishStepMap[info.name] == info.batchId) || mutexDestroy; + }); + TimeCost SendTC = TimeCost(); + hdTransfer->Send(TransferChannel::H2D, h2dEmb, TRAIN_CHANNEL_ID, info.name, info.batchId); + lastSendFinishStepMap[info.name]++; + cvLastSendFinishMap[info.name][info.cvNotifyIndex].notify_all(); + LOG_DEBUG("table:{}, thread:{}, SendH2DEmbTC(ms):{}", info.name.c_str(), info.threadIdx, SendTC.ElapsedMS()); + + // 对于end of sequence场景,key process需要基于h2dNextBatchId等待每个table都完成了最后1个step发送,才能发EOS至各channel + hybridMgmtBlock->h2dNextBatchId[info.name]++; + LOG_DEBUG("h2dNextBatchId, table:{}, next batchId:{}", info.name, hybridMgmtBlock->h2dNextBatchId[info.name]); +} + +void HybridMgmt::HandleEosCaseHBM(const string &embName, int batchId, int channelId, bool &remainBatchOut) +{ + bool sendAllChannel = false; + if (channelId == EVAL_CHANNEL_ID) { + if (!alreadyTrainOnce) { + // predict场景 + sendAllChannel = true; 
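+            // With no completed training step there is no train channel to drain,
+            // so the EOS must reach every channel: sendAllChannel=true makes the
+            // SendEos call below broadcast instead of targeting one channel.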
+ } else { + // train+eval场景 + hybridMgmtBlock->SetBlockStatus(EVAL_CHANNEL_ID, true); + LOG_INFO("GetUniqueKeys get eos from eval channel, SetBlockStatus=true"); + } + } + KEY_PROCESS_INSTANCE->SendEos(embName, batchId, channelId, sendAllChannel); + remainBatchOut = false; +} + +void HybridMgmt::HandleEndBatchCase(const EmbBaseInfo& info, vector& swapInPos) +{ + if ((info.channelId == TRAIN_CHANNEL_ID) && IsTrainEndBatch(info.batchId)) { + // 如果是train epoch最后一个batch,补发emptySwapOutPos以启动当前step + std::vector emptySwapOutPos; + SendTensorForSwap(info, swapInPos, emptySwapOutPos); + specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_FIRST_BATCH; + LOG_DEBUG("handle last end batch for current epoch, table:{}, batchId:{}", info.name, info.batchId); + return; + } + + if (info.channelId == EVAL_CHANNEL_ID && IsEvalEndBatch(info.batchId)) { + // 当前step之后eval结束,需要设置处理状态 + // 因为eval、predict最后1个batch之后不会像train那样再往后跑,所以必须放这里补发 + LOG_DEBUG("reach max eval step, send emptySwapOutPos tensor for last step to finish eval, " + "change ProcessStatus to {}, table:{}, batchId:{}", + ProcessStatus2Str(ProcessStatus::AFTER_SWITCH_FIRST_BATCH), info.name, info.batchId); + std::vector emptySwapOutPos; + SendTensorForSwap(info, lastSwapInPosMap[info.name], emptySwapOutPos); + specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_FIRST_BATCH; + } +} + +void HybridMgmt::HandleFirstBatchCaseDDR(const EmbBaseInfo& info, + pair, vector>& swapInKoPair, + pair, vector>& swapOutKoPair) +{ + TimeCost swapProcessTC; + auto &swapInKeys = swapInKoPair.first; + auto &swapInPos = swapInKoPair.second; + auto &swapOutKeys = swapOutKoPair.first; + auto &swapOutPos = swapOutKoPair.second; + + vector emptySwapOutKeys; + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", + info.name, info.batchId, info.channelId, swapInKoPair.first.size(), emptySwapOutKeys.size()); + trainTestSwitchInfoStore[info.name] = {swapOutKeys, swapOutPos}; + + LOG_DEBUG("handle first batch case, delay sending swapInPos, table:{}", info.name); + LOG_DEBUG("enqueue HBMSwapKeyQue table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", + info.name, info.batchId, info.channelId, swapInKeys.size(), emptySwapOutKeys.size()); + HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutKeys); + HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKeys); +} + +void HybridMgmt::HandleFirstBatchCaseSSD(const EmbBaseInfo& info, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair) +{ + // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos + vector emptySwapOutKeys; + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", + info.name, info.batchId, info.channelId, swapInKoPair.first.size(), emptySwapOutKeys.size()); + trainTestSwitchInfoStore[info.name] = {swapOutKoPair.first, swapOutKoPair.second}; + + TimeCost ProcessSwapInKeysTC = TimeCost(); + vector SSDToDDRKeys; + vector DDRToSSDKeys; + cacheManager->ProcessSwapInKeys(info.name, swapInKoPair.first, DDRToSSDKeys, SSDToDDRKeys); + LOG_DEBUG("ProcessSwapInKeysTC(ms):{} ", ProcessSwapInKeysTC.ElapsedMS()); + + vector emptySwapOutDDRKeys; + vector emptySwapOutDDRAddrOffs; + vector emptySwapOutSSDKeys; + vector emptySwapOutSSDAddrOff; + + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", + info.name, info.batchId, info.channelId, swapInKoPair.first.size(), swapOutKoPair.first.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapOutDDRKeys.size:{}, 
swapOutDDRAddrOffs.size:{}, " + "swapOutSSDKeys.size:{}, swapOutSSDAddrOff.size:{}", + info.name, info.batchId, info.channelId, emptySwapOutDDRKeys.size(), emptySwapOutDDRAddrOffs.size(), + emptySwapOutSSDKeys.size(), emptySwapOutSSDAddrOff.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToSSDKeys.size:{}, SSDToDDRKeys.size:{}", + info.name, info.batchId, info.channelId, DDRToSSDKeys.size(), SSDToDDRKeys.size()); + + auto DDRToSSDKeysForSSD = DDRToSSDKeys; + auto SSDToDDRKeysForSSD = SSDToDDRKeys; + // DDR<->SSD + DDRSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeys); + DDRSwapKeyQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeys); + + DDRSwapKeyForSSDQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeysForSSD); + DDRSwapKeyForSSDQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeysForSSD); + + // HBM<->DDR + HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutDDRKeys); + HBMSwapKeyQue[info.name + ADDR_STR].Pushv(emptySwapOutDDRAddrOffs); + HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKoPair.first); + + // HBM->SSD + SwapOut2SSDKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutSSDKeys); + SwapOut2SSDKeyQue[info.name + ADDR_STR].Pushv(emptySwapOutSSDAddrOff); +} + +void HybridMgmt::HandleDataSwapForSSD(const EmbBaseInfo& info, + vector &swapInKeys, vector &swapOutKeys) +{ + TimeCost ProcessSwapInKeysTC; + vector SSDToDDRKeys; + vector DDRToSSDKeys; + cacheManager->ProcessSwapInKeys(info.name, swapInKeys, DDRToSSDKeys, SSDToDDRKeys); + LOG_DEBUG("ProcessSwapInKeysTC(ms):{} ", ProcessSwapInKeysTC.ElapsedMS()); + + TimeCost ProcessSwapOutKeysTC; + SwapOutInfo swapInfo; + cacheManager->ProcessSwapOutKeys(info.name, swapOutKeys, swapInfo); + LOG_DEBUG("ProcessSwapOutKeysTC(ms):{} ", ProcessSwapOutKeysTC.ElapsedMS()); + + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", + info.name, info.batchId, info.channelId, swapInKeys.size(), swapOutKeys.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapOutDDRKeys:{}, swapOutDDRAddrOffs:{}, " + "swapOutSSDKeys:{}, swapOutSSDAddrOff:{}", + info.name, info.batchId, info.channelId, swapInfo.swapOutDDRKeys.size(), + swapInfo.swapOutDDRAddrOffs.size(), swapInfo.swapOutSSDKeys.size(), swapInfo.swapOutSSDAddrOffs.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToSSDKeys:{}, SSDToDDRKeys:{}", + info.name, info.batchId, info.channelId, DDRToSSDKeys.size(), SSDToDDRKeys.size()); + + auto DDRToSSDKeysForSSD = DDRToSSDKeys; + auto SSDToDDRKeysForSSD = SSDToDDRKeys; + // DDR<->SSD + DDRSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeys); + DDRSwapKeyQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeys); + + DDRSwapKeyForSSDQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeysForSSD); + DDRSwapKeyForSSDQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeysForSSD); + + // HBM<->DDR + HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(swapInfo.swapOutDDRKeys); + HBMSwapKeyQue[info.name + ADDR_STR].Pushv(swapInfo.swapOutDDRAddrOffs); + HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKeys); + + // HBM->SSD + SwapOut2SSDKeyQue[info.name + SWAP_OUT_STR].Pushv(swapInfo.swapOutSSDKeys); + SwapOut2SSDKeyQue[info.name + ADDR_STR].Pushv(swapInfo.swapOutSSDAddrOffs); +} + +bool HybridMgmt::BuildH2DEmbedding(const EmbTaskInfo &info, vector &h2dEmb) +{ + std::vector swapInAddrs = tableToQueueLookup[info.name+SWAP_IN_STR].WaitAndPop(); + if (!isRunning) { + return false; + } + h2dEmb.emplace_back(Tensor(tensorflow::DT_FLOAT, { + int(swapInAddrs.size()), static_cast(info.extEmbeddingSize) + })); + auto &tmpTensor = h2dEmb.back(); 
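+    // Gather step: row i of the tensor just allocated receives the extEmbeddingSize
+    // floats stored at host address swapInAddrs[i]; the loop below fans the per-row
+    // memcpy_s out across MGMT_CPY_THREADS OpenMP threads.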
+ float *h2dEmbAddr = tmpTensor.flat().data(); + TimeCost embeddingLookupTC = TimeCost(); + + uint64_t memSize = info.extEmbeddingSize * sizeof(float); +# pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ + shared(swapInAddrs, h2dEmbAddr, info, memSize) + for (uint64_t i = 0; i < swapInAddrs.size(); i++) { + auto rc = memcpy_s(h2dEmbAddr + i * info.extEmbeddingSize, memSize, swapInAddrs[i], memSize); + if (rc != 0) { + throw runtime_error("memcpy_s failed, error code:" + to_string(rc)); + } + } + LOG_DEBUG("table:{}, thread:{}, embeddingLookupTC(ms):{}", + info.name.c_str(), info.threadIdx, embeddingLookupTC.ElapsedMS()); + return true; +} + +vector HybridMgmt::GetUniqueKeys(const EmbBaseInfo &info, bool &remainBatchOut) +{ + bool isEos = false; + auto uniqueKeys = KEY_PROCESS_INSTANCE->GetUniqueKeys(info, isEos, lookUpSwapInAddrsPushId); + if (isEos) { + HandleEosCase(info, remainBatchOut); + return uniqueKeys; + } + if (uniqueKeys.empty()) { + remainBatchOut = false; + LOG_WARN("table:{}, channelId:{} batchId:{}, UniqueKeys result is empty", + info.name, info.channelId, info.batchId); + return uniqueKeys; + } + + if (info.channelId == TRAIN_CHANNEL_ID) { + TimeCost KeyMaintainTC; + trainKeysSet[info.name].insert(uniqueKeys.begin(), uniqueKeys.end()); + LOG_DEBUG("table:{}, batchId:{}, KeyMaintainTC(ms):{}", info.name, info.batchId, KeyMaintainTC.ElapsedMS()); + } else { + for (auto &key : uniqueKeys) { + if (trainKeysSet[info.name].find(key) == trainKeysSet[info.name].end()) { + key = INVALID_KEY_VALUE; + LOG_TRACE("find key not train before, set as invalid key"); + } + } + } + + LOG_DEBUG("table:{}, channelId:{} batchId:{}, GetUniqueKeys end", info.name, info.channelId, info.batchId); + return uniqueKeys; +} + +vector HybridMgmt::GetRestoreVecSec(const EmbBaseInfo &info, bool &remainBatchOut) +{ + auto restoreVecSec = KEY_PROCESS_INSTANCE->GetRestoreVecSec(info); + if (restoreVecSec.empty()) { + remainBatchOut = false; + LOG_WARN("table:{}, channelId:{} batchId:{}, restoreVecSec result is empty", + info.name, info.channelId, info.batchId); + return restoreVecSec; + } + LOG_DEBUG("table:{}, channelId:{} batchId:{}, GetRestoreVecSec end", info.name, info.channelId, info.batchId); + return restoreVecSec; +} + +void HybridMgmt::SendAll2AllVec(const EmbBaseInfo &info, bool &remainBatchOut) +{ + if (!mgmtRankInfo.useStatic) { + bool isEos = false; // useless, adapt to HBM mode + TimeCost getAll2AllTC; + unique_ptr> all2all = KEY_PROCESS_INSTANCE->GetInfoVec( + info, ProcessedInfo::ALL2ALL, isEos); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, GetInfoVec all2all end, GetAll2AllTC(ms):{}", + info.name, info.channelId, info.batchId, getAll2AllTC.ElapsedMS()); + if (all2all == nullptr) { + remainBatchOut = false; + LOG_WARN("Information vector is nullptr!"); + return; + } + TimeCost sendAll2AllTC; + hdTransfer->Send(TransferChannel::ALL2ALL, *all2all, info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send all2all end, sendAll2AllTC(ms):{}", + info.name, info.channelId, info.batchId, sendAll2AllTC.ElapsedMS()); + } +} + +void HybridMgmt::SendRestoreVec(const EmbBaseInfo &info, bool &remainBatchOut) +{ + bool isEos = false; // useless, adapt to HBM mode + TimeCost getRestoreTC; + unique_ptr> infoVecs = KEY_PROCESS_INSTANCE->GetInfoVec( + info, ProcessedInfo::RESTORE, isEos); + if (infoVecs == nullptr) { + remainBatchOut = false; + LOG_ERROR("Information vector is nullptr!"); + return; + } + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, get restore 
end, getRestoreTC(ms):{}", + info.name, info.channelId, info.batchId, getRestoreTC.ElapsedMS()); + + TimeCost sendRestoreSyncTC; + hdTransfer->Send(TransferChannel::RESTORE, *infoVecs, info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send restore end, sendRestoreSyncTC(ms):{}", + info.name, info.channelId, info.batchId, sendRestoreSyncTC.ElapsedMS()); +} + +void HybridMgmt::SendLookupOffsets(const EmbBaseInfo &info, + vector &uniqueKeys, vector &restoreVecSec) +{ + TimeCost sendLookupOffsetsTC; + std::vector lookupOffsets; + for (const auto &index : restoreVecSec) { + lookupOffsets.emplace_back(uniqueKeys[index]); + } + hdTransfer->Send(TransferChannel::LOOKUP, { Vec2TensorI32(lookupOffsets) }, info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send lookupOffset, sendLookupOffsetsTC(ms):{}", + info.name, info.channelId, info.batchId, sendLookupOffsetsTC.ElapsedMS()); +} + +void HybridMgmt::SendGlobalUniqueVec(const EmbBaseInfo &info, + vector &uniqueKeys, vector &restoreVecSec) +{ + if (!(info.channelId == TRAIN_CHANNEL_ID && mgmtRankInfo.useSumSameIdGradients)) { + return; + } + TimeCost sendUniqueKeysSyncTC; + hdTransfer->Send(TransferChannel::UNIQKEYS, {mgmtRankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) : + Vec2TensorI32(uniqueKeys) }, info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, sendUniqueKeysSyncTC(ms):{}", + info.name, info.channelId, info.batchId, sendUniqueKeysSyncTC.ElapsedMS()); + + TimeCost sendRestoreVecSecSyncTC; + hdTransfer->Send(TransferChannel::RESTORE_SECOND, {Vec2TensorI32(restoreVecSec) }, info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, sendRestoreVecSecSyncTC(ms):{}", + info.name, info.channelId, info.batchId, sendRestoreVecSecSyncTC.ElapsedMS()); +} + +bool HybridMgmt::HandleSpecialProcessStatusDDR(const EmbBaseInfo &info, TimeCost& getAndSendTensorsTC, + pair, vector> &swapInKoPair, + pair, vector> &swapOutKoPair) +{ + TimeCost swapProcessTC; + auto &swapInPos = swapInKoPair.second; + auto &swapOutKeys = swapOutKoPair.first; + auto &swapOutPos = swapOutKoPair.second; + + if (specialProcessStatus[info.name] == ProcessStatus::AFTER_SWITCH_FIRST_BATCH) { + // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos + HandleFirstBatchCaseDDR(info, swapInKoPair, swapOutKoPair); + LOG_DEBUG("handle channel switch case:afterSwitchFirstBatch, table:{}, channelId:{}, batchId:{}", + info.name, info.channelId, info.batchId); + + if (mgmtRankInfo.ctrlSteps[info.channelId] == 1) { + vector emptySwapOutPos; + SendTensorForSwap(info, swapInPos, emptySwapOutPos); + LOG_DEBUG("ProcessEmbInfoDDR special case, user only run one step, table:{}, channelId:{}, batchId:{}", + info.name, info.channelId, info.batchId); + return true; + } + + specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_SECOND_BATCH; + LOG_DEBUG("ProcessEmbInfoDDR end, table:{}, batchId:{}, swapProcessTC(ms):{}, getAndSendTensorsTC(ms):{}", + info.name, info.batchId, swapProcessTC.ElapsedMS(), getAndSendTensorsTC.ElapsedMS()); + return true; + } + if (specialProcessStatus[info.name] == ProcessStatus::AFTER_SWITCH_SECOND_BATCH) { + // 将上一步暂存的状态合并至当前step一起处理 + auto tempStore = trainTestSwitchInfoStore[info.name]; + swapOutKeys.insert(swapOutKeys.end(), tempStore[0].begin(), tempStore[0].end()); + swapOutPos.insert(swapOutPos.end(), tempStore[1].begin(), tempStore[1].end()); + specialProcessStatus[info.name] = ProcessStatus::NORMAL; + LOG_DEBUG("handle channel switch 
case:afterSwitchSecondBatch, table:{}, channelId:{}, batchId:{}", + info.name, info.channelId, info.batchId); + } + return false; +} + +bool HybridMgmt::HandleSpecialProcessStatusSSD(const EmbBaseInfo &info, TimeCost &getAndSendTensorsTC, + pair, vector> &swapInKoPair, + pair, vector> &swapOutKoPair) +{ + TimeCost swapProcessTC; + auto &swapInPos = swapInKoPair.second; + auto &swapOutKeys = swapOutKoPair.first; + auto &swapOutPos = swapOutKoPair.second; + + if (specialProcessStatus[info.name] == ProcessStatus::AFTER_SWITCH_FIRST_BATCH) { + // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos + HandleFirstBatchCaseSSD(info, swapInKoPair, swapOutKoPair); + LOG_DEBUG("handle channel switch case:afterSwitchFirstBatch, table:{}, channelId:{}, batchId:{}", + info.name, info.channelId, info.batchId); + + if (mgmtRankInfo.ctrlSteps[info.channelId] == 1) { + vector emptySwapOutPos; + SendTensorForSwap(info, swapInPos, emptySwapOutPos); + LOG_DEBUG("ProcessEmbInfoSSD special case, user only run one step, table:{}, channelId:{}, batchId:{}", + info.name, info.channelId, info.batchId); + } + + specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_SECOND_BATCH; + LOG_DEBUG("ProcessEmbInfoSSD end, table:{}, batchId:{}, swapProcessTC(ms):{}, getAndSendTensorsTC(ms):{}", + info.name, info.batchId, swapProcessTC.ElapsedMS(), getAndSendTensorsTC.ElapsedMS()); + return true; + } + if (specialProcessStatus[info.name] == ProcessStatus::AFTER_SWITCH_SECOND_BATCH) { + // 将上一步暂存的状态合并至当前step一起处理 + auto tempStore = trainTestSwitchInfoStore[info.name]; + swapOutKeys.insert(swapOutKeys.end(), tempStore[0].begin(), tempStore[0].end()); + swapOutPos.insert(swapOutPos.end(), tempStore[1].begin(), tempStore[1].end()); + specialProcessStatus[info.name] = ProcessStatus::NORMAL; + LOG_DEBUG("handle channel switch case:afterSwitchSecondBatch, table:{}, channelId:{}, batchId:{}", + info.name, info.channelId, info.batchId); + } + return false; +} + + +void HybridMgmt::CheckLookupAddrSuccessDDR() +{ + if (!lookupAddrSuccess) { + // lookup失败,从future捞出异常 + for (auto& t : lookUpSwapInAddrsThreads) { + t.get(); + } + for (auto& t : lookUpSwapOutAddrsThreads) { + t.get(); + } + } +} + +void HybridMgmt::CheckLookupAddrSuccessSSD() +{ + if (!lookupAddrSuccess) { + for (auto& t : lookUpThreads) { + t.get(); + } + } +} + +void HybridMgmt::GetSwapPairsAndKey2Offset(const EmbBaseInfo &info, vector &uniqueKeys, + pair, vector> &swapInKoPair, + pair, vector> &swapOutKoPair) +{ + TimeCost GetSwapPairsAndKey2OffsetTC; + int swapInCode = embCache->GetSwapPairsAndKey2Offset(info.name, uniqueKeys, swapInKoPair, swapOutKoPair); + if (swapInCode != H_OK) { + string errMsg = StringFormat("table:%s, GetSwapPairsAndKey2Offset failed! 
error code:%d", + info.name.c_str(), swapInCode); + throw runtime_error(errMsg); + } + LOG_DEBUG("table:{}, channel:{}, batchId:{}, GetSwapPairsAndKey2OffsetTC(ms):{}", + info.name, info.channelId, info.batchId, GetSwapPairsAndKey2OffsetTC.ElapsedMS()); +} + +void HybridMgmt::EnqueueSwapInfo(const EmbBaseInfo &info, + pair, vector>& swapInKoPair, + pair, vector>& swapOutKoPair) +{ + auto &swapInKeys = swapInKoPair.first; + auto &swapOutKeys = swapOutKoPair.first; + + LOG_DEBUG("enqueue HBMSwapKeyQue table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", + info.name, info.batchId, info.channelId, swapInKeys.size(), swapOutKeys.size()); + HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(swapOutKeys); + HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKeys); + + CheckLookupAddrSuccessDDR(); +} diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index a7bdcee6..2b4b2fc8 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -19,14 +19,19 @@ See the License for the specific language governing permissions and #include #include #include +#include #include "absl/container/flat_hash_map.h" #include "utils/common.h" #include "utils/config.h" +#include "utils/singleton.h" +#include "utils/task_queue.h" +#include "utils/time_cost.h" +#include "ock_ctr_common/include/factory.h" +#include "ock_ctr_common/include/embedding_cache.h" +#include "ock_ctr_common/include/error_code.h" -#include "host_emb/host_emb.h" -#include "emb_hashmap/emb_hashmap.h" #include "hd_transfer/hd_transfer.h" #include "ssd_cache/cache_manager.h" #include "hybrid_mgmt_block.h" @@ -35,12 +40,41 @@ See the License for the specific language governing permissions and namespace MxRec { using namespace std; using namespace tensorflow; + using namespace Common; enum class TaskType { HBM, DDR }; + enum class ProcessStatus { + NORMAL, + AFTER_SWITCH_FIRST_BATCH, + AFTER_SWITCH_SECOND_BATCH + }; + + inline string ProcessStatus2Str(ProcessStatus s) + { + switch (s) { + case ProcessStatus::NORMAL: + return "normal"; + case ProcessStatus::AFTER_SWITCH_FIRST_BATCH: + return "afterSwitchFirstBatch"; + case ProcessStatus::AFTER_SWITCH_SECOND_BATCH: + return "afterSwitchSecondBatch"; + default: + throw std::invalid_argument("Invalid ProcessStatus"); + } + }; + + struct EmbTaskInfo { + int batchId; + int threadIdx; + int cvNotifyIndex; + int extEmbeddingSize; + string name; + }; + class HybridMgmt { public: HybridMgmt() = default; @@ -59,7 +93,7 @@ namespace MxRec { bool Initialize(RankInfo rankInfo, const vector& embInfos, int seed, const vector& thresholdValues, bool ifLoad); - bool Save(const string savePath); + void Save(const string& savePath); bool Load(const string& loadPath, vector warmStartTables); @@ -77,13 +111,7 @@ namespace MxRec { void Destroy(); - bool ParseKeys(int channelId, int& batchId); - - bool ParseKeysHBM(int channelId, int& batchId); - - bool ProcessEmbInfo(const std::string& embName, int batchId, int channelId, bool& remainBatchOut); - - void EmbHDTrans(const int channelId, const int batchId); + bool ParseKeys(int channelId, int& batchId, TaskType type); bool Evict(); @@ -97,39 +125,100 @@ namespace MxRec { void SetOptimizerInfo(const string& embName, OptimizerInfo optimInfo) const; + void FetchDeviceEmb(); + + void ProcessEmbInfoHBM(const EmbBaseInfo& info, bool& remainBatchOut, bool isGrad); + + void ProcessEmbInfoDDR(const EmbBaseInfo& info, bool& remainBatchOut); + + void ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut); + 
GTEST_PRIVATE:
+    bool mutexDestroy { false };
+    std::mutex lookUpAndSendBatchIdMtx;
+    std::mutex receiveAndUpdateBatchIdMtx;
+    std::map lookUpAndSendTableBatchMap;
+    std::map receiveAndUpdateTableBatchMap;
+
+    std::map> lastUpdateFinishMutexMap;
+    std::map> cvLastUpdateFinishMap;
+    std::map lastUpdateFinishStepMap;
+    std::map> lastLookUpFinishMutexMap;
+    std::map> cvLastLookUpFinishMap;
+    std::map lastLookUpFinishStepMap;
+    std::map> lastSendFinishMutexMap;
+    std::map> cvLastSendFinishMap;
+    std::map lastSendFinishStepMap;
+    std::map> lastRecvFinishMutexMap;
+    std::map> cvLastRecvFinishMap;
+    std::map lastRecvFinishStepMap;
+
+    std::vector EmbeddingLookUpAndSendThreadPool;
+    std::vector EmbeddingReceiveAndUpdateThreadPool;
+    std::vector> lookUpSwapOutAddrsThreads;
+    std::vector> lookUpSwapInAddrsThreads;
+    std::vector> lookUpThreads;
+
+    std::map>> HBMSwapKeyQue;
+    std::map>> SwapOut2SSDKeyQue;
+    std::map>> DDRSwapKeyQue;
+    std::map>> DDRSwapKeyForSSDQue;
+    std::map>> DDRSwapAddrsQue;
+
+    std::mutex evictMut;
+
+    std::map> trainKeysSet;
+
+    const string SWAP_IN_STR = "SwapIn";
+    const string SWAP_OUT_STR = "SwapOut";
+    const string ADDR_STR = "Addr";
+
+    ock::ctr::EmbCacheManagerPtr embCache = nullptr;
+    std::map>> tableToQueueLookup;
+    std::map> lastSwapInPosMap {};
+    std::map>> trainTestSwitchInfoStore {};
+    std::atomic lookupAddrSuccess {true};
+
+    std::mutex saveMutex;
+    std::condition_variable cvCheckSave;

     void SetFeatureTypeForLoad(vector& loadFeatures);

-    bool IsLoadDataMatches(const EmbMemT& loadHostEmbs, const EmbInfo& setupHostEmbs, size_t& embTableCount) const;
-
-    void EvictKeys(const string& embName, const vector& keys);
+    void EvictKeys(const string& embName, const vector& keys);

     void InitRankInfo(RankInfo& rankInfo, const vector& embInfos) const;

-    void EvictSSDKeys(const string& embName, const vector& keys) const;
-
-    void PrepareDDRData(std::shared_ptr table,
-        const vector &keys, int channelId, int batchId) const;
+    void EvictSSDKeys(const string& embName, const vector& keys) const;

     int GetStepFromPath(const string& loadPath) const;

-    static void AddCacheManagerTraceLog(CkptData& saveData);
+    void LookUpAddrs(const string &embName, int extEmbeddingSize);
+
+    void LookUpSwapAddrs(const std::string &embName, const std::string &swapStr);
+
+    void EmbeddingTask();
+
+    void MultiThreadEmbHDTransWrap();
+
+    void EmbeddingLookUpAndSendDDR(int batchId, int index, const EmbInfo& embInfo);
+
+    void EmbeddingReceiveAndUpdateDDR(int batchId, int index, const EmbInfo& embInfo);
+
+    void EmbeddingLookUpAndSendSSD(int batchId, int index, const EmbInfo& embInfo);
+
+    void EmbeddingReceiveAndUpdateSSD(int batchId, int index, const EmbInfo& embInfo);
+
+    void SendTensorForSwap(const EmbBaseInfo& info,
+        const vector &swapInPosUint,
+        const vector &swapOutPosUint);

-    void RestoreFreq4Save(CkptData& saveData) const;
 private:
-    int currentBatchId;
-    int trainBatchId = 0;  // 0-199, 200-
-    int getInfoBatchId;  // 0-199, 200-
-    int sendBatchId;
     HybridMgmtBlock* hybridMgmtBlock;
     vector mgmtEmbInfo;
     RankInfo mgmtRankInfo;
     CacheManager* cacheManager;
-    HostEmb* hostEmbs {};
-    unique_ptr hostHashMaps {};
     vector> procThreads {};
-    map> evictKeyMap {};
+    map> evictKeyMap {};
     HDTransfer *hdTransfer;
     OffsetMapT offsetMapToSend;
     OffsetMapT loadOffsetToSend;
@@ -137,23 +226,101 @@
     bool isRunning;
     bool isLoad { false };
     bool isInitialized { false };
+    bool alreadyTrainOnce = false;  // used to tell whether this is predict-only mode
+    map lookUpSwapInAddrsPushId;  // eos handling: when the consumer catches up with the producer and no upstream data arrives for a long time, eos is triggered
+    map
specialProcessStatus; void TrainTask(TaskType type); void EvalTask(TaskType type); - bool EndBatch(int batchId, int channelId) const; + void SendUniqKeysAndRestoreVecHBM(const EmbBaseInfo &info, + const unique_ptr> &infoVecs, bool isGrad) const; + + void HandleEndBatchCase(const EmbBaseInfo& info, vector& swapInPos); + + bool IsTrainEndBatch(int batchId) const; + + bool IsEvalEndBatch(int batchId) const; + + void InitEmbeddingCache(const vector& embInfos); + + void InitDataPipelineForDDR(const string &embName); + + void InitDataPipelineForSSD(const string &embName, int extEmbeddingSize); + + void JoinEmbeddingCacheThread(); + + void HandleReachMaxStepCase(const EmbBaseInfo& info, bool& remainBatchOut); + + void HandleEosCase(const EmbBaseInfo& info, bool& remainBatchOut); + + void HandleEosCaseHBM(const string& embName, int batchId, int channelId, bool& remainBatchOut); + + bool EmbeddingReceiveDDR(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs); + + void EmbeddingUpdateDDR(const EmbTaskInfo& info, const float* embPtr, vector& swapOutAddrs); + + bool EmbeddingLookUpDDR(const EmbTaskInfo& info, vector& h2dEmb); + + void EmbeddingSendDDR(const EmbTaskInfo& info, vector& h2dEmb); + + bool EmbeddingReceiveSSD(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, int64_t& dims0); + + void EmbeddingUpdateSSD(const EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, int64_t& dims0); + + bool EmbeddingLookUpSSD(const EmbTaskInfo& info, vector& h2dEmb); + + void EmbeddingSendSSD(const EmbTaskInfo& info, vector& h2dEmb); + + void CreateEmbeddingLookUpAndSendThread(int index, const EmbInfo& embInfo); + + void CreateEmbeddingReceiveAndUpdateThread(int index, const EmbInfo& embInfo); + + void HandleFirstBatchCaseDDR(const EmbBaseInfo& info, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); + + void HandleFirstBatchCaseSSD(const EmbBaseInfo& info, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); + + void HandleDataSwapForSSD(const EmbBaseInfo& info, + vector &swapInKeys, vector &swapOutKeys); + + bool BuildH2DEmbedding(const EmbTaskInfo& info, vector& h2dEmb); + + vector GetUniqueKeys(const EmbBaseInfo& info, bool& remainBatchOut); + + vector GetRestoreVecSec(const EmbBaseInfo& info, bool& remainBatchOut); + + void SendAll2AllVec(const EmbBaseInfo& info, bool& remainBatchOut); + + void SendRestoreVec(const EmbBaseInfo& info, bool& remainBatchOut); + + void SendLookupOffsets(const EmbBaseInfo& info, vector& uniqueKeys, vector& restoreVecSec); + + void SendGlobalUniqueVec(const EmbBaseInfo& info, vector& uniqueKeys, vector& restoreVecSec); + + bool HandleSpecialProcessStatusDDR(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - void EmbHDTransWrap(int channelId, const int& batchId, int start); + bool HandleSpecialProcessStatusSSD(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - bool LoadMatchesDDRSetup(const CkptData& loadData); + void CheckLookupAddrSuccessDDR(); - void HandlePrepareDDRDataRet(TransferRet prepareSSDRet) const; + void CheckLookupAddrSuccessSSD(); - void SendUniqKeysAndRestoreVecHBM(int channelId, int& batchId, const EmbInfo &embInfo, - const unique_ptr> &infoVecs) const; + void GetSwapPairsAndKey2Offset(const EmbBaseInfo& info, vector &uniqueKeys, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - void 
SendUniqKeysAndRestoreVecDDR(const string &embName, int &batchId, int &channelId, DDRParam &ddrParam);
+    void EnqueueSwapInfo(const EmbBaseInfo& info,
+        std::pair, vector>& swapInKoPair,
+        std::pair, vector>& swapOutKoPair);
     };
}
#endif  // MX_REC_EMB_MGMT_H
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp
index ad10bac4..65235389 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp
@@ -40,6 +40,7 @@ void HybridMgmtBlock::CheckAndSetBlock(int channelId)
         LOG_DEBUG(HYBRID_BLOCKING + "blocking by save saveInterval {} pythonBatchId {} hybridBatchId {}",
             saveInterval, pythonBatchId[channelId], hybridBatchId[channelId]);
         isBlock[TRAIN_CHANNEL_ID] = true;
+        finishSave = false;
     }
     if (stepsInterval[channelId] == -1) {
         return;
@@ -74,7 +75,8 @@ bool HybridMgmtBlock::WaitValid(int channelId)
 {
     // wait for hybrid to finish processing
     int reTryNumber = 100;
-    LOG_INFO(HYBRID_BLOCKING + "check step invalid, wait {} {}", channelId, hybridBatchId[channelId]);
+    LOG_INFO(HYBRID_BLOCKING + "validate step and wait, channel:{}, pythonBatchId:{}, hybridBatchId:{}",
+        channelId, pythonBatchId[channelId], hybridBatchId[channelId]);
     // wake up again once hybrid has finished processing
     while (pythonBatchId[lastRunChannelId] != hybridBatchId[lastRunChannelId] and isRunning) {
         std::this_thread::sleep_for(std::chrono::milliseconds(10ms));
@@ -85,6 +87,8 @@
     }
     if (pythonBatchId[channelId] == hybridBatchId[channelId]) {
+        LOG_ERROR(HYBRID_BLOCKING + "step not equal, channel:{}, pythonBatchId:{}, hybridBatchId:{}",
+            channelId, pythonBatchId[channelId], hybridBatchId[channelId]);
         return true;
     } else {
         // if hybrid still cannot catch up with the python side after waiting for a long time, raise an error
@@ -159,14 +163,19 @@ void HybridMgmtBlock::DoBlock(int channelId)
 /// \param channelId channel id: train 0, eval 1
 void HybridMgmtBlock::ResetAll(int channelId)
 {
-    LOG_DEBUG(HYBRID_BLOCKING + "Hybridmgmt is resetting data channelId {} hybridBatchId {}",
-        channelId, hybridBatchId[channelId]);
+    LOG_DEBUG(HYBRID_BLOCKING + "start reset block status,"
+        " channelId:{}, pythonBatchId:{}, readEmbedBatchId:{}, hybridBatchId:{}",
+        channelId, pythonBatchId[channelId], readEmbedBatchId[channelId], hybridBatchId[channelId]);
     readEmbedBatchId[channelId] = 0;
     pythonBatchId[channelId] = 0;
     hybridBatchId[channelId] = 0;
     isBlock[channelId] = false;
+    LOG_DEBUG(HYBRID_BLOCKING + "after reset block status,"
+        " channelId:{}, pythonBatchId:{}, readEmbedBatchId:{}, hybridBatchId:{}",
+        channelId, pythonBatchId[channelId], readEmbedBatchId[channelId], hybridBatchId[channelId]);
+
     LOG_DEBUG("Start to reset isNeedSendEos");
     Singleton::GetInstance()->SetEos(0, channelId);
 }
@@ -224,16 +233,37 @@ void HybridMgmtBlock::SetRankInfo(RankInfo ri)
     this->stepsInterval[TRAIN_CHANNEL_ID] = ri.ctrlSteps[TRAIN_CHANNEL_ID];
     this->stepsInterval[EVAL_CHANNEL_ID] = ri.ctrlSteps[EVAL_CHANNEL_ID];
     this->saveInterval = ri.ctrlSteps[SAVE_STEP_INDEX];
+    this->maxTrainStep = ri.ctrlSteps[MAX_TRAIN_STEP_INDEX];
     this->rankInfo = ri;
-};
+}

 void HybridMgmtBlock::SetStepInterval(int trainStep, int evalStep)
 {
     this->stepsInterval[0] = trainStep;
     this->stepsInterval[1] = evalStep;
-};
+}

 HybridMgmtBlock::~HybridMgmtBlock()
 {
     Destroy();
 }
+
+void HybridMgmtBlock::Wake(int channelId)
+{
+    isBlock[channelId] = false;
+}
+
+bool HybridMgmtBlock::IsNeedWaitSave()
+{
+    if (saveInterval != 0 && saveInterval != -1 &&
+        hybridBatchId[TRAIN_CHANNEL_ID] % saveInterval == 0 &&
+        !finishSave) {
+        return true;
+    }
+    return false;
+}
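
IsNeedWaitSave and FinishSave (defined just below) form a small flag handshake with CheckAndSetBlock, which clears finishSave when a save step is reached. A minimal sketch of that handshake, assuming a single train channel and collapsing the surrounding class to the three fields involved:

    #include <atomic>
    #include <cassert>

    // Simplified handshake: the blocking logic clears finishSave when a save
    // step is due; the save path calls FinishSave() once the checkpoint is
    // written, which releases the wait.
    struct SaveGate {
        int saveInterval = 0;
        int hybridBatchId = 0;
        std::atomic<bool> finishSave { true };

        bool IsNeedWaitSave() const
        {
            return saveInterval != 0 && saveInterval != -1 &&
                   hybridBatchId % saveInterval == 0 && !finishSave;
        }
        void FinishSave() { finishSave = true; }
    };

    int main()
    {
        SaveGate gate;
        gate.saveInterval = 100;
        gate.hybridBatchId = 200;
        gate.finishSave = false;        // a save was scheduled at step 200
        assert(gate.IsNeedWaitSave());  // training must wait here
        gate.FinishSave();              // checkpoint written
        assert(!gate.IsNeedWaitSave());
        return 0;
    }
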
+
+void HybridMgmtBlock::FinishSave()
+{
+    finishSave = true;
+}
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.h b/src/core/hybrid_mgmt/hybrid_mgmt_block.h
index 00cdc73e..a969d7a9 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt_block.h
+++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.h
@@ -26,6 +26,7 @@ See the License for the specific language governing permissions and
 namespace MxRec {
     const std::string HYBRID_BLOCKING = "[HYBRID_BLOCKING] ";
     const int SAVE_STEP_INDEX = 2;
+    const int MAX_TRAIN_STEP_INDEX = 3;
     const std::chrono::milliseconds SLEEP_MS = 20ms;

     class HybridMgmtBlock {
@@ -39,6 +40,11 @@
         int pythonBatchId[2] = {0, 0};
         // batch id that the readEmbed operator side will process next
         int readEmbedBatchId[2] = {0, 0};
+        int maxTrainStep = 0;
+        int stepsInterval[2] = {0, 0};  // how many steps channel i runs before switching to channel j
+
+        // step up to which hybrid has finished H2D
+        map h2dNextBatchId;

         int loop[2] = {1, 1};
@@ -76,14 +82,19 @@
         void Destroy();

+        void Wake(int channelId);
+
+        bool IsNeedWaitSave();
+
+        void FinishSave();
+
     private:
-        // how many steps channel i runs before switching to channel j
-        int stepsInterval[2] = {0, 0};
         // flags controlling channel blocking
         bool isBlock[2] = {true, true};
         // number of training steps between checkpoint saves
         int saveInterval = 0;
         RankInfo rankInfo;
+        bool finishSave = true;
     };

     class HybridMgmtBlockingException : public std::exception {
diff --git a/src/core/key_process/feature_admit_and_evict.cpp b/src/core/key_process/feature_admit_and_evict.cpp
index fe7295b2..0305665a 100644
--- a/src/core/key_process/feature_admit_and_evict.cpp
+++ b/src/core/key_process/feature_admit_and_evict.cpp
@@ -144,7 +144,7 @@ FeatureAdmitType FeatureAdmitAndEvict::FeatureAdmitHelper(const int channel, con
 }

 // feature eviction interface
-void FeatureAdmitAndEvict::FeatureEvict(map>& evictKeyMap)
+void FeatureAdmitAndEvict::FeatureEvict(map>& evictKeyMap)
 {
     std::vector tableNames = GetAllNeedEvictTableNames();
     if (tableNames.empty()) {
@@ -163,7 +163,7 @@
     }
 }

-void FeatureAdmitAndEvict::FeatureEvictHelper(const std::string& embName, std::vector& evictKey)
+void FeatureAdmitAndEvict::FeatureEvictHelper(const std::string& embName, std::vector& evictKey)
 {
     // evict and delete from m_historyRecords
     time_t currTime = m_recordsData.timestamps[embName];
diff --git a/src/core/key_process/feature_admit_and_evict.h b/src/core/key_process/feature_admit_and_evict.h
index 0b31b080..6c82c846 100644
--- a/src/core/key_process/feature_admit_and_evict.h
+++ b/src/core/key_process/feature_admit_and_evict.h
@@ -25,7 +25,6 @@ See the License for the specific language governing permissions and
 #include
 #include
 #include "absl/container/flat_hash_map.h"
-#include "host_emb/host_emb.h"
 #include "utils/common.h"
 #include "utils/safe_queue.h"
 #include "utils/singleton.h"
@@ -69,7 +68,7 @@
             KeysT& splitKey, std::vector& keyCount);

         // feature eviction interface
-        void FeatureEvict(map>& evictKeyMap);
+        void FeatureEvict(map>& evictKeyMap);

         void ExecuteFeatureAdmit(
             const string& tableName, int channel, KeysT& splitKey, absl::flat_hash_map& mergeKeys);
@@ -105,7 +104,7 @@
         std::vector GetAllNeedEvictTableNames();
         FeatureAdmitType FeatureAdmitHelper(const int channel, const std::string& tableNameOrigin,
             const int64_t featureId, const uint32_t featureCnt);
-        void FeatureEvictHelper(const std::string& embName, std::vector& evictKey);
+        void FeatureEvictHelper(const std::string& embName, std::vector& evictKey);
         void ResetAllRecords();

         bool m_isEnableFunction { true };  // enable switch for the feature-eviction function
diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp
index 22148581..63163453 100644
--- 
a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -24,7 +24,6 @@ See the License for the specific language governing permissions and #include "utils/singleton.h" #include "utils/time_cost.h" #include "utils/config.h" -#include "host_emb/host_emb.h" #include "emb_table/embedding_mgmt.h" #include "hd_transfer/hd_transfer.h" #include "ock_ctr_common/include/error_code.h" @@ -44,10 +43,15 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos const vector& thresholdValues, int seed) { + readySendEosCnt[TRAIN_CHANNEL_ID].store(0); + readySendEosCnt[EVAL_CHANNEL_ID].store(0); + finishSendEosCnt[TRAIN_CHANNEL_ID].store(0); + finishSendEosCnt[EVAL_CHANNEL_ID].store(0); + this->rankInfo = rInfo; - + SetupHotEmbUpdateStep(); - + map scInfo; for (const auto& info: eInfos) { embInfos[info.name] = info; @@ -79,13 +83,6 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos LOG_WARN(KEY_PROCESS "Feature admit-and-evict function is unavailable ..."); } - if (GlobalEnv::fastUnique) { - int result = ock::ctr::Factory::Create(factory); - if (result != 0) { - throw runtime_error(Logger::Format("create fast factory failed, error code:{}", result)); - } - } - LOG_INFO(KEY_PROCESS "scInfo:{}, localRankSize:{}, rankSize:{}, useStatic:{}", MapToString(scInfo), rInfo.localRankSize, rInfo.rankSize, rInfo.useStatic); #ifndef GTEST @@ -374,25 +371,32 @@ bool KeyProcess::KeyProcessTaskHelperWithFastUnique(unique_ptr& batch // Static all2all,need send count if (!rankInfo.useStatic) { SendA2A(uniqueInfo.all2AllInfo.scAll, batch->name, batch->channel, batch->batchId); } + TimeCost pushResultTC; auto tensors = make_unique>(); tensors->push_back(Vec2TensorI32(uniqueInfo.restore)); uniqueInfo.hotPos.resize(hotEmbTotCount[batch->name], -1); tensors->push_back(Vec2TensorI32(uniqueInfo.hotPos)); - + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(move(tensors), uniqueInfo.all2AllInfo.keyRecv, channel); tensors->push_back(rankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueInfo.all2AllInfo.keyRecv) : - Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); + Vec2TensorI32(uniqueInfo.all2AllInfo.keyRecv)); + PushResultHBM(batch, move(tensors)); + } else { + std::vector lookupKeysUint(uniqueInfo.all2AllInfo.keyRecv.begin(), + uniqueInfo.all2AllInfo.keyRecv.end()); + vector uniqueKeys; + vector restoreVecSec; + GlobalUnique(lookupKeysUint, uniqueKeys, restoreVecSec); + PushResultDDR(batch, move(tensors), uniqueKeys, restoreVecSec); } - TimeCost pushResultTC; - PushResult(batch, move(tensors), uniqueInfo.all2AllInfo.keyRecv); + LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); if (GlogConfig::gStatOn) { LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost_with_fast_unique {}", channel, batch->batchId, rankInfo.rankId, totalTimeCost.ElapsedMS()); } - LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS()); return true; } @@ -440,13 +444,19 @@ bool KeyProcess::KeyProcessTaskHelper(unique_ptr& batch, int channel, hotPos.resize(hotEmbTotCount[batch->name], 0); tensors->push_back(Vec2TensorI32(hotPos)); - + if (!rankInfo.isDDR) { PushGlobalUniqueTensors(tensors, lookupKeys, channel); tensors->push_back(rankInfo.useDynamicExpansion ? 
Vec2TensorI64(lookupKeys) : Vec2TensorI32(lookupKeys));
+        PushResultHBM(batch, move(tensors));
+    } else {
+        std::vector lookupKeysUint(lookupKeys.begin(), lookupKeys.end());
+        vector uniqueKeys;
+        vector restoreVecSec;
+        GlobalUnique(lookupKeysUint, uniqueKeys, restoreVecSec);
+        PushResultDDR(batch, move(tensors), uniqueKeys, restoreVecSec);
     }
-    PushResult(batch, move(tensors), lookupKeys);
     LOG_DEBUG("pushResultTC(ms):{}", pushResultTC.ElapsedMS());
     if (GlogConfig::gStatOn) {
         LOG_INFO(STAT_INFO "channel_id {} batch_id {} rank_id {} key_process_time_cost {}",
@@ -504,15 +514,22 @@ vector KeyProcess::GetCountRecv(const unique_ptr& batch, in
     return countRecv;
 }

-void KeyProcess::PushResult(unique_ptr& batch, unique_ptr> tensors,
-    KeysT& lookupKeys)
+void KeyProcess::PushResultHBM(unique_ptr& batch, unique_ptr> tensors)
 {
     std::unique_lock lockGuard(mut);
     storage.push_front(move(tensors));
     infoList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, storage.begin()));
-    if (rankInfo.isDDR) {
-        lookupKeysList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, move(lookupKeys)));
-    }
+    lockGuard.unlock();
+}
+
+void KeyProcess::PushResultDDR(unique_ptr& batch, unique_ptr> tensors,
+    std::vector& uniqueKeys, std::vector& restoreVecSec)
+{
+    std::unique_lock lockGuard(mut);
+    storage.push_front(move(tensors));
+    infoList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, storage.begin()));
+    uniqueKeysList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, move(uniqueKeys)));
+    restoreVecSecList[batch->name][batch->channel].push(make_tuple(batch->batchId, batch->name, move(restoreVecSec)));
     lockGuard.unlock();
 }
@@ -1158,33 +1175,113 @@ void KeyProcess::BuildRestoreVec(const unique_ptr& batch, const vecto
 }

 template
-T KeyProcess::GetInfo(info_list_t& list, int batch, const string& embName, int channel)
+T KeyProcess::GetInfo(info_list_t& list, const EmbBaseInfo &info)
 {
     std::lock_guard lockGuard(mut);
-    if (list[embName][channel].empty()) {
+    if (list[info.name][info.channelId].empty()) {
         LOG_TRACE("get info list is empty.");
         throw EmptyList();
     }
-    auto topBatch = get(list[embName][channel].top());
-    if (topBatch < batch) {
-        LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel", topBatch, batch, channel);
+    auto topBatch = get(list[info.name][info.channelId].top());
+    if (topBatch < info.batchId) {
+        LOG_ERROR("wrong batch id, top:{} getting:{}, channel:{}, may not clear channel",
+            topBatch, info.batchId, info.channelId);
         this_thread::sleep_for(1s);
     }
-    if (topBatch != batch) {
-        LOG_TRACE("topBatch({}) is not equal batch({}).", topBatch, batch);
+    if (topBatch != info.batchId) {
+        LOG_TRACE("topBatch({}) is not equal batch({}).", topBatch, info.batchId);
         throw WrongListTop();
     }
-    auto t = list[embName][channel].top();
-    list[embName][channel].pop();
+    auto t = list[info.name][info.channelId].top();
+    list[info.name][info.channelId].pop();
     return move(t);
 }
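
GetInfo hands out results strictly in batch order: the consumer names the batch it expects, and the two exception types distinguish "nothing queued yet" from "the queued head is a different batch". A self-contained sketch of that contract, assuming a simplified payload type and a plain priority queue in place of info_list_t:

    #include <cassert>
    #include <exception>
    #include <queue>
    #include <utility>
    #include <vector>

    struct EmptyList : std::exception {};
    struct WrongListTop : std::exception {};

    // Min-heap ordered by batch id; only the entry whose batch id matches the
    // caller's expectation is handed out.
    using Entry = std::pair<int, std::vector<int>>;
    struct ByBatch {
        bool operator()(const Entry& a, const Entry& b) const { return a.first > b.first; }
    };
    using BatchQueue = std::priority_queue<Entry, std::vector<Entry>, ByBatch>;

    std::vector<int> GetInfoSketch(BatchQueue& q, int wantedBatch)
    {
        if (q.empty()) {
            throw EmptyList();       // caller sleeps briefly and retries
        }
        if (q.top().first != wantedBatch) {
            throw WrongListTop();    // producer has not pushed this batch yet
        }
        auto payload = q.top().second;
        q.pop();
        return payload;
    }

    int main()
    {
        BatchQueue q;
        q.push({ 1, { 42 } });
        try { GetInfoSketch(q, 0); assert(false); } catch (WrongListTop&) {}
        q.push({ 0, { 7 } });
        assert(GetInfoSketch(q, 0).front() == 7);
        return 0;
    }
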

 /// In DDR mode, fetch the lookup tensor vector from the list
 /// \param batch number of processed batches
 /// \param embName table name
 /// \param channel channel index (train/eval)
 /// \return
-KeysT KeyProcess::GetLookupKeys(int batch, const string& embName, int channel)
+vector KeyProcess::GetUniqueKeys(const EmbBaseInfo& info, bool& isEos,
+    map &lookUpSwapInAddrsPushId)
+{
+    TimeCost tc = TimeCost();
+
+    HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance();
+    bool cancelMonitor = false;
+    thread timeoutMonitor;
+    if (info.batchId != 0) {
+        timeoutMonitor = StartEosMonitorThread(info, cancelMonitor);
+    }
+
+    // keep trying to fetch data from the list; return an empty vector if the key process thread exits or processing times out
+    vector ret;
+    auto startTime = std::chrono::system_clock::now();
+    while (true) {
+        if (!isRunning) {
+            break;
+        }
+        auto endTime = std::chrono::system_clock::now();
+        // check whether info.batchId has already expired, i.e. the channel has been reset
+        if (info.batchId != hybridMgmtBlock->hybridBatchId[info.channelId]) {
+            LOG_DEBUG(KEY_PROCESS "Detected that the batch has expired at this time, exiting the loop! {}[{}]:{}",
+                info.name, info.channelId, info.batchId);
+            break;
+        }
+        if (info.batchId != 0 && info.channelId != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) {
+            LOG_WARN(KEY_PROCESS "getting lookup keys timeout! {}[{}]:{}",
+                info.name, info.channelId, info.batchId);
+            break;
+        }
+        try {
+            auto infoVec = GetInfo(uniqueKeysList, info);
+            ret = get>(infoVec);
+            break;
+        } catch (EmptyList&) {
+            unique_lock lockEosGuard(eosMutex);
+            isEos = IsGetUniqueKeysEos(info, startTime, lookUpSwapInAddrsPushId);
+            if (isEos) {
+                break;
+            }
+            this_thread::sleep_for(1ms);
+        } catch (WrongListTop&) {
+            LOG_TRACE("getting info failed table:{}, channel:{}, mgmt batchId:{}, wrong top",
+                info.name, info.channelId, info.batchId);
+            this_thread::sleep_for(1ms);
+        }
+    }
+    cancelMonitor = true;
+    if (timeoutMonitor.joinable()) {
+        timeoutMonitor.join();
+    }
+    return ret;
+}
+
+bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::system_clock::time_point& startTime,
+    map& lookUpSwapInAddrsPushId)
+{
+    HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance();
+    auto endTime = std::chrono::system_clock::now();
+
+    // the real number of readEmbKey calls is readEmbedBatchId minus 1
+    int readEmbKeyBatchId = hybridMgmtBlock->readEmbedBatchId[info.channelId] - 1;
+    // prevent eos from cutting in ahead of data that keyProcess has not finished processing
+    std::chrono::duration elapsedTime = endTime - startTime;
+    if (info.batchId != 0 && elapsedTime.count() >= timeoutGetUniqueKeysEmpty) {
+        LOG_DEBUG("table:{}, channelId:{}, isNeedSendEos:{}, readEmbKeyBatchId:{}, batch:{}, h2dNextBatchId:{},"
+            " lookUpSwapInAddrsPushId:{}", info.name, info.channelId, isNeedSendEos[info.channelId],
+            readEmbKeyBatchId, info.batchId, hybridMgmtBlock->h2dNextBatchId[info.name],
+            lookUpSwapInAddrsPushId[info.name]);
+        startTime = std::chrono::system_clock::now();
+    }
+    if (isNeedSendEos[info.channelId] && readEmbKeyBatchId < info.batchId &&
+        hybridMgmtBlock->h2dNextBatchId[info.name] == lookUpSwapInAddrsPushId[info.name]) {
+        LOG_INFO("table:{}, channelId:{} batchId:{}, GetUniqueKeys eos",
+            info.name, info.channelId, info.batchId);
+        return true;
+    }
+    LOG_TRACE("getting uniqueKeys failed, table:{}, channel:{}, mgmt batchId:{}, readEmbKey batchId:{}, list is empty",
+        info.name, info.channelId, info.batchId, readEmbKeyBatchId);
+    return false;
+}
{}[{}]:{}", - embName, channel, batch); + info.name, info.channelId, info.batchId); return {}; } - if (batch != 0 && channel != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { - LOG_WARN(KEY_PROCESS "getting lookup keys timeout! {}[{}]:{}", embName, channel, batch); + if (info.batchId != 0 && info.channelId != 0 && tc.ElapsedSec() > KEY_PROCESS_TIMEOUT) { + LOG_WARN(KEY_PROCESS "getting lookup keys timeout! {}[{}]:{}", info.name, info.channelId, info.batchId); return {}; } try { - auto ret = GetInfo(lookupKeysList, batch, embName, channel); - return get(ret); + auto ret = GetInfo(restoreVecSecList, info); + return get>(ret); } catch (EmptyList&) { unique_lock lockEosGuard(eosMutex); // readEmbKey真实的次数是readEmbedBatchId减1 - int readEmbKeyBatchId = hybridMgmtBlock->readEmbedBatchId[channel] - 1; + int readEmbKeyBatchId = hybridMgmtBlock->readEmbedBatchId[info.channelId] - 1; // 避免eos在keyProcess还未处理完数据时插队到通道前面 - if (isNeedSendEos[channel] && readEmbKeyBatchId < batch) { - LOG_INFO("channelId:{} batchId:{}, GetLookupKeys eos.", channel, batch); - unique_lock lockDestroyGuard(destroyMutex); - SendEos(batch, channel); - return {}; + if (isNeedSendEos[info.channelId] && readEmbKeyBatchId < info.batchId && + hybridMgmtBlock->h2dNextBatchId[info.name] == info.batchId) { + LOG_ERROR("channelId:{} batchId:{}, GetRestoreVecSec eos, code should not reach here", + info.channelId, info.batchId); + throw runtime_error("GetRestoreVecSec eos, code should not reach here"); } LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", - embName, channel, batch, readEmbKeyBatchId); + info.name, info.channelId, info.batchId, readEmbKeyBatchId); this_thread::sleep_for(1ms); } catch (WrongListTop&) { - LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); + LOG_TRACE("getting info failed {}[{}]:{} wrong top", info.name, info.channelId, info.batchId); this_thread::sleep_for(1ms); } } } /// 当数据列表为空,且eos标志位为true时,主动发送eos +/// \param embName 表名 /// \param batchId 已处理的batch数 /// \param channel 通道索引(训练/推理) -void KeyProcess::SendEos(int batchId, int channel) +/// \param sendAllChannel 是否强制发送所有channel +void KeyProcess::SendEos(const std::string& embName, int batchId, int channel, bool sendAllChannel) { #ifndef GTEST - LOG_INFO("channelId:{} batchId:{}, SendEos start.", channel, batchId); - - auto trans = Singleton::GetInstance(); - unordered_map transChannels = trans->GetTransChannel(); - std::set usedChannelNames = trans->GetUsedTransChannel()[channel]; - - vector tensors; - bool isNeedResend = true; - - for (const auto& emb: as_const(embInfos)) { // 一个表触发以后,其余表都发送eos,最后外层接收null退出此次循环 - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, batchId, emb.first); - if (!isRunning) { - throw EndRunExit("SendEos end run, isRunning is false after lock destroyMutex."); - } - for (const string& transName : usedChannelNames) { - string sendName = StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel); - size_t channelSize = 0; - - acltdtQueryChannelSize(transChannels[sendName], &channelSize); - LOG_INFO("[EOS] Before send eos, {} contains {}.", sendName, channelSize); - SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, isNeedResend); - acltdtQueryChannelSize(transChannels[sendName], &channelSize); - LOG_INFO("[EOS] After send eos, {} contains {}.", sendName, channelSize); - } - LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, batchId, 

 /// when the data list is empty and the eos flag is true, actively send eos
+/// \param embName table name
 /// \param batchId number of processed batches
 /// \param channel channel index (train/eval)
+/// \param sendAllChannel whether to force sending on all channels
-void KeyProcess::SendEos(int batchId, int channel)
+void KeyProcess::SendEos(const std::string& embName, int batchId, int channel, bool sendAllChannel)
 {
 #ifndef GTEST
-    LOG_INFO("channelId:{} batchId:{}, SendEos start.", channel, batchId);
-
-    auto trans = Singleton::GetInstance();
-    unordered_map transChannels = trans->GetTransChannel();
-    std::set usedChannelNames = trans->GetUsedTransChannel()[channel];
-
-    vector tensors;
-    bool isNeedResend = true;
-
-    for (const auto& emb: as_const(embInfos)) {  // once one table triggers, all other tables send eos too; the outer layer then receives null and exits this loop
-        LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos start.", channel, batchId, emb.first);
-        if (!isRunning) {
-            throw EndRunExit("SendEos end run, isRunning is false after lock destroyMutex.");
-        }
-        for (const string& transName : usedChannelNames) {
-            string sendName = StringFormat("%s_%s_%d", emb.first.c_str(), transName.c_str(), channel);
-            size_t channelSize = 0;
-
-            acltdtQueryChannelSize(transChannels[sendName], &channelSize);
-            LOG_INFO("[EOS] Before send eos, {} contains {}.", sendName, channelSize);
-            SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, isNeedResend);
-            acltdtQueryChannelSize(transChannels[sendName], &channelSize);
-            LOG_INFO("[EOS] After send eos, {} contains {}.", sendName, channelSize);
-        }
-        LOG_INFO("channelId:{} batchId:{}, the embName:{} related channel SendEos end.", channel, batchId, emb.first);
+    finishSendEosCnt[channel].store(0);
+    ++readySendEosCnt[channel];
+    LOG_INFO("table:{}, channelId:{} batchId:{}, readySendEosCnt:{}, ready to SendEos",
+        embName, channel, batchId, readySendEosCnt[channel]);
+    while (readySendEosCnt[channel] != static_cast(embInfos.size())) {
+        LOG_DEBUG("table:{}, readySendEosCnt:{}, waiting for the other tables to enter SendEos",
+            embName, readySendEosCnt[channel]);
+        this_thread::sleep_for(1000ms);
+    }
+    LOG_INFO("table:{}, channelId:{} batchId:{}, SendEos start, acquiring destroyMutex", embName, channel, batchId);
+    destroyMutex.lock();
+
+    LOG_INFO("table:{}, channelId:{} batchId:{}, SendEos start", embName, channel, batchId);
+    if (!isRunning) {
+        LOG_INFO("another table triggered eos ahead, keyProcess already destroyed, skip sending eos for table:{}",
+            embName);
+        ++finishSendEosCnt[channel];
+        destroyMutex.unlock();
+        return;
     }
+    SendEosTensor(embName, channel, sendAllChannel);
+    destroyMutex.unlock();
+    LOG_INFO("channelId:{} batchId:{}, the embName:{} SendEos end, release destroyMutex", channel, batchId, embName);

-    LOG_INFO("channelId:{} batchId:{}, SendEos end.", channel, batchId);
+    ++finishSendEosCnt[channel];
+    LOG_INFO("table:{}, channelId:{} batchId:{}, finishSendEosCnt:{}, finished SendEos",
+        embName, channel, batchId, finishSendEosCnt[channel]);
+    while (finishSendEosCnt[channel] != static_cast(embInfos.size())) {
+        LOG_DEBUG("table:{}, channelId:{} batchId:{}, finishSendEosCnt:{}, waiting for the other tables to finish SendEos",
+            embName, channel, batchId, finishSendEosCnt[channel]);
+        this_thread::sleep_for(1000ms);
+    }
+    readySendEosCnt[channel].store(0);
     isNeedSendEos[channel] = false;
+    LOG_DEBUG("isNeedSendEos set to false, table:{}, channelId:{} batchId:{}", embName, channel, batchId);
 #endif
 }
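
The readySendEosCnt/finishSendEosCnt pair above is a two-phase rendezvous: no table sends EOS until every table has entered SendEos, and none resets state until all have finished sending. A minimal single-use sketch of that pattern with std::atomic counters (the table count and the send step are stand-ins for embInfos.size() and SendEosTensor):

    #include <atomic>
    #include <cassert>
    #include <thread>
    #include <vector>

    std::atomic<int> readyCnt { 0 };
    std::atomic<int> finishCnt { 0 };

    // One thread per table: phase 1 waits for all tables to arrive, phase 2
    // waits for all tables to have sent.
    void SendEosSketch(int tableCount)
    {
        ++readyCnt;
        while (readyCnt.load() != tableCount) {
            std::this_thread::yield();   // wait for the other tables to enter
        }
        // ... send the EOS tensor for this table's channels here ...
        ++finishCnt;
        while (finishCnt.load() != tableCount) {
            std::this_thread::yield();   // wait until everyone has sent
        }
    }

    int main()
    {
        const int tables = 4;
        std::vector<std::thread> ts;
        for (int i = 0; i < tables; ++i) {
            ts.emplace_back(SendEosSketch, tables);
        }
        for (auto& t : ts) { t.join(); }
        assert(finishCnt.load() == tables);
        return 0;
    }
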
{}[{}]:{}", info.name, info.channelId, info.batchId); + break; } try { - auto ret = GetInfo(*list, batch, embName, channel); - auto it = get>>::iterator>(ret); - auto uTensor = move(*it); + auto infoVec = GetInfo(*list, info); + auto it = get>>::iterator>(infoVec); + ret = std::move(*it); std::unique_lock lockGuard(mut); storage.erase(it); - return uTensor; + break; } catch (EmptyList&) { unique_lock lockEosGuard(eosMutex); - // 避免eos在keyProcess还未处理完数据时插队到通道前面, readEmbKey真实的次数是readEmbedBatchId减1 - if (isNeedSendEos[channel] && (hybridMgmtBlock->readEmbedBatchId[channel] - 1) < batch) { - LOG_INFO("channelId:{} batchId:{}, GetInfoVec eos.", channel, batch); - unique_lock lockDestroyGuard(destroyMutex); - SendEos(batch, channel); - return nullptr; + isEos = IsGetInfoVecEos(info.batchId, info.name, info.channelId); + if (isEos) { + break; } LOG_TRACE("getting info failed {}[{}], list is empty, and mgmt batchId: {}, readEmbKey batchId: {}.", - embName, channel, batch, (hybridMgmtBlock->readEmbedBatchId[channel] - 1)); + info.name, info.channelId, info.batchId, (hybridMgmtBlock->readEmbedBatchId[info.channelId] - 1)); this_thread::sleep_for(1ms); } catch (WrongListTop&) { - LOG_TRACE("getting info failed {}[{}]:{} wrong top", embName, channel, batch); + LOG_TRACE("getting info failed {}[{}]:{} wrong top", info.name, info.channelId, info.batchId); this_thread::sleep_for(1ms); } } + return ret; } void KeyProcess::SendA2A(const vector& a2aInfo, const string& embName, int channel, int batch) @@ -1355,13 +1457,13 @@ int KeyProcess::GetMaxStep(int channelId) const return rankInfo.ctrlSteps.at(channelId); } -void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm +void KeyProcess::EvictKeys(const string& embName, const vector& keys) // hbm { LOG_INFO(KEY_PROCESS "hbm funEvictCall: [{}]! 

-void KeyProcess::EvictKeys(const string& embName, const vector& keys)  // hbm
+void KeyProcess::EvictKeys(const string& embName, const vector& keys)  // hbm
 {
     LOG_INFO(KEY_PROCESS "hbm funEvictCall: [{}]! keySize:{}", embName, keys.size());
     EmbeddingMgmt::Instance()->EvictKeys(embName, keys);
 }

-void KeyProcess::EvictKeysCombine(const vector& keys)  // hbm
+void KeyProcess::EvictKeysCombine(const vector& keys)  // hbm
 {
     LOG_INFO(KEY_PROCESS "hbm combine funEvictCall, keySize:{}", keys.size());
     EmbeddingMgmt::Instance()->EvictKeysCombine(keys);
 }
@@ -1466,7 +1568,94 @@ void KeyProcess::RecordKeyCountMap(const unique_ptr& batch)
 void KeyProcess::SetEos(int status, int channelId)
 {
     unique_lock lockGuard(eosMutex);
-    LOG_INFO("isNeedSendEos status is changed, before status:[{}], input status:{}, channel:[{}], ",
-        isNeedSendEos[channelId], status, channelId);
+    LOG_INFO("isNeedSendEos status is changed, channel:{}, before status:{}, input status:{}",
+        channelId, isNeedSendEos[channelId], status);
     isNeedSendEos[channelId] = (status == 1);
 }
+
+bool KeyProcess::IsGetInfoVecEos(int batch, const string& embName, int channel)
+{
+    HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance();
+
+    // prevent eos from cutting in ahead of data that keyProcess has not finished; the real readEmbKey count is readEmbedBatchId minus 1
+    int readEmbKeyBatchId = hybridMgmtBlock->readEmbedBatchId[channel] - 1;
+    if (rankInfo.isDDR) {
+        if (isNeedSendEos[channel] && readEmbKeyBatchId < batch &&
+            hybridMgmtBlock->h2dNextBatchId[embName] == batch) {
+            LOG_ERROR("channelId:{} batchId:{}, GetInfoVec eos, code should not reach here", channel, batch);
+            throw runtime_error("GetInfoVec eos, code should not reach here");
+        }
+    } else {
+        LOG_TRACE("table:{}, channelId:{}, readEmbKeyBatchId:{}, batchId:{}, isNeedSendEos:{}",
+            embName, channel, readEmbKeyBatchId, batch, isNeedSendEos[channel]);
+        if (isNeedSendEos[channel] && readEmbKeyBatchId < batch) {
+            LOG_INFO("table:{}, channelId:{} batchId:{}, GetInfoVec eos", embName, channel, batch);
+            return true;
+        }
+    }
+    return false;
+}
+
+std::thread KeyProcess::StartEosMonitorThread(const EmbBaseInfo &info, bool &cancelMonitor)
+{
+    // embCache sends swapPos one step late, so step n needs data from step n+1 to start; when step n+1
+    // cannot be fetched, eos must be triggered and the swapPos needed by step n re-sent
+    LOG_DEBUG("table:{}, channel:{}, batchId:{}, start a monitor thread to check eos",
+        info.name, info.channelId, info.batchId);
+    return thread([&]() {
+        chrono::high_resolution_clock::time_point start = chrono::high_resolution_clock::now();
+        chrono::high_resolution_clock::time_point end = chrono::high_resolution_clock::now();
+        chrono::duration duration = chrono::duration_cast>(end - start);
+        while (!cancelMonitor && duration.count() < timeoutGetUniqueKeys) {
+            this_thread::sleep_for(1ms);
+            end = chrono::high_resolution_clock::now();
+            duration = chrono::duration_cast>(end - start);
+        }
+        if (!cancelMonitor) {
+            this->SetEos(1, info.channelId);
+            LOG_INFO("table:{}, channel:{}, batchId:{}, timeout:{}(s) monitor empty data, set eos",
+                info.name, info.channelId, info.batchId, timeoutGetUniqueKeys);
+        } else {
+            LOG_DEBUG("table:{}, channel:{}, batchId:{}, timeout monitor canceled",
+                info.name, info.channelId, info.batchId);
+        }
+    });
+}
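
StartEosMonitorThread is a cancellable watchdog: it polls a flag every millisecond and fires the EOS action only if the timeout elapses first. A self-contained sketch of the same structure, with the SetEos call replaced by a print and std::chrono::steady_clock used for the measurement (both are my substitutions, not the patch's choices):

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <thread>

    // Poll a cancellation flag; fire only if timeoutSec elapses first.
    std::thread StartWatchdog(std::atomic<bool>& cancel, double timeoutSec)
    {
        return std::thread([&cancel, timeoutSec]() {
            auto start = std::chrono::steady_clock::now();
            while (!cancel.load()) {
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
                std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
                if (elapsed.count() >= timeoutSec) {
                    std::puts("timeout: would set eos here");
                    return;
                }
            }
        });
    }

    int main()
    {
        std::atomic<bool> cancel { false };
        std::thread watchdog = StartWatchdog(cancel, 10.0);
        // ... data arrived in time, so cancel the watchdog ...
        cancel = true;
        watchdog.join();   // the caller always joins before returning
        return 0;
    }
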
+
+void KeyProcess::SendEosTensor(const std::string& embName, int channel, bool sendAllChannel)
+{
+#ifndef GTEST
+    auto trans = Singleton::GetInstance();
+    unordered_map transChannels = trans->GetTransChannel();
+    std::set usedChannelNames = trans->GetUsedTransChannel()[channel];
+
+    vector tensors;
+    bool isNeedResend = true;
+    string sendName;
+    for (const string& transName : usedChannelNames) {
+        if (transName == TransferChannel2Str(TransferChannel::SAVE_D2H) ||
+            transName == TransferChannel2Str(TransferChannel::SAVE_H2D)) {
+            // do nothing on the save channel; it is independent of the train, eval and predict channels
+            continue;
+        }
+
+        if (transName == TransferChannel2Str(TransferChannel::SWAP) ||
+            transName == TransferChannel2Str(TransferChannel::H2D)) {
+            sendName = StringFormat("%s_%s_all", embName.c_str(), transName.c_str());
+            if (channel == EVAL_CHANNEL_ID && !sendAllChannel) {
+                LOG_INFO("skip send eos for shared channel:{}, channel id:{}", sendName, channel);
+                LOG_INFO("check if train ProcessEmbInfo runs and let it decide eos or not");
+                continue;
+            }
+        } else {
+            sendName = StringFormat("%s_%s_%d", embName.c_str(), transName.c_str(), channel);
+        }
+
+        size_t channelSize = 0;
+        acltdtQueryChannelSize(transChannels[sendName], &channelSize);
+        LOG_INFO("[EOS] Before send eos, channel:{}, size:{}.", sendName, channelSize);
+        SendTensorsByAcl(transChannels[sendName], ACL_TENSOR_DATA_END_OF_SEQUENCE, tensors, isNeedResend);
+        acltdtQueryChannelSize(transChannels[sendName], &channelSize);
+        LOG_INFO("[EOS] After send eos, channel:{}, size:{}.", sendName, channelSize);
+    }
+#endif
+}
diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h
index 8bd7b8d0..589fc2a5 100644
--- a/src/core/key_process/key_process.h
+++ b/src/core/key_process/key_process.h
@@ -83,9 +83,11 @@ namespace MxRec {
         bool Initialize(const RankInfo& rInfo, const vector& eInfos,
             const vector& thresholdValues = {}, int seed = 0);

-        unique_ptr> GetInfoVec(int batch, const string& embName, int channel, ProcessedInfo type);
+        unique_ptr> GetInfoVec(const EmbBaseInfo& info, ProcessedInfo type, bool &isEos);

-        KeysT GetLookupKeys(int batch, const string& embName, int channel);
+        vector GetUniqueKeys(const EmbBaseInfo &info, bool &isEos, map &lookUpSwapInAddrsPushId);
+
+        vector GetRestoreVecSec(const EmbBaseInfo& info);

         int GetMaxStep(int channelId) const;
@@ -109,9 +111,9 @@
         void LoadSaveUnlock();

-        void EvictKeys(const string& embName, const vector& keys);
+        void EvictKeys(const string& embName, const vector& keys);

-        void EvictKeysCombine(const vector& keys);
+        void EvictKeysCombine(const vector& keys);

         void SetupHotEmbUpdateStep();
@@ -157,7 +159,7 @@
         void SetEos(int status, int channelId);

-        void SendEos(int batchId, int channel);
+        void SendEos(const string& embName, int batchId, int channel, bool sendAllChannel);

         bool isRunning { false };
@@ -167,12 +169,13 @@
         {
             return embInfos.find(embName) != embInfos.end();
         };
+
     GTEST_PRIVATE:
         int Start();

         template
-        T GetInfo(info_list_t& list, int batch, const string& embName, int channel);
+        T GetInfo(info_list_t& list, const EmbBaseInfo &info);

         RankInfo rankInfo;
         map embInfos;
@@ -181,6 +184,8 @@
         vector> procThreads {};
         std::mutex loadSaveMut[MAX_CHANNEL_NUM][MAX_KEY_PROCESS_THREAD] {};
         info_list_t lookupKeysList;
+        info_list_t uniqueKeysList;
+        info_list_t restoreVecSecList;
         list>> storage;
         info_list_t infoList;
         info_list_t all2AllList;
@@ -195,7 +200,13 @@
         ock::ctr::FactoryPtr factory {};
         int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT;
         bool isWithFAAE;
-        bool isNeedSendEos[2] = { 0, 0 };  // eos status of channels 0 and 1 respectively
+
+        // for end-of-sequence case
+        bool isNeedSendEos[2] = {false, false};  // eos status of channels 0 and 1, shared by all tables
+        atomic readySendEosCnt[2];
+        atomic finishSendEosCnt[2];
+        const double timeoutGetUniqueKeys = 10.0;  // trigger EOS if no data arrives before this timeout
+        const double timeoutGetUniqueKeysEmpty = 1.0;  // log a message if no data arrives before this timeout

         void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo);
@@ -262,7 +273,10 @@
         void HandleHotAndSendCount(const unique_ptr &batch, UniqueInfo& uniqueInfoOut,
             KeySendInfo& keySendInfo, vector& sc, vector& splitSize);

-        void PushResult(unique_ptr& batch, unique_ptr> tensors, KeysT& lookupKeys);
+        void PushResultHBM(unique_ptr& batch, unique_ptr> tensors);
+
+        void PushResultDDR(unique_ptr& batch, unique_ptr> tensors,
+            std::vector& uniqueKeys, std::vector& restoreVecSec);

         void PushGlobalUniqueTensors(const unique_ptr>& tensors, KeysT& lookupKeys, int channel);
@@ -290,6 +304,15 @@
         }

         string DumpSplitKeys(vector>& splitKeys) const;
+
+        bool IsGetInfoVecEos(int batch, const string& embName, int channel);
+
+        bool IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::system_clock::time_point& startTime,
+            map& lookUpSwapInAddrsPushId);
+
+        void SendEosTensor(const std::string& embName, int channel, bool sendAllChannel);
+
+        std::thread StartEosMonitorThread(const EmbBaseInfo& info, bool& cancelMonitor);
     };

 #define KEY_PROCESS_INSTANCE Singleton::GetInstance()
diff --git a/src/core/ock_ctr_common/include/embedding_cache.h b/src/core/ock_ctr_common/include/embedding_cache.h
new file mode 100644
index 00000000..f3bc9e23
--- /dev/null
+++ b/src/core/ock_ctr_common/include/embedding_cache.h
@@ -0,0 +1,321 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+ limitations under the License.
+==============================================================================*/
+
+#ifndef EMBEDDING_CACHE_H
+#define EMBEDDING_CACHE_H
+
+#include
+#include
+#include
+#include
+
+namespace EmbCache {
+using KeyOffsetPair = std::pair, std::vector>;
+
+class Initializer {
+public:
+    Initializer() = default;
+    virtual ~Initializer() = default;
+
+    /* *
+     * Generate random data.
+     * @Param emb start address of the embedding
+     * @Param embSize length of the embedding
+     */
+    virtual void GenerateData(float* emb, int embSize) = 0;
+    uint32_t start = 0;     // start position
+    uint32_t len = 0;       // length to initialize
+    float initParam = 1.0;  // every value produced by the initializer is multiplied by initParam
+};
+
+enum class InitializerType {
+    INVALID,
+    CONSTANT,
+    TRUNCATED_NORMAL,
+    RANDOM_NORMAL
+};
+
+struct ConstantInitializerInfo {
+    ConstantInitializerInfo() = default;
+
+    ConstantInitializerInfo(float constantValue, float initK);
+
+    float constantValue = 0;  // constant value
+    float initK = 1.0;        // initialized values are multiplied by initK
+};
+
+struct NormalInitializerInfo {
+    NormalInitializerInfo() = default;
+
+    NormalInitializerInfo(float mean, float stddev, uint32_t seed, float initK);
+
+    float mean = 0;      // mean
+    float stddev = 0;    // standard deviation
+    uint32_t seed = 0;   // random seed
+    float initK = 1.0;   // initialized values are multiplied by initK
+};
+
+class ConstantInitializer : public Initializer {
+public:
+    ConstantInitializer() = default;
+
+    ConstantInitializer(uint32_t start, uint32_t len, float value, float initK);
+
+    ~ConstantInitializer() override = default;
+
+    void GenerateData(float* emb, int embSize) override;
+
+    uint32_t start = 0;       // start position
+    uint32_t len = 0;         // length to initialize
+    float constantValue = 0;  // constant value
+};
+
+class RandomNormalInitializer : public Initializer {
+public:
+    RandomNormalInitializer() = default;
+    RandomNormalInitializer(uint32_t start, uint32_t len, NormalInitializerInfo& initInfo);
+
+    ~RandomNormalInitializer() override = default;
+
+    void GenerateData(float* emb, int embSize) override;
+
+    uint32_t start = 0;   // start position
+    uint32_t len = 0;     // length to initialize
+    float mean = 0;       // mean
+    float stddev = 0;     // standard deviation
+    uint32_t seed = 0;    // random seed
+
+    std::default_random_engine generator;   // random number engine
+    std::normal_distribution distribution;  // normal distribution
+};
+
+class TruncatedNormalInitializer : public Initializer {
+public:
+    TruncatedNormalInitializer() = default;
+
+    TruncatedNormalInitializer(uint32_t start, uint32_t len, NormalInitializerInfo& initInfo);
+
+    ~TruncatedNormalInitializer() override = default;
+
+    void GenerateData(float* emb, int embSize) override;
+
+    int boundNum = 2;
+
+    uint32_t start = 0;   // start position
+    uint32_t len = 0;     // length to initialize
+    float mean = 0;       // mean
+    float stddev = 0;     // standard deviation
+    uint32_t seed = 0;    // random seed
+
+    std::default_random_engine generator;   // random number engine
+    std::normal_distribution distribution;
+    float minBound = 0;   // lower bound
+    float maxBound = 0;   // upper bound
+};
+
+struct InitializerInfo {
+    InitializerInfo() = default;
+
+    InitializerInfo(std::string& name, uint32_t start, uint32_t len, ConstantInitializerInfo constantInitializerInfo);
+
+    InitializerInfo(std::string& name, uint32_t start, uint32_t len, NormalInitializerInfo normalInitializerInfo);
+
+    std::string name = "";  // initializer name
+    uint32_t start = 0;     // start position of initialization
+    uint32_t len = 0;       // length to initialize
+    InitializerType initializerType = InitializerType::INVALID;
+
+    ConstantInitializerInfo constantInitializerInfo;
+    NormalInitializerInfo normalInitializerInfo;
+
+    std::shared_ptr initializer;
+};
+
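
The header above only declares the initializers, so the sampling strategy is not fixed by this patch. One plausible reading of TruncatedNormalInitializer with boundNum = 2 is rejection sampling inside [mean - 2*stddev, mean + 2*stddev], scaled by initParam; the sketch below shows that assumption as a free function and should not be taken as the library's actual implementation:

    #include <cstdio>
    #include <random>

    // Redraw until the sample falls inside the two-sigma band, then scale.
    void GenerateTruncatedNormal(float* emb, int embSize, float mean, float stddev,
                                 unsigned seed, float initParam)
    {
        std::default_random_engine generator(seed);
        std::normal_distribution<float> distribution(mean, stddev);
        const float minBound = mean - 2 * stddev;
        const float maxBound = mean + 2 * stddev;
        for (int i = 0; i < embSize; ++i) {
            float v = distribution(generator);
            while (v < minBound || v > maxBound) {
                v = distribution(generator);   // reject out-of-bound samples
            }
            emb[i] = v * initParam;
        }
    }

    int main()
    {
        float emb[8];
        GenerateTruncatedNormal(emb, 8, 0.0f, 1.0f, 42u, 1.0f);
        for (float v : emb) { std::printf("%.3f ", v); }
        std::printf("\n");
        return 0;
    }
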
+struct EmbCacheInfo {
+    EmbCacheInfo(std::string tableName, uint32_t vocabSize, uint32_t embeddingSize, uint32_t extEmbeddingSize,
+        uint32_t maxCacheSize)
+        : tableName(tableName),
+          vocabSize(vocabSize),
+          embeddingSize(embeddingSize),
+          extEmbeddingSize(extEmbeddingSize),
+          maxCacheSize(maxCacheSize)
+    {
+    }
+    std::string tableName = "";
+    uint32_t vocabSize = 0;         // host-side capacity (how many embeddings it can hold)
+    uint32_t embeddingSize = 0;
+    uint32_t extEmbeddingSize = 0;  // embedding length including the embedding and the optimizer state
+    uint32_t maxCacheSize = 0;      // device-side capacity (how many embeddings it can hold)
+};
+
+class EmbCacheManager {
+public:
+    virtual ~EmbCacheManager() = default;
+
+    /* *
+     * Initialize the table matching the current embInfo in the cache manager.
+     * @Param EmbCacheInfo: initialization info of the embedding cache
+     * @Param std::vector: initializer info
+     * @Param uint64_t prefillBufferSize: guaranteed available size of the emb memory pool
+     * @Param uint32_t refillThreadNum: number of threads that automatically refill the emb memory pool
+     * @Return errorCode
+     */
+    virtual int CreateCacheForTable(const EmbCacheInfo& embCacheInfo,
+        const std::vector& initializerInfos, int64_t invalidKey = -1,
+        uint64_t prefillBufferSize = 500000, uint32_t refillThreadNum = 1) = 0;
+
+    /* *
+     * Look up the offsets of the given keys, inserting keys not yet present in the offsetMapper and
+     * assigning them offsets; when the offsetMapper runs out of space, release keys that can be swapped
+     * out, and return the key/offset pairs that must be swapped in and out for this batch.
+     * @Param tableName: table name
+     * @Param keys: all unique keys of the current batch
+     * @Param swapInKoPair: output, key-offset pairs to swap in
+     * @Param swapOutKoPair: output, key-offset pairs to swap out
+     * @Return errorCode
+     */
+    virtual int GetSwapPairsAndKey2Offset(std::string tableName, std::vector& keys,
+        KeyOffsetPair& swapInKoPair, KeyOffsetPair& swapOutKoPair) = 0;
+
+    /* *
+     * Look up embeddings.
+     * @Param tableName: table name
+     * @Param keys: keys to look up
+     * @Param embAddr: start address of the buffer allocated for the embeddings
+     * @Param threadNum: number of threads
+     * @Return errorCode
+     */
+    virtual int EmbeddingLookup(std::string tableName, const std::vector& keys, float* embAddr,
+        uint32_t threadNum = 4) = 0;
+
+    /* *
+     * Look up the addresses of embeddings.
+     * @Param tableName: table name
+     * @Param keys: keys to look up
+     * @Param addrs: per-key start addresses of the buffers holding the embeddings
+     * @Param threadNum: number of threads
+     * @Return errorCode
+     */
+    virtual int EmbeddingLookupAddrs(std::string tableName, const std::vector& keys,
+        std::vector& addrs, uint32_t threadNum = 4) = 0;
+
+    /* *
+     * Look up embeddings and delete the corresponding keys afterwards. With multiple threads, the caller
+     * must strictly guarantee that keys do not repeat across threads (unique keys); otherwise the result
+     * is undefined.
+     * @Param tableName: table name
+     * @Param keys: keys to look up
+     * @Param embAddr: start address of the buffer allocated for the embeddings
+     * @Param threadNum: number of threads
+     * @Return errorCode
+     */
+    virtual int EmbeddingLookupAndRemove(std::string tableName, const std::vector& keys, float* embAddr,
+        uint32_t threadNum = 4) = 0;
+
+    /* *
+     * Update embeddings.
+     * @Param tableName: table name
+     * @Param keys: keys to update, used to resolve each key's storage address in DDR
+     * @Param embAddr: start address of the embeddings to write back to DDR
+     * @Param threadNum: number of threads
+     * @Return errorCode
+     */
+    virtual int EmbeddingUpdate(std::string tableName, const std::vector& keys, float* embAddr,
+        uint32_t threadNum = 4) = 0;
+
+    /* *
+     * Remove keys from the EmbLocalTable and mark the memory holding their embeddings as reusable.
+     * @Param tableName: table name
+     * @Param keys: keys to remove
+     * @Return errorCode
+     */
+    virtual int EmbeddingRemove(std::string tableName, const std::vector& keys, uint32_t threadNum = 4) = 0;
+
+    /* *
+     * Remove keys to be evicted from the offsetMapper records and from the EmbLocalTable, and mark the
+     * memory holding their embeddings as reusable.
+     * @Param tableName: table name
+     * @Param keys: keys to evict
+     * @Return errorCode
+     */
+    virtual int RemoveEmbsByKeys(std::string tableName, const std::vector& keys) = 0;
+
+    /* *
+     * Get all table names.
+     * @Param allTableNames: output, receives all table names
+     * @Return errorCode
+     */
+    virtual int GetEmbTableNames(std::vector& allTableNames) = 0;
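
The lookup/update methods only make sense in combination with GetSwapPairsAndKey2Offset, which decides residency. The toy model below is not the real implementation (which also tracks frequencies, optimizer state and a memory pool); it only illustrates the intended contract: resident keys keep their offsets, new keys get free or evicted offsets, and the swap-in/swap-out pairs report exactly what moved:

    #include <cassert>
    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    struct ToyCache {
        uint32_t capacity;
        std::unordered_map<int64_t, uint32_t> key2Offset;

        // Assumes at least one resident key is outside the current batch when full.
        void GetSwapPairs(const std::vector<int64_t>& batch,
                          std::vector<int64_t>& swapInKeys, std::vector<uint32_t>& swapInPos,
                          std::vector<int64_t>& swapOutKeys, std::vector<uint32_t>& swapOutPos)
        {
            std::unordered_map<int64_t, bool> inBatch;
            for (int64_t k : batch) { inBatch[k] = true; }
            for (int64_t k : batch) {
                if (key2Offset.count(k) != 0) { continue; }        // already resident
                uint32_t offset;
                if (key2Offset.size() < capacity) {
                    offset = static_cast<uint32_t>(key2Offset.size());
                } else {
                    auto victim = key2Offset.begin();              // arbitrary victim
                    while (inBatch.count(victim->first) != 0) { ++victim; }
                    offset = victim->second;
                    swapOutKeys.push_back(victim->first);
                    swapOutPos.push_back(offset);
                    key2Offset.erase(victim);
                }
                key2Offset[k] = offset;
                swapInKeys.push_back(k);
                swapInPos.push_back(offset);
            }
        }
    };

    int main()
    {
        ToyCache cache { 2, {} };
        std::vector<int64_t> si, so;
        std::vector<uint32_t> sip, sop;
        cache.GetSwapPairs({ 1, 2 }, si, sip, so, sop);
        assert(si.size() == 2 && so.empty());
        si.clear(); sip.clear();
        cache.GetSwapPairs({ 3 }, si, sip, so, sop);               // evicts 1 or 2
        assert(si.size() == 1 && so.size() == 1 && sip[0] == sop[0]);
        return 0;
    }
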
+
+    /* *
+     * Get all key/offset pairs currently recorded in the offsetMapper, ordered by ascending offset value.
+     * @Param tableName: table name
+     * @Param koVec: output parameter
+     * @Return errorCode
+     */
+    virtual int ExportDeviceKeyOffsetPairs(std::string tableName,
+        std::vector>& koVec) = 0;
+
+    /* *
+     * Get the serialized state of the given table.
+     * @Param tableName: table to serialize
+     * @Param buffer: output, receives the serialized state
+     * @Return errorCode
+     */
+    virtual int Serialize(std::string tableName, std::vector& buffer) = 0;
+
+    /* *
+     * Deserialize the given table from its serialized state.
+     * @Param tableName: table to deserialize
+     * @Param buffer: input, the buffer content to deserialize
+     * @Return errorCode
+     */
+    virtual int Deserialize(std::string tableName, const std::vector& buffer) = 0;
+
+    /* *
+     * Destroy all embCaches and release their memory.
+     */
+    virtual void Destroy() = 0;
+
+    /* *
+     * Query the usage of a table.
+     * @Param tableName: table to query
+     * @Return current usage of the table
+     */
+    virtual uint32_t GetUsage(const std::string& tableName) = 0;
+
+    /* *
+     * Get all keys currently stored on the host side together with their embeddings and optimizer state.
+     * @Param tableName: table to query
+     * @Param keys: in/out, pass an empty vector; receives all stored keys
+     * @Param embeddings: in/out, pass an empty vector; receives all stored embeddings
+     * @Param optimizerSlots: in/out, pass an empty vector; receives all stored optimizerSlots
+     * @Return errorCode
+     */
+    virtual int GetEmbTableInfos(std::string tableName, std::vector& keys,
+        std::vector>& embeddings,
+        std::vector>& optimizerSlots) = 0;
+
+    /* *
+     * Load the LocalEmbeddingTable from the given keys and their embeddings and optimizer state.
+     * @Param tableName: table to load
+     * @Param keys: input, all keys to load
+     * @Param embeddings: input, all embeddings to load
+     * @Param optimizerSlots: input, all optimizerSlots to load
+     * @Return errorCode
+     */
+    virtual int LoadEmbTableInfos(std::string tableName, const std::vector& keys,
+        const std::vector>& embeddings,
+        const std::vector>& optimizerSlots) = 0;
+};
+}  // namespace EmbCache
+
+#endif  // EMBEDDING_CACHE_H
diff --git a/src/core/ock_ctr_common/include/factory.h b/src/core/ock_ctr_common/include/factory.h
index 44a2fce0..ce701abe 100644
--- a/src/core/ock_ctr_common/include/factory.h
+++ b/src/core/ock_ctr_common/include/factory.h
@@ -17,16 +17,17 @@ See the License for the specific language governing permissions and
 #define UNIQUE_OCK_CTR_COMMON_H

 #include
-#include
 #include
-#include "unique.h"
+#include
+#include "embedding_cache.h"
+#include "unique.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

-using ExternalLog = void (*)(int level, const char *msg);
+using ExternalLog = void (*)(int level, const char* msg);

 #ifdef __cplusplus
 }
@@ -40,26 +41,28 @@
 class Factory;
 using FactoryPtr = std::shared_ptr;
 using UniquePtr = std::shared_ptr;
+using EmbCacheManagerPtr = std::shared_ptr;

 class Factory {
 public:
     virtual ~Factory() = default;
-    virtual int CreateUnique(UniquePtr &out) = 0;
+    virtual int CreateUnique(UniquePtr& out) = 0;
+    virtual int CreateEmbCacheManager(EmbCacheManagerPtr& out) = 0;
     virtual int SetExternalLogFuncInner(ExternalLog logFunc) = 0;

 public:
-    static int Create(FactoryPtr &out)
+    static int Create(FactoryPtr& out)
     {
         int result = 0;
         uintptr_t factory = 0;

         /* dynamic load function */
-        if ((result = OckCtrCommonDef::CreatFactory(&factory)) == 0) {
-            out.reset(reinterpret_cast(factory));
+        if ((result = OckCtrCommonDef::CreateFactory(&factory)) == 0) {
+            out.reset(reinterpret_cast(factory));
         }
         return result;
     }
 };
-}
-}
+}  // namespace ctr
+}  // namespace ock

-#endif  // UNIQUE_OCK_CTR_COMMON_H
+#endif  // UNIQUE_OCK_CTR_COMMON_H
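
For reference, this is how the refreshed factory is meant to be consumed. The error handling and messages are illustrative, and lib_ock_ctr_common.so must be loadable at runtime for Create to succeed; only names that appear in the diff above are used:

    #include <iostream>
    #include "factory.h"

    int main()
    {
        ock::ctr::FactoryPtr factory;
        if (ock::ctr::Factory::Create(factory) != 0) {     // dlopen-loads the library on first use
            std::cerr << "lib_ock_ctr_common.so could not be loaded\n";
            return 1;
        }
        ock::ctr::EmbCacheManagerPtr cache;
        if (factory->CreateEmbCacheManager(cache) != 0) {  // new entry point added by this patch
            std::cerr << "CreateEmbCacheManager failed\n";
            return 1;
        }
        // cache->CreateCacheForTable(...) and the per-batch calls would follow here.
        return 0;
    }
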
diff --git a/src/core/ock_ctr_common/include/ock_ctr_common_def.h b/src/core/ock_ctr_common/include/ock_ctr_common_def.h
index e8b3f0b5..537d7a39 100644
--- a/src/core/ock_ctr_common/include/ock_ctr_common_def.h
+++ b/src/core/ock_ctr_common/include/ock_ctr_common_def.h
@@ -20,15 +20,15 @@ See the License for the specific language governing permissions and
 #include
 #include

-using CTR_CREATE_FACTORY_FUNCTION = int (*)(uintptr_t *);
+using CTR_CREATE_FACTORY_FUNCTION = int (*)(uintptr_t*);

 namespace ock {
 namespace ctr {
 class OckCtrCommonDef {
 public:
-    static int CreatFactory(uintptr_t *factory)
+    static int CreateFactory(uintptr_t* factory)
     {
-        static void *handle = nullptr;
+        static void* handle = nullptr;
         static std::mutex m;
         std::unique_lock lock(m);
         if (handle != nullptr) {
@@ -38,8 +38,8 @@
         handle = dlopen(LIBRARY_NAME, RTLD_NOW);

         if (handle == nullptr) {
-            std::cout << "Failed to call dlopen to load library '" << LIBRARY_NAME << "', error " << dlerror() <<
-                std::endl;
+            std::cout << "Failed to call dlopen to load library '" << LIBRARY_NAME << "', error " << dlerror()
+                << std::endl;
             return -1;
         }
@@ -55,9 +55,9 @@
     }

 private:
-    constexpr static const char *LIBRARY_NAME = "lib_ock_ctr_common.so";
+    constexpr static const char* LIBRARY_NAME = "lib_ock_ctr_common.so";
 };
-}
-}
+}  // namespace ctr
+}  // namespace ock

-#endif  // OCK_OCK_CTR_COMMON_DEF_H
+#endif  // OCK_OCK_CTR_COMMON_DEF_H
diff --git a/src/core/ock_ctr_common/include/unique.h b/src/core/ock_ctr_common/include/unique.h
index cb8960e7..5d11fe66 100644
--- a/src/core/ock_ctr_common/include/unique.h
+++ b/src/core/ock_ctr_common/include/unique.h
@@ -59,6 +59,7 @@
 using UniqueConf = struct UniqueConfCTR {
     uint32_t maxThreadNum = 8;  // maximum number of worker threads
     int64_t maxIdVal = 0;       // maximum id value
     bool trace = false;         // whether to enable performance tracing; needs an external log output
+    bool performance = false;   // whether to enable the enhanced interface; it requires shardingNum to be a power of two, the default buckets by modulo
 } __attribute__((packed));

 using UniqueIn = struct UniqueInCTR {
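
The power-of-two restriction on shardingNum exists because modulo bucketing can then be replaced by a bitwise AND, which is cheaper in a hot unique/dispatch loop. A small sketch of the equivalence; the library's actual bucketing code is not part of this patch:

    #include <cassert>
    #include <cstdint>

    inline uint32_t BucketModulo(uint64_t key, uint32_t shardingNum)
    {
        return static_cast<uint32_t>(key % shardingNum);
    }

    inline uint32_t BucketPow2(uint64_t key, uint32_t shardingNum)
    {
        return static_cast<uint32_t>(key & (shardingNum - 1));   // valid only when shardingNum is 2^k
    }

    int main()
    {
        for (uint64_t key = 0; key < 1000; ++key) {
            assert(BucketModulo(key, 8) == BucketPow2(key, 8));  // identical results for 2^k buckets
        }
        return 0;
    }
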
int channelId) -{ - vector keys; // 去重和删除无效key - HandleRepeatAndInvalidKey(originalKeys, keys); - // 区分HBM+DDR内key,和HBM+DDR外的key(新key或保存在SSD中的key) - vector externalKeys; - vector internalKeys; - GetExternalKeys(table.keyOffsetMap, externalKeys, internalKeys, keys); - if (externalKeys.empty()) { return TransferRet::TRANSFER_OK; } - - // 判断剩余内存空间是否足够; 可用内存空间计算:HBM+DDR-已占用; 若是训练,再加DDR已淘汰; - // SSD仅与DDR交互,不考虑HBM淘汰位置;由于maxOffset比实际使用大1,所以虽然从0开始也不用再减1 - size_t ddrAvailableSize = table.devVocabSize + table.hostVocabSize - table.maxOffset; - if (channelId == TRAIN_CHANNEL_ID) { - ddrAvailableSize += table.evictHostPos.size(); - } - LOG_DEBUG("TransferDDREmbWithSSD, table:{}, maxOffset:{}, evictHostPos size:{}, ddrAvailableSize:{}", - table.name, table.maxOffset, table.evictHostPos.size(), ddrAvailableSize); - CreateSSDTableIfNotExist(table.name); - - // 调用ssdEngine查询当前批次key中保存在SSD中的key - vector externalSSDKeys; - GetSSDKeys(table.name, externalKeys, externalSSDKeys); - // 后续判断maxOffset是否超出范围时,maxOffset=devVocabSize+hostVocabSize时可用,此处包含等于 - bool isDDRSpaceEnough = ddrAvailableSize >= externalKeys.size(); - bool ddrSpaceEnoughOrEval = channelId != TRAIN_CHANNEL_ID || isDDRSpaceEnough; - if (ddrSpaceEnoughOrEval && externalSSDKeys.empty()) { - // 部分场景后续不用处理,在此处返回 - return TransferRet::TRANSFER_OK; - } - - AddDebugAndTraceLog(keys.size(), externalKeys, externalSSDKeys); - /* - * 前面 externalSSDKeys = 0 ,评估场景的 ddr空间可用、不可用已返回; 训练的可用已返回; - * 剩下的情况如下: - * 评估: - * externalSSDKeys > 0, 可用 & 不可用操作一样; - * 可选:Ddr->ssd, 腾出 externalSSDKeys 大小空间; - * Ssd->ddr, 需要移动 externalSSDKeys ; - * externalSSDKeys = 0 --已返回 - * 训练: - * externalSSDKeys > 0 - * 可用: - * 可选:Ddr->ssd, 腾出 externalSSDKeys 大小空间; - * Ssd->ddr, 需要移动 externalSSDKeys ; - * 不可用: - * 必选:Ddr->ssd, 腾出 externalKeys 大小空间; - * 需要计算ssd剩余空间:externalKeys - externalSSDKeys - * (注: 当前策略均转移externalKeys) - * Ssd->ddr, 需要移动 externalSSDKeys ; - * externalSSDKeys = 0 - * 可用: --已返回 - * 不可用: - * Ddr->ssd, 腾出 externalKeys 大小的空间; - * 需要计算ssd剩余空间: externalKeys - * 因cache每次只转移DDR最小空间,上述可选动作也需执行,避免SSD移入DDR时空间不足 - */ - // 训练场景检查SSD剩余空间 评估不考虑新key - if (channelId == TRAIN_CHANNEL_ID) { - size_t needSSDSize = externalKeys.size() - externalSSDKeys.size() - ddrAvailableSize; - const int64_t ssdAvailableSize = ssdEngine->GetTableAvailableSpace(table.name); - if (int64_t(needSSDSize) > ssdAvailableSize) { - LOG_ERROR("TransferDDREmbWithSSD: ssd available space is not enough to transfer DDR emb data. " - "needSSDSize:{}, ssdAvailableSize:{}", needSSDSize, ssdAvailableSize); - return TransferRet::SSD_SPACE_NOT_ENOUGH; - } - } - - // 从SSD获取emb数据并从SSD删除; 避免DDR->SSD时空间不够 - vector> ssdEmbData; - if (!externalSSDKeys.empty()) { - ssdEmbData = ssdEngine->FetchEmbeddings(table.name, externalSSDKeys); - ssdEngine->DeleteEmbeddings(table.name, externalSSDKeys); - } - - // 从ddr转移到ssd的key个数 - size_t ddrSwapOutSizeTmp = ddrSpaceEnoughOrEval ? 
externalSSDKeys.size() : externalKeys.size(); - auto ddrSwapOutSize = static_cast(ddrSwapOutSizeTmp - ddrAvailableSize); - LOG_DEBUG("TransferDDREmbWithSSD: ddrSwapOutSize:{}", ddrSwapOutSize); - - /* - * 转移DDR中数据到SSD - */ - // 记录要从DDR转移到SSD的key对应的offset(相对值,需减去devVocabSize) - vector ddrTransferPos; - TransferRet ddr2SsdRet = TransferDDREmb2SSD(table, ddrSwapOutSize, internalKeys, ddrTransferPos); - if (ddr2SsdRet == TransferRet::DDR_SPACE_NOT_ENOUGH) { - ssdEngine->InsertEmbeddings(table.name, externalSSDKeys, ssdEmbData); - return ddr2SsdRet; - } - - HandleDDRTransferPos(ddrTransferPos, externalSSDKeys, table); - - /* - * 转移SSD中保存的当前批次key的emb数据到DDR - */ - return TransferSSDEmb2DDR(table, externalSSDKeys, ddrTransferPos, ssdEmbData); -} - -/// SSD数据转移到DDR中后刷新映射和频次信息 -/// \param embTableName emb表名 -/// \param embHashMap emb hash表 -/// \param externalSSDKeys 存储在SSD中的key列表 -/// \param ddrTransferPos -void CacheManager::RefreshRelateInfoWithSSD2DDR(TableInfo& table, - vector& externalSSDKeys, vector& ddrTransferPos) -{ - for (size_t i = 0; i < externalSSDKeys.size(); ++i) { - // 映射关系 ddrTransferPos是在ddrEmbHash中的位置,记录映射时需加上devVocabSize - auto& key = externalSSDKeys[i]; - table.keyOffsetMap[key] = ddrTransferPos[i] + table.devVocabSize; - // 频次 - ddrKeyFreqMap[table.name].PutWithInit(key, excludeDDRKeyCountMap[table.name][key]); - excludeDDRKeyCountMap[table.name].erase(key); - } -} - -void CacheManager::GetDDREmbInfo(vector& keys, TableInfo& table, - vector& ddrTransferPos, vector>& ddrEmbData) const -{ - // 根据offset 获取对应Emb数据 - for (auto& key : keys) { - auto koCast = static_cast(table.keyOffsetMap[key]); - ddrTransferPos.emplace_back(koCast - table.devVocabSize); - } - - LOG_TRACE("DDR keys:{}", VectorToString(keys)); - LOG_TRACE("DDR key positions:{}", VectorToString(ddrTransferPos)); - - ddrEmbData.resize(keys.size()); - const auto& emb = hostEmbs->GetEmb(table.name); -#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) shared(ddrTransferPos, emb, ddrEmbData) - for (size_t i = 0; i < ddrTransferPos.size(); ++i) { - auto& missingKeyPo = ddrTransferPos[i]; - const auto& src = emb.embData[missingKeyPo]; - ddrEmbData[i] = src; - } -} - -/// 使用ssdEmbData更新DDR内emb数据 -/// \param embTableName emb表名 -/// \param ddrTransferPos 需要更新的DDR内的offset -/// \param ssdEmbData SSD对应的emb数据 -void CacheManager::UpdateDDREmbInfo(const std::string& embTableName, - vector& ddrTransferPos, - vector>& ssdEmbData) const -{ - auto& emb = hostEmbs->GetEmb(embTableName); - auto& embData = emb.embData; -#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) shared(ddrTransferPos, embData, ssdEmbData) - for (size_t i = 0; i < ddrTransferPos.size(); ++i) { - embData[ddrTransferPos[i]] = ssdEmbData[i]; - } -} - -/// DDR_2_SSD场景数据刷新: 仅刷新映射和频次,ddr转移出去的offset信息后续统一处理 -/// \param embTableName emb表名 -/// \param embHashMap emb map -/// \param ddrSwapOutKeys 从DDR中转移到SSD中key列表 -/// \param ddrSwapOutCounts 从DDR中转移到SSD中key频次数据 -void CacheManager::RefreshRelateInfoWithDDR2SSD(TableInfo& table, - vector& ddrSwapOutKeys, - vector& ddrSwapOutCounts) -{ - auto& excludeFreqMap = excludeDDRKeyCountMap[table.name]; - for (size_t i = 0; i < ddrSwapOutKeys.size(); ++i) { - auto& key = ddrSwapOutKeys[i]; - table.keyOffsetMap.erase(key); - excludeFreqMap[key] = ddrSwapOutCounts[i]; - } -} - -/// key从DDR移入、移出、HBM淘汰时刷新频次信息;仅刷新频次信息 -/// \param embTableName emb表名 -/// \param keys 操作的key集合 -/// \param type TransferType -void CacheManager::RefreshFreqInfoCommon(const string& embTableName, vector& keys, TransferType type) 
-{ - if (type == TransferType::DDR_2_HBM) { - for (auto& key : keys) { - // 频次数据记录到 excludeDDRKeyCountMap,并删除ddrKeyFreqMap中频次数据 - // 进入findOffset时记录的key次数 + ddr内key次数 - auto tmpCount = excludeDDRKeyCountMap[embTableName][key]; - excludeDDRKeyCountMap[embTableName][key] = ddrKeyFreqMap[embTableName].Get(key) + tmpCount; - ddrKeyFreqMap[embTableName].Pop(key); - } - } else if (type == TransferType::HBM_2_DDR) { - for (auto& key : keys) { - // excludeDDRKeyCountMap 中次数转移到 ddrKeyFreqMap, 并删除原记录 - ddrKeyFreqMap[embTableName].PutWithInit(key, excludeDDRKeyCountMap[embTableName][key]); - excludeDDRKeyCountMap[embTableName].erase(key); - } - } else if (type == TransferType::DDR_2_EVICT) { - for (auto& key : keys) { - ddrKeyFreqMap[embTableName].Pop(key); - } - } else { - // TransferType::HBM_2_EVICT - for (auto& key : keys) { - excludeDDRKeyCountMap[embTableName].erase(key); - } - } -} - -void CacheManager::Init(HostEmb* hostEmbPtr, vector& mgmtEmbInfo) -{ - this->hostEmbs = hostEmbPtr; + LOG_INFO("CacheManager Init method begin"); + this->embCache = std::move(embCachePtr); for (auto& emb : mgmtEmbInfo) { EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false}; embBaseInfos.emplace(emb.name, baseInfo); - ddrKeyFreqMap[emb.name]; - excludeDDRKeyCountMap[emb.name]; + preProcessMapper[emb.name].Initialize(emb.name, emb.hostVocabSize, emb.ssdVocabSize); } ssdEngine->Start(); - LOG_INFO("CacheManager Init method end."); + LOG_INFO("CacheManager Init method end"); } -bool CacheManager::IsKeyInSSD(const string& embTableName, emb_key_t key) +bool CacheManager::IsKeyInSSD(const string& embTableName, emb_cache_key_t key) { return ssdEngine->IsKeyExist(embTableName, key); } @@ -296,16 +46,35 @@ bool CacheManager::IsKeyInSSD(const string& embTableName, emb_key_t key) /// 淘汰SSD中Emb信息 /// \param embTableName emb表名 /// \param keys 淘汰key列表 -void CacheManager::EvictSSDEmbedding(const string& embTableName, vector& keys) +void CacheManager::EvictSSDEmbedding(const string& embTableName, const vector& keys) { if (keys.empty()) { return; } - // 1 删除缓存中记录的key的次数 2 删除SSD中保存的Emb数据 - for (auto& key : keys) { - excludeDDRKeyCountMap[embTableName].erase(key); + + int keyStep = preProcessStep; + unordered_map& ssdMap = preProcessMapper[embTableName].excludeDDRKeyCountMap; + LFUCache& ddrLfu = preProcessMapper[embTableName].lfuCache; + std::vector ssdKeysToBeDeleted; + // 1 删除缓存中记录的key的次数 + for (auto &key: keys) { + auto it = ssdMap.find(key); + if (it != ssdMap.end()) { + ssdMap.erase(it); + ssdKeysToBeDeleted.emplace_back(key); + } else { + ddrLfu.Pop(key); + } } - ssdEngine->DeleteEmbeddings(embTableName, keys); + + ssdEvictThreads.emplace_back([=]() mutable { + // 2 删除SSD中保存的Emb数据 + std::unique_lock lk(evictWaitMut); + evictWaitCond.wait(lk, [keyStep, this] { + return embeddingTaskStep == keyStep; + }); + ssdEngine->DeleteEmbeddings(embTableName, ssdKeysToBeDeleted); + }); } /// 放入key,新增/更新(次数+1)次数 @@ -324,116 +93,6 @@ void CacheManager::PutKey(const string& embTableName, const emb_key_t& key, Reco hashMap[key] = count; } -/// DDR->SSD与SSD->DDR的key个数可能不一致,手动补齐/截取 -/// \param ddrTransferPos DDR->SSD的offset列表(hostEmb表内的偏移值) -/// \param externalSSDKeys SSD->DDR的key列表 -/// \param embHashMap emb hash表 -void CacheManager::HandleDDRTransferPos(vector& ddrTransferPos, vector& externalSSDKeys, - TableInfo& table) -{ - if (ddrTransferPos.size() == externalSSDKeys.size()) { - return; - } - LOG_DEBUG("TransferDDREmbWithSSD: operate length is not equal, will padding or clipping, " - "ddrTransferPos size:{}, externalSSDKeys 
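EvictSSDEmbedding no longer deletes from SSD inline: the physical delete is handed to a worker thread that blocks until embeddingTaskStep has caught up with the step at which the evict was requested (FetchSSDEmb2DDR advances the counter and notifies). A reduced model of that handshake, with the members renamed for clarity; the patch compares steps with ==, and >= is used here only to keep the sketch robust:

    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <thread>
    #include <vector>

    // Reduced model of the ordering contract: the SSD delete for step N must
    // not run before the embedding task for step N has finished.
    struct StepGate {
        std::mutex mut;
        std::condition_variable cond;
        int doneStep = 0;                  // mirrors embeddingTaskStep
        std::vector<std::thread> workers;  // joined on shutdown, as in ~CacheManager

        void DeferUntil(int step, std::function<void()> work)
        {
            workers.emplace_back([this, step, work] {
                std::unique_lock<std::mutex> lk(mut);
                cond.wait(lk, [&] { return doneStep >= step; });
                work();  // e.g. the deferred ssdEngine->DeleteEmbeddings(...)
            });
        }

        void Advance()  // mirrors embeddingTaskStep++ plus notify_all
        {
            {
                std::lock_guard<std::mutex> lk(mut);
                ++doneStep;
            }
            cond.notify_all();
        }

        ~StepGate()
        {
            for (auto& t : workers) {
                t.join();
            }
        }
    };
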
size:{}", - ddrTransferPos.size(), externalSSDKeys.size()); - // ddrTransferPos中是DDR内偏移位置,存入evictPos时,需加上devVocabSize;取出时需减去 - if (ddrTransferPos.size() > externalSSDKeys.size()) { - while (ddrTransferPos.size() > externalSSDKeys.size()) { - auto evictHostPos = ddrTransferPos.back() + table.devVocabSize; - table.evictHostPos.emplace_back(static_cast(evictHostPos)); - ddrTransferPos.pop_back(); - } - return; - } - // 补齐offset - while (ddrTransferPos.size() < externalSSDKeys.size() && !table.evictHostPos.empty()) { - ddrTransferPos.emplace_back(table.evictHostPos.back() - table.devVocabSize); - table.evictHostPos.pop_back(); - } - auto allSize = table.devVocabSize + table.hostVocabSize; - // 还不够继续使用maxOffset - while (ddrTransferPos.size() < externalSSDKeys.size() && table.maxOffset < allSize) { - auto nextPos = table.maxOffset++; - ddrTransferPos.emplace_back(nextPos - table.devVocabSize); - } - LOG_DEBUG("HandleDDRTransferPos: handle end, pos len:{}, keys len:{}", - ddrTransferPos.size(), externalSSDKeys.size()); -} - -void CacheManager::GetSSDKeys(const std::string& embTableName, vector& externalKeys, - vector& externalSSDKeys) -{ - for (auto& key : externalKeys) { - if (ssdEngine->IsKeyExist(embTableName, key)) { - externalSSDKeys.emplace_back(key); - } - } -} - -TransferRet CacheManager::TransferDDREmb2SSD(TableInfo& table, - int64_t ddrSwapOutSize, - const vector& keys, vector& ddrTransferPos) -{ - if (ddrSwapOutSize <= 0) { - // 此时不需要转移数据 - return TransferRet::TRANSFER_OK; - } - - TimeCost ddr2SsdTc; - LOG_DEBUG("TransferDDREmbWithSSD: get ddr least freq keys, table:{}, ddrSwapOutSize:{}", - table.name, ddrSwapOutSize); - // 获取DDR中指定数量的最低频次key,并获取相应emb数据,执行DDR换出到SSD - vector ddrSwapOutKeys; - vector ddrSwapOutCounts; - ddrKeyFreqMap[table.name].GetAndDeleteLeastFreqKeyInfo(ddrSwapOutSize, keys, ddrSwapOutKeys, ddrSwapOutCounts); - if (static_cast(ddrSwapOutKeys.size()) != ddrSwapOutSize) { - auto keyTableSize = ddrKeyFreqMap[table.name].keyTable.size(); - // 获取的最低频次key数量和预期不一致,DDR空间不足,不能放置当前批次数据 - LOG_ERROR("TransferDDREmbWithSSD, table:{}, vector length is not equal, ddrSwapOutKeys size:{}, " - "ddrSwapOutSize:{}, ddr lfu keyTable size:{}", - table.name, ddrSwapOutKeys.size(), ddrSwapOutSize, keyTableSize); - RestoreLeastFreqInfo(table.name, ddrSwapOutKeys, ddrSwapOutCounts); - return TransferRet::DDR_SPACE_NOT_ENOUGH; - } - LOG_DEBUG("TransferDDREmbWithSSD: get DDR embeddings and save to SSD, table:{}, size:{}", - table.name, ddrSwapOutKeys.size()); - // 获取DDR中emb数据 - vector> ddrEmbData; - GetDDREmbInfo(ddrSwapOutKeys, table, ddrTransferPos, ddrEmbData); - // 调用SSDEngine接口,将DDR Emb数据保存到SSD - ssdEngine->InsertEmbeddings(table.name, ddrSwapOutKeys, ddrEmbData); - - // 初始化DDR内被转移出去的位置 - hostEmbs->EvictInitEmb(table.name, ddrTransferPos); - - // 更新记录的DDR中key频次信息 - RefreshRelateInfoWithDDR2SSD(table, ddrSwapOutKeys, ddrSwapOutCounts); - LOG_DEBUG("TransferDDREmbWithSSD: table:{}, ddr2SsdTc TimeCost(ms):{}", table.name, ddr2SsdTc.ElapsedMS()); - return TransferRet::TRANSFER_OK; -} - -TransferRet CacheManager::TransferSSDEmb2DDR(TableInfo& table, - vector& externalSSDKeys, vector& ddrTransferPos, - vector>& ssdEmbData) -{ - if (externalSSDKeys.empty()) { - return TransferRet::TRANSFER_OK; - } - TimeCost ssd2DdrTc; - LOG_DEBUG("TransferDDREmbWithSSD: get SSD embeddings and save to DDR, size:{}", externalSSDKeys.size()); - if (ddrTransferPos.size() != externalSSDKeys.size() || externalSSDKeys.size() != ssdEmbData.size()) { - LOG_ERROR("TransferDDREmbWithSSD, vector length is not equal, 
ddrTransferPos len:{}, externalSSDKeys len:{}, " - "ssdEmbData len:{}", ddrTransferPos.size(), externalSSDKeys.size(), ssdEmbData.size()); - return TransferRet::TRANSFER_ERROR; - } - // 将SSD emb存储到DDR中 刷新频次信息 - UpdateDDREmbInfo(table.name, ddrTransferPos, ssdEmbData); - RefreshRelateInfoWithSSD2DDR(table, externalSSDKeys, ddrTransferPos); - LOG_DEBUG("TransferDDREmbWithSSD: ssd2DdrTc TimeCost(ms):{}", ssd2DdrTc.ElapsedMS()); - return TransferRet::TRANSFER_OK; -} - void CacheManager::CreateSSDTableIfNotExist(const std::string& embTableName) { if (embBaseInfos[embTableName].isExist) { @@ -451,18 +110,11 @@ void CacheManager::CreateSSDTableIfNotExist(const std::string& embTableName) LOG_INFO("ssd table is exist, embTableName:" + embTableName); } -void CacheManager::RestoreLeastFreqInfo(const std::string& embTableName, vector& ddrSwapOutKeys, - vector& ddrSwapOutCounts) -{ - auto& lfuCache = ddrKeyFreqMap[embTableName]; - for (size_t i = 0; i < ddrSwapOutKeys.size(); ++i) { - lfuCache.PutWithInit(ddrSwapOutKeys[i], ddrSwapOutCounts[i]); - } -} - CacheManager::~CacheManager() { - hostEmbs = nullptr; + for (auto &t : ssdEvictThreads) { + t.join(); + } ssdEngine->Stop(); ddrKeyFreqMap.clear(); excludeDDRKeyCountMap.clear(); @@ -472,34 +124,9 @@ CacheManager::~CacheManager() /// \param ddrFreqInitMap ddr内key频次数据 /// \param excludeDdrFreqInitMap 非DDR key频次数据 /// \param step 加载SSDEngine传入步数 -void CacheManager::Load(unordered_map>& ddrFreqInitMap, - unordered_map>& excludeDdrFreqInitMap, - int step, int rankSize, int rankId) +void CacheManager::Load(const std::vector &mgmtEmbInfo, int step, + map>& trainKeySet) { - if (rankSize <= 0) { - throw runtime_error("rank size must > 0"); - } - // 加载CacheManager数据 - for (auto& it : ddrFreqInitMap) { - auto& embTableName = it.first; - auto& freqMap = it.second; - for (auto& freqIt : freqMap) { - if (freqIt.first % rankSize != rankId) { - continue; - } - ddrKeyFreqMap[embTableName].PutWithInit(freqIt.first, freqIt.second); - } - } - for (auto& it : excludeDdrFreqInitMap) { - auto& embTableName = it.first; - auto& freqMap = it.second; - for (auto& freqIt : freqMap) { - if (freqIt.first % rankSize != rankId) { - continue; - } - excludeDDRKeyCountMap[embTableName].emplace(freqIt.first, freqIt.second); - } - } // 加载SSDEngine数据 #ifndef GTEST for (auto& it : embBaseInfos) { @@ -507,6 +134,28 @@ void CacheManager::Load(unordered_mapLoad(embTableName, embBase.savePath, embBase.maxTableSize, step); } + auto tableKeysVec = ssdEngine->ExportTableKey(); + for (auto &it: tableKeysVec) { + auto &embTableName = it.first; + auto &keys = it.second; + for (auto key: keys) { + preProcessMapper[embTableName].excludeDDRKeyCountMap[key] = 1; + trainKeySet[embTableName].insert(key); + } + } + for (const auto &embInfo: mgmtEmbInfo) { + const std::string &tableName = embInfo.name; + std::vector buffer; + int rc = embCache->Serialize(tableName, buffer); + if (rc != 0) { + throw std::runtime_error("Serialize failed!"); + } + uint64_t memSize = sizeof(uint64_t) + embInfo.extEmbeddingSize * sizeof(float); + for (uint64_t i = 0; i < buffer.size(); i += memSize) { + uint64_t key = *reinterpret_cast(&buffer[i]); + preProcessMapper[tableName].lfuCache.Put(key); + } + } #endif } @@ -525,3 +174,114 @@ int64_t CacheManager::GetTableEmbeddingSize(const string& tableName) return ssdEngine->GetTableEmbeddingSize(tableName); } +void CacheManager::ProcessSwapOutKeys(const string& tableName, const vector& swapOutKeys, + SwapOutInfo& info) +{ + auto& swapOutDDRKeys = info.swapOutDDRKeys; + auto& 
swapOutDDRAddrOffs = info.swapOutDDRAddrOffs; + auto& swapOutSSDKeys = info.swapOutSSDKeys; + auto& swapOutSSDAddrOffs = info.swapOutSSDAddrOffs; + + // 处理一下没见过的key,看是更新到DDR还是SSD中 + auto& keyMapper = preProcessMapper[tableName]; + size_t availableDDRSize = keyMapper.DDRAvailableSize(); + for (size_t i = 0; i < swapOutKeys.size(); ++i) { + emb_cache_key_t key = swapOutKeys[i]; + if (keyMapper.IsDDRKeyExist(key)) { + keyMapper.lfuCache.Put(key); + swapOutDDRKeys.push_back(key); + swapOutDDRAddrOffs.push_back(i); + } else if (keyMapper.IsSSDKeyExist(key)) { + keyMapper.excludeDDRKeyCountMap[key]++; + swapOutSSDKeys.push_back(key); + swapOutSSDAddrOffs.push_back(i); + } else if (availableDDRSize > 0) { + keyMapper.InsertDDRKey(key); + swapOutDDRKeys.push_back(key); + swapOutDDRAddrOffs.push_back(i); + availableDDRSize--; + } else { + keyMapper.InsertSSDKey(key); + swapOutSSDKeys.push_back(key); + swapOutSSDAddrOffs.push_back(i); + } + } +} + +void CacheManager::ProcessSwapInKeys(const string& tableName, const vector& swapInKeys, + vector& DDRToSSDKeys, vector& SSDToDDRKeys) +{ + auto& keyMapper = preProcessMapper[tableName]; + size_t externalDDRSize = 0; + std::vector firstSeenKeys; + for (emb_cache_key_t key : swapInKeys) { + if (keyMapper.IsDDRKeyExist(key)) { + continue; + } + externalDDRSize++; + if (keyMapper.IsSSDKeyExist(key)) { + SSDToDDRKeys.push_back(key); + } else { + firstSeenKeys.push_back(key); + } + } + + auto ddrAvailableSize = keyMapper.DDRAvailableSize(); + if (externalDDRSize > ddrAvailableSize) { // 需要DDR--->SSD + size_t transNum = externalDDRSize - ddrAvailableSize; + + if (transNum > keyMapper.SSDAvailableSize()) { + throw invalid_argument("SSD table size too small, key quantity exceed while transferring DDR data to SSD"); + } + // DDR--->SSD + keyMapper.GetAndDeleteLeastFreqDDRKey2SSD(transNum, swapInKeys, DDRToSSDKeys); + } + + // SSD--->DDR + for (uint64_t key : SSDToDDRKeys) { + keyMapper.InsertDDRKey(key); + keyMapper.RemoveSSDKey(key); + } + for (uint64_t key : firstSeenKeys) { + keyMapper.InsertDDRKey(key); + } + preProcessStep++; +} + +void CacheManager::UpdateSSDEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize, + vector& keys, const vector& swapOutSSDddrOffs) +{ + vector embeddingsAddr(keys.size()); + for (uint64_t i = 0; i < swapOutSSDddrOffs.size(); i++) { + embeddingsAddr[i] = embPtr + swapOutSSDddrOffs[i] * extEmbeddingSize; + } + ssdEngine->InsertEmbeddingsByAddr(tableName, keys, embeddingsAddr, extEmbeddingSize); +} + +void CacheManager::TransferDDR2SSD(string tableName, uint32_t extEmbeddingSize, vector& keys, + vector& addrs) +{ + CreateSSDTableIfNotExist(tableName); + ssdEngine->InsertEmbeddingsByAddr(tableName, keys, addrs, extEmbeddingSize); + for (auto addr : addrs) { + free(addr); + addr = nullptr; + } +} + +void CacheManager::FetchSSDEmb2DDR(string tableName, uint32_t extEmbeddingSize, vector& keys, + const vector& addrs) +{ + auto embeddings = ssdEngine->FetchEmbeddings(tableName, keys); + for (uint64_t i = 0; i < embeddings.size(); i++) { + int rc = memcpy_s(addrs[i], extEmbeddingSize * sizeof(float), embeddings[i].data(), + extEmbeddingSize * sizeof(float)); + if (rc != 0) { + throw runtime_error("memcpy_s failed, rc: " + to_string(rc)); + } + } + ssdEngine->DeleteEmbeddings(tableName, keys); + + embeddingTaskStep++; + evictWaitCond.notify_all(); +} diff --git a/src/core/ssd_cache/cache_manager.h b/src/core/ssd_cache/cache_manager.h index e750626d..89ed61d7 100644 --- a/src/core/ssd_cache/cache_manager.h +++ 
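Taken together, ProcessSwapInKeys plans the moves on the key maps and the Transfer/Fetch calls then execute them on the embedding bytes. A hypothetical driver showing the intended call order for one batch, assuming the key element type is emb_cache_key_t, consistent with the function bodies:

    #include <string>
    #include <vector>
    #include "ssd_cache/cache_manager.h"  // include path assumed

    // Hypothetical driver for one batch: plan the moves on the key maps first,
    // then move the embedding bytes once host-side addresses are known.
    void PreprocessBatch(MxRec::CacheManager& mgr, const std::string& table,
                         const std::vector<MxRec::emb_cache_key_t>& batchKeys)
    {
        std::vector<MxRec::emb_cache_key_t> ddrToSsd;
        std::vector<MxRec::emb_cache_key_t> ssdToDdr;
        // Classify: which resident keys must leave DDR, which SSD keys come in.
        // Throws if the SSD table cannot absorb the displaced keys.
        mgr.ProcessSwapInKeys(table, batchKeys, ddrToSsd, ssdToDdr);
        // Byte movement would then follow, e.g.:
        //   mgr.TransferDDR2SSD(table, extEmbeddingSize, ddrToSsd, addrs);
        //   mgr.FetchSSDEmb2DDR(table, extEmbeddingSize, ssdToDdr, addrs);
    }
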
b/src/core/ssd_cache/cache_manager.h @@ -23,10 +23,11 @@ See the License for the specific language governing permissions and #include #include "hd_transfer/hd_transfer.h" -#include "host_emb/host_emb.h" #include "lfu_cache.h" #include "ssd_engine/ssd_engine.h" #include "utils/common.h" +#include "preprocess_mapper.h" +#include "ock_ctr_common/include/factory.h" namespace MxRec { @@ -36,8 +37,13 @@ namespace MxRec { size_t devVocabSize; size_t& maxOffset; absl::flat_hash_map& keyOffsetMap; - std::vector& evictDevPos; // 记录HBM内被淘汰的key - std::vector& evictHostPos; // 记录Host内淘汰列表 + }; + + struct SwapOutInfo { + vector swapOutDDRKeys; + vector swapOutDDRAddrOffs; + vector swapOutSSDKeys; + vector swapOutSSDAddrOffs; }; enum class TransferRet { @@ -67,34 +73,48 @@ namespace MxRec { ~CacheManager(); - void Init(HostEmb* hostEmbPtr, vector& mgmtEmbInfo); + void Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo); - void Load(unordered_map>& ddrFreqInitMap, - unordered_map>& excludeDdrFreqInitMap, - int step, int rankSize, int rankId); + void Load(const std::vector& mgmtEmbInfo, int step, + map>& trainKeySet); void SaveSSDEngine(int step); - // 转换DDR和SSD数据 - TransferRet TransferDDREmbWithSSD(TableInfo& table, - const vector& originalKeys, int channelId); + bool IsKeyInSSD(const string& embTableName, emb_cache_key_t key); + + void EvictSSDEmbedding(const string& embTableName, const vector& keys); - /* HBM与DDR换入换出时刷新频次信息 */ - void RefreshFreqInfoCommon(const string& embTableName, vector& keys, - TransferType type); + void PutKey(const string& embTableName, const emb_key_t& key, RecordType type); - bool IsKeyInSSD(const string& embTableName, emb_key_t key); + void ProcessSwapOutKeys(const string& tableName, const vector& swapOutKeys, + SwapOutInfo& info); - void EvictSSDEmbedding(const string& embTableName, vector& keys); + void ProcessSwapInKeys(const string& tableName, const vector& swapInKeys, + vector& DDRToSSDKeys, vector& SSDToDDRKeys); - void PutKey(const string& embTableName, const emb_key_t& key, RecordType type); + void UpdateSSDEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize, vector& keys, + const vector& swapOutSSDAddrOffs); + + void TransferDDR2SSD(string tableName, uint32_t extEmbeddingSize, vector& keys, + vector& addrs); + + void FetchSSDEmb2DDR(string tableName, uint32_t extEmbeddingSize, vector& keys, + const vector& addrs); + + int64_t GetTableEmbeddingSize(const string& tableName); // DDR内每个表中emb数据频次缓存;map unordered_map ddrKeyFreqMap; // 每张表中非DDR内key的出现次数 - unordered_map> excludeDDRKeyCountMap; + unordered_map> excludeDDRKeyCountMap; - int64_t GetTableEmbeddingSize(const string& tableName); + // 每一个table对应一个PreProcessMapper,预先推演HBM->DDR的情况 + std::unordered_map preProcessMapper; + + int preProcessStep = 0; + int embeddingTaskStep = 0; + std::mutex evictWaitMut; + std::condition_variable evictWaitCond; private: struct EmbBaseInfo { @@ -103,53 +123,14 @@ namespace MxRec { bool isExist; }; - void GetDDREmbInfo(vector& keys, - TableInfo& table, - vector& ddrTransferPos, vector>& ddrEmbData) const; - - void UpdateDDREmbInfo(const std::string& embTableName, - vector& ddrTransferPos, - vector>& ssdEmbData) const; - - void RefreshRelateInfoWithDDR2SSD(TableInfo& table, - vector& ddrSwapOutKeys, vector& ddrSwapOutCounts); - - void RefreshRelateInfoWithSSD2DDR(TableInfo& table, - vector& externalSSDKeys, vector& ddrTransferPos); - - void GetSSDKeys(const std::string& embTableName, vector& externalKeys, - vector& externalSSDKeys); - - TransferRet 
TransferDDREmb2SSD(TableInfo& table, - int64_t ddrSwapOutSize, const vector& keys, - vector& ddrTransferPos); - - TransferRet TransferSSDEmb2DDR(TableInfo& table, - vector& externalSSDKeys, vector& ddrTransferPos, - vector>& ssdEmbData); - void CreateSSDTableIfNotExist(const std::string& embTableName); - void RestoreLeastFreqInfo(const std::string& embTableName, vector& ddrSwapOutKeys, - vector& ddrSwapOutCounts); - - static void HandleDDRTransferPos(vector& ddrTransferPos, vector& externalSSDKeys, - TableInfo& table); - - inline void GetExternalKeys(const absl::flat_hash_map &keyOffsetMap, - vector& externalKeys, - vector& internalKeys, const vector& keys) const; - - void AddDebugAndTraceLog(size_t batchKeySize, vector& externalKeys, - vector& externalSSDKeys) const; - - void HandleRepeatAndInvalidKey(const vector& originalKeys, vector& keys) const; - unordered_map embBaseInfos; GTEST_PRIVATE: shared_ptr ssdEngine = std::make_shared(); - HostEmb* hostEmbs {}; + vector ssdEvictThreads; + ock::ctr::EmbCacheManagerPtr embCache {}; }; } diff --git a/src/core/ssd_cache/lfu_cache.cpp b/src/core/ssd_cache/lfu_cache.cpp index c204e336..c2d38bd2 100644 --- a/src/core/ssd_cache/lfu_cache.cpp +++ b/src/core/ssd_cache/lfu_cache.cpp @@ -25,7 +25,7 @@ using namespace MxRec; /// 仅获取当前key的频次,不增加频次;key不存在时返回-1 /// \param key key /// \return key的频次 -freq_num_t LFUCache::Get(emb_key_t key) +freq_num_t LFUCache::Get(emb_cache_key_t key) { auto it = keyTable.find(key); if (it == keyTable.end()) { return -1; } @@ -37,13 +37,16 @@ freq_num_t LFUCache::Get(emb_key_t key) /// \param keys 要返回的最低频次key不能在该列表内 /// \param ddrSwapOutKeys 记录最低频次key /// \param ddrSwapOutCounts 记录最低频次key对应次数 -void LFUCache::GetAndDeleteLeastFreqKeyInfo(int64_t num, const vector& keys, - vector& ddrSwapOutKeys, vector& ddrSwapOutCounts) +void LFUCache::GetAndDeleteLeastFreqKeyInfo(uint64_t num, const vector& keys, + vector& ddrSwapOutKeys, + vector& ddrSwapOutCounts) { freq_num_t tempMinFreq = minFreq; - unordered_set retainedKeySet(keys.begin(), keys.end()); - int64_t counter = 0; + unordered_set retainedKeySet(keys.begin(), keys.end()); + uint64_t counter = 0; const size_t freqSize = freqTable.size(); + LOG_DEBUG("table:{}, num:{}, freqTable.size:{}, keys.size:{}, ddrSwapOutKeys.size:{}, ddrSwapOutCounts.size:{}", + name, num, freqTable.size(), keys.size(), ddrSwapOutKeys.size(), ddrSwapOutCounts.size()); // 遍历freqTable<次数,keyList>时,次数可能不连续,要实际使用了1个keyList后才自增,手动增加计数器 for (size_t i = 0; i < freqSize;) { auto nodesIter = freqTable.find(tempMinFreq); @@ -53,7 +56,7 @@ void LFUCache::GetAndDeleteLeastFreqKeyInfo(int64_t num, const vector } auto nodeIt = freqTable[tempMinFreq].begin(); while (nodeIt != freqTable[tempMinFreq].end() && !freqTable[tempMinFreq].empty() && counter < num) { - emb_key_t currentKey = nodeIt->key; + emb_cache_key_t currentKey = nodeIt->key; if (retainedKeySet.find(currentKey) != retainedKeySet.end()) { // 当前key在指定的集合中,不满足 nodeIt++; @@ -80,7 +83,7 @@ void LFUCache::GetAndDeleteLeastFreqKeyInfo(int64_t num, const vector /// 放入key,新增/更新(次数+1)次数 /// \param key key -void LFUCache::Put(emb_key_t key) +void LFUCache::Put(emb_cache_key_t key) { auto it = keyTable.find(key); if (it == keyTable.end()) { @@ -94,8 +97,10 @@ void LFUCache::Put(emb_key_t key) freqTable[freq].erase(node); if (freqTable[freq].empty()) { freqTable.erase(freq); + if (minFreq == freq) { + minFreq += 1; + } } - if (minFreq == freq) { minFreq += 1; } freqTable[freq + 1].emplace_front(key, freq + 1); keyTable[key] = freqTable[freq + 1].begin(); } @@ 
-103,7 +108,7 @@ void LFUCache::Put(emb_key_t key) /// 直接放入指定次数;用于初始化场景 /// \param key key /// \param freq 频次 -void LFUCache::PutWithInit(emb_key_t key, freq_num_t freq) +void LFUCache::PutWithInit(emb_cache_key_t key, freq_num_t freq) { if (keyTable.find(key) != keyTable.end()) { // 一般初始化时,key应该不存在已经被插入的情况;此处替换就的key频次信息 @@ -120,7 +125,7 @@ void LFUCache::PutWithInit(emb_key_t key, freq_num_t freq) } /// 删除指定key -bool LFUCache::Pop(emb_key_t key) +bool LFUCache::Pop(emb_cache_key_t key) { auto it = keyTable.find(key); if (it == keyTable.end()) { @@ -139,15 +144,23 @@ bool LFUCache::Pop(emb_key_t key) /// 获取所有的key和次数信息 /// \return 频次数据map -std::unordered_map LFUCache::GetFreqTable() +std::unordered_map LFUCache::GetFreqTable() { - unordered_map freqMap(keyTable.size()); + unordered_map freqMap(keyTable.size()); for (const auto& it :keyTable) { freqMap[it.first] = it.second->freq; } return freqMap; } +LFUCache::LFUCache(const string& cacheName) +{ + name = cacheName; + minFreq = 0; + keyTable.clear(); + freqTable.clear(); +} + LFUCache::LFUCache() { minFreq = 0; diff --git a/src/core/ssd_cache/lfu_cache.h b/src/core/ssd_cache/lfu_cache.h index 247e490e..94fde539 100644 --- a/src/core/ssd_cache/lfu_cache.h +++ b/src/core/ssd_cache/lfu_cache.h @@ -31,10 +31,10 @@ namespace MxRec { // 记录key和次数信息 struct LFUCacheNode { - emb_key_t key; + emb_cache_key_t key; freq_num_t freq; - LFUCacheNode(emb_key_t key, freq_num_t freq) : key(key), freq(freq) + LFUCacheNode(emb_cache_key_t key, freq_num_t freq) : key(key), freq(freq) {} }; @@ -42,25 +42,29 @@ namespace MxRec { public: LFUCache(); - freq_num_t Get(emb_key_t key); + explicit LFUCache(const string& cacheName); - void GetAndDeleteLeastFreqKeyInfo(int64_t num, const vector& keys, - vector& ddrSwapOutKeys, + freq_num_t Get(emb_cache_key_t key); + + void GetAndDeleteLeastFreqKeyInfo(uint64_t num, const vector& keys, + vector& ddrSwapOutKeys, vector& ddrSwapOutCounts); - void Put(emb_key_t key); + void Put(emb_cache_key_t key); - bool Pop(emb_key_t key); + bool Pop(emb_cache_key_t key); - void PutWithInit(emb_key_t key, freq_num_t freq); + void PutWithInit(emb_cache_key_t key, freq_num_t freq); - std::unordered_map GetFreqTable(); + std::unordered_map GetFreqTable(); // 最小频次 freq_num_t minFreq = 0; // 次数, 该次数对应的key列表(key, freq) std::unordered_map> freqTable; // key, key所属node在freqTable的节点列表中的存储位置地址 - std::unordered_map::iterator> keyTable; + std::unordered_map::iterator> keyTable; + private: + string name; }; } diff --git a/src/core/ssd_cache/preprocess_mapper.h b/src/core/ssd_cache/preprocess_mapper.h new file mode 100644 index 00000000..03860181 --- /dev/null +++ b/src/core/ssd_cache/preprocess_mapper.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. 
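The change to LFUCache::Put above is a behavior fix rather than a tidy-up: previously minFreq advanced whenever the promoted key happened to sit at minFreq, even while other keys remained in that frequency bucket, so later least-frequency scans could start above the true minimum. A self-contained model of the corrected promotion step, simplified to a plain frequency map instead of stored list iterators:

    #include <cstdint>
    #include <list>
    #include <unordered_map>

    // Minimal model of the promotion step for a key that is already present.
    // minFreq may only advance when the bucket it points at becomes empty.
    struct MiniLfu {
        using Key = uint64_t;
        std::unordered_map<int64_t, std::list<Key>> freqTable;
        std::unordered_map<Key, int64_t> keyFreq;
        int64_t minFreq = 0;

        void Promote(Key key)  // precondition: key exists in keyFreq
        {
            int64_t freq = keyFreq[key];
            auto& bucket = freqTable[freq];
            bucket.remove(key);
            if (bucket.empty()) {
                freqTable.erase(freq);
                if (minFreq == freq) {  // fixed: only advance on an empty bucket
                    minFreq = freq + 1;
                }
            }
            freqTable[freq + 1].push_front(key);
            keyFreq[key] = freq + 1;
        }
    };
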
+ * Description: ssd cache module + * Author: MindX SDK + * Date: 2024/2/18 + */ + +#ifndef MXREC_DDR_PREPROCESS_MAPPER_H +#define MXREC_DDR_PREPROCESS_MAPPER_H + +#include +#include "lfu_cache.h" + +namespace MxRec { + /* + * 专供keys处理的线程使用,每一个emb_local_table就有一个DDRPreProcessMapper + * MapperBase中的桶存储k-v对,在这里value统一赋值为0 + */ + class PreProcessMapper { + public: + void Initialize(const string& embName, uint32_t vocabSize, uint32_t ssdVocabSize) + { + tableName = embName; + lfuCache = LFUCache(embName); + ddrAvailableSize = vocabSize; + ssdAvailableSize = ssdVocabSize; + } + + bool IsDDRKeyExist(uint64_t key) + { + return lfuCache.keyTable.find(key) != lfuCache.keyTable.end(); + } + + bool IsSSDKeyExist(uint64_t key) + { + return excludeDDRKeyCountMap.find(key) != excludeDDRKeyCountMap.end(); + } + + bool InsertDDRKey(uint64_t key) + { + if (IsDDRKeyExist(key)) { + throw std::invalid_argument("InsertDDRKey failed! key already exist"); + } + + freq_num_t freq = excludeDDRKeyCountMap[key] + 1; + lfuCache.PutWithInit(key, freq); + return true; + } + + bool InsertSSDKey(uint64_t key) + { + if (IsSSDKeyExist(key)) { + throw std::invalid_argument("InsertSSDKey failed! key already exist"); + } + + excludeDDRKeyCountMap[key] = 1; + return true; + } + + bool RemoveSSDKey(uint64_t key) + { + if (!IsSSDKeyExist(key)) { + throw std::invalid_argument("RemoveKey failed! key not exist"); + } + excludeDDRKeyCountMap.erase(key); + return true; + } + + size_t DDRAvailableSize() + { + if (ddrAvailableSize < lfuCache.keyTable.size()) { + throw std::invalid_argument("ddrAvailableSize < existKeys.size()"); + } + return ddrAvailableSize - lfuCache.keyTable.size(); + } + + size_t SSDAvailableSize() + { + if (ssdAvailableSize < excludeDDRKeyCountMap.size()) { + throw std::invalid_argument("ssdAvailableSize < existKeys.size()"); + } + return ssdAvailableSize - excludeDDRKeyCountMap.size(); + } + + void GetAndDeleteLeastFreqDDRKey2SSD(uint64_t transNum, const std::vector& keys, + std::vector& DDRSwapOutKeys) + { + LOG_DEBUG("start GetAndDeleteLeastFreqDDRKey2SSD, table:{}", tableName); + std::vector DDRSwapOutCounts; + lfuCache.GetAndDeleteLeastFreqKeyInfo(transNum, keys, DDRSwapOutKeys, DDRSwapOutCounts); + for (uint64_t i = 0; i < DDRSwapOutKeys.size(); i++) { + excludeDDRKeyCountMap[DDRSwapOutKeys[i]] = DDRSwapOutCounts[i]; + } + if (DDRSwapOutCounts.size() != transNum) { + throw std::invalid_argument( + "GetAndDeleteLeastFreqDDRKey2SSD failed! 
DDRSwapOutCounts.size()!=transNum"); + } + } + + string tableName; + uint64_t ddrAvailableSize = 0; + uint64_t ssdAvailableSize = 0; + LFUCache lfuCache; + std::unordered_map excludeDDRKeyCountMap; + }; +} + +#endif // MXREC_DDR_PREPROCESS_MAPPER_H diff --git a/src/core/ssd_engine/file.cpp b/src/core/ssd_engine/file.cpp index 83395f36..cc9ec206 100644 --- a/src/core/ssd_engine/file.cpp +++ b/src/core/ssd_engine/file.cpp @@ -24,7 +24,7 @@ using namespace MxRec; /// 创建新文件实例,包含元数据文件、数据文件 /// \param fileID 文件ID /// \param fileDir 当前文件目录 -File::File(uint64_t fileID, string &fileDir) : fileID(fileID), fileDir(fileDir) +File::File(uint64_t fileID, string& fileDir) : fileID(fileID), fileDir(fileDir) { LOG_DEBUG("start init file, fileID:{}", fileID); @@ -75,7 +75,7 @@ File::File(uint64_t fileID, string &fileDir) : fileID(fileID), fileDir(fileDir) /// \param loadDir 加载文件的目录 /// \param fileDir 当前文件目录 /// \param step 加载的步数 -File::File(uint64_t fileID, string &fileDir, string &loadDir, int step) : fileID(fileID), fileDir(fileDir) +File::File(uint64_t fileID, string& fileDir, string& loadDir, int step) : fileID(fileID), fileDir(fileDir) { LOG_DEBUG("start init file with load, fileID:{}", fileID); @@ -141,13 +141,13 @@ File::~File() fs::remove(dataFilePath); } -bool File::IsKeyExist(emb_key_t key) +bool File::IsKeyExist(emb_cache_key_t key) const { auto it = keyToOffset.find(key); return !(it == keyToOffset.end()); } -void File::InsertEmbeddings(vector &keys, vector> &embeddings) +void File::InsertEmbeddings(vector& keys, vector>& embeddings) { if (keys.size() != embeddings.size()) { throw invalid_argument("keys' length not equal to embeddings' length"); @@ -178,10 +178,10 @@ void File::InsertEmbeddings(vector &keys, vector> &embe dataCnt += dLen; } -vector> File::FetchEmbeddings(vector &keys) +vector> File::FetchEmbeddings(vector& keys) { vector> ret; - for (emb_key_t k: keys) { + for (emb_cache_key_t k: keys) { auto it = keyToOffset.find(k); if (it == keyToOffset.end()) { throw invalid_argument("key not exist"); @@ -208,7 +208,7 @@ vector> File::FetchEmbeddings(vector &keys) return ret; } -void File::DeleteEmbedding(emb_key_t key) +void File::DeleteEmbedding(emb_cache_key_t key) { if (!IsKeyExist(key)) { return; @@ -217,7 +217,7 @@ void File::DeleteEmbedding(emb_key_t key) staleDataCnt += 1; } -void File::Save(const string &saveDir, int step) +void File::Save(const string& saveDir, int step) { LOG_DEBUG("start save file at step:{}, fileID:{}", step, fileID); @@ -278,7 +278,7 @@ void File::Load() { // file already validate and open in instantiation LOG_DEBUG("start reading meta file, fileID:{}", fileID); - emb_key_t key; + emb_cache_key_t key; offset_t offset; do { localFileMeta.read(reinterpret_cast(&key), keyDataLen); @@ -311,9 +311,9 @@ void File::Load() LOG_DEBUG("end reading meta file, fileID:{}", fileID); } -vector File::GetKeys() +vector File::GetKeys() { - vector ret; + vector ret; for (auto item: keyToOffset) { ret.push_back(item.first); } @@ -334,3 +334,40 @@ uint64_t File::GetStaleDataCnt() const { return staleDataCnt; } + +void File::InsertEmbeddingsByAddr(vector& keys, vector& embeddingsAddr, + uint64_t extEmbeddingSize) +{ + if (keys.size() != embeddingsAddr.size()) { + throw invalid_argument("keys' length not equal to embeddings' length"); + } + + size_t dLen = keys.size(); + for (size_t i = 0; i < dLen; ++i) { + if (embeddingsAddr[i] == nullptr) { + throw invalid_argument("Null pointer found in embeddingsAddr"); + } + } + + localFileData.seekp(lastWriteOffset); // always set pointer to 
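With PreProcessMapper in place, the swap-out classification in cache_manager.cpp reduces to a per-key decision tree. A condensed restatement of that logic; the include path is assumed:

    #include <cstdint>
    #include "ssd_cache/preprocess_mapper.h"  // include path assumed

    // Condensed restatement of the per-key classification that
    // ProcessSwapOutKeys performs against PreProcessMapper.
    void TrackKey(MxRec::PreProcessMapper& mapper, uint64_t key)
    {
        if (mapper.IsDDRKeyExist(key)) {
            mapper.lfuCache.Put(key);             // DDR-resident: bump frequency
        } else if (mapper.IsSSDKeyExist(key)) {
            mapper.excludeDDRKeyCountMap[key]++;  // SSD-resident: count outside LFU
        } else if (mapper.DDRAvailableSize() > 0) {
            mapper.InsertDDRKey(key);             // new key and DDR has room
        } else {
            mapper.InsertSSDKey(key);             // new key, spill to SSD
        }
    }
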
buffer end in case reading happened before + + for (size_t i = 0; i < dLen; ++i) { + if (IsKeyExist(keys[i])) { + staleDataCnt++; + } + keyToOffset[keys[i]] = lastWriteOffset; + + if (extEmbeddingSize > maxEmbSize) { + throw invalid_argument("embedding size too large"); + } + localFileData.write(reinterpret_cast(&extEmbeddingSize), sizeof(extEmbeddingSize)); + localFileData.write(reinterpret_cast(embeddingsAddr[i]), extEmbeddingSize * sizeof(float)); + + auto pos = localFileData.tellp(); + if (pos == -1) { + throw runtime_error("can't get file position pointer, write data failed"); + } + lastWriteOffset = offset_t(pos); + } + dataCnt += dLen; +} diff --git a/src/core/ssd_engine/file.h b/src/core/ssd_engine/file.h index 949859db..bc2b1fcb 100644 --- a/src/core/ssd_engine/file.h +++ b/src/core/ssd_engine/file.h @@ -33,30 +33,31 @@ namespace MxRec { using offset_t = uint32_t; class File { - static const uint64_t keyDataLen = sizeof(emb_key_t); - static const uint64_t offsetDataLen = sizeof(offset_t); + static constexpr uint64_t keyDataLen = sizeof(emb_cache_key_t); + static constexpr uint64_t offsetDataLen = sizeof(offset_t); public: - File(uint64_t fileID, string &fileDir); + File(uint64_t fileID, string& fileDir); - File(uint64_t fileID, string &fileDir, string &loadDir, int step); // initialize with loading specific step data + File(uint64_t fileID, string& fileDir, string& loadDir, + int step); // initialize with loading specific step data File(const File&) = delete; File& operator=(const File&) = delete; ~File(); - bool IsKeyExist(emb_key_t key); + bool IsKeyExist(emb_cache_key_t key) const; - void InsertEmbeddings(vector &keys, vector> &embeddings); + void InsertEmbeddings(vector& keys, vector>& embeddings); - vector> FetchEmbeddings(vector &keys); + vector> FetchEmbeddings(vector& keys); - void DeleteEmbedding(emb_key_t key); + void DeleteEmbedding(emb_cache_key_t key); - void Save(const string &saveDir, int step); + void Save(const string& saveDir, int step); - vector GetKeys(); + vector GetKeys(); uint64_t GetDataCnt() const; @@ -64,6 +65,9 @@ namespace MxRec { uint64_t GetStaleDataCnt() const; + void InsertEmbeddingsByAddr(vector& keys, vector& embeddingsAddr, + uint64_t extEmbeddingSize); + private: uint64_t fileID; // init by constructor string fileDir; // init by constructor @@ -77,7 +81,7 @@ namespace MxRec { uint64_t dataCnt = 0; uint64_t staleDataCnt = 0; - unordered_map keyToOffset{}; // offset_t >> maxDataNumInFile * embDataSize + unordered_map keyToOffset{}; // offset_t >> maxDataNumInFile * embDataSize offset_t lastWriteOffset = 0; void Load(); diff --git a/src/core/ssd_engine/ssd_engine.cpp b/src/core/ssd_engine/ssd_engine.cpp index 65708792..bbf55e66 100644 --- a/src/core/ssd_engine/ssd_engine.cpp +++ b/src/core/ssd_engine/ssd_engine.cpp @@ -27,7 +27,7 @@ bool SSDEngine::IsTableExist(const string &tableName) return !(it == tableMap.end()); } -bool SSDEngine::IsKeyExist(const string &tableName, emb_key_t key) +bool SSDEngine::IsKeyExist(const string &tableName, emb_cache_key_t key) { if (!isRunning) { throw runtime_error("SSDEngine not running"); @@ -54,7 +54,8 @@ void SSDEngine::CreateTable(const string &tableName, vector savePaths, u tableMap[tableName] = make_shared(tableName, savePaths, maxTableSize, compactThreshold); } -void SSDEngine::InsertEmbeddings(const string &tableName, vector &keys, vector> &embeddings) +void SSDEngine::InsertEmbeddings(const string& tableName, vector& keys, + vector>& embeddings) { if (!isRunning) { throw runtime_error("SSDEngine not 
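InsertEmbeddingsByAddr writes each record as a uint64_t element count followed by that many floats, and keyToOffset remembers where each record starts. A sketch of reading one record back under that layout; the function is illustrative, not part of the File API:

    #include <cstdint>
    #include <fstream>
    #include <stdexcept>
    #include <vector>

    // Read one [uint64_t count][count x float] record at a known offset.
    std::vector<float> ReadRecord(std::fstream& dataFile, uint32_t offset)
    {
        dataFile.seekg(offset);
        uint64_t len = 0;
        dataFile.read(reinterpret_cast<char*>(&len), sizeof(len));
        std::vector<float> emb(len);
        dataFile.read(reinterpret_cast<char*>(emb.data()), len * sizeof(float));
        if (!dataFile) {
            throw std::runtime_error("short read: record truncated or offset stale");
        }
        return emb;
    }
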
running"); @@ -71,7 +72,7 @@ void SSDEngine::InsertEmbeddings(const string &tableName, vector &key it->second->InsertEmbeddings(keys, embeddings); } -void SSDEngine::DeleteEmbeddings(const string &tableName, vector &keys) +void SSDEngine::DeleteEmbeddings(const string &tableName, vector &keys) { if (!isRunning) { throw runtime_error("SSDEngine not running"); @@ -154,7 +155,7 @@ void SSDEngine::CompactMonitor() LOG_DEBUG("SSDEngine end CompactMonitor"); } -vector> SSDEngine::FetchEmbeddings(const string &tableName, vector &keys) +vector> SSDEngine::FetchEmbeddings(const string &tableName, vector &keys) { if (!isRunning) { throw runtime_error("SSDEngine not running"); @@ -209,3 +210,30 @@ int64_t SSDEngine::GetTableEmbeddingSize(const string &tableName) } return static_cast(it->second->GetTableUsage()); } + +void SSDEngine::InsertEmbeddingsByAddr(const string& tableName, vector& keys, + vector& embeddingsAddr, uint64_t extEmbeddingSize) +{ + if (!isRunning) { + throw runtime_error("SSDEngine not running"); + } + auto it = as_const(tableMap).find(tableName); + if (it == tableMap.end()) { + throw invalid_argument("table not found"); + } + + if (keys.size() != embeddingsAddr.size()) { + throw invalid_argument("keys' length not equal to embeddings' length"); + } + + it->second->InsertEmbeddingsByAddr(keys, embeddingsAddr, extEmbeddingSize); +} + +vector>> SSDEngine::ExportTableKey() +{ + vector>> tableKeysVec; + for (const auto& p : tableMap) { + tableKeysVec.emplace_back(p.first, p.second->ExportKeys()); + } + return tableKeysVec; +} diff --git a/src/core/ssd_engine/ssd_engine.h b/src/core/ssd_engine/ssd_engine.h index 10f89d57..538f76e2 100644 --- a/src/core/ssd_engine/ssd_engine.h +++ b/src/core/ssd_engine/ssd_engine.h @@ -31,17 +31,18 @@ namespace MxRec { public: bool IsTableExist(const string &tableName); - bool IsKeyExist(const string &tableName, emb_key_t key); + bool IsKeyExist(const string &tableName, emb_cache_key_t key); void CreateTable(const string &tableName, vector savePaths, uint64_t maxTableSize); int64_t GetTableAvailableSpace(const string &tableName); - void InsertEmbeddings(const string &tableName, vector &keys, vector> &embeddings); + void InsertEmbeddings(const string &tableName, vector &keys, + vector> &embeddings); - void DeleteEmbeddings(const string &tableName, vector &keys); + void DeleteEmbeddings(const string &tableName, vector &keys); - vector> FetchEmbeddings(const string &tableName, vector &keys); + vector> FetchEmbeddings(const string &tableName, vector &keys); void Save(int step); @@ -57,6 +58,11 @@ namespace MxRec { int64_t GetTableEmbeddingSize(const string& tableName); + void InsertEmbeddingsByAddr(const string &tableName, vector &keys, + vector &embeddingsAddr, uint64_t extEmbeddingSize); + + vector>> ExportTableKey(); + private: bool isRunning = false; diff --git a/src/core/ssd_engine/table.cpp b/src/core/ssd_engine/table.cpp index c7ed5363..592cce0e 100644 --- a/src/core/ssd_engine/table.cpp +++ b/src/core/ssd_engine/table.cpp @@ -72,27 +72,27 @@ Table::Table(const string &name, vector &saveDirs, uint64_t maxTableSize LOG_INFO("load table:{} done. 
try store at path:{}", name, curTablePath); } -bool Table::IsKeyExist(emb_key_t key) +bool Table::IsKeyExist(emb_cache_key_t key) { lock_guard guard(rwLock); auto it = keyToFile.find(key); return !(it == keyToFile.end()); } -void Table::InsertEmbeddings(vector &keys, vector> &embeddings) +void Table::InsertEmbeddings(vector &keys, vector> &embeddings) { lock_guard guard(rwLock); InsertEmbeddingsInner(keys, embeddings); } -vector> Table::FetchEmbeddings(vector &keys) +vector> Table::FetchEmbeddings(vector &keys) { lock_guard guard(rwLock); return FetchEmbeddingsInner(keys); } -void Table::DeleteEmbeddings(vector &keys) +void Table::DeleteEmbeddings(vector &keys) { lock_guard guard(rwLock); DeleteEmbeddingsInner(keys); @@ -205,7 +205,7 @@ void Table::LoadDataFileSet(const shared_ptr &metaFile, int step) throw invalid_argument("table size too small, key quantity exceed while loading data"); } - for (emb_key_t k: keys) { + for (emb_cache_key_t k: keys) { if (keyToFile.find(k) != keyToFile.end()) { throw invalid_argument( "find duplicate key in files, compaction already done before saving, file may broken or modified"); @@ -267,7 +267,7 @@ void Table::Load(const string &metaFilePath, int step) LOG_INFO("table:{}, end load data file", name); } -void Table::InsertEmbeddingsInner(vector &keys, vector> &embeddings) +void Table::InsertEmbeddingsInner(vector &keys, vector> &embeddings) { if (totalKeyCnt > maxTableSize) { throw invalid_argument("table size too small, key quantity exceed while loading data"); @@ -281,7 +281,7 @@ void Table::InsertEmbeddingsInner(vector &keys, vector> curMaxFileID++; } - for (emb_key_t k: keys) { + for (emb_cache_key_t k: keys) { auto it = keyToFile.find(k); if (it != keyToFile.end()) { it->second->DeleteEmbedding(k); @@ -294,25 +294,25 @@ void Table::InsertEmbeddingsInner(vector &keys, vector> totalKeyCnt += keys.size(); } -vector> Table::FetchEmbeddingsInner(vector &keys) +vector> Table::FetchEmbeddingsInner(vector &keys) { // build mini batch for each file, first element for keys, second for index size_t dLen = keys.size(); - unordered_map, shared_ptr, vector>>> miniBatch; + unordered_map, shared_ptr, vector>>> miniBatch; for (size_t i = 0; i < dLen; ++i) { auto it = as_const(keyToFile).find(keys[i]); if (it == keyToFile.end()) { throw invalid_argument(StringFormat("failed to find the key, {key=%d} not exist!", keys[i])); } if (miniBatch[it->second] == nullptr) { - miniBatch[it->second] = make_shared, vector>>(); + miniBatch[it->second] = make_shared, vector>>(); } miniBatch[it->second]->first.emplace_back(keys[i]); miniBatch[it->second]->second.emplace_back(i); } // must convert map to list to perform parallel query, omp not support to iterate map - vector, vector, vector>> queryList; + vector, vector, vector>> queryList; queryList.reserve(miniBatch.size()); for (auto [f, info]: miniBatch) { queryList.emplace_back(f, info->first, info->second); @@ -368,7 +368,7 @@ void Table::Compact(bool fullCompact) for (const auto &f: compactFileList) { staleDataFileSet.erase(f); fileSet.erase(f); - vector validKeys = f->GetKeys(); + vector validKeys = f->GetKeys(); vector> validEmbs = f->FetchEmbeddings(validKeys); InsertEmbeddingsInner(validKeys, validEmbs); } @@ -381,9 +381,9 @@ uint64_t Table::GetTableAvailableSpace() return maxTableSize - totalKeyCnt; } -void Table::DeleteEmbeddingsInner(vector &keys) +void Table::DeleteEmbeddingsInner(vector &keys) { - for (emb_key_t k: keys) { + for (emb_cache_key_t k: keys) { auto it = keyToFile.find(k); if (it != keyToFile.end()) { 
it->second->DeleteEmbedding(k); @@ -441,3 +441,46 @@ void Table::CreateTableDir(const string &path) LOG_DEBUG("create table dir:{}", path); } +void Table::InsertEmbeddingsByAddr(vector& keys, vector& embeddingsAddr, + uint32_t extEmbeddingSize) +{ + lock_guard guard(rwLock); + InsertEmbeddingsByAddrInner(keys, embeddingsAddr, extEmbeddingSize); +} + +void Table::InsertEmbeddingsByAddrInner(vector& keys, vector& embeddingsAddr, + uint64_t extEmbeddingSize) +{ + if (totalKeyCnt > maxTableSize) { + throw invalid_argument("table size too small, key quantity exceed while loading data"); + } + + if (curFile == nullptr || (curFile != nullptr && curFile->GetDataCnt() >= maxDataNumInFile)) { + SetTablePathToDiskWithSpace(); + CreateTableDir(curTablePath); + curFile = make_shared(curMaxFileID, curTablePath); + fileSet.insert(curFile); + curMaxFileID++; + } + + for (emb_cache_key_t k : keys) { + auto it = keyToFile.find(k); + if (it != keyToFile.end()) { + it->second->DeleteEmbedding(k); + staleDataFileSet.insert(it->second); + totalKeyCnt -= 1; + } + keyToFile[k] = curFile; + } + curFile->InsertEmbeddingsByAddr(keys, embeddingsAddr, extEmbeddingSize); + totalKeyCnt += keys.size(); +} + +vector Table::ExportKeys() +{ + vector vec; + for (const auto& p : keyToFile) { + vec.push_back(p.first); + } + return vec; +} \ No newline at end of file diff --git a/src/core/ssd_engine/table.h b/src/core/ssd_engine/table.h index 87fa6f35..c34837dc 100644 --- a/src/core/ssd_engine/table.h +++ b/src/core/ssd_engine/table.h @@ -32,18 +32,18 @@ namespace MxRec { class Table { public: - Table(const string &name, vector &savePaths, uint64_t maxTableSize, double compactThreshold); + Table(const string& name, vector& savePaths, uint64_t maxTableSize, double compactThreshold); // initialize with loading specific step data - Table(const string &name, vector &saveDirs, uint64_t maxTableSize, double compactThreshold, int step); + Table(const string& name, vector& saveDirs, uint64_t maxTableSize, double compactThreshold, int step); - bool IsKeyExist(emb_key_t key); + bool IsKeyExist(emb_cache_key_t key); - void InsertEmbeddings(vector &keys, vector> &embeddings); + void InsertEmbeddings(vector& keys, vector>& embeddings); - vector> FetchEmbeddings(vector &keys); + vector> FetchEmbeddings(vector& keys); - void DeleteEmbeddings(vector &keys); + void DeleteEmbeddings(vector& keys); void Save(int step); @@ -53,26 +53,34 @@ namespace MxRec { uint64_t GetTableUsage(); + void InsertEmbeddingsByAddr(vector& keys, vector& embeddingsAddr, + uint32_t extEmbeddingSize); + + vector ExportKeys(); + private: static void CreateTableDir(const string& path); void Load(const string& metaFilePath, int step); - void InsertEmbeddingsInner(vector &keys, vector> &embeddings); + void InsertEmbeddingsInner(vector& keys, vector>& embeddings); - void DeleteEmbeddingsInner(vector &keys); + void DeleteEmbeddingsInner(vector& keys); - vector> FetchEmbeddingsInner(vector &keys); + vector> FetchEmbeddingsInner(vector& keys); void LoadDataFileSet(const shared_ptr& metaFile, int step); void SetTablePathToDiskWithSpace(); + void InsertEmbeddingsByAddrInner(vector& keys, vector& embeddingsAddr, + uint64_t extEmbeddingSize); + string name; // init by constructor vector savePaths; // init by constructor, support Save and Load from multiple path uint64_t maxTableSize; // init by constructor, maximum key-value volume uint64_t totalKeyCnt = 0; - unordered_map> keyToFile{}; // max mem cost 1.5G*2 for 100m keys + unordered_map> keyToFile{}; // max mem cost 1.5G*2 for 
100m keys set> staleDataFileSet{}; string curTablePath = ""; uint32_t curSavePathIdx = 0; diff --git a/src/core/utils/common.cpp b/src/core/utils/common.cpp index abd50f56..d281162c 100644 --- a/src/core/utils/common.cpp +++ b/src/core/utils/common.cpp @@ -37,6 +37,8 @@ namespace MxRec { int GlogConfig::gGlogLevel; string GlogConfig::gRankId; + ock::ctr::FactoryPtr factory {}; + RankInfo::RankInfo(int rankId, int deviceId, int localRankSize, int option, const vector& ctrlSteps) : rankId(rankId), deviceId(deviceId), localRankSize(localRankSize), option(option), ctrlSteps(ctrlSteps) { diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 4f1d076c..5bb93a41 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -35,6 +35,8 @@ See the License for the specific language governing permissions and #include "initializer/constant_initializer/constant_initializer.h" #include "initializer/truncated_normal_initializer/truncated_normal_initializer.h" #include "initializer/random_normal_initializer/random_normal_initializer.h" +#include "ock_ctr_common/include/factory.h" +#include "ock_ctr_common/include/embedding_cache.h" #if defined(BUILD_WITH_EASY_PROFILER) #include @@ -53,6 +55,7 @@ namespace MxRec { #define MGMT_CPY_THREADS 4 #define PROFILING using namespace tensorflow; + extern ock::ctr::FactoryPtr factory; constexpr int TRAIN_CHANNEL_ID = 0; constexpr int EVAL_CHANNEL_ID = 1; @@ -65,6 +68,7 @@ namespace MxRec { constexpr size_t MAX_VOCABULARY_SIZE = 1e10; constexpr int SSD_SIZE_INDEX = 2; constexpr int MAX_FILE_NUM = 1000; + constexpr int EMBEDDING_THREAD_NUM = 2; // for GLOG struct GlogConfig { static bool gStatOn; @@ -111,10 +115,13 @@ namespace MxRec { const string COMBINE_HISTORY_NAME = "combine_table_history"; using emb_key_t = int64_t; + using emb_cache_key_t = uint64_t; using freq_num_t = int64_t; using EmbNameT= std::string; using KeysT = std::vector; using LookupKeyT = std::tuple; // batch_id quarry_lable keys_vector + using UinqueKeyT = std::tuple>; + using RestoreVecSecT = std::tuple>; using TensorInfoT = std::tuple>>::iterator>; namespace HybridOption { @@ -228,12 +235,17 @@ namespace MxRec { int localRankSize {}; bool useStatic { false }; uint32_t option {}; - int nBatch {}; bool isDDR { false }; bool isSSDEnabled { false }; bool useDynamicExpansion {false}; bool useSumSameIdGradients {true}; - std::vector ctrlSteps; // 包含三个步数: train_steps, eval_steps, save_steps + std::vector ctrlSteps; // 包含4个步数: train_steps, eval_steps, save_steps, max_train_steps + }; + + struct EmbBaseInfo { + int batchId; + int channelId; + string name; }; enum TensorIndex : uint32_t { @@ -445,7 +457,7 @@ namespace MxRec { EmbInfo(const EmbInfoParams& embInfoParams, std::vector vocabsize, - std::vector initializeInfos, + std::vector initializeInfos, std::vector ssdDataPath) : name(embInfoParams.name), sendCount(embInfoParams.sendCount), @@ -456,7 +468,7 @@ namespace MxRec { devVocabSize(vocabsize[0]), hostVocabSize(vocabsize[1]), ssdVocabSize(vocabsize[SSD_SIZE_INDEX]), - initializeInfos(initializeInfos), + initializeInfos(std::move(initializeInfos)), ssdDataPath(std::move(ssdDataPath)) { } @@ -470,7 +482,7 @@ namespace MxRec { size_t devVocabSize; size_t hostVocabSize; size_t ssdVocabSize; - std::vector initializeInfos; + std::vector initializeInfos; std::vector ssdDataPath; }; @@ -479,45 +491,6 @@ namespace MxRec { std::vector> embData; }; - struct EmbHashMapInfo { - absl::flat_hash_map hostHashMap; // key在HBM中的偏移 - std::vector devOffset2Batch; // has -1 - std::vector 
devOffset2Key; - size_t currentUpdatePos; - size_t currentUpdatePosStart; - size_t hostVocabSize; - size_t devVocabSize; - size_t freeSize; - std::vector lookUpVec; - std::vector missingKeysHostPos; // 用于记录当前batch在host上需要换出的偏移 - std::vector swapPos; // 记录从HBM换出到DDR的offset - /* - * 取值范围:[0,devVocabSize+hostVocabSize); - * [0,devVocabSize-1]时存储在HBM, [devVocabSize,devVocabSize+hostVocabSize)存储在DDR - */ - size_t maxOffset { 0 }; - /* - * 记录DDR内淘汰列表,其值为相对HBM+DDR大表的;hostHashMap可直接使用;操作ddr内emb时需减掉devVocabSize - * 例如:HBM表大小20(offset:0~19),DDR表大小为100(offset:0~99); - * 若DDR内0位置被淘汰,记录到evictPos的值为0+20=20 - */ - std::vector evictPos; - std::vector evictDevPos; // 记录HBM内淘汰列表 - size_t maxOffsetOld { 0 }; - std::vector evictPosChange; - std::vector evictDevPosChange; - std::vector> devOffset2KeyOld; - std::vector> oldSwap; // (old on dev, old on host) - /* - * HBM与DDR换入换出时,已存在于DDR且要转移到HBM的key(不包含新key); 用于SSD模式 - * (区别于oldSwap: pair.second为已存在于DDR key + 换入换出前映射到DDR的新key) - */ - std::vector ddr2HbmKeys; - void SetStartCount(); - - bool HasFree(size_t i) const; - }; - struct All2AllInfo { KeysT keyRecv; vector scAll; @@ -542,7 +515,6 @@ namespace MxRec { }; using EmbMemT = absl::flat_hash_map; - using EmbHashMemT = absl::flat_hash_map; using OffsetMemT = std::map; using KeyOffsetMemT = std::map>; using KeyCountMemT = std::map>; @@ -551,7 +523,8 @@ namespace MxRec { using OffsetMapT = std::map>; using OffsetT = std::vector; using AllKeyOffsetMapT = std::map>; - using KeyFreqMemT = unordered_map>; + using KeyFreqMemT = unordered_map>; + using EmbLocalTableT = EmbCache::EmbCacheManager; enum class CkptFeatureType { HOST_EMB = 0, @@ -561,12 +534,12 @@ namespace MxRec { FEAT_ADMIT_N_EVICT = 4, DDR_KEY_FREQ_MAP = 5, EXCLUDE_DDR_KEY_FREQ_MAP = 6, - KEY_COUNT_MAP = 7 + KEY_COUNT_MAP = 7, + EMB_LOCAL_TABLE = 8 }; struct CkptData { EmbMemT* hostEmbs = nullptr; - EmbHashMemT embHashMaps; OffsetMemT maxOffset; KeyOffsetMemT keyOffsetMap; OffsetMapT offsetMap; @@ -581,7 +554,6 @@ namespace MxRec { struct CkptTransData { std::vector int64Arr; std::vector addressArr; - std::vector floatArr; std::vector int32Arr; std::vector transDataset; // may all use this to transfer data std::vector attribute; // may need to use other form for attributes @@ -606,6 +578,33 @@ namespace MxRec { KEY_COUNT_MAP = 13 }; + enum CTRLogLevel { // can't use enum class due to compatibility for AccCTR + DEBUG = 0, + INFO, + WARN, + ERROR, + }; + + static void CTRLog(int level, const char *msg) + { + switch (level) { + case CTRLogLevel::DEBUG: + LOG_DEBUG(msg); + break; + case CTRLogLevel::INFO: + LOG_INFO(msg); + break; + case CTRLogLevel::WARN: + LOG_WARN(msg); + break; + case CTRLogLevel::ERROR: + LOG_ERROR(msg); + break; + default: + break; + } + } + ostream& operator<<(ostream& ss, MxRec::CkptDataType type); bool CheckFilePermission(const string& filePath); } // end namespace MxRec diff --git a/src/core/utils/task_queue.h b/src/core/utils/task_queue.h new file mode 100644 index 00000000..a42e5147 --- /dev/null +++ b/src/core/utils/task_queue.h @@ -0,0 +1,110 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
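The extern factory handle and the CTRLog bridge added to common.h/common.cpp let the dynamically loaded AccCTR library log through MxRec's logger. A sketch of the wiring, assuming an initialization site of this shape exists elsewhere in the series:

    #include "utils/common.h"  // for MxRec::factory and MxRec::CTRLog

    // Sketch: create the factory once and route AccCTR's log lines (mapped by
    // CTRLogLevel) into the LOG_* macros via CTRLog.
    int InitCtrCommon()
    {
        int rc = ock::ctr::Factory::Create(MxRec::factory);
        if (rc != 0) {
            return rc;  // lib_ock_ctr_common.so missing or symbol not found
        }
        return MxRec::factory->SetExternalLogFuncInner(MxRec::CTRLog);
    }
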
diff --git a/src/core/utils/task_queue.h b/src/core/utils/task_queue.h
new file mode 100644
index 00000000..a42e5147
--- /dev/null
+++ b/src/core/utils/task_queue.h
@@ -0,0 +1,110 @@
+/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+ limitations under the License.
+==============================================================================*/
+
+#ifndef TASK_QUEUE_H
+#define TASK_QUEUE_H
+
+#include <condition_variable>
+#include <list>
+#include <mutex>
+#include <utility>
+
+namespace MxRec {
+    namespace Common {
+        template <typename T>
+        class TaskQueue {
+        public:
+            TaskQueue() = default;
+
+            ~TaskQueue() = default;
+
+            TaskQueue(TaskQueue const &other)
+            {
+                std::lock_guard<std::mutex> lk(other.mut);
+                dataQueue = other.dataQueue;
+            }
+
+            TaskQueue &operator=(TaskQueue const &other)
+            {
+                if (this == &other) {
+                    return *this;
+                }
+                std::lock_guard<std::mutex> lk(other.mut);
+                dataQueue = other.dataQueue;
+                return *this;
+            }
+
+            void Pushv(T &t)
+            {
+                std::lock_guard<std::mutex> lk(mut);
+                dataQueue.push_back(std::move(t));
+                dataCond.notify_one();
+            }
+
+            void Pushv(T &&t)
+            {
+                std::lock_guard<std::mutex> lk(mut);
+                dataQueue.emplace_back(t);
+                dataCond.notify_one();
+            }
+
+            T WaitAndPop()
+            {
+                std::unique_lock<std::mutex> lk(mut);
+                dataCond.wait(lk, [this] {
+                    if (!finished) {
+                        return !dataQueue.empty();
+                    } else {
+                        return true;
+                    }
+                });
+                T res;
+                if (finished) {
+                    return std::move(res);
+                }
+                res = std::move(dataQueue.front());
+                dataQueue.pop_front();
+                return std::move(res);
+            }
+
+            void DestroyQueue()
+            {
+                finished = true;
+                dataCond.notify_one();
+            }
+
+            bool Empty() const
+            {
+                std::lock_guard<std::mutex> lk(mut);
+                return dataQueue.empty();
+            }
+
+            size_t Size() const
+            {
+                std::lock_guard<std::mutex> lk(mut);
+                return dataQueue.size();
+            }
+
+        private:
+            mutable std::mutex mut;
+            std::list<T> dataQueue;
+            std::condition_variable dataCond;
+            bool finished = false;
+        };
+    }
+}
+
+
+#endif
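The TaskQueue above is a small mutex-plus-condition-variable blocking queue. A minimal usage sketch, assuming only the header added by this patch (the payload type, task strings, and include path are illustrative, not part of the patch):

    #include <iostream>
    #include <string>
    #include <thread>

    #include "utils/task_queue.h"  // path relative to src/core, as laid out above

    int main()
    {
        MxRec::Common::TaskQueue<std::string> queue;

        // Consumer: WaitAndPop() blocks until a producer pushes an item.
        std::thread consumer([&queue] {
            for (int i = 0; i < 2; ++i) {
                std::cout << "got task: " << queue.WaitAndPop() << std::endl;
            }
        });

        queue.Pushv(std::string("lookup batch 0"));  // rvalue overload
        std::string task = "lookup batch 1";
        queue.Pushv(task);                           // lvalue overload; note it moves from `task`
        consumer.join();

        // DestroyQueue() wakes any thread still blocked in WaitAndPop(),
        // which then returns a default-constructed T.
        queue.DestroyQueue();
        return 0;
    }

One behavior worth noting: once DestroyQueue() sets `finished`, WaitAndPop() returns a default-constructed value even if items remain queued, so callers need their own drain-before-shutdown convention.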
diff --git a/src/pybind/module_main.cpp b/src/pybind/module_main.cpp
index 351d19a4..767cf4e0 100644
--- a/src/pybind/module_main.cpp
+++ b/src/pybind/module_main.cpp
@@ -138,7 +138,7 @@ namespace {
     {
         pybind11::class_(m, "EmbInfo")
             .def(pybind11::init,
-                std::vector<InitializeInfo>&, std::vector<std::string>&>(),
+                std::vector<EmbCache::InitializerInfo>&, std::vector<std::string>&>(),
                 py::arg("embInfoParams"),
                 py::arg("vocab_size"),
                 py::arg("initialize_infos"),
@@ -176,36 +176,38 @@ namespace {
 
     void GetInitializeInfo(pybind11::module_ &m)
     {
-        pybind11::class_<InitializeInfo>(m, "InitializeInfo")
-            .def(py::init(), py::arg("name"), py::arg("start"),
-                py::arg("len"), py::arg("constant_initializer_info"))
-            .def(py::init(), py::arg("name"), py::arg("start"),
-                py::arg("len"), py::arg("normal_initializer_info"))
-            .def_readwrite("name", &InitializeInfo::name)
-            .def_readwrite("start", &InitializeInfo::start)
-            .def_readwrite("len", &InitializeInfo::len)
-            .def_readwrite("ConstantInitializerInfo", &InitializeInfo::constantInitializerInfo)
-            .def_readwrite("NormalInitializerInfo", &InitializeInfo::normalInitializerInfo);
+        pybind11::class_<EmbCache::InitializerInfo>(m, "InitializeInfo")
+            .def(py::init(),
+                py::arg("name"), py::arg("start"), py::arg("len"), py::arg("constant_initializer_info"))
+            .def(py::init(),
+                py::arg("name"), py::arg("start"), py::arg("len"), py::arg("normal_initializer_info"))
+            .def_readwrite("name", &EmbCache::InitializerInfo::name)
+            .def_readwrite("start", &EmbCache::InitializerInfo::start)
+            .def_readwrite("len", &EmbCache::InitializerInfo::len)
+            .def_readwrite("ConstantInitializerInfo", &EmbCache::InitializerInfo::constantInitializerInfo)
+            .def_readwrite("NormalInitializerInfo", &EmbCache::InitializerInfo::normalInitializerInfo);
     }
 
     void GetConstantInitializerInfo(pybind11::module_ &m)
     {
-        pybind11::class_<ConstantInitializerInfo>(m, "ConstantInitializerInfo")
-            .def(py::init(), py::arg("constant_val") = 0, py::arg("initK") = 1.0)
-            .def_readwrite("constant_val", &ConstantInitializerInfo::constantValue)
-            .def_readwrite("initK", &ConstantInitializerInfo::initK);
+        pybind11::class_<EmbCache::ConstantInitializerInfo>(m, "ConstantInitializerInfo")
+            .def(py::init(), py::arg("constant_val") = 0, py::arg("initK") = 1.0)
+            .def_readwrite("constant_val", &EmbCache::ConstantInitializerInfo::constantValue)
+            .def_readwrite("initK", &EmbCache::ConstantInitializerInfo::initK);
     }
 
     void GetNormalInitializerInfo(pybind11::module_ &m)
     {
-        pybind11::class_<NormalInitializerInfo>(m, "NormalInitializerInfo")
-            .def(py::init(), py::arg("mean") = 0.0,
-                py::arg("stddev") = 1.0, py::arg("seed") = 0,
-                py::arg("initK") = 1.0)
-            .def_readwrite("mean", &NormalInitializerInfo::mean)
-            .def_readwrite("stddev", &NormalInitializerInfo::stddev)
-            .def_readwrite("seed", &NormalInitializerInfo::seed)
-            .def_readwrite("initK", &NormalInitializerInfo::initK);
+        pybind11::class_<EmbCache::NormalInitializerInfo>(m, "NormalInitializerInfo")
+            .def(py::init(),
+                py::arg("mean") = 0.0,
+                py::arg("stddev") = 1.0,
+                py::arg("seed") = 0,
+                py::arg("initK") = 1.0)
+            .def_readwrite("mean", &EmbCache::NormalInitializerInfo::mean)
+            .def_readwrite("stddev", &EmbCache::NormalInitializerInfo::stddev)
+            .def_readwrite("seed", &EmbCache::NormalInitializerInfo::seed)
+            .def_readwrite("initK", &EmbCache::NormalInitializerInfo::initK);
     }
 
     void GetHybridMgmt(pybind11::module_& m)
@@ -220,6 +222,7 @@ namespace {
             py::arg("warm_start_tables") = vector {})
         .def("destroy", &MxRec::HybridMgmt::Destroy)
         .def("evict", &MxRec::HybridMgmt::Evict)
+        .def("fetch_device_emb", &MxRec::HybridMgmt::FetchDeviceEmb)
         .def("send", &MxRec::HybridMgmt::SendHostMap, py::arg("table_name") = "")
         .def("send_load_offset", &MxRec::HybridMgmt::SendLoadMap, py::arg("table_name") = "")
         .def("receive", &MxRec::HybridMgmt::ReceiveHostMap, py::arg("key_offset_map"))
diff --git a/src/tests/checkpoint/checkpoint_test.cpp b/src/tests/checkpoint/checkpoint_test.cpp
index ad7bf34d..8d296363 100644
--- a/src/tests/checkpoint/checkpoint_test.cpp
+++ b/src/tests/checkpoint/checkpoint_test.cpp
@@ -143,7 +143,7 @@ protected:
         }
     }
 
-    void SetDDRKeyFreqMap(unordered_map<emb_key_t, freq_num_t>& testDDRKeyFreqMap)
+    void SetDDRKeyFreqMap(unordered_map<emb_cache_key_t, freq_num_t>& testDDRKeyFreqMap)
     {
         for (int64_t i { 0 }; i < hostVocabSize; ++i) {
             testDDRKeyFreqMap[featMem] = i;
@@ -159,7 +159,7 @@ protected:
         }
     }
 
-    void SetExcludeDDRKeyFreqMap(unordered_map<emb_key_t, freq_num_t>& testExcludeDDRKeyFreqMap)
+    void SetExcludeDDRKeyFreqMap(unordered_map<emb_cache_key_t, freq_num_t>& testExcludeDDRKeyFreqMap)
     {
         for (int64_t i { 0 }; i < hostVocabSize; ++i) {
             testExcludeDDRKeyFreqMap[featMem] = i;
@@ -169,7 +169,7 @@ protected:
 
     void SetDDRKeyFreqMaps(KeyFreqMemT& testDDRKeyFreqMaps)
     {
-        unordered_map<emb_key_t, freq_num_t> testDDRKeyFreqMap;
+        unordered_map<emb_cache_key_t, freq_num_t> testDDRKeyFreqMap;
         for (const auto& testEmbInfo : testEmbInfos) {
             SetDDRKeyFreqMap(testDDRKeyFreqMap);
             testDDRKeyFreqMaps[testEmbInfo.name] = std::move(testDDRKeyFreqMap);
@@ -187,7 +187,7 @@ protected:
 
     void SetExcludeDDRKeyFreqMaps(KeyFreqMemT& testExcludeDDRKeyFreqMaps)
     {
-        unordered_map<emb_key_t, freq_num_t> testExcludeDDRKeyFreqMap;
+        unordered_map<emb_cache_key_t, freq_num_t> testExcludeDDRKeyFreqMap;
         for (const auto& testEmbInfo : testEmbInfos) {
             SetExcludeDDRKeyFreqMap(testExcludeDDRKeyFreqMap);
             testExcludeDDRKeyFreqMaps[testEmbInfo.name] = std::move(testExcludeDDRKeyFreqMap);
diff --git a/src/tests/emb_hashmap/emb_hashmap_test.cpp
b/src/tests/emb_hashmap/emb_hashmap_test.cpp deleted file mode 100644 index ac2f1583..00000000 --- a/src/tests/emb_hashmap/emb_hashmap_test.cpp +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. -==============================================================================*/ - -#include -#include - -#include "emb_hashmap/emb_hashmap.h" -#include "hybrid_mgmt/hybrid_mgmt_block.h" -#include "ssd_cache/cache_manager.h" -#include "utils/common.h" - -using namespace std; -using namespace MxRec; -using namespace testing; - -const int HBM_VOCAB_SIZE = 10; -const int DDR_VOCAB_SIZE = 100; -const int SSD_VOCAB_SIZE = 100; -const int INT_2 = 2; -const int INT_4 = 4; -const int INT_21 = 21; -const int INT_42 = 42; -const int NEGATIVE_INT_1 = -1; - -// 刷新换入换出频次和打印信息 -void RefreshSwapFreqInfoAndPrint(EmbHashMap& hostHashMaps, string embTableName, int opTimes) -{ - auto& embHashMap = hostHashMaps.embHashMaps[embTableName]; - hostHashMaps.RefreshFreqInfoWithSwap(embTableName, embHashMap); - vector hbm2DdrKeyList; - vector ddr2HbmKeyList; - for (auto it : embHashMap.oldSwap) { - hbm2DdrKeyList.emplace_back(it.first); - ddr2HbmKeyList.emplace_back(it.second); - } - LOG_INFO("embHashMap hbm2DdrKeyList: {}", VectorToString(hbm2DdrKeyList)); - LOG_INFO("embHashMap ddr2HbmKeyList: {}", VectorToString(ddr2HbmKeyList)); - embHashMap.oldSwap.clear(); - LOG_INFO("RefreshSwapFreqInfoAndPrint end, opTimes:{}", opTimes); -} - -vector GetEmbInfoList() -{ - EmbInfo embInfo; - embInfo.name = "table1"; - embInfo.devVocabSize = HBM_VOCAB_SIZE; - embInfo.hostVocabSize = DDR_VOCAB_SIZE; - embInfo.ssdVocabSize = SSD_VOCAB_SIZE; - embInfo.ssdDataPath = {"ssd_data"}; - vector embInfos; - embInfos.emplace_back(embInfo); - return embInfos; -} - -// 测试HBM与DDR换入换出时CacheManager模块频次刷新 -TEST(EmbHashMap, TestFindOffset) -{ - LOG_INFO("start TestFindOffset"); - string embTableName = "table1"; - EmbHashMap hostHashMaps; - RankInfo rankInfo; - rankInfo.isDDR = true; - auto embInfo = GetEmbInfoList(); - hostHashMaps.Init(rankInfo, embInfo, false); - CacheManager cacheManager; - cacheManager.Init(nullptr, embInfo); - bool isSSDEnabled = true; - hostHashMaps.isSSDEnabled = isSSDEnabled; - hostHashMaps.cacheManager = &cacheManager; - int channelId = 0; - size_t currentBatchId = 0; - size_t keepBatchId = 0; - int opTimes = 0; - - vector keys = {1, 2, 3, 4, 5}; - hostHashMaps.FindOffset(embTableName, keys, currentBatchId++, keepBatchId++, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes++); - - vector keys2 = {6, 7, 8, 9, 10}; - hostHashMaps.FindOffset(embTableName, keys2, currentBatchId++, keepBatchId++, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes++); - - auto& excludeKeyMap = cacheManager.excludeDDRKeyCountMap[embTableName]; - auto& ddrKeyMap = cacheManager.ddrKeyFreqMap[embTableName]; - - auto logLevelTemp = Logger::GetLevel(); - Logger::SetLevel(Logger::TRACE); - vector keys4 = {21, 21, 21, 21}; // 
新key重复值, 且需要换入换出 - hostHashMaps.FindOffset(embTableName, keys4, currentBatchId++, keepBatchId++, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes++); - ASSERT_EQ(excludeKeyMap[INT_21], INT_4); - ASSERT_EQ(ddrKeyMap.Get(1), 1); - - keys4 = {41, 42, 43, 44, 45, 46, 47, 48, 49, 50}; // 整个hbm大小key换入换出 - hostHashMaps.FindOffset(embTableName, keys4, currentBatchId++, keepBatchId++, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes++); - ASSERT_EQ(ddrKeyMap.Get(INT_21), INT_4); - - keys4 = {51, 52, 53, 1, 2, 21, 41, 42, 43, 44}; // 3个新key, 3个在ddr, 4个在hbm - hostHashMaps.FindOffset(embTableName, keys4, currentBatchId, keepBatchId, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes); - ASSERT_EQ(excludeKeyMap[1], INT_2); - ASSERT_EQ(excludeKeyMap[INT_42], INT_2); - ASSERT_EQ(ddrKeyMap.Get(INT_21), NEGATIVE_INT_1); - ASSERT_EQ(ddrKeyMap.Get(1), NEGATIVE_INT_1); - Logger::SetLevel(logLevelTemp); // 恢复日志级别 - LOG_INFO("test TestFindOffset end."); -} - -TEST(EmbHashMap, TESTGetHashMaps) -{ - string embTableName = "table1"; - EmbHashMap hostHashMaps; - RankInfo rankInfo; - rankInfo.isDDR = true; - auto embInfo = GetEmbInfoList(); - hostHashMaps.Init(rankInfo, embInfo, false); - CacheManager cacheManager; - cacheManager.Init(nullptr, embInfo); - hostHashMaps.isSSDEnabled = true; - hostHashMaps.cacheManager = &cacheManager; - int channelId = 0; - size_t currentBatchId = 0; - size_t keepBatchId = 0; - int opTimes = 0; - - vector keys = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - hostHashMaps.FindOffset(embTableName, keys, currentBatchId++, keepBatchId++, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes++); - auto testEmbHashMap = hostHashMaps.GetHashMaps().at(embTableName); - hostHashMaps.embHashMaps.at(embTableName).maxOffsetOld = testEmbHashMap.maxOffset; - // 增加10个key, offset长度变为10 - ASSERT_EQ(testEmbHashMap.maxOffset, 10); - - keys = {11, 12, 13, 14, 15, 16, 17, 18, 19, 20}; - hostHashMaps.FindOffset(embTableName, keys, currentBatchId++, keepBatchId++, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes++); - testEmbHashMap = hostHashMaps.GetHashMaps().at(embTableName); - // 再增加10个key,offset变为20 - ASSERT_EQ(testEmbHashMap.maxOffset, 20); - - HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); - hybridMgmtBlock->lastRunChannelId = channelId; - hybridMgmtBlock->hybridBatchId[0] = 1; - testEmbHashMap = hostHashMaps.GetHashMaps().at(embTableName); - // 回退一步,offset变回10 - ASSERT_EQ(testEmbHashMap.maxOffset, 10); - - hybridMgmtBlock->hybridBatchId[0] = 2; - // 回退2步,抛出异常 - ASSERT_THROW(hostHashMaps.GetHashMaps(), HybridMgmtBlockingException); - hybridMgmtBlock->hybridBatchId[0] = 0; - - keys = {10, 11}; - hostHashMaps.EvictDeleteEmb(embTableName, keys); - testEmbHashMap = hostHashMaps.GetHashMaps().at(embTableName); - // 淘汰1个hbm key和1个ddr key,表中无法查找到该key - ASSERT_EQ(testEmbHashMap.hostHashMap.find(10), testEmbHashMap.hostHashMap.end()); - ASSERT_EQ(testEmbHashMap.hostHashMap.find(11), testEmbHashMap.hostHashMap.end()); - ASSERT_EQ(cacheManager.excludeDDRKeyCountMap[embTableName][11], 0); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].Get(10), -1); - - keys = {1, 2}; - hostHashMaps.FindOffset(embTableName, keys, currentBatchId++, keepBatchId++, channelId); - RefreshSwapFreqInfoAndPrint(hostHashMaps, embTableName, opTimes++); - testEmbHashMap = hostHashMaps.GetHashMaps().at(embTableName); - // 从ddr中换回2个key到hbm,交换变量长度为2 - ASSERT_EQ(testEmbHashMap.ddr2HbmKeys.size(), 2); - 
hostHashMaps.ClearLookupAndSwapOffset(hostHashMaps.embHashMaps.at(embTableName)); - testEmbHashMap = hostHashMaps.GetHashMaps().at(embTableName); - // 清理后,交换变量长度为0 - ASSERT_EQ(testEmbHashMap.ddr2HbmKeys.size(), 0); -} \ No newline at end of file diff --git a/src/tests/emb_mgmt/emb_mgmt_test.cpp b/src/tests/emb_mgmt/emb_mgmt_test.cpp index e47f3b4f..4924abf1 100644 --- a/src/tests/emb_mgmt/emb_mgmt_test.cpp +++ b/src/tests/emb_mgmt/emb_mgmt_test.cpp @@ -15,7 +15,6 @@ See the License for the specific language governing permissions and #include #include "hybrid_mgmt/hybrid_mgmt.h" -#include "host_emb/host_emb.h" #include "utils/common.h" using namespace std; @@ -62,30 +61,6 @@ protected: string constantInitializerName = "constant_initializer"; int nBatch = 10; - void UpdateEmb(vector &missingKeysHostPos, int channelId, const string &embName, - std::unique_ptr &hostEmb, vector &d2h_emb) - { - LOG_INFO(HD + "update emb start"); - if (d2h_emb.size() == 0) { - LOG_INFO(HD + "emb is none channelId:{}", channelId); - return; - } - - auto tensorPtr = d2h_emb[0].flat().data(); - for (size_t i = 0; i < missingKeysHostPos.size(); i++) { - (hostEmb->GetEmb(embName).embData[missingKeysHostPos[i]]).assign( - tensorPtr, - tensorPtr + hostEmb->GetEmb(embName).hostEmbInfo.extEmbeddingSize); - tensorPtr = tensorPtr + hostEmb->GetEmb(embName).hostEmbInfo.extEmbeddingSize; - } - for (size_t i = 0; i < hostEmb->GetEmb(embName).embData.size(); ++i) { - LOG_INFO("hostEmb: embName {}, {} is: {}", embName, i, - VectorToString(hostEmb->GetEmb(embName).embData[i])); - } - LOG_INFO(HD + "update emb end"); - d2h_emb.clear(); - } - bool Float2TensorVec(const vector>& Datas, vector& tensors) { tensors.clear(); @@ -116,63 +91,6 @@ protected: // delete } }; -#ifndef GTEST -TEST_F(EmbMgmtTest, Initialize) -{ - vector vocabsize = { devVocabSize, hostVocabSize }; - aoto param = EmbInfoParams(name, sendCount, embeddingSize, extEmbeddingSize, isSave) - embInfo = EmbInfo(param, vocabsize, initializeInfos); - embInfos.emplace_back(embInfo); - vector thresholdValues = {}; - - auto hybridMgmt = Singleton::GetInstance(); - cout << "setup..." 
<< endl; - - allRank = RankInfo(GlogConfig::gRankId, deviceId, localRankSize, useStatic, nBatch, maxStep); - hybridMgmt->Initialize(allRank, embInfos, seed, thresholdValues, false); - auto hostEmbs = make_unique(); - hostEmbs->Initialize(embInfos, seed); - auto hostHashMaps = make_unique(); - hostHashMaps->Init(allRank, embInfos, false); - - int currentBatchId = 0; - vector lookupKeys = { 1, 3, 5, 7 }; - vector d2h_emb; - vector> tmpDatas; - vector tmpData; - hostHashMaps->Process(embInfo.name, lookupKeys, currentBatchId, tmpData); - auto missingKeys = hostHashMaps->embHashMaps[embInfo.name].missingKeysHostPos; - LOG_INFO("missingKeys {}", missingKeys); - hostEmbs->EmbDataGenerator(initializeInfos, seed, missingKeys.size(), embeddingSize, tmpDatas); - auto status = Float2TensorVec(tmpDatas, d2h_emb); - ASSERT_EQ(status, true); - UpdateEmb(missingKeys, 0, embInfo.name, hostEmbs, d2h_emb); - hostHashMaps->embHashMaps[embInfo.name].missingKeysHostPos.clear(); - - lookupKeys = { 2, 3, 5, 6 }; - hostHashMaps->Process(embInfo.name, lookupKeys, currentBatchId, tmpData); - missingKeys = hostHashMaps->embHashMaps[embInfo.name].missingKeysHostPos; - LOG_INFO("missingKeys {}", missingKeys); - hostEmbs->EmbDataGenerator(initializeInfos, seed, missingKeys.size(), embeddingSize, tmpDatas); - status = Float2TensorVec(tmpDatas, d2h_emb); - ASSERT_EQ(status, true); - UpdateEmb(missingKeys, 0, embInfo.name, hostEmbs, d2h_emb); - hostHashMaps->embHashMaps[embInfo.name].missingKeysHostPos.clear(); - - lookupKeys = { 1, 7, 9, 10 }; - hostHashMaps->Process(embInfo.name, lookupKeys, currentBatchId, tmpData); - missingKeys = hostHashMaps->embHashMaps[embInfo.name].missingKeysHostPos; - LOG_INFO("missingKeys {}", missingKeys); - hostEmbs->EmbDataGenerator(initializeInfos, seed, missingKeys.size(), embeddingSize, tmpDatas); - Float2TensorVec(tmpDatas, d2h_emb); - status = Float2TensorVec(tmpDatas, d2h_emb); - ASSERT_EQ(status, true); - UpdateEmb(missingKeys, 0, embInfo.name, hostEmbs, d2h_emb); - hostHashMaps->embHashMaps[embInfo.name].missingKeysHostPos.clear(); - - hybridMgmt->Destroy(); -} -#endif #ifndef GTEST TEST_F(EmbMgmtTest, Initialize_HBM) diff --git a/src/tests/emb_table/embedding_ddr_test.cpp b/src/tests/emb_table/embedding_ddr_test.cpp index 374a1392..ddad3905 100644 --- a/src/tests/emb_table/embedding_ddr_test.cpp +++ b/src/tests/emb_table/embedding_ddr_test.cpp @@ -24,7 +24,6 @@ See the License for the specific language governing permissions and #include "utils/common.h" #include "emb_table/emb_table.h" #include "emb_table/embedding_ddr.h" -#include "host_emb/host_emb.h" using namespace std; using namespace MxRec; @@ -37,7 +36,7 @@ protected: { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); std::vector vocabsize = {100}; - std::vector initializeInfos = {}; + vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; embInfo_ = EmbInfo(embParam, vocabsize, initializeInfos, ssdDataPath); @@ -75,79 +74,6 @@ protected: */ TEST_F(EmbeddingDDRTest, SaveLoadEmbeddingData) { - vector embInfos = {embInfo_}; - HostEmb* hostEmbs = Singleton::GetInstance(); - hostEmbs->Initialize(embInfos, 0); - HostEmbTable& table = hostEmbs->GetEmb("test1"); - - vector tmp1 {1.1, 2.1, 3.1}; - vector tmp2 {1.2, 2.2, 3.2}; - vector tmp3 {1.3, 2.3, 3.3}; - vector> testData; - testData.push_back(tmp1); - testData.push_back(tmp2); - testData.push_back(tmp3); - - for (vector& tmp : testData) { - table.embData.push_back(tmp); - } - - shared_ptr ddr1 = std::make_shared(embInfo_, 
rankInfo_, 0); - shared_ptr ddr2 = std::make_shared(embInfo_, rankInfo_, 0); - ddr1->Save("test_dir"); - // 修改成0 - for (vector& tmp: table.embData) { - for (float& t : tmp) { - t = 0; - } - } - bool fileExist = false; - if (access("./test_dir/test1/embedding", F_OK) == 0) { - fileExist = true; - } - EXPECT_EQ(fileExist, true); -} - -/** - * 测试基本查找 - */ -TEST_F(EmbeddingDDRTest, DDRBasic) -{ - shared_ptr table = std::make_shared(embInfo_, rankInfo_, 0); - const size_t testNum = 100; - vector testKeys; - vector testSwap; - for (size_t i = 0; i < testNum; ++i) { - testKeys.push_back(i); - } - table->FindOffset(testKeys, 0, TRAIN_CHANNEL_ID, testSwap); - EXPECT_EQ(testKeys.size(), 100); - EXPECT_EQ(testSwap.size(), 0); -} - -TEST_F(EmbeddingDDRTest, evict) -{ - shared_ptr table = std::make_shared(embInfo_, rankInfo_, 0); - const size_t testNum = 100; - vector testKeys; - vector testSwap; - for (size_t i = 0; i < testNum; ++i) { - testKeys.push_back(i); - } - table->FindOffset(testKeys, 0, TRAIN_CHANNEL_ID, testSwap); - table->EvictKeys(testKeys); - EXPECT_EQ(table->evictDevPos.size(), 100); - EXPECT_EQ(testKeys.size(), 100); - EXPECT_EQ(testSwap.size(), 0); -} - -TEST_F(EmbeddingDDRTest, FindSwap) -{ - shared_ptr table = std::make_shared(embInfo_, rankInfo_, 0); - const size_t testNum = 100; - vector testSwap; - table->FindSwapPosOld(0, 0, 0, testSwap); - EXPECT_EQ(testSwap.size(), 1); } TEST_F(EmbeddingDDRTest, EvictDeleteEmb) diff --git a/src/tests/emb_table/embedding_mgmt_test.cpp b/src/tests/emb_table/embedding_mgmt_test.cpp index 9374b078..49f10b4f 100644 --- a/src/tests/emb_table/embedding_mgmt_test.cpp +++ b/src/tests/emb_table/embedding_mgmt_test.cpp @@ -36,7 +36,7 @@ protected: { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); std::vector vocabsize = {100}; - std::vector initializeInfos = {}; + vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; embInfo_ = EmbInfo(embParam, vocabsize, initializeInfos, ssdDataPath); @@ -75,7 +75,7 @@ TEST_F(EmbeddingMgmtTest, Init) ThresholdValue thvalue(tableName, 0, 0, 0, false); vector embInfos = {embInfo_}; vector thresholds = {thvalue}; - EmbeddingMgmt::Instance()->Init(rankInfo_, embInfos, thresholds, 0); + EmbeddingMgmt::Instance()->Init(rankInfo_, embInfos, 0); constexpr int testNum = 100; vector testKeys; @@ -95,7 +95,7 @@ TEST_F(EmbeddingMgmtTest, GetAttributes) ThresholdValue thvalue(tableName, 0, 0, 0, false); vector embInfos = {embInfo_}; vector thresholds = {thvalue}; - EmbeddingMgmt::Instance()->Init(rankInfo_, embInfos, thresholds, 0); + EmbeddingMgmt::Instance()->Init(rankInfo_, embInfos, 0); constexpr int testNum = 100; vector testKeys; diff --git a/src/tests/emb_table/embedding_static_test.cpp b/src/tests/emb_table/embedding_static_test.cpp index 09e72ca0..c8a5e252 100644 --- a/src/tests/emb_table/embedding_static_test.cpp +++ b/src/tests/emb_table/embedding_static_test.cpp @@ -35,7 +35,7 @@ protected: { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); std::vector vocabsize = {100}; - std::vector initializeInfos = {}; + vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; embInfo_ = EmbInfo(embParam, vocabsize, initializeInfos, ssdDataPath); @@ -136,7 +136,8 @@ TEST_F(EmbeddingStaticTest, Key2OffsetEvict) } table->Key2Offset(testData, TRAIN_CHANNEL_ID); // 全部淘汰 - table->EvictKeys(testData); + vector testDataAdapt(testData.cbegin(), testData.cend()); + table->EvictKeys(testDataAdapt); vector new_data; for (size_t i = 
0; i < testNum; ++i) { diff --git a/src/tests/file_system/hdfs_file_system_test.cpp b/src/tests/file_system/hdfs_file_system_test.cpp index 3794d14d..0d469ca5 100644 --- a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -17,7 +17,6 @@ See the License for the specific language governing permissions and #include #include "file_system/file_system_handler.h" -#include "file_system/hdfs_file_system/hdfs_file_system.h" #include "file_system/hdfs_file_system/hdfs_wrapper.h" using namespace std; diff --git a/src/tests/file_system/local_file_system_test.cpp b/src/tests/file_system/local_file_system_test.cpp index dfe5d483..2ea0d9d3 100644 --- a/src/tests/file_system/local_file_system_test.cpp +++ b/src/tests/file_system/local_file_system_test.cpp @@ -16,7 +16,6 @@ See the License for the specific language governing permissions and #include #include "file_system/file_system_handler.h" -#include "file_system/local_file_system/local_file_system.h" using namespace std; using namespace MxRec; @@ -42,10 +41,10 @@ TEST(LocalFileSystem, WriteAndReadFile) TEST(LocalFileSystem, WriteEmbedding) { string filePath = "./write.data"; - float p[5] = {1.1, 2.2, 3.3, 4.4, 5.5}; - vector writeData = {p, p+1, p+2, p+3, p+4}; + vector writeData = {1.1, 2.2, 3.3, 4.4, 5.5}; + vector> writeData1 = {writeData}; auto fileSystemHandler = make_unique(); auto fileSystemPtr = fileSystemHandler->Create(filePath); - ssize_t res = fileSystemPtr->Write(filePath, writeData, sizeof(float)); + ssize_t res = fileSystemPtr->Write(filePath, writeData1, sizeof(float)); ASSERT_EQ(writeData.size() * sizeof(float), res); } diff --git a/src/tests/host_emb/host_emb_test.cpp b/src/tests/host_emb/host_emb_test.cpp deleted file mode 100644 index 05a636d9..00000000 --- a/src/tests/host_emb/host_emb_test.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. 
-==============================================================================*/ - -#include -#include - -#include "host_emb/host_emb.h" -#include "tensorflow/core/framework/tensor.h" -#include "hd_transfer/hd_transfer.h" -#include "utils/singleton.h" - -using namespace std; -using namespace tensorflow; -using namespace MxRec; - -namespace { -bool operator==(const Tensor& tensor1, const Tensor& tensor2) -{ - if (tensor1.shape() != tensor2.shape()) { - return false; - } - auto tensor1_data = tensor1.flat(); - auto tensor2_data = tensor2.flat(); - for (int j = 0; j < tensor1_data.size(); j++) { - if (tensor1_data(j) != tensor2_data(j)) { - return false; - } - } - return true; -} - -bool operator==(const vector& p1, const vector& p2) -{ - if (p1.size() != p2.size()) { - return false; - } - for (int i = 0; i>> lookups; - vector host_emb; - host_emb.resize(15); - vector> p(5, vector(3)); - host_emb[0] = 1; - host_emb[1] = 3; - std::cout << host_emb[0] << std::endl; - for (int i = 0; i < 5; i++) { - p[i].assign(host_emb.begin() + i * 3, host_emb.begin() + (i + 1) * 3); - } - std::cout << p[0][0] << std::endl; - std::cout << '5' << std::endl; - vector q; - std::cout << '0' << std::endl; - for (int i = 0; i < 2; i++) { - Tensor tmpTensor(tensorflow::DT_INT32, { 3 }); - std::cout << '1' << std::endl; - auto tmpData = tmpTensor.flat(); - std::cout << '2' << std::endl; - for (int j = 0; j < 3; j++) { - tmpData(j) = p[i][j]; - std::cout << '3' << std::endl; - } - - q.emplace_back(tmpTensor); - std::cout << '4' << std::endl; - } - std::cout << '1' << std::endl; - std::cout << q[0].flat()(0) << std::endl; - std::cout << q[0].flat()(1) << std::endl; - std::cout << q[1].flat()(0) << std::endl; - ASSERT_EQ(1, 1); -} - -TEST(HostEmb, DefaultConstructor) -{ - HostEmb h; - h.procThreadsForTrain.emplace_back(make_unique([] {})); - h.Join(TRAIN_CHANNEL_ID); - ASSERT_EQ(h.procThreadsForTrain.size(), 0); - - h.procThreadsForEval.emplace_back(make_unique([] {})); - h.Join(EVAL_CHANNEL_ID); - ASSERT_EQ(h.procThreadsForEval.size(), 0); -} - -} \ No newline at end of file diff --git a/src/tests/key_process/feature_admit_and_evict_test.cpp b/src/tests/key_process/feature_admit_and_evict_test.cpp index 09cadc7f..dffce96c 100644 --- a/src/tests/key_process/feature_admit_and_evict_test.cpp +++ b/src/tests/key_process/feature_admit_and_evict_test.cpp @@ -248,7 +248,7 @@ protected: currTime = time(nullptr); if (currTime - lastTime >= SleepTime::SLEEP_SECOND_4) { LOG_INFO("Evict-thread doing at currTime[{}] ...", currTime); - map> evictPosMap {}; + map> evictPosMap {}; faae.FeatureEvict(evictPosMap); lastTime = currTime; } @@ -258,7 +258,7 @@ protected: } void WaitEvictThread() { - map> evictPosMap {}; + map> evictPosMap {}; faae.FeatureEvict(evictPosMap); // 退出前保证执行了一次“淘汰” isExitFlag = true; if (evictThr.joinable()) { diff --git a/src/tests/key_process/key_process_test.cpp b/src/tests/key_process/key_process_test.cpp index a68f4787..fb2be40b 100644 --- a/src/tests/key_process/key_process_test.cpp +++ b/src/tests/key_process/key_process_test.cpp @@ -30,7 +30,6 @@ using namespace MxRec; using namespace testing; static constexpr size_t BATCH_NUM_EACH_THREAD = 3; -ock::ctr::FactoryPtr factory; class SimpleThreadPool { public: @@ -46,17 +45,6 @@ public: } }; -static void CTRLog(int level, const char *msg) -{ - switch (level) { - case 0: - LOG_DEBUG(msg); - break; - default: - break; - } -} - class KeyProcessTest : public testing::Test { protected: void SetUp() @@ -352,7 +340,7 @@ TEST_F(KeyProcessTest, Start) 
ASSERT_EQ(process.Start(), 0); setenv("keyProcessThreadNum", "abc", 1); ASSERT_EQ(process.Start(), 0); - CTRLog(0, "key process start successful"); + LOG_INFO("key process start successful"); process.Destroy(); } diff --git a/src/tests/ssd_cache/cache_manager_test.cpp b/src/tests/ssd_cache/cache_manager_test.cpp index 677939d2..7cb5e032 100644 --- a/src/tests/ssd_cache/cache_manager_test.cpp +++ b/src/tests/ssd_cache/cache_manager_test.cpp @@ -18,11 +18,9 @@ See the License for the specific language governing permissions and #include #include "absl/container/flat_hash_map.h" -#include "host_emb/host_emb.h" #include "ssd_cache/lfu_cache.h" #include "ssd_cache/cache_manager.h" #include "utils/common.h" -#include "emb_table/embedding_ddr.h" using namespace std; using namespace MxRec; @@ -39,10 +37,10 @@ void InitSSDEngine(CacheManager& manager, string embTableName, uint64_t ssdSize) manager.ssdEngine->SetCompactPeriod(period); manager.ssdEngine->SetCompactThreshold(1); manager.ssdEngine->CreateTable(embTableName, {SSD_SAVE_PATH}, ssdSize); - vector ssdKeys = {15, 25}; // 预设15, 25存储在SSD + vector ssdKeys = {15, 25}; // 预设15, 25存储在SSD std::vector> ssdEmbData = {{15.0f}, {25.0f}}; - auto& excludeMap = manager.excludeDDRKeyCountMap[embTableName]; + auto& excludeMap = manager.preProcessMapper[embTableName].excludeDDRKeyCountMap; excludeMap[15] = 3; // 初始化次数 excludeMap[25] = 5; manager.ssdEngine->InsertEmbeddings(embTableName, ssdKeys, ssdEmbData); @@ -94,7 +92,7 @@ protected: LFUCache cache2; cacheManager.ddrKeyFreqMap[embTableName2] = cache2; PutKeyInfo(cacheManager.ddrKeyFreqMap[embTableName2], input_keys); - unordered_map excludeDDRKeyFreq; + unordered_map excludeDDRKeyFreq; excludeDDRKeyFreq[27] = 10; excludeDDRKeyFreq[30] = 10; cacheManager.excludeDDRKeyCountMap[embTableName] = excludeDDRKeyFreq; @@ -105,14 +103,13 @@ protected: InitDDREmbData(loadData, embTableName, mgmtEmbInfos); InitDDREmbData(loadData, embTableName2, mgmtEmbInfos); - cacheManager.Init(hEmb, mgmtEmbInfos); + ock::ctr::EmbCacheManagerPtr embCachePtr = nullptr; + + cacheManager.Init(embCachePtr, mgmtEmbInfos); InitSSDEngine(cacheManager, embTableName, 5); InitSSDEngine(cacheManager, embTableName2, 10); // load ddr emb data - cacheManager.hostEmbs->hostEmbs = loadData; - - auto& embMap = cacheManager.hostEmbs->hostEmbs; } CacheManager cacheManager; @@ -126,49 +123,12 @@ protected: vector input_keys = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 6, 6, 8, 9}; string embTableName = "table1"; string embTableName2 = "table2"; - HostEmb* hEmb = Singleton::GetInstance(); void TearDown() { } }; -TEST_F(CacheManagerTest, RefreshFreqInfo) -{ - vector ddr2HbmKeys = {8, 9}; - cacheManager.RefreshFreqInfoCommon(embTableName, ddr2HbmKeys, TransferType::DDR_2_HBM); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].minFreq, 2); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].keyTable.size(), 5); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].freqTable.size(), 2); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].Get(8), -1); - ASSERT_EQ(cacheManager.excludeDDRKeyCountMap[embTableName].size(), 6); - - // HBM转移到DDR 频次数据设置构造 - cacheManager.excludeDDRKeyCountMap[embTableName][150] = 4; - cacheManager.excludeDDRKeyCountMap[embTableName][151] = 1; - vector hbm2DdrKeys = {150, 151}; - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].Get(151), -1); - cacheManager.RefreshFreqInfoCommon(embTableName, hbm2DdrKeys, TransferType::HBM_2_DDR); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].Get(150), 4); - 
ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].Get(151), 1); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].minFreq, 1); - ASSERT_EQ(cacheManager.excludeDDRKeyCountMap[embTableName].size(), 6); - - vector ddr2EvictKeys = {151}; - cacheManager.RefreshFreqInfoCommon(embTableName, ddr2EvictKeys, TransferType::DDR_2_EVICT); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].Get(151), -1); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].freqTable.size(), 3); - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].minFreq, 2); - - // HBM2Evict - cacheManager.excludeDDRKeyCountMap[embTableName][160] = 1; - vector hbm2EvictKeys = {160}; - cacheManager.RefreshFreqInfoCommon(embTableName, hbm2EvictKeys, TransferType::HBM_2_EVICT); - const auto it = cacheManager.excludeDDRKeyCountMap[embTableName].find(160); - ASSERT_EQ(it, cacheManager.excludeDDRKeyCountMap[embTableName].end()); - LOG_INFO("test RefreshFreqInfo end."); -} - TEST_F(CacheManagerTest, PutKey) { vector putDDRKeys = {1, 9, 8, 15}; @@ -191,193 +151,17 @@ TEST_F(CacheManagerTest, IsKeyInSSD) LOG_INFO("test IsKeyInSSD end."); } -TEST_F(CacheManagerTest, TransferDDREmbWithSSDByEmptyExternalKey) -{ - EmbeddingDDR table; - - vector currentKeys = {55, 65, 75}; - table.keyOffsetMap[55] = 119; - table.keyOffsetMap[65] = 118; - table.keyOffsetMap[75] = 116; - - TableInfo ti = table.GetTableInfo(); - - auto ret = cacheManager.TransferDDREmbWithSSD(ti, currentKeys, TRAIN_CHANNEL_ID); - ASSERT_EQ(ret, TransferRet::TRANSFER_OK); - LOG_INFO("test TransferDDREmbWithSSDByEmptyExternalKey end."); -} - -TEST_F(CacheManagerTest, TransferDDREmbWithSSDByAllProcess) -{ - vector ssdKeys = {15, 25}; - vector> ssdKeyEmbInfo = {{1.5f}, {2.5f}}; - - // init EmbeddingDDR - EmbeddingDDR table; - table.name = embTableName; - table.devVocabSize = 20; - table.hostVocabSize = 100; - table.maxOffset = 118; - table.evictHostPos.emplace_back(110); // 淘汰列表 - - TableInfo ti = table.GetTableInfo(); - - // 构造已经存储早DDR中key和offset对应关系; DDR的offset在映射表中范围是 20~119 - table.keyOffsetMap[9] = 117; // DDR中相对位置: 97 - table.keyOffsetMap[8] = 116; // DDR中相对位置: 96 - table.keyOffsetMap[6] = 114; // DDR中相对位置: 94 - table.keyOffsetMap[4] = 112; // DDR中相对位置: 92 - table.keyOffsetMap[3] = 111; // DDR中相对位置: 91 - table.keyOffsetMap[2] = 21; // DDR中相对位置: 1 - table.keyOffsetMap[1] = 20; // DDR中相对位置: 0 - - // 检查构造数据正确性 - auto& embMap = cacheManager.hostEmbs->hostEmbs; - const auto& it = embMap.find(embTableName); - auto& hostData = it->second.embData; - ASSERT_TRUE(fabs(hostData[0][0] - 1.0f) < EPSILON); - ASSERT_TRUE(fabs(hostData[1][0] - 2.0f) < EPSILON); - ASSERT_TRUE(fabs(hostData[94][0] - 6.0f) < EPSILON); - ASSERT_TRUE(fabs(hostData[97][0] - 9.0f) < EPSILON); - auto& excludeKeyCountMap = cacheManager.excludeDDRKeyCountMap[embTableName]; - ASSERT_EQ(excludeKeyCountMap[15], 3); - ASSERT_EQ(excludeKeyCountMap[25], 5); - ASSERT_FALSE(cacheManager.ssdEngine->IsKeyExist(embTableName, 9)); - ASSERT_FALSE(cacheManager.ssdEngine->IsKeyExist(embTableName, 8)); - ASSERT_TRUE(cacheManager.IsKeyInSSD(embTableName, 15)); - - // externalKeys: SSD(15, 25) + newKey(55, 65, 75) - // 训练场景,构造结果:offsetAvailableSize=20+100-118+evictPos.size()=3 - // cacheManager中的频次数据(低-高): 9 8 6 4 3 2 1 - // 构造空间超出SSD可用上限 - vector exceedKeys = {15, 25, 6, 4, 55, 65, 75, 85, 95, 105, 115}; - auto spaceError1 = cacheManager.TransferDDREmbWithSSD(ti, exceedKeys, TRAIN_CHANNEL_ID); - ASSERT_EQ(spaceError1, TransferRet::SSD_SPACE_NOT_ENOUGH); - - // 构造训练+超SSD可用+当前批次中不包含报错在SSD的key - vector keys2 = {6, 4, 55, 65, 75, 85, 95, 
105, 115, 125, 135}; - auto spaceError2 = cacheManager.TransferDDREmbWithSSD(ti, exceedKeys, TRAIN_CHANNEL_ID); - ASSERT_EQ(spaceError2, TransferRet::SSD_SPACE_NOT_ENOUGH); - - // 构造当前批次key 存储位置: SSD(15, 25) DDR(6, 4) newKey(55, 65, 75) - vector currentKeys = {15, 25, 6, 4, 55, 65, 75}; - // 需要从ddr转移4个key到ssd, 低频数据中6 4在当前批次key中,不会被转移,构造的数据转移key:9, 8, 3, 2 - auto ret = cacheManager.TransferDDREmbWithSSD(ti, currentKeys, TRAIN_CHANNEL_ID); - - // 检查处理后数据正确性 - ASSERT_EQ(ret, TransferRet::TRANSFER_OK); - ASSERT_TRUE(fabs(hostData[94][0] - 6.0f) < EPSILON); // DDR内未移动的数据 - ASSERT_TRUE(fabs(hostData[96][0] - 25.0f) < EPSILON); // SSD转移到DDR的数据 - ASSERT_TRUE(fabs(hostData[97][0] - 15.0f) < EPSILON); // SSD转移到DDR的数据 - ASSERT_EQ(table.evictHostPos.size(), 1); - ASSERT_EQ(table.evictHostPos.back(), 110); - - // 原DDR中最小频次key(9,8)次数(1)被转移到SSD,SSD转移到DDR的key(15,25)次数(3,5), DDR内频次索引应变为2 - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].minFreq, 2); - ASSERT_TRUE(cacheManager.IsKeyInSSD(embTableName, 9)); - ASSERT_TRUE(cacheManager.IsKeyInSSD(embTableName, 8)); - ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, 15)); - LOG_INFO("test TransferDDREmbWithSSDByAllProcess end."); -} - -TEST_F(CacheManagerTest, TransferDDREmbWithSSDByEmptyExternalSSDKey) -{ - // 训练+评估:构造DDR剩余空间足够,externalSSDKeys为空 - EmbeddingDDR table; - table.name = embTableName; - table.devVocabSize = 20; - table.hostVocabSize = 100; - table.keyOffsetMap[6] = 114; // DDR中相对位置: 94 - table.keyOffsetMap[4] = 112; // DDR中相对位置: 92 - // 剩余3个可用空间(DDR剩余2个, 相对位置:98 99; DDR淘汰列表1个) - table.maxOffset = 118; - table.evictHostPos.emplace_back(110); - - TableInfo ti = table.GetTableInfo(); - - vector currentKeys = {6, 4, 55, 65, 75}; - auto ret = cacheManager.TransferDDREmbWithSSD(ti, currentKeys, TRAIN_CHANNEL_ID); - ASSERT_EQ(ret, TransferRet::TRANSFER_OK); - auto retByEval = cacheManager.TransferDDREmbWithSSD(ti, currentKeys, EVAL_CHANNEL_ID); - ASSERT_EQ(retByEval, TransferRet::TRANSFER_OK); - - // 评估场景, DDR剩余空间不足, externalSSDKeys为空 - vector currentKeys2 = {6, 4, 55, 65, 75, 85, 95, 105, 115}; - auto ret2 = cacheManager.TransferDDREmbWithSSD(ti, currentKeys2, EVAL_CHANNEL_ID); - ASSERT_EQ(ret2, TransferRet::TRANSFER_OK); - // 训练场景,返回ssd空间不足 - auto ret3 = cacheManager.TransferDDREmbWithSSD(ti, currentKeys2, TRAIN_CHANNEL_ID); - ASSERT_EQ(ret3, TransferRet::SSD_SPACE_NOT_ENOUGH); - LOG_INFO("test TransferDDREmbWithSSDByEmptyExternalSSDKey end."); -} - -TEST_F(CacheManagerTest, TransferDDREmbWithSSDByEval) -{ - // 评估+DDR剩余空间足够+externalSSDKeys为空 - EmbeddingDDR table; - table.name = embTableName; - table.devVocabSize = 20; - table.hostVocabSize = 100; - table.keyOffsetMap[9] = 117; // DDR中相对位置: 97 - table.keyOffsetMap[8] = 116; // DDR中相对位置: 96 - table.keyOffsetMap[6] = 114; // DDR中相对位置: 94 - table.keyOffsetMap[4] = 112; // DDR中相对位置: 92 - // 剩余3个可用空间(DDR剩余2个, 相对位置:98 99; DDR淘汰列表1个) - table.maxOffset = 118; - table.evictHostPos.emplace_back(110); // 淘汰列表 - - TableInfo ti = table.GetTableInfo(); - - vector currentKeys = {6, 4, 55, 65, 75}; - auto ret = cacheManager.TransferDDREmbWithSSD(ti, currentKeys, EVAL_CHANNEL_ID); - ASSERT_EQ(ret, TransferRet::TRANSFER_OK); - LOG_INFO("test eval+space enough+externalSSDKeysEmpty ok."); - - // 评估+DDR剩余空间足够+externalSSDKeys非空 - vector currentKeys2 = {15, 25, 6, 4, 55, 65, 75, 85, 95, 105, 115}; - auto ret2 = cacheManager.TransferDDREmbWithSSD(ti, currentKeys2, EVAL_CHANNEL_ID); - ASSERT_EQ(ret2, TransferRet::TRANSFER_OK); - // 检查处理后数据正确性 - const auto& it = cacheManager.hostEmbs->hostEmbs.find(embTableName); - auto& 
hostData = it->second.embData; - ASSERT_TRUE(fabs(hostData[94][0] - 6.0f) < EPSILON); // DDR内未移动的数据 - ASSERT_TRUE(fabs(hostData[98][0] - 25.0f) < EPSILON); // SSD转移到DDR的数据 - ASSERT_TRUE(fabs(hostData[90][0] - 15.0f) < EPSILON); // SSD转移到DDR的数据 - ASSERT_EQ(table.evictHostPos.size(), 0); - // 原DDR中最小频次key(9,8)次数(1)被转移到SSD,SSD转移到DDR的key(15,25)次数(3,5), DDR内频次索引应变为2 - ASSERT_EQ(cacheManager.ddrKeyFreqMap[embTableName].minFreq, 1); - ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, 9)); - ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, 8)); - ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, 15)); - LOG_INFO("test eval+space enough+externalSSDKeysNotEmpty ok."); -} - -TEST_F(CacheManagerTest, TransferDDREmbWithSSDByDDRSpaceNotEnough) -{ - // 构造DDR所有空间不满足存放当前批次数据 - EmbeddingDDR table; - table.name = embTableName2; - table.devVocabSize = 20; - table.hostVocabSize = 10; - table.maxOffset = 30; - table.keyOffsetMap[6] = 9; - table.keyOffsetMap[4] = 8; - - TableInfo ti = table.GetTableInfo(); - - // keys size:10, ddr keys:2 externalKeys:8 externalSSDKeys:0 - vector currentKeys = {6, 4, 101, 102, 103, 104, 105, 106, 107, 108}; - auto ret = cacheManager.TransferDDREmbWithSSD(ti, currentKeys, TRAIN_CHANNEL_ID); - ASSERT_EQ(ret, TransferRet::DDR_SPACE_NOT_ENOUGH); - LOG_INFO("test train+ddr space enough+externalSSDKeysEmpty ok."); -} - TEST_F(CacheManagerTest, EvictSSDEmbedding) { // 构造时ssd中已存在的key: 15 25 - emb_key_t key = 15; - vector ssdKeys = {key}; + emb_cache_key_t key = 15; + vector ssdKeys = {key}; cacheManager.EvictSSDEmbedding(embTableName, ssdKeys); + int maxLoop = 1000; + while (!cacheManager.ssdEvictThreads.empty() && maxLoop > 0) { + this_thread::sleep_for(1ms); + maxLoop--; + } ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, key)); const auto it = cacheManager.excludeDDRKeyCountMap[embTableName].find(key); ASSERT_EQ(it, cacheManager.excludeDDRKeyCountMap[embTableName].end()); @@ -386,31 +170,4 @@ TEST_F(CacheManagerTest, EvictSSDEmbedding) TEST_F(CacheManagerTest, LoadTest) { - cacheManager.ddrKeyFreqMap.clear(); - cacheManager.excludeDDRKeyCountMap.clear(); - unordered_map> ddrMap; - string embTableName = "table1"; - unordered_map ddrTableMap; - ddrTableMap.emplace(1, 3); - ddrTableMap.emplace(2, 3); - ddrTableMap.emplace(3, 3); - ddrTableMap.emplace(4, 2); - ddrTableMap.emplace(6, 2); - ddrTableMap.emplace(8, 1); - ddrTableMap.emplace(9, 1); - ddrMap.emplace(embTableName, ddrTableMap); - unordered_map> excludeDdrMap; - unordered_map excludeDdrTableMap; - excludeDdrTableMap.emplace(15, 1); - excludeDdrTableMap.emplace(25, 5); - excludeDdrMap.emplace(embTableName, excludeDdrTableMap); - cacheManager.Load(ddrMap, excludeDdrMap, 0, 1, 0); - // 数据检查 - auto& ddrKeyFreqMap = cacheManager.ddrKeyFreqMap; - auto& excludeDDRKeyCountMap = cacheManager.excludeDDRKeyCountMap; - ASSERT_EQ(ddrKeyFreqMap[embTableName].minFreq, 1); - ASSERT_EQ(ddrKeyFreqMap[embTableName].freqTable.size(), 3); - ASSERT_EQ(ddrKeyFreqMap[embTableName].Get(2), 3); - ASSERT_EQ(ddrKeyFreqMap[embTableName].Get(12), -1); - ASSERT_EQ(excludeDDRKeyCountMap[embTableName][25], 5); } \ No newline at end of file diff --git a/src/tests/ssd_cache/lfu_cache_test.cpp b/src/tests/ssd_cache/lfu_cache_test.cpp index 1adf4aad..7f8a7820 100644 --- a/src/tests/ssd_cache/lfu_cache_test.cpp +++ b/src/tests/ssd_cache/lfu_cache_test.cpp @@ -31,7 +31,7 @@ using namespace testing; */ vector INPUT_KEYS = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 6, 6, 8, 9}; -inline void CompareHandleRet(vector& leastFreqKeys, vector& leastFreq, +inline void 
CompareHandleRet(vector& leastFreqKeys, vector& leastFreq, vector& expectKeys, vector& expectFreq) { @@ -81,8 +81,8 @@ TEST(LFUCache, PutInitTest) cache.PutWithInit(6, 2); cache.PutWithInit(8, 1); cache.PutWithInit(9, 1); - vector retainedKeys = {4, 6}; - vector leastFreqKeys; + vector retainedKeys = {4, 6}; + vector leastFreqKeys; vector leastFreq; cache.GetAndDeleteLeastFreqKeyInfo(2, retainedKeys, leastFreqKeys, leastFreq); vector expectKeys = {9, 8}; @@ -95,8 +95,8 @@ TEST(LFUCache, LFUDeleteTotalFreqListTest) { LFUCache cache; PutKeys(cache, INPUT_KEYS); - vector retainedKeys = {4, 6, 8, 9}; - vector leastFreqKeys; + vector retainedKeys = {4, 6, 8, 9}; + vector leastFreqKeys; vector leastFreq; cache.GetAndDeleteLeastFreqKeyInfo(2, retainedKeys, leastFreqKeys, leastFreq); vector expectKeys = {3, 2}; @@ -108,8 +108,8 @@ TEST(LFUCache, BaseCacheTest) { LFUCache cache; PutKeys(cache, INPUT_KEYS); - vector retainedKeys = {8, 4, 6, 2}; - vector leastFreqKeys; + vector retainedKeys = {8, 4, 6, 2}; + vector leastFreqKeys; vector leastFreq; cache.GetAndDeleteLeastFreqKeyInfo(2, retainedKeys, leastFreqKeys, leastFreq); vector expectKeys = {9, 3}; @@ -120,5 +120,5 @@ TEST(LFUCache, BaseCacheTest) cache.Put(9); ASSERT_EQ(cache.Get(9), 1); cache.Put(9); - ASSERT_EQ(cache.minFreq, 2); + ASSERT_EQ(cache.minFreq, 1); } diff --git a/src/tests/ssd_engine/engine_test.cpp b/src/tests/ssd_engine/engine_test.cpp index aad64a99..be57ad2f 100644 --- a/src/tests/ssd_engine/engine_test.cpp +++ b/src/tests/ssd_engine/engine_test.cpp @@ -47,9 +47,9 @@ TEST(SSDEngine, CreateAndWriteAndReadAndAutoCompactAndSave) ASSERT_EQ(eng->IsTableExist(tbName), true); // write - vector keys; + vector keys; vector> embeddings; - for (emb_key_t k = 0; k < 10; k++) { + for (emb_cache_key_t k = 0; k < 10; k++) { keys.emplace_back(k); vector emb = {static_cast(k + 0.1), static_cast(k + 0.2)}; embeddings.emplace_back(emb); @@ -64,7 +64,7 @@ TEST(SSDEngine, CreateAndWriteAndReadAndAutoCompactAndSave) ASSERT_EQ(eng->GetTableAvailableSpace(tbName), maxTableSize - keys.size()); // delete and wait auto compact - vector deleteKeys = {0}; + vector deleteKeys = {0}; eng->DeleteEmbeddings(tbName, deleteKeys); this_thread::sleep_for(compactPeriod); @@ -124,9 +124,9 @@ TEST(SSDEngine, LoadAndRead) engSave->CreateTable(tbName, savePath, maxTableSize); // write - vector keys; + vector keys; vector> embeddings; - for (emb_key_t k = 0; k < 10; k++) { + for (emb_cache_key_t k = 0; k < 10; k++) { keys.emplace_back(k); vector emb = {static_cast(k + 0.1), static_cast(k + 0.2)}; embeddings.emplace_back(emb); @@ -141,7 +141,7 @@ TEST(SSDEngine, LoadAndRead) shared_ptr engLoad = make_shared(); engLoad->Start(); engLoad->Load(tbName, savePath, maxTableSize, saveStep); - for (emb_key_t k: keys) { + for (emb_cache_key_t k: keys) { ASSERT_EQ(engLoad->IsKeyExist(tbName, k), true); } auto ret = engLoad->FetchEmbeddings(tbName, keys); diff --git a/src/tests/ssd_engine/file_test.cpp b/src/tests/ssd_engine/file_test.cpp index 599b5975..cdd80fc5 100644 --- a/src/tests/ssd_engine/file_test.cpp +++ b/src/tests/ssd_engine/file_test.cpp @@ -100,9 +100,9 @@ TEST(File, WriteAndRead) string savePath = GlogConfig::gRankId; auto f = make_shared(0, savePath); - vector keys; + vector keys; vector> embeddings; - for (emb_key_t k = 0; k < 10; k++) { + for (emb_cache_key_t k = 0; k < 10; k++) { keys.emplace_back(k); vector emb = {static_cast(k + 0.1), static_cast(k + 0.2)}; embeddings.emplace_back(emb); @@ -129,7 +129,7 @@ TEST(File, SaveAndLoad) string fileDir = 
GlogConfig::gRankId; auto fTmp = make_shared(0, fileDir); - vector key = {0}; + vector key = {0}; vector> expect = {{1.0, 1.1}}; fTmp->InsertEmbeddings(key, expect); string saveDir = fileDir; // for test convenience @@ -142,3 +142,40 @@ TEST(File, SaveAndLoad) fs::remove_all(fileDir); } + +TEST(File, WriteByAddrAndRead) +{ + int rankId; + MPI_Comm_rank(MPI_COMM_WORLD, &rankId); + GlogConfig::gRankId = to_string(rankId); + + string savePath = GlogConfig::gRankId; + auto f = make_shared(0, savePath); + + vector keys; + vector embeddings; + uint64_t extEmbeddingSize = 1; + for (emb_cache_key_t k = 0; k < 10; k++) { + keys.emplace_back(k); + float* emb = new float; + *emb = static_cast(k + 0.1); + embeddings.emplace_back(emb); + } + + f->InsertEmbeddingsByAddr(keys, embeddings, extEmbeddingSize); + auto ret = f->FetchEmbeddings(keys); + for (int i = 0; i < 10; i++) { + if (std::abs(ret[i][0] - *embeddings[i]) > std::numeric_limits::epsilon()) { + FAIL() << "embedding result not equal to input"; + } + } + + for (auto emb : embeddings) + { + delete emb; + emb = nullptr; + } + + + fs::remove_all(savePath); +} \ No newline at end of file diff --git a/src/tests/ssd_engine/table_test.cpp b/src/tests/ssd_engine/table_test.cpp index 2e180c13..20a66f2f 100644 --- a/src/tests/ssd_engine/table_test.cpp +++ b/src/tests/ssd_engine/table_test.cpp @@ -41,13 +41,13 @@ TEST(Table, WriteAndReadAndDeleteAndCompact) // write emb_key_t nData = 1000000; emb_key_t batchSize = 10000; - vector allKeys; + vector allKeys; vector> allEmbs; - vector batchKeys; + vector batchKeys; vector> batchEmbs; chrono::milliseconds writeCost = 0ms; - for (emb_key_t k = 0; k < nData; k++) { + for (emb_cache_key_t k = 0; k < nData; k++) { vector emb; emb.resize(embDim); for (uint64_t i = 0; i < embDim; ++i) { @@ -122,9 +122,9 @@ TEST(Table, SaveAndLoad) // write and save emb_key_t nData = 10; - vector keys; + vector keys; vector> embs; - for (emb_key_t k = 0; k < nData; k++) { + for (emb_cache_key_t k = 0; k < nData; k++) { vector emb = {static_cast(k + 0.1), static_cast(k + 0.2)}; keys.emplace_back(k); embs.emplace_back(emb); @@ -160,7 +160,7 @@ TEST(Table, GetTableUsage) // write uint64_t expectKeyCnt = 2; - vector keys = {1, 2}; + vector keys = {1, 2}; vector> embs = {{0.1}, {0.2}}; tbSave->InsertEmbeddings(keys, embs); diff --git a/src/tests/utils/common_h_test.cpp b/src/tests/utils/common_h_test.cpp index 2e86b88d..bf089198 100644 --- a/src/tests/utils/common_h_test.cpp +++ b/src/tests/utils/common_h_test.cpp @@ -113,12 +113,6 @@ TEST(TestHostEmbTable, DefaultConstructor) MxRec::HostEmbTable hostEmbTable; } -// 测试 EmbHashMapInfo 结构的默认构造函数 -TEST(TestEmbHashMapInfo, DefaultConstructor) -{ - MxRec::EmbHashMapInfo embHashMapInfo; -} - // 测试 All2AllInfo 结构的默认构造函数 TEST(TestAll2AllInfo, DefaultConstructor) { diff --git a/tests/mx_rec/core/test_build_graph.py b/tests/mx_rec/core/test_build_graph.py index 5360f908..5a24fd74 100644 --- a/tests/mx_rec/core/test_build_graph.py +++ b/tests/mx_rec/core/test_build_graph.py @@ -21,6 +21,7 @@ from unittest import mock import tensorflow as tf from mx_rec.util.global_env_conf import global_env +from mx_rec.core.asc.build_graph import SwapInfo from tests.mx_rec.core.mock_class import MockConfigInitializer @@ -134,10 +135,12 @@ class TestGetIdOffsetsFunc(unittest.TestCase): with tf.Graph().as_default(): mock_get_next.return_value = [0] - id_offsets, swap_pos, swap_len = get_id_offsets(self.max_lookup_vec_size, self.config) + id_offsets, swap_info = get_id_offsets(self.max_lookup_vec_size, self.config) 
self.assertEqual(id_offsets, 0) - self.assertListEqual(swap_pos, []) - self.assertEqual(swap_len, 0) + self.assertListEqual(swap_info.swap_in_pos, []) + self.assertEqual(swap_info.swap_in_len, 0) + self.assertListEqual(swap_info.swap_out_pos, []) + self.assertEqual(swap_info.swap_out_len, 0) @mock.patch("mx_rec.core.asc.build_graph.npu_ops.gen_npu_ops.get_next") def test_get_id_offsets_case2(self, mock_get_next): @@ -150,10 +153,12 @@ class TestGetIdOffsetsFunc(unittest.TestCase): with tf.Graph().as_default(): self.config["use_dynamic_expansion"] = False mock_get_next.return_value = [0] - id_offsets, swap_pos, swap_len = get_id_offsets(self.max_lookup_vec_size, self.config) + id_offsets, swap_info = get_id_offsets(self.max_lookup_vec_size, self.config) self.assertEqual(id_offsets, 0) - self.assertListEqual(swap_pos, []) - self.assertEqual(swap_len, 0) + self.assertListEqual(swap_info.swap_in_pos, []) + self.assertEqual(swap_info.swap_in_len, 0) + self.assertListEqual(swap_info.swap_out_pos, []) + self.assertEqual(swap_info.swap_out_len, 0) class TestGetAll2allArgsFunc(unittest.TestCase): @@ -217,7 +222,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), - get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), + get_id_offsets=mock.MagicMock(return_value=[0, SwapInfo()]), get_all2all_args=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case1(self, build_graph_config_initializer): @@ -236,7 +241,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), - get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), + get_id_offsets=mock.MagicMock(return_value=[0, SwapInfo()]), get_all2all_args=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case2(self, build_graph_config_initializer): @@ -255,7 +260,7 @@ class TestGetPreProcessedTensorForAscFunc(unittest.TestCase): @mock.patch.multiple("mx_rec.core.asc.build_graph", get_restore_vector=mock.MagicMock(return_value=[0, 0]), - get_id_offsets=mock.MagicMock(return_value=[0, 0, 0]), + get_id_offsets=mock.MagicMock(return_value=[0, SwapInfo]), get_all2all_args=mock.MagicMock(return_value=0)) @mock.patch("mx_rec.core.asc.build_graph.ConfigInitializer") def test_get_preprocessed_tensor_for_asc_case3(self, build_graph_config_initializer): diff --git a/tests/mx_rec/saver/test_saver.py b/tests/mx_rec/saver/test_saver.py index c0436a72..bcfa0948 100644 --- a/tests/mx_rec/saver/test_saver.py +++ b/tests/mx_rec/saver/test_saver.py @@ -41,6 +41,7 @@ class TestSaver(unittest.TestCase): @mock.patch.multiple("mx_rec.saver.saver", get_rank_id=mock.MagicMock(return_value=0), + get_rank_size=mock.MagicMock(return_value=1), get_local_rank_size=mock.MagicMock(return_value=1)) @mock.patch("mx_rec.saver.saver.ConfigInitializer") def test_save_and_load_is_consistent(self, saver_config_initializer): diff --git a/tests/run_python_dt.sh b/tests/run_python_dt.sh old mode 100644 new mode 100755 index 139e7ff7..475fd788 --- a/tests/run_python_dt.sh +++ b/tests/run_python_dt.sh @@ -36,7 +36,7 @@ cd - # set environment variable export PYTHONPATH="${TOP_PATH}"/output:$PYTHONPATH -export LD_LIBRARY_PATH="${TOP_PATH}"/output:/usr/local/lib:$LD_LIBRARY_PATH +export 
LD_LIBRARY_PATH="${TOP_PATH}"/output:/usr/local/lib:"${TOP_PATH}"/mx_rec/libasc:$LD_LIBRARY_PATH rm -rf result mkdir -p result -- Gitee From 41698044eff1712808d44d4231f31f4343e0e76d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 27 May 2024 14:49:19 +0800 Subject: [PATCH 165/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/CMakeLists.txt | 2 +- src/test_ut.sh | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 84505d15..757745a8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,7 +56,7 @@ else () message("==EASY_PROFILER_FOUND===") ADD_DEFINITIONS(-DBUILD_WITH_EASY_PROFILER) endif () -set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -ffunction-sections -O0 -Wall -g2 -ggdb") +set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -ffunction-sections -O0 -Wall -g2 -ggdb -fsanitize=address -fsanitize-recover=address,all") set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -ffunction-sections -O3 -Wfatal-errors -DNDEBUG -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -s") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack") diff --git a/src/test_ut.sh b/src/test_ut.sh index 6146aaab..cc163baf 100644 --- a/src/test_ut.sh +++ b/src/test_ut.sh @@ -38,6 +38,15 @@ opensource_path="${ROOT_DIR}"/../opensource acc_ctr_path="${ROOT_DIR}"/src/AccCTR export LD_LIBRARY_PATH="${acc_ctr_path}"/output/ock_ctr_common/lib:$LD_LIBRARY_PATH +# config asan report dir and environment variable +if [ ! -d asan_report ]; then + mkdir -p asan_report +else + rm -rf ./asan_report/* +fi +export ASAN_OPTIONS=halt_on_error=0:detect_leaks=1:log_path="${CUR_DIR}"/asan_report/asan.log + + function prepare_googletest(){ cd ${opensource_path} if [ ! -d googletest-release-1.8.1 ]; then -- Gitee From d74b737e4b24517e557b9551ef1115ae1a08f6f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 27 May 2024 16:00:35 +0800 Subject: [PATCH 166/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/test_ut.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/test_ut.sh b/src/test_ut.sh index cc163baf..20c6898a 100644 --- a/src/test_ut.sh +++ b/src/test_ut.sh @@ -38,13 +38,8 @@ opensource_path="${ROOT_DIR}"/../opensource acc_ctr_path="${ROOT_DIR}"/src/AccCTR export LD_LIBRARY_PATH="${acc_ctr_path}"/output/ock_ctr_common/lib:$LD_LIBRARY_PATH -# config asan report dir and environment variable -if [ ! 
-d asan_report ]; then - mkdir -p asan_report -else - rm -rf ./asan_report/* -fi -export ASAN_OPTIONS=halt_on_error=0:detect_leaks=1:log_path="${CUR_DIR}"/asan_report/asan.log +# config asan environment variable +export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1 function prepare_googletest(){ -- Gitee From 638ea9dba6f305d92a5753e6689ca5c5fd6b62fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 27 May 2024 19:27:38 +0800 Subject: [PATCH 167/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/CMakeLists.txt | 2 +- src/tests/emb_table/embedding_ddr_test.cpp | 2 +- src/tests/emb_table/embedding_mgmt_test.cpp | 2 +- src/tests/emb_table/embedding_static_test.cpp | 2 +- src/tests/file_system/hdfs_file_system_test.cpp | 3 ++- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 757745a8..a5cd76da 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,7 +56,7 @@ else () message("==EASY_PROFILER_FOUND===") ADD_DEFINITIONS(-DBUILD_WITH_EASY_PROFILER) endif () -set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -ffunction-sections -O0 -Wall -g2 -ggdb -fsanitize=address -fsanitize-recover=address,all") +set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -ffunction-sections -O0 -Wall -g2 -ggdb -fsanitize=address -fsanitize-recover=address,all -fno-omit-frame-pointer -fno-stack-protector") set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -ffunction-sections -O3 -Wfatal-errors -DNDEBUG -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -s") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack") diff --git a/src/tests/emb_table/embedding_ddr_test.cpp b/src/tests/emb_table/embedding_ddr_test.cpp index ddad3905..60ec5af6 100644 --- a/src/tests/emb_table/embedding_ddr_test.cpp +++ b/src/tests/emb_table/embedding_ddr_test.cpp @@ -35,7 +35,7 @@ protected: EmbeddingDDRTest() { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); - std::vector vocabsize = {100}; + std::vector vocabsize = {100, 100, 100}; vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; diff --git a/src/tests/emb_table/embedding_mgmt_test.cpp b/src/tests/emb_table/embedding_mgmt_test.cpp index 49f10b4f..055cf5c5 100644 --- a/src/tests/emb_table/embedding_mgmt_test.cpp +++ b/src/tests/emb_table/embedding_mgmt_test.cpp @@ -35,7 +35,7 @@ protected: EmbeddingMgmtTest() { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); - std::vector vocabsize = {100}; + std::vector vocabsize = {100, 100, 100}; vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; diff --git a/src/tests/emb_table/embedding_static_test.cpp b/src/tests/emb_table/embedding_static_test.cpp index c8a5e252..9e250f64 100644 --- a/src/tests/emb_table/embedding_static_test.cpp +++ b/src/tests/emb_table/embedding_static_test.cpp @@ -34,7 +34,7 @@ protected: EmbeddingStaticTest() { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); - std::vector vocabsize = {100}; + std::vector vocabsize = {100, 100, 100}; vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; diff --git a/src/tests/file_system/hdfs_file_system_test.cpp 
b/src/tests/file_system/hdfs_file_system_test.cpp index 0d469ca5..1f94e1c7 100644 --- a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -18,6 +18,7 @@ See the License for the specific language governing permissions and #include "file_system/file_system_handler.h" #include "file_system/hdfs_file_system/hdfs_wrapper.h" +#include "utils/logger.h" using namespace std; using namespace MxRec; @@ -75,7 +76,7 @@ TEST_F(HdfsFileSystemTest, CreateDirFailed) TEST_F(HdfsFileSystemTest, GetFileSize) { - hdfsFileInfo* fileInfo; + auto* fileInfo = new hdfsFileInfo(); EMOCK(&HdfsWrapper::GetPathInfo).stubs().will(returnValue(fileInfo)); string filePath = "hdfs://master:9000/test_dir/"; auto fileSystemHandler = make_unique(); -- Gitee From 83d59d6154b429a97e7abc92109f4bf0a82a6d81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 27 May 2024 19:44:14 +0800 Subject: [PATCH 168/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/file_system/hdfs_file_system_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/file_system/hdfs_file_system_test.cpp b/src/tests/file_system/hdfs_file_system_test.cpp index 1f94e1c7..0a642d44 100644 --- a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -18,7 +18,6 @@ See the License for the specific language governing permissions and #include "file_system/file_system_handler.h" #include "file_system/hdfs_file_system/hdfs_wrapper.h" -#include "utils/logger.h" using namespace std; using namespace MxRec; @@ -77,6 +76,7 @@ TEST_F(HdfsFileSystemTest, CreateDirFailed) TEST_F(HdfsFileSystemTest, GetFileSize) { auto* fileInfo = new hdfsFileInfo(); + fileInfo->mSize = 1; EMOCK(&HdfsWrapper::GetPathInfo).stubs().will(returnValue(fileInfo)); string filePath = "hdfs://master:9000/test_dir/"; auto fileSystemHandler = make_unique(); -- Gitee From 9119986cab1b10efad92ba1563e0a4951a77f26c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 27 May 2024 19:45:53 +0800 Subject: [PATCH 169/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/file_system/hdfs_file_system_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/file_system/hdfs_file_system_test.cpp b/src/tests/file_system/hdfs_file_system_test.cpp index 0a642d44..46fb0753 100644 --- a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -82,5 +82,6 @@ TEST_F(HdfsFileSystemTest, GetFileSize) auto fileSystemHandler = make_unique(); auto fileSystemPtr = fileSystemHandler->Create(filePath); EXPECT_NO_THROW(fileSystemPtr->GetFileSize(filePath)); + delete fileInfo; } -- Gitee From 76b678eab589f75fdf47939c6884ad85ccdbc2d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 27 May 2024 21:10:48 +0800 Subject: [PATCH 170/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= 
=?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/file_system/hdfs_file_system_test.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/tests/file_system/hdfs_file_system_test.cpp b/src/tests/file_system/hdfs_file_system_test.cpp index 46fb0753..3c1a2561 100644 --- a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -75,13 +75,11 @@ TEST_F(HdfsFileSystemTest, CreateDirFailed) TEST_F(HdfsFileSystemTest, GetFileSize) { - auto* fileInfo = new hdfsFileInfo(); - fileInfo->mSize = 1; - EMOCK(&HdfsWrapper::GetPathInfo).stubs().will(returnValue(fileInfo)); + std::unique_ptr fileInfo = std::make_unique(); + EMOCK(&HdfsWrapper::GetPathInfo).stubs().will(returnValue(fileInfo.get())); string filePath = "hdfs://master:9000/test_dir/"; auto fileSystemHandler = make_unique(); auto fileSystemPtr = fileSystemHandler->Create(filePath); EXPECT_NO_THROW(fileSystemPtr->GetFileSize(filePath)); - delete fileInfo; } -- Gitee From d61def25dc0e4a59eb036e01475aac00fb8b4073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 28 May 2024 19:54:42 +0800 Subject: [PATCH 171/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/CMakeLists.txt | 4 +++ src/AccCTR/build/build_test.sh | 3 ++ src/AccCTR/tests/ut/src/CMakeLists.txt | 2 +- src/AccCTR/tests/ut/src/unique_test.cpp | 42 +++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/AccCTR/CMakeLists.txt b/src/AccCTR/CMakeLists.txt index 60e2d638..febf1740 100644 --- a/src/AccCTR/CMakeLists.txt +++ b/src/AccCTR/CMakeLists.txt @@ -73,6 +73,10 @@ elseif (${BUILD_MODE} MATCHES "ut") -Wfloat-equal -Wextra -std=c++17 + -fsanitize=address + -fsanitize-recover=address,all + -fno-omit-frame-pointer + -fstack-protector-all ) else () message(FATAL_ERROR "======BUILD_MODE not found") diff --git a/src/AccCTR/build/build_test.sh b/src/AccCTR/build/build_test.sh index 9441efe3..4001b825 100644 --- a/src/AccCTR/build/build_test.sh +++ b/src/AccCTR/build/build_test.sh @@ -24,6 +24,9 @@ TOOL_FILE="create_fake_id.py" CPU_TYPE=$(arch) BUILD_MODE=$1 +# config asan environment variable +export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1 + create_data() { cd ${TOOL_PATH} diff --git a/src/AccCTR/tests/ut/src/CMakeLists.txt b/src/AccCTR/tests/ut/src/CMakeLists.txt index 3da58244..93f8f6c2 100644 --- a/src/AccCTR/tests/ut/src/CMakeLists.txt +++ b/src/AccCTR/tests/ut/src/CMakeLists.txt @@ -24,7 +24,7 @@ include("${CMAKE_CURRENT_SOURCE_DIR}/../conf/toolchain.cmake") set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../src) set(TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../) -file(GLOB_RECURSE TEST_UNIQUE_FILES *.cpp *.h) +file(GLOB_RECURSE TEST_UNIQUE_FILES unique_test.cpp *.h) add_executable(test_unique_files ${TEST_UNIQUE_FILES}) include_directories(${OCK_CTR_UTIL_INSTALL_DIR}/googletest-release-1.8.1/include) link_directories(${OCK_CTR_UTIL_INSTALL_DIR}/googletest-release-1.8.1/lib64) diff --git a/src/AccCTR/tests/ut/src/unique_test.cpp 
b/src/AccCTR/tests/ut/src/unique_test.cpp index a94ebaf7..fe7d0242 100644 --- a/src/AccCTR/tests/ut/src/unique_test.cpp +++ b/src/AccCTR/tests/ut/src/unique_test.cpp @@ -95,6 +95,13 @@ TEST_F(UniqueTest, Conf) ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 3); // idCntFill空指针 uniqueOut.idCntFill = idCntFill; ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 7); // padding长度过小 + + unique->UnInitialize(); + delete[] idCnt; + delete[] idCntFill; + delete[] uniqueIdCntInBucket; + delete[] uniqueIdInBucket; + std::cout << "===========Conf end=============" << std::endl; } @@ -115,6 +122,9 @@ TEST_F(UniqueTest, usePaddingNoShardingErr) conf.outputType = OutputType::ENHANCED; ASSERT_EQ(unique->Initialize(conf), 9); + + unique->UnInitialize(); + std::cout << "===========usePaddingNoShardingErr end=============" << std::endl; } @@ -132,6 +142,8 @@ TEST_F(UniqueTest, useNegativeDesiredSize) ASSERT_EQ(unique->Initialize(conf), 1); + unique->UnInitialize(); + std::cout << "===========useNegativeDesiredSize end=============" << std::endl; } @@ -404,6 +416,9 @@ TEST_F(UniqueTest, DoEnhancedUniqueErr) ASSERT_EQ(uniqueOut.uniqueIdCnt, (int)idsSet.size()); unique->UnInitialize(); + delete[] uniqueIdInBucket; + delete[] idCnt; + std::cout << "===========DoEnhancedUniqueErr end=============" << std::endl; } @@ -544,6 +559,9 @@ TEST_F(UniqueTest, idCntIsNullSharding) ASSERT_EQ(ret, 3); unique->UnInitialize(); + delete[] uniqueIdCntInBucket; + delete[] uniqueIdInBucket; + std::cout << "===========idCntIsNullSharding end=============" << std::endl; } @@ -620,6 +638,7 @@ TEST_F(UniqueTest, DoUniqueShard) ASSERT_THAT(uniqueIdCntInBucket, testing::ElementsAreArray(expectedUniqueIdCnt)); ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueShard end=============" << std::endl; } @@ -685,6 +704,7 @@ TEST_F(UniqueTest, DoUniqueOnlyShard) ASSERT_THAT(inputId, testing::ElementsAreArray(restoreIds)); ASSERT_THAT(uniqueIdCntInBucket, testing::ElementsAreArray(expectedUniqueIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueOnlyShard end=============" << std::endl; } @@ -769,6 +789,8 @@ TEST_F(UniqueTest, DoUniquePadding) ASSERT_THAT(idCntFill, testing::ElementsAreArray(expectedIdCnt)); ASSERT_EQ(uniqueOut.uniqueIdCnt, conf.paddingSize * conf.shardingNum); unique->UnInitialize(); + delete[] idCnt; + delete[] uniqueIdInBucket; std::cout << "===========DoUniquePadding end=============" << std::endl; } @@ -913,6 +935,7 @@ TEST_F(UniqueTest, DoUniqueShardNumberOversize) ASSERT_THAT(uniqueIdCntInBucket, testing::ElementsAreArray(expectedUniqueIdCnt)); ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueShardNumberOversize end=============" << std::endl; } @@ -981,6 +1004,12 @@ TEST_F(UniqueTest, DoUniqueSpecial) } unique->UnInitialize(); + delete[] uniqueData; + delete[] index; + delete[] idCnt; + delete[] idCntFill; + delete[] uniqueIdCntInBucket; + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueSpecial end=============" << std::endl; } @@ -1020,6 +1049,10 @@ TEST_F(UniqueTest, IdLarge) uniqueOut.idCnt = idCnt; ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 6); // ID太大 + + unique->UnInitialize(); + delete[] idCnt; + std::cout << "===========IdLarge end=============" << std::endl; } @@ -1095,6 +1128,8 @@ TEST_F(UniqueTest, DoUniqueNormalInt32) 
ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; + std::cout << "===========DoUniqueNormalInt32 end=============" << std::endl; } @@ -1228,6 +1263,7 @@ TEST_F(UniqueTest, DoUniqueShardMultipleTimes) ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); } unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueShardMultipleTimes end=============" << std::endl; } @@ -1312,6 +1348,9 @@ TEST_F(UniqueTest, DoUniquePaddingMultipleTimes) } unique->UnInitialize(); + delete[] idCnt; + delete[] uniqueIdInBucket; + std::cout << "===========DoUniquePaddingMultipleTimes end=============" << std::endl; } @@ -1348,6 +1387,9 @@ TEST_F(UniqueTest, IdCntSmall) uniqueOut.idCnt = idCnt; ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 4); // idcnt过小 + + unique->UnInitialize(); + std::cout << "===========IdCntSmall end=============" << std::endl; } -- Gitee From 24129d804f5f85a3fdfa79e139735198eae0d782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 28 May 2024 20:30:54 +0800 Subject: [PATCH 172/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/tests/ut/src/unique_test.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/AccCTR/tests/ut/src/unique_test.cpp b/src/AccCTR/tests/ut/src/unique_test.cpp index fe7d0242..1b663ba9 100644 --- a/src/AccCTR/tests/ut/src/unique_test.cpp +++ b/src/AccCTR/tests/ut/src/unique_test.cpp @@ -1389,6 +1389,7 @@ TEST_F(UniqueTest, IdCntSmall) ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 4); // idcnt过小 unique->UnInitialize(); + delete[] idCnt; std::cout << "===========IdCntSmall end=============" << std::endl; } @@ -1491,6 +1492,7 @@ TEST_F(UniqueTest, DoUniqueLotsDataFunction) ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; if (path) { free(path); } @@ -1599,6 +1601,8 @@ TEST_F(UniqueTest, DoUniqueLotsDataPaddingFunction) unique->UnInitialize(); ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 11); + delete[] idCnt; + delete[] uniqueIdInBucket; if (path) { free(path); } -- Gitee From b85bf288341ac1f0791b8e58728819390d0e1ed4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 28 May 2024 20:37:22 +0800 Subject: [PATCH 173/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/tests/ut/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AccCTR/tests/ut/src/CMakeLists.txt b/src/AccCTR/tests/ut/src/CMakeLists.txt index 93f8f6c2..3da58244 100644 --- a/src/AccCTR/tests/ut/src/CMakeLists.txt +++ b/src/AccCTR/tests/ut/src/CMakeLists.txt @@ -24,7 +24,7 @@ include("${CMAKE_CURRENT_SOURCE_DIR}/../conf/toolchain.cmake") set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../src) set(TOP_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../) -file(GLOB_RECURSE TEST_UNIQUE_FILES unique_test.cpp *.h) +file(GLOB_RECURSE TEST_UNIQUE_FILES *.cpp *.h) 
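A note on the cleanup pattern in the test patches above, offered as a hedged sketch rather than code from the repository: the trailing delete[] calls plug the leaks that LeakSanitizer reports inside the tests themselves, but gtest's fatal ASSERT_* macros return from the test body as soon as they fail, so a delete[] placed after the assertions is skipped on any failing run and that run leaks again. Holding the buffers in an RAII owner releases them on every exit path. The test name, buffer names and sizes below are illustrative assumptions only:

#include <cstddef>
#include <cstdint>
#include <memory>
#include <gtest/gtest.h>

TEST(UniqueSketch, BuffersFreedEvenWhenAnAssertFails)
{
    constexpr std::size_t kBucketLen = 128;
    // unique_ptr<T[]> owns the arrays (value-initialized); the destructor
    // frees them on every return path, including a failing fatal assertion
    auto idCnt = std::make_unique<std::int64_t[]>(kBucketLen);
    auto uniqueIdInBucket = std::make_unique<std::int64_t[]>(kBucketLen);

    uniqueIdInBucket[0] = idCnt[0];
    ASSERT_EQ(uniqueIdInBucket[0], 0);  // even if this failed and returned
                                        // early, both arrays are released
}

Where the API under test needs a raw pointer, idCnt.get() (or a std::vector's data()) hands one over without transferring ownership, so no explicit delete[] is needed at the end of the test body.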
add_executable(test_unique_files ${TEST_UNIQUE_FILES}) include_directories(${OCK_CTR_UTIL_INSTALL_DIR}/googletest-release-1.8.1/include) link_directories(${OCK_CTR_UTIL_INSTALL_DIR}/googletest-release-1.8.1/lib64) -- Gitee From 14e125ff0bcc6a3fea01fc3f05f6208a0e715afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Wed, 29 May 2024 10:36:45 +0800 Subject: [PATCH 174/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/tests/ut/src/unique_test.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/AccCTR/tests/ut/src/unique_test.cpp b/src/AccCTR/tests/ut/src/unique_test.cpp index 1b663ba9..94e8d92c 100644 --- a/src/AccCTR/tests/ut/src/unique_test.cpp +++ b/src/AccCTR/tests/ut/src/unique_test.cpp @@ -219,6 +219,9 @@ TEST_F(UniqueTest, DoUniqueNormal) ASSERT_EQ(uniqueOut.uniqueIdCnt, (int)idsSet.size()); unique->UnInitialize(); + if (path) { + free(path); + } std::cout << "===========DoUniqueNormal end=============" << std::endl; } -- Gitee From cecd7ed0f932f5f54f08d1868fd255976217c943 Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Wed, 29 May 2024 03:38:24 +0000 Subject: [PATCH 175/302] =?UTF-8?q?!160=20=E7=89=B9=E6=80=A7=EF=BC=88?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E4=B8=8E=E5=8A=A0=E8=BD=BD=EF=BC=89=EF=BC=9A?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=97=A0slot=E4=BC=98=E5=8C=96=E5=99=A8?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E5=BC=82=E5=B8=B8=E9=97=AE=E9=A2=98=20*=20?= =?UTF-8?q?=E7=89=B9=E6=80=A7=EF=BC=88=E4=BF=9D=E5=AD=98=E4=B8=8E=E5=8A=A0?= =?UTF-8?q?=E8=BD=BD=EF=BC=89=EF=BC=9A=E4=BF=AE=E5=A4=8D=E6=97=A0slot?= =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=99=A8=E4=BF=9D=E5=AD=98=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 99 +++++++++++++++------------- src/core/emb_table/embedding_ddr.cpp | 10 +++ src/core/hybrid_mgmt/hybrid_mgmt.cpp | 4 +- 3 files changed, 66 insertions(+), 47 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index a91599bc..f9dfd0dc 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -260,52 +260,61 @@ class Saver(object): table_instance0 = self.config_instance.sparse_embed_config.get_table_instance(self.var_list[0]) if table_instance0.is_hbm: - self.config_instance.hybrid_manager_config.save_host_data(root_dir) - if self.config_instance.use_dynamic_expansion: - # Data related to dynamic expansion needs to be saved only on the host side. 
- return - - result = self.save_op_dict - threads = [] - for table_name in result.keys(): - thread = SaveModelThread(self, sess, result, root_dir, table_name) - threads.append(thread) - - for thread in threads: - thread.start() - - for thread in threads: - thread.join() + self._save_hbm(sess, root_dir) else: - # 接受host侧传来的需要swap_out的offset用于更新host侧并保存 - self.config_instance.hybrid_manager_config.fetch_device_emb() - for var in self.var_list: - table_instance = self.config_instance.sparse_embed_config.get_table_instance(var) - table_name = table_instance.table_name - - use_static = ConfigInitializer.get_instance().use_static - max_lookup_vec_size = None - if use_static: - max_lookup_vec_size = table_instance.send_count * self.rank_size - swap_out_pos, swap_out_len = npu_ops.gen_npu_ops.get_next( - output_types=[tf.int32, tf.int32], - output_shapes=[[max_lookup_vec_size], []], - channel_name=f'{table_name}_save_h2d_{TRAIN_CHANNEL_ID}') - if use_static: - swap_out_pos = swap_out_pos[:swap_out_len] - - optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(table_name) - table = [var] + [slot_var for slots in optimizer.values() for slot_var in slots.values()] - - swap_outs = [tf.gather(one_table, swap_out_pos) for one_table in table] - swap_out = tf.concat(swap_outs, axis=1) - channel_name = f'{table_name}_save_d2h_{TRAIN_CHANNEL_ID}' - logger.debug('channel %s was built for op swap_out_op.', channel_name) - swap_out_op = npu_ops.outfeed_enqueue_op(channel_name=channel_name, inputs=[swap_out]) - # 发送host需要的embedding - sess.run(swap_out_op) - self.config_instance.hybrid_manager_config.save_host_data(root_dir) - logger.debug(f"host data was saved.") + self._save_ddr(sess, root_dir) + logger.debug(f"Host data was saved.") + + def _save_hbm(self, sess, root_dir): + self.config_instance.hybrid_manager_config.save_host_data(root_dir) + if self.config_instance.use_dynamic_expansion: + # Data related to dynamic expansion needs to be saved only on the host side. 
+ return + + result = self.save_op_dict + threads = [] + for table_name in result.keys(): + thread = SaveModelThread(self, sess, result, root_dir, table_name) + threads.append(thread) + + for thread in threads: + thread.start() + + for thread in threads: + thread.join() + + def _save_ddr(self, sess, root_dir): + # 接受host侧传来的需要swap_out的offset用于更新host侧并保存 + self.config_instance.hybrid_manager_config.fetch_device_emb() + for var in self.var_list: + table_instance = self.config_instance.sparse_embed_config.get_table_instance(var) + table_name = table_instance.table_name + + use_static = ConfigInitializer.get_instance().use_static + max_lookup_vec_size = None + if use_static: + max_lookup_vec_size = table_instance.send_count * self.rank_size + swap_out_pos, swap_out_len = npu_ops.gen_npu_ops.get_next( + output_types=[tf.int32, tf.int32], + output_shapes=[[max_lookup_vec_size], []], + channel_name=f'{table_name}_save_h2d_{TRAIN_CHANNEL_ID}') + if use_static: + swap_out_pos = swap_out_pos[:swap_out_len] + + table = [var] + optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(table_name) + if optimizer is not None: + for slots in optimizer.values(): + table += list(slots.values()) + + swap_outs = [tf.gather(one_table, swap_out_pos) for one_table in table] + swap_out = tf.concat(swap_outs, axis=1) + channel_name = f'{table_name}_save_d2h_{TRAIN_CHANNEL_ID}' + logger.debug('channel %s was built for op swap_out_op.', channel_name) + swap_out_op = npu_ops.outfeed_enqueue_op(channel_name=channel_name, inputs=[swap_out]) + # 发送host需要的embedding + sess.run(swap_out_op) + self.config_instance.hybrid_manager_config.save_host_data(root_dir) def _get_valid_dict_data(self, dump_data_dict, table_name): host_data = self.config_instance.hybrid_manager_config.get_host_data(table_name) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index caec0229..f069e5c7 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -156,6 +156,11 @@ void EmbeddingDDR::LoadEmbedding(const string &savePath, vector> & void EmbeddingDDR::LoadOptimizerSlot(const string &savePath, vector> &optimizerSlots) { + if (optimParams.size() == 0) { + LOG_DEBUG("optimizer has no slot data to load"); + return; + } + // must init first for (size_t i = 0; i < hostLoadOffset.size(); i++) { vector tmp(extEmbSize_ - embSize_); @@ -293,6 +298,11 @@ void EmbeddingDDR::SaveEmbedding(const string& savePath, vector>& void EmbeddingDDR::SaveOptimizerSlot(const string& savePath, vector>& optimizerSlots, size_t keySize) { + if (optimizerSlots.size() == 0) { + LOG_DEBUG("optimizer has no slot data to save"); + return; + } + if (optimizerSlots.size() != keySize) { string errMsg = StringFormat("optimizer slot data size not equal to key size, " "optimizerSlots.size:%d, keySize:%d", diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 123b2c79..6b998205 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -557,7 +557,7 @@ bool HybridMgmt::IsEvalEndBatch(int batchId) const bool HybridMgmt::ParseKeys(int channelId, int& batchId, TaskType type) { #ifndef GTEST - LOG_INFO(MGMT + "channelId:{} batchId:{}, DDR mode, ParseKeys start.", channelId, batchId); + LOG_INFO(MGMT + "channelId:{} batchId:{}, ParseKeys start.", channelId, batchId); TimeCost parseKeyTC; bool remainBatch = true; // 是否从通道获取了数据 @@ -1328,7 +1328,7 @@ void HybridMgmt::InitEmbeddingCache(const vector& 
embInfos) specialProcessStatus[embInfo.name] = ProcessStatus::NORMAL; // 初始化embedding cache - LOG_INFO("create cache for table:{}, hostVocabSize:{}, embSize:{}, maxCacheSize:{}", + LOG_INFO("create cache for table:{}, hostVocabSize:{}, extEmbeddingSize:{}, maxCacheSize(devVocabSize):{}", embInfo.name, embInfo.hostVocabSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); EmbCache::EmbCacheInfo embCacheInfo(embInfo.name, embInfo.hostVocabSize, embInfo.embeddingSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); -- Gitee From 2781f72ed2815a7c8a1d9d12b0affa3bd0fb6593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Wed, 29 May 2024 17:16:53 +0800 Subject: [PATCH 176/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/test_ut.sh | 7 +++---- src/tests/leaks.supp | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 src/tests/leaks.supp diff --git a/src/test_ut.sh b/src/test_ut.sh index 20c6898a..c7f8d9c0 100644 --- a/src/test_ut.sh +++ b/src/test_ut.sh @@ -38,10 +38,6 @@ opensource_path="${ROOT_DIR}"/../opensource acc_ctr_path="${ROOT_DIR}"/src/AccCTR export LD_LIBRARY_PATH="${acc_ctr_path}"/output/ock_ctr_common/lib:$LD_LIBRARY_PATH -# config asan environment variable -export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1 - - function prepare_googletest(){ cd ${opensource_path} if [ ! -d googletest-release-1.8.1 ]; then @@ -133,6 +129,9 @@ mkdir build cd build python_path="$(dirname "$(dirname "$(which python3.7)")")" +# config asan environment variable +export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1 +export LSAN_OPTIONS=suppressions=../tests/leaks.supp cmake -DCMAKE_BUILD_TYPE=Debug \ -DTF_PATH="${python_path}"/lib/python3.7/site-packages/"${TF_DIR}" \ diff --git a/src/tests/leaks.supp b/src/tests/leaks.supp new file mode 100644 index 00000000..c192bc92 --- /dev/null +++ b/src/tests/leaks.supp @@ -0,0 +1,21 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +# There are known leaks. +# 1.known mpi leaks. 
+leak:libmpi.so* +leak:libopen-pal.so* +leak:libpmix.so* +leak:libc.so* \ No newline at end of file -- Gitee From e0f5391d9353286645a9c63f0669a7462de7eb05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Wed, 29 May 2024 17:39:23 +0800 Subject: [PATCH 177/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/leaks.supp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/tests/leaks.supp b/src/tests/leaks.supp index c192bc92..ebe0718d 100644 --- a/src/tests/leaks.supp +++ b/src/tests/leaks.supp @@ -1,17 +1,17 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. -==============================================================================*/ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== # There are known leaks. # 1.known mpi leaks. 
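How the pieces added across these ASan patches fit together: -fsanitize=address compiles and links the AddressSanitizer runtime, ASAN_OPTIONS with detect_leaks=1 turns on LeakSanitizer at process exit, halt_on_error decides whether the first error aborts the run, and LSAN_OPTIONS points at the suppression file above, whose leak: lines are substring patterns matched against the modules and functions in a leaking allocation's stack. Below is a minimal sketch of the kind of defect this setup reports; the file name, build line and suppression pattern are illustrative assumptions, not repository contents:

// leak_demo.cpp: build roughly the way the Debug flags above do, e.g.
//   g++ -O0 -g -fno-omit-frame-pointer -fsanitize=address leak_demo.cpp -o leak_demo
// and run with the options the scripts export:
//   ASAN_OPTIONS=halt_on_error=1:detect_leaks=1 LSAN_OPTIONS=suppressions=leaks.supp ./leak_demo
#include <cstddef>

int* MakeBuffer(std::size_t n)
{
    return new int[n];  // allocated here and never freed
}

int main()
{
    int* buf = MakeBuffer(100);
    buf[0] = 1;   // touch the buffer so the allocation is clearly live code
    return 0;     // at exit, LeakSanitizer prints the allocation stack through
                  // MakeBuffer; a "leak:MakeBuffer" line in leaks.supp would
                  // silence it, which is exactly how the libmpi, libopen-pal
                  // and libpmix entries above filter the known MPI leaks
}

The fast_unwind_on_malloc=0 option that a later patch in this series appends trades some allocation-time speed for more complete stack traces, which keeps reports like this attributable and lets the suppression list stay narrow.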
-- Gitee From 0df1f96d631e2d5a85beba383e60c74b4ea01724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Wed, 29 May 2024 19:34:19 +0800 Subject: [PATCH 178/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/test_ut.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test_ut.sh b/src/test_ut.sh index c7f8d9c0..7305c081 100644 --- a/src/test_ut.sh +++ b/src/test_ut.sh @@ -130,7 +130,7 @@ cd build python_path="$(dirname "$(dirname "$(which python3.7)")")" # config asan environment variable -export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1 +export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1:fast_unwind_on_malloc=0 export LSAN_OPTIONS=suppressions=../tests/leaks.supp cmake -DCMAKE_BUILD_TYPE=Debug \ -- Gitee From 8387ff18a54c4d6226dd50ea3c0560277e5ad92b Mon Sep 17 00:00:00 2001 From: steepcurve Date: Thu, 30 May 2024 02:22:56 +0000 Subject: [PATCH 179/302] =?UTF-8?q?!161=20=E3=80=90bugfix=E3=80=91DCNv2?= =?UTF-8?q?=E5=88=87=E6=8D=A2=E4=BC=98=E5=8C=96=E5=99=A8=E6=8A=A5=E9=94=99?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20*=20update=20examples/DCNv2/delay=5Floss?= =?UTF-8?q?=5Fscale.py.=20*=20update=20examples/DCNv2/delay=5Floss=5Fscale?= =?UTF-8?q?.py.=20*=20update=20examples/DCNv2/delay=5Floss=5Fscale.py.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/delay_loss_scale.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/examples/DCNv2/delay_loss_scale.py b/examples/DCNv2/delay_loss_scale.py index a9ee5e64..821b2210 100644 --- a/examples/DCNv2/delay_loss_scale.py +++ b/examples/DCNv2/delay_loss_scale.py @@ -21,13 +21,13 @@ from tensorflow.compat.v1.train import Optimizer class DenseLossScaleOptimizer: def __init__(self, opt, loss_scale): if not isinstance(opt, Optimizer): - raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + raise ValueError("`opt` must be an instance of Optimizer, but got: %s" % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - self._optimizer._lr = self._optimizer._lr / self._loss_scale + _scale_learning_rate(self._optimizer, loss_scale) def compute_gradients(self, loss, var_list=None): - return self._optimizer.compute_gradients(loss*self._loss_scale, var_list=var_list) + return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) def apply_gradients(self, avg_grads): return self._optimizer.apply_gradients(avg_grads) @@ -36,13 +36,26 @@ class DenseLossScaleOptimizer: class SparseLossScaleOptimizer: def __init__(self, opt, loss_scale): if not isinstance(opt, Optimizer): - raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + raise ValueError("`opt` must be an instance of Optimizer, but got: %s" % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - self._optimizer._lr = self._optimizer._lr / self._loss_scale + _scale_learning_rate(self._optimizer, loss_scale) def compute_gradients(self, loss, var_list=None): - return tf.gradients(loss*self._loss_scale, var_list) + return tf.gradients(loss * self._loss_scale, var_list) def apply_gradients(self, grads_and_vars): - return 
self._optimizer.apply_gradients(grads_and_vars) \ No newline at end of file + return self._optimizer.apply_gradients(grads_and_vars) + + +def _scale_learning_rate(opt: Optimizer, loss_scale: float) -> None: + if loss_scale == 0: + raise ValueError("`loss_scale` can not be zero") + if hasattr(opt, "_learning_rate"): + # `SGD` or `Adagrad` + opt._learning_rate = opt._learning_rate / tf.convert_to_tensor(loss_scale, tf.float32) + elif hasattr(opt, "_lr"): + # `Adam` + opt._lr = opt._lr / tf.convert_to_tensor(loss_scale, tf.float32) + else: + raise ValueError("`opt` should have a `_learning_rate` or `_lr` named field") -- Gitee From d3a388c03af109ac4230a4caf2e857e1d4869be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Thu, 30 May 2024 10:53:07 +0800 Subject: [PATCH 180/302] =?UTF-8?q?C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88?= =?UTF-8?q?=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84?= =?UTF-8?q?=E9=9C=B2=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/file_system/hdfs_file_system_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/file_system/hdfs_file_system_test.cpp b/src/tests/file_system/hdfs_file_system_test.cpp index 3c1a2561..98f733f0 100644 --- a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -26,10 +26,10 @@ using namespace emock; void MockHdfs() { + EMOCK(&HdfsWrapper::LoadHdfsLib).stubs().will(ignoreReturnValue()); hdfsFS ConnectFs; hdfsFile hdfsFileHandler; hdfsFileInfo* fileInfo; - EMOCK(&HdfsWrapper::LoadHdfsLib).stubs().will(ignoreReturnValue()); EMOCK(&HdfsWrapper::CloseHdfsLib).stubs().will(ignoreReturnValue()); EMOCK(&HdfsWrapper::Connect).stubs().will(returnValue(ConnectFs)); EMOCK(&HdfsWrapper::Disconnect).stubs().will(returnValue(1)); -- Gitee From 7d246a26eca111b836e1e692e3d22bde33a09aaf Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Thu, 30 May 2024 14:28:06 +0000 Subject: [PATCH 181/302] =?UTF-8?q?!164=20xdeepFM=20Github=E5=8E=9F?= =?UTF-8?q?=E5=A7=8B=E4=BB=A3=E7=A0=81=20*=20xdeepFM=20Github=E5=8E=9F?= =?UTF-8?q?=E5=A7=8B=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/xDeepFM/IO/base_cache.py | 13 + examples/xDeepFM/IO/ffm_cache.py | 162 ++++++++++++ examples/xDeepFM/IO/iterator.py | 207 +++++++++++++++ examples/xDeepFM/main.py | 187 +++++++++++++ examples/xDeepFM/src/base_model.py | 193 ++++++++++++++ examples/xDeepFM/src/exDeepFM.py | 409 +++++++++++++++++++++++++++++ examples/xDeepFM/train.py | 304 +++++++++++++++++++++ examples/xDeepFM/utils/log.py | 20 ++ examples/xDeepFM/utils/metric.py | 97 +++++++ examples/xDeepFM/utils/util.py | 83 ++++++ 10 files changed, 1675 insertions(+) create mode 100644 examples/xDeepFM/IO/base_cache.py create mode 100644 examples/xDeepFM/IO/ffm_cache.py create mode 100644 examples/xDeepFM/IO/iterator.py create mode 100644 examples/xDeepFM/main.py create mode 100644 examples/xDeepFM/src/base_model.py create mode 100644 examples/xDeepFM/src/exDeepFM.py create mode 100644 examples/xDeepFM/train.py create mode 100644 examples/xDeepFM/utils/log.py create mode 100644 examples/xDeepFM/utils/metric.py create mode 100644 examples/xDeepFM/utils/util.py diff --git a/examples/xDeepFM/IO/base_cache.py b/examples/xDeepFM/IO/base_cache.py new file mode 100644 index 00000000..11187de9 --- 
/dev/null +++ b/examples/xDeepFM/IO/base_cache.py @@ -0,0 +1,13 @@ +"""define abstract base class""" +import abc + +__all__ = ["BaseCache"] + + +class BaseCache(object): + """abstract base class""" + + @abc.abstractmethod + def write_tfrecord(self, infile, outfile, hparams): + """Subclass must implement this.""" + pass diff --git a/examples/xDeepFM/IO/ffm_cache.py b/examples/xDeepFM/IO/ffm_cache.py new file mode 100644 index 00000000..1f3d505c --- /dev/null +++ b/examples/xDeepFM/IO/ffm_cache.py @@ -0,0 +1,162 @@ +"""define FfmCache class for cache the format dataset""" +from IO.base_cache import BaseCache +import tensorflow as tf +import numpy as np +from collections import defaultdict +import utils.util as util + +__all__ = ["FfmCache"] + + +class FfmCache(BaseCache): + # field index start by 1, feat index start by 1 + def _load_batch_data_from_file(self, file, hparams): + batch_size = hparams.batch_size + labels = [] + features = [] + impression_id = [] + cnt = 0 + with open(file, 'r') as rd: + while True: + line = rd.readline().strip(' ') + if not line: + break + tmp = line.strip().split(util.USER_ID_SPLIT) + if len(tmp) == 2: + impression_id.append(tmp[1].strip()) + line = tmp[0] + cols = line.strip().split(' ') + label = float(cols[0].strip()) + if label > 0: + label = 1 + else: + label = 0 + cur_feature_list = [] + for word in cols[1:]: + if not word.strip(): + continue + tokens = word.strip().split(':') + cur_feature_list.append( \ + [int(tokens[0]) - 1, \ + int(tokens[1]) - 1, \ + float(tokens[2])]) + features.append(cur_feature_list) + labels.append(label) + cnt += 1 + if cnt == batch_size: + yield labels, features, impression_id + labels = [] + features = [] + impression_id = [] + cnt = 0 + if cnt > 0: + yield labels, features, impression_id + + def _convert_data(self, labels, features, hparams): + dim = hparams.FEATURE_COUNT + FIELD_COUNT = hparams.FIELD_COUNT + instance_cnt = len(labels) + + fm_feat_indices = [] + fm_feat_values = [] + fm_feat_shape = [instance_cnt, dim] + + dnn_feat_indices = [] + dnn_feat_values = [] + dnn_feat_weights = [] + dnn_feat_shape = [instance_cnt * FIELD_COUNT, -1] + + for i in range(instance_cnt): + m = len(features[i]) + dnn_feat_dic = {} + for j in range(m): + fm_feat_indices.append([i, features[i][j][1]]) + fm_feat_values.append(features[i][j][2]) + if features[i][j][0] not in dnn_feat_dic: + dnn_feat_dic[features[i][j][0]] = 0 + else: + dnn_feat_dic[features[i][j][0]] += 1 + dnn_feat_indices.append([i * FIELD_COUNT + features[i][j][0], \ + dnn_feat_dic[features[i][j][0]]]) + dnn_feat_values.append(features[i][j][1]) + dnn_feat_weights.append(features[i][j][2]) + if dnn_feat_shape[1] < dnn_feat_dic[features[i][j][0]]: + dnn_feat_shape[1] = dnn_feat_dic[features[i][j][0]] + dnn_feat_shape[1] += 1 + + sorted_index = sorted(range(len(dnn_feat_indices)), + key=lambda k: (dnn_feat_indices[k][0], \ + dnn_feat_indices[k][1])) + + res = {} + res['fm_feat_indices'] = np.asarray(fm_feat_indices, dtype=np.int64) + res['fm_feat_values'] = np.asarray(fm_feat_values, dtype=np.float32) + res['fm_feat_shape'] = np.asarray(fm_feat_shape, dtype=np.int64) + res['labels'] = np.asarray([[label] for label in labels], dtype=np.float32) + + res['dnn_feat_indices'] = np.asarray(dnn_feat_indices, dtype=np.int64)[sorted_index] + res['dnn_feat_values'] = np.asarray(dnn_feat_values, dtype=np.int64)[sorted_index] + res['dnn_feat_weights'] = np.asarray(dnn_feat_weights, dtype=np.float32)[sorted_index] + res['dnn_feat_shape'] = np.asarray(dnn_feat_shape, dtype=np.int64) + 
return res + + def write_tfrecord(self, infile, outfile, hparams): + sample_num = 0 + FEATURE_COUNT = hparams.FEATURE_COUNT + writer = tf.python_io.TFRecordWriter(outfile) + feature_cnt = defaultdict(lambda: 0) + impression_id_list = [] + try: + for labels, features, impression_id in self._load_batch_data_from_file(infile, hparams): + impression_id_list.extend(impression_id) + sample_num += len(labels) + input_in_sp = self._convert_data(labels, features, hparams) + fm_feat_indices = input_in_sp['fm_feat_indices'] + + for feat in fm_feat_indices: + feature_cnt[feat[1]] += 1 + + fm_feat_values = input_in_sp['fm_feat_values'] + fm_feat_shape = input_in_sp['fm_feat_shape'] + labels = input_in_sp['labels'] + dnn_feat_indices = input_in_sp['dnn_feat_indices'] + dnn_feat_values = input_in_sp['dnn_feat_values'] + dnn_feat_weights = input_in_sp['dnn_feat_weights'] + dnn_feat_shape = input_in_sp['dnn_feat_shape'] + + fm_feat_indices_str = fm_feat_indices.tostring() + labels_str = labels.tostring() + dnn_feat_indices_str = dnn_feat_indices.tostring() + + example = tf.train.Example( + features=tf.train.Features( + feature={ + 'fm_feat_indices': tf.train.Feature( + bytes_list=tf.train.BytesList(value=[fm_feat_indices_str])), + 'fm_feat_values': tf.train.Feature( + float_list=tf.train.FloatList(value=fm_feat_values)), + 'fm_feat_shape': tf.train.Feature( + int64_list=tf.train.Int64List(value=fm_feat_shape)), + 'labels': tf.train.Feature( + bytes_list=tf.train.BytesList(value=[labels_str])), + 'dnn_feat_indices': tf.train.Feature( + bytes_list=tf.train.BytesList(value=[dnn_feat_indices_str])), + 'dnn_feat_values': tf.train.Feature( + int64_list=tf.train.Int64List(value=dnn_feat_values)), + 'dnn_feat_weights': tf.train.Feature( + float_list=tf.train.FloatList(value=dnn_feat_weights)), + 'dnn_feat_shape': tf.train.Feature( + int64_list=tf.train.Int64List(value=dnn_feat_shape)) + } + ) + ) + serialized = example.SerializeToString() + writer.write(serialized) + except: + raise ValueError('train data format must be libffm, for example 1 2:1:0.1 2:3:0.2 3:4:0.4') + writer.close() + sort_feature_cnt = sorted(feature_cnt.items(), key=lambda x: x[0]) + with open(util.FEAT_COUNT_FILE, 'w') as f: + for item in sort_feature_cnt: + f.write(str(item[0]) + ',' + str(item[1]) + '\n') + return sample_num, impression_id_list diff --git a/examples/xDeepFM/IO/iterator.py b/examples/xDeepFM/IO/iterator.py new file mode 100644 index 00000000..c7e50032 --- /dev/null +++ b/examples/xDeepFM/IO/iterator.py @@ -0,0 +1,207 @@ +"""define iterator""" +import collections +import tensorflow as tf +import abc + +BUFFER_SIZE = 256 +__all__ = ["BaseIterator", "FfmIterator", "DinIterator", "CCCFNetIterator"] + + +class BaseIterator(object): + @abc.abstractmethod + def get_iterator(self, src_dataset): + """Subclass must implement this.""" + pass + + @abc.abstractmethod + def parser(self, record): + pass + + +class FfmIterator(BaseIterator): + def __init__(self, src_dataset): + self.get_iterator(src_dataset) + + def get_iterator(self, src_dataset): + src_dataset = src_dataset.map(self.parser) + # src_dataset = src_dataset.shuffle(buffer_size=BUFFER_SIZE) + iterator = src_dataset.make_initializable_iterator() + _fm_feat_indices, _fm_feat_values, \ + _fm_feat_shape, _labels, _dnn_feat_indices, \ + _dnn_feat_values, _dnn_feat_weights, _dnn_feat_shape = iterator.get_next() + self.initializer = iterator.initializer + self.fm_feat_indices = _fm_feat_indices + self.fm_feat_values = _fm_feat_values + self.fm_feat_shape = _fm_feat_shape + self.labels 
= _labels + self.dnn_feat_indices = _dnn_feat_indices + self.dnn_feat_values = _dnn_feat_values + self.dnn_feat_weights = _dnn_feat_weights + self.dnn_feat_shape = _dnn_feat_shape + + def parser(self, record): + keys_to_features = { + 'fm_feat_indices': tf.FixedLenFeature([], tf.string), + 'fm_feat_values': tf.VarLenFeature(tf.float32), + 'fm_feat_shape': tf.FixedLenFeature([2], tf.int64), + 'labels': tf.FixedLenFeature([], tf.string), + 'dnn_feat_indices': tf.FixedLenFeature([], tf.string), + 'dnn_feat_values': tf.VarLenFeature(tf.int64), + 'dnn_feat_weights': tf.VarLenFeature(tf.float32), + 'dnn_feat_shape': tf.FixedLenFeature([2], tf.int64), + } + parsed = tf.parse_single_example(record, keys_to_features) + fm_feat_indices = tf.reshape(tf.decode_raw(parsed['fm_feat_indices'], tf.int64), [-1, 2]) + fm_feat_values = tf.sparse_tensor_to_dense(parsed['fm_feat_values']) + fm_feat_shape = parsed['fm_feat_shape'] + labels = tf.reshape(tf.decode_raw(parsed['labels'], tf.float32), [-1, 1]) + dnn_feat_indices = tf.reshape(tf.decode_raw(parsed['dnn_feat_indices'], tf.int64), [-1, 2]) + dnn_feat_values = tf.sparse_tensor_to_dense(parsed['dnn_feat_values']) + dnn_feat_weights = tf.sparse_tensor_to_dense(parsed['dnn_feat_weights']) + dnn_feat_shape = parsed['dnn_feat_shape'] + return fm_feat_indices, fm_feat_values, \ + fm_feat_shape, labels, dnn_feat_indices, \ + dnn_feat_values, dnn_feat_weights, dnn_feat_shape + + +class DinIterator(BaseIterator): + def __init__(self, src_dataset): + self.get_iterator(src_dataset) + + def get_iterator(self, src_dataset): + src_dataset = src_dataset.map(self.parser) + # src_dataset = src_dataset.shuffle(buffer_size=BUFFER_SIZE) + iterator = src_dataset.make_initializable_iterator() + output = iterator.get_next() + (_attention_news_indices, _attention_news_values, _attention_news_shape, \ + _attention_user_indices, _attention_user_values, _attention_user_weights, \ + _attention_user_shape, _fm_feat_indices, _fm_feat_val, \ + _fm_feat_shape, _labels, _dnn_feat_indices, _dnn_feat_values, \ + _dnn_feat_weight, _dnn_feat_shape) = output + self.initializer = iterator.initializer + self.attention_news_indices = _attention_news_indices + self.attention_news_values = _attention_news_values + self.attention_news_shape = _attention_news_shape + self.attention_user_indices = _attention_user_indices + self.attention_user_values = _attention_user_values + self.attention_user_weights = _attention_user_weights + self.attention_user_shape = _attention_user_shape + self.fm_feat_indices = _fm_feat_indices + self.fm_feat_val = _fm_feat_val + self.fm_feat_shape = _fm_feat_shape + self.labels = _labels + self.dnn_feat_indices = _dnn_feat_indices + self.dnn_feat_values = _dnn_feat_values + self.dnn_feat_weight = _dnn_feat_weight + self.dnn_feat_shape = _dnn_feat_shape + + def parser(self, record): + keys_to_features = { + 'attention_news_indices': tf.FixedLenFeature([], tf.string), + 'attention_news_values': tf.VarLenFeature(tf.float32), + 'attention_news_shape': tf.FixedLenFeature([2], tf.int64), + + 'attention_user_indices': tf.FixedLenFeature([], tf.string), + 'attention_user_values': tf.VarLenFeature(tf.int64), + 'attention_user_weights': tf.VarLenFeature(tf.float32), + 'attention_user_shape': tf.FixedLenFeature([2], tf.int64), + + 'fm_feat_indices': tf.FixedLenFeature([], tf.string), + 'fm_feat_val': tf.VarLenFeature(tf.float32), + 'fm_feat_shape': tf.FixedLenFeature([2], tf.int64), + + 'labels': tf.FixedLenFeature([], tf.string), + + 'dnn_feat_indices': tf.FixedLenFeature([], 
tf.string), + 'dnn_feat_values': tf.VarLenFeature(tf.int64), + 'dnn_feat_weight': tf.VarLenFeature(tf.float32), + 'dnn_feat_shape': tf.FixedLenFeature([2], tf.int64), + } + parsed = tf.parse_single_example(record, keys_to_features) + + attention_news_indices = tf.reshape(tf.decode_raw(parsed['attention_news_indices'], \ + tf.int64), [-1, 2]) + attention_news_values = tf.sparse_tensor_to_dense(parsed['attention_news_values']) + attention_news_shape = parsed['attention_news_shape'] + + attention_user_indices = tf.reshape(tf.decode_raw(parsed['attention_user_indices'], \ + tf.int64), [-1, 2]) + attention_user_values = tf.sparse_tensor_to_dense(parsed['attention_user_values']) + attention_user_weights = tf.sparse_tensor_to_dense(parsed['attention_user_weights']) + attention_user_shape = parsed['attention_user_shape'] + + fm_feat_indices = tf.reshape(tf.decode_raw(parsed['fm_feat_indices'], \ + tf.int64), [-1, 2]) + fm_feat_val = tf.sparse_tensor_to_dense(parsed['fm_feat_val']) + fm_feat_shape = parsed['fm_feat_shape'] + + labels = tf.reshape(tf.decode_raw(parsed['labels'], tf.float32), [-1, 1]) + + dnn_feat_indices = tf.reshape(tf.decode_raw(parsed['dnn_feat_indices'], \ + tf.int64), [-1, 2]) + dnn_feat_values = tf.sparse_tensor_to_dense(parsed['dnn_feat_values']) + dnn_feat_weight = tf.sparse_tensor_to_dense(parsed['dnn_feat_weight']) + dnn_feat_shape = parsed['dnn_feat_shape'] + return (attention_news_indices, attention_news_values, attention_news_shape, \ + attention_user_indices, attention_user_values, attention_user_weights, \ + attention_user_shape, fm_feat_indices, fm_feat_val, \ + fm_feat_shape, labels, dnn_feat_indices, dnn_feat_values, \ + dnn_feat_weight, dnn_feat_shape) + + +class CCCFNetIterator(BaseIterator): + def __init__(self, src_dataset): + self.get_iterator(src_dataset) + + def get_iterator(self, src_dataset): + src_dataset = src_dataset.map(self.parser) + # src_dataset = src_dataset.shuffle(buffer_size=BUFFER_SIZE) + iterator = src_dataset.make_initializable_iterator() + _labels, _userIds, _itemIds, \ + _user_profiles_indices, _user_profiles_values, _user_profiles_weights, _user_profiles_shape, \ + _item_profiles_indices, _item_profiles_values, _item_profiles_weights, _item_profiles_shape = iterator.get_next() + self.initializer = iterator.initializer + self.labels = _labels + self.userIds = _userIds + self.itemIds = _itemIds + self.user_profiles_indices = _user_profiles_indices + self.user_profiles_values = _user_profiles_values + self.user_profiles_weights = _user_profiles_weights + self.user_profiles_shape = _user_profiles_shape + self.item_profiles_indices = _item_profiles_indices + self.item_profiles_values = _item_profiles_values + self.item_profiles_weights = _item_profiles_weights + self.item_profiles_shape = _item_profiles_shape + + def parser(self, record): + keys_to_features = { + 'labels': tf.FixedLenFeature([], tf.string), + 'userIds': tf.VarLenFeature(tf.int64), + 'itemIds': tf.VarLenFeature(tf.int64), + 'user_profiles_indices': tf.FixedLenFeature([], tf.string), + 'user_profiles_values': tf.VarLenFeature(tf.int64), + 'user_profiles_weights': tf.VarLenFeature(tf.float32), + 'user_profiles_shape': tf.FixedLenFeature([2], tf.int64), + 'item_profiles_indices': tf.FixedLenFeature([], tf.string), + 'item_profiles_values': tf.VarLenFeature(tf.int64), + 'item_profiles_weights': tf.VarLenFeature(tf.float32), + 'item_profiles_shape': tf.FixedLenFeature([2], tf.int64) + } + parsed = tf.parse_single_example(record, keys_to_features) + labels = 
tf.reshape(tf.decode_raw(parsed['labels'], tf.float32), [-1, 1]) + userIds = tf.sparse_tensor_to_dense(parsed['userIds']) + itemIds = tf.sparse_tensor_to_dense(parsed['itemIds']) + + user_profiles_indices = tf.reshape(tf.decode_raw(parsed['user_profiles_indices'], tf.int64), [-1, 2]) + user_profiles_values = tf.sparse_tensor_to_dense(parsed['user_profiles_values']) + user_profiles_weights = tf.sparse_tensor_to_dense(parsed['user_profiles_weights']) + user_profiles_shape = parsed['user_profiles_shape'] + + item_profiles_indices = tf.reshape(tf.decode_raw(parsed['item_profiles_indices'], tf.int64), [-1, 2]) + item_profiles_values = tf.sparse_tensor_to_dense(parsed['item_profiles_values']) + item_profiles_weights = tf.sparse_tensor_to_dense(parsed['item_profiles_weights']) + item_profiles_shape = parsed['item_profiles_shape'] + + return labels, userIds, itemIds, \ + user_profiles_indices, user_profiles_values, user_profiles_weights, user_profiles_shape, \ + item_profiles_indices, item_profiles_values, item_profiles_weights, item_profiles_shape diff --git a/examples/xDeepFM/main.py b/examples/xDeepFM/main.py new file mode 100644 index 00000000..265faca9 --- /dev/null +++ b/examples/xDeepFM/main.py @@ -0,0 +1,187 @@ +"""This script parse and run train function""" +import train +import utils.util as util +import tensorflow as tf +import sys +from utils.log import Log + +#yaml = sys.argv[1] + + + +def flat_config(config): + """flat config to a dict""" + f_config = {} + category = ['data', 'model', 'train', 'info'] + for cate in category: + for key, val in config[cate].items(): + f_config[key] = val + return f_config + + +def create_hparams(FLAGS): + """Create hparams.""" + FLAGS = flat_config(FLAGS) + return tf.contrib.training.HParams( + # data + train_file=FLAGS['train_file'] if 'train_file' in FLAGS else None, + eval_file=FLAGS['eval_file'] if 'eval_file' in FLAGS else None, + test_file=FLAGS['test_file'] if 'test_file' in FLAGS else None, + infer_file=FLAGS['infer_file'] if 'infer_file' in FLAGS else None, + FEATURE_COUNT=FLAGS['FEATURE_COUNT'] if 'FEATURE_COUNT' in FLAGS else None, + FIELD_COUNT=FLAGS['FIELD_COUNT'] if 'FIELD_COUNT' in FLAGS else None, + data_format=FLAGS['data_format'] if 'data_format' in FLAGS else None, + PAIR_NUM=FLAGS['PAIR_NUM'] if 'PAIR_NUM' in FLAGS else None, + DNN_FIELD_NUM=FLAGS['DNN_FIELD_NUM'] if 'DNN_FIELD_NUM' in FLAGS else None, + n_user=FLAGS['n_user'] if 'n_user' in FLAGS else None, + n_item=FLAGS['n_item'] if 'n_item' in FLAGS else None, + n_user_attr=FLAGS['n_user_attr'] if 'n_user_attr' in FLAGS else None, + n_item_attr=FLAGS['n_item_attr'] if 'n_item_attr' in FLAGS else None, + # model + dim=FLAGS['dim'] if 'dim' in FLAGS else None, + layer_sizes=FLAGS['layer_sizes'] if 'layer_sizes' in FLAGS else None, + cross_layer_sizes=FLAGS['cross_layer_sizes'] if 'cross_layer_sizes' in FLAGS else None, + cross_layers = FLAGS['cross_layers'] if 'cross_layers' in FLAGS else None, + activation=FLAGS['activation'] if 'activation' in FLAGS else None, + cross_activation=FLAGS['cross_activation'] if 'cross_activation' in FLAGS else "identity", + dropout=FLAGS['dropout'] if 'dropout' in FLAGS else None, + attention_layer_sizes=FLAGS['attention_layer_sizes'] if 'attention_layer_sizes' in FLAGS else None, + attention_activation=FLAGS['attention_activation'] if 'attention_activation' in FLAGS else None, + model_type=FLAGS['model_type'] if 'model_type' in FLAGS else None, + method=FLAGS['method'] if 'method' in FLAGS else None, + load_model_name=FLAGS['load_model_name'] if 
'load_model_name' in FLAGS else None, + mu=FLAGS['mu'] if 'mu' in FLAGS else None, + # train + init_method=FLAGS['init_method'] if 'init_method' in FLAGS else 'tnormal', + init_value=FLAGS['init_value'] if 'init_value' in FLAGS else 0.01, + embed_l2=FLAGS['embed_l2'] if 'embed_l2' in FLAGS else 0.0000, + embed_l1=FLAGS['embed_l1'] if 'embed_l1' in FLAGS else 0.0000, + layer_l2=FLAGS['layer_l2'] if 'layer_l2' in FLAGS else 0.0000, + layer_l1=FLAGS['layer_l1'] if 'layer_l1' in FLAGS else 0.0000, + cross_l2=FLAGS['cross_l2'] if 'cross_l2' in FLAGS else 0.0000, + cross_l1=FLAGS['cross_l1'] if 'cross_l1' in FLAGS else 0.0000, + learning_rate=FLAGS['learning_rate'] if 'learning_rate' in FLAGS else 0.001, + loss=FLAGS['loss'] if 'loss' in FLAGS else None, + optimizer=FLAGS['optimizer'] if 'optimizer' in FLAGS else 'adam', + epochs=FLAGS['epochs'] if 'epochs' in FLAGS else 10, + batch_size=FLAGS['batch_size'] if 'batch_size' in FLAGS else 1, + # show info + log=FLAGS['log'] if 'log' in FLAGS else "log", + logger=None, + show_step=FLAGS['show_step'] if 'show_step' in FLAGS else 1, + save_epoch=FLAGS['save_epoch'] if 'save_epoch' in FLAGS else 5, + metrics=FLAGS['metrics'] if 'metrics' in FLAGS else None + ) + + +def check_type(config): + """check config type""" + # check parameter type + int_parameters = ['FEATURE_COUNT', 'FIELD_COUNT', 'dim', 'epochs', 'batch_size', 'show_step', \ + 'save_epoch', 'PAIR_NUM', 'DNN_FIELD_NUM', 'attention_layer_sizes', \ + 'n_user', 'n_item', 'n_user_attr', 'n_item_attr'] + for param in int_parameters: + if param in config and not isinstance(config[param], int): + raise TypeError("parameters {0} must be int".format(param)) + + float_parameters = ['init_value', 'learning_rate', 'embed_l2', \ + 'embed_l1', 'layer_l2', 'layer_l1', 'mu'] + for param in float_parameters: + if param in config and not isinstance(config[param], float): + raise TypeError("parameters {0} must be float".format(param)) + + str_parameters = ['train_file', 'eval_file', 'test_file', 'infer_file', 'method', \ + 'load_model_name', 'loss', 'optimizer', 'init_method', 'attention_activation'] + for param in str_parameters: + if param in config and not isinstance(config[param], str): + raise TypeError("parameters {0} must be str".format(param)) + + list_parameters = ['layer_sizes', 'activation', 'dropout'] + for param in list_parameters: + if param in config and not isinstance(config[param], list): + raise TypeError("parameters {0} must be list".format(param)) + + if ('data_format' in config) and (not config['data_format'] in ['ffm', 'din', 'cccfnet']): + raise TypeError("parameters data_format must be din" \ + ",ffm, cccfnet but is {0}".format(config['data_format'])) + + +def check_nn_config(config): + """check neural networks config""" + if config['model']['model_type'] in ['fm']: + required_parameters = ['train_file', 'eval_file', 'FEATURE_COUNT', 'dim', 'loss', 'data_format', 'method'] + elif config['model']['model_type'] in ['lr']: + required_parameters = ['train_file', 'eval_file', 'FEATURE_COUNT', 'loss', 'data_format', 'method'] + elif config['model']['model_type'] in ['din']: + required_parameters = ['train_file', 'eval_file', 'PAIR_NUM', 'DNN_FIELD_NUM', 'FEATURE_COUNT', 'dim', \ + 'layer_sizes', 'activation', 'attention_layer_sizes', 'attention_activation', 'loss', \ + 'data_format', 'dropout', 'method'] + elif config['model']['model_type'] in ['cccfnet']: + required_parameters = ['train_file', 'eval_file', 'dim', 'layer_sizes', 'n_user', 'n_item', 'n_user_attr', + 'n_item_attr', + 
'activation', 'loss', 'data_format', 'dropout', 'mu', 'method']
+    elif config['model']['model_type'] in ['exDeepFM']:
+        required_parameters = ['train_file', 'eval_file', 'FIELD_COUNT', 'FEATURE_COUNT', 'method',
+                               'dim', 'layer_sizes', 'cross_layer_sizes', 'activation', 'loss', 'data_format', 'dropout']
+    elif config['model']['model_type'] in ['deepcross']:
+        required_parameters = ['train_file', 'eval_file', 'FIELD_COUNT', 'FEATURE_COUNT', 'method',
+                               'dim', 'layer_sizes', 'cross_layers', 'activation', 'loss', 'data_format',
+                               'dropout']
+    else:
+        required_parameters = ['train_file', 'eval_file', 'FIELD_COUNT', 'FEATURE_COUNT', 'method',
+                               'dim', 'layer_sizes', 'activation', 'loss', 'data_format', 'dropout']
+    f_config = flat_config(config)
+    # check required parameters
+    for param in required_parameters:
+        if param not in f_config:
+            raise ValueError("parameters {0} must be set".format(param))
+    if f_config['model_type'] == 'din':
+        if f_config['data_format'] != 'din':
+            raise ValueError(
+                "for din model, data format must be din, but it is set to {0}".format(f_config['data_format']))
+    elif f_config['model_type'] == 'cccfnet':
+        if f_config['data_format'] != 'cccfnet':
+            raise ValueError(
+                "for cccfnet model, data format must be cccfnet, but it is set to {0}".format(f_config['data_format']))
+    else:
+        if f_config['data_format'] != 'ffm':
+            raise ValueError("data format must be ffm, but it is set to {0}".format(f_config['data_format']))
+    check_type(f_config)
+
+
+def check_config(config):
+    """check networks config"""
+    if config['model']['model_type'] not in ['deepFM', 'deepWide', 'dnn', 'ipnn', \
+                                             'opnn', 'fm', 'lr', 'din', 'cccfnet', 'deepcross', 'exDeepFM', "cross", "CIN"]:
+        raise ValueError(
+            "model type must be cccfnet, deepFM, deepWide, dnn, ipnn, opnn, fm, lr, din, deepcross, exDeepFM, cross, CIN but it is set to {0}".format(
+                config['model']['model_type']))
+    check_nn_config(config)
+
+
+# the train process loads its config from yaml
+def load_yaml():
+    """load config from yaml"""
+    yaml_name = util.CONFIG_DIR + util.TRAIN_YAML
+    print('training network configuration file is {0}'.format(yaml_name))
+    util.check_file_exist(yaml_name)
+    config = util.load_yaml_file(yaml_name)
+    return config
+
+
+def main():
+    """main function"""
+    # flag = True
+    util.check_tensorflow_version()
+    util.check_and_mkdir()
+    #util.TRAIN_YAML = yaml
+    config = load_yaml()
+    check_config(config)
+    hparams = create_hparams(config)
+    print(hparams.values())
+    log = Log(hparams)
+    hparams.logger = log.logger
+    train.train(hparams)
+
+
+main()
diff --git a/examples/xDeepFM/src/base_model.py b/examples/xDeepFM/src/base_model.py
new file mode 100644
index 00000000..320f0876
--- /dev/null
+++ b/examples/xDeepFM/src/base_model.py
@@ -0,0 +1,193 @@
+"""define base class model"""
+import abc
+import math
+import tensorflow as tf
+import utils.util as util
+from IO.iterator import BaseIterator
+
+__all__ = ["BaseModel"]
+
+
+class BaseModel(object):
+    def __init__(self, hparams, iterator, scope=None):
+        assert isinstance(iterator, BaseIterator)
+        tf.set_random_seed(1234)
+        self.iterator = iterator
+        self.layer_params = []
+        self.embed_params = []
+        self.cross_params = []
+        self.layer_keeps = None
+        self.keep_prob_train = None
+        self.keep_prob_test = None
+        self.initializer = self._get_initializer(hparams)
+        self.logit = self._build_graph(hparams)
+        self.pred = self._get_pred(self.logit, hparams)
+        self.data_loss = self._compute_data_loss(hparams)
+        self.regular_loss = self._compute_regular_loss(hparams)
+        self.loss =
tf.add(self.data_loss, self.regular_loss) + self.saver = tf.train.Saver(max_to_keep=hparams.epochs) + self.update = self._build_train_opt(hparams) + self.init_op = tf.global_variables_initializer() + self.merged = self._add_summaries() + + def _get_pred(self, logit, hparams): + if hparams.method == 'regression': + pred = tf.identity(logit) + elif hparams.method == 'classification': + pred = tf.sigmoid(logit) + else: + raise ValueError("method must be regression or classification, but now is {0}".format(hparams.method)) + return pred + + def _add_summaries(self): + tf.summary.scalar("data_loss", self.data_loss) + tf.summary.scalar("regular_loss", self.regular_loss) + tf.summary.scalar("loss", self.loss) + merged = tf.summary.merge_all() + return merged + + @abc.abstractmethod + def _build_graph(self, hparams): + """Subclass must implement this.""" + pass + + def _l2_loss(self, hparams): + l2_loss = tf.zeros([1], dtype=tf.float32) + # embedding_layer l2 loss + for param in self.embed_params: + l2_loss = tf.add(l2_loss, tf.multiply(hparams.embed_l2, tf.nn.l2_loss(param))) + params = self.layer_params + for param in params: + l2_loss = tf.add(l2_loss, tf.multiply(hparams.layer_l2, tf.nn.l2_loss(param))) + return l2_loss + + def _l1_loss(self, hparams): + l1_loss = tf.zeros([1], dtype=tf.float32) + # embedding_layer l2 loss + for param in self.embed_params: + l1_loss = tf.add(l1_loss, tf.multiply(hparams.embed_l1, tf.norm(param, ord=1))) + params = self.layer_params + for param in params: + l1_loss = tf.add(l1_loss, tf.multiply(hparams.layer_l1, tf.norm(param, ord=1))) + return l1_loss + + def _cross_l_loss(self, hparams): + cross_l_loss = tf.zeros([1], dtype=tf.float32) + for param in self.cross_params: + cross_l_loss = tf.add(cross_l_loss, tf.multiply(hparams.cross_l1, tf.norm(param, ord=1))) + cross_l_loss = tf.add(cross_l_loss, tf.multiply(hparams.cross_l2, tf.norm(param, ord=1))) + return cross_l_loss + + def _get_initializer(self, hparams): + if hparams.init_method == 'tnormal': + return tf.truncated_normal_initializer(stddev=hparams.init_value) + elif hparams.init_method == 'uniform': + return tf.random_uniform_initializer(-hparams.init_value, hparams.init_value) + elif hparams.init_method == 'normal': + return tf.random_normal_initializer(stddev=hparams.init_value) + elif hparams.init_method == 'xavier_normal': + return tf.contrib.layers.xavier_initializer(uniform=False) + elif hparams.init_method == 'xavier_uniform': + return tf.contrib.layers.xavier_initializer(uniform=True) + elif hparams.init_method == 'he_normal': + return tf.contrib.layers.variance_scaling_initializer( \ + factor=2.0, mode='FAN_IN', uniform=False) + elif hparams.init_method == 'he_uniform': + return tf.contrib.layers.variance_scaling_initializer( \ + factor=2.0, mode='FAN_IN', uniform=True) + else: + return tf.truncated_normal_initializer(stddev=hparams.init_value) + + def _compute_data_loss(self, hparams): + if hparams.loss == 'cross_entropy_loss': + data_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits( \ + logits=tf.reshape(self.logit, [-1]), \ + labels=tf.reshape(self.iterator.labels, [-1]))) + elif hparams.loss == 'square_loss': + data_loss = tf.sqrt(tf.reduce_mean( + tf.squared_difference(tf.reshape(self.pred, [-1]), tf.reshape(self.iterator.labels, [-1])))) + elif hparams.loss == 'log_loss': + data_loss = tf.reduce_mean(tf.losses.log_loss(predictions=tf.reshape(self.pred, [-1]), + labels=tf.reshape(self.iterator.labels, [-1]))) + else: + raise ValueError("this loss not defined 
{0}".format(hparams.loss)) + return data_loss + + def _compute_regular_loss(self, hparams): + regular_loss = self._l2_loss(hparams) + self._l1_loss(hparams) + self._cross_l_loss(hparams) + regular_loss = tf.reduce_sum(regular_loss) + return regular_loss + + def _build_train_opt(self, hparams): + def train_opt(hparams): + if hparams.optimizer == 'adadelta': + train_step = tf.train.AdadeltaOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'adagrad': + train_step = tf.train.AdagradOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'sgd': + train_step = tf.train.GradientDescentOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'adam': + train_step = tf.train.AdamOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'ftrl': + train_step = tf.train.FtrlOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'gd': + train_step = tf.train.GradientDescentOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'padagrad': + train_step = tf.train.ProximalAdagradOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'pgd': + train_step = tf.train.ProximalGradientDescentOptimizer( \ + hparams.learning_rate).minimize(self.loss) + elif hparams.optimizer == 'rmsprop': + train_step = tf.train.RMSPropOptimizer( \ + hparams.learning_rate).minimize(self.loss) + else: + train_step = tf.train.GradientDescentOptimizer( \ + hparams.learning_rate).minimize(self.loss) + return train_step + + train_step = train_opt(hparams) + return train_step + + def _active_layer(self, logit, scope, activation, layer_idx): + logit = self._dropout(logit, layer_idx) + logit = self._activate(logit, activation) + return logit + + def _activate(self, logit, activation): + if activation == 'sigmoid': + return tf.nn.sigmoid(logit) + elif activation == 'softmax': + return tf.nn.softmax(logit) + elif activation == 'relu': + return tf.nn.relu(logit) + elif activation == 'tanh': + return tf.nn.tanh(logit) + elif activation == 'elu': + return tf.nn.elu(logit) + elif activation == 'identity': + return tf.identity(logit) + else: + raise ValueError("this activations not defined {0}".format(activation)) + + def _dropout(self, logit, layer_idx): + logit = tf.nn.dropout(x=logit, keep_prob=self.layer_keeps[layer_idx]) + return logit + + def train(self, sess): + return sess.run([self.update, self.loss, self.data_loss, self.merged], \ + feed_dict={self.layer_keeps: self.keep_prob_train}) + + def eval(self, sess): + return sess.run([self.loss, self.data_loss, self.pred, self.iterator.labels], \ + feed_dict={self.layer_keeps: self.keep_prob_test}) + + def infer(self, sess): + return sess.run([self.pred], \ + feed_dict={self.layer_keeps: self.keep_prob_test}) diff --git a/examples/xDeepFM/src/exDeepFM.py b/examples/xDeepFM/src/exDeepFM.py new file mode 100644 index 00000000..7167d460 --- /dev/null +++ b/examples/xDeepFM/src/exDeepFM.py @@ -0,0 +1,409 @@ +"""define Factorization-Machine based Neural Network Model""" +import math +import numpy as np +import tensorflow as tf +from src.base_model import BaseModel + +__all__ = ["ExtremeDeepFMModel"] + + +class ExtremeDeepFMModel(BaseModel): + """define Factorization-Machine based Neural Network Model""" + + def _build_graph(self, hparams): + self.keep_prob_train = 1 - np.array(hparams.dropout) + self.keep_prob_test = np.ones_like(hparams.dropout) + self.layer_keeps = 
tf.placeholder(tf.float32) + with tf.variable_scope("exDeepFm") as scope: + with tf.variable_scope("embedding", initializer=self.initializer) as escope: + self.embedding = tf.get_variable(name='embedding_layer', + shape=[hparams.FEATURE_COUNT, hparams.dim], + dtype=tf.float32) + self.embed_params.append(self.embedding) + embed_out, embed_layer_size = self._build_embedding(hparams) + logit = self._build_linear(hparams) + # logit = tf.add(logit, self._build_fm(hparams)) + # res: use resnet? direct: without split? reduce_D: Dimension reduction? f_dim: dimension of reduce_D + logit = tf.add(logit, self._build_extreme_FM(hparams, embed_out, res=False, direct=False, bias=False, reduce_D=False, f_dim=2)) + # logit = tf.add(logit, self._build_extreme_FM_quick(hparams, embed_out)) + logit = tf.add(logit, self._build_dnn(hparams, embed_out, embed_layer_size)) + return logit + + def _build_embedding(self, hparams): + fm_sparse_index = tf.SparseTensor(self.iterator.dnn_feat_indices, + self.iterator.dnn_feat_values, + self.iterator.dnn_feat_shape) + fm_sparse_weight = tf.SparseTensor(self.iterator.dnn_feat_indices, + self.iterator.dnn_feat_weights, + self.iterator.dnn_feat_shape) + w_fm_nn_input_orgin = tf.nn.embedding_lookup_sparse(self.embedding, + fm_sparse_index, + fm_sparse_weight, + combiner="sum") + embedding = tf.reshape(w_fm_nn_input_orgin, [-1, hparams.dim * hparams.FIELD_COUNT]) + embedding_size = hparams.FIELD_COUNT * hparams.dim + return embedding, embedding_size + + def _build_linear(self, hparams): + with tf.variable_scope("linear_part", initializer=self.initializer) as scope: + w_linear = tf.get_variable(name='w', + shape=[hparams.FEATURE_COUNT, 1], + dtype=tf.float32) + b_linear = tf.get_variable(name='b', + shape=[1], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + x = tf.SparseTensor(self.iterator.fm_feat_indices, + self.iterator.fm_feat_values, + self.iterator.fm_feat_shape) + linear_output = tf.add(tf.sparse_tensor_dense_matmul(x, w_linear), b_linear) + self.layer_params.append(w_linear) + self.layer_params.append(b_linear) + tf.summary.histogram("linear_part/w", w_linear) + tf.summary.histogram("linear_part/b", b_linear) + return linear_output + + def _build_fm(self, hparams): + with tf.variable_scope("fm_part") as scope: + x = tf.SparseTensor(self.iterator.fm_feat_indices, + self.iterator.fm_feat_values, + self.iterator.fm_feat_shape) + xx = tf.SparseTensor(self.iterator.fm_feat_indices, + tf.pow(self.iterator.fm_feat_values, 2), + self.iterator.fm_feat_shape) + fm_output = 0.5 * tf.reduce_sum( + tf.pow(tf.sparse_tensor_dense_matmul(x, self.embedding), 2) - \ + tf.sparse_tensor_dense_matmul(xx, + tf.pow(self.embedding, 2)), 1, + keep_dims=True) + return fm_output + """ + def _build_extreme_FM_slow_bad(self, hparams, nn_input): + hidden_nn_layers = [] + field_nums = [] + final_len = 0 + field_num = hparams.FIELD_COUNT + nn_input = tf.reshape(nn_input, shape=[-1, int(field_num), hparams.dim]) + field_nums.append(int(field_num)) + hidden_nn_layers.append(nn_input) + final_result = [] + with tf.variable_scope("exfm_part", initializer=self.initializer) as scope: + for idx, layer_size in enumerate(hparams.cross_layer_sizes): + dot_results = [] + split_tensor = tf.split(hidden_nn_layers[-1], field_nums[-1]*[1], 1) + for s in split_tensor: + s = tf.tile(s, [1, field_nums[0], 1]) + dot_results.append(tf.multiply(s, hidden_nn_layers[0])) + dot_result = tf.concat(dot_results, axis=1) + filters = tf.get_variable(name="f_"+str(idx), + shape=[1, len(dot_results)*field_nums[0], 
layer_size], + dtype=tf.float32) + dot_result = tf.transpose(dot_result, perm=[0, 2, 1]) + curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, padding='VALID') + curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) + + if idx != len(hparams.cross_layer_sizes)-1: + next_hidden, direct_connect = tf.split(curr_out, 2*[int(layer_size / 2)], 1) + final_len += int(layer_size / 2) + else: + direct_connect = curr_out + next_hidden=0 + final_len += layer_size + + ### + direct_connect = curr_out + next_hidden = curr_out + final_len += layer_size + ### + + final_result.append(direct_connect) + hidden_nn_layers.append(next_hidden) + field_nums.append(int(layer_size / 2)) + # field_nums.append(int(layer_size)) + self.cross_params.append(filters) + result = tf.concat(final_result, axis=1) + result = tf.reduce_sum(result, -1) + ### + # residual network + w_nn_output1 = tf.get_variable(name='w_nn_output1', + shape=[final_len, 128], + dtype=tf.float32) + b_nn_output1 = tf.get_variable(name='b_nn_output1', + shape=[128], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + self.layer_params.append(w_nn_output1) + self.layer_params.append(b_nn_output1) + exFM_out0 = tf.nn.xw_plus_b(result, w_nn_output1, b_nn_output1) + exFM_out1 = self._active_layer(logit=exFM_out0, + scope=scope, + activation="relu", + layer_idx=0) + w_nn_output2 = tf.get_variable(name='w_nn_output2', + shape=[128 + final_len, 1], + dtype=tf.float32) + b_nn_output2 = tf.get_variable(name='b_nn_output2', + shape=[1], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + self.layer_params.append(w_nn_output2) + self.layer_params.append(b_nn_output2) + exFM_in = tf.concat([exFM_out1, result], axis=1, name="user_emb") + exFM_out = tf.nn.xw_plus_b(exFM_in, w_nn_output2, b_nn_output2) + + ### + w_nn_output = tf.get_variable(name='w_nn_output', + shape=[final_len, 1], + dtype=tf.float32) + b_nn_output = tf.get_variable(name='b_nn_output', + shape=[1], + dtype=tf.float32) + self.layer_params.append(w_nn_output) + self.layer_params.append(b_nn_output) + exFM_out = tf.nn.xw_plus_b(result, w_nn_output, b_nn_output) + + return exFM_out + """ + + def _build_extreme_FM(self, hparams, nn_input, res=False, direct=False, bias=False, reduce_D=False, f_dim=2): + hidden_nn_layers = [] + field_nums = [] + final_len = 0 + field_num = hparams.FIELD_COUNT + nn_input = tf.reshape(nn_input, shape=[-1, int(field_num), hparams.dim]) + field_nums.append(int(field_num)) + hidden_nn_layers.append(nn_input) + final_result = [] + split_tensor0 = tf.split(hidden_nn_layers[0], hparams.dim * [1], 2) + with tf.variable_scope("exfm_part", initializer=self.initializer) as scope: + for idx, layer_size in enumerate(hparams.cross_layer_sizes): + split_tensor = tf.split(hidden_nn_layers[-1], hparams.dim * [1], 2) + dot_result_m = tf.matmul(split_tensor0, split_tensor, transpose_b=True) + dot_result_o = tf.reshape(dot_result_m, shape=[hparams.dim, -1, field_nums[0]*field_nums[-1]]) + dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2]) + + if reduce_D: + hparams.logger.info("reduce_D") + filters0 = tf.get_variable("f0_" + str(idx), + shape=[1, layer_size, field_nums[0], f_dim], + dtype=tf.float32) + filters_ = tf.get_variable("f__" + str(idx), + shape=[1, layer_size, f_dim, field_nums[-1]], + dtype=tf.float32) + filters_m = tf.matmul(filters0, filters_) + filters_o = tf.reshape(filters_m, shape=[1, layer_size, field_nums[0] * field_nums[-1]]) + filters = tf.transpose(filters_o, perm=[0, 2, 1]) + else: + filters = tf.get_variable(name="f_"+str(idx), + shape=[1, 
field_nums[-1]*field_nums[0], layer_size], + dtype=tf.float32) + # dot_result = tf.transpose(dot_result, perm=[0, 2, 1]) + curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, padding='VALID') + + # BIAS ADD + if bias: + hparams.logger.info("bias") + b = tf.get_variable(name="f_b" + str(idx), + shape=[layer_size], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + curr_out = tf.nn.bias_add(curr_out, b) + self.cross_params.append(b) + self.layer_params.append(b) + + curr_out = self._activate(curr_out, hparams.cross_activation) + + curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) + + if direct: + hparams.logger.info("all direct connect") + direct_connect = curr_out + next_hidden = curr_out + final_len += layer_size + field_nums.append(int(layer_size)) + + else: + hparams.logger.info("split connect") + if idx != len(hparams.cross_layer_sizes) - 1: + next_hidden, direct_connect = tf.split(curr_out, 2 * [int(layer_size / 2)], 1) + final_len += int(layer_size / 2) + else: + direct_connect = curr_out + next_hidden = 0 + final_len += layer_size + field_nums.append(int(layer_size / 2)) + + final_result.append(direct_connect) + hidden_nn_layers.append(next_hidden) + + self.cross_params.append(filters) + self.layer_params.append(filters) + + result = tf.concat(final_result, axis=1) + result = tf.reduce_sum(result, -1) + if res: + hparams.logger.info("residual network") + w_nn_output1 = tf.get_variable(name='w_nn_output1', + shape=[final_len, 128], + dtype=tf.float32) + b_nn_output1 = tf.get_variable(name='b_nn_output1', + shape=[128], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + self.layer_params.append(w_nn_output1) + self.layer_params.append(b_nn_output1) + exFM_out0 = tf.nn.xw_plus_b(result, w_nn_output1, b_nn_output1) + exFM_out1 = self._active_layer(logit=exFM_out0, + scope=scope, + activation="relu", + layer_idx=0) + w_nn_output2 = tf.get_variable(name='w_nn_output2', + shape=[128 + final_len, 1], + dtype=tf.float32) + b_nn_output2 = tf.get_variable(name='b_nn_output2', + shape=[1], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + self.layer_params.append(w_nn_output2) + self.layer_params.append(b_nn_output2) + exFM_in = tf.concat([exFM_out1, result], axis=1, name="user_emb") + exFM_out = tf.nn.xw_plus_b(exFM_in, w_nn_output2, b_nn_output2) + + else: + hparams.logger.info("no residual network") + w_nn_output = tf.get_variable(name='w_nn_output', + shape=[final_len, 1], + dtype=tf.float32) + b_nn_output = tf.get_variable(name='b_nn_output', + shape=[1], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + self.layer_params.append(w_nn_output) + self.layer_params.append(b_nn_output) + exFM_out = tf.nn.xw_plus_b(result, w_nn_output, b_nn_output) + + return exFM_out + + def _build_extreme_FM_quick(self, hparams, nn_input): + hidden_nn_layers = [] + field_nums = [] + final_len = 0 + field_num = hparams.FIELD_COUNT + nn_input = tf.reshape(nn_input, shape=[-1, int(field_num), hparams.dim]) + field_nums.append(int(field_num)) + hidden_nn_layers.append(nn_input) + final_result = [] + split_tensor0 = tf.split(hidden_nn_layers[0], hparams.dim * [1], 2) + with tf.variable_scope("exfm_part", initializer=self.initializer) as scope: + for idx, layer_size in enumerate(hparams.cross_layer_sizes): + split_tensor = tf.split(hidden_nn_layers[-1], hparams.dim * [1], 2) + dot_result_m = tf.matmul(split_tensor0, split_tensor, transpose_b=True) + dot_result_o = tf.reshape(dot_result_m, shape=[hparams.dim, -1, field_nums[0]*field_nums[-1]]) + dot_result = 
tf.transpose(dot_result_o, perm=[1, 0, 2]) + + filters = tf.get_variable(name="f_"+str(idx), + shape=[1, field_nums[-1]*field_nums[0], layer_size], + dtype=tf.float32) + # dot_result = tf.transpose(dot_result, perm=[0, 2, 1]) + curr_out = tf.nn.conv1d(dot_result, filters=filters, stride=1, padding='VALID') + + + curr_out = tf.transpose(curr_out, perm=[0, 2, 1]) + + + hparams.logger.info("split connect") + if idx != len(hparams.cross_layer_sizes) - 1: + next_hidden, direct_connect = tf.split(curr_out, 2 * [int(layer_size / 2)], 1) + final_len += int(layer_size / 2) + else: + direct_connect = curr_out + next_hidden = 0 + final_len += layer_size + field_nums.append(int(layer_size / 2)) + + final_result.append(direct_connect) + hidden_nn_layers.append(next_hidden) + + self.cross_params.append(filters) + + result = tf.concat(final_result, axis=1) + result = tf.reduce_sum(result, -1) + + hparams.logger.info("no residual network") + w_nn_output = tf.get_variable(name='w_nn_output', + shape=[final_len, 1], + dtype=tf.float32) + b_nn_output = tf.get_variable(name='b_nn_output', + shape=[1], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + self.layer_params.append(w_nn_output) + self.layer_params.append(b_nn_output) + exFM_out = tf.nn.xw_plus_b(result, w_nn_output, b_nn_output) + + return exFM_out + + + def _build_dnn(self, hparams, embed_out, embed_layer_size): + """ + fm_sparse_index = tf.SparseTensor(self.iterator.dnn_feat_indices, + self.iterator.dnn_feat_values, + self.iterator.dnn_feat_shape) + fm_sparse_weight = tf.SparseTensor(self.iterator.dnn_feat_indices, + self.iterator.dnn_feat_weights, + self.iterator.dnn_feat_shape) + w_fm_nn_input_orgin = tf.nn.embedding_lookup_sparse(self.embedding, + fm_sparse_index, + fm_sparse_weight, + combiner="sum") + w_fm_nn_input = tf.reshape(w_fm_nn_input_orgin, [-1, hparams.dim * hparams.FIELD_COUNT]) + last_layer_size = hparams.FIELD_COUNT * hparams.dim + """ + w_fm_nn_input = embed_out + last_layer_size = embed_layer_size + layer_idx = 0 + hidden_nn_layers = [] + hidden_nn_layers.append(w_fm_nn_input) + with tf.variable_scope("nn_part", initializer=self.initializer) as scope: + for idx, layer_size in enumerate(hparams.layer_sizes): + curr_w_nn_layer = tf.get_variable(name='w_nn_layer' + str(layer_idx), + shape=[last_layer_size, layer_size], + dtype=tf.float32) + curr_b_nn_layer = tf.get_variable(name='b_nn_layer' + str(layer_idx), + shape=[layer_size], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + tf.summary.histogram("nn_part/" + 'w_nn_layer' + str(layer_idx), + curr_w_nn_layer) + tf.summary.histogram("nn_part/" + 'b_nn_layer' + str(layer_idx), + curr_b_nn_layer) + curr_hidden_nn_layer = tf.nn.xw_plus_b(hidden_nn_layers[layer_idx], + curr_w_nn_layer, + curr_b_nn_layer) + scope = "nn_part" + str(idx) + activation = hparams.activation[idx] + curr_hidden_nn_layer = self._active_layer(logit=curr_hidden_nn_layer, + scope=scope, + activation=activation, + layer_idx=idx) + hidden_nn_layers.append(curr_hidden_nn_layer) + layer_idx += 1 + last_layer_size = layer_size + self.layer_params.append(curr_w_nn_layer) + self.layer_params.append(curr_b_nn_layer) + + w_nn_output = tf.get_variable(name='w_nn_output', + shape=[last_layer_size, 1], + dtype=tf.float32) + b_nn_output = tf.get_variable(name='b_nn_output', + shape=[1], + dtype=tf.float32, + initializer=tf.zeros_initializer()) + tf.summary.histogram("nn_part/" + 'w_nn_output' + str(layer_idx), + w_nn_output) + tf.summary.histogram("nn_part/" + 'b_nn_output' + str(layer_idx), + 
b_nn_output) + self.layer_params.append(w_nn_output) + self.layer_params.append(b_nn_output) + nn_output = tf.nn.xw_plus_b(hidden_nn_layers[-1], w_nn_output, b_nn_output) + return nn_output diff --git a/examples/xDeepFM/train.py b/examples/xDeepFM/train.py new file mode 100644 index 00000000..2a9a0a31 --- /dev/null +++ b/examples/xDeepFM/train.py @@ -0,0 +1,304 @@ +"""define train, infer, eval, test process""" +import numpy as np +import os, time, collections +import tensorflow as tf +from IO.iterator import FfmIterator #, DinIterator, CCCFNetIterator +#from IO.din_cache import DinCache +from IO.ffm_cache import FfmCache +#from IO.cccfnet_cache import CCCFNetCache +#from src.deep_fm import DeepfmModel +#from src.deep_wide import DeepWideModel +#from src.fm import FmModel +#from src.dnn import DnnModel +#from src.opnn import OpnnModel +#from src.ipnn import IpnnModel +#from src.lr import LrModel +#from src.din import DinModel +#from src.cccfnet import CCCFModel +#from src.deepcross import DeepCrossModel +from src.exDeepFM import ExtremeDeepFMModel +from src.CIN import CINModel +#from src.cross import CrossModel +import utils.util as util +import utils.metric as metric +# from utils.log import Log + +# log = Log(hparams) + +class TrainModel(collections.namedtuple("TrainModel", ("graph", "model", "iterator", "filenames"))): + """define train class, include graph, model, iterator""" + pass + + +def create_train_model(model_creator, hparams, scope=None): + graph = tf.Graph() + with graph.as_default(): + # feed train file name, valid file name, or test file name + filenames = tf.placeholder(tf.string, shape=[None]) + #src_dataset = tf.contrib.data.TFRecordDataset(filenames) + src_dataset = tf.data.TFRecordDataset(filenames) + + if hparams.data_format == 'ffm': + batch_input = FfmIterator(src_dataset) + elif hparams.data_format == 'din': + batch_input = DinIterator(src_dataset) + elif hparams.data_format == 'cccfnet': + batch_input = CCCFNetIterator(src_dataset) + else: + raise ValueError("not support {0} format data".format(hparams.data_format)) + # build model + model = model_creator( + hparams, + iterator=batch_input, + scope=scope) + + return TrainModel( + graph=graph, + model=model, + iterator=batch_input, + filenames=filenames) + + +# run evaluation and get evaluted loss +def run_eval(load_model, load_sess, filename, sample_num_file, hparams, flag): + # load sample num + with open(sample_num_file, 'r') as f: + sample_num = int(f.readlines()[0].strip()) + load_sess.run(load_model.iterator.initializer, feed_dict={load_model.filenames: [filename]}) + preds = [] + labels = [] + while True: + try: + _, _, step_pred, step_labels = load_model.model.eval(load_sess) + preds.extend(np.reshape(step_pred, -1)) + labels.extend(np.reshape(step_labels, -1)) + except tf.errors.OutOfRangeError: + break + preds = preds[:sample_num] + labels = labels[:sample_num] + hparams.logger.info("data num:{0:d}".format(len(labels))) + res = metric.cal_metric(labels, preds, hparams, flag) + return res + + +# run infer +def run_infer(load_model, load_sess, filename, hparams, sample_num_file): + # load sample num + with open(sample_num_file, 'r') as f: + sample_num = int(f.readlines()[0].strip()) + if not os.path.exists(util.RES_DIR): + os.mkdir(util.RES_DIR) + load_sess.run(load_model.iterator.initializer, feed_dict={load_model.filenames: [filename]}) + preds = [] + while True: + try: + step_pred = load_model.model.infer(load_sess) + preds.extend(np.reshape(step_pred, -1)) + except tf.errors.OutOfRangeError: + break + 
preds = preds[:sample_num]
+    hparams.res_name = util.convert_res_name(hparams.infer_file)
+    # print('result name:', hparams.res_name)
+    with open(hparams.res_name, 'w') as out:
+        out.write('\n'.join(map(str, preds)))
+
+
+# cache data
+def cache_data(hparams, filename, flag):
+    if hparams.data_format == 'ffm':
+        cache_obj = FfmCache()
+    elif hparams.data_format == 'din':
+        cache_obj = DinCache()
+    elif hparams.data_format == 'cccfnet':
+        cache_obj = CCCFNetCache()
+    else:
+        raise ValueError(
+            "data format must be ffm, din or cccfnet; format {0} is not defined".format(hparams.data_format))
+    if not os.path.exists(util.CACHE_DIR):
+        os.mkdir(util.CACHE_DIR)
+    if flag == 'train':
+        hparams.train_file_cache = util.convert_cached_name(hparams.train_file, hparams.batch_size)
+        cached_name = hparams.train_file_cache
+        sample_num_path = util.TRAIN_NUM
+        impression_id_path = util.TRAIN_IMPRESSION_ID
+    elif flag == 'eval':
+        hparams.eval_file_cache = util.convert_cached_name(hparams.eval_file, hparams.batch_size)
+        cached_name = hparams.eval_file_cache
+        sample_num_path = util.EVAL_NUM
+        impression_id_path = util.EVAL_IMPRESSION_ID
+    elif flag == 'test':
+        hparams.test_file_cache = util.convert_cached_name(hparams.test_file, hparams.batch_size)
+        cached_name = hparams.test_file_cache
+        sample_num_path = util.TEST_NUM
+        impression_id_path = util.TEST_IMPRESSION_ID
+    elif flag == 'infer':
+        hparams.infer_file_cache = util.convert_cached_name(hparams.infer_file, hparams.batch_size)
+        cached_name = hparams.infer_file_cache
+        sample_num_path = util.INFER_NUM
+        impression_id_path = util.INFER_IMPRESSION_ID
+    else:
+        raise ValueError("flag must be train, eval, test or infer")
+    print('cache filename:', filename)
+    if not os.path.isfile(cached_name):
+        print('no cached file found, begin caching...')
+        start_time = time.time()
+        sample_num, impression_id_list = cache_obj.write_tfrecord(filename, cached_name, hparams)
+        util.print_time("caching file used time", start_time)
+        print("data sample num:{0}".format(sample_num))
+        with open(sample_num_path, 'w') as f:
+            f.write(str(sample_num) + '\n')
+        with open(impression_id_path, 'w') as f:
+            for impression_id in impression_id_list:
+                f.write(str(impression_id) + '\n')
+
+
+def train(hparams, scope=None, target_session=""):
+    params = hparams.values()
+    for key, val in params.items():
+        hparams.logger.info(str(key) + ':' + str(val))
+
+    print('load and cache data...')
+    if hparams.train_file is not None:
+        cache_data(hparams, hparams.train_file, flag='train')
+    if hparams.eval_file is not None:
+        cache_data(hparams, hparams.eval_file, flag='eval')
+    if hparams.test_file is not None:
+        cache_data(hparams, hparams.test_file, flag='test')
+    if hparams.infer_file is not None:
+        cache_data(hparams, hparams.infer_file, flag='infer')
+
+    if hparams.model_type == 'deepFM':
+        model_creator = DeepfmModel
+        print("run deepfm model!")
+    elif hparams.model_type == 'deepWide':
+        model_creator = DeepWideModel
+        print("run deepWide model!")
+    elif hparams.model_type == 'dnn':
+        print("run dnn model!")
+        model_creator = DnnModel
+    elif hparams.model_type == 'ipnn':
+        print("run ipnn model!")
+        model_creator = IpnnModel
+    elif hparams.model_type == 'opnn':
+        print("run opnn model!")
+        model_creator = OpnnModel
+    elif hparams.model_type == 'din':
+        print("run din model!")
+        model_creator = DinModel
+    elif hparams.model_type == 'fm':
+        print("run fm model!")
+        model_creator = FmModel
+    elif hparams.model_type == 'lr':
+        print("run lr model!")
+        model_creator = LrModel
+    elif
hparams.model_type == 'din': + print("run din model!") + model_creator = DinModel + elif hparams.model_type == 'cccfnet': + print("run cccfnet model!") + model_creator = CCCFModel + elif hparams.model_type == 'deepcross': + print("run deepcross model!") + model_creator = DeepCrossModel + elif hparams.model_type == 'exDeepFM': + print("run extreme deepFM model!") + model_creator = ExtremeDeepFMModel + elif hparams.model_type == 'cross': + print("run extreme cross model!") + model_creator = CrossModel + elif hparams.model_type == 'CIN': + print("run extreme cin model!") + model_creator = CINModel + + else: + raise ValueError("model type should be cccfnet, deepFM, deepWide, dnn, fm, lr, ipnn, opnn, din") + + # define train,eval,infer graph + # define train session, eval session, infer session + train_model = create_train_model(model_creator, hparams, scope) + gpuconfig = tf.ConfigProto() + gpuconfig.gpu_options.allow_growth = True + tf.set_random_seed(1234) + train_sess = tf.Session(target=target_session, graph=train_model.graph, config=gpuconfig) + + train_sess.run(train_model.model.init_op) + # load model from checkpoint + if not hparams.load_model_name is None: + checkpoint_path = hparams.load_model_name + try: + train_model.model.saver.restore(train_sess, checkpoint_path) + print('load model', checkpoint_path) + except: + raise IOError("Failed to find any matching files for {0}".format(checkpoint_path)) + print('total_loss = data_loss+regularization_loss, data_loss = {rmse or logloss ..}') + writer = tf.summary.FileWriter(util.SUMMARIES_DIR, train_sess.graph) + last_eval = 0 + for epoch in range(hparams.epochs): + step = 0 + train_sess.run(train_model.iterator.initializer, feed_dict={train_model.filenames: [hparams.train_file_cache]}) + epoch_loss = 0 + train_start = time.time() + train_load_time = 0 + while True: + try: + t1 = time.time() + step_result = train_model.model.train(train_sess) + t3 = time.time() + train_load_time += t3 - t1 + (_, step_loss, step_data_loss, summary) = step_result + writer.add_summary(summary, step) + epoch_loss += step_loss + step += 1 + if step % hparams.show_step == 0: + print('step {0:d} , total_loss: {1:.4f}, data_loss: {2:.4f}' \ + .format(step, step_loss, step_data_loss)) + except tf.errors.OutOfRangeError: + print('finish one epoch!') + break + train_end = time.time() + train_time = train_end - train_start + if epoch % hparams.save_epoch == 0: + checkpoint_path = train_model.model.saver.save( + sess=train_sess, + save_path=util.MODEL_DIR + 'epoch_' + str(epoch)) + # print(checkpoint_path) + train_res = dict() + train_res["loss"] = epoch_loss / step + eval_start = time.time() + # train_res = run_eval(train_model, train_sess, hparams.train_file_cache, util.TRAIN_NUM, hparams, flag='train') + eval_res = run_eval(train_model, train_sess, hparams.eval_file_cache, util.EVAL_NUM, hparams, flag='eval') + train_info = ', '.join( + [str(item[0]) + ':' + str(item[1]) + for item in sorted(train_res.items(), key=lambda x: x[0])]) + eval_info = ', '.join( + [str(item[0]) + ':' + str(item[1]) + for item in sorted(eval_res.items(), key=lambda x: x[0])]) + if hparams.test_file is not None: + test_res = run_eval(train_model, train_sess, hparams.test_file_cache, util.TEST_NUM, hparams, flag='test') + test_info = ', '.join( + [str(item[0]) + ':' + str(item[1]) + for item in sorted(test_res.items(), key=lambda x: x[0])]) + eval_end = time.time() + eval_time = eval_end - eval_start + if hparams.test_file is not None: + print('at epoch {0:d}'.format( + epoch) + ' train info: 
' + train_info + ' eval info: ' + eval_info + ' test info: ' + test_info) + hparams.logger.info('at epoch {0:d}'.format( + epoch) + ' train info: ' + train_info + ' eval info: ' + eval_info + ' test info: ' + test_info) + else: + print('at epoch {0:d}'.format(epoch) + ' train info: ' + train_info + ' eval info: ' + eval_info) + hparams.logger.info('at epoch {0:d}'.format(epoch) + ' train info: ' + train_info + ' eval info: ' + eval_info) + print('at epoch {0:d} , train time: {1:.1f} eval time: {2:.1f}'.format(epoch, train_time, eval_time)) + + hparams.logger.info('at epoch {0:d} , train time: {1:.1f} eval time: {2:.1f}' \ + .format(epoch, train_time, eval_time)) + hparams.logger.info('\n') + + if eval_res["auc"] - last_eval < - 0.003: + break + if eval_res["auc"] > last_eval: + last_eval = eval_res["auc"] + writer.close() + # after train,run infer + if hparams.infer_file is not None: + run_infer(train_model, train_sess, hparams.infer_file_cache, hparams, util.INFER_NUM) diff --git a/examples/xDeepFM/utils/log.py b/examples/xDeepFM/utils/log.py new file mode 100644 index 00000000..a3fa891f --- /dev/null +++ b/examples/xDeepFM/utils/log.py @@ -0,0 +1,20 @@ +"""define logging configure""" +import logging +from datetime import datetime, timedelta, timezone +import platform + +__all__ = ["Log"] +class Log(object): + def __init__(self, hparams): + # UTC To Beijing Time + utc_dt = datetime.utcnow().replace(tzinfo=timezone.utc) + bj_dt = utc_dt.astimezone(timezone(timedelta(hours=8))) + + logging_filename = "logs/"+hparams.log + '__' + bj_dt.strftime('%Y-%m-%d_%H_%M_%S') + '.log' + self.logger = logging.getLogger(__name__) + self.logger.setLevel(logging.INFO) + handler = logging.FileHandler(logging_filename) + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(message)s') + handler.setFormatter(formatter) + self.logger.addHandler(handler) diff --git a/examples/xDeepFM/utils/metric.py b/examples/xDeepFM/utils/metric.py new file mode 100644 index 00000000..c2749da1 --- /dev/null +++ b/examples/xDeepFM/utils/metric.py @@ -0,0 +1,97 @@ +"""define metrics""" +from collections import defaultdict +from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error +import numpy as np +import utils.util as util + + +def cal_metric(labels, preds, hparams, flag): + """Calculate metrics,such as auc, logloss, group auc""" + res = {} + + def load_impression_id(file_name): + """load impression id, such as user id, news id""" + id_list = [] + with open(file_name, 'r') as f_in: + for line in f_in: + id_list.append(line.strip()) + return id_list + + for metric in hparams.metrics: + if metric == 'auc': + auc = roc_auc_score(np.asarray(labels), np.asarray(preds)) + res['auc'] = round(auc, 4) + elif metric == 'rmse': + rmse = mean_squared_error(np.asarray(labels), np.asarray(preds)) + res['rmse'] = np.sqrt(round(rmse, 4)) + elif metric == 'logloss': + # avoid logloss nan + preds = [max(min(p, 1. 
- 10e-12), 10e-12) for p in preds]
+            logloss = log_loss(np.asarray(labels), np.asarray(preds))
+            res['logloss'] = round(logloss, 4)
+        elif metric == 'group_auc':
+            if flag == 'train':
+                impression_id_list = load_impression_id(util.TRAIN_IMPRESSION_ID)
+                if len(impression_id_list) == 0:
+                    raise ValueError("train data does not have impressionId," \
+                                     "so the group auc can not be calculated!")
+                group_auc = cal_group_auc(labels, preds, impression_id_list)
+                res['group_auc'] = group_auc
+            elif flag == 'eval':
+                impression_id_list = load_impression_id(util.EVAL_IMPRESSION_ID)
+                if len(impression_id_list) == 0:
+                    raise ValueError("eval data does not have impressionId," \
+                                     "so the group auc can not be calculated!")
+                group_auc = cal_group_auc(labels, preds, impression_id_list)
+                res['group_auc'] = group_auc
+            elif flag == 'test':
+                impression_id_list = load_impression_id(util.INFER_IMPRESSION_ID)
+                if len(impression_id_list) == 0:
+                    raise ValueError("infer data does not have impressionId," \
+                                     "so the group auc can not be calculated!")
+                group_auc = cal_group_auc(labels, preds, impression_id_list)
+                res['group_auc'] = group_auc
+            else:
+                raise ValueError("cal metric dataSet should be train, eval or test")
+
+        else:
+            raise ValueError("metric {0} is not defined".format(metric))
+    return res
+
+
+def cal_group_auc(labels, preds, impression_id_list):
+    """Calculate group auc"""
+    if len(impression_id_list) != len(labels):
+        raise ValueError(
+            "impression id num should equal the sample num," \
+            "impression id num is {0}".format(len(impression_id_list)))
+    group_score = defaultdict(lambda: [])
+    group_truth = defaultdict(lambda: [])
+    for idx, truth in enumerate(labels):
+        user_id = impression_id_list[idx]
+        score = preds[idx]
+        truth = labels[idx]
+        group_score[user_id].append(score)
+        group_truth[user_id].append(truth)
+
+    group_flag = defaultdict(lambda: False)
+    for user_id in set(impression_id_list):
+        truths = group_truth[user_id]
+        flag = False
+        for i in range(len(truths) - 1):
+            if truths[i] != truths[i + 1]:
+                flag = True
+                break
+        group_flag[user_id] = flag
+
+    impression_total = 0
+    total_auc = 0
+    #
+    for user_id in group_flag:
+        if group_flag[user_id]:
+            auc = roc_auc_score(np.asarray(group_truth[user_id]), np.asarray(group_score[user_id]))
+            total_auc += auc * len(group_truth[user_id])
+            impression_total += len(group_truth[user_id])
+    group_auc = float(total_auc) / impression_total
+    group_auc = round(group_auc, 4)
+    return group_auc
diff --git a/examples/xDeepFM/utils/util.py b/examples/xDeepFM/utils/util.py
new file mode 100644
index 00000000..36e2ab10
--- /dev/null
+++ b/examples/xDeepFM/utils/util.py
@@ -0,0 +1,83 @@
+"""define util function and global variable"""
+import tensorflow as tf
+import os, sys
+import time, yaml
+from packaging import version
+
+RES_DIR = './res/'
+CACHE_DIR = './cache/'
+MODEL_DIR = './checkpoint/'
+CONFIG_DIR = './config/'
+TRAIN_YAML = 'network.yaml'
+TRAIN_NUM = './cache/train_num.csv'
+EVAL_NUM = './cache/eval_num.csv'
+TEST_NUM = './cache/test_num.csv'
+INFER_NUM = './cache/infer_num.csv'
+LOG_DIR = './logs/'
+FEAT_COUNT_FILE = './cache/feat_cnt.csv'
+TRAIN_IMPRESSION_ID = './cache/train_impressionId.csv'
+EVAL_IMPRESSION_ID = './cache/eval_impressionId.csv'
+TEST_IMPRESSION_ID = './cache/test_impressionId.csv'
+INFER_IMPRESSION_ID = './cache/infer_impressionId.csv'
+SUMMARIES_DIR = './logs/'
+# define din format feature
+DIN_FORMAT_SPLIT = '#'
+# split feature and userid
+USER_ID_SPLIT = '%'
+
+
+def check_and_mkdir():
+    def make_dir(DIR):
+        if not os.path.exists(DIR):
+            os.mkdir(DIR)
+
+    make_dir(RES_DIR)
+    make_dir(CACHE_DIR)
+    make_dir(MODEL_DIR)
+    make_dir(CONFIG_DIR)
+    make_dir(LOG_DIR)
+
+
+def check_tensorflow_version():
+    if version.parse(tf.__version__) < version.parse("1.2.0"):
+        raise EnvironmentError("Tensorflow version must be >= 1.2.0, but version is {0}". \
+                               format(tf.__version__))
+
+
+def print_time(s, start_time):
+    """Take a start time, print elapsed duration, and return a new time."""
+    print("%s, %ds, %s." % (s, (time.time() - start_time), time.ctime()))
+    sys.stdout.flush()
+    return time.time()
+
+
+def check_file_exist(filename):
+    if not os.path.isfile(filename):
+        raise ValueError("{0} does not exist".format(filename))
+
+
+def load_yaml_file(filename):
+    with open(filename) as f:
+        try:
+            config = yaml.load(f)
+        except:
+            raise IOError("load {0} error!".format(filename))
+    return config
+
+
+def convert_cached_name(file_name, batch_size):
+    prefix = CACHE_DIR + 'batch_size_' + str(batch_size) + '_'
+    prefix += (file_name.strip().split('/'))[-1]
+    train_cache_name = prefix.replace(".txt", ".tfrecord"). \
+        replace(".csv", ".tfrecord"). \
+        replace(".libsvm", ".tfrecord")
+    return train_cache_name
+
+
+def convert_res_name(file_name):
+    prefix = RES_DIR
+    inferfile = file_name.split('/')[-1]
+    res_name = prefix + inferfile.replace("tfrecord", "res.csv"). \
+        replace(".csv", ".tfrecord"). \
+        replace(".libsvm", ".tfrecord")
+    return res_name
-- 
Gitee

From 774444030a4dcca59595eedf8a3843e985acc1aa Mon Sep 17 00:00:00 2001
From: yxy1684 <2270320041@qq.com>
Date: Fri, 31 May 2024 06:28:44 +0000
Subject: [PATCH 182/302] =?UTF-8?q?!165=20xdeepFM=20CANN=E8=BD=AC=E6=8D=A2?=
 =?UTF-8?q?=20*=20xdeepFM=20CANN=E8=BD=AC=E6=8D=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/xDeepFM/IO/base_cache.py  | 2 ++
 examples/xDeepFM/IO/ffm_cache.py   | 2 ++
 examples/xDeepFM/IO/iterator.py    | 2 ++
 examples/xDeepFM/main.py           | 2 ++
 examples/xDeepFM/src/base_model.py | 4 +++-
 examples/xDeepFM/src/exDeepFM.py   | 2 ++
 examples/xDeepFM/train.py          | 4 +++-
 examples/xDeepFM/utils/log.py      | 2 ++
 examples/xDeepFM/utils/metric.py   | 2 ++
 examples/xDeepFM/utils/util.py     | 2 ++
 10 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/examples/xDeepFM/IO/base_cache.py b/examples/xDeepFM/IO/base_cache.py
index 11187de9..d0d8b5fc 100644
--- a/examples/xDeepFM/IO/base_cache.py
+++ b/examples/xDeepFM/IO/base_cache.py
@@ -1,4 +1,5 @@
 """define abstract base class"""
+from npu_bridge.npu_init import *
 import abc
 
 __all__ = ["BaseCache"]
@@ -11,3 +12,4 @@ class BaseCache(object):
     def write_tfrecord(self, infile, outfile, hparams):
         """Subclass must implement this."""
         pass
+
diff --git a/examples/xDeepFM/IO/ffm_cache.py b/examples/xDeepFM/IO/ffm_cache.py
index 1f3d505c..9a694d8f 100644
--- a/examples/xDeepFM/IO/ffm_cache.py
+++ b/examples/xDeepFM/IO/ffm_cache.py
@@ -1,4 +1,5 @@
 """define FfmCache class for cache the format dataset"""
+from npu_bridge.npu_init import *
 from IO.base_cache import BaseCache
 import tensorflow as tf
 import numpy as np
@@ -160,3 +161,4 @@ class FfmCache(BaseCache):
         for item in sort_feature_cnt:
             f.write(str(item[0]) + ',' + str(item[1]) + '\n')
         return sample_num, impression_id_list
+
diff --git a/examples/xDeepFM/IO/iterator.py b/examples/xDeepFM/IO/iterator.py
index c7e50032..b044b3b0 100644
--- a/examples/xDeepFM/IO/iterator.py
+++ b/examples/xDeepFM/IO/iterator.py
@@ -1,4 +1,5 @@
 """define iterator"""
+from npu_bridge.npu_init import *
 import collections
 import tensorflow as tf
 import abc
@@ -205,3
+206,4 @@ class CCCFNetIterator(BaseIterator): return labels, userIds, itemIds, \ user_profiles_indices, user_profiles_values, user_profiles_weights, user_profiles_shape, \ item_profiles_indices, item_profiles_values, item_profiles_weights, item_profiles_shape + diff --git a/examples/xDeepFM/main.py b/examples/xDeepFM/main.py index 265faca9..f0b93bd6 100644 --- a/examples/xDeepFM/main.py +++ b/examples/xDeepFM/main.py @@ -1,4 +1,5 @@ """This script parse and run train function""" +from npu_bridge.npu_init import * import train import utils.util as util import tensorflow as tf @@ -185,3 +186,4 @@ def main(): main() + diff --git a/examples/xDeepFM/src/base_model.py b/examples/xDeepFM/src/base_model.py index 320f0876..5481012f 100644 --- a/examples/xDeepFM/src/base_model.py +++ b/examples/xDeepFM/src/base_model.py @@ -1,4 +1,5 @@ """define base class model""" +from npu_bridge.npu_init import * import abc import math import tensorflow as tf @@ -177,7 +178,7 @@ class BaseModel(object): raise ValueError("this activations not defined {0}".format(activation)) def _dropout(self, logit, layer_idx): - logit = tf.nn.dropout(x=logit, keep_prob=self.layer_keeps[layer_idx]) + logit = npu_ops.dropout(x=logit, keep_prob=self.layer_keeps[layer_idx]) return logit def train(self, sess): @@ -191,3 +192,4 @@ class BaseModel(object): def infer(self, sess): return sess.run([self.pred], \ feed_dict={self.layer_keeps: self.keep_prob_test}) + diff --git a/examples/xDeepFM/src/exDeepFM.py b/examples/xDeepFM/src/exDeepFM.py index 7167d460..b8d235b7 100644 --- a/examples/xDeepFM/src/exDeepFM.py +++ b/examples/xDeepFM/src/exDeepFM.py @@ -1,4 +1,5 @@ """define Factorization-Machine based Neural Network Model""" +from npu_bridge.npu_init import * import math import numpy as np import tensorflow as tf @@ -407,3 +408,4 @@ class ExtremeDeepFMModel(BaseModel): self.layer_params.append(b_nn_output) nn_output = tf.nn.xw_plus_b(hidden_nn_layers[-1], w_nn_output, b_nn_output) return nn_output + diff --git a/examples/xDeepFM/train.py b/examples/xDeepFM/train.py index 2a9a0a31..1c434194 100644 --- a/examples/xDeepFM/train.py +++ b/examples/xDeepFM/train.py @@ -1,4 +1,5 @@ """define train, infer, eval, test process""" +from npu_bridge.npu_init import * import numpy as np import os, time, collections import tensorflow as tf @@ -219,7 +220,7 @@ def train(hparams, scope=None, target_session=""): gpuconfig = tf.ConfigProto() gpuconfig.gpu_options.allow_growth = True tf.set_random_seed(1234) - train_sess = tf.Session(target=target_session, graph=train_model.graph, config=gpuconfig) + train_sess = tf.Session(target=target_session, graph=train_model.graph, config=npu_config_proto(config_proto=gpuconfig)) train_sess.run(train_model.model.init_op) # load model from checkpoint @@ -302,3 +303,4 @@ def train(hparams, scope=None, target_session=""): # after train,run infer if hparams.infer_file is not None: run_infer(train_model, train_sess, hparams.infer_file_cache, hparams, util.INFER_NUM) + diff --git a/examples/xDeepFM/utils/log.py b/examples/xDeepFM/utils/log.py index a3fa891f..9b0c2c92 100644 --- a/examples/xDeepFM/utils/log.py +++ b/examples/xDeepFM/utils/log.py @@ -1,4 +1,5 @@ """define logging configure""" +from npu_bridge.npu_init import * import logging from datetime import datetime, timedelta, timezone import platform @@ -18,3 +19,4 @@ class Log(object): formatter = logging.Formatter('%(message)s') handler.setFormatter(formatter) self.logger.addHandler(handler) + diff --git a/examples/xDeepFM/utils/metric.py 
b/examples/xDeepFM/utils/metric.py index c2749da1..2a44b47b 100644 --- a/examples/xDeepFM/utils/metric.py +++ b/examples/xDeepFM/utils/metric.py @@ -1,4 +1,5 @@ """define metrics""" +from npu_bridge.npu_init import * from collections import defaultdict from sklearn.metrics import roc_auc_score, log_loss, mean_squared_error import numpy as np @@ -95,3 +96,4 @@ def cal_group_auc(labels, preds, impression_id_list): group_auc = float(total_auc) / impression_total group_auc = round(group_auc, 4) return group_auc + diff --git a/examples/xDeepFM/utils/util.py b/examples/xDeepFM/utils/util.py index 36e2ab10..7a52b6a5 100644 --- a/examples/xDeepFM/utils/util.py +++ b/examples/xDeepFM/utils/util.py @@ -1,4 +1,5 @@ """define util function and global variable""" +from npu_bridge.npu_init import * import tensorflow as tf import os, sys import time, yaml @@ -81,3 +82,4 @@ def convert_res_name(file_name): replace(".csv", ".tfrecord"). \ replace(".libsvm", ".tfrecord") return res_name + -- Gitee From 2c68138ebd7d64be3625d33e1ed1e5770c409914 Mon Sep 17 00:00:00 2001 From: yxy1684 <2270320041@qq.com> Date: Fri, 31 May 2024 09:35:27 +0000 Subject: [PATCH 183/302] =?UTF-8?q?!162=20xDeepFM=E8=BF=81=E7=A7=BB=20*=20?= =?UTF-8?q?xdeepFM=20*=20xdeepFM=20*=20xdeepFM=20*=20xdeepFM=20*=20xdeepFM?= =?UTF-8?q?=20*=20xdeepFM=20*=20xdeepFM=20*=20xdeepFM=20*=20xdeepFM=20*=20?= =?UTF-8?q?xdeepFM=20*=20xdeepFM=20*=20xdeepFM=20*=20xdeepFM=20*=20xdeepFM?= =?UTF-8?q?=20*=20xdeepFM=20*=20Merge=20branch=20'develop'=20of=20https://?= =?UTF-8?q?gitee.com/ascend/mxrec=20into=20xdeepfm=5Fdevelop=20*=20xdeepFM?= =?UTF-8?q?=20*=20exdeepfm=20*=20exdeepfm=20*=20exdeepfm=20*=20exdeepfm=20?= =?UTF-8?q?*=20exdeepfm=20*=20exdeepfm=20*=20exdeepfm=20*=20exdeepfm=20*?= =?UTF-8?q?=20exdeepfm=20*=20exdeepfm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/xDeepFM/IO/iterator.py | 31 ++- examples/xDeepFM/README.md | 296 +++++++++++++++++++++++++++++ examples/xDeepFM/main.py | 12 +- examples/xDeepFM/run.sh | 130 +++++++++++++ examples/xDeepFM/src/base_model.py | 8 +- examples/xDeepFM/src/exDeepFM.py | 22 ++- examples/xDeepFM/train.py | 60 +++--- examples/xDeepFM/utils/util.py | 2 +- 8 files changed, 505 insertions(+), 56 deletions(-) create mode 100644 examples/xDeepFM/README.md create mode 100644 examples/xDeepFM/run.sh diff --git a/examples/xDeepFM/IO/iterator.py b/examples/xDeepFM/IO/iterator.py index b044b3b0..ad6a9145 100644 --- a/examples/xDeepFM/IO/iterator.py +++ b/examples/xDeepFM/IO/iterator.py @@ -27,18 +27,16 @@ class FfmIterator(BaseIterator): src_dataset = src_dataset.map(self.parser) # src_dataset = src_dataset.shuffle(buffer_size=BUFFER_SIZE) iterator = src_dataset.make_initializable_iterator() - _fm_feat_indices, _fm_feat_values, \ - _fm_feat_shape, _labels, _dnn_feat_indices, \ - _dnn_feat_values, _dnn_feat_weights, _dnn_feat_shape = iterator.get_next() + batch = iterator.get_next() self.initializer = iterator.initializer - self.fm_feat_indices = _fm_feat_indices - self.fm_feat_values = _fm_feat_values - self.fm_feat_shape = _fm_feat_shape - self.labels = _labels - self.dnn_feat_indices = _dnn_feat_indices - self.dnn_feat_values = _dnn_feat_values - self.dnn_feat_weights = _dnn_feat_weights - self.dnn_feat_shape = _dnn_feat_shape + self.fm_feat_indices = batch.get('fm_feat_indices') + self.fm_feat_values = batch.get('fm_feat_values') + self.fm_feat_shape = batch.get('fm_feat_shape') + self.labels = batch.get('labels') + self.dnn_feat_indices = batch.get('dnn_feat_indices') + 
self.dnn_feat_values = batch.get('dnn_feat_values')
+        self.dnn_feat_weights = batch.get('dnn_feat_weights')
+        self.dnn_feat_shape = batch.get('dnn_feat_shape')
 
     def parser(self, record):
         keys_to_features = {
@@ -60,9 +58,11 @@ class FfmIterator(BaseIterator):
         dnn_feat_values = tf.sparse_tensor_to_dense(parsed['dnn_feat_values'])
         dnn_feat_weights = tf.sparse_tensor_to_dense(parsed['dnn_feat_weights'])
         dnn_feat_shape = parsed['dnn_feat_shape']
-        return fm_feat_indices, fm_feat_values, \
-               fm_feat_shape, labels, dnn_feat_indices, \
-               dnn_feat_values, dnn_feat_weights, dnn_feat_shape
+        return {
+            'fm_feat_indices': fm_feat_indices, 'fm_feat_values': fm_feat_values, 'fm_feat_shape': fm_feat_shape,
+            'labels': labels, 'dnn_feat_indices': dnn_feat_indices, 'dnn_feat_values': dnn_feat_values,
+            'dnn_feat_weights': dnn_feat_weights, 'dnn_feat_shape': dnn_feat_shape
+        }
 
 
 class DinIterator(BaseIterator):
@@ -205,5 +205,4 @@ class CCCFNetIterator(BaseIterator):
         return labels, userIds, itemIds, \
                user_profiles_indices, user_profiles_values, user_profiles_weights, user_profiles_shape, \
-               item_profiles_indices, item_profiles_values, item_profiles_weights, item_profiles_shape
-
+               item_profiles_indices, item_profiles_values, item_profiles_weights, item_profiles_shape
\ No newline at end of file
diff --git a/examples/xDeepFM/README.md b/examples/xDeepFM/README.md
new file mode 100644
index 00000000..d9a93744
--- /dev/null
+++ b/examples/xDeepFM/README.md
@@ -0,0 +1,296 @@
+# xDeepFM Migration Example
+
+## Reference open-source links for the model
+
+1. https://github.com/Leavingseason/xDeepFM
+
+2. Commits on Oct 15, 2018, SHA-1 hash of the commit (commit ID): 114c4c45b1cb6144b2540f92a2b357c3f445e98e
+
+3. Only the code and files required for execution are kept; everything else has been removed.
+4. The config/network.yaml configuration file and data files such as data/dnn/infer.userid.txt and res/infer.userid.txt must be downloaded from the open-source link and imported by the user.
+
+## Migrating to NPU
+
+Please refer to the CANN commercial edition documentation in the Ascend community and first use the migration tool to perform automatic NPU migration: https://www.hiascend.com/document/detail/zh/canncommercial/700/modeldev/tfmigr1/tfmigr1_000009.html
+
+
+## Migrating to mxRec
+
+1. Modify IO/iterator.py: change lines 30-41
+
+
+```python
+    _fm_feat_indices, _fm_feat_values,
+    _fm_feat_shape, _labels, _dnn_feat_indices,
+    _dnn_feat_values, _dnn_feat_weights, _dnn_feat_shape = iterator.get_next()
+    self.initializer = iterator.initializer
+    self.fm_feat_indices = _fm_feat_indices
+    self.fm_feat_values = _fm_feat_values
+    self.fm_feat_shape = _fm_feat_shape
+    self.labels = _labels
+    self.dnn_feat_indices = _dnn_feat_indices
+    self.dnn_feat_values = _dnn_feat_values
+    self.dnn_feat_weights = _dnn_feat_weights
+    self.dnn_feat_shape = _dnn_feat_shape
+```
+` ` ` `to:
+```python
+    batch = iterator.get_next()
+    self.initializer = iterator.initializer
+    self.fm_feat_indices = batch.get('fm_feat_indices')
+    self.fm_feat_values = batch.get('fm_feat_values')
+    self.fm_feat_shape = batch.get('fm_feat_shape')
+    self.labels = batch.get('labels')
+    self.dnn_feat_indices = batch.get('dnn_feat_indices')
+    self.dnn_feat_values = batch.get('dnn_feat_values')
+    self.dnn_feat_weights = batch.get('dnn_feat_weights')
+    self.dnn_feat_shape = batch.get('dnn_feat_shape')
+```
+
+` ` ` `and change lines 63-65
+```python
+    return fm_feat_indices, fm_feat_values,
+    fm_feat_shape, labels, dnn_feat_indices,
+    dnn_feat_values, dnn_feat_weights, dnn_feat_shape
+```
+` ` ` `to:
+```python
+    return {
+        'fm_feat_indices': fm_feat_indices, 'fm_feat_values': fm_feat_values, 'fm_feat_shape': fm_feat_shape,
+        'labels': labels, 'dnn_feat_indices': dnn_feat_indices, 'dnn_feat_values': dnn_feat_values,
+        'dnn_feat_weights': dnn_feat_weights, 'dnn_feat_shape': dnn_feat_shape
+    }
+```
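+
+A short aside (added for clarity; an illustrative sketch, not part of the original repository): returning a dict keyed by feature name, instead of a positional tuple, is what lets mxRec's automatic graph modification rebuild the input pipeline and lets downstream code look tensors up by name. The following self-contained TF1-style snippet only demonstrates the dict-returning iterator pattern; the feature names and values here are hypothetical:
+```python
+import tensorflow as tf
+
+# Build a tiny dataset whose elements are dicts of tensors.
+dataset = tf.data.Dataset.from_tensor_slices({
+    'labels': [[1.0], [0.0]],
+    'fm_feat_values': [[0.5], [0.3]],
+}).batch(1)
+iterator = dataset.make_initializable_iterator()
+batch = iterator.get_next()  # a dict of tensors, keyed by feature name
+
+with tf.Session() as sess:
+    sess.run(iterator.initializer)
+    print(sess.run(batch.get('labels')))  # [[1.]]
+```
+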
+2. Modify src/base_model.py. Set the embedding initializer to tf.zeros_initializer(): change line 84
+```python
+            return tf.truncated_normal_initializer(stddev=hparams.init_value)
+```
+` ` ` `to (to allow a like-for-like comparison with CPU, make the same change in the original xDeepFM source code):
+```python
+            return tf.zeros_initializer()
+```
+
+` ` ` `Update how the label of each batch is fetched from the new dataset generated in automatic graph-modification mode: change lines 188-189
+```python
+    def eval(self, sess):
+        return sess.run([self.loss, self.data_loss, self.pred, self.iterator.labels], \
+```
+` ` ` `to:
+```python
+    def eval(self, sess, eval_label):
+        return sess.run([self.loss, self.data_loss, self.pred, eval_label], \
+```
+
+3. Modify src/exDeepFM.py. Add at line 6
+```python
+from mx_rec.core.embedding import create_table
+from mx_rec.core.embedding import sparse_lookup
+```
+` ` ` `and change lines 40-43
+```python
+            w_fm_nn_input_orgin = tf.nn.embedding_lookup_sparse(self.embedding,
+                                                                fm_sparse_index,
+                                                                fm_sparse_weight,
+                                                                combiner="sum")
+```
+` ` ` `to:
+```python
+            dense_indices = tf.sparse.to_dense(fm_sparse_index, default_value=0)
+            dense_weights = tf.sparse.to_dense(fm_sparse_weight, default_value=0)
+
+            sparse_hashtable = create_table(key_dtype=tf.int32,
+                                            dim=tf.TensorShape([hparams.dim]),
+                                            name='sparse_embeddings_table',
+                                            emb_initializer=tf.zeros_initializer(),
+                                            device_vocabulary_size=hparams.FEATURE_COUNT,
+                                            host_vocabulary_size=0
+                                            )
+            embedded_values = sparse_lookup(sparse_hashtable,
+                                            dense_indices,
+                                            is_train=True,
+                                            name="sparse_embeddings",
+                                            modify_graph=True)
+            w_fm_nn_input_orgin = tf.reduce_sum(embedded_values * tf.expand_dims(dense_weights, axis=-1), axis=1)
+```
+
+4. Modify main.py. Add at line 176
+```python
+    # init
+    from mx_rec.util.initialize import init
+    init(use_dynamic=True,
+         use_dynamic_expansion=False)
+```
+
+5. Modify train.py. Change lines 35-57
+```python
+    graph = tf.Graph()
+    with graph.as_default():
+        # feed train file name, valid file name, or test file name
+        filenames = tf.placeholder(tf.string, shape=[None])
+        #src_dataset = tf.contrib.data.TFRecordDataset(filenames)
+        src_dataset = tf.data.TFRecordDataset(filenames)
+
+        if hparams.data_format == 'ffm':
+            batch_input = FfmIterator(src_dataset)
+        elif hparams.data_format == 'din':
+            batch_input = DinIterator(src_dataset)
+        elif hparams.data_format == 'cccfnet':
+            batch_input = CCCFNetIterator(src_dataset)
+        else:
+            raise ValueError("not support {0} format data".format(hparams.data_format))
+        # build model
+        model = model_creator(
+            hparams,
+            iterator=batch_input,
+            scope=scope)
+
+        return TrainModel(
+            graph=graph,
+```
+` ` ` `to:
+```python
+    # feed train file name, valid file name, or test file name
+    filenames = tf.placeholder(tf.string, shape=[None])
+    # src_dataset = tf.contrib.data.TFRecordDataset(filenames)
+    src_dataset = tf.data.TFRecordDataset(filenames)
+
+    if hparams.data_format == 'ffm':
+        batch_input = FfmIterator(src_dataset)
+    elif hparams.data_format == 'din':
+        batch_input = DinIterator(src_dataset)
+    elif hparams.data_format == 'cccfnet':
+        batch_input = CCCFNetIterator(src_dataset)
+    else:
+        raise ValueError("not support {0} format data".format(hparams.data_format))
+    # build model
+    model = model_creator(
+        hparams,
+        iterator=batch_input,
+        scope=scope)
+
+    return TrainModel(
+        graph=tf.get_default_graph(),
+```
+` ` ` `and change lines 68-73
+```python
+    load_sess.run(load_model.iterator.initializer, feed_dict={load_model.filenames: [filename]})
+    preds = []
+    labels = []
+    while True:
+        try:
+            _, _, step_pred, step_labels = load_model.model.eval(load_sess)
+```
+` ` ` `to:
+```python
+    from mx_rec.util.initialize import ConfigInitializer
ConfigInitializer.get_instance().train_params_config.get_target_batch(True).get("labels") + initializer = ConfigInitializer.get_instance().train_params_config.get_initializer(True) + load_sess.run(initializer, feed_dict={load_model.filenames: [filename]}) + preds = [] + labels = [] + while True: + try: + _, _, step_pred, step_labels = load_model.model.eval(load_sess, eval_label) +``` + +` ` ` `在第223行添加 +```python + from mx_rec.graph.modifier import modify_graph_and_start_emb_cache + modify_graph_and_start_emb_cache(dump_graph=True) +``` +` ` ` `把第239行 +```python + train_sess.run(train_model.iterator.initializer, feed_dict={train_model.filenames: [hparams.train_file_cache]}) +``` +` ` ` `改为: +```python + from mx_rec.util.initialize import ConfigInitializer + initializer = ConfigInitializer.get_instance().train_params_config.get_initializer(True) + train_sess.run(initializer, feed_dict={train_model.filenames: [hparams.train_file_cache]}) +``` +6、为了适配mxRec运行环境,添加了run.sh。 + +## 适配其他代码 + +1、修改utils/util.py。把第63行 + + +```python + config = yaml.load(f) +``` +` ` ` `改为(为了xDeepFM源代码在CPU上能跑通,这里也要一起修改): +```python + config = yaml.safe_load(f) +``` + +2、由于去掉了无关代码src/CIN.py,修改main.py适配。把第156~158行 + +```python + 'opnn', 'fm', 'lr', 'din', 'cccfnet', 'deepcross', 'exDeepFM', "cross", "CIN"]: + raise ValueError( + "model type must be cccfnet, deepFM, deepWide, dnn, ipnn, opnn, fm, lr, din, deepcross, exDeepFM, cross, CIN but you set is {0}".format( +``` +` ` ` `改为: +```python + 'opnn', 'fm', 'lr', 'din', 'cccfnet', 'deepcross', 'exDeepFM', "cross"]: + raise ValueError( + "model type must be cccfnet, deepFM, deepWide, dnn, ipnn, opnn, fm, lr, din, deepcross, exDeepFM, " + "cross, but you set is {0}".format(config['model']['model_type'])) +``` + +` ` ` `修改train.py适配。删除第21行代码 +```python +from src.CIN import CINModel +``` + +` ` ` `删除第210~212行代码 +```python + elif hparams.model_type == 'CIN': + print("run extreme cin model!") + model_creator = CINModel +``` + +## 运行命令 +```shell +bash run.sh main.py 10.10.10.10 +``` +其中,10.10.10.10为服务器IP,请替换成对应服务器IP。 + +## 验证结果 +1、CPU: +```log +step 1 , total_loss: 0.6931, data_loss: 0.6931 +step 2 , total_loss: 0.6905, data_loss: 0.6905 +finish one epoch! +at epoch 0 train info: loss:0.6918214857578278 eval info: auc:0.4867, logloss:0.6865 test info: auc:0.4867, logloss:0.6865 +at epoch 0 , train time: 0.6 eval time: 0.3 +step 1 , total_loss: 0.6845, data_loss: 0.6845 +step 2 , total_loss: 0.6818, data_loss: 0.6818 +finish one epoch! +at epoch 1 train info: loss:0.6831814646720886 eval info: auc:0.485, logloss:0.6801 test info: auc:0.485, logloss:0.6801 +at epoch 1 , train time: 0.2 eval time: 0.1 +step 1 , total_loss: 0.6766, data_loss: 0.6766 +step 2 , total_loss: 0.6732, data_loss: 0.6732 +finish one epoch! +at epoch 2 train info: loss:0.6748818755149841 eval info: auc:0.4832, logloss:0.6738 test info: auc:0.4832, logloss:0.6738 +at epoch 2 , train time: 0.1 eval time: 0.1 +``` +2、mxRec: +```log +[1,0]:step 1 , total_loss: 0.6931, data_loss: 0.6931 +[1,0]:step 2 , total_loss: 0.6905, data_loss: 0.6905 +[1,0]:finish one epoch! +[1,0]:at epoch 0 train info: loss:0.6918215453624725 eval info: auc:0.4867, logloss:0.6865 test info: auc:0.4867, logloss:0.6865 +[1,0]:at epoch 0 , train time: 15.9 eval time: 3.1 +[1,0]:step 1 , total_loss: 0.6845, data_loss: 0.6845 +[1,0]:step 2 , total_loss: 0.6818, data_loss: 0.6818 +[1,0]:finish one epoch! 
+[1,0]:at epoch 1 train info: loss:0.6831814646720886 eval info: auc:0.485, logloss:0.6801 test info: auc:0.485, logloss:0.6801 +[1,0]:at epoch 1 , train time: 7.8 eval time: 0.7 +[1,0]:step 1 , total_loss: 0.6766, data_loss: 0.6766 +[1,0]:step 2 , total_loss: 0.6732, data_loss: 0.6732 +[1,0]:finish one epoch! +[1,0]:at epoch 2 train info: loss:0.6748818457126617 eval info: auc:0.4832, logloss:0.6738 test info: auc:0.4832, logloss:0.6738 +[1,0]:at epoch 2 , train time: 0.5 eval time: 0.7 +``` diff --git a/examples/xDeepFM/main.py b/examples/xDeepFM/main.py index f0b93bd6..0752d18b 100644 --- a/examples/xDeepFM/main.py +++ b/examples/xDeepFM/main.py @@ -153,10 +153,10 @@ def check_nn_config(config): def check_config(config): """check networks config""" if config['model']['model_type'] not in ['deepFM', 'deepWide', 'dnn', 'ipnn', \ - 'opnn', 'fm', 'lr', 'din', 'cccfnet', 'deepcross', 'exDeepFM', "cross", "CIN"]: + 'opnn', 'fm', 'lr', 'din', 'cccfnet', 'deepcross', 'exDeepFM', "cross"]: raise ValueError( - "model type must be cccfnet, deepFM, deepWide, dnn, ipnn, opnn, fm, lr, din, deepcross, exDeepFM, cross, CIN but you set is {0}".format( - config['model']['model_type'])) + "model type must be cccfnet, deepFM, deepWide, dnn, ipnn, opnn, fm, lr, din, deepcross, exDeepFM, " + "cross, but you set is {0}".format(config['model']['model_type'])) check_nn_config(config) @@ -172,6 +172,12 @@ def load_yaml(): def main(): """main function""" + + # init + from mx_rec.util.initialize import init + init(use_dynamic=True, + use_dynamic_expansion=False) + # flag = True util.check_tensorflow_version() util.check_and_mkdir() diff --git a/examples/xDeepFM/run.sh b/examples/xDeepFM/run.sh new file mode 100644 index 00000000..613c440e --- /dev/null +++ b/examples/xDeepFM/run.sh @@ -0,0 +1,130 @@ +kill -9 `ps -ef | grep python | grep -v grep | awk '{print $2}'` > /dev/null 2>&1 + +# 获取输入参数:py、ip +if [ $# -ge 1 ]; then + py=$1 + ip=$2 +else + echo "for example: bash run.sh main.py 10.10.10.10 or bash run.sh main.py" + exit 1 +fi + +# 检查输入的python文件是否合法 +if [[ $py =~ ^[a-z0-9_]+\.py$ ]]; then + echo "File $py is a valid Python file" +else + echo "File $py is not a Python file" + exit 1 +fi + +# 判断IP地址是否有效 +if [ -n "$ip" ]; then + if [[ $ip =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then + # 将IP地址拆分成四个数字 + ip_array=(${ip//./ }) + # 判断每个数字是否在0-255之间 + valid=true + for i in "${ip_array[@]}"; do + if ((i < 0 || i > 255)); then + valid=false + break + fi + done + if $valid; then + echo "ip: $ip is valid" + else + echo "ip: $ip is not valid" + exit 1 + fi + else + echo "ip: $ip is not valid." 
+ exit 1 + fi +fi + +cur_path=`pwd` +mx_rec_package_path="/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec" # please config +so_path=${mx_rec_package_path}/libasc +# GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO +mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' +interface="lo" +local_rank_size=1 # 每个节点使用的NPU卡数 +num_server=1 # 训练节点数 +num_process=$((${num_server} * ${local_rank_size})) # 训练总的进程数,等于使用的NPU卡的总数 + +export HCCL_CONNECT_TIMEOUT=1200 # HCCL集合通信 建链超时时间,取值范围[120,7200] +export PYTHONPATH=${so_path}:$PYTHONPATH # 环境python安装路径 +export LD_PRELOAD=/usr/lib64/libgomp.so.1:/usr/local/python3.7.5/lib/python3.7/site-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0 +export LD_LIBRARY_PATH=${so_path}:/usr/local/lib:$LD_LIBRARY_PATH +# 集合通信文件,格式请参考昇腾官网CANN文档,“准备资源配置文件”章节。 +export JOB_ID=10086 +# 训练任务使用的NPU卡数总数 +export MXREC_LOG_LEVEL="ERROR" # 框架日志等级 +export TF_CPP_MIN_LOG_LEVEL=3 # tensorflow日志级别,3对应FATAL +# 设置应用类日志的全局日志级别及各模块日志级别,具体请参考昇腾官网CANN文档 +export ASCEND_GLOBAL_LOG_LEVEL=3 # “设置日志级别”章节0:debug, 1:info, 2:warning, 3:error, 4:NULL +export MXREC_MODE="ASC" +export USE_MPI=1 + +# 帮助信息,不需要修改 +if [[ $1 == --help || $1 == -h ]];then + echo "Usage: ./run.sh [OPTION]... [IP]..." + echo " " + echo "parameter explain: + [OPTION] main.py + [IP] IP address of the host + -h/--help show help message + " + exit 1 +fi + +# 使用ranktable方案 +function rankTableSolution() { + echo "The ranktable solution" + export RANK_TABLE_FILE="${cur_path}/hccl_json_${local_rank_size}p.json" + export RANK_SIZE=$num_process + export ASCEND_VISIBLE_DEVICES="0" + export RANK_ID=0 + export ASCEND_DEVICE_ID=$RANK_ID + echo "RANK_TABLE_FILE=$RANK_TABLE_FILE" + if [ ! -f "$RANK_TABLE_FILE" ];then + echo "the rank table file does not exit. Please reference {hccl_json_${local_rank_size}p.json} to correctly config rank table file" + exit 1 + fi +} + +if [ ! -n "$ip" ]; then + rankTableSolution +else + VALID_CHECK=$(echo $ip|awk -F. '$1<=255&&$2<=255&&$3<=255&&$4<=255{print "yes"}') + if echo $ip|grep -E "^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$">/dev/null; then + if [ "$VALID_CHECK" == "yes" ]; then + #################使用去除ranktable方案时开启###################### + echo "ip: $ip available." + echo "The ranktable solution is removed." + export CM_CHIEF_IP=$ip # 主节点ip + export CM_CHIEF_PORT=6000 # 主节点监听端口 + export CM_CHIEF_DEVICE=0 # 主节点device id + export CM_WORKER_IP=$ip # 当前节点ip + export CM_WORKER_SIZE=$num_process # 参与集群训练的device数量 + echo "CM_CHIEF_IP=$CM_CHIEF_IP" + echo "CM_CHIEF_PORT=$CM_CHIEF_PORT" + echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" + echo "CM_WORKER_IP=$CM_WORKER_IP" + echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" + echo "ASCEND_VISIBLE_DEVICES=$ASCEND_VISIBLE_DEVICES" + ######################################################### + else + echo "ip: $ip not available!" # 使用ranktable方案 + rankTableSolution + fi + else + echo "ip: $ip not available!" 
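+        # the supplied IP failed validation, so fall back to the ranktable-based startup below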
# 使用ranktable方案 + rankTableSolution + fi +fi + +echo "use horovod to start tasks" +DATE=$(date +%Y-%m-%d-%H-%M-%S) +horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ +python3.7 ${py} 2>&1 | tee "temp_${local_rank_size}p_t_${DATE}.log" diff --git a/examples/xDeepFM/src/base_model.py b/examples/xDeepFM/src/base_model.py index 5481012f..96682ae6 100644 --- a/examples/xDeepFM/src/base_model.py +++ b/examples/xDeepFM/src/base_model.py @@ -77,11 +77,11 @@ class BaseModel(object): for param in self.cross_params: cross_l_loss = tf.add(cross_l_loss, tf.multiply(hparams.cross_l1, tf.norm(param, ord=1))) cross_l_loss = tf.add(cross_l_loss, tf.multiply(hparams.cross_l2, tf.norm(param, ord=1))) - return cross_l_loss + return cross_l_loss def _get_initializer(self, hparams): if hparams.init_method == 'tnormal': - return tf.truncated_normal_initializer(stddev=hparams.init_value) + return tf.zeros_initializer() elif hparams.init_method == 'uniform': return tf.random_uniform_initializer(-hparams.init_value, hparams.init_value) elif hparams.init_method == 'normal': @@ -185,8 +185,8 @@ class BaseModel(object): return sess.run([self.update, self.loss, self.data_loss, self.merged], \ feed_dict={self.layer_keeps: self.keep_prob_train}) - def eval(self, sess): - return sess.run([self.loss, self.data_loss, self.pred, self.iterator.labels], \ + def eval(self, sess, eval_label): + return sess.run([self.loss, self.data_loss, self.pred, eval_label], \ feed_dict={self.layer_keeps: self.keep_prob_test}) def infer(self, sess): diff --git a/examples/xDeepFM/src/exDeepFM.py b/examples/xDeepFM/src/exDeepFM.py index b8d235b7..9d5b5299 100644 --- a/examples/xDeepFM/src/exDeepFM.py +++ b/examples/xDeepFM/src/exDeepFM.py @@ -3,6 +3,8 @@ from npu_bridge.npu_init import * import math import numpy as np import tensorflow as tf +from mx_rec.core.embedding import create_table +from mx_rec.core.embedding import sparse_lookup from src.base_model import BaseModel __all__ = ["ExtremeDeepFMModel"] @@ -37,10 +39,22 @@ class ExtremeDeepFMModel(BaseModel): fm_sparse_weight = tf.SparseTensor(self.iterator.dnn_feat_indices, self.iterator.dnn_feat_weights, self.iterator.dnn_feat_shape) - w_fm_nn_input_orgin = tf.nn.embedding_lookup_sparse(self.embedding, - fm_sparse_index, - fm_sparse_weight, - combiner="sum") + dense_indices = tf.sparse.to_dense(fm_sparse_index, default_value=0) + dense_weights = tf.sparse.to_dense(fm_sparse_weight, default_value=0) + + sparse_hashtable = create_table(key_dtype=tf.int32, + dim=tf.TensorShape([hparams.dim]), + name='sparse_embeddings_table', + emb_initializer=tf.zeros_initializer(), + device_vocabulary_size=hparams.FEATURE_COUNT, + host_vocabulary_size=0 + ) + embedded_values = sparse_lookup(sparse_hashtable, + dense_indices, + is_train=True, + name="sparse_embeddings", + modify_graph=True) + w_fm_nn_input_orgin = tf.reduce_sum(embedded_values * tf.expand_dims(dense_weights, axis=-1), axis=1) embedding = tf.reshape(w_fm_nn_input_orgin, [-1, hparams.dim * hparams.FIELD_COUNT]) embedding_size = hparams.FIELD_COUNT * hparams.dim return embedding, embedding_size diff --git a/examples/xDeepFM/train.py b/examples/xDeepFM/train.py index 1c434194..39918b34 100644 --- a/examples/xDeepFM/train.py +++ b/examples/xDeepFM/train.py @@ -18,7 +18,6 @@ from IO.ffm_cache import FfmCache #from src.cccfnet import CCCFModel #from src.deepcross import DeepCrossModel from src.exDeepFM import ExtremeDeepFMModel -from src.CIN import CINModel #from src.cross 
import CrossModel import utils.util as util import utils.metric as metric @@ -32,29 +31,27 @@ class TrainModel(collections.namedtuple("TrainModel", ("graph", "model", "iterat def create_train_model(model_creator, hparams, scope=None): - graph = tf.Graph() - with graph.as_default(): - # feed train file name, valid file name, or test file name - filenames = tf.placeholder(tf.string, shape=[None]) - #src_dataset = tf.contrib.data.TFRecordDataset(filenames) - src_dataset = tf.data.TFRecordDataset(filenames) + # feed train file name, valid file name, or test file name + filenames = tf.placeholder(tf.string, shape=[None]) + # src_dataset = tf.contrib.data.TFRecordDataset(filenames) + src_dataset = tf.data.TFRecordDataset(filenames) - if hparams.data_format == 'ffm': - batch_input = FfmIterator(src_dataset) - elif hparams.data_format == 'din': - batch_input = DinIterator(src_dataset) - elif hparams.data_format == 'cccfnet': - batch_input = CCCFNetIterator(src_dataset) - else: - raise ValueError("not support {0} format data".format(hparams.data_format)) - # build model - model = model_creator( - hparams, - iterator=batch_input, - scope=scope) + if hparams.data_format == 'ffm': + batch_input = FfmIterator(src_dataset) + elif hparams.data_format == 'din': + batch_input = DinIterator(src_dataset) + elif hparams.data_format == 'cccfnet': + batch_input = CCCFNetIterator(src_dataset) + else: + raise ValueError("not support {0} format data".format(hparams.data_format)) + # build model + model = model_creator( + hparams, + iterator=batch_input, + scope=scope) return TrainModel( - graph=graph, + graph=tf.get_default_graph(), model=model, iterator=batch_input, filenames=filenames) @@ -65,12 +62,15 @@ def run_eval(load_model, load_sess, filename, sample_num_file, hparams, flag): # load sample num with open(sample_num_file, 'r') as f: sample_num = int(f.readlines()[0].strip()) - load_sess.run(load_model.iterator.initializer, feed_dict={load_model.filenames: [filename]}) + from mx_rec.util.initialize import ConfigInitializer + eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(True).get("labels") + initializer = ConfigInitializer.get_instance().train_params_config.get_initializer(True) + load_sess.run(initializer, feed_dict={load_model.filenames: [filename]}) preds = [] labels = [] while True: try: - _, _, step_pred, step_labels = load_model.model.eval(load_sess) + _, _, step_pred, step_labels = load_model.model.eval(load_sess, eval_label) preds.extend(np.reshape(step_pred, -1)) labels.extend(np.reshape(step_labels, -1)) except tf.errors.OutOfRangeError: @@ -207,10 +207,7 @@ def train(hparams, scope=None, target_session=""): elif hparams.model_type == 'cross': print("run extreme cross model!") model_creator = CrossModel - elif hparams.model_type == 'CIN': - print("run extreme cin model!") - model_creator = CINModel - + else: raise ValueError("model type should be cccfnet, deepFM, deepWide, dnn, fm, lr, ipnn, opnn, din") @@ -220,6 +217,10 @@ def train(hparams, scope=None, target_session=""): gpuconfig = tf.ConfigProto() gpuconfig.gpu_options.allow_growth = True tf.set_random_seed(1234) + + from mx_rec.graph.modifier import modify_graph_and_start_emb_cache + modify_graph_and_start_emb_cache(dump_graph=True) + train_sess = tf.Session(target=target_session, graph=train_model.graph, config=npu_config_proto(config_proto=gpuconfig)) train_sess.run(train_model.model.init_op) @@ -236,7 +237,10 @@ def train(hparams, scope=None, target_session=""): last_eval = 0 for epoch in 
range(hparams.epochs): step = 0 - train_sess.run(train_model.iterator.initializer, feed_dict={train_model.filenames: [hparams.train_file_cache]}) + from mx_rec.util.initialize import ConfigInitializer + initializer = ConfigInitializer.get_instance().train_params_config.get_initializer(True) + train_sess.run(initializer, feed_dict={train_model.filenames: [hparams.train_file_cache]}) + epoch_loss = 0 train_start = time.time() train_load_time = 0 diff --git a/examples/xDeepFM/utils/util.py b/examples/xDeepFM/utils/util.py index 7a52b6a5..e4d88636 100644 --- a/examples/xDeepFM/utils/util.py +++ b/examples/xDeepFM/utils/util.py @@ -60,7 +60,7 @@ def check_file_exist(filename): def load_yaml_file(filename): with open(filename) as f: try: - config = yaml.load(f) + config = yaml.safe_load(f) except: raise IOError("load {0} error!".format(filename)) return config -- Gitee From e26c32e0c345dd6d80ec690781f2fb41ba00b63f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Mon, 3 Jun 2024 02:07:37 +0000 Subject: [PATCH 184/302] =?UTF-8?q?!158=20mxRec=E6=B5=8B=E8=AF=95=E7=94=A8?= =?UTF-8?q?=E4=BE=8B=EF=BC=88=E5=8C=85=E6=8B=ACAccCTR=EF=BC=89=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0=E3=80=81=E9=80=82=E9=85=8DASan=EF=BC=88=E5=9C=B0?= =?UTF-8?q?=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85?= =?UTF-8?q?=E5=AD=98=E6=B3=84=E6=BC=8F=E6=A3=80=E6=B5=8B=EF=BC=8C=E5=B9=B6?= =?UTF-8?q?=E8=A7=A3=E5=86=B3=E6=89=AB=E6=8F=8F=E5=87=BA=E6=9D=A5=E7=9A=84?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E4=B8=AD=E7=9A=84=E5=86=85?= =?UTF-8?q?=E5=AD=98=E6=B3=84=E6=BC=8F=E9=97=AE=E9=A2=98=20*=20C++?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN?= =?UTF-8?q?=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B?= =?UTF-8?q?=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=20*=20C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92?= =?UTF-8?q?=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2?= =?UTF-8?q?=E6=A3=80=E6=B5=8B=20*=20C++=E6=B5=8B=E8=AF=95=E7=94=A8?= =?UTF-8?q?=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80?= =?UTF-8?q?=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98?= =?UTF-8?q?=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B=20*=20C++=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88?= =?UTF-8?q?=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B=20*=20C+?= =?UTF-8?q?+=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN?= =?UTF-8?q?=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B?= =?UTF-8?q?=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=20*=20C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92?= =?UTF-8?q?=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2?= =?UTF-8?q?=E6=A3=80=E6=B5=8B=20*=20C++=E6=B5=8B=E8=AF=95=E7=94=A8?= =?UTF-8?q?=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80?= =?UTF-8?q?=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98?= =?UTF-8?q?=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B=20*=20C++=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88?= =?UTF-8?q?=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B=20*=20C+?= 
=?UTF-8?q?+=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN?= =?UTF-8?q?=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B?= =?UTF-8?q?=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=20*=20C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92?= =?UTF-8?q?=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2?= =?UTF-8?q?=E6=A3=80=E6=B5=8B=20*=20C++=E6=B5=8B=E8=AF=95=E7=94=A8?= =?UTF-8?q?=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80?= =?UTF-8?q?=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98?= =?UTF-8?q?=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B=20*=20C++=E6=B5=8B?= =?UTF-8?q?=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN=EF=BC=88?= =?UTF-8?q?=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B=E8=A1=8C?= =?UTF-8?q?=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B=20*=20C+?= =?UTF-8?q?+=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB=E5=8A=A0ASAN?= =?UTF-8?q?=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92=EF=BC=89=E8=BF=9B?= =?UTF-8?q?=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2=E6=A3=80=E6=B5=8B?= =?UTF-8?q?=20*=20C++=E6=B5=8B=E8=AF=95=E7=94=A8=E4=BE=8B=E6=B7=BB?= =?UTF-8?q?=E5=8A=A0ASAN=EF=BC=88=E5=9C=B0=E5=9D=80=E6=B6=88=E6=AF=92?= =?UTF-8?q?=EF=BC=89=E8=BF=9B=E8=A1=8C=E5=86=85=E5=AD=98=E6=B3=84=E9=9C=B2?= =?UTF-8?q?=E6=A3=80=E6=B5=8B=20*=20Merge=20branch=20'develop'=20of=20http?= =?UTF-8?q?s://gitee.com/ascend/mxrec=20into=20develop=20*=20Merge=20branc?= =?UTF-8?q?h=20'develop'=20of=20https://gitee.com/ascend/mxrec=20into=20de?= =?UTF-8?q?velop=20*=20Merge=20remote-tracking=20branch=20'origin/develop'?= =?UTF-8?q?=20into=20develop=20*=20README=E4=B8=AD=E6=B7=BB=E5=8A=A0mxRec?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E6=8C=87=E5=8D=97=E7=A4=BE=E5=8C=BA=E9=93=BE?= =?UTF-8?q?=E6=8E=A5=E4=BB=A5=E5=8F=8A=E6=9B=B4=E6=96=B0=E5=85=AC=E7=BD=91?= =?UTF-8?q?=E5=9C=B0=E5=9D=80=20*=20README=E4=B8=AD=E6=B7=BB=E5=8A=A0mxRec?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E6=8C=87=E5=8D=97=E7=A4=BE=E5=8C=BA=E9=93=BE?= =?UTF-8?q?=E6=8E=A5=20*=20README=E4=B8=AD=E6=B7=BB=E5=8A=A0mxRec=E7=94=A8?= =?UTF-8?q?=E6=88=B7=E6=8C=87=E5=8D=97=E7=A4=BE=E5=8C=BA=E9=93=BE=E6=8E=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/CMakeLists.txt | 4 ++ src/AccCTR/build/build_test.sh | 3 ++ src/AccCTR/tests/ut/src/unique_test.cpp | 49 +++++++++++++++++++ src/CMakeLists.txt | 2 +- src/test_ut.sh | 3 ++ src/tests/emb_table/embedding_ddr_test.cpp | 2 +- src/tests/emb_table/embedding_mgmt_test.cpp | 2 +- src/tests/emb_table/embedding_static_test.cpp | 2 +- .../file_system/hdfs_file_system_test.cpp | 6 +-- src/tests/leaks.supp | 21 ++++++++ 10 files changed, 87 insertions(+), 7 deletions(-) create mode 100644 src/tests/leaks.supp diff --git a/src/AccCTR/CMakeLists.txt b/src/AccCTR/CMakeLists.txt index 60e2d638..febf1740 100644 --- a/src/AccCTR/CMakeLists.txt +++ b/src/AccCTR/CMakeLists.txt @@ -73,6 +73,10 @@ elseif (${BUILD_MODE} MATCHES "ut") -Wfloat-equal -Wextra -std=c++17 + -fsanitize=address + -fsanitize-recover=address,all + -fno-omit-frame-pointer + -fstack-protector-all ) else () message(FATAL_ERROR "======BUILD_MODE not found") diff --git a/src/AccCTR/build/build_test.sh b/src/AccCTR/build/build_test.sh index 9441efe3..4001b825 100644 --- a/src/AccCTR/build/build_test.sh +++ b/src/AccCTR/build/build_test.sh @@ -24,6 +24,9 @@ TOOL_FILE="create_fake_id.py" CPU_TYPE=$(arch) BUILD_MODE=$1 +# config asan environment variable +export 
ASAN_OPTIONS=halt_on_error=1:detect_leaks=1 + create_data() { cd ${TOOL_PATH} diff --git a/src/AccCTR/tests/ut/src/unique_test.cpp b/src/AccCTR/tests/ut/src/unique_test.cpp index a94ebaf7..94e8d92c 100644 --- a/src/AccCTR/tests/ut/src/unique_test.cpp +++ b/src/AccCTR/tests/ut/src/unique_test.cpp @@ -95,6 +95,13 @@ TEST_F(UniqueTest, Conf) ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 3); // idCntFill空指针 uniqueOut.idCntFill = idCntFill; ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 7); // padding长度过小 + + unique->UnInitialize(); + delete[] idCnt; + delete[] idCntFill; + delete[] uniqueIdCntInBucket; + delete[] uniqueIdInBucket; + std::cout << "===========Conf end=============" << std::endl; } @@ -115,6 +122,9 @@ TEST_F(UniqueTest, usePaddingNoShardingErr) conf.outputType = OutputType::ENHANCED; ASSERT_EQ(unique->Initialize(conf), 9); + + unique->UnInitialize(); + std::cout << "===========usePaddingNoShardingErr end=============" << std::endl; } @@ -132,6 +142,8 @@ TEST_F(UniqueTest, useNegativeDesiredSize) ASSERT_EQ(unique->Initialize(conf), 1); + unique->UnInitialize(); + std::cout << "===========useNegativeDesiredSize end=============" << std::endl; } @@ -207,6 +219,9 @@ TEST_F(UniqueTest, DoUniqueNormal) ASSERT_EQ(uniqueOut.uniqueIdCnt, (int)idsSet.size()); unique->UnInitialize(); + if (path) { + free(path); + } std::cout << "===========DoUniqueNormal end=============" << std::endl; } @@ -404,6 +419,9 @@ TEST_F(UniqueTest, DoEnhancedUniqueErr) ASSERT_EQ(uniqueOut.uniqueIdCnt, (int)idsSet.size()); unique->UnInitialize(); + delete[] uniqueIdInBucket; + delete[] idCnt; + std::cout << "===========DoEnhancedUniqueErr end=============" << std::endl; } @@ -544,6 +562,9 @@ TEST_F(UniqueTest, idCntIsNullSharding) ASSERT_EQ(ret, 3); unique->UnInitialize(); + delete[] uniqueIdCntInBucket; + delete[] uniqueIdInBucket; + std::cout << "===========idCntIsNullSharding end=============" << std::endl; } @@ -620,6 +641,7 @@ TEST_F(UniqueTest, DoUniqueShard) ASSERT_THAT(uniqueIdCntInBucket, testing::ElementsAreArray(expectedUniqueIdCnt)); ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueShard end=============" << std::endl; } @@ -685,6 +707,7 @@ TEST_F(UniqueTest, DoUniqueOnlyShard) ASSERT_THAT(inputId, testing::ElementsAreArray(restoreIds)); ASSERT_THAT(uniqueIdCntInBucket, testing::ElementsAreArray(expectedUniqueIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueOnlyShard end=============" << std::endl; } @@ -769,6 +792,8 @@ TEST_F(UniqueTest, DoUniquePadding) ASSERT_THAT(idCntFill, testing::ElementsAreArray(expectedIdCnt)); ASSERT_EQ(uniqueOut.uniqueIdCnt, conf.paddingSize * conf.shardingNum); unique->UnInitialize(); + delete[] idCnt; + delete[] uniqueIdInBucket; std::cout << "===========DoUniquePadding end=============" << std::endl; } @@ -913,6 +938,7 @@ TEST_F(UniqueTest, DoUniqueShardNumberOversize) ASSERT_THAT(uniqueIdCntInBucket, testing::ElementsAreArray(expectedUniqueIdCnt)); ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueShardNumberOversize end=============" << std::endl; } @@ -981,6 +1007,12 @@ TEST_F(UniqueTest, DoUniqueSpecial) } unique->UnInitialize(); + delete[] uniqueData; + delete[] index; + delete[] idCnt; + delete[] idCntFill; + delete[] uniqueIdCntInBucket; + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueSpecial 
end=============" << std::endl; } @@ -1020,6 +1052,10 @@ TEST_F(UniqueTest, IdLarge) uniqueOut.idCnt = idCnt; ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 6); // ID太大 + + unique->UnInitialize(); + delete[] idCnt; + std::cout << "===========IdLarge end=============" << std::endl; } @@ -1095,6 +1131,8 @@ TEST_F(UniqueTest, DoUniqueNormalInt32) ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; + std::cout << "===========DoUniqueNormalInt32 end=============" << std::endl; } @@ -1228,6 +1266,7 @@ TEST_F(UniqueTest, DoUniqueShardMultipleTimes) ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); } unique->UnInitialize(); + delete[] uniqueIdInBucket; std::cout << "===========DoUniqueShardMultipleTimes end=============" << std::endl; } @@ -1312,6 +1351,9 @@ TEST_F(UniqueTest, DoUniquePaddingMultipleTimes) } unique->UnInitialize(); + delete[] idCnt; + delete[] uniqueIdInBucket; + std::cout << "===========DoUniquePaddingMultipleTimes end=============" << std::endl; } @@ -1348,6 +1390,10 @@ TEST_F(UniqueTest, IdCntSmall) uniqueOut.idCnt = idCnt; ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 4); // idcnt过小 + + unique->UnInitialize(); + delete[] idCnt; + std::cout << "===========IdCntSmall end=============" << std::endl; } @@ -1449,6 +1495,7 @@ TEST_F(UniqueTest, DoUniqueLotsDataFunction) ASSERT_THAT(idCnt, testing::ElementsAreArray(expectedIdCnt)); unique->UnInitialize(); + delete[] uniqueIdInBucket; if (path) { free(path); } @@ -1557,6 +1604,8 @@ TEST_F(UniqueTest, DoUniqueLotsDataPaddingFunction) unique->UnInitialize(); ASSERT_EQ(unique->DoEnhancedUnique(uniqueIn, uniqueOut), 11); + delete[] idCnt; + delete[] uniqueIdInBucket; if (path) { free(path); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 84505d15..a5cd76da 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,7 +56,7 @@ else () message("==EASY_PROFILER_FOUND===") ADD_DEFINITIONS(-DBUILD_WITH_EASY_PROFILER) endif () -set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -ffunction-sections -O0 -Wall -g2 -ggdb") +set(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -ffunction-sections -O0 -Wall -g2 -ggdb -fsanitize=address -fsanitize-recover=address,all -fno-omit-frame-pointer -fno-stack-protector") set(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -ffunction-sections -O3 -Wfatal-errors -DNDEBUG -fPIC -fstack-protector-all -D_FORTIFY_SOURCE=2 -s") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack") diff --git a/src/test_ut.sh b/src/test_ut.sh index 6146aaab..7305c081 100644 --- a/src/test_ut.sh +++ b/src/test_ut.sh @@ -129,6 +129,9 @@ mkdir build cd build python_path="$(dirname "$(dirname "$(which python3.7)")")" +# config asan environment variable +export ASAN_OPTIONS=halt_on_error=1:detect_leaks=1:fast_unwind_on_malloc=0 +export LSAN_OPTIONS=suppressions=../tests/leaks.supp cmake -DCMAKE_BUILD_TYPE=Debug \ -DTF_PATH="${python_path}"/lib/python3.7/site-packages/"${TF_DIR}" \ diff --git a/src/tests/emb_table/embedding_ddr_test.cpp b/src/tests/emb_table/embedding_ddr_test.cpp index ddad3905..60ec5af6 100644 --- a/src/tests/emb_table/embedding_ddr_test.cpp +++ b/src/tests/emb_table/embedding_ddr_test.cpp @@ -35,7 +35,7 @@ protected: EmbeddingDDRTest() { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); - std::vector vocabsize = {100}; + std::vector vocabsize = {100, 100, 100}; vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; diff --git 
a/src/tests/emb_table/embedding_mgmt_test.cpp b/src/tests/emb_table/embedding_mgmt_test.cpp index 49f10b4f..055cf5c5 100644 --- a/src/tests/emb_table/embedding_mgmt_test.cpp +++ b/src/tests/emb_table/embedding_mgmt_test.cpp @@ -35,7 +35,7 @@ protected: EmbeddingMgmtTest() { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); - std::vector vocabsize = {100}; + std::vector vocabsize = {100, 100, 100}; vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; diff --git a/src/tests/emb_table/embedding_static_test.cpp b/src/tests/emb_table/embedding_static_test.cpp index c8a5e252..9e250f64 100644 --- a/src/tests/emb_table/embedding_static_test.cpp +++ b/src/tests/emb_table/embedding_static_test.cpp @@ -34,7 +34,7 @@ protected: EmbeddingStaticTest() { struct EmbInfoParams embParam(string("test1"), 0, 1000, 2000, true, true); - std::vector vocabsize = {100}; + std::vector vocabsize = {100, 100, 100}; vector initializeInfos = {}; std::vector ssdDataPath = {""}; vector maxStep = {1000}; diff --git a/src/tests/file_system/hdfs_file_system_test.cpp b/src/tests/file_system/hdfs_file_system_test.cpp index 0d469ca5..98f733f0 100644 --- a/src/tests/file_system/hdfs_file_system_test.cpp +++ b/src/tests/file_system/hdfs_file_system_test.cpp @@ -26,10 +26,10 @@ using namespace emock; void MockHdfs() { + EMOCK(&HdfsWrapper::LoadHdfsLib).stubs().will(ignoreReturnValue()); hdfsFS ConnectFs; hdfsFile hdfsFileHandler; hdfsFileInfo* fileInfo; - EMOCK(&HdfsWrapper::LoadHdfsLib).stubs().will(ignoreReturnValue()); EMOCK(&HdfsWrapper::CloseHdfsLib).stubs().will(ignoreReturnValue()); EMOCK(&HdfsWrapper::Connect).stubs().will(returnValue(ConnectFs)); EMOCK(&HdfsWrapper::Disconnect).stubs().will(returnValue(1)); @@ -75,8 +75,8 @@ TEST_F(HdfsFileSystemTest, CreateDirFailed) TEST_F(HdfsFileSystemTest, GetFileSize) { - hdfsFileInfo* fileInfo; - EMOCK(&HdfsWrapper::GetPathInfo).stubs().will(returnValue(fileInfo)); + std::unique_ptr fileInfo = std::make_unique(); + EMOCK(&HdfsWrapper::GetPathInfo).stubs().will(returnValue(fileInfo.get())); string filePath = "hdfs://master:9000/test_dir/"; auto fileSystemHandler = make_unique(); auto fileSystemPtr = fileSystemHandler->Create(filePath); diff --git a/src/tests/leaks.supp b/src/tests/leaks.supp new file mode 100644 index 00000000..ebe0718d --- /dev/null +++ b/src/tests/leaks.supp @@ -0,0 +1,21 @@ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# There are known leaks. +# 1.known mpi leaks. 
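+# Suppression format: each "leak:<pattern>" line makes LeakSanitizer drop
+# leak reports whose stack frames match the pattern (standard LSan
+# suppression-file syntax).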
+leak:libmpi.so* +leak:libopen-pal.so* +leak:libpmix.so* +leak:libc.so* \ No newline at end of file -- Gitee From 74be454e15670afd84926c576d9ffc182b38fb6a Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Mon, 3 Jun 2024 02:08:02 +0000 Subject: [PATCH 185/302] =?UTF-8?q?!169=20=E4=BF=AE=E5=A4=8D=EF=BC=88embCa?= =?UTF-8?q?che=EF=BC=89=EF=BC=9Asave=E5=BC=82=E5=B8=B8=E9=80=80=E5=87=BA?= =?UTF-8?q?=E5=9C=BA=E6=99=AF=E9=98=BB=E5=A1=9E=E5=9C=A8EvalTask=20*=20?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=EF=BC=88embCache=EF=BC=89=EF=BC=9Asave?= =?UTF-8?q?=E5=BC=82=E5=B8=B8=E9=80=80=E5=87=BA=E5=9C=BA=E6=99=AF=E9=98=BB?= =?UTF-8?q?=E5=A1=9E=E5=9C=A8EvalTask?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 6b998205..4c64c2ec 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -424,6 +424,7 @@ void HybridMgmt::Destroy() cvLastRecvFinishMap[embInfo.name][index].notify_all(); } } + cvCheckSave.notify_all(); // 防止save异常退出场景阻塞在EvalTask { // 获取锁 避免KeyProcess中手动发送结束信息时通道关闭 -- Gitee From 5935823be7393f2803280a7f17414ed52d76e133 Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Mon, 3 Jun 2024 02:16:45 +0000 Subject: [PATCH 186/302] =?UTF-8?q?!166=20=E7=89=B9=E6=80=A7=EF=BC=88embCa?= =?UTF-8?q?che=EF=BC=89=EF=BC=9A=E4=BF=AE=E5=A4=8D=E9=9D=99=E6=80=81shape?= =?UTF-8?q?=20gather=E8=B6=8A=E7=95=8C=E9=97=AE=E9=A2=98=20*=20=E7=89=B9?= =?UTF-8?q?=E6=80=A7=EF=BC=88embCache=EF=BC=89=EF=BC=9A=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E9=9D=99=E6=80=81shape=20gather=E8=B6=8A=E7=95=8C=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 6 ++++++ src/core/utils/common.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 4c64c2ec..b318f2d4 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -2047,9 +2047,15 @@ void HybridMgmt::SendRestoreVec(const EmbBaseInfo &info, bool &remainBatchOut) void HybridMgmt::SendLookupOffsets(const EmbBaseInfo &info, vector &uniqueKeys, vector &restoreVecSec) { + // uniqueKeys already transfer to offset in GetSwapPairsAndKey2Offset + // graph will filter out invalid offset(-1). 
see function _set_specific_value_for_non_valid_key TimeCost sendLookupOffsetsTC; std::vector lookupOffsets; for (const auto &index : restoreVecSec) { + if (index == INVALID_INDEX_VALUE) { + lookupOffsets.emplace_back(static_cast(INVALID_KEY_VALUE)); + continue; + } lookupOffsets.emplace_back(uniqueKeys[index]); } hdTransfer->Send(TransferChannel::LOOKUP, { Vec2TensorI32(lookupOffsets) }, info.channelId, info.name); diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 5bb93a41..75837349 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -102,8 +102,8 @@ namespace MxRec { constexpr int EOS_TIMEOUT = 30; constexpr size_t DEFAULT_RANDOM_SEED = 10086; - // constexpr int INVALID_KEY_VALUE = -1; constexpr int64_t INVALID_KEY_VALUE = -1; + constexpr int32_t INVALID_INDEX_VALUE = -1; constexpr int ALLTOALLVC_ALIGN = 128; constexpr int PROFILING_START_BATCH_ID = 100; constexpr int PROFILING_END_BATCH_ID = 200; -- Gitee From 1a71bb5240518635ba17278faf1c3e712b06b957 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Mon, 3 Jun 2024 03:38:00 +0000 Subject: [PATCH 187/302] =?UTF-8?q?!159=20dlrm=20sess=E9=80=82=E9=85=8Dlaz?= =?UTF-8?q?y=5Fadam=E4=BC=98=E5=8C=96=E5=99=A8=20*=20=E4=BD=BF=E7=94=A8has?= =?UTF-8?q?attr=E5=88=A4=E6=96=AD=E4=BC=98=E5=8C=96=E5=99=A8=20*=20dlrm=20?= =?UTF-8?q?sess=E9=80=82=E9=85=8Dlazy=5Fadam=E4=BC=98=E5=8C=96=E5=99=A8--?= =?UTF-8?q?=E6=A3=80=E8=A7=86=E4=BF=AE=E6=94=B9=20*=20dlrm=20sess=E9=80=82?= =?UTF-8?q?=E9=85=8Dlazy=5Fadam=E4=BC=98=E5=8C=96=E5=99=A8--=E6=A3=80?= =?UTF-8?q?=E8=A7=86=E4=BF=AE=E6=94=B9=20*=20dlrm=20sess=E9=80=82=E9=85=8D?= =?UTF-8?q?lazy=5Fadam=E4=BC=98=E5=8C=96=E5=99=A8--=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E4=BF=AE=E6=94=B9=20*=20dlrm=20sess=E9=80=82=E9=85=8Dlazy=5Fad?= =?UTF-8?q?am=E4=BC=98=E5=8C=96=E5=99=A8=20*=20dlrm=20sess=E9=80=82?= =?UTF-8?q?=E9=85=8Dlazy=5Fadam=E4=BC=98=E5=8C=96=E5=99=A8-=E9=97=A8?= =?UTF-8?q?=E7=A6=81=E6=89=AB=E6=8F=8F=E4=BF=AE=E6=94=B9=20*=20dlrm=20sess?= =?UTF-8?q?=E9=80=82=E9=85=8Dlazy=5Fadam=E4=BC=98=E5=8C=96=E5=99=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/dlrm/model/config.py | 2 ++ examples/dlrm/model/delay_loss_scale.py | 30 +++++++++++++++++++------ examples/dlrm/model/optimizer.py | 24 +++++++++++++++----- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/examples/dlrm/model/config.py b/examples/dlrm/model/config.py index 45e8af40..78115d61 100644 --- a/examples/dlrm/model/config.py +++ b/examples/dlrm/model/config.py @@ -128,6 +128,8 @@ class Config: self.hashtable_threshold = 1 self.USE_PIPELINE_TEST = False + # False indicates use SGD optimizer, else use LazyAdam. 
If True, is incompatible with dynamic_expansion + self.use_lazy_adam_optimizer = False # 动态学习率 GLOBAL_BATCH_SIZE = 8192 * 8 diff --git a/examples/dlrm/model/delay_loss_scale.py b/examples/dlrm/model/delay_loss_scale.py index 0cb50688..01bb0d8f 100644 --- a/examples/dlrm/model/delay_loss_scale.py +++ b/examples/dlrm/model/delay_loss_scale.py @@ -17,32 +17,48 @@ import tensorflow as tf from tensorflow.python.training import optimizer +from config import Config + class DenseLossScaleOptimizer: - def __init__(self, opt, loss_scale): + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: if not isinstance(opt, optimizer.Optimizer): raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - self._optimizer._learning_rate = self._optimizer._learning_rate / self._loss_scale + _update_lr_loss_scale(self._optimizer, loss_scale) def compute_gradients(self, loss, var_list=None): - return self._optimizer.compute_gradients(loss*self._loss_scale, var_list=var_list) + return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) def apply_gradients(self, avg_grads): return self._optimizer.apply_gradients(avg_grads) class SparseLossScaleOptimizer: - def __init__(self, opt, loss_scale): + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: if not isinstance(opt, optimizer.Optimizer): raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - self._optimizer._learning_rate = self._optimizer._learning_rate / self._loss_scale + _update_lr_loss_scale(self._optimizer, loss_scale) def compute_gradients(self, loss, var_list=None): - return tf.gradients(loss*self._loss_scale, var_list) + return tf.gradients(loss * self._loss_scale, var_list) def apply_gradients(self, grads_and_vars): - return self._optimizer.apply_gradients(grads_and_vars) \ No newline at end of file + return self._optimizer.apply_gradients(grads_and_vars) + + +def _update_lr_loss_scale(opt, loss_scale): + if loss_scale <= 0: + raise RuntimeError("the loss_scale must be greater than zero.") + loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + if hasattr(opt, "_lr"): + # LazyAdam or Adam optimizer + opt._lr = opt._lr / loss_scale + elif hasattr(opt, "_learning_rate"): + # SGD optimizer + opt._learning_rate = opt._learning_rate / loss_scale + else: + raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") diff --git a/examples/dlrm/model/optimizer.py b/examples/dlrm/model/optimizer.py index 7a6d6878..18dbe288 100644 --- a/examples/dlrm/model/optimizer.py +++ b/examples/dlrm/model/optimizer.py @@ -15,20 +15,32 @@ # ============================================================================== import tensorflow as tf + from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer from gradient_descent_w import create_hash_optimizer from mx_rec.util.initialize import ConfigInitializer from mx_rec.optimizers.gradient_descent_by_addr import create_hash_optimizer_by_addr +from mx_rec.optimizers import lazy_adam def get_dense_and_sparse_optimizer(cfg): - dense_optimizer = tf.train.GradientDescentOptimizer(learning_rate=cfg.learning_rate[0]) use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion - if use_dynamic_expansion: - sparse_optimizer = 
create_hash_optimizer_by_addr(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) + if cfg.use_lazy_adam_optimizer: + if use_dynamic_expansion: + raise RuntimeError("model is incompatible with dynamic_expansion when use lazy_adam optimizer.") + # use lazy_adam optimizer + dense_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=cfg.learning_rate[0]) + sparse_optimizer = lazy_adam.create_hash_optimizer(learning_rate=cfg.learning_rate[1]) + loss_scale = 65536 else: - sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) - sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, 1024) - dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, 1024) + # use SGD optimizer + dense_optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=cfg.learning_rate[0]) + if use_dynamic_expansion: + sparse_optimizer = create_hash_optimizer_by_addr(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) + else: + sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) + loss_scale = 1024 + sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale) + dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale) return dense_optimizer, sparse_optimizer -- Gitee From dd3deb69d0b6ef9fd0d7c638100fa8347d896c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Mon, 3 Jun 2024 14:47:32 +0800 Subject: [PATCH 188/302] =?UTF-8?q?WideDeep=E6=A8=A1=E5=9E=8B=E8=BF=81?= =?UTF-8?q?=E7=A7=BB=20=E5=8E=9F=E5=A7=8Bdlrm=E6=A8=A1=E5=9E=8B=E4=BB=A3?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/model/config.py | 241 +++++++++ examples/WideDeep/model/delay_loss_scale.py | 64 +++ examples/WideDeep/model/gradient_descent_w.py | 71 +++ examples/WideDeep/model/main_mxrec.py | 469 ++++++++++++++++++ examples/WideDeep/model/mean_auc.py | 40 ++ examples/WideDeep/model/model.py | 94 ++++ examples/WideDeep/model/op_impl_mode.ini | 1 + examples/WideDeep/model/optimizer.py | 46 ++ examples/WideDeep/model/run.sh | 99 ++++ 9 files changed, 1125 insertions(+) create mode 100644 examples/WideDeep/model/config.py create mode 100644 examples/WideDeep/model/delay_loss_scale.py create mode 100644 examples/WideDeep/model/gradient_descent_w.py create mode 100644 examples/WideDeep/model/main_mxrec.py create mode 100644 examples/WideDeep/model/mean_auc.py create mode 100644 examples/WideDeep/model/model.py create mode 100644 examples/WideDeep/model/op_impl_mode.ini create mode 100644 examples/WideDeep/model/optimizer.py create mode 100644 examples/WideDeep/model/run.sh diff --git a/examples/WideDeep/model/config.py b/examples/WideDeep/model/config.py new file mode 100644 index 00000000..78115d61 --- /dev/null +++ b/examples/WideDeep/model/config.py @@ -0,0 +1,241 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import enum +import os + +import tensorflow as tf +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig +from npu_bridge.estimator.npu.npu_config import NPURunConfig + +SSD_DATA_PATH = ["ssd_data"] + + +class LearningRateScheduler: + """ + LR Scheduler combining Polynomial Decay with Warmup at the beginning. + TF-based cond operations necessary for performance in graph mode. + """ + + def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps): + self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32) + self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32) + self.decay_steps = tf.constant(decay_steps) + self.decay_end_step = decay_start_step + decay_steps # 65041 + self.poly_power = 2.0 + self.base_lr_dense = base_lr_dense + self.base_lr_sparse = base_lr_sparse + + def calc(self, global_step): + # used for the warmup stage + warmup_step = tf.cast(1 / self.warmup_steps, tf.float32) + lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step + lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32) + # used for the constant stage + lr_factor_constant = tf.cast(1.0, tf.float32) + + # used for the decay stage + lr_factor_decay = (self.decay_end_step - global_step) / self.decay_steps + lr_factor_decay = tf.math.pow(lr_factor_decay, self.poly_power) + lr_factor_decay = tf.cast(lr_factor_decay, tf.float32) + sparse_after_decay = tf.cast(1 / self.decay_steps, tf.float32) + + lr_factor_decay_sparse = tf.cond( + global_step < self.decay_end_step, + lambda: lr_factor_decay, + lambda: sparse_after_decay, + ) + + lr_factor_decay_dense = tf.cond( + global_step < self.decay_end_step, + lambda: lr_factor_decay, + lambda: sparse_after_decay, + ) + + poly_schedule_sparse = tf.cond( + global_step < self.decay_start_step, + lambda: lr_factor_constant, + lambda: lr_factor_decay_sparse, + ) + + poly_schedule_dense = tf.cond( + global_step < self.decay_start_step, + lambda: lr_factor_constant, + lambda: lr_factor_decay_dense, + ) + + lr_factor_sparse = tf.cond( + global_step < self.warmup_steps, lambda: lr_factor_warmup, lambda: poly_schedule_sparse + ) + + lr_factor_dense = tf.cond( + global_step < self.warmup_steps, lambda: lr_factor_warmup, lambda: poly_schedule_dense + ) + + lr_sparse = self.base_lr_sparse * lr_factor_sparse + lr_dense = self.base_lr_dense * lr_factor_dense + return lr_dense, lr_sparse + + +class CacheModeEnum(enum.Enum): + HBM = "HBM" + DDR = "DDR" + SSD = "SSD" + + +class Config: + def __init__(self, ): + self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None + tmp = os.getenv("TRAIN_RANK_SIZE") + if tmp is None: + raise ValueError("please export TRAIN_RANK_SIZE") + self.rank_size = int(tmp) + + self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH") + self.train_file_pattern = "train" + self.test_file_pattern = "test" + + self.batch_size = 8192 + self.line_per_sample = 1024 + self.train_epoch = 3 + self.test_epoch = 1 + self.perform_shuffle = False + + self.key_type = tf.int64 + self.label_type = tf.float32 + self.value_type = tf.int64 + + self.feat_cnt = 26 + self.__set_emb_table_size() + + self.field_num = 26 + self.send_count = 46000 // self.rank_size + + self.emb_dim = 128 + self.hashtable_threshold = 1 + + self.USE_PIPELINE_TEST = False + # False 
indicates use SGD optimizer, else use LazyAdam. If True, is incompatible with dynamic_expansion + self.use_lazy_adam_optimizer = False + + # 动态学习率 + GLOBAL_BATCH_SIZE = 8192 * 8 + LR_SCHEDULE_STEPS = [ + int(2750 * 55296 / GLOBAL_BATCH_SIZE), + int(49315 * 55296 / GLOBAL_BATCH_SIZE), + int(27772 * 55296 / GLOBAL_BATCH_SIZE), + ] + self.global_step = tf.Variable(0, trainable=False) + _lr_scheduler = LearningRateScheduler( + 28.443, + 33.71193, + LR_SCHEDULE_STEPS[0], + LR_SCHEDULE_STEPS[1], + LR_SCHEDULE_STEPS[2], + ) + self.learning_rate = _lr_scheduler.calc(self.global_step) + + def __set_emb_table_size(self): + self.cache_mode = os.getenv("CACHE_MODE") + if self.cache_mode is None: + raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") + + if self.cache_mode == CacheModeEnum.HBM.value: + self.dev_vocab_size = 24_000_000 * self.rank_size + self.host_vocab_size = 0 + elif self.cache_mode == CacheModeEnum.DDR.value: + self.dev_vocab_size = 500_000 * self.rank_size + self.host_vocab_size = 24_000_000 * self.rank_size + elif self.cache_mode == CacheModeEnum.SSD.value: + self.dev_vocab_size = 100_000 * self.rank_size + self.host_vocab_size = 2_000_000 * self.rank_size + self.ssd_vocab_size = 24_000_000 * self.rank_size + else: + raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") + + def get_emb_table_cfg(self): + if self.cache_mode == CacheModeEnum.HBM.value: + return {"device_vocabulary_size": self.dev_vocab_size} + elif self.cache_mode == CacheModeEnum.DDR.value: + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size} + elif self.cache_mode == CacheModeEnum.SSD.value: + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size, + "ssd_vocabulary_size": self.ssd_vocab_size, + "ssd_data_path": SSD_DATA_PATH} + else: + raise RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation") + + +def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["mix_compile_mode"].b = False + custom_op.parameter_map["use_off_line"].b = True + custom_op.parameter_map["min_group_size"].b = 1 + # 可选配置level0:pairwise;level1:pairwise + custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:fullmesh") + custom_op.parameter_map["enable_data_pre_proc"].b = True + custom_op.parameter_map["iterations_per_loop"].i = 10 + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["hcom_parallel"].b = False + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") + custom_op.parameter_map["op_execute_timeout"].i = 2000 + custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes( + str(13 * 1024 * 1024 * 1024)) # total 31 need 13; + custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(18 * 1024 * 1024 * 1024)) # need 25 + custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes("DNN_VM_AICPU:3,AIcoreEngine:3") + + if dump_data: + custom_op.parameter_map["enable_dump"].b = True + custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(dump_path) + 
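+        # dump_step takes "|"-separated step indices, matching the default
+        # dump_steps value "0|1|2" in this function's signature.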
custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes(dump_steps) + custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all") + + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + return session_config + + +def get_npu_run_config(): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + run_config = NPURunConfig( + save_summary_steps=1000, + save_checkpoints_steps=100, + keep_checkpoint_max=5, + session_config=session_config, + log_step_count_steps=20, + precision_mode='allow_mix_precision', + enable_data_pre_proc=True, + iterations_per_loop=1, + jit_compile=False, + op_compiler_cache_mode="enable", + HCCL_algorithm="level0:fullmesh;level1:fullmesh" # 可选配置:level0:pairwise;level1:pairwise + ) + return run_config diff --git a/examples/WideDeep/model/delay_loss_scale.py b/examples/WideDeep/model/delay_loss_scale.py new file mode 100644 index 00000000..01bb0d8f --- /dev/null +++ b/examples/WideDeep/model/delay_loss_scale.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import tensorflow as tf +from tensorflow.python.training import optimizer + +from config import Config + + +class DenseLossScaleOptimizer: + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + if not isinstance(opt, optimizer.Optimizer): + raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + self._optimizer = opt + self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + _update_lr_loss_scale(self._optimizer, loss_scale) + + def compute_gradients(self, loss, var_list=None): + return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) + + def apply_gradients(self, avg_grads): + return self._optimizer.apply_gradients(avg_grads) + + +class SparseLossScaleOptimizer: + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + if not isinstance(opt, optimizer.Optimizer): + raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + self._optimizer = opt + self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + _update_lr_loss_scale(self._optimizer, loss_scale) + + def compute_gradients(self, loss, var_list=None): + return tf.gradients(loss * self._loss_scale, var_list) + + def apply_gradients(self, grads_and_vars): + return self._optimizer.apply_gradients(grads_and_vars) + + +def _update_lr_loss_scale(opt, loss_scale): + if loss_scale <= 0: + raise RuntimeError("the loss_scale must be greater than zero.") + loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + if hasattr(opt, "_lr"): + # LazyAdam or Adam optimizer + opt._lr = opt._lr / loss_scale + elif hasattr(opt, "_learning_rate"): + # SGD optimizer + opt._learning_rate = opt._learning_rate / loss_scale + else: + raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") diff --git a/examples/WideDeep/model/gradient_descent_w.py b/examples/WideDeep/model/gradient_descent_w.py new file mode 100644 index 00000000..53adb996 --- /dev/null +++ b/examples/WideDeep/model/gradient_descent_w.py @@ -0,0 +1,71 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import defaultdict + +import tensorflow as tf +from tensorflow.python.ops import math_ops +from tensorflow.python.training import gradient_descent +from mx_rec.optimizers.base import CustomizedOptimizer +from mx_rec.util.log import logger +from mx_rec.util.initialize import ConfigInitializer + + +def create_hash_optimizer(learning_rate, weight_decay=0.0001, use_locking=False, name="GradientDescent"): + optimizer = CustomizedGradientDescentWithWeighDecay(learning_rate=learning_rate, + weight_decay=weight_decay, + use_locking=use_locking, + name=name) + ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer + return optimizer + + +class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOptimizer, CustomizedOptimizer): + name_counter = defaultdict(int) + + def __init__(self, learning_rate, weight_decay, use_locking=False, name="GradientDescent"): + self.optimizer_type = "gradient_descent_with_weight_decay" + self.weight_decay = weight_decay + super(CustomizedGradientDescentWithWeighDecay, self)._get_name(name=name) + super(CustomizedGradientDescentWithWeighDecay, self).__init__( + learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name + ) + self._slot_num = 0 + self._derivative = 1 + + def get_slot_init_values(self): + logger.info("no slot for gradient descent") + return [] + + def _apply_sparse_duplicate_indices(self, grad, var): + logger.debug(">>>> Enter _apply_sparse_duplicate_indices") + nd_indices = tf.expand_dims(grad.indices, 1) + logger.info(f"weigh_decay={self.weight_decay}") + if self.weight_decay is None: + nd_value = grad.values * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) + else: + nd_value = (grad.values + math_ops.cast(self.weight_decay, var.dtype.base_dtype) * + tf.gather(var, grad.indices)) * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) + var_update_op = tf.scatter_nd_add(var, nd_indices, -nd_value, use_locking=self._use_locking) + return var_update_op + + def _apply_dense(self, grad, var): + logger.debug(">>>> Enter _apply_dense") + raise NotImplementedError("You are using a wrong type of variable.") diff --git a/examples/WideDeep/model/main_mxrec.py b/examples/WideDeep/model/main_mxrec.py new file mode 100644 index 00000000..51ed7c4a --- /dev/null +++ b/examples/WideDeep/model/main_mxrec.py @@ -0,0 +1,469 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import os +import shutil +import time +import warnings +import random +from glob import glob + +import tensorflow as tf +from sklearn.metrics import roc_auc_score +import numpy as np + +from optimizer import get_dense_and_sparse_optimizer +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum +from model import MyModel +from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET +from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func +from mx_rec.core.asc.manager import start_asc_pipeline +from mx_rec.core.embedding import create_table, sparse_lookup +from mx_rec.core.feature_process import EvictHook +from mx_rec.graph.modifier import modify_graph_and_start_emb_cache, GraphModifierHook +from mx_rec.constants.constants import ASCEND_TIMESTAMP +from mx_rec.util.initialize import ConfigInitializer, init, terminate_config_initializer +from mx_rec.util.ops import import_host_pipeline_ops +import mx_rec.util as mxrec_util +from mx_rec.util.variable import get_dense_and_sparse_variable +from mx_rec.util.log import logger +from npu_bridge.npu_init import * + +npu_plugin.set_device_sat_mode(0) + +dense_hashtable_seed = 128 +sparse_hashtable_seed = 128 +shuffle_seed = 128 +random.seed(shuffle_seed) + + +def add_timestamp_func(batch): + timestamp = import_host_pipeline_ops().return_timestamp(tf.cast(batch['label'], dtype=tf.int64)) + # tf.constant(np.random.randint(1,1688109060,1)), tf.int64)) + batch["timestamp"] = timestamp + return batch + + +def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False): + if config.USE_PIPELINE_TEST: + num_parallel = 1 + else: + num_parallel = 8 + + def extract_fn(data_record): + features = { + # Extract features using the keys set during creation + 'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64), + 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64), + 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32), + } + sample = tf.compat.v1.parse_single_example(data_record, features) + return sample + + def reshape_fn(batch): + batch['label'] = tf.reshape(batch['label'], [-1, 1]) + batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13]) + batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0) + batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26]) + return batch + + if is_training: + files_list = glob(os.path.join(config.data_path, config.train_file_pattern) + '/*.tfrecord') + else: + files_list = glob(os.path.join(config.data_path, config.test_file_pattern) + '/*.tfrecord') + dataset = tf.data.TFRecordDataset(files_list, num_parallel_reads=num_parallel) + batch_size = config.batch_size // config.line_per_sample + + dataset = dataset.shard(config.rank_size, config.rank_id) + if is_training: + dataset = dataset.shuffle(batch_size * 1000, seed=shuffle_seed) + if is_training: + dataset = dataset.repeat(config.train_epoch) + else: + dataset = dataset.repeat(config.test_epoch) + dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size, + drop_remainder=True) + dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel) + if is_use_faae: + dataset = dataset.map(add_timestamp_func) + + if not MODIFY_GRAPH_FLAG: + insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, 
is_training=is_training, dump_graph=dump_graph) + dataset = dataset.map(insert_fn) + + dataset = dataset.prefetch(100) + + iterator = dataset.make_initializable_iterator() + batch = iterator.get_next() + return batch, iterator + + +def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph): + embedding_list = [] + logger.debug(f"In model_forward function, is_train: {is_train}, feature_list: {len(feature_list)}, " + f"hash_table_list: {len(hash_table_list)}") + for feature, hash_table in zip(feature_list, hash_table_list): + if MODIFY_GRAPH_FLAG: + feature = batch["sparse_feature"] + embedding = sparse_lookup(hash_table, feature, cfg.send_count, dim=None, is_train=is_train, + name="user_embedding_lookup", modify_graph=modify_graph, batch=batch, + access_and_evict_config=None) + embedding_list.append(embedding) + + if len(embedding_list) == 1: + emb = embedding_list[0] + elif len(embedding_list) > 1: + emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False) + else: + raise ValueError("the length of embedding_list must be greater than or equal to 1.") + my_model = MyModel() + model_output = my_model.build_model(embedding=emb, + dense_feature=batch["dense_feature"], + label=batch["label"], + is_training=is_train, + seed=dense_hashtable_seed) + return model_output + + +def evaluate(): + print("read_test dataset") + if not MODIFY_GRAPH_FLAG: + eval_label = eval_model.get("label") + sess.run([eval_iterator.initializer]) + else: + # 在sess run模式下,若还是使用原来batch中的label去sess run,则会出现getnext超时报错,需要使用新数据集中的batch + eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label") + sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) + log_loss_list = [] + pred_list = [] + label_list = [] + eval_current_steps = 0 + finished = False + print("eval begin") + + while not finished: + try: + eval_current_steps += 1 + eval_start = time.time() + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label]) + eval_cost = time.time() - eval_start + qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size + log_loss_list += list(eval_loss.reshape(-1)) + pred_list += list(pred.reshape(-1)) + label_list += list(label.reshape(-1)) + print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") + if eval_current_steps == eval_steps: + finished = True + except tf.errors.OutOfRangeError: + finished = True + auc = roc_auc_score(label_list, pred_list) + mean_log_loss = np.mean(log_loss_list) + return auc, mean_log_loss + + +def evaluate_fix(step): + print("read_test dataset evaluate_fix") + if not MODIFY_GRAPH_FLAG: + sess.run([eval_iterator.initializer]) + else: + sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) + log_loss_list = [] + pred_list = [] + label_list = [] + eval_current_steps = 0 + finished = False + print("eval begin") + while not finished: + try: + eval_current_steps += 1 + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_model.get("label")]) + log_loss_list += list(eval_loss.reshape(-1)) + pred_list += list(pred.reshape(-1)) + label_list += list(label.reshape(-1)) + print(f"eval current_steps: {eval_current_steps}") + + if eval_current_steps == eval_steps: + finished = True + except tf.errors.OutOfRangeError: + finished = True + + label_numpy = np.array(label_list) + pred_numpy = np.array(pred_list) + if not os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}"): + 
os.makedirs(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}") + + if os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy"): + os.remove(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy") + if os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy"): + os.remove(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy") + if os.path.exists(f"flag_{rank_id}.txt"): + os.remove(f"flag_{rank_id}.txt") + np.save(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy", label_numpy) + np.save(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy", pred_numpy) + os.mknod(f"flag_{rank_id}.txt") + while True: + file_exists_list = [os.path.exists(f"flag_{i}.txt") for i in range(rank_size)] + if sum(file_exists_list) == rank_size: + print("All saved!!!!!!!!!!") + break + else: + print("Waitting for saving numpy!!!!!!!!") + time.sleep(1) + continue + + auc = roc_auc_score(label_list, pred_list) + mean_log_loss = np.mean(log_loss_list) + return auc, mean_log_loss + + +def create_feature_spec_list(use_timestamp=False): + access_threshold = None + eviction_threshold = None + if use_timestamp: + access_threshold = 1000 + eviction_threshold = 180 + + feature_spec_list = [FeatureSpec("sparse_feature", table_name="sparse_embeddings", batch_size=cfg.batch_size, + access_threshold=access_threshold, eviction_threshold=eviction_threshold)] + if use_multi_lookup: + feature_spec_list.append(FeatureSpec("sparse_feature", table_name="sparse_embeddings", + batch_size=cfg.batch_size, + access_threshold=access_threshold, + eviction_threshold=eviction_threshold)) + if use_timestamp: + feature_spec_list.append(FeatureSpec("timestamp", is_timestamp=True)) + return feature_spec_list + + +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"Delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + _del_related_dir("kernel*") + _del_related_dir("model_dir_rank*") + _del_related_dir("op_cache") + + if os.getenv("CACHE_MODE", "") != CacheModeEnum.SSD.value: + return + logger.info("Current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + " then create empty directory for this use case.") + for sub_path in SSD_DATA_PATH: + _del_related_dir(sub_path) + os.makedirs(sub_path, mode=0o550, exist_ok=True) + logger.info(f"Create dir:{sub_path}") + + +if __name__ == "__main__": + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + warnings.filterwarnings("ignore") + _clear_saved_model() + + rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None + rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None + interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None + train_steps = 10000 + eval_steps = 1360 + + try: + use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))) + use_multi_lookup = bool(int(os.getenv("USE_MULTI_LOOKUP", 0))) + MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0))) + use_faae = bool(int(os.getenv("USE_FAAE", 0))) + except ValueError as err: + raise ValueError("please correctly config USE_DYNAMIC_EXPANSION or USE_MULTI_LOOKUP or USE_FAAE " + "or USE_MODIFY_GRAPH 
only 0 or 1 is supported.") from err + + use_dynamic = bool(int(os.getenv("USE_DYNAMIC", 0))) + logger.info(f"USE_DYNAMIC:{use_dynamic}") + init(train_steps=train_steps, eval_steps=eval_steps, + use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion) + IF_LOAD = False + rank_id = mxrec_util.communication.hccl_ops.get_rank_id() + filelist = glob(f"./saved-model/sparse-model-0") + if filelist: + IF_LOAD = True + ConfigInitializer.get_instance().if_load = IF_LOAD + + cfg = Config() + feature_spec_list_train = None + feature_spec_list_eval = None + if use_faae: + feature_spec_list_train = create_feature_spec_list(use_timestamp=True) + feature_spec_list_eval = create_feature_spec_list(use_timestamp=True) + else: + feature_spec_list_train = create_feature_spec_list(use_timestamp=False) + feature_spec_list_eval = create_feature_spec_list(use_timestamp=False) + + train_batch, train_iterator = make_batch_and_iterator(cfg, feature_spec_list_train, is_training=True, + dump_graph=True, is_use_faae=use_faae) + eval_batch, eval_iterator = make_batch_and_iterator(cfg, feature_spec_list_eval, is_training=False, + dump_graph=False, is_use_faae=use_faae) + logger.info(f"train_batch: {train_batch}") + + if use_faae: + cfg.dev_vocab_size = cfg.dev_vocab_size // 2 + + optimizer_list = [get_dense_and_sparse_optimizer(cfg)] + + # note: variance_scaling_initializer only support HBM mode + emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \ + if cfg.cache_mode != "HBM" or use_dynamic_expansion else \ + tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed) + sparse_hashtable = create_table( + key_dtype=cfg.key_type, + dim=tf.TensorShape([cfg.emb_dim]), + name="sparse_embeddings", + emb_initializer=emb_initializer, + **cfg.get_emb_table_cfg() + ) + if use_faae: + tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, train_batch["timestamp"]) + + sparse_hashtable_list = [sparse_hashtable, sparse_hashtable] if use_multi_lookup else [sparse_hashtable] + train_model = model_forward(feature_spec_list_train, sparse_hashtable_list, train_batch, + is_train=True, modify_graph=MODIFY_GRAPH_FLAG) + eval_model = model_forward(feature_spec_list_eval, sparse_hashtable_list, eval_batch, + is_train=False, modify_graph=MODIFY_GRAPH_FLAG) + + dense_variables, sparse_variables = get_dense_and_sparse_variable() + trainable_varibles = [] + trainable_varibles.extend(dense_variables) + if use_dynamic_expansion: + trainable_varibles.append(tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)[0]) + else: + trainable_varibles.extend(sparse_variables) + rank_size = mxrec_util.communication.hccl_ops.get_rank_size() + train_ops = [] + # multi task training + for loss, (dense_optimizer, sparse_optimizer) in zip([train_model.get("loss")], optimizer_list): + # do dense optimization + grads = dense_optimizer.compute_gradients(loss, var_list=trainable_varibles) + avg_grads = [] + for grad, var in grads[:-1]: + if rank_size > 1: + grad = hccl_ops.allreduce(grad, "sum") if grad is not None else None + if grad is not None: + avg_grads.append((grad / 8.0, var)) + # apply gradients: update variables + train_ops.append(dense_optimizer.apply_gradients(avg_grads)) + + if use_dynamic_expansion: + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) + # do sparse optimization by addr + sparse_grads = list(grads[-1]) # local_embedding + grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, 
train_address_list)] + train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars)) + else: + # do sparse optimization + sparse_grads = list(grads[-1]) + print("sparse_grads_tensor:", sparse_grads) + grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)] + train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars)) + + # 动态学习率更新 + train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]]) + + with tf.control_dependencies(train_ops): + train_ops = tf.no_op() + cfg.learning_rate = [cfg.learning_rate[0], cfg.learning_rate[1]] + + saver = tf.train.Saver() + if MODIFY_GRAPH_FLAG: + modify_graph_and_start_emb_cache(dump_graph=True) + else: + start_asc_pipeline() + + hook_list = [] + if use_faae: + hook_evict = EvictHook(evict_enable=True, evict_time_interval=120) + hook_list.append(hook_evict) + if MODIFY_GRAPH_FLAG: # 该场景添加hook处理校验问题 + hook_list.append(GraphModifierHook(modify_graph=False)) + + # with tf.compat.v1.Session(config=sess_config(dump_data=False)) as sess: + if use_faae: + sess = tf.compat.v1.train.MonitoredTrainingSession( + hooks=hook_list, + config=sess_config(dump_data=False) + ) + sess.graph._unsafe_unfinalize() + if not MODIFY_GRAPH_FLAG: + sess.run(train_iterator.initializer) + else: + sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True)) + else: + sess = tf.compat.v1.Session(config=sess_config(dump_data=False)) + sess.run(tf.compat.v1.global_variables_initializer()) + if not MODIFY_GRAPH_FLAG: + sess.run(train_iterator.initializer) + else: + sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True)) + + epoch = 0 + cost_sum = 0 + qps_sum = 0 + best_auc = 0 + iteration_per_loop = 10 + + train_ops = util.set_iteration_per_loop(sess, train_ops, 10) + + # for i in range(1, TRAIN_STEPS): + i = 0 + while True: + i += 1 + logger.info(f"################ training at step {i * iteration_per_loop} ################") + start_time = time.time() + + try: + grad, loss = sess.run([train_ops, train_model.get("loss")]) + lr = sess.run(cfg.learning_rate) + global_step = sess.run(cfg.global_step) + except tf.errors.OutOfRangeError: + logger.info(f"Encounter the end of Sequence for training.") + break + + end_time = time.time() + cost_time = end_time - start_time + qps = (1 / cost_time) * rank_size * cfg.batch_size * iteration_per_loop + cost_sum += cost_time + logger.info(f"step: {i * iteration_per_loop}; training loss: {loss}") + logger.info(f"step: {i * iteration_per_loop}; grad: {grad}") + logger.info(f"step: {i * iteration_per_loop}; lr: {lr}") + logger.info(f"global step: {global_step}") + logger.info(f"step: {i * iteration_per_loop}; current sess cost time: {cost_time:.10f}; current QPS: {qps}") + logger.info(f"training at step:{i * iteration_per_loop}, table[{sparse_hashtable.table_name}], " + f"table size:{sparse_hashtable.size()}, table capacity:{sparse_hashtable.capacity()}") + + if i % (train_steps // iteration_per_loop) == 0: + if interval is not None: + test_auc, test_mean_log_loss = evaluate_fix(i * iteration_per_loop) + else: + test_auc, test_mean_log_loss = evaluate() + print("Test auc: {}; log_loss: {} ".format(test_auc, test_mean_log_loss)) + best_auc = max(best_auc, test_auc) + logger.info(f"training step: {i * iteration_per_loop}, best auc: {best_auc}") + + sess.close() + + terminate_config_initializer() + logger.info("Demo done!") diff --git a/examples/WideDeep/model/mean_auc.py b/examples/WideDeep/model/mean_auc.py new 
file mode 100644 index 00000000..ff57df00 --- /dev/null +++ b/examples/WideDeep/model/mean_auc.py @@ -0,0 +1,40 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +from glob import glob +import numpy as np + + +def split_auc(log_input): + with open(log_input, 'r') as log: + all_auc = [] + for line in log.readlines(): + if 'Test' in line: + all_auc.append(float(line.split(';')[0].split(':')[-1].strip())) + all_auc_len = len(all_auc) + all_auc_arr = np.array(all_auc)[:all_auc_len - all_auc_len % 8] + test_auc = np.mean(all_auc_arr.reshape(-1, 8), axis=-1) + return test_auc + + +log_path_all = 'latest_*.log' +log_path_list = glob(log_path_all) + +for log_path in log_path_list: + print(os.path.basename(log_path)) + print(split_auc(log_path)) + print('*'*20) \ No newline at end of file diff --git a/examples/WideDeep/model/model.py b/examples/WideDeep/model/model.py new file mode 100644 index 00000000..037fb276 --- /dev/null +++ b/examples/WideDeep/model/model.py @@ -0,0 +1,94 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import time +from easydict import EasyDict as edict + +import tensorflow as tf + + +model_cfg = edict() +model_cfg.loss_mode = "batch" +LOSS_OP_NAME = "loss" +LABEL_OP_NAME = "label" +VAR_LIST = "variable" +PRED_OP_NAME = "pred" + + +class MyModel: + def __init__(self): + self.kernel_init = None + self._loss_fn = None + self.is_training = None + + @classmethod + def _dot_interaction(cls, _input): + num_features = tf.shape(_input)[1] + batch_size = tf.shape(_input)[0] + xactions = tf.matmul(_input, _input, transpose_b=True) + ones = tf.ones_like(xactions, dtype=tf.float32) + upper_tri_mask = tf.linalg.band_part(ones, 0, -1) + + activations = tf.where(condition=tf.cast(upper_tri_mask, tf.bool), + x=tf.zeros_like(xactions), + y=xactions) + out_dim = num_features * num_features + activations = tf.reshape(activations, (batch_size, out_dim)) + return activations + + def build_model(self, + embedding=None, + dense_feature=None, + label=None, + is_training=True, + seed=None): + with tf.variable_scope("mlp", reuse=tf.AUTO_REUSE): + self._loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True) + self.is_training = is_training + dense_embedding_vec = self.bottom_stack(dense_feature, seed) + dense_embedding = tf.expand_dims(dense_embedding_vec, 1) + interaction_args = tf.concat([dense_embedding, embedding], axis=1) + interaction_output = self._dot_interaction(interaction_args) + feature_interaction_output = tf.concat([dense_embedding_vec, interaction_output], axis=1) + # (8192, 857) + logits = self.top_stack(feature_interaction_output, seed) + loss = self._loss_fn(label, logits) + prediction = tf.sigmoid(logits) + trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mlp') + return {LOSS_OP_NAME: loss, + PRED_OP_NAME: prediction, + LABEL_OP_NAME: label, + VAR_LIST: trainable_variables} + + def bottom_stack(self, _input, seed): + dnn1 = tf.layers.dense(_input, 512, activation='relu', name='bs1', + kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), + bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), + kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + dnn2 = tf.layers.dense(dnn1, 256, activation='relu', name='bs2', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + dnn3 = tf.layers.dense(dnn2, 128, activation='relu', name='bs3', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + return dnn3 + + def top_stack(self, _input, seed): + dnn1 = tf.layers.dense(_input, 1024, activation='relu', name='ts1', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + dnn2 = tf.layers.dense(dnn1, 1024, activation='relu', name='ts2', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), 
bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + dnn3 = tf.layers.dense(dnn2, 512, activation='relu', name='ts3', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + dnn4 = tf.layers.dense(dnn3, 256, activation='relu', name='ts4', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + dnn5 = tf.layers.dense(dnn4, 1, activation=None, name='ts5', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) + return dnn5 + + +my_model = MyModel() diff --git a/examples/WideDeep/model/op_impl_mode.ini b/examples/WideDeep/model/op_impl_mode.ini new file mode 100644 index 00000000..579dea43 --- /dev/null +++ b/examples/WideDeep/model/op_impl_mode.ini @@ -0,0 +1 @@ +ScatterNdAdd=support_out_of_bound_index \ No newline at end of file diff --git a/examples/WideDeep/model/optimizer.py b/examples/WideDeep/model/optimizer.py new file mode 100644 index 00000000..18dbe288 --- /dev/null +++ b/examples/WideDeep/model/optimizer.py @@ -0,0 +1,46 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import tensorflow as tf + +from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer +from gradient_descent_w import create_hash_optimizer +from mx_rec.util.initialize import ConfigInitializer +from mx_rec.optimizers.gradient_descent_by_addr import create_hash_optimizer_by_addr +from mx_rec.optimizers import lazy_adam + + +def get_dense_and_sparse_optimizer(cfg): + use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion + if cfg.use_lazy_adam_optimizer: + if use_dynamic_expansion: + raise RuntimeError("model is incompatible with dynamic_expansion when use lazy_adam optimizer.") + # use lazy_adam optimizer + dense_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=cfg.learning_rate[0]) + sparse_optimizer = lazy_adam.create_hash_optimizer(learning_rate=cfg.learning_rate[1]) + loss_scale = 65536 + else: + # use SGD optimizer + dense_optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=cfg.learning_rate[0]) + if use_dynamic_expansion: + sparse_optimizer = create_hash_optimizer_by_addr(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) + else: + sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) + loss_scale = 1024 + sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale) + dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale) + + return dense_optimizer, sparse_optimizer diff --git a/examples/WideDeep/model/run.sh b/examples/WideDeep/model/run.sh new file mode 100644 index 00000000..6c142443 --- /dev/null +++ b/examples/WideDeep/model/run.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +cur_path=$(dirname "$(readlink -f "$0")") + +so_path=$1 +mx_rec_package_path=$2 +hccl_cfg_json=$3 +dlrm_criteo_data_path=$4 +ip=$5 # no ranktable时传入该参数 + +interface="lo" +num_server=1 +local_rank_size=8 +num_process=$((num_server * local_rank_size)) +export TRAIN_RANK_SIZE=$num_process + +################# 参数配置 ###################### +export USE_DYNAMIC=0 # 0:静态shape;1:动态shape +export CACHE_MODE="HBM" # HBM;DDR;SSD +export USE_FAAE=0 # 0:关闭准入淘汰;1:开启准入淘汰 +export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 +export USE_MULTI_LOOKUP=0 # 0:一表一查;1:一表多查 +export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 +################################################ +echo "CACHE_MODE:${CACHE_MODE}" + +export HCCL_CONNECT_TIMEOUT=1200 +export DLRM_CRITEO_DATA_PATH=${dlrm_criteo_data_path} +export PYTHONPATH=${mx_rec_package_path}:${so_path}:$PYTHONPATH +export LD_PRELOAD=/usr/lib64/libgomp.so.1 +export LD_LIBRARY_PATH=${so_path}:/usr/local/lib:$LD_LIBRARY_PATH +export ASCEND_DEVICE_ID=0 +export RANK_ID_START=0 +export JOB_ID=10086 +export CUSTOMIZED_OPS_LIB_PATH=${so_path}/libcust_ops.so # Todo: please config +export MXREC_LOG_LEVEL="INFO" +export TF_CPP_MIN_LOG_LEVEL=3 +export ASCEND_GLOBAL_LOG_LEVEL=3 +#export USE_FAAE=1 +export ENABLE_FORCE_V2_CONTROL=1 + +export PROFILING_OPTIONS='{"output":"/home/yz/profiling", + "training_trace":"on", + "task_trace":"on", + "aicpu":"on", + "fp_point":"", + "bp_point":"", + "aic_metrics":"PipeUtilization"}' + +RANK_ID_START=0 + +export MXREC_MODE="ASC" +echo "MXREC_MODE is $MXREC_MODE" +export py=main_mxrec.py +echo "py is $py" + +# 区分ranktable和no ranktable +if [ -n "$ip" ]; then + # no ranktable分支 + echo "Current is no ranktable solution." + echo "Input node ip: $ip, please make sure this ip is available." 
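+    # Rank-table-free startup: instead of reading a pre-generated hccl json
+    # file, every worker rendezvouses with the chief process through the CM_*
+    # variables exported below (chief ip/port/device, worker ip and count).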
+ export CM_CHIEF_IP=$ip # 主节点ip + export CM_CHIEF_PORT=60001 # 主节点监听端口 + export CM_CHIEF_DEVICE=0 # 主节点device id + export CM_WORKER_IP=$ip # 当前节点ip + export CM_WORKER_SIZE=$num_process # 参与集群训练的device数量 + echo "CM_CHIEF_IP=$CM_CHIEF_IP" + echo "CM_CHIEF_PORT=$CM_CHIEF_PORT" + echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" + echo "CM_WORKER_IP=$CM_WORKER_IP" + echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" +else + # ranktable分支 + echo "Current is ranktable solution, hccl json file:${hccl_cfg_json}" + export RANK_SIZE=$num_process + echo "RANK_SIZE=${RANK_SIZE}, please make sure hccl configuration json file match this parameter" + export RANK_TABLE_FILE=${hccl_cfg_json} +fi + +echo "use horovod to start tasks" +# GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2.ERROR, 默认为INFO +mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' + +horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ +python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p.log -- Gitee From 7a05b033d41af51df9aed7414ad04216dff821cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Mon, 3 Jun 2024 16:42:26 +0800 Subject: [PATCH 189/302] =?UTF-8?q?WideDeep=E6=A8=A1=E5=9E=8B=20=E8=BF=81?= =?UTF-8?q?=E7=A7=BB=E5=BC=80=E6=BA=90=E9=A1=B9=E7=9B=AE=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/README_WD.md | 567 ++++++++++++++++++++ examples/WideDeep/criteo.py | 246 +++++++++ examples/WideDeep/model/config.py | 22 +- examples/WideDeep/model/delay_loss_scale.py | 30 +- examples/WideDeep/model/main_mxrec.py | 205 ++++--- examples/WideDeep/model/model.py | 91 ++-- examples/WideDeep/model/optimizer.py | 28 +- 7 files changed, 1022 insertions(+), 167 deletions(-) create mode 100644 examples/WideDeep/README_WD.md create mode 100644 examples/WideDeep/criteo.py diff --git a/examples/WideDeep/README_WD.md b/examples/WideDeep/README_WD.md new file mode 100644 index 00000000..261861f7 --- /dev/null +++ b/examples/WideDeep/README_WD.md @@ -0,0 +1,567 @@ +# wide&deep模型 迁移样例(基于DLRM模型框架) + +开源项目在保证原有结构不变的情况下,可采用替换相关API接口的方式将项目由GPU >> NPU >> mxrec。在模型迁移适配过程中可能因兼容性问题而导致模型迁移失败,此处提供另一种模型适配方案。 + +*** +## 开源项目链接 + +```shell +https://github.com/ZiyaoGeng/RecLearn +``` +*** +## 数据集 + +```shell +Criteo4500w数据集: +https://ailab.criteo.com/ressources/kaggle-display-advertising-challenge-dataset.tar.gz +``` +*** +## 数据集预处理 + +### 解压文件列表 +- train.txt +- test.txt +- readme.txt + +text.txt因缺少label列无法使用,将train.txt数据集切分为10份,train_01.txt~train_09.txt为训练集,train_10.txt为测试集。数据预处理文件:criteo.py。 + +*** +### 数据预处理运行脚本 +```shell +python critro.py --data_path data_path --output_path output_path +``` +参数说明: +- dataset_path: train.txt的路径,如:"D:\dat\train.txt" +- output_path: tfrecord存放路径,如:"D:\dat\tfrecord\ " +*** + +### criteo.py +#### 1. 分割数据集 +调用`criteo.py`文件中的`get_split_file_path(parent_path, dataset_path, sample_num=4600000)`方法将数据集分割,`sample_num=4600000`是每个子数据集的样本数量。返回包含全部子数据集名称的列表。 + +```python +# get txt_list +split_file_list = get_split_file_path(dataset_path = dataset_path) +``` +*** +#### 2. 建立特征映射 +调用`criteo.py`文件中的`get_fea_map()`方法,以`{'C1':{}, 'C2':{},..., 'I1':{},...}`形式储存dense_feature的最大最小值以及sparse_feature去重后的特征映射。 + +```python +# get feature_map +fea_map = get_fea_map(split_file_list=split_file_list) +``` +*** +#### 3. 
dense_feature分桶离散化 +调用`criteo.py`文件中的`rec_kbins_discretizer(data_df, n_bins, min_max_dict)`方法将dense_feature分桶化离散化,`nbins=1000`。 + +```python +# dense feature: Bin continuous data into intervals. +data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, fea_map) +``` +*** +#### 4. sparse_feature特征映射 +通过如下操作将原始的字符串数据映射为0~max的int64数据。 + +```python +# sparse feature: mapping +for col in sparse_features: + data_df[col] = data_df[col].map(lambda x: fea_map[col][x]) +``` +*** +#### 5. 39个特征增加偏移项 +开源项目deep部分对39个特征分别作了embedding,即建了39个表。本项目只建了一张表,因此需要对每个特征对应的值作偏移。`slot_size_array`中的值分别对应各特征去重后的类别数。 + +```python +# add offsets +slot_size_array = [1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, + 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, + 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573] +offset_size_list = np.cumsum([0] + slot_size_array[:-1]) +for j in range(1,len(offset_size_list)+1): + data_df.iloc[:, j] += offset_size_list[j-1] +``` +*** +#### 6. 数据集格式转换:txt >> tfrecord +调用`criteo.py`文件中的`convert_input2tfrd(in_file_path, out_file_path)`方法将txt文件转换为tfrecord文件。 + +```python +# txt to tfrecords +convert_input2tfrd(in_file_path=file, out_file_path=output_path) +``` +*** + +## 模型运行 + +参考mxrec的`README.md`文件在NPU服务器上配置环境后,可按照[mxrec-tf1](https://ascendhub.huawei.com/#/detail/mxrec-tf1)中DLRM模型运行命令启动模型训练。`so_path`、`mx_rec_package_path`、`hccl_cfg_json`配置不变,根据实际数据集路径配置`dlrm_criteo_data_path`。 + +```shell +# 运行命令 +bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_path} +``` +*** + +## 模型结果 +[开源项目](https://github.com/ZiyaoGeng/RecLearn)使用Criteo4500W数据集在GPU上训练模型,结果为`Log Loss=0.4692`、`AUC=0.7930`。适配完成模型后,固定`CACHE_MODE="HBM"`、`USE_FAAE=0`,在`run.sh`中配置其他选项卡,运行结果如下。 + +
+| Model | Use_Dynamic | Use_Dynamic_Expansion | Use_Multi_Lookup | Use_Modify_Graph | Log Loss | AUC |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| WDL | 0 | 0 | 0 | 0 | 0.4592 | 0.7934 |
+| WDL | 0 | 1 | 0 | 0 | 0.4593 | 0.7933 |
+| WDL | 1 | 0 | 0 | 0 | 0.4594 | 0.7932 |
+| WDL | 1 | 1 | 0 | 0 | 0.4594 | 0.7932 |
+| WDL | 1 | 1 | 1 | 0 | 0.4590 | 0.7937 |
+| WDL | 0 | 0 | 0 | 1 | 0.4593 | 0.7934 |
+| WDL | 0 | 1 | 0 | 1 | 0.4593 | 0.7933 |
+| WDL | 1 | 0 | 0 | 1 | 0.4593 | 0.7933 |
+| WDL | 1 | 1 | 0 | 1 | 0.4594 | 0.7932 |
+| WDL | 1 | 1 | 1 | 1 | 0.4589 | 0.7937 |
+ + +*** +## 模型迁移 + +**迁移思路:** 参考开源项目,在现有已适配好的dlrm模型框架下,改动相关代码逻辑,完成Wide&deep模型的适配。 + +下文所提到的`动态扩容`、`动态shape`、`自动改图`、`一表多查`是mxrec提供的相关特性,开关选项见`run.sh`。 + +```shell +# run.sh: 32~37行 +export USE_DYNAMIC=0 # 0:静态shape;1:动态shape +export CACHE_MODE="HBM" # HBM;DDR;SSD +export USE_FAAE=0 # 0:关闭准入淘汰;1:开启准入淘汰 +export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 +export USE_MULTI_LOOKUP=0 # 0:一表一查;1:一表多查 +export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 +``` + +*** +### DLRM模型框架 +**迁移说明:** 迁移过程中未使用`gradient_descent_w.py`、`mean_auc.py`。 + +- config.py +- delay_loss_scale.py +- gradient_descent_w.py +- main_mxrec.py +- mean_auc.py +- model.py +- optimizer.py +- run.sh + +*** + +#### 1. config.py +实验超参数配置如下:取消动态学习率逻辑,学习率固定为0.001。 + +```python +# 88~89行 +lr_sparse = self.base_lr_sparse * lr_factor_constant +lr_dense = self.base_lr_dense * lr_factor_constant +# 140~146行 +_lr_scheduler = LearningRateScheduler( + 0.001, + 0.001, + LR_SCHEDULE_STEPS[0], + LR_SCHEDULE_STEPS[1], + LR_SCHEDULE_STEPS[2], +) +# 超参数 +self.batch_size = 4096 +self.line_per_sample = 1 +self.train_epoch = 1 +self.test_epoch = 9 +self.emb_dim = 8 +``` +*** + + +#### 2. model.py +迁移过程中,`model.py`需参考开源项目文件`reclearn/models/ranking/wdl.py`的代码逻辑,使用tensorflow的低阶API重新编写。输出参数必须包括`loss`,`prediction`,`label`,`trainable_variables`。**迁移重点:mxRec对推荐模型中sparse_feature的创表查表操作作了加速,使用`create_table`与`sparse_lookup`接口替换tensorflow中的`tf.nn.embedding_lookup`接口。** 因此在适配开源项目时,会将sparse_feature的embedding操作放在模型结构外。 + +**reclearn开源项目原始代码:** +```python +# wdl.py +import tensorflow as tf +from tensorflow.keras import Model +from tensorflow.keras.layers import Dense, Embedding, Dropout, Input +from tensorflow.keras.regularizers import l2 + +from reclearn.layers import Linear, MLP +from reclearn.layers.utils import index_mapping + +class WideDeep(Model): + def __init__(self, feature_columns, hidden_units, activation='relu', + dnn_dropout=0., embed_reg=0., w_reg=0.): + """Wide&Deep. + Args: + :param feature_columns: A list. [{'feat_name':, 'feat_num':, 'embed_dim':}, ...] + :param hidden_units: A list. Neural network hidden units. + :param activation: A string. Activation function of MLP. + :param dnn_dropout: A scalar. Dropout of MLP. + :param embed_reg: A scalar. The regularization coefficient of embedding. + :param w_reg: A scalar. The regularization coefficient of Linear. 
+ :return + """ + super(WideDeep, self).__init__() + self.feature_columns = feature_columns + self.embed_layers = { + feat['feat_name']: Embedding(input_dim=feat['feat_num'], + input_length=1, + output_dim=feat['embed_dim'], + embeddings_initializer='random_normal', + embeddings_regularizer=l2(embed_reg)) + for feat in self.feature_columns + } + self.map_dict = {} + self.feature_length = 0 + for feat in self.feature_columns: + self.map_dict[feat['feat_name']] = self.feature_length + self.feature_length += feat['feat_num'] + self.dnn_network = MLP(hidden_units, activation, dnn_dropout) + self.linear = Linear(self.feature_length, w_reg=w_reg) + self.final_dense = Dense(1, activation=None) + + def call(self, inputs): + sparse_embed = tf.concat([self.embed_layers[feat_name](value) for feat_name, value in inputs.items()], axis=-1) + x = sparse_embed # (batch_size, field * embed_dim) + # Wide + wide_inputs = index_mapping(inputs, self.map_dict) + wide_inputs = tf.concat([value for _, value in wide_inputs.items()], axis=-1) + wide_out = self.linear(wide_inputs) + # Deep + deep_out = self.dnn_network(x) + deep_out = self.final_dense(deep_out) + # out + outputs = tf.nn.sigmoid(0.5 * wide_out + 0.5 * deep_out) + return outputs + + def summary(self): + inputs = { + feat['feat_name']: Input(shape=(), dtype=tf.int32, name=feat['feat_name']) + for feat in self.feature_columns + } + Model(inputs=inputs, outputs=self.call(inputs)).summary() + +``` +`self.embed_layers`是对数据集中39个特征分别建表作embedding的操作,迁移后对应的代码逻辑见`main_mxrec.py`。 +`self.map_dict`统计了各特征需增加的偏移量。 +`index_mapping`是对数据增加偏移量的操作,迁移后对应的代码逻辑见`criteo.py`。 + +**迁移后代码:** +```python +# model.py +import time +from easydict import EasyDict as edict + +import tensorflow as tf + + +model_cfg = edict() +model_cfg.loss_mode = "batch" +LOSS_OP_NAME = "loss" +LABEL_OP_NAME = "label" +VAR_LIST = "variable" +PRED_OP_NAME = "pred" + + +class MyModel: + def __init__(self): + self.kernel_init = None + self._loss_fn = None + self.is_training = None + + def build_model(self, + wide_embedding=None, + deep_embedding=None, + label=None, + is_training=True, + seed=None, + dropout_rate=None, + batch_norm=False): + + with tf.variable_scope("wide_deep", reuse=tf.AUTO_REUSE): + self._loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True) + self.is_training = is_training + + # wide + batch_size, wide_num, wide_emb_dim = wide_embedding.shape + wide_input = tf.reshape(wide_embedding[:,0], shape=(batch_size, wide_num * 1)) + wide_output = tf.reshape(tf.reduce_sum(wide_input, axis=1), shape=(-1,1)) + + # deep + batch_size, deep_num, deep_emb_dim = deep_embedding.shape + deep_input = tf.reshape(deep_embedding, shape=(batch_size, deep_num * deep_emb_dim)) + + ## MLP + hidden_units = [256,128,64] + net = deep_input + for i,unit in enumerate(hidden_units): + + net = tf.layers.dense(net, units=unit, activation='relu', name=f'hidden_layer_{i}', + kernel_initializer=tf.glorot_uniform_initializer(seed=seed), + bias_initializer=tf.zeros_initializer()) + + if dropout_rate is not None and 0.0 < dropout_rate < 1.0: + net = tf.layers.dropout(net,dropout_rate,training=self.is_training) + if batch_norm: + net = tf.layers.batch_normalization(net, training=self.is_training) + + deep_output = tf.layers.dense(net, units=1, activation=None, name='deep_output', + kernel_initializer=tf.glorot_uniform_initializer(seed=seed), + bias_initializer=tf.zeros_initializer()) + + total_logits = 0.5 * tf.add(wide_output,deep_output,name='total_logits') + loss = self._loss_fn(label, total_logits) + prediction = 
tf.sigmoid(total_logits) + trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='wide_deep') + return {LOSS_OP_NAME: loss, + PRED_OP_NAME: prediction, + LABEL_OP_NAME: label, + VAR_LIST: trainable_variables} + + +my_model = MyModel() + +``` +*** +#### 3. main_mxrec.py + +`main_mxrec.py`文件中的函数如下所示。`make_batch_and_iterator()`是读取数据集以及对数据作处理的函数;`model_forward()`是前向过程函数;`evaluate()`与`evaluate_fix()`是评估函数,用于计算测试集的AUC与loss。`add_timestamp_func()`与特征准入、淘汰有关;`create_feature_spec_list()`是生成元素为FeatureSpec类的列表的函数,其返回值是`make_batch_and_iterator()`所需的传参。特征准入与淘汰、FeatureSpec类、自动改图等解释见[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0001.html)。 + +- `add_timestamp_func()` +- `make_batch_and_iterator()` +- `model_forward()` +- `evaluate()` +- `evaluate_fix()` +- `create_feature_spec_list()` + +**迁移代码改动说明:** `add_timestamp_func()`、`evaluate()`、`evaluate_fix()`未作修改。 +
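+
+Before walking through these functions, here is a minimal sketch of the API substitution highlighted in the model.py section above, i.e. replacing a native `tf.nn.embedding_lookup` with one shared mxRec table. The `create_table`/`sparse_lookup` arguments mirror the calls quoted elsewhere in this README; `embedding_matrix`, `sparse_ids`, `emb_initializer`, `batch`, `cfg` and `MODIFY_GRAPH_FLAG` stand in for objects defined by the surrounding script, so treat this as an illustration rather than a drop-in snippet:
+
+```python
+import tensorflow as tf
+
+from mx_rec.core.embedding import create_table, sparse_lookup
+
+# Open-source project: one embedding matrix per feature, queried natively.
+embedding = tf.nn.embedding_lookup(params=embedding_matrix, ids=sparse_ids)
+
+# mxRec migration: one shared hash table plus the accelerated lookup.
+table = create_table(key_dtype=cfg.key_type,
+                     dim=tf.TensorShape([cfg.emb_dim]),
+                     name="sparse_embeddings",
+                     emb_initializer=emb_initializer,
+                     **cfg.get_emb_table_cfg())
+embedding = sparse_lookup(table, sparse_ids, cfg.send_count, dim=None,
+                          is_train=True, name="embedding_lookup",
+                          modify_graph=MODIFY_GRAPH_FLAG, batch=batch,
+                          access_and_evict_config=None)
+```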
+ +3.1 读取数据集:`make_batch_and_iterator()` + +```python +# main_mxrec.py:100~104行 +def map_fn(batch): + new_batch = batch + new_batch['sparse_feature'] = tf.concat([batch['dense_feature'], batch['sparse_feature']], axis=1) + return new_batch +dataset = dataset.map(map_fn, num_parallel_calls=num_parallel) +``` +`map_fn()`:该函数是将分桶后的dense_feature与sparse_feature合并为新sparse_feature。该操作主要与`FeatureSpec()`、`sparse_lookup()`传入参数有关。 + +```python +# main_mxrec.py:109~118行 +if not MODIFY_GRAPH_FLAG: + + # Enable EOSDataset manually. + librec = import_host_pipeline_ops(LIBREC_EOS_OPS_SO) + channel_id = 0 if is_training else 1 + # 此处eos_map的调用必须先于insert_func,避免多卡数据不均匀的情况 + dataset = dataset.eos_map(librec, channel_id, kwargs.get("max_train_steps", max_train_steps), + kwargs.get("max_eval_steps", eval_steps)) + insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, is_training=is_training, dump_graph=dump_graph) + dataset = dataset.map(insert_fn) +``` +`dataset.eos_map()`:该函数主要是为了解决FeatureSpec模式下开`动态shape`选项卡,训练结束无法正常退出的问题。 + +*** +3.2 模型前向传播过程 + +```python +# main_mxrec.py:127~179行 +def model_forward(feature_list, wide_hash_table_list, deep_hash_table_list, batch, is_train, modify_graph, is_use_faae=False): + wide_embedding_list = [] + deep_embedding_list = [] + wide_feature_list = [] + deep_feature_list = [] + if is_use_faae: + feature_list_copy = feature_list[:-1] + else: + feature_list_copy = feature_list + + for i,item in enumerate(feature_list_copy): + if i % 2 == 0: + wide_feature_list.append(item) + else: + deep_feature_list.append(item) + + logger.debug(f"In model_forward function, is_train: {is_train}, feature_list: {len(feature_list)}, " + f"wide_hash_table_list: {len(wide_hash_table_list)}, deep_hash_table_list: {len(deep_hash_table_list)}") + + # wide + for wide_feature, wide_hash_table in zip(wide_feature_list, wide_hash_table_list): + if MODIFY_GRAPH_FLAG: + wide_feature = batch["sparse_feature"] + wide_embedding = sparse_lookup(wide_hash_table, wide_feature, cfg.send_count, dim=None, is_train=is_train, + name="wide_embedding_lookup", modify_graph=modify_graph, batch=batch, + access_and_evict_config=None) + wide_embedding_list.append(wide_embedding) + + # deep + for deep_feature, deep_hash_table in zip(deep_feature_list, deep_hash_table_list): + if MODIFY_GRAPH_FLAG: + deep_feature = batch["sparse_feature"] + deep_embedding = sparse_lookup(deep_hash_table, deep_feature, cfg.send_count, dim=None, is_train=is_train, + name="deep_embedding_lookup", modify_graph=modify_graph, batch=batch, + access_and_evict_config=None) + deep_embedding_list.append(deep_embedding) + + if len(wide_embedding_list) == 1: + wide_emb = wide_embedding_list[0] + deep_emb = deep_embedding_list[0] + elif len(wide_embedding_list) > 1: + wide_emb = tf.reduce_sum(wide_embedding_list, axis=0, keepdims=False) + deep_emb = tf.reduce_sum(deep_embedding_list, axis=0, keepdims=False) + else: + raise ValueError("the length of embedding_list must be greater than or equal to 1.") + my_model = MyModel() + model_output = my_model.build_model(wide_embedding=wide_emb, + deep_embedding=deep_emb, + label=batch["label"], + is_training=is_train, + seed=dense_hashtable_seed, + dropout_rate=0.5) + return model_output +``` +该函数是前向传播函数,主要包括sparse_feature的embedding操作(查表)与model前向操作。130~141行代码是预处理`sparse_lookup`传参的逻辑。147~162行代码对应开源项目中wide部分`self.linear`与deep部分`self.embed_layers`对39个特征作embedding的逻辑。164~171行是配置mxrec中`一表多查`特性的逻辑。 + +*** +3.3 创表操作 + +```python +# main_mxrec.py: 273~296行 +def create_feature_spec_list(use_timestamp=False): + 
access_threshold = None + eviction_threshold = None + if use_timestamp: + access_threshold = 1000 + eviction_threshold = 180 + + feature_spec_list = [FeatureSpec("sparse_feature", table_name="wide_embeddings", batch_size=cfg.batch_size, + access_threshold=access_threshold, eviction_threshold=eviction_threshold), + FeatureSpec("sparse_feature", table_name="deep_embeddings", batch_size=cfg.batch_size, + access_threshold=access_threshold, eviction_threshold=eviction_threshold)] + + if use_multi_lookup: + feature_spec_list.extend([FeatureSpec("sparse_feature", table_name="wide_embeddings", + batch_size=cfg.batch_size, + access_threshold=access_threshold, + eviction_threshold=eviction_threshold), + FeatureSpec("sparse_feature", table_name="deep_embeddings", + batch_size=cfg.batch_size, + access_threshold=access_threshold, + eviction_threshold=eviction_threshold)]) + if use_timestamp: + feature_spec_list.append(FeatureSpec("timestamp", is_timestamp=True)) + return feature_spec_list + +``` + +```python +# main_mxrec.py: 379~397行 +# 创表操作 +wide_emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) +deep_emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) + +sparse_hashtable_wide = create_table( + key_dtype=cfg.key_type, + dim=tf.TensorShape([cfg.emb_dim]), + name="wide_embeddings", + emb_initializer=wide_emb_initializer, + **cfg.get_emb_table_cfg() +) + +sparse_hashtable_deep = create_table( + key_dtype=cfg.key_type, + dim=tf.TensorShape([cfg.emb_dim]), + name="deep_embeddings", + emb_initializer=deep_emb_initializer, + **cfg.get_emb_table_cfg() +) +``` +`create_feature_spec_list()`的返回值是`make_batch_and_iterator()`、`model_forward()`的传参;`create_table()`的返回值是`sparse_lookup()`的传参。 +**注意:`len(feature_spec_list)`应与使用`create_table()`接口创建的表数相等;开启`一表多查`选项卡,feature_spec_list中的元素重复添加一次;开启`特征淘汰`选项卡,feature_spec_list增加时间戳的FeatureSpec类元素**。 + +*** + +3.4 模型反向传播过程 +```python +# main_mxrec.py: 410~442行 +train_variables, emb_variables = get_dense_and_sparse_variable() + +rank_size = mxrec_util.communication.hccl_ops.get_rank_size() +train_ops = [] +# multi task training +for loss, (model_optimizer, emb_optimizer) in zip([train_model.get("loss")], optimizer_list): + # do model optimization + grads = model_optimizer.compute_gradients(loss, var_list=train_variables) + avg_grads = [] + for grad, var in grads: + if rank_size > 1: + grad = hccl_ops.allreduce(grad, "sum") if grad is not None else None + if grad is not None: + avg_grads.append((grad / 8.0, var)) + # apply gradients: update variables + train_ops.append(model_optimizer.apply_gradients(avg_grads)) + + if use_dynamic_expansion: + train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) + train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) + # do embedding optimization by addr + sparse_grads = emb_optimizer.compute_gradients(loss, train_emb_list) # local_embedding + grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)] + train_ops.append(emb_optimizer.apply_gradients(grads_and_vars)) + else: + # do embedding optimization + sparse_grads = emb_optimizer.compute_gradients(loss, emb_variables) + print("sparse_grads_tensor:", sparse_grads) + grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, emb_variables)] + train_ops.append(emb_optimizer.apply_gradients(grads_and_vars)) + +# 动态学习率更新 +train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), 
cfg.learning_rate[0], cfg.learning_rate[1]]) +``` +410~442行代码是模型的反向过程操作。mxRec对推荐模型中sparse_feature的创表查表操作作了加速,使用`create_table`与`sparse_lookup`接口替换tensorflow中的`tf.nn.embedding_lookup`接口。因此模型反向更新分为两部分:417~425行代码是对`model.py`内的模型部分的反向;427~439行代码是对sparse_feature作embedding操作部分的反向过程,根据是否开启`动态扩容`选择不同的参数计算梯度并更新权重。 + +*** + +#### 4. optimizer.py +如上所述,模型反向过程分为`model.py`与`embedding`两部分;`model.py`可使用tf原生的优化器,`embedding`部分选择mxrec提供的`lazy_adam`或`lazy_adam_by_addr`优化器。`delay_loss_scale.py`包装`dense_optimizer`与`sparse_optimizer`并对其应用损失缩放技术,该技术主要作用于混合精度训练过程中。 + +```python +import tensorflow as tf +from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer +from mx_rec.util.initialize import ConfigInitializer +from mx_rec.optimizers.lazy_adam import create_hash_optimizer +from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address + + +def get_dense_and_sparse_optimizer(cfg): + dense_optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0]) + use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion + if use_dynamic_expansion: + sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1]) + else: + sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1]) + sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, 1) + dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, 1) + + return dense_optimizer, sparse_optimizer +``` + + diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py new file mode 100644 index 00000000..9be82c35 --- /dev/null +++ b/examples/WideDeep/criteo.py @@ -0,0 +1,246 @@ +import os +import stat +import pickle +import argparse +import pandas as pd +import numpy as np +import tensorflow as tf +from tqdm import tqdm + +NAMES = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', + 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', + 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', + 'C23', 'C24', 'C25', 'C26'] + +def make_sub_file(lines, head, src_name, sub_dir_name, sub): + """Write sub-data. + Args: + :param lines: A list. Several pieces of data. + :param head: A string. ['label', 'I1', 'I2', ...]. + :param src_name: A string. The name of data. + :param sub_dir_name: A string. + :param sub: A scalar(Int). Record the current number of sub file. + :return: sub + 1. + """ + root_path, file_path = os.path.split(src_name) + file_name, suffix = file_path.split('.') + split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix + split_file = os.path.join(root_path, sub_dir_name, split_file_name) + if not os.path.exists(os.path.join(root_path, sub_dir_name)): + os.mkdir(os.path.join(root_path, sub_dir_name)) + + modes = stat.S_IWUSR | stat.S_IRUSR + flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT + f = os.fdopen(os.open(split_file, flags, modes), 'w') + try: + f.writelines([head]) + f.writelines(lines) + return sub + 1 + finally: + os.close(f) + +def split_byline_count(filename, count, sub_dir_name): + """Split File. + Note: You can specify how many rows of data each sub file contains. + Args: + :param filename: A string. + :param count: A scalar(int). + :param sub_dir_name: A string. 
+ :return: + """ + f = open(filename, 'r') + try: + head = f.readline() + buf = [] + sub = 1 + for line in f: + buf.append(line) + if len(buf) == count: + sub = make_sub_file(buf, head, filename, sub_dir_name, sub) + buf = [] + if len(buf) != 0: + make_sub_file(buf, head, filename, sub_dir_name, sub) + finally: + f.close() + +def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000): + """Get the list of split file path. + Note: Either parent_path or dataset_path must be valid. + If exists dataset_path + "/split", parent_path = dataset_path + "/split". + Args: + :param parent_path: A string. split file's parent path. + :param dataset_path: A string. + :param sample_num: A int. The sample number of every split file. + :return: A list. [file1_path, file2_path, ...] + """ + sub_dir_name = 'split' + if parent_path is None and dataset_path is None: + raise ValueError('Please give parent path or file path.') + if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)): + parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) + elif parent_path is None or not os.path.exists(parent_path): + split_byline_count(dataset_path, sample_num, sub_dir_name) + parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) + split_file_name = os.listdir(parent_path) + split_file_name.sort() + split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt'] + return split_file_list + +def get_fea_map(fea_map_path=None, split_file_list=None): + """Get feature map. + Note: Either parent_path or dataset_path must be valid. + If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid. + If fea_map_path is None and you want to build the feature map, + the default file path is the parent directory of split file + "fea_map.pkl". + Args: + :param fea_map_path: A string. + :param split_file_list: A list. [file1_path, file2_path, ...] + :return: A dict. 
{'C1':{}, 'C2':{}, ...} + """ + if fea_map_path is None and split_file_list is None: + raise ValueError('Please give feature map path or split file list.') + if fea_map_path is None and os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl"): + fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") + if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl': + with open(fea_map_path, 'rb') as f: + fea_map = pickle.load(f) + return fea_map + fea_map = {} + for file_open in tqdm(split_file_list): + f = open(file_open) + for line in f: + row = line.strip('\n').split('\t') + for i in range(14, 40): + if row[i] == '': + continue + name = NAMES[i] + fea_map.setdefault(name, {}) + if fea_map[name].get(row[i]) is None: + fea_map[name][row[i]] = len(fea_map[name]) + for j in range(1, 14): + if row[j] == '': + continue + name = NAMES[j] + fea_map.setdefault(name, {}) + fea_map[name].setdefault('min', float(row[j])) + fea_map[name].setdefault('max', float(row[j])) + fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j])) + fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j])) + f.close() + for i in range(14, 40): + fea_map[NAMES[i]]['-1'] = len(fea_map[NAMES[i]]) + fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") + + + modes = stat.S_IWUSR | stat.S_IRUSR + flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT + with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd: + pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL) + + fd.close() + return fea_map + +def rec_kbins_discretizer(dat, n_bins, min_max_dict): + """Bin continuous data into intervals. + Note: The strategy is "uniform". + Args: + :param dat: A dataframe. + :param n_bins: A scalar(int). + :param min_max_dict: A dict such as {'min': , 'max': }. + :return: The new dataframe. 
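+    Note: bin edges are np.linspace(min, max, n_bins + 1) per feature, and
+        values are mapped to bin ids with np.digitize.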
+ """ + features = dat.columns + n_features = len(features) + bin_edges = np.zeros(n_features, dtype=object) + for idx, feature in enumerate(features): + bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1) + rtol = 1.e-5 + atol = 1.e-8 + eps = atol + rtol * np.abs(dat[feature]) + np.digitize(dat[feature] + eps, bin_edges[idx][1:]) + return dat + +def convert_input2tfrd(in_file_path, out_file_path): + """ + txt to tfrecords + """ + def make_example(label_list, dense_feat_list, sparse_feat_list): + # '1.0' >> 1.0 >> 1 + dense_feature = np.array(np.array(dense_feat_list, dtype=np.float32), dtype=np.int64).reshape(-1) + sparse_feature = np.array(np.array(sparse_feat_list, dtype=np.float32), dtype=np.int64).reshape(-1) + label = np.array(np.array(label_list, dtype=np.float32), dtype=np.int64).reshape(-1) + feature_dict = {"dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), + "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), + "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) + } + example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) + + return example + + file_name = out_file_path + in_file_path[-12:-4] + '.tfrecords' + file_writer = tf.io.TFRecordWriter(file_name) + + with open(in_file_path, encoding='utf-8') as file_in: + + for i, line in tqdm(enumerate(file_in)): + + line = line.strip('\n') + items = line.split('\t') + if len(items) != 40: + continue + label = int(items[0]) + dense = items[1:14] + sparse = items[14:] + + ex = make_example(label, dense, sparse) + serialized = ex.SerializeToString() + file_writer.write(serialized) + + file_writer.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Get datasets') + parser.add_argument('--data_path') + parser.add_argument('--output_path') + + args, _ = parser.parse_known_args() + data_path = args.data_path + output_path = args.output_path + + # get txt_list + file_split_list = get_split_file_path(dataset_path=data_path) + # get feature_map + feature_map = get_fea_map(split_file_list=file_split_list) + + for file in tqdm(file_split_list): + + # read data + data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES) + # name feature + sparse_features = ['C' + str(i) for i in range(1, 27)] + dense_features = ['I' + str(i) for i in range(1, 14)] + # data processing + data_df[sparse_features] = data_df[sparse_features].fillna('-1') + data_df[dense_features] = data_df[dense_features].fillna(0) + # sparse feature: mapping + for col in sparse_features: + data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) + # dense feature: Bin continuous data into intervals. 
+ data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) + # add offsets + slot_size_array = [1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, + 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, + 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573] + offset_size_list = np.cumsum([0] + slot_size_array[:-1]) + for col_index in range(1, len(offset_size_list) + 1): + data_df.iloc[:, col_index] += offset_size_list[col_index - 1] + # save to txt + data_df.to_csv(file, sep='\t', index=False, header=False) + # txt to tfrecords + convert_input2tfrd(in_file_path=file, out_file_path=output_path) + + + + + diff --git a/examples/WideDeep/model/config.py b/examples/WideDeep/model/config.py index 78115d61..fae850f9 100644 --- a/examples/WideDeep/model/config.py +++ b/examples/WideDeep/model/config.py @@ -85,8 +85,8 @@ class LearningRateScheduler: global_step < self.warmup_steps, lambda: lr_factor_warmup, lambda: poly_schedule_dense ) - lr_sparse = self.base_lr_sparse * lr_factor_sparse - lr_dense = self.base_lr_dense * lr_factor_dense + lr_sparse = self.base_lr_sparse * lr_factor_constant + lr_dense = self.base_lr_dense * lr_factor_constant return lr_dense, lr_sparse @@ -108,10 +108,10 @@ class Config: self.train_file_pattern = "train" self.test_file_pattern = "test" - self.batch_size = 8192 - self.line_per_sample = 1024 - self.train_epoch = 3 - self.test_epoch = 1 + self.batch_size = 4096 + self.line_per_sample = 1 + self.train_epoch = 1 + self.test_epoch = 9 self.perform_shuffle = False self.key_type = tf.int64 @@ -124,12 +124,10 @@ class Config: self.field_num = 26 self.send_count = 46000 // self.rank_size - self.emb_dim = 128 + self.emb_dim = 8 self.hashtable_threshold = 1 self.USE_PIPELINE_TEST = False - # False indicates use SGD optimizer, else use LazyAdam. 
If True, is incompatible with dynamic_expansion - self.use_lazy_adam_optimizer = False # 动态学习率 GLOBAL_BATCH_SIZE = 8192 * 8 @@ -140,8 +138,8 @@ class Config: ] self.global_step = tf.Variable(0, trainable=False) _lr_scheduler = LearningRateScheduler( - 28.443, - 33.71193, + 0.001, + 0.001, LR_SCHEDULE_STEPS[0], LR_SCHEDULE_STEPS[1], LR_SCHEDULE_STEPS[2], @@ -154,7 +152,7 @@ class Config: raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") if self.cache_mode == CacheModeEnum.HBM.value: - self.dev_vocab_size = 24_000_000 * self.rank_size + self.dev_vocab_size = 14_000_000 * self.rank_size self.host_vocab_size = 0 elif self.cache_mode == CacheModeEnum.DDR.value: self.dev_vocab_size = 500_000 * self.rank_size diff --git a/examples/WideDeep/model/delay_loss_scale.py b/examples/WideDeep/model/delay_loss_scale.py index 01bb0d8f..a99a2db3 100644 --- a/examples/WideDeep/model/delay_loss_scale.py +++ b/examples/WideDeep/model/delay_loss_scale.py @@ -17,48 +17,32 @@ import tensorflow as tf from tensorflow.python.training import optimizer -from config import Config - class DenseLossScaleOptimizer: - def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + def __init__(self, opt, loss_scale): if not isinstance(opt, optimizer.Optimizer): raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - _update_lr_loss_scale(self._optimizer, loss_scale) + self._optimizer._learning_rate = self._optimizer._lr / self._loss_scale def compute_gradients(self, loss, var_list=None): - return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) + return self._optimizer.compute_gradients(loss*self._loss_scale, var_list=var_list) def apply_gradients(self, avg_grads): return self._optimizer.apply_gradients(avg_grads) class SparseLossScaleOptimizer: - def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + def __init__(self, opt, loss_scale): if not isinstance(opt, optimizer.Optimizer): raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - _update_lr_loss_scale(self._optimizer, loss_scale) + self._optimizer._learning_rate = self._optimizer._lr / self._loss_scale def compute_gradients(self, loss, var_list=None): - return tf.gradients(loss * self._loss_scale, var_list) + return tf.gradients(loss*self._loss_scale, var_list) def apply_gradients(self, grads_and_vars): - return self._optimizer.apply_gradients(grads_and_vars) - - -def _update_lr_loss_scale(opt, loss_scale): - if loss_scale <= 0: - raise RuntimeError("the loss_scale must be greater than zero.") - loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - if hasattr(opt, "_lr"): - # LazyAdam or Adam optimizer - opt._lr = opt._lr / loss_scale - elif hasattr(opt, "_learning_rate"): - # SGD optimizer - opt._learning_rate = opt._learning_rate / loss_scale - else: - raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") + return self._optimizer.apply_gradients(grads_and_vars) \ No newline at end of file diff --git a/examples/WideDeep/model/main_mxrec.py b/examples/WideDeep/model/main_mxrec.py index 51ed7c4a..088aac84 100644 --- a/examples/WideDeep/model/main_mxrec.py +++ b/examples/WideDeep/model/main_mxrec.py @@ -16,6 +16,7 @@ import os import shutil +import collections import time import warnings import random @@ -34,7 
+35,7 @@ from mx_rec.core.asc.manager import start_asc_pipeline from mx_rec.core.embedding import create_table, sparse_lookup from mx_rec.core.feature_process import EvictHook from mx_rec.graph.modifier import modify_graph_and_start_emb_cache, GraphModifierHook -from mx_rec.constants.constants import ASCEND_TIMESTAMP +from mx_rec.constants.constants import ASCEND_TIMESTAMP, LIBREC_EOS_OPS_SO from mx_rec.util.initialize import ConfigInitializer, init, terminate_config_initializer from mx_rec.util.ops import import_host_pipeline_ops import mx_rec.util as mxrec_util @@ -57,7 +58,7 @@ def add_timestamp_func(batch): return batch -def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False): +def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False, **kwargs): if config.USE_PIPELINE_TEST: num_parallel = 1 else: @@ -68,7 +69,7 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, # Extract features using the keys set during creation 'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64), 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64), - 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32), + 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.int64), } sample = tf.compat.v1.parse_single_example(data_record, features) return sample @@ -76,7 +77,6 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, def reshape_fn(batch): batch['label'] = tf.reshape(batch['label'], [-1, 1]) batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13]) - batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0) batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26]) return batch @@ -97,10 +97,24 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size, drop_remainder=True) dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel) + + def map_fn(batch): + new_batch = batch + new_batch['sparse_feature'] = tf.concat([batch['dense_feature'], batch['sparse_feature']], axis=1) + return new_batch + dataset = dataset.map(map_fn, num_parallel_calls=num_parallel) + if is_use_faae: dataset = dataset.map(add_timestamp_func) if not MODIFY_GRAPH_FLAG: + + # Enable EOSDataset manually. 
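+        # EOS (end-of-stream) markers let every device stop after the same
+        # number of batches, so uneven shards cannot stall collective ops.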
+ librec = import_host_pipeline_ops(LIBREC_EOS_OPS_SO) + channel_id = 0 if is_training else 1 + # 此处eos_map的调用必须先于insert_func,避免多卡数据不均匀的情况 + dataset = dataset.eos_map(librec, channel_id, kwargs.get("max_train_steps", max_train_steps), + kwargs.get("max_eval_steps", eval_steps)) insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, is_training=is_training, dump_graph=dump_graph) dataset = dataset.map(insert_fn) @@ -111,32 +125,69 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, return batch, iterator -def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph): - embedding_list = [] + + +def model_forward(model_args): + feature_list = model_args.feature_list + wide_hash_table_list = model_args.wide_hash_table_list + deep_hash_table_list = model_args.deep_hash_table_list + batch = model_args.batch + is_train = model_args.is_train + modify_graph = model_args.modify_graph + is_use_faae = model_args.is_use_faae + + wide_embedding_list = [] + deep_embedding_list = [] + wide_feature_list = [] + deep_feature_list = [] + if is_use_faae: + feature_list_copy = feature_list[:-1] + else: + feature_list_copy = feature_list + + for i, item in enumerate(feature_list_copy): + if i % 2 == 0: + wide_feature_list.append(item) + else: + deep_feature_list.append(item) + logger.debug(f"In model_forward function, is_train: {is_train}, feature_list: {len(feature_list)}, " - f"hash_table_list: {len(hash_table_list)}") - for feature, hash_table in zip(feature_list, hash_table_list): + f"wide_hash_table_list: {len(wide_hash_table_list)}, " + f"deep_hash_table_list: {len(deep_hash_table_list)}") + + # wide + for wide_feature, wide_hash_table in zip(wide_feature_list, wide_hash_table_list): if MODIFY_GRAPH_FLAG: - feature = batch["sparse_feature"] - embedding = sparse_lookup(hash_table, feature, cfg.send_count, dim=None, is_train=is_train, - name="user_embedding_lookup", modify_graph=modify_graph, batch=batch, + wide_feature = batch["sparse_feature"] + wide_embedding = sparse_lookup(wide_hash_table, wide_feature, cfg.send_count, dim=None, is_train=is_train, + name="wide_embedding_lookup", modify_graph=modify_graph, batch=batch, access_and_evict_config=None) - embedding_list.append(embedding) + wide_embedding_list.append(wide_embedding) - if len(embedding_list) == 1: - emb = embedding_list[0] - elif len(embedding_list) > 1: - emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False) + # deep + for deep_feature, deep_hash_table in zip(deep_feature_list, deep_hash_table_list): + if MODIFY_GRAPH_FLAG: + deep_feature = batch["sparse_feature"] + deep_embedding = sparse_lookup(deep_hash_table, deep_feature, cfg.send_count, dim=None, is_train=is_train, + name="deep_embedding_lookup", modify_graph=modify_graph, batch=batch, + access_and_evict_config=None) + deep_embedding_list.append(deep_embedding) + + if len(wide_embedding_list) == 1: + wide_emb = wide_embedding_list[0] + deep_emb = deep_embedding_list[0] + elif len(wide_embedding_list) > 1: + wide_emb = tf.reduce_sum(wide_embedding_list, axis=0, keepdims=False) + deep_emb = tf.reduce_sum(deep_embedding_list, axis=0, keepdims=False) else: raise ValueError("the length of embedding_list must be greater than or equal to 1.") my_model = MyModel() - model_output = my_model.build_model(embedding=emb, - dense_feature=batch["dense_feature"], - label=batch["label"], - is_training=is_train, - seed=dense_hashtable_seed) - return model_output + BuildModel = collections.namedtuple("BuildModel", ["wide_embedding", 
"deep_embedding", "label", "is_training", + "seed", "dropout_rate", "batch_norm"]) + build_model_args = BuildModel(wide_emb, deep_emb, batch["label"], is_train, dense_hashtable_seed, 0.5, False) + model_output = my_model.build_model(build_model_args) + return model_output def evaluate(): print("read_test dataset") @@ -236,13 +287,20 @@ def create_feature_spec_list(use_timestamp=False): access_threshold = 1000 eviction_threshold = 180 - feature_spec_list = [FeatureSpec("sparse_feature", table_name="sparse_embeddings", batch_size=cfg.batch_size, + feature_spec_list = [FeatureSpec("sparse_feature", table_name="wide_embeddings", batch_size=cfg.batch_size, + access_threshold=access_threshold, eviction_threshold=eviction_threshold), + FeatureSpec("sparse_feature", table_name="deep_embeddings", batch_size=cfg.batch_size, access_threshold=access_threshold, eviction_threshold=eviction_threshold)] + if use_multi_lookup: - feature_spec_list.append(FeatureSpec("sparse_feature", table_name="sparse_embeddings", + feature_spec_list.extend([FeatureSpec("sparse_feature", table_name="wide_embeddings", + batch_size=cfg.batch_size, + access_threshold=access_threshold, + eviction_threshold=eviction_threshold), + FeatureSpec("sparse_feature", table_name="deep_embeddings", batch_size=cfg.batch_size, access_threshold=access_threshold, - eviction_threshold=eviction_threshold)) + eviction_threshold=eviction_threshold)]) if use_timestamp: feature_spec_list.append(FeatureSpec("timestamp", is_timestamp=True)) return feature_spec_list @@ -281,8 +339,9 @@ if __name__ == "__main__": rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None - train_steps = 10000 - eval_steps = 1360 + max_train_steps = 1270 + train_steps = 1120 + eval_steps = 1080 try: use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))) @@ -315,9 +374,11 @@ if __name__ == "__main__": feature_spec_list_eval = create_feature_spec_list(use_timestamp=False) train_batch, train_iterator = make_batch_and_iterator(cfg, feature_spec_list_train, is_training=True, - dump_graph=True, is_use_faae=use_faae) + dump_graph=True, is_use_faae=use_faae, + max_train_steps=max_train_steps, max_eval_steps=eval_steps) eval_batch, eval_iterator = make_batch_and_iterator(cfg, feature_spec_list_eval, is_training=False, - dump_graph=False, is_use_faae=use_faae) + dump_graph=False, is_use_faae=use_faae, + max_train_steps=max_train_steps, max_eval_steps=eval_steps) logger.info(f"train_batch: {train_batch}") if use_faae: @@ -325,60 +386,75 @@ if __name__ == "__main__": optimizer_list = [get_dense_and_sparse_optimizer(cfg)] - # note: variance_scaling_initializer only support HBM mode - emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \ - if cfg.cache_mode != "HBM" or use_dynamic_expansion else \ - tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed) - sparse_hashtable = create_table( + # 创表操作 + wide_emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) + deep_emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) + + sparse_hashtable_wide = create_table( key_dtype=cfg.key_type, dim=tf.TensorShape([cfg.emb_dim]), - name="sparse_embeddings", - emb_initializer=emb_initializer, + name="wide_embeddings", 
+ emb_initializer=wide_emb_initializer, **cfg.get_emb_table_cfg() ) + + sparse_hashtable_deep = create_table( + key_dtype=cfg.key_type, + dim=tf.TensorShape([cfg.emb_dim]), + name="deep_embeddings", + emb_initializer=deep_emb_initializer, + **cfg.get_emb_table_cfg() + ) + if use_faae: tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, train_batch["timestamp"]) - sparse_hashtable_list = [sparse_hashtable, sparse_hashtable] if use_multi_lookup else [sparse_hashtable] - train_model = model_forward(feature_spec_list_train, sparse_hashtable_list, train_batch, - is_train=True, modify_graph=MODIFY_GRAPH_FLAG) - eval_model = model_forward(feature_spec_list_eval, sparse_hashtable_list, eval_batch, - is_train=False, modify_graph=MODIFY_GRAPH_FLAG) - - dense_variables, sparse_variables = get_dense_and_sparse_variable() - trainable_varibles = [] - trainable_varibles.extend(dense_variables) - if use_dynamic_expansion: - trainable_varibles.append(tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)[0]) - else: - trainable_varibles.extend(sparse_variables) + # 一表多查 + wide_hashtable_list = [sparse_hashtable_wide, sparse_hashtable_wide] if use_multi_lookup else \ + [sparse_hashtable_wide] + deep_hashtable_list = [sparse_hashtable_deep, sparse_hashtable_deep] if use_multi_lookup else \ + [sparse_hashtable_deep] + + + Forward = collections.namedtuple("Forward", ["feature_list", "wide_hash_table_list", "deep_hash_table_list", + "batch", "is_train", "modify_graph", "is_use_faae"]) + train_forward_args = Forward(feature_spec_list_train, wide_hashtable_list, deep_hashtable_list, train_batch, + True, MODIFY_GRAPH_FLAG, use_faae) + eval_forward_args = Forward(feature_spec_list_eval, wide_hashtable_list, deep_hashtable_list, eval_batch, + False, MODIFY_GRAPH_FLAG, use_faae) + train_model = model_forward(train_forward_args) + eval_model = model_forward(eval_forward_args) + + train_variables, emb_variables = get_dense_and_sparse_variable() + rank_size = mxrec_util.communication.hccl_ops.get_rank_size() train_ops = [] # multi task training - for loss, (dense_optimizer, sparse_optimizer) in zip([train_model.get("loss")], optimizer_list): - # do dense optimization - grads = dense_optimizer.compute_gradients(loss, var_list=trainable_varibles) + for loss, (model_optimizer, emb_optimizer) in zip([train_model.get("loss")], optimizer_list): + # do model optimization + grads = model_optimizer.compute_gradients(loss, var_list=train_variables) avg_grads = [] - for grad, var in grads[:-1]: + for grad, var in grads: if rank_size > 1: grad = hccl_ops.allreduce(grad, "sum") if grad is not None else None if grad is not None: avg_grads.append((grad / 8.0, var)) # apply gradients: update variables - train_ops.append(dense_optimizer.apply_gradients(avg_grads)) + train_ops.append(model_optimizer.apply_gradients(avg_grads)) if use_dynamic_expansion: train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET) - # do sparse optimization by addr - sparse_grads = list(grads[-1]) # local_embedding + train_emb_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB) + # do embedding optimization by addr + sparse_grads = emb_optimizer.compute_gradients(loss, train_emb_list) # local_embedding grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)] - train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars)) + train_ops.append(emb_optimizer.apply_gradients(grads_and_vars)) else: - # do sparse optimization - sparse_grads = list(grads[-1]) + # do embedding 
optimization + sparse_grads = emb_optimizer.compute_gradients(loss, emb_variables) print("sparse_grads_tensor:", sparse_grads) - grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)] - train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars)) + grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, emb_variables)] + train_ops.append(emb_optimizer.apply_gradients(grads_and_vars)) # 动态学习率更新 train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]]) @@ -451,8 +527,11 @@ if __name__ == "__main__": logger.info(f"step: {i * iteration_per_loop}; lr: {lr}") logger.info(f"global step: {global_step}") logger.info(f"step: {i * iteration_per_loop}; current sess cost time: {cost_time:.10f}; current QPS: {qps}") - logger.info(f"training at step:{i * iteration_per_loop}, table[{sparse_hashtable.table_name}], " - f"table size:{sparse_hashtable.size()}, table capacity:{sparse_hashtable.capacity()}") + logger.info(f"training at step:{i * iteration_per_loop}, " + f"table[{sparse_hashtable_wide.table_name}], " + f"table size:{sparse_hashtable_wide.size()}, table capacity:{sparse_hashtable_wide.capacity()}, " + f"table[{sparse_hashtable_deep.table_name}], " + f"table size:{sparse_hashtable_deep.size()}, table capacity:{sparse_hashtable_deep.capacity()}") if i % (train_steps // iteration_per_loop) == 0: if interval is not None: diff --git a/examples/WideDeep/model/model.py b/examples/WideDeep/model/model.py index 037fb276..61b1fdea 100644 --- a/examples/WideDeep/model/model.py +++ b/examples/WideDeep/model/model.py @@ -34,61 +34,54 @@ class MyModel: self._loss_fn = None self.is_training = None - @classmethod - def _dot_interaction(cls, _input): - num_features = tf.shape(_input)[1] - batch_size = tf.shape(_input)[0] - xactions = tf.matmul(_input, _input, transpose_b=True) - ones = tf.ones_like(xactions, dtype=tf.float32) - upper_tri_mask = tf.linalg.band_part(ones, 0, -1) - - activations = tf.where(condition=tf.cast(upper_tri_mask, tf.bool), - x=tf.zeros_like(xactions), - y=xactions) - out_dim = num_features * num_features - activations = tf.reshape(activations, (batch_size, out_dim)) - return activations - - def build_model(self, - embedding=None, - dense_feature=None, - label=None, - is_training=True, - seed=None): - with tf.variable_scope("mlp", reuse=tf.AUTO_REUSE): + def build_model(self,model_args): + wide_embedding = model_args.wide_embedding + deep_embedding = model_args.deep_embedding + label = model_args.label + is_training = model_args.is_training + seed = model_args.seed + dropout_rate = model_args.dropout_rate + batch_norm = model_args.batch_norm + + with tf.variable_scope("wide_deep", reuse=tf.AUTO_REUSE): self._loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True) self.is_training = is_training - dense_embedding_vec = self.bottom_stack(dense_feature, seed) - dense_embedding = tf.expand_dims(dense_embedding_vec, 1) - interaction_args = tf.concat([dense_embedding, embedding], axis=1) - interaction_output = self._dot_interaction(interaction_args) - feature_interaction_output = tf.concat([dense_embedding_vec, interaction_output], axis=1) - # (8192, 857) - logits = self.top_stack(feature_interaction_output, seed) - loss = self._loss_fn(label, logits) - prediction = tf.sigmoid(logits) - trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mlp') + + # wide + batch_size, wide_num, wide_emb_dim = wide_embedding.shape + wide_input = 
tf.reshape(wide_embedding[:, :, 0], shape=(batch_size, wide_num * 1)) + wide_output = tf.reshape(tf.reduce_sum(wide_input, axis=1), shape=(-1, 1)) + + # deep + batch_size, deep_num, deep_emb_dim = deep_embedding.shape + deep_input = tf.reshape(deep_embedding, shape=(batch_size, deep_num * deep_emb_dim)) + + ## MLP + hidden_units = [256, 128, 64] + net = deep_input + for i, unit in enumerate(hidden_units): + + net = tf.layers.dense(net, units=unit, activation='relu', name=f'hidden_layer_{i}', + kernel_initializer=tf.glorot_uniform_initializer(seed=seed), + bias_initializer=tf.zeros_initializer()) + + if dropout_rate is not None and 0.0 < dropout_rate < 1.0: + net = tf.layers.dropout(net, dropout_rate, training=self.is_training) + if batch_norm: + net = tf.layers.batch_normalization(net, training=self.is_training) + + deep_output = tf.layers.dense(net, units=1, activation=None, name='deep_output', + kernel_initializer=tf.glorot_uniform_initializer(seed=seed), + bias_initializer=tf.zeros_initializer()) + + total_logits = 0.5 * tf.add(wide_output, deep_output, name='total_logits') + loss = self._loss_fn(label, total_logits) + prediction = tf.sigmoid(total_logits) + trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='wide_deep') return {LOSS_OP_NAME: loss, PRED_OP_NAME: prediction, LABEL_OP_NAME: label, VAR_LIST: trainable_variables} - def bottom_stack(self, _input, seed): - dnn1 = tf.layers.dense(_input, 512, activation='relu', name='bs1', - kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), - bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), - kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - dnn2 = tf.layers.dense(dnn1, 256, activation='relu', name='bs2', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - dnn3 = tf.layers.dense(dnn2, 128, activation='relu', name='bs3', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - return dnn3 - - def top_stack(self, _input, seed): - dnn1 = tf.layers.dense(_input, 1024, activation='relu', name='ts1', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - dnn2 = tf.layers.dense(dnn1, 1024, activation='relu', name='ts2', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - dnn3 = tf.layers.dense(dnn2, 512, activation='relu', name='ts3', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - dnn4 = tf.layers.dense(dnn3, 256, activation='relu', name='ts4', 
kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - dnn5 = tf.layers.dense(dnn4, 1, activation=None, name='ts5', kernel_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), bias_initializer=tf.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=seed), kernel_regularizer=tf.contrib.layers.l1_regularizer(1e-2)) - return dnn5 - my_model = MyModel() diff --git a/examples/WideDeep/model/optimizer.py b/examples/WideDeep/model/optimizer.py index 18dbe288..1a781a01 100644 --- a/examples/WideDeep/model/optimizer.py +++ b/examples/WideDeep/model/optimizer.py @@ -15,32 +15,20 @@ # ============================================================================== import tensorflow as tf - from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer -from gradient_descent_w import create_hash_optimizer from mx_rec.util.initialize import ConfigInitializer -from mx_rec.optimizers.gradient_descent_by_addr import create_hash_optimizer_by_addr -from mx_rec.optimizers import lazy_adam +from mx_rec.optimizers.lazy_adam import create_hash_optimizer +from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address def get_dense_and_sparse_optimizer(cfg): + dense_optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0]) use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion - if cfg.use_lazy_adam_optimizer: - if use_dynamic_expansion: - raise RuntimeError("model is incompatible with dynamic_expansion when use lazy_adam optimizer.") - # use lazy_adam optimizer - dense_optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=cfg.learning_rate[0]) - sparse_optimizer = lazy_adam.create_hash_optimizer(learning_rate=cfg.learning_rate[1]) - loss_scale = 65536 + if use_dynamic_expansion: + sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1]) else: - # use SGD optimizer - dense_optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=cfg.learning_rate[0]) - if use_dynamic_expansion: - sparse_optimizer = create_hash_optimizer_by_addr(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) - else: - sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1], weight_decay=0.0001) - loss_scale = 1024 - sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale) - dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale) + sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1]) + sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, 1) + dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, 1) return dense_optimizer, sparse_optimizer -- Gitee From 91aa31c4db18d261a98cc34ae1d432a4dbb34643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Mon, 3 Jun 2024 19:03:20 +0800 Subject: [PATCH 190/302] =?UTF-8?q?WideDeep=E6=A8=A1=E5=9E=8B=20=E8=BF=81?= =?UTF-8?q?=E7=A7=BB=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9cleancode=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/README_WD.md | 7 ++--- examples/WideDeep/criteo.py | 5 ++++ examples/WideDeep/model/delay_loss_scale.py | 29 ++++++++++++++++----- examples/WideDeep/model/main_mxrec.py | 9 +++---- 
 examples/WideDeep/model/model.py            |  2 +-
 examples/WideDeep/model/optimizer.py        |  5 ++--
 6 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/examples/WideDeep/README_WD.md b/examples/WideDeep/README_WD.md
index 261861f7..beb592c9 100644
--- a/examples/WideDeep/README_WD.md
+++ b/examples/WideDeep/README_WD.md
@@ -135,7 +135,7 @@ bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_pa
 ***
 
 ## Model migration
-**Migration approach:** Following the open-source project, adapt the Wide&Deep model within the existing, already-adapted dlrm model framework by changing the relevant code logic.
+**Migration approach:** Adapt the Wide&Deep model within the existing, already-adapted dlrm model framework by changing the relevant code logic. **Core idea: modify `model.py` based on the open-source project's model code; move part of the data processing into `criteo.py` and part into `make_batch_and_iterator()` in `main_mxrec.py`; the other related changes in `main_mxrec.py` mainly adapt the code to the features provided by mxrec.**
 
 The `dynamic expansion`, `dynamic shape`, `auto graph modification` and `multi-lookup on one table` mentioned below are features provided by mxrec; see `run.sh` for their switches.
 
@@ -164,6 +164,7 @@ export USE_MODIFY_GRAPH=0        # 0: feature spec mode; 1: auto graph-modify
 
 ***
 
+### Code change notes
 #### 1. config.py
 The experiment hyper-parameters are configured as follows: the dynamic learning-rate logic is removed, and the learning rate is fixed at 0.001.
 
@@ -438,7 +439,7 @@ def model_forward(feature_list, wide_hash_table_list, deep_hash_table_list, batc
                                 dropout_rate=0.5)
     return model_output
 ```
-This function is the forward pass; it mainly covers the embedding (table lookup) of sparse_feature and the model's forward computation. Lines 130~141 preprocess the arguments passed to `sparse_lookup`. Lines 147~162 correspond to the open-source project's logic where the wide part `self.linear` and the deep part `self.embed_layers` embed the 39 features. Lines 164~171 configure mxrec's `multi-lookup on one table` feature.
+This function is the forward pass; it mainly covers the embedding (table lookup) of sparse_feature and the model's forward computation. Lines 130-141 preprocess the arguments passed to `sparse_lookup`. Lines 147-162 correspond to the open-source project's logic where the wide part `self.linear` and the deep part `self.embed_layers` embed the 39 features. Lines 164-171 configure mxrec's `multi-lookup on one table` feature.
 
 ***
 
 3.3 Table creation
@@ -536,7 +537,7 @@ for loss, (model_optimizer, emb_optimizer) in zip([train_model.get("loss")], opt
 # dynamic learning-rate update
 train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]])
 ```
-Lines 410~442 implement the model's backward pass. mxRec accelerates the table-creation and table-lookup operations on sparse_feature in recommendation models, replacing TensorFlow's `tf.nn.embedding_lookup` interface with the `create_table` and `sparse_lookup` interfaces. The backward update is therefore split into two parts: lines 417~425 back-propagate through the model part defined in `model.py`; lines 427~439 back-propagate through the embedding of sparse_feature, choosing different parameters for gradient computation and weight update depending on whether `dynamic expansion` is enabled.
+Lines 410-442 implement the model's backward pass. mxRec accelerates the table-creation and table-lookup operations on sparse_feature in recommendation models, replacing TensorFlow's `tf.nn.embedding_lookup` interface with the `create_table` and `sparse_lookup` interfaces. The backward update is therefore split into two parts: lines 417-425 back-propagate through the model part defined in `model.py`; lines 427-439 back-propagate through the embedding of sparse_feature, choosing different parameters for gradient computation and weight update depending on whether `dynamic expansion` is enabled.
 
 ***
 
diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py
index 9be82c35..055c41ec 100644
--- a/examples/WideDeep/criteo.py
+++ b/examples/WideDeep/criteo.py
@@ -12,6 +12,7 @@ NAMES = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', '
          'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22',
          'C23', 'C24', 'C25', 'C26']
 
+
 def make_sub_file(lines, head, src_name, sub_dir_name, sub):
     """Write sub-data.
     Args:
@@ -39,6 +40,7 @@ def make_sub_file(lines, head, src_name, sub_dir_name, sub):
     finally:
         os.close(f)
 
+
 def split_byline_count(filename, count, sub_dir_name):
     """Split File.
     Note: You can specify how many rows of data each sub file contains.
@@ -86,6 +88,7 @@ def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000)
     split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt']
     return split_file_list
 
+
 def get_fea_map(fea_map_path=None, split_file_list=None):
     """Get feature map.
     Note: Either parent_path or dataset_path must be valid.
@@ -140,6 +143,7 @@ def get_fea_map(fea_map_path=None, split_file_list=None):
     fd.close()
     return fea_map
 
+
 def rec_kbins_discretizer(dat, n_bins, min_max_dict):
     """Bin continuous data into intervals.
     Note: The strategy is "uniform".
@@ -160,6 +164,7 @@ def rec_kbins_discretizer(dat, n_bins, min_max_dict): np.digitize(dat[feature] + eps, bin_edges[idx][1:]) return dat + def convert_input2tfrd(in_file_path, out_file_path): """ txt to tfrecords diff --git a/examples/WideDeep/model/delay_loss_scale.py b/examples/WideDeep/model/delay_loss_scale.py index a99a2db3..be5b9d58 100644 --- a/examples/WideDeep/model/delay_loss_scale.py +++ b/examples/WideDeep/model/delay_loss_scale.py @@ -17,32 +17,47 @@ import tensorflow as tf from tensorflow.python.training import optimizer +from config import Config class DenseLossScaleOptimizer: - def __init__(self, opt, loss_scale): + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: if not isinstance(opt, optimizer.Optimizer): raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - self._optimizer._learning_rate = self._optimizer._lr / self._loss_scale + _update_lr_loss_scale(self._optimizer, loss_scale) def compute_gradients(self, loss, var_list=None): - return self._optimizer.compute_gradients(loss*self._loss_scale, var_list=var_list) + return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) def apply_gradients(self, avg_grads): return self._optimizer.apply_gradients(avg_grads) class SparseLossScaleOptimizer: - def __init__(self, opt, loss_scale): + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: if not isinstance(opt, optimizer.Optimizer): raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) self._optimizer = opt self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - self._optimizer._learning_rate = self._optimizer._lr / self._loss_scale + _update_lr_loss_scale(self._optimizer, loss_scale) def compute_gradients(self, loss, var_list=None): - return tf.gradients(loss*self._loss_scale, var_list) + return tf.gradients(loss * self._loss_scale, var_list) def apply_gradients(self, grads_and_vars): - return self._optimizer.apply_gradients(grads_and_vars) \ No newline at end of file + return self._optimizer.apply_gradients(grads_and_vars) + + +def _update_lr_loss_scale(opt, loss_scale): + if loss_scale <= 0: + raise RuntimeError("the loss_scale must be greater than zero.") + loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + if hasattr(opt, "_lr"): + # LazyAdam or Adam optimizer + opt._lr = opt._lr / loss_scale + elif hasattr(opt, "_learning_rate"): + # SGD optimizer + opt._learning_rate = opt._learning_rate / loss_scale + else: + raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") \ No newline at end of file diff --git a/examples/WideDeep/model/main_mxrec.py b/examples/WideDeep/model/main_mxrec.py index 088aac84..37663b14 100644 --- a/examples/WideDeep/model/main_mxrec.py +++ b/examples/WideDeep/model/main_mxrec.py @@ -125,14 +125,12 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, return batch, iterator - - def model_forward(model_args): feature_list = model_args.feature_list wide_hash_table_list = model_args.wide_hash_table_list deep_hash_table_list = model_args.deep_hash_table_list batch = model_args.batch - is_train = model_args.is_train + is_train = model_args.is_train modify_graph = model_args.modify_graph is_use_faae = model_args.is_use_faae @@ -145,8 +143,8 @@ def model_forward(model_args): else: feature_list_copy = feature_list - for i, item in enumerate(feature_list_copy): - if i % 2 == 0: 
+ for index, item in enumerate(feature_list_copy): + if index % 2 == 0: wide_feature_list.append(item) else: deep_feature_list.append(item) @@ -189,6 +187,7 @@ def model_forward(model_args): model_output = my_model.build_model(build_model_args) return model_output + def evaluate(): print("read_test dataset") if not MODIFY_GRAPH_FLAG: diff --git a/examples/WideDeep/model/model.py b/examples/WideDeep/model/model.py index 61b1fdea..bfe2177e 100644 --- a/examples/WideDeep/model/model.py +++ b/examples/WideDeep/model/model.py @@ -34,7 +34,7 @@ class MyModel: self._loss_fn = None self.is_training = None - def build_model(self,model_args): + def build_model(self, model_args): wide_embedding = model_args.wide_embedding deep_embedding = model_args.deep_embedding label = model_args.label diff --git a/examples/WideDeep/model/optimizer.py b/examples/WideDeep/model/optimizer.py index 1a781a01..2c7685bb 100644 --- a/examples/WideDeep/model/optimizer.py +++ b/examples/WideDeep/model/optimizer.py @@ -28,7 +28,8 @@ def get_dense_and_sparse_optimizer(cfg): sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1]) else: sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1]) - sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, 1) - dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, 1) + loss_scale = 1 + sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale) + dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale) return dense_optimizer, sparse_optimizer -- Gitee From ffeefc4660fcf7f5bdc53f24a81032318fcc44f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Mon, 3 Jun 2024 19:52:50 +0800 Subject: [PATCH 191/302] =?UTF-8?q?WideDeep=E6=A8=A1=E5=9E=8B=20cleancode?= =?UTF-8?q?=E4=BF=AE=E6=94=B92?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/criteo.py | 1 + examples/WideDeep/model/delay_loss_scale.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py index 055c41ec..d6c493c0 100644 --- a/examples/WideDeep/criteo.py +++ b/examples/WideDeep/criteo.py @@ -65,6 +65,7 @@ def split_byline_count(filename, count, sub_dir_name): finally: f.close() + def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000): """Get the list of split file path. Note: Either parent_path or dataset_path must be valid. 
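A quick aside on the loss-scale wrappers being tidied up below: the `_update_lr_loss_scale` helper divides the optimizer's learning rate by the loss scale to compensate for `compute_gradients` multiplying the loss up. The following standalone sketch is illustrative only (it is not part of this repository) and assumes TF 1.15, the version these examples target; for plain SGD the compensation is exact:

```python
import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # keeps the sketch in graph mode

S = 1024.0                               # hypothetical loss scale
w = tf.Variable(3.0, name="w")
loss = tf.square(w)                      # d(loss)/dw = 2w

plain = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
scaled = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1 / S)

plain_step = plain.minimize(loss)        # update: lr * 2w
scaled_step = scaled.minimize(S * loss)  # update: (lr / S) * (S * 2w) == lr * 2w

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sess.run(plain_step)
    print(sess.run(w))                   # 3.0 - 0.1 * 6.0 = 2.4
    sess.run(w.assign(3.0))              # reset and apply the scaled variant
    sess.run(scaled_step)
    print(sess.run(w))                   # same value: 2.4
```

With `loss_scale = 1`, which is what `optimizer.py` passes in these examples, both wrappers reduce to pass-throughs; the division only matters when a larger scale is configured for mixed-precision runs.
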
diff --git a/examples/WideDeep/model/delay_loss_scale.py b/examples/WideDeep/model/delay_loss_scale.py index be5b9d58..f73baf68 100644 --- a/examples/WideDeep/model/delay_loss_scale.py +++ b/examples/WideDeep/model/delay_loss_scale.py @@ -19,6 +19,7 @@ from tensorflow.python.training import optimizer from config import Config + class DenseLossScaleOptimizer: def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: if not isinstance(opt, optimizer.Optimizer): -- Gitee From 5bd6d6811bcbe871e69f0dedebad4564ec71143c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Mon, 3 Jun 2024 20:35:07 +0800 Subject: [PATCH 192/302] =?UTF-8?q?WideDeep=E6=A8=A1=E5=9E=8B=20Issues?= =?UTF-8?q?=E9=97=AE=E9=A2=98=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/criteo.py | 12 ++++-------- examples/WideDeep/model/main_mxrec.py | 1 - 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py index d6c493c0..ffb07289 100644 --- a/examples/WideDeep/criteo.py +++ b/examples/WideDeep/criteo.py @@ -7,10 +7,7 @@ import numpy as np import tensorflow as tf from tqdm import tqdm -NAMES = ['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', - 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', - 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', - 'C23', 'C24', 'C25', 'C26'] +NAMES = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)] def make_sub_file(lines, head, src_name, sub_dir_name, sub): @@ -38,7 +35,7 @@ def make_sub_file(lines, head, src_name, sub_dir_name, sub): f.writelines(lines) return sub + 1 finally: - os.close(f) + f.close() def split_byline_count(filename, count, sub_dir_name): @@ -103,7 +100,7 @@ def get_fea_map(fea_map_path=None, split_file_list=None): """ if fea_map_path is None and split_file_list is None: raise ValueError('Please give feature map path or split file list.') - if fea_map_path is None and os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl"): + if fea_map_path is None and split_file_list is not None: fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl': with open(fea_map_path, 'rb') as f: @@ -141,7 +138,6 @@ def get_fea_map(fea_map_path=None, split_file_list=None): with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd: pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL) - fd.close() return fea_map @@ -162,7 +158,7 @@ def rec_kbins_discretizer(dat, n_bins, min_max_dict): rtol = 1.e-5 atol = 1.e-8 eps = atol + rtol * np.abs(dat[feature]) - np.digitize(dat[feature] + eps, bin_edges[idx][1:]) + dat[feature] = np.digitize(dat[feature] + eps, bin_edges[idx][1:]) return dat diff --git a/examples/WideDeep/model/main_mxrec.py b/examples/WideDeep/model/main_mxrec.py index 37663b14..d81ff215 100644 --- a/examples/WideDeep/model/main_mxrec.py +++ b/examples/WideDeep/model/main_mxrec.py @@ -53,7 +53,6 @@ random.seed(shuffle_seed) def add_timestamp_func(batch): timestamp = import_host_pipeline_ops().return_timestamp(tf.cast(batch['label'], dtype=tf.int64)) - # tf.constant(np.random.randint(1,1688109060,1)), tf.int64)) batch["timestamp"] = timestamp return batch -- Gitee From 09ffbdf80301eaae28d42a25115da1a9f9eb243a Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Mon, 3 Jun 2024 12:47:50 +0000 
Subject: [PATCH 193/302] !170 cleancode * cleancode * cleancode --- .../offset_mapper/offset_mapper.h | 2 +- src/AccCTR/src/unique/unique_func.h | 23 +++++++++---------- src/AccCTR/tests/ut/src/unique_test.cpp | 8 +++---- src/core/ssd_engine/file.cpp | 4 ++-- src/core/ssd_engine/file.h | 4 ++-- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h index 80170989..f42a0d3f 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h @@ -91,7 +91,7 @@ public: std::vector swapInKeysID = FilterKeys(keys, swapInKoPair); uint64_t swapInCnt = 0; - int ret = FindInUsedPos(keys, swapInCnt, swapInKeysID, swapInKoPair, swapOutKoPair); + auto ret = FindInUsedPos(keys, swapInCnt, swapInKeysID, swapInKoPair, swapOutKoPair); if (ret != ock::ctr::H_OK) { return ret; } diff --git a/src/AccCTR/src/unique/unique_func.h b/src/AccCTR/src/unique/unique_func.h index 4812f74c..0222e4eb 100644 --- a/src/AccCTR/src/unique/unique_func.h +++ b/src/AccCTR/src/unique/unique_func.h @@ -175,22 +175,24 @@ public: } bucket->replaceBase = replaceOffset; for (int j = 0; j < bucket->count; ++j) { - out[total++] = bucket->data[j]; + out[total] = static_cast(bucket->data[j]); + ++total; } replaceOffset += bucket->count; } auto it = overflow_.begin(); int32_t totalOverflow = 0; while (it != overflow_.end()) { - out[total++] = it->first; + out[total] = it->first; it->second = replaceOffset++; + ++total; ++it; ++totalOverflow; } // set total overflow count stats_.totalUniques = static_cast(total - priorTotal); - stats_.totalOverflowUniques = totalOverflow; + stats_.totalOverflowUniques = static_cast(totalOverflow); return total - priorTotal; } @@ -241,17 +243,14 @@ public: { const int numOfGroupsInShard = groupMethod_.GroupCount(); uint32_t totalSize = conf.desiredSize + (conf.desiredSize >> 1); - while (bucketCountPower2_ * K_BUCKET_WIDTH * numOfGroupsInShard * estimatedDuplicateRatio < totalSize) { + while (static_cast(bucketCountPower2_ * K_BUCKET_WIDTH * numOfGroupsInShard * + estimatedDuplicateRatio) < totalSize) { bucketCountPower2_ <<= 1; } idCountEnable_ = (conf.outputType == OutputType::ENHANCED) && conf.useIdCount; for (int32_t i = 0; i < numOfGroupsInShard; ++i) { auto obj = new DedupT(bucketCountPower2_, numOfGroupsInShard, idCountEnable_); - if (obj == nullptr) { - ExternalLogger::PrintLog(LogLevel::ERROR, "creat object error"); - throw NullptrError(); - } dedupShards_.emplace_back(obj); } } @@ -302,7 +301,7 @@ public: if (conf.outputType == OutputType::ENHANCED) { int totalNumber = 0; for (int i = 0; i < conf.shardingNum; i++) { - totalUniqueSize[i] = totalNumber; + totalUniqueSize[i] = static_cast(totalNumber); if (conf.useSharding) { totalNumber += uniqueOut.uniqueIdCntInBucket[i]; } @@ -376,14 +375,14 @@ private: if (conf.useSharding && conf.useIdCount) { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueIdInBucket, total); // 特征计数使能和shard同时使能 - uniqueOut.uniqueIdCntInBucket[j] = inGroupTotal; + uniqueOut.uniqueIdCntInBucket[j] = static_cast(inGroupTotal); } else if (!conf.useSharding && conf.useIdCount) { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueId, total); // 特征计数使能和shard不使能 } else if (conf.useSharding && !conf.useIdCount) { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueIdInBucket, total); // 特征计数使能和shard不使能 - uniqueOut.uniqueIdCntInBucket[j] = inGroupTotal; + 
uniqueOut.uniqueIdCntInBucket[j] = static_cast(inGroupTotal); } else { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueId, total); // 特征计数不使能和shard不使能,跟普通unique对等 @@ -391,7 +390,7 @@ private: } else { inGroupTotal = dedupShards_[j]->UniqueRaw(uniqueOut.uniqueId, total); } - total += inGroupTotal; + total += static_cast(inGroupTotal); } uniqueOut.uniqueIdCnt = total; } diff --git a/src/AccCTR/tests/ut/src/unique_test.cpp b/src/AccCTR/tests/ut/src/unique_test.cpp index 94e8d92c..df5950e1 100644 --- a/src/AccCTR/tests/ut/src/unique_test.cpp +++ b/src/AccCTR/tests/ut/src/unique_test.cpp @@ -11,10 +11,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "unique_test.h" #include #include -#include "unique_test.h" #include "common.h" void UniqueTest::SetUpTestCase() @@ -155,7 +155,7 @@ TEST_F(UniqueTest, DoUniqueNormal) std::string input_path(path); std::cout << "input_path:" + input_path + "/data30.txt" << std::endl; std::ifstream input(input_path + "/data30.txt"); - if(!input.good()) { + if (!input.good()) { std::cout << "Failed to open file:" + input_path + "/data30.txt" << std::endl; return; } @@ -1404,7 +1404,7 @@ TEST_F(UniqueTest, DoUniqueLotsDataFunction) std::string input_path(path); std::cout << "input_path:" + input_path + "/data40.txt" << std::endl; std::ifstream input(input_path + "/data40.txt"); - if(!input.good()) { + if (!input.good()) { std::cout << "Failed to open file:" + input_path + "/data40.txt" << std::endl; return; } @@ -1510,7 +1510,7 @@ TEST_F(UniqueTest, DoUniqueLotsDataPaddingFunction) std::string input_path(path); std::cout << "input_path:" + input_path + "/data30.txt" << std::endl; std::ifstream input(input_path + "/data30.txt"); - if(!input.good()) { + if (!input.good()) { std::cout << "Failed to open file:" + input_path + "/data30.txt" << std::endl; return; } diff --git a/src/core/ssd_engine/file.cpp b/src/core/ssd_engine/file.cpp index cc9ec206..8c7da24e 100644 --- a/src/core/ssd_engine/file.cpp +++ b/src/core/ssd_engine/file.cpp @@ -281,12 +281,12 @@ void File::Load() emb_cache_key_t key; offset_t offset; do { - localFileMeta.read(reinterpret_cast(&key), keyDataLen); + localFileMeta.read(reinterpret_cast(&key), KEY_DATA_LEN); if (!localFileMeta.eof() && localFileMeta.fail()) { throw invalid_argument("file broken while reading key"); } - localFileMeta.read(reinterpret_cast(&offset), offsetDataLen); + localFileMeta.read(reinterpret_cast(&offset), OFFSET_DATA_LEN); if (!localFileMeta.eof() && localFileMeta.fail()) { throw invalid_argument("file broken while reading offset"); } diff --git a/src/core/ssd_engine/file.h b/src/core/ssd_engine/file.h index bc2b1fcb..5789ab8b 100644 --- a/src/core/ssd_engine/file.h +++ b/src/core/ssd_engine/file.h @@ -33,8 +33,8 @@ namespace MxRec { using offset_t = uint32_t; class File { - static constexpr uint64_t keyDataLen = sizeof(emb_cache_key_t); - static constexpr uint64_t offsetDataLen = sizeof(offset_t); + static constexpr uint64_t KEY_DATA_LEN = sizeof(emb_cache_key_t); + static constexpr uint64_t OFFSET_DATA_LEN = sizeof(offset_t); public: File(uint64_t fileID, string& fileDir); -- Gitee From 9de52b8e20f19516d55a2a4f9bed941b6b443e70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 4 Jun 2024 02:02:17 +0000 Subject: [PATCH 194/302] 
 !167 [Modification] DDR accuracy issue: add control edges to optimizers that
 carry slot variables * [Modification] DDR accuracy issue: matching change for
 the dynamic-expansion-mode optimizer * [Modification] DDR accuracy issue:
 switch the optimizer to sqrt(v_t_slice + temp_epsilon) * [Modification] DDR
 accuracy issue: resolve the issue * [Modification] DDR accuracy issue:
 training bug where training hangs * [Modification] DDR accuracy issue:
 training bug where training hangs * Merge remote-tracking branch
 'upstream/develop' into develop * [Modification] DDR accuracy issue: logging
 and cleancode * [Modification] DDR accuracy issue: logging and cleancode *
 [Modification] DDR accuracy issue: logging and cleancode * [Modification] DDR
 accuracy issue: add control edges to optimizers that carry slot variables *
 [Modification] DDR accuracy issue: add control edges to optimizers that carry
 slot variables * [Modification] DDR accuracy issue: add control edges to
 optimizers that carry slot variables * [Modification] DDR accuracy issue: add
 control edges to optimizers that carry slot variables * [Modification] DDR
 accuracy issue: add control edges to optimizers that carry slot variables
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 2 ++ mx_rec/core/asc/build_graph.py | 36 ++++++++++++++++---------- mx_rec/core/asc/swap_args.py | 7 +++++ mx_rec/graph/modifier.py | 11 +++++++- mx_rec/optimizers/adagrad.py | 4 ++- mx_rec/optimizers/base.py | 13 ++++++++++ mx_rec/optimizers/ftrl.py | 4 ++- mx_rec/optimizers/lazy_adam.py | 6 +++-- mx_rec/optimizers/lazy_adam_by_addr.py | 2 +- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 22 +++++++++++++--- src/core/utils/common.cpp | 11 ++++++++ src/core/utils/common.h | 7 ++++- 12 files changed, 100 insertions(+), 25 deletions(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index 13b3d583..becba0ab 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -170,6 +170,8 @@ class ASCAnchorAttr(Enum): RESTORE_VECTOR_SECOND = "restore_vector_second" UNIQUE_KEYS = "unique_keys" IS_GRAD = "is_grad" + TABLE_NAME = "table_name" + CHANNEL_ID = "channel_id" class OptimizerType(Enum): diff --git a/mx_rec/core/asc/build_graph.py b/mx_rec/core/asc/build_graph.py index 0ddf313e..00b9d282 100644 --- a/mx_rec/core/asc/build_graph.py +++ b/mx_rec/core/asc/build_graph.py @@ -21,6 +21,7 @@ from typing import Optional, List, Dict, Union, Tuple import tensorflow as tf import mxrec_pybind +from mx_rec.constants.constants import ASCAnchorAttr from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.tf_version_adapter import npu_ops from mx_rec.util.log import logger @@ -36,7 +37,8 @@ class SwapInfo: def get_restore_vector(config): - logger.debug('Channel %s_restore_%s was built for getnext', config.get("table_name"), config.get("channel_id")) + logger.debug('Channel %s_restore_%s was built for getnext', config.get(ASCAnchorAttr.TABLE_NAME.value), + config.get(ASCAnchorAttr.CHANNEL_ID.value)) if config.get("is_hbm"): if not isinstance(config.get("emb_size"), int) or config.get("emb_size") < 1: raise TypeError(f"emb_size must be an int") @@ -58,32 +60,36 @@ def get_restore_vector(config): restore_size = None hot_size = None - with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): + with tf.compat.v1.variable_scope(config.get(ASCAnchorAttr.TABLE_NAME.value), reuse=tf.compat.v1.AUTO_REUSE): restore_vector, hot_pos = npu_ops.gen_npu_ops.get_next( output_types=[tf.int32, tf.int32], output_shapes=[restore_size, [hot_size]], - channel_name=f'{config.get("table_name")}_restore_{config.get("channel_id")}') + channel_name=f'{config.get(ASCAnchorAttr.TABLE_NAME.value)}' + f'_restore_{config.get(ASCAnchorAttr.CHANNEL_ID.value)}') return restore_vector, hot_pos def get_id_offsets(max_lookup_vec_size: int, config: dict) -> Tuple[int, SwapInfo]: logger.debug('Channel %s_lookup_%s was built for getnext', config.get(ASCAnchorAttr.TABLE_NAME.value), config.get(ASCAnchorAttr.CHANNEL_ID.value)) # dynamic expansion currently supports only HBM mode; by default there is no swap-in/swap-out swap_info = SwapInfo() - with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): + with tf.compat.v1.variable_scope(config.get(ASCAnchorAttr.TABLE_NAME.value), reuse=tf.compat.v1.AUTO_REUSE): if config.get("use_dynamic_expansion"): [id_offsets] = npu_ops.gen_npu_ops.get_next( output_types=[tf.int64], output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_lookup_{config.get("channel_id")}') + channel_name=f'{config.get(ASCAnchorAttr.TABLE_NAME.value)}' + f'_lookup_{config.get(ASCAnchorAttr.CHANNEL_ID.value)}')
return id_offsets, swap_info [id_offsets] = npu_ops.gen_npu_ops.get_next( output_types=[tf.int32], output_shapes=[[max_lookup_vec_size]], - channel_name=f'{config.get("table_name")}_lookup_{config.get("channel_id")}') + channel_name=f'{config.get(ASCAnchorAttr.TABLE_NAME.value)}' + f'_lookup_{config.get(ASCAnchorAttr.CHANNEL_ID.value)}') if config.get("is_hbm"): return id_offsets, swap_info ( @@ -94,9 +100,9 @@ def get_id_offsets(max_lookup_vec_size: int, config: dict) -> Tuple[int, SwapInf ) = npu_ops.gen_npu_ops.get_next( output_types=[tf.int32, tf.int32, tf.int32, tf.int32], output_shapes=[[max_lookup_vec_size], [max_lookup_vec_size], [], []], - channel_name=f'{config.get("table_name")}_swap_all', + channel_name=f'{config.get(ASCAnchorAttr.TABLE_NAME.value)}_swap_all', ) - logger.debug('Channel %s_swap_all was built for getnext', config.get("table_name")) + logger.debug('Channel %s_swap_all was built for getnext', config.get(ASCAnchorAttr.TABLE_NAME.value)) return id_offsets, swap_info @@ -111,13 +117,15 @@ def get_all2all_args(use_static: bool, config: dict) -> Optional[list]: if use_static: return all2all_args - with tf.compat.v1.variable_scope(config.get("table_name"), reuse=tf.compat.v1.AUTO_REUSE): + with tf.compat.v1.variable_scope(config.get(ASCAnchorAttr.TABLE_NAME.value), reuse=tf.compat.v1.AUTO_REUSE): with tf.compat.v1.variable_scope("all2all"): - logger.debug('Channel %s_a2a_%s was built for getnext', config.get("table_name"), config.get("channel_id")) + logger.debug('Channel %s_a2a_%s was built for getnext', config.get(ASCAnchorAttr.TABLE_NAME.value), + config.get(ASCAnchorAttr.CHANNEL_ID.value)) all2all_args = npu_ops.gen_npu_ops.get_next( output_types=[tf.int64], output_shapes=[[config.get("rank_size"), config.get("rank_size")]], - channel_name=f'{config.get("table_name")}_all2all_{config.get("channel_id")}', + channel_name=f'{config.get(ASCAnchorAttr.TABLE_NAME.value)}' + f'_all2all_{config.get(ASCAnchorAttr.CHANNEL_ID.value)}', name="a2a_get_next")[0] * config.get("emb_size") return all2all_args @@ -139,8 +147,8 @@ def get_preprocessed_tensor_for_asc(table, config): # when one table is looked up several times, get_preprocessed_tensor_for_asc is entered repeatedly; the last, largest lookup simply replaces the map's key-value entries swap_args = SwapArgs() - swap_args.set_data(SwapDataType.CONFIG.value, var_name=config.get("table_name"), - var_channel=config.get("channel_id"), config=config, swap_info=swap_info) + swap_args.set_data(SwapDataType.CONFIG.value, var_name=config.get(ASCAnchorAttr.TABLE_NAME.value), + var_channel=config.get(ASCAnchorAttr.CHANNEL_ID.value), config=config, swap_info=swap_info) all2all_args = get_all2all_args(use_static, config) diff --git a/mx_rec/core/asc/swap_args.py b/mx_rec/core/asc/swap_args.py index 5bcba234..1d206b95 100644 --- a/mx_rec/core/asc/swap_args.py +++ b/mx_rec/core/asc/swap_args.py @@ -41,6 +41,7 @@ class SwapArgs: def __init__(self): self.swap_config_dict = defaultdict(dict) self.swap_control_dict = defaultdict(dict) + self.slot_control_dict = defaultdict(dict) def set_data(self, data_type: str, **kwargs): if "var_name" not in kwargs: @@ -56,3 +57,9 @@ class SwapArgs: self.swap_control_dict[var_name][var_channel] = kwargs else: raise ValueError(f"Error data type in swap args: {data_type}") + + def set_slot_control(self, **kwargs): + if "var_name" not in kwargs: + raise ValueError("Missing Required key: var_name") + var_name = kwargs.pop("var_name") + self.slot_control_dict[var_name] = kwargs
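The pieces in this patch cooperate as follows: the optimizer side parks a placeholder no-op, records it in slot_control_dict via set_slot_control, and the graph modifier (next hunk) later rewires that placeholder's control input to the real swap op. A minimal, self-contained TF1-style sketch of the placeholder-plus-control-dependency idea, with illustrative names only (this is not mx_rec API):

    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()
    v = tf.compat.v1.get_variable("v", shape=[], initializer=tf.compat.v1.zeros_initializer())
    # Named placeholder control op; a graph rewriter can later redirect its input.
    barrier = tf.no_op(name="place_holder_slot_control_op")
    with tf.control_dependencies([barrier]):
        update = tf.compat.v1.assign_add(v, 1.0)  # cannot run before `barrier`

Because the dependency hangs on a named no-op rather than on the swap op itself, the optimizer code stays decoupled from graph-construction order; the modifier only needs the registered op.

diff --git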
a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 01aeda94..8629b350 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -253,12 +253,21 @@ class _GraphModifier: swap_args_dict = swap_args.swap_config_dict[table_instance.table_name][channel_id] swap_op = _get_swap_info( table_instance, variable_and_slot_list, swap_args_dict["swap_info"], channel_id) + # gather for id_offset needs to be executed after swap_op swap_control_dict = swap_args.swap_control_dict[table_instance.table_name][channel_id] if "control_ops" not in swap_control_dict: - raise ValueError("Missing Required key in modify_graph_for_asc: control_ops") + raise ValueError("swap control missing key [control_ops] in modify_graph_for_asc") control_ops = swap_control_dict["control_ops"] utils.replace_anchor_control(self._full_graph, control_ops, swap_op) + if is_training: + # gather for slot needs to be executed after swap_op + slot_control_dict = swap_args.slot_control_dict[table_instance.variable] + if "control_ops" not in slot_control_dict: + raise ValueError("slot control missing key [control_ops] in modify_graph_for_asc") + slot_control_ops = slot_control_dict["control_ops"] + utils.replace_anchor_control(self._full_graph, slot_control_ops, swap_op) + def _generate_get_next_op_specs(self, cutting_point_list: List[Tensor]) -> Dict[Tensor, _AnchorRecord]: get_next_op_map = defaultdict(dict) diff --git a/mx_rec/optimizers/adagrad.py b/mx_rec/optimizers/adagrad.py index 9998ec1f..df1fe2a3 100644 --- a/mx_rec/optimizers/adagrad.py +++ b/mx_rec/optimizers/adagrad.py @@ -26,7 +26,7 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import adagrad, training_ops -from mx_rec.optimizers.base import CustomizedOptimizer +from mx_rec.optimizers.base import CustomizedOptimizer, control_update_op_decorator from mx_rec.util.initialize import ConfigInitializer from mx_rec.validator.validator import para_checker_decorator, StringValidator, ClassValidator, FloatValidator @@ -111,6 +111,7 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): unique_local_grad, unique_keys = self.sum_same_id_gradients(grad=grad, var=handle, is_expansion=False) return self._resource_apply_sparse(unique_local_grad, handle, unique_keys) + @control_update_op_decorator def _apply_sparse(self, grad, var): acc = self.get_slot(var, "acc") return training_ops.sparse_apply_adagrad( @@ -119,6 +120,7 @@ class CustomizedAdagrad(adagrad.AdagradOptimizer, CustomizedOptimizer): grad.indices, use_locking=self._use_locking) + @control_update_op_decorator def _resource_apply_sparse(self, grad, var, indices): acc = self.get_slot(var, "acc") return training_ops.resource_sparse_apply_adagrad( diff --git a/mx_rec/optimizers/base.py b/mx_rec/optimizers/base.py index f74e9778..49629641 100644 --- a/mx_rec/optimizers/base.py +++ b/mx_rec/optimizers/base.py @@ -26,6 +26,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.training.optimizer import _TensorProcessor +from mx_rec.core.asc.swap_args import SwapArgs from mx_rec.constants.constants import ASCAnchorAttr from mx_rec.util.tf_version_adapter import npu_ops from mx_rec.util.initialize import ConfigInitializer @@ -143,6 +144,18 @@ def custom_update_op(self, opt, grad): raise RuntimeError("Only support g with type Tensor.") +def control_update_op_decorator(apply_sparse): + def wrapper(*args, **kwargs): + second_arg = args[2] if len(args) > 2 else None  # args[2] must be the var
+ slot_control_ops = tf.no_op(name="place_holder_slot_control_op") + swap_args = SwapArgs() + swap_args.set_slot_control(var_name=second_arg, control_ops=slot_control_ops) + with tf.control_dependencies([slot_control_ops]): + result = apply_sparse(*args, **kwargs) + return result + return wrapper + + def patch_for_optimizer(): _TensorProcessor.update_op = custom_update_op logger.debug("update_op in Class optimizer._TensorProcessor has been patched.") \ No newline at end of file diff --git a/mx_rec/optimizers/ftrl.py b/mx_rec/optimizers/ftrl.py index 30287abd..ad4f9880 100644 --- a/mx_rec/optimizers/ftrl.py +++ b/mx_rec/optimizers/ftrl.py @@ -30,7 +30,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import gen_state_ops from tensorflow.python.training import ftrl -from mx_rec.optimizers.base import CustomizedOptimizer +from mx_rec.optimizers.base import CustomizedOptimizer, control_update_op_decorator from mx_rec.util.initialize import ConfigInitializer from mx_rec.constants.constants import MAX_INT32 from mx_rec.validator.validator import para_checker_decorator, ClassValidator, StringValidator, \ @@ -128,6 +128,7 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): grad.indices, lambda x, i, v: tf.compat.v1.scatter_nd_update(x, i, v)) + @control_update_op_decorator def _apply_sparse_shared(self, grad, var, indices, scatter_nd_update): accum = self.get_slot(var, "accum") linear = self.get_slot(var, "linear") @@ -169,6 +170,7 @@ class CustomizedFtrl(ftrl.FtrlOptimizer, CustomizedOptimizer): return control_flow_ops.group(accum_update_op, linear_update_op, var_update_op) + @control_update_op_decorator def _apply_sparse_shared_v2(self, grad, var, indices, scatter_nd_update): accum = self.get_slot(var, "accum") linear = self.get_slot(var, "linear") diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 9aee0204..0684a715 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -29,7 +29,7 @@ from tensorflow.python.ops import gen_state_ops from tensorflow.python.ops import math_ops from tensorflow.python.training import adam -from mx_rec.optimizers.base import CustomizedOptimizer +from mx_rec.optimizers.base import CustomizedOptimizer, control_update_op_decorator from mx_rec.util.initialize import ConfigInitializer from mx_rec.util.ops import import_host_pipeline_ops from mx_rec.validator.validator import para_checker_decorator, StringValidator, FloatValidator, ClassValidator @@ -120,6 +120,7 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): } return temp + @control_update_op_decorator def _resource_apply_sparse(self, grad, handle, indices): return self._apply_sparse_shared( grad, @@ -127,6 +128,7 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): indices, self._resource_scatter_nd_add) + @control_update_op_decorator def _apply_sparse(self, grad, var): return self._apply_sparse_shared( grad.values, @@ -168,7 +170,7 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad) v_update_op = scatter_nd_add(velocity, nd_indices, v_t_slice - old_v_slice) - denominator_slice = math_ops.sqrt(v_t_slice) + temp_epsilon + denominator_slice = math_ops.sqrt(v_t_slice + temp_epsilon) var_update_op = scatter_nd_add(var, nd_indices, tf.divide(-learning_rate * m_t_slice, denominator_slice)) return control_flow_ops.group(m_update_op, v_update_op, var_update_op)
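The one-line denominator change above (and the matching change in lazy_adam_by_addr.py below) moves epsilon inside the square root. For sparse ids whose second-moment estimate is still near zero this is much better conditioned, which is the DDR-precision motivation of the patch. A standalone numeric illustration (plain Python, not mx_rec code):

    import math

    eps = 1e-8
    for v in (0.0, 1e-12, 1e-8):
        old = math.sqrt(v) + eps   # divisor as small as 1e-8 when v == 0
        new = math.sqrt(v + eps)   # never smaller than 1e-4 here
        print(f"v={v:g}  sqrt(v)+eps={old:.3e}  sqrt(v+eps)={new:.3e}")

For v == 0 the old form divides by 1e-8 (an update amplified roughly 1e8-fold), while the new form divides by 1e-4.

diff --git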
a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index f1f8a2df..b7887052 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -136,7 +136,7 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): old_v_slice = split_tensors[2] v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad) - denominator_slice = math_ops.sqrt(v_t_slice) + temp_epsilon + denominator_slice = math_ops.sqrt(v_t_slice + temp_epsilon) update_list = [tf.divide(-learning_rate * m_t_slice, denominator_slice)] + [m_t_slice - old_m_slice] + \ [v_t_slice - old_v_slice] update_tensor = tf.concat(update_list, axis=1) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index b318f2d4..b96f4eb9 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -1515,8 +1515,15 @@ void HybridMgmt::EmbeddingUpdateDDR(const EmbTaskInfo& info, const float* embPtr throw runtime_error("memcpy_s failed, error code:" + to_string(rc)); } } - LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingUpdateTC(ms):{}", - info.name, info.batchId, info.threadIdx, EmbeddingUpdateTC.ElapsedMS()); + if (MxRec::Logger::GetLevel() <= MxRec::Logger::DEBUG) { + string sample; + if (!swapOutAddrs.empty()) { + sample = FloatPtrToLimitStr(swapOutAddrs.front(), info.extEmbeddingSize); // print first element + } + LOG_DEBUG("table:{}, batchId:{}, thread:{}, receive d2hEmb, ext emb:{}, emb size:{}, emb samples:{}, " "EmbeddingUpdateTC(ms):{}", info.name.c_str(), info.batchId, info.threadIdx, + info.extEmbeddingSize, swapOutAddrs.size(), sample, EmbeddingUpdateTC.ElapsedMS()); + } lastUpdateFinishStepMap[info.name]++; cvLastUpdateFinishMap[info.name][info.cvNotifyIndex].notify_all(); @@ -1953,8 +1960,10 @@ bool HybridMgmt::BuildH2DEmbedding(const EmbTaskInfo &info, vector &h2dE throw runtime_error("memcpy_s failed, error code:" + to_string(rc)); } } - LOG_DEBUG("table:{}, thread:{}, embeddingLookupTC(ms):{}", - info.name.c_str(), info.threadIdx, embeddingLookupTC.ElapsedMS()); + LOG_DEBUG("table:{}, thread:{}, batchId:{}, send h2dEmb, emb size:{}, emb samples:{}, embeddingLookupTC(ms):{}", + info.name.c_str(), info.threadIdx, info.batchId, swapInAddrs.size(), + FloatPtrToLimitStr(h2dEmbAddr, swapInAddrs.size() * info.extEmbeddingSize), + embeddingLookupTC.ElapsedMS()); return true; } @@ -2196,6 +2205,11 @@ void HybridMgmt::GetSwapPairsAndKey2Offset(const EmbBaseInfo &info, vector(std::min(prtSize, maxDispLen)); + std::string s; + for (int i = 0; i < maxLen; i++) { + s += std::to_string(*(ptr + i)) + " "; + } + return s; + } + ostream& operator<<(ostream& ss, MxRec::CkptDataType type) { ss << static_cast<int>(type); diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 75837349..4fdb7c8d 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -361,9 +361,12 @@ namespace MxRec { template std::string VectorToString(const std::vector& vec) { + constexpr size_t maxDispLen = 20; // max display number + int maxLen = static_cast<int>(std::min(vec.size(), maxDispLen)); + std::stringstream ss; ss << "["; - for (size_t i = 0; i < vec.size(); ++i) { + for (size_t i = 0; i < maxLen; ++i) { ss << vec[i]; if (i != vec.size() - 1) { ss << ", "; @@ -373,6 +376,8 @@ namespace MxRec { return ss.str(); } + std::string FloatPtrToLimitStr(float* ptr, const size_t& prtSize); + template std::string MapToString(const std::map& map) { -- Gitee
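Worth noting about the logging changes in this patch: both FloatPtrToLimitStr and the amended VectorToString cap debug output at the first 20 elements, so a hot-path log line stays bounded regardless of embedding width. The same idea in a few lines of Python, purely as an illustration:

    def preview(values, max_disp_len=20):
        """Space-separated preview of at most max_disp_len values."""
        return " ".join(str(v) for v in values[:max_disp_len])

    print(preview([0.25] * 1000))  # logs 20 numbers, not 1000
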
From e713f2c9f202757deb4f326dc3ccdac44e8d1378 Mon Sep 17 00:00:00 2001 From: 郭望 <1244372993@qq.com> Date: Tue, 4 Jun 2024 11:06:34 +0800 Subject: [PATCH 195/302] WideDeep model: fixes for reported Issues, part 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/criteo.py | 2 +- examples/WideDeep/model/main_mxrec.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py index ffb07289..137dac5c 100644 --- a/examples/WideDeep/criteo.py +++ b/examples/WideDeep/criteo.py @@ -179,7 +179,7 @@ def convert_input2tfrd(in_file_path, out_file_path): return example - file_name = out_file_path + in_file_path[-12:-4] + '.tfrecords' + file_name = out_file_path + in_file_path[-12:-4] + '.tfrecord' file_writer = tf.io.TFRecordWriter(file_name) with open(in_file_path, encoding='utf-8') as file_in: diff --git a/examples/WideDeep/model/main_mxrec.py b/examples/WideDeep/model/main_mxrec.py index d81ff215..b1d77698 100644 --- a/examples/WideDeep/model/main_mxrec.py +++ b/examples/WideDeep/model/main_mxrec.py @@ -89,7 +89,6 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, dataset = dataset.shard(config.rank_size, config.rank_id) if is_training: dataset = dataset.shuffle(batch_size * 1000, seed=shuffle_seed) - if is_training: dataset = dataset.repeat(config.train_epoch) else: dataset = dataset.repeat(config.test_epoch) @@ -382,8 +381,6 @@ if __name__ == "__main__": if use_faae: cfg.dev_vocab_size = cfg.dev_vocab_size // 2 - optimizer_list = [get_dense_and_sparse_optimizer(cfg)] # create the embedding tables wide_emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) deep_emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) @@ -424,6 +421,7 @@ if __name__ == "__main__": eval_model = model_forward(eval_forward_args) train_variables, emb_variables = get_dense_and_sparse_variable() + optimizer_list = [get_dense_and_sparse_optimizer(cfg)] rank_size = mxrec_util.communication.hccl_ops.get_rank_size() train_ops = [] -- Gitee From e62e62b14c9115097354ee49d225815153c9fa75 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 4 Jun 2024 11:16:10 +0000 Subject: [PATCH 196/302] !172 Patch TF's native write_graph function; fix the DDR save logic; rework hdfsConnect handling * Patch TF's native write_graph function; fix the DDR save logic; rework hdfsConnect handling * Merge remote-tracking branch 'upstream/develop' into hdfs-fix-0530 * Patch TF's native write_graph function; fix the DDR save logic; rework hdfsConnect handling * Patch TF's native write_graph function: in a multi-process run only one process is allowed to perform write_graph * Patch TF's native write_graph function: in a multi-process run only one process is allowed to perform write_graph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/patch.py | 16 +++++- mx_rec/saver/saver.py | 3 + src/core/emb_table/embedding_ddr.cpp | 51 ++++++++--------- src/core/emb_table/embedding_dynamic.cpp | 47 ++++++++------- src/core/emb_table/embedding_mgmt.cpp | 18 +++++- src/core/emb_table/embedding_static.cpp | 19 ++++--- src/core/emb_table/embedding_table.cpp | 18 +++++- src/core/emb_table/embedding_table.h | 9 ++- .../hdfs_file_system/hdfs_file_system.cpp | 57 ++----------------- .../hdfs_file_system/hdfs_file_system.h | 11 ++-- src/tests/emb_table/embedding_static_test.cpp | 1 + 11 files changed, 128 insertions(+), 122 deletions(-) diff --git a/mx_rec/saver/patch.py b/mx_rec/saver/patch.py index 6cffcc18..dcdf95ca 100644 --- a/mx_rec/saver/patch.py +++ b/mx_rec/saver/patch.py @@ -30,6 +30,7 @@ from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.framework import graph_io from tensorflow.python.ops import variables from tensorflow.python.ops import io_ops from tensorflow.python.platform import gfile @@ -41,6 +42,7 @@ from tensorflow.python.training.tracking import base as trackable from tensorflow.python.training.saving import saveable_object from tensorflow.python.training.saving import saveable_object_util import numpy as np +from mpi4py import MPI from mx_rec.saver.saver import Saver as SparseSaver, check_file_system_is_valid from mx_rec.util.initialize import ConfigInitializer @@ -248,7 +250,6 @@ def save(self, sess, save_path, global_step=None, latest_filename=None, meta_gra self.sparse_saver.save(sess, save_path=checkpoint_file) logger.info("Save sparse model into dir %s", checkpoint_file) - from mpi4py import MPI comm = MPI.COMM_WORLD rank = comm.Get_rank() comm.Barrier() @@ -447,6 +448,18 @@ class BulkSaverBuilder(BaseSaverBuilder): return io_ops.restore_v2(filename_tensor, tensor_names, tensor_slices, tensor_dtypes) +def patch_for_write_graph_func(func): + def wrapper(*args, **kwargs): + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + # In the case of multiple processes, choose one process to write graph. + if rank == 0: + return func(*args, **kwargs) + else: + return None + return wrapper + + def patch_for_saver(): dense_saver = tf.compat.v1.train.Saver dense_saver.__init__ = saver_init @@ -454,3 +467,4 @@ def patch_for_saver(): dense_saver.restore = restore dense_saver.build = build logger.debug("Class tf.train.Saver has been patched.") + training_util.write_graph = patch_for_write_graph_func(graph_io.write_graph)
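The wrapper above is a generic rank-gating decorator; it can be exercised standalone, independent of the saver. A minimal sketch (requires mpi4py; the write_marker helper is hypothetical, not part of mx_rec):

    from mpi4py import MPI

    def rank_zero_only(func):
        """Run func on MPI rank 0 only; other ranks return None."""
        def wrapper(*args, **kwargs):
            if MPI.COMM_WORLD.Get_rank() == 0:
                return func(*args, **kwargs)
            return None
        return wrapper

    @rank_zero_only
    def write_marker(path):  # hypothetical helper for demonstration
        with open(path, "w") as f:
            f.write("written by exactly one process\n")

Launched under mpirun with N processes, write_marker runs once instead of N times, which is exactly the duplicate-writer race the write_graph patch avoids.

diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index f9dfd0dc..f7c6b9a2 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -286,6 +286,9 @@ class Saver(object): def _save_ddr(self, sess, root_dir): # receive the swap_out offsets passed from the host side, used to update and save the host-side table self.config_instance.hybrid_manager_config.fetch_device_emb() + # In DDR mode, within the save process, the graph has been fixed and cannot execute the get_next op. + # The _unsafe_unfinalize operation can modify the state of the graph being fixed.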
+ sess.graph._unsafe_unfinalize() for var in self.var_list: table_instance = self.config_instance.sparse_embed_config.get_table_instance(var) table_name = table_instance.table_name diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index f069e5c7..092ad0c5 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -18,7 +18,6 @@ See the License for the specific language governing permissions and #include "utils/logger.h" #include "utils/singleton.h" -#include "file_system/file_system_handler.h" #include "ssd_cache/cache_manager.h" #include "ock_ctr_common/include/error_code.h" @@ -86,12 +85,13 @@ void EmbeddingDDR::LoadKey(const string &savePath, vector &keys stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } size_t fileSize = 0; try { - fileSize = fileSystemPtr->GetFileSize(ss.str()); + fileSize = fileSystemPtr_->GetFileSize(ss.str()); } catch (exception& e) { string errMsg = StringFormat("open file failed:%s, error code:%s", ss.str().c_str(), strerror(errno)); throw runtime_error(errMsg); @@ -107,7 +107,7 @@ void EmbeddingDDR::LoadKey(const string &savePath, vector &keys string errMsg = StringFormat("malloc buffer failed, error code:%s", strerror(errno)); throw runtime_error(errMsg); } - ssize_t result = fileSystemPtr->Read(ss.str(), reinterpret_cast<char*>(buf), fileSize); + ssize_t result = fileSystemPtr_->Read(ss.str(), reinterpret_cast<char*>(buf), fileSize); if (result == -1) { free(static_cast(buf)); string errMsg = StringFormat("read buffer failed, error code:%s", strerror(errno)); @@ -144,13 +144,13 @@ void EmbeddingDDR::LoadEmbedding(const string &savePath, vector> & stringstream ss; ss << savePath << "/" << name; - - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - stringstream embedStream; embedStream << ss.str() << "/" << "embedding/slice.data"; - ssize_t res = fileSystemPtr->Read(embedStream.str(), embeddings, 0, hostLoadOffset, embSize_); + + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + ssize_t res = fileSystemPtr_->Read(embedStream.str(), embeddings, 0, hostLoadOffset, embSize_); LOG_DEBUG("load embedding done, table:{}, read bytes:{}", name, res); } @@ -170,14 +170,14 @@ void EmbeddingDDR::LoadOptimizerSlot(const string &savePath, vector fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } int64_t slotIdx = 0; for (const auto &param: optimParams) { stringstream paramStream; paramStream << ss.str() << "/" << optimName + "_" + param << "/slice.data"; - ssize_t res = fileSystemPtr->Read(paramStream.str(), optimizerSlots, slotIdx, hostLoadOffset, embSize_); + ssize_t res = fileSystemPtr_->Read(paramStream.str(), optimizerSlots, slotIdx, hostLoadOffset, embSize_); slotIdx++; LOG_DEBUG("load optimizer slot, table:{}, slot:{}, read bytes:{}", name, param, res); } @@ -264,14 +264,14 @@ void EmbeddingDDR::SaveKey(const string& savePath, vector& keys MakeDir(ss.str()); ss << "slice_" << rankId_ << ".data"; - 
unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - // temporary HBM compatibility: convert to int64_t; the key type will later be unified to uint64_t vector keysCompat(keys.cbegin(), keys.cend()); - ssize_t res = fileSystemPtr->Write(ss.str(), reinterpret_cast<const char*>(keysCompat.data()), - static_cast(keys.size() * sizeof(int64_t))); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + ssize_t res = fileSystemPtr_->Write(ss.str(), reinterpret_cast<const char*>(keysCompat.data()), + static_cast(keys.size() * sizeof(int64_t))); if (res == -1) { throw runtime_error("save key failed!"); } @@ -284,10 +284,10 @@ void EmbeddingDDR::SaveEmbedding(const string& savePath, vector>& MakeDir(ss.str()); ss << "slice_" << rankId_ << ".data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - - ssize_t writeBytesNum = fileSystemPtr->Write(ss.str(), embeddings, embSize_); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + ssize_t writeBytesNum = fileSystemPtr_->Write(ss.str(), embeddings, embSize_); ssize_t expectWriteBytes = embeddings.size() * embSize_ * sizeof(float); if (writeBytesNum != expectWriteBytes) { string errMsg = StringFormat("save embedding failed, write expect:%d, actual:%d, path:%s", @@ -317,15 +317,12 @@ void EmbeddingDDR::SaveOptimizerSlot(const string& savePath, vector fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - vector> slotData; for (const auto &data: optimizerSlots) { vector tmp(data.cbegin() + slotIdx * embSize_, data.cbegin() + (slotIdx+1) * embSize_); slotData.emplace_back(tmp); } - ssize_t writeBytesNum = fileSystemPtr->Write(ss.str(), slotData, embSize_); + ssize_t writeBytesNum = fileSystemPtr_->Write(ss.str(), slotData, embSize_); ssize_t expectWriteBytes = slotData.size() * embSize_ * sizeof(float); if (writeBytesNum != expectWriteBytes) { string errMsg = StringFormat("save optimizer slot failed, write expect:%d, actual:%d, path:%s", diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index 78c94862..a69cf930 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -17,7 +17,6 @@ See the License for the specific language governing permissions and #include "utils/logger.h" #include "utils/singleton.h" #include "hd_transfer/hd_transfer.h" -#include "file_system/file_system_handler.h" #include "utils/common.h" using namespace MxRec; @@ -139,9 +138,6 @@ void EmbeddingDynamic::SaveKey(const string& savePath) MakeDir(ss.str()); ss << "slice_" << rankId_ << ".data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - deviceKey.clear(); embAddress.clear(); for (auto& it: key2AddressMap) { deviceKey.push_back(it.first); embAddress.push_back(it.second); } + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr_->Write(ss.str(), reinterpret_cast<const char*>(deviceKey.data()), writeSize); if (res == -1) { throw runtime_error(StringFormat("Error: Save keys failed. 
" "An error occurred while writing file: {}.", ss.str())); @@ -195,22 +194,25 @@ void EmbeddingDynamic::SaveEmbData(const string& savePath) MakeDir(ss.str()); ss << "slice_" << rankId_ << ".data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - fileSystemPtr->WriteEmbedding(ss.str(), embSize_, embAddress, deviceId); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + fileSystemPtr_->WriteEmbedding(ss.str(), embSize_, embAddress, deviceId); } void EmbeddingDynamic::SaveOptimData(const string &savePath) { + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + for (const auto &content: optimAddressMap) { stringstream ss; ss << savePath << "/" << name << "/" << optimName + "_" + content.first << "/"; MakeDir(ss.str()); ss << "slice_" << rankId_ << ".data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - fileSystemPtr->WriteEmbedding(ss.str(), embSize_, content.second, deviceId); + fileSystemPtr_->WriteEmbedding(ss.str(), embSize_, content.second, deviceId); } } @@ -225,22 +227,23 @@ void EmbeddingDynamic::LoadEmbAndOptim(const string& savePath) stringstream ss; ss << savePath << "/" << name; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - // 读embedding stringstream embedStream; embedStream << ss.str() << "/" << "embedding/slice.data"; + + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } EmbeddingSizeInfo embeddingSizeInfo = {embSize_, extEmbSize_}; - fileSystemPtr->ReadEmbedding(embedStream.str(), embeddingSizeInfo, firstAddress, rankId_, loadOffset); + fileSystemPtr_->ReadEmbedding(embedStream.str(), embeddingSizeInfo, firstAddress, rankId_, loadOffset); // 读optim int optimIndex = 1; for (const auto ¶m: optimParams) { stringstream paramStream; paramStream << ss.str() << "/" << optimName + "_" + param << "/slice.data"; - fileSystemPtr->ReadEmbedding(paramStream.str(), embeddingSizeInfo, - firstAddress + optimIndex * embSize_ * sizeof(float), deviceId, loadOffset); + fileSystemPtr_->ReadEmbedding(paramStream.str(), embeddingSizeInfo, + firstAddress + optimIndex * embSize_ * sizeof(float), deviceId, loadOffset); optimIndex++; } } @@ -250,10 +253,10 @@ void EmbeddingDynamic::LoadKey(const string& savePath) stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - - size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + size_t fileSize = fileSystemPtr_->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { throw runtime_error(StringFormat("Error: Load keys failed. 
file {} size {} is too big.", ss.str(), fileSize)); } @@ -264,7 +267,7 @@ void EmbeddingDynamic::LoadKey(const string& savePath) "failed to allocate {} bytes using malloc.", fileSize)); } - ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast<char*>(buf), fileSize); + ssize_t res = fileSystemPtr_->Read(ss.str(), reinterpret_cast<char*>(buf), fileSize); if (res == -1) { throw runtime_error(StringFormat("Error: Load keys failed. " "An error occurred while reading file: {}.", ss.str())); diff --git a/src/core/emb_table/embedding_mgmt.cpp b/src/core/emb_table/embedding_mgmt.cpp index 33e1c671..9e7dcbb0 100644 --- a/src/core/emb_table/embedding_mgmt.cpp +++ b/src/core/emb_table/embedding_mgmt.cpp @@ -19,6 +19,7 @@ See the License for the specific language governing permissions and #include "emb_table/embedding_static.h" #include "emb_table/embedding_dynamic.h" #include "emb_table/embedding_ddr.h" +#include "file_system/file_system_handler.h" #include "utils/logger.h" using namespace MxRec; @@ -111,23 +112,32 @@ int64_t EmbeddingMgmt::GetCapacity(const std::string &name) void EmbeddingMgmt::Load(const string& name, const string& filePath, map>& trainKeySet) { - return embeddings[name]->Load(filePath, trainKeySet); + embeddings[name]->SetFileSystemPtr(filePath); + embeddings[name]->Load(filePath, trainKeySet); + embeddings[name]->UnsetFileSystemPtr(); } void EmbeddingMgmt::Load(const string& filePath, map>& trainKeySet) { for (auto& tablePair: embeddings) { + tablePair.second->SetFileSystemPtr(filePath); tablePair.second->Load(filePath, trainKeySet); + tablePair.second->UnsetFileSystemPtr(); } } void EmbeddingMgmt::Save(const string& name, const string& filePath) { + embeddings[name]->SetFileSystemPtr(filePath); + embeddings[name]->Save(filePath); + embeddings[name]->UnsetFileSystemPtr(); } void EmbeddingMgmt::Save(const string& filePath) { + for (auto& tablePair: embeddings) { + tablePair.second->SetFileSystemPtr(filePath); + } // use multiple threads to prevent the save_d2h receive from blocking when the table order differs between the C++ and Python sides vector> futures; for (auto& tablePair: embeddings) { futures.emplace_back(async(launch::async, [&tablePair, &filePath]() { tablePair.second->Save(filePath); })); } for (auto& f: futures) { f.get(); // get() rethrows the exception if one happened } + + for (auto& tablePair: embeddings) { + tablePair.second->UnsetFileSystemPtr(); + } } OffsetMapT EmbeddingMgmt::GetDeviceOffsets() diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index fdda5ede..ab66a42c 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -83,9 +83,6 @@ void EmbeddingStatic::SaveKey(const string& savePath) MakeDir(ss.str()); ss << "slice_" << rankId_ << ".data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - deviceKey.clear(); deviceOffset.clear(); for (auto& it: key2OffsetMap) { deviceKey.push_back(it.first); deviceOffset.push_back(it.second); } + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + size_t writeSize = static_cast(deviceKey.size() * sizeof(int64_t)); ssize_t res = fileSystemPtr_->Write(ss.str(), reinterpret_cast<const char*>(deviceKey.data()), writeSize); if (res == -1) { throw runtime_error(StringFormat("Error: Save keys failed. 
" "An error occurred while writing file: {}.", ss.str())); @@ -116,10 +117,10 @@ void EmbeddingStatic::LoadKey(const string& savePath) stringstream ss; ss << savePath << "/" << name << "/key/slice.data"; - unique_ptr fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(ss.str()); - - size_t fileSize = fileSystemPtr->GetFileSize(ss.str()); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + size_t fileSize = fileSystemPtr_->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); } @@ -130,7 +131,7 @@ void EmbeddingStatic::LoadKey(const string& savePath) "failed to allocate {} bytes using malloc.", fileSize)); } - ssize_t res = fileSystemPtr->Read(ss.str(), reinterpret_cast(buf), fileSize); + ssize_t res = fileSystemPtr_->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { throw runtime_error(StringFormat("Error: Load keys failed. " "An error occurred while reading file: {}.", ss.str())); diff --git a/src/core/emb_table/embedding_table.cpp b/src/core/emb_table/embedding_table.cpp index 1579282f..b4eb2379 100644 --- a/src/core/emb_table/embedding_table.cpp +++ b/src/core/emb_table/embedding_table.cpp @@ -119,6 +119,17 @@ absl::flat_hash_map EmbeddingTable::GetKeyOffsetMap() return keyOffsetMap; } +void EmbeddingTable::SetFileSystemPtr(const string& savePath) +{ + unique_ptr fileSystemHandler = make_unique(); + fileSystemPtr_ = fileSystemHandler->Create(savePath); +} + +void EmbeddingTable::UnsetFileSystemPtr() +{ + fileSystemPtr_ = nullptr; +} + vector EmbeddingTable::GetLoadOffset() { return loadOffset; @@ -134,9 +145,10 @@ void EmbeddingTable::Save(const string& filePath) void EmbeddingTable::MakeDir(const string& dirName) { - auto fileSystemHandler = make_unique(); - unique_ptr fileSystemPtr = fileSystemHandler->Create(dirName); - fileSystemPtr->CreateDir(dirName); + if (fileSystemPtr_ == nullptr) { + throw runtime_error("failed to obtain the file system pointer, the file system pointer is null."); + } + fileSystemPtr_->CreateDir(dirName); } void EmbeddingTable::SetCacheManager(CacheManager *cm) diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h index 1fa9008b..8b622194 100644 --- a/src/core/emb_table/embedding_table.h +++ b/src/core/emb_table/embedding_table.h @@ -21,6 +21,7 @@ See the License for the specific language governing permissions and #include "utils/common.h" #include "ssd_cache/cache_manager.h" +#include "file_system/file_system_handler.h" namespace MxRec { @@ -65,11 +66,15 @@ public: absl::flat_hash_map GetKeyOffsetMap(); + void SetFileSystemPtr(const string& savePath); + + void UnsetFileSystemPtr(); + virtual void Load(const string& savePath, map>& trainKeySet); virtual void Save(const string& savePath); - static void MakeDir(const string& dirName); + void MakeDir(const string& dirName); virtual vector GetDeviceOffset(); @@ -116,6 +121,8 @@ protected: std::vector missingKeysHostPos_; // 用于记录当前batch在host上需要换出的偏移 CacheManager* cacheManager_; bool isSSDEnabled_ = false; + + unique_ptr fileSystemPtr_; }; } diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp index 68fc47a8..3cbf4a44 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp +++ 
b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp @@ -28,19 +28,15 @@ using namespace MxRec; void HdfsFileSystem::CreateDir(const string& dirName) { - hdfsFS fs = ConnectHdfs(); int ret = hdfs->CreateDirectory(fs, dirName.c_str()); if (ret == -1) { LOG_DEBUG("Unable to create hdfs directory: {}", dirName); } - hdfs->Disconnect(fs); } vector HdfsFileSystem::ListDir(const string& dirName) { vector dirs; - hdfsFS fs = ConnectHdfs(); - int numEntries = 0; hdfsFileInfo* subDirs = hdfs->ListDirectory(fs, dirName.c_str(), &numEntries); for (int i = 0; i < numEntries; ++i) { @@ -50,15 +46,12 @@ vector HdfsFileSystem::ListDir(const string& dirName) } hdfs->FreeFileInfo(subDirs, numEntries); - hdfs->Disconnect(fs); return dirs; } size_t HdfsFileSystem::GetFileSize(const string& filePath) { - hdfsFS fs = ConnectHdfs(); hdfsFileInfo* fileInfo = hdfs->GetPathInfo(fs, filePath.c_str()); - hdfs->Disconnect(fs); if (fileInfo == nullptr) { throw runtime_error(StringFormat("Error: Unable to get hdfs file info : {}.", filePath.c_str())); } @@ -68,15 +61,8 @@ size_t HdfsFileSystem::GetFileSize(const string& filePath) ssize_t HdfsFileSystem::Write(const string& filePath, const char* fileContent, size_t dataSize) { - hdfsFS fs = ConnectHdfs(); - int flag = O_WRONLY | O_CREAT; - hdfsFileInfo* fileInfo = hdfs->GetPathInfo(fs, filePath.c_str()); - if (fileInfo) { - flag = O_WRONLY | O_APPEND; - } - hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), flag, 0, 0, 0); + hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); } @@ -84,39 +70,32 @@ ssize_t HdfsFileSystem::Write(const string& filePath, const char* fileContent, s tSize res = hdfs->Write(fs, file, fileContent, dataSize); if (res == -1) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(res); } writeBytesNum += res; hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(writeBytesNum); } ssize_t HdfsFileSystem::Write(const string& filePath, vector>& fileContent, size_t dataSize) { - hdfsFS fs = ConnectHdfs(); - hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); } tSize writeBytesNum = 0; size_t loops = fileContent.size(); for (size_t i = 0; i < loops; i++) { - tSize res = hdfs->Write(fs, file, reinterpret_cast(&fileContent[i]), dataSize); + tSize res = hdfs->Write(fs, file, reinterpret_cast(&fileContent[i]), dataSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(res); } writeBytesNum += res; } hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(writeBytesNum); } @@ -129,11 +108,8 @@ ssize_t HdfsFileSystem::Write(const string& filePath, vector>& fil void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embeddingSize, const vector& addressArr, int deviceId) { - hdfsFS fs = ConnectHdfs(); - hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); } @@ -141,7 +117,6 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding auto res = aclrtSetDevice(static_cast(deviceId)); if (res != ACL_ERROR_NONE) { hdfs->CloseFile(fs, file); 
- hdfs->Disconnect(fs); throw runtime_error(StringFormat("Set device failed, device_id:%d", deviceId).c_str()); } @@ -155,20 +130,17 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding ACL_MEMCPY_DEVICE_TO_HOST); if (ret != ACL_SUCCESS) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); throw runtime_error("Error: Execute aclrtmemcpy from device to host failed."); } tSize res = hdfs->Write(fs, file, row.data(), embeddingSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: An error occurred while writing file: {}.", filePath.c_str())); } if (res != embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Expected to write {} bytes, " "but actually write {} bytes to file {}.", embeddingSize * sizeof(float), res, filePath.c_str())); @@ -176,16 +148,12 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding } #endif hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); } ssize_t HdfsFileSystem::Read(const string& filePath, char* fileContent, size_t datasetSize) { - hdfsFS fs = ConnectHdfs(); - hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); } @@ -193,24 +161,19 @@ ssize_t HdfsFileSystem::Read(const string& filePath, char* fileContent, size_t d tSize res = hdfs->Read(fs, file, fileContent, datasetSize); if (res == -1) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(res); } readBytesNum += res; hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(readBytesNum); } ssize_t HdfsFileSystem::Read(const string& filePath, vector>& fileContent, int64_t contentOffset, vector offsetArr, const size_t& embeddingSize) { - hdfsFS fs = ConnectHdfs(); - hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); } @@ -223,7 +186,6 @@ ssize_t HdfsFileSystem::Read(const string& filePath, vector>& file embeddingSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(res); } embeddingCount++; @@ -231,7 +193,6 @@ ssize_t HdfsFileSystem::Read(const string& filePath, vector>& file } hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); return static_cast(readBytesNum); } @@ -245,11 +206,8 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em int deviceId, vector offsetArr) { #ifndef GTEST - hdfsFS fs = ConnectHdfs(); - hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); } @@ -265,7 +223,6 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em int seekRes = hdfs->Seek(fs, file, offset * embedSizeInfo.embeddingSize * sizeof(float)); if (seekRes == -1) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: hdfsSeek failed with error. 
file offset: {}", offset * embedSizeInfo.embeddingSize * sizeof(float))); } @@ -273,12 +230,10 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em tSize res = hdfs->Read(fs, file, row.data(), embedSizeInfo.embeddingSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: An error occurred while reading file: {}.", filePath.c_str())); } if (res != embedSizeInfo.embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); throw runtime_error(StringFormat("Error: Expected to read {} bytes, " "but actually read {} bytes from file {}.", embedSizeInfo.embeddingSize * sizeof(float), res, filePath.c_str())); @@ -289,21 +244,19 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em row.data(), embedSizeInfo.embeddingSize * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE); if (ret != ACL_SUCCESS) { hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); throw runtime_error("Error: Execute aclrtmemcpy from host to device failed."); } i++; } hdfs->CloseFile(fs, file); - hdfs->Disconnect(fs); #endif } hdfsFS HdfsFileSystem::ConnectHdfs() { - hdfsFS fs = hdfs->Connect("default", 0); - if (!fs) { + hdfsFS hdfsClient = hdfs->Connect("default", 0); + if (!hdfsClient) { throw runtime_error("Connect hdfs file system failed."); } - return fs; + return hdfsClient; } \ No newline at end of file diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.h b/src/core/file_system/hdfs_file_system/hdfs_file_system.h index f6c6a489..bf56062f 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.h +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.h @@ -24,11 +24,11 @@ namespace MxRec { class HdfsFileSystem : public FileSystem { public: - HdfsFileSystem() + HdfsFileSystem() {}; + ~HdfsFileSystem() { - hdfs = make_unique(); - }; - ~HdfsFileSystem() override {} + hdfs->Disconnect(fs); + } void CreateDir(const string& dirName) override; vector ListDir(const string& dirName) override; @@ -47,7 +47,8 @@ namespace MxRec { hdfsFS ConnectHdfs(); - unique_ptr hdfs; + unique_ptr hdfs = make_unique(); + hdfsFS fs = ConnectHdfs(); }; } diff --git a/src/tests/emb_table/embedding_static_test.cpp b/src/tests/emb_table/embedding_static_test.cpp index 9e250f64..a08569b3 100644 --- a/src/tests/emb_table/embedding_static_test.cpp +++ b/src/tests/emb_table/embedding_static_test.cpp @@ -156,6 +156,7 @@ TEST_F(EmbeddingStaticTest, SaveKeyData) { vector embInfos = {embInfo_}; shared_ptr hbm = std::make_shared(embInfo_, rankInfo_, 0); + hbm->SetFileSystemPtr("test_dir"); hbm->Save("test_dir"); bool fileExist = false; if (access("./test_dir/test1/key", F_OK) == 0) { -- Gitee From 80b3718559001df56b389f73e4c0992a87f12b0b Mon Sep 17 00:00:00 2001 From: 郭望 <1244372993@qq.com> Date: Tue, 4 Jun 2024 20:42:43 +0800 Subject: [PATCH 197/302] WideDeep model: revise the data preprocessing script criteo.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/criteo.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py index 137dac5c..f9624d21 100644 --- a/examples/WideDeep/criteo.py +++ b/examples/WideDeep/criteo.py @@ -167,10 +167,9 @@ def convert_input2tfrd(in_file_path, out_file_path): txt to tfrecords """ def make_example(label_list, dense_feat_list, sparse_feat_list): - # '1.0' >> 1.0 >> 1 - dense_feature = np.array(np.array(dense_feat_list, dtype=np.float32), dtype=np.int64).reshape(-1) - sparse_feature = np.array(np.array(sparse_feat_list, dtype=np.float32), dtype=np.int64).reshape(-1) - label = np.array(np.array(label_list, dtype=np.float32), dtype=np.int64).reshape(-1) + dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1) + sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1) + label = np.array(label_list, dtype=np.int64).reshape(-1) feature_dict = {"dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) -- Gitee From 6fbabb2abf1a9b9d7a0c45613ce2b3d3cefbd7b8 Mon Sep 17 00:00:00 2001 From: 罗幸运 Date: Wed, 5 Jun 2024 10:31:34 +0800 Subject: [PATCH 198/302] [Modification] DDR precision issue: fix a DDR-mode bug when using the sgd optimizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/graph/modifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 8629b350..15c240e5 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -260,7 +260,7 @@ class _GraphModifier: control_ops = swap_control_dict["control_ops"] utils.replace_anchor_control(self._full_graph, control_ops, swap_op) - if is_training: + if is_training and slot_num > 1: # gather for slot needs to be executed after swap_op slot_control_dict = swap_args.slot_control_dict[table_instance.variable] if "control_ops" not in slot_control_dict: -- Gitee
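For readers reproducing the criteo.py change above: make_example relies on tf.train's protobuf helpers, which can be exercised without the rest of the pipeline. A minimal round-trip sketch (assumes only TensorFlow and NumPy; the feature values are made up):

    import numpy as np
    import tensorflow as tf

    label = np.array([1], dtype=np.int64)
    feature_dict = {
        "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
    blob = example.SerializeToString()
    # Parse it back to confirm the schema round-trips.
    parsed = tf.io.parse_single_example(
        blob, {"label": tf.io.FixedLenFeature([1], tf.int64)})

From 3dbc71e41d9c24b864000170b55f9fe73274f7b1 Mon Sep 17 00:00:00 2001 From: 罗幸运 Date: Thu, 6 Jun 2024 16:07:28 +0800 Subject: [PATCH 199/302] [Modification] Support for multi-level caches in expansion mode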
b/examples/WideDeep/model/config.py index fae850f9..0072dc59 100644 --- a/examples/WideDeep/model/config.py +++ b/examples/WideDeep/model/config.py @@ -21,6 +21,8 @@ import tensorflow as tf from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig from npu_bridge.estimator.npu.npu_config import NPURunConfig +from mx_rec.constants.constants import CacheModeEnum + SSD_DATA_PATH = ["ssd_data"] @@ -90,12 +92,6 @@ class LearningRateScheduler: return lr_dense, lr_sparse -class CacheModeEnum(enum.Enum): - HBM = "HBM" - DDR = "DDR" - SSD = "SSD" - - class Config: def __init__(self, ): self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None diff --git a/examples/demo/little_demo/main.py b/examples/demo/little_demo/main.py index ff09bc50..cfaecbde 100644 --- a/examples/demo/little_demo/main.py +++ b/examples/demo/little_demo/main.py @@ -24,7 +24,7 @@ from glob import glob import numpy as np import tensorflow as tf -from mx_rec.constants.constants import ASCEND_TIMESTAMP +from mx_rec.constants.constants import ASCEND_TIMESTAMP, CacheModeEnum from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.helper import get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -46,12 +46,6 @@ tf.compat.v1.disable_eager_execution() _SSD_SAVE_PATH = ["ssd_data"] # user should make sure directory exist and clean before training -class CacheModeEnum(enum.Enum): - HBM = "HBM" - DDR = "DDR" - SSD = "SSD" - - def make_batch_and_iterator(is_training, feature_spec_list=None, use_timestamp=False, dump_graph=False, batch_number=100): dataset = generate_dataset(cfg, use_timestamp=use_timestamp, batch_number=batch_number) diff --git a/examples/dlrm/model/config.py b/examples/dlrm/model/config.py index 78115d61..c30a22d4 100644 --- a/examples/dlrm/model/config.py +++ b/examples/dlrm/model/config.py @@ -21,6 +21,8 @@ import tensorflow as tf from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig from npu_bridge.estimator.npu.npu_config import NPURunConfig +from mx_rec.constants.constants import CacheModeEnum + SSD_DATA_PATH = ["ssd_data"] @@ -90,12 +92,6 @@ class LearningRateScheduler: return lr_dense, lr_sparse -class CacheModeEnum(enum.Enum): - HBM = "HBM" - DDR = "DDR" - SSD = "SSD" - - class Config: def __init__(self, ): self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index becba0ab..fa34fddc 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - +import sys from enum import Enum import numpy as np @@ -26,6 +26,12 @@ ASCEND_TIMESTAMP = "ASCEND_TIMESTAMP" ASCEND_SPARSE_LOOKUP_LOCAL_EMB = "ASCEND_SPARSE_LOOKUP_LOCAL_EMB" EMPTY_STR = "" +# default emb memory size for hbm、ddr、ssd +DEFAULT_DEVICE_CACHE_MEMORY_SIZE = 2 * 12024 * 1024 * 1024 +DEFAULT_HOST_CACHE_MEMORY_SIZE = 20 * 12024 * 1024 * 1024 +DEFAULT_SSD_CACHE_MEMORY_SIZE = sys.maxsize + + # 获取ConfigInitializer对象实例失败提示信息 GET_CONFIG_INSTANCE_ERR_MSG = "Please init the environment for mx_rec at first." 
@@ -142,6 +148,12 @@ class EnvOption(Enum): OMPI_COMM_WORLD_RANK = "OMPI_COMM_WORLD_RANK" +class CacheModeEnum(Enum): + HBM = "HBM" + DDR = "DDR" + SSD = "SSD" + + class DataName(Enum): KEY = "key" EMBEDDING = "embedding" diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index 348ab9d6..51c1231d 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -16,7 +16,8 @@ # ============================================================================== import os -from typing import Optional, Union +import psutil +from typing import Optional, Union, List import tensorflow as tf from tensorflow import Tensor @@ -28,7 +29,9 @@ from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.core.emb.emb_factory import HBMDynamicSparseEmbeddingFactory, HBMSparseEmbeddingFactory, \ ExternalStorageSparseEmbeddingFactory -from mx_rec.constants.constants import MAX_INT32, All2allGradientsOp, MAX_VOCABULARY_SIZE, MAX_DEVICE_VOCABULARY_SIZE +from mx_rec.constants.constants import (MAX_INT32, All2allGradientsOp, MAX_VOCABULARY_SIZE, MAX_DEVICE_VOCABULARY_SIZE, + CacheModeEnum, DEFAULT_DEVICE_CACHE_MEMORY_SIZE, DEFAULT_HOST_CACHE_MEMORY_SIZE, + DEFAULT_SSD_CACHE_MEMORY_SIZE) from mx_rec.graph.constants import AnchorIteratorOp from mx_rec.util.initialize import ConfigInitializer from mx_rec.validator.validator import ClassValidator, StringValidator, SSDFeatureValidator, \ @@ -51,19 +54,19 @@ from mx_rec.util.log import logger ("host_vocabulary_size", IntValidator, {"min_value": 0, "max_value": MAX_VOCABULARY_SIZE}, ["check_value"]), ("ssd_vocabulary_size", IntValidator, {"min_value": 0, "max_value": MAX_VOCABULARY_SIZE}, ["check_value"]), ("ssd_data_path", ClassValidator, {"classes": (list, tuple)}), - ("is_save", ClassValidator, {"classes": (bool, )}), + ("is_save", ClassValidator, {"classes": (bool,)}), ("init_param", FloatValidator, {"min_value": -10, "max_value": 10}, ["check_value"]), ("all2all_gradients_op", OptionValidator, {"options": [i.value for i in list(All2allGradientsOp)]}), ("value_dtype", OptionValidator, {"options": [tf.float32]}), ("shard_num", IntValidator, {"min_value": 1, "max_value": 8192}, ["check_value"]), - ("fusion_optimizer_var", ClassValidator, {"classes": (bool, )}), + ("fusion_optimizer_var", ClassValidator, {"classes": (bool,)}), ("hashtable_threshold", IntValidator, {"min_value": 0, "max_value": MAX_INT32}, ["check_value"]) ]) def create_table(key_dtype, dim, name, emb_initializer, device_vocabulary_size=1, host_vocabulary_size=0, ssd_vocabulary_size=0, - ssd_data_path=(os.getcwd(), ), + ssd_data_path=(os.getcwd(),), is_save=True, init_param=1., all2all_gradients_op=All2allGradientsOp.SUM_GRADIENTS.value, @@ -91,24 +94,28 @@ def create_table(key_dtype, dim, name, emb_initializer, """ name = fix_invalid_table_name(name) + voc_size_list = [device_vocabulary_size, host_vocabulary_size, ssd_vocabulary_size] + if check_and_set_default_voc_size(voc_size_list, dim): + raise ValueError("voc_size_lis does not fit this cache mode") + config = dict(key_dtype=key_dtype, embedding_size=dim, table_name=name, emb_initializer=emb_initializer, - device_vocabulary_size=device_vocabulary_size, host_vocabulary_size=host_vocabulary_size, - ssd_vocabulary_size=ssd_vocabulary_size, ssd_data_path=ssd_data_path, + device_vocabulary_size=voc_size_list[0], host_vocabulary_size=voc_size_list[1], + ssd_vocabulary_size=voc_size_list[2], ssd_data_path=ssd_data_path, init_param=init_param, is_save=is_save, 
all2all_gradients_op=all2all_gradients_op) # 动态扩容 if ConfigInitializer.get_instance().use_dynamic_expansion: return HBMDynamicSparseEmbeddingFactory().create_embedding(config) # DDR or SSD - if host_vocabulary_size > 0: + if voc_size_list[1] > 0: return ExternalStorageSparseEmbeddingFactory().create_embedding(config) # HBM return HBMSparseEmbeddingFactory().create_embedding(config) @para_checker_decorator(check_option_list=[ - ("hashtable", ClassValidator, {"classes": (BaseSparseEmbedding, )}), + ("hashtable", ClassValidator, {"classes": (BaseSparseEmbedding,)}), ("ids", ClassValidator, {"classes": (FeatureSpec, tf.Tensor)}), - ("is_train", ClassValidator, {"classes": (bool, )}), + ("is_train", ClassValidator, {"classes": (bool,)}), ("send_count", ClassValidator, {"classes": (int, type(None))}), ("send_count", OptionalIntValidator, {"min_value": 1, "max_value": MAX_INT32}, ["check_value"]), ("name", ClassValidator, {"classes": (str, type(None))}), @@ -116,7 +123,7 @@ def create_table(key_dtype, dim, name, emb_initializer, ("modify_graph", ClassValidator, {"classes": (bool, type(None))}), ("batch", ClassValidator, {"classes": (dict, type(None))}), ("access_and_evict_config", ClassValidator, {"classes": (dict, type(None))}), - ("is_grad", ClassValidator, {"classes": (bool, )}), + ("is_grad", ClassValidator, {"classes": (bool,)}), ("serving_default_value", ClassValidator, {"classes": (tf.Tensor, type(None))}) ]) def sparse_lookup(hashtable: BaseSparseEmbedding, @@ -201,3 +208,34 @@ def mark_orphan_lookup_key(lookup_key: Tensor) -> Tensor: logger.info('Mark orphan lookup key %s as %s.', lookup_key, marked_lookup_key) return marked_lookup_key + + +def check_and_set_default_voc_size(voc_size_list: List[int], dim: int) -> bool: + if ConfigInitializer.get_instance().use_dynamic_expansion: + voc_size_list[1] = 0 + voc_size_list[2] = 0 + return True + cache_mode = os.getenv("CACHE_MODE") + if cache_mode is None and voc_size_list[0] <= 1: # no cache mode, no use_dynamic_expansion, must input dev-voc + return False + if cache_mode is None and voc_size_list[1] == 0: # no cache mode, dev-voc not None, use HBM + return True + if cache_mode is None and voc_size_list[2] == 0: # no cache mode, dev-voc/host-voc not None, use DDR + return True + if cache_mode is None: # no cache mode, dev-voc/host-voc/ssd-voc not None, use SSD + return True + + if cache_mode not in [mode.value for mode in CacheModeEnum]: + return False + if cache_mode == CacheModeEnum.HBM.value and (voc_size_list[1] > 0 or voc_size_list[2]) > 0: + return False + if cache_mode == CacheModeEnum.DDR.value and voc_size_list[2] > 0: + return False + if voc_size_list[0] == 1: + voc_size_list[0] = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim / 4) # float32 4 bytes + if (cache_mode == CacheModeEnum.DDR.value or cache_mode == CacheModeEnum.SSD.value) and voc_size_list[1] == 0: + sys_mem = psutil.virtual_memory().total / dim / 4 # float32 4 bytes + voc_size_list[1] = sys_mem if sys_mem is not None else int(DEFAULT_HOST_CACHE_MEMORY_SIZE / dim / 4) + if cache_mode == CacheModeEnum.SSD.value and voc_size_list[2] == 0: + voc_size_list[2] = DEFAULT_SSD_CACHE_MEMORY_SIZE + return True -- Gitee From 6512fca2b5f49505891d37b39a1b1c50b71344a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 6 Jun 2024 16:10:21 +0800 Subject: [PATCH 200/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9=E6=A8=A1?= 
=?UTF-8?q?=E5=BC=8F=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index 51c1231d..206b6af3 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -95,7 +95,7 @@ def create_table(key_dtype, dim, name, emb_initializer, name = fix_invalid_table_name(name) voc_size_list = [device_vocabulary_size, host_vocabulary_size, ssd_vocabulary_size] - if check_and_set_default_voc_size(voc_size_list, dim): + if not check_and_set_default_voc_size(voc_size_list, dim): raise ValueError("voc_size_lis does not fit this cache mode") config = dict(key_dtype=key_dtype, embedding_size=dim, table_name=name, emb_initializer=emb_initializer, -- Gitee From 74c1fe6adfb0bfbfd9141c0fc28ad9e33ca3628b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 6 Jun 2024 17:28:00 +0800 Subject: [PATCH 201/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/embedding.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index 206b6af3..872491da 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -94,8 +94,12 @@ def create_table(key_dtype, dim, name, emb_initializer, """ name = fix_invalid_table_name(name) + if isinstance(dim, tf.TensorShape): + dim_bytes = dim.as_list()[0] * 4 # float32 4 bytes + else: + dim_bytes = dim * 4 # float32 4 bytes voc_size_list = [device_vocabulary_size, host_vocabulary_size, ssd_vocabulary_size] - if not check_and_set_default_voc_size(voc_size_list, dim): + if not check_and_set_default_voc_size(voc_size_list, dim_bytes): raise ValueError("voc_size_lis does not fit this cache mode") config = dict(key_dtype=key_dtype, embedding_size=dim, table_name=name, emb_initializer=emb_initializer, @@ -210,7 +214,7 @@ def mark_orphan_lookup_key(lookup_key: Tensor) -> Tensor: return marked_lookup_key -def check_and_set_default_voc_size(voc_size_list: List[int], dim: int) -> bool: +def check_and_set_default_voc_size(voc_size_list: List[int], dim_bytes: int) -> bool: if ConfigInitializer.get_instance().use_dynamic_expansion: voc_size_list[1] = 0 voc_size_list[2] = 0 @@ -232,10 +236,12 @@ def check_and_set_default_voc_size(voc_size_list: List[int], dim: int) -> bool: if cache_mode == CacheModeEnum.DDR.value and voc_size_list[2] > 0: return False if voc_size_list[0] == 1: - voc_size_list[0] = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim / 4) # float32 4 bytes + voc_size_list[0] = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim_bytes) if (cache_mode == CacheModeEnum.DDR.value or cache_mode == CacheModeEnum.SSD.value) and voc_size_list[1] == 0: - sys_mem = psutil.virtual_memory().total / dim / 4 # float32 4 bytes - voc_size_list[1] = sys_mem if sys_mem is not None else int(DEFAULT_HOST_CACHE_MEMORY_SIZE / dim / 4) + sys_voc = int(psutil.virtual_memory().total * 0.8 / dim_bytes) # max host mem equal (0.8 * sys mem) + default_host_voc_size = int(DEFAULT_HOST_CACHE_MEMORY_SIZE / dim_bytes) + max_host_voc_size = 
MAX_VOCABULARY_SIZE if (sys_voc is not None and sys_voc > MAX_VOCABULARY_SIZE) else sys_voc + voc_size_list[1] = max_host_voc_size if sys_voc is not None else default_host_voc_size if cache_mode == CacheModeEnum.SSD.value and voc_size_list[2] == 0: voc_size_list[2] = DEFAULT_SSD_CACHE_MEMORY_SIZE return True -- Gitee From 6620de17237f8396987c5f4a262ced42b2d2f2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 6 Jun 2024 18:47:45 +0800 Subject: [PATCH 202/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 6 +++--- mx_rec/core/embedding.py | 14 +++++--------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index fa34fddc..fd27fc27 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -27,8 +27,8 @@ ASCEND_SPARSE_LOOKUP_LOCAL_EMB = "ASCEND_SPARSE_LOOKUP_LOCAL_EMB" EMPTY_STR = "" # default emb memory size for hbm、ddr、ssd -DEFAULT_DEVICE_CACHE_MEMORY_SIZE = 2 * 12024 * 1024 * 1024 -DEFAULT_HOST_CACHE_MEMORY_SIZE = 20 * 12024 * 1024 * 1024 +DEFAULT_DEVICE_CACHE_MEMORY_SIZE = 2 * 1024 * 1024 * 1024 +DEFAULT_HOST_CACHE_MEMORY_SIZE = 40 * 1024 * 1024 * 1024 DEFAULT_SSD_CACHE_MEMORY_SIZE = sys.maxsize @@ -74,7 +74,7 @@ DEFAULT_EVICT_TIME_INTERVAL = 60 * 60 * 24 TRAIN_CHANNEL_ID = 0 EVAL_CHANNEL_ID = 1 HASHTABLE_COLLECTION_NAME_LENGTH = 30 -MAX_VOCABULARY_SIZE = 10**10 +MAX_VOCABULARY_SIZE = 10**9 MAX_DEVICE_VOCABULARY_SIZE = 10 ** 9 # RANK INFO diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index 872491da..0422f893 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -16,7 +16,6 @@ # ============================================================================== import os -import psutil from typing import Optional, Union, List import tensorflow as tf @@ -94,10 +93,7 @@ def create_table(key_dtype, dim, name, emb_initializer, """ name = fix_invalid_table_name(name) - if isinstance(dim, tf.TensorShape): - dim_bytes = dim.as_list()[0] * 4 # float32 4 bytes - else: - dim_bytes = dim * 4 # float32 4 bytes + dim_bytes = dim.as_list()[0] * 4 if isinstance(dim, tf.TensorShape) else dim * 4 # float32 4 bytes voc_size_list = [device_vocabulary_size, host_vocabulary_size, ssd_vocabulary_size] if not check_and_set_default_voc_size(voc_size_list, dim_bytes): raise ValueError("voc_size_lis does not fit this cache mode") @@ -236,12 +232,12 @@ def check_and_set_default_voc_size(voc_size_list: List[int], dim_bytes: int) -> if cache_mode == CacheModeEnum.DDR.value and voc_size_list[2] > 0: return False if voc_size_list[0] == 1: - voc_size_list[0] = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim_bytes) + default_device_voc_size = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim_bytes) + voc_size_list[0] = default_device_voc_size if default_device_voc_size < MAX_VOCABULARY_SIZE \ + else MAX_VOCABULARY_SIZE if (cache_mode == CacheModeEnum.DDR.value or cache_mode == CacheModeEnum.SSD.value) and voc_size_list[1] == 0: - sys_voc = int(psutil.virtual_memory().total * 0.8 / dim_bytes) # max host mem equal (0.8 * sys mem) default_host_voc_size = int(DEFAULT_HOST_CACHE_MEMORY_SIZE / dim_bytes) - max_host_voc_size = MAX_VOCABULARY_SIZE if (sys_voc is not None and 
sys_voc > MAX_VOCABULARY_SIZE) else sys_voc - voc_size_list[1] = max_host_voc_size if sys_voc is not None else default_host_voc_size + voc_size_list[1] = default_host_voc_size if default_host_voc_size < MAX_VOCABULARY_SIZE else MAX_VOCABULARY_SIZE if cache_mode == CacheModeEnum.SSD.value and voc_size_list[2] == 0: voc_size_list[2] = DEFAULT_SSD_CACHE_MEMORY_SIZE return True -- Gitee From 6a275d1dcef4773fc5426a3f92d8de55d3becbe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 6 Jun 2024 18:48:03 +0800 Subject: [PATCH 203/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index fd27fc27..60985115 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -74,7 +74,7 @@ DEFAULT_EVICT_TIME_INTERVAL = 60 * 60 * 24 TRAIN_CHANNEL_ID = 0 EVAL_CHANNEL_ID = 1 HASHTABLE_COLLECTION_NAME_LENGTH = 30 -MAX_VOCABULARY_SIZE = 10**9 +MAX_VOCABULARY_SIZE = 10**8 MAX_DEVICE_VOCABULARY_SIZE = 10 ** 9 # RANK INFO -- Gitee From 99f8838efc2b0c2abd49bb8c4fd036c76f6142ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 6 Jun 2024 19:49:54 +0800 Subject: [PATCH 204/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index 60985115..fd27fc27 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -74,7 +74,7 @@ DEFAULT_EVICT_TIME_INTERVAL = 60 * 60 * 24 TRAIN_CHANNEL_ID = 0 EVAL_CHANNEL_ID = 1 HASHTABLE_COLLECTION_NAME_LENGTH = 30 -MAX_VOCABULARY_SIZE = 10**8 +MAX_VOCABULARY_SIZE = 10**9 MAX_DEVICE_VOCABULARY_SIZE = 10 ** 9 # RANK INFO -- Gitee From 4d37925e6a15d0c387220498572c569610bd252e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 6 Jun 2024 20:20:58 +0800 Subject: [PATCH 205/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91issure=20bug=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/embedding.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index 0422f893..8c12eb4c 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -95,8 +95,7 @@ def create_table(key_dtype, dim, name, emb_initializer, dim_bytes = dim.as_list()[0] * 4 if isinstance(dim, tf.TensorShape) else dim * 4 # float32 4 bytes voc_size_list = [device_vocabulary_size, host_vocabulary_size, ssd_vocabulary_size] - if not check_and_set_default_voc_size(voc_size_list, dim_bytes): - raise ValueError("voc_size_lis does not fit this cache mode") + check_and_set_default_voc_size(voc_size_list, dim_bytes) config = 
dict(key_dtype=key_dtype, embedding_size=dim, table_name=name, emb_initializer=emb_initializer,
                   device_vocabulary_size=voc_size_list[0], host_vocabulary_size=voc_size_list[1],
@@ -210,27 +209,29 @@ def mark_orphan_lookup_key(lookup_key: Tensor) -> Tensor:
     return marked_lookup_key
 
 
-def check_and_set_default_voc_size(voc_size_list: List[int], dim_bytes: int) -> bool:
+def check_and_set_default_voc_size(voc_size_list: List[int], dim_bytes: int):
     if ConfigInitializer.get_instance().use_dynamic_expansion:
         voc_size_list[1] = 0
         voc_size_list[2] = 0
-        return True
+        return
     cache_mode = os.getenv("CACHE_MODE")
-    if cache_mode is None and voc_size_list[0] <= 1:  # no cache mode, no use_dynamic_expansion, must input dev-voc
-        return False
-    if cache_mode is None and voc_size_list[1] == 0:  # no cache mode, dev-voc not None, use HBM
-        return True
-    if cache_mode is None and voc_size_list[2] == 0:  # no cache mode, dev-voc/host-voc not None, use DDR
-        return True
-    if cache_mode is None:  # no cache mode, dev-voc/host-voc/ssd-voc not None, use SSD
-        return True
+    if not cache_mode and voc_size_list[0] <= 1:
+        raise ValueError("device_vocabulary_size is required when neither CACHE_MODE nor use_dynamic_expansion is set")
+    if not cache_mode and voc_size_list[1] == 0 and voc_size_list[2] == 0:  # no cache mode, dev-voc not None, use HBM
+        return
+    if not cache_mode and voc_size_list[1] == 0 and voc_size_list[2] > 0:
+        raise ValueError("when CACHE_MODE is unset and host_vocabulary_size is 0, ssd_vocabulary_size must be 0 too")
+    if not cache_mode and voc_size_list[2] == 0:  # no cache mode, dev-voc/host-voc not None, use DDR
+        return
+    if not cache_mode:  # no cache mode, dev-voc/host-voc/ssd-voc not None, use SSD
+        return
 
     if cache_mode not in [mode.value for mode in CacheModeEnum]:
-        return False
-    if cache_mode == CacheModeEnum.HBM.value and (voc_size_list[1] > 0 or voc_size_list[2]) > 0:
-        return False
+        raise ValueError("CACHE_MODE must be one of HBM, DDR or SSD")
+    if cache_mode == CacheModeEnum.HBM.value and (voc_size_list[1] > 0 or voc_size_list[2] > 0):
+        raise ValueError("in HBM cache mode, host_vocabulary_size and ssd_vocabulary_size must be 0")
     if cache_mode == CacheModeEnum.DDR.value and voc_size_list[2] > 0:
-        return False
+        raise ValueError("in DDR cache mode, ssd_vocabulary_size must be 0")
     if voc_size_list[0] == 1:
         default_device_voc_size = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim_bytes)
         voc_size_list[0] = default_device_voc_size if default_device_voc_size < MAX_VOCABULARY_SIZE \
@@ -240,4 +241,4 @@ def check_and_set_default_voc_size(voc_size_list: List[int], dim_bytes: int) ->
         voc_size_list[1] = default_host_voc_size if default_host_voc_size < MAX_VOCABULARY_SIZE else MAX_VOCABULARY_SIZE
     if cache_mode == CacheModeEnum.SSD.value and voc_size_list[2] == 0:
         voc_size_list[2] = DEFAULT_SSD_CACHE_MEMORY_SIZE
-    return True
+    return
-- 
Gitee

From c864ba8124b8f45d3fc0540bd4e60cba9e40134c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Thu, 6 Jun 2024 20:42:03 +0800
Subject: [PATCH 206/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?=
 =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91issue=20bug=20fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tests/mx_rec/core/test_embedding.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/mx_rec/core/test_embedding.py b/tests/mx_rec/core/test_embedding.py
index 5bc762f4..509b9ae7 100644
--- a/tests/mx_rec/core/test_embedding.py
+++ b/tests/mx_rec/core/test_embedding.py
@@ -88,7 +88,8 @@ class TestCreateTableFunc(unittest.TestCase):
test_table = create_table(key_dtype=tf.int64, dim=8, name='test_table', - emb_initializer=tf.compat.v1.truncated_normal_initializer()) + emb_initializer=tf.compat.v1.truncated_normal_initializer(), + device_vocabulary_size=8) self.assertIsInstance(test_table, HBMSparseEmbedding) @mock.patch.multiple("mx_rec.core.emb.base_sparse_embedding", @@ -120,6 +121,7 @@ class TestCreateTableFunc(unittest.TestCase): dim=8, name='test_table', emb_initializer=tf.compat.v1.truncated_normal_initializer(), + device_vocabulary_size=8, host_vocabulary_size=8) self.assertIsInstance(test_table, ExternalStorageSparseEmbedding) -- Gitee From 0af09c050417adb28125ba6ecebe4cabc506cd36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Thu, 6 Jun 2024 14:08:39 +0000 Subject: [PATCH 207/302] =?UTF-8?q?!177=20WideDeep=E6=A8=A1=E5=9E=8B?= =?UTF-8?q?=EF=BC=9A=E7=89=88=E6=9C=AC=E6=A3=80=E6=9F=A5cleancode=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=20*=20WideDeep=E6=A8=A1=E5=9E=8B=EF=BC=9A=20=E7=89=88?= =?UTF-8?q?=E6=9C=AC=E6=A3=80=E6=9F=A5cleancode=E4=BF=AE=E6=94=B93?= =?UTF-8?q?=E8=AF=84=E8=AE=BA=E6=84=8F=E8=A7=81=E4=BF=AE=E6=94=B9=20*=20Wi?= =?UTF-8?q?deDeep=E6=A8=A1=E5=9E=8B=EF=BC=9A=E7=89=88=E6=9C=AC=E6=A3=80?= =?UTF-8?q?=E6=9F=A5cleancode=E4=BF=AE=E6=94=B93=20*=20WideDeep=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=EF=BC=9A=E7=89=88=E6=9C=AC=E6=A3=80=E6=9F=A5cleancode?= =?UTF-8?q?=E4=BF=AE=E6=94=B92=20*=20WideDeep=E6=A8=A1=E5=9E=8B=EF=BC=9A?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E6=A3=80=E6=9F=A5cleancode=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/criteo.py | 45 +++++++++++++++++++++------ examples/WideDeep/model/main_mxrec.py | 8 +++-- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py index f9624d21..617c76f6 100644 --- a/examples/WideDeep/criteo.py +++ b/examples/WideDeep/criteo.py @@ -1,3 +1,19 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
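
Stepping back from the diff for a moment: across patches 199 to 206 above, create_table's storage-tier selection converges on a small set of rules driven by the CACHE_MODE environment variable and the three vocabulary sizes. A minimal, self-contained restatement of those rules (illustrative only; the real entry point is check_and_set_default_voc_size in mx_rec/core/embedding.py):

    def storage_tier(cache_mode, dev_voc, host_voc, ssd_voc):
        # Mirrors the acceptance rules patch 205 settles on.
        if not cache_mode:
            if dev_voc <= 1:
                raise ValueError("device vocabulary is required when CACHE_MODE is unset")
            if host_voc == 0 and ssd_voc > 0:
                raise ValueError("an SSD vocabulary requires a host vocabulary")
            return "HBM" if host_voc == 0 else ("DDR" if ssd_voc == 0 else "SSD")
        if cache_mode not in ("HBM", "DDR", "SSD"):
            raise ValueError("CACHE_MODE must be one of HBM, DDR or SSD")
        if cache_mode == "HBM" and (host_voc > 0 or ssd_voc > 0):
            raise ValueError("HBM mode forbids host and SSD vocabularies")
        if cache_mode == "DDR" and ssd_voc > 0:
            raise ValueError("DDR mode forbids an SSD vocabulary")
        return cache_mode

    assert storage_tier(None, 8, 0, 0) == "HBM"
    assert storage_tier(None, 8, 8, 0) == "DDR"
    assert storage_tier("SSD", 8, 8, 8) == "SSD"

Vocabulary sizes left at their defaults are then back-filled from the byte budgets noted earlier, capped at MAX_VOCABULARY_SIZE.
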
+# ============================================================================== + import os import stat import pickle @@ -58,7 +74,10 @@ def split_byline_count(filename, count, sub_dir_name): sub = make_sub_file(buf, head, filename, sub_dir_name, sub) buf = [] if len(buf) != 0: - make_sub_file(buf, head, filename, sub_dir_name, sub) + try: + make_sub_file(buf, head, filename, sub_dir_name, sub) + except FileNotFoundError as err: + raise FileNotFoundError("please check the filename of data") from err finally: f.close() @@ -170,10 +189,11 @@ def convert_input2tfrd(in_file_path, out_file_path): dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1) sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1) label = np.array(label_list, dtype=np.int64).reshape(-1) - feature_dict = {"dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), - "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), - "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) - } + feature_dict = { + "dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), + "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), + "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) + } example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) return example @@ -183,7 +203,7 @@ def convert_input2tfrd(in_file_path, out_file_path): with open(in_file_path, encoding='utf-8') as file_in: - for i, line in tqdm(enumerate(file_in)): + for _, line in tqdm(enumerate(file_in)): line = line.strip('\n') items = line.split('\t') @@ -226,13 +246,18 @@ if __name__ == '__main__': data_df[dense_features] = data_df[dense_features].fillna(0) # sparse feature: mapping for col in sparse_features: - data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) + try: + data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) + except KeyError as er: + raise KeyError("Feature {} not found in dataset".format(col)) from er # dense feature: Bin continuous data into intervals. 
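
The offset step that follows (slot_size_array plus np.cumsum) is the usual trick for packing many categorical slots into a single id space: each slot's raw ids are shifted by the cumulative size of all preceding slots, so every slot owns a disjoint range of the shared embedding table. A toy illustration with a three-slot subset of the array below:

    import numpy as np

    slot_size_array = [1001, 1462, 585]                       # per-slot cardinalities (subset)
    offset_size_list = np.cumsum([0] + slot_size_array[:-1])  # -> [0, 1001, 2463]

    raw_ids = np.array([[7, 3, 9]])       # one sample, one id per slot
    global_ids = raw_ids + offset_size_list
    print(global_ids)                     # [[   7 1004 2472]]
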
data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) # add offsets - slot_size_array = [1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, - 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, - 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573] + slot_size_array = [ + 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, + 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, + 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573 + ] offset_size_list = np.cumsum([0] + slot_size_array[:-1]) for col_index in range(1, len(offset_size_list) + 1): data_df.iloc[:, col_index] += offset_size_list[col_index - 1] diff --git a/examples/WideDeep/model/main_mxrec.py b/examples/WideDeep/model/main_mxrec.py index b1d77698..0a7c2f87 100644 --- a/examples/WideDeep/model/main_mxrec.py +++ b/examples/WideDeep/model/main_mxrec.py @@ -284,10 +284,12 @@ def create_feature_spec_list(use_timestamp=False): access_threshold = 1000 eviction_threshold = 180 - feature_spec_list = [FeatureSpec("sparse_feature", table_name="wide_embeddings", batch_size=cfg.batch_size, + feature_spec_list = [ + FeatureSpec("sparse_feature", table_name="wide_embeddings", batch_size=cfg.batch_size, access_threshold=access_threshold, eviction_threshold=eviction_threshold), - FeatureSpec("sparse_feature", table_name="deep_embeddings", batch_size=cfg.batch_size, - access_threshold=access_threshold, eviction_threshold=eviction_threshold)] + FeatureSpec("sparse_feature", table_name="deep_embeddings", batch_size=cfg.batch_size, + access_threshold=access_threshold, eviction_threshold=eviction_threshold) + ] if use_multi_lookup: feature_spec_list.extend([FeatureSpec("sparse_feature", table_name="wide_embeddings", -- Gitee From 747c21b273a76b42a9addd37c1dd9462bbff1dcd Mon Sep 17 00:00:00 2001 From: sihaixianyu Date: Fri, 7 Jun 2024 10:40:29 +0800 Subject: [PATCH 208/302] =?UTF-8?q?[CleanCode]=20=E6=B8=85=E7=90=86?= =?UTF-8?q?=E6=94=B9=E5=9B=BE=E9=83=A8=E5=88=86=E7=9A=84=E9=AD=94=E6=B3=95?= =?UTF-8?q?=E5=AD=97=E7=AC=A6=E4=B8=B2=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/asc/swap_args.py | 1 + mx_rec/graph/modifier.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/mx_rec/core/asc/swap_args.py b/mx_rec/core/asc/swap_args.py index 1d206b95..3157e1e0 100644 --- a/mx_rec/core/asc/swap_args.py +++ b/mx_rec/core/asc/swap_args.py @@ -22,6 +22,7 @@ from enum import Enum class SwapDataType(Enum): CONFIG = "config" CONTROL = "control" + CONTROL_OPS = "control_ops" def singleton(cls): diff --git a/mx_rec/graph/modifier.py b/mx_rec/graph/modifier.py index 15c240e5..97205481 100644 --- a/mx_rec/graph/modifier.py +++ b/mx_rec/graph/modifier.py @@ -38,7 +38,7 @@ from mx_rec.constants.constants import ( from mx_rec.core.asc.feature_spec import FeatureSpec from mx_rec.core.asc.helper import get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline -from mx_rec.core.asc.swap_args import SwapArgs +from mx_rec.core.asc.swap_args import SwapArgs, SwapDataType from mx_rec.core.asc.build_graph import SwapInfo from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.graph.merge_lookup import do_merge_lookup @@ -255,17 +255,17 @@ class _GraphModifier: table_instance, variable_and_slot_list, swap_args_dict["swap_info"], 
channel_id) # gather for id_offset need to be executed after swap_op swap_control_dict = swap_args.swap_control_dict[table_instance.table_name][channel_id] - if "control_ops" not in swap_control_dict: + if SwapDataType.CONTROL_OPS.value not in swap_control_dict: raise ValueError("swap control missing key [control_ops] in modify_graph_for_asc") - control_ops = swap_control_dict["control_ops"] + control_ops = swap_control_dict[SwapDataType.CONTROL_OPS.value] utils.replace_anchor_control(self._full_graph, control_ops, swap_op) if is_training and slot_num > 1: # gather for slot need to be executed after swap_op slot_control_dict = swap_args.slot_control_dict[table_instance.variable] - if "control_ops" not in slot_control_dict: + if SwapDataType.CONTROL_OPS.value not in slot_control_dict: raise ValueError("slot control missing key [control_ops] in modify_graph_for_asc") - slot_control_ops = slot_control_dict["control_ops"] + slot_control_ops = slot_control_dict[SwapDataType.CONTROL_OPS.value] utils.replace_anchor_control(self._full_graph, slot_control_ops, swap_op) def _generate_get_next_op_specs(self, cutting_point_list: List[Tensor]) -> Dict[Tensor, _AnchorRecord]: @@ -728,8 +728,8 @@ def _get_variable_and_slot_list(each_var, slot_num, table_name, channel_id): return variable_and_slot_list -def _get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: list, - swap_info: SwapInfo, channel_id: int) -> list: +def _get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: list, + swap_info: SwapInfo, channel_id: int) -> list: """ Get swap op. :param table_instance: BaseSparseEmbedding @@ -740,10 +740,10 @@ def _get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: """ if table_instance.is_hbm: return [tf.no_op()] - + if len(variable_and_slot_list) == 0: raise RuntimeError("When enable emb_transfer, optimizer should have slots") - + use_static = ConfigInitializer.get_instance().use_static max_lookup_vec_size = None if use_static: @@ -756,7 +756,7 @@ def _get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: output_shapes=[[max_lookup_vec_size, table_instance.ext_emb_size]], channel_name=f'{table_instance.table_name}_h2d_all')[0] logger.debug("h2d_emb shape: %s", h2d_emb) - + swap_out_pos = swap_info.swap_out_pos swap_in_pos = swap_info.swap_in_pos if use_static: @@ -766,14 +766,14 @@ def _get_swap_info(table_instance: BaseSparseEmbedding, variable_and_slot_list: swap_outs = [tf.gather(one_table, swap_out_pos) for one_table in variable_and_slot_list] swap_out = tf.concat(swap_outs, axis=1) logger.debug('Channel %s_d2h_all was built for op outfeed.', table_instance.table_name) - + swap_out_op = npu_ops.outfeed_enqueue_op( channel_name=f'{table_instance.table_name}_d2h_all', inputs=[swap_out]) with tf.control_dependencies([swap_out_op]): nd_swap_pos = tf.expand_dims(swap_in_pos, 1) var_num = len(variable_and_slot_list) h2d_emb_split = tf.split(h2d_emb, var_num, axis=1) - + optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name( table_instance.table_name) if optimizer is None and channel_id == 1: -- Gitee From 11b42c08a357d4ea5924403daa357587f4d8b5e2 Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Tue, 4 Jun 2024 03:57:47 +0000 Subject: [PATCH 209/302] =?UTF-8?q?=E6=8A=BD=E8=B1=A1L3=E5=AD=98=E5=82=A8?= =?UTF-8?q?=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/emb_table/embedding_ddr.cpp | 4 +- 
src/core/emb_table/embedding_table.h | 2 +- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 254 +++++++++--------- src/core/hybrid_mgmt/hybrid_mgmt.h | 44 +-- .../cache_manager.cpp | 137 +++++----- .../{ssd_cache => l3_storage}/cache_manager.h | 37 +-- src/core/l3_storage/l3_storage.cpp | 69 +++++ src/core/l3_storage/l3_storage.h | 63 +++++ .../{ssd_cache => l3_storage}/lfu_cache.cpp | 0 .../{ssd_cache => l3_storage}/lfu_cache.h | 0 .../preprocess_mapper.h | 56 ++-- src/core/ssd_engine/ssd_engine.cpp | 2 +- src/core/ssd_engine/ssd_engine.h | 6 +- src/tests/ssd_cache/cache_manager_test.cpp | 46 ++-- src/tests/ssd_cache/lfu_cache_test.cpp | 2 +- 15 files changed, 438 insertions(+), 284 deletions(-) rename src/core/{ssd_cache => l3_storage}/cache_manager.cpp (61%) rename src/core/{ssd_cache => l3_storage}/cache_manager.h (71%) create mode 100644 src/core/l3_storage/l3_storage.cpp create mode 100644 src/core/l3_storage/l3_storage.h rename src/core/{ssd_cache => l3_storage}/lfu_cache.cpp (100%) rename src/core/{ssd_cache => l3_storage}/lfu_cache.h (100%) rename src/core/{ssd_cache => l3_storage}/preprocess_mapper.h (54%) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 092ad0c5..3898a7da 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -18,7 +18,7 @@ See the License for the specific language governing permissions and #include "utils/logger.h" #include "utils/singleton.h" -#include "ssd_cache/cache_manager.h" +#include "l3_storage/cache_manager.h" #include "ock_ctr_common/include/error_code.h" using namespace MxRec; @@ -253,7 +253,7 @@ void EmbeddingDDR::SyncLatestEmbedding() throw std::invalid_argument(errMsg); } } - cacheManager_->UpdateSSDEmb(name, ptr, embInfo_.extEmbeddingSize, info.swapOutSSDKeys, info.swapOutSSDAddrOffs); + cacheManager_->UpdateL3StorageEmb(name, ptr, embInfo_.extEmbeddingSize, info.swapOutL3StorageKeys, info.swapOutL3StorageAddrOffs); } } diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h index 8b622194..cbf15a7a 100644 --- a/src/core/emb_table/embedding_table.h +++ b/src/core/emb_table/embedding_table.h @@ -20,7 +20,7 @@ See the License for the specific language governing permissions and #include #include "utils/common.h" -#include "ssd_cache/cache_manager.h" +#include "l3_storage/cache_manager.h" #include "file_system/file_system_handler.h" namespace MxRec { diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index b96f4eb9..973831a2 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -48,17 +48,17 @@ void HybridMgmt::InitRankInfo(RankInfo& rankInfo, const vector& embInfo // 计算训练任务涉及的所有表在DDR中需要分配的key数量 size_t totHostVocabSize = 0; - size_t totalSsdVocabSize = 0; + size_t totalL3StorageVocabSize = 0; for (const auto& emb : embInfos) { totHostVocabSize += emb.hostVocabSize; - totalSsdVocabSize += emb.ssdVocabSize; + totalL3StorageVocabSize += emb.ssdVocabSize; } // 根据DDR的key数量,配置存储模式HBM/DDR if (totHostVocabSize != 0) { rankInfo.isDDR = true; } - if (totalSsdVocabSize != 0) { + if (totalL3StorageVocabSize != 0) { rankInfo.isSSDEnabled = true; } #endif @@ -115,16 +115,18 @@ bool HybridMgmt::Initialize(RankInfo rankInfo, const vector& embInfos, KEY_PROCESS_INSTANCE->Initialize(rankInfo, embInfos, thresholdValues, seed); isRunning = true; - isSSDEnabled = rankInfo.isSSDEnabled; + isL3StorageEnabled = rankInfo.isSSDEnabled; EmbeddingMgmt::Instance()->Init(rankInfo, embInfos, 
seed); if (rankInfo.isDDR) { InitEmbeddingCache(embInfos); } - if (isSSDEnabled) { + if (isL3StorageEnabled) { cacheManager = Singleton::GetInstance(); - cacheManager->Init(embCache, mgmtEmbInfo); + // 用户可实现L3Storage接口替换SSDEngine以对接外部存储服务 + auto ssdEngine = std::make_shared(); + cacheManager->Init(embCache, mgmtEmbInfo, ssdEngine); EmbeddingMgmt::Instance()->SetCacheManagerForEmbTable(cacheManager); } isLoad = ifLoad; @@ -170,10 +172,10 @@ void HybridMgmt::Save(const string& savePath) offsetMapToSend = EmbeddingMgmt::Instance()->GetDeviceOffsets(); } - if (isSSDEnabled) { - LOG_DEBUG(MGMT + "start save SSD data"); + if (isL3StorageEnabled) { + LOG_DEBUG(MGMT + "start save L3Storage data"); auto step = GetStepFromPath(savePath); - cacheManager->SaveSSDEngine(step); + cacheManager->Save(step); } // 保存特征准入淘汰相关的数据 @@ -248,8 +250,8 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) featAdmitNEvict.LoadHistoryRecords(loadData.histRec); } - if (isSSDEnabled) { - LOG_DEBUG(MGMT + "Start host side load: ssd key freq map"); + if (isL3StorageEnabled) { + LOG_DEBUG(MGMT + "Start host side load: L3Storage key freq map"); auto step = GetStepFromPath(loadPath); cacheManager->Load(mgmtEmbInfo, step, trainKeysSet); } @@ -572,13 +574,13 @@ bool HybridMgmt::ParseKeys(int channelId, int& batchId, TaskType type) }); break; case TaskType::DDR: - if (!isSSDEnabled) { + if (!isL3StorageEnabled) { parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { ProcessEmbInfoDDR(info, remainBatch); }); } else { parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { - ProcessEmbInfoSSD(info, remainBatch); + ProcessEmbInfoL3Storage(info, remainBatch); }); } break; @@ -780,12 +782,12 @@ bool HybridMgmt::Evict() } for (const string& embName : allTableNames) { EvictKeys(embName, evictKeyMap[COMBINE_HISTORY_NAME]); - EvictSSDKeys(embName, evictKeyMap[COMBINE_HISTORY_NAME]); + EvictL3StorageKeys(embName, evictKeyMap[COMBINE_HISTORY_NAME]); } } else { for (const auto& evict : as_const(evictKeyMap)) { EvictKeys(evict.first, evict.second); - EvictSSDKeys(evict.first, evict.second); + EvictL3StorageKeys(evict.first, evict.second); } } } @@ -809,12 +811,12 @@ void HybridMgmt::EvictKeys(const string& embName, const vector& } } -void HybridMgmt::EvictSSDKeys(const string& embName, const vector& keys) const +void HybridMgmt::EvictL3StorageKeys(const string& embName, const vector& keys) const { - if (!isSSDEnabled) { + if (!isL3StorageEnabled) { return; } - cacheManager->EvictSSDEmbedding(embName, keys); + cacheManager->EvictL3StorageEmbedding(embName, keys); } int HybridMgmt::GetStepFromPath(const string& loadPath) const @@ -885,14 +887,14 @@ int64_t HybridMgmt::GetTableSize(const string& embName) const LOG_INFO(MGMT + "HBM mode, get emb:[{}] size:{}", embName, size); return size; } - int64_t ssdSize = 0; - if (mgmtRankInfo.isSSDEnabled) { - ssdSize = cacheManager->GetTableEmbeddingSize(embName); + int64_t l3StorageUsage = 0; + if (isL3StorageEnabled) { + l3StorageUsage = cacheManager->GetTableUsage(embName); } uint32_t ddrSize = embCache->GetUsage(embName); - size = static_cast(ddrSize) + ssdSize; - LOG_INFO(MGMT + "DDR/SSD mode, get emb:[{}] size:{}", embName, size); + size = static_cast(ddrSize) + l3StorageUsage; + LOG_INFO(MGMT + "DDR/L3Storage mode, get emb:[{}] size:{}", embName, size); #endif return size; } @@ -1118,7 +1120,7 @@ void HybridMgmt::EmbeddingReceiveAndUpdateDDR(int batchId, int index, const EmbI EmbeddingUpdateDDR(info, ptr, swapOutAddrs); } -void 
HybridMgmt::EmbeddingLookUpAndSendSSD(int batchId, int index, const EmbInfo& embInfo) +void HybridMgmt::EmbeddingLookUpAndSendL3Storage(int batchId, int index, const EmbInfo& embInfo) { int cvNotifyIndex = 0; if (index + 1 != EMBEDDING_THREAD_NUM) { @@ -1134,16 +1136,16 @@ void HybridMgmt::EmbeddingLookUpAndSendSSD(int batchId, int index, const EmbInfo }; vector h2dEmb; - auto isSuccess = EmbeddingLookUpSSD(info, h2dEmb); + auto isSuccess = EmbeddingLookUpL3Storage(info, h2dEmb); if (!isSuccess) { LOG_INFO("HybridMgmt is not running"); return; } - EmbeddingSendSSD(info, h2dEmb); + EmbeddingSendL3Storage(info, h2dEmb); } -void HybridMgmt::EmbeddingReceiveAndUpdateSSD(int batchId, int index, const EmbInfo& embInfo) +void HybridMgmt::EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, const EmbInfo& embInfo) { int cvNotifyIndex = 0; if (index + 1 != EMBEDDING_THREAD_NUM) { @@ -1160,9 +1162,9 @@ void HybridMgmt::EmbeddingReceiveAndUpdateSSD(int batchId, int index, const EmbI float* ptr = nullptr; vector swapOutAddrs; int64_t dims0 = 0; - EmbeddingReceiveSSD(info, ptr, swapOutAddrs, dims0); + EmbeddingReceiveL3Storage(info, ptr, swapOutAddrs, dims0); - EmbeddingUpdateSSD(info, ptr, swapOutAddrs, dims0); + EmbeddingUpdateL3Storage(info, ptr, swapOutAddrs, dims0); } @@ -1172,11 +1174,11 @@ void HybridMgmt::EmbeddingReceiveAndUpdateSSD(int batchId, int index, const EmbI /// \param channelId 通道索引(训练/推理) /// \param remainBatchOut 是否从通道获取了数据 /// \return 是否处理成功 -void HybridMgmt::ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut) +void HybridMgmt::ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBatchOut) { #ifndef GTEST TimeCost getAndSendTensorsTC; - LOG_DEBUG("ProcessEmbInfoSSD table:{}, channel:{}, batchId:{}", info.name, info.channelId, info.batchId); + LOG_DEBUG("ProcessEmbInfoL3Storage table:{}, channel:{}, batchId:{}", info.name, info.channelId, info.batchId); if (info.channelId == TRAIN_CHANNEL_ID && info.batchId == hybridMgmtBlock->maxTrainStep) { HandleReachMaxStepCase(info, remainBatchOut); @@ -1184,7 +1186,7 @@ void HybridMgmt::ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut } // 只有在每次GetUniqueKeys的时候才知道上游是否已经EOS - // 注意GetUniqueKeys与EOS关联,需要在ProcessEmbInfoSSD最先调用,如需调整位置,请参考并适配其他函数 + // 注意GetUniqueKeys与EOS关联,需要在ProcessEmbInfoL3Storage最先调用,如需调整位置,请参考并适配其他函数 // 获取GlobalUnique向量 auto uniqueKeys = GetUniqueKeys(info, remainBatchOut); if (uniqueKeys.empty()) { @@ -1215,7 +1217,7 @@ void HybridMgmt::ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut SendGlobalUniqueVec(info, uniqueKeys, restoreVecSec); - auto isNeedReturn = HandleSpecialProcessStatusSSD(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); + auto isNeedReturn = HandleSpecialProcessStatusL3Storage(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); if (isNeedReturn) { return; } @@ -1226,7 +1228,7 @@ void HybridMgmt::ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut auto &swapOutKeys = swapOutKoPair.first; auto &swapOutPos = swapOutKoPair.second; - HandleDataSwapForSSD(info, swapInKeys, swapOutKeys); + HandleDataSwapForL3Storage(info, swapInKeys, swapOutKeys); auto lastSwapInPos = lastSwapInPosMap[info.name]; lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 @@ -1238,13 +1240,13 @@ void HybridMgmt::ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut HandleEndBatchCase(info, swapInPos); - CheckLookupAddrSuccessSSD(); + CheckLookupAddrSuccessL3Storage(); if (info.channelId == TRAIN_CHANNEL_ID) { alreadyTrainOnce = true; } - 
LOG_DEBUG("ProcessEmbInfoSSD end, table:{}, batchId:{}, swapProcessTC(ms):{}, getAndSendTensorsTC(ms):{}", + LOG_DEBUG("ProcessEmbInfoL3Storage end, table:{}, batchId:{}, swapProcessTC(ms):{}, getAndSendTensorsTC(ms):{}", info.name, info.batchId, swapProcessTC.ElapsedMS(), getAndSendTensorsTC.ElapsedMS()); #endif } @@ -1286,7 +1288,7 @@ void HybridMgmt::InitDataPipelineForDDR(const string &embName) LOG_DEBUG("data pipeline for ddr init"); } -void HybridMgmt::InitDataPipelineForSSD(const string &embName, int extEmbeddingSize) +void HybridMgmt::InitDataPipelineForL3Storage(const string &embName, int extEmbeddingSize) { // 初始化公共队列 HBMSwapKeyQue[embName+SWAP_IN_STR]; @@ -1295,21 +1297,21 @@ void HybridMgmt::InitDataPipelineForSSD(const string &embName, int extEmbeddingS tableToQueueLookup[embName+SWAP_OUT_STR]; HBMSwapKeyQue[embName + ADDR_STR]; - SwapOut2SSDKeyQue[embName + SWAP_IN_STR]; - SwapOut2SSDKeyQue[embName + ADDR_STR]; - SwapOut2SSDKeyQue[embName + SWAP_OUT_STR]; + SwapOut2L3StorageKeyQue[embName + SWAP_IN_STR]; + SwapOut2L3StorageKeyQue[embName + ADDR_STR]; + SwapOut2L3StorageKeyQue[embName + SWAP_OUT_STR]; DDRSwapKeyQue[embName + SWAP_OUT_STR]; DDRSwapKeyQue[embName + SWAP_IN_STR]; - DDRSwapKeyForSSDQue[embName + SWAP_OUT_STR]; - DDRSwapKeyForSSDQue[embName + SWAP_IN_STR]; + DDRSwapKeyForL3StorageQue[embName + SWAP_OUT_STR]; + DDRSwapKeyForL3StorageQue[embName + SWAP_IN_STR]; DDRSwapAddrsQue[embName + SWAP_OUT_STR]; DDRSwapAddrsQue[embName + SWAP_IN_STR]; // 初始化lookup线程 lookUpThreads.emplace_back( std::async(std::launch::async, [=] { LookUpAddrs(embName, extEmbeddingSize); })); - LOG_DEBUG("data pipeline for ssd init"); + LOG_DEBUG("data pipeline for L3Storage init"); } void HybridMgmt::InitEmbeddingCache(const vector& embInfos) @@ -1320,8 +1322,8 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) EmbeddingMgmt::Instance()->SetHDTransferForEmbTable(hdTransfer); for (auto embInfo: embInfos) { - if (isSSDEnabled) { - InitDataPipelineForSSD(embInfo.name, embInfo.extEmbeddingSize); + if (isL3StorageEnabled) { + InitDataPipelineForL3Storage(embInfo.name, embInfo.extEmbeddingSize); } else { InitDataPipelineForDDR(embInfo.name); } @@ -1349,13 +1351,13 @@ void HybridMgmt::JoinEmbeddingCacheThread() for (auto &p : HBMSwapKeyQue) { p.second.DestroyQueue(); } - for (auto &p : SwapOut2SSDKeyQue) { + for (auto &p : SwapOut2L3StorageKeyQue) { p.second.DestroyQueue(); } for (auto &p : DDRSwapKeyQue) { p.second.DestroyQueue(); } - for (auto &p : DDRSwapKeyForSSDQue) { + for (auto &p : DDRSwapKeyForL3StorageQue) { p.second.DestroyQueue(); } for (auto &p : DDRSwapAddrsQue) { @@ -1585,10 +1587,10 @@ void HybridMgmt::CreateEmbeddingLookUpAndSendThread(int index, const EmbInfo& em int cur_batch_id = lookUpAndSendTableBatchMap[embInfo.name]; lookUpAndSendTableBatchMap[embInfo.name]++; lookUpAndSendBatchIdMtx.unlock(); - if (!isSSDEnabled) { + if (!isL3StorageEnabled) { EmbeddingLookUpAndSendDDR(cur_batch_id, index, embInfo); } else { - EmbeddingLookUpAndSendSSD(cur_batch_id, index, embInfo); + EmbeddingLookUpAndSendL3Storage(cur_batch_id, index, embInfo); } } else { lookUpAndSendBatchIdMtx.unlock(); @@ -1609,10 +1611,10 @@ void HybridMgmt::CreateEmbeddingReceiveAndUpdateThread(int index, const EmbInfo& int cur_batch_id = receiveAndUpdateTableBatchMap[embInfo.name]; receiveAndUpdateTableBatchMap[embInfo.name]++; receiveAndUpdateBatchIdMtx.unlock(); - if (!isSSDEnabled) { + if (!isL3StorageEnabled) { EmbeddingReceiveAndUpdateDDR(cur_batch_id, index, embInfo); } else { - 
EmbeddingReceiveAndUpdateSSD(cur_batch_id, index, embInfo); + EmbeddingReceiveAndUpdateL3Storage(cur_batch_id, index, embInfo); } } else { receiveAndUpdateBatchIdMtx.unlock(); @@ -1624,8 +1626,8 @@ void HybridMgmt::CreateEmbeddingReceiveAndUpdateThread(int index, const EmbInfo& }); } -bool HybridMgmt::EmbeddingReceiveSSD(const EmbTaskInfo &info, float *&ptr, - vector &swapOutAddrs, int64_t& dims0) +bool HybridMgmt::EmbeddingReceiveL3Storage(const EmbTaskInfo &info, float *&ptr, + vector &swapOutAddrs, int64_t& dims0) { std::unique_lock lastRecvFinishLocker(lastRecvFinishMutexMap[info.name][info.threadIdx]); cvLastRecvFinishMap[info.name][info.threadIdx].wait(lastRecvFinishLocker, [info, this] { @@ -1671,8 +1673,8 @@ bool HybridMgmt::EmbeddingReceiveSSD(const EmbTaskInfo &info, float *&ptr, return true; } -void HybridMgmt::EmbeddingUpdateSSD(const EmbTaskInfo& info, float *embPtr, - vector& swapOutAddrs, int64_t& dims0) +void HybridMgmt::EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float *embPtr, + vector& swapOutAddrs, int64_t& dims0) { std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { @@ -1698,26 +1700,26 @@ void HybridMgmt::EmbeddingUpdateSSD(const EmbTaskInfo& info, float *embPtr, LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingUpdateTC(ms):{}", info.name.c_str(), info.batchId, info.threadIdx, EmbeddingUpdateTC.ElapsedMS()); - // SSD更新 - TimeCost SSDUpdateTC = TimeCost(); - std::vector swapOutSSDAddrOffs = SwapOut2SSDKeyQue[info.name + ADDR_STR].WaitAndPop(); - std::vector swapOutSSDKeys = SwapOut2SSDKeyQue[info.name + SWAP_OUT_STR].WaitAndPop(); + // L3Storage更新 + TimeCost L3StorageUpdateTC = TimeCost(); + std::vector swapOutL3StorageAddrOffs = SwapOut2L3StorageKeyQue[info.name + ADDR_STR].WaitAndPop(); + std::vector swapOutL3StorageKeys = SwapOut2L3StorageKeyQue[info.name + SWAP_OUT_STR].WaitAndPop(); if (!isRunning) { return; } - if (dims0 != static_cast(swapOutAddrs.size() + swapOutSSDKeys.size())) { + if (dims0 != static_cast(swapOutAddrs.size() + swapOutL3StorageKeys.size())) { throw runtime_error("data dims[0] != swapOutKeys.size"); } - cacheManager->UpdateSSDEmb(info.name, embPtr, extEmbeddingSize, swapOutSSDKeys, swapOutSSDAddrOffs); - LOG_DEBUG("table:{}, batchId:{}, thread{}, SSDUpdateTC(ms):{}", - info.name.c_str(), info.batchId, info.threadIdx, SSDUpdateTC.ElapsedMS()); + cacheManager->UpdateL3StorageEmb(info.name, embPtr, extEmbeddingSize, swapOutL3StorageKeys, swapOutL3StorageAddrOffs); + LOG_DEBUG("table:{}, batchId:{}, thread{}, L3StorageUpdateTC(ms):{}", + info.name.c_str(), info.batchId, info.threadIdx, L3StorageUpdateTC.ElapsedMS()); lastUpdateFinishStepMap[info.name]++; cvLastUpdateFinishMap[info.name][info.cvNotifyIndex].notify_all(); } -bool HybridMgmt::EmbeddingLookUpSSD(const EmbTaskInfo& info, vector& h2dEmb) +bool HybridMgmt::EmbeddingLookUpL3Storage(const EmbTaskInfo& info, vector& h2dEmb) { std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { @@ -1735,27 +1737,27 @@ bool HybridMgmt::EmbeddingLookUpSSD(const EmbTaskInfo& info, vector& h2d return false; } - TimeCost transferDDR2SSDTC = TimeCost(); + TimeCost transferDDR2L3StorageTC = TimeCost(); // DDR腾空间 - std::vector DDR2SSDKeys = DDRSwapKeyForSSDQue[info.name + SWAP_OUT_STR].WaitAndPop(); - std::vector DDR2SSDAddrs = 
DDRSwapAddrsQue[info.name + SWAP_OUT_STR].WaitAndPop(); + std::vector DDR2L3StorageKeys = DDRSwapKeyForL3StorageQue[info.name + SWAP_OUT_STR].WaitAndPop(); + std::vector DDR2L3StorageAddrs = DDRSwapAddrsQue[info.name + SWAP_OUT_STR].WaitAndPop(); if (!isRunning) { return false; } - cacheManager->TransferDDR2SSD(info.name, info.extEmbeddingSize, DDR2SSDKeys, DDR2SSDAddrs); - LOG_DEBUG("table:{}, thread:{}, transferDDR2SSDTC(ms):{}", - info.name.c_str(), info.threadIdx, transferDDR2SSDTC.ElapsedMS()); + cacheManager->TransferDDR2L3Storage(info.name, info.extEmbeddingSize, DDR2L3StorageKeys, DDR2L3StorageAddrs); + LOG_DEBUG("table:{}, thread:{}, transferDDR2L3StorageTC(ms):{}", + info.name.c_str(), info.threadIdx, transferDDR2L3StorageTC.ElapsedMS()); - TimeCost fetchSSDEmb2DDRTC = TimeCost(); - // swapInKeys中在SSD的到DDR - std::vector SSD2DDRKeys = DDRSwapKeyForSSDQue[info.name + SWAP_IN_STR].WaitAndPop(); - std::vector SSD2DDRAddrs = DDRSwapAddrsQue[info.name + SWAP_IN_STR].WaitAndPop(); + TimeCost fetchL3StorageEmb2DDRTC = TimeCost(); + // swapInKeys中在L3Storage的挪到DDR + std::vector L3Storage2DDRKeys = DDRSwapKeyForL3StorageQue[info.name + SWAP_IN_STR].WaitAndPop(); + std::vector L3Storage2DDRAddrs = DDRSwapAddrsQue[info.name + SWAP_IN_STR].WaitAndPop(); if (!isRunning) { return false; } - cacheManager->FetchSSDEmb2DDR(info.name, info.extEmbeddingSize, SSD2DDRKeys, SSD2DDRAddrs); - LOG_DEBUG("table:{}, thread:{}, fetchSSDEmb2DDRTC(ms):{}", - info.name.c_str(), info.threadIdx, fetchSSDEmb2DDRTC.ElapsedMS()); + cacheManager->FetchL3StorageEmb2DDR(info.name, info.extEmbeddingSize, L3Storage2DDRKeys, L3Storage2DDRAddrs); + LOG_DEBUG("table:{}, thread:{}, fetchL3StorageEmb2DDRTC(ms):{}", + info.name.c_str(), info.threadIdx, fetchL3StorageEmb2DDRTC.ElapsedMS()); bool isSuccess = BuildH2DEmbedding(info, h2dEmb); if (!isSuccess) { @@ -1768,7 +1770,7 @@ bool HybridMgmt::EmbeddingLookUpSSD(const EmbTaskInfo& info, vector& h2d return true; } -void HybridMgmt::EmbeddingSendSSD(const EmbTaskInfo& info, vector& h2dEmb) +void HybridMgmt::EmbeddingSendL3Storage(const EmbTaskInfo& info, vector& h2dEmb) { std::unique_lock lastSendFinishLocker(lastSendFinishMutexMap[info.name][info.threadIdx]); cvLastSendFinishMap[info.name][info.threadIdx].wait(lastSendFinishLocker, [info, this] { @@ -1847,9 +1849,9 @@ void HybridMgmt::HandleFirstBatchCaseDDR(const EmbBaseInfo& info, HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKeys); } -void HybridMgmt::HandleFirstBatchCaseSSD(const EmbBaseInfo& info, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair) +void HybridMgmt::HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair) { // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos vector emptySwapOutKeys; @@ -1858,51 +1860,51 @@ void HybridMgmt::HandleFirstBatchCaseSSD(const EmbBaseInfo& info, trainTestSwitchInfoStore[info.name] = {swapOutKoPair.first, swapOutKoPair.second}; TimeCost ProcessSwapInKeysTC = TimeCost(); - vector SSDToDDRKeys; - vector DDRToSSDKeys; - cacheManager->ProcessSwapInKeys(info.name, swapInKoPair.first, DDRToSSDKeys, SSDToDDRKeys); + vector L3StorageToDDRKeys; + vector DDRToL3StorageKeys; + cacheManager->ProcessSwapInKeys(info.name, swapInKoPair.first, DDRToL3StorageKeys, L3StorageToDDRKeys); LOG_DEBUG("ProcessSwapInKeysTC(ms):{} ", ProcessSwapInKeysTC.ElapsedMS()); vector emptySwapOutDDRKeys; vector emptySwapOutDDRAddrOffs; - vector emptySwapOutSSDKeys; - vector emptySwapOutSSDAddrOff; + 
vector emptySwapOutL3StorageKeys; + vector emptySwapOutL3StorageAddrOff; LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, info.batchId, info.channelId, swapInKoPair.first.size(), swapOutKoPair.first.size()); LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapOutDDRKeys.size:{}, swapOutDDRAddrOffs.size:{}, " - "swapOutSSDKeys.size:{}, swapOutSSDAddrOff.size:{}", + "swapOutL3StorageKeys.size:{}, swapOutL3StorageAddrOff.size:{}", info.name, info.batchId, info.channelId, emptySwapOutDDRKeys.size(), emptySwapOutDDRAddrOffs.size(), - emptySwapOutSSDKeys.size(), emptySwapOutSSDAddrOff.size()); - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToSSDKeys.size:{}, SSDToDDRKeys.size:{}", - info.name, info.batchId, info.channelId, DDRToSSDKeys.size(), SSDToDDRKeys.size()); + emptySwapOutL3StorageKeys.size(), emptySwapOutL3StorageAddrOff.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToL3StorageKeys.size:{}, L3StorageToDDRKeys.size:{}", + info.name, info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); - auto DDRToSSDKeysForSSD = DDRToSSDKeys; - auto SSDToDDRKeysForSSD = SSDToDDRKeys; - // DDR<->SSD - DDRSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeys); - DDRSwapKeyQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeys); + auto DDRToL3StorageKeysForL3S = DDRToL3StorageKeys; + auto L3StorageToDDRKeysForL3S = L3StorageToDDRKeys; + // DDR<->L3Storage + DDRSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(DDRToL3StorageKeys); + DDRSwapKeyQue[info.name + SWAP_IN_STR].Pushv(L3StorageToDDRKeys); - DDRSwapKeyForSSDQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeysForSSD); - DDRSwapKeyForSSDQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeysForSSD); + DDRSwapKeyForL3StorageQue[info.name + SWAP_OUT_STR].Pushv(DDRToL3StorageKeysForL3S); + DDRSwapKeyForL3StorageQue[info.name + SWAP_IN_STR].Pushv(L3StorageToDDRKeysForL3S); // HBM<->DDR HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutDDRKeys); HBMSwapKeyQue[info.name + ADDR_STR].Pushv(emptySwapOutDDRAddrOffs); HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKoPair.first); - // HBM->SSD - SwapOut2SSDKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutSSDKeys); - SwapOut2SSDKeyQue[info.name + ADDR_STR].Pushv(emptySwapOutSSDAddrOff); + // HBM->L3Storage + SwapOut2L3StorageKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutL3StorageKeys); + SwapOut2L3StorageKeyQue[info.name + ADDR_STR].Pushv(emptySwapOutL3StorageAddrOff); } -void HybridMgmt::HandleDataSwapForSSD(const EmbBaseInfo& info, - vector &swapInKeys, vector &swapOutKeys) +void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, + vector &swapInKeys, vector &swapOutKeys) { TimeCost ProcessSwapInKeysTC; - vector SSDToDDRKeys; - vector DDRToSSDKeys; - cacheManager->ProcessSwapInKeys(info.name, swapInKeys, DDRToSSDKeys, SSDToDDRKeys); + vector L3StorageToDDRKeys; + vector DDRToL3StorageKeys; + cacheManager->ProcessSwapInKeys(info.name, swapInKeys, DDRToL3StorageKeys, L3StorageToDDRKeys); LOG_DEBUG("ProcessSwapInKeysTC(ms):{} ", ProcessSwapInKeysTC.ElapsedMS()); TimeCost ProcessSwapOutKeysTC; @@ -1913,29 +1915,29 @@ void HybridMgmt::HandleDataSwapForSSD(const EmbBaseInfo& info, LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, info.batchId, info.channelId, swapInKeys.size(), swapOutKeys.size()); LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapOutDDRKeys:{}, swapOutDDRAddrOffs:{}, " - "swapOutSSDKeys:{}, swapOutSSDAddrOff:{}", + "swapOutL3StorageKeys:{}, 
swapOutL3StorageAddrOff:{}", info.name, info.batchId, info.channelId, swapInfo.swapOutDDRKeys.size(), - swapInfo.swapOutDDRAddrOffs.size(), swapInfo.swapOutSSDKeys.size(), swapInfo.swapOutSSDAddrOffs.size()); - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToSSDKeys:{}, SSDToDDRKeys:{}", - info.name, info.batchId, info.channelId, DDRToSSDKeys.size(), SSDToDDRKeys.size()); + swapInfo.swapOutDDRAddrOffs.size(), swapInfo.swapOutL3StorageKeys.size(), swapInfo.swapOutL3StorageAddrOffs.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToL3StorageKeys:{}, L3StorageToDDRKeys:{}", + info.name, info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); - auto DDRToSSDKeysForSSD = DDRToSSDKeys; - auto SSDToDDRKeysForSSD = SSDToDDRKeys; - // DDR<->SSD - DDRSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeys); - DDRSwapKeyQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeys); + auto DDRToL3StorageKeysForL3S = DDRToL3StorageKeys; + auto L3StorageToDDRKeysForL3S = L3StorageToDDRKeys; + // DDR<->L3Storage + DDRSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(DDRToL3StorageKeys); + DDRSwapKeyQue[info.name + SWAP_IN_STR].Pushv(L3StorageToDDRKeys); - DDRSwapKeyForSSDQue[info.name + SWAP_OUT_STR].Pushv(DDRToSSDKeysForSSD); - DDRSwapKeyForSSDQue[info.name + SWAP_IN_STR].Pushv(SSDToDDRKeysForSSD); + DDRSwapKeyForL3StorageQue[info.name + SWAP_OUT_STR].Pushv(DDRToL3StorageKeysForL3S); + DDRSwapKeyForL3StorageQue[info.name + SWAP_IN_STR].Pushv(L3StorageToDDRKeysForL3S); // HBM<->DDR HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(swapInfo.swapOutDDRKeys); HBMSwapKeyQue[info.name + ADDR_STR].Pushv(swapInfo.swapOutDDRAddrOffs); HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKeys); - // HBM->SSD - SwapOut2SSDKeyQue[info.name + SWAP_OUT_STR].Pushv(swapInfo.swapOutSSDKeys); - SwapOut2SSDKeyQue[info.name + ADDR_STR].Pushv(swapInfo.swapOutSSDAddrOffs); + // HBM->L3Storage + SwapOut2L3StorageKeyQue[info.name + SWAP_OUT_STR].Pushv(swapInfo.swapOutL3StorageKeys); + SwapOut2L3StorageKeyQue[info.name + ADDR_STR].Pushv(swapInfo.swapOutL3StorageAddrOffs); } bool HybridMgmt::BuildH2DEmbedding(const EmbTaskInfo &info, vector &h2dEmb) @@ -2130,9 +2132,9 @@ bool HybridMgmt::HandleSpecialProcessStatusDDR(const EmbBaseInfo &info, TimeCost return false; } -bool HybridMgmt::HandleSpecialProcessStatusSSD(const EmbBaseInfo &info, TimeCost &getAndSendTensorsTC, - pair, vector> &swapInKoPair, - pair, vector> &swapOutKoPair) +bool HybridMgmt::HandleSpecialProcessStatusL3Storage(const EmbBaseInfo &info, TimeCost &getAndSendTensorsTC, + pair, vector> &swapInKoPair, + pair, vector> &swapOutKoPair) { TimeCost swapProcessTC; auto &swapInPos = swapInKoPair.second; @@ -2141,19 +2143,19 @@ bool HybridMgmt::HandleSpecialProcessStatusSSD(const EmbBaseInfo &info, TimeCost if (specialProcessStatus[info.name] == ProcessStatus::AFTER_SWITCH_FIRST_BATCH) { // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos - HandleFirstBatchCaseSSD(info, swapInKoPair, swapOutKoPair); + HandleFirstBatchCaseL3Storage(info, swapInKoPair, swapOutKoPair); LOG_DEBUG("handle channel switch case:afterSwitchFirstBatch, table:{}, channelId:{}, batchId:{}", info.name, info.channelId, info.batchId); if (mgmtRankInfo.ctrlSteps[info.channelId] == 1) { vector emptySwapOutPos; SendTensorForSwap(info, swapInPos, emptySwapOutPos); - LOG_DEBUG("ProcessEmbInfoSSD special case, user only run one step, table:{}, channelId:{}, batchId:{}", + LOG_DEBUG("ProcessEmbInfoL3Storage special case, user only run one step, table:{}, channelId:{}, 
batchId:{}", info.name, info.channelId, info.batchId); } specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_SECOND_BATCH; - LOG_DEBUG("ProcessEmbInfoSSD end, table:{}, batchId:{}, swapProcessTC(ms):{}, getAndSendTensorsTC(ms):{}", + LOG_DEBUG("ProcessEmbInfoL3Storage end, table:{}, batchId:{}, swapProcessTC(ms):{}, getAndSendTensorsTC(ms):{}", info.name, info.batchId, swapProcessTC.ElapsedMS(), getAndSendTensorsTC.ElapsedMS()); return true; } @@ -2183,7 +2185,7 @@ void HybridMgmt::CheckLookupAddrSuccessDDR() } } -void HybridMgmt::CheckLookupAddrSuccessSSD() +void HybridMgmt::CheckLookupAddrSuccessL3Storage() { if (!lookupAddrSuccess) { for (auto& t : lookUpThreads) { diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index 2b4b2fc8..02829896 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -33,7 +33,7 @@ See the License for the specific language governing permissions and #include "ock_ctr_common/include/error_code.h" #include "hd_transfer/hd_transfer.h" -#include "ssd_cache/cache_manager.h" +#include "l3_storage/cache_manager.h" #include "hybrid_mgmt_block.h" #include "emb_table/embedding_table.h" @@ -131,7 +131,7 @@ namespace MxRec { void ProcessEmbInfoDDR(const EmbBaseInfo& info, bool& remainBatchOut); - void ProcessEmbInfoSSD(const EmbBaseInfo& info, bool& remainBatchOut); + void ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBatchOut); GTEST_PRIVATE: bool mutexDestroy { false }; @@ -160,9 +160,9 @@ namespace MxRec { std::vector> lookUpThreads; std::map>> HBMSwapKeyQue; - std::map>> SwapOut2SSDKeyQue; + std::map>> SwapOut2L3StorageKeyQue; std::map>> DDRSwapKeyQue; - std::map>> DDRSwapKeyForSSDQue; + std::map>> DDRSwapKeyForL3StorageQue; std::map>> DDRSwapAddrsQue; std::mutex evictMut; @@ -188,7 +188,7 @@ namespace MxRec { void InitRankInfo(RankInfo& rankInfo, const vector& embInfos) const; - void EvictSSDKeys(const string& embName, const vector& keys) const; + void EvictL3StorageKeys(const string& embName, const vector& keys) const; int GetStepFromPath(const string& loadPath) const; @@ -204,9 +204,9 @@ namespace MxRec { void EmbeddingReceiveAndUpdateDDR(int batchId, int index, const EmbInfo& embInfo); - void EmbeddingLookUpAndSendSSD(int batchId, int index, const EmbInfo& embInfo); + void EmbeddingLookUpAndSendL3Storage(int batchId, int index, const EmbInfo& embInfo); - void EmbeddingReceiveAndUpdateSSD(int batchId, int index, const EmbInfo& embInfo); + void EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, const EmbInfo& embInfo); void SendTensorForSwap(const EmbBaseInfo& info, const vector &swapInPosUint, @@ -222,7 +222,7 @@ namespace MxRec { HDTransfer *hdTransfer; OffsetMapT offsetMapToSend; OffsetMapT loadOffsetToSend; - bool isSSDEnabled { false }; + bool isL3StorageEnabled { false }; bool isRunning; bool isLoad { false }; bool isInitialized { false }; @@ -247,7 +247,7 @@ namespace MxRec { void InitDataPipelineForDDR(const string &embName); - void InitDataPipelineForSSD(const string &embName, int extEmbeddingSize); + void InitDataPipelineForL3Storage(const string &embName, int extEmbeddingSize); void JoinEmbeddingCacheThread(); @@ -265,13 +265,13 @@ namespace MxRec { void EmbeddingSendDDR(const EmbTaskInfo& info, vector& h2dEmb); - bool EmbeddingReceiveSSD(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, int64_t& dims0); + bool EmbeddingReceiveL3Storage(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, int64_t& dims0); - void EmbeddingUpdateSSD(const 
EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, int64_t& dims0); + void EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, int64_t& dims0); - bool EmbeddingLookUpSSD(const EmbTaskInfo& info, vector& h2dEmb); + bool EmbeddingLookUpL3Storage(const EmbTaskInfo& info, vector& h2dEmb); - void EmbeddingSendSSD(const EmbTaskInfo& info, vector& h2dEmb); + void EmbeddingSendL3Storage(const EmbTaskInfo& info, vector& h2dEmb); void CreateEmbeddingLookUpAndSendThread(int index, const EmbInfo& embInfo); @@ -281,12 +281,12 @@ namespace MxRec { std::pair, vector>& swapInKoPair, std::pair, vector>& swapOutKoPair); - void HandleFirstBatchCaseSSD(const EmbBaseInfo& info, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + void HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - void HandleDataSwapForSSD(const EmbBaseInfo& info, - vector &swapInKeys, vector &swapOutKeys); + void HandleDataSwapForL3Storage(const EmbBaseInfo& info, + vector &swapInKeys, vector &swapOutKeys); bool BuildH2DEmbedding(const EmbTaskInfo& info, vector& h2dEmb); @@ -306,13 +306,13 @@ namespace MxRec { std::pair, vector>& swapInKoPair, std::pair, vector>& swapOutKoPair); - bool HandleSpecialProcessStatusSSD(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + bool HandleSpecialProcessStatusL3Storage(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); void CheckLookupAddrSuccessDDR(); - void CheckLookupAddrSuccessSSD(); + void CheckLookupAddrSuccessL3Storage(); void GetSwapPairsAndKey2Offset(const EmbBaseInfo& info, vector &uniqueKeys, std::pair, vector>& swapInKoPair, diff --git a/src/core/ssd_cache/cache_manager.cpp b/src/core/l3_storage/cache_manager.cpp similarity index 61% rename from src/core/ssd_cache/cache_manager.cpp rename to src/core/l3_storage/cache_manager.cpp index a82a65a7..a2cbfb32 100644 --- a/src/core/ssd_cache/cache_manager.cpp +++ b/src/core/l3_storage/cache_manager.cpp @@ -25,55 +25,60 @@ See the License for the specific language governing permissions and using namespace MxRec; -void CacheManager::Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo) +void CacheManager::Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo, shared_ptr level3Storage) { LOG_INFO("CacheManager Init method begin"); + if (level3Storage == nullptr) { + throw runtime_error("level3Storage is nullptr"); + } + this->embCache = std::move(embCachePtr); for (auto& emb : mgmtEmbInfo) { EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false}; embBaseInfos.emplace(emb.name, baseInfo); preProcessMapper[emb.name].Initialize(emb.name, emb.hostVocabSize, emb.ssdVocabSize); } - ssdEngine->Start(); + this->l3Storage = level3Storage; + this->l3Storage->Start(); LOG_INFO("CacheManager Init method end"); } -bool CacheManager::IsKeyInSSD(const string& embTableName, emb_cache_key_t key) +bool CacheManager::IsKeyInL3Storage(const string& embTableName, emb_cache_key_t key) { - return ssdEngine->IsKeyExist(embTableName, key); + return l3Storage->IsKeyExist(embTableName, key); } -/// 淘汰SSD中Emb信息 +/// 淘汰三级存储中Emb信息 /// \param embTableName emb表名 /// \param keys 淘汰key列表 -void CacheManager::EvictSSDEmbedding(const string& embTableName, const vector& keys) +void CacheManager::EvictL3StorageEmbedding(const string& embTableName, const 
vector& keys) { if (keys.empty()) { return; } int keyStep = preProcessStep; - unordered_map& ssdMap = preProcessMapper[embTableName].excludeDDRKeyCountMap; + unordered_map& l3StorageMap = preProcessMapper[embTableName].excludeDDRKeyCountMap; LFUCache& ddrLfu = preProcessMapper[embTableName].lfuCache; - std::vector ssdKeysToBeDeleted; + std::vector l3StorageKeysToBeDeleted; // 1 删除缓存中记录的key的次数 for (auto &key: keys) { - auto it = ssdMap.find(key); - if (it != ssdMap.end()) { - ssdMap.erase(it); - ssdKeysToBeDeleted.emplace_back(key); + auto it = l3StorageMap.find(key); + if (it != l3StorageMap.end()) { + l3StorageMap.erase(it); + l3StorageKeysToBeDeleted.emplace_back(key); } else { ddrLfu.Pop(key); } } - ssdEvictThreads.emplace_back([=]() mutable { - // 2 删除SSD中保存的Emb数据 + l3StorageEvictThreads.emplace_back([=]() mutable { + // 2 删除L3Storage中保存的Emb数据 std::unique_lock lk(evictWaitMut); evictWaitCond.wait(lk, [keyStep, this] { return embeddingTaskStep == keyStep; }); - ssdEngine->DeleteEmbeddings(embTableName, ssdKeysToBeDeleted); + l3Storage->DeleteEmbeddings(embTableName, l3StorageKeysToBeDeleted); }); } @@ -93,29 +98,29 @@ void CacheManager::PutKey(const string& embTableName, const emb_key_t& key, Reco hashMap[key] = count; } -void CacheManager::CreateSSDTableIfNotExist(const std::string& embTableName) +void CacheManager::CreateL3StorageTableIfNotExist(const std::string& embTableName) { if (embBaseInfos[embTableName].isExist) { return; } - if (!ssdEngine->IsTableExist(embTableName)) { - ssdEngine->CreateTable(embTableName, embBaseInfos[embTableName].savePath, + if (!l3Storage->IsTableExist(embTableName)) { + l3Storage->CreateTable(embTableName, embBaseInfos[embTableName].savePath, embBaseInfos[embTableName].maxTableSize); embBaseInfos[embTableName].isExist = true; - LOG_INFO("create ssd table end, embTableName:" + embTableName); + LOG_INFO("create l3Storage table end, embTableName:" + embTableName); return; } - // 续训场景:embBaseInfos 没有保存,不会初始化;SSD表会初始化,此时表已存在 + // 续训场景:embBaseInfos 没有保存,不会初始化;L3Storage表会初始化,此时表已存在 embBaseInfos[embTableName].isExist = true; - LOG_INFO("ssd table is exist, embTableName:" + embTableName); + LOG_INFO("l3Storage table is exist, embTableName:" + embTableName); } CacheManager::~CacheManager() { - for (auto &t : ssdEvictThreads) { + for (auto& t : l3StorageEvictThreads) { t.join(); } - ssdEngine->Stop(); + l3Storage->Stop(); ddrKeyFreqMap.clear(); excludeDDRKeyCountMap.clear(); } @@ -123,18 +128,18 @@ CacheManager::~CacheManager() /// 加载数据到CacheManager /// \param ddrFreqInitMap ddr内key频次数据 /// \param excludeDdrFreqInitMap 非DDR key频次数据 -/// \param step 加载SSDEngine传入步数 +/// \param step 加载L3Storage传入步数 void CacheManager::Load(const std::vector &mgmtEmbInfo, int step, map>& trainKeySet) { - // 加载SSDEngine数据 + // 加载L3Storage数据 #ifndef GTEST for (auto& it : embBaseInfos) { string embTableName = it.first; EmbBaseInfo& embBase = it.second; - ssdEngine->Load(embTableName, embBase.savePath, embBase.maxTableSize, step); + l3Storage->Load(embTableName, embBase.savePath, embBase.maxTableSize, step); } - auto tableKeysVec = ssdEngine->ExportTableKey(); + auto tableKeysVec = l3Storage->ExportTableKey(); for (auto &it: tableKeysVec) { auto &embTableName = it.first; auto &keys = it.second; @@ -159,19 +164,19 @@ void CacheManager::Load(const std::vector &mgmtEmbInfo, int step, #endif } -void CacheManager::SaveSSDEngine(int step) +void CacheManager::Save(int step) { #ifndef GTEST - ssdEngine->Save(step); + l3Storage->Save(step); #endif } -int64_t 
CacheManager::GetTableEmbeddingSize(const string& tableName) +int64_t CacheManager::GetTableUsage(const string& tableName) { - if (ssdEngine == nullptr) { - throw runtime_error("SSDEngine not init"); + if (l3Storage == nullptr) { + throw runtime_error("L3Storage not init"); } - return ssdEngine->GetTableEmbeddingSize(tableName); + return l3Storage->GetTableUsage(tableName); } void CacheManager::ProcessSwapOutKeys(const string& tableName, const vector& swapOutKeys, @@ -179,10 +184,10 @@ void CacheManager::ProcessSwapOutKeys(const string& tableName, const vector 0) { keyMapper.InsertDDRKey(key); swapOutDDRKeys.push_back(key); swapOutDDRAddrOffs.push_back(i); availableDDRSize--; } else { - keyMapper.InsertSSDKey(key); - swapOutSSDKeys.push_back(key); - swapOutSSDAddrOffs.push_back(i); + keyMapper.InsertL3StorageKey(key); + swapOutL3StorageKeys.push_back(key); + swapOutL3StorageAddrOffs.push_back(i); } } } void CacheManager::ProcessSwapInKeys(const string& tableName, const vector& swapInKeys, - vector& DDRToSSDKeys, vector& SSDToDDRKeys) + vector& DDRToL3StorageKeys, vector& L3StorageToDDRKeys) { auto& keyMapper = preProcessMapper[tableName]; size_t externalDDRSize = 0; @@ -219,28 +224,28 @@ void CacheManager::ProcessSwapInKeys(const string& tableName, const vector ddrAvailableSize) { // 需要DDR--->SSD + if (externalDDRSize > ddrAvailableSize) { // 需要DDR--->L3Storage size_t transNum = externalDDRSize - ddrAvailableSize; - if (transNum > keyMapper.SSDAvailableSize()) { - throw invalid_argument("SSD table size too small, key quantity exceed while transferring DDR data to SSD"); + if (transNum > keyMapper.L3StorageAvailableSize()) { + throw invalid_argument("L3Storage table size too small, key quantity exceed while transferring DDR data to L3Storage"); } - // DDR--->SSD - keyMapper.GetAndDeleteLeastFreqDDRKey2SSD(transNum, swapInKeys, DDRToSSDKeys); + // DDR--->L3Storage + keyMapper.GetAndDeleteLeastFreqDDRKey2L3Storage(transNum, swapInKeys, DDRToL3StorageKeys); } - // SSD--->DDR - for (uint64_t key : SSDToDDRKeys) { + // L3Storage--->DDR + for (uint64_t key : L3StorageToDDRKeys) { keyMapper.InsertDDRKey(key); - keyMapper.RemoveSSDKey(key); + keyMapper.RemoveL3StorageKey(key); } for (uint64_t key : firstSeenKeys) { keyMapper.InsertDDRKey(key); @@ -248,31 +253,31 @@ void CacheManager::ProcessSwapInKeys(const string& tableName, const vector& keys, const vector& swapOutSSDddrOffs) +void CacheManager::UpdateL3StorageEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize, + vector& keys, const vector& swapOutL3StorageOffs) { vector embeddingsAddr(keys.size()); - for (uint64_t i = 0; i < swapOutSSDddrOffs.size(); i++) { - embeddingsAddr[i] = embPtr + swapOutSSDddrOffs[i] * extEmbeddingSize; + for (uint64_t i = 0; i < swapOutL3StorageOffs.size(); i++) { + embeddingsAddr[i] = embPtr + swapOutL3StorageOffs[i] * extEmbeddingSize; } - ssdEngine->InsertEmbeddingsByAddr(tableName, keys, embeddingsAddr, extEmbeddingSize); + l3Storage->InsertEmbeddingsByAddr(tableName, keys, embeddingsAddr, extEmbeddingSize); } -void CacheManager::TransferDDR2SSD(string tableName, uint32_t extEmbeddingSize, vector& keys, - vector& addrs) +void CacheManager::TransferDDR2L3Storage(string tableName, uint32_t extEmbeddingSize, vector& keys, + vector& addrs) { - CreateSSDTableIfNotExist(tableName); - ssdEngine->InsertEmbeddingsByAddr(tableName, keys, addrs, extEmbeddingSize); + CreateL3StorageTableIfNotExist(tableName); + l3Storage->InsertEmbeddingsByAddr(tableName, keys, addrs, extEmbeddingSize); for (auto addr : addrs) { 
free(addr); addr = nullptr; } } -void CacheManager::FetchSSDEmb2DDR(string tableName, uint32_t extEmbeddingSize, vector& keys, - const vector& addrs) +void CacheManager::FetchL3StorageEmb2DDR(string tableName, uint32_t extEmbeddingSize, vector& keys, + const vector& addrs) { - auto embeddings = ssdEngine->FetchEmbeddings(tableName, keys); + auto embeddings = l3Storage->FetchEmbeddings(tableName, keys); for (uint64_t i = 0; i < embeddings.size(); i++) { int rc = memcpy_s(addrs[i], extEmbeddingSize * sizeof(float), embeddings[i].data(), extEmbeddingSize * sizeof(float)); @@ -280,7 +285,7 @@ void CacheManager::FetchSSDEmb2DDR(string tableName, uint32_t extEmbeddingSize, throw runtime_error("memcpy_s failed, rc: " + to_string(rc)); } } - ssdEngine->DeleteEmbeddings(tableName, keys); + l3Storage->DeleteEmbeddings(tableName, keys); embeddingTaskStep++; evictWaitCond.notify_all(); diff --git a/src/core/ssd_cache/cache_manager.h b/src/core/l3_storage/cache_manager.h similarity index 71% rename from src/core/ssd_cache/cache_manager.h rename to src/core/l3_storage/cache_manager.h index 89ed61d7..1571454b 100644 --- a/src/core/ssd_cache/cache_manager.h +++ b/src/core/l3_storage/cache_manager.h @@ -28,6 +28,7 @@ See the License for the specific language governing permissions and #include "utils/common.h" #include "preprocess_mapper.h" #include "ock_ctr_common/include/factory.h" +#include "l3_storage.h" namespace MxRec { @@ -42,14 +43,14 @@ namespace MxRec { struct SwapOutInfo { vector swapOutDDRKeys; vector swapOutDDRAddrOffs; - vector swapOutSSDKeys; - vector swapOutSSDAddrOffs; + vector swapOutL3StorageKeys; + vector swapOutL3StorageAddrOffs; }; enum class TransferRet { TRANSFER_OK = 0, // 转移成功或无需处理 TRANSFER_ERROR, - SSD_SPACE_NOT_ENOUGH, + L3Storage_SPACE_NOT_ENOUGH, DDR_SPACE_NOT_ENOUGH, }; @@ -73,16 +74,16 @@ namespace MxRec { ~CacheManager(); - void Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo); + void Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo, shared_ptr level3Storage); void Load(const std::vector& mgmtEmbInfo, int step, map>& trainKeySet); - void SaveSSDEngine(int step); + void Save(int step); - bool IsKeyInSSD(const string& embTableName, emb_cache_key_t key); + bool IsKeyInL3Storage(const string& embTableName, emb_cache_key_t key); - void EvictSSDEmbedding(const string& embTableName, const vector& keys); + void EvictL3StorageEmbedding(const string& embTableName, const vector& keys); void PutKey(const string& embTableName, const emb_key_t& key, RecordType type); @@ -90,18 +91,18 @@ namespace MxRec { SwapOutInfo& info); void ProcessSwapInKeys(const string& tableName, const vector& swapInKeys, - vector& DDRToSSDKeys, vector& SSDToDDRKeys); + vector& DDRToL3StorageKeys, vector& L3StorageToDDRKeys); - void UpdateSSDEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize, vector& keys, - const vector& swapOutSSDAddrOffs); + void UpdateL3StorageEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize, vector& keys, + const vector& swapOutL3StorageAddrOffs); - void TransferDDR2SSD(string tableName, uint32_t extEmbeddingSize, vector& keys, - vector& addrs); + void TransferDDR2L3Storage(string tableName, uint32_t extEmbeddingSize, vector& keys, + vector& addrs); - void FetchSSDEmb2DDR(string tableName, uint32_t extEmbeddingSize, vector& keys, - const vector& addrs); + void FetchL3StorageEmb2DDR(string tableName, uint32_t extEmbeddingSize, vector& keys, + const vector& addrs); - int64_t GetTableEmbeddingSize(const string& tableName); + int64_t 
GetTableUsage(const string& tableName); // DDR内每个表中emb数据频次缓存;map unordered_map ddrKeyFreqMap; @@ -123,13 +124,13 @@ namespace MxRec { bool isExist; }; - void CreateSSDTableIfNotExist(const std::string& embTableName); + void CreateL3StorageTableIfNotExist(const std::string& embTableName); unordered_map embBaseInfos; GTEST_PRIVATE: - shared_ptr ssdEngine = std::make_shared(); - vector ssdEvictThreads; + shared_ptr l3Storage; + vector l3StorageEvictThreads; ock::ctr::EmbCacheManagerPtr embCache {}; }; } diff --git a/src/core/l3_storage/l3_storage.cpp b/src/core/l3_storage/l3_storage.cpp new file mode 100644 index 00000000..4eb61c6e --- /dev/null +++ b/src/core/l3_storage/l3_storage.cpp @@ -0,0 +1,69 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. +==============================================================================*/ + +#include "l3_storage.h" + +using MxRec::L3Storage; + +L3Storage::L3Storage() {} + +L3Storage::~L3Storage() {} + +bool L3Storage::IsTableExist(const string& tableName) +{ + return false; +} + +bool L3Storage::IsKeyExist(const string& tableName, emb_cache_key_t key) +{ + return false; +} + +void L3Storage::CreateTable(const string& tableName, vector savePaths, uint64_t maxTableSize) {} + +int64_t L3Storage::GetTableAvailableSpace(const string& tableName) +{ + return 0; +} + +void L3Storage::InsertEmbeddingsByAddr(const string& tableName, vector& keys, + vector& embeddingsAddr, uint64_t extEmbeddingSize) +{ +} + +void L3Storage::DeleteEmbeddings(const string& tableName, vector& keys) {} + +vector> L3Storage::FetchEmbeddings(const string& tableName, vector& keys) +{ + return vector>(); +} + +void L3Storage::Save(int step) {} + +void L3Storage::Load(const string& tableName, vector savePaths, uint64_t maxTableSize, int step) {} + +void L3Storage::Start() {} + +void L3Storage::Stop() {} + +int64_t L3Storage::GetTableUsage(const string& tableName) +{ + return 0; +} + +vector>> L3Storage::ExportTableKey() +{ + return vector>>(); +} diff --git a/src/core/l3_storage/l3_storage.h b/src/core/l3_storage/l3_storage.h new file mode 100644 index 00000000..606f2320 --- /dev/null +++ b/src/core/l3_storage/l3_storage.h @@ -0,0 +1,63 @@ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ + +#ifndef MX_REC_L3_STORAGE_H +#define MX_REC_L3_STORAGE_H + +#include +#include + +#include "utils/common.h" + +using MxRec::emb_cache_key_t; +using std::string; +using std::vector; + +namespace MxRec { + +class L3Storage { +public: + L3Storage(); + virtual ~L3Storage(); + + virtual bool IsTableExist(const string& tableName); + + virtual bool IsKeyExist(const string& tableName, emb_cache_key_t key); + + virtual void CreateTable(const string& tableName, vector savePaths, uint64_t maxTableSize); + + virtual int64_t GetTableAvailableSpace(const string& tableName); + + virtual void InsertEmbeddingsByAddr(const string& tableName, vector& keys, + vector& embeddingsAddr, uint64_t extEmbeddingSize); + + virtual void DeleteEmbeddings(const string& tableName, vector& keys); + + virtual vector> FetchEmbeddings(const string& tableName, vector& keys); + + virtual void Save(int step); + + virtual void Load(const string& tableName, vector savePaths, uint64_t maxTableSize, int step); + + virtual void Start(); + + virtual void Stop(); + + virtual int64_t GetTableUsage(const string& tableName); + + virtual vector>> ExportTableKey(); +}; +} // namespace MxRec +#endif // MX_REC_L3_STORAGE_H \ No newline at end of file diff --git a/src/core/ssd_cache/lfu_cache.cpp b/src/core/l3_storage/lfu_cache.cpp similarity index 100% rename from src/core/ssd_cache/lfu_cache.cpp rename to src/core/l3_storage/lfu_cache.cpp diff --git a/src/core/ssd_cache/lfu_cache.h b/src/core/l3_storage/lfu_cache.h similarity index 100% rename from src/core/ssd_cache/lfu_cache.h rename to src/core/l3_storage/lfu_cache.h diff --git a/src/core/ssd_cache/preprocess_mapper.h b/src/core/l3_storage/preprocess_mapper.h similarity index 54% rename from src/core/ssd_cache/preprocess_mapper.h rename to src/core/l3_storage/preprocess_mapper.h index 03860181..fd28677f 100644 --- a/src/core/ssd_cache/preprocess_mapper.h +++ b/src/core/l3_storage/preprocess_mapper.h @@ -1,9 +1,17 @@ -/* - * Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved. - * Description: ssd cache module - * Author: MindX SDK - * Date: 2024/2/18 - */ +/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and + limitations under the License. 
+==============================================================================*/ #ifndef MXREC_DDR_PREPROCESS_MAPPER_H #define MXREC_DDR_PREPROCESS_MAPPER_H @@ -18,12 +26,12 @@ namespace MxRec { */ class PreProcessMapper { public: - void Initialize(const string& embName, uint32_t vocabSize, uint32_t ssdVocabSize) + void Initialize(const string& embName, uint32_t ddrVocabSize, uint32_t l3StorageVocabSize) { tableName = embName; lfuCache = LFUCache(embName); - ddrAvailableSize = vocabSize; - ssdAvailableSize = ssdVocabSize; + ddrAvailableSize = ddrVocabSize; + l3StorageAvailableSize = l3StorageVocabSize; } bool IsDDRKeyExist(uint64_t key) @@ -31,7 +39,7 @@ namespace MxRec { return lfuCache.keyTable.find(key) != lfuCache.keyTable.end(); } - bool IsSSDKeyExist(uint64_t key) + bool IsL3StorageKeyExist(uint64_t key) { return excludeDDRKeyCountMap.find(key) != excludeDDRKeyCountMap.end(); } @@ -47,19 +55,19 @@ namespace MxRec { return true; } - bool InsertSSDKey(uint64_t key) + bool InsertL3StorageKey(uint64_t key) { - if (IsSSDKeyExist(key)) { - throw std::invalid_argument("InsertSSDKey failed! key already exist"); + if (IsL3StorageKeyExist(key)) { + throw std::invalid_argument("InsertL3StorageKey failed! key already exist"); } excludeDDRKeyCountMap[key] = 1; return true; } - bool RemoveSSDKey(uint64_t key) + bool RemoveL3StorageKey(uint64_t key) { - if (!IsSSDKeyExist(key)) { + if (!IsL3StorageKeyExist(key)) { throw std::invalid_argument("RemoveKey failed! key not exist"); } excludeDDRKeyCountMap.erase(key); @@ -74,18 +82,18 @@ namespace MxRec { return ddrAvailableSize - lfuCache.keyTable.size(); } - size_t SSDAvailableSize() + size_t L3StorageAvailableSize() { - if (ssdAvailableSize < excludeDDRKeyCountMap.size()) { - throw std::invalid_argument("ssdAvailableSize < existKeys.size()"); + if (l3StorageAvailableSize < excludeDDRKeyCountMap.size()) { + throw std::invalid_argument("l3StorageAvailableSize < existKeys.size()"); } - return ssdAvailableSize - excludeDDRKeyCountMap.size(); + return l3StorageAvailableSize - excludeDDRKeyCountMap.size(); } - void GetAndDeleteLeastFreqDDRKey2SSD(uint64_t transNum, const std::vector& keys, - std::vector& DDRSwapOutKeys) + void GetAndDeleteLeastFreqDDRKey2L3Storage(uint64_t transNum, const std::vector& keys, + std::vector& DDRSwapOutKeys) { - LOG_DEBUG("start GetAndDeleteLeastFreqDDRKey2SSD, table:{}", tableName); + LOG_DEBUG("start GetAndDeleteLeastFreqDDRKey2L3Storage, table:{}", tableName); std::vector DDRSwapOutCounts; lfuCache.GetAndDeleteLeastFreqKeyInfo(transNum, keys, DDRSwapOutKeys, DDRSwapOutCounts); for (uint64_t i = 0; i < DDRSwapOutKeys.size(); i++) { @@ -93,13 +101,13 @@ namespace MxRec { } if (DDRSwapOutCounts.size() != transNum) { throw std::invalid_argument( - "GetAndDeleteLeastFreqDDRKey2SSD failed! DDRSwapOutCounts.size()!=transNum"); + "GetAndDeleteLeastFreqDDRKey2L3Storage failed! 
DDRSwapOutCounts.size()!=transNum"); } } string tableName; uint64_t ddrAvailableSize = 0; - uint64_t ssdAvailableSize = 0; + uint64_t l3StorageAvailableSize = 0; LFUCache lfuCache; std::unordered_map excludeDDRKeyCountMap; }; diff --git a/src/core/ssd_engine/ssd_engine.cpp b/src/core/ssd_engine/ssd_engine.cpp index bbf55e66..e50ad43c 100644 --- a/src/core/ssd_engine/ssd_engine.cpp +++ b/src/core/ssd_engine/ssd_engine.cpp @@ -199,7 +199,7 @@ void SSDEngine::SetCompactThreshold(double threshold) throw invalid_argument("compact threshold should in range [0, 1]"); } -int64_t SSDEngine::GetTableEmbeddingSize(const string &tableName) +int64_t SSDEngine::GetTableUsage(const string &tableName) { if (!isRunning) { throw runtime_error("SSDEngine not running"); diff --git a/src/core/ssd_engine/ssd_engine.h b/src/core/ssd_engine/ssd_engine.h index 538f76e2..40b65843 100644 --- a/src/core/ssd_engine/ssd_engine.h +++ b/src/core/ssd_engine/ssd_engine.h @@ -22,12 +22,12 @@ See the License for the specific language governing permissions and #include #include -#include "utils/common.h" +#include "l3_storage/l3_storage.h" namespace MxRec { - class SSDEngine { + class SSDEngine : public L3Storage { public: bool IsTableExist(const string &tableName); @@ -56,7 +56,7 @@ namespace MxRec { void SetCompactThreshold(double threshold); - int64_t GetTableEmbeddingSize(const string& tableName); + int64_t GetTableUsage(const string& tableName); void InsertEmbeddingsByAddr(const string &tableName, vector &keys, vector &embeddingsAddr, uint64_t extEmbeddingSize); diff --git a/src/tests/ssd_cache/cache_manager_test.cpp b/src/tests/ssd_cache/cache_manager_test.cpp index 7cb5e032..164e667a 100644 --- a/src/tests/ssd_cache/cache_manager_test.cpp +++ b/src/tests/ssd_cache/cache_manager_test.cpp @@ -18,8 +18,8 @@ See the License for the specific language governing permissions and #include #include "absl/container/flat_hash_map.h" -#include "ssd_cache/lfu_cache.h" -#include "ssd_cache/cache_manager.h" +#include "l3_storage/lfu_cache.h" +#include "l3_storage/cache_manager.h" #include "utils/common.h" using namespace std; @@ -34,16 +34,21 @@ void InitSSDEngine(CacheManager& manager, string embTableName, uint64_t ssdSize) { // Init ssd engine data chrono::seconds period = chrono::seconds(120); - manager.ssdEngine->SetCompactPeriod(period); - manager.ssdEngine->SetCompactThreshold(1); - manager.ssdEngine->CreateTable(embTableName, {SSD_SAVE_PATH}, ssdSize); + auto ssdEngine = static_pointer_cast(manager.l3Storage); + ssdEngine->SetCompactPeriod(period); + ssdEngine->SetCompactThreshold(1); + ssdEngine->CreateTable(embTableName, {SSD_SAVE_PATH}, ssdSize); vector ssdKeys = {15, 25}; // 预设15, 25存储在SSD - std::vector> ssdEmbData = {{15.0f}, - {25.0f}}; + auto emb1 = new float(15.0f); + auto emb2 = new float(25.0f); + uint64_t extEmbeddingSize = 1; + std::vector ssdEmbData = {{emb1}, {emb2}}; auto& excludeMap = manager.preProcessMapper[embTableName].excludeDDRKeyCountMap; excludeMap[15] = 3; // 初始化次数 excludeMap[25] = 5; - manager.ssdEngine->InsertEmbeddings(embTableName, ssdKeys, ssdEmbData); + ssdEngine->InsertEmbeddingsByAddr(embTableName, ssdKeys, ssdEmbData, extEmbeddingSize); + delete emb1; + delete emb2; } void InitDDREmbData(absl::flat_hash_map& loadData, string& embTableName, @@ -105,7 +110,8 @@ protected: ock::ctr::EmbCacheManagerPtr embCachePtr = nullptr; - cacheManager.Init(embCachePtr, mgmtEmbInfos); + auto ssdEngine = make_shared(); + cacheManager.Init(embCachePtr, mgmtEmbInfos, ssdEngine); InitSSDEngine(cacheManager, 
embTableName, 5); InitSSDEngine(cacheManager, embTableName2, 10); @@ -141,31 +147,31 @@ TEST_F(CacheManagerTest, PutKey) LOG_INFO("test PutKey end."); } -TEST_F(CacheManagerTest, IsKeyInSSD) +TEST_F(CacheManagerTest, IsKeyInL3Storage) { vector checkKeys = {1, 2, 15, 25}; - ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, checkKeys[0])); - ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, checkKeys[1])); - ASSERT_TRUE(cacheManager.IsKeyInSSD(embTableName, checkKeys[2])); - ASSERT_TRUE(cacheManager.IsKeyInSSD(embTableName, checkKeys[3])); - LOG_INFO("test IsKeyInSSD end."); + ASSERT_FALSE(cacheManager.IsKeyInL3Storage(embTableName, checkKeys[0])); + ASSERT_FALSE(cacheManager.IsKeyInL3Storage(embTableName, checkKeys[1])); + ASSERT_TRUE(cacheManager.IsKeyInL3Storage(embTableName, checkKeys[2])); + ASSERT_TRUE(cacheManager.IsKeyInL3Storage(embTableName, checkKeys[3])); + LOG_INFO("test IsKeyInL3Storage end."); } -TEST_F(CacheManagerTest, EvictSSDEmbedding) +TEST_F(CacheManagerTest, EvictL3StorageEmbedding) { // 构造时ssd中已存在的key: 15 25 emb_cache_key_t key = 15; vector ssdKeys = {key}; - cacheManager.EvictSSDEmbedding(embTableName, ssdKeys); + cacheManager.EvictL3StorageEmbedding(embTableName, ssdKeys); int maxLoop = 1000; - while (!cacheManager.ssdEvictThreads.empty() && maxLoop > 0) { + while (!cacheManager.l3StorageEvictThreads.empty() && maxLoop > 0) { this_thread::sleep_for(1ms); maxLoop--; } - ASSERT_FALSE(cacheManager.IsKeyInSSD(embTableName, key)); + ASSERT_FALSE(cacheManager.IsKeyInL3Storage(embTableName, key)); const auto it = cacheManager.excludeDDRKeyCountMap[embTableName].find(key); ASSERT_EQ(it, cacheManager.excludeDDRKeyCountMap[embTableName].end()); - LOG_INFO("test EvictSSDEmbedding end."); + LOG_INFO("test EvictL3StorageEmbedding end."); } TEST_F(CacheManagerTest, LoadTest) diff --git a/src/tests/ssd_cache/lfu_cache_test.cpp b/src/tests/ssd_cache/lfu_cache_test.cpp index 7f8a7820..500e3989 100644 --- a/src/tests/ssd_cache/lfu_cache_test.cpp +++ b/src/tests/ssd_cache/lfu_cache_test.cpp @@ -16,7 +16,7 @@ See the License for the specific language governing permissions and #include #include -#include "ssd_cache/lfu_cache.h" +#include "l3_storage/lfu_cache.h" using namespace std; using namespace MxRec; -- Gitee From 8251c4e9ce25f59d273d837e731f00c9a233f4e4 Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Fri, 7 Jun 2024 03:35:51 +0000 Subject: [PATCH 210/302] cleancode --- src/core/emb_table/embedding_ddr.cpp | 3 ++- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 10 ++++++---- src/core/hybrid_mgmt/hybrid_mgmt.h | 6 ++++-- src/core/l3_storage/cache_manager.cpp | 9 ++++++--- src/core/l3_storage/cache_manager.h | 8 +++++--- src/core/l3_storage/l3_storage.cpp | 3 ++- src/core/l3_storage/l3_storage.h | 1 - 7 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 3898a7da..257238b8 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -253,7 +253,8 @@ void EmbeddingDDR::SyncLatestEmbedding() throw std::invalid_argument(errMsg); } } - cacheManager_->UpdateL3StorageEmb(name, ptr, embInfo_.extEmbeddingSize, info.swapOutL3StorageKeys, info.swapOutL3StorageAddrOffs); + cacheManager_->UpdateL3StorageEmb(name, ptr, embInfo_.extEmbeddingSize, info.swapOutL3StorageKeys, + info.swapOutL3StorageAddrOffs); } } diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 973831a2..895715c9 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ 
b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -1711,7 +1711,8 @@ void HybridMgmt::EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float *embPtr if (dims0 != static_cast(swapOutAddrs.size() + swapOutL3StorageKeys.size())) { throw runtime_error("data dims[0] != swapOutKeys.size"); } - cacheManager->UpdateL3StorageEmb(info.name, embPtr, extEmbeddingSize, swapOutL3StorageKeys, swapOutL3StorageAddrOffs); + cacheManager->UpdateL3StorageEmb(info.name, embPtr, extEmbeddingSize, swapOutL3StorageKeys, + swapOutL3StorageAddrOffs); LOG_DEBUG("table:{}, batchId:{}, thread{}, L3StorageUpdateTC(ms):{}", info.name.c_str(), info.batchId, info.threadIdx, L3StorageUpdateTC.ElapsedMS()); @@ -1917,7 +1918,8 @@ void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapOutDDRKeys:{}, swapOutDDRAddrOffs:{}, " "swapOutL3StorageKeys:{}, swapOutL3StorageAddrOff:{}", info.name, info.batchId, info.channelId, swapInfo.swapOutDDRKeys.size(), - swapInfo.swapOutDDRAddrOffs.size(), swapInfo.swapOutL3StorageKeys.size(), swapInfo.swapOutL3StorageAddrOffs.size()); + swapInfo.swapOutDDRAddrOffs.size(), swapInfo.swapOutL3StorageKeys.size(), + swapInfo.swapOutL3StorageAddrOffs.size()); LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToL3StorageKeys:{}, L3StorageToDDRKeys:{}", info.name, info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); @@ -2150,8 +2152,8 @@ bool HybridMgmt::HandleSpecialProcessStatusL3Storage(const EmbBaseInfo &info, Ti if (mgmtRankInfo.ctrlSteps[info.channelId] == 1) { vector emptySwapOutPos; SendTensorForSwap(info, swapInPos, emptySwapOutPos); - LOG_DEBUG("ProcessEmbInfoL3Storage special case, user only run one step, table:{}, channelId:{}, batchId:{}", - info.name, info.channelId, info.batchId); + LOG_DEBUG("ProcessEmbInfoL3Storage special case, user only run one step, " + "table:{}, channelId:{}, batchId:{}", info.name, info.channelId, info.batchId); } specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_SECOND_BATCH; diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index 02829896..4fd2b541 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -265,9 +265,11 @@ namespace MxRec { void EmbeddingSendDDR(const EmbTaskInfo& info, vector& h2dEmb); - bool EmbeddingReceiveL3Storage(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, int64_t& dims0); + bool EmbeddingReceiveL3Storage(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, + int64_t& dims0); - void EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, int64_t& dims0); + void EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, + int64_t& dims0); bool EmbeddingLookUpL3Storage(const EmbTaskInfo& info, vector& h2dEmb); diff --git a/src/core/l3_storage/cache_manager.cpp b/src/core/l3_storage/cache_manager.cpp index a2cbfb32..188f2aaf 100644 --- a/src/core/l3_storage/cache_manager.cpp +++ b/src/core/l3_storage/cache_manager.cpp @@ -25,7 +25,8 @@ See the License for the specific language governing permissions and using namespace MxRec; -void CacheManager::Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo, shared_ptr level3Storage) +void CacheManager::Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo, + shared_ptr level3Storage) { LOG_INFO("CacheManager Init method begin"); if (level3Storage == nullptr) { @@ -214,7 +215,8 @@ void CacheManager::ProcessSwapOutKeys(const 
string& tableName, const vector& swapInKeys, - vector& DDRToL3StorageKeys, vector& L3StorageToDDRKeys) + vector& DDRToL3StorageKeys, + vector& L3StorageToDDRKeys) { auto& keyMapper = preProcessMapper[tableName]; size_t externalDDRSize = 0; @@ -236,7 +238,8 @@ void CacheManager::ProcessSwapInKeys(const string& tableName, const vector keyMapper.L3StorageAvailableSize()) { - throw invalid_argument("L3Storage table size too small, key quantity exceed while transferring DDR data to L3Storage"); + throw invalid_argument( + "L3Storage table size too small, key quantity exceed while transferring DDR data to L3Storage"); } // DDR--->L3Storage keyMapper.GetAndDeleteLeastFreqDDRKey2L3Storage(transNum, swapInKeys, DDRToL3StorageKeys); diff --git a/src/core/l3_storage/cache_manager.h b/src/core/l3_storage/cache_manager.h index 1571454b..dda4c396 100644 --- a/src/core/l3_storage/cache_manager.h +++ b/src/core/l3_storage/cache_manager.h @@ -50,7 +50,7 @@ namespace MxRec { enum class TransferRet { TRANSFER_OK = 0, // 转移成功或无需处理 TRANSFER_ERROR, - L3Storage_SPACE_NOT_ENOUGH, + L3STORAGE_SPACE_NOT_ENOUGH, DDR_SPACE_NOT_ENOUGH, }; @@ -74,7 +74,8 @@ namespace MxRec { ~CacheManager(); - void Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo, shared_ptr level3Storage); + void Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vector& mgmtEmbInfo, + shared_ptr level3Storage); void Load(const std::vector& mgmtEmbInfo, int step, map>& trainKeySet); @@ -91,7 +92,8 @@ namespace MxRec { SwapOutInfo& info); void ProcessSwapInKeys(const string& tableName, const vector& swapInKeys, - vector& DDRToL3StorageKeys, vector& L3StorageToDDRKeys); + vector& DDRToL3StorageKeys, + vector& L3StorageToDDRKeys); void UpdateL3StorageEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize, vector& keys, const vector& swapOutL3StorageAddrOffs); diff --git a/src/core/l3_storage/l3_storage.cpp b/src/core/l3_storage/l3_storage.cpp index 4eb61c6e..6a3ea668 100644 --- a/src/core/l3_storage/l3_storage.cpp +++ b/src/core/l3_storage/l3_storage.cpp @@ -16,6 +16,7 @@ See the License for the specific language governing permissions and #include "l3_storage.h" using MxRec::L3Storage; +using MxRec::emb_cache_key_t; L3Storage::L3Storage() {} @@ -39,7 +40,7 @@ int64_t L3Storage::GetTableAvailableSpace(const string& tableName) } void L3Storage::InsertEmbeddingsByAddr(const string& tableName, vector& keys, - vector& embeddingsAddr, uint64_t extEmbeddingSize) + vector& embeddingsAddr, uint64_t extEmbeddingSize) { } diff --git a/src/core/l3_storage/l3_storage.h b/src/core/l3_storage/l3_storage.h index 606f2320..6462409f 100644 --- a/src/core/l3_storage/l3_storage.h +++ b/src/core/l3_storage/l3_storage.h @@ -21,7 +21,6 @@ See the License for the specific language governing permissions and #include "utils/common.h" -using MxRec::emb_cache_key_t; using std::string; using std::vector; -- Gitee From 8b389399dc1b70801a2779271fa82092f81b6a6e Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Fri, 7 Jun 2024 03:45:31 +0000 Subject: [PATCH 211/302] cleancode --- src/core/l3_storage/cache_manager.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/l3_storage/cache_manager.h b/src/core/l3_storage/cache_manager.h index dda4c396..3f5b0a22 100644 --- a/src/core/l3_storage/cache_manager.h +++ b/src/core/l3_storage/cache_manager.h @@ -95,7 +95,8 @@ namespace MxRec { vector& DDRToL3StorageKeys, vector& L3StorageToDDRKeys); - void UpdateL3StorageEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize, vector& keys, + void 
UpdateL3StorageEmb(string tableName, float* embPtr, uint32_t extEmbeddingSize,
+                            vector& keys,
                              const vector& swapOutL3StorageAddrOffs);
 
         void TransferDDR2L3Storage(string tableName, uint32_t extEmbeddingSize, vector& keys,
-- 
Gitee

From 167b0e1d9eeadb152ac5996681045b13c3b45eb5 Mon Sep 17 00:00:00 2001
From: yangzhen_BIG
Date: Wed, 12 Jun 2024 08:55:42 +0000
Subject: =?UTF-8?q?!180=20=E6=A0=B7=E4=BE=8B=EF=BC=88?=
 =?UTF-8?q?=E6=8E=A5=E5=85=A5PS=EF=BC=89=EF=BC=9A=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E9=80=82=E9=85=8D=E6=8C=87=E5=AF=BC=20*=20=E6=A0=B7=E4=BE=8B?=
 =?UTF-8?q?=EF=BC=88=E6=8E=A5=E5=85=A5PS=EF=BC=89=EF=BC=9A=E6=B7=BB?=
 =?UTF-8?q?=E5=8A=A0=E9=80=82=E9=85=8D=E6=8C=87=E5=AF=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../ps_adapt_to_mxrec/ps_adapt_to_mxrec.md    | 750 ++++++++++++++++++
 1 file changed, 750 insertions(+)
 create mode 100644 examples/ps_adapt_to_mxrec/ps_adapt_to_mxrec.md

diff --git a/examples/ps_adapt_to_mxrec/ps_adapt_to_mxrec.md b/examples/ps_adapt_to_mxrec/ps_adapt_to_mxrec.md
new file mode 100644
index 00000000..431133b7
--- /dev/null
+++ b/examples/ps_adapt_to_mxrec/ps_adapt_to_mxrec.md
@@ -0,0 +1,750 @@
+# Version Information
+
+1. ps-lite
+
+   [GitHub - dmlc/ps-lite: A lightweight parameter server interface](https://github.com/dmlc/ps-lite)
+
+   commit 11b42c08a357d4ea5924403daa357587f4d8b5e2 (this commit or any later one works)
+
+2. mxRec
+
+   [mxrec: Huawei Ascend MindX Recommendation SDK - Gitee.com](https://gitee.com/ascend/mxrec/tree/develop/)
+
+   commit ae36047f1dda8c03fa849184205bdc8bcfb4a137
+
+**Note: ps-lite does not support multi-table storage, so this document uses a single-table training scenario as its example.**
+
+# Adaptation Workflow
+
+## ps-lite
+
+### Download the ps-lite source
+
+```shell
+# starting from the directory that contains the mxrec repo
+cd mxrec/src
+mkdir 3rdparty
+cd 3rdparty
+git clone https://github.com/dmlc/ps-lite.git
+```
+
+### Modify ps-lite/make/deps.mk
+
+* Keep the downloaded source packages instead of deleting them, which cuts the cost of repeated builds
+* Align the dependency versions with ps-lite/CMakeLists.txt. protobuf 3.8.0 matches TensorFlow 1.15; adjust it to your own TF version if needed.
+
+```makefile
+# protobuf
+PROTOBUF = ${DEPS_PATH}/include/google/protobuf/message.h
+${PROTOBUF}:
+    $(eval FILE=protobuf-cpp-3.8.0.tar.gz)
+    $(eval DIR=protobuf-3.8.0)
+    rm -rf $(DIR)
+    $(WGET) -nc $(URL2)/$(FILE) && tar --no-same-owner -zxf $(FILE)
+    cd $(DIR) && export CFLAGS=-fPIC && export CXXFLAGS=-fPIC && ./configure -prefix=$(DEPS_PATH) && $(MAKE) && $(MAKE) install
+    rm -rf $(DIR)
+
+# zmq
+ZMQ = ${DEPS_PATH}/include/zmq.h
+
+${ZMQ}:
+    $(eval FILE=zeromq-4.3.2.tar.gz)
+    $(eval DIR=zeromq-4.3.2)
+    rm -rf $(DIR)
+    $(WGET) -nc $(URL1)/$(FILE) && tar --no-same-owner -zxf $(FILE)
+    cd $(DIR) && export CFLAGS=-fPIC && export CXXFLAGS=-fPIC && ./configure -prefix=$(DEPS_PATH) --with-libsodium=no --with-libgssapi_krb5=no && $(MAKE) && $(MAKE) install
+    rm -rf $(DIR)
+
+# lz4
+LZ4 = ${DEPS_PATH}/include/lz4.h
+${LZ4}:
+    $(eval FILE=lz4-r129.tar.gz)
+    $(eval DIR=lz4-r129)
+    rm -rf $(DIR)
+    wget -nc $(URL1)/$(FILE) && tar --no-same-owner -zxf $(FILE)
+    cd $(DIR) && $(MAKE) && PREFIX=$(DEPS_PATH) $(MAKE) install
+    rm -rf $(DIR)
+
+# cityhash
+CITYHASH = ${DEPS_PATH}/include/city.h
+${CITYHASH}:
+    $(eval FILE=cityhash-1.1.1.tar.gz)
+    $(eval DIR=cityhash-1.1.1)
+    rm -rf $(DIR)
+    wget -nc $(URL1)/$(FILE)&& tar --no-same-owner -zxf $(FILE)
+    cd $(DIR) && ./configure -prefix=$(DEPS_PATH) --enable-sse4.2 && $(MAKE) CXXFLAGS="-g -O3 -msse4.2" && $(MAKE) install
+    rm -rf $(DIR)
+```
+
+### Install dependencies
+
+* protobuf: the version must match TensorFlow's; search the tensorflow directory for `GOOGLE_PROTOBUF_VERSION` to find the protobuf version
+* zeromq: build it following its GitHub instructions, using the version shown in ps-lite/make/deps.mk
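+
+Before building, it is worth confirming that the locally installed protobuf really matches the one TensorFlow was built against. A minimal check (assuming a pip-installed TensorFlow; the header layout may differ for other builds):
+
+```shell
+# GOOGLE_PROTOBUF_VERSION lives in protobuf's stubs/common.h shipped inside the TF package
+TF_DIR=$(python3 -c "import tensorflow as tf, os; print(os.path.dirname(tf.__file__))")
+grep -rn "GOOGLE_PROTOBUF_VERSION" "${TF_DIR}/include/google/protobuf/stubs/common.h"
+protoc --version  # the locally installed protobuf, for comparison
+```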
+
+### Prepare the KVServerMxRecHandle source
+
+Add the following code to ps-lite/include/ps/kv_app.h:
+
+```c++
+// headers used by this handle, in case kv_app.h does not already pull them in
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+
+/**
+ * \brief for mxrec embedding storage
+ */
+template <typename Val>
+struct KVServerMxRecHandle {
+    void operator()(
+        const KVMeta& req_meta, const KVPairs<Val>& req_data, KVServer<Val>* server) {
+        LL << "KVServerMxRecHandle, customerId:" << req_meta.customer_id
+           << ", push:" << req_meta.push << ", pull:" << req_meta.pull;
+        auto es = std::getenv("EMB_SIZE");
+        if (es == nullptr) {
+            throw std::runtime_error("EMB_SIZE environment variable not found, please export");
+        }
+        int embeddingSize = std::stoi(es);
+        size_t keyCnt = req_data.keys.size();
+        KVPairs<Val> res;
+
+        if (req_meta.pull) {
+            LL << "pull, customerId:" << req_meta.customer_id << ", keys.size:" << keyCnt
+               << ", embeddingSize:" << embeddingSize;
+            res.keys = req_data.keys;
+            res.vals.resize(keyCnt * embeddingSize);  // flatten all data
+            for (size_t i = 0; i < keyCnt; ++i) {
+                Key key = req_data.keys[i];
+                std::vector<Val> emb = store[key];
+                if (emb.size() == 0) {
+                    emb = std::vector<Val>(embeddingSize, 0);
+                } else if (emb.size() != static_cast<size_t>(embeddingSize)) {
+                    throw std::runtime_error("embedding size in server not equal to request");
+                }
+                for (int j = 0; j < embeddingSize; j++) {
+                    res.vals[i * embeddingSize + j] = emb[j];
+                }
+            }
+        } else if (req_meta.push) {
+            LL << "push, customerId:" << req_meta.customer_id << ", keys.size:" << keyCnt
+               << ", vals.size:" << req_data.vals.size() << ", embeddingSize:" << embeddingSize;
+            for (size_t i = 0; i < keyCnt; i++) {
+                Key key = req_data.keys[i];
+                std::vector<Val> tmp(embeddingSize);
+                for (int j = 0; j < embeddingSize; j++) {
+                    // copy from the request payload (not from the still-empty response)
+                    tmp[j] = req_data.vals[i * embeddingSize + j];
+                }
+                store[key] = tmp;
+            }
+        } else {
+            LL << "error: request neither push nor pull";
+            throw std::runtime_error("request neither push nor pull");
+        }
+
+        server->Response(req_meta, res);
+    }
+    std::unordered_map<Key, std::vector<Val>> store;
+};
+```
+
+### Prepare the scheduler, server, and worker sources
+
+* ps-lite/tests/test_scheduler.cc
+
+  ```c++
+  #include <string>
+  #include "ps/ps.h"
+
+  using namespace ps;
+
+  void RunScheduler(int appId) {
+      // start system
+      LL << "start scheduler, appId:" << appId;
+      Start(appId);
+      Finalize(appId, true);
+      LL << "quit scheduler, appId:" << appId;
+  }
+
+  int main(int argc, char *argv[]) {
+      int appId = std::stoi(argv[1]);
+      RunScheduler(appId);
+      return 0;
+  }
+  ```
+
+* ps-lite/tests/test_server.cc
+
+  ```c++
+  #include <string>
+  #include "ps/ps.h"
+
+  using namespace ps;
+
+  void StartServer(int serverId) {
+      if (!IsServer()) {
+          return;
+      }
+      auto server = new KVServer<float>(serverId);
+      server->set_request_handle(KVServerMxRecHandle<float>());
+      RegisterExitCallback([server]() { delete server; });
+  }
+
+  void RunServer(int appId) {
+      LL << "start server, appId:" << appId;
+      Start(appId);
+      StartServer(appId);
+      // stop system
+      Finalize(appId, true);
+      LL << "quit server, appId:" << appId;
+  }
+
+  int main(int argc, char *argv[]) {
+      int appId = std::stoi(argv[1]);
+      RunServer(appId);
+      return 0;
+  }
+  ```
+
+* ps-lite/tests/test_worker.cc
+
+  ```c++
+  #include <cmath>
+  #include <cstdlib>
+  #include <limits>
+  #include <string>
+  #include <thread>
+  #include <vector>
+  #include "ps/ps.h"
+
+  using namespace ps;
+  using std::vector;
+
+  void RunWorker(int appId, int customerId) {
+      LL << "start worker, appId:" << appId << ", customerId:" << customerId;
+      Start(appId);
+      if (!IsWorker()) {
+          return;
+      }
+      KVWorker<float> kv(appId, customerId);
+
+      // init
+      int num = 10000;
+      int embSize = 2;
+      vector<Key> keys(num);
+      vector<float> vals(num * embSize);
+      int rank = MyRank();
+      srand(rank + 7);
+      for (int i = 0; i < num; ++i) {
+          keys[i] = kMaxKey / num * i + customerId;
+          for (int j = 0; j < embSize; ++j) {
+              vals[i * embSize + j] = rand() % 1000;
+          }
+      }
+
+      // push
+      LL << "start push";
+      kv.Wait(kv.Push(keys, vals));
+
+      // pull
+      LL << "start pull";
+      std::vector<float> rets;
+      kv.Wait(kv.Pull(keys, &rets));
+
+      LL << "start validation";
+      for (int i = 0; i < num; ++i) {
+          for (int j = 0; j < embSize; ++j) {
+              if (std::abs(vals[i * embSize + j] - rets[i * embSize + j]) > std::numeric_limits<float>::epsilon()) {
+                  LL << "error: embedding from server not equal to original data";
+                  Finalize(appId, true);
+                  return;
+              }
+          }
+      }
+
+      // stop system
+      Finalize(appId, true);
+      LL << "stop worker, appId:" << appId << ", customerId:" << customerId;
+  }
+
+  int main(int argc, char *argv[]) {
+      int customerId = std::stoi(argv[1]);
+      std::thread t0(RunWorker, 0, customerId);
+      t0.join();
+      return 0;
+  }
+  ```
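+
+ps-lite's default request slicer assumes the key list of every Push/Pull is sorted in ascending order; the test worker above satisfies this by construction, since its keys are generated monotonically. A tiny guard you could add to your own worker code before any request (a sketch, not part of ps-lite):
+
+```c++
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+// Returns true when `keys` respects the ascending order ps-lite expects.
+bool KeysAreOrdered(const std::vector<uint64_t>& keys) {
+    return std::is_sorted(keys.begin(), keys.end());
+}
+```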
+
+### Modify ps-lite/tests/CMakeLists.txt
+
+Replace its contents with the following:
+
+```cmake
+add_executable(test_scheduler test_scheduler.cc)
+target_link_libraries(test_scheduler pslite)
+
+add_executable(test_server test_server.cc)
+target_link_libraries(test_server pslite)
+
+add_executable(test_worker test_worker.cc)
+target_link_libraries(test_worker pslite)
+```
+
+### Modify ps-lite/CMakeLists.txt
+
+Add the following line:
+
+```cmake
+target_link_libraries(pslite PUBLIC pthread)
+```
+
+### Build the scheduler, server, and worker
+
+Run in the ps-lite directory:
+
+```shell
+mkdir build
+cd build
+cmake ..
+make -j4
+```
+
+### Prepare the scheduler, server, and worker launch scripts
+
+* ps-lite/start_service.sh
+
+  ```shell
+  #!/bin/bash
+  # set -x
+  if [ $# -lt 2 ]; then
+      echo "usage: $0 bin_scheduler bin_server"
+      exit -1;
+  fi
+
+  export DMLC_NUM_SERVER=1
+  export DMLC_NUM_WORKER=1
+  # embedding dimension served by KVServerMxRecHandle; must match the worker
+  export EMB_SIZE=2
+  bin_scheduler=$1
+  bin_server=$2
+
+  # start the scheduler
+  export DMLC_PS_ROOT_URI='127.0.0.1'
+  export DMLC_ROLE='scheduler'
+  export DMLC_PS_ROOT_PORT=8000
+  ${bin_scheduler} 0 &
+
+  # start servers
+  export DMLC_ROLE='server'
+  ${bin_server} 0 &
+
+  wait
+  ```
+
+* ps-lite/start_worker.sh
+
+  ```shell
+  #!/bin/bash
+  # set -x
+  if [ $# -lt 1 ]; then
+      echo "usage: $0 bin_worker"
+      exit -1;
+  fi
+
+  export DMLC_NUM_SERVER=1
+  export DMLC_NUM_WORKER=1
+  bin_worker=$1
+
+  # scheduler info
+  export DMLC_PS_ROOT_URI='127.0.0.1'
+  export DMLC_PS_ROOT_PORT=8000
+  export DMLC_ROLE='worker'
+  ${bin_worker} 0 &
+
+  wait
+  ```
+
+### Build ps-lite
+
+In the ps-lite directory:
+
+```shell
+mkdir build
+cd build
+cmake ..
+make -j8
+```
+
+### Verify basic functionality
+
+Copy the built test binaries into the ps-lite directory, then run:
+
+```shell
+# run these in separate terminals
+./start_service.sh ./test_scheduler ./test_server
+./start_worker.sh ./test_worker
+```
+
+If no errors are reported, the setup works.
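+
+If a process hangs at startup instead of failing, the usual cause is a mismatch between the declared node counts and the processes actually launched: ps-lite blocks in `Start()` until the scheduler has registered every declared node. A quick way to inspect a single-host setup (assumptions as in the scripts above):
+
+```shell
+env | grep DMLC_      # compare against start_service.sh / start_worker.sh
+ss -lntp | grep 8000  # confirm the scheduler is listening on DMLC_PS_ROOT_PORT
+```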
+
+## mxrec
+
+### Adjust ps-lite
+
+1. Delete ps-lite/build
+2. In ps-lite/CMakeLists.txt, comment out `add_subdirectory(tests)`
+
+Search for the following code fragments, then add or replace the source as shown.
+
+### src/build.sh
+
+```shell
+cmake -DCMAKE_BUILD_TYPE=Release \
+      -DTF_PATH="$1" \
+      -DOMPI_PATH="$(whereis openmpi)" \
+      -DPYTHON_PATH="$python_path" \
+      -DEASY_PROFILER_PATH=/ \
+      -DASCEND_PATH="$ascend_path" \
+      -DABSEIL_PATH="$1" \
+      -DSECUREC_PATH="$2"/../opensource/securec \
+      -DCMAKE_INSTALL_PREFIX="$2"/output \
+      -DBUILD_CUST="$3" \
+      -DDEPS_PATH="$2"/src/3rdparty/ps-lite .. # new
+```
+
+### src/CMakeLists.txt
+
+```cmake
+add_subdirectory(dataset_tf)
+add_subdirectory(3rdparty/ps-lite) # new; matches the clone location under src/3rdparty
+```
+
+### src/core/CMakeLists.txt
+
+```cmake
+file(GLOB_RECURSE MXREC_SRC ./*.cpp ./*.h)
+add_library(ASC SHARED ${MXREC_SRC})
+
+target_include_directories(ASC PUBLIC ../3rdparty/ps-lite/include) # new
+```
+
+```cmake
+target_link_libraries(ASC PUBLIC ascendcl msprofiler ge_executor gert runtime ge_common register graph ascend_protobuf
+        profapi opt_feature error_manager exe_graph acl_tdt_channel acl_tdt_queue securec drvdsmi_host _ock_ctr_common
+        pslite # new
+)
+```
+
+### src/core/ps_store/ps_store.h **(new)**
+
+```c++
+#ifndef MXREC_PS_STORE_H
+#define MXREC_PS_STORE_H
+
+#include <map>
+#include <memory>
+
+#include "l3_storage/l3_storage.h"
+#include "ps/ps.h"  // must be included after any mxrec header file, otherwise compilation fails
+
+using MxRec::L3Storage;
+using ps::KVWorker;
+using std::map;
+using std::shared_ptr;
+using std::string;
+
+namespace MxRec {
+class PSStore : public L3Storage {
+public:
+    PSStore(int rankId);
+
+    bool IsTableExist(const string& tableName);
+
+    bool IsKeyExist(const string& tableName, emb_cache_key_t key);
+
+    void CreateTable(const string& tableName, vector<string> savePaths, uint64_t maxTableSize);
+
+    int64_t GetTableAvailableSpace(const string& tableName);
+
+    void InsertEmbeddingsByAddr(const string& tableName, vector<emb_cache_key_t>& keys,
+                                vector<float*>& embeddingsAddr, uint64_t extEmbeddingSize);
+
+    void DeleteEmbeddings(const string& tableName, vector<emb_cache_key_t>& keys);
+
+    vector<vector<float>> FetchEmbeddings(const string& tableName, vector<emb_cache_key_t>& keys);
+
+    void Save(int step);
+
+    void Load(const string& tableName, vector<string> savePaths, uint64_t maxTableSize, int step);
+
+    void Start();
+
+    void Stop();
+
+    int64_t GetTableUsage(const string& tableName);
+
+    vector<std::pair<string, vector<emb_cache_key_t>>> ExportTableKey();
+
+private:
+    // ps-lite does not support multiple tables yet, so this example uses a single client
+    int appId = 0;
+    int customerId = 0;
+
+    // table --> client
+    map<string, shared_ptr<KVWorker<float>>> cliMap;
+};
+}  // namespace MxRec
+#endif  // MXREC_PS_STORE_H
+```
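+
+Both `InsertEmbeddingsByAddr` and `FetchEmbeddings` in the implementation below repeat the same reorder step, because ps-lite wants ascending keys while mxRec hands them over in arbitrary order. If you extend this example, the step can be factored into one helper; a sketch (`SortKeysWithIndex` is not part of the mxRec or ps-lite APIs):
+
+```c++
+#include <algorithm>
+#include <cstdint>
+#include <numeric>
+#include <vector>
+
+// Sorts `keys` ascending in place and returns the permutation that maps each
+// sorted position back to the caller's original index.
+std::vector<size_t> SortKeysWithIndex(std::vector<uint64_t>& keys) {
+    std::vector<size_t> perm(keys.size());
+    std::iota(perm.begin(), perm.end(), 0);
+    std::sort(perm.begin(), perm.end(),
+              [&keys](size_t a, size_t b) { return keys[a] < keys[b]; });
+    std::sort(keys.begin(), keys.end());
+    return perm;
+}
+```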
+
+### src/core/ps_store/ps_store.h **(new)**
+
+```c++
+#ifndef MXREC_PS_STORE_H
+#define MXREC_PS_STORE_H
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "l3_storage/l3_storage.h"
+#include "ps/ps.h"  // must be included after the mxRec headers, otherwise compilation fails
+
+using MxRec::L3Storage;
+using ps::KVWorker;
+using std::map;
+using std::pair;
+using std::shared_ptr;
+using std::string;
+using std::vector;
+
+namespace MxRec {
+class PSStore : public L3Storage {
+public:
+    explicit PSStore(int rankId);
+
+    bool IsTableExist(const string& tableName);
+
+    bool IsKeyExist(const string& tableName, emb_cache_key_t key);
+
+    void CreateTable(const string& tableName, vector<string> savePaths, uint64_t maxTableSize);
+
+    int64_t GetTableAvailableSpace(const string& tableName);
+
+    void InsertEmbeddingsByAddr(const string& tableName, vector<emb_cache_key_t>& keys,
+        vector<uint8_t*>& embeddingsAddr, uint64_t extEmbeddingSize);
+
+    void DeleteEmbeddings(const string& tableName, vector<emb_cache_key_t>& keys);
+
+    vector<vector<float>> FetchEmbeddings(const string& tableName, vector<emb_cache_key_t>& keys);
+
+    void Save(int step);
+
+    void Load(const string& tableName, vector<string> savePaths, uint64_t maxTableSize, int step);
+
+    void Start();
+
+    void Stop();
+
+    int64_t GetTableUsage(const string& tableName);
+
+    vector<pair<string, vector<emb_cache_key_t>>> ExportTableKey();
+
+private:
+    // ps-lite does not support multiple tables yet, so this example code uses only one client
+    int appId = 0;
+    int customerId = 0;
+
+    // table --> client
+    map<string, shared_ptr<KVWorker<float>>> cliMap;
+};
+}  // namespace MxRec
+#endif  // MXREC_PS_STORE_H
+```
+
+### src/core/ps_store/ps_store.cpp **(new)**
+
+```c++
+#include "ps_store.h"
+
+#include <algorithm>
+#include <cstdlib>
+
+using MxRec::PSStore;
+using MxRec::emb_cache_key_t;
+using std::make_shared;
+using std::runtime_error;
+
+struct KeyWithIdx {
+    emb_cache_key_t key;
+    size_t index;
+};
+
+bool CompareKeyWithIdx(KeyWithIdx a, KeyWithIdx b)
+{
+    return a.key < b.key;
+}
+
+PSStore::PSStore(int rankId)
+{
+    this->customerId = rankId + std::stoi(std::getenv("REC_WORKER_ID_START_IDX"));
+}
+
+bool PSStore::IsTableExist(const string& tableName)
+{
+    auto iter = cliMap.find(tableName);
+    if (iter == cliMap.end()) {
+        return false;
+    }
+    return true;
+}
+
+bool PSStore::IsKeyExist(const string& tableName, emb_cache_key_t key)
+{
+    auto iter = cliMap.find(tableName);
+    if (iter == cliMap.end()) {
+        LOG_DEBUG("table:{} not created yet", tableName);
+        throw runtime_error("table not created yet");
+    }
+
+    auto worker = cliMap[tableName];
+    vector<emb_cache_key_t> keys = {key};
+    vector<float> rets;
+    worker->Wait(worker->Pull(keys, &rets));
+    if (rets.size() > 0) {
+        return true;
+    }
+    return false;
+}
+
+void PSStore::CreateTable(const string& tableName, vector<string> savePaths, uint64_t maxTableSize)
+{
+    static bool alreadyCreate = false;
+    if (alreadyCreate) {
+        throw runtime_error("ps-lite does not support multiple tables yet, "
+            "so this example code only supports one table");
+    }
+    LOG_DEBUG("start create table:{}, init ps-lite client, appId:{}, customerId:{}", tableName, appId, customerId);
+    ps::Start(appId);
+    auto worker = make_shared<KVWorker<float>>(appId, customerId);
+    cliMap[tableName] = worker;
+    LOG_DEBUG("finish create table:{}, worker appId:{}, customerId:{}", tableName, appId, customerId);
+    alreadyCreate = true;
+}
+
+int64_t PSStore::GetTableAvailableSpace(const string& tableName)
+{
+    // ps-lite does not expose capacity, so always report plenty of space
+    return 1000000000000;
+}
+
+void PSStore::InsertEmbeddingsByAddr(const string& tableName, vector<emb_cache_key_t>& keys,
+    vector<uint8_t*>& embeddingsAddr, uint64_t extEmbeddingSize)
+{
+    if (keys.size() == 0) {
+        return;
+    }
+
+    auto iter = cliMap.find(tableName);
+    if (iter == cliMap.end()) {
+        LOG_DEBUG("table:{} not created yet", tableName);
+        throw runtime_error("table not created yet");
+    }
+    auto psCli = cliMap[tableName];
+
+    // note: ps-lite needs the keys in ascending order
+    vector<KeyWithIdx> elements;
+    for (size_t i = 0; i < keys.size(); i++) {
+        KeyWithIdx e = {keys[i], i};
+        elements.push_back(e);
+    }
+    sort(elements.begin(), elements.end(), CompareKeyWithIdx);
+    vector<emb_cache_key_t> sortedKeys;
+    vector<uint8_t*> sortedEmbeddingsAddr;
+    for (size_t i = 0; i < elements.size(); i++) {
+        sortedKeys.push_back(elements[i].key);
+        sortedEmbeddingsAddr.push_back(embeddingsAddr[elements[i].index]);
+    }
+
+    vector<int> lens(keys.size(), extEmbeddingSize);
+    vector<float> vals(embeddingsAddr.size() * extEmbeddingSize);
+    for (size_t i = 0; i < embeddingsAddr.size(); i++) {
+        auto rc = memcpy_s(vals.data() + i * extEmbeddingSize, extEmbeddingSize * sizeof(float),
+            sortedEmbeddingsAddr[i], extEmbeddingSize * sizeof(float));
+        if (rc != 0) {
+            throw runtime_error("copy embedding data failed");
+        }
+    }
+
+    LOG_DEBUG("start push to server, table:{}, keys.size:{}, vals.size:{}", tableName, keys.size(), vals.size());
+    // push the sorted keys so that they stay aligned with the sorted values
+    int timeStamp = psCli->Push(sortedKeys, vals);
+    psCli->Wait(timeStamp);
+
+    LOG_DEBUG("end push embedding to server, table:{}", tableName);
+}
+
+void PSStore::DeleteEmbeddings(const string& tableName, vector<emb_cache_key_t>& keys)
+{
+    LOG_WARN("ps-lite does not have a delete function, just return");
+    return;
+}
+
+vector<vector<float>> PSStore::FetchEmbeddings(const string& tableName, vector<emb_cache_key_t>& keys)
+{
+    LOG_DEBUG("start pull embedding from server, table:{}, keys.size:{}", tableName, keys.size());
+    if (keys.size() == 0) {
+        return vector<vector<float>>();
+    }
+
+    auto iter = cliMap.find(tableName);
+    if (iter == cliMap.end()) {
+        LOG_DEBUG("table:{} not created yet", tableName);
+        throw runtime_error("table not created yet");
+    }
+    auto psCli = cliMap[tableName];
+
+    // note: ps-lite needs the keys in ascending order
+    vector<KeyWithIdx> elements;
+    for (size_t i = 0; i < keys.size(); i++) {
+        KeyWithIdx e = {keys[i], i};
+        elements.push_back(e);
+    }
+    sort(elements.begin(), elements.end(), CompareKeyWithIdx);
+    vector<emb_cache_key_t> sortedKeys;
+    for (size_t i = 0; i < elements.size(); i++) {
+        sortedKeys.push_back(elements[i].key);
+    }
+
+    // input lens will be stuck at req_data.lens, so we use an environment variable to work around it
+    std::vector<float> rets;
+    psCli->Wait(psCli->Pull(sortedKeys, &rets));
+
+    LOG_DEBUG("finish pull embedding, table:{}, embedding len:{}", tableName, rets.size());
+    if (rets.size() % keys.size() != 0) {
+        LOG_ERROR("can't split received embedding equally, keys.size:{}, embeddings.size:{}",
+            keys.size(), rets.size());
+        throw runtime_error("embedding from server incomplete");
+    }
+
+    auto extEmbSize = rets.size() / keys.size();
+    vector<vector<float>> embs(keys.size());
+    for (size_t i = 0; i < elements.size(); i++) {
+        auto& emb = embs[elements[i].index];
+        emb.insert(emb.cbegin(), rets.cbegin() + i * extEmbSize, rets.cbegin() + (i + 1) * extEmbSize);
+    }
+
+    LOG_DEBUG("end pull embedding from server, table:{}", tableName);
+    return embs;
+}
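+
+// ps-lite has no built-in persistence or table statistics, so the remaining
+// L3Storage hooks below are intentionally no-ops in this example; a production
+// backend would need to implement them.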
LOG_WARN("ps-lite don't have save function, just return"); +} + +void PSStore::Load(const string& tableName, vector savePaths, uint64_t maxTableSize, int step) +{ + LOG_WARN("ps-lite don't have save function, just return"); +} + +void PSStore::Start() +{ + LOG_INFO("start ps store"); +} + +void PSStore::Stop() +{ + LOG_INFO("start stop ps store"); + ps::Finalize(appId, true); + LOG_INFO("finish stop ps store"); +} + +int64_t PSStore::GetTableUsage(const string& tableName) +{ + LOG_WARN("ps-lite don't have GetTableUsage function, just return 0"); + return 0; +} + +vector>> PSStore::ExportTableKey() +{ + LOG_WARN("ps-lite don't have export key function, just return empty result"); + return vector>>(); +} +``` + +### src/core/hybrid_mgmt/hybrid_mgmt.cpp + +```c++ +#include "ps_store/ps_store.h" // new +``` + +```c++ +if (isL3StorageEnabled) { + cacheManager = Singleton::GetInstance(); + // 用户可实现L3Storage接口替换SSDEngine以对接外部存储服务 + auto psStore = std::make_shared(mgmtRankInfo.rankId); // replace + cacheManager->Init(embCache, mgmtEmbInfo, psStore); // replace + EmbeddingMgmt::Instance()->SetCacheManagerForEmbTable(cacheManager); +} +``` + +### 模型代码 + +以dcnV2为例,在run.sh中新增以下环境变量。 + +```shell +# ps-lite info +export DMLC_NUM_SERVER=1 +export DMLC_NUM_WORKER=8 # ausume we run 8 train process + +# scheduler info +export DMLC_PS_ROOT_URI='127.0.0.1' # user can set to remote server +export DMLC_PS_ROOT_PORT=8000 + +# set role as workers +export DMLC_ROLE='worker' + +# mark worker id for train process between multiple train server +# e.g. server A, worker id range [REC_WORKER_ID_START_IDX, +1, ..., +n]; server B, worker id range [REC_WORKER_ID_START_IDX +(n+1), +(n+2), ...] +export REC_WORKER_ID_START_IDX=0 +``` + +在ps-lite目录拉起存储服务 + +```shell +./start_service.sh ./test_schedular ./test_server +``` + +在模型目录拉起训练 + +```shell +# 修改缓存模式为SSD(按上述mxrec源码修改步骤,SSDEngine已被替换为ps-lite,为了不影响对外接口,未修改对外暴露的ssd参数,用户可自行修改) +export CACHE_MODE="SSD" + +./run.sh $LIBSAC_PATH $PYTHON_PATH $HCCL_JSON_PATH $DATA_PATH +``` + + + + + -- Gitee From 3ea537c40fa4f58edd27dff102f135a9ab940647 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 11 Jun 2024 14:33:41 +0800 Subject: [PATCH 213/302] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E8=B0=83=E7=94=A8hdf?= =?UTF-8?q?sRead=E3=80=81hdfsWrite=E6=8E=A5=E5=8F=A3=E7=9A=84=E5=BE=AA?= =?UTF-8?q?=E7=8E=AF=E6=AC=A1=E6=95=B0=E9=99=90=E5=88=B6=EF=BC=8C=E6=95=B4?= =?UTF-8?q?=E6=94=B9=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/checkpoint/checkpoint.cpp | 20 ++++++------ src/core/emb_table/embedding_ddr.cpp | 4 +-- src/core/emb_table/embedding_dynamic.cpp | 19 ++++++------ src/core/emb_table/embedding_static.cpp | 19 ++++++------ .../hdfs_file_system/hdfs_file_system.cpp | 31 ++++++++++--------- .../hdfs_file_system/hdfs_wrapper.h | 16 +++------- .../local_file_system/local_file_system.cpp | 4 +-- src/core/ssd_engine/table.cpp | 4 +-- src/core/utils/common.h | 3 -- 9 files changed, 56 insertions(+), 64 deletions(-) diff --git a/src/core/checkpoint/checkpoint.cpp b/src/core/checkpoint/checkpoint.cpp index 8a6750d5..abd3a10e 100644 --- a/src/core/checkpoint/checkpoint.cpp +++ b/src/core/checkpoint/checkpoint.cpp @@ -210,13 +210,13 @@ void Checkpoint::WriteStream(CkptTransData& transData, const string& dataDir, si } if (writeBytesNum == -1) { - throw runtime_error(StringFormat("Error: Save data failed. data type: {}. 
" - "An error occurred while writing file: {}.", dataType, dataDir)); + throw runtime_error(StringFormat("Error: Save data failed. data type: %d. " + "An error occurred while writing file: %s.", dataType, dataDir.c_str())); } if (writeBytesNum != dataSize) { - throw runtime_error(StringFormat("Error: Save data failed. data type: {} ." - "Expected to write {} bytes, but actually write {} bytes to file {}.", - dataType, dataSize, writeBytesNum, dataDir)); + throw runtime_error(StringFormat("Error: Save data failed. data type: %d ." + "Expected to write %d bytes, but actually write %d bytes to file %s.", + dataType, dataSize, writeBytesNum, dataDir.c_str())); } } @@ -334,13 +334,13 @@ void Checkpoint::ReadStream(CkptTransData& transData, } if (readBytesNum == -1) { - throw runtime_error(StringFormat("Error: Load data failed. data type: {} ." - "An error occurred while reading file: {}.", dataType, dataDir)); + throw runtime_error(StringFormat("Error: Load data failed. data type: %d ." + "An error occurred while reading file: %s.", dataType, dataDir.c_str())); } if (readBytesNum != datasetSize) { - throw runtime_error(StringFormat("Error: Load data failed. data type: {} ." - "Expected to read {} bytes, but actually read {} bytes to file {}.", - dataType, datasetSize, readBytesNum, dataDir)); + throw runtime_error(StringFormat("Error: Load data failed. data type: %d ." + "Expected to read %d bytes, but actually read %d bytes to file %s.", + dataType, datasetSize, readBytesNum, dataDir.c_str())); } } diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 257238b8..167894e5 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -115,8 +115,8 @@ void EmbeddingDDR::LoadKey(const string &savePath, vector &keys } if (result != fileSize) { free(static_cast(buf)); - throw runtime_error(StringFormat("Error: Load keys failed. Expected to read {} bytes, " - "but actually read {} bytes to file {}.", fileSize, result, ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. Expected to read %d bytes, " + "but actually read %d bytes to file %s.", fileSize, result, ss.str().c_str())); } hostLoadOffset.clear(); diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index a69cf930..7f8cd7e5 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -153,11 +153,11 @@ void EmbeddingDynamic::SaveKey(const string& savePath) ssize_t res = fileSystemPtr_->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { throw runtime_error(StringFormat("Error: Save keys failed. " - "An error occurred while writing file: {}.", ss.str())); + "An error occurred while writing file: %s.", ss.str().c_str())); } if (res != writeSize) { - throw runtime_error(StringFormat("Error: Save keys failed. Expected to write {} bytes, " - "but actually write {} bytes to file {}.", writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. Expected to write %d bytes, " + "but actually write %d bytes to file %s.", writeSize, res, ss.str().c_str())); } } @@ -258,23 +258,24 @@ void EmbeddingDynamic::LoadKey(const string& savePath) } size_t fileSize = fileSystemPtr_->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); + throw runtime_error(StringFormat("Error: Load keys failed. 
" + "file %s size %d is too big.", ss.str().c_str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { throw runtime_error(StringFormat("Error: Load keys failed. " - "failed to allocate {} bytes using malloc.", fileSize)); + "failed to allocate %d bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr_->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { throw runtime_error(StringFormat("Error: Load keys failed. " - "An error occurred while reading file: {}.", ss.str())); + "An error occurred while reading file: %s.", ss.str().c_str())); } if (res != fileSize) { - throw runtime_error(StringFormat("Error: Load keys failed. Expected to read {} bytes, " - "but actually read {} bytes to file {}.", fileSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. Expected to read %d bytes, " + "but actually read %d bytes to file %s.", fileSize, res, ss.str().c_str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -293,7 +294,7 @@ void EmbeddingDynamic::LoadKey(const string& savePath) aclError ret = aclrtMalloc(&newBlock, static_cast(datasetSize), ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { throw runtime_error(StringFormat("Error: in dynamic expansion mode, " - "aclrtMalloc failed, malloc size: {}.", datasetSize)); + "aclrtMalloc failed, malloc size: %d.", datasetSize)); } // 此处的 newBlock -> first address; // 对key_offset map 进行一个恢复操作 diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp index ab66a42c..61874b1f 100644 --- a/src/core/emb_table/embedding_static.cpp +++ b/src/core/emb_table/embedding_static.cpp @@ -99,11 +99,11 @@ void EmbeddingStatic::SaveKey(const string& savePath) ssize_t res = fileSystemPtr_->Write(ss.str(), reinterpret_cast(deviceKey.data()), writeSize); if (res == -1) { throw runtime_error(StringFormat("Error: Save keys failed. " - "An error occurred while writing file: {}.", ss.str())); + "An error occurred while writing file: %s.", ss.str().c_str())); } if (res != writeSize) { - throw runtime_error(StringFormat("Error: Save keys failed. Expected to write {} bytes, " - "but actually write {} bytes to file {}.", writeSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Save keys failed. Expected to write %d bytes, " + "but actually write %d bytes to file %s.", writeSize, res, ss.str().c_str())); } } @@ -122,23 +122,24 @@ void EmbeddingStatic::LoadKey(const string& savePath) } size_t fileSize = fileSystemPtr_->GetFileSize(ss.str()); if (fileSize >= FILE_MAX_SIZE) { - throw runtime_error(StringFormat("Error: Load keys failed. file {} size {} is too big.", ss.str(), fileSize)); + throw runtime_error(StringFormat("Error: Load keys failed. " + "file %s size %d is too big.", ss.str().c_str(), fileSize)); } int64_t* buf = static_cast(malloc(fileSize)); if (buf == nullptr) { throw runtime_error(StringFormat("Error: Load keys failed. " - "failed to allocate {} bytes using malloc.", fileSize)); + "failed to allocate %d bytes using malloc.", fileSize)); } ssize_t res = fileSystemPtr_->Read(ss.str(), reinterpret_cast(buf), fileSize); if (res == -1) { throw runtime_error(StringFormat("Error: Load keys failed. " - "An error occurred while reading file: {}.", ss.str())); + "An error occurred while reading file: %s.", ss.str().c_str())); } if (res != fileSize) { - throw runtime_error(StringFormat("Error: Load keys failed. 
Expected to read {} bytes, " - "but actually read {} bytes to file {}.", fileSize, res, ss.str())); + throw runtime_error(StringFormat("Error: Load keys failed. Expected to read %d bytes, " + "but actually read %d bytes to file %s.", fileSize, res, ss.str().c_str())); } size_t loadKeySize = fileSize / sizeof(int64_t); @@ -154,7 +155,7 @@ void EmbeddingStatic::LoadKey(const string& savePath) if (loadOffset.size() > devVocabSize) { free(static_cast(buf)); - throw runtime_error(StringFormat("Error: Load keys failed. Load key size :{} exceeds device vocab size: {}.", + throw runtime_error(StringFormat("Error: Load keys failed. Load key size :%d exceeds device vocab size: %d.", loadOffset.size(), devVocabSize)); } diff --git a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp index 3cbf4a44..45c50f6f 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp +++ b/src/core/file_system/hdfs_file_system/hdfs_file_system.cpp @@ -53,7 +53,7 @@ size_t HdfsFileSystem::GetFileSize(const string& filePath) { hdfsFileInfo* fileInfo = hdfs->GetPathInfo(fs, filePath.c_str()); if (fileInfo == nullptr) { - throw runtime_error(StringFormat("Error: Unable to get hdfs file info : {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: Unable to get hdfs file info : %s.", filePath.c_str())); } auto fileSize = static_cast(fileInfo->mSize); return fileSize; @@ -63,7 +63,7 @@ ssize_t HdfsFileSystem::Write(const string& filePath, const char* fileContent, s { hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { - throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: Unable to open hdfs file : %s.", filePath.c_str())); } tSize writeBytesNum = 0; @@ -82,13 +82,13 @@ ssize_t HdfsFileSystem::Write(const string& filePath, vector>& fil { hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { - throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: Unable to open hdfs file : %s.", filePath.c_str())); } tSize writeBytesNum = 0; size_t loops = fileContent.size(); for (size_t i = 0; i < loops; i++) { - tSize res = hdfs->Write(fs, file, reinterpret_cast(&fileContent[i]), dataSize * sizeof(float)); + tSize res = hdfs->Write(fs, file, fileContent[i].data(), dataSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); return static_cast(res); @@ -110,7 +110,7 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding { hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_WRONLY | O_CREAT, 0, 0, 0); if (!file) { - throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: Unable to open hdfs file : %s.", filePath.c_str())); } #ifndef GTEST @@ -136,13 +136,13 @@ void HdfsFileSystem::WriteEmbedding(const string& filePath, const int& embedding tSize res = hdfs->Write(fs, file, row.data(), embeddingSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); - throw runtime_error(StringFormat("Error: An error occurred while writing file: {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: An error occurred while writing file: %s.", filePath.c_str())); } if (res != embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); - throw 
runtime_error(StringFormat("Error: Expected to write {} bytes, " - "but actually write {} bytes to file {}.", + throw runtime_error(StringFormat("Error: Expected to write %d bytes, " + "but actually write %d bytes to file %s.", embeddingSize * sizeof(float), res, filePath.c_str())); } } @@ -154,10 +154,11 @@ ssize_t HdfsFileSystem::Read(const string& filePath, char* fileContent, size_t d { hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { - throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: Unable to open hdfs file : %s.", filePath.c_str())); } tSize readBytesNum = 0; + LOG_INFO("Start to read file : {}", filePath); tSize res = hdfs->Read(fs, file, fileContent, datasetSize); if (res == -1) { hdfs->CloseFile(fs, file); @@ -174,7 +175,7 @@ ssize_t HdfsFileSystem::Read(const string& filePath, vector>& file { hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { - throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: Unable to open hdfs file : %s.", filePath.c_str())); } ssize_t readBytesNum = 0; @@ -208,7 +209,7 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em #ifndef GTEST hdfsFile file = hdfs->OpenFile(fs, filePath.c_str(), O_RDONLY, 0, 0, 0); if (!file) { - throw runtime_error(StringFormat("Error: Unable to open hdfs file : {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: Unable to open hdfs file : %s.", filePath.c_str())); } auto res = aclrtSetDevice(static_cast(deviceId)); @@ -223,19 +224,19 @@ void HdfsFileSystem::ReadEmbedding(const string& filePath, EmbeddingSizeInfo& em int seekRes = hdfs->Seek(fs, file, offset * embedSizeInfo.embeddingSize * sizeof(float)); if (seekRes == -1) { hdfs->CloseFile(fs, file); - throw runtime_error(StringFormat("Error: hdfsSeek failed with error. file offset: {}", + throw runtime_error(StringFormat("Error: hdfsSeek failed with error. 
file offset: %d", offset * embedSizeInfo.embeddingSize * sizeof(float))); } tSize res = hdfs->Read(fs, file, row.data(), embedSizeInfo.embeddingSize * sizeof(float)); if (res == -1) { hdfs->CloseFile(fs, file); - throw runtime_error(StringFormat("Error: An error occurred while reading file: {}.", filePath.c_str())); + throw runtime_error(StringFormat("Error: An error occurred while reading file: %s.", filePath.c_str())); } if (res != embedSizeInfo.embeddingSize * sizeof(float)) { hdfs->CloseFile(fs, file); - throw runtime_error(StringFormat("Error: Expected to read {} bytes, " - "but actually read {} bytes from file {}.", + throw runtime_error(StringFormat("Error: Expected to read %d bytes, " + "but actually read %d bytes from file %s.", embedSizeInfo.embeddingSize * sizeof(float), res, filePath.c_str())); } diff --git a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h index 6b9fe19c..b00913ff 100644 --- a/src/core/file_system/hdfs_file_system/hdfs_wrapper.h +++ b/src/core/file_system/hdfs_file_system/hdfs_wrapper.h @@ -140,11 +140,10 @@ namespace MxRec { throw runtime_error("Failed to obtain the pointer of the function hdfsRead from the libhdfs."); } - tSize reTryCount = 0; tSize unReadLength = length; tSize readBytes = 0; - while (unReadLength != 0 && reTryCount < RETRY_COUNT) { + while (unReadLength != 0) { tSize offset = (length - unReadLength) / sizeof(char); tSize res = hdfsRead(fs, file, buffer + offset, unReadLength); if (res == -1) { @@ -152,7 +151,6 @@ namespace MxRec { } unReadLength -= res; readBytes += res; - reTryCount++; } return readBytes; } @@ -163,11 +161,10 @@ namespace MxRec { throw runtime_error("Failed to obtain the pointer of the function hdfsRead from the libhdfs."); } - tSize reTryCount = 0; tSize unReadLength = length; tSize readBytes = 0; - while (unReadLength != 0 && reTryCount < RETRY_COUNT) { + while (unReadLength != 0) { tSize offset = (length - unReadLength) / sizeof(float); tSize res = hdfsRead(fs, file, buffer + offset, unReadLength); if (res == -1) { @@ -175,7 +172,6 @@ namespace MxRec { } unReadLength -= res; readBytes += res; - reTryCount++; } return readBytes; } @@ -185,11 +181,10 @@ namespace MxRec { if (hdfsWrite == nullptr) { throw runtime_error("Failed to obtain the pointer of the function hdfsWrite from the libhdfs."); } - tSize reTryCount = 0; tSize unWriteLength = length; tSize writeBytes = 0; - while (unWriteLength != 0 && reTryCount < RETRY_COUNT) { + while (unWriteLength != 0) { tSize offset = (length - unWriteLength) / sizeof(char); tSize res = hdfsWrite(fs, file, buffer + offset, unWriteLength); if (res == -1) { @@ -197,7 +192,6 @@ namespace MxRec { } unWriteLength -= res; writeBytes += res; - reTryCount++; } return writeBytes; } @@ -207,11 +201,10 @@ namespace MxRec { if (hdfsWrite == nullptr) { throw runtime_error("Failed to obtain the pointer of the function hdfsWrite from the libhdfs."); } - tSize reTryCount = 0; tSize unWriteLength = length; tSize writeBytes = 0; - while (unWriteLength != 0 && reTryCount < RETRY_COUNT) { + while (unWriteLength != 0) { tSize offset = (length - unWriteLength) / sizeof(float); tSize res = hdfsWrite(fs, file, buffer + offset, unWriteLength); if (res == -1) { @@ -219,7 +212,6 @@ namespace MxRec { } unWriteLength -= res; writeBytes += res; - reTryCount++; } return writeBytes; } diff --git a/src/core/file_system/local_file_system/local_file_system.cpp b/src/core/file_system/local_file_system/local_file_system.cpp index 6215d2ac..e9ddb8a4 100644 --- 
a/src/core/file_system/local_file_system/local_file_system.cpp +++ b/src/core/file_system/local_file_system/local_file_system.cpp @@ -38,13 +38,13 @@ void LocalFileSystem::CreateDir(const string& dirName) while (getline(input, tmp, '/')) { guard++; if (guard > maxDepth) { - throw runtime_error(StringFormat("create directory {} exceed max depth", dirName.c_str())); + throw runtime_error(StringFormat("create directory %s exceed max depth", dirName.c_str())); } ss << tmp << '/'; int ret = mkdir(ss.str().c_str(), dirMode); if (ret != 0 && errno != EEXIST) { LOG_ERROR("Unable to create directory: {} ret:{} error info: {}", dirName, ret, strerror(errno)); - throw runtime_error(StringFormat("create directory {} failed: {}", dirName.c_str(), strerror(errno))); + throw runtime_error(StringFormat("create directory %s failed: %s", dirName.c_str(), strerror(errno))); } } } diff --git a/src/core/ssd_engine/table.cpp b/src/core/ssd_engine/table.cpp index 592cce0e..9e48b0ef 100644 --- a/src/core/ssd_engine/table.cpp +++ b/src/core/ssd_engine/table.cpp @@ -137,7 +137,7 @@ void Table::Save(int step) SetTablePathToDiskWithSpace(); } catch (runtime_error &e) { metaFile.close(); - throw runtime_error(StringFormat("set table path to disk with space error:{}", e.what())); + throw runtime_error(StringFormat("set table path to disk with space error:%s", e.what())); } try { CreateTableDir(curTablePath); @@ -258,7 +258,7 @@ void Table::Load(const string &metaFilePath, int step) LoadDataFileSet(metaFile, step); } catch (exception &e) { metaFile->close(); - throw runtime_error(StringFormat("load data file set error:{}", e.what())); + throw runtime_error(StringFormat("load data file set error: %s", e.what())); } metaFile->close(); if (metaFile->fail()) { diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 4fdb7c8d..0013f27e 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -81,9 +81,6 @@ namespace MxRec { constexpr int GLOG_TIME_WIDTH_6 = 6; constexpr char GLOG_STAT_FLAG[] = "statOn"; - // for file system - constexpr int RETRY_COUNT = 100; - // unique related config constexpr int UNIQUE_BUCKET = 6; constexpr int MIN_UNIQUE_THREAD_NUM = 1; -- Gitee From 425454bcfb3fbb0ab57d2566833c5ab0ba80aacf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 13 Jun 2024 04:12:44 +0000 Subject: [PATCH 214/302] =?UTF-8?q?!182=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF?= =?UTF-8?q?=E6=8C=81=EF=BC=8Cbug=20fix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF?= =?UTF-8?q?=E6=8C=81=EF=BC=8Cbug=20fix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF?= =?UTF-8?q?=E6=8C=81=EF=BC=8Cbug=20fix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF?= =?UTF-8?q?=E6=8C=81=EF=BC=8Cbug=20fix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF?= =?UTF-8?q?=E6=8C=81=EF=BC=8Cbug=20fix=20*=20=E3=80=90=E4=BF=AE=E6=94=B9?= 
=?UTF-8?q?=E8=AF=B4=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E5=AF=B9=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF?= =?UTF-8?q?=E6=8C=81=EF=BC=8Cbug=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/core/embedding.py | 10 +++---- mx_rec/validator/emb_validator.py | 4 +-- src/AccCTR/src/embedding_cache/common.h | 1 + .../offset_mapper/mapper_base.h | 28 +++++++++++++++++-- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index 8c12eb4c..eaf0c759 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -32,6 +32,7 @@ from mx_rec.constants.constants import (MAX_INT32, All2allGradientsOp, MAX_VOCAB CacheModeEnum, DEFAULT_DEVICE_CACHE_MEMORY_SIZE, DEFAULT_HOST_CACHE_MEMORY_SIZE, DEFAULT_SSD_CACHE_MEMORY_SIZE) from mx_rec.graph.constants import AnchorIteratorOp +from mx_rec.util.communication.hccl_ops import get_rank_size from mx_rec.util.initialize import ConfigInitializer from mx_rec.validator.validator import ClassValidator, StringValidator, SSDFeatureValidator, \ para_checker_decorator, IntValidator, NumValidator, OptionValidator, OptionalIntValidator, \ @@ -233,12 +234,11 @@ def check_and_set_default_voc_size(voc_size_list: List[int], dim_bytes: int): if cache_mode == CacheModeEnum.DDR.value and voc_size_list[2] > 0: raise ValueError("cache mode DDR, ssd-voc is need to be none") if voc_size_list[0] == 1: - default_device_voc_size = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim_bytes) - voc_size_list[0] = default_device_voc_size if default_device_voc_size < MAX_VOCABULARY_SIZE \ - else MAX_VOCABULARY_SIZE + default_device_voc_size = int(DEFAULT_DEVICE_CACHE_MEMORY_SIZE / dim_bytes * get_rank_size()) # single rank 2GB + voc_size_list[0] = min(default_device_voc_size, MAX_DEVICE_VOCABULARY_SIZE) if (cache_mode == CacheModeEnum.DDR.value or cache_mode == CacheModeEnum.SSD.value) and voc_size_list[1] == 0: - default_host_voc_size = int(DEFAULT_HOST_CACHE_MEMORY_SIZE / dim_bytes) - voc_size_list[1] = default_host_voc_size if default_host_voc_size < MAX_VOCABULARY_SIZE else MAX_VOCABULARY_SIZE + default_host_voc_size = int(DEFAULT_HOST_CACHE_MEMORY_SIZE / dim_bytes) # total 40GB + voc_size_list[1] = min(default_host_voc_size, MAX_VOCABULARY_SIZE) if cache_mode == CacheModeEnum.SSD.value and voc_size_list[2] == 0: voc_size_list[2] = DEFAULT_SSD_CACHE_MEMORY_SIZE return diff --git a/mx_rec/validator/emb_validator.py b/mx_rec/validator/emb_validator.py index c9d18f05..0c7d7e81 100644 --- a/mx_rec/validator/emb_validator.py +++ b/mx_rec/validator/emb_validator.py @@ -57,8 +57,8 @@ def check_emb_lookup_params(table_params: dict, feature_spec: Union[tf.Tensor, F slice_device_vocabulary_size = table_params.get("slice_device_vocabulary_size") slice_host_vocabulary_size = table_params.get("slice_host_vocabulary_size") table_name = table_params.get("table_name") - if slice_host_vocabulary_size + slice_device_vocabulary_size > MAX_VOCABULARY_SIZE: - raise ValueError(f"Given device_vocabulary_size and host_vocabulary_size was too big for table " + if slice_host_vocabulary_size > MAX_VOCABULARY_SIZE: + raise ValueError(f"given host_vocabulary_size was too big for table " f"'{table_name}', in which slice_device_vocabulary_size was " f"{slice_device_vocabulary_size} and slice_host_vocabulary_size was " f"{slice_host_vocabulary_size}.") diff --git a/src/AccCTR/src/embedding_cache/common.h b/src/AccCTR/src/embedding_cache/common.h index 
72433332..d9841541 100644 --- a/src/AccCTR/src/embedding_cache/common.h +++ b/src/AccCTR/src/embedding_cache/common.h @@ -61,5 +61,6 @@ constexpr float CONSTANT_VALUE_MIN = -1e9; constexpr float INIT_K_MAX = 10000; constexpr float INIT_K_MIN = -10000; const int INVALID_EMB_SIZE = -1; +const size_t MEMSET_S_MAX_SIZE = 2LL * 1024 * 1024 * 1024 - 1; } #endif // MXREC_COMMON_H diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h index 969845ee..164daaab 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h @@ -313,10 +313,13 @@ public: } /* make physical page and set to zero */ - auto ret = memset_s(tmp, sizeof(NetHashBucket) * bucketCount, 0, sizeof(NetHashBucket) * bucketCount); + auto ret = SafeMemset(tmp, 0, sizeof(NetHashBucket) * bucketCount); if (ret != 0) { - ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, - "memset_s failed... size: " + std::to_string(sizeof(NetHashBucket) * bucketCount)); + delete[] tmp; + tmp = nullptr; + FreeSubMaps(); + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "memset_s failed... size: " + + std::to_string(sizeof(NetHashBucket) * bucketCount) + ", error code:" + std::to_string(ret)); return false; } @@ -693,6 +696,25 @@ private: } } + /* + * Description: SECUREC_MEM_MAX_LEN of memset_s function is 2GB + * Parameter: dest - destination address + * Parameter: c - the value to be copied + * Parameter: count - copies count bytes of value to dest + */ + int SafeMemset(void* dest, int c, size_t count) + { + char* destBytePtr = reinterpret_cast(dest); + for (size_t i = 0; i < count; i += MEMSET_S_MAX_SIZE) { + size_t bytesOnceSet = (i + MEMSET_S_MAX_SIZE <= count) ? 
MEMSET_S_MAX_SIZE : (count - i); + auto ret = memset_s(destBytePtr + i, bytesOnceSet, c, bytesOnceSet); + if (ret != 0) { + return ret; + } + } + return 0; + } + void FreeOverFlowedEntries() { for (auto &mSubMap : mSubMaps) { -- Gitee From 7b0cfa94f50de57e7c58299797e3e6f9955680fd Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Thu, 13 Jun 2024 03:42:15 +0000 Subject: [PATCH 215/302] cleancode --- src/core/l3_storage/cache_manager.cpp | 4 ++-- src/core/l3_storage/l3_storage.cpp | 2 ++ src/core/l3_storage/l3_storage.h | 27 +++++++++++++-------------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/core/l3_storage/cache_manager.cpp b/src/core/l3_storage/cache_manager.cpp index 188f2aaf..75d73b2d 100644 --- a/src/core/l3_storage/cache_manager.cpp +++ b/src/core/l3_storage/cache_manager.cpp @@ -108,12 +108,12 @@ void CacheManager::CreateL3StorageTableIfNotExist(const std::string& embTableNam l3Storage->CreateTable(embTableName, embBaseInfos[embTableName].savePath, embBaseInfos[embTableName].maxTableSize); embBaseInfos[embTableName].isExist = true; - LOG_INFO("create l3Storage table end, embTableName:" + embTableName); + LOG_INFO("create l3Storage table end, embTableName:{}", embTableName); return; } // 续训场景:embBaseInfos 没有保存,不会初始化;L3Storage表会初始化,此时表已存在 embBaseInfos[embTableName].isExist = true; - LOG_INFO("l3Storage table is exist, embTableName:" + embTableName); + LOG_INFO("l3Storage table is exist, embTableName:{}", embTableName); } CacheManager::~CacheManager() diff --git a/src/core/l3_storage/l3_storage.cpp b/src/core/l3_storage/l3_storage.cpp index 6a3ea668..cc26d8a4 100644 --- a/src/core/l3_storage/l3_storage.cpp +++ b/src/core/l3_storage/l3_storage.cpp @@ -17,6 +17,8 @@ See the License for the specific language governing permissions and using MxRec::L3Storage; using MxRec::emb_cache_key_t; +using std::vector; +using std::string; L3Storage::L3Storage() {} diff --git a/src/core/l3_storage/l3_storage.h b/src/core/l3_storage/l3_storage.h index 6462409f..5f7270c1 100644 --- a/src/core/l3_storage/l3_storage.h +++ b/src/core/l3_storage/l3_storage.h @@ -21,9 +21,6 @@ See the License for the specific language governing permissions and #include "utils/common.h" -using std::string; -using std::vector; - namespace MxRec { class L3Storage { @@ -31,32 +28,34 @@ public: L3Storage(); virtual ~L3Storage(); - virtual bool IsTableExist(const string& tableName); + virtual bool IsTableExist(const std::string& tableName); - virtual bool IsKeyExist(const string& tableName, emb_cache_key_t key); + virtual bool IsKeyExist(const std::string& tableName, emb_cache_key_t key); - virtual void CreateTable(const string& tableName, vector savePaths, uint64_t maxTableSize); + virtual void CreateTable(const std::string& tableName, std::vector savePaths, uint64_t maxTableSize); - virtual int64_t GetTableAvailableSpace(const string& tableName); + virtual int64_t GetTableAvailableSpace(const std::string& tableName); - virtual void InsertEmbeddingsByAddr(const string& tableName, vector& keys, - vector& embeddingsAddr, uint64_t extEmbeddingSize); + virtual void InsertEmbeddingsByAddr(const std::string& tableName, std::vector& keys, + std::vector& embeddingsAddr, uint64_t extEmbeddingSize); - virtual void DeleteEmbeddings(const string& tableName, vector& keys); + virtual void DeleteEmbeddings(const std::string& tableName, std::vector& keys); - virtual vector> FetchEmbeddings(const string& tableName, vector& keys); + virtual std::vector> FetchEmbeddings(const std::string& tableName, + std::vector& keys); 
virtual void Save(int step); - virtual void Load(const string& tableName, vector savePaths, uint64_t maxTableSize, int step); + virtual void Load(const std::string& tableName, std::vector savePaths, uint64_t maxTableSize, + int step); virtual void Start(); virtual void Stop(); - virtual int64_t GetTableUsage(const string& tableName); + virtual int64_t GetTableUsage(const std::string& tableName); - virtual vector>> ExportTableKey(); + virtual std::vector>> ExportTableKey(); }; } // namespace MxRec #endif // MX_REC_L3_STORAGE_H \ No newline at end of file -- Gitee From df4494d014cb117a6e31cdbfd5e185984df65a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Fri, 14 Jun 2024 16:18:42 +0800 Subject: [PATCH 216/302] =?UTF-8?q?=E9=97=AE=E9=A2=98=E5=8D=95=E5=8F=B7?= =?UTF-8?q?=E3=80=90DTS2024061404223=E3=80=91=EF=BC=9AWideDeep=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E7=B2=BE=E5=BA=A6=E5=8A=A3=E5=8C=96=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/optimizers/lazy_adam.py | 2 +- mx_rec/optimizers/lazy_adam_by_addr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mx_rec/optimizers/lazy_adam.py b/mx_rec/optimizers/lazy_adam.py index 0684a715..ac88afc9 100644 --- a/mx_rec/optimizers/lazy_adam.py +++ b/mx_rec/optimizers/lazy_adam.py @@ -170,7 +170,7 @@ class CustomizedLazyAdam(adam.AdamOptimizer, CustomizedOptimizer): v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad) v_update_op = scatter_nd_add(velocity, nd_indices, v_t_slice - old_v_slice) - denominator_slice = math_ops.sqrt(v_t_slice + temp_epsilon) + denominator_slice = math_ops.sqrt(tf.abs(v_t_slice)) + temp_epsilon var_update_op = scatter_nd_add(var, nd_indices, tf.divide(-learning_rate * m_t_slice, denominator_slice)) return control_flow_ops.group(m_update_op, v_update_op, var_update_op) diff --git a/mx_rec/optimizers/lazy_adam_by_addr.py b/mx_rec/optimizers/lazy_adam_by_addr.py index b7887052..1d5aacd2 100644 --- a/mx_rec/optimizers/lazy_adam_by_addr.py +++ b/mx_rec/optimizers/lazy_adam_by_addr.py @@ -136,7 +136,7 @@ class CustomizedLazyAdamByAddress(adam.AdamOptimizer, CustomizedOptimizer): old_v_slice = split_tensors[2] v_t_slice = temp_b2 * old_v_slice + (1 - temp_b2) * math_ops.square(grad) - denominator_slice = math_ops.sqrt(v_t_slice + temp_epsilon) + denominator_slice = math_ops.sqrt(tf.abs(v_t_slice)) + temp_epsilon update_list = [tf.divide(-learning_rate * m_t_slice, denominator_slice)] + [m_t_slice - old_m_slice] + \ [v_t_slice - old_v_slice] update_tensor = tf.concat(update_list, axis=1) -- Gitee From 7f0bd73f37d23788ca5b45aec25238feb7c0fc70 Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Mon, 17 Jun 2024 07:44:31 +0000 Subject: [PATCH 217/302] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=EF=BC=88embCache?= =?UTF-8?q?=EF=BC=89=EF=BC=9Aeval=20swapInPos=E4=B8=8Eh2dEmb=E4=B8=8D?= =?UTF-8?q?=E5=8C=B9=E9=85=8D=E5=AF=BC=E8=87=B4=E7=9A=84=E8=B6=8A=E7=95=8C?= =?UTF-8?q?=E6=88=96=E7=B2=BE=E5=BA=A6=E5=BC=82=E5=B8=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 895715c9..409a0e92 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -709,19 +709,19 @@ void HybridMgmt::ProcessEmbInfoDDR(const EmbBaseInfo& 
info, bool& remainBatchOut SendGlobalUniqueVec(info, uniqueKeys, restoreVecSec); + TimeCost swapProcessTC; + auto &swapInPos = swapInKoPair.second; + auto &swapOutPos = swapOutKoPair.second; + auto lastSwapInPos = lastSwapInPosMap[info.name]; + lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 + auto isNeedReturn = HandleSpecialProcessStatusDDR(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); if (isNeedReturn) { return; } - TimeCost swapProcessTC; EnqueueSwapInfo(info, swapInKoPair, swapOutKoPair); - auto &swapInPos = swapInKoPair.second; - auto &swapOutPos = swapOutKoPair.second; - auto lastSwapInPos = lastSwapInPosMap[info.name]; - lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 - // 下发swaptensor if (info.batchId != 0) { SendTensorForSwap(info, lastSwapInPos, swapOutPos); @@ -1217,22 +1217,21 @@ void HybridMgmt::ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBa SendGlobalUniqueVec(info, uniqueKeys, restoreVecSec); - auto isNeedReturn = HandleSpecialProcessStatusL3Storage(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); - if (isNeedReturn) { - return; - } - TimeCost swapProcessTC; auto &swapInKeys = swapInKoPair.first; auto &swapInPos = swapInKoPair.second; auto &swapOutKeys = swapOutKoPair.first; auto &swapOutPos = swapOutKoPair.second; - - HandleDataSwapForL3Storage(info, swapInKeys, swapOutKeys); - auto lastSwapInPos = lastSwapInPosMap[info.name]; lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 + auto isNeedReturn = HandleSpecialProcessStatusL3Storage(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); + if (isNeedReturn) { + return; + } + + HandleDataSwapForL3Storage(info, swapInKeys, swapOutKeys); + // 下发swaptensor if (info.batchId != 0) { SendTensorForSwap(info, lastSwapInPos, swapOutPos); -- Gitee From 6804a8e59a945573e0c9901481eae63b9d405b6c Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Mon, 17 Jun 2024 04:32:14 +0000 Subject: [PATCH 218/302] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=EF=BC=88=E4=BF=9D?= =?UTF-8?q?=E5=AD=98=EF=BC=89=EF=BC=9Aestimator=E4=BF=9D=E5=AD=98=E6=AD=A5?= =?UTF-8?q?=E6=95=B0=E6=AD=A3=E5=88=99=E5=8C=B9=E9=85=8D=E6=97=A0=E6=95=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 9 ++++++--- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 +- src/core/utils/common.h | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index f7c6b9a2..9e0e1d29 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -36,6 +36,9 @@ from mx_rec.optimizers.base import CustomizedOptimizer from mx_rec.util.tf_version_adapter import npu_ops +SAVE_SPARSE_PATH_PREFIX = "sparse" + + # define save model thread class SaveModelThread(threading.Thread): def __init__(self, saver, sess, result, root_dir, table_name): @@ -128,9 +131,9 @@ class Saver(object): if global_step: if not isinstance(global_step, compat.integral_types): global_step = int(sess.run(global_step)) - ckpt_name = f"sparse-{base_name}-{global_step}" + ckpt_name = f"{SAVE_SPARSE_PATH_PREFIX}-{base_name}-{global_step}" else: - ckpt_name = f"sparse-{base_name}" + ckpt_name = f"{SAVE_SPARSE_PATH_PREFIX}-{base_name}" saving_path = os.path.join(directory, ckpt_name) self.config_instance.train_params_config.sparse_dir = saving_path @@ -185,7 +188,7 @@ class Saver(object): "only local file system and hdfs file system supported. 
") directory, base_name = os.path.split(reading_path) - ckpt_name = f"sparse-{base_name}" + ckpt_name = f"{SAVE_SPARSE_PATH_PREFIX}-{base_name}" reading_path = os.path.join(directory, ckpt_name) if not tf.io.gfile.exists(reading_path): diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 895715c9..c3b9cbec 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -821,7 +821,7 @@ void HybridMgmt::EvictL3StorageKeys(const string& embName, const vector(1. / 3); // hot emb cache percent const string COMBINE_HISTORY_NAME = "combine_table_history"; + const string SAVE_SPARSE_PATH_PREFIX = "sparse"; using emb_key_t = int64_t; using emb_cache_key_t = uint64_t; -- Gitee From 508b62839c5c065e4e5afb5f3bbb447231f3ec4b Mon Sep 17 00:00:00 2001 From: liangrenhao Date: Wed, 19 Jun 2024 11:31:41 +0800 Subject: [PATCH 219/302] add Infer Reference Cases Signed-off-by: liangrenhao --- examples/rec_infer/README.md | 134 +++++++++++++ examples/rec_infer/client.py | 76 ++++++++ examples/rec_infer/client.sh | 4 + examples/rec_infer/input_config.py | 178 ++++++++++++++++++ ...-Performance-optimization-referrence.patch | 72 +++++++ examples/rec_infer/optimize/README.md | 51 +++++ examples/rec_infer/server.sh | 6 + tools/graph_partition/gen_config.py | 54 ++++++ tools/graph_partition/graph_partition.py | 116 ++++++++++++ tools/graph_partition/template.cfg | 57 ++++++ 10 files changed, 748 insertions(+) create mode 100644 examples/rec_infer/README.md create mode 100644 examples/rec_infer/client.py create mode 100644 examples/rec_infer/client.sh create mode 100644 examples/rec_infer/input_config.py create mode 100644 examples/rec_infer/optimize/0001-Performance-optimization-referrence.patch create mode 100644 examples/rec_infer/optimize/README.md create mode 100644 examples/rec_infer/server.sh create mode 100644 tools/graph_partition/gen_config.py create mode 100644 tools/graph_partition/graph_partition.py create mode 100644 tools/graph_partition/template.cfg diff --git a/examples/rec_infer/README.md b/examples/rec_infer/README.md new file mode 100644 index 00000000..573ecafc --- /dev/null +++ b/examples/rec_infer/README.md @@ -0,0 +1,134 @@ +# 推理环境部署 +一、安装依赖包:

+安装开发套件包Ascend-cann-toolkit_{version}_linux-{arch}.run

+安装框架插件包Ascend-cann-tfplugin_{version}_linux-{arch}.run

+安装其他依赖包:

+|依赖包 | 版本限制| +|:---|:---:| +|gcc,g++|8.4及以上版本| +|zip,unzip,libtool,automake|无特定版本要求| +|python|3.7.5| +|TensorFlow| 1.15.0| +|tensorflow-serving-api|1.15.0| +|future|无特定版本要求| +|bazel|0.24.1| +|camake|3.14.0| +|swig|若操作系统为"aarch64",软件安装版本需大于或等于3.0.12。若操作系统架构为"X86_64",软件安装版本需大于或等于4.0.1| +|java|jdk-11| +||| + +二、编译serving +1. 下载TF-serving源码:https://github.com/tensorflow/serving/archive/1.15.0.zip +2. 解压后进入源码目录 +3. 添加TF-serving第三方依赖 + +a)执行如下命令,在“serving-1.15.0/third_party”目录下创建“tf_adapter”文件夹并进入。 +>cd third_party/
+mkdir tf_adapter
+cd tf_adapter
+b) Run the following commands to copy the "libpython3.7m.so.1.0" file into the "tf_adapter" directory and create a symlink.
+> cp /usr/local/python3.7.5/lib/libpython3.7m.so.1.0 .
+ln -s libpython3.7m.so.1.0 libpython3.7m.so
+
+c) Run the following commands to copy "_tf_adapter.so" into the "tf_adapter" directory and rename it to "lib_tf_adapter.so".
+>cp /home/HwHiAiUser/Ascend/tfplugin/latest/python/site-packages/npu_bridge/_tf_adapter.so .
+mv _tf_adapter.so lib_tf_adapter.so
+
+4. Build empty stub libtensorflow_framework.so and _pywrap_tensorflow_internal.so files.
+
+a) In the "tf_adapter" directory, run:
+>vim CMakeLists.txt
+
+b) Write the following content into it:
+```cmake
+file(TOUCH ${CMAKE_CURRENT_BINARY_DIR}/stub.c)
+add_library(_pywrap_tensorflow_internal SHARED ${CMAKE_CURRENT_BINARY_DIR}/stub.c)
+add_library(tensorflow_framework SHARED ${CMAKE_CURRENT_BINARY_DIR}/stub.c)
+```
+
+c) Run `:wq!` to save the file and exit.
+d) Run the following commands to build the empty .so files:
+> mkdir temp
+cd temp
+cmake ..
+make
+mv lib_pywrap_tensorflow_internal.so ../_pywrap_tensorflow_internal.so
+mv libtensorflow_framework.so ../libtensorflow_framework.so
+cd ..
+ln -s libtensorflow_framework.so libtensorflow_framework.so.1
+
+e) Configure the environment:
+```shell
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd)
+```
+
+5. Create a BUILD file in the "tf_adapter" directory and write the following content into it:
+```text
+licenses(["notice"])  # BSD/MIT.
+
+cc_import(
+    name = "tf_adapter",
+    shared_library = "lib_tf_adapter.so",
+    visibility = ["//visibility:public"]
+)
+
+cc_import(
+    name = "tf_python",
+    shared_library = "libpython3.7m.so",
+    visibility = ["//visibility:public"]
+)
+```
+
+6. Edit the BUILD file under "serving-1.15.0/tensorflow_serving/model_servers/" and add the lines marked with `__` (shown in bold in the original document) to the "cc_binary" target:
+
+```text
+cc_binary(
+    name = "tensorflow_model_server",
+    stamp = 1,
+    visibility = [
+        ":testing",
+        "//tensorflow_serving:internal",
+    ],
+    deps = [
+        ":tensorflow_model_server_main_lib",
+        __"//third_party/tf_adapter:tf_adapter",__
+        __"//third_party/tf_adapter:tf_python",__
+        __"@org_tensorflow//tensorflow/compiler/jit:xla_cpu_jit",__
+    ],
+)
+```
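+
+For reference, with the markdown bold markers (`__`) removed, the resulting deps list should read:
+
+```text
+deps = [
+    ":tensorflow_model_server_main_lib",
+    "//third_party/tf_adapter:tf_adapter",
+    "//third_party/tf_adapter:tf_python",
+    "@org_tensorflow//tensorflow/compiler/jit:xla_cpu_jit",
+],
+```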
+
+7. Build TF Serving. In the TF Serving source directory "serving-1.15.0", run:
+
+> bazel --output_user_root=/opt/tf_serving build -c opt --distdir=../depends --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" tensorflow_serving/model_servers:tensorflow_model_server
+
+If dependency downloads fail during the build, the TF Serving build dependencies can be downloaded manually (https://www.hiascend.com/document/detail/zh/canncommercial/80RC1/developmentguide/moddevg/onlineinfer1/atlastfserv_26_0011.html).
+
+8. Create a symlink:
+> ln -s /opt/tf_serving/{tf_serving_ID}/execroot/tf_serving/bazel-out/xxx-opt/bin/tensorflow_serving/model_servers/tensorflow_model_server /usr/local/bin/tensorflow_model_server
+
++ {tf_serving_ID} is a random-looking string such as "063944eceea3e72745362a0b6eb12a3c"; substitute the actual value.
++ xxx-opt is a directory generated automatically by the build tool; use whatever name actually appears.
+
+# Script overview
+server.sh/client.sh
+The service launch script and the client request script.
+
+1. Starting the TF Serving server
+Enter the tf_serving_inerence directory, then follow the steps below (a sketch of the launch command follows the list):
+> Change the model path model_base_path in server.sh to the exported SavedModel path,
+> add the tf_adapter third-party dependency used when building tf_serving to the library path: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/xxx/xxx/serving-1.15.0/third_party/tf_adapter/,
+> source /usr/local/Ascend/ascend-toolkit/set_env.sh
+> sh server.sh
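+
+As a reference, server.sh typically wraps a launch command of the following shape; the port, model name, paths, and config file here are placeholders matching client.py's defaults, not the shipped script:
+
+```shell
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+tensorflow_model_server \
+    --port=9999 \
+    --model_name=saved_model \
+    --model_base_path=/path/to/exported/savedmodel \
+    --platform_config_file=template.cfg
+```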
+
+If the log shows "Running gRPC ModelServer at 0.0.0.0:xxxx", the server started successfully.
+2. Sending requests to the server
+Run the script: sh client.sh
+A successful inference prints the end-to-end latency.
+
+# Using the graph partition tool
+1. Enter the graph_partition directory and set the model directory in gen_config.py.
+2. Run python3 gen_config.py and launch the model with the generated test1.cfg file; usage:
+> python3 gen_config.py --output_path . --output_filename test1.cfg --model_path savedmodel_path
++ Parameters: output_path (output directory), output_filename (output file name), model_path (input model path)
++ Once the output file is generated, point the --platform_config_file option of the service launch script at it for the configuration to take effect.
+
+# Performance optimization
+1. See the files under the optimize directory.
\ No newline at end of file
diff --git a/examples/rec_infer/client.py b/examples/rec_infer/client.py
new file mode 100644
index 00000000..62a15882
--- /dev/null
+++ b/examples/rec_infer/client.py
@@ -0,0 +1,79 @@
+import time
+
+import grpc
+import numpy as np
+
+import tensorflow as tf
+from input_config import config
+from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc
+
+
+class PredictModelGrpc():
+    def __init__(
+        self,
+        model_name,
+        inputs,
+        input_types,
+        output_name,
+        socket="xxx.xxx.xxx.xxx:8500",
+    ):
+        self.socket = socket
+        self.model_name = model_name
+        self.inputs = inputs
+        self.input_types = input_types
+        self.output_name = output_name
+        self.request, self.stub = self.__get_request()
+
+    def inference(self):
+        for name in self.inputs:
+            self.request.inputs[name].CopyFrom(
+                tf.make_tensor_proto(self.inputs[name], dtype=self.input_types[name])
+            )
+
+        # time 100 requests and report the average end-to-end latency
+        start = time.time()
+        for _ in range(100):
+            result = self.stub.Predict.future(self.request, 1000.0)
+            result.result()
+        elapsed = time.time() - start
+        print("average end-to-end latency: %.2f ms" % (elapsed / 100 * 1000))
+
+    def __get_request(self):
+        channel = grpc.insecure_channel(
+            self.socket,
+            options=[
+                ("grpc.max_send_message_length", 1024 * 1024 * 1024),
+                ("grpc.max_receive_message_length", 1024 * 1024 * 1024),
+            ],
+        )
+        stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
+        request = predict_pb2.PredictRequest()
+        request.model_spec.name = self.model_name
+        request.model_spec.signature_name = "serving_default"
+
+        return request, stub
+
+
+def gen_inputs():
+    inputs = {}
+    input_types = {}
+    for name in config:
+        input_types[name] = config[name]["dtype"]
+        if config[name]["dtype"] == tf.int32:
+            inputs[name] = np.random.randint(0, 100, size=config[name]["shape"])
+        elif config[name]["dtype"] == tf.float32:
+            inputs[name] = np.random.randint(0, 2, size=config[name]["shape"]) * 1.0
+    return inputs, input_types
+
+
+if __name__ == "__main__":
+    input_datas, types = gen_inputs()
+    model = PredictModelGrpc(
+        model_name="saved_model",
+        inputs=input_datas,
+        input_types=types,
+        output_name="",
+        socket="127.0.0.1:9999",
+    )
+
+    model.inference()
diff --git a/examples/rec_infer/client.sh b/examples/rec_infer/client.sh
new file mode 100644
index 00000000..fa968858
--- /dev/null
+++ b/examples/rec_infer/client.sh
@@ -0,0 +1,4 @@
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+unset http_proxy
+unset https_proxy
+python3 client.py
\ No newline at end of file
diff --git a/examples/rec_infer/input_config.py b/examples/rec_infer/input_config.py
new file mode 100644
index 00000000..24e28f03
--- /dev/null
+++ b/examples/rec_infer/input_config.py
@@ -0,0 +1,178 @@
+import tensorflow as tf
+
+BATCH_SIZE = 9600
+config = {
+    "feat_0": {"dtype": tf.float32, "shape": [BATCH_SIZE, 40], "name": "feat_0"},
+    "feat_1": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_1"},
+    "feat_2": {"dtype": tf.float32, "shape": [BATCH_SIZE, 40], "name": "feat_2"},
+    "feat_3": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_3"},
+    "feat_4": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_4"},
+    "feat_5": {"dtype": tf.float32, "shape": [BATCH_SIZE, 32], "name": "feat_5"},
+    "feat_6": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_6"},
+    "feat_7": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_7"},
+    "feat_8": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_8"},
+    "feat_9": {"dtype": tf.int32, "shape": [BATCH_SIZE, 16], "name": "feat_9"},
+    "feat_10": {"dtype": 
tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_10"}, + "feat_11": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_11"}, + "feat_12": {"dtype": tf.float32, "shape": [BATCH_SIZE, 480], "name": "feat_12"}, + "feat_13": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_13"}, + "feat_14": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_14"}, + "feat_15": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_15"}, + "feat_16": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_16"}, + "feat_17": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_17"}, + "feat_18": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_18"}, + "feat_19": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_19"}, + "feat_20": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_20"}, + "feat_21": {"dtype": tf.float32, "shape": [BATCH_SIZE, 32], "name": "feat_21"}, + "feat_22": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_22"}, + "feat_23": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_23"}, + "feat_24": {"dtype": tf.int32, "shape": [BATCH_SIZE, 10], "name": "feat_24"}, + "feat_25": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_25"}, + "feat_26": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_26"}, + "feat_27": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_27"}, + "feat_28": {"dtype": tf.int32, "shape": [BATCH_SIZE, 36], "name": "feat_28"}, + "feat_29": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_29"}, + "feat_30": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_30"}, + "feat_31": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_31"}, + "feat_32": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_32"}, + "feat_33": {"dtype": tf.float32, "shape": [BATCH_SIZE, 256], "name": "feat_33"}, + "feat_34": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_34"}, + "feat_35": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_35"}, + "feat_36": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_36"}, + "feat_37": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_37"}, + "feat_38": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_38"}, + "feat_39": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_39"}, + "feat_40": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_40"}, + "feat_41": {"dtype": tf.float32, "shape": [BATCH_SIZE, 32], "name": "feat_41"}, + "feat_42": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_42"}, + "feat_43": {"dtype": tf.float32, "shape": [BATCH_SIZE, 40], "name": "feat_43"}, + "feat_44": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_44"}, + "feat_45": {"dtype": tf.int32, "shape": [BATCH_SIZE, 7], "name": "feat_45"}, + "feat_46": {"dtype": tf.int32, "shape": [BATCH_SIZE, 4], "name": "feat_46"}, + "feat_47": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_47"}, + "feat_48": {"dtype": tf.int32, "shape": [BATCH_SIZE, 4], "name": "feat_48"}, + "feat_49": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_49"}, + "feat_50": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_50"}, + "feat_51": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_51"}, + "feat_52": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_52"}, + "feat_53": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_53"}, + "feat_54": {"dtype": tf.int32, "shape": [BATCH_SIZE, 100], "name": 
"feat_54"}, + "feat_55": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_55"}, + "feat_56": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_56"}, + "feat_57": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_57"}, + "feat_58": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_58"}, + "feat_59": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_59"}, + "feat_60": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_60"}, + "feat_61": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_61"}, + "feat_62": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_62"}, + "feat_63": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_63"}, + "feat_64": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_64"}, + "feat_65": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_65"}, + "feat_66": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_66"}, + "feat_67": {"dtype": tf.float32, "shape": [BATCH_SIZE, 192], "name": "feat_67"}, + "feat_68": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_68"}, + "feat_69": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_69"}, + "feat_70": {"dtype": tf.float32, "shape": [BATCH_SIZE, 6, 32], "name": "feat_70"}, + "feat_71": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_71"}, + "feat_72": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_72"}, + "feat_73": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_73"}, + "feat_74": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_74"}, + "feat_75": {"dtype": tf.int32, "shape": [BATCH_SIZE, 10], "name": "feat_75"}, + "feat_76": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_76"}, + "feat_77": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_77"}, + "feat_78": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_78"}, + "feat_79": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_79"}, + "feat_80": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_80"}, + "feat_81": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_81"}, + "feat_82": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_82"}, + "feat_83": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_83"}, + "feat_84": {"dtype": tf.float32, "shape": [BATCH_SIZE, 32], "name": "feat_84"}, + "feat_85": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_85"}, + "feat_86": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_86"}, + "feat_87": {"dtype": tf.float32, "shape": [BATCH_SIZE, 40], "name": "feat_87"}, + "feat_88": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_88"}, + "feat_89": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_89"}, + "feat_90": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_90"}, + "feat_91": {"dtype": tf.float32, "shape": [BATCH_SIZE, 40], "name": "feat_91"}, + "feat_92": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_92"}, + "feat_93": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_93"}, + "feat_94": {"dtype": tf.int32, "shape": [BATCH_SIZE, 36], "name": "feat_94"}, + "feat_95": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_95"}, + "feat_96": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_96"}, + "feat_97": {"dtype": tf.float32, "shape": [BATCH_SIZE, 320], "name": "feat_97"}, + "feat_98": {"dtype": tf.float32, "shape": [BATCH_SIZE, 1], "name": "feat_98"}, + "feat_99": 
{"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_99"}, + "feat_100": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_100"}, + "feat_101": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_101"}, + "feat_102": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_102"}, + "feat_103": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_103"}, + "feat_104": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_104"}, + "feat_105": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_105"}, + "feat_106": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_106"}, + "feat_107": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_107"}, + "feat_108": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_108"}, + "feat_109": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_109"}, + "feat_110": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_110"}, + "feat_111": {"dtype": tf.int32, "shape": [BATCH_SIZE, 36], "name": "feat_111"}, + "feat_112": {"dtype": tf.int32, "shape": [BATCH_SIZE, 10], "name": "feat_112"}, + "feat_113": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_113"}, + "feat_114": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_114"}, + "feat_115": {"dtype": tf.float32, "shape": [BATCH_SIZE, 60], "name": "feat_115"}, + "feat_116": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_116"}, + "feat_117": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_117"}, + "feat_118": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_118"}, + "feat_119": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_119"}, + "feat_120": {"dtype": tf.int32, "shape": [BATCH_SIZE, 13], "name": "feat_120"}, + "feat_121": {"dtype": tf.int32, "shape": [BATCH_SIZE, 3], "name": "feat_121"}, + "feat_122": {"dtype": tf.int32, "shape": [BATCH_SIZE, 9], "name": "feat_122"}, + "feat_123": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_123"}, + "feat_124": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_124"}, + "feat_125": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_125"}, + "feat_126": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_126"}, + "feat_127": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_127"}, + "feat_128": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_128"}, + "feat_129": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_129"}, + "feat_130": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_130"}, + "feat_131": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_131"}, + "feat_132": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_132"}, + "feat_133": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_133"}, + "feat_134": {"dtype": tf.int32, "shape": [BATCH_SIZE, 10], "name": "feat_134"}, + "feat_135": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_135"}, + "feat_136": {"dtype": tf.int32, "shape": [BATCH_SIZE, 33], "name": "feat_136"}, + "feat_137": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_137"}, + "feat_138": {"dtype": tf.int32, "shape": [BATCH_SIZE, 36], "name": "feat_138"}, + "feat_139": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_139"}, + "feat_140": {"dtype": tf.float32, "shape": [BATCH_SIZE, 40], "name": "feat_140"}, + "feat_141": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_141"}, + "feat_142": {"dtype": tf.int32, "shape": [BATCH_SIZE, 26], 
"name": "feat_142"}, + "feat_143": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_143"}, + "feat_144": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_144"}, + "feat_145": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_145"}, + "feat_146": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_146"}, + "feat_147": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_147"}, + "feat_148": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_148"}, + "feat_149": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_149"}, + "feat_150": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_150"}, + "feat_151": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_151"}, + "feat_152": {"dtype": tf.float32, "shape": [BATCH_SIZE, 7], "name": "feat_152"}, + "feat_153": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_153"}, + "feat_154": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_154"}, + "feat_155": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_155"}, + "feat_156": {"dtype": tf.float32, "shape": [BATCH_SIZE, 8], "name": "feat_156"}, + "feat_157": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_157"}, + "feat_158": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_158"}, + "feat_159": {"dtype": tf.int32, "shape": [BATCH_SIZE, 8], "name": "feat_159"}, + "feat_160": {"dtype": tf.float32, "shape": [BATCH_SIZE, 40], "name": "feat_160"}, + "feat_161": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_161"}, + "feat_162": {"dtype": tf.int32, "shape": [BATCH_SIZE, 36], "name": "feat_162"}, + "feat_163": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_163"}, + "feat_164": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_164"}, + "feat_165": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_165"}, + "feat_166": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_166"}, + "feat_167": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_167"}, + "feat_168": {"dtype": tf.int32, "shape": [BATCH_SIZE, 6], "name": "feat_168"}, + "feat_169": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_169"}, + "feat_170": {"dtype": tf.int32, "shape": [BATCH_SIZE, 40], "name": "feat_170"}, + "feat_172": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_172"}, + "feat_173": {"dtype": tf.int32, "shape": [BATCH_SIZE, 1], "name": "feat_173"}, +} diff --git a/examples/rec_infer/optimize/0001-Performance-optimization-referrence.patch b/examples/rec_infer/optimize/0001-Performance-optimization-referrence.patch new file mode 100644 index 00000000..a3576055 --- /dev/null +++ b/examples/rec_infer/optimize/0001-Performance-optimization-referrence.patch @@ -0,0 +1,72 @@ +--- + tensorflow_serving/model_servers/BUILD | 1 + + tensorflow_serving/model_servers/main.cc | 8 +++++++- + tensorflow_serving/model_servers/server.cc | 5 +++++ + tensorflow_serving/model_servers/server.h | 6 +++++- + 4 files changed, 18 insertions(+), 2 deletions(-) + +diff --git a/tensorflow_serving/model_servers/BUILD b/tensorflow_serving/model_servers/BUILD +index f60f3d7..e74a514 100644 +--- a/tensorflow_serving/model_servers/BUILD ++++ b/tensorflow_serving/model_servers/BUILD +@@ -373,6 +373,7 @@ cc_binary( + deps = [ + ":tensorflow_model_server_main_lib", + ], ++ linkops = ["-L/usr/local/lib -lstringlib", "-L/usr/local/lib -ljemalloc"] + ) + + py_test( +diff --git a/tensorflow_serving/model_servers/main.cc 
b/tensorflow_serving/model_servers/main.cc +index 2b83500..3a055d0 100644 +--- a/tensorflow_serving/model_servers/main.cc ++++ b/tensorflow_serving/model_servers/main.cc +@@ -192,7 +192,13 @@ int main(int argc, char** argv) { + "EXPERIMENTAL; CAN BE REMOVED ANYTIME! Load and use " + "TensorFlow Lite model from `model.tflite` file in " + "SavedModel directory instead of the TensorFlow model " +- "from `saved_model.pb` file.")}; ++ "from `saved_model.pb` file."), ++ tensorflow::Flag("set_SyncServerOption_flag", &options.set_SyncServerOption_flag, ++ "if true, the server will configure SyncServerOption"), ++ tensorflow::Flag("NUM_CQS", &options.NUM_CQS, "config NUM_CQS"), ++ tensorflow::Flag("MIN_POLLERS", &options.MIN_POLLERS, "config MIN_POLLERS"), ++ tensorflow::Flag("MAX_POLLERS", &options.MAX_POLLERS, "config MAX_POLLERS"), ++ }; + + const auto& usage = tensorflow::Flags::Usage(argv[0], flag_list); + if (!tensorflow::Flags::Parse(&argc, argv, flag_list)) { +diff --git a/tensorflow_serving/model_servers/server.cc b/tensorflow_serving/model_servers/server.cc +index 9808f9a..b5df129 100644 +--- a/tensorflow_serving/model_servers/server.cc ++++ b/tensorflow_serving/model_servers/server.cc +@@ -330,6 +330,11 @@ Status Server::BuildAndStart(const Options& server_options) { + BuildServerCredentialsFromSSLConfigFile( + server_options.ssl_config_file)); + } ++ if (server_options.set_SyncServerOption_flag) { ++ builder.SetSyncServerOption(::grpc::ServerBuilder::SyncServerOption::NUM_CQS, server_options.NUM_CQS); ++ builder.SetSyncServerOption(::grpc::ServerBuilder::SyncServerOption::MIN_POLLERS, server_options.MIN_POLLERS); ++ builder.SetSyncServerOption(::grpc::ServerBuilder::SyncServerOption::MAX_POLLERS, server_options.MAX_POLLERS); ++ } + builder.RegisterService(model_service_.get()); + builder.RegisterService(prediction_service_.get()); + builder.SetMaxMessageSize(tensorflow::kint32max); +diff --git a/tensorflow_serving/model_servers/server.h b/tensorflow_serving/model_servers/server.h +index 7738f29..90a0994 100644 +--- a/tensorflow_serving/model_servers/server.h ++++ b/tensorflow_serving/model_servers/server.h +@@ -83,7 +83,11 @@ class Server { + bool enforce_session_run_timeout = true; + bool remove_unused_fields_from_bundle_metagraph = true; + bool use_tflite_model = false; +- ++ // SyncServerOption config ++ bool set_SyncServerOption_flag = false; ++ tensorflow::int32 NUM_CQS = 3; ++ tensorflow::int32 MIN_POLLERS = 6; ++ tensorflow::int32 MAX_POLLERS = 12; + Options(); + }; + +-- diff --git a/examples/rec_infer/optimize/README.md b/examples/rec_infer/optimize/README.md new file mode 100644 index 00000000..a6d7cd35 --- /dev/null +++ b/examples/rec_infer/optimize/README.md @@ -0,0 +1,51 @@ +# 链接ARM的optimized-routines库 +在memcpy等接口占比较大的模型中,有性能收益,源码路径为(https://github.com/ARM-software/optimized-routines/tree/v23.01) +```shell +unzip optimized-routines-23.01.zip +cd optimized-routines-23.01 +``` + +在源码基础上,修改代码,修改脚本如下: +```shell +for m in memcmp memcpy memset memmove memrchr strcpy strchrnul strchr strcmp stpcpy strncmp strnlen strrchr; do + for f in $(grep __${m}_aarch64 * -r |awk -F ':' '{print $1}'); do + sed_str1="__${m}_aarch64" + sed_str2="${m}" + sed -i 's!'${sed_str1}'!'${sed_str2}'!g' $f + done +done +``` + +编译: +```shell +make ARCH=aarch64 -j 8 +cp build/lib/libstringlib.so /usr/local/lib/ +``` + +在编译tensorflow serving时链接libstringlib.so,相关修改代码参考0001-Performance-optimization-referrence +运行server时,需要配置环境变量: +```shell +export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH +``` +
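+编译完成后,可用如下命令确认serving二进制已实际链接到该库(示意命令,bazel产物路径以实际编译输出为准): +```shell +ldd bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server | grep stringlib +``` +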
+# 链接jemalloc库 +源码下载链接: https://github.com/jemalloc/jemalloc/archive/refs/tags/5.3.0.tar.gz +编译安装命令如下: +```shell +tar -xzvf jemalloc-5.3.0.tar.gz +cd jemalloc-5.3.0 +./autogen.sh +make -j 8 +make install +``` + +安装完成后,默认安装在/usr/local/lib/,在编译tensorflow serving时链接libjemalloc.so,相关修改代码参考0001-Performance-optimization-referrence +运行server时,需要配置环境变量: +```shell +export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH +``` + +# gRPC配置优化 +增加NUM_CQS,MIN_POLLERS,MAX_POLLERS这三个配置项的配置,在多线程请求推理场景可以提升性能 +配置项参考gRPC官网(https://grpc.github.io/grpc/cpp/classgrpc_1_1_server_builder.html) +具体修改参考0001-Performance-optimization-referrence,配置最优值根据不同模型和机器可能有所不同; diff --git a/examples/rec_infer/server.sh b/examples/rec_infer/server.sh new file mode 100644 index 00000000..50735b0f --- /dev/null +++ b/examples/rec_infer/server.sh @@ -0,0 +1,6 @@ +taskset -c 0-32 /home/lmp/serving-1.15.0/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server \ + --model_name=saved_model \ + --model_base_path=$(pwd)/inference_model/saved_model/ \ + --port=9999 \ + --rest_api_port=9991 \ + --platform_config_file=test.cfg \ No newline at end of file diff --git a/tools/graph_partition/gen_config.py b/tools/graph_partition/gen_config.py new file mode 100644 index 00000000..8e80a182 --- /dev/null +++ b/tools/graph_partition/gen_config.py @@ -0,0 +1,54 @@ +import argparse +import os + +import tensorflow as tf +from graph_partition import GraphPartitioner + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--model_path", type=str, default="./") + parser.add_argument("--output_path", type=str, default="./") + parser.add_argument("--output_filename", type=str, default="config.cfg") + args = parser.parse_args() + + signature_def = "serving_default" + + # 模型配置 + embedding_lookup_op_type = ["Sum"] + heavy_load_ops = ["MatMul"] # 必须下沉的算子(暂时没用到) + use_whole_graph = False + partition_to_first_heavy_load = False + ######################################################### + + output_filepath = os.path.join(args.output_path, args.output_filename) + + with tf.compat.v1.Session() as sess: + meta_graph = tf.compat.v1.saved_model.loader.load( + sess, ["serve"], args.model_path + ) + ops = sess.graph.get_operations() + graph_partitioner = GraphPartitioner() + + graph_partitioner.graph = sess.graph + graph_partitioner.signature_def = meta_graph.signature_def.get(signature_def) + graph_partitioner.set_embedding_lookup_op_type(embedding_lookup_op_type) + + inputs, outputs = graph_partitioner.get_sub_graph() + + res_string = "[[" + inputs + "," + outputs + "]]" + + ori_test = open("template.cfg") + template = ori_test.read() + output = template.replace("#value@in_out_pair#", res_string) + if os.path.exists(output_filepath): + os.remove(output_filepath) + + # open text file + text_file = os.fdopen(os.open(output_filepath, os.O_WRONLY | os.O_CREAT, 0o666), "w") + + # write string to file + n = text_file.write(output) + + # close file + text_file.close() + ori_test.close() diff --git a/tools/graph_partition/graph_partition.py b/tools/graph_partition/graph_partition.py new file mode 100644 index 00000000..6e01e6e6 --- /dev/null +++ b/tools/graph_partition/graph_partition.py @@ -0,0 +1,116 @@ +import tensorflow as tf +from tensorflow.contrib import graph_editor as ge + + +class GraphPartitioner: + def __init__(self): + self.signature_def = None + self.graph = None + self.op_node_lookup = dict() + self.input_op_nodes = [] + self.output_op_nodes = [] + self.tensor_node_lookup = dict() + self.heavy_load_ops = [] + 
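# op types that mark the embedding lookup boundary, set via set_embedding_lookup_op_type() (["Sum"] in gen_config.py) +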
self.embedding_lookup_op_type = None + self.first_heavy_load_on_sparse_path = set() + self.first_op_after_lookup = [] + self.seen = set() + self.post_out = set() + self.partition_to_first_heavy_load = False + + self.sparse_lookup_ops = [] + self.sparse_lookup_tensors = [] + self.input_nodes = [] + self.output_nodes = [] + + @staticmethod + def has_gray_downstreams(op): + gray_list = ["DynamicPartition"] + down_ops = ge.get_forward_walk_ops([op]) + for op in down_ops: + if op.type in gray_list: + return True + return False + + def set_embedding_lookup_op_type(self, s): + self.embedding_lookup_op_type = s + + def get_sub_graph(self): + for op in self.graph.get_operations(): + if self._is_embedding_lookup(op): + self.sparse_lookup_ops.append(op) + if not self.sparse_lookup_ops: + for op in self.graph.get_operations(): + is_top_op = True + for op1 in self.graph.get_operations(): + for tensor in op1.outputs: + if tensor in op.inputs: + is_top_op = False + break + if not is_top_op: + break + if is_top_op: + self.sparse_lookup_ops.append(op) + check_ops = self.sparse_lookup_ops + self.sparse_lookup_ops = [] + for op in check_ops: + if not self.has_gray_downstreams(op): + self.sparse_lookup_ops.append(op) + self.sparse_lookup_tensors.extend(op.outputs) + + for op in self.graph.get_operations(): + for tensor in self.sparse_lookup_tensors: + if tensor in op.inputs: + self.input_nodes.append(op) + for k, v in self.signature_def.outputs.items(): + op_name = ( + str(v) + .split("\n")[0] + .replace(" ", "") + .replace('"', "") + .split(":")[1] + .split(":")[0] + ) + for op in self.graph.get_operations(): + if op.name == op_name: + self.output_nodes.append(op) + + float_ups = [] + to_expand = [] + in_str = [] + + for op in self.input_nodes: + if op.type not in float_ups: + if op.name not in in_str: + in_str.append(op.name) + else: + to_expand.append(op) + + while to_expand: + candidates = [] + for top in to_expand: + for op in self.graph.get_operations(): + for tensor in op.inputs: + if tensor in top.outputs: + candidates.append(op) + to_expand = [] + for op in candidates: + if op.type not in float_ups: + if op.name not in in_str: + in_str.append(op.name) + else: + to_expand.append(op) + return str(in_str), str([op.name for op in self.output_nodes]) + + def _is_embedding_lookup(self, op): + if op.type in self.embedding_lookup_op_type: + return True + + return False + + def _check_op_status(self): + unseen_list = [] + for name, op_node in self.op_node_lookup.items(): + if not op_node.seen: + unseen_list.append(name) + return unseen_list diff --git a/tools/graph_partition/template.cfg b/tools/graph_partition/template.cfg new file mode 100644 index 00000000..3227bdea --- /dev/null +++ b/tools/graph_partition/template.cfg @@ -0,0 +1,57 @@ +platform_configs { + key: "tensorflow" + value { + source_adapter_config { + [type.googleapis.com/tensorflow.serving.SaveModelBundleSourceAdapterConfig] { + legacy_config { + session_config { + graph_options { + rewrite_options { + custom_optimizers { + name: "NpuOptimizer" + parameter_map: { + key:"use_off_line" + value:{ + b:true + } + } + parameter_map: { + key:"mix_compile_mode" + value:{ + b:true + } + } + parameter_map: { + key:"variable_placement" + value:{ + s:"Host" + } + } + parameter_map: { + key:"graph_run_mode" + value:{ + i:0 + } + } + parameter_map: { + key:"precision_mode" + value:{ + s:"must_keep_origin_dtype" + } + } + parameter_map: { + key:"in_out_pair" + value:{ + s:"#value@in_out_pair#" + } + } + } + remapping: OFF + } + } + } + } + } + } + } +} \ No 
newline at end of file -- Gitee From 4401dcbce1940cf8c1c5de6afebe23bbe55bb38d Mon Sep 17 00:00:00 2001 From: steepcurve Date: Wed, 19 Jun 2024 07:47:34 +0000 Subject: [PATCH 220/302] bugfix: add `GlobalTensor` cache mode explicitly Signed-off-by: steepcurve --- .../op_kernel/embedding_lookup_by_address.cpp | 5 +++++ .../op_kernel/embedding_update_by_address.cpp | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp index cc45c5be..0d9babc8 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp @@ -44,6 +44,10 @@ public: pipe.InitBuffer(inQueue, pingpongNum, veclen); pipe.InitBuffer(outQueue, pingpongNum, veclen); + // set `GlobalTensor` cache mode explicitly + srcAddrGlobal.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); + dstDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); + // get start index for current core, core parallel block_indx block_dim,即使是最后一个核也应该多初始化一些,并对齐4的倍数 srcAddrGlobal.SetGlobalBuffer((__gm__ int64_t *)(address + block_idx * singleCoreAddrLen), needComputeAddrLen); dstDataGm.SetGlobalBuffer((__gm__ T *)(y)); @@ -111,6 +115,7 @@ private: int64_t address = srcAddrLocal.GetValue(i); if (address != 0) { + srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(address), embDimAligned); DataCopy(dataLocal[embDimAligned * nums], srcDataBufferGm, embDimAligned); } else { diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp index 828d7fbe..cfefb021 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp @@ -40,6 +40,11 @@ public: pipe.InitBuffer(inQueue, pingpongNum, veclen); pipe.InitBuffer(outQueue, pingpongNum, veclen); + // set `GlobalTensor` cache mode explicitly + srcAddrGlobal.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); + srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); + outDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); + // get start index for current core, core parallel block_indx block_dim srcAddrGlobal.SetGlobalBuffer((__gm__ int64_t *)(address + block_idx * singleCoreAddrLen)); srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(embedding + block_idx * singleCoreAddrLen @@ -112,6 +117,7 @@ private: for (int i = 0; i < addrNum; i++) { address = srcAddrLocal.GetValue(i); if (address != 0) { + dstDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); dstDataGm.SetGlobalBuffer((__gm__ T*)(address)); DataCopy(dstDataGm, dstLocal[i * inputDimAligned], inputDimAligned); } @@ -150,6 +156,7 @@ private: LocalTensor dstLocal = outQueue.DeQue(); if (address != 0) { + dstDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); dstDataGm.SetGlobalBuffer((__gm__ T *)(address)); if (updateType == 0) { -- Gitee From c12cf34f3f53d304d10eb71702650884f9eb4c56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 19 Jun 2024 15:53:04 +0800 Subject: [PATCH 221/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9=E5=AF=B9?= =?UTF-8?q?=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF=E6=8C=81?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=A4=8Dsave=E6=8A=A5=E9=94=99=E5=92=8Cdestr?= =?UTF-8?q?oy=E5=8D=A1=E9=A1=BF=E9=97=AE=E9=A2=98?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 2 -- mx_rec/core/embedding.py | 5 ++--- .../src/embedding_cache/offset_mapper/address_mapper.h | 8 ++------ src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 ++ src/core/l3_storage/preprocess_mapper.h | 2 +- 5 files changed, 7 insertions(+), 12 deletions(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index fd27fc27..50c8dd2e 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -29,8 +29,6 @@ EMPTY_STR = "" # default emb memory size for hbm、ddr、ssd DEFAULT_DEVICE_CACHE_MEMORY_SIZE = 2 * 1024 * 1024 * 1024 DEFAULT_HOST_CACHE_MEMORY_SIZE = 40 * 1024 * 1024 * 1024 -DEFAULT_SSD_CACHE_MEMORY_SIZE = sys.maxsize - # 获取ConfigInitializer对象实例失败提示信息 GET_CONFIG_INSTANCE_ERR_MSG = "Please init the environment for mx_rec at first." diff --git a/mx_rec/core/embedding.py b/mx_rec/core/embedding.py index eaf0c759..23eb86aa 100644 --- a/mx_rec/core/embedding.py +++ b/mx_rec/core/embedding.py @@ -29,8 +29,7 @@ from mx_rec.core.emb.base_sparse_embedding import BaseSparseEmbedding from mx_rec.core.emb.emb_factory import HBMDynamicSparseEmbeddingFactory, HBMSparseEmbeddingFactory, \ ExternalStorageSparseEmbeddingFactory from mx_rec.constants.constants import (MAX_INT32, All2allGradientsOp, MAX_VOCABULARY_SIZE, MAX_DEVICE_VOCABULARY_SIZE, - CacheModeEnum, DEFAULT_DEVICE_CACHE_MEMORY_SIZE, DEFAULT_HOST_CACHE_MEMORY_SIZE, - DEFAULT_SSD_CACHE_MEMORY_SIZE) + CacheModeEnum, DEFAULT_DEVICE_CACHE_MEMORY_SIZE, DEFAULT_HOST_CACHE_MEMORY_SIZE) from mx_rec.graph.constants import AnchorIteratorOp from mx_rec.util.communication.hccl_ops import get_rank_size from mx_rec.util.initialize import ConfigInitializer @@ -240,5 +239,5 @@ def check_and_set_default_voc_size(voc_size_list: List[int], dim_bytes: int): default_host_voc_size = int(DEFAULT_HOST_CACHE_MEMORY_SIZE / dim_bytes) # total 40GB voc_size_list[1] = min(default_host_voc_size, MAX_VOCABULARY_SIZE) if cache_mode == CacheModeEnum.SSD.value and voc_size_list[2] == 0: - voc_size_list[2] = DEFAULT_SSD_CACHE_MEMORY_SIZE + voc_size_list[2] = MAX_VOCABULARY_SIZE return diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h index 649b2d8a..eac5f46d 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h @@ -92,10 +92,7 @@ public: ~AutoRefillEmbeddingMemoryPool() { - { - std::lock_guard lock(producerMutex); - stop = true; - } + stop = true; producerCv.notify_all(); fullCv.notify_all(); for (auto& t : producerThreads) { @@ -105,7 +102,6 @@ public: void Stop() { - std::lock_guard lock(producerMutex); stop = true; producerCv.notify_all(); fullCv.notify_all(); @@ -141,7 +137,7 @@ private: uint64_t totalLeftVocabSize; uint32_t numThreads; std::atomic currBufferSize{0}; - volatile bool stop = false; + volatile std::atomic stop = false; volatile std::atomic full = false; std::mutex producerMutex; std::mutex getAddrMutex; diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 30c41e0c..c38aa131 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -449,6 +449,8 @@ void HybridMgmt::Destroy() procThreads.clear(); // 停止预处理 KEY_PROCESS_INSTANCE->Destroy(); + // stop embCache, even if the host emb is still allocating + embCache->Destroy(); LOG_DEBUG(MGMT + "Destroy 
hybrid_mgmt module end."); } diff --git a/src/core/l3_storage/preprocess_mapper.h b/src/core/l3_storage/preprocess_mapper.h index fd28677f..0fc8e4d8 100644 --- a/src/core/l3_storage/preprocess_mapper.h +++ b/src/core/l3_storage/preprocess_mapper.h @@ -26,7 +26,7 @@ namespace MxRec { */ class PreProcessMapper { public: - void Initialize(const string& embName, uint32_t ddrVocabSize, uint32_t l3StorageVocabSize) + void Initialize(const string& embName, size_t ddrVocabSize, size_t l3StorageVocabSize) { tableName = embName; lfuCache = LFUCache(embName); -- Gitee From a49ed83833b0259bb5b9a0a8b8fd9d0798092dd9 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Wed, 19 Jun 2024 09:24:19 +0000 Subject: [PATCH 222/302] bugfix: add `GlobalTensor` cache mode explicitly Signed-off-by: steepcurve --- .../cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp | 2 +- .../cust_op_by_addr/op_kernel/embedding_update_by_address.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp index 0d9babc8..e198b6c0 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp @@ -115,7 +115,7 @@ private: int64_t address = srcAddrLocal.GetValue(i); if (address != 0) { - srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); + srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(address), embDimAligned); DataCopy(dataLocal[embDimAligned * nums], srcDataBufferGm, embDimAligned); } else { diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp index cfefb021..5d496ee8 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp @@ -42,7 +42,7 @@ public: // set `GlobalTensor` cache mode explicitly srcAddrGlobal.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); - srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); + srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); outDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); // get start index for current core, core parallel block_indx block_dim -- Gitee From c4eb0df086df5aa4ed7de4d7f998492e547696a5 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Wed, 19 Jun 2024 21:41:21 +0800 Subject: [PATCH 223/302] =?UTF-8?q?=E3=80=90bugfix=E3=80=91=E4=BF=AE?= =?UTF-8?q?=E5=A4=8DDDR=E6=A8=A1=E5=BC=8Fdevice=E4=BE=A7=E5=A4=84=E7=90=86?= =?UTF-8?q?=E8=BE=83=E5=BF=AB=E6=97=B6host=E4=BE=A7=E6=8F=90=E5=89=8D?= =?UTF-8?q?=E5=8F=91=E9=80=81eos=E4=BF=A1=E6=81=AF=E5=9C=BA=E6=99=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/key_process/key_process.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 63163453..4207852f 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -1270,8 +1270,12 @@ bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::_V2::s lookUpSwapInAddrsPushId[info.name]); startTime = std::chrono::system_clock::now(); } + // hybridMgmtBlock->h2dNextBatchId[info.name] used by postfix increment, the last value will be grater than + // readEmbKeyBatchId and 
equals readEmbKeyBatchId + 1. + // Check '> readEmbKeyBatchId' condition to avoid send eos before handle all batch data from readEmbKey Op. if (isNeedSendEos[info.channelId] && readEmbKeyBatchId < info.batchId && - hybridMgmtBlock->h2dNextBatchId[info.name] == lookUpSwapInAddrsPushId[info.name]) { + hybridMgmtBlock->h2dNextBatchId[info.name] == lookUpSwapInAddrsPushId[info.name] && + hybridMgmtBlock->h2dNextBatchId[info.name] > readEmbKeyBatchId) { LOG_INFO("table:{}, channelId:{} batchId:{}, GetUniqueKeys eos", info.name, info.channelId, info.batchId); return true; -- Gitee From ef5bcd2886b40904f98d90fcee839c2fe4ba03b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 20 Jun 2024 08:59:19 +0800 Subject: [PATCH 224/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91=E6=89=A9=E5=AE=B9=E5=AF=B9?= =?UTF-8?q?=E5=A4=9A=E7=BA=A7=E7=BC=93=E5=AD=98=E7=9A=84=E6=94=AF=E6=8C=81?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E5=A4=8Dsave=E6=8A=A5=E9=94=99=E5=92=8Cdestr?= =?UTF-8?q?oy=E5=8D=A1=E9=A1=BF=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/constants/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mx_rec/constants/constants.py b/mx_rec/constants/constants.py index 50c8dd2e..f8558cd9 100644 --- a/mx_rec/constants/constants.py +++ b/mx_rec/constants/constants.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -import sys from enum import Enum import numpy as np -- Gitee From f309355edfe4a1771f5c9e70e15f6ba9e97f7932 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Thu, 20 Jun 2024 10:03:53 +0800 Subject: [PATCH 225/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=9E=84=E5=BB=BAmxR?= =?UTF-8?q?ec=E6=97=B6=E4=BC=9A=E7=BC=96=E8=AF=91=E5=AE=89=E8=A3=85?= =?UTF-8?q?=E6=89=A9=E5=AE=B9=E7=AE=97=E5=AD=90=E7=9A=84=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/gen_mxrec_tar_pkg.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/build/gen_mxrec_tar_pkg.sh b/build/gen_mxrec_tar_pkg.sh index 3b6a9713..b5cba7a2 100644 --- a/build/gen_mxrec_tar_pkg.sh +++ b/build/gen_mxrec_tar_pkg.sh @@ -82,10 +82,3 @@ function clean() gen_tar_file clean - -# compile cust op -echo "---------------- start to compile cust op ----------------" -cd "${MxRec_DIR}"/cust_op/cust_op_by_addr -chmod u+x run.sh -./run.sh -echo "---------------- compile cust op success!!!! 
----------------" \ No newline at end of file -- Gitee From 3da1b22d7d3b7cde2ad3caf9aa065fcb3773241b Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 20 Jun 2024 11:22:31 +0800 Subject: [PATCH 226/302] =?UTF-8?q?=E6=96=B0=E5=A2=9EreadEmbKeyBatchId?= =?UTF-8?q?=E8=AE=B0=E5=BD=95readEmbedKey=20Op=E5=A4=84=E7=90=86=E8=BF=87?= =?UTF-8?q?=E7=9A=84batch=E8=AE=A1=E6=95=B0=E7=94=A8=E4=BA=8E=E5=88=A4?= =?UTF-8?q?=E6=96=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt_block.cpp | 45 +++++++++++++--------- src/core/hybrid_mgmt/hybrid_mgmt_block.h | 9 ++++- src/core/key_process/key_process.cpp | 4 +- src/ops_tf/hybrid_dataset_ops.cpp | 4 +- 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp index 65235389..e4935166 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp @@ -114,7 +114,8 @@ void HybridMgmtBlock::CheckValid(int channelId) // 当python侧第一次调用时,此时跳过参数检查 if (lastRunChannelId == -1) { LOG_DEBUG(HYBRID_BLOCKING + "The data channel was called for the first time, and the parameters were " - "checked to be normal channelId {} hybridBatchId {}", channelId, hybridBatchId[channelId]); + "checked to be normal channelId {} hybridBatchId {}", channelId, + hybridBatchId[channelId]); lastRunChannelId = channelId; return; @@ -122,9 +123,9 @@ void HybridMgmtBlock::CheckValid(int channelId) // 在通道切换时,hybrid预处理的batch与python的一致。 if (pythonBatchId[lastRunChannelId] == hybridBatchId[lastRunChannelId]) { LOG_DEBUG(HYBRID_BLOCKING + - "HybridMgmt is switching data channels and checking for normal parameters. he number of steps " - "in the previous round is lastRunChannelId {} pythonBatchId {} hybridBatchId {}", - lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); + "HybridMgmt is switching data channels and checking for normal parameters. 
he number of steps " + "in the previous round is lastRunChannelId {} pythonBatchId {} hybridBatchId {}", + lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); } else if (pythonBatchId[lastRunChannelId] < hybridBatchId[lastRunChannelId]) { // 在通道切换时,上一个通道处理的数据超出了python侧的调用 if (rankInfo.isDDR and !WaitValid(lastRunChannelId)) { @@ -133,10 +134,10 @@ void HybridMgmtBlock::CheckValid(int channelId) } else { // 在通道切换时,hybrid处理的数据还没有赶上python侧,此时需要等待hybrid处理完成 LOG_INFO(HYBRID_BLOCKING + - "When switching data channels, it was found that HybridMgmt processed less data than the " - "Python side.In this case, after reading the dataset, the Python side called it again, but it was " - "interrupted midway,which did not affect the subsequent calls lastRunChannelId {} hybridBatchId {}", - lastRunChannelId, hybridBatchId[lastRunChannelId]); + "When switching data channels, it was found that HybridMgmt processed less data than the " + "Python side.In this case, after reading the dataset, the Python side called it again, but it was " + "interrupted midway,which did not affect the subsequent calls lastRunChannelId {} hybridBatchId {}", + lastRunChannelId, hybridBatchId[lastRunChannelId]); } lastRunChannelId = channelId; } @@ -147,7 +148,7 @@ void HybridMgmtBlock::DoBlock(int channelId) { // 通道没有切换,不用处理 LOG_DEBUG(HYBRID_BLOCKING + "HybridMgmt starts blocking channelId {} hybridBatchId {}", - channelId, hybridBatchId[channelId]); + channelId, hybridBatchId[channelId]); while (isBlock[channelId]) { std::this_thread::sleep_for(SLEEP_MS); @@ -156,7 +157,7 @@ void HybridMgmtBlock::DoBlock(int channelId) } } LOG_DEBUG(HYBRID_BLOCKING + "HybridMgmt is starting to wake up channelId {} hybridBatchId {}", - channelId, hybridBatchId[channelId]); + channelId, hybridBatchId[channelId]); } /// 重置所有的步数,主要用于图重构的情况,readembedkey算子重建 @@ -187,24 +188,24 @@ int HybridMgmtBlock::CheckSaveEmbMapValid() // 检查数据通道此时的HashMap是否被提前处理了 if (pythonBatchId[lastRunChannelId] >= hybridBatchId[lastRunChannelId]) { LOG_DEBUG(HYBRID_BLOCKING + - "HybridMgmt is checking the step and checking that the parameters are normal. " - "The number of steps in the previous round is " - "lastRunChannelId {} pythonBatchId {} hybridBatchId {}", - lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); + "HybridMgmt is checking the step and checking that the parameters are normal. " + "The number of steps in the previous round is " + "lastRunChannelId {} pythonBatchId {} hybridBatchId {}", + lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); return 0; } else if (pythonBatchId[lastRunChannelId] + 1 == hybridBatchId[lastRunChannelId]) { // 在通道切换时,上一个通道处理的数据超出了python侧的调用 LOG_DEBUG(HYBRID_BLOCKING + - "HybridMgmt is checking the step, and the parameters have been processed one step " - "in advance. The number of steps in the previous round was " - "lastRunChannelId {} pythonBatchId {} hybridBatchId {}", - lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); + "HybridMgmt is checking the step, and the parameters have been processed one step " + "in advance. 
The number of steps in the previous round was " + "lastRunChannelId {} pythonBatchId {} hybridBatchId {}", + lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); return 1; } else { // 在通道切换时,hybrid处理的数据还没有赶上python侧,此时需要等待hybrid处理完成 LOG_DEBUG(HYBRID_BLOCKING + "ERROR FLAG lastRunChannelId {} hybridBatchId {}", - lastRunChannelId, hybridBatchId[lastRunChannelId]); + lastRunChannelId, hybridBatchId[lastRunChannelId]); return -1; } } @@ -267,3 +268,9 @@ void HybridMgmtBlock::FinishSave() { finishSave = true; } + +void HybridMgmtBlock::IncreaseReadEmbBatchId(const int channelId) +{ + this->readEmbedBatchId[channelId] += 1; + this->readEmbedBatchIdAll += 1; +} \ No newline at end of file diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.h b/src/core/hybrid_mgmt/hybrid_mgmt_block.h index a969d7a9..78b5260a 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.h @@ -32,6 +32,7 @@ namespace MxRec { class HybridMgmtBlock { public: HybridMgmtBlock() = default; + // 上一次运行的通道ID int lastRunChannelId = -1; // hybrid将要处理的batch id @@ -40,10 +41,12 @@ namespace MxRec { int pythonBatchId[2] = {0, 0}; // readEmbed算子侧将要处理的batch id int readEmbedBatchId[2] = {0, 0}; + // readEmbed算子处理过的batch计数,不区分通道,不会重置;用于判断h2d swap是否需要eos + int readEmbedBatchIdAll = 0; int maxTrainStep = 0; int stepsInterval[2] = {0, 0}; // 通道i运行多少步后切换为通道j - // hybrid已完成H2D的step + // hybrid已完成H2D的step;不区分通道、图,不会重置; map h2dNextBatchId; int loop[2] = {1, 1}; @@ -88,6 +91,8 @@ namespace MxRec { void FinishSave(); + void IncreaseReadEmbBatchId(const int channelId); + private: // 控制通道阻塞的变量 bool isBlock[2] = {true, true}; @@ -101,7 +106,7 @@ namespace MxRec { public: explicit HybridMgmtBlockingException(const string scene) { - HybridMgmtBlock *hybridMgmtBlock = Singleton::GetInstance(); + HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance(); int channelId = hybridMgmtBlock->lastRunChannelId; int preprocessBatchNumber = hybridMgmtBlock->hybridBatchId[channelId]; int currentBatchNumber = hybridMgmtBlock->pythonBatchId[channelId]; diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 4207852f..237e3d2a 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -1272,10 +1272,10 @@ bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::_V2::s } // hybridMgmtBlock->h2dNextBatchId[info.name] used by postfix increment, the last value will be grater than // readEmbKeyBatchId and equals readEmbKeyBatchId + 1. - // Check '> readEmbKeyBatchId' condition to avoid send eos before handle all batch data from readEmbKey Op. + // Check '>= readEmbedBatchIdAll' condition to avoid send eos before handle all batch data from readEmbKey Op. 
if (isNeedSendEos[info.channelId] && readEmbKeyBatchId < info.batchId && hybridMgmtBlock->h2dNextBatchId[info.name] == lookUpSwapInAddrsPushId[info.name] && - hybridMgmtBlock->h2dNextBatchId[info.name] > readEmbKeyBatchId) { + hybridMgmtBlock->h2dNextBatchId[info.name] >= hybridMgmtBlock->readEmbedBatchIdAll) { LOG_INFO("table:{}, channelId:{} batchId:{}, GetUniqueKeys eos", info.name, info.channelId, info.batchId); return true; diff --git a/src/ops_tf/hybrid_dataset_ops.cpp b/src/ops_tf/hybrid_dataset_ops.cpp index 2eee8531..1ef52de1 100644 --- a/src/ops_tf/hybrid_dataset_ops.cpp +++ b/src/ops_tf/hybrid_dataset_ops.cpp @@ -214,7 +214,7 @@ namespace MxRec { return; } } - hybridMgmtBlock->readEmbedBatchId[channelId] += 1; + hybridMgmtBlock->IncreaseReadEmbBatchId(channelId); const Tensor& inputTensor = context->input(TensorIndex::TENSOR_INDEX_0); const auto& splits = context->input(TENSOR_INDEX_1).flat(); int fieldNum = 0; @@ -407,7 +407,7 @@ namespace MxRec { return; } } - hybridMgmtBlock->readEmbedBatchId[channelId] += 1; + hybridMgmtBlock->IncreaseReadEmbBatchId(channelId); const Tensor& inputTensor = context->input(TensorIndex::TENSOR_INDEX_0); size_t dataSize = inputTensor.NumElements(); -- Gitee From 7445ebc8b0f19386f1d80c596aaf1ae110223c67 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 20 Jun 2024 11:35:02 +0800 Subject: [PATCH 227/302] =?UTF-8?q?=E6=B3=A8=E9=87=8A=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt_block.h | 2 +- src/core/key_process/key_process.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.h b/src/core/hybrid_mgmt/hybrid_mgmt_block.h index 78b5260a..05e60e72 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.h @@ -41,7 +41,7 @@ namespace MxRec { int pythonBatchId[2] = {0, 0}; // readEmbed算子侧将要处理的batch id int readEmbedBatchId[2] = {0, 0}; - // readEmbed算子处理过的batch计数,不区分通道,不会重置;用于判断h2d swap是否需要eos + // readEmbed算子处理过的batch计数,不区分通道、图,不会重置;用于判断h2d swap是否需要eos int readEmbedBatchIdAll = 0; int maxTrainStep = 0; int stepsInterval[2] = {0, 0}; // 通道i运行多少步后切换为通道j diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 237e3d2a..b5dc962e 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -1270,8 +1270,6 @@ bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::_V2::s lookUpSwapInAddrsPushId[info.name]); startTime = std::chrono::system_clock::now(); } - // hybridMgmtBlock->h2dNextBatchId[info.name] used by postfix increment, the last value will be grater than - // readEmbKeyBatchId and equals readEmbKeyBatchId + 1. // Check '>= readEmbedBatchIdAll' condition to avoid send eos before handle all batch data from readEmbKey Op. 
if (isNeedSendEos[info.channelId] && readEmbKeyBatchId < info.batchId && hybridMgmtBlock->h2dNextBatchId[info.name] == lookUpSwapInAddrsPushId[info.name] && -- Gitee From 9c5e09c9c3e43fc5f36d270bfc7349fd8b55b8c2 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 20 Jun 2024 11:47:51 +0800 Subject: [PATCH 228/302] =?UTF-8?q?=E7=BC=A9=E8=BF=9B=E6=A0=BC=E5=BC=8F?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt_block.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp index e4935166..092cfa7c 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp @@ -113,9 +113,10 @@ void HybridMgmtBlock::CheckValid(int channelId) } // 当python侧第一次调用时,此时跳过参数检查 if (lastRunChannelId == -1) { - LOG_DEBUG(HYBRID_BLOCKING + "The data channel was called for the first time, and the parameters were " - "checked to be normal channelId {} hybridBatchId {}", channelId, - hybridBatchId[channelId]); + LOG_DEBUG(HYBRID_BLOCKING + + "The data channel was called for the first time, and the parameters were " + "checked to be normal channelId {} hybridBatchId {}", + channelId, hybridBatchId[channelId]); lastRunChannelId = channelId; return; -- Gitee From e080280e15b7d40fcd728b15024637beb6ca8fd4 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 20 Jun 2024 11:56:25 +0800 Subject: [PATCH 229/302] =?UTF-8?q?=E6=97=A5=E5=BF=97=E6=89=93=E5=8D=B0?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt_block.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp index 092cfa7c..c3459c77 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp @@ -121,10 +121,11 @@ void HybridMgmtBlock::CheckValid(int channelId) lastRunChannelId = channelId; return; } + t // 在通道切换时,hybrid预处理的batch与python的一致。 if (pythonBatchId[lastRunChannelId] == hybridBatchId[lastRunChannelId]) { LOG_DEBUG(HYBRID_BLOCKING + - "HybridMgmt is switching data channels and checking for normal parameters. he number of steps " + "HybridMgmt is switching data channels and checking for normal parameters. 
The number of steps " "in the previous round is lastRunChannelId {} pythonBatchId {} hybridBatchId {}", lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); } else if (pythonBatchId[lastRunChannelId] < hybridBatchId[lastRunChannelId]) { -- Gitee From 8adefa043295d5bb6245f215861e22c69220d3aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 20 Jun 2024 12:02:23 +0800 Subject: [PATCH 230/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91lock=E9=97=AE=E9=A2=98?= =?UTF-8?q?=E5=AF=BC=E8=87=B4=E6=B3=84=E9=9C=B2=E5=86=85=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h index eac5f46d..251c185c 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h @@ -103,6 +103,7 @@ public: void Stop() { stop = true; + std::lock_guard lock(producerMutex); producerCv.notify_all(); fullCv.notify_all(); } -- Gitee From 4eea2da6e251acba489adb98648add0eddeba681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 20 Jun 2024 12:02:47 +0800 Subject: [PATCH 231/302] =?UTF-8?q?=E3=80=90=E4=BF=AE=E6=94=B9=E8=AF=B4?= =?UTF-8?q?=E6=98=8E=20Modification=E3=80=91lock=E9=97=AE=E9=A2=98?= =?UTF-8?q?=E5=AF=BC=E8=87=B4=E6=B3=84=E9=9C=B2=E5=86=85=E5=AD=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h index 251c185c..46daaf29 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h @@ -93,6 +93,7 @@ public: ~AutoRefillEmbeddingMemoryPool() { stop = true; + std::lock_guard lock(producerMutex); producerCv.notify_all(); fullCv.notify_all(); for (auto& t : producerThreads) { -- Gitee From b1a9e5982e7bfa887c0ef8a399e709df2512ad31 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Thu, 20 Jun 2024 15:28:13 +0800 Subject: [PATCH 232/302] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt_block.cpp | 8 ++++---- src/core/hybrid_mgmt/hybrid_mgmt_block.h | 2 +- src/ops_tf/hybrid_dataset_ops.cpp | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp index c3459c77..fbee8b9a 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp @@ -115,18 +115,18 @@ void HybridMgmtBlock::CheckValid(int channelId) if (lastRunChannelId == -1) { LOG_DEBUG(HYBRID_BLOCKING + "The data channel was called for the first time, and the parameters were " - "checked to be normal channelId {} hybridBatchId {}", + "checked to be normal channelId {} hybridBatchId {}.", channelId, hybridBatchId[channelId]); lastRunChannelId = channelId; return; } - t + // 
在通道切换时,hybrid预处理的batch与python的一致。 if (pythonBatchId[lastRunChannelId] == hybridBatchId[lastRunChannelId]) { LOG_DEBUG(HYBRID_BLOCKING + "HybridMgmt is switching data channels and checking for normal parameters. The number of steps " - "in the previous round is lastRunChannelId {} pythonBatchId {} hybridBatchId {}", + "in the previous round is lastRunChannelId {} pythonBatchId {} hybridBatchId {}.", lastRunChannelId, pythonBatchId[lastRunChannelId], hybridBatchId[lastRunChannelId]); } else if (pythonBatchId[lastRunChannelId] < hybridBatchId[lastRunChannelId]) { // 在通道切换时,上一个通道处理的数据超出了python侧的调用 @@ -271,7 +271,7 @@ void HybridMgmtBlock::FinishSave() finishSave = true; } -void HybridMgmtBlock::IncreaseReadEmbBatchId(const int channelId) +void HybridMgmtBlock::IncrementReadEmbBatchId(const int channelId) { this->readEmbedBatchId[channelId] += 1; this->readEmbedBatchIdAll += 1; diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.h b/src/core/hybrid_mgmt/hybrid_mgmt_block.h index 05e60e72..a66f9b00 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.h @@ -91,7 +91,7 @@ namespace MxRec { void FinishSave(); - void IncreaseReadEmbBatchId(const int channelId); + void IncrementReadEmbBatchId(const int channelId); private: // 控制通道阻塞的变量 diff --git a/src/ops_tf/hybrid_dataset_ops.cpp b/src/ops_tf/hybrid_dataset_ops.cpp index 1ef52de1..0b192da5 100644 --- a/src/ops_tf/hybrid_dataset_ops.cpp +++ b/src/ops_tf/hybrid_dataset_ops.cpp @@ -214,7 +214,7 @@ namespace MxRec { return; } } - hybridMgmtBlock->IncreaseReadEmbBatchId(channelId); + hybridMgmtBlock->IncrementReadEmbBatchId(channelId); const Tensor& inputTensor = context->input(TensorIndex::TENSOR_INDEX_0); const auto& splits = context->input(TENSOR_INDEX_1).flat(); int fieldNum = 0; @@ -407,7 +407,7 @@ namespace MxRec { return; } } - hybridMgmtBlock->IncreaseReadEmbBatchId(channelId); + hybridMgmtBlock->IncrementReadEmbBatchId(channelId); const Tensor& inputTensor = context->input(TensorIndex::TENSOR_INDEX_0); size_t dataSize = inputTensor.NumElements(); -- Gitee From 58e4da2d272e67e4026fb0829b576604a18850b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Thu, 20 Jun 2024 15:36:33 +0800 Subject: [PATCH 233/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91DCNv2=E6=A8=A1?= =?UTF-8?q?=E5=9E=8Bssd=E6=A8=A1=E5=BC=8F=EF=BC=8CError=E6=89=93=E5=B1=8F?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E5=B1=8F=E8=94=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 30c41e0c..9ede05c8 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -2044,7 +2044,9 @@ void HybridMgmt::SendRestoreVec(const EmbBaseInfo &info, bool &remainBatchOut) info, ProcessedInfo::RESTORE, isEos); if (infoVecs == nullptr) { remainBatchOut = false; - LOG_ERROR("Information vector is nullptr!"); + if (isRunning) { + LOG_ERROR("Information vector is nullptr!"); + } return; } LOG_DEBUG("table:{}, channelId:{}, batchId:{}, get restore end, getRestoreTC(ms):{}", -- Gitee From e38bd5da61f159919aff1111242d1962f2dc1a57 Mon Sep 17 00:00:00 2001 From: liangrenhao Date: Thu, 20 Jun 2024 15:22:45 +0800 Subject: [PATCH 234/302] [FIX]clean code Signed-off-by: liangrenhao --- examples/rec_infer/client.py | 31 ++++++++++++++++++++---- examples/rec_infer/client.sh | 4 +++ 
examples/rec_infer/input_config.py | 17 +++++++++++++ examples/rec_infer/server.sh | 4 +++ tools/graph_partition/gen_config.py | 17 +++++++++++++ tools/graph_partition/graph_partition.py | 17 +++++++++++++ tools/graph_partition/template.cfg | 2 ++ 7 files changed, 87 insertions(+), 5 deletions(-) diff --git a/examples/rec_infer/client.py b/examples/rec_infer/client.py index 62a15882..7c6f1cb1 100644 --- a/examples/rec_infer/client.py +++ b/examples/rec_infer/client.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + import os import time @@ -51,15 +68,19 @@ class PredictModelGrpc(): return request, stub +FIELD_TYPE = "dtype" +FIELD_SHAPE = "shape" + + def gen_inputs(): inputs = {} input_types = {} for name in config: - input_types[name] = config[name]["dtype"] - if config[name]["dtype"] == tf.int32: - inputs[name] = np.random.randint(0, 100, size=config[name]["shape"]) - elif config[name]["dtype"] == tf.float32: - inputs[name] = np.random.randint(0, 2, size=config[name]["shape"]) * 1.0 + input_types[name] = config[name][FIELD_TYPE] + if config[name][FIELD_TYPE] == tf.int32: + inputs[name] = np.random.randint(0, 100, size=config[name][FIELD_SHAPE]) + elif config[name][FIELD_TYPE] == tf.float32: + inputs[name] = np.random.randint(0, 2, size=config[name][FIELD_SHAPE]) * 1.0 return inputs, input_types diff --git a/examples/rec_infer/client.sh b/examples/rec_infer/client.sh index fa968858..0d3169c2 100644 --- a/examples/rec_infer/client.sh +++ b/examples/rec_infer/client.sh @@ -1,3 +1,7 @@ +#!/bin/bash +# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. +# Description: startup client + source /usr/local/Ascend/ascend-toolkit/set_env.sh unset http_proxy unset https_proxy diff --git a/examples/rec_infer/input_config.py b/examples/rec_infer/input_config.py index 24e28f03..8fff6ceb 100644 --- a/examples/rec_infer/input_config.py +++ b/examples/rec_infer/input_config.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + import tensorflow as tf BATCH_SIZE = 9600 diff --git a/examples/rec_infer/server.sh b/examples/rec_infer/server.sh index 50735b0f..67166f61 100644 --- a/examples/rec_infer/server.sh +++ b/examples/rec_infer/server.sh @@ -1,3 +1,7 @@ +#!/bin/bash +# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. +# Description: startup server + taskset -c 0-32 /home/lmp/serving-1.15.0/bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server \ --model_name=saved_model \ --model_base_path=$(pwd)/inference_model/saved_model/ \ diff --git a/tools/graph_partition/gen_config.py b/tools/graph_partition/gen_config.py index 8e80a182..7cd69de3 100644 --- a/tools/graph_partition/gen_config.py +++ b/tools/graph_partition/gen_config.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + import argparse import os diff --git a/tools/graph_partition/graph_partition.py b/tools/graph_partition/graph_partition.py index 6e01e6e6..8ebfbfba 100644 --- a/tools/graph_partition/graph_partition.py +++ b/tools/graph_partition/graph_partition.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + import tensorflow as tf from tensorflow.contrib import graph_editor as ge diff --git a/tools/graph_partition/template.cfg b/tools/graph_partition/template.cfg index 3227bdea..fef30a9b 100644 --- a/tools/graph_partition/template.cfg +++ b/tools/graph_partition/template.cfg @@ -1,3 +1,5 @@ +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. 
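+# 本文件为TF Serving platform_config_file的模板,占位符#value@in_out_pair#由tools/graph_partition/gen_config.py替换为实际的输入输出节点对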
+
 platform_configs {
   key: "tensorflow"
   value {
-- 
Gitee


From 20580c9d9970f10764ccc27603c4e21a3ddf36c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Thu, 20 Jun 2024 20:13:20 +0800
Subject: [PATCH 235/302] =?UTF-8?q?=E3=80=90fix=E3=80=91HBM=E6=A8=A1?=
 =?UTF-8?q?=E5=BC=8F=E4=B8=8D=E5=BA=94=E8=AF=A5=E8=B0=83=E7=94=A8embCache-?=
 =?UTF-8?q?>Destroy()?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
index c38aa131..8ba1a9cd 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
@@ -450,7 +450,7 @@ void HybridMgmt::Destroy()
     // 停止预处理
    KEY_PROCESS_INSTANCE->Destroy();
     // stop embCache, even if the host emb is still allocating
-    embCache->Destroy();
+    if (embCache != nullptr) { embCache->Destroy(); }
 
     LOG_DEBUG(MGMT + "Destroy hybrid_mgmt module end.");
 }
-- 
Gitee


From 1dafba4fd98cc6d818064b65b6f69a006cebbe10 Mon Sep 17 00:00:00 2001
From: longfeifei <962977793@qq.com>
Date: Thu, 20 Jun 2024 16:22:10 +0800
Subject: [PATCH 236/302] =?UTF-8?q?little=20demo=20estimator=20=E8=B0=83?=
 =?UTF-8?q?=E7=94=A8init=E6=8E=A5=E5=8F=A3=E6=97=B6=E4=BC=A0=E5=85=A5save?=
 =?UTF-8?q?=5Fsteps=E3=80=81max=5Fsteps?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/demo/little_demo_estimator/main.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/demo/little_demo_estimator/main.py b/examples/demo/little_demo_estimator/main.py
index a369abe5..716e40d0 100644
--- a/examples/demo/little_demo_estimator/main.py
+++ b/examples/demo/little_demo_estimator/main.py
@@ -214,6 +214,8 @@ if __name__ == '__main__':
     # set init
     init(train_steps=args.train_steps,
          eval_steps=args.eval_steps,
+         save_steps=args.save_checkpoints_steps,
+         max_steps=args.max_steps,
          use_dynamic=use_dynamic,
          use_dynamic_expansion=use_dynamic_expansion)
 
-- 
Gitee


From 3272d73c1dbd47af85220f95a883f7971b21bb28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Fri, 21 Jun 2024 11:42:12 +0800
Subject: [PATCH 237/302] =?UTF-8?q?=E3=80=90fix=E3=80=91Acctr=E4=B8=AD?=
 =?UTF-8?q?=E7=9A=84cleancode=EF=BC=8C=E4=B8=8D=E8=83=BD=E5=B0=81=E8=A3=85?=
 =?UTF-8?q?=E5=AE=89=E5=85=A8=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../offset_mapper/mapper_base.h               | 53 +++++++++----------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h
index 164daaab..d4e0aaa6 100644
--- a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h
+++ b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h
@@ -304,25 +304,8 @@ public:
 
         /* allocate buckets for sub-maps */
         for (auto &mSubMap : mSubMaps) {
-            auto tmp = new (std::nothrow) NetHashBucket[bucketCount];
-            if (HM_UNLIKELY(tmp == nullptr)) {
-                FreeSubMaps();
-                ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR,
-                    "Failed to new hash bucket, probably out of memory");
-                return false;
-            }
-
-            /* make physical page and set to zero */
-            auto ret = SafeMemset(tmp, 0, sizeof(NetHashBucket) * bucketCount);
-            if (ret != 0) {
-                delete[] tmp;
-                tmp = nullptr;
-                FreeSubMaps();
-                ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "memset_s failed...
size: " + - std::to_string(sizeof(NetHashBucket) * bucketCount) + ", error code:" + std::to_string(ret)); - return false; - } - + NetHashBucket* tmp; + if (!NewAndSetBucket(bucketCount, 0, tmp)) { return false;} mSubMap = tmp; } @@ -697,22 +680,38 @@ private: } /* - * Description: SECUREC_MEM_MAX_LEN of memset_s function is 2GB - * Parameter: dest - destination address + * Description: allocate buckets and init it + * Parameter: bucketCount - the bucket counts * Parameter: c - the value to be copied - * Parameter: count - copies count bytes of value to dest + * Parameter: bucketPtr - pointing at the bucket array which is allocated + * NOTES: SECUREC_MEM_MAX_LEN of memset_s function is 2GB */ - int SafeMemset(void* dest, int c, size_t count) + bool NewAndSetBucket(const uint32_t& bucketCount, const int& c, NetHashBucket* &bucketPtr) { + bucketPtr = new (std::nothrow) NetHashBucket[bucketCount]; + if (HM_UNLIKELY(bucketPtr == nullptr)) { + FreeSubMaps(); + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, + "Failed to new hash bucket, probably out of memory"); + return false; + } + + /* make physical page and set to zero */ + size_t bucketsBytes = sizeof(NetHashBucket) * bucketCount; char* destBytePtr = reinterpret_cast(dest); - for (size_t i = 0; i < count; i += MEMSET_S_MAX_SIZE) { - size_t bytesOnceSet = (i + MEMSET_S_MAX_SIZE <= count) ? MEMSET_S_MAX_SIZE : (count - i); + for (size_t i = 0; i < bucketsBytes; i += MEMSET_S_MAX_SIZE) { + size_t bytesOnceSet = (i + MEMSET_S_MAX_SIZE <= bucketsBytes) ? MEMSET_S_MAX_SIZE : (bucketsBytes - i); auto ret = memset_s(destBytePtr + i, bytesOnceSet, c, bytesOnceSet); if (ret != 0) { - return ret; + delete[] bucketPtr; + bucketPtr = nullptr; + FreeSubMaps(); + ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "memset_s failed... size: " + std::to_string( + bucketsBytes) + ", error code:" + std::to_string(ret)); + return false; } } - return 0; + return true; } void FreeOverFlowedEntries() -- Gitee From 65195772e61067916f37e12d73afbc0cba53bc41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Fri, 21 Jun 2024 12:02:55 +0800 Subject: [PATCH 238/302] =?UTF-8?q?=E3=80=90fix=E3=80=91Acctr=E4=B8=AD?= =?UTF-8?q?=E7=9A=84cleacode=EF=BC=8C=E4=B8=8D=E8=83=BD=E5=B0=81=E8=A3=85?= =?UTF-8?q?=E5=AE=89=E5=85=A8=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h index d4e0aaa6..78729be3 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h @@ -698,7 +698,7 @@ private: /* make physical page and set to zero */ size_t bucketsBytes = sizeof(NetHashBucket) * bucketCount; - char* destBytePtr = reinterpret_cast(dest); + char* destBytePtr = reinterpret_cast(bucketPtr); for (size_t i = 0; i < bucketsBytes; i += MEMSET_S_MAX_SIZE) { size_t bytesOnceSet = (i + MEMSET_S_MAX_SIZE <= bucketsBytes) ? 
MEMSET_S_MAX_SIZE : (bucketsBytes - i); auto ret = memset_s(destBytePtr + i, bytesOnceSet, c, bytesOnceSet); -- Gitee From 7957f2c72c5fcc353c49ee53b5e2aa4fb8b22df8 Mon Sep 17 00:00:00 2001 From: yangzhen_BIG Date: Fri, 21 Jun 2024 08:03:01 +0000 Subject: [PATCH 239/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=EF=BC=88=E4=BF=9D?= =?UTF-8?q?=E5=AD=98=E4=B8=8E=E5=8A=A0=E8=BD=BD=EF=BC=89=EF=BC=9Aestimator?= =?UTF-8?q?=E6=A8=A1=E5=BC=8Fhost=E4=BE=A7=E9=81=BF=E5=85=8D=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E4=BF=9D=E5=AD=98=E5=8A=A0=E8=BD=BD=EF=BC=9B=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D=E5=8D=95=E6=AC=A1=E5=86=99=E8=B6=85=E7=B3=BB=E7=BB=9F?= =?UTF-8?q?=E4=B8=8A=E9=99=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/emb_table/embedding_ddr.cpp | 11 ++++-- .../local_file_system/local_file_system.cpp | 23 ++++++++++- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 39 +++++++++---------- src/core/hybrid_mgmt/hybrid_mgmt.h | 4 +- src/core/ssd_engine/ssd_engine.cpp | 14 +++++++ src/core/ssd_engine/ssd_engine.h | 3 ++ src/core/utils/common.cpp | 20 ++++++++++ src/core/utils/common.h | 2 + 8 files changed, 87 insertions(+), 29 deletions(-) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 167894e5..ca706c73 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -187,12 +187,15 @@ void EmbeddingDDR::LoadOptimizerSlot(const string &savePath, vector keys; vector> embeddings; vector> optimizerSlots; - embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots); + + auto step = GetStepFromPath(savePath); + if (step > 0) { + SyncLatestEmbedding(); + embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots); + } SaveKey(savePath, keys); SaveEmbedding(savePath, embeddings); @@ -291,7 +294,7 @@ void EmbeddingDDR::SaveEmbedding(const string& savePath, vector>& ssize_t writeBytesNum = fileSystemPtr_->Write(ss.str(), embeddings, embSize_); ssize_t expectWriteBytes = embeddings.size() * embSize_ * sizeof(float); if (writeBytesNum != expectWriteBytes) { - string errMsg = StringFormat("save embedding failed, write expect:%d, actual:%d, path:%s", + string errMsg = StringFormat("Save embedding failed, write expect:%ld, actual:%ld, path:%s .", expectWriteBytes, writeBytesNum, savePath.c_str()); throw runtime_error(errMsg); } diff --git a/src/core/file_system/local_file_system/local_file_system.cpp b/src/core/file_system/local_file_system/local_file_system.cpp index e9ddb8a4..b0b5c76a 100644 --- a/src/core/file_system/local_file_system/local_file_system.cpp +++ b/src/core/file_system/local_file_system/local_file_system.cpp @@ -124,8 +124,27 @@ ssize_t LocalFileSystem::Write(const string& filePath, vector>& fi flattenContent.insert(flattenContent.cend(), vec.cbegin(), vec.cend()); } - ssize_t writeBytesNum = - write(fd, reinterpret_cast(flattenContent.data()), flattenContent.size() * sizeof(float)); + size_t writeBytesRemain = flattenContent.size() * sizeof(float); + size_t writeSize = 0; + size_t idx = 0; + ssize_t writeBytesNum = 0; + auto dumpPtr = reinterpret_cast(flattenContent.data()); + + while (writeBytesRemain != 0) { + if (writeBytesRemain > oneTimeReadWriteLen) { + writeSize = oneTimeReadWriteLen; + } else { + writeSize = writeBytesRemain; + } + ssize_t res = write(fd, dumpPtr + idx, writeSize); + if (res == -1) { + close(fd); + return res; + } + writeBytesRemain -= res; + idx += res; + writeBytesNum += res; + } close(fd); diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp 
b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
index 8155f1ec..fda54d9d 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
@@ -206,6 +206,12 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables)
         throw runtime_error("HybridMgmt not initialized. Call Initialize first.");
     }
 
+    if (mgmtRankInfo.isDDR && IsTrainAndEvalCase()) {
+        LOG_INFO("estimator train and eval case, skip loading, "
+                 "host will reuse data in memory while evaluating since it's the same as the saved data");
+        return true;
+    }
+
     // 数据处理线程上锁
     KEY_PROCESS_INSTANCE->LoadSaveLock();
 
@@ -821,27 +827,6 @@ void HybridMgmt::EvictL3StorageKeys(const string& embName, const vectorEvictL3StorageEmbedding(embName, keys);
 }
 
-int HybridMgmt::GetStepFromPath(const string& loadPath) const
-{
-    regex pattern(SAVE_SPARSE_PATH_PREFIX + "-.*-(\\d+)");
-    smatch match;
-    if (regex_search(loadPath, match, pattern)) {
-        int res = 0;
-        unsigned int minSize = 2;
-        if (match.size() < minSize) {
-            return res;
-        }
-        try {
-            res = stoi(match[1]);
-        } catch (const std::invalid_argument& e) {
-            LOG_ERROR(e.what());
-        } catch (const std::out_of_range& e) {
-            LOG_ERROR(e.what());
-        }
-        return res;
-    }
-    return 0;
-}
 
 /// 通过pyBind在python侧调用,通知hybridMgmt上层即将进行图的执行,需要进行唤醒
 /// \param channelID 通道id
@@ -2233,3 +2218,15 @@ void HybridMgmt::EnqueueSwapInfo(const EmbBaseInfo &info,
     CheckLookupAddrSuccessDDR();
 }
+
+bool HybridMgmt::IsTrainAndEvalCase()
+{
+    bool isChannelSwitchCase = false;
+    for (auto& i: mgmtEmbInfo) {
+        if (specialProcessStatus[i.name] == ProcessStatus::AFTER_SWITCH_FIRST_BATCH) {
+            isChannelSwitchCase = true;
+            break;
+        }
+    }
+    return alreadyTrainOnce && isChannelSwitchCase;
+}
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h
index 4fd2b541..83299da3 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.h
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.h
@@ -190,8 +190,6 @@ namespace MxRec {
 
         void EvictL3StorageKeys(const string& embName, const vector& keys) const;
 
-        int GetStepFromPath(const string& loadPath) const;
-
         void LookUpAddrs(const string &embName, int extEmbeddingSize);
 
         void LookUpSwapAddrs(const std::string &embName, const std::string &swapStr);
@@ -323,6 +321,8 @@ namespace MxRec {
 
         void EnqueueSwapInfo(const EmbBaseInfo& info, std::pair, vector>& swapInKoPair,
                              std::pair, vector>& swapOutKoPair);
+
+        bool IsTrainAndEvalCase();
     };
 }
 #endif // MX_REC_EMB_MGMT_H
diff --git a/src/core/ssd_engine/ssd_engine.cpp b/src/core/ssd_engine/ssd_engine.cpp
index e50ad43c..3f0b3a1c 100644
--- a/src/core/ssd_engine/ssd_engine.cpp
+++ b/src/core/ssd_engine/ssd_engine.cpp
@@ -103,9 +103,16 @@ void SSDEngine::Save(int step)
     if (!isRunning) {
         throw runtime_error("SSDEngine not running");
     }
+
+    if (step == loadStep) {
+        LOG_INFO("save step equal to load step, skip saving, step:{}", step);
+        return;
+    }
+
     for (auto item: as_const(tableMap)) {
         item.second->Save(step);
     }
+    saveStep = step;
 }
 
 void SSDEngine::Load(const string &tableName, vector savePaths, uint64_t maxTableSize, int step)
@@ -113,12 +120,19 @@ void SSDEngine::Load(const string &tableName, vector savePaths, uint64_t
     if (!isRunning) {
         throw runtime_error("SSDEngine not running");
     }
+
+    if (step == saveStep) {
+        LOG_INFO("load step equal to save step, skip loading, step:{}", step);
+        return;
+    }
+
     auto it = as_const(tableMap).find(tableName);
     if (it != tableMap.end()) {
         throw invalid_argument("table already exist");
     }
 
     tableMap[tableName] = make_shared(tableName, savePaths, maxTableSize, compactThreshold, step);
+    loadStep = step;
 }
 
 void
SSDEngine::Start() diff --git a/src/core/ssd_engine/ssd_engine.h b/src/core/ssd_engine/ssd_engine.h index 40b65843..942318c4 100644 --- a/src/core/ssd_engine/ssd_engine.h +++ b/src/core/ssd_engine/ssd_engine.h @@ -74,6 +74,9 @@ namespace MxRec { shared_ptr compactThread = nullptr; void CompactMonitor(); + + int loadStep = -1; + int saveStep = -1; }; } diff --git a/src/core/utils/common.cpp b/src/core/utils/common.cpp index 32e32827..1b3edcfd 100644 --- a/src/core/utils/common.cpp +++ b/src/core/utils/common.cpp @@ -20,6 +20,7 @@ See the License for the specific language governing permissions and #include #include #include +#include #include @@ -166,4 +167,23 @@ namespace MxRec { return ss; } + int GetStepFromPath(const string& loadPath) + { + regex pattern(SAVE_SPARSE_PATH_PREFIX + "-.*-(\\d+)"); + smatch match; + if (!regex_search(loadPath, match, pattern)) { + return 0; + } + int res = 0; + unsigned int minSize = 2; + if (match.size() < minSize) { + return res; + } + try { + res = stoi(match[1]); + } catch (const std::invalid_argument& e) { + LOG_ERROR(e.what()); + } + return res; + } } // end namespace MxRec diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 26aad3fe..9a39e7ac 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -610,6 +610,8 @@ namespace MxRec { ostream& operator<<(ostream& ss, MxRec::CkptDataType type); bool CheckFilePermission(const string& filePath); + + int GetStepFromPath(const string& loadPath); } // end namespace MxRec #define KEY_PROCESS "\033[45m[KeyProcess]\033[0m " -- Gitee From d3a463873ad1777b322627529cc89a9df720757e Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 24 Jun 2024 04:12:47 +0000 Subject: [PATCH 240/302] fix: add compatibility with old cann versions Signed-off-by: steepcurve --- .../op_kernel/embedding_lookup_by_address.cpp | 4 ++++ .../op_kernel/embedding_update_by_address.cpp | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp index e198b6c0..f6a1e656 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_lookup_by_address.cpp @@ -44,9 +44,11 @@ public: pipe.InitBuffer(inQueue, pingpongNum, veclen); pipe.InitBuffer(outQueue, pingpongNum, veclen); +#ifdef L2_CACHE_HINT // set `GlobalTensor` cache mode explicitly srcAddrGlobal.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); dstDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); +#endif // get start index for current core, core parallel block_indx block_dim,即使是最后一个核也应该多初始化一些,并对齐4的倍数 srcAddrGlobal.SetGlobalBuffer((__gm__ int64_t *)(address + block_idx * singleCoreAddrLen), needComputeAddrLen); @@ -115,7 +117,9 @@ private: int64_t address = srcAddrLocal.GetValue(i); if (address != 0) { +#ifdef L2_CACHE_HINT srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL); +#endif srcDataBufferGm.SetGlobalBuffer((__gm__ T *)(address), embDimAligned); DataCopy(dataLocal[embDimAligned * nums], srcDataBufferGm, embDimAligned); } else { diff --git a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp index 5d496ee8..50abf83c 100644 --- a/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp +++ b/cust_op/cust_op_by_addr/op_kernel/embedding_update_by_address.cpp @@ -40,10 +40,12 @@ public: pipe.InitBuffer(inQueue, pingpongNum, veclen); pipe.InitBuffer(outQueue, 
pingpongNum, veclen);
 
+#ifdef L2_CACHE_HINT
     // set `GlobalTensor` cache mode explicitly
     srcAddrGlobal.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL);
     srcDataBufferGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL);
     outDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL);
+#endif
 
     // get start index for current core, core parallel block_indx block_dim
     srcAddrGlobal.SetGlobalBuffer((__gm__ int64_t *)(address + block_idx * singleCoreAddrLen));
@@ -117,7 +119,9 @@ private:
         for (int i = 0; i < addrNum; i++) {
             address = srcAddrLocal.GetValue(i);
             if (address != 0) {
+#ifdef L2_CACHE_HINT
                 dstDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL);
+#endif
                 dstDataGm.SetGlobalBuffer((__gm__ T*)(address));
                 DataCopy(dstDataGm, dstLocal[i * inputDimAligned], inputDimAligned);
             }
@@ -156,7 +160,9 @@ private:
         LocalTensor dstLocal = outQueue.DeQue();
 
         if (address != 0) {
+#ifdef L2_CACHE_HINT
             dstDataGm.SetL2CacheHint(CacheMode::CACHE_MODE_NORMAL);
+#endif
             dstDataGm.SetGlobalBuffer((__gm__ T *)(address));
 
             if (updateType == 0) {
-- 
Gitee


From d304c4b5422e9367410ab1dbd6cff659c852bb9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Mon, 24 Jun 2024 19:19:39 +0800
Subject: [PATCH 241/302] =?UTF-8?q?=E3=80=90fix=E3=80=91Acctr=E4=B8=AD?=
 =?UTF-8?q?=E7=9A=84cleancode=EF=BC=8C=E4=B8=8D=E8=83=BD=E5=B0=81=E8=A3=85?=
 =?UTF-8?q?=E5=AE=89=E5=85=A8=E5=87=BD=E6=95=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h
index 78729be3..42d62ca4 100644
--- a/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h
+++ b/src/AccCTR/src/embedding_cache/offset_mapper/mapper_base.h
@@ -706,8 +706,8 @@ private:
                 delete[] bucketPtr;
                 bucketPtr = nullptr;
                 FreeSubMaps();
-                ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "memset_s failed... size: " + std::to_string(
-                    bucketsBytes) + ", error code:" + std::to_string(ret));
+                ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR,
+                    "memset_s failed...
size: " + std::to_string(bucketsBytes) + ", error code:" + std::to_string(ret)); return false; } } -- Gitee From 277e413ffca6e714cb12acacd997aef153e2623e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 24 Jun 2024 19:56:58 +0800 Subject: [PATCH 242/302] =?UTF-8?q?=E3=80=90fix=E3=80=91=E3=80=90=E5=8A=A8?= =?UTF-8?q?=E6=80=81=E6=89=A9=E5=AE=B9=E3=80=91=E5=88=A0=E9=99=A4=E6=89=A9?= =?UTF-8?q?=E5=AE=B9=E7=9A=84=E5=86=97=E4=BD=99=E4=BB=A3=E7=A0=81=EF=BC=8C?= =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=89=A9=E5=AE=B9=E9=A2=9D=E5=A4=96=E7=94=B3?= =?UTF-8?q?=E8=AF=B7=E7=A9=BA=E9=97=B4=E7=9A=84=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/emb_table/emb_table.cpp | 163 ------------------------- src/core/emb_table/emb_table.h | 93 -------------- src/core/key_process/key_process.cpp | 39 ------ src/core/key_process/key_process.h | 4 - src/tests/emb_table/emb_table_test.cpp | 135 -------------------- 5 files changed, 434 deletions(-) delete mode 100644 src/core/emb_table/emb_table.cpp delete mode 100644 src/core/emb_table/emb_table.h delete mode 100644 src/tests/emb_table/emb_table_test.cpp diff --git a/src/core/emb_table/emb_table.cpp b/src/core/emb_table/emb_table.cpp deleted file mode 100644 index 914cf535..00000000 --- a/src/core/emb_table/emb_table.cpp +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. 
-==============================================================================*/ - -#include -#include -#include -#include -#include "acl/acl_base.h" -#include "utils/common.h" -#include "initializer/initializer.h" -#include "emb_table/emb_table.h" - - -using namespace std; -using namespace MxRec; -using namespace tensorflow; - -void EmbTable::Init(const EmbInfo& eInfo, const RankInfo& rInfo, int initSeed) -{ -#ifndef GTEST - this->rankInfo = rInfo; - this->seed = initSeed; - this->embInfo = eInfo; - LOG_INFO("EmbTable init, deviceID {}, embSize {} running", rInfo.deviceId, embInfo.extEmbeddingSize); - // 计算embedding table需要分配的内存块数 - auto ret = aclrtSetDevice(static_cast(rInfo.deviceId)); - if (ret != ACL_ERROR_NONE) { - LOG_ERROR("Set device failed, device_id:{}, ret={}", rInfo.deviceId, ret); - throw AclError(); - } - embSize = embInfo.extEmbeddingSize; - blockSize = BLOCK_EMB_COUNT * embSize; - for (int i = 0; i < INIT_BLOCK_COUNT; ++i) { - // 申请新的内存块 - void *newBlock = nullptr; - aclError ec = aclrtMalloc(&newBlock, blockSize * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST); - if (ec != ACL_SUCCESS) { - LOG_ERROR("aclrtMalloc failed, ret={}", ec); - throw AclError(); - } - // 申请内存初始化 - RandomInit(newBlock); - // 将新的内存块加入内存链表 - memoryList.push_back(newBlock); - SplitMemoryBlock(newBlock); - } - totalCapacity = static_cast(memoryList.size()) * BLOCK_EMB_COUNT; - LOG_INFO("aclrtMalloc success, emb name:{}, total capacity:{}", embInfo.name, totalCapacity); -#endif -} - -EmbTable::~EmbTable() -{ -#ifndef GTEST - for (void *block : memoryList) { - // 释放内存块 - aclError ret = aclrtFree(block); - if (ret != ACL_SUCCESS) { - LOG_ERROR("aclrtFree failed, ret={}", ret); - } - block = nullptr; - } -#endif -} - -// 从embeddingList获取一个可用的emb地址 -int64_t EmbTable::GetEmbAddress() -{ - int64_t ret = -1; -#ifndef GTEST - if (embeddingList.empty()) { - PrintStatus(); - LOG_DEBUG("GetEmbAddress, embedding_list size: empty! Add block!"); - void *addBlock = nullptr; - aclError ret = aclrtMalloc(&addBlock, blockSize * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST); - if (ret != ACL_SUCCESS) { - LOG_ERROR("aclrtMalloc failed, ret={}", ret); - throw AclError(); - } - RandomInit(addBlock); - // 将新的内存块加入内存list - memoryList.push_back(addBlock); - SplitMemoryBlock(addBlock); - totalCapacity += BLOCK_EMB_COUNT; - } - float *embAddr = embeddingList.front(); - embeddingList.pop_front(); - usedCapacity++; - ret = reinterpret_cast(embAddr); -#endif - return ret; -} - -void EmbTable::RandomInit(void* newBlock) -{ -#ifndef GTEST - LOG_INFO("Device GenerateEmbData Start, seed:{}, initializer num: {}", seed, embInfo.initializeInfos.size()); - vector devEmb(blockSize); - for (const auto& initializeInfo: as_const(embInfo.initializeInfos)) { - LOG_INFO("Device GenerateEmbData ing. 
name {}", initializeInfo.name.c_str()); - for (int i = 0; i < BLOCK_EMB_COUNT; i++) { - initializeInfo.initializer->GenerateData(&devEmb[i * embSize], embSize); - } - } - LOG_INFO("Device GenerateEmbData End, seed:{}", seed); - ExecuteAclMemcpy(newBlock, devEmb); -#endif -} - -void EmbTable::ExecuteAclMemcpy(void* newBlock, vector devEmb) const -{ -#ifndef GTEST - aclError ret = aclrtMemcpy( - newBlock, blockSize * sizeof(float), devEmb.data(), blockSize * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE); - if (ret != ACL_SUCCESS) { - LOG_ERROR("aclrtMemcpy failed, ret={}", ret); - throw AclError(); - } -#endif -} - - -void EmbTable::SplitMemoryBlock(void *newBlock) -{ -#ifndef GTEST - if (embSize == 0) { - throw std::runtime_error("SplitMemoryBlock by embSize=0!"); - } - for (int i = 0; i < BLOCK_EMB_COUNT; i++) { - float *embPtr = static_cast(newBlock) + i * embSize; - embeddingList.push_back(embPtr); - } -#endif -} - -void EmbTable::PrintStatus() const -{ - // 输出embedding table的总容量和未使用的使用容量 - LOG_INFO("Total capacity:{}, Unused capacity:{}", - totalCapacity * embSize, totalCapacity * embSize - usedCapacity * embSize); -} - -int64_t EmbTable::GetTableSize() const -{ - return static_cast(usedCapacity); -} - -int64_t EmbTable::GetTableCapacity() const -{ - return static_cast(totalCapacity); -} diff --git a/src/core/emb_table/emb_table.h b/src/core/emb_table/emb_table.h deleted file mode 100644 index 2d30818c..00000000 --- a/src/core/emb_table/emb_table.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. 
-==============================================================================*/ - -#ifndef MX_REC_EMB_TABLE_H -#define MX_REC_EMB_TABLE_H - -#include -#include -#include -#include - -#include "utils/common.h" - -namespace MxRec { - - using namespace std; - - class EmbTable { - public: - EmbTable() = default; - - void Init(const EmbInfo& eInfo, const RankInfo& rInfo, int initSeed = 0); - - ~EmbTable(); - - // 从embeddingList获取获取一个可用的emb地址 - int64_t GetEmbAddress(); - - // 打印emb表使用情况 - void PrintStatus() const; - - int64_t GetTableSize() const; - - int64_t GetTableCapacity() const; - - EmbTable(const EmbTable&) = delete; - - EmbTable(EmbTable&&) = delete; - - EmbTable& operator=(const EmbTable&) = delete; - - EmbTable& operator=(EmbTable&&) = delete; - - void ExecuteAclMemcpy(void* newBlock, vector devEmb) const; - - GTEST_PRIVATE: - constexpr static int BLOCK_EMB_COUNT = 100000; - constexpr static int INIT_BLOCK_COUNT = 5; - constexpr static int TEST_EMB_SIZE = 12; - EmbInfo embInfo; - RankInfo rankInfo; - size_t blockSize = 1; - int embSize = 1; - size_t totalCapacity = 1; - size_t usedCapacity = 0; - int seed = 0; - // embedding地址的列表 - list embeddingList; - // 内存块列表 - vector memoryList; - - void RandomInit(void* newBlock); - - // embSize由embInfo得出 - void SplitMemoryBlock(void* newBlock); - - // 内部类,抛出内存不足异常 - class OutOfMemoryError : public runtime_error { - public: - OutOfMemoryError() : runtime_error("Out of memory!") {} - }; - - // 内部类,抛出acl异常 - class AclError : public runtime_error { - public: - AclError() : runtime_error("Acl failed!") {} - }; - }; -} - -#endif // MX_REC_EMB_TABLE_MANAGER_H \ No newline at end of file diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index b5dc962e..74dfafa5 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -57,11 +57,6 @@ bool KeyProcess::Initialize(const RankInfo& rInfo, const vector& eInfos embInfos[info.name] = info; scInfo[info.name] = info.sendCount; InitHotEmbTotCount(info, rInfo); - if (rankInfo.useDynamicExpansion) { - // 动态扩容 - embeddingTableMap[info.name].Init(info, rInfo, seed); - LOG_INFO(KEY_PROCESS "EmbeddingTableMap:{} init success", info.name); - } } LOG_INFO(KEY_PROCESS "hot emb count info:{}", MapToString(hotEmbTotCount)); @@ -1114,40 +1109,6 @@ void KeyProcess::Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channe embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); } -void KeyProcess::Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel) -{ - TimeCost key2OffsetTC; - EASY_FUNCTION(profiler::colors::Blue600) - std::lock_guard lk(mut); // lock for PROCESS_THREAD - auto& key2Offset = keyOffsetMap[embName]; - auto& maxOffsetTmp = maxOffset[embName]; - auto& curEmbTable = embeddingTableMap[embName]; // empty when not use dynamic expansion - for (long& key : splitKey) { - if (key == -1) { - key = 0; - continue; - } - const auto& iter = key2Offset.find(key); - if (iter != key2Offset.end()) { - key = iter->second; - } else { - // 新值 - if (channel == TRAIN_CHANNEL_ID) { -#ifndef GTEST - int64_t addr = curEmbTable.GetEmbAddress(); - key2Offset[key] = addr; - key = addr; -#endif - maxOffsetTmp++; - continue; - } - key = 0; - } - } - LOG_DEBUG("current expansion emb:{}, usage:{}/{}, key2OffsetTC({} ms)", - embName, maxOffsetTmp, embInfos[embName].devVocabSize, key2OffsetTC.ElapsedMS()); -} - /* * 构建恢复向量,以便从去重后的emb向量/key恢复回batch对应的emb向量 * 输入接收到emb块的偏移blockOffset,batch内每个key在块内的偏移restoreVec 
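For context on the removal above: Key2OffsetDynamicExpansion was the dynamic-expansion variant of the surviving Key2Offset, and both follow the same lookup contract visible in the deleted body — padding keys (-1) map to offset 0, known keys return their cached offset, unseen keys are admitted only on the train channel, and the eval channel falls back to offset 0. What this patch drops is only the extra step where a newly admitted key also called EmbTable::GetEmbAddress() to back the offset with device memory. A minimal standalone sketch of that shared contract, with hypothetical names (MapKeyToOffset, keyToOffset, nextFreeOffset) standing in for the real keyOffsetMap/maxOffset members in key_process.cpp:

    #include <cstdint>
    #include <unordered_map>

    // Sketch only: resolve one raw feature key to a table offset.
    // trainChannel == true may admit unseen keys; eval never does.
    int64_t MapKeyToOffset(std::unordered_map<int64_t, int64_t>& keyToOffset,
                           int64_t& nextFreeOffset, int64_t key, bool trainChannel)
    {
        if (key == -1) {
            return 0;  // padding / invalid keys share the default slot
        }
        auto it = keyToOffset.find(key);
        if (it != keyToOffset.end()) {
            return it->second;  // key was admitted earlier
        }
        if (trainChannel) {
            keyToOffset[key] = nextFreeOffset;
            return nextFreeOffset++;  // admit the new key at the next free slot
        }
        return 0;  // eval-only keys read the default slot instead
    }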
diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h index 589fc2a5..82a3205b 100644 --- a/src/core/key_process/key_process.h +++ b/src/core/key_process/key_process.h @@ -28,7 +28,6 @@ See the License for the specific language governing permissions and #include "ock_ctr_common/include/factory.h" #include "utils/common.h" -#include "emb_table/emb_table.h" #include "feature_admit_and_evict.h" #include "hybrid_mgmt/hybrid_mgmt_block.h" #include "utils/singleton.h" @@ -196,7 +195,6 @@ namespace MxRec { map> evictPosMap {}; map> hotKey {}; map hotEmbTotCount; - map embeddingTableMap {}; ock::ctr::FactoryPtr factory {}; int hotEmbUpdateStep = HOT_EMB_UPDATE_STEP_DEFAULT; bool isWithFAAE; @@ -251,8 +249,6 @@ namespace MxRec { void Key2Offset(const EmbNameT& embName, KeysT& splitKey, int channel); - void Key2OffsetDynamicExpansion(const EmbNameT& embName, KeysT& splitKey, int channel); - unique_ptr GetBatchData(int channel, int commId) const; void BuildRestoreVec(const unique_ptr& batch, const vector& blockOffset, diff --git a/src/tests/emb_table/emb_table_test.cpp b/src/tests/emb_table/emb_table_test.cpp deleted file mode 100644 index b26b4487..00000000 --- a/src/tests/emb_table/emb_table_test.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and - limitations under the License. 
-==============================================================================*/ - -#include -#include -#include -#include -#include -#include -#include -#include "utils/common.h" -#include "emb_table/emb_table.h" - -using namespace std; -using namespace MxRec; -using namespace testing; -using namespace tensorflow; - -class EmbTableTest : public testing::Test { -protected: - void SetUp() - { - // 设置测试用的EmbInfo - embInfo.extEmbeddingSize = embTable.TEST_EMB_SIZE; - LOG_INFO("EmbTable BLOCK_EMB_COUNT {} INIT_BLOCK_COUNT {}", - embTable.BLOCK_EMB_COUNT, embTable.INIT_BLOCK_COUNT); - rankInfo.rankId = 0; - rankInfo.rankSize = 1; - rankInfo.localRankSize = 1; - rankInfo.useStatic = true; - rankInfo.localRankId = 0; - rankInfo.isDDR = true; - rankInfo.ctrlSteps = { 1, -1 }; - rankInfo.deviceId = 0; - // 初始化EmbeddingTable -#ifndef GTEST - LOG_INFO("rank {} running", rankInfo.deviceId); - aclInit(nullptr); -#endif - } - - EmbTable embTable; - EmbInfo embInfo; - RankInfo rankInfo; - aclrtContext context; - - void TearDown() { - } -}; - -// 测试初始化是否正常 -TEST_F(EmbTableTest, Init) -{ -#ifndef GTEST - // 测试初始化是否出现异常 - EXPECT_NO_THROW(embTable.Init(embInfo, rankInfo, 0)); - LOG_INFO("embTable Init succeed!"); - ASSERT_EQ(embTable.rankInfo.g_rankId, rankInfo.g_rankId); - ASSERT_EQ(embTable.rankInfo.rankSize, rankInfo.rankSize); - ASSERT_EQ(embTable.rankInfo.localRankSize, rankInfo.localRankSize); - ASSERT_EQ(embTable.rankInfo.useStatic, rankInfo.useStatic); - ASSERT_EQ(embTable.rankInfo.localRankId, rankInfo.localRankId); - // 测试容量是否正常 - LOG_INFO("totalCapacity {}, INIT_BLOCK_COUNT {}", embTable.totalCapacity, embTable.INIT_BLOCK_COUNT); - EXPECT_EQ(embTable.totalCapacity, embTable.INIT_BLOCK_COUNT * embTable.BLOCK_EMB_COUNT); -#endif -} - -// 测试embedding list为空时的情况 -TEST_F(EmbTableTest, GetEmbAddressEmptyList) -{ -#ifndef GTEST - embTable.Init(embInfo, rankInfo, 0); - while (!embTable.embeddingList.empty()) { - float *embAddr = reinterpret_cast(embTable.GetEmbAddress()); - EXPECT_NE(embAddr, nullptr); - } - ASSERT_EQ(embTable.embeddingList.size(), 0); - - float *curAddr = nullptr; - int usedCapacityBefore = embTable.usedCapacity; - ASSERT_NO_THROW({ - curAddr= reinterpret_cast(embTable.GetEmbAddress()); - }); - EXPECT_NE(curAddr, nullptr); - EXPECT_EQ(embTable.usedCapacity, usedCapacityBefore + 1); -#endif -} - -// 测试正常情况 -TEST_F(EmbTableTest, GetEmbAddressNormal) -{ -#ifndef GTEST - embTable.Init(embInfo, rankInfo, 0); - ASSERT_EQ(embTable.totalCapacity, embTable.INIT_BLOCK_COUNT); - float *curAddr = nullptr; - int totalCapacityBefore = embTable.totalCapacity; - int usedCapacityBefore = embTable.usedCapacity; - ASSERT_NO_THROW({ - curAddr = reinterpret_cast(embTable.GetEmbAddress()); - }); - EXPECT_NE(curAddr, nullptr); - EXPECT_EQ(embTable.totalCapacity, totalCapacityBefore); - EXPECT_EQ(embTable.usedCapacity, usedCapacityBefore + 1); -#endif -} - -// 测试将一个emb地址放入embeddingList中,是否成功 -TEST_F(EmbTableTest, PutEmbAddress) -{ -#ifndef GTEST - embTable.Init(embInfo, rankInfo, 0); - int64_t curAddr; - int usedCapacityBefore = embTable.usedCapacity; - ASSERT_NO_THROW({ - curAddr = embTable.GetEmbAddress(); - }); - EXPECT_EQ(embTable.usedCapacity, usedCapacityBefore + 1); - embTable.PutEmbAddress(curAddr); - EXPECT_EQ(embTable.usedCapacity, usedCapacityBefore); - EXPECT_EQ(curAddr, reinterpret_cast(embTable.embeddingList.back())); -#endif -} -- Gitee From 3b9fbb550f6ca5b78f3e6adfbe4220ea98c7afb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 24 Jun 2024 20:24:09 +0800 
Subject: [PATCH 243/302] =?UTF-8?q?=E3=80=90fix=E3=80=91capacity=E6=8E=A5?=
 =?UTF-8?q?=E5=8F=A3=E9=80=82=E9=85=8D=E6=96=B0ddr=E3=80=81ssd?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/core/emb/sparse_embedding.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mx_rec/core/emb/sparse_embedding.py b/mx_rec/core/emb/sparse_embedding.py
index 071f4506..39af9d60 100644
--- a/mx_rec/core/emb/sparse_embedding.py
+++ b/mx_rec/core/emb/sparse_embedding.py
@@ -77,9 +77,9 @@ class ExternalStorageSparseEmbedding(SparseEmbedding):
     def capacity(self) -> int:
         # DDR
         if not self._ssd_vocabulary_size:
-            return self._device_vocabulary_size + self._host_vocabulary_size
+            return self._host_vocabulary_size
         # SSD
-        return self._device_vocabulary_size + self._host_vocabulary_size + self._ssd_vocabulary_size
+        return self._host_vocabulary_size + self._ssd_vocabulary_size
 
 
 def _set_specific_value_for_non_valid_key(id_offsets: Optional[tf.Tensor],
-- 
Gitee


From d54007682b22976f72a62049888e60d6335cf123 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?=
Date: Wed, 26 Jun 2024 17:19:26 +0800
Subject: [PATCH 244/302] =?UTF-8?q?=E3=80=90fix=E3=80=91=E5=A2=9E=E5=8A=A0?=
 =?UTF-8?q?=E5=BC=82=E5=B8=B8=E6=83=85=E5=86=B5=E4=B8=8B=E7=9A=84=E6=97=A5?=
 =?UTF-8?q?=E5=BF=97=E8=AF=B4=E6=98=8E?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/validator/emb_validator.py                       | 8 ++++----
 .../src/embedding_cache/cache_manager/cache_manager.cpp | 4 +++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/mx_rec/validator/emb_validator.py b/mx_rec/validator/emb_validator.py
index 0c7d7e81..e4417b6d 100644
--- a/mx_rec/validator/emb_validator.py
+++ b/mx_rec/validator/emb_validator.py
@@ -78,14 +78,14 @@ def check_emb_lookup_params(table_params: dict, feature_spec: Union[tf.Tensor, F
     if slice_device_vocabulary_size < send_count * rank_size:
         raise ValueError(f"Given device_vocabulary_size was too small for table '{table_name}', "
                          f"in which slice_device_vocabulary_size was {slice_device_vocabulary_size} "
-                         f"and send_count({send_count}) * rank_size({rank_size}) was "
-                         f"{send_count * rank_size}.")
+                         f"and it must be at least send_count({send_count}) * rank_size({rank_size}): "
+                         f"{send_count * rank_size}, please increase [device vocabSize] in [create_table] interface")
 
     if slice_host_vocabulary_size < send_count * rank_size:
         raise ValueError(f"Given host_vocabulary_size was too small for table '{table_name}', "
                          f"in which slice_host_vocabulary_size was {slice_host_vocabulary_size} "
-                         f"and send_count({send_count}) * rank_size({rank_size}) was "
-                         f"{send_count * rank_size}.")
+                         f"and it must be at least send_count({send_count}) * rank_size({rank_size}): "
+                         f"{send_count * rank_size}, please increase [host vocabSize] in [create_table] interface")
 
 
 def check_emb_multi_lookup_times(lookup_times: int, table_name: str):
diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
index 3620c5d0..8a6187a1 100644
--- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
+++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp
@@ -40,7 +40,9 @@ int EmbCacheManagerImpl::CreateCacheForTable(const EmbCacheInfo& embCacheInfo,
     }
 
     if (embCacheInfo.vocabSize < embCacheInfo.maxCacheSize) {
-        ExternalLogger::PrintLog(LogLevel::ERROR, "vocabSize must be greater than or equal to maxCacheSize");
+
ExternalLogger::PrintLog(LogLevel::ERROR, "host vocabSize:" + std::to_string(embCacheInfo.vocabSize) + + " must be greater than or equal to device vocabSize:" + std::to_string(embCacheInfo.maxCacheSize) + + ", please increase [host vocabSize] in [create_table] interface"); return H_HOST_VOCAB_SIZE_TOO_SMALL; } -- Gitee From 2187cc1c56a8e4fb3f90bb051c717f6cb951153d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 27 Jun 2024 19:19:32 +0800 Subject: [PATCH 245/302] =?UTF-8?q?=E3=80=90fix=E3=80=91ddr=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E4=B8=8B=E7=9A=84eos=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt_block.cpp | 9 +++------ src/core/hybrid_mgmt/hybrid_mgmt_block.h | 6 ++---- src/core/key_process/key_process.cpp | 20 ++++++++++++++------ src/core/key_process/key_process.h | 2 +- src/ops_tf/hybrid_dataset_ops.cpp | 4 ++-- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp index fbee8b9a..04433469 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.cpp @@ -174,6 +174,9 @@ void HybridMgmtBlock::ResetAll(int channelId) pythonBatchId[channelId] = 0; hybridBatchId[channelId] = 0; isBlock[channelId] = false; + if (channelId == EVAL_CHANNEL_ID) { + evalBatchIdTotal += readEmbedBatchId[channelId]; + } LOG_DEBUG(HYBRID_BLOCKING + "after reset block status," " channelId:{}, pythonBatchId:{}, readEmbedBatchId:{}, hybridBatchId:{}", @@ -269,10 +272,4 @@ bool HybridMgmtBlock::IsNeedWaitSave() void HybridMgmtBlock::FinishSave() { finishSave = true; -} - -void HybridMgmtBlock::IncrementReadEmbBatchId(const int channelId) -{ - this->readEmbedBatchId[channelId] += 1; - this->readEmbedBatchIdAll += 1; } \ No newline at end of file diff --git a/src/core/hybrid_mgmt/hybrid_mgmt_block.h b/src/core/hybrid_mgmt/hybrid_mgmt_block.h index a66f9b00..f3ee6e8f 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt_block.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt_block.h @@ -41,8 +41,8 @@ namespace MxRec { int pythonBatchId[2] = {0, 0}; // readEmbed算子侧将要处理的batch id int readEmbedBatchId[2] = {0, 0}; - // readEmbed算子处理过的batch计数,不区分通道、图,不会重置;用于判断h2d swap是否需要eos - int readEmbedBatchIdAll = 0; + // eval通道处理过的batch计数,不区分通道、图,不会重置;用于判断h2d swap是否需要eos + int evalBatchIdTotal = 0; int maxTrainStep = 0; int stepsInterval[2] = {0, 0}; // 通道i运行多少步后切换为通道j @@ -91,8 +91,6 @@ namespace MxRec { void FinishSave(); - void IncrementReadEmbBatchId(const int channelId); - private: // 控制通道阻塞的变量 bool isBlock[2] = {true, true}; diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index b5dc962e..96448c05 100644 --- a/src/core/key_process/key_process.cpp +++ b/src/core/key_process/key_process.cpp @@ -1263,19 +1263,27 @@ bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::_V2::s int readEmbKeyBatchId = hybridMgmtBlock->readEmbedBatchId[info.channelId] - 1; // 避免eos在keyProcess还未处理完数据时插队到通道前面 std::chrono::duration elapsedTime = endTime - startTime; + // train and eval batch total num + int allChannelBatchId = 0; + if (info.channelId == EVAL_CHANNEL_ID) { + allChannelBatchId = hybridMgmtBlock->evalBatchIdTotal + hybridMgmtBlock->hybridBatchId[TRAIN_CHANNEL_ID] + + readEmbKeyBatchId; + } else { + allChannelBatchId = hybridMgmtBlock->evalBatchIdTotal + readEmbKeyBatchId; + } if (info.batchId != 0 && elapsedTime.count() >= 
timeoutGetUniqueKeysEmpty) { LOG_DEBUG("table:{}, channelId:{}, isNeedSendEos:{}, readEmbKeyBatchId:{}, batch:{}, h2dNextBatchId:{}," - " lookUpSwapInAddrsPushId:{}", info.name, info.channelId, isNeedSendEos[info.channelId], - readEmbKeyBatchId, info.batchId, hybridMgmtBlock->h2dNextBatchId[info.name], - lookUpSwapInAddrsPushId[info.name]); + " lookUpSwapInAddrsPushId:{}, allChannelBatchId:{}", info.name, info.channelId, + isNeedSendEos[info.channelId], readEmbKeyBatchId, info.batchId, + hybridMgmtBlock->h2dNextBatchId[info.name], lookUpSwapInAddrsPushId[info.name], allChannelBatchId); startTime = std::chrono::system_clock::now(); } // Check '>= readEmbedBatchIdAll' condition to avoid send eos before handle all batch data from readEmbKey Op. if (isNeedSendEos[info.channelId] && readEmbKeyBatchId < info.batchId && hybridMgmtBlock->h2dNextBatchId[info.name] == lookUpSwapInAddrsPushId[info.name] && - hybridMgmtBlock->h2dNextBatchId[info.name] >= hybridMgmtBlock->readEmbedBatchIdAll) { - LOG_INFO("table:{}, channelId:{} batchId:{}, GetUniqueKeys eos", - info.name, info.channelId, info.batchId); + hybridMgmtBlock->h2dNextBatchId[info.name] >= allChannelBatchId) { + LOG_INFO("table:{}, channelId:{} batchId:{}, GetUniqueKeys eos, h2dNextBatchId:{}, allChannelBatchId:{}", + info.name, info.channelId, info.batchId, hybridMgmtBlock->h2dNextBatchId[info.name], allChannelBatchId); return true; } LOG_TRACE("getting uniqueKeys failed, table:{}, channel:{}, mgmt batchId:{}, readEmbKey batchId:{}, list is empty", diff --git a/src/core/key_process/key_process.h b/src/core/key_process/key_process.h index 589fc2a5..ba24181a 100644 --- a/src/core/key_process/key_process.h +++ b/src/core/key_process/key_process.h @@ -205,7 +205,7 @@ namespace MxRec { bool isNeedSendEos[2] = {false, false}; // 表示各表通道0、1的eos状态 atomic readySendEosCnt[2]; atomic finishSendEosCnt[2]; - const double timeoutGetUniqueKeys = 10.0; // 如果超时仍未获取到数据将触发EOS + const double timeoutGetUniqueKeys = 30.0; // 如果超时仍未获取到数据将触发EOS const double timeoutGetUniqueKeysEmpty = 1.0; // 如果超时仍未获取到数据将打印信息 void InitHotEmbTotCount(const EmbInfo& info, const RankInfo& rInfo); diff --git a/src/ops_tf/hybrid_dataset_ops.cpp b/src/ops_tf/hybrid_dataset_ops.cpp index 0b192da5..2eee8531 100644 --- a/src/ops_tf/hybrid_dataset_ops.cpp +++ b/src/ops_tf/hybrid_dataset_ops.cpp @@ -214,7 +214,7 @@ namespace MxRec { return; } } - hybridMgmtBlock->IncrementReadEmbBatchId(channelId); + hybridMgmtBlock->readEmbedBatchId[channelId] += 1; const Tensor& inputTensor = context->input(TensorIndex::TENSOR_INDEX_0); const auto& splits = context->input(TENSOR_INDEX_1).flat(); int fieldNum = 0; @@ -407,7 +407,7 @@ namespace MxRec { return; } } - hybridMgmtBlock->IncrementReadEmbBatchId(channelId); + hybridMgmtBlock->readEmbedBatchId[channelId] += 1; const Tensor& inputTensor = context->input(TensorIndex::TENSOR_INDEX_0); size_t dataSize = inputTensor.NumElements(); -- Gitee From 1d735d5e609b243da9720f14c164bc29649d6197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 27 Jun 2024 20:17:25 +0800 Subject: [PATCH 246/302] =?UTF-8?q?=E3=80=90fix=E3=80=91ddr=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E4=B8=8B=E7=9A=84eos=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/key_process/key_process.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/core/key_process/key_process.cpp b/src/core/key_process/key_process.cpp index 96448c05..4d467153 100644 --- 
a/src/core/key_process/key_process.cpp
+++ b/src/core/key_process/key_process.cpp
@@ -1259,7 +1259,7 @@ bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::_V2::s
     HybridMgmtBlock* hybridMgmtBlock = Singleton::GetInstance();
 
     auto endTime = std::chrono::system_clock::now();
-    // readEmbKey真实的次数是readEmbedBatchId减1
+    // readEmbKey starts from 0
     int readEmbKeyBatchId = hybridMgmtBlock->readEmbedBatchId[info.channelId] - 1;
     // 避免eos在keyProcess还未处理完数据时插队到通道前面
     std::chrono::duration elapsedTime = endTime - startTime;
@@ -1267,9 +1267,9 @@ bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::_V2::s
     int allChannelBatchId = 0;
     if (info.channelId == EVAL_CHANNEL_ID) {
         allChannelBatchId = hybridMgmtBlock->evalBatchIdTotal + hybridMgmtBlock->hybridBatchId[TRAIN_CHANNEL_ID] +
-            readEmbKeyBatchId;
+            hybridMgmtBlock->readEmbedBatchId[info.channelId];
     } else {
-        allChannelBatchId = hybridMgmtBlock->evalBatchIdTotal + readEmbKeyBatchId;
+        allChannelBatchId = hybridMgmtBlock->evalBatchIdTotal + hybridMgmtBlock->readEmbedBatchId[info.channelId];
     }
     if (info.batchId != 0 && elapsedTime.count() >= timeoutGetUniqueKeysEmpty) {
         LOG_DEBUG("table:{}, channelId:{}, isNeedSendEos:{}, readEmbKeyBatchId:{}, batch:{}, h2dNextBatchId:{},"
@@ -1283,7 +1283,8 @@ bool KeyProcess::IsGetUniqueKeysEos(const EmbBaseInfo& info, std::chrono::_V2::s
         hybridMgmtBlock->h2dNextBatchId[info.name] == lookUpSwapInAddrsPushId[info.name] &&
         hybridMgmtBlock->h2dNextBatchId[info.name] >= allChannelBatchId) {
         LOG_INFO("table:{}, channelId:{} batchId:{}, GetUniqueKeys eos, h2dNextBatchId:{}, allChannelBatchId:{}",
-            info.name, info.channelId, info.batchId, hybridMgmtBlock->h2dNextBatchId[info.name], allChannelBatchId);
+            info.name, info.channelId, info.batchId, hybridMgmtBlock->h2dNextBatchId[info.name],
+            allChannelBatchId);
         return true;
     }
     LOG_TRACE("getting uniqueKeys failed, table:{}, channel:{}, mgmt batchId:{}, readEmbKey batchId:{}, list is empty",
-- 
Gitee


From 533bc2be9043d0e29fbf962a5a3f781cea3250f4 Mon Sep 17 00:00:00 2001
From: LiJiang
Date: Fri, 28 Jun 2024 14:31:16 +0800
Subject: [PATCH 247/302] =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=86=97=E4=BD=99?=
 =?UTF-8?q?=E4=BB=A3=E7=A0=81=EF=BC=9B=E4=BF=AE=E6=94=B9=E9=94=99=E8=AF=AF?=
 =?UTF-8?q?log?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp | 2 +-
 cust_op/fused_lazy_adam/op_host/lazy_adam.cpp                  | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp
index e9711379..3b9b51fe 100644
--- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp
+++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/src/op_runner.cpp
@@ -322,7 +322,7 @@ namespace AclnnLazyAdam {
             ERROR_LOG("Execute Operator failed.
error code is %d", static_cast(ret)); return false; } - INFO_LOG("Execute aclnnAddCustom success"); + INFO_LOG("Execute aclnnLazyAdam success"); ret = aclrtSynchronizeStreamWithTimeout(stream, STREAM_TIMEOUT); if (ret != SUCCESS) { diff --git a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp index fb7f86b3..2c288729 100644 --- a/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp +++ b/cust_op/fused_lazy_adam/op_host/lazy_adam.cpp @@ -54,8 +54,6 @@ static ge::graphStatus LazyAdamTilingFunc(gert::TilingContext* context) ge::DataType indicesDtype = context->GetInputDesc(1)->GetDataType(); int indicesDtypeSize = ge::GetSizeByDataType(indicesDtype); - tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); - context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); auto attrs = context->GetAttrs(); float beta1 = *attrs->GetAttrPointer(0); -- Gitee From 21e75227a44bbfabf1adf7c47a1352843a0e7276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Sat, 29 Jun 2024 15:24:55 +0800 Subject: [PATCH 248/302] =?UTF-8?q?=E3=80=90fix=E3=80=91stoi=E6=8A=9B?= =?UTF-8?q?=E5=87=BA=E5=BC=82=E5=B8=B8=E7=9A=84cleancode=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/utils/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/utils/common.cpp b/src/core/utils/common.cpp index 1b3edcfd..15aa69bb 100644 --- a/src/core/utils/common.cpp +++ b/src/core/utils/common.cpp @@ -182,7 +182,7 @@ namespace MxRec { try { res = stoi(match[1]); } catch (const std::invalid_argument& e) { - LOG_ERROR(e.what()); + LOG_ERROR("argument is invalid: {}", e.what()); } return res; } -- Gitee From 550a302c91620bbb771ac980ac564aa5c4f467a8 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 1 Jul 2024 15:14:44 +0800 Subject: [PATCH 249/302] fix: `StringFormat` use cases --- src/core/checkpoint/checkpoint.cpp | 142 ++-- src/core/utils/common.h | 1132 ++++++++++++++-------------- src/ops_tf/hybrid_dataset_ops.cpp | 4 +- 3 files changed, 653 insertions(+), 625 deletions(-) diff --git a/src/core/checkpoint/checkpoint.cpp b/src/core/checkpoint/checkpoint.cpp index abd3a10e..bc7501bb 100644 --- a/src/core/checkpoint/checkpoint.cpp +++ b/src/core/checkpoint/checkpoint.cpp @@ -13,21 +13,22 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include -#include -#include -#include +#include "checkpoint.h" + #include #include +#include +#include +#include + +#include #include "ckpt_data_handler/feat_admit_n_evict_ckpt/feat_admit_n_evict_ckpt.h" -#include "ckpt_data_handler/key_freq_map_ckpt/key_freq_map_ckpt.h" #include "ckpt_data_handler/key_count_map_ckpt/key_count_map_ckpt.h" -#include "utils/time_cost.h" -#include "utils/common.h" +#include "ckpt_data_handler/key_freq_map_ckpt/key_freq_map_ckpt.h" #include "file_system/file_system_handler.h" - -#include "checkpoint.h" +#include "utils/common.h" +#include "utils/time_cost.h" using namespace std; using namespace MxRec; @@ -89,11 +90,17 @@ void Checkpoint::SetDataHandler(CkptData& ckptData) void Checkpoint::SetDataHandler(const vector& featureTypes) { - map> setCkptMap{ - {CkptFeatureType::FEAT_ADMIT_N_EVICT, [this] { dataHandlers.push_back(make_unique()); }}, - {CkptFeatureType::DDR_KEY_FREQ_MAP, [this] { dataHandlers.push_back(make_unique()); }}, - {CkptFeatureType::KEY_COUNT_MAP, [this] { dataHandlers.push_back(make_unique()); }} - }; + map> setCkptMap{{CkptFeatureType::FEAT_ADMIT_N_EVICT, + [this] { + dataHandlers.push_back(make_unique()); + }}, + {CkptFeatureType::DDR_KEY_FREQ_MAP, + [this] { + dataHandlers.push_back(make_unique()); + }}, + {CkptFeatureType::KEY_COUNT_MAP, [this] { + dataHandlers.push_back(make_unique()); + }}}; for (const auto& featureType : featureTypes) { setCkptMap.at(featureType)(); @@ -104,8 +111,8 @@ void Checkpoint::SaveProcess(CkptData& ckptData) { for (const auto& dataHandler : dataHandlers) { dataHandler->SetProcessData(ckptData); - vector embNames { dataHandler->GetEmbNames() }; - vector saveDataTypes { dataHandler->GetDataTypes() }; + vector embNames{dataHandler->GetEmbNames()}; + vector saveDataTypes{dataHandler->GetDataTypes()}; MakeUpperLayerSaveDir(); MakeDataLayerSaveDir(embNames, saveDataTypes, dataHandler); SaveDataset(embNames, saveDataTypes, dataHandler); @@ -118,17 +125,16 @@ void Checkpoint::MakeUpperLayerSaveDir() MakeSaveDir(innerDirPath); } -void Checkpoint::MakeDataLayerSaveDir(const vector& embNames, - const vector& saveDataTypes, +void Checkpoint::MakeDataLayerSaveDir(const vector& embNames, const vector& saveDataTypes, const unique_ptr& dataHandler) { for (const auto& embName : embNames) { - auto dataDir { innerDirPath + dirSeparator + embName }; + auto dataDir{innerDirPath + dirSeparator + embName}; MakeSaveDir(dataDir); for (const auto& saveDataType : saveDataTypes) { - auto dataDirName { dataHandler->GetDataDirName(saveDataType) }; - auto datasetPath { dataDir + dirSeparator + dataDirName }; + auto dataDirName{dataHandler->GetDataDirName(saveDataType)}; + auto datasetPath{dataDir + dirSeparator + dataDirName}; MakeSaveDir(datasetPath); } } @@ -146,7 +152,7 @@ void Checkpoint::MakeSaveDir(const string& dirName) const Checkpoint::EmbSizeInfo Checkpoint::GetEmbeddingSize(const string& embName) { EmbSizeInfo embSizeInfo; - for (const auto &embInfo: mgmtEmbInfo) { + for (const auto& embInfo : mgmtEmbInfo) { if (embInfo.name == embName) { embSizeInfo.embSize = embInfo.embeddingSize; embSizeInfo.extEmbSize = embInfo.extEmbeddingSize; @@ -158,29 +164,28 @@ Checkpoint::EmbSizeInfo Checkpoint::GetEmbeddingSize(const string& embName) bool Checkpoint::CheckEmbNames(const string& embName) { - for (const auto &embInfo: mgmtEmbInfo) { - if (embInfo.name == embName && embInfo.isSave) { + for (const auto& embInfo : mgmtEmbInfo) { + if (embInfo.name == embName 
&& embInfo.isSave) { return true; } } return false; } -void Checkpoint::SaveDataset(const vector& embNames, - const vector& saveDataTypes, +void Checkpoint::SaveDataset(const vector& embNames, const vector& saveDataTypes, const unique_ptr& dataHandler) { - for (const auto& embName: embNames) { + for (const auto& embName : embNames) { if (!CheckEmbNames(embName)) { continue; } auto dataDir{innerDirPath + dirSeparator + embName}; - for (const auto& saveDataType: saveDataTypes) { - auto datasetPath { dataDir + dirSeparator + dataHandler->GetDataDirName(saveDataType) }; - auto datasetDir { datasetPath + dirSeparator + datasetName + to_string(rankId) + dataFileType }; + for (const auto& saveDataType : saveDataTypes) { + auto datasetPath{dataDir + dirSeparator + dataHandler->GetDataDirName(saveDataType)}; + auto datasetDir{datasetPath + dirSeparator + datasetName + to_string(rankId) + dataFileType}; LOG_DEBUG("====Start getting data from handler to: {}", datasetDir); - auto transData { dataHandler->GetDataset(saveDataType, embName) }; + auto transData{dataHandler->GetDataset(saveDataType, embName)}; LOG_DEBUG("====Start saving data to: {}", datasetDir); WriteStream(transData, datasetDir, transData.datasetSize, saveDataType); @@ -197,36 +202,36 @@ void Checkpoint::WriteStream(CkptTransData& transData, const string& dataDir, si ssize_t writeBytesNum; if (int32TransSet.find(dataType) != int32TransSet.end()) { - writeBytesNum = fileSystemPtr->Write(dataDir, - reinterpret_cast(transData.int32Arr.data()), dataSize); + writeBytesNum = + fileSystemPtr->Write(dataDir, reinterpret_cast(transData.int32Arr.data()), dataSize); } else if (int64TransSet.find(dataType) != int64TransSet.end()) { - writeBytesNum = fileSystemPtr->Write(dataDir, - reinterpret_cast(transData.int64Arr.data()), dataSize); + writeBytesNum = + fileSystemPtr->Write(dataDir, reinterpret_cast(transData.int64Arr.data()), dataSize); } else if (dataType == CkptDataType::ATTRIBUTE) { - writeBytesNum = fileSystemPtr->Write(dataDir, - reinterpret_cast(transData.attribute.data()), dataSize); + writeBytesNum = + fileSystemPtr->Write(dataDir, reinterpret_cast(transData.attribute.data()), dataSize); } else { throw runtime_error("unknown CkptDataType"); } if (writeBytesNum == -1) { - throw runtime_error(StringFormat("Error: Save data failed. data type: %d. " - "An error occurred while writing file: %s.", dataType, dataDir.c_str())); + throw runtime_error(StringFormat("Error: Save data failed. data type: %s. " + "An error occurred while writing file: %s.", + CkptDataTypeName(dataType).c_str(), dataDir.c_str())); } if (writeBytesNum != dataSize) { - throw runtime_error(StringFormat("Error: Save data failed. data type: %d ." + throw runtime_error(StringFormat("Error: Save data failed. data type: %s. 
" "Expected to write %d bytes, but actually write %d bytes to file %s.", - dataType, dataSize, writeBytesNum, dataDir.c_str())); + CkptDataTypeName(dataType).c_str(), dataSize, writeBytesNum, dataDir.c_str())); } } - void Checkpoint::LoadProcess(CkptData& ckptData) { for (const auto& dataHandler : dataHandlers) { - vector embNames {}; - vector dirNames { dataHandler->GetDirNames() }; - vector saveDataTypes { dataHandler->GetDataTypes() }; + vector embNames{}; + vector dirNames{dataHandler->GetDirNames()}; + vector saveDataTypes{dataHandler->GetDataTypes()}; innerDirPath = processPath; if (find(dirNames.begin(), dirNames.end(), ssdSymbol) != dirNames.end()) { embNames = GetTableLayerLoadDir(); @@ -238,7 +243,6 @@ void Checkpoint::LoadProcess(CkptData& ckptData) } } - vector Checkpoint::GetEmbedTableNames() { vector loadTableNames; @@ -262,22 +266,20 @@ vector Checkpoint::GetTableLayerLoadDir() return loadTableDir; } -void Checkpoint::LoadDataset(const vector& embNames, - const vector& saveDataTypes, - const unique_ptr& dataHandler, - CkptData& ckptData) +void Checkpoint::LoadDataset(const vector& embNames, const vector& saveDataTypes, + const unique_ptr& dataHandler, CkptData& ckptData) { for (const auto& embName : embNames) { - auto dataDir { innerDirPath + dirSeparator + embName }; + auto dataDir{innerDirPath + dirSeparator + embName}; for (const auto& saveDataType : saveDataTypes) { - auto datasetPath { dataDir + dirSeparator + dataHandler->GetDataDirName(saveDataType) }; + auto datasetPath{dataDir + dirSeparator + dataHandler->GetDataDirName(saveDataType)}; - auto datasetDir { datasetPath + dirSeparator + "slice" + dataFileType }; - auto attributeDir { datasetPath + dirSeparator + "slice" + attribFileType }; + auto datasetDir{datasetPath + dirSeparator + "slice" + dataFileType}; + auto attributeDir{datasetPath + dirSeparator + "slice" + attribFileType}; CkptTransData transData; LOG_DEBUG("====Start reading data from: {}", attributeDir); - auto dataElmtBytes { dataHandler->GetDataElmtBytes(CkptDataType::ATTRIBUTE) }; + auto dataElmtBytes{dataHandler->GetDataElmtBytes(CkptDataType::ATTRIBUTE)}; ReadStream(transData, attributeDir, CkptDataType::ATTRIBUTE, dataElmtBytes); dataElmtBytes = dataHandler->GetDataElmtBytes(saveDataType); @@ -290,7 +292,7 @@ void Checkpoint::LoadDataset(const vector& embNames, } LOG_DEBUG("====Start loading data from: {} to data handler.", attributeDir); - if ((saveDataType == CkptDataType::EMB_INFO)) { + if ((saveDataType == CkptDataType::EMB_INFO)) { dataHandler->SetDatasetForLoadEmb(saveDataType, embName, transData, ckptData); } else { dataHandler->SetDataset(saveDataType, embName, transData); @@ -299,14 +301,12 @@ void Checkpoint::LoadDataset(const vector& embNames, } } -void Checkpoint::ReadStream(CkptTransData& transData, - const string& dataDir, - CkptDataType dataType, +void Checkpoint::ReadStream(CkptTransData& transData, const string& dataDir, CkptDataType dataType, uint32_t dataElmtBytes) { if (dataElmtBytes == 0) { LOG_WARN("dataElmtBytes is 0, don't handle [/ %] operation"); - return ; + return; } if (fileSystemPtr == nullptr) { @@ -315,7 +315,7 @@ void Checkpoint::ReadStream(CkptTransData& transData, } size_t datasetSize = fileSystemPtr->GetFileSize(dataDir); - auto resizeSize { datasetSize / dataElmtBytes }; + auto resizeSize{datasetSize / dataElmtBytes}; SetTransDataSize(transData, resizeSize, dataType); if (datasetSize % dataElmtBytes > 0) { @@ -328,31 +328,29 @@ void Checkpoint::ReadStream(CkptTransData& transData, } else if 
(int64TransSet.find(dataType) != int64TransSet.end()) { readBytesNum = fileSystemPtr->Read(dataDir, reinterpret_cast(transData.int64Arr.data()), datasetSize); } else if (dataType == CkptDataType::ATTRIBUTE) { - readBytesNum = fileSystemPtr->Read(dataDir, reinterpret_cast(transData.attribute.data()), datasetSize); + readBytesNum = fileSystemPtr->Read(dataDir, reinterpret_cast(transData.attribute.data()), datasetSize); } else { throw runtime_error("unknown CkptDataType"); } if (readBytesNum == -1) { - throw runtime_error(StringFormat("Error: Load data failed. data type: %d ." - "An error occurred while reading file: %s.", dataType, dataDir.c_str())); + throw runtime_error(StringFormat("Error: Load data failed. data type: %s. " + "An error occurred while reading file: %s.", + CkptDataTypeName(dataType).c_str(), dataDir.c_str())); } if (readBytesNum != datasetSize) { - throw runtime_error(StringFormat("Error: Load data failed. data type: %d ." + throw runtime_error(StringFormat("Error: Load data failed. data type: %s. " "Expected to read %d bytes, but actually read %d bytes to file %s.", - dataType, datasetSize, readBytesNum, dataDir.c_str())); + CkptDataTypeName(dataType).c_str(), datasetSize, readBytesNum, dataDir.c_str())); } } -void Checkpoint::ReadStreamForEmbData(CkptTransData& transData, - const string& dataDir, - uint32_t dataElmtBytes, - CkptData& ckptData, - string embName) const +void Checkpoint::ReadStreamForEmbData(CkptTransData& transData, const string& dataDir, uint32_t dataElmtBytes, + CkptData& ckptData, string embName) const { if (dataElmtBytes == 0) { LOG_ERROR("dataElmtBytes is 0, don't handle [/ %] operation"); - return ; + return; } if (fileSystemPtr == nullptr) { diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 9a39e7ac..f8ff4565 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -17,608 +17,638 @@ See the License for the specific language governing permissions and #define COMMON_H #include -#include -#include -#include + +#include #include +#include +#include #include +#include #include -#include -#include -#include "tensorflow/core/framework/tensor.h" -#include "absl/container/flat_hash_map.h" -#include "securec.h" -#include "utils/logger.h" -#include "utils/config.h" +#include -#include "initializer/initializer.h" +#include "absl/container/flat_hash_map.h" #include "initializer/constant_initializer/constant_initializer.h" -#include "initializer/truncated_normal_initializer/truncated_normal_initializer.h" +#include "initializer/initializer.h" #include "initializer/random_normal_initializer/random_normal_initializer.h" -#include "ock_ctr_common/include/factory.h" +#include "initializer/truncated_normal_initializer/truncated_normal_initializer.h" #include "ock_ctr_common/include/embedding_cache.h" +#include "ock_ctr_common/include/factory.h" +#include "securec.h" +#include "tensorflow/core/framework/tensor.h" +#include "utils/config.h" +#include "utils/logger.h" #if defined(BUILD_WITH_EASY_PROFILER) - #include - #include +#include +#include #else - #define EASY_FUNCTION(...) - #define EASY_VALUE(...) - #define EASY_BLOCK(...) - #define EASY_END_BLOCK - #define EASY_PROFILER_ENABLE - #define EASY_PROFILER_DISABLE +#define EASY_FUNCTION(...) +#define EASY_VALUE(...) +#define EASY_BLOCK(...) 
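When BUILD_WITH_EASY_PROFILER is undefined, the EASY_* macros in this block expand to nothing, and because the preprocessor discards the arguments of an empty variadic macro, annotation arguments such as profiler::colors::Magenta never reach the compiler at all. A hypothetical call site illustrating the pattern (a sketch, not code from this repository):

    void LoadProcessSketch()
    {
        EASY_FUNCTION(profiler::colors::Magenta);  // whole-function scope; a no-op when profiling is off
        EASY_BLOCK("read-datasets");               // named timing block when the profiler is built in
        // ... checkpoint work ...
        EASY_END_BLOCK;
    }

Either way the call site stays identical, so profiling can be toggled purely at build time.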
+#define EASY_END_BLOCK +#define EASY_PROFILER_ENABLE +#define EASY_PROFILER_DISABLE #endif namespace MxRec { #define INFO_PTR shared_ptr #define MGMT_CPY_THREADS 4 #define PROFILING - using namespace tensorflow; - extern ock::ctr::FactoryPtr factory; - constexpr int TRAIN_CHANNEL_ID = 0; - constexpr int EVAL_CHANNEL_ID = 1; - - constexpr int MAX_CHANNEL_NUM = 2; - constexpr int MAX_KEY_PROCESS_THREAD = 10; - constexpr int MAX_QUEUE_NUM = MAX_CHANNEL_NUM * MAX_KEY_PROCESS_THREAD; - constexpr int DEFAULT_KEY_PROCESS_THREAD = 6; - constexpr int KEY_PROCESS_THREAD = 6; - constexpr char SUM_SAME_ID[] = "sum_same_id_gradients_and_apply"; - constexpr size_t MAX_VOCABULARY_SIZE = 1e10; - constexpr int SSD_SIZE_INDEX = 2; - constexpr int MAX_FILE_NUM = 1000; - constexpr int EMBEDDING_THREAD_NUM = 2; - // for GLOG - struct GlogConfig { - static bool gStatOn; - static int gGlogLevel; - static string gRankId; - }; - - constexpr int GLOG_MAX_BUF_SIZE = 1024; - constexpr int GLOG_TIME_WIDTH_2 = 2; - constexpr int GLOG_TIME_WIDTH_6 = 6; - constexpr char GLOG_STAT_FLAG[] = "statOn"; - - // unique related config - constexpr int UNIQUE_BUCKET = 6; - constexpr int MIN_UNIQUE_THREAD_NUM = 1; - - // validate file - constexpr long long FILE_MAX_SIZE = 1LL << 40; - constexpr int FILE_MIN_SIZE = 0; - constexpr size_t BUFFER_SIZE{1024 * 1024 * 64}; - constexpr size_t MAP_BYTE_SIZE{static_cast(10) * 1024 * 1024 * 1024}; +using namespace tensorflow; +extern ock::ctr::FactoryPtr factory; +constexpr int TRAIN_CHANNEL_ID = 0; +constexpr int EVAL_CHANNEL_ID = 1; + +constexpr int MAX_CHANNEL_NUM = 2; +constexpr int MAX_KEY_PROCESS_THREAD = 10; +constexpr int MAX_QUEUE_NUM = MAX_CHANNEL_NUM * MAX_KEY_PROCESS_THREAD; +constexpr int DEFAULT_KEY_PROCESS_THREAD = 6; +constexpr int KEY_PROCESS_THREAD = 6; +constexpr char SUM_SAME_ID[] = "sum_same_id_gradients_and_apply"; +constexpr size_t MAX_VOCABULARY_SIZE = 1e10; +constexpr int SSD_SIZE_INDEX = 2; +constexpr int MAX_FILE_NUM = 1000; +constexpr int EMBEDDING_THREAD_NUM = 2; +// for GLOG +struct GlogConfig { + static bool gStatOn; + static int gGlogLevel; + static string gRankId; +}; + +constexpr int GLOG_MAX_BUF_SIZE = 1024; +constexpr int GLOG_TIME_WIDTH_2 = 2; +constexpr int GLOG_TIME_WIDTH_6 = 6; +constexpr char GLOG_STAT_FLAG[] = "statOn"; + +// unique related config +constexpr int UNIQUE_BUCKET = 6; +constexpr int MIN_UNIQUE_THREAD_NUM = 1; + +// validate file +constexpr long long FILE_MAX_SIZE = 1LL << 40; +constexpr int FILE_MIN_SIZE = 0; +constexpr size_t BUFFER_SIZE{1024 * 1024 * 64}; +constexpr size_t MAP_BYTE_SIZE{static_cast(10) * 1024 * 1024 * 1024}; #ifdef GTEST - constexpr int KEY_PROCESS_TIMEOUT = 3; +constexpr int KEY_PROCESS_TIMEOUT = 3; #else - constexpr int KEY_PROCESS_TIMEOUT = 120; +constexpr int KEY_PROCESS_TIMEOUT = 120; #endif - constexpr int GET_BATCH_TIMEOUT = 300; - constexpr int EOS_TIMEOUT = 30; - - constexpr size_t DEFAULT_RANDOM_SEED = 10086; - constexpr int64_t INVALID_KEY_VALUE = -1; - constexpr int32_t INVALID_INDEX_VALUE = -1; - constexpr int ALLTOALLVC_ALIGN = 128; - constexpr int PROFILING_START_BATCH_ID = 100; - constexpr int PROFILING_END_BATCH_ID = 200; - constexpr int MGMT_THREAD_BIND = 48; - constexpr int UNIQUE_MAX_BUCKET_WIDTH = 6; - constexpr int HOT_EMB_UPDATE_STEP_DEFAULT = 1000; - constexpr float HOT_EMB_CACHE_PCT = static_cast(1. 
/ 3); // hot emb cache percent - - const string COMBINE_HISTORY_NAME = "combine_table_history"; - const string SAVE_SPARSE_PATH_PREFIX = "sparse"; - - using emb_key_t = int64_t; - using emb_cache_key_t = uint64_t; - using freq_num_t = int64_t; - using EmbNameT= std::string; - using KeysT = std::vector; - using LookupKeyT = std::tuple; // batch_id quarry_lable keys_vector - using UinqueKeyT = std::tuple>; - using RestoreVecSecT = std::tuple>; - using TensorInfoT = std::tuple>>::iterator>; - - namespace HybridOption { - const unsigned int USE_STATIC = 0x001; - const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 1; - const unsigned int USE_SUM_SAME_ID_GRADIENTS = 0x001 << 2; - }; - - string GetChipName(int devID); - int GetThreadNumEnv(); - - namespace UBSize { - const int ASCEND910_PREMIUM_A = 262144; - const int ASCEND910_PRO_B = 262144; - const int ASCEND910_B2 = 196608; - const int ASCEND910_B1 = 196608; - const int ASCEND910_B3 = 196608; - const int ASCEND910_B4 = 196608; - const int ASCEND910_C1 = 196608; - const int ASCEND910_C2 = 196608; - const int ASCEND910_C3 = 196608; - const int ASCEND920_A = 196608; - const int ASCEND910_PRO_A = 262144; - const int ASCEND910_B = 262144; - const int ASCEND910_A = 262144; - const int ASCEND910_B2C = 196608; - }; - - inline int GetUBSize(int devID) - { - const std::map chipUbSizeList = {{"910A", UBSize::ASCEND910_A}, - {"910B", UBSize::ASCEND910_B}, - {"920A", UBSize::ASCEND920_A}, - {"910B1", UBSize::ASCEND910_B1}, - {"910B2", UBSize::ASCEND910_B2}, - {"910B3", UBSize::ASCEND910_B3}, - {"910B4", UBSize::ASCEND910_B4}, - {"910B2C", UBSize::ASCEND910_B2C}, - {"910C1", UBSize::ASCEND910_C1}, - {"910C2", UBSize::ASCEND910_C1}, - {"910C3", UBSize::ASCEND910_C3} - }; - auto it = chipUbSizeList.find(GetChipName(devID)); - if (it != chipUbSizeList.end()) { - return it->second; - } - - throw std::runtime_error("unknown chip ub size" + GetChipName(devID)); +constexpr int GET_BATCH_TIMEOUT = 300; +constexpr int EOS_TIMEOUT = 30; + +constexpr size_t DEFAULT_RANDOM_SEED = 10086; +constexpr int64_t INVALID_KEY_VALUE = -1; +constexpr int32_t INVALID_INDEX_VALUE = -1; +constexpr int ALLTOALLVC_ALIGN = 128; +constexpr int PROFILING_START_BATCH_ID = 100; +constexpr int PROFILING_END_BATCH_ID = 200; +constexpr int MGMT_THREAD_BIND = 48; +constexpr int UNIQUE_MAX_BUCKET_WIDTH = 6; +constexpr int HOT_EMB_UPDATE_STEP_DEFAULT = 1000; +constexpr float HOT_EMB_CACHE_PCT = static_cast(1. 
/ 3); // hot emb cache percent + +const string COMBINE_HISTORY_NAME = "combine_table_history"; +const string SAVE_SPARSE_PATH_PREFIX = "sparse"; + +using emb_key_t = int64_t; +using emb_cache_key_t = uint64_t; +using freq_num_t = int64_t; +using EmbNameT = std::string; +using KeysT = std::vector; +using LookupKeyT = std::tuple; // batch_id quarry_lable keys_vector +using UinqueKeyT = std::tuple>; +using RestoreVecSecT = std::tuple>; +using TensorInfoT = std::tuple>>::iterator>; + +namespace HybridOption { +const unsigned int USE_STATIC = 0x001; +const unsigned int USE_DYNAMIC_EXPANSION = 0x001 << 1; +const unsigned int USE_SUM_SAME_ID_GRADIENTS = 0x001 << 2; +}; // namespace HybridOption + +string GetChipName(int devID); +int GetThreadNumEnv(); + +namespace UBSize { +const int ASCEND910_PREMIUM_A = 262144; +const int ASCEND910_PRO_B = 262144; +const int ASCEND910_B2 = 196608; +const int ASCEND910_B1 = 196608; +const int ASCEND910_B3 = 196608; +const int ASCEND910_B4 = 196608; +const int ASCEND910_C1 = 196608; +const int ASCEND910_C2 = 196608; +const int ASCEND910_C3 = 196608; +const int ASCEND920_A = 196608; +const int ASCEND910_PRO_A = 262144; +const int ASCEND910_B = 262144; +const int ASCEND910_A = 262144; +const int ASCEND910_B2C = 196608; +}; // namespace UBSize + +inline int GetUBSize(int devID) +{ + const std::map chipUbSizeList = { + {"910A", UBSize::ASCEND910_A}, {"910B", UBSize::ASCEND910_B}, {"920A", UBSize::ASCEND920_A}, + {"910B1", UBSize::ASCEND910_B1}, {"910B2", UBSize::ASCEND910_B2}, {"910B3", UBSize::ASCEND910_B3}, + {"910B4", UBSize::ASCEND910_B4}, {"910B2C", UBSize::ASCEND910_B2C}, {"910C1", UBSize::ASCEND910_C1}, + {"910C2", UBSize::ASCEND910_C1}, {"910C3", UBSize::ASCEND910_C3}}; + auto it = chipUbSizeList.find(GetChipName(devID)); + if (it != chipUbSizeList.end()) { + return it->second; } - template - struct Batch { - size_t Size() const - { - return sample.size(); - } + throw std::runtime_error("unknown chip ub size" + GetChipName(devID)); +} - std::string UnParse() const - { - std::string s; - constexpr size_t maxDispLen = 20; - int maxLen = static_cast(std::min(sample.size(), maxDispLen)); - for (int i = 0; i < maxLen; i++) { - s += std::to_string(sample[i]) + " "; - } - return s; - } - - std::vector sample; - std::string name; - size_t batchSize; - int batchId; - int channel = 0; - time_t timestamp { -1 }; - }; - - struct BatchTask { - vector splits; - vector embNames; - size_t batchSize; - int batchQueueId; - int batchId; - int channelId; - time_t timestamp { -1 }; - const void *tensor; - }; - - using EmbBatchT = Batch; - using BatchTaskT = BatchTask; - - struct DDRParam { - vector tmpDataOut; - vector offsetsOut; - DDRParam(vector tmpData, vector offset) - { - tmpDataOut = tmpData; - offsetsOut = offset; - } - }; - - struct RankInfo { - RankInfo() = default; - - RankInfo(int rankId, int deviceId, int localRankSize, int option, const std::vector& ctrlSteps); - RankInfo(int localRankSize, int option, const std::vector& maxStep); - - int rankId {}; - int deviceId {}; - int rankSize {}; - int localRankId {}; - int localRankSize {}; - bool useStatic { false }; - uint32_t option {}; - bool isDDR { false }; - bool isSSDEnabled { false }; - bool useDynamicExpansion {false}; - bool useSumSameIdGradients {true}; - std::vector ctrlSteps; // 包含4个步数: train_steps, eval_steps, save_steps, max_train_steps - }; - - struct EmbBaseInfo { - int batchId; - int channelId; - string name; - }; - - enum TensorIndex : uint32_t { - TENSOR_INDEX_0, - TENSOR_INDEX_1, - TENSOR_INDEX_2, - 
TENSOR_INDEX_3, - TENSOR_INDEX_4, - TENSOR_INDEX_5, - TENSOR_INDEX_6, - TENSOR_INDEX_7, - TENSOR_INDEX_8 - }; - - enum TupleIndex : uint32_t { - TUPLE_INDEX_0 = 0, - TUPLE_INDEX_1, - TUPLE_INDEX_2, - TUPLE_INDEX_3, - TUPLE_INDEX_4, - TUPLE_INDEX_5, - TUPLE_INDEX_6, - TUPLE_INDEX_7 - }; - - struct RandomInfo { - RandomInfo() = default; - - RandomInfo(int start, int len, float constantVal, float randomMin, float randomMax); - - int start; - int len; - float constantVal; - float randomMin; - float randomMax; - }; - - struct EmbeddingSizeInfo { - size_t embeddingSize = 0; - size_t extendEmbSize = 0; - EmbeddingSizeInfo() = default; - EmbeddingSizeInfo(size_t embSize, size_t extendSize) - : embeddingSize(embSize), extendEmbSize(extendSize) {} - }; - - struct OptimizerInfo { - OptimizerInfo() = default; - OptimizerInfo(std::string name, vector params) - { - optimName = name; - optimParams = std::move(params); - } - - std::string optimName; - vector optimParams; - }; - - struct ThresholdValue { - ThresholdValue() = default; - ThresholdValue(EmbNameT name, int countThre, int timeThre, int faaeCoef, bool isSum) - { - tableName = name; - countThreshold = countThre; - timeThreshold = timeThre; - faaeCoefficient = faaeCoef; - isEnableSum = isSum; - } - - EmbNameT tableName { "" }; // embName - int countThreshold { -1 }; // 只配置count,即“只有准入、而没有淘汰”功能,对应SingleHostEmbTableStatus::SETS_ONLY_ADMIT状态 - int timeThreshold { -1 }; // 只配置time,配置错误;即准入是淘汰的前提,对应SingleHostEmbTableStatus::SETS_BOTH状态 - int faaeCoefficient { 1 }; // 配置后,该表在准入时,count计数会乘以该系数 - bool isEnableSum {true}; // 配置false,该表在准入时,count计数不会累加 - }; - - struct FeatureItemInfo { - FeatureItemInfo() = default; - FeatureItemInfo(uint32_t cnt, time_t lastT) - : count(cnt), lastTime(lastT) - {} - - uint32_t count { 0 }; - time_t lastTime { 0 }; - }; - - using HistoryRecords = absl::flat_hash_map>; - struct AdmitAndEvictData { - HistoryRecords historyRecords; // embName ---> {id, FeatureItemInfo} 映射 - absl::flat_hash_map timestamps; // 用于特征准入&淘汰的时间戳 - }; - - void SetLog(int rank); - - template - string StringFormat(const string& format, Args ... args) +template +struct Batch { + size_t Size() const { - auto size = static_cast(GLOG_MAX_BUF_SIZE); - auto buf = std::make_unique(size); - memset_s(buf.get(), size, 0, size); - int nChar = snprintf_s(buf.get(), size, size - 1, format.c_str(), args ...); - if (nChar == -1) { - throw invalid_argument("StringFormat failed"); - } - return string(buf.get(), buf.get() + nChar); + return sample.size(); } - // use environment variable GLOG_v to decide if showing debug log. - // default 0, debug message will not display. 
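The comments here describe GLOG_v's contract: unset or 0 hides debug output, while, as the following line notes, 1 enables debug and 2 enables trace. The actual gating lives in utils/logger.h, which this diff does not show; a minimal sketch of how such an environment-driven check can be implemented, with illustrative names:

    #include <cstdlib>

    inline int GlogVerbositySketch()
    {
        // Read GLOG_v once; unset means 0, so debug logging stays silent by default.
        static const int level = [] {
            const char* v = std::getenv("GLOG_v");
            return (v != nullptr) ? std::atoi(v) : 0;
        }();
        return level;  // 1 enables debug logging, 2 additionally enables trace
    }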
- // 1 for debug, 2 for trace - constexpr int GLOG_DEBUG = 1; - constexpr int GLOG_TRACE = 2; - - template - std::string VectorToString(const std::vector& vec) + std::string UnParse() const { - constexpr size_t maxDispLen = 20; // max display number - int maxLen = static_cast(std::min(vec.size(), maxDispLen)); - - std::stringstream ss; - ss << "["; - for (size_t i = 0; i < maxLen; ++i) { - ss << vec[i]; - if (i != vec.size() - 1) { - ss << ", "; - } + std::string s; + constexpr size_t maxDispLen = 20; + int maxLen = static_cast(std::min(sample.size(), maxDispLen)); + for (int i = 0; i < maxLen; i++) { + s += std::to_string(sample[i]) + " "; } - ss << "]"; - return ss.str(); + return s; } - std::string FloatPtrToLimitStr(float* ptr, const size_t& prtSize); - - template - std::string MapToString(const std::map& map) + std::vector sample; + std::string name; + size_t batchSize; + int batchId; + int channel = 0; + time_t timestamp{-1}; +}; + +struct BatchTask { + vector splits; + vector embNames; + size_t batchSize; + int batchQueueId; + int batchId; + int channelId; + time_t timestamp{-1}; + const void* tensor; +}; + +using EmbBatchT = Batch; +using BatchTaskT = BatchTask; + +struct DDRParam { + vector tmpDataOut; + vector offsetsOut; + DDRParam(vector tmpData, vector offset) { - std::stringstream ss; - ss << "{"; - for (auto it = map.begin(); it != map.end(); ++it) { - ss << it->first << ": " << it->second; - if (std::next(it) != map.end()) { - ss << ", "; - } - } - ss << "}"; - return ss.str(); + tmpDataOut = tmpData; + offsetsOut = offset; } - - template - std::string MapToString(const absl::flat_hash_map& map) +}; + +struct RankInfo { + RankInfo() = default; + + RankInfo(int rankId, int deviceId, int localRankSize, int option, const std::vector& ctrlSteps); + RankInfo(int localRankSize, int option, const std::vector& maxStep); + + int rankId{}; + int deviceId{}; + int rankSize{}; + int localRankId{}; + int localRankSize{}; + bool useStatic{false}; + uint32_t option{}; + bool isDDR{false}; + bool isSSDEnabled{false}; + bool useDynamicExpansion{false}; + bool useSumSameIdGradients{true}; + std::vector ctrlSteps; // 包含4个步数: train_steps, eval_steps, save_steps, max_train_steps +}; + +struct EmbBaseInfo { + int batchId; + int channelId; + string name; +}; + +enum TensorIndex : uint32_t { + TENSOR_INDEX_0, + TENSOR_INDEX_1, + TENSOR_INDEX_2, + TENSOR_INDEX_3, + TENSOR_INDEX_4, + TENSOR_INDEX_5, + TENSOR_INDEX_6, + TENSOR_INDEX_7, + TENSOR_INDEX_8 +}; + +enum TupleIndex : uint32_t { + TUPLE_INDEX_0 = 0, + TUPLE_INDEX_1, + TUPLE_INDEX_2, + TUPLE_INDEX_3, + TUPLE_INDEX_4, + TUPLE_INDEX_5, + TUPLE_INDEX_6, + TUPLE_INDEX_7 +}; + +struct RandomInfo { + RandomInfo() = default; + + RandomInfo(int start, int len, float constantVal, float randomMin, float randomMax); + + int start; + int len; + float constantVal; + float randomMin; + float randomMax; +}; + +struct EmbeddingSizeInfo { + size_t embeddingSize = 0; + size_t extendEmbSize = 0; + EmbeddingSizeInfo() = default; + EmbeddingSizeInfo(size_t embSize, size_t extendSize) : embeddingSize(embSize), extendEmbSize(extendSize) {} +}; + +struct OptimizerInfo { + OptimizerInfo() = default; + OptimizerInfo(std::string name, vector params) { - std::stringstream ss; - ss << "{"; - for (auto it = map.begin(); it != map.end(); ++it) { - ss << it->first << ": " << it->second; - if (std::next(it) != map.end()) { - ss << ", "; - } - } - ss << "}"; - return ss.str(); + optimName = name; + optimParams = std::move(params); } - void ValidateReadFile(const string& 
dataDir, size_t datasetSize); + std::string optimName; + vector optimParams; +}; - template - inline Tensor Vec2TensorI32(const std::vector& data) +struct ThresholdValue { + ThresholdValue() = default; + ThresholdValue(EmbNameT name, int countThre, int timeThre, int faaeCoef, bool isSum) { - Tensor tmpTensor(tensorflow::DT_INT32, { static_cast(data.size()) }); - auto tmpData = tmpTensor.flat(); - for (int j = 0; j < static_cast(data.size()); j++) { - tmpData(j) = static_cast(data[j]); - } - return tmpTensor; + tableName = name; + countThreshold = countThre; + timeThreshold = timeThre; + faaeCoefficient = faaeCoef; + isEnableSum = isSum; } - template - inline Tensor Vec2TensorI64(const std::vector& data) - { - Tensor tmpTensor(tensorflow::DT_INT64, { static_cast(data.size()) }); - auto tmpData = tmpTensor.flat(); - for (int j = 0; j < static_cast(data.size()); j++) { - tmpData(j) = static_cast(data[j]); + EmbNameT tableName{""}; // embName + int countThreshold{ + -1}; // 只配置count,即“只有准入、而没有淘汰”功能,对应SingleHostEmbTableStatus::SETS_ONLY_ADMIT状态 + int timeThreshold{-1}; // 只配置time,配置错误;即准入是淘汰的前提,对应SingleHostEmbTableStatus::SETS_BOTH状态 + int faaeCoefficient{1}; // 配置后,该表在准入时,count计数会乘以该系数 + bool isEnableSum{true}; // 配置false,该表在准入时,count计数不会累加 +}; + +struct FeatureItemInfo { + FeatureItemInfo() = default; + FeatureItemInfo(uint32_t cnt, time_t lastT) : count(cnt), lastTime(lastT) {} + + uint32_t count{0}; + time_t lastTime{0}; +}; + +using HistoryRecords = absl::flat_hash_map>; +struct AdmitAndEvictData { + HistoryRecords historyRecords; // embName ---> {id, FeatureItemInfo} 映射 + absl::flat_hash_map timestamps; // 用于特征准入&淘汰的时间戳 +}; + +void SetLog(int rank); + +template +string StringFormat(const string& format, Args... args) +{ + auto size = static_cast(GLOG_MAX_BUF_SIZE); + auto buf = std::make_unique(size); + memset_s(buf.get(), size, 0, size); + int nChar = snprintf_s(buf.get(), size, size - 1, format.c_str(), args...); + if (nChar == -1) { + throw invalid_argument("StringFormat failed"); + } + return string(buf.get(), buf.get() + nChar); +} + +// use environment variable GLOG_v to decide if showing debug log. +// default 0, debug message will not display. 
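StringFormat above renders printf-style messages into a fixed GLOG_MAX_BUF_SIZE (1024-byte) buffer through securec's snprintf_s and throws when formatting fails, so longer messages are clamped. A portable sketch of the same idea using std::snprintf, for readers without the securec dependency (illustrative only; arguments must be printf-compatible scalar types, exactly as with the original):

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    template <typename... Args>
    std::string StringFormatSketch(const std::string& format, Args... args)
    {
        char buf[1024] = {0};  // mirrors GLOG_MAX_BUF_SIZE
        int n = std::snprintf(buf, sizeof(buf), format.c_str(), args...);
        if (n < 0) {
            throw std::invalid_argument("StringFormatSketch failed");
        }
        // snprintf reports the length the full message would have had; clamp on truncation.
        size_t len = (n < static_cast<int>(sizeof(buf))) ? static_cast<size_t>(n) : sizeof(buf) - 1;
        return std::string(buf, buf + len);
    }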
+// 1 for debug, 2 for trace +constexpr int GLOG_DEBUG = 1; +constexpr int GLOG_TRACE = 2; + +template +std::string VectorToString(const std::vector& vec) +{ + constexpr size_t maxDispLen = 20; // max display number + int maxLen = static_cast(std::min(vec.size(), maxDispLen)); + + std::stringstream ss; + ss << "["; + for (size_t i = 0; i < maxLen; ++i) { + ss << vec[i]; + if (i != vec.size() - 1) { + ss << ", "; } - return tmpTensor; } - - struct EmbInfoParams { - std::string name; - int sendCount; - int embeddingSize; - int extEmbeddingSize; - bool isSave; - bool isGrad; - EmbInfoParams() = default; - - EmbInfoParams(const std::string& name, - int sendCount, - int embeddingSize, - int extEmbeddingSize, - bool isSave, - bool isGrad) - : name(name), - sendCount(sendCount), - embeddingSize(embeddingSize), - extEmbeddingSize(extEmbeddingSize), - isSave(isSave), - isGrad(isGrad) - { + ss << "]"; + return ss.str(); +} + +std::string FloatPtrToLimitStr(float* ptr, const size_t& prtSize); + +template +std::string MapToString(const std::map& map) +{ + std::stringstream ss; + ss << "{"; + for (auto it = map.begin(); it != map.end(); ++it) { + ss << it->first << ": " << it->second; + if (std::next(it) != map.end()) { + ss << ", "; } - }; - - struct EmbInfo { - EmbInfo() = default; - - EmbInfo(const EmbInfoParams& embInfoParams, - std::vector vocabsize, - std::vector initializeInfos, - std::vector ssdDataPath) - : name(embInfoParams.name), - sendCount(embInfoParams.sendCount), - embeddingSize(embInfoParams.embeddingSize), - extEmbeddingSize(embInfoParams.extEmbeddingSize), - isSave(embInfoParams.isSave), - isGrad(embInfoParams.isGrad), - devVocabSize(vocabsize[0]), - hostVocabSize(vocabsize[1]), - ssdVocabSize(vocabsize[SSD_SIZE_INDEX]), - initializeInfos(std::move(initializeInfos)), - ssdDataPath(std::move(ssdDataPath)) - { + } + ss << "}"; + return ss.str(); +} + +template +std::string MapToString(const absl::flat_hash_map& map) +{ + std::stringstream ss; + ss << "{"; + for (auto it = map.begin(); it != map.end(); ++it) { + ss << it->first << ": " << it->second; + if (std::next(it) != map.end()) { + ss << ", "; } + } + ss << "}"; + return ss.str(); +} + +void ValidateReadFile(const string& dataDir, size_t datasetSize); + +template +inline Tensor Vec2TensorI32(const std::vector& data) +{ + Tensor tmpTensor(tensorflow::DT_INT32, {static_cast(data.size())}); + auto tmpData = tmpTensor.flat(); + for (int j = 0; j < static_cast(data.size()); j++) { + tmpData(j) = static_cast(data[j]); + } + return tmpTensor; +} + +template +inline Tensor Vec2TensorI64(const std::vector& data) +{ + Tensor tmpTensor(tensorflow::DT_INT64, {static_cast(data.size())}); + auto tmpData = tmpTensor.flat(); + for (int j = 0; j < static_cast(data.size()); j++) { + tmpData(j) = static_cast(data[j]); + } + return tmpTensor; +} + +struct EmbInfoParams { + std::string name; + int sendCount; + int embeddingSize; + int extEmbeddingSize; + bool isSave; + bool isGrad; + EmbInfoParams() = default; + + EmbInfoParams(const std::string& name, int sendCount, int embeddingSize, int extEmbeddingSize, bool isSave, + bool isGrad) + : name(name), + sendCount(sendCount), + embeddingSize(embeddingSize), + extEmbeddingSize(extEmbeddingSize), + isSave(isSave), + isGrad(isGrad) + { + } +}; + +struct EmbInfo { + EmbInfo() = default; + + EmbInfo(const EmbInfoParams& embInfoParams, std::vector vocabsize, + std::vector initializeInfos, std::vector ssdDataPath) + : name(embInfoParams.name), + sendCount(embInfoParams.sendCount), + 
embeddingSize(embInfoParams.embeddingSize), + extEmbeddingSize(embInfoParams.extEmbeddingSize), + isSave(embInfoParams.isSave), + isGrad(embInfoParams.isGrad), + devVocabSize(vocabsize[0]), + hostVocabSize(vocabsize[1]), + ssdVocabSize(vocabsize[SSD_SIZE_INDEX]), + initializeInfos(std::move(initializeInfos)), + ssdDataPath(std::move(ssdDataPath)) + { + } - std::string name; - int sendCount; - int embeddingSize; - int extEmbeddingSize; - bool isSave; - bool isGrad; - size_t devVocabSize; - size_t hostVocabSize; - size_t ssdVocabSize; - std::vector initializeInfos; - std::vector ssdDataPath; - }; - - struct HostEmbTable { - EmbInfo hostEmbInfo; - std::vector> embData; - }; - - struct All2AllInfo { - KeysT keyRecv; - vector scAll; - vector countRecv; - All2AllInfo() = default; - All2AllInfo(KeysT keyRecv, vector scAll, vector countRecv) - : keyRecv(keyRecv), scAll(scAll), countRecv(countRecv) {} - }; - - struct UniqueInfo { - vector restore; - vector hotPos; - All2AllInfo all2AllInfo; - UniqueInfo() = default; - UniqueInfo(vector restore, vector hotPos, All2AllInfo all2AllInfo) - : restore(restore), hotPos(hotPos), all2AllInfo(all2AllInfo) {} - }; - - struct KeySendInfo { - KeysT keySend; - vector keyCount; - }; - - using EmbMemT = absl::flat_hash_map; - using OffsetMemT = std::map; - using KeyOffsetMemT = std::map>; - using KeyCountMemT = std::map>; - using Table2ThreshMemT = absl::flat_hash_map; - using trans_serialize_t = uint8_t; - using OffsetMapT = std::map>; - using OffsetT = std::vector; - using AllKeyOffsetMapT = std::map>; - using KeyFreqMemT = unordered_map>; - using EmbLocalTableT = EmbCache::EmbCacheManager; - - enum class CkptFeatureType { - HOST_EMB = 0, - EMB_HASHMAP = 1, - MAX_OFFSET = 2, - KEY_OFFSET_MAP = 3, - FEAT_ADMIT_N_EVICT = 4, - DDR_KEY_FREQ_MAP = 5, - EXCLUDE_DDR_KEY_FREQ_MAP = 6, - KEY_COUNT_MAP = 7, - EMB_LOCAL_TABLE = 8 - }; - - struct CkptData { - EmbMemT* hostEmbs = nullptr; - OffsetMemT maxOffset; - KeyOffsetMemT keyOffsetMap; - OffsetMapT offsetMap; - OffsetMapT* offsetMapPtr = &offsetMap; - KeyCountMemT keyCountMap; - Table2ThreshMemT table2Thresh; - AdmitAndEvictData histRec; - KeyFreqMemT ddrKeyFreqMaps; - KeyFreqMemT excludeDDRKeyFreqMaps; - }; - - struct CkptTransData { - std::vector int64Arr; - std::vector addressArr; - std::vector int32Arr; - std::vector transDataset; // may all use this to transfer data - std::vector attribute; // may need to use other form for attributes - size_t datasetSize; - size_t attributeSize; - }; - - enum class CkptDataType { - EMB_INFO = 0, - EMB_DATA = 1, - EMB_HASHMAP = 2, - DEV_OFFSET = 3, - EMB_CURR_STAT = 4, - NDDR_OFFSET = 5, - NDDR_FEATMAP = 6, - TABLE_2_THRESH = 7, - HIST_REC = 8, - ATTRIBUTE = 9, - DDR_FREQ_MAP = 10, - EXCLUDE_FREQ_MAP = 11, - EVICT_POS = 12, - KEY_COUNT_MAP = 13 - }; - - enum CTRLogLevel { // can't use enum class due to compatibility for AccCTR - DEBUG = 0, - INFO, - WARN, - ERROR, - }; - - static void CTRLog(int level, const char *msg) + std::string name; + int sendCount; + int embeddingSize; + int extEmbeddingSize; + bool isSave; + bool isGrad; + size_t devVocabSize; + size_t hostVocabSize; + size_t ssdVocabSize; + std::vector initializeInfos; + std::vector ssdDataPath; +}; + +struct HostEmbTable { + EmbInfo hostEmbInfo; + std::vector> embData; +}; + +struct All2AllInfo { + KeysT keyRecv; + vector scAll; + vector countRecv; + All2AllInfo() = default; + All2AllInfo(KeysT keyRecv, vector scAll, vector countRecv) + : keyRecv(keyRecv), + scAll(scAll), + countRecv(countRecv) { - switch (level) { - 
case CTRLogLevel::DEBUG: - LOG_DEBUG(msg); - break; - case CTRLogLevel::INFO: - LOG_INFO(msg); - break; - case CTRLogLevel::WARN: - LOG_WARN(msg); - break; - case CTRLogLevel::ERROR: - LOG_ERROR(msg); - break; - default: - break; - } } +}; + +struct UniqueInfo { + vector restore; + vector hotPos; + All2AllInfo all2AllInfo; + UniqueInfo() = default; + UniqueInfo(vector restore, vector hotPos, All2AllInfo all2AllInfo) + : restore(restore), + hotPos(hotPos), + all2AllInfo(all2AllInfo) + { + } +}; + +struct KeySendInfo { + KeysT keySend; + vector keyCount; +}; + +using EmbMemT = absl::flat_hash_map; +using OffsetMemT = std::map; +using KeyOffsetMemT = std::map>; +using KeyCountMemT = std::map>; +using Table2ThreshMemT = absl::flat_hash_map; +using trans_serialize_t = uint8_t; +using OffsetMapT = std::map>; +using OffsetT = std::vector; +using AllKeyOffsetMapT = std::map>; +using KeyFreqMemT = unordered_map>; +using EmbLocalTableT = EmbCache::EmbCacheManager; + +enum class CkptFeatureType { + HOST_EMB = 0, + EMB_HASHMAP = 1, + MAX_OFFSET = 2, + KEY_OFFSET_MAP = 3, + FEAT_ADMIT_N_EVICT = 4, + DDR_KEY_FREQ_MAP = 5, + EXCLUDE_DDR_KEY_FREQ_MAP = 6, + KEY_COUNT_MAP = 7, + EMB_LOCAL_TABLE = 8 +}; + +struct CkptData { + EmbMemT* hostEmbs = nullptr; + OffsetMemT maxOffset; + KeyOffsetMemT keyOffsetMap; + OffsetMapT offsetMap; + OffsetMapT* offsetMapPtr = &offsetMap; + KeyCountMemT keyCountMap; + Table2ThreshMemT table2Thresh; + AdmitAndEvictData histRec; + KeyFreqMemT ddrKeyFreqMaps; + KeyFreqMemT excludeDDRKeyFreqMaps; +}; + +struct CkptTransData { + std::vector int64Arr; + std::vector addressArr; + std::vector int32Arr; + std::vector transDataset; // may all use this to transfer data + std::vector attribute; // may need to use other form for attributes + size_t datasetSize; + size_t attributeSize; +}; + +enum class CkptDataType { + EMB_INFO = 0, + EMB_DATA = 1, + EMB_HASHMAP = 2, + DEV_OFFSET = 3, + EMB_CURR_STAT = 4, + NDDR_OFFSET = 5, + NDDR_FEATMAP = 6, + TABLE_2_THRESH = 7, + HIST_REC = 8, + ATTRIBUTE = 9, + DDR_FREQ_MAP = 10, + EXCLUDE_FREQ_MAP = 11, + EVICT_POS = 12, + KEY_COUNT_MAP = 13 +}; + +static std::string CkptDataTypeName(CkptDataType type) +{ + switch (type) { + case CkptDataType::EMB_INFO: + return "EMB_INFO"; + case CkptDataType::EMB_DATA: + return "EMB_DATA"; + case CkptDataType::EMB_HASHMAP: + return "EMB_HASHMAP"; + case CkptDataType::DEV_OFFSET: + return "DEV_OFFSET"; + case CkptDataType::EMB_CURR_STAT: + return "EMB_CURR_STAT"; + case CkptDataType::NDDR_OFFSET: + return "NDDR_OFFSET"; + case CkptDataType::NDDR_FEATMAP: + return "NDDR_FEATMAP"; + case CkptDataType::TABLE_2_THRESH: + return "TABLE_2_THRESH"; + case CkptDataType::HIST_REC: + return "HIST_REC"; + case CkptDataType::ATTRIBUTE: + return "ATTRIBUTE"; + case CkptDataType::DDR_FREQ_MAP: + return "DDR_FREQ_MAP"; + case CkptDataType::EXCLUDE_FREQ_MAP: + return "EXCLUDE_FREQ_MAP"; + case CkptDataType::EVICT_POS: + return "EVICT_POS"; + case CkptDataType::KEY_COUNT_MAP: + return "KEY_COUNT_MAP"; + default: + return "UNKNOWN"; + } +} + +enum CTRLogLevel { // can't use enum class due to compatibility for AccCTR + DEBUG = 0, + INFO, + WARN, + ERROR, +}; + +static void CTRLog(int level, const char* msg) +{ + switch (level) { + case CTRLogLevel::DEBUG: + LOG_DEBUG(msg); + break; + case CTRLogLevel::INFO: + LOG_INFO(msg); + break; + case CTRLogLevel::WARN: + LOG_WARN(msg); + break; + case CTRLogLevel::ERROR: + LOG_ERROR(msg); + break; + default: + break; + } +} - ostream& operator<<(ostream& ss, MxRec::CkptDataType type); - bool 
CheckFilePermission(const string& filePath); +ostream& operator<<(ostream& ss, MxRec::CkptDataType type); +bool CheckFilePermission(const string& filePath); - int GetStepFromPath(const string& loadPath); -} // end namespace MxRec +int GetStepFromPath(const string& loadPath); +} // end namespace MxRec #define KEY_PROCESS "\033[45m[KeyProcess]\033[0m " #define STAT_INFO "[StatInfo] " #ifdef GTEST - #define GTEST_PRIVATE public +#define GTEST_PRIVATE public #else - #define GTEST_PRIVATE private +#define GTEST_PRIVATE private #endif #endif diff --git a/src/ops_tf/hybrid_dataset_ops.cpp b/src/ops_tf/hybrid_dataset_ops.cpp index 2eee8531..98fca961 100644 --- a/src/ops_tf/hybrid_dataset_ops.cpp +++ b/src/ops_tf/hybrid_dataset_ops.cpp @@ -403,7 +403,7 @@ namespace MxRec { out(0) = batchId; if (channelId == 1) { if (maxStep != -1 && batchId >= maxStep) { - LOG_DEBUG(StringFormat("skip excess batch after {}/{}", batchId, maxStep)); + LOG_DEBUG(StringFormat("skip excess batch after %d/%d", batchId, maxStep)); return; } } @@ -658,4 +658,4 @@ namespace tensorflow { .SetIsStateful() .SetShapeFn(::tensorflow::shape_inference::UnknownShape); REGISTER_KERNEL_BUILDER(Name("LazyAdam").Device(DEVICE_CPU), MxRec::CustOps); -} \ No newline at end of file +} -- Gitee From f6340067b4f2475582615b8cf77cf38baeabffe3 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Mon, 1 Jul 2024 08:28:14 +0000 Subject: [PATCH 250/302] fix: cleancode. Signed-off-by: steepcurve --- src/core/checkpoint/checkpoint.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/core/checkpoint/checkpoint.cpp b/src/core/checkpoint/checkpoint.cpp index bc7501bb..469e209e 100644 --- a/src/core/checkpoint/checkpoint.cpp +++ b/src/core/checkpoint/checkpoint.cpp @@ -90,17 +90,18 @@ void Checkpoint::SetDataHandler(CkptData& ckptData) void Checkpoint::SetDataHandler(const vector& featureTypes) { - map> setCkptMap{{CkptFeatureType::FEAT_ADMIT_N_EVICT, - [this] { - dataHandlers.push_back(make_unique()); - }}, - {CkptFeatureType::DDR_KEY_FREQ_MAP, - [this] { - dataHandlers.push_back(make_unique()); - }}, - {CkptFeatureType::KEY_COUNT_MAP, [this] { - dataHandlers.push_back(make_unique()); - }}}; + auto featAdmitNEvictHandler = [this] { + dataHandlers.push_back(make_unique()); + }; + auto ddrKeyFreqMapHandler = [this] { + dataHandlers.push_back(make_unique()); + }; + auto keyCountMapHandler = [this] { + dataHandlers.push_back(make_unique()); + }; + map> setCkptMap{{CkptFeatureType::FEAT_ADMIT_N_EVICT, featAdmitNEvictHandler}, + {CkptFeatureType::DDR_KEY_FREQ_MAP, ddrKeyFreqMapHandler}, + {CkptFeatureType::KEY_COUNT_MAP, keyCountMapHandler}}; for (const auto& featureType : featureTypes) { setCkptMap.at(featureType)(); @@ -341,7 +342,8 @@ void Checkpoint::ReadStream(CkptTransData& transData, const string& dataDir, Ckp if (readBytesNum != datasetSize) { throw runtime_error(StringFormat("Error: Load data failed. data type: %s. 
" "Expected to read %d bytes, but actually read %d bytes to file %s.", - CkptDataTypeName(dataType).c_str(), datasetSize, readBytesNum, dataDir.c_str())); + CkptDataTypeName(dataType).c_str(), datasetSize, readBytesNum, + dataDir.c_str())); } } -- Gitee From 90bd4a3705ce1a239178284d001abe91dd05fb91 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Tue, 2 Jul 2024 17:10:56 +0800 Subject: [PATCH 251/302] =?UTF-8?q?estimator=E4=B8=ADtrain=E5=88=87?= =?UTF-8?q?=E6=8D=A2=E4=B8=BAeval,=E5=A2=9E=E5=8A=A0=E5=8E=9Fhost=E4=BE=A7?= =?UTF-8?q?train=E7=9A=84=E7=9B=B8=E5=85=B3=E7=8A=B6=E6=80=81=E5=A4=87?= =?UTF-8?q?=E4=BB=BD=EF=BC=8C=E5=9C=A8eval=E5=88=87=E6=8D=A2=E4=B8=BAtrain?= =?UTF-8?q?=E5=90=8E=E8=BF=9B=E8=A1=8C=E8=BF=98=E5=8E=9F=E5=A4=87=E4=BB=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cache_manager/cache_manager.cpp | 27 ++++++++++++++++ .../cache_manager/cache_manager.h | 4 +++ src/AccCTR/src/embedding_cache/limited_set.h | 18 +++++++++++ .../offset_mapper/offset_mapper.h | 32 +++++++++++++++++++ src/AccCTR/src/include/embedding_cache.h | 14 ++++++++ src/core/emb_table/embedding_ddr.cpp | 10 ++++++ src/core/emb_table/embedding_ddr.h | 3 ++ src/core/emb_table/embedding_mgmt.cpp | 14 ++++++++ src/core/emb_table/embedding_mgmt.h | 11 +++++++ src/core/emb_table/embedding_static.cpp | 16 ++++++++-- src/core/emb_table/embedding_static.h | 4 +++ src/core/emb_table/embedding_table.cpp | 8 +++++ src/core/emb_table/embedding_table.h | 5 +++ src/core/hybrid_mgmt/hybrid_mgmt.cpp | 27 ++++++++++++++++ src/core/hybrid_mgmt/hybrid_mgmt.h | 5 +++ .../ock_ctr_common/include/embedding_cache.h | 14 ++++++++ 16 files changed, 210 insertions(+), 2 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index 8a6187a1..452e2fd1 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -317,6 +317,33 @@ int EmbCacheManagerImpl::LoadEmbTableInfos(std::string tableName, const std::vec return H_OK; } +int EmbCacheManagerImpl::BackUpTrainStatus(std:string tableName) +{ + int checkTableNameRet = CheckValidTableName(tableName); + if (checkTableNameRet != H_OK) { + return checkTableNameRet; + } + + auto om = offsetMappersBackUp.find(tableName); + if (om != offsetMappersBackUp.end()) { + offsetMappersBackUp[tableName] = offsetMappers[tableName]; + } else{ + offsetMappersBackUp[tableName].Initialize(1000, 1000); + offsetMappersBackUp[tableName] = offsetMappers[tableName]; + } + return H_OK; +} + +int EmbCacheManagerImpl::RecoverTrainStatus(std:string tableName) +{ + int checkTableNameRet = CheckValidTableName(tableName); + if (checkTableNameRet != H_OK) { + return checkTableNameRet; + } + offsetMappers[tableName] = offsetMappersBackUp[tableName]; + return H_OK; +} + void EmbCacheManagerImpl::Destroy() { for (auto it = offsetMappers.begin(); it != offsetMappers.end(); it++) { diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h index 80fbcd46..359e88ad 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h @@ -73,11 +73,15 @@ public: const std::vector>& embeddings, const std::vector>& optimizerSlots) override; + int BackUpTrainStatus(std:string tableName) override; + int RecoverTrainStatus(std::string 
tableName) override; + uint32_t GetUsage(const std::string& tableName) override; private: std::map embCacheInfos; std::map offsetMappers; + std::map offsetMappersBackUp; std::map embTables; int CheckValidTableName(const std::string& tableName); diff --git a/src/AccCTR/src/embedding_cache/limited_set.h b/src/AccCTR/src/embedding_cache/limited_set.h index 036a6477..d44b615a 100644 --- a/src/AccCTR/src/embedding_cache/limited_set.h +++ b/src/AccCTR/src/embedding_cache/limited_set.h @@ -47,6 +47,24 @@ public: delete tail; } + // 拷贝构造函数 + LimitedSet(const LimitedSet& other): head(new Node(-1)), tail(new Node(-1)) + { + nodes.resize(other.nodes.size()); + for (auto &node: nodes) { + node = new Node(-1); + } + + // 初始化头尾节点 + head->next = tail; + tail->prev = head; + + // 遍历原vector的每一个节点并复制 + for (Node* node = other.head->next; node != other.tail; node = node->next) { + insert(node->value); + } + } + void insert(uint64_t value) { if (nodes[value]->value == value) { diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h index f42a0d3f..1ad470c5 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/offset_mapper.h @@ -35,6 +35,38 @@ public: ~OffsetMapper() = default; + OffsetMapper(const OffsetMapper& other): maxCacheSize(other.maxCacheSize), useLength(other.useLength), + validPos(new LimitedSet(*other.validPos)), + evictPos(new LimitedSet(*other.evictPos)), + pos2Key(other.pos2Key), lastBatchPos(other.lastBatchPos), + evictSize(other.evictSize) + { + } + + OffsetMapper& operator=(const OffsetMapper& other) + { + if (this != &other) { + delete validPos; + validPos = nullptr; + delete evictPos; + evictPos = nullptr; + + if (other.validPos != nullptr) { + validPos = new LimitedSet(*other.validPos); + } + if (other.evictPos != nullptr) { + evictPos = new LimitedSet(*other.evictPos); + } + + maxCacheSize = other.maxCacheSize; + useLength = other.useLength; + pos2Key = other.pos2Key; + lastBatchPos = other.lastBatchPos; + evictSize = other.evictSize; + } + return *this; + } + bool Initialize(uint32_t reserve, uint32_t maxSize = 0) { maxCacheSize = maxSize; diff --git a/src/AccCTR/src/include/embedding_cache.h b/src/AccCTR/src/include/embedding_cache.h index 4adf1fbf..40d9dcbe 100644 --- a/src/AccCTR/src/include/embedding_cache.h +++ b/src/AccCTR/src/include/embedding_cache.h @@ -315,6 +315,20 @@ public: virtual int LoadEmbTableInfos(std::string tableName, const std::vector& keys, const std::vector>& embeddings, const std::vector>& optimizerSlots) = 0; + + /* * + * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态 + * @Param tableName: 需要加载信息的table名字 + * @Return errorCode + */ + virtual int BackUpTrainStatus(std::string tableName) = 0; + + /* * + * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态 + * @Param tableName: 需要加载信息的table名字 + * @Return errorCode + */ + virtual int RecoverTrainStatus(std::string tableName) = 0; }; } // namespace EmbCache diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index ca706c73..e4b96eb6 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -376,3 +376,13 @@ void EmbeddingDDR::SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache) { this->embCache = embCache; } + +void EmbeddingDDR::BackUpTrainStatus() +{ + embCache->BackUpTrainStatus(name); +} + +void EmbeddingDDR::RecoverTrainStatus() +{ + 
+    embCache->RecoverTrainStatus(name);
+}
diff --git a/src/core/emb_table/embedding_ddr.h b/src/core/emb_table/embedding_ddr.h
index ac5c5878..26d85e60 100644
--- a/src/core/emb_table/embedding_ddr.h
+++ b/src/core/emb_table/embedding_ddr.h
@@ -73,6 +73,9 @@ public:
     void SaveEmbAndOptim(const string& savePath);
     void SetEmbCache(ock::ctr::EmbCacheManagerPtr embCache);
 
+    void BackUpTrainStatus();
+    void RecoverTrainStatus();
+
 GTEST_PRIVATE:
     void EvictDeleteEmb(const vector& keys);
 
diff --git a/src/core/emb_table/embedding_mgmt.cpp b/src/core/emb_table/embedding_mgmt.cpp
index 9e7dcbb0..d889cdba 100644
--- a/src/core/emb_table/embedding_mgmt.cpp
+++ b/src/core/emb_table/embedding_mgmt.cpp
@@ -196,3 +196,17 @@ void EmbeddingMgmt::SetEmbCacheForEmbTable(const ock::ctr::EmbCacheManagerPtr& e
         table.second->SetEmbCache(embCache);
     }
 }
+
+void EmbeddingMgmt::BackUpTrainStatusBeforeLoad()
+{
+    for (auto& table: embeddings) {
+        table.second->BackUpTrainStatus();
+    }
+}
+
+void EmbeddingMgmt::RecoverTrainStatus()
+{
+    for (auto& table: embeddings) {
+        table.second->RecoverTrainStatus();
+    }
+}
\ No newline at end of file
diff --git a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h
index ef106786..7cd3f782 100644
--- a/src/core/emb_table/embedding_mgmt.h
+++ b/src/core/emb_table/embedding_mgmt.h
@@ -89,6 +89,17 @@ public:
      */
    void Save(const string& filePath);
 
+    /**
+     * In estimator mode, back up the train state of every table when train switches to eval
+     */
+    void BackUpTrainStatusBeforeLoad();
+
+    /**
+     * In estimator mode, restore the train state of every table when eval switches back to train
+     */
+    void RecoverTrainStatus();
+
+
     /**
      * 获取所有表对应的DeviceOffsets,该偏移用于python侧保存embedding时抽取key对应的embedding
      */
diff --git a/src/core/emb_table/embedding_static.cpp b/src/core/emb_table/embedding_static.cpp
index 61874b1f..0db152ed 100644
--- a/src/core/emb_table/embedding_static.cpp
+++ b/src/core/emb_table/embedding_static.cpp
@@ -160,11 +160,23 @@ void EmbeddingStatic::LoadKey(const string& savePath)
     }
 
     maxOffset = keyOffsetMap.size();
-
     free(static_cast(buf));
 }
 
 vector EmbeddingStatic::GetDeviceOffset()
 {
     return deviceOffset;
-}
\ No newline at end of file
+}
+
+void EmbeddingStatic::BackUpTrainStatus()
+{
+    keyOffsetMapBackUp = keyOffsetMap;
+}
+
+void EmbeddingStatic::RecoverTrainStatus()
+{
+    if (keyOffsetMapBackUp.size() != 0) {
+        keyOffsetMap = keyOffsetMapBackUp;
+        keyOffsetMapBackUp.clear();
+    }
+}
diff --git a/src/core/emb_table/embedding_static.h b/src/core/emb_table/embedding_static.h
index 6515f586..6f772e08 100644
--- a/src/core/emb_table/embedding_static.h
+++ b/src/core/emb_table/embedding_static.h
@@ -39,6 +39,10 @@ public:
 
     void Save(const string& savePath);
 
+    void BackUpTrainStatus();
+
+    void RecoverTrainStatus();
+
     vector GetDeviceOffset();
 
 GTEST_PRIVATE:
diff --git a/src/core/emb_table/embedding_table.cpp b/src/core/emb_table/embedding_table.cpp
index b4eb2379..12b0137a 100644
--- a/src/core/emb_table/embedding_table.cpp
+++ b/src/core/emb_table/embedding_table.cpp
@@ -143,6 +143,14 @@ void EmbeddingTable::Save(const string& filePath)
 {
 }
 
+void EmbeddingTable::BackUpTrainStatus()
+{
+}
+
+void EmbeddingTable::RecoverTrainStatus()
+{
+}
+
 void EmbeddingTable::MakeDir(const string& dirName)
 {
     if (fileSystemPtr_ == nullptr) {
diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h
index cbf15a7a..174cc0fc 100644
--- a/src/core/emb_table/embedding_table.h
+++ b/src/core/emb_table/embedding_table.h
@@ -76,6 +76,10 @@ public:
 
     void MakeDir(const string& dirName);
 
+    virtual void BackUpTrainStatus();
+
+    virtual void RecoverTrainStatus();
+
     virtual vector GetDeviceOffset();
 
     vector GetLoadOffset();
 
@@ -96,6 +100,7 @@ public:
     size_t ssdVocabSize;
     size_t maxOffset;
     absl::flat_hash_map keyOffsetMap;
+    absl::flat_hash_map keyOffsetMapBackUp;
 
     std::vector evictDevPos; // 记录HBM内被淘汰的key
     std::vector evictHostPos; // 记录Host内淘汰列表
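Each table class above supplies its own notion of train state: EmbeddingStatic snapshots keyOffsetMap, EmbeddingDDR delegates to the cache's offsetMapper backup, and the EmbeddingTable base class deliberately does nothing. A condensed sketch of the static (non-expansion HBM) case, with absl::flat_hash_map simplified to std::unordered_map and the key/offset types assumed:

    #include <cstdint>
    #include <unordered_map>

    class StaticTableSketch {
    public:
        // Called when the train channel switches to eval: snapshot train's key->offset view.
        void BackUpTrainStatus()
        {
            keyOffsetMapBackUp = keyOffsetMap;
        }

        // Called when eval switches back to train: discard eval-side changes.
        void RecoverTrainStatus()
        {
            if (!keyOffsetMapBackUp.empty()) {
                keyOffsetMap = keyOffsetMapBackUp;
                keyOffsetMapBackUp.clear();
            }
        }

    private:
        std::unordered_map<int64_t, size_t> keyOffsetMap;
        std::unordered_map<int64_t, size_t> keyOffsetMapBackUp;
    };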
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
index fda54d9d..100ed24e 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
@@ -221,6 +221,7 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables)
     Checkpoint loadCkpt;
     vector loadFeatures;
     SetFeatureTypeForLoad(loadFeatures);
+    BackUpTrainStatus();
 
     if (warmStartTables.size() == 0) {
         EmbeddingMgmt::Instance()->Load(loadPath, trainKeysSet);
@@ -499,6 +500,8 @@ void HybridMgmt::EvalTask(TaskType type)
         cvCheckSave.wait(checkSaveLocker, [this] {
             return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy;
         });
+        // Restore the backed-up train state before waking train's data-processing thread
+        RecoverTrainStatus();
         hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID);
         LOG_DEBUG("wake TrainTask");
         hybridMgmtBlock->DoBlock(channelId);
@@ -2230,3 +2233,27 @@ bool HybridMgmt::IsTrainAndEvalCase()
     }
     return alreadyTrainOnce && isChannelSwitchCase;
 }
+
+void HybridMgmt::BackUpTrainStatus()
+{
+    int channelID = TRAIN_CHANNEL_ID;
+    int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[channelID];
+    // Loads in resumed-training or predict mode do not need to back up the train state
+    if (theTrainBatchId == 0) {
+        return;
+    }
+    // In train-and-eval mode, once train has switched to eval, the eval-side load must first back up
+    // the existing state: non-expansion HBM mode backs up keyOffsetMap, DDR mode backs up the offsetMapper object
+    LOG_INFO("On Estimator train and eval mode, start to backup train status, "
+             "current train batchId: {}.", theTrainBatchId);
+    EmbeddingMgmt::Instance()->BackUpTrainStatusBeforeLoad();
+    isBackUpTrainStatus = true;
+}
+
+void HybridMgmt::RecoverTrainStatus()
+{
+    if (isBackUpTrainStatus) {
+        EmbeddingMgmt::Instance()->RecoverTrainStatus();
+    }
+    isBackUpTrainStatus = false;
+}
\ No newline at end of file
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h
index 83299da3..fb050e70 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.h
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.h
@@ -133,6 +133,10 @@ namespace MxRec {
 
         void ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBatchOut);
 
+        void BackUpTrainStatus();
+
+        void RecoverTrainStatus();
+
    GTEST_PRIVATE:
         bool mutexDestroy { false };
         std::mutex lookUpAndSendBatchIdMtx;
@@ -225,6 +229,7 @@ namespace MxRec {
         bool isLoad { false };
         bool isInitialized { false };
         bool alreadyTrainOnce = false; // 用于判断是否为predict模式
+        bool isBackUpTrainStatus = false; // whether the train state has currently been backed up
         map lookUpSwapInAddrsPushId; // 用于处理eos场景,当消费者追上生产者且长时间无上游数据,会触发eos
         map specialProcessStatus;
diff --git a/src/core/ock_ctr_common/include/embedding_cache.h b/src/core/ock_ctr_common/include/embedding_cache.h
index f3bc9e23..5e25a718 100644
--- a/src/core/ock_ctr_common/include/embedding_cache.h
+++ b/src/core/ock_ctr_common/include/embedding_cache.h
@@ -315,6 +315,20 @@ public:
     virtual int LoadEmbTableInfos(std::string tableName, const std::vector& keys,
                                   const std::vector>& embeddings,
                                   const std::vector>& optimizerSlots) = 0;
+
+    /* *
+     * When the train channel switches to eval, back up this table's offsetMapper object, i.e. the on-device key state belonging to train
+     * @Param tableName: name of the table to back up
+     * @Return errorCode
+     */
+    virtual int BackUpTrainStatus(std::string tableName) = 0;
+
+    /* *
+     * When the eval channel switches back to train, restore this table's offsetMapper object to the backed-up on-device key state of train
+     * @Param tableName: name of the table to restore
+     * @Return errorCode
+     */
+    virtual int
RecoverTrainStatus(std::string tableName) = 0; }; } // namespace EmbCache -- Gitee From c145cc40abe4a77ab850169ebd09576d27261c21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 3 Jul 2024 14:31:25 +0800 Subject: [PATCH 252/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?SSD=E6=A8=A1=E5=BC=8F=E7=B2=BE=E5=BA=A6=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cache_manager/cache_manager.cpp | 8 +-- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 59 +++++++------------ src/core/hybrid_mgmt/hybrid_mgmt.h | 7 +-- 3 files changed, 28 insertions(+), 46 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index 8a6187a1..3017cf8e 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -72,16 +72,16 @@ int EmbCacheManagerImpl::CreateCacheForTable(const EmbCacheInfo& embCacheInfo, return H_THREAD_NUM_ERROR; } - uint32_t reserve = embCacheInfo.vocabSize / VOCAB_CACHE_RATIO; - if (!offsetMappers[embCacheInfo.tableName].Initialize(reserve, embCacheInfo.maxCacheSize)) { + uint32_t reserveDevice = embCacheInfo.maxCacheSize / VOCAB_CACHE_RATIO; + if (!offsetMappers[embCacheInfo.tableName].Initialize(reserveDevice, embCacheInfo.maxCacheSize)) { offsetMappers[embCacheInfo.tableName].UnInitialize(); offsetMappers.erase(embCacheInfo.tableName); return H_MEMORY_ALLOC_ERROR; } EmbPoolParam embPoolParam{prefillBufferSize, refillThreadNum}; - - if (!embTables[embCacheInfo.tableName].Initialize(embCacheInfo, reserve, initializerInfos, embPoolParam)) { + uint32_t reserveHost = embCacheInfo.vocabSize / VOCAB_CACHE_RATIO; + if (!embTables[embCacheInfo.tableName].Initialize(embCacheInfo, reserveHost, initializerInfos, embPoolParam)) { offsetMappers.erase(embCacheInfo.tableName); embTables.erase(embCacheInfo.tableName); return H_MEMORY_ALLOC_ERROR; diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index fda54d9d..9e195419 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -918,28 +918,27 @@ void HybridMgmt::SetOptimizerInfo(const string& embName, OptimizerInfo optimInfo EmbeddingMgmt::Instance()->SetOptimizerInfo(embName, optimInfo); } -void HybridMgmt::LookUpAddrs(const string &embName, int extEmbeddingSize) +// L3Storage +void HybridMgmt::LookUpAndRemoveAddrs(const EmbTaskInfo &info) { - int id = 0; - uint64_t memSize = extEmbeddingSize * sizeof(float); + uint64_t memSize = info.extEmbeddingSize * sizeof(float); const std::string hbmSwapKeyQueName = "HBMSwapKeyQue"; const std::string ddrSwapKeyQueName = "DDRSwapKeyQue"; - auto lookUpFunc = [this, memSize, embName, id]( + auto lookUpFunc = [this, memSize, info]( std::map>> &fromQue, std::map>> &toQue, const string &swapStr, const string &fromQueName ) { - std::vector keys = fromQue[embName + swapStr].WaitAndPop(); + std::vector keys = fromQue[info.name + swapStr].WaitAndPop(); if (!isRunning) { return; } std::vector addrs; TimeCost lookupAddrsTC; - int rc = embCache->EmbeddingLookupAddrs(embName, keys, addrs); + int rc = embCache->EmbeddingLookupAddrs(info.name, keys, addrs); if (rc != H_OK) { - lookupAddrSuccess = false; LOG_ERROR("lookUpAddrs, table:{}, fromQue: {}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}", - embName, fromQueName, swapStr, keys.size(), addrs.size(), id); + 
info.name, fromQueName, swapStr, keys.size(), addrs.size(), info.batchId); throw runtime_error("EmbeddingLookupAddrs failed! error code:" + std::to_string(rc)); } if (&fromQue == &DDRSwapKeyQue && swapStr == SWAP_OUT_STR) { @@ -947,31 +946,28 @@ void HybridMgmt::LookUpAddrs(const string &embName, int extEmbeddingSize) auto *newAddr = (float*)malloc(memSize); rc = memcpy_s(newAddr, memSize, addr, memSize); if (rc != 0) { - lookupAddrSuccess = false; throw runtime_error("memcpy_s failed! error code:" + std::to_string(rc)); } addr = newAddr; } - rc = embCache->EmbeddingRemove(embName, keys); + rc = embCache->EmbeddingRemove(info.name, keys); if (rc != H_OK) { - lookupAddrSuccess = false; throw runtime_error("EmbeddingRemove failed! error code:" + std::to_string(rc)); } } LOG_DEBUG("table:{}, fromQue:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsTC(ms):{}", - embName, fromQueName, swapStr, keys.size(), addrs.size(), id, lookupAddrsTC.ElapsedMS()); - toQue[embName + swapStr].Pushv(addrs); + info.name, fromQueName, swapStr, keys.size(), addrs.size(), info.batchId, lookupAddrsTC.ElapsedMS()); + toQue[info.name + swapStr].Pushv(addrs); }; - while (isRunning && lookupAddrSuccess) { - lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_OUT_STR, ddrSwapKeyQueName); - lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_IN_STR, ddrSwapKeyQueName); - lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_IN_STR, hbmSwapKeyQueName); - lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_OUT_STR, hbmSwapKeyQueName); - id++; - lookUpSwapInAddrsPushId[embName]++; - } + + lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_OUT_STR, ddrSwapKeyQueName); + lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_IN_STR, ddrSwapKeyQueName); + lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_IN_STR, hbmSwapKeyQueName); + lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_OUT_STR, hbmSwapKeyQueName); + lookUpSwapInAddrsPushId[info.name]++; } +// DDR void HybridMgmt::LookUpSwapAddrs(const string &embName, const string &swapStr) { int id = 0; @@ -1146,6 +1142,9 @@ void HybridMgmt::EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, cons .extEmbeddingSize=embInfo.extEmbeddingSize, .name=embInfo.name }; + // host swap out need to be executed before lookup + LookUpAndRemoveAddrs(info); + float* ptr = nullptr; vector swapOutAddrs; int64_t dims0 = 0; @@ -1226,8 +1225,6 @@ void HybridMgmt::ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBa HandleEndBatchCase(info, swapInPos); - CheckLookupAddrSuccessL3Storage(); - if (info.channelId == TRAIN_CHANNEL_ID) { alreadyTrainOnce = true; } @@ -1295,8 +1292,6 @@ void HybridMgmt::InitDataPipelineForL3Storage(const string &embName, int extEmbe DDRSwapAddrsQue[embName + SWAP_IN_STR]; // 初始化lookup线程 - lookUpThreads.emplace_back( - std::async(std::launch::async, [=] { LookUpAddrs(embName, extEmbeddingSize); })); LOG_DEBUG("data pipeline for L3Storage init"); } @@ -1321,8 +1316,9 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) embInfo.name, embInfo.hostVocabSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); EmbCache::EmbCacheInfo embCacheInfo(embInfo.name, embInfo.hostVocabSize, embInfo.embeddingSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); + size_t prefill = std::max(embInfo.hostVocabSize/10, 2 * embInfo.devVocabSize); int ret = embCache->CreateCacheForTable( - embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, embInfo.hostVocabSize, EMBEDDING_THREAD_NUM); + embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, prefill, 
EMBEDDING_THREAD_NUM); if (ret != H_OK) { throw runtime_error(embInfo.name + "create cache for table failed, error code: " + std::to_string(ret)); } @@ -1355,9 +1351,6 @@ void HybridMgmt::JoinEmbeddingCacheThread() for (auto& t : EmbeddingReceiveAndUpdateThreadPool) { t.join(); } - for (auto& t : lookUpThreads) { - t.wait(); - } for (auto& t : lookUpSwapInAddrsThreads) { t.wait(); } @@ -2175,14 +2168,6 @@ void HybridMgmt::CheckLookupAddrSuccessDDR() } } -void HybridMgmt::CheckLookupAddrSuccessL3Storage() -{ - if (!lookupAddrSuccess) { - for (auto& t : lookUpThreads) { - t.get(); - } - } -} void HybridMgmt::GetSwapPairsAndKey2Offset(const EmbBaseInfo &info, vector &uniqueKeys, pair, vector> &swapInKoPair, diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index 83299da3..0654be91 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -157,7 +157,6 @@ namespace MxRec { std::vector EmbeddingReceiveAndUpdateThreadPool; std::vector> lookUpSwapOutAddrsThreads; std::vector> lookUpSwapInAddrsThreads; - std::vector> lookUpThreads; std::map>> HBMSwapKeyQue; std::map>> SwapOut2L3StorageKeyQue; @@ -190,9 +189,9 @@ namespace MxRec { void EvictL3StorageKeys(const string& embName, const vector& keys) const; - void LookUpAddrs(const string &embName, int extEmbeddingSize); + void LookUpAndRemoveAddrs(const EmbTaskInfo &info); // L3Storage, synchronous - void LookUpSwapAddrs(const std::string &embName, const std::string &swapStr); + void LookUpSwapAddrs(const std::string &embName, const std::string &swapStr); // DDR, asynchronous void EmbeddingTask(); @@ -312,8 +311,6 @@ namespace MxRec { void CheckLookupAddrSuccessDDR(); - void CheckLookupAddrSuccessL3Storage(); - void GetSwapPairsAndKey2Offset(const EmbBaseInfo& info, vector &uniqueKeys, std::pair, vector>& swapInKoPair, std::pair, vector>& swapOutKoPair); -- Gitee From 12b6f9f608f43a9d3e0f981f531dc2c72021478f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 3 Jul 2024 14:48:57 +0800 Subject: [PATCH 253/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?SSD=E6=A8=A1=E5=BC=8F=E7=B2=BE=E5=BA=A6=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cache_manager/cache_manager.cpp | 7 +- src/core/emb_table/embedding_ddr.cpp | 2 +- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 65 +++++++++---------- src/core/hybrid_mgmt/hybrid_mgmt.h | 7 +- src/core/l3_storage/cache_manager.h | 4 +- 5 files changed, 41 insertions(+), 44 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index 3017cf8e..c6cc1bbd 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -72,16 +72,15 @@ int EmbCacheManagerImpl::CreateCacheForTable(const EmbCacheInfo& embCacheInfo, return H_THREAD_NUM_ERROR; } - uint32_t reserveDevice = embCacheInfo.maxCacheSize / VOCAB_CACHE_RATIO; - if (!offsetMappers[embCacheInfo.tableName].Initialize(reserveDevice, embCacheInfo.maxCacheSize)) { + uint32_t reserve = embCacheInfo.vocabSize / VOCAB_CACHE_RATIO; + if (!offsetMappers[embCacheInfo.tableName].Initialize(reserve, embCacheInfo.maxCacheSize)) { offsetMappers[embCacheInfo.tableName].UnInitialize(); offsetMappers.erase(embCacheInfo.tableName); return H_MEMORY_ALLOC_ERROR; } EmbPoolParam embPoolParam{prefillBufferSize, refillThreadNum}; - 
uint32_t reserveHost = embCacheInfo.vocabSize / VOCAB_CACHE_RATIO; - if (!embTables[embCacheInfo.tableName].Initialize(embCacheInfo, reserveHost, initializerInfos, embPoolParam)) { + if (!embTables[embCacheInfo.tableName].Initialize(embCacheInfo, reserve, initializerInfos, embPoolParam)) { offsetMappers.erase(embCacheInfo.tableName); embTables.erase(embCacheInfo.tableName); return H_MEMORY_ALLOC_ERROR; diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index ca706c73..151e372c 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -235,7 +235,7 @@ void EmbeddingDDR::SyncLatestEmbedding() } } else { // 在保存之前先更新ddr和ssd的embedding - SwapOutInfo info; + HBMSwapOutInfo info; cacheManager_->ProcessSwapOutKeys(name, swapOutKeys, info); vector swapOutAddrs; rc = embCache->EmbeddingLookupAddrs(name, info.swapOutDDRKeys, swapOutAddrs); diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 9e195419..01beb358 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -962,8 +962,8 @@ void HybridMgmt::LookUpAndRemoveAddrs(const EmbTaskInfo &info) lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_OUT_STR, ddrSwapKeyQueName); lookUpFunc(DDRSwapKeyQue, DDRSwapAddrsQue, SWAP_IN_STR, ddrSwapKeyQueName); - lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_IN_STR, hbmSwapKeyQueName); - lookUpFunc(HBMSwapKeyQue, tableToQueueLookup, SWAP_OUT_STR, hbmSwapKeyQueName); + lookUpFunc(HBMSwapKeyQue, HBMSwapAddrsQue, SWAP_IN_STR, hbmSwapKeyQueName); + lookUpFunc(HBMSwapKeyQue, HBMSwapAddrsQue, SWAP_OUT_STR, hbmSwapKeyQueName); lookUpSwapInAddrsPushId[info.name]++; } @@ -987,7 +987,7 @@ void HybridMgmt::LookUpSwapAddrs(const string &embName, const string &swapStr) LOG_DEBUG( "table:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsTC(ms):{}", embName, swapStr, keys.size(), addrs.size(), id, lookupAddrsTC.ElapsedMS()); - tableToQueueLookup[swapName].Pushv(addrs); + HBMSwapAddrsQue[swapName].Pushv(addrs); if (swapStr==SWAP_IN_STR) { lookUpSwapInAddrsPushId[embName]++; LOG_DEBUG("LookUpSwapAddrs, table:{}, pushId:{}, lookUpSwapInAddrsPushId:{}", @@ -1258,8 +1258,8 @@ void HybridMgmt::InitDataPipelineForDDR(const string &embName) // 初始化公共队列 HBMSwapKeyQue[embName+SWAP_IN_STR]; HBMSwapKeyQue[embName+SWAP_OUT_STR]; - tableToQueueLookup[embName+SWAP_IN_STR]; - tableToQueueLookup[embName+SWAP_OUT_STR]; + HBMSwapAddrsQue[embName + SWAP_IN_STR]; + HBMSwapAddrsQue[embName + SWAP_OUT_STR]; // 初始化lookup线程 lookUpSwapInAddrsPushId[embName]; // 此处初始化,避免多线程竞争导致计数错误 @@ -1276,13 +1276,13 @@ void HybridMgmt::InitDataPipelineForL3Storage(const string &embName, int extEmbe // 初始化公共队列 HBMSwapKeyQue[embName+SWAP_IN_STR]; HBMSwapKeyQue[embName+SWAP_OUT_STR]; - tableToQueueLookup[embName+SWAP_IN_STR]; - tableToQueueLookup[embName+SWAP_OUT_STR]; + HBMSwapAddrsQue[embName + SWAP_IN_STR]; + HBMSwapAddrsQue[embName + SWAP_OUT_STR]; HBMSwapKeyQue[embName + ADDR_STR]; - SwapOut2L3StorageKeyQue[embName + SWAP_IN_STR]; - SwapOut2L3StorageKeyQue[embName + ADDR_STR]; - SwapOut2L3StorageKeyQue[embName + SWAP_OUT_STR]; + HBMSwapKeyForL3StorageQue[embName + SWAP_IN_STR]; + HBMSwapKeyForL3StorageQue[embName + ADDR_STR]; + HBMSwapKeyForL3StorageQue[embName + SWAP_OUT_STR]; DDRSwapKeyQue[embName + SWAP_OUT_STR]; DDRSwapKeyQue[embName + SWAP_IN_STR]; @@ -1316,9 +1316,8 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) embInfo.name, embInfo.hostVocabSize, embInfo.extEmbeddingSize, 
embInfo.devVocabSize); EmbCache::EmbCacheInfo embCacheInfo(embInfo.name, embInfo.hostVocabSize, embInfo.embeddingSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); - size_t prefill = std::max(embInfo.hostVocabSize/10, 2 * embInfo.devVocabSize); int ret = embCache->CreateCacheForTable( - embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, prefill, EMBEDDING_THREAD_NUM); + embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, embInfo.hostVocabSize, EMBEDDING_THREAD_NUM); if (ret != H_OK) { throw runtime_error(embInfo.name + "create cache for table failed, error code: " + std::to_string(ret)); } @@ -1327,13 +1326,13 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) void HybridMgmt::JoinEmbeddingCacheThread() { - for (auto &p : tableToQueueLookup) { + for (auto &p : HBMSwapAddrsQue) { p.second.DestroyQueue(); } for (auto &p : HBMSwapKeyQue) { p.second.DestroyQueue(); } - for (auto &p : SwapOut2L3StorageKeyQue) { + for (auto &p : HBMSwapKeyForL3StorageQue) { p.second.DestroyQueue(); } for (auto &p : DDRSwapKeyQue) { @@ -1439,7 +1438,7 @@ bool HybridMgmt::EmbeddingReceiveDDR(const EmbTaskInfo& info, float*& ptr, vecto } TimeCost EmbeddingRecvTC = TimeCost(); - swapOutAddrs = tableToQueueLookup[info.name+SWAP_OUT_STR].WaitAndPop(); + swapOutAddrs = HBMSwapAddrsQue[info.name + SWAP_OUT_STR].WaitAndPop(); if (!isRunning) { return false; } @@ -1617,7 +1616,7 @@ bool HybridMgmt::EmbeddingReceiveL3Storage(const EmbTaskInfo &info, float *&ptr, } TimeCost EmbeddingRecvTC = TimeCost(); // finish时会pop空vector,因此需要额外判定isRunning - swapOutAddrs = tableToQueueLookup[info.name+SWAP_OUT_STR].WaitAndPop(); + swapOutAddrs = HBMSwapAddrsQue[info.name + SWAP_OUT_STR].WaitAndPop(); if (!isRunning) { return false; } @@ -1681,8 +1680,8 @@ void HybridMgmt::EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float *embPtr // L3Storage更新 TimeCost L3StorageUpdateTC = TimeCost(); - std::vector swapOutL3StorageAddrOffs = SwapOut2L3StorageKeyQue[info.name + ADDR_STR].WaitAndPop(); - std::vector swapOutL3StorageKeys = SwapOut2L3StorageKeyQue[info.name + SWAP_OUT_STR].WaitAndPop(); + std::vector swapOutL3StorageAddrOffs = HBMSwapKeyForL3StorageQue[info.name + ADDR_STR].WaitAndPop(); + std::vector swapOutL3StorageKeys = HBMSwapKeyForL3StorageQue[info.name + SWAP_OUT_STR].WaitAndPop(); if (!isRunning) { return; } @@ -1874,8 +1873,8 @@ void HybridMgmt::HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKoPair.first); // HBM->L3Storage - SwapOut2L3StorageKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutL3StorageKeys); - SwapOut2L3StorageKeyQue[info.name + ADDR_STR].Pushv(emptySwapOutL3StorageAddrOff); + HBMSwapKeyForL3StorageQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutL3StorageKeys); + HBMSwapKeyForL3StorageQue[info.name + ADDR_STR].Pushv(emptySwapOutL3StorageAddrOff); } void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, @@ -1888,18 +1887,18 @@ void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, LOG_DEBUG("ProcessSwapInKeysTC(ms):{} ", ProcessSwapInKeysTC.ElapsedMS()); TimeCost ProcessSwapOutKeysTC; - SwapOutInfo swapInfo; - cacheManager->ProcessSwapOutKeys(info.name, swapOutKeys, swapInfo); + HBMSwapOutInfo hbmSwapInfo; + cacheManager->ProcessSwapOutKeys(info.name, swapOutKeys, hbmSwapInfo); LOG_DEBUG("ProcessSwapOutKeysTC(ms):{} ", ProcessSwapOutKeysTC.ElapsedMS()); LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, info.batchId, info.channelId, swapInKeys.size(), 
swapOutKeys.size()); - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapOutDDRKeys:{}, swapOutDDRAddrOffs:{}, " - "swapOutL3StorageKeys:{}, swapOutL3StorageAddrOff:{}", - info.name, info.batchId, info.channelId, swapInfo.swapOutDDRKeys.size(), - swapInfo.swapOutDDRAddrOffs.size(), swapInfo.swapOutL3StorageKeys.size(), - swapInfo.swapOutL3StorageAddrOffs.size()); - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToL3StorageKeys:{}, L3StorageToDDRKeys:{}", + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swap out, HBM2DDR Keys:{}, HBM2DDR AddrOffs:{}, " + "HBM2L3Storage Keys:{}, HBM2L3Storage AddrOff:{}", + info.name, info.batchId, info.channelId, hbmSwapInfo.swapOutDDRKeys.size(), + hbmSwapInfo.swapOutDDRAddrOffs.size(), hbmSwapInfo.swapOutL3StorageKeys.size(), + hbmSwapInfo.swapOutL3StorageAddrOffs.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDR2L3Storage Keys:{}, L3Storage2DDR Keys:{}", info.name, info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); auto DDRToL3StorageKeysForL3S = DDRToL3StorageKeys; @@ -1912,18 +1911,18 @@ void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, DDRSwapKeyForL3StorageQue[info.name + SWAP_IN_STR].Pushv(L3StorageToDDRKeysForL3S); // HBM<->DDR - HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(swapInfo.swapOutDDRKeys); - HBMSwapKeyQue[info.name + ADDR_STR].Pushv(swapInfo.swapOutDDRAddrOffs); + HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(hbmSwapInfo.swapOutDDRKeys); + HBMSwapKeyQue[info.name + ADDR_STR].Pushv(hbmSwapInfo.swapOutDDRAddrOffs); HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKeys); // HBM->L3Storage - SwapOut2L3StorageKeyQue[info.name + SWAP_OUT_STR].Pushv(swapInfo.swapOutL3StorageKeys); - SwapOut2L3StorageKeyQue[info.name + ADDR_STR].Pushv(swapInfo.swapOutL3StorageAddrOffs); + HBMSwapKeyForL3StorageQue[info.name + SWAP_OUT_STR].Pushv(hbmSwapInfo.swapOutL3StorageKeys); + HBMSwapKeyForL3StorageQue[info.name + ADDR_STR].Pushv(hbmSwapInfo.swapOutL3StorageAddrOffs); } bool HybridMgmt::BuildH2DEmbedding(const EmbTaskInfo &info, vector &h2dEmb) { - std::vector swapInAddrs = tableToQueueLookup[info.name+SWAP_IN_STR].WaitAndPop(); + std::vector swapInAddrs = HBMSwapAddrsQue[info.name + SWAP_IN_STR].WaitAndPop(); if (!isRunning) { return false; } diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index 0654be91..f5897861 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -159,21 +159,20 @@ namespace MxRec { std::vector> lookUpSwapInAddrsThreads; std::map>> HBMSwapKeyQue; - std::map>> SwapOut2L3StorageKeyQue; + std::map>> HBMSwapKeyForL3StorageQue; std::map>> DDRSwapKeyQue; std::map>> DDRSwapKeyForL3StorageQue; + std::map>> HBMSwapAddrsQue; std::map>> DDRSwapAddrsQue; std::mutex evictMut; std::map> trainKeysSet; - const string SWAP_IN_STR = "SwapIn"; const string SWAP_OUT_STR = "SwapOut"; - const string ADDR_STR = "Addr"; + const string ADDR_STR = "Addr"; ock::ctr::EmbCacheManagerPtr embCache = nullptr; - std::map>> tableToQueueLookup; std::map> lastSwapInPosMap {}; std::map>> trainTestSwitchInfoStore {}; std::atomic lookupAddrSuccess {true}; diff --git a/src/core/l3_storage/cache_manager.h b/src/core/l3_storage/cache_manager.h index 3f5b0a22..79335788 100644 --- a/src/core/l3_storage/cache_manager.h +++ b/src/core/l3_storage/cache_manager.h @@ -40,7 +40,7 @@ namespace MxRec { absl::flat_hash_map& keyOffsetMap; }; - struct SwapOutInfo { + struct HBMSwapOutInfo { vector swapOutDDRKeys; vector swapOutDDRAddrOffs; vector 
swapOutL3StorageKeys; @@ -89,7 +89,7 @@ namespace MxRec { void PutKey(const string& embTableName, const emb_key_t& key, RecordType type); void ProcessSwapOutKeys(const string& tableName, const vector& swapOutKeys, - SwapOutInfo& info); + HBMSwapOutInfo& info); void ProcessSwapInKeys(const string& tableName, const vector& swapInKeys, vector& DDRToL3StorageKeys, -- Gitee From ddefbd55694512acf3a2213c94cd491ed3058077 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 3 Jul 2024 15:03:10 +0800 Subject: [PATCH 254/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?prefill=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 3 ++- src/core/utils/common.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index fda54d9d..e4e30f64 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -1321,8 +1321,9 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) embInfo.name, embInfo.hostVocabSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); EmbCache::EmbCacheInfo embCacheInfo(embInfo.name, embInfo.hostVocabSize, embInfo.embeddingSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); + size_t prefill = std::max(embInfo.hostVocabSize/HOST_TO_PREFILL_RATIO, embInfo.devVocabSize); int ret = embCache->CreateCacheForTable( - embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, embInfo.hostVocabSize, EMBEDDING_THREAD_NUM); + embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, prefill, EMBEDDING_THREAD_NUM); if (ret != H_OK) { throw runtime_error(embInfo.name + "create cache for table failed, error code: " + std::to_string(ret)); } diff --git a/src/core/utils/common.h b/src/core/utils/common.h index 9a39e7ac..c020bbc5 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -69,6 +69,7 @@ namespace MxRec { constexpr int SSD_SIZE_INDEX = 2; constexpr int MAX_FILE_NUM = 1000; constexpr int EMBEDDING_THREAD_NUM = 2; + constexpr int HOST_TO_PREFILL_RATIO = 10; // for GLOG struct GlogConfig { static bool gStatOn; -- Gitee From afa1b548e1d91dd6bd5f8b9a2b4e61af04019440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 3 Jul 2024 15:09:34 +0800 Subject: [PATCH 255/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?reserve=E5=9C=A8dev=E4=BE=A7=E6=B5=AA=E8=B4=B9=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../embedding_cache/cache_manager/cache_manager.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index 8a6187a1..a9fac9f6 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -64,7 +64,8 @@ int EmbCacheManagerImpl::CreateCacheForTable(const EmbCacheInfo& embCacheInfo, } if ((prefillBufferSize < 1) || (prefillBufferSize > embCacheInfo.vocabSize)) { - ExternalLogger::PrintLog(LogLevel::ERROR, "prefillBufferSize has to be between [1, hostVocabSize]"); + ExternalLogger::PrintLog(LogLevel::ERROR, "prefillBufferSize: " + std::to_string(prefillBufferSize) + + "has to be between [1, hostVocabSize]"); return 
H_PREFILL_BUFFER_SIZE_INVALID; } @@ -72,16 +73,16 @@ int EmbCacheManagerImpl::CreateCacheForTable(const EmbCacheInfo& embCacheInfo, return H_THREAD_NUM_ERROR; } - uint32_t reserve = embCacheInfo.vocabSize / VOCAB_CACHE_RATIO; - if (!offsetMappers[embCacheInfo.tableName].Initialize(reserve, embCacheInfo.maxCacheSize)) { + uint32_t reserveDevice = embCacheInfo.maxCacheSize / VOCAB_CACHE_RATIO; + if (!offsetMappers[embCacheInfo.tableName].Initialize(reserveDevice, embCacheInfo.maxCacheSize)) { offsetMappers[embCacheInfo.tableName].UnInitialize(); offsetMappers.erase(embCacheInfo.tableName); return H_MEMORY_ALLOC_ERROR; } EmbPoolParam embPoolParam{prefillBufferSize, refillThreadNum}; - - if (!embTables[embCacheInfo.tableName].Initialize(embCacheInfo, reserve, initializerInfos, embPoolParam)) { + uint32_t reserveHost = embCacheInfo.vocabSize / VOCAB_CACHE_RATIO; + if (!embTables[embCacheInfo.tableName].Initialize(embCacheInfo, reserveHost, initializerInfos, embPoolParam)) { offsetMappers.erase(embCacheInfo.tableName); embTables.erase(embCacheInfo.tableName); return H_MEMORY_ALLOC_ERROR; -- Gitee From fe3982f8995eb03e428f03866c941b8569f86785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 3 Jul 2024 15:19:09 +0800 Subject: [PATCH 256/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?reserve=E5=9C=A8dev=E4=BE=A7=E6=B5=AA=E8=B4=B9=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/l3_storage/cache_manager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/l3_storage/cache_manager.cpp b/src/core/l3_storage/cache_manager.cpp index 75d73b2d..ee3d7bc5 100644 --- a/src/core/l3_storage/cache_manager.cpp +++ b/src/core/l3_storage/cache_manager.cpp @@ -181,7 +181,7 @@ int64_t CacheManager::GetTableUsage(const string& tableName) } void CacheManager::ProcessSwapOutKeys(const string& tableName, const vector& swapOutKeys, - SwapOutInfo& info) + HBMSwapOutInfo& info) { auto& swapOutDDRKeys = info.swapOutDDRKeys; auto& swapOutDDRAddrOffs = info.swapOutDDRAddrOffs; -- Gitee From 486e3e9f7159de52b78bb4313d027e7d48d6918b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 3 Jul 2024 15:37:52 +0800 Subject: [PATCH 257/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?SSD=E7=B2=BE=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 01beb358..6969c27d 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -1142,8 +1142,6 @@ void HybridMgmt::EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, cons .extEmbeddingSize=embInfo.extEmbeddingSize, .name=embInfo.name }; - // host swap out need to be executed before lookup - LookUpAndRemoveAddrs(info); float* ptr = nullptr; vector swapOutAddrs; @@ -1614,6 +1612,9 @@ bool HybridMgmt::EmbeddingReceiveL3Storage(const EmbTaskInfo &info, float *&ptr, if (!isRunning) { return false; } + // DDR swap out key need to be removed + LookUpAndRemoveAddrs(info); + TimeCost EmbeddingRecvTC = TimeCost(); // finish时会pop空vector,因此需要额外判定isRunning swapOutAddrs = HBMSwapAddrsQue[info.name + SWAP_OUT_STR].WaitAndPop(); -- Gitee From 474ca30dc51783a59d16803324e25cf8aa2d0395 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 3 Jul 2024 16:04:13 +0800 Subject: [PATCH 258/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E5=8A=A8=E6=80=81=E6=89=A9=E5=AE=B9=E5=86=97=E4=BD=99=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E4=BF=AE=E5=A4=8Dtest?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/emb_table/embedding_ddr_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tests/emb_table/embedding_ddr_test.cpp b/src/tests/emb_table/embedding_ddr_test.cpp index 60ec5af6..097167f6 100644 --- a/src/tests/emb_table/embedding_ddr_test.cpp +++ b/src/tests/emb_table/embedding_ddr_test.cpp @@ -22,7 +22,6 @@ See the License for the specific language governing permissions and #include #include #include "utils/common.h" -#include "emb_table/emb_table.h" #include "emb_table/embedding_ddr.h" using namespace std; -- Gitee From e6c501c12489146c7887c52519f739922367f780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 4 Jul 2024 09:26:28 +0800 Subject: [PATCH 259/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/embedding_cache/cache_manager/cache_manager.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index a9fac9f6..76e90abc 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -64,8 +64,8 @@ int EmbCacheManagerImpl::CreateCacheForTable(const EmbCacheInfo& embCacheInfo, } if ((prefillBufferSize < 1) || (prefillBufferSize > embCacheInfo.vocabSize)) { - ExternalLogger::PrintLog(LogLevel::ERROR, "prefillBufferSize: " + std::to_string(prefillBufferSize) + - "has to be between [1, hostVocabSize]"); + ExternalLogger::PrintLog(LogLevel::ERROR, "PrefillBufferSize: " + std::to_string(prefillBufferSize) + + " has to be between [1, hostVocabSize]."); return H_PREFILL_BUFFER_SIZE_INVALID; } -- Gitee From 637ef26d445886b98aee01f64b13a40081c60c04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 4 Jul 2024 09:54:03 +0800 Subject: [PATCH 260/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91delete?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/tests/emb_table/embedding_mgmt_test.cpp | 1 - src/tests/emb_table/embedding_static_test.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/src/tests/emb_table/embedding_mgmt_test.cpp b/src/tests/emb_table/embedding_mgmt_test.cpp index 055cf5c5..81a354bf 100644 --- a/src/tests/emb_table/embedding_mgmt_test.cpp +++ b/src/tests/emb_table/embedding_mgmt_test.cpp @@ -22,7 +22,6 @@ See the License for the specific language governing permissions and #include #include #include "utils/common.h" -#include "emb_table/emb_table.h" #include "emb_table/embedding_mgmt.h" using namespace std; diff --git a/src/tests/emb_table/embedding_static_test.cpp b/src/tests/emb_table/embedding_static_test.cpp index a08569b3..5d1f0ab7 100644 --- a/src/tests/emb_table/embedding_static_test.cpp +++ b/src/tests/emb_table/embedding_static_test.cpp @@ -21,7 +21,6 @@ See the License for the specific language governing permissions and #include #include #include "utils/common.h" -#include "emb_table/emb_table.h" #include "emb_table/embedding_static.h" using namespace
std; -- Gitee From c9a321e908b0290b60b1776b595e38b37d2698ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Thu, 4 Jul 2024 16:08:15 +0800 Subject: [PATCH 261/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E4=B8=8B=EF=BC=8Ctable.capacity=E5=87=BA?= =?UTF-8?q?=E7=8E=B0=E5=81=B6=E5=8F=91=E8=B4=9F=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/emb_table/embedding_ddr.cpp | 2 +- src/core/emb_table/embedding_dynamic.cpp | 4 ++-- src/core/emb_table/embedding_table.h | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index ca706c73..b9ca70dc 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -45,7 +45,7 @@ void EmbeddingDDR::Key2Offset(std::vector& splitKey, int channel) int64_t EmbeddingDDR::capacity() const { - return capacity_; + return capacity_.load(); } /* diff --git a/src/core/emb_table/embedding_dynamic.cpp b/src/core/emb_table/embedding_dynamic.cpp index 7f8cd7e5..703d08ad 100644 --- a/src/core/emb_table/embedding_dynamic.cpp +++ b/src/core/emb_table/embedding_dynamic.cpp @@ -77,7 +77,7 @@ void EmbeddingDynamic::Key2Offset(std::vector& keys, int channel) int64_t EmbeddingDynamic::capacity() const { - return capacity_; + return capacity_.load(); } int64_t EmbeddingDynamic::GetEmptyEmbeddingAddress() @@ -103,7 +103,7 @@ void EmbeddingDynamic::MallocEmbeddingBlock(int embNum) float *embAddr = static_cast(block) + (i * extEmbSize_); embeddingList_.push_back(embAddr); } - capacity_ += embNum; + capacity_.fetch_add(embNum); } void EmbeddingDynamic::RandomInit(void* addr, size_t embNum) diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h index cbf15a7a..3396a8a0 100644 --- a/src/core/emb_table/embedding_table.h +++ b/src/core/emb_table/embedding_table.h @@ -15,6 +15,7 @@ See the License for the specific language governing permissions and #ifndef MX_REC_EMBEDDING_TABLE_H #define MX_REC_EMBEDDING_TABLE_H +#include #include #include #include @@ -113,7 +114,7 @@ protected: size_t embSize_; size_t extEmbSize_; int seed_; - int64_t capacity_; + std::atomic capacity_; size_t rankId_; size_t rankSize_; vector loadOffset; -- Gitee From a55ed2e12febf7542083651f8ed95ffbd3894e90 Mon Sep 17 00:00:00 2001 From: steepcurve Date: Fri, 5 Jul 2024 08:01:45 +0000 Subject: [PATCH 262/302] fix: conflict in src/core/utils/common.h. 
Signed-off-by: steepcurve --- src/core/utils/common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/utils/common.h b/src/core/utils/common.h index f8ff4565..8c7528f4 100644 --- a/src/core/utils/common.h +++ b/src/core/utils/common.h @@ -70,6 +70,7 @@ constexpr size_t MAX_VOCABULARY_SIZE = 1e10; constexpr int SSD_SIZE_INDEX = 2; constexpr int MAX_FILE_NUM = 1000; constexpr int EMBEDDING_THREAD_NUM = 2; +constexpr int HOST_TO_PREFILL_RATIO = 10; // for GLOG struct GlogConfig { static bool gStatOn; -- Gitee From 03a664699c4aa5fd7bd6e8353327181dcd677aec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Fri, 5 Jul 2024 16:38:57 +0800 Subject: [PATCH 263/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E4=B8=8B=EF=BC=8Ctable.capacity=E5=87=BA?= =?UTF-8?q?=E7=8E=B0=E5=81=B6=E5=8F=91=E8=B4=9F=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 591 +++++++++++++-------------- src/core/hybrid_mgmt/hybrid_mgmt.h | 408 +++++++++--------- 2 files changed, 485 insertions(+), 514 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 61064fb4..3eb99685 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -15,23 +15,23 @@ See the License for the specific language governing permissions and #include "hybrid_mgmt.h" +#include + #include +#include #include -#include #include #include -#include +#include "checkpoint/checkpoint.h" +#include "emb_table/embedding_mgmt.h" #include "hd_transfer/hd_transfer.h" #include "hybrid_mgmt/hybrid_mgmt_block.h" -#include "utils/time_cost.h" -#include "utils/logger.h" -#include "utils/common.h" -#include "checkpoint/checkpoint.h" -#include "key_process/key_process.h" #include "key_process/feature_admit_and_evict.h" -#include "emb_table/embedding_mgmt.h" - +#include "key_process/key_process.h" +#include "utils/common.h" +#include "utils/logger.h" +#include "utils/time_cost.h" using namespace MxRec; using namespace std; @@ -98,8 +98,8 @@ bool HybridMgmt::Initialize(RankInfo rankInfo, const vector& embInfos, InitRankInfo(rankInfo, embInfos); GlogConfig::gStatOn = GlobalEnv::statOn; - LOG_INFO(MGMT + "begin initialize, localRankSize:{}, localRankId:{}, rank:{}", - rankInfo.localRankSize, rankInfo.localRankId, rankInfo.rankId); + LOG_INFO(MGMT + "begin initialize, localRankSize:{}, localRankId:{}, rank:{}", rankInfo.localRankSize, + rankInfo.localRankId, rankInfo.rankId); mgmtRankInfo = rankInfo; mgmtEmbInfo = embInfos; @@ -134,15 +134,15 @@ bool HybridMgmt::Initialize(RankInfo rankInfo, const vector& embInfos, Start(); } - for (const auto& info: embInfos) { - LOG_INFO(MGMT + "table:{}, vocab size dev+host:{}+{}, send count:{}", - info.name, info.devVocabSize, info.hostVocabSize, info.sendCount); + for (const auto& info : embInfos) { + LOG_INFO(MGMT + "table:{}, vocab size dev+host:{}+{}, send count:{}", info.name, info.devVocabSize, + info.hostVocabSize, info.sendCount); } LOG_INFO(MGMT + "end initialize, rankId:{}, isDDR:{}, " "step[train_interval, eval_interval, save_interval, max_train_step]:[{}, {}, {}, {}]", - rankInfo.rankId, rankInfo.isDDR, - rankInfo.ctrlSteps.at(TRAIN_CHANNEL_ID), rankInfo.ctrlSteps.at(EVAL_CHANNEL_ID), - rankInfo.ctrlSteps.at(SAVE_STEP_INDEX), rankInfo.ctrlSteps.at(MAX_TRAIN_STEP_INDEX)); + rankInfo.rankId, rankInfo.isDDR, rankInfo.ctrlSteps.at(TRAIN_CHANNEL_ID), + 
rankInfo.ctrlSteps.at(EVAL_CHANNEL_ID), rankInfo.ctrlSteps.at(SAVE_STEP_INDEX), + rankInfo.ctrlSteps.at(MAX_TRAIN_STEP_INDEX)); #endif isInitialized = true; @@ -225,7 +225,7 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) if (warmStartTables.size() == 0) { EmbeddingMgmt::Instance()->Load(loadPath, trainKeysSet); } else { - for (auto& tableName: warmStartTables) { + for (auto& tableName : warmStartTables) { EmbeddingMgmt::Instance()->Load(tableName, loadPath, trainKeysSet); } } @@ -373,17 +373,17 @@ void HybridMgmt::Start() void HybridMgmt::StartThreadForHBM() { #ifndef GTEST - auto parseKeysTaskForHBMTrain = [this]() { - TrainTask(TaskType::HBM); - LOG_INFO("parseKeysTaskForHBMTrain done"); - }; - procThreads.emplace_back(std::make_unique(parseKeysTaskForHBMTrain)); - - auto parseKeysTaskForHBMEval = [this]() { - EvalTask(TaskType::HBM); - LOG_INFO("parseKeysTaskForHBMEval done"); - }; - procThreads.emplace_back(std::make_unique(parseKeysTaskForHBMEval)); + auto parseKeysTaskForHBMTrain = [this]() { + TrainTask(TaskType::HBM); + LOG_INFO("parseKeysTaskForHBMTrain done"); + }; + procThreads.emplace_back(std::make_unique(parseKeysTaskForHBMTrain)); + + auto parseKeysTaskForHBMEval = [this]() { + EvalTask(TaskType::HBM); + LOG_INFO("parseKeysTaskForHBMEval done"); + }; + procThreads.emplace_back(std::make_unique(parseKeysTaskForHBMEval)); #endif } @@ -424,7 +424,7 @@ void HybridMgmt::Destroy() isRunning = false; mutexDestroy = true; - for (const auto& embInfo: mgmtEmbInfo) { + for (const auto& embInfo : mgmtEmbInfo) { for (int index = 0; index < EMBEDDING_THREAD_NUM; index++) { cvLastUpdateFinishMap[embInfo.name][index].notify_all(); cvLastLookUpFinishMap[embInfo.name][index].notify_all(); @@ -456,7 +456,9 @@ void HybridMgmt::Destroy() // 停止预处理 KEY_PROCESS_INSTANCE->Destroy(); // stop embCache, even if the host emb is still allocating - if (embCache != nullptr) { embCache->Destroy(); } + if (embCache != nullptr) { + embCache->Destroy(); + } LOG_DEBUG(MGMT + "Destroy hybrid_mgmt module end."); } @@ -493,12 +495,10 @@ void HybridMgmt::EvalTask(TaskType type) do { hybridMgmtBlock->CheckAndSetBlock(channelId); if (hybridMgmtBlock->GetBlockStatus(channelId)) { - LOG_DEBUG("eval channel block at batchId:{}, needWaitSave:{}", - evalBatchId, hybridMgmtBlock->IsNeedWaitSave()); + LOG_DEBUG("eval channel block at batchId:{}, needWaitSave:{}", evalBatchId, + hybridMgmtBlock->IsNeedWaitSave()); std::unique_lock checkSaveLocker(saveMutex); - cvCheckSave.wait(checkSaveLocker, [this] { - return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; - }); + cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; }); hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); LOG_DEBUG("wake TrainTask"); hybridMgmtBlock->DoBlock(channelId); @@ -513,29 +513,28 @@ void HybridMgmt::EvalTask(TaskType type) #endif } -void HybridMgmt::SendUniqKeysAndRestoreVecHBM(const EmbBaseInfo &info, - const unique_ptr> &infoVecs, bool isGrad) const +void HybridMgmt::SendUniqKeysAndRestoreVecHBM(const EmbBaseInfo& info, const unique_ptr>& infoVecs, + bool isGrad) const { TimeCost sendUniqueKeysSyncTC; - LOG_DEBUG("channelId:{} batchId:{}, global unique, table name: {}, is grad: {}", - info.channelId, info.batchId, info.name, isGrad); + LOG_DEBUG("channelId:{} batchId:{}, global unique, table name: {}, is grad: {}", info.channelId, info.batchId, + info.name, isGrad); if (isGrad) { hdTransfer->Send(TransferChannel::UNIQKEYS, {infoVecs->back()}, info.channelId, info.name); } 
infoVecs->pop_back(); - LOG_DEBUG("channelId:{} batchId:{}, sendUniqueKeysSyncTC(ms):{}", - info.channelId, info.batchId, sendUniqueKeysSyncTC.ElapsedMS()); + LOG_DEBUG("channelId:{} batchId:{}, sendUniqueKeysSyncTC(ms):{}", info.channelId, info.batchId, + sendUniqueKeysSyncTC.ElapsedMS()); TimeCost sendUniqueRestoreVecSyncTC; if (isGrad) { hdTransfer->Send(TransferChannel::RESTORE_SECOND, {infoVecs->back()}, info.channelId, info.name); } infoVecs->pop_back(); - LOG_DEBUG("channelId:{} batchId:{}, sendUniqueRestoreVecSyncTC(ms):{}", - info.channelId, info.batchId, sendUniqueRestoreVecSyncTC.ElapsedMS()); + LOG_DEBUG("channelId:{} batchId:{}, sendUniqueRestoreVecSyncTC(ms):{}", info.channelId, info.batchId, + sendUniqueRestoreVecSyncTC.ElapsedMS()); } - /// 当前处理的batch是否是最后一个batch,涵盖train切换eval、save场景 /// \param batchId 已处理的batch数 /// \return @@ -544,13 +543,12 @@ bool HybridMgmt::IsTrainEndBatch(int batchId) const // case 1:需要切eval // case 2:需要save时,补发pos后被阻塞,等待save完成,避免embCache状态发送变化 // batchId是从0开始的,所以要+1对上step - bool isNeedSwitchToEval = mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID] != -1 && - (batchId + 1) % mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID] == 0; - bool isNeedSave = mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] != -1 && - mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] != 0 && + bool isNeedSwitchToEval = + mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID] != -1 && (batchId + 1) % mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID] == 0; + bool isNeedSave = mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] != -1 && mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] != 0 && (batchId + 1) % mgmtRankInfo.ctrlSteps[SAVE_STEP_INDEX] == 0; - LOG_DEBUG("mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID]:{}, batchId:{}", - mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID], batchId); + LOG_DEBUG("mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID]:{}, batchId:{}", mgmtRankInfo.ctrlSteps[TRAIN_CHANNEL_ID], + batchId); LOG_DEBUG("isNeedSwitchToEval:{}, isNeedSave:{}", isNeedSwitchToEval, isNeedSave); return isNeedSwitchToEval || isNeedSave; } @@ -570,26 +568,23 @@ bool HybridMgmt::ParseKeys(int channelId, int& batchId, TaskType type) #ifndef GTEST LOG_INFO(MGMT + "channelId:{} batchId:{}, ParseKeys start.", channelId, batchId); TimeCost parseKeyTC; - bool remainBatch = true; // 是否从通道获取了数据 + bool remainBatch = true; // 是否从通道获取了数据 vector parseKeyThreadPool; for (const auto& embInfo : mgmtEmbInfo) { - EmbBaseInfo info = {.batchId=batchId, .channelId=channelId, .name=embInfo.name}; + EmbBaseInfo info = {.batchId = batchId, .channelId = channelId, .name = embInfo.name}; switch (type) { case TaskType::HBM: - parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { - ProcessEmbInfoHBM(info, remainBatch, embInfo.isGrad); - }); + parseKeyThreadPool.emplace_back( + [this, info, &remainBatch, embInfo]() { ProcessEmbInfoHBM(info, remainBatch, embInfo.isGrad); }); break; case TaskType::DDR: if (!isL3StorageEnabled) { - parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { - ProcessEmbInfoDDR(info, remainBatch); - }); + parseKeyThreadPool.emplace_back( + [this, info, &remainBatch, embInfo]() { ProcessEmbInfoDDR(info, remainBatch); }); } else { - parseKeyThreadPool.emplace_back([this, info, &remainBatch, embInfo]() { - ProcessEmbInfoL3Storage(info, remainBatch); - }); + parseKeyThreadPool.emplace_back( + [this, info, &remainBatch, embInfo]() { ProcessEmbInfoL3Storage(info, remainBatch); }); } break; default: @@ -608,14 +603,14 @@ bool HybridMgmt::ParseKeys(int channelId, int& batchId, TaskType type) if (!isRunning) { return false; } - LOG_DEBUG(MGMT + 
"channelId:{} batchId:{}, ParseKeys end, parseKeyTC(ms):{}", - channelId, batchId, parseKeyTC.ElapsedMS()); + LOG_DEBUG(MGMT + "channelId:{} batchId:{}, ParseKeys end, parseKeyTC(ms):{}", channelId, batchId, + parseKeyTC.ElapsedMS()); batchId++; #endif return true; } -void HybridMgmt::ProcessEmbInfoHBM(const EmbBaseInfo &info, bool& remainBatchOut, bool isGrad) +void HybridMgmt::ProcessEmbInfoHBM(const EmbBaseInfo& info, bool& remainBatchOut, bool isGrad) { TimeCost parseKeysTc; LOG_DEBUG("ProcessEmbInfoHBM table:{}, batchId:{}, channel:{}", info.name, info.batchId, info.channelId); @@ -628,13 +623,13 @@ void HybridMgmt::ProcessEmbInfoHBM(const EmbBaseInfo &info, bool& remainBatchOut return; } if (infoVecs == nullptr) { - LOG_INFO(MGMT + "table:{}, channelId:{} batchId:{}, ParseKeys infoVecs empty !", - info.name, info.channelId, info.batchId); + LOG_INFO(MGMT + "table:{}, channelId:{} batchId:{}, ParseKeys infoVecs empty !", info.name, info.channelId, + info.batchId); remainBatchOut = false; return; } - LOG_DEBUG("table:{}, channelId:{} batchId:{}, ParseKeysHBM GetInfoVec end", - info.name, info.channelId, info.batchId); + LOG_DEBUG("table:{}, channelId:{} batchId:{}, ParseKeysHBM GetInfoVec end", info.name, info.channelId, + info.batchId); // 动态shape场景下,获取all2all向量(通信量矩阵) SendAll2AllVec(info, remainBatchOut); @@ -644,10 +639,10 @@ void HybridMgmt::ProcessEmbInfoHBM(const EmbBaseInfo &info, bool& remainBatchOut // 发送查询向量 TimeCost sendLookupSyncTC; - hdTransfer->Send(TransferChannel::LOOKUP, { infoVecs->back() }, info.channelId, info.name); + hdTransfer->Send(TransferChannel::LOOKUP, {infoVecs->back()}, info.channelId, info.name); infoVecs->pop_back(); - LOG_DEBUG("table:{}, channelId:{} batchId:{}, sendLookupSyncTC(ms):{}", - info.name, info.channelId, info.batchId, sendLookupSyncTC.ElapsedMS()); + LOG_DEBUG("table:{}, channelId:{} batchId:{}, sendLookupSyncTC(ms):{}", info.name, info.channelId, info.batchId, + sendLookupSyncTC.ElapsedMS()); // 训练时,使用全局去重聚合梯度,发送全局去重的key和对应的恢复向量 if (mgmtRankInfo.useSumSameIdGradients && info.channelId == TRAIN_CHANNEL_ID) { @@ -657,18 +652,17 @@ void HybridMgmt::ProcessEmbInfoHBM(const EmbBaseInfo &info, bool& remainBatchOut // 发送恢复向量 TimeCost sendRestoreSyncTC; hdTransfer->Send(TransferChannel::RESTORE, *infoVecs, info.channelId, info.name); - LOG_DEBUG("table:{}, sendRestoreSyncTC(ms):{}, parseKeysTc HBM mode (ms):{}", - info.name, sendRestoreSyncTC.ElapsedMS(), parseKeysTc.ElapsedMS()); + LOG_DEBUG("table:{}, sendRestoreSyncTC(ms):{}, parseKeysTc HBM mode (ms):{}", info.name, + sendRestoreSyncTC.ElapsedMS(), parseKeysTc.ElapsedMS()); - LOG_INFO(MGMT + "table:{}, channelId:{} batchId:{}, embName:{}, ParseKeys with HBM mode end.", - info.name, info.channelId, info.batchId, info.name); + LOG_INFO(MGMT + "table:{}, channelId:{} batchId:{}, embName:{}, ParseKeys with HBM mode end.", info.name, + info.channelId, info.batchId, info.name); if (info.channelId == TRAIN_CHANNEL_ID) { alreadyTrainOnce = true; } } - /// 构造训练所需的各种向量数据 /// \param embName 表名 /// \param batchId 已处理的batch数 @@ -680,7 +674,7 @@ void HybridMgmt::ProcessEmbInfoDDR(const EmbBaseInfo& info, bool& remainBatchOut TimeCost getAndSendTensorsTC; LOG_DEBUG("ProcessEmbInfoDDR start, table:{}, channel:{}, batchId:{}", info.name, info.channelId, info.batchId); - if (info.channelId == TRAIN_CHANNEL_ID && info.batchId == hybridMgmtBlock->maxTrainStep) { + if (info.channelId == TRAIN_CHANNEL_ID && info.batchId == hybridMgmtBlock->maxTrainStep) { HandleReachMaxStepCase(info, remainBatchOut); return; } @@ 
-718,10 +712,10 @@ void HybridMgmt::ProcessEmbInfoDDR(const EmbBaseInfo& info, bool& remainBatchOut SendGlobalUniqueVec(info, uniqueKeys, restoreVecSec); TimeCost swapProcessTC; - auto &swapInPos = swapInKoPair.second; - auto &swapOutPos = swapOutKoPair.second; + auto& swapInPos = swapInKoPair.second; + auto& swapOutPos = swapOutKoPair.second; auto lastSwapInPos = lastSwapInPosMap[info.name]; - lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 + lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 auto isNeedReturn = HandleSpecialProcessStatusDDR(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); if (isNeedReturn) { @@ -827,7 +821,6 @@ void HybridMgmt::EvictL3StorageKeys(const string& embName, const vectorEvictL3StorageEmbedding(embName, keys); } - /// 通过pyBind在python侧调用,通知hybridMgmt上层即将进行图的执行,需要进行唤醒 /// \param channelID 通道id /// \param steps 运行的步数,由于可能存在循环下沉,所以1个session run 对应N步 @@ -919,16 +912,14 @@ void HybridMgmt::SetOptimizerInfo(const string& embName, OptimizerInfo optimInfo } // L3Storage -void HybridMgmt::LookUpAndRemoveAddrs(const EmbTaskInfo &info) +void HybridMgmt::LookUpAndRemoveAddrs(const EmbTaskInfo& info) { uint64_t memSize = info.extEmbeddingSize * sizeof(float); const std::string hbmSwapKeyQueName = "HBMSwapKeyQue"; const std::string ddrSwapKeyQueName = "DDRSwapKeyQue"; - auto lookUpFunc = [this, memSize, info]( - std::map>> &fromQue, - std::map>> &toQue, - const string &swapStr, const string &fromQueName - ) { + auto lookUpFunc = [this, memSize, info](std::map>>& fromQue, + std::map>>& toQue, + const string& swapStr, const string& fromQueName) { std::vector keys = fromQue[info.name + swapStr].WaitAndPop(); if (!isRunning) { return; @@ -942,8 +933,8 @@ void HybridMgmt::LookUpAndRemoveAddrs(const EmbTaskInfo &info) throw runtime_error("EmbeddingLookupAddrs failed! error code:" + std::to_string(rc)); } if (&fromQue == &DDRSwapKeyQue && swapStr == SWAP_OUT_STR) { - for (auto &addr : addrs) { - auto *newAddr = (float*)malloc(memSize); + for (auto& addr : addrs) { + auto* newAddr = (float*)malloc(memSize); rc = memcpy_s(newAddr, memSize, addr, memSize); if (rc != 0) { throw runtime_error("memcpy_s failed! error code:" + std::to_string(rc)); @@ -968,7 +959,7 @@ void HybridMgmt::LookUpAndRemoveAddrs(const EmbTaskInfo &info) } // DDR -void HybridMgmt::LookUpSwapAddrs(const string &embName, const string &swapStr) +void HybridMgmt::LookUpSwapAddrs(const string& embName, const string& swapStr) { int id = 0; std::string swapName = embName + swapStr; @@ -977,21 +968,20 @@ void HybridMgmt::LookUpSwapAddrs(const string &embName, const string &swapStr) if (!isRunning) { return; } - vector addrs; + vector addrs; TimeCost lookupAddrsTC; int rc = embCache->EmbeddingLookupAddrs(embName, keys, addrs); if (rc != H_OK) { lookupAddrSuccess = false; throw runtime_error("EmbeddingLookupAddrs failed! 
error code: " + std::to_string(rc)); } - LOG_DEBUG( - "table:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsTC(ms):{}", - embName, swapStr, keys.size(), addrs.size(), id, lookupAddrsTC.ElapsedMS()); + LOG_DEBUG("table:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsTC(ms):{}", embName, + swapStr, keys.size(), addrs.size(), id, lookupAddrsTC.ElapsedMS()); HBMSwapAddrsQue[swapName].Pushv(addrs); - if (swapStr==SWAP_IN_STR) { + if (swapStr == SWAP_IN_STR) { lookUpSwapInAddrsPushId[embName]++; - LOG_DEBUG("LookUpSwapAddrs, table:{}, pushId:{}, lookUpSwapInAddrsPushId:{}", - embName, id, lookUpSwapInAddrsPushId[embName]); + LOG_DEBUG("LookUpSwapAddrs, table:{}, pushId:{}, lookUpSwapInAddrsPushId:{}", embName, id, + lookUpSwapInAddrsPushId[embName]); } id++; } @@ -1006,15 +996,15 @@ void HybridMgmt::FetchDeviceEmb() if (mgmtRankInfo.isDDR) { // DDR模式保存host的emb表以及hashmap LOG_DEBUG(MGMT + "start host side save: ddr mode"); - for (const auto &embInfo: mgmtEmbInfo) { + for (const auto& embInfo : mgmtEmbInfo) { std::vector> koVec; embCache->ExportDeviceKeyOffsetPairs(embInfo.name, koVec); std::vector swapOutPos; - for (const auto &p : koVec) { + for (const auto& p : koVec) { swapOutPos.push_back(p.second); } - vector swapTensor; + vector swapTensor; swapTensor.emplace_back(Vec2TensorI32(swapOutPos)); swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, {1})); auto swapOutLen = swapTensor.back().flat(); @@ -1030,7 +1020,7 @@ void HybridMgmt::FetchDeviceEmb() // 这里就是新增的embedding处理线程 void HybridMgmt::EmbeddingTask() { - for (const auto& embInfo: mgmtEmbInfo) { + for (const auto& embInfo : mgmtEmbInfo) { lastUpdateFinishStepMap[embInfo.name] = 0; lastLookUpFinishStepMap[embInfo.name] = 0; lastSendFinishStepMap[embInfo.name] = 0; @@ -1045,7 +1035,7 @@ void HybridMgmt::EmbeddingTask() void HybridMgmt::MultiThreadEmbHDTransWrap() { for (int index = 0; index < EMBEDDING_THREAD_NUM; index++) { - for (const auto& embInfo: mgmtEmbInfo) { + for (const auto& embInfo : mgmtEmbInfo) { CreateEmbeddingLookUpAndSendThread(index, embInfo); CreateEmbeddingReceiveAndUpdateThread(index, embInfo); } @@ -1059,13 +1049,11 @@ void HybridMgmt::EmbeddingLookUpAndSendDDR(int batchId, int index, const EmbInfo cvNotifyIndex = index + 1; } - EmbTaskInfo info = { - .batchId=batchId, - .threadIdx=index, - .cvNotifyIndex=cvNotifyIndex, - .extEmbeddingSize=embInfo.extEmbeddingSize, - .name=embInfo.name - }; + EmbTaskInfo info = {.batchId = batchId, + .threadIdx = index, + .cvNotifyIndex = cvNotifyIndex, + .extEmbeddingSize = embInfo.extEmbeddingSize, + .name = embInfo.name}; vector h2dEmb; auto isSuccess = EmbeddingLookUpDDR(info, h2dEmb); @@ -1084,13 +1072,11 @@ void HybridMgmt::EmbeddingReceiveAndUpdateDDR(int batchId, int index, const EmbI cvNotifyIndex = index + 1; } - EmbTaskInfo info = { - .batchId=batchId, - .threadIdx=index, - .cvNotifyIndex=cvNotifyIndex, - .extEmbeddingSize=embInfo.extEmbeddingSize, - .name=embInfo.name - }; + EmbTaskInfo info = {.batchId = batchId, + .threadIdx = index, + .cvNotifyIndex = cvNotifyIndex, + .extEmbeddingSize = embInfo.extEmbeddingSize, + .name = embInfo.name}; float* ptr = nullptr; vector swapOutAddrs; @@ -1110,13 +1096,11 @@ void HybridMgmt::EmbeddingLookUpAndSendL3Storage(int batchId, int index, const E cvNotifyIndex = index + 1; } - EmbTaskInfo info = { - .batchId=batchId, - .threadIdx=index, - .cvNotifyIndex=cvNotifyIndex, - .extEmbeddingSize=embInfo.extEmbeddingSize, - .name=embInfo.name - }; + EmbTaskInfo info = {.batchId = batchId, + .threadIdx = 
index, + .cvNotifyIndex = cvNotifyIndex, + .extEmbeddingSize = embInfo.extEmbeddingSize, + .name = embInfo.name}; vector h2dEmb; auto isSuccess = EmbeddingLookUpL3Storage(info, h2dEmb); @@ -1135,13 +1119,11 @@ void HybridMgmt::EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, cons cvNotifyIndex = index + 1; } - EmbTaskInfo info = { - .batchId=batchId, - .threadIdx=index, - .cvNotifyIndex=cvNotifyIndex, - .extEmbeddingSize=embInfo.extEmbeddingSize, - .name=embInfo.name - }; + EmbTaskInfo info = {.batchId = batchId, + .threadIdx = index, + .cvNotifyIndex = cvNotifyIndex, + .extEmbeddingSize = embInfo.extEmbeddingSize, + .name = embInfo.name}; float* ptr = nullptr; vector swapOutAddrs; @@ -1151,7 +1133,6 @@ void HybridMgmt::EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, cons EmbeddingUpdateL3Storage(info, ptr, swapOutAddrs, dims0); } - /// 构造训练所需的各种向量数据 /// \param embName 表名 /// \param batchId 已处理的batch数 @@ -1164,7 +1145,7 @@ void HybridMgmt::ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBa TimeCost getAndSendTensorsTC; LOG_DEBUG("ProcessEmbInfoL3Storage table:{}, channel:{}, batchId:{}", info.name, info.channelId, info.batchId); - if (info.channelId == TRAIN_CHANNEL_ID && info.batchId == hybridMgmtBlock->maxTrainStep) { + if (info.channelId == TRAIN_CHANNEL_ID && info.batchId == hybridMgmtBlock->maxTrainStep) { HandleReachMaxStepCase(info, remainBatchOut); return; } @@ -1202,12 +1183,12 @@ void HybridMgmt::ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBa SendGlobalUniqueVec(info, uniqueKeys, restoreVecSec); TimeCost swapProcessTC; - auto &swapInKeys = swapInKoPair.first; - auto &swapInPos = swapInKoPair.second; - auto &swapOutKeys = swapOutKoPair.first; - auto &swapOutPos = swapOutKoPair.second; + auto& swapInKeys = swapInKoPair.first; + auto& swapInPos = swapInKoPair.second; + auto& swapOutKeys = swapOutKoPair.first; + auto& swapOutPos = swapOutKoPair.second; auto lastSwapInPos = lastSwapInPosMap[info.name]; - lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 + lastSwapInPosMap[info.name] = swapInPos; // 暂存待下一步发送 auto isNeedReturn = HandleSpecialProcessStatusL3Storage(info, getAndSendTensorsTC, swapInKoPair, swapOutKoPair); if (isNeedReturn) { @@ -1232,18 +1213,17 @@ void HybridMgmt::ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBa #endif } -void HybridMgmt::SendTensorForSwap(const EmbBaseInfo& info, - const vector &swapInPosUint, - const vector &swapOutPosUint) +void HybridMgmt::SendTensorForSwap(const EmbBaseInfo& info, const vector& swapInPosUint, + const vector& swapOutPosUint) { #ifndef GTEST vector swapTensor; swapTensor.emplace_back(Vec2TensorI32(swapInPosUint)); swapTensor.emplace_back(Vec2TensorI32(swapOutPosUint)); - swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); + swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, {1})); auto swapInLen = swapTensor.back().flat(); swapInLen(0) = swapInPosUint.size(); - swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, { 1 })); + swapTensor.emplace_back(Tensor(tensorflow::DT_INT32, {1})); auto swapOutLen = swapTensor.back().flat(); swapOutLen(0) = swapOutPosUint.size(); @@ -1251,11 +1231,11 @@ void HybridMgmt::SendTensorForSwap(const EmbBaseInfo& info, #endif } -void HybridMgmt::InitDataPipelineForDDR(const string &embName) +void HybridMgmt::InitDataPipelineForDDR(const string& embName) { // 初始化公共队列 - HBMSwapKeyQue[embName+SWAP_IN_STR]; - HBMSwapKeyQue[embName+SWAP_OUT_STR]; + HBMSwapKeyQue[embName + SWAP_IN_STR]; + HBMSwapKeyQue[embName + SWAP_OUT_STR]; 
HBMSwapAddrsQue[embName + SWAP_IN_STR]; HBMSwapAddrsQue[embName + SWAP_OUT_STR]; @@ -1269,11 +1249,11 @@ void HybridMgmt::InitDataPipelineForDDR(const string &embName) LOG_DEBUG("data pipeline for ddr init"); } -void HybridMgmt::InitDataPipelineForL3Storage(const string &embName, int extEmbeddingSize) +void HybridMgmt::InitDataPipelineForL3Storage(const string& embName, int extEmbeddingSize) { // 初始化公共队列 - HBMSwapKeyQue[embName+SWAP_IN_STR]; - HBMSwapKeyQue[embName+SWAP_OUT_STR]; + HBMSwapKeyQue[embName + SWAP_IN_STR]; + HBMSwapKeyQue[embName + SWAP_OUT_STR]; HBMSwapAddrsQue[embName + SWAP_IN_STR]; HBMSwapAddrsQue[embName + SWAP_OUT_STR]; @@ -1300,7 +1280,7 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) EmbeddingMgmt::Instance()->SetEmbCacheForEmbTable(embCache); EmbeddingMgmt::Instance()->SetHDTransferForEmbTable(hdTransfer); - for (auto embInfo: embInfos) { + for (auto embInfo : embInfos) { if (isL3StorageEnabled) { InitDataPipelineForL3Storage(embInfo.name, embInfo.extEmbeddingSize); } else { @@ -1314,9 +1294,9 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) embInfo.name, embInfo.hostVocabSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); EmbCache::EmbCacheInfo embCacheInfo(embInfo.name, embInfo.hostVocabSize, embInfo.embeddingSize, embInfo.extEmbeddingSize, embInfo.devVocabSize); - size_t prefill = std::max(embInfo.hostVocabSize/HOST_TO_PREFILL_RATIO, embInfo.devVocabSize); - int ret = embCache->CreateCacheForTable( - embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, prefill, EMBEDDING_THREAD_NUM); + size_t prefill = std::max(embInfo.hostVocabSize / HOST_TO_PREFILL_RATIO, embInfo.devVocabSize); + int ret = embCache->CreateCacheForTable(embCacheInfo, embInfo.initializeInfos, INVALID_KEY_VALUE, prefill, + EMBEDDING_THREAD_NUM); if (ret != H_OK) { throw runtime_error(embInfo.name + "create cache for table failed, error code: " + std::to_string(ret)); } @@ -1325,22 +1305,22 @@ void HybridMgmt::InitEmbeddingCache(const vector& embInfos) void HybridMgmt::JoinEmbeddingCacheThread() { - for (auto &p : HBMSwapAddrsQue) { + for (auto& p : HBMSwapAddrsQue) { p.second.DestroyQueue(); } - for (auto &p : HBMSwapKeyQue) { + for (auto& p : HBMSwapKeyQue) { p.second.DestroyQueue(); } - for (auto &p : HBMSwapKeyForL3StorageQue) { + for (auto& p : HBMSwapKeyForL3StorageQue) { p.second.DestroyQueue(); } - for (auto &p : DDRSwapKeyQue) { + for (auto& p : DDRSwapKeyQue) { p.second.DestroyQueue(); } - for (auto &p : DDRSwapKeyForL3StorageQue) { + for (auto& p : DDRSwapKeyForL3StorageQue) { p.second.DestroyQueue(); } - for (auto &p : DDRSwapAddrsQue) { + for (auto& p : DDRSwapAddrsQue) { p.second.DestroyQueue(); } for (auto& t : EmbeddingLookUpAndSendThreadPool) { @@ -1363,25 +1343,26 @@ void HybridMgmt::HandleReachMaxStepCase(const EmbBaseInfo& info, bool& remainBat // 2. 如果切换过: // a. eval场景跑完,不用send,外面自然退出 // b. 
save场景,能触发,说明期望的train step已经跑完(由IsTrainEndBatch判定send),当前step也不用send - LOG_DEBUG("table:{}, batchId:{}, ProcessStatus:{}, reach maxTrainStep", - info.name, info.batchId, ProcessStatus2Str(ProcessStatus::NORMAL)); + LOG_DEBUG("table:{}, batchId:{}, ProcessStatus:{}, reach maxTrainStep", info.name, info.batchId, + ProcessStatus2Str(ProcessStatus::NORMAL)); if (specialProcessStatus[info.name] == ProcessStatus::NORMAL) { LOG_DEBUG("table:{}, batchId:{}, need send swap tensor" - " for last step to finish train", info.name, info.batchId); + " for last step to finish train", + info.name, info.batchId); std::vector emptySwapOutPos; SendTensorForSwap(info, lastSwapInPosMap[info.name], emptySwapOutPos); } else { - LOG_DEBUG("table:{}, batchId:{}, switch from eval or save, unnecessary to send emptySwapOutPos", - info.name, info.batchId); + LOG_DEBUG("table:{}, batchId:{}, switch from eval or save, unnecessary to send emptySwapOutPos", info.name, + info.batchId); } remainBatchOut = false; hybridMgmtBlock->SetBlockStatus(TRAIN_CHANNEL_ID, true); } -void HybridMgmt::HandleEosCase(const EmbBaseInfo& info, bool &remainBatchOut) +void HybridMgmt::HandleEosCase(const EmbBaseInfo& info, bool& remainBatchOut) { - LOG_INFO("GetUniqueKeys get eos, handle final batch for current epoch, table:{}, channel:{}, batchId:{}", - info.name, info.channelId, info.batchId); + LOG_INFO("GetUniqueKeys get eos, handle final batch for current epoch, table:{}, channel:{}, batchId:{}", info.name, + info.channelId, info.batchId); bool sendAllChannel = false; if (info.channelId == TRAIN_CHANNEL_ID) { vector emptySwapOutPos; @@ -1418,8 +1399,8 @@ void HybridMgmt::HandleEosCase(const EmbBaseInfo& info, bool &remainBatchOut) // train+eval+train场景 // 交给train的ProcessEmbInfoDDR启动最后n-1步eval // train发送pos让eval step n-1跑完,到eval step n时各channel遇到eos后结束(train、eval共享的channel除外) - LOG_INFO("GetUniqueKeys get eos, skip send pos for eval channel, table:{}, batchId:{}", - info.name, info.batchId); + LOG_INFO("GetUniqueKeys get eos, skip send pos for eval channel, table:{}, batchId:{}", info.name, + info.batchId); } } KEY_PROCESS_INSTANCE->SendEos(info.name, info.batchId, info.channelId, sendAllChannel); @@ -1454,22 +1435,22 @@ bool HybridMgmt::EmbeddingReceiveDDR(const EmbTaskInfo& info, float*& ptr, vecto if (aclData == nullptr) { throw runtime_error("Acl get tensor data from dataset failed."); } - ptr = reinterpret_cast(acltdtGetDataAddrFromItem(aclData)); + ptr = reinterpret_cast(acltdtGetDataAddrFromItem(aclData)); // 判断拿到的embedding个数是否与swapOutKeys个数相等 size_t dimNum = acltdtGetDimNumFromItem(aclData); int64_t dims[dimNum]; acltdtGetDimsFromItem(aclData, dims, dimNum); - LOG_DEBUG("table:{}, batchId:{}, dims[0]:{}, swapOutAddrs size:{}", - info.name, info.batchId, dims[0], swapOutAddrs.size()); + LOG_DEBUG("table:{}, batchId:{}, dims[0]:{}, swapOutAddrs size:{}", info.name, info.batchId, dims[0], + swapOutAddrs.size()); if (dims[0] != static_cast(swapOutAddrs.size())) { throw runtime_error("data dims[0] != swapOutKeys.size()"); } } - LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingRecvTC(ms):{}", - info.name, info.batchId, info.threadIdx, EmbeddingRecvTC.ElapsedMS()); + LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingRecvTC(ms):{}", info.name, info.batchId, info.threadIdx, + EmbeddingRecvTC.ElapsedMS()); lastRecvFinishStepMap[info.name]++; cvLastRecvFinishMap[info.name][info.cvNotifyIndex].notify_all(); @@ -1486,8 +1467,8 @@ void HybridMgmt::EmbeddingUpdateDDR(const EmbTaskInfo& info, const float* embPtr uint64_t memSize = 
info.extEmbeddingSize * sizeof(float); uint64_t extEmbeddingSize = info.extEmbeddingSize; -# pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ - shared(swapOutAddrs, embPtr, extEmbeddingSize, memSize) +#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ + shared(swapOutAddrs, embPtr, extEmbeddingSize, memSize) for (uint64_t i = 0; i < swapOutAddrs.size(); i++) { auto rc = memcpy_s(swapOutAddrs[i], memSize, embPtr + i * extEmbeddingSize, memSize); if (rc != 0) { @@ -1497,18 +1478,19 @@ void HybridMgmt::EmbeddingUpdateDDR(const EmbTaskInfo& info, const float* embPtr if (MxRec::Logger::GetLevel() <= MxRec::Logger::DEBUG) { string sample; if (!swapOutAddrs.empty()) { - sample = FloatPtrToLimitStr(swapOutAddrs.front(), info.extEmbeddingSize); // print first element + sample = FloatPtrToLimitStr(swapOutAddrs.front(), info.extEmbeddingSize); // print first element } LOG_DEBUG("table:{}, batchId:{}, thread:{}, receive d2hEmb, ext emb:{}, emb size:{}, emb samples:{}, " - "EmbeddingUpdateTC(ms):{}", info.name.c_str(), info.batchId, info.threadIdx, - info.extEmbeddingSize, swapOutAddrs.size(), sample, EmbeddingUpdateTC.ElapsedMS()); + "EmbeddingUpdateTC(ms):{}", + info.name.c_str(), info.batchId, info.threadIdx, info.extEmbeddingSize, swapOutAddrs.size(), sample, + EmbeddingUpdateTC.ElapsedMS()); } lastUpdateFinishStepMap[info.name]++; cvLastUpdateFinishMap[info.name][info.cvNotifyIndex].notify_all(); } -bool HybridMgmt::EmbeddingLookUpDDR(const EmbTaskInfo &info, vector& h2dEmb) +bool HybridMgmt::EmbeddingLookUpDDR(const EmbTaskInfo& info, vector& h2dEmb) { std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { @@ -1537,7 +1519,7 @@ bool HybridMgmt::EmbeddingLookUpDDR(const EmbTaskInfo &info, vector& h2d return true; } -void HybridMgmt::EmbeddingSendDDR(const EmbTaskInfo &info, vector& h2dEmb) +void HybridMgmt::EmbeddingSendDDR(const EmbTaskInfo& info, vector& h2dEmb) { std::unique_lock lastSendFinishLocker(lastSendFinishMutexMap[info.name][info.threadIdx]); cvLastSendFinishMap[info.name][info.threadIdx].wait(lastSendFinishLocker, [info, this] { @@ -1547,10 +1529,11 @@ void HybridMgmt::EmbeddingSendDDR(const EmbTaskInfo &info, vector& h2dEm hdTransfer->Send(TransferChannel::H2D, h2dEmb, TRAIN_CHANNEL_ID, info.name, info.batchId); lastSendFinishStepMap[info.name]++; cvLastSendFinishMap[info.name][info.cvNotifyIndex].notify_all(); - LOG_DEBUG("table:{}, batchId:{}, thread:{}, SendH2DEmbTC(ms):{}", - info.name, info.batchId, info.threadIdx, SendTC.ElapsedMS()); + LOG_DEBUG("table:{}, batchId:{}, thread:{}, SendH2DEmbTC(ms):{}", info.name, info.batchId, info.threadIdx, + SendTC.ElapsedMS()); - // 对于end of sequence场景,key process需要基于h2dNextBatchId等待每个table都完成了最后1个step发送,才能发EOS至各channel + // 对于end of sequence场景,key + // process需要基于h2dNextBatchId等待每个table都完成了最后1个step发送,才能发EOS至各channel hybridMgmtBlock->h2dNextBatchId[info.name]++; LOG_DEBUG("h2dNextBatchId, table:{}, next batchId:{}", info.name, hybridMgmtBlock->h2dNextBatchId[info.name]); } @@ -1603,8 +1586,8 @@ void HybridMgmt::CreateEmbeddingReceiveAndUpdateThread(int index, const EmbInfo& }); } -bool HybridMgmt::EmbeddingReceiveL3Storage(const EmbTaskInfo &info, float *&ptr, - vector &swapOutAddrs, int64_t& dims0) +bool HybridMgmt::EmbeddingReceiveL3Storage(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, + int64_t& dims0) { std::unique_lock 
lastRecvFinishLocker(lastRecvFinishMutexMap[info.name][info.threadIdx]); cvLastRecvFinishMap[info.name][info.threadIdx].wait(lastRecvFinishLocker, [info, this] { @@ -1635,26 +1618,26 @@ bool HybridMgmt::EmbeddingReceiveL3Storage(const EmbTaskInfo &info, float *&ptr, if (aclData == nullptr) { throw runtime_error("Acl get tensor data from dataset failed."); } - ptr = reinterpret_cast(acltdtGetDataAddrFromItem(aclData)); + ptr = reinterpret_cast(acltdtGetDataAddrFromItem(aclData)); // 判断拿到的embedding个数是否与swapOutKeys个数相等 size_t dimNum = acltdtGetDimNumFromItem(aclData); int64_t dims[dimNum]; acltdtGetDimsFromItem(aclData, dims, dimNum); - LOG_DEBUG("table:{}, batchId:{}, recv d2h, dims[0]:{}, swapOutAddrs.size:{}", - info.name, info.batchId, dims[0], swapOutAddrs.size()); + LOG_DEBUG("table:{}, batchId:{}, recv d2h, dims[0]:{}, swapOutAddrs.size:{}", info.name, info.batchId, dims[0], + swapOutAddrs.size()); dims0 = dims[0]; } - LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingRecvTC(ms):{}", - info.name.c_str(), info.batchId, info.threadIdx, EmbeddingRecvTC.ElapsedMS()); + LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingRecvTC(ms):{}", info.name.c_str(), info.batchId, + info.threadIdx, EmbeddingRecvTC.ElapsedMS()); lastRecvFinishStepMap[info.name]++; cvLastRecvFinishMap[info.name][info.cvNotifyIndex].notify_all(); return true; } -void HybridMgmt::EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float *embPtr, - vector& swapOutAddrs, int64_t& dims0) +void HybridMgmt::EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, + int64_t& dims0) { std::unique_lock lastUpdateFinishLocker(lastUpdateFinishMutexMap[info.name][info.threadIdx]); cvLastUpdateFinishMap[info.name][info.threadIdx].wait(lastUpdateFinishLocker, [info, this] { @@ -1669,16 +1652,16 @@ void HybridMgmt::EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float *embPtr uint64_t memSize = info.extEmbeddingSize * sizeof(float); uint64_t extEmbeddingSize = info.extEmbeddingSize; // DDR更新 -# pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ - shared(swapOutAddrs, swapOutDDRAddrOffs, embPtr, extEmbeddingSize, memSize) +#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ + shared(swapOutAddrs, swapOutDDRAddrOffs, embPtr, extEmbeddingSize, memSize) for (uint64_t i = 0; i < swapOutAddrs.size(); i++) { auto rc = memcpy_s(swapOutAddrs[i], memSize, embPtr + swapOutDDRAddrOffs[i] * extEmbeddingSize, memSize); if (rc != 0) { throw runtime_error("memcpy_s failed, error code:" + to_string(rc)); } } - LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingUpdateTC(ms):{}", - info.name.c_str(), info.batchId, info.threadIdx, EmbeddingUpdateTC.ElapsedMS()); + LOG_DEBUG("table:{}, batchId:{}, thread:{}, EmbeddingUpdateTC(ms):{}", info.name.c_str(), info.batchId, + info.threadIdx, EmbeddingUpdateTC.ElapsedMS()); // L3Storage更新 TimeCost L3StorageUpdateTC = TimeCost(); @@ -1693,8 +1676,8 @@ void HybridMgmt::EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float *embPtr } cacheManager->UpdateL3StorageEmb(info.name, embPtr, extEmbeddingSize, swapOutL3StorageKeys, swapOutL3StorageAddrOffs); - LOG_DEBUG("table:{}, batchId:{}, thread{}, L3StorageUpdateTC(ms):{}", - info.name.c_str(), info.batchId, info.threadIdx, L3StorageUpdateTC.ElapsedMS()); + LOG_DEBUG("table:{}, batchId:{}, thread{}, L3StorageUpdateTC(ms):{}", info.name.c_str(), info.batchId, + info.threadIdx, L3StorageUpdateTC.ElapsedMS()); lastUpdateFinishStepMap[info.name]++; 
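    // Advance the per-table update counter before waking waiters: the EmbeddingLookUp* threads block
    // on cvLastUpdateFinishMap with a predicate that re-checks lastUpdateFinishStepMap, so they may
    // proceed only once this batch's update has been recorded.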
cvLastUpdateFinishMap[info.name][info.cvNotifyIndex].notify_all(); @@ -1726,8 +1709,8 @@ bool HybridMgmt::EmbeddingLookUpL3Storage(const EmbTaskInfo& info, vectorTransferDDR2L3Storage(info.name, info.extEmbeddingSize, DDR2L3StorageKeys, DDR2L3StorageAddrs); - LOG_DEBUG("table:{}, thread:{}, transferDDR2L3StorageTC(ms):{}", - info.name.c_str(), info.threadIdx, transferDDR2L3StorageTC.ElapsedMS()); + LOG_DEBUG("table:{}, thread:{}, transferDDR2L3StorageTC(ms):{}", info.name.c_str(), info.threadIdx, + transferDDR2L3StorageTC.ElapsedMS()); TimeCost fetchL3StorageEmb2DDRTC = TimeCost(); // swapInKeys中在L3Storage的挪到DDR @@ -1737,8 +1720,8 @@ bool HybridMgmt::EmbeddingLookUpL3Storage(const EmbTaskInfo& info, vectorFetchL3StorageEmb2DDR(info.name, info.extEmbeddingSize, L3Storage2DDRKeys, L3Storage2DDRAddrs); - LOG_DEBUG("table:{}, thread:{}, fetchL3StorageEmb2DDRTC(ms):{}", - info.name.c_str(), info.threadIdx, fetchL3StorageEmb2DDRTC.ElapsedMS()); + LOG_DEBUG("table:{}, thread:{}, fetchL3StorageEmb2DDRTC(ms):{}", info.name.c_str(), info.threadIdx, + fetchL3StorageEmb2DDRTC.ElapsedMS()); bool isSuccess = BuildH2DEmbedding(info, h2dEmb); if (!isSuccess) { @@ -1763,12 +1746,13 @@ void HybridMgmt::EmbeddingSendL3Storage(const EmbTaskInfo& info, vector& cvLastSendFinishMap[info.name][info.cvNotifyIndex].notify_all(); LOG_DEBUG("table:{}, thread:{}, SendH2DEmbTC(ms):{}", info.name.c_str(), info.threadIdx, SendTC.ElapsedMS()); - // 对于end of sequence场景,key process需要基于h2dNextBatchId等待每个table都完成了最后1个step发送,才能发EOS至各channel + // 对于end of sequence场景,key + // process需要基于h2dNextBatchId等待每个table都完成了最后1个step发送,才能发EOS至各channel hybridMgmtBlock->h2dNextBatchId[info.name]++; LOG_DEBUG("h2dNextBatchId, table:{}, next batchId:{}", info.name, hybridMgmtBlock->h2dNextBatchId[info.name]); } -void HybridMgmt::HandleEosCaseHBM(const string &embName, int batchId, int channelId, bool &remainBatchOut) +void HybridMgmt::HandleEosCaseHBM(const string& embName, int batchId, int channelId, bool& remainBatchOut) { bool sendAllChannel = false; if (channelId == EVAL_CHANNEL_ID) { @@ -1813,19 +1797,19 @@ void HybridMgmt::HandleFirstBatchCaseDDR(const EmbBaseInfo& info, pair, vector>& swapOutKoPair) { TimeCost swapProcessTC; - auto &swapInKeys = swapInKoPair.first; - auto &swapInPos = swapInKoPair.second; - auto &swapOutKeys = swapOutKoPair.first; - auto &swapOutPos = swapOutKoPair.second; + auto& swapInKeys = swapInKoPair.first; + auto& swapInPos = swapInKoPair.second; + auto& swapOutKeys = swapOutKoPair.first; + auto& swapOutPos = swapOutKoPair.second; vector emptySwapOutKeys; - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", - info.name, info.batchId, info.channelId, swapInKoPair.first.size(), emptySwapOutKeys.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, info.batchId, + info.channelId, swapInKoPair.first.size(), emptySwapOutKeys.size()); trainTestSwitchInfoStore[info.name] = {swapOutKeys, swapOutPos}; LOG_DEBUG("handle first batch case, delay sending swapInPos, table:{}", info.name); - LOG_DEBUG("enqueue HBMSwapKeyQue table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", - info.name, info.batchId, info.channelId, swapInKeys.size(), emptySwapOutKeys.size()); + LOG_DEBUG("enqueue HBMSwapKeyQue table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, + info.batchId, info.channelId, swapInKeys.size(), emptySwapOutKeys.size()); HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(emptySwapOutKeys); HBMSwapKeyQue[info.name + 
SWAP_IN_STR].Pushv(swapInKeys); } @@ -1836,8 +1820,8 @@ void HybridMgmt::HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, { // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos vector emptySwapOutKeys; - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", - info.name, info.batchId, info.channelId, swapInKoPair.first.size(), emptySwapOutKeys.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, info.batchId, + info.channelId, swapInKoPair.first.size(), emptySwapOutKeys.size()); trainTestSwitchInfoStore[info.name] = {swapOutKoPair.first, swapOutKoPair.second}; TimeCost ProcessSwapInKeysTC = TimeCost(); @@ -1851,14 +1835,14 @@ void HybridMgmt::HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, vector emptySwapOutL3StorageKeys; vector emptySwapOutL3StorageAddrOff; - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", - info.name, info.batchId, info.channelId, swapInKoPair.first.size(), swapOutKoPair.first.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, info.batchId, + info.channelId, swapInKoPair.first.size(), swapOutKoPair.first.size()); LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapOutDDRKeys.size:{}, swapOutDDRAddrOffs.size:{}, " "swapOutL3StorageKeys.size:{}, swapOutL3StorageAddrOff.size:{}", info.name, info.batchId, info.channelId, emptySwapOutDDRKeys.size(), emptySwapOutDDRAddrOffs.size(), emptySwapOutL3StorageKeys.size(), emptySwapOutL3StorageAddrOff.size()); - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToL3StorageKeys.size:{}, L3StorageToDDRKeys.size:{}", - info.name, info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDRToL3StorageKeys.size:{}, L3StorageToDDRKeys.size:{}", info.name, + info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); auto DDRToL3StorageKeysForL3S = DDRToL3StorageKeys; auto L3StorageToDDRKeysForL3S = L3StorageToDDRKeys; @@ -1879,8 +1863,8 @@ void HybridMgmt::HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, HBMSwapKeyForL3StorageQue[info.name + ADDR_STR].Pushv(emptySwapOutL3StorageAddrOff); } -void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, - vector &swapInKeys, vector &swapOutKeys) +void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, vector& swapInKeys, + vector& swapOutKeys) { TimeCost ProcessSwapInKeysTC; vector L3StorageToDDRKeys; @@ -1893,15 +1877,15 @@ void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, cacheManager->ProcessSwapOutKeys(info.name, swapOutKeys, hbmSwapInfo); LOG_DEBUG("ProcessSwapOutKeysTC(ms):{} ", ProcessSwapOutKeysTC.ElapsedMS()); - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", - info.name, info.batchId, info.channelId, swapInKeys.size(), swapOutKeys.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, info.batchId, + info.channelId, swapInKeys.size(), swapOutKeys.size()); LOG_DEBUG("table:{}, batchId:{}, channelId:{}, swap out, HBM2DDR Keys:{}, HBM2DDR AddrOffs:{}, " "HBM2L3Storage Keys:{}, HBM2L3Storage AddrOff:{}", info.name, info.batchId, info.channelId, hbmSwapInfo.swapOutDDRKeys.size(), hbmSwapInfo.swapOutDDRAddrOffs.size(), hbmSwapInfo.swapOutL3StorageKeys.size(), hbmSwapInfo.swapOutL3StorageAddrOffs.size()); - LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDR2L3Storage Keys:{}, L3Storage2DDR 
Keys:{}", - info.name, info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); + LOG_DEBUG("table:{}, batchId:{}, channelId:{}, DDR2L3Storage Keys:{}, L3Storage2DDR Keys:{}", info.name, + info.batchId, info.channelId, DDRToL3StorageKeys.size(), L3StorageToDDRKeys.size()); auto DDRToL3StorageKeysForL3S = DDRToL3StorageKeys; auto L3StorageToDDRKeysForL3S = L3StorageToDDRKeys; @@ -1922,22 +1906,20 @@ void HybridMgmt::HandleDataSwapForL3Storage(const EmbBaseInfo& info, HBMSwapKeyForL3StorageQue[info.name + ADDR_STR].Pushv(hbmSwapInfo.swapOutL3StorageAddrOffs); } -bool HybridMgmt::BuildH2DEmbedding(const EmbTaskInfo &info, vector &h2dEmb) +bool HybridMgmt::BuildH2DEmbedding(const EmbTaskInfo& info, vector& h2dEmb) { std::vector swapInAddrs = HBMSwapAddrsQue[info.name + SWAP_IN_STR].WaitAndPop(); if (!isRunning) { return false; } - h2dEmb.emplace_back(Tensor(tensorflow::DT_FLOAT, { - int(swapInAddrs.size()), static_cast(info.extEmbeddingSize) - })); - auto &tmpTensor = h2dEmb.back(); - float *h2dEmbAddr = tmpTensor.flat().data(); + h2dEmb.emplace_back( + Tensor(tensorflow::DT_FLOAT, {int(swapInAddrs.size()), static_cast(info.extEmbeddingSize)})); + auto& tmpTensor = h2dEmb.back(); + float* h2dEmbAddr = tmpTensor.flat().data(); TimeCost embeddingLookupTC = TimeCost(); uint64_t memSize = info.extEmbeddingSize * sizeof(float); -# pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) \ - shared(swapInAddrs, h2dEmbAddr, info, memSize) +#pragma omp parallel for num_threads(MGMT_CPY_THREADS) default(none) shared(swapInAddrs, h2dEmbAddr, info, memSize) for (uint64_t i = 0; i < swapInAddrs.size(); i++) { auto rc = memcpy_s(h2dEmbAddr + i * info.extEmbeddingSize, memSize, swapInAddrs[i], memSize); if (rc != 0) { @@ -1951,7 +1933,7 @@ bool HybridMgmt::BuildH2DEmbedding(const EmbTaskInfo &info, vector &h2dE return true; } -vector HybridMgmt::GetUniqueKeys(const EmbBaseInfo &info, bool &remainBatchOut) +vector HybridMgmt::GetUniqueKeys(const EmbBaseInfo& info, bool& remainBatchOut) { bool isEos = false; auto uniqueKeys = KEY_PROCESS_INSTANCE->GetUniqueKeys(info, isEos, lookUpSwapInAddrsPushId); @@ -1961,8 +1943,8 @@ vector HybridMgmt::GetUniqueKeys(const EmbBaseInfo &info, bool &remain } if (uniqueKeys.empty()) { remainBatchOut = false; - LOG_WARN("table:{}, channelId:{} batchId:{}, UniqueKeys result is empty", - info.name, info.channelId, info.batchId); + LOG_WARN("table:{}, channelId:{} batchId:{}, UniqueKeys result is empty", info.name, info.channelId, + info.batchId); return uniqueKeys; } @@ -1971,7 +1953,7 @@ vector HybridMgmt::GetUniqueKeys(const EmbBaseInfo &info, bool &remain trainKeysSet[info.name].insert(uniqueKeys.begin(), uniqueKeys.end()); LOG_DEBUG("table:{}, batchId:{}, KeyMaintainTC(ms):{}", info.name, info.batchId, KeyMaintainTC.ElapsedMS()); } else { - for (auto &key : uniqueKeys) { + for (auto& key : uniqueKeys) { if (trainKeysSet[info.name].find(key) == trainKeysSet[info.name].end()) { key = INVALID_KEY_VALUE; LOG_TRACE("find key not train before, set as invalid key"); @@ -1983,28 +1965,27 @@ vector HybridMgmt::GetUniqueKeys(const EmbBaseInfo &info, bool &remain return uniqueKeys; } -vector HybridMgmt::GetRestoreVecSec(const EmbBaseInfo &info, bool &remainBatchOut) +vector HybridMgmt::GetRestoreVecSec(const EmbBaseInfo& info, bool& remainBatchOut) { auto restoreVecSec = KEY_PROCESS_INSTANCE->GetRestoreVecSec(info); if (restoreVecSec.empty()) { remainBatchOut = false; - LOG_WARN("table:{}, channelId:{} batchId:{}, restoreVecSec result is empty", - 
info.name, info.channelId, info.batchId); + LOG_WARN("table:{}, channelId:{} batchId:{}, restoreVecSec result is empty", info.name, info.channelId, + info.batchId); return restoreVecSec; } LOG_DEBUG("table:{}, channelId:{} batchId:{}, GetRestoreVecSec end", info.name, info.channelId, info.batchId); return restoreVecSec; } -void HybridMgmt::SendAll2AllVec(const EmbBaseInfo &info, bool &remainBatchOut) +void HybridMgmt::SendAll2AllVec(const EmbBaseInfo& info, bool& remainBatchOut) { if (!mgmtRankInfo.useStatic) { bool isEos = false; // useless, adapt to HBM mode TimeCost getAll2AllTC; - unique_ptr> all2all = KEY_PROCESS_INSTANCE->GetInfoVec( - info, ProcessedInfo::ALL2ALL, isEos); - LOG_DEBUG("table:{}, channelId:{}, batchId:{}, GetInfoVec all2all end, GetAll2AllTC(ms):{}", - info.name, info.channelId, info.batchId, getAll2AllTC.ElapsedMS()); + unique_ptr> all2all = KEY_PROCESS_INSTANCE->GetInfoVec(info, ProcessedInfo::ALL2ALL, isEos); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, GetInfoVec all2all end, GetAll2AllTC(ms):{}", info.name, + info.channelId, info.batchId, getAll2AllTC.ElapsedMS()); if (all2all == nullptr) { remainBatchOut = false; LOG_WARN("Information vector is nullptr!"); @@ -2012,17 +1993,16 @@ void HybridMgmt::SendAll2AllVec(const EmbBaseInfo &info, bool &remainBatchOut) } TimeCost sendAll2AllTC; hdTransfer->Send(TransferChannel::ALL2ALL, *all2all, info.channelId, info.name); - LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send all2all end, sendAll2AllTC(ms):{}", - info.name, info.channelId, info.batchId, sendAll2AllTC.ElapsedMS()); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send all2all end, sendAll2AllTC(ms):{}", info.name, + info.channelId, info.batchId, sendAll2AllTC.ElapsedMS()); } } -void HybridMgmt::SendRestoreVec(const EmbBaseInfo &info, bool &remainBatchOut) +void HybridMgmt::SendRestoreVec(const EmbBaseInfo& info, bool& remainBatchOut) { bool isEos = false; // useless, adapt to HBM mode TimeCost getRestoreTC; - unique_ptr> infoVecs = KEY_PROCESS_INSTANCE->GetInfoVec( - info, ProcessedInfo::RESTORE, isEos); + unique_ptr> infoVecs = KEY_PROCESS_INSTANCE->GetInfoVec(info, ProcessedInfo::RESTORE, isEos); if (infoVecs == nullptr) { remainBatchOut = false; if (isRunning) { @@ -2030,66 +2010,67 @@ void HybridMgmt::SendRestoreVec(const EmbBaseInfo &info, bool &remainBatchOut) } return; } - LOG_DEBUG("table:{}, channelId:{}, batchId:{}, get restore end, getRestoreTC(ms):{}", - info.name, info.channelId, info.batchId, getRestoreTC.ElapsedMS()); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, get restore end, getRestoreTC(ms):{}", info.name, info.channelId, + info.batchId, getRestoreTC.ElapsedMS()); TimeCost sendRestoreSyncTC; hdTransfer->Send(TransferChannel::RESTORE, *infoVecs, info.channelId, info.name); - LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send restore end, sendRestoreSyncTC(ms):{}", - info.name, info.channelId, info.batchId, sendRestoreSyncTC.ElapsedMS()); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send restore end, sendRestoreSyncTC(ms):{}", info.name, + info.channelId, info.batchId, sendRestoreSyncTC.ElapsedMS()); } -void HybridMgmt::SendLookupOffsets(const EmbBaseInfo &info, - vector &uniqueKeys, vector &restoreVecSec) +void HybridMgmt::SendLookupOffsets(const EmbBaseInfo& info, vector& uniqueKeys, + vector& restoreVecSec) { // uniqueKeys already transfer to offset in GetSwapPairsAndKey2Offset // graph will filter out invalid offset(-1). 
see function _set_specific_value_for_non_valid_key TimeCost sendLookupOffsetsTC; std::vector lookupOffsets; - for (const auto &index : restoreVecSec) { + for (const auto& index : restoreVecSec) { if (index == INVALID_INDEX_VALUE) { lookupOffsets.emplace_back(static_cast(INVALID_KEY_VALUE)); continue; } lookupOffsets.emplace_back(uniqueKeys[index]); } - hdTransfer->Send(TransferChannel::LOOKUP, { Vec2TensorI32(lookupOffsets) }, info.channelId, info.name); - LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send lookupOffset, sendLookupOffsetsTC(ms):{}", - info.name, info.channelId, info.batchId, sendLookupOffsetsTC.ElapsedMS()); + hdTransfer->Send(TransferChannel::LOOKUP, {Vec2TensorI32(lookupOffsets)}, info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, send lookupOffset, sendLookupOffsetsTC(ms):{}", info.name, + info.channelId, info.batchId, sendLookupOffsetsTC.ElapsedMS()); } -void HybridMgmt::SendGlobalUniqueVec(const EmbBaseInfo &info, - vector &uniqueKeys, vector &restoreVecSec) +void HybridMgmt::SendGlobalUniqueVec(const EmbBaseInfo& info, vector& uniqueKeys, + vector& restoreVecSec) { if (!(info.channelId == TRAIN_CHANNEL_ID && mgmtRankInfo.useSumSameIdGradients)) { return; } TimeCost sendUniqueKeysSyncTC; - hdTransfer->Send(TransferChannel::UNIQKEYS, {mgmtRankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) : - Vec2TensorI32(uniqueKeys) }, info.channelId, info.name); - LOG_DEBUG("table:{}, channelId:{}, batchId:{}, sendUniqueKeysSyncTC(ms):{}", - info.name, info.channelId, info.batchId, sendUniqueKeysSyncTC.ElapsedMS()); + hdTransfer->Send(TransferChannel::UNIQKEYS, + {mgmtRankInfo.useDynamicExpansion ? Vec2TensorI64(uniqueKeys) : Vec2TensorI32(uniqueKeys)}, + info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, sendUniqueKeysSyncTC(ms):{}", info.name, info.channelId, + info.batchId, sendUniqueKeysSyncTC.ElapsedMS()); TimeCost sendRestoreVecSecSyncTC; - hdTransfer->Send(TransferChannel::RESTORE_SECOND, {Vec2TensorI32(restoreVecSec) }, info.channelId, info.name); - LOG_DEBUG("table:{}, channelId:{}, batchId:{}, sendRestoreVecSecSyncTC(ms):{}", - info.name, info.channelId, info.batchId, sendRestoreVecSecSyncTC.ElapsedMS()); + hdTransfer->Send(TransferChannel::RESTORE_SECOND, {Vec2TensorI32(restoreVecSec)}, info.channelId, info.name); + LOG_DEBUG("table:{}, channelId:{}, batchId:{}, sendRestoreVecSecSyncTC(ms):{}", info.name, info.channelId, + info.batchId, sendRestoreVecSecSyncTC.ElapsedMS()); } -bool HybridMgmt::HandleSpecialProcessStatusDDR(const EmbBaseInfo &info, TimeCost& getAndSendTensorsTC, - pair, vector> &swapInKoPair, - pair, vector> &swapOutKoPair) +bool HybridMgmt::HandleSpecialProcessStatusDDR(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, + pair, vector>& swapInKoPair, + pair, vector>& swapOutKoPair) { TimeCost swapProcessTC; - auto &swapInPos = swapInKoPair.second; - auto &swapOutKeys = swapOutKoPair.first; - auto &swapOutPos = swapOutKoPair.second; + auto& swapInPos = swapInKoPair.second; + auto& swapOutKeys = swapOutKoPair.first; + auto& swapOutPos = swapOutKoPair.second; if (specialProcessStatus[info.name] == ProcessStatus::AFTER_SWITCH_FIRST_BATCH) { // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos HandleFirstBatchCaseDDR(info, swapInKoPair, swapOutKoPair); - LOG_DEBUG("handle channel switch case:afterSwitchFirstBatch, table:{}, channelId:{}, batchId:{}", - info.name, info.channelId, info.batchId); + LOG_DEBUG("handle channel switch case:afterSwitchFirstBatch, table:{}, channelId:{}, 
batchId:{}", info.name, + info.channelId, info.batchId); if (mgmtRankInfo.ctrlSteps[info.channelId] == 1) { vector emptySwapOutPos; @@ -2110,32 +2091,33 @@ bool HybridMgmt::HandleSpecialProcessStatusDDR(const EmbBaseInfo &info, TimeCost swapOutKeys.insert(swapOutKeys.end(), tempStore[0].begin(), tempStore[0].end()); swapOutPos.insert(swapOutPos.end(), tempStore[1].begin(), tempStore[1].end()); specialProcessStatus[info.name] = ProcessStatus::NORMAL; - LOG_DEBUG("handle channel switch case:afterSwitchSecondBatch, table:{}, channelId:{}, batchId:{}", - info.name, info.channelId, info.batchId); + LOG_DEBUG("handle channel switch case:afterSwitchSecondBatch, table:{}, channelId:{}, batchId:{}", info.name, + info.channelId, info.batchId); } return false; } -bool HybridMgmt::HandleSpecialProcessStatusL3Storage(const EmbBaseInfo &info, TimeCost &getAndSendTensorsTC, - pair, vector> &swapInKoPair, - pair, vector> &swapOutKoPair) +bool HybridMgmt::HandleSpecialProcessStatusL3Storage(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, + pair, vector>& swapInKoPair, + pair, vector>& swapOutKoPair) { TimeCost swapProcessTC; - auto &swapInPos = swapInKoPair.second; - auto &swapOutKeys = swapOutKoPair.first; - auto &swapOutPos = swapOutKoPair.second; + auto& swapInPos = swapInKoPair.second; + auto& swapOutKeys = swapOutKoPair.first; + auto& swapOutPos = swapOutKoPair.second; if (specialProcessStatus[info.name] == ProcessStatus::AFTER_SWITCH_FIRST_BATCH) { // 发现train、save、eval切换,先保存状态,发emptySwapOutKeys以对应上一步的emptySwapOutPos HandleFirstBatchCaseL3Storage(info, swapInKoPair, swapOutKoPair); - LOG_DEBUG("handle channel switch case:afterSwitchFirstBatch, table:{}, channelId:{}, batchId:{}", - info.name, info.channelId, info.batchId); + LOG_DEBUG("handle channel switch case:afterSwitchFirstBatch, table:{}, channelId:{}, batchId:{}", info.name, + info.channelId, info.batchId); if (mgmtRankInfo.ctrlSteps[info.channelId] == 1) { vector emptySwapOutPos; SendTensorForSwap(info, swapInPos, emptySwapOutPos); LOG_DEBUG("ProcessEmbInfoL3Storage special case, user only run one step, " - "table:{}, channelId:{}, batchId:{}", info.name, info.channelId, info.batchId); + "table:{}, channelId:{}, batchId:{}", + info.name, info.channelId, info.batchId); } specialProcessStatus[info.name] = ProcessStatus::AFTER_SWITCH_SECOND_BATCH; @@ -2149,13 +2131,12 @@ bool HybridMgmt::HandleSpecialProcessStatusL3Storage(const EmbBaseInfo &info, Ti swapOutKeys.insert(swapOutKeys.end(), tempStore[0].begin(), tempStore[0].end()); swapOutPos.insert(swapOutPos.end(), tempStore[1].begin(), tempStore[1].end()); specialProcessStatus[info.name] = ProcessStatus::NORMAL; - LOG_DEBUG("handle channel switch case:afterSwitchSecondBatch, table:{}, channelId:{}, batchId:{}", - info.name, info.channelId, info.batchId); + LOG_DEBUG("handle channel switch case:afterSwitchSecondBatch, table:{}, channelId:{}, batchId:{}", info.name, + info.channelId, info.batchId); } return false; } - void HybridMgmt::CheckLookupAddrSuccessDDR() { if (!lookupAddrSuccess) { @@ -2169,20 +2150,19 @@ void HybridMgmt::CheckLookupAddrSuccessDDR() } } - -void HybridMgmt::GetSwapPairsAndKey2Offset(const EmbBaseInfo &info, vector &uniqueKeys, - pair, vector> &swapInKoPair, - pair, vector> &swapOutKoPair) +void HybridMgmt::GetSwapPairsAndKey2Offset(const EmbBaseInfo& info, vector& uniqueKeys, + pair, vector>& swapInKoPair, + pair, vector>& swapOutKoPair) { TimeCost GetSwapPairsAndKey2OffsetTC; int swapInCode = embCache->GetSwapPairsAndKey2Offset(info.name, uniqueKeys, swapInKoPair, 
swapOutKoPair); if (swapInCode != H_OK) { - string errMsg = StringFormat("table:%s, GetSwapPairsAndKey2Offset failed! error code:%d", - info.name.c_str(), swapInCode); + string errMsg = + StringFormat("table:%s, GetSwapPairsAndKey2Offset failed! error code:%d", info.name.c_str(), swapInCode); throw runtime_error(errMsg); } - LOG_DEBUG("table:{}, channel:{}, batchId:{}, GetSwapPairsAndKey2OffsetTC(ms):{}", - info.name, info.channelId, info.batchId, GetSwapPairsAndKey2OffsetTC.ElapsedMS()); + LOG_DEBUG("table:{}, channel:{}, batchId:{}, GetSwapPairsAndKey2OffsetTC(ms):{}", info.name, info.channelId, + info.batchId, GetSwapPairsAndKey2OffsetTC.ElapsedMS()); LOG_DEBUG("table:{}, channel:{}, batchId:{}, swapIn keys:{}, swapIn pos:{}, swapOut keys:{}, swapOut pos:{}", info.name, info.channelId, info.batchId, VectorToString(swapInKoPair.first), @@ -2190,15 +2170,14 @@ void HybridMgmt::GetSwapPairsAndKey2Offset(const EmbBaseInfo &info, vector, vector>& swapInKoPair, +void HybridMgmt::EnqueueSwapInfo(const EmbBaseInfo& info, pair, vector>& swapInKoPair, pair, vector>& swapOutKoPair) { - auto &swapInKeys = swapInKoPair.first; - auto &swapOutKeys = swapOutKoPair.first; + auto& swapInKeys = swapInKoPair.first; + auto& swapOutKeys = swapOutKoPair.first; - LOG_DEBUG("enqueue HBMSwapKeyQue table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", - info.name, info.batchId, info.channelId, swapInKeys.size(), swapOutKeys.size()); + LOG_DEBUG("enqueue HBMSwapKeyQue table:{}, batchId:{}, channelId:{}, swapInSize:{}, swapOutSize:{}", info.name, + info.batchId, info.channelId, swapInKeys.size(), swapOutKeys.size()); HBMSwapKeyQue[info.name + SWAP_OUT_STR].Pushv(swapOutKeys); HBMSwapKeyQue[info.name + SWAP_IN_STR].Pushv(swapInKeys); @@ -2208,7 +2187,7 @@ void HybridMgmt::EnqueueSwapInfo(const EmbBaseInfo &info, bool HybridMgmt::IsTrainAndEvalCase() { bool isChannelSwitchCase = false; - for (auto& i: mgmtEmbInfo) { + for (auto& i : mgmtEmbInfo) { if (specialProcessStatus[i.name] == ProcessStatus::AFTER_SWITCH_FIRST_BATCH) { isChannelSwitchCase = true; break; diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index f5897861..ab34b19f 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -17,308 +17,300 @@ See the License for the specific language governing permissions and #define MX_REC_EMB_MGMT_H #include -#include #include #include +#include #include "absl/container/flat_hash_map.h" - +#include "emb_table/embedding_table.h" +#include "hd_transfer/hd_transfer.h" +#include "hybrid_mgmt_block.h" +#include "l3_storage/cache_manager.h" +#include "ock_ctr_common/include/embedding_cache.h" +#include "ock_ctr_common/include/error_code.h" +#include "ock_ctr_common/include/factory.h" #include "utils/common.h" #include "utils/config.h" #include "utils/singleton.h" #include "utils/task_queue.h" #include "utils/time_cost.h" -#include "ock_ctr_common/include/factory.h" -#include "ock_ctr_common/include/embedding_cache.h" -#include "ock_ctr_common/include/error_code.h" - -#include "hd_transfer/hd_transfer.h" -#include "l3_storage/cache_manager.h" -#include "hybrid_mgmt_block.h" -#include "emb_table/embedding_table.h" namespace MxRec { - using namespace std; - using namespace tensorflow; - using namespace Common; - - enum class TaskType { - HBM, - DDR - }; - - enum class ProcessStatus { - NORMAL, - AFTER_SWITCH_FIRST_BATCH, - AFTER_SWITCH_SECOND_BATCH - }; - - inline string ProcessStatus2Str(ProcessStatus s) +using namespace std; +using namespace 
tensorflow; +using namespace Common; + +enum class TaskType { + HBM, + DDR +}; + +enum class ProcessStatus { + NORMAL, + AFTER_SWITCH_FIRST_BATCH, + AFTER_SWITCH_SECOND_BATCH +}; + +inline string ProcessStatus2Str(ProcessStatus s) +{ + switch (s) { + case ProcessStatus::NORMAL: + return "normal"; + case ProcessStatus::AFTER_SWITCH_FIRST_BATCH: + return "afterSwitchFirstBatch"; + case ProcessStatus::AFTER_SWITCH_SECOND_BATCH: + return "afterSwitchSecondBatch"; + default: + throw std::invalid_argument("Invalid ProcessStatus"); + } +}; + +struct EmbTaskInfo { + int batchId; + int threadIdx; + int cvNotifyIndex; + int extEmbeddingSize; + string name; +}; + +class HybridMgmt { +public: + HybridMgmt() = default; + + ~HybridMgmt() { - switch (s) { - case ProcessStatus::NORMAL: - return "normal"; - case ProcessStatus::AFTER_SWITCH_FIRST_BATCH: - return "afterSwitchFirstBatch"; - case ProcessStatus::AFTER_SWITCH_SECOND_BATCH: - return "afterSwitchSecondBatch"; - default: - throw std::invalid_argument("Invalid ProcessStatus"); - } - }; - - struct EmbTaskInfo { - int batchId; - int threadIdx; - int cvNotifyIndex; - int extEmbeddingSize; - string name; - }; - - class HybridMgmt { - public: - HybridMgmt() = default; - - ~HybridMgmt() - { - if (isRunning) { - Destroy(); - } + if (isRunning) { + Destroy(); } + } - HybridMgmt(const HybridMgmt&) = delete; + HybridMgmt(const HybridMgmt&) = delete; - HybridMgmt& operator=(const HybridMgmt&) = delete; + HybridMgmt& operator=(const HybridMgmt&) = delete; - bool Initialize(RankInfo rankInfo, const vector& embInfos, int seed, - const vector& thresholdValues, bool ifLoad); + bool Initialize(RankInfo rankInfo, const vector& embInfos, int seed, + const vector& thresholdValues, bool ifLoad); - void Save(const string& savePath); + void Save(const string& savePath); - bool Load(const string& loadPath, vector warmStartTables); + bool Load(const string& loadPath, vector warmStartTables); - OffsetT SendHostMap(const string tableName); + OffsetT SendHostMap(const string tableName); - OffsetT SendLoadMap(const string tableName); + OffsetT SendLoadMap(const string tableName); - void ReceiveHostMap(AllKeyOffsetMapT receiveKeyOffsetMap); + void ReceiveHostMap(AllKeyOffsetMapT receiveKeyOffsetMap); - void Start(); + void Start(); - void StartThreadForHBM(); + void StartThreadForHBM(); - void StartThreadForDDR(); + void StartThreadForDDR(); - void Destroy(); + void Destroy(); - bool ParseKeys(int channelId, int& batchId, TaskType type); + bool ParseKeys(int channelId, int& batchId, TaskType type); - bool Evict(); + bool Evict(); - void NotifyBySessionRun(int channelID) const; + void NotifyBySessionRun(int channelID) const; - void CountStepBySessionRun(int channelID, int steps) const; + void CountStepBySessionRun(int channelID, int steps) const; - int64_t GetTableSize(const string& embName) const; + int64_t GetTableSize(const string& embName) const; - int64_t GetTableCapacity(const string& embName) const; + int64_t GetTableCapacity(const string& embName) const; - void SetOptimizerInfo(const string& embName, OptimizerInfo optimInfo) const; + void SetOptimizerInfo(const string& embName, OptimizerInfo optimInfo) const; - void FetchDeviceEmb(); + void FetchDeviceEmb(); - void ProcessEmbInfoHBM(const EmbBaseInfo& info, bool& remainBatchOut, bool isGrad); + void ProcessEmbInfoHBM(const EmbBaseInfo& info, bool& remainBatchOut, bool isGrad); - void ProcessEmbInfoDDR(const EmbBaseInfo& info, bool& remainBatchOut); + void ProcessEmbInfoDDR(const EmbBaseInfo& info, bool& remainBatchOut); - 
void ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBatchOut); + void ProcessEmbInfoL3Storage(const EmbBaseInfo& info, bool& remainBatchOut); - GTEST_PRIVATE: - bool mutexDestroy { false }; - std::mutex lookUpAndSendBatchIdMtx; - std::mutex receiveAndUpdateBatchIdMtx; - std::map lookUpAndSendTableBatchMap; - std::map receiveAndUpdateTableBatchMap; + GTEST_PRIVATE : bool mutexDestroy{false}; + std::mutex lookUpAndSendBatchIdMtx; + std::mutex receiveAndUpdateBatchIdMtx; + std::map lookUpAndSendTableBatchMap; + std::map receiveAndUpdateTableBatchMap; - std::map> lastUpdateFinishMutexMap; - std::map> cvLastUpdateFinishMap; - std::map lastUpdateFinishStepMap; - std::map> lastLookUpFinishMutexMap; - std::map> cvLastLookUpFinishMap; - std::map lastLookUpFinishStepMap; - std::map> lastSendFinishMutexMap; - std::map> cvLastSendFinishMap; - std::map lastSendFinishStepMap; - std::map> lastRecvFinishMutexMap; - std::map> cvLastRecvFinishMap; - std::map lastRecvFinishStepMap; + std::map> lastUpdateFinishMutexMap; + std::map> cvLastUpdateFinishMap; + std::map lastUpdateFinishStepMap; + std::map> lastLookUpFinishMutexMap; + std::map> cvLastLookUpFinishMap; + std::map lastLookUpFinishStepMap; + std::map> lastSendFinishMutexMap; + std::map> cvLastSendFinishMap; + std::map lastSendFinishStepMap; + std::map> lastRecvFinishMutexMap; + std::map> cvLastRecvFinishMap; + std::map lastRecvFinishStepMap; - std::vector EmbeddingLookUpAndSendThreadPool; - std::vector EmbeddingReceiveAndUpdateThreadPool; - std::vector> lookUpSwapOutAddrsThreads; - std::vector> lookUpSwapInAddrsThreads; + std::vector EmbeddingLookUpAndSendThreadPool; + std::vector EmbeddingReceiveAndUpdateThreadPool; + std::vector> lookUpSwapOutAddrsThreads; + std::vector> lookUpSwapInAddrsThreads; - std::map>> HBMSwapKeyQue; - std::map>> HBMSwapKeyForL3StorageQue; - std::map>> DDRSwapKeyQue; - std::map>> DDRSwapKeyForL3StorageQue; - std::map>> HBMSwapAddrsQue; - std::map>> DDRSwapAddrsQue; + std::map>> HBMSwapKeyQue; + std::map>> HBMSwapKeyForL3StorageQue; + std::map>> DDRSwapKeyQue; + std::map>> DDRSwapKeyForL3StorageQue; + std::map>> HBMSwapAddrsQue; + std::map>> DDRSwapAddrsQue; - std::mutex evictMut; + std::mutex evictMut; - std::map> trainKeysSet; - const string SWAP_IN_STR = "SwapIn"; - const string SWAP_OUT_STR = "SwapOut"; + std::map> trainKeysSet; + const string SWAP_IN_STR = "SwapIn"; + const string SWAP_OUT_STR = "SwapOut"; - const string ADDR_STR = "Addr"; - ock::ctr::EmbCacheManagerPtr embCache = nullptr; - std::map> lastSwapInPosMap {}; - std::map>> trainTestSwitchInfoStore {}; - std::atomic lookupAddrSuccess {true}; + const string ADDR_STR = "Addr"; + ock::ctr::EmbCacheManagerPtr embCache = nullptr; + std::map> lastSwapInPosMap{}; + std::map>> trainTestSwitchInfoStore{}; + std::atomic lookupAddrSuccess{true}; - std::mutex saveMutex; - std::condition_variable cvCheckSave; + std::mutex saveMutex; + std::condition_variable cvCheckSave; - void SetFeatureTypeForLoad(vector& loadFeatures); + void SetFeatureTypeForLoad(vector& loadFeatures); - void EvictKeys(const string& embName, const vector& keys); + void EvictKeys(const string& embName, const vector& keys); - void InitRankInfo(RankInfo& rankInfo, const vector& embInfos) const; + void InitRankInfo(RankInfo& rankInfo, const vector& embInfos) const; - void EvictL3StorageKeys(const string& embName, const vector& keys) const; + void EvictL3StorageKeys(const string& embName, const vector& keys) const; - void LookUpAndRemoveAddrs(const EmbTaskInfo &info); // L3Storage, synchronous + 
void LookUpAndRemoveAddrs(const EmbTaskInfo& info); // L3Storage, synchronous - void LookUpSwapAddrs(const std::string &embName, const std::string &swapStr); // DDR, asynchronous + void LookUpSwapAddrs(const std::string& embName, const std::string& swapStr); // DDR, asynchronous - void EmbeddingTask(); + void EmbeddingTask(); - void MultiThreadEmbHDTransWrap(); + void MultiThreadEmbHDTransWrap(); - void EmbeddingLookUpAndSendDDR(int batchId, int index, const EmbInfo& embInfo); + void EmbeddingLookUpAndSendDDR(int batchId, int index, const EmbInfo& embInfo); - void EmbeddingReceiveAndUpdateDDR(int batchId, int index, const EmbInfo& embInfo); + void EmbeddingReceiveAndUpdateDDR(int batchId, int index, const EmbInfo& embInfo); - void EmbeddingLookUpAndSendL3Storage(int batchId, int index, const EmbInfo& embInfo); + void EmbeddingLookUpAndSendL3Storage(int batchId, int index, const EmbInfo& embInfo); - void EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, const EmbInfo& embInfo); + void EmbeddingReceiveAndUpdateL3Storage(int batchId, int index, const EmbInfo& embInfo); - void SendTensorForSwap(const EmbBaseInfo& info, - const vector &swapInPosUint, - const vector &swapOutPosUint); + void SendTensorForSwap(const EmbBaseInfo& info, const vector& swapInPosUint, + const vector& swapOutPosUint); - private: - HybridMgmtBlock* hybridMgmtBlock; - vector mgmtEmbInfo; - RankInfo mgmtRankInfo; - CacheManager* cacheManager; - vector> procThreads {}; - map> evictKeyMap {}; - HDTransfer *hdTransfer; - OffsetMapT offsetMapToSend; - OffsetMapT loadOffsetToSend; - bool isL3StorageEnabled { false }; - bool isRunning; - bool isLoad { false }; - bool isInitialized { false }; - bool alreadyTrainOnce = false; // 用于判断是否为predict模式 - map lookUpSwapInAddrsPushId; // 用于处理eos场景,当消费者追上生产者且长时间无上游数据,会触发eos - map specialProcessStatus; +private: + HybridMgmtBlock* hybridMgmtBlock; + vector mgmtEmbInfo; + RankInfo mgmtRankInfo; + CacheManager* cacheManager; + vector> procThreads{}; + map> evictKeyMap{}; + HDTransfer* hdTransfer; + OffsetMapT offsetMapToSend; + OffsetMapT loadOffsetToSend; + bool isL3StorageEnabled{false}; + bool isRunning; + bool isLoad{false}; + bool isInitialized{false}; + bool alreadyTrainOnce = false; // 用于判断是否为predict模式 + map lookUpSwapInAddrsPushId; // 用于处理eos场景,当消费者追上生产者且长时间无上游数据,会触发eos + map specialProcessStatus; - void TrainTask(TaskType type); + void TrainTask(TaskType type); - void EvalTask(TaskType type); + void EvalTask(TaskType type); - void SendUniqKeysAndRestoreVecHBM(const EmbBaseInfo &info, - const unique_ptr> &infoVecs, bool isGrad) const; + void SendUniqKeysAndRestoreVecHBM(const EmbBaseInfo& info, const unique_ptr>& infoVecs, + bool isGrad) const; - void HandleEndBatchCase(const EmbBaseInfo& info, vector& swapInPos); + void HandleEndBatchCase(const EmbBaseInfo& info, vector& swapInPos); - bool IsTrainEndBatch(int batchId) const; + bool IsTrainEndBatch(int batchId) const; - bool IsEvalEndBatch(int batchId) const; + bool IsEvalEndBatch(int batchId) const; - void InitEmbeddingCache(const vector& embInfos); + void InitEmbeddingCache(const vector& embInfos); - void InitDataPipelineForDDR(const string &embName); + void InitDataPipelineForDDR(const string& embName); - void InitDataPipelineForL3Storage(const string &embName, int extEmbeddingSize); + void InitDataPipelineForL3Storage(const string& embName, int extEmbeddingSize); - void JoinEmbeddingCacheThread(); + void JoinEmbeddingCacheThread(); - void HandleReachMaxStepCase(const EmbBaseInfo& info, bool& remainBatchOut); + void 
HandleReachMaxStepCase(const EmbBaseInfo& info, bool& remainBatchOut); - void HandleEosCase(const EmbBaseInfo& info, bool& remainBatchOut); + void HandleEosCase(const EmbBaseInfo& info, bool& remainBatchOut); - void HandleEosCaseHBM(const string& embName, int batchId, int channelId, bool& remainBatchOut); + void HandleEosCaseHBM(const string& embName, int batchId, int channelId, bool& remainBatchOut); - bool EmbeddingReceiveDDR(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs); + bool EmbeddingReceiveDDR(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs); - void EmbeddingUpdateDDR(const EmbTaskInfo& info, const float* embPtr, vector& swapOutAddrs); + void EmbeddingUpdateDDR(const EmbTaskInfo& info, const float* embPtr, vector& swapOutAddrs); - bool EmbeddingLookUpDDR(const EmbTaskInfo& info, vector& h2dEmb); + bool EmbeddingLookUpDDR(const EmbTaskInfo& info, vector& h2dEmb); - void EmbeddingSendDDR(const EmbTaskInfo& info, vector& h2dEmb); + void EmbeddingSendDDR(const EmbTaskInfo& info, vector& h2dEmb); - bool EmbeddingReceiveL3Storage(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, - int64_t& dims0); + bool EmbeddingReceiveL3Storage(const EmbTaskInfo& info, float*& ptr, vector& swapOutAddrs, int64_t& dims0); - void EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, - int64_t& dims0); + void EmbeddingUpdateL3Storage(const EmbTaskInfo& info, float* embPtr, vector& swapOutAddrs, int64_t& dims0); - bool EmbeddingLookUpL3Storage(const EmbTaskInfo& info, vector& h2dEmb); + bool EmbeddingLookUpL3Storage(const EmbTaskInfo& info, vector& h2dEmb); - void EmbeddingSendL3Storage(const EmbTaskInfo& info, vector& h2dEmb); + void EmbeddingSendL3Storage(const EmbTaskInfo& info, vector& h2dEmb); - void CreateEmbeddingLookUpAndSendThread(int index, const EmbInfo& embInfo); + void CreateEmbeddingLookUpAndSendThread(int index, const EmbInfo& embInfo); - void CreateEmbeddingReceiveAndUpdateThread(int index, const EmbInfo& embInfo); + void CreateEmbeddingReceiveAndUpdateThread(int index, const EmbInfo& embInfo); - void HandleFirstBatchCaseDDR(const EmbBaseInfo& info, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + void HandleFirstBatchCaseDDR(const EmbBaseInfo& info, std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - void HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + void HandleFirstBatchCaseL3Storage(const EmbBaseInfo& info, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - void HandleDataSwapForL3Storage(const EmbBaseInfo& info, - vector &swapInKeys, vector &swapOutKeys); + void HandleDataSwapForL3Storage(const EmbBaseInfo& info, vector& swapInKeys, + vector& swapOutKeys); - bool BuildH2DEmbedding(const EmbTaskInfo& info, vector& h2dEmb); + bool BuildH2DEmbedding(const EmbTaskInfo& info, vector& h2dEmb); - vector GetUniqueKeys(const EmbBaseInfo& info, bool& remainBatchOut); + vector GetUniqueKeys(const EmbBaseInfo& info, bool& remainBatchOut); - vector GetRestoreVecSec(const EmbBaseInfo& info, bool& remainBatchOut); + vector GetRestoreVecSec(const EmbBaseInfo& info, bool& remainBatchOut); - void SendAll2AllVec(const EmbBaseInfo& info, bool& remainBatchOut); + void SendAll2AllVec(const EmbBaseInfo& info, bool& remainBatchOut); - void SendRestoreVec(const EmbBaseInfo& info, bool& remainBatchOut); + void SendRestoreVec(const EmbBaseInfo& info, bool& remainBatchOut); - void 
SendLookupOffsets(const EmbBaseInfo& info, vector& uniqueKeys, vector& restoreVecSec); + void SendLookupOffsets(const EmbBaseInfo& info, vector& uniqueKeys, vector& restoreVecSec); - void SendGlobalUniqueVec(const EmbBaseInfo& info, vector& uniqueKeys, vector& restoreVecSec); + void SendGlobalUniqueVec(const EmbBaseInfo& info, vector& uniqueKeys, vector& restoreVecSec); - bool HandleSpecialProcessStatusDDR(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + bool HandleSpecialProcessStatusDDR(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - bool HandleSpecialProcessStatusL3Storage(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + bool HandleSpecialProcessStatusL3Storage(const EmbBaseInfo& info, TimeCost& getAndSendTensorsTC, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - void CheckLookupAddrSuccessDDR(); + void CheckLookupAddrSuccessDDR(); - void GetSwapPairsAndKey2Offset(const EmbBaseInfo& info, vector &uniqueKeys, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + void GetSwapPairsAndKey2Offset(const EmbBaseInfo& info, vector& uniqueKeys, + std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - void EnqueueSwapInfo(const EmbBaseInfo& info, - std::pair, vector>& swapInKoPair, - std::pair, vector>& swapOutKoPair); + void EnqueueSwapInfo(const EmbBaseInfo& info, std::pair, vector>& swapInKoPair, + std::pair, vector>& swapOutKoPair); - bool IsTrainAndEvalCase(); - }; -} -#endif // MX_REC_EMB_MGMT_H + bool IsTrainAndEvalCase(); +}; +} // namespace MxRec +#endif // MX_REC_EMB_MGMT_H -- Gitee From ade2d1089abee6af3a3e4ef09313c5b1d5522bd7 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Fri, 5 Jul 2024 13:21:35 +0000 Subject: [PATCH 264/302] =?UTF-8?q?!209=20=E3=80=90FIX=E3=80=91=E5=A4=9A?= =?UTF-8?q?=E6=9C=BA=E8=AE=AD=E7=BB=83=E6=95=B0=E6=8D=AE=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=E9=80=82=E9=85=8D=20*=20=E3=80=90FIX?= =?UTF-8?q?=E3=80=91=E5=A4=9A=E6=9C=BA=E8=AE=AD=E7=BB=83=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E4=BF=9D=E5=AD=98=E5=8A=A0=E8=BD=BD=E9=80=82=E9=85=8D+hdfs=20*?= =?UTF-8?q?=20=E3=80=90FIX=E3=80=91=E5=A4=9A=E6=9C=BA=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E4=BF=9D=E5=AD=98=E5=8A=A0=E8=BD=BD=E9=80=82?= =?UTF-8?q?=E9=85=8D+hdfs=20*=20=E3=80=90FIX=E3=80=91=E5=A4=9A=E6=9C=BA?= =?UTF-8?q?=E8=AE=AD=E7=BB=83=E6=95=B0=E6=8D=AE=E4=BF=9D=E5=AD=98=E5=8A=A0?= =?UTF-8?q?=E8=BD=BD=E9=80=82=E9=85=8D=20*=20=E3=80=90FIX=E3=80=91?= =?UTF-8?q?=E5=A4=9A=E6=9C=BA=E8=AE=AD=E7=BB=83=E6=95=B0=E6=8D=AE=E4=BF=9D?= =?UTF-8?q?=E5=AD=98=E5=8A=A0=E8=BD=BD=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/patch.py | 8 +++++--- mx_rec/saver/saver.py | 27 ++++++++++++++++++--------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/mx_rec/saver/patch.py b/mx_rec/saver/patch.py index dcdf95ca..0f3a237b 100644 --- a/mx_rec/saver/patch.py +++ b/mx_rec/saver/patch.py @@ -44,7 +44,8 @@ from tensorflow.python.training.saving import saveable_object_util import numpy as np from mpi4py import MPI -from mx_rec.saver.saver import Saver as SparseSaver, check_file_system_is_valid +from mx_rec.saver.saver import Saver as SparseSaver, check_file_system_is_valid, should_write_data +from 
mx_rec.util.communication.hccl_ops import get_local_rank_size from mx_rec.util.initialize import ConfigInitializer from mx_rec.validator.validator import para_checker_decorator, ClassValidator, StringValidator, OptionalIntValidator, \ OptionalStringValidator, DirectoryValidator @@ -253,7 +254,7 @@ def save(self, sess, save_path, global_step=None, latest_filename=None, meta_gra comm = MPI.COMM_WORLD rank = comm.Get_rank() comm.Barrier() - if rank == 0: + if should_write_data(rank, save_path): model_checkpoint_path = compat.as_str(get_model_checkpoint_path(self, checkpoint_file, sess)) if write_state: update_checkpoint_state(self, model_checkpoint_path, save_path_parent, latest_filename, meta_graph_suffix, @@ -453,10 +454,11 @@ def patch_for_write_graph_func(func): comm = MPI.COMM_WORLD rank = comm.Get_rank() # In the case of multiple processes, choose one process to write graph. - if rank == 0: + if len(args) > 1 and should_write_data(rank, args[1]): return func(*args, **kwargs) else: return None + return wrapper diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index 9e0e1d29..a6362506 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -35,7 +35,6 @@ from mx_rec.util.log import logger from mx_rec.optimizers.base import CustomizedOptimizer from mx_rec.util.tf_version_adapter import npu_ops - SAVE_SPARSE_PATH_PREFIX = "sparse" @@ -171,7 +170,7 @@ class Saver(object): comm = MPI.COMM_WORLD rank = comm.Get_rank() comm.Barrier() - if rank == 0: + if should_write_data(rank, saving_path): table_list = self.save_op_dict.keys() for table_name in table_list: self.merge_sparse_file(saving_path, table_name) @@ -267,7 +266,7 @@ class Saver(object): else: self._save_ddr(sess, root_dir) logger.debug(f"Host data was saved.") - + def _save_hbm(self, sess, root_dir): self.config_instance.hybrid_manager_config.save_host_data(root_dir) if self.config_instance.use_dynamic_expansion: @@ -285,7 +284,7 @@ class Saver(object): for thread in threads: thread.join() - + def _save_ddr(self, sess, root_dir): # 接受host侧传来的需要swap_out的offset用于更新host侧并保存 self.config_instance.hybrid_manager_config.fetch_device_emb() @@ -306,7 +305,7 @@ class Saver(object): channel_name=f'{table_name}_save_h2d_{TRAIN_CHANNEL_ID}') if use_static: swap_out_pos = swap_out_pos[:swap_out_len] - + table = [var] optimizer = ConfigInitializer.get_instance().optimizer_config.get_optimizer_by_table_name(table_name) if optimizer is not None: @@ -382,7 +381,6 @@ class Saver(object): else: placeholder_dict, restore_fetch_list = self.placeholder_dict, self.restore_fetch_dict - for table_name in placeholder_dict: optimizer_instance = ConfigInitializer.get_instance().optimizer_config.optimizer_instance if optimizer_instance: @@ -395,7 +393,7 @@ class Saver(object): table_instance0 = self.config_instance.sparse_embed_config.get_table_instance(self.var_list[0]) if not table_instance0.is_hbm: return - + if self.config_instance.use_dynamic_expansion: # Data related to dynamic expansion needs to be restored only on the host side. 
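            # (Hedged note: returning here skips the device-side restore feeds built below; with
            # dynamic expansion the device cache is presumably refilled through the normal swap-in
            # path once training resumes.)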
            return
@@ -405,7 +403,7 @@
         for table_name, sub_placeholder_dict in placeholder_dict.items():
             load_offset = self.config_instance.hybrid_manager_config.get_load_offset(table_name)
             fill_placeholder(reading_path, sub_placeholder_dict, restore_feed_dict,
-                             NameDescriptor(table_name, DataName.EMBEDDING.value), load_offset)
+                             NameDescriptor(table_name, DataName.EMBEDDING.value), load_offset)

         if "optimizer" in sub_placeholder_dict:
             optimizer_state_placeholder_dict_group = sub_placeholder_dict.get("optimizer")
@@ -698,4 +696,15 @@ def set_optimizer_info(optimizer: CustomizedOptimizer, table_name: str):
     """
     from mxrec_pybind import OptimizerInfo
     optim_info = OptimizerInfo(optimizer.optimizer_type, optimizer.optim_param_list)
-    ConfigInitializer.get_instance().hybrid_manager_config.set_optim_info(table_name, optim_info)
\ No newline at end of file
+    ConfigInitializer.get_instance().hybrid_manager_config.set_optim_info(table_name, optim_info)
+
+
+def should_write_data(rank_id: int, save_path: str) -> bool:
+    # When using an hdfs filesystem, only the rank0 process executes the write, assuming the same
+    # hdfs path is used by every machine.
+    # When using a local filesystem, each process with `rank_id % local_rank_size == 0` executes the write.
+    # When using an hdfs filesystem with a different hdfs path per machine, this check condition
+    # should be modified to match the local filesystem case.
+    is_hdfs = check_file_system_is_hdfs(save_path)
+    local_rank_size = get_local_rank_size()
+    return rank_id == 0 if is_hdfs else rank_id % local_rank_size == 0
-- 
Gitee


From f07efc133ddc6a416d6e7f5fec9e7bb2fddacd21 Mon Sep 17 00:00:00 2001
From: penghuiyang <1060916628@qq.com>
Date: Mon, 8 Jul 2024 11:46:03 +0800
Subject: [PATCH 265/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91criteo=E6=95=B0?=
 =?UTF-8?q?=E6=8D=AE=E5=A4=84=E7=90=86=E8=84=9A=E6=9C=AC=E5=88=A4=E6=96=AD?=
 =?UTF-8?q?=E6=9D=A1=E4=BB=B6=E4=BF=AE=E6=94=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/dlrm/criteo_tb/gen_ttf.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/dlrm/criteo_tb/gen_ttf.py b/examples/dlrm/criteo_tb/gen_ttf.py
index 8715f048..986bc6df 100644
--- a/examples/dlrm/criteo_tb/gen_ttf.py
+++ b/examples/dlrm/criteo_tb/gen_ttf.py
@@ -224,9 +224,9 @@ def make_example(label_list, dense_feat_list, sparse_feat_list):
     sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1)
     label = np.array(label_list, dtype=np.int64).reshape(-1)
     feature_dict = {"dense_feature": tf.train.Feature(float_list=tf.train.FloatList(value=dense_feature)),
-                    "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)),
-                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label))
-                    }
+                    "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)),
+                    "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label))
+                    }
     example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
     return example

@@ -273,10 +273,10 @@ def convert_input2tfrd_multiprocess(proc_num, proc_id, in_file_path, output_file
             label = int(items[0])
             values = items[1:14]
             cats = items[14:]
-            if len(values) == 13:
-                raise ValueError("values.size: {}".format(len(values)))
-            if len(cats) == 26:
-                raise ValueError("cats.size: {}".format(len(cats)))
+            if len(values) != 13:
+                raise ValueError("dense feature length must be 13, current values.size: {}".format(len(values)))
+            if len(cats) != 26:
+                raise ValueError("sparse feature length must be 26, current cats.size: {}".format(len(cats)))
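            # Recap of the schema enforced above: each Criteo line carries a label, 13 integer
            # (dense) features and 26 categorical features, i.e. 40 fields in total.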
ValueError("sparse feature length must be 26, current cats.size: {}".format(len(cats))) val_list, cat_list = criteo_stats_dict.map_cat2id(values, cats) dense_res_list.append(val_list) cat_res_list.append(cat_list) @@ -363,7 +363,7 @@ if __name__ == "__main__": process_num = args.train_process_num if len(train_data_files) == 0: raise ValueError(f'file not exist in train_data_dir:{train_data_dir}') - if process_num % len(train_data_files) == 0: + if process_num % len(train_data_files) != 0: raise ValueError(f'process_num {process_num} must exact div length of train_data_files {len(train_data_files)}') for process_id in range(process_num): @@ -387,7 +387,7 @@ if __name__ == "__main__": process_num = args.test_process_num if len(test_data_files) == 0: raise ValueError(f'file not exist in test_data_dir:{test_data_dir}') - if process_num % len(test_data_files) == 0: + if process_num % len(test_data_files) != 0: raise ValueError(f'process_num {process_num} must exact div length of test_data_files {len(test_data_files)}') for process_id in range(process_num): -- Gitee From 33991245ee3d8f68cccb5d18b5ae6a20fab07014 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Tue, 9 Jul 2024 10:49:44 +0800 Subject: [PATCH 266/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E6=89=A9=E5=AE=B9?= =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E4=B8=8B=EF=BC=8Ctable.capacity=E5=87=BA?= =?UTF-8?q?=E7=8E=B0=E5=81=B6=E5=8F=91=E8=B4=9F=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/emb_table/embedding_table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/emb_table/embedding_table.h b/src/core/emb_table/embedding_table.h index 3396a8a0..ef741887 100644 --- a/src/core/emb_table/embedding_table.h +++ b/src/core/emb_table/embedding_table.h @@ -114,7 +114,7 @@ protected: size_t embSize_; size_t extEmbSize_; int seed_; - std::atomic capacity_; + std::atomic capacity_{0}; size_t rankId_; size_t rankSize_; vector loadOffset; -- Gitee From 1b81040851f1bf326983ce1e4e6589c0c4a5986d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 9 Jul 2024 15:45:28 +0800 Subject: [PATCH 267/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9mxRec=E9=95=9C?= =?UTF-8?q?=E5=83=8F=E4=BB=93=E7=9A=84=E9=93=BE=E6=8E=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 17d38fcd..f6bfb828 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,6 @@ mxRec所支持的使用环境、功能特性、API接口与使用样例请参考 mxRec框架基础镜像,基于TensorFlow 1.15.0、tensorflow2.6.5制作的基础镜像,安装mxRec后即可开始训练,以及样例使用介绍。 -1. https://ascendhub.huawei.com/#/detail/mxrec-tf1 +1. https://www.hiascend.com/developer/ascendhub/detail/mxrec-tf1 -2. https://ascendhub.huawei.com/#/detail/mxrec-tf2 +2. 
https://www.hiascend.com/developer/ascendhub/detail/mxrec-tf2 -- Gitee From 42ac8e68ecab452042587c8fe7bac19c7abca82c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 9 Jul 2024 15:46:14 +0800 Subject: [PATCH 268/302] =?UTF-8?q?=E6=B7=BB=E5=8A=A0dlrm=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E8=BF=90=E8=A1=8C=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/dlrm/README.md | 60 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 examples/dlrm/README.md diff --git a/examples/dlrm/README.md b/examples/dlrm/README.md new file mode 100644 index 00000000..85293c0c --- /dev/null +++ b/examples/dlrm/README.md @@ -0,0 +1,60 @@ +# DLRM模型运行说明 + +## 代码结构 +```shell +. +├── criteo_tb +│ ├── gen_ttf.py # criteo_tb原始数据转换成tfrecord格式的脚本 +│ └── README.md # 数据格式转换脚本说明 +├── model +│ ├── config.py # 模型配置文件 +│ ├── delay_loss_scale.py # loss缩放函数 +│ ├── gradient_descent_w.py # 自定义SGD优化器 +│ ├── main_mxrec.py # 主函数 +│ ├── mean_auc.py # 计算acu的脚本 +│ ├── model.py # DLRM模型 +│ ├── op_impl_mode.ini # 算子执行模式配置 +│ ├── optimizer.py # 优化器 +│ └── run.sh # 运行DLRM模型的脚本 +└── README.md # DLRM模型运行说明 +``` + +## 1.准备数据 +参考criteo_tb目录下的说明文档准备好模型所需要的数据集,放在一个目录下,比如:/data/criteo_tb/。 + +## 2.准备运行环境 +运行环境可以参考[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”章节进行准备。 + +## 3.安装mxRec +mxRec软件包可以通过[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”>“环境准备”>“获取软件包”章节提供的链接进行下载,选择自己需要的架构(x86或者arm)的mxRec包。下载完成之后,将mxRec包解压,进入解压后的目录(mindxsdk-mxrec) +如下: +```shell +. +├── cust_op +│ └── cust_op_by_addr +├── examples +│ ├── DCNv2 +│ ├── demo +│ └── dlrm +├── tf1_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +├── tf2_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +└── version.info +``` +其中,tf1_whl和tf2_whl目录下分别是适配tf1和tf2的mxRec软件包,按照自己需要选择其中一个进行安装即可(用pip/pip3 install 软件包这种方式进行安装)。 +确认安装mxRec的目录,比如mxRec安装在 /usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec这个目录下。 + +## 4.运行DLRM模型 +执行完以上步骤之后,接下来就可以运行DLRM模型,其中run.sh就是运行的脚本,默认是8张卡。其中需要传入5个参数,分别对应:so_path、mx_rec_package_path、hccl_cfg_json、 +dlrm_criteo_data_path和ip。运行命令如: +```shell +bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_path} {ip} +``` +* so_path:so_path是mxRec中动态库的目录,一般在mxRec的安装目录下的libasc目录,比如:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/libasc。 +* mx_rec_package_path:mx_rec_package_path是mxRec的安装目录,比如:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec。 +* hccl_cfg_json:hccl_cfg_json是hccl通信配置文件,如果配置了ip参数,这个参数就不用了,直接给一个""空字符串即可。 +* dlrm_criteo_data_path:dlrm_criteo_data_path是数据集所在的目录,比如/data/criteo_tb/。 +* ip:ip是运行模型的机器所在的ip,建议配置。 -- Gitee From 31aa8b6db348a4e8dd2688b1331559eb20264aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 9 Jul 2024 15:54:15 +0800 Subject: [PATCH 269/302] =?UTF-8?q?=E6=B7=BB=E5=8A=A0DCNv2=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E8=BF=90=E8=A1=8C=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/README.md | 54 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 examples/DCNv2/README.md diff --git a/examples/DCNv2/README.md b/examples/DCNv2/README.md new file mode 100644 index 00000000..e9b8a75f --- /dev/null +++ b/examples/DCNv2/README.md @@ -0,0 +1,54 @@ +# DCNv2模型运行说明 + +## 代码结构 +```shell +. 
+├── config.py # 模型配置文件 +├── delay_loss_scale.py # loss缩放函数 +├── main_mxrec.py # 主函数 +├── model.py # DCNv2模型 +├── op_impl_mode.ini # 算子执行模式配置 +├── optimizer.py # 优化器 +├── README.md # DCNv2模型运行说明 +└── run.sh # 运行DCNv2模型的脚本 +``` + +## 1.准备数据 +参考dlrm模型中criteo_tb目录下的说明文档准备好模型所需要的数据集,放在一个目录下,比如:/data/criteo_tb/。 + +## 2.准备运行环境 +运行环境可以参考[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”章节进行准备。 + +## 3.安装mxRec +mxRec软件包可以通过[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”>“环境准备”>“获取软件包”章节提供的链接进行下载,选择自己需要的架构(x86或者arm)的mxRec包。下载完成之后,将mxRec包解压,进入解压后的目录(mindxsdk-mxrec) +如下: +```shell +. +├── cust_op +│ └── cust_op_by_addr +├── examples +│ ├── DCNv2 +│ ├── demo +│ └── dlrm +├── tf1_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +├── tf2_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +└── version.info +``` +其中,tf1_whl和tf2_whl目录下分别是适配tf1和tf2的mxRec软件包,按照自己需要选择其中一个进行安装即可(用pip/pip3 install 软件包这种方式进行安装)。 +确认安装mxRec的目录,比如mxRec安装在 /usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec这个目录下。 + +## 4.运行DLRM模型 +执行完以上步骤之后,接下来就可以运行DLRM模型,其中run.sh就是运行的脚本,默认是8张卡。其中需要传入5个参数,分别对应:so_path、mx_rec_package_path、hccl_cfg_json、 +dlrm_criteo_data_path和ip。运行命令如: +```shell +bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_path} {ip} +``` +* so_path:so_path是mxRec中动态库的目录,一般在mxRec的安装目录下的libasc目录,比如:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/libasc。 +* mx_rec_package_path:mx_rec_package_path是mxRec的安装目录,比如:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec。 +* hccl_cfg_json:hccl_cfg_json是hccl通信配置文件,如果配置了ip参数,这个参数就不用了,直接给一个""空字符串即可。 +* dlrm_criteo_data_path:dlrm_criteo_data_path是数据集所在的目录,比如/data/criteo_tb/。 +* ip:ip是运行模型的机器所在的ip,建议配置。 -- Gitee From 39fa9310b122431bffc75204d7fb8d18343db93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Tue, 9 Jul 2024 16:10:33 +0800 Subject: [PATCH 270/302] =?UTF-8?q?WideDeep=E6=A0=B7=E4=BE=8B=20README?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E5=AE=8C=E5=96=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/{README_WD.md => README.md} | 45 +++++++++++++------ 1 file changed, 31 insertions(+), 14 deletions(-) rename examples/WideDeep/{README_WD.md => README.md} (89%) diff --git a/examples/WideDeep/README_WD.md b/examples/WideDeep/README.md similarity index 89% rename from examples/WideDeep/README_WD.md rename to examples/WideDeep/README.md index beb592c9..aef2379f 100644 --- a/examples/WideDeep/README_WD.md +++ b/examples/WideDeep/README.md @@ -4,7 +4,7 @@ *** ## 开源项目链接 - +Commits on Apr 29, 2022, 提交的SHA-1 hash值(提交ID):4bbfb492b872c5a3290a2bce1ed5c160162558a3 ```shell https://github.com/ZiyaoGeng/RecLearn ``` @@ -41,7 +41,7 @@ python critro.py --data_path data_path --output_path output_path ```python # get txt_list -split_file_list = get_split_file_path(dataset_path = dataset_path) +file_split_list = get_split_file_path(dataset_path=data_path) ``` *** #### 2. 建立特征映射 @@ -49,7 +49,7 @@ split_file_list = get_split_file_path(dataset_path = dataset_path) ```python # get feature_map -fea_map = get_fea_map(split_file_list=split_file_list) +feature_map = get_fea_map(split_file_list=file_split_list) ``` *** #### 3. dense_feature分桶离散化 @@ -57,7 +57,7 @@ fea_map = get_fea_map(split_file_list=split_file_list) ```python # dense feature: Bin continuous data into intervals. 
-data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, fea_map) +data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) ``` *** #### 4. sparse_feature特征映射 @@ -66,7 +66,10 @@ data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, f ```python # sparse feature: mapping for col in sparse_features: - data_df[col] = data_df[col].map(lambda x: fea_map[col][x]) + try: + data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) + except KeyError as er: + raise KeyError("Feature {} not found in dataset".format(col)) from er ``` *** #### 5. 39个特征增加偏移项 @@ -74,12 +77,14 @@ for col in sparse_features: ```python # add offsets -slot_size_array = [1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, - 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, - 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573] +slot_size_array = [ + 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, + 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, + 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573 +] offset_size_list = np.cumsum([0] + slot_size_array[:-1]) -for j in range(1,len(offset_size_list)+1): - data_df.iloc[:, j] += offset_size_list[j-1] +for col_index in range(1, len(offset_size_list) + 1): + data_df.iloc[:, col_index] += offset_size_list[col_index - 1] ``` *** #### 6. 数据集格式转换:txt >> tfrecord @@ -93,13 +98,25 @@ convert_input2tfrd(in_file_path=file, out_file_path=output_path) ## 模型运行 -参考mxrec的`README.md`文件在NPU服务器上配置环境后,可按照[mxrec-tf1](https://ascendhub.huawei.com/#/detail/mxrec-tf1)中DLRM模型运行命令启动模型训练。`so_path`、`mx_rec_package_path`、`hccl_cfg_json`配置不变,根据实际数据集路径配置`dlrm_criteo_data_path`。 +参考mxrec的`README.md`文件在NPU服务器上配置环境并安装镜像创建容器后,可参考DLRM模型运行命令启动模型训练。模型运行脚本是run.sh,运行此脚本需要四个参数:so_path、mx_rec_package_path、hccl_cfg_json以及dlrm_criteo_data_path。其中, +- so_path: mxrec中libasc所在路径,在镜像中已经安装过mxrec,所以so_path是:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/libasc/ +- mx_rec_package_path: mxrec这个包的安装路径,镜像中是:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/ +- hccl_cfg_json: hccl配置文件所在路径,一般是当前路径下的hccl文件 +- dlrm_criteo_data_path: Wide&Deep模型需要的数据所在路径,根据实际情况进行配置 +运行mxRec有两种方式,一种是使用hccl配置文件(rank table方案),一种是不使用hccl配置文件(去rank table方案)。 +- 使用hccl配置文件(rank table方案) ```shell -# 运行命令 bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_path} ``` *** +- 不使用hccl配置文件(去rank table方案) +```shell +bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_path} {IP} +``` +如:bash run.sh /usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/libasc/ /usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec/ hccl_json_8p.json /dataset 10.10.10.10。 +**注意:** 去rank table方案,当前路径下不存在hccl文件,模型仍可正常运行。 + ## 模型结果 [开源项目](https://github.com/ZiyaoGeng/RecLearn)使用Criteo4500W数据集在GPU上训练模型,结果为`Log Loss=0.4692`、`AUC=0.7930`。适配完成模型后,固定`CACHE_MODE="HBM"`、`USE_FAAE=0`,在`run.sh`中配置其他选项卡,运行结果如下。 @@ -135,8 +152,8 @@ bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_pa *** ## 模型迁移 -**迁移思路:** 在现有已适配好的dlrm模型框架下,改动相关代码逻辑,完成Wide&deep模型的适配。**核心:根据开源项目model代码修改`model.py`;数据处理操作一部分放入`criteo.py`,一部分放入`main_mxrec.py`中`make_batch_and_iterator()`内;`main_mxrec.py`中其他相关代码改动主要是为了适配mxrec提供的相关特性。** - +**迁移思路:** 
在现有已适配好的dlrm模型框架下,改动相关代码逻辑,完成Wide&deep模型的适配。**核心:根据开源项目model代码修改`model.py`;数据处理操作一部分放入`criteo.py`,一部分放入`main_mxrec.py`中`make_batch_and_iterator()`内;`main_mxrec.py`中其他相关代码改动主要是为了适配mxrec提供的相关特性。** +详细改动见https://gitee.com/ascend/mxrec/pulls/171/commits,Commits ID:7a05b033d41af51df9aed7414ad04216dff821cc。 下文所提到的`动态扩容`、`动态shape`、`自动改图`、`一表多查`是mxrec提供的相关特性,开关选项见`run.sh`。 ```shell -- Gitee From 909ace13858217f2812884cd13d0ad8aeaaf7d19 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 9 Jul 2024 16:23:46 +0800 Subject: [PATCH 271/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91LazyAdam=E8=9E=8D?= =?UTF-8?q?=E5=90=88=E7=AE=97=E5=AD=90=E6=8F=8F=E8=BF=B0=E4=BF=A1=E6=81=AF?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cust_op/fused_lazy_adam/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cust_op/fused_lazy_adam/README.md b/cust_op/fused_lazy_adam/README.md index 13ed6994..3cb69f2d 100644 --- a/cust_op/fused_lazy_adam/README.md +++ b/cust_op/fused_lazy_adam/README.md @@ -6,7 +6,7 @@ ├── aclnn_lazy_adam_test # 单算子测试用例 ├── lazy_adam.json # 算子原型配置 ├── op_host # LazyAdam融合算子Host侧实现 -├── op_kernel # LazyAdam融合算子Kernel测实现 +├── op_kernel # LazyAdam融合算子Kernel侧实现 ├── README.md # LazyAdam融合算子说明文档 └── run.sh # LazyAdam融合算子安装脚本 ``` -- Gitee From 14ac6e7f2f7d5b62f9ba4aaae3e57c2082ea036a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Tue, 9 Jul 2024 17:14:57 +0800 Subject: [PATCH 272/302] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/WideDeep/README.md | 5 +++-- examples/WideDeep/criteo.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/WideDeep/README.md b/examples/WideDeep/README.md index aef2379f..f4815cd9 100644 --- a/examples/WideDeep/README.md +++ b/examples/WideDeep/README.md @@ -5,6 +5,7 @@ *** ## 开源项目链接 Commits on Apr 29, 2022, 提交的SHA-1 hash值(提交ID):4bbfb492b872c5a3290a2bce1ed5c160162558a3 +commit的链接: https://github.com/ZiyaoGeng/RecLearn/tree/4bbfb492b872c5a3290a2bce1ed5c160162558a3 ```shell https://github.com/ZiyaoGeng/RecLearn ``` @@ -68,8 +69,8 @@ data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, f for col in sparse_features: try: data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) - except KeyError as er: - raise KeyError("Feature {} not found in dataset".format(col)) from er + except KeyError as e: + raise KeyError("Feature {} not found in dataset".format(col)) from e ``` *** #### 5. 39个特征增加偏移项 diff --git a/examples/WideDeep/criteo.py b/examples/WideDeep/criteo.py index 617c76f6..3c8ea430 100644 --- a/examples/WideDeep/criteo.py +++ b/examples/WideDeep/criteo.py @@ -248,8 +248,8 @@ if __name__ == '__main__': for col in sparse_features: try: data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) - except KeyError as er: - raise KeyError("Feature {} not found in dataset".format(col)) from er + except KeyError as e: + raise KeyError("Feature {} not found in dataset".format(col)) from e # dense feature: Bin continuous data into intervals. 
data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) # add offsets -- Gitee From 30d416ea128496119c1e95ed43240628727c7ca3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Tue, 9 Jul 2024 20:18:51 +0800 Subject: [PATCH 273/302] =?UTF-8?q?=E6=B7=BB=E5=8A=A0demo=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E8=BF=90=E8=A1=8C=E8=AF=B4=E6=98=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/DCNv2/README.md | 6 +- examples/demo/README.md | 13 +++++ examples/demo/little_demo/README.md | 56 ++++++++++++++++++ examples/demo/little_demo_estimator/README.md | 57 +++++++++++++++++++ 4 files changed, 129 insertions(+), 3 deletions(-) create mode 100644 examples/demo/README.md create mode 100644 examples/demo/little_demo/README.md create mode 100644 examples/demo/little_demo_estimator/README.md diff --git a/examples/DCNv2/README.md b/examples/DCNv2/README.md index e9b8a75f..f1940ebe 100644 --- a/examples/DCNv2/README.md +++ b/examples/DCNv2/README.md @@ -14,7 +14,7 @@ ``` ## 1.准备数据 -参考dlrm模型中criteo_tb目录下的说明文档准备好模型所需要的数据集,放在一个目录下,比如:/data/criteo_tb/。 +参考DLRM模型中criteo_tb目录下的说明文档准备好模型所需要的数据集,放在一个目录下,比如:/data/criteo_tb/。 ## 2.准备运行环境 运行环境可以参考[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) @@ -41,8 +41,8 @@ mxRec软件包可以通过[mxRec用户指南](https://www.hiascend.com/document/ 其中,tf1_whl和tf2_whl目录下分别是适配tf1和tf2的mxRec软件包,按照自己需要选择其中一个进行安装即可(用pip/pip3 install 软件包这种方式进行安装)。 确认安装mxRec的目录,比如mxRec安装在 /usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec这个目录下。 -## 4.运行DLRM模型 -执行完以上步骤之后,接下来就可以运行DLRM模型,其中run.sh就是运行的脚本,默认是8张卡。其中需要传入5个参数,分别对应:so_path、mx_rec_package_path、hccl_cfg_json、 +## 4.运行DCNv2模型 +执行完以上步骤之后,接下来就可以运行DCNv2模型,其中run.sh就是运行的脚本,默认是8张卡。其中需要传入5个参数,分别对应:so_path、mx_rec_package_path、hccl_cfg_json、 dlrm_criteo_data_path和ip。运行命令如: ```shell bash run.sh {so_path} {mx_rec_package_path} {hccl_cfg_json} {dlrm_criteo_data_path} {ip} diff --git a/examples/demo/README.md b/examples/demo/README.md new file mode 100644 index 00000000..931f8c26 --- /dev/null +++ b/examples/demo/README.md @@ -0,0 +1,13 @@ +# demo样例说明 + +## 代码结构 +```shell +. +├── little_demo # sess.run模式的demo +├── little_demo_estimator # estimator模式的demo +└── README.md # demo样例说明 +``` + +mxRec提供了一个非常简单的样例模型demo,用于快速体验mxRec。在TensorFlow中,运行模型有sess.run和estimator两种模式。因此,mxRec也提供了两种 +模式下的样例。其中little_demo是sess.run模式的样例;little_demo_estimator是estimator模式的样例。用户可以选择自己需要或者感兴趣的模式进行 +体验,各个模式的样例的说明见对应目录下的README文档。 \ No newline at end of file diff --git a/examples/demo/little_demo/README.md b/examples/demo/little_demo/README.md new file mode 100644 index 00000000..dabe105b --- /dev/null +++ b/examples/demo/little_demo/README.md @@ -0,0 +1,56 @@ +# sess.run模式下demo模型运行说明 + +## 代码结构 +```shell +. 
+├── config.py # 模型配置文件 +├── dataset.py # 生成数据集的脚本 +├── deterministic_loss # 确定性计算loss样例 +├── main.py # 主函数 +├── model.py # demo模型 +├── op_impl_mode.ini # 算子执行模式配置 +├── optimizer.py # 优化器 +├── random_data_generator.py # 数据生成器 +├── README.md # demo模型运行说明 +├── run_deterministic.sh # 运行确定性计算的脚本 +├── run_mode.py # 执行模型train、evaluate和predict的脚本 +└── run.sh # demo运行脚本 +``` + +## 1.准备数据 +demo样例无需从其他地方下载数据集,在demo样例中mxRec会自动生成数据集,详情见dataset.py和random_data_generator.py。 + +## 2.准备运行环境 +运行环境可以参考[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”章节进行准备。 + +## 3.安装mxRec +mxRec软件包可以通过[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”>“环境准备”>“获取软件包”章节提供的链接进行下载,选择自己需要的架构(x86或者arm)的mxRec包。下载完成之后,将mxRec包解压,进入解压后的目录(mindxsdk-mxrec) +如下: +```shell +. +├── cust_op +│ └── cust_op_by_addr +├── examples +│ ├── DCNv2 +│ ├── demo +│ └── dlrm +├── tf1_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +├── tf2_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +└── version.info +``` +其中,tf1_whl和tf2_whl目录下分别是适配tf1和tf2的mxRec软件包,按照自己需要选择其中一个进行安装即可(用pip/pip3 install 软件包这种方式进行安装)。 +确认安装mxRec的目录,比如mxRec安装在 /usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec这个目录下。 + +## 4.运行demo模型 +执行完以上步骤之后,接下来就可以运行demo模型,其中run.sh就是运行的脚本,默认是8张卡。其中需要传入ip这个参数,运行命令如: +```shell +bash run.sh main.py {ip} +``` +* ip:ip是运行模型的机器所在的ip。 + +**Tips**:run.sh脚本中有一个参数是mx_rec_package_path,mx_rec_package_path是mxRec的安装目录,比如:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec。 +这个参数在脚本是默认的,用户需要根据自己环境中mxRec实际安装的路径进行配置。 \ No newline at end of file diff --git a/examples/demo/little_demo_estimator/README.md b/examples/demo/little_demo_estimator/README.md new file mode 100644 index 00000000..aca25a34 --- /dev/null +++ b/examples/demo/little_demo_estimator/README.md @@ -0,0 +1,57 @@ +# estimator模式下demo模型运行说明 + +## 代码结构 +```shell +. +├── config.py # 模型配置文件 +├── dataset.py # 生成数据集的脚本 +├── main.py # 主函数 +├── nn_model_build.py # demo模型 +├── nn_model_input.py # 定义model_fn +├── nn_optim.py # 定义train的各个op +├── nn_reader.py # 定义input_fn +├── op_precision.ini # 算子执行模式配置 +├── random_data_generator.py # 数据生成器 +├── README.md # demo模型运行说明 +├── run.sh # demo运行脚本 +├── tf_adapter.py # 导入tf adapter +└── utils.py # 公共函数 +``` + +## 1.准备数据 +demo样例无需从其他地方下载数据集,在demo样例中mxRec会自动生成数据集,详情见dataset.py和random_data_generator.py。 + +## 2.准备运行环境 +运行环境可以参考[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”章节进行准备。 + +## 3.安装mxRec +mxRec软件包可以通过[mxRec用户指南](https://www.hiascend.com/document/detail/zh/mind-sdk/60rc1/mxRec/mxrecug/mxrecug_0007.html) +“安装部署”>“环境准备”>“获取软件包”章节提供的链接进行下载,选择自己需要的架构(x86或者arm)的mxRec包。下载完成之后,将mxRec包解压,进入解压后的目录(mindxsdk-mxrec) +如下: +```shell +. 
+├── cust_op +│ └── cust_op_by_addr +├── examples +│ ├── DCNv2 +│ ├── demo +│ └── dlrm +├── tf1_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +├── tf2_whl +│ └── mx_rec-{version}-py3-none-linux_x86_64.whl # version为版本号 +└── version.info +``` +其中,tf1_whl和tf2_whl目录下分别是适配tf1和tf2的mxRec软件包,按照自己需要选择其中一个进行安装即可(用pip/pip3 install 软件包这种方式进行安装)。 +确认安装mxRec的目录,比如mxRec安装在 /usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec这个目录下。 + +## 4.运行demo模型 +执行完以上步骤之后,接下来就可以运行demo模型,其中run.sh就是运行的脚本,默认是8张卡。其中需要传入ip这个参数,运行命令如: +```shell +bash run.sh main.py {ip} +``` +* ip:ip是运行模型的机器所在的ip。 + +**Tips**:run.sh脚本中有一个参数是mx_rec_package_path,mx_rec_package_path是mxRec的安装目录,比如:/usr/local/python3.7.5/lib/python3.7/site-packages/mx_rec。 +这个参数在脚本是默认的,用户需要根据自己环境中mxRec实际安装的路径进行配置。 \ No newline at end of file -- Gitee From 38866e896710f3ff873083c02216bb638672aecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 10 Jul 2024 16:16:29 +0800 Subject: [PATCH 274/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91DDR=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E5=9C=A8device=E6=B5=8B=E8=BF=90=E8=A1=8C=E8=BE=83?= =?UTF-8?q?=E5=BF=AB=E7=9A=84=E6=83=85=E5=86=B5=E4=B8=8B=EF=BC=8Chost?= =?UTF-8?q?=E6=B5=8B=E7=94=B3=E8=AF=B7=E5=86=85=E5=AD=98=E5=92=8C=E5=88=9D?= =?UTF-8?q?=E5=A7=8B=E5=8C=96=E6=85=A2=EF=BC=8C=E5=AF=BC=E8=87=B4=E6=8A=A5?= =?UTF-8?q?=E9=94=99=E9=80=80=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h index 46daaf29..3b87e6e6 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h @@ -109,7 +109,7 @@ public: fullCv.notify_all(); } - BeforePutFuncState GetNewValueToBeInserted(uint64_t& value, uint32_t maxRetry = 1000) + BeforePutFuncState GetNewValueToBeInserted(uint64_t& value, uint32_t maxRetry = 10000) { for (uint32_t i = 0; i < maxRetry; i++) { if (BufferBin.pop(value)) { @@ -252,7 +252,7 @@ public: FkvState FindAndPutIfNotFound(uint64_t key, uint64_t& value) { FkvState ret = MapperBase::FindAndPutIfNotFound(key, value, [&]() { - if (HM_UNLIKELY(current_size.load() >= hostVocabSize)) { + if (HM_UNLIKELY(current_size.load() > hostVocabSize)) { ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "host does not have enough space"); return BeforePutFuncState::BEFORE_NO_SPACE; } -- Gitee From d076f903d3669c8194312b793de1349586f9f1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Wed, 10 Jul 2024 16:55:41 +0800 Subject: [PATCH 275/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91DDR=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E6=8A=A5=E9=94=99host=E7=A9=BA=E9=97=B4=E4=B8=8D?= =?UTF-8?q?=E8=B6=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/embedding_cache/offset_mapper/address_mapper.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h index 3b87e6e6..8b7e4e67 100644 --- a/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h +++ b/src/AccCTR/src/embedding_cache/offset_mapper/address_mapper.h @@ -109,7 +109,7 @@ public: fullCv.notify_all(); } - BeforePutFuncState 
GetNewValueToBeInserted(uint64_t& value, uint32_t maxRetry = 10000) + BeforePutFuncState GetNewValueToBeInserted(uint64_t& value, uint32_t maxRetry = 1000) { for (uint32_t i = 0; i < maxRetry; i++) { if (BufferBin.pop(value)) { @@ -252,8 +252,11 @@ public: FkvState FindAndPutIfNotFound(uint64_t key, uint64_t& value) { FkvState ret = MapperBase::FindAndPutIfNotFound(key, value, [&]() { - if (HM_UNLIKELY(current_size.load() > hostVocabSize)) { - ock::ExternalLogger::PrintLog(ock::LogLevel::ERROR, "host does not have enough space"); + if (HM_UNLIKELY(current_size.load() >= hostVocabSize)) { + ock::ExternalLogger::PrintLog( + ock::LogLevel::ERROR, + "host does not have enough space, current: " + std::to_string(current_size.load()) + + ", host max size: " + std::to_string(hostVocabSize)); return BeforePutFuncState::BEFORE_NO_SPACE; } return emExpendMemInfoPtr->GetNewValueToBeInserted(value); -- Gitee From db89f0016478fcf3f9bde8481d0ae8ad4a1cb934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E9=9C=96?= Date: Fri, 12 Jul 2024 18:05:31 +0800 Subject: [PATCH 276/302] =?UTF-8?q?=E4=BF=AE=E6=94=B9mxRec=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index f6bfb828..5a2d9c03 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ mxRec作为面向互联网市场搜索推荐广告的应用使能SDK产品,对 ## 安装方式 -安装前,请参考《CANN 软件安装指南》安装CANN开发套件软件包和TensorFlow适配昇腾插件。 +安装前,请参考[CANN 软件安装指南](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha003/softwareinst/instg/instg_0022.html)安装CANN开发套件软件包和TensorFlow适配昇腾插件。 CANN软件提供进程级环境变量设置脚本,供用户在进程中引用,以自动完成环境变量设置。用户进程结束后自动失效。可在程序启动的Shell脚本中使用如下命令设置CANN的相关环境变量,也可通过命令行执行如下命令(以root用户默认安装路径“/usr/local/Ascend”为例): ```shell @@ -65,12 +65,34 @@ bash run.sh 将pybind11和securec的压缩包放在与mxRec代码同级的opensource目录下,并且将其分别更名为pybind11-2.10.3.zip、huaweicloud-sdk-c-obs-3.23.9.zip。如果没有opensource目录,则需要在mxRec同级的目录下手动创建opensource目录,然后将pybind11和securec的压缩包放在opensource目录下。 -为了构建多个版本的whl包,编译脚本在python虚拟环境完成对应tensorflow版本的安装。用户可以根据实际情况调整编译脚本,指定tensorflow的安装路径。编译方法: +由于构建脚本需要适配内部构建工程,所以在脚本中存在适配代码,但是这些代码可能对于用户来说不需要,所以在编译之前需要做如下处理: + +在build目录中存在build_tf1.sh和build_tf2.sh,其中分别存在如下代码: +```shell +# 配置tf1路径 +source /opt/buildtools/tf1_env/bin/activate +tf1_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow_core +deactivate tf1_env +``` +```shell +# 配置tf2路径 +source /opt/buildtools/tf2_env/bin/activate +tf2_path=$(dirname "$(dirname "$(which python3.7)")")/lib/python3.7/site-packages/tensorflow +deactivate tf2_env +``` + +可以看到,上述代码中都有激活Python虚拟环境的步骤,因此用户有两种选择: + +1. 根据需要在/opt/buildtools/目录下(没有此目录需要先创建)创建tf1_env和tf2_env两个Python虚拟环境,并在虚拟环境中安装对应版本的Tensorflow +2. 
将 build_tf1.sh 中的 source /opt/buildtools/tf1_env/bin/activate 和 deactivate tf1_env 两行注释掉或者删除(build_tf2.sh 中对应的 source /opt/buildtools/tf2_env/bin/activate 和 deactivate tf2_env 同理)


编译方法:

进入mxRec代码目录:

- setup.py:此脚本供内部使用,用于同时构建tf1和tf2的mxRec包,用户通常只需要其中一个,所以建议使用下面两个脚本构建。
- setup_tf1.py:执行脚本setup_tf1.py,比如:**python3.7 setup_tf1.py bdist_wheel**完成tf1版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf1_whl子目录下。
- setup_tf2.py:执行脚本setup_tf2.py,比如:**python3.7 setup_tf2.py bdist_wheel**完成tf2版本whl包的构建,构建成功后,whl包在build/mindxsdk-mxrec/tf2_whl子目录下。

如需使用动态扩容功能,进入“./cust_op/cust_op_by_addr”目录中。参考以下命令编译并安装动态扩容算子包。
```shell
-- 
Gitee

From c2d469d400a520846808d08ad7cb1016c0e462ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com>
Date: Fri, 12 Jul 2024 18:20:22 +0800
Subject: [PATCH 277/302] =?UTF-8?q?Little=20demo=E6=A8=A1=E5=9E=8Bestimato?=
 =?UTF-8?q?r=E6=A8=A1=E5=BC=8FDDR=E4=BF=9D=E5=AD=98=E9=97=AE=E9=A2=98?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9=EF=BC=9B=E9=97=A8=E7=A6=81=E6=B5=8B=E8=AF=95?=
 =?UTF-8?q?=E7=94=A8=E4=BE=8B=E4=BF=AE=E6=94=B9=EF=BC=9B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mx_rec/saver/saver.py                |  9 +++++++++
 src/core/hybrid_mgmt/hybrid_mgmt.cpp | 10 +++++++++-
 tests/mx_rec/saver/test_saver.py     |  6 +++---
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py
index a6362506..f7ba8f03 100644
--- a/mx_rec/saver/saver.py
+++ b/mx_rec/saver/saver.py
@@ -127,6 +127,15 @@ class Saver(object):
         save_path = save_path if save_path else self._prefix_name
         directory, base_name = os.path.split(save_path)
 
+        # Skip saving at step 0, because the host side also skips step 0 in EmbeddingDDR::Save (SyncLatestEmbedding)
+        try:
+            step_in_name = int(base_name.split("-")[-1])
+            if step_in_name == 0:
+                return
+        except ValueError as err:
+            raise ValueError(f"The base_name {base_name} needs to end with the save step, "
+                             f"e.g. model-100") from err
+
         if global_step:
             if not isinstance(global_step, compat.integral_types):
                 global_step = int(sess.run(global_step))
diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
index 3eb99685..bcc3a2a5 100644
--- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp
+++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp
@@ -499,7 +499,15 @@ void HybridMgmt::EvalTask(TaskType type)
                   hybridMgmtBlock->IsNeedWaitSave());
         std::unique_lock<std::mutex> checkSaveLocker(saveMutex);
         cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; });
-        hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID);
+
+        if (hybridMgmtBlock->pythonBatchID[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID])
+        {
+            hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID);
+        } else {
+            std::this_thread::sleep_for(SLEEP_MS);
+            continue;
+        }
+
         LOG_DEBUG("wake TrainTask");
         hybridMgmtBlock->DoBlock(channelId);
     }
diff --git a/tests/mx_rec/saver/test_saver.py 
b/tests/mx_rec/saver/test_saver.py index bcfa0948..53066038 100644 --- a/tests/mx_rec/saver/test_saver.py +++ b/tests/mx_rec/saver/test_saver.py @@ -61,18 +61,18 @@ class TestSaver(unittest.TestCase): self.saver = Saver() with tf.compat.v1.Session(graph=self.graph) as sess: - embedding_directory = "./sparse-model/test_table/embedding" + embedding_directory = "./sparse-model-1/test_table/embedding" data_file = os.path.join(embedding_directory, "slice.data") attribute_file = os.path.join(embedding_directory, "slice.attribute") sess.run(tf.global_variables_initializer()) origin_embedding = sess.run(self.var)[[0, 1, 4, 6, 8], :] - self.saver.save(sess) + self.saver.save(sess, save_path="model-1") self.assertTrue(os.path.exists(embedding_directory), "embedding目录已创建") self.assertTrue(os.path.exists(data_file), "embedding的data文件存储成功") self.assertTrue(os.path.exists(attribute_file), "embedding的attribute文件存储成功") - tf.io.gfile.rmtree("./sparse-model") + tf.io.gfile.rmtree("./sparse-model-1") def build_graph(self): self.graph = tf.compat.v1.Graph() -- Gitee From 6775ab93f0b004a6bbe15ce0d56f58da5df35745 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Fri, 12 Jul 2024 18:35:54 +0800 Subject: [PATCH 278/302] =?UTF-8?q?=E6=8B=BC=E5=86=99=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index bcc3a2a5..737cdb1d 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -500,7 +500,7 @@ void HybridMgmt::EvalTask(TaskType type) std::unique_lock checkSaveLocker(saveMutex); cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; }); - if (hybridMgmtBlock->pythonBatchID[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) + if (hybridMgmtBlock->pythonBatchId[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) { hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); } else { -- Gitee From cb43c6a8da89f2a25118df6d68631eec9549998d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Fri, 12 Jul 2024 19:05:56 +0800 Subject: [PATCH 279/302] =?UTF-8?q?cleancode=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 737cdb1d..93954401 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -500,9 +500,8 @@ void HybridMgmt::EvalTask(TaskType type) std::unique_lock checkSaveLocker(saveMutex); cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; }); - if (hybridMgmtBlock->pythonBatchId[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) - { - hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); + if (hybridMgmtBlock->pythonBatchId[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) { + hybridMgmtBlockgi->Wake(TRAIN_CHANNEL_ID); } else { std::this_thread::sleep_for(SLEEP_MS); continue; -- Gitee From d2ba56b47391194d99c807df5bd8879437cb6418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Fri, 12 Jul 2024 19:20:39 
+0800 Subject: [PATCH 280/302] =?UTF-8?q?=E6=8B=BC=E5=86=99=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 93954401..cab348ba 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -501,7 +501,7 @@ void HybridMgmt::EvalTask(TaskType type) cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; }); if (hybridMgmtBlock->pythonBatchId[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) { - hybridMgmtBlockgi->Wake(TRAIN_CHANNEL_ID); + hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); } else { std::this_thread::sleep_for(SLEEP_MS); continue; -- Gitee From 33c03cadd19b48fb11daaac4045925bd13a4236f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 15 Jul 2024 09:49:14 +0800 Subject: [PATCH 281/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91DDR=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E5=81=B6=E5=8F=91=E6=8A=A5=E9=94=99=E7=A9=BA=E9=97=B4?= =?UTF-8?q?=E4=B8=8D=E8=B6=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 41 ++++++++++++++++++---------- src/core/hybrid_mgmt/hybrid_mgmt.h | 2 +- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 3eb99685..73c30e13 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -959,30 +959,43 @@ void HybridMgmt::LookUpAndRemoveAddrs(const EmbTaskInfo& info) } // DDR -void HybridMgmt::LookUpSwapAddrs(const string& embName, const string& swapStr) +void HybridMgmt::LookUpSwapAddrs(const string& embName) { int id = 0; - std::string swapName = embName + swapStr; + std::string swapInName = embName + SWAP_IN_STR; + std::string swapOutName = embName + SWAP_OUT_STR; + vector addrs; while (isRunning && lookupAddrSuccess) { - std::vector keys = HBMSwapKeyQue[swapName].WaitAndPop(); if (!isRunning) { return; } - vector addrs; - TimeCost lookupAddrsTC; + // swap in + std::vector keys = HBMSwapKeyQue[swapInName].WaitAndPop(); + TimeCost lookupAddrsInTC; int rc = embCache->EmbeddingLookupAddrs(embName, keys, addrs); if (rc != H_OK) { lookupAddrSuccess = false; throw runtime_error("EmbeddingLookupAddrs failed! 
error code: " + std::to_string(rc)); } - LOG_DEBUG("table:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsTC(ms):{}", embName, - swapStr, keys.size(), addrs.size(), id, lookupAddrsTC.ElapsedMS()); - HBMSwapAddrsQue[swapName].Pushv(addrs); - if (swapStr == SWAP_IN_STR) { - lookUpSwapInAddrsPushId[embName]++; - LOG_DEBUG("LookUpSwapAddrs, table:{}, pushId:{}, lookUpSwapInAddrsPushId:{}", embName, id, - lookUpSwapInAddrsPushId[embName]); + LOG_DEBUG("table:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsInTC(ms):{}", embName, + SWAP_IN_STR, keys.size(), addrs.size(), id, lookupAddrsInTC.ElapsedMS()); + HBMSwapAddrsQue[swapInName].Pushv(addrs); + + lookUpSwapInAddrsPushId[embName]++; + LOG_DEBUG("LookUpSwapAddrs, table:{}, pushId:{}, lookUpSwapInAddrsPushId:{}", embName, id, + lookUpSwapInAddrsPushId[embName]); + + // swap out + keys = HBMSwapKeyQue[swapOutName].WaitAndPop(); + TimeCost lookupAddrsOutTC; + rc = embCache->EmbeddingLookupAddrs(embName, keys, addrs); + if (rc != H_OK) { + lookupAddrSuccess = false; + throw runtime_error("EmbeddingLookupAddrs failed! error code: " + std::to_string(rc)); } + LOG_DEBUG("table:{}, swapStr:{}, keys.size:{}, addrs.size:{}, pushId:{}, lookupAddrsOutTC(ms):{}", embName, + SWAP_OUT_STR, keys.size(), addrs.size(), id, lookupAddrsOutTC.ElapsedMS()); + HBMSwapAddrsQue[swapOutName].Pushv(addrs); id++; } } @@ -1242,9 +1255,7 @@ void HybridMgmt::InitDataPipelineForDDR(const string& embName) // 初始化lookup线程 lookUpSwapInAddrsPushId[embName]; // 此处初始化,避免多线程竞争导致计数错误 lookUpSwapInAddrsThreads.emplace_back( - std::async(std::launch::async, [=] { LookUpSwapAddrs(embName, SWAP_IN_STR); })); - lookUpSwapOutAddrsThreads.emplace_back( - std::async(std::launch::async, [=] { LookUpSwapAddrs(embName, SWAP_OUT_STR); })); + std::async(std::launch::async, [=] { LookUpSwapAddrs(embName); })); LOG_DEBUG("data pipeline for ddr init"); } diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index ab34b19f..57a7ddd1 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -187,7 +187,7 @@ public: void LookUpAndRemoveAddrs(const EmbTaskInfo& info); // L3Storage, synchronous - void LookUpSwapAddrs(const std::string& embName, const std::string& swapStr); // DDR, asynchronous + void LookUpSwapAddrs(const std::string& embName); // DDR, asynchronous void EmbeddingTask(); -- Gitee From 5592a8e616f1ca0e98873b2ac84ec94fdeb20fc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E5=B9=B8=E8=BF=90?= Date: Mon, 15 Jul 2024 17:19:33 +0800 Subject: [PATCH 282/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91DDR=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E5=81=B6=E5=8F=91=E6=8A=A5=E9=94=99=E7=A9=BA=E9=97=B4?= =?UTF-8?q?=E4=B8=8D=E8=B6=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 9921fe27..f8ad9216 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -971,7 +971,7 @@ void HybridMgmt::LookUpSwapAddrs(const string& embName) int id = 0; std::string swapInName = embName + SWAP_IN_STR; std::string swapOutName = embName + SWAP_OUT_STR; - vector addrs; + std::vector addrs; while (isRunning && lookupAddrSuccess) { if (!isRunning) { return; -- Gitee From 45f3fe4365341c6024d52c589c77bb9af41e5248 Mon Sep 17 00:00:00 2001 From: penghuiyang 
<1060916628@qq.com> Date: Mon, 15 Jul 2024 22:20:31 +0800 Subject: [PATCH 283/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91LazyAdam=E8=9E=8D?= =?UTF-8?q?=E5=90=88=E7=AE=97=E5=AD=90=E6=96=B0=E7=89=88=E6=9C=ACCANN?= =?UTF-8?q?=E7=BC=96=E8=AF=91=E5=A4=B1=E8=B4=A5=E4=BF=AE=E6=94=B9=EF=BC=9B?= =?UTF-8?q?=E8=AE=A1=E7=AE=97=E9=80=BB=E8=BE=91=E5=90=8C=E6=AD=A5py?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_lazy_adam_test/scripts/gen_data.py | 2 +- cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py index 6e07f836..6e8c9251 100644 --- a/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py +++ b/cust_op/fused_lazy_adam/aclnn_lazy_adam_test/scripts/gen_data.py @@ -121,7 +121,7 @@ def _gen_golden_data(): update_v = beta2 * old_v_slice + (1 - beta2) * np.square(gradient) out_v = _scatter_nd_update(input_v, indices, update_v) - denominator_slice = np.sqrt(update_v) + epsilon + denominator_slice = np.sqrt(np.abs(update_v)) + epsilon update_var = np.divide(-lr * update_m, denominator_slice) out_var = _scatter_nd_add(input_var, indices, update_var) diff --git a/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp b/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp index 76164e50..e0ad8e45 100644 --- a/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp +++ b/cust_op/fused_lazy_adam/op_kernel/lazy_adam.cpp @@ -176,6 +176,7 @@ private: this->updateV = localVSlice + this->updateV; // 计算Var + Abs(this->updateV, this->updateV, row * this->dim2); Sqrt(this->updateVar, this->updateV, row * this->dim2); Adds(this->updateVar, this->updateVar, this->epsilon, row * this->dim2); Muls(this->temp, this->updateM, -this->lr, row * this->dim2); @@ -233,5 +234,10 @@ extern "C" __global__ __aicore__ void lazy_adam(GM_ADDR gradient, GM_ADDR indice tiling_data.row, tiling_data.indicesAllocSize, tiling_data.otherAllocSize, tiling_data.batch, tiling_data.loopCount, tiling_data.rowLeft, tiling_data.loopCountTail, tiling_data.rowLeftTail, tiling_data.coreNum); +#ifdef KERNEL_TASK_TYPE_DEFAULT + // Set kernel type with new versions of CANN to avoid matmul error during compiling. + // In previous versions of CANN, avoid matmul error by using '#ifndef __GET_CODE_CHANNEL__'. 
+ KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); +#endif op32.Process(); } \ No newline at end of file -- Gitee From 1e9e773c32f67ff466893976f5b748ac217947c0 Mon Sep 17 00:00:00 2001 From: longfeifei <962977793@qq.com> Date: Mon, 15 Jul 2024 14:20:33 +0800 Subject: [PATCH 284/302] =?UTF-8?q?estimator=E4=B8=ADtrain=E5=88=87?= =?UTF-8?q?=E6=8D=A2=E4=B8=BAeval,=E5=A2=9E=E5=8A=A0=E5=8E=9Fhost=E4=BE=A7?= =?UTF-8?q?train=E7=9A=84=E7=9B=B8=E5=85=B3=E7=8A=B6=E6=80=81=E5=A4=87?= =?UTF-8?q?=E4=BB=BD=EF=BC=8C=E5=9C=A8eval=E5=88=87=E6=8D=A2=E4=B8=BAtrain?= =?UTF-8?q?=E5=90=8E=E8=BF=9B=E8=A1=8C=E8=BF=98=E5=8E=9F=E5=A4=87=E4=BB=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/AccCTR/src/common/util/error_code.h | 1 + .../cache_manager/cache_manager.cpp | 57 ++++++++++++-- .../cache_manager/cache_manager.h | 8 +- src/AccCTR/src/embedding_cache/limited_set.h | 19 +++-- src/AccCTR/src/include/embedding_cache.h | 18 +++-- src/core/emb_table/embedding_ddr.cpp | 11 ++- src/core/emb_table/embedding_mgmt.h | 5 +- src/core/hybrid_mgmt/hybrid_mgmt.cpp | 30 +++++--- src/core/hybrid_mgmt/hybrid_mgmt.h | 2 +- src/core/l3_storage/cache_manager.cpp | 74 ++++++++++++++++++- src/core/l3_storage/cache_manager.h | 11 +++ .../ock_ctr_common/include/embedding_cache.h | 18 +++-- 12 files changed, 200 insertions(+), 54 deletions(-) diff --git a/src/AccCTR/src/common/util/error_code.h b/src/AccCTR/src/common/util/error_code.h index b30bfd83..87c8ffe6 100644 --- a/src/AccCTR/src/common/util/error_code.h +++ b/src/AccCTR/src/common/util/error_code.h @@ -43,6 +43,7 @@ using CTRCode = enum : int { H_TABLE_NAME_EMPTY = 22, H_PREFILL_BUFFER_SIZE_INVALID = 23, H_TABLE_NAME_TOO_LONG = 24, + H_EMB_CACHE_INFO_LOST = 25 }; } } diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp index 68351328..52578820 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.cpp @@ -253,8 +253,7 @@ int EmbCacheManagerImpl::ExportDeviceKeyOffsetPairs(const std::string& tableName if (checkTableNameRet != H_OK) { return checkTableNameRet; } - OffsetMapper& om = offsetMappers[tableName]; - koVec = om.ExportSortedKVPairs(); + koVec = offsetMappers[tableName].ExportSortedKVPairs(); return H_OK; } @@ -318,30 +317,58 @@ int EmbCacheManagerImpl::LoadEmbTableInfos(std::string tableName, const std::vec return H_OK; } -int EmbCacheManagerImpl::BackUpTrainStatus(std:string tableName) +int EmbCacheManagerImpl::BackUpTrainStatus(const std::string& tableName) { int checkTableNameRet = CheckValidTableName(tableName); if (checkTableNameRet != H_OK) { return checkTableNameRet; } + // Back up the key-offset correspondence on the device + kvVecsBackUp[tableName] = offsetMappers[tableName].ExportVec(); + + auto embInfo = embCacheInfos.find(tableName); + if (embInfo == embCacheInfos.end()) { + return H_EMB_CACHE_INFO_LOST; + } + uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO; + uint32_t maxCacheSize = embInfo->second.maxCacheSize; + auto om = offsetMappersBackUp.find(tableName); if (om != offsetMappersBackUp.end()) { - offsetMappersBackUp[tableName] = offsetMappers[tableName]; - } else{ - offsetMappersBackUp[tableName].Initialize(1000, 1000); - offsetMappersBackUp[tableName] = offsetMappers[tableName]; + offsetMappersBackUp[tableName].UnInitialize(); } + offsetMappersBackUp[tableName].Initialize(reserve, maxCacheSize); + 
offsetMappersBackUp[tableName] = offsetMappers[tableName]; + return H_OK; } -int EmbCacheManagerImpl::RecoverTrainStatus(std:string tableName) +int EmbCacheManagerImpl::RecoverTrainStatus(const std::string& tableName) { int checkTableNameRet = CheckValidTableName(tableName); if (checkTableNameRet != H_OK) { return checkTableNameRet; } + + auto embInfo = embCacheInfos.find(tableName); + if (embInfo == embCacheInfos.end()) { + return H_EMB_CACHE_INFO_LOST; + } + uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO; + uint32_t maxCacheSize = embInfo->second.maxCacheSize; + + offsetMappers[tableName].UnInitialize(); + offsetMappers[tableName].Initialize(reserve, maxCacheSize); offsetMappers[tableName] = offsetMappersBackUp[tableName]; + + // Recover the key-offset correspondence on the device + auto kvVecBackUp = kvVecsBackUp[tableName]; + for (const auto& kvPair: kvVecBackUp) { + offsetMappers[tableName].Put(kvPair.first, kvPair.second); + } + + kvVecBackUp.clear(); return H_OK; } @@ -449,3 +476,17 @@ uint32_t EmbCacheManagerImpl::GetUsage(const std::string& tableName) { return embTables[tableName].GetUsage(); } + +int EmbCacheManagerImpl::ResetOffsetMappers() +{ + for (auto it = offsetMappers.begin(); it != offsetMappers.end(); it++) { + auto embInfo = embCacheInfos.find(it->first); + if (embInfo == embCacheInfos.end()) { + return H_EMB_CACHE_INFO_LOST; + } + it->second.UnInitialize(); + uint32_t reserve = embInfo->second.maxCacheSize / VOCAB_CACHE_RATIO; + it->second.Initialize(reserve, embInfo->second.maxCacheSize); + } + return H_OK; +} diff --git a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h index 359e88ad..e4a240ae 100644 --- a/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h +++ b/src/AccCTR/src/embedding_cache/cache_manager/cache_manager.h @@ -73,8 +73,11 @@ public: const std::vector>& embeddings, const std::vector>& optimizerSlots) override; - int BackUpTrainStatus(std:string tableName) override; - int RecoverTrainStatus(std::string tableName) override; + int BackUpTrainStatus(const std::string& tableName) override; + + int RecoverTrainStatus(const std::string& tableName) override; + + int ResetOffsetMappers() override; uint32_t GetUsage(const std::string& tableName) override; @@ -83,6 +86,7 @@ private: std::map offsetMappers; std::map offsetMappersBackUp; std::map embTables; + std::map>> kvVecsBackUp; int CheckValidTableName(const std::string& tableName); diff --git a/src/AccCTR/src/embedding_cache/limited_set.h b/src/AccCTR/src/embedding_cache/limited_set.h index d44b615a..f7bc2e1e 100644 --- a/src/AccCTR/src/embedding_cache/limited_set.h +++ b/src/AccCTR/src/embedding_cache/limited_set.h @@ -20,19 +20,21 @@ limitations under the License. 
namespace EmbCache { +static constexpr int64_t NODE_DEFAULT_VALUE = -1; + class LimitedSet { public: struct Node { uint64_t value; Node *prev, *next; - Node(uint64_t val = -1) : value(val), prev(nullptr), next(nullptr) {} + Node(uint64_t val = NODE_DEFAULT_VALUE) : value(val), prev(nullptr), next(nullptr) {} }; - LimitedSet(uint64_t maxRange) : head(new Node(-1)), tail(new Node(-1)) + LimitedSet(uint64_t maxRange) : head(new Node(NODE_DEFAULT_VALUE)), tail(new Node(NODE_DEFAULT_VALUE)) { nodes.resize(maxRange); for (auto &node : nodes) { - node = new Node(-1); + node = new Node(NODE_DEFAULT_VALUE); } head->next = tail; tail->prev = head; @@ -47,19 +49,16 @@ public: delete tail; } - // 拷贝构造函数 - LimitedSet(const LimitedSet& other): head(new Node(-1)), tail(new Node(-1)) + LimitedSet(const LimitedSet& other): head(new Node(NODE_DEFAULT_VALUE)), tail(new Node(NODE_DEFAULT_VALUE)) { nodes.resize(other.nodes.size()); - for (auto &node: nodes) { - node = new Node(-1); + for (auto& node: nodes) { + node = new Node(NODE_DEFAULT_VALUE); } - // 初始化头尾节点 head->next = tail; tail->prev = head; - // 遍历原vector的每一个节点并复制 for (Node* node = other.head->next; node != other.tail; node = node->next) { insert(node->value); } @@ -87,7 +86,7 @@ public: Node *node = nodes[value]; node->prev->next = node->next; node->next->prev = node->prev; - node->value = -1; + node->value = NODE_DEFAULT_VALUE; } bool find(uint64_t value) diff --git a/src/AccCTR/src/include/embedding_cache.h b/src/AccCTR/src/include/embedding_cache.h index 40d9dcbe..c0468549 100644 --- a/src/AccCTR/src/include/embedding_cache.h +++ b/src/AccCTR/src/include/embedding_cache.h @@ -317,18 +317,24 @@ public: const std::vector>& optimizerSlots) = 0; /* * - * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the channel to eval, backup the current table's offsetMapper object. + * @Param tableName: embedding table name * @Return errorCode */ - virtual int BackUpTrainStatus(std::string tableName) = 0; + virtual int BackUpTrainStatus(const std::string& tableName) = 0; /* * - * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the eval channel back to train, Recover the current table's offsetMapper object to the backup state. + * @Param tableName: embedding table name + * @Return errorCode + */ + virtual int RecoverTrainStatus(const std::string& tableName) = 0; + + /* * + * Reset the offsetMapper object to revert to its initialized state after loading. 
* @Return errorCode */ - virtual int RecoverTrainStatus(std::string tableName) = 0; + virtual int ResetOffsetMappers() = 0; }; } // namespace EmbCache diff --git a/src/core/emb_table/embedding_ddr.cpp b/src/core/emb_table/embedding_ddr.cpp index 82ca0b73..d05b3501 100644 --- a/src/core/emb_table/embedding_ddr.cpp +++ b/src/core/emb_table/embedding_ddr.cpp @@ -78,6 +78,11 @@ void EmbeddingDDR::Load(const string& savePath, mapResetOffsetMappers(); + if (rs != 0) { + throw runtime_error("embCache->ResetOffsetMappers failed, err code: " + to_string(rc)); + } } void EmbeddingDDR::LoadKey(const string &savePath, vector &keys) @@ -187,15 +192,13 @@ void EmbeddingDDR::LoadOptimizerSlot(const string &savePath, vector keys; vector> embeddings; vector> optimizerSlots; auto step = GetStepFromPath(savePath); - if (step > 0) { - SyncLatestEmbedding(); - embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots); - } + embCache->GetEmbTableInfos(name, keys, embeddings, optimizerSlots); SaveKey(savePath, keys); SaveEmbedding(savePath, embeddings); diff --git a/src/core/emb_table/embedding_mgmt.h b/src/core/emb_table/embedding_mgmt.h index 7cd3f782..9dd0e363 100644 --- a/src/core/emb_table/embedding_mgmt.h +++ b/src/core/emb_table/embedding_mgmt.h @@ -90,16 +90,15 @@ public: void Save(const string& filePath); /** - * estimator模式下train切换为eval时, 备份所有表train的状态 + * In estimator mode, when switching from train to eval, backup the training state of all tables. */ void BackUpTrainStatusBeforeLoad(); /** - * estimator模式下eval切换为train时, 还原所有表train的状态 + * In estimator mode, when switching from eval to train, recover the training state of all tables. */ void RecoverTrainStatus(); - /** * 获取所有表对应的DeviceOffsets,该偏移用于python侧保存embedding时抽取key对应的embedding */ diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.cpp b/src/core/hybrid_mgmt/hybrid_mgmt.cpp index 84195a3c..91750b65 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.cpp +++ b/src/core/hybrid_mgmt/hybrid_mgmt.cpp @@ -206,12 +206,6 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) throw runtime_error("HybridMgmt not initialized. 
Call Initialize first."); } - if (mgmtRankInfo.isDDR && IsTrainAndEvalCase()) { - LOG_INFO("estimator train and eval case, skip loading, " - "host will reuse data in memory while evaluating since is's same as saved data"); - return true; - } - // 数据处理线程上锁 KEY_PROCESS_INSTANCE->LoadSaveLock(); @@ -257,10 +251,15 @@ bool HybridMgmt::Load(const string& loadPath, vector warmStartTables) featAdmitNEvict.LoadHistoryRecords(loadData.histRec); } + int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[TRAIN_CHANNEL_ID]; if (isL3StorageEnabled) { LOG_DEBUG(MGMT + "Start host side load: L3Storage key freq map"); auto step = GetStepFromPath(loadPath); - cacheManager->Load(mgmtEmbInfo, step, trainKeysSet); + // When in load and train mode or predict mode, SSD needs to actually execute loading + // When in the train and eval modes, loading before eval should be directly skipped + if (theTrainBatchId == 0) { + cacheManager->Load(mgmtEmbInfo, step, trainKeysSet); + } } LOG_DEBUG(MGMT + "Finish host side load process"); @@ -502,7 +501,7 @@ void HybridMgmt::EvalTask(TaskType type) cvCheckSave.wait(checkSaveLocker, [this] { return !hybridMgmtBlock->IsNeedWaitSave() || mutexDestroy; }); if (hybridMgmtBlock->pythonBatchId[EVAL_CHANNEL_ID] >= hybridMgmtBlock->hybridBatchId[EVAL_CHANNEL_ID]) { - // 在唤醒train的数据处理进程之前,需要将备份的train状态还原 + // Before waking the data process for training, Recover the backed-up training state RecoverTrainStatus(); hybridMgmtBlock->Wake(TRAIN_CHANNEL_ID); } else { @@ -2210,15 +2209,18 @@ void HybridMgmt::BackUpTrainStatus() { int channelID = TRAIN_CHANNEL_ID; int& theTrainBatchId = hybridMgmtBlock->hybridBatchId[channelID]; - //续训load、predict模式下的load不需要对train的状态进行备份 - if (theTrainBatchId==0) { + if (theTrainBatchId == 0) { return; } - // train and eval模式下,train切换为eval之后 - // eval的load需要线备份原有的相关状态, HBM非扩容模式需要备份keyOffsetMap, DDR模式需要备份offsetMapper对象 + LOG_INFO("On Estimator train and eval mode, start to backup train status, " "current train batchId: {} .", theTrainBatchId); + // When in the train and eval mode of estimator, backup training states before loading. 
EmbeddingMgmt::Instance()->BackUpTrainStatusBeforeLoad(); + + if (isL3StorageEnabled) { + cacheManager->BackUpTrainStatus(); + } isBackUpTrainStatus = true; } @@ -2227,5 +2229,9 @@ void HybridMgmt::RecoverTrainStatus() if (isBackUpTrainStatus) { EmbeddingMgmt::Instance()->RecoverTrainStatus(); } + + if (isL3StorageEnabled) { + cacheManager->RecoverTrainStatus(); + } isBackUpTrainStatus = false; } \ No newline at end of file diff --git a/src/core/hybrid_mgmt/hybrid_mgmt.h b/src/core/hybrid_mgmt/hybrid_mgmt.h index f845efb1..233030b9 100644 --- a/src/core/hybrid_mgmt/hybrid_mgmt.h +++ b/src/core/hybrid_mgmt/hybrid_mgmt.h @@ -223,7 +223,7 @@ private: bool isLoad{false}; bool isInitialized{false}; bool alreadyTrainOnce = false; // 用于判断是否为predict模式 - bool isBackUpTrainStatus = false; // 用于判断当前是否已经备份了train的状态 + bool isBackUpTrainStatus = false; // whether the train state has been backed up map lookUpSwapInAddrsPushId; // 用于处理eos场景,当消费者追上生产者且长时间无上游数据,会触发eos map specialProcessStatus; diff --git a/src/core/l3_storage/cache_manager.cpp b/src/core/l3_storage/cache_manager.cpp index ee3d7bc5..7ea68e14 100644 --- a/src/core/l3_storage/cache_manager.cpp +++ b/src/core/l3_storage/cache_manager.cpp @@ -32,10 +32,10 @@ void CacheManager::Init(ock::ctr::EmbCacheManagerPtr embCachePtr, vectorembCache = std::move(embCachePtr); for (auto& emb : mgmtEmbInfo) { - EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false}; + EmbBaseInfo baseInfo {emb.ssdVocabSize, emb.ssdDataPath, false, emb.extEmbeddingSize}; embBaseInfos.emplace(emb.name, baseInfo); preProcessMapper[emb.name].Initialize(emb.name, emb.hostVocabSize, emb.ssdVocabSize); } @@ -293,3 +293,73 @@ void CacheManager::FetchL3StorageEmb2DDR(string tableName, uint32_t extEmbedding embeddingTaskStep++; evictWaitCond.notify_all(); } + +void CacheManager::BackUpTrainStatus() +{ + ddrKeyFreqMapBackUp = ddrKeyFreqMap; + excludeDDRKeyCountMapBackUp = excludeDDRKeyCountMap; +} + +void CacheManager::RecoverTrainStatus() +{ + for (const auto& pair: excludeDDRKeyCountMapBackUp) { + auto tableName = pair.first; + + std::vector ssdKeysBeforeEval; + std::vector ssdKeysAfterEval; + std::vector swapInKeys; + std::vector swapOutKeys; + + for (const auto& keyMap : pair.second) { + ssdKeysBeforeEval.push_back(keyMap.first); + } + for (const auto& keyMap : excludeDDRKeyCountMap[tableName]) { + ssdKeysAfterEval.push_back(keyMap.first); + } + + GetSwapInAndSwapOutKeys(ssdKeysBeforeEval, ssdKeysAfterEval, swapInKeys, swapOutKeys); + + // ddr <-> ssd + // ddr-> lookup address, ssd->insert embedding , ddr->remove embedding + vector swapInKeysAddr; + int rc = embCache->EmbeddingLookupAddrs(tableName, swapInKeys, swapInKeysAddr); + if (rc != 0) { + throw runtime_error("EmbeddingLookUpAddrs failed! error code: " + std::to_string(rc)); + } + auto extEmbeddingSize = embBaseInfos[tableName].extEmbeddingSize; + l3Storage->InsertEmbeddingsByAddr(tableName, swapInKeys, swapInKeysAddr, extEmbeddingSize); + rc = embCache->EmbeddingRemove(tableName, swapInKeys); + if (rc != 0) { + throw runtime_error("EmbeddingRemove failed! 
error code: " + std::to_string(rc)); + } + + // ssd->fetch embedding, ddr->EmbeddingUpdate, ssd->delete embedding + auto swapOutEmbeddings = l3Storage->FetchEmbeddings(tableName, swapOutKeys); + vector swapOutFlattenEmbeddings; + for (auto& emb : swapOutEmbeddings) { + swapOutFlattenEmbeddings.insert(swapOutFlattenEmbeddings.cend(), emb.cbegin(), emb.cend()); + } + rc = embCache->EmbeddingUpdate(tableName, swapOutKeys, swapOutFlattenEmbeddings.data()); + l3Storage->DeleteEmbeddings(tableName, swapOutKeys); + } + + ddrKeyFreqMap = ddrKeyFreqMapBackUp; + excludeDDRKeyCountMap = excludeDDRKeyCountMapBackUp; +} + +void CacheManager::GetSwapInAndSwapOutKeys(vector& ssdKeysBeforeEval, + vector& ssdKeysAfterEval, + vector& swapInKeys, vector& swapOutKeys) +{ + std::sort(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end()); + std::sort(ssdKeysAfterEval.begin(), ssdKeysAfterEval.end()); + vector intersectionKeys; + std::set_intersection(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end(), ssdKeysAfterEval.begin(), + ssdKeysAfterEval.end(), std::back_inserter(intersectionKeys)); + + std::set_difference(ssdKeysBeforeEval.begin(), ssdKeysBeforeEval.end(), intersectionKeys.begin(), + intersectionKeys.end(), std::back_inserter(swapInKeys)); + std::set_difference(ssdKeysAfterEval.begin(), ssdKeysAfterEval.end(), intersectionKeys.begin(), + intersectionKeys.end(), std::back_inserter(swapOutKeys)); +} + diff --git a/src/core/l3_storage/cache_manager.h b/src/core/l3_storage/cache_manager.h index 79335788..34e7f0c2 100644 --- a/src/core/l3_storage/cache_manager.h +++ b/src/core/l3_storage/cache_manager.h @@ -107,10 +107,20 @@ namespace MxRec { int64_t GetTableUsage(const string& tableName); + void BackUpTrainStatus(); + + void RecoverTrainStatus(); + + void GetSwapInAndSwapOutKeys(vector& ssdKeysBeforeEval, + vector& ssdKeysAfterEval, + vector& swapInKeys, vector& swapOutKeys); + // DDR内每个表中emb数据频次缓存;map unordered_map ddrKeyFreqMap; + unordered_map ddrKeyFreqMapBackUp; // 每张表中非DDR内key的出现次数 unordered_map> excludeDDRKeyCountMap; + unordered_map> excludeDDRKeyCountMapBackUp; // 每一个table对应一个PreProcessMapper,预先推演HBM->DDR的情况 std::unordered_map preProcessMapper; @@ -125,6 +135,7 @@ namespace MxRec { uint64_t maxTableSize; vector savePath; bool isExist; + int extEmbeddingSize; }; void CreateL3StorageTableIfNotExist(const std::string& embTableName); diff --git a/src/core/ock_ctr_common/include/embedding_cache.h b/src/core/ock_ctr_common/include/embedding_cache.h index 5e25a718..ce807f16 100644 --- a/src/core/ock_ctr_common/include/embedding_cache.h +++ b/src/core/ock_ctr_common/include/embedding_cache.h @@ -317,18 +317,24 @@ public: const std::vector>& optimizerSlots) = 0; /* * - * train通道切换为eval, 备份当前表的offsetMapper对象, 存储下当前train对应的devices上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the channel to eval, backup the current table's offsetMapper object. + * @Param tableName: embedding table name * @Return errorCode */ - virtual int BackUpTrainStatus(std::string tableName) = 0; + virtual int BackUpTrainStatus(const std::string& tableName) = 0; /* * - * eval通道切换为train, 将当前表的offsetMapper对象还原成备份的train对应的的device上key的状态 - * @Param tableName: 需要加载信息的table名字 + * When switch the eval channel back to train, Recover the current table's offsetMapper object to the backup state. + * @Param tableName: embedding table name + * @Return errorCode + */ + virtual int RecoverTrainStatus(const std::string& tableName) = 0; + + /* * + * Reset the offsetMapper object to revert to its initialized state after loading. 
* @Return errorCode */ - virtual int RecoverTrainStatus(std::string tableName) = 0; + virtual int ResetOffsetMappers() = 0; }; } // namespace EmbCache -- Gitee From 0c6d360bf5aa30853d4c4da9bc913d3100137f87 Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Fri, 19 Jul 2024 15:36:31 +0800 Subject: [PATCH 285/302] =?UTF-8?q?=E3=80=90FEAT=E3=80=91=E7=BB=99FileWrit?= =?UTF-8?q?er=E6=B7=BB=E5=8A=A0patch=E9=98=B2=E6=AD=A2=E5=86=99summary?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=86=B2=E7=AA=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/__init__.py | 3 ++- mx_rec/saver/patch.py | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/mx_rec/__init__.py b/mx_rec/__init__.py index 4f82c3ed..618d802e 100644 --- a/mx_rec/__init__.py +++ b/mx_rec/__init__.py @@ -20,7 +20,7 @@ __all__ = ["version", "__version__"] from mx_rec.constants.constants import ASCEND_GLOBAL_HASHTABLE_COLLECTION from mx_rec.util.tf_version_adapter import npu_ops, hccl_ops, NPUCheckpointSaverHook -from mx_rec.saver.patch import patch_for_saver +from mx_rec.saver.patch import patch_for_saver, patch_for_summary_writer from mx_rec.graph.patch import patch_for_dataset, patch_for_chief_session_creator, patch_for_bool_gauge, \ patch_for_assert_eval_spec, patch_for_scale_loss, patch_for_session from mx_rec.data.patch import patch_for_dataset_eos_map @@ -28,6 +28,7 @@ from mx_rec.optimizers.base import patch_for_optimizer from mx_rec.saver.warm_start import patch_for_warm_start patch_for_saver() +patch_for_summary_writer() patch_for_dataset() patch_for_dataset_eos_map() patch_for_scale_loss() diff --git a/mx_rec/saver/patch.py b/mx_rec/saver/patch.py index 0f3a237b..d5071d5c 100644 --- a/mx_rec/saver/patch.py +++ b/mx_rec/saver/patch.py @@ -23,6 +23,7 @@ import os import time import tensorflow as tf +from tensorflow.compat.v1.summary import FileWriter from tensorflow.core.protobuf import saver_pb2 from tensorflow.core.protobuf import trackable_object_graph_pb2 from tensorflow.python import pywrap_tensorflow @@ -45,13 +46,15 @@ import numpy as np from mpi4py import MPI from mx_rec.saver.saver import Saver as SparseSaver, check_file_system_is_valid, should_write_data -from mx_rec.util.communication.hccl_ops import get_local_rank_size +from mx_rec.util.communication.hccl_ops import get_rank_id from mx_rec.util.initialize import ConfigInitializer from mx_rec.validator.validator import para_checker_decorator, ClassValidator, StringValidator, OptionalIntValidator, \ OptionalStringValidator, DirectoryValidator from mx_rec.util.log import logger from mx_rec.constants.constants import MAX_INT32, INVALID_CHARS +_FILENAME_SUFFIX = "filename_suffix" + def get_sparse_vars(var_list): sparse_var_list = [] @@ -470,3 +473,20 @@ def patch_for_saver(): dense_saver.build = build logger.debug("Class tf.train.Saver has been patched.") training_util.write_graph = patch_for_write_graph_func(graph_io.write_graph) + + +def _patch_for_summary_writer(func): + def wrapper(*args, **kwargs): + filename_suffix = kwargs.get(_FILENAME_SUFFIX, "") + filename_suffix = filename_suffix or "" + rank_suffix = "_rank" + str(get_rank_id()) + if rank_suffix not in filename_suffix: + filename_suffix = rank_suffix + "_" + filename_suffix if filename_suffix else rank_suffix + kwargs[_FILENAME_SUFFIX] = filename_suffix + return func(*args, **kwargs) + + return wrapper + + +def patch_for_summary_writer(): + FileWriter.__init__ = _patch_for_summary_writer(FileWriter.__init__) -- 
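For context on the FileWriter patch above: tf.compat.v1.summary.FileWriter derives its event-file name from the log directory plus filename_suffix, so several ranks writing into one shared directory can clobber the same events file; forcing a unique per-rank suffix keeps one file per rank. The following is a minimal, self-contained sketch of the same keyword-wrapping technique; DummyWriter and the get_rank_id stub are illustrative stand-ins, not part of the patch.

def get_rank_id():
    # Stand-in for mx_rec.util.communication.hccl_ops.get_rank_id;
    # a real job would query the collective library for its rank.
    return 0


def with_rank_suffix(init):
    # Wrap an __init__ so that kwargs["filename_suffix"] always carries a
    # "_rank<N>" marker; the membership check keeps the wrapper idempotent.
    def wrapper(*args, **kwargs):
        suffix = kwargs.get("filename_suffix") or ""
        rank_suffix = "_rank" + str(get_rank_id())
        if rank_suffix not in suffix:
            suffix = rank_suffix + "_" + suffix if suffix else rank_suffix
        kwargs["filename_suffix"] = suffix
        return init(*args, **kwargs)
    return wrapper


class DummyWriter:
    # Illustrative stand-in for tensorflow.compat.v1.summary.FileWriter.
    def __init__(self, logdir, filename_suffix=None):
        self.logdir = logdir
        self.filename_suffix = filename_suffix


DummyWriter.__init__ = with_rank_suffix(DummyWriter.__init__)

assert DummyWriter("/tmp/logs", filename_suffix="loss").filename_suffix == "_rank0_loss"
assert DummyWriter("/tmp/logs").filename_suffix == "_rank0"

Note that the wrapper only inspects keyword arguments; a suffix passed positionally would bypass it, which matches the behavior of the actual patch, where FileWriter callers normally pass filename_suffix by keyword.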
Gitee From fd3f91d2cd874d683753469fce9d9bffe05f63b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=AD=E6=9C=9B?= <1244372993@qq.com> Date: Mon, 22 Jul 2024 16:19:16 +0800 Subject: [PATCH 286/302] =?UTF-8?q?=E3=80=90FIX=E3=80=91=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E5=8A=A0=E8=BD=BD=EF=BC=8C=E5=9B=9E=E9=80=80Python=E4=BE=A7?= =?UTF-8?q?=E8=B7=B3=E8=BF=87=E7=AC=AC0=E6=AD=A5=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E7=9A=84=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/saver.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mx_rec/saver/saver.py b/mx_rec/saver/saver.py index f7ba8f03..a6362506 100644 --- a/mx_rec/saver/saver.py +++ b/mx_rec/saver/saver.py @@ -127,15 +127,6 @@ class Saver(object): save_path = save_path if save_path else self._prefix_name directory, base_name = os.path.split(save_path) - # skip save in step-0, cause host skip save in step-0 EmbeddingDDR::Save SyncLatestEmbedding - try: - step_in_name = int(base_name.split("-")[-1]) - if step_in_name == 0: - return - except ValueError as err: - raise ValueError(f"The base_name {base_name} needs to include save_step message " - f"eg: mode-100") from err - if global_step: if not isinstance(global_step, compat.integral_types): global_step = int(sess.run(global_step)) -- Gitee From aaabe4aa37ef1b188d5e112c3a7c99040579c92f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Mon, 22 Jul 2024 22:08:22 +0800 Subject: [PATCH 287/302] =?UTF-8?q?mmoe=20=E6=A8=A1=E5=9E=8B=E6=A1=86?= =?UTF-8?q?=E6=9E=B6=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 136 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 examples/mmoe/model.py diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py new file mode 100644 index 00000000..0046d2fd --- /dev/null +++ b/examples/mmoe/model.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+import time
+from easydict import EasyDict as edict
+
+import tensorflow as tf
+
+
+model_cfg = edict()
+model_cfg.loss_mode = "batch"
+LOSS_OP_NAME = "loss"
+LABEL_OP_NAME = "label"
+VAR_LIST = "variable"
+PRED_OP_NAME = "pred"
+
+
+class MyModel:
+    def __init__(self, expert_num =8, expert_size=16, tower_size=8, gate_num = 2):
+
+        self.expert_num = expert_num
+        self.expert_size = expert_size
+        self.tower_size = tower_size
+        self.gate_num = gate_num
+
+
+    def expert_layer(self, input):
+        param_expert = []
+        for i in range(0, self.expert_num):
+            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name = f'expert_payer_{i}',
+                                            kernel_initializer = tf.constant_initializer(value=0.1),
+                                            bias_initializer = tf.constant_initializer(value=0.1))
+
+            param_expert.append(expert_linear)
+        return param_expert
+
+
+    def gate_layer(self, input):
+        param_gate = []
+        for i in range(0, self.gate_num):
+            gate_linear = tf.layers.dense(input, units=self.expert_num, activation=None, name = f'gate_payer_{i}',
+                                          kernel_initializer = tf.constant_initializer(value=0.1),
+                                          bias_initializer = tf.constant_initializer(value=0.1))
+
+            param_gate.append(gate_linear)
+        return param_gate
+
+
+    def tower_layer(self, input, layer_name):
+        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name = f'tower_payer_{layer_name}',
+                                       kernel_initializer = tf.constant_initializer(value=0.1),
+                                       bias_initializer = tf.constant_initializer(value=0.1))
+
+        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name = f'tower_payer_out_{layer_name}',
+                                           kernel_initializer = tf.constant_initializer(value=0.1),
+                                           bias_initializer = tf.constant_initializer(value=0.1))
+
+        return tower_linear_out
+
+
+    def build_model(self,
+                    embedding=None,
+                    dense_feature=None,
+                    label=None,
+                    is_training=True,
+                    seed=None):
+
+        with tf.variable_scope("mmoe", reuse=tf.AUTO_REUSE):
+
+            dense_expert = self.expert_layer(dense_feature)
+            dense_gate = self.gate_layer(dense_feature)
+
+            all_expert = []
+            _slice_num = 0
+            for i in range(0, self.expert_num):
+                slice_num_end = _slice_num + self.expert_size
+                cur_expert = tf.add(dense_expert[i], embedding[:, _slice_num:slice_num_end])
+                cur_expert = tf.nn.relu(cur_expert)
+                all_expert.append(cur_expert)
+                _slice_num = slice_num_end
+
+            expert_concat = tf.concat(all_expert, axis=1)
+            expert_concat = tf.reshape(expert_concat, [-1, self.expert_num, self.expert_size])
+
+            output_layers = []
+            out_pred = []
+            for i in range(0, self.gate_num):
+                slice_gate_end = _slice_num + self.expert_num
+                cur_gate = tf.add(dense_gate[i], embedding[:, _slice_num:slice_gate_end])
+                cur_gate = tf.nn.softmax(cur_gate)
+
+                cur_gate = tf.reshape(cur_gate, [-1, self.expert_num, 1])
+
+                cur_gate_expert = tf.multiply(x=expert_concat, y=cur_gate)
+                cur_gate_expert = tf.reduce_sum(cur_gate_expert, axis=1)
+                out = self.tower_layer(cur_gate_expert, i)
+                output_layers.append(out)
+                out_pred.append(tf.nn.softmax(out[:, 1]))
+                _slice_num = slice_gate_end
+            trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mmoe')
+
+            label_income = label[:, 0:1]
+            label_mat = label[:, 1:]
+
+            pred_income_1 = tf.slice(output_layers[0], [0, 1], [-1, 1])
+            pred_marital_1 = tf.slice(output_layers[1], [0, 1], [-1, 1])
+
+            cost_income = tf.losses.log_loss(labels=tf.cast(label_income, tf.float32), predictions=pred_income_1,
+                                             epsilon=1e-4)
+            cost_marital =
tf.losses.log_loss(labels=tf.cast(label_mat, tf.float32), predictions=pred_marital_1, + epsilon=1e-4) + + avg_cost_income = tf.reduce_mean(cost_income) + avg_cost_marital = tf.reduce_mean(cost_marital) + + loss = 0.5 * (avg_cost_income + avg_cost_marital) + + return {LOSS_OP_NAME: loss, + PRED_OP_NAME: out_pred, + LABEL_OP_NAME: label, + VAR_LIST: trainable_variables} -- Gitee From f17973de35900ab90455e1933717c21161fe2a62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Mon, 22 Jul 2024 22:39:52 +0800 Subject: [PATCH 288/302] cleancode --- examples/mmoe/config.py | 197 ++++++++++++++++++++ examples/mmoe/criteo.py | 273 ++++++++++++++++++++++++++++ examples/mmoe/delay_loss_scale.py | 64 +++++++ examples/mmoe/gradient_descent_w.py | 71 ++++++++ examples/mmoe/mean_auc.py | 40 ++++ examples/mmoe/model.py | 27 +-- examples/mmoe/op_impl_mode.ini | 1 + examples/mmoe/optimizer.py | 35 ++++ 8 files changed, 695 insertions(+), 13 deletions(-) create mode 100644 examples/mmoe/config.py create mode 100644 examples/mmoe/criteo.py create mode 100644 examples/mmoe/delay_loss_scale.py create mode 100644 examples/mmoe/gradient_descent_w.py create mode 100644 examples/mmoe/mean_auc.py create mode 100644 examples/mmoe/op_impl_mode.ini create mode 100644 examples/mmoe/optimizer.py diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py new file mode 100644 index 00000000..d5540908 --- /dev/null +++ b/examples/mmoe/config.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import enum +import os + +import tensorflow as tf +from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig +from npu_bridge.estimator.npu.npu_config import NPURunConfig + +from mx_rec.constants.constants import CacheModeEnum + +SSD_DATA_PATH = ["ssd_data"] + + +class LearningRateScheduler: + """ + LR Scheduler combining Polynomial Decay with Warmup at the beginning. + TF-based cond operations necessary for performance in graph mode. 
+ """ + + def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps): + self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32) + self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32) + self.decay_steps = tf.constant(decay_steps) + self.decay_end_step = decay_start_step + decay_steps # 65041 + self.poly_power = 2.0 + self.base_lr_dense = base_lr_dense + self.base_lr_sparse = base_lr_sparse + + def calc(self, global_step): + # used for the warmup stage + warmup_step = tf.cast(1 / self.warmup_steps, tf.float32) + lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step + lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32) + # used for the constant stage + lr_factor_constant = tf.cast(1.0, tf.float32) + + lr_sparse = self.base_lr_sparse * lr_factor_constant + lr_dense = self.base_lr_dense * lr_factor_constant + return lr_dense, lr_sparse + + +class Config: + def __init__(self, ): + self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None + tmp = os.getenv("TRAIN_RANK_SIZE") + if tmp is None: + raise ValueError("please export TRAIN_RANK_SIZE") + self.rank_size = int(tmp) + + self.data_path = os.getenv("DLRM_CRITEO_DATA_PATH") + self.train_file_pattern = "train" + self.test_file_pattern = "test" + + self.batch_size = 4096 + self.line_per_sample = 1 + self.train_epoch = 1 + self.test_epoch = 9 + self.perform_shuffle = False + + self.key_type = tf.int64 + self.label_type = tf.float32 + self.value_type = tf.int64 + + self.feat_cnt = 26 + self.__set_emb_table_size() + + self.field_num = 26 + self.send_count = 46000 // self.rank_size + + self.emb_dim = 8 + self.hashtable_threshold = 1 + + self.USE_PIPELINE_TEST = False + + # 动态学习率 + GLOBAL_BATCH_SIZE = 8192 * 8 + LR_SCHEDULE_STEPS = [ + int(2750 * 55296 / GLOBAL_BATCH_SIZE), + int(49315 * 55296 / GLOBAL_BATCH_SIZE), + int(27772 * 55296 / GLOBAL_BATCH_SIZE), + ] + self.global_step = tf.Variable(0, trainable=False) + _lr_scheduler = LearningRateScheduler( + 0.001, + 0.001, + LR_SCHEDULE_STEPS[0], + LR_SCHEDULE_STEPS[1], + LR_SCHEDULE_STEPS[2], + ) + self.learning_rate = _lr_scheduler.calc(self.global_step) + + def __set_emb_table_size(self): + self.cache_mode = os.getenv("CACHE_MODE") + if self.cache_mode is None: + raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") + + if self.cache_mode == CacheModeEnum.HBM.value: + self.dev_vocab_size = 14_000_000 * self.rank_size + self.host_vocab_size = 0 + elif self.cache_mode == CacheModeEnum.DDR.value: + self.dev_vocab_size = 500_000 * self.rank_size + self.host_vocab_size = 24_000_000 * self.rank_size + elif self.cache_mode == CacheModeEnum.SSD.value: + self.dev_vocab_size = 100_000 * self.rank_size + self.host_vocab_size = 2_000_000 * self.rank_size + self.ssd_vocab_size = 24_000_000 * self.rank_size + else: + raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") + + def get_emb_table_cfg(self): + if self.cache_mode == CacheModeEnum.HBM.value: + return {"device_vocabulary_size": self.dev_vocab_size} + elif self.cache_mode == CacheModeEnum.DDR.value: + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size} + elif self.cache_mode == CacheModeEnum.SSD.value: + return {"device_vocabulary_size": self.dev_vocab_size, + "host_vocabulary_size": self.host_vocab_size, + "ssd_vocabulary_size": self.ssd_vocab_size, + "ssd_data_path": SSD_DATA_PATH} + else: + raise 
RuntimeError(f"get CACHE_MODE:{self.cache_mode}, check Config.__set_emb_table_size implementation") + + +def sess_config(dump_data=False, dump_path="./dump_output", dump_steps="0|1|2"): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["mix_compile_mode"].b = False + custom_op.parameter_map["use_off_line"].b = True + custom_op.parameter_map["min_group_size"].b = 1 + # 可选配置level0:pairwise;level1:pairwise + custom_op.parameter_map["HCCL_algorithm"].s = tf.compat.as_bytes("level0:fullmesh;level1:fullmesh") + custom_op.parameter_map["enable_data_pre_proc"].b = True + custom_op.parameter_map["iterations_per_loop"].i = 10 + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["hcom_parallel"].b = False + custom_op.parameter_map["op_precision_mode"].s = tf.compat.as_bytes("op_impl_mode.ini") + custom_op.parameter_map["op_execute_timeout"].i = 2000 + custom_op.parameter_map["variable_memory_max_size"].s = tf.compat.as_bytes( + str(13 * 1024 * 1024 * 1024)) # total 31 need 13; + custom_op.parameter_map["graph_memory_max_size"].s = tf.compat.as_bytes(str(18 * 1024 * 1024 * 1024)) # need 25 + custom_op.parameter_map["stream_max_parallel_num"].s = tf.compat.as_bytes("DNN_VM_AICPU:3,AIcoreEngine:3") + + if dump_data: + custom_op.parameter_map["enable_dump"].b = True + custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes(dump_path) + custom_op.parameter_map["dump_step"].s = tf.compat.as_bytes(dump_steps) + custom_op.parameter_map["dump_mode"].s = tf.compat.as_bytes("all") + + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + return session_config + + +def get_npu_run_config(): + session_config = tf.ConfigProto(allow_soft_placement=False, + log_device_placement=False) + + session_config.gpu_options.allow_growth = True + custom_op = session_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + session_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + session_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + + run_config = NPURunConfig( + save_summary_steps=1000, + save_checkpoints_steps=100, + keep_checkpoint_max=5, + session_config=session_config, + log_step_count_steps=20, + precision_mode='allow_mix_precision', + enable_data_pre_proc=True, + iterations_per_loop=1, + jit_compile=False, + op_compiler_cache_mode="enable", + HCCL_algorithm="level0:fullmesh;level1:fullmesh" # 可选配置:level0:pairwise;level1:pairwise + ) + return run_config diff --git a/examples/mmoe/criteo.py b/examples/mmoe/criteo.py new file mode 100644 index 00000000..25f1d869 --- /dev/null +++ b/examples/mmoe/criteo.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import stat +import pickle +import argparse +import pandas as pd +import numpy as np +import tensorflow as tf +from tqdm import tqdm + +NAMES = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)] + + +def make_sub_file(lines, head, src_name, sub_dir_name, sub): + """Write sub-data. + + Args: + :param lines: A list. Several pieces of data. + :param head: A string. ['label', 'I1', 'I2', ...]. + :param src_name: A string. The name of data. + :param sub_dir_name: A string. + :param sub: A scalar(Int). Record the current number of sub file. + :return: sub + 1. + """ + root_path, file_path = os.path.split(src_name) + file_name, suffix = file_path.split('.') + split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix + split_file = os.path.join(root_path, sub_dir_name, split_file_name) + if not os.path.exists(os.path.join(root_path, sub_dir_name)): + os.mkdir(os.path.join(root_path, sub_dir_name)) + + modes = stat.S_IWUSR | stat.S_IRUSR + flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT + f = os.fdopen(os.open(split_file, flags, modes), 'w') + try: + f.writelines([head]) + f.writelines(lines) + return sub + 1 + finally: + f.close() + + +def split_byline_count(filename, count, sub_dir_name): + """Split File. + Note: You can specify how many rows of data each sub file contains. + Args: + :param filename: A string. + :param count: A scalar(int). + :param sub_dir_name: A string. + :return: + """ + f = open(filename, 'r') + try: + head = f.readline() + buf = [] + sub = 1 + for line in f: + buf.append(line) + if len(buf) == count: + sub = make_sub_file(buf, head, filename, sub_dir_name, sub) + buf = [] + if len(buf) != 0: + try: + make_sub_file(buf, head, filename, sub_dir_name, sub) + except FileNotFoundError as err: + raise FileNotFoundError("please check the filename of data") from err + finally: + f.close() + + +def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000): + """Get the list of split file path. + Note: Either parent_path or dataset_path must be valid. + If exists dataset_path + "/split", parent_path = dataset_path + "/split". + Args: + :param parent_path: A string. split file's parent path. + :param dataset_path: A string. + :param sample_num: A int. The sample number of every split file. + :return: A list. [file1_path, file2_path, ...] 
+ """ + sub_dir_name = 'split' + if parent_path is None and dataset_path is None: + raise ValueError('Please give parent path or file path.') + if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)): + parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) + elif parent_path is None or not os.path.exists(parent_path): + split_byline_count(dataset_path, sample_num, sub_dir_name) + parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) + split_file_name = os.listdir(parent_path) + split_file_name.sort() + split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt'] + return split_file_list + + +def get_fea_map(fea_map_path=None, split_file_list=None): + """Get feature map. + Note: Either parent_path or dataset_path must be valid. + If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid. + If fea_map_path is None and you want to build the feature map, + the default file path is the parent directory of split file + "fea_map.pkl". + Args: + :param fea_map_path: A string. + :param split_file_list: A list. [file1_path, file2_path, ...] + :return: A dict. {'C1':{}, 'C2':{}, ...} + """ + if fea_map_path is None and split_file_list is None: + raise ValueError('Please give feature map path or split file list.') + if fea_map_path is None and split_file_list is not None: + fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") + if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl': + with open(fea_map_path, 'rb') as f: + fea_map = pickle.load(f) + return fea_map + fea_map = {} + for file_open in tqdm(split_file_list): + f = open(file_open) + for line in f: + row = line.strip('\n').split('\t') + for i in range(14, 40): + if row[i] == '': + continue + name = NAMES[i] + fea_map.setdefault(name, {}) + if fea_map[name].get(row[i]) is None: + fea_map[name][row[i]] = len(fea_map[name]) + for j in range(1, 14): + if row[j] == '': + continue + name = NAMES[j] + fea_map.setdefault(name, {}) + fea_map[name].setdefault('min', float(row[j])) + fea_map[name].setdefault('max', float(row[j])) + fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j])) + fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j])) + f.close() + for i in range(14, 40): + fea_map[NAMES[i]]['-1'] = len(fea_map[NAMES[i]]) + fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") + + + modes = stat.S_IWUSR | stat.S_IRUSR + flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT + with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd: + pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL) + + return fea_map + + +def rec_kbins_discretizer(dat, n_bins, min_max_dict): + """Bin continuous data into intervals. + Note: The strategy is "uniform". + Args: + :param dat: A dataframe. + :param n_bins: A scalar(int). + :param min_max_dict: A dict such as {'min': , 'max': }. + :return: The new dataframe. 
+ """ + features = dat.columns + n_features = len(features) + bin_edges = np.zeros(n_features, dtype=object) + for idx, feature in enumerate(features): + bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1) + rtol = 1.e-5 + atol = 1.e-8 + eps = atol + rtol * np.abs(dat[feature]) + dat[feature] = np.digitize(dat[feature] + eps, bin_edges[idx][1:]) + return dat + + +def convert_input2tfrd(in_file_path, out_file_path): + """ + txt to tfrecords + """ + def make_example(label_list, dense_feat_list, sparse_feat_list): + dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1) + sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1) + label = np.array(label_list, dtype=np.int64).reshape(-1) + feature_dict = { + "dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), + "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), + "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) + } + example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) + + return example + + file_name = out_file_path + in_file_path[-12:-4] + '.tfrecord' + file_writer = tf.io.TFRecordWriter(file_name) + + with open(in_file_path, encoding='utf-8') as file_in: + + for _, line in tqdm(enumerate(file_in)): + + line = line.strip('\n') + items = line.split('\t') + if len(items) != 40: + continue + label = int(items[0]) + dense = items[1:14] + sparse = items[14:] + + ex = make_example(label, dense, sparse) + serialized = ex.SerializeToString() + file_writer.write(serialized) + + file_writer.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Get datasets') + parser.add_argument('--data_path') + parser.add_argument('--output_path') + + args, _ = parser.parse_known_args() + data_path = args.data_path + output_path = args.output_path + + # get txt_list + file_split_list = get_split_file_path(dataset_path=data_path) + # get feature_map + feature_map = get_fea_map(split_file_list=file_split_list) + + for file in tqdm(file_split_list): + + # read data + data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES) + # name feature + sparse_features = ['C' + str(i) for i in range(1, 27)] + dense_features = ['I' + str(i) for i in range(1, 14)] + # data processing + data_df[sparse_features] = data_df[sparse_features].fillna('-1') + data_df[dense_features] = data_df[dense_features].fillna(0) + # sparse feature: mapping + for col in sparse_features: + try: + data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) + except KeyError as e: + raise KeyError("Feature {} not found in dataset".format(col)) from e + # dense feature: Bin continuous data into intervals. 
+ data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) + # add offsets + slot_size_array = [ + 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, + 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, + 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573 + ] + offset_size_list = np.cumsum([0] + slot_size_array[:-1]) + for col_index in range(1, len(offset_size_list) + 1): + data_df.iloc[:, col_index] += offset_size_list[col_index - 1] + # save to txt + data_df.to_csv(file, sep='\t', index=False, header=False) + # txt to tfrecords + convert_input2tfrd(in_file_path=file, out_file_path=output_path) + + + + + diff --git a/examples/mmoe/delay_loss_scale.py b/examples/mmoe/delay_loss_scale.py new file mode 100644 index 00000000..f73baf68 --- /dev/null +++ b/examples/mmoe/delay_loss_scale.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf +from tensorflow.python.training import optimizer + +from config import Config + + +class DenseLossScaleOptimizer: + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + if not isinstance(opt, optimizer.Optimizer): + raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + self._optimizer = opt + self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + _update_lr_loss_scale(self._optimizer, loss_scale) + + def compute_gradients(self, loss, var_list=None): + return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) + + def apply_gradients(self, avg_grads): + return self._optimizer.apply_gradients(avg_grads) + + +class SparseLossScaleOptimizer: + def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: + if not isinstance(opt, optimizer.Optimizer): + raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) + self._optimizer = opt + self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + _update_lr_loss_scale(self._optimizer, loss_scale) + + def compute_gradients(self, loss, var_list=None): + return tf.gradients(loss * self._loss_scale, var_list) + + def apply_gradients(self, grads_and_vars): + return self._optimizer.apply_gradients(grads_and_vars) + + +def _update_lr_loss_scale(opt, loss_scale): + if loss_scale <= 0: + raise RuntimeError("the loss_scale must be greater than zero.") + loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) + if hasattr(opt, "_lr"): + # LazyAdam or Adam optimizer + opt._lr = opt._lr / loss_scale + elif hasattr(opt, "_learning_rate"): + # SGD optimizer + opt._learning_rate = opt._learning_rate / loss_scale + else: + raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") \ No newline at end of file diff --git a/examples/mmoe/gradient_descent_w.py 
b/examples/mmoe/gradient_descent_w.py
new file mode 100644
index 00000000..53adb996
--- /dev/null
+++ b/examples/mmoe/gradient_descent_w.py
@@ -0,0 +1,71 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import defaultdict
+
+import tensorflow as tf
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import gradient_descent
+from mx_rec.optimizers.base import CustomizedOptimizer
+from mx_rec.util.log import logger
+from mx_rec.util.initialize import ConfigInitializer
+
+
+def create_hash_optimizer(learning_rate, weight_decay=0.0001, use_locking=False, name="GradientDescent"):
+    optimizer = CustomizedGradientDescentWithWeightDecay(learning_rate=learning_rate,
+                                                         weight_decay=weight_decay,
+                                                         use_locking=use_locking,
+                                                         name=name)
+    ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer
+    return optimizer
+
+
+class CustomizedGradientDescentWithWeightDecay(gradient_descent.GradientDescentOptimizer, CustomizedOptimizer):
+    name_counter = defaultdict(int)
+
+    def __init__(self, learning_rate, weight_decay, use_locking=False, name="GradientDescent"):
+        self.optimizer_type = "gradient_descent_with_weight_decay"
+        self.weight_decay = weight_decay
+        super(CustomizedGradientDescentWithWeightDecay, self)._get_name(name=name)
+        super(CustomizedGradientDescentWithWeightDecay, self).__init__(
+            learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name
+        )
+        self._slot_num = 0
+        self._derivative = 1
+
+    def get_slot_init_values(self):
+        logger.info("no slot for gradient descent")
+        return []
+
+    def _apply_sparse_duplicate_indices(self, grad, var):
+        logger.debug(">>>> Enter _apply_sparse_duplicate_indices")
+        nd_indices = tf.expand_dims(grad.indices, 1)
+        logger.info(f"weight_decay={self.weight_decay}")
+        if self.weight_decay is None:
+            nd_value = grad.values * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
+        else:
+            nd_value = (grad.values + math_ops.cast(self.weight_decay, var.dtype.base_dtype) *
+                        tf.gather(var, grad.indices)) * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
+        var_update_op = tf.scatter_nd_add(var, nd_indices, -nd_value, use_locking=self._use_locking)
+        return var_update_op
+
+    def _apply_dense(self, grad, var):
+        logger.debug(">>>> Enter _apply_dense")
+        raise NotImplementedError("You are using a wrong type of variable.")
diff --git a/examples/mmoe/mean_auc.py b/examples/mmoe/mean_auc.py
new file mode 100644
index 00000000..ff57df00
--- /dev/null
+++ b/examples/mmoe/mean_auc.py
@@ -0,0 +1,40 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import os
+from glob import glob
+import numpy as np
+
+
+def split_auc(log_input):
+    with open(log_input, 'r') as log:
+        all_auc = []
+        for line in log.readlines():
+            if 'Test' in line:
+                all_auc.append(float(line.split(';')[0].split(':')[-1].strip()))
+    all_auc_len = len(all_auc)
+    all_auc_arr = np.array(all_auc)[:all_auc_len - all_auc_len % 8]
+    test_auc = np.mean(all_auc_arr.reshape(-1, 8), axis=-1)
+    return test_auc
+
+
+log_path_all = 'latest_*.log'
+log_path_list = glob(log_path_all)
+
+for log_path in log_path_list:
+    print(os.path.basename(log_path))
+    print(split_auc(log_path))
+    print('*'*20)
\ No newline at end of file
diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py
index 0046d2fd..5b1917a3 100644
--- a/examples/mmoe/model.py
+++ b/examples/mmoe/model.py
@@ -29,7 +29,7 @@ PRED_OP_NAME = "pred"
 
 
 class MyModel:
-    def __init__(self, expert_num =8, expert_size=16, tower_size=8, gate_num = 2):
+    def __init__(self, expert_num=8, expert_size=16, tower_size=8, gate_num=2):
 
         self.expert_num = expert_num
@@ -40,9 +40,9 @@ class MyModel:
     def expert_layer(self, input):
         param_expert = []
         for i in range(0, self.expert_num):
-            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name = f'expert_payer_{i}',
-                                            kernel_initializer = tf.constant_initializer(value=0.1),
-                                            bias_initializer = tf.constant_initializer(value=0.1))
+            expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}',
+                                            kernel_initializer=tf.constant_initializer(value=0.1),
+                                            bias_initializer=tf.constant_initializer(value=0.1))
 
             param_expert.append(expert_linear)
@@ -51,22 +51,23 @@ class MyModel:
     def gate_layer(self, input):
         param_gate = []
         for i in range(0, self.gate_num):
-            gate_linear = tf.layers.dense(input, units=self.expert_num, activation=None, name = f'gate_payer_{i}',
-                                          kernel_initializer = tf.constant_initializer(value=0.1),
-                                          bias_initializer = tf.constant_initializer(value=0.1))
+            gate_linear = tf.layers.dense(input, units=self.expert_num, activation=None, name=f'gate_payer_{i}',
+                                          kernel_initializer=tf.constant_initializer(value=0.1),
+                                          bias_initializer=tf.constant_initializer(value=0.1))
 
             param_gate.append(gate_linear)
         return param_gate
 
 
     def tower_layer(self, input, layer_name):
-        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name = f'tower_payer_{layer_name}',
-                                       kernel_initializer = tf.constant_initializer(value=0.1),
-                                       bias_initializer = tf.constant_initializer(value=0.1))
+        tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}',
+                                       kernel_initializer=tf.constant_initializer(value=0.1),
+                                       bias_initializer=tf.constant_initializer(value=0.1))
 
-        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name = f'tower_payer_out_{layer_name}',
-                                           kernel_initializer = tf.constant_initializer(value=0.1),
-                                           bias_initializer = tf.constant_initializer(value=0.1))
+        tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None,
+                                           name=f'tower_payer_out_{layer_name}',
+                                           kernel_initializer=tf.constant_initializer(value=0.1),
+                                           bias_initializer=tf.constant_initializer(value=0.1))
 
         return tower_linear_out
diff --git a/examples/mmoe/op_impl_mode.ini b/examples/mmoe/op_impl_mode.ini
new file mode 100644
index 00000000..579dea43
--- /dev/null
+++ b/examples/mmoe/op_impl_mode.ini
@@ -0,0 +1 @@
+ScatterNdAdd=support_out_of_bound_index
\ No newline at end of file
diff --git a/examples/mmoe/optimizer.py b/examples/mmoe/optimizer.py
new file mode 100644
index 00000000..2c7685bb
--- /dev/null
+++ b/examples/mmoe/optimizer.py
@@ -0,0 +1,35 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import tensorflow as tf
+from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer
+from mx_rec.util.initialize import ConfigInitializer
+from mx_rec.optimizers.lazy_adam import create_hash_optimizer
+from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address
+
+
+def get_dense_and_sparse_optimizer(cfg):
+    dense_optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0])
+    use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion
+    if use_dynamic_expansion:
+        sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1])
+    else:
+        sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1])
+    loss_scale = 1
+    sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale)
+    dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale)
+
+    return dense_optimizer, sparse_optimizer
-- 
Gitee
From fe7073494d499d161e16ce826175f744a17336eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 22:50:39 +0800
Subject: [PATCH 289/302] =?UTF-8?q?mmoe=20=E5=90=8A=E8=B5=B7=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/main_mxrec.py | 469 ++++++++++++++++++++++++++++++++++++
 1 file changed, 469 insertions(+)
 create mode 100644 examples/mmoe/main_mxrec.py

diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py
new file mode 100644
index 00000000..51ed7c4a
--- /dev/null
+++ b/examples/mmoe/main_mxrec.py
@@ -0,0 +1,469 @@
+# coding=utf-8
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import os +import shutil +import time +import warnings +import random +from glob import glob + +import tensorflow as tf +from sklearn.metrics import roc_auc_score +import numpy as np + +from optimizer import get_dense_and_sparse_optimizer +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum +from model import MyModel +from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET +from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func +from mx_rec.core.asc.manager import start_asc_pipeline +from mx_rec.core.embedding import create_table, sparse_lookup +from mx_rec.core.feature_process import EvictHook +from mx_rec.graph.modifier import modify_graph_and_start_emb_cache, GraphModifierHook +from mx_rec.constants.constants import ASCEND_TIMESTAMP +from mx_rec.util.initialize import ConfigInitializer, init, terminate_config_initializer +from mx_rec.util.ops import import_host_pipeline_ops +import mx_rec.util as mxrec_util +from mx_rec.util.variable import get_dense_and_sparse_variable +from mx_rec.util.log import logger +from npu_bridge.npu_init import * + +npu_plugin.set_device_sat_mode(0) + +dense_hashtable_seed = 128 +sparse_hashtable_seed = 128 +shuffle_seed = 128 +random.seed(shuffle_seed) + + +def add_timestamp_func(batch): + timestamp = import_host_pipeline_ops().return_timestamp(tf.cast(batch['label'], dtype=tf.int64)) + # tf.constant(np.random.randint(1,1688109060,1)), tf.int64)) + batch["timestamp"] = timestamp + return batch + + +def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, is_use_faae=False): + if config.USE_PIPELINE_TEST: + num_parallel = 1 + else: + num_parallel = 8 + + def extract_fn(data_record): + features = { + # Extract features using the keys set during creation + 'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64), + 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64), + 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32), + } + sample = tf.compat.v1.parse_single_example(data_record, features) + return sample + + def reshape_fn(batch): + batch['label'] = tf.reshape(batch['label'], [-1, 1]) + batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13]) + batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0) + batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26]) + return batch + + if is_training: + files_list = glob(os.path.join(config.data_path, config.train_file_pattern) + '/*.tfrecord') + else: + files_list = glob(os.path.join(config.data_path, config.test_file_pattern) + '/*.tfrecord') + dataset = tf.data.TFRecordDataset(files_list, num_parallel_reads=num_parallel) + batch_size = config.batch_size // config.line_per_sample + + dataset = dataset.shard(config.rank_size, config.rank_id) + if is_training: + dataset = dataset.shuffle(batch_size * 1000, seed=shuffle_seed) + if is_training: + dataset = 
dataset.repeat(config.train_epoch) + else: + dataset = dataset.repeat(config.test_epoch) + dataset = dataset.map(extract_fn, num_parallel_calls=num_parallel).batch(batch_size, + drop_remainder=True) + dataset = dataset.map(reshape_fn, num_parallel_calls=num_parallel) + if is_use_faae: + dataset = dataset.map(add_timestamp_func) + + if not MODIFY_GRAPH_FLAG: + insert_fn = get_asc_insert_func(tgt_key_specs=feature_spec_list, is_training=is_training, dump_graph=dump_graph) + dataset = dataset.map(insert_fn) + + dataset = dataset.prefetch(100) + + iterator = dataset.make_initializable_iterator() + batch = iterator.get_next() + return batch, iterator + + +def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph): + embedding_list = [] + logger.debug(f"In model_forward function, is_train: {is_train}, feature_list: {len(feature_list)}, " + f"hash_table_list: {len(hash_table_list)}") + for feature, hash_table in zip(feature_list, hash_table_list): + if MODIFY_GRAPH_FLAG: + feature = batch["sparse_feature"] + embedding = sparse_lookup(hash_table, feature, cfg.send_count, dim=None, is_train=is_train, + name="user_embedding_lookup", modify_graph=modify_graph, batch=batch, + access_and_evict_config=None) + embedding_list.append(embedding) + + if len(embedding_list) == 1: + emb = embedding_list[0] + elif len(embedding_list) > 1: + emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False) + else: + raise ValueError("the length of embedding_list must be greater than or equal to 1.") + my_model = MyModel() + model_output = my_model.build_model(embedding=emb, + dense_feature=batch["dense_feature"], + label=batch["label"], + is_training=is_train, + seed=dense_hashtable_seed) + return model_output + + +def evaluate(): + print("read_test dataset") + if not MODIFY_GRAPH_FLAG: + eval_label = eval_model.get("label") + sess.run([eval_iterator.initializer]) + else: + # 在sess run模式下,若还是使用原来batch中的label去sess run,则会出现getnext超时报错,需要使用新数据集中的batch + eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label") + sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) + log_loss_list = [] + pred_list = [] + label_list = [] + eval_current_steps = 0 + finished = False + print("eval begin") + + while not finished: + try: + eval_current_steps += 1 + eval_start = time.time() + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label]) + eval_cost = time.time() - eval_start + qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size + log_loss_list += list(eval_loss.reshape(-1)) + pred_list += list(pred.reshape(-1)) + label_list += list(label.reshape(-1)) + print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") + if eval_current_steps == eval_steps: + finished = True + except tf.errors.OutOfRangeError: + finished = True + auc = roc_auc_score(label_list, pred_list) + mean_log_loss = np.mean(log_loss_list) + return auc, mean_log_loss + + +def evaluate_fix(step): + print("read_test dataset evaluate_fix") + if not MODIFY_GRAPH_FLAG: + sess.run([eval_iterator.initializer]) + else: + sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) + log_loss_list = [] + pred_list = [] + label_list = [] + eval_current_steps = 0 + finished = False + print("eval begin") + while not finished: + try: + eval_current_steps += 1 + eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_model.get("label")]) + log_loss_list += 
list(eval_loss.reshape(-1)) + pred_list += list(pred.reshape(-1)) + label_list += list(label.reshape(-1)) + print(f"eval current_steps: {eval_current_steps}") + + if eval_current_steps == eval_steps: + finished = True + except tf.errors.OutOfRangeError: + finished = True + + label_numpy = np.array(label_list) + pred_numpy = np.array(pred_list) + if not os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}"): + os.makedirs(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}") + + if os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy"): + os.remove(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy") + if os.path.exists(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy"): + os.remove(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy") + if os.path.exists(f"flag_{rank_id}.txt"): + os.remove(f"flag_{rank_id}.txt") + np.save(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/label_{rank_id}.npy", label_numpy) + np.save(os.path.abspath(".") + f"/interval_{interval}/numpy_{step}/pred_{rank_id}.npy", pred_numpy) + os.mknod(f"flag_{rank_id}.txt") + while True: + file_exists_list = [os.path.exists(f"flag_{i}.txt") for i in range(rank_size)] + if sum(file_exists_list) == rank_size: + print("All saved!!!!!!!!!!") + break + else: + print("Waitting for saving numpy!!!!!!!!") + time.sleep(1) + continue + + auc = roc_auc_score(label_list, pred_list) + mean_log_loss = np.mean(log_loss_list) + return auc, mean_log_loss + + +def create_feature_spec_list(use_timestamp=False): + access_threshold = None + eviction_threshold = None + if use_timestamp: + access_threshold = 1000 + eviction_threshold = 180 + + feature_spec_list = [FeatureSpec("sparse_feature", table_name="sparse_embeddings", batch_size=cfg.batch_size, + access_threshold=access_threshold, eviction_threshold=eviction_threshold)] + if use_multi_lookup: + feature_spec_list.append(FeatureSpec("sparse_feature", table_name="sparse_embeddings", + batch_size=cfg.batch_size, + access_threshold=access_threshold, + eviction_threshold=eviction_threshold)) + if use_timestamp: + feature_spec_list.append(FeatureSpec("timestamp", is_timestamp=True)) + return feature_spec_list + + +def _del_related_dir(del_path: str) -> None: + if not os.path.isabs(del_path): + del_path = os.path.join(os.getcwd(), del_path) + dirs = glob(del_path) + for sub_dir in dirs: + shutil.rmtree(sub_dir, ignore_errors=True) + logger.info(f"Delete dir:{sub_dir}") + + +def _clear_saved_model() -> None: + _del_related_dir("/root/ascend/log/*") + _del_related_dir("kernel*") + _del_related_dir("model_dir_rank*") + _del_related_dir("op_cache") + + if os.getenv("CACHE_MODE", "") != CacheModeEnum.SSD.value: + return + logger.info("Current cache mode is SSD, and file overwrite is not allowed in SSD mode, deleting exist directory" + " then create empty directory for this use case.") + for sub_path in SSD_DATA_PATH: + _del_related_dir(sub_path) + os.makedirs(sub_path, mode=0o550, exist_ok=True) + logger.info(f"Create dir:{sub_path}") + + +if __name__ == "__main__": + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) + warnings.filterwarnings("ignore") + _clear_saved_model() + + rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None + rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None + interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None + train_steps = 10000 + 
eval_steps = 1360
+
+    try:
+        use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0)))
+        use_multi_lookup = bool(int(os.getenv("USE_MULTI_LOOKUP", 0)))
+        MODIFY_GRAPH_FLAG = bool(int(os.getenv("USE_MODIFY_GRAPH", 0)))
+        use_faae = bool(int(os.getenv("USE_FAAE", 0)))
+    except ValueError as err:
+        raise ValueError("please correctly configure USE_DYNAMIC_EXPANSION, USE_MULTI_LOOKUP, USE_FAAE "
+                         "or USE_MODIFY_GRAPH; only 0 or 1 is supported.") from err
+
+    use_dynamic = bool(int(os.getenv("USE_DYNAMIC", 0)))
+    logger.info(f"USE_DYNAMIC:{use_dynamic}")
+    init(train_steps=train_steps, eval_steps=eval_steps,
+         use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion)
+    IF_LOAD = False
+    rank_id = mxrec_util.communication.hccl_ops.get_rank_id()
+    filelist = glob("./saved-model/sparse-model-0")
+    if filelist:
+        IF_LOAD = True
+    ConfigInitializer.get_instance().if_load = IF_LOAD
+
+    cfg = Config()
+    feature_spec_list_train = None
+    feature_spec_list_eval = None
+    if use_faae:
+        feature_spec_list_train = create_feature_spec_list(use_timestamp=True)
+        feature_spec_list_eval = create_feature_spec_list(use_timestamp=True)
+    else:
+        feature_spec_list_train = create_feature_spec_list(use_timestamp=False)
+        feature_spec_list_eval = create_feature_spec_list(use_timestamp=False)
+
+    train_batch, train_iterator = make_batch_and_iterator(cfg, feature_spec_list_train, is_training=True,
+                                                          dump_graph=True, is_use_faae=use_faae)
+    eval_batch, eval_iterator = make_batch_and_iterator(cfg, feature_spec_list_eval, is_training=False,
+                                                        dump_graph=False, is_use_faae=use_faae)
+    logger.info(f"train_batch: {train_batch}")
+
+    if use_faae:
+        cfg.dev_vocab_size = cfg.dev_vocab_size // 2
+
+    optimizer_list = [get_dense_and_sparse_optimizer(cfg)]
+
+    # note: variance_scaling_initializer only supports HBM mode
+    emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \
+        if cfg.cache_mode != "HBM" or use_dynamic_expansion else \
+        tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed)
+    sparse_hashtable = create_table(
+        key_dtype=cfg.key_type,
+        dim=tf.TensorShape([cfg.emb_dim]),
+        name="sparse_embeddings",
+        emb_initializer=emb_initializer,
+        **cfg.get_emb_table_cfg()
+    )
+    if use_faae:
+        tf.compat.v1.add_to_collection(ASCEND_TIMESTAMP, train_batch["timestamp"])
+
+    sparse_hashtable_list = [sparse_hashtable, sparse_hashtable] if use_multi_lookup else [sparse_hashtable]
+    train_model = model_forward(feature_spec_list_train, sparse_hashtable_list, train_batch,
+                                is_train=True, modify_graph=MODIFY_GRAPH_FLAG)
+    eval_model = model_forward(feature_spec_list_eval, sparse_hashtable_list, eval_batch,
+                               is_train=False, modify_graph=MODIFY_GRAPH_FLAG)
+
+    dense_variables, sparse_variables = get_dense_and_sparse_variable()
+    trainable_variables = []
+    trainable_variables.extend(dense_variables)
+    if use_dynamic_expansion:
+        trainable_variables.append(tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_LOCAL_EMB)[0])
+    else:
+        trainable_variables.extend(sparse_variables)
+    rank_size = mxrec_util.communication.hccl_ops.get_rank_size()
+    train_ops = []
+    # multi task training
+    for loss, (dense_optimizer, sparse_optimizer) in zip([train_model.get("loss")], optimizer_list):
+        # do dense optimization
+        grads = dense_optimizer.compute_gradients(loss, var_list=trainable_variables)
+        avg_grads = []
+        for grad, var in grads[:-1]:
+            if rank_size > 1:
+                grad = hccl_ops.allreduce(grad, "sum") if grad is not None else None
+            if grad is not None:
+                avg_grads.append((grad / 8.0, var))
+        # apply gradients: update variables
+        train_ops.append(dense_optimizer.apply_gradients(avg_grads))
+
+        if use_dynamic_expansion:
+            train_address_list = tf.compat.v1.get_collection(ASCEND_SPARSE_LOOKUP_ID_OFFSET)
+            # do sparse optimization by addr
+            sparse_grads = list(grads[-1])  # local_embedding
+            grads_and_vars = [(grad, address) for grad, address in zip(sparse_grads, train_address_list)]
+            train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
+        else:
+            # do sparse optimization
+            sparse_grads = list(grads[-1])
+            print("sparse_grads_tensor:", sparse_grads)
+            grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)]
+            train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars))
+
+    # dynamic learning-rate update
+    train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]])
+
+    with tf.control_dependencies(train_ops):
+        train_ops = tf.no_op()
+    cfg.learning_rate = [cfg.learning_rate[0], cfg.learning_rate[1]]
+
+    saver = tf.train.Saver()
+    if MODIFY_GRAPH_FLAG:
+        modify_graph_and_start_emb_cache(dump_graph=True)
+    else:
+        start_asc_pipeline()
+
+    hook_list = []
+    if use_faae:
+        hook_evict = EvictHook(evict_enable=True, evict_time_interval=120)
+        hook_list.append(hook_evict)
+    if MODIFY_GRAPH_FLAG:  # add a hook in this scenario to handle graph-validation issues
+        hook_list.append(GraphModifierHook(modify_graph=False))
+
+    # with tf.compat.v1.Session(config=sess_config(dump_data=False)) as sess:
+    if use_faae:
+        sess = tf.compat.v1.train.MonitoredTrainingSession(
+            hooks=hook_list,
+            config=sess_config(dump_data=False)
+        )
+        sess.graph._unsafe_unfinalize()
+        if not MODIFY_GRAPH_FLAG:
+            sess.run(train_iterator.initializer)
+        else:
+            sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True))
+    else:
+        sess = tf.compat.v1.Session(config=sess_config(dump_data=False))
+        sess.run(tf.compat.v1.global_variables_initializer())
+        if not MODIFY_GRAPH_FLAG:
+            sess.run(train_iterator.initializer)
+        else:
+            sess.run(ConfigInitializer.get_instance().train_params_config.get_initializer(True))
+
+    epoch = 0
+    cost_sum = 0
+    qps_sum = 0
+    best_auc = 0
+    iteration_per_loop = 10
+
+    train_ops = util.set_iteration_per_loop(sess, train_ops, iteration_per_loop)
+
+    # for i in range(1, TRAIN_STEPS):
+    i = 0
+    while True:
+        i += 1
+        logger.info(f"################ training at step {i * iteration_per_loop} ################")
+        start_time = time.time()
+
+        try:
+            grad, loss = sess.run([train_ops, train_model.get("loss")])
+            lr = sess.run(cfg.learning_rate)
+            global_step = sess.run(cfg.global_step)
+        except tf.errors.OutOfRangeError:
+            logger.info("Encountered the end of sequence for training.")
+            break
+
+        end_time = time.time()
+        cost_time = end_time - start_time
+        qps = (1 / cost_time) * rank_size * cfg.batch_size * iteration_per_loop
+        cost_sum += cost_time
+        logger.info(f"step: {i * iteration_per_loop}; training loss: {loss}")
+        logger.info(f"step: {i * iteration_per_loop}; grad: {grad}")
+        logger.info(f"step: {i * iteration_per_loop}; lr: {lr}")
+        logger.info(f"global step: {global_step}")
+        logger.info(f"step: {i * iteration_per_loop}; current sess cost time: {cost_time:.10f}; current QPS: {qps}")
+        logger.info(f"training at step:{i * iteration_per_loop}, table[{sparse_hashtable.table_name}], "
+                    f"table size:{sparse_hashtable.size()}, table capacity:{sparse_hashtable.capacity()}")
+
+        if i % (train_steps // iteration_per_loop) == 0:
+            if interval is not None:
+                test_auc, test_mean_log_loss = evaluate_fix(i * iteration_per_loop)
+            else:
+                test_auc, test_mean_log_loss = evaluate()
+            print("Test auc: {}; log_loss: {} ".format(test_auc, test_mean_log_loss))
+            best_auc = max(best_auc, test_auc)
+            logger.info(f"training step: {i * iteration_per_loop}, best auc: {best_auc}")
+
+    sess.close()
+
+    terminate_config_initializer()
+    logger.info("Demo done!")
-- 
Gitee
From 769164b3b7aff7766e4ffbec81e4766b13d75032 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com>
Date: Mon, 22 Jul 2024 23:38:14 +0800
Subject: [PATCH 290/302] =?UTF-8?q?=E9=85=8D=E7=BD=AE=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E4=BF=AE=E6=94=B9=EF=BC=8C=E5=85=A5=E5=8F=A3=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/mmoe/config.py     |  31 ++--
 examples/mmoe/criteo.py     | 273 ------------------------------------
 examples/mmoe/main_mxrec.py |  59 ++++----
 3 files changed, 51 insertions(+), 312 deletions(-)
 delete mode 100644 examples/mmoe/criteo.py

diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py
index d5540908..b87bc11b 100644
--- a/examples/mmoe/config.py
+++ b/examples/mmoe/config.py
@@ -42,10 +42,6 @@ class LearningRateScheduler:
         self.base_lr_sparse = base_lr_sparse
 
-    def calc(self, global_step):
-        # used for the warmup stage
-        warmup_step = tf.cast(1 / self.warmup_steps, tf.float32)
-        lr_factor_warmup = 1 - tf.cast(self.warmup_steps - global_step, tf.float32) * warmup_step
-        lr_factor_warmup = tf.cast(lr_factor_warmup, tf.float32)
+    def calc(self):
         # used for the constant stage
         lr_factor_constant = tf.cast(1.0, tf.float32)
 
@@ -66,10 +62,15 @@ class Config:
         self.train_file_pattern = "train"
         self.test_file_pattern = "test"
 
-        self.batch_size = 4096
+        self.batch_size = 32
         self.line_per_sample = 1
-        self.train_epoch = 1
-        self.test_epoch = 9
+        self.train_epoch = 100
+        self.test_epoch = 100
+        self.expert_num = 8
+        self.gate_num = 2
+        self.expert_size = 16
+        self.tower_size = 8
+
         self.perform_shuffle = False
 
         self.key_type = tf.int64
@@ -82,7 +83,7 @@ class Config:
         self.field_num = 26
         self.send_count = 46000 // self.rank_size
 
-        self.emb_dim = 8
+        self.emb_dim = self.expert_num * self.expert_size + self.gate_num * self.expert_num
         self.hashtable_threshold = 1
 
         self.USE_PIPELINE_TEST = False
@@ -102,7 +103,7 @@ class Config:
             LR_SCHEDULE_STEPS[1],
             LR_SCHEDULE_STEPS[2],
         )
-        self.learning_rate = _lr_scheduler.calc(self.global_step)
+        self.learning_rate = _lr_scheduler.calc()
 
     def __set_emb_table_size(self):
         self.cache_mode = os.getenv("CACHE_MODE")
@@ -110,15 +111,15 @@ class Config:
             raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]")
 
         if self.cache_mode == CacheModeEnum.HBM.value:
-            self.dev_vocab_size = 14_000_000 * self.rank_size
+            self.dev_vocab_size = 1000 * self.rank_size
             self.host_vocab_size = 0
         elif self.cache_mode == CacheModeEnum.DDR.value:
-            self.dev_vocab_size = 500_000 * self.rank_size
-            self.host_vocab_size = 24_000_000 * self.rank_size
+            self.dev_vocab_size = 1000 * self.rank_size
+            self.host_vocab_size = 1000 * self.rank_size
         elif self.cache_mode == CacheModeEnum.SSD.value:
-            self.dev_vocab_size = 100_000 * self.rank_size
-            self.host_vocab_size = 2_000_000 * self.rank_size
-            self.ssd_vocab_size = 24_000_000 * self.rank_size
+            self.dev_vocab_size = 1000 * self.rank_size
+            self.host_vocab_size = 1000 * self.rank_size
+            self.ssd_vocab_size = 1000 * self.rank_size
         else:
            raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]")
 
diff --git a/examples/mmoe/criteo.py
b/examples/mmoe/criteo.py deleted file mode 100644 index 25f1d869..00000000 --- a/examples/mmoe/criteo.py +++ /dev/null @@ -1,273 +0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -import os -import stat -import pickle -import argparse -import pandas as pd -import numpy as np -import tensorflow as tf -from tqdm import tqdm - -NAMES = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)] - - -def make_sub_file(lines, head, src_name, sub_dir_name, sub): - """Write sub-data. - - Args: - :param lines: A list. Several pieces of data. - :param head: A string. ['label', 'I1', 'I2', ...]. - :param src_name: A string. The name of data. - :param sub_dir_name: A string. - :param sub: A scalar(Int). Record the current number of sub file. - :return: sub + 1. - """ - root_path, file_path = os.path.split(src_name) - file_name, suffix = file_path.split('.') - split_file_name = file_name + "_" + str(sub).zfill(2) + "." + suffix - split_file = os.path.join(root_path, sub_dir_name, split_file_name) - if not os.path.exists(os.path.join(root_path, sub_dir_name)): - os.mkdir(os.path.join(root_path, sub_dir_name)) - - modes = stat.S_IWUSR | stat.S_IRUSR - flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT - f = os.fdopen(os.open(split_file, flags, modes), 'w') - try: - f.writelines([head]) - f.writelines(lines) - return sub + 1 - finally: - f.close() - - -def split_byline_count(filename, count, sub_dir_name): - """Split File. - Note: You can specify how many rows of data each sub file contains. - Args: - :param filename: A string. - :param count: A scalar(int). - :param sub_dir_name: A string. - :return: - """ - f = open(filename, 'r') - try: - head = f.readline() - buf = [] - sub = 1 - for line in f: - buf.append(line) - if len(buf) == count: - sub = make_sub_file(buf, head, filename, sub_dir_name, sub) - buf = [] - if len(buf) != 0: - try: - make_sub_file(buf, head, filename, sub_dir_name, sub) - except FileNotFoundError as err: - raise FileNotFoundError("please check the filename of data") from err - finally: - f.close() - - -def get_split_file_path(parent_path=None, dataset_path=None, sample_num=4600000): - """Get the list of split file path. - Note: Either parent_path or dataset_path must be valid. - If exists dataset_path + "/split", parent_path = dataset_path + "/split". - Args: - :param parent_path: A string. split file's parent path. - :param dataset_path: A string. - :param sample_num: A int. The sample number of every split file. - :return: A list. [file1_path, file2_path, ...] 
- """ - sub_dir_name = 'split' - if parent_path is None and dataset_path is None: - raise ValueError('Please give parent path or file path.') - if parent_path is None and os.path.exists(os.path.join(os.path.dirname(dataset_path), sub_dir_name)): - parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) - elif parent_path is None or not os.path.exists(parent_path): - split_byline_count(dataset_path, sample_num, sub_dir_name) - parent_path = os.path.join(os.path.dirname(dataset_path), sub_dir_name) - split_file_name = os.listdir(parent_path) - split_file_name.sort() - split_file_list = [parent_path + "/" + file_name for file_name in split_file_name if file_name[-3:] == 'txt'] - return split_file_list - - -def get_fea_map(fea_map_path=None, split_file_list=None): - """Get feature map. - Note: Either parent_path or dataset_path must be valid. - If exists dir(split_file_list[0]) + "/fea_map.pkl", fea_map_path is valid. - If fea_map_path is None and you want to build the feature map, - the default file path is the parent directory of split file + "fea_map.pkl". - Args: - :param fea_map_path: A string. - :param split_file_list: A list. [file1_path, file2_path, ...] - :return: A dict. {'C1':{}, 'C2':{}, ...} - """ - if fea_map_path is None and split_file_list is None: - raise ValueError('Please give feature map path or split file list.') - if fea_map_path is None and split_file_list is not None: - fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") - if os.path.exists(fea_map_path) and fea_map_path[-3:] == 'pkl': - with open(fea_map_path, 'rb') as f: - fea_map = pickle.load(f) - return fea_map - fea_map = {} - for file_open in tqdm(split_file_list): - f = open(file_open) - for line in f: - row = line.strip('\n').split('\t') - for i in range(14, 40): - if row[i] == '': - continue - name = NAMES[i] - fea_map.setdefault(name, {}) - if fea_map[name].get(row[i]) is None: - fea_map[name][row[i]] = len(fea_map[name]) - for j in range(1, 14): - if row[j] == '': - continue - name = NAMES[j] - fea_map.setdefault(name, {}) - fea_map[name].setdefault('min', float(row[j])) - fea_map[name].setdefault('max', float(row[j])) - fea_map[name]['min'] = min(fea_map[name]['min'], float(row[j])) - fea_map[name]['max'] = max(fea_map[name]['max'], float(row[j])) - f.close() - for i in range(14, 40): - fea_map[NAMES[i]]['-1'] = len(fea_map[NAMES[i]]) - fea_map_path = os.path.join(os.path.dirname(split_file_list[0]), "fea_map.pkl") - - - modes = stat.S_IWUSR | stat.S_IRUSR - flags = os.O_WRONLY | os.O_TRUNC | os.O_CREAT - with os.fdopen(os.open(fea_map_path, flags, modes), 'wb') as fd: - pickle.dump(fea_map, fd, pickle.HIGHEST_PROTOCOL) - - return fea_map - - -def rec_kbins_discretizer(dat, n_bins, min_max_dict): - """Bin continuous data into intervals. - Note: The strategy is "uniform". - Args: - :param dat: A dataframe. - :param n_bins: A scalar(int). - :param min_max_dict: A dict such as {'min': , 'max': }. - :return: The new dataframe. 
- """ - features = dat.columns - n_features = len(features) - bin_edges = np.zeros(n_features, dtype=object) - for idx, feature in enumerate(features): - bin_edges[idx] = np.linspace(min_max_dict[feature]['min'], min_max_dict[feature]['max'], n_bins + 1) - rtol = 1.e-5 - atol = 1.e-8 - eps = atol + rtol * np.abs(dat[feature]) - dat[feature] = np.digitize(dat[feature] + eps, bin_edges[idx][1:]) - return dat - - -def convert_input2tfrd(in_file_path, out_file_path): - """ - txt to tfrecords - """ - def make_example(label_list, dense_feat_list, sparse_feat_list): - dense_feature = np.array(dense_feat_list, dtype=np.int64).reshape(-1) - sparse_feature = np.array(sparse_feat_list, dtype=np.int64).reshape(-1) - label = np.array(label_list, dtype=np.int64).reshape(-1) - feature_dict = { - "dense_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=dense_feature)), - "sparse_feature": tf.train.Feature(int64_list=tf.train.Int64List(value=sparse_feature)), - "label": tf.train.Feature(int64_list=tf.train.Int64List(value=label)) - } - example = tf.train.Example(features=tf.train.Features(feature=feature_dict)) - - return example - - file_name = out_file_path + in_file_path[-12:-4] + '.tfrecord' - file_writer = tf.io.TFRecordWriter(file_name) - - with open(in_file_path, encoding='utf-8') as file_in: - - for _, line in tqdm(enumerate(file_in)): - - line = line.strip('\n') - items = line.split('\t') - if len(items) != 40: - continue - label = int(items[0]) - dense = items[1:14] - sparse = items[14:] - - ex = make_example(label, dense, sparse) - serialized = ex.SerializeToString() - file_writer.write(serialized) - - file_writer.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Get datasets') - parser.add_argument('--data_path') - parser.add_argument('--output_path') - - args, _ = parser.parse_known_args() - data_path = args.data_path - output_path = args.output_path - - # get txt_list - file_split_list = get_split_file_path(dataset_path=data_path) - # get feature_map - feature_map = get_fea_map(split_file_list=file_split_list) - - for file in tqdm(file_split_list): - - # read data - data_df = pd.read_csv(file, sep='\t', header=None, names=NAMES) - # name feature - sparse_features = ['C' + str(i) for i in range(1, 27)] - dense_features = ['I' + str(i) for i in range(1, 14)] - # data processing - data_df[sparse_features] = data_df[sparse_features].fillna('-1') - data_df[dense_features] = data_df[dense_features].fillna(0) - # sparse feature: mapping - for col in sparse_features: - try: - data_df[col] = data_df[col].map(lambda x: feature_map[col][x]) - except KeyError as e: - raise KeyError("Feature {} not found in dataset".format(col)) from e - # dense feature: Bin continuous data into intervals. 
- data_df[dense_features] = rec_kbins_discretizer(data_df[dense_features], 1000, feature_map) - # add offsets - slot_size_array = [ - 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, - 1462, 585, 10131228, 2202609, 307, 25, 12519, 635, 5, 93147, 5685, 8351594, 3196, - 29, 14994, 5461307, 12, 5654, 2174, 5, 7046548, 19, 17, 286182, 106, 142573 - ] - offset_size_list = np.cumsum([0] + slot_size_array[:-1]) - for col_index in range(1, len(offset_size_list) + 1): - data_df.iloc[:, col_index] += offset_size_list[col_index - 1] - # save to txt - data_df.to_csv(file, sep='\t', index=False, header=False) - # txt to tfrecords - convert_input2tfrd(in_file_path=file, out_file_path=output_path) - - - - - diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py index 51ed7c4a..e236cd2f 100644 --- a/examples/mmoe/main_mxrec.py +++ b/examples/mmoe/main_mxrec.py @@ -66,18 +66,17 @@ def make_batch_and_iterator(config, feature_spec_list, is_training, dump_graph, def extract_fn(data_record): features = { # Extract features using the keys set during creation - 'label': tf.compat.v1.FixedLenFeature(shape=(config.line_per_sample,), dtype=tf.int64), - 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(26 * config.line_per_sample,), dtype=tf.int64), - 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(13 * config.line_per_sample,), dtype=tf.float32), + 'label': tf.compat.v1.FixedLenFeature(shape=(2 * config.line_per_sample,), dtype=tf.int64), + 'sparse_feature': tf.compat.v1.FixedLenFeature(shape=(29 * config.line_per_sample,), dtype=tf.int64), + 'dense_feature': tf.compat.v1.FixedLenFeature(shape=(11 * config.line_per_sample,), dtype=tf.float32), } sample = tf.compat.v1.parse_single_example(data_record, features) return sample def reshape_fn(batch): - batch['label'] = tf.reshape(batch['label'], [-1, 1]) - batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 13]) - batch['dense_feature'] = tf.math.log(batch['dense_feature'] + 3.0) - batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 26]) + batch['label'] = tf.reshape(batch['label'], [-1, 2]) + batch['dense_feature'] = tf.reshape(batch['dense_feature'], [-1, 11]) + batch['sparse_feature'] = tf.reshape(batch['sparse_feature'], [-1, 29]) return batch if is_training: @@ -129,6 +128,7 @@ def model_forward(feature_list, hash_table_list, batch, is_train, modify_graph): emb = tf.reduce_sum(embedding_list, axis=0, keepdims=False) else: raise ValueError("the length of embedding_list must be greater than or equal to 1.") + emb = tf.reduce_sum(emb, axis=1) my_model = MyModel() model_output = my_model.build_model(embedding=emb, dense_feature=batch["dense_feature"], @@ -148,8 +148,10 @@ def evaluate(): eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label") sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) log_loss_list = [] - pred_list = [] - label_list = [] + pred_income_list = [] + pred_mat_list = [] + label_income_list = [] + label_mat_list = [] eval_current_steps = 0 finished = False print("eval begin") @@ -162,16 +164,21 @@ def evaluate(): eval_cost = time.time() - eval_start qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size log_loss_list += list(eval_loss.reshape(-1)) - pred_list += list(pred.reshape(-1)) - label_list += list(label.reshape(-1)) + pred_income = pred[0] + pred_mat = pred[1] + pred_income_list += list(pred_income.reshape(-1)) + pred_mat_list += list(pred_mat.reshape(-1)) + label_income_list 
+= list(label[:, 0].reshape(-1)) + label_mat_list += list(label[:, 1].reshape(-1)) print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") if eval_current_steps == eval_steps: finished = True except tf.errors.OutOfRangeError: finished = True - auc = roc_auc_score(label_list, pred_list) + auc_income = roc_auc_score(label_income_list, pred_income_list) + auc_mat = roc_auc_score(label_mat_list, pred_mat_list) mean_log_loss = np.mean(log_loss_list) - return auc, mean_log_loss + return auc_income, auc_mat, mean_log_loss def evaluate_fix(step): @@ -281,8 +288,8 @@ if __name__ == "__main__": rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None - train_steps = 10000 - eval_steps = 1360 + train_steps = 1000 + eval_steps = 1000 try: use_dynamic_expansion = bool(int(os.getenv("USE_DYNAMIC_EXPANSION", 0))) @@ -326,9 +333,7 @@ if __name__ == "__main__": optimizer_list = [get_dense_and_sparse_optimizer(cfg)] # note: variance_scaling_initializer only support HBM mode - emb_initializer = tf.compat.v1.truncated_normal_initializer(stddev=0.05, seed=sparse_hashtable_seed) \ - if cfg.cache_mode != "HBM" or use_dynamic_expansion else \ - tf.compat.v1.variance_scaling_initializer(mode="fan_avg", distribution='normal', seed=sparse_hashtable_seed) + emb_initializer = tf.constant_initializer(value = 0.1) sparse_hashtable = create_table( key_dtype=cfg.key_type, dim=tf.TensorShape([cfg.emb_dim]), @@ -422,7 +427,8 @@ if __name__ == "__main__": epoch = 0 cost_sum = 0 qps_sum = 0 - best_auc = 0 + best_income_auc = 0 + best_auc_mat = 0 iteration_per_loop = 10 train_ops = util.set_iteration_per_loop(sess, train_ops, 10) @@ -456,12 +462,17 @@ if __name__ == "__main__": if i % (train_steps // iteration_per_loop) == 0: if interval is not None: - test_auc, test_mean_log_loss = evaluate_fix(i * iteration_per_loop) + test_auc_income, test_auc_mat, test_mean_log_loss = evaluate_fix(i * iteration_per_loop) else: - test_auc, test_mean_log_loss = evaluate() - print("Test auc: {}; log_loss: {} ".format(test_auc, test_mean_log_loss)) - best_auc = max(best_auc, test_auc) - logger.info(f"training step: {i * iteration_per_loop}, best auc: {best_auc}") + test_auc_income, test_auc_mat, test_mean_log_loss = evaluate() + print("Test auc income: {};Test auc mat: {} ;log_loss: {} ".format(test_auc_income, + test_auc_mat,test_mean_log_loss)) + best_auc_income = max(best_auc_income, test_auc_income) + best_auc_mat = max(best_auc_mat, test_auc_mat) + logger.info(f"training step: {i * iteration_per_loop}, + best auc income: {best_auc_income} , + best auc mat: {best_auc_mat}") + sess.close() -- Gitee From c70d9eebb72a4f818b88d1ea19cb1ba9d172d197 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 00:09:26 +0800 Subject: [PATCH 291/302] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index 5b1917a3..cf8ca108 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -42,7 +42,7 @@ class MyModel: for i in range(0, self.expert_num): expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}', 
kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values = 0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) param_expert.append(expert_linear) return param_expert @@ -53,7 +53,7 @@ class MyModel: for i in range(0, self.gate_num): gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}', kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values = 0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) param_gate.append(gate_linear) return param_gate @@ -62,12 +62,12 @@ class MyModel: def tower_layer(self, input, layer_name): tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values = 0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, name=f'tower_payer_out_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), - bias_initializer=tf.constant_initializer(values=0.1)) + bias_initializer=tf.constant_initializer(value=0.1)) return tower_linear_out -- Gitee From 66a629d05eafaeadb807d25b077e41cf5936f1c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 14:28:16 +0800 Subject: [PATCH 292/302] =?UTF-8?q?=E6=97=A0=E7=94=A8=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E5=88=A0=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/gradient_descent_w.py | 71 ----------------------------- 1 file changed, 71 deletions(-) delete mode 100644 examples/mmoe/gradient_descent_w.py diff --git a/examples/mmoe/gradient_descent_w.py b/examples/mmoe/gradient_descent_w.py deleted file mode 100644 index 53adb996..00000000 --- a/examples/mmoe/gradient_descent_w.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from collections import defaultdict - -import tensorflow as tf -from tensorflow.python.ops import math_ops -from tensorflow.python.training import gradient_descent -from mx_rec.optimizers.base import CustomizedOptimizer -from mx_rec.util.log import logger -from mx_rec.util.initialize import ConfigInitializer - - -def create_hash_optimizer(learning_rate, weight_decay=0.0001, use_locking=False, name="GradientDescent"): - optimizer = CustomizedGradientDescentWithWeighDecay(learning_rate=learning_rate, - weight_decay=weight_decay, - use_locking=use_locking, - name=name) - ConfigInitializer.get_instance().optimizer_config.optimizer_instance = optimizer - return optimizer - - -class CustomizedGradientDescentWithWeighDecay(gradient_descent.GradientDescentOptimizer, CustomizedOptimizer): - name_counter = defaultdict(int) - - def __init__(self, learning_rate, weight_decay, use_locking=False, name="GradientDescent"): - self.optimizer_type = "gradient_descent_with_weight_decay" - self.weight_decay = weight_decay - super(CustomizedGradientDescentWithWeighDecay, self)._get_name(name=name) - super(CustomizedGradientDescentWithWeighDecay, self).__init__( - learning_rate=learning_rate, use_locking=use_locking, name=self.unique_name - ) - self._slot_num = 0 - self._derivative = 1 - - def get_slot_init_values(self): - logger.info("no slot for gradient descent") - return [] - - def _apply_sparse_duplicate_indices(self, grad, var): - logger.debug(">>>> Enter _apply_sparse_duplicate_indices") - nd_indices = tf.expand_dims(grad.indices, 1) - logger.info(f"weigh_decay={self.weight_decay}") - if self.weight_decay is None: - nd_value = grad.values * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) - else: - nd_value = (grad.values + math_ops.cast(self.weight_decay, var.dtype.base_dtype) * - tf.gather(var, grad.indices)) * math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype) - var_update_op = tf.scatter_nd_add(var, nd_indices, -nd_value, use_locking=self._use_locking) - return var_update_op - - def _apply_dense(self, grad, var): - logger.debug(">>>> Enter _apply_dense") - raise NotImplementedError("You are using a wrong type of variable.") -- Gitee From 74f39df93b60f70f7ac236a4a236d351eb230c8e Mon Sep 17 00:00:00 2001 From: penghuiyang <1060916628@qq.com> Date: Tue, 23 Jul 2024 17:06:43 +0800 Subject: [PATCH 293/302] =?UTF-8?q?=E3=80=90FEAT=E3=80=91=E7=BB=99FileWrit?= =?UTF-8?q?er=E6=B7=BB=E5=8A=A0patch=E9=98=B2=E6=AD=A2=E5=86=99summary?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=86=B2=E7=AA=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mx_rec/saver/patch.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mx_rec/saver/patch.py b/mx_rec/saver/patch.py index d5071d5c..f57e8ce0 100644 --- a/mx_rec/saver/patch.py +++ b/mx_rec/saver/patch.py @@ -489,4 +489,8 @@ def _patch_for_summary_writer(func): def patch_for_summary_writer(): + """ + Patch for `tf.summary.FileWriter.__init__` method, add rankId to init param `filename_suffix`. 
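+
+    A minimal sketch of the wrapping idea (illustrative only: the real
+    decorator is `_patch_for_summary_writer` above, and `get_rank_id()`
+    stands in for however the rank id is actually obtained):
+
+        def _patch_for_summary_writer(origin_init):
+            def wrapper(self, *args, **kwargs):
+                # suffix event files with the rank id so that ranks writing
+                # summaries into one directory never clobber each other
+                suffix = kwargs.get("filename_suffix") or ""
+                kwargs["filename_suffix"] = f"{suffix}.rank_{get_rank_id()}"
+                return origin_init(self, *args, **kwargs)
+            return wrapper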
+ """ FileWriter.__init__ = _patch_for_summary_writer(FileWriter.__init__) + logger.debug("Method `tf.summary.FileWriter.__init__` has been patched.") -- Gitee From aac7a3b3f4d613aea3c303d0266987e023daf62c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:20:36 +0800 Subject: [PATCH 294/302] =?UTF-8?q?=E5=90=8A=E8=B5=B7shell=E6=8F=90?= =?UTF-8?q?=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/run.sh | 99 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 examples/mmoe/run.sh diff --git a/examples/mmoe/run.sh b/examples/mmoe/run.sh new file mode 100644 index 00000000..6c142443 --- /dev/null +++ b/examples/mmoe/run.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +cur_path=$(dirname "$(readlink -f "$0")") + +so_path=$1 +mx_rec_package_path=$2 +hccl_cfg_json=$3 +dlrm_criteo_data_path=$4 +ip=$5 # no ranktable时传入该参数 + +interface="lo" +num_server=1 +local_rank_size=8 +num_process=$((num_server * local_rank_size)) +export TRAIN_RANK_SIZE=$num_process + +################# 参数配置 ###################### +export USE_DYNAMIC=0 # 0:静态shape;1:动态shape +export CACHE_MODE="HBM" # HBM;DDR;SSD +export USE_FAAE=0 # 0:关闭准入淘汰;1:开启准入淘汰 +export USE_DYNAMIC_EXPANSION=0 # 0:关闭动态扩容;1: 开启动态扩容 +export USE_MULTI_LOOKUP=0 # 0:一表一查;1:一表多查 +export USE_MODIFY_GRAPH=0 # 0:feature spec模式;1:自动改图模式 +################################################ +echo "CACHE_MODE:${CACHE_MODE}" + +export HCCL_CONNECT_TIMEOUT=1200 +export DLRM_CRITEO_DATA_PATH=${dlrm_criteo_data_path} +export PYTHONPATH=${mx_rec_package_path}:${so_path}:$PYTHONPATH +export LD_PRELOAD=/usr/lib64/libgomp.so.1 +export LD_LIBRARY_PATH=${so_path}:/usr/local/lib:$LD_LIBRARY_PATH +export ASCEND_DEVICE_ID=0 +export RANK_ID_START=0 +export JOB_ID=10086 +export CUSTOMIZED_OPS_LIB_PATH=${so_path}/libcust_ops.so # Todo: please config +export MXREC_LOG_LEVEL="INFO" +export TF_CPP_MIN_LOG_LEVEL=3 +export ASCEND_GLOBAL_LOG_LEVEL=3 +#export USE_FAAE=1 +export ENABLE_FORCE_V2_CONTROL=1 + +export PROFILING_OPTIONS='{"output":"/home/yz/profiling", + "training_trace":"on", + "task_trace":"on", + "aicpu":"on", + "fp_point":"", + "bp_point":"", + "aic_metrics":"PipeUtilization"}' + +RANK_ID_START=0 + +export MXREC_MODE="ASC" +echo "MXREC_MODE is $MXREC_MODE" +export py=main_mxrec.py +echo "py is $py" + +# 区分ranktable和no ranktable +if [ -n "$ip" ]; then + # no ranktable分支 + echo "Current is no ranktable solution." + echo "Input node ip: $ip, please make sure this ip is available." 
+ export CM_CHIEF_IP=$ip # chief node ip + export CM_CHIEF_PORT=60001 # chief node listen port + export CM_CHIEF_DEVICE=0 # chief node device id + export CM_WORKER_IP=$ip # current node ip + export CM_WORKER_SIZE=$num_process # number of devices participating in cluster training + echo "CM_CHIEF_IP=$CM_CHIEF_IP" + echo "CM_CHIEF_PORT=$CM_CHIEF_PORT" + echo "CM_CHIEF_DEVICE=$CM_CHIEF_DEVICE" + echo "CM_WORKER_IP=$CM_WORKER_IP" + echo "CM_WORKER_SIZE=$CM_WORKER_SIZE" +else + # ranktable branch + echo "Current is ranktable solution, hccl json file:${hccl_cfg_json}" + export RANK_SIZE=$num_process + echo "RANK_SIZE=${RANK_SIZE}, please make sure the hccl configuration json file matches this parameter" + export RANK_TABLE_FILE=${hccl_cfg_json} +fi + +echo "use horovod to start tasks" +# GLOG_stderrthreshold -2:TRACE -1:DEBUG 0:INFO 1:WARN 2:ERROR, default is INFO +mpi_args='-x BIND_INFO="0:12 12:48 60:48" -x GLOG_stderrthreshold=2 -x GLOG_logtostderr=true -bind-to none -x NCCL_SOCKET_IFNAME=docker0 -mca btl_tcp_if_exclude docker0' + +horovodrun --network-interface ${interface} -np ${num_process} --mpi-args "${mpi_args}" --mpi -H localhost:${local_rank_size} \ +python3.7 ${py} 2>&1 | tee temp_${CACHE_MODE}_${num_process}p.log -- Gitee From 2bad2444eb05428de24da9a99e9f52496fcb4c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:21:16 +0800 Subject: [PATCH 295/302] =?UTF-8?q?=E6=97=A0=E9=9C=80loss=5Fscale=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=EF=BC=8C=E5=8E=BB=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/delay_loss_scale.py | 64 ------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 examples/mmoe/delay_loss_scale.py diff --git a/examples/mmoe/delay_loss_scale.py b/examples/mmoe/delay_loss_scale.py deleted file mode 100644 index f73baf68..00000000 --- a/examples/mmoe/delay_loss_scale.py +++ /dev/null @@ -1,64 +0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# ============================================================================== - -import tensorflow as tf -from tensorflow.python.training import optimizer - -from config import Config - - -class DenseLossScaleOptimizer: - def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: - if not isinstance(opt, optimizer.Optimizer): - raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) - self._optimizer = opt - self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - _update_lr_loss_scale(self._optimizer, loss_scale) - - def compute_gradients(self, loss, var_list=None): - return self._optimizer.compute_gradients(loss * self._loss_scale, var_list=var_list) - - def apply_gradients(self, avg_grads): - return self._optimizer.apply_gradients(avg_grads) - - -class SparseLossScaleOptimizer: - def __init__(self, opt: optimizer.Optimizer, loss_scale: int) -> None: - if not isinstance(opt, optimizer.Optimizer): - raise ValueError('"opt" must be an instance of Optimizer, but got: %s' % type(opt)) - self._optimizer = opt - self._loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - _update_lr_loss_scale(self._optimizer, loss_scale) - - def compute_gradients(self, loss, var_list=None): - return tf.gradients(loss * self._loss_scale, var_list) - - def apply_gradients(self, grads_and_vars): - return self._optimizer.apply_gradients(grads_and_vars) - - -def _update_lr_loss_scale(opt, loss_scale): - if loss_scale <= 0: - raise RuntimeError("the loss_scale must be greater than zero.") - loss_scale = tf.convert_to_tensor(loss_scale, tf.float32) - if hasattr(opt, "_lr"): - # LazyAdam or Adam optimizer - opt._lr = opt._lr / loss_scale - elif hasattr(opt, "_learning_rate"): - # SGD optimizer - opt._learning_rate = opt._learning_rate / loss_scale - else: - raise RuntimeError("`opt` should have a `_learning_rate` or `_lr` named field.") \ No newline at end of file -- Gitee From 6d08cf2ecb0290eafae0c4639c86f8eb85c43e47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:25:42 +0800 Subject: [PATCH 296/302] =?UTF-8?q?=E6=97=A0=E7=94=A8=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E5=88=A0=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/mean_auc.py | 40 --------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 examples/mmoe/mean_auc.py diff --git a/examples/mmoe/mean_auc.py b/examples/mmoe/mean_auc.py deleted file mode 100644 index ff57df00..00000000 --- a/examples/mmoe/mean_auc.py +++ /dev/null @@ -1,40 +0,0 @@ -# coding=utf-8 -# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -import os -from glob import glob -import numpy as np - - -def split_auc(log_input): - with open(log_input, 'r') as log: - all_auc = [] - for line in log.readlines(): - if 'Test' in line: - all_auc.append(float(line.split(';')[0].split(':')[-1].strip())) - all_auc_len = len(all_auc) - all_auc_arr = np.array(all_auc)[:all_auc_len - all_auc_len % 8] - test_auc = np.mean(all_auc_arr.reshape(-1, 8), axis=-1) - return test_auc - - -log_path_all = 'latest_*.log' -log_path_list = glob(log_path_all) - -for log_path in log_path_list: - print(os.path.basename(log_path)) - print(split_auc(log_path)) - print('*'*20) \ No newline at end of file -- Gitee From 9845d170e50cd3087b4869fe070308230967e364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:27:02 +0800 Subject: [PATCH 297/302] =?UTF-8?q?=E6=A3=80=E8=A7=86=E6=84=8F=E8=A7=81?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/config.py | 38 ++++++++----------- examples/mmoe/main_mxrec.py | 67 ++++++++++++++-------------------- examples/mmoe/model.py | 15 +++++--- examples/mmoe/op_impl_mode.ini | 1 - examples/mmoe/optimizer.py | 6 +-- 5 files changed, 54 insertions(+), 73 deletions(-) diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py index b87bc11b..67ed7a20 100644 --- a/examples/mmoe/config.py +++ b/examples/mmoe/config.py @@ -32,16 +32,11 @@ class LearningRateScheduler: TF-based cond operations necessary for performance in graph mode. """ - def __init__(self, base_lr_dense, base_lr_sparse, warmup_steps, decay_start_step, decay_steps): - self.warmup_steps = tf.constant(warmup_steps, dtype=tf.int32) - self.decay_start_step = tf.constant(decay_start_step, dtype=tf.int32) - self.decay_steps = tf.constant(decay_steps) - self.decay_end_step = decay_start_step + decay_steps # 65041 - self.poly_power = 2.0 + def __init__(self, base_lr_dense, base_lr_sparse): self.base_lr_dense = base_lr_dense self.base_lr_sparse = base_lr_sparse - def calc(self, global_step): + def calc(self): # used for the constant stage lr_factor_constant = tf.cast(1.0, tf.float32) @@ -51,7 +46,7 @@ class LearningRateScheduler: class Config: - def __init__(self, ): + def __init__(self, ) -> None: self.rank_id = int(os.getenv("OMPI_COMM_WORLD_RANK")) if os.getenv("OMPI_COMM_WORLD_RANK") else None tmp = os.getenv("TRAIN_RANK_SIZE") if tmp is None: @@ -81,31 +76,30 @@ class Config: self.__set_emb_table_size() self.field_num = 26 - self.send_count = 46000 // self.rank_size + self.send_count = self.get_send_count(self.rank_size) self.emb_dim = self.expert_num * self.expert_size + self.gate_num * self.expert_num self.hashtable_threshold = 1 self.USE_PIPELINE_TEST = False - # 动态学习率 - GLOBAL_BATCH_SIZE = 8192 * 8 - LR_SCHEDULE_STEPS = [ - int(2750 * 55296 / GLOBAL_BATCH_SIZE), - int(49315 * 55296 / GLOBAL_BATCH_SIZE), - int(27772 * 55296 / GLOBAL_BATCH_SIZE), - ] self.global_step = tf.Variable(0, trainable=False) _lr_scheduler = LearningRateScheduler( 0.001, - 0.001, - LR_SCHEDULE_STEPS[0], - LR_SCHEDULE_STEPS[1], - LR_SCHEDULE_STEPS[2], + 0.001 ) self.learning_rate = _lr_scheduler.calc() + + def get_send_count(self, rank_size): + try: + return 46000 // rank_size + except ZeroDivisionError as exp: + raise ZeroDivisionError('Rank size can not be zero.') from exp + + + - def __set_emb_table_size(self): + def __set_emb_table_size(self) -> None: 
self.cache_mode = os.getenv("CACHE_MODE") if self.cache_mode is None: raise ValueError("please export CACHE_MODE environment variable, support:[HBM, DDR, SSD]") @@ -123,7 +117,7 @@ class Config: else: raise ValueError(f"get CACHE_MODE:{self.cache_mode}, expect in [HBM, DDR, SSD]") - def get_emb_table_cfg(self): + def get_emb_table_cfg(self) -> None: if self.cache_mode == CacheModeEnum.HBM.value: return {"device_vocabulary_size": self.dev_vocab_size} elif self.cache_mode == CacheModeEnum.DDR.value: diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py index e236cd2f..0eb127dd 100644 --- a/examples/mmoe/main_mxrec.py +++ b/examples/mmoe/main_mxrec.py @@ -24,10 +24,7 @@ from glob import glob import tensorflow as tf from sklearn.metrics import roc_auc_score import numpy as np - -from optimizer import get_dense_and_sparse_optimizer -from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum -from model import MyModel +from npu_bridge.npu_init import * from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -40,7 +37,9 @@ from mx_rec.util.ops import import_host_pipeline_ops import mx_rec.util as mxrec_util from mx_rec.util.variable import get_dense_and_sparse_variable from mx_rec.util.log import logger -from npu_bridge.npu_init import * +from optimizer import get_dense_and_sparse_optimizer +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum +from model import MyModel npu_plugin.set_device_sat_mode(0) @@ -52,7 +51,6 @@ random.seed(shuffle_seed) def add_timestamp_func(batch): timestamp = import_host_pipeline_ops().return_timestamp(tf.cast(batch['label'], dtype=tf.int64)) - # tf.constant(np.random.randint(1,1688109060,1)), tf.int64)) batch["timestamp"] = timestamp return batch @@ -144,7 +142,8 @@ def evaluate(): eval_label = eval_model.get("label") sess.run([eval_iterator.initializer]) else: - # 在sess run模式下,若还是使用原来batch中的label去sess run,则会出现getnext超时报错,需要使用新数据集中的batch + # In sess run mode, if the label from the original batch is still used for sess run, + # a getnext timeout error will occur, and a new batch from the new dataset needs to be used eval_label = ConfigInitializer.get_instance().train_params_config.get_target_batch(False).get("label") sess.run([ConfigInitializer.get_instance().train_params_config.get_initializer(False)]) log_loss_list = [] @@ -157,24 +156,26 @@ def evaluate(): print("eval begin") while not finished: + + eval_current_steps += 1 + eval_start = time.time() try: - eval_current_steps += 1 - eval_start = time.time() eval_loss, pred, label = sess.run([eval_model.get("loss"), eval_model.get("pred"), eval_label]) - eval_cost = time.time() - eval_start - qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size - log_loss_list += list(eval_loss.reshape(-1)) - pred_income = pred[0] - pred_mat = pred[1] - pred_income_list += list(pred_income.reshape(-1)) - pred_mat_list += list(pred_mat.reshape(-1)) - label_income_list += list(label[:, 0].reshape(-1)) - label_mat_list += list(label[:, 1].reshape(-1)) - print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") - if eval_current_steps == eval_steps: - finished = True except tf.errors.OutOfRangeError: + break + eval_cost = time.time() - eval_start + qps_eval = (1 / eval_cost) * rank_size * cfg.batch_size + log_loss_list += list(eval_loss.reshape(-1)) + pred_income = pred[0] + pred_mat = pred[1] + pred_income_list 
+= list(pred_income.reshape(-1)) + pred_mat_list += list(pred_mat.reshape(-1)) + label_income_list += list(label[:, 0].reshape(-1)) + label_mat_list += list(label[:, 1].reshape(-1)) + print(f"eval current_steps: {eval_current_steps}, qps: {qps_eval}") + if eval_current_steps == eval_steps: finished = True + auc_income = roc_auc_score(label_income_list, pred_income_list) auc_mat = roc_auc_score(label_mat_list, pred_mat_list) mean_log_loss = np.mean(log_loss_list) @@ -285,7 +286,6 @@ if __name__ == "__main__": warnings.filterwarnings("ignore") _clear_saved_model() - rank_id = int(os.getenv("RANK_ID")) if os.getenv("RANK_ID") else None rank_size = int(os.getenv("TRAIN_RANK_SIZE")) if os.getenv("TRAIN_RANK_SIZE") else None interval = int(os.getenv("INTERVAL")) if os.getenv("INTERVAL") else None train_steps = 1000 @@ -304,13 +304,8 @@ if __name__ == "__main__": logger.info(f"USE_DYNAMIC:{use_dynamic}") init(train_steps=train_steps, eval_steps=eval_steps, use_dynamic=use_dynamic, use_dynamic_expansion=use_dynamic_expansion) - IF_LOAD = False + rank_id = mxrec_util.communication.hccl_ops.get_rank_id() - filelist = glob(f"./saved-model/sparse-model-0") - if filelist: - IF_LOAD = True - ConfigInitializer.get_instance().if_load = IF_LOAD - cfg = Config() feature_spec_list_train = None feature_spec_list_eval = None @@ -385,14 +380,11 @@ if __name__ == "__main__": grads_and_vars = [(grad, variable) for grad, variable in zip(sparse_grads, sparse_variables)] train_ops.append(sparse_optimizer.apply_gradients(grads_and_vars)) - # 动态学习率更新 - train_ops.extend([cfg.global_step.assign(cfg.global_step + 1), cfg.learning_rate[0], cfg.learning_rate[1]]) with tf.control_dependencies(train_ops): train_ops = tf.no_op() cfg.learning_rate = [cfg.learning_rate[0], cfg.learning_rate[1]] - saver = tf.train.Saver() if MODIFY_GRAPH_FLAG: modify_graph_and_start_emb_cache(dump_graph=True) else: @@ -405,7 +397,6 @@ if __name__ == "__main__": if MODIFY_GRAPH_FLAG: # 该场景添加hook处理校验问题 hook_list.append(GraphModifierHook(modify_graph=False)) - # with tf.compat.v1.Session(config=sess_config(dump_data=False)) as sess: if use_faae: sess = tf.compat.v1.train.MonitoredTrainingSession( hooks=hook_list, @@ -427,13 +418,12 @@ if __name__ == "__main__": epoch = 0 cost_sum = 0 qps_sum = 0 - best_income_auc = 0 + best_auc_income= 0 best_auc_mat = 0 iteration_per_loop = 10 train_ops = util.set_iteration_per_loop(sess, train_ops, 10) - # for i in range(1, TRAIN_STEPS): i = 0 while True: i += 1 @@ -441,9 +431,8 @@ if __name__ == "__main__": start_time = time.time() try: - grad, loss = sess.run([train_ops, train_model.get("loss")]) - lr = sess.run(cfg.learning_rate) - global_step = sess.run(cfg.global_step) + grad, loss, lr, global_step = sess.run([train_ops, train_model.get("loss"), + cfg.learning_rate, cfg.global_step]) except tf.errors.OutOfRangeError: logger.info(f"Encounter the end of Sequence for training.") break @@ -469,9 +458,7 @@ if __name__ == "__main__": test_auc_mat,test_mean_log_loss)) best_auc_income = max(best_auc_income, test_auc_income) best_auc_mat = max(best_auc_mat, test_auc_mat) - logger.info(f"training step: {i * iteration_per_loop}, - best auc income: {best_auc_income} , - best auc mat: {best_auc_mat}") + logger.info(f"training step: {i * iteration_per_loop}, best auc income: {best_auc_income} , best auc mat: {best_auc_mat}") sess.close() diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index cf8ca108..224e8d6d 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -37,10 +37,10 @@ class MyModel: 
self.gate_num = gate_num - def expert_layer(self, input): + def expert_layer(self, _input): param_expert = [] for i in range(0, self.expert_num): - expert_linear = tf.layers.dense(input, units=self.expert_size, activation=None, name=f'expert_payer_{i}', + expert_linear = tf.layers.dense(_input, units=self.expert_size, activation=None, name=f'expert_layer_{i}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -48,10 +48,10 @@ class MyModel: return param_expert - def gate_layer(self, input): + def gate_layer(self, _input): param_gate = [] for i in range(0, self.gate_num): - gate_linear = tf.layers.dense(input, units=self.gate_size, activation=None, name=f'gate_payer_{i}', + gate_linear = tf.layers.dense(_input, units=self.expert_num, activation=None, name=f'gate_layer_{i}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -59,8 +59,8 @@ class MyModel: return param_gate - def tower_layer(self, input, layer_name): - tower_linear = tf.layers.dense(input, units=self.tower_size, activation=None, name=f'tower_payer_{layer_name}', + def tower_layer(self, _input, layer_name): + tower_linear = tf.layers.dense(_input, units=self.tower_size, activation=None, name=f'tower_layer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -109,7 +109,10 @@ class MyModel: cur_gate_expert = tf.multiply(x=expert_concat, y=cur_gate) cur_gate_expert = tf.reduce_sum(cur_gate_expert, axis=1) + out = self.tower_layer(cur_gate_expert, i) + out = tf.nn.softmax(out) + out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0-1e-15) output_layers.append(out) out_pred.append(tf.nn.softmax(out[:, 1])) _slice_num = slice_num_end diff --git a/examples/mmoe/op_impl_mode.ini b/examples/mmoe/op_impl_mode.ini index 579dea43..e69de29b 100644 --- a/examples/mmoe/op_impl_mode.ini +++ b/examples/mmoe/op_impl_mode.ini @@ -1 +0,0 @@ -ScatterNdAdd=support_out_of_bound_index \ No newline at end of file diff --git a/examples/mmoe/optimizer.py b/examples/mmoe/optimizer.py index 2c7685bb..5469c705 100644 --- a/examples/mmoe/optimizer.py +++ b/examples/mmoe/optimizer.py @@ -15,12 +15,13 @@ # ============================================================================== import tensorflow as tf -from delay_loss_scale import DenseLossScaleOptimizer, SparseLossScaleOptimizer + from mx_rec.util.initialize import ConfigInitializer from mx_rec.optimizers.lazy_adam import create_hash_optimizer from mx_rec.optimizers.lazy_adam_by_addr import create_hash_optimizer_by_address + def get_dense_and_sparse_optimizer(cfg): dense_optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate[0]) use_dynamic_expansion = ConfigInitializer.get_instance().use_dynamic_expansion @@ -28,8 +29,5 @@ def get_dense_and_sparse_optimizer(cfg): sparse_optimizer = create_hash_optimizer_by_address(learning_rate=cfg.learning_rate[1]) else: sparse_optimizer = create_hash_optimizer(learning_rate=cfg.learning_rate[1]) - loss_scale = 1 - sparse_optimizer = SparseLossScaleOptimizer(sparse_optimizer, loss_scale) - dense_optimizer = DenseLossScaleOptimizer(dense_optimizer, loss_scale) return dense_optimizer, sparse_optimizer -- Gitee From ca2e82248c638e21066a3c6ae779d9409724d122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 17:28:49 +0800 Subject: [PATCH 298/302] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index 224e8d6d..f18dbff0 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -60,11 +60,11 @@ class MyModel: def tower_layer(self, _input, layer_name): - tower_linear = tf.layers.dense(_input, units=self.tower_size, activation=None, name=f'tower_layer_{layer_name}', + tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', name=f'tower_layer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) - tower_linear_out = tf.layers.dense(tower_linear, units=self.tower_size, activation=None, + tower_linear_out = tf.layers.dense(tower_linear, units=2, activation=None, name=f'tower_payer_out_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) -- Gitee From 13f3618364bae56befe067d91b75603f3bae4624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 19:29:53 +0800 Subject: [PATCH 299/302] codecheck --- examples/mmoe/config.py | 8 ++++---- examples/mmoe/main_mxrec.py | 12 +++++++----- examples/mmoe/model.py | 5 +++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/mmoe/config.py b/examples/mmoe/config.py index 67ed7a20..b6a83582 100644 --- a/examples/mmoe/config.py +++ b/examples/mmoe/config.py @@ -90,14 +90,14 @@ class Config: ) self.learning_rate = _lr_scheduler.calc() + + @staticmethod def get_send_count(self, rank_size): try: - return 46000 // rank_size + return 46000 // rank_size except ZeroDivisionError as exp: raise ZeroDivisionError('Rank size can not be zero.') from exp - - - + def __set_emb_table_size(self) -> None: self.cache_mode = os.getenv("CACHE_MODE") diff --git a/examples/mmoe/main_mxrec.py b/examples/mmoe/main_mxrec.py index 0eb127dd..d02566aa 100644 --- a/examples/mmoe/main_mxrec.py +++ b/examples/mmoe/main_mxrec.py @@ -25,6 +25,7 @@ import tensorflow as tf from sklearn.metrics import roc_auc_score import numpy as np from npu_bridge.npu_init import * +from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum from mx_rec.constants.constants import ASCEND_SPARSE_LOOKUP_LOCAL_EMB, ASCEND_SPARSE_LOOKUP_ID_OFFSET from mx_rec.core.asc.helper import FeatureSpec, get_asc_insert_func from mx_rec.core.asc.manager import start_asc_pipeline @@ -38,7 +39,7 @@ import mx_rec.util as mxrec_util from mx_rec.util.variable import get_dense_and_sparse_variable from mx_rec.util.log import logger from optimizer import get_dense_and_sparse_optimizer -from config import sess_config, Config, SSD_DATA_PATH, CacheModeEnum + from model import MyModel npu_plugin.set_device_sat_mode(0) @@ -328,7 +329,7 @@ if __name__ == "__main__": optimizer_list = [get_dense_and_sparse_optimizer(cfg)] # note: variance_scaling_initializer only support HBM mode - emb_initializer = tf.constant_initializer(value = 0.1) + emb_initializer = tf.constant_initializer(value=0.1) sparse_hashtable = create_table( key_dtype=cfg.key_type, dim=tf.TensorShape([cfg.emb_dim]), @@ -418,7 +419,7 @@ if __name__ == "__main__": epoch = 0 cost_sum = 0 qps_sum = 0 - best_auc_income= 0 + best_auc_income = 0 best_auc_mat = 0 iteration_per_loop = 10 @@ -455,10 +456,11 @@ if __name__ == "__main__": else: test_auc_income, test_auc_mat, test_mean_log_loss = evaluate() print("Test 
auc income: {};Test auc mat: {} ;log_loss: {} ".format(test_auc_income, - test_auc_mat,test_mean_log_loss)) + test_auc_mat, test_mean_log_loss)) best_auc_income = max(best_auc_income, test_auc_income) best_auc_mat = max(best_auc_mat, test_auc_mat) - logger.info(f"training step: {i * iteration_per_loop}, best auc income: {best_auc_income} , best auc mat: {best_auc_mat}") + logger.info(f"training step: {i * iteration_per_loop}, best auc income: " + f"{best_auc_income} , best auc mat: {best_auc_mat}") sess.close() diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index f18dbff0..f8090373 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -60,7 +60,8 @@ class MyModel: def tower_layer(self, _input, layer_name): - tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', name=f'tower_layer_{layer_name}', + tower_linear = tf.layers.dense(_input, units=self.tower_size, activation='relu', + name=f'tower_layer_{layer_name}', kernel_initializer=tf.constant_initializer(value=0.1), bias_initializer=tf.constant_initializer(value=0.1)) @@ -112,7 +113,7 @@ class MyModel: out = self.tower_layer(cur_gate_expert, i) out = tf.nn.softmax(out) - out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0-1e-15) + out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0 - 1e-15) output_layers.append(out) out_pred.append(tf.nn.softmax(out[:, 1])) _slice_num = slice_num_end -- Gitee From e3ffcd9bffabc259852c0af58f43273272d655c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=97=A0=E6=82=94?= <1187940490@qq.com> Date: Tue, 23 Jul 2024 21:59:32 +0800 Subject: [PATCH 300/302] =?UTF-8?q?bug=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/mmoe/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mmoe/model.py b/examples/mmoe/model.py index f8090373..8cbb7ba8 100644 --- a/examples/mmoe/model.py +++ b/examples/mmoe/model.py @@ -116,7 +116,7 @@ class MyModel: out = tf.clip_by_value(out, clip_value_min=1e-15, clip_value_max=1.0 - 1e-15) output_layers.append(out) out_pred.append(tf.nn.softmax(out[:, 1])) - _slice_num = slice_num_end + _slice_num = slice_gate_end trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='mmoe') label_income = label[:, 0:1] -- Gitee From aa1c87126e4b84f6a58253932138d21860f1313b Mon Sep 17 00:00:00 2001 From: steepcurve Date: Tue, 23 Jul 2024 14:12:51 +0000 Subject: [PATCH 301/302] =?UTF-8?q?!228=20=E3=80=90FEAT=E3=80=91`PerfRec`?= =?UTF-8?q?=E6=80=A7=E8=83=BD=E5=B7=A5=E5=85=B7=20*=20cleancode=20*=20clea?= =?UTF-8?q?ncode=20*=20cleancode=20*=20add=20shell=3DFalse=20*=20add=20REA?= =?UTF-8?q?DME=20*=20cleancode=20*=20fix=20bug=20and=20cleancode=20*=20add?= =?UTF-8?q?=20README.md=20*=20cleancode=20*=20cleancode=20*=20cleancode=20?= =?UTF-8?q?*=20add=20comment=20*=20add=20comment=20*=20write=20call=20stac?= =?UTF-8?q?k=20to=20file=20*=20write=20call=20stack=20to=20file=20*=20writ?= =?UTF-8?q?e=20call=20stack=20to=20file=20*=20npu=20optional=20*=20npu=20o?= =?UTF-8?q?ptional=20*=20fix=20bug=20*=20fix=20bug=20*=20add=20comments=20?= =?UTF-8?q?*=20add=20fusion=20tracing=20*=20add=20fusion=20tracing=20*=20a?= =?UTF-8?q?dd=20fusion=20tracing=20*=20add=20fusion=20tracing=20*=20add=20?= =?UTF-8?q?fusion=20tracing=20*=20add=20fusion=20tracing=20*=20add=20fusio?= =?UTF-8?q?n=20tracing=20*=20add=20fusion=20tracing=20*=20feat:=20`flamegr?= =?UTF-8?q?aph`=20wrapper=20*=20feat:=20`flamegraph`=20wrapper=20*=20feat:?= 
=?UTF-8?q?=20`flamegraph`=20wrapper=20*=20feat:=20`flamegraph`=20wrapper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/perfrec-python/README.md         |  81 +++++
 tools/perfrec-python/config.toml       |  27 ++
 tools/perfrec-python/fusion_tracing.py | 425 +++++++++++++++++++++++++
 tools/perfrec-python/perf.py           | 251 +++++++++++++++
 4 files changed, 784 insertions(+)
 create mode 100644 tools/perfrec-python/README.md
 create mode 100644 tools/perfrec-python/config.toml
 create mode 100644 tools/perfrec-python/fusion_tracing.py
 create mode 100644 tools/perfrec-python/perf.py

diff --git a/tools/perfrec-python/README.md b/tools/perfrec-python/README.md
new file mode 100644
index 00000000..ddc7e114
--- /dev/null
+++ b/tools/perfrec-python/README.md
@@ -0,0 +1,81 @@
+## perf.py
+```
+usage: perf.py [-h] --perf_data PERF_DATA --flamegraph_path FLAMEGRAPH_PATH
+               [--perf_bin PERF_BIN] [--output_svg OUTPUT_SVG]
+
+Generate a Flamegraph from perf.data.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --perf_data PERF_DATA
+                        Path to the perf.data file.
+  --flamegraph_path FLAMEGRAPH_PATH
+                        Path to the Flamegraph Perl scripts directory.
+  --perf_bin PERF_BIN   Path to perf executable binary file. (default: perf)
+  --output_svg OUTPUT_SVG
+                        Path to the output SVG file. (default: flamegraph.svg)
+```
+#### Usage Example
+
+Collect data with `perf` using a script like the following.
+```bash
+pid=$(top -b -n 1 | head -n 8 | tail -n 1 | awk '{print $1}')
+if [ -z "$pid" ];then
+    echo "Failed to get the process ID"
+    exit 1
+fi
+perf record -F 99 -p $pid -a -g -- sleep 60
+if [ $? -ne 0 ]; then
+    echo "perf record failed"
+    exit 1
+fi
+echo "perf.data collection finished"
+```
+
+Use this tool to generate the flamegraph and the hot-function analysis.
+```bash
+python perf.py --perf_data perf.data --flamegraph_path /ws/FlameGraph
+```
+#### Optional Configuration
+```toml
+# config.toml
+
+[perf]
+# Filter percentage of time cost
+threshold = 0.05
+# Ignore function list
+ignores = ["[libc.so.6]"]
+```
+
+## fusion_tracing.py
+```
+usage: fusion_tracing.py [-h] --debug_log DEBUG_LOG
+                         [--msprof_output MSPROF_OUTPUT]
+
+Generate CPU/NPU fusion tracing json.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --debug_log DEBUG_LOG
+                        MxRec DEBUG level log file path.
+  --msprof_output MSPROF_OUTPUT
+                        msprof output path.
+```
+#### Usage Example
+```bash
+# CPU only
+python fusion_tracing.py --debug_log ../../example/demo/little_demo/temp.log
+# CPU + NPU
+python fusion_tracing.py --debug_log ../../example/demo/little_demo/temp.log --msprof_output ../../example/demo/little_demo/msprof
+```
+#### Optional Configuration
+```toml
+# config.toml
+
+[mxrec]
+# Pipe name and time cost name
+key_process = ["getBatchData", "getAndProcess"]
+process_emb_info = ["getAndSendTensors"]
+lookup_swap_addr = ["lookupAddrs"]
+embedding_recv = ["EmbeddingRecv", "EmbeddingUpdate", "SendH2DEmb"]
+```
diff --git a/tools/perfrec-python/config.toml b/tools/perfrec-python/config.toml
new file mode 100644
index 00000000..8e15fd1d
--- /dev/null
+++ b/tools/perfrec-python/config.toml
@@ -0,0 +1,27 @@
+# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +[mxrec] +# Pipe name and time cost name +key_process = ["getBatchData", "getAndProcess"] +process_emb_info = ["getAndSendTensors"] +lookup_swap_addr = ["lookupAddrs"] +embedding_recv = ["EmbeddingRecv", "EmbeddingUpdate", "SendH2DEmb"] + +[perf] +# Filter percentage of time cost +threshold = 0.05 +# Ignore function list +ignores = ["[libc.so.6]"] diff --git a/tools/perfrec-python/fusion_tracing.py b/tools/perfrec-python/fusion_tracing.py new file mode 100644 index 00000000..49900004 --- /dev/null +++ b/tools/perfrec-python/fusion_tracing.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import argparse +import json +import logging +import os +import re +from collections import defaultdict +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, Tuple + +import pandas as pd +import toml + + +class MxRecConfig: + """ + Configuration from `config.toml`. + """ + + def __init__(self, pipes: Dict[str, List[str]]): + self.pipes = pipes + self.func_to_pipe = defaultdict(str) + for pipe_name, event_list in self.pipes.items(): + for event in event_list: + self.func_to_pipe[event] = pipe_name + self.pipe_names = [name for name in pipes.keys()] + + +class MxRecEvent: + """ + Class to represent an MxRec event. + """ + + def __init__(self, log_line: str, event_name: str, pipe_id: int): + timestamp_s = get_timestamp(log_line) + duration_ms = get_duration(log_line, event_name) + process_id = get_process_id(log_line) + self.timestamp_start_us = timestamp_s * 1e6 - duration_ms * 1e3 + self.duration_us = duration_ms * 1e3 + self.timestamp_end_us = timestamp_s * 1e6 + self.process_id = process_id + self.name = event_name + self.pipe_id = pipe_id + + +@dataclass +class OpEvent: + """ + Class to represent an Op event. + """ + + device_id: int + op_name: str + op_type: str + task_type: str + start_timestamp: float + duration: float + + +def extract_mxrec_events( + log_path: str, config: MxRecConfig +) -> Dict[int, Dict[str, List[MxRecEvent]]]: + """ + Extracts MxRec events from the log file. + + Args: + log_path (str): Path to the log file. + config (MxRecConfig): Dictionary mapping event names to pipe names and other configs. + + Returns: + Dict[int, Dict[str, List[MxRecEvent]]]: Extracted MxRec events grouped by process ID and pipe. 
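+
+    Note:
+        The returned mapping is pid -> pipe name -> events, e.g.
+        {12345: {"key_process": [event, ...]}} (illustrative values
+        only); the pid is parsed from each log line by get_process_id().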
+ """ + events: Dict[int, Dict[str, List[MxRecEvent]]] = defaultdict( + lambda: defaultdict(list) + ) + broken_lines = list() + event_names = config.func_to_pipe + pipe_names = config.pipe_names + pipe_ids = defaultdict(int) + for i, pipe in enumerate(pipe_names): + pipe_ids[pipe] = i + with open(log_path) as log: + for line in log: + for name, pipe in filter(lambda item: item[0] in line, event_names.items()): + try: + event = MxRecEvent(line, name, pipe_ids[pipe]) + events[event.process_id][pipe].append(event) + except RuntimeError: + broken_lines.append(line) + if broken_lines: + logging.warning("There are %d broken log lines", len(broken_lines)) + for line in broken_lines: + logging.warning(line) + return events + + +def extract_op_events(op_summary_path: str) -> List[OpEvent]: + """ + Extracts Op events from the CSV file. + + Args: + op_summary_path (str): Path to the op summary CSV file. + + Returns: + List[OpEvent]: List of extracted Op events. + """ + df = pd.read_csv(op_summary_path) + return [ + OpEvent( + row["Device_id"], + row["Op Name"], + row["OP Type"], + row["Task Type"], + row["Task Start Time(us)"], + row["Task Duration(us)"], + ) + for _, row in df.iterrows() + ] + + +def get_timestamp(log_line: str) -> float: + """ + Extracts the timestamp from a log line. + + Args: + log_line (str): A line from the log file. + + Returns: + float: The extracted timestamp as a float. + """ + pattern = r"\[(\d{4}/\d{1,2}/\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}\.\d+)\]" + match = re.search(pattern, log_line) + if not match: + raise RuntimeError(f"there is no time in log: {log_line}") + date_time_str = match.group(1) + date_time_format = "%Y/%m/%d %H:%M:%S.%f" + # Parse the date-time string into a datetime object + date_time_obj = datetime.strptime(date_time_str, date_time_format) + # Convert the datetime object to a timestamp + return date_time_obj.timestamp() + + +def get_duration(log_line: str, event_name: str) -> float: + """ + Extracts the duration of an event from a log line. + + Args: + log_line (str): A line from the log file. + event_name (str): The name of the event. + + Returns: + int: The extracted duration in milliseconds. + """ + pattern = event_name + r".*:\s*(\d+)" + match = re.search(pattern, log_line) + if not match: + raise RuntimeError(f"there is no event: {event_name}, log: {log_line}") + duration_ms = match.group(1) + return float(duration_ms) + + +def get_process_id(log_line: str) -> int: + """ + Extracts the process ID from a log line. + + Args: + log_line (str): A line from the log file. + + Returns: + int: The extracted process ID. + """ + pattern = r"\[(\d+)\]" + match = re.search(pattern, log_line) + if not match: + raise RuntimeError(f"there is no process_id in log: {log_line}") + process_id = match.group(1) + return int(process_id) + + +def read_mxrec_config() -> MxRecConfig: + """ + Reads the MxRec configuration from a TOML file. + + Returns: + MxRecCofig: Configuration class. + """ + try: + config = toml.load("config.toml") + return MxRecConfig(config["mxrec"]) + except toml.TomlDecodeError as e: + raise RuntimeError("can not load config.toml") from e + + +@dataclass +class TracingMetaData: + """ + Class to represent metadata for tracing. + """ + + name: str + pid: int + tid: int + ph: str + args: Dict[str, Any] + + +class TracingMxRecEvent: + """ + Class to represent a traced MxRec event. 
+ """ + + def __init__(self, mxrec_event: MxRecEvent): + self.name = mxrec_event.name + self.pid = mxrec_event.process_id + self.tid = get_fake_tid(self.pid, mxrec_event.pipe_id) + self.ts = mxrec_event.timestamp_start_us + self.dur = mxrec_event.duration_us + self.ph = "X" + self.args = {} + + +class TracingOpEvent: + """ + Class to represent a traced Op event. + """ + + def __init__(self, op_event: OpEvent, tid: int): + self.name = op_event.op_type + self.pid = get_op_pid(op_event) + self.tid = tid + self.ts = op_event.start_timestamp + self.dur = op_event.duration + self.ph = "X" + self.args = {"Op Name": op_event.op_name} + + +def get_metadata(processes: List[int], config: MxRecConfig) -> List[TracingMetaData]: + """ + Generates metadata for tracing processes and threads. + + Args: + processes (List[int]): List of process IDs. + config (MxRecConfig): Configuration class. + + Returns: + List[TracingMetaData]: List of tracing metadata. + """ + metadata = list() + pipes = config.pipe_names + for i, pid in enumerate(processes): + metadata1 = TracingMetaData( + "process_name", pid, 0, "M", {"name": f"MxRec process {i}"} + ) + metadata2 = TracingMetaData( + "process_sort_index", pid, 0, "M", {"sort_index": i} + ) + metadata.append(metadata1) + metadata.append(metadata2) + for pipe_i, pipe in enumerate(pipes): + pipe_metadata1 = TracingMetaData( + "thread_name", + pid, + get_fake_tid(pid, pipe_i), + "M", + {"name": f"{pipe} {pid}"}, + ) + pipe_metadata2 = TracingMetaData( + "thread_sort_index", + pid, + get_fake_tid(pid, pipe_i), + "M", + {"sort_index": pipe_i}, + ) + metadata.append(pipe_metadata1) + metadata.append(pipe_metadata2) + return metadata + + +def get_fake_tid(pid: int, pipe_id: int) -> int: + """ + Generates a fake thread ID based on process ID and pipe ID. + + Args: + pid (int): Process ID. + pipe_id (int): Pipe ID. + + Returns: + int: Fake thread ID. + """ + return pid * 10 + pipe_id + + +def get_op_pid(op_event: OpEvent) -> int: + """ + Gets the process ID for an Op event. + + Args: + op_event (OpEvent): An Op event. + + Returns: + int: Process ID. + """ + # add 100 avoiding confict with cpu pid(rand_id) + return 100 + op_event.device_id + + +def get_op_tracing(path: str) -> Tuple[List[TracingMetaData], List[TracingOpEvent]]: + """ + Generates tracing data for Op events. + + Args: + path (str): Path to the directory containing Op event summaries. + + Returns: + Tuple[List[TracingMetaData], List[TracingOpEvent]]: Metadata and tracing events. 
+ """ + task_types = defaultdict(int) + pids = set() + tids = set() + metadata = list() + op_tracing = list() + + def new_process_metadata(pid, device_id): + metadata1 = TracingMetaData( + "process_name", pid, 0, "M", {"name": f"NPU {device_id}"} + ) + metadata2 = TracingMetaData( + "process_sort_index", pid, 0, "M", {"sort_index": pid} + ) + return [metadata1, metadata2] + + def new_thread_metadata(pid, tid, name): + metadata1 = TracingMetaData("thread_name", pid, tid, "M", {"name": f"{name}"}) + metadata2 = TracingMetaData( + "thread_sort_index", pid, tid, "M", {"sort_index": tid} + ) + return [metadata1, metadata2] + + for root, _, files in os.walk(path): + for file in files: + if ( + root.endswith("mindstudio_profiler_output") + and file.startswith("op_summary") + and file.endswith(".csv") + ): + file_path = os.path.join(root, file) + op_events = extract_op_events(file_path) + for event in op_events: + process_id = get_op_pid(event) + if process_id not in pids: + pids.add(process_id) + metadata.extend( + new_process_metadata(process_id, event.device_id) + ) + if event.task_type not in task_types: + task_id = len(task_types) + task_types[event.task_type] = task_id + tid = get_fake_tid(process_id, task_types[event.task_type]) + if tid not in tids: + tids.add(tid) + metadata.extend( + new_thread_metadata(process_id, tid, event.task_type) + ) + op_tracing.append(TracingOpEvent(event, tid)) + return metadata, op_tracing + + +def main(): + """ + Main function to parse arguments and generate tracing JSON. + """ + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser( + description="Generate CPU/NPU fusion tracing json." + ) + parser.add_argument( + "--debug_log", help="MxRec DEBUG level log file path.", required=True + ) + parser.add_argument("--msprof_output", help="msprof output path.", required=False) + args = parser.parse_args() + + log_path = args.debug_log + tracing = list() + try: + config = read_mxrec_config() + mxrec_events = extract_mxrec_events(log_path, config) + tracing.extend(get_metadata(list(mxrec_events.keys()), config)) + except RuntimeError: + logging.error("Can not read config.toml, it will exit unsuccessfully.") + exit(1) + + for process in mxrec_events.values(): + for events in process.values(): + tracing.extend([TracingMxRecEvent(event) for event in events]) + + msprof_output_path = args.msprof_output + if msprof_output_path: + op_metadata, op_tracing = get_op_tracing(msprof_output_path) + tracing.extend(op_metadata) + tracing.extend(op_tracing) + + fd = os.open("mxrec_tracing.json", os.O_WRONLY | os.O_CREAT, 0o640) + with os.fdopen(fd, "w") as file: + json.dump(tracing, file, indent=4, default=lambda obj: obj.__dict__) + + +if __name__ == "__main__": + main() diff --git a/tools/perfrec-python/perf.py b/tools/perfrec-python/perf.py new file mode 100644 index 00000000..34f688e9 --- /dev/null +++ b/tools/perfrec-python/perf.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# Copyright 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import argparse +import logging +import os +import subprocess +from collections import defaultdict +from typing import List + +import toml +from tabulate import tabulate + + +def generate_flamegraph( + perf_bin: str, perf_data: str, output_svg: str, flamegraph_path: str +) -> None: + """ + Generate a flamegraph from perf data. + + Args: + perf_data (str): Path to the perf.data file. + output_svg (str): Path to the output SVG file. + flamegraph_path (str): Path to the Flamegraph scripts directory. + """ + # Ensure perf script is available + try: + subprocess.run([perf_bin, "--version"], shell=False, check=True) + except subprocess.CalledProcessError: + logging.error("perf is not installed or not in PATH.") + return + + # Ensure Flamegraph scripts are available + stackcollapse_path = os.path.join(flamegraph_path, "stackcollapse-perf.pl") + flamegraph_script_path = os.path.join(flamegraph_path, "flamegraph.pl") + + if not os.path.isfile(stackcollapse_path) or not os.path.isfile( + flamegraph_script_path + ): + logging.error( + "Flamegraph scripts not found in the provided directory %s.", + flamegraph_path, + ) + return + + # Generate the folded stack output + folded_output = perf_data + ".folded" + fd = os.open(folded_output, os.O_WRONLY | os.O_CREAT, 0o640) + with os.fdopen(fd, "w") as f: + script_output = subprocess.run( + [perf_bin, "script", "-i", perf_data], + shell=False, + check=True, + stdout=subprocess.PIPE, + ) + subprocess.run( + [stackcollapse_path], + shell=False, + check=True, + input=script_output.stdout, + stdout=f, + ) + + # Generate the flamegraph + fd_svg = os.open(output_svg, os.O_WRONLY | os.O_CREAT, 0o640) + with os.fdopen(fd_svg, "w") as f: + subprocess.run( + [flamegraph_script_path, folded_output], shell=False, check=True, stdout=f + ) + + logging.info("Flamegraph generated at %s", output_svg) + + # Analyze the folded stack output + analyze_folded_stack(folded_output) + + +class CallStack: + def __init__(self): + self.count = 0 + self.call_stacks = [] + + def add_call_stacks(self, count: int, call_stack: str): + self.count += count + self.call_stacks.append(call_stack) + + +def analyze_folded_stack(folded_output: str) -> None: + """ + Analyzes the folded stack output to find functions with significant sample counts. + + Args: + folded_output (str): Path to the folded stack output file. 
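+
+    Note:
+        Functions whose sample share is below the configured threshold,
+        or that appear in the `ignores` list of config.toml, are
+        filtered out; the kept call stacks are also written to
+        call_stacks.txt.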
+ """ + + function_counts = defaultdict(CallStack) + total_count = 0 + + # Read the folded stack output + # Line of folded stack example: + # python3.7;[libascendalog.so];access;__sys_trace_return;prepare_creds 10101010 + with open(folded_output, "r") as f: + for line in f: + parts = line.strip().rsplit( + " ", 1 + ) # Use rsplit to handle function names with spaces + count = int(parts[-1]) + call_stack_str = parts[0] + stack = parts[0].split(";") + function_counts[stack[-1]].add_call_stacks(count, call_stack_str) + total_count += count + + config = read_config() + + # Filter and display functions with more than 5% total count + threshold = total_count * config.threshold + results = [ + (func, call_stack) + for func, call_stack in function_counts.items() + if call_stack.count >= threshold and func not in config.ignores + ] + + # Sort results by count in descending order + results.sort(key=lambda x: x[1].count, reverse=True) + + # Prepare data for tabulate + # Write call stacks to file + table_data = [] + fd_call_stacks = os.open("call_stacks.txt", os.O_WRONLY | os.O_CREAT, 0o640) + with os.fdopen(fd_call_stacks, "w") as f: + for func, call_stack in results: + percentage = ( + (call_stack.count / total_count) * 100 if total_count != 0 else 0 + ) + table_data.append( + [limit_line(func, 50), call_stack.count, f"{percentage:.2f}%"] + ) + stacks = [stk + "\n" for stk in call_stack.call_stacks] + f.writelines( + [ + f"func_name: {func}\n", + f"percentage: {percentage:.2f}%\n", + "call_stacks:\n", + ] + + stacks + + ["\n\n"] + ) + + # Print the results using tabulate + logging.info("\nFunctions with more than 5% of total samples:") + headers = ["Function", "Count", "Percentage"] + logging.info("\n%s", tabulate(table_data, headers=headers, tablefmt="grid")) + + +def limit_line(input_content: str, line_length: int) -> str: + """ + Limits the length of a line to a specified number of characters, adding line breaks if necessary. + + Args: + input_content (str): The input string. + line_length (int): The maximum line length. + + Returns: + str: The formatted string with line breaks. + """ + if line_length >= len(input_content): + return input_content + limited_str = "" + if line_length > 0: + limited_str = "\n".join( + input_content[i : i + line_length] + for i in range(len(input_content), line_length) + ) + return limited_str + + +class PerfConfig: + """ + Configuration from `config.toml`. + """ + + def __init__(self, ignores: List[str], threshold: float = 0.05): + self.ignores = set(ignores) + self.threshold = threshold + + +def read_config() -> PerfConfig: + """ + Reads configs related to `perf` from the configuration file. + + Returns: + PerfConfig: Configuration class. + """ + try: + config = toml.load("config.toml") + perf_config = config["perf"] + return PerfConfig(perf_config["ignores"], perf_config["threshold"]) + except toml.TomlDecodeError: + return PerfConfig(ignores=[]) + + +def main(): + """ + Main function to parse arguments and generate a flamegraph. + """ + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser( + description="Generate a Flamegraph from perf.data." + ) + parser.add_argument( + "--perf_data", help="Path to the perf.data file.", required=True + ) + parser.add_argument( + "--flamegraph_path", + help="Path to the Flamegraph Perl scripts directory.", + required=True, + ) + parser.add_argument( + "--perf_bin", + help="Path to perf exacutable binary file. 
(default: perf)", + required=False, + default="perf", + ) + parser.add_argument( + "--output_svg", + help="Path to the output SVG file. (default: flamegraph.svg)", + required=False, + default="flamegraph.svg", + ) + args = parser.parse_args() + + generate_flamegraph( + args.perf_bin, args.perf_data, args.output_svg, args.flamegraph_path + ) + + +if __name__ == "__main__": + main() -- Gitee From af1a01a3eb99e9a2d59f22e7e43e7cba414624de Mon Sep 17 00:00:00 2001 From: wuhongfa <1660398197@qq.com> Date: Wed, 24 Jul 2024 05:51:12 +0000 Subject: [PATCH 302/302] =?UTF-8?q?=E3=80=90FEAT=E3=80=91=E6=96=B0?= =?UTF-8?q?=E5=A2=9Eattention=20grad=E8=9E=8D=E5=90=88=E7=AE=97=E5=AD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../aclnn_attention_fusion_grad/inc/common.h | 45 ++ .../inc/op_runner.h | 182 +++++++ .../inc/operator_desc.h | 57 +++ .../aclnn_attention_fusion_grad/run.sh | 91 ++++ .../scripts/gen_data.py | 47 ++ .../scripts/verify_result.py | 34 ++ .../src/CMakeLists.txt | 68 +++ .../src/common.cpp | 79 +++ .../aclnn_attention_fusion_grad/src/main.cpp | 182 +++++++ .../src/op_runner.cpp | 464 ++++++++++++++++++ .../src/operator_desc.cpp | 56 +++ 11 files changed, 1305 insertions(+) create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/common.h create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/op_runner.h create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/operator_desc.h create mode 100755 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/run.sh create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/gen_data.py create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/verify_result.py create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/CMakeLists.txt create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/common.cpp create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/op_runner.cpp create mode 100644 cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/operator_desc.cpp diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/common.h b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/common.h new file mode 100644 index 00000000..954f3f33 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/common.h @@ -0,0 +1,45 @@ +/** +* @file common.h +* +* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/op_runner.h b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/op_runner.h new file mode 100644 index 00000000..03d0aff4 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/op_runner.h @@ -0,0 +1,182 @@ +/** +* @file op_runner.h +* +* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "aclnn/acl_meta.h" +#include "acl/acl.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template + T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template + const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/operator_desc.h b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/operator_desc.h new file mode 100644 index 00000000..da719849 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/inc/operator_desc.h @@ -0,0 +1,57 @@ +/** +* @file operator_desc.h +* +* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/run.sh b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/run.sh new file mode 100755 index 00000000..6793de82 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/run.sh @@ -0,0 +1,91 @@ +#!/bin/bash +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=0 + +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +SHORT=v:, +LONG=dtype:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + # float16, float, int32 + (-v | --dtype) + DTYPE="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +if [ ! $ASCEND_HOME_DIR ]; then + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + export ASCEND_HOME_DIR=$HOME/Ascend/ascend-toolkit/latest + else + export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export DDK_PATH=$ASCEND_HOME_DIR +arch=$(uname -m) +export NPU_HOST_LIB=$ASCEND_HOME_DIR/${arch}-linux/lib64 + +function main { + rm -rf $HOME/ascend/log/* + rm ./input/*.bin + rm ./output/*.bin + + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + cd $CURRENT_DIR; rm -rf build; mkdir -p build; cd build + cmake ../src + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + cd $CURRENT_DIR/output + echo "INFO: execute op!" + ./execute_attention_fusion_grad_op + + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + cd $CURRENT_DIR + ret=`python3 scripts/verify_result.py output/grad_query.bin output/grad_key.bin output/grad_value.bin output/golden_grad_query.bin output/golden_grad_key.bin output/golden_grad_value.bin ` + echo $ret + if [ "x$ret" == "xtest pass" ]; then + echo "" + echo "#####################################" + echo "INFO: you have passed the Precision!" 
+ echo "#####################################" + echo "" + fi +} + +main diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/gen_data.py b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/gen_data.py new file mode 100644 index 00000000..69077ee3 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/gen_data.py @@ -0,0 +1,47 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2024 Huawei Technologies Co., Ltd +import numpy as np +import os +import math + +def softmax_grad(grad, src): + dst = grad * src + dst = np.sum(dst, axis=-1, keepdims=True) + dst = (grad - dst) * src + return dst + +def param_attn_layer_grad(dout, softmax_out, query, key, value): + # Dv and dS + dv = np.matmul(np.transpose(softmax_out, (0, 2, 1)), dout) + dS = np.matmul(dout, np.transpose(value, (0, 2, 1))) + dS = softmax_grad(dS, softmax_out)/math.sqrt(query.shape[2]) + # Atten + dQ = np.matmul(dS, key) + dK = np.matmul(np.transpose(dS, (0, 2, 1)), query) + return dQ, dK, dv + +def gen_golden_data_simple(): + + dout = np.random.uniform(-1, 1,[1024, 1000, 80]).astype(np.float32) + softmax_out = np.random.uniform(-1, 1,[1024, 1000, 50]).astype(np.float32) + query = np.random.uniform(-1, 1,[1024, 1000, 80]).astype(np.float32) + key = np.random.uniform(-1, 1,[1024, 50, 80]).astype(np.float32) + value = np.random.uniform(-1, 1,[1024, 50, 80]).astype(np.float32) + + grad_query, grad_key, grad_value = param_attn_layer_grad(dout, softmax_out, query, key, value) + + os.system("mkdir -p input") + os.system("mkdir -p output") + dout.tofile("./input/dout.bin") + softmax_out.tofile("./input/softmax_out.bin") + query.tofile("./input/query.bin") + key.tofile("./input/key.bin") + value.tofile("./input/value.bin") + + grad_query.tofile("./output/golden_grad_query.bin") + grad_key.tofile("./output/golden_grad_key.bin") + grad_value.tofile("./output/golden_grad_value.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/verify_result.py b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/verify_result.py new file mode 100644 index 00000000..7781d41f --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/scripts/verify_result.py @@ -0,0 +1,34 @@ +#!/usr/bin/python3 +# -*- coding:utf-8 -*- +# Copyright 2024 Huawei Technologies Co., Ltd +import os +import sys +import numpy as np + +loss = 1e-3 +minimum = 10e-10 + +def verify_result(real_result, golden): + real_result = np.fromfile(real_result, dtype=np.float32) + golden = np.fromfile(golden, dtype=np.float32) + real_result = real_result[:golden.size] + print(real_result[:32]) + print(golden[:32]) + result = np.abs(real_result - golden) + deno = np.maximum(np.abs(real_result), np.abs(golden)) + result_atol = np.less_equal(result, loss) + result_rtol = np.less_equal(result / np.add(deno, minimum), loss) + if not result_rtol.all() and not result_atol.all(): + if np.sum(result_rtol == False) > real_result.size * loss and np.sum(result_atol == False) > real_result.size * loss: + print("[ERROR] result error") + return False + print("test pass") + return True + +if __name__ == '__main__': + print("=============================grad query============") + verify_result(sys.argv[1], sys.argv[4]) + print("=============================grad key============") + verify_result(sys.argv[2], sys.argv[5]) + print("=============================grad value============") + verify_result(sys.argv[3], 
sys.argv[6]) \ No newline at end of file diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/CMakeLists.txt b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/CMakeLists.txt new file mode 100644 index 00000000..f1459958 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/CMakeLists.txt @@ -0,0 +1,68 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_attention_fusion_grad) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/attention_fusion_grad/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub/") + set(LIB_PATH1 "/usr/local/Ascend/ascend-toolkit/latest/atc/lib64/stub/") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ${INC_PATH}/runtime/include + ${INC_PATH}/atc/include + ../inc + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${LIB_PATH1} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_attention_fusion_grad_op + operator_desc.cpp + op_runner.cpp + main.cpp + op_runner.cpp + common.cpp +) + +target_link_libraries(execute_attention_fusion_grad_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_attention_fusion_grad_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/common.cpp b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/common.cpp new file mode 100644 index 00000000..02eac9b4 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/common.cpp @@ -0,0 +1,79 @@ +/** +* @file common.cpp +* +* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include "common.h" + +#include +#include +#include +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp new file mode 100644 index 00000000..e6aa8340 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/main.cpp @@ -0,0 +1,182 @@ +/** +* @file main.cpp +* +* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +*/ +#include +#include +#include +#include +#include + +#include "acl/acl.h" +#include "op_runner.h" + +#include "common.h" + +bool g_isDevice = false; +int deviceId = 15; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector dout { 1024, 1000, 80 }; + std::vector softmax_out { 1024, 1000, 50 }; + std::vector query { 1024, 1000, 80}; + std::vector key { 1024, 50, 80 }; + std::vector value { 1024, 50, 80 }; + + std::vector grad_query { 1024, 1000, 80}; + std::vector grad_key { 1024, 50, 80 }; + std::vector grad_value { 1024, 50, 80 }; + + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(ACL_FLOAT, dout.size(), dout.data(), format); + opDesc.AddInputTensorDesc(ACL_FLOAT, softmax_out.size(), softmax_out.data(), format); + opDesc.AddInputTensorDesc(ACL_FLOAT, query.size(), query.data(), format); + opDesc.AddInputTensorDesc(ACL_FLOAT, key.size(), key.data(), format); + opDesc.AddInputTensorDesc(ACL_FLOAT, value.size(), value.data(), format); + + opDesc.AddOutputTensorDesc(ACL_FLOAT, grad_query.size(), grad_query.data(), format); + opDesc.AddOutputTensorDesc(ACL_FLOAT, grad_key.size(), grad_key.data(), format); + opDesc.AddOutputTensorDesc(ACL_FLOAT, grad_value.size(), grad_value.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/dout.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + ReadFile("../input/softmax_out.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1)); + ReadFile("../input/query.bin", fileSize, runner.GetInputBuffer(2), runner.GetInputSize(2)); + ReadFile("../input/key.bin", fileSize, runner.GetInputBuffer(3), runner.GetInputSize(3)); + ReadFile("../input/value.bin", fileSize, runner.GetInputBuffer(4), runner.GetInputSize(4)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + 
WriteFile("../output/grad_query.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + WriteFile("../output/grad_key.bin", runner.GetOutputBuffer(1), runner.GetOutputSize(1)); + WriteFile("../output/grad_value.bin", runner.GetOutputBuffer(2), runner.GetOutputSize(2)); + INFO_LOG("Write output success"); + return true; +} + +void DestoryResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destory resource failed"); + } else { + INFO_LOG("Destory resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } + else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + if (aclInit(NULL) != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestoryResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestoryResource(); + return FAILED; + } + + DestoryResource(); + + return SUCCESS; +} diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/op_runner.cpp b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/op_runner.cpp new file mode 100644 index 00000000..4df5eea5 --- /dev/null +++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/op_runner.cpp @@ -0,0 +1,464 @@ +/** +* @file op_runner.cpp +* +* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+*/ +#include "op_runner.h" +#include "aclnn_attention_fusion_grad.h" +#include +#include +#include +#include "acl/acl_op_compiler.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); +} + +OpRunner::~OpRunner() +{ + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + (void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), + nullptr, 0, GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), + nullptr, 0, GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + 
return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = aclnnAttentionFusionGradGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], inputTensor_[4], outputTensor_[0], outputTensor_[1], outputTensor_[2], + &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAttentionFusionGradGetWorkspaceSize success, workspace size %lu", workspaceSize); + + void *workspace = nullptr; + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnAttentionFusionGrad(workspace, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAttentionFusionGrad success"); + + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. 
error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + auto beforeTime = std::chrono::steady_clock::now(); + for (int i = 0; i<100; i++) { + ret = aclnnAttentionFusionGradGetWorkspaceSize(inputTensor_[0], inputTensor_[1], inputTensor_[2], inputTensor_[3], inputTensor_[4], outputTensor_[0], outputTensor_[1], outputTensor_[2], + &workspaceSize, &handle); + ret = aclnnAttentionFusionGrad(workspace, workspaceSize, handle, stream); + } + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + auto afterTime = std::chrono::steady_clock::now(); + double duration_microsecond = std::chrono::duration(afterTime - beforeTime).count(); + std::cout << "time cost " << duration_microsecond/100 << " us" << std::endl; + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
+void OpRunner::PrintInput(size_t index, size_t numElementsPerRow)
+{
+    if (index >= numInputs_) {
+        ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_);
+        return;
+    }
+
+    auto desc = opDesc_->inputDesc[index];
+    PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow);
+}
+
+void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow)
+{
+    if (index >= numOutputs_) {
+        ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_);
+        return;
+    }
+
+    auto desc = opDesc_->outputDesc[index];
+    PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow);
+}
diff --git a/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/operator_desc.cpp b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/operator_desc.cpp
new file mode 100644
index 00000000..1928103c
--- /dev/null
+++ b/cust_op/attention_fusion_grad/aclnn_attention_fusion_grad/src/operator_desc.cpp
@@ -0,0 +1,56 @@
+/**
+* @file operator_desc.cpp
+*
+* Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*/
+#include "common.h"
+#include "operator_desc.h"
+
+using namespace std;
+
+OperatorDesc::OperatorDesc() {}
+
+OperatorDesc::~OperatorDesc()
+{
+    for (auto *desc : inputDesc) {
+        aclDestroyTensorDesc(desc);
+    }
+
+    for (auto *desc : outputDesc) {
+        aclDestroyTensorDesc(desc);
+    }
+
+}
+
+OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType,
+                                               int numDims,
+                                               const int64_t *dims,
+                                               aclFormat format)
+{
+    aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (desc == nullptr) {
+        ERROR_LOG("create tensor failed");
+        return *this;
+    }
+    inputDesc.emplace_back(desc);
+    return *this;
+}
+
+OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType,
+                                                int numDims,
+                                                const int64_t *dims,
+                                                aclFormat format)
+{
+    aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format);
+    if (desc == nullptr) {
+        ERROR_LOG("create tensor failed");
+        return *this;
+    }
+
+    outputDesc.emplace_back(desc);
+    return *this;
+}
-- 
Gitee