diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4dfabfcbf539289c590b2544f8dea4454a835340..3de3d0364f309ac64792ba1baeb3a9fe34ea4bea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,8 +42,6 @@ add_executable(kylin-ai-business-framework-service src/main.cpp
     src/datamanagement/datamanagementdatabase.h
     src/datamanagement/datamanagementservice.cpp
     src/datamanagement/datamanagementservice.h
-    src/datamanagement/segmenttokenizer.cpp
-    src/datamanagement/segmenttokenizer.h
     src/utils/vectordb/vectordb.cpp
     src/utils/vectordb/vectordb.h
     src/utils/utils.cpp
diff --git a/src/datamanagement/datamanagementservice.cpp b/src/datamanagement/datamanagementservice.cpp
index 7dcae7ec79d3d6e1a94c59e2fa1283e93898ecec..f20015513ab709b0dae863872405c1c2dc7900e3 100644
--- a/src/datamanagement/datamanagementservice.cpp
+++ b/src/datamanagement/datamanagementservice.cpp
@@ -19,14 +19,14 @@
 #include "datamanagement/datamanagementjsonhelper.h"
 #include "utils/parser/parser.h"
 #include "utils/parser/fileparserfactory.h"
-#include "datamanagement/segmenttokenizer.h"
+#include "utils/utils.h"
 #include "embeddingtaskmanager/embeddingtaskmanager.h"
 #include "embeddingtaskmanager/embeddingtask.h"
 
 #include
 #include
 
-static const double TEXT_SEARCH_THRESHOLD = 0.8;
+static const double TEXT_SEARCH_THRESHOLD = 0.65;
 static const double VISION_SEARCH_THRESHOLD = 0.4;
 
 typedef enum {
@@ -274,7 +274,7 @@ std::vector DataManagementService::makeDatasByTextFilePathAndFormat(
             continue;
         }
 
-        SegmentTokenizer::segmentTokenize(file.contents, 100);
+        text::segmentTokenize(file.contents, 100);
         std::cout << "start encode text file " << path << std::endl;
         std::vector<std::vector<float>> vectors = textSideEmbeddingTexts(file.contents);
         std::cout << "finish encode text file " << path << std::endl;
diff --git a/src/datamanagement/segmenttokenizer.cpp b/src/datamanagement/segmenttokenizer.cpp
deleted file mode 100644
index 6e1074d03b38f6fced97f7ff357f7d8ffa27854e..0000000000000000000000000000000000000000
--- a/src/datamanagement/segmenttokenizer.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright 2024 KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include "segmenttokenizer.h"
-
-namespace SegmentTokenizer {
-
-std::vector<std::string> splitString(
-    const std::string& str, const std::string& delimiters) {
-    std::vector<std::string> result;
-
-    size_t startPos = 0;
-    size_t foundPos = str.find(delimiters);
-
-    while(foundPos != std::string::npos) {
-        if (delimiters == "." && !isEnglishPeriod(str, foundPos)) {
-            foundPos = str.find(delimiters, foundPos + delimiters.size());
-            continue;
-        }
-        result.emplace_back(str.substr(startPos, foundPos - startPos + delimiters.size()));
-        startPos = foundPos + delimiters.size();
-        foundPos = str.find(delimiters, startPos);
-    }
-    std::string lastSegment = str.substr(startPos);
-    if (!lastSegment.empty()) {
-        result.emplace_back(lastSegment);
-    }
-
-    return result;
-}
-
-// Split the text on full stops; if a sentence is still too long, split it further on commas
-std::vector<std::string> splitChineseString(const std::string& str, int maxLength)
-{
-    std::vector<std::string> sentences = splitString(str, "。");
-    for (auto it = sentences.begin(); it != sentences.end();) {
-        if (it->size() > maxLength) {
-            std::vector<std::string> phrases = splitString(*it, ",");
-            it = sentences.erase(it);
-            it = sentences.insert(it, phrases.begin(), phrases.end());
-        }
-        ++it;
-    }
-    return sentences;
-}
-
-std::vector<std::string> splitEnglishString(const std::string& str, int maxLength)
-{
-    std::vector<std::string> sentences = splitString(str, ".");
-    for (auto it = sentences.begin(); it != sentences.end();) {
-        if (it->size() > maxLength) {
-            std::vector<std::string> phrases = splitString(*it, ",");
-            it = sentences.erase(it);
-            it = sentences.insert(it, phrases.begin(), phrases.end());
-        }
-        ++it;
-    }
-    return sentences;
-}
-
-bool isEnglishPeriod(const std::string& text, size_t index) {
-    if (index == 0) // make sure we do not go out of bounds
-        return false;
-    if (index == text.size() - 1) // a '.' at the end of the text is treated as an English period
-        return true;
-
-    // look at the characters around the '.'
-    char prevChar = text[index - 1];
-    char nextChar = text[index + 1];
-
-    // decide whether the '.' is an English period
-    if (std::isspace(prevChar) && std::isalpha(nextChar)) // space before the '.' and a letter after it
-        return true;
-    else if (std::isalpha(prevChar) && std::isspace(nextChar)) // letter before the '.' and a space after it
-        return true;
-    else if (std::ispunct(prevChar) && std::isspace(nextChar)) // punctuation before the '.' and a space after it
-        return true;
-
-    return false;
-}
-
-bool containsEnglishPeriod(const std::string& text)
-{
-    for (size_t i = 0; i < text.size(); ++i) {
-        if (isEnglishPeriod(text, i))
-            return true;
-    }
-    return false;
-}
-
-std::vector<std::string> fixedSizeChunks(const std::string& text, int chunkSize)
-{
-    std::vector<std::string> chunks;
-
-    size_t startPos = 0;
-    while (startPos < text.size()) {
-        size_t endPos = startPos + chunkSize;
-        // make sure we do not split in the middle of a UTF-8 multi-byte character
-        if (endPos < text.size()) {
-            while ((text[endPos] & 0xC0) == 0x80) {
-                // the byte at endPos is not the first byte of a multi-byte character,
-                // so move backwards until we reach the start of that character
-                --endPos;
-            }
-        }
-
-        size_t length = endPos - startPos;
-        chunks.push_back(text.substr(startPos, length));
-        startPos += length;
-    }
-
-    return chunks;
-}
-
-// Text segmentation: first split on full stops; if the pieces are still too long,
-// split them further on commas; if they are still too long, fall back to fixed-size chunks
-void segmentTokenize(std::vector<std::string> &texts, int maxLength)
-{
-    for (auto begin = texts.begin(); begin != texts.end();) {
-        if (begin->size() < maxLength) {
-            ++begin;
-            continue;
-        }
-
-        if (begin->find("。") != std::string::npos) {
-            std::vector<std::string> sentences = splitChineseString(*begin, maxLength);
-            begin = texts.erase(begin);
-            begin = texts.insert(begin, sentences.begin(), sentences.end());
-        } else if (containsEnglishPeriod(*begin)) {
-            std::vector<std::string> sentences = splitEnglishString(*begin, maxLength);
-            begin = texts.erase(begin);
-            begin = texts.insert(begin, sentences.begin(), sentences.end());
-        }
-
-        if (begin->size() < maxLength) {
-            ++begin;
-            continue;
-        }
-
-        std::vector<std::string> chunks = fixedSizeChunks(*begin, maxLength);
-        begin = texts.erase(begin);
-        begin = texts.insert(begin, chunks.begin(), chunks.end());
-        std::advance(begin, chunks.size());
-    }
-}
-
-}
diff --git a/src/datamanagement/segmenttokenizer.h b/src/datamanagement/segmenttokenizer.h
deleted file mode 100644
index 162ad3290385037937e8b8a451c0636776e3a7c8..0000000000000000000000000000000000000000
--- a/src/datamanagement/segmenttokenizer.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2024 KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify it under
- * the terms of the GNU General Public License as published by the Free Software
- * Foundation, either version 3 of the License, or (at your option) any later
- * version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-#ifndef SEGMENTTOKENIZER_H
-#define SEGMENTTOKENIZER_H
-
-#include <string>
-#include <vector>
-
-namespace SegmentTokenizer {
-
-std::vector<std::string> splitString(const std::string& str, const std::string& delimiters);
-std::vector<std::string> splitChineseString(const std::string& str, int maxLength);
-std::vector<std::string> splitEnglishString(const std::string& str, int maxLength);
-bool isEnglishPeriod(const std::string& text, size_t index);
-bool containsEnglishPeriod(const std::string& text);
-std::vector<std::string> fixedSizeChunks(const std::string& text, int chunkSize);
-void segmentTokenize(std::vector<std::string>& texts, int maxLength);
-
-} // namespace SegmentTokenizer
-
-#endif // SEGMENTTOKENIZER_H
diff --git a/src/utils/autotokenizer.cpp b/src/utils/autotokenizer.cpp
index 3151c868e1246f4a6c995d6912734f6d571bcc73..5c3595c2cd41ab36e2f8e49bdd626aead9d3e93c 100644
--- a/src/utils/autotokenizer.cpp
+++ b/src/utils/autotokenizer.cpp
@@ -29,22 +29,39 @@ AutoTokenizer::AutoTokenizer(const std::string& modelFolderPath)
     PyRun_SimpleString("import sys");
     std::string command = "sys.path.append('" + std::string(DATA_MANAGEMENT_PYTHON_PATH) + "')";
     PyRun_SimpleString(command.c_str());
-    tokenizer_ = PyImport_ImportModule("autotokenizer");
-    if (!tokenizer_) {
+
+    PyObject* module = PyImport_ImportModule("autotokenizer");
+    if (!module) {
         std::cerr << "Import tokenizer error" << std::endl;
         PyErr_Print();
         return;
     }
-    PyObject* initFunc = PyObject_GetAttrString(tokenizer_, "init_tokenizer");
-    if (!initFunc) {
-        std::cerr << "Get init_tokenizer from tokenizer error" << std::endl;
+
+    PyObject* tokenizerClass = PyObject_GetAttrString(module, "Tokenizer");
+    if (!tokenizerClass) {
+        std::cerr << "Get tokenizer class error" << std::endl;
         PyErr_Print();
+        Py_XDECREF(module);
         return;
     }
+
     PyObject* args = Py_BuildValue("(s)", modelFolderPath.c_str());
-    PyObject_CallObject(initFunc, args);
+    if (!args) {
+        std::cerr << "Build string args with " << modelFolderPath << " error" << std::endl;
+        PyErr_Print();
+        Py_XDECREF(module);
+        Py_XDECREF(tokenizerClass);
+        return;
+    }
+
+    tokenizer_ = PyObject_CallObject(tokenizerClass, args);
+    if (!tokenizer_) {
+        std::cerr << "Call init_tokenizer error" << std::endl;
+        PyErr_Print();
+    }
 
-    Py_XDECREF(initFunc);
+    Py_XDECREF(module);
+    Py_XDECREF(tokenizerClass);
     Py_XDECREF(args);
 }
diff --git a/src/utils/python/autotokenizer.py b/src/utils/python/autotokenizer.py
index 74a3198cde882467955f0607ffe830566451a758..81aa9182d4cbdcf45d58d45d18476edfd38ba346 100644
--- a/src/utils/python/autotokenizer.py
+++ b/src/utils/python/autotokenizer.py
@@ -16,37 +16,24 @@ from transformers import AutoTokenizer
 import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
-tokenizer = None
-def init_tokenizer(tokenizer_file_path):
-    global tokenizer
+class Tokenizer:
+    def __init__(self, tokenizer_file_path):
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_file_path)
 
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_file_path)
+    def tokenize_text(self, input_text):
 
-def tokenize_text(input_text):
-    global tokenizer
+        if self.tokenizer is None:
+            print("Don't init tokenizer, please run init_tokenizer befor")
+            return {}
 
-    if tokenizer is None:
-        print("Don't init tokenizer, please run init_tokenizer befor")
-        return {}
+        # convert the text into tokens
+        tokens = self.tokenizer(input_text)
 
-    # convert the text into tokens
-    tokens = tokenizer(input_text)
-    # print(tokens)
+        input_ids = tokens['input_ids']
+        attention_mask = tokens['attention_mask']
 
-    input_ids = tokens['input_ids']
-    attention_mask = tokens['attention_mask']
+        # merge input_ids and attention_mask
+        merged_dict = {'input_ids': input_ids, 'attention_mask': attention_mask}
 
-    # merge input_ids and attention_mask
-    merged_dict = {'input_ids': input_ids, 'attention_mask': attention_mask}
-
-    return merged_dict
-
-# # Call the functions with the input texts and the tokenizer file path
-# input_texts = { "杰尼龟", "皮卡丘", "小火龙", "妙蛙种子" }
-# tokenizer_file_path = "/home/wangweinan/models/bge-m3-onnx/"
-
-# init_tokenizer(tokenizer_file_path)
-# for text in input_texts:
-#     tokens = tokenize_text(text)
-#     print (tokens)
+        return merged_dict
diff --git a/src/utils/textembedder.cpp b/src/utils/textembedder.cpp
index 1a2b65f039cd31cb74e09ba9a51848577fd9ea77..d1380862589825a8d87eb3c5d0d5bcf3a61d81be 100644
--- a/src/utils/textembedder.cpp
+++ b/src/utils/textembedder.cpp
@@ -201,7 +201,7 @@ std::vector TextEmbedder::inference(Token& token, int batch)
     }
 
     const std::vector inputNames = {"input_ids", "attention_mask"};
-    const std::vector outputNames = {"dense_vecs"};
+    const std::vector outputNames = {"sentence_embedding"};
 
     std::vector output(batch * 1024);
     std::vector inputTensors = createInputTensors(token, batch, inputNames);
diff --git a/src/utils/utils.cpp b/src/utils/utils.cpp
index 8ec3005c07b3227253b3d328966751363c1f6237..cb2f7efc0978102ebed8be8e676b1940c4cc0435 100644
--- a/src/utils/utils.cpp
+++ b/src/utils/utils.cpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <regex>
 
 namespace utils {
 
@@ -80,3 +81,80 @@ void normalizeTokens(std::vector &tokens, int length)
     }
 }
 } // namespace math
+
+
+namespace text
+{
+// Remove redundant newlines and whitespace, then split the document recursively
+void applyRegex(std::string &text) {
+    static const std::regex newlineRegex("\\n{3,}");
+    static const std::regex spaceRegex("\\s");
+    static const std::regex doubleNewlineRegex("\\n\\n");
+    static const std::regex singlePunctuationRegex("([;;.!?。!?\\?])([^”’])");
+    static const std::regex ellipsisRegex("(\\.{6})([^\"’”」』])");
+    static const std::regex chineseEllipsisRegex("(\\…{2})([^\"’”」』])");
+    static const std::regex endQuotePunctuationRegex("([;;!?。!?\\?][\"’”」』]{0,2})([^;;!?,。!?\\?])");
+
+    text = std::regex_replace(text, newlineRegex, "\n");
+    text = std::regex_replace(text, spaceRegex, " ");
+    text = std::regex_replace(text, doubleNewlineRegex, "");
+    text = std::regex_replace(text, singlePunctuationRegex, "$1\n$2");
+    text = std::regex_replace(text, ellipsisRegex, "$1\n$2");
+    text = std::regex_replace(text, chineseEllipsisRegex, "$1\n$2");
+    text = std::regex_replace(text, endQuotePunctuationRegex, "$1\n$2");
+
+    text.erase(std::remove_if(text.begin(), text.end(), isspace), text.end());
+}
+
+// Split sentences according to the maximum length
+std::vector<std::string> splitSentences(const std::string &text, int maxLength) {
+    std::istringstream iss(text);
+    std::string sentence;
+    std::vector<std::string> sentences;
+    while (std::getline(iss, sentence, '\n')) {
+        // skip the sentence if it is empty
+        if (sentence.empty()) {
+            continue;
+        }
+        size_t start = 0;
+        while (start < sentence.length()) {
+            size_t end = std::min(sentence.length(), start + maxLength);
+            // make sure multi-byte characters are not split
+            while (end > start && (sentence[end] & 0xC0) == 0x80) {
+                --end;
+            }
+            // if end did not move, search forward for a valid end position
+            if (end == start) {
+                while (start < sentence.length() && (sentence[start] & 0xC0) == 0x80) {
+                    ++start;
+                }
+                end = std::min(sentence.length(), start + maxLength);
+                while (end > start && (sentence[end] & 0xC0) == 0x80) {
+                    --end;
+                }
+            }
+            sentences.push_back(sentence.substr(start, end - start));
+            start = end;
+        }
+    }
+    return sentences;
+}
+
+
+void segmentTokenize(std::vector<std::string> &texts, int maxLength) {
+    auto begin = texts.begin();
+    while (begin != texts.end()) {
+        applyRegex(*begin);
+        std::vector<std::string> sentences = splitSentences(*begin, maxLength);
+        if (sentences.empty()) {
+            ++begin;
+            continue;
+        }
+
+        begin = texts.erase(begin);
+        begin = texts.insert(begin, sentences.begin(), sentences.end());
+        std::advance(begin, sentences.size());
+    }
+
+}
+}
diff --git a/src/utils/utils.h b/src/utils/utils.h
index 15f46d112599604b75a3a5c26322f373b5127319..4495dc52413f4d8d787db3277a0486e09bc77b18 100644
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
@@ -31,6 +31,13 @@ namespace math {
     void normalize(std::vector &vec);
     float norm(const std::vector &matrix);
     void normalizeTokens(std::vector &tokens, int length);
+
+}
+
+namespace text {
+    void applyRegex(std::string &text);
+    std::vector<std::string> splitSentences(const std::string &text, int maxLength);
+    void segmentTokenize(std::vector<std::string> &texts, int maxLength);
 }
 
 #endif // UTILS_H
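Below is a minimal caller-side sketch, not part of the patch, showing how the new text::segmentTokenize() entry point added in src/utils/utils.cpp can be exercised. It mirrors the call site in DataManagementService::makeDatasByTextFilePathAndFormat(), which chunks file contents to at most 100 bytes before embedding; the standalone main(), the sample strings, and the include path are illustrative assumptions, not code from this repository.

// Usage sketch (illustrative only, not part of the patch).
#include <iostream>
#include <string>
#include <vector>

#include "utils/utils.h"   // declares text::segmentTokenize()

int main() {
    // One oversized entry mixing Chinese and English punctuation,
    // similar to what the file parser would hand to the service.
    std::vector<std::string> contents = {
        "第一句话。第二句比较长,按标点会被进一步切开。This is an English sentence. Another one follows."
    };

    // Same 100-byte limit as the call in datamanagementservice.cpp.
    text::segmentTokenize(contents, 100);

    // Each resulting chunk would then be embedded separately.
    for (const std::string &chunk : contents) {
        std::cout << chunk << std::endl;
    }
    return 0;
}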