diff --git a/.gitignore b/.gitignore
index cf2a19c60fdbb04bb13294bb4b98aed49ff33578..af55bde91b16fa63e219f6ba0b0493577ffd015b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,5 @@ nbdist/
.nb-gradle/
### Mac files ###
-*.DS_Store
\ No newline at end of file
+*.DS_Store
+.vscode
diff --git a/__release/solon-ai-bundle1/pom.xml b/__release/solon-ai-bundle1/pom.xml
index 21c0e02d8c597e8da06e2700a19559d126b532fa..894d670b22eac3b6136d6e44d4e5b8195db2e79e 100644
--- a/__release/solon-ai-bundle1/pom.xml
+++ b/__release/solon-ai-bundle1/pom.xml
@@ -22,6 +22,7 @@
../../solon-ai-load-html
../../solon-ai-load-markdown
../../solon-ai-load-pdf
+ ../../solon-ai-load-word
../../solon-ai-repo-elasticsearch
../../solon-ai-repo-milvus
diff --git a/solon-ai-load-word/pom.xml b/solon-ai-load-word/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..3a1dc953e638c556000bdc8714a4de0f37cb6a68
--- /dev/null
+++ b/solon-ai-load-word/pom.xml
@@ -0,0 +1,43 @@
+
+
+ 4.0.0
+
+
+ org.noear
+ solon-parent
+ 3.1.0
+ ../../../solon-parent/pom.xml
+
+
+ solon-ai-load-word
+ jar
+
+
+
+ org.noear
+ solon-ai
+
+
+
+ org.noear
+ solon-logging-simple
+ test
+
+
+
+ org.noear
+ solon-test
+ test
+
+
+
+ org.apache.poi
+ poi-ooxml
+ ${apache.poi.version}
+
+
+
+
+
\ No newline at end of file
diff --git a/solon-ai-load-word/src/main/java/org/noear/solon/ai/rag/loader/WordLoader.java b/solon-ai-load-word/src/main/java/org/noear/solon/ai/rag/loader/WordLoader.java
new file mode 100644
index 0000000000000000000000000000000000000000..7d929357b13ad1b39b937472a0815a417e5df0c3
--- /dev/null
+++ b/solon-ai-load-word/src/main/java/org/noear/solon/ai/rag/loader/WordLoader.java
@@ -0,0 +1,119 @@
+package org.noear.solon.ai.rag.loader;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.FileInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.poi.sl.draw.geom.GuideIf.Op;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.noear.solon.ai.rag.Document;
+import org.noear.solon.core.util.SupplierEx;
+
+public class WordLoader extends AbstractOptionsDocumentLoader {
+ private final SupplierEx source;
+
+ public WordLoader(File file) {
+ this(() -> new FileInputStream(file));
+ }
+
+ public WordLoader(URL url) {
+ this(() -> url.openStream());
+ }
+
+ public WordLoader(SupplierEx source) {
+ this.source = source;
+ this.options = new Options();
+ }
+
+ @Override
+ public List load() throws IOException {
+ List documents = new ArrayList<>();
+ try (InputStream stream = source.get()) {
+ try (
+ XWPFDocument reader = new XWPFDocument(stream)) {
+ Map metadata = new HashMap<>();
+ metadata.put("type", "word");
+ // 读取文档内容
+ if (options.loadMode == LoadMode.SINGLE) {
+ XWPFWordExtractor extractor = new XWPFWordExtractor(reader);
+ // 一次性获取文档的全部文本内容
+ String content = extractor.getText();
+ Document doc = new Document(content, metadata)
+ .metadata(this.additionalMetadata);
+ documents.add(doc);
+ extractor.close();
+ } else {
+ /*
+ for (XWPFParagraph extractor : reader.getParagraphs()) {
+ String content = extractor.getText();
+ Document doc = new Document(content, metadata)
+ .metadata(this.additionalMetadata);
+ documents.add(doc);
+ }
+ */
+ XWPFWordExtractor extractor = new XWPFWordExtractor(reader);
+ String content = extractor.getText();
+ Integer pageSize = this.options.pageSize;
+ int pageCount = (int) Math.ceil(content.length() / (double)pageSize);
+ for (int i = 0; i < pageCount; i++) {
+ String pageContent = content.substring(i * pageSize, Math.min((i + 1) * pageSize, content.length()));
+ Document doc = new Document(pageContent, metadata)
+ .metadata(this.additionalMetadata);
+ documents.add(doc);;
+ }
+
+ extractor.close();
+ }
+ }
+ } catch (IOException e) {
+ throw e;
+ } catch (RuntimeException e) {
+ throw e;
+ } catch (Throwable e) {
+ throw new RuntimeException(e);
+ }
+ return documents;
+ }
+
+ public static enum LoadMode {
+ /**
+ * 整个文档作为一个 Document
+ */
+ SINGLE,
+ /**
+ * 每页作为一个 Document
+ */
+ PAGE
+ }
+
+ /**
+ * 选项
+ */
+ public static class Options {
+ private LoadMode loadMode = LoadMode.PAGE;
+ private Integer pageSize = 500;
+
+
+ /**
+ * WORD 加载模式,可以是单文档模式或分页模式
+ */
+ public Options loadMode(LoadMode loadMode) {
+ this.loadMode = loadMode;
+ return this;
+ }
+
+ public Options pageSize(Integer pageSize) {
+ this.pageSize = pageSize;
+ return this;
+ }
+
+ }
+}
diff --git a/solon-ai-load-word/src/test/java/org/noear/solon/ai/rag/loader/WordLoaderTest.java b/solon-ai-load-word/src/test/java/org/noear/solon/ai/rag/loader/WordLoaderTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..e611f6755c89b069caabf120d20d8069e023c11e
--- /dev/null
+++ b/solon-ai-load-word/src/test/java/org/noear/solon/ai/rag/loader/WordLoaderTest.java
@@ -0,0 +1,21 @@
+package org.noear.solon.ai.rag.loader;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.noear.solon.ai.rag.Document;
+import org.noear.solon.core.util.ResourceUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class WordLoaderTest {
+    private static final Logger log = LoggerFactory.getLogger(WordLoaderTest.class);
+    /** Smoke test: the bundled demo.docx, loaded in PAGE mode, must yield at least one document. */
+    @Test
+    public void test1() throws Exception {
+        List<Document> docs = new WordLoader(ResourceUtil.getResource("demo.docx"))
+                .options(opt -> opt.loadMode(WordLoader.LoadMode.PAGE)).load();
+        org.junit.jupiter.api.Assertions.assertFalse(docs.isEmpty(), "demo.docx produced no documents");
+        log.info("loaded {} document(s): {}", docs.size(), docs);
+    }
+}
diff --git a/solon-ai-load-word/src/test/resources/demo.docx b/solon-ai-load-word/src/test/resources/demo.docx
new file mode 100644
index 0000000000000000000000000000000000000000..0a2247710a72e9d4389688184d2c067bf2b3582d
Binary files /dev/null and b/solon-ai-load-word/src/test/resources/demo.docx differ