diff --git a/.gitignore b/.gitignore
index cf2a19c60fdbb04bb13294bb4b98aed49ff33578..af55bde91b16fa63e219f6ba0b0493577ffd015b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,4 +26,5 @@ nbdist/
 .nb-gradle/
 
 ### Mac files ###
-*.DS_Store
\ No newline at end of file
+*.DS_Store
+.vscode
diff --git a/__release/solon-ai-bundle1/pom.xml b/__release/solon-ai-bundle1/pom.xml
index 21c0e02d8c597e8da06e2700a19559d126b532fa..894d670b22eac3b6136d6e44d4e5b8195db2e79e 100644
--- a/__release/solon-ai-bundle1/pom.xml
+++ b/__release/solon-ai-bundle1/pom.xml
@@ -22,6 +22,7 @@
         <module>../../solon-ai-load-html</module>
         <module>../../solon-ai-load-markdown</module>
         <module>../../solon-ai-load-pdf</module>
+        <module>../../solon-ai-load-word</module>
 
         <module>../../solon-ai-repo-elasticsearch</module>
         <module>../../solon-ai-repo-milvus</module>
diff --git a/solon-ai-load-word/pom.xml b/solon-ai-load-word/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..3a1dc953e638c556000bdc8714a4de0f37cb6a68
--- /dev/null
+++ b/solon-ai-load-word/pom.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.noear</groupId>
+        <artifactId>solon-parent</artifactId>
+        <version>3.1.0</version>
+        <relativePath>../../../solon-parent/pom.xml</relativePath>
+    </parent>
+
+    <artifactId>solon-ai-load-word</artifactId>
+    <packaging>jar</packaging>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.noear</groupId>
+            <artifactId>solon-ai</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.noear</groupId>
+            <artifactId>solon-logging-simple</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.noear</groupId>
+            <artifactId>solon-test</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-ooxml</artifactId>
+            <version>${apache.poi.version}</version>
+        </dependency>
+
+    </dependencies>
+
+</project>
\ No newline at end of file
diff --git a/solon-ai-load-word/src/main/java/org/noear/solon/ai/rag/loader/WordLoader.java b/solon-ai-load-word/src/main/java/org/noear/solon/ai/rag/loader/WordLoader.java
new file mode 100644
index 0000000000000000000000000000000000000000..7d929357b13ad1b39b937472a0815a417e5df0c3
--- /dev/null
+++ b/solon-ai-load-word/src/main/java/org/noear/solon/ai/rag/loader/WordLoader.java
@@ -0,0 +1,159 @@
+package org.noear.solon.ai.rag.loader;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.FileInputStream;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.poi.sl.draw.geom.GuideIf.Op;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.noear.solon.ai.rag.Document;
+import org.noear.solon.core.util.SupplierEx;
+
+/**
+ * Loads a Word (.docx) file through Apache POI and turns its text into
+ * {@link Document}s, either as one document or as fixed-size text chunks.
+ */
+public class WordLoader extends AbstractOptionsDocumentLoader<WordLoader.Options, WordLoader> {
+    private final SupplierEx<InputStream> source;
+
+    /**
+     * Creates a loader that reads from a local file; the file is opened on each {@link #load()}.
+     */
+    public WordLoader(File file) {
+        this(() -> new FileInputStream(file));
+    }
+
+    /**
+     * Creates a loader that reads from a URL; the stream is opened on each {@link #load()}.
+     */
+    public WordLoader(URL url) {
+        this(() -> url.openStream());
+    }
+
+    /**
+     * Creates a loader over an arbitrary stream supplier and installs default {@link Options}.
+     */
+    public WordLoader(SupplierEx<InputStream> source) {
+        this.source = source;
+        this.options = new Options();
+    }
+
+    /**
+     * Extracts the text of the Word file and wraps it in documents according to the load mode.
+     *
+     * @return one document holding the whole text in {@link LoadMode#SINGLE} mode, otherwise
+     *         one document per chunk of {@code pageSize} characters (the last may be shorter)
+     * @throws IOException      if the stream cannot be opened, parsed or closed
+     * @throws RuntimeException wrapping any other checked failure raised by the stream supplier
+     */
+    @Override
+    public List<Document> load() throws IOException {
+        List<Document> documents = new ArrayList<>();
+        // All three resources are closed in reverse order, even when extraction fails.
+        try (InputStream stream = source.get();
+             XWPFDocument reader = new XWPFDocument(stream);
+             XWPFWordExtractor extractor = new XWPFWordExtractor(reader)) {
+            // Fetch the full text of the file in one call.
+            String content = extractor.getText();
+            if (options.loadMode == LoadMode.SINGLE) {
+                documents.add(newDocument(content));
+            } else {
+                for (String page : splitByLength(content, options.pageSize)) {
+                    documents.add(newDocument(page));
+                }
+            }
+        } catch (IOException | RuntimeException e) {
+            throw e;
+        } catch (Throwable e) {
+            // The supplier may raise checked failures other than IOException; surface them unchecked.
+            throw new RuntimeException(e);
+        }
+        return documents;
+    }
+
+    /**
+     * Builds a document with its own metadata map tagged {@code type=word}, then applies
+     * the loader's additional metadata to it.
+     */
+    private Document newDocument(String content) {
+        // A fresh map per document, so documents never share one mutable instance.
+        Map<String, Object> metadata = new HashMap<>();
+        metadata.put("type", "word");
+        return new Document(content, metadata)
+                .metadata(this.additionalMetadata);
+    }
+
+    /**
+     * Cuts text into consecutive chunks of {@code pageSize} characters; the last chunk may
+     * be shorter, and a chunk grows by one character rather than split a surrogate pair.
+     */
+    private static List<String> splitByLength(String content, int pageSize) {
+        List<String> pages = new ArrayList<>();
+        int length = content.length();
+        int start = 0;
+        while (start < length) {
+            // Widen to long so a huge pageSize cannot overflow the end index.
+            int end = (int) Math.min((long) start + pageSize, length);
+            if (end < length
+                    && Character.isHighSurrogate(content.charAt(end - 1))
+                    && Character.isLowSurrogate(content.charAt(end))) {
+                end++;
+            }
+            pages.add(content.substring(start, end));
+            start = end;
+        }
+        return pages;
+    }
+
+    /**
+     * How the extracted text is turned into documents.
+     */
+    public enum LoadMode {
+        /**
+         * The whole file becomes a single Document.
+         */
+        SINGLE,
+        /**
+         * The text is cut into chunks of {@code pageSize} characters, one Document each.
+         * These are fixed-size text chunks, not the page breaks of the Word layout.
+         */
+        PAGE
+    }
+
+    /**
+     * Loader options.
+     */
+    public static class Options {
+        private LoadMode loadMode = LoadMode.PAGE;
+        private int pageSize = 500;
+
+        /**
+         * Sets the load mode: single-document or chunked.
+         */
+        public Options loadMode(LoadMode loadMode) {
+            this.loadMode = loadMode;
+            return this;
+        }
+
+        /**
+         * Sets the chunk length, in characters, used by {@link LoadMode#PAGE}.
+         *
+         * @throws IllegalArgumentException if {@code pageSize} is null or not positive
+         */
+        public Options pageSize(Integer pageSize) {
+            if (pageSize == null || pageSize <= 0) {
+                throw new IllegalArgumentException("pageSize must be a positive integer: " + pageSize);
+            }
+            this.pageSize = pageSize;
+            return this;
+        }
+    }
+}
diff --git a/solon-ai-load-word/src/test/java/org/noear/solon/ai/rag/loader/WordLoaderTest.java b/solon-ai-load-word/src/test/java/org/noear/solon/ai/rag/loader/WordLoaderTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..e611f6755c89b069caabf120d20d8069e023c11e
--- /dev/null
+++ b/solon-ai-load-word/src/test/java/org/noear/solon/ai/rag/loader/WordLoaderTest.java
@@ -0,0 +1,28 @@
+package org.noear.solon.ai.rag.loader;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.noear.solon.ai.rag.Document;
+import org.noear.solon.core.util.ResourceUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Smoke test for {@link WordLoader}.
+ */
+public class WordLoaderTest {
+    private static final Logger log = LoggerFactory.getLogger(WordLoaderTest.class);
+
+    /**
+     * Loads the bundled demo.docx in PAGE mode and logs the resulting documents.
+     */
+    @Test
+    public void test1() throws Exception {
+        WordLoader loader = new WordLoader(ResourceUtil.getResource("demo.docx"))
+                .options(opt -> opt.loadMode(WordLoader.LoadMode.PAGE));
+        List<Document> docs = loader.load();
+        // Use the declared logger rather than System.out.
+        log.info("{}", docs);
+    }
+}
diff --git a/solon-ai-load-word/src/test/resources/demo.docx b/solon-ai-load-word/src/test/resources/demo.docx
new file mode 100644
index 0000000000000000000000000000000000000000..0a2247710a72e9d4389688184d2c067bf2b3582d
Binary files /dev/null and b/solon-ai-load-word/src/test/resources/demo.docx differ