From 6b8d70ecb717b512d8c53da54f531517ee4f5580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Fri, 29 Aug 2025 17:58:04 +0800 Subject: [PATCH 01/24] modify test indicator --- ACL_PyTorch/built-in/audio/whisper/README.md | 25 ++++---------------- ACL_PyTorch/built-in/audio/whisper/infer.py | 9 +++++++ 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/ACL_PyTorch/built-in/audio/whisper/README.md b/ACL_PyTorch/built-in/audio/whisper/README.md index 0e2b164d03..b5187cd746 100644 --- a/ACL_PyTorch/built-in/audio/whisper/README.md +++ b/ACL_PyTorch/built-in/audio/whisper/README.md @@ -95,27 +95,10 @@ infer.py推理参数: warmup结束之后,开始推理librispeech_asr_dummy数据集,推理过程中会打屏输出E2E性能,推理结束后会输出WER精度得分。 -**如果你想推理过程中打印encode和decode的耗时,你可以执行以下命令:** -```SHELL -# 1. 找到当前的环境路径(简称${location}),Location后面的那一串就是当前环境路径 -pip show openai-whisper | grep Location -# 2. 记录当前whisper库decoding.py的文件路径 -${decoding_path} = ${location}/whisper/decoding.py -# 3. 执行patch文件 -patch -p1 < whisper_decoding.patch -# 可能会提示你 -# cant find file to patch at input line 3 -# ... -# File to patch: -# 这时候需要你手动指定文件路径,输入之前得到的 -${decoding_path} -# 按回车,提示 patching file ${decoding_path} 即成功 -``` - ## 性能数据 在librispeech_asr_dummy/clean数据集上的性能如下: - | 模型 | 芯片 | 平均encode | 平均decode |平均E2E | - |---------|------------|----------|-----------------|---------| - | whisper | 800I A2 | 0.90ms | 3.25ms | 67.32ms | - 注:平均decode 指在decode阶段,生成单个token的平均耗时。 \ No newline at end of file + | 模型 | 芯片 | RTF | + |---------|------------|----------| + | whisper | 800I A2 | 0.0236 | + 注:RTF表示转录一段音频所需的时间与音频实际长度的比值,多次运行取平均 \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/whisper/infer.py b/ACL_PyTorch/built-in/audio/whisper/infer.py index ba5da6fa13..0617aa7eb0 100644 --- a/ACL_PyTorch/built-in/audio/whisper/infer.py +++ b/ACL_PyTorch/built-in/audio/whisper/infer.py @@ -17,6 +17,7 @@ import jiwer import numpy as np import pandas as pd from datasets import load_dataset +import librosa import torch from torch import nn, Tensor @@ -279,6 +280,12 @@ if __name__ == '__main__': npu_backend = tng.get_npu_backend(compiler_config=config) dataset = LibriSpeechDataset(wsp_args.speech_path, device=device) + audios = load_dataset(wsp_args.speech_path, split="validation") + duration_seconds = 0 + for audio in audios: + y, audio_sr = audio["audio"]["array"], audio["audio"]["sampling_rate"] + duration_seconds += librosa.get_duration(y=y, sr=audio_sr) + loader = torch.utils.data.DataLoader(dataset, batch_size=wsp_args.batch_size) options = whisper.DecodingOptions(language='en', without_timestamps=True, fp16=True) @@ -300,5 +307,7 @@ if __name__ == '__main__': print("{}/{} - {}".format(_step, wsp_args.warmup, result[bs].text)) print("LibriSpeech infer, English to English TRANSCRIBE ...") + start_time = time.time() p_wer = libri_speech_infer(wsp_model, options, loader) + print(f"RTF: {(time.time()-start_time)/duration_seconds:.4f}") print(f"LibriSpeech infer WER score = {p_wer * 100:.2f} %") -- Gitee From eb280416a48b54c90a7623f0ed2cc2820ad597c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Fri, 29 Aug 2025 18:19:45 +0800 Subject: [PATCH 02/24] modify test indicator --- ACL_PyTorch/built-in/audio/whisper/README.md | 25 ++++---------------- ACL_PyTorch/built-in/audio/whisper/infer.py | 9 +++++++ 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/ACL_PyTorch/built-in/audio/whisper/README.md b/ACL_PyTorch/built-in/audio/whisper/README.md index 0e2b164d03..135d9f87b9 100644 --- 
a/ACL_PyTorch/built-in/audio/whisper/README.md +++ b/ACL_PyTorch/built-in/audio/whisper/README.md @@ -95,27 +95,10 @@ infer.py推理参数: warmup结束之后,开始推理librispeech_asr_dummy数据集,推理过程中会打屏输出E2E性能,推理结束后会输出WER精度得分。 -**如果你想推理过程中打印encode和decode的耗时,你可以执行以下命令:** -```SHELL -# 1. 找到当前的环境路径(简称${location}),Location后面的那一串就是当前环境路径 -pip show openai-whisper | grep Location -# 2. 记录当前whisper库decoding.py的文件路径 -${decoding_path} = ${location}/whisper/decoding.py -# 3. 执行patch文件 -patch -p1 < whisper_decoding.patch -# 可能会提示你 -# cant find file to patch at input line 3 -# ... -# File to patch: -# 这时候需要你手动指定文件路径,输入之前得到的 -${decoding_path} -# 按回车,提示 patching file ${decoding_path} 即成功 -``` - ## 性能数据 在librispeech_asr_dummy/clean数据集上的性能如下: - | 模型 | 芯片 | 平均encode | 平均decode |平均E2E | - |---------|------------|----------|-----------------|---------| - | whisper | 800I A2 | 0.90ms | 3.25ms | 67.32ms | - 注:平均decode 指在decode阶段,生成单个token的平均耗时。 \ No newline at end of file + | 模型 | 芯片 | 转录倍率QPS | + |---------|------------|----------| + | whisper | 800I A2 | 42.34 | + 注:QPS表示音频实际长度与转录音频所需的时间的比值,多次运行取平均 \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/whisper/infer.py b/ACL_PyTorch/built-in/audio/whisper/infer.py index ba5da6fa13..066a46d3fd 100644 --- a/ACL_PyTorch/built-in/audio/whisper/infer.py +++ b/ACL_PyTorch/built-in/audio/whisper/infer.py @@ -17,6 +17,7 @@ import jiwer import numpy as np import pandas as pd from datasets import load_dataset +import librosa import torch from torch import nn, Tensor @@ -279,6 +280,12 @@ if __name__ == '__main__': npu_backend = tng.get_npu_backend(compiler_config=config) dataset = LibriSpeechDataset(wsp_args.speech_path, device=device) + audios = load_dataset(wsp_args.speech_path, split="validation") + duration_seconds = 0 + for audio in audios: + y, audio_sr = audio["audio"]["array"], audio["audio"]["sampling_rate"] + duration_seconds += librosa.get_duration(y=y, sr=audio_sr) + loader = torch.utils.data.DataLoader(dataset, batch_size=wsp_args.batch_size) options = whisper.DecodingOptions(language='en', without_timestamps=True, fp16=True) @@ -300,5 +307,7 @@ if __name__ == '__main__': print("{}/{} - {}".format(_step, wsp_args.warmup, result[bs].text)) print("LibriSpeech infer, English to English TRANSCRIBE ...") + start_time = time.time() p_wer = libri_speech_infer(wsp_model, options, loader) + print(f"QPS: {duration_seconds/(time.time()-start_time):.2f}") print(f"LibriSpeech infer WER score = {p_wer * 100:.2f} %") -- Gitee From 46c4501320deb7b846c081b92b206dab773b7453 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Fri, 29 Aug 2025 18:38:01 +0800 Subject: [PATCH 03/24] modify test indicator --- ACL_PyTorch/built-in/audio/whisper/README.md | 2 +- .../audio/whisper/whisper_decoding.patch | 34 ------------------- 2 files changed, 1 insertion(+), 35 deletions(-) delete mode 100644 ACL_PyTorch/built-in/audio/whisper/whisper_decoding.patch diff --git a/ACL_PyTorch/built-in/audio/whisper/README.md b/ACL_PyTorch/built-in/audio/whisper/README.md index 4ed36a7f30..548db00008 100644 --- a/ACL_PyTorch/built-in/audio/whisper/README.md +++ b/ACL_PyTorch/built-in/audio/whisper/README.md @@ -100,5 +100,5 @@ warmup结束之后,开始推理librispeech_asr_dummy数据集,推理过程 | 模型 | 芯片 | 转录倍率QPS | |---------|------------|----------| - | whisper | 800I A2 | 42.34 | + | whisper | 800I A2 64G | 42.34 | 注:QPS表示音频实际长度与转录音频所需的时间的比值,多次运行取平均 diff --git a/ACL_PyTorch/built-in/audio/whisper/whisper_decoding.patch b/ACL_PyTorch/built-in/audio/whisper/whisper_decoding.patch deleted file mode 100644 
index 871e972c2f..0000000000 --- a/ACL_PyTorch/built-in/audio/whisper/whisper_decoding.patch +++ /dev/null @@ -1,34 +0,0 @@ -+++ decoding.py -@@ -652,7 +652,10 @@ - # encoded audio features are given; skip audio encoding - audio_features = mel - else: -+ import time -+ time1 = time.time() - audio_features = self.model.encoder(mel) -+ print(f"encode time = {(time.time() - time1) * 1000:.2f} ms") - - if audio_features.dtype != ( - torch.float16 if self.options.fp16 else torch.float32 -@@ -683,6 +686,8 @@ - no_speech_probs = [np.nan] * n_batch - - try: -+ import time -+ time1 = time.time() - for i in range(self.sample_len): - logits = self.inference.logits(tokens, audio_features) - -@@ -703,6 +708,8 @@ - tokens, completed = self.decoder.update(tokens, logits, sum_logprobs) - - if completed or tokens.shape[-1] > self.n_ctx: -+ avg_time = (time.time() - time1) / i * 1000 -+ print(f"avg decode time = {avg_time:.2f} ms") - break - finally: - self.inference.cleanup_caching() -@@ -824,3 +831,4 @@ - result = DecodingTask(model, options).run(mel) - - return result[0] if single else result -- Gitee From ca522f30b19251671ecc9f5362a847cffcf5abd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 10:11:27 +0800 Subject: [PATCH 04/24] add mineru torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/README.md | 194 ++++++++++ .../built-in/ocr/MinerU/doclayout_yolo.patch | 50 +++ ACL_PyTorch/built-in/ocr/MinerU/infer.py | 346 ++++++++++++++++++ .../built-in/ocr/MinerU/requirements.txt | 34 ++ .../built-in/ocr/MinerU/ultralytics.patch | 77 ++++ 5 files changed, 701 insertions(+) create mode 100644 ACL_PyTorch/built-in/ocr/MinerU/README.md create mode 100644 ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch create mode 100644 ACL_PyTorch/built-in/ocr/MinerU/infer.py create mode 100644 ACL_PyTorch/built-in/ocr/MinerU/requirements.txt create mode 100644 ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch diff --git a/ACL_PyTorch/built-in/ocr/MinerU/README.md b/ACL_PyTorch/built-in/ocr/MinerU/README.md new file mode 100644 index 0000000000..19f28fbb03 --- /dev/null +++ b/ACL_PyTorch/built-in/ocr/MinerU/README.md @@ -0,0 +1,194 @@ +# MinerU(TorchAir)-推理指导 + +- [MinerU(TorchAir)-推理指导](#MinerU(TorchAir)-推理指导) +- [概述](#概述) +- [推理环境准备](#推理环境准备) +- [快速上手](#快速上手) + - [获取源码](#获取源码) + - [获取权重](#获取权重) + - [获取数据集](#获取数据集) + - [执行推理](#执行推理) + - [精度测试](#精度测试) + +****** + +# 概述 +MinerU是由上海人工智能实验室OpenDataLab团队开发的开源文档解析工具,致力于解决大模型(LLM)训练和RAG(检索增强生成)应用中高质量结构化数据的提取难题。其核心价值在于将复杂文档(如PDF、网页、电子书)转换为机器可读的Markdown、JSON格式,同时保留原始文档的语义逻辑与多模态元素。 + +- 版本说明: + + ``` + url=https://github.com/opendatalab/MinerU.git + commit_id=de41fa58590263e43b783fe224b6d07cae290a33 + model_name=MinerU + ``` + +# 推理环境准备 + +- 该模型需要以下插件与驱动 + **表 1** 版本配套表 + + | 配套 | 版本 | 环境准备指导 | + | ------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------- | + | 固件与驱动 | 25.0.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | + | CANN | 8.3.0 | - | + | Python | 3.11 | - | + | PyTorch | 2.6.0 | - | + | Ascend Extension PyTorch | 2.6.0 | - | + | 说明:Atlas 800I A2/Atlas 300I Pro 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | + +# 快速上手 + +## 获取源码 + +1. 获取`Pytorch`源码 + + ``` + git clone https://github.com/opendatalab/MinerU.git + cd MinerU + git reset --hard de41fa58590263e43b783fe224b6d07cae290a33 + pip3 install -e . + cd .. + ``` + +2. 
安装依赖 + + ``` + pip3 install -r requirements.txt + ### 此外,还需安装 Torchvision Adapter + git clone https://gitee.com/ascend/vision.git vision_npu + cd vision_npu + git checkout v0.21.0-7.1.0 + pip3 install -r requirement.txt + source /usr/local/Ascend/ascend-toolkit/set_env.sh # Default path, change it if needed. + python setup.py bdist_wheel + cd dist + pip install torchvision_npu-0.21.0+git22ca6b2-cp311-cp311-linux_aarch64.whl + cd ../../ + ``` + + +3. 修改第三方库 +进入第三方库安装路径,默认为`source_path = /usr/local/lib/python3.11/site-packages`,通过工作目录`workdir`(自定义)中的`ultralytics.patch`和`doclayout_yolo.patch`进行修改 + ``` + source_path=/usr/local/lib/python3.11/site-packages + cd ${source_path}/ultralytics + patch -p2 < ${workdir}/ultralytics.patch + cd ${source_path}/doclayout_yolo + patch -p2 < ${workdir}/doclayout_yolo.patch + ``` +修改完成后回到工作目录`workdir` + +## 获取权重 + +运行以下指令,下载权重文件[Model weights](https://www.modelscope.cn/models/OpenDataLab/PDF-Extract-Kit-1.0/summary),默认存放为`/root/.cache/modelscope/hub/models/OpenDataLab/PDF-Extract-Kit-1___0` + +``` +mineru-models-download --source modelscope --model_type pipeline +``` +下载完成后,默认在根目录生成`mineru.json`文件,移动数据集时,需修改`/root/mineru.json`文件中"models-dir": "pipeline"为修改后权重存放路径 + +权重目录大致结构为: +```text +📁 models +├── 📁 Layout +│   └── 📁 YOLO +│   └── doclayout_yolo_docstructbench_imgsz1280_2501.pt +├── 📁 MFD +│   └── 📁 YOLO +│   └── yolo_v8_ft.pt +├── 📁 MFR +│   └── 📁 unimernet_hf_small_2503 +│   ├── model.safetensors +│   ├── …… +│   └── tokenizer_config.json +├── 📁 OCR +│   └── 📁 paddleocr_torch +│   ├── Multilingual_PP-OCRv3_det_infer.pth +│   ├── arabic_PP-OCRv3_rec_infer.pth +│   ├── …… +│   ├── …… +│   └── th_PP-OCRv5_rec_infer.pth +├── 📁 ReadingOrder +│   └── 📁 layout_reader +│   ├── config.json +│   └── model.safetensors +└── 📁 TabRec + └── 📁 SlanetPlus + └── slanet-plus.onnx +``` + + +## 获取数据集 + +创建数据集目录`OmniDocBench_dataset`,下载多样性文档解析评测集`OmniDocBench`数据集的[pdfs和标注](https://opendatalab.com/OpenDataLab/OmniDocBench),解压并放置在`OmniDocBench_dataset`目录下 +文件目录格式大致如下: + ``` + 📁 workdir + ├── infer.py + ├── …… + ├── 📁 MinerU + └── 📁 OmniDocBench_dataset +   ├── OmniDocBench.json +   └── 📁 pdfs + └── ***.pdf + ``` + +## 执行推理 + +运行推理脚本infer.py + +``` +python3 infer.py --data_path=OmniDocBench_dataset --model_source=local +``` + +- 参数说明 + - data_path: 数据集路径 + - model_source: 模型源类型,local表示使用本地文件,modelscope/huggingface表示在线拉取权重 + +推理执行完成后,解析结果存放于`OmniDocBench_dataset/output/`目录,结果除了输出主要的 markdown 文件外,还会生成多个辅助文件用于调试、质检和进一步处理。 + +## 精度测试 + +使用`OmniDocBench`数据集配套评测代码测试精度。 + +1. 推理结果整理 +将解析结果文件夹中的markdown文件整理放置于同一目录,本例将所有markdown文件存放于OmniDocBench_dataset目录下的results_md文件夹 + ``` + cp OmniDocBench_dataset/output/*/auto/*.md OmniDocBench_dataset/results_md/ + ``` + +2. 获取测评源码并构建环境 + + ``` + git clone https://github.com/opendatalab/OmniDocBench.git + cd OmniDocBench + conda create -n omnidocbench python=3.10 + conda activate omnidocbench + pip install -r requirements.txt + ``` + +3. 测评配置修改 +修改`OmniDocBench`测评代码中的config文件,具体来说,我们使用端到端测评配置,修改configs/end2end.yaml文件中的ground_truth的data_path为下载的OmniDocBench.json路径,修改prediction的data_path中提供整理的推理结果的文件夹路径,如下: + ``` + # -----以下是需要修改的部分 ----- + dataset: + dataset_name: end2end_dataset + ground_truth: + data_path: ../OmniDocBench_dataset/OmniDocBench.json + prediction: + data_path: ../OmniDocBench_dataset/result_md + ``` + +4. 
精度测量结果 +配置好config文件后,只需要将config文件作为参数传入,运行以下代码即可进行评测: + ``` + python pdf_validation.py --config ./configs/end2end.yaml + ``` + + 在`OmniDocBench`数据集上的精度为: + |模型|芯片|overall_EN|overall_CH| + |------|------|------|------| + |MinerU|300I DUO|0.1588|0.2527| + |MinerU|800I A2 64G|0.1580|0.2510| + diff --git a/ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch b/ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch new file mode 100644 index 0000000000..7cf22c0b32 --- /dev/null +++ b/ACL_PyTorch/built-in/ocr/MinerU/doclayout_yolo.patch @@ -0,0 +1,50 @@ +diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/engine/predictor.py doclayout_yolo-0.0.4_fix/doclayout_yolo/engine/predictor.py +--- doclayout_yolo-0.0.4/doclayout_yolo/engine/predictor.py 2025-02-11 15:49:31.000000000 +0800 ++++ doclayout_yolo-0.0.4_fix/doclayout_yolo/engine/predictor.py 2025-09-09 16:05:20.011737230 +0800 +@@ -152,7 +152,8 @@ + (list): A list of transformed images. + """ + same_shapes = len({x.shape for x in im}) == 1 +- letterbox = LetterBox(self.imgsz, auto=same_shapes and self.model.pt, stride=self.model.stride) ++ letterbox = LetterBox(self.imgsz, auto=False, stride=self.model.stride) ++ # letterbox = LetterBox(self.imgsz, auto=same_shapes and self.model.pt, stride=self.model.stride) + return [letterbox(image=x) for x in im] + + def postprocess(self, preds, img, orig_imgs): +@@ -225,7 +226,8 @@ + + # Warmup model + if not self.done_warmup: +- self.model.warmup(imgsz=(1 if self.model.pt or self.model.triton else self.dataset.bs, 3, *self.imgsz)) ++ # self.model.warmup(imgsz=(1 if self.model.pt or self.model.triton else self.dataset.bs, 3, *self.imgsz)) ++ self.model.warmup(imgsz=(self.dataset.bs, 3, *self.imgsz)) + self.done_warmup = True + + self.seen, self.windows, self.batch = 0, [], None + +diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/nn/modules/block.py doclayout_yolo-0.0.4_fix/doclayout_yolo/nn/modules/block.py +--- doclayout_yolo-0.0.4/doclayout_yolo/nn/modules/block.py 2025-02-11 15:49:31.000000000 +0800 ++++ doclayout_yolo-0.0.4_fix/doclayout_yolo/nn/modules/block.py 2025-09-09 16:05:20.019737230 +0800 +@@ -230,7 +230,9 @@ + def forward(self, x): + """Forward pass through C2f layer.""" + y = list(self.cv1(x).chunk(2, 1)) +- y.extend(m(y[-1]) for m in self.m) ++ # y.extend(m(y[-1]) for m in self.m) ++ for m in self.m: ++ y.append(m(y[-1])) + return self.cv2(torch.cat(y, 1)) + + def forward_split(self, x): + +diff -ruN doclayout_yolo-0.0.4/doclayout_yolo/utils/tal.py doclayout_yolo-0.0.4_fix/doclayout_yolo/utils/tal.py +--- doclayout_yolo-0.0.4/doclayout_yolo/utils/tal.py 2025-02-11 15:49:31.000000000 +0800 ++++ doclayout_yolo-0.0.4_fix/doclayout_yolo/utils/tal.py 2025-09-09 16:05:20.023737230 +0800 +@@ -328,7 +328,8 @@ + sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y + sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx) + anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2)) +- stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device)) ++ # stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device)) ++ stride_tensor.append(torch.ones((h * w, 1), dtype=dtype, device=device)*stride) + return torch.cat(anchor_points), torch.cat(stride_tensor) \ No newline at end of file diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py new file mode 100644 index 0000000000..933917f639 --- /dev/null +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -0,0 +1,346 @@ +# 
Copyright (c) Opendatalab. All rights reserved. +import os +from pathlib import Path +import inspect +import argparse +from loguru import logger +import math +from typing import Optional, Tuple, Union +import pypdfium2 as pdfium +import time + +import torch +import torch_npu +import torch.nn as nn +import torchvision +import torchvision_npu +import torchair as tng +from torchair.configs.compiler_config import CompilerConfig + +# import logging +# from torchair import logger +# logger.setLevel(logging.DEBUG) + +from mineru.backend.pipeline.model_list import AtomicModel +from mineru.model.mfr.unimernet.unimernet_hf.unimer_swin.modeling_unimer_swin import UnimerSwinSelfAttention +from mineru.backend.pipeline.model_init import ( + AtomModelSingleton, + table_model_init, + mfd_model_init, + mfr_model_init, + doclayout_yolo_model_init, + ocr_model_init, + ) +from mineru.utils.model_utils import get_vram +from mineru.backend.pipeline.batch_analyze import ( + YOLO_LAYOUT_BASE_BATCH_SIZE, + MFD_BASE_BATCH_SIZE, + MFR_BASE_BATCH_SIZE, + ) + +from transformers.generation.utils import GenerationMixin +from demo.demo import parse_doc + + +def parse_args(): + parser = argparse.ArgumentParser("MinerU infer") + parser.add_argument("--model_source", type=str, default="local", help="model checkpoint source") + parser.add_argument("--data_path", type=str, default="/home/z00939677/OmniDocBench_dataset") + parser.add_argument("--warmup", type=int, default=2, help="Warm up times") + parser.add_argument("--warmup_data_path", type=str, default="../OmniDocBench_dataset/pdfs/jiaocai_71434495.pdf_0.pdf") + args = parser.parse_args() + return args + +def atom_model_init_compile(model_name: str, **kwargs): + atom_model = None + if model_name == AtomicModel.Layout: + atom_model = doclayout_yolo_model_init( + kwargs.get('doclayout_yolo_weights'), + kwargs.get('device') + ) + atom_model.model.model = compile_model(atom_model.model.model, False, True) + npu_input = torch.zeros((batch_candidate[AtomicModel.Layout][0], 3, atom_model.imgsz, atom_model.imgsz)) + tng.inference.set_dim_gears(npu_input, {0: batch_candidate[AtomicModel.Layout]}) + + elif model_name == AtomicModel.MFD: + atom_model = mfd_model_init( + kwargs.get('mfd_weights'), + kwargs.get('device') + ) + atom_model.model.model = compile_model(atom_model.model.model, False, True) + npu_input = torch.zeros((batch_candidate[AtomicModel.MFD][0], 3, atom_model.imgsz, atom_model.imgsz)) + tng.inference.set_dim_gears(npu_input, {0: batch_candidate[AtomicModel.MFD]}) + + elif model_name == AtomicModel.MFR: + atom_model = mfr_model_init( + kwargs.get('mfr_weight_dir'), + kwargs.get('device') + ) + + modify_mfr_model(atom_model.model) + + print(atom_model.model.encoder.__class__) + + atom_model.model.encoder = compile_model(atom_model.model.encoder, False, True) + # npu_input = torch.zeros((batch_candidate[AtomicModel.MFR], 3, *atom_model.model.transform.input_size), dtype=torch.half, device=atom_model.device) + # tng.inference.set_dim_gears(npu_input, {0: batch_candidate[AtomicModel.MFR]}) + atom_model.model.decoder = compile_model(atom_model.model.decoder, True, True) + + elif model_name == AtomicModel.OCR: + atom_model = ocr_model_init( + kwargs.get('det_db_box_thresh'), + kwargs.get('lang'), + kwargs.get('det_limit_side_len'), + ) + + elif model_name == AtomicModel.Table: + atom_model = table_model_init( + kwargs.get('lang'), + ) + + else: + logger.error('model name not allow') + exit(1) + + if atom_model is None: + logger.error('model init failed') + exit(1) + else: 
+ return atom_model + + + +def rewrite_mfr_encoder_multi_head_attention_forward(model): + wq = model.query.weight + wk = model.key.weight + wv = model.value.weight + model.qkv = nn.Linear(in_features=wk.shape[1], out_features= wq.shape[0] + wk.shape[0] + wv.shape[0]) + model.qkv.weight = nn.Parameter(torch.concat([wq, wk, wv], dim=0), requires_grad=False) + wq_bias = model.query.bias if model.query.bias is not None else torch.zeros(wq.shape[0]) + wk_bias = model.key.bias if model.key.bias is not None else torch.zeros(wk.shape[0]) + wv_bias = model.key.bias if model.value.bias is not None else torch.zeros(wv.shape[0]) + model.qkv.bias = nn.Parameter(torch.concat([wq_bias, wk_bias, wv_bias], dim=0), requires_grad=False) + + def attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False + ) -> Tuple[torch.Tensor]: + + # """融合qk为大矩阵,由于加入相对位置编码,PFA接口用不了,暂时只修改矩阵乘法""" + batch_size, dim, num_channels = hidden_states.shape + qkv = model.qkv(hidden_states) + q, k, v = qkv.chunk(3, dim=-1) + + query_layer = q.view(*q.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) + key_layer = k.view(*k.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) + value_layer = v.view(*v.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) + + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + relative_position_bias = relative_position_bias.view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in UnimerSwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.view( + batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1) + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + model.forward = attn_forward.__get__(model) + + +def modify_mfr_model(model): + # 修改encoder的attention forward + for name, module in model.encoder.named_modules(): + if isinstance(module, UnimerSwinSelfAttention): + rewrite_mfr_encoder_multi_head_attention_forward(module) + rewrite_mfr_encoder_forward() + +def compile_model(model, dynamic, fullgraph): + config = CompilerConfig() + config.experimental_config.frozen_parameter = True + config.experimental_config.tiling_schedule_optimize = True + npu_backend = tng.get_npu_backend(compiler_config=config) + compiled_model = torch.compile(model, dynamic=dynamic, fullgraph=fullgraph, backend=npu_backend) + # tng.use_internal_format_weight(compiled_model) + return compiled_model + + +def rewrite_model_init(): + def _patched_getmodel(self, atom_model_name: str, **kwargs): + lang = kwargs.get('lang', None) + table_model_name = kwargs.get('table_model_name', None) + + if atom_model_name in [AtomicModel.OCR]: + key = (atom_model_name, lang) + elif atom_model_name in [AtomicModel.Table]: + key = (atom_model_name, table_model_name, lang) + else: + key = atom_model_name + + if key not in self._models: + self._models[key] = atom_model_init_compile(model_name=atom_model_name, **kwargs) + return self._models[key] + AtomModelSingleton.get_atom_model = _patched_getmodel + +def rewrite_mfr_encoder_forward(): + def _patched_prepare_encoder_decoder_kwargs_for_generation(self, + inputs_tensor: torch.Tensor, + model_kwargs, + model_input_name: Optional[str], + generation_config, + ): + # 1. get encoder + encoder = self.get_encoder() + + # 2. Prepare encoder args and encoder kwargs from model kwargs and generation config. + irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] + encoder_kwargs = { + argument: value + for argument, value in model_kwargs.items() + if not any(argument.startswith(p) for p in irrelevant_prefix) + } + encoder_signature = set(inspect.signature(encoder.forward).parameters) + encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in encoder_signature + if not encoder_accepts_wildcard: + encoder_kwargs = { + argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature + } + encoder_kwargs["output_attentions"] = generation_config.output_attentions + encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states + + # 3. 
make sure that encoder returns `ModelOutput` + model_input_name = model_input_name if model_input_name is not None else self.main_input_name + encoder_kwargs["return_dict"] = True + + ####### 固定input_tensor形状 + pad_count = 0 + if batch_candidate[AtomicModel.MFR] != inputs_tensor.shape[0]: + pad_count = batch_candidate[AtomicModel.MFR]-inputs_tensor.shape[0] + padding_tensor = torch.zeros(pad_count, *inputs_tensor.shape[1:], dtype=inputs_tensor.dtype, device=inputs_tensor.device) + inputs_tensor = torch.cat((inputs_tensor, padding_tensor), dim=0) + + encoder_kwargs[model_input_name] = inputs_tensor + output = encoder(**encoder_kwargs)# type: ignore + if pad_count != 0: + output.last_hidden_state = output.last_hidden_state[:-pad_count] + output.pooler_output = output.pooler_output[:-pad_count] + model_kwargs["encoder_outputs"] = output + return model_kwargs + + GenerationMixin._prepare_encoder_decoder_kwargs_for_generation = _patched_prepare_encoder_decoder_kwargs_for_generation + +def warmup(data_path, warmup_iters): + data_path = Path(data_path) + + output_dir = Path(data_path).parent + output_dir = os.path.join(output_dir, "warmup_res") + pdf_suffixes = [".pdf"] + image_suffixes = [".png", ".jpeg", ".jpg"] + supported_suffixes = pdf_suffixes + image_suffixes + + if data_path.suffix.lower() not in supported_suffixes: + raise ValueError( + f"Unsupported file type: '{data_path.suffix}'. " + f"Supported types: {supported_suffixes}" + ) + + doc_path_list = [data_path] * sum(batch_candidate[AtomicModel.Layout]) + for _ in range(warmup_iters): + parse_doc(doc_path_list, output_dir, backend="pipeline") + + +def get_batch_ratio(device="npu"): + batch_ratio = 1 + if str(device).startswith('npu') or str(device).startswith('cuda'): + vram = get_vram(device) + if vram is not None: + gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram))) + if gpu_memory >= 16: + batch_ratio = 16 + elif gpu_memory >= 12: + batch_ratio = 8 + elif gpu_memory >= 8: + batch_ratio = 4 + elif gpu_memory >= 6: + batch_ratio = 2 + else: + batch_ratio = 1 + logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}') + else: + # Default batch_ratio when VRAM can't be determined + batch_ratio = 1 + logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}') + return batch_ratio + + +def get_pdf_page_count(pdf_path): + pdf = pdfium.PdfDocument(pdf_path) + try: + return len(pdf) + finally: + pdf.close() + +if __name__ == '__main__': + args = parse_args() + os.environ['MINERU_MODEL_SOURCE'] = args.model_source + + __dir__ = args.data_path + pdf_files_dir = os.path.join(__dir__, "pdfs") + output_dir = os.path.join(__dir__, "output") + pdf_suffixes = [".pdf"] + image_suffixes = [".png", ".jpeg", ".jpg"] + + + print(pdf_files_dir) + batch_ratio = get_batch_ratio() + # warmup(args.warmup, batch_ratio, ) + + rewrite_model_init() + + doc_path_list = [] + pdfs_page_count = 0 + for doc_path in Path(pdf_files_dir).glob('*'): + if doc_path.suffix in pdf_suffixes + image_suffixes: + doc_path_list.append(doc_path) + pdfs_page_count += get_pdf_page_count(doc_path) + + batch_candidate = { + AtomicModel.Layout: [YOLO_LAYOUT_BASE_BATCH_SIZE, pdfs_page_count % YOLO_LAYOUT_BASE_BATCH_SIZE], + AtomicModel.MFD: [MFD_BASE_BATCH_SIZE, pdfs_page_count % MFD_BASE_BATCH_SIZE], + AtomicModel.MFR: batch_ratio*MFR_BASE_BATCH_SIZE, + } + print(len(doc_path_list), batch_candidate) + warmup(args.warmup_data_path, args.warmup) + + print("******** 精度测试 **********") + start_time = time.time() + 
parse_doc(doc_path_list, output_dir, backend="pipeline") + print(f"per page process time: {(time.time()-start_time)/pdfs_page_count:.2f}s") diff --git a/ACL_PyTorch/built-in/ocr/MinerU/requirements.txt b/ACL_PyTorch/built-in/ocr/MinerU/requirements.txt new file mode 100644 index 0000000000..b98527828a --- /dev/null +++ b/ACL_PyTorch/built-in/ocr/MinerU/requirements.txt @@ -0,0 +1,34 @@ +boto3==1.40.24 +click==8.2.1 +loguru==0.7.3 +numpy==2.2.6 +pandas==2.3.2 +pdfminer.six==20250506 +tqdm==4.67.1 +requests +httpx +pillow==11.3.0 +pypdfium2==4.30.0 +pypdf==6.0.0 +reportlab==4.4.3 +pdftext==0.6.3 +modelscope==1.29.2 +huggingface-hub==0.34.4 +json-repair==0.50.0 +opencv-python==4.12.0.88 +fast-langdetect==0.2.5 +matplotlib==3.10.6 +ultralytics==8.3.193 +doclayout_yolo==0.0.4 +dill==0.3.8 +rapid_table==1.0.5 +PyYAML==6.0.2 +ftfy==6.3.1 +openai==1.106.1 +shapely==2.1.1 +pyclipper==1.3.0.post6 +omegaconf==2.3.0 +torch==2.6.0 +torch_npu==2.6.0 +torchvision==0.21.0 +transformers==4.56.1 \ No newline at end of file diff --git a/ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch b/ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch new file mode 100644 index 0000000000..4fab87d605 --- /dev/null +++ b/ACL_PyTorch/built-in/ocr/MinerU/ultralytics.patch @@ -0,0 +1,77 @@ +diff -ruN ultralytics-8.3.193/ultralytics/engine/predictor.py ultralytics_/ultralytics/engine/predictor.py +--- ultralytics-8.3.193/ultralytics/engine/predictor.py 2025-09-04 19:51:11.000000000 +0800 ++++ ultralytics_/ultralytics/engine/predictor.py 2025-09-09 14:56:14.535737230 +0800 +@@ -196,9 +196,10 @@ + same_shapes = len({x.shape for x in im}) == 1 + letterbox = LetterBox( + self.imgsz, +- auto=same_shapes +- and self.args.rect +- and (self.model.pt or (getattr(self.model, "dynamic", False) and not self.model.imx)), ++ # auto=same_shapes ++ # and self.args.rect ++ # and (self.model.pt or (getattr(self.model, "dynamic", False) and not self.model.imx)), ++ auto=False, + stride=self.model.stride, + ) + return [letterbox(image=x) for x in im] +@@ -311,8 +312,11 @@ + + # Warmup model + if not self.done_warmup: ++ # self.model.warmup( ++ # imgsz=(1 if self.models.pt or self.model.triton else self.dataset.bs, self.model.ch, *self.imgsz) ++ # ) + self.model.warmup( +- imgsz=(1 if self.model.pt or self.model.triton else self.dataset.bs, self.model.ch, *self.imgsz) ++ imgsz=(self.dataset.bs, self.model.ch, *self.imgsz) + ) + self.done_warmup = True + +@@ -400,7 +404,8 @@ + dnn=self.args.dnn, + data=self.args.data, + fp16=self.args.half, +- fuse=True, ++ # fuse=True, ++ fuse=False, + verbose=verbose, + ) + +diff -ruN ultralytics-8.3.193/ultralytics/nn/modules/block.py ultralytics_/ultralytics/nn/modules/block.py +--- ultralytics-8.3.193/ultralytics/nn/modules/block.py 2025-09-04 19:51:11.000000000 +0800 ++++ ultralytics_/ultralytics/nn/modules/block.py 2025-09-09 14:56:14.543737230 +0800 +@@ -237,7 +237,9 @@ + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Apply sequential pooling operations to input and return concatenated feature maps.""" + y = [self.cv1(x)] +- y.extend(self.m(y[-1]) for _ in range(3)) ++ # y.extend(self.m(y[-1]) for _ in range(3)) ++ for _ in range(3): ++ y.append(self.m(y[-1])) + return self.cv2(torch.cat(y, 1)) + + +@@ -315,7 +317,9 @@ + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward pass through C2f layer.""" + y = list(self.cv1(x).chunk(2, 1)) +- y.extend(m(y[-1]) for m in self.m) ++ # y.extend(m(y[-1]) for m in self.m) ++ for m in self.m: ++ y.append(m(y[-1])) + return self.cv2(torch.cat(y, 1)) 
+ + def forward_split(self, x: torch.Tensor) -> torch.Tensor: + +diff -ruN ultralytics-8.3.193/ultralytics/utils/tal.py ultralytics_/ultralytics/utils/tal.py +--- ultralytics-8.3.193/ultralytics/utils/tal.py 2025-09-04 19:51:11.000000000 +0800 ++++ ultralytics_/ultralytics/utils/tal.py 2025-09-09 14:56:14.551737230 +0800 +@@ -375,7 +375,8 @@ + sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset # shift y + sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx) + anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2)) +- stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device)) ++ # stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device)) ++ stride_tensor.append(torch.ones((h * w, 1), dtype=dtype, device=device)*stride) + return torch.cat(anchor_points), torch.cat(stride_tensor) \ No newline at end of file -- Gitee From 786d49c07aada86b1bab99ab3c09c84a94554a3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 11:21:26 +0800 Subject: [PATCH 05/24] add mineru torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 56 +++++++++++++++--------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index 933917f639..b3947b91e8 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -1,13 +1,28 @@ -# Copyright (c) Opendatalab. All rights reserved. +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import os -from pathlib import Path +import math +import time +import sys +from typing import Optional, Tuple, Union import inspect + +from pathlib import Path import argparse from loguru import logger -import math -from typing import Optional, Tuple, Union import pypdfium2 as pdfium -import time import torch import torch_npu @@ -17,10 +32,6 @@ import torchvision_npu import torchair as tng from torchair.configs.compiler_config import CompilerConfig -# import logging -# from torchair import logger -# logger.setLevel(logging.DEBUG) - from mineru.backend.pipeline.model_list import AtomicModel from mineru.model.mfr.unimernet.unimernet_hf.unimer_swin.modeling_unimer_swin import UnimerSwinSelfAttention from mineru.backend.pipeline.model_init import ( @@ -51,6 +62,7 @@ def parse_args(): args = parser.parse_args() return args + def atom_model_init_compile(model_name: str, **kwargs): atom_model = None if model_name == AtomicModel.Layout: @@ -82,8 +94,6 @@ def atom_model_init_compile(model_name: str, **kwargs): print(atom_model.model.encoder.__class__) atom_model.model.encoder = compile_model(atom_model.model.encoder, False, True) - # npu_input = torch.zeros((batch_candidate[AtomicModel.MFR], 3, *atom_model.model.transform.input_size), dtype=torch.half, device=atom_model.device) - # tng.inference.set_dim_gears(npu_input, {0: batch_candidate[AtomicModel.MFR]}) atom_model.model.decoder = compile_model(atom_model.model.decoder, True, True) elif model_name == AtomicModel.OCR: @@ -100,21 +110,20 @@ def atom_model_init_compile(model_name: str, **kwargs): else: logger.error('model name not allow') - exit(1) + sys.exit(1) if atom_model is None: logger.error('model init failed') - exit(1) - else: - return atom_model - + sys.exit(1) + + return atom_model def rewrite_mfr_encoder_multi_head_attention_forward(model): wq = model.query.weight wk = model.key.weight wv = model.value.weight - model.qkv = nn.Linear(in_features=wk.shape[1], out_features= wq.shape[0] + wk.shape[0] + wv.shape[0]) + model.qkv = nn.Linear(in_feature = wk.shape[1], out_features = wq.shape[0] + wk.shape[0] + wv.shape[0]) model.qkv.weight = nn.Parameter(torch.concat([wq, wk, wv], dim=0), requires_grad=False) wq_bias = model.query.bias if model.query.bias is not None else torch.zeros(wq.shape[0]) wk_bias = model.key.bias if model.key.bias is not None else torch.zeros(wk.shape[0]) @@ -178,18 +187,18 @@ def rewrite_mfr_encoder_multi_head_attention_forward(model): def modify_mfr_model(model): # 修改encoder的attention forward - for name, module in model.encoder.named_modules(): + for _, module in model.encoder.named_modules(): if isinstance(module, UnimerSwinSelfAttention): rewrite_mfr_encoder_multi_head_attention_forward(module) rewrite_mfr_encoder_forward() + def compile_model(model, dynamic, fullgraph): config = CompilerConfig() config.experimental_config.frozen_parameter = True config.experimental_config.tiling_schedule_optimize = True npu_backend = tng.get_npu_backend(compiler_config=config) compiled_model = torch.compile(model, dynamic=dynamic, fullgraph=fullgraph, backend=npu_backend) - # tng.use_internal_format_weight(compiled_model) return compiled_model @@ -210,6 +219,7 @@ def rewrite_model_init(): return self._models[key] AtomModelSingleton.get_atom_model = _patched_getmodel + def rewrite_mfr_encoder_forward(): def _patched_prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor: torch.Tensor, @@ -231,7 +241,9 @@ def rewrite_mfr_encoder_forward(): encoder_accepts_wildcard = "kwargs" in encoder_signature or "model_kwargs" in 
encoder_signature if not encoder_accepts_wildcard: encoder_kwargs = { - argument: value for argument, value in encoder_kwargs.items() if argument in encoder_signature + argument: value + for argument, value in encoder_kwargs.items() + if argument in encoder_signature } encoder_kwargs["output_attentions"] = generation_config.output_attentions encoder_kwargs["output_hidden_states"] = generation_config.output_hidden_states @@ -243,7 +255,7 @@ def rewrite_mfr_encoder_forward(): ####### 固定input_tensor形状 pad_count = 0 if batch_candidate[AtomicModel.MFR] != inputs_tensor.shape[0]: - pad_count = batch_candidate[AtomicModel.MFR]-inputs_tensor.shape[0] + pad_count = batch_candidate[AtomicModel.MFR] - inputs_tensor.shape[0] padding_tensor = torch.zeros(pad_count, *inputs_tensor.shape[1:], dtype=inputs_tensor.dtype, device=inputs_tensor.device) inputs_tensor = torch.cat((inputs_tensor, padding_tensor), dim=0) @@ -257,6 +269,7 @@ def rewrite_mfr_encoder_forward(): GenerationMixin._prepare_encoder_decoder_kwargs_for_generation = _patched_prepare_encoder_decoder_kwargs_for_generation + def warmup(data_path, warmup_iters): data_path = Path(data_path) @@ -308,6 +321,7 @@ def get_pdf_page_count(pdf_path): finally: pdf.close() + if __name__ == '__main__': args = parse_args() os.environ['MINERU_MODEL_SOURCE'] = args.model_source -- Gitee From daf9c2a2bf11f9cc9fdce7af41ea0c3bf0dd6cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 11:51:29 +0800 Subject: [PATCH 06/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index b3947b91e8..6ce7c54565 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -110,12 +110,12 @@ def atom_model_init_compile(model_name: str, **kwargs): else: logger.error('model name not allow') - sys.exit(1) + raise ValueError("model name not allow") if atom_model is None: logger.error('model init failed') - sys.exit(1) - + raise RuntimeError("model init failed") + return atom_model @@ -123,7 +123,8 @@ def rewrite_mfr_encoder_multi_head_attention_forward(model): wq = model.query.weight wk = model.key.weight wv = model.value.weight - model.qkv = nn.Linear(in_feature = wk.shape[1], out_features = wq.shape[0] + wk.shape[0] + wv.shape[0]) + model.qkv = nn.Linear(in_feature = wk.shape[1], + out_features = wq.shape[0] + wk.shape[0] + wv.shape[0]) model.qkv.weight = nn.Parameter(torch.concat([wq, wk, wv], dim=0), requires_grad=False) wq_bias = model.query.bias if model.query.bias is not None else torch.zeros(wq.shape[0]) wk_bias = model.key.bias if model.key.bias is not None else torch.zeros(wk.shape[0]) @@ -335,7 +336,6 @@ if __name__ == '__main__': print(pdf_files_dir) batch_ratio = get_batch_ratio() - # warmup(args.warmup, batch_ratio, ) rewrite_model_init() @@ -349,7 +349,7 @@ if __name__ == '__main__': batch_candidate = { AtomicModel.Layout: [YOLO_LAYOUT_BASE_BATCH_SIZE, pdfs_page_count % YOLO_LAYOUT_BASE_BATCH_SIZE], AtomicModel.MFD: [MFD_BASE_BATCH_SIZE, pdfs_page_count % MFD_BASE_BATCH_SIZE], - AtomicModel.MFR: batch_ratio*MFR_BASE_BATCH_SIZE, + AtomicModel.MFR: batch_ratio * MFR_BASE_BATCH_SIZE, } print(len(doc_path_list), batch_candidate) warmup(args.warmup_data_path, args.warmup) -- Gitee From 0eb4a2b5ec105dee509b5cd7a60c45848ff1ca66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= 
Date: Wed, 10 Sep 2025 11:58:28 +0800 Subject: [PATCH 07/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index 6ce7c54565..70bbdb91cb 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -13,11 +13,11 @@ # limitations under the License. import os +import sys import math import time -import sys -from typing import Optional, Tuple, Union import inspect +from typing import Optional, Tuple, Union from pathlib import Path import argparse @@ -56,9 +56,9 @@ from demo.demo import parse_doc def parse_args(): parser = argparse.ArgumentParser("MinerU infer") parser.add_argument("--model_source", type=str, default="local", help="model checkpoint source") - parser.add_argument("--data_path", type=str, default="/home/z00939677/OmniDocBench_dataset") + parser.add_argument("--data_path", type=str, default="OmniDocBench_dataset") parser.add_argument("--warmup", type=int, default=2, help="Warm up times") - parser.add_argument("--warmup_data_path", type=str, default="../OmniDocBench_dataset/pdfs/jiaocai_71434495.pdf_0.pdf") + parser.add_argument("--warmup_data_path", type=str, default="OmniDocBench_dataset/pdfs/jiaocai_71434495.pdf_0.pdf") args = parser.parse_args() return args -- Gitee From 3d67c9c0b020447745c4905576a17cb6196f5979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 12:55:02 +0800 Subject: [PATCH 08/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index 70bbdb91cb..dca505247e 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -123,8 +123,7 @@ def rewrite_mfr_encoder_multi_head_attention_forward(model): wq = model.query.weight wk = model.key.weight wv = model.value.weight - model.qkv = nn.Linear(in_feature = wk.shape[1], - out_features = wq.shape[0] + wk.shape[0] + wv.shape[0]) + model.qkv = nn.Linear(in_feature=wk.shape[1], out_features=wq.shape[0]+wk.shape[0]+wv.shape[0]) model.qkv.weight = nn.Parameter(torch.concat([wq, wk, wv], dim=0), requires_grad=False) wq_bias = model.query.bias if model.query.bias is not None else torch.zeros(wq.shape[0]) wk_bias = model.key.bias if model.key.bias is not None else torch.zeros(wk.shape[0]) -- Gitee From 8d055490924c0a0f4456d97402d4a866dbb82874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 13:36:54 +0800 Subject: [PATCH 09/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index dca505247e..cf53dbd1ac 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -1,4 +1,5 @@ # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2023 The LAION-AI Team and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -91,8 +92,6 @@ def atom_model_init_compile(model_name: str, **kwargs): modify_mfr_model(atom_model.model) - print(atom_model.model.encoder.__class__) - atom_model.model.encoder = compile_model(atom_model.model.encoder, False, True) atom_model.model.decoder = compile_model(atom_model.model.decoder, True, True) @@ -130,6 +129,7 @@ def rewrite_mfr_encoder_multi_head_attention_forward(model): wv_bias = model.key.bias if model.value.bias is not None else torch.zeros(wv.shape[0]) model.qkv.bias = nn.Parameter(torch.concat([wq_bias, wk_bias, wv_bias], dim=0), requires_grad=False) + # Adapted from: transformers.models.swin.modeling_swin.SwinSelfAttention def attn_forward( self, hidden_states: torch.Tensor, @@ -147,7 +147,6 @@ def rewrite_mfr_encoder_multi_head_attention_forward(model): key_layer = k.view(*k.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) value_layer = v.view(*v.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] relative_position_bias = relative_position_bias.view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 @@ -221,6 +220,7 @@ def rewrite_model_init(): def rewrite_mfr_encoder_forward(): + # Adapted from: transformers.generation.utils.GenerationMixin def _patched_prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor: torch.Tensor, model_kwargs, -- Gitee From f877e1140da154bf48f8a8f12fa88c53e7d237ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 14:09:43 +0800 Subject: [PATCH 10/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/README.md | 2 + ACL_PyTorch/built-in/ocr/MinerU/infer.py | 58 +------------------ .../ocr/MinerU/mfr_encoder_mhsa.patch | 23 ++++++++ 3 files changed, 26 insertions(+), 57 deletions(-) create mode 100644 ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch diff --git a/ACL_PyTorch/built-in/ocr/MinerU/README.md b/ACL_PyTorch/built-in/ocr/MinerU/README.md index 19f28fbb03..20069b5bbd 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/README.md +++ b/ACL_PyTorch/built-in/ocr/MinerU/README.md @@ -76,6 +76,8 @@ MinerU是由上海人工智能实验室OpenDataLab团队开发的开源文档解 patch -p2 < ${workdir}/ultralytics.patch cd ${source_path}/doclayout_yolo patch -p2 < ${workdir}/doclayout_yolo.patch + cd ${workdir} + patch -p0 < mfr_encoder_mhsa.patch ``` 修改完成后回到工作目录`workdir` diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index cf53dbd1ac..91b53a045c 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -1,5 +1,4 @@ # Copyright 2025 Huawei Technologies Co., Ltd -# Copyright 2023 The LAION-AI Team and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -122,67 +121,13 @@ def rewrite_mfr_encoder_multi_head_attention_forward(model): wq = model.query.weight wk = model.key.weight wv = model.value.weight - model.qkv = nn.Linear(in_feature=wk.shape[1], out_features=wq.shape[0]+wk.shape[0]+wv.shape[0]) + model.qkv = nn.Linear(in_features=wk.shape[1], out_features=wq.shape[0]+wk.shape[0]+wv.shape[0]) model.qkv.weight = nn.Parameter(torch.concat([wq, wk, wv], dim=0), requires_grad=False) wq_bias = model.query.bias if model.query.bias is not None else torch.zeros(wq.shape[0]) wk_bias = model.key.bias if model.key.bias is not None else torch.zeros(wk.shape[0]) wv_bias = model.key.bias if model.value.bias is not None else torch.zeros(wv.shape[0]) model.qkv.bias = nn.Parameter(torch.concat([wq_bias, wk_bias, wv_bias], dim=0), requires_grad=False) - # Adapted from: transformers.models.swin.modeling_swin.SwinSelfAttention - def attn_forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - output_attentions: Optional[bool] = False - ) -> Tuple[torch.Tensor]: - - # """融合qk为大矩阵,由于加入相对位置编码,PFA接口用不了,暂时只修改矩阵乘法""" - batch_size, dim, num_channels = hidden_states.shape - qkv = model.qkv(hidden_states) - q, k, v = qkv.chunk(3, dim=-1) - - query_layer = q.view(*q.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) - key_layer = k.view(*k.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) - value_layer = v.view(*v.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) - - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] - relative_position_bias = relative_position_bias.view( - self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 - ) - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() - - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - attention_scores = attention_scores + relative_position_bias.unsqueeze(0) - - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in UnimerSwinModel forward() function) - mask_shape = attention_mask.shape[0] - attention_scores = attention_scores.view( - batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim - ) - attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) - attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) - - # Normalize the attention scores to probabilities. 
- attention_probs = nn.functional.softmax(attention_scores, dim=-1) - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - return outputs - model.forward = attn_forward.__get__(model) - def modify_mfr_model(model): # 修改encoder的attention forward @@ -220,7 +165,6 @@ def rewrite_model_init(): def rewrite_mfr_encoder_forward(): - # Adapted from: transformers.generation.utils.GenerationMixin def _patched_prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor: torch.Tensor, model_kwargs, diff --git a/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch b/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch new file mode 100644 index 0000000000..1fe80a05cb --- /dev/null +++ b/ACL_PyTorch/built-in/ocr/MinerU/mfr_encoder_mhsa.patch @@ -0,0 +1,23 @@ +--- MinerU/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py 2025-09-02 17:58:15.032000000 +0800 ++++ copy_mfr.py 2025-09-10 13:58:36.616000000 +0800 +@@ -465,11 +465,15 @@ + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape +- mixed_query_layer = self.query(hidden_states) + +- key_layer = self.transpose_for_scores(self.key(hidden_states)) +- value_layer = self.transpose_for_scores(self.value(hidden_states)) +- query_layer = self.transpose_for_scores(mixed_query_layer) ++ # """融合qk为大矩阵,由于加入相对位置编码,PFA接口用不了,暂时只修改矩阵乘法""" ++ batch_size, dim, num_channels = hidden_states.shape ++ qkv = self.qkv(hidden_states) ++ q, k, v = qkv.chunk(3, dim=-1) ++ ++ query_layer = q.view(*q.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) ++ key_layer = k.view(*k.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) ++ value_layer = v.view(*v.shape[:2], self.num_attention_heads, -1).permute(0, 2, 1, 3) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + -- Gitee From dc435a0323dc66a4661089f80a4bacf8053f4ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 14:27:53 +0800 Subject: [PATCH 11/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 28 ++---------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index 91b53a045c..7780e89949 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -121,7 +121,7 @@ def rewrite_mfr_encoder_multi_head_attention_forward(model): wq = model.query.weight wk = model.key.weight wv = model.value.weight - model.qkv = nn.Linear(in_features=wk.shape[1], out_features=wq.shape[0]+wk.shape[0]+wv.shape[0]) + model.qkv = nn.Linear(in_features=wk.shape[1], out_features=wq.shape[0] + wk.shape[0] + wv.shape[0]) model.qkv.weight = nn.Parameter(torch.concat([wq, wk, wv], dim=0), requires_grad=False) wq_bias = model.query.bias if model.query.bias is not None else torch.zeros(wq.shape[0]) wk_bias = model.key.bias if model.key.bias is not None else torch.zeros(wk.shape[0]) @@ -234,30 +234,6 @@ def warmup(data_path, warmup_iters): parse_doc(doc_path_list, output_dir, backend="pipeline") -def get_batch_ratio(device="npu"): - batch_ratio = 1 - if str(device).startswith('npu') or str(device).startswith('cuda'): - vram = get_vram(device) - if vram is not None: - gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram))) - if gpu_memory >= 16: - batch_ratio = 16 - elif gpu_memory >= 12: - batch_ratio = 8 - elif gpu_memory >= 8: - batch_ratio = 4 - elif gpu_memory >= 6: - batch_ratio = 2 - else: - batch_ratio = 1 - logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}') - else: - # Default batch_ratio when VRAM can't be determined - batch_ratio = 1 - logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}') - return batch_ratio - - def get_pdf_page_count(pdf_path): pdf = pdfium.PdfDocument(pdf_path) try: @@ -278,7 +254,7 @@ if __name__ == '__main__': print(pdf_files_dir) - batch_ratio = get_batch_ratio() + batch_ratio = 16 rewrite_model_init() -- Gitee From 08c450b0a436dd82f1aff6e9fefc8b15e0380072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 15:39:34 +0800 Subject: [PATCH 12/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index 7780e89949..93779d6622 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -17,7 +17,6 @@ import sys import math import time import inspect -from typing import Optional, Tuple, Union from pathlib import Path import argparse @@ -168,7 +167,7 @@ def rewrite_mfr_encoder_forward(): def _patched_prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor: torch.Tensor, model_kwargs, - model_input_name: Optional[str], + model_input_name, generation_config, ): # 1. 
get encoder -- Gitee From eaec3391c1ba20fcd17e992ef09d51d78993aff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 10 Sep 2025 17:09:30 +0800 Subject: [PATCH 13/24] add MinerU torchair adaptation --- ACL_PyTorch/built-in/ocr/MinerU/README.md | 3 +-- ACL_PyTorch/built-in/ocr/MinerU/infer.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ACL_PyTorch/built-in/ocr/MinerU/README.md b/ACL_PyTorch/built-in/ocr/MinerU/README.md index 20069b5bbd..cdf01eb0ea 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/README.md +++ b/ACL_PyTorch/built-in/ocr/MinerU/README.md @@ -30,7 +30,7 @@ MinerU是由上海人工智能实验室OpenDataLab团队开发的开源文档解 | 配套 | 版本 | 环境准备指导 | | ------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------- | - | 固件与驱动 | 25.0.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | + | 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | | CANN | 8.3.0 | - | | Python | 3.11 | - | | PyTorch | 2.6.0 | - | @@ -79,7 +79,6 @@ MinerU是由上海人工智能实验室OpenDataLab团队开发的开源文档解 cd ${workdir} patch -p0 < mfr_encoder_mhsa.patch ``` -修改完成后回到工作目录`workdir` ## 获取权重 diff --git a/ACL_PyTorch/built-in/ocr/MinerU/infer.py b/ACL_PyTorch/built-in/ocr/MinerU/infer.py index 93779d6622..b04545b7fc 100644 --- a/ACL_PyTorch/built-in/ocr/MinerU/infer.py +++ b/ACL_PyTorch/built-in/ocr/MinerU/infer.py @@ -49,7 +49,7 @@ from mineru.backend.pipeline.batch_analyze import ( ) from transformers.generation.utils import GenerationMixin -from demo.demo import parse_doc +from MinerU.demo.demo import parse_doc def parse_args(): -- Gitee From 5b494f8ea15e3d9ba70fe3d5c9adb50dfaa276ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Wed, 17 Sep 2025 14:44:34 +0800 Subject: [PATCH 14/24] Updating the Third-Party Library Version --- ACL_PyTorch/built-in/cv/SAM/README.md | 12 ++++++------ ACL_PyTorch/built-in/cv/SAM/requirements.txt | 13 +++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/README.md b/ACL_PyTorch/built-in/cv/SAM/README.md index 7fee7963ca..4ca3afa1f5 100644 --- a/ACL_PyTorch/built-in/cv/SAM/README.md +++ b/ACL_PyTorch/built-in/cv/SAM/README.md @@ -61,11 +61,11 @@ SAM 首先会自动分割图像中的所有内容,但是如果你需要分割 | 配套 | 版本 | 环境准备指导 | | ---- | ---- | ---- | -| 固件与驱动 | 24.0.T1.B010 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | -| CANN | 8.0.RC1.B060 | - | -| MindIE | 1.0.RC1.B060 | - | -| Python | 3.10.13(MindIE 要求 Python 3.10) | - | -| PyTorch | 1.13.1 | - | +| 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | +| CANN | 8.2.RC1 | - | +| MindIE | 2.1.RC1 | - | +| Python | 3.11.10 | - | +| PyTorch | 2.1.0 | - | | 说明:Atlas 300I Duo 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | ## 3. 快速上手 @@ -78,7 +78,7 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/cv/SAM git clone https://github.com/facebookresearch/segment-anything.git cd segment-anything git reset --hard 6fdee8f2727f4506cfbbe553e23b895e27956588 -patch -p2 < segment_anything_diff.patch +patch -p2 < ../segment_anything_diff.patch pip3 install -e . cd .. 
``` diff --git a/ACL_PyTorch/built-in/cv/SAM/requirements.txt b/ACL_PyTorch/built-in/cv/SAM/requirements.txt index 22d75b25e8..6722adb464 100644 --- a/ACL_PyTorch/built-in/cv/SAM/requirements.txt +++ b/ACL_PyTorch/built-in/cv/SAM/requirements.txt @@ -1,14 +1,15 @@ -torch==1.13.1 -torchvision==0.14.1 -torchaudio==0.13.1 +torch==2.1.0 +torch_npu==2.1.0.post17.dev20250905 +torchvision==0.16.0 +torchaudio==2.1.0 decorator scipy attrs psutil -numpy -opencv-python +numpy==1.26.0 +opencv-python==4.11.0.86 pycocotools matplotlib +onnx==1.16.1 onnxruntime -onnx onnxsim==0.4.17 \ No newline at end of file -- Gitee From d5c2fcf00418234a655a5b25ecd684dd25fd4b9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Tue, 23 Sep 2025 16:10:22 +0800 Subject: [PATCH 15/24] fix sam bugs --- ACL_PyTorch/built-in/cv/SAM/README.md | 2 +- ACL_PyTorch/built-in/cv/SAM/requirements.txt | 2 +- .../cv/SAM/segment_anything_diff.patch | 73 +++++++++++++------ 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/README.md b/ACL_PyTorch/built-in/cv/SAM/README.md index 4ca3afa1f5..7e0577be26 100644 --- a/ACL_PyTorch/built-in/cv/SAM/README.md +++ b/ACL_PyTorch/built-in/cv/SAM/README.md @@ -78,7 +78,7 @@ cd ModelZoo-PyTorch/ACL_PyTorch/built-in/cv/SAM git clone https://github.com/facebookresearch/segment-anything.git cd segment-anything git reset --hard 6fdee8f2727f4506cfbbe553e23b895e27956588 -patch -p2 < ../segment_anything_diff.patch +git apply ../segment_anything_diff.patch pip3 install -e . cd .. ``` diff --git a/ACL_PyTorch/built-in/cv/SAM/requirements.txt b/ACL_PyTorch/built-in/cv/SAM/requirements.txt index 6722adb464..b4969b2263 100644 --- a/ACL_PyTorch/built-in/cv/SAM/requirements.txt +++ b/ACL_PyTorch/built-in/cv/SAM/requirements.txt @@ -1,5 +1,5 @@ torch==2.1.0 -torch_npu==2.1.0.post17.dev20250905 +torch_npu==2.1.0.post10 torchvision==0.16.0 torchaudio==2.1.0 decorator diff --git a/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch b/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch index aec413383e..41328579f8 100644 --- a/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch +++ b/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch @@ -1,6 +1,7 @@ -diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/scripts/export_onnx_model.py ---- a/segment-anything/scripts/export_onnx_model.py 2023-11-13 16:25:26.000000000 +0800 -+++ b/segment-anything/scripts/export_onnx_model.py 2023-11-18 16:15:20.088025762 +0800 +diff --git a/scripts/export_onnx_model.py b/scripts/export_onnx_model.py +index 5c6f838..0bfaff2 100644 +--- a/scripts/export_onnx_model.py ++++ b/scripts/export_onnx_model.py @@ -6,8 +6,12 @@ import torch @@ -14,7 +15,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc import argparse import warnings -@@ -24,11 +28,30 @@ +@@ -24,11 +28,30 @@ parser = argparse.ArgumentParser( ) parser.add_argument( @@ -47,7 +48,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc ) parser.add_argument( -@@ -56,11 +79,21 @@ +@@ -56,11 +79,21 @@ parser.add_argument( ) parser.add_argument( @@ -71,7 +72,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc "Quantization is performed with quantize_dynamic from onnxruntime.quantization.quantize." 
), ) -@@ -97,7 +130,9 @@ +@@ -97,7 +130,9 @@ parser.add_argument( def run_export( model_type: str, checkpoint: str, @@ -82,7 +83,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc opset: int, return_single_mask: bool, gelu_approximate: bool = False, -@@ -107,6 +142,74 @@ +@@ -107,6 +142,74 @@ def run_export( print("Loading model...") sam = sam_model_registry[model_type](checkpoint=checkpoint) @@ -157,7 +158,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc onnx_model = SamOnnxModel( model=sam, return_single_mask=return_single_mask, -@@ -129,16 +232,17 @@ +@@ -129,16 +232,17 @@ def run_export( mask_input_size = [4 * x for x in embed_size] dummy_inputs = { "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float), @@ -178,7 +179,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=torch.jit.TracerWarning) -@@ -164,7 +268,7 @@ +@@ -164,7 +268,7 @@ def run_export( providers = ["CPUExecutionProvider"] ort_session = onnxruntime.InferenceSession(output, providers=providers) _ = ort_session.run(None, ort_inputs) @@ -187,7 +188,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc def to_numpy(tensor): -@@ -176,7 +280,9 @@ +@@ -176,7 +280,9 @@ if __name__ == "__main__": run_export( model_type=args.model_type, checkpoint=args.checkpoint, @@ -198,7 +199,7 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc opset=args.opset, return_single_mask=args.return_single_mask, gelu_approximate=args.gelu_approximate, -@@ -184,18 +290,34 @@ +@@ -184,18 +290,34 @@ if __name__ == "__main__": return_extra_metrics=args.return_extra_metrics, ) @@ -238,10 +239,11 @@ diff -Naru a/segment-anything/scripts/export_onnx_model.py b/segment-anything/sc + ) + print("Done!") \ No newline at end of file -diff -Naru a/segment-anything/segment_anything/modeling/image_encoder.py b/segment-anything/segment_anything/modeling/image_encoder.py ---- a/segment-anything/segment_anything/modeling/image_encoder.py 2023-11-13 16:25:26.000000000 +0800 -+++ b/segment-anything/segment_anything/modeling/image_encoder.py 2023-11-13 19:26:32.000000000 +0800 -@@ -253,8 +253,8 @@ +diff --git a/segment_anything/modeling/image_encoder.py b/segment_anything/modeling/image_encoder.py +index 66351d9..31d622c 100644 +--- a/segment_anything/modeling/image_encoder.py ++++ b/segment_anything/modeling/image_encoder.py +@@ -253,8 +253,8 @@ def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, T """ B, H, W, C = x.shape @@ -252,7 +254,7 @@ diff -Naru a/segment-anything/segment_anything/modeling/image_encoder.py b/segme if pad_h > 0 or pad_w > 0: x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) Hp, Wp = H + pad_h, W + pad_w -@@ -322,6 +322,15 @@ +@@ -322,6 +322,15 @@ def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor return rel_pos_resized[relative_coords.long()] @@ -268,7 +270,7 @@ diff -Naru a/segment-anything/segment_anything/modeling/image_encoder.py b/segme def add_decomposed_rel_pos( attn: torch.Tensor, q: torch.Tensor, -@@ -351,8 +360,8 @@ +@@ -351,8 +360,8 @@ def add_decomposed_rel_pos( B, _, dim = q.shape r_q = q.reshape(B, q_h, q_w, dim) @@ -279,10 +281,36 @@ diff -Naru a/segment-anything/segment_anything/modeling/image_encoder.py b/segme attn = ( attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] -diff -Naru 
a/segment-anything/segment_anything/utils/onnx.py b/segment-anything/segment_anything/utils/onnx.py ---- a/segment-anything/segment_anything/utils/onnx.py 2023-11-13 16:25:26.000000000 +0800 -+++ b/segment-anything/segment_anything/utils/onnx.py 2023-11-18 16:14:01.512027850 +0800 -@@ -112,7 +112,6 @@ +diff --git a/segment_anything/modeling/mask_decoder.py b/segment_anything/modeling/mask_decoder.py +index 5d2fdb0..1061655 100644 +--- a/segment_anything/modeling/mask_decoder.py ++++ b/segment_anything/modeling/mask_decoder.py +@@ -123,9 +123,18 @@ class MaskDecoder(nn.Module): + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + + # Expand per-image data in batch direction to be per-mask +- src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) ++ ++ ## Torch versions 2.1.0 and above are not compatible. ++ # src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) ++ N = tokens.shape[0] ++ B, C, H, W = image_embeddings.shape ++ src = image_embeddings.unsqueeze(1).expand(B, N, C, H, W).reshape(B * N, C, H, W) ++ + src = src + dense_prompt_embeddings +- pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) ++ # pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) ++ B, C, H, W = image_pe.shape ++ pos_src = image_pe.unsqueeze(1).expand(B, N, C, H, W).reshape(B * N, C, H, W) ++ + b, c, h, w = src.shape + + # Run the transformer +diff --git a/segment_anything/utils/onnx.py b/segment_anything/utils/onnx.py +index 3196bdf..e718afc 100644 +--- a/segment_anything/utils/onnx.py ++++ b/segment_anything/utils/onnx.py +@@ -112,7 +112,6 @@ class SamOnnxModel(nn.Module): point_labels: torch.Tensor, mask_input: torch.Tensor, has_mask_input: torch.Tensor, @@ -290,7 +318,7 @@ diff -Naru a/segment-anything/segment_anything/utils/onnx.py b/segment-anything/ ): sparse_embedding = self._embed_points(point_coords, point_labels) dense_embedding = self._embed_masks(mask_input, has_mask_input) -@@ -131,14 +130,4 @@ +@@ -131,14 +130,4 @@ class SamOnnxModel(nn.Module): if self.return_single_mask: masks, scores = self.select_masks(masks, scores, point_coords.shape[1]) @@ -306,3 +334,4 @@ diff -Naru a/segment-anything/segment_anything/utils/onnx.py b/segment-anything/ - - return upscaled_masks, scores, masks + return scores, masks + -- Gitee From a062c934062e75d02675f92513171c4014f1877b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Tue, 23 Sep 2025 16:57:33 +0800 Subject: [PATCH 16/24] fix sam bugs --- ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch b/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch index 41328579f8..96284944bc 100644 --- a/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch +++ b/ACL_PyTorch/built-in/cv/SAM/segment_anything_diff.patch @@ -282,24 +282,21 @@ index 66351d9..31d622c 100644 attn = ( attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] diff --git a/segment_anything/modeling/mask_decoder.py b/segment_anything/modeling/mask_decoder.py -index 5d2fdb0..1061655 100644 +index 5d2fdb0..ee8da94 100644 --- a/segment_anything/modeling/mask_decoder.py +++ b/segment_anything/modeling/mask_decoder.py -@@ -123,9 +123,18 @@ class MaskDecoder(nn.Module): +@@ -123,9 +123,15 @@ class MaskDecoder(nn.Module): tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) # Expand per-image data in batch direction to 
be per-mask - src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) -+ -+ ## Torch versions 2.1.0 and above are not compatible. -+ # src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) + N = tokens.shape[0] + B, C, H, W = image_embeddings.shape + src = image_embeddings.unsqueeze(1).expand(B, N, C, H, W).reshape(B * N, C, H, W) + src = src + dense_prompt_embeddings - pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) -+ # pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) ++ + B, C, H, W = image_pe.shape + pos_src = image_pe.unsqueeze(1).expand(B, N, C, H, W).reshape(B * N, C, H, W) + -- Gitee From 951d405152b5dba9b4dbc7a6213dd41d31b7f707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 15:27:06 +0800 Subject: [PATCH 17/24] Adding a Precision Test Script --- ACL_PyTorch/built-in/cv/SAM/README.md | 161 +++++++++------ .../built-in/cv/SAM/sam_coco_metric.py | 191 ++++++++++++++++++ .../built-in/cv/SAM/sam_end2end_infer.py | 7 +- .../cv/SAM/sam_preprocessing_pytorch.py | 27 ++- 4 files changed, 317 insertions(+), 69 deletions(-) create mode 100644 ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py diff --git a/ACL_PyTorch/built-in/cv/SAM/README.md b/ACL_PyTorch/built-in/cv/SAM/README.md index 7e0577be26..b35a7a354d 100644 --- a/ACL_PyTorch/built-in/cv/SAM/README.md +++ b/ACL_PyTorch/built-in/cv/SAM/README.md @@ -1,5 +1,6 @@ -# SAM 推理指导 +# SAM(ONNX)-推理指导 +## 概述 Segment Anything Model (SAM) 是由 Meta 开源的图像分割大模型,在计算机视觉领域(CV)取得了新的突破。SAM 可在不需要任何标注的情况下,对任何图像中的任何物体进行分割,SAM 的开源引起了业界的广泛反响,被称为计算机视觉领域的 GPT。 - 论文: @@ -17,7 +18,48 @@ Segment Anything Model (SAM) 是由 Meta 开源的图像分割大模型,在计 model_name=sam_vit_b_01ec64 ``` -## 1. 输入输出数据 +## 推理环境准备 + +- 该模型需要以下插件与驱动 + +**表 1** 版本配套表 + + | 配套 | 版本 | 环境准备指导 | + | ---- | ---- | ---- | + | 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | + | CANN | 8.2.RC1 | - | + | MindIE | 2.1.RC1 | - | + | Python | 3.11.10 | - | + | PyTorch | 2.1.0 | - | + | 说明:Atlas 300I Duo 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | + +## 快速上手 + +### 1. 获取源码 + +```bash +git clone https://gitee.com/ascend/ModelZoo-PyTorch.git +cd ModelZoo-PyTorch/ACL_PyTorch/built-in/cv/SAM +git clone https://github.com/facebookresearch/segment-anything.git +cd segment-anything +git reset --hard 6fdee8f2727f4506cfbbe553e23b895e27956588 +git apply ../segment_anything_diff.patch +pip3 install -e . +cd .. +``` + +### 2. 安装依赖。 + +- 安装基础环境。 + +```bash +pip3 install -r requirements.txt +``` +说明:如果某些库通过此方式安装失败,可使用 pip3 install 单独进行安装。 + +- 安装 [msit](https://gitee.com/ascend/msit/tree/master/msit/) 的 surgeon 组件和 benchmark 组件。 + +### 3. 输入输出数据描述 SAM 首先会自动分割图像中的所有内容,但是如果你需要分割某一个目标物体,则需要你输入一个目标物体上的坐标,比如一张图片你想让SAM分割Cat或Dog这个目标的提示坐标,SAM会自动在照片中猫或狗进行分割,在离线推理时,会转成encoder模型和decoder模型,其输入输出详情如下: @@ -53,51 +95,9 @@ SAM 首先会自动分割图像中的所有内容,但是如果你需要分割 | low_res_masks | FLOAT32 | -1 x 1 x -1 x -1 | ND | -## 2. 推理环境准备 - -- 该模型需要以下插件与驱动 - - **表 1** 版本配套表 - -| 配套 | 版本 | 环境准备指导 | -| ---- | ---- | ---- | -| 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | -| CANN | 8.2.RC1 | - | -| MindIE | 2.1.RC1 | - | -| Python | 3.11.10 | - | -| PyTorch | 2.1.0 | - | -| 说明:Atlas 300I Duo 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | - -## 3. 快速上手 +### 4. 
准备数据集 -### 3.1 获取源码 - -``` -git clone https://gitee.com/ascend/ModelZoo-PyTorch.git -cd ModelZoo-PyTorch/ACL_PyTorch/built-in/cv/SAM -git clone https://github.com/facebookresearch/segment-anything.git -cd segment-anything -git reset --hard 6fdee8f2727f4506cfbbe553e23b895e27956588 -git apply ../segment_anything_diff.patch -pip3 install -e . -cd .. -``` - -### 3.2 安装依赖。 - -1. 安装基础环境。 - - ```bash - pip3 install -r requirements.txt - ``` - - 说明:如果某些库通过此方式安装失败,可使用 pip3 install 单独进行安装。 - -2. 安装 [msit](https://gitee.com/ascend/msit/tree/master/msit/) 的 surgeon 组件和 benchmark 组件。 - -### 3.3 准备数据集 - -GitHub 仓库没有提供精度和性能的测试手段,这里取仓库里的 demo 图片进行测试。 +- 取仓库里的 demo 图片进行端到端测试。 ```bash mkdir data @@ -106,9 +106,23 @@ wget -O demo.jpg https://raw.githubusercontent.com/facebookresearch/segment-anyt cd .. ``` -### 3.4 模型转换 +- 下载coco2017数据集进行精度测试。 -#### 3.4.1 获取权重文件 +下载COCO-2017数据集的[图片](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fzips%2Fval2017.zip)与[标注](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fannotations%2Fannotations_trainval2017.zip),放置coco2017目录下 + + ``` + coco2017 + ├── annotations/ + │ └── instances_val2017.json + └── val2017/ + ├── 000000000139.jpg + ├── 000000000139.jpg + └── ... + ``` + +### 5. 模型转换 + +#### 5.1 获取权重文件 GitHub 仓库提供了三种大小的权重文件:vit_h、vit_l、vit_b。这里以 vit_b 为例。 @@ -119,7 +133,7 @@ wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth cd .. ``` -#### 3.4.2 导出 ONNX 模型 +#### 5.2 导出 ONNX 模型 ```bash python3 segment-anything/scripts/export_onnx_model.py \ @@ -140,7 +154,7 @@ python3 segment-anything/scripts/export_onnx_model.py \ - decoder-output:保存decoder模型的输出ONNX模型的文件路径。 - return-single-mask:设置最优mask模式。 -#### 3.4.3 使用 onnxsim 简化 ONNX 模型 +#### 5.3 使用 onnxsim 简化 ONNX 模型 这里以 batchsize=1 为例。 @@ -156,7 +170,7 @@ onnxsim models/decoder.onnx models/decoder_sim.onnx - 第二个参数:简化后的 ONNX 保存路径。 - overwrite-input-shape:指定输入的维度。 -#### 3.4.4 运行改图脚本,修改 ONNX 模型以适配昇腾芯片 +#### 5.4 运行改图脚本,修改 ONNX 模型以适配昇腾芯片 ```bash python3 encoder_onnx_modify.py \ @@ -169,9 +183,9 @@ python3 encoder_onnx_modify.py \ - 第一个参数:原 ONNX 路径。 - 第二个参数:适配后的 ONNX 保存路径。 -#### 3.4.5 使用 ATC 工具将 ONNX 模型转为 OM 模型 +#### 5.5 使用 ATC 工具将 ONNX 模型转为 OM 模型 -1. 配置环境变量。 +- 配置环境变量。 ```bash source /usr/local/Ascend/ascend-toolkit/set_env.sh @@ -180,7 +194,7 @@ python3 encoder_onnx_modify.py \ > **说明:** 该脚本中环境变量仅供参考,请以实际安装环境配置环境变量。详细介绍请参见《[CANN 开发辅助工具指南 \(推理\)](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=developer-documents&subcategory=auxiliary-development-tools)》。 -2. 执行命令查看芯片名称($\{chip\_name\})。 +- 执行命令查看芯片名称($\{chip\_name\})。 ```bash npu-smi info @@ -198,7 +212,7 @@ python3 encoder_onnx_modify.py \ +===================+=================+======================================================+ ``` -3. 执行 atc 命令。 +- 执行 atc 命令。 ```bash atc \ @@ -234,9 +248,9 @@ python3 encoder_onnx_modify.py \ 更多参数说明请参考 [ATC 参数概览](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC1alpha002/devaids/auxiliarydevtool/atlasatc_16_0039.html)(如果链接失效,请从 [CANN 社区版文档](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition) 查找目录「应用开发 > ATC 模型转换 > 参数说明 > 参数概览」) -### 3.5 推理验证 +### 6 推理验证 -1. 端到端推理。成功执行下述命令后会在save-path参数指定的目录生成离线推理的结果。 +6.1 端到端推理。成功执行下述命令后会在save-path参数指定的目录生成离线推理的结果。 ```bash python3 sam_end2end_infer.py \ @@ -271,7 +285,7 @@ python3 encoder_onnx_modify.py \ ![](./assets/om_truck_result.JPG) -2. 性能验证。 +6.2 性能验证。 1. encoder 纯推理性能验证。 @@ -305,9 +319,38 @@ python3 encoder_onnx_modify.py \ - loop: 循环次数 - batchsize: 模型batch size -## 4. 
模型推理性能 & 精度 +6.3 精度验证。 + +SAM 官方未提供精度评测手段,这里提供对应脚本,基于 COCO 验证集标注框作为输入提示,使用 SAM 预测分割掩码,并与 COCO 标注掩码逐实例进行 IoU 计算,最后对所有实例的 IoU 结果取平均,得到整体的平均交并比(mIoU)。 + + ```bash + python sam_coco_metric.py \ + --dataset-path coco2017 \ + --save-path outputs \ + --encoder-model-path models/encoder_sim.om \ + --decoder-model-path models/decoder_sim.om \ + --device-id 0 \ + --max-instances 0 + ``` +参数说明: + +- dataset-path: coco数据集目录 +- save-path: SAM预测掩码结果存储路径 +- encoder-model-path:encoder的OM模型路径 +- decoder-model-path:decoder的OM模型路径 +- device-id: 指定推理的NPU设备ID +- max-instances: 评测的最大实例数量,默认为0表示测评完整验证集 + +## 4. 模型推理性能 & 精度 +性能结果: | 芯片型号 | 模型 | Batch Size | 性能 | | ---- | ---- | ---- | ---- | | 300I Pro | encoder | 1 | 4.43 fps | | 300I Pro | decoder | 1 | 679.77 fps | + +精度结果: +| 芯片型号 | 模型 | Batch Size | 精度(mIoU) | +| ---- | ---- | ---- | ---- | +| 300I Pro | SAM | 1 | 0.7654 | + diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py new file mode 100644 index 0000000000..fe44855be3 --- /dev/null +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -0,0 +1,191 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import cv2 +import argparse +import numpy as np +from tqdm import tqdm +from pycocotools.coco import COCO +from pycocotools import mask as maskUtils + +from ais_bench.infer.interface import InferSession\ + +from sam_preprocessing_pytorch import encoder_preprocessing, decoder_preprocessing +from sam_postprocessing_pytorch import sam_postprocessing + + +def rle_to_mask(rle, h, w): + """COCO segmentation → binary mask (h,w) uint8.""" + if isinstance(rle, list): + rles = maskUtils.frPyObjects(rle, h, w) + rle = maskUtils.merge(rles) + elif isinstance(rle, dict) and isinstance(rle.get("counts"), list): + rle = maskUtils.frPyObjects(rle, h, w) + return maskUtils.decode(rle).astype(np.uint8) + + +def compute_iou(pred_mask, gt_mask): + pred = (pred_mask > 0).astype(np.uint8) + gt = (gt_mask > 0).astype(np.uint8) + inter = (pred & gt).sum() + union = (pred | gt).sum() + return float(inter) / float(union) if union > 0 else 0.0 + + +def coco_bbox_to_xyxy(bbox_xywh): + x, y, w, h = bbox_xywh + return [x, y, x + w, y + h] + + +def encoder_infer(session_encoder, x): + encoder_outputs = session_encoder.infer([x]) + image_embedding = encoder_outputs[0] + return image_embedding + + +def decoder_infer(session_decoder, decoder_inputs): + decoder_outputs = session_decoder.infer(decoder_inputs, mode="dymdims", custom_sizes=[1000, 1000000]) + low_res_masks = decoder_outputs[1] + return low_res_masks + + +def save_mask_overlay(masks, image, save_dir, image_name): + overlay = image.copy() + alpha = 0.5 + + for mask in masks: + if mask.sum() == 0: + continue + color = np.random.randint(0, 255, (3,), dtype=np.uint8) # 每个实例随机颜色 + overlay[mask > 0] = (overlay[mask > 0] * (1 - alpha) + color * alpha).astype(np.uint8) + + base, ext = os.path.splitext(image_name) + save_path = os.path.join(save_dir, f"{base}_sam_pre{ext}") 
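+    # Write the alpha-blended per-instance mask visualization into save_dir as <name>_sam_pre<ext>.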
+ cv2.imwrite(save_path, overlay) + + +def evaluate_sam_on_coco(coco_root, save_path, encoder, decoder, max_instances=0): + ann_file = os.path.join(coco_root, "annotations", "instances_val2017.json") + img_root = os.path.join(coco_root, "val2017") + if not os.path.isfile(ann_file): + raise FileNotFoundError(f"COCO annotations not found: {ann_file}") + if not os.path.isdir(img_root): + raise FileNotFoundError(f"COCO val2017 images not found: {img_root}") + + coco = COCO(ann_file) + img_ids = coco.getImgIds() + + session_encoder = encoder + session_decoder = decoder + + ious = [] + counted = 0 + + for img_id in tqdm(img_ids, desc="Evaluating"): + img_info = coco.loadImgs(img_id)[0] + img_path = os.path.join(img_root, img_info["file_name"]) + image = cv2.imread(img_path) + + H, W = image.shape[:2] + + x = encoder_preprocessing(image) + image_embedding = encoder_infer(session_encoder, x) + + ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) + anns = coco.loadAnns(ann_ids) + + mask_list = [] + for ann in anns: + + if max_instances > 0 and counted >= max_instances: + break + + box_xyxy = coco_bbox_to_xyxy(ann["bbox"]) + + decoder_inputs = decoder_preprocessing(image_embedding, box=box_xyxy, image=image) + low_res_masks = decoder_infer(session_decoder, decoder_inputs) + masks = sam_postprocessing(low_res_masks, image) + + pred2d = masks[0][0].astype(np.uint8) + mask_list.append(pred2d) + pred_bin = pred2d.astype(np.uint8) + + gt_mask = rle_to_mask(ann["segmentation"], H, W) + iou = compute_iou(pred_bin, gt_mask) + ious.append(iou) + counted += 1 + + if save_path is not None and len(mask_list) > 0: + save_mask_overlay(mask_list, image, save_path, img_info["file_name"]) + + if max_instances > 0 and counted >= max_instances: + break + + miou = float(np.mean(ious)) if counted > 0 else 0.0 + print("\n=========== COCO Evaluation (Box Prompt) ===========") + print(f"Instances Evaluated : {counted}") + print(f"Mean IoU (mIoU) : {miou:.4f}") + print("====================================================\n") + return miou + + +def check_device_range_valid(value): + # if contain , split to int list + min_value = 0 + max_value = 255 + if ',' in value: + ilist = [ int(v) for v in value.split(',') ] + for ivalue in ilist[:2]: + if ivalue < min_value or ivalue > max_value: + raise argparse.ArgumentTypeError("{} of device:{} is invalid. valid value range is [{}, {}]".format( + ivalue, value, min_value, max_value)) + return ilist[:2] + else: + # default as single int value + ivalue = int(value) + if ivalue < min_value or ivalue > max_value: + raise argparse.ArgumentTypeError("device:{} is invalid. 
valid value range is [{}, {}]".format( + ivalue, min_value, max_value)) + return ivalue + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataset-path', type=str, default='./datasets/', help='input path to coco dataset') + parser.add_argument('--save-path', type=str, default=None, help='output path to image') + parser.add_argument('--encoder-model-path', type=str, default='./models/encoder_sim.om', help='path to encoder model') + parser.add_argument('--decoder-model-path', type=str, default='./models/decoder_sim.om', help='path to decoder model') + parser.add_argument('--device-id', type=check_device_range_valid, default=0, help='NPU device id.') + parser.add_argument('--max-instances', type=int, default=0, help='Maximum number of instances to evaluate (0 = all).') + args = parser.parse_args() + + if args.save_path and not os.path.exists(args.save_path): + os.makedirs(os.path.realpath(args.save_path), mode=0o744) + + session_encoder = InferSession(args.device_id, args.encoder_model_path) + session_decoder = InferSession(args.device_id, args.decoder_model_path) + + evaluate_sam_on_coco( + args.dataset_path, + args.save_path, + session_encoder, + session_decoder, + max_instances=args.max_instances + ) + +if __name__ == "__main__": + main() + diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py b/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py index 25db4ffd00..952c95520d 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_end2end_infer.py @@ -69,11 +69,11 @@ def decoder_infer(session_decoder, decoder_inputs): return low_res_masks -def sam_infer(src_path, session_encoder, session_decoder, input_point, save_path): +def sam_infer(src_path, session_encoder, session_decoder, input_point=None, box=None, save_path="./"): image = cv2.imread(src_path) x = encoder_preprocessing(image) image_embedding = encoder_infer(session_encoder, x) - decoder_inputs = decoder_preprocessing(image_embedding, input_point, image) + decoder_inputs = decoder_preprocessing(image_embedding, input_point=input_point, box=box, image=image) low_res_masks = decoder_infer(session_decoder, decoder_inputs) masks = sam_postprocessing(low_res_masks, image) save_mask(masks, image, src_path, save_path, random_color=True) @@ -95,8 +95,7 @@ def main(): session_encoder = InferSession(args.device_id, args.encoder_model_path) session_decoder = InferSession(args.device_id, args.decoder_model_path) - sam_infer(args.src_path, session_encoder, session_decoder, args.input_point, args.save_path) - + sam_infer(args.src_path, session_encoder, session_decoder, input_point=args.input_point, save_path=args.save_path) if __name__ == '__main__': main() diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py b/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py index d8b5a14742..f79da035e8 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py @@ -35,12 +35,27 @@ def encoder_preprocessing(image): return image -def decoder_preprocessing(image_embedding, input_point, image): - input_point = np.array(input_point) - input_label = [1] * len(input_point) - input_label = np.array(input_label) - onnx_coord = np.concatenate([input_point, np.array([[0.0, 0.0]])], axis=0)[None, :, :] - onnx_label = np.concatenate([input_label, np.array([-1])], axis=0)[None, :].astype(np.float32) +def decoder_preprocessing(image_embedding, input_point=None, box=None, image=None): ## box:[x0,y0,x1,y1] 
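+    # Prompt encoding expected by the SAM ONNX/OM decoder:
+    #   label 1   -> foreground click, label -1 -> padding point (always appended),
+    #   labels 2/3 -> the two corners of a box prompt ([x0, y0] and [x1, y1]).
+    # The point and box prompts collected below are concatenated into onnx_coord / onnx_label.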
+ coords_list = [] + labels_list = [] + + if input_point is not None and len(input_point) > 0: + input_point = np.array(input_point, dtype=np.float32) + input_label = np.ones(len(input_point), dtype=np.float32) + coords_list.append(input_point) + labels_list.append(input_label) + + coords_list.append(np.array([[0.0, 0.0]], dtype=np.float32)) + labels_list.append(np.array([-1], dtype=np.float32)) + + if box is not None: + box = np.array(box, dtype=np.float32).reshape(2, 2) + coords_list.append(box) + labels_list.append(np.array([2, 3], dtype=np.float32)) + + onnx_coord = np.concatenate(coords_list, axis=0)[None, :, :] # (1,N,2) + onnx_label = np.concatenate(labels_list, axis=0)[None, :].astype(np.float32) # (1,N) + transform = ResizeLongestSide(IMAGE_SIZE) onnx_coord = transform.apply_coords(onnx_coord, image.shape[: 2]).astype(np.float32) onnx_mask_input = np.zeros((1, 1, 256, 256), dtype=np.float32) -- Gitee From 19c58c797a9da2c0d083c75bd6c31c230a36f7ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 16:00:49 +0800 Subject: [PATCH 18/24] Adding a Precision Test Script --- ACL_PyTorch/built-in/cv/SAM/README.md | 45 --------------------------- 1 file changed, 45 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/README.md b/ACL_PyTorch/built-in/cv/SAM/README.md index b912ccc1f2..076d0e1075 100644 --- a/ACL_PyTorch/built-in/cv/SAM/README.md +++ b/ACL_PyTorch/built-in/cv/SAM/README.md @@ -97,52 +97,7 @@ SAM 首先会自动分割图像中的所有内容,但是如果你需要分割 ### 4. 准备数据集 -<<<<<<< HEAD - 取仓库里的 demo 图片进行端到端测试。 -======= - **表 1** 版本配套表 - -| 配套 | 版本 | 环境准备指导 | -| ---- | ---- | ---- | -| 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | -| CANN | 8.2.RC1 | - | -| MindIE | 2.1.RC1 | - | -| Python | 3.11.10 | - | -| PyTorch | 2.1.0 | - | -| 说明:Atlas 300I Duo 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | - -## 3. 快速上手 - -### 3.1 获取源码 - -``` -git clone https://gitee.com/ascend/ModelZoo-PyTorch.git -cd ModelZoo-PyTorch/ACL_PyTorch/built-in/cv/SAM -git clone https://github.com/facebookresearch/segment-anything.git -cd segment-anything -git reset --hard 6fdee8f2727f4506cfbbe553e23b895e27956588 -git apply ../segment_anything_diff.patch -pip3 install -e . -cd .. -``` - -### 3.2 安装依赖。 - -1. 安装基础环境。 - - ```bash - pip3 install -r requirements.txt - ``` - - 说明:如果某些库通过此方式安装失败,可使用 pip3 install 单独进行安装。 - -2. 
安装 [msit](https://gitee.com/ascend/msit/tree/master/msit/) 的 surgeon 组件和 benchmark 组件。 - -### 3.3 准备数据集 - -GitHub 仓库没有提供精度和性能的测试手段,这里取仓库里的 demo 图片进行测试。 ->>>>>>> d3e26ee6ae7580508cba0256159aefde6be56a2a - ```bash mkdir data cd data -- Gitee From d5010fd73165013625018de814957dfb2f061047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 17:27:23 +0800 Subject: [PATCH 19/24] 1 --- ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py | 7 ++++--- ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py index fe44855be3..74b9484834 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -16,6 +16,7 @@ import os import cv2 import argparse + import numpy as np from tqdm import tqdm from pycocotools.coco import COCO @@ -39,7 +40,7 @@ def rle_to_mask(rle, h, w): def compute_iou(pred_mask, gt_mask): pred = (pred_mask > 0).astype(np.uint8) - gt = (gt_mask > 0).astype(np.uint8) + gt = (gt_mask > 0).astype(np.uint8) inter = (pred & gt).sum() union = (pred | gt).sum() return float(inter) / float(union) if union > 0 else 0.0 @@ -147,7 +148,7 @@ def check_device_range_valid(value): min_value = 0 max_value = 255 if ',' in value: - ilist = [ int(v) for v in value.split(',') ] + ilist = [int(v) for v in value.split(',')] for ivalue in ilist[:2]: if ivalue < min_value or ivalue > max_value: raise argparse.ArgumentTypeError("{} of device:{} is invalid. valid value range is [{}, {}]".format( @@ -178,7 +179,7 @@ def main(): session_encoder = InferSession(args.device_id, args.encoder_model_path) session_decoder = InferSession(args.device_id, args.decoder_model_path) - evaluate_sam_on_coco( + miou = evaluate_sam_on_coco( args.dataset_path, args.save_path, session_encoder, diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py b/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py index f79da035e8..3afa5e11fe 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_preprocessing_pytorch.py @@ -35,7 +35,7 @@ def encoder_preprocessing(image): return image -def decoder_preprocessing(image_embedding, input_point=None, box=None, image=None): ## box:[x0,y0,x1,y1] +def decoder_preprocessing(image_embedding, input_point=None, box=None, image=None): coords_list = [] labels_list = [] @@ -53,8 +53,8 @@ def decoder_preprocessing(image_embedding, input_point=None, box=None, image=Non coords_list.append(box) labels_list.append(np.array([2, 3], dtype=np.float32)) - onnx_coord = np.concatenate(coords_list, axis=0)[None, :, :] # (1,N,2) - onnx_label = np.concatenate(labels_list, axis=0)[None, :].astype(np.float32) # (1,N) + onnx_coord = np.concatenate(coords_list, axis=0)[None, :, :] + onnx_label = np.concatenate(labels_list, axis=0)[None, :].astype(np.float32) transform = ResizeLongestSide(IMAGE_SIZE) onnx_coord = transform.apply_coords(onnx_coord, image.shape[: 2]).astype(np.float32) -- Gitee From 4c2adfd40bdd1709fb1642b34886eaf426bdbf9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 18:28:04 +0800 Subject: [PATCH 20/24] 1 --- ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py index 
74b9484834..cf8308fed6 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -15,8 +15,8 @@ import os import cv2 -import argparse +import argparse import numpy as np from tqdm import tqdm from pycocotools.coco import COCO @@ -40,7 +40,7 @@ def rle_to_mask(rle, h, w): def compute_iou(pred_mask, gt_mask): pred = (pred_mask > 0).astype(np.uint8) - gt = (gt_mask > 0).astype(np.uint8) + gt = (gt_mask > 0).astype(np.uint8) inter = (pred & gt).sum() union = (pred | gt).sum() return float(inter) / float(union) if union > 0 else 0.0 @@ -140,7 +140,6 @@ def evaluate_sam_on_coco(coco_root, save_path, encoder, decoder, max_instances=0 print(f"Instances Evaluated : {counted}") print(f"Mean IoU (mIoU) : {miou:.4f}") print("====================================================\n") - return miou def check_device_range_valid(value): @@ -179,7 +178,7 @@ def main(): session_encoder = InferSession(args.device_id, args.encoder_model_path) session_decoder = InferSession(args.device_id, args.decoder_model_path) - miou = evaluate_sam_on_coco( + evaluate_sam_on_coco( args.dataset_path, args.save_path, session_encoder, -- Gitee From 597e9654818a1782e1d4ff4c79a40165fb140472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 18:28:49 +0800 Subject: [PATCH 21/24] 1 --- ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py index cf8308fed6..bc3bad1c49 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -- Gitee From b3287760405e376833ab2056b38d4bf8581f3af9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 18:42:31 +0800 Subject: [PATCH 22/24] 1 --- ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py index bc3bad1c49..4e540f7b5c 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# https://www.apache.org/licenses/LICENSE-2.0 +# https://apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -14,9 +14,9 @@ import os -import cv2 - import argparse + +import cv2 import numpy as np from tqdm import tqdm from pycocotools.coco import COCO -- Gitee From 848234b112448a4d43e757bf1e2e132c73351090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 18:53:56 +0800 Subject: [PATCH 23/24] 1 --- ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py index 4e540f7b5c..865c1f0f6a 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# https://apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -- Gitee From 0dd1b97352cfc2f8463f455089f9a67fa7422dce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=82=B9=E4=BA=A6=E8=88=9F?= Date: Thu, 25 Sep 2025 20:41:44 +0800 Subject: [PATCH 24/24] 1 --- ACL_PyTorch/built-in/cv/SAM/README.md | 5 ++--- ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ACL_PyTorch/built-in/cv/SAM/README.md b/ACL_PyTorch/built-in/cv/SAM/README.md index 076d0e1075..50bd2cd7f1 100644 --- a/ACL_PyTorch/built-in/cv/SAM/README.md +++ b/ACL_PyTorch/built-in/cv/SAM/README.md @@ -28,10 +28,9 @@ Segment Anything Model (SAM) 是由 Meta 开源的图像分割大模型,在计 | ---- | ---- | ---- | | 固件与驱动 | 25.2.RC1 | [Pytorch框架推理环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) | | CANN | 8.2.RC1 | - | - | MindIE | 2.1.RC1 | - | | Python | 3.11.10 | - | | PyTorch | 2.1.0 | - | - | 说明:Atlas 300I Duo 推理卡请以CANN版本选择实际固件与驱动版本。 | \ | \ | + | 说明:仅支持Atlas 300I Duo 推理卡,请以CANN版本选择实际固件与驱动版本。 | \ | \ | ## 快速上手 @@ -107,7 +106,7 @@ cd .. - 下载coco2017数据集进行精度测试。 -下载COCO-2017数据集的[图片](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fzips%2Fval2017.zip)与[标注](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fannotations%2Fannotations_trainval2017.zip),放置coco2017目录下 +下载并解压COCO-2017数据集的[图片](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fzips%2Fval2017.zip)与[标注](https://gitee.com/link?target=http%3A%2F%2Fimages.cocodataset.org%2Fannotations%2Fannotations_trainval2017.zip),放置coco2017目录下 ``` coco2017 diff --git a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py index 865c1f0f6a..b8e416deb5 100644 --- a/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py +++ b/ACL_PyTorch/built-in/cv/SAM/sam_coco_metric.py @@ -22,7 +22,7 @@ from tqdm import tqdm from pycocotools.coco import COCO from pycocotools import mask as maskUtils -from ais_bench.infer.interface import InferSession\ +from ais_bench.infer.interface import InferSession from sam_preprocessing_pytorch import encoder_preprocessing, decoder_preprocessing from sam_postprocessing_pytorch import sam_postprocessing -- Gitee
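
The precision-test patches above prompt SAM with COCO ground-truth boxes and score each instance by IoU. The sketch below illustrates, under the same conventions, how a box prompt is encoded for the decoder and how the per-instance IoU behind the reported mIoU is computed. The helper names mirror `decoder_preprocessing` and `compute_iou` from the patches, but this is a simplified, self-contained illustration rather than the shipped scripts.

```python
# Minimal sketch (assumes only numpy): box-prompt encoding and per-instance IoU,
# following the conventions of the patched decoder_preprocessing / compute_iou.
import numpy as np


def encode_box_prompt(box_xyxy):
    """Build (coords, labels) for the SAM decoder from an [x0, y0, x1, y1] box.

    A padding point (0, 0) with label -1 is appended because no free-form click
    is given; the box corners carry labels 2 (top-left) and 3 (bottom-right).
    """
    pad_point = np.array([[0.0, 0.0]], dtype=np.float32)
    corners = np.array(box_xyxy, dtype=np.float32).reshape(2, 2)
    coords = np.concatenate([pad_point, corners], axis=0)[None, :, :]   # (1, 3, 2)
    labels = np.array([[-1.0, 2.0, 3.0]], dtype=np.float32)             # (1, 3)
    return coords, labels


def compute_iou(pred_mask, gt_mask):
    """Binary IoU between a predicted mask and a ground-truth mask."""
    pred = pred_mask > 0
    gt = gt_mask > 0
    union = np.logical_or(pred, gt).sum()
    if union == 0:
        return 0.0
    inter = np.logical_and(pred, gt).sum()
    return float(inter) / float(union)


if __name__ == "__main__":
    coords, labels = encode_box_prompt([10, 20, 110, 220])
    print(coords.shape, labels)                 # (1, 3, 2) [[-1.  2.  3.]]

    # Tiny worked example: intersection = 4 px, union = 8 px -> IoU = 0.50.
    pred = np.zeros((4, 4), dtype=np.uint8)
    pred[:2, :2] = 1
    gt = np.zeros((4, 4), dtype=np.uint8)
    gt[:2, :] = 1
    print(f"IoU = {compute_iou(pred, gt):.2f}")
    # The reported mIoU is simply the mean of these per-instance IoU values.
```

Note that the real pipeline additionally rescales the prompt coordinates with `ResizeLongestSide.apply_coords` to the model's input resolution before calling the OM decoder; that step is omitted here for brevity.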