diff --git a/.gitignore b/.gitignore
index b245b85b056b3433442c99cc1cd7e3c8d63b75a5..58865d7842da034fb37582a1cd947fdeb90e0871 100644
--- a/.gitignore
+++ b/.gitignore
@@ -104,4 +104,7 @@ venv.bak/
# data
data/
-output_dir/
\ No newline at end of file
+output_dir/
+
+# macos
+.DS_Store
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..744bf2ba7ca0e0fc6d2e30faa4e9bafd7b949e63
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,203 @@
+Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
index 79b751dc6173c1949655b8a144b5cd15c441fecd..bab238475e4490fd97bc0d2b5abf1956465fce03 100644
--- a/README.md
+++ b/README.md
@@ -13,30 +13,95 @@ GAN-Generative Adversarial Network, was praised by "the Father of Convolutional
[](LICENSE)
+## 🎪 Hot Activities
-## Recent Contributors
-[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/0)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/1)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/2)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/3)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/4)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/5)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/6)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/7)
+- 2021.4.15~4.22
-## Quick Start
+  GAN 7-Day Course Camp: Baidu senior R&D engineers help you learn basic and advanced GAN knowledge in 7 days!
-* Please refer to the [installation document](./docs/en_US/install.md) to make sure you have installed PaddlePaddle and PaddleGAN correctly.
+ **Courses videos and related materials: https://aistudio.baidu.com/aistudio/course/introduce/16651**
-* Get started through ppgan.app interface:
+## 🚀 Recent Updates
- ```python
- from ppgan.apps import RealSRPredictor
- sr = RealSRPredictor()
- sr.run("docs/imgs/monarch.png")
- ```
-* More applications, please refer to [ppgan.apps apis](./docs/en_US/apis/apps.md)
-* More tutorials:
- - [Data preparation](./docs/en_US/data_prepare.md)
- - [Training/Evaluating/Testing basic usage](./docs/zh_CN/get_started.md)
+- 👶 **Young or Old? [StyleGAN V2 Face Editing](./docs/en_US/tutorials/styleganv2editing.md) - Time Machine!** 👨🦳
+  - **[Online Tutorial](https://aistudio.baidu.com/aistudio/projectdetail/3251280?channelType=0&channel=0)**
+
+

+
+
+- 🔥 **Latest Release: [PP-MSVSR](./docs/en_US/tutorials/video_super_resolution.md)** 🔥
+ - **Video Super Resolution SOTA models**
+
+

+
+
+- 😍 **Boy or Girl? [StyleGAN V2 Face Editing](./docs/en_US/tutorials/styleganv2editing.md) - Changing Genders!** 😍
+  - **[Online Tutorial](https://aistudio.baidu.com/aistudio/projectdetail/2565277?contributionType=1)**
+
+

+
+
+- 👩🚀 **A Space Odyssey: [LapStyle](./docs/en_US/tutorials/lap_style.md) image translation takes you on a trip around the universe** 👨🚀
+
+  - **[Online Tutorial](https://aistudio.baidu.com/aistudio/projectdetail/2343740?contributionType=1)**
+
+
+
+- 🧙♂️ **Latest Creative Project: create a magic, animated profile photo for your Hogwarts student ID** 🧙♀️
+
+  - **[Online Tutorial](https://aistudio.baidu.com/aistudio/projectdetail/2288888?channelType=0&channel=0)**
+
+
+

+
+
+- **💞 New Face Morphing function 💞: seamlessly merge any two faces and give the merged face any facial expression!**
+
+ - Tutorials: https://aistudio.baidu.com/aistudio/projectdetail/2254031
+
+
+

+
+
+- **Release a new version of the First Order Motion model with two impressive features:**
+ - High resolution 512x512
+ - Face Enhancement
+  - Tutorials: https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/tutorials/motion_driving.md
+
+- **New image translation ability: transfer photos into oil painting style**
+
+ - Complete tutorials for deployment: https://github.com/wzmsltw/PaintTransformer
+
+
+

+
+
+## Documentation
+
+#### **Installation**
+
+* Environment dependencies:
+ - PaddlePaddle >= 2.1.0
+ - Python >= 3.6
+ - CUDA >= 10.1
+* [Full installation tutorial](./docs/en_US/install.md)
+
+#### **Starter Tutorial**
+
+- [Quick start](./docs/en_US/get_started.md)
+- [Data Preparation](./docs/en_US/data_prepare.md)
+- [Instruction of APIs](./docs/en_US/apis/apps.md)
+- [Instruction of Config Files](./docs/en_US/config_doc.md)
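+
+A minimal quick-start sketch using the `ppgan.apps` interface (the same RealSR example from the earlier version of this README; pretrained weights are downloaded automatically on first use):
+
+```python
+from ppgan.apps import RealSRPredictor
+
+sr = RealSRPredictor()           # loads the pretrained RealSR super-resolution model
+sr.run("docs/imgs/monarch.png")  # the upscaled result is written to the predictor's output directory
+```
+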
## Model Tutorial
* [Pixel2Pixel](./docs/en_US/tutorials/pix2pix_cyclegan.md)
* [CycleGAN](./docs/en_US/tutorials/pix2pix_cyclegan.md)
+* [LapStyle](./docs/en_US/tutorials/lap_style.md)
* [PSGAN](./docs/en_US/tutorials/psgan.md)
* [First Order Motion Model](./docs/en_US/tutorials/motion_driving.md)
* [FaceParsing](./docs/en_US/tutorials/face_parse.md)
@@ -44,74 +109,138 @@ GAN-Generative Adversarial Network, was praised by "the Father of Convolutional
* [U-GAT-IT](./docs/en_US/tutorials/ugatit.md)
* [Photo2Cartoon](./docs/en_US/tutorials/photo2cartoon.md)
* [Wav2Lip](./docs/en_US/tutorials/wav2lip.md)
-* [Super_Resolution](./docs/en_US/tutorials/super_resolution.md)
+* [Single Image Super Resolution(SISR)](./docs/en_US/tutorials/single_image_super_resolution.md)
+ * Including: RealSR, ESRGAN, LESRCNN, PAN, DRN
+* [Video Super Resolution(VSR)](./docs/en_US/tutorials/video_super_resolution.md)
+ * Including: ⭐ PP-MSVSR ⭐, EDVR, BasicVSR, BasicVSR++
+* [StyleGAN2](./docs/en_US/tutorials/styleganv2.md)
+* [Pixel2Style2Pixel](./docs/en_US/tutorials/pixel2style2pixel.md)
+* [StarGANv2](docs/en_US/tutorials/starganv2.md)
+* [MPR Net](./docs/en_US/tutorials/mpr_net.md)
+* [FaceEnhancement](./docs/en_US/tutorials/face_enhancement.md)
+* [PReNet](./docs/en_US/tutorials/prenet.md)
+* [SwinIR](./docs/en_US/tutorials/swinir.md)
+* [InvDN](./docs/en_US/tutorials/invdn.md)
+* [AOT-GAN](./docs/en_US/tutorials/aotgan.md)
+* [NAFNet](./docs/en_US/tutorials/nafnet.md)
+* [GFPGan](./docs/en_US/tutorials/gfpgan.md)
+* [GPEN](./docs/en_US/tutorials/gpen.md)
+
## Composite Application
-* [Video restore](./docs/zh_CN/tutorials/video_restore.md)
+* [Video restore](./docs/en_US/tutorials/video_restore.md)
+
+## Online Tutorial
+
+You can run these projects on [AI Studio](https://aistudio.baidu.com/aistudio/projectoverview/public/1?kw=paddlegan) to learn how to use the models above:
+
+|Online Tutorial | link |
+|--------------|-----------|
+|Motion Driving - multi-person "Mai-ha-hi" | [Click and Try](https://aistudio.baidu.com/aistudio/projectdetail/1603391) |
+|Restore a video of Beijing from a hundred years ago|[Click and Try](https://aistudio.baidu.com/aistudio/projectdetail/1161285)|
+|Motion Driving - when "Su Daqiang" sings "unravel" |[Click and Try](https://aistudio.baidu.com/aistudio/projectdetail/1048840)|
## Examples
+### Face Morphing
+
+
+

+
+
### Image Translation
-

+
+
### Old video restore
-

+
+
### Motion driving
-

+
+
### Super resolution
-

+
+
### Makeup shifter
-

+
+
### Face cartoonization
-

+
+
### Realistic face cartoonization
-

+
+
### Photo animation
-

+
+
### Lip-syncing
-

+
+
## Changelog
+- v2.1.0 (2021.12.8)
+  - Release the video super-resolution model PP-MSVSR and multiple pretrained weights
+  - Release several SOTA video super-resolution models and their pretrained weights, such as BasicVSR, IconVSR and BasicVSR++
+  - Release a lightweight motion driving model (model size compressed from 229M to 10.1M) and optimize the fusion effect
+  - Release high-resolution FOMM and Wav2Lip pretrained models
+  - Release several interesting applications based on StyleGANv2, such as face inversion, face fusion and face editing
+  - Release Baidu's self-developed and effective style transfer model LapStyle and its interesting applications, and launch the official [experience page](https://www.paddlepaddle.org.cn/paddlegan)
+  - Release a lightweight image super-resolution model PAN
+
+- v2.0.0 (2021.6.2)
+  - Release the [First Order Motion](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/tutorials/motion_driving.md) model and multiple pretrained weights
+  - Release applications that support [multi-face motion driving](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/tutorials/motion_driving.md#1-test-for-face)
+  - Release the video super-resolution model [EDVR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/tutorials/video_super_resolution.md) and multiple pretrained weights
+  - Release the contents of the PaddleGAN [7-day training camp](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/education)
+  - Improve the robustness of PaddleGAN on the Windows platform
+
+- v2.0.0-beta (2021.3.1)
+  - Fully switch to the Paddle 2.0.0 APIs
+  - Release super-resolution models: ESRGAN, RealSR, LESRCNN, DRN, etc.
+  - Release the lip-sync model: Wav2Lip
+  - Release the street-scene animation model: AnimeGANv2
+  - Release face cartoonization models: U-GAT-IT, Photo2Cartoon
+  - Release the SOTA face generation model: StyleGAN2
- v0.1.0 (2020.11.02)
- Release first version, supported models include Pixel2Pixel, CycleGAN, PSGAN. Supported applications include video frame interpolation, super resolution, colorize images and videos, image animation.
diff --git a/README_cn.md b/README_cn.md
index ded54c91589fb4cfe505e1c01a5ab1f6f343a6e8..14118607f36e414651d2c94bf1bd2026ecaf386c 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -13,43 +13,140 @@ GAN--生成对抗网络,被“卷积网络之父”**Yann LeCun(杨立昆)
[](LICENSE)
-## 近期贡献者
-[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/0)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/1)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/2)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/3)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/4)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/5)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/6)[](https://sourcerer.io/fame/LaraStuStu/paddlepaddle/paddlegan/links/7)
-## 快速开始
-* 请确保您按照[安装文档](./docs/zh_CN/install.md)的说明正确安装了PaddlePaddle和PaddleGAN
+## 近期活动🔥🔥🔥
-* 通过ppgan.apps接口直接使用应用:
- ```python
- from ppgan.apps import RealSRPredictor
- sr = RealSRPredictor()
- sr.run("docs/imgs/monarch.png")
- ```
+- 🔥**2021.12.08**🔥
+ **💙 AI快车道👩🏫:视频超分算法及行业应用 💙**
+ - **课程回放链接🔗:https://aistudio.baidu.com/aistudio/education/group/info/25179**
-* 更多应用的使用请参考[ppgan.apps API](./docs/zh_CN/apis/apps.md)
-* 更多训练、评估教程:
- * [数据准备](./docs/zh_CN/data_prepare.md)
- * [训练/评估/推理教程](./docs/zh_CN/get_started.md)
-## 经典模型实现
-* [Pixel2Pixel](./docs/zh_CN/tutorials/pix2pix_cyclegan.md)
-* [CycleGAN](./docs/zh_CN/tutorials/pix2pix_cyclegan.md)
-* [PSGAN](./docs/zh_CN/tutorials/psgan.md)
-* [First Order Motion Model](./docs/zh_CN/tutorials/motion_driving.md)
-* [FaceParsing](./docs/zh_CN/tutorials/face_parse.md)
-* [AnimeGANv2](./docs/zh_CN/tutorials/animegan.md)
-* [U-GAT-IT](./docs/zh_CN/tutorials/ugatit.md)
-* [Photo2Cartoon](docs/zh_CN/tutorials/photo2cartoon.md)
-* [Wav2Lip](docs/zh_CN/tutorials/wav2lip.md)
-* [Super_Resolution](./docs/en_US/tutorials/super_resolution.md)
+- 2021.4.15~4.22
-## 复合应用
+ 生成对抗网络七日打卡营火爆来袭,赶紧让百度资深研发带你上车GAN起来吧!
-* [视频修复](./docs/zh_CN/tutorials/video_restore.md)
+ **直播回放与课件资料:https://aistudio.baidu.com/aistudio/course/introduce/16651**
+
+- 2020.12.10
+
+ 《大谷 Spitzer 手把手教你修复百年前老北京影像》b站直播中奖用户名单请点击[PaddleGAN直播中奖名单](./docs/luckydraw.md)查看~
+
+ **想要看直播回放视频请点击链接:https://www.bilibili.com/video/BV1GZ4y1g7xc**
+
+## 产品动态
+- 👶 **人脸编辑神器:[StyleGAN V2人脸属性编辑](./docs/zh_CN/tutorials/styleganv2editing.md)之年龄变换--时光穿梭机,一键实现变老变年轻** 👨🦳
+ - **[完整在线教程](https://aistudio.baidu.com/aistudio/projectdetail/3251280?channelType=0&channel=0)**
+
+

+
+
+- 👀 **视频超分SOTA算法[PP-MSVSR](./docs/zh_CN/tutorials/video_super_resolution.md):一行命令从"马赛克"到"高清影像"** 👀
+ - **[完整在线教程](https://aistudio.baidu.com/aistudio/projectdetail/3205183)**
+
+

+
+
+- 😍 **人脸编辑神器:[StyleGAN V2人脸属性编辑](./docs/zh_CN/tutorials/styleganv2editing.md)之性别转换--怕什么孤单?和自己谈一场恋爱吧!** 😍
+ - **[完整在线教程](https://aistudio.baidu.com/aistudio/projectdetail/2565277?contributionType=1)**
+
+

+
+
+- 👩🚀 **宇宙漫游指南:[LapStyle](./docs/zh_CN/tutorials/lap_style.md)风格迁移带你「沉浸式」体验太空漫游** 👨🚀
+
+ - **[完整在线教程](https://aistudio.baidu.com/aistudio/projectdetail/2343740?contributionType=1)**
+
+
+
+- 🧙♂️ **新增创意项目**:制作专属于你的**会动的霍格沃兹魔法头像** 🧙♀️
+
+ - **[完整在线教程](https://aistudio.baidu.com/aistudio/projectdetail/2288888?channelType=0&channel=0)**
+
+
+
+

+
+
+- ⭐ **新增人脸融合能力,结合新版First Order Motion,实现人脸完美融合并带有丰富表情(๑^ں^๑)** ⭐
+
+ - **[完整在线教程](https://aistudio.baidu.com/aistudio/projectdetail/2254031 )**
+
+
+

+
+
+- 新增First Order Motion分辨率512清晰版本,并加上人脸增强特效,使得面部细节更清晰,详情见[教程](./docs/zh_CN/tutorials/motion_driving.md)。
+
+- 新增真实照片转油画风格能力
+
+ - 完整推理代码及教程见: https://github.com/wzmsltw/PaintTransformer
+
+
+

+
+
+## 文档教程
+
+### 安装
+
+- 环境依赖:
+ - PaddlePaddle >= 2.1.0
+ - Python >= 3.6
+ - CUDA >= 10.1
+- [完整安装教程](./docs/zh_CN/install.md)
+
+### 入门教程
+
+- [快速开始](./docs/zh_CN/get_started.md)
+- [数据准备](./docs/zh_CN/data_prepare.md)
+- [API接口使用文档](./docs/zh_CN/apis/apps.md)
+- [配置文件/Config使用说明](./docs/zh_CN/config_doc.md)
+
+## 模型库
+
+* 图像翻译
+ * 风格迁移:[Pixel2Pixel](./docs/zh_CN/tutorials/pix2pix_cyclegan.md)
+ * 风格迁移:[CycleGAN](./docs/zh_CN/tutorials/pix2pix_cyclegan.md)
+ * 图像艺术风格转换:[LapStyle](./docs/zh_CN/tutorials/lap_style.md)
+ * 人脸换妆:[PSGAN](./docs/zh_CN/tutorials/psgan.md)
+ * 照片动漫化:[AnimeGANv2](./docs/zh_CN/tutorials/animegan.md)
+ * 人像动漫化:[U-GAT-IT](./docs/zh_CN/tutorials/ugatit.md)
+ * 人脸卡通化:[Photo2Cartoon](docs/zh_CN/tutorials/photo2cartoon.md)
+ * 多种风格迁移:[StarGANv2](docs/zh_CN/tutorials/starganv2.md)
+* 动作迁移
+ * 人脸表情迁移:[First Order Motion Model](./docs/zh_CN/tutorials/motion_driving.md)
+ * 唇形合成:[Wav2Lip](docs/zh_CN/tutorials/wav2lip.md)
+* 基础GAN
+ * [DCGAN](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/ppgan/models/dc_gan_model.py)
+ * WGAN
+* 人脸生成
+ * 人脸生成:[StyleGAN2](./docs/zh_CN/tutorials/styleganv2.md)
+ * 人脸编码:[Pixel2Style2Pixel](./docs/zh_CN/tutorials/pixel2style2pixel.md)
+ * 人脸增强:[FaceEnhancement](./docs/zh_CN/tutorials/face_enhancement.md)
+ * 人脸解析:[FaceParsing](./docs/zh_CN/tutorials/face_parse.md)
+ * 盲人脸修复:[GFPGan](./docs/zh_CN/tutorials/gfpgan.md)、[GPEN](./docs/zh_CN/tutorials/gpen.md)
+* 分辨率提升
+ * 单张图片超分:[Single Image Super Resolution(SISR)](./docs/zh_CN/tutorials/single_image_super_resolution.md)
+ * 包含模型:RealSR、ESRGAN、LESRCNN、PAN、DRN
+ * 视频超分:[Video Super Resolution(VSR)](./docs/zh_CN/tutorials/video_super_resolution.md)
+ * 包含模型:⭐ PP-MSVSR ⭐、EDVR、BasicVSR、BasicVSR++
+* 图像视频修复
+ * 图像去模糊去噪去雨:[MPR Net](./docs/zh_CN/tutorials/mpr_net.md)、[SwinIR](./docs/zh_CN/tutorials/swinir.md)、[InvDN](./docs/zh_CN/tutorials/invdn.md)、[NAFNet](./docs/zh_CN/tutorials/nafnet.md)
+ * 视频去模糊:[EDVR](./docs/zh_CN/tutorials/video_super_resolution.md)
+ * 图像去雨:[PReNet](./docs/zh_CN/tutorials/prenet.md)
+ * 图像补全:[AOT-GAN](./docs/zh_CN/tutorials/aotgan.md)
+
+## 产业级应用
+
+- [智能影像修复](./docs/zh_CN/industrial_solution/video_restore_cn.md)
## 在线教程
@@ -57,86 +154,119 @@ GAN--生成对抗网络,被“卷积网络之父”**Yann LeCun(杨立昆)
|在线教程 | 链接 |
|--------------|-----------|
+|人脸融合-PaddleGAN七夕特辑 | [点击体验](https://aistudio.baidu.com/aistudio/projectdetail/2254031 ) |
+|表情动作迁移-一键实现多人版「蚂蚁呀嘿」 | [点击体验](https://aistudio.baidu.com/aistudio/projectdetail/1603391) |
|老北京视频修复|[点击体验](https://aistudio.baidu.com/aistudio/projectdetail/1161285)|
|表情动作迁移-当苏大强唱起unravel |[点击体验](https://aistudio.baidu.com/aistudio/projectdetail/1048840)|
+
## 效果展示
-### 图片变换
+### 人脸融合
-

+
+### 风格迁移
+
+
+

+
+
+
### 老视频修复
-

+
+
### 动作迁移
-

+
+
### 超分辨率
-

+
+
### 妆容迁移
-

+
+
### 人脸动漫化
-

+
+
### 写实人像卡通化
-

+
+
### 照片动漫化
-

+
+
### 唇形同步
-

+
+
## 版本更新
+- v2.1.0 (2021.12.8)
+ - 发布视频超分辨率模型PP-MSVSR以及多个预训练权重
+  - 发布BasicVSR,IconVSR与BasicVSR++等多个效果领先的视频超分辨率模型及其预训练模型
+ - 发布轻量级动作驱动模型(体积压缩:229M->10.1M),并优化融合效果
+ - 发布高分辨率的FOMM和Wav2Lip预训练模型
+ - 发布人脸反演,人脸融合和人脸编辑等多个基于StyleGANv2的有趣应用
+ - 发布百度自研且效果领先的风格迁移模型LapStyle及其有趣应用,并上线官网[体验页面](https://www.paddlepaddle.org.cn/paddlegan)
+ - 发布轻量的图像超分辨模型PAN
+
+- v2.0.0 (2021.6.2)
+  - 发布[First Order Motion](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/tutorials/motion_driving.md)模型以及多个预训练权重
+ - 发布支持[多人脸驱动](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/tutorials/motion_driving.md#1-test-for-face)的应用
+ - 发布视频超分辨模型[EDVR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/tutorials/video_super_resolution.md)以及多个预训练权重
+ - 发布PaddleGAN对应的[七日打卡训练营](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/education)内容
+ - 增强PaddleGAN在windows平台运行的鲁棒性
+
+- v2.0.0-beta (2021.3.1)
+ - 完全切换Paddle 2.0.0版本的API。
+ - 发布超分辨模型:ESRGAN,RealSR,LESRCNN,DRN等
+ - 发布唇形迁移模型:Wav2Lip
+ - 发布街景动漫化模型:AnimeGANv2
+ - 发布人脸动漫化模型:U-GAT-IT ,Photo2Cartoon
+ - 发布高清人脸生成模型:StyleGAN2
- v0.1.0 (2020.11.02)
- 初版发布,支持Pixel2Pixel、CycleGAN、PSGAN模型,支持视频插针、超分、老照片/视频上色、视频动作生成等应用。
- 模块化设计,接口简单易用。
-## 近期活动更新
-
-- 2020.12.10
-
- 《大谷 Spitzer 手把手教你修复百年前老北京影像》b站直播中奖用户名单请点击[PaddleGAN直播中奖名单](./docs/luckydraw.md)查看~
-
- 想要看直播回放视频请点击链接:https://www.bilibili.com/video/BV1GZ4y1g7xc
-
## 欢迎加入PaddleGAN技术交流群
@@ -146,6 +276,12 @@ GAN--生成对抗网络,被“卷积网络之父”**Yann LeCun(杨立昆)
+扫描二维码回复关键字"GAN"即可加入官方微信交流群!
+
+

+
+
+
### PaddleGAN 特别兴趣小组(Special Interest Group)
最早于1961年被[ACM(Association for Computing Machinery)](https://en.wikipedia.org/wiki/Association_for_Computing_Machinery)首次提出并使用,国际顶尖开源组织包括[Kubernates](https://kubernetes.io/)都采用SIGs的形式,使拥有同样特定兴趣的成员可以共同分享、学习知识并进行项目开发。这些成员不需要在同一国家/地区、同一个组织,只要大家志同道合,都可以奔着相同的目标一同学习、工作、玩耍~
@@ -158,8 +294,9 @@ SIG贡献:
- [zhen8838](https://github.com/zhen8838): 贡献AnimeGANv2.
- [Jay9z](https://github.com/Jay9z): 贡献DCGAN的示例、修改安装文档等。
-- [HighCWu](https://github.com/HighCWu): 贡献c-DCGAN和WGAN,以及对`paddle.vision.datasets`数据集的支持。
+- [HighCWu](https://github.com/HighCWu): 贡献c-DCGAN和WGAN,以及对`paddle.vision.datasets`数据集的支持;贡献inversion部分代码复现。
- [hao-qiang](https://github.com/hao-qiang) & [ minivision-ai ](https://github.com/minivision-ai): 贡献人像卡通化photo2cartoon项目。
+- [lyl120117](https://github.com/lyl120117):贡献去模糊MPRNet推理代码。
## 贡献代码
diff --git a/applications/tools/animeganv2.py b/applications/tools/animeganv2.py
index 7503c547da157e856901be7d94180ea0c2f3a420..74588559b81fb82ae7e52f3bc8d65feb253c75b1 100644
--- a/applications/tools/animeganv2.py
+++ b/applications/tools/animeganv2.py
@@ -1,6 +1,21 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
import paddle
import os
import sys
+
sys.path.insert(0, os.getcwd())
from ppgan.apps import AnimeGANPredictor
import argparse
diff --git a/applications/tools/aotgan.py b/applications/tools/aotgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..e545eda8b022bc3916e77d333c62dbebb7197f55
--- /dev/null
+++ b/applications/tools/aotgan.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import os
+import sys
+
+sys.path.insert(0, os.getcwd())
+from ppgan.apps import AOTGANPredictor
+import argparse
+from ppgan.utils.config import get_config
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--input_image_path",
+ type=str,
+ default=None,
+ help="path to input image")
+
+ parser.add_argument("--input_mask_path",
+ type=str,
+ default=None,
+ help="path to input mask")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default=None,
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model weight")
+
+ parser.add_argument("--config-file",
+ type=str,
+ default=None,
+ help="path to yaml file")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ cfg = get_config(args.config_file)
+
+ predictor = AOTGANPredictor(output_path=args.output_path,
+ weight_path=args.weight_path,
+ gen_cfg=cfg.predict)
+ predictor.run(input_image_path=args.input_image_path, input_mask_path=args.input_mask_path)
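+
+# Example invocation (a sketch only; the image, mask and config paths below are placeholders):
+#   python applications/tools/aotgan.py \
+#       --input_image_path path/to/image.png \
+#       --input_mask_path path/to/mask.png \
+#       --config-file path/to/aotgan_config.yaml \
+#       --output_path output_dir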
diff --git a/applications/tools/face_parse.py b/applications/tools/face_parse.py
index 5eded74084dd96731b3b98ce4d736ea5192694fb..6cd907ebb394d44730a2bdc4402f134a84de818f 100644
--- a/applications/tools/face_parse.py
+++ b/applications/tools/face_parse.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
import argparse
import paddle
diff --git a/applications/tools/first-order-demo.py b/applications/tools/first-order-demo.py
index 624aea24d46ce9dc917e1461d343130da5f6dc9b..6b566f1801e9b017adf6db82218f3596230ae355 100644
--- a/applications/tools/first-order-demo.py
+++ b/applications/tools/first-order-demo.py
@@ -25,6 +25,9 @@ parser.add_argument("--weight_path",
parser.add_argument("--source_image", type=str, help="path to source image")
parser.add_argument("--driving_video", type=str, help="path to driving video")
parser.add_argument("--output", default='output', help="path to output")
+parser.add_argument("--filename",
+ default='result.mp4',
+ help="filename to output")
parser.add_argument("--relative",
dest="relative",
action="store_true",
@@ -48,22 +51,81 @@ parser.add_argument("--best_frame",
type=int,
default=None,
help="Set frame to start from.")
-parser.add_argument("--cpu", dest="cpu", action="store_true", help="cpu mode.")
+# for device
+group = parser.add_mutually_exclusive_group()
+group.add_argument("--cpu", dest="cpu", action="store_true", help="cpu mode.")
+group.add_argument("--xpu", dest="xpu", action="store_true", help="xpu mode.")
+
+parser.add_argument("--ratio",
+ dest="ratio",
+ type=float,
+ default=0.4,
+ help="margin ratio")
+parser.add_argument(
+ "--face_detector",
+ dest="face_detector",
+ type=str,
+ default='sfd',
+ help="face detector to be used, can choose s3fd or blazeface")
+parser.add_argument("--multi_person",
+ dest="multi_person",
+ action="store_true",
+ default=False,
+ help="whether there is only one person in the image or not")
+parser.add_argument("--image_size",
+ dest="image_size",
+ type=int,
+ default=256,
+ help="size of image")
+parser.add_argument("--batch_size",
+ dest="batch_size",
+ type=int,
+ default=1,
+ help="Batch size for fom model")
+parser.add_argument("--face_enhancement",
+ dest="face_enhancement",
+ action="store_true",
+ help="use face enhance for face")
+parser.add_argument("--mobile_net",
+ dest="mobile_net",
+ action="store_true",
+ help="use mobile_net for fom")
parser.set_defaults(relative=False)
parser.set_defaults(adapt_scale=False)
+parser.set_defaults(face_enhancement=False)
+parser.set_defaults(mobile_net=False)
+
+parser.add_argument(
+    "--slice_size",
+    dest="slice_size",
+    type=int,
+    default=0,
+    help=
+    "slice the driving video into smaller parts to bypass the XPU's 4 GB tensor size restriction"
+)
if __name__ == "__main__":
args = parser.parse_args()
if args.cpu:
paddle.set_device('cpu')
-
+ if args.xpu:
+ paddle.set_device('xpu')
predictor = FirstOrderPredictor(output=args.output,
+ filename=args.filename,
weight_path=args.weight_path,
config=args.config,
relative=args.relative,
adapt_scale=args.adapt_scale,
find_best_frame=args.find_best_frame,
- best_frame=args.best_frame)
+ best_frame=args.best_frame,
+ ratio=args.ratio,
+ face_detector=args.face_detector,
+ multi_person=args.multi_person,
+ image_size=args.image_size,
+ batch_size=args.batch_size,
+ face_enhancement=args.face_enhancement,
+ mobile_net=args.mobile_net,
+ slice_size=args.slice_size)
predictor.run(args.source_image, args.driving_video)
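+
+# Example invocation (a sketch only; the source image and driving video paths are placeholders):
+#   python applications/tools/first-order-demo.py \
+#       --source_image path/to/face.png \
+#       --driving_video path/to/driving.mp4 \
+#       --relative --adapt_scale \
+#       --image_size 512 --face_enhancement --multi_person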
diff --git a/applications/tools/gpen.py b/applications/tools/gpen.py
new file mode 100644
index 0000000000000000000000000000000000000000..d77cd1c1533bba6afee523f96a878a384224d055
--- /dev/null
+++ b/applications/tools/gpen.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+sys.path.append(".")
+import argparse
+import paddle
+from ppgan.apps import GPENPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--test_img",
+ type=str,
+ default='data/gpen/lite_data/15006.png',
+ help="path of test image")
+
+ parser.add_argument("--model_type",
+ type=str,
+ default=None,
+ help="type of model for loading pretrained model")
+
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="sample random seed for model's image generation")
+
+ parser.add_argument("--size",
+ type=int,
+ default=256,
+ help="resolution of output image")
+
+ parser.add_argument("--style_dim",
+ type=int,
+ default=512,
+ help="number of style dimension")
+
+ parser.add_argument("--n_mlp",
+ type=int,
+ default=8,
+ help="number of mlp layer depth")
+
+ parser.add_argument("--channel_multiplier",
+ type=int,
+ default=1,
+ help="number of channel multiplier")
+
+ parser.add_argument("--narrow",
+ type=float,
+ default=0.5,
+ help="number of channel narrow")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = GPENPredictor(output_path=args.output_path,
+ weight_path=args.weight_path,
+ model_type=args.model_type,
+ seed=args.seed,
+ size=args.size,
+ style_dim=args.style_dim,
+ n_mlp=args.n_mlp,
+ narrow=args.narrow,
+ channel_multiplier=args.channel_multiplier)
+ predictor.run(args.test_img)
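+
+# Example invocation (a sketch only; the weight path is a placeholder, the test image is the default sample):
+#   python applications/tools/gpen.py \
+#       --test_img data/gpen/lite_data/15006.png \
+#       --weight_path path/to/gpen_checkpoint.pdparams \
+#       --size 256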
diff --git a/applications/tools/image_restoration.py b/applications/tools/image_restoration.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf1e610c257e006f57661e393fcd0a650cc08f4b
--- /dev/null
+++ b/applications/tools/image_restoration.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import os
+import sys
+
+sys.path.insert(0, os.getcwd())
+from ppgan.apps import MPRPredictor
+import argparse
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="sample random seed for model's image generation")
+
+ parser.add_argument('--images_path',
+ default=None,
+ required=True,
+ type=str,
+ help='Single image or images directory.')
+
+ parser.add_argument('--task',
+ required=True,
+ type=str,
+ help='Task to run',
+ choices=['Deblurring', 'Denoising', 'Deraining'])
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = MPRPredictor(
+ output_path=args.output_path,
+ weight_path=args.weight_path,
+ seed=args.seed,
+ task=args.task)
+ predictor.run(images_path=args.images_path)
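+
+# Example invocation (a sketch only; the images path is a placeholder):
+#   python applications/tools/image_restoration.py \
+#       --images_path path/to/blurry_images \
+#       --task Deblurring \
+#       --output_path output_dir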
diff --git a/applications/tools/invdn_denoising.py b/applications/tools/invdn_denoising.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe91f52ca6418a5111f010371ff5cfbe1c174042
--- /dev/null
+++ b/applications/tools/invdn_denoising.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import argparse
+
+sys.path.insert(0, os.getcwd())
+
+import paddle
+from ppgan.apps import InvDNPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="sample random seed for model's image generation")
+
+ parser.add_argument('--images_path',
+ default=None,
+ required=True,
+ type=str,
+ help='Single image or images directory.')
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ parser.add_argument(
+ "--disable_mc",
+ action="store_true",
+ help=
+ "Disable the Monte Carlo Self Ensemble in the paper to boost the speed during test. Performance may degrade."
+ )
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = InvDNPredictor(output_path=args.output_path,
+ weight_path=args.weight_path,
+ seed=args.seed)
+ predictor.run(images_path=args.images_path, disable_mc=args.disable_mc)
diff --git a/applications/tools/lapstyle.py b/applications/tools/lapstyle.py
new file mode 100644
index 0000000000000000000000000000000000000000..df24d2cf1eb5e426169621f4767d1bbdf1e8af1c
--- /dev/null
+++ b/applications/tools/lapstyle.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import os
+import sys
+
+sys.path.insert(0, os.getcwd())
+from ppgan.apps import LapStylePredictor
+import argparse
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--content_img_path",
+ type=str,
+ required=True,
+ help="path to content image")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model weight path")
+
+ parser.add_argument(
+ "--style",
+ type=str,
+ default='starrynew',
+ help=
+ "if weight_path is None, style can be chosen in 'starrynew', 'circuit', 'ocean' and 'stars'"
+ )
+
+ parser.add_argument("--style_image_path",
+ type=str,
+ required=True,
+ help="path to style image")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = LapStylePredictor(output=args.output_path,
+ style=args.style,
+ weight_path=args.weight_path)
+ predictor.run(args.content_img_path, args.style_image_path)
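+
+# Example invocation (a sketch only; the content and style image paths are placeholders):
+#   python applications/tools/lapstyle.py \
+#       --content_img_path path/to/content.png \
+#       --style_image_path path/to/style.png \
+#       --style starrynew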
diff --git a/applications/tools/mprnet.py b/applications/tools/mprnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cb63925058d8015719b9c5b8fc036d422086690
--- /dev/null
+++ b/applications/tools/mprnet.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import os
+import sys
+
+sys.path.insert(0, os.getcwd())
+from ppgan.apps import MPRPredictor
+import argparse
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_image", type=str, help="path to image")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model weight path")
+
+ parser.add_argument(
+ "--task",
+ type=str,
+ default='Deblurring',
+ help="task can be chosen in 'Deblurring', 'Denoising', 'Deraining'")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = MPRPredictor(output_path=args.output_path,
+ task=args.task,
+ weight_path=args.weight_path)
+ predictor.run(args.input_image)
diff --git a/applications/tools/nafnet_denoising.py b/applications/tools/nafnet_denoising.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ad6e7a09798c49d6da4d158d2abac885d795e88
--- /dev/null
+++ b/applications/tools/nafnet_denoising.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import os
+import sys
+import argparse
+
+sys.path.insert(0, os.getcwd())
+import paddle
+from ppgan.apps import NAFNetPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="sample random seed for model's image generation")
+
+ parser.add_argument('--images_path',
+ default=None,
+ required=True,
+ type=str,
+ help='Single image or images directory.')
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = NAFNetPredictor(output_path=args.output_path,
+ weight_path=args.weight_path,
+ seed=args.seed)
+ predictor.run(images_path=args.images_path)
diff --git a/applications/tools/photopen.py b/applications/tools/photopen.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ab5af218b4b73d45eae475b1971b1144f2c3d95
--- /dev/null
+++ b/applications/tools/photopen.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import os
+import sys
+
+sys.path.insert(0, os.getcwd())
+from ppgan.apps import PhotoPenPredictor
+import argparse
+from ppgan.utils.config import get_config
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("--semantic_label_path",
+ type=str,
+ default=None,
+ help="path to input semantic label")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default=None,
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model weight")
+
+ parser.add_argument("--config-file",
+ type=str,
+ default=None,
+ help="path to yaml file")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ cfg = get_config(args.config_file)
+ predictor = PhotoPenPredictor(output_path=args.output_path,
+ weight_path=args.weight_path,
+ gen_cfg=cfg.predict)
+ predictor.run(semantic_label_path=args.semantic_label_path)
diff --git a/applications/tools/pixel2style2pixel.py b/applications/tools/pixel2style2pixel.py
index 69a2452d57eb3e40571f1e594d9073773e2fa902..48d19deb86ad76d9f2c7cdb48a7163a26a7332a3 100644
--- a/applications/tools/pixel2style2pixel.py
+++ b/applications/tools/pixel2style2pixel.py
@@ -1,9 +1,21 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
import paddle
-import os
-import sys
-sys.path.insert(0, os.getcwd())
from ppgan.apps import Pixel2Style2PixelPredictor
-import argparse
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -33,17 +45,17 @@ if __name__ == "__main__":
type=int,
default=1024,
help="resolution of output image")
-
+
parser.add_argument("--style_dim",
type=int,
default=512,
help="number of style dimension")
-
+
parser.add_argument("--n_mlp",
type=int,
default=8,
help="number of mlp layer depth")
-
+
parser.add_argument("--channel_multiplier",
type=int,
default=2,
@@ -67,6 +79,5 @@ if __name__ == "__main__":
size=args.size,
style_dim=args.style_dim,
n_mlp=args.n_mlp,
- channel_multiplier=args.channel_multiplier
- )
+ channel_multiplier=args.channel_multiplier)
predictor.run(args.input_image)
diff --git a/applications/tools/singan.py b/applications/tools/singan.py
new file mode 100755
index 0000000000000000000000000000000000000000..c668047bd31063e7c892ad41f105c195effd8333
--- /dev/null
+++ b/applications/tools/singan.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import paddle
+from ppgan.apps import SinGANPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--pretrained_model",
+ type=str,
+ default=None,
+ help="a pretianed model, only trees, stone, mountains, birds, and lightning are implemented.")
+
+ parser.add_argument("--mode",
+ type=str,
+ default="random_sample",
+ help="type of model for loading pretrained model")
+
+ parser.add_argument("--generate_start_scale",
+ type=int,
+ default=0,
+ help="sample random seed for model's image generation")
+
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="sample random seed for model's image generation")
+
+ parser.add_argument("--scale_h",
+ type=float,
+ default=1.0,
+ help="horizontal scale")
+
+ parser.add_argument("--scale_v",
+ type=float,
+ default=1.0,
+ help="vertical scale")
+
+ parser.add_argument("--ref_image",
+ type=str,
+ default=None,
+ help="reference image for harmonization, editing and paint2image")
+
+ parser.add_argument("--mask_image",
+ type=str,
+ default=None,
+ help="mask image for harmonization and editing")
+
+ parser.add_argument("--sr_factor",
+ type=float,
+ default=4.0,
+ help="scale for super resolution")
+
+ parser.add_argument("--animation_alpha",
+ type=float,
+ default=0.9,
+ help="a parameter determines how close the frames of the sequence remain to the training image")
+
+ parser.add_argument("--animation_beta",
+ type=float,
+ default=0.9,
+ help="a parameter controls the smoothness and rate of change in the generated clip")
+
+ parser.add_argument("--animation_frames",
+ type=int,
+ default=20,
+ help="frame number of output animation when mode is animation")
+
+ parser.add_argument("--animation_duration",
+ type=float,
+ default=0.1,
+ help="duration of each frame in animation")
+
+ parser.add_argument("--n_row",
+ type=int,
+ default=5,
+ help="row number of output image grid")
+
+ parser.add_argument("--n_col",
+ type=int,
+ default=3,
+ help="column number of output image grid")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = SinGANPredictor(args.output_path,
+ args.weight_path,
+ args.pretrained_model,
+ args.seed)
+ predictor.run(args.mode,
+ args.generate_start_scale,
+ args.scale_h,
+ args.scale_v,
+ args.ref_image,
+ args.mask_image,
+ args.sr_factor,
+ args.animation_alpha,
+ args.animation_beta,
+ args.animation_frames,
+ args.animation_duration,
+ args.n_row,
+ args.n_col)
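+
+# Example invocation (a sketch only; the pretrained model name is taken from the choices listed in
+# the --pretrained_model help above, other values are illustrative):
+#   python applications/tools/singan.py \
+#       --pretrained_model trees \
+#       --mode random_sample \
+#       --generate_start_scale 0 \
+#       --n_row 3 --n_col 3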
diff --git a/applications/tools/styleganv2.py b/applications/tools/styleganv2.py
index 55f792837c300ccd03ad785e111faa84ecf05818..8026bcac97cfa602b0dc2e5a7ebf9465c27e0474 100644
--- a/applications/tools/styleganv2.py
+++ b/applications/tools/styleganv2.py
@@ -1,9 +1,21 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
import paddle
-import os
-import sys
-sys.path.insert(0, os.getcwd())
from ppgan.apps import StyleGANv2Predictor
-import argparse
if __name__ == "__main__":
parser = argparse.ArgumentParser()
@@ -26,22 +38,22 @@ if __name__ == "__main__":
type=int,
default=None,
help="sample random seed for model's image generation")
-
+
parser.add_argument("--size",
type=int,
default=1024,
help="resolution of output image")
-
+
parser.add_argument("--style_dim",
type=int,
default=512,
help="number of style dimension")
-
+
parser.add_argument("--n_mlp",
type=int,
default=8,
help="number of mlp layer depth")
-
+
parser.add_argument("--channel_multiplier",
type=int,
default=2,
@@ -67,14 +79,12 @@ if __name__ == "__main__":
if args.cpu:
paddle.set_device('cpu')
- predictor = StyleGANv2Predictor(
- output_path=args.output_path,
- weight_path=args.weight_path,
- model_type=args.model_type,
- seed=args.seed,
- size=args.size,
- style_dim=args.style_dim,
- n_mlp=args.n_mlp,
- channel_multiplier=args.channel_multiplier
- )
+ predictor = StyleGANv2Predictor(output_path=args.output_path,
+ weight_path=args.weight_path,
+ model_type=args.model_type,
+ seed=args.seed,
+ size=args.size,
+ style_dim=args.style_dim,
+ n_mlp=args.n_mlp,
+ channel_multiplier=args.channel_multiplier)
predictor.run(args.n_row, args.n_col)
diff --git a/applications/tools/styleganv2clip.py b/applications/tools/styleganv2clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfd308ba395ee15ba8d3b2f54a8a7a95e2f5fb7c
--- /dev/null
+++ b/applications/tools/styleganv2clip.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import paddle
+from ppgan.apps import StyleGANv2ClipPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--latent",
+ type=str,
+                        help="path to the image latent codes")
+
+ parser.add_argument("--neutral", type=str, help="neutral description")
+ parser.add_argument("--target", type=str, help="neutral description")
+ parser.add_argument("--beta_threshold",
+ type=float,
+ default=0.12,
+ help="beta threshold for channel editing")
+
+ parser.add_argument("--direction_offset",
+ type=float,
+ default=5.0,
+ help="offset value of edited attribute")
+
+ parser.add_argument("--direction_path",
+ type=str,
+ default=None,
+ help="path to latent editing directions")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--model_type",
+ type=str,
+ default=None,
+ help="type of model for loading pretrained model")
+
+ parser.add_argument("--size",
+ type=int,
+ default=1024,
+ help="resolution of output image")
+
+ parser.add_argument("--style_dim",
+ type=int,
+ default=512,
+ help="number of style dimension")
+
+ parser.add_argument("--n_mlp",
+ type=int,
+ default=8,
+ help="number of mlp layer depth")
+
+ parser.add_argument("--channel_multiplier",
+ type=int,
+ default=2,
+ help="number of channel multiplier")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = StyleGANv2ClipPredictor(
+ output_path=args.output_path,
+ weight_path=args.weight_path,
+ model_type=args.model_type,
+ seed=None,
+ size=args.size,
+ style_dim=args.style_dim,
+ n_mlp=args.n_mlp,
+ channel_multiplier=args.channel_multiplier,
+ direction_path=args.direction_path)
+ predictor.run(args.latent, args.neutral, args.target, args.direction_offset,
+ args.beta_threshold)
diff --git a/applications/tools/styleganv2editing.py b/applications/tools/styleganv2editing.py
new file mode 100644
index 0000000000000000000000000000000000000000..5435eb5a0db8cf0d37d9a743320e5c6e6cf6cee6
--- /dev/null
+++ b/applications/tools/styleganv2editing.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import paddle
+from ppgan.apps import StyleGANv2EditingPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--latent",
+ type=str,
+                        help="path to the image latent codes")
+
+ parser.add_argument("--direction_name",
+ type=str,
+ default=None,
+ help="name in directions dictionary")
+
+ parser.add_argument("--direction_offset",
+ type=float,
+ default=0.0,
+ help="offset value of edited attribute")
+
+ parser.add_argument("--direction_path",
+ type=str,
+ default=None,
+ help="path to latent editing directions")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--model_type",
+ type=str,
+ default=None,
+ help="type of model for loading pretrained model")
+
+ parser.add_argument("--size",
+ type=int,
+ default=1024,
+ help="resolution of output image")
+
+ parser.add_argument("--style_dim",
+ type=int,
+ default=512,
+ help="number of style dimension")
+
+ parser.add_argument("--n_mlp",
+ type=int,
+ default=8,
+ help="number of mlp layer depth")
+
+ parser.add_argument("--channel_multiplier",
+ type=int,
+ default=2,
+ help="number of channel multiplier")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = StyleGANv2EditingPredictor(
+ output_path=args.output_path,
+ weight_path=args.weight_path,
+ model_type=args.model_type,
+ seed=None,
+ size=args.size,
+ style_dim=args.style_dim,
+ n_mlp=args.n_mlp,
+ channel_multiplier=args.channel_multiplier,
+ direction_path=args.direction_path)
+ predictor.run(args.latent, args.direction_name, args.direction_offset)
diff --git a/applications/tools/styleganv2fitting.py b/applications/tools/styleganv2fitting.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8209719e8f06649a90b40a15582fe3468370cb5
--- /dev/null
+++ b/applications/tools/styleganv2fitting.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import paddle
+from ppgan.apps import StyleGANv2FittingPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--input_image", type=str, help="path to source image")
+
+ parser.add_argument("--need_align",
+ action="store_true",
+ help="whether to align input image")
+
+ parser.add_argument("--start_lr",
+ type=float,
+ default=0.1,
+ help="learning rate at the begin of training")
+
+ parser.add_argument("--final_lr",
+ type=float,
+ default=0.025,
+ help="learning rate at the end of training")
+
+ parser.add_argument("--latent_level",
+ type=int,
+ nargs="+",
+ default=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
+ help="indices of latent code for training")
+
+ parser.add_argument("--step",
+ type=int,
+ default=100,
+ help="optimize iterations")
+
+ parser.add_argument("--mse_weight",
+ type=float,
+ default=1,
+ help="weight of the mse loss")
+
+ parser.add_argument("--pre_latent",
+ type=str,
+ default=None,
+ help="path to pre-prepared latent codes")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--model_type",
+ type=str,
+ default=None,
+ help="type of model for loading pretrained model")
+
+ parser.add_argument("--size",
+ type=int,
+ default=1024,
+ help="resolution of output image")
+
+ parser.add_argument("--style_dim",
+ type=int,
+ default=512,
+ help="number of style dimension")
+
+ parser.add_argument("--n_mlp",
+ type=int,
+ default=8,
+ help="number of mlp layer depth")
+
+ parser.add_argument("--channel_multiplier",
+ type=int,
+ default=2,
+ help="number of channel multiplier")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = StyleGANv2FittingPredictor(
+ output_path=args.output_path,
+ weight_path=args.weight_path,
+ model_type=args.model_type,
+ seed=None,
+ size=args.size,
+ style_dim=args.style_dim,
+ n_mlp=args.n_mlp,
+ channel_multiplier=args.channel_multiplier)
+ predictor.run(args.input_image,
+ need_align=args.need_align,
+ start_lr=args.start_lr,
+ final_lr=args.final_lr,
+ latent_level=args.latent_level,
+ step=args.step,
+ mse_weight=args.mse_weight,
+ pre_latent=args.pre_latent)
diff --git a/applications/tools/styleganv2mixing.py b/applications/tools/styleganv2mixing.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0f0b2cda6784fbb4be5722bda517a9bfa257799
--- /dev/null
+++ b/applications/tools/styleganv2mixing.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import paddle
+from ppgan.apps import StyleGANv2MixingPredictor
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--latent1",
+ type=str,
+ help="path to first image latent codes")
+
+ parser.add_argument("--latent2",
+ type=str,
+ help="path to second image latent codes")
+
+ parser.add_argument(
+ "--weights",
+ type=float,
+ nargs="+",
+ default=[0.5] * 18,
+ help="different weights at each level of two latent codes")
+
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--model_type",
+ type=str,
+ default=None,
+ help="type of model for loading pretrained model")
+
+ parser.add_argument("--size",
+ type=int,
+ default=1024,
+ help="resolution of output image")
+
+ parser.add_argument("--style_dim",
+ type=int,
+ default=512,
+ help="number of style dimension")
+
+ parser.add_argument("--n_mlp",
+ type=int,
+ default=8,
+ help="number of mlp layer depth")
+
+ parser.add_argument("--channel_multiplier",
+ type=int,
+ default=2,
+ help="number of channel multiplier")
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = StyleGANv2MixingPredictor(
+ output_path=args.output_path,
+ weight_path=args.weight_path,
+ model_type=args.model_type,
+ seed=None,
+ size=args.size,
+ style_dim=args.style_dim,
+ n_mlp=args.n_mlp,
+ channel_multiplier=args.channel_multiplier)
+ predictor.run(args.latent1, args.latent2, args.weights)
diff --git a/applications/tools/swinir_denoising.py b/applications/tools/swinir_denoising.py
new file mode 100644
index 0000000000000000000000000000000000000000..de2f886c41cd0e892a8e11c94b3d317860ddf41b
--- /dev/null
+++ b/applications/tools/swinir_denoising.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import os
+import sys
+import argparse
+
+import paddle
+from ppgan.apps import SwinIRPredictor
+
+sys.path.insert(0, os.getcwd())
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--output_path",
+ type=str,
+ default='output_dir',
+ help="path to output image dir")
+
+ parser.add_argument("--weight_path",
+ type=str,
+ default=None,
+ help="path to model checkpoint path")
+
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="sample random seed for model's image generation")
+
+ parser.add_argument('--images_path',
+ default=None,
+ required=True,
+ type=str,
+ help='Single image or images directory.')
+
+ parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
+
+ args = parser.parse_args()
+
+ if args.cpu:
+ paddle.set_device('cpu')
+
+ predictor = SwinIRPredictor(output_path=args.output_path,
+ weight_path=args.weight_path,
+ seed=args.seed)
+ predictor.run(images_path=args.images_path)
diff --git a/applications/tools/video-enhance.py b/applications/tools/video-enhance.py
index 6a06050347e0b61741f30b7292794f51129b75e1..6d1b13d23aacff903ae8dcfbcaf89864cfe2304a 100644
--- a/applications/tools/video-enhance.py
+++ b/applications/tools/video-enhance.py
@@ -20,6 +20,9 @@ from ppgan.apps import DeepRemasterPredictor
from ppgan.apps import DeOldifyPredictor
from ppgan.apps import RealSRPredictor
from ppgan.apps import EDVRPredictor
+from ppgan.apps import PPMSVSRPredictor, BasicVSRPredictor, \
+ BasiVSRPlusPlusPredictor, IconVSRPredictor, \
+ PPMSVSRLargePredictor
parser = argparse.ArgumentParser(description='Fix video')
parser.add_argument('--input', type=str, default=None, help='Input video')
@@ -44,6 +47,26 @@ parser.add_argument('--EDVR_weight',
type=str,
default=None,
help='Path to model weight')
+parser.add_argument('--PPMSVSR_weight',
+ type=str,
+ default=None,
+ help='Path to model weight')
+parser.add_argument('--PPMSVSRLarge_weight',
+ type=str,
+ default=None,
+ help='Path to model weight')
+parser.add_argument('--BasicVSR_weight',
+ type=str,
+ default=None,
+ help='Path to model weight')
+parser.add_argument('--IconVSR_weight',
+ type=str,
+ default=None,
+ help='Path to model weight')
+parser.add_argument('--BasiVSRPlusPlus_weight',
+ type=str,
+ default=None,
+ help='Path to model weight')
# DAIN args
parser.add_argument('--time_step',
type=float,
@@ -75,21 +98,32 @@ parser.add_argument('--render_factor',
type=int,
default=32,
help='model inputsize=render_factor*16')
+# number of input frames for recurrent VSR models
+parser.add_argument('--num_frames',
+ type=int,
+ default=10,
+ help='num frames for recurrent vsr model')
-#process order support model name:[DAIN, DeepRemaster, DeOldify, RealSR, EDVR]
+#process order support model name:[DAIN, DeepRemaster, DeOldify, RealSR, EDVR, PPMSVSR, PPMSVSRLarge, BasicVSR, IconVSR, BasiVSRPlusPlus]
parser.add_argument('--process_order',
type=str,
default='none',
nargs='+',
help='Process order')
+parser.add_argument("--cpu",
+ dest="cpu",
+ action="store_true",
+ help="cpu mode.")
if __name__ == "__main__":
args = parser.parse_args()
+ if args.cpu:
+ paddle.set_device('cpu')
orders = args.process_order
temp_video_path = None
for order in orders:
- print('Model {} proccess start..'.format(order))
+ print('Model {} process start..'.format(order))
if temp_video_path is None:
temp_video_path = args.input
if order == 'DAIN':
@@ -119,11 +153,36 @@ if __name__ == "__main__":
weight_path=args.RealSR_weight)
frames_path, temp_video_path = predictor.run(temp_video_path)
elif order == 'EDVR':
- paddle.enable_static()
predictor = EDVRPredictor(args.output, weight_path=args.EDVR_weight)
frames_path, temp_video_path = predictor.run(temp_video_path)
- paddle.disable_static()
+ elif order == 'PPMSVSR':
+ predictor = PPMSVSRPredictor(args.output,
+ weight_path=args.PPMSVSR_weight,
+ num_frames=args.num_frames)
+ frames_path, temp_video_path = predictor.run(temp_video_path)
+ elif order == 'PPMSVSRLarge':
+ predictor = PPMSVSRLargePredictor(
+ args.output,
+ weight_path=args.PPMSVSRLarge_weight,
+ num_frames=args.num_frames)
+ frames_path, temp_video_path = predictor.run(temp_video_path)
+ elif order == 'BasicVSR':
+ predictor = BasicVSRPredictor(args.output,
+ weight_path=args.BasicVSR_weight,
+ num_frames=args.num_frames)
+ frames_path, temp_video_path = predictor.run(temp_video_path)
+ elif order == 'IconVSR':
+ predictor = IconVSRPredictor(args.output,
+ weight_path=args.IconVSR_weight,
+ num_frames=args.num_frames)
+ frames_path, temp_video_path = predictor.run(temp_video_path)
+ elif order == 'BasiVSRPlusPlus':
+ predictor = BasiVSRPlusPlusPredictor(
+ args.output,
+ weight_path=args.BasiVSRPlusPlus_weight,
+ num_frames=args.num_frames)
+ frames_path, temp_video_path = predictor.run(temp_video_path)
print('Model {} output frames path:'.format(order), frames_path)
print('Model {} output video path:'.format(order), temp_video_path)
- print('Model {} proccess done!'.format(order))
+ print('Model {} process done!'.format(order))
diff --git a/applications/tools/wav2lip.py b/applications/tools/wav2lip.py
index 8a708fc53bd555b4fd6244050614e54509ae8b64..684fd8a261db2c3c6fda3fc389616bb1383b76da 100644
--- a/applications/tools/wav2lip.py
+++ b/applications/tools/wav2lip.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
import argparse
import paddle
@@ -97,11 +111,34 @@ parser.add_argument(
action='store_true',
help='Prevent smoothing face detections over a short temporal window')
parser.add_argument("--cpu", dest="cpu", action="store_true", help="cpu mode.")
+parser.add_argument(
+ "--face_detector",
+ dest="face_detector",
+ type=str,
+ default='sfd',
+ help="face detector to be used, can choose s3fd or blazeface")
+parser.add_argument("--face_enhancement",
+ dest="face_enhancement",
+ action="store_true",
+                    help="apply face enhancement to the output face")
+parser.set_defaults(face_enhancement=False)
if __name__ == "__main__":
args = parser.parse_args()
if args.cpu:
paddle.set_device('cpu')
- predictor = Wav2LipPredictor(args)
- predictor.run()
+ predictor = Wav2LipPredictor(checkpoint_path=args.checkpoint_path,
+ static=args.static,
+ fps=args.fps,
+ pads=args.pads,
+ face_det_batch_size=args.face_det_batch_size,
+ wav2lip_batch_size=args.wav2lip_batch_size,
+ resize_factor=args.resize_factor,
+ crop=args.crop,
+ box=args.box,
+ rotate=args.rotate,
+ nosmooth=args.nosmooth,
+ face_detector=args.face_detector,
+ face_enhancement=args.face_enhancement)
+ predictor.run(args.face, args.audio, args.outfile)
diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e24b67d5bd54510f39f3fcc0ad3a08c071c52013
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,56 @@
+# Reproducing PaddleGAN model training performance
+## Directory structure
+
+```
+├── README.md # this document
+├── benchmark.yaml # configuration file: sets the models to test and their parameters
+├── run_all.sh # entry point: benchmarks and collects the training performance of all GAN models
+└── run_benchmark.sh # worker script: benchmarks the training performance of a single model
+```
+
+## Environment
+### Physical machine
+- Single machine (1 GPU and 8 GPUs)
+  - System: CentOS release 7.5 (Final)
+  - GPU: Tesla V100-SXM2-32GB * 8
+  - CPU: Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz * 80
+  - CUDA / cuDNN version: cuda10.2-cudnn7
+
+#### Notes
+BasicVSR is benchmarked on 4 GPUs because the competing torch implementation can only be tested on 4 GPUs.
+
+Because the REDS dataset is large, place it under /workspace/data/ after the Docker container is set up, so it does not have to be re-downloaded (which takes a long time) for every run.
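+
+For instance (a sketch; `/path/to/REDS` is a placeholder for wherever the dataset was downloaded on the host), copy it into the repository's `data/` directory, which the container sees as `/workspace/data/` because `$PWD` is mounted at `/workspace` in the command below:
+
+```bash
+mkdir -p data && cp -r /path/to/REDS data/
+```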
+
+### Docker image
+
+- **Image**: `registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7`
+- **paddle version**: `2.1.2`
+- **CUDA version**: `10.2`
+- **cuDNN version**: `7`
+
+## Launch the test scripts from the PaddleGAN directory as follows:
+```script
+ImageName="registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7"
+docker pull ${ImageName}
+
+run_cmd="set -xe;
+ cd /workspace ;
+ bash -x benchmark/run_all.sh"
+
+nvidia-docker run --name test_paddlegan -i \
+ --net=host \
+ --shm-size=128g \
+ -v $PWD:/workspace \
+ ${ImageName} /bin/bash -c "${run_cmd}"
+```
+
+To enable the profile option, simply replace `run_cmd` with:
+```
+run_cmd="set -xe;
+ cd /workspace ;
+ bash -x benchmark/run_all.sh on"
+```
+
+## Output
+
+After the run finishes, files with the training performance data of each model, such as `esrgan_mp_bs32_fp32_8`, are produced in the PaddleGAN directory.
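+
+If a JSON speed summary is needed, such a log can also be parsed offline with the bundled `benchmark/analysis_log.py` (a sketch; the two file names below are just an example of a produced log and of the summary file to write):
+
+```bash
+python benchmark/analysis_log.py esrgan esrgan_mp_bs32_fp32_8 esrgan_mp_bs32_fp32_8_speed
+```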
diff --git a/benchmark/analysis_log.py b/benchmark/analysis_log.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c807a5ed9f5eb5768f36e44add523252f4b523b
--- /dev/null
+++ b/benchmark/analysis_log.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# encoding=utf-8 vi:ts=4:sw=4:expandtab:ft=python
+
+import re
+import sys
+import json
+
+def analyze(model_name, log_file, res_log_file):
+ time_pat = re.compile(r"ips: (.*) images/s")
+
+ logs = open(log_file).readlines()
+ logs = ";".join(logs)
+ time_res = time_pat.findall(logs)
+
+ fail_flag = 0
+ run_mode = ""
+ gpu_num = 0
+ ips = 0
+
+ if time_res == []:
+ fail_flag = 1
+ else:
+        gpu_num = int(log_file.split('_')[-1])
+        run_mode = "sp" if gpu_num == 1 else "mp"
+
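+        # average the logged ips values; the first skip_num readings are treated as warm-up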
+ skip_num = 4
+ total_time = 0
+ for i in range(skip_num, len(time_res)):
+ total_time += float(time_res[i])
+ ips = total_time / (len(time_res) - skip_num)
+
+ info = {"log_file": log_file, "model_name": model_name, "mission_name": "图像生成",
+ "direction_id": 0, "run_mode": run_mode, "index": 1, "gpu_num": gpu_num,
+ "FINAL_RESULT": ips, "JOB_FAIL_FLAG": fail_flag, "UNIT": "images/s"}
+ json_info = json.dumps(info)
+ with open(res_log_file, "w") as of:
+ of.write(json_info)
+
+if __name__ == "__main__":
+ if len(sys.argv) != 4:
+ print("Usage:" + sys.argv[0] + " model_name path/to/log/file path/to/res/log/file")
+ sys.exit()
+
+ model_name = sys.argv[1]
+ log_file = sys.argv[2]
+ res_log_file = sys.argv[3]
+
+ analyze(model_name, log_file, res_log_file)
+
+
diff --git a/benchmark/benchmark.yaml b/benchmark/benchmark.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3b1b9f0c0c0394be182c36bb337388ff39b22d5e
--- /dev/null
+++ b/benchmark/benchmark.yaml
@@ -0,0 +1,39 @@
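+# Each entry below describes one model to benchmark: the dataset to use
+# (dataset_web URL or local dataset path), the training config, the precision
+# (fp_item), the batch size(s) (bs_item), the run length (total_iters or
+# epochs) and the training log interval.
+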
+StyleGANv2:
+ dataset_web: https://paddlegan.bj.bcebos.com/datasets/ffhq_256.tar
+ config: configs/stylegan_v2_256_ffhq.yaml
+ fp_item: fp32
+ bs_item: 8
+ total_iters: 100
+ log_interval: 5
+
+FOMM:
+ dataset_web: https://paddlegan.bj.bcebos.com/datasets/fom_test_data.tar
+ config: configs/firstorder_vox_256.yaml
+ fp_item: fp32
+ bs_item: 16
+ epochs: 1
+ log_interval: 1
+
+esrgan:
+ dataset_web: https://paddlegan.bj.bcebos.com/datasets/DIV2KandSet14paddle.tar
+ config: configs/esrgan_psnr_x4_div2k.yaml
+ fp_item: fp32
+ bs_item: 32
+ total_iters: 300
+ log_interval: 10
+
+edvr:
+ dataset: data/REDS
+ config: configs/edvr_m_wo_tsa.yaml
+ fp_item: fp32
+ bs_item: 4
+ total_iters: 300
+ log_interval: 10
+
+basicvsr:
+ dataset: data/REDS
+ config: configs/basicvsr_reds.yaml
+ fp_item: fp32
+ bs_item: 2 4
+ total_iters: 300
+ log_interval: 10
diff --git a/benchmark/prepare.sh b/benchmark/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7831b31b8747d04a20ec65cdf3a2d412eb37275f
--- /dev/null
+++ b/benchmark/prepare.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+export BENCHMARK_ROOT=/workspace
+run_env=$BENCHMARK_ROOT/run_env
+log_date=`date "+%Y.%m%d.%H%M%S"`
+frame=paddle2.1.3
+cuda_version=10.2
+save_log_dir=${BENCHMARK_ROOT}/logs/${frame}_${log_date}_${cuda_version}/
+
+if [[ -d ${save_log_dir} ]]; then
+ rm -rf ${save_log_dir}
+fi
+
+# set up the directory where training logs will be written
+export TRAIN_LOG_DIR=${save_log_dir}/train_log
+mkdir -p ${TRAIN_LOG_DIR}
+log_path=${TRAIN_LOG_DIR}
+
+################################# configure python, for example:
+rm -rf $run_env
+mkdir $run_env
+echo `which python3.7`
+ln -s $(which python3.7)m-config $run_env/python3-config
+ln -s $(which python3.7) $run_env/python
+ln -s $(which pip3.7) $run_env/pip
+
+export PATH=$run_env:${PATH}
+cd $BENCHMARK_ROOT
+pip install -v -e .
diff --git a/benchmark/run_all.sh b/benchmark/run_all.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4be68642e031d9a7d5bd97802474bccfd13aecde
--- /dev/null
+++ b/benchmark/run_all.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+export log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+
+function parse_yaml {
+ local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
+ sed -ne "s|^\($s\):|\1|" \
+ -e "s|^\($s\)\($w\)$s:$s[\"']\(.*\)[\"']$s\$|\1$fs\2$fs\3|p" \
+ -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
+ awk -F$fs '{
+ indent = length($1)/2;
+ vname[indent] = $2;
+ if (indent == 0) {
+ model_mode_list[model_num]=$2;
+ printf("model_mode_list[%d]=%s\n",(model_num), $2);
+ printf("model_num=%d\n", (model_num+1));
+ model_num=(model_num+1);
+ }
+ for (i in vname) {if (i > indent) {delete vname[i]}}
+ if (length($3) >= 0) {
+ vn=""; for (i=0; i&1 # (5min)
+ sleep 60
+ echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_mode}"
+ run_mode=mp
+ basicvsr_name=basicvsr
+ if [ ${model_mode} = ${basicvsr_name} ]; then
+# CUDA_VISIBLE_DEVICES=0,1,2,3 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval} ${profile} | tee ${log_path}/gan_dygraph_${model_mode}_${run_mode}_bs${bs_item}_${fp_item}_speed_4gpus4p 2>&1
+ echo "-----skip 4cards"
+ else
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} ${mode} ${max_iter} ${model_mode} ${config} ${log_interval} ${profile} | tee ${log_path}/gan_dygraph_${model_mode}_${run_mode}_bs${bs_item}_${fp_item}_speed_8gpus8p 2>&1
+ fi
+ sleep 60
+ done
+ done
+done
diff --git a/benchmark/run_benchmark.sh b/benchmark/run_benchmark.sh
new file mode 100755
index 0000000000000000000000000000000000000000..eab29a424f69d418c131690f55e9c015423ff8a7
--- /dev/null
+++ b/benchmark/run_benchmark.sh
@@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+set -xe
+# Example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
+# Parameters
+function _set_params(){
+    run_mode=${1:-"sp"}             # sp: single GPU | mp: multiple GPUs
+ batch_size=${2:-"64"}
+ fp_item=${3:-"fp32"} # fp32|fp16
+ mode=${4:-"epochs"}
+    max_iter=${5:-"500"}       # optional; used to stop training early if needed
+ model_item=${6:-"model_item"}
+ config=${7:-"config"}
+ log_interval=${8:-"1"}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR is set later by the QA pipeline
+ need_profile=${9:-"off"}
+
+ index=1
+ base_batch_size=${batch_size}
+ mission_name="图像生成"
+ direction_id=0
+ keyword="ips:"
+ keyword_loss="G_idt_A_loss:"
+ skip_steps=5
+ ips_unit="images/s"
+ model_name=${model_item}_bs${batch_size}_${fp_item}
+#   no changes needed below this line
+ device=${CUDA_VISIBLE_DEVICES//,/ }
+ arr=(${device})
+ num_gpu_devices=${#arr[*]}
+ log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
+ res_log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}_speed
+ log_profile=${run_log_path}/${model_name}_model.profile
+}
+
+
+function _train(){
+ echo "Train on ${num_gpu_devices} GPUs"
+ echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
+
+ profiler_cmd=""
+ profiler_options="batch_range=[10,20];profile_path=${log_profile}"
+ if [ $need_profile = "on" ]; then
+ profiler_cmd="--profiler_options=${profiler_options}"
+ fi
+
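+    # assemble the training command; options passed after -o override values from the config file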
+ train_cmd="${profiler_cmd}
+ --config-file=${config}
+ -o dataset.train.batch_size=${batch_size}
+ log_config.interval=${log_interval}
+ ${mode}=${max_iter} "
+ case ${run_mode} in
+ sp) train_cmd="python -u tools/main.py "${train_cmd} ;;
+ mp)
+ rm -rf ./mylog
+ train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/main.py "${train_cmd}
+ log_parse_file="mylog/workerlog.0" ;;
+ *) echo "choose run_mode(sp or mp)"; exit 1;
+ esac
+# no changes needed below this line
+ timeout 15m ${train_cmd} > ${log_file} 2>&1
+ if [ $? -ne 0 ];then
+ echo -e "${model_name}, FAIL"
+ export job_fail_flag=1
+ else
+ echo -e "${model_name}, SUCCESS"
+ export job_fail_flag=0
+ fi
+ trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM
+
+ if [ $run_mode = "mp" -a -d mylog ]; then
+ rm ${log_file}
+ cp mylog/workerlog.0 ${log_file}
+ fi
+
+}
+
+source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses benchmark-compliant logs with the analysis.py script to extract performance data; for joint debugging it can be downloaded from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh; if you only want to produce training logs without joint debugging, comment out this line, but re-enable it before submitting
+_set_params $@
+_run
diff --git a/configs/animeganv2.yaml b/configs/animeganv2.yaml
index f09ea1364c5500a1efe828e34cda9bb6280f70df..5e4b650d2a7ecd575b82766d5abb0ff1ed9ebf67 100644
--- a/configs/animeganv2.yaml
+++ b/configs/animeganv2.yaml
@@ -1,12 +1,5 @@
epochs: 30
output_dir: output_dir
-pretrain_ckpt: output_dir/AnimeGANV2PreTrainModel-2020-11-29-17-02/epoch_2_checkpoint.pdparams
-g_adv_weight: 300.
-d_adv_weight: 300.
-con_weight: 1.5
-sty_weight: 2.5
-color_weight: 10.
-tv_weight: 1.
model:
name: AnimeGANV2Model
@@ -14,7 +7,17 @@ model:
name: AnimeGenerator
discriminator:
name: AnimeDiscriminator
- gan_mode: lsgan
+ gan_criterion:
+ name: GANLoss
+ gan_mode: lsgan
+    # set this to the checkpoint path of your own pretrained model
+ pretrain_ckpt: output_dir/AnimeGANV2PreTrainModel-2020-11-29-17-02/epoch_2_checkpoint.pdparams
+ g_adv_weight: 300.
+ d_adv_weight: 300.
+ con_weight: 1.5
+ sty_weight: 2.5
+ color_weight: 10.
+ tv_weight: 1.
dataset:
train:
@@ -23,8 +26,6 @@ dataset:
batch_size: 4
dataroot: data/animedataset
style: Hayao
- phase: train
- direction: AtoB
transform_real:
- name: Transpose
- name: Normalize
@@ -47,31 +48,41 @@ dataset:
test:
name: SingleDataset
dataroot: data/animedataset/test/HR_photo
- max_dataset_size: inf
- direction: BtoA
- input_nc: 3
- output_nc: 3
- serial_batches: False
- pool_size: 50
- transforms:
- - name: ResizeToScale
- size: [256, 256]
- scale: 32
- interpolation: bilinear
- - name: Transpose
- - name: Normalize
- mean: [127.5, 127.5, 127.5]
- std: [127.5, 127.5, 127.5]
-
-optimizer:
- name: Adam
- beta1: 0.5
+ preprocess:
+ - name: LoadImageFromFile
+ key: A
+ - name: Transforms
+ input_keys: [A]
+ pipeline:
+ - name: ResizeToScale
+ size: [256, 256]
+ scale: 32
+ interpolation: bilinear
+ - name: Transpose
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+ keys: [image, image]
lr_scheduler:
- name: linear
- learning_rate: 0.00002
+ name: LinearDecay
+ learning_rate: 0.0002
start_epoch: 100
decay_epochs: 100
+  # will be computed from the real dataset at runtime
+ iters_per_epoch: 1
+
+optimizer:
+ optimizer_G:
+ name: Adam
+ net_names:
+ - netG
+ beta1: 0.5
+ optimizer_D:
+ name: Adam
+ net_names:
+ - netD
+ beta1: 0.5
log_config:
interval: 100
diff --git a/configs/animeganv2_pretrain.yaml b/configs/animeganv2_pretrain.yaml
index e481b7c430a411968cd1b24f8948c9c3e1be2e0a..3e421978577d50c1f7def6b71d5e37b787d9d534 100644
--- a/configs/animeganv2_pretrain.yaml
+++ b/configs/animeganv2_pretrain.yaml
@@ -1,7 +1,5 @@
epochs: 2
output_dir: output_dir
-con_weight: 1
-pretrain_ckpt: null
model:
name: AnimeGANV2PreTrainModel
@@ -9,7 +7,11 @@ model:
name: AnimeGenerator
discriminator:
name: AnimeDiscriminator
- gan_mode: lsgan
+ gan_criterion:
+ name: GANLoss
+ gan_mode: lsgan
+ con_weight: 1
+ pretrain_ckpt: null
dataset:
train:
@@ -18,8 +20,6 @@ dataset:
batch_size: 4
dataroot: data/animedataset
style: Hayao
- phase: train
- direction: AtoB
transform_real:
- name: Transpose
- name: Normalize
@@ -42,30 +42,41 @@ dataset:
test:
name: SingleDataset
dataroot: data/animedataset/test/test_photo
- max_dataset_size: inf
- direction: BtoA
- input_nc: 3
- output_nc: 3
- serial_batches: False
- pool_size: 50
- transforms:
- - name: Resize
- size: [256, 256]
- interpolation: "bicubic" #cv2.INTER_CUBIC
- - name: Transpose
- - name: Normalize
- mean: [127.5, 127.5, 127.5]
- std: [127.5, 127.5, 127.5]
-
-optimizer:
- name: Adam
- beta1: 0.5
+ preprocess:
+ - name: LoadImageFromFile
+ key: A
+ - name: Transforms
+ input_keys: [A]
+ pipeline:
+ - name: ResizeToScale
+ size: [256, 256]
+ scale: 32
+ interpolation: bilinear
+ - name: Transpose
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+ keys: [image, image]
lr_scheduler:
- name: linear
+ name: LinearDecay
learning_rate: 0.0002
start_epoch: 100
decay_epochs: 100
+  # will be computed from the real dataset at runtime
+ iters_per_epoch: 1
+
+optimizer:
+ optimizer_G:
+ name: Adam
+ net_names:
+ - netG
+ beta1: 0.5
+ optimizer_D:
+ name: Adam
+ net_names:
+ - netD
+ beta1: 0.5
log_config:
interval: 100
diff --git a/configs/aotgan.yaml b/configs/aotgan.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a78bde3fd9bc79cd3d6514403f750344085de232
--- /dev/null
+++ b/configs/aotgan.yaml
@@ -0,0 +1,71 @@
+total_iters: 1000000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+epochs: 5
+
+model:
+ name: AOTGANModel
+ generator:
+ name: InpaintGenerator
+ rates: [1, 2, 4, 8]
+ block_num: 8
+ discriminator:
+ name: Discriminator
+ inc: 3
+ criterion:
+ name: AOTGANCriterionLoss
+ pretrained: https://paddlegan.bj.bcebos.com/models/vgg19feats.pdparams
+ l1_weight: 1
+ perceptual_weight: 1
+ style_weight: 250
+ adversal_weight: 0.01
+ img_size: 512
+
+dataset:
+ train:
+ name: AOTGANDataset
+ dataset_path: data/aotgan
+ batch_size: 8 # Multi-Card:4
+ img_size: 512
+ test:
+ name: AOTGANDataset_test
+ dataset_path: data/aotgan
+ batch_size: 1
+ img_size: 512
+
+lr_scheduler: # abandoned
+ name: MultiStepDecay
+ learning_rate: 0.0001
+ milestones: [990000]
+ gamma: 0.1
+
+optimizer:
+ lr: 0.0001
+ optimG:
+ name: Adam
+ net_names:
+ - netG
+ beta1: 0.5
+ beta2: 0.999
+ optimD:
+ name: Adam
+ net_names:
+ - netD
+ beta1: 0.5
+ beta2: 0.999
+
+log_config:
+ interval: 100
+ visiual_interval: 100
+
+snapshot_config:
+ interval: 1000
+
+predict:
+ name: AOTGANGenerator
+ rates: [1, 2, 4, 8]
+ block_num: 8
+ img_size: 512
+
+export_model:
+ - {name: 'net_gen', inputs_num: 1}
diff --git a/configs/basicvsr++_reds.yaml b/configs/basicvsr++_reds.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f10e8e05ffb13392324f7167dd2a73cb890fad49
--- /dev/null
+++ b/configs/basicvsr++_reds.yaml
@@ -0,0 +1,121 @@
+total_iters: 600000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+use_dataset: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: BasicVSRModel
+ fix_iter: 5000
+ lr_mult: 0.25
+ generator:
+ name: BasicVSRPlusPlus
+ mid_channels: 64
+ num_blocks: 7
+ is_low_res_input: True
+ pixel_criterion:
+ name: CharbonnierLoss
+ reduction: mean
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 4
+ batch_size: 2 #4 gpus
+ dataset:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 30
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+ test:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 100
+ test_mode: True
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 1e-4
+ periods: [600000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+  # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 10
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/basicvsr++_vimeo90k_BD.yaml b/configs/basicvsr++_vimeo90k_BD.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..95c1634dfe6e5ceaa8a11b85350fa4c6b8257188
--- /dev/null
+++ b/configs/basicvsr++_vimeo90k_BD.yaml
@@ -0,0 +1,122 @@
+total_iters: 600000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: BasicVSRModel
+ fix_iter: 5000
+ lr_mult: 0.25
+ generator:
+ name: BasicVSRPlusPlus
+ mid_channels: 64
+ num_blocks: 7
+ is_low_res_input: True
+ pixel_criterion:
+ name: CharbonnierLoss
+ reduction: mean
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 4
+ batch_size: 1 #4 gpus
+ dataset:
+ name: VSRVimeo90KDataset
+ # mode: train
+ lq_folder: data/vimeo90k/vimeo_septuplet_BD_matlabLRx4/sequences
+ gt_folder: data/vimeo90k/vimeo_septuplet/sequences
+ ann_file: data/vimeo90k/vimeo_septuplet/sep_trainlist.txt
+ preprocess:
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: MirrorVideoSequence
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+ test:
+ name: VSRFolderDataset
+ # for UDM10 dataset
+ # lq_folder: data/udm10/BDx4
+ # gt_folder: data/udm10/GT
+ lq_folder: data/Vid4/BDx4
+ gt_folder: data/Vid4/GT
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ # for UDM10 dataset
+ # filename_tmpl: '{:04d}.png'
+ filename_tmpl: '{:08d}.png'
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 1e-4
+ periods: [600000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+  # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: true
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: true
+
+log_config:
+ interval: 10
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/basicvsr_reds.yaml b/configs/basicvsr_reds.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f803a3cd83d84688df069b346033cce5cd5dedeb
--- /dev/null
+++ b/configs/basicvsr_reds.yaml
@@ -0,0 +1,124 @@
+total_iters: 300000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+use_dataset: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: BasicVSRModel
+ fix_iter: 5000
+ lr_mult: 0.125
+ generator:
+ name: BasicVSRNet
+ mid_channels: 64
+ num_blocks: 30
+ pixel_criterion:
+ name: CharbonnierLoss
+ reduction: mean
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 4
+ batch_size: 2 #4 gpus
+ dataset:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 15
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+ test:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 100
+ test_mode: True
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [300000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+  # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 100
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
diff --git a/configs/cond_dcgan_mnist.yaml b/configs/cond_dcgan_mnist.yaml
index 0c3aba181863b481ba44720d376973f761e855e3..e708396587b4a6271a7fe467f24613526468d880 100644
--- a/configs/cond_dcgan_mnist.yaml
+++ b/configs/cond_dcgan_mnist.yaml
@@ -18,46 +18,57 @@ model:
norm_type: batch
n_class: 10
use_sigmoid: True
- gan_mode: vanilla
+ gan_criterion:
+ name: GANLoss
+ gan_mode: vanilla
dataset:
train:
name: CommonVisionDataset
- class_name: MNIST
- dataroot: None
+ dataset_name: MNIST
num_workers: 4
batch_size: 64
- mode: train
- return_cls: True
+ return_label: True
transforms:
- name: Normalize
mean: [127.5]
std: [127.5]
keys: [image]
+ params:
+ mode: train
test:
name: CommonVisionDataset
- class_name: MNIST
- dataroot: None
+ dataset_name: MNIST
num_workers: 0
batch_size: 64
- mode: test
+ return_label: True
+ params:
+ mode: train
transforms:
- name: Normalize
mean: [127.5]
std: [127.5]
keys: [image]
- return_cls: True
-
-
-optimizer:
- name: Adam
- beta1: 0.5
lr_scheduler:
- name: linear
+ name: LinearDecay
learning_rate: 0.0002
start_epoch: 100
decay_epochs: 100
+  # will be computed from the real dataset at runtime
+ iters_per_epoch: 1
+
+optimizer:
+ optimizer_G:
+ name: Adam
+ net_names:
+ - netG
+ beta1: 0.5
+ optimizer_D:
+ name: Adam
+ net_names:
+ - netD
+ beta1: 0.5
log_config:
interval: 100
diff --git a/configs/cyclegan_cityscapes.yaml b/configs/cyclegan_cityscapes.yaml
index 4f5257bb6108d2c7191e84356b49e21796790704..e6265361368a55fb9084e9952da8dbe82c2189b7 100644
--- a/configs/cyclegan_cityscapes.yaml
+++ b/configs/cyclegan_cityscapes.yaml
@@ -1,5 +1,6 @@
epochs: 200
output_dir: output_dir
+find_unused_parameters: True
model:
name: CycleGANModel
diff --git a/configs/cyclegan_horse2zebra.yaml b/configs/cyclegan_horse2zebra.yaml
index 15e449e7607791794cdc9448025f0c62d6f041b1..8967500f1b65a8ed9bf5a21dd75cc56dd4980f9c 100644
--- a/configs/cyclegan_horse2zebra.yaml
+++ b/configs/cyclegan_horse2zebra.yaml
@@ -1,5 +1,6 @@
epochs: 200
output_dir: output_dir
+find_unused_parameters: True
model:
name: CycleGANModel
@@ -25,6 +26,12 @@ model:
gan_criterion:
name: GANLoss
gan_mode: lsgan
+ # training model under @to_static
+ to_static: False
+
+export_model:
+ - {name: 'netG_A', inputs_num: 1}
+ - {name: 'netG_B', inputs_num: 1}
dataset:
train:
@@ -114,3 +121,12 @@ log_config:
snapshot_config:
interval: 5
+
+validate:
+ interval: 30000
+ save_img: false
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 8
+
diff --git a/configs/dcgan_mnist.yaml b/configs/dcgan_mnist.yaml
index a423dd03657b6fb157cbc9a1d6d11edee06aaa6a..931c679907eb0e01bbad5ae17dcef53cf124c3df 100644
--- a/configs/dcgan_mnist.yaml
+++ b/configs/dcgan_mnist.yaml
@@ -21,44 +21,33 @@ model:
dataset:
train:
- name: SingleDataset
- dataroot: data/mnist/train
+ name: CommonVisionDataset
+ dataset_name: MNIST
+ num_workers: 0
batch_size: 128
- preprocess:
- - name: LoadImageFromFile
- key: A
- - name: Transfroms
- input_keys: [A]
- pipeline:
- - name: Resize
- size: [64, 64]
- interpolation: 'bicubic' #cv2.INTER_CUBIC
- keys: [image, image]
- - name: Transpose
- keys: [image, image]
- - name: Normalize
- mean: [127.5, 127.5, 127.5]
- std: [127.5, 127.5, 127.5]
- keys: [image, image]
+ return_label: False
+ transforms:
+ - name: Resize
+ size: [64, 64]
+ interpolation: 'bicubic' #cv2.INTER_CUBIC
+ - name: Normalize
+ mean: [127.5]
+ std: [127.5]
+ keys: [image]
test:
- name: SingleDataset
- dataroot: data/mnist/test
- preprocess:
- - name: LoadImageFromFile
- key: A
- - name: Transforms
- input_keys: [A]
- pipeline:
- - name: Resize
- size: [64, 64]
- interpolation: 'bicubic' #cv2.INTER_CUBIC
- keys: [image, image]
- - name: Transpose
- keys: [image, image]
- - name: Normalize
- mean: [127.5, 127.5, 127.5]
- std: [127.5, 127.5, 127.5]
- keys: [image, image]
+ name: CommonVisionDataset
+ dataset_name: MNIST
+ num_workers: 0
+ batch_size: 128
+ return_label: False
+ transforms:
+ - name: Resize
+ size: [64, 64]
+ interpolation: 'bicubic' #cv2.INTER_CUBIC
+ - name: Normalize
+ mean: [127.5]
+ std: [127.5]
+ keys: [image]
lr_scheduler:
name: LinearDecay
diff --git a/configs/drn_psnr_x4_div2k.yaml b/configs/drn_psnr_x4_div2k.yaml
index 836571ca2629b88a64cb57d99209e83387ee069c..6aa912b09581c47318e34157ea32ebffaab61456 100644
--- a/configs/drn_psnr_x4_div2k.yaml
+++ b/configs/drn_psnr_x4_div2k.yaml
@@ -53,8 +53,8 @@ dataset:
keys: [image, image, image]
test:
name: SRDataset
- gt_folder: data/DIV2K/val_set14/Set14
- lq_folder: data/DIV2K/val_set14/Set14_bicLRx4
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
scale: 4
preprocess:
- name: LoadImageFromFile
diff --git a/configs/edvr_l_blur_w_tsa.yaml b/configs/edvr_l_blur_w_tsa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b89440df66b76e4b55074147370e6ef581509a8a
--- /dev/null
+++ b/configs/edvr_l_blur_w_tsa.yaml
@@ -0,0 +1,132 @@
+total_iters: 600000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+find_unused_parameters: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: EDVRModel
+ tsa_iter: 50000
+ generator:
+ name: EDVRNet
+ in_nf: 3
+ out_nf: 3
+ scale_factor: 1
+ nf: 128
+ nframes: 5
+ groups: 8
+ front_RBs: 5
+ back_RBs: 40
+ center: 2
+ predeblur: True
+ HR_in: True
+ w_TSA: True
+ pixel_criterion:
+ name: CharbonnierLoss
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 6
+ batch_size: 8 #4 gpus
+ dataset:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/train_blur/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: False
+ preprocess:
+ - name: GetFrameIdx
+ interval_list: [1]
+ frames_per_clip: 99
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 1
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+
+ test:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/REDS4_test_blur/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: True
+ preprocess:
+ - name: GetFrameIdxwithPadding
+ padding: reflection_circle
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 4e-4
+ periods: [50000, 100000, 150000, 150000, 150000]
+ restart_weights: [1, 0.5, 0.5, 0.5, 0.5]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+  # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 10000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 50
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/edvr_l_blur_wo_tsa.yaml b/configs/edvr_l_blur_wo_tsa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a6b3ee43b19f764ddd3bd1cef0145d893de9f40a
--- /dev/null
+++ b/configs/edvr_l_blur_wo_tsa.yaml
@@ -0,0 +1,131 @@
+total_iters: 600000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: EDVRModel
+ tsa_iter: False
+ generator:
+ name: EDVRNet
+ in_nf: 3
+ out_nf: 3
+ scale_factor: 1
+ nf: 128
+ nframes: 5
+ groups: 8
+ front_RBs: 5
+ back_RBs: 40
+ center: 2
+ predeblur: True
+ HR_in: True
+ w_TSA: False
+ pixel_criterion:
+ name: CharbonnierLoss
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 6
+ batch_size: 8 #4 gpus
+ dataset:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/train_blur/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: False
+ preprocess:
+ - name: GetFrameIdx
+ interval_list: [1]
+ frames_per_clip: 99
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 1
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+
+ test:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/REDS4_test_blur/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: True
+ preprocess:
+ - name: GetFrameIdxwithPadding
+ padding: reflection_circle
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [150000, 150000, 150000, 150000]
+ restart_weights: [1, 0.5, 0.5, 0.5]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+  # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/edvr_l_w_tsa.yaml b/configs/edvr_l_w_tsa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..fa79dedb2a67d2c17efc7f6f646132d6a2baa57b
--- /dev/null
+++ b/configs/edvr_l_w_tsa.yaml
@@ -0,0 +1,132 @@
+total_iters: 600000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: EDVRModel
+ tsa_iter: 50000
+ generator:
+ name: EDVRNet
+ in_nf: 3
+ out_nf: 3
+ scale_factor: 4
+ nf: 128
+ nframes: 5
+ groups: 8
+ front_RBs: 5
+ back_RBs: 40
+ center: 2
+ predeblur: False
+ HR_in: False
+ w_TSA: True
+ pixel_criterion:
+ name: CharbonnierLoss
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 3
+ batch_size: 4 #8 gpus
+ dataset:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: False
+ preprocess:
+ - name: GetFrameIdx
+ interval_list: [1]
+ frames_per_clip: 99
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+
+ test:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: True
+ preprocess:
+ - name: GetFrameIdxwithPadding
+ padding: reflection_circle
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 4e-4
+ periods: [50000, 100000, 150000, 150000, 150000]
+ restart_weights: [1, 0.5, 0.5, 0.5, 0.5]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+  # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/edvr_l_wo_tsa.yaml b/configs/edvr_l_wo_tsa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..891c00914d1c5405db064fc7a1ce7b4af8c62ed6
--- /dev/null
+++ b/configs/edvr_l_wo_tsa.yaml
@@ -0,0 +1,131 @@
+total_iters: 600000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: EDVRModel
+ tsa_iter: 0
+ generator:
+ name: EDVRNet
+ in_nf: 3
+ out_nf: 3
+ scale_factor: 4
+ nf: 128
+ nframes: 5
+ groups: 8
+ front_RBs: 5
+ back_RBs: 40
+ center: 2
+ predeblur: False
+ HR_in: False
+ w_TSA: False
+ pixel_criterion:
+ name: CharbonnierLoss
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 3
+ batch_size: 4 #8 gpus
+ dataset:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: False
+ preprocess:
+ - name: GetFrameIdx
+ interval_list: [1]
+ frames_per_clip: 99
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+
+ test:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: True
+ preprocess:
+ - name: GetFrameIdxwithPadding
+ padding: reflection_circle
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 4e-4
+ periods: [150000, 150000, 150000, 150000]
+ restart_weights: [1, 0.5, 0.5, 0.5]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 10
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/edvr_m_w_tsa.yaml b/configs/edvr_m_w_tsa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ce14a40ac2c4daabf13e294f347b89c33cc0fb76
--- /dev/null
+++ b/configs/edvr_m_w_tsa.yaml
@@ -0,0 +1,135 @@
+total_iters: 600000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: EDVRModel
+ tsa_iter: 50000
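+ # presumably only the TSA fusion module is trained for the first tsa_iter iterations,
+ # following the original EDVR schedule when initializing from a w/o-TSA checkpoint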
+ generator:
+ name: EDVRNet
+ in_nf: 3
+ out_nf: 3
+ scale_factor: 4
+ nf: 64
+ nframes: 5
+ groups: 8
+ front_RBs: 5
+ back_RBs: 10
+ center: 2
+ predeblur: False
+ HR_in: False
+ w_TSA: True
+ pixel_criterion:
+ name: CharbonnierLoss
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 3
+ batch_size: 4 #8 gpus
+ dataset:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: False
+ preprocess:
+ - name: GetFrameIdx
+ interval_list: [1]
+ frames_per_clip: 99
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+
+ test:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: True
+ preprocess:
+ - name: GetFrameIdxwithPadding
+ padding: reflection_circle
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 4e-4
+ periods: [50000, 100000, 150000, 150000, 150000]
+ restart_weights: [1, 1, 1, 1, 1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/edvr_m_wo_tsa.yaml b/configs/edvr_m_wo_tsa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6a24f470bfdc049fa861d0cca008ac6343260ee5
--- /dev/null
+++ b/configs/edvr_m_wo_tsa.yaml
@@ -0,0 +1,136 @@
+total_iters: 600000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: EDVRModel
+ tsa_iter: 0
+ generator:
+ name: EDVRNet
+ in_nf: 3
+ out_nf: 3
+ scale_factor: 4
+ nf: 64
+ nframes: 5
+ groups: 8
+ front_RBs: 5
+ back_RBs: 10
+ center: 2
+ predeblur: False
+ HR_in: False
+ w_TSA: False
+ pixel_criterion:
+ name: CharbonnierLoss
+ # whether to train the model under @to_static (dynamic-to-static graph mode)
+ to_static: False
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 3
+ batch_size: 4 #8 gpus
+ dataset:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: False
+ preprocess:
+ - name: GetFrameIdx
+ interval_list: [1]
+ frames_per_clip: 99
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+
+ test:
+ name: VSRREDSDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 5
+ val_partition: REDS4
+ test_mode: True
+ preprocess:
+ - name: GetFrameIdxwithPadding
+ padding: reflection_circle
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 4e-4
+ periods: [150000, 150000, 150000, 150000]
+ restart_weights: [1, 1, 1, 1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 100
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/esrgan_psnr_x2_div2k.yaml b/configs/esrgan_psnr_x2_div2k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..dca1581385e6fdfea784dab2e9e37f7d2ba20758
--- /dev/null
+++ b/configs/esrgan_psnr_x2_div2k.yaml
@@ -0,0 +1,106 @@
+total_iters: 1000000
+output_dir: output_dir
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: BaseSRModel
+ generator:
+ name: RRDBNet
+ in_nc: 3
+ out_nc: 3
+ nf: 64
+ nb: 23
+ scale: 2
+ pixel_criterion:
+ name: L1Loss
+
+dataset:
+ train:
+ name: SRDataset
+ gt_folder: data/DIV2K/DIV2K_train_HR_sub
+ lq_folder: data/DIV2K/DIV2K_train_LR_bicubic/X2_sub
+ num_workers: 4
+ batch_size: 8
+ scale: 2
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 128
+ scale: 2
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [0., .0, 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+ test:
+ name: SRDataset
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx2
+ scale: 2
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [0., .0, 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: 0.0002
+ periods: [250000, 250000, 250000, 250000]
+ restart_weights: [1, 1, 1, 1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 2
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 2
+ test_y_channel: True
+
+log_config:
+ interval: 100
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/esrgan_psnr_x4_div2k.yaml b/configs/esrgan_psnr_x4_div2k.yaml
index 7dd7428f8fbda251f4cd5dc8dc0704e791e573ab..2f3504a80839235df8153426a3cf5030ec104ec0 100644
--- a/configs/esrgan_psnr_x4_div2k.yaml
+++ b/configs/esrgan_psnr_x4_div2k.yaml
@@ -14,6 +14,11 @@ model:
nb: 23
pixel_criterion:
name: L1Loss
+ # training the model under @to_static (dynamic-to-static graph mode)
+ to_static: False
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
dataset:
train:
@@ -44,13 +49,13 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
test:
name: SRDataset
- gt_folder: data/DIV2K/val_set14/Set14
- lq_folder: data/DIV2K/val_set14/Set14_bicLRx4
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
scale: 4
preprocess:
- name: LoadImageFromFile
@@ -63,7 +68,7 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
diff --git a/configs/esrgan_x4_div2k.yaml b/configs/esrgan_x4_div2k.yaml
index a44d8b0ed573e5f03adf3775ba52fe67d3924cb0..aa9e9ab8efd908e08a32d1ff23935eaadc1b8d5e 100644
--- a/configs/esrgan_x4_div2k.yaml
+++ b/configs/esrgan_x4_div2k.yaml
@@ -1,5 +1,6 @@
total_iters: 250000
output_dir: output_dir
+find_unused_parameters: True
# tensor range for function tensor2img
min_max:
(0., 1.)
@@ -31,6 +32,9 @@ model:
gan_mode: vanilla
loss_weight: !!float 5e-3
+export_model:
+ - {name: 'generator', inputs_num: 1}
+
dataset:
train:
name: SRDataset
@@ -60,13 +64,13 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
test:
name: SRDataset
- gt_folder: data/DIV2K/val_set14/Set14
- lq_folder: data/DIV2K/val_set14/Set14_bicLRx4
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
scale: 4
preprocess:
- name: LoadImageFromFile
@@ -79,7 +83,7 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
@@ -118,6 +122,8 @@ validate:
name: SSIM
crop_border: 4
test_y_channel: false
+ lpips:
+ name: LPIPSMetric
log_config:
interval: 100
diff --git a/configs/firstorder_fashion.yaml b/configs/firstorder_fashion.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..bf39368a8e7862c6e90fb80a8d75ba47ade857f3
--- /dev/null
+++ b/configs/firstorder_fashion.yaml
@@ -0,0 +1,106 @@
+epochs: 150
+output_dir: output_dir
+
+model:
+ name: FirstOrderModel
+ common_params:
+ num_kp: 10
+ num_channels: 3
+ estimate_jacobian: True
+ generator:
+ name: FirstOrderGenerator
+ kp_detector_cfg:
+ temperature: 0.1
+ block_expansion: 32
+ max_features: 1024
+ scale_factor: 0.25
+ num_blocks: 5
+ generator_cfg:
+ block_expansion: 64
+ max_features: 512
+ num_down_blocks: 2
+ num_bottleneck_blocks: 6
+ estimate_occlusion_map: True
+ dense_motion_params:
+ block_expansion: 64
+ max_features: 1024
+ num_blocks: 5
+ scale_factor: 0.25
+ discriminator:
+ name: FirstOrderDiscriminator
+ discriminator_cfg:
+ scales: [1]
+ block_expansion: 32
+ max_features: 512
+ num_blocks: 4
+ sn: True
+ train_params:
+ scales: [1, 0.5, 0.25, 0.125]
+ transform_params:
+ sigma_affine: 0.05
+ sigma_tps: 0.005
+ points_tps: 5
+ loss_weights:
+ generator_gan: 1
+ discriminator_gan: 1
+ feature_matching: [10, 10, 10, 10]
+ perceptual: [10, 10, 10, 10, 10]
+ equivariance_value: 10
+ equivariance_jacobian: 10
+
+optimizer:
+ name: Adam
+
+lr_scheduler:
+ epoch_milestones: [187500, 281250]
+ lr_generator: 2.0e-4
+ lr_discriminator: 2.0e-4
+ lr_kp_detector: 2.0e-4
+
+dataset:
+ train:
+ name: FirstOrderDataset
+ phase: train
+ dataroot: data/first_order/fashion/
+ num_repeats: 50
+ time_flip: True
+ batch_size: 8
+ id_sampling: False
+ frame_shape: [ 256, 256, 3 ]
+ process_time: False
+ create_frames_folder: False
+ num_workers: 4
+ max_dataset_size: inf
+ direction: BtoA
+ input_nc: 3
+ output_nc: 3
+ transforms:
+ - name: PairedRandomHorizontalFlip
+ prob: 0.5
+ keys: [image, image]
+ - name: PairedColorJitter
+ brightness: 0.1
+ contrast: 0.1
+ saturation: 0.1
+ hue: 0.1
+ keys: [image, image]
+ test:
+ name: FirstOrderDataset
+ dataroot: data/first_order/fashion/
+ phase: test
+ batch_size: 1
+ num_workers: 1
+ time_flip: False
+ id_sampling: False
+ create_frames_folder: False
+ frame_shape: [ 256, 256, 3 ]
+
+log_config:
+ interval: 10
+ visiual_interval: 10
+
+snapshot_config:
+ interval: 10
+
+validate:
+ interval: 31250
diff --git a/configs/firstorder_vox_256.yaml b/configs/firstorder_vox_256.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..299f0edf45590992361d4845742772aa7f393a94
--- /dev/null
+++ b/configs/firstorder_vox_256.yaml
@@ -0,0 +1,128 @@
+epochs: 100
+output_dir: output_dir
+
+dataset:
+ train:
+ name: FirstOrderDataset
+ batch_size: 8
+ num_workers: 8
+ use_shared_memory: False
+ phase: train
+ dataroot: data/first_order/Voxceleb/
+ frame_shape: [256, 256, 3]
+ id_sampling: True
+ pairs_list: None
+ time_flip: True
+ num_repeats: 75
+ create_frames_folder: False
+ transforms:
+ - name: PairedRandomHorizontalFlip
+ prob: 0.5
+ keys: [image, image]
+ - name: PairedColorJitter
+ brightness: 0.1
+ contrast: 0.1
+ saturation: 0.1
+ hue: 0.1
+ keys: [image, image]
+ test:
+ name: FirstOrderDataset
+ dataroot: data/first_order/Voxceleb/
+ phase: test
+ batch_size: 1
+ num_workers: 1
+ time_flip: False
+ id_sampling: False
+ create_frames_folder: False
+ frame_shape: [ 256, 256, 3 ]
+
+
+model:
+ name: FirstOrderModel
+ common_params:
+ num_kp: 10
+ num_channels: 3
+ estimate_jacobian: True
+ generator:
+ name: FirstOrderGenerator
+ kp_detector_cfg:
+ temperature: 0.1
+ block_expansion: 32
+ max_features: 1024
+ scale_factor: 0.25
+ num_blocks: 5
+ generator_cfg:
+ block_expansion: 64
+ max_features: 512
+ num_down_blocks: 2
+ num_bottleneck_blocks: 6
+ estimate_occlusion_map: True
+ dense_motion_params:
+ block_expansion: 64
+ max_features: 1024
+ num_blocks: 5
+ scale_factor: 0.25
+ discriminator:
+ name: FirstOrderDiscriminator
+ discriminator_cfg:
+ scales: [1]
+ block_expansion: 32
+ max_features: 512
+ num_blocks: 4
+ sn: True
+ train_params:
+ num_epochs: 100
+ scales: [1, 0.5, 0.25, 0.125]
+ checkpoint_freq: 50
+ transform_params:
+ sigma_affine: 0.05
+ sigma_tps: 0.005
+ points_tps: 5
+ loss_weights:
+ generator_gan: 0
+ discriminator_gan: 1
+ feature_matching: [10, 10, 10, 10]
+ perceptual: [10, 10, 10, 10, 10]
+ equivariance_value: 10
+ equivariance_jacobian: 10
+
+lr_scheduler:
+ name: MultiStepDecay
+ epoch_milestones: [237360, 356040]
+ lr_generator: 2.0e-4
+ lr_discriminator: 2.0e-4
+ lr_kp_detector: 2.0e-4
+
+reconstruction_params:
+ num_videos: 1000
+ format: '.mp4'
+
+animate_params:
+ num_pairs: 50
+ format: '.mp4'
+ normalization_params:
+ adapt_movement_scale: False
+ use_relative_movement: True
+ use_relative_jacobian: True
+
+visualizer_params:
+ kp_size: 5
+ draw_border: True
+ colormap: 'gist_rainbow'
+
+log_config:
+ interval: 10
+ visiual_interval: 10
+
+validate:
+ interval: 3000
+ save_img: false
+
+snapshot_config:
+ interval: 10
+
+optimizer:
+ name: Adam
+
+export_model:
+ - {}
diff --git a/configs/firstorder_vox_mobile_256.yaml b/configs/firstorder_vox_mobile_256.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ca5457a9f6c017122873a9e42770ddeff131fd06
--- /dev/null
+++ b/configs/firstorder_vox_mobile_256.yaml
@@ -0,0 +1,152 @@
+epochs: 100
+output_dir: output_dir
+
+dataset:
+ train:
+ name: FirstOrderDataset
+ batch_size: 8
+ num_workers: 4
+ use_shared_memory: False
+ phase: train
+ dataroot: data/first_order/Voxceleb/
+ frame_shape: [256, 256, 3]
+ id_sampling: True
+ pairs_list: None
+ time_flip: True
+ num_repeats: 75
+ create_frames_folder: False
+ transforms:
+ - name: PairedRandomHorizontalFlip
+ prob: 0.5
+ keys: [image, image]
+ - name: PairedColorJitter
+ brightness: 0.1
+ contrast: 0.1
+ saturation: 0.1
+ hue: 0.1
+ keys: [image, image]
+ test:
+ name: FirstOrderDataset
+ dataroot: data/first_order/Voxceleb/
+ phase: test
+ batch_size: 1
+ num_workers: 1
+ time_flip: False
+ id_sampling: False
+ create_frames_folder: False
+ frame_shape: [ 256, 256, 3 ]
+
+
+model:
+ name: FirstOrderModelMobile
+ mode: generator # one of: kp_detector, generator, both
+ kp_weight_path: None
+ gen_weight_path: None
+ common_params:
+ num_kp: 10
+ num_channels: 3
+ estimate_jacobian: True
+ generator:
+ name: FirstOrderGenerator
+ kp_detector_cfg:
+ temperature: 0.1
+ block_expansion: 32
+ max_features: 256
+ scale_factor: 0.25
+ num_blocks: 5
+ mobile_net: True
+ generator_cfg:
+ block_expansion: 32
+ max_features: 256
+ num_down_blocks: 2
+ num_bottleneck_blocks: 6
+ estimate_occlusion_map: True
+ dense_motion_params:
+ block_expansion: 32
+ max_features: 256
+ num_blocks: 5
+ scale_factor: 0.25
+ mobile_net: True
+ generator_ori:
+ name: FirstOrderGenerator
+ kp_detector_cfg:
+ temperature: 0.1
+ block_expansion: 32
+ max_features: 1024
+ scale_factor: 0.25
+ num_blocks: 5
+ generator_cfg:
+ block_expansion: 64
+ max_features: 512
+ num_down_blocks: 2
+ num_bottleneck_blocks: 6
+ estimate_occlusion_map: True
+ dense_motion_params:
+ block_expansion: 64
+ max_features: 1024
+ num_blocks: 5
+ scale_factor: 0.25
+ discriminator:
+ name: FirstOrderDiscriminator
+ discriminator_cfg:
+ scales: [1]
+ block_expansion: 32
+ max_features: 512
+ num_blocks: 4
+ sn: True
+ train_params:
+ num_epochs: 100
+ scales: [1, 0.5, 0.25, 0.125]
+ checkpoint_freq: 50
+ transform_params:
+ sigma_affine: 0.05
+ sigma_tps: 0.005
+ points_tps: 5
+ loss_weights:
+ generator_gan: 1
+ discriminator_gan: 1
+ feature_matching: [10, 10, 10, 10]
+ perceptual: [10, 10, 10, 10, 10]
+ equivariance_value: 10
+ equivariance_jacobian: 10
+
+lr_scheduler:
+ name: MultiStepDecay
+ epoch_milestones: [2000000000000] # effectively keeps the learning rate fixed; change to [237360, 356040] when fine-tuning in "both" mode
+ lr_generator: 2.0e-4
+ lr_discriminator: 2.0e-4
+ lr_kp_detector: 2.0e-4
+
+reconstruction_params:
+ num_videos: 1000
+ format: '.mp4'
+
+animate_params:
+ num_pairs: 50
+ format: '.mp4'
+ normalization_params:
+ adapt_movement_scale: False
+ use_relative_movement: True
+ use_relative_jacobian: True
+
+visualizer_params:
+ kp_size: 5
+ draw_border: True
+ colormap: 'gist_rainbow'
+
+log_config:
+ interval: 10
+ visiual_interval: 10
+
+validate:
+ interval: 20000000000 # validation is effectively disabled to speed up training; change to 20000 to monitor progress
+ save_img: true
+
+snapshot_config:
+ interval: 5
+
+optimizer:
+ name: Adam
+
+export_model:
+ - {}
diff --git a/configs/gfpgan_ffhq1024.yaml b/configs/gfpgan_ffhq1024.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7c3ca9e2742af37a340c929244277e146391e1c7
--- /dev/null
+++ b/configs/gfpgan_ffhq1024.yaml
@@ -0,0 +1,207 @@
+total_iters: 800000
+output_dir: output
+find_unused_parameters: True
+
+log_config:
+ interval: 100
+ visiual_interval: 100
+
+snapshot_config:
+ interval: 30000
+
+enable_visualdl: False
+
+validate:
+ interval: 5000
+ save_img: True
+
+ metrics:
+ psnr:
+ name: PSNR
+ crop_border: 0
+ test_y_channel: false
+ fid:
+ name: FID
+ batch_size: 8
+model:
+ name: GFPGANModel
+ network_g:
+ name: GFPGANv1
+ out_size: 512
+ num_style_feat: 512
+ channel_multiplier: 1
+ resample_kernel: [1, 3, 3, 1]
+ decoder_load_path: https://paddlegan.bj.bcebos.com/models/StyleGAN2_FFHQ512_Cmul1.pdparams
+ fix_decoder: true
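+ # the StyleGAN2 decoder is loaded from decoder_load_path and kept frozen while fix_decoder is true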
+ num_mlp: 8
+ lr_mlp: 0.01
+ input_is_latent: true
+ different_w: true
+ narrow: 1
+ sft_half: true
+ network_d:
+ name: StyleGAN2DiscriminatorGFPGAN
+ out_size: 512
+ channel_multiplier: 1
+ resample_kernel: [1, 3, 3, 1]
+ network_d_left_eye:
+ type: FacialComponentDiscriminator
+
+ network_d_right_eye:
+ type: FacialComponentDiscriminator
+
+ network_d_mouth:
+ type: FacialComponentDiscriminator
+
+ network_identity:
+ name: ResNetArcFace
+ block: IRBlock
+ layers: [2, 2, 2, 2]
+ use_se: False
+
+ path:
+ image_visual: gfpgan_train_outdir
+ pretrain_network_g: ~
+ param_key_g: params_ema
+ strict_load_g: ~
+ pretrain_network_d: ~
+ pretrain_network_d_left_eye: https://paddlegan.bj.bcebos.com/models/Facial_component_discriminator.pdparams
+ pretrain_network_d_right_eye: https://paddlegan.bj.bcebos.com/models/Facial_component_discriminator.pdparams
+ pretrain_network_d_mouth: https://paddlegan.bj.bcebos.com/models/Facial_component_discriminator.pdparams
+ pretrain_network_identity: https://paddlegan.bj.bcebos.com/models/arcface_resnet18.pdparams
+
+
+ # losses
+ # pixel loss
+ pixel_opt:
+ name: GFPGANL1Loss
+ loss_weight: !!float 1e-1
+ reduction: mean
+ # L1 loss used in pyramid loss, component style loss and identity loss
+ L1_opt:
+ name: GFPGANL1Loss
+ loss_weight: 1
+ reduction: mean
+
+ # image pyramid loss
+ pyramid_loss_weight: 1
+ remove_pyramid_loss: 50000
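+ # the image pyramid loss is dropped after remove_pyramid_loss iterations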
+ # perceptual loss (content and style losses)
+ perceptual_opt:
+ name: GFPGANPerceptualLoss
+ layer_weights:
+ # before relu
+ "conv1_2": 0.1
+ "conv2_2": 0.1
+ "conv3_4": 1
+ "conv4_4": 1
+ "conv5_4": 1
+ vgg_type: vgg19
+ use_input_norm: true
+ perceptual_weight: !!float 1
+ style_weight: 50
+ range_norm: true
+ criterion: l1
+ # gan loss
+ gan_opt:
+ name: GFPGANGANLoss
+ gan_type: wgan_softplus
+ loss_weight: !!float 1e-1
+ # r1 regularization for discriminator
+ r1_reg_weight: 10
+ # facial component loss
+ gan_component_opt:
+ name: GFPGANGANLoss
+ gan_type: vanilla
+ real_label_val: 1.0
+ fake_label_val: 0.0
+ loss_weight: !!float 1
+ comp_style_weight: 200
+ # identity loss
+ identity_weight: 10
+
+ net_d_iters: 1
+ net_d_init_iters: 0
+ net_d_reg_every: 16
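+ # StyleGAN2-style lazy regularization: r1_reg_weight is applied once every net_d_reg_every discriminator steps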
+
+export_model:
+ - { name: "net_g_ema", inputs_num: 1 }
+
+dataset:
+ train:
+ name: FFHQDegradationDataset
+ dataroot_gt: data/gfpgan_data/train
+ io_backend:
+ type: disk
+
+ use_hflip: true
+ mean: [0.5, 0.5, 0.5]
+ std: [0.5, 0.5, 0.5]
+ out_size: 512
+
+ blur_kernel_size: 41
+ kernel_list: ["iso", "aniso"]
+ kernel_prob: [0.5, 0.5]
+ blur_sigma: [0.1, 10]
+ downsample_range: [0.8, 8]
+ noise_range: [0, 20]
+ jpeg_range: [60, 100]
+
+ # color jitter and gray
+ color_jitter_prob: 0.3
+ color_jitter_shift: 20
+ color_jitter_pt_prob: 0.3
+ gray_prob: 0.01
+
+ # If you do not want colorization, please set
+ # color_jitter_prob: ~
+ # color_jitter_pt_prob: ~
+ # gray_prob: 0.01
+ # gt_gray: True
+
+ crop_components: true
+ component_path: https://paddlegan.bj.bcebos.com/models/FFHQ_eye_mouth_landmarks_512.pdparams
+ eye_enlarge_ratio: 1.4
+
+ # data loader
+ use_shuffle: true
+ # TODO: fix out-of-memory during validation while training
+ num_workers: 0
+ batch_size: 1
+ prefetch_mode: ~
+
+ test:
+ # Modify this block to use your own validation set,
+ # or comment it out if validation is not needed during training
+ name: PairedImageDataset
+ dataroot_lq: data/gfpgan_data/lq
+ dataroot_gt: data/gfpgan_data/gt
+ io_backend:
+ type: disk
+ mean: [0.5, 0.5, 0.5]
+ std: [0.5, 0.5, 0.5]
+ scale: 1
+ # TODO: fix out-of-memory during validation while training
+ num_workers: 0
+ batch_size: 8
+ phase: val
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 0.002
+ milestones: [600000, 700000]
+ gamma: 0.5
+
+optimizer:
+ optim_g:
+ name: Adam
+ beta1: 0
+ beta2: 0.99
+ optim_d:
+ name: Adam
+ beta1: 0
+ beta2: 0.99
+ optim_component:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.99
diff --git a/configs/gpen_256_ffhq.yaml b/configs/gpen_256_ffhq.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..bead76a995becfe788380f7b65d2c3bc7ae58f19
--- /dev/null
+++ b/configs/gpen_256_ffhq.yaml
@@ -0,0 +1,77 @@
+total_iters: 200000
+output_dir: output_dir
+find_unused_parameters: True
+
+
+model:
+ name: GPENModel
+ generator:
+ name: GPENGenerator
+ size: 256
+ style_dim: 512
+ n_mlp: 8
+ channel_multiplier: 1
+ narrow: 0.5
+ discriminator:
+ name: GPENDiscriminator
+ size: 256
+ channel_multiplier: 1
+ narrow: 0.5
+
+
+export_model:
+ - {name: 'g_ema', inputs_num: 1}
+
+dataset:
+ train:
+ name: GPENDataset
+ dataroot: data/ffhq/images256x256/
+ num_workers: 0
+ batch_size: 2 #1gpus
+ size: 256
+
+ test:
+ name: GPENDataset
+ dataroot: data/ffhq/images256x256/
+ num_workers: 0
+ batch_size: 1
+ size: 256
+ amount: 100
+
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: 0.002
+ periods: [500000, 500000, 500000, 500000]
+ restart_weights: [1, 1, 1, 1]
+ eta_min: 0.002
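+ # note: eta_min equals the base learning_rate, so the cosine schedule keeps the learning rate effectively constant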
+
+
+optimizer:
+ optimG:
+ name: Adam
+ net_names:
+ - netG
+ beta1: 0.9
+ beta2: 0.99
+ optimD:
+ name: Adam
+ net_names:
+ - netD
+ beta1: 0.9
+ beta2: 0.99
+
+log_config:
+ interval: 100
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
+
+validate:
+ interval: 5000
+ save_img: false
+ metrics:
+ fid:
+ name: FID
+ batch_size: 1
diff --git a/configs/iconvsr_reds.yaml b/configs/iconvsr_reds.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c3f8262b225f7561b41c3efab4288ab69783ca0b
--- /dev/null
+++ b/configs/iconvsr_reds.yaml
@@ -0,0 +1,120 @@
+total_iters: 300000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+use_dataset: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: BasicVSRModel
+ fix_iter: 5000
+ lr_mult: 0.125
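+ # presumably the flow estimator is frozen for the first fix_iter iterations and afterwards
+ # trained with its learning rate scaled by lr_mult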
+ generator:
+ name: IconVSR
+ mid_channels: 64
+ num_blocks: 30
+ pixel_criterion:
+ name: CharbonnierLoss
+ reduction: mean
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 4
+ batch_size: 2 #4 gpus
+ dataset:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 15
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+ test:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 100
+ test_mode: True
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [300000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: False
+
+log_config:
+ interval: 100
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/invdn_denoising.yaml b/configs/invdn_denoising.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ba228b660b09f91b399792704dfdb60ae704ae1b
--- /dev/null
+++ b/configs/invdn_denoising.yaml
@@ -0,0 +1,77 @@
+total_iters: 150000
+output_dir: output_dir
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: InvDNModel
+ generator:
+ name: InvDN
+ channel_in: 3
+ channel_out: 3
+ block_num: [8, 8]
+ scale: 4
+ down_num: 2
+
+dataset:
+ train:
+ name: InvDNDataset
+ # TODO: fix out-of-memory during validation while training
+ num_workers: 0
+ batch_size: 14 # 4 GPUs
+ opt:
+ phase: train
+ scale: 4
+ crop_size: 144
+ train_dir: data/SIDD_Medium_Srgb_Patches_512/train/
+ test:
+ name: InvDNDataset
+ # TODO: fix out-of-memory during validation while training
+ num_workers: 0
+ batch_size: 1
+ opt:
+ phase: test
+ scale: 4
+ val_dir: data/SIDD_Valid_Srgb_Patches_256/valid/
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 8e-4 # num_gpu * 2e-4
+ milestones: [25000, 50000, 75000, 100000, 125000, 135000, 145000]
+ gamma: 0.5
+
+validate:
+ interval: 500
+ save_img: True
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: True
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+ epsilon: 1e-8
+ clip_grad_norm: 10
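+ # gradients are clipped to a global norm of 10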
+
+log_config:
+ interval: 100
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 500
diff --git a/configs/lapstyle_draft.yaml b/configs/lapstyle_draft.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..46c4a3f23b875e0fee9711a8743188bc51290886
--- /dev/null
+++ b/configs/lapstyle_draft.yaml
@@ -0,0 +1,67 @@
+total_iters: 30000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+min_max:
+ (0., 1.)
+
+model:
+ name: LapStyleDraModel
+ generator_encode:
+ name: Encoder
+ generator_decode:
+ name: DecoderNet
+ calc_style_emd_loss:
+ name: CalcStyleEmdLoss
+ calc_content_relt_loss:
+ name: CalcContentReltLoss
+ calc_content_loss:
+ name: CalcContentLoss
+ calc_style_loss:
+ name: CalcStyleLoss
+ content_layers: ['r11', 'r21', 'r31', 'r41', 'r51']
+ style_layers: ['r11', 'r21', 'r31', 'r41', 'r51']
+ content_weight: 1.0
+ style_weight: 3.0
+
+
+dataset:
+ train:
+ name: LapStyleDataset
+ content_root: data/coco/train2017/
+ style_root: data/starrynew.png
+ load_size: 136
+ crop_size: 128
+ num_workers: 16
+ batch_size: 5 #1 GPUs
+ test:
+ name: LapStyleDataset
+ content_root: data/coco/test2017/
+ style_root: data/starrynew.png
+ load_size: 128
+ crop_size: 128
+ num_workers: 0
+ batch_size: 1
+
+lr_scheduler:
+ name: NonLinearDecay
+ learning_rate: 1e-4
+ lr_decay: 5e-5
+
+optimizer:
+ optimG:
+ name: Adam
+ net_names:
+ - net_dec
+ beta1: 0.9
+ beta2: 0.999
+
+validate:
+ interval: 500
+ save_img: false
+
+log_config:
+ interval: 10
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/lapstyle_rev_first.yaml b/configs/lapstyle_rev_first.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e0158dea002d6ea27450a4a5ae2bcc0f20847293
--- /dev/null
+++ b/configs/lapstyle_rev_first.yaml
@@ -0,0 +1,79 @@
+total_iters: 30000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+min_max:
+ (0., 1.)
+
+model:
+ name: LapStyleRevFirstModel
+ revnet_generator:
+ name: RevisionNet
+ revnet_discriminator:
+ name: LapStyleDiscriminator
+ draftnet_encode:
+ name: Encoder
+ draftnet_decode:
+ name: DecoderNet
+ calc_style_emd_loss:
+ name: CalcStyleEmdLoss
+ calc_content_relt_loss:
+ name: CalcContentReltLoss
+ calc_content_loss:
+ name: CalcContentLoss
+ calc_style_loss:
+ name: CalcStyleLoss
+ gan_criterion:
+ name: GANLoss
+ gan_mode: vanilla
+ content_layers: ['r11', 'r21', 'r31', 'r41', 'r51']
+ style_layers: ['r11', 'r21', 'r31', 'r41', 'r51']
+ content_weight: 1.0
+ style_weight: 3.0
+
+dataset:
+ train:
+ name: LapStyleDataset
+ content_root: data/coco/train2017/
+ style_root: data/starrynew.png
+ load_size: 280
+ crop_size: 256
+ num_workers: 16
+ batch_size: 5 #1 GPUs
+ test:
+ name: LapStyleDataset
+ content_root: data/coco/test2017/
+ style_root: data/starrynew.png
+ load_size: 256
+ crop_size: 256
+ num_workers: 0
+ batch_size: 1
+
+lr_scheduler:
+ name: NonLinearDecay
+ learning_rate: 1e-4
+ lr_decay: 5e-5
+
+optimizer:
+ optimG:
+ name: Adam
+ net_names:
+ - net_rev
+ beta1: 0.9
+ beta2: 0.999
+ optimD:
+ name: Adam
+ net_names:
+ - netD
+ beta1: 0.9
+ beta2: 0.999
+
+validate:
+ interval: 500
+ save_img: false
+
+log_config:
+ interval: 10
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/lapstyle_rev_second.yaml b/configs/lapstyle_rev_second.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..167ecb7708de60a9c4a34487041ec51158073716
--- /dev/null
+++ b/configs/lapstyle_rev_second.yaml
@@ -0,0 +1,79 @@
+total_iters: 30000
+output_dir: output_dir
+checkpoints_dir: checkpoints
+min_max:
+ (0., 1.)
+
+model:
+ name: LapStyleRevSecondModel
+ revnet_generator:
+ name: RevisionNet
+ revnet_discriminator:
+ name: LapStyleDiscriminator
+ draftnet_encode:
+ name: Encoder
+ draftnet_decode:
+ name: DecoderNet
+ calc_style_emd_loss:
+ name: CalcStyleEmdLoss
+ calc_content_relt_loss:
+ name: CalcContentReltLoss
+ calc_content_loss:
+ name: CalcContentLoss
+ calc_style_loss:
+ name: CalcStyleLoss
+ gan_criterion:
+ name: GANLoss
+ gan_mode: vanilla
+ content_layers: ['r11', 'r21', 'r31', 'r41', 'r51']
+ style_layers: ['r11', 'r21', 'r31', 'r41', 'r51']
+ content_weight: 1.0
+ style_weight: 3.0
+
+dataset:
+ train:
+ name: LapStyleDataset
+ content_root: data/coco/train2017/
+ style_root: data/starrynew.png
+ load_size: 540
+ crop_size: 512
+ num_workers: 16
+ batch_size: 2 #1 GPUs
+ test:
+ name: LapStyleDataset
+ content_root: data/coco/test2017/
+ style_root: data/starrynew.png
+ load_size: 512
+ crop_size: 512
+ num_workers: 0
+ batch_size: 1
+
+lr_scheduler:
+ name: NonLinearDecay
+ learning_rate: 1e-4
+ lr_decay: 5e-5
+
+optimizer:
+ optimG:
+ name: Adam
+ net_names:
+ - net_rev_2
+ beta1: 0.9
+ beta2: 0.999
+ optimD:
+ name: Adam
+ net_names:
+ - netD
+ beta1: 0.9
+ beta2: 0.999
+
+validate:
+ interval: 500
+ save_img: false
+
+log_config:
+ interval: 10
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/lesrcnn_psnr_x4_div2k.yaml b/configs/lesrcnn_psnr_x4_div2k.yaml
index e9759e06126fa367030c100e0abfd5e21b118e51..6591be2361dbd99a7ce61b8380164efa570fee65 100644
--- a/configs/lesrcnn_psnr_x4_div2k.yaml
+++ b/configs/lesrcnn_psnr_x4_div2k.yaml
@@ -40,13 +40,13 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
test:
name: SRDataset
- gt_folder: data/DIV2K/val_set14/Set14
- lq_folder: data/DIV2K/val_set14/Set14_bicLRx4
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
scale: 4
preprocess:
- name: LoadImageFromFile
@@ -59,7 +59,7 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
diff --git a/configs/makeup.yaml b/configs/makeup.yaml
index 1d68f52ef3751ed55ef4f31e2d92b37eefaa7a90..05723e02b4c96c460e18affbb8774b36c5c6b532 100644
--- a/configs/makeup.yaml
+++ b/configs/makeup.yaml
@@ -1,6 +1,7 @@
epochs: 100
output_dir: tmp
checkpoints_dir: checkpoints
+find_unused_parameters: True
model:
name: MakeupModel
diff --git a/configs/mprnet_deblurring.yaml b/configs/mprnet_deblurring.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..64775364a8156866a2e4f7fe397ba74cfad5765e
--- /dev/null
+++ b/configs/mprnet_deblurring.yaml
@@ -0,0 +1,70 @@
+# roughly 3000 epochs at a total batch size of 16
+total_iters: 400000
+output_dir: output_dir
+
+model:
+ name: MPRModel
+ generator:
+ name: MPRNet
+ n_feat: 96
+ scale_unetfeats: 48
+ scale_orsnetfeats: 32
+
+ char_criterion:
+ name: CharbonnierLoss
+ edge_criterion:
+ name: EdgeLoss
+
+dataset:
+ train:
+ name: MPRTrain
+ rgb_dir: data/GoPro/train
+ num_workers: 4
+ batch_size: 2 # 8GPUs
+ img_options:
+ patch_size: 256
+ test:
+ name: MPRVal
+ rgb_dir: data/GoPro/test
+ num_workers: 1
+ batch_size: 1
+ img_options:
+ patch_size: 256
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 1e-4
+ periods: [400000]
+ restart_weights: [1]
+ eta_min: !!float 1e-6
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: false
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: false
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.999
+ epsilon: 1e-8
+
+log_config:
+ interval: 100
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/mprnet_denoising.yaml b/configs/mprnet_denoising.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a6160972dafb32a85fdd4d6c7976347d932bd1c2
--- /dev/null
+++ b/configs/mprnet_denoising.yaml
@@ -0,0 +1,69 @@
+total_iters: 100000
+output_dir: output_dir
+
+model:
+ name: MPRModel
+ generator:
+ name: MPRNet
+ n_feat: 80
+ scale_unetfeats: 48
+ scale_orsnetfeats: 32
+
+ char_criterion:
+ name: CharbonnierLoss
+ edge_criterion:
+ name: EdgeLoss
+
+dataset:
+ train:
+ name: MPRTrain
+ rgb_dir: data/SIDD/train
+ num_workers: 16
+ batch_size: 4 # 4GPUs
+ img_options:
+ patch_size: 256
+ test:
+ name: MPRTrain
+ rgb_dir: data/SIDD/val
+ num_workers: 1
+ batch_size: 1
+ img_options:
+ patch_size: 256
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [25000, 25000, 25000, 25000]
+ restart_weights: [1, 1, 1, 1]
+ eta_min: !!float 1e-6
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: True
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.999
+ epsilon: 1e-8
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/mprnet_deraining.yaml b/configs/mprnet_deraining.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..94de4dfbe01aac2b2f2daff72cd2b01032ad766b
--- /dev/null
+++ b/configs/mprnet_deraining.yaml
@@ -0,0 +1,69 @@
+total_iters: 100000
+output_dir: output_dir
+
+model:
+ name: MPRModel
+ generator:
+ name: MPRNet
+ n_feat: 40
+ scale_unetfeats: 20
+ scale_orsnetfeats: 16
+
+ char_criterion:
+ name: CharbonnierLoss
+ edge_criterion:
+ name: EdgeLoss
+
+dataset:
+ train:
+ name: MPRTrain
+ rgb_dir: data/Synthetic_Rain_Datasets/train
+ num_workers: 16
+ batch_size: 4 # 4GPUs
+ img_options:
+ patch_size: 256
+ test:
+ name: MPRTrain
+ rgb_dir: data/Synthetic_Rain_Datasets/test/Rain100L
+ num_workers: 1
+ batch_size: 1
+ img_options:
+ patch_size: 256
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [25000, 25000, 25000, 25000]
+ restart_weights: [1, 1, 1, 1]
+ eta_min: !!float 1e-6
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: True
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.999
+ epsilon: 1e-8
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/msvsr_l_reds.yaml b/configs/msvsr_l_reds.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7d42fa52e4743849e9d83e7cf13718511aa08bd7
--- /dev/null
+++ b/configs/msvsr_l_reds.yaml
@@ -0,0 +1,129 @@
+total_iters: 300000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+use_dataset: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: MultiStageVSRModel
+ fix_iter: 2500
+ generator:
+ name: MSVSR
+ mid_channels: 64
+ num_init_blocks: 5
+ num_blocks: 7
+ num_reconstruction_blocks: 5
+ only_last: False
+ use_tiny_spynet: False
+ deform_groups: 8
+ stage1_groups: 8
+ auxiliary_loss: True
+ use_refine_align: True
+ aux_reconstruction_blocks: 2
+ use_local_connnect: True
+ pixel_criterion:
+ name: CharbonnierLoss
+ reduction: mean
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 4
+ batch_size: 2 #8 gpus
+ dataset:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 30
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+ test:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 100
+ test_mode: True
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [300000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: false
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: false
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/msvsr_reds.yaml b/configs/msvsr_reds.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..78e0f58b62cf766cb9328ed0a1e4c58d902a6131
--- /dev/null
+++ b/configs/msvsr_reds.yaml
@@ -0,0 +1,134 @@
+total_iters: 150000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+use_dataset: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: MultiStageVSRModel
+ fix_iter: 2500
+ generator:
+ name: MSVSR
+ mid_channels: 32
+ num_init_blocks: 2
+ num_blocks: 3
+ num_reconstruction_blocks: 2
+ only_last: True
+ use_tiny_spynet: True
+ deform_groups: 4
+ stage1_groups: 8
+ auxiliary_loss: True
+ use_refine_align: True
+ aux_reconstruction_blocks: 1
+ use_local_connnect: True
+ pixel_criterion:
+ name: CharbonnierLoss
+ reduction: mean
+ # whether to train the model under @to_static (dynamic-to-static graph mode)
+ to_static: False
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 6
+ batch_size: 2 #8 gpus
+ dataset:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/train_sharp_bicubic/X4
+ gt_folder: data/REDS/train_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 20
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+ test:
+ name: VSRREDSMultipleGTDataset
+ lq_folder: data/REDS/REDS4_test_sharp_bicubic/X4
+ gt_folder: data/REDS/REDS4_test_sharp/X4
+ ann_file: data/REDS/meta_info_REDS_GT.txt
+ num_frames: 100
+ test_mode: True
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [150000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: false
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: false
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
diff --git a/configs/msvsr_vimeo90k_BD.yaml b/configs/msvsr_vimeo90k_BD.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a1d0f86bd40e3644003a28bcd81e4ea0c51c12ec
--- /dev/null
+++ b/configs/msvsr_vimeo90k_BD.yaml
@@ -0,0 +1,130 @@
+total_iters: 300000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+use_dataset: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: MultiStageVSRModel
+ fix_iter: -1
+ generator:
+ name: MSVSR
+ mid_channels: 32
+ num_init_blocks: 2
+ num_blocks: 3
+ num_reconstruction_blocks: 2
+ only_last: True
+ use_tiny_spynet: True
+ deform_groups: 4
+ stage1_groups: 8
+ auxiliary_loss: True
+ use_refine_align: True
+ aux_reconstruction_blocks: 1
+ use_local_connnect: True
+ pixel_criterion:
+ name: CharbonnierLoss
+ reduction: mean
+
+dataset:
+ train:
+ name: RepeatDataset
+ times: 1000
+ num_workers: 4
+ batch_size: 2 #8 gpus
+ dataset:
+ name: VSRVimeo90KDataset
+ lq_folder: data/vimeo90k/vimeo_septuplet_BD_matlabLRx4/sequences
+ gt_folder: data/vimeo90k/vimeo_septuplet/sequences
+ ann_file: data/vimeo90k/vimeo_septuplet/sep_trainlist.txt
+ preprocess:
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: MirrorVideoSequence
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+ test:
+ name: VSRFolderDataset
+ # for udm10 dataset
+ # lq_folder: data/udm10/BDx4
+ # gt_folder: data/udm10/GT
+ lq_folder: data/Vid4/BDx4
+ gt_folder: data/Vid4/GT
+ preprocess:
+ - name: GetNeighboringFramesIdx
+ interval_list: [1]
+ # for udm10 dataset
+ # filename_tmpl: '{:04d}.png'
+ filename_tmpl: '{:08d}.png'
+ - name: ReadImageSequence
+ key: lq
+ - name: ReadImageSequence
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: TransposeSequence
+ keys: [image, image]
+ - name: NormalizeSequence
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 2e-4
+ periods: [300000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 2500
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: true
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: true
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 2500
diff --git a/configs/nafnet_denoising.yaml b/configs/nafnet_denoising.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..08a3a37819d819b50fb79b5b635e4a4e6e5f43d5
--- /dev/null
+++ b/configs/nafnet_denoising.yaml
@@ -0,0 +1,74 @@
+total_iters: 400000
+output_dir: output_dir
+
+model:
+ name: NAFNetModel
+ generator:
+ name: NAFNet
+ img_channel: 3
+ width: 64
+ enc_blk_nums: [2, 2, 4, 8]
+ middle_blk_num: 12
+ dec_blk_nums: [2, 2, 2, 2]
+ psnr_criterion:
+ name: PSNRLoss
+
+dataset:
+ train:
+ name: NAFNetTrain
+ rgb_dir: data/SIDD/train
+ # TODO: fix out-of-memory during validation while training
+ num_workers: 0
+ batch_size: 8 # 8GPU
+ img_options:
+ patch_size: 256
+ test:
+ name: NAFNetVal
+ rgb_dir: data/SIDD/val
+ # TODO: fix out-of-memory during validation while training
+ num_workers: 0
+ batch_size: 1
+ img_options:
+ patch_size: 256
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: 0.001
+ periods: [400000]
+ restart_weights: [1]
+ eta_min: !!float 8e-7
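+ # a single cosine cycle decays the learning rate from 0.001 to 8e-7 over all 400000 iterations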
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: True
+
+optimizer:
+ name: AdamW
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ weight_decay: 0.0
+ beta1: 0.9
+ beta2: 0.9
+ epsilon: 1e-8
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/pan_psnr_x4_div2k.yaml b/configs/pan_psnr_x4_div2k.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2b848ea68a8305aae7102d7f9cdccd21adc7331e
--- /dev/null
+++ b/configs/pan_psnr_x4_div2k.yaml
@@ -0,0 +1,109 @@
+total_iters: 1000000
+output_dir: output_dir
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: BaseSRModel
+ generator:
+ name: PAN
+ in_nc: 3
+ out_nc: 3
+ nf: 40
+ unf: 24
+ nb: 16
+ scale: 4
+ pixel_criterion:
+ name: L1Loss
+ use_init_weight: True
+
+dataset:
+ train:
+ name: SRDataset
+ gt_folder: data/DIV2K/DIV2K_train_HR_sub
+ lq_folder: data/DIV2K/DIV2K_train_LR_bicubic/X4_sub
+ num_workers: 6
+ batch_size: 32 #1 GPU
+ use_shared_memory: False
+ scale: 4
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 256
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+ test:
+ name: SRDataset
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
+ scale: 4
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [0., 0., 0.]
+ std: [255., 255., 255.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: !!float 7e-4
+ periods: [250000, 250000, 250000, 250000]
+ restart_weights: [1, 1, 1, 1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of the nets in net_names to the optimizer
+ # each name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: False
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: False
+
+log_config:
+ interval: 100
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 5000
diff --git a/configs/photopen.yaml b/configs/photopen.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1e916bd1bb71d0a1bac48abd8560cc9bc8d92461
--- /dev/null
+++ b/configs/photopen.yaml
@@ -0,0 +1,96 @@
+total_iters: 1
+output_dir: output_dir
+checkpoints_dir: checkpoints
+
+model:
+ name: PhotoPenModel
+ generator:
+ name: SPADEGenerator
+ ngf: 24
+ num_upsampling_layers: normal
+ crop_size: 256
+ aspect_ratio: 1.0
+ norm_G: spectralspadebatch3x3
+ semantic_nc: 14
+ use_vae: False
+ nef: 16
+ discriminator:
+ name: MultiscaleDiscriminator
+ ndf: 128
+ num_D: 4
+ crop_size: 256
+ label_nc: 12
+ output_nc: 3
+ contain_dontcare_label: True
+ no_instance: False
+ n_layers_D: 6
+ criterion:
+ name: PhotoPenPerceptualLoss
+ crop_size: 224
+ lambda_vgg: 1.6
+ label_nc: 12
+ contain_dontcare_label: True
+ batchSize: 1
+ crop_size: 256
+ lambda_feat: 10.0
+
+dataset:
+ train:
+ name: PhotoPenDataset
+ content_root: test/coco_stuff
+ load_size: 286
+ crop_size: 256
+ num_workers: 0
+ batch_size: 1
+ test:
+ name: PhotoPenDataset_test
+ content_root: test/coco_stuff
+ load_size: 286
+ crop_size: 256
+ num_workers: 0
+ batch_size: 1
+
+lr_scheduler: # abandoned
+ name: LinearDecay
+ learning_rate: 0.0001
+ start_epoch: 99999
+ decay_epochs: 99999
+ # will be derived from the real dataset
+ iters_per_epoch: 1
+
+optimizer:
+ lr: 0.0001
+ optimG:
+ name: Adam
+ net_names:
+ - net_gen
+ beta1: 0.9
+ beta2: 0.999
+ optimD:
+ name: Adam
+ net_names:
+ - net_des
+ beta1: 0.9
+ beta2: 0.999
+
+log_config:
+ interval: 1
+ visiual_interval: 1
+
+snapshot_config:
+ interval: 1
+
+predict:
+ name: SPADEGenerator
+ ngf: 24
+ num_upsampling_layers: normal
+ crop_size: 256
+ aspect_ratio: 1.0
+ norm_G: spectralspadebatch3x3
+ semantic_nc: 14
+ use_vae: False
+ nef: 16
+ contain_dontcare_label: True
+ label_nc: 12
+ batchSize: 1
+
\ No newline at end of file
diff --git a/configs/pix2pix_cityscapes.yaml b/configs/pix2pix_cityscapes.yaml
index 5a24315b60f8547d4ecf14a708554b4c05d78680..ba237d050cac8e066da9e9d3410e8751ba504b3b 100644
--- a/configs/pix2pix_cityscapes.yaml
+++ b/configs/pix2pix_cityscapes.yaml
@@ -25,6 +25,9 @@ model:
name: GANLoss
gan_mode: vanilla
+export_model:
+ - {name: 'netG', inputs_num: 1}
+
dataset:
train:
name: PairedDataset
@@ -58,7 +61,7 @@ dataset:
keys: [image, image]
test:
name: PairedDataset
- dataroot: data/cityscapes/test
+ dataroot: data/cityscapes/val
num_workers: 4
batch_size: 1
preprocess:
@@ -107,3 +110,11 @@ log_config:
snapshot_config:
interval: 5
+
+validate:
+ interval: 29750
+ save_img: false
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 8
diff --git a/configs/pix2pix_cityscapes_2gpus.yaml b/configs/pix2pix_cityscapes_2gpus.yaml
index 6686cfedca2efdc0a55791c828b27f107cd814a5..824f0c487aa389e1e9ef54903854caa630174935 100644
--- a/configs/pix2pix_cityscapes_2gpus.yaml
+++ b/configs/pix2pix_cityscapes_2gpus.yaml
@@ -58,13 +58,16 @@ dataset:
keys: [image, image]
test:
name: PairedDataset
- dataroot: data/cityscapes/test
+ dataroot: data/cityscapes/val
num_workers: 4
batch_size: 1
preprocess:
- name: LoadImageFromFile
key: pair
- - name: Transforms
+ - name: SplitPairedImage
+ key: pair
+ paired_keys: [A, B]
+ - name: Transforms
input_keys: [A, B]
pipeline:
- name: Resize
@@ -104,3 +107,11 @@ log_config:
snapshot_config:
interval: 5
+
+validate:
+ interval: 29750
+ save_img: false
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 8
diff --git a/configs/pix2pix_facades.yaml b/configs/pix2pix_facades.yaml
index c7ed53754992052ecf708be7ed2346d6ba070e3a..ec30b188dad8863b48461d1207d008b4ad08199c 100644
--- a/configs/pix2pix_facades.yaml
+++ b/configs/pix2pix_facades.yaml
@@ -24,6 +24,8 @@ model:
gan_criterion:
name: GANLoss
gan_mode: vanilla
+ # training model under @to_static
+ to_static: False
dataset:
train:
@@ -64,7 +66,10 @@ dataset:
preprocess:
- name: LoadImageFromFile
key: pair
- - name: Transforms
+ - name: SplitPairedImage
+ key: pair
+ paired_keys: [A, B]
+ - name: Transforms
input_keys: [A, B]
pipeline:
- name: Resize
@@ -104,3 +109,14 @@ log_config:
snapshot_config:
interval: 5
+
+validate:
+ interval: 4000
+ save_img: false
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 8
+
+export_model:
+ - {name: 'netG', inputs_num: 1}
diff --git a/configs/prenet.yaml b/configs/prenet.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2a4e31ee7bd643181fefa17572935b42431f2304
--- /dev/null
+++ b/configs/prenet.yaml
@@ -0,0 +1,98 @@
+total_iters: 300000
+output_dir: output_dir
+find_unused_parameters: True
+checkpoints_dir: checkpoints
+use_dataset: True
+# tensor range for function tensor2img
+min_max:
+ (0., 1.)
+
+model:
+ name: PReNetModel
+ generator:
+ name: PReNet
+ pixel_criterion:
+ name: SSIM
+
+dataset:
+ train:
+ name: SRDataset
+ gt_folder: data/RainH/RainTrainH/norain
+ lq_folder: data/RainH/RainTrainH/rain
+ num_workers: 4
+ batch_size: 16
+ scale: 1
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: PairedRandomCrop
+ size: [100, 100]
+ keys: [image, image]
+ - name: PairedToTensor
+ keys: [image, image]
+ test:
+ name: SRDataset
+ gt_folder: data/RainH/Rain100H/norain
+ lq_folder: data/RainH/Rain100H/rain
+ scale: 1
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: PairedToTensor
+ keys: [image, image]
+
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 0.0013
+ milestones: [36000,60000,96000]
+ gamma: 0.2
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+ # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 5000
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 0
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 0
+ test_y_channel: True
+
+log_config:
+ interval: 100
+ visiual_interval: 500
+
+snapshot_config:
+ interval: 5000
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
diff --git a/configs/rcan_rssr_x4.yaml b/configs/rcan_rssr_x4.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9be1561832c674d6c163223ea918f49817e94f20
--- /dev/null
+++ b/configs/rcan_rssr_x4.yaml
@@ -0,0 +1,104 @@
+total_iters: 1000000
+output_dir: output_dir
+# tensor range for function tensor2img
+min_max:
+ (0., 255.)
+
+model:
+ name: RCANModel
+ generator:
+ name: RCAN
+ scale: 4
+ n_resgroups: 10
+ n_resblocks: 20
+ pixel_criterion:
+ name: L1Loss
+
+dataset:
+ train:
+ name: SRDataset
+ gt_folder: data/DIV2K/DIV2K_train_HR_sub
+ lq_folder: data/DIV2K/DIV2K_train_LR_bicubic/X4_sub
+ num_workers: 4
+ batch_size: 16
+ scale: 4
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: SRPairedRandomCrop
+ gt_patch_size: 192
+ scale: 4
+ keys: [image, image]
+ - name: PairedRandomHorizontalFlip
+ keys: [image, image]
+ - name: PairedRandomVerticalFlip
+ keys: [image, image]
+ - name: PairedRandomTransposeHW
+ keys: [image, image]
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [0., 0., 0.]
+ std: [1., 1., 1.]
+ keys: [image, image]
+ test:
+ name: SRDataset
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
+ scale: 4
+ preprocess:
+ - name: LoadImageFromFile
+ key: lq
+ - name: LoadImageFromFile
+ key: gt
+ - name: Transforms
+ input_keys: [lq, gt]
+ pipeline:
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [0., 0., 0.]
+ std: [1., 1., 1.]
+ keys: [image, image]
+
+lr_scheduler:
+ name: CosineAnnealingRestartLR
+ learning_rate: 0.0001
+ periods: [1000000]
+ restart_weights: [1]
+ eta_min: !!float 1e-7
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+ # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.99
+
+validate:
+ interval: 2500
+ save_img: false
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: True
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 2500
diff --git a/configs/realsr_bicubic_noise_x4_df2k.yaml b/configs/realsr_bicubic_noise_x4_df2k.yaml
index d8069885d5278ba27b6b3aaaba0a5eec81068274..0a19753e519ab14b7207805f97c7847cfd75df99 100644
--- a/configs/realsr_bicubic_noise_x4_df2k.yaml
+++ b/configs/realsr_bicubic_noise_x4_df2k.yaml
@@ -1,5 +1,6 @@
total_iters: 60000
output_dir: output_dir
+find_unused_parameters: True
# tensor range for function tensor2img
min_max:
(0., 1.)
@@ -34,8 +35,8 @@ model:
dataset:
train:
name: SRDataset
- gt_folder: data/realsr_preprocess/DF2K/generated/tdsr/HR_sub/
- lq_folder: data/realsr_preprocess/DF2K/generated/tdsr/LR_sub/
+ gt_folder: data/DF2K/generated/tdsr/HR/
+ lq_folder: data/DF2K/generated/tdsr/LR/
num_workers: 4
batch_size: 16
scale: 4
@@ -60,17 +61,17 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
- name: SRNoise
- noise_path: data/realsr_preprocess/DF2K/Corrupted_noise/
+ noise_path: data/DF2K/Corrupted_noise/
size: 32
keys: [image]
test:
name: SRDataset
- gt_folder: data/DIV2K/val_set14/Set14
- lq_folder: data/DIV2K/val_set14/Set14_bicLRx4
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
scale: 4
preprocess:
- name: LoadImageFromFile
@@ -83,7 +84,7 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
diff --git a/configs/realsr_kernel_noise_x4_dped.yaml b/configs/realsr_kernel_noise_x4_dped.yaml
index e3655639d9632f978422fa3f7884ff5c39f391ac..ba2851e71060824787953bdf6e94df5da0561b89 100644
--- a/configs/realsr_kernel_noise_x4_dped.yaml
+++ b/configs/realsr_kernel_noise_x4_dped.yaml
@@ -1,5 +1,6 @@
total_iters: 60000
output_dir: output_dir
+find_unused_parameters: True
# tensor range for function tensor2img
min_max:
(0., 1.)
@@ -34,8 +35,8 @@ model:
dataset:
train:
name: SRDataset
- gt_folder: data/realsr_preprocess/DPED/generated/clean/train_tdsr/HR/
- lq_folder: data/realsr_preprocess/DPED/generated/clean/train_tdsr/LR/
+ gt_folder: data/DPED/generated/clean/train_tdsr/HR/
+ lq_folder: data/DPED/generated/clean/train_tdsr/LR/
num_workers: 4
batch_size: 16
scale: 4
@@ -60,17 +61,17 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
- name: SRNoise
- noise_path: data/realsr_preprocess/DPED/DPEDiphone_noise/
+ noise_path: data/DPED/DPEDiphone_noise/
size: 32
keys: [image]
test:
name: SRDataset
- gt_folder: data/DIV2K/val_set14/Set14
- lq_folder: data/DIV2K/val_set14/Set14_bicLRx4
+ gt_folder: data/Set14/GTmod12
+ lq_folder: data/Set14/LRbicx4
scale: 4
preprocess:
- name: LoadImageFromFile
@@ -83,7 +84,7 @@ dataset:
- name: Transpose
keys: [image, image]
- name: Normalize
- mean: [0., .0, 0.]
+ mean: [0., 0., 0.]
std: [255., 255., 255.]
keys: [image, image]
diff --git a/configs/singan_animation.yaml b/configs/singan_animation.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..e12b69f297751acc1835257658e1a560735e3730
--- /dev/null
+++ b/configs/singan_animation.yaml
@@ -0,0 +1,79 @@
+total_iters: 100000
+output_dir: output_dir
+export_model: null
+
+model:
+ name: SinGANModel
+ generator:
+ name: SinGANGenerator
+ nfc_init: 32
+ min_nfc_init: 32
+ noise_zero_pad: False
+ discriminator:
+ name: SinGANDiscriminator
+ nfc_init: 32
+ min_nfc_init: 32
+ gan_criterion:
+ name: GANLoss
+ gan_mode: wgangp
+ loss_weight: 1.0
+ recon_criterion:
+ name: MSELoss
+ loss_weight: 10.0
+ gp_criterion:
+ name: GradientPenalty
+ loss_weight: 0.1
+ train_image: data/singan/stone.png
+ scale_factor: 0.75
+ min_size: 25
+ is_finetune: False
+
+dataset:
+ train:
+ name: EmptyDataset
+ test:
+ name: SingleDataset
+ dataroot: data/singan
+ num_workers: 0
+ batch_size: 1
+ preprocess:
+ - name: LoadImageFromFile
+ key: A
+ - name: Transforms
+ input_keys: [A]
+ pipeline:
+ - name: Transpose
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 0.0005
+ milestones: [9600]
+ gamma: 0.1
+
+optimizer:
+ optimizer_G:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+ optimizer_D:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+
+log_config:
+ interval: 100
+ visiual_interval: 2000
+
+snapshot_config:
+ interval: 10000
+
+validate:
+ interval: -1
+ save_img: True
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 1
diff --git a/configs/singan_finetune.yaml b/configs/singan_finetune.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..0a86559719c81d5704bbe70b379481245b127850
--- /dev/null
+++ b/configs/singan_finetune.yaml
@@ -0,0 +1,72 @@
+total_iters: 12000
+output_dir: output_dir
+
+model:
+ name: SinGANModel
+ generator:
+ name: SinGANGenerator
+ nfc_init: 32
+ min_nfc_init: 32
+ noise_zero_pad: True
+ discriminator:
+ name: SinGANDiscriminator
+ nfc_init: 32
+ min_nfc_init: 32
+ gan_criterion:
+ name: GANLoss
+ gan_mode: wgangp
+ loss_weight: 1.0
+ recon_criterion:
+ name: MSELoss
+ loss_weight: 10.0
+ gp_criterion:
+ name: GradientPenalty
+ loss_weight: 0.1
+ train_image: data/singan/stone.png
+ scale_factor: 0.75
+ min_size: 25
+ is_finetune: True
+ finetune_scale: 1
+ color_num: 5
+
+dataset:
+ train:
+ name: EmptyDataset
+ test:
+ name: SingleDataset
+ dataroot: data/singan
+ num_workers: 0
+ batch_size: 1
+ preprocess:
+ - name: LoadImageFromFile
+ key: A
+ - name: Transforms
+ input_keys: [A]
+ pipeline:
+ - name: Transpose
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 0.0005
+ milestones: [9600]
+ gamma: 0.1
+
+optimizer:
+ optimizer_G:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+ optimizer_D:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+
+log_config:
+ interval: 100
+ visiual_interval: 2000
+
+snapshot_config:
+ interval: 4000
diff --git a/configs/singan_sr.yaml b/configs/singan_sr.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..00751ef06548bf46337af0da1fbbfabc6484a003
--- /dev/null
+++ b/configs/singan_sr.yaml
@@ -0,0 +1,79 @@
+total_iters: 100000
+output_dir: output_dir
+export_model: null
+
+model:
+ name: SinGANModel
+ generator:
+ name: SinGANGenerator
+ nfc_init: 32
+ min_nfc_init: 32
+ noise_zero_pad: True
+ discriminator:
+ name: SinGANDiscriminator
+ nfc_init: 32
+ min_nfc_init: 32
+ gan_criterion:
+ name: GANLoss
+ gan_mode: wgangp
+ loss_weight: 1.0
+ recon_criterion:
+ name: MSELoss
+ loss_weight: 100.0
+ gp_criterion:
+ name: GradientPenalty
+ loss_weight: 0.1
+ train_image: data/singan/stone.png
+ scale_factor: 0.793701 # (1/2)^(1/3)
+ min_size: 18
+ is_finetune: False
+
+dataset:
+ train:
+ name: EmptyDataset
+ test:
+ name: SingleDataset
+ dataroot: data/singan
+ num_workers: 0
+ batch_size: 1
+ preprocess:
+ - name: LoadImageFromFile
+ key: A
+ - name: Transforms
+ input_keys: [A]
+ pipeline:
+ - name: Transpose
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 0.0005
+ milestones: [9600]
+ gamma: 0.1
+
+optimizer:
+ optimizer_G:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+ optimizer_D:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+
+log_config:
+ interval: 100
+ visiual_interval: 2000
+
+snapshot_config:
+ interval: 10000
+
+validate:
+ interval: -1
+ save_img: True
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 1
diff --git a/configs/singan_universal.yaml b/configs/singan_universal.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..6e50b7582e988a78cc78830599cbdbef90fa448f
--- /dev/null
+++ b/configs/singan_universal.yaml
@@ -0,0 +1,79 @@
+total_iters: 100000
+output_dir: output_dir
+export_model: null
+
+model:
+ name: SinGANModel
+ generator:
+ name: SinGANGenerator
+ nfc_init: 32
+ min_nfc_init: 32
+ noise_zero_pad: True
+ discriminator:
+ name: SinGANDiscriminator
+ nfc_init: 32
+ min_nfc_init: 32
+ gan_criterion:
+ name: GANLoss
+ gan_mode: wgangp
+ loss_weight: 1.0
+ recon_criterion:
+ name: MSELoss
+ loss_weight: 10.0
+ gp_criterion:
+ name: GradientPenalty
+ loss_weight: 0.1
+ train_image: data/singan/stone.png
+ scale_factor: 0.75
+ min_size: 25
+ is_finetune: False
+
+dataset:
+ train:
+ name: EmptyDataset
+ test:
+ name: SingleDataset
+ dataroot: data/singan
+ num_workers: 0
+ batch_size: 1
+ preprocess:
+ - name: LoadImageFromFile
+ key: A
+ - name: Transforms
+ input_keys: [A]
+ pipeline:
+ - name: Transpose
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 0.0005
+ milestones: [9600]
+ gamma: 0.1
+
+optimizer:
+ optimizer_G:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+ optimizer_D:
+ name: Adam
+ beta1: 0.5
+ beta2: 0.999
+
+log_config:
+ interval: 100
+ visiual_interval: 2000
+
+snapshot_config:
+ interval: 10000
+
+validate:
+ interval: -1
+ save_img: True
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 1
diff --git a/configs/starganv2_afhq.yaml b/configs/starganv2_afhq.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ca8d54b32af9afc2509a6a4d0499d2bb419a3eef
--- /dev/null
+++ b/configs/starganv2_afhq.yaml
@@ -0,0 +1,141 @@
+epochs: 200
+output_dir: output_dir
+
+model:
+ name: StarGANv2Model
+ latent_dim: &LATENT_DIM 16
+ lambda_sty: 1
+ lambda_ds: 2
+ lambda_cyc: 1
+ generator:
+ name: StarGANv2Generator
+ img_size: &IMAGE_SIZE 256
+ w_hpf: 0
+ style_dim: &STYLE_DIM 64
+ style:
+ name: StarGANv2Style
+ img_size: *IMAGE_SIZE
+ style_dim: *STYLE_DIM
+ num_domains: &NUM_DOMAINS 3
+ mapping:
+ name: StarGANv2Mapping
+ latent_dim: *LATENT_DIM
+ style_dim: *STYLE_DIM
+ num_domains: *NUM_DOMAINS
+ discriminator:
+ name: StarGANv2Discriminator
+ img_size: *IMAGE_SIZE
+ num_domains: *NUM_DOMAINS
+
+dataset:
+ train:
+ name: StarGANv2Dataset
+ dataroot: data/stargan-v2/afhq/train
+ is_train: True
+ num_workers: 8
+ batch_size: 4
+ preprocess:
+ - name: LoadImageFromFile
+ key: src
+ - name: LoadImageFromFile
+ key: ref
+ - name: LoadImageFromFile
+ key: ref2
+ - name: Transforms
+ input_keys: [src, ref, ref2]
+ pipeline:
+ - name: RandomResizedCropProb
+ prob: 0.9
+ size: [*IMAGE_SIZE, *IMAGE_SIZE]
+ scale: [0.8, 1.0]
+ ratio: [0.9, 1.1]
+ interpolation: 'bilinear'
+ keys: [image, image, image]
+ - name: Resize
+ size: [*IMAGE_SIZE, *IMAGE_SIZE]
+ interpolation: 'bilinear'
+ keys: [image, image, image]
+ - name: RandomHorizontalFlip
+ prob: 0.5
+ keys: [image, image, image]
+ - name: Transpose
+ keys: [image, image, image]
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+ keys: [image, image, image]
+
+ test:
+ name: StarGANv2Dataset
+ dataroot: data/stargan-v2/afhq/val
+ is_train: False
+ num_workers: 8
+ batch_size: 16
+ test_count: 16
+ preprocess:
+ - name: LoadImageFromFile
+ key: src
+ - name: LoadImageFromFile
+ key: ref
+ - name: Transforms
+ input_keys: [src, ref]
+ pipeline:
+ - name: Resize
+ size: [*IMAGE_SIZE, *IMAGE_SIZE]
+ interpolation: 'bicubic' #cv2.INTER_CUBIC
+ keys: [image, image]
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+ keys: [image, image]
+
+lr_scheduler:
+ name: LinearDecay
+ learning_rate: 0.0001
+ start_epoch: 100
+ decay_epochs: 100
+ # will get from real dataset
+ iters_per_epoch: 365
+
+optimizer:
+ generator:
+ name: Adam
+ net_names:
+ - generator
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+ style_encoder:
+ name: Adam
+ net_names:
+ - style_encoder
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+ mapping_network:
+ name: Adam
+ net_names:
+ - mapping_network
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+ discriminator:
+ name: Adam
+ net_names:
+ - discriminator
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+
+validate:
+ interval: 3000
+ save_img: false
+
+log_config:
+ interval: 100
+ visiual_interval: 3000
+
+snapshot_config:
+ interval: 5
diff --git a/configs/starganv2_celeba_hq.yaml b/configs/starganv2_celeba_hq.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8519d2cb216f2db4a02a4b982b79b8f6ee79c24c
--- /dev/null
+++ b/configs/starganv2_celeba_hq.yaml
@@ -0,0 +1,144 @@
+epochs: 200
+output_dir: output_dir
+
+model:
+ name: StarGANv2Model
+ latent_dim: &LATENT_DIM 16
+ lambda_sty: 1
+ lambda_ds: 1
+ lambda_cyc: 1
+ generator:
+ name: StarGANv2Generator
+ img_size: &IMAGE_SIZE 256
+ w_hpf: 1
+ style_dim: &STYLE_DIM 64
+ style:
+ name: StarGANv2Style
+ img_size: *IMAGE_SIZE
+ style_dim: *STYLE_DIM
+ num_domains: &NUM_DOMAINS 2
+ mapping:
+ name: StarGANv2Mapping
+ latent_dim: *LATENT_DIM
+ style_dim: *STYLE_DIM
+ num_domains: *NUM_DOMAINS
+ fan:
+ name: FAN
+ fname_pretrained: None
+ discriminator:
+ name: StarGANv2Discriminator
+ img_size: *IMAGE_SIZE
+ num_domains: *NUM_DOMAINS
+
+dataset:
+ train:
+ name: StarGANv2Dataset
+ dataroot: data/stargan-v2/celeba_hq/train/
+ is_train: True
+ num_workers: 8
+ batch_size: 4
+ preprocess:
+ - name: LoadImageFromFile
+ key: src
+ - name: LoadImageFromFile
+ key: ref
+ - name: LoadImageFromFile
+ key: ref2
+ - name: Transforms
+ input_keys: [src, ref, ref2]
+ pipeline:
+ - name: RandomResizedCropProb
+ prob: 0.9
+ size: [*IMAGE_SIZE, *IMAGE_SIZE]
+ scale: [0.8, 1.0]
+ ratio: [0.9, 1.1]
+ interpolation: 'bilinear'
+ keys: [image, image, image]
+ - name: Resize
+ size: [*IMAGE_SIZE, *IMAGE_SIZE]
+ interpolation: 'bilinear'
+ keys: [image, image, image]
+ - name: RandomHorizontalFlip
+ prob: 0.5
+ keys: [image, image, image]
+ - name: Transpose
+ keys: [image, image, image]
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+ keys: [image, image, image]
+
+ test:
+ name: StarGANv2Dataset
+ dataroot: data/stargan-v2/celeba_hq/val/
+ is_train: False
+ num_workers: 8
+ batch_size: 16
+ test_count: 16
+ preprocess:
+ - name: LoadImageFromFile
+ key: src
+ - name: LoadImageFromFile
+ key: ref
+ - name: Transforms
+ input_keys: [src, ref]
+ pipeline:
+ - name: Resize
+ size: [*IMAGE_SIZE, *IMAGE_SIZE]
+ interpolation: 'bicubic' #cv2.INTER_CUBIC
+ keys: [image, image]
+ - name: Transpose
+ keys: [image, image]
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
+ keys: [image, image]
+
+lr_scheduler:
+ name: LinearDecay
+ learning_rate: 0.0001
+ start_epoch: 100
+ decay_epochs: 100
+ # will get from real dataset
+ iters_per_epoch: 365
+
+optimizer:
+ generator:
+ name: Adam
+ net_names:
+ - generator
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+ style_encoder:
+ name: Adam
+ net_names:
+ - style_encoder
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+ mapping_network:
+ name: Adam
+ net_names:
+ - mapping_network
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+ discriminator:
+ name: Adam
+ net_names:
+ - discriminator
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 0.0001
+
+validate:
+ interval: 3000
+ save_img: false
+
+log_config:
+ interval: 100
+ visiual_interval: 3000
+
+snapshot_config:
+ interval: 5
diff --git a/configs/stylegan_v2_256_ffhq.yaml b/configs/stylegan_v2_256_ffhq.yaml
index d87268c97d174e7c457230ad71e7cb87928c5984..32c23c3d119315fbd8a065f9d5bb2730b486fd42 100644
--- a/configs/stylegan_v2_256_ffhq.yaml
+++ b/configs/stylegan_v2_256_ffhq.yaml
@@ -23,6 +23,10 @@ model:
params:
gen_iters: 4
disc_iters: 16
+ max_eval_steps: 50000
+
+export_model:
+ - {name: 'gen', inputs_num: 2}
dataset:
train:
@@ -41,6 +45,21 @@ dataset:
- name: Normalize
mean: [127.5, 127.5, 127.5]
std: [127.5, 127.5, 127.5]
+ test:
+ name: SingleDataset
+ dataroot: data/ffhq/images256x256/
+ num_workers: 3
+ batch_size: 3
+ preprocess:
+ - name: LoadImageFromFile
+ key: A
+ - name: Transforms
+ input_keys: [A]
+ pipeline:
+ - name: Transpose
+ - name: Normalize
+ mean: [127.5, 127.5, 127.5]
+ std: [127.5, 127.5, 127.5]
lr_scheduler:
name: MultiStepDecay
@@ -69,3 +88,11 @@ log_config:
snapshot_config:
interval: 5000
+
+validate:
+ interval: 50000
+ save_img: False
+ metrics:
+ fid: # metric name, can be arbitrary
+ name: FID
+ batch_size: 4
diff --git a/configs/swinir_denoising.yaml b/configs/swinir_denoising.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9aa317f10912934ed2317c38427d346cb72df620
--- /dev/null
+++ b/configs/swinir_denoising.yaml
@@ -0,0 +1,84 @@
+total_iters: 420000
+output_dir: output_dir
+
+model:
+ name: SwinIRModel
+ generator:
+ name: SwinIR
+ upscale: 1
+ img_size: 128
+ window_size: 8
+ depths: [6, 6, 6, 6, 6, 6]
+ embed_dim: 180
+ num_heads: [6, 6, 6, 6, 6, 6]
+ mlp_ratio: 2
+ char_criterion:
+ name: CharbonnierLoss
+ eps: 0.000000001
+ reduction: mean
+
+dataset:
+ train:
+ name: SwinIRDataset
+ # TODO fix out of memory for val while training
+ num_workers: 0
+ batch_size: 2 # 4GPU
+ opt:
+ phase: train
+ n_channels: 3
+ H_size: 128
+ sigma: 15
+ sigma_test: 15
+ dataroot_H: data/trainsets/trainH
+ test:
+ name: SwinIRDataset
+ # TODO fix out of memory for val while training
+ num_workers: 0
+ batch_size: 1
+ opt:
+ phase: test
+ n_channels: 3
+ H_size: 128
+ sigma: 15
+ sigma_test: 15
+ dataroot_H: data/trainsets/CBSD68
+
+export_model:
+ - {name: 'generator', inputs_num: 1}
+
+lr_scheduler:
+ name: MultiStepDecay
+ learning_rate: 2e-4
+ milestones: [210000, 305000, 345000, 385000, 420000]
+ gamma: 0.5
+
+validate:
+ interval: 200
+ save_img: True
+
+ metrics:
+ psnr: # metric name, can be arbitrary
+ name: PSNR
+ crop_border: 4
+ test_y_channel: True
+ ssim:
+ name: SSIM
+ crop_border: 4
+ test_y_channel: True
+
+optimizer:
+ name: Adam
+ # add parameters of net_name to optim
+ # name should be in self.nets
+ net_names:
+ - generator
+ beta1: 0.9
+ beta2: 0.999
+ epsilon: 1e-8
+
+log_config:
+ interval: 10
+ visiual_interval: 5000
+
+snapshot_config:
+ interval: 500
diff --git a/configs/ugatit_photo2cartoon.yaml b/configs/ugatit_photo2cartoon.yaml
index 689dde75a15f0e012b520c617b5e4084aba7684a..6c2fc65db82cd2287386f92e3edb29d9c4ec4736 100644
--- a/configs/ugatit_photo2cartoon.yaml
+++ b/configs/ugatit_photo2cartoon.yaml
@@ -78,7 +78,7 @@ dataset:
key: A
- name: LoadImageFromFile
key: B
- - name: Transfroms
+ - name: Transforms
input_keys: [A, B]
pipeline:
- name: Resize
diff --git a/configs/ugatit_selfie2anime_light.yaml b/configs/ugatit_selfie2anime_light.yaml
index a9d21be739b047d16a1f240ae6c5763f97ee04d8..4dbca42e387a0fc9954be52b84534693a46f7f0e 100644
--- a/configs/ugatit_selfie2anime_light.yaml
+++ b/configs/ugatit_selfie2anime_light.yaml
@@ -78,7 +78,7 @@ dataset:
key: A
- name: LoadImageFromFile
key: B
- - name: Transfroms
+ - name: Transforms
input_keys: [A, B]
pipeline:
- name: Resize
diff --git a/configs/wav2lip_hq.yaml b/configs/wav2lip_hq.yaml
index a6a4f1cabf808980179a58675860c749bbd07eac..9e9dc51078743bbc19c24a4ea1ab4c6bd933bf89 100644
--- a/configs/wav2lip_hq.yaml
+++ b/configs/wav2lip_hq.yaml
@@ -14,6 +14,9 @@ model:
discriminator_hq:
name: Wav2LipDiscQual
+export_model:
+ - {name: 'netG', inputs_num: 2}
+
dataset:
train:
name: Wav2LipDataset
diff --git a/lsr2_preprocess.py b/data/lsr2_preprocess.py
similarity index 100%
rename from lsr2_preprocess.py
rename to data/lsr2_preprocess.py
diff --git a/data/process_div2k_data.py b/data/process_div2k_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfa98580ae6b3121960a60ce6a5d0a0a3e2fa834
--- /dev/null
+++ b/data/process_div2k_data.py
@@ -0,0 +1,290 @@
+import os
+import re
+import sys
+import cv2
+import argparse
+import numpy as np
+import os.path as osp
+
+from time import time
+from multiprocessing import Pool
+from shutil import get_terminal_size
+from ppgan.datasets.base_dataset import scandir
+
+
+class Timer:
+ """A flexible Timer class."""
+ def __init__(self, start=True, print_tmpl=None):
+ self._is_running = False
+ self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}'
+ if start:
+ self.start()
+
+ @property
+ def is_running(self):
+ """bool: indicate whether the timer is running"""
+ return self._is_running
+
+ def __enter__(self):
+ self.start()
+ return self
+
+ def __exit__(self, type, value, traceback):
+ print(self.print_tmpl.format(self.since_last_check()))
+ self._is_running = False
+
+ def start(self):
+ """Start the timer."""
+ if not self._is_running:
+ self._t_start = time()
+ self._is_running = True
+ self._t_last = time()
+
+ def since_start(self):
+ """Total time since the timer is started.
+
+ Returns (float): Time in seconds.
+ """
+ if not self._is_running:
+ raise ValueError('timer is not running')
+ self._t_last = time()
+ return self._t_last - self._t_start
+
+ def since_last_check(self):
+ """Time since the last checking.
+
+ Either :func:`since_start` or :func:`since_last_check` is a checking
+ operation.
+
+ Returns (float): Time in seconds.
+ """
+ if not self._is_running:
+ raise ValueError('timer is not running')
+ dur = time() - self._t_last
+ self._t_last = time()
+ return dur
+
+
+class ProgressBar:
+ """A progress bar which can print the progress."""
+ def __init__(self, task_num=0, bar_width=50, start=True, file=sys.stdout):
+ self.task_num = task_num
+ self.bar_width = bar_width
+ self.completed = 0
+ self.file = file
+ if start:
+ self.start()
+
+ @property
+ def terminal_width(self):
+ width, _ = get_terminal_size()
+ return width
+
+ def start(self):
+ if self.task_num > 0:
+ self.file.write(f'[{" " * self.bar_width}] 0/{self.task_num}, '
+ 'elapsed: 0s, ETA:')
+ else:
+ self.file.write('completed: 0, elapsed: 0s')
+ self.file.flush()
+ self.timer = Timer()
+
+ def update(self, num_tasks=1):
+ assert num_tasks > 0
+ self.completed += num_tasks
+ elapsed = self.timer.since_start()
+ if elapsed > 0:
+ fps = self.completed / elapsed
+ else:
+ fps = float('inf')
+ if self.task_num > 0:
+ percentage = self.completed / float(self.task_num)
+ eta = int(elapsed * (1 - percentage) / percentage + 0.5)
+ msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \
+ f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \
+ f'ETA: {eta:5}s'
+
+ bar_width = min(self.bar_width,
+ int(self.terminal_width - len(msg)) + 2,
+ int(self.terminal_width * 0.6))
+ bar_width = max(2, bar_width)
+ mark_width = int(bar_width * percentage)
+ bar_chars = '>' * mark_width + ' ' * (bar_width - mark_width)
+ self.file.write(msg.format(bar_chars))
+ else:
+ self.file.write(
+ f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,'
+ f' {fps:.1f} tasks/s')
+ self.file.flush()
+
+
+def main_extract_subimages(args):
+ """A multi-thread tool to crop large images to sub-images for faster IO.
+
+ It is used for DIV2K dataset.
+
+ args (dict): Configuration dict. It contains:
+ n_thread (int): Thread number.
+ compression_level (int): CV_IMWRITE_PNG_COMPRESSION from 0 to 9.
+ A higher value means a smaller size and longer compression time.
+ Use 0 for faster CPU decompression. Default: 3, same in cv2.
+
+ input_folder (str): Path to the input folder.
+ save_folder (str): Path to save folder.
+ crop_size (int): Crop size.
+ step (int): Step for overlapped sliding window.
+ thresh_size (int): Threshold size. Patches whose size is lower
+ than thresh_size will be dropped.
+
+ Usage:
+ For each folder, run this script.
+ Typically, there are four folders to be processed for DIV2K dataset.
+ DIV2K_train_HR
+ DIV2K_train_LR_bicubic/X2
+ DIV2K_train_LR_bicubic/X3
+ DIV2K_train_LR_bicubic/X4
+ After process, each sub_folder should have the same number of
+ subimages.
+ Remember to modify opt configurations according to your settings.
+ """
+
+ opt = {}
+ opt['n_thread'] = args.n_thread
+ opt['compression_level'] = args.compression_level
+
+ # HR images
+ opt['input_folder'] = osp.join(args.data_root, 'DIV2K_train_HR')
+ opt['save_folder'] = osp.join(args.data_root, 'DIV2K_train_HR_sub')
+ opt['crop_size'] = args.crop_size
+ opt['step'] = args.step
+ opt['thresh_size'] = args.thresh_size
+ extract_subimages(opt)
+
+ for scale in [2, 3, 4]:
+ opt['input_folder'] = osp.join(args.data_root,
+ f'DIV2K_train_LR_bicubic/X{scale}')
+ opt['save_folder'] = osp.join(args.data_root,
+ f'DIV2K_train_LR_bicubic/X{scale}_sub')
+ opt['crop_size'] = args.crop_size // scale
+ opt['step'] = args.step // scale
+ opt['thresh_size'] = args.thresh_size // scale
+ extract_subimages(opt)
+
+
+def extract_subimages(opt):
+ """Crop images to subimages.
+
+ Args:
+ opt (dict): Configuration dict. It contains:
+ input_folder (str): Path to the input folder.
+ save_folder (str): Path to save folder.
+ n_thread (int): Thread number.
+ """
+ input_folder = opt['input_folder']
+ save_folder = opt['save_folder']
+ if not osp.exists(save_folder):
+ os.makedirs(save_folder)
+ print(f'mkdir {save_folder} ...')
+ else:
+ print(f'Folder {save_folder} already exists. Exit.')
+ sys.exit(1)
+
+ img_list = list(scandir(input_folder))
+ img_list = [osp.join(input_folder, v) for v in img_list]
+
+ prog_bar = ProgressBar(len(img_list))
+ pool = Pool(opt['n_thread'])
+ for path in img_list:
+ pool.apply_async(worker,
+ args=(path, opt),
+ callback=lambda arg: prog_bar.update())
+ pool.close()
+ pool.join()
+ print('All processes done.')
+
+
+def worker(path, opt):
+ """Worker for each process.
+
+ Args:
+ path (str): Image path.
+ opt (dict): Configuration dict. It contains:
+ crop_size (int): Crop size.
+ step (int): Step for overlapped sliding window.
+ thresh_size (int): Threshold size. Patches whose size is smaller
+ than thresh_size will be dropped.
+ save_folder (str): Path to save folder.
+ compression_level (int): for cv2.IMWRITE_PNG_COMPRESSION.
+
+ Returns:
+ process_info (str): Process information displayed in progress bar.
+ """
+ crop_size = opt['crop_size']
+ step = opt['step']
+ thresh_size = opt['thresh_size']
+ img_name, extension = osp.splitext(osp.basename(path))
+
+ # remove the x2, x3, x4 and x8 in the filename for DIV2K
+ img_name = re.sub('x[2348]', '', img_name)
+
+ img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
+
+ if img.ndim == 2 or img.ndim == 3:
+ h, w = img.shape[:2]
+ else:
+ raise ValueError(f'Image ndim should be 2 or 3, but got {img.ndim}')
+
+ h_space = np.arange(0, h - crop_size + 1, step)
+ if h - (h_space[-1] + crop_size) > thresh_size:
+ h_space = np.append(h_space, h - crop_size)
+ w_space = np.arange(0, w - crop_size + 1, step)
+ if w - (w_space[-1] + crop_size) > thresh_size:
+ w_space = np.append(w_space, w - crop_size)
+
+ index = 0
+ for x in h_space:
+ for y in w_space:
+ index += 1
+ cropped_img = img[x:x + crop_size, y:y + crop_size, ...]
+ cv2.imwrite(
+ osp.join(opt['save_folder'],
+ f'{img_name}_s{index:03d}{extension}'), cropped_img,
+ [cv2.IMWRITE_PNG_COMPRESSION, opt['compression_level']])
+ process_info = f'Processing {img_name} ...'
+ return process_info
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Prepare DIV2K dataset',
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--data-root', help='dataset root')
+ parser.add_argument('--crop-size',
+ nargs='?',
+ default=480,
+ help='cropped size for HR images')
+ parser.add_argument('--step',
+ nargs='?',
+ default=240,
+ help='step size for HR images')
+ parser.add_argument('--thresh-size',
+ nargs='?',
+ default=0,
+ help='threshold size for HR images')
+ parser.add_argument('--compression-level',
+ nargs='?',
+ default=3,
+ help='compression level when save png images')
+ parser.add_argument('--n-thread',
+ nargs='?',
+ default=20,
+ help='thread number when using multiprocessing')
+
+ args = parser.parse_args()
+ return args
+
+
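+# Example usage (a sketch; assumes DIV2K has been downloaded to data/DIV2K):
+#   python data/process_div2k_data.py --data-root data/DIV2K \
+#       --crop-size 480 --step 240 --thresh-size 0 --n-thread 20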
+if __name__ == '__main__':
+ args = parse_args()
+ # extract subimages
+ main_extract_subimages(args)
diff --git a/data/realsr_preprocess/collect_noise.py b/data/realsr_preprocess/collect_noise.py
index 18a7c48553600723a5645c048045252e6aedaed5..a3558ff18d9c65ce6439e60bed2d174aa67a0f7f 100644
--- a/data/realsr_preprocess/collect_noise.py
+++ b/data/realsr_preprocess/collect_noise.py
@@ -27,8 +27,9 @@ parser.add_argument('--upscale_factor',
opt = parser.parse_args()
# define input and target directories
-with open('./preprocess/paths.yml', 'r') as stream:
- PATHS = yaml.load(stream)
+cur_path = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(cur_path, './paths.yml'), 'r') as stream:
+ PATHS = yaml.load(stream, Loader=yaml.SafeLoader)
def noise_patch(rgb_img, sp, max_var, min_mean):
diff --git a/data/realsr_preprocess/create_bicubic_dataset.py b/data/realsr_preprocess/create_bicubic_dataset.py
index 085feb6f90594b41dfbb8ad912c20fe7b8b69211..bb86bce917430f72de9798b0eda6a5ca6c91fc3a 100644
--- a/data/realsr_preprocess/create_bicubic_dataset.py
+++ b/data/realsr_preprocess/create_bicubic_dataset.py
@@ -48,8 +48,9 @@ parser.add_argument('--upscale_factor',
opt = parser.parse_args()
# define input and target directories
-with open('./paths.yml', 'r') as stream:
- PATHS = yaml.load(stream)
+cur_path = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(cur_path, './paths.yml'), 'r') as stream:
+ PATHS = yaml.load(stream, Loader=yaml.SafeLoader)
if opt.dataset == 'df2k':
path_sdsr = PATHS['datasets']['df2k'] + '/generated/sdsr/'
@@ -82,7 +83,8 @@ else:
tdsr_hr_dir = path_tdsr + 'HR'
tdsr_lr_dir = path_tdsr + 'LR'
-assert not os.path.exists(PATHS['datasets'][opt.dataset])
+assert not os.path.exists(tdsr_hr_dir)
+assert not os.path.exists(tdsr_lr_dir)
if not os.path.exists(tdsr_hr_dir):
os.makedirs(tdsr_hr_dir)
diff --git a/data/realsr_preprocess/create_kernel_dataset.py b/data/realsr_preprocess/create_kernel_dataset.py
index 97c7535412697a1b89776f69e932c12e128c7962..291363a9dae0bea1f1d70746b292c53a85e8e649 100644
--- a/data/realsr_preprocess/create_kernel_dataset.py
+++ b/data/realsr_preprocess/create_kernel_dataset.py
@@ -53,8 +53,9 @@ parser.add_argument('--upscale_factor',
opt = parser.parse_args()
# define input and target directories
-with open('./paths.yml', 'r') as stream:
- PATHS = yaml.load(stream)
+cur_path = os.path.abspath(os.path.dirname(__file__))
+with open(os.path.join(cur_path, './paths.yml'), 'r') as stream:
+ PATHS = yaml.load(stream, Loader=yaml.SafeLoader)
if opt.dataset == 'df2k':
path_sdsr = PATHS['datasets']['df2k'] + '/generated/sdsr/'
diff --git a/data/realsr_preprocess/imresize.py b/data/realsr_preprocess/imresize.py
index 02fa92a19a7a4013381f7baf7ab1470100876e8c..6d61bc7caa2618d8617f3c5c9128f577c29cec53 100644
--- a/data/realsr_preprocess/imresize.py
+++ b/data/realsr_preprocess/imresize.py
@@ -218,7 +218,7 @@ def kernel_shift(kernel, sf):
# Before applying the shift, we first pad the kernel so that nothing is lost due to the shift
# (biggest shift among dims + 1 for safety)
- kernel = np.pad(kernel, np.int(np.ceil(np.max(shift_vec))) + 1, 'constant')
+ kernel = np.pad(kernel, np.int_(np.ceil(np.max(shift_vec))) + 1, 'constant')
# Finally shift the kernel and return
return interpolation.shift(kernel, shift_vec)
diff --git a/data/realsr_preprocess/paths.yml b/data/realsr_preprocess/paths.yml
index 508ba3079dbf994eada47e2e495e121e4defccb7..5a7fd420681bd182f6c4402e22aeef5dcb8c2ef0 100644
--- a/data/realsr_preprocess/paths.yml
+++ b/data/realsr_preprocess/paths.yml
@@ -1,13 +1,13 @@
df2k:
tdsr:
- source: '/workspace/datasets/ntire20/Corrupted-tr-x'
- target: '/workspace/datasets/ntire20/Corrupted-tr-y'
+ source: './data/ntire20/Corrupted-tr-x'
+ target: './data/ntire20/Corrupted-tr-y'
valid:
dped:
clean:
hr:
- train: '/workspace/datasets/ntire20/DPEDiphone-tr-x'
- valid: '/workspace/datasets/ntire20/DPEDiphone-va'
+ train: './data/ntire20/DPEDiphone-tr-x'
+ valid: './data/ntire20/DPEDiphone-va'
datasets:
- df2k: 'DF2K'
- dped: 'DPED'
+ df2k: 'data/DF2K'
+ dped: 'data/DPED'
diff --git a/deploy/TENSOR_RT.md b/deploy/TENSOR_RT.md
new file mode 100644
index 0000000000000000000000000000000000000000..1ef67ce4256f9bf42f3fc1c5d5f769f973879cb2
--- /dev/null
+++ b/deploy/TENSOR_RT.md
@@ -0,0 +1,61 @@
+# TensorRT Inference Deployment Tutorial
+TensorRT is NVIDIA's acceleration library for unified model deployment. It runs on hardware such as V100 and JETSON Xavier and can greatly speed up inference. For the Paddle TensorRT tutorial, see [Inference with the Paddle-TensorRT library](https://paddle-inference.readthedocs.io/en/latest/optimize/paddle_trt.html#).
+
+## 1. Install the Paddle Inference library
+- Python package: download a wheel built with TensorRT from [here](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/Tables.html#whl-release) and install it.
+
+- C++ inference library: download a library compiled with TensorRT from [here](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/build_and_install_lib_cn.html).
+
+- If no prebuilt Python package or C++ library is available from the official site, compile it yourself following [compile from source](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/compile/linux-compile.html).
+
+**Note:**
+- The TensorRT version on your machine must match the TensorRT version used by the inference library.
+- Deployment in PaddleGAN requires TensorRT > 7.0.
+
+## 2. Export the model
+For details on model export, see the [PaddleGAN model export tutorial](../EXPORT_MODEL.md).
+
+## 3. Enable TensorRT acceleration
+### 3.1 Configure TensorRT
+When building the predictor config with the Paddle inference library, simply enable the TensorRT engine:
+
+```
+config->EnableUseGpu(100, 0); // initialize 100 MB of GPU memory, use GPU id 0
+config->GpuDeviceId(); // returns the GPU id currently in use
+// Enable TensorRT inference to improve GPU performance; requires an inference library built with TensorRT
+config->EnableTensorRtEngine(1 << 20 /*workspace_size*/,
+ batch_size /*max_batch_size*/,
+ 3 /*min_subgraph_size*/,
+ AnalysisConfig::Precision::kFloat32 /*precision*/,
+ false /*use_static*/,
+ false /*use_calib_mode*/);
+
+```
+
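+A rough Python equivalent is sketched below, assuming the `paddle.inference` Python API and a model already exported to `inference_model/` (the file names are placeholders):
+
+```python
+import paddle.inference as paddle_infer
+
+# Placeholder paths; point these at your exported model files
+config = paddle_infer.Config("inference_model/model.pdmodel",
+                             "inference_model/model.pdiparams")
+config.enable_use_gpu(100, 0)  # 100 MB initial GPU memory, GPU id 0
+# Enable TensorRT; requires a Paddle package built with TensorRT
+config.enable_tensorrt_engine(workspace_size=1 << 20,
+                              max_batch_size=1,
+                              min_subgraph_size=3,
+                              precision_mode=paddle_infer.PrecisionType.Float32,
+                              use_static=False,
+                              use_calib_mode=False)
+predictor = paddle_infer.create_predictor(config)
+```
+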
+### 3.2 Fixed-shape TensorRT inference
+
+Taking `msvsr` as an example, run inference with a fixed-shape input:
+```
+python tools/inference.py --model_path=/root/to/model --config-file /root/to/config --run_mode trt_fp32 --min_subgraph_size 20 --model_type msvsr
+```
+
+## 4. FAQ
+**Q:** The error says there is no `tensorrt_op`
+**A:** Check that you are using a Paddle Python package or inference library built with TensorRT.
+
+**Q:** The error says `op out of memory`
+**A:** Check whether other users are also using the GPU, and try a free GPU instead.
+
+**Q:** The error says `some trt inputs dynamic shape info not set`
+**A:** This happens because TensorRT splits the network into several subgraphs, and only the input data has its dynamic shape set; the inputs of the other subgraphs do not. There are two solutions:
+
+- Option 1: increase `min_subgraph_size` so that those subgraphs are skipped during optimization. Following the error message, set `min_subgraph_size` to be larger than the number of ops in the subgraphs whose inputs have no dynamic shape set.
+`min_subgraph_size` means that when the TensorRT engine is loaded, only subgraphs with more than `min_subgraph_size` ops are optimized, and those ops must be consecutive and optimizable by TensorRT.
+
+- Option 2: find the inputs of those subgraphs and set dynamic shapes for them in the same way as above.
+
+**Q:** How do I enable logging?
+**A:** Logging is enabled by default in the inference library; just comment out `config.disable_glog_info()` to turn it on.
+
+**Q:** With TensorRT enabled, inference reports "Slice on batch axis is not supported in TensorRT"
+**A:** Try using dynamic-shape input.
diff --git a/deploy/cpp_infer/CMakeLists.txt b/deploy/cpp_infer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..57312c616bf7a33ae03f5deb36f9299070eecb5a
--- /dev/null
+++ b/deploy/cpp_infer/CMakeLists.txt
@@ -0,0 +1,223 @@
+project(vsr CXX C)
+cmake_minimum_required(VERSION 3.14)
+
+option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON)
+option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF)
+option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON)
+option(WITH_TENSORRT "Compile demo with TensorRT." OFF)
+
+SET(PADDLE_LIB "" CACHE PATH "Location of libraries")
+SET(OPENCV_DIR "" CACHE PATH "Location of libraries")
+SET(CUDA_LIB "" CACHE PATH "Location of libraries")
+SET(CUDNN_LIB "" CACHE PATH "Location of libraries")
+SET(TENSORRT_DIR "" CACHE PATH "Compile demo with TensorRT")
+
+set(DEMO_NAME "vsr")
+
+
+macro(safe_set_static_flag)
+ foreach(flag_var
+ CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+ CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+ if(${flag_var} MATCHES "/MD")
+ string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+ endif(${flag_var} MATCHES "/MD")
+ endforeach(flag_var)
+endmacro()
+
+if (WITH_MKL)
+ ADD_DEFINITIONS(-DUSE_MKL)
+endif()
+
+if(NOT DEFINED PADDLE_LIB)
+ message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+
+if(NOT DEFINED OPENCV_DIR)
+ message(FATAL_ERROR "please set OPENCV_DIR with -DOPENCV_DIR=/path/opencv")
+endif()
+
+
+if (WIN32)
+ include_directories("${PADDLE_LIB}/paddle/include")
+ link_directories("${PADDLE_LIB}/paddle/lib")
+ find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/build/ NO_DEFAULT_PATH)
+
+else ()
+ find_package(OpenCV REQUIRED PATHS ${OPENCV_DIR}/share/OpenCV NO_DEFAULT_PATH)
+ include_directories("${PADDLE_LIB}/paddle/include")
+ link_directories("${PADDLE_LIB}/paddle/lib")
+endif ()
+include_directories(${OpenCV_INCLUDE_DIRS})
+
+if (WIN32)
+ add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+ if(WITH_MKL)
+ set(FLAG_OPENMP "/openmp")
+ endif()
+ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+ set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+ if (WITH_STATIC_LIB)
+ safe_set_static_flag()
+ add_definitions(-DSTATIC_LIB)
+ endif()
+ message("cmake c debug flags " ${CMAKE_C_FLAGS_DEBUG})
+ message("cmake c release flags " ${CMAKE_C_FLAGS_RELEASE})
+ message("cmake cxx debug flags " ${CMAKE_CXX_FLAGS_DEBUG})
+ message("cmake cxx release flags " ${CMAKE_CXX_FLAGS_RELEASE})
+else()
+ if(WITH_MKL)
+ set(FLAG_OPENMP "-fopenmp")
+ endif()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O3 ${FLAG_OPENMP} -std=c++11")
+ set(CMAKE_STATIC_LIBRARY_PREFIX "")
+ message("cmake cxx flags" ${CMAKE_CXX_FLAGS})
+endif()
+
+if (WITH_GPU)
+ if (NOT DEFINED CUDA_LIB OR ${CUDA_LIB} STREQUAL "")
+ message(FATAL_ERROR "please set CUDA_LIB with -DCUDA_LIB=/path/cuda-8.0/lib64")
+ endif()
+ if (NOT WIN32)
+ if (NOT DEFINED CUDNN_LIB)
+ message(FATAL_ERROR "please set CUDNN_LIB with -DCUDNN_LIB=/path/cudnn_v7.4/cuda/lib64")
+ endif()
+ endif(NOT WIN32)
+endif()
+
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+include_directories("${CMAKE_SOURCE_DIR}/")
+
+if (NOT WIN32)
+ if (WITH_TENSORRT AND WITH_GPU)
+ include_directories("${TENSORRT_DIR}/include")
+ link_directories("${TENSORRT_DIR}/lib")
+ endif()
+endif(NOT WIN32)
+
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
+
+
+if(WITH_MKL)
+ include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+ if (WIN32)
+ set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib
+ ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md.lib)
+ else ()
+ set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
+ ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+ execute_process(COMMAND cp -r ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} /usr/lib)
+ endif ()
+ set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
+ if(EXISTS ${MKLDNN_PATH})
+ include_directories("${MKLDNN_PATH}/include")
+ if (WIN32)
+ set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
+ else ()
+ set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+ endif ()
+ endif()
+else()
+ if (WIN32)
+ set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+ else ()
+ set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+ endif ()
+endif()
+
+# Note: libpaddle_inference_api.so/a must put before libpaddle_inference.so/a
+if(WITH_STATIC_LIB)
+ if(WIN32)
+ set(DEPS
+ ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+ else()
+ set(DEPS
+ ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+ endif()
+else()
+ if(WIN32)
+ set(DEPS
+ ${PADDLE_LIB}/paddle/lib/paddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
+ else()
+ set(DEPS
+ ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
+ endif()
+endif(WITH_STATIC_LIB)
+
+if (NOT WIN32)
+ set(DEPS ${DEPS}
+ ${MATH_LIB} ${MKLDNN_LIB}
+ glog gflags protobuf z xxhash
+ )
+ if(EXISTS "${PADDLE_LIB}/third_party/install/snappystream/lib")
+ set(DEPS ${DEPS} snappystream)
+ endif()
+ if (EXISTS "${PADDLE_LIB}/third_party/install/snappy/lib")
+ set(DEPS ${DEPS} snappy)
+ endif()
+else()
+ set(DEPS ${DEPS}
+ ${MATH_LIB} ${MKLDNN_LIB}
+ glog gflags_static libprotobuf xxhash)
+ set(DEPS ${DEPS} libcmt shlwapi)
+ if (EXISTS "${PADDLE_LIB}/third_party/install/snappy/lib")
+ set(DEPS ${DEPS} snappy)
+ endif()
+ if(EXISTS "${PADDLE_LIB}/third_party/install/snappystream/lib")
+ set(DEPS ${DEPS} snappystream)
+ endif()
+endif(NOT WIN32)
+
+
+if(WITH_GPU)
+ if(NOT WIN32)
+ if (WITH_TENSORRT)
+ set(DEPS ${DEPS} ${TENSORRT_DIR}/lib/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
+ set(DEPS ${DEPS} ${TENSORRT_DIR}/lib/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
+ endif()
+ set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
+ set(DEPS ${DEPS} ${CUDNN_LIB}/libcudnn${CMAKE_SHARED_LIBRARY_SUFFIX})
+ else()
+ set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
+ set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
+ set(DEPS ${DEPS} ${CUDNN_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
+ endif()
+endif()
+
+
+if (NOT WIN32)
+ set(EXTERNAL_LIB "-ldl -lrt -lgomp -lz -lm -lpthread")
+ set(DEPS ${DEPS} ${EXTERNAL_LIB})
+endif()
+
+set(DEPS ${DEPS} ${OpenCV_LIBS})
+
+AUX_SOURCE_DIRECTORY(./src SRCS)
+add_executable(${DEMO_NAME} ${SRCS})
+target_link_libraries(${DEMO_NAME} ${DEPS})
+
+if (WIN32 AND WITH_MKL)
+ add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.dll ./mklml.dll
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md.dll ./libiomp5md.dll
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.dll ./mkldnn.dll
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.dll ./release/mklml.dll
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5md.dll ./release/libiomp5md.dll
+ COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.dll ./release/mkldnn.dll
+ )
+endif()
diff --git a/deploy/cpp_infer/include/process_op.h b/deploy/cpp_infer/include/process_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..909befc755e85b7893cb7da3da4af7790afc8a45
--- /dev/null
+++ b/deploy/cpp_infer/include/process_op.h
@@ -0,0 +1,27 @@
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <vector>
+
+#include <cstring>
+#include <fstream>
+#include <numeric>
+
+using namespace std;
+
+class Normalize {
+public:
+ virtual void Run(cv::Mat *im, const std::vector<float> &mean,
+ const std::vector<float> &scale, const bool is_scale = true);
+};
+
+
+// RGB -> CHW
+class Permute {
+public:
+ virtual void Run(const cv::Mat *im, float *data);
+};
diff --git a/deploy/cpp_infer/include/vsr.h b/deploy/cpp_infer/include/vsr.h
new file mode 100644
index 0000000000000000000000000000000000000000..e594710f2cd71cd9815b6f07a84ce986af8a566f
--- /dev/null
+++ b/deploy/cpp_infer/include/vsr.h
@@ -0,0 +1,57 @@
+#include <string>
+#include <vector>
+#include <memory>
+#include <utility>
+#include <ctime>
+#include <numeric>
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc.hpp>
+
+#include "include/process_op.h"
+#include "paddle_inference_api.h"
+
+namespace PaddleGAN {
+
+class VSR {
+public:
+ explicit VSR(const std::string& model_path,
+ const std::string& param_path,
+ const std::string& device,
+ const int& gpu_id,
+ const bool& use_mkldnn,
+ const int& cpu_threads) {
+
+ this->device_ = device;
+ this->gpu_id_ = gpu_id;
+ this->use_mkldnn_ = use_mkldnn;
+ this->cpu_threads_ = cpu_threads;
+
+ LoadModel(model_path, param_path);
+ }
+
+ // Load paddle inference model
+ void LoadModel(const std::string& model_path, const std::string& param_path);
+
+ // Run predictor
+ void Run(const std::vector<cv::Mat>& imgs, std::vector<cv::Mat>* result = nullptr);
+
+private:
+ std::shared_ptr<paddle_infer::Predictor> predictor_;
+
+ std::string device_ = "GPU";
+ int gpu_id_ = 0;
+ bool use_mkldnn_ = false;
+ int cpu_threads_ = 1;
+
+ std::vector<float> mean_ = {0., 0., 0.};
+ std::vector<float> scale_ = {1., 1., 1.};
+
+ // pre/post-process
+ Permute permute_op_;
+ Normalize normalize_op_;
+ std::vector<float> Preprocess(cv::Mat& img);
+};
+
+}
diff --git a/deploy/cpp_infer/src/main.cc b/deploy/cpp_infer/src/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb3b68ad05f2da3bd2f6aeb2967038f21c84b329
--- /dev/null
+++ b/deploy/cpp_infer/src/main.cc
@@ -0,0 +1,110 @@
+#include <glog/logging.h>
+
+#include <math.h>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <numeric>
+#include <opencv2/opencv.hpp>
+#include <opencv2/videoio.hpp>
+
+#include "include/vsr.h"
+#include <gflags/gflags.h>
+
+
+DEFINE_string(model_path, "", "Path of inference model");
+DEFINE_string(param_path, "", "Path of inference param");
+DEFINE_int32(frame_num, 2, "frame_num");
+DEFINE_string(video_path, "", "Path of input video, `video_file` or `camera_id` has a highest priority.");
+DEFINE_string(device, "CPU", "Choose the device you want to run, it can be: CPU/GPU/XPU, default is CPU.");
+DEFINE_string(output_dir, "output", "Directory of output visualization files.");
+DEFINE_int32(gpu_id, 0, "Device id of GPU to execute");
+DEFINE_bool(use_mkldnn, false, "Whether use mkldnn with CPU");
+DEFINE_int32(cpu_threads, 1, "Num of threads with CPU");
+
+
+void main_predict(const std::string& video_path,
+ PaddleGAN::VSR* vsr,
+ const std::string& output_dir = "output") {
+
+ // Open video
+ cv::VideoCapture capture;
+ std::string video_out_name = "output.mp4";
+ capture.open(video_path.c_str());
+ if (!capture.isOpened()) {
+ printf("can not open video : %s\n", video_path.c_str());
+ return;
+ }
+
+ // Get Video info :fps, frame count
+ int video_fps = static_cast<int>(capture.get(CV_CAP_PROP_FPS));
+ int video_frame_count = static_cast<int>(capture.get(CV_CAP_PROP_FRAME_COUNT));
+ // Set fixed size for output frame, only for msvsr model
+ int out_width = 1280;
+ int out_height = 720;
+ printf("fps: %d, frame_count: %d\n", video_fps, video_frame_count);
+
+ // Create VideoWriter for output
+ cv::VideoWriter video_out;
+ std::string video_out_path(output_dir);
+ video_out_path += video_out_name;
+
+ video_out.open(video_out_path,
+ 0x00000021,
+ video_fps,
+ cv::Size(out_width, out_height),
+ true);
+ if (!video_out.isOpened()) {
+ printf("create video writer failed!\n");
+ return;
+ }
+
+ // Capture all frames and do inference
+ cv::Mat frame;
+ int frame_id = 0;
+ bool reach_end = false;
+ while (capture.isOpened()) {
+ std::vector<cv::Mat> imgs;
+ for (int i = 0; i < FLAGS_frame_num; i++) {
+ capture.read(frame);
+ if (!frame.empty()) {
+ imgs.push_back(frame);
+ }
+ else {
+ reach_end = true;
+ }
+ }
+ if (reach_end) {
+ break;
+ }
+
+ std::vector<cv::Mat> result;
+ vsr->Run(imgs, &result);
+ for (auto& item : result) {
+ cv::Mat temp = cv::Mat::zeros(item.size(), CV_8UC3);
+ item.convertTo(temp, CV_8UC3, 255);
+ video_out.write(temp);
+ printf("Processing frame: %d\n", frame_id);
+ // auto im_nm = std::to_string(frame_id) + "test.jpg";
+ // cv::imwrite(FLAGS_output_dir + im_nm, temp);
+ frame_id += 1;
+ }
+ }
+ printf("inference finished, output video saved at %s", video_out_path.c_str());
+ capture.release();
+ video_out.release();
+}
+
+int main(int argc, char** argv) {
+ // Parsing command-line
+ google::ParseCommandLineFlags(&argc, &argv, true);
+
+ // Load model and create a vsr
+ PaddleGAN::VSR vsr(FLAGS_model_path, FLAGS_param_path, FLAGS_device, FLAGS_gpu_id, FLAGS_use_mkldnn,
+ FLAGS_cpu_threads);
+
+ // Do inference on input video or image
+ main_predict(FLAGS_video_path, &vsr, FLAGS_output_dir);
+ return 0;
+}
\ No newline at end of file
diff --git a/deploy/cpp_infer/src/process_op.cc b/deploy/cpp_infer/src/process_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b1cba35ee105eb5b36a07f3f39a3c3990c31aaf
--- /dev/null
+++ b/deploy/cpp_infer/src/process_op.cc
@@ -0,0 +1,46 @@
+#include "opencv2/core.hpp"
+#include "opencv2/imgcodecs.hpp"
+#include "opencv2/imgproc.hpp"
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <ostream>
+#include <vector>
+
+#include <cstring>
+#include <fstream>
+#include <numeric>
+#include <string>
+
+#include "include/process_op.h"
+
+// RGB -> CHW
+void Permute::Run(const cv::Mat *im, float *data) {
+ int rh = im->rows;
+ int rw = im->cols;
+ int ch = im->channels();
+ for (int i = 0; i < ch; ++i) {
+ cv::extractChannel(*im, cv::Mat(rh, rw, CV_32FC1, data + i * rh * rw), i);
+ }
+}
+
+void Normalize::Run(cv::Mat *im, const std::vector<float> &mean,
+ const std::vector<float> &scale, const bool is_scale) {
+ double e = 1.0;
+ if (is_scale) {
+ e /= 255.0;
+ }
+ (*im).convertTo(*im, CV_32FC3, e);
+
+ for (int h = 0; h < im->rows; h++) {
+ for (int w = 0; w < im->cols; w++) {
+ im->at<cv::Vec3f>(h, w)[0] =
+ (im->at<cv::Vec3f>(h, w)[0] - mean[0]) * scale[0];
+ im->at<cv::Vec3f>(h, w)[1] =
+ (im->at<cv::Vec3f>(h, w)[1] - mean[1]) * scale[1];
+ im->at<cv::Vec3f>(h, w)[2] =
+ (im->at<cv::Vec3f>(h, w)[2] - mean[2]) * scale[2];
+ }
+ }
+}
+
diff --git a/deploy/cpp_infer/src/vsr.cc b/deploy/cpp_infer/src/vsr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4999fe28fdd943d1b8895836f0af4cde142330f
--- /dev/null
+++ b/deploy/cpp_infer/src/vsr.cc
@@ -0,0 +1,92 @@
+#include "include/vsr.h"
+#include <numeric>
+
+namespace PaddleGAN {
+
+// VSR load model and initialize predictor
+void VSR::LoadModel(const std::string& model_path,
+ const std::string& param_path) {
+ paddle_infer::Config config;
+ config.SetModel(model_path, param_path);
+ if (this->device_ == "GPU") {
+ config.EnableUseGpu(200, this->gpu_id_);
+ }
+ else {
+ config.DisableGpu();
+ if (this->use_mkldnn_) {
+ config.EnableMKLDNN();
+ // cache 10 for mkldnn to avoid memory leak; copy from paddleseg
+ config.SetMkldnnCacheCapacity(10);
+ }
+ config.SetCpuMathLibraryNumThreads(this->cpu_threads_);
+ }
+
+ config.SwitchUseFeedFetchOps(false);
+ config.SwitchIrOptim(true);
+ config.EnableMemoryOptim();
+ config.DisableGlogInfo();
+ this->predictor_ = paddle_infer::CreatePredictor(config);
+}
+
+std::vector<float> VSR::Preprocess(cv::Mat& img) {
+ cv::Mat new_img;
+ img.copyTo(new_img);
+ cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
+
+ // transform 1 image
+ this->normalize_op_.Run(&new_img, this->mean_, this->scale_, true);
+ std::vector<float> unroll(1 * 3 * new_img.rows * new_img.cols, 0.0f);
+ this->permute_op_.Run(&new_img, unroll.data());
+ return unroll;
+}
+
+void VSR::Run(const std::vector<cv::Mat>& imgs, std::vector<cv::Mat>* result) {
+ int frame_num = imgs.size();
+ int rows = imgs[0].rows;
+ int cols = imgs[0].cols;
+
+ // Preprocess
+ // initialize a fixed size unroll vector to store processed img
+ std::vector<float> in_data_all;
+
+ for (int i = 0; i < frame_num; i++) {
+ cv::Mat im = imgs[i];
+ std::vector<float> unroll = this->Preprocess(im);
+ in_data_all.insert(in_data_all.end(), unroll.begin(), unroll.end());
+ }
+
+ // Set input
+ auto input_names = this->predictor_->GetInputNames();
+ auto input_t = this->predictor_->GetInputHandle(input_names[0]);
+ input_t->Reshape({1, frame_num, 3, rows, cols});
+ input_t->CopyFromCpu(in_data_all.data());
+
+ // Run
+ this->predictor_->Run();
+
+ // Get output
+ auto output_names = this->predictor_->GetOutputNames();
+ auto output_t = this->predictor_->GetOutputHandle(output_names[0]);
+ std::vector<int> output_shape = output_t->shape();
+ int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
+ std::vector<float> out_data;
+ out_data.resize(out_num);
+ output_t->CopyToCpu(out_data.data());
+
+ // group to image
+ cv::Mat res = cv::Mat::zeros(output_shape[3], output_shape[4], CV_32FC3); // RGB image
+ int pix_num = output_shape[3] * output_shape[4];
+ int frame_pix_num = pix_num * 3;
+ for (int frame = 0; frame < output_shape[1]; frame++) {
+ int index = 0;
+ for (int h = 0; h < output_shape[3]; ++h) {
+ for (int w = 0; w < output_shape[4]; ++w) {
+ // network output is planar RGB (CHW); each pixel is written back in BGR order for OpenCV
+ res.at<cv::Vec3f>(h, w) = {out_data[2*pix_num+index+frame_pix_num*frame], out_data[pix_num+index+frame_pix_num*frame], out_data[index+frame_pix_num*frame]};
+ index+=1;
+ }
+ }
+ result->push_back(res.clone());  // clone so each output frame keeps its own buffer
+ }
+}
+
+}
\ No newline at end of file
diff --git a/deploy/export_model.md b/deploy/export_model.md
new file mode 100644
index 0000000000000000000000000000000000000000..0d2073c2ba403699b1966c2f5115d88ac64ffa7b
--- /dev/null
+++ b/deploy/export_model.md
@@ -0,0 +1,36 @@
+# PaddleGAN Model Export Tutorial
+
+## 1. Model Export
+This section describes how to export a model with the `tools/export_model.py` script.
+
+### 1.1 Launch Arguments
+
+| FLAG | Purpose | Default | Note |
+|:--------------:|:--------------:|:------------:|:-----------------------------------------:|
+| -c | config file to use | None | |
+| --load | path of the model weights to load | None | |
+| -s \| --inputs_size | input shape(s) of the model | None | |
+| --output_dir | directory where the exported model is saved | `./inference_model` | |
+
+### 1.2 Usage Example
+
+Export a trained model for deployment. Here the CycleGAN model is taken as an example:
+
+```bash
+# download the pre-trained CycleGAN_horse2zebra model
+wget https://paddlegan.bj.bcebos.com/models/CycleGAN_horse2zebra.pdparams
+
+# export the CycleGAN model
+python -u tools/export_model.py -c configs/cyclegan_horse2zebra.yaml --load CycleGAN_horse2zebra.pdparams --inputs_size="-1,3,-1,-1;-1,3,-1,-1"
+```
+
+### 1.3 Config Description
+```yaml
+export_model:
+  - {name: 'netG_A', inputs_num: 1}
+  - {name: 'netG_B', inputs_num: 1}
+```
+The snippet above comes from ```configs/cyclegan_horse2zebra.yaml```. Since ```CycleGAN_horse2zebra.pdparams``` is a dict, the keys of the weights used for export must be specified. ```inputs_num``` is the number of inputs of the corresponding network.
+
+The inference model is exported to the `inference_model/` directory as `cycleganmodel_netG_A.pdiparams`, `cycleganmodel_netG_A.pdiparams.info`, `cycleganmodel_netG_A.pdmodel`, `cycleganmodel_netG_B.pdiparams`, `cycleganmodel_netG_B.pdiparams.info` and `cycleganmodel_netG_B.pdmodel`.
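+
+As a quick sanity check, the exported files can be loaded with the Paddle Inference Python API. The following is a minimal sketch (it assumes the default `inference_model/` output directory and the `cycleganmodel_netG_A` files listed above):
+
+```python
+import paddle.inference as paddle_infer
+
+# build an inference config from the exported model structure and weights
+config = paddle_infer.Config("inference_model/cycleganmodel_netG_A.pdmodel",
+                             "inference_model/cycleganmodel_netG_A.pdiparams")
+predictor = paddle_infer.create_predictor(config)
+
+# list the input names to confirm the export produced the expected inputs
+print(predictor.get_input_names())
+```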
diff --git a/deploy/lite/README.md b/deploy/lite/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c67c6bbe6385f05695ee1e521f9cef6320a02607
--- /dev/null
+++ b/deploy/lite/README.md
@@ -0,0 +1,177 @@
+# Paddle-Lite Mobile Deployment
+
+This tutorial describes the detailed steps for deploying the FOM model on mobile devices with [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite).
+
+Paddle Lite is PaddlePaddle's lightweight inference engine. It provides efficient inference for mobile and IoT devices, integrates a wide range of cross-platform hardware, and offers a lightweight deployment solution for on-device applications.
+
+## 1. Prepare the Environment
+
+### Prerequisites
+- A computer (to compile Paddle Lite)
+- An Android phone (armv7 or armv8)
+
+### 1.1 Prepare the Cross-Compilation Environment
+The cross-compilation environment is used to compile Paddle Lite and the C++ demo of FOM.
+Multiple development environments are supported; please refer to the corresponding documentation for the compilation steps of each, and make sure the Java JDK and Android NDK (R17 or above) are installed.
+
+1. [Docker](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#docker)
+2. [Linux](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#linux)
+3. [MAC OS](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html#mac-os)
+
+### 1.2 Prepare the Prediction Library
+
+There are two ways to obtain the prediction library:
+1. Download it directly from the links below:
+  |Platform|Prediction library download link|
+  |-|-|
+  |Android|[arm7](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.8/inference_lite_lib.android.armv7.gcc.c++_static.with_extra.with_cv.tar.gz) / [arm8](https://github.com/PaddlePaddle/Paddle-Lite/releases/download/v2.8/inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv.tar.gz)|
+
+**Note**: 1. The operators required by FOM are currently only supported on the develop branch of Paddle-Lite, so you need to download and compile it yourself. 2. If the prediction library is downloaded from the Paddle-Lite [official documentation](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#android-toolchain-gcc), make sure to choose the `with_extra=ON,with_cv=ON` download link. 3. Only the Android demo is provided for now.
+
+
+2. Compile Paddle-Lite to obtain the prediction library. The compilation steps are as follows:
+```shell
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+# when compiling from source, the develop branch is recommended for building the prediction library
+git checkout develop
+./lite/tools/build_android.sh --arch=armv8 --with_cv=ON --with_extra=ON
+```
+
+**Note**: when compiling Paddle-Lite to obtain the prediction library, the two options `--with_cv=ON --with_extra=ON` must be enabled, and `--arch` specifies the `arm` version, armv8 in this case. For more compilation options, please refer to [this link](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_andriod.html#id2).
+
+After downloading and extracting the prediction library directly, you get the `inference_lite_lib.android.armv8.gcc.c++_static.with_extra.with_cv/` folder; the prediction library obtained by compiling Paddle-Lite is located under `Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/`.
+The directory structure of the prediction library is as follows:
+
+```
+inference_lite_lib.android.armv8/
+|-- cxx                                  C++ prediction library and headers
+| |-- include                            C++ header files
+| | |-- paddle_api.h
+| | |-- paddle_image_preprocess.h
+| | |-- paddle_lite_factory_helper.h
+| | |-- paddle_place.h
+| | |-- paddle_use_kernels.h
+| | |-- paddle_use_ops.h
+| | `-- paddle_use_passes.h
+| `-- lib                                C++ prediction library
+| |-- libpaddle_api_light_bundled.a      C++ static library
+| `-- libpaddle_light_api_shared.so      C++ shared library
+|-- java                                 Java prediction library
+| |-- jar
+| | `-- PaddlePredictor.jar
+| |-- so
+| | `-- libpaddle_lite_jni.so
+| `-- src
+|-- demo                                 C++ and Java demo code
+| |-- cxx                                C++ demo
+| `-- java                               Java demo
+```
+
+## 2. Getting Started
+
+### 2.1 Model Optimization
+
+Paddle-Lite provides a variety of strategies to automatically optimize the original model, including quantization, subgraph fusion, hybrid scheduling and kernel selection. The `opt` tool of Paddle-Lite optimizes the inference model automatically; two optimization approaches are currently supported. The optimized model is lighter and runs faster.
+
+**Note**: if you already have model files ending with `.nb`, you can skip this step.
+
+#### 2.1.1 Install the paddle_lite_opt Tool
+There are two ways to install the paddle_lite_opt tool:
+1. [**Recommended**] Install paddlelite via pip and use it for the conversion
+  ```shell
+  pip install paddlelite
+  ```
+
+2. Compile Paddle-Lite from source to generate the opt tool
+
+  Model optimization requires Paddle-Lite's `opt` executable, which can be obtained by compiling the Paddle-Lite source code as follows:
+  ```shell
+  # if Paddle-Lite was already cloned while preparing the environment, there is no need to clone it again
+  git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+  cd Paddle-Lite
+  git checkout develop
+  # start the compilation
+  ./lite/tools/build.sh build_optimize_tool
+  ```
+
+  After compilation, the `opt` binary is located under `build.opt/lite/api/`; its options and usage can be inspected as follows:
+  ```shell
+  cd build.opt/lite/api/
+  ./opt
+  ```
+
+  The usage and parameters of `opt` are exactly the same as those of `paddle_lite_opt` described above.
+
+The `paddle_lite_opt` tool can then be used to convert the inference model. Some of its parameters are listed below:
+
+|Option|Description|
+|-|-|
+|--model_file|path of the network structure file of the PaddlePaddle model (combined format) to be optimized|
+|--param_file|path of the weight file of the PaddlePaddle model (combined format) to be optimized|
+|--optimize_out_type|output model type; protobuf and naive_buffer are currently supported, where naive_buffer is a more lightweight serialization/deserialization implementation and the default|
+|--optimize_out|output path of the optimized model|
+|--valid_targets|backends on which the model can run, arm by default. Currently x86, arm, opencl, npu and xpu are supported; multiple backends can be specified at the same time (separated by spaces), and the Model Optimize Tool will automatically choose the best one. For Huawei NPU (the DaVinci-architecture NPU in Kirin 810/990 SoCs), set it to npu, arm|
+
+For a more detailed description of the `paddle_lite_opt` tool, please refer to the [documentation on converting models with opt](https://paddle-lite.readthedocs.io/zh/latest/user_guides/opt/opt_bin.html).
+
+`--model_file` is the path of the inference model's model file and `--param_file` is the path of its param file; `optimize_out` specifies the name of the output file (the `.nb` suffix does not need to be added). Running `paddle_lite_opt` directly in the command line also prints all parameters and their descriptions.
+
+
+#### 2.1.2 FOM Conversion Example
+```shell
+# convert the inference model into a Paddle-Lite optimized model
+paddle_lite_opt --model_file=output_inference/fom_dy2st/generator.pdmodel \
+                --param_file=output_inference/fom_dy2st/generator.pdiparams \
+                --optimize_out=output_inference/fom_dy2st/generator_lite \
+                --optimize_out_type=naive_buffer \
+                --valid_targets=arm
+paddle_lite_opt --model_file=output_inference/fom_dy2st/kp_detector.pdmodel \
+                --param_file=output_inference/fom_dy2st/kp_detector.pdiparams \
+                --optimize_out=output_inference/fom_dy2st/kp_detector_lite \
+                --optimize_out_type=naive_buffer \
+                --valid_targets=arm
+```
+
+This produces the files `generator_lite.nb` and `kp_detector_lite.nb` in the current folder.
+
+**Note**: `--optimize_out` is the save path of the optimized model and does not need the `.nb` suffix; `--model_file` is the path of the model structure file and `--param_file` is the path of the model weight file. Pay attention to the file names.
+
+### 2.2 Run on the Phone
+
+Some preparation is needed first.
+1. Prepare an armv8 Android phone. If the compiled prediction library and opt files are armv7, an armv7 phone is required instead, and `ARM_ABI = arm7` must be set in the Makefile.
+2. Install the ADB tool on your computer for debugging. ADB can be installed as follows:
+
+    2.1. Install ADB on macOS:
+
+    ```shell
+    brew cask install android-platform-tools
+    ```
+    2.2. Install ADB on Linux
+    ```shell
+    sudo apt update
+    sudo apt install -y wget adb
+    ```
+    2.3. Install ADB on Windows
+
+    On Windows, download the ADB package from Google's Android developer site and install it: [link](https://developer.android.com/studio)
+
+3. After connecting the phone to the computer, enable the `USB debugging` option on the phone, choose the `file transfer` mode, and run the following in a terminal on the computer:
+
+```shell
+adb devices
+```
+If a device is listed, the installation succeeded, for example:
+```
+List of devices attached
+744be294 device
+```
+
+4. Prepare the optimized models, the prediction library, the test image and the label map file, then push them to the phone to run. The current apk still has some quality issues and is being improved.
+
+
+## FAQ
+Q1: What if I want to switch to a different model? Do I need to go through the whole process again?
+A1: If you have already gone through the steps above, switching models only requires replacing the `.nb` model file; remember to also update the `.nb` file path in the configuration file and the label map file (if necessary).
+
diff --git a/deploy/serving/README.md b/deploy/serving/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9d16e022b5b4f44bc4a697a2a7973822c18ec2b8
--- /dev/null
+++ b/deploy/serving/README.md
@@ -0,0 +1,101 @@
+# Server-Side Deployment
+
+Models trained with `PaddleGAN` can be deployed on the server side with [Serving](https://github.com/PaddlePaddle/Serving).
+This tutorial deploys a model trained with `configs/msvsr_reds.yaml` on the REDS dataset as an example.
+The pre-trained weight file is [PP-MSVSR_reds_x4.pdparams](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams).
+
+## 1. Install Paddle Serving
+Please follow the installation guide in [PaddleServing](https://github.com/PaddlePaddle/Serving/tree/v0.6.0) to install it (version >= 0.6.0).
+
+## 2. Export the Model
+During training PaddleGAN keeps both the forward network and the optimizer-related parameters, while deployment only needs the forward parameters. For details, see [export model](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/deploy/EXPORT_MODEL.md).
+
+```
+python tools/export_model.py -c configs/msvsr_reds.yaml --inputs_size="1,2,3,180,320" --load /path/to/model \
+    --export_serving_model True --output_dir /path/to/output
+```
+
+The command above generates a `msvsr` folder under `/path/to/output`:
+```
+output
+│ ├── multistagevsrmodel_generator
+│ │ ├── multistagevsrmodel_generator.pdiparams
+│ │ ├── multistagevsrmodel_generator.pdiparams.info
+│ │ ├── multistagevsrmodel_generator.pdmodel
+│ │ ├── serving_client
+│ │ │ ├── serving_client_conf.prototxt
+│ │ │ ├── serving_client_conf.stream.prototxt
+│ │ ├── serving_server
+│ │ │ ├── __model__
+│ │ │ ├── __params__
+│ │ │ ├── serving_server_conf.prototxt
+│ │ │ ├── serving_server_conf.stream.prototxt
+│ │ │ ├── ...
+```
+
+The `serving_client_conf.prototxt` file under the `serving_client` folder describes the model's inputs and outputs in detail.
+The content of `serving_client_conf.prototxt` is:
+```
+feed_var {
+ name: "lqs"
+ alias_name: "lqs"
+ is_lod_tensor: false
+ feed_type: 1
+ shape: 1
+ shape: 2
+ shape: 3
+ shape: 180
+ shape: 320
+}
+fetch_var {
+ name: "stack_18.tmp_0"
+ alias_name: "stack_18.tmp_0"
+ is_lod_tensor: false
+ fetch_type: 1
+ shape: 1
+ shape: 2
+ shape: 3
+ shape: 720
+ shape: 1280
+}
+fetch_var {
+ name: "stack_19.tmp_0"
+ alias_name: "stack_19.tmp_0"
+ is_lod_tensor: false
+ fetch_type: 1
+ shape: 1
+ shape: 3
+ shape: 720
+ shape: 1280
+}
+```
+
+## 3. Start the PaddleServing Service
+
+```
+cd output_dir/multistagevsrmodel_generator/
+
+# GPU
+python -m paddle_serving_server.serve --model serving_server --port 9393 --gpu_ids 0
+
+# CPU
+python -m paddle_serving_server.serve --model serving_server --port 9393
+```
+
+## 4. Test the Deployed Service
+```
+# enter the exported model folder
+cd output/msvsr/
+```
+
+Set the `prototxt` file path to `serving_client/serving_client_conf.prototxt`.
+Set `fetch` to `fetch=["stack_19.tmp_0"]`.
+
+Run the test:
+```
+# enter the directory
+cd output/msvsr/
+
+# the test script test_client.py automatically creates an output folder and writes `output.mp4` into it
+python ../../deploy/serving/test_client.py input_video frame_num
+```
diff --git a/deploy/serving/test_client.py b/deploy/serving/test_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5e0ae73a50673840a376cdca37b69e2bfd74933
--- /dev/null
+++ b/deploy/serving/test_client.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import numpy as np
+from paddle_serving_client import Client
+from paddle_serving_app.reader import *
+import cv2
+import os
+import imageio
+
+def get_img(pred):
+ pred = pred.squeeze()
+ pred = np.clip(pred, a_min=0., a_max=1.0)
+ pred = pred * 255
+ pred = pred.round()
+ pred = pred.astype('uint8')
+ pred = np.transpose(pred, (1, 2, 0)) # chw -> hwc
+ return pred
+
+preprocess = Sequential([
+ BGR2RGB(), Resize(
+ (320, 180)), Div(255.0), Transpose(
+ (2, 0, 1))
+])
+
+client = Client()
+
+client.load_client_config("serving_client/serving_client_conf.prototxt")
+client.connect(['127.0.0.1:9393'])
+
+frame_num = int(sys.argv[2])
+
+cap = cv2.VideoCapture(sys.argv[1])
+fps = cap.get(cv2.CAP_PROP_FPS)
+size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
+ int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
+success, frame = cap.read()
+read_end = False
+res_frames = []
+output_dir = "./output"
+if not os.path.exists(output_dir):
+ os.makedirs(output_dir)
+
+while success:
+ frames = []
+ for i in range(frame_num):
+ if success:
+ frames.append(preprocess(frame))
+ success, frame = cap.read()
+ else:
+ read_end = True
+ if read_end: break
+
+ frames = np.stack(frames, axis=0)
+ fetch_map = client.predict(
+ feed={
+ "lqs": frames,
+ },
+ fetch=["stack_19.tmp_0"],
+ batch=False)
+ res_frames.extend([fetch_map["stack_19.tmp_0"][0][i] for i in range(frame_num)])
+
+imageio.mimsave("output/output.mp4",
+ [get_img(frame) for frame in res_frames],
+ fps=fps)
+
diff --git a/docs/en_US/apis/apps.md b/docs/en_US/apis/apps.md
deleted file mode 120000
index 5e0941aad3f32570ca9539cefba472e7e32efc25..0000000000000000000000000000000000000000
--- a/docs/en_US/apis/apps.md
+++ /dev/null
@@ -1 +0,0 @@
-../../zh_CN/apis/apps.md
\ No newline at end of file
diff --git a/docs/en_US/apis/apps.md b/docs/en_US/apis/apps.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d5e343d5b4632861a8ee04668d5483b0ecd31a6
--- /dev/null
+++ b/docs/en_US/apis/apps.md
@@ -0,0 +1,627 @@
+# Introduction of Prediction Interface
+
+PaddleGAN (ppgan.apps) provides prediction APIs covering multiple applications, including super resolution, video frame interpolation, colorization, makeup transfer, image animation, face parsing, etc. Integrated pre-trained high-performance models enable flexible and efficient usage and inference.
+
+* Colorization:
+ * [DeOldify](#ppgan.apps.DeOldifyPredictor)
+ * [DeepRemaster](#ppgan.apps.DeepRemasterPredictor)
+* Super Resolution:
+ * [RealSR](#ppgan.apps.RealSRPredictor)
+ * [PPMSVSR](#ppgan.apps.PPMSVSRPredictor)
+ * [PPMSVSRLarge](#ppgan.apps.PPMSVSRLargePredictor)
+ * [EDVR](#ppgan.apps.EDVRPredictor)
+ * [BasicVSR](#ppgan.apps.BasicVSRPredictor)
+ * [IconVSR](#ppgan.apps.IconVSRPredictor)
+ * [BasiVSRPlusPlus](#ppgan.apps.BasiVSRPlusPlusPredictor)
+* Video Frame Interpolation:
+ * [DAIN](#ppgan.apps.DAINPredictor)
+* Motion Driving:
+ * [FirstOrder](#ppgan.apps.FirstOrderPredictor)
+* Face:
+ * [FaceParse](#ppgan.apps.FaceParsePredictor)
+* Image Animation:
+ * [AnimeGAN](#ppgan.apps.AnimeGANPredictor)
+* Lip-syncing:
+ * [Wav2Lip](#ppgan.apps.Wav2LipPredictor)
+
+
+## Public Usage
+
+### Switch of CPU and GPU
+
+By default, if the GPU package of [PaddlePaddle](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/windows-pip.html) is installed, inference runs on the GPU; if the CPU package is installed, inference runs on the CPU.
+
+If you need to switch between CPU and GPU manually, you can do the following:
+
+
+```
+import paddle
+paddle.set_device('cpu') #set as CPU
+#paddle.set_device('gpu') #set as GPU
+```
+
+## ppgan.apps.DeOldifyPredictor
+
+```python
+ppgan.apps.DeOldifyPredictor(output='output', weight_path=None, render_factor=32)
+```
+
+> Build the instance of DeOldify. DeOldify is a coloring model based on GAN. The interface supports the colorization of images or videos. The recommended video format is mp4.
+>
+> **Example**
+>
+> ```python
+> from ppgan.apps import DeOldifyPredictor
+> deoldify = DeOldifyPredictor()
+> deoldify.run("docs/imgs/test_old.jpeg")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the save path should be set as output/DeOldify.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - artistic (bool): whether to use the "artistic" model, which may produce interesting colors, but there are more glitches.
+> > - render_factor (int): the zoom factor during image rendering and colorization. The image will be zoomed to a square with a side length of 16 x render_factor before being colorized. For example, with the default value of 32, the input image will be resized to 512x512 (16x32=512). Normally, the smaller the render_factor, the faster the computation and the more vivid the colors. Old images with low quality therefore usually benefit from a lower render_factor. The higher the value, the better the image quality, but the colors may fade slightly.
+### run
+
+```python
+run(input)
+```
+
+> The execution interface after building the instance.
+> **Parameters**
+>
+> > - input (str|np.ndarray|Image.Image): the input image or video file. For images, it could be its path, np.ndarray, or PIL.Image type. For videos, it could only be the file path.
+>
+>**Return Value**
+>
+>> - tuple(pred_img(np.array), out_path(str)): for image input, return the predicted image, PIL.Image type and the path where the image is saved.
+> > - tuple(frame_path(str), out_path(str)): for video input, frame_path is the save path of the images after colorizing each frame of the video, and out_path is the save path of the colorized video.
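+>
+> A minimal usage sketch of both cases (the input paths are placeholders):
+>
+> ```python
+> from ppgan.apps import DeOldifyPredictor
+> deoldify = DeOldifyPredictor()
+>
+> # image input: returns the colorized PIL.Image and its save path
+> pred_img, out_path = deoldify.run("docs/imgs/test_old.jpeg")
+>
+> # video input: returns the directory of colorized frames and the output video path
+> frame_path, video_path = deoldify.run("docs/imgs/test_old_video.mp4")
+> ```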
+### run_image
+
+```python
+run_image(img)
+```
+
+> The interface of image colorization.
+> **Parameters**
+>
+> > - img (str|np.ndarray|Image.Image): input image, it could be the path of the image, np.ndarray, or PIL.Image type.
+>
+>**Return Value**
+>
+>> - pred_img(PIL.Image): return the predicted image, PIL.Image type.
+### run_video
+
+```python
+run_video(video)
+```
+
+> The interface of video colorization.
+> **Parameters**
+>
+> > - Video (str): path of the input video files.
+>
+> **Return Value**
+>
+> > - tuple(frame_path(str), out_path(str)): frame_path is the save path of the images after colorizing each frame of the video, and out_path is the save path of the colorized video.
+
+
+## ppgan.apps.DeepRemasterPredictor
+
+```python
+ppgan.apps.DeepRemasterPredictor(output='output', weight_path=None, colorization=False, reference_dir=None, mindim=360)
+```
+
+> Build the instance of DeepRemasterPredictor. DeepRemaster is a GAN-based coloring and restoring model, which can provide input reference frames. Only video input is available now, and the recommended format is mp4.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import DeepRemasterPredictor
+> deep_remaster = DeepRemasterPredictor()
+> deep_remaster.run("docs/imgs/test_old.jpeg")
+> ```
+>
+>
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/DeepRemaster.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - colorization (bool): whether to enable the coloring function, default: False, in which case only the restoring function will be executed.
+> > - reference_dir (str|None): path of the reference frames when the coloring function is on; passing no reference frame is also allowed.
+> > - mindim(int): minimum side length of the resized image before prediction.
+### run
+
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameters**
+>
+> > - video_path (str): path of the video file.
+> >
+> > **Return Value**
+> >
+> > - tuple(str, str)): return two types of str, the former is the save path of each frame of the colorized video, the latter is the save path of the colorized video.
+
+
+## ppgan.apps.RealSRPredictor
+
+```python
+ppgan.apps.RealSRPredictor(output='output', weight_path=None)
+```
+
+> Build the instance of RealSR. RealSR (Real-World Super-Resolution via Kernel Estimation and Noise Injection, CVPR 2020 Workshops) is a super-resolution model trained on real-world images. The interface imposes 4x super resolution on the input image or video. The recommended video format is mp4.
+>
+> *Note: the size of the input image should be less than 1000x1000 px.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import RealSRPredictor
+> sr = RealSRPredictor()
+> sr.run("docs/imgs/test_sr.jpeg")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/RealSR.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameters**
+>
+> > - video_path (str): path of the video file.
+>
+>**Return Value**
+>
+>> - tuple(pred_img(np.array), out_path(str)): for image input, return the predicted image, PIL.Image type and the path where the image is saved.
+> > - tuple(frame_path(str), out_path(str)): for video input, frame_path is the save path of each frame of the video after super resolution, and out_path is the save path of the video after super resolution.
+### run_image
+
+```python
+run_image(img)
+```
+
+> The interface of image super resolution.
+> **Parameter**
+>
+> > - img (str|np.ndarray|Image.Image): input image, it could be the path of the image, np.ndarray, or PIL.Image type.
+>
+> **Return Value**
+>
+> > - pred_img(PIL.Image): return the predicted image, PIL.Image type.
+### run_video
+
+```python
+run_video(video)
+```
+
+> The interface of video super resolution.
+> **Parameter**
+>
+> > - Video (str): path of the video file.
+>
+> **Return Value**
+>
+> > - tuple(frame_path(str), out_path(str)): frame_path is the save path of each frame of the video after super resolution, and out_path is the save path of the video after super resolution.
+
+
+
+## ppgan.apps.PPMSVSRPredictor
+
+```python
+ppgan.apps.PPMSVSRPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+> Build the instance of PPMSVSR. PPMSVSR is a multi-stage VSR deep architecture. For more details, see the paper, PP-MSVSR: Multi-Stage Video Super-Resolution (https://arxiv.org/pdf/2112.02828.pdf). The interface imposes 4x super resolution on the input video. The recommended video format is mp4.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import PPMSVSRPredictor
+> sr = PPMSVSRPredictor()
+> # test a video file
+> sr.run("docs/imgs/test.mp4")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/EDVR.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - num_frames (int): the number of input frames of the PPMSVSR model, the default value: 10. Note that the larger the num_frames, the better the effect of the video after super resolution.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameter**
+>
+> > - video_path (str): path of the video files.
+>
+> **Return Value**
+>
+> > - tuple(str, str): the former is the save path of each frame of the video after super resolution, the latter is the save path of the video after super resolution.
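+>
+> A minimal usage sketch (the video path is a placeholder):
+>
+> ```python
+> from ppgan.apps import PPMSVSRPredictor
+> sr = PPMSVSRPredictor(num_frames=10)
+> frame_path, video_path = sr.run("docs/imgs/test.mp4")
+> ```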
+
+
+## ppgan.apps.PPMSVSRLargePredictor
+
+```python
+ppgan.apps.PPMSVSRLargePredictor(output='output', weight_path=None, num_frames=10)
+```
+
+> Build the instance of PPMSVSRLarge. PPMSVSRLarge is a Large PPMSVSR model. For more details, see the paper, PP-MSVSR: Multi-Stage Video Super-Resolution (https://arxiv.org/pdf/2112.02828.pdf). The interface imposes 4x super resolution on the input video. The recommended video format is mp4.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import PPMSVSRLargePredictor
+> sr = PPMSVSRLargePredictor()
+> # test a video file
+> sr.run("docs/imgs/test.mp4")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/EDVR.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - num_frames (int): the number of input frames of the PPMSVSR model, the default value: 10. Note that the larger the num_frames, the better the effect of the video after super resolution.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameter**
+>
+> > - video_path (str): path of the video files.
+>
+> **Return Value**
+>
+> > - tuple(str, str): the former is the save path of each frame of the video after super resolution, the latter is the save path of the video after super resolution.
+
+## ppgan.apps.EDVRPredictor
+
+```python
+ppgan.apps.EDVRPredictor(output='output', weight_path=None)
+```
+
+> Build the instance of EDVR. EDVR is a model designed for video super resolution. For more details, see the paper, EDVR: Video Restoration with Enhanced Deformable Convolutional Networks (https://arxiv.org/abs/1905.02716). The interface imposes 4x super resolution on the input video. The recommended video format is mp4.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import EDVRPredictor
+> sr = EDVRPredictor()
+> # test a video file
+> sr.run("docs/imgs/test.mp4")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/EDVR.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameter**
+>
+> > - video_path (str): path of the video files.
+>
+> **Return Value**
+>
+> > - tuple(str, str): the former is the save path of each frame of the video after super resolution, the latter is the save path of the video after super resolution.
+
+
+## ppgan.apps.BasicVSRPredictor
+
+```python
+ppgan.apps.BasicVSRPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+> Build the instance of BasicVSR. BasicVSR is a model designed for video super resolution. For more details, see the paper, BasicVSR: The Search for Essential Components in Video Super-Resolution and Beyond (https://arxiv.org/pdf/2012.02181.pdf). The interface imposes 4x super resolution on the input video. The recommended video format is mp4.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import BasicVSRPredictor
+> sr = BasicVSRPredictor()
+> # test a video file
+> sr.run("docs/imgs/test.mp4")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/EDVR.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - num_frames (int): the number of input frames of the model, the default value: 10. Note that the larger the num_frames, the better the effect of the video after super resolution.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameter**
+>
+> > - video_path (str): path of the video files.
+>
+> **Return Value**
+>
+> > - tuple(str, str): the former is the save path of each frame of the video after super resolution, the latter is the save path of the video after super resolution.
+
+## ppgan.apps.IconVSRPredictor
+
+```python
+ppgan.apps.IconVSRPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+> Build the instance of IconVSR. IconVSR is a VSR model expanded by BasicVSR. For more details, see the paper, BasicVSR: The Search for Essential Components in Video Super-Resolution and Beyond (https://arxiv.org/pdf/2012.02181.pdf). The interface imposes 4x super resolution on the input video. The recommended video format is mp4.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import IconVSRPredictor
+> sr = IconVSRPredictor()
+> # test a video file
+> sr.run("docs/imgs/test.mp4")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/EDVR.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - num_frames (int): the number of input frames of the model, the default value: 10. Note that the larger the num_frames, the better the effect of the video after super resolution.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameter**
+>
+> > - video_path (str): path of the video files.
+>
+> **Return Value**
+>
+> > - tuple(str, str): the former is the save path of each frame of the video after super resolution, the latter is the save path of the video after super resolution.
+
+
+## ppgan.apps.BasiVSRPlusPlusPredictor
+
+```python
+ppgan.apps.BasiVSRPlusPlusPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+> Build the instance of BasiVSRPlusPlus. BasiVSRPlusPlus is a model designed for video super resolution. For more details, see the paper, BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment (https://arxiv.org/pdf/2104.13371v1.pdf). The interface imposes 4x super resolution on the input video. The recommended video format is mp4.
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import BasiVSRPlusPlusPredictor
+> sr = BasiVSRPlusPlusPredictor()
+> # test a video file
+> sr.run("docs/imgs/test.mp4")
+> ```
+> **Parameters**
+>
+> > - output (str): path of the output image, default: output. Note that the path should be set as output/EDVR.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - num_frames (int): the number of input frames of the model, the default value: 10. Note that the larger the num_frames, the better the effect of the video after super resolution.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameter**
+>
+> > - video_path (str): path of the video files.
+>
+> **Return Value**
+>
+> > - tuple(str, str): the former is the save path of each frame of the video after super resolution, the latter is the save path of the video after super resolution.
+
+
+
+## ppgan.apps.DAINPredictor
+
+```python
+ppgan.apps.DAINPredictor(output='output', weight_path=None,time_step=None, use_gpu=True, key_frame_thread=0,remove_duplicates=False)
+```
+
+> Build the instance of DAIN model. DAIN supports video frame interpolation, producing videos with higher frame rate. For more details, see the paper, DAIN: Depth-Aware Video Frame interpolation (https://arxiv.org/abs/1904.00830).
+>
+> *Note: The interface is only available in static graph, add the following codes to enable static graph before using it:
+>
+> ```
+> import paddle
+> paddle.enable_static() #enable static graph
+> paddle.disable_static() #disable static graph
+> ```
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import DAINPredictor
+> dain = DAINPredictor(time_step=0.5)  # time_step has no default value and must be specified manually
+> # test a video file
+> dain.run("docs/imgs/test.mp4")
+> ```
+> **Parameters**
+>
+> > - output_path (str): path of the predicted output, default: output. Note that the path should be set as output/DAIN.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - time_step (float): the frame rate changes by a factor of 1./time_step, e.g. 2x frames if time_step is 0.5 and 4x frames if it is 0.25.
+> > - use_gpu (bool): whether to make predictions by using GPU, default: True.
+> > - remove_duplicates (bool): whether to remove duplicates, default: False.
+```python
+run(video_path)
+```
+
+> The execution interface after building the instance.
+> **Parameters**
+>
+> > - video_path (str): path of the video file.
+>
+> **Return Value**
+>
+> > - tuple(str, str): for video input, the former is the save path of the interpolated frames, and the latter is the save path of the interpolated video.
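+>
+> A minimal usage sketch (the video path is a placeholder; static graph mode must be enabled first, as noted above):
+>
+> ```python
+> import paddle
+> from ppgan.apps import DAINPredictor
+>
+> paddle.enable_static()
+> dain = DAINPredictor(time_step=0.5)  # interpolate to 2x the original frame rate
+> frame_path, video_path = dain.run("docs/imgs/test.mp4")
+> paddle.disable_static()
+> ```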
+
+
+## ppgan.apps.FirstOrderPredictor
+
+```python
+ppgan.apps.FirstOrderPredictor(output='output', weight_path=None,config=None, relative=False, adapt_scale=False,find_best_frame=False, best_frame=None)
+```
+
+> Build the instance of FirstOrder model. The model is dedicated to Image Animation, i.e., generating a video sequence so that an object in a source image is animated according to the motion of a driving video.
+>
+> For more details, see paper, First Order Motion Model for Image Animation (https://arxiv.org/abs/2003.00196) .
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import FirstOrderPredictor
+> animate = FirstOrderPredictor()
+> # test a video file
+> animate.run("source.png","driving.mp4")
+> ```
+> **Parameters**
+>
+> > - output_path (str): path of the predicted output, default: output. Note that the path should be set as output/result.mp4.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> > - config (dict|str|None): model configuration, it can be a dictionary type or a YML file, and the default value None is adopted. When the weight is the default None, the config also needs to keep the default value None; otherwise, the configuration here should be consistent with the corresponding weight.
+> > - relative (bool): indicate whether the relative or absolute coordinates of key points in the video are used in the program, default: False.
+> > - adapt_scale (bool): adapt movement scale based on convex hull of key points, default: False.
+> > - find_best_frame (bool): whether to start generating from the frame that best matches the source image, which exclusively applies to face applications and requires libraries with face alignment.
+> > - best_frame (int): set the number of the starting frame, default: None, that is, starting from the first frame(counting from 1).
+```python
+run(source_image,driving_video)
+```
+
+> The execution interface after building the instance, the predicted video is saved in output/result.mp4.
+> **Parameters**
+>
+> > - source_image (str): input the source image.
+> > - driving_video (str): input the driving video, mp4 format recommended.
+>
+> **Return Value**
+>
+> > None.
+## ppgan.apps.FaceParsePredictor
+
+```python
+ppgan.apps.FaceParsePredictor(output_path='output')
+```
+> Build the instance of the face parsing model. The model assigns a pixel-wise label to each semantic component (e.g. hair, lips, nose, ears, etc.) of the input facial image. The task is performed with the help of BiSeNet.
+>
+> For more details, see the paper, BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation (https://arxiv.org/abs/1808.00897v1).
+>
+> *Note: dlib package is needed for this interface, use the following codes to install it:
+>
+> ```
+> pip install dlib
+> ```
+> It may take a long time to install this package under Windows, please be patient.
+>
+> **Parameters:**
+>
+> > - input_image: path of the input image to be parsed
+> > - output_path: path of the output to be saved
+> **Example:**
+>
+> ```
+> from ppgan.apps import FaceParsePredictor
+> parser = FaceParsePredictor()
+> parser.run('docs/imgs/face.png')
+> ```
+> **Return Value:**
+>
+> > - mask(numpy.ndarray): return the mask matrix of the parsed facial components, data type: numpy.ndarray.
+## ppgan.apps.AnimeGANPredictor
+
+```python
+ppgan.apps.AnimeGANPredictor(output_path='output_dir',weight_path=None,use_adjust_brightness=True)
+```
+> Adopt the AnimeGAN v2 to realize the animation of scenery images.
+>
+> For more details, see the paper, AnimeGAN: A Novel Lightweight GAN for Photo Animation (https://link.springer.com/chapter/10.1007/978-981-15-5577-0_18).
+> **Parameters:**
+>
+> > - input_image: path of the input image to be parsed.
+> **Example:**
+>
+> ```
+> from ppgan.apps import AnimeGANPredictor
+> predictor = AnimeGANPredictor()
+> predictor.run('docs/imgs/animeganv2_test.jpg')
+> ```
+> **Return Value:**
+>
+> > - anime_image(numpy.ndarray): return the stylized scenery image.
+
+## ppgan.apps.MiDaSPredictor
+
+```python
+ppgan.apps.MiDaSPredictor(output=None, weight_path=None)
+```
+
+> MiDaSv2 is a monocular depth estimation model (see https://github.com/intel-isl/MiDaS). Monocular depth estimation is a method used to compute depth from a single RGB image.
+>
+> For more details, see the paper Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer (https://arxiv.org/abs/1907.01341v3).
+> **Example**
+>
+> ```python
+> from ppgan.apps import MiDaSPredictor
+> # if set output, will write depth pfm and png file in output/MiDaS
+> model = MiDaSPredictor()
+> prediction = model.run()
+> ```
+>
+> Color display of the depth image:
+>
+> ```python
+> import numpy as np
+> import PIL.Image as Image
+> import matplotlib as mpl
+> import matplotlib.cm as cm
+>
+> vmax = np.percentile(prediction, 95)
+> normalizer = mpl.colors.Normalize(vmin=prediction.min(), vmax=vmax)
+> mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
+> colormapped_im = (mapper.to_rgba(prediction)[:, :, :3] * 255).astype(np.uint8)
+> im = Image.fromarray(colormapped_im)
+> im.save('test_disp.jpeg')
+> ```
+>
+> **Parameters:**
+>
+> > - output (str): path of the output, if it is None, no pfm and png depth image will be saved.
+> > - weight_path (str): path of the model, default: None, pre-trained integral model will then be automatically downloaded.
+> **Return Value:**
+>
+> > - prediction (numpy.ndarray): return the prediction.
+> > - pfm_f (str): return the save path of pfm files if the output path is set.
+> > - png_f (str): return the save path of png files if the output path is set.
+
+## ppgan.apps.Wav2LipPredictor
+
+```python
+ppgan.apps.Wav2LipPredictor(face=None, audio_seq=None, outfile=None)
+```
+
+> Build the instance for the Wav2Lip model, which is used for lip generation, i.e., achieving the synchronization of lip movements on a talking face video and the voice from an input audio.
+>
+> For more details, see the paper, A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild (http://arxiv.org/abs/2008.10010).
+>
+> **Example**
+>
+> ```
+> from ppgan.apps import Wav2LipPredictor
+> import ppgan
+> predictor = Wav2LipPredictor()
+> predictor.run('/home/aistudio/先烈.jpeg', '/home/aistudio/pp_guangquan_zhenzhu46s.mp4','wav2lip')
+> ```
+> **Parameters:**
+> - face (str): path of images or videos containing human face.
+> - audio_seq (str): path of the input audio, any processable format in ffmpeg is supported, including `.wav`, `.mp3`, `.m4a` etc.
+> - outfile (str): path of the output video file.
+>**Return Value**
+>
+>> None
diff --git a/docs/en_US/config_doc.md b/docs/en_US/config_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..a4f3f7065357935f8b915b0b72bd0deed947f862
--- /dev/null
+++ b/docs/en_US/config_doc.md
@@ -0,0 +1,77 @@
+# Instruction of Config Files
+
+## Introduction of Parameters
+
+Take`lapstyle_rev_first.yaml` as an example.
+
+### Global
+
+| Field | Usage | Default |
+| ------------------------- | :------------------------- | --------------- |
+| total_iters | total training steps | 30000 |
+| min_max | numeric range of tensor (for image storage) | (0., 1.) |
+| output_dir | path of the output | ./output_dir |
+| snapshot_config: interval | interval for saving model parameters | 5000 |
+
+### Model
+
+| Field | Usage | Default |
+| :---------------------- | -------- | ------ |
+| name | name of the model | LapStyleRevFirstModel |
+| revnet_generator | set the revnet generator | RevisionNet |
+| revnet_discriminator | set the revnet discriminator | LapStyleDiscriminator |
+| draftnet_encode | set the draftnet encoder | Encoder |
+| draftnet_decode | set the draftnet decoder | DecoderNet |
+| calc_style_emd_loss | set the style loss 1 | CalcStyleEmdLoss |
+| calc_content_relt_loss | set the content loss 1 | CalcContentReltLoss |
+| calc_content_loss | set the content loss 2 | CalcContentLoss |
+| calc_style_loss | set the style loss 2 | CalcStyleLoss |
+| gan_criterion: name | set the GAN loss | GANLoss |
+| gan_criterion: gan_mode | set the modal parameter of GAN loss | vanilla |
+| content_layers | set the network layer that calculates content loss 2 |['r11', 'r21', 'r31', 'r41', 'r51']|
+| style_layers | set the network layer that calculates style loss 2 | ['r11', 'r21', 'r31', 'r41', 'r51'] |
+| content_weight | set the weight of total content loss | 1.0 |
+| style_weight | set the weight of total style loss | 3.0 |
+
+### Dataset (train & test)
+
+| Field | Usage | Default |
+| :----------- | -------------------- | -------------------- |
+| name | name of the dataset | LapStyleDataset |
+| content_root | path of the dataset | data/coco/train2017/ |
+| style_root | path of the target style image | data/starrynew.png |
+| load_size | image size after resizing the input image | 280 |
+| crop_size | image size after random cropping | 256 |
+| num_workers | number of worker processes | 16 |
+| batch_size | size of the data sample for one training session | 5 |
+
+### Lr_scheduler
+
+| Field | Usage | Default |
+| :------------ | ---------------- | -------------- |
+| name | name of the learning strategy | NonLinearDecay |
+| learning_rate | initial learning rate | 1e-4 |
+| lr_decay | decay rate of the learning rate | 5e-5 |
+
+### Optimizer
+
+| Field | Usage | Default |
+| :-------- | ---------- | ------- |
+| name | class name of the optimizer | Adam |
+| net_names | the network under the optimizer | net_rev |
+| beta1 | set beta1, parameter of the optimizer | 0.9 |
+| beta2 | set beta2, parameter of the optimizer | 0.999 |
+
+### Validate
+
+| Field | Usage | Default |
+| :------- | ---- | ------ |
+| interval | validation interval | 500 |
+| save_img | whether to save image while validating | false |
+
+### Log_config
+
+| Field | Usage | Default |
+| :--------------- | ---- | ------ |
+| interval | log printing interval | 10 |
+| visiual_interval | interval for saving the generated images during training | 500 |
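+
+The fields above map one-to-one onto the keys in the YAML file, so they can also be read programmatically. A minimal sketch (assuming PyYAML is installed and the file lives at `configs/lapstyle_rev_first.yaml`; key names follow the tables above):
+
+```python
+import yaml
+
+with open("configs/lapstyle_rev_first.yaml") as f:
+    cfg = yaml.safe_load(f)
+
+# global fields
+print(cfg["total_iters"], cfg["output_dir"])
+
+# nested sections, e.g. the optimizer and the log settings
+print(cfg["optimizer"]["name"], cfg["log_config"]["interval"])
+```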
diff --git a/docs/en_US/data_prepare.md b/docs/en_US/data_prepare.md
index 7fde7c98b1d3e53df2547b85d2b5028695e98c65..be8589b8d8173395ab26449e73cd120b1e3ea82f 100644
--- a/docs/en_US/data_prepare.md
+++ b/docs/en_US/data_prepare.md
@@ -1,6 +1,8 @@
-## Data prepare
+## Data Preparation
-The config will suppose your data put in `$PaddleGAN/data`. You can symlink your datasets to `$PaddleGAN/data`.
+## **1. Dataset Path Configuration**
+
+The config assumes your data is in `$PaddleGAN/data`. You can symlink your datasets to `$PaddleGAN/data`.
```
PaddleGAN
@@ -28,7 +30,7 @@ PaddleGAN
```
-If you put your datasets on other place,for example ```your/data/path```, you can also change ```dataroot``` in config file:
+If you put the datasets in another place, for example ```your/data/path```, you can also change ```dataroot``` in the config file:
```
dataset:
@@ -38,22 +40,30 @@ dataset:
num_workers: 4
```
-### Datasets of CycleGAN
+## 2. Datasets Preparation
+
+### 2.1 Download of Datasets
+
+#### 2.1.1 Datasets of CycleGAN
+
+- #### Download from website
-#### download form website
-Datasets for CycleGAN can be downloaded from [here](https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/)
+Datasets of CycleGAN can be downloaded from [here](https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/); remember to symlink your datasets to `$PaddleGAN/data`.
-#### download by script
+- #### Download by script
-You can use ```download_cyclegan_data.py``` in ```PaddleGAN/data``` to download datasets you wanted. Supported datasets are: apple2orange, summer2winter_yosemite,horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos, cityscapes。
+
+You can use ```download_cyclegan_data.py``` in ```PaddleGAN/data``` to download the datasets you want.
+
+Supported datasets are: apple2orange, summer2winter_yosemite, horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos.
run following command. Dataset will be downloaded to ```~/.cache/ppgan``` and symlink to ```PaddleGAN/data/``` .
```
python data/download_cyclegan_data.py --name horse2zebra
```
-#### custom dataset
+#### Custom dataset
Data should be arranged in following way if you use custom dataset.
@@ -65,13 +75,15 @@ custom_datasets
└── trainB
```
-### Datasets of Pix2Pix
+#### 2.1.2 Datasets of Pix2Pix
+
+- #### Download from website
-#### Download from website
Dataset for pix2pix can be downloaded from [here](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/)
-#### Download by script
+- #### Download by script
+
You can use ```download_pix2pix_data.py``` in ```PaddleGAN/data``` to download datasets you wanted. Supported datasets are: apple2orange, summer2winter_yosemite,horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos, cityscapes.
@@ -82,7 +94,7 @@ python data/download_pix2pix_data.py --name cityscapes
```
#### Custom datasets
-Data should be arranged in following way if you use custom dataset. And image content shoubld be same with example image.
+Data should be arranged in following way if you use custom dataset. And image content should be the same with example image.
```
facades
diff --git a/docs/en_US/get_started.md b/docs/en_US/get_started.md
index 5a3fe7cd5e7dfbd3e80609bb918b78894d0e1da4..85de5fb56a26e630aad35d74a1e835ff39275f76 100644
--- a/docs/en_US/get_started.md
+++ b/docs/en_US/get_started.md
@@ -1,28 +1,63 @@
-## Getting started with PaddleGAN
+# Quick Start
-Note:
-* Before starting to use PaddleGAN, please make sure you have read the [install document](./install_en.md), and prepare the dataset according to the [data preparation document](./data_prepare_en.md)
-* The following tutorial uses the train and evaluate of the CycleGAN model on the Cityscapes dataset as an example
+PaddleGAN is a PaddlePaddle Generative Adversarial Network (GAN) development kit that provides high-performance implementations of a variety of classical networks, with applications covering areas such as image generation, style transfer, motion driving, image/video super resolution and colorization.
-### Train
+This section will teach you how to quickly get started with PaddleGAN, using the train and evaluate of the CycleGAN model on the Cityscapes dataset as an example.
+
+Note that all model configuration files in PaddleGAN are available at [./PaddleGAN/configs](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/configs).
+
+## Contents
+
+* [Installation](#Installation)
+* [Data preparation](#Data-preparation)
+* [Training](#Training)
+ * [Single Card Training](#Single-Card-Training)
+ * [Parameters](#Parameters)
+ * [Visualize Training](#Visualize-Training)
+ * [Resume Training](#Resume-Training)
+ * [Multi-Card Training](#Multi-Card-Training)
+* [Evaluation](#Evaluation)
+
+## Installation
+
+For installation and configuration of the runtime environment, please refer to the [installation documentation](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/install.md) to complete the installation of PaddlePaddle and PaddleGAN.
+
+In this demo, it is assumed that the user cloned and placed the code of PaddleGAN in the '/home/paddle' directory. The user executes the command operations in the '/home/paddle/PaddleGAN' directory.
+
+## Data preparation
+
+Prepare the Cityscapes dataset according to the [data preparation](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/data_prepare.md).
+
+Download the Cityscapes dataset to ~/.cache/ppgan and symlink it to PaddleGAN/data/ using the following script.
-#### Train with single gpu
```
-python -u tools/main.py --config-file configs/cyclegan_cityscapes.yaml
+python data/download_cyclegan_data.py --name cityscapes
```
-#### Args
+## Training
-- `--config-file (str)`: path of config file。
+### 1. Single Card Training
-The output log, weight, and visualization result will be saved in ```./output_dir``` by default, which can be modified by the ```output_dir``` parameter in the config file:
```
- output_dir: output_dir
+ python -u tools/main.py --config-file configs/cyclegan_cityscapes.yaml
```
+#### Parameters
+
+* `--config-file (str)`: path to the config file. This is the configuration file used here for CycleGAN training on the Cityscapes dataset.
+
+* The output logs, weights, and visualization results are saved by default in `./output_dir`, which can be modified by the `output_dir` parameter in the configuration file:
+ ```
+ output_dir: output_dir
+ ```
+
+
+

+
+
+* The saved folder will automatically generate a new directory based on the model name and timestamp; an example of the directory layout is shown below.
-The saved folder will automatically generate a new directory based on the model name and timestamp. The directory example is as follows:
```
output_dir
└── CycleGANModel-2020-10-29-09-21
@@ -47,31 +82,57 @@ output_dir
└── epoch002_rec_B.png
```
-Also, you can add the parameter ```enable_visualdl: true``` in the configuration file, use [PaddlePaddle VisualDL](https://github.com/PaddlePaddle/VisualDL) record the metrics or images generated in the training process, and run the command to monitor the training process:
+#### Visualize Training
+
+[VisualDL](https://github.com/PaddlePaddle/VisualDL) is a visual analysis tool developed for deep learning model development, providing real-time trend visualization of key metrics, visualization of intermediate training samples, network structure visualization, etc. It can visually show the relationship between hyperparameters and model performance and assist in efficient tuning.
+
+Please make sure that you have installed [VisualDL](https://github.com/PaddlePaddle/VisualDL). Refer to the [VisualDL installation guide](https://github.com/PaddlePaddle/VisualDL/blob/develop/README.md#Installation).
+
+Use [VisualDL](https://github.com/PaddlePaddle/VisualDL) to record the metrics or images generated during training by adding the parameter `enable_visualdl: True` to the configuration file cyclegan_cityscapes.yaml, and run the corresponding command to monitor the training process in real time.
+
+
+

+
+
+
+
+If you want to customize the content of the [VisualDL](https://github.com/PaddlePaddle/VisualDL) visualization, you can go to ./PaddleGAN/ppgan/engine/trainer.py.
+
+Launch [VisualDL](https://github.com/PaddlePaddle/VisualDL) locally by:
+
```
visualdl --logdir output_dir/CycleGANModel-2020-10-29-09-21/
```
-#### Recovery of training
+Please refer to the [VisualDL User's Guide](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/components/README.md) for more guidance on how to start and use those visualization functions.
+
+#### Resume Training
+
+The checkpoint of the previous epoch is saved in `output_dir` by default during the training process to facilitate resuming the training.
+
+In this demo, CycleGAN training saves a checkpoint every five epochs by default; if you want to change this interval, adjust the **`interval` parameter in the config file**.
+
+
+

+
-The checkpoint of the previous epoch will be saved by default during the training process to facilitate the recovery of training
```
python -u tools/main.py --config-file configs/cyclegan_cityscapes.yaml --resume your_checkpoint_path
```
-#### Args
-
- `--resume (str)`: path of checkpoint。
-#### Train with multiple gpus:
+### 2. Multi-Card Training
+
```
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch tools/main.py --config-file configs/cyclegan_cityscapes.yaml
```
-### evaluate
+## Evaluation
```
python tools/main.py --config-file configs/cyclegan_cityscapes.yaml --evaluate-only --load your_weight_path
```
#### Args
-- `--evaluate-only`: whether to evaluate only。
-- `--load (str)`: path of weight。
+- `--evaluate-only`: whether to run evaluation/prediction only
+- `--load (str)`: path of the weights
+
diff --git a/docs/en_US/industrial_solution/photo_color_en.md b/docs/en_US/industrial_solution/photo_color_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..83c58b7e1dad06e21583e7ebc95b4d68d82705b6
--- /dev/null
+++ b/docs/en_US/industrial_solution/photo_color_en.md
@@ -0,0 +1,43 @@
+# Image Colorization
+PaddleGAN provides [DeOldify](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeoldifypredictor) model for image colorization.
+
+## DeOldifyPredictor
+
+[DeOldify](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeoldifypredictor) is a generative adversarial network with a self-attention mechanism. Its generator is a U-Net structured network, which performs well on image/video colorization.
+
+
+

+
+
+### Parameters
+
+- `output (str, optional)`: path of the output folder, default value: `output`
+- `weight_path (None, optional)`: path to load weights, if not set, the default weights will be downloaded locally from the cloud. Default value: `None`
+- `artistic (bool)`: whether or not to use the "artistic" model. "Artistic" models are likely to produce some interesting colors, but with some glitches.
+- `render_factor (int)`: This parameter will be multiplied by 16 and used as the resize value for the input frame. If the value is set to 32, the input frame will be resized to a size of (32 * 16, 32 * 16) and fed into the network.
+
+
+### Usage
+**1. API Prediction**
+
+```
+from ppgan.apps import DeOldifyPredictor
+deoldify = DeOldifyPredictor()
+deoldify.run("/home/aistudio/先烈.jpg")  # path of the original image
+```
+*The `run` interface is a common interface for images and videos; since the input here is an image, the `run_image` interface is also suitable.
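+
+A minimal sketch of calling `run_image` directly (the input path is the same placeholder as above):
+
+```
+from ppgan.apps import DeOldifyPredictor
+deoldify = DeOldifyPredictor()
+colorized = deoldify.run_image("/home/aistudio/先烈.jpg")  # returns a PIL.Image
+```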
+
+[Complete API interface usage instructions]()
+
+**2. Command-Line Prediction**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/先烈.jpg \ #Original image path
+ --process_order DeOldify \ #Order of processing of the original image
+ --output output_dir #Path of the final image
+```
+
+### Experience Online Projects
+**1. [Old Beijing City Video Restoration](https://aistudio.baidu.com/aistudio/projectdetail/1161285)**
+
+**2. [PaddleGAN ❤️ 520 Edition](https://aistudio.baidu.com/aistudio/projectdetail/1956943?channelType=0&channel=0)**
diff --git a/docs/en_US/install.md b/docs/en_US/install.md
index 0439fdfb5c17972a4b916815a05cf9245eaa1a2e..c4fddc88cf6b52ba102644cc908d31bf1b8d0054 100644
--- a/docs/en_US/install.md
+++ b/docs/en_US/install.md
@@ -1,68 +1,81 @@
-## Install PaddleGAN
-### requirements
-* PaddlePaddle >= 2.0.0-rc
+## Installation
+
+This document contains how to install PaddleGAN and related dependencies. For more product overview, please refer to [README](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/README_en.md).
+
+### Requirements
+
+* PaddlePaddle >= 2.1.0
* Python >= 3.6
-* CUDA >= 9.0
+* CUDA >= 10.1
-### 1. Install PaddlePaddle
+### Install PaddlePaddle
```
-pip install -U paddlepaddle-gpu==2.0.0rc0
+# CUDA10.1
+python -m pip install paddlepaddle-gpu==2.1.0.post101 -f https://mirror.baidu.com/pypi/simple
+
+# CPU
+python -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
```
-Note: command above will install paddle with cuda10.2,if your installed cuda is different, you can choose an proper version to install from table below.
+For more installation methods such as conda or source compilation installation methods, please refer to the [PaddlePaddle installation documentation](https://www.paddlepaddle.org.cn/documentation/docs/en/install/index_en.html).
- CUDA | python3.8 | python3.7 | python3.6 | 10.1 | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10.1-cudnn7-mkl_gcc8.2%2Fpaddlepaddle_gpu-2.0.0rc0.post101-cp38-cp38-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10.1-cudnn7-mkl_gcc8.2%2Fpaddlepaddle_gpu-2.0.0rc0.post101-cp37-cp37m-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10.1-cudnn7-mkl_gcc8.2%2Fpaddlepaddle_gpu-2.0.0rc0.post101-cp36-cp36m-linux_x86_64.whl
- | |
10.0 | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post100-cp38-cp38-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post100-cp37-cp37m-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post100-cp36-cp36m-linux_x86_64.whl
- | |
9.0 | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda9-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post90-cp38-cp38-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda9-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post90-cp37-cp37m-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda9-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post90-cp36-cp36m-linux_x86_64.whl
- |
+Make sure PaddlePaddle is installed at the required version or higher, then verify the installation with the following commands.
-Visit home page of [paddlepaddle](https://www.paddlepaddle.org.cn/install/quick) for support of other systems, such as Windows10.
+```
+# verify that PaddlePaddle is installed successfully in your Python interpreter
+>>> import paddle
+>>> paddle.utils.run_check()
-### 2. Install paddleGAN
+# Confirm PaddlePaddle version
+python -c "import paddle; print(paddle.__version__)"
+```
-#### 2.1 Install through pip
+### Install PaddleGAN
+
+#### 1. Install via pip (Python 3 only)
+
+* Install
```
-# only support Python3
python3 -m pip install --upgrade ppgan
```
-Download the examples and configuration files via cloning the source code:
+* Download the examples and configuration files via cloning the source code:
```
git clone https://github.com/PaddlePaddle/PaddleGAN
cd PaddleGAN
```
-#### 2.2 Install through source code
+#### 2. Install via source code
```
git clone https://github.com/PaddlePaddle/PaddleGAN
cd PaddleGAN
+
pip install -v -e . # or "python setup.py develop"
+
+# Install other dependencies
+pip install -r requirements.txt
```
-### 4. Installation of other tools that may be used
+### Other Third-Party Tool Installation
-#### 4.1 ffmpeg
+#### 1. ffmpeg
-If you need to use ppgan to handle video-related tasks, you need to install ffmpeg. It is recommended that you use [conda](https://docs.conda.io/en/latest/miniconda.html) to install:
+All tasks involving video require `ffmpeg` to be installed. We recommend installing it with [conda](https://docs.conda.io/en/latest/miniconda.html):
```
conda install x264=='1!152.20180717' ffmpeg=4.0.2 -c conda-forge
```
-#### 4.2 Visual DL
-If you want to use [PaddlePaddle VisualDL](https://github.com/PaddlePaddle/VisualDL) to monitor the training process, Please install `VisualDL`(For more detail refer [here](./get_started.md)):
+#### 2. VisualDL
+If you want to use [PaddlePaddle VisualDL](https://github.com/PaddlePaddle/VisualDL) to visualize the training process, please install `VisualDL` (for more details, refer to [here](./get_started.md)):
```
python -m pip install visualdl -i https://mirror.baidu.com/pypi/simple
```
+
+*Note: VisualDL officially supports only installations under Python 3 or higher.
diff --git a/docs/en_US/tutorials/animegan.md b/docs/en_US/tutorials/animegan.md
index 38aaaf349f9330303dfb328cbf20cf5505f5f723..d216719e147ee77852846a777ba26035643c8c41 100644
--- a/docs/en_US/tutorials/animegan.md
+++ b/docs/en_US/tutorials/animegan.md
@@ -70,7 +70,7 @@ animedataset
2. After the warmup, we strat to training GAN.:
**NOTE:** you must modify the `configs/animeganv2.yaml > pretrain_ckpt ` parameter first! ensure the GAN can reuse the warmup generator model.
- Set the `batch size=4` and the `learning rate=0.00002`. Train 30 epochs on a GTX2060S GPU to reproduce the result. For other hyperparameters, please refer to `configs/animeganv2.yaml`.
+ Set the `batch size=4` and the `learning rate=0.0002`. Train 30 epochs on a GTX2060S GPU to reproduce the result. For other hyperparameters, please refer to `configs/animeganv2.yaml`.
```sh
python tools/main.py --config-file configs/animeganv2.yaml
```
diff --git a/docs/en_US/tutorials/aotgan.md b/docs/en_US/tutorials/aotgan.md
new file mode 100644
index 0000000000000000000000000000000000000000..9d0c3609ed359e9ed1586fccfb24f9d1db8ddae4
--- /dev/null
+++ b/docs/en_US/tutorials/aotgan.md
@@ -0,0 +1,89 @@
+# AOT GAN
+
+## 1 Principle
+
+ The Aggregated Contextual-Transformation GAN (AOT-GAN) is designed for high-resolution image inpainting. The AOT blocks aggregate contextual
+transformations from various receptive fields, allowing the model to capture both informative distant image contexts and rich patterns of interest
+for context reasoning.
+
+
+
+**Paper:** [Aggregated Contextual Transformations for High-Resolution Image Inpainting](https://paperswithcode.com/paper/aggregated-contextual-transformations-for)
+
+**Official Repo:** [https://github.com/researchmm/AOT-GAN-for-Inpainting](https://github.com/researchmm/AOT-GAN-for-Inpainting)
+
+
+## 2 How to use
+
+### 2.1 Prediction
+
+Download pretrained generator weights from: (https://paddlegan.bj.bcebos.com/models/AotGan_g.pdparams)
+
+```
+python applications/tools/aotgan.py \
+ --input_image_path data/aotgan/armani1.jpg \
+ --input_mask_path data/aotgan/armani1.png \
+ --weight_path test/aotgan/g.pdparams \
+ --output_path output_dir/armani_pred.jpg \
+ --config-file configs/aotgan.yaml
+```
+Parameters:
+* input_image_path: path of the input image
+* input_mask_path: path of the input mask
+* weight_path: path of the pretrained generator weights
+* output_path: path of the predicted image
+* config-file: yaml config file, the same one used for training
+
+AI Studio Project: (https://aistudio.baidu.com/aistudio/datasetdetail/165081)
+
+### 2.2 Train
+
+Data Preparation:
+
+The pretrained model uses 'Place365Standard' and 'NVIDIA Irregular Mask' as its training datasets. You can download them from ([Place365Standard](http://places2.csail.mit.edu/download.html)) and ([NVIDIA Irregular Mask Dataset](https://nv-adlr.github.io/publication/partialconv-inpainting)).
+
+```
+└─data
+ └─aotgan
+ ├─train_img
+ ├─train_mask
+ ├─val_img
+ └─val_mask
+```
+Train (single card):
+
+`python -u tools/main.py --config-file configs/aotgan.yaml`
+
+Train (multi-card):
+
+```
+python -m paddle.distributed.launch \
+    tools/main.py \
+    --config-file configs/aotgan.yaml \
+    -o dataset.train.batch_size=6
+```
+Train (resume from a checkpoint):
+
+```
+python -u tools/main.py \
+ --config-file configs/aotgan.yaml \
+ --resume output_dir/[path_to_checkpoint]/iter_[iternumber]_checkpoint.pdparams
+```
+
+# Results
+
+On Places365-Val Dataset
+
+| mask | PSNR | SSIM | download |
+| ---- | ---- | ---- | ---- |
+| 20-30% | 26.04001 | 0.89011 | [download](https://paddlegan.bj.bcebos.com/models/AotGan_g.pdparams) |
+
+# References
+
+```
+@inproceedings{yan2021agg,
+  author = {Zeng, Yanhong and Fu, Jianlong and Chao, Hongyang and Guo, Baining},
+  title = {Aggregated Contextual Transformations for High-Resolution Image Inpainting},
+  booktitle = {Arxiv},
+  pages={-},
+  year = {2020}
+}
+```
diff --git a/docs/en_US/tutorials/face_enhancement.md b/docs/en_US/tutorials/face_enhancement.md
new file mode 100644
index 0000000000000000000000000000000000000000..318c010483fdd7b29ffaa7f6c88f5000f9994672
--- /dev/null
+++ b/docs/en_US/tutorials/face_enhancement.md
@@ -0,0 +1,43 @@
+# Face Enhancement
+
+## 1. Introduction to face enhancement
+
+Blind face restoration (BFR) from severely degraded face images in the wild is a very challenging problem. Due to the highly ill-posed nature of the problem and the complex unknown degradation, directly training a deep neural network (DNN) usually cannot lead to acceptable results. Existing generative adversarial network (GAN) based methods can produce better results but tend to generate over-smoothed restorations. Here we provide the [GPEN](https://arxiv.org/abs/2105.06070) model. GPEN first learns a GAN for high-quality face image generation, embeds it into a U-shaped DNN as a prior decoder, and then fine-tunes the GAN-prior-embedded DNN with a set of synthesized low-quality face images. The GAN blocks are designed so that the latent code and noise input to the GAN can be generated from the deep and shallow features of the DNN respectively, controlling the global face structure, local face details and background of the reconstructed image. The proposed GAN prior embedded network (GPEN) is easy to implement and can generate visually photo-realistic results. Experiments demonstrate that GPEN achieves significantly superior results to state-of-the-art BFR methods both quantitatively and qualitatively, especially for the restoration of severely degraded face images in the wild.
+
+## How to use
+
+### face enhancement
+
+Users can run the following code to perform face enhancement on a local image:
+
+```python
+import cv2
+import paddle
+from ppgan.faceutils.face_enhancement import FaceEnhancement
+
+# read a local image (the path is only an example) and convert it to float32, as required below
+img = cv2.imread("docs/imgs/face.png", cv2.IMREAD_COLOR).astype('float32')
+
+faceenhancer = FaceEnhancement()
+img = faceenhancer.enhance_from_image(img)
+```
+
+Note: please convert the image to float type before calling the enhancer; int8 input is not currently supported.
+
+### Train (TODO)
+
+In the future, training scripts will be added to facilitate users to train more types of GPEN.
+
+## Results
+
+
+
+## Reference
+
+```
+@inproceedings{inproceedings,
+author = {Yang, Tao and Ren, Peiran and Xie, Xuansong and Zhang, Lei},
+year = {2021},
+month = {06},
+pages = {672-681},
+title = {GAN Prior Embedded Network for Blind Face Restoration in the Wild},
+doi = {10.1109/CVPR46437.2021.00073}
+}
+
+```
diff --git a/docs/en_US/tutorials/face_parse.md b/docs/en_US/tutorials/face_parse.md
index 3bf4acbd5d489e5a2cdce1426a4377cb44a1939d..8f21b8138cd974b32efdba8dad849f4247990675 100644
--- a/docs/en_US/tutorials/face_parse.md
+++ b/docs/en_US/tutorials/face_parse.md
@@ -12,7 +12,7 @@ Runing the following command to complete the face parsing task. The output resul
```
cd applications
-python face_parse.py --input_image ../docs/imgs/face.png
+python tools/face_parse.py --input_image ../docs/imgs/face.png
```
**params:**
diff --git a/docs/en_US/tutorials/gfpgan.md b/docs/en_US/tutorials/gfpgan.md
new file mode 100644
index 0000000000000000000000000000000000000000..d4ca57bff64ca288159dff41720fb65df91a3442
--- /dev/null
+++ b/docs/en_US/tutorials/gfpgan.md
@@ -0,0 +1,207 @@
+## GFPGAN Blind Face Restoration Model
+
+
+
+## 1、Introduction
+
+GFP-GAN leverages rich and diverse priors encapsulated in a pretrained face GAN for blind face restoration.
+### Overview of GFP-GAN framework:
+
+
+
+GFP-GAN is comprised of a degradation removal
+module (U-Net) and a pretrained face GAN (such as StyleGAN2) as prior. They are bridged by a latent code
+mapping and several Channel-Split Spatial Feature Transform (CS-SFT) layers.
+
+By processing features in this way, it achieves realistic results while preserving high fidelity.
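+
+The CS-SFT idea can be sketched as follows. This is only an illustrative implementation based on the description above (split the channels, spatially modulate one half with scale/shift predicted from the U-Net feature, keep the other half for fidelity); it is not PaddleGAN's actual layer, and all names are illustrative:
+
+```python
+import paddle
+import paddle.nn as nn
+
+class CSSFT(nn.Layer):
+    """Minimal Channel-Split Spatial Feature Transform sketch."""
+    def __init__(self, channels):
+        super().__init__()
+        half = channels // 2
+        # predict per-pixel scale and shift from the degradation-removal (U-Net) feature
+        self.scale = nn.Conv2D(channels, half, 3, padding=1)
+        self.shift = nn.Conv2D(channels, half, 3, padding=1)
+
+    def forward(self, gan_feat, unet_feat):
+        # gan_feat / unet_feat: same spatial size, `channels` channels each
+        identity, modulated = paddle.chunk(gan_feat, 2, axis=1)
+        # one half is modulated for realism, the other half is kept for fidelity
+        out = modulated * self.scale(unet_feat) + self.shift(unet_feat)
+        return paddle.concat([identity, out], axis=1)
+
+# toy usage with matching feature shapes
+layer = CSSFT(channels=8)
+y = layer(paddle.randn([1, 8, 16, 16]), paddle.randn([1, 8, 16, 16]))
+print(y.shape)  # [1, 8, 16, 16]
+```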
+
+For a more detailed introduction to the model and the reference repo, you can view the following AI Studio project:
+[https://aistudio.baidu.com/aistudio/projectdetail/4421649](https://aistudio.baidu.com/aistudio/projectdetail/4421649)
+
+In this experiment, we train the model with the Adam optimizer for a total of 210k iterations.
+
+The restoration results of GFPGAN are as follows:
+
+Model | LPIPS | FID | PSNR
+--- |:---:|:---:|:---:|
+GFPGAN | 0.3817 | 36.8068 | 65.0461
+
+## 2、Preparation
+
+### 2.1 Dataset Preparation
+
+The GFPGAN training set is the classic FFHQ face dataset,
+with a total of 70,000 high-resolution 1024 x 1024 face images,
+and the test set is the CELEBA-HQ dataset, with a total of 2,000 high-resolution face images. The degraded test images are generated in the same way as during training.
+For details, please refer to **Dataset URL:** [FFHQ](https://github.com/NVlabs/ffhq-dataset), [CELEBA-HQ](https://github.com/tkarras/progressive_growing_of_gans).
+The specific download links are given below:
+
+**Original dataset download address:**
+
+**FFHQ :** https://drive.google.com/drive/folders/1tZUcXDBeOibC6jcMCtgRRz67pzrAHeHL?usp=drive_open
+
+**CELEBA-HQ:** https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs?resourcekey=0-arAVTUfW9KRhN-irJchVKQ&usp=sharing
+
+The structure of data as following
+
+```
+|-- data/GFPGAN
+ |-- train
+ |-- 00000.png
+ |-- 00001.png
+ |-- ......
+ |-- 00999.png
+ |-- ......
+ |-- 69999.png
+ |-- lq
+      |-- 2,000 JPG images
+ |-- gt
+      |-- 2,000 JPG images
+```
+
+
+Please modify the `dataroot` parameters of the train and test datasets in the `configs/gfpgan_ffhq1024.yaml` configuration file to point to your training set and test set paths.
+
+
+### 2.2 Model preparation
+
+**Model parameter file and training log download address:**
+
+https://paddlegan.bj.bcebos.com/models/GFPGAN.pdparams
+
+Download the model parameters and test images from the link and put them in the data/ folder in the project root directory.
+
+The downloaded parameter file is a Python dict that can be loaded with PaddlePaddle. It contains the keys `net_g` and `net_g_ema`; either one can be used for inference.
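+
+As a quick sanity check, the parameter file can be inspected with `paddle.load`; a minimal sketch (the file path is an example, adjust it to where you saved the download):
+
+```python
+import paddle
+
+# load the downloaded parameter file (example path)
+state = paddle.load("data/GFPGAN.pdparams")
+
+# the file is a dict; either generator snapshot can be used for inference
+print(state.keys())              # expected to include 'net_g' and 'net_g_ema'
+generator_weights = state["net_g_ema"]
+```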
+
+## 3、Start using
+
+### 3.1 model training
+
+Enter the following code in the console to start training:
+
+ ```bash
+ python tools/main.py -c configs/gfpgan_ffhq1024.yaml
+ ```
+
+The model supports both single-card and multi-card training. For multi-card training, use the following command:
+
+ ```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch tools/main.py \
+    --config-file configs/gfpgan_ffhq1024.yaml
+ ```
+
+Model training requires Paddle 2.3 or above, and also depends on Paddle implementing the second-order gradient of the `elementwise_pow` operator. Paddle 2.2.2 can run the code, but some loss functions compute incorrect gradients, so the model cannot be trained successfully. If an error is reported during training, training is not supported for the time being; you can skip the training part and directly use the provided model parameters for testing. Model evaluation and testing work with Paddle 2.2.2 and above.
+
+
+
+### 3.2 Model evaluation
+
+When evaluating the model, enter the following code in the console, using the downloaded model parameters mentioned above:
+
+ ```shell
+python tools/main.py -c configs/gfpgan_ffhq1024.yaml --load GFPGAN.pdparams --evaluate-only
+ ```
+
+If you want to test on your own provided model, please modify the path after --load .
+
+
+
+### 3.3 Model prediction
+
+#### 3.3.1 Export model
+
+After training, you need to use ``tools/export_model.py`` to export the generator from the trained model (the exported model contains the generator only).
+Enter the following command to export the generator:
+
+```bash
+python -u tools/export_model.py --config-file configs/gfpgan_ffhq1024.yaml \
+ --load GFPGAN.pdparams \
+ --inputs_size 1,3,512,512
+```
+
+
+#### 3.3.2 Process a single image
+
+You can use the tool in `ppgan/faceutils/face_enhancement/gfpgan_enhance.py` to quickly run inference on a single picture:
+```python
+%env PYTHONPATH=.:$PYTHONPATH
+%env CUDA_VISIBLE_DEVICES=0
+import paddle
+import cv2
+import numpy as np
+import sys
+from ppgan.faceutils.face_enhancement.gfpgan_enhance import gfp_FaceEnhancement
+# you can use your path
+img_path='test/2.png'
+img = cv2.imread(img_path, cv2.IMREAD_COLOR)
+# this is origin picture
+cv2.imwrite('test/outlq.png',img)
+img=np.array(img).astype('float32')
+faceenhancer = gfp_FaceEnhancement()
+img = faceenhancer.enhance_from_image(img)
+# the result of prediction
+cv2.imwrite('test/out_gfpgan.png',img)
+```
+
+
+
+
+
+
+
+## 4. Tipc
+
+### 4.1 Export the inference model
+
+```bash
+python -u tools/export_model.py --config-file configs/gfpgan_ffhq1024.yaml \
+ --load GFPGAN.pdparams \
+ --inputs_size 1,3,512,512
+```
+
+You can also modify the parameters after --load to the model parameter file you want to test.
+
+
+
+### 4.2 Inference with a prediction engine
+
+```bash
+%cd /home/aistudio/work/PaddleGAN
+# %env PYTHONPATH=.:$PYTHONPATH
+# %env CUDA_VISIBLE_DEVICES=0
+!python -u tools/inference.py --config-file configs/gfpgan_ffhq1024.yaml \
+ --model_path GFPGAN.pdparams \
+ --model_type gfpgan \
+ --device gpu \
+ -o validate=None
+```
+
+
+### 4.3 Call the script to complete the training and inference tests in two steps
+
+To invoke the `lite_train_lite_infer` mode of the TIPC basic training and inference test script, run:
+
+```bash
+%cd /home/aistudio/work/PaddleGAN
+!bash test_tipc/prepare.sh \
+ test_tipc/configs/GFPGAN/train_infer_python.txt \
+ lite_train_lite_infer
+!bash test_tipc/test_train_inference_python.sh \
+ test_tipc/configs/GFPGAN/train_infer_python.txt \
+ lite_train_lite_infer
+```
+
+
+
+## 5、References
+
+```
+@InProceedings{wang2021gfpgan,
+ author = {Xintao Wang and Yu Li and Honglun Zhang and Ying Shan},
+ title = {Towards Real-World Blind Face Restoration with Generative Facial Prior},
+ booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2021}
+}
+```
diff --git a/docs/en_US/tutorials/gpen.md b/docs/en_US/tutorials/gpen.md
new file mode 100644
index 0000000000000000000000000000000000000000..0384a44784792c64960dad1a94175f9c58e0952a
--- /dev/null
+++ b/docs/en_US/tutorials/gpen.md
@@ -0,0 +1,202 @@
+English | [Chinese](../../zh_CN/tutorials/gpen.md)
+
+## GPEN Blind Face Restoration Model
+
+
+## 1、Introduction
+
+The GPEN model is a blind face restoration model. The authors adopt the decoder of StyleGAN V2 as the decoder of GPEN, and build a simple DNN encoder to provide inputs for the decoder. In this way, while retaining the excellent performance of the StyleGAN V2 decoder, the function of the model is changed from image style transfer to blind face restoration. The overall structure of the model is shown in the following figure:
+
+
+
+For a more detailed introduction to the model and the reference repo, you can view the following AI Studio project: [GPEN Blind Face Restoration Model Reproduction](https://aistudio.baidu.com/aistudio/projectdetail/3936241?contributionType=1).
+
+
+
+
+## 2、Ready to work
+
+### 2.1 Dataset Preparation
+
+The GPEN training set is the classic FFHQ face dataset, with a total of 70,000 high-resolution 1024 x 1024 face images, and the test set is the CELEBA-HQ dataset, with a total of 2,000 high-resolution face images. For details, please refer to **Dataset URL:** [FFHQ](https://github.com/NVlabs/ffhq-dataset), [CELEBA-HQ](https://github.com/tkarras/progressive_growing_of_gans). The specific download links are given below:
+
+**Original dataset download address:**
+
+**FFHQ :** https://drive.google.com/drive/folders/1tZUcXDBeOibC6jcMCtgRRz67pzrAHeHL?usp=drive_open
+
+**CELEBA-HQ:** https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs?resourcekey=0-arAVTUfW9KRhN-irJchVKQ&usp=sharing
+
+
+
+Since the original FFHQ dataset is too large, you can also download the 256-resolution FFHQ dataset from the following link:
+
+https://paddlegan.bj.bcebos.com/datasets/images256x256.tar
+
+
+
+**After downloading, the file organization is as follows**
+
+```
+|-- data/GPEN
+ |-- ffhq/images256x256/
+ |-- 00000
+ |-- 00000.png
+ |-- 00001.png
+ |-- ......
+ |-- 00999.png
+ |-- 01000
+ |-- ......
+ |-- ......
+ |-- 69000
+ |-- ......
+ |-- 69999.png
+ |-- test
+      |-- 2,000 PNG images
+```
+
+Please modify the `dataroot` parameters of the train and test datasets in the `configs/gpen_256_ffhq.yaml` configuration file to point to your training set and test set paths.
+
+
+
+### 2.2 Model preparation
+
+**Model parameter file and training log download address:**
+
+Link: https://paddlegan.bj.bcebos.com/models/gpen.zip
+
+
+Download the model parameters and test images from the link and put them in the data/ folder in the project root directory. The specific file structure is as follows:
+
+
+```
+data/gpen/weights
+ |-- model_ir_se50.pdparams
+ |-- weight_pretrain.pdparams
+data/gpen/lite_data
+```
+
+
+
+## 3、Start using
+
+### 3.1 model training
+
+Enter the following code in the console to start training:
+
+ ```shell
+ python tools/main.py -c configs/gpen_256_ffhq.yaml
+ ```
+
+The model only supports single-card training.
+
+Model training requires Paddle 2.3 or above, and also depends on Paddle implementing the second-order gradient of the `elementwise_pow` operator. Paddle 2.2.2 can run the code, but some loss functions compute incorrect gradients, so the model cannot be trained successfully. If an error is reported during training, training is not supported for the time being; you can skip the training part and directly use the provided model parameters for testing. Model evaluation and testing work with Paddle 2.2.2 and above.
+
+
+
+### 3.2 Model evaluation
+
+When evaluating the model, enter the following code in the console, using the downloaded model parameters mentioned above:
+
+ ```shell
+python tools/main.py -c configs/gpen_256_ffhq.yaml -o dataset.test.amount=2000 --load data/gpen/weights/weight_pretrain.pdparams --evaluate-only
+ ```
+
+If you want to test on your own provided model, please modify the path after --load .
+
+
+
+### 3.3 Model prediction
+
+#### 3.3.1 Export generator weights
+
+After training, you need to use ``tools/extract_weight.py`` to extract the generator weights from the trained model (which contains both the generator and the discriminator), and then use `applications/tools/gpen.py` for inference to realize the various applications of the GPEN model. Enter the following command to extract the generator weights:
+
+```bash
+python tools/extract_weight.py data/gpen/weights/weight_pretrain.pdparams --net-name g_ema --output data/gpen/weights/g_ema.pdparams
+```
+
+
+
+#### 3.3.2 Process a single image
+
+After extracting the weights of the generator, enter the following command to test the images under the --test_img path. Modifying the --seed parameter can generate different degraded images to show richer effects. You can modify the path after --test_img to any image you want to test. If no weight is provided after the --weight_path parameter, the trained model weights will be automatically downloaded for testing.
+
+```bash
+python applications/tools/gpen.py --test_img data/gpen/lite_data/15006.png --seed=100 --weight_path data/gpen/weights/g_ema.pdparams --model_type gpen-ffhq-256
+```
+
+The following are the sample images and the corresponding inpainted images, from left to right, the degraded image, the generated image, and the original clear image:
+
+
+
+An example output is as follows:
+
+
+```
+result saved in : output_dir/gpen_predict.png
+ FID: 92.11730631094356
+ PSNR:19.014782083825743
+```
+
+
+
+## 4. Tipc
+
+### 4.1 Export the inference model
+
+```bash
+python tools/export_model.py -c configs/gpen_256_ffhq.yaml --inputs_size=1,3,256,256 --load data/gpen/weights/weight_pretrain.pdparams
+```
+
+The above command will generate the model structure file `gpenmodel_g_ema.pdmodel` and the model weight files `gpenmodel_g_ema.pdiparams` and `gpenmodel_g_ema.pdiparams.info` required for prediction, which are stored in the `inference_model/` directory. You can also modify the parameter after --load to the model parameter file you want to test.
+
+
+
+### 4.2 Inference with a prediction engine
+
+```bash
+python tools/inference.py --model_type GPEN --seed 100 -c configs/gpen_256_ffhq.yaml -o dataset.test.dataroot="./data/gpen/lite_data/" --output_path test_tipc/output/ --model_path inference_model/gpenmodel_g_ema
+```
+
+At the end of the inference, the repaired image generated by the model will be saved in the test_tipc/output/GPEN directory by default, and the FID value obtained by the test will be output in test_tipc/output/GPEN/metric.txt.
+
+
+The default output is as follows:
+
+```
+Metric fid: 187.0158
+```
+
+Note: Since degrading high-definition pictures involves some randomness, the results of each test will differ. To keep the test results consistent, we fix the random seed so that the same degradation is applied to the images in every test.
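+
+The idea of fixing the seed can be illustrated as follows; 100 is the seed used in the commands above, and the exact calls inside the test scripts may differ:
+
+```python
+import random
+
+import numpy as np
+import paddle
+
+# fix all relevant random sources so the synthetic degradation is identical across runs
+seed = 100
+paddle.seed(seed)
+np.random.seed(seed)
+random.seed(seed)
+```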
+
+
+
+### 4.3 Call the script to complete the training and inference tests in two steps
+
+To invoke the `lite_train_lite_infer` mode of the TIPC basic training and inference test script, run:
+
+```shell
+# fix the line endings of the shell scripts
+sed -i 's/\r//' test_tipc/prepare.sh
+sed -i 's/\r//' test_tipc/test_train_inference_python.sh
+sed -i 's/\r//' test_tipc/common_func.sh
+# prepare data
+bash test_tipc/prepare.sh ./test_tipc/configs/GPEN/train_infer_python.txt 'lite_train_lite_infer'
+# run the test
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/GPEN/train_infer_python.txt 'lite_train_lite_infer'
+```
+
+
+
+## 5、References
+
+```
+@misc{2021GAN,
+ title={GAN Prior Embedded Network for Blind Face Restoration in the Wild},
+ author={ Yang, T. and Ren, P. and Xie, X. and Zhang, L. },
+ year={2021},
+ archivePrefix={CVPR},
+ primaryClass={cs.CV}
+}
+```
+
diff --git a/docs/en_US/tutorials/invdn.md b/docs/en_US/tutorials/invdn.md
new file mode 100644
index 0000000000000000000000000000000000000000..08741fa2bcd0dff3f3f619e12f5bc6fe453872fc
--- /dev/null
+++ b/docs/en_US/tutorials/invdn.md
@@ -0,0 +1,119 @@
+English | [Chinese](../../zh_CN/tutorials/invdn.md)
+
+# Invertible Denoising Network: A Light Solution for Real Noise Removal
+
+**Invertible Denoising Network: A Light Solution for Real Noise Removal** (CVPR 2021)
+
+Official code:[https://github.com/Yang-Liu1082/InvDN](https://github.com/Yang-Liu1082/InvDN)
+
+Paper:[https://arxiv.org/abs/2104.10546](https://arxiv.org/abs/2104.10546)
+
+## 1、Introduction
+
+InvDN uses an invertible network to split a noisy image into a low-resolution clean image and a high-frequency latent representation that contains both noise and content information. Since the invertible network is information-lossless, if we could separate the noise from the high-frequency representation, we could reconstruct a clean image at the original resolution from it together with the low-resolution clean image. However, removing the noise from the high-frequency information is difficult. Instead, during restoration the noisy high-frequency latent representation is directly replaced by another representation sampled from a prior distribution, and the low-resolution clean image is then reconstructed back to a clean image at the original resolution. The resulting network is lightweight.
+
+
+
+
+## 2 How to use
+
+### 2.1 Quick start
+
+After installing `PaddleGAN`, you can run the following command to generate the restored image `./output_dir/Denoising/image_name.png`.
+
+```sh
+python applications/tools/invdn_denoising.py --images_path ${PATH_OF_IMAGE}
+```
+
+Where `PATH_OF_IMAGE` is the path of the image you need to denoise, or the path of the folder where the images are located.
+
+- Note that in the author's original code, Monte Carlo self-ensemble is used for testing to improve performance, but it slows inference down. You can pass the `--disable_mc` parameter to turn off Monte Carlo self-ensemble for faster speed. (Monte Carlo self-ensemble is enabled by default for $test$, and disabled by default for $train$ and $valid$.)
+
+### 2.2 Prepare dataset
+
+#### **Train Dataset**
+
+The training set is SIDD, specifically [SIDD-Medium](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php). As required by the paper, the dataset needs to be processed into patches of $512 \times 512$. In addition, a low-resolution version of each GT image with a size of $128 \times 128$ needs to be produced for training; the low-resolution image is denoted as LQ.
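+
+If you prefer to build the LQ images yourself instead of using the pre-processed data linked below, the step can be sketched as follows; this assumes bicubic downsampling with OpenCV and uses the folder layout shown later in this section:
+
+```python
+import os
+
+import cv2
+
+gt_dir = "data/SIDD_Medium_Srgb_Patches_512/train/GT"  # 512x512 GT patches
+lq_dir = "data/SIDD_Medium_Srgb_Patches_512/train/LQ"  # 128x128 low-resolution copies
+os.makedirs(lq_dir, exist_ok=True)
+
+for name in os.listdir(gt_dir):
+    gt = cv2.imread(os.path.join(gt_dir, name), cv2.IMREAD_COLOR)
+    # downscale each 512x512 GT patch to the 128x128 LQ image used during training
+    lq = cv2.resize(gt, (128, 128), interpolation=cv2.INTER_CUBIC)
+    cv2.imwrite(os.path.join(lq_dir, name), lq)
+```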
+
+The processed dataset can be found on [AI Studio](https://aistudio.baidu.com/aistudio/datasetdetail/172084).
+
+The train dataset is placed under: `data/SIDD_Medium_Srgb_Patches_512/train/`.
+
+#### **Test Dataset**
+
+The test dataset is [SIDD_valid](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php). The files downloaded from the official website are `./ValidationNoisyBlocksSrgb.mat` and `./ValidationGtBlocksSrgb.mat`. You are advised to convert them to PNG images for convenience.
+
+The converted dataset can be found on [AI Studio](https://aistudio.baidu.com/aistudio/datasetdetail/172069).
+
+The test dataset is placed under:`data/SIDD_Valid_Srgb_Patches_256/valid/`.
+
+- The file structure under the `PaddleGAN/data` folder is
+```sh
+data
+├─ SIDD_Medium_Srgb_Patches_512
+│ └─ train
+│ ├─ GT
+│ │ 0_0.PNG
+│ │ ...
+│ ├─ LQ
+│ │ 0_0.PNG
+│ │ ...
+│ └─ Noisy
+│ 0_0.PNG
+│ ...
+│
+└─ SIDD_Valid_Srgb_Patches_256
+ └─ valid
+ ├─ GT
+ │ 0_0.PNG
+ │ ...
+ └─ Noisy
+ 0_0.PNG
+ ...
+```
+
+### 2.3 Training
+
+Run the following command to start training:
+```sh
+python -u tools/main.py --config-file configs/invdn_denoising.yaml
+```
+- TIPS:
+To keep the total number of training samples the same as in the paper configuration, make sure that $total\_batchsize*iters == 1gpus*14bs*600000iters$. Also keep $batchsize/learning\_rate == 14/0.0002$ when $batchsize$ is changed.
+For example, when using 4 GPUs with a per-GPU $batchsize$ of 14, the actual total $batchsize$ is 14*4, so the total $iters$ should be set to 150,000 and the learning rate should be scaled up to 8e-4. A small sketch of this arithmetic follows.
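+
+The scaling rule can be written out explicitly; the baseline values below are the paper's single-GPU defaults quoted above:
+
+```python
+# single-GPU baseline from the paper configuration
+base_gpus, base_bs, base_iters, base_lr = 1, 14, 600_000, 0.0002
+
+# your training setup
+num_gpus, bs_per_gpu = 4, 14
+total_bs = num_gpus * bs_per_gpu                        # 56
+
+# keep the total number of samples seen constant
+iters = base_gpus * base_bs * base_iters // total_bs    # 150000
+# keep batchsize / learning_rate constant, i.e. scale lr with the total batch size
+lr = base_lr * total_bs / (base_gpus * base_bs)         # 0.0008 = 8e-4
+
+print(iters, lr)
+```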
+
+### 2.4 Test
+
+Run the following command to start testing:
+```sh
+python tools/main.py --config-file configs/invdn_denoising.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 Results
+
+Denoising
+| model | dataset | PSNR/SSIM |
+|---|---|---|
+| InvDN | SIDD | 39.29 / 0.956 |
+
+
+## 4 Download
+
+| model | link |
+|---|---|
+| InvDN| [InvDN_Denoising](https://paddlegan.bj.bcebos.com/models/InvDN_Denoising.pdparams) |
+
+
+
+# References
+
+- [https://arxiv.org/abs/2104.10546](https://arxiv.org/abs/2104.10546)
+
+```
+@article{liu2021invertible,
+ title={Invertible Denoising Network: A Light Solution for Real Noise Removal},
+ author={Liu, Yang and Qin, Zhenyue and Anwar, Saeed and Ji, Pan and Kim, Dongwoo and Caldwell, Sabrina and Gedeon, Tom},
+ journal={arXiv preprint arXiv:2104.10546},
+ year={2021}
+}
+```
diff --git a/docs/en_US/tutorials/lap_style.md b/docs/en_US/tutorials/lap_style.md
new file mode 100644
index 0000000000000000000000000000000000000000..9449d1c0e95b8123ce3f24a041c05cc740fb00a2
--- /dev/null
+++ b/docs/en_US/tutorials/lap_style.md
@@ -0,0 +1,100 @@
+
+
+# LapStyle
+
+
+This repo holds the official code of the paper "Drafting and Revision: Laplacian Pyramid Network for Fast High-Quality Artistic Style Transfer", which was accepted at CVPR 2021.
+
+## 1 Paper Introduction
+
+
+Artistic style transfer aims at migrating the style from an example image to a content image. Currently, optimization-based methods have achieved great stylization quality, but their expensive time cost restricts practical applications. Meanwhile, feed-forward methods still fail to synthesize complex styles, especially when holistic global and local patterns exist. Inspired by the common painting process of drawing a draft and revising the details, [this paper](https://arxiv.org/pdf/2104.05376.pdf) introduces a novel feed-forward method named Laplacian Pyramid Network (LapStyle). LapStyle first transfers global style patterns in low resolution via a Drafting Network. It then revises the local details in high resolution via a Revision Network, which hallucinates a residual image according to the draft and the image textures extracted by Laplacian filtering. Higher-resolution details can be easily generated by stacking Revision Networks with multiple Laplacian pyramid levels. The final stylized image is obtained by aggregating the outputs of all pyramid levels. We also introduce a patch discriminator to better learn local patterns adversarially. Experiments demonstrate that our method can synthesize high-quality stylized images in real time, where holistic style patterns are properly transferred.
+
+
+
+
+## 2 Quick experience
+Here are four style images:
+
+- [StarryNew](https://user-images.githubusercontent.com/79366697/118655415-1ec8c000-b81c-11eb-8002-90bf8d477860.png)
+- [Stars](https://user-images.githubusercontent.com/79366697/118655423-20928380-b81c-11eb-92bd-0deeb320ff14.png)
+- [Ocean](https://user-images.githubusercontent.com/79366697/118655407-1c666600-b81c-11eb-83a6-300ee1952415.png)
+- [Circuit](https://user-images.githubusercontent.com/79366697/118655399-196b7580-b81c-11eb-8bc5-d5ece80c18ba.jpg)
+
+```
+python applications/tools/lapstyle.py --content_img_path ${PATH_OF_CONTENT_IMG} --style_image_path ${PATH_OF_STYLE_IMG}
+```
+### Parameters
+
+- `--content_img_path (str)`: path to content image.
+- `--style_image_path (str)`: path to style image.
+- `--output_path (str)`: path to output image dir, default value:`output_dir`.
+- `--weight_path (str)`: path to model weight path, if `weight_path` is `None`, the pre-training model will be downloaded automatically, default value:`None`.
+- `--style (str)`: style of output image, if `weight_path` is `None`, `style` can be chosen in `starrynew`, `circuit`, `ocean` and `stars`, default value:`starrynew`.
+
+## 3 How to use
+
+### 3.1 Prepare Datasets
+
+To train LapStyle, we use the COCO dataset as the content image set. You can choose one style image from [starrynew](https://user-images.githubusercontent.com/79366697/118655415-1ec8c000-b81c-11eb-8002-90bf8d477860.png), [ocean](https://user-images.githubusercontent.com/79366697/118655407-1c666600-b81c-11eb-83a6-300ee1952415.png), [stars](https://user-images.githubusercontent.com/79366697/118655423-20928380-b81c-11eb-92bd-0deeb320ff14.png) or [circuit](https://user-images.githubusercontent.com/79366697/118655399-196b7580-b81c-11eb-8bc5-d5ece80c18ba.jpg), or any style image you like. Before training or testing, remember to modify the data path of the style image in the config file.
+
+
+### 3.2 Train
+
+The dataset used in the example is COCO; you can also change it to your own dataset in the config file.
+
+Note that training the LapStyle model is not currently supported on Windows.
+
+(1) Train the Draft Network of LapStyle under 128*128 resolution:
+```
+python -u tools/main.py --config-file configs/lapstyle_draft.yaml
+```
+
+(2) Then, train the Revision Network of LapStyle under 256*256 resolution:
+```
+python -u tools/main.py --config-file configs/lapstyle_rev_first.yaml --load ${PATH_OF_LAST_STAGE_WEIGHT}
+```
+
+(3) Further, you can train the second Revision Network under 512*512 resolution:
+
+```
+python -u tools/main.py --config-file configs/lapstyle_rev_second.yaml --load ${PATH_OF_LAST_STAGE_WEIGHT}
+```
+
+### 3.3 Test
+
+When testing, you need to change the parameter `validate/save_img` in the configuration file to `true` to save the output images.
+To test the trained model, you can directly test "lapstyle_rev_second", since it also contains the trained weights of the previous stages:
+```
+python tools/main.py --config-file configs/lapstyle_rev_second.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 4 Results
+
+| Style | Stylized Results |
+| --- | --- |
+|  | |
+|  | |
+|  | |
+|  | |
+
+## 5 Pre-trained models
+
+We also provide several trained models.
+
+| model | style | path |
+|---|---|---|
+| lapstyle_circuit | circuit | [lapstyle_circuit](https://paddlegan.bj.bcebos.com/models/lapstyle_circuit.pdparams)
+| lapstyle_ocean | ocean | [lapstyle_ocean](https://paddlegan.bj.bcebos.com/models/lapstyle_ocean.pdparams)
+| lapstyle_starrynew | starrynew | [lapstyle_starrynew](https://paddlegan.bj.bcebos.com/models/lapstyle_starrynew.pdparams)
+| lapstyle_stars | stars | [lapstyle_stars](https://paddlegan.bj.bcebos.com/models/lapstyle_stars.pdparams)
+
+
+# References
+
+
+
+```
+@article{lin2021drafting,
+ title={Drafting and Revision: Laplacian Pyramid Network for Fast High-Quality Artistic Style Transfer},
+ author={Lin, Tianwei and Ma, Zhuoqi and Li, Fu and He, Dongliang and Li, Xin and Ding, Errui and Wang, Nannan and Li, Jie and Gao, Xinbo},
+ booktitle={Computer Vision and Pattern Recognition (CVPR)},
+ year={2021}
+}
+```
diff --git a/docs/en_US/tutorials/motion_driving.md b/docs/en_US/tutorials/motion_driving.md
index e5436f8760c8c4731335cf41b56446b332f8e274..b7f4150e58157755925faa86b0f6ea1c3569e39f 100644
--- a/docs/en_US/tutorials/motion_driving.md
+++ b/docs/en_US/tutorials/motion_driving.md
@@ -1,37 +1,200 @@
-# Fist order motion model
+# First Order Motion
-## First order motion model introduction
-
-[First order motion model](https://arxiv.org/abs/2003.00196) is to complete the Image animation task, which consists of generating a video sequence so that an object in a source image is animated according to the motion of a driving video. The first order motion framework addresses this problem without using any annotation or prior information about the specific object to animate. Once trained on a set of videos depicting objects of the same category (e.g. faces, human bodies), this method can be applied to any object of this class. To achieve this, the innovative method decouple appearance and motion information using a self-supervised formulation. In addition, to support complex motions, it use a representation consisting of a set of learned keypoints along with their local affine transformations. A generator network models occlusions arising during target motions and combines the appearance extracted from the source image and the motion derived from the driving video.
+[First order motion model](https://arxiv.org/abs/2003.00196) is to complete the Image animation task, which consists of generating a video sequence so that an object in a source image is animated according to the motion of the driving video. The image below gives examples of source images with objects and driving videos containing a series of motions.
+Taking the face in the upper left corner as an example, given a source object and a driving video, a new video can be generated in which the source subject wears an expression that is derived from the driving video. Usually, we need to annotate the keypoints of the source object and train the model for facial expression transfer.
+
+The following gif clearly expounds the principle:
+
+[](https://user-images.githubusercontent.com/48054808/127443878-b9369c1a-909c-4af6-8c84-a62821262910.gif)
+
+The proposed method is not limited to facial expression transfer; it also supports other objects when trained on similar datasets. For example, you can transfer the motion of playing Tai Chi by training on Tai Chi video datasets, and achieve facial expression transfer by training on the VoxCeleb dataset. With the corresponding pre-trained models, you can then perform real-time image animation.
+
+## Features
+
+- #### Multi-Faces Swapping
+
+  - **Unique face detection algorithm that supports automatic multi-face detection and expression swapping.**
+
+  - We adopt PaddleGAN's face detection model [S3FD](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/ppgan/faceutils/face_detection/detection) to detect all the faces in an image and transfer their expressions for multi-face swapping.
+
+ Specific technical steps are shown below:
+
+ a. Use the S3FD model to detect all the faces in an image
+
+ b. Use the First Order Motion model to do the facial expression transfer of each face
+
+ c. Crop those "new" generated faces and put them back to the original photo
+
+ At the same time, PaddleGAN also provides a ["faceutils" tool](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/ppgan/faceutils) for face-related work, including face detection, face segmentation, keypoints detection, etc.
+
+- #### Face Enhancement
+
+ - **This effect significantly improves the definition of the driven video.**
+
+- #### Abundant applications for online experience
+
+ - 🐜**Ant Ah Hey**🐜:https://aistudio.baidu.com/aistudio/projectdetail/1603391
+ - 💙**Special For Love Confession on May 20th (pronounced as I love you)**💙:https://aistudio.baidu.com/aistudio/projectdetail/1956943
+ - **Smile of the Deceased(▰˘◡˘▰)**:https://aistudio.baidu.com/aistudio/projectdetail/1660701
+ - 👨**Special For Father's Day**:https://aistudio.baidu.com/aistudio/projectdetail/2068655
## How to use
+### 1. Quick Start: Face Detection and Effect Enhancement
+
+Users can upload a source image with single or multiple faces and driving video, then substitute the paths of source image and driving video for the `source_image` and `driving_video` parameters respectively and run the following command. It will generate a video file named `result.mp4` in the `output` folder, which is the animated video file.
-Users can upload the prepared source image and driving video, then substitute the path of source image and driving video for the `source_image` and `driving_video` parameter in the following running command. It will geneate a video file named `result.mp4` in the `output` folder, which is the animated video file.
+Note: For photos with multiple faces, the larger the distance between the faces, the better the result quality; you can also tune the effect by adjusting the `ratio` parameter.
+
+The source image and driving video here are provided for demonstration purposes; the running command is as follows:
+
+#### Running Command:
```
cd applications/
python -u tools/first-order-demo.py \
--driving_video ../docs/imgs/fom_dv.mp4 \
--source_image ../docs/imgs/fom_source_image.png \
- --relative --adapt_scale
+ --ratio 0.4 \
+ --relative \
+ --adapt_scale \
+ --image_size 512 \
+ --face_enhancement \
+ --multi_person
+```
+
+
+
+#### Parameters:
+
+| Parameters | Instructions |
+| ---------------- | ------------------------------------------------------------ |
+| driving_video | driving video whose motion and expression are to be migrated. |
+| source_image | source image; both single-face and multi-face images are supported. The image will be animated according to the motion and expression of the driving video. |
+| relative | whether to use the relative or absolute coordinates of the keypoints in the video. Relative coordinates are recommended; otherwise the characters will be distorted after animation. |
+| adapt_scale | adaptive movement scale based on the convex hull of the keypoints. |
+| ratio | the pasted-face percentage of the generated image; adjust this for multi-person images in which adjacent faces are close. The default value is 0.4 and the range is [0.4, 0.5]. |
+| image_size | the image size of the face, 256 by default; 512 is also supported. |
+| face_enhancement | enhance the faces; disabled by default (add the flag to enable it). |
+| multi_person | process multiple faces in the image; by default only one face is assumed. |
+
+#### 📣Result of Face Enhancement
+
+| Before | After |
+| :----------------------------------------------------------: | :----------------------------------------------------------: |
+| [](https://user-images.githubusercontent.com/17897185/126444836-b68593e3-ae43-4450-b18f-1a549230bf07.gif) | [](https://user-images.githubusercontent.com/17897185/126444194-436cc885-259d-4636-ad4c-c3dcc52fe175.gif) |
+
+### 2. Training
+
+#### **Datasets:**
+
+- Fashion: see [here](https://vision.cs.ubc.ca/datasets/fashion/)
+- VoxCeleb: see [here](https://github.com/AliaksandrSiarohin/video-preprocessing). You can preprocess the data to the sizes you need; we handle two resolutions, 256 and 512, compared below: [](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/imgs/fom_512_vs_256.png)
+
+**Parameters:**
+
+- dataset_name.yaml: Configure your own yaml document and parameters
+
+- Training using single GPU:
+```
+export CUDA_VISIBLE_DEVICES=0
+python tools/main.py --config-file configs/dataset_name.yaml
+```
+- Training using multiple GPUs: first change *nn.BatchNorm* to *nn.SyncBatchNorm* in /ppgan/modules/first_order.py (a programmatic alternative is sketched after the command below), then run:
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch \
+ tools/main.py \
+ --config-file configs/dataset_name.yaml
+
```
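+
+As an alternative to editing the module file by hand, Paddle also provides a converter that swaps BatchNorm layers for SyncBatchNorm when the model is built. The snippet below is only an illustrative sketch with a toy network, not PaddleGAN's actual training code:
+
+```python
+import paddle.nn as nn
+
+# toy stand-in for a FOM sub-network; in PaddleGAN the real model is built from the yaml config
+model = nn.Sequential(nn.Conv2D(3, 16, 3, padding=1), nn.BatchNorm2D(16), nn.ReLU())
+
+# recursively replace BatchNorm layers with SyncBatchNorm for multi-GPU training
+model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
+print(model)
+```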
-**params:**
-- driving_video: driving video, the motion of the driving video is to be migrated.
-- source_image: source_image, the image will be animated according to the motion of the driving video.
-- relative: indicate whether the relative or absolute coordinates of the key points in the video are used in the program. It is recommended to use relative coordinates. If absolute coordinates are used, the characters will be distorted after animation.
-- adapt_scale: adapt movement scale based on convex hull of keypoints.
+**Example:**
+- Training using single GPU:
+```
+export CUDA_VISIBLE_DEVICES=0
+python tools/main.py --config-file configs/firstorder_fashion.yaml
+```
+- Training using multiple GPUs:
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch \
+ tools/main.py \
+    --config-file configs/firstorder_fashion.yaml
+```
+
+
## Animation results
-
+ [](https://user-images.githubusercontent.com/48054808/119469551-0a377b00-bd7a-11eb-9117-e4871c8fb9c0.gif)
+
+
+
+### 3. Model Compression
+
+**Prediction:**
+
+```
+cd applications/
+python -u tools/first-order-demo.py \
+ --driving_video ../docs/imgs/mayiyahei.MP4 \
+ --source_image ../docs/imgs/father_23.jpg \
+ --config ../configs/firstorder_vox_mobile_256.yaml \
+ --ratio 0.4 \
+ --relative \
+ --adapt_scale \
+ --mobile_net
+```
+
+Currently, we use MobileNet combined with pruning to compress the models; see the comparison below:
+
+| | Size(M) | reconstruction loss |
+| ---------- | ------- | ------------------- |
+| Original | 229 | 0.041781392 |
+| Compressed | 10.1 | 0.047878753 |
+
+**Training:** First, set `mode` in configs/firstorder_vox_mobile_256.yaml to kp_detector to train the compressed kp_detector model while freezing the original generator model. Then set `mode` to generator to train the compressed generator model while freezing the original kp_detector model. Finally, set `mode` to both and point `kp_weight_path` and `gen_weight_path` in the config to the trained models to train them together.
+
+```
+export CUDA_VISIBLE_DEVICES=0
+python tools/main.py --config-file configs/firstorder_vox_mobile_256.yaml
+```
+
+
+
+### 4. Deployment
+
+#### 4.1 Export
+
+Use the `tools/fom_export.py` script to export the model together with the configuration used for deployment (config name `firstorder_vox_mobile_256.yml`). The export command is as follows.
+
+```
+# Export FOM Model
+
+python tools/export_model.py \
+ --config-file configs/firstorder_vox_mobile_256.yaml \
+ --load /root/.cache/ppgan/vox_mobile.pdparams \
+ --inputs_size "1,3,256,256;1,3,256,256;1,10,2;1,10,2,2" \
+ --export_model output_inference/
+```
+
+The prediction models will be exported to the `output_inference/fom_dy2st/` directory as `model.pdiparams`, `model.pdiparams.info` and `model.pdmodel`.
+
+- [Pre-training Model](https://paddlegan.bj.bcebos.com/applications/first_order_model/paddle_lite/inference/lite.zip)
+
+#### 4.2 Deployment of PaddleLite
+
+- [Deployment of FOM model with Paddle Lite](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/deploy/lite)
+
+- [FOM-Lite-Demo](https://paddlegan.bj.bcebos.com/applications/first_order_model/paddle_lite/apk/fom_demo.zip). For more details, please refer to [Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite).
+  Current limitations: (a) Paddle Lite performs slightly worse than Paddle Inference; optimization is in progress. (b) The generator runs in a single thread; if the number of frames is too large, it may be scheduled on the small cores rather than the large cores.
-## Reference
+## References
```
@InProceedings{Siarohin_2019_NeurIPS,
diff --git a/docs/en_US/tutorials/mpr_net.md b/docs/en_US/tutorials/mpr_net.md
new file mode 100644
index 0000000000000000000000000000000000000000..40d096f8c38e7e809cb63f053bccdf24e5d73f0c
--- /dev/null
+++ b/docs/en_US/tutorials/mpr_net.md
@@ -0,0 +1,124 @@
+# MPR_Net
+
+## 1 Introduction
+
+[MPR_Net](https://arxiv.org/abs/2102.02808) is an image restoration method published at CVPR 2021. Image restoration tasks demand a complex balance between spatial details and high-level contextualized information while recovering images. MPR_Net proposes a novel synergistic design that can optimally balance these competing goals. The main proposal is a multi-stage architecture that progressively learns restoration functions for the degraded inputs, thereby breaking down the overall recovery process into more manageable steps. Specifically, the model first learns contextualized features using encoder-decoder architectures and later combines them with a high-resolution branch that retains local information. At each stage, MPR_Net introduces a novel per-pixel adaptive design that leverages in-situ supervised attention to reweight the local features. A key ingredient in such a multi-stage architecture is the information exchange between different stages. To this end, MPR_Net proposes a two-faceted approach where information is not only exchanged sequentially from early to late stages, but lateral connections between feature processing blocks also exist to avoid any loss of information. The resulting tightly interlinked multi-stage architecture, named MPRNet, delivers strong performance gains on ten datasets across a range of tasks including image deraining, deblurring, and denoising.
+
+## 2 How to use
+
+### 2.1 Quick start
+
+After installing PaddleGAN, you can run the following Python code to generate the restored image, where `task` is the type of restoration method (choose from `Deblurring`, `Denoising` and `Deraining`) and `PATH_OF_IMAGE` is your image path.
+
+```python
+from ppgan.apps import MPRPredictor
+predictor = MPRPredictor(task='Deblurring')
+predictor.run(PATH_OF_IMAGE)
+```
+
+Or run such a command to get the same result:
+
+```sh
+python applications/tools/mprnet.py --input_image ${PATH_OF_IMAGE} --task Deblurring
+```
+Here `task` is again the type of restoration method (choose from `Deblurring`, `Denoising` and `Deraining`) and `PATH_OF_IMAGE` is your image path.
+
+### 2.2 Prepare dataset
+
+The deblurring training dataset is GoPro. The GoPro dataset used for deblurring consists of 3,214 blurred images with a size of 1,280×720. These images are divided into 2,103 training images and 1,111 test images. It can be downloaded from [here](https://drive.google.com/file/d/1H0PIXvJH4c40pk7ou6nAwoxuR4Qh_Sa2/view?usp=sharing).
+After downloading, decompress it to the data directory. After decompression, the structure of the `GoPro dataset` is as follows:
+
+```sh
+GoPro
+├── train
+│ ├── input
+│ └── target
+└── test
+ ├── input
+ └── target
+
+```
+
+The denoising training dataset is SIDD, an image denoising dataset containing 30,000 noisy images under 10 different lighting conditions, which can be downloaded from [training dataset](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php) and [test dataset](https://drive.google.com/drive/folders/1S44fHXaVxAYW3KLNxK41NYCnyX9S79su).
+After downloading, decompress it to the data directory. After decompression, the structure of the `SIDD dataset` is as follows:
+
+```sh
+SIDD
+├── train
+│ ├── input
+│ └── target
+└── val
+ ├── input
+ └── target
+
+```
+
+The deraining training dataset is the Synthetic Rain Dataset, which consists of 13,712 clean/rain image pairs collected from multiple datasets (Rain14000, Rain1800, Rain800, Rain12) and can be downloaded from [training dataset](https://drive.google.com/drive/folders/1Hnnlc5kI0v9_BtfMytC2LR5VpLAFZtVe) and [test dataset](https://drive.google.com/drive/folders/1PDWggNh8ylevFmrjo-JEvlmqsDlWWvZs).
+After downloading, decompress it to the data directory. After decompression, the structure of `Synthetic_Rain_Datasets` is as follows:
+
+```sh
+Synthetic_Rain_Datasets
+├── train
+│ ├── input
+│ └── target
+└── test
+ ├── Test100
+ ├── Rain100H
+ ├── Rain100L
+ ├── Test1200
+ └── Test2800
+
+```
+
+### 2.3 Training
+ The example below trains a deblurring model. If you want to train for other tasks, you can replace the config file.
+
+ ```sh
+ python -u tools/main.py --config-file configs/mprnet_deblurring.yaml
+ ```
+
+### 2.4 Test
+
+test model:
+```sh
+python tools/main.py --config-file configs/mprnet_deblurring.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 Results
+Deblurring
+| model | dataset | PSNR/SSIM |
+|---|---|---|
+| MPRNet | GoPro | 33.4360/0.9410 |
+
+Denoising
+| model | dataset | PSNR/SSIM |
+|---|---|---|
+| MPRNet | SIDD | 43.6100 / 0.9586 |
+
+Deraining
+| model | dataset | PSNR/SSIM |
+|---|---|---|
+| MPRNet | Rain100L | 36.2848 / 0.9651 |
+
+## 4 Download
+
+| model | link |
+|---|---|
+| MPR_Deblurring | [MPR_Deblurring](https://paddlegan.bj.bcebos.com/models/MPR_Deblurring.pdparams) |
+| MPR_Denoising | [MPR_Denoising](https://paddlegan.bj.bcebos.com/models/MPR_Denoising.pdparams) |
+| MPR_Deraining | [MPR_Deraining](https://paddlegan.bj.bcebos.com/models/MPR_Deraining.pdparams) |
+
+
+
+# References
+
+- [Multi-Stage Progressive Image Restoration](https://arxiv.org/abs/2102.02808)
+
+ ```
+ @inproceedings{Zamir2021MPRNet,
+ title={Multi-Stage Progressive Image Restoration},
+ author={Syed Waqas Zamir and Aditya Arora and Salman Khan and Munawar Hayat and Fahad Shahbaz Khan and Ming-Hsuan Yang and Ling Shao},
+ booktitle={CVPR},
+ year={2021}
+ }
+ ```
diff --git a/docs/en_US/tutorials/nafnet.md b/docs/en_US/tutorials/nafnet.md
new file mode 100644
index 0000000000000000000000000000000000000000..9bb9a2dd0c2c10cd84265a2d580854038ba0f80c
--- /dev/null
+++ b/docs/en_US/tutorials/nafnet.md
@@ -0,0 +1,87 @@
+English | [Chinese](../../zh_CN/tutorials/nafnet.md)
+
+## NAFNet:Simple Baselines for Image Restoration
+
+## 1、Introduction
+
+NAFNet proposes an ultra-simple baseline, which is not only computationally efficient but also outperforms the previous SOTA methods; this baseline is then further simplified into NAFNet by removing the non-linear activation units, which improves performance even further. The proposed solution achieves new SOTA performance on both SIDD denoising and GoPro deblurring with a significant reduction in computation. The network design and features are shown in the figure below: a UNet with skip connections is used as the overall architecture, the Transformer module from the Restormer block is modified and its activation functions are removed, a simpler and more efficient SimpleGate design is adopted, and a simplified channel attention mechanism is applied.
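+
+For reference, the SimpleGate mentioned above simply splits the feature map along the channel dimension and multiplies the two halves. The snippet below is a minimal illustrative sketch, not PaddleGAN's actual implementation:
+
+```python
+import paddle
+import paddle.nn as nn
+
+class SimpleGate(nn.Layer):
+    def forward(self, x):
+        # split the channels into two halves and use one half to gate the other
+        x1, x2 = paddle.chunk(x, 2, axis=1)
+        return x1 * x2
+
+# toy usage: a feature map with 8 channels becomes one with 4 channels
+gate = SimpleGate()
+out = gate(paddle.randn([1, 8, 16, 16]))
+print(out.shape)  # [1, 4, 16, 16]
+```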
+
+
+
+For a more detailed introduction to the model, please refer to the original paper [Simple Baselines for Image Restoration](https://arxiv.org/pdf/2204.04676), PaddleGAN currently provides the weight of the denoising task.
+
+## 2 How to use
+
+### 2.1 Quick start
+
+After installing PaddleGAN, you can run the following command to generate the restored image.
+
+```sh
+python applications/tools/nafnet_denoising.py --images_path ${PATH_OF_IMAGE}
+```
+Where `PATH_OF_IMAGE` is the path of the image you need to denoise, or the path of the folder where the images are located. If you need to use your own model weights, run the following command instead, where `PATH_OF_MODEL` is the path to the model weights.
+
+```sh
+python applications/tools/nafnet_denoising.py --images_path ${PATH_OF_IMAGE} --weight_path ${PATH_OF_MODEL}
+```
+
+### 2.2 Prepare dataset
+
+The denoising training dataset is SIDD, an image denoising dataset containing 30,000 noisy images under 10 different lighting conditions, which can be downloaded from [training dataset](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php) and [test dataset](https://drive.google.com/drive/folders/1S44fHXaVxAYW3KLNxK41NYCnyX9S79su).
+After downloading, decompress it to the data directory. After decompression, the structure of the `SIDD dataset` is as follows:
+
+```sh
+SIDD
+├── train
+│ ├── input
+│ └── target
+└── val
+ ├── input
+ └── target
+
+```
+Users can also use the [SIDD data](https://aistudio.baidu.com/aistudio/datasetdetail/149460) on AI Studio, but the folders `input_crops` and `gt_crops` need to be renamed to `input` and `target` (a small sketch of this step follows).
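+
+The renaming step can be sketched as follows; the extraction path is only an example, adjust it to wherever you unpacked the AI Studio data:
+
+```python
+import os
+
+root = "data/SIDD/train"  # example path of the extracted AI Studio data
+os.rename(os.path.join(root, "input_crops"), os.path.join(root, "input"))
+os.rename(os.path.join(root, "gt_crops"), os.path.join(root, "target"))
+```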
+
+### 2.3 Training
+This example trains the denoising task. If you want to train other tasks, you can change the dataset and modify the config file.
+
+```sh
+python -u tools/main.py --config-file configs/nafnet_denoising.yaml
+```
+
+### 2.4 Test
+
+test model:
+```sh
+python tools/main.py --config-file configs/nafnet_denoising.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 Results
+Denoising
+| model | dataset | PSNR/SSIM |
+|---|---|---|
+| NAFNet | SIDD Val | 43.1468 / 0.9563 |
+
+## 4 Download
+
+| model | link |
+|---|---|
+| NAFNet| [NAFNet_Denoising](https://paddlegan.bj.bcebos.com/models/NAFNet_Denoising.pdparams) |
+
+# References
+
+- [Simple Baselines for Image Restoration](https://arxiv.org/pdf/2204.04676)
+
+```
+@article{chen_simple_nodate,
+ title = {Simple {Baselines} for {Image} {Restoration}},
+ abstract = {Although there have been significant advances in the field of image restoration recently, the system complexity of the state-of-the-art (SOTA) methods is increasing as well, which may hinder the convenient analysis and comparison of methods. In this paper, we propose a simple baseline that exceeds the SOTA methods and is computationally efficient. To further simplify the baseline, we reveal that the nonlinear activation functions, e.g. Sigmoid, ReLU, GELU, Softmax, etc. are not necessary: they could be replaced by multiplication or removed. Thus, we derive a Nonlinear Activation Free Network, namely NAFNet, from the baseline. SOTA results are achieved on various challenging benchmarks, e.g. 33.69 dB PSNR on GoPro (for image deblurring), exceeding the previous SOTA 0.38 dB with only 8.4\% of its computational costs; 40.30 dB PSNR on SIDD (for image denoising), exceeding the previous SOTA 0.28 dB with less than half of its computational costs. The code and the pretrained models will be released at github.com/megvii-research/NAFNet.},
+ language = {en},
+ author = {Chen, Liangyu and Chu, Xiaojie and Zhang, Xiangyu and Sun, Jian},
+ pages = {17}
+}
+```
+
+
+
+
diff --git a/docs/en_US/tutorials/pix2pix_cyclegan.md b/docs/en_US/tutorials/pix2pix_cyclegan.md
index 818ea8d5e7ce148e51d46583a04356450bb7c0da..445df4dcb6ffb119d159023fd1c42f0621f4a3b4 100644
--- a/docs/en_US/tutorials/pix2pix_cyclegan.md
+++ b/docs/en_US/tutorials/pix2pix_cyclegan.md
@@ -18,7 +18,7 @@
```
You can download from wget, download facades from wget for example:
```
- wget https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/facades.zip --no-check-certificate
+ wget http://efrosgans.eecs.berkeley.edu/pix2pix/datasets/facades.tar.gz --no-check-certificate
```
### 1.2.2 Train/Test
@@ -43,6 +43,7 @@
| 模型 | 数据集 | 下载地址 |
|---|---|---|
| Pix2Pix_cityscapes | cityscapes | [Pix2Pix_cityscapes](https://paddlegan.bj.bcebos.com/models/Pix2Pix_cityscapes.pdparams)
+| Pix2Pix_facades | facades | [Pix2Pix_facades](https://paddlegan.bj.bcebos.com/models/Pixel2Pixel_facades.pdparams)
@@ -71,7 +72,7 @@
```
You can download from wget, download facades from wget for example:
```
- wget http://efrosgans.eecs.berkeley.edu/pix2pix/datasets/facades.tar.gz --no-check-certificate
+ wget https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/facades.zip --no-check-certificate
```
### 2.2.2 Train/Test
diff --git a/docs/en_US/tutorials/pixel2style2pixel.md b/docs/en_US/tutorials/pixel2style2pixel.md
index 2f37f9eb4d06b74c08365685da94d6186684a6fb..56ced0d14cf5fb93484a8f2146a6324575a76627 100644
--- a/docs/en_US/tutorials/pixel2style2pixel.md
+++ b/docs/en_US/tutorials/pixel2style2pixel.md
@@ -27,7 +27,7 @@ The user could use the following command to generate and select the local image
```
cd applications/
-python -u tools/styleganv2.py \
+python -u tools/pixel2style2pixel.py \
--input_image \
--output_path \
--weight_path \
diff --git a/docs/en_US/tutorials/prenet.md b/docs/en_US/tutorials/prenet.md
new file mode 100644
index 0000000000000000000000000000000000000000..b68e8ca6c8732cea990d6b8fdb5e4023183a11ab
--- /dev/null
+++ b/docs/en_US/tutorials/prenet.md
@@ -0,0 +1,101 @@
+# PReNet
+
+## 1 Introduction
+"Progressive Image Deraining Networks: A Better and Simpler Baseline" provides a better and simpler baseline deraining network by considering network architecture, input and output, and loss functions.
+
+
+

+
+
+## 2 How to use
+
+### 2.1 Prepare dataset
+
+ The dataset (RainH.zip) used by PReNet can be downloaded from [here](https://pan.baidu.com/s/1_vxCatOV3sOA6Vkx1l23eA?pwd=vitu). Uncompress it to get two folders (RainTrainH, Rain100H).
+
+ The structure of the dataset is as follows:
+
+```
+ ├── RainH
+ ├── RainTrainH
+ | ├── rain
+ | | ├── 1.png
+ | | └── 2.png
+ | | .
+ | | .
+ | └── norain
+ | ├── 1.png
+ | └── 2.png
+ | .
+ | .
+ └── Rain100H
+ ├── rain
+ | ├── 001.png
+ | └── 002.png
+ | .
+ | .
+ └── norain
+ ├── 001.png
+ └── 002.png
+ .
+ .
+ ```
+
+### 2.2 Train/Test
+
+
+ train model:
+ ```
+ python -u tools/main.py --config-file configs/prenet.yaml
+ ```
+
+ test model:
+ ```
+ python tools/main.py --config-file configs/prenet.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+## 3 Results
+Evaluated on RGB channels; `scale` pixels on each border are cropped before evaluation.
+
+The metrics are PSNR / SSIM.
+
+| Method | Rain100H |
+|---|---|
+| PReNet | 29.5037 / 0.899 |
+
+
+Input:
+
+
+

+
+
+Output:
+
+
+

+
+
+## 4 Model Download
+
+| model | dataset |
+|---|---|
+| [PReNet](https://paddlegan.bj.bcebos.com/models/PReNet.pdparams) | [RainH.zip](https://pan.baidu.com/s/1_vxCatOV3sOA6Vkx1l23eA?pwd=vitu) |
+
+
+
+
+
+# References
+
+- 1. [Progressive Image Deraining Networks: A Better and Simpler Baseline](https://arxiv.org/pdf/1901.09221v3.pdf)
+
+
+```
+@inproceedings{ren2019progressive,
+ title={Progressive Image Deraining Networks: A Better and Simpler Baseline},
+ author={Ren, Dongwei and Zuo, Wangmeng and Hu, Qinghua and Zhu, Pengfei and Meng, Deyu},
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+ year={2019},
+ }
+```
diff --git a/docs/en_US/tutorials/psgan.md b/docs/en_US/tutorials/psgan.md
index 74af4a6bb413b4aaf03c63c0ddaf4577142d8cff..368a212b4dac1a61e2b8a6dad1c7166c431640f6 100644
--- a/docs/en_US/tutorials/psgan.md
+++ b/docs/en_US/tutorials/psgan.md
@@ -20,7 +20,7 @@ python tools/psgan_infer.py \
--model_path /your/model/path \
--source_path docs/imgs/ps_source.png \
--reference_dir docs/imgs/ref \
- --evaluate-only True
+ --evaluate-only
```
**params:**
- config-file: PSGAN network configuration file, yaml format
diff --git a/docs/en_US/tutorials/singan.md b/docs/en_US/tutorials/singan.md
new file mode 100755
index 0000000000000000000000000000000000000000..0fbe4b30aa428f522d4476c4c2bed0c745bb3a20
--- /dev/null
+++ b/docs/en_US/tutorials/singan.md
@@ -0,0 +1,147 @@
+# SinGAN
+
+## Introduction
+
+SinGAN is a novel unconditional* generative model that is trained using a single image. Traditionally, GANs have been trained on class-specific datasets and capture common features among images of the same class. SinGAN, on the other hand, learns from the overlapping patches at multiple scales of a particular image and learns its internal statistics. Once trained, SinGAN can produce assorted high-quality images of arbitrary sizes and aspect ratios that semantically resemble the training image but contain new object configurations and structures.
+
+\* *An unconditional GAN creates samples purely from randomized input, while a conditional GAN generates samples based on a "class label" that controls the type of image generated.*
+
+## Usage
+
+### About Config Files
+
+We provide 4 config files for the SinGAN model:
+
+- `singan_universal.yaml`
+- `singan_sr.yaml`
+- `singan_animation.yaml`
+- `singan_finetune.yaml`
+
+Among them, `singan_universal.yaml` is a config file suitable for all tasks, `singan_sr.yaml` is the config file recommended by the authors for super resolution, and `singan_animation.yaml` is the config file recommended by the authors for animation. The results shown in this document were trained with `singan_universal.yaml`. For *Paint to Image*, better results can be obtained by finetuning with `singan_finetune.yaml` after training with `singan_universal.yaml`.
+
+### Train
+
+Start training:
+
+```bash
+python tools/main.py -c configs/singan_universal.yaml \
+ -o model.train_image=train_image.png
+```
+
+Finetune for "Paint2Image":
+
+```bash
+python tools/main.py -c configs/singan_finetune.yaml \
+ -o model.train_image=train_image.png \
+ --load weight_saved_in_training.pdparams
+```
+
+### Evaluation
+Run the following command to generate a random image. Note that `train_image.png` must be in the directory `data/singan`, or you can manually modify the value of `dataset.test.dataroot` in the config file. Besides, this directory must contain only one image, namely `train_image.png`.
+```bash
+python tools/main.py -c configs/singan_universal.yaml \
+ -o model.train_image=train_image.png \
+ --load weight_saved_in_training.pdparams \
+ --evaluate-only
+```
+
+### Extract Weight for Generator
+
+After training, we need to use ``tools/extract_weight.py`` to extract the generator weights from the training checkpoint, which contains both the generator and the discriminator. Then we can use `applications/tools/singan.py` for the various SinGAN applications.
+
+```bash
+python tools/extract_weight.py weight_saved_in_training.pdparams --net-name netG --output weight_of_generator.pdparams
+```
+
+### Inference and Result
+
+*Attention: to use a pretrained model, you can replace `--weight_path weight_of_generator.pdparams` in the following commands with `--pretrained_model <model>`, where `<model>` can be `trees`, `stone`, `mountains`, `birds` or `lightning`.*
+
+#### Random Sample
+
+```bash
+# scale_v: vertical scale of the output; scale_h: horizontal scale of the output
+python applications/tools/singan.py \
+    --weight_path weight_of_generator.pdparams \
+    --mode random_sample \
+    --scale_v 1 \
+    --scale_h 1 \
+ --n_row 2 \
+ --n_col 2
+```
+
+|training image|result|
+| ---- | ---- |
+|||
+
+#### Editing & Harmonization
+
+```bash
+# --mode can be editing or harmonization
+python applications/tools/singan.py \
+    --weight_path weight_of_generator.pdparams \
+    --mode editing \
+ --ref_image editing_image.png \
+ --mask_image mask_of_editing.png \
+ --generate_start_scale 2
+```
+
+
+|training image|editing image|mask of editing|result|
+|----|----|----|----|
+|||||
+
+#### Super Resolution
+
+```bash
+python applications/tools/singan.py \
+ --weight_path weight_of_generator.pdparams \
+ --mode sr \
+ --ref_image image_to_sr.png \
+ --sr_factor 4
+```
+|training image|result|
+| ---- | ---- |
+|||
+
+
+#### Animation
+
+```bash
+# animation_alpha: determines how close the frames of the sequence remain to the training image
+# animation_beta: controls the smoothness and rate of change in the generated clip
+# animation_frames: number of frames of the animation
+# animation_duration: duration of each frame
+python applications/tools/singan.py \
+    --weight_path weight_of_generator.pdparams \
+    --mode animation \
+    --animation_alpha 0.6 \
+    --animation_beta 0.7 \
+    --animation_frames 20 \
+    --animation_duration 0.1
+```
+
+|training image|animation|
+| ---- | ---- |
+|||
+
+
+#### Paint to Image
+```bash
+python applications/tools/singan.py \
+ --weight_path weight_of_generator.pdparams \
+ --mode paint2image \
+ --ref_image paint.png \
+ --generate_start_scale 2
+```
+|training image|paint|result|result after finetune|
+|----|----|----|----|
+|||||
+
+## Reference
+
+```
+@misc{shaham2019singan,
+ title={SinGAN: Learning a Generative Model from a Single Natural Image},
+ author={Tamar Rott Shaham and Tali Dekel and Tomer Michaeli},
+ year={2019},
+ eprint={1905.01164},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
diff --git a/docs/en_US/tutorials/super_resolution.md b/docs/en_US/tutorials/single_image_super_resolution.md
similarity index 48%
rename from docs/en_US/tutorials/super_resolution.md
rename to docs/en_US/tutorials/single_image_super_resolution.md
index 27ce932ae01058ff64804ccb5406c8db83c60864..95b95ee2c2b6c76cf947ac62518d41dee9dc17a0 100644
--- a/docs/en_US/tutorials/super_resolution.md
+++ b/docs/en_US/tutorials/single_image_super_resolution.md
@@ -1,12 +1,15 @@
-# 1 Super Resolution
+# 1 Single Image Super Resolution (SISR)
## 1.1 Principle
Super resolution is a process of upscaling and improving the details within an image. It usually takes a low-resolution image as input and upscales the same image to a higher resolution as output.
- Here we provide three super-resolution models, namely [RealSR](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w31/Ji_Real-World_Super-Resolution_via_Kernel_Estimation_and_Noise_Injection_CVPRW_2020_paper.pdf), [ESRGAN](https://arxiv.org/abs/1809.00219v2), [LESRCNN](https://arxiv.org/abs/2007.04344).
- [RealSR](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w31/Ji_Real-World_Super-Resolution_via_Kernel_Estimation_and_Noise_Injection_CVPRW_2020_paper.pdf) proposed a realworld super-resolution model aiming at better perception.
- [ESRGAN](https://arxiv.org/abs/1809.00219v2) is an enhanced SRGAN that improves the three key components of SRGAN.
- [LESRCNN](https://arxiv.org/abs/2007.04344) is a lightweight enhanced SR CNN (LESRCNN) with three successive sub-blocks.
+ Here we provide four super-resolution models, namely [RealSR](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w31/Ji_Real-World_Super-Resolution_via_Kernel_Estimation_and_Noise_Injection_CVPRW_2020_paper.pdf), [ESRGAN](https://arxiv.org/abs/1809.00219v2), [LESRCNN](https://arxiv.org/abs/2007.04344), [PAN](https://arxiv.org/pdf/2010.01073.pdf).
+
+ [RealSR](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w31/Ji_Real-World_Super-Resolution_via_Kernel_Estimation_and_Noise_Injection_CVPRW_2020_paper.pdf) focuses on designing a novel degradation framework for real-world images by estimating various blur kernels as well as real noise distributions. Based on this degradation framework, LR images sharing a common domain with real-world images can be acquired. RealSR is a real-world super-resolution model aiming at better perception. Extensive experiments on synthetic noise data and real-world images demonstrate that RealSR outperforms the state-of-the-art methods, resulting in lower noise and better visual quality.
+
+ [ESRGAN](https://arxiv.org/abs/1809.00219v2) is an enhanced SRGAN. To further enhance the visual quality of SRGAN, ESRGAN improves its three key components. In addition, ESRGAN introduces the Residual-in-Residual Dense Block (RRDB) without batch normalization as the basic network building unit, lets the discriminator predict relative realness instead of an absolute value, and improves the perceptual loss by using features before activation. Benefiting from these improvements, ESRGAN achieves consistently better visual quality with more realistic and natural textures than SRGAN and won first place in the PIRM2018-SR Challenge.
+
+ Considering that applying CNNs to SISR often incurs high computational cost and large memory consumption for training an SR model, a lightweight enhanced SR CNN ([LESRCNN](https://arxiv.org/abs/2007.04344)) was proposed. Extensive experiments demonstrate that LESRCNN outperforms the state of the art on SISR in terms of qualitative and quantitative evaluation. [PAN](https://arxiv.org/pdf/2010.01073.pdf) then designed a lightweight convolutional neural network for image super-resolution (SR) based on pixel attention.
## 1.2 How to use
@@ -20,29 +23,82 @@
| Classical SR Testing | Set5 | Set5 test dataset | [Google Drive](https://drive.google.com/drive/folders/1B3DJGQKB6eNdwuQIhdskA64qUuVKLZ9u) / [Baidu Drive](https://pan.baidu.com/s/1q_1ERCMqALH0xFwjLM0pTg#list/path=%2Fsharelink2016187762-785433459861126%2Fclassical_SR_datasets&parentPath=%2Fsharelink2016187762-785433459861126) |
| Classical SR Testing | Set14 | Set14 test dataset | [Google Drive](https://drive.google.com/drive/folders/1B3DJGQKB6eNdwuQIhdskA64qUuVKLZ9u) / [Baidu Drive](https://pan.baidu.com/s/1q_1ERCMqALH0xFwjLM0pTg#list/path=%2Fsharelink2016187762-785433459861126%2Fclassical_SR_datasets&parentPath=%2Fsharelink2016187762-785433459861126) |
- The structure of DIV2K is as following:
+ The structure of DIV2K, Set5 and Set14 is as follows:
+ ```
+ PaddleGAN
+ ├── data
+ ├── DIV2K
+ ├── DIV2K_train_HR
+ ├── DIV2K_train_LR_bicubic
+ | ├──X2
+ | ├──X3
+ | └──X4
+ ├── DIV2K_valid_HR
+ ├── DIV2K_valid_LR_bicubic
+ Set5
+ ├── GTmod12
+ ├── LRbicx2
+ ├── LRbicx3
+ ├── LRbicx4
+ └── original
+ Set14
+ ├── GTmod12
+ ├── LRbicx2
+ ├── LRbicx3
+ ├── LRbicx4
+ └── original
+ ...
+ ```
+
+ Use the following commands to process the DIV2K data set:
+ ```
+ python data/process_div2k_data.py --data-root data/DIV2K
+ ```
+ When the program finishes, check whether there are ``DIV2K_train_HR_sub``, ``X2_sub``, ``X3_sub`` and ``X4_sub`` directories in the DIV2K directory:
+ ```
+ PaddleGAN
+ ├── data
+ ├── DIV2K
+ ├── DIV2K_train_HR
+ ├── DIV2K_train_HR_sub
+ ├── DIV2K_train_LR_bicubic
+ | ├──X2
+ | ├──X2_sub
+ | ├──X3
+ | ├──X3_sub
+      |    ├──X4
+ | └──X4_sub
+ ├── DIV2K_valid_HR
+ ├── DIV2K_valid_LR_bicubic
+ ...
+ ```
+
+#### Prepare dataset for realsr df2k model
+ Download dataset from [NTIRE 2020 RWSR](https://competitions.codalab.org/competitions/22220#participate) and unzip it to your path.
+ Unzip Corrupted-tr-x.zip and Corrupted-tr-y.zip to ``PaddleGAN/data/ntire20`` directory.
+
+ Run the following commands:
```
- DIV2K
- ├── DIV2K_train_HR
- ├── DIV2K_train_LR_bicubic
- | ├──X2
- | ├──X3
- | └──X4
- ├── DIV2K_valid_HR
- ├── DIV2K_valid_LR_bicubic
- ...
+ python ./data/realsr_preprocess/create_bicubic_dataset.py --dataset df2k --artifacts tdsr
+
+ python ./data/realsr_preprocess/collect_noise.py --dataset df2k --artifacts tdsr
```
- The structures of Set5 and Set14 are similar. Taking Set5 as an example, the structure is as following:
+#### Prepare dataset for realsr dped model
+ Download dataset from [NTIRE 2020 RWSR](https://competitions.codalab.org/competitions/22220#participate) and unzip it to your path.
+ Unzip DPEDiphone-tr-x.zip and DPEDiphone-va.zip to ``PaddleGAN/data/ntire20`` directory.
+
+ Use [KernelGAN](https://github.com/sefibk/KernelGAN) to generate kernels from the source images. Clone that repo, replace SOURCE_PATH with your dataset path, and run:
```
- Set5
- ├── GTmod12
- ├── LRbicx2
- ├── LRbicx3
- ├── LRbicx4
- └── original
+ python train.py --X4 --input-dir SOURCE_PATH
```
+ For convenience, we provide [DPED_KERNEL.tar](https://paddlegan.bj.bcebos.com/datasets/DPED_KERNEL.tar). You can download it to ``PaddleGAN/data/DPED_KERNEL``.
+ Run the following commands:
+ ```
+ python ./data/realsr_preprocess/create_kernel_dataset.py --dataset dped --artifacts clean --kernel_path data/DPED_KERNEL
+ python ./data/realsr_preprocess/collect_noise.py --dataset dped --artifacts clean
+ ```
### 1.2.2 Train/Test
@@ -71,13 +127,14 @@ The metrics are PSNR / SSIM.
| lesrcnn_x4 | 31.9476 / 0.8909 | 28.4110 / 0.7770 | 30.231 / 0.8326 |
| esrgan_psnr_x4 | 32.5512 / 0.8991 | 28.8114 / 0.7871 | 30.7565 / 0.8449 |
| esrgan_x4 | 28.7647 / 0.8187 | 25.0065 / 0.6762 | 26.9013 / 0.7542 |
-
+| pan_x4 | 30.4574 / 0.8643 | 26.7204 / 0.7434 | 28.9187 / 0.8176 |
+| drns_x4 | 32.6684 / 0.8999 | 28.9037 / 0.7885 | - |
-## 1.4 模型下载
-| 模型 | 数据集 | 下载地址 |
+## 1.4 Model Download
+| Method | Dataset | Download Link |
|---|---|---|
| realsr_df2k | df2k | [realsr_df2k](https://paddlegan.bj.bcebos.com/models/realsr_df2k.pdparams)
| realsr_dped | dped | [realsr_dped](https://paddlegan.bj.bcebos.com/models/realsr_dped.pdparams)
@@ -85,6 +142,7 @@ The metrics are PSNR / SSIM.
| lesrcnn_x4 | DIV2K | [lesrcnn_x4](https://paddlegan.bj.bcebos.com/models/lesrcnn_x4.pdparams)
| esrgan_psnr_x4 | DIV2K | [esrgan_psnr_x4](https://paddlegan.bj.bcebos.com/models/esrgan_psnr_x4.pdparams)
| esrgan_x4 | DIV2K | [esrgan_x4](https://paddlegan.bj.bcebos.com/models/esrgan_x4.pdparams)
+| pan_x4 | DIV2K | [pan_x4](https://paddlegan.bj.bcebos.com/models/pan_x4.pdparams)
| drns_x4 | DIV2K | [drns_x4](https://paddlegan.bj.bcebos.com/models/DRNSx4.pdparams)
@@ -127,7 +185,20 @@ The metrics are PSNR / SSIM.
publisher={Elsevier}
}
```
-- 4. [Closed-loop Matters: Dual Regression Networks for Single Image Super-Resolution](https://arxiv.org/pdf/2003.07018.pdf)
+
+- 4. [Efficient Image Super-Resolution Using Pixel Attention](https://arxiv.org/pdf/2010.01073.pdf)
+
+ ```
+ @inproceedings{Hengyuan2020Efficient,
+ title={Efficient Image Super-Resolution Using Pixel Attention},
+ author={Hengyuan Zhao and Xiangtao Kong and Jingwen He and Yu Qiao and Chao Dong},
+ booktitle={Computer Vision – ECCV 2020 Workshops},
+ volume={12537},
+ pages={56-72},
+ year={2020}
+ }
+ ```
+- 5. [Closed-loop Matters: Dual Regression Networks for Single Image Super-Resolution](https://arxiv.org/pdf/2003.07018.pdf)
```
@inproceedings{guo2020closed,
@@ -135,5 +206,5 @@ The metrics are PSNR / SSIM.
author={Guo, Yong and Chen, Jian and Wang, Jingdong and Chen, Qi and Cao, Jiezhang and Deng, Zeshuai and Xu, Yanwu and Tan, Mingkui},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
year={2020}
-}
+ }
```
diff --git a/docs/en_US/tutorials/starganv2.md b/docs/en_US/tutorials/starganv2.md
new file mode 100644
index 0000000000000000000000000000000000000000..5c1866f9dd5e89433f8b3a48e35ab29fb3607794
--- /dev/null
+++ b/docs/en_US/tutorials/starganv2.md
@@ -0,0 +1,74 @@
+# StarGAN V2
+
+## 1 Introduction
+
+ [StarGAN V2](https://arxiv.org/pdf/1912.01865.pdf) is an image-to-image translation model published at CVPR 2020.
+ A good image-to-image translation model should learn a mapping between different visual domains while satisfying the following properties: 1) diversity of generated images and 2) scalability over multiple domains. Existing methods address either of the issues, having limited diversity or multiple models for all domains. StarGAN v2 is a single framework that tackles both and shows significantly improved results over the baselines. Experiments on CelebA-HQ and a new animal faces dataset (AFHQ) validate superiority of StarGAN v2 in terms of visual quality, diversity, and scalability.
+
+## 2 How to use
+
+### 2.1 Prepare dataset
+
+ The CelebAHQ dataset used by StarGAN V2 can be downloaded from [here](https://www.dropbox.com/s/f7pvjij2xlpff59/celeba_hq.zip?dl=0), and the AFHQ dataset can be downloaded from [here](https://www.dropbox.com/s/t9l9o3vsx2jai3z/afhq.zip?dl=0). Then unzip the datasets to the ``PaddleGAN/data`` directory.
+
+ The structure of the dataset is as follows:
+
+ ```
+ ├── data
+ ├── afhq
+ | ├── train
+ | | ├── cat
+ | | ├── dog
+ | | └── wild
+ | └── val
+ | ├── cat
+ | ├── dog
+ | └── wild
+ └── celeba_hq
+ ├── train
+ | ├── female
+ | └── male
+ └── val
+ ├── female
+ └── male
+
+ ```
+
+### 2.2 Train/Test
+
+ This example uses the AFHQ dataset. If you want to use the CelebAHQ dataset, you can change the config file.
+
+ train model:
+ ```
+ python -u tools/main.py --config-file configs/starganv2_afhq.yaml
+ ```
+
+ test model:
+ ```
+ python tools/main.py --config-file configs/starganv2_afhq.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+## 3 Results
+
+
+
+## 4 Model Download
+| Model | Dataset | Download Link |
+|---|---|---|
+| starganv2_afhq | AFHQ | [starganv2_afhq](https://paddlegan.bj.bcebos.com/models/starganv2_afhq.pdparams)
+
+
+
+
+# References
+
+- 1. [StarGAN v2: Diverse Image Synthesis for Multiple Domains](https://arxiv.org/abs/1912.01865)
+
+ ```
+ @inproceedings{choi2020starganv2,
+ title={StarGAN v2: Diverse Image Synthesis for Multiple Domains},
+ author={Yunjey Choi and Youngjung Uh and Jaejun Yoo and Jung-Woo Ha},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ year={2020}
+ }
+ ```
diff --git a/docs/en_US/tutorials/styleganv2.md b/docs/en_US/tutorials/styleganv2.md
index dc742c391418ddf486196c2ae9b5aa1440175bfd..ed54e77c3297b5322703dc0d4a7df0414749f3c2 100644
--- a/docs/en_US/tutorials/styleganv2.md
+++ b/docs/en_US/tutorials/styleganv2.md
@@ -54,9 +54,57 @@ python -u tools/styleganv2.py \
- n_col: the number of columns of the sampled picture
- cpu: whether to use cpu inference, if not, please remove it from the command
-### Train (TODO)
+### Train
+
+#### Prepare datasets
+You can get the FFHQ dataset from [here](https://drive.google.com/drive/folders/1u2xu7bSrWxrbUxk-dT-UvEJq8IjdmNTP)
+
+For convenience, we provide [images256x256.tar](https://paddlegan.bj.bcebos.com/datasets/images256x256.tar)
+
+The structure of the StyleGAN data is as follows:
+ ```
+ PaddleGAN
+ ├── data
+ ├── ffhq
+ ├──images1024x1024
+ ├── 00000.png
+ ├── 00001.png
+ ├── 00002.png
+ ├── 00003.png
+ ├── 00004.png
+ ├──images256x256
+ ├── 00000.png
+ ├── 00001.png
+ ├── 00002.png
+ ├── 00003.png
+ ├── 00004.png
+ ├──custom_data
+ ├── img0.png
+ ├── img1.png
+ ├── img2.png
+ ├── img3.png
+ ├── img4.png
+ ...
+ ```
+
+Train the model:
+```
+python tools/main.py -c configs/stylegan_v2_256_ffhq.yaml
+```
+
+### Inference
+
+When you finish training, you need to use ``tools/extract_weight.py`` to extract the corresponding weights.
+```
+python tools/extract_weight.py output_dir/YOUR_TRAINED_WEIGHT.pdparams --net-name gen_ema --output YOUR_WEIGHT_PATH.pdparams
+```
+
+Then use ``applications/tools/styleganv2.py`` to generate results:
+```
+python tools/styleganv2.py --output_path stylegan01 --weight_path YOUR_WEIGHT_PATH.pdparams --size 256
+```
-In the future, training scripts will be added to facilitate users to train more types of StyleGAN V2 image generators.
+Note: ``--size`` must match the size in your config file.
## Results
diff --git a/docs/en_US/tutorials/styleganv2clip.md b/docs/en_US/tutorials/styleganv2clip.md
new file mode 100644
index 0000000000000000000000000000000000000000..c22519f6c787566b6118f603fd90cf88924a7d4d
--- /dev/null
+++ b/docs/en_US/tutorials/styleganv2clip.md
@@ -0,0 +1,144 @@
+# StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery
+
+## Introduction
+
+The task of StyleGAN V2 is image generation, while the CLIP-guided Editing module uses attribute manipulation vectors obtained with the CLIP (Contrastive Language-Image Pre-training) model to map text prompts to input-agnostic directions in StyleGAN's style space, enabling interactive text-driven image manipulation.
+
+
+This model uses a pretrained StyleGAN V2 generator and the Pixel2Style2Pixel model for image encoding. At present, only the model for portrait editing (trained on the FFHQ dataset) is available.
+
+The Paddle-CLIP and dlib packages are needed for this module.
+
+```
+pip install -e .
+pip install paddleclip
+pip install dlib-bin
+```
+
+## How to use
+
+### Editing
+
+```
+cd applications/
+python -u tools/styleganv2clip.py \
+    --latent \
+ --output_path \
+ --weight_path \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --direction_path \
+ --neutral \
+ --target \
+ --beta_threshold 0.12 \
+    --direction_offset 5 \
+ --cpu
+```
+
+**params:**
+- latent: The path of the style vector which represents an image. Come from `dst.npy` generated by Pixel2Style2Pixel or `dst.fitting.npy` generated by StyleGANv2 Fitting module
+- output_path: the directory where the generated images are stored
+- weight_path: pretrained StyleGANv2 model path
+- model_type: inner model type, currently only `ffhq-config-f` is available.
+- direction_path: The path of CLIP mapping vector
+- stat_path: The path of the latent statistic file
+- neutral: Description of the source image, for example: face
+- target: Description of the target image, for example: young face
+- beta_threshold: editing threshold for the attribute channels
+- direction_offset: Offset strength of the attribute (see the conceptual sketch after this parameter list)
+- cpu: whether to use cpu inference, if not, please remove it from the command
+
+>inherited params for the pretrained StyleGAN model
+- size: model parameters, output image resolution
+- style_dim: model parameters, dimensions of style z
+- n_mlp: model parameters, the number of multi-layer perception layers for style z
+- channel_multiplier: model parameters, channel product, affect model size and the quality of generated pictures
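+
+The role of `beta_threshold` and `direction_offset` can be summarized with a conceptual NumPy sketch (a simplified assumption, not the predictor's actual code): channels whose relevance to the text direction falls below the threshold are dropped, and the remaining direction is applied with the given strength.
+
+```python
+import numpy as np
+
+def edit_style(style, direction, relevance, beta_threshold, direction_offset):
+    # Zero out style channels whose relevance to the CLIP text direction
+    # is below beta_threshold, then move along the remaining direction.
+    masked = np.where(np.abs(relevance) >= beta_threshold, direction, 0.0)
+    return style + direction_offset * masked
+```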
+
+### Results
+
+Input portrait:
+
+

+
+
+with
+> direction_offset = [ -1, 0, 1, 2, 3, 4, 5]
+> beta_threshold = 0.1
+
+edit from 'face' to 'boy face':
+
+
+
+
+edit from 'face' to 'happy face':
+
+
+
+
+edit from 'face' to 'angry face':
+
+
+
+edit from 'face' to 'face with long hair':
+
+
+
+
+
+edit from 'face' to 'face with curly hair':
+
+
+
+
+edit from 'head with black hair' to 'head with gold hair':
+
+
+
+## Make Attribute Direction Vector
+
+For details, please refer to [Puzer/stylegan-encoder](https://github.com/Puzer/stylegan-encoder/blob/master/Learn_direction_in_latent_space.ipynb)
+
+Currently, pretrained weights for `stylegan2` with the `ffhq-config-f` dataset are provided:
+
+direction: https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f-styleclip-global-directions.pdparams
+
+stats: https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f-styleclip-stats.pdparams
+
+## Training
+
+1. extract style latent vector stats
+```
+python styleclip_getf.py
+```
+2. calculate the mapping vectors using the CLIP model
+
+```
+python ppgan/apps/styleganv2clip_predictor.py extract
+```
+
+# Reference
+
+- 1. [StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery](https://arxiv.org/abs/2103.17249)
+
+ ```
+ @article{Patashnik2021StyleCLIPTM,
+ title={StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery},
+ author={Or Patashnik and Zongze Wu and Eli Shechtman and Daniel Cohen-Or and D. Lischinski},
+ journal={2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
+ year={2021},
+ pages={2065-2074}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/en_US/tutorials/styleganv2editing.md b/docs/en_US/tutorials/styleganv2editing.md
new file mode 100644
index 0000000000000000000000000000000000000000..367372ad07d8f60d9680be2385b064630602c750
--- /dev/null
+++ b/docs/en_US/tutorials/styleganv2editing.md
@@ -0,0 +1,91 @@
+# StyleGAN V2 Editing Module
+
+## StyleGAN V2 Editing introduction
+
+The task of StyleGAN V2 is image generation, while the Editing module uses attribute manipulation vectors, obtained by pre-classifying and regressing the style vectors of many generated images, to manipulate the attributes of a generated image.
+
+## How to use
+
+### Editing
+
+
+The user can use the following command to edit images:
+
+```
+cd applications/
+python -u tools/styleganv2editing.py \
+ --latent \
+ --output_path \
+ --weight_path \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --direction_path \
+ --direction_name \
+ --direction_offset 0.0 \
+ --cpu
+```
+
+**params:**
+- latent: The path of the style vector which represents an image. Come from `dst.npy` generated by Pixel2Style2Pixel or `dst.fitting.npy` generated by StyleGANv2 Fitting module
+- output_path: the directory where the generated images are stored
+- weight_path: pretrained model path
+- model_type: inner model type in PaddleGAN. If you use an existing model type, `weight_path` will have no effect.
+ Currently recommended use: `ffhq-config-f`
+- size: model parameters, output image resolution
+- style_dim: model parameters, dimensions of style z
+- n_mlp: model parameters, the number of multi-layer perception layers for style z
+- channel_multiplier: model parameters, channel product, affect model size and the quality of generated pictures
+- direction_path: The path of the file storing a series of attribute names and object attribute vectors. The default is empty, that is, the file that comes with ppgan is used. If you don’t use it, please remove it from the command
+- direction_name: Attribute to be manipulated. For `ffhq-config-f`, the available attributes are: age, eyes_open, eye_distance, eye_eyebrow_distance, eye_ratio, gender, lip_ratio, mouth_open, mouth_ratio, nose_mouth_distance, nose_ratio, nose_tip, pitch, roll, smile, yaw
+- direction_offset: Offset strength of the attribute (see the conceptual sketch after this parameter list)
+- cpu: whether to use cpu inference, if not, please remove it from the command
+
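+Conceptually, editing simply moves the style vector along a pre-computed attribute direction. A rough NumPy sketch (the file names are illustrative and this is not the tool's actual code):
+
+```python
+import numpy as np
+
+latent = np.load("dst.npy")               # style vector from Pixel2Style2Pixel or Fitting
+direction = np.load("age_direction.npy")  # hypothetical single-attribute direction vector
+
+# Sweep direction_offset over [-5, -2.5, 0, 2.5, 5], as in the results below.
+for offset in (-5.0, -2.5, 0.0, 2.5, 5.0):
+    edited = latent + offset * direction
+    np.save(f"dst_age_{offset}.npy", edited)
+```
+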
+## Editing Results
+
+The image corresponding to the style vector:
+
+
+

+
+
+The image obtained by editing the `age` attribute according to [-5,-2.5,0,2.5,5]:
+
+
+

+
+
+The image obtained by further editing `gender` on the style vector produced by the `-5` offset:
+
+
+

+
+
+## Make Attribute Direction Vector
+
+For details, please refer to [Puzer/stylegan-encoder](https://github.com/Puzer/stylegan-encoder/blob/master/Learn_direction_in_latent_space.ipynb)
+
+## Reference
+
+- 1. [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+
+ ```
+ @article{Karras2019stylegan2,
+ title={Analyzing and Improving the Image Quality of {StyleGAN}},
+ author={Tero Karras and Samuli Laine and Miika Aittala and Janne Hellsten and Jaakko Lehtinen and Timo Aila},
+ booktitle={Proc. CVPR},
+ year={2020}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/en_US/tutorials/styleganv2fitting.md b/docs/en_US/tutorials/styleganv2fitting.md
new file mode 100644
index 0000000000000000000000000000000000000000..f3c5201b7a2220187204f9b804e7744cc463cd34
--- /dev/null
+++ b/docs/en_US/tutorials/styleganv2fitting.md
@@ -0,0 +1,94 @@
+# StyleGAN V2 Fitting Module
+
+## StyleGAN V2 Fitting introduction
+
+The task of StyleGAN V2 is image generation, while the Fitting module inverts an existing image into a highly disentangled style vector. The resulting style vector can be used in tasks such as face fusion and face attribute editing.
+
+## How to use
+
+### Fitting
+
+The user can use the following command to fit images:
+
+```
+cd applications/
+python -u tools/styleganv2fitting.py \
+ --input_image \
+ --need_align \
+ --start_lr 0.1 \
+ --final_lr 0.025 \
+ --latent_level 0 1 2 3 4 5 6 7 8 9 10 11 \
+ --step 100 \
+ --mse_weight 1 \
+ --pre_latent \
+ --output_path \
+ --weight_path \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --cpu
+```
+
+**params:**
+- input_image: the input image file path
+- need_align: whether to crop the image to one the model can recognize. For an image that has already been cropped, such as the `src.png` pre-generated when Pixel2Style2Pixel is used to generate the style vector, the `need_align` parameter can be omitted
+- start_lr: learning rate at the beginning of training
+- final_lr: learning rate at the end of training
+- latent_level: The style vector levels involved in fitting are from 0 to 17 at 1024 resolution, from 0 to 15 at 512 resolution, and so on. The lower the level, the more biased toward the overall style change. The higher the level, the more biased toward the detail style change
+- step: the number of steps required to fit the image; the larger the number of steps, the longer it takes and the better the effect (see the fitting-loop sketch after this parameter list)
+- mse_weight: weight of the MSE loss
+- pre_latent: path of a pre-made style vector file, which helps the fitting converge better. The default is empty; you can fill in the file path of `dst.npy` generated by Pixel2Style2Pixel
+- output_path: the directory where the generated images are stored
+- weight_path: pretrained model path
+- model_type: inner model type in PaddleGAN. If you use an existing model type, `weight_path` will have no effect.
+ Currently recommended use: `ffhq-config-f`
+- size: model parameters, output image resolution
+- style_dim: model parameters, dimensions of style z
+- n_mlp: model parameters, the number of multi-layer perception layers for style z
+- channel_multiplier: model parameters, channel product, affect model size and the quality of generated pictures
+- cpu: whether to use cpu inference, if not, please remove it from the command
+
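+A minimal sketch of the fitting loop (assumptions: a callable `generator` and an MSE loss only; the real module also decays the learning rate from `start_lr` to `final_lr` and combines the MSE term with other losses, which are omitted here):
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+def fit_latent(generator, target, init_latent, step=100, start_lr=0.1, mse_weight=1.0):
+    # Optimize the style vector so that the generated image matches the target image.
+    latent = paddle.create_parameter(
+        shape=init_latent.shape, dtype='float32',
+        default_initializer=paddle.nn.initializer.Assign(init_latent))
+    optimizer = paddle.optimizer.Adam(learning_rate=start_lr, parameters=[latent])
+    for _ in range(step):
+        fake = generator(latent)
+        loss = mse_weight * F.mse_loss(fake, target)
+        loss.backward()
+        optimizer.step()
+        optimizer.clear_grad()
+    return latent
+```
+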
+## Fitting Results
+
+Source image:
+
+
+

+
+
+Image encoded by Pixel2Style2Pixel:
+
+
+

+
+
+Starting from the style vector generated by Pixel2Style2Pixel, the Fitting module is run for 1000 steps to obtain the result:
+
+
+

+
+
+## Reference
+
+- 1. [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+
+ ```
+ @article{Karras2019stylegan2,
+ title={Analyzing and Improving the Image Quality of {StyleGAN}},
+ author={Tero Karras and Samuli Laine and Miika Aittala and Janne Hellsten and Jaakko Lehtinen and Timo Aila},
+ booktitle={Proc. CVPR},
+ year={2020}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/en_US/tutorials/styleganv2mixing.md b/docs/en_US/tutorials/styleganv2mixing.md
new file mode 100644
index 0000000000000000000000000000000000000000..2bbeade69e4f72d05c25956d6189f920bfe2e103
--- /dev/null
+++ b/docs/en_US/tutorials/styleganv2mixing.md
@@ -0,0 +1,108 @@
+# StyleGAN V2 Mixing Module
+
+## StyleGAN V2 Mixing introduction
+
+The task of StyleGAN V2 is image generation, while the Mixing module blends two generated images by mixing their style vectors at different levels and in different proportions.
+
+## How to use
+
+### Mixing
+
+
+The user can use the following command to mix images:
+
+```
+cd applications/
+python -u tools/styleganv2mixing.py \
+ --latent1 \
+ --latent2 \
+ --weights \
+ 0.5 0.5 0.5 0.5 0.5 0.5 \
+ 0.5 0.5 0.5 0.5 0.5 0.5 \
+ 0.5 0.5 0.5 0.5 0.5 0.5 \
+ --output_path \
+ --weight_path \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --cpu
+```
+
+**params:**
+- latent1: The path of the first style vector. Come from `dst.npy` generated by Pixel2Style2Pixel or `dst.fitting.npy` generated by StyleGANv2 Fitting module
+- latent2: The path of the second style vector. The source is the same as the first style vector
+- weights: The two style vectors are mixed in different proportions at different levels. For a resolution of 1024 there are 18 levels; for a resolution of 512 there are 16 levels, and so on.
+  Earlier levels affect the overall structure of the mixed image more, while later levels affect its details more (see the conceptual sketch after this parameter list). In the figure below we show the fusion results of different weights for reference.
+- output_path: the directory where the generated images are stored
+- weight_path: pretrained model path
+- model_type: inner model type in PaddleGAN. If you use an existing model type, `weight_path` will have no effect.
+ Currently recommended use: `ffhq-config-f`
+- size: model parameters, output image resolution
+- style_dim: model parameters, dimensions of style z
+- n_mlp: model parameters, the number of multi-layer perception layers for style z
+- channel_multiplier: model parameters, channel product, affect model size and the quality of generated pictures
+- cpu: whether to use cpu inference, if not, please remove it from the command
+
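+A rough NumPy sketch of the per-level mixing (assuming style vectors of shape `(num_levels, style_dim)`; the actual latent files may carry an extra batch dimension, and this is not the tool's exact code):
+
+```python
+import numpy as np
+
+latent1 = np.load("dst1.npy")        # e.g. shape (18, 512) at 1024 resolution
+latent2 = np.load("dst2.npy")
+weights = np.full(18, 0.5)           # one mixing weight per level
+
+# Level i keeps weights[i] of the first vector and (1 - weights[i]) of the second.
+mixed = weights[:, None] * latent1 + (1.0 - weights[:, None]) * latent2
+np.save("mixed.npy", mixed)
+```
+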
+## Mixing Results
+
+The image corresponding to the first style vector:
+
+
+

+
+
+The image corresponding to the second style vector:
+
+
+

+
+
+The result of mixing two style vectors in a specific ratio:
+
+
+

+
+
+## Results with different weight
+The image corresponding to the first style vector:
+
+
+

+
+
+The image corresponding to the second style vector:
+
+
+

+
+
+The result of mixing two style vectors with different weight:
+
+

+
+
+
+## Reference
+
+- 1. [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+
+ ```
+ @article{Karras2019stylegan2,
+ title={Analyzing and Improving the Image Quality of {StyleGAN}},
+ author={Tero Karras and Samuli Laine and Miika Aittala and Janne Hellsten and Jaakko Lehtinen and Timo Aila},
+ booktitle={Proc. CVPR},
+ year={2020}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/en_US/tutorials/swinir.md b/docs/en_US/tutorials/swinir.md
new file mode 100644
index 0000000000000000000000000000000000000000..e7752dbd69632ffd9ee09619e38361fffabadde3
--- /dev/null
+++ b/docs/en_US/tutorials/swinir.md
@@ -0,0 +1,81 @@
+English | [Chinese](../../zh_CN/tutorials/swinir.md)
+
+## SwinIR: Strong Baseline Model for Image Restoration Based on Swin Transformer
+
+## 1 Introduction
+
+The structure of SwinIR is relatively simple and will look familiar to anyone who has seen Swin Transformer. The authors introduce the Swin-T structure for low-level vision tasks, including image super-resolution reconstruction, image denoising, and image compression artifact removal. The SwinIR network consists of a shallow feature extraction module, a deep feature extraction module and a reconstruction module, where the reconstruction module uses different structures for different tasks. Shallow feature extraction is a 3×3 convolutional layer. Deep feature extraction is composed of k RSTB blocks and a convolutional layer plus a residual connection. Each RSTB (Res-Swin-Transformer-Block) consists of L STLs and a convolutional layer plus a residual connection. The structure of the model is shown in the following figure:
+
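+The RSTB structure described above can be sketched in Paddle as follows (a structural sketch only: the Swin Transformer layers are passed in as a black box, and the token/feature-map reshaping of the real implementation is omitted):
+
+```python
+import paddle.nn as nn
+
+class RSTB(nn.Layer):
+    # Residual Swin Transformer Block: L Swin Transformer layers (STL),
+    # one 3x3 convolution, and a residual connection around the whole block.
+    def __init__(self, dim, stl_layers):
+        super().__init__()
+        self.stls = nn.LayerList(stl_layers)  # the L STLs, provided by the caller
+        self.conv = nn.Conv2D(dim, dim, kernel_size=3, padding=1)
+
+    def forward(self, x):
+        y = x
+        for stl in self.stls:
+            y = stl(y)
+        return x + self.conv(y)
+```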
+
+
+For a more detailed introduction to the model, please refer to the original paper [SwinIR: Image Restoration Using Swin Transformer](https://arxiv.org/pdf/2108.10257.pdf). PaddleGAN currently provides weights for the denoising task.
+
+## 2 How to use
+
+### 2.1 Quick start
+
+After installing PaddleGAN, you can run the following command to generate the restored image.
+
+```sh
+python applications/tools/swinir_denoising.py --images_path ${PATH_OF_IMAGE}
+```
+Where `PATH_OF_IMAGE` is the path of the image you need to denoise, or the path of the folder containing the images.
+
+### 2.2 Prepare dataset
+
+#### Train Dataset
+
+[DIV2K](https://cv.snu.ac.kr/research/EDSR/DIV2K.tar) (800 training images) + [Flickr2K](https://cv.snu.ac.kr/research/EDSR/Flickr2K.tar) (2650 images) + [BSD500](http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/BSR/BSR_bsds500.tgz) (400 training&testing images) + [WED](http://ivc.uwaterloo.ca/database/WaterlooExploration/exploration_database_and_code.rar)(4744 images)
+
+The pre-processed data is available on [AI Studio](https://aistudio.baidu.com/aistudio/datasetdetail/149405).
+
+The training data is placed under: `data/trainsets/trainH`
+
+#### Test Dataset
+
+The test data is CBSD68, available on [AI Studio](https://aistudio.baidu.com/aistudio/datasetdetail/147756).
+
+Extract to: `data/triansets/CBSD68`
+
+### 2.3 Training
+This example trains the denoising task. If you want to train other tasks, you can change the dataset and modify the config file.
+
+```sh
+python -u tools/main.py --config-file configs/swinir_denoising.yaml
+```
+
+### 2.4 Test
+
+test model:
+```sh
+python tools/main.py --config-file configs/swinir_denoising.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 Results
+Denoising
+| model | dataset | PSNR/SSIM |
+|---|---|---|
+| SwinIR | CBSD68 | 36.0819 / 0.9464 |
+
+## 4 Download
+
+| model | link |
+|---|---|
+| SwinIR| [SwinIR_Denoising](https://paddlegan.bj.bcebos.com/models/SwinIR_Denoising.pdparams) |
+
+# References
+
+- [SwinIR: Image Restoration Using Swin Transformer](https://arxiv.org/pdf/2108.10257.pdf)
+
+```
+@article{liang2021swinir,
+ title={SwinIR: Image Restoration Using Swin Transformer},
+ author={Liang, Jingyun and Cao, Jiezhang and Sun, Guolei and Zhang, Kai and Van Gool, Luc and Timofte, Radu},
+ journal={arXiv preprint arXiv:2108.10257},
+ year={2021}
+}
+```
+
+
+
+
diff --git a/docs/en_US/tutorials/ugatit.md b/docs/en_US/tutorials/ugatit.md
index f32e16d216155dd68276a1213bd3f8d1e8e621e5..273525604e747d4068a92097ac3e24a5410a0ee0 100644
--- a/docs/en_US/tutorials/ugatit.md
+++ b/docs/en_US/tutorials/ugatit.md
@@ -1,3 +1,60 @@
-### U-GAT-IT
+# 1 U-GAT-IT
+
+## 1.1 Principle
+
+ Similar to CycleGAN, [U-GAT-IT](https://arxiv.org/abs/1907.10830) uses unpaired images for image translation: given two sets of images with different styles, it automatically performs style transfer between them. Unlike CycleGAN, U-GAT-IT is a novel method for unsupervised image-to-image translation that incorporates a new attention module and a new learnable normalization function in an end-to-end manner.
+
+## 1.2 How to use
+
+### 1.2.1 Prepare Datasets
+
+ The selfie2anime dataset used by U-GAT-IT can be downloaded from [here](https://www.kaggle.com/arnaud58/selfie2anime). You can also use your own dataset.
+ The structure of the dataset is as follows:
+ ```
+ ├── dataset
+ └── YOUR_DATASET_NAME
+ ├── trainA
+ ├── trainB
+ ├── testA
+ └── testB
+ ```
+
+### 1.2.2 Train/Test
+
+ The dataset used in this example is selfie2anime; you can change it to your own dataset in the config file.
+
+ Train a model:
+ ```
+ python -u tools/main.py --config-file configs/ugatit_selfie2anime_light.yaml
+ ```
+
+ Test the model:
+ ```
+ python tools/main.py --config-file configs/ugatit_selfie2anime_light.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+## 1.3 Results
+
+
+
+## 1.4 Model Download
+| Model | Dataset | Download Link |
+|---|---|---|
+| ugatit_light | selfie2anime | [ugatit_light](https://paddlegan.bj.bcebos.com/models/ugatit_light.pdparams)
+
+
+
+
+# References
+
+- 1. [U-GAT-IT: Unsupervised Generative Attentional Networks with Adaptive Layer-Instance Normalization for Image-to-Image Translation](https://arxiv.org/abs/1907.10830)
+
+ ```
+ @article{kim2019u,
+ title={U-GAT-IT: unsupervised generative attentional networks with adaptive layer-instance normalization for image-to-image translation},
+ author={Kim, Junho and Kim, Minjae and Kang, Hyeonwoo and Lee, Kwanghee},
+ journal={arXiv preprint arXiv:1907.10830},
+ year={2019}
+ }
+ ```
-To be added, you can refer to [Training/Evaluating/Testing basic usage](../get_started.md)
diff --git a/docs/en_US/tutorials/video_restore.md b/docs/en_US/tutorials/video_restore.md
deleted file mode 120000
index 6043d42afd6af0b019f09b636318cbddc9d2a913..0000000000000000000000000000000000000000
--- a/docs/en_US/tutorials/video_restore.md
+++ /dev/null
@@ -1 +0,0 @@
-../../zh_CN/tutorials/video_restore.md
\ No newline at end of file
diff --git a/docs/en_US/tutorials/video_restore.md b/docs/en_US/tutorials/video_restore.md
new file mode 100644
index 0000000000000000000000000000000000000000..5022bc03d98d73cad139285c2aaf32e7afd9e227
--- /dev/null
+++ b/docs/en_US/tutorials/video_restore.md
@@ -0,0 +1,177 @@
+## Old Video Restoration
+
+Old videos often have a low frame rate, no color, and low resolution. In view of these characteristics, we use frame interpolation, colorization, and super-resolution models to restore them.
+
+### Quick start: restore a video with 'video-enhance.py' in 'applications'
+```
+cd applications
+python tools/video-enhance.py --input you_video_path.mp4 --process_order DAIN DeOldify EDVR --output output_dir
+```
+#### Parameters
+
+- `--input (str)`: path of the input video.
+- `--output (str)`: path of the output video.
+- `--process_order`: names and order of the models to call. For example, if the input is `DAIN DeOldify EDVR`, then `DAINPredictor`, `DeOldifyPredictor` and `EDVRPredictor` will be called in sequence.
+- `--cpu`: use cpu inference, GPU inference is used by default.
+
+#### Results
+
+
+
+### Quick experience
+We provide an [old Beijing video restoration tutorial](https://aistudio.baidu.com/aistudio/projectdetail/1161285) on AI Studio.
+
+### Points for attention
+
+* Before using this tutorial, please make sure that you have [installed paddle and ppgan](../install.md).
+
+* All commands in this tutorial are executed based on the 'PaddleGAN/applications' home directory.
+
+* Each model takes a long time, especially the super-resolution models. It is recommended to use input videos with lower resolution and shorter duration.
+
+* It needs to run on GPU environment.
+
+### Brief introduction of ppgan's prediction API for video restoration
+Different models and parameters can be used according to the characteristics of the video to be restored.
+
+### Frame interpolation model -- DAIN
+The DAIN model explicitly detects occlusion by exploring depth information. A depth-aware flow projection layer is developed to synthesize intermediate flows. It works well for video frame interpolation.
+
+
+```
+ppgan.apps.DAINPredictor(
+ output='output',
+ weight_path=None,
+ time_step=None,
+ use_gpu=True,
+ remove_duplicates=False)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+- `time_step (float)`: time step for frame interpolation. If it is set to 0.5 and the original video is 30 frames per second, the output will be 60 frames per second after interpolation.
+- `remove_duplicates (bool, Optional)`: whether to remove duplicate frames, default: `False`.
+
+### Coloring model -- DeOldifyPredictor
+DeOldify is a generative adversarial network with a self-attention mechanism, and its generator is a U-Net. The model works well for image colorization.
+
+
+```
+ppgan.apps.DeOldifyPredictor(output='output', weight_path=None, render_factor=32)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+- `artistic (bool)`: whether use "artistic" model. The "artistic" model may produce some interesting colors, but there are more burrs.
+- `render_factor (int)`: this parameter is multiplied by 16 as the resize value of the input frame. If the value is set to 32, the input frame will be resized to (32 * 16, 32 * 16) and then input into the network.
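+
+For example, with `render_factor=32` the input frames are resized to 512 × 512 (32 * 16) before being fed to the network. A usage sketch (assuming the predictor exposes a `run()` method; the input path is illustrative):
+
+```python
+from ppgan.apps import DeOldifyPredictor
+
+# Colorize a whole video with the assumed run() entry point.
+colorizer = DeOldifyPredictor(output='output', render_factor=32)
+colorizer.run('old_video.mp4')
+```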
+
+### Coloring model -- DeepRemasterPredictor
+The DeepRemaster model is based on a spatio-temporal convolutional neural network and a self-attention mechanism, and frames can be colorized according to any number of input reference frames.
+
+
+```
+ppgan.apps.DeepRemasterPredictor(
+ output='output',
+ weight_path=None,
+ colorization=False,
+ reference_dir=None,
+    mindim=360)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+- `colorization (bool)`: whether color the input video. If the option is set to `True`, the path of the reference frame must also be set. Default: `False`.
+- `reference_dir (str, Optional)`: path of the reference frames, default: `None`.
+- `mindim (int)`: size of the short side of the input frame after resize, default: `360`.
+
+### Super resolution model -- RealSRPredictor
+The RealSR model focuses on designing a novel degradation framework for real-world images by estimating various blur kernels as well as real noise distributions. Based on this degradation framework, LR images sharing a common domain with real-world images can be acquired. RealSR is a real-world super-resolution model aiming at better perception. Extensive experiments on synthetic noise data and real-world images demonstrate that RealSR outperforms the state-of-the-art methods, resulting in lower noise and better visual quality.
+
+
+
+```
+ppgan.apps.RealSRPredictor(output='output', weight_path=None)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+
+### Super resolution model -- EDVRPredictor
+The EDVR model proposes a novel video restoration framework with enhanced deformable convolution. First, to handle large motions, it devises a Pyramid, Cascading and Deformable (PCD) alignment module, in which frame alignment is done at the feature level using deformable convolutions in a coarse-to-fine manner. Second, it proposes a Temporal and Spatial Attention (TSA) fusion module, in which attention is applied both temporally and spatially, so as to emphasize important features for subsequent restoration.
+
+The EDVR model is a super-resolution model based on consecutive frames, which can effectively use the information between frames and is faster than the RealSR model.
+
+
+
+```
+ppgan.apps.EDVRPredictor(output='output', weight_path=None)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+
+### Video super-resolution model -- BasicVSRPredictor & IconVSRPredictor
+BasicVSR is a generic and efficient baseline for VSR. With minimal redesigns of existing components including optical flow and residual blocks, it outperforms existing state-of-the-art methods with high efficiency. BasicVSR adopts a typical bidirectional recurrent network. The upsampling module U contains multiple pixel-shuffle and convolution layers. The red and blue colors represent the backward and forward propagations, respectively. The propagation branches contain only generic components. S, W, and R refer to the flow estimation module, spatial warping module, and residual blocks, respectively.
+
+
+
+```
+ppgan.apps.BasiVSRPredictor(output='output', weight_path=None, num_frames=10)
+ppgan.apps.IconVSRPredictor(output='output', weight_path=None, num_frames=10)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+- `num_frames (10, Optional)`: the number of video frames input at a time. Default: `10`.
+
+
+### Video super-resolution model -- BasicVSRPlusPlusPredictor
+BasicVSR++ consists of two effective modifications for improving propagation and alignment. The proposed second-order grid propagation and flow-guided deformable alignment allow BasicVSR++ to significantly outperform existing state-of-the-art methods with comparable runtime. BasicVSR++ won three championships and one runner-up in the NTIRE 2021 Video Restoration and Enhancement Challenge.
+
+
+
+```
+ppgan.apps.BasiVSRPlusPlusPredictor(output='output', weight_path=None, num_frames=10)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+- `num_frames (10, Optional)`: the number of video frames input at a time. Default: `10`.
+
+
+### Video super-resolution model -- PPMSVSRPredictor & PPMSVSRLargePredictor
+PP-MSVSR proposes a local fusion module, an auxiliary loss, and a re-align module to progressively refine the enhanced result. PPMSVSRLargePredictor is a higher-accuracy variant built with more basic blocks.
+
+
+
+```
+ppgan.apps.PPMSVSRPredictor(output='output', weight_path=None, num_frames=10)
+ppgan.apps.PPMSVSRLargePredictor(output='output', weight_path=None, num_frames=10)
+```
+#### Parameters
+
+- `output (str, Optional)`: path of your output, default: `output`.
+- `weight_path (None, Optional)`: path of your model weight. If it is not set, the default weight will be downloaded from the cloud to the local. Default: `None`.
+- `num_frames (10, Optional)`: the number of video frames input at a time. Default: `10`.
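+
+A minimal usage sketch for the PP-MSVSR predictors (the video path is only an example; `PPMSVSRLargePredictor` is called in the same way):
+
+```
+from ppgan.apps import PPMSVSRPredictor
+
+sr = PPMSVSRPredictor(output='output', num_frames=10)
+sr.run("docs/imgs/test.mp4")  # example path, replace with your own video
+```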
diff --git a/docs/en_US/tutorials/video_super_resolution.md b/docs/en_US/tutorials/video_super_resolution.md
new file mode 100644
index 0000000000000000000000000000000000000000..099c2873fda7b0f7ce1f995ae214e2ee4df10f6d
--- /dev/null
+++ b/docs/en_US/tutorials/video_super_resolution.md
@@ -0,0 +1,227 @@
+
+# 1 Video Super Resolution (VSR)
+
+## 1.1 Principle
+
+ Video super-resolution originates from image super-resolution, which aims to recover high-resolution (HR) images from one or more low-resolution (LR) images. The difference between them is that a video is composed of multiple frames, so video super-resolution usually exploits the information between frames for restoration. Here we provide the video super-resolution models [EDVR](https://arxiv.org/pdf/1905.02716.pdf), [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf), [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf), and PP-MSVSR.
+
+### 🔥 PP-MSVSR 🔥
+ [PP-MSVSR](https://arxiv.org/pdf/2112.02828.pdf) is a multi-stage VSR deep architecture with a local fusion module, an auxiliary loss, and a refined align module to progressively refine the enhanced result. Specifically, in order to strengthen the fusion of features across frames during feature propagation, a local fusion module is designed in stage-1 to perform local feature fusion before propagation. Moreover, an auxiliary loss is introduced in stage-2 to make the features obtained by the propagation module preserve more information correlated with the HR space, and a refined align module is introduced in stage-3 to make full use of the feature information from the previous stage. Extensive experiments substantiate that PP-MSVSR achieves promising performance on the Vid4 dataset, reaching a PSNR of 28.13 dB with only 1.45M parameters.
+
+ Additionally, [PP-MSVSR](https://arxiv.org/pdf/2112.02828.pdf) provides two different models with 1.45M and 7.4M parameters in order to satisfy different requirements.
+
+### EDVR
+ [EDVR](https://arxiv.org/pdf/1905.02716.pdf) won the championship in all four tracks of the NTIRE19 video restoration and enhancement challenges, outperforming the second place by a large margin. The main difficulties of video super-resolution arise from two aspects: (1) how to align multiple frames given large motions, and (2) how to effectively fuse different frames with diverse motion and blur. First, to handle large motions, EDVR devises a Pyramid, Cascading and Deformable (PCD) alignment module, in which frame alignment is done at the feature level using deformable convolutions in a coarse-to-fine manner. Second, EDVR proposes a Temporal and Spatial Attention (TSA) fusion module, in which attention is applied both temporally and spatially to emphasize important features for subsequent restoration.
+
+ [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf) reconsiders the most essential components of VSR guided by four basic functionalities, i.e., propagation, alignment, aggregation, and upsampling. By reusing existing components with minimal redesigns, the succinct BasicVSR pipeline achieves appealing improvements in speed and restoration quality compared with many state-of-the-art algorithms. By adding an information-refill mechanism and a coupled propagation scheme to facilitate information aggregation, BasicVSR can be extended to [IconVSR](https://arxiv.org/pdf/2012.02181.pdf), which can serve as a strong baseline for future VSR approaches.
+
+ [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf) redesigns BasicVSR by proposing second-order grid propagation and flow-guided deformable alignment. By empowering the recurrent framework with enhanced propagation and alignment, BasicVSR++ can exploit spatiotemporal information across misaligned video frames more effectively. The new components lead to improved performance under a similar computational constraint. In particular, BasicVSR++ surpasses BasicVSR by 0.82 dB in PSNR with a similar number of parameters. In NTIRE 2021, BasicVSR++ obtained three champions and one runner-up in the Video Super-Resolution and Compressed Video Enhancement Challenges.
+
+
+
+## 1.2 How to use
+
+### 1.2.1 Prepare Datasets
+ Here are 4 commonly used video super-resolution datasets: REDS, Vimeo90K, Vid4, and UDM10. The REDS and Vimeo90K datasets include both training and test sets, while Vid4 and UDM10 are test-only datasets. Download and decompress the required dataset and place it under ``PaddleGAN/data``.
+
+ REDS ([download](https://seungjunnah.github.io/Datasets/reds.html)) is a newly proposed high-quality (720p) video dataset used in the NTIRE19 competition. REDS consists of 240 training clips, 30 validation clips and 30 testing clips (each with 100 consecutive frames). Since the test ground truth is not available, we select four representative clips (namely '000', '011', '015', and '020', with diverse scenes and motions) as our test set, denoted REDS4. The remaining training and validation clips are re-grouped as our training dataset (a total of 266 clips).
+
+ The structure of the processed REDS is as follows:
+ ```
+ PaddleGAN
+ ├── data
+ ├── REDS
+ ├── train_sharp
+ | └──X4
+ ├── train_sharp_bicubic
+ | └──X4
+ ├── REDS4_test_sharp
+ | └──X4
+ └── REDS4_test_sharp_bicubic
+ └──X4
+ ...
+ ```
+
+ Vimeo90K ([download](http://toflow.csail.mit.edu/)) was designed by Tianfan Xue et al. for four video processing tasks: temporal frame interpolation, video denoising, video deblocking, and video super-resolution. Vimeo90K is a large-scale, high-quality video dataset consisting of 89,800 video clips downloaded from vimeo.com, which cover a large variety of scenes and actions.
+
+ The structure of the processed Vimeo90K is as follows:
+ ```
+ PaddleGAN
+ ├── data
+ ├── Vimeo90K
+ ├── vimeo_septuplet
+ | |──sequences
+ | └──sep_trainlist.txt
+ ├── vimeo_septuplet_BD_matlabLRx4
+ | └──sequences
+ └── vimeo_super_resolution_test
+ |──low_resolution
+ |──target
+ └──sep_testlist.txt
+ ...
+ ```
+
+ Vid4 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip)) is a commonly used test dataset for VSR, which contains 4 video segments.
+ The structure of the processed Vid4 is as follows:
+ ```
+ PaddleGAN
+ ├── data
+ ├── Vid4
+ ├── BDx4
+ └── GT
+ ...
+ ```
+
+ UDM10 ([Data Download](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar)) is a commonly used test dataset for VSR, which contains 10 video segments.
+ The structure of the processed UDM10 is as follows:
+ ```
+ PaddleGAN
+ ├── data
+ ├── udm10
+ ├── BDx4
+ └── GT
+ ...
+ ```
+
+
+
+### 1.2.2 Train/Test
+
+ According to the number of channels, EDVR is divided into EDVR_L (128 channels) and EDVR_M (64 channels). Taking EDVR_M as an example, model training and testing are introduced below.
+
+ The training of EDVR is generally divided into two stages. First, train EDVR without the TSA module.
+
+ The commands to train and test EDVR without the TSA module are as follows:
+
+ Train a model:
+ ```
+ python -u tools/main.py --config-file configs/edvr_m_wo_tsa.yaml
+ ```
+
+ Test the model:
+ ```
+ python tools/main.py --config-file configs/edvr_m_wo_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT_WITHOUT_TSA}
+ ```
+
+ Then the weights of EDVR without the TSA module are used to initialize the full EDVR model for training.
+
+ The commands to train and test the full EDVR model are as follows:
+
+ Train a model:
+ ```
+ python -u tools/main.py --config-file configs/edvr_m_w_tsa.yaml --load ${PATH_OF_WEIGHT_WITHOUT_TSA}
+ ```
+
+ Test the model:
+ ```
+ python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+ To train or test other VSR models, find the config file of the corresponding model in ``PaddleGAN/configs`` and replace the config file in the commands above with it.
+
+### 1.2.3 Model export
+
+Take the msvsr model as an example: ``inputs_size`` is the input size, ``model_name`` is the name of the exported model, and ``model_path`` is the path of the model weights.
+```
+python tools/export_model.py -c configs/msvsr_reds.yaml --inputs_size="1,2,3,180,320" --model_name inference --load model_path
+```
+
+### 1.2.4 Model inference
+Take the msvsr model as an example.
+```
+python tools/inference.py --model_type msvsr -c configs/msvsr_reds.yaml --output_path output_dir
+```
+
+## 1.3 Results
+The experimental results are evaluated on the RGB channels.
+
+The metrics are PSNR / SSIM.
+
+VSR quantitative comparison on the REDS4 test dataset (a subset of the REDS dataset)
+| Method | Parameters (M) | FLOPs (G) | REDS4 (PSNR / SSIM) |
+|---|---|---|---|
+| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 |
+| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 |
+| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 |
+| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 |
+| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 |
+| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 |
+| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 |
+| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 |
+| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 |
+
+Deblur quantitative comparison on the REDS4 test dataset (a subset of the REDS dataset)
+| Method | REDS4 |
+|---|---|
+| EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 |
+| EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 |
+
+VSR quantitative comparison on Vimeo90K, Vid4, and UDM10
+| Model | Vimeo90K | Vid4 | UDM10 |
+|---|---|---|---|
+| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699|
+
+## 1.4 Model Download
+| Method | Dataset | Download Link |
+|---|---|---|
+| EDVR_M_wo_tsa_SRx4 | REDS | [EDVR_M_wo_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_M_wo_tsa_SRx4.pdparams)
+| EDVR_M_w_tsa_SRx4 | REDS | [EDVR_M_w_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_M_w_tsa_SRx4.pdparams)
+| EDVR_L_wo_tsa_SRx4 | REDS | [EDVR_L_wo_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_L_wo_tsa_SRx4.pdparams)
+| EDVR_L_w_tsa_SRx4 | REDS | [EDVR_L_w_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_L_w_tsa_SRx4.pdparams)
+| EDVR_L_wo_tsa_deblur | REDS | [EDVR_L_wo_tsa_deblur](https://paddlegan.bj.bcebos.com/models/EDVR_L_wo_tsa_deblur.pdparams)
+| EDVR_L_w_tsa_deblur | REDS | [EDVR_L_w_tsa_deblur](https://paddlegan.bj.bcebos.com/models/EDVR_L_w_tsa_deblur.pdparams)
+| BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams)
+| IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams)
+| BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams)
+| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams)
+| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams)
+| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams)
+
+
+
+# References
+
+- 1. [EDVR: Video Restoration with Enhanced Deformable Convolutional Networks](https://arxiv.org/pdf/1905.02716.pdf)
+
+ ```
+ @InProceedings{wang2019edvr,
+ author = {Wang, Xintao and Chan, Kelvin C.K. and Yu, Ke and Dong, Chao and Loy, Chen Change},
+ title = {EDVR: Video Restoration with Enhanced Deformable Convolutional Networks},
+ booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
+ month = {June},
+ year = {2019}
+ }
+ ```
+
+- 2. [BasicVSR: The Search for Essential Components in Video Super-Resolution and Beyond](https://arxiv.org/pdf/2012.02181.pdf)
+
+```
+ @InProceedings{chan2021basicvsr,
+ author = {Chan, Kelvin C.K. and Wang, Xintao and Yu, Ke and Dong, Chao and Loy, Chen Change},
+ title = {BasicVSR: The Search for Essential Components in Video Super-Resolution and Beyond},
+ booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
+ year = {2021}
+ }
+```
+
+- 3. [BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment](https://arxiv.org/pdf/2104.13371v1.pdf)
+
+ ```
+ @article{chan2021basicvsr++,
+ author = {Chan, Kelvin C.K. and Zhou, Shangchen and Xu, Xiangyu and Loy, Chen Change},
+ title = {BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment},
+ journal = {arXiv preprint arXiv:2104.13371},
+ year = {2021}
+ }
+ ```
+
+- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution](https://arxiv.org/pdf/2112.02828.pdf)
+
+ ```
+ @article{jiang2021PP-MSVSR,
+ author = {Jiang, Lielin and Wang, Na and Dang, Qingqing and Liu, Rui and Lai, Baohua},
+ title = {PP-MSVSR: Multi-Stage Video Super-Resolution},
+ journal = {arXiv preprint arXiv:2112.02828},
+ year = {2021}
+ }
+ ```
diff --git a/docs/en_US/tutorials/wav2lip.md b/docs/en_US/tutorials/wav2lip.md
index de2b77ad79f385f79eaf6ceb4abd453e11681175..4411b54a9d2e9affec88a6a87a2c4fbb8d9fa2e4 100644
--- a/docs/en_US/tutorials/wav2lip.md
+++ b/docs/en_US/tutorials/wav2lip.md
@@ -11,13 +11,19 @@ Runing the following command to complete the lip-syning task. The output is the
```
cd applications
-python tools/wav2lip.py --face ../../imgs/mona7s.mp4 --audio ../../imgs/guangquan.m4a --outfile pp_guangquan_mona7s.mp4
+python tools/wav2lip.py \
+ --face ../docs/imgs/mona7s.mp4 \
+ --audio ../docs/imgs/guangquan.m4a \
+ --outfile pp_guangquan_mona7s.mp4 \
+ --face_enhancement
```
**params:**
- face: path of the input image or video file including faces.
- audio: path of the input audio file, format can be `.wav`, `.mp3`, `.m4a`. It can be any file supported by `FFMPEG` containing audio data.
+- outfile: path of the output video generated by wav2lip.
+- face_enhancement: whether to enhance the face in the result, default: False.
### 2.2 Training
1. Our model are trained on LRS2. See [here](https://github.com/Rudrabha/Wav2Lip#training-on-datasets-other-than-lrs2) for a few suggestions regarding training on other datasets.
@@ -36,14 +42,13 @@ Place the LRS2 filelists(train, val, test) `.txt` files in the `filelists/` fold
- For single GPU:
```
export CUDA_VISIBLE_DEVICES=0
-python tools/main.py --confit-file configs/wav2lip.yaml
+python tools/main.py --config-file configs/wav2lip.yaml
```
- For multiple GPUs:
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch \
- --log_dir ./mylog_dd.log \
tools/main.py \
--config-file configs/wav2lip.yaml \
@@ -52,13 +57,12 @@ For the latter, run:
- For single GPU:
```
export CUDA_VISIBLE_DEVICES=0
-python tools/main.py --confit-file configs/wav2lip_hq.yaml
+python tools/main.py --config-file configs/wav2lip_hq.yaml
```
- For multiple GPUs:
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch \
- --log_dir ./mylog_dd.log \
tools/main.py \
--config-file configs/wav2lip_hq.yaml \
diff --git a/docs/imgs/RSSR.png b/docs/imgs/RSSR.png
new file mode 100644
index 0000000000000000000000000000000000000000..dc4147740a3c19c32145c3def401f688395aec59
Binary files /dev/null and b/docs/imgs/RSSR.png differ
diff --git a/docs/imgs/animeganv2.png b/docs/imgs/animeganv2.png
deleted file mode 100644
index a4ec4392ae7794488f1e72babe1a9706947e7f8f..0000000000000000000000000000000000000000
Binary files a/docs/imgs/animeganv2.png and /dev/null differ
diff --git a/docs/imgs/color_sr_peking.gif b/docs/imgs/color_sr_peking.gif
deleted file mode 100644
index 2e9e4428599732a3e613b8c68cb8c9826c593b50..0000000000000000000000000000000000000000
Binary files a/docs/imgs/color_sr_peking.gif and /dev/null differ
diff --git a/docs/imgs/dain_network.png b/docs/imgs/dain_network.png
deleted file mode 100644
index 0af436858a8f65070c48cbafd6279642d2f900b1..0000000000000000000000000000000000000000
Binary files a/docs/imgs/dain_network.png and /dev/null differ
diff --git a/docs/imgs/deoldify_network.png b/docs/imgs/deoldify_network.png
deleted file mode 100644
index dc73a188cdf134ad3679aa5d1f9fb64d07e46f02..0000000000000000000000000000000000000000
Binary files a/docs/imgs/deoldify_network.png and /dev/null differ
diff --git a/docs/imgs/edvr_network.png b/docs/imgs/edvr_network.png
deleted file mode 100644
index 5843aab9b1a46a7847420eebac36d6f07e64d7f4..0000000000000000000000000000000000000000
Binary files a/docs/imgs/edvr_network.png and /dev/null differ
diff --git a/docs/imgs/father_23.jpg b/docs/imgs/father_23.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5f9422b4b79eb45d462867eb694ff7d02c39772d
Binary files /dev/null and b/docs/imgs/father_23.jpg differ
diff --git a/docs/imgs/first_order.gif b/docs/imgs/first_order.gif
deleted file mode 100644
index 9b6b609939f4e9e71ffe2afdb2b9f68ad0585c16..0000000000000000000000000000000000000000
Binary files a/docs/imgs/first_order.gif and /dev/null differ
diff --git a/docs/imgs/fom_512_vs_256.png b/docs/imgs/fom_512_vs_256.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d449a079a708bf2d3aec787f5a97a228c2015d7
Binary files /dev/null and b/docs/imgs/fom_512_vs_256.png differ
diff --git a/docs/imgs/fom_source_image_multi_person.jpg b/docs/imgs/fom_source_image_multi_person.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1799a0cb9935653a6b7ba26d0559bd62bf0172db
Binary files /dev/null and b/docs/imgs/fom_source_image_multi_person.jpg differ
diff --git a/docs/imgs/horse2zebra.gif b/docs/imgs/horse2zebra.gif
deleted file mode 100644
index 93dc63be84c7502dc0d2bd76570858f9e8a7b278..0000000000000000000000000000000000000000
Binary files a/docs/imgs/horse2zebra.gif and /dev/null differ
diff --git a/docs/imgs/makeup_shifter.png b/docs/imgs/makeup_shifter.png
deleted file mode 100644
index 5f11a7d352ec4c6fe2a04c851c42d7508fb5f53b..0000000000000000000000000000000000000000
Binary files a/docs/imgs/makeup_shifter.png and /dev/null differ
diff --git a/docs/imgs/mayiyahei.MP4 b/docs/imgs/mayiyahei.MP4
new file mode 100644
index 0000000000000000000000000000000000000000..1092c616e0ba7b3c793f67d5c4333f0585f1f95e
Binary files /dev/null and b/docs/imgs/mayiyahei.MP4 differ
diff --git a/docs/imgs/mona.gif b/docs/imgs/mona.gif
deleted file mode 100644
index 9f3d4af05edce0920212784011642fe8348246dc..0000000000000000000000000000000000000000
Binary files a/docs/imgs/mona.gif and /dev/null differ
diff --git a/docs/imgs/photo2cartoon.png b/docs/imgs/photo2cartoon.png
deleted file mode 100644
index d620e42012de772bcadd4b8231c842abb225167e..0000000000000000000000000000000000000000
Binary files a/docs/imgs/photo2cartoon.png and /dev/null differ
diff --git a/docs/imgs/realsr_network.png b/docs/imgs/realsr_network.png
deleted file mode 100644
index e41ff89f5cbe81b40dcade4e097a669e0d5593e6..0000000000000000000000000000000000000000
Binary files a/docs/imgs/realsr_network.png and /dev/null differ
diff --git a/docs/imgs/remaster_network.png b/docs/imgs/remaster_network.png
deleted file mode 100644
index 9ddc26a5d411e78e0b1f69a363a86fc27877bc16..0000000000000000000000000000000000000000
Binary files a/docs/imgs/remaster_network.png and /dev/null differ
diff --git a/docs/imgs/sr_demo.png b/docs/imgs/sr_demo.png
deleted file mode 100644
index 885ddb308bdf64a481122ec6ac3bca614a0dfa1f..0000000000000000000000000000000000000000
Binary files a/docs/imgs/sr_demo.png and /dev/null differ
diff --git a/docs/imgs/stylegan2editing-sample1.png b/docs/imgs/stylegan2editing-sample1.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f3c57bf450447156e39251c1ae532e17d2b247f
Binary files /dev/null and b/docs/imgs/stylegan2editing-sample1.png differ
diff --git a/docs/imgs/stylegan2editing-sample2.png b/docs/imgs/stylegan2editing-sample2.png
new file mode 100644
index 0000000000000000000000000000000000000000..bdcc168298fd9a82003580b5c4b61c742dc5140a
Binary files /dev/null and b/docs/imgs/stylegan2editing-sample2.png differ
diff --git a/docs/imgs/stylegan2fitting-sample.png b/docs/imgs/stylegan2fitting-sample.png
new file mode 100644
index 0000000000000000000000000000000000000000..92a7fad8777bb76c542536e2b3b4b3aa05eff844
Binary files /dev/null and b/docs/imgs/stylegan2fitting-sample.png differ
diff --git a/docs/imgs/stylegan2fitting-sample2.png b/docs/imgs/stylegan2fitting-sample2.png
new file mode 100644
index 0000000000000000000000000000000000000000..733990430821ea9081d9f816e10e2d089395737e
Binary files /dev/null and b/docs/imgs/stylegan2fitting-sample2.png differ
diff --git a/docs/imgs/stylegan2mixing-sample.png b/docs/imgs/stylegan2mixing-sample.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d15128f40ba48454a4934dc519374026e085c85
Binary files /dev/null and b/docs/imgs/stylegan2mixing-sample.png differ
diff --git a/docs/imgs/ugatit.png b/docs/imgs/ugatit.png
deleted file mode 100644
index a13d7847f0e6489675f08217c74511e5e760b8aa..0000000000000000000000000000000000000000
Binary files a/docs/imgs/ugatit.png and /dev/null differ
diff --git a/docs/zh_CN/apis/apps.md b/docs/zh_CN/apis/apps.md
index e2a7124a476b65ef63a6ed73902f1607eadf4a17..0c17d6589eb7908eca55df294c4fb728d521bb4c 100644
--- a/docs/zh_CN/apis/apps.md
+++ b/docs/zh_CN/apis/apps.md
@@ -1,49 +1,42 @@
-# Applications接口说明
+# 预测接口说明
-ppgan.apps包含超分、插针、上色、换妆、图像动画生成、人脸解析等应用,接口使用简洁,并内置了已训练好的模型,可以直接用来做应用。
+PaddleGAN(ppgan.apps)提供超分、插帧、上色、换妆、图像动画生成、人脸解析等多种应用的预测API接口。接口内置训练好的高性能模型,支持用户进行灵活高效的应用推理。
-* 超分:
- * [RealSR](#ppgan.apps.DeOldifyPredictor)
- * [EDVR](#ppgan.apps.EDVRPredictor)
* 上色:
- * [DeOldify](#ppgan.apps.DeOldifyPredictor)
- * [DeepRemaster](#ppgan.apps.DeepRemasterPredictor)
+ * [DeOldify](#ppganappsDeOldifyPredictor)
+ * [DeepRemaster](#ppganappsDeepRemasterPredictor)
+* 超分:
+ * [RealSR](#ppganappsRealSRPredictor)
+ * [EDVR](#ppganappsEDVRPredictor)
* 插帧:
- * [DAIN](#ppgan.apps.DAINPredictor)
-* 图像工作驱动:
- * [FirstOrder](#ppgan.apps.FirstOrderPredictor)
+ * [DAIN](#ppganappsDAINPredictor)
+* 图像动作驱动:
+ * [FirstOrder](#ppganappsFirstOrderPredictor)
* 人脸:
- * [FaceFaceParse](#ppgan.apps.FaceParsePredictor)
+ * [FaceFaceParse](#ppganappsFaceParsePredictor)
* 动漫画:
- * [AnimeGAN](#ppgan.apps.AnimeGANPredictor)
+ * [AnimeGAN](#ppganappsAnimeGANPredictor)
* 唇形合成:
- * [Wav2Lip](#ppgan.apps.Wav2LipPredictor)
-
+ * [Wav2Lip](#ppganappsWav2LipPredictor)
## 公共用法
### CPU和GPU的切换
+默认情况下,如果是GPU设备、并且安装了[PaddlePaddle](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/windows-pip.html)的GPU环境包,则默认使用GPU进行推理。否则,如果安装的是CPU环境包,则使用CPU进行推理。
-默认情况下,如果是GPU设备、并且安装了PaddlePaddle的GPU环境包,则默认使用GPU进行推理。否则,如果安装的是CPU环境包,则使用CPU进行推理。如果需要手动切换CPU、GPU,可以通过以下方式:
+如果需要手动切换CPU、GPU,可以通过以下方式:
```
import paddle
-paddle.set_device('cpu')
-#paddle.set_device('gpu')
-
-# from ppgan.apps import DeOldifyPredictor
-# deoldify = DeOldifyPredictor()
-# deoldify.run("docs/imgs/test_old.jpeg")
+paddle.set_device('cpu') #设置为CPU
+#paddle.set_device('gpu') #设置为GPU
```
-
## ppgan.apps.DeOldifyPredictor
-
```python
ppgan.apps.DeOldifyPredictor(output='output', weight_path=None, render_factor=32)
```
-
-> 构建DeOldify实例。DeOldify是一个基于GAN的老照片上色模型。该接口可以对图片或视频做上色。建议视频使用mp4格式。
+> 构建DeOldify实例。DeOldify是一个基于GAN的影像上色模型。该接口支持对图片或视频上色。视频建议使用mp4格式。
>
> **示例**
>
@@ -52,57 +45,48 @@ ppgan.apps.DeOldifyPredictor(output='output', weight_path=None, render_factor=32
> deoldify = DeOldifyPredictor()
> deoldify.run("docs/imgs/test_old.jpeg")
> ```
-
> **参数**
>
-> > - output (str): 设置输出图片的保存路径,默认是output。注意,保存路径为设置output/DeOldify。
-> > - weight_path (str): 指定模型路径,默认是None,则会自动下载内置的已经训练好的模型。
-> > - artistic (bool): 是否使用偏"艺术性"的模型。"艺术性"的模型有可能产生一些有趣的颜色,但是毛刺比较多。
-> > - render_factor (int): 图片渲染上色时的缩放因子,图片会缩放到边长为16xrender_factor的正方形, 再上色,例如render_factor默认值为32,输入图片先缩放到(16x32=512) 512x512大小的图片。通常来说,render_factor越小,计算速度越快,颜色看起来也更鲜活。较旧和较低质量的图像通常会因降低渲染因子而受益。渲染因子越高,图像质量越好,但颜色可能会稍微褪色。
-
+> - output (str): 设置输出图片的保存路径,默认是output。注意,保存路径为设置output/DeOldify。
+> - weight_path (str): 指定模型路径,默认是None,则会自动下载内置的已经训练好的模型。
+> - artistic (bool): 是否使用偏"艺术性"的模型。"艺术性"的模型有可能产生一些有趣的颜色,但是毛刺比较多。
+> - render_factor (int): 图片渲染上色时的缩放因子,图片会缩放到边长为16xrender_factor的正方形, 再上色,例如render_factor默认值为32,输入图片先缩放到(16x32=512) 512x512大小的图片。通常来说,render_factor越小,计算速度越快,颜色看起来也更鲜活。较旧和较低质量的图像通常会因降低渲染因子而受益。渲染因子越高,图像质量越好,但颜色可能会稍微褪色。
### run
-
```python
run(input)
```
-
> 构建实例后的执行接口。
-
-> **参数**
->
-> > - input (str|np.ndarray|Image.Image): 输入的图片或视频文件。如果是图片,可以是图片的路径、np.ndarray、或PIL.Image类型。如果是视频,只能是视频文件路径。
-> >
>
-> **返回值**
+> **参数**
>
-> > - tuple(pred_img(np.array), out_paht(str)): 当属输入时图片时,返回预测后的图片,类型PIL.Image,以及图片的保存的路径。
-> > - tuple(frame_path(str), out_path(str)): 当输入为视频时,frame_path为视频每帧上色后保存的图片路径,out_path为上色后视频的保存路径。
+> >- input (str|np.ndarray|Image.Image): 输入的图片或视频文件。如果是图片,可以是图片的路径、np.ndarray、或PIL.Image类型。如果是视频,只能是视频文件路径。
+>
+>**返回值**
+>
+>> - tuple(pred_img(np.array), out_path(str)): 当输入为图片时,返回预测后的图片(类型为PIL.Image)以及图片的保存路径。
+> > - tuple(frame_path(str), out_path(str)): 当输入为视频时,frame_path为视频每帧上色后保存的图片路径,out_path为上色后视频的保存路径。
### run_image
-
```python
run_image(img)
```
-
> 图片上色的接口。
> **参数**
>
> > - img (str|np.ndarray|Image.Image): 输入图片,可以是图片的路径、np.ndarray、或PIL.Image类型。
-> >
->
-> **返回值**
->
-> > - pred_img(PIL.Image): 返回预测后的图片,为PIL.Image类型。
+>
+>**返回值**
+>
+>> - pred_img(PIL.Image): 返回预测后的图片,为PIL.Image类型。
### run_video
```python
run_video(video)
```
-
> 视频上色的接口。
-
+>
> **参数**
>
> > - Video (str): 输入视频文件的路径。
@@ -110,16 +94,11 @@ run_video(video)
> **返回值**
>
> > - tuple(frame_path(str), out_path(str)): frame_path为视频每帧上色后保存的图片路径,out_path为上色后视频的保存路径。
-
-
-
## ppgan.apps.DeepRemasterPredictor
-
```python
ppgan.apps.DeepRemasterPredictor(output='output', weight_path=None, colorization=False, reference_dir=None, mindim=360)
```
-
-> 构建DeepRemasterPredictor实例。DeepRemaster是一个基于GAN的老照片/视频修复、上色模型,该模型可以提供一个参考色的图片作为输入。该接口目前只支持视频输入,建议使用mp4格式。
+> 构建DeepRemasterPredictor实例。DeepRemaster是一个基于GAN的视频上色、修复模型,该模型可以提供一个参考色的图片作为输入。该接口目前只支持视频输入,建议使用mp4格式。
>
> **示例**
>
@@ -130,7 +109,6 @@ ppgan.apps.DeepRemasterPredictor(output='output', weight_path=None, colorization
> ```
>
>
-
> **参数**
>
> > - output (str): 设置输出图片的保存路径,默认是output。注意,保存路径为设置output/DeepRemaster。
@@ -138,15 +116,12 @@ ppgan.apps.DeepRemasterPredictor(output='output', weight_path=None, colorization
> > - colorization (bool): 是否打开上色功能,默认是False,既不打开,只执行修复功能。
> > - reference_dir(str|None): 打开上色功能时,输入参考色图片路径,也可以不设置参考色图片。
> > - mindim(int): 预测前图片会进行缩放,最小边长度。
-
### run
-
```python
run(video_path)
```
-
> 构建实例后的执行接口。
-
+>
> **参数**
>
> > - video_path (str): 输入视频文件路径。
@@ -154,54 +129,45 @@ run(video_path)
> > 返回值
> >
> > - tuple(str, str)): 返回两个str类型,前者是视频上色后每帧图片的保存路径,后者是上色之后的视频保存路径。
-
-
-
## ppgan.apps.RealSRPredictor
-
```python
ppgan.apps.RealSRPredictor(output='output', weight_path=None)
```
> 构建RealSR实例。RealSR: Real-World Super-Resolution via Kernel Estimation and Noise Injection发表于CVPR 2020 Workshops的基于真实世界图像训练的超分辨率模型。此接口对输入图片或视频做4倍的超分辨率。建议视频使用mp4格式。
>
+> *注意:RealSR的输入图片尺寸需小于1000x1000pix。
+>
> **用例**
>
> ```
-> from ppgan.apps import RealSRPredictor
+> from ppgan.apps import RealSRPredictor
> sr = RealSRPredictor()
> sr.run("docs/imgs/test_sr.jpeg")
> ```
-
> **参数**
>
> > - output (str): 设置输出图片的保存路径,默认是output。注意,保存路径为设置output/RealSR。
> > - weight_path (str): 指定模型路径,默认是None,则会自动下载内置的已经训练好的模型。
-
```python
run(video_path)
```
-
> 构建实例后的执行接口。
-
+>
> **参数**
>
> > - video_path (str): 输入视频文件路径。
-> >
->
-> **返回值**
->
-> > - tuple(pred_img(np.array), out_paht(str)): 当属输入时图片时,返回预测后的图片,类型PIL.Image,以及图片的保存的路径。
+>
+>**返回值**
+>
+>> - tuple(pred_img(np.array), out_path(str)): 当输入为图片时,返回预测后的图片(类型为PIL.Image)以及图片的保存路径。
> > - tuple(frame_path(str), out_path(str)): 当输入为视频时,frame_path为超分后视频每帧图片的保存路径,out_path为超分后的视频保存路径。
-
### run_image
-
```python
run_image(img)
```
-
> 图片超分的接口。
-
+>
> **参数**
>
> > - img (str|np.ndarray|Image.Image): 输入图片,可以是图片的路径、np.ndarray、或PIL.Image类型。
@@ -209,15 +175,12 @@ run_image(img)
> **返回值**
>
> > - pred_img(PIL.Image): 返回预测后的图片,为PIL.Image类型。
-
### run_video
-
```python
run_video(video)
```
-
> 视频超分的接口。
-
+>
> **参数**
>
> > - Video (str): 输入视频文件的路径。
@@ -225,37 +188,38 @@ run_video(video)
> **返回值**
>
> > - tuple(frame_path(str), out_path(str)): frame_path为超分后视频每帧图片的保存路径,out_path为超分后的视频保存路径。
-
-
-
## ppgan.apps.EDVRPredictor
-
```python
ppgan.apps.EDVRPredictor(output='output', weight_path=None)
```
> 构建RealSR实例。EDVR: Video Restoration with Enhanced Deformable Convolutional Networks,论文链接: https://arxiv.org/abs/1905.02716 ,是一个针对视频超分的模型。该接口,对视频做2倍的超分。建议视频使用mp4格式。
>
+> *注意:目前该接口仅支持在静态图下使用,需在使用前添加如下代码开启静态图:
+>
+> ```
+> import paddle
+> paddle.enable_static() #开启静态图
+> paddle.disable_static() #关闭静态图
+> ```
+>
> **示例**
>
> ```
-> from ppgan.apps import EDVRPredictor
+> from ppgan.apps import EDVRPredictor
> sr = EDVRPredictor()
> # 测试一个视频文件
> sr.run("docs/imgs/test.mp4")
> ```
-
> **参数**
>
> > - output (str): 设置输出图片的保存路径,默认是output。注意,保存路径为设置output/EDVR。
> > - weight_path (str): 指定模型路径,默认是None,则会自动下载内置的已经训练好的模型。
-
```python
run(video_path)
```
-
> 构建实例后的执行接口。
-
+>
> **参数**
>
> > - video_path (str): 输入视频文件路径。
@@ -263,26 +227,29 @@ run(video_path)
> **返回值**
>
> > - tuple(str, str): 前者超分后的视频每帧图片的保存路径,后者为做完超分的视频路径。
-
-
-
## ppgan.apps.DAINPredictor
-
```python
ppgan.apps.DAINPredictor(output='output', weight_path=None,time_step=None, use_gpu=True, key_frame_thread=0,remove_duplicates=False)
```
> 构建插帧DAIN模型的实例。DAIN: Depth-Aware Video Frame Interpolation,论文链接: https://arxiv.org/abs/1904.00830 ,对视频做插帧,获得帧率更高的视频。
>
+> *注意:目前该接口仅支持在静态图下使用,需在使用前添加如下代码开启静态图:
+>
+> ```
+> import paddle
+> paddle.enable_static() #开启静态图
+> paddle.disable_static() #关闭静态图
+> ```
+>
> **示例**
>
> ```
> from ppgan.apps import DAINPredictor
-> dain = DAINPredictor()
-> # 测试一个视频文件
+> dain = DAINPredictor(time_step=0.5)
+> #目前 time_step 无默认值,需手动指定,测试一个视频文件
> dain.run("docs/imgs/test.mp4")
> ```
-
> **参数**
>
> > - output_path (str): 设置预测输出的保存路径,默认是output。注意,保存路径为设置output/DAIN。
@@ -290,30 +257,26 @@ ppgan.apps.DAINPredictor(output='output', weight_path=None,time_step=None, use
> > - time_step (float): 帧率变化的倍数为 1./time_step,例如,如果time_step为0.5,则2倍插针,为0.25,则为4倍插帧。
> > - use_gpu (bool): 是否使用GPU做预测,默认是True。
> > - remove_duplicates (bool): 是否去除重复帧,默认是False。
-
```python
run(video_path)
```
-
> 构建实例后的执行接口。
-
+>
> **参数**
>
> > - video_path (str): 输入视频文件路径。
>
> **返回值**
+
>
> > - tuple(str, str): 当输入为视频时,frame_path为视频每帧上色后保存的图片路径,out_path为上色后视频的保存路径。
-
-
-
## ppgan.apps.FirstOrderPredictor
-
```python
ppgan.apps.FirstOrderPredictor(output='output', weight_path=None,config=None, relative=False, adapt_scale=False,find_best_frame=False, best_frame=None)
```
-
-> 构建FirsrOrder模型的实例,此模型用来做Image Animation,即给定一张源图片和一个驱动视频,生成一段视频,其中主体是源图片,动作是驱动视频中的动作。论文是First Order Motion Model for Image Animation,论文链接: https://arxiv.org/abs/2003.00196 。
+> 构建FirsrOrder模型的实例,此模型用来做Image Animation,即给定一张源图片和一个驱动视频,生成一段视频,其中主体是源图片,动作是驱动视频中的动作。
+>
+> 论文是First Order Motion Model for Image Animation,论文链接: https://arxiv.org/abs/2003.00196 。
>
> **示例**
>
@@ -323,7 +286,6 @@ ppgan.apps.FirstOrderPredictor(output='output', weight_path=None,config=None,
> # 测试一个视频文件
> animate.run("source.png","driving.mp4")
> ```
-
> **参数**
>
> > - output_path (str): 设置预测输出的保存路径,默认是output。注意,保存路径为设置output/result.mp4。
@@ -333,13 +295,11 @@ ppgan.apps.FirstOrderPredictor(output='output', weight_path=None,config=None,
> > - adapt_scale (bool): 是否基于关键点凸包的自适应运动,默认是False。
> > - find_best_frame (bool): 是否从与源图片最匹配的帧开始生成,仅仅适用于人脸应用,需要人脸对齐的库。
> > - best_frame (int): 设置起始帧数,默认是None,从第1帧开始(从1开始计数)。
-
```python
run(source_image,driving_video)
```
-
> 构建实例后的执行接口,预测视频保存位置为output/result.mp4。
-
+>
> **参数**
>
> > - source_image (str): 输入源图片。
@@ -348,18 +308,26 @@ run(source_image,driving_video)
> **返回值**
>
> > 无。
-
## ppgan.apps.FaceParsePredictor
-
```pyhton
ppgan.apps.FaceParsePredictor(output_path='output')
```
-> 构建人脸解析模型实例,此模型用来做人脸解析, 即给定一个输入的人脸图像,人脸解析将为每个语义成分(如头发、嘴唇、鼻子、耳朵等)分配一个像素级标签。我们用BiseNet来完成这项任务。论文是 BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation, 论文链接: https://arxiv.org/abs/1808.00897v1.
-
+> 构建人脸解析模型实例,此模型用来做人脸解析, 即给定一个输入的人脸图像,人脸解析将为每个语义成分(如头发、嘴唇、鼻子、耳朵等)分配一个像素级标签。我们用BiseNet来完成这项任务。
+>
+> 论文是 BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation, 论文链接: https://arxiv.org/abs/1808.00897v1.
+>
+> *注意:此接口需要dlib包,使用前需用以下代码安装:
+>
+> ```
+> pip install dlib
+> ```
+> Windows下安装此包时间可能过长,请耐心等待。
+>
> **参数:**
>
> > - input_image: 输入待解析的图片文件路径
-
+> > - output_path:输出保存的路径
+>
> **示例:**
>
> ```
@@ -368,6 +336,7 @@ ppgan.apps.FaceParsePredictor(output_path='output')
> parser.run('docs/imgs/face.png')
> ```
> **返回值:**
+>
> > - mask(numpy.ndarray): 返回解析完成的人脸成分mask矩阵, 数据类型为numpy.ndarray
## ppgan.apps.AnimeGANPredictor
@@ -375,12 +344,14 @@ ppgan.apps.FaceParsePredictor(output_path='output')
```pyhton
ppgan.apps.AnimeGANPredictor(output_path='output_dir',weight_path=None,use_adjust_brightness=True)
```
-> 利用animeganv2来对景物图像进行动漫风格化。论文是 AnimeGAN: A Novel Lightweight GAN for Photo Animation, 论文链接: https://link.springer.com/chapter/10.1007/978-981-15-5577-0_18.
-
+> 利用AnimeGAN v2来对景物图像进行动漫风格化。
+>
+> 论文是 AnimeGAN: A Novel Lightweight GAN for Photo Animation, 论文链接: https://link.springer.com/chapter/10.1007/978-981-15-5577-0_18.
+>
> **参数:**
>
> > - input_image: 输入待解析的图片文件路径
-
+>
> **示例:**
>
> ```
@@ -389,17 +360,16 @@ ppgan.apps.AnimeGANPredictor(output_path='output_dir',weight_path=None,use_adjus
> predictor.run('docs/imgs/animeganv2_test.jpg')
> ```
> **返回值:**
+>
> > - anime_image(numpy.ndarray): 返回风格化后的景色图像
-
## ppgan.apps.MiDaSPredictor
-
```pyhton
ppgan.apps.MiDaSPredictor(output=None, weight_path=None)
```
-
-> 单目深度估计模型MiDaSv2, 参考 https://github.com/intel-isl/MiDaS, 论文是 Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer , 论文链接: https://arxiv.org/abs/1907.01341v3
-
+> 单目深度估计模型MiDaSv2,参考 https://github.com/intel-isl/MiDaS 。单目深度估计是从单幅RGB图像中估计深度的方法。
+>
+> 论文是 Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer , 论文链接: https://arxiv.org/abs/1907.01341v3
> **示例**
>
> ```python
@@ -429,39 +399,36 @@ ppgan.apps.MiDaSPredictor(output=None, weight_path=None)
>
> > - output (str): 输出路径,如果是None,则不保存pfm和png的深度图文件。
> > - weight_path (str): 指定模型路径,默认是None,则会自动下载内置的已经训练好的模型。
-
+> >
> **返回值:**
+>
> > - prediction (numpy.ndarray): 返回预测结果。
> > - pfm_f (str): 如果设置output路径,返回pfm文件保存路径。
> > - png_f (str): 如果设置output路径,返回png文件保存路径。
-
-## ppgan.apps.Wav2lipPredictor
+## ppgan.apps.Wav2LipPredictor
```python
-ppgan.apps.FirstOrderPredictor(args)
+ppgan.apps.Wav2LipPredictor(face=None, audio_seq=None, outfile=None)
```
-
-> 构建Wav2lip模型的实例,此模型用来做唇形合成,即给定一个人物视频和一个音频,实现人物口型与输入语音同步。论文是A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild,论文链接: http://arxiv.org/abs/2008.10010.
+> 构建Wav2Lip模型的实例,此模型用来做唇形合成,即给定一个人物视频和一个音频,实现人物口型与输入语音同步。
+>
+> 论文是A Lip Sync Expert Is All You Need for Speech to Lip Generation In the Wild,论文链接: http://arxiv.org/abs/2008.10010.
>
> **示例**
>
> ```
> from ppgan.apps import Wav2LipPredictor
-> # The args parameter should be specified by argparse
-> predictor = Wav2LipPredictor(args)
-> predictor.run()
+> import ppgan
+> predictor = Wav2LipPredictor()
+> predictor.run('/home/aistudio/先烈.jpeg', '/home/aistudio/pp_guangquan_zhenzhu46s.mp4','wav2lip')
> ```
-
> **参数:**
+> - face (str): 指定的包含人物的图片或者视频的文件路径。
+> - audio_seq (str): 指定的输入音频的文件路径,它的格式可以是 `.wav`, `.mp3`, `.m4a`等,任何ffmpeg可以处理的文件格式都可以。
+> - outfile (str): 指定的输出视频文件路径。
-> - args(ArgumentParser): 参数包含所有的输入参数,用户在运行程序时需要通过argparse指定,主要的参数主要包含以下几项:`
-> > - checkpoint_path (str): 指定模型路径,默认是None,不指定则会自动下载内置的已经训练好的模型。
-> > - face (str): 指定的包含人物的图片或者视频的文件路径。
-> > - audio (str): 指定的输入音频的文件路径,它的格式可以是 `.wav`, `.mp3`, `.m4a`等,任何ffmpeg可以处理的文件格式都可以。
-> > - outfile (str): 指定的输出视频文件路径。
+>**返回值**
+>
+>> 无。
->
-> **返回值**
->
-> > 无。
diff --git a/docs/zh_CN/config_doc.md b/docs/zh_CN/config_doc.md
new file mode 100644
index 0000000000000000000000000000000000000000..3f8eded3dd260421e79bf093f616a95b7453dbd6
--- /dev/null
+++ b/docs/zh_CN/config_doc.md
@@ -0,0 +1,77 @@
+# 配置文件说明文档
+
+## Config文件参数介绍
+
+以`lapstyle_rev_first.yaml`为例。
+
+### Global
+
+| 字段 | 用途 | 默认值 |
+| ------------------------- | -------------------------- | --------------- |
+| total_iters | 设置总训练步数 | 30000 |
+| min_max | tensor数值范围(存图像时使用) | (0., 1.) |
+| output_dir | 设置输出结果所在的文件路径 | ./output_dir |
+| snapshot_config: interval | 设置保存模型参数的间隔 | 5000 |
+
+### Model
+
+| 字段 | 用途 | 默认值 |
+| :---------------------- | -------- | ------ |
+| name | 模型名称 | LapStyleRevFirstModel |
+| revnet_generator | 设置revnet生成器 | RevisionNet |
+| revnet_discriminator | 设置revnet判别器 | LapStyleDiscriminator |
+| draftnet_encode | 设置draftnet编码器 | Encoder |
+| draftnet_decode | 设置draftnet解码器 | DecoderNet |
+| calc_style_emd_loss | 设置style损失1 | CalcStyleEmdLoss |
+| calc_content_relt_loss | 设置content损失1 | CalcContentReltLoss |
+| calc_content_loss | 设置content损失2 | CalcContentLoss |
+| calc_style_loss | 设置style损失2 | CalcStyleLoss |
+| gan_criterion: name | 设置GAN损失 | GANLoss |
+| gan_criterion: gan_mode | 设置GAN损失模态参数 | vanilla |
+| content_layers | 设置计算content损失2的网络层 |['r11', 'r21', 'r31', 'r41', 'r51']|
+| style_layers | 设置计算style损失2的网络层 | ['r11', 'r21', 'r31', 'r41', 'r51'] |
+| content_weight | 设置content总损失权重 | 1.0 |
+| style_weight | 设置style总损失权重 | 3.0 |
+
+### Dataset (train & test)
+
+| 字段 | 用途 | 默认值 |
+| :----------- | -------------------- | -------------------- |
+| name | 数据集名称 | LapStyleDataset |
+| content_root | 数据集所在路径 | data/coco/train2017/ |
+| style_root | 目标风格图片所在路径 | data/starrynew.png |
+| load_size | 输入图像resize后图像大小 | 280 |
+| crop_size | 随机剪裁图像后图像大小 | 256 |
+| num_workers | 设置工作进程个数 | 16 |
+| batch_size | 设置一次训练所抓取的数据样本数量 | 5 |
+
+### Lr_scheduler
+
+| 字段 | 用途 | 默认值 |
+| :------------ | ---------------- | -------------- |
+| name | 学习策略名称 | NonLinearDecay |
+| learning_rate | 设置初始学习率 | 1e-4 |
+| lr_decay | 设置学习率衰减率 | 5e-5 |
+
+### Optimizer
+
+| 字段 | 用途 | 默认值 |
+| :-------- | ---------- | ------- |
+| name | 优化器类名 | Adam |
+| net_names | 优化器作用的网络 | net_rev |
+| beta1 | 设置优化器参数beta1 | 0.9 |
+| beta2 | 设置优化器参数beta2 | 0.999 |
+
+### Validate
+
+| 字段 | 用途 | 默认值 |
+| :------- | ---- | ------ |
+| interval | 设置验证间隔 | 500 |
+| save_img | 验证时是否保存图像 | false |
+
+### Log_config
+
+| 字段 | 用途 | 默认值 |
+| :--------------- | ---- | ------ |
+| interval | 设置打印log间隔 | 10 |
+| visiual_interval | 设置训练过程中保存生成图像的间隔 | 500 |
diff --git a/docs/zh_CN/data_prepare.md b/docs/zh_CN/data_prepare.md
index 3b461389a4487f5f980d48356bdcafd649683e67..08f9dc3dc8b05560ef9c6cddc2267edf817c5727 100644
--- a/docs/zh_CN/data_prepare.md
+++ b/docs/zh_CN/data_prepare.md
@@ -1,4 +1,6 @@
-## 数据准备
+# 数据准备
+
+## 1. 数据集路径配置
现有的配置默认数据集的路径是在`$PaddleGAN/data`下,目录结构如下图所示。如果你已经下载好数据集了,建议将数据集软链接到 `$PaddleGAN/data`。
@@ -39,16 +41,24 @@ dataset:
num_workers: 4
```
-### CycleGAN模型相关的数据集下载
+## 2. 准备数据集
+
+### 2.1 下载数据集
+
+#### 2.1.1 CycleGAN模型相关的数据集下载
+
+- #### 从网页下载
-#### 从网页下载
-cyclgan模型相关的数据集可以在[这里](https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/)下载
+CycleGAN模型相关的数据集可以在[这里](https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/)下载,下载后记得软连接到 ```PaddleGAN/data/``` 下。
+
+- #### 使用脚本下载
-#### 使用脚本下载
我们在 ```PaddleGAN/data``` 文件夹下提供了一个脚本 ```download_cyclegan_data.py``` 方便下载CycleGAN相关的
-数据集。执行如下命令可以下载相关的数据集,目前支持的数据集名称有:apple2orange, summer2winter_yosemite,horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos, cityscapes。
+数据集。
+
+目前支持下载的数据集名称有:apple2orange, summer2winter_yosemite, horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos
执行如下命令,可以下载对应的数据集到 ```~/.cache/ppgan``` 并软连接到 ```PaddleGAN/data/``` 下。
@@ -56,35 +66,47 @@ cyclgan模型相关的数据集可以在[这里](https://people.eecs.berkeley.ed
python data/download_cyclegan_data.py --name horse2zebra
```
-#### 使用自己的数据集
-如果你使用自己的数据集,需要构造成如下目录的格式。注意 ```xxxA```,```xxxB```文件数量,文件内容无需一一对应。
-```
-custom_datasets
-├── testA
-├── testB
-├── trainA
-└── trainB
-```
+#### 2.1.2 Pix2Pix相关的数据集下载
-### Pix2Pix相关的数据集下载
+- #### 从网页下载
-#### 从网页下载
-pixel2pixel模型相关的数据集可以在[这里](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/)下载
+pixel2pixel模型相关的数据集可以在[这里](https://people.eecs.berkeley.edu/~tinghuiz/projects/pix2pix/datasets/)下载,下载后记得软连接到 ```PaddleGAN/data/``` 下。
-#### 使用脚本下载
+- #### 使用脚本下载
-我们在 ```PaddleGAN/data``` 文件夹下提供了一个脚本 ```download_pix2pix_data.py``` 方便下载pix2pix模型相关的数据集。执行如下命令可以下载相关的数据集,目前支持的数据集名称有:apple2orange, summer2winter_yosemite,horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos, cityscapes。
-执行如下命令,可以下载对应的数据集到 ```~/.cache/ppgan``` 并软连接到 ```PaddleGAN/data/``` 下。
+我们在 ```PaddleGAN/data``` 文件夹下提供了一个脚本 ```download_pix2pix_data.py``` 方便下载pix2pix模型相关的数据集。
+
+目前支持下载的数据集名称有:apple2orange, summer2winter_yosemite, horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos。
+
+同理,执行如下命令,可以下载对应的数据集到 ```~/.cache/ppgan``` 并软连接到 ```PaddleGAN/data/``` 下。
```
python data/download_pix2pix_data.py --name cityscapes
```
-#### 使用自己的数据集
+### 2.2 自制数据集
+
+#### 2.2.1 非成对数据集构建
+
+针对不需成对数据训练的模型,如CycleGAN等,如需使用自己的数据集,需要构造成如下目录的格式。
+
+注意 ```xxxA```,```xxxB```文件数量,文件内容无需一一对应。
+
+```
+custom_datasets
+├── testA
+├── testB
+├── trainA
+└── trainB
+```
+
+#### 2.2.2 成对数据集构建
+
+针对需要成对数据训练的模型,如Pixel2Pixel等,如需使用自己的数据集,需要构造成如下目录的格式。
-如果你使用自己的数据集,需要构造成如下目录的格式。同时图片应该制作成下图的样式,即左边为一种风格,另一边为相应转换的风格。
+注意图片应该制作成下图的样式,即左边为一种风格,另一边为相应转换的风格。
```
facades
diff --git a/docs/zh_CN/get_started.md b/docs/zh_CN/get_started.md
index 69d6ec89a2c244c5a03442fcefc2c5accf7a2b3b..94ea085c9e2ef62cb84d6ec18ac099a7834f5306 100644
--- a/docs/zh_CN/get_started.md
+++ b/docs/zh_CN/get_started.md
@@ -1,27 +1,61 @@
+# 快速开始
-## 快速开始使用PaddleGAN
+PaddleGAN是飞桨生成对抗网络(GAN)开发套件,提供多种经典前沿网络的高性能复现,应用覆盖图像生成、风格迁移、动作驱动、影像超分及上色等多种领域。
-注意:
-* 开始使用PaddleGAN前请确保已经阅读过[安装文档](./install.md),并根据[数据准备文档](./data_prepare.md)准备好数据集。
-* 以下教程以CycleGAN模型在Cityscapes数据集上的训练预测作为示例。
+本章节将以CycleGAN模型在Cityscapes数据集上的训练预测作为示例,教大家如何快速上手使用PaddleGAN。
+**注意,PaddleGAN中所有的模型配置文件均可在 [./PaddleGAN/configs](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/configs) 中找到。**
-### 训练
+## 目录
+- [安装](#安装)
+- [数据准备](#数据准备)
+- [训练](#训练)
+ - [单卡训练](#1-单卡训练)
+ - [参数](#参数)
+ - [可视化训练](#可视化训练)
+ - [恢复训练](#恢复训练)
+ - [多卡训练](#2-多卡训练)
+- [预测](#预测)
+
+## 安装
+
+关于安装配置运行环境,请参考[安装文档](./install.md)完成Paddle及PaddleGAN的安装。
+
+在本演示案例中,假设用户将PaddleGAN的代码克隆并放置在 `/home/paddle` 目录中。用户执行的命令操作均在 `/home/paddle/PaddleGAN` 目录下完成。
+
+
+## 数据准备
+
+按照[数据准备文档](./data_prepare.md)准备Cityscapes数据集。
+
+- 使用脚本下载Cityscapes数据集到 ~/.cache/ppgan 并软连接到 PaddleGAN/data/ 下:
+
+```
+python data/download_cyclegan_data.py --name cityscapes
+```
+
+## 训练
+
+### 1. 单卡训练
-#### 单卡训练
```
python -u tools/main.py --config-file configs/cyclegan_cityscapes.yaml
```
+
#### 参数
+- `--config-file (str)`: 配置文件的路径。此处用的是CycleGAN在Cityscapes数据集上训练的配置文件。
+- 输出的日志,权重,可视化结果会默认保存在`./output_dir`中,可以通过配置文件中的`output_dir`参数修改:
+
+```
+output_dir: output_dir
+```
+
+

+
-- `--config-file (str)`: 配置文件的路径。
- 输出的日志,权重,可视化结果会默认保存在```./output_dir```中,可以通过配置文件中的```output_dir```参数修改:
- ```
- output_dir: output_dir
- ```
+- 保存的文件夹会根据模型名字和时间戳自动生成一个新目录,目录示例如下:
- 保存的文件夹会根据模型名字和时间戳自动生成一个新目录,目录示例如下:
```
output_dir
└── CycleGANModel-2020-10-29-09-21
@@ -44,32 +78,55 @@ output_dir
├── epoch002_real_B.png
├── epoch002_rec_A.png
└── epoch002_rec_B.png
+
```
-同时可以通过在配置文件中添加参数```enable_visualdl: true```使用[飞桨VisualDL](https://github.com/PaddlePaddle/VisualDL)对训练过程产生的指标或生成的图像进行记录,并运行相应命令对训练过程进行实时监控:
+
+#### 可视化训练
+
+[飞桨VisualDL](https://github.com/PaddlePaddle/VisualDL)是针对深度学习模型开发所打造的可视化分析工具,提供关键指标的实时趋势可视化、样本训练中间过程可视化、网络结构可视化等等,更能直观展示超参与模型效果间关系,辅助实现高效调参。
+
+以下操作请确保您已完成[VisualDL](https://github.com/PaddlePaddle/VisualDL)的安装,安装指南请见[VisualDL安装文档](https://github.com/PaddlePaddle/VisualDL/blob/develop/README_CN.md#%E5%AE%89%E8%A3%85%E6%96%B9%E5%BC%8F)。
+
+**通过在配置文件 cyclegan_cityscapes.yaml 中添加参数`enable_visualdl: true`使用 [飞桨VisualDL](https://github.com/PaddlePaddle/VisualDL)对训练过程产生的指标或生成的图像进行记录,并运行相应命令对训练过程进行实时监控:**
+
+
+

+
+
+如果想要自定义[飞桨VisualDL](https://github.com/PaddlePaddle/VisualDL)可视化内容,可以到 [./PaddleGAN/ppgan/engine/trainer.py](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/ppgan/engine/trainer.py) 中进行修改。
+
+本地启动命令:
+
```
visualdl --logdir output_dir/CycleGANModel-2020-10-29-09-21/
```
+更多启动方式及可视化功能使用指南请见[VisualDL使用指南](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/components/README_CN.md)。
#### 恢复训练
-训练过程中默认会保存上一个epoch的checkpoint,方便恢复训练
+在训练过程中默认会**保存上一个epoch的checkpoint在`output_dir`中,方便恢复训练。**
+
+本次示例中,cyclegan的训练默认**每五个epoch会保存checkpoint**,如需更改,可以到**config文件中的`interval`**进行修改。
+
+
+

+
+
```
python -u tools/main.py --config-file configs/cyclegan_cityscapes.yaml --resume your_checkpoint_path
```
-#### 参数
+- `--resume (str)`: 用来恢复训练的checkpoint路径(保存于上面配置文件中设置的output所在路径)。
-- `--resume (str)`: 用来恢复训练的checkpoint路径。
+### 2. 多卡训练
-#### 多卡训练:
```
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch tools/main.py --config-file configs/cyclegan_cityscapes.yaml
```
-### 预测
+## 预测
+
```
python tools/main.py --config-file configs/cyclegan_cityscapes.yaml --evaluate-only --load your_weight_path
```
-
-#### 参数
- `--evaluate-only`: 是否仅进行预测。
- `--load (str)`: 训练好的权重路径。
diff --git a/docs/zh_CN/industrial_solution/photo_color_cn.md b/docs/zh_CN/industrial_solution/photo_color_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..5427b807067710bbc4cd37b23f8f295f0be83be3
--- /dev/null
+++ b/docs/zh_CN/industrial_solution/photo_color_cn.md
@@ -0,0 +1,44 @@
+# 图片上色
+针对图片的上色,PaddleGAN提供了[DeOldify](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeoldifypredictor)模型。
+
+## DeOldifyPredictor
+
+[DeOldify](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeoldifypredictor)采用自注意力机制的生成对抗网络,生成器是一个U-NET结构的网络。在图像/视频的上色方面有着较好的效果。
+
+
+

+
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `artistic (bool)`: 是否使用偏"艺术性"的模型。"艺术性"的模型有可能产生一些有趣的颜色,但是毛刺比较多。
+- `render_factor (int)`: 会将该参数乘以16后作为输入帧的resize的值,如果该值设置为32,
+ 则输入帧会resize到(32 * 16, 32 * 16)的尺寸再输入到网络中。
+
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import DeOldifyPredictor
+deoldify = DeOldifyPredictor()
+deoldify.run("/home/aistudio/先烈.jpg") #原图片所在路径
+```
+*`run`接口为图片/视频通用接口,由于这里对象是图片,可以使用`run_image`的接口
+
+[完整API接口使用说明]()
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/先烈.jpg \ #原图片路径
+ --process_order DeOldify \ #对原图片处理的顺序
+ --output output_dir #成品图片所在的路径
+```
+
+### 在线项目体验
+**1. [老北京城影像修复](https://aistudio.baidu.com/aistudio/projectdetail/1161285)**
+
+**2. [PaddleGAN ❤️ 520特辑](https://aistudio.baidu.com/aistudio/projectdetail/1956943?channelType=0&channel=0)**
diff --git a/docs/zh_CN/industrial_solution/photo_sr_cn.md b/docs/zh_CN/industrial_solution/photo_sr_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..f87010ac59f20c2bec90c3867c102779128baa87
--- /dev/null
+++ b/docs/zh_CN/industrial_solution/photo_sr_cn.md
@@ -0,0 +1,62 @@
+# 图片超分
+针对图片分辨率提升,PaddleGAN提供了[RealSR](#RealSR)、[ESRGAN](#ESRGAN)、[LESRCNN](#LESRCNN)三种模型。接下来将介绍模型预测方式。
+
+## RealSR
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/single_image_super_resolution.md)
+
+[RealSR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsrealsrpredictor)模型通过估计各种模糊内核以及实际噪声分布,为现实世界的图像设计了一种新颖的真实图片降采样框架。基于该降采样框架,可以获取与真实世界图像共享同一域的低分辨率图像。并且提出了一个旨在提高感知度的真实世界超分辨率模型。对合成噪声数据和真实世界图像进行的大量实验表明,该模型能够有效降低噪声并提高视觉质量。
+
+
+

+
+
+```
+ppgan.apps.RealSRPredictor(output='output', weight_path=None)
+```
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import RealSRPredictor
+sr = RealSRPredictor()
+sr.run("docs/imgs/先烈.jpg") #原图片所在路径
+```
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/先烈.jpg \ #原图片路径
+                     --process_order RealSR \ #对原图片处理的顺序
+                     --output output_dir #成品图片所在的路径
+```
+## ESRGAN
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/single_image_super_resolution.md)
+
+目前ESRGAN还未封装为API供开发者们使用,因此如需使用模型,可下载使用:
+
+| 模型 | 数据集 | 下载地址 |
+|---|---|---|
+| esrgan_psnr_x4 | DIV2K | [esrgan_psnr_x4](https://paddlegan.bj.bcebos.com/models/esrgan_psnr_x4.pdparams)
+| esrgan_x4 | DIV2K | [esrgan_x4](https://paddlegan.bj.bcebos.com/models/esrgan_x4.pdparams)
+
+## LESRCNN
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/single_image_super_resolution.md)
+
+目前LESRCNN还未封装为API供开发者们使用,因此如需使用模型,可下载使用:
+
+| 模型 | 数据集 | 下载地址 |
+|---|---|---|
+| lesrcnn_x4 | DIV2K | [lesrcnn_x4](https://paddlegan.bj.bcebos.com/models/lesrcnn_x4.pdparams)
+
+### 在线项目体验
+**1. [老北京城影像修复](https://aistudio.baidu.com/aistudio/projectdetail/1161285)**
+
+**2. [PaddleGAN ❤️ 520特辑](https://aistudio.baidu.com/aistudio/projectdetail/1956943?channelType=0&channel=0)**
diff --git a/docs/zh_CN/industrial_solution/video_color_cn.md b/docs/zh_CN/industrial_solution/video_color_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..09e1c6b7518542ee0585a3123c0bfcce1c625533
--- /dev/null
+++ b/docs/zh_CN/industrial_solution/video_color_cn.md
@@ -0,0 +1,86 @@
+# 视频上色
+针对视频上色,PaddleGAN提供两种上色模型:[DeOldify](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeoldifypredictor)与[DeepRemaster](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeepremasterpredictor)。
+
+## DeOldifyPredictor
+
+[DeOldify](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeoldifypredictor)采用自注意力机制的生成对抗网络,生成器是一个U-NET结构的网络。在图像/视频的上色方面有着较好的效果。
+
+
+

+
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `artistic (bool)`: 是否使用偏"艺术性"的模型。"艺术性"的模型有可能产生一些有趣的颜色,但是毛刺比较多。
+- `render_factor (int)`: 会将该参数乘以16后作为输入帧的resize的值,如果该值设置为32,
+ 则输入帧会resize到(32 * 16, 32 * 16)的尺寸再输入到网络中。
+
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import DeOldifyPredictor
+deoldify = DeOldifyPredictor()
+deoldify.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+```
+*`run`接口为图片/视频通用接口,由于这里对象是视频,可以使用`run_video`的接口
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order DeOldify \ #对原视频处理的顺序
+ --output output_dir #成品视频所在的路径
+```
+
+## DeepRemasterPredictor
+
+[DeepRemaster](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdeepremasterpredictor) 模型目前只能用于对视频上色,基于时空卷积神经网络和自注意力机制。并且能够根据输入的任意数量的参考帧对视频中的每一帧图片进行上色。
+
+
+
+

+
+
+```
+ppgan.apps.DeepRemasterPredictor(
+ output='output',
+ weight_path=None,
+ colorization=False,
+ reference_dir=None,
+ mindim=360):
+```
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `colorization (bool)`: 是否对输入视频上色,如果选项设置为 `True` ,则参考帧的文件夹路径也必须要设置。默认值:`False`。
+- `reference_dir (bool)`: 参考帧的文件夹路径。默认值:`None`。
+- `mindim (bool)`: 输入帧重新resize后的短边的大小。默认值:360。
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import DeepRemasterPredictor
+deep_remaster = DeepRemasterPredictor()
+deep_remaster.run("docs/imgs/test_old.jpeg") #原视频所在路径
+
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order DeepRemaster \ #对原视频处理的顺序
+ --output output_dir #成品视频所在的路径
+```
+
+### 在线项目体验
+**1. [老北京城影像修复](https://aistudio.baidu.com/aistudio/projectdetail/1161285)**
+
+**2. [PaddleGAN ❤️ 520特辑](https://aistudio.baidu.com/aistudio/projectdetail/1956943?channelType=0&channel=0)**
diff --git a/docs/zh_CN/industrial_solution/video_frame_cn.md b/docs/zh_CN/industrial_solution/video_frame_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..a31e9d2c0f2095e3d911505c09fde8f4066126e2
--- /dev/null
+++ b/docs/zh_CN/industrial_solution/video_frame_cn.md
@@ -0,0 +1,54 @@
+# 视频补帧
+
+针对老视频的流畅度提升,PaddleGAN提供了[DAIN](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdainpredictor)模型接口。
+
+## DAIN
+
+[DAIN](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsdainpredictor)模型通过探索深度的信息来显式检测遮挡。并且开发了一个深度感知的流投影层来合成中间流。在视频补帧方面有较好的效果。
+
+
+

+
+
+```
+ppgan.apps.DAINPredictor(
+ output='output',
+ weight_path=None,
+ time_step=None,
+ use_gpu=True,
+ remove_duplicates=False)
+```
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `time_step (float)`: 补帧的时间系数,例如设置为0.5时,原先每秒30帧的视频,补帧后变为每秒60帧。
+- `remove_duplicates (bool,可选的)`: 是否删除重复帧,默认值:`False`.
+
+### 使用方式
+**1. API预测**
+
+除了定义输入视频路径外,此接口还需定义time_step,同时,目前API预测方式只支持在静态图下运行,需加上启动静态图命令,后续会支持动态图,敬请期待~
+
+```
+import paddle
+paddle.enable_static()  # DAIN 目前需在静态图模式下运行
+
+from ppgan.apps import DAINPredictor
+dain = DAINPredictor(output='output', time_step=0.5)
+# 测试一个视频文件
+dain.run("/home/aistudio/Peking_input360p_clip6_5s.mp4")
+
+paddle.disable_static()  # 预测完成后关闭静态图
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order DAIN \
+ --output output_dir #成品视频所在的路径
+```
+### 在线项目体验
+**1. [老北京城影像修复](https://aistudio.baidu.com/aistudio/projectdetail/1161285)**
diff --git a/docs/zh_CN/industrial_solution/video_restore_cn.md b/docs/zh_CN/industrial_solution/video_restore_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..df5eebfe4a6d86de9a398cb0bf95b656363ff579
--- /dev/null
+++ b/docs/zh_CN/industrial_solution/video_restore_cn.md
@@ -0,0 +1,41 @@
+# 智能影像修复
+
+PaddleGAN提供一系列影像修复能力,包括 **[图片上色](./photo_color_cn.md)、[视频上色](./video_color_cn.md)、[图片分辨率提升](./photo_sr_cn.md)、[视频分辨率提升](./video_sr_cn.md)** 以及 **[视频流畅度提升](./video_frame_cn.md)**(提高视频播放流畅度),涵盖上色、超分、补帧三大方向,使历史影像恢复往日鲜活的色彩,清晰流畅地呈现于我们眼前。
+
+在未来,PaddleGAN也将不断补充与优化影像修复的能力,比如增加去噪、图像修复等功能,敬请期待!
+
+## **一行代码快速进行影像修复**
+
+```
+cd applications
+python tools/video-enhance.py --input you_video_path.mp4 --process_order DAIN DeOldify PPMSVSR --output output_dir
+```
+
+### **参数**
+
+- `--input (str)`: 输入的视频路径。
+- `--output (str)`: 输出的视频路径。
+- `--process_order`: 调用的模型名字和顺序,比如输入为 `DAIN DeOldify PPMSVSR`,则会顺序调用 `DAINPredictor` `DeOldifyPredictor` `PPMSVSRPredictor` 。
+- `--cpu`: 开启cpu推理,默认使用GPU推理。
+
+
+

+
+
+## 详细教程
+* 视频修复
+ * [视频上色](./video_color_cn.md)
+ * [视频分辨率提升](./video_sr_cn.md)
+ * [视频流畅度提升](./video_frame_cn.md)
+
+* 照片修复
+ * [图片上色](./photo_color_cn.md)
+ * [图片分辨率提升](./photo_sr_cn.md)
+
+
+## 在线体验
+为了让大家快速体验影像修复的能力,PaddleGAN在飞桨人工智能学习与实训平台AI Studio准备了完整的实现步骤及详细代码,同时,AI Studio还为大家准备了免费的GPU算力,大家登录即可亲自实践 **[老北京城影像修复](https://aistudio.baidu.com/aistudio/projectdetail/1161285)** 的项目,快上手体验吧!
+
+
+

+
diff --git a/docs/zh_CN/industrial_solution/video_sr_cn.md b/docs/zh_CN/industrial_solution/video_sr_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0c12481eb607eb71760362e8adf0b6c22242bcd
--- /dev/null
+++ b/docs/zh_CN/industrial_solution/video_sr_cn.md
@@ -0,0 +1,279 @@
+# 视频分辨率提升
+
+针对视频超分,PaddleGAN提供了七种模型,[RealSR](#RealSR)、[PPMSVSR](#PPMSVSR)、[PPMSVSRLarge](#PPMSVSRLarge)、[EDVR](#EDVR)、[BasicVSR](#BasicVSR)、[IconVSR](#IconVSR)、[BasiVSRPlusPlus](#BasiVSRPlusPlus)。
+
+## RealSR
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/single_image_super_resolution.md)
+
+[RealSR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsrealsrpredictor)是图像超分模型,其通过估计各种模糊内核以及实际噪声分布,为现实世界的图像设计了一种新颖的真实图片降采样框架。基于该降采样框架,可以获取与真实世界图像共享同一域的低分辨率图像。并且提出了一个旨在提高感知度的真实世界超分辨率模型。对合成噪声数据和真实世界图像进行的大量实验表明,该模型能够有效降低噪声并提高视觉质量。
+
+
+

+
+
+```
+ppgan.apps.RealSRPredictor(output='output', weight_path=None)
+```
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import RealSRPredictor
+sr = RealSRPredictor()
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order RealSR \ #对原视频处理的顺序
+ --output output_dir #成品视频所在的路径
+```
+
+
+## PPMSVSR
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/video_super_resolution.md)
+
+[PPMSVSR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsppmsvsrpredictor)为PaddleGAN自研的轻量视频超分模型,是一种多阶段视频超分深度架构,具有局部融合模块、辅助损失和细化对齐模块,以逐步细化增强结果。具体来说,在第一阶段设计了局部融合模块,在特征传播之前进行局部特征融合, 以加强特征传播中跨帧特征的融合。在第二阶段中引入了一个辅助损失,使传播模块获得的特征保留了更多与HR空间相关的信息。在第三阶段中引入了一个细化的对齐模块,以充分利用前一阶段传播模块的特征信息。大量实验证实,PP-MSVSR在Vid4数据集性能优异,仅使用 1.45M 参数PSNR指标即可达到28.13dB。
+
+[PPMSVSR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsppmsvsrpredictor)模型是一个轻量视频超分模型,在当前轻量视频超分模型(模型参数量小于6M)中,PPMSVSR以最小的参数量在4个常用视频超分测试数据集Vimeo90K、Vid4、UDM10和REDS4上达到最优超分效果。
+
+
+

+
+
+```
+ppgan.apps.PPMSVSRPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `num_frames (int,可选的)`: 模型输入帧数,默认值:`10`。模型输入帧数设置的越大,模型超分效果越好.
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import PPMSVSRPredictor
+sr = PPMSVSRPredictor()
+# 测试一个视频文件
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+
+```
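+
+若显存允许,也可以适当调大 `num_frames` 以获得更好的超分效果,下面是一个示例(`num_frames=20` 仅为示例取值):
+
+```
+from ppgan.apps import PPMSVSRPredictor
+sr = PPMSVSRPredictor(output='output', num_frames=20)
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4")
+```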
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order PPMSVSR \ #对原视频处理的顺序,此处注意“PPMSVSR”需全部大写
+ --output output_dir #成品视频所在的路径
+```
+
+## PPMSVSRLarge
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/video_super_resolution.md)
+
+[PPMSVSRLarge](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsppmsvsrlargepredictor)为PaddleGAN自研的高精度超分模型,是一种多阶段视频超分深度架构,具有局部融合模块、辅助损失和细化对齐模块,以逐步细化增强结果。具体来说,在第一阶段设计了局部融合模块,在特征传播之前进行局部特征融合, 以加强特征传播中跨帧特征的融合。在第二阶段中引入了一个辅助损失,使传播模块获得的特征保留了更多与HR空间相关的信息。在第三阶段中引入了一个细化的对齐模块,以充分利用前一阶段传播模块的特征信息。
+
+[PPMSVSRLarge](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsppmsvsrlargepredictor)模型是为进一步提升精度,在PPMSVSR基础上通过增加基础块数量构造的一个大模型。PPMSVSRLarge与当前精度最高的BasicVSR++模型相比,以相似的参数量达到了更高的精度。
+
+```
+ppgan.apps.PPMSVSRLargePredictor(output='output', weight_path=None, num_frames=10)
+```
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `num_frames (int,可选的)`: 模型输入帧数,默认值:`10`。模型输入帧数设置的越大,模型超分效果越好.
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import PPMSVSRLargePredictor
+sr = PPMSVSRLargePredictor()
+# 测试一个视频文件
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order PPMSVSRLarge \ #对原视频处理的顺序,此处注意“PPMSVSRLarge”的拼写及大小写
+ --output output_dir #成品视频所在的路径
+```
+
+
+
+## EDVR
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/video_super_resolution.md)
+
+[EDVR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsedvrpredictor)模型提出了一个新颖的、具有增强可变形卷积的视频还原框架:第一,为了处理大幅度运动,设计了一个金字塔、级联和可变形(PCD)对齐模块,使用可变形卷积以从粗到精的方式在特征级别完成对齐;第二,提出时空注意力(TSA)融合模块,在时间和空间上融合注意力机制,用以增强复原的功能。
+
+[EDVR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsedvrpredictor)模型是一个基于连续帧的超分模型,能够有效利用帧间的信息,速度比[RealSR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsrealsrpredictor)模型快。
+
+
+

+
+
+```
+ppgan.apps.EDVRPredictor(output='output', weight_path=None)
+```
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import EDVRPredictor
+sr = EDVRPredictor()
+# 测试一个视频文件
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order EDVR \ #对原视频处理的顺序,此处注意“EDVR”四个字母都需大写
+ --output output_dir #成品视频所在的路径
+```
+
+## BasicVSR
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/video_super_resolution.md)
+
+[BasicVSR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsbasicvsrpredictor)在VSR的指导下重新考虑了四个基本模块(即传播、对齐、聚合和上采样)的一些最重要的组件。 通过添加一些小设计,重用一些现有组件,得到了简洁的 BasicVSR。与许多最先进的算法相比,BasicVSR在速度和恢复质量方面实现了有吸引力的改进。
+
+```
+ppgan.apps.BasicVSRPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `num_frames (int,可选的)`: 模型输入帧数,默认值:`10`。模型输入帧数设置的越大,模型超分效果越好.
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import BasicVSRPredictor
+sr = BasicVSRPredictor()
+# 测试一个视频文件
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order BasicVSR \ #对原视频处理的顺序,此处注意“BasicVSR”的大小写
+ --output output_dir #成品视频所在的路径
+```
+
+## IconVSR
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/video_super_resolution.md)
+
+[IconVSR](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsiconvsrpredictor)由BasicVSR扩展而来,在BasicVSR的基础上通过添加信息重新填充机制和耦合传播方案来促进信息聚合。与BasicVSR相比,IconVSR的精度有进一步提升。
+
+```
+ppgan.apps.IconVSRPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `num_frames (int,可选的)`: 模型输入帧数,默认值:`10`。模型输入帧数设置的越大,模型超分效果越好.
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import IconVSRPredictor
+sr = IconVSRPredictor()
+# 测试一个视频文件
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order IconVSR \ #对原视频处理的顺序,此处注意“IconVSR”的大小写
+ --output output_dir #成品视频所在的路径
+```
+
+## BasiVSRPlusPlus
+
+[完整模型教程](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/tutorials/video_super_resolution.md)
+
+[BasiVSRPlusPlus](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md#ppganappsbasicvsrpluspluspredictor)通过提出二阶网格传播和光流引导的可变形对齐来重新设计BasicVSR。通过增强传播和对齐机制,BasicVSR++可以更有效地利用未对齐视频帧的时空信息。在相似的计算约束下,新组件可带来性能提升;特别是,BasicVSR++以相似的参数量在PSNR方面比BasicVSR高0.82dB。BasicVSR++在NTIRE2021的视频超分辨率和压缩视频增强挑战赛中获得三项冠军和一项亚军。
+
+
+

+
+
+```
+ppgan.apps.BasiVSRPlusPlusPredictor(output='output', weight_path=None, num_frames=10)
+```
+
+### 参数
+
+- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+- `num_frames (int,可选的)`: 模型输入帧数,默认值:`10`。模型输入帧数设置的越大,模型超分效果越好.
+
+### 使用方式
+**1. API预测**
+
+```
+from ppgan.apps import BasiVSRPlusPlusPredictor
+sr = BasiVSRPlusPlusPredictor()
+# 测试一个视频文件
+sr.run("/home/aistudio/Peking_input360p_clip6_5s.mp4") #原视频所在路径
+
+```
+
+**2. 命令行预测**
+
+```
+!python applications/tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \ #原视频路径
+ --process_order BasiVSRPlusPlus \ #对原视频处理的顺序,此处注意“BasiVSRPlusPlus”的拼写及大小写
+ --output output_dir #成品视频所在的路径
+```
+
+
+### 在线项目体验
+**1. [PaddleGAN SOTA算法:视频超分模型PP-MSVSR详解及应用](https://aistudio.baidu.com/aistudio/projectdetail/3205183)**
+
+**2. [老北京城影像修复](https://aistudio.baidu.com/aistudio/projectdetail/1161285)**
+
+**3. [PaddleGAN ❤️ 520特辑](https://aistudio.baidu.com/aistudio/projectdetail/1956943?channelType=0&channel=0)**
diff --git a/docs/zh_CN/install.md b/docs/zh_CN/install.md
index 5f1c80ceb558b1f16bc530841c6e9b4019ef2a38..275c85f99cd12ed359d6c0e318cd609d5fc6ef42 100644
--- a/docs/zh_CN/install.md
+++ b/docs/zh_CN/install.md
@@ -1,71 +1,73 @@
-## 安装PaddleGAN
+# 安装文档
+本文档包含了如何安装PaddleGAN以及相关依赖,更多产品简介请参考[README](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/README_cn.md)。
+
+## 环境依赖
+
+- PaddlePaddle >= 2.1.0
+- Python >= 3.6
+- CUDA >= 10.1
-### 要求
-* PaddlePaddle >= 2.0.0-rc
-* Python >= 3.6
-* CUDA >= 9.0
+## 安装PaddlePaddle
-### 1. 安装PaddlePaddle
```
-pip install -U paddlepaddle-gpu==2.0.0rc0
+
+# CUDA10.1
+python -m pip install paddlepaddle-gpu==2.1.0.post101 -f https://mirror.baidu.com/pypi/simple
+
+# CPU
+python -m pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
+
```
-上面命令会默认安装cuda10.2的包,如果想安装其他cuda版本的包,可以参考下面的表格。
- CUDA | python3.8 | python3.7 | python3.6 | 10.1 | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10.1-cudnn7-mkl_gcc8.2%2Fpaddlepaddle_gpu-2.0.0rc0.post101-cp38-cp38-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10.1-cudnn7-mkl_gcc8.2%2Fpaddlepaddle_gpu-2.0.0rc0.post101-cp37-cp37m-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10.1-cudnn7-mkl_gcc8.2%2Fpaddlepaddle_gpu-2.0.0rc0.post101-cp36-cp36m-linux_x86_64.whl
- | |
10.0 | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post100-cp38-cp38-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post100-cp37-cp37m-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda10-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post100-cp36-cp36m-linux_x86_64.whl
- | |
9.0 | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda9-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post90-cp38-cp38-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda9-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post90-cp37-cp37m-linux_x86_64.whl
- | install python -m pip install https://paddle-wheel.bj.bcebos.com/2.0.0-rc0-gpu-cuda9-cudnn7-mkl%2Fpaddlepaddle_gpu-2.0.0rc0.post90-cp36-cp36m-linux_x86_64.whl
- |
+更多安装方式例如conda或源码编译安装方法,请参考[PaddlePaddle安装文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)。
-支持更多系统的安装教程请前往[paddlepaddle官网](https://www.paddlepaddle.org.cn/install/quick)
+请确保您的PaddlePaddle安装成功并且版本不低于需求版本。使用以下命令进行验证。
-### 2. 安装PaddleGAN
+```
+# 在您的Python解释器中确认PaddlePaddle安装成功
+>>> import paddle
+>>> paddle.utils.run_check()
-##### 2.1 通过Pip安裝
+# 确认PaddlePaddle版本
+python -c "import paddle; print(paddle.__version__)"
+```
+
+## 安装PaddleGAN
+
+### 通过PIP安装(只支持Python3)
+
+* 安装:
```
-# only support Python3
python3 -m pip install --upgrade ppgan
```
-
-下载示例和配置文件:
+* 下载示例和配置文件:
```
git clone https://github.com/PaddlePaddle/PaddleGAN
cd PaddleGAN
```
-
-##### 2.2通过源码安装
+### 通过源码安装
```
git clone https://github.com/PaddlePaddle/PaddleGAN
cd PaddleGAN
pip install -v -e . # or "python setup.py develop"
-```
-
-按照上述方法安装成功后,本地的修改也会自动同步到ppgan中
+# 安装其他依赖
+pip install -r requirements.txt
+```
+## 其他第三方工具安装
-### 4. 其他可能用到的工具安装
-
-#### 4.1 ffmpeg
-
-如果需要使用ppgan处理视频相关的任务,则需要安装ffmpeg。这里推荐您使用[conda](https://docs.conda.io/en/latest/miniconda.html)安装:
+* 涉及视频的任务都需安装**ffmpeg**,这里推荐使用[conda](https://docs.conda.io/en/latest/miniconda.html)安装:
```
conda install x264=='1!152.20180717' ffmpeg=4.0.2 -c conda-forge
```
-#### 4.2 Visual DL
-
-如果需要使用[飞桨VisualDL](https://github.com/PaddlePaddle/VisualDL)对训练过程进行可视化监控,请安装`VisualDL`(使用方法请参考[这里](./get_started.md)):
-
+* 如需使用可视化工具监控训练过程,请安装[飞桨VisualDL](https://github.com/PaddlePaddle/VisualDL):
```
python -m pip install visualdl -i https://mirror.baidu.com/pypi/simple
```
+* 注意:VisualDL目前只维护Python3以上的安装版本。
diff --git a/docs/zh_CN/tutorials/animegan.md b/docs/zh_CN/tutorials/animegan.md
index bc3d5aa950bd2a0f229b29d2fb319fd9c8f2d6c5..3c86beca5873d0dbf495d74ff0e0bbd188c8c12f 100644
--- a/docs/zh_CN/tutorials/animegan.md
+++ b/docs/zh_CN/tutorials/animegan.md
@@ -70,7 +70,7 @@ animedataset
1. 预热模型完成后,训练风格迁移模型:
**注意:** 必须先修改在`configs/animeganv2.yaml`中的`pretrain_ckpt`参数,确保指向正确的 **预热模型权重路径**
- 设置`batch size=4`,`learning rate=0.00002`,在一个 GTX2060S GPU上训练30个epoch即可获得较好的效果,其他超参数请参考`configs/animeganv2.yaml`。
+ 设置`batch size=4`,`learning rate=0.0002`,在一个 GTX2060S GPU上训练30个epoch即可获得较好的效果,其他超参数请参考`configs/animeganv2.yaml`。
```sh
python tools/main.py --config-file configs/animeganv2.yaml
diff --git a/docs/zh_CN/tutorials/aotgan.md b/docs/zh_CN/tutorials/aotgan.md
new file mode 100644
index 0000000000000000000000000000000000000000..716d613b8b469324ace96379ef6caeef1ec6c4aa
--- /dev/null
+++ b/docs/zh_CN/tutorials/aotgan.md
@@ -0,0 +1,101 @@
+# AOT GAN
+
+## 1. 简介
+
+本应用的 AOT GAN 模型出自论文《Aggregated Contextual Transformations for High-Resolution Image Inpainting》,其通过聚合不同膨胀率的空洞卷积学习到的图片特征,刷出了inpainting任务的新SOTA。模型推理效果如下:
+
+
+
+**论文:** [Aggregated Contextual Transformations for High-Resolution Image Inpainting](https://paperswithcode.com/paper/aggregated-contextual-transformations-for)
+
+**参考repo:** [https://github.com/megvii-research/NAFNet](https://github.com/megvii-research/NAFNet)
+
+## 2.快速体验
+
+预训练模型权重文件 g.pdparams 可以从如下地址下载: (https://paddlegan.bj.bcebos.com/models/AotGan_g.pdparams)
+
+输入一张 512x512 尺寸的图片和擦除 mask 给模型,输出一张补全(inpainting)的图片。预测代码如下:
+
+```
+python applications/tools/aotgan.py \
+ --input_image_path data/aotgan/armani1.jpg \
+ --input_mask_path data/aotgan/armani1.png \
+ --weight_path test/aotgan/g.pdparams \
+ --output_path output_dir/armani_pred.jpg \
+ --config-file configs/aotgan.yaml
+```
+
+**参数说明:**
+* input_image_path:输入图片路径
+* input_mask_path:输入擦除 mask 路径
+* weight_path:训练完成的模型权重存储路径,为 statedict 格式(.pdparams)的 Paddle 模型权重文件
+* output_path:预测生成图片的存储路径
+* config-file:存储参数设定的yaml文件存储路径,与训练过程使用同一个yaml文件,预测参数由 predict 下字段设定
+
+AI Studio 快速体验项目:(https://aistudio.baidu.com/aistudio/datasetdetail/165081)
+
+## 3.训练
+
+**数据准备:**
+
+* 训练用的图片解压到项目路径下的 data/aotgan/train_img 文件夹内,可包含多层目录,dataloader会递归读取每层目录下的图片。训练用的mask图片解压到项目路径下的 data/aotgan/train_mask 文件夹内。
+* 验证用的图片和mask图片相应的放到项目路径下的 data/aotgan/val_img 文件夹和 data/aotgan/val_mask 文件夹内。
+
+数据集目录结构如下:
+
+```
+└─data
+ └─aotgan
+ ├─train_img
+ ├─train_mask
+ ├─val_img
+ └─val_mask
+```
+
+* 训练预训练模型的权重使用了 Place365Standard 数据集的训练集图片,以及 NVIDIA Irregular Mask Dataset 数据集的测试集掩码图片。Place365Standard 的训练集为 160万张长或宽最小为 512 像素的图片。NVIDIA Irregular Mask Dataset 的测试集为 12000 张尺寸为 512 x 512 的不规则掩码图片。数据集下载链接:[Place365Standard](http://places2.csail.mit.edu/download.html)、[NVIDIA Irregular Mask Dataset](https://nv-adlr.github.io/publication/partialconv-inpainting)
+
+### 3.1 gpu 单卡训练
+
+`python -u tools/main.py --config-file configs/aotgan.yaml`
+
+* config-file:训练使用的超参设置 yaml 文件的存储路径
+
+### 3.2 gpu 多卡训练
+
+```
+!python -m paddle.distributed.launch \
+ tools/main.py \
+ --config-file configs/aotgan.yaml \
+ -o dataset.train.batch_size=6
+```
+
+* config-file:训练使用的超参设置 yaml 文件的存储路径
+* -o dataset.train.batch_size=6:-o 设置参数覆盖 yaml 文件中的值,这里调整了 batch_size 参数
+
+### 3.3 继续训练
+
+```
+python -u tools/main.py \
+ --config-file configs/aotgan.yaml \
+ --resume output_dir/[path_to_checkpoint]/iter_[iternumber]_checkpoint.pdparams
+```
+
+* config-file:训练使用的超参设置 yaml 文件的存储路径
+* resume:指定读取的 checkpoint 路径
+
+### 3.4 实验结果展示
+
+在Places365数据集的验证集上的指标如下:
+
+| mask | PSNR | SSIM | download |
+| ---- | ---- | ---- | ---- |
+| 20-30% | 26.04001 | 0.89011 | [download](https://paddlegan.bj.bcebos.com/models/AotGan_g.pdparams) |
+
+## 4. 参考链接与文献
+
+```
+@inproceedings{yan2021agg,
+  author = {Zeng, Yanhong and Fu, Jianlong and Chao, Hongyang and Guo, Baining},
+  title = {Aggregated Contextual Transformations for High-Resolution Image Inpainting},
+  booktitle = {Arxiv},
+  pages={-},
+  year = {2020}
+}
+```
diff --git a/docs/zh_CN/tutorials/face_enhancement.md b/docs/zh_CN/tutorials/face_enhancement.md
new file mode 100644
index 0000000000000000000000000000000000000000..fb14b6130d1b3a7734aca5e5d87e7a2bc5d1efb9
--- /dev/null
+++ b/docs/zh_CN/tutorials/face_enhancement.md
@@ -0,0 +1,43 @@
+# 人脸增强
+
+## 1. 人脸增强简介
+
+从严重退化的人脸图像中恢复出人脸是一个非常具有挑战性的问题。由于问题的严重性和复杂的未知退化,直接训练深度神经网络通常无法得到可接受的结果。现有的基于生成对抗网络 (GAN) 的方法可以产生更好的结果,但往往会产生过度平滑的恢复。这里我们提供[GPEN](https://arxiv.org/abs/2105.06070)模型来进行人脸增强。GPEN模型首先学习用于生成高质量人脸图像的GAN并将其嵌入到U形DNN作为先验解码器,然后使用一组合成的低质量人脸图像对GAN先验嵌入DNN进行微调。 GAN 模块的设计是为了确保输入到 GAN 的隐码和噪声可以分别从 DNN 的深层和浅层特征中生成,控制重建图像的全局人脸结构、局部人脸细节和背景。所提出的 GAN 先验嵌入网络 (GPEN) 易于实现,并且可以生成视觉上逼真的结果。实验表明,GPEN 在数量和质量上都比最先进的 BFR 方法取得了显着优越的结果,特别是对于野外严重退化的人脸图像的恢复。
+
+## 使用方法
+
+### 人脸增强
+
+用户使用如下代码进行人脸增强,选择本地图像作为输入:
+
+```python
+import cv2
+import paddle
+from ppgan.faceutils.face_enhancement import FaceEnhancement
+
+# 读取本地图像并转为float类型输入(目前不支持int8),路径请替换为自己的图片
+img = cv2.imread('path/to/your/image.png', cv2.IMREAD_COLOR).astype('float32')
+
+faceenhancer = FaceEnhancement()
+img = faceenhancer.enhance_from_image(img)
+```
+
+注意:请将图片转为float类型输入,目前不支持int8类型
+
+### 训练
+
+[详见](../../zh_CN/tutorials/gpen.md)
+
+## 人脸增强结果展示
+
+
+
+## 参考文献
+
+```
+@inproceedings{inproceedings,
+author = {Yang, Tao and Ren, Peiran and Xie, Xuansong and Zhang, Lei},
+year = {2021},
+month = {06},
+pages = {672-681},
+title = {GAN Prior Embedded Network for Blind Face Restoration in the Wild},
+doi = {10.1109/CVPR46437.2021.00073}
+}
+
+```
diff --git a/docs/zh_CN/tutorials/face_parse.md b/docs/zh_CN/tutorials/face_parse.md
index 24c1a622f644f065cdaf12cd1ab6e6544064d44b..931a76d6548a929b381cf74daf93917137fb83ce 100644
--- a/docs/zh_CN/tutorials/face_parse.md
+++ b/docs/zh_CN/tutorials/face_parse.md
@@ -10,7 +10,7 @@
运行如下命令,可以完成人脸解析任务,程序运行成功后,会在`output`文件夹生成解析后的图片文件。具体命令如下所示:
```
cd applications
-python face_parse.py --input_image ../docs/imgs/face.png
+python tools/face_parse.py --input_image ../docs/imgs/face.png
```
**参数:**
diff --git a/docs/zh_CN/tutorials/gfpgan.md b/docs/zh_CN/tutorials/gfpgan.md
new file mode 100644
index 0000000000000000000000000000000000000000..1da5317fe303368307e75aedc894b28d9fc174d9
--- /dev/null
+++ b/docs/zh_CN/tutorials/gfpgan.md
@@ -0,0 +1,198 @@
+## GFPGAN 盲脸复原模型
+
+
+## 1、介绍
+GFP-GAN利用封装在预训练人脸GAN中丰富而多样的先验信息进行盲脸修复。
+### GFPGAN的整体结构:
+
+
+
+GFP-GAN由退化去除模块(U-Net)和作为先验的预训练人脸GAN(如StyleGAN2)组成,二者之间通过隐编码(latent code)映射和若干通道分割空间特征变换(CS-SFT)层进行连接。
+
+通过这种方式处理特征,模型在保持高保真度的同时生成了逼真的修复结果。
+
+更详细的模型介绍及参考repo,可查看以下AI Studio项目:
+[基于PaddleGAN复现GFPGAN](https://aistudio.baidu.com/aistudio/projectdetail/4421649)
+
+在该实验中,我们使用Adam优化器对模型共训练了210k次迭代。
+
+GFPGAN的复现实验结果如下:
+
+
+Model | LPIPS | FID | PSNR
+--- |:---:|:---:|:---:|
+GFPGAN | 0.3817 | 36.8068 | 65.0461
+
+## 2、准备工作
+
+### 2.1 数据集准备
+
+GFPGAN模型的训练集是经典的FFHQ人脸数据集,共7万张1024 x 1024的高分辨率人脸图片;测试集为CELEBA-HQ数据集,共2000张高分辨率人脸图片,其生成方式与训练时相同。
+详细信息可以参考**数据集网址:** [FFHQ](https://github.com/NVlabs/ffhq-dataset) ,[CELEBA-HQ](https://github.com/tkarras/progressive_growing_of_gans) 。以下给出了具体的下载链接:
+
+**原始数据集地址:**
+
+**FFHQ :** https://drive.google.com/drive/folders/1tZUcXDBeOibC6jcMCtgRRz67pzrAHeHL?usp=drive_open
+
+**CELEBA-HQ:** https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs?resourcekey=0-arAVTUfW9KRhN-irJchVKQ&usp=sharing
+
+数据集结构如下
+
+```
+|-- data/GFPGAN
+ |-- train
+ |-- 00000.png
+ |-- 00001.png
+ |-- ......
+ |-- 00999.png
+ |-- ......
+ |-- 69999.png
+ |-- lq
+ |-- 2000张jpg图片
+ |-- gt
+ |-- 2000张jpg图片
+```
+
+请修改configs/gfpgan_ffhq1024.yaml配置文件中dataset下train和test的dataroot参数为您的训练集和测试集路径。
+
+### 2.2 模型准备
+**模型参数文件和训练日志下载地址:**
+
+https://paddlegan.bj.bcebos.com/models/GFPGAN.pdparams
+
+从链接中下载模型参数和测试图像,并将它们放在项目根目录下的data/文件夹中。
+
+下载的参数文件是一个可以通过paddle加载的dict,其中包含net_g和net_g_ema两个key,推理时使用其中任意一个即可。
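+
+下面给出一个加载该参数文件并查看其key的最小示例(假设参数文件已放在 `data/GFPGAN.pdparams`,路径仅为示例):
+
+```python
+import paddle
+
+# 加载下载好的GFPGAN参数文件(dict格式)
+state = paddle.load('data/GFPGAN.pdparams')
+print(state.keys())  # 预期包含 net_g 与 net_g_ema
+
+# 推理时可任选其一,例如使用滑动平均后的生成器权重
+net_g_ema_weights = state['net_g_ema']
+```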
+
+## 3、开始使用
+### 3.1 模型训练
+
+在控制台中输入以下代码开始训练:
+
+ ```bash
+ python tools/main.py -c configs/gfpgan_ffhq1024.yaml
+ ```
+
+该模型支持单卡训练和多卡训练。
+也可以使用如下命令进行多卡训练
+
+```bash
+!CUDA_VISIBLE_DEVICES=0,1,2,3
+!python -m paddle.distributed.launch tools/main.py \
+ --config-file configs/gfpgan_ffhq1024.yaml
+```
+
+模型训练需使用paddle2.3及以上版本,且需等paddle实现elementwise_pow的二阶算子相关功能;使用paddle2.2.2版本能正常运行,但因部分损失函数会求出错误梯度,导致模型无法训练成功。如训练时报错则暂不支持进行训练,可跳过训练部分,直接使用提供的模型参数进行测试。模型评估和测试使用paddle2.2.2及以上版本即可。
+
+### 3.2 模型评估
+
+当评估模型时,在控制台中输入以下代码,使用上面提到的下载的模型参数:
+
+ ```shell
+python tools/main.py -c configs/gfpgan_ffhq1024.yaml --load GFPGAN.pdparams --evaluate-only
+ ```
+
+如果您想在自己提供的模型上进行测试,请修改 --load 之后的路径。
+
+
+
+### 3.3 模型预测
+
+#### 3.3.1 导出模型
+
+训练完成后,您需要使用 `tools/export_model.py` 从训练好的模型中导出生成器的权重(仅包括生成器)。
+输入以下命令导出生成器模型:
+
+```bash
+python -u tools/export_model.py --config-file configs/gfpgan_ffhq1024.yaml \
+ --load GFPGAN.pdparams \
+ --inputs_size 1,3,512,512
+```
+
+
+#### 3.3.2 加载一张图片
+
+你可以使用我们在ppgan/faceutils/face_enhancement/gfpgan_enhance.py中的工具来快速推断一张图片
+
+```python
+%env PYTHONPATH=.:$PYTHONPATH
+%env CUDA_VISIBLE_DEVICES=0
+import paddle
+import cv2
+import numpy as np
+import sys
+from ppgan.faceutils.face_enhancement.gfpgan_enhance import gfp_FaceEnhancement
+# 图片路径可以用自己的
+img_path='test/2.png'
+img = cv2.imread(img_path, cv2.IMREAD_COLOR)
+# 这是原来的模糊图片
+cv2.imwrite('test/outlq.png',img)
+img=np.array(img).astype('float32')
+faceenhancer = gfp_FaceEnhancement()
+img = faceenhancer.enhance_from_image(img)
+# 这是生成的清晰图片
+cv2.imwrite('test/out_gfpgan.png',img)
+```
+
+
+
+
+
+
+
+## 4. Tipc
+
+### 4.1 导出推理模型
+
+```bash
+python -u tools/export_model.py --config-file configs/gfpgan_ffhq1024.yaml \
+ --load GFPGAN.pdparams \
+ --inputs_size 1,3,512,512
+```
+
+### 4.2 使用paddleInference推理
+
+```bash
+%cd /home/aistudio/work/PaddleGAN
+# %env PYTHONPATH=.:$PYTHONPATH
+# %env CUDA_VISIBLE_DEVICES=0
+!python -u tools/inference.py --config-file configs/gfpgan_ffhq1024.yaml \
+ --model_path GFPGAN.pdparams \
+ --model_type gfpgan \
+ --device gpu \
+ -o validate=None
+```
+
+
+### 4.3 一键TIPC
+
+测试基础训练预测功能的`lite_train_lite_infer`模式,执行:
+
+```bash
+%cd /home/aistudio/work/PaddleGAN
+!bash test_tipc/prepare.sh \
+ test_tipc/configs/GFPGAN/train_infer_python.txt \
+ lite_train_lite_infer
+!bash test_tipc/test_train_inference_python.sh \
+ test_tipc/configs/GFPGAN/train_infer_python.txt \
+ lite_train_lite_infer
+```
+
+
+
+## 5、References
+
+```
+@InProceedings{wang2021gfpgan,
+ author = {Xintao Wang and Yu Li and Honglun Zhang and Ying Shan},
+ title = {Towards Real-World Blind Face Restoration with Generative Facial Prior},
+ booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+ year = {2021}
+}
+```
diff --git a/docs/zh_CN/tutorials/gpen.md b/docs/zh_CN/tutorials/gpen.md
new file mode 100644
index 0000000000000000000000000000000000000000..e303e6c6db7280a8779a711193a9f820dea2b70e
--- /dev/null
+++ b/docs/zh_CN/tutorials/gpen.md
@@ -0,0 +1,205 @@
+[English](../../en_US/tutorials/gpen.md) | 中文
+
+## GPEN 盲人脸修复模型
+
+
+## 1、简介
+
+GPEN模型是一个盲人脸修复模型。作者将前人提出的 StyleGAN V2 的解码器嵌入模型,作为GPEN的解码器;用DNN重新构建了一种简单的编码器,为解码器提供输入。这样模型在保留了 StyleGAN V2 解码器优秀的性能的基础上,将模型的功能由图像风格转换变为了盲人脸修复。模型的总体结构如下图所示:
+
+
+
+对模型更详细的介绍和参考repo,可查看以下AI Studio项目的最新版本:[GPEN盲人脸修复模型复现 - 飞桨AI Studio](https://aistudio.baidu.com/aistudio/projectdetail/3936241?contributionType=1)。
+
+
+
+
+## 2、准备工作
+
+### 2.1 数据集准备
+
+GPEN模型训练集是经典的FFHQ人脸数据集,共70000张1024 x 1024高分辨率的清晰人脸图片,测试集是CELEBA-HQ数据集,共2000张高分辨率人脸图片。详细信息可以参考**数据集网址:** [FFHQ](https://github.com/NVlabs/ffhq-dataset) ,[CELEBA-HQ](https://github.com/tkarras/progressive_growing_of_gans) 。以下给出了具体的下载链接:
+
+**原数据集下载地址:**
+
+**FFHQ :** https://drive.google.com/drive/folders/1tZUcXDBeOibC6jcMCtgRRz67pzrAHeHL?usp=drive_open
+
+**CELEBA-HQ:** https://drive.google.com/drive/folders/0B4qLcYyJmiz0TXY1NG02bzZVRGs?resourcekey=0-arAVTUfW9KRhN-irJchVKQ&usp=sharing
+
+
+
+由于FFHQ原数据集过大,也可以从以下链接下载256分辨率的FFHQ数据集:
+
+https://paddlegan.bj.bcebos.com/datasets/images256x256.tar
+
+
+
+**下载后,文件参考组织形式如下**
+
+```
+|-- data/GPEN
+ |-- ffhq/images256x256/
+ |-- 00000
+ |-- 00000.png
+ |-- 00001.png
+ |-- ......
+ |-- 00999.png
+ |-- 01000
+ |-- ......
+ |-- ......
+ |-- 69000
+ |-- ......
+ |-- 69999.png
+ |-- test
+ |-- 2000张png图片
+```
+
+请修改configs/gpen_256_ffhq.yaml配置文件中dataset的train和test的dataroot参数为你的训练集和测试集路径。
+
+
+
+### 2.2 模型准备
+
+**模型参数文件及训练日志下载地址:**
+
+链接:https://paddlegan.bj.bcebos.com/models/gpen.zip
+
+
+从链接中下载模型参数和测试图片,并放到项目根目录下的data/文件夹下,具体文件结构如下所示:
+
+**文件结构**
+
+
+```
+data/gpen/weights
+ |-- model_ir_se50.pdparams #计算id_loss需要加载的facenet的模型参数文件
+ |-- weight_pretrain.pdparams #256分辨率的包含生成器和判别器的模型参数文件,其中只有生成器的参数是训练好的参数,参数文件的格式与3.1训练过程中保存的参数文件格式相同。3.2、3.3.1、4.1也需要用到该参数文件
+data/gpen/lite_data
+```
+
+
+
+## 3、开始使用
+
+### 3.1 模型训练
+
+在控制台输入以下代码,开始训练:
+
+ ```shell
+ python tools/main.py -c configs/gpen_256_ffhq.yaml
+ ```
+
+模型只支持单卡训练。
+
+模型训练需使用paddle2.3及以上版本,且需等paddle实现elementwise_pow 的二阶算子相关功能,使用paddle2.2.2版本能正常运行,但因部分损失函数会求出错误梯度,导致模型无法训练成功。如训练时报错则暂不支持进行训练,可跳过训练部分,直接使用提供的模型参数进行测试。模型评估和测试使用paddle2.2.2及以上版本即可。
+
+
+
+### 3.2 模型评估
+
+对模型进行评估时,在控制台输入以下代码,下面代码中使用上面提到的下载的模型参数:
+
+ ```shell
+python tools/main.py -c configs/gpen_256_ffhq.yaml -o dataset.test.amount=2000 --load data/gpen/weights/weight_pretrain.pdparams --evaluate-only
+ ```
+
+如果要在自己提供的模型上进行测试,请修改 --load 后面的路径。
+
+
+
+### 3.3 模型预测
+
+#### 3.3.1 导出生成器权重
+
+训练结束后,需要使用 ``tools/extract_weight.py`` 来从训练模型(包含了生成器和判别器)中提取生成器的权重来给`applications/tools/gpen.py`进行推理,以实现GPEN模型的各种应用。输入以下命令来提取生成器的权重:
+
+```bash
+python tools/extract_weight.py data/gpen/weights/weight_pretrain.pdparams --net-name g_ema --output data/gpen/weights/g_ema.pdparams
+```
+
+
+
+#### 3.3.2 对单张图像进行处理
+
+提取完生成器的权重后,输入以下命令可对--test_img路径下图片进行测试。修改--seed参数,可生成不同的退化图像,展示出更丰富的效果。可修改--test_img后的路径为你想测试的任意图片。如--weight_path参数后不提供权重,则会自动下载训练好的模型权重进行测试。
+
+```bash
+python applications/tools/gpen.py --test_img data/gpen/lite_data/15006.png --seed=100 --weight_path data/gpen/weights/g_ema.pdparams --model_type gpen-ffhq-256
+```
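+
+也可以不指定 `--weight_path`(此时会自动下载训练好的模型权重),并更换 `--seed` 来查看不同退化程度下的修复效果,例如(seed取值仅为示例):
+
+```bash
+python applications/tools/gpen.py --test_img data/gpen/lite_data/15006.png --seed=200 --model_type gpen-ffhq-256
+```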
+
+以下是样例图片和对应的修复图像,从左到右依次是退化图像、生成的图像和原始清晰图像:
+
+
+
+
+
+
+输出示例如下:
+
+```
+result saved in : output_dir/gpen_predict.png
+ FID: 92.11730631094356
+ PSNR:19.014782083825743
+```
+
+
+
+## 4. Tipc
+
+### 4.1 导出inference模型
+
+```bash
+python tools/export_model.py -c configs/gpen_256_ffhq.yaml --inputs_size=1,3,256,256 --load data/gpen/weights/weight_pretrain.pdparams
+```
+
+上述命令将生成预测所需的模型结构文件`gpenmodel_g_ema.pdmodel`和模型权重文件`gpenmodel_g_ema.pdiparams`以及`gpenmodel_g_ema.pdiparams.info`文件,均存放在`inference_model/`目录下。也可以修改--load 后的参数为你想测试的模型参数文件。
+
+
+
+### 4.2 使用预测引擎推理
+
+```bash
+python tools/inference.py --model_type GPEN --seed 100 -c configs/gpen_256_ffhq.yaml -o dataset.test.dataroot="./data/gpen/lite_data/" --output_path test_tipc/output/ --model_path inference_model/gpenmodel_g_ema
+```
+
+推理结束后,默认会将模型生成的修复图像保存在test_tipc/output/GPEN目录下,并在test_tipc/output/GPEN/metric.txt中输出测试得到的FID值。
+
+
+默认输出如下:
+
+```
+Metric fid: 187.0158
+```
+
+注:由于对高清图片进行退化的操作具有一定的随机性,所以每次测试的结果都会有所不同。为了保证测试结果一致,在这里我固定了随机种子,使每次测试时对图片都进行相同的退化操作。
+
+
+
+### 4.3 调用脚本两步完成训推一体测试
+
+测试基本训练预测功能的`lite_train_lite_infer`模式,运行:
+
+```shell
+# 修正脚本文件格式
+sed -i 's/\r//' test_tipc/prepare.sh
+sed -i 's/\r//' test_tipc/test_train_inference_python.sh
+sed -i 's/\r//' test_tipc/common_func.sh
+# 准备数据
+bash test_tipc/prepare.sh ./test_tipc/configs/GPEN/train_infer_python.txt 'lite_train_lite_infer'
+# 运行测试
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/GPEN/train_infer_python.txt 'lite_train_lite_infer'
+```
+
+
+
+## 5、参考文献
+
+```
+@misc{2021GAN,
+ title={GAN Prior Embedded Network for Blind Face Restoration in the Wild},
+ author={ Yang, T. and Ren, P. and Xie, X. and Zhang, L. },
+ year={2021},
+ archivePrefix={CVPR},
+ primaryClass={cs.CV}
+}
+```
diff --git a/docs/zh_CN/tutorials/invdn.md b/docs/zh_CN/tutorials/invdn.md
new file mode 100644
index 0000000000000000000000000000000000000000..f2cb70f6c96d00207c8cb51cad76e845733d9b8f
--- /dev/null
+++ b/docs/zh_CN/tutorials/invdn.md
@@ -0,0 +1,117 @@
+[English](../../en_US/tutorials/invdn.md) | 中文
+
+# 可逆去噪网络(InvDN):真实噪声移除的一个轻量级方案
+
+**Invertible Denoising Network: A Light Solution for Real Noise Removal** (CVPR 2021) 论文复现
+
+官方源码:[https://github.com/Yang-Liu1082/InvDN](https://github.com/Yang-Liu1082/InvDN)
+
+论文地址:[https://arxiv.org/abs/2104.10546](https://arxiv.org/abs/2104.10546)
+
+## 1、简介
+
+InvDN利用可逆网络把噪声图片分成低解析度干净图片和高频潜在表示, 其中高频潜在表示中含有噪声信息和内容信息。由于可逆网络是无损的, 如果我们能够将高频表示中的噪声信息分离, 那么就可以将其和低解析度干净图片一起重构成原分辨率的干净图片。但实际上去除高频信息中的噪声是很困难的, 本文通过直接将带有噪声的高频潜在表示替换为在还原过程中从先验分布中采样的另一个表示,进而结合低解析度干净图片重构回原分辨率干净图片。本文所实现网络是轻量级的, 且效果较好。
+
+
+
+## 2 如何使用
+
+### 2.1 快速体验
+
+安装`PaddleGAN`之后进入`PaddleGAN`文件夹下,运行如下命令即生成修复后的图像`./output_dir/Denoising/image_name.png`
+
+```sh
+python applications/tools/invdn_denoising.py --images_path ${PATH_OF_IMAGE}
+```
+其中`PATH_OF_IMAGE`为你需要去噪的图像路径,或图像所在文件夹的路径。
+
+- 注意,作者原代码在测试时使用了蒙特卡洛自集成(Monte Carlo self-ensemble)以提高性能,但会拖慢速度。用户可以选择使用 `--disable_mc` 参数关闭蒙特卡洛自集成以提高速度(测试时默认开启蒙特卡洛自集成,而训练和验证时默认关闭)。
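+
+例如,关闭蒙特卡洛自集成以加快推理速度时,可参考如下命令(图像路径为示例占位):
+
+```sh
+python applications/tools/invdn_denoising.py --images_path ${PATH_OF_IMAGE} --disable_mc
+```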
+
+### 2.2 数据准备
+
+#### **训练数据**
+
+本文所使用的数据集为SIDD,其中训练集为 [SIDD-Medium](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php)。按照论文要求,需要将数据集处理为 $512 \times 512$ 的 patches。此外,本文训练时需要产生低分辨率版本的GT图像,其尺寸为 $128 \times 128$。将低分辨率图像记作LQ。
+
+已经处理好的数据,放在了 [Ai Studio](https://aistudio.baidu.com/aistudio/datasetdetail/172084) 里。
+
+训练数据放在:`data/SIDD_Medium_Srgb_Patches_512/train/` 下。
+
+#### **测试数据**
+
+验证集为 [SIDD_valid](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php)。官网下载的验证集为 `./ValidationNoisyBlocksSrgb.mat 和 ./ValidationGtBlocksSrgb.mat`,建议转换为 $.png$ 格式更为方便。
+
+已经转换好的数据,放在了 [Ai Studio](https://aistudio.baidu.com/aistudio/datasetdetail/172069) 里。
+
+验证集数据放在:`data/SIDD_Valid_Srgb_Patches_256/valid/` 下。
+
+- 经过处理之后,`PaddleGAN/data` 文件夹下的文件结构为
+```sh
+data
+├─ SIDD_Medium_Srgb_Patches_512
+│ └─ train
+│ ├─ GT
+│ │ 0_0.PNG
+│ │ ...
+│ ├─ LQ
+│ │ 0_0.PNG
+│ │ ...
+│ └─ Noisy
+│ 0_0.PNG
+│ ...
+│
+└─ SIDD_Valid_Srgb_Patches_256
+ └─ valid
+ ├─ GT
+ │ 0_0.PNG
+ │ ...
+ └─ Noisy
+ 0_0.PNG
+ ...
+```
+
+### 2.3 训练
+
+运行以下命令来快速开始训练:
+```sh
+python -u tools/main.py --config-file configs/invdn_denoising.yaml
+```
+- TIPS:
+在复现时,为了保证总epoch数目和论文配置相同,需要确保 `总batchsize * 总iters == 1卡 * 14(batchsize) * 600000(iters)`;同时,batchsize 改变时也要保持 `batchsize / learning_rate == 14 / 0.0002`。
+例如,在使用单机四卡、单卡 batchsize 设置为14时,实际总 batchsize 为 14*4,此时需要将总 iters 设置为150000,且学习率扩大到 8e-4。
+
+### 2.4 测试
+
+运行以下命令来快速开始测试:
+```sh
+python tools/main.py --config-file configs/invdn_denoising.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 结果展示
+
+去噪
+| 模型 | 数据集 | PSNR/SSIM |
+|---|---|---|
+| InvDN | SIDD | 39.29 / 0.956 |
+
+
+## 4 模型下载
+
+| 模型 | 下载地址 |
+|---|---|
+| InvDN| [InvDN_Denoising](https://paddlegan.bj.bcebos.com/models/InvDN_Denoising.pdparams) |
+
+
+
+# 参考文献
+
+- [https://arxiv.org/abs/2104.10546](https://arxiv.org/abs/2104.10546)
+
+```
+@article{liu2021invertible,
+ title={Invertible Denoising Network: A Light Solution for Real Noise Removal},
+ author={Liu, Yang and Qin, Zhenyue and Anwar, Saeed and Ji, Pan and Kim, Dongwoo and Caldwell, Sabrina and Gedeon, Tom},
+ journal={arXiv preprint arXiv:2104.10546},
+ year={2021}
+}
+```
diff --git a/docs/zh_CN/tutorials/lap_style.md b/docs/zh_CN/tutorials/lap_style.md
new file mode 100644
index 0000000000000000000000000000000000000000..7744ebb2a47cabf21d7778665ed7c48587775665
--- /dev/null
+++ b/docs/zh_CN/tutorials/lap_style.md
@@ -0,0 +1,116 @@
+
+# LapStyle
+
+ **LapStyle--拉普拉斯金字塔风格化网络**,是一种能够生成高质量风格化图的快速前馈风格化网络,能渐进地生成复杂的纹理迁移效果,同时能够在**512分辨率**下达到**100fps**的速度。可实现多种不同艺术风格的快速迁移,在艺术图像生成、滤镜等领域有广泛的应用。
+
+本文档提供CVPR2021论文"Drafting and Revision: Laplacian Pyramid Network for Fast High-Quality Artistic Style Transfer"的官方代码。
+
+## 1. 论文介绍
+
+艺术风格迁移的目的是将一个实例图像的艺术风格迁移到一个内容图像。目前,基于优化的方法已经取得了很好的合成质量,但昂贵的时间成本限制了其实际应用。
+
+同时,前馈方法仍然不能合成复杂风格,特别是存在全局和局部模式时。受绘制草图和修改细节这一常见绘画过程的启发,[论文](https://arxiv.org/pdf/2104.05376.pdf) 提出了一种新的前馈方法拉普拉斯金字塔网络(LapStyle)。
+
+LapStyle首先通过绘图网络(Drafting Network)传输低分辨率的全局风格模式。然后通过修正网络(Revision Network)对局部细节进行高分辨率的修正,它根据拉普拉斯滤波提取的图像纹理和草图产生图像残差。通过叠加具有多个拉普拉斯金字塔级别的修订网络,可以很容易地生成更高分辨率的细节。最终的样式化图像是通过聚合所有金字塔级别的输出得到的。论文还引入了一个补丁鉴别器,以更好地对抗的学习局部风格。实验表明,该方法能实时合成高质量的风格化图像,并能正确生成整体风格模式。
+
+
+
+## 2. 快速体验
+
+PaddleGAN为大家提供了四种不同艺术风格的预训练模型,风格预览如下:
+
+| 原图 | StarryNew | Stars | Ocean | Circuit |
+| :----------------------------------------------------------: | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+|
|
|
|
|
|
+
+4个风格图像下载地址如下:
+- [StarryNew](https://user-images.githubusercontent.com/79366697/118655415-1ec8c000-b81c-11eb-8002-90bf8d477860.png)
+- [Stars](https://user-images.githubusercontent.com/79366697/118655423-20928380-b81c-11eb-92bd-0deeb320ff14.png)
+- [Ocean](https://user-images.githubusercontent.com/79366697/118655407-1c666600-b81c-11eb-83a6-300ee1952415.png)
+- [Circuit](https://user-images.githubusercontent.com/79366697/118655399-196b7580-b81c-11eb-8bc5-d5ece80c18ba.jpg)
+
+只需运行下面的代码即可迁移至指定风格:
+
+```
+python applications/tools/lapstyle.py --content_img_path ${PATH_OF_CONTENT_IMG} --style_image_path ${PATH_OF_STYLE_IMG}
+```
+### **参数**
+
+- `--content_img_path (str)`: 输入的内容图像路径。
+- `--style_image_path (str)`: 输入的风格图像路径。
+- `--output_path (str)`: 输出的图像路径,默认为`output_dir`。
+- `--weight_path (str)`: 模型权重路径,设置`None`时会自行下载预训练模型,默认为`None`。
+- `--style (str)`: 生成图像风格,当`weight_path`为`None`时,可以在`starrynew`, `circuit`, `ocean` 和 `stars`中选择,默认为`starrynew`。
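+
+例如,不单独指定权重、直接使用内置的 `ocean` 风格预训练模型时,可参考如下命令(图片路径为占位示例):
+
+```
+python applications/tools/lapstyle.py --content_img_path ${PATH_OF_CONTENT_IMG} --style_image_path ${PATH_OF_STYLE_IMG} --style ocean
+```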
+
+## 3. 模型训练
+
+配置文件参数详情:[Config文件使用说明](../config_doc.md)
+
+### 3.1 数据准备
+
+为了训练LapStyle,我们使用COCO数据集作为内容图像数据集。您可以从[starrynew](https://user-images.githubusercontent.com/79366697/118655415-1ec8c000-b81c-11eb-8002-90bf8d477860.png),[ocean](https://user-images.githubusercontent.com/79366697/118655407-1c666600-b81c-11eb-83a6-300ee1952415.png),[stars](https://user-images.githubusercontent.com/79366697/118655423-20928380-b81c-11eb-92bd-0deeb320ff14.png)或[circuit](https://user-images.githubusercontent.com/79366697/118655399-196b7580-b81c-11eb-8bc5-d5ece80c18ba.jpg)中选择一张风格图片,也可以任意选择您喜欢的图片作为风格图片。在开始训练与测试之前,记得修改配置文件的数据路径。
+
+### 3.2 训练
+
+示例以COCO数据为例。如果您想使用自己的数据集,可以在配置文件中修改数据集为您自己的数据集。
+
+
+

+
+
+
+
+**注意,LapStyle模型训练暂时不支持Windows系统。**
+
+(1) 首先在128*128像素下训练LapStyle的绘图网络(**Drafting Network**):
+```
+python -u tools/main.py --config-file configs/lapstyle_draft.yaml
+```
+
+(2) 然后,在256*256像素下训练LapStyle的修正网络(**Revision Network**):
+```
+python -u tools/main.py --config-file configs/lapstyle_rev_first.yaml --load ${PATH_OF_LAST_STAGE_WEIGHT}
+```
+
+(3) 最后,在512*512像素下再次训练LapStyle的修正网络(**Revision Network**):
+```
+python -u tools/main.py --config-file configs/lapstyle_rev_second.yaml --load ${PATH_OF_LAST_STAGE_WEIGHT}
+```
+
+### 3.3 测试
+
+测试时需要将配置文件中的`validate/save_img`参数改成`True`以保存输出图像。
+测试训练好的模型,您可以直接测试 "lapstyle_rev_second",因为它包含了之前步骤里的训练权重:
+```
+python tools/main.py --config-file configs/lapstyle_rev_second.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 4. 结果展示
+
+| Style | Stylized Results |
+| --- | --- |
+|  | |
+|  | |
+|  | |
+|  | |
+
+
+## 5. 模型下载
+
+PaddleGAN中提供四个风格的预训练模型下载:
+
+| 模型 | 风格 | 下载地址 |
+|---|---|---|
+| lapstyle_circuit | circuit | [lapstyle_circuit](https://paddlegan.bj.bcebos.com/models/lapstyle_circuit.pdparams)
+| lapstyle_ocean | ocean | [lapstyle_ocean](https://paddlegan.bj.bcebos.com/models/lapstyle_ocean.pdparams)
+| lapstyle_starrynew | starrynew | [lapstyle_starrynew](https://paddlegan.bj.bcebos.com/models/lapstyle_starrynew.pdparams)
+| lapstyle_stars | stars | [lapstyle_stars](https://paddlegan.bj.bcebos.com/models/lapstyle_stars.pdparams)
+
+
+# References
+
+```
+@article{lin2021drafting,
+ title={Drafting and Revision: Laplacian Pyramid Network for Fast High-Quality Artistic Style Transfer},
+ author={Lin, Tianwei and Ma, Zhuoqi and Li, Fu and He, Dongliang and Li, Xin and Ding, Errui and Wang, Nannan and Li, Jie and Gao, Xinbo},
+ booktitle={Computer Vision and Pattern Recognition (CVPR)},
+ year={2021}
+}
+```
diff --git a/docs/zh_CN/tutorials/motion_driving.md b/docs/zh_CN/tutorials/motion_driving.md
index 7439160b747038503af07d7fd62ca02c37dde033..02bee44515227c6d3718c6539d1b45a565bb5dee 100644
--- a/docs/zh_CN/tutorials/motion_driving.md
+++ b/docs/zh_CN/tutorials/motion_driving.md
@@ -1,40 +1,189 @@
-# First order motion model
-## First order motion model原理
-First order motion model的任务是image animation,给定一张源图片,给定一个驱动视频,生成一段视频,其中主角是源图片,动作是驱动视频中的动作。如下图所示,源图像通常包含一个主体,驱动视频包含一系列动作。
+# First Order Motion
+
+First Order Motion的任务是图像动画/Image Animation,即输入为一张源图片和一个驱动视频,源图片中的人物则会做出驱动视频中的动作。如下图所示,源图像通常包含一个主体,驱动视频包含一系列动作。
-
以左上角的人脸表情迁移为例,给定一个源人物,给定一个驱动视频,可以生成一个视频,其中主体是源人物,视频中源人物的表情是由驱动视频中的表情所确定的。通常情况下,我们需要对源人物进行人脸关键点标注、进行表情迁移的模型训练。
-但是这篇文章提出的方法只需要在同类别物体的数据集上进行训练即可,比如实现太极动作迁移就用太极视频数据集进行训练,想要达到表情迁移的效果就使用人脸视频数据集voxceleb进行训练。训练好后,我们使用对应的预训练模型就可以达到前言中实时image animation的操作。
+用下图可以简单阐述其中原理:
+
+
+

+
+
+不仅可以做脸部表情动作迁移,这篇文章提出的方法只需要在同类别物体的数据集上进行训练即可,比如实现太极动作迁移就用太极视频数据集进行训练,想要达到表情迁移的效果就使用人脸视频数据集voxceleb进行训练。训练好后,我们使用对应的预训练模型就可以达到前言中实时image animation的操作。
+
+## 特点
+
+- #### 支持多人脸同时驱动
+
+ - **独家引入人脸检测算法,自动检测多人脸,实现多人脸表情同时驱动。**
+
+ - 使用PaddleGAN提供的[人脸检测算法S3FD](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/ppgan/faceutils/face_detection/detection),将照片中多个人脸检测出来并进行表情迁移,实现多人同时换脸。
+
+ 具体技术原理:
+
+ 1. 使用S3FD人脸检测模型将照片中的每张人脸检测出来并抠出
+ 2. 使用First Order Motion模型对抠出的每张人脸进行脸部表情迁移
+ 3. 将完成表情迁移的人脸进行适当剪裁后贴回原照片位置
+
+ 同时,PaddleGAN针对人脸的相关处理提供[faceutil工具](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/ppgan/faceutils),包括人脸检测、五官分割、关键点检测等能力。
+
+- #### 新增人脸增强效果
+
+ - **人脸增强特效使得驱动后的视频中人脸清晰度大大提升。**
+
+- #### 丰富的在线体验应用
+
+ - 🐜**蚂蚁呀嘿**🐜:https://aistudio.baidu.com/aistudio/projectdetail/1603391
+ - 💙**520告白特辑**💙:https://aistudio.baidu.com/aistudio/projectdetail/1956943
+ - **复刻故人的微笑(▰˘◡˘▰)**:https://aistudio.baidu.com/aistudio/projectdetail/1660701
+ - 👨**父亲节特辑**:https://aistudio.baidu.com/aistudio/projectdetail/2068655
## 使用方法
+### 1. 快速体验:人脸检测与效果增强
+用户可上传一张单人/多人照片与驱动视频,并在如下命令中的`source_image`参数和`driving_video`参数分别换成自己的图片和视频路径,然后运行如下命令,即可完成单人/多人脸动作表情迁移,运行结果为命名为result.mp4的视频文件,保存在output文件夹中。
+
+注意:使用多人脸时,尽量使用人脸间距较大的照片,效果更佳,也可通过手动调节ratio进行效果优化。
+
+本项目中提供了原始图片和驱动视频供展示使用,运行的命令如下:
-用户可以上传自己准备的视频和图片,并在如下命令中的source_image参数和driving_video参数分别换成自己的图片和视频路径,然后运行如下命令,就可以完成动作表情迁移,程序运行成功后,会在ouput文件夹生成名为result.mp4的视频文件,该文件即为动作迁移后的视频。本项目中提供了原始图片和驱动视频供展示使用。运行的命令如下所示:
+#### 运行命令如下:
```
cd applications/
python -u tools/first-order-demo.py \
--driving_video ../docs/imgs/fom_dv.mp4 \
--source_image ../docs/imgs/fom_source_image.png \
- --relative --adapt_scale
+ --ratio 0.4 \
+ --relative \
+ --adapt_scale \
+ --image_size 512 \
+ --face_enhancement \
+ --multi_person
```
+#### 参数说明:
+
+
+| 参数 | 使用说明 |
+| ---------------- | ------------------------------------------------------------ |
+| driving_video | 驱动视频,视频中人物的表情动作作为待迁移的对象。 |
+| source_image | 原始图片,支持单人图片和多人图片,视频中人物的表情动作将迁移到该原始图片中的人物上。 |
+| relative | 指示程序中使用视频和图片中人物关键点的相对坐标还是绝对坐标,建议使用相对坐标,若使用绝对坐标,会导致迁移后人物扭曲变形。 |
+| adapt_scale | 根据关键点凸包自适应运动尺度。 |
+| ratio | 贴回驱动生成的人脸区域占原图的比例,用户需要根据生成的效果调整该参数,尤其是多人脸距离比较近的情况下需要调整该参数,默认为0.4,调整范围是[0.4, 0.5]。 |
+| image_size | 图片中人脸的大小,默认为256,可设置为512。 |
+| face_enhancement | 添加人脸增强,不添加参数默认为不使用增强功能 |
+| multi_person | 当图片中有多张人脸请添加此参数,不加则默认为单人脸 |
+
+#### 📣新增脸部增强功能
+
+| 人脸增强前 | 人脸增强后 |
+| :----------------------------------------------------------: | :----------------------------------------------------------: |
+|
|
|
+
+### 2. 模型训练
+#### **数据集:**
+
+- fashion 可以参考[这里](https://vision.cs.ubc.ca/datasets/fashion/)
+- VoxCeleb 可以参考[这里](https://github.com/AliaksandrSiarohin/video-preprocessing). 将数据按照需求处理为想要的大小,即可开始训练,这里我们处理了256和512两种分辨率大小,结果对比如下:
+
-**参数说明:**
-- driving_video: 驱动视频,视频中人物的表情动作作为待迁移的对象
-- source_image: 原始图片,视频中人物的表情动作将迁移到该原始图片中的人物上
-- relative: 指示程序中使用视频和图片中人物关键点的相对坐标还是绝对坐标,建议使用相对坐标,若使用绝对坐标,会导致迁移后人物扭曲变形
-- adapt_scale: 根据关键点凸包自适应运动尺度
+#### **参数说明:**
+- dataset_name.yaml: 需要配置自己的yaml文件及参数
+
+- GPU单卡训练:
+```
+export CUDA_VISIBLE_DEVICES=0
+python tools/main.py --config-file configs/dataset_name.yaml
+```
+- GPU多卡训练:
+需要将 `ppgan/modules/first_order.py` 中的 nn.BatchNorm 改为 nn.SyncBatchNorm
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch \
+ tools/main.py \
+ --config-file configs/dataset_name.yaml \
+
+```
+
+**例如:**
+- GPU单卡训练:
+```
+export CUDA_VISIBLE_DEVICES=0
+python tools/main.py --config-file configs/firstorder_fashion.yaml
+```
+- GPU多卡训练:
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python -m paddle.distributed.launch \
+ tools/main.py \
+ --config-file configs/firstorder_fashion.yaml \
+```
## 生成结果展示
-
+
+

+
+
+
+### 3. 模型压缩
+
+**预测:**
+```
+cd applications/
+python -u tools/first-order-demo.py \
+ --driving_video ../docs/imgs/mayiyahei.MP4 \
+ --source_image ../docs/imgs/father_23.jpg \
+ --config ../configs/firstorder_vox_mobile_256.yaml \
+ --ratio 0.4 \
+ --relative \
+ --adapt_scale \
+ --mobile_net
+```
+目前压缩采用mobilenet+剪枝的方法,和之前对比:
+
+| | 大小(M) | l1 loss |
+| :--------------: | :--------------: | :-----------------: |
+| 原始 | 229 | 0.041781392 |
+| 压缩 | 10.1 | 0.047878753 |
+
+face keypoints 的评估指标参见(https://github.com/AliaksandrSiarohin/pose-evaluation)
+
+**训练:**
+先将configs/firstorder_vox_mobile_256.yaml 中的mode设置成kp_detector,固定原始generator模型,训练压缩版的kp_detector模型;然后将mode设置成generator,固定原始kp_detector模型,训练压缩版的generator模型;最后将mode设置为both,并将配置文件中的kp_weight_path和gen_weight_path修改为已经训练好的模型路径,一起训练。
+```
+export CUDA_VISIBLE_DEVICES=0
+python tools/main.py --config-file configs/firstorder_vox_mobile_256.yaml
+```
+### 4. 模型部署
+#### 4.1 导出模型
+使用`tools/fom_export.py`脚本导出模型以及部署时使用的配置文件,配置文件名字为`firstorder_vox_mobile_256.yml`。模型导出脚本如下:
+```bash
+# 导出FOM模型
+
+python tools/export_model.py \
+ --config-file configs/firstorder_vox_mobile_256.yaml \
+ --load /root/.cache/ppgan/vox_mobile.pdparams \
+ --inputs_size "1,3,256,256;1,3,256,256;1,10,2;1,10,2,2" \
+ --export_model output_inference/
+```
+预测模型会导出到`output_inference/fom_dy2st/`目录下,分别为`model.pdiparams`, `model.pdiparams.info`, `model.pdmodel`。
+- [预训练模型](https://paddlegan.bj.bcebos.com/applications/first_order_model/paddle_lite/inference/lite.zip)
+
+#### 4.2 PaddleLite部署
+- [使用Paddle Lite部署FOM模型](https://github.com/PaddlePaddle/PaddleGAN/tree/develop/deploy/lite)
+- [FOM-Lite-Demo](https://paddlegan.bj.bcebos.com/applications/first_order_model/paddle_lite/apk/fom_demo.zip)。更多内容,请参考[Paddle-Lite](https://github.com/PaddlePaddle/Paddle-Lite)
+
+目前存在的问题:
+- (a) Paddle Lite运行效果略差于Paddle Inference,正在优化中
+- (b) 单线程跑Generator时,帧数多了会跑到小核而不跑大核
+
## 参考文献
diff --git a/docs/zh_CN/tutorials/mpr_net.md b/docs/zh_CN/tutorials/mpr_net.md
new file mode 100644
index 0000000000000000000000000000000000000000..ca57ca967a6fde5589709119f502a965b80dc287
--- /dev/null
+++ b/docs/zh_CN/tutorials/mpr_net.md
@@ -0,0 +1,125 @@
+# MPR_Net
+
+## 1 原理介绍
+
+[MPR_Net](https://arxiv.org/abs/2102.02808)是发表在CVPR2021的一篇图像修复方法。图像修复任务需要在恢复图像时在空间细节和高级上下文信息之间实现复杂的平衡。MPR_Net提出了一种新颖的协同设计,可以最佳地平衡这些相互竞争的目标。其中主要提议是一个多阶段架构,它逐步学习退化输入的恢复函数,从而将整个恢复过程分解为更易于管理的步骤。具体来说,MPR_Net首先使用编码器-解码器架构学习上下文特征,然后将它们与保留本地信息的高分辨率分支相结合。在每个阶段引入了一种新颖的每像素自适应设计,利用原位监督注意力来重新加权局部特征。这种多阶段架构的一个关键要素是不同阶段之间的信息交换。为此,MPR_Net提出了一种双向方法,其中信息不仅从早期到后期按顺序交换,而且特征处理块之间也存在横向连接以避免任何信息丢失。由此产生的紧密互连的多阶段架构,称为MPRNet,在包括图像去雨、去模糊和去噪在内的一系列任务中,在十个数据集上提供了强大的性能提升。
+
+## 2 如何使用
+
+### 2.1 快速体验
+
+安装`PaddleGAN`之后运行如下代码即生成修复后的图像`output_dir/Deblurring/image_name.png`,其中`task`为你想要修复的任务,可以在`Deblurring`、`Denoising`和`Deraining`中选择,`PATH_OF_IMAGE`为你需要转换的图像路径。
+
+```python
+from ppgan.apps import MPRPredictor
+predictor = MPRPredictor(task='Deblurring')
+predictor.run(PATH_OF_IMAGE)
+```
+
+或者在终端中运行如下命令,也可获得相同结果:
+
+```sh
+python applications/tools/mprnet.py --input_image ${PATH_OF_IMAGE} --task Deblurring
+```
+其中`task`为你想要修复的任务,可以在`Deblurring`、`Denoising`和`Deraining`中选择,`PATH_OF_IMAGE`为你需要转换的图像路径。
+
+### 2.2 数据准备
+
+Deblurring训练数据是GoPro,用于去模糊的GoPro数据集由3214张1,280×720大小的模糊图像组成,这些图像分为2103张训练图像和1111张测试图像。可以从[这里](https://drive.google.com/file/d/1H0PIXvJH4c40pk7ou6nAwoxuR4Qh_Sa2/view?usp=sharing)下载。
+下载后解压到data目录下,解压完成后数据分布如下所示:
+
+```sh
+GoPro
+├── train
+│ ├── input
+│ └── target
+└── test
+ ├── input
+ └── target
+
+```
+
+Denoising训练数据是SIDD,一个图像去噪数据集,包含来自10个不同光照条件下的3万幅噪声图像,可以从[训练数据集下载](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php)和[测试数据集下载](https://drive.google.com/drive/folders/1S44fHXaVxAYW3KLNxK41NYCnyX9S79su)下载。
+下载后解压到data目录下,解压完成后数据分布如下所示:
+
+```sh
+SIDD
+├── train
+│ ├── input
+│ └── target
+└── val
+ ├── input
+ └── target
+
+```
+
+Deraining训练数据是Synthetic Rain Datasets,由13712张从多个数据集(Rain14000, Rain1800, Rain800, Rain12)收集的干净雨图像对组成,可以从[训练数据集下载](https://drive.google.com/drive/folders/1Hnnlc5kI0v9_BtfMytC2LR5VpLAFZtVe)和[测试数据集下载](https://drive.google.com/drive/folders/1PDWggNh8ylevFmrjo-JEvlmqsDlWWvZs)下载。
+下载后解压到data目录下,解压完成后数据分布如下所示:
+
+```sh
+Synthetic_Rain_Datasets
+├── train
+│ ├── input
+│ └── target
+└── test
+ ├── Test100
+ ├── Rain100H
+ ├── Rain100L
+ ├── Test1200
+ └── Test2800
+
+```
+
+### 2.3 训练
+ 示例以训练Deblurring的数据为例。如果想训练其他任务可以通过替换配置文件。
+
+ ```sh
+ python -u tools/main.py --config-file configs/mprnet_deblurring.yaml
+ ```
+
+### 2.4 测试
+
+测试模型:
+```sh
+python tools/main.py --config-file configs/mprnet_deblurring.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 结果展示
+
+去模糊
+| 模型 | 数据集 | PSNR/SSIM |
+|---|---|---|
+| MPRNet | GoPro | 33.4360/0.9410 |
+
+去噪
+| 模型 | 数据集 | PSNR/SSIM |
+|---|---|---|
+| MPRNet | SIDD | 43.6100 / 0.9586 |
+
+去雨
+| 模型 | 数据集 | PSNR/SSIM |
+|---|---|---|
+| MPRNet | Rain100L | 36.2848 / 0.9651 |
+
+
+## 4 模型下载
+
+| 模型 | 下载地址 |
+|---|---|
+| MPR_Deblurring | [MPR_Deblurring](https://paddlegan.bj.bcebos.com/models/MPR_Deblurring.pdparams) |
+| MPR_Denoising | [MPR_Denoising](https://paddlegan.bj.bcebos.com/models/MPR_Denoising.pdparams) |
+| MPR_Deraining | [MPR_Deraining](https://paddlegan.bj.bcebos.com/models/MPR_Deraining.pdparams) |
+
+
+# 参考文献
+
+- [Multi-Stage Progressive Image Restoration](https://arxiv.org/abs/2102.02808)
+
+ ```
+ @inproceedings{zamir2021multi,
+ title={Multi-Stage Progressive Image Restoration},
+ author={Syed Waqas Zamir and Aditya Arora and Salman Khan and Munawar Hayat and Fahad Shahbaz Khan and Ming-Hsuan Yang and Ling Shao},
+ booktitle={CVPR},
+ year={2021}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/nafnet.md b/docs/zh_CN/tutorials/nafnet.md
new file mode 100644
index 0000000000000000000000000000000000000000..c193d2c10e3c4124c9ebe20519be3828d46b491d
--- /dev/null
+++ b/docs/zh_CN/tutorials/nafnet.md
@@ -0,0 +1,87 @@
+[English](../../en_US/tutorials/nafnet.md) | 中文
+
+# NAFNet:图像恢复的简单基线
+
+## 1、简介
+
+NAFNet提出一种超简基线方案Baseline,它不仅计算高效,性能也优于之前的SOTA方案;在该Baseline基础上进一步简化得到了NAFNet:移除了非线性激活单元且性能进一步提升。所提方案在SIDD降噪与GoPro去模糊任务上均达到了新的SOTA性能,同时计算量大幅降低。网络设计和特点如下图所示:采用带跳过连接的UNet作为整体架构,同时修改了Restormer块中的Transformer模块,取消了激活函数,采用更简单有效的SimpleGate设计,并运用更简单的通道注意力机制。
+
+
+
+对模型更详细的介绍,可参考论文原文[Simple Baselines for Image Restoration](https://arxiv.org/pdf/2204.04676)。PaddleGAN中目前提供去噪任务的权重。
+
+## 2 如何使用
+
+### 2.1 快速体验
+
+安装`PaddleGAN`之后进入`PaddleGAN`文件夹下,运行如下命令即生成修复后的图像`./output_dir/Denoising/image_name.png`
+
+```sh
+python applications/tools/nafnet_denoising.py --images_path ${PATH_OF_IMAGE}
+```
+其中`PATH_OF_IMAGE`为你需要去噪的图像路径,或图像所在文件夹的路径。若需要使用自己的模型权重,则运行如下命令,其中`PATH_OF_MODEL`为模型权重的路径
+
+```sh
+python applications/tools/nafnet_denoising.py --images_path ${PATH_OF_IMAGE} --weight_path ${PATH_OF_MODEL}
+```
+
+### 2.2 数据准备
+
+Denoising训练数据是SIDD,一个图像去噪数据集,包含来自10个不同光照条件下的3万幅噪声图像,可以从[训练数据集下载](https://www.eecs.yorku.ca/~kamel/sidd/dataset.php)和[测试数据集下载](https://drive.google.com/drive/folders/1S44fHXaVxAYW3KLNxK41NYCnyX9S79su)下载。
+下载后解压到data目录下,解压完成后数据分布如下所示:
+
+```sh
+SIDD
+├── train
+│ ├── input
+│ └── target
+└── val
+ ├── input
+ └── target
+
+```
+用户也可以使用AI studio上的[SIDD数据](https://aistudio.baidu.com/aistudio/datasetdetail/149460),但需要将文件夹`input_crops`与`gt_crops`重命名为`input`和`target`
+
+### 2.3 训练
+示例以训练Denoising的数据为例。如果想训练其他任务可以更换数据集并修改配置文件
+
+```sh
+python -u tools/main.py --config-file configs/nafnet_denoising.yaml
+```
+
+### 2.4 测试
+
+测试模型:
+```sh
+python tools/main.py --config-file configs/nafnet_denoising.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 结果展示
+
+去噪
+| 模型 | 数据集 | PSNR/SSIM |
+|---|---|---|
+| NAFNet | SIDD Val | 43.1468 / 0.9563 |
+
+## 4 模型下载
+
+| 模型 | 下载地址 |
+|---|---|
+| NAFNet| [NAFNet_Denoising](https://paddlegan.bj.bcebos.com/models/NAFNet_Denoising.pdparams) |
+
+
+
+# 参考文献
+
+- [Simple Baselines for Image Restoration](https://arxiv.org/pdf/2204.04676)
+
+```
+@article{chen_simple_nodate,
+ title = {Simple {Baselines} for {Image} {Restoration}},
+ abstract = {Although there have been significant advances in the field of image restoration recently, the system complexity of the state-of-the-art (SOTA) methods is increasing as well, which may hinder the convenient analysis and comparison of methods. In this paper, we propose a simple baseline that exceeds the SOTA methods and is computationally efficient. To further simplify the baseline, we reveal that the nonlinear activation functions, e.g. Sigmoid, ReLU, GELU, Softmax, etc. are not necessary: they could be replaced by multiplication or removed. Thus, we derive a Nonlinear Activation Free Network, namely NAFNet, from the baseline. SOTA results are achieved on various challenging benchmarks, e.g. 33.69 dB PSNR on GoPro (for image deblurring), exceeding the previous SOTA 0.38 dB with only 8.4\% of its computational costs; 40.30 dB PSNR on SIDD (for image denoising), exceeding the previous SOTA 0.28 dB with less than half of its computational costs. The code and the pretrained models will be released at github.com/megvii-research/NAFNet.},
+ language = {en},
+ author = {Chen, Liangyu and Chu, Xiaojie and Zhang, Xiangyu and Sun, Jian},
+ pages = {17}
+}
+```
+
diff --git a/docs/zh_CN/tutorials/photopen.md b/docs/zh_CN/tutorials/photopen.md
new file mode 100644
index 0000000000000000000000000000000000000000..050454b3c9d44c471bbeee27083b18fe3058d072
--- /dev/null
+++ b/docs/zh_CN/tutorials/photopen.md
@@ -0,0 +1,104 @@
+# GauGAN(加SimAM注意力的改进版)
+
+## 1.简介:
+
+本应用的模型出自论文《Semantic Image Synthesis with Spatially-Adaptive Normalization》,是一个像素风格迁移网络 Pix2PixHD,能够根据输入的语义分割标签生成照片风格的图片。为了解决模型归一化层导致标签语义信息丢失的问题,论文作者向 Pix2PixHD 的生成器网络中添加了 SPADE(Spatially-Adaptive Normalization)空间自适应归一化模块,通过两个卷积层保留了归一化时训练的缩放与偏置参数的空间维度,以增强生成图片的质量。
+
+
+
+此模型在 GauGAN 的 SPADE 模块上添加了无参的 SimAM 注意力模块,增强了生成图片的立体质感。
+
+
+
+## 2.快速体验
+
+预训练模型可以从如下地址下载: (https://paddlegan.bj.bcebos.com/models/photopen.pdparams)
+
+输入一张png格式的语义标签图片给模型,输出一张按标签语义生成的照片风格的图片。预测代码如下:
+
+```
+python applications/tools/photopen.py \
+ --semantic_label_path test/sem.png \
+ --weight_path test/n_g.pdparams \
+ --output_path test/pic.jpg \
+ --config-file configs/photopen.yaml
+```
+
+**参数说明:**
+* semantic_label_path:输入的语义标签路径,为png图片文件
+* weight_path:训练完成的模型权重存储路径,为 statedict 格式(.pdparams)的 Paddle 模型权重文件
+* output_path:预测生成图片的存储路径
+* config-file:存储参数设定的yaml文件存储路径,与训练过程使用同一个yaml文件,预测参数由 predict 下字段设定
+
+## 3.训练
+
+**数据准备:**
+
+数据集目录结构如下:
+
+```
+└─coco_stuff
+ ├─train_img
+ └─train_inst
+```
+
+coco_stuff 是数据集根目录可任意改变,其下的 train_img 子目录存放训练用的风景图片(一般jpg格式),train_inst 子目录下存放与风景图片文件名一一对应、尺寸相同的语义标签图片(一般png格式)。
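+
+训练前可以用下面的小脚本粗略检查图片与标签文件名是否一一对应(目录名取自上述示例结构,仅作示意):
+
+```python
+import os
+
+# 以上述示例目录结构为例,检查风景图片与语义标签是否一一对应
+img_dir = 'coco_stuff/train_img'
+inst_dir = 'coco_stuff/train_inst'
+
+img_names = {os.path.splitext(f)[0] for f in os.listdir(img_dir)}
+inst_names = {os.path.splitext(f)[0] for f in os.listdir(inst_dir)}
+
+missing = sorted(img_names - inst_names)
+print('缺少对应标签的图片数量:', len(missing))
+```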
+
+### 3.1 gpu 单卡训练
+
+`python -u tools/main.py --config-file configs/photopen.yaml`
+
+* config-file:训练使用的超参设置 yaml 文件的存储路径
+
+### 3.2 gpu 多卡训练
+
+```
+!python -m paddle.distributed.launch \
+ tools/main.py \
+ --config-file configs/photopen.yaml \
+ -o model.generator.norm_G=spectralspadesyncbatch3x3 \
+ model.batchSize=4 \
+ dataset.train.batch_size=4
+```
+
+* config-file:训练使用的超参设置 yaml 文件的存储路径
+* model.generator.norm_G:设置使用 syncbatch 归一化,使多个 GPU 中的数据一起进行归一化
+* model.batchSize:设置模型的 batch size,一般为 GPU 个数的整倍数
+* dataset.train.batch_size:设置数据读取的 batch size,要和模型的 batch size 一致
+
+### 3.3 继续训练
+
+`python -u tools/main.py --config-file configs/photopen.yaml --resume output_dir\photopen-2021-09-30-15-59\iter_3_checkpoint.pdparams`
+
+* config-file:训练使用的超参设置 yaml 文件的存储路径
+* resume:指定读取的 checkpoint 路径
+
+## 4.模型效果展示
+
+
+
+## 5.参考
+
+```
+@inproceedings{park2019SPADE,
+ title={Semantic Image Synthesis with Spatially-Adaptive Normalization},
+ author={Park, Taesung and Liu, Ming-Yu and Wang, Ting-Chun and Zhu, Jun-Yan},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ year={2019}
+}
+
+@InProceedings{pmlr-v139-yang21o,
+ title = {SimAM: A Simple, Parameter-Free Attention Module for Convolutional Neural Networks},
+ author = {Yang, Lingxiao and Zhang, Ru-Yuan and Li, Lida and Xie, Xiaohua},
+ booktitle = {Proceedings of the 38th International Conference on Machine Learning},
+ pages = {11863--11874},
+ year = {2021},
+ editor = {Meila, Marina and Zhang, Tong},
+ volume = {139},
+ series = {Proceedings of Machine Learning Research},
+ month = {18--24 Jul},
+ publisher = {PMLR},
+ pdf = {http://proceedings.mlr.press/v139/yang21o/yang21o.pdf},
+ url = {http://proceedings.mlr.press/v139/yang21o.html}
+}
+```
diff --git a/docs/zh_CN/tutorials/pix2pix_cyclegan.md b/docs/zh_CN/tutorials/pix2pix_cyclegan.md
index 5a211306f6be023e202a42f04949a258d7a27b65..a121cfdf4504e74bf2b3aca35a5f81d8daff5f23 100644
--- a/docs/zh_CN/tutorials/pix2pix_cyclegan.md
+++ b/docs/zh_CN/tutorials/pix2pix_cyclegan.md
@@ -44,6 +44,7 @@
| 模型 | 数据集 | 下载地址 |
|---|---|---|
| Pix2Pix_cityscapes | cityscapes | [Pix2Pix_cityscapes](https://paddlegan.bj.bcebos.com/models/Pix2Pix_cityscapes.pdparams)
+| Pix2Pix_facedes | facades | [Pix2Pix_facades](https://paddlegan.bj.bcebos.com/models/Pixel2Pixel_facades.pdparams)
# 2 CycleGAN
diff --git a/docs/zh_CN/tutorials/pixel2style2pixel.md b/docs/zh_CN/tutorials/pixel2style2pixel.md
index 09319653d0146555b3c37ab454c8199704d9b8be..83e71b17c6c629404795838fc654927039e8467b 100644
--- a/docs/zh_CN/tutorials/pixel2style2pixel.md
+++ b/docs/zh_CN/tutorials/pixel2style2pixel.md
@@ -27,7 +27,7 @@ Pixel2Style2Pixel使用相当大的模型对图像进行编码,将图像编码
```
cd applications/
-python -u tools/styleganv2.py \
+python -u tools/pixel2style2pixel.py \
--input_image <替换为输入的图像路径> \
--output_path <替换为生成图片存放的文件夹> \
--weight_path <替换为你的预训练模型路径> \
diff --git a/docs/zh_CN/tutorials/prenet.md b/docs/zh_CN/tutorials/prenet.md
new file mode 100644
index 0000000000000000000000000000000000000000..65a2b64a1d383bd9750b1862ed3c8cc532b3db1a
--- /dev/null
+++ b/docs/zh_CN/tutorials/prenet.md
@@ -0,0 +1,98 @@
+# PReNet
+
+## 1 简介
+Progressive Image Deraining Networks: A Better and Simpler Baseline提出一种多阶段渐进的残差网络,每一个阶段都是resnet,每一res块的输入为上一res块输出和原始雨图,另外采用使用SSIM损失进行训练,进一步提升了网络的性能,网络总体简洁高效,在各种数据集上表现良好,为图像去雨提供了一个很好的基准。
+
+

+
+
+## 2 如何使用
+
+### 2.1 数据准备
+
+ 数据集(RainH.zip) 可以在[此处](https://pan.baidu.com/s/1_vxCatOV3sOA6Vkx1l23eA?pwd=vitu)下载,将其解压到./data路径下。
+
+ 数据集文件结构如下:
+
+ ```
+ ├── RainH
+ ├── RainTrainH
+ | ├── rain
+ | | ├── 1.png
+ | | └── 2.png
+ | | .
+ | | .
+ | └── norain
+ | ├── 1.png
+ | └── 2.png
+ | .
+ | .
+ └── Rain100H
+ ├── rain
+ | ├── 001.png
+ | └── 002.png
+ | .
+ | .
+ └── norain
+ ├── 001.png
+ └── 002.png
+ .
+ .
+ ```
+
+### 2.2 训练和测试
+
+
+ 训练模型:
+ ```
+ python -u tools/main.py --config-file configs/prenet.yaml
+ ```
+
+ 测试模型:
+ ```
+ python tools/main.py --config-file configs/prenet.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+## 3 实验结果展示
+实验数值结果是在 RGB 通道上进行评估,并在评估之前裁剪每个边界的尺度像素。
+
+度量指标为 PSNR / SSIM.
+
+| 模型 | Rain100H |
+|---|---|
+| PReNet | 29.5037 / 0.899 |
+
+可视化展示:
+输入:
+
+
+

+
+
+输出:
+
+
+

+
+
+## 4 模型参数下载
+| 模型 | 数据集 |
+|---|---|
+| [PReNet](https://paddlegan.bj.bcebos.com/models/PReNet.pdparams) | [RainH.zip](https://pan.baidu.com/s/1_vxCatOV3sOA6Vkx1l23eA?pwd=vitu) |
+
+
+
+
+## 参考
+
+- 1. [Progressive Image Deraining Networks: A Better and Simpler Baseline](https://arxiv.org/pdf/1901.09221v3.pdf)
+
+
+```
+@inproceedings{ren2019progressive,
+ title={Progressive Image Deraining Networks: A Better and Simpler Baseline},
+ author={Ren, Dongwei and Zuo, Wangmeng and Hu, Qinghua and Zhu, Pengfei and Meng, Deyu},
+ booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+ year={2019},
+ }
+```
diff --git a/docs/zh_CN/tutorials/psgan.md b/docs/zh_CN/tutorials/psgan.md
index 5d55f146dc46ab9f6df9bb4cda3a4d722efe3aed..fe172e897feffa3a478c5bd3495555e9ff2fe8d3 100644
--- a/docs/zh_CN/tutorials/psgan.md
+++ b/docs/zh_CN/tutorials/psgan.md
@@ -20,7 +20,7 @@ python tools/psgan_infer.py \
--model_path /your/model/path \
--source_path docs/imgs/ps_source.png \
--reference_dir docs/imgs/ref \
- --evaluate-only True
+ --evaluate-only
```
**参数说明:**
- config-file: PSGAN网络到参数配置文件,格式为yaml
diff --git a/docs/zh_CN/tutorials/remote_sensing_image_super-resolution.md b/docs/zh_CN/tutorials/remote_sensing_image_super-resolution.md
new file mode 100644
index 0000000000000000000000000000000000000000..3fa4b537754b9e6db4ffc0eb35605014bc95fbb6
--- /dev/null
+++ b/docs/zh_CN/tutorials/remote_sensing_image_super-resolution.md
@@ -0,0 +1,70 @@
+# 1.单幅遥感图像超分辨率重建
+
+## 1.1 背景和原理介绍
+
+ **意义与应用场景**:单幅影像超分辨率重建一直是low-level视觉领域中一个比较热门的任务,其可以成为修复老电影、老照片的技术手段,也可以为图像分割、目标检测等下游任务提供质量较高的数据。在遥感中的应用场景也比较广泛,例如:在**船舶检测和分类**等诸多遥感影像应用中,**提高遥感影像分辨率具有重要意义**。
+
+**原理**:单幅遥感影像的超分辨率重建本质上与单幅影像超分辨率重建类似,均是使用RGB三通道的低分辨率影像生成纹理清晰的高分辨率影像。本项目复现的论文是[Yulun Zhang](http://yulunzhang.com/), [Kunpeng Li](https://kunpengli1994.github.io/), [Kai Li](http://kailigo.github.io/), [Lichen Wang](https://sites.google.com/site/lichenwang123/), [Bineng Zhong](https://scholar.google.de/citations?user=hvRBydsAAAAJ&hl=en), and [Yun Fu](http://www1.ece.neu.edu/~yunfu/), 发表在ECCV 2018上的论文[《Image Super-Resolution Using Very Deep Residual Channel Attention Networks》](https://arxiv.org/abs/1807.02758)。
+作者提出了一个深度残差通道注意力网络(RCAN),引入一种通道注意力机制(CA),通过考虑通道之间的相互依赖性来自适应地重新调整特征。该模型取得优异的性能,因此本项目选择RCAN进行单幅遥感影像的x4超分辨率重建。
+
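+为便于理解上文提到的通道注意力机制(CA),下面给出一个仅作示意的最小代码草图(非 PaddleGAN 中 RCAN 的官方实现,通道数与压缩比均为假设):先对特征做全局平均池化得到通道统计量,再经两层 1x1 卷积与 Sigmoid 得到各通道的缩放系数,用于自适应地重新调整特征。
+
+```python
+import paddle
+import paddle.nn as nn
+
+class ChannelAttention(nn.Layer):
+    def __init__(self, channels=64, reduction=16):
+        super().__init__()
+        self.attn = nn.Sequential(
+            nn.AdaptiveAvgPool2D(1),                        # [B, C, H, W] -> [B, C, 1, 1]
+            nn.Conv2D(channels, channels // reduction, 1),  # 通道压缩
+            nn.ReLU(),
+            nn.Conv2D(channels // reduction, channels, 1),  # 通道恢复
+            nn.Sigmoid())
+
+    def forward(self, x):
+        return x * self.attn(x)                             # 按通道重新加权特征
+
+feat = paddle.randn([1, 64, 48, 48])
+print(ChannelAttention()(feat).shape)                       # [1, 64, 48, 48]
+```
+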
+## 1.2 如何使用
+
+### 1.2.1 数据准备
+  本项目的训练分为两个阶段:第一个阶段使用[DIV2K数据集](https://data.vision.ee.ethz.ch/cvl/DIV2K/)预训练RCANx4模型;第二个阶段基于该模型,使用[遥感超分数据集合](https://aistudio.baidu.com/aistudio/datasetdetail/129011)进行迁移学习。
+ - 关于DIV2K数据的准备方法参考[该文档](./single_image_super_resolution.md)
+ - 遥感超分数据准备
+    - 数据已上传至AI Studio。该数据是从 UC Merced Land-Use Dataset(21类土地利用遥感图像数据集)中抽取的部分遥感影像,通过BI退化生成HR-LR影像对用于训练超分模型,其中训练集6720对,测试集420对。
+ - 下载解压后的文件组织形式如下
+ ```
+ ├── RSdata_for_SR
+ ├── train_HR
+ ├── train_LR
+ | └──x4
+ ├── test_HR
+ ├── test_LR
+ | └──x4
+ ```
+
+### 1.2.2 DIV2K数据集上训练/测试
+
+首先是在DIV2K数据集上训练RCANx4模型,并以Set14作为测试集。按照论文需要准备RCANx2作为初始化权重,可通过下表进行获取。
+
+| 模型 | 数据集 | 下载地址 |
+|---|---|---|
+| RCANx2 | DIV2K | [RCANx2](https://paddlegan.bj.bcebos.com/models/RCAN_X2_DIV2K.pdparams)
+
+
+将DIV2K数据按照 [该文档](./single_image_super_resolution.md)所示准备好后,执行以下命令训练模型,`--load`的参数为下载好的RCANx2模型权重所在路径。
+
+```shell
+python -u tools/main.py --config-file configs/rcan_rssr_x4.yaml --load ${PATH_OF_WEIGHT}
+```
+
+训练好后,执行以下命令可对测试集Set14预测,`--load`的参数为训练好的RCANx4模型权重
+```shell
+python tools/main.py --config-file configs/rcan_rssr_x4.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+本项目在DIV2K数据集训练迭代第57250次得到的权重[RCAN_X4_DIV2K](https://pan.baidu.com/s/1rI7yUdD4T1DE0RZB5yHXjA)(提取码:aglw),在Set14数据集上测得的精度:`PSNR:28.8959 SSIM:0.7896`
+
+### 1.2.3 遥感超分数据上迁移学习训练/测试
+- 使用该数据集,需要修改`rcan_rssr_x4.yaml`文件中训练集与测试集的高分辨率图像路径和低分辨率图像路径,即文件中的`gt_folder`和`lq_folder`。
+- 同时,由于使用了在DIV2K数据集上训练的RCAN_X4_DIV2K模型权重来进行迁移学习,所以训练的迭代次数`total_iters`也可以进行修改,并不需要很多次数的迭代就能有良好的效果。训练模型中`--load`的参数为下载好的RCANx4模型权重所在路径。
+
+训练模型:
+```shell
+python -u tools/main.py --config-file configs/rcan_rssr_x4.yaml --load ${PATH_OF_RCANx4_WEIGHT}
+```
+测试模型:
+```shell
+python tools/main.py --config-file configs/rcan_rssr_x4.yaml --evaluate-only --load ${PATH_OF_RCANx4_WEIGHT}
+```
+
+## 1.3 实验结果
+
+- RCANx4遥感影像超分效果
+
+
+
+- [RCAN遥感影像超分辨率重建 Ai studio 项目在线体验](https://aistudio.baidu.com/aistudio/projectdetail/3508912)
+
diff --git a/docs/zh_CN/tutorials/singan.md b/docs/zh_CN/tutorials/singan.md
new file mode 100755
index 0000000000000000000000000000000000000000..1656a55a3362253db5036850be3f2efe253ec25a
--- /dev/null
+++ b/docs/zh_CN/tutorials/singan.md
@@ -0,0 +1,147 @@
+# SinGAN
+
+## 简介
+
+SinGAN是一种新的可以从单个自然图像中学习的无条件生成模型。该模型包含一个全卷积生成对抗网络的金字塔结构,每个生成对抗网络负责学习图像在不同尺度上的块分布。这允许生成任意大小和纵横比的新样本,这些样本具有显著的可变性,同时又保持了训练图像的全局结构和精细纹理。与以往的单图像生成方案相比,该方法不局限于纹理图像,并且是无条件的(即从噪声中生成样本)。
+
+## 使用方法
+
+### 配置说明
+
+我们为SinGAN提供了4个配置文件:
+
+- `singan_universal.yaml`
+- `singan_sr.yaml`
+- `singan_animation.yaml`
+- `singan_finetune.yaml`
+
+其中`singan_universal.yaml`是对所有任务都适用的配置,`singan_sr.yaml`是官方建议的用于超分任务的配置,`singan_animation.yaml`是官方建议的用于“静图转动”任务的配置。本文档展示的结果均由`singan_universal.yaml`训练而来。对于手绘转照片任务,使用`singan_universal.yaml`训练后再用`singan_finetune.yaml`微调会得到更好的结果。
+
+### 训练
+
+启动训练:
+
+```bash
+python tools/main.py -c configs/singan_universal.yaml \
+ -o model.train_image=训练图片.png
+```
+
+为“手绘转照片”任务微调:
+
+```bash
+python tools/main.py -c configs/singan_finetune.yaml \
+ -o model.train_image=训练图片.png \
+ --load 已经训练好的模型.pdparams
+```
+
+### 测试
+运行下面的命令,可以随机生成一张图片。需要注意的是,`训练图片.png`应当位于`data/singan`目录下,或者手动调整配置文件中`dataset.test.dataroot`的值。此外,这个目录中只能包含`训练图片.png`这一张图片。
+```bash
+python tools/main.py -c configs/singan_universal.yaml \
+ -o model.train_image=训练图片.png \
+ --load 已经训练好的模型.pdparams \
+ --evaluate-only
+```
+
+### 导出生成器权重
+
+训练结束后,需要使用 ``tools/extract_weight.py`` 从训练得到的模型(包含生成器和判别器)中提取生成器的权重,供`applications/tools/singan.py`进行推理,以实现SinGAN的各种应用。
+
+```bash
+python tools/extract_weight.py 训练过程中保存的权重文件.pdparams --net-name netG --output 生成器权重文件.pdparams
+```
+
+### 推理及结果展示
+
+*注意:下面命令中的`--weight_path 生成器权重文件.pdparams`可以换成`--pretrained_model <模型名>`来体验训练好的模型,其中`<模型名>`可以是`trees`、`stone`、`mountains`、`birds`和`lightning`之一。*
+
+#### 随机采样
+
+```bash
+python applications/tools/singan.py \
+ --weight_path 生成器权重文件.pdparams \
+ --mode random_sample \
+    --scale_v 1 \
+    --scale_h 1 \
+    --n_row 2 \
+    --n_col 2
+```
+
+其中`--scale_v`、`--scale_h`分别为生成图像的纵向与横向缩放比例,`--n_row`、`--n_col`为随机采样结果的行数与列数。
+
+|训练图片|随机采样结果|
+| ---- | ---- |
+|||
+
+#### 图像编辑&风格和谐化
+
+```bash
+python applications/tools/singan.py \
+ --weight_path 生成器权重文件.pdparams \
+    --mode editing \
+    --ref_image 编辑后的图片.png \
+    --mask_image 编辑区域标注图片.png \
+    --generate_start_scale 2
+```
+
+风格和谐化(harmonization)任务只需将上述命令中的`--mode`参数由`editing`换为`harmonization`。
+
+
+|训练图片|编辑图片|编辑区域标注|SinGAN生成|
+|----|----|----|----|
+|||||
+
+#### 超分
+
+```bash
+python applications/tools/singan.py \
+ --weight_path 生成器权重文件.pdparams \
+ --mode sr \
+ --ref_image 待超分的图片亦即用于训练的图片.png \
+ --sr_factor 4
+```
+|训练图片|超分结果|
+| ---- | ---- |
+|||
+
+
+#### 静图转动
+
+```bash
+python applications/tools/singan.py \
+ --weight_path 生成器权重文件.pdparams \
+ --mode animation \
+    --animation_alpha 0.6 \
+    --animation_beta 0.7 \
+    --animation_frames 20 \
+    --animation_duration 0.1
+```
+
+其中`--animation_alpha`控制生成帧与训练图像的接近程度,`--animation_beta`控制生成片段的平滑度与变化速率,`--animation_frames`为动画帧数,`--animation_duration`为每帧的持续时间。
+
+|训练图片|动画效果|
+| ---- | ---- |
+|||
+
+
+#### 手绘转照片
+```bash
+python applications/tools/singan.py \
+ --weight_path 生成器权重文件.pdparams \
+ --mode paint2image \
+ --ref_image 手绘图片.png \
+ --generate_start_scale 2
+```
+|训练图片|手绘图片|SinGAN生成|SinGAN微调后生成|
+|----|----|----|----|
+|||||
+
+
+
+## 参考文献
+
+```
+@misc{shaham2019singan,
+ title={SinGAN: Learning a Generative Model from a Single Natural Image},
+ author={Tamar Rott Shaham and Tali Dekel and Tomer Michaeli},
+ year={2019},
+ eprint={1905.01164},
+ archivePrefix={arXiv},
+ primaryClass={cs.CV}
+}
+```
+
diff --git a/docs/zh_CN/tutorials/single_image_super_resolution.md b/docs/zh_CN/tutorials/single_image_super_resolution.md
new file mode 100644
index 0000000000000000000000000000000000000000..b4e43acf675624cf5dac74e27eef949a205a5479
--- /dev/null
+++ b/docs/zh_CN/tutorials/single_image_super_resolution.md
@@ -0,0 +1,199 @@
+# 1 单张图像超分
+
+## 1.1 原理介绍
+
+ 超分是放大和改善图像细节的过程。它通常将低分辨率图像作为输入,将同一图像放大到更高分辨率作为输出。这里我们提供了四种超分辨率模型,即[RealSR](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w31/Ji_Real-World_Super-Resolution_via_Kernel_Estimation_and_Noise_Injection_CVPRW_2020_paper.pdf), [ESRGAN](https://arxiv.org/abs/1809.00219v2), [LESRCNN](https://arxiv.org/abs/2007.04344),[PAN](https://arxiv.org/pdf/2010.01073.pdf).
+ [RealSR](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w31/Ji_Real-World_Super-Resolution_via_Kernel_Estimation_and_Noise_Injection_CVPRW_2020_paper.pdf)通过估计各种模糊内核以及实际噪声分布,为现实世界的图像设计一种新颖的真实图片降采样框架。基于该降采样框架,可以获取与真实世界图像共享同一域的低分辨率图像。RealSR是一个旨在提高感知度的真实世界超分辨率模型。对合成噪声数据和真实世界图像进行的大量实验表明,RealSR模型能够有效降低了噪声并提高了视觉质量。
+  [ESRGAN](https://arxiv.org/abs/1809.00219v2)是增强型SRGAN。为了进一步提高SRGAN的视觉质量,ESRGAN改进了SRGAN的三个关键组件:引入不带批量归一化的残差密集块(RRDB)作为基本的网络构建单元,让判别器预测相对真实性而不是绝对值,并利用激活前的特征改善感知损失。得益于这些改进,ESRGAN实现了比SRGAN更好的视觉质量和更逼真、更自然的纹理,并在PIRM2018-SR挑战赛中获得第一名。
+  考虑到CNN在SISR应用中往往需要消耗大量的计算量和存储空间来训练SR模型,轻量级增强SR-CNN([LESRCNN](https://arxiv.org/abs/2007.04344))被提出。大量实验表明,LESRCNN在定性和定量评价方面优于现有的SISR算法。
+ 之后[PAN](https://arxiv.org/pdf/2010.01073.pdf)设计了一种用于图像超分辨率(SR)的轻量级卷积神经网络。
+
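+下面给出上文 ESRGAN 中“让判别器预测相对真实性而不是绝对值”这一思想的最小损失函数草图(仅作示意,非 PaddleGAN 官方实现,logits 的形状为假设):
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+def relativistic_d_loss(real_logits, fake_logits):
+    # 判别器希望真实样本比生成样本的平均得分更高,生成样本则相反
+    loss_real = F.binary_cross_entropy_with_logits(
+        real_logits - fake_logits.mean(), paddle.ones_like(real_logits))
+    loss_fake = F.binary_cross_entropy_with_logits(
+        fake_logits - real_logits.mean(), paddle.zeros_like(fake_logits))
+    return (loss_real + loss_fake) / 2
+
+real_logits = paddle.randn([4, 1])
+fake_logits = paddle.randn([4, 1])
+print(float(relativistic_d_loss(real_logits, fake_logits)))
+```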
+
+
+## 1.2 如何使用
+
+### 1.2.1 数据准备
+
+ 常用的图像超分数据集如下:
+ | name | 数据集 | 数据描述 | 下载 |
+ |---|---|---|---|
+ | 2K Resolution | [DIV2K](https://data.vision.ee.ethz.ch/cvl/DIV2K/) | proposed in [NTIRE17](https://data.vision.ee.ethz.ch/cvl/ntire17//) (800 train and 100 validation) | [official website](https://data.vision.ee.ethz.ch/cvl/DIV2K/) |
+ | Classical SR Testing | Set5 | Set5 test dataset | [Google Drive](https://drive.google.com/drive/folders/1B3DJGQKB6eNdwuQIhdskA64qUuVKLZ9u) / [Baidu Drive](https://pan.baidu.com/s/1q_1ERCMqALH0xFwjLM0pTg#list/path=%2Fsharelink2016187762-785433459861126%2Fclassical_SR_datasets&parentPath=%2Fsharelink2016187762-785433459861126) |
+ | Classical SR Testing | Set14 | Set14 test dataset | [Google Drive](https://drive.google.com/drive/folders/1B3DJGQKB6eNdwuQIhdskA64qUuVKLZ9u) / [Baidu Drive](https://pan.baidu.com/s/1q_1ERCMqALH0xFwjLM0pTg#list/path=%2Fsharelink2016187762-785433459861126%2Fclassical_SR_datasets&parentPath=%2Fsharelink2016187762-785433459861126) |
+
+ 数据集DIV2K, Set5 和 Set14 的组成形式如下:
+ ```
+ PaddleGAN
+ ├── data
+ ├── DIV2K
+ ├── DIV2K_train_HR
+ ├── DIV2K_train_LR_bicubic
+ | ├──X2
+ | ├──X3
+ | └──X4
+ ├── DIV2K_valid_HR
+ ├── DIV2K_valid_LR_bicubic
+ Set5
+ ├── GTmod12
+ ├── LRbicx2
+ ├── LRbicx3
+ ├── LRbicx4
+ └── original
+ Set14
+ ├── GTmod12
+ ├── LRbicx2
+ ├── LRbicx3
+ ├── LRbicx4
+ └── original
+ ...
+ ```
+ 使用以下命令处理DIV2K数据集:
+ ```
+ python data/process_div2k_data.py --data-root data/DIV2K
+ ```
+ 程序完成后,检查DIV2K目录中是否有``DIV2K_train_HR_sub``、``X2_sub``、``X3_sub``和``X4_sub``目录
+ ```
+ PaddleGAN
+ ├── data
+ ├── DIV2K
+ ├── DIV2K_train_HR
+ ├── DIV2K_train_HR_sub
+ ├── DIV2K_train_LR_bicubic
+ | ├──X2
+ | ├──X2_sub
+ | ├──X3
+ | ├──X3_sub
+            |    ├──X4
+ | └──X4_sub
+ ├── DIV2K_valid_HR
+ ├── DIV2K_valid_LR_bicubic
+ ...
+ ```
+
+#### Realsr df2k model的数据准备
+
+ 从 [NTIRE 2020 RWSR](https://competitions.codalab.org/competitions/22220#participate) 下载数据集并解压到您的路径下。
+ 将 Corrupted-tr-x.zip 和 Corrupted-tr-y.zip 解压到 ``PaddleGAN/data/ntire20`` 目录下。
+
+ 运行如下命令:
+ ```
+ python ./data/realsr_preprocess/create_bicubic_dataset.py --dataset df2k --artifacts tdsr
+ python ./data/realsr_preprocess/collect_noise.py --dataset df2k --artifacts tdsr
+ ```
+
+### 1.2.2 训练/测试
+
+  示例以df2k数据集和RealSR模型为例。如果您想使用自己的数据集,可以在配置文件中将数据集修改为您自己的数据集;如果您想使用其他模型,替换相应的配置文件即可。
+
+ 训练模型:
+ ```
+ python -u tools/main.py --config-file configs/realsr_bicubic_noise_x4_df2k.yaml
+ ```
+
+ 测试模型:
+ ```
+ python tools/main.py --config-file configs/realsr_bicubic_noise_x4_df2k.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+## 1.3 实验结果展示
+实验数值结果在 RGB 通道上进行评估,并在评估之前从图像的每个边界裁剪掉与放大倍数(scale)相同数量的像素。
+
+度量指标为 PSNR / SSIM.
+
+| 模型 | Set5 | Set14 | DIV2K |
+|---|---|---|---|
+| realsr_df2k | 28.4385 / 0.8106 | 24.7424 / 0.6678 | 26.7306 / 0.7512 |
+| realsr_dped | 20.2421 / 0.6158 | 19.3775 / 0.5259 | 20.5976 / 0.6051 |
+| realsr_merge | 24.8315 / 0.7030 | 23.0393 / 0.5986 | 24.8510 / 0.6856 |
+| lesrcnn_x4 | 31.9476 / 0.8909 | 28.4110 / 0.7770 | 30.231 / 0.8326 |
+| esrgan_psnr_x4 | 32.5512 / 0.8991 | 28.8114 / 0.7871 | 30.7565 / 0.8449 |
+| esrgan_x4 | 28.7647 / 0.8187 | 25.0065 / 0.6762 | 26.9013 / 0.7542 |
+| pan_x4 | 30.4574 / 0.8643 | 26.7204 / 0.7434 | 28.9187 / 0.8176 |
+| drns_x4 | 32.6684 / 0.8999 | 28.9037 / 0.7885 | - |
+
+PAN指标对比
+
+paddle模型使用DIV2K数据集训练,torch模型使用df2k和DIV2K数据集训练。
+
+| 框架 | Set5 | Set14 |
+|---|---|---|
+| paddle | 30.4574 / 0.8643 | 26.7204 / 0.7434 |
+| torch | 30.2183 / 0.8643 | 26.8035 / 0.7445 |
+
+
+
+
+## 1.4 模型下载
+| 模型 | 数据集 | 下载地址 |
+|---|---|---|
+| realsr_df2k | df2k | [realsr_df2k](https://paddlegan.bj.bcebos.com/models/realsr_df2k.pdparams)
+| realsr_dped | dped | [realsr_dped](https://paddlegan.bj.bcebos.com/models/realsr_dped.pdparams)
+| realsr_merge | DIV2K | [realsr_merge](https://paddlegan.bj.bcebos.com/models/realsr_merge.pdparams)
+| lesrcnn_x4 | DIV2K | [lesrcnn_x4](https://paddlegan.bj.bcebos.com/models/lesrcnn_x4.pdparams)
+| esrgan_psnr_x4 | DIV2K | [esrgan_psnr_x4](https://paddlegan.bj.bcebos.com/models/esrgan_psnr_x4.pdparams)
+| esrgan_x4 | DIV2K | [esrgan_x4](https://paddlegan.bj.bcebos.com/models/esrgan_x4.pdparams)
+| pan_x4 | DIV2K | [pan_x4](https://paddlegan.bj.bcebos.com/models/pan_x4.pdparams)
+| drns_x4 | DIV2K | [drns_x4](https://paddlegan.bj.bcebos.com/models/DRNSx4.pdparams)
+
+
+# 参考文献
+
+- 1. [Real-World Super-Resolution via Kernel Estimation and Noise Injection](https://openaccess.thecvf.com/content_CVPRW_2020/papers/w31/Ji_Real-World_Super-Resolution_via_Kernel_Estimation_and_Noise_Injection_CVPRW_2020_paper.pdf)
+
+ ```
+ @inproceedings{ji2020real,
+ title={Real-World Super-Resolution via Kernel Estimation and Noise Injection},
+ author={Ji, Xiaozhong and Cao, Yun and Tai, Ying and Wang, Chengjie and Li, Jilin and Huang, Feiyue},
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops},
+ pages={466--467},
+ year={2020}
+ }
+ ```
+
+- 2. [ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks](https://arxiv.org/abs/1809.00219v2)
+
+ ```
+ @inproceedings{wang2018esrgan,
+ title={Esrgan: Enhanced super-resolution generative adversarial networks},
+ author={Wang, Xintao and Yu, Ke and Wu, Shixiang and Gu, Jinjin and Liu, Yihao and Dong, Chao and Qiao, Yu and Change Loy, Chen},
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV)},
+ pages={0--0},
+ year={2018}
+ }
+ ```
+
+- 3. [Lightweight image super-resolution with enhanced CNN](https://arxiv.org/abs/2007.04344)
+
+ ```
+ @article{tian2020lightweight,
+ title={Lightweight image super-resolution with enhanced CNN},
+ author={Tian, Chunwei and Zhuge, Ruibin and Wu, Zhihao and Xu, Yong and Zuo, Wangmeng and Chen, Chen and Lin, Chia-Wen},
+ journal={Knowledge-Based Systems},
+ volume={205},
+ pages={106235},
+ year={2020},
+ publisher={Elsevier}
+ }
+ ```
+- 4. [Efficient Image Super-Resolution Using Pixel Attention](https://arxiv.org/pdf/2010.01073.pdf)
+
+ ```
+ @inproceedings{Hengyuan2020Efficient,
+ title={Efficient Image Super-Resolution Using Pixel Attention},
+ author={Hengyuan Zhao and Xiangtao Kong and Jingwen He and Yu Qiao and Chao Dong},
+ booktitle={Computer Vision – ECCV 2020 Workshops},
+ volume={12537},
+ pages={56-72},
+ year={2020}
+ }
+ ```
+- 5. [Closed-loop Matters: Dual Regression Networks for Single Image Super-Resolution](https://arxiv.org/pdf/2003.07018.pdf)
+
+ ```
+ @inproceedings{guo2020closed,
+ title={Closed-loop Matters: Dual Regression Networks for Single Image Super-Resolution},
+ author={Guo, Yong and Chen, Jian and Wang, Jingdong and Chen, Qi and Cao, Jiezhang and Deng, Zeshuai and Xu, Yanwu and Tan, Mingkui},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ year={2020}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/starganv2.md b/docs/zh_CN/tutorials/starganv2.md
new file mode 100644
index 0000000000000000000000000000000000000000..7960fb92fd8903ba4111f03a125cd854c5d8e26e
--- /dev/null
+++ b/docs/zh_CN/tutorials/starganv2.md
@@ -0,0 +1,74 @@
+# StarGAN V2
+
+## 1 原理介绍
+
+ [StarGAN V2](https://arxiv.org/pdf/1912.01865.pdf)是发布在CVPR2020上的一个图像转换模型。
+  一个好的图像到图像转换模型应该学习不同视觉域之间的映射,同时满足以下属性:1)生成图像的多样性;2)在多个域上的可扩展性。现有方法往往只解决了其中一个问题:要么生成的多样性有限,要么需要为所有域训练多个模型。StarGAN V2是一个可以同时解决这两个问题的单一框架,并在基线上显示出显著改善的结果。在CelebAHQ和新的动物面孔数据集(AFHQ)上的实验验证了StarGAN V2在视觉质量、多样性和可扩展性方面的优势。
+
+## 2 如何使用
+
+### 2.1 数据准备
+
+ StarGAN V2使用的CelebAHQ数据集可以从[这里](https://www.dropbox.com/s/f7pvjij2xlpff59/celeba_hq.zip?dl=0)下载,使用的AFHQ数据集可以从[这里](https://www.dropbox.com/s/t9l9o3vsx2jai3z/afhq.zip?dl=0)下载。将数据集下载解压后放到``PaddleGAN/data``文件夹下 。
+
+ 数据的组成形式为:
+
+ ```
+ ├── data
+ ├── afhq
+ | ├── train
+ | | ├── cat
+ | | ├── dog
+ | | └── wild
+ | └── val
+ | ├── cat
+ | ├── dog
+ | └── wild
+ └── celeba_hq
+ ├── train
+ | ├── female
+ | └── male
+ └── val
+ ├── female
+ └── male
+
+ ```
+
+### 2.2 训练/测试
+
+  示例以AFHQ数据集为例。如果您想使用CelebAHQ数据集,更换相应的配置文件即可。
+
+ 训练模型:
+ ```
+ python -u tools/main.py --config-file configs/starganv2_afhq.yaml
+ ```
+
+ 测试模型:
+ ```
+ python tools/main.py --config-file configs/starganv2_afhq.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+## 3 结果展示
+
+
+
+## 4 模型下载
+| 模型 | 数据集 | 下载地址 |
+|---|---|---|
+| starganv2_afhq | AFHQ | [starganv2_afhq](https://paddlegan.bj.bcebos.com/models/starganv2_afhq.pdparams)
+
+
+
+
+# 参考文献
+
+- 1. [StarGAN v2: Diverse Image Synthesis for Multiple Domains](https://arxiv.org/abs/1912.01865)
+
+ ```
+ @inproceedings{choi2020starganv2,
+ title={StarGAN v2: Diverse Image Synthesis for Multiple Domains},
+ author={Yunjey Choi and Youngjung Uh and Jaejun Yoo and Jung-Woo Ha},
+ booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
+ year={2020}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/styleganv2.md b/docs/zh_CN/tutorials/styleganv2.md
index 7ebab5e1ff14af2fdca8769515b40736491a6029..7140bc5240953e9e838b1586def9d14b75ff4719 100644
--- a/docs/zh_CN/tutorials/styleganv2.md
+++ b/docs/zh_CN/tutorials/styleganv2.md
@@ -54,9 +54,56 @@ python -u tools/styleganv2.py \
- n_col: 采样的图片的列数
- cpu: 是否使用cpu推理,若不使用,请在命令中去除
-### 训练(TODO)
+### 训练
+
+#### 准备数据集
+你可以从[这里](https://drive.google.com/drive/folders/1u2xu7bSrWxrbUxk-dT-UvEJq8IjdmNTP)下载对应的数据集
+
+为了方便,我们提供了[images256x256.tar](https://paddlegan.bj.bcebos.com/datasets/images256x256.tar)
+
+目前的配置文件默认数据集的结构如下:
+ ```
+ PaddleGAN
+ ├── data
+ ├── ffhq
+ ├──images1024x1024
+ ├── 00000.png
+ ├── 00001.png
+ ├── 00002.png
+ ├── 00003.png
+ ├── 00004.png
+ ├──images256x256
+ ├── 00000.png
+ ├── 00001.png
+ ├── 00002.png
+ ├── 00003.png
+ ├── 00004.png
+ ├──custom_data
+ ├── img0.png
+ ├── img1.png
+ ├── img2.png
+ ├── img3.png
+ ├── img4.png
+ ...
+ ```
+
+启动训练
+```
+python tools/main.py -c configs/stylegan_v2_256_ffhq.yaml
+```
+
+### 推理
+
+训练结束后,需要使用 ``tools/extract_weight.py`` 提取对应的生成器权重,供 ``applications/tools/styleganv2.py`` 进行推理。
+```
+python tools/extract_weight.py output_dir/YOUR_TRAINED_WEIGHT.pdparams --net-name gen_ema --output YOUR_WEIGHT_PATH.pdparams
+```
+
+```
+python tools/styleganv2.py --output_path stylegan01 --weight_path YOUR_WEIGHT_PATH.pdparams --size 256
+```
-未来还将添加训练脚本方便用户训练出更多类型的 StyleGAN V2 图像生成器。
+注意:``--size`` 参数要和配置文件中的设置保持一致。
## 生成结果展示
diff --git a/docs/zh_CN/tutorials/styleganv2clip.md b/docs/zh_CN/tutorials/styleganv2clip.md
new file mode 100644
index 0000000000000000000000000000000000000000..733b90ea06d0057aa7dac44fad227c72b07a60a1
--- /dev/null
+++ b/docs/zh_CN/tutorials/styleganv2clip.md
@@ -0,0 +1,167 @@
+# StyleCLIP: 文本驱动的图像处理
+## 1. 简介
+
+StyleGAN V2 的任务是使用风格向量进行image generation,而CLIP guided Editing 则是利用CLIP(Contrastive Language-Image Pre-training)多模态预训练模型计算文本输入对应的风格向量变化,通过文字描述来操纵风格向量,进而编辑生成图像的属性。相比于Editing模块,StyleCLIP不受预先统计的标注属性限制,可以通过语言描述自由地控制图像编辑。
+
+原论文使用 Pixel2Style2Pixel 的升级模型 Encoder4Editing(e4e)计算待编辑图像的风格向量;为尽量利用PaddleGAN提供的预训练模型,本次复现仍使用Pixel2Style2Pixel计算风格向量进行实验,重构效果略有下降,期待PaddleGAN跟进e4e相关工作。
+
+
+## 2. 复现
+
+StyleCLIP 模型需要使用简介中提到的几个预训练模型:
+本次复现使用PPGAN 提供的、在FFHQ数据集上预训练的StyleGAN V2 模型作为生成器,并使用Pixel2Style2Pixel模型将待编辑图像转换为对应的风格向量。
+
+CLIP模型依赖Paddle-CLIP实现。
+pSp模型包含人脸检测步骤,依赖dlib框架。
+除本repo外还需要安装 Paddle-CLIP 和 dlib 依赖。
+
+整体安装方法如下。
+```
+pip install -e .
+pip install paddleclip
+pip install dlib-bin
+```
+
+### 编辑结果展示
+
+风格向量对应的图像:
+
+

+
+
+设置
+> direction_offset = [ -1, 0, 1, 2, 3, 4, 5]
+> beta_threshold = 0.1
+
+从 'face' 到 'boy face' 编辑得到的图像:
+
+
+
+
+从'face' 到 'happy face' 编辑得到的图像:
+
+
+
+从'face' 到 'angry face' 编辑得到的图像:
+
+
+
+从'face' 到 'face with long hair' 编辑得到的图像:
+
+
+
+
+从'face' 到 'face with curl hair' (卷发) 编辑得到的图像:
+
+
+
+从'head with black hair'(黑发) 到 'head with gold hair'(金发)编辑得到的图像:
+
+
+
+
+## 3. 使用方法
+
+### 制作属性向量
+
+具体可以参考[Puzer/stylegan-encoder](https://github.com/Puzer/stylegan-encoder/blob/master/Learn_direction_in_latent_space.ipynb)中的做法。
+
+当前提供与`stylegan2`对应`ffhq-config-f`数据集上的权重参数:
+
+direction: https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f-styleclip-global-directions.pdparams
+
+stats: https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f-styleclip-stats.pdparams
+
+### 训练
+
+在StyleCLIP论文中作者研究了 3 种结合 StyleGAN 和 CLIP 的方法:
+1. 文本引导的风格向量优化,使用 CLIP 模型作为损失网络对现有风格向量进行多次迭代更新,但该方法对每次处理都需要重新训练。
+2. 训练 风格向量映射器,使CLIP文本特征向量映射至StyleGAN 风格向量空间,避免(1)方法的训练问题,但可控性较差,经论文对比其生成质量也不如(3)。
+3. 在 StyleGAN 的 StyleSpace 中,把文本描述映射到输入图像的全局方向 (Global Direction),进而允许自由控制图像操作强度以及分离程度,实现类似于StyleGAN Editing 模块的使用体验。
+
+本次仅复现论文中效果最好的 (3)Global Direction 方法。
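+
+下面给出 Global Direction 编辑思路的最小示意草图(仅作示意,非 ppgan 官方实现,其中向量维度、相关度矩阵等均为假设):用 CLIP 文本编码器得到中性描述与目标描述的特征差,与预先统计的“风格通道-CLIP方向”相关矩阵相乘得到各通道的相关度,过滤掉低于阈值(beta)的通道,再按偏移强度(alpha)修改风格向量。
+
+```python
+import numpy as np
+
+def global_direction_edit(style, text_delta, relevance_matrix, beta, alpha):
+    # style: [n_channels] 风格向量;text_delta: [clip_dim] 文本方向;relevance_matrix: [n_channels, clip_dim]
+    text_delta = text_delta / np.linalg.norm(text_delta)
+    relevance = relevance_matrix @ text_delta              # 每个风格通道与文本方向的相关度
+    direction = np.where(np.abs(relevance) >= beta, relevance, 0.0)
+    return style + alpha * direction
+
+style = np.random.randn(6048)                              # 维度仅为示意
+text_delta = np.random.randn(512)
+relevance_matrix = np.random.randn(6048, 512)
+print(global_direction_edit(style, text_delta, relevance_matrix, beta=0.12, alpha=5).shape)
+```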
+
+StyleCLIP Global Direction 训练过程分两步:
+1. 提取风格向量并统计
+
+```
+python styleclip_getf.py
+```
+2. 结合CLIP模型计算转换矩阵
+
+```
+python ppgan/apps/styleganv2clip_predictor.py extract
+```
+
+### 编辑
+
+用户可以使用如下命令对图像属性进行编辑:
+
+```
+cd applications/
+python -u tools/styleganv2clip.py \
+ --latent <替换为要编辑的风格向量的路径> \
+ --output_path <替换为生成图片存放的文件夹> \
+ --weight_path <替换为你的预训练模型路径> \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --direction_path <替换为存放统计数据的文件路径> \
+ --neutral <替换为对原图像的描述,如face> \
+ --target <替换为对目标图像的描述> \
+ --beta_threshold 0.12 \
+    --direction_offset 5 \
+ --cpu
+```
+
+**参数说明:**
+- latent: 要编辑的代表图像的风格向量的路径。可来自于Pixel2Style2Pixel生成的`dst.npy`或StyleGANv2 Fitting模块生成的`dst.fitting.npy`
+- output_path: 生成图片存放的文件夹
+- weight_path: StyleGANv2 预训练模型路径
+- model_type: 模型类型,当前使用: `ffhq-config-f`
+- direction_path: 存放CLIP统计向量的文件路径
+- stat_path: 存放StyleGAN向量统计数据的文件路径
+- neutral: 对原图像的中性描述,如 face
+- target: 为对目标图像的描述,如 young face
+- beta_threshold: 向量调整阈值
+- direction_offset: 属性的偏移强度
+- cpu: 是否使用cpu推理,若不使用,请在命令中去除
+
+注意:以下参数需与StyleGAN 预训练模型保持一致
+- size: 模型参数,输出图片的分辨率
+- style_dim: 模型参数,风格z的维度
+- n_mlp: 模型参数,风格z所输入的多层感知层的层数
+- channel_multiplier: 模型参数,通道乘积,影响模型大小和生成图片质量
+
+## 复现记录
+1. PaddleGAN 实现中的StyleGAN模型将Style Affine层进行了模块耦合,而论文中使用到的S Space 需要用到,因此对StyleGAN 生成器代码也进行了魔改,增加style_affine 及 synthesis_from_styles 方法同时尽量兼容现有接口。
+2. StyleCLIP论文中表示使用100张图像进行Global Direction 训练在V1080Ti上约需4h,但实际使用V100训练以及官方repo中的issue都表明约需24h,该问题作者暂未给出解答。
+3. Paddle Resize处理对Tensor和ndarray的处理方法不同,默认Tensor使用BCHW模式存储而非图像的BHWC。
+4. 现有 uppfirdn2d 模块中似乎存在多次不必要的Tensor拷贝、reshape过程,希望后续能够优化运算及显存占用。
+5. 切片拷贝:paddle中对Tensor进行切片时(有时)会创建新的拷贝,此时再对其进行赋值很可能不生效,两种写法`a[ind1][ind2]=0` 和 `a[ind1, ind2]=0` 前者并不改变a中的参数。
+
+# 参考文献
+
+- 1. [StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery](https://arxiv.org/abs/2103.17249)
+
+ ```
+ @article{Patashnik2021StyleCLIPTM,
+ title={StyleCLIP: Text-Driven Manipulation of StyleGAN Imagery},
+ author={Or Patashnik and Zongze Wu and Eli Shechtman and Daniel Cohen-Or and D. Lischinski},
+ journal={2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
+ year={2021},
+ pages={2065-2074}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/styleganv2editing.md b/docs/zh_CN/tutorials/styleganv2editing.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d2c7e7f5428b043cc7e675f9096d7f06f3db4bf
--- /dev/null
+++ b/docs/zh_CN/tutorials/styleganv2editing.md
@@ -0,0 +1,91 @@
+# StyleGAN V2 Editing 模块
+
+## StyleGAN V2 Editing 原理
+
+StyleGAN V2 的任务是使用风格向量进行image generation,而Editing模块则是利用预先对多张图像的风格向量进行分类回归得到的属性操纵向量来操纵生成图像的属性。
+
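+其核心思路可以用下面这段仅作示意的最小草图理解(非 tools/styleganv2editing.py 的官方实现,向量形状与属性向量均为假设):在风格向量上沿某个属性方向偏移一定强度,即可改变对应属性。
+
+```python
+import numpy as np
+
+def edit_latent(latent, direction, offset):
+    # latent: [n_layers, style_dim] 风格向量;direction: 同形状的属性方向向量;offset: 偏移强度
+    return latent + offset * direction
+
+latent = np.random.randn(18, 512)         # 1024 分辨率对应 18 层
+age_direction = np.random.randn(18, 512)  # 实际使用时应加载预先统计好的属性向量文件
+print(edit_latent(latent, age_direction, 2.5).shape)  # (18, 512)
+```
+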
+## 使用方法
+
+### 编辑
+
+用户可以使用如下命令对图像属性进行编辑:
+
+```
+cd applications/
+python -u tools/styleganv2editing.py \
+ --latent <替换为要编辑的风格向量的路径> \
+ --output_path <替换为生成图片存放的文件夹> \
+ --weight_path <替换为你的预训练模型路径> \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --direction_path <替换为存放属性向量的文件路径> \
+ --direction_name <替换为你操纵的属性名称> \
+ --direction_offset 0.0 \
+ --cpu
+```
+
+**参数说明:**
+- latent: 要编辑的代表图像的风格向量的路径。可来自于Pixel2Style2Pixel生成的`dst.npy`或StyleGANv2 Fitting模块生成的`dst.fitting.npy`
+- latent2: 第二个风格向量的路径。来源同第一个风格向量
+- output_path: 生成图片存放的文件夹
+- weight_path: 预训练模型路径
+- model_type: PaddleGAN内置模型类型,若输入PaddleGAN已存在的模型类型,`weight_path`将失效。当前建议使用: `ffhq-config-f`
+- size: 模型参数,输出图片的分辨率
+- style_dim: 模型参数,风格z的维度
+- n_mlp: 模型参数,风格z所输入的多层感知层的层数
+- channel_multiplier: 模型参数,通道乘积,影响模型大小和生成图片质量
+- direction_path: 存放一系列属性名称及对应属性向量的文件的路径。默认为空,即使用ppgan自带的文件。若不使用,请在命令中去除
+- direction_name: 要编辑的属性名称,对于`ffhq-conf-f`有预先准备的这些属性: age、eyes_open、eye_distance、eye_eyebrow_distance、eye_ratio、gender、lip_ratio、mouth_open、mouth_ratio、nose_mouth_distance、nose_ratio、nose_tip、pitch、roll、smile、yaw
+- direction_offset: 属性的偏移强度
+- cpu: 是否使用cpu推理,若不使用,请在命令中去除
+
+## 编辑结果展示
+
+风格向量对应的图像:
+
+
+

+
+
+按[-5,-2.5,0,2.5,5]进行`age`(年龄)属性编辑得到的图像:
+
+
+

+
+
+对`-5`偏移得到的风格向量进一步进行`gender`(性别)编辑得到的图像:
+
+
+

+
+
+## 制作属性向量
+
+具体可以参考[Puzer/stylegan-encoder](https://github.com/Puzer/stylegan-encoder/blob/master/Learn_direction_in_latent_space.ipynb)中的做法。
+
+
+# 参考文献
+
+- 1. [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+
+ ```
+ @article{Karras2019stylegan2,
+ title={Analyzing and Improving the Image Quality of {StyleGAN}},
+ author={Tero Karras and Samuli Laine and Miika Aittala and Janne Hellsten and Jaakko Lehtinen and Timo Aila},
+ booktitle={Proc. CVPR},
+ year={2020}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/styleganv2fitting.md b/docs/zh_CN/tutorials/styleganv2fitting.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e76fa343378c01637655589f05766ad55a7a21a
--- /dev/null
+++ b/docs/zh_CN/tutorials/styleganv2fitting.md
@@ -0,0 +1,93 @@
+# StyleGAN V2 Fitting 模块
+
+## StyleGAN V2 Fitting 原理
+
+StyleGAN V2 的任务是使用风格向量进行image generation,而Fitting模块则是根据已有的图像反推出解耦程度高的风格向量,得到的风格向量可用于人脸融合、人脸属性编辑等任务。
+
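+拟合的基本流程可以用下面这段仅作示意的最小草图理解(非 tools/styleganv2fitting.py 的官方实现,这里用一个随机线性层代替真实的 StyleGAN 生成器,仅演示“固定生成器、以 MSE 损失迭代优化风格向量”的过程):
+
+```python
+import paddle
+import paddle.nn as nn
+
+# 用随机线性层代替真实生成器,target 代替待拟合的目标图像(均为假设)
+dummy_generator = nn.Sequential(nn.Flatten(), nn.Linear(18 * 512, 3 * 8 * 8))
+target = paddle.randn([1, 3 * 8 * 8])
+
+latent = paddle.create_parameter(shape=[1, 18, 512], dtype='float32')  # 待优化的风格向量
+optimizer = paddle.optimizer.Adam(learning_rate=0.1, parameters=[latent])
+for step in range(100):
+    image = dummy_generator(latent)
+    loss = paddle.nn.functional.mse_loss(image, target)   # MSE 损失(对应参数 mse_weight 所加权的部分)
+    loss.backward()
+    optimizer.step()
+    optimizer.clear_grad()
+print(float(loss))
+```
+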
+## 使用方法
+
+### 拟合
+
+用户可以使用如下命令进行拟合:
+
+```
+cd applications/
+python -u tools/styleganv2fitting.py \
+ --input_image <替换为输入的图像路径> \
+ --need_align \
+ --start_lr 0.1 \
+ --final_lr 0.025 \
+ --latent_level 0 1 2 3 4 5 6 7 8 9 10 11 \
+ --step 100 \
+ --mse_weight 1 \
+ --pre_latent <替换为预先准备好的风格向量> \
+ --output_path <替换为生成图片存放的文件夹> \
+ --weight_path <替换为你的预训练模型路径> \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --cpu
+```
+
+**参数说明:**
+- input_image: 输入的图像路径
+- need_align: 是否将图像裁剪为模型能识别的图像,对于输入为已经裁剪过的图像,如使用Pixel2Style2Pixel生成风格向量时预生成的`src.png`,可不填写need_align参数
+- start_lr: 拟合的初始学习率
+- final_lr: 拟合结束时的学习率
+- latent_level: 参与拟合的风格向量层次,1024分辨率下为0到17,512分辨率下则为0到15,以此类推。级别越低越偏向于整体风格改变,越高越偏向于细节风格改变
+- step: 拟合图像所需步数,步数越大,花费时间越久,效果也更好
+- mse_weight: MSE损失的权重
+- pre_latent: 预制的风格向量保存的文件,便于更好效果的拟合。默认为空,可填入使用Pixel2Style2Pixel生成的`dst.npy`文件路径
+- output_path: 生成图片存放的文件夹
+- weight_path: 预训练模型路径
+- model_type: PaddleGAN内置模型类型,若输入PaddleGAN已存在的模型类型,`weight_path`将失效。当前建议使用: `ffhq-config-f`
+- size: 模型参数,输出图片的分辨率
+- style_dim: 模型参数,风格z的维度
+- n_mlp: 模型参数,风格z所输入的多层感知层的层数
+- channel_multiplier: 模型参数,通道乘积,影响模型大小和生成图片质量
+- cpu: 是否使用cpu推理,若不使用,请在命令中去除
+
+## 拟合结果展示
+
+源图像:
+
+
+

+
+
+Pixel2Style2Pixel编码结果:
+
+
+

+
+
+利用Pixel2Style2Pixel产生的风格向量,使用Fitting模块再进行1000步拟合的结果:
+
+
+

+
+
+# 参考文献
+
+- 1. [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+
+ ```
+ @article{Karras2019stylegan2,
+ title={Analyzing and Improving the Image Quality of {StyleGAN}},
+ author={Tero Karras and Samuli Laine and Miika Aittala and Janne Hellsten and Jaakko Lehtinen and Timo Aila},
+ booktitle={Proc. CVPR},
+ year={2020}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/styleganv2mixing.md b/docs/zh_CN/tutorials/styleganv2mixing.md
new file mode 100644
index 0000000000000000000000000000000000000000..664bc9cb918e781c89299a9ff678a6e5019af73b
--- /dev/null
+++ b/docs/zh_CN/tutorials/styleganv2mixing.md
@@ -0,0 +1,105 @@
+# StyleGAN V2 Mixing 模块
+
+## StyleGAN V2 Mixing 原理
+
+StyleGAN V2 的任务是使用风格向量进行image generation,而Mixing模块则是利用其风格向量实现两张生成图像在不同层次、按不同比例的混合。
+
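+混合的基本思路可以用下面这段仅作示意的最小草图理解(非 tools/styleganv2mixing.py 的官方实现,向量形状与权重均为假设):对两组风格向量逐层按权重做线性插值,靠前的层影响整体结构,靠后的层影响细节。
+
+```python
+import numpy as np
+
+def mix_latents(latent1, latent2, weights):
+    # latent1 / latent2: [n_layers, style_dim] 的风格向量;weights: 每一层的混合比例
+    weights = np.asarray(weights).reshape(-1, 1)
+    return weights * latent1 + (1.0 - weights) * latent2
+
+latent1 = np.random.randn(18, 512)        # 1024 分辨率对应 18 层
+latent2 = np.random.randn(18, 512)
+mixed = mix_latents(latent1, latent2, [0.5] * 18)
+print(mixed.shape)                        # (18, 512)
+```
+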
+## 使用方法
+
+### 混合
+
+用户可以使用如下命令进行混合:
+
+```
+cd applications/
+python -u tools/styleganv2mixing.py \
+ --latent1 <替换为第一个风格向量的路径> \
+ --latent2 <替换为第二个风格向量的路径> \
+ --weights \
+ 0.5 0.5 0.5 0.5 0.5 0.5 \
+ 0.5 0.5 0.5 0.5 0.5 0.5 \
+ 0.5 0.5 0.5 0.5 0.5 0.5 \
+ --output_path <替换为生成图片存放的文件夹> \
+ --weight_path <替换为你的预训练模型路径> \
+ --model_type ffhq-config-f \
+ --size 1024 \
+ --style_dim 512 \
+ --n_mlp 8 \
+ --channel_multiplier 2 \
+ --cpu
+```
+
+**参数说明:**
+- latent1: 第一个风格向量的路径。可来自于Pixel2Style2Pixel生成的`dst.npy`或StyleGANv2 Fitting模块生成的`dst.fitting.npy`
+- latent2: 第二个风格向量的路径。来源同第一个风格向量
+- weights: 两个风格向量在不同的层次按不同比例进行混合。对于1024的分辨率,有18个层次,512的分辨率,有16个层次,以此类推。越前面,越影响混合图像的整体。越后面,越影响混合图像的细节。
+ 在下图中我们展示了不同权重的融合结果,可供参考。
+- output_path: 生成图片存放的文件夹
+- weight_path: 预训练模型路径
+- model_type: PaddleGAN内置模型类型,若输入PaddleGAN已存在的模型类型,`weight_path`将失效。当前建议使用: `ffhq-config-f`
+- size: 模型参数,输出图片的分辨率
+- style_dim: 模型参数,风格z的维度
+- n_mlp: 模型参数,风格z所输入的多层感知层的层数
+- channel_multiplier: 模型参数,通道乘积,影响模型大小和生成图片质量
+- cpu: 是否使用cpu推理,若不使用,请在命令中去除
+
+## 混合结果展示
+
+第一个风格向量对应的图像:
+
+
+

+
+
+第二个风格向量对应的图像:
+
+
+

+
+
+两个风格向量按特定比例混合的结果:
+
+
+

+
+
+## 不同权重拟合结果展示
+第一个风格向量对应的图像:
+
+
+

+
+
+第二个风格向量对应的图像:
+
+
+

+
+
+不同权重的混合结果:
+
+

+
+
+# 参考文献
+
+- 1. [Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/abs/1912.04958)
+
+ ```
+ @article{Karras2019stylegan2,
+ title={Analyzing and Improving the Image Quality of {StyleGAN}},
+ author={Tero Karras and Samuli Laine and Miika Aittala and Janne Hellsten and Jaakko Lehtinen and Timo Aila},
+ booktitle={Proc. CVPR},
+ year={2020}
+ }
+ ```
+- 2. [Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation](https://arxiv.org/abs/2008.00951)
+
+ ```
+ @article{richardson2020encoding,
+ title={Encoding in Style: a StyleGAN Encoder for Image-to-Image Translation},
+ author={Richardson, Elad and Alaluf, Yuval and Patashnik, Or and Nitzan, Yotam and Azar, Yaniv and Shapiro, Stav and Cohen-Or, Daniel},
+ journal={arXiv preprint arXiv:2008.00951},
+ year={2020}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/swinir.md b/docs/zh_CN/tutorials/swinir.md
new file mode 100644
index 0000000000000000000000000000000000000000..86f0f8ead621221d88e0e7659a7c3ca79e0a4449
--- /dev/null
+++ b/docs/zh_CN/tutorials/swinir.md
@@ -0,0 +1,101 @@
+[English](../../en_US/tutorials/swinir.md) | 中文
+
+## SwinIR 基于Swin Transformer的用于图像恢复的强基线模型
+
+
+## 1、简介
+
+SwinIR的结构比较简单,如果了解过Swin Transformer的话就没有什么难点。作者将Swin-T结构应用于低级视觉任务,包括图像超分辨率重建、图像去噪、图像压缩伪影去除。SwinIR网络由浅层特征提取模块、深层特征提取模块和重建模块构成,重建模块对不同的任务使用不同的结构。浅层特征提取是一个3×3的卷积层;深层特征提取由k个RSTB块和一个卷积层加残差连接构成;每个RSTB(Res-Swin-Transformer-Block)由L个STL、一层卷积和残差连接构成。模型的结构如下图所示:
+
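+为说明上述“浅层特征提取 + 深层特征提取(k 个 RSTB)+ 重建”的整体数据流,下面给出一个仅作示意的最小结构草图(非官方实现,其中用简单卷积块代替真实的 Swin Transformer 层 STL,通道数与块数均为假设):
+
+```python
+import paddle
+import paddle.nn as nn
+
+class FakeSTL(nn.Layer):
+    """占位模块:真实实现为 Swin Transformer Layer(STL)。"""
+    def __init__(self, c):
+        super().__init__()
+        self.body = nn.Sequential(nn.Conv2D(c, c, 3, padding=1), nn.ReLU())
+
+    def forward(self, x):
+        return x + self.body(x)
+
+class RSTB(nn.Layer):
+    """L 个 STL + 一层卷积 + 残差连接。"""
+    def __init__(self, c, num_stl=2):
+        super().__init__()
+        self.stls = nn.Sequential(*[FakeSTL(c) for _ in range(num_stl)])
+        self.conv = nn.Conv2D(c, c, 3, padding=1)
+
+    def forward(self, x):
+        return x + self.conv(self.stls(x))
+
+class TinySwinIRLike(nn.Layer):
+    def __init__(self, c=32, num_rstb=2):
+        super().__init__()
+        self.shallow = nn.Conv2D(3, c, 3, padding=1)            # 浅层特征提取:3x3 卷积
+        self.deep = nn.Sequential(*[RSTB(c) for _ in range(num_rstb)],
+                                  nn.Conv2D(c, c, 3, padding=1))
+        self.reconstruct = nn.Conv2D(c, 3, 3, padding=1)        # 去噪任务:输出与输入同分辨率
+
+    def forward(self, x):
+        feat = self.shallow(x)
+        feat = feat + self.deep(feat)                           # 深层特征提取 + 残差连接
+        return self.reconstruct(feat)
+
+print(TinySwinIRLike()(paddle.randn([1, 3, 64, 64])).shape)     # [1, 3, 64, 64]
+```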
+
+
+对模型更详细的介绍,可参考论文原文[SwinIR: Image Restoration Using Swin Transformer](https://arxiv.org/pdf/2108.10257.pdf),PaddleGAN中目前提供去噪任务的权重
+
+
+
+
+## 2 如何使用
+
+### 2.1 快速体验
+
+安装`PaddleGAN`之后进入`PaddleGAN`文件夹下,运行如下命令即生成修复后的图像`./output_dir/Denoising/image_name.png`
+
+```sh
+python applications/tools/swinir_denoising.py --images_path ${PATH_OF_IMAGE}
+```
+其中`PATH_OF_IMAGE`为你需要去噪的图像路径,或图像所在文件夹的路径
+
+### 2.2 数据准备
+
+#### 训练数据
+
+[DIV2K](https://cv.snu.ac.kr/research/EDSR/DIV2K.tar) (800 training images) + [Flickr2K](https://cv.snu.ac.kr/research/EDSR/Flickr2K.tar) (2650 images) + [BSD500](http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/BSR/BSR_bsds500.tgz) (400 training&testing images) + [WED](http://ivc.uwaterloo.ca/database/WaterlooExploration/exploration_database_and_code.rar)(4744 images)
+
+已经整理好的数据:放在了 [Ai Studio](https://aistudio.baidu.com/aistudio/datasetdetail/149405) 里.
+
+训练数据放在:`data/trainsets/trainH` 下
+
+#### 测试数据
+
+测试数据为 CBSD68:放在了 [Ai Studio](https://aistudio.baidu.com/aistudio/datasetdetail/147756) 里.
+
+解压到:`data/trainsets/CBSD68`
+
+- 经过处理之后,`PaddleGAN/data`文件夹下的目录结构如下
+```sh
+trainsets
+├── trainH
+| |-- 101085.png
+| |-- 101086.png
+| |-- ......
+│ └── 201085.png
+└── CBSD68
+ ├── 271035.png
+ |-- ......
+ └── 351093.png
+```
+
+
+
+### 2.3 训练
+示例以训练Denoising的数据为例。如果想训练其他任务可以更换数据集并修改配置文件
+
+```sh
+python -u tools/main.py --config-file configs/swinir_denoising.yaml
+```
+
+### 2.4 测试
+
+测试模型:
+```sh
+python tools/main.py --config-file configs/swinir_denoising.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+```
+
+## 3 结果展示
+
+去噪
+| 模型 | 数据集 | PSNR/SSIM |
+|---|---|---|
+| SwinIR | CBSD68 | 36.0819 / 0.9464 |
+
+
+## 4 模型下载
+
+| 模型 | 下载地址 |
+|---|---|
+| SwinIR| [SwinIR_Denoising](https://paddlegan.bj.bcebos.com/models/SwinIR_Denoising.pdparams) |
+
+
+
+# 参考文献
+
+- [SwinIR: Image Restoration Using Swin Transformer](https://arxiv.org/pdf/2108.10257.pdf)
+
+```
+@article{liang2021swinir,
+ title={SwinIR: Image Restoration Using Swin Transformer},
+ author={Liang, Jingyun and Cao, Jiezhang and Sun, Guolei and Zhang, Kai and Van Gool, Luc and Timofte, Radu},
+ journal={arXiv preprint arXiv:2108.10257},
+ year={2021}
+}
+```
diff --git a/docs/zh_CN/tutorials/ugatit.md b/docs/zh_CN/tutorials/ugatit.md
index b0f2ac870b47c864520a6f3832b0cc1acfd5f282..f50e970a31393ad7cfca0fc358f1aa9d258ef05c 100644
--- a/docs/zh_CN/tutorials/ugatit.md
+++ b/docs/zh_CN/tutorials/ugatit.md
@@ -1,3 +1,61 @@
-### U-GAT-IT
+# 1 U-GAT-IT
-待增加,您也可以先参考通用de[训练/评估/推理教程](../get_started.md)
+## 1.1 原理介绍
+
+ 与CycleGAN类似,[U-GAT-IT](https://arxiv.org/abs/1907.10830)使用未配对的图片进行图像风格转换,输入两个不同风格的图像,U-GAT-IT自动执行风格转换。不同的是,U-GAT-IT在历史研究的基础上以端到端的方式引入了一个新的注意模块和一个新的可学习的归一化函数。
+
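+下面给出上文提到的“可学习的归一化函数”(AdaLIN)思想的最小示意草图(仅作示意,非 PaddleGAN 官方实现,gamma/beta 的来源与各维度均为假设):用可学习的 rho 在实例归一化(IN)与层归一化(LN)之间插值,再做仿射变换。
+
+```python
+import paddle
+import paddle.nn as nn
+
+class AdaLIN(nn.Layer):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.eps = eps
+        # rho 为可学习参数,控制 IN 与 LN 的混合比例(初始值仅为假设)
+        self.rho = self.create_parameter(
+            [1, channels, 1, 1], default_initializer=nn.initializer.Constant(0.9))
+
+    def forward(self, x, gamma, beta):
+        # 实例归一化:对每个样本、每个通道的空间维度做统计
+        x_in = (x - paddle.mean(x, axis=[2, 3], keepdim=True)) / \
+               paddle.sqrt(paddle.var(x, axis=[2, 3], keepdim=True) + self.eps)
+        # 层归一化:对每个样本的通道与空间维度做统计
+        x_ln = (x - paddle.mean(x, axis=[1, 2, 3], keepdim=True)) / \
+               paddle.sqrt(paddle.var(x, axis=[1, 2, 3], keepdim=True) + self.eps)
+        rho = paddle.clip(self.rho, 0.0, 1.0)
+        out = rho * x_in + (1.0 - rho) * x_ln
+        return out * paddle.unsqueeze(gamma, [2, 3]) + paddle.unsqueeze(beta, [2, 3])
+
+x = paddle.randn([2, 64, 32, 32])
+gamma, beta = paddle.ones([2, 64]), paddle.zeros([2, 64])
+print(AdaLIN(64)(x, gamma, beta).shape)  # [2, 64, 32, 32]
+```
+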
+## 1.2 如何使用
+
+### 1.2.1 数据准备
+
+ U-GAT-IT使用的Selfie2anime数据集可以从[这里](https://www.kaggle.com/arnaud58/selfie2anime)下载,您也可以使用自己的数据集。
+
+ 数据的组成形式为:
+
+ ```
+ ├── dataset
+ └── YOUR_DATASET_NAME
+ ├── trainA
+ ├── trainB
+ ├── testA
+ └── testB
+ ```
+
+### 1.2.2 训练/测试
+
+ 示例以selfie2anime数据集为例。如果您想使用自己的数据集,可以在配置文件中修改数据集为您自己的数据集。
+
+ 训练模型:
+ ```
+ python -u tools/main.py --config-file configs/ugatit_selfie2anime_light.yaml
+ ```
+
+ 测试模型:
+ ```
+ python tools/main.py --config-file configs/ugatit_selfie2anime_light.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+## 1.3 结果展示
+
+
+
+## 1.4 模型下载
+| 模型 | 数据集 | 下载地址 |
+|---|---|---|
+| ugatit_light | selfie2anime | [ugatit_light](https://paddlegan.bj.bcebos.com/models/ugatit_light.pdparams)
+
+
+
+
+# 参考文献
+
+- 1. [U-GAT-IT: Unsupervised Generative Attentional Networks with Adaptive Layer-Instance Normalization for Image-to-Image Translation](https://arxiv.org/abs/1907.10830)
+
+ ```
+ @article{kim2019u,
+ title={U-GAT-IT: unsupervised generative attentional networks with adaptive layer-instance normalization for image-to-image translation},
+ author={Kim, Junho and Kim, Minjae and Kang, Hyeonwoo and Lee, Kwanghee},
+ journal={arXiv preprint arXiv:1907.10830},
+ year={2019}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/video_super_resolution.md b/docs/zh_CN/tutorials/video_super_resolution.md
new file mode 100644
index 0000000000000000000000000000000000000000..7789644a5b8a87eff07bd3a4be1f293a2677df3d
--- /dev/null
+++ b/docs/zh_CN/tutorials/video_super_resolution.md
@@ -0,0 +1,223 @@
+
+# 视频超分
+
+## 1.1 原理介绍
+
+ 视频超分源于图像超分,其目的是从一个或多个低分辨率(LR)图像中恢复高分辨率(HR)图像。它们的区别也很明显,由于视频是由多个帧组成的,所以视频超分通常利用帧间的信息来进行修复。
+
+ 这里我们提供百度自研SOTA超分系列模型PP-MSVSR、业界领先视频超分模型[EDVR](https://arxiv.org/pdf/1905.02716.pdf)、[BasicVSR](https://arxiv.org/pdf/2012.02181.pdf),[IconVSR](https://arxiv.org/pdf/2012.02181.pdf)和[BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf)。
+
+### ⭐ PP-MSVSR ⭐
+ 百度自研的[PP-MSVSR](https://arxiv.org/pdf/2112.02828.pdf)是一种多阶段视频超分深度架构,具有局部融合模块、辅助损失和细化对齐模块,以逐步细化增强结果。具体来说,在第一阶段设计了局部融合模块,在特征传播之前进行局部特征融合, 以加强特征传播中跨帧特征的融合。在第二阶段中引入了一个辅助损失,使传播模块获得的特征保留了更多与HR空间相关的信息。在第三阶段中引入了一个细化的对齐模块,以充分利用前一阶段传播模块的特征信息。大量实验证实,PP-MSVSR在Vid4数据集性能优异,仅使用 1.45M 参数PSNR指标即可达到28.13dB。
+
+  [PP-MSVSR](https://arxiv.org/pdf/2112.02828.pdf)提供两种体量的模型,开发者可根据实际场景灵活选择:[PP-MSVSR](https://arxiv.org/pdf/2112.02828.pdf)(参数量1.45M)与[PP-MSVSR-L](https://arxiv.org/pdf/2112.02828.pdf)(参数量7.42M)。
+
+### EDVR
+ [EDVR](https://arxiv.org/pdf/1905.02716.pdf)模型在NTIRE19视频恢复和增强挑战赛的四个赛道中都赢得了冠军,并以巨大的优势超过了第二名。视频超分的主要难点在于(1)如何在给定大运动的情况下对齐多个帧;(2)如何有效地融合具有不同运动和模糊的不同帧。首先,为了处理大的运动,EDVR模型设计了一个金字塔级联的可变形(PCD)对齐模块,在该模块中,从粗到精的可变形卷积被使用来进行特征级的帧对齐。其次,EDVR使用了时空注意力(TSA)融合模块,该模块在时间和空间上同时应用注意力机制,以强调后续恢复的重要特征。
+
+### BasicVSR
+  [BasicVSR](https://arxiv.org/pdf/2012.02181.pdf)在VSR的指导下重新考虑了四个基本模块(即传播、对齐、聚合和上采样)中一些最重要的组件。通过添加一些小设计并重用一些现有组件,得到了简洁的BasicVSR。与许多最先进的算法相比,BasicVSR在速度和恢复质量方面实现了有吸引力的改进。同时,通过添加信息重新填充机制和耦合传播方案以促进信息聚合,BasicVSR可以扩展为[IconVSR](https://arxiv.org/pdf/2012.02181.pdf),IconVSR可以作为未来VSR方法的强大基线。
+
+### BasicVSR++
+  [BasicVSR++](https://arxiv.org/pdf/2104.13371v1.pdf)通过提出二阶网格传播和光流引导的可变形对齐来重新设计BasicVSR。通过增强传播和对齐来增强循环框架,BasicVSR++可以更有效地利用未对齐视频帧的时空信息。在类似的计算约束下,新组件可带来性能提升:特别是,BasicVSR++以相似的参数量在PSNR方面比BasicVSR高0.82dB。BasicVSR++在NTIRE2021的视频超分辨率和压缩视频增强挑战赛中获得三项冠军和一项亚军。
+
+## 1.2 如何使用
+
+### 1.2.1 数据准备
+
+ 这里提供4个视频超分辨率常用数据集,REDS,Vimeo90K,Vid4,UDM10。其中REDS和vimeo90k数据集包括训练集和测试集,Vid4和UDM10为测试数据集。将需要的数据集下载解压后放到``PaddleGAN/data``文件夹下 。
+
+ REDS([数据下载](https://seungjunnah.github.io/Datasets/reds.html))数据集是NTIRE19比赛最新提出的高质量(720p)视频数据集,其由240个训练片段、30个验证片段和30个测试片段组成(每个片段有100个连续帧)。由于测试数据集不可用,这里在训练集选择了四个具有代表性的片段(分别为'000', '011', '015', '020',它们具有不同的场景和动作)作为测试集,用REDS4表示。剩下的训练和验证片段被重新分组为训练数据集(总共266个片段)。
+
+ 处理后的数据集 REDS 的组成形式如下:
+ ```
+ PaddleGAN
+ ├── data
+ ├── REDS
+ ├── train_sharp
+ | └──X4
+ ├── train_sharp_bicubic
+ | └──X4
+ ├── REDS4_test_sharp
+ | └──X4
+ └── REDS4_test_sharp_bicubic
+ └──X4
+ ...
+ ```
+
+ Vimeo90K([数据下载](http://toflow.csail.mit.edu/))数据集是Tianfan Xue等人构建的一个用于视频超分、视频降噪、视频去伪影、视频插帧的数据集。Vimeo90K是大规模、高质量的视频数据集,包含从vimeo.com下载的 89,800 个视频剪辑,涵盖了大量场景和动作。
+
+ 处理后的数据集 Vimeo90K 的组成形式如下:
+ ```
+ PaddleGAN
+ ├── data
+ ├── Vimeo90K
+ ├── vimeo_septuplet
+ | |──sequences
+ | └──sep_trainlist.txt
+ ├── vimeo_septuplet_BD_matlabLRx4
+ | └──sequences
+ └── vimeo_super_resolution_test
+ |──low_resolution
+ |──target
+ └──sep_testlist.txt
+ ...
+ ```
+
+ Vid4([数据下载](https://paddlegan.bj.bcebos.com/datasets/Vid4.zip))数据集是常用的视频超分验证数据集,包含4个视频段。
+
+ 处理后的数据集 Vid4 的组成形式如下:
+ ```
+ PaddleGAN
+ ├── data
+ ├── Vid4
+ ├── BDx4
+ └── GT
+ ...
+ ```
+
+ UDM10([数据下载](https://paddlegan.bj.bcebos.com/datasets/udm10_paddle.tar))数据集是常用的视频超分验证数据集,包含10个视频段。
+
+ 处理后的数据集 UDM10 的组成形式如下:
+ ```
+ PaddleGAN
+ ├── data
+ ├── udm10
+ ├── BDx4
+ └── GT
+ ...
+ ```
+
+### 1.2.2 训练/测试
+
+ EDVR模型根据模型中间通道数分为EDVR_L(128通道)和EDVR_M(64通道)两种模型。下面以EDVR_M模型为例介绍模型训练与测试。
+
+  EDVR模型一般分两个阶段训练:先训练不带TSA模块的模型,训练与测试命令如下:
+
+ 训练模型:
+ ```
+ python -u tools/main.py --config-file configs/edvr_m_wo_tsa.yaml
+ ```
+
+ 测试模型:
+ ```
+ python tools/main.py --config-file configs/edvr_m_wo_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT_WITHOUT_TSA}
+ ```
+
+ 然后用保存的不带TSA模块的EDVR权重作为EDVR模型的初始化,训练完整的EDVR模型,训练与测试命令如下:
+
+ 训练模型:
+ ```
+ python -u tools/main.py --config-file configs/edvr_m_w_tsa.yaml --load ${PATH_OF_WEIGHT_WITHOUT_TSA}
+ ```
+
+ 测试模型:
+ ```
+ python tools/main.py --config-file configs/edvr_m_w_tsa.yaml --evaluate-only --load ${PATH_OF_WEIGHT}
+ ```
+
+ 训练或测试其他视频超分模型,可以在``PaddleGAN/configs``文件夹下找到对应模型的配置文件,将命令中的配置文件改成该视频超分模型的配置文件即可。
+
+### 1.2.3 模型导出
+
+以msvsr模型为例,``inputs_size``为模型输入的尺寸,``model_name``为导出模型的命名,``model_path``为模型权重的路径。
+```
+python tools/export_model.py -c configs/msvsr_reds.yaml --inputs_size="1,2,3,180,320" --model_name inference --load model_path
+```
+
+### 1.2.4 模型推理
+以msvsr模型为例
+```
+python tools/inference.py --model_type msvsr -c configs/msvsr_reds.yaml --output_path output_dir
+```
+
+
+## 1.3 实验结果展示
+实验数值结果是在 RGB 通道上进行评估。
+
+度量指标为 PSNR / SSIM.
+
+REDS的测试数据集REDS4上的超分性能对比
+| 模型| 参数量(M) | 计算量(G) | REDS4 |
+|---|---|---|---|
+| EDVR_M_wo_tsa_SRx4 | 3.00 | 223 | 30.4429 / 0.8684 |
+| EDVR_M_w_tsa_SRx4 | 3.30 | 232 | 30.5169 / 0.8699 |
+| EDVR_L_wo_tsa_SRx4 | 19.42 | 974 | 30.8649 / 0.8761 |
+| EDVR_L_w_tsa_SRx4 | 20.63 | 1010 | 30.9336 / 0.8773 |
+| BasicVSR_x4 | 6.29 | 374 | 31.4325 / 0.8913 |
+| IconVSR_x4 | 8.69 | 516 | 31.6882 / 0.8950 |
+| BasicVSR++_x4 | 7.32 | 406 | 32.4018 / 0.9071 |
+| PP-MSVSR_reds_x4 | 1.45 | 111 | 31.2535 / 0.8884 |
+| PP-MSVSR-L_reds_x4 | 7.42 | 543 | 32.5321 / 0.9083 |
+
+REDS的测试数据集REDS4上的去模糊性能对比
+| 模型 | REDS4 |
+|---|---|
+| EDVR_L_wo_tsa_deblur | 34.9587 / 0.9509 |
+| EDVR_L_w_tsa_deblur | 35.1473 / 0.9526 |
+
+Vimeo90K,Vid4,UDM10测试数据集上超分性能对比
+| 模型 | Vimeo90K | Vid4 | UDM10 |
+|---|---|---|---|
+| PP-MSVSR_vimeo90k_x4 |37.54/0.9499|28.13/0.8604|40.06/0.9699|
+
+## 1.4 模型下载
+| 模型 | 数据集 | 下载地址 |
+|---|---|---|
+| EDVR_M_wo_tsa_SRx4 | REDS | [EDVR_M_wo_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_M_wo_tsa_SRx4.pdparams)
+| EDVR_M_w_tsa_SRx4 | REDS | [EDVR_M_w_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_M_w_tsa_SRx4.pdparams)
+| EDVR_L_wo_tsa_SRx4 | REDS | [EDVR_L_wo_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_L_wo_tsa_SRx4.pdparams)
+| EDVR_L_w_tsa_SRx4 | REDS | [EDVR_L_w_tsa_SRx4](https://paddlegan.bj.bcebos.com/models/EDVR_L_w_tsa_SRx4.pdparams)
+| EDVR_L_wo_tsa_deblur | REDS | [EDVR_L_wo_tsa_deblur](https://paddlegan.bj.bcebos.com/models/EDVR_L_wo_tsa_deblur.pdparams)
+| EDVR_L_w_tsa_deblur | REDS | [EDVR_L_w_tsa_deblur](https://paddlegan.bj.bcebos.com/models/EDVR_L_w_tsa_deblur.pdparams)
+| BasicVSR_x4 | REDS | [BasicVSR_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams)
+| IconVSR_x4 | REDS | [IconVSR_x4](https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams)
+| BasicVSR++_x4 | REDS | [BasicVSR++_x4](https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams)
+| PP-MSVSR_reds_x4 | REDS | [PP-MSVSR_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams)
+| PP-MSVSR-L_reds_x4 | REDS | [PP-MSVSR-L_reds_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams)
+| PP-MSVSR_vimeo90k_x4 | Vimeo90K | [PP-MSVSR_vimeo90k_x4](https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams)
+
+# 参考文献
+
+- 1. [EDVR: Video Restoration with Enhanced Deformable Convolutional Networks](https://arxiv.org/pdf/1905.02716.pdf)
+
+ ```
+ @InProceedings{wang2019edvr,
+ author = {Wang, Xintao and Chan, Kelvin C.K. and Yu, Ke and Dong, Chao and Loy, Chen Change},
+ title = {EDVR: Video Restoration with Enhanced Deformable Convolutional Networks},
+ booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
+ month = {June},
+ year = {2019}
+ }
+ ```
+- 2. [BasicVSR: The Search for Essential Components in Video Super-Resolution and Beyond](https://arxiv.org/pdf/2012.02181.pdf)
+
+ ```
+ @InProceedings{chan2021basicvsr,
+ author = {Chan, Kelvin C.K. and Wang, Xintao and Yu, Ke and Dong, Chao and Loy, Chen Change},
+ title = {BasicVSR: The Search for Essential Components in Video Super-Resolution and Beyond},
+ booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
+ year = {2021}
+ }
+ ```
+- 3. [BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment](https://arxiv.org/pdf/2104.13371v1.pdf)
+
+ ```
+ @article{chan2021basicvsr++,
+ author = {Chan, Kelvin C.K. and Zhou, Shangchen and Xu, Xiangyu and Loy, Chen Change},
+ title = {BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation and Alignment},
+ booktitle = {arXiv preprint arXiv:2104.13371},
+ year = {2021}
+ }
+ ```
+
+- 4. [PP-MSVSR: Multi-Stage Video Super-Resolution](https://arxiv.org/pdf/2112.02828.pdf)
+
+ ```
+ @article{jiang2021PP-MSVSR,
+ author = {Jiang, Lielin and Wang, Na and Dang, Qingqing and Liu, Rui and Lai, Baohua},
+ title = {PP-MSVSR: Multi-Stage Video Super-Resolution},
+ booktitle = {arXiv preprint arXiv:2112.02828},
+ year = {2021}
+ }
+ ```
diff --git a/docs/zh_CN/tutorials/wav2lip.md b/docs/zh_CN/tutorials/wav2lip.md
index deee5ef802693c0e5e8f39701f5ca3f7cf714e56..fcc29dc6869b4febd24e17bd24605527950680d3 100644
--- a/docs/zh_CN/tutorials/wav2lip.md
+++ b/docs/zh_CN/tutorials/wav2lip.md
@@ -13,11 +13,17 @@ Wav2Lip实现的是视频人物根据输入音频生成与语音同步的人物
```
cd applications
-python tools/wav2lip.py --face ../../imgs/mona7s.mp4 --audio ../../imgs/guangquan.m4a --outfile pp_guangquan_mona7s.mp4
+python tools/wav2lip.py \
+ --face ../docs/imgs/mona7s.mp4 \
+ --audio ../docs/imgs/guangquan.m4a \
+ --outfile pp_guangquan_mona7s.mp4 \
+ --face_enhancement
```
**参数说明:**
-- face: 原始视频,视频中都人物都唇形将根据音频进行唇形合成,以和音频同步
+- face: 视频或图片,视频或图片中的人物唇形将根据音频进行唇形合成,以和音频同步
- audio: 驱动唇形合成的音频,视频中的人物将根据此音频进行唇形合成
+- outfile: 合成的视频
+- face_enhancement: 添加人脸增强,不添加参数默认为不使用增强功能
### 2.2 训练
1. 我们的模型是基于LRS2数据集训练的。可以参考[这里](https://github.com/Rudrabha/Wav2Lip#training-on-datasets-other-than-lrs2)获得在其它训练集上进行训练的一些建议。
@@ -38,14 +44,13 @@ preprocessed_root (lrs2_preprocessed)
- GPU单卡训练:
```
export CUDA_VISIBLE_DEVICES=0
-python tools/main.py --confit-file configs/wav2lip.yaml
+python tools/main.py --config-file configs/wav2lip.yaml
```
- GPU多卡训练:
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch \
- --log_dir ./mylog_dd.log \
tools/main.py \
--config-file configs/wav2lip.yaml \
@@ -54,13 +59,12 @@ python -m paddle.distributed.launch \
- GPU单卡训练:
```
export CUDA_VISIBLE_DEVICES=0
-python tools/main.py --confit-file configs/wav2lip_hq.yaml
+python tools/main.py --config-file configs/wav2lip_hq.yaml
```
- GPU多卡训练:
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch \
- --log_dir ./mylog_dd.log \
tools/main.py \
--config-file configs/wav2lip_hq.yaml \
@@ -70,7 +74,7 @@ python -m paddle.distributed.launch \
### 2.3 模型
Model|Dataset|BatchSize|Inference speed|Download
---|:--:|:--:|:--:|:--:
-wa2lip_hq|LRS2| 1 | 0.2853s/image (GPU:P40) | [model](https://paddlegan.bj.bcebos.com/models/psgan_weight.pdparam://paddlegan.bj.bcebos.com/models/wav2lip_hq.pdparams)
+wa2lip_hq|LRS2| 1 | 0.2853s/image (GPU:P40) | [model](https://paddlegan.bj.bcebos.com/models/wav2lip_hq.pdparams)
## 3. 结果展示
diff --git a/education/README.md b/education/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..15c59ccbffd867d793e8ad9e1f780aa3789d0fba
--- /dev/null
+++ b/education/README.md
@@ -0,0 +1,244 @@
+# PaddleGAN七日打卡营
+
+欢迎加入《百度飞桨生成对抗网络七日打卡营》,PaddlePaddle研发团队直播授课,4月15日起,每天1小时,连续7天,深入剖析多种经典及前沿 GAN 模型算法及代码。
+
+直播链接:http://live.bilibili.com/21689802
+直播日期:4月15日~4月20日直播授课,4月22日结营直播。
+
+致敬开源、鼓励分享:开课期间发布原创优秀技术经验贴、有价值的项目、视频等,即可获得额外加分,加分直接体现在总成绩上。希望你收获更强的自己,如有问题,欢迎加入课程qq群(651940985)交流讨论。
+
+## 课程安排
+
+4月15日 20:30~21:30:Day 1 GAN基础概念及应用介绍
+
+4月16日 20:30~21:30:Day 2 GAN的技术演进及人脸生成应用
+
+4月17日 20:30~21:30:Day 3 图像翻译及卡通画应用
+
+4月18日 20:30~21:30:Day 4 超分辨率及老视频修复
+
+4月19日 20:30~21:30:Day 5 动作迁移理论及实践
+
+4月20日 20:30~21:30:Day 6 Wav2lip唇形合成理论及趣味应用
+
+4月22日 19:00~20:00:Day 7 作业讲评与拓展提升
+
+## 作业
+
+作业提交规则:作业提交均在AI Studio上,前5天的作业的截止提交日期均在第二天课程开始前,第6天的大作业将预留一天时间给大家完成,在结营前完成即可。
+
+作业评讲:每日作业评讲均在第二天的课程开头10min。
+
+**Bonus🤩:鼓励大家除了在AI Studio上完成作业外,也可自行创建GitHub repo,在repo上完成作业(内容可和AI Studio一样),并将PaddleGAN加入你的repo的requirements.txt中,最后将repo链接放在AI Studio项目中即可获得加分,最终有机会获得大奖°˖✧◝(⁰▿⁰)◜✧˖°。**
+
+Day 1:[客观题_理论层面的单选题](./homework1.md)
+
+Day 2:[代码题_基于DCGAN,改写为LS-GAN](./homework2.md)
+
+Day 3:[代码题_填空补全基于pix2pix实现人脸卡通的预测代码](./homework3.md)
+
+Day 4:[客观题+项目展示(照片、视频)](./homework4.md)
+
+Day 5:客观题+项目展示(照片、视频)
+
+Day 6-大作业:自选PaddleGAN里面的模型实现超分
+
+## 😍GitHub提交作业指南😍
+
+**各位学员们,加分的机会来啦!在自己的GitHub repo上上传完成的作业,并在requirements.txt中加上ppgan即可获得额外加分5分🤓**
+
+由于考虑到大家对于GitHub可能不太熟悉,所以,我们将所有作业**在GitHub上的提交截止日期放在了结营日前一天--4.22日**,下面就来简单介绍如何在GitHub上提交作业。
+
+### Step 0:创建GitHub账号
+
+对于还未创建GitHub账号的同学们,可以先创建账号,创建后,**可以先对[PaddleGAN](https://github.com/PaddlePaddle/PaddleGAN)点击Star完成关注哟⸜(ّᶿധّᶿ)⸝**,这样你就能及时掌握repo的咨询更新啦~
+
+### Step 1:创建自己的GitHub repo
+
+对于还未有自己的repo的同学(fork他人项目不算噢)可以点进自己的主页,点击【New】创建一个新的repo,并予以命名。
+
+
+
+### Step 3:新增requirements.txt文件
+
+Requirements.txt文件中包含了你创建的repo中所引用/使用其他开发者的repo名称,代表了对于其他开发者的内容版权的尊重,类似于【转载自】。
+
+点击Step 1中新创建的repo,选择【Add file】中的【Create new file】,将其命名为【requirements.txt】。
+
+
+
+这样,文件列表中即新增【requirements.txt】~
+
+
+
+### Step 4:在requirements.txt文件中加上”ppgan“
+
+点击step 3中创建的【requirements.txt】,在其中增加”ppgan“即可。
+
+
+
+### Step 5:提交作业
+
+从AI Studio中将作业导出为markdown格式:
+
+
+
+回到Step 1中创建的repo的主页,同创建【requirements.txt】的步骤一样,选择【Add file】中的【Upload files】上传自己的作业即可。
+
+
+
+### Step 6:在AI Studio提交作业中增加GitHub repo作业链接以及requirements.txt截图
+
+这样就完成啦,就能获得5分加分噢!
+
+
+
+**⭐总结下,所需提交内容⭐:**
+
+1. **GitHub作业链接**
+2. **requirements.txt中加上ppgan的截图**
+
+**期待大家的作业噢(〜 ̄▽ ̄)〜〜( ̄▽ ̄〜)**
+
+
+
+## 奖品列表
+
+为了鼓励大家积极认真的参与课程,完成课后作业,我们为大家准备了丰厚的奖品!
+
+我们会根据大家课程打卡以及作业完成的情况计算大家的总成绩,给大家发奖~
+
+一等奖1名:HHKB Professional静电容蓝牙键盘
+
+
+
+二等奖2名:Kindle paperwhite 电子阅读器
+
+
+
+三等奖6名:小度无线智能耳机
+
+
+
+优秀奖10名:纸质书_《Generative Deep Learning》+《Deep Learning》
+
+完成课程的同学还将获得我们精美的结业证书!
+
+
+
+## 相关模型资料
+
+以下是课程中所涉及到的所有模型简介、代码链接及论文。
+
+**注意:实际代码请参考Config文件进行配置。**
+
+### Wasserstein GAN
+
+论文:[Wasserstein GAN](https://arxiv.org/pdf/1701.07875.pdf)
+简介:本文从理论上分析了原始 GAN 模型存在的训练不稳定、生成器和判别器的 loss 无法指示训练进程、生成样本缺乏多样性等问题,并通过改进算法流程针对性地给出了改进要点。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/wgan_mnist.yaml
+
+### DCGAN
+
+论文:[UNSUPERVISED REPRESENTATION LEARNING WITH DEEP CONVOLUTIONAL GENERATIVE ADVERSARIAL NETWORKS](https://arxiv.org/pdf/1511.06434.pdf)
+简介:由于卷积神经网络(Convolutional neural network, CNN)比MLP有更强的拟合与表达能力,并在判别式模型中取得了很大的成果。因此,本文将CNN引入生成器和判别器,称作深度卷积对抗神经网络(Deep Convolutional GAN, DCGAN)。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/dcgan_mnist.yaml
+
+### Least Squares GAN
+
+论文:[Least Squares Generative Adversarial Networks](https://arxiv.org/pdf/1611.04076.pdf)
+简介:本文主要将交叉熵损失函数换做了最小二乘损失函数,改善了传统 GAN 生成的图片质量不高,且训练过程十分不稳定的问题。
+
+### Progressive Growing of GAN
+
+论文:[PROGRESSIVE GROWING OF GANS FOR IMPROVED QUALITY, STABILITY, AND VARIATION](https://arxiv.org/pdf/1710.10196.pdf)
+简介:本文提出了一种用来训练生成对抗网络的新方法:渐进式地增加生成器和判别器的规模,同时,提出了一种提高生成图像多样性的方法以及给出一种新的关于图像生成质量和多样性的评价指标。
+
+### StyleGAN
+
+论文:[A Style-Based Generator Architecture for Generative Adversarial Networks](https://arxiv.org/pdf/1812.04948.pdf)
+简介:本文是NVIDIA继ProGAN之后提出的新的生成网络,其主要通过分别修改每一层级的输入,在不影响其他层级的情况下,来控制该层级所表示的视觉特征。 这些特征可以是粗的特征(如姿势、脸型等),也可以是一些细节特征(如瞳色、发色等)。
+
+### StyleGAN2
+
+论文:[Analyzing and Improving the Image Quality of StyleGAN](https://arxiv.org/pdf/1912.04958.pdf)
+简介:本文主要解决StyleGAN生成图像伪影的同时还能得到细节更好的高质量图像。新的改进方案也不会带来更高的计算成本。不管是在现有的分布质量指标上,还是在人所感知的图像质量上,新提出的模型都实现了无条件图像建模任务上新的 SOTA。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/stylegan_v2_256_ffhq.yaml
+
+### Conditional GAN
+
+论文:[Conditional Generative Adversarial Nets](https://arxiv.org/pdf/1411.1784.pdf)
+简介:本文提出在利用 GAN(对抗网络)的方法时,在生成模型G和判别模型D中都加入条件信息来引导模型的训练,并将这种方法应用于跨模态问题,例如图像自动标注等。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/cond_dcgan_mnist.yaml
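+
+一个仅作示意的最小草图(非上述配置的官方实现,维度均为假设):把类别标签的one-hot向量与噪声拼接后作为生成器的条件输入。
+
+```python
+import paddle
+import paddle.nn.functional as F
+
+noise = paddle.randn([8, 100])
+labels = paddle.randint(0, 10, [8])
+condition = F.one_hot(labels, num_classes=10).astype('float32')
+gen_input = paddle.concat([noise, condition], axis=1)   # [8, 110],再送入生成器
+print(gen_input.shape)
+```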
+
+### CycleGAN
+
+论文:[Unpaired Image-to-Image Translation using Cycle-Consistent Adversarial Networks](https://arxiv.org/pdf/1703.10593.pdf)
+ 简介:CycleGAN本质上是两个镜像对称的GAN,构成了一个环形网络。两个GAN共享两个生成器,并各自带一个判别器,即共有两个判别器和两个生成器。每个单向GAN有两个loss,两个GAN共计四个loss。与Pixel2Pixel相比,CycleGAN的一个典型优点是可以在无配对的两个图片集上进行训练。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/cyclegan_horse2zebra.yaml
+
+### Pix2Pix
+
+论文:[Image-to-Image Translation with Conditional Adversarial Networks](https://arxiv.org/pdf/1611.07004.pdf)
+简介:本文在GAN的基础上提供一个通用方法,完成成对的图像转换。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/pix2pix_cityscapes_2gpus.yaml
+
+### U-GAT-IT
+
+论文:[U-GAT-IT: UNSUPERVISED GENERATIVE ATTENTIONAL NETWORKS WITH ADAPTIVE LAYERINSTANCE NORMALIZATION FOR IMAGE-TO-IMAGE TRANSLATION](https://arxiv.org/pdf/1907.10830.pdf)
+简介:本文主要研究无监督的image-to-image translation。在风格转换中引入了注意力模块,并且提出了一种新的可学习的normalization方法。注意力模块根据辅助分类器获得的attention map,使模型能够更好地聚焦并区分源域和目标域的重要区域。同时,AdaLIN(自适应层实例归一化)帮助注意力指导模型根据所学习的数据集灵活地控制形状和纹理的变化量。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/ugatit_selfie2anime_light.yaml
+
+### Super Resolution GAN
+
+论文:[Photo-Realistic Single Image Super-Resolution Using a Generative Adversarial Network](https://arxiv.org/pdf/1609.04802.pdf)
+简介:本文主要讲解如何利用卷积神经网络实现单影像的超分辨率,其瓶颈仍在于如何恢复图像的细微纹理信息。
+
+### Enhanced Super Resolution GAN
+
+论文:[ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks](https://arxiv.org/pdf/1809.00219.pdf)
+简介:本文在SRGAN的基础上进行了改进,包括改进网络的结构,判决器的判决形式,以及更换了一个用于计算感知域损失的预训练网络。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/esrgan_x4_div2k.yaml
+
+### Residual Channel Attention Networks(RCAN)
+
+论文:[Image Super-Resolution Using Very Deep Residual Channel Attention Networks](https://arxiv.org/pdf/1807.02758.pdf)
+简介:本文提出了一个深度残差通道注意力网络(RCAN)解决过深的网络难以训练、网络的表示能力较弱的问题。
+
+### EDVR
+
+论文:[EDVR: Video Restoration with Enhanced Deformable Convolutional Networks](https://arxiv.org/pdf/1905.02716.pdf)
+简介:本文主要介绍基于可形变卷积的视频恢复、去模糊、超分的网络。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/edvr.yaml
+
+### First Order Motion
+
+论文:[First Order Motion Model for Image Animation](https://arxiv.org/pdf/2003.00196.pdf)
+简介:本文介绍的是image animation任务:给定一张源图片和一个驱动视频,生成一段以源图片中的主体为主角、动作来自驱动视频的视频。源图像通常包含一个主体,驱动视频则包含一系列动作。
+
+### Wav2lip
+
+论文:[A Lip Sync Expert Is All You Need for Speech to Lip Generation In The Wild](https://arxiv.org/pdf/2008.10010.pdf)
+简介:本文主要介绍如何将任意说话的面部视频与任意语音进行唇形同步。
+
+代码链接:https://github.com/PaddlePaddle/PaddleGAN/blob/develop/configs/wav2lip.yaml
+
+
+## 优秀作业公示
+
+敬请期待
+
+## 奖品名单公布
+
+敬请期待
+
+
diff --git a/education/homework1.md b/education/homework1.md
new file mode 100644
index 0000000000000000000000000000000000000000..66a4f2c951decc99b33aca6120d1ddb4ebb7de91
--- /dev/null
+++ b/education/homework1.md
@@ -0,0 +1,99 @@
+# GAN基础理论客观题
+
+**1. (多选)GAN的基础结构包括()**
+
+A. 生成器
+
+B. 判别器
+
+C. 编码器
+
+D. 解码器
+
+参考答案:AB
+
+**2.(多选)GAN的应用包括()**
+
+A. 换脸
+
+B. 动作迁移
+
+C. 图像翻译
+
+D. 超分辨率
+
+参考答案:ABCD
+
+**3. (多选)生成对抗网络中的生成模型可以()**
+
+A. 输入噪声生成图像
+
+B. 输入噪声和标签生成图像
+
+C. 输入图像生成图像
+
+D. 输入文字描述生成图像
+
+参考答案:ABCD
+
+**4. (单选)下列哪一项是GAN的判别器的损失函数()**
+
+A. 
+
+B. 
+
+C. 
+
+D. 
+
+
+参考答案:C
+
+
+**5. (多选)下列关于GAN中对抗的描述正确的是()**
+
+A. 生成器与判别器互相对抗,在对抗中增强
+
+B. 两个神经网络通过相互博弈的方式进行学习
+
+C. 像警察与假钞,在对抗中增强警察的鉴别能力和小偷造假能力
+
+D. 像自然界中捕食者与被捕食者在对抗中的进化
+
+参考答案:ABCD
+
+**6. (多选)下列关于GAN的描述正确的是()**
+
+A. 生成网络希望Fake image的score尽可能的大
+
+B. 生成网络希望Fake image的score尽可能的小
+
+C. 判别网络希望Fake image的score尽可能的大
+
+D. 判别网络希望Fake image的score尽可能的小
+
+参考答案:AD
+
+**7. (多选)下列关于DCGAN的说法正确的有()**
+
+A. 使用卷积代替全连接层
+
+B. 添加BatchNorm
+
+C. 在生成器中使用Relu
+
+D. 在判别器中使用Relu
+
+参考答案:ABC
+
+**8. (单选)GAN和auto encoder结构本质的区别是()**
+
+A. 网络结构不同
+
+B. 输入不同
+
+C. 对数据集的要求不同
+
+D. 损失函数不同
+
+参考答案:D
diff --git a/education/homework2.md b/education/homework2.md
new file mode 100644
index 0000000000000000000000000000000000000000..6d80d26eb609852c04480d05d32cdbcee1bbe8b3
--- /dev/null
+++ b/education/homework2.md
@@ -0,0 +1,231 @@
+# DCGAN代码改写LSGAN的损失函数
+
+可以看下有提示的地方。
+
+
+```python
+#导入一些必要的包
+import os
+import random
+import paddle
+import paddle.nn as nn
+import paddle.optimizer as optim
+import paddle.vision.datasets as dset
+import paddle.vision.transforms as transforms
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.animation as animation
+```
+
+
+```python
+dataset = paddle.vision.datasets.MNIST(mode='train',
+ transform=transforms.Compose([
+ # resize ->(32,32)
+ transforms.Resize((32,32)),
+ # 归一化到-1~1
+ transforms.Normalize([127.5], [127.5])
+ ]))
+
+dataloader = paddle.io.DataLoader(dataset, batch_size=32,
+ shuffle=True, num_workers=4)
+```
+
+
+```python
+#参数初始化的模块
+@paddle.no_grad()
+def normal_(x, mean=0., std=1.):
+ temp_value = paddle.normal(mean, std, shape=x.shape)
+ x.set_value(temp_value)
+ return x
+
+@paddle.no_grad()
+def uniform_(x, a=-1., b=1.):
+ temp_value = paddle.uniform(min=a, max=b, shape=x.shape)
+ x.set_value(temp_value)
+ return x
+
+@paddle.no_grad()
+def constant_(x, value):
+ temp_value = paddle.full(x.shape, value, x.dtype)
+ x.set_value(temp_value)
+ return x
+
+def weights_init(m):
+ classname = m.__class__.__name__
+ if hasattr(m, 'weight') and classname.find('Conv') != -1:
+ normal_(m.weight, 0.0, 0.02)
+ elif classname.find('BatchNorm') != -1:
+ normal_(m.weight, 1.0, 0.02)
+ constant_(m.bias, 0)
+```
+
+
+```python
+# Generator Code
+class Generator(nn.Layer):
+ def __init__(self, ):
+ super(Generator, self).__init__()
+ self.gen = nn.Sequential(
+ # input is Z, [B, 100, 1, 1] -> [B, 64 * 4, 4, 4]
+ nn.Conv2DTranspose(100, 64 * 4, 4, 1, 0, bias_attr=False),
+ nn.BatchNorm2D(64 * 4),
+ nn.ReLU(True),
+ # state size. [B, 64 * 4, 4, 4] -> [B, 64 * 2, 8, 8]
+ nn.Conv2DTranspose(64 * 4, 64 * 2, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64 * 2),
+ nn.ReLU(True),
+ # state size. [B, 64 * 2, 8, 8] -> [B, 64, 16, 16]
+ nn.Conv2DTranspose( 64 * 2, 64, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64),
+ nn.ReLU(True),
+ # state size. [B, 64, 16, 16] -> [B, 1, 32, 32]
+ nn.Conv2DTranspose( 64, 1, 4, 2, 1, bias_attr=False),
+ nn.Tanh()
+ )
+
+ def forward(self, x):
+ return self.gen(x)
+
+
+netG = Generator()
+# Apply the weights_init function to randomly initialize all weights
+# to mean=0, stdev=0.02.
+netG.apply(weights_init)
+
+# Print the model
+print(netG)
+```
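+
+可以先用一小段代码检查生成器的输出尺寸是否符合预期(示意,假定 netG 已如上定义):
+
+```python
+# 输入 [4, 100, 1, 1] 的随机噪声,期望得到 [4, 1, 32, 32] 的图像
+z = paddle.randn([4, 100, 1, 1], dtype='float32')
+print(netG(z).shape)   # [4, 1, 32, 32]
+```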
+
+
+```python
+class Discriminator(nn.Layer):
+ def __init__(self,):
+ super(Discriminator, self).__init__()
+ self.dis = nn.Sequential(
+
+ # input [B, 1, 32, 32] -> [B, 64, 16, 16]
+ nn.Conv2D(1, 64, 4, 2, 1, bias_attr=False),
+ nn.LeakyReLU(0.2),
+
+ # state size. [B, 64, 16, 16] -> [B, 128, 8, 8]
+ nn.Conv2D(64, 64 * 2, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64 * 2),
+ nn.LeakyReLU(0.2),
+
+ # state size. [B, 128, 8, 8] -> [B, 256, 4, 4]
+ nn.Conv2D(64 * 2, 64 * 4, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64 * 4),
+ nn.LeakyReLU(0.2),
+
+ # state size. [B, 256, 4, 4] -> [B, 1, 1, 1]
+ nn.Conv2D(64 * 4, 1, 4, 1, 0, bias_attr=False),
+ # 这里为需要改变的地方
+ nn.Sigmoid()
+ )
+
+ def forward(self, x):
+ return self.dis(x)
+
+netD = Discriminator()
+netD.apply(weights_init)
+print(netD)
+```
+
+
+```python
+# Initialize BCELoss function
+# 这里为需要改变的地方
+loss = nn.BCELoss()
+
+# Create batch of latent vectors that we will use to visualize
+# the progression of the generator
+fixed_noise = paddle.randn([32, 100, 1, 1], dtype='float32')
+
+# Establish convention for real and fake labels during training
+real_label = 1.
+fake_label = 0.
+
+# Setup Adam optimizers for both G and D
+optimizerD = optim.Adam(parameters=netD.parameters(), learning_rate=0.0002, beta1=0.5, beta2=0.999)
+optimizerG = optim.Adam(parameters=netG.parameters(), learning_rate=0.0002, beta1=0.5, beta2=0.999)
+
+```
+
+
+```python
+losses = [[], []]
+#plt.ion()
+now = 0
+for pass_id in range(100):
+ for batch_id, (data, target) in enumerate(dataloader):
+ ############################
+ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
+ ###########################
+
+ optimizerD.clear_grad()
+ real_img = data
+ bs_size = real_img.shape[0]
+ label = paddle.full((bs_size, 1, 1, 1), real_label, dtype='float32')
+ real_out = netD(real_img)
+ errD_real = loss(real_out, label)
+ errD_real.backward()
+
+ noise = paddle.randn([bs_size, 100, 1, 1], 'float32')
+ fake_img = netG(noise)
+ label = paddle.full((bs_size, 1, 1, 1), fake_label, dtype='float32')
+ fake_out = netD(fake_img.detach())
+ errD_fake = loss(fake_out,label)
+ errD_fake.backward()
+ optimizerD.step()
+ optimizerD.clear_grad()
+
+ errD = errD_real + errD_fake
+ losses[0].append(float(errD))
+
+ ############################
+ # (2) Update G network: maximize log(D(G(z)))
+ ###########################
+ optimizerG.clear_grad()
+ noise = paddle.randn([bs_size, 100, 1, 1],'float32')
+ fake = netG(noise)
+ label = paddle.full((bs_size, 1, 1, 1), real_label, dtype='float32')
+ output = netD(fake)
+ errG = loss(output,label)
+ errG.backward()
+ optimizerG.step()
+ optimizerG.clear_grad()
+
+ losses[1].append(float(errG))
+
+
+ ############################
+ # visualize
+ ###########################
+ if batch_id % 100 == 0:
+ generated_image = netG(noise).numpy()
+ imgs = []
+ plt.figure(figsize=(15,15))
+ try:
+ for i in range(10):
+ image = generated_image[i].transpose()
+ image = np.where(image > 0, image, 0)
+ image = image.transpose((1,0,2))
+ plt.subplot(10, 10, i + 1)
+
+ plt.imshow(image[...,0], vmin=-1, vmax=1)
+ plt.axis('off')
+ plt.xticks([])
+ plt.yticks([])
+ plt.subplots_adjust(wspace=0.1, hspace=0.1)
+ msg = 'Epoch ID={0} Batch ID={1} \n\n D-Loss={2} G-Loss={3}'.format(pass_id, batch_id, float(errD), float(errG))
+ print(msg)
+ plt.suptitle(msg,fontsize=20)
+ plt.draw()
+ plt.savefig('{}/{:04d}_{:04d}.png'.format('work', pass_id, batch_id), bbox_inches='tight')
+ plt.pause(0.01)
+ except IOError as e:
+ print(e)
+ paddle.save(netG.state_dict(), "work/generator.params")
+```
\ No newline at end of file
diff --git a/education/homework3.md b/education/homework3.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d7c54efe5a1a28fca07ca75dccd6c01ca7aaa4e
--- /dev/null
+++ b/education/homework3.md
@@ -0,0 +1,394 @@
+# Day 3 作业--Pixel2Pixel:人像卡通化
+
+经过今天的学习,相信大家对图像翻译、风格迁移有了一定的了解啦,是不是也想自己动手来实现下呢?
+
+那么,为了满足大家动手实践的愿望,同时为了巩固大家学到的知识,我们Day 3的作业便是带大家完成一遍课程讲解过的应用--**Pixel2Pixel:人像卡通化**
+
+在本次作业中,大家需要做的是:**补齐代码,跑通训练,提交一张卡通化的成品图,动手完成自己的第一个人像卡通化的应用~**
+
+
+
+## 准备工作:引入依赖 & 数据准备
+
+
+```python
+import paddle
+import paddle.nn as nn
+from paddle.io import Dataset, DataLoader
+
+import os
+import cv2
+import numpy as np
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+
+%matplotlib inline
+```
+
+### 数据准备:
+
+- 真人数据来自[seeprettyface](http://www.seeprettyface.com/mydataset.html)。
+- 数据预处理(详情见[photo2cartoon](https://github.com/minivision-ai/photo2cartoon)项目)。
+
+
+

+
+
+
+- 使用[photo2cartoon](https://github.com/minivision-ai/photo2cartoon)项目生成真人数据对应的卡通数据。
+
+
+```python
+# 解压数据
+!unzip -q data/data79149/cartoon_A2B.zip -d data/
+```
+
+### 数据可视化
+
+
+```python
+# 训练数据统计
+train_names = os.listdir('data/cartoon_A2B/train')
+print(f'训练集数据量: {len(train_names)}')
+
+# 测试数据统计
+test_names = os.listdir('data/cartoon_A2B/test')
+print(f'测试集数据量: {len(test_names)}')
+
+# 训练数据可视化
+imgs = []
+for img_name in np.random.choice(train_names, 3, replace=False):
+ imgs.append(cv2.imread('data/cartoon_A2B/train/'+img_name))
+
+img_show = np.vstack(imgs)[:,:,::-1]
+plt.figure(figsize=(10, 10))
+plt.imshow(img_show)
+plt.show()
+```
+
+
+```python
+class PairedData(Dataset):
+ def __init__(self, phase):
+ super(PairedData, self).__init__()
+ self.img_path_list = self.load_A2B_data(phase) # 获取数据列表
+ self.num_samples = len(self.img_path_list) # 数据量
+
+ def __getitem__(self, idx):
+ img_A2B = # 读取一组数据
+ img_A2B = # 从0~255归一化至-1~1
+ img_A2B = # 维度变换HWC -> CHW
+ img_A = # 真人照
+ img_B = # 卡通图
+ return img_A, img_B
+
+ def __len__(self):
+ return self.num_samples
+
+ @staticmethod
+ def load_A2B_data(phase):
+ assert phase in ['train', 'test'], "phase should be set within ['train', 'test']"
+ # 读取数据集,数据中每张图像包含照片和对应的卡通画。
+ data_path = 'data/cartoon_A2B/'+phase
+ return [os.path.join(data_path, x) for x in os.listdir(data_path)]
+```
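+
+下面给出 `__getitem__` 的一种参考补全(示意;假定每张图是“真人照 | 卡通画”按宽度 256+256 左右拼接,与后文测试代码一致):
+
+```python
+    def __getitem__(self, idx):
+        img_A2B = cv2.imread(self.img_path_list[idx])        # 读取一组数据(左右拼接的成对图)
+        img_A2B = img_A2B.astype('float32') / 127.5 - 1.     # 从0~255归一化至-1~1
+        img_A2B = img_A2B.transpose(2, 0, 1)                 # 维度变换 HWC -> CHW
+        img_A = img_A2B[:, :, :256]                          # 真人照(左半边)
+        img_B = img_A2B[:, :, 256:]                          # 卡通图(右半边)
+        return img_A, img_B
+```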
+
+
+```python
+paired_dataset_train = PairedData('train')
+paired_dataset_test = PairedData('test')
+```
+
+## 第一步:搭建生成器
+
+### 请大家补齐空白处的代码,‘#’ 后是提示。
+
+
+```python
+class UnetGenerator(nn.Layer):
+ def __init__(self, input_nc=3, output_nc=3, ngf=64):
+ super(UnetGenerator, self).__init__()
+
+ self.down1 = nn.Conv2D(input_nc, ngf, kernel_size=4, stride=2, padding=1)
+ self.down2 = Downsample(ngf, ngf*2)
+ self.down3 = Downsample(ngf*2, ngf*4)
+ self.down4 = Downsample(ngf*4, ngf*8)
+ self.down5 = Downsample(ngf*8, ngf*8)
+ self.down6 = Downsample(ngf*8, ngf*8)
+ self.down7 = Downsample(ngf*8, ngf*8)
+
+ self.center = Downsample(ngf*8, ngf*8)
+
+ self.up7 = Upsample(ngf*8, ngf*8, use_dropout=True)
+ self.up6 = Upsample(ngf*8*2, ngf*8, use_dropout=True)
+ self.up5 = Upsample(ngf*8*2, ngf*8, use_dropout=True)
+ self.up4 = Upsample(ngf*8*2, ngf*8)
+ self.up3 = Upsample(ngf*8*2, ngf*4)
+ self.up2 = Upsample(ngf*4*2, ngf*2)
+ self.up1 = Upsample(ngf*2*2, ngf)
+
+ self.output_block = nn.Sequential(
+ nn.ReLU(),
+ nn.Conv2DTranspose(ngf*2, output_nc, kernel_size=4, stride=2, padding=1),
+ nn.Tanh()
+ )
+
+ def forward(self, x):
+ d1 = self.down1(x)
+ d2 = self.down2(d1)
+ d3 = self.down3(d2)
+ d4 = self.down4(d3)
+ d5 = self.down5(d4)
+ d6 = self.down6(d5)
+ d7 = self.down7(d6)
+
+ c = self.center(d7)
+
+ x = self.up7(c, d7)
+ x = self.up6(x, d6)
+ x = self.up5(x, d5)
+ x = self.up4(x, d4)
+ x = self.up3(x, d3)
+ x = self.up2(x, d2)
+ x = self.up1(x, d1)
+
+ x = self.output_block(x)
+ return x
+
+
+class Downsample(nn.Layer):
+ # LeakyReLU => conv => batch norm
+ def __init__(self, in_dim, out_dim, kernel_size=4, stride=2, padding=1):
+ super(Downsample, self).__init__()
+
+ self.layers = nn.Sequential(
+ # LeakyReLU, leaky=0.2
+ # Conv2D
+ # BatchNorm2D
+ )
+
+ def forward(self, x):
+ x = self.layers(x)
+ return x
+
+
+class Upsample(nn.Layer):
+ # ReLU => deconv => batch norm => dropout
+ def __init__(self, in_dim, out_dim, kernel_size=4, stride=2, padding=1, use_dropout=False):
+ super(Upsample, self).__init__()
+
+ sequence = [
+ # ReLU
+ # Conv2DTranspose
+ # nn.BatchNorm2D
+ ]
+
+ if use_dropout:
+ sequence.append(nn.Dropout(p=0.5))
+
+ self.layers = nn.Sequential(*sequence)
+
+ def forward(self, x, skip):
+ x = self.layers(x)
+ x = paddle.concat([x, skip], axis=1)
+ return x
+```
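+
+Downsample / Upsample 中待补全的层,一种参考写法如下(示意,写在各自的 `__init__` 中,参数沿用上面的 in_dim、out_dim、kernel_size、stride、padding):
+
+```python
+# Downsample: LeakyReLU => conv => batch norm
+self.layers = nn.Sequential(
+    nn.LeakyReLU(0.2),                                                            # LeakyReLU, leaky=0.2
+    nn.Conv2D(in_dim, out_dim, kernel_size, stride, padding, bias_attr=False),    # Conv2D
+    nn.BatchNorm2D(out_dim)                                                       # BatchNorm2D
+)
+
+# Upsample: ReLU => deconv => batch norm (=> dropout)
+sequence = [
+    nn.ReLU(),                                                                            # ReLU
+    nn.Conv2DTranspose(in_dim, out_dim, kernel_size, stride, padding, bias_attr=False),   # Conv2DTranspose
+    nn.BatchNorm2D(out_dim)                                                               # BatchNorm2D
+]
+```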
+
+## 第二步:鉴别器的搭建
+
+### 请大家补齐空白处的代码,‘#’ 后是提示。
+
+
+```python
+class NLayerDiscriminator(nn.Layer):
+ def __init__(self, input_nc=6, ndf=64):
+ super(NLayerDiscriminator, self).__init__()
+
+ self.layers = nn.Sequential(
+ nn.Conv2D(input_nc, ndf, kernel_size=4, stride=2, padding=1),
+ nn.LeakyReLU(0.2),
+
+ ConvBlock(ndf, ndf*2),
+ ConvBlock(ndf*2, ndf*4),
+ ConvBlock(ndf*4, ndf*8, stride=1),
+
+ nn.Conv2D(ndf*8, 1, kernel_size=4, stride=1, padding=1),
+ nn.Sigmoid()
+ )
+
+ def forward(self, input):
+ return self.layers(input)
+
+
+class ConvBlock(nn.Layer):
+ # conv => batch norm => LeakyReLU
+ def __init__(self, in_dim, out_dim, kernel_size=4, stride=2, padding=1):
+ super(ConvBlock, self).__init__()
+
+ self.layers = nn.Sequential(
+ # Conv2D
+ # BatchNorm2D
+ # LeakyReLU, leaky=0.2
+ )
+
+ def forward(self, x):
+ x = self.layers(x)
+ return x
+```
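+
+ConvBlock 中待补全的层,一种参考写法(示意):
+
+```python
+# ConvBlock: conv => batch norm => LeakyReLU
+self.layers = nn.Sequential(
+    nn.Conv2D(in_dim, out_dim, kernel_size, stride, padding, bias_attr=False),   # Conv2D
+    nn.BatchNorm2D(out_dim),                                                      # BatchNorm2D
+    nn.LeakyReLU(0.2)                                                             # LeakyReLU, leaky=0.2
+)
+```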
+
+
+```python
+generator = UnetGenerator()
+discriminator = NLayerDiscriminator()
+```
+
+
+```python
+out = generator(paddle.ones([1, 3, 256, 256]))
+print('生成器输出尺寸:', out.shape) # 应为[1, 3, 256, 256]
+
+out = discriminator(paddle.ones([1, 6, 256, 256]))
+print('鉴别器输出尺寸:', out.shape) # 应为[1, 1, 30, 30]
+```
+
+
+```python
+# 超参数
+LR = 1e-4
+BATCH_SIZE = 8
+EPOCHS = 100
+
+# 优化器
+optimizerG = paddle.optimizer.Adam(
+ learning_rate=LR,
+ parameters=generator.parameters(),
+ beta1=0.5,
+ beta2=0.999)
+
+optimizerD = paddle.optimizer.Adam(
+ learning_rate=LR,
+ parameters=discriminator.parameters(),
+ beta1=0.5,
+ beta2=0.999)
+
+# 损失函数
+bce_loss =
+l1_loss =
+
+# dataloader
+data_loader_train = DataLoader(
+ paired_dataset_train,
+ batch_size=BATCH_SIZE,
+ shuffle=True,
+ drop_last=True
+ )
+
+data_loader_test = DataLoader(
+ paired_dataset_test,
+ batch_size=BATCH_SIZE
+ )
+```
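+
+损失函数处的一种参考补全(示意):判别器最后接了 Sigmoid,可配合二元交叉熵;L1 损失用于约束生成图与卡通图逐像素接近。
+
+```python
+bce_loss = nn.BCELoss()   # 对抗损失:判别器输出已经过 Sigmoid
+l1_loss = nn.L1Loss()     # 重建损失:生成图与真实卡通图的逐像素 L1 距离
+```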
+
+
+```python
+results_save_path = 'work/results'
+os.makedirs(results_save_path, exist_ok=True) # 保存每个epoch的测试结果
+
+weights_save_path = 'work/weights'
+os.makedirs(weights_save_path, exist_ok=True) # 保存模型
+
+for epoch in range(EPOCHS):
+ for data in tqdm(data_loader_train):
+ real_A, real_B = data
+
+ optimizerD.clear_grad()
+ # D([real_A, real_B])
+ real_AB = paddle.concat((real_A, real_B), 1)
+ d_real_predict = discriminator(real_AB)
+ d_real_loss = bce_loss(d_real_predict, paddle.ones_like(d_real_predict))
+
+ # D([real_A, fake_B])
+ fake_B =
+ fake_AB =
+ d_fake_predict =
+ d_fake_loss =
+
+ # train D
+ d_loss = (d_real_loss + d_fake_loss) / 2.
+ d_loss.backward()
+ optimizerD.step()
+
+ optimizerG.clear_grad()
+ # D([real_A, fake_B])
+ fake_B =
+ fake_AB =
+ g_fake_predict =
+ g_bce_loss =
+ g_l1_loss =
+ g_loss = g_bce_loss + g_l1_loss * 100.
+
+ # train G
+ g_loss.backward()
+ optimizerG.step()
+
+ print(f'Epoch [{epoch+1}/{EPOCHS}] Loss D: {d_loss.numpy()}, Loss G: {g_loss.numpy()}')
+
+ if (epoch+1) % 10 == 0:
+ paddle.save(generator.state_dict(), os.path.join(weights_save_path, 'epoch'+str(epoch+1).zfill(3)+'.pdparams'))
+
+ # test
+ generator.eval()
+ with paddle.no_grad():
+ for data in data_loader_test:
+ real_A, real_B = data
+ break
+
+ fake_B = generator(real_A)
+ result = paddle.concat([real_A[:3], real_B[:3], fake_B[:3]], 3)
+
+ result = result.detach().numpy().transpose(0, 2, 3, 1)
+ result = np.vstack(result)
+ result = (result * 127.5 + 127.5).astype(np.uint8)
+
+ cv2.imwrite(os.path.join(results_save_path, 'epoch'+str(epoch+1).zfill(3)+'.png'), result)
+
+ generator.train()
+```
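+
+训练循环中待补全部分的一种参考写法(示意;变量名与上面保持一致,训练 D 时记得对 fake_B 做 detach,避免梯度传回生成器):
+
+```python
+# 判别器一侧:D([real_A, fake_B]),fake_B 需要 detach
+fake_B = generator(real_A).detach()
+fake_AB = paddle.concat((real_A, fake_B), 1)
+d_fake_predict = discriminator(fake_AB)
+d_fake_loss = bce_loss(d_fake_predict, paddle.zeros_like(d_fake_predict))
+
+# 生成器一侧:希望 D([real_A, fake_B]) 判为真,并叠加 L1 重建损失
+fake_B = generator(real_A)
+fake_AB = paddle.concat((real_A, fake_B), 1)
+g_fake_predict = discriminator(fake_AB)
+g_bce_loss = bce_loss(g_fake_predict, paddle.ones_like(g_fake_predict))
+g_l1_loss = l1_loss(fake_B, real_B)
+```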
+
+## 最后:用你补齐的代码试试卡通化的效果吧!
+
+
+```python
+# 为生成器加载权重
+last_weights_path = os.path.join(weights_save_path, sorted(os.listdir(weights_save_path))[-1])
+print('加载权重:', last_weights_path)
+
+model_state_dict = paddle.load(last_weights_path)
+generator.load_dict(model_state_dict)
+generator.eval()
+```
+
+
+```python
+# 读取数据
+test_names = os.listdir('data/cartoon_A2B/test')
+img_name = np.random.choice(test_names)
+img_A2B = cv2.imread('data/cartoon_A2B/test/'+img_name)
+img_A = img_A2B[:, :256] # 真人照
+img_B = img_A2B[:, 256:] # 卡通图
+
+g_input = img_A.astype('float32') / 127.5 - 1 # 归一化
+g_input = g_input[np.newaxis, ...].transpose(0, 3, 1, 2) # NHWC -> NCHW
+g_input = paddle.to_tensor(g_input) # numpy -> tensor
+
+g_output = generator(g_input)
+g_output = g_output.detach().numpy() # tensor -> numpy
+g_output = g_output.transpose(0, 2, 3, 1)[0] # NCHW -> NHWC
+g_output = g_output * 127.5 + 127.5 # 反归一化
+g_output = g_output.astype(np.uint8)
+
+img_show = np.hstack([img_A, g_output])[:,:,::-1]
+plt.figure(figsize=(8, 8))
+plt.imshow(img_show)
+plt.show()
+```
\ No newline at end of file
diff --git a/docs/zh_CN/tutorials/video_restore.md b/education/homework4.md
similarity index 46%
rename from docs/zh_CN/tutorials/video_restore.md
rename to education/homework4.md
index 88b12a9ff5c0621d314c4c2d96e7a8c99bedff5d..f5ba3ccfba4206f3c481360a838bc97fde50789b 100644
--- a/docs/zh_CN/tutorials/video_restore.md
+++ b/education/homework4.md
@@ -1,75 +1,79 @@
-## 老视频修复
+# 老北京城影像修复
-老视频往往具有帧数少,无色彩,分辨率低等特点。于是针对这些特点,我们使用补帧,上色,超分等模型对视频进行修复。
+完整项目见:https://aistudio.baidu.com/aistudio/projectdetail/1796293
-### 使用applications中的video-enhance.py工具进行快速开始视频修复
-```
-cd applications
-python tools/video-enhance.py --input you_video_path.mp4 --process_order DAIN DeOldify EDVR --output output_dir
-```
-#### 参数
+本项目运用[PaddleGAN](https://github.com/PaddlePaddle/PaddleGAN)实现了百年前老北京城视频的复原,其中将详细讲解如何运用视频的上色、超分辨率(提高清晰度)、插帧(提高流畅度)等AI修复技术,让那些先辈们的一举一动,一颦一笑都宛若眼前之人。
-- `--input (str)`: 输入的视频路径。
-- `--output (str)`: 输出的视频路径。
-- `--process_order`: 调用的模型名字和顺序,比如输入为 `DAIN DeOldify EDVR`,则会顺序调用 `DAINPredictor` `DeOldifyPredictor` `EDVRPredictor` 。
+当然,如果大家觉得这个项目有趣好用的话,希望大家能够为我们[PaddleGAN](https://github.com/PaddlePaddle/PaddleGAN)的[Github主页](https://github.com/PaddlePaddle/PaddleGAN)点Star噢~
-#### 效果展示
-
+
+

+
-### 快速体验
-我们在ai studio制作了一个[ai studio 老北京视频修复教程](https://aistudio.baidu.com/aistudio/projectdetail/1161285)
+
-### 注意事项
+
+

+
-* 在使用本教程前,请确保您已经[安装完paddle和ppgan]()。
-* 本教程的所有命令都基于PaddleGAN/applications主目录进行执行。
+## 安装PaddleGAN
-* 各个模型耗时较长,尤其使超分辨率模型,建议输入的视频分辨率低一些,时长短一些。
+PaddleGAN的安装目前支持Clone GitHub和Gitee两种方式:
-* 需要运行在gpu环境上
-### ppgan提供的可用于视频修复的预测api简介
-可以根据要修复的视频的特点,使用不同的模型与参数
+```python
+# 安装ppgan
+# 当前目录在: /home/aistudio/, 这个目录也是左边文件和文件夹所在的目录
+# 克隆最新的PaddleGAN仓库到当前目录
+# !git clone https://github.com/PaddlePaddle/PaddleGAN.git
+# 如果从github下载慢可以从gitee clone:
+!git clone https://gitee.com/paddlepaddle/PaddleGAN.git
+%cd PaddleGAN/
+!pip install -v -e .
+```
+
+## PaddleGAN中要使用的预测模型介绍
### 补帧模型DAIN
+
DAIN 模型通过探索深度的信息来显式检测遮挡。并且开发了一个深度感知的流投影层来合成中间流。在视频补帧方面有较好的效果。
-
```
ppgan.apps.DAINPredictor(
- output='output',
+ output_path='output',
weight_path=None,
time_step=None,
use_gpu=True,
remove_duplicates=False)
```
+
#### 参数
-- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `output_path (str,可选的)`: 输出的文件夹路径,默认值:`output`.
- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
- `time_step (int)`: 补帧的时间系数,如果设置为0.5,则原先为每秒30帧的视频,补帧后变为每秒60帧。
- `remove_duplicates (bool,可选的)`: 是否删除重复帧,默认值:`False`.
### 上色模型DeOldifyPredictor
+
DeOldify 采用自注意力机制的生成对抗网络,生成器是一个U-NET结构的网络。在图像的上色方面有着较好的效果。
-
```
ppgan.apps.DeOldifyPredictor(output='output', weight_path=None, render_factor=32)
```
+
#### 参数
-- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `output_path (str,可选的)`: 输出的文件夹路径,默认值:`output`.
- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
-- `artistic (bool)`: 是否使用偏"艺术性"的模型。"艺术性"的模型有可能产生一些有趣的颜色,但是毛刺比较多。
- `render_factor (int)`: 会将该参数乘以16后作为输入帧的resize的值,如果该值设置为32,
- 则输入帧会resize到(32 * 16, 32 * 16)的尺寸再输入到网络中。
+ 则输入帧会resize到(32 * 16, 32 * 16)的尺寸再输入到网络中。
### 上色模型DeepRemasterPredictor
+
DeepRemaster 模型基于时空卷积神经网络和自注意力机制。并且能够根据输入的任意数量的参考帧对图片进行上色。
-
```
ppgan.apps.DeepRemasterPredictor(
@@ -79,38 +83,115 @@ ppgan.apps.DeepRemasterPredictor(
reference_dir=None,
mindim=360):
```
+
#### 参数
-- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `output_path (str,可选的)`: 输出的文件夹路径,默认值:`output`.
- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
- `colorization (bool)`: 是否对输入视频上色,如果选项设置为 `True` ,则参考帧的文件夹路径也必须要设置。默认值:`False`。
- `reference_dir (bool)`: 参考帧的文件夹路径。默认值:`None`。
- `mindim (bool)`: 输入帧重新resize后的短边的大小。默认值:360。
### 超分辨率模型RealSRPredictor
-RealSR模型通过估计各种模糊内核以及实际噪声分布,为现实世界的图像设计一种新颖的真实图片降采样框架。基于该降采样框架,可以获取与真实世界图像共享同一域的低分辨率图像。并且提出了一个旨在提高感知度的真实世界超分辨率模型。对合成噪声数据和真实世界图像进行的大量实验表明,该模型能够有效降低了噪声并提高了视觉质量。
-
+RealSR模型通过估计各种模糊内核以及实际噪声分布,为现实世界的图像设计了一种新颖的真实图片降采样框架。基于该降采样框架,可以获取与真实世界图像共享同一域的低分辨率图像。此外还提出了一个旨在提高感知质量的真实世界超分辨率模型。对合成噪声数据和真实世界图像进行的大量实验表明,该模型能够有效降低噪声并提高视觉质量。
```
ppgan.apps.RealSRPredictor(output='output', weight_path=None)
```
+
#### 参数
-- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `output_path (str,可选的)`: 输出的文件夹路径,默认值:`output`.
- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
--
-### 超分辨率模型EDVRPredictor
-EDVR模型提出了一个新颖的视频具有增强可变形卷积的还原框架:第一,为了处理大动作而设计的一个金字塔,级联和可变形(PCD)对齐模块,使用可变形卷积以从粗到精的方式在特征级别完成对齐;第二,提出时空注意力机制(TSA)融合模块,在时间和空间上都融合了注意机制,用以增强复原的功能。
-EDVR模型是一个基于连续帧的超分模型,能够有效利用帧间的信息,速度比RealSR模型快。
+### 超分辨率模型EDVRPredictor
-
+EDVR模型提出了一个带有增强可变形卷积的新颖视频复原框架:第一,为了处理大幅度运动,设计了金字塔式、级联的可变形(PCD)对齐模块,使用可变形卷积以从粗到精的方式在特征级别完成对齐;第二,提出时空注意力(TSA)融合模块,在时间和空间维度上引入注意力机制,以增强复原效果。
```
ppgan.apps.EDVRPredictor(output='output', weight_path=None)
```
+
#### 参数
-- `output (str,可选的)`: 输出的文件夹路径,默认值:`output`.
+- `output_path (str,可选的)`: 输出的文件夹路径,默认值:`output`.
- `weight_path (None,可选的)`: 载入的权重路径,如果没有设置,则从云端下载默认的权重到本地。默认值:`None`。
+
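+除了下文的命令行工具,这些预测 API 也可以直接在 Python 中调用。下面是一个以 EDVR 为例的最小示意(参数含义见上文说明):
+
+```python
+from ppgan.apps import EDVRPredictor
+
+# 对一段低分辨率视频做超分,结果会保存到 output/EDVR 目录下
+sr = EDVRPredictor(output='output')
+sr.run('/home/aistudio/Peking_input360p_clip6_5s.mp4')
+```
+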
+## 使用PaddleGAN进行视频修复
+
+
+```python
+# 导入一些可视化需要的包
+import cv2
+import imageio
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.animation as animation
+from IPython.display import HTML
+import warnings
+warnings.filterwarnings("ignore")
+```
+
+
+```python
+# 定义一个展示视频的函数
+def display(driving, fps, size=(8, 6)):
+ fig = plt.figure(figsize=size)
+
+ ims = []
+ for i in range(len(driving)):
+ cols = []
+ cols.append(driving[i])
+
+ im = plt.imshow(np.concatenate(cols, axis=1), animated=True)
+ plt.axis('off')
+ ims.append([im])
+
+ video = animation.ArtistAnimation(fig, ims, interval=1000.0/fps, repeat_delay=1000)
+
+ plt.close()
+ return video
+```
+
+
+```python
+# 展示一下输入的视频, 如果视频太大,时间会非常久,可以跳过这个步骤
+video_path = '/home/aistudio/Peking_input360p_clip6_5s.mp4'
+video_frames = imageio.mimread(video_path, memtest=False)
+
+# 获得视频的原分辨率
+cap = cv2.VideoCapture(video_path)
+fps = cap.get(cv2.CAP_PROP_FPS)
+
+
+HTML(display(video_frames, fps).to_html5_video())
+```
+
+
+```python
+# 使用插帧(DAIN), 上色(DeOldify), 超分(EDVR)这三个模型对该视频进行修复
+# input参数表示输入的视频路径
+# output表示处理后的视频的存放文件夹
+# process_order 表示使用的模型和顺序(目前支持 DAIN、DeOldify、EDVR)
+%cd /home/aistudio/PaddleGAN/applications/
+!python tools/video-enhance.py --input /home/aistudio/Peking_input360p_clip6_5s.mp4 \
+ --process_order DAIN DeOldify EDVR \
+ --output output_dir
+```
+
+
+```python
+# 展示一下处理好的视频, 如果视频太大,时间会非常久,可以下载下来看
+# 这个路径可以查看上个code cell的最后打印的output video path
+output_video_path = '/home/aistudio/PaddleGAN/applications/output_dir/EDVR/Peking_input360p_clip6_5s_deoldify_out_edvr_out.mp4'
+
+video_frames = imageio.mimread(output_video_path, memtest=False)
+
+# 获得视频的原分辨率
+cap = cv2.VideoCapture(output_video_path)
+fps = cap.get(cv2.CAP_PROP_FPS)
+
+
+HTML(display(video_frames, fps, size=(16, 12)).to_html5_video())
+```
\ No newline at end of file
diff --git "a/education/\347\254\254\344\272\214\345\244\251\344\275\234\344\270\232.py" "b/education/\347\254\254\344\272\214\345\244\251\344\275\234\344\270\232.py"
new file mode 100644
index 0000000000000000000000000000000000000000..d995632a5e760b9a428afe999792101fb8e891a2
--- /dev/null
+++ "b/education/\347\254\254\344\272\214\345\244\251\344\275\234\344\270\232.py"
@@ -0,0 +1,208 @@
+#导入一些必要的包
+import os
+import random
+import paddle
+import paddle.nn as nn
+import paddle.optimizer as optim
+import paddle.vision.datasets as dset
+import paddle.vision.transforms as transforms
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.animation as animation
+
+dataset = paddle.vision.datasets.MNIST(mode='train',
+ transform=transforms.Compose([
+ # resize ->(32,32)
+ transforms.Resize((32,32)),
+ # 归一化到-1~1
+ transforms.Normalize([127.5], [127.5])
+ ]))
+
+dataloader = paddle.io.DataLoader(dataset, batch_size=32,
+ shuffle=True, num_workers=4)
+
+#参数初始化的模块
+@paddle.no_grad()
+def normal_(x, mean=0., std=1.):
+ temp_value = paddle.normal(mean, std, shape=x.shape)
+ x.set_value(temp_value)
+ return x
+
+@paddle.no_grad()
+def uniform_(x, a=-1., b=1.):
+ temp_value = paddle.uniform(min=a, max=b, shape=x.shape)
+ x.set_value(temp_value)
+ return x
+
+@paddle.no_grad()
+def constant_(x, value):
+ temp_value = paddle.full(x.shape, value, x.dtype)
+ x.set_value(temp_value)
+ return x
+
+def weights_init(m):
+ classname = m.__class__.__name__
+ if hasattr(m, 'weight') and classname.find('Conv') != -1:
+ normal_(m.weight, 0.0, 0.02)
+ elif classname.find('BatchNorm') != -1:
+ normal_(m.weight, 1.0, 0.02)
+ constant_(m.bias, 0)
+
+# Generator Code
+class Generator(nn.Layer):
+ def __init__(self, ):
+ super(Generator, self).__init__()
+ self.gen = nn.Sequential(
+ # input is Z, [B, 100, 1, 1] -> [B, 64 * 4, 4, 4]
+ nn.Conv2DTranspose(100, 64 * 4, 4, 1, 0, bias_attr=False),
+ nn.BatchNorm2D(64 * 4),
+ nn.ReLU(True),
+ # state size. [B, 64 * 4, 4, 4] -> [B, 64 * 2, 8, 8]
+ nn.Conv2DTranspose(64 * 4, 64 * 2, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64 * 2),
+ nn.ReLU(True),
+ # state size. [B, 64 * 2, 8, 8] -> [B, 64, 16, 16]
+ nn.Conv2DTranspose( 64 * 2, 64, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64),
+ nn.ReLU(True),
+ # state size. [B, 64, 16, 16] -> [B, 1, 32, 32]
+ nn.Conv2DTranspose( 64, 1, 4, 2, 1, bias_attr=False),
+ nn.Tanh()
+ )
+
+ def forward(self, x):
+ return self.gen(x)
+
+
+netG = Generator()
+# Apply the weights_init function to randomly initialize all weights
+# to mean=0, stdev=0.02.
+netG.apply(weights_init)
+
+# Print the model
+print(netG)
+
+class Discriminator(nn.Layer):
+ def __init__(self,):
+ super(Discriminator, self).__init__()
+ self.dis = nn.Sequential(
+
+ # input [B, 1, 32, 32] -> [B, 64, 16, 16]
+ nn.Conv2D(1, 64, 4, 2, 1, bias_attr=False),
+ nn.LeakyReLU(0.2),
+
+ # state size. [B, 64, 16, 16] -> [B, 128, 8, 8]
+ nn.Conv2D(64, 64 * 2, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64 * 2),
+ nn.LeakyReLU(0.2),
+
+ # state size. [B, 128, 8, 8] -> [B, 256, 4, 4]
+ nn.Conv2D(64 * 2, 64 * 4, 4, 2, 1, bias_attr=False),
+ nn.BatchNorm2D(64 * 4),
+ nn.LeakyReLU(0.2),
+
+ # state size. [B, 256, 4, 4] -> [B, 1, 1, 1]
+ nn.Conv2D(64 * 4, 1, 4, 1, 0, bias_attr=False),
+ # 这里为需要改变的地方
+ # nn.Sigmoid()
+ nn.LeakyReLU()
+ )
+
+ def forward(self, x):
+ return self.dis(x)
+
+netD = Discriminator()
+netD.apply(weights_init)
+print(netD)
+
+# Initialize BCELoss function
+# 这里为需要改变的地方
+# loss = nn.BCELoss()
+loss = nn.MSELoss()
+
+# Create batch of latent vectors that we will use to visualize
+# the progression of the generator
+fixed_noise = paddle.randn([32, 100, 1, 1], dtype='float32')
+
+# Establish convention for real and fake labels during training
+real_label = 1.
+fake_label = 0.
+
+# Setup Adam optimizers for both G and D
+optimizerD = optim.Adam(parameters=netD.parameters(), learning_rate=0.0002, beta1=0.5, beta2=0.999)
+optimizerG = optim.Adam(parameters=netG.parameters(), learning_rate=0.0002, beta1=0.5, beta2=0.999)
+
+losses = [[], []]
+#plt.ion()
+now = 0
+for pass_id in range(100):
+ for batch_id, (data, target) in enumerate(dataloader):
+ ############################
+ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
+ ###########################
+
+ optimizerD.clear_grad()
+ real_img = data
+ bs_size = real_img.shape[0]
+ label = paddle.full((bs_size, 1, 1, 1), real_label, dtype='float32')
+ real_out = netD(real_img)
+ errD_real = loss(real_out, label)
+ errD_real.backward()
+
+ noise = paddle.randn([bs_size, 100, 1, 1], 'float32')
+ fake_img = netG(noise)
+ label = paddle.full((bs_size, 1, 1, 1), fake_label, dtype='float32')
+ fake_out = netD(fake_img.detach())
+ errD_fake = loss(fake_out,label)
+ errD_fake.backward()
+ optimizerD.step()
+ optimizerD.clear_grad()
+
+ errD = errD_real + errD_fake
+ losses[0].append(float(errD))
+
+ ############################
+ # (2) Update G network: maximize log(D(G(z)))
+ ###########################
+ optimizerG.clear_grad()
+ noise = paddle.randn([bs_size, 100, 1, 1],'float32')
+ fake = netG(noise)
+ label = paddle.full((bs_size, 1, 1, 1), real_label, dtype='float32')
+ output = netD(fake)
+ errG = loss(output,label)
+ errG.backward()
+ optimizerG.step()
+ optimizerG.clear_grad()
+
+ losses[1].append(float(errG))
+
+
+ ############################
+ # visualize
+ ###########################
+ if batch_id % 100 == 0:
+ generated_image = netG(noise).numpy()
+ imgs = []
+ plt.figure(figsize=(15,15))
+ try:
+ for i in range(10):
+ image = generated_image[i].transpose()
+ image = np.where(image > 0, image, 0)
+ image = image.transpose((1,0,2))
+ plt.subplot(10, 10, i + 1)
+
+ plt.imshow(image[...,0], vmin=-1, vmax=1)
+ plt.axis('off')
+ plt.xticks([])
+ plt.yticks([])
+ plt.subplots_adjust(wspace=0.1, hspace=0.1)
+ msg = 'Epoch ID={0} Batch ID={1} \n\n D-Loss={2} G-Loss={3}'.format(pass_id, batch_id, float(errD), float(errG))
+ print(msg)
+ plt.suptitle(msg,fontsize=20)
+ plt.draw()
+ plt.savefig('{}/{:04d}_{:04d}.png'.format('work', pass_id, batch_id), bbox_inches='tight')
+ plt.pause(0.01)
+ except IOError as e:
+ print(e)
+ paddle.save(netG.state_dict(), "work/generator.params")
+
diff --git a/paddlegan-wechaty-demo/README.md b/paddlegan-wechaty-demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..821052fd9a390fd1fed5853bb24fd1381af617ab
--- /dev/null
+++ b/paddlegan-wechaty-demo/README.md
@@ -0,0 +1,130 @@
+# PaddleGAN-WeChaty-Demo
+
+本示例将展示如何在[Wechaty](https://github.com/Wechaty/wechaty)中使用PaddleGAN的多种能力。
+
+基本原理:通过[Wechaty](https://github.com/Wechaty/wechaty)获取微信接收的消息,然后使用PaddleGAN中的人脸动作迁移算法`first order motion`模型,将静态照片转换成动态趣味视频,最终以微信消息的形式发送。
+
+## 风险提示
+
+本项目采用的api为第三方——Wechaty提供,**非微信官方api**,用户需承担来自微信方的使用风险。
+在运行项目的过程中,建议尽量选用**新注册的小号**进行测试,不要用自己的常用微信号。
+
+## Wechaty
+
+关于Wechaty和python-wechaty,请查阅以下官方repo:
+- [Wechaty](https://github.com/Wechaty/wechaty)
+- [python-wechaty](https://github.com/wechaty/python-wechaty)
+- [python-wechaty-getting-started](https://github.com/wechaty/python-wechaty-getting-started/blob/master/README.md)
+
+
+## 环境准备
+
+- 系统环境:Linux, MacOS, Windows
+- python3.7+
+
+
+## 安装和使用
+
+1. 安装PaddleGAN,详情请见[安装方式](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/zh_CN/install.md)
+
+ ```shell
+ git clone https://github.com/PaddlePaddle/paddlegan
+ cd paddlegan/paddlegan-wechaty-demo
+ ```
+
+2. 安装依赖 —— paddlepaddle, ppgan, wechaty
+
+ ```shell
+ pip install -r requirements.txt
+ ```
+
+3. 安装项目所需的PaddleGAN的module
+
+ 此demo以`first order motion`为示例,其他module根据项目所需安装,更多的模型请查阅[PaddleGAN模型API接口说明](https://github.com/PaddlePaddle/PaddleGAN/blob/develop/docs/en_US/apis/apps.md)。
+
+4. Set token for your bot
+
+ 在当前系统的环境变量中,配置以下与`WECHATY_PUPPET`相关的两个变量。
+ 关于其作用详情和TOKEN的获取方式,请查看[Wechaty Puppet Services](https://wechaty.js.org/docs/puppet-services/)。
+
+ ```shell
+ export WECHATY_PUPPET=wechaty-puppet-service
+ export WECHATY_PUPPET_SERVICE_TOKEN=your_token_at_here
+ ```
+
+ [Paimon](https://wechaty.js.org/docs/puppet-services/paimon/)的短期TOKEN经测试可用,比赛期间将提供选手一个可使用1个月的token,大家可自行使用。
+
+5. Run the bot
+
+ ```shell
+ python examples/paddleGAN_fom.py
+ ```
+ 运行后,可以通过微信移动端扫码登陆,登陆成功后则可正常使用。
+
+## 运行效果
+
+在`examples/paddleGAN_fom.py`中,通过以下几行代码即可实例化一个`first order motion`的模型。
+
+```python
+# Initialize a PaddleGAN first order motion model
+from ppgan.apps import FirstOrderPredictor
+animate = FirstOrderPredictor(output="test_fom", filename="result.mp4",\
+ relative=True, adapt_scale=True)
+```
+
+`on_message`方法是接收到消息时的回调函数,可以通过自定义的条件(譬如消息类型、消息来源、消息文字是否包含关键字、是否群聊消息等等)来判断是否回复信息,消息的更多属性和条件可以参考[Class Message](https://github.com/Wechaty/wechaty#3-class-message)。
+
+本示例中的`on_message`方法的代码如下,
+
+```python
+async def on_message(msg: Message):
+ """
+ Message Handler for the Bot
+ """
+ ### PaddleGAN fom
+
+ global fom, source, driving
+
+ if isinstance(msg.text(), str) and len(msg.text()) > 0 \
+ and msg._payload.type == MessageType.MESSAGE_TYPE_TEXT \
+ and "fom" in msg.text():
+ bot_response = u"好嘞, 给我发个图片和驱动视频吧"
+ fom = True
+ await msg.say(bot_response)
+
+ if fom and msg._payload.type == MessageType.MESSAGE_TYPE_IMAGE:
+ fileBox = await msg.to_file_box()
+ await fileBox.to_file("test_fom/source.jpg", True)
+
+ bot_response = u"好嘞, 收到图片"
+ await msg.say(bot_response)
+
+ source = True
+
+ if fom and msg._payload.type == MessageType.MESSAGE_TYPE_VIDEO:
+ fileBox = await msg.to_file_box()
+ await fileBox.to_file("test_fom/driving.mp4", True)
+
+ bot_response = u"好嘞, 收到驱动视频"
+ await msg.say(bot_response)
+
+ driving = True
+
+ if source and driving:
+ bot_response = u"都收到啦,稍等一下嘿嘿"
+ await msg.say(bot_response)
+ source = False
+ driving = False
+ fom = False
+ animate.run("test_fom/source.jpg", "test_fom/driving.mp4")
+ file_box = FileBox.from_file("test_fom/result.mp4")
+ await msg.say(file_box)
+
+ ###
+
+```
+
+脚本成功运行后,所登陆的账号即可作为一个Chatbot,下图左侧的内容由Chatbot生成和回复。
+
+

+
diff --git a/paddlegan-wechaty-demo/examples/paddleGAN_fom.py b/paddlegan-wechaty-demo/examples/paddleGAN_fom.py
new file mode 100755
index 0000000000000000000000000000000000000000..614ed1b2655a1324d44b30710d317661e6062c7e
--- /dev/null
+++ b/paddlegan-wechaty-demo/examples/paddleGAN_fom.py
@@ -0,0 +1,110 @@
+from collections import deque
+import os
+import asyncio
+
+from wechaty import (
+ Contact,
+ FileBox,
+ Message,
+ Wechaty,
+ ScanStatus,
+)
+from wechaty_puppet import MessageType
+
+# Initialize a PaddleGAN fom model
+from ppgan.apps import FirstOrderPredictor
+animate = FirstOrderPredictor(output="test_fom", filename="result.mp4",\
+ relative=True, adapt_scale=True)
+fom = False
+source = False
+driving = False
+
+
+async def on_message(msg: Message):
+ """
+ Message Handler for the Bot
+ """
+ ### PaddleGAN fom
+
+ global fom, source, driving
+
+ if isinstance(msg.text(), str) and len(msg.text()) > 0 \
+ and msg._payload.type == MessageType.MESSAGE_TYPE_TEXT \
+ and "fom" in msg.text():
+ bot_response = u"好嘞, 给我发个图片和驱动视频吧"
+ fom = True
+ await msg.say(bot_response)
+
+ if fom and msg._payload.type == MessageType.MESSAGE_TYPE_IMAGE:
+ fileBox = await msg.to_file_box()
+ await fileBox.to_file("test_fom/source.jpg", True)
+
+ bot_response = u"好嘞, 收到图片"
+ await msg.say(bot_response)
+
+ source = True
+
+ if fom and msg._payload.type == MessageType.MESSAGE_TYPE_VIDEO:
+ fileBox = await msg.to_file_box()
+ await fileBox.to_file("test_fom/driving.mp4", True)
+
+ bot_response = u"好嘞, 收到驱动视频"
+ await msg.say(bot_response)
+
+ driving = True
+
+ if source and driving:
+ bot_response = u"都收到啦,稍等一下嘿嘿"
+ await msg.say(bot_response)
+ source = False
+ driving = False
+ fom = False
+ animate.run("test_fom/source.jpg", "test_fom/driving.mp4")
+ file_box = FileBox.from_file("test_fom/result.mp4")
+ await msg.say(file_box)
+
+
+async def on_scan(
+ qrcode: str,
+ status: ScanStatus,
+ _data,
+):
+ """
+ Scan Handler for the Bot
+ """
+ print('Status: ' + str(status))
+ print('View QR Code Online: https://wechaty.js.org/qrcode/' + qrcode)
+
+
+async def on_login(user: Contact):
+ """
+ Login Handler for the Bot
+ """
+ print(user)
+ # TODO: To be written
+
+
+async def main():
+ """
+ Async Main Entry
+ """
+ #
+ # Make sure we have set WECHATY_PUPPET_SERVICE_TOKEN in the environment variables.
+ #
+ if 'WECHATY_PUPPET_SERVICE_TOKEN' not in os.environ:
+ print('''
+ Error: WECHATY_PUPPET_SERVICE_TOKEN is not found in the environment variables
+ You need a TOKEN to run the Python Wechaty. Please goto our README for details
+ https://github.com/wechaty/python-wechaty-getting-started/#wechaty_puppet_service_token
+ ''')
+
+ bot = Wechaty()
+
+ bot.on('scan', on_scan)
+ bot.on('login', on_login)
+ bot.on('message', on_message)
+
+ await bot.start()
+
+
+asyncio.run(main())
diff --git a/paddlegan-wechaty-demo/requirements.txt b/paddlegan-wechaty-demo/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..af9ec4d750fd7a27ef4db63966c6e873be645661
--- /dev/null
+++ b/paddlegan-wechaty-demo/requirements.txt
@@ -0,0 +1,3 @@
+paddlepaddle >= 2.1.0
+ppgan >= 2.0.0
+wechaty ~= 0.7dev16
diff --git a/paddlegan-wechaty-demo/test_fom/=2.0.0 b/paddlegan-wechaty-demo/test_fom/=2.0.0
new file mode 100644
index 0000000000000000000000000000000000000000..936c280b3e48f37be8b1989650a4b7118b393f8b
--- /dev/null
+++ b/paddlegan-wechaty-demo/test_fom/=2.0.0
@@ -0,0 +1,34 @@
+Requirement already satisfied: ppgan in /workspace/paddle_gan_new/PaddleGAN (2.0.0)
+Requirement already satisfied: tqdm in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (4.54.1)
+Requirement already satisfied: PyYAML>=5.1 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (5.3.1)
+Requirement already satisfied: scikit-image>=0.14.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (0.18.1)
+Requirement already satisfied: scipy>=1.1.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (1.5.4)
+Requirement already satisfied: opencv-python in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (4.2.0.32)
+Requirement already satisfied: imageio-ffmpeg in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (0.4.3)
+Requirement already satisfied: librosa==0.7.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (0.7.0)
+Requirement already satisfied: numba==0.48 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (0.48.0)
+Requirement already satisfied: easydict in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (1.9)
+Requirement already satisfied: munch in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from ppgan) (2.5.0)
+Requirement already satisfied: joblib>=0.12 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (1.0.0)
+Requirement already satisfied: numpy>=1.15.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (1.20.2)
+Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (0.23.2)
+Requirement already satisfied: six>=1.3 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (1.16.0)
+Requirement already satisfied: audioread>=2.0.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (2.1.9)
+Requirement already satisfied: soundfile>=0.9.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (0.10.3.post1)
+Requirement already satisfied: resampy>=0.2.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (0.2.2)
+Requirement already satisfied: decorator>=3.0.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from librosa==0.7.0->ppgan) (4.4.2)
+Requirement already satisfied: setuptools in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from numba==0.48->ppgan) (51.0.0.post20201207)
+Requirement already satisfied: llvmlite<0.32.0,>=0.31.0dev0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from numba==0.48->ppgan) (0.31.0)
+Requirement already satisfied: pillow!=7.1.0,!=7.1.1,>=4.3.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from scikit-image>=0.14.0->ppgan) (8.0.1)
+Requirement already satisfied: imageio>=2.3.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from scikit-image>=0.14.0->ppgan) (2.9.0)
+Requirement already satisfied: PyWavelets>=1.1.1 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from scikit-image>=0.14.0->ppgan) (1.1.1)
+Requirement already satisfied: matplotlib!=3.0.0,>=2.0.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from scikit-image>=0.14.0->ppgan) (3.3.3)
+Requirement already satisfied: networkx>=2.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from scikit-image>=0.14.0->ppgan) (2.5)
+Requirement already satisfied: tifffile>=2019.7.26 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from scikit-image>=0.14.0->ppgan) (2020.12.8)
+Requirement already satisfied: python-dateutil>=2.1 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image>=0.14.0->ppgan) (2.8.1)
+Requirement already satisfied: cycler>=0.10 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image>=0.14.0->ppgan) (0.10.0)
+Requirement already satisfied: kiwisolver>=1.0.1 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image>=0.14.0->ppgan) (1.3.1)
+Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image>=0.14.0->ppgan) (2.4.7)
+Requirement already satisfied: threadpoolctl>=2.0.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from scikit-learn!=0.19.0,>=0.14.0->librosa==0.7.0->ppgan) (2.1.0)
+Requirement already satisfied: cffi>=1.0 in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from soundfile>=0.9.0->librosa==0.7.0->ppgan) (1.14.5)
+Requirement already satisfied: pycparser in /root/miniconda3/envs/py37/lib/python3.7/site-packages (from cffi>=1.0->soundfile>=0.9.0->librosa==0.7.0->ppgan) (2.20)
diff --git a/ppgan/__init__.py b/ppgan/__init__.py
index 9e83ec486fc6232d90d965425588645bc1204386..0f460a98dcd9e22cccfc6776ea60d13005c34f87 100644
--- a/ppgan/__init__.py
+++ b/ppgan/__init__.py
@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
+from .version import ppgan_version as __version__
diff --git a/ppgan/apps/__init__.py b/ppgan/apps/__init__.py
index 647a21290de700065735e1cc4c1cce33bee548c9..37bce436db183b4aaaa83ed43107126c0e6603e1 100644
--- a/ppgan/apps/__init__.py
+++ b/ppgan/apps/__init__.py
@@ -23,5 +23,21 @@ from .animegan_predictor import AnimeGANPredictor
from .midas_predictor import MiDaSPredictor
from .photo2cartoon_predictor import Photo2CartoonPredictor
from .styleganv2_predictor import StyleGANv2Predictor
+# from .styleganv2clip_predictor import StyleGANv2ClipPredictor
+from .styleganv2fitting_predictor import StyleGANv2FittingPredictor
+from .styleganv2mixing_predictor import StyleGANv2MixingPredictor
+from .styleganv2editing_predictor import StyleGANv2EditingPredictor
from .pixel2style2pixel_predictor import Pixel2Style2PixelPredictor
from .wav2lip_predictor import Wav2LipPredictor
+from .mpr_predictor import MPRPredictor
+from .lapstyle_predictor import LapStylePredictor
+from .photopen_predictor import PhotoPenPredictor
+from .recurrent_vsr_predictor import (PPMSVSRPredictor, BasicVSRPredictor, \
+ BasiVSRPlusPlusPredictor, IconVSRPredictor, \
+ PPMSVSRLargePredictor)
+from .singan_predictor import SinGANPredictor
+from .gpen_predictor import GPENPredictor
+from .swinir_predictor import SwinIRPredictor
+from .invdn_predictor import InvDNPredictor
+from .nafnet_predictor import NAFNetPredictor
+from .aotgan_predictor import AOTGANPredictor
diff --git a/ppgan/apps/animegan_predictor.py b/ppgan/apps/animegan_predictor.py
index 8c5655d674c10a2f04c39d8d40246c85c9ca5404..b3c8b0b57975efd3250303c468ce9f60389fdd08 100644
--- a/ppgan/apps/animegan_predictor.py
+++ b/ppgan/apps/animegan_predictor.py
@@ -18,7 +18,7 @@ import cv2
import paddle
from .base_predictor import BasePredictor
-from ppgan.datasets.transforms import ResizeToScale
+from ppgan.datasets.preprocess.transforms import ResizeToScale
import paddle.vision.transforms as T
from ppgan.models.generators import AnimeGenerator
from ppgan.utils.download import get_path_from_url
diff --git a/ppgan/apps/aotgan_predictor.py b/ppgan/apps/aotgan_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..73304ee29d652d778d1b7a7a2a219b04044d81ab
--- /dev/null
+++ b/ppgan/apps/aotgan_predictor.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from PIL import Image, ImageOps
+import cv2
+import numpy as np
+import os
+
+import paddle
+from paddle.vision.transforms import Resize
+
+from .base_predictor import BasePredictor
+from ppgan.models.generators import InpaintGenerator
+from ..utils.filesystem import load
+
+
+class AOTGANPredictor(BasePredictor):
+ def __init__(self,
+ output_path,
+ weight_path,
+ gen_cfg):
+
+ # initialize model
+ gen = InpaintGenerator(
+ gen_cfg.rates,
+ gen_cfg.block_num,
+ )
+ gen.eval()
+ para = load(weight_path)
+ if 'net_gen' in para:
+ gen.set_state_dict(para['net_gen'])
+ else:
+ gen.set_state_dict(para)
+
+ self.gen = gen
+ self.output_path = output_path
+ self.gen_cfg = gen_cfg
+
+
+ def run(self, input_image_path, input_mask_path):
+ img = Image.open(input_image_path)
+ mask = Image.open(input_mask_path)
+ img = Resize([self.gen_cfg.img_size, self.gen_cfg.img_size], interpolation='bilinear')(img)
+ mask = Resize([self.gen_cfg.img_size, self.gen_cfg.img_size], interpolation='nearest')(mask)
+ img = img.convert('RGB')
+ mask = mask.convert('L')
+ img = np.array(img)
+ mask = np.array(mask)
+
+ # normalize image data to (-1, +1),image tensor shape:[n=1, c=3, h=512, w=512]
+ img = (img.astype('float32') / 255.) * 2. - 1.
+ img = np.transpose(img, (2, 0, 1))
+ img = paddle.to_tensor(np.expand_dims(img, 0))
+ # mask tensor shape:[n=1, c=3, h=512, w=512], value 0 denotes known pixels and 1 denotes missing regions
+ mask = np.expand_dims(mask.astype('float32') / 255., 0)
+ mask = paddle.to_tensor(np.expand_dims(mask, 0))
+
+ # predict
+ img_masked = (img * (1 - mask)) + mask # put the mask onto the image
+ input_data = paddle.concat((img_masked, mask), axis=1) # concatenate
+ pred_img = self.gen(input_data) # predict by masked image
+ comp_img = (1 - mask) * img + mask * pred_img # compound the inpainted image
+ img_save = ((comp_img.numpy()[0].transpose((1,2,0)) + 1.) / 2. * 255).astype('uint8')
+
+ pic = cv2.cvtColor(img_save,cv2.COLOR_BGR2RGB)
+ path, _ = os.path.split(self.output_path)
+ if not os.path.exists(path):
+ os.mkdir(path)
+ cv2.imwrite(self.output_path, pic)
+ print('Predicted picture is saved: ' + self.output_path)
diff --git a/ppgan/apps/base_predictor.py b/ppgan/apps/base_predictor.py
index 2cdd3e31366bd70e640bbe14a03b7efabb2b9718..4569da791b2b128f725fe428bf4f0ae5e5911124 100644
--- a/ppgan/apps/base_predictor.py
+++ b/ppgan/apps/base_predictor.py
@@ -28,8 +28,8 @@ class BasePredictor(object):
# todo self.model = build_model(self.cfg)
pass
else:
- place = paddle.fluid.framework._current_expected_place()
- self.exe = paddle.fluid.Executor(place)
+ place = paddle.get_device()
+ self.exe = paddle.static.Executor(place)
file_names = os.listdir(self.weight_path)
for file_name in file_names:
if file_name.find('model') > -1:
diff --git a/ppgan/apps/dain_predictor.py b/ppgan/apps/dain_predictor.py
index 31d9e974782d5007fdaff141fe7b42f8c41834f9..bf899c52556315fcff3b9bb9c6de2a08112b5bef 100644
--- a/ppgan/apps/dain_predictor.py
+++ b/ppgan/apps/dain_predictor.py
@@ -21,7 +21,6 @@ from tqdm import tqdm
from imageio import imread, imsave
import paddle
-import paddle.fluid as fluid
from ppgan.utils.download import get_path_from_url
from ppgan.utils.video import video2frames, frames2video
@@ -79,7 +78,7 @@ class DAINPredictor(BasePredictor):
out_path = video2frames(video_path, frame_path_input)
- vidname = video_path.split('/')[-1].split('.')[0]
+ vidname = os.path.basename(video_path).split('.')[0]
frames = sorted(glob.glob(os.path.join(out_path, '*.png')))
@@ -123,8 +122,8 @@ class DAINPredictor(BasePredictor):
for i in tqdm(range(frame_num - 1)):
first = frames[i]
second = frames[i + 1]
- first_index = int(first.split('/')[-1].split('.')[-2])
- second_index = int(second.split('/')[-1].split('.')[-2])
+ first_index = int(first.split(os.sep)[-1].split('.')[-2])
+ second_index = int(second.split(os.sep)[-1].split('.')[-2])
img_first = imread(first)
img_second = imread(second)
@@ -225,7 +224,7 @@ class DAINPredictor(BasePredictor):
for i in range(num1):
src = frames1[i]
- imgname = int(src.split('/')[-1].split('.')[-2])
+ imgname = int(src.split(os.sep)[-1].split('.')[-2])
assert i == imgname
dst = os.path.join(combined,
'{:08d}.png'.format(i * (num_frames + 1)))
@@ -250,14 +249,14 @@ class DAINPredictor(BasePredictor):
for i in range(num1):
src = frames1[i]
- index = int(src.split('/')[-1].split('.')[-2])
+ index = int(src.split(os.sep)[-1].split('.')[-2])
dst = os.path.join(combined,
'{:08d}.png'.format(times_interp * index))
shutil.copy2(src, dst)
for i in range(num2):
src = frames2[i]
- imgname = src.split('/')[-1]
+ imgname = src.split(os.sep)[-1]
dst = os.path.join(combined, imgname)
shutil.copy2(src, dst)
@@ -280,9 +279,10 @@ class DAINPredictor(BasePredictor):
for (h, hashed_paths) in hashes.items():
if len(hashed_paths) > 1:
- first_index = int(hashed_paths[0].split('/')[-1].split('.')[-2])
- last_index = int(
- hashed_paths[-1].split('/')[-1].split('.')[-2]) + 1
+ first_index = int(hashed_paths[0].split(
+ os.sep)[-1].split('.')[-2])
+ last_index = int(hashed_paths[-1].split(
+ os.sep)[-1].split('.')[-2]) + 1
gap = 2 * (last_index - first_index) - 1
if gap > 2 * max_interp:
cut1 = len(hashed_paths) // 3
diff --git a/ppgan/apps/edvr_predictor.py b/ppgan/apps/edvr_predictor.py
index 0a6425dc289158bd57caa00d65d8d15a76917cad..74ac07504da6d29109e7af286d48547d29538fbd 100644
--- a/ppgan/apps/edvr_predictor.py
+++ b/ppgan/apps/edvr_predictor.py
@@ -19,12 +19,15 @@ import glob
import numpy as np
from tqdm import tqdm
+import paddle
+from paddle.io import Dataset, DataLoader
+
from ppgan.utils.download import get_path_from_url
from ppgan.utils.video import frames2video, video2frames
-
+from ppgan.models.generators import EDVRNet
from .base_predictor import BasePredictor
-EDVR_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/applications/edvr_infer_model.tar'
+EDVR_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/EDVR_L_w_tsa_SRx4.pdparams'
def get_img(pred):
@@ -110,7 +113,7 @@ def get_test_neighbor_frames(crt_i, N, max_n, padding='new_info'):
return return_l
-class EDVRDataset:
+class EDVRDataset(Dataset):
def __init__(self, frame_paths):
self.frames = frame_paths
@@ -133,16 +136,15 @@ class EDVRDataset:
class EDVRPredictor(BasePredictor):
- def __init__(self, output='output', weight_path=None):
+ def __init__(self, output='output', weight_path=None, bs=1):
self.input = input
self.output = os.path.join(output, 'EDVR')
-
+ self.bs = bs
+ self.model = EDVRNet(nf=128, back_RBs=40)
if weight_path is None:
weight_path = get_path_from_url(EDVR_WEIGHT_URL)
-
- self.weight_path = weight_path
-
- self.build_inference_model()
+ self.model.set_dict(paddle.load(weight_path)['generator'])
+ self.model.eval()
def run(self, video_path):
vid = video_path
@@ -163,23 +165,23 @@ class EDVRPredictor(BasePredictor):
frames = sorted(glob.glob(os.path.join(out_path, '*.png')))
- dataset = EDVRDataset(frames)
+ test_dataset = EDVRDataset(frames)
+ dataset = DataLoader(test_dataset, batch_size=self.bs, num_workers=2)
periods = []
cur_time = time.time()
for infer_iter, data in enumerate(tqdm(dataset)):
- data_feed_in = [data[0]]
-
- outs = self.base_forward(np.array(data_feed_in))
-
- infer_result_list = [item for item in outs]
-
+ data_feed_in = paddle.to_tensor(data[0])
+ with paddle.no_grad():
+ outs = self.model(data_feed_in).numpy()
+ infer_result_list = [outs[i, :, :, :] for i in range(self.bs)]
frame_path = data[1]
-
- img_i = get_img(infer_result_list[0])
- save_img(
- img_i,
- os.path.join(pred_frame_path, os.path.basename(frame_path)))
+ for i in range(self.bs):
+ img_i = get_img(infer_result_list[i])
+ save_img(
+ img_i,
+ os.path.join(pred_frame_path,
+ os.path.basename(frame_path[i])))
prev_time = cur_time
cur_time = time.time()
diff --git a/ppgan/apps/face_parse_predictor.py b/ppgan/apps/face_parse_predictor.py
index 4cf0084e1821f45497ae8712f2804b39702bd238..f9890605c9229831c3fa00e0dae3a2abe3c96a19 100644
--- a/ppgan/apps/face_parse_predictor.py
+++ b/ppgan/apps/face_parse_predictor.py
@@ -52,8 +52,8 @@ class FaceParsePredictor(BasePredictor):
mask = cv2.resize(mask.numpy(), (256, 256))
mask = mask.astype(np.uint8)
mask = mask2image(mask)
- if not os.path.exists(output_path):
- os.makedirs(output_path)
+ if not os.path.exists(self.output_path):
+ os.makedirs(self.output_path)
save_path = os.path.join(self.output_path, 'face_parse.png')
cv2.imwrite(save_path, mask)
return mask
diff --git a/ppgan/apps/first_order_predictor.py b/ppgan/apps/first_order_predictor.py
index 7fe981a4718f356d19b714e808aef0989ccfa857..8a857d3d69bd4c97bd7f1a82e2540e6bac6b58d2 100644
--- a/ppgan/apps/first_order_predictor.py
+++ b/ppgan/apps/first_order_predictor.py
@@ -14,14 +14,14 @@
import os
import sys
+import cv2
+import math
import yaml
import pickle
import imageio
import numpy as np
from tqdm import tqdm
-from skimage import img_as_ubyte
-from skimage.transform import resize
from scipy.spatial import ConvexHull
import paddle
@@ -29,11 +29,13 @@ from ppgan.utils.download import get_path_from_url
from ppgan.utils.animate import normalize_kp
from ppgan.modules.keypoint_detector import KPDetector
from ppgan.models.generators.occlusion_aware import OcclusionAwareGenerator
+from ppgan.faceutils import face_detection
from .base_predictor import BasePredictor
class FirstOrderPredictor(BasePredictor):
+
def __init__(self,
output='output',
weight_path=None,
@@ -41,58 +43,132 @@ class FirstOrderPredictor(BasePredictor):
relative=False,
adapt_scale=False,
find_best_frame=False,
- best_frame=None):
+ best_frame=None,
+ ratio=1.0,
+ filename='result.mp4',
+ face_detector='sfd',
+ multi_person=False,
+ image_size=256,
+ face_enhancement=False,
+ batch_size=1,
+ mobile_net=False,
+ slice_size=0):
if config is not None and isinstance(config, str):
- self.cfg = yaml.load(config)
+ with open(config) as f:
+ self.cfg = yaml.load(f, Loader=yaml.SafeLoader)
elif isinstance(config, dict):
self.cfg = config
elif config is None:
self.cfg = {
- 'model_params': {
+ 'model': {
'common_params': {
'num_kp': 10,
'num_channels': 3,
'estimate_jacobian': True
},
- 'kp_detector_params': {
- 'temperature': 0.1,
- 'block_expansion': 32,
- 'max_features': 1024,
- 'scale_factor': 0.25,
- 'num_blocks': 5
- },
- 'generator_params': {
- 'block_expansion': 64,
- 'max_features': 512,
- 'num_down_blocks': 2,
- 'num_bottleneck_blocks': 6,
- 'estimate_occlusion_map': True,
- 'dense_motion_params': {
- 'block_expansion': 64,
+ 'generator': {
+ 'kp_detector_cfg': {
+ 'temperature': 0.1,
+ 'block_expansion': 32,
'max_features': 1024,
- 'num_blocks': 5,
- 'scale_factor': 0.25
+ 'scale_factor': 0.25,
+ 'num_blocks': 5
+ },
+ 'generator_cfg': {
+ 'block_expansion': 64,
+ 'max_features': 512,
+ 'num_down_blocks': 2,
+ 'num_bottleneck_blocks': 6,
+ 'estimate_occlusion_map': True,
+ 'dense_motion_params': {
+ 'block_expansion': 64,
+ 'max_features': 1024,
+ 'num_blocks': 5,
+ 'scale_factor': 0.25
+ }
}
}
}
}
- if weight_path is None:
- vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-cpk.pdparams'
- weight_path = get_path_from_url(vox_cpk_weight_url)
+ self.image_size = image_size
+ if weight_path is None:
+ if mobile_net:
+ vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-mobile.pdparams'
+
+ else:
+ if self.image_size == 512:
+ vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-cpk-512.pdparams'
+ else:
+ vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-cpk.pdparams'
+ weight_path = get_path_from_url(vox_cpk_weight_url)
self.weight_path = weight_path
if not os.path.exists(output):
os.makedirs(output)
self.output = output
+ self.filename = filename
self.relative = relative
self.adapt_scale = adapt_scale
self.find_best_frame = find_best_frame
self.best_frame = best_frame
+ self.ratio = ratio
+ self.face_detector = face_detector
self.generator, self.kp_detector = self.load_checkpoints(
self.cfg, self.weight_path)
+ self.multi_person = multi_person
+ self.face_enhancement = face_enhancement
+ self.batch_size = batch_size
+ if face_enhancement:
+ from ppgan.faceutils.face_enhancement import FaceEnhancement
+ self.faceenhancer = FaceEnhancement(batch_size=batch_size)
+ self.slice_size = slice_size
+
+ def read_img(self, path):
+ img = imageio.imread(path)
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ # some images have 4 channels
+ if img.shape[2] > 3:
+ img = img[:, :, :3]
+ return img
def run(self, source_image, driving_video):
- source_image = imageio.imread(source_image)
+
+ def get_prediction(face_image):
+ if self.find_best_frame or self.best_frame is not None:
+ i = self.best_frame if self.best_frame is not None else self.find_best_frame_func(
+ source_image, driving_video)
+
+ print("Best frame: " + str(i))
+ driving_forward = driving_video[i:]
+ driving_backward = driving_video[:(i + 1)][::-1]
+ predictions_forward = self.make_animation(
+ face_image,
+ driving_forward,
+ self.generator,
+ self.kp_detector,
+ relative=self.relative,
+ adapt_movement_scale=self.adapt_scale)
+ predictions_backward = self.make_animation(
+ face_image,
+ driving_backward,
+ self.generator,
+ self.kp_detector,
+ relative=self.relative,
+ adapt_movement_scale=self.adapt_scale)
+ predictions = predictions_backward[::-1] + predictions_forward[
+ 1:]
+ else:
+ predictions = self.make_animation(
+ face_image,
+ driving_video,
+ self.generator,
+ self.kp_detector,
+ relative=self.relative,
+ adapt_movement_scale=self.adapt_scale)
+ return predictions
+
+ source_image = self.read_img(source_image)
reader = imageio.get_reader(driving_video)
fps = reader.get_meta_data()['fps']
driving_video = []
@@ -100,56 +176,71 @@ class FirstOrderPredictor(BasePredictor):
for im in reader:
driving_video.append(im)
except RuntimeError:
+ print("Read driving video error!")
pass
reader.close()
- source_image = resize(source_image, (256, 256))[..., :3]
driving_video = [
- resize(frame, (256, 256))[..., :3] for frame in driving_video
+ cv2.resize(frame, (self.image_size, self.image_size)) / 255.0
+ for frame in driving_video
]
+ results = []
- if self.find_best_frame or self.best_frame is not None:
- i = self.best_frame if self.best_frame is not None else self.find_best_frame_func(
- source_image, driving_video)
-
- print("Best frame: " + str(i))
- driving_forward = driving_video[i:]
- driving_backward = driving_video[:(i + 1)][::-1]
- predictions_forward = self.make_animation(
- source_image,
- driving_forward,
- self.generator,
- self.kp_detector,
- relative=self.relative,
- adapt_movement_scale=self.adapt_scale)
- predictions_backward = self.make_animation(
- source_image,
- driving_backward,
- self.generator,
- self.kp_detector,
- relative=self.relative,
- adapt_movement_scale=self.adapt_scale)
- predictions = predictions_backward[::-1] + predictions_forward[1:]
- else:
- predictions = self.make_animation(
- source_image,
- driving_video,
- self.generator,
- self.kp_detector,
- relative=self.relative,
- adapt_movement_scale=self.adapt_scale)
- imageio.mimsave(os.path.join(self.output, 'result.mp4'),
- [img_as_ubyte(frame) for frame in predictions],
- fps=fps)
+ bboxes = self.extract_bbox(source_image.copy())
+ print(str(len(bboxes)) + " persons have been detected")
+
+ # for multi person
+ for rec in bboxes:
+ face_image = source_image.copy()[rec[1]:rec[3], rec[0]:rec[2]]
+ face_image = cv2.resize(face_image,
+ (self.image_size, self.image_size)) / 255.0
+ predictions = get_prediction(face_image)
+ results.append({
+ 'rec':
+ rec,
+ 'predict':
+ [predictions[i] for i in range(predictions.shape[0])]
+ })
+ if len(bboxes) == 1 or not self.multi_person:
+ break
+ out_frame = []
+
+ for i in range(len(driving_video)):
+ frame = source_image.copy()
+ for result in results:
+ x1, y1, x2, y2, _ = result['rec']
+ h = y2 - y1
+ w = x2 - x1
+ out = result['predict'][i]
+ out = cv2.resize(out.astype(np.uint8), (x2 - x1, y2 - y1))
+ if len(results) == 1:
+ frame[y1:y2, x1:x2] = out
+ break
+ else:
+ patch = np.zeros(frame.shape).astype('uint8')
+ patch[y1:y2, x1:x2] = out
+ mask = np.zeros(frame.shape[:2]).astype('uint8')
+ cx = int((x1 + x2) / 2)
+ cy = int((y1 + y2) / 2)
+ cv2.circle(mask, (cx, cy), math.ceil(h * self.ratio),
+ (255, 255, 255), -1, 8, 0)
+ frame = cv2.copyTo(patch, mask, frame)
+
+ out_frame.append(frame)
+ imageio.mimsave(os.path.join(self.output, self.filename),
+                        out_frame,
+ fps=fps)
def load_checkpoints(self, config, checkpoint_path):
- generator = OcclusionAwareGenerator(
- **config['model_params']['generator_params'],
- **config['model_params']['common_params'])
+ generator = OcclusionAwareGenerator(**config['model']['generator']
+ ['generator_cfg'],
+ **config['model']['common_params'],
+ inference=True)
- kp_detector = KPDetector(**config['model_params']['kp_detector_params'],
- **config['model_params']['common_params'])
+ kp_detector = KPDetector(
+ **config['model']['generator']['kp_detector_cfg'],
+ **config['model']['common_params'])
checkpoint = paddle.load(self.weight_path)
generator.set_state_dict(checkpoint['generator'])
@@ -173,15 +264,61 @@ class FirstOrderPredictor(BasePredictor):
source = paddle.to_tensor(source_image[np.newaxis].astype(
np.float32)).transpose([0, 3, 1, 2])
- driving = paddle.to_tensor(
- np.array(driving_video)[np.newaxis].astype(
- np.float32)).transpose([0, 4, 1, 2, 3])
+ driving_video_np = np.array(driving_video).astype(np.float32)
+ driving_n, driving_h, driving_w, driving_c = driving_video_np.shape
+
+ driving_slices = []
+
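+            # slice_size acts as a rough memory budget (in array elements) for
+            # how many driving frames are converted to paddle tensors at once;
+            # each slice holds a whole number of batches, so a batch never
+            # straddles two slices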
+ if self.slice_size != 0:
+ batch_count_in_slice = int(
+ np.floor(
+ float(self.slice_size) /
+ (self.batch_size * driving_h * driving_w * driving_c)))
+ assert batch_count_in_slice > 0, "batch_count_in_slice is 0, use smaller batch_size or bigger slice_size"
+ frame_count_in_slice = batch_count_in_slice * self.batch_size
+ for slice_start in range(0, driving_n, frame_count_in_slice):
+ slice_end = slice_start + min(frame_count_in_slice,
+ driving_n - slice_start)
+ current_slice = paddle.to_tensor(
+ driving_video_np[slice_start:slice_end, ]).transpose(
+ [0, 3, 1, 2])
+ driving_slices.append(current_slice)
+ else:
+ # whole driving as a single slice
+                driving = paddle.to_tensor(driving_video_np).transpose(
+                    [0, 3, 1, 2])
+ frame_count_in_slice = driving_n
+ driving_slices.append(driving)
+
kp_source = kp_detector(source)
- kp_driving_initial = kp_detector(driving[:, :, 0])
+ kp_driving_initial = kp_detector(driving_slices[0][0:1])
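+            # tile the source image and its keypoints along the batch axis so
+            # that batch_size driving frames can be generated per forward pass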
+ kp_source_batch = {}
+ kp_source_batch["value"] = paddle.tile(
+ kp_source["value"], repeat_times=[self.batch_size, 1, 1])
+ kp_source_batch["jacobian"] = paddle.tile(
+ kp_source["jacobian"], repeat_times=[self.batch_size, 1, 1, 1])
+ source = paddle.tile(source,
+ repeat_times=[self.batch_size, 1, 1, 1])
+ begin_idx = 0
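+            # walk through the driving video batch_size frames at a time,
+            # locating the slice that holds the current batch and its frame
+            # offsets inside that slice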
+ for frame_idx in tqdm(
+ range(int(np.ceil(float(driving_n) / self.batch_size)))):
+ frame_num = min(self.batch_size, driving_n - begin_idx)
+ slice_id = int(frame_idx * self.batch_size /
+ frame_count_in_slice)
+
+                internal_start = begin_idx - slice_id * frame_count_in_slice
+                internal_end = internal_start + frame_num
+
+ driving_frame = driving_slices[slice_id][
+ internal_start:internal_end]
- for frame_idx in tqdm(range(driving.shape[2])):
- driving_frame = driving[:, :, frame_idx]
kp_driving = kp_detector(driving_frame)
+ kp_source_img = {}
+ kp_source_img["value"] = kp_source_batch["value"][0:frame_num]
+ kp_source_img["jacobian"] = kp_source_batch["jacobian"][
+ 0:frame_num]
+
kp_norm = normalize_kp(
kp_source=kp_source,
kp_driving=kp_driving,
@@ -189,11 +326,19 @@ class FirstOrderPredictor(BasePredictor):
use_relative_movement=relative,
use_relative_jacobian=relative,
adapt_movement_scale=adapt_movement_scale)
- out = generator(source, kp_source=kp_source, kp_driving=kp_norm)
- predictions.append(
- np.transpose(out['prediction'].numpy(), [0, 2, 3, 1])[0])
- return predictions
+ out = generator(source[0:frame_num],
+ kp_source=kp_source_img,
+ kp_driving=kp_norm)
+ img = np.transpose(out['prediction'].numpy(),
+ [0, 2, 3, 1]) * 255.0
+
+ if self.face_enhancement:
+ img = self.faceenhancer.enhance_from_batch(img)
+
+ predictions.append(img)
+ begin_idx += frame_num
+ return np.concatenate(predictions)
def find_best_frame_func(self, source, driving):
import face_alignment
@@ -220,3 +365,62 @@ class FirstOrderPredictor(BasePredictor):
norm = new_norm
frame_num = i
return frame_num
+
+ def extract_bbox(self, image):
+ detector = face_detection.FaceAlignment(
+ face_detection.LandmarksType._2D,
+ flip_input=False,
+ face_detector=self.face_detector)
+
+ frame = [image]
+ predictions = detector.get_detections_for_image(np.array(frame))
+ person_num = len(predictions)
+ if person_num == 0:
+ return np.array([])
+ results = []
+ face_boxs = []
+ h, w, _ = image.shape
+ for rect in predictions:
+ bh = rect[3] - rect[1]
+ bw = rect[2] - rect[0]
+ cy = rect[1] + int(bh / 2)
+ cx = rect[0] + int(bw / 2)
+ margin = max(bh, bw)
+ y1 = max(0, cy - margin)
+ x1 = max(0, cx - int(0.8 * margin))
+ y2 = min(h, cy + margin)
+ x2 = min(w, cx + int(0.8 * margin))
+ area = (y2 - y1) * (x2 - x1)
+ results.append([x1, y1, x2, y2, area])
+        # if a person has more than one bbox, keep the largest one
+        # (a greedy merge strategy might work even better)
+        results = sorted(results, key=lambda box: box[4], reverse=True)
+ results_box = [results[0]]
+ for i in range(1, person_num):
+ num = len(results_box)
+ add_person = True
+ for j in range(num):
+ pre_person = results_box[j]
+ iou = self.IOU(pre_person[0], pre_person[1], pre_person[2],
+ pre_person[3], pre_person[4], results[i][0],
+ results[i][1], results[i][2], results[i][3],
+ results[i][4])
+ if iou > 0.5:
+ add_person = False
+ break
+ if add_person:
+ results_box.append(results[i])
+ boxes = np.array(results_box)
+ return boxes
+
+ def IOU(self, ax1, ay1, ax2, ay2, sa, bx1, by1, bx2, by2, sb):
+        # sa and sb are the precomputed areas of box a and box b
+ x1, y1 = max(ax1, bx1), max(ay1, by1)
+ x2, y2 = min(ax2, bx2), min(ay2, by2)
+ w = x2 - x1
+ h = y2 - y1
+ if w < 0 or h < 0:
+ return 0.0
+ else:
+ return 1.0 * w * h / (sa + sb - w * h)
diff --git a/ppgan/apps/gpen_predictor.py b/ppgan/apps/gpen_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0648a1724d25f423ec4ea9c99c42cdfca7d6c36
--- /dev/null
+++ b/ppgan/apps/gpen_predictor.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import os
+import random
+import numpy as np
+import paddle
+import sys
+
+sys.path.append(".")
+from .base_predictor import BasePredictor
+from ppgan.datasets.gpen_dataset import GFPGAN_degradation
+from ppgan.models.generators import GPENGenerator
+from ppgan.metrics.fid import FID
+from ppgan.utils.download import get_path_from_url
+import cv2
+
+import warnings
+
+model_cfgs = {
+ 'gpen-ffhq-256': {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/gpen-ffhq-256-generator.pdparams',
+ 'size': 256,
+ 'style_dim': 512,
+ 'n_mlp': 8,
+ 'channel_multiplier': 1,
+ 'narrow': 0.5
+ }
+}
+
+
+def psnr(pred, gt):
+ pred = paddle.clip(pred, min=0, max=1)
+ gt = paddle.clip(gt, min=0, max=1)
+ imdff = np.asarray(pred - gt)
+ rmse = math.sqrt(np.mean(imdff**2))
+ if rmse == 0:
+ return 100
+ return 20 * math.log10(1.0 / rmse)
+
+
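+# build a (degraded, ground-truth) test pair from a single clean face image
+# using the GFPGAN-style degradation pipeline, so run() can report PSNR/FID
+# of the restored output against the original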
+def data_loader(path, size=256):
+ degrader = GFPGAN_degradation()
+
+ img_gt = cv2.imread(path, cv2.IMREAD_COLOR)
+
+ img_gt = cv2.resize(img_gt, (size, size), interpolation=cv2.INTER_NEAREST)
+
+ img_gt = img_gt.astype(np.float32) / 255.
+ img_gt, img_lq = degrader.degrade_process(img_gt)
+
+ img_gt = (paddle.to_tensor(img_gt) - 0.5) / 0.5
+ img_lq = (paddle.to_tensor(img_lq) - 0.5) / 0.5
+
+ img_gt = img_gt.transpose([2, 0, 1]).flip(0).unsqueeze(0)
+ img_lq = img_lq.transpose([2, 0, 1]).flip(0).unsqueeze(0)
+
+ return np.array(img_lq).astype('float32'), np.array(img_gt).astype(
+ 'float32')
+
+
+class GPENPredictor(BasePredictor):
+
+ def __init__(self,
+ output_path='output_dir',
+ weight_path=None,
+ model_type=None,
+ seed=100,
+ size=256,
+ style_dim=512,
+ n_mlp=8,
+ channel_multiplier=1,
+ narrow=0.5):
+ self.output_path = output_path
+ self.size = size
+ if weight_path is None:
+ if model_type in model_cfgs.keys():
+ weight_path = get_path_from_url(
+ model_cfgs[model_type]['model_urls'])
+ size = model_cfgs[model_type].get('size', size)
+ style_dim = model_cfgs[model_type].get('style_dim', style_dim)
+ n_mlp = model_cfgs[model_type].get('n_mlp', n_mlp)
+ channel_multiplier = model_cfgs[model_type].get(
+ 'channel_multiplier', channel_multiplier)
+ narrow = model_cfgs[model_type].get('narrow', narrow)
+ checkpoint = paddle.load(weight_path)
+ else:
+ raise ValueError(
+                    'Predictor needs a weight path or a pretrained model type')
+ else:
+ checkpoint = paddle.load(weight_path)
+
+ warnings.filterwarnings("always")
+ self.generator = GPENGenerator(size, style_dim, n_mlp, channel_multiplier,
+ narrow)
+ self.generator.set_state_dict(checkpoint)
+ self.generator.eval()
+
+ if seed is not None:
+ paddle.seed(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+
+ def run(self, img_path):
+ os.makedirs(self.output_path, exist_ok=True)
+ input_array, target_array = data_loader(img_path, self.size)
+ input_tensor = paddle.to_tensor(input_array)
+ target_tensor = paddle.to_tensor(target_array)
+
+ FID_model = FID(use_GPU=True)
+
+ with paddle.no_grad():
+ output, _ = self.generator(input_tensor)
+ psnr_score = psnr(target_tensor, output)
+ FID_model.update(output, target_tensor)
+ fid_score = FID_model.accumulate()
+
+ input_tensor = input_tensor.transpose([0, 2, 3, 1])
+ target_tensor = target_tensor.transpose([0, 2, 3, 1])
+ output = output.transpose([0, 2, 3, 1])
+ sample_result = paddle.concat(
+ (input_tensor[0], output[0], target_tensor[0]), 1)
+ sample = cv2.cvtColor((sample_result.numpy() + 1) / 2 * 255,
+ cv2.COLOR_RGB2BGR)
+ file_name = self.output_path + '/gpen_predict.png'
+ cv2.imwrite(file_name, sample)
+        print(f"result saved in: {file_name}")
+        print(f"\tFID: {fid_score}\n\tPSNR: {psnr_score}")
diff --git a/ppgan/apps/invdn_predictor.py b/ppgan/apps/invdn_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..931a36a113a88ef228202e2343732c4d811806ab
--- /dev/null
+++ b/ppgan/apps/invdn_predictor.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+from glob import glob
+from natsort import natsorted
+import numpy as np
+import os
+import random
+from tqdm import tqdm
+
+import paddle
+
+from ppgan.models.generators import InvDN
+from ppgan.utils.download import get_path_from_url
+from .base_predictor import BasePredictor
+
+model_cfgs = {
+ 'Denoising': {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/InvDN_Denoising.pdparams',
+ 'channel_in': 3,
+ 'channel_out': 3,
+ 'block_num': [8, 8],
+ 'scale': 4,
+ 'down_num': 2
+ }
+}
+
+
+class InvDNPredictor(BasePredictor):
+ def __init__(self, output_path='output_dir', weight_path=None, seed=None):
+ self.output_path = output_path
+ task = 'Denoising'
+ self.task = task
+
+ if weight_path is None:
+ if task in model_cfgs.keys():
+ weight_path = get_path_from_url(model_cfgs[task]['model_urls'])
+ checkpoint = paddle.load(weight_path)
+ else:
+                raise ValueError('Predictor needs a task to be defined!')
+ else:
+            if weight_path.startswith("http"):  # os.path.islink doesn't work!
+ weight_path = get_path_from_url(weight_path)
+ checkpoint = paddle.load(weight_path)
+ else:
+ checkpoint = paddle.load(weight_path)
+
+ self.generator = InvDN(channel_in=model_cfgs[task]['channel_in'],
+ channel_out=model_cfgs[task]['channel_out'],
+ block_num=model_cfgs[task]['block_num'],
+ scale=model_cfgs[task]['scale'],
+ down_num=model_cfgs[task]['down_num'])
+
+ checkpoint = checkpoint['generator']
+ self.generator.set_state_dict(checkpoint)
+ self.generator.eval()
+
+ if seed is not None:
+ paddle.seed(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+
+ def get_images(self, images_path):
+ if os.path.isdir(images_path):
+ return natsorted(
+ glob(os.path.join(images_path, '*.jpeg')) +
+ glob(os.path.join(images_path, '*.jpg')) +
+ glob(os.path.join(images_path, '*.JPG')) +
+ glob(os.path.join(images_path, '*.png')) +
+ glob(os.path.join(images_path, '*.PNG')))
+ else:
+ return [images_path]
+
+ def imread_uint(self, path, n_channels=3):
+ # input: path
+ # output: HxWx3(RGB or GGG), or HxWx1 (G)
+ if n_channels == 1:
+ img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE
+ img = np.expand_dims(img, axis=2) # HxWx1
+ elif n_channels == 3:
+ img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G
+ if img.ndim == 2:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG
+ else:
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB
+
+ return img
+
+ def uint2single(self, img):
+
+ return np.float32(img / 255.)
+
+ # convert single (HxWxC) to 3-dimensional paddle tensor
+ def single2tensor3(self, img):
+ return paddle.Tensor(np.ascontiguousarray(
+ img, dtype=np.float32)).transpose([2, 0, 1])
+
+ def forward_x8(self, x, forward_function, noise_channel):
+ def _transform(v, op):
+ v2np = v.cpu().numpy()
+ if op == 'v':
+ tfnp = v2np[:, :, :, ::-1].copy()
+ elif op == 'h':
+ tfnp = v2np[:, :, ::-1, :].copy()
+ elif op == 't':
+ tfnp = v2np.transpose((0, 1, 3, 2)).copy()
+
+ ret = paddle.to_tensor(tfnp)
+ return ret
+
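+        # geometric (x8) self-ensemble: run the network on the 8 flip/transpose
+        # variants of the input (each with its own Gaussian noise), undo the
+        # corresponding transform on every output, and average the results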
+ noise_list = [x]
+ for tf in 'v', 'h', 't':
+ noise_list.extend([_transform(t, tf) for t in noise_list])
+
+ gaussian_list = [
+ paddle.randn(
+ (aug.shape[0], noise_channel, aug.shape[2], aug.shape[3]))
+ for aug in noise_list
+ ]
+ sr_list = [
+ forward_function(aug, g_noise)[0]
+ for aug, g_noise in zip(noise_list, gaussian_list)
+ ]
+
+ for i in range(len(sr_list)):
+ if i > 3:
+ sr_list[i] = _transform(sr_list[i], 't')
+ if i % 4 > 1:
+ sr_list[i] = _transform(sr_list[i], 'h')
+ if (i % 4) % 2 == 1:
+ sr_list[i] = _transform(sr_list[i], 'v')
+
+ output_cat = paddle.stack(sr_list, axis=0)
+ output = output_cat.mean(axis=0)
+
+ return output
+
+ def run(self, images_path=None, disable_mc=False):
+ os.makedirs(self.output_path, exist_ok=True)
+ task_path = os.path.join(self.output_path, self.task)
+ os.makedirs(task_path, exist_ok=True)
+ image_files = self.get_images(images_path)
+ for image_file in tqdm(image_files):
+ img_noisy = self.imread_uint(image_file, 3)
+
+ image_name = os.path.basename(image_file)
+ img = cv2.cvtColor(img_noisy, cv2.COLOR_RGB2BGR)
+ cv2.imwrite(os.path.join(task_path, image_name), img)
+
+ tmps = image_name.split('.')
+ assert len(
+                tmps) == 2, f'Invalid image name: {image_name}, too many "."'
+ restoration_save_path = os.path.join(
+ task_path, f'{tmps[0]}_restoration.{tmps[1]}')
+
+ img_noisy = self.uint2single(img_noisy)
+
+ # HWC to CHW, numpy to tensor
+ img_noisy = self.single2tensor3(img_noisy)
+ img_noisy = img_noisy.unsqueeze(0)
+ with paddle.no_grad():
+
+ # Monte Carlo Self Ensemble
+ noise_channel = 3 * 4**(model_cfgs['Denoising']['down_num']) - 3
+ if not disable_mc:
+ output = self.forward_x8(img_noisy, self.generator.forward,
+ noise_channel)
+ output = output[:, :3, :, :]
+ else:
+ noise = paddle.randn(
+ (img_noisy.shape[0], noise_channel, img_noisy.shape[2],
+ img_noisy.shape[3]))
+ output, _ = self.generator(img_noisy, noise)
+ output = output[:, :3, :, :]
+
+ restored = paddle.clip(output, 0, 1)
+
+ restored = restored.numpy()
+ restored = restored.transpose(0, 2, 3, 1)
+ restored = restored[0]
+ restored = restored * 255
+ restored = restored.astype(np.uint8)
+
+ cv2.imwrite(restoration_save_path,
+ cv2.cvtColor(restored, cv2.COLOR_RGB2BGR))
+
+ print('Done, output path is:', task_path)
diff --git a/ppgan/apps/lapstyle_predictor.py b/ppgan/apps/lapstyle_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a590ee70494912447bd34afcdf8e7b3cbd973174
--- /dev/null
+++ b/ppgan/apps/lapstyle_predictor.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2 as cv
+import numpy as np
+import urllib.request
+from PIL import Image
+
+import paddle
+import paddle.nn.functional as F
+from paddle.vision.transforms import functional
+
+from ppgan.utils.download import get_path_from_url
+from ppgan.utils.visual import tensor2img
+from ppgan.models.generators import DecoderNet, Encoder, RevisionNet
+from .base_predictor import BasePredictor
+
+LapStyle_circuit_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/lapstyle_circuit.pdparams'
+LapStyle_ocean_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/lapstyle_ocean.pdparams'
+LapStyle_starrynew_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/lapstyle_starrynew.pdparams'
+LapStyle_stars_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/lapstyle_stars.pdparams'
+
+
+def img(img):
+ # some images have 4 channels
+ if img.shape[2] > 3:
+ img = img[:, :, :3]
+    # (HWC to CHW conversion happens later in functional.to_tensor)
+ return img
+
+
+def img_read(content_img_path, style_image_path):
+ content_img = cv.imread(content_img_path)
+ if content_img.ndim == 2:
+ content_img = cv.cvtColor(content_img, cv.COLOR_GRAY2RGB)
+ else:
+ content_img = cv.cvtColor(content_img, cv.COLOR_BGR2RGB)
+ h, w, c = content_img.shape
+ content_img = Image.fromarray(content_img)
+ content_img = content_img.resize((512, 512), Image.BILINEAR)
+ content_img = np.array(content_img)
+ content_img = img(content_img)
+ content_img = functional.to_tensor(content_img)
+
+ style_img = cv.imread(style_image_path)
+ style_img = cv.cvtColor(style_img, cv.COLOR_BGR2RGB)
+ style_img = Image.fromarray(style_img)
+ style_img = style_img.resize((512, 512), Image.BILINEAR)
+ style_img = np.array(style_img)
+ style_img = img(style_img)
+ style_img = functional.to_tensor(style_img)
+
+ content_img = paddle.unsqueeze(content_img, axis=0)
+ style_img = paddle.unsqueeze(style_img, axis=0)
+ return content_img, style_img, h, w
+
+
+def tensor_resample(tensor, dst_size, mode='bilinear'):
+ return F.interpolate(tensor, dst_size, mode=mode, align_corners=False)
+
+
+def laplacian(x):
+    """Compute the Laplacian residual.
+
+    Returns:
+        x - upsample(downsample(x))
+    """
+ return x - tensor_resample(
+ tensor_resample(x, [x.shape[2] // 2, x.shape[3] // 2]),
+ [x.shape[2], x.shape[3]])
+
+
+def make_laplace_pyramid(x, levels):
+ """
+ Make Laplacian Pyramid
+ """
+ pyramid = []
+ current = x
+ for i in range(levels):
+ pyramid.append(laplacian(current))
+ current = tensor_resample(
+ current,
+ (max(current.shape[2] // 2, 1), max(current.shape[3] // 2, 1)))
+ pyramid.append(current)
+ return pyramid
+
+
+def fold_laplace_pyramid(pyramid):
+ """
+ Fold Laplacian Pyramid
+ """
+ current = pyramid[-1]
+ for i in range(len(pyramid) - 2, -1, -1): # iterate from len-2 to 0
+ up_h, up_w = pyramid[i].shape[2], pyramid[i].shape[3]
+ current = pyramid[i] + tensor_resample(current, (up_h, up_w))
+ return current
+
+
+class LapStylePredictor(BasePredictor):
+ def __init__(self,
+ output='output_dir',
+ style='starrynew',
+ weight_path=None):
+ self.input = input
+ self.output = os.path.join(output, 'LapStyle')
+ if not os.path.exists(self.output):
+ os.makedirs(self.output)
+ self.net_enc = Encoder()
+ self.net_dec = DecoderNet()
+ self.net_rev = RevisionNet()
+ self.net_rev_2 = RevisionNet()
+
+ if weight_path is None:
+ if style == 'starrynew':
+ weight_path = get_path_from_url(LapStyle_starrynew_WEIGHT_URL)
+ elif style == 'circuit':
+ weight_path = get_path_from_url(LapStyle_circuit_WEIGHT_URL)
+ elif style == 'ocean':
+ weight_path = get_path_from_url(LapStyle_ocean_WEIGHT_URL)
+ elif style == 'stars':
+ weight_path = get_path_from_url(LapStyle_stars_WEIGHT_URL)
+ else:
+                raise Exception(f'style {style} has not been implemented.')
+        checkpoint = paddle.load(weight_path)
+        self.net_enc.set_dict(checkpoint['net_enc'])
+        self.net_enc.eval()
+        self.net_dec.set_dict(checkpoint['net_dec'])
+        self.net_dec.eval()
+        self.net_rev.set_dict(checkpoint['net_rev'])
+        self.net_rev.eval()
+        self.net_rev_2.set_dict(checkpoint['net_rev_2'])
+        self.net_rev_2.eval()
+
+ def run(self, content_img_path, style_image_path):
+ if not self.is_image(content_img_path):
+ raise ValueError(
+ 'The path of content_img does not exist or is not image')
+ if not self.is_image(style_image_path):
+ raise ValueError(
+ 'The path of style_image does not exist or is not image')
+ content_img, style_img, h, w = img_read(content_img_path,
+ style_image_path)
+ content_img_visual = tensor2img(content_img, min_max=(0., 1.))
+ content_img_visual = cv.cvtColor(content_img_visual, cv.COLOR_RGB2BGR)
+ content_img_visual = cv.resize(content_img_visual, (w, h))
+ cv.imwrite(os.path.join(self.output, 'content.png'), content_img_visual)
+ style_img_visual = tensor2img(style_img, min_max=(0., 1.))
+ style_img_visual = cv.cvtColor(style_img_visual, cv.COLOR_RGB2BGR)
+ cv.imwrite(os.path.join(self.output, 'style.png'), style_img_visual)
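+        # coarse-to-fine stylization: the decoder stylizes the coarsest
+        # pyramid level, then the two RevisionNets progressively add back
+        # detail from the higher-resolution Laplacian residuals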
+ pyr_ci = make_laplace_pyramid(content_img, 2)
+ pyr_si = make_laplace_pyramid(style_img, 2)
+ pyr_ci.append(content_img)
+ pyr_si.append(style_img)
+ cF = self.net_enc(pyr_ci[2])
+ sF = self.net_enc(pyr_si[2])
+ stylized_small = self.net_dec(cF, sF)
+ stylized_up = F.interpolate(stylized_small, scale_factor=2)
+
+ revnet_input = paddle.concat(x=[pyr_ci[1], stylized_up], axis=1)
+ stylized_rev_lap = self.net_rev(revnet_input)
+ stylized_rev = fold_laplace_pyramid([stylized_rev_lap, stylized_small])
+ stylized_up = F.interpolate(stylized_rev, scale_factor=2)
+
+ revnet_input = paddle.concat(x=[pyr_ci[0], stylized_up], axis=1)
+ stylized_rev_lap_second = self.net_rev_2(revnet_input)
+ stylized_rev_second = fold_laplace_pyramid(
+ [stylized_rev_lap_second, stylized_rev_lap, stylized_small])
+
+ stylized = stylized_rev_second
+ stylized_visual = tensor2img(stylized, min_max=(0., 1.))
+ stylized_visual = cv.cvtColor(stylized_visual, cv.COLOR_RGB2BGR)
+ stylized_visual = cv.resize(stylized_visual, (w, h))
+ cv.imwrite(os.path.join(self.output, 'stylized.png'), stylized_visual)
+
+ print('Model LapStyle output images path:', self.output)
+
+ return stylized
diff --git a/ppgan/apps/mpr_predictor.py b/ppgan/apps/mpr_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ec7ee868fc9e081d8d18f4fd350713bd826800a
--- /dev/null
+++ b/ppgan/apps/mpr_predictor.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+from natsort import natsorted
+from glob import glob
+import numpy as np
+import cv2
+from PIL import Image
+import paddle
+from .base_predictor import BasePredictor
+from ppgan.models.generators import MPRNet
+from ppgan.utils.download import get_path_from_url
+from ppgan.utils.visual import make_grid, tensor2img, save_image
+from ppgan.datasets.mpr_dataset import to_tensor
+from paddle.vision.transforms import Pad
+from tqdm import tqdm
+
+model_cfgs = {
+ 'Deblurring': {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/MPR_Deblurring.pdparams',
+ 'n_feat': 96,
+ 'scale_unetfeats': 48,
+ 'scale_orsnetfeats': 32,
+ },
+ 'Denoising': {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/MPR_Denoising.pdparams',
+ 'n_feat': 80,
+ 'scale_unetfeats': 48,
+ 'scale_orsnetfeats': 32,
+ },
+ 'Deraining': {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/MPR_Deraining.pdparams',
+ 'n_feat': 40,
+ 'scale_unetfeats': 20,
+ 'scale_orsnetfeats': 16,
+ }
+}
+
+
+class MPRPredictor(BasePredictor):
+ def __init__(self,
+ output_path='output_dir',
+ weight_path=None,
+ seed=None,
+ task=None):
+ self.output_path = output_path
+ self.task = task
+ self.max_size = 640
+ self.img_multiple_of = 8
+
+ if weight_path is None:
+ if task in model_cfgs.keys():
+ weight_path = get_path_from_url(model_cfgs[task]['model_urls'])
+ checkpoint = paddle.load(weight_path)
+ else:
+ raise ValueError(
+                    'Predictor needs a weight path or a pretrained model type')
+ else:
+ checkpoint = paddle.load(weight_path)
+
+ self.generator = MPRNet(
+ n_feat=model_cfgs[task]['n_feat'],
+ scale_unetfeats=model_cfgs[task]['scale_unetfeats'],
+ scale_orsnetfeats=model_cfgs[task]['scale_orsnetfeats'])
+ self.generator.set_state_dict(checkpoint)
+ self.generator.eval()
+
+ if seed is not None:
+ paddle.seed(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+
+ def get_images(self, images_path):
+ if os.path.isdir(images_path):
+ return natsorted(
+ glob(os.path.join(images_path, '*.jpeg')) +
+ glob(os.path.join(images_path, '*.jpg')) +
+ glob(os.path.join(images_path, '*.JPG')) +
+ glob(os.path.join(images_path, '*.png')) +
+ glob(os.path.join(images_path, '*.PNG')))
+ else:
+ return [images_path]
+
+ def read_image(self, image_file):
+ img = Image.open(image_file).convert('RGB')
+ max_length = max(img.width, img.height)
+ if max_length > self.max_size:
+ ratio = max_length / self.max_size
+ dw = int(img.width / ratio)
+ dh = int(img.height / ratio)
+ img = img.resize((dw, dh))
+ return img
+
+ def run(self, images_path=None):
+ os.makedirs(self.output_path, exist_ok=True)
+ task_path = os.path.join(self.output_path, self.task)
+ os.makedirs(task_path, exist_ok=True)
+ image_files = self.get_images(images_path)
+ for image_file in tqdm(image_files):
+ img = self.read_image(image_file)
+ image_name = os.path.basename(image_file)
+ img.save(os.path.join(task_path, image_name))
+ tmps = image_name.split('.')
+ assert len(
+                tmps) == 2, f'Invalid image name: {image_name}, too many "."'
+ restoration_save_path = os.path.join(
+ task_path, f'{tmps[0]}_restoration.{tmps[1]}')
+ input_ = to_tensor(img)
+
+            # pad the input so height and width are multiples of img_multiple_of
+ h, w = input_.shape[1], input_.shape[2]
+
+ H, W = ((h + self.img_multiple_of) //
+ self.img_multiple_of) * self.img_multiple_of, (
+ (w + self.img_multiple_of) //
+ self.img_multiple_of) * self.img_multiple_of
+ padh = H - h if h % self.img_multiple_of != 0 else 0
+ padw = W - w if w % self.img_multiple_of != 0 else 0
+ input_ = paddle.to_tensor(input_)
+ transform = Pad((0, 0, padw, padh), padding_mode='reflect')
+ input_ = transform(input_)
+
+ input_ = paddle.to_tensor(np.expand_dims(input_.numpy(), 0))
+
+ with paddle.no_grad():
+ restored = self.generator(input_)
+ restored = restored[0]
+ restored = paddle.clip(restored, 0, 1)
+
+ # Unpad the output
+ restored = restored[:, :, :h, :w]
+
+ restored = restored.numpy()
+ restored = restored.transpose(0, 2, 3, 1)
+ restored = restored[0]
+ restored = restored * 255
+ restored = restored.astype(np.uint8)
+
+ cv2.imwrite(restoration_save_path,
+ cv2.cvtColor(restored, cv2.COLOR_RGB2BGR))
+
+ print('Done, output path is:', task_path)
diff --git a/ppgan/apps/nafnet_predictor.py b/ppgan/apps/nafnet_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7f7fef6fffdb8d414f5ad759c78efd160888968
--- /dev/null
+++ b/ppgan/apps/nafnet_predictor.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+from glob import glob
+from natsort import natsorted
+import numpy as np
+import os
+import random
+from tqdm import tqdm
+
+import paddle
+
+from ppgan.models.generators import NAFNet
+from ppgan.utils.download import get_path_from_url
+from .base_predictor import BasePredictor
+
+model_cfgs = {
+ 'Denoising': {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/NAFNet_Denoising.pdparams',
+ 'img_channel': 3,
+ 'width': 64,
+ 'enc_blk_nums': [2, 2, 4, 8],
+ 'middle_blk_num': 12,
+ 'dec_blk_nums': [2, 2, 2, 2]
+ }
+}
+
+
+class NAFNetPredictor(BasePredictor):
+
+ def __init__(self,
+ output_path='output_dir',
+ weight_path=None,
+ seed=None,
+ window_size=8):
+ self.output_path = output_path
+ task = 'Denoising'
+ self.task = task
+ self.window_size = window_size
+
+ if weight_path is None:
+ if task in model_cfgs.keys():
+ weight_path = get_path_from_url(model_cfgs[task]['model_urls'])
+ checkpoint = paddle.load(weight_path)
+ else:
+                raise ValueError('Predictor needs a task to be defined!')
+ else:
+            if weight_path.startswith("http"):  # os.path.islink doesn't work!
+ weight_path = get_path_from_url(weight_path)
+ checkpoint = paddle.load(weight_path)
+ else:
+ checkpoint = paddle.load(weight_path)
+
+ self.generator = NAFNet(
+ img_channel=model_cfgs[task]['img_channel'],
+ width=model_cfgs[task]['width'],
+ enc_blk_nums=model_cfgs[task]['enc_blk_nums'],
+ middle_blk_num=model_cfgs[task]['middle_blk_num'],
+ dec_blk_nums=model_cfgs[task]['dec_blk_nums'])
+
+ checkpoint = checkpoint['generator']
+ self.generator.set_state_dict(checkpoint)
+ self.generator.eval()
+
+ if seed is not None:
+ paddle.seed(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+
+ def get_images(self, images_path):
+ if os.path.isdir(images_path):
+ return natsorted(
+ glob(os.path.join(images_path, '*.jpeg')) +
+ glob(os.path.join(images_path, '*.jpg')) +
+ glob(os.path.join(images_path, '*.JPG')) +
+ glob(os.path.join(images_path, '*.png')) +
+ glob(os.path.join(images_path, '*.PNG')))
+ else:
+ return [images_path]
+
+ def imread_uint(self, path, n_channels=3):
+ # input: path
+ # output: HxWx3(RGB or GGG), or HxWx1 (G)
+ if n_channels == 1:
+ img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE
+ img = np.expand_dims(img, axis=2) # HxWx1
+ elif n_channels == 3:
+ img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G
+ if img.ndim == 2:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG
+ else:
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB
+
+ return img
+
+ def uint2single(self, img):
+
+ return np.float32(img / 255.)
+
+ # convert single (HxWxC) to 3-dimensional paddle tensor
+ def single2tensor3(self, img):
+ return paddle.Tensor(np.ascontiguousarray(
+ img, dtype=np.float32)).transpose([2, 0, 1])
+
+ def run(self, images_path=None):
+ os.makedirs(self.output_path, exist_ok=True)
+ task_path = os.path.join(self.output_path, self.task)
+ os.makedirs(task_path, exist_ok=True)
+ image_files = self.get_images(images_path)
+ for image_file in tqdm(image_files):
+ img_L = self.imread_uint(image_file, 3)
+
+ image_name = os.path.basename(image_file)
+ img = cv2.cvtColor(img_L, cv2.COLOR_RGB2BGR)
+ cv2.imwrite(os.path.join(task_path, image_name), img)
+
+ tmps = image_name.split('.')
+ assert len(
+                tmps) == 2, f'Invalid image name: {image_name}, too many "."'
+ restoration_save_path = os.path.join(
+ task_path, f'{tmps[0]}_restoration.{tmps[1]}')
+
+ img_L = self.uint2single(img_L)
+
+ # HWC to CHW, numpy to tensor
+ img_L = self.single2tensor3(img_L)
+ img_L = img_L.unsqueeze(0)
+ with paddle.no_grad():
+ output = self.generator(img_L)
+
+ restored = paddle.clip(output, 0, 1)
+
+ restored = restored.numpy()
+ restored = restored.transpose(0, 2, 3, 1)
+ restored = restored[0]
+ restored = restored * 255
+ restored = restored.astype(np.uint8)
+
+ cv2.imwrite(restoration_save_path,
+ cv2.cvtColor(restored, cv2.COLOR_RGB2BGR))
+
+ print('Done, output path is:', task_path)
diff --git a/ppgan/apps/photopen_predictor.py b/ppgan/apps/photopen_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5497a0255f64026ba88c2362ac6b290c19dba06c
--- /dev/null
+++ b/ppgan/apps/photopen_predictor.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from PIL import Image, ImageOps
+import cv2
+import numpy as np
+import os
+
+import paddle
+
+from .base_predictor import BasePredictor
+from ppgan.models.generators import SPADEGenerator
+from ppgan.utils.photopen import data_onehot_pro
+from ..utils.filesystem import load
+
+
+class PhotoPenPredictor(BasePredictor):
+ def __init__(self,
+ output_path,
+ weight_path,
+ gen_cfg):
+
+        # initialize the generator
+ gen = SPADEGenerator(
+ gen_cfg.ngf,
+ gen_cfg.num_upsampling_layers,
+ gen_cfg.crop_size,
+ gen_cfg.aspect_ratio,
+ gen_cfg.norm_G,
+ gen_cfg.semantic_nc,
+ gen_cfg.use_vae,
+ gen_cfg.nef,
+ )
+ gen.eval()
+ para = load(weight_path)
+ if 'net_gen' in para:
+ gen.set_state_dict(para['net_gen'])
+ else:
+ gen.set_state_dict(para)
+
+ self.gen = gen
+ self.output_path = output_path
+ self.gen_cfg = gen_cfg
+
+
+ def run(self, semantic_label_path):
+ sem = Image.open(semantic_label_path)
+ sem = sem.resize((self.gen_cfg.crop_size, self.gen_cfg.crop_size), Image.NEAREST)
+ sem = np.array(sem).astype('float32')
+ sem = paddle.to_tensor(sem)
+ sem = sem.reshape([1, 1, self.gen_cfg.crop_size, self.gen_cfg.crop_size])
+
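+        # expand the label map into the one-hot semantic channels expected by
+        # the SPADE generator, then synthesize the photo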
+ one_hot = data_onehot_pro(sem, self.gen_cfg)
+ predicted = self.gen(one_hot)
+ pic = predicted.numpy()[0].reshape((3, 256, 256)).transpose((1,2,0))
+ pic = ((pic + 1.) / 2. * 255).astype('uint8')
+
+        pic = cv2.cvtColor(pic, cv2.COLOR_BGR2RGB)
+ path, _ = os.path.split(self.output_path)
+ if not os.path.exists(path):
+ os.mkdir(path)
+ cv2.imwrite(self.output_path, pic)
+
+
+
\ No newline at end of file
diff --git a/ppgan/apps/pixel2style2pixel_predictor.py b/ppgan/apps/pixel2style2pixel_predictor.py
index b3722a9111cf6860fe49771f7fc5b83319b7f4ff..16646de7a6e66741f2014def2b09a0d395a0f44d 100644
--- a/ppgan/apps/pixel2style2pixel_predictor.py
+++ b/ppgan/apps/pixel2style2pixel_predictor.py
@@ -25,34 +25,46 @@ from ppgan.models.generators import Pixel2Style2Pixel
from ppgan.utils.download import get_path_from_url
from PIL import Image
-
model_cfgs = {
'ffhq-inversion': {
- 'model_urls': 'https://paddlegan.bj.bcebos.com/models/pSp-ffhq-inversion.pdparams',
- 'transform': T.Compose([
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/pSp-ffhq-inversion.pdparams',
+ 'transform':
+ T.Compose([
T.Resize((256, 256)),
T.Transpose(),
T.Normalize([127.5, 127.5, 127.5], [127.5, 127.5, 127.5])
]),
- 'size': 1024,
- 'style_dim': 512,
- 'n_mlp': 8,
- 'channel_multiplier': 2
+ 'size':
+ 1024,
+ 'style_dim':
+ 512,
+ 'n_mlp':
+ 8,
+ 'channel_multiplier':
+ 2
},
'ffhq-toonify': {
- 'model_urls': 'https://paddlegan.bj.bcebos.com/models/pSp-ffhq-toonify.pdparams',
- 'transform': T.Compose([
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/pSp-ffhq-toonify.pdparams',
+ 'transform':
+ T.Compose([
T.Resize((256, 256)),
T.Transpose(),
T.Normalize([127.5, 127.5, 127.5], [127.5, 127.5, 127.5])
]),
- 'size': 1024,
- 'style_dim': 512,
- 'n_mlp': 8,
- 'channel_multiplier': 2
+ 'size':
+ 1024,
+ 'style_dim':
+ 512,
+ 'n_mlp':
+ 8,
+ 'channel_multiplier':
+ 2
},
'default': {
- 'transform': T.Compose([
+ 'transform':
+ T.Compose([
T.Resize((256, 256)),
T.Transpose(),
T.Normalize([127.5, 127.5, 127.5], [127.5, 127.5, 127.5])
@@ -68,23 +80,23 @@ def run_alignment(image_path):
raise Exception('Could not find a face in the given image.')
face_on_image = face[0]
lm = futils.dlib.landmarks(img, face_on_image)
- lm = np.array(lm)[:,::-1]
- lm_eye_left = lm[36 : 42]
- lm_eye_right = lm[42 : 48]
- lm_mouth_outer = lm[48 : 60]
+ lm = np.array(lm)[:, ::-1]
+ lm_eye_left = lm[36:42]
+ lm_eye_right = lm[42:48]
+ lm_mouth_outer = lm[48:60]
output_size = 1024
transform_size = 4096
enable_padding = True
# Calculate auxiliary vectors.
- eye_left = np.mean(lm_eye_left, axis=0)
- eye_right = np.mean(lm_eye_right, axis=0)
- eye_avg = (eye_left + eye_right) * 0.5
- eye_to_eye = eye_right - eye_left
- mouth_left = lm_mouth_outer[0]
- mouth_right = lm_mouth_outer[6]
- mouth_avg = (mouth_left + mouth_right) * 0.5
+ eye_left = np.mean(lm_eye_left, axis=0)
+ eye_right = np.mean(lm_eye_right, axis=0)
+ eye_avg = (eye_left + eye_right) * 0.5
+ eye_to_eye = eye_right - eye_left
+ mouth_left = lm_mouth_outer[0]
+ mouth_right = lm_mouth_outer[6]
+ mouth_avg = (mouth_left + mouth_right) * 0.5
eye_to_mouth = mouth_avg - eye_avg
# Choose oriented crop rectangle.
@@ -99,36 +111,52 @@ def run_alignment(image_path):
# Shrink.
shrink = int(np.floor(qsize / output_size * 0.5))
if shrink > 1:
- rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
+ rsize = (int(np.rint(float(img.size[0]) / shrink)),
+ int(np.rint(float(img.size[1]) / shrink)))
img = img.resize(rsize, Image.ANTIALIAS)
quad /= shrink
qsize /= shrink
# Crop.
border = max(int(np.rint(qsize * 0.1)), 3)
- crop = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
- crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]), min(crop[3] + border, img.size[1]))
+ crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
+ int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
+ crop = (max(crop[0] - border, 0), max(crop[1] - border, 0),
+ min(crop[2] + border,
+ img.size[0]), min(crop[3] + border, img.size[1]))
if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
img = img.crop(crop)
quad -= crop[0:2]
# Pad.
- pad = (int(np.floor(min(quad[:,0]))), int(np.floor(min(quad[:,1]))), int(np.ceil(max(quad[:,0]))), int(np.ceil(max(quad[:,1]))))
- pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0), max(pad[3] - img.size[1] + border, 0))
+ pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))),
+ int(np.ceil(max(quad[:, 0]))), int(np.ceil(max(quad[:, 1]))))
+ pad = (max(-pad[0] + border,
+ 0), max(-pad[1] + border,
+ 0), max(pad[2] - img.size[0] + border,
+ 0), max(pad[3] - img.size[1] + border, 0))
if enable_padding and max(pad) > border - 4:
pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
- img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
+ img = np.pad(np.float32(img),
+ ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
h, w, _ = img.shape
y, x, _ = np.ogrid[:h, :w, :1]
- mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w-1-x) / pad[2]), 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h-1-y) / pad[3]))
+ mask = np.maximum(
+ 1.0 -
+ np.minimum(np.float32(x) / pad[0],
+ np.float32(w - 1 - x) / pad[2]), 1.0 -
+ np.minimum(np.float32(y) / pad[1],
+ np.float32(h - 1 - y) / pad[3]))
blur = qsize * 0.02
- img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
- img += (np.median(img, axis=(0,1)) - img) * np.clip(mask, 0.0, 1.0)
+ img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) -
+ img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
+ img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
img = Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
quad += pad[:2]
# Transform.
- img = img.transform((transform_size, transform_size), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)
+ img = img.transform((transform_size, transform_size), Image.QUAD,
+ (quad + 0.5).flatten(), Image.BILINEAR)
return img
@@ -153,14 +181,17 @@ class Pixel2Style2PixelPredictor(BasePredictor):
if weight_path is None and model_type != 'default':
if model_type in model_cfgs.keys():
- weight_path = get_path_from_url(model_cfgs[model_type]['model_urls'])
+ weight_path = get_path_from_url(
+ model_cfgs[model_type]['model_urls'])
size = model_cfgs[model_type].get('size', size)
style_dim = model_cfgs[model_type].get('style_dim', style_dim)
n_mlp = model_cfgs[model_type].get('n_mlp', n_mlp)
- channel_multiplier = model_cfgs[model_type].get('channel_multiplier', channel_multiplier)
+ channel_multiplier = model_cfgs[model_type].get(
+ 'channel_multiplier', channel_multiplier)
checkpoint = paddle.load(weight_path)
else:
- raise ValueError('Predictor need a weight path or a pretrained model type')
+ raise ValueError(
+                    'Predictor needs a weight path or a pretrained model type')
else:
checkpoint = paddle.load(weight_path)
@@ -174,7 +205,7 @@ class Pixel2Style2PixelPredictor(BasePredictor):
self.generator = Pixel2Style2Pixel(opts)
self.generator.set_state_dict(checkpoint)
self.generator.eval()
-
+
if seed is not None:
paddle.seed(seed)
random.seed(seed)
@@ -186,14 +217,20 @@ class Pixel2Style2PixelPredictor(BasePredictor):
src_img = run_alignment(image)
src_img = np.asarray(src_img)
transformed_image = model_cfgs[self.model_type]['transform'](src_img)
- dst_img = (self.generator(paddle.to_tensor(transformed_image[None, ...]))
- * 0.5 + 0.5)[0].numpy() * 255
+ dst_img, latents = self.generator(paddle.to_tensor(
+ transformed_image[None, ...]),
+ resize=False,
+ return_latents=True)
+ dst_img = (dst_img * 0.5 + 0.5)[0].numpy() * 255
dst_img = dst_img.transpose((1, 2, 0))
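+        # keep the predicted latent codes as well; they are written to dst.npy
+        # below so they can be reused later (e.g. for latent-space editing)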
+ dst_npy = latents[0].numpy()
os.makedirs(self.output_path, exist_ok=True)
save_src_path = os.path.join(self.output_path, 'src.png')
cv2.imwrite(save_src_path, cv2.cvtColor(src_img, cv2.COLOR_RGB2BGR))
save_dst_path = os.path.join(self.output_path, 'dst.png')
cv2.imwrite(save_dst_path, cv2.cvtColor(dst_img, cv2.COLOR_RGB2BGR))
+ save_npy_path = os.path.join(self.output_path, 'dst.npy')
+ np.save(save_npy_path, dst_npy)
- return src_img
+ return src_img, dst_img, dst_npy
diff --git a/ppgan/apps/psgan_predictor.py b/ppgan/apps/psgan_predictor.py
index 488a7a8b61b206dc8c9a6df9ee1bf74ddc3a7729..b39da8cb42f1a13eb8f37a03c235a626588f16a6 100644
--- a/ppgan/apps/psgan_predictor.py
+++ b/ppgan/apps/psgan_predictor.py
@@ -22,12 +22,12 @@ import numpy as np
import paddle
import paddle.vision.transforms as T
+from paddle.utils.download import get_weights_path_from_url
import ppgan.faceutils as futils
from ppgan.utils.options import parse_args
from ppgan.utils.config import get_config
from ppgan.utils.setup import setup
from ppgan.utils.filesystem import load
-from ppgan.engine.trainer import Trainer
from ppgan.models.builder import build_model
from ppgan.utils.preprocess import *
from .base_predictor import BasePredictor
@@ -120,7 +120,7 @@ class PostProcess:
class Inference:
def __init__(self, config, model_path=''):
- self.model = build_model(config)
+ self.model = build_model(config.model)
self.preprocess = PreProcess(config)
self.model_path = model_path
@@ -154,6 +154,7 @@ class Inference:
'P_B': reference_input[2],
'consis_mask': consis_mask
}
+
state_dicts = load(self.model_path)
for net_name, net in self.model.nets.items():
net.set_state_dict(state_dicts[net_name])
@@ -175,8 +176,7 @@ class PSGANPredictor(BasePredictor):
self.cfg = cfg
self.weight_path = self.args.model_path
if self.weight_path is None:
- cur_path = os.path.abspath(os.path.dirname(__file__))
- self.weight_path = get_path_from_url(PS_WEIGHT_URL, cur_path)
+ self.weight_path = get_weights_path_from_url(PS_WEIGHT_URL)
self.output_path = output_path
def run(self):
diff --git a/ppgan/apps/recurrent_vsr_predictor.py b/ppgan/apps/recurrent_vsr_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cd3e5082989e42b88bb33c3b6506c70d32d74ad
--- /dev/null
+++ b/ppgan/apps/recurrent_vsr_predictor.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import os
+import cv2
+import time
+import glob
+import numpy as np
+from tqdm import tqdm
+
+import paddle
+from paddle.io import Dataset, DataLoader
+
+from ppgan.utils.download import get_path_from_url
+from ppgan.utils.video import frames2video, video2frames
+from ppgan.models.generators import BasicVSRNet, IconVSR, BasicVSRPlusPlus, MSVSR
+from .base_predictor import BasePredictor
+from .edvr_predictor import get_img, read_img, save_img
+
+BasicVSR_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/BasicVSR_reds_x4.pdparams'
+IconVSR_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/IconVSR_reds_x4.pdparams'
+BasicVSR_PP_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/BasicVSR%2B%2B_reds_x4.pdparams'
+PP_MSVSR_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/PP-MSVSR_reds_x4.pdparams'
+PP_MSVSR_BD_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/PP-MSVSR_vimeo90k_x4.pdparams'
+PP_MSVSR_L_WEIGHT_URL = 'https://paddlegan.bj.bcebos.com/models/PP-MSVSR-L_reds_x4.pdparams'
+
+
+class RecurrentDataset(Dataset):
+ def __init__(self, frames_path, num_frames=30):
+ self.frames_path = frames_path
+
+ if num_frames is not None:
+ self.num_frames = num_frames
+ else:
+ self.num_frames = len(self.frames_path)
+
+ if len(frames_path) % self.num_frames == 0:
+ self.size = len(frames_path) // self.num_frames
+ else:
+ self.size = len(frames_path) // self.num_frames + 1
+
+ def __getitem__(self, index):
+ indexs = list(
+ range(index * self.num_frames, (index + 1) * self.num_frames))
+ frame_list = []
+ frames_path = []
+ for i in indexs:
+ if i >= len(self.frames_path):
+ break
+
+ frames_path.append(self.frames_path[i])
+ img = read_img(self.frames_path[i])
+ frame_list.append(img)
+
+ img_LQs = np.stack(frame_list, axis=0)
+ # BGR to RGB, HWC to CHW, numpy to tensor
+ img_LQs = img_LQs[:, :, :, [2, 1, 0]]
+ img_LQs = np.transpose(img_LQs, (0, 3, 1, 2)).astype('float32')
+
+ return img_LQs, frames_path
+
+ def __len__(self):
+ return self.size
+
+
+class BasicVSRPredictor(BasePredictor):
+ def __init__(self, output='output', weight_path=None, num_frames=10):
+ self.input = input
+ self.name = 'BasiVSR'
+ self.num_frames = num_frames
+ self.output = os.path.join(output, self.name)
+ self.model = BasicVSRNet()
+ if weight_path is None:
+ weight_path = get_path_from_url(BasicVSR_WEIGHT_URL)
+ self.model.set_dict(paddle.load(weight_path)['generator'])
+ self.model.eval()
+
+ def run(self, video_path):
+ vid = video_path
+ base_name = os.path.basename(vid).split('.')[0]
+ output_path = os.path.join(self.output, base_name)
+ pred_frame_path = os.path.join(output_path, 'frames_pred')
+
+ if not os.path.exists(output_path):
+ os.makedirs(output_path)
+
+ if not os.path.exists(pred_frame_path):
+ os.makedirs(pred_frame_path)
+
+ cap = cv2.VideoCapture(vid)
+ fps = cap.get(cv2.CAP_PROP_FPS)
+
+ out_path = video2frames(vid, output_path)
+
+ frames = sorted(glob.glob(os.path.join(out_path, '*.png')))
+
+ test_dataset = RecurrentDataset(frames, num_frames=self.num_frames)
+ dataset = DataLoader(test_dataset, batch_size=1, num_workers=2)
+
+ periods = []
+ cur_time = time.time()
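+        # feed the model clips of num_frames frames at a time; the recurrent
+        # architecture propagates information across the frames within a clip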
+ for infer_iter, data in enumerate(tqdm(dataset)):
+ data_feed_in = paddle.to_tensor(data[0])
+ with paddle.no_grad():
+ outs = self.model(data_feed_in)
+
+ if isinstance(outs, (list, tuple)):
+ outs = outs[-1]
+
+ outs = outs[0].numpy()
+
+ infer_result_list = [outs[i, :, :, :] for i in range(outs.shape[0])]
+
+ frames_path = data[1]
+
+ for i in range(len(infer_result_list)):
+ img_i = get_img(infer_result_list[i])
+ save_img(
+ img_i,
+ os.path.join(pred_frame_path,
+ os.path.basename(frames_path[i][0])))
+
+ prev_time = cur_time
+ cur_time = time.time()
+ period = cur_time - prev_time
+ periods.append(period)
+
+ frame_pattern_combined = os.path.join(pred_frame_path, '%08d.png')
+ vid_out_path = os.path.join(
+ self.output, '{}_{}_out.mp4'.format(base_name, self.name))
+ frames2video(frame_pattern_combined, vid_out_path, str(int(fps)))
+
+ return frame_pattern_combined, vid_out_path
+
+
+class IconVSRPredictor(BasicVSRPredictor):
+ def __init__(self, output='output', weight_path=None, num_frames=10):
+ self.input = input
+ self.name = 'IconVSR'
+ self.output = os.path.join(output, self.name)
+ self.num_frames = num_frames
+ self.model = IconVSR()
+ if weight_path is None:
+ weight_path = get_path_from_url(IconVSR_WEIGHT_URL)
+ self.model.set_dict(paddle.load(weight_path)['generator'])
+ self.model.eval()
+
+
+class BasiVSRPlusPlusPredictor(BasicVSRPredictor):
+ def __init__(self, output='output', weight_path=None, num_frames=10):
+ self.input = input
+ self.name = 'BasiVSR_PP'
+ self.output = os.path.join(output, self.name)
+ self.num_frames = num_frames
+ self.model = BasicVSRPlusPlus()
+ if weight_path is None:
+ weight_path = get_path_from_url(BasicVSR_PP_WEIGHT_URL)
+ self.model.set_dict(paddle.load(weight_path)['generator'])
+ self.model.eval()
+
+
+class PPMSVSRPredictor(BasicVSRPredictor):
+ def __init__(self, output='output', weight_path=None, num_frames=10):
+ self.input = input
+ self.name = 'PPMSVSR'
+ self.output = os.path.join(output, self.name)
+ self.num_frames = num_frames
+ self.model = MSVSR()
+ if weight_path is None:
+ weight_path = get_path_from_url(PP_MSVSR_WEIGHT_URL)
+ self.model.set_dict(paddle.load(weight_path)['generator'])
+ self.model.eval()
+
+
+class PPMSVSRLargePredictor(BasicVSRPredictor):
+ def __init__(self, output='output', weight_path=None, num_frames=10):
+ self.input = input
+ self.name = 'PPMSVSR-L'
+ self.output = os.path.join(output, self.name)
+ self.num_frames = num_frames
+ self.model = MSVSR(mid_channels=64,
+ num_init_blocks=5,
+ num_blocks=7,
+ num_reconstruction_blocks=5,
+ only_last=False,
+ use_tiny_spynet=False,
+ deform_groups=8,
+ aux_reconstruction_blocks=2)
+ if weight_path is None:
+ weight_path = get_path_from_url(PP_MSVSR_L_WEIGHT_URL)
+ self.model.set_dict(paddle.load(weight_path)['generator'])
+ self.model.eval()
diff --git a/ppgan/apps/singan_predictor.py b/ppgan/apps/singan_predictor.py
new file mode 100755
index 0000000000000000000000000000000000000000..a75f08c3b9dce80b1f9a48252ed4352bbddee040
--- /dev/null
+++ b/ppgan/apps/singan_predictor.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import os
+import cv2
+import math
+import skimage
+import imageio
+
+import paddle
+import paddle.nn.functional as F
+import paddle.vision.transforms as T
+
+from .base_predictor import BasePredictor
+from ..models.singan_model import pad_shape
+from ppgan.models.generators import SinGANGenerator
+from ppgan.utils.download import get_path_from_url
+from ppgan.utils.visual import tensor2img, save_image, make_grid
+
+pretrained_weights_url = {
+ 'trees': 'https://paddlegan.bj.bcebos.com/models/singan_universal_trees.pdparams',
+ 'stone': 'https://paddlegan.bj.bcebos.com/models/singan_universal_stone.pdparams',
+ 'mountains': 'https://paddlegan.bj.bcebos.com/models/singan_universal_mountains.pdparams',
+ 'birds': 'https://paddlegan.bj.bcebos.com/models/singan_universal_birds.pdparams',
+ 'lightning': 'https://paddlegan.bj.bcebos.com/models/singan_universal_lightning.pdparams'
+}
+
+
+def imread(path):
+ return cv2.cvtColor(
+ cv2.imread(
+ path, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
+
+def imgpath2tensor(path):
+ return paddle.to_tensor(T.Compose([
+ T.Transpose(),
+ T.Normalize(127.5, 127.5)
+ ])(imread(path))).unsqueeze(0)
+
+def dilate_mask(mask, mode):
+ if mode == "harmonization":
+ element = skimage.morphology.disk(radius=7)
+ elif mode == "editing":
+ element = skimage.morphology.disk(radius=20)
+ else:
+ raise NotImplementedError('mode %s is not implemented' % mode)
+ mask = skimage.morphology.binary_dilation(mask, selem=element)
+ mask = skimage.filters.gaussian(mask, sigma=5)
+ return mask
+
+class SinGANPredictor(BasePredictor):
+ def __init__(self,
+ output_path='output_dir',
+ weight_path=None,
+ pretrained_model=None,
+ seed=None):
+ self.output_path = output_path
+ if weight_path is None:
+ if pretrained_model in pretrained_weights_url.keys():
+ weight_path = get_path_from_url(
+ pretrained_weights_url[pretrained_model])
+ else:
+ raise ValueError(
+                    'Predictor needs a weight path or a pretrained model.')
+ checkpoint = paddle.load(weight_path)
+
+ self.scale_num = checkpoint['scale_num'].item()
+ self.coarsest_shape = checkpoint['coarsest_shape'].tolist()
+ self.nfc_init = checkpoint['nfc_init'].item()
+ self.min_nfc_init = checkpoint['min_nfc_init'].item()
+ self.num_layers = checkpoint['num_layers'].item()
+ self.ker_size = checkpoint['ker_size'].item()
+ self.noise_zero_pad = checkpoint['noise_zero_pad'].item()
+ self.generator = SinGANGenerator(self.scale_num,
+ self.coarsest_shape,
+ self.nfc_init,
+ self.min_nfc_init,
+ 3,
+ self.num_layers,
+ self.ker_size,
+ self.noise_zero_pad)
+ self.generator.set_state_dict(checkpoint)
+ self.generator.eval()
+ self.scale_factor = self.generator.scale_factor.item()
+        self.noise_pad_size = 0 if self.noise_zero_pad \
+            else self.generator._pad_size
+ if seed is not None:
+ paddle.seed(seed)
+
+ def noise_like(self, x):
+        return paddle.randn(pad_shape(x.shape, self.noise_pad_size))
+
+ def run(self,
+ mode='random_sample',
+ generate_start_scale=0,
+ scale_h=1.0,
+ scale_v=1.0,
+ ref_image=None,
+ mask_image=None,
+ sr_factor=4,
+ animation_alpha=0.9,
+ animation_beta=0.9,
+ animation_frames=20,
+ animation_duration=0.1,
+ n_row=5,
+ n_col=3):
+
+ # check config
+ if mode not in ['random_sample',
+ 'sr', 'animation',
+ 'harmonization',
+ 'editing', 'paint2image']:
+ raise ValueError(
+ 'Only random_sample, sr, animation, harmonization, \
+                editing and paint2image are implemented.')
+ if mode in ['sr', 'harmonization', 'editing', 'paint2image'] and \
+ ref_image is None:
+ raise ValueError(
+                'When mode is sr, harmonization, editing, or '
+                'paint2image, a reference image must be provided.')
+ if mode in ['harmonization', 'editing'] and mask_image is None:
+ raise ValueError(
+                'When mode is harmonization or editing, '
+                'a mask image must be provided.')
+
+ if mode == 'animation':
+ batch_size = animation_frames
+ elif mode == 'random_sample':
+ batch_size = n_row * n_col
+ else:
+ batch_size = 1
+
+ # prepare input
+ if mode == 'harmonization' or mode == 'editing' or mode == 'paint2image':
+ ref = imgpath2tensor(ref_image)
+ x_init = F.interpolate(
+ ref, None,
+ self.scale_factor ** (self.scale_num - generate_start_scale),
+ 'bicubic')
+ x_init = F.interpolate(
+ x_init, None, 1 / self.scale_factor, 'bicubic')
+ elif mode == 'sr':
+ ref = imgpath2tensor(ref_image)
+ sr_iters = math.ceil(math.log(sr_factor, 1 / self.scale_factor))
+            sr_scale_factor = sr_factor ** (1 / sr_iters)
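+            # each pass upsamples by sr_scale_factor and lets the generator
+            # fill in high-frequency detail; after the loop the output is
+            # resized to exactly sr_factor times the reference resolution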
+ x_init = F.interpolate(ref, None, sr_scale_factor, 'bicubic')
+ else:
+ x_init = paddle.zeros([
+ batch_size,
+ self.coarsest_shape[1],
+ int(self.coarsest_shape[2] * scale_v),
+ int(self.coarsest_shape[3] * scale_h)])
+
+ # forward
+ if mode == 'sr':
+ for _ in range(sr_iters):
+ out = self.generator([self.noise_like(x_init)], x_init, -1, -1)
+ x_init = F.interpolate(out, None, sr_scale_factor, 'bicubic')
+ else:
+ z_pyramid = [
+ self.noise_like(
+ F.interpolate(
+ x_init, None, 1 / self.scale_factor ** i))
+ for i in range(self.scale_num - generate_start_scale)]
+
+ if mode == 'animation':
+ a = animation_alpha
+ b = animation_beta
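+                # alpha pulls every frame back towards the fixed
+                # reconstruction noise z_0, while beta adds momentum to a
+                # random walk through noise space, so consecutive frames
+                # vary smoothly instead of being independent samples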
+ for i in range(len(z_pyramid)):
+ z = paddle.chunk(z_pyramid[i], batch_size)
+ if i == 0 and generate_start_scale == 0:
+ z_0 = F.interpolate(
+ self.generator.z_fixed,
+                            pad_shape(x_init.shape[-2:], self.noise_pad_size),
+ None, 'bicubic')
+ else:
+ z_0 = 0
+ z_1 = z_0
+ z_2 = 0.95 * z_1 + 0.05 * z[0]
+ for j in range(len(z)):
+ z[j] = a * z_0 + (1 - a) * (z_2 + b * (z_2 - z_1) + (1 - b) * z[j])
+ z_1 = z_2
+ z_2 = z[j]
+ z = paddle.concat(z)
+ z_pyramid[i] = z
+
+ out = self.generator(z_pyramid, x_init, self.scale_num - 1, generate_start_scale)
+
+ # postprocess and save
+ os.makedirs(self.output_path, exist_ok=True)
+ if mode == 'animation':
+ frames = [tensor2img(x) for x in out.chunk(animation_frames)]
+ imageio.mimsave(
+ os.path.join(self.output_path, 'animation.gif'),
+ frames, 'GIF', duration=animation_duration)
+ else:
+ if mode == 'harmonization' or mode == 'editing':
+ mask = cv2.imread(mask_image, cv2.IMREAD_GRAYSCALE)
+ mask = paddle.to_tensor(dilate_mask(mask, mode), 'float32')
+ out = F.interpolate(out, mask.shape, None, 'bicubic')
+ out = (1 - mask) * ref + mask * out
+ elif mode == 'sr':
+ out = F.interpolate(
+ out,
+ [ref.shape[-2] * sr_factor, ref.shape[-1] * sr_factor],
+ None, 'bicubic')
+ elif mode == 'paint2image':
+ out = F.interpolate(out, ref.shape[-2:], None, 'bicubic')
+ elif mode == 'random_sample':
+ out = make_grid(out, n_row)
+
+ save_image(tensor2img(out), os.path.join(self.output_path, mode + '.png'))
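+
+
+if __name__ == '__main__':
+    # Usage sketch: draw random samples from the pretrained 'trees' model.
+    # Assumes the weight URL above is reachable and that output_dir is
+    # writable; the parameter values are purely illustrative.
+    predictor = SinGANPredictor(pretrained_model='trees', seed=2022)
+    predictor.run(mode='random_sample', n_row=2, n_col=2)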
diff --git a/ppgan/apps/styleganv2_predictor.py b/ppgan/apps/styleganv2_predictor.py
index c9626967735d3ddf395c3f417bc0a92687f65339..f61af25a4d5d9c0dbdb8e807461c2504043baeca 100644
--- a/ppgan/apps/styleganv2_predictor.py
+++ b/ppgan/apps/styleganv2_predictor.py
@@ -21,17 +21,18 @@ from ppgan.models.generators import StyleGANv2Generator
from ppgan.utils.download import get_path_from_url
from ppgan.utils.visual import make_grid, tensor2img, save_image
-
model_cfgs = {
'ffhq-config-f': {
- 'model_urls': 'https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f.pdparams',
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f.pdparams',
'size': 1024,
'style_dim': 512,
'n_mlp': 8,
'channel_multiplier': 2
},
'animeface-512': {
- 'model_urls': 'https://paddlegan.bj.bcebos.com/models/stylegan2-animeface-512.pdparams',
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/stylegan2-animeface-512.pdparams',
'size': 512,
'style_dim': 512,
'n_mlp': 8,
@@ -64,7 +65,7 @@ def sample(generator, mean_style, n_sample):
truncation=0.7,
truncation_latent=mean_style,
)[0]
-
+
return image
@@ -73,16 +74,16 @@ def style_mixing(generator, mean_style, n_source, n_target):
source_code = paddle.randn([n_source, generator.style_dim])
target_code = paddle.randn([n_target, generator.style_dim])
- resolution = 2 ** ((generator.n_latent + 2) // 2)
+ resolution = 2**((generator.n_latent + 2) // 2)
images = [paddle.ones([1, 3, resolution, resolution]) * -1]
- source_image = generator(
- [source_code], truncation_latent=mean_style, truncation=0.7
- )[0]
- target_image = generator(
- [target_code], truncation_latent=mean_style, truncation=0.7
- )[0]
+ source_image = generator([source_code],
+ truncation_latent=mean_style,
+ truncation=0.7)[0]
+ target_image = generator([target_code],
+ truncation_latent=mean_style,
+ truncation=0.7)[0]
images.append(source_image)
@@ -96,7 +97,7 @@ def style_mixing(generator, mean_style, n_source, n_target):
images.append(image)
images = paddle.concat(images, 0)
-
+
return images
@@ -114,21 +115,25 @@ class StyleGANv2Predictor(BasePredictor):
if weight_path is None:
if model_type in model_cfgs.keys():
- weight_path = get_path_from_url(model_cfgs[model_type]['model_urls'])
+ weight_path = get_path_from_url(
+ model_cfgs[model_type]['model_urls'])
size = model_cfgs[model_type].get('size', size)
style_dim = model_cfgs[model_type].get('style_dim', style_dim)
n_mlp = model_cfgs[model_type].get('n_mlp', n_mlp)
- channel_multiplier = model_cfgs[model_type].get('channel_multiplier', channel_multiplier)
+ channel_multiplier = model_cfgs[model_type].get(
+ 'channel_multiplier', channel_multiplier)
checkpoint = paddle.load(weight_path)
else:
- raise ValueError('Predictor need a weight path or a pretrained model type')
+ raise ValueError(
+                'Predictor needs a weight path or a pretrained model type')
else:
checkpoint = paddle.load(weight_path)
- self.generator = StyleGANv2Generator(size, style_dim, n_mlp, channel_multiplier)
+ self.generator = StyleGANv2Generator(size, style_dim, n_mlp,
+ channel_multiplier)
self.generator.set_state_dict(checkpoint)
self.generator.eval()
-
+
if seed is not None:
paddle.seed(seed)
random.seed(seed)
@@ -139,10 +144,10 @@ class StyleGANv2Predictor(BasePredictor):
mean_style = get_mean_style(self.generator)
img = sample(self.generator, mean_style, n_row * n_col)
- save_image(tensor2img(make_grid(img, nrow=n_col)), f'{self.output_path}/sample.png')
+ save_image(tensor2img(make_grid(img, nrow=n_col)),
+ f'{self.output_path}/sample.png')
for j in range(2):
img = style_mixing(self.generator, mean_style, n_col, n_row)
- save_image(tensor2img(make_grid(
- img, nrow=n_col + 1
- )), f'{self.output_path}/sample_mixing_{j}.png')
+ save_image(tensor2img(make_grid(img, nrow=n_col + 1)),
+ f'{self.output_path}/sample_mixing_{j}.png')
diff --git a/ppgan/apps/styleganv2clip_predictor.py b/ppgan/apps/styleganv2clip_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1461044f926dbc40de37ab665cb66682af4b9955
--- /dev/null
+++ b/ppgan/apps/styleganv2clip_predictor.py
@@ -0,0 +1,415 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import cv2
+import numpy as np
+import paddle
+from ppgan.apps.styleganv2_predictor import StyleGANv2Predictor
+from ppgan.utils.download import get_path_from_url
+from clip import tokenize, load_model
+
+model_cfgs = {
+ 'ffhq-config-f': {
+ 'direction_urls':
+ 'https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f-styleclip-global-directions.pdparams',
+ 'stat_urls':
+ 'https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f-styleclip-stats.pdparams'
+ }
+}
+
+
+def make_image(tensor):
+ return (((tensor.detach() + 1) / 2 * 255).clip(min=0, max=255).transpose(
+ (0, 2, 3, 1)).numpy().astype('uint8'))
+
+
+# prompt engineering
+prompt_templates = [
+ 'a bad photo of a {}.',
+ 'a photo of the hard to see {}.',
+ 'a low resolution photo of the {}.',
+ 'a rendering of a {}.',
+ 'graffiti of a {}.',
+ 'a bad photo of the {}.',
+ 'a cropped photo of the {}.',
+ 'a photo of a hard to see {}.',
+ 'a bright photo of a {}.',
+ 'a photo of a clean {}.',
+ 'a photo of a dirty {}.',
+ 'a dark photo of the {}.',
+ 'a drawing of a {}.',
+ 'a photo of my {}.',
+ 'a photo of the cool {}.',
+ 'a close-up photo of a {}.',
+ 'a black and white photo of the {}.',
+ 'a painting of the {}.',
+ 'a painting of a {}.',
+ 'a pixelated photo of the {}.',
+ 'a sculpture of the {}.',
+ 'a bright photo of the {}.',
+ 'a cropped photo of a {}.',
+ 'a jpeg corrupted photo of a {}.',
+ 'a blurry photo of the {}.',
+ 'a photo of the {}.',
+ 'a good photo of the {}.',
+ 'a rendering of the {}.',
+ 'a close-up photo of the {}.',
+ 'a photo of a {}.',
+ 'a low resolution photo of a {}.',
+ 'a photo of the clean {}.',
+ 'a photo of a large {}.',
+ 'a photo of a nice {}.',
+ 'a blurry photo of a {}.',
+ 'a cartoon {}.',
+ 'art of a {}.',
+ 'a good photo of a {}.',
+ 'a photo of the nice {}.',
+ 'a photo of the small {}.',
+ 'a photo of the weird {}.',
+ 'art of the {}.',
+ 'a drawing of the {}.',
+ 'a photo of the large {}.',
+ 'a dark photo of a {}.',
+ 'graffiti of the {}.',
+]
+
+
+@paddle.no_grad()
+def get_delta_t(neutral, target, model, templates=prompt_templates):
+ text_features = []
+ for classname in [neutral, target]:
+ texts = [template.format(classname)
+ for template in templates] #format with class
+ texts = tokenize(texts) #tokenize
+ class_embeddings = model.encode_text(texts) #embed with text encoder
+ class_embeddings /= class_embeddings.norm(axis=-1, keepdim=True)
+ class_embedding = class_embeddings.mean(axis=0)
+ class_embedding /= class_embedding.norm()
+ text_features.append(class_embedding)
+ text_features = paddle.stack(text_features, axis=1).t()
+
+ delta_t = (text_features[1] - text_features[0])
+ delta_t = delta_t / delta_t.norm()
+ return delta_t
+
+
+@paddle.no_grad()
+def get_ds_from_dt(global_style_direction,
+ delta_t,
+ generator,
+ beta_threshold,
+ relative=False,
+ soft_threshold=False):
+ delta_s = global_style_direction @ delta_t
+ delta_s_max = delta_s.abs().max()
+ print(f'max delta_s is {delta_s_max.item()}')
+ if relative: beta_threshold *= delta_s_max
+ # apply beta threshold (disentangle)
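+    # channels whose |delta_s| falls below the threshold are zeroed (or scaled
+    # by soft_threshold), which keeps the edit disentangled from unrelated
+    # attributes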
+ select = delta_s.abs() < beta_threshold
+ num_channel = paddle.sum(~select).item()
+ # threshold in style direction
+ delta_s[select] = delta_s[select] * soft_threshold
+ delta_s /= delta_s_max # normalize
+
+ # delta_s -> style dict
+ dic = []
+ ind = 0
+ for layer in range(len(generator.w_idx_lst)): # 26
+ dim = generator.channels_lst[layer]
+ if layer in generator.style_layers:
+ dic.append(paddle.to_tensor(delta_s[ind:ind + dim]))
+ ind += dim
+ else:
+ dic.append(paddle.zeros([dim]))
+ return dic, num_channel
+
+
+class StyleGANv2ClipPredictor(StyleGANv2Predictor):
+ def __init__(self, model_type=None, direction_path=None, stat_path=None, **kwargs):
+ super().__init__(model_type=model_type, **kwargs)
+
+ if direction_path is None and model_type is not None:
+            assert model_type in model_cfgs, f'There is no pretrained direction file for the {model_type} model.'
+ direction_path = get_path_from_url(
+ model_cfgs[model_type]['direction_urls'])
+ self.fs3 = paddle.load(direction_path)
+
+ self.clip_model, _ = load_model('ViT_B_32', pretrained=True)
+ self.manipulator = Manipulator(self.generator, model_type=model_type, stat_path=stat_path)
+
+ def get_delta_s(self,
+ neutral,
+ target,
+ beta_threshold,
+ relative=False,
+ soft_threshold=0):
+ # get delta_t in CLIP text space (text directions)
+ delta_t = get_delta_t(neutral, target, self.clip_model)
+ # get delta_s in global image directions
+ delta_s, num_channel = get_ds_from_dt(self.fs3, delta_t, self.generator,
+ beta_threshold, relative,
+ soft_threshold)
+ print(
+ f'{num_channel} channels will be manipulated under the {"relative" if relative else ""} beta threshold {beta_threshold}'
+ )
+ return delta_s
+
+ @paddle.no_grad()
+    def generate(self, latent: paddle.Tensor, delta_s, lst_alpha):
+ styles = self.generator.style_affine(latent)
+ styles = self.manipulator.manipulate(styles, delta_s, lst_alpha)
+        # synthesize images from the manipulated styles
+ img_gen = self.manipulator.synthesis_from_styles(styles)
+ return img_gen, styles
+
+ @paddle.no_grad()
+ def run(self, latent, neutral, target, offset, beta_threshold=0.8):
+ latent = paddle.to_tensor(
+ np.load(latent)).unsqueeze(0).astype('float32')
+ delta_s = self.get_delta_s(neutral, target, beta_threshold)
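+        # lst_alpha = [0, offset]: alpha 0 reproduces the unedited source,
+        # while alpha = offset applies the CLIP-guided edit at that strength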
+        img_gen, styles = self.generate(latent, delta_s, [0, offset])
+ imgs = make_image(paddle.concat(img_gen))
+ src_img = imgs[0]
+ dst_img = imgs[1]
+
+ dst_latent = styles[1]
+ os.makedirs(self.output_path, exist_ok=True)
+ save_src_path = os.path.join(self.output_path, 'src.editing.png')
+ cv2.imwrite(save_src_path, cv2.cvtColor(src_img, cv2.COLOR_RGB2BGR))
+ save_dst_path = os.path.join(self.output_path, 'dst.editing.png')
+ cv2.imwrite(save_dst_path, cv2.cvtColor(dst_img, cv2.COLOR_RGB2BGR))
+ save_path = os.path.join(self.output_path, 'dst.editing.pd')
+ paddle.save(dst_latent, save_path)
+ return src_img, dst_img, dst_latent
+
+
+@paddle.no_grad()
+def extract_global_direction(G,
+ lst_alpha,
+ batchsize=5,
+ num=100,
+ dataset_name='',
+ seed=None):
+ from tqdm import tqdm
+ import PIL
+ """Extract global style direction in 100 images
+ """
+ assert len(lst_alpha) == 2 #[-5, 5]
+ assert num < 200
+ #np.random.seed(0)
+ # get intermediate latent of n samples
+ try:
+ S = paddle.load(f'S-{dataset_name}.pdparams')
+ S = [S[i][:num] for i in range(len(G.w_idx_lst))]
+ except:
+ print('No pre-computed S, run tools/styleclip_getf.py first!')
+ exit()
+ # total channel used: 1024 -> 6048 channels, 256 -> 4928 channels
+ print(
+ f"total channels to manipulate: {sum([G.channels_lst[i] for i in G.style_layers])}"
+ )
+
+ manipulator = Manipulator(G, model_type=dataset_name,
+ stat_path=f'stylegan2-{dataset_name}-styleclip-stats.pdparams')
+ model, preprocess = load_model('ViT_B_32', pretrained=True)
+
+ nbatch = int(num / batchsize)
+ all_feats = list()
+ for layer in G.style_layers:
+ print(f'\nStyle manipulation in layer "{layer}"')
+ for channel_ind in tqdm(range(G.channels_lst[layer])):
+ styles = manipulator.manipulate_one_channel(copy.deepcopy(S), layer,
+ channel_ind, lst_alpha,
+ num)
+ # 2 * num images
+ feats = list()
+ for img_ind in range(nbatch): # batch size * nbatch * 2
+ start = img_ind * batchsize
+ end = img_ind * batchsize + batchsize
+ synth_imgs = manipulator.synthesis_from_styles(
+ styles, [start, end])
+ synth_imgs = [(synth_img.transpose((0, 2, 3, 1)) * 127.5 +
+ 128).clip(0, 255).astype('uint8').numpy()
+ for synth_img in synth_imgs]
+ imgs = list()
+ for i in range(batchsize):
+ img0 = PIL.Image.fromarray(synth_imgs[0][i])
+ img1 = PIL.Image.fromarray(synth_imgs[1][i])
+ imgs.append(preprocess(img0).unsqueeze(0))
+ imgs.append(preprocess(img1).unsqueeze(0))
+ feat = model.encode_image(paddle.concat(imgs))
+ feats.append(feat.numpy())
+ all_feats.append(np.concatenate(feats).reshape([-1, 2, 512]))
+ all_feats = np.stack(all_feats)
+ np.save(f'fs-{dataset_name}.npy', all_feats)
+
+ fs = all_feats #L B 2 512
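+    # for every style channel, take the normalized CLIP-space difference
+    # between the +alpha and -alpha renderings, average it over the sample
+    # images and renormalize, yielding one global direction per channel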
+ fs1 = fs / np.linalg.norm(fs, axis=-1)[:, :, :, None]
+ fs2 = fs1[:, :, 1, :] - fs1[:, :, 0, :] # 5*sigma - (-5)* sigma
+ fs3 = fs2 / np.linalg.norm(fs2, axis=-1)[:, :, None]
+ fs3 = fs3.mean(axis=1)
+ fs3 = fs3 / np.linalg.norm(fs3, axis=-1)[:, None]
+
+ paddle.save(paddle.to_tensor(fs3),
+ f'stylegan2-{dataset_name}-styleclip-global-directions.pdparams'
+ ) # global style direction
+
+
+class Manipulator():
+ """Manipulator for style editing
+    The paper uses 100 image pairs to estimate the style statistics, with alpha (the magnitude of the perturbation) ranging over [-5, 5].
+ """
+ def __init__(self, generator, model_type='ffhq-config-f', stat_path=None):
+ self.generator = generator
+
+ if stat_path is None and model_type is not None:
+            assert model_type in model_cfgs, f'There is no pretrained stat file for the {model_type} model.'
+ stat_path = get_path_from_url(
+                model_cfgs[model_type]['stat_urls'])
+ data = paddle.load(stat_path)
+ self.S_mean = data['mean']
+ self.S_std = data['std']
+
+ @paddle.no_grad()
+ def manipulate(self, styles, delta_s, lst_alpha):
+ """Edit style by given delta_style
+ - use perturbation (delta s) * (alpha) as a boundary
+ """
+ styles = [copy.deepcopy(styles) for _ in range(len(lst_alpha))]
+
+ for (alpha, style) in zip(lst_alpha, styles):
+ for i in range(len(self.generator.w_idx_lst)):
+ style[i] += delta_s[i] * alpha
+ return styles
+
+ @paddle.no_grad()
+ def manipulate_one_channel(self,
+ styles,
+ layer_ind,
+ channel_ind: int,
+ lst_alpha=[0],
+ num_images=100):
+ """Edit style from given layer, channel index
+ - use mean value of pre-saved style
+ - use perturbation (pre-saved style std) * (alpha) as a boundary
+ """
+ assert 0 <= channel_ind < styles[layer_ind].shape[1]
+ boundary = self.S_std[layer_ind][channel_ind].item()
+ # apply self.S_mean value for given layer, channel_ind
+ for img_ind in range(num_images):
+ styles[layer_ind][img_ind,
+ channel_ind] = self.S_mean[layer_ind][channel_ind]
+ styles = [copy.deepcopy(styles) for _ in range(len(lst_alpha))]
+ perturbation = (paddle.to_tensor(lst_alpha) * boundary).numpy().tolist()
+ # apply one channel manipulation
+ for img_ind in range(num_images):
+ for edit_ind, delta in enumerate(perturbation):
+ styles[edit_ind][layer_ind][img_ind, channel_ind] += delta
+ return styles
+
+ @paddle.no_grad()
+ def synthesis_from_styles(self, styles, slice=None, randomize_noise=True):
+        """Synthesize images from the edited styles.
+ """
+ imgs = list()
+ if slice is not None:
+ for style in styles:
+ style_ = [list() for _ in range(len(self.generator.w_idx_lst))]
+ for i in range(len(self.generator.w_idx_lst)):
+ style_[i] = style[i][slice[0]:slice[1]]
+ imgs.append(
+ self.generator.synthesis(style_,
+ randomize_noise=randomize_noise))
+ else:
+ for style in styles:
+ imgs.append(
+ self.generator.synthesis(style,
+ randomize_noise=randomize_noise))
+ return imgs
+
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser()
+ parser.add_argument('runtype',
+ type=str,
+ default='generate',
+ choices=['generate', 'test', 'extract'])
+ parser.add_argument("--latent",
+ type=str,
+ default='output_dir/sample/dst.npy',
+ help="path to first image latent codes")
+ parser.add_argument("--neutral",
+ type=str,
+ default=None,
+ help="neutral description")
+ parser.add_argument("--target",
+ type=str,
+ default=None,
+                        help="target description")
+ parser.add_argument("--direction_path",
+ type=str,
+ default=None,
+ help="path to latent editing directions")
+ parser.add_argument("--stat_path",
+ type=str,
+ default=None,
+ help="path to latent stat files")
+ parser.add_argument("--direction_offset",
+ type=float,
+ default=5,
+ help="offset value of edited attribute")
+ parser.add_argument("--beta_threshold",
+ type=float,
+ default=0.12,
+ help="beta threshold for channel editing")
+ parser.add_argument('--dataset_name', type=str,
+ default='ffhq-config-f') #'animeface-512')
+ args = parser.parse_args()
+ runtype = args.runtype
+ if runtype in ['test', 'extract']:
+ dataset_name = args.dataset_name
+ G = StyleGANv2Predictor(model_type=dataset_name).generator
+ if runtype == 'test': # test manipulator
+ from ppgan.utils.visual import make_grid, tensor2img, save_image
+ num_images = 2
+ lst_alpha = [-5, 0, 5]
+ layer = 6
+ channel_ind = 501
+ manipulator = Manipulator(G, model_type=dataset_name, stat_path=args.stat_path)
+        S = paddle.load(f'S-{dataset_name}.pdparams')  # pre-computed styles
+        S = [S[i][:num_images] for i in range(len(G.w_idx_lst))]
+        styles = manipulator.manipulate_one_channel(copy.deepcopy(S), layer,
+                                                    channel_ind, lst_alpha,
+                                                    num_images)
+ imgs = manipulator.synthesis_from_styles(styles)
+ print(len(imgs), imgs[0].shape)
+ save_image(
+ tensor2img(make_grid(paddle.concat(imgs), nrow=num_images)),
+ f'sample.png')
+ elif runtype == 'extract': # train: extract global style direction
+ batchsize = 10
+ num_images = 100
+ lst_alpha = [-5, 5]
+ extract_global_direction(G,
+ lst_alpha,
+ batchsize,
+ num_images,
+ dataset_name=dataset_name)
+ else:
+ predictor = StyleGANv2ClipPredictor(model_type=args.dataset_name,
+ seed=None,
+ direction_path=args.direction_path,
+ stat_path=args.stat_path)
+ predictor.run(args.latent, args.neutral, args.target,
+ args.direction_offset, args.beta_threshold)
diff --git a/ppgan/apps/styleganv2editing_predictor.py b/ppgan/apps/styleganv2editing_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1d7ced77d2689c4f4e4646b01ad93eed880b997
--- /dev/null
+++ b/ppgan/apps/styleganv2editing_predictor.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import numpy as np
+import paddle
+
+from ppgan.utils.download import get_path_from_url
+from .styleganv2_predictor import StyleGANv2Predictor
+
+model_cfgs = {
+ 'ffhq-config-f': {
+ 'direction_urls':
+ 'https://paddlegan.bj.bcebos.com/models/stylegan2-ffhq-config-f-directions.pdparams'
+ }
+}
+
+
+def make_image(tensor):
+ return (((tensor.detach() + 1) / 2 * 255).clip(min=0, max=255).transpose(
+ (0, 2, 3, 1)).numpy().astype('uint8'))
+
+
+class StyleGANv2EditingPredictor(StyleGANv2Predictor):
+ def __init__(self, model_type=None, direction_path=None, **kwargs):
+ super().__init__(model_type=model_type, **kwargs)
+
+ if direction_path is None and model_type is not None:
+            assert model_type in model_cfgs, f'There is no pretrained direction file for the {model_type} model.'
+ direction_path = get_path_from_url(
+ model_cfgs[model_type]['direction_urls'])
+ self.directions = paddle.load(direction_path)
+
+ @paddle.no_grad()
+ def run(self, latent, direction, offset):
+
+ latent = paddle.to_tensor(
+ np.load(latent)).unsqueeze(0).astype('float32')
+ direction = self.directions[direction].unsqueeze(0).astype('float32')
+
+ latent_n = paddle.concat([latent, latent + offset * direction], 0)
+ generator = self.generator
+ img_gen, _ = generator([latent_n],
+ input_is_latent=True,
+ randomize_noise=False)
+ imgs = make_image(img_gen)
+ src_img = imgs[0]
+ dst_img = imgs[1]
+
+ dst_latent = (latent + offset * direction)[0].numpy().astype('float32')
+
+ os.makedirs(self.output_path, exist_ok=True)
+ save_src_path = os.path.join(self.output_path, 'src.editing.png')
+ cv2.imwrite(save_src_path, cv2.cvtColor(src_img, cv2.COLOR_RGB2BGR))
+ save_dst_path = os.path.join(self.output_path, 'dst.editing.png')
+ cv2.imwrite(save_dst_path, cv2.cvtColor(dst_img, cv2.COLOR_RGB2BGR))
+ save_npy_path = os.path.join(self.output_path, 'dst.editing.npy')
+ np.save(save_npy_path, dst_latent)
+
+ return src_img, dst_img, dst_latent
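+
+
+if __name__ == '__main__':
+    # Usage sketch: 'output_dir/dst.fitting.npy' is a placeholder latent file
+    # (for example, one saved by the fitting predictor), and the first key of
+    # the downloaded directions file is picked purely for illustration.
+    predictor = StyleGANv2EditingPredictor(model_type='ffhq-config-f')
+    direction_name = list(predictor.directions.keys())[0]
+    predictor.run('output_dir/dst.fitting.npy', direction_name, offset=5.0)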
diff --git a/ppgan/apps/styleganv2fitting_predictor.py b/ppgan/apps/styleganv2fitting_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e8fd0767ed8555ed09a8de7868b071e07d1a2ae
--- /dev/null
+++ b/ppgan/apps/styleganv2fitting_predictor.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import numpy as np
+import paddle
+from paddle import optimizer as optim
+from paddle.nn import functional as F
+from paddle.vision import transforms
+from tqdm import tqdm
+from PIL import Image
+from .styleganv2_predictor import StyleGANv2Predictor
+from .pixel2style2pixel_predictor import run_alignment
+from ..metrics.lpips import LPIPS
+
+
+def get_lr(t, ts, initial_lr, final_lr):
+ alpha = pow(final_lr / initial_lr, 1 / ts)**(t * ts)
+
+ return initial_lr * alpha
+
+
+def make_image(tensor):
+ return (((tensor.detach() + 1) / 2 * 255).clip(min=0, max=255).transpose(
+ (0, 2, 3, 1)).numpy().astype('uint8'))
+
+
+class StyleGANv2FittingPredictor(StyleGANv2Predictor):
+ def run(self,
+ image,
+ need_align=False,
+ start_lr=0.1,
+ final_lr=0.025,
+ latent_level=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
+ step=100,
+ mse_weight=1,
+ pre_latent=None):
+
+ if need_align:
+ src_img = run_alignment(image)
+ else:
+ src_img = Image.open(image).convert("RGB")
+
+ generator = self.generator
+ generator.train()
+
+ percept = LPIPS(net='vgg')
+ # on PaddlePaddle, lpips's default eval mode means no gradients.
+ percept.train()
+
+ n_mean_latent = 4096
+
+ transform = transforms.Compose([
+ transforms.Resize(256),
+ transforms.CenterCrop(256),
+ transforms.Transpose(),
+ transforms.Normalize([127.5, 127.5, 127.5], [127.5, 127.5, 127.5]),
+ ])
+
+ imgs = paddle.to_tensor(transform(src_img)).unsqueeze(0)
+
+ if pre_latent is None:
+ with paddle.no_grad():
+ noise_sample = paddle.randn(
+ (n_mean_latent, generator.style_dim))
+ latent_out = generator.style(noise_sample)
+
+ latent_mean = latent_out.mean(0)
+
+ latent_in = latent_mean.detach().clone().unsqueeze(0).tile(
+ (imgs.shape[0], 1))
+ latent_in = latent_in.unsqueeze(1).tile(
+ (1, generator.n_latent, 1)).detach()
+
+ else:
+ latent_in = paddle.to_tensor(np.load(pre_latent)).unsqueeze(0)
+
+ var_levels = list(latent_level)
+ const_levels = [
+ i for i in range(generator.n_latent) if i not in var_levels
+ ]
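+        # only the W+ levels listed in latent_level are optimized; the
+        # remaining levels stay frozen at their initial (mean or pre-computed)
+        # values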
+ assert len(var_levels) > 0
+ if len(const_levels) > 0:
+ latent_fix = latent_in.index_select(paddle.to_tensor(const_levels),
+ 1).detach().clone()
+ latent_in = latent_in.index_select(paddle.to_tensor(var_levels),
+ 1).detach().clone()
+
+ latent_in.stop_gradient = False
+
+ optimizer = optim.Adam(parameters=[latent_in], learning_rate=start_lr)
+
+ pbar = tqdm(range(step))
+
+ for i in pbar:
+ t = i / step
+ lr = get_lr(t, step, start_lr, final_lr)
+ optimizer.set_lr(lr)
+
+ if len(const_levels) > 0:
+ latent_dict = {}
+ for idx, idx2 in enumerate(var_levels):
+ latent_dict[idx2] = latent_in[:, idx:idx + 1]
+ for idx, idx2 in enumerate(const_levels):
+ latent_dict[idx2] = (latent_fix[:, idx:idx + 1]).detach()
+ latent_list = []
+ for idx in range(generator.n_latent):
+ latent_list.append(latent_dict[idx])
+ latent_n = paddle.concat(latent_list, 1)
+ else:
+ latent_n = latent_in
+
+ img_gen, _ = generator([latent_n],
+ input_is_latent=True,
+ randomize_noise=False)
+
+ batch, channel, height, width = img_gen.shape
+
+ if height > 256:
+ factor = height // 256
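+                # block-average generated images larger than 256 px before the
+                # perceptual loss, matching the 256 px target prepared above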
+
+ img_gen = img_gen.reshape((batch, channel, height // factor,
+ factor, width // factor, factor))
+ img_gen = img_gen.mean([3, 5])
+
+ p_loss = percept(img_gen, imgs).sum()
+ mse_loss = F.mse_loss(img_gen, imgs)
+ loss = p_loss + mse_weight * mse_loss
+
+ optimizer.clear_grad()
+ loss.backward()
+ optimizer.step()
+
+ pbar.set_description(
+ (f"perceptual: {float(p_loss):.4f}; "
+ f"mse: {float(mse_loss):.4f}; lr: {lr:.4f}"))
+
+ img_gen, _ = generator([latent_n],
+ input_is_latent=True,
+ randomize_noise=False)
+ dst_img = make_image(img_gen)[0]
+ dst_latent = latent_n.numpy()[0]
+
+ os.makedirs(self.output_path, exist_ok=True)
+ save_src_path = os.path.join(self.output_path, 'src.fitting.png')
+ cv2.imwrite(save_src_path,
+ cv2.cvtColor(np.asarray(src_img), cv2.COLOR_RGB2BGR))
+ save_dst_path = os.path.join(self.output_path, 'dst.fitting.png')
+ cv2.imwrite(save_dst_path, cv2.cvtColor(dst_img, cv2.COLOR_RGB2BGR))
+ save_npy_path = os.path.join(self.output_path, 'dst.fitting.npy')
+ np.save(save_npy_path, dst_latent)
+
+ return np.asarray(src_img), dst_img, dst_latent
diff --git a/ppgan/apps/styleganv2mixing_predictor.py b/ppgan/apps/styleganv2mixing_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd6dcd8cefaee012f967106868c77998a6fd2b2e
--- /dev/null
+++ b/ppgan/apps/styleganv2mixing_predictor.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import numpy as np
+import paddle
+from .styleganv2_predictor import StyleGANv2Predictor
+
+
+def make_image(tensor):
+ return (((tensor.detach() + 1) / 2 * 255).clip(min=0, max=255).transpose(
+ (0, 2, 3, 1)).numpy().astype('uint8'))
+
+
+class StyleGANv2MixingPredictor(StyleGANv2Predictor):
+ @paddle.no_grad()
+ def run(self, latent1, latent2, weights=[0.5] * 18):
+
+ latent1 = paddle.to_tensor(np.load(latent1)).unsqueeze(0)
+ latent2 = paddle.to_tensor(np.load(latent2)).unsqueeze(0)
+ assert latent1.shape[1] == latent2.shape[1] == len(
+ weights
+ ), 'latents and their weights should have the same level nums.'
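+        # at W+ level i, keep a fraction weights[i] of latent1 and take the
+        # remainder from latent2; each level is blended independently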
+ mix_latent = []
+ for i, weight in enumerate(weights):
+ mix_latent.append(latent1[:, i:i + 1] * weight +
+ latent2[:, i:i + 1] * (1 - weight))
+ mix_latent = paddle.concat(mix_latent, 1)
+ latent_n = paddle.concat([latent1, latent2, mix_latent], 0)
+ generator = self.generator
+ img_gen, _ = generator([latent_n],
+ input_is_latent=True,
+ randomize_noise=False)
+ imgs = make_image(img_gen)
+ src_img1 = imgs[0]
+ src_img2 = imgs[1]
+ dst_img = imgs[2]
+
+ os.makedirs(self.output_path, exist_ok=True)
+ save_src_path = os.path.join(self.output_path, 'src1.mixing.png')
+ cv2.imwrite(save_src_path, cv2.cvtColor(src_img1, cv2.COLOR_RGB2BGR))
+ save_src_path = os.path.join(self.output_path, 'src2.mixing.png')
+ cv2.imwrite(save_src_path, cv2.cvtColor(src_img2, cv2.COLOR_RGB2BGR))
+ save_dst_path = os.path.join(self.output_path, 'dst.mixing.png')
+ cv2.imwrite(save_dst_path, cv2.cvtColor(dst_img, cv2.COLOR_RGB2BGR))
+
+ return src_img1, src_img2, dst_img
diff --git a/ppgan/apps/swinir_predictor.py b/ppgan/apps/swinir_predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc69a9872bbbc6a049adcfdd2424e11cfe1c097a
--- /dev/null
+++ b/ppgan/apps/swinir_predictor.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cv2
+from glob import glob
+from natsort import natsorted
+import numpy as np
+import os
+import random
+from tqdm import tqdm
+
+import paddle
+
+from ppgan.models.generators import SwinIR
+from ppgan.utils.download import get_path_from_url
+from .base_predictor import BasePredictor
+
+model_cfgs = {
+ 'Denoising': {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/SwinIR_Denoising.pdparams',
+ 'upscale': 1,
+ 'img_size': 128,
+ 'window_size': 8,
+ 'depths': [6, 6, 6, 6, 6, 6],
+ 'embed_dim': 180,
+ 'num_heads': [6, 6, 6, 6, 6, 6],
+ 'mlp_ratio': 2
+ }
+}
+
+
+class SwinIRPredictor(BasePredictor):
+
+ def __init__(self,
+ output_path='output_dir',
+ weight_path=None,
+ seed=None,
+ window_size=8):
+ self.output_path = output_path
+ task = 'Denoising'
+ self.task = task
+ self.window_size = window_size
+
+ if weight_path is None:
+ if task in model_cfgs.keys():
+ weight_path = get_path_from_url(model_cfgs[task]['model_urls'])
+ checkpoint = paddle.load(weight_path)
+ else:
+                raise ValueError('Predictor needs a task to be defined!')
+ else:
+            if weight_path.startswith("http"):  # os.path.islink doesn't work!
+ weight_path = get_path_from_url(weight_path)
+ checkpoint = paddle.load(weight_path)
+ else:
+ checkpoint = paddle.load(weight_path)
+
+ self.generator = SwinIR(upscale=model_cfgs[task]['upscale'],
+ img_size=model_cfgs[task]['img_size'],
+ window_size=model_cfgs[task]['window_size'],
+ depths=model_cfgs[task]['depths'],
+ embed_dim=model_cfgs[task]['embed_dim'],
+ num_heads=model_cfgs[task]['num_heads'],
+ mlp_ratio=model_cfgs[task]['mlp_ratio'])
+
+ checkpoint = checkpoint['generator']
+ self.generator.set_state_dict(checkpoint)
+ self.generator.eval()
+
+ if seed is not None:
+ paddle.seed(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+
+ def get_images(self, images_path):
+ if os.path.isdir(images_path):
+ return natsorted(
+ glob(os.path.join(images_path, '*.jpeg')) +
+ glob(os.path.join(images_path, '*.jpg')) +
+ glob(os.path.join(images_path, '*.JPG')) +
+ glob(os.path.join(images_path, '*.png')) +
+ glob(os.path.join(images_path, '*.PNG')))
+ else:
+ return [images_path]
+
+ def imread_uint(self, path, n_channels=3):
+ # input: path
+ # output: HxWx3(RGB or GGG), or HxWx1 (G)
+ if n_channels == 1:
+ img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE
+ img = np.expand_dims(img, axis=2) # HxWx1
+ elif n_channels == 3:
+ img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G
+ if img.ndim == 2:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG
+ else:
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB
+
+ return img
+
+ def uint2single(self, img):
+
+ return np.float32(img / 255.)
+
+ # convert single (HxWxC) to 3-dimensional paddle tensor
+ def single2tensor3(self, img):
+ return paddle.Tensor(np.ascontiguousarray(
+ img, dtype=np.float32)).transpose([2, 0, 1])
+
+ def run(self, images_path=None):
+ os.makedirs(self.output_path, exist_ok=True)
+ task_path = os.path.join(self.output_path, self.task)
+ os.makedirs(task_path, exist_ok=True)
+ image_files = self.get_images(images_path)
+ for image_file in tqdm(image_files):
+ img_L = self.imread_uint(image_file, 3)
+
+ image_name = os.path.basename(image_file)
+ img = cv2.cvtColor(img_L, cv2.COLOR_RGB2BGR)
+ cv2.imwrite(os.path.join(task_path, image_name), img)
+
+ tmps = image_name.split('.')
+ assert len(
+                tmps) == 2, f'Invalid image name: {image_name}, too many "."'
+ restoration_save_path = os.path.join(
+ task_path, f'{tmps[0]}_restoration.{tmps[1]}')
+
+ img_L = self.uint2single(img_L)
+
+ # HWC to CHW, numpy to tensor
+ img_L = self.single2tensor3(img_L)
+ img_L = img_L.unsqueeze(0)
+ with paddle.no_grad():
+ # pad input image to be a multiple of window_size
+ _, _, h_old, w_old = img_L.shape
+ h_pad = (h_old // self.window_size +
+ 1) * self.window_size - h_old
+ w_pad = (w_old // self.window_size +
+ 1) * self.window_size - w_old
+ img_L = paddle.concat([img_L, paddle.flip(img_L, [2])],
+ 2)[:, :, :h_old + h_pad, :]
+ img_L = paddle.concat([img_L, paddle.flip(img_L, [3])],
+ 3)[:, :, :, :w_old + w_pad]
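+                # the flip-and-concat performs mirror padding up to the next
+                # multiple of window_size; the padded border is cropped away
+                # again right after inference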
+ output = self.generator(img_L)
+ output = output[..., :h_old, :w_old]
+
+ restored = paddle.clip(output, 0, 1)
+
+ restored = restored.numpy()
+ restored = restored.transpose(0, 2, 3, 1)
+ restored = restored[0]
+ restored = restored * 255
+ restored = restored.astype(np.uint8)
+
+ cv2.imwrite(restoration_save_path,
+ cv2.cvtColor(restored, cv2.COLOR_RGB2BGR))
+
+ print('Done, output path is:', task_path)
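+
+
+if __name__ == '__main__':
+    # Usage sketch: denoise every image found under a placeholder folder;
+    # results are written to output_dir/Denoising. The input path is
+    # illustrative only.
+    predictor = SwinIRPredictor(output_path='output_dir')
+    predictor.run(images_path='data/noisy_images')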
diff --git a/ppgan/apps/wav2lip_predictor.py b/ppgan/apps/wav2lip_predictor.py
index 4ba30e2464f1a75460741760138d9d7778fda532..152eedc255e8a362acc8232c1b9a1f52f042052f 100644
--- a/ppgan/apps/wav2lip_predictor.py
+++ b/ppgan/apps/wav2lip_predictor.py
@@ -17,13 +17,36 @@ mel_step_size = 16
class Wav2LipPredictor(BasePredictor):
- def __init__(self, args):
- self.args = args
- if os.path.isfile(self.args.face) and self.args.face.split('.')[1] in [
- 'jpg', 'png', 'jpeg'
- ]:
- self.args.static = True
+ def __init__(self, checkpoint_path = None,
+ static = False,
+ fps = 25,
+ pads = [0, 10, 0, 0],
+ face_det_batch_size = 16,
+ wav2lip_batch_size = 128,
+ resize_factor = 1,
+ crop = [0, -1, 0, -1],
+ box = [-1, -1, -1, -1],
+ rotate = False,
+ nosmooth = False,
+ face_detector = 'sfd',
+ face_enhancement = False):
self.img_size = 96
+ self.checkpoint_path = checkpoint_path
+ self.static = static
+ self.fps = fps
+ self.pads = pads
+ self.face_det_batch_size = face_det_batch_size
+ self.wav2lip_batch_size = wav2lip_batch_size
+ self.resize_factor = resize_factor
+ self.crop = crop
+ self.box = box
+ self.rotate = rotate
+ self.nosmooth = nosmooth
+ self.face_detector = face_detector
+ self.face_enhancement = face_enhancement
+ if face_enhancement:
+ from ppgan.faceutils.face_enhancement import FaceEnhancement
+ self.faceenhancer = FaceEnhancement()
makedirs('./temp', exist_ok=True)
def get_smoothened_boxes(self, boxes, T):
@@ -37,9 +60,11 @@ class Wav2LipPredictor(BasePredictor):
def face_detect(self, images):
detector = face_detection.FaceAlignment(
- face_detection.LandmarksType._2D, flip_input=False)
+ face_detection.LandmarksType._2D,
+ flip_input=False,
+ face_detector=self.face_detector)
- batch_size = self.args.face_det_batch_size
+ batch_size = self.face_det_batch_size
while 1:
predictions = []
@@ -60,7 +85,7 @@ class Wav2LipPredictor(BasePredictor):
break
results = []
- pady1, pady2, padx1, padx2 = self.args.pads
+ pady1, pady2, padx1, padx2 = self.pads
for rect, image in zip(predictions, images):
if rect is None:
cv2.imwrite(
@@ -78,7 +103,7 @@ class Wav2LipPredictor(BasePredictor):
results.append([x1, y1, x2, y2])
boxes = np.array(results)
- if not self.args.nosmooth: boxes = self.get_smoothened_boxes(boxes, T=5)
+ if not self.nosmooth: boxes = self.get_smoothened_boxes(boxes, T=5)
results = [[image[y1:y2, x1:x2], (y1, y2, x1, x2)]
for image, (x1, y1, x2, y2) in zip(images, boxes)]
@@ -88,8 +113,8 @@ class Wav2LipPredictor(BasePredictor):
def datagen(self, frames, mels):
img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
- if self.args.box[0] == -1:
- if not self.args.static:
+ if self.box[0] == -1:
+ if not self.static:
face_det_results = self.face_detect(
frames) # BGR2RGB for CNN face detection
else:
@@ -97,12 +122,12 @@ class Wav2LipPredictor(BasePredictor):
else:
print(
'Using the specified bounding box instead of face detection...')
- y1, y2, x1, x2 = self.args.box
+ y1, y2, x1, x2 = self.box
face_det_results = [[f[y1:y2, x1:x2], (y1, y2, x1, x2)]
for f in frames]
for i, m in enumerate(mels):
- idx = 0 if self.args.static else i % len(frames)
+ idx = 0 if self.static else i % len(frames)
frame_to_save = frames[idx].copy()
face, coords = face_det_results[idx].copy()
@@ -113,7 +138,7 @@ class Wav2LipPredictor(BasePredictor):
frame_batch.append(frame_to_save)
coords_batch.append(coords)
- if len(img_batch) >= self.args.wav2lip_batch_size:
+ if len(img_batch) >= self.wav2lip_batch_size:
img_batch, mel_batch = np.asarray(img_batch), np.asarray(
mel_batch)
@@ -142,17 +167,22 @@ class Wav2LipPredictor(BasePredictor):
yield img_batch, mel_batch, frame_batch, coords_batch
- def run(self):
- if not os.path.isfile(self.args.face):
+ def run(self, face, audio_seq, outfile):
+ if os.path.isfile(face) and path.basename(
+ face).split('.')[1] in ['jpg', 'png', 'jpeg']:
+ self.static = True
+
+ if not os.path.isfile(face):
raise ValueError(
'--face argument must be a valid path to video/image file')
- elif self.args.face.split('.')[1] in ['jpg', 'png', 'jpeg']:
- full_frames = [cv2.imread(self.args.face)]
- fps = self.args.fps
+ elif path.basename(
+ face).split('.')[1] in ['jpg', 'png', 'jpeg']:
+ full_frames = [cv2.imread(face)]
+ fps = self.fps
else:
- video_stream = cv2.VideoCapture(self.args.face)
+ video_stream = cv2.VideoCapture(face)
fps = video_stream.get(cv2.CAP_PROP_FPS)
print('Reading video frames...')
@@ -163,15 +193,15 @@ class Wav2LipPredictor(BasePredictor):
if not still_reading:
video_stream.release()
break
- if self.args.resize_factor > 1:
+ if self.resize_factor > 1:
frame = cv2.resize(
- frame, (frame.shape[1] // self.args.resize_factor,
- frame.shape[0] // self.args.resize_factor))
+ frame, (frame.shape[1] // self.resize_factor,
+ frame.shape[0] // self.resize_factor))
- if self.args.rotate:
+ if self.rotate:
frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE)
- y1, y2, x1, x2 = self.args.crop
+ y1, y2, x1, x2 = self.crop
if x2 == -1: x2 = frame.shape[1]
if y2 == -1: y2 = frame.shape[0]
@@ -182,18 +212,16 @@ class Wav2LipPredictor(BasePredictor):
print("Number of frames available for inference: " +
str(len(full_frames)))
- if not self.args.audio.endswith('.wav'):
+ if not audio_seq.endswith('.wav'):
print('Extracting raw audio...')
command = 'ffmpeg -y -i {} -strict -2 {}'.format(
- self.args.audio, 'temp/temp.wav')
+ audio_seq, 'temp/temp.wav')
subprocess.call(command, shell=True)
- self.args.audio = 'temp/temp.wav'
+ audio_seq = 'temp/temp.wav'
- wav = audio.load_wav(self.args.audio, 16000)
+ wav = audio.load_wav(audio_seq, 16000)
mel = audio.melspectrogram(wav)
- print(mel.shape)
-
if np.isnan(mel.reshape(-1)).sum() > 0:
raise ValueError(
'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again'
@@ -214,15 +242,15 @@ class Wav2LipPredictor(BasePredictor):
full_frames = full_frames[:len(mel_chunks)]
- batch_size = self.args.wav2lip_batch_size
+ batch_size = self.wav2lip_batch_size
gen = self.datagen(full_frames.copy(), mel_chunks)
model = Wav2Lip()
- if self.args.checkpoint_path is None:
+ if self.checkpoint_path is None:
model_weights_path = get_weights_path_from_url(WAV2LIP_WEIGHT_URL)
weights = paddle.load(model_weights_path)
else:
- weights = paddle.load(self.args.checkpoint_path)
+ weights = paddle.load(self.checkpoint_path)
model.load_dict(weights)
model.eval()
print("Model loaded")
@@ -248,6 +276,8 @@ class Wav2LipPredictor(BasePredictor):
for p, f, c in zip(pred, frames, coords):
y1, y2, x1, x2 = c
+ if self.face_enhancement:
+ p = self.faceenhancer.enhance_from_image(p)
p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
f[y1:y2, x1:x2] = p
@@ -256,5 +286,6 @@ class Wav2LipPredictor(BasePredictor):
out.release()
command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(
- self.args.audio, 'temp/result.avi', self.args.outfile)
+ audio_seq, 'temp/result.avi', outfile)
subprocess.call(command, shell=platform.system() != 'Windows')
+
diff --git a/ppgan/datasets/__init__.py b/ppgan/datasets/__init__.py
index 4761233c96b1902a12d31b8d6ef44d3f27be0e87..e1527dec4f165f15eb4b48f4cdb281823c14f9bd 100644
--- a/ppgan/datasets/__init__.py
+++ b/ppgan/datasets/__init__.py
@@ -20,3 +20,20 @@ from .makeup_dataset import MakeupDataset
from .common_vision_dataset import CommonVisionDataset
from .animeganv2_dataset import AnimeGANV2Dataset
from .wav2lip_dataset import Wav2LipDataset
+from .starganv2_dataset import StarGANv2Dataset
+from .firstorder_dataset import FirstOrderDataset
+from .lapstyle_dataset import LapStyleDataset
+from .mpr_dataset import MPRTrain, MPRVal, MPRTest
+from .vsr_reds_dataset import VSRREDSDataset
+from .vsr_reds_multiple_gt_dataset import VSRREDSMultipleGTDataset
+from .vsr_vimeo90k_dataset import VSRVimeo90KDataset
+from .vsr_folder_dataset import VSRFolderDataset
+from .photopen_dataset import PhotoPenDataset
+from .empty_dataset import EmptyDataset
+from .gpen_dataset import GPENDataset
+from .swinir_dataset import SwinIRDataset
+from .gfpgan_datasets import FFHQDegradationDataset
+from .paired_image_datasets import PairedImageDataset
+from .invdn_dataset import InvDNDataset
+from .nafnet_dataset import NAFNetTrain, NAFNetVal, NAFNetTest
+from .aotgan_dataset import AOTGANDataset
diff --git a/ppgan/datasets/animeganv2_dataset.py b/ppgan/datasets/animeganv2_dataset.py
index 6ceb6d52e2756a38e16960839ec212141c8201af..7f70ef8bfdd18cafc85fd33c6955c8615d815bb7 100644
--- a/ppgan/datasets/animeganv2_dataset.py
+++ b/ppgan/datasets/animeganv2_dataset.py
@@ -13,31 +13,38 @@
#limitations under the License.
import cv2
-import numpy as np
import os.path
+import numpy as np
+import paddle
from .base_dataset import BaseDataset
from .image_folder import ImageFolder
from .builder import DATASETS
-from .transforms.builder import build_transforms
+from .preprocess.builder import build_transforms
@DATASETS.register()
-class AnimeGANV2Dataset(BaseDataset):
+class AnimeGANV2Dataset(paddle.io.Dataset):
"""
"""
- def __init__(self, cfg):
+ def __init__(self,
+ dataroot,
+ style,
+ transform_real=None,
+ transform_anime=None,
+ transform_gray=None):
"""Initialize this dataset class.
Args:
- cfg (dict) -- stores all the experiment flags
+            dataroot (str): Directory of the dataset.
+
"""
- BaseDataset.__init__(self, cfg)
- self.style = cfg.style
+ self.root = dataroot
+ self.style = style
- self.transform_real = build_transforms(self.cfg.transform_real)
- self.transform_anime = build_transforms(self.cfg.transform_anime)
- self.transform_gray = build_transforms(self.cfg.transform_gray)
+ self.transform_real = build_transforms(transform_real)
+ self.transform_anime = build_transforms(transform_anime)
+ self.transform_gray = build_transforms(transform_gray)
self.real_root = os.path.join(self.root, 'train_photo')
self.anime_root = os.path.join(self.root, f'{self.style}', 'style')
diff --git a/ppgan/datasets/aotgan_dataset.py b/ppgan/datasets/aotgan_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..c49b203186cbda9c9f930bf56bb1adf466796d7d
--- /dev/null
+++ b/ppgan/datasets/aotgan_dataset.py
@@ -0,0 +1,186 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from PIL import Image, ImageOps
+import os
+import numpy as np
+import logging
+
+from paddle.io import Dataset, DataLoader
+from paddle.vision.transforms import Compose, RandomResizedCrop, RandomHorizontalFlip, RandomRotation, ColorJitter, Resize
+
+from .builder import DATASETS
+
+logger = logging.getLogger(__name__)
+
+@DATASETS.register()
+class AOTGANDataset(Dataset):
+ def __init__(self, dataset_path, img_size, istrain=True):
+ super(AOTGANDataset, self).__init__()
+
+ self.image_path = []
+ def get_all_sub_dirs(root_dir): # read all image files including subdirectories
+ file_list = []
+ def get_sub_dirs(r_dir):
+ for root, dirs, files in os.walk(r_dir):
+ if len(files) > 0:
+ for f in files:
+ file_list.append(os.path.join(root, f))
+ if len(dirs) > 0:
+ for d in dirs:
+ get_sub_dirs(os.path.join(root, d))
+ break
+ get_sub_dirs(root_dir)
+ return file_list
+
+ # set data path
+ if istrain:
+ self.img_list = get_all_sub_dirs(os.path.join(dataset_path, 'train_img'))
+ self.mask_dir = os.path.join(dataset_path, 'train_mask')
+ else:
+ self.img_list = get_all_sub_dirs(os.path.join(dataset_path, 'val_img'))
+ self.mask_dir = os.path.join(dataset_path, 'val_mask')
+ self.img_list = np.sort(np.array(self.img_list))
+ _, _, mask_list = next(os.walk(self.mask_dir))
+ self.mask_list = np.sort(mask_list)
+
+
+ self.istrain = istrain
+
+        # augmentations
+ if istrain:
+ self.img_trans = Compose([
+ Resize(img_size),
+ RandomResizedCrop(img_size),
+ RandomHorizontalFlip(),
+ ColorJitter(0.05, 0.05, 0.05, 0.05),
+ ])
+ self.mask_trans = Compose([
+ Resize([img_size, img_size], interpolation='nearest'),
+ RandomHorizontalFlip(),
+ ])
+ else:
+ self.img_trans = Compose([
+ Resize([img_size, img_size], interpolation='bilinear'),
+ ])
+ self.mask_trans = Compose([
+ Resize([img_size, img_size], interpolation='nearest'),
+ ])
+
+ self.istrain = istrain
+
+ # feed data
+ def __getitem__(self, idx):
+ img = Image.open(self.img_list[idx])
+ mask = Image.open(os.path.join(self.mask_dir, self.mask_list[np.random.randint(0, self.mask_list.shape[0])]))
+ img = self.img_trans(img)
+ mask = self.mask_trans(mask)
+
+ mask = mask.rotate(np.random.randint(0, 45))
+ img = img.convert('RGB')
+ mask = mask.convert('L')
+
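+        # scale the image to [-1, 1] in CHW layout and the mask to a single
+        # channel in [0, 1]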
+ img = np.array(img).astype('float32')
+ img = (img / 255.) * 2. - 1.
+ img = np.transpose(img, (2, 0, 1))
+ mask = np.array(mask).astype('float32') / 255.
+ mask = np.expand_dims(mask, 0)
+
+ return {'img':img, 'mask':mask, 'img_path':self.img_list[idx]}
+
+ def __len__(self):
+ return len(self.img_list)
+
+ def name(self):
+        return 'PlaceDataset'
+
+@DATASETS.register()
+class AOTGANDataset_test(Dataset):
+ def __init__(self, dataset_path, img_size, istrain=True):
+ super(AOTGANDataset_test, self).__init__()
+
+ self.image_path = []
+ def get_all_sub_dirs(root_dir): # read all image files including subdirectories
+ file_list = []
+ def get_sub_dirs(r_dir):
+ for root, dirs, files in os.walk(r_dir):
+ if len(files) > 0:
+ for f in files:
+ file_list.append(os.path.join(root, f))
+ if len(dirs) > 0:
+ for d in dirs:
+ get_sub_dirs(os.path.join(root, d))
+ break
+ get_sub_dirs(root_dir)
+ return file_list
+
+ # set data path
+ if istrain:
+ self.img_list = get_all_sub_dirs(os.path.join(dataset_path, 'train_img'))
+ self.mask_dir = os.path.join(dataset_path, 'train_mask')
+ else:
+ self.img_list = get_all_sub_dirs(os.path.join(dataset_path, 'val_img'))
+            self.mask_dir = os.path.join(dataset_path, 'val_mask')
+ self.img_list = np.sort(np.array(self.img_list))
+ _, _, mask_list = next(os.walk(self.mask_dir))
+ self.mask_list = np.sort(mask_list)
+
+
+ self.istrain = istrain
+
+        # augmentations
+ if istrain:
+ self.img_trans = Compose([
+ RandomResizedCrop(img_size),
+ RandomHorizontalFlip(),
+ ColorJitter(0.05, 0.05, 0.05, 0.05),
+ ])
+ self.mask_trans = Compose([
+ Resize([img_size, img_size], interpolation='nearest'),
+ RandomHorizontalFlip(),
+ ])
+ else:
+ self.img_trans = Compose([
+ Resize([img_size, img_size], interpolation='bilinear'),
+ ])
+ self.mask_trans = Compose([
+ Resize([img_size, img_size], interpolation='nearest'),
+ ])
+
+ self.istrain = istrain
+
+ # feed data
+ def __getitem__(self, idx):
+ img = Image.open(self.img_list[idx])
+ mask = Image.open(os.path.join(self.mask_dir, self.mask_list[np.random.randint(0, self.mask_list.shape[0])]))
+ img = self.img_trans(img)
+ mask = self.mask_trans(mask)
+
+ mask = mask.rotate(np.random.randint(0, 45))
+ img = img.convert('RGB')
+ mask = mask.convert('L')
+
+ img = np.array(img).astype('float32')
+ img = (img / 255.) * 2. - 1.
+ img = np.transpose(img, (2, 0, 1))
+ mask = np.array(mask).astype('float32') / 255.
+ mask = np.expand_dims(mask, 0)
+
+ return {'img':img, 'mask':mask, 'img_path':self.img_list[idx]}
+
+ def __len__(self):
+ return len(self.img_list)
+
+ def name(self):
+        return 'PlaceDataset_test'
diff --git a/ppgan/datasets/base_dataset.py b/ppgan/datasets/base_dataset.py
index 8ea7b8b0063b5f396575dfe1a7fe222a8267ec6a..229c8e23a9c4286cc9e1af9a223cf99c8db98922 100644
--- a/ppgan/datasets/base_dataset.py
+++ b/ppgan/datasets/base_dataset.py
@@ -13,10 +13,10 @@
# limitations under the License.
import os
+import copy
from pathlib import Path
-from abc import ABCMeta, abstractmethod
-
from paddle.io import Dataset
+from abc import ABCMeta, abstractmethod
from .preprocess import build_preprocess
@@ -119,7 +119,7 @@ class BaseDataset(Dataset, metaclass=ABCMeta):
return samples
def __getitem__(self, idx):
- datas = self.data_infos[idx]
+ datas = copy.deepcopy(self.data_infos[idx])
if hasattr(self, 'preprocess') and self.preprocess:
datas = self.preprocess(datas)
diff --git a/ppgan/datasets/builder.py b/ppgan/datasets/builder.py
index da582bb6d76efc5829003740af67abd69f7e91a9..9ee1f41c7597bebf7cdfef611f2cfba3f162edf7 100644
--- a/ppgan/datasets/builder.py
+++ b/ppgan/datasets/builder.py
@@ -16,123 +16,26 @@ import time
import paddle
import numbers
import numpy as np
-from multiprocessing import Manager
-from paddle.distributed import ParallelEnv
+from paddle.distributed import ParallelEnv
from paddle.io import DistributedBatchSampler
-from ..utils.registry import Registry
-
-DATASETS = Registry("DATASETS")
-
-
-class DictDataset(paddle.io.Dataset):
- def __init__(self, dataset):
- self.dataset = dataset
- self.tensor_keys_set = set()
- self.non_tensor_keys_set = set()
- self.non_tensor_dict = Manager().dict()
- single_item = dataset[0]
- self.keys = single_item.keys()
-
- for k, v in single_item.items():
- if not isinstance(v, (numbers.Number, np.ndarray)):
- setattr(self, k, Manager().dict())
- self.non_tensor_keys_set.add(k)
- else:
- self.tensor_keys_set.add(k)
-
- def __getitem__(self, index):
-
- ori_map = self.dataset[index]
-
- tmp_list = []
-
- for k, v in ori_map.items():
- if isinstance(v, (numbers.Number, np.ndarray)):
- tmp_list.append(v)
- else:
- getattr(self, k).update({index: v})
-
- tmp_list.append(index)
- return tuple(tmp_list)
- def __len__(self):
- return len(self.dataset)
+from .repeat_dataset import RepeatDataset
+from ..utils.registry import Registry, build_from_config
- def reset(self):
- for k in self.non_tensor_keys_set:
- setattr(self, k, Manager().dict())
-
-
-class DictDataLoader():
- def __init__(self,
- dataset,
- batch_size,
- is_train,
- num_workers=4,
- use_shared_memory=True,
- distributed=True):
-
- self.dataset = DictDataset(dataset)
-
- place = paddle.CUDAPlace(ParallelEnv().dev_id) \
- if ParallelEnv().nranks > 1 else paddle.CUDAPlace(0)
-
- if distributed:
- sampler = DistributedBatchSampler(
- self.dataset,
- batch_size=batch_size,
- shuffle=True if is_train else False,
- drop_last=True if is_train else False)
-
- self.dataloader = paddle.io.DataLoader(
- self.dataset,
- batch_sampler=sampler,
- places=place,
- num_workers=num_workers,
- use_shared_memory=use_shared_memory)
- else:
- self.dataloader = paddle.io.DataLoader(
- self.dataset,
- batch_size=batch_size,
- shuffle=True if is_train else False,
- drop_last=True if is_train else False,
- places=place,
- use_shared_memory=False,
- num_workers=num_workers)
-
- self.batch_size = batch_size
-
- def __iter__(self):
-
- self.dataset.reset()
-
- for i, data in enumerate(self.dataloader):
- return_dict = {}
- j = 0
- for k in self.dataset.keys:
- if k in self.dataset.tensor_keys_set:
- return_dict[k] = data[j] if isinstance(data,
- (list,
- tuple)) else data
- j += 1
- else:
- return_dict[k] = self.get_items_by_indexs(k, data[-1])
- yield return_dict
+DATASETS = Registry("DATASETS")
- def __len__(self):
- return len(self.dataloader)
- def get_items_by_indexs(self, key, indexs):
- if isinstance(indexs, paddle.Tensor):
- indexs = indexs.numpy()
- current_items = []
- items = getattr(self.dataset, key)
+def build_dataset(cfg):
+ name = cfg.pop('name')
- for index in indexs:
- current_items.append(items[index])
+ if name == 'RepeatDataset':
+ dataset_ = build_from_config(cfg['dataset'], DATASETS)
+ dataset = RepeatDataset(dataset_, cfg['times'])
+ else:
+        dataset = DATASETS.get(name)(**cfg)
- return current_items
+ return dataset
def build_dataloader(cfg, is_train=True, distributed=True):
@@ -142,14 +45,24 @@ def build_dataloader(cfg, is_train=True, distributed=True):
num_workers = cfg_.pop('num_workers', 0)
use_shared_memory = cfg_.pop('use_shared_memory', True)
- name = cfg_.pop('name')
-
- dataset = DATASETS.get(name)(**cfg_)
- dataloader = DictDataLoader(dataset,
- batch_size,
- is_train,
- num_workers,
- use_shared_memory=use_shared_memory,
- distributed=distributed)
+ dataset = build_dataset(cfg_)
+
+ if distributed:
+ sampler = DistributedBatchSampler(dataset,
+ batch_size=batch_size,
+ shuffle=True if is_train else False,
+ drop_last=True if is_train else False)
+
+ dataloader = paddle.io.DataLoader(dataset,
+ batch_sampler=sampler,
+ num_workers=num_workers,
+ use_shared_memory=use_shared_memory)
+ else:
+ dataloader = paddle.io.DataLoader(dataset,
+ batch_size=batch_size,
+ shuffle=True if is_train else False,
+ drop_last=True if is_train else False,
+ use_shared_memory=use_shared_memory,
+ num_workers=num_workers)
return dataloader
diff --git a/ppgan/datasets/common_vision_dataset.py b/ppgan/datasets/common_vision_dataset.py
index 2e69104603defab1c03705b02043bcb535f18079..8b03926594eae35242b1fad31984f413d260eede 100644
--- a/ppgan/datasets/common_vision_dataset.py
+++ b/ppgan/datasets/common_vision_dataset.py
@@ -17,7 +17,7 @@ import paddle
from .builder import DATASETS
from .base_dataset import BaseDataset
-from .transforms.builder import build_transforms
+from .preprocess.builder import build_transforms
@DATASETS.register()
diff --git a/ppgan/datasets/empty_dataset.py b/ppgan/datasets/empty_dataset.py
new file mode 100755
index 0000000000000000000000000000000000000000..970341763245e91e5e307ae0d38d1ed7c42279ba
--- /dev/null
+++ b/ppgan/datasets/empty_dataset.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base_dataset import BaseDataset
+from .builder import DATASETS
+
+
+@DATASETS.register()
+class EmptyDataset(BaseDataset):
+ '''
+ Placeholder dataset for models that don't need a real dataset.
+ '''
+ def __init__(self, size=1):
+ super().__init__()
+ self.size = size
+ self.data_infos = self.prepare_data_infos()
+
+ def prepare_data_infos(self):
+ return [{i: 0} for i in range(self.size)]
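+
+
+# Example (sketch): EmptyDataset(size=4) builds data_infos [{0: 0}, {1: 0}, {2: 0}, {3: 0}],
+# i.e. one placeholder record per index, which lets training loops that expect a dataset of
+# a fixed length run without any real samples.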
diff --git a/ppgan/datasets/firstorder_dataset.py b/ppgan/datasets/firstorder_dataset.py
new file mode 100755
index 0000000000000000000000000000000000000000..31749b4f1bfb2daf79d3df60e3c80858918d45c5
--- /dev/null
+++ b/ppgan/datasets/firstorder_dataset.py
@@ -0,0 +1,261 @@
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
+
+import logging
+from multiprocessing import Pool
+from pathlib import Path
+
+import numpy as np
+import tqdm
+from imageio import imread, mimread, imwrite
+import cv2
+from paddle.io import Dataset
+from .builder import DATASETS
+from .preprocess.builder import build_transforms
+import glob, os
+
+POOL_SIZE = 64  # if POOL_SIZE > 0, use multiprocessing to extract frames from video files
+
+
+@DATASETS.register()
+class FirstOrderDataset(Dataset):
+ def __init__(self, **cfg):
+ """Initialize FirstOrder dataset class.
+
+ Args:
+ dataroot (str): Directory of dataset.
+ phase (str): train or test
+ num_repeats (int): number of times to repeat the dataset
+ time_flip (bool): whether to exchange the driving image and source image randomly
+ batch_size (int): dataset batch size
+ id_sampling (bool): whether to sample person's id
+ frame_shape (list): image shape
+ create_frames_folder (bool): if the input videos are '.mp4' files, \
+ whether to extract and save their frames as image folders
+ num_workers (int): number of workers used to load the dataset
+ """
+ super(FirstOrderDataset, self).__init__()
+ self.cfg = cfg
+ self.frameDataset = FramesDataset(self.cfg)
+
+ # create frames folder before 'DatasetRepeater'
+ if self.cfg['create_frames_folder']:
+ file_idx_set = [
+ idx for idx, path in enumerate(self.frameDataset.videos)
+ if not self.frameDataset.root_dir.joinpath(path).is_dir()
+ ]
+ file_idx_set = list(file_idx_set)
+ if len(file_idx_set) != 0:
+ if POOL_SIZE == 0:
+ for idx in tqdm.tqdm(file_idx_set,
+ desc='Extracting frames'):
+ _ = self.frameDataset[idx]
+ else:
+ # multiprocessing
+ bar = tqdm.tqdm(total=len(file_idx_set),
+ desc='Extracting frames')
+ with Pool(POOL_SIZE) as pl:
+ _p = 0
+ while _p <= len(file_idx_set) - 1:
+ _ = pl.map(self.frameDataset.__getitem__,
+ file_idx_set[_p:_p + POOL_SIZE * 2])
+ _p = _p + POOL_SIZE * 2
+ bar.update(POOL_SIZE * 2)
+ bar.close()
+
+ # rewrite video path
+ self.frameDataset.videos = [
+ i.with_suffix('') for i in self.frameDataset.videos
+ ]
+
+ if self.cfg['phase'] == 'train':
+ self.outDataset = DatasetRepeater(self.frameDataset,
+ self.cfg['num_repeats'])
+ else:
+ self.outDataset = self.frameDataset
+
+ def __len__(self):
+ return len(self.outDataset)
+
+ def __getitem__(self, idx):
+ return self.outDataset[idx]
+
+
+def read_video(name: Path, frame_shape=(256, 256, 3), saveto='folder'):
+ """
+ Read video which can be:
+ - an image of concatenated frames
+ - a '.gif', '.mp4' or '.mov' file
+ - a folder with extracted frames
+ """
+ if name.is_dir():
+ frames = sorted(name.iterdir(),
+ key=lambda x: int(x.with_suffix('').name))
+ video_array = np.array([imread(path) for path in frames],
+ dtype='float32')
+ return video_array
+ elif name.suffix.lower() in ['.gif', '.mp4', '.mov']:
+ try:
+ video = mimread(name, memtest=False)
+ except Exception as err:
+ logging.error('DataLoading File:%s Msg:%s' % (str(name), str(err)))
+ return None
+
+ # convert to 3-channel image
+ if video[0].shape[-1] == 4:
+ video = [i[..., :3] for i in video]
+ elif video[0].shape[-1] == 1:
+ video = [np.tile(i, (1, 1, 3)) for i in video]
+ elif len(video[0].shape) == 2:
+ video = [np.tile(i[..., np.newaxis], (1, 1, 3)) for i in video]
+ video_array = np.asarray(video)
+ video_array_reshape = []
+ for idx, img in enumerate(video_array):
+ img = cv2.resize(img, (frame_shape[0], frame_shape[1]))
+ video_array_reshape.append(img.astype(np.uint8))
+ video_array_reshape = np.asarray(video_array_reshape)
+
+ if saveto == 'folder':
+ sub_dir = name.with_suffix('')
+ try:
+ sub_dir.mkdir()
+ except FileExistsError:
+ pass
+ for idx, img in enumerate(video_array_reshape):
+ cv2.imwrite(str(sub_dir.joinpath('%i.png' % idx)), img[:, :, [2, 1, 0]])
+ name.unlink()
+ return video_array_reshape
+ else:
+ raise Exception("Unknown dataset file extensions %s" % name)
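+
+# Example (sketch, hypothetical path): read_video(Path('data/train/clip.mp4'), saveto='folder')
+# resizes every frame to frame_shape, writes them as clip/0.png, clip/1.png, ..., deletes the
+# original video file and returns the frame array, so later epochs hit the folder branch above.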
+
+
+class FramesDataset(Dataset):
+ """
+ Dataset of videos, each video can be represented as:
+ - an image of concatenated frames
+ - '.mp4' or '.gif'
+ - folder with all frames
+ FramesDataset[i]: obtain sample from i-th video in self.videos
+ """
+ def __init__(self, cfg):
+ self.root_dir = Path(cfg['dataroot'])
+ self.videos = None
+ self.frame_shape = tuple(cfg['frame_shape'])
+ self.id_sampling = cfg['id_sampling']
+ self.time_flip = cfg['time_flip']
+ self.is_train = True if cfg['phase'] == 'train' else False
+ self.pairs_list = cfg.setdefault('pairs_list', None)
+ self.create_frames_folder = cfg['create_frames_folder']
+ self.transform = None
+ random_seed = 0
+ assert self.root_dir.joinpath('train').exists()
+ assert self.root_dir.joinpath('test').exists()
+ logging.info("Use predefined train-test split.")
+ if self.id_sampling:
+ train_videos = {
+ video.name.split('#')[0]
+ for video in self.root_dir.joinpath('train').iterdir()
+ }
+ train_videos = list(train_videos)
+ else:
+ train_videos = list(self.root_dir.joinpath('train').iterdir())
+ test_videos = list(self.root_dir.joinpath('test').iterdir())
+ self.root_dir = self.root_dir.joinpath(
+ 'train' if self.is_train else 'test')
+
+ if self.is_train:
+ self.videos = train_videos
+ self.transform = build_transforms(cfg['transforms'])
+ else:
+ self.videos = test_videos
+ self.transform = None
+
+ def __len__(self):
+ return len(self.videos)
+
+ def __getitem__(self, idx):
+ if self.is_train and self.id_sampling:
+ name = self.videos[idx]
+ path = Path(
+ np.random.choice(
+ glob.glob(os.path.join(self.root_dir, name + '*.mp4'))))
+ else:
+ path = self.videos[idx]
+ video_name = path.name
+ if self.is_train and path.is_dir():
+ frames = sorted(path.iterdir(),
+ key=lambda x: int(x.with_suffix('').name))
+ num_frames = len(frames)
+ frame_idx = np.sort(
+ np.random.choice(num_frames, replace=True, size=2))
+ video_array = [imread(str(frames[idx])) for idx in frame_idx]
+ else:
+ if self.create_frames_folder:
+ video_array = read_video(path,
+ frame_shape=self.frame_shape,
+ saveto='folder')
+ self.videos[idx] = path.with_suffix(
+ '') # rename /xx/xx/xx.gif -> /xx/xx/xx
+ else:
+ video_array = read_video(path,
+ frame_shape=self.frame_shape,
+ saveto=None)
+ num_frames = len(video_array)
+ frame_idx = np.sort(
+ np.random.choice(
+ num_frames, replace=True,
+ size=2)) if self.is_train else range(num_frames)
+ video_array = [video_array[i] for i in frame_idx]
+ # convert to 3-channel image
+ if video_array[0].shape[-1] == 4:
+ video_array = [i[..., :3] for i in video_array]
+ elif video_array[0].shape[-1] == 1:
+ video_array = [np.tile(i, (1, 1, 3)) for i in video_array]
+ elif len(video_array[0].shape) == 2:
+ video_array = [
+ np.tile(i[..., np.newaxis], (1, 1, 3)) for i in video_array
+ ]
+ out = {}
+ if self.is_train:
+ if self.transform is not None: #modify
+ t = self.transform(tuple(video_array))
+ out['driving'] = t[0].transpose(2, 0, 1).astype(
+ np.float32) / 255.0
+ out['source'] = t[1].transpose(2, 0, 1).astype(
+ np.float32) / 255.0
+ else:
+ source = np.array(video_array[0],
+ dtype='float32') / 255.0 # shape is [H, W, C]
+ driving = np.array(
+ video_array[1],
+ dtype='float32') / 255.0 # shape is [H, W, C]
+ out['driving'] = driving.transpose(2, 0, 1)
+ out['source'] = source.transpose(2, 0, 1)
+ if self.time_flip and np.random.rand() < 0.5: #modify
+ buf = out['driving']
+ out['driving'] = out['source']
+ out['source'] = buf
+ else:
+ video = np.stack(video_array, axis=0).astype(np.float32) / 255.0
+ out['video'] = video.transpose(3, 0, 1, 2)
+ out['name'] = video_name
+ return out
+
+ def get_sample(self, idx):
+ return self.__getitem__(idx)
+
+
+class DatasetRepeater(Dataset):
+ """
+ Pass several times over the same dataset for better i/o performance
+ """
+ def __init__(self, dataset, num_repeats=100):
+ self.dataset = dataset
+ self.num_repeats = num_repeats
+
+ def __len__(self):
+ return self.num_repeats * self.dataset.__len__()
+
+ def __getitem__(self, idx):
+ return self.dataset[idx % self.dataset.__len__()]
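+
+
+# Example (sketch): for a base dataset of length 100, DatasetRepeater(base, num_repeats=5)
+# reports len() == 500 and maps index 123 back to base[23], so a single "epoch" walks the
+# underlying samples several times with fewer dataloader restarts.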
diff --git a/ppgan/datasets/gfpgan_datasets.py b/ppgan/datasets/gfpgan_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f34ac1cb7f4cb76498ba698eaeac5f10f8b69bf
--- /dev/null
+++ b/ppgan/datasets/gfpgan_datasets.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import math
+import numpy as np
+import random
+import os
+
+import paddle
+import paddle.nn.functional as F
+from paddle.vision.transforms.functional import normalize
+
+from .builder import DATASETS
+
+from ppgan.utils.download import get_path_from_url
+from ppgan.utils.gfpgan_tools import *
+
+
+@DATASETS.register()
+class FFHQDegradationDataset(paddle.io.Dataset):
+ """FFHQ dataset for GFPGAN.
+
+ It reads high-resolution images and then generates low-quality (LQ) images on-the-fly.
+
+ Args:
+ opt (dict): Config for train datasets. It contains the following keys:
+ dataroot_gt (str): Data root path for gt.
+ io_backend (dict): IO backend type and other kwarg.
+ mean (list | tuple): Image mean.
+ std (list | tuple): Image std.
+ use_hflip (bool): Whether to horizontally flip.
+ Please see more options in the codes.
+ """
+ def __init__(self, **opt):
+ super(FFHQDegradationDataset, self).__init__()
+ self.opt = opt
+ self.file_client = None
+ self.io_backend_opt = opt['io_backend']
+ self.gt_folder = opt['dataroot_gt']
+ self.mean = opt['mean']
+ self.std = opt['std']
+ self.out_size = opt['out_size']
+ self.crop_components = opt.get('crop_components', False)
+ self.eye_enlarge_ratio = opt.get('eye_enlarge_ratio', 1)
+ if self.crop_components:
+ self.components_list = get_path_from_url(opt.get('component_path'))
+ self.components_list = paddle.load(self.components_list)
+ # print(self.components_list)
+ self.paths = paths_from_folder(self.gt_folder)
+ self.blur_kernel_size = opt['blur_kernel_size']
+ self.kernel_list = opt['kernel_list']
+ self.kernel_prob = opt['kernel_prob']
+ self.blur_sigma = opt['blur_sigma']
+ self.downsample_range = opt['downsample_range']
+ self.noise_range = opt['noise_range']
+ self.jpeg_range = opt['jpeg_range']
+ self.color_jitter_prob = opt.get('color_jitter_prob')
+ self.color_jitter_pt_prob = opt.get('color_jitter_pt_prob')
+ self.color_jitter_shift = opt.get('color_jitter_shift', 20)
+ self.gray_prob = opt.get('gray_prob')
+ self.color_jitter_shift /= 255.0
+
+ @staticmethod
+ def color_jitter(img, shift):
+ """jitter color: randomly jitter the RGB values, in numpy formats"""
+ jitter_val = np.random.uniform(-shift, shift, 3).astype(np.float32)
+ img = img + jitter_val
+ img = np.clip(img, 0, 1)
+ return img
+
+ @staticmethod
+ def color_jitter_pt(img, brightness, contrast, saturation, hue):
+ """jitter color: randomly jitter the brightness, contrast, saturation, and hue, in torch Tensor formats"""
+ fn_idx = paddle.randperm(4)
+ img = paddle.to_tensor(img, dtype=img.dtype)
+ for fn_id in fn_idx:
+ # print('fn_id',fn_id)
+ if fn_id == 0 and brightness is not None:
+ brightness_factor = paddle.to_tensor(1.0).uniform_(
+ brightness[0], brightness[1]).item()
+ # print("brightness_factor",brightness_factor)
+ img = adjust_brightness(img, brightness_factor)
+ if fn_id == 1 and contrast is not None:
+ contrast_factor = paddle.to_tensor(1.0).uniform_(
+ contrast[0], contrast[1]).item()
+ img = adjust_contrast(img, contrast_factor)
+ if fn_id == 2 and saturation is not None:
+ saturation_factor = paddle.to_tensor(1.0).uniform_(
+ saturation[0], saturation[1]).item()
+ img = adjust_saturation(img, saturation_factor)
+ if fn_id == 3 and hue is not None:
+ hue_factor = paddle.to_tensor(1.0).uniform_(hue[0],
+ hue[1]).item()
+ img = adjust_hue(img, hue_factor)
+ return img
+
+ def get_component_coordinates(self, index, status):
+ """Get facial component (left_eye, right_eye, mouth) coordinates from a pre-loaded pth file"""
+ # print(f'{index:08d}',type(self.components_list))
+ components_bbox = self.components_list[f'{index:08d}']
+ if status[0]:
+ tmp = components_bbox['left_eye']
+ components_bbox['left_eye'] = components_bbox['right_eye']
+ components_bbox['right_eye'] = tmp
+ components_bbox['left_eye'][
+ 0] = self.out_size - components_bbox['left_eye'][0]
+ components_bbox['right_eye'][
+ 0] = self.out_size - components_bbox['right_eye'][0]
+ components_bbox['mouth'][
+ 0] = self.out_size - components_bbox['mouth'][0]
+ locations = []
+ for part in ['left_eye', 'right_eye', 'mouth']:
+ mean = components_bbox[part][0:2]
+ half_len = components_bbox[part][2]
+ if 'eye' in part:
+ half_len *= self.eye_enlarge_ratio
+ loc = np.hstack((mean - half_len + 1, mean + half_len))
+ loc = paddle.to_tensor(loc)
+ locations.append(loc)
+ return locations
+
+ def __getitem__(self, index):
+ if self.file_client is None:
+ self.file_client = FileClient(self.io_backend_opt.pop('type'),
+ **self.io_backend_opt)
+ gt_path = self.paths[index]
+ img_bytes = self.file_client.get(gt_path)
+ img_gt = imfrombytes(img_bytes, float32=True)
+ img_gt = cv2.resize(img_gt, (self.out_size, self.out_size))
+ img_gt, status = augment(img_gt,
+ hflip=self.opt['use_hflip'],
+ rotation=False,
+ return_status=True)
+ h, w, _ = img_gt.shape
+ if self.crop_components:
+ locations = self.get_component_coordinates(index, status)
+ loc_left_eye, loc_right_eye, loc_mouth = locations
+ kernel = random_mixed_kernels(self.kernel_list,
+ self.kernel_prob,
+ self.blur_kernel_size,
+ self.blur_sigma,
+ self.blur_sigma, [-math.pi, math.pi],
+ noise_range=None)
+ img_lq = cv2.filter2D(img_gt, -1, kernel)
+ scale = np.random.uniform(self.downsample_range[0],
+ self.downsample_range[1])
+ img_lq = cv2.resize(img_lq, (int(w // scale), int(h // scale)),
+ interpolation=cv2.INTER_LINEAR)
+ if self.noise_range is not None:
+ img_lq = random_add_gaussian_noise(img_lq, self.noise_range)
+ if self.jpeg_range is not None:
+ img_lq = random_add_jpg_compression(img_lq, self.jpeg_range)
+ img_lq = cv2.resize(img_lq, (w, h), interpolation=cv2.INTER_LINEAR)
+ if self.color_jitter_prob is not None and np.random.uniform(
+ ) < self.color_jitter_prob:
+ img_lq = self.color_jitter(img_lq, self.color_jitter_shift)
+ if self.gray_prob and np.random.uniform() < self.gray_prob:
+ img_lq = cv2.cvtColor(img_lq, cv2.COLOR_BGR2GRAY)
+ img_lq = np.tile(img_lq[:, :, None], [1, 1, 3])
+ if self.opt.get('gt_gray'):
+ img_gt = cv2.cvtColor(img_gt, cv2.COLOR_BGR2GRAY)
+ img_gt = np.tile(img_gt[:, :, None], [1, 1, 3])
+ img_gt, img_lq = img2tensor([img_gt, img_lq],
+ bgr2rgb=True,
+ float32=True)
+ if self.color_jitter_pt_prob is not None and np.random.uniform(
+ ) < self.color_jitter_pt_prob:
+ brightness = self.opt.get('brightness', (0.5, 1.5))
+ contrast = self.opt.get('contrast', (0.5, 1.5))
+ saturation = self.opt.get('saturation', (0, 1.5))
+ hue = self.opt.get('hue', (-0.1, 0.1))
+ img_lq = self.color_jitter_pt(img_lq, brightness, contrast,
+ saturation, hue)
+ img_lq = np.clip((img_lq * 255.0).round(), 0, 255) / 255.0
+ img_gt = normalize(img_gt, self.mean, self.std)
+ img_lq = normalize(img_lq, self.mean, self.std)
+ if self.crop_components:
+ return_dict = {
+ 'lq': img_lq,
+ 'gt': img_gt,
+ 'gt_path': gt_path,
+ 'loc_left_eye': loc_left_eye,
+ 'loc_right_eye': loc_right_eye,
+ 'loc_mouth': loc_mouth
+ }
+ return return_dict
+ else:
+ return {'lq': img_lq, 'gt': img_gt, 'gt_path': gt_path}
+
+ def __len__(self):
+ return len(self.paths)
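+
+
+# Rough order of the on-the-fly degradation in __getitem__ above: random mixed-kernel blur
+# -> random downsample -> optional Gaussian noise -> optional JPEG compression -> resize back
+# to out_size -> optional color jitter / grayscale, then tensor conversion (with an optional
+# tensor-space jitter) and normalization.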
diff --git a/ppgan/datasets/gpen_dataset.py b/ppgan/datasets/gpen_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4584f8170742f8b9e00bc5d8b261af317bf2a26
--- /dev/null
+++ b/ppgan/datasets/gpen_dataset.py
@@ -0,0 +1,401 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import logging
+import os
+import numpy as np
+import paddle
+from paddle.io import Dataset
+import cv2
+
+from .builder import DATASETS
+
+import math
+import random
+
+logger = logging.getLogger(__name__)
+
+
+def generate_gaussian_noise(img, sigma=10, gray_noise=False):
+ """Generate Gaussian noise.
+
+ Args:
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
+ sigma (float): Noise scale (measured in range 255). Default: 10.
+
+ Returns:
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
+ float32.
+ """
+ if gray_noise:
+ noise = np.float32(np.random.randn(*(img.shape[0:2]))) * sigma / 255.
+ noise = np.expand_dims(noise, axis=2).repeat(3, axis=2)
+ else:
+ noise = np.float32(np.random.randn(*(img.shape))) * sigma / 255.
+ return noise
+
+
+def random_generate_gaussian_noise(img, sigma_range=(0, 10), gray_prob=0):
+ sigma = np.random.uniform(sigma_range[0], sigma_range[1])
+ if np.random.uniform() < gray_prob:
+ gray_noise = True
+ else:
+ gray_noise = False
+ return generate_gaussian_noise(img, sigma, gray_noise)
+
+
+def random_add_gaussian_noise(img,
+ sigma_range=(0, 1.0),
+ gray_prob=0,
+ clip=True,
+ rounds=False):
+ noise = random_generate_gaussian_noise(img, sigma_range, gray_prob)
+ out = img + noise
+ if clip and rounds:
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.
+ elif clip:
+ out = np.clip(out, 0, 1)
+ elif rounds:
+ out = (out * 255.0).round() / 255.
+ return out
+
+
+def add_jpg_compression(img, quality=90):
+ """Add JPG compression artifacts.
+
+ Args:
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
+ quality (float): JPG compression quality. 0 for lowest quality, 100 for
+ best quality. Default: 90.
+
+ Returns:
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
+ float32.
+ """
+ img = np.clip(img, 0, 1)
+ encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+ _, encimg = cv2.imencode('.jpg', img * 255., encode_param)
+ img = np.float32(cv2.imdecode(encimg, 1)) / 255.
+ return img
+
+
+def random_add_jpg_compression(img, quality_range=(90, 100)):
+ """Randomly add JPG compression artifacts.
+
+ Args:
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
+ quality_range (tuple[float] | list[float]): JPG compression quality
+ range. 0 for lowest quality, 100 for best quality.
+ Default: (90, 100).
+
+ Returns:
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
+ float32.
+ """
+ quality = int(np.random.uniform(quality_range[0], quality_range[1]))
+ return add_jpg_compression(img, quality)
+
+
+def random_mixed_kernels(kernel_list,
+ kernel_prob,
+ kernel_size=21,
+ sigma_x_range=(0.6, 5),
+ sigma_y_range=(0.6, 5),
+ rotation_range=(-math.pi, math.pi),
+ betag_range=(0.5, 8),
+ betap_range=(0.5, 8),
+ noise_range=None):
+ """Randomly generate mixed kernels.
+
+ Args:
+ kernel_list (tuple): a list name of kernel types,
+ support ['iso', 'aniso', 'skew', 'generalized', 'plateau_iso',
+ 'plateau_aniso']
+ kernel_prob (tuple): corresponding kernel probability for each
+ kernel type
+ kernel_size (int):
+ sigma_x_range (tuple): [0.6, 5]
+ sigma_y_range (tuple): [0.6, 5]
+ rotation_range (tuple): [-math.pi, math.pi]
+ beta_range (tuple): [0.5, 8]
+ noise_range(tuple, optional): multiplicative kernel noise,
+ [0.75, 1.25]. Default: None
+
+ Returns:
+ kernel (ndarray):
+ """
+ kernel_type = random.choices(kernel_list, kernel_prob)[0]
+ if kernel_type == 'iso':
+ kernel = random_bivariate_Gaussian(kernel_size,
+ sigma_x_range,
+ sigma_y_range,
+ rotation_range,
+ noise_range=noise_range,
+ isotropic=True)
+ elif kernel_type == 'aniso':
+ kernel = random_bivariate_Gaussian(kernel_size,
+ sigma_x_range,
+ sigma_y_range,
+ rotation_range,
+ noise_range=noise_range,
+ isotropic=False)
+ return kernel
+
+
+def random_bivariate_Gaussian(kernel_size,
+ sigma_x_range,
+ sigma_y_range,
+ rotation_range,
+ noise_range=None,
+ isotropic=True):
+ """Randomly generate bivariate isotropic or anisotropic Gaussian kernels.
+
+ In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` are ignored.
+
+ Args:
+ kernel_size (int):
+ sigma_x_range (tuple): [0.6, 5]
+ sigma_y_range (tuple): [0.6, 5]
+ rotation_range (tuple): [-math.pi, math.pi]
+ noise_range(tuple, optional): multiplicative kernel noise,
+ [0.75, 1.25]. Default: None
+
+ Returns:
+ kernel (ndarray):
+ """
+ assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
+ assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
+ sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
+ if isotropic is False:
+ assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
+ assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
+ sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
+ rotation = np.random.uniform(rotation_range[0], rotation_range[1])
+ else:
+ sigma_y = sigma_x
+ rotation = 0
+
+ kernel = bivariate_Gaussian(kernel_size,
+ sigma_x,
+ sigma_y,
+ rotation,
+ isotropic=isotropic)
+
+ # add multiplicative noise
+ if noise_range is not None:
+ assert noise_range[0] < noise_range[1], 'Wrong noise range.'
+ noise = np.random.uniform(noise_range[0],
+ noise_range[1],
+ size=kernel.shape)
+ kernel = kernel * noise
+ kernel = kernel / np.sum(kernel)
+ return kernel
+
+
+def bivariate_Gaussian(kernel_size,
+ sig_x,
+ sig_y,
+ theta,
+ grid=None,
+ isotropic=True):
+ """Generate a bivariate isotropic or anisotropic Gaussian kernel.
+
+ In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored.
+
+ Args:
+ kernel_size (int):
+ sig_x (float):
+ sig_y (float):
+ theta (float): Radian measurement.
+ grid (ndarray, optional): generated by :func:`mesh_grid`,
+ with the shape (K, K, 2), K is the kernel size. Default: None
+ isotropic (bool):
+
+ Returns:
+ kernel (ndarray): normalized kernel.
+ """
+ if grid is None:
+ grid, _, _ = mesh_grid(kernel_size)
+ if isotropic:
+ sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
+ else:
+ sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
+ kernel = pdf2(sigma_matrix, grid)
+ kernel = kernel / np.sum(kernel)
+ return kernel
+
+
+def sigma_matrix2(sig_x, sig_y, theta):
+ """Calculate the rotated sigma matrix (two dimensional matrix).
+
+ Args:
+ sig_x (float):
+ sig_y (float):
+ theta (float): Radian measurement.
+
+ Returns:
+ ndarray: Rotated sigma matrix.
+ """
+ d_matrix = np.array([[sig_x**2, 0], [0, sig_y**2]])
+ u_matrix = np.array([[np.cos(theta), -np.sin(theta)],
+ [np.sin(theta), np.cos(theta)]])
+ return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T))
+
+
+def mesh_grid(kernel_size):
+ """Generate the mesh grid, centering at zero.
+
+ Args:
+ kernel_size (int):
+
+ Returns:
+ xy (ndarray): with the shape (kernel_size, kernel_size, 2)
+ xx (ndarray): with the shape (kernel_size, kernel_size)
+ yy (ndarray): with the shape (kernel_size, kernel_size)
+ """
+ ax = np.arange(-kernel_size // 2 + 1., kernel_size // 2 + 1.)
+ xx, yy = np.meshgrid(ax, ax)
+ xy = np.hstack((xx.reshape((kernel_size * kernel_size, 1)),
+ yy.reshape(kernel_size * kernel_size,
+ 1))).reshape(kernel_size, kernel_size, 2)
+ return xy, xx, yy
+
+
+def pdf2(sigma_matrix, grid):
+ """Calculate PDF of the bivariate Gaussian distribution.
+
+ Args:
+ sigma_matrix (ndarray): with the shape (2, 2)
+ grid (ndarray): generated by :func:`mesh_grid`,
+ with the shape (K, K, 2), K is the kernel size.
+
+ Returns:
+ kernel (ndarray): un-normalized kernel.
+ """
+ inverse_sigma = np.linalg.inv(sigma_matrix)
+ kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2))
+ return kernel
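+
+
+# Note: pdf2 only evaluates exp(-0.5 * x^T * Sigma^{-1} * x) on the mesh grid and skips the
+# normalization constant; callers such as bivariate_Gaussian normalize by the kernel sum instead.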
+
+
+class GFPGAN_degradation(object):
+
+ def __init__(self):
+ self.kernel_list = ['iso', 'aniso']
+ self.kernel_prob = [0.5, 0.5]
+ self.blur_kernel_size = 41
+ self.blur_sigma = [0.1, 10]
+ self.downsample_range = [0.8, 8]
+ self.noise_range = [0, 20]
+ self.jpeg_range = [60, 100]
+ self.gray_prob = 0.2
+ self.color_jitter_prob = 0.0
+ self.color_jitter_pt_prob = 0.0
+ self.shift = 20 / 255.
+
+ def degrade_process(self, img_gt):
+ if random.random() > 0.5:
+ img_gt = cv2.flip(img_gt, 1)
+
+ h, w = img_gt.shape[:2]
+
+ # random color jitter
+ if np.random.uniform() < self.color_jitter_prob:
+ jitter_val = np.random.uniform(-self.shift, self.shift,
+ 3).astype(np.float32)
+ img_gt = img_gt + jitter_val
+ img_gt = np.clip(img_gt, 0, 1)
+
+ # random grayscale
+ if np.random.uniform() < self.gray_prob:
+ img_gt = cv2.cvtColor(img_gt, cv2.COLOR_BGR2GRAY)
+ img_gt = np.tile(img_gt[:, :, None], [1, 1, 3])
+
+ # ------------------------ generate lq image ------------------------ #
+ # blur
+ kernel = random_mixed_kernels(self.kernel_list,
+ self.kernel_prob,
+ self.blur_kernel_size,
+ self.blur_sigma,
+ self.blur_sigma, [-math.pi, math.pi],
+ noise_range=None)
+ img_lq = cv2.filter2D(img_gt, -1, kernel)
+ # downsample
+ scale = np.random.uniform(self.downsample_range[0],
+ self.downsample_range[1])
+ img_lq = cv2.resize(img_lq, (int(w // scale), int(h // scale)),
+ interpolation=cv2.INTER_LINEAR)
+
+ # noise
+ if self.noise_range is not None:
+ img_lq = random_add_gaussian_noise(img_lq, self.noise_range)
+ # jpeg compression
+ if self.jpeg_range is not None:
+ img_lq = random_add_jpg_compression(img_lq, self.jpeg_range)
+
+ # round and clip
+ img_lq = np.clip((img_lq * 255.0).round(), 0, 255) / 255.
+
+ # resize to original size
+ img_lq = cv2.resize(img_lq, (w, h), interpolation=cv2.INTER_LINEAR)
+
+ return img_gt, img_lq
+
+
+@DATASETS.register()
+class GPENDataset(Dataset):
+ """
+ Dataset of high-quality images for the GPEN model; low-quality (LQ) inputs are generated on the fly.
+ """
+
+ def __init__(self, dataroot, size=256, amount=-1):
+ super(GPENDataset, self).__init__()
+ self.size = size
+ self.HQ_imgs = sorted(glob.glob(os.path.join(dataroot,
+ '*/*.*g')))[:amount]
+ self.length = len(self.HQ_imgs)
+ if self.length == 0:
+ self.HQ_imgs = sorted(glob.glob(os.path.join(dataroot,
+ '*.*g')))[:amount]
+ self.length = len(self.HQ_imgs)
+ print(self.length)
+ self.degrader = GFPGAN_degradation()
+
+ def __len__(self):
+ return self.length
+
+ def __getitem__(self, index):
+ """Get training sample
+
+ return:
+ img_lq: degraded low-quality image with shape [C,H,W],
+ img_gt: ground-truth image with shape [C,H,W]
+ """
+ img_gt = cv2.imread(self.HQ_imgs[index], cv2.IMREAD_COLOR)
+ img_gt = cv2.resize(img_gt, (self.size, self.size),
+ interpolation=cv2.INTER_AREA)
+
+ # BFR degradation
+ img_gt = img_gt.astype(np.float32) / 255.
+ img_gt, img_lq = self.degrader.degrade_process(img_gt)
+
+ img_gt = (paddle.to_tensor(img_gt) - 0.5) / 0.5
+ img_lq = (paddle.to_tensor(img_lq) - 0.5) / 0.5
+
+ img_gt = img_gt.transpose([2, 0, 1]).flip(0)
+ img_lq = img_lq.transpose([2, 0, 1]).flip(0)
+
+ return np.array(img_lq).astype('float32'), np.array(img_gt).astype(
+ 'float32')
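+
+
+# Note on the sample format: __getitem__ returns (lq, gt) as float32 CHW arrays scaled to
+# [-1, 1]; images are read as BGR and the final flip(0) on the channel axis converts them to RGB.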
diff --git a/ppgan/datasets/invdn_dataset.py b/ppgan/datasets/invdn_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..73a0e9ee41ce2ff2cda230da93e71522ceb58f7f
--- /dev/null
+++ b/ppgan/datasets/invdn_dataset.py
@@ -0,0 +1,274 @@
+# code was heavily based on https://github.com/cszn/KAIR
+# MIT License
+# Copyright (c) 2019 Kai Zhang
+
+import os
+import os.path as osp
+import pickle
+import random
+import numpy as np
+import cv2
+import math
+
+import paddle
+from paddle.io import Dataset
+
+from .builder import DATASETS
+
+IMG_EXTENSIONS = [
+ '.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG', '.ppm', '.PPM', '.bmp',
+ '.BMP'
+]
+
+
+def is_image_file(filename):
+ return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
+
+
+def _get_paths_from_images(path):
+ '''get image path list from image folder'''
+ assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
+ images = []
+ for dirpath, _, fnames in sorted(os.walk(path)):
+ for fname in sorted(fnames):
+ if is_image_file(fname):
+ img_path = os.path.join(dirpath, fname)
+ images.append(img_path)
+ assert images, '{:s} has no valid image file'.format(path)
+ return images
+
+
+def get_image_paths(data_type, dataroot):
+ '''get image path list'''
+ paths, sizes = None, None
+ if dataroot is not None:
+ if data_type == 'img':
+ paths = sorted(_get_paths_from_images(dataroot))
+ else:
+ raise NotImplementedError(
+ 'data_type [{:s}] is not recognized.'.format(data_type))
+ return paths, sizes
+
+
+def read_img(env, path, size=None):
+ '''read image by cv2
+ return: Numpy float32, HWC, BGR, [0,1]'''
+ if env is None: # img
+ #img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
+ img = cv2.imread(path, cv2.IMREAD_COLOR)
+ img = img.astype(np.float32) / 255.
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ # some images have 4 channels
+ if img.shape[2] > 3:
+ img = img[:, :, :3]
+ return img
+
+
+def modcrop(img_in, scale):
+ # img_in: Numpy, HWC or HW
+ img = np.copy(img_in)
+ if img.ndim == 2:
+ H, W = img.shape
+ H_r, W_r = H % scale, W % scale
+ img = img[:H - H_r, :W - W_r]
+ elif img.ndim == 3:
+ H, W, C = img.shape
+ H_r, W_r = H % scale, W % scale
+ img = img[:H - H_r, :W - W_r, :]
+ else:
+ raise ValueError('Wrong img ndim: [{:d}].'.format(img.ndim))
+ return img
+
+
+def bgr2ycbcr(img, only_y=True):
+ '''bgr version of rgb2ycbcr
+ only_y: only return Y channel
+ Input:
+ uint8, [0, 255]
+ float, [0, 1]
+ '''
+ in_img_type = img.dtype
+ img.astype(np.float32)
+ if in_img_type != np.uint8:
+ img *= 255.
+ # convert
+ if only_y:
+ rlt = np.dot(img, [24.966, 128.553, 65.481]) / 255.0 + 16.0
+ else:
+ rlt = np.matmul(img,
+ [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
+ [65.481, -37.797, 112.0]]) / 255.0 + [16, 128, 128]
+ if in_img_type == np.uint8:
+ rlt = rlt.round()
+ else:
+ rlt /= 255.
+ return rlt.astype(in_img_type)
+
+
+def channel_convert(in_c, tar_type, img_list):
+ # conversion among BGR, gray and y
+ if in_c == 3 and tar_type == 'gray': # BGR to gray
+ gray_list = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in img_list]
+ return [np.expand_dims(img, axis=2) for img in gray_list]
+ elif in_c == 3 and tar_type == 'y': # BGR to y
+ y_list = [bgr2ycbcr(img, only_y=False) for img in img_list]
+ return y_list
+ # return [np.expand_dims(img, axis=2) for img in y_list]
+ elif in_c == 1 and tar_type == 'RGB': # gray/y to BGR
+ return [cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) for img in img_list]
+ else:
+ return img_list
+
+
+def augment(img_list, hflip=True, rot=True):
+ # horizontal flip OR rotate
+ hflip = hflip and random.random() < 0.5
+ vflip = rot and random.random() < 0.5
+ rot90 = rot and random.random() < 0.5
+
+ def _augment(img):
+ if isinstance(img, list):
+ if hflip:
+ img = [image[:, ::-1, :] for image in img]
+ if vflip:
+ img = [image[::-1, :, :] for image in img]
+ if rot90:
+ img = [image.transpose(1, 0, 2) for image in img]
+ else:
+ if hflip:
+ img = img[:, ::-1, :]
+ if vflip:
+ img = img[::-1, :, :]
+ if rot90:
+ img = img.transpose(1, 0, 2)
+ return img
+
+ return [_augment(img) for img in img_list]
+
+
+@DATASETS.register()
+class InvDNDataset(Dataset):
+ '''
+ Read LQ (Low Quality, here LR), GT and noisy images.
+ The correspondence is ensured by the 'sorted' function, so please check the name convention.
+ '''
+ def __init__(self, opt=None):
+ super(InvDNDataset, self).__init__()
+ self.opt = opt
+ self.is_train = True if self.opt['phase'] == 'train' else False
+
+ self.paths_LQ, self.paths_GT, self.paths_Noisy = None, None, None
+ self.sizes_LQ, self.sizes_GT, self.sizes_Noisy = None, None, None
+ self.LQ_env, self.GT_env, self.Noisy_env = None, None, None
+
+ self.data_type = "img"
+
+ if self.is_train:
+ dataroot_gt = osp.join(opt["train_dir"], "GT")
+ dataroot_noisy = osp.join(opt["train_dir"], "Noisy")
+ dataroot_lq = osp.join(opt["train_dir"], "LQ")
+ else:
+ dataroot_gt = osp.join(opt["val_dir"], "GT")
+ dataroot_noisy = osp.join(opt["val_dir"], "Noisy")
+ dataroot_lq = None
+
+ self.paths_GT, self.sizes_GT = get_image_paths(self.data_type,
+ dataroot_gt)
+ self.paths_Noisy, self.sizes_Noisy = get_image_paths(
+ self.data_type, dataroot_noisy)
+ self.paths_LQ, self.sizes_LQ = get_image_paths(self.data_type,
+ dataroot_lq)
+
+ assert self.paths_GT, 'Error: GT path is empty.'
+ assert self.paths_Noisy, 'Error: Noisy path is empty.'
+ if self.paths_LQ and self.paths_GT:
+ assert len(self.paths_LQ) == len(
+ self.paths_GT
+ ), 'GT and LQ datasets have different number of images - {}, {}.'.format(
+ len(self.paths_LQ), len(self.paths_GT))
+ self.random_scale_list = [1]
+
+ def __getitem__(self, index):
+ GT_path, Noisy_path, LQ_path = None, None, None
+
+ scale = self.opt["scale"]
+
+ # get GT image
+ GT_path = self.paths_GT[index]
+ resolution = None
+ img_GT = read_img(self.GT_env, GT_path, resolution)
+
+ # modcrop in the validation / test phase
+ if not self.is_train:
+ img_GT = modcrop(img_GT, scale)
+
+ # change color space if necessary
+ img_GT = channel_convert(img_GT.shape[2], "RGB", [img_GT])[0]
+
+ # get Noisy image
+ Noisy_path = self.paths_Noisy[index]
+ resolution = None
+ img_Noisy = read_img(self.Noisy_env, Noisy_path, resolution)
+
+ # modcrop in the validation / test phase
+ if not self.is_train:
+ img_Noisy = modcrop(img_Noisy, scale)
+
+ # change color space if necessary
+ img_Noisy = channel_convert(img_Noisy.shape[2], "RGB", [img_Noisy])[0]
+
+ # get LQ image
+ if self.paths_LQ:
+ LQ_path = self.paths_LQ[index]
+ resolution = None
+ img_LQ = read_img(self.LQ_env, LQ_path, resolution)
+
+ if self.is_train:
+ GT_size = self.opt["crop_size"]
+
+ H, W, C = img_LQ.shape
+ LQ_size = GT_size // scale
+
+ # randomly crop
+ rnd_h = random.randint(0, max(0, H - LQ_size))
+ rnd_w = random.randint(0, max(0, W - LQ_size))
+ img_LQ = img_LQ[rnd_h:rnd_h + LQ_size, rnd_w:rnd_w +
+ LQ_size, :] # (128, 128, 3) --> (36, 36, 3)
+ rnd_h_GT, rnd_w_GT = int(rnd_h * scale), int(rnd_w * scale)
+ img_GT = img_GT[rnd_h_GT:rnd_h_GT + GT_size, rnd_w_GT:rnd_w_GT +
+ GT_size, :] # (512, 512, 3) --> (144, 144, 3)
+ img_Noisy = img_Noisy[rnd_h_GT:rnd_h_GT + GT_size,
+ rnd_w_GT:rnd_w_GT + GT_size, :]
+ # augmentation - flip, rotate
+ img_LQ, img_GT, img_Noisy = augment([img_LQ, img_GT, img_Noisy],
+ True, True)
+
+ # change color space if necessary
+ C = img_LQ.shape[2]
+ img_LQ = channel_convert(C, "RGB", [img_LQ])[0]
+
+ # BGR to RGB, HWC to CHW, numpy to tensor
+ if img_GT.shape[2] == 3:
+ img_GT = img_GT[:, :, [2, 1, 0]]
+ img_Noisy = img_Noisy[:, :, [2, 1, 0]]
+ if self.is_train:
+ img_LQ = img_LQ[:, :, [2, 1, 0]]
+
+ img_GT = paddle.to_tensor(np.ascontiguousarray(
+ np.transpose(img_GT, (2, 0, 1))),
+ dtype="float32")
+ img_Noisy = paddle.to_tensor(np.ascontiguousarray(
+ np.transpose(img_Noisy, (2, 0, 1))),
+ dtype="float32")
+ if self.is_train:
+ img_LQ = paddle.to_tensor(np.ascontiguousarray(
+ np.transpose(img_LQ, (2, 0, 1))),
+ dtype="float32")
+
+ if self.is_train:
+ return img_Noisy, img_GT, img_LQ
+ return img_Noisy, img_GT, img_GT
+
+ def __len__(self):
+ return len(self.paths_GT) #32000 for train, 1280 for valid
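+
+
+# Note on the sample format: in training mode __getitem__ returns (Noisy, GT, LQ) crops, while
+# in validation mode the LQ slot is filled with GT again, i.e. (Noisy, GT, GT); all outputs are
+# float32 RGB tensors in CHW layout.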
diff --git a/ppgan/datasets/lapstyle_dataset.py b/ppgan/datasets/lapstyle_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..619d39c28673bd32de5dc1931344287427d081b2
--- /dev/null
+++ b/ppgan/datasets/lapstyle_dataset.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import numpy as np
+from PIL import Image
+import paddle
+import paddle.vision.transforms as T
+from paddle.io import Dataset
+import cv2
+
+from .builder import DATASETS
+
+logger = logging.getLogger(__name__)
+
+
+def data_transform(crop_size):
+ transform_list = [T.RandomCrop(crop_size)]
+ return T.Compose(transform_list)
+
+
+@DATASETS.register()
+class LapStyleDataset(Dataset):
+ """
+ coco2017 dataset for LapStyle model
+ """
+ def __init__(self, content_root, style_root, load_size, crop_size):
+ super(LapStyleDataset, self).__init__()
+ self.content_root = content_root
+ self.paths = os.listdir(self.content_root)
+ self.style_root = style_root
+ self.load_size = load_size
+ self.crop_size = crop_size
+ self.transform = data_transform(self.crop_size)
+
+ def __getitem__(self, index):
+ """Get training sample
+
+ return:
+ ci: content image with shape [C,W,H],
+ si: style image with shape [C,W,H],
+ ci_path: str
+ """
+ path = self.paths[index]
+ content_img = cv2.imread(os.path.join(self.content_root, path))
+ if content_img.ndim == 2:
+ content_img = cv2.cvtColor(content_img, cv2.COLOR_GRAY2RGB)
+ else:
+ content_img = cv2.cvtColor(content_img, cv2.COLOR_BGR2RGB)
+ content_img = Image.fromarray(content_img)
+ content_img = content_img.resize((self.load_size, self.load_size),
+ Image.BILINEAR)
+ content_img = np.array(content_img)
+ style_img = cv2.imread(self.style_root)
+ style_img = cv2.cvtColor(style_img, cv2.COLOR_BGR2RGB)
+ style_img = Image.fromarray(style_img)
+ style_img = style_img.resize((self.load_size, self.load_size),
+ Image.BILINEAR)
+ style_img = np.array(style_img)
+ content_img = self.transform(content_img)
+ style_img = self.transform(style_img)
+ content_img = self.img(content_img)
+ style_img = self.img(style_img)
+ return {'ci': content_img, 'si': style_img, 'ci_path': path}
+
+ def img(self, img):
+ """make image with [0,255] and HWC to [0,1] and CHW
+
+ return:
+ img: image with shape [3,W,H] and value [0, 1].
+ """
+ # [0,255] to [0,1]
+ img = img.astype(np.float32) / 255.
+ # some images have 4 channels
+ if img.shape[2] > 3:
+ img = img[:, :, :3]
+ # HWC to CHW
+ img = np.transpose(img, (2, 0, 1)).astype('float32')
+ return img
+
+ def __len__(self):
+ return len(self.paths)
+
+ def name(self):
+ return 'LapStyleDataset'
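+
+
+# Note on the sample format: each item is a dict {'ci': content, 'si': style, 'ci_path': name};
+# both images are RGB, CHW, float32 in [0, 1], resized to load_size and randomly cropped to
+# crop_size.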
diff --git a/ppgan/datasets/mpr_dataset.py b/ppgan/datasets/mpr_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..fad866dabeeecdda8ce047f32d34286e878ece56
--- /dev/null
+++ b/ppgan/datasets/mpr_dataset.py
@@ -0,0 +1,194 @@
+# code was heavily based on https://github.com/swz30/MPRNet
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/swz30/MPRNet/blob/main/LICENSE.md
+
+import os
+import random
+import numbers
+import numpy as np
+import cv2
+import paddle
+from PIL import Image, ImageEnhance
+from paddle.io import Dataset
+from .builder import DATASETS
+from paddle.vision.transforms.functional import to_tensor, adjust_brightness, adjust_saturation, rotate, hflip, vflip, center_crop
+
+
+def is_image_file(filename):
+ return any(
+ filename.endswith(extension)
+ for extension in ['jpeg', 'JPEG', 'jpg', 'png', 'JPG', 'PNG', 'gif'])
+
+
+@DATASETS.register()
+class MPRTrain(Dataset):
+ def __init__(self, rgb_dir, img_options=None):
+ super(MPRTrain, self).__init__()
+
+ inp_files = sorted(os.listdir(os.path.join(rgb_dir, 'input')))
+ tar_files = sorted(os.listdir(os.path.join(rgb_dir, 'target')))
+
+ self.inp_filenames = [
+ os.path.join(rgb_dir, 'input', x) for x in inp_files
+ if is_image_file(x)
+ ]
+ self.tar_filenames = [
+ os.path.join(rgb_dir, 'target', x) for x in tar_files
+ if is_image_file(x)
+ ]
+
+ self.img_options = img_options
+ self.sizex = len(self.tar_filenames) # get the size of target
+
+ self.ps = self.img_options['patch_size']
+
+ def __len__(self):
+ return self.sizex
+
+ def __getitem__(self, index):
+ index_ = index % self.sizex
+ ps = self.ps
+
+ inp_path = self.inp_filenames[index_]
+ tar_path = self.tar_filenames[index_]
+
+ inp_img = Image.open(inp_path)
+ tar_img = Image.open(tar_path)
+
+ w, h = tar_img.size
+ padw = ps - w if w < ps else 0
+ padh = ps - h if h < ps else 0
+
+ # Reflect-pad in case the image is smaller than patch_size: convert to numpy,
+ # pad height/width, then go back to PIL so the PIL-based transforms below still
+ # apply (assumes 3-channel images)
+ if padw != 0 or padh != 0:
+ inp_img = Image.fromarray(
+ np.pad(np.array(inp_img), ((0, padh), (0, padw), (0, 0)), mode='reflect'))
+ tar_img = Image.fromarray(
+ np.pad(np.array(tar_img), ((0, padh), (0, padw), (0, 0)), mode='reflect'))
+
+ aug = random.randint(0, 2)
+ if aug == 1:
+ inp_img = adjust_brightness(inp_img, 1)
+ tar_img = adjust_brightness(tar_img, 1)
+
+ aug = random.randint(0, 2)
+ if aug == 1:
+ sat_factor = 1 + (0.2 - 0.4 * np.random.rand())
+ inp_img = adjust_saturation(inp_img, sat_factor)
+ tar_img = adjust_saturation(tar_img, sat_factor)
+
+ # Data Augmentations
+ aug = random.randint(0, 8)
+ if aug == 1:
+ inp_img = vflip(inp_img)
+ tar_img = vflip(tar_img)
+ elif aug == 2:
+ inp_img = hflip(inp_img)
+ tar_img = hflip(tar_img)
+ elif aug == 3:
+ inp_img = rotate(inp_img, 90)
+ tar_img = rotate(tar_img, 90)
+ elif aug == 4:
+ inp_img = rotate(inp_img, 90 * 2)
+ tar_img = rotate(tar_img, 90 * 2)
+ elif aug == 5:
+ inp_img = rotate(inp_img, 90 * 3)
+ tar_img = rotate(tar_img, 90 * 3)
+ elif aug == 6:
+ inp_img = rotate(vflip(inp_img), 90)
+ tar_img = rotate(vflip(tar_img), 90)
+ elif aug == 7:
+ inp_img = rotate(hflip(inp_img), 90)
+ tar_img = rotate(hflip(tar_img), 90)
+
+ inp_img = to_tensor(inp_img)
+ tar_img = to_tensor(tar_img)
+
+ hh, ww = tar_img.shape[1], tar_img.shape[2]
+
+ rr = random.randint(0, hh - ps)
+ cc = random.randint(0, ww - ps)
+
+ # Crop patch
+ inp_img = inp_img[:, rr:rr + ps, cc:cc + ps]
+ tar_img = tar_img[:, rr:rr + ps, cc:cc + ps]
+
+ filename = os.path.splitext(os.path.split(tar_path)[-1])[0]
+
+ return tar_img, inp_img, filename
+
+
+@DATASETS.register()
+class MPRVal(Dataset):
+ def __init__(self, rgb_dir, img_options=None, rgb_dir2=None):
+ super(MPRVal, self).__init__()
+
+ inp_files = sorted(os.listdir(os.path.join(rgb_dir, 'input')))
+ tar_files = sorted(os.listdir(os.path.join(rgb_dir, 'target')))
+
+ self.inp_filenames = [
+ os.path.join(rgb_dir, 'input', x) for x in inp_files
+ if is_image_file(x)
+ ]
+ self.tar_filenames = [
+ os.path.join(rgb_dir, 'target', x) for x in tar_files
+ if is_image_file(x)
+ ]
+
+ self.img_options = img_options
+ self.sizex = len(self.tar_filenames) # get the size of target
+
+ self.ps = self.img_options['patch_size']
+
+ def __len__(self):
+ return self.sizex
+
+ def __getitem__(self, index):
+ index_ = index % self.sizex
+ ps = self.ps
+
+ inp_path = self.inp_filenames[index_]
+ tar_path = self.tar_filenames[index_]
+
+ inp_img = Image.open(inp_path)
+ tar_img = Image.open(tar_path)
+
+ # Validate on center crop
+ if self.ps is not None:
+ inp_img = center_crop(inp_img, (ps, ps))
+ tar_img = center_crop(tar_img, (ps, ps))
+
+ inp_img = to_tensor(inp_img)
+ tar_img = to_tensor(tar_img)
+
+ filename = os.path.splitext(os.path.split(tar_path)[-1])[0]
+
+ return tar_img, inp_img, filename
+
+
+@DATASETS.register()
+class MPRTest(Dataset):
+ def __init__(self, inp_dir, img_options):
+ super(MPRTest, self).__init__()
+
+ inp_files = sorted(os.listdir(inp_dir))
+ self.inp_filenames = [
+ os.path.join(inp_dir, x) for x in inp_files if is_image_file(x)
+ ]
+
+ self.inp_size = len(self.inp_filenames)
+ self.img_options = img_options
+
+ def __len__(self):
+ return self.inp_size
+
+ def __getitem__(self, index):
+
+ path_inp = self.inp_filenames[index]
+ filename = os.path.splitext(os.path.split(path_inp)[-1])[0]
+ inp = Image.open(path_inp)
+
+ inp = to_tensor(inp)
+ return inp, filename
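+
+
+# Note on the sample format: MPRTrain and MPRVal return (target, input, filename) with the
+# ground-truth patch first, while MPRTest returns (input, filename) only.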
diff --git a/ppgan/datasets/nafnet_dataset.py b/ppgan/datasets/nafnet_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..88cab703f0a6f293159303082add5f9f4a2aa6b9
--- /dev/null
+++ b/ppgan/datasets/nafnet_dataset.py
@@ -0,0 +1,193 @@
+# code was heavily based on https://github.com/swz30/MPRNet
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/swz30/MPRNet/blob/main/LICENSE.md
+
+import os
+import random
+import numpy as np
+from PIL import Image
+
+from paddle.io import Dataset
+from .builder import DATASETS
+from paddle.vision.transforms.functional import to_tensor, adjust_brightness, adjust_saturation, rotate, hflip, vflip, center_crop
+
+
+def is_image_file(filename):
+ return any(
+ filename.endswith(extension)
+ for extension in ['jpeg', 'JPEG', 'jpg', 'png', 'JPG', 'PNG', 'gif'])
+
+
+@DATASETS.register()
+class NAFNetTrain(Dataset):
+
+ def __init__(self, rgb_dir, img_options=None):
+ super(NAFNetTrain, self).__init__()
+
+ inp_files = sorted(os.listdir(os.path.join(rgb_dir, 'input')))
+ tar_files = sorted(os.listdir(os.path.join(rgb_dir, 'target')))
+
+ self.inp_filenames = [
+ os.path.join(rgb_dir, 'input', x) for x in inp_files
+ if is_image_file(x)
+ ]
+ self.tar_filenames = [
+ os.path.join(rgb_dir, 'target', x) for x in tar_files
+ if is_image_file(x)
+ ]
+
+ self.img_options = img_options
+ self.sizex = len(self.tar_filenames) # get the size of target
+
+ self.ps = self.img_options['patch_size']
+
+ def __len__(self):
+ return self.sizex
+
+ def __getitem__(self, index):
+ index_ = index % self.sizex
+ ps = self.ps
+
+ inp_path = self.inp_filenames[index_]
+ tar_path = self.tar_filenames[index_]
+
+ inp_img = Image.open(inp_path)
+ tar_img = Image.open(tar_path)
+
+ w, h = tar_img.size
+ padw = ps - w if w < ps else 0
+ padh = ps - h if h < ps else 0
+
+ # Reflect-pad in case the image is smaller than patch_size: convert to numpy,
+ # pad height/width, then go back to PIL so the PIL-based transforms below still
+ # apply (assumes 3-channel images)
+ if padw != 0 or padh != 0:
+ inp_img = Image.fromarray(
+ np.pad(np.array(inp_img), ((0, padh), (0, padw), (0, 0)), mode='reflect'))
+ tar_img = Image.fromarray(
+ np.pad(np.array(tar_img), ((0, padh), (0, padw), (0, 0)), mode='reflect'))
+
+ aug = random.randint(0, 2)
+ if aug == 1:
+ inp_img = adjust_brightness(inp_img, 1)
+ tar_img = adjust_brightness(tar_img, 1)
+
+ aug = random.randint(0, 2)
+ if aug == 1:
+ sat_factor = 1 + (0.2 - 0.4 * np.random.rand())
+ inp_img = adjust_saturation(inp_img, sat_factor)
+ tar_img = adjust_saturation(tar_img, sat_factor)
+
+ # Data Augmentations
+ aug = random.randint(0, 8)
+ if aug == 1:
+ inp_img = vflip(inp_img)
+ tar_img = vflip(tar_img)
+ elif aug == 2:
+ inp_img = hflip(inp_img)
+ tar_img = hflip(tar_img)
+ elif aug == 3:
+ inp_img = rotate(inp_img, 90)
+ tar_img = rotate(tar_img, 90)
+ elif aug == 4:
+ inp_img = rotate(inp_img, 90 * 2)
+ tar_img = rotate(tar_img, 90 * 2)
+ elif aug == 5:
+ inp_img = rotate(inp_img, 90 * 3)
+ tar_img = rotate(tar_img, 90 * 3)
+ elif aug == 6:
+ inp_img = rotate(vflip(inp_img), 90)
+ tar_img = rotate(vflip(tar_img), 90)
+ elif aug == 7:
+ inp_img = rotate(hflip(inp_img), 90)
+ tar_img = rotate(hflip(tar_img), 90)
+
+ inp_img = to_tensor(inp_img)
+ tar_img = to_tensor(tar_img)
+
+ hh, ww = tar_img.shape[1], tar_img.shape[2]
+
+ rr = random.randint(0, hh - ps)
+ cc = random.randint(0, ww - ps)
+
+ # Crop patch
+ inp_img = inp_img[:, rr:rr + ps, cc:cc + ps]
+ tar_img = tar_img[:, rr:rr + ps, cc:cc + ps]
+
+ filename = os.path.splitext(os.path.split(tar_path)[-1])[0]
+
+ return tar_img, inp_img, filename
+
+
+@DATASETS.register()
+class NAFNetVal(Dataset):
+
+ def __init__(self, rgb_dir, img_options=None, rgb_dir2=None):
+ super(NAFNetVal, self).__init__()
+
+ inp_files = sorted(os.listdir(os.path.join(rgb_dir, 'input')))
+ tar_files = sorted(os.listdir(os.path.join(rgb_dir, 'target')))
+
+ self.inp_filenames = [
+ os.path.join(rgb_dir, 'input', x) for x in inp_files
+ if is_image_file(x)
+ ]
+ self.tar_filenames = [
+ os.path.join(rgb_dir, 'target', x) for x in tar_files
+ if is_image_file(x)
+ ]
+
+ self.img_options = img_options
+ self.sizex = len(self.tar_filenames) # get the size of target
+
+ self.ps = self.img_options['patch_size']
+
+ def __len__(self):
+ return self.sizex
+
+ def __getitem__(self, index):
+ index_ = index % self.sizex
+ ps = self.ps
+
+ inp_path = self.inp_filenames[index_]
+ tar_path = self.tar_filenames[index_]
+
+ inp_img = Image.open(inp_path)
+ tar_img = Image.open(tar_path)
+
+ # Validate on center crop
+ if self.ps is not None:
+ inp_img = center_crop(inp_img, (ps, ps))
+ tar_img = center_crop(tar_img, (ps, ps))
+
+ inp_img = to_tensor(inp_img)
+ tar_img = to_tensor(tar_img)
+
+ filename = os.path.splitext(os.path.split(tar_path)[-1])[0]
+
+ return tar_img, inp_img, filename
+
+
+@DATASETS.register()
+class NAFNetTest(Dataset):
+
+ def __init__(self, inp_dir, img_options):
+ super(NAFNetTest, self).__init__()
+
+ inp_files = sorted(os.listdir(inp_dir))
+ self.inp_filenames = [
+ os.path.join(inp_dir, x) for x in inp_files if is_image_file(x)
+ ]
+
+ self.inp_size = len(self.inp_filenames)
+ self.img_options = img_options
+
+ def __len__(self):
+ return self.inp_size
+
+ def __getitem__(self, index):
+
+ path_inp = self.inp_filenames[index]
+ filename = os.path.splitext(os.path.split(path_inp)[-1])[0]
+ inp = Image.open(path_inp)
+
+ inp = to_tensor(inp)
+ return inp, filename
diff --git a/ppgan/datasets/paired_image_datasets.py b/ppgan/datasets/paired_image_datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdaae3c2e2ad89bdaacef3695322b1dac3c7bb21
--- /dev/null
+++ b/ppgan/datasets/paired_image_datasets.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddle.vision.transforms.functional import normalize
+
+from .builder import DATASETS
+from ppgan.utils.gfpgan_tools import *
+
+
+@DATASETS.register()
+class PairedImageDataset(paddle.io.Dataset):
+ """Paired image dataset for image restoration.
+
+ Read LQ (Low Quality, e.g. LR (Low Resolution), blurry, noisy, etc) and GT image pairs.
+
+ There are three modes:
+ 1. 'lmdb': Use lmdb files.
+ If opt['io_backend'] == lmdb.
+ 2. 'meta_info_file': Use meta information file to generate paths.
+ If opt['io_backend'] != lmdb and opt['meta_info_file'] is not None.
+ 3. 'folder': Scan folders to generate paths.
+ The rest.
+
+ Args:
+ opt (dict): Config for train datasets. It contains the following keys:
+ dataroot_gt (str): Data root path for gt.
+ dataroot_lq (str): Data root path for lq.
+ meta_info_file (str): Path for meta information file.
+ io_backend (dict): IO backend type and other kwarg.
+ filename_tmpl (str): Template for each filename. Note that the template excludes the file extension.
+ Default: '{}'.
+ gt_size (int): Cropped patched size for gt patches.
+ use_hflip (bool): Use horizontal flips.
+ use_rot (bool): Use rotation (use vertical flip and transposing h and w for implementation).
+
+ scale (bool): Scale, which will be added automatically.
+ phase (str): 'train' or 'val'.
+ """
+ def __init__(self, **opt):
+ super(PairedImageDataset, self).__init__()
+ self.opt = opt
+ # file client (io backend)
+ self.file_client = None
+ self.io_backend_opt = opt['io_backend']
+ self.mean = opt['mean'] if 'mean' in opt else None
+ self.std = opt['std'] if 'std' in opt else None
+
+ self.gt_folder, self.lq_folder = opt['dataroot_gt'], opt['dataroot_lq']
+ if 'filename_tmpl' in opt:
+ self.filename_tmpl = opt['filename_tmpl']
+ else:
+ self.filename_tmpl = '{}'
+
+ if self.io_backend_opt['type'] == 'lmdb':
+ self.io_backend_opt['db_paths'] = [self.lq_folder, self.gt_folder]
+ self.io_backend_opt['client_keys'] = ['lq', 'gt']
+ self.paths = paired_paths_from_lmdb(
+ [self.lq_folder, self.gt_folder], ['lq', 'gt'])
+ elif 'meta_info_file' in self.opt and self.opt[
+ 'meta_info_file'] is not None:
+ self.paths = paired_paths_from_meta_info_file(
+ [self.lq_folder, self.gt_folder], ['lq', 'gt'],
+ self.opt['meta_info_file'], self.filename_tmpl)
+ else:
+ self.paths = paired_paths_from_folder(
+ [self.lq_folder, self.gt_folder], ['lq', 'gt'],
+ self.filename_tmpl)
+
+ def __getitem__(self, index):
+ if self.file_client is None:
+ self.file_client = FileClient(self.io_backend_opt.pop('type'),
+ **self.io_backend_opt)
+ # print(self.file_client)
+ scale = self.opt['scale']
+
+ # Load gt and lq images. Dimension order: HWC; channel order: BGR;
+ # image range: [0, 1], float32.
+ gt_path = self.paths[index]['gt_path']
+ img_bytes = self.file_client.get(gt_path, 'gt')
+ img_gt = imfrombytes(img_bytes, float32=True)
+
+ lq_path = self.paths[index]['lq_path']
+ img_bytes = self.file_client.get(lq_path, 'lq')
+ img_lq = imfrombytes(img_bytes, float32=True)
+ # augmentation for training
+ if self.opt['phase'] == 'train':
+ gt_size = self.opt['gt_size']
+ # random crop
+ img_gt, img_lq = paired_random_crop(img_gt, img_lq, gt_size, scale,
+ gt_path)
+ # flip, rotation
+ img_gt, img_lq = augment([img_gt, img_lq], self.opt['use_hflip'],
+ self.opt['use_rot'])
+
+ # color space transform
+ if 'color' in self.opt and self.opt['color'] == 'y':
+ img_gt = bgr2ycbcr(img_gt, y_only=True)[..., None]
+ img_lq = bgr2ycbcr(img_lq, y_only=True)[..., None]
+
+ # crop the unmatched GT images during validation or testing, especially for SR benchmark datasets
+ # TODO: It is better to update the datasets, rather than force to crop
+ if self.opt['phase'] != 'train':
+ img_gt = img_gt[0:img_lq.shape[0] * scale,
+ 0:img_lq.shape[1] * scale, :]
+
+ # BGR to RGB, HWC to CHW, numpy to tensor
+ img_gt, img_lq = img2tensor([img_gt, img_lq],
+ bgr2rgb=True,
+ float32=True)
+ # normalize
+
+ if self.mean is not None or self.std is not None:
+ img_lq = normalize(img_lq, self.mean, self.std)
+ img_gt = normalize(img_gt, self.mean, self.std)
+
+ return {
+ 'lq': img_lq,
+ 'gt': img_gt,
+ 'lq_path': lq_path,
+ 'gt_path': gt_path
+ }
+
+ def __len__(self):
+ return len(self.paths)
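
For reviewers, a minimal usage sketch of the dataset above in 'folder' mode. The `dataroot_*` paths are placeholders, the `'disk'` io_backend type is an assumption, and `PairedImageDataset` is assumed to be importable/in scope; this is an illustration, not part of the patch.

```python
# Hedged usage sketch; paths and the 'disk' backend name are assumptions.
opt = dict(
    dataroot_gt='data/DIV2K/GT',      # placeholder path
    dataroot_lq='data/DIV2K/LQ_x4',   # placeholder path
    io_backend={'type': 'disk'},      # assumed backend name
    gt_size=128,
    use_hflip=True,
    use_rot=True,
    scale=4,
    phase='train',
)
dataset = PairedImageDataset(**opt)
sample = dataset[0]
# 'lq' and 'gt' are CHW tensors; the gt patch is `scale` times larger per side.
print(sample['lq'].shape, sample['gt'].shape)
```
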
diff --git a/ppgan/datasets/photopen_dataset.py b/ppgan/datasets/photopen_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..84a5a559826a0b1cfc3399aa82d37b0bf832ff51
--- /dev/null
+++ b/ppgan/datasets/photopen_dataset.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import numpy as np
+from PIL import Image, ImageOps
+import paddle
+import paddle.vision.transforms as T
+from paddle.io import Dataset
+import cv2
+import random
+
+from .builder import DATASETS
+
+logger = logging.getLogger(__name__)
+
+
+def data_transform(img, resize_w, resize_h, load_size=286, pos=[0, 0, 256, 256], flip=True, is_image=True):
+ if is_image:
+ resized = img.resize((resize_w, resize_h), Image.BICUBIC)
+ else:
+ resized = img.resize((resize_w, resize_h), Image.NEAREST)
+ croped = resized.crop((pos[0], pos[1], pos[2], pos[3]))
+ fliped = ImageOps.mirror(croped) if flip else croped
+ fliped = np.array(fliped) # transform to numpy array
+ expanded = np.expand_dims(fliped, 2) if len(fliped.shape) < 3 else fliped
+ transposed = np.transpose(expanded, (2, 0, 1)).astype('float32')
+ if is_image:
+ normalized = transposed / 255. * 2. - 1.
+ else:
+ normalized = transposed
+ return normalized
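
A small standalone sketch of what `data_transform` produces for an image input (the flip step is omitted); it mirrors the steps above rather than importing the function, so it runs with only Pillow and NumPy.

```python
# Standalone illustration of the data_transform contract for is_image=True:
# resize, crop, HWC->CHW transpose, then scale pixel values to [-1, 1].
import numpy as np
from PIL import Image

img = Image.fromarray((np.random.rand(300, 400, 3) * 255).astype(np.uint8))
resized = img.resize((286, 286), Image.BICUBIC)
cropped = resized.crop((0, 0, 256, 256))
arr = np.transpose(np.array(cropped), (2, 0, 1)).astype('float32')
normalized = arr / 255. * 2. - 1.
print(normalized.shape, normalized.min() >= -1.0, normalized.max() <= 1.0)
# -> (3, 256, 256) True True
```
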
+
+
+@DATASETS.register()
+class PhotoPenDataset(Dataset):
+ def __init__(self, content_root, load_size, crop_size):
+ super(PhotoPenDataset, self).__init__()
+ inst_dir = os.path.join(content_root, 'train_inst')
+ _, _, inst_list = next(os.walk(inst_dir))
+ self.inst_list = np.sort(inst_list)
+ self.content_root = content_root
+ self.load_size = load_size
+ self.crop_size = crop_size
+
+ def __getitem__(self, idx):
+ ins = Image.open(os.path.join(self.content_root, 'train_inst', self.inst_list[idx]))
+ img = Image.open(os.path.join(self.content_root, 'train_img', self.inst_list[idx].replace(".png", ".jpg")))
+ img = img.convert('RGB')
+
+ w, h = img.size
+ resize_w, resize_h = 0, 0
+ if w < h:
+ resize_w, resize_h = self.load_size, int(h * self.load_size / w)
+ else:
+ resize_w, resize_h = int(w * self.load_size / h), self.load_size
+ left = random.randint(0, resize_w - self.crop_size)
+ top = random.randint(0, resize_h - self.crop_size)
+ flip = False
+
+ img = data_transform(img, resize_w, resize_h, load_size=self.load_size,
+ pos=[left, top, left + self.crop_size, top + self.crop_size], flip=flip, is_image=True)
+ ins = data_transform(ins, resize_w, resize_h, load_size=self.load_size,
+ pos=[left, top, left + self.crop_size, top + self.crop_size], flip=flip, is_image=False)
+ return {'img': img, 'ins': ins, 'img_path': self.inst_list[idx]}
+
+ def __len__(self):
+ return len(self.inst_list)
+
+ def name(self):
+ return 'PhotoPenDataset'
+
+@DATASETS.register()
+class PhotoPenDataset_test(Dataset):
+ def __init__(self, content_root, load_size, crop_size):
+ super(PhotoPenDataset_test, self).__init__()
+ inst_dir = os.path.join(content_root, 'test_inst')
+ _, _, inst_list = next(os.walk(inst_dir))
+ self.inst_list = np.sort(inst_list)
+ self.content_root = content_root
+ self.load_size = load_size
+ self.crop_size = crop_size
+
+ def __getitem__(self, idx):
+ ins = Image.open(os.path.join(self.content_root, 'test_inst', self.inst_list[idx]))
+
+ w, h = ins.size
+ resize_w, resize_h = 0, 0
+ if w < h:
+ resize_w, resize_h = self.load_size, int(h * self.load_size / w)
+ else:
+ resize_w, resize_h = int(w * self.load_size / h), self.load_size
+ left = random.randint(0, resize_w - self.crop_size)
+ top = random.randint(0, resize_h - self.crop_size)
+ flip = False
+
+ ins = data_transform(ins, resize_w, resize_h, load_size=self.load_size,
+ pos=[left, top, left + self.crop_size, top + self.crop_size], flip=flip, is_image=False)
+ return {'ins': ins, 'img_path': self.inst_list[idx]}
+
+ def __len__(self):
+ return len(self.inst_list)
+
+ def name(self):
+ return 'PhotoPenDataset_test'
diff --git a/ppgan/datasets/preprocess/__init__.py b/ppgan/datasets/preprocess/__init__.py
index 883dce15d2eb48a5e00bd195fd02122809da917a..a28b503b2d6bca5c99f9722f7b6c800c031bd7fb 100644
--- a/ppgan/datasets/preprocess/__init__.py
+++ b/ppgan/datasets/preprocess/__init__.py
@@ -1,6 +1,8 @@
-from .io import LoadImageFromFile
+from .io import LoadImageFromFile, ReadImageSequence, GetNeighboringFramesIdx, GetFrameIdx, GetFrameIdxwithPadding
from .transforms import (PairedRandomCrop, PairedRandomHorizontalFlip,
PairedRandomVerticalFlip, PairedRandomTransposeHW,
- SRPairedRandomCrop, SplitPairedImage, SRNoise)
+ SRPairedRandomCrop, SplitPairedImage, SRNoise,
+ NormalizeSequence, MirrorVideoSequence,
+ TransposeSequence, PairedToTensor)
from .builder import build_preprocess
diff --git a/ppgan/datasets/preprocess/builder.py b/ppgan/datasets/preprocess/builder.py
index e25147c8c6bfb0d6206aa93a2a905ee411183040..bb6c7dec4958194b1984b8930bce15b7535facb8 100644
--- a/ppgan/datasets/preprocess/builder.py
+++ b/ppgan/datasets/preprocess/builder.py
@@ -62,3 +62,15 @@ def build_preprocess(cfg):
preproccess = Compose(preproccess)
return preproccess
+
+
+def build_transforms(cfg):
+ transforms = []
+
+ for trans_cfg in cfg:
+ temp_trans_cfg = copy.deepcopy(trans_cfg)
+ name = temp_trans_cfg.pop('name')
+ transforms.append(TRANSFORMS.get(name)(**temp_trans_cfg))
+
+ transforms = Compose(transforms)
+ return transforms
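
A hedged sketch of the config format `build_transforms` expects: a list of dicts, each with a `name` matching a registered transform plus its constructor kwargs. The transform names below are assumptions based on the registrations in ppgan/datasets/preprocess/transforms.py.

```python
# Hypothetical transform pipeline config; names must be registered in TRANSFORMS.
cfg = [
    dict(name='Resize', size=(286, 286)),
    dict(name='RandomCrop', size=(256, 256)),
    dict(name='Transpose'),
    dict(name='Normalize', mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
]
# transforms = build_transforms(cfg)   # returns a Compose over the four transforms
# out = transforms(img)                # img: HWC uint8 ndarray or PIL image
```
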
diff --git a/ppgan/datasets/preprocess/io.py b/ppgan/datasets/preprocess/io.py
index 5857d58796aa871bcdbcbcd80cfa0a93e19dc89d..e5123ed25f059eabae9dd30eda0d597c925424bc 100644
--- a/ppgan/datasets/preprocess/io.py
+++ b/ppgan/datasets/preprocess/io.py
@@ -1,5 +1,7 @@
+# code was referenced from mmcv
+import os
import cv2
-
+import numpy as np
from .builder import PREPROCESS
@@ -8,12 +10,12 @@ class LoadImageFromFile(object):
"""Load image from file.
Args:
- key (str): Keys in results to find corresponding path. Default: 'image'.
+ key (str): Keys in datas to find corresponding path. Default: 'image'.
flag (str): Loading flag for images. Default: -1.
to_rgb (str): Convert img to 'rgb' format. Default: True.
backend (str): io backend where images are store. Default: None.
save_original_img (bool): If True, maintain a copy of the image in
- `results` dict with name of `f'ori_{key}'`. Default: False.
+ `datas` dict with name of `f'ori_{key}'`. Default: False.
kwargs (dict): Args for file client.
"""
def __init__(self,
@@ -30,28 +32,298 @@ class LoadImageFromFile(object):
self.save_original_img = save_original_img
self.kwargs = kwargs
- def __call__(self, results):
+ def __call__(self, datas):
"""Call function.
Args:
- results (dict): A dict containing the necessary information and
+ datas (dict): A dict containing the necessary information and
data for augmentation.
Returns:
dict: A dict containing the processed data and information.
"""
- filepath = str(results[f'{self.key}_path'])
+ filepath = str(datas[f'{self.key}_path'])
#TODO: use file client to manage io backend
# such as opencv, pil, imdb
img = cv2.imread(filepath, self.flag)
if self.to_rgb:
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
- results[self.key] = img
- results[f'{self.key}_path'] = filepath
- results[f'{self.key}_ori_shape'] = img.shape
+ datas[self.key] = img
+ datas[f'{self.key}_path'] = filepath
+ datas[f'{self.key}_ori_shape'] = img.shape
if self.save_original_img:
- results[f'ori_{self.key}'] = img.copy()
+ datas[f'ori_{self.key}'] = img.copy()
+
+ return datas
+
+
+@PREPROCESS.register()
+class ReadImageSequence(LoadImageFromFile):
+ """Read image sequence.
+
+ It accepts a list of paths and reads one frame from each path. A list
+ of frames is returned.
+
+ Args:
+ key (str): Keys in datas to find corresponding path. Default: 'gt'.
+ flag (str): Loading flag for images. Default: 'color'.
+ to_rgb (str): Convert img to 'rgb' format. Default: True.
+ save_original_img (bool): If True, maintain a copy of the image in
+ `datas` dict with name of `f'ori_{key}'`. Default: False.
+ kwargs (dict): Args for file client.
+ """
+ def __call__(self, datas):
+ """Call function.
+
+ Args:
+ datas (dict): A dict containing the necessary information and
+ data for augmentation.
+
+ Returns:
+ dict: A dict containing the processed data and information.
+ """
+
+ filepaths = datas[f'{self.key}_path']
+ if not isinstance(filepaths, list):
+ raise TypeError(
+ f'filepath should be list, but got {type(filepaths)}')
+
+ filepaths = [str(v) for v in filepaths]
+
+ imgs = []
+ shapes = []
+ if self.save_original_img:
+ ori_imgs = []
+ for filepath in filepaths:
+ img = cv2.imread(filepath, self.flag)
+
+ if self.to_rgb:
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ imgs.append(img)
+ shapes.append(img.shape)
+ if self.save_original_img:
+ ori_imgs.append(img.copy())
+
+ datas[self.key] = imgs
+ datas[f'{self.key}_path'] = filepaths
+ datas[f'{self.key}_ori_shape'] = shapes
+ if self.save_original_img:
+ datas[f'ori_{self.key}'] = ori_imgs
+
+ return datas
+
+
+@PREPROCESS.register()
+class GetNeighboringFramesIdx:
+ """Get neighboring frame indices for a video. It also performs temporal
+ augmention with random interval.
+
+ Args:
+ interval_list (list[int]): Interval list for temporal augmentation.
+ It will randomly pick an interval from interval_list and sample
+ frame index with the interval.
+ start_idx (int): The index corresponds to the first frame in the
+ sequence. Default: 0.
+ filename_tmpl (str): Template for file name. Default: '{:08d}.png'.
+ """
+ def __init__(self, interval_list, start_idx=0, filename_tmpl='{:08d}.png'):
+ self.interval_list = interval_list
+ self.filename_tmpl = filename_tmpl
+ self.start_idx = start_idx
+
+ def __call__(self, datas):
+ """Call function.
+
+ Args:
+ datas (dict): A dict containing the necessary information and
+ data for augmentation.
+
+ Returns:
+ dict: A dict containing the processed data and information.
+ """
+
+ clip_name = datas['key']
+ interval = np.random.choice(self.interval_list)
+
+ self.sequence_length = datas['sequence_length']
+ num_frames = datas.get('num_frames', self.sequence_length)
+
+ if self.sequence_length - num_frames * interval < 0:
+ raise ValueError('The input sequence is not long enough to '
+ 'support the current choice of [interval] or '
+ '[num_frames].')
+ start_frame_idx = np.random.randint(
+ 0, self.sequence_length - num_frames * interval + 1)
+ end_frame_idx = start_frame_idx + num_frames * interval
+ neighbor_list = list(range(start_frame_idx, end_frame_idx, interval))
+ neighbor_list = [v + self.start_idx for v in neighbor_list]
+
+ lq_path_root = datas['lq_path']
+ gt_path_root = datas['gt_path']
+
+ lq_path = [
+ os.path.join(lq_path_root, clip_name, self.filename_tmpl.format(v))
+ for v in neighbor_list
+ ]
+ gt_path = [
+ os.path.join(gt_path_root, clip_name, self.filename_tmpl.format(v))
+ for v in neighbor_list
+ ]
+
+ datas['lq_path'] = lq_path
+ datas['gt_path'] = gt_path
+ datas['interval'] = interval
+
+ return datas
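
A pure-Python sketch of the sampling above, useful for sanity-checking the bounds; the numeric values are arbitrary and no ppgan import is needed.

```python
# Pick a random interval, then sample `num_frames` evenly spaced indices
# that fit inside the sequence, exactly as GetNeighboringFramesIdx does.
import numpy as np

sequence_length, num_frames, start_idx = 100, 15, 0
interval = int(np.random.choice([1, 2]))
start = np.random.randint(0, sequence_length - num_frames * interval + 1)
neighbor_list = [start_idx + v
                 for v in range(start, start + num_frames * interval, interval)]
print(interval, neighbor_list)  # always num_frames indices, all < sequence_length
```
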
+
+
+@PREPROCESS.register()
+class GetFrameIdx:
+ """Generate frame index for REDS datasets.
+
+ Args:
+ interval_list (list[int]): Interval list for temporal augmentation.
+ It will randomly pick an interval from interval_list and sample
+ frame index with the interval.
+ frames_per_clip (int): Number of frames per clip. Default: 99 for
+ REDS dataset.
+ """
+ def __init__(self, interval_list, frames_per_clip=99):
+ self.interval_list = interval_list
+ self.frames_per_clip = frames_per_clip
+
+ def __call__(self, results):
+ """Call function.
+
+ Args:
+ results (dict): A dict containing the necessary information and
+ data for augmentation.
+
+ Returns:
+ dict: A dict containing the processed data and information.
+ """
+ clip_name, frame_name = results['key'].split('/')
+ center_frame_idx = int(frame_name)
+ num_half_frames = results['num_frames'] // 2
+
+ interval = np.random.choice(self.interval_list)
+ # ensure not exceeding the borders
+ start_frame_idx = center_frame_idx - num_half_frames * interval
+ end_frame_idx = center_frame_idx + num_half_frames * interval
+ while (start_frame_idx < 0) or (end_frame_idx > self.frames_per_clip):
+ center_frame_idx = np.random.randint(0, self.frames_per_clip + 1)
+ start_frame_idx = center_frame_idx - num_half_frames * interval
+ end_frame_idx = center_frame_idx + num_half_frames * interval
+ frame_name = f'{center_frame_idx:08d}'
+ neighbor_list = list(
+ range(center_frame_idx - num_half_frames * interval,
+ center_frame_idx + num_half_frames * interval + 1, interval))
+
+ lq_path_root = results['lq_path']
+ gt_path_root = results['gt_path']
+ lq_path = [
+ os.path.join(lq_path_root, clip_name, f'{v:08d}.png')
+ for v in neighbor_list
+ ]
+ gt_path = [os.path.join(gt_path_root, clip_name, f'{frame_name}.png')]
+ results['lq_path'] = lq_path
+ results['gt_path'] = gt_path
+ results['interval'] = interval
+
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += (f'(interval_list={self.interval_list}, '
+ f'frames_per_clip={self.frames_per_clip})')
+ return repr_str
+
+
+@PREPROCESS.register()
+class GetFrameIdxwithPadding:
+ """Generate frame index with padding for REDS dataset and Vid4 dataset
+ during testing.
+
+ Args:
+ padding (str): padding mode, one of
+ 'replicate' | 'reflection' | 'reflection_circle' | 'circle'.
+
+ Examples: current_idx = 0, num_frames = 5
+ The generated frame indices under different padding mode:
+
+ replicate: [0, 0, 0, 1, 2]
+ reflection: [2, 1, 0, 1, 2]
+ reflection_circle: [4, 3, 0, 1, 2]
+ circle: [3, 4, 0, 1, 2]
+
+ filename_tmpl (str): Template for file name. Default: '{:08d}'.
+ """
+ def __init__(self, padding, filename_tmpl='{:08d}'):
+ if padding not in ('replicate', 'reflection', 'reflection_circle',
+ 'circle'):
+ raise ValueError(f'Wrong padding mode {padding}. '
+ 'Should be "replicate", "reflection", '
+ '"reflection_circle", "circle"')
+ self.padding = padding
+ self.filename_tmpl = filename_tmpl
+
+ def __call__(self, results):
+ """Call function.
+
+ Args:
+ results (dict): A dict containing the necessary information and
+ data for augmentation.
+
+ Returns:
+ dict: A dict containing the processed data and information.
+ """
+ clip_name, frame_name = results['key'].split('/')
+ current_idx = int(frame_name)
+ max_frame_num = results['max_frame_num'] - 1 # start from 0
+ num_frames = results['num_frames']
+ num_pad = num_frames // 2
+
+ frame_list = []
+ for i in range(current_idx - num_pad, current_idx + num_pad + 1):
+ if i < 0:
+ if self.padding == 'replicate':
+ pad_idx = 0
+ elif self.padding == 'reflection':
+ pad_idx = -i
+ elif self.padding == 'reflection_circle':
+ pad_idx = current_idx + num_pad - i
+ else:
+ pad_idx = num_frames + i
+ elif i > max_frame_num:
+ if self.padding == 'replicate':
+ pad_idx = max_frame_num
+ elif self.padding == 'reflection':
+ pad_idx = max_frame_num * 2 - i
+ elif self.padding == 'reflection_circle':
+ pad_idx = (current_idx - num_pad) - (i - max_frame_num)
+ else:
+ pad_idx = i - num_frames
+ else:
+ pad_idx = i
+ frame_list.append(pad_idx)
+
+ lq_path_root = results['lq_path']
+ gt_path_root = results['gt_path']
+ lq_paths = [
+ os.path.join(lq_path_root, clip_name,
+ f'{self.filename_tmpl.format(idx)}.png')
+ for idx in frame_list
+ ]
+ gt_paths = [os.path.join(gt_path_root, clip_name, f'{frame_name}.png')]
+ results['lq_path'] = lq_paths
+ results['gt_path'] = gt_paths
return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__ + f"(padding='{self.padding}')"
+ return repr_str
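
A small standalone re-implementation of the index logic above, handy for verifying the example table in the docstring; it mirrors the branches rather than importing the class.

```python
# Reproduce the padding table for current_idx = 0, num_frames = 5.
def pad_indices(current_idx, num_frames, max_frame_num, padding):
    num_pad = num_frames // 2
    out = []
    for i in range(current_idx - num_pad, current_idx + num_pad + 1):
        if i < 0:
            pad_idx = {'replicate': 0,
                       'reflection': -i,
                       'reflection_circle': current_idx + num_pad - i,
                       'circle': num_frames + i}[padding]
        elif i > max_frame_num:
            pad_idx = {'replicate': max_frame_num,
                       'reflection': max_frame_num * 2 - i,
                       'reflection_circle': (current_idx - num_pad) - (i - max_frame_num),
                       'circle': i - num_frames}[padding]
        else:
            pad_idx = i
        out.append(pad_idx)
    return out

for mode in ('replicate', 'reflection', 'reflection_circle', 'circle'):
    print(mode, pad_indices(0, 5, 99, mode))
# replicate [0, 0, 0, 1, 2], reflection [2, 1, 0, 1, 2],
# reflection_circle [4, 3, 0, 1, 2], circle [3, 4, 0, 1, 2]
```
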
diff --git a/ppgan/datasets/preprocess/transforms.py b/ppgan/datasets/preprocess/transforms.py
index 45901932d96499cb5520f5bc3b5f4cb705162a0c..064a7db78ded22a6dd82689cd34167f3a1443aa8 100644
--- a/ppgan/datasets/preprocess/transforms.py
+++ b/ppgan/datasets/preprocess/transforms.py
@@ -22,6 +22,7 @@ import numpy as np
from PIL import Image
+import paddle
import paddle.vision.transforms as T
import paddle.vision.transforms.functional as F
@@ -41,10 +42,13 @@ TRANSFORMS.register(T.RandomHorizontalFlip)
TRANSFORMS.register(T.RandomVerticalFlip)
TRANSFORMS.register(T.Normalize)
TRANSFORMS.register(T.Transpose)
+TRANSFORMS.register(T.Grayscale)
+TRANSFORMS.register(T.ToTensor)
@PREPROCESS.register()
class Transforms():
+
def __init__(self, pipeline, input_keys, output_keys=None):
self.input_keys = input_keys
self.output_keys = output_keys
@@ -54,6 +58,7 @@ class Transforms():
def __call__(self, datas):
data = []
+
for k in self.input_keys:
data.append(datas[k])
data = tuple(data)
@@ -79,6 +84,7 @@ class Transforms():
@PREPROCESS.register()
class SplitPairedImage:
+
def __init__(self, key, paired_keys=['A', 'B']):
self.key = key
self.paired_keys = paired_keys
@@ -101,6 +107,7 @@ class SplitPairedImage:
@TRANSFORMS.register()
class PairedRandomCrop(T.RandomCrop):
+
def __init__(self, size, keys=None):
super().__init__(size, keys=keys)
@@ -120,8 +127,19 @@ class PairedRandomCrop(T.RandomCrop):
return F.crop(img, i, j, h, w)
+@TRANSFORMS.register()
+class PairedToTensor(T.ToTensor):
+
+ def __init__(self, data_format='CHW', keys=None):
+ super().__init__(data_format, keys=keys)
+
+ def _apply_image(self, img):
+ return F.to_tensor(img)
+
+
@TRANSFORMS.register()
class PairedRandomHorizontalFlip(T.RandomHorizontalFlip):
+
def __init__(self, prob=0.5, keys=None):
super().__init__(prob, keys=keys)
@@ -132,12 +150,16 @@ class PairedRandomHorizontalFlip(T.RandomHorizontalFlip):
def _apply_image(self, image):
if self.params['flip']:
- return F.hflip(image)
+ if isinstance(image, list):
+ image = [F.hflip(v) for v in image]
+ else:
+ return F.hflip(image)
return image
@TRANSFORMS.register()
class PairedRandomVerticalFlip(T.RandomHorizontalFlip):
+
def __init__(self, prob=0.5, keys=None):
super().__init__(prob, keys=keys)
@@ -148,7 +170,10 @@ class PairedRandomVerticalFlip(T.RandomHorizontalFlip):
def _apply_image(self, image):
if self.params['flip']:
- return F.hflip(image)
+ if isinstance(image, list):
+ image = [F.vflip(v) for v in image]
+ else:
+ return F.vflip(image)
return image
@@ -168,6 +193,7 @@ class PairedRandomTransposeHW(T.BaseTransform):
prob (float): The propability to transpose the images.
keys (list[str]): The images to be transposed.
"""
+
def __init__(self, prob=0.5, keys=None):
self.keys = keys
self.prob = prob
@@ -179,10 +205,110 @@ class PairedRandomTransposeHW(T.BaseTransform):
def _apply_image(self, image):
if self.params['transpose']:
- image = image.transpose(1, 0, 2)
+ if isinstance(image, list):
+ image = [v.transpose(1, 0, 2) for v in image]
+ else:
+ image = image.transpose(1, 0, 2)
return image
+@TRANSFORMS.register()
+class TransposeSequence(T.Transpose):
+ """Transpose input data or a video sequence to a target format.
+ For example, most transforms use HWC mode image,
+ while the Neural Network might use CHW mode input tensor.
+ output image will be an instance of numpy.ndarray.
+
+ Args:
+ order (list|tuple, optional): Target order of input data. Default: (2, 0, 1).
+ keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
+
+ Examples:
+
+ .. code-block:: python
+
+ import numpy as np
+ from PIL import Image
+
+ transform = TransposeSequence()
+
+ fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
+
+ fake_img_seq = [fake_img, fake_img, fake_img]
+ fake_img_seq = transform(fake_img_seq)
+
+ """
+
+ def _apply_image(self, img):
+ if isinstance(img, list):
+ imgs = []
+ for im in img:
+ if F._is_tensor_image(im):
+ return im.transpose(self.order)
+
+ if F._is_pil_image(im):
+ im = np.asarray(im)
+
+ if len(im.shape) == 2:
+ im = im[..., np.newaxis]
+ imgs.append(im.transpose(self.order))
+ return imgs
+ else:
+ if F._is_tensor_image(img):
+ return img.transpose(self.order)
+
+ if F._is_pil_image(img):
+ img = np.asarray(img)
+
+ if len(img.shape) == 2:
+ img = img[..., np.newaxis]
+ return img.transpose(self.order)
+
+
+@TRANSFORMS.register()
+class NormalizeSequence(T.Normalize):
+ """Normalize the input data with mean and standard deviation.
+ Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels,
+ this transform will normalize each channel of the input data.
+ ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+ Args:
+ mean (int|float|list|tuple): Sequence of means for each channel.
+ std (int|float|list|tuple): Sequence of standard deviations for each channel.
+ data_format (str, optional): Data format of img, should be 'HWC' or
+ 'CHW'. Default: 'CHW'.
+ to_rgb (bool, optional): Whether to convert to rgb. Default: False.
+ keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
+
+ Examples:
+
+ .. code-block:: python
+
+ import numpy as np
+ from PIL import Image
+
+ normalize_seq = NormalizeSequence(mean=[127.5, 127.5, 127.5],
+ std=[127.5, 127.5, 127.5],
+ data_format='HWC')
+
+ fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
+ fake_img_seq = [fake_img, fake_img, fake_img]
+ fake_img_seq = normalize_seq(fake_img_seq)
+
+ """
+
+ def _apply_image(self, img):
+ if isinstance(img, list):
+ imgs = [
+ F.normalize(v, self.mean, self.std, self.data_format,
+ self.to_rgb) for v in img
+ ]
+ return np.stack(imgs, axis=0).astype('float32')
+
+ return F.normalize(img, self.mean, self.std, self.data_format,
+ self.to_rgb)
+
+
@TRANSFORMS.register()
class SRPairedRandomCrop(T.BaseTransform):
"""Super resolution random crop.
@@ -196,6 +322,7 @@ class SRPairedRandomCrop(T.BaseTransform):
scale (int): model upscale factor.
gt_patch_size (int): cropped gt patch size.
"""
+
def __init__(self, scale, gt_patch_size, scale_list=False, keys=None):
self.gt_patch_size = gt_patch_size
self.scale = scale
@@ -203,15 +330,19 @@ class SRPairedRandomCrop(T.BaseTransform):
self.scale_list = scale_list
def __call__(self, inputs):
- """inputs must be (lq_img, gt_img)"""
+ """inputs must be (lq_img or list[lq_img], gt_img or list[gt_img])"""
scale = self.scale
lq_patch_size = self.gt_patch_size // scale
lq = inputs[0]
gt = inputs[1]
- h_lq, w_lq, _ = lq.shape
- h_gt, w_gt, _ = gt.shape
+ if isinstance(lq, list):
+ h_lq, w_lq, _ = lq[0].shape
+ h_gt, w_gt, _ = gt[0].shape
+ else:
+ h_lq, w_lq, _ = lq.shape
+ h_gt, w_gt, _ = gt.shape
if h_gt != h_lq * scale or w_gt != w_lq * scale:
raise ValueError('scale size not match')
@@ -221,18 +352,30 @@ class SRPairedRandomCrop(T.BaseTransform):
# randomly choose top and left coordinates for lq patch
top = random.randint(0, h_lq - lq_patch_size)
left = random.randint(0, w_lq - lq_patch_size)
- # crop lq patch
- lq = lq[top:top + lq_patch_size, left:left + lq_patch_size, ...]
- # crop corresponding gt patch
- top_gt, left_gt = int(top * scale), int(left * scale)
- gt = gt[top_gt:top_gt + self.gt_patch_size,
- left_gt:left_gt + self.gt_patch_size, ...]
-
- if self.scale_list and self.scale == 4:
- lqx2 = F.resize(gt, (lq_patch_size * 2, lq_patch_size * 2),
- 'bicubic')
- outputs = (lq, lqx2, gt)
- return outputs
+
+ if isinstance(lq, list):
+ lq = [
+ v[top:top + lq_patch_size, left:left + lq_patch_size, ...]
+ for v in lq
+ ]
+ top_gt, left_gt = int(top * scale), int(left * scale)
+ gt = [
+ v[top_gt:top_gt + self.gt_patch_size,
+ left_gt:left_gt + self.gt_patch_size, ...] for v in gt
+ ]
+ else:
+ # crop lq patch
+ lq = lq[top:top + lq_patch_size, left:left + lq_patch_size, ...]
+ # crop corresponding gt patch
+ top_gt, left_gt = int(top * scale), int(left * scale)
+ gt = gt[top_gt:top_gt + self.gt_patch_size,
+ left_gt:left_gt + self.gt_patch_size, ...]
+
+ if self.scale_list and self.scale == 4:
+ lqx2 = F.resize(gt, (lq_patch_size * 2, lq_patch_size * 2),
+ 'bicubic')
+ outputs = (lq, lqx2, gt)
+ return outputs
outputs = (lq, gt)
return outputs
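
The paired-crop bookkeeping above is easy to get wrong, so here is a toy sketch of the coordinate scaling with the random offsets replaced by fixed ones; it uses only NumPy.

```python
# The LQ crop is taken first; the GT crop reuses the same top-left corner
# scaled by `scale`, so both patches cover the same image region.
import numpy as np

scale, gt_patch_size = 4, 128
lq_patch_size = gt_patch_size // scale                      # 32
lq = np.zeros((64, 64, 3), dtype=np.float32)                # toy LQ frame
gt = np.zeros((64 * scale, 64 * scale, 3), dtype=np.float32)

top, left = 10, 20                                          # random in the real transform
lq_patch = lq[top:top + lq_patch_size, left:left + lq_patch_size, :]
top_gt, left_gt = top * scale, left * scale
gt_patch = gt[top_gt:top_gt + gt_patch_size, left_gt:left_gt + gt_patch_size, :]
print(lq_patch.shape, gt_patch.shape)  # (32, 32, 3) (128, 128, 3)
```
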
@@ -246,6 +389,7 @@ class SRNoise(T.BaseTransform):
noise_path (str): directory of noise image.
size (int): cropped noise patch size.
"""
+
def __init__(self, noise_path, size, keys=None):
self.noise_path = noise_path
self.noise_imgs = sorted(glob.glob(noise_path + '*.png'))
@@ -264,3 +408,187 @@ class SRNoise(T.BaseTransform):
image = image + normed_noise
image = np.clip(image, 0., 1.)
return image
+
+
+@TRANSFORMS.register()
+class RandomResizedCropProb(T.RandomResizedCrop):
+ """RandomResizedCropProb.
+
+ Args:
+ prob (float): probability of applying the random-resized crop.
+ size (int): cropped size.
+ """
+
+ def __init__(self, prob, size, scale, ratio, interpolation, keys=None):
+ super().__init__(size, scale, ratio, interpolation)
+ self.prob = prob
+ self.keys = keys
+
+ def _apply_image(self, image):
+ if random.random() < self.prob:
+ image = super()._apply_image(image)
+ return image
+
+
+@TRANSFORMS.register()
+class Add(T.BaseTransform):
+
+ def __init__(self, value, keys=None):
+ """Initialize Add Transform
+
+ Parameters:
+ value (List[int]) -- the [r, g, b] value to be added to the image pixel-wise.
+ """
+ super().__init__(keys=keys)
+ self.value = value
+
+ def _get_params(self, inputs):
+ params = {}
+ params['value'] = self.value
+ return params
+
+ def _apply_image(self, image):
+ return np.clip(image + self.params['value'], 0, 255).astype('uint8')
+ # return custom_F.add(image, self.params['value'])
+
+
+@TRANSFORMS.register()
+class ResizeToScale(T.BaseTransform):
+
+ def __init__(self,
+ size: int,
+ scale: int,
+ interpolation='bilinear',
+ keys=None):
+ """Initialize ResizeToScale Transform
+
+ Parameters:
+ size (int|List[int]) -- the minimum target size
+ scale (int) -- the stride scale
+ interpolation (Optional[str]) -- interpolation method
+ """
+ super().__init__(keys=keys)
+ if isinstance(size, int):
+ self.size = (size, size)
+ else:
+ self.size = size
+ self.scale = scale
+ self.interpolation = interpolation
+
+ def _get_params(self, inputs):
+ image = inputs[self.keys.index('image')]
+ hw = image.shape[:2]
+ params = {}
+ params['target_size'] = self.reduce_to_scale(hw, self.size[::-1],
+ self.scale)
+ return params
+
+ @staticmethod
+ def reduce_to_scale(img_hw, min_hw, scale):
+ im_h, im_w = img_hw
+ if im_h <= min_hw[0]:
+ im_h = min_hw[0]
+ else:
+ x = im_h % scale
+ im_h = im_h - x
+
+ if im_w < min_hw[1]:
+ im_w = min_hw[1]
+ else:
+ y = im_w % scale
+ im_w = im_w - y
+ return (im_h, im_w)
+
+ def _apply_image(self, image):
+ return F.resize(image, self.params['target_size'], self.interpolation)
+
+
+@TRANSFORMS.register()
+class PairedColorJitter(T.BaseTransform):
+
+ def __init__(self,
+ brightness=0,
+ contrast=0,
+ saturation=0,
+ hue=0,
+ keys=None):
+ super().__init__(keys=keys)
+ self.brightness = T.transforms._check_input(brightness, 'brightness')
+ self.contrast = T.transforms._check_input(contrast, 'contrast')
+ self.saturation = T.transforms._check_input(saturation, 'saturation')
+ self.hue = T.transforms._check_input(hue,
+ 'hue',
+ center=0,
+ bound=(-0.5, 0.5),
+ clip_first_on_zero=False)
+
+ def _get_params(self, input):
+ """Get a randomized transform to be applied on image.
+ Arguments are same as that of __init__.
+ Returns:
+ Transform which randomly adjusts brightness, contrast and
+ saturation in a random order.
+ """
+ transforms = []
+
+ if self.brightness is not None:
+ brightness = random.uniform(self.brightness[0], self.brightness[1])
+ f = lambda img: F.adjust_brightness(img, brightness)
+ transforms.append(f)
+
+ if self.contrast is not None:
+ contrast = random.uniform(self.contrast[0], self.contrast[1])
+ f = lambda img: F.adjust_contrast(img, contrast)
+ transforms.append(f)
+
+ if self.saturation is not None:
+ saturation = random.uniform(self.saturation[0], self.saturation[1])
+ f = lambda img: F.adjust_saturation(img, saturation)
+ transforms.append(f)
+
+ if self.hue is not None:
+ hue = random.uniform(self.hue[0], self.hue[1])
+ f = lambda img: F.adjust_hue(img, hue)
+ transforms.append(f)
+
+ random.shuffle(transforms)
+ return transforms
+
+ def _apply_image(self, img):
+ for f in self.params:
+ img = f(img)
+ return img
+
+
+@TRANSFORMS.register()
+class MirrorVideoSequence:
+ """Double a short video sequences by mirroring the sequences
+
+ Example:
+ Given a sequence with N frames (x1, ..., xN), extend the
+ sequence to (x1, ..., xN, xN, ..., x1).
+
+ Args:
+ keys (list[str]): The frame lists to be extended.
+ """
+
+ def __init__(self, keys=None):
+ self.keys = keys
+
+ def __call__(self, datas):
+ """Call function.
+
+ Args:
+ datas (dict): A dict containing the necessary information and
+ data for augmentation.
+
+ Returns:
+ dict: A dict containing the processed data and information.
+ """
+ lrs, hrs = datas
+ assert isinstance(lrs, list) and isinstance(hrs, list)
+
+ lrs = lrs + lrs[::-1]
+ hrs = hrs + hrs[::-1]
+
+ return (lrs, hrs)
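
And a two-line standalone sketch of what the mirroring produces for a three-frame clip.

```python
# (x1, ..., xN) -> (x1, ..., xN, xN, ..., x1), applied to both frame lists.
lrs, hrs = ['lr1', 'lr2', 'lr3'], ['hr1', 'hr2', 'hr3']
lrs, hrs = lrs + lrs[::-1], hrs + hrs[::-1]
print(lrs)  # ['lr1', 'lr2', 'lr3', 'lr3', 'lr2', 'lr1']
```
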
diff --git a/ppgan/datasets/repeat_dataset.py b/ppgan/datasets/repeat_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f1e803500479dca9519b16fdb5548e36131d64f
--- /dev/null
+++ b/ppgan/datasets/repeat_dataset.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+
+class RepeatDataset(paddle.io.Dataset):
+ """A wrapper of repeated dataset.
+
+ The length of repeated dataset will be `times` larger than the original
+ dataset. This is useful when the data loading time is long but the dataset
+ is small. Using RepeatDataset can reduce the data loading time between
+ epochs.
+
+ Args:
+ dataset (:obj:`Dataset`): The dataset to be repeated.
+ times (int): Repeat times.
+ """
+
+ def __init__(self, dataset, times):
+ self.dataset = dataset
+ self.times = times
+
+ self._ori_len = len(self.dataset)
+
+ def __getitem__(self, idx):
+ """Get item at each call.
+
+ Args:
+ idx (int): Index for getting each item.
+ """
+ return self.dataset[idx % self._ori_len]
+
+ def __len__(self):
+ """Length of the dataset.
+
+ Returns:
+ int: Length of the dataset.
+ """
+ return self.times * self._ori_len
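
A hedged usage sketch of the wrapper above, assuming `RepeatDataset` from this module is in scope; it wraps a tiny in-memory dataset and checks that the length is multiplied while indices wrap around the original data.

```python
import paddle


class ToyDataset(paddle.io.Dataset):
    """Three-item dataset used only for this illustration."""
    def __init__(self):
        self.data = [0, 1, 2]

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)


repeated = RepeatDataset(ToyDataset(), times=4)
print(len(repeated), repeated[5])  # 12, and index 5 maps back to item 5 % 3 == 2
```
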
diff --git a/ppgan/datasets/starganv2_dataset.py b/ppgan/datasets/starganv2_dataset.py
new file mode 100755
index 0000000000000000000000000000000000000000..0985b13c62362e0f01e6e35a76af0796991a4d7f
--- /dev/null
+++ b/ppgan/datasets/starganv2_dataset.py
@@ -0,0 +1,184 @@
+# code was heavily based on https://github.com/clovaai/stargan-v2
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/clovaai/stargan-v2#license
+
+import paddle
+from .base_dataset import BaseDataset
+from .builder import DATASETS
+import os
+from itertools import chain
+from pathlib import Path
+import traceback
+import random
+import numpy as np
+from PIL import Image
+
+from paddle.io import Dataset, WeightedRandomSampler
+
+
+def listdir(dname):
+ fnames = list(
+ chain(*[
+ list(Path(dname).rglob('*.' + ext))
+ for ext in ['png', 'jpg', 'jpeg', 'JPG']
+ ]))
+ return fnames
+
+
+def _make_balanced_sampler(labels):
+ class_counts = np.bincount(labels)
+ class_weights = 1. / class_counts
+ weights = class_weights[labels]
+ return WeightedRandomSampler(weights, len(weights))
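
A quick numeric sketch of the class-balancing weights computed above: classes with fewer samples get proportionally larger per-sample weights, so each class is drawn equally often in expectation.

```python
import numpy as np

labels = np.array([0, 0, 0, 1])     # class 0 has 3 samples, class 1 has 1
class_counts = np.bincount(labels)  # [3, 1]
class_weights = 1. / class_counts   # [0.333..., 1.0]
weights = class_weights[labels]
print(weights)                      # [0.333 0.333 0.333 1.0]
```
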
+
+
+class ImageFolder(Dataset):
+ def __init__(self, root, use_sampler=False):
+ self.samples, self.targets = self._make_dataset(root)
+ self.use_sampler = use_sampler
+ if self.use_sampler:
+ self.sampler = _make_balanced_sampler(self.targets)
+ self.iter_sampler = iter(self.sampler)
+
+ def _make_dataset(self, root):
+ domains = os.listdir(root)
+ fnames, labels = [], []
+ for idx, domain in enumerate(sorted(domains)):
+ class_dir = os.path.join(root, domain)
+ cls_fnames = listdir(class_dir)
+ fnames += cls_fnames
+ labels += [idx] * len(cls_fnames)
+ return fnames, labels
+
+ def __getitem__(self, i):
+ if self.use_sampler:
+ try:
+ index = next(self.iter_sampler)
+ except StopIteration:
+ self.iter_sampler = iter(self.sampler)
+ index = next(self.iter_sampler)
+ else:
+ index = i
+ fname = self.samples[index]
+ label = self.targets[index]
+ return fname, label
+
+ def __len__(self):
+ return len(self.targets)
+
+
+class ReferenceDataset(Dataset):
+ def __init__(self, root, use_sampler=None):
+ self.samples, self.targets = self._make_dataset(root)
+ self.use_sampler = use_sampler
+ if self.use_sampler:
+ self.sampler = _make_balanced_sampler(self.targets)
+ self.iter_sampler = iter(self.sampler)
+
+ def _make_dataset(self, root):
+ domains = os.listdir(root)
+ fnames, fnames2, labels = [], [], []
+ for idx, domain in enumerate(sorted(domains)):
+ class_dir = os.path.join(root, domain)
+ cls_fnames = listdir(class_dir)
+ fnames += cls_fnames
+ fnames2 += random.sample(cls_fnames, len(cls_fnames))
+ labels += [idx] * len(cls_fnames)
+ return list(zip(fnames, fnames2)), labels
+
+ def __getitem__(self, i):
+ if self.use_sampler:
+ try:
+ index = next(self.iter_sampler)
+ except StopIteration:
+ self.iter_sampler = iter(self.sampler)
+ index = next(self.iter_sampler)
+ else:
+ index = i
+ fname, fname2 = self.samples[index]
+ label = self.targets[index]
+ return fname, fname2, label
+
+ def __len__(self):
+ return len(self.targets)
+
+
+@DATASETS.register()
+class StarGANv2Dataset(BaseDataset):
+ """
+ """
+ def __init__(self, dataroot, is_train, preprocess, test_count=0):
+ """Initialize single dataset class.
+
+ Args:
+ dataroot (str): Directory of dataset.
+ preprocess (list[dict]): A sequence of data preprocess config.
+ """
+ super(StarGANv2Dataset, self).__init__(preprocess)
+
+ self.dataroot = dataroot
+ self.is_train = is_train
+ if self.is_train:
+ self.src_loader = ImageFolder(self.dataroot, use_sampler=True)
+ self.ref_loader = ReferenceDataset(self.dataroot, use_sampler=True)
+ self.counts = len(self.src_loader)
+ else:
+ files = os.listdir(self.dataroot)
+ if 'src' in files and 'ref' in files:
+ self.src_loader = ImageFolder(os.path.join(
+ self.dataroot, 'src'))
+ self.ref_loader = ImageFolder(os.path.join(
+ self.dataroot, 'ref'))
+ else:
+ self.src_loader = ImageFolder(self.dataroot)
+ self.ref_loader = ImageFolder(self.dataroot)
+ self.counts = min(test_count, len(self.src_loader))
+ self.counts = min(self.counts, len(self.ref_loader))
+
+ def _fetch_inputs(self):
+ try:
+ x, y = next(self.iter_src)
+ except (AttributeError, StopIteration):
+ self.iter_src = iter(self.src_loader)
+ x, y = next(self.iter_src)
+ return x, y
+
+ def _fetch_refs(self):
+ try:
+ x, x2, y = next(self.iter_ref)
+ except (AttributeError, StopIteration):
+ self.iter_ref = iter(self.ref_loader)
+ x, x2, y = next(self.iter_ref)
+ return x, x2, y
+
+ def __getitem__(self, idx):
+ if self.is_train:
+ x, y = self._fetch_inputs()
+ x_ref, x_ref2, y_ref = self._fetch_refs()
+ datas = {
+ 'src_path': x,
+ 'src_cls': y,
+ 'ref_path': x_ref,
+ 'ref2_path': x_ref2,
+ 'ref_cls': y_ref,
+ }
+ else:
+ x, y = self.src_loader[idx]
+ x_ref, y_ref = self.ref_loader[idx]
+ datas = {
+ 'src_path': x,
+ 'src_cls': y,
+ 'ref_path': x_ref,
+ 'ref_cls': y_ref,
+ }
+
+ if hasattr(self, 'preprocess') and self.preprocess:
+ datas = self.preprocess(datas)
+
+ return datas
+
+ def __len__(self):
+ return self.counts
+
+ def prepare_data_infos(self, dataroot):
+ pass
diff --git a/ppgan/datasets/swinir_dataset.py b/ppgan/datasets/swinir_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..720ae6aa9dabbc246e943f59dca7017d2a9d5c96
--- /dev/null
+++ b/ppgan/datasets/swinir_dataset.py
@@ -0,0 +1,165 @@
+# code was heavily based on https://github.com/cszn/KAIR
+# MIT License
+# Copyright (c) 2019 Kai Zhang
+
+import os
+import random
+import numpy as np
+import cv2
+
+import paddle
+from paddle.io import Dataset
+
+from .builder import DATASETS
+
+
+def is_image_file(filename):
+ return any(
+ filename.endswith(extension)
+ for extension in ['jpeg', 'JPEG', 'jpg', 'png', 'JPG', 'PNG', 'gif'])
+
+
+def get_image_paths(dataroot):
+ paths = None # return None if dataroot is None
+ if isinstance(dataroot, str):
+ paths = sorted(_get_paths_from_images(dataroot))
+ elif isinstance(dataroot, list):
+ paths = []
+ for i in dataroot:
+ paths += sorted(_get_paths_from_images(i))
+ return paths
+
+
+def _get_paths_from_images(path):
+ assert os.path.isdir(path), '{:s} is not a valid directory'.format(path)
+ images = []
+ for dirpath, _, fnames in sorted(os.walk(path)):
+ for fname in sorted(fnames):
+ if is_image_file(fname):
+ img_path = os.path.join(dirpath, fname)
+ images.append(img_path)
+ assert images, '{:s} has no valid image file'.format(path)
+ return images
+
+
+def imread_uint(path, n_channels=3):
+ # input: path
+ # output: HxWx3(RGB or GGG), or HxWx1 (G)
+ if n_channels == 1:
+ img = cv2.imread(path, 0) # cv2.IMREAD_GRAYSCALE
+ img = np.expand_dims(img, axis=2) # HxWx1
+ elif n_channels == 3:
+ img = cv2.imread(path, cv2.IMREAD_UNCHANGED) # BGR or G
+ if img.ndim == 2:
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) # GGG
+ else:
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # RGB
+ return img
+
+
+def augment_img(img, mode=0):
+ if mode == 0:
+ return img
+ elif mode == 1:
+ return np.flipud(np.rot90(img))
+ elif mode == 2:
+ return np.flipud(img)
+ elif mode == 3:
+ return np.rot90(img, k=3)
+ elif mode == 4:
+ return np.flipud(np.rot90(img, k=2))
+ elif mode == 5:
+ return np.rot90(img)
+ elif mode == 6:
+ return np.rot90(img, k=2)
+ elif mode == 7:
+ return np.flipud(np.rot90(img, k=3))
+
+
+def uint2tensor3(img):
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+ return paddle.Tensor(np.ascontiguousarray(img, dtype=np.float32)).transpose(
+ [2, 0, 1]) / 255.
+
+
+def uint2single(img):
+
+ return np.float32(img / 255.)
+
+
+# convert single (HxWxC) to 3-dimensional paddle tensor
+def single2tensor3(img):
+ return paddle.Tensor(np.ascontiguousarray(img, dtype=np.float32)).transpose(
+ [2, 0, 1])
+
+
+@DATASETS.register()
+class SwinIRDataset(Dataset):
+ """ Get L/H for denosing on AWGN with fixed sigma.
+ Ref:
+ DnCNN: Beyond a Gaussian Denoiser: Residual Learning of Deep CNN for Image Denoising
+ Args:
+ opt (dict): A dictionary defining dataset-related parameters.
+ """
+
+ def __init__(self, opt=None):
+ super(SwinIRDataset, self).__init__()
+
+ print(
+ 'Dataset: Denoising on AWGN with fixed sigma. Only dataroot_H is needed.'
+ )
+ self.opt = opt
+ self.n_channels = opt['n_channels'] if opt['n_channels'] else 3
+ self.patch_size = opt['H_size'] if opt['H_size'] else 64
+ self.sigma = opt['sigma'] if opt['sigma'] else 25
+ self.sigma_test = opt['sigma_test'] if opt['sigma_test'] else self.sigma
+ self.paths_H = get_image_paths(opt['dataroot_H'])
+
+ def __len__(self):
+ return len(self.paths_H)
+
+ def __getitem__(self, index):
+ # get H image
+ H_path = self.paths_H[index]
+
+ img_H = imread_uint(H_path, self.n_channels)
+
+ L_path = H_path
+
+ if self.opt['phase'] == 'train':
+ # get L/H patch pairs
+ H, W, _ = img_H.shape
+
+ # randomly crop the patch
+ rnd_h = random.randint(0, max(0, H - self.patch_size))
+ rnd_w = random.randint(0, max(0, W - self.patch_size))
+ patch_H = img_H[rnd_h:rnd_h + self.patch_size,
+ rnd_w:rnd_w + self.patch_size, :]
+
+ # augmentation - flip, rotate
+ mode = random.randint(0, 7)
+ patch_H = augment_img(patch_H, mode=mode)
+ img_H = uint2tensor3(patch_H)
+ img_L = img_H.clone()
+
+ # add noise
+ noise = paddle.randn(img_L.shape) * self.sigma / 255.0
+ img_L = img_L + noise
+
+ else:
+ # get L/H image pairs
+ img_H = uint2single(img_H)
+ img_L = np.copy(img_H)
+
+ # add noise
+ np.random.seed(seed=0)
+ img_L += np.random.normal(0, self.sigma_test / 255.0, img_L.shape)
+
+ # HWC to CHW, numpy to tensor
+ img_L = single2tensor3(img_L)
+ img_H = single2tensor3(img_H)
+
+ filename = os.path.splitext(os.path.split(H_path)[-1])[0]
+
+ return img_H, img_L, filename
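
A hedged config sketch for `SwinIRDataset`; the directory path is a placeholder and the key names follow `__init__` above. During training the LQ image is the HQ patch plus Gaussian noise with standard deviation `sigma / 255` (images are in [0, 1]).

```python
# Hypothetical option dict; only dataroot_H must point at real data.
opt = dict(
    dataroot_H='data/trainsets/trainH',  # placeholder path
    n_channels=3,
    H_size=64,      # training patch size
    sigma=25,       # noise level used for training
    sigma_test=25,  # noise level used for validation/testing
    phase='train',
)
# dataset = SwinIRDataset(opt=opt)
# img_H, img_L, filename = dataset[0]  # CHW float32 tensors in [0, 1] (noise added to img_L)
```
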
diff --git a/ppgan/datasets/transforms/__init__.py b/ppgan/datasets/transforms/__init__.py
deleted file mode 100644
index acb1b770db0c05f74cce8e0350be8d0ef4e96b89..0000000000000000000000000000000000000000
--- a/ppgan/datasets/transforms/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .transforms import ResizeToScale, PairedRandomCrop, PairedRandomHorizontalFlip, Add
diff --git a/ppgan/datasets/transforms/functional_cv2.py b/ppgan/datasets/transforms/functional_cv2.py
deleted file mode 100644
index e688a974ae00f49e4be1099aa01a43326d347156..0000000000000000000000000000000000000000
--- a/ppgan/datasets/transforms/functional_cv2.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from __future__ import division
-import numpy as np
-
-
-def add(image, value):
- return np.clip(image + value, 0, 255).astype('uint8')
diff --git a/ppgan/datasets/transforms/transforms.py b/ppgan/datasets/transforms/transforms.py
deleted file mode 100644
index 540644acce336df4a77cdf1c207b20ad4650d1df..0000000000000000000000000000000000000000
--- a/ppgan/datasets/transforms/transforms.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import random
-import numbers
-import collections
-import numpy as np
-
-import paddle.vision.transforms as T
-import paddle.vision.transforms.functional as F
-
-from . import functional as custom_F
-from .builder import TRANSFORMS
-
-if sys.version_info < (3, 3):
- Sequence = collections.Sequence
- Iterable = collections.Iterable
-else:
- Sequence = collections.abc.Sequence
- Iterable = collections.abc.Iterable
-
-TRANSFORMS.register(T.Resize)
-TRANSFORMS.register(T.RandomCrop)
-TRANSFORMS.register(T.RandomHorizontalFlip)
-TRANSFORMS.register(T.Normalize)
-TRANSFORMS.register(T.Transpose)
-TRANSFORMS.register(T.Grayscale)
-
-
-@TRANSFORMS.register()
-class PairedRandomCrop(T.RandomCrop):
- def __init__(self, size, keys=None):
- super().__init__(size, keys=keys)
-
- if isinstance(size, int):
- self.size = (size, size)
- else:
- self.size = size
-
- def _get_params(self, inputs):
- image = inputs[self.keys.index('image')]
- params = {}
- params['crop_prams'] = self._get_param(image, self.size)
- return params
-
- def _apply_image(self, img):
- i, j, h, w = self.params['crop_prams']
- return F.crop(img, i, j, h, w)
-
-
-@TRANSFORMS.register()
-class PairedRandomHorizontalFlip(T.RandomHorizontalFlip):
- def __init__(self, prob=0.5, keys=None):
- super().__init__(prob, keys=keys)
-
- def _get_params(self, inputs):
- params = {}
- params['flip'] = random.random() < self.prob
- return params
-
- def _apply_image(self, image):
- if self.params['flip']:
- return F.hflip(image)
- return image
-
-
-@TRANSFORMS.register()
-class Add(T.BaseTransform):
- def __init__(self, value, keys=None):
- """Initialize Add Transform
-
- Parameters:
- value (List[int]) -- the [r,g,b] value will add to image by pixel wise.
- """
- super().__init__(keys=keys)
- self.value = value
-
- def _get_params(self, inputs):
- params = {}
- params['value'] = self.value
- return params
-
- def _apply_image(self, image):
- return custom_F.add(image, self.params['value'])
-
-
-@TRANSFORMS.register()
-class ResizeToScale(T.BaseTransform):
- def __init__(self,
- size: int,
- scale: int,
- interpolation='bilinear',
- keys=None):
- """Initialize ResizeToScale Transform
-
- Parameters:
- size (List[int]) -- the minimum target size
- scale (List[int]) -- the stride scale
- interpolation (Optional[str]) -- interpolation method
- """
- super().__init__(keys=keys)
- if isinstance(size, int):
- self.size = (size, size)
- else:
- self.size = size
- self.scale = scale
- self.interpolation = interpolation
-
- def _get_params(self, inputs):
- image = inputs[self.keys.index('image')]
- hw = image.shape[:2]
- params = {}
- params['taget_size'] = self.reduce_to_scale(hw, self.size[::-1],
- self.scale)
- return params
-
- @staticmethod
- def reduce_to_scale(img_hw, min_hw, scale):
- im_h, im_w = img_hw
- if im_h <= min_hw[0]:
- im_h = min_hw[0]
- else:
- x = im_h % scale
- im_h = im_h - x
-
- if im_w < min_hw[1]:
- im_w = min_hw[1]
- else:
- y = im_w % scale
- im_w = im_w - y
- return (im_h, im_w)
-
- def _apply_image(self, image):
- return F.resize(image, self.params['taget_size'], self.interpolation)
diff --git a/ppgan/datasets/vsr_folder_dataset.py b/ppgan/datasets/vsr_folder_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b38a5e713e98598fba14d33b641910a0452d5b7d
--- /dev/null
+++ b/ppgan/datasets/vsr_folder_dataset.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import glob
+import random
+import logging
+import numpy as np
+from paddle.io import Dataset
+
+from .base_sr_dataset import BaseDataset
+from .builder import DATASETS
+
+logger = logging.getLogger(__name__)
+
+
+@DATASETS.register()
+class VSRFolderDataset(BaseDataset):
+ """Video super-resolution for folder format.
+
+ Args:
+ lq_folder (str): Path to a low quality image folder.
+ gt_folder (str): Path to a ground truth image folder.
+ ann_file (str): Path to the annotation file.
+ preprocess (list[dict|callable]): A list functions of data transformations.
+ num_frames (int): Number of frames of each input clip.
+ times (int): Repeat times of the dataset length.
+ """
+ def __init__(self,
+ lq_folder,
+ gt_folder,
+ preprocess,
+ num_frames=None,
+ times=1):
+ super().__init__(preprocess)
+
+ self.lq_folder = str(lq_folder)
+ self.gt_folder = str(gt_folder)
+ self.num_frames = num_frames
+ self.times = times
+
+ self.data_infos = self.prepare_data_infos()
+
+ def prepare_data_infos(self):
+
+ sequences = sorted(glob.glob(os.path.join(self.lq_folder, '*')))
+ sep = os.path.sep
+
+ data_infos = []
+ for sequence in sequences:
+ sequence_length = len(glob.glob(os.path.join(sequence, '*.png')))
+ if self.num_frames is None:
+ num_frames = sequence_length
+ else:
+ num_frames = self.num_frames
+ data_infos.append(
+ dict(lq_path=self.lq_folder,
+ gt_path=self.gt_folder,
+ key=sequence.replace(f'{self.lq_folder}' + sep, ''),
+ num_frames=num_frames,
+ sequence_length=sequence_length))
+ return data_infos
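
A hedged sketch of the folder layout `prepare_data_infos` expects and the record it builds for each clip; the paths below are placeholders, not part of the patch.

```python
# Expected layout: one sub-folder of PNG frames per clip, mirrored in lq and gt.
#   data/lq/clip_000/00000000.png ...
#   data/gt/clip_000/00000000.png ...
#
# For each clip the method records a dict along these lines:
info = dict(
    lq_path='data/lq',     # lq_folder as passed to the dataset
    gt_path='data/gt',     # gt_folder as passed to the dataset
    key='clip_000',        # sub-folder name relative to lq_folder
    num_frames=30,         # num_frames argument, or the full sequence length if None
    sequence_length=100,   # number of PNG frames found in the clip
)
print(info['key'], info['num_frames'])
```
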
diff --git a/ppgan/datasets/vsr_reds_dataset.py b/ppgan/datasets/vsr_reds_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc665e0d656779edf869f78e8081115e4f266e2
--- /dev/null
+++ b/ppgan/datasets/vsr_reds_dataset.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from .builder import DATASETS
+from .base_sr_dataset import BaseDataset
+
+logger = logging.getLogger(__name__)
+
+
+@DATASETS.register()
+class VSRREDSDataset(BaseDataset):
+ """REDS dataset for video super resolution for Sliding-window networks.
+
+ The dataset loads several LQ (Low-Quality) frames and a center GT
+ (Ground-Truth) frame. Then it applies specified transforms and finally
+ returns a dict containing paired data and other information.
+
+ It reads REDS keys from the txt file. Each line contains a video frame path and its shape.
+
+ Examples:
+
+ 000/00000000.png (720, 1280, 3)
+ 000/00000001.png (720, 1280, 3)
+
+ Args:
+ lq_folder (str): Path to a low quality image folder.
+ gt_folder (str): Path to a ground truth image folder.
+ ann_file (str): Path to the annotation file.
+ num_frames (int): Window size for input frames.
+ preprocess (list[dict|callable]): A list functions of data transformations.
+ val_partition (str): Validation partition mode. Choices ['official' or 'REDS4']. Default: 'REDS4'.
+ test_mode (bool): Store `True` when building test dataset. Default: `False`.
+ """
+ def __init__(self,
+ lq_folder,
+ gt_folder,
+ ann_file,
+ num_frames,
+ preprocess,
+ val_partition='REDS4',
+ test_mode=False):
+ super().__init__(preprocess)
+ assert num_frames % 2 == 1, (f'num_frames should be an odd number, '
+ f'but received {num_frames}.')
+ self.lq_folder = str(lq_folder)
+ self.gt_folder = str(gt_folder)
+ self.ann_file = str(ann_file)
+ self.num_frames = num_frames
+ self.val_partition = val_partition
+ self.test_mode = test_mode
+ self.data_infos = self.prepare_data_infos()
+
+ def prepare_data_infos(self):
+ """Load annoations for REDS dataset.
+ Returns:
+ dict: Returned dict for LQ and GT pairs.
+ """
+ # get keys
+ with open(self.ann_file, 'r') as fin:
+ keys = [v.strip().split('.')[0] for v in fin]
+
+ if self.val_partition == 'REDS4':
+ val_partition = ['000', '011', '015', '020']
+ elif self.val_partition == 'official':
+ val_partition = [f'{v:03d}' for v in range(240, 270)]
+ else:
+ raise ValueError(f'Wrong validation partition {self.val_partition}. '
+ f'Supported ones are ["official", "REDS4"]')
+
+ if self.test_mode:
+ keys = [v for v in keys if v.split('/')[0] in val_partition]
+ else:
+ keys = [v for v in keys if v.split('/')[0] not in val_partition]
+
+ data_infos = []
+ for key in keys:
+ data_infos.append(
+ dict(lq_path=self.lq_folder,
+ gt_path=self.gt_folder,
+ key=key,
+ max_frame_num=100,
+ num_frames=self.num_frames))
+
+ return data_infos
diff --git a/ppgan/datasets/vsr_reds_multiple_gt_dataset.py b/ppgan/datasets/vsr_reds_multiple_gt_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..f327b10631ad15ec973934b4c1e345f493ecc0db
--- /dev/null
+++ b/ppgan/datasets/vsr_reds_multiple_gt_dataset.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from .builder import DATASETS
+from .base_sr_dataset import BaseDataset
+
+logger = logging.getLogger(__name__)
+
+
+@DATASETS.register()
+class VSRREDSMultipleGTDataset(BaseDataset):
+ """REDS dataset for video super resolution for recurrent networks.
+
+ The dataset loads several LQ (Low-Quality) frames and GT (Ground-Truth) frames.
+ Then it applies specified transforms and finally returns a dict containing
+ paired data and other information.
+
+ Args:
+ lq_folder (str): Path to a low quality image folder.
+ gt_folder (str): Path to a ground truth image folder.
+ ann_file (str): Path to the annotation file.
+ num_frames (int): Window size for input frames.
+ preprocess (list[dict|callable]): A list functions of data transformations.
+ val_partition (str): Validation partition mode. Choices ['official' or 'REDS4'].
+ Default: 'REDS4'.
+ test_mode (bool): Store `True` when building test dataset. Default: `False`.
+ """
+ def __init__(self,
+ lq_folder,
+ gt_folder,
+ ann_file,
+ num_frames,
+ preprocess,
+ val_partition='REDS4',
+ test_mode=False):
+ super().__init__(preprocess)
+ self.lq_folder = str(lq_folder)
+ self.gt_folder = str(gt_folder)
+ self.ann_file = str(ann_file)
+ self.num_frames = num_frames
+ self.val_partition = val_partition
+ self.test_mode = test_mode
+ self.data_infos = self.prepare_data_infos()
+
+ def prepare_data_infos(self):
+ """Load annoations for REDS dataset.
+
+ Returns:
+ dict: Returned dict for LQ and GT pairs.
+ """
+ # get keys
+ with open(self.ann_file, 'r') as fin:
+ keys = [v.strip().split('/')[0] for v in fin]
+ keys = list(set(keys))
+
+ if self.val_partition == 'REDS4':
+ val_partition = ['000', '011', '015', '020']
+ elif self.val_partition == 'official':
+ val_partition = [f'{v:03d}' for v in range(240, 270)]
+ else:
+ raise ValueError(f'Wrong validation partition {self.val_partition}. '
+ f'Supported ones are ["official", "REDS4"]')
+
+ if self.test_mode:
+ keys = [v for v in keys if v in val_partition]
+ else:
+ keys = [v for v in keys if v not in val_partition]
+
+ data_infos = []
+ for key in keys:
+ data_infos.append(
+ dict(lq_path=self.lq_folder,
+ gt_path=self.gt_folder,
+ key=key,
+ sequence_length=100,
+ num_frames=self.num_frames))
+
+ return data_infos
diff --git a/ppgan/datasets/vsr_vimeo90k_dataset.py b/ppgan/datasets/vsr_vimeo90k_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee141f34e7163abe30277189f2f4940b176b21e3
--- /dev/null
+++ b/ppgan/datasets/vsr_vimeo90k_dataset.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import glob
+import random
+import logging
+import numpy as np
+from paddle.io import Dataset
+
+from .base_sr_dataset import BaseDataset
+from .builder import DATASETS
+
+
+@DATASETS.register()
+class VSRVimeo90KDataset(BaseDataset):
+ """Vimeo90K dataset for video super resolution for recurrent networks.
+
+ The dataset loads several LQ (Low-Quality) frames and GT (Ground-Truth)
+ frames. Then it applies specified transforms and finally returns a dict
+ containing paired data and other information.
+
+ It reads Vimeo90K keys from the annotation txt file; each line contains one video frame folder.
+
+ Examples:
+
+ 00001/0233
+ 00001/0234
+
+ Args:
+ lq_folder (str): Path to a low quality image folder.
+ gt_folder (str): Path to a ground truth image folder.
+ ann_file (str): Path to the annotation file.
+ preprocess (list[dict|callable]): A list of functions or dicts describing the data transformations.
+ """
+ def __init__(self, lq_folder, gt_folder, ann_file, preprocess):
+ super().__init__(preprocess)
+
+ self.lq_folder = str(lq_folder)
+ self.gt_folder = str(gt_folder)
+ self.ann_file = str(ann_file)
+
+ self.data_infos = self.prepare_data_infos()
+
+ def prepare_data_infos(self):
+
+ with open(self.ann_file, 'r') as fin:
+ keys = [line.strip() for line in fin]
+
+ data_infos = []
+ for key in keys:
+ lq_paths = sorted(
+ glob.glob(os.path.join(self.lq_folder, key, '*.png')))
+ gt_paths = sorted(
+ glob.glob(os.path.join(self.gt_folder, key, '*.png')))
+
+ data_infos.append(dict(lq_path=lq_paths, gt_path=gt_paths, key=key))
+
+ return data_infos
diff --git a/ppgan/datasets/wav2lip_dataset.py b/ppgan/datasets/wav2lip_dataset.py
index fde1fb675de7070537320d02724ca6240960132f..582583edfd0e6512a563709f9bb541f70552fad3 100644
--- a/ppgan/datasets/wav2lip_dataset.py
+++ b/ppgan/datasets/wav2lip_dataset.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/Rudrabha/Wav2Lip
+# Users should be careful about adopting these functions for any commercial purposes.
+# https://github.com/Rudrabha/Wav2Lip#license-and-citation
import cv2
import random
@@ -34,7 +24,10 @@ def get_image_list(data_root, split):
for line in f:
line = line.strip()
if ' ' in line: line = line.split()[0]
- filelist.append(os.path.join(data_root, line))
+ video_path = os.path.join(data_root, line)
+ assert os.path.exists(video_path), '{} is not found'.format(
+ video_path)
+ filelist.append(video_path)
return filelist
@@ -179,14 +172,6 @@ class Wav2LipDataset(paddle.io.Dataset):
mel = np.transpose(mel)
mel = np.expand_dims(mel, 0)
indiv_mels = np.expand_dims(indiv_mels, 1)
- #np.random.seed(200)
- #x = np.random.rand(*x.shape).astype('float32')
- #np.random.seed(200)
- #mel = np.random.rand(*mel.shape)
- #np.random.seed(200)
- #indiv_mels = np.random.rand(*indiv_mels.shape)
- #np.random.seed(200)
- #y = np.random.rand(*y.shape)
return {
'x': x,
diff --git a/ppgan/engine/trainer.py b/ppgan/engine/trainer.py
old mode 100644
new mode 100755
index d3b303c9da7951a81c3665f49efad16da7a1193c..0a48a505b56ea90e40b3263a439989c624477c3b
--- a/ppgan/engine/trainer.py
+++ b/ppgan/engine/trainer.py
@@ -13,6 +13,7 @@
# limitations under the License.
import os
+import sys
import time
import copy
@@ -27,13 +28,16 @@ from ..models.builder import build_model
from ..utils.visual import tensor2img, save_image
from ..utils.filesystem import makedirs, save, load
from ..utils.timer import TimeAverager
+from ..utils.profiler import add_profiler_step
class IterLoader:
+
def __init__(self, dataloader):
self._dataloader = dataloader
self.iter_loader = iter(self._dataloader)
self._epoch = 1
+ self._inner_iter = 0
@property
def epoch(self):
@@ -41,12 +45,17 @@ class IterLoader:
def __next__(self):
try:
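+ # Windows-only workaround: skip the last batch of each epoch and restart
+ # the dataloader one step early (likely to avoid issues when the worker
+ # iterator is exhausted on Windows).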
+ if sys.platform == "Windows" and self._inner_iter == len(
+ self._dataloader) - 1:
+ self._inner_iter = 0
+ raise StopIteration
data = next(self.iter_loader)
except StopIteration:
self._epoch += 1
self.iter_loader = iter(self._dataloader)
data = next(self.iter_loader)
+ self._inner_iter += 1
return data
def __len__(self):
@@ -71,13 +80,43 @@ class Trainer:
# | ||
# save checkpoint (model.nets) \/
"""
+
def __init__(self, cfg):
+ # base config
+ self.logger = logging.getLogger(__name__)
+ self.cfg = cfg
+ self.output_dir = cfg.output_dir
+ self.max_eval_steps = cfg.model.get('max_eval_steps', None)
+
+ self.local_rank = ParallelEnv().local_rank
+ self.world_size = ParallelEnv().nranks
+ self.log_interval = cfg.log_config.interval
+ self.visual_interval = cfg.log_config.visiual_interval
+ self.weight_interval = cfg.snapshot_config.interval
+
+ self.start_epoch = 1
+ self.current_epoch = 1
+ self.current_iter = 1
+ self.inner_iter = 1
+ self.batch_id = 0
+ self.global_steps = 0
# build model
self.model = build_model(cfg.model)
- # multiple gpus prepare
- if ParallelEnv().nranks > 1:
- self.distributed_data_parallel()
+
+ # build metrics
+ self.metrics = None
+ self.is_save_img = True
+ validate_cfg = cfg.get('validate', None)
+ if validate_cfg and 'metrics' in validate_cfg:
+ self.metrics = self.model.setup_metrics(validate_cfg['metrics'])
+ if validate_cfg and 'save_img' in validate_cfg:
+ self.is_save_img = validate_cfg['save_img']
+
+ self.enable_visualdl = cfg.get('enable_visualdl', False)
+ if self.enable_visualdl:
+ import visualdl
+ self.vdl_logger = visualdl.LogWriter(logdir=cfg.output_dir)
# build train dataloader
self.train_dataloader = build_dataloader(cfg.dataset.train)
@@ -93,21 +132,17 @@ class Trainer:
self.optimizers = self.model.setup_optimizers(self.lr_schedulers,
cfg.optimizer)
- # build metrics
- self.metrics = None
- validate_cfg = cfg.get('validate', None)
- if validate_cfg and 'metrics' in validate_cfg:
- self.metrics = self.model.setup_metrics(validate_cfg['metrics'])
+ # setup amp train
+ self.scalers = self.setup_amp_train() if self.cfg.amp else None
- self.logger = logging.getLogger(__name__)
- self.enable_visualdl = cfg.get('enable_visualdl', False)
- if self.enable_visualdl:
- import visualdl
- self.vdl_logger = visualdl.LogWriter(logdir=cfg.output_dir)
+ # multiple gpus prepare
+ if ParallelEnv().nranks > 1:
+ self.distributed_data_parallel()
+
+ # evaluate only
+ if not cfg.is_train:
+ return
- # base config
- self.output_dir = cfg.output_dir
- self.max_eval_steps = cfg.model.get('max_eval_steps', None)
self.epochs = cfg.get('epochs', None)
if self.epochs:
self.total_iters = self.epochs * self.iters_per_epoch
@@ -116,34 +151,49 @@ class Trainer:
self.by_epoch = False
self.total_iters = cfg.total_iters
- self.start_epoch = 1
- self.current_epoch = 1
- self.current_iter = 1
- self.inner_iter = 1
- self.batch_id = 0
- self.global_steps = 0
- self.weight_interval = cfg.snapshot_config.interval
- if self.by_epoch:
- self.weight_interval *= self.iters_per_epoch
- self.log_interval = cfg.log_config.interval
- self.visual_interval = cfg.log_config.visiual_interval
if self.by_epoch:
self.weight_interval *= self.iters_per_epoch
self.validate_interval = -1
if cfg.get('validate', None) is not None:
self.validate_interval = cfg.validate.get('interval', -1)
- self.cfg = cfg
-
- self.local_rank = ParallelEnv().local_rank
self.time_count = {}
self.best_metric = {}
+ self.model.set_total_iter(self.total_iters)
+ self.profiler_options = cfg.profiler_options
+
+ def setup_amp_train(self):
+ """ decerate model, optimizer and return a list of GradScaler """
+ self.logger.info('use AMP to train. AMP level = {}'.format(
+ self.cfg.amp_level))
+
+ # need to decorate model and optim if amp_level == 'O2'
+ if self.cfg.amp_level == 'O2':
+ nets, optimizers = list(self.model.nets.values()), list(
+ self.optimizers.values())
+ nets, optimizers = paddle.amp.decorate(models=nets,
+ optimizers=optimizers,
+ level='O2',
+ save_dtype='float32')
+ for i, (k, _) in enumerate(self.model.nets.items()):
+ self.model.nets[k] = nets[i]
+ for i, (k, _) in enumerate(self.optimizers.items()):
+ self.optimizers[k] = optimizers[i]
+
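+ # one GradScaler per optimizer, so each network's gradients are scaled independently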
+ scalers = [
+ paddle.amp.GradScaler(init_loss_scaling=1024)
+ for i in range(len(self.optimizers))
+ ]
+
+ return scalers
def distributed_data_parallel(self):
- strategy = paddle.distributed.prepare_context()
+ paddle.distributed.init_parallel_env()
+ find_unused_parameters = self.cfg.get('find_unused_parameters', False)
for net_name, net in self.model.nets.items():
- self.model.nets[net_name] = paddle.DataParallel(net, strategy)
+ self.model.nets[net_name] = paddle.DataParallel(
+ net, find_unused_parameters=find_unused_parameters)
def learning_rate_scheduler_step(self):
if isinstance(self.model.lr_scheduler, dict):
@@ -162,9 +212,13 @@ class Trainer:
iter_loader = IterLoader(self.train_dataloader)
+ # set model.is_train = True
+ self.model.setup_train_mode(is_train=True)
while self.current_iter < (self.total_iters + 1):
self.current_epoch = iter_loader.epoch
- self.inner_iter = self.current_iter % self.iters_per_epoch
+ self.inner_iter = self.current_iter % max(self.iters_per_epoch, 1)
+
+ add_profiler_step(self.profiler_options)
start_time = step_start_time = time.time()
data = next(iter_loader)
@@ -172,11 +226,16 @@ class Trainer:
# unpack data from dataset and apply preprocessing
# data input should be dict
self.model.setup_input(data)
- self.model.train_iter(self.optimizers)
- batch_cost_averager.record(time.time() - step_start_time,
- num_samples=self.cfg.get(
- 'batch_size', 1))
+ if self.cfg.amp:
+ self.model.train_iter_amp(self.optimizers, self.scalers,
+ self.cfg.amp_level) # amp train
+ else:
+ self.model.train_iter(self.optimizers) # norm train
+
+ batch_cost_averager.record(
+ time.time() - step_start_time,
+ num_samples=self.cfg['dataset']['train'].get('batch_size', 1))
step_start_time = time.time()
@@ -189,7 +248,7 @@ class Trainer:
reader_cost_averager.reset()
batch_cost_averager.reset()
- if self.current_iter % self.visual_interval == 0:
+ if self.current_iter % self.visual_interval == 0 and self.local_rank == 0:
self.visual('visual_train')
self.learning_rate_scheduler_step()
@@ -206,8 +265,7 @@ class Trainer:
def test(self):
if not hasattr(self, 'test_dataloader'):
self.test_dataloader = build_dataloader(self.cfg.dataset.test,
- is_train=False,
- distributed=False)
+ is_train=False)
iter_loader = IterLoader(self.test_dataloader)
if self.max_eval_steps is None:
self.max_eval_steps = len(self.test_dataloader)
@@ -216,42 +274,47 @@ class Trainer:
for metric in self.metrics.values():
metric.reset()
+ # set model.is_train = False
+ self.model.setup_train_mode(is_train=False)
+
for i in range(self.max_eval_steps):
+ if self.max_eval_steps < self.log_interval or i % self.log_interval == 0:
+ self.logger.info('Test iter: [%d/%d]' %
+ (i * self.world_size,
+ self.max_eval_steps * self.world_size))
+
data = next(iter_loader)
self.model.setup_input(data)
self.model.test_iter(metrics=self.metrics)
- visual_results = {}
- current_paths = self.model.get_image_paths()
- current_visuals = self.model.get_current_visuals()
+ if self.is_save_img:
+ visual_results = {}
+ current_paths = self.model.get_image_paths()
+ current_visuals = self.model.get_current_visuals()
- if len(current_visuals) > 0 and list(
- current_visuals.values())[0].shape == 4:
- num_samples = list(current_visuals.values())[0].shape[0]
- else:
- num_samples = 1
-
- for j in range(num_samples):
- if j < len(current_paths):
- short_path = os.path.basename(current_paths[j])
- basename = os.path.splitext(short_path)[0]
+ if len(current_visuals) > 0 and len(
+ list(current_visuals.values())[0].shape) == 4:
+ num_samples = list(current_visuals.values())[0].shape[0]
else:
- basename = '{:04d}_{:04d}'.format(i, j)
- for k, img_tensor in current_visuals.items():
- name = '%s_%s' % (basename, k)
- if len(img_tensor.shape) == 4:
- visual_results.update({name: img_tensor[j]})
- else:
- visual_results.update({name: img_tensor})
+ num_samples = 1
- self.visual('visual_test',
- visual_results=visual_results,
- step=self.batch_id,
- is_save_image=True)
-
- if i % self.log_interval == 0:
- self.logger.info('Test iter: [%d/%d]' %
- (i, self.max_eval_steps))
+ for j in range(num_samples):
+ if j < len(current_paths):
+ short_path = os.path.basename(current_paths[j])
+ basename = os.path.splitext(short_path)[0]
+ else:
+ basename = '{:04d}_{:04d}'.format(i, j)
+ for k, img_tensor in current_visuals.items():
+ name = '%s_%s' % (basename, k)
+ if len(img_tensor.shape) == 4:
+ visual_results.update({name: img_tensor[j]})
+ else:
+ visual_results.update({name: img_tensor})
+
+ self.visual('visual_test',
+ visual_results=visual_results,
+ step=self.batch_id,
+ is_save_image=True)
if self.metrics:
for metric_name, metric in self.metrics.items():
@@ -286,7 +349,9 @@ class Trainer:
message += 'ips: %.5f images/s ' % self.ips
if hasattr(self, 'step_time'):
- eta = self.step_time * (self.total_iters - self.current_iter - 1)
+ eta = self.step_time * (self.total_iters - self.current_iter)
+ eta = eta if eta > 0 else 0
+
eta_str = str(datetime.timedelta(seconds=int(eta)))
message += f'eta: {eta_str}'
@@ -305,7 +370,6 @@ class Trainer:
is_save_image=False):
"""
visual the images, use visualdl or directly write to the directory
-
Parameters:
results_dir (str) -- directory name which contains saved images
visual_results (dict) -- the results images dict
@@ -334,7 +398,10 @@ class Trainer:
dataformats="HWC" if image_num == 1 else "NCHW")
else:
if self.cfg.is_train:
- msg = 'epoch%.3d_' % self.current_epoch
+ if self.by_epoch:
+ msg = 'epoch%.3d_' % self.current_epoch
+ else:
+ msg = 'iter%.3d_' % self.current_iter
else:
msg = ''
makedirs(os.path.join(self.output_dir, results_dir))
@@ -406,20 +473,39 @@ class Trainer:
def load(self, weight_path):
state_dicts = load(weight_path)
- for net_name, net in self.model.nets.items():
- if net_name in state_dicts:
- net.set_state_dict(state_dicts[net_name])
- self.logger.info(
- 'Loaded pretrained weight for net {}'.format(net_name))
+ def is_dict_in_dict_weight(state_dict):
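+ # A full checkpoint maps net names to per-net state dicts (a dict of
+ # dicts); a single-net weight file maps parameter names to tensors.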
+ if isinstance(state_dict, dict) and len(state_dict) > 0:
+ val = list(state_dict.values())[0]
+ if isinstance(val, dict):
+ return True
+ else:
+ return False
else:
- self.logger.warning(
- 'Can not find state dict of net {}. Skip load pretrained weight for net {}'
- .format(net_name, net_name))
+ return False
+
+ if is_dict_in_dict_weight(state_dicts):
+ for net_name, net in self.model.nets.items():
+ if net_name in state_dicts:
+ net.set_state_dict(state_dicts[net_name])
+ self.logger.info(
+ 'Loaded pretrained weight for net {}'.format(net_name))
+ else:
+ self.logger.warning(
+ 'Cannot find state dict of net {}. Skip loading pretrained weight for net {}'
+ .format(net_name, net_name))
+ else:
+ assert len(self.model.nets) == 1, (
+ 'checkpoint only contains the weights of one net, '
+ 'but the model contains more than one net!')
+
+ net_name, net = list(self.model.nets.items())[0]
+ net.set_state_dict(state_dicts)
+ self.logger.info(
+ 'Loaded pretrained weight for net {}'.format(net_name))
def close(self):
"""
when finish the training need close file handler or other.
-
"""
if self.enable_visualdl:
self.vdl_logger.close()
diff --git a/ppgan/faceutils/dlibutils/dlib_utils.py b/ppgan/faceutils/dlibutils/dlib_utils.py
index 665276da0a5d0acfa18c8de852c20b138b3d0eec..f9fbaed088b89e3ad542239829d9f7b579937db3 100644
--- a/ppgan/faceutils/dlibutils/dlib_utils.py
+++ b/ppgan/faceutils/dlibutils/dlib_utils.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/wtjiang98/PSGAN
+# MIT License
+# Copyright (c) 2020 Wentao Jiang
import os
import os.path as osp
diff --git a/ppgan/faceutils/dlibutils/face_align.py b/ppgan/faceutils/dlibutils/face_align.py
index 9d4063e827412b1824f1062e6fc94c103cd8282a..f18c5d9f044573c1822e464d0c182372adc9ef5c 100644
--- a/ppgan/faceutils/dlibutils/face_align.py
+++ b/ppgan/faceutils/dlibutils/face_align.py
@@ -23,7 +23,7 @@ from .dlib_utils import detect, landmarks
def align_crop(image: Image):
faces = detect(image)
- assert len(faces) > 0, 'can not detect face!!!'
+ assert len(faces) > 0, 'Cannot detect face!!!'
face = get_max_face(faces)
lms = landmarks(image, face)
@@ -43,10 +43,10 @@ def get_max_face(faces):
# find max face
areas = []
for face in faces:
- left = face.rect.left()
- top = face.rect.top()
- right = face.rect.right()
- bottom = face.rect.bottom()
+ left = face.left()
+ top = face.top()
+ right = face.right()
+ bottom = face.bottom()
areas.append((bottom - top) * (right - left))
max_face_index = np.argmax(areas)
return faces[max_face_index]
diff --git a/ppgan/faceutils/face_detection/api.py b/ppgan/faceutils/face_detection/api.py
index 0d503ee90d2ffbafb98d84cd1c3d85cbe7ecd870..608ad5b0cef1cc4f7e75df82fe018ff290603c2c 100644
--- a/ppgan/faceutils/face_detection/api.py
+++ b/ppgan/faceutils/face_detection/api.py
@@ -80,6 +80,22 @@ class FaceAlignment:
d = d[0]
d = np.clip(d, 0, None)
+ x1, y1, x2, y2 = map(int, d[:4])
+ results.append((x1, y1, x2, y2))
+
+ return results
+
+ def get_detections_for_image(self, images):
+ images = images[..., ::-1]
+ detected_faces = self.face_detector.detect_from_batch(images.copy())
+ results = []
+
+ for i, d in enumerate(detected_faces[0]):
+ if len(d) == 0:
+ results.append(None)
+ continue
+ d = np.clip(d, 0, None)
+
x1, y1, x2, y2 = map(int, d[:-1])
results.append((x1, y1, x2, y2))
diff --git a/ppgan/faceutils/face_detection/detection/blazeface/__init__.py b/ppgan/faceutils/face_detection/detection/blazeface/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2a47085c3aa15084a2dc7ec5bd593b447f838a5
--- /dev/null
+++ b/ppgan/faceutils/face_detection/detection/blazeface/__init__.py
@@ -0,0 +1 @@
+from .blazeface_detector import BlazeFaceDetector as FaceDetector
diff --git a/ppgan/faceutils/face_detection/detection/blazeface/blazeface_detector.py b/ppgan/faceutils/face_detection/detection/blazeface/blazeface_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bc58b501772ff092027ca258383d1c5512d0bdf
--- /dev/null
+++ b/ppgan/faceutils/face_detection/detection/blazeface/blazeface_detector.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import numpy as np
+
+import paddle
+from paddle.utils.download import get_weights_path_from_url
+
+from ..core import FaceDetector
+
+from .net_blazeface import BlazeFace
+from .detect import *
+
+blazeface_weights = 'https://paddlegan.bj.bcebos.com/models/blazeface.pdparams'
+blazeface_anchors = 'https://paddlegan.bj.bcebos.com/models/anchors.npy'
+
+
+class BlazeFaceDetector(FaceDetector):
+ def __init__(self,
+ path_to_detector=None,
+ path_to_anchor=None,
+ verbose=False,
+ min_score_thresh=0.5,
+ min_suppression_threshold=0.3):
+ super(BlazeFaceDetector, self).__init__(verbose)
+
+ # Initialise the face detector
+ if path_to_detector is None:
+ model_weights_path = get_weights_path_from_url(blazeface_weights)
+ model_weights = paddle.load(model_weights_path)
+ model_anchors = np.load(
+ get_weights_path_from_url(blazeface_anchors))
+ else:
+ model_weights = paddle.load(path_to_detector)
+ model_anchors = np.load(path_to_anchor)
+
+ self.face_detector = BlazeFace()
+ self.face_detector.load_dict(model_weights)
+ self.face_detector.load_anchors_from_npy(model_anchors)
+
+ self.face_detector.min_score_thresh = min_score_thresh
+ self.face_detector.min_suppression_threshold = min_suppression_threshold
+
+ self.face_detector.eval()
+
+ def detect_from_image(self, tensor_or_path):
+ image = self.tensor_or_path_to_ndarray(tensor_or_path)
+
+ bboxlist = detect(self.face_detector, image)[0]
+
+ return bboxlist
+
+ def detect_from_batch(self, tensor):
+ bboxlists = batch_detect(self.face_detector, tensor)
+ return bboxlists
+
+ @property
+ def reference_scale(self):
+ return 195
+
+ @property
+ def reference_x_shift(self):
+ return 0
+
+ @property
+ def reference_y_shift(self):
+ return 0
diff --git a/ppgan/faceutils/face_detection/detection/blazeface/detect.py b/ppgan/faceutils/face_detection/detection/blazeface/detect.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4f2b89d0b1430f08b35dac436da0d36549be6c0
--- /dev/null
+++ b/ppgan/faceutils/face_detection/detection/blazeface/detect.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+
+import cv2
+import numpy as np
+
+from .utils import *
+
+
+def detect(net, img):
+ H, W, C = img.shape
+ orig_size = min(H, W)
+ img, (xshift, yshift) = resize_and_crop_image(img, 128)
+ preds = net.predict_on_image(img.astype('float32')).numpy()
+
+ if 0 == len(preds):
+ return [[]]
+
+ shift = np.array([xshift, yshift] * 2)
+ scores = preds[:, -1:]
+
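+ # BlazeFace predicts boxes as (ymin, xmin, ymax, xmax) in normalized
+ # (relative) coordinates; reorder them to (x1, y1, x2, y2), rescale to the
+ # original image size and undo the center-crop shift from resize_and_crop_image.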
+ locs = np.concatenate(
+ (preds[:, 1:2], preds[:, 0:1], preds[:, 3:4], preds[:, 2:3]), axis=1)
+ return [np.concatenate((locs * orig_size + shift, scores), axis=1)]
+
+
+def batch_detect(net, img_batch):
+ """
+ Inputs:
+ - img_batch: a numpy array or tensor of shape (Batch size, Channels, Height, Width)
+ Outputs:
+ - list of 2-dim numpy arrays with shape (faces_on_this_image, 5): x1, y1, x2, y2, confidence
+ (x1, y1) - top left corner, (x2, y2) - bottom right corner
+ """
+ B, H, W, C = img_batch.shape
+ orig_size = min(H, W)
+
+ if isinstance(img_batch, paddle.Tensor):
+ img_batch = img_batch.numpy()
+
+ imgs, (xshift, yshift) = resize_and_crop_batch(img_batch, 128)
+ preds = net.predict_on_batch(imgs.astype('float32'))
+ bboxlists = []
+ for pred in preds:
+ pred = pred.numpy()
+ shift = np.array([xshift, yshift] * 2)
+ scores = pred[:, -1:]
+ xmin = pred[:, 1:2]
+ ymin = pred[:, 0:1]
+ xmax = pred[:, 3:4]
+ ymax = pred[:, 2:3]
+ locs = np.concatenate((xmin, ymin, xmax, ymax), axis=1)
+ bboxlists.append(
+ np.concatenate((locs * orig_size + shift, scores), axis=1))
+
+ return bboxlists
+
+
+def flip_detect(net, img):
+ img = cv2.flip(img, 1)
+ b = detect(net, img)[0]
+
+ bboxlist = np.zeros(b.shape)
+ bboxlist[:, 0] = img.shape[1] - b[:, 2]
+ bboxlist[:, 1] = b[:, 1]
+ bboxlist[:, 2] = img.shape[1] - b[:, 0]
+ bboxlist[:, 3] = b[:, 3]
+ bboxlist[:, 4] = b[:, 4]
+ return bboxlist
+
+
+def pts_to_bb(pts):
+ min_x, min_y = np.min(pts, axis=0)
+ max_x, max_y = np.max(pts, axis=0)
+ return np.array([min_x, min_y, max_x, max_y])
diff --git a/ppgan/faceutils/face_detection/detection/blazeface/net_blazeface.py b/ppgan/faceutils/face_detection/detection/blazeface/net_blazeface.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e182708c0e3e432fa1ce39b378a9fa267e28477
--- /dev/null
+++ b/ppgan/faceutils/face_detection/detection/blazeface/net_blazeface.py
@@ -0,0 +1,370 @@
+# code was heavily based on https://github.com/hollance/BlazeFace-PyTorch
+# This work is licensed under the same terms as MediaPipe (Apache License 2.0)
+# https://github.com/google/mediapipe/blob/master/LICENSE
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class BlazeBlock(nn.Layer):
+ def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
+ super(BlazeBlock, self).__init__()
+
+ self.stride = stride
+ self.channel_pad = out_channels - in_channels
+
+ if stride == 2:
+ self.max_pool = nn.MaxPool2D(kernel_size=stride, stride=stride)
+ padding = 0
+ else:
+ padding = (kernel_size - 1) // 2
+
+ self.convs = nn.Sequential(
+ nn.Conv2D(in_channels=in_channels,
+ out_channels=in_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ groups=in_channels),
+ nn.Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0),
+ )
+
+ self.act = nn.ReLU()
+
+ def forward(self, x):
+ if self.stride == 2:
+ h = F.pad(x, [0, 2, 0, 2], "constant", 0)
+ x = self.max_pool(x)
+ else:
+ h = x
+ if self.channel_pad > 0:
+ x = F.pad(x, [0, 0, 0, self.channel_pad, 0, 0, 0, 0], "constant", 0)
+
+ return self.act(self.convs(h) + x)
+
+
+class BlazeFace(nn.Layer):
+ """The BlazeFace face detection model.
+ """
+ def __init__(self):
+ super(BlazeFace, self).__init__()
+
+ self.num_classes = 1
+ self.num_anchors = 896
+ self.num_coords = 16
+ self.score_clipping_thresh = 100.0
+ self.x_scale = 128.0
+ self.y_scale = 128.0
+ self.h_scale = 128.0
+ self.w_scale = 128.0
+ self.min_score_thresh = 0.75
+ self.min_suppression_threshold = 0.3
+
+ self._define_layers()
+
+ def _define_layers(self):
+ self.backbone1 = nn.Sequential(
+ nn.Conv2D(in_channels=3,
+ out_channels=24,
+ kernel_size=5,
+ stride=2,
+ padding=0),
+ nn.ReLU(),
+ BlazeBlock(24, 24),
+ BlazeBlock(24, 28),
+ BlazeBlock(28, 32, stride=2),
+ BlazeBlock(32, 36),
+ BlazeBlock(36, 42),
+ BlazeBlock(42, 48, stride=2),
+ BlazeBlock(48, 56),
+ BlazeBlock(56, 64),
+ BlazeBlock(64, 72),
+ BlazeBlock(72, 80),
+ BlazeBlock(80, 88),
+ )
+
+ self.backbone2 = nn.Sequential(
+ BlazeBlock(88, 96, stride=2),
+ BlazeBlock(96, 96),
+ BlazeBlock(96, 96),
+ BlazeBlock(96, 96),
+ BlazeBlock(96, 96),
+ )
+
+ self.classifier_8 = nn.Conv2D(88, 2, 1)
+ self.classifier_16 = nn.Conv2D(96, 6, 1)
+
+ self.regressor_8 = nn.Conv2D(88, 32, 1)
+ self.regressor_16 = nn.Conv2D(96, 96, 1)
+
+ def forward(self, x):
+ x = F.pad(x, [1, 2, 1, 2], "constant", 0)
+
+ b = x.shape[0]
+
+ x = self.backbone1(x) # (b, 88, 16, 16)
+ h = self.backbone2(x) # (b, 96, 8, 8)
+
+ c1 = self.classifier_8(x) # (b, 2, 16, 16)
+ c1 = c1.transpose([0, 2, 3, 1]) # (b, 16, 16, 2)
+ c1 = c1.reshape([b, -1, 1]) # (b, 512, 1)
+
+ c2 = self.classifier_16(h) # (b, 6, 8, 8)
+ c2 = c2.transpose([0, 2, 3, 1]) # (b, 8, 8, 6)
+ c2 = c2.reshape([b, -1, 1]) # (b, 384, 1)
+
+ c = paddle.concat((c1, c2), axis=1) # (b, 896, 1)
+
+ r1 = self.regressor_8(x) # (b, 32, 16, 16)
+ r1 = r1.transpose([0, 2, 3, 1]) # (b, 16, 16, 32)
+ r1 = r1.reshape([b, -1, 16]) # (b, 512, 16)
+
+ r2 = self.regressor_16(h) # (b, 96, 8, 8)
+ r2 = r2.transpose([0, 2, 3, 1]) # (b, 8, 8, 96)
+ r2 = r2.reshape([b, -1, 16]) # (b, 384, 16)
+
+ r = paddle.concat((r1, r2), axis=1) # (b, 896, 16)
+ return [r, c]
+
+ def load_weights(self, path):
+ self.load_dict(paddle.load(path))
+ self.eval()
+
+ def load_anchors(self, path):
+ self.anchors = paddle.to_tensor(np.load(path), dtype='float32')
+ assert (len(self.anchors.shape) == 2)
+ assert (self.anchors.shape[0] == self.num_anchors)
+ assert (self.anchors.shape[1] == 4)
+
+ def load_anchors_from_npy(self, arr):
+ self.anchors = paddle.to_tensor(arr, dtype='float32')
+ assert (len(self.anchors.shape) == 2)
+ assert (self.anchors.shape[0] == self.num_anchors)
+ assert (self.anchors.shape[1] == 4)
+
+ def _preprocess(self, x):
+ """Converts the image pixels to the range [-1, 1]."""
+ return x.astype('float32') / 127.5 - 1.0
+
+ def predict_on_image(self, img):
+ """Makes a prediction on a single image.
+
+ Arguments:
+ img: a NumPy array of shape (H, W, 3) or a Paddle tensor of
+ shape (3, H, W). The image's height and width should be
+ 128 pixels.
+
+ Returns:
+ A tensor with face detections.
+ """
+ if isinstance(img, np.ndarray):
+ img = paddle.to_tensor(img).transpose((2, 0, 1))
+
+ return self.predict_on_batch(img.unsqueeze(0))[0]
+
+ def predict_on_batch(self, x):
+ """Makes a prediction on a batch of images.
+
+ Arguments:
+ x: a NumPy array of shape (b, H, W, 3) or a Paddle tensor of
+ shape (b, 3, H, W). The height and width should be 128 pixels.
+
+ Returns:
+ A list containing a tensor of face detections for each image in
+ the batch. If no faces are found for an image, returns a tensor
+ of shape (0, 17).
+
+ Each face detection is a Paddle tensor consisting of 17 numbers:
+ - ymin, xmin, ymax, xmax
+ - x,y-coordinates for the 6 keypoints
+ - confidence score
+ """
+ if isinstance(x, np.ndarray):
+ x = paddle.to_tensor(x).transpose((0, 3, 1, 2))
+
+ assert x.shape[1] == 3
+ assert x.shape[2] == 128
+ assert x.shape[3] == 128
+
+ x = self._preprocess(x)
+
+ with paddle.no_grad():
+ out = self.__call__(x)
+
+ detections = self._tensors_to_detections(out[0], out[1], self.anchors)
+
+ filtered_detections = []
+ for i in range(len(detections)):
+ faces = self._weighted_non_max_suppression(detections[i])
+ faces = paddle.stack(faces) if len(faces) > 0 else paddle.zeros(
+ (0, 17))
+ filtered_detections.append(faces)
+
+ return filtered_detections
+
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
+ """The output of the neural network is a tensor of shape (b, 896, 16)
+ containing the bounding box regressor predictions, as well as a tensor
+ of shape (b, 896, 1) with the classification confidences.
+
+ Returns a list of (num_detections, 17) tensors, one for each image in
+ the batch.
+ """
+ assert len(raw_box_tensor.shape) == 3
+ assert raw_box_tensor.shape[1] == self.num_anchors
+ assert raw_box_tensor.shape[2] == self.num_coords
+
+ assert len(raw_score_tensor.shape) == 3
+ assert raw_score_tensor.shape[1] == self.num_anchors
+ assert raw_score_tensor.shape[2] == self.num_classes
+
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
+
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
+
+ thresh = self.score_clipping_thresh
+ raw_score_tensor = raw_score_tensor.clip(-thresh, thresh)
+ detection_scores = F.sigmoid(raw_score_tensor).squeeze(axis=-1)
+
+ mask = detection_scores >= self.min_score_thresh
+ mask = mask.numpy()
+ detection_boxes = detection_boxes.numpy()
+ detection_scores = detection_scores.numpy()
+
+ output_detections = []
+ for i in range(raw_box_tensor.shape[0]):
+ boxes = paddle.to_tensor(detection_boxes[i, mask[i]])
+ scores = paddle.to_tensor(
+ detection_scores[i, mask[i]]).unsqueeze(axis=-1)
+ output_detections.append(paddle.concat((boxes, scores), axis=-1))
+
+ return output_detections
+
+ def _decode_boxes(self, raw_boxes, anchors):
+ """Converts the predictions into actual coordinates using
+ the anchor boxes. Processes the entire batch at once.
+ """
+ boxes = paddle.zeros_like(raw_boxes)
+
+ x_center = raw_boxes[:,:, 0] / self.x_scale * \
+ anchors[:, 2] + anchors[:, 0]
+ y_center = raw_boxes[:,:, 1] / self.y_scale * \
+ anchors[:, 3] + anchors[:, 1]
+
+ w = raw_boxes[:, :, 2] / self.w_scale * anchors[:, 2]
+ h = raw_boxes[:, :, 3] / self.h_scale * anchors[:, 3]
+
+ boxes[:, :, 0] = y_center - h / 2. # ymin
+ boxes[:, :, 1] = x_center - w / 2. # xmin
+ boxes[:, :, 2] = y_center + h / 2. # ymax
+ boxes[:, :, 3] = x_center + w / 2. # xmax
+
+ for k in range(6):
+ offset = 4 + k * 2
+ keypoint_x = raw_boxes[:,:, offset] / \
+ self.x_scale * anchors[:, 2] + anchors[:, 0]
+ keypoint_y = raw_boxes[:,:, offset + 1] / \
+ self.y_scale * anchors[:, 3] + anchors[:, 1]
+ boxes[:, :, offset] = keypoint_x
+ boxes[:, :, offset + 1] = keypoint_y
+
+ return boxes
+
+ def _weighted_non_max_suppression(self, detections):
+ """The alternative NMS method as mentioned in the BlazeFace paper:
+ The input detections should be a Tensor of shape (count, 17).
+ Returns a list of Paddle tensors, one for each detected face.
+
+ """
+ if len(detections) == 0:
+ return []
+
+ output_detections = []
+
+ # Sort the detections from highest to lowest score.
+ remaining = paddle.argsort(detections[:, 16], descending=True).numpy()
+ detections = detections.numpy()
+
+ while len(remaining) > 0:
+ detection = detections[remaining[0]]
+
+ first_box = detection[:4]
+ other_boxes = detections[remaining, :4]
+ ious = overlap_similarity(paddle.to_tensor(first_box),
+ paddle.to_tensor(other_boxes))
+
+ mask = ious > self.min_suppression_threshold
+ mask = mask.numpy()
+
+ overlapping = remaining[mask]
+ remaining = remaining[~mask]
+
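+ # Blend all overlapping detections into a single box: average the box
+ # coordinates and keypoints weighted by confidence, and report the mean
+ # confidence of the merged cluster.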
+ weighted_detection = detection.copy()
+ if len(overlapping) > 1:
+ coordinates = detections[overlapping, :16]
+ scores = detections[overlapping, 16:17]
+ total_score = scores.sum()
+ weighted = (coordinates * scores).sum(axis=0) / total_score
+ weighted_detection[:16] = weighted
+ weighted_detection[16] = total_score / len(overlapping)
+
+ output_detections.append(paddle.to_tensor(weighted_detection))
+
+ return output_detections
+
+
+def intersect(box_a, box_b):
+ """Compute the area of intersect between box_a and box_b.
+ Args:
+ box_a: (tensor) bounding boxes, Shape: [A,4].
+ box_b: (tensor) bounding boxes, Shape: [B,4].
+ Return:
+ (tensor) intersection area, Shape: [A,B].
+ """
+ A = box_a.shape[0]
+ B = box_b.shape[0]
+ max_xy = paddle.minimum(box_a[:, 2:].unsqueeze(1).expand((A, B, 2)),
+ box_b[:, 2:].unsqueeze(0).expand((A, B, 2)))
+ min_xy = paddle.maximum(box_a[:, :2].unsqueeze(1).expand((A, B, 2)),
+ box_b[:, :2].unsqueeze(0).expand((A, B, 2)))
+ inter = paddle.clip((max_xy - min_xy), min=0)
+ return inter[:, :, 0] * inter[:, :, 1]
+
+
+def jaccard(box_a, box_b):
+ """Compute the jaccard overlap of two sets of boxes.
+ Args:
+ box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
+ box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
+ Return:
+ jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
+ """
+ inter = intersect(box_a, box_b)
+ area_a = ((box_a[:, 2] - box_a[:, 0]) *
+ (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter)
+ area_b = ((box_b[:, 2] - box_b[:, 0]) *
+ (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter)
+ union = area_a + area_b - inter
+ return inter / union
+
+
+def overlap_similarity(box, other_boxes):
+ """Computes the IOU between a bounding box and set of other boxes."""
+ return jaccard(box.unsqueeze(0), other_boxes).squeeze(0)
+
+
+def init_model():
+ net = BlazeFace()
+ net.load_weights("blazeface.pdparams")
+ net.load_anchors("anchors.npy")
+
+ net.min_score_thresh = 0.75
+ net.min_suppression_threshold = 0.3
+
+ return net
diff --git a/ppgan/datasets/transforms/builder.py b/ppgan/faceutils/face_detection/detection/blazeface/utils.py
similarity index 31%
rename from ppgan/datasets/transforms/builder.py
rename to ppgan/faceutils/face_detection/detection/blazeface/utils.py
index 12b05a6c0524274e0711938c51e77ed855a056b2..a5691ce05bca650bbfd761fb5b62012e65a79ed6 100644
--- a/ppgan/datasets/transforms/builder.py
+++ b/ppgan/faceutils/face_detection/detection/blazeface/utils.py
@@ -12,47 +12,49 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import copy
-import traceback
-import paddle
-from ...utils.registry import Registry
-
-TRANSFORMS = Registry("TRANSFORMS")
-
-
-class Compose(object):
- """
- Composes several transforms together use for composing list of transforms
- together for a dataset transform.
- Args:
- transforms (list): List of transforms to compose.
- Returns:
- A compose object which is callable, __call__ for this Compose
- object will call each given :attr:`transforms` sequencely.
- """
- def __init__(self, transforms):
- self.transforms = transforms
-
- def __call__(self, data):
- for f in self.transforms:
- try:
- data = f(data)
- except Exception as e:
- print(f)
- stack_info = traceback.format_exc()
- print("fail to perform transform [{}] with error: "
- "{} and stack:\n{}".format(f, e, str(stack_info)))
- raise e
- return data
-
-
-def build_transforms(cfg):
- transforms = []
-
- for trans_cfg in cfg:
- temp_trans_cfg = copy.deepcopy(trans_cfg)
- name = temp_trans_cfg.pop('name')
- transforms.append(TRANSFORMS.get(name)(**temp_trans_cfg))
-
- transforms = Compose(transforms)
- return transforms
+import cv2
+import numpy as np
+
+
+def image_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
+ dim = None
+ (h, w) = image.shape[:2]
+
+ if width is None and height is None:
+ return image
+
+ if width is None:
+ r = height / float(h)
+ dim = (int(w * r), height)
+ else:
+ r = width / float(w)
+ dim = (width, int(h * r))
+
+ resized = cv2.resize(image, dim, interpolation=inter)
+
+ return resized
+
+
+def resize_and_crop_image(image, dim):
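+ # Resize the shorter side to `dim`, center-crop the longer side to get a
+ # dim x dim image, and return the (x, y) offset of the crop in the original
+ # image so detections can later be shifted back.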
+ if image.shape[0] > image.shape[1]:
+ img = image_resize(image, width=dim)
+ yshift, xshift = (image.shape[0] - image.shape[1]) // 2, 0
+ y_start = (img.shape[0] - img.shape[1]) // 2
+ y_end = y_start + dim
+ return img[y_start:y_end, :, :], (xshift, yshift)
+ else:
+ img = image_resize(image, height=dim)
+ yshift, xshift = 0, (image.shape[1] - image.shape[0]) // 2
+ x_start = (img.shape[1] - img.shape[0]) // 2
+ x_end = x_start + dim
+ return img[:, x_start:x_end, :], (xshift, yshift)
+
+
+def resize_and_crop_batch(frames, dim):
+ smframes = []
+ xshift, yshift = 0, 0
+ for i in range(len(frames)):
+ smframe, (xshift, yshift) = resize_and_crop_image(frames[i], dim)
+ smframes.append(smframe)
+ smframes = np.stack(smframes)
+ return smframes, (xshift, yshift)
diff --git a/ppgan/faceutils/face_detection/detection/core.py b/ppgan/faceutils/face_detection/detection/core.py
index b9988f8d707de136057a7fb043cc568baf2b2f2c..adb541ceb3464032c9e83b3b9e4473619fb4e823 100644
--- a/ppgan/faceutils/face_detection/detection/core.py
+++ b/ppgan/faceutils/face_detection/detection/core.py
@@ -134,7 +134,7 @@ class FaceDetector(object):
tensor_or_path)[..., ::-1]
elif isinstance(
tensor_or_path,
- (paddle.fluid.framework.Variable, paddle.fluid.core.VarBase)):
+ (paddle.static.Variable, paddle.Tensor)):
# Call cpu in case its coming from cuda
return tensor_or_path.numpy()[
..., ::-1].copy() if not rgb else tensor_or_path.numpy()
diff --git a/ppgan/faceutils/face_detection/detection/sfd/bbox.py b/ppgan/faceutils/face_detection/detection/sfd/bbox.py
index 02b21a46846f579031a1870f6eaa60c81f9d4f71..e268a29be681c0186742dbd4e56ee902043b946d 100644
--- a/ppgan/faceutils/face_detection/detection/sfd/bbox.py
+++ b/ppgan/faceutils/face_detection/detection/sfd/bbox.py
@@ -1,16 +1,5 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/clovaai/EXTD_Pytorch
+# Copyright (c) 2019-present NAVER Corp.
from __future__ import print_function
import os
diff --git a/ppgan/faceutils/face_detection/utils.py b/ppgan/faceutils/face_detection/utils.py
index 6590f9662252d26dd67c4adc2e82b3b86a8123a1..570b2c259222ffde1876b3f459b29bed8f3653e0 100644
--- a/ppgan/faceutils/face_detection/utils.py
+++ b/ppgan/faceutils/face_detection/utils.py
@@ -59,7 +59,7 @@ def crop(image, center, scale, resolution=256.0):
dtype=np.int32)
newImg = np.zeros(newDim, dtype=np.uint8)
else:
- newDim = np.array([br[1] - ul[1], br[0] - ul[0]], dtype=np.int)
+ newDim = np.array([br[1] - ul[1], br[0] - ul[0]], dtype=np.int_)
newImg = np.zeros(newDim, dtype=np.uint8)
ht = image.shape[0]
wd = image.shape[1]
diff --git a/ppgan/faceutils/face_enhancement/__init__.py b/ppgan/faceutils/face_enhancement/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f429a8268ef0ba4e46d57d28aaf970d230a493a4
--- /dev/null
+++ b/ppgan/faceutils/face_enhancement/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .face_enhance import FaceEnhancement
diff --git a/ppgan/faceutils/face_enhancement/face_enhance.py b/ppgan/faceutils/face_enhancement/face_enhance.py
new file mode 100644
index 0000000000000000000000000000000000000000..055fc0bad73c420d78ad6c0606fce2f6b7cc1a40
--- /dev/null
+++ b/ppgan/faceutils/face_enhancement/face_enhance.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import math
+import cv2
+import numpy as np
+from ppgan.utils.download import get_path_from_url
+from ppgan.models.generators import GPEN
+from ppgan.faceutils.face_detection.detection.blazeface.utils import *
+
+GPEN_weights = 'https://paddlegan.bj.bcebos.com/models/GPEN-512.pdparams'
+
+
+class FaceEnhancement(object):
+ def __init__(self,
+ path_to_enhance=None,
+ size=512,
+ batch_size=1):
+ super(FaceEnhancement, self).__init__()
+
+ # Initialise the face detector
+ if path_to_enhance is None:
+ model_weights_path = get_path_from_url(GPEN_weights)
+ model_weights = paddle.load(model_weights_path)
+ else:
+ model_weights = paddle.load(path_to_enhance)
+
+ self.face_enhance = GPEN(size=512, style_dim=512, n_mlp=8)
+ self.face_enhance.load_dict(model_weights)
+ self.face_enhance.eval()
+ self.size = size
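+ # Feathered blending mask: ones over the face region, blurred twice so the
+ # enhanced crop fades smoothly into the original image near the borders.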
+ self.mask = np.zeros((512, 512), np.float32)
+ cv2.rectangle(self.mask, (26, 26), (486, 486), (1, 1, 1), -1, cv2.LINE_AA)
+ self.mask = cv2.GaussianBlur(self.mask, (101, 101), 11)
+ self.mask = cv2.GaussianBlur(self.mask, (101, 101), 11)
+ self.mask = paddle.tile(paddle.to_tensor(self.mask).unsqueeze(0).unsqueeze(-1), repeat_times=[batch_size,1,1,3]).numpy()
+
+
+ def enhance_from_image(self, img):
+ if isinstance(img, np.ndarray):
+ img, _ = resize_and_crop_image(img, 512)
+ img = paddle.to_tensor(img).transpose([2, 0, 1])
+ else:
+ assert img.shape == [3, 512, 512]
+ return self.enhance_from_batch(img.unsqueeze(0))[0]
+
+ def enhance_from_batch(self, img):
+ if isinstance(img, np.ndarray):
+ img_ori, _ = resize_and_crop_batch(img, 512)
+ img = paddle.to_tensor(img_ori).transpose([0, 3, 1, 2])
+ else:
+ assert img.shape[1:] == [3, 512, 512]
+ img_ori = img.transpose([0, 2, 3, 1]).numpy()
+ img_t = (img/255. - 0.5) / 0.5
+
+ with paddle.no_grad():
+ out, __ = self.face_enhance(img_t)
+
+ image_tensor = out * 0.5 + 0.5
+ image_tensor = image_tensor.transpose([0, 2, 3, 1]) # RGB
+ image_numpy = paddle.clip(image_tensor, 0, 1) * 255.0
+
+ out = image_numpy.astype(np.uint8).cpu().numpy()
+ return out * self.mask + (1-self.mask) * img_ori
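+
+# Illustrative usage (a sketch, not part of the original code): given an
+# H x W x 3 numpy image `img` (e.g. float32 RGB values in [0, 255]),
+#     enhancer = FaceEnhancement()
+#     restored = enhancer.enhance_from_image(img)
+# returns a 512 x 512 x 3 array with the enhanced face blended back through
+# the feathered mask built above.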
diff --git a/ppgan/faceutils/face_enhancement/gfpgan_enhance.py b/ppgan/faceutils/face_enhancement/gfpgan_enhance.py
new file mode 100644
index 0000000000000000000000000000000000000000..707e5c1c5bfe438892590378566c47847b27dd57
--- /dev/null
+++ b/ppgan/faceutils/face_enhancement/gfpgan_enhance.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import cv2
+import numpy as np
+import sys
+
+import paddle
+import paddle.nn as nn
+
+from ppgan.utils.visual import *
+from ppgan.utils.download import get_path_from_url
+from ppgan.models.generators import GFPGANv1Clean
+from ppgan.models.generators import GFPGANv1
+from ppgan.faceutils.face_detection.detection.blazeface.utils import *
+GFPGAN_weights = 'https://paddlegan.bj.bcebos.com/models/GFPGAN.pdparams'
+
+
+class gfp_FaceEnhancement(object):
+ def __init__(self, size=512, batch_size=1):
+ super(gfp_FaceEnhancement, self).__init__()
+
+ # Initialise the face enhancement model
+ model_weights_path = get_path_from_url(GFPGAN_weights)
+ model_weights = paddle.load(model_weights_path)
+
+ self.face_enhance = GFPGANv1(out_size=512,
+ num_style_feat=512,
+ channel_multiplier=1,
+ resample_kernel=[1, 3, 3, 1],
+ decoder_load_path=None,
+ fix_decoder=True,
+ num_mlp=8,
+ lr_mlp=0.01,
+ input_is_latent=True,
+ different_w=True,
+ narrow=1,
+ sft_half=True)
+ self.face_enhance.load_dict(model_weights['net_g_ema'])
+ self.face_enhance.eval()
+ self.size = size
+ self.mask = np.zeros((512, 512), np.float32)
+ cv2.rectangle(self.mask, (26, 26), (486, 486), (1, 1, 1), -1,
+ cv2.LINE_AA)
+ self.mask = cv2.GaussianBlur(self.mask, (101, 101), 11)
+ self.mask = cv2.GaussianBlur(self.mask, (101, 101), 11)
+ self.mask = paddle.tile(paddle.to_tensor(
+ self.mask).unsqueeze(0).unsqueeze(-1),
+ repeat_times=[batch_size, 1, 1, 3]).numpy()
+
+ def enhance_from_image(self, img):
+ if isinstance(img, np.ndarray):
+ img, _ = resize_and_crop_image(img, 512)
+ img = paddle.to_tensor(img).transpose([2, 0, 1])
+
+ else:
+ assert img.shape == [3, 512, 512]
+ return self.enhance_from_batch(img.unsqueeze(0))[0]
+
+ def enhance_from_batch(self, img):
+ if isinstance(img, np.ndarray):
+ img_ori, _ = resize_and_crop_batch(img, 512)
+ img = paddle.to_tensor(img_ori).transpose([0, 3, 1, 2])
+ else:
+ assert img.shape[1:] == [3, 512, 512]
+ img_ori = img.transpose([0, 2, 3, 1]).numpy()
+ img_t = (img / 255. - 0.5) / 0.5
+
+ with paddle.no_grad():
+ out, __ = self.face_enhance(img_t)
+ image_tensor = out * 0.5 + 0.5
+ image_tensor = image_tensor.transpose([0, 2, 3, 1]) # RGB
+ image_numpy = paddle.clip(image_tensor, 0, 1) * 255.0
+
+ out = image_numpy.astype(np.uint8).cpu().numpy()
+ return out * self.mask + (1 - self.mask) * img_ori
diff --git a/ppgan/faceutils/mask/face_parser.py b/ppgan/faceutils/mask/face_parser.py
index 38dd9a4d894d0daf8b68e339abde734533b9201c..c01701979914f0643bbc7f898f66002d929e9095 100644
--- a/ppgan/faceutils/mask/face_parser.py
+++ b/ppgan/faceutils/mask/face_parser.py
@@ -1,16 +1,7 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/wtjiang98/PSGAN
+# MIT License
+# Copyright (c) 2020 Wentao Jiang
+
import os.path as osp
diff --git a/ppgan/faceutils/mask/model.py b/ppgan/faceutils/mask/model.py
index a61f7979b1a7758b622abda4f49c4eca24cdd8cf..9bb6294047e5b005918aa0dba2871680fbf9bf44 100644
--- a/ppgan/faceutils/mask/model.py
+++ b/ppgan/faceutils/mask/model.py
@@ -1,16 +1,7 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/wtjiang98/PSGAN
+# MIT License
+# Copyright (c) 2020 Wentao Jiang
+
import paddle
import paddle.nn as nn
diff --git a/ppgan/metrics/__init__.py b/ppgan/metrics/__init__.py
index 88afbd621fa8b9b7d4f6405e14aae406f5caa0f0..f14ef8d2575b7c48cd0f94b84b458b065cd58a67 100644
--- a/ppgan/metrics/__init__.py
+++ b/ppgan/metrics/__init__.py
@@ -13,4 +13,6 @@
# limitations under the License.
from .psnr_ssim import PSNR, SSIM
+from .fid import FID
+from .lpips import LPIPSMetric
from .builder import build_metric
diff --git a/ppgan/metrics/compute_fid.py b/ppgan/metrics/fid.py
similarity index 72%
rename from ppgan/metrics/compute_fid.py
rename to ppgan/metrics/fid.py
index da213d2586fc9da41b39fb5afb24af6da458cf6a..5fc8b473b267b3f54a1c38877a501b722a089e34 100644
--- a/ppgan/metrics/compute_fid.py
+++ b/ppgan/metrics/fid.py
@@ -1,27 +1,19 @@
-#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# code was heavily based on https://github.com/mseitzer/pytorch-fid
+# This implementation is licensed under the Apache License 2.0.
+# Copyright (c) mseitzer
+
import os
import fnmatch
import numpy as np
import cv2
+import paddle
from PIL import Image
from cv2 import imread
from scipy import linalg
-import paddle.fluid as fluid
-from inception import InceptionV3
-from paddle.fluid.dygraph.base import to_variable
+from .inception import InceptionV3
+from paddle.utils.download import get_weights_path_from_url
+from .builder import METRICS
try:
from tqdm import tqdm
@@ -36,6 +28,52 @@ except:
"""
inceptionV3 pretrain model is convert from pytorch, pretrain_model url is https://paddle-gan-models.bj.bcebos.com/params_inceptionV3.tar.gz
"""
+INCEPTIONV3_WEIGHT_URL = "https://paddlegan.bj.bcebos.com/InceptionV3.pdparams"
+
+
+@METRICS.register()
+class FID(paddle.metric.Metric):
+ def __init__(self,
+ batch_size=1,
+ use_GPU=True,
+ dims=2048,
+ premodel_path=None,
+ model=None):
+ self.batch_size = batch_size
+ self.use_GPU = use_GPU
+ self.dims = dims
+ self.premodel_path = premodel_path
+ if model is None:
+ block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
+ model = InceptionV3([block_idx], normalize_input=False)
+ if premodel_path is None:
+ premodel_path = get_weights_path_from_url(INCEPTIONV3_WEIGHT_URL)
+ self.model = model
+ param_dict = paddle.load(premodel_path)
+ self.model.load_dict(param_dict)
+ self.model.eval()
+ self.reset()
+
+ def reset(self):
+ self.preds = []
+ self.gts = []
+ self.results = []
+
+ def update(self, preds, gts):
+ preds_inception, gts_inception = calculate_inception_val(
+ preds, gts, self.batch_size, self.model, self.use_GPU, self.dims)
+ self.preds.append(preds_inception)
+ self.gts.append(gts_inception)
+
+ def accumulate(self):
+ self.preds = np.concatenate(self.preds, axis=0)
+ self.gts = np.concatenate(self.gts, axis=0)
+ value = calculate_fid_given_img(self.preds, self.gts)
+ self.reset()
+ return value
+
+ def name(self):
+ return 'FID'
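+
+# Illustrative usage (a sketch, not part of the original code): `fake` and
+# `real` are float32 numpy image batches of shape (N, H, W, 3) or (N, 3, H, W).
+#     fid = FID(batch_size=8)
+#     fid.update(fake, real)    # may be called once per batch of results
+#     score = fid.accumulate()  # pools activations, computes FID, then resets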
def _calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
@@ -65,21 +103,19 @@ def _calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
m = np.max(np.abs(covmean.imag))
raise ValueError('Imaginary component {}'.format(m))
covmean = covmean.real
-
tr_covmean = np.trace(covmean)
return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) -
2 * tr_covmean)
-def _get_activations_from_ims(img, model, batch_size, dims, use_gpu,
- premodel_path):
+def _get_activations_from_ims(img, model, batch_size, dims, use_gpu):
n_batches = (len(img) + batch_size - 1) // batch_size
n_used_img = len(img)
pred_arr = np.empty((n_used_img, dims))
- for i in tqdm(range(n_batches)):
+ for i in range(n_batches):
start = i * batch_size
end = start + batch_size
if end > len(img):
@@ -87,47 +123,36 @@ def _get_activations_from_ims(img, model, batch_size, dims, use_gpu,
images = img[start:end]
if images.shape[1] != 3:
images = images.transpose((0, 3, 1, 2))
- images /= 255
- images = to_variable(images)
- param_dict, _ = fluid.load_dygraph(premodel_path)
- model.set_dict(param_dict)
- model.eval()
+ images = paddle.to_tensor(images)
pred = model(images)[0][0]
- pred_arr[start:end] = pred.reshape(end - start, -1)
-
+ pred_arr[start:end] = pred.reshape([end - start, -1]).cpu().numpy()
return pred_arr
-def _compute_statistic_of_img(img, model, batch_size, dims, use_gpu,
- premodel_path):
- act = _get_activations_from_ims(img, model, batch_size, dims, use_gpu,
- premodel_path)
+def _compute_statistic_of_img(act):
mu = np.mean(act, axis=0)
sigma = np.cov(act, rowvar=False)
return mu, sigma
-def calculate_fid_given_img(img_fake,
+def calculate_inception_val(img_fake,
img_real,
batch_size,
- use_gpu,
- dims,
- premodel_path,
- model=None):
- assert os.path.exists(
- premodel_path
- ), 'pretrain_model path {} is not exists! Please download it first'.format(
- premodel_path)
- if model is None:
- block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
- model = InceptionV3([block_idx])
+ model,
+ use_gpu=True,
+ dims=2048):
+ act_fake = _get_activations_from_ims(img_fake, model, batch_size, dims,
+ use_gpu)
+ act_real = _get_activations_from_ims(img_real, model, batch_size, dims,
+ use_gpu)
+ return act_fake, act_real
+
- m1, s1 = _compute_statistic_of_img(img_fake, model, batch_size, dims,
- use_gpu, premodel_path)
- m2, s2 = _compute_statistic_of_img(img_real, model, batch_size, dims,
- use_gpu, premodel_path)
+def calculate_fid_given_img(act_fake, act_real):
+ m1, s1 = _compute_statistic_of_img(act_fake)
+ m2, s2 = _compute_statistic_of_img(act_real)
fid_value = _calculate_frechet_distance(m1, s1, m2, s2)
return fid_value
@@ -188,9 +213,9 @@ def _get_activations(files,
if style == 'stargan':
pred_arr[start:end] = inception_infer(images, premodel_path)
else:
- with fluid.dygraph.guard():
- images = to_variable(images)
- param_dict, _ = fluid.load_dygraph(premodel_path)
+ with paddle.no_grad():  # dygraph is the default in Paddle 2.x; run inference without gradients
+ images = paddle.to_tensor(images)
+ param_dict = paddle.load(premodel_path)
model.set_dict(param_dict)
model.eval()
@@ -202,9 +227,9 @@ def _get_activations(files,
def inception_infer(x, model_path):
- exe = fluid.Executor()
+ exe = paddle.static.Executor()
[inference_program, feed_target_names,
- fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+ fetch_targets] = paddle.static.load_inference_model(model_path, exe)
results = exe.run(inference_program,
feed={feed_target_names[0]: x},
fetch_list=fetch_targets)
@@ -264,7 +289,7 @@ def calculate_fid_given_paths(paths,
raise RuntimeError('Invalid path: %s' % p)
if model is None and style != 'stargan':
- with fluid.dygraph.guard():
+ with paddle.guard():
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
model = InceptionV3([block_idx], class_dim=1008)
diff --git a/ppgan/metrics/inception.py b/ppgan/metrics/inception.py
index 643d4766e4deb71a1b3d5c47a2777cfdc9b677a5..45640b7b512bdf425c37b0e07fe63b77ba439613 100644
--- a/ppgan/metrics/inception.py
+++ b/ppgan/metrics/inception.py
@@ -1,29 +1,17 @@
-#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# code was heavily based on https://github.com/pytorch/vision/blob/main/torchvision/models/inception.py
+# BSD 3-Clause License
+# Copyright (c) Soumith Chintala 2016
+
import math
import paddle
-import paddle.fluid as fluid
-from paddle.fluid.param_attr import ParamAttr
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
-from paddle.fluid.dygraph.base import to_variable
+import paddle.nn as nn
+from paddle.nn import Conv2D, AvgPool2D, MaxPool2D, BatchNorm, Linear, AdaptiveAvgPool2D
__all__ = ['InceptionV3']
-class InceptionV3(fluid.dygraph.Layer):
+class InceptionV3(nn.Layer):
DEFAULT_BLOCK_INDEX = 3
BLOCK_INDEX_BY_DIM = {
64: 0, # First max pooling features
@@ -60,21 +48,21 @@ class InceptionV3(fluid.dygraph.Layer):
3,
padding=1,
name='Conv2d_2b_3x3')
- self.maxpool1 = Pool2D(pool_size=3, pool_stride=2, pool_type='max')
+ self.maxpool1 = MaxPool2D(kernel_size=3, stride=2)
block0 = [
self.Conv2d_1a_3x3, self.Conv2d_2a_3x3, self.Conv2d_2b_3x3,
self.maxpool1
]
- self.blocks.append(fluid.dygraph.Sequential(*block0))
+ self.blocks.append(nn.Sequential(*block0))
### block1
if self.last_needed_block >= 1:
self.Conv2d_3b_1x1 = ConvBNLayer(64, 80, 1, name='Conv2d_3b_1x1')
self.Conv2d_4a_3x3 = ConvBNLayer(80, 192, 3, name='Conv2d_4a_3x3')
- self.maxpool2 = Pool2D(pool_size=3, pool_stride=2, pool_type='max')
+ self.maxpool2 = MaxPool2D(kernel_size=3, stride=2)
block1 = [self.Conv2d_3b_1x1, self.Conv2d_4a_3x3, self.maxpool2]
- self.blocks.append(fluid.dygraph.Sequential(*block1))
+ self.blocks.append(nn.Sequential(*block1))
### block2
### Mixed_5b 5c 5d
@@ -100,7 +88,7 @@ class InceptionV3(fluid.dygraph.Layer):
self.Mixed_5b, self.Mixed_5c, self.Mixed_5d, self.Mixed_6a,
self.Mixed_6b, self.Mixed_6c, self.Mixed_6d, self.Mixed_6e
]
- self.blocks.append(fluid.dygraph.Sequential(*block2))
+ self.blocks.append(nn.Sequential(*block2))
if self.aux_logits:
self.AuxLogits = InceptionAux(768, self.class_dim, name='AuxLogits')
@@ -110,19 +98,20 @@ class InceptionV3(fluid.dygraph.Layer):
self.Mixed_7a = InceptionD(768, name='Mixed_7a')
self.Mixed_7b = Fid_inceptionE_1(1280, name='Mixed_7b')
self.Mixed_7c = Fid_inceptionE_2(2048, name='Mixed_7c')
- self.avgpool = Pool2D(global_pooling=True, pool_type='avg')
+ self.avgpool = AdaptiveAvgPool2D(output_size=1)
block3 = [self.Mixed_7a, self.Mixed_7b, self.Mixed_7c, self.avgpool]
- self.blocks.append(fluid.dygraph.Sequential(*block3))
+ self.blocks.append(nn.Sequential(*block3))
def forward(self, x):
out = []
aux = None
if self.resize_input:
- x = fluid.layers.resize_bilinear(x,
- out_shape=[299, 299],
- align_corners=False,
- align_mode=0)
+ x = nn.functional.interpolate(x,
+ size=[299, 299],
+ mode='bilinear',
+ align_corners=False,
+ align_mode=0)
if self.normalize_input:
x = x * 2 - 1
@@ -139,7 +128,7 @@ class InceptionV3(fluid.dygraph.Layer):
return out, aux
-class InceptionA(fluid.dygraph.Layer):
+class InceptionA(nn.Layer):
def __init__(self, in_channels, pool_features, name=None):
super(InceptionA, self).__init__()
self.branch1x1 = ConvBNLayer(in_channels,
@@ -172,11 +161,10 @@ class InceptionA(fluid.dygraph.Layer):
padding=1,
name=name + '.branch3x3dbl_3')
- self.branch_pool0 = Pool2D(pool_size=3,
- pool_stride=1,
- pool_padding=1,
- exclusive=True,
- pool_type='avg')
+ self.branch_pool0 = AvgPool2D(kernel_size=3,
+ stride=1,
+ padding=1,
+ exclusive=True)
self.branch_pool = ConvBNLayer(in_channels,
pool_features,
1,
@@ -194,11 +182,11 @@ class InceptionA(fluid.dygraph.Layer):
branch_pool = self.branch_pool0(x)
branch_pool = self.branch_pool(branch_pool)
- return fluid.layers.concat(
+ return paddle.concat(
[branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
-class InceptionB(fluid.dygraph.Layer):
+class InceptionB(nn.Layer):
def __init__(self, in_channels, name=None):
super(InceptionB, self).__init__()
self.branch3x3 = ConvBNLayer(in_channels,
@@ -222,7 +210,7 @@ class InceptionB(fluid.dygraph.Layer):
stride=2,
name=name + '.branch3x3dbl_3')
- self.branch_pool = Pool2D(pool_size=3, pool_stride=2, pool_type='max')
+ self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
def forward(self, x):
branch3x3 = self.branch3x3(x)
@@ -232,11 +220,11 @@ class InceptionB(fluid.dygraph.Layer):
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
branch_pool = self.branch_pool(x)
- return fluid.layers.concat([branch3x3, branch3x3dbl, branch_pool],
+ return paddle.concat([branch3x3, branch3x3dbl, branch_pool],
axis=1)
-class InceptionC(fluid.dygraph.Layer):
+class InceptionC(nn.Layer):
def __init__(self, in_channels, c7, name=None):
super(InceptionC, self).__init__()
self.branch1x1 = ConvBNLayer(in_channels,
@@ -278,11 +266,10 @@ class InceptionC(fluid.dygraph.Layer):
padding=(0, 3),
name=name + '.branch7x7dbl_5')
- self.branch_pool0 = Pool2D(pool_size=3,
- pool_stride=1,
- pool_padding=1,
- exclusive=True,
- pool_type='avg')
+ self.branch_pool0 = AvgPool2D(kernel_size=3,
+ stride=1,
+ padding=1,
+ exclusive=True)
self.branch_pool = ConvBNLayer(in_channels,
192,
1,
@@ -304,11 +291,11 @@ class InceptionC(fluid.dygraph.Layer):
branch_pool = self.branch_pool0(x)
branch_pool = self.branch_pool(branch_pool)
- return fluid.layers.concat(
+ return paddle.concat(
[branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
-class InceptionD(fluid.dygraph.Layer):
+class InceptionD(nn.Layer):
def __init__(self, in_channels, name=None):
super(InceptionD, self).__init__()
self.branch3x3_1 = ConvBNLayer(in_channels,
@@ -339,7 +326,7 @@ class InceptionD(fluid.dygraph.Layer):
stride=2,
name=name + '.branch7x7x3_4')
- self.branch_pool = Pool2D(pool_size=3, pool_stride=2, pool_type='max')
+ self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
def forward(self, x):
branch3x3 = self.branch3x3_1(x)
@@ -352,11 +339,11 @@ class InceptionD(fluid.dygraph.Layer):
branch_pool = self.branch_pool(x)
- return fluid.layers.concat([branch3x3, branch7x7x3, branch_pool],
+ return paddle.concat([branch3x3, branch7x7x3, branch_pool],
axis=1)
-class InceptionE(fluid.dygraph.Layer):
+class InceptionE(nn.Layer):
def __init__(self, in_channels, name=None):
super(InceptionE, self).__init__()
self.branch1x1 = ConvBNLayer(in_channels,
@@ -395,11 +382,10 @@ class InceptionE(fluid.dygraph.Layer):
padding=(1, 0),
name=name + '.branch3x3dbl_3b')
- self.branch_pool0 = Pool2D(pool_size=3,
- pool_stride=1,
- pool_padding=1,
- exclusive=True,
- pool_type='avg')
+ self.branch_pool0 = AvgPool2D(kernel_size=3,
+ stride=1,
+ padding=1,
+ exclusive=True)
self.branch_pool = ConvBNLayer(in_channels,
192,
1,
@@ -410,42 +396,42 @@ class InceptionE(fluid.dygraph.Layer):
branch3x3_1 = self.branch3x3_1(x)
branch3x3_2a = self.branch3x3_2a(branch3x3_1)
branch3x3_2b = self.branch3x3_2b(branch3x3_1)
- branch3x3 = fluid.layers.concat([branch3x3_2a, branch3x3_2b], axis=1)
+ branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl)
branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl)
- branch3x3dbl = fluid.layers.concat([branch3x3dbl_3a, branch3x3dbl_3b],
+ branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b],
axis=1)
branch_pool = self.branch_pool0(x)
branch_pool = self.branch_pool(branch_pool)
- return fluid.layers.concat(
+ return paddle.concat(
[branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
-class InceptionAux(fluid.dygraph.Layer):
+class InceptionAux(nn.Layer):
def __init__(self, in_channels, num_classes, name=None):
super(InceptionAux, self).__init__()
self.num_classes = num_classes
- self.pool0 = Pool2D(pool_size=5, pool_stride=3, pool_type='avg')
+ self.pool0 = AvgPool2D(kernel_size=5, stride=3)
self.conv0 = ConvBNLayer(in_channels, 128, 1, name=name + '.conv0')
self.conv1 = ConvBNLayer(128, 768, 5, name=name + '.conv1')
- self.pool1 = Pool2D(global_pooling=True, pool_type='avg')
+        self.pool1 = AdaptiveAvgPool2D(output_size=1)
+        self.fc = Linear(768, num_classes)
def forward(self, x):
x = self.pool0(x)
x = self.conv0(x)
x = self.conv1(x)
x = self.pool1(x)
- x = fluid.layers.flatten(x, axis=1)
- x = fluid.layers.fc(x, size=self.num_classes)
+        x = paddle.flatten(x, start_axis=1)
+        x = self.fc(x)
return x
-class Fid_inceptionA(fluid.dygraph.Layer):
+class Fid_inceptionA(nn.Layer):
""" FID block in inception v3
"""
def __init__(self, in_channels, pool_features, name=None):
@@ -480,11 +466,10 @@ class Fid_inceptionA(fluid.dygraph.Layer):
padding=1,
name=name + '.branch3x3dbl_3')
- self.branch_pool0 = Pool2D(pool_size=3,
- pool_stride=1,
- pool_padding=1,
- exclusive=True,
- pool_type='avg')
+ self.branch_pool0 = AvgPool2D(kernel_size=3,
+ stride=1,
+ padding=1,
+ exclusive=True)
self.branch_pool = ConvBNLayer(in_channels,
pool_features,
1,
@@ -502,11 +487,11 @@ class Fid_inceptionA(fluid.dygraph.Layer):
branch_pool = self.branch_pool0(x)
branch_pool = self.branch_pool(branch_pool)
- return fluid.layers.concat(
+ return paddle.concat(
[branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
-class Fid_inceptionC(fluid.dygraph.Layer):
+class Fid_inceptionC(nn.Layer):
""" FID block in inception v3
"""
def __init__(self, in_channels, c7, name=None):
@@ -550,11 +535,10 @@ class Fid_inceptionC(fluid.dygraph.Layer):
padding=(0, 3),
name=name + '.branch7x7dbl_5')
- self.branch_pool0 = Pool2D(pool_size=3,
- pool_stride=1,
- pool_padding=1,
- exclusive=True,
- pool_type='avg')
+ self.branch_pool0 = AvgPool2D(kernel_size=3,
+ stride=1,
+ padding=1,
+ exclusive=True)
self.branch_pool = ConvBNLayer(in_channels,
192,
1,
@@ -576,11 +560,11 @@ class Fid_inceptionC(fluid.dygraph.Layer):
branch_pool = self.branch_pool0(x)
branch_pool = self.branch_pool(branch_pool)
- return fluid.layers.concat(
+ return paddle.concat(
[branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
-class Fid_inceptionE_1(fluid.dygraph.Layer):
+class Fid_inceptionE_1(nn.Layer):
""" FID block in inception v3
"""
def __init__(self, in_channels, name=None):
@@ -621,11 +605,10 @@ class Fid_inceptionE_1(fluid.dygraph.Layer):
padding=(1, 0),
name=name + '.branch3x3dbl_3b')
- self.branch_pool0 = Pool2D(pool_size=3,
- pool_stride=1,
- pool_padding=1,
- exclusive=True,
- pool_type='avg')
+ self.branch_pool0 = AvgPool2D(kernel_size=3,
+ stride=1,
+ padding=1,
+ exclusive=True)
self.branch_pool = ConvBNLayer(in_channels,
192,
1,
@@ -636,23 +619,23 @@ class Fid_inceptionE_1(fluid.dygraph.Layer):
branch3x3_1 = self.branch3x3_1(x)
branch3x3_2a = self.branch3x3_2a(branch3x3_1)
branch3x3_2b = self.branch3x3_2b(branch3x3_1)
- branch3x3 = fluid.layers.concat([branch3x3_2a, branch3x3_2b], axis=1)
+ branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl)
branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl)
- branch3x3dbl = fluid.layers.concat([branch3x3dbl_3a, branch3x3dbl_3b],
+ branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b],
axis=1)
branch_pool = self.branch_pool0(x)
branch_pool = self.branch_pool(branch_pool)
- return fluid.layers.concat(
+ return paddle.concat(
[branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
-class Fid_inceptionE_2(fluid.dygraph.Layer):
+class Fid_inceptionE_2(nn.Layer):
""" FID block in inception v3
"""
def __init__(self, in_channels, name=None):
@@ -693,10 +676,9 @@ class Fid_inceptionE_2(fluid.dygraph.Layer):
padding=(1, 0),
name=name + '.branch3x3dbl_3b')
### same with paper
- self.branch_pool0 = Pool2D(pool_size=3,
- pool_stride=1,
- pool_padding=1,
- pool_type='max')
+ self.branch_pool0 = MaxPool2D(kernel_size=3,
+ stride=1,
+ padding=1)
self.branch_pool = ConvBNLayer(in_channels,
192,
1,
@@ -707,23 +689,23 @@ class Fid_inceptionE_2(fluid.dygraph.Layer):
branch3x3_1 = self.branch3x3_1(x)
branch3x3_2a = self.branch3x3_2a(branch3x3_1)
branch3x3_2b = self.branch3x3_2b(branch3x3_1)
- branch3x3 = fluid.layers.concat([branch3x3_2a, branch3x3_2b], axis=1)
+ branch3x3 = paddle.concat([branch3x3_2a, branch3x3_2b], axis=1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl_3a = self.branch3x3dbl_3a(branch3x3dbl)
branch3x3dbl_3b = self.branch3x3dbl_3b(branch3x3dbl)
- branch3x3dbl = fluid.layers.concat([branch3x3dbl_3a, branch3x3dbl_3b],
+ branch3x3dbl = paddle.concat([branch3x3dbl_3a, branch3x3dbl_3b],
axis=1)
branch_pool = self.branch_pool0(x)
branch_pool = self.branch_pool(branch_pool)
- return fluid.layers.concat(
+ return paddle.concat(
[branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
-class ConvBNLayer(fluid.dygraph.Layer):
+class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
num_filters,
@@ -734,20 +716,19 @@ class ConvBNLayer(fluid.dygraph.Layer):
act='relu',
name=None):
super(ConvBNLayer, self).__init__()
- self.conv = Conv2D(num_channels=in_channels,
- num_filters=num_filters,
- filter_size=filter_size,
+ self.conv = Conv2D(in_channels=in_channels,
+ out_channels=num_filters,
+ kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
- act=None,
- param_attr=ParamAttr(name=name + ".conv.weight"),
+ weight_attr=paddle.ParamAttr(name=name + ".conv.weight"),
bias_attr=False)
self.bn = BatchNorm(num_filters,
act=act,
epsilon=0.001,
- param_attr=ParamAttr(name=name + ".bn.weight"),
- bias_attr=ParamAttr(name=name + ".bn.bias"),
+ param_attr=paddle.ParamAttr(name=name + ".bn.weight"),
+ bias_attr=paddle.ParamAttr(name=name + ".bn.bias"),
moving_mean_name=name + '.bn.running_mean',
moving_variance_name=name + '.bn.running_var')
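The Pool2D to paddle.nn pooling swap above is mechanical; a quick shape check of the replacement layers (input size arbitrary, a sketch for sanity only):

import paddle
import paddle.nn as nn

x = paddle.rand([2, 2048, 8, 8])
global_avg = nn.AdaptiveAvgPool2D(output_size=1)   # replaces Pool2D(global_pooling=True, pool_type='avg')
max_pool = nn.MaxPool2D(kernel_size=3, stride=2)   # replaces Pool2D(pool_size=3, pool_stride=2, pool_type='max')
avg_pool = nn.AvgPool2D(kernel_size=3, stride=1, padding=1, exclusive=True)

print(global_avg(x).shape)  # [2, 2048, 1, 1]
print(max_pool(x).shape)    # [2, 2048, 3, 3]
print(avg_pool(x).shape)    # [2, 2048, 8, 8]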
diff --git a/ppgan/metrics/lpips.py b/ppgan/metrics/lpips.py
new file mode 100644
index 0000000000000000000000000000000000000000..b989096356664e8528eabb89c54091fcd5cda59d
--- /dev/null
+++ b/ppgan/metrics/lpips.py
@@ -0,0 +1,292 @@
+#Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+
+from collections import namedtuple
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+from paddle.utils.download import get_weights_path_from_url
+from ..modules import init
+from ..models.criterions.perceptual_loss import PerceptualVGG
+from .builder import METRICS
+
+lpips = True
+
+VGG16_TORCHVISION_URL = 'https://paddlegan.bj.bcebos.com/models/vgg16_official.pdparams'
+LINS_01_VGG_URL = 'https://paddlegan.bj.bcebos.com/models/lins_0.1_vgg.pdparams'
+
+
+@METRICS.register()
+class LPIPSMetric(paddle.metric.Metric):
+ """Calculate LPIPS (Learned Perceptual Image Patch Similarity).
+
+ Ref: https://arxiv.org/abs/1801.03924
+
+ Args:
+ net (str): Type of backbone net. Default: 'vgg'.
+        version (str): Version of the LPIPS method. Default: '0.1'.
+ mean (list): Sequence of means for each channel of input image. Default: None.
+ std (list): Sequence of standard deviations for each channel of input image. Default: None.
+
+ Returns:
+ float: lpips result.
+ """
+ def __init__(self, net='vgg', version='0.1', mean=None, std=None):
+ self.net = net
+ self.version = version
+
+ self.loss_fn = LPIPS(net=net, version=version)
+
+ if mean is None:
+ self.mean = [0.5, 0.5, 0.5]
+ else:
+ self.mean = mean
+
+ if std is None:
+ self.std = [0.5, 0.5, 0.5]
+ else:
+ self.std = std
+
+ self.reset()
+
+ def reset(self):
+ self.results = []
+
+ def update(self, preds, gts):
+ if not isinstance(preds, (list, tuple)):
+ preds = [preds]
+
+ if not isinstance(gts, (list, tuple)):
+ gts = [gts]
+
+ for pred, gt in zip(preds, gts):
+ pred, gt = pred.astype(np.float32) / 255., gt.astype(
+ np.float32) / 255.
+ pred = paddle.vision.transforms.normalize(pred.transpose([2, 0, 1]),
+ self.mean, self.std)
+ gt = paddle.vision.transforms.normalize(gt.transpose([2, 0, 1]),
+ self.mean, self.std)
+
+ with paddle.no_grad():
+ value = self.loss_fn(
+ paddle.to_tensor(pred).unsqueeze(0),
+ paddle.to_tensor(gt).unsqueeze(0))
+
+ self.results.append(value.item())
+
+ def accumulate(self):
+ if paddle.distributed.get_world_size() > 1:
+ results = paddle.to_tensor(self.results)
+ results_list = []
+ paddle.distributed.all_gather(results_list, results)
+ self.results = paddle.concat(results_list).numpy()
+
+ if len(self.results) <= 0:
+ return 0.
+ return np.mean(self.results)
+
+ def name(self):
+ return 'LPIPS'
+
+
+def spatial_average(in_tens, keepdim=True):
+ return in_tens.mean([2, 3], keepdim=keepdim)
+
+
+# upsamples a feature map to the target spatial size (H and W scale factors are computed separately)
+def upsample(in_tens, out_HW=(64, 64)):
+ in_H, in_W = in_tens.shape[2], in_tens.shape[3]
+ scale_factor_H, scale_factor_W = 1. * out_HW[0] / in_H, 1. * out_HW[1] / in_W
+
+ return nn.Upsample(scale_factor=(scale_factor_H, scale_factor_W),
+ mode='bilinear',
+ align_corners=False)(in_tens)
+
+
+def normalize_tensor(in_feat, eps=1e-10):
+ norm_factor = paddle.sqrt(paddle.sum(in_feat**2, 1, keepdim=True))
+ return in_feat / (norm_factor + eps)
+
+
+# Learned perceptual metric
+class LPIPS(nn.Layer):
+ def __init__(self,
+ pretrained=True,
+ net='vgg',
+ version='0.1',
+ lpips=True,
+ spatial=False,
+ pnet_rand=False,
+ pnet_tune=False,
+ use_dropout=True,
+ model_path=None,
+ eval_mode=True,
+ verbose=True):
+ # lpips - [True] means with linear calibration on top of base network
+ # pretrained - [True] means load linear weights
+
+ super(LPIPS, self).__init__()
+ if (verbose):
+ print(
+ 'Setting up [%s] perceptual loss: trunk [%s], v[%s], spatial [%s]'
+ % ('LPIPS' if lpips else 'baseline', net, version,
+ 'on' if spatial else 'off'))
+
+ self.pnet_type = net
+ self.pnet_tune = pnet_tune
+ self.pnet_rand = pnet_rand
+ self.spatial = spatial
+
+ # false means baseline of just averaging all layers
+ self.lpips = lpips
+
+ self.version = version
+ self.scaling_layer = ScalingLayer()
+
+ if (self.pnet_type in ['vgg', 'vgg16']):
+ net_type = vgg16
+ self.chns = [64, 128, 256, 512, 512]
+ elif (self.pnet_type == 'alex'):
+            raise TypeError('alex backbone is not supported yet!')
+
+ elif (self.pnet_type == 'squeeze'):
+            raise TypeError('squeeze backbone is not supported yet!')
+
+ self.L = len(self.chns)
+
+ self.net = net_type(pretrained=True, requires_grad=False)
+
+ if (lpips):
+ self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
+ self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
+ self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
+ self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
+ self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
+ self.lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
+
+ # TODO: add alex and squeezenet
+ # 7 layers for squeezenet
+ self.lins = nn.LayerList(self.lins)
+ if (pretrained):
+ if (model_path is None):
+ model_path = get_weights_path_from_url(LINS_01_VGG_URL)
+
+ if (verbose):
+ print('Loading model from: %s' % model_path)
+
+ self.lins.set_state_dict(paddle.load(model_path))
+
+ if (eval_mode):
+ self.eval()
+
+ def forward(self, in0, in1, retPerLayer=False, normalize=False):
+ # turn on this flag if input is [0,1] so it can be adjusted to [-1, +1]
+ if normalize:
+ in0 = 2 * in0 - 1
+ in1 = 2 * in1 - 1
+
+ # v0.0 - original release had a bug, where input was not scaled
+ in0_input, in1_input = (
+ self.scaling_layer(in0),
+ self.scaling_layer(in1)) if self.version == '0.1' else (in0, in1)
+ outs0, outs1 = self.net.forward(in0_input), self.net.forward(in1_input)
+ feats0, feats1, diffs = {}, {}, {}
+
+ for kk in range(self.L):
+ feats0[kk], feats1[kk] = normalize_tensor(
+ outs0[kk]), normalize_tensor(outs1[kk])
+ diffs[kk] = (feats0[kk] - feats1[kk])**2
+
+ if (self.lpips):
+ if (self.spatial):
+ res = [
+ upsample(self.lins[kk].model(diffs[kk]),
+ out_HW=in0.shape[2:]) for kk in range(self.L)
+ ]
+ else:
+ res = [
+ spatial_average(self.lins[kk].model(diffs[kk]),
+ keepdim=True) for kk in range(self.L)
+ ]
+ else:
+ if (self.spatial):
+ res = [
+                    upsample(diffs[kk].sum(axis=1, keepdim=True),
+ out_HW=in0.shape[2:]) for kk in range(self.L)
+ ]
+ else:
+ res = [
+                    spatial_average(diffs[kk].sum(axis=1, keepdim=True),
+ keepdim=True) for kk in range(self.L)
+ ]
+
+ val = res[0]
+ for l in range(1, self.L):
+ val += res[l]
+
+ if (retPerLayer):
+ return (val, res)
+ else:
+ return val
+
+
+class ScalingLayer(nn.Layer):
+ def __init__(self):
+ super(ScalingLayer, self).__init__()
+ self.register_buffer(
+ 'shift',
+ paddle.to_tensor([-.030, -.088, -.188]).reshape([1, 3, 1, 1]))
+ self.register_buffer(
+ 'scale',
+ paddle.to_tensor([.458, .448, .450]).reshape([1, 3, 1, 1]))
+
+ def forward(self, inp):
+ return (inp - self.shift) / self.scale
+
+
+class NetLinLayer(nn.Layer):
+ ''' A single linear layer which does a 1x1 conv '''
+ def __init__(self, chn_in, chn_out=1, use_dropout=False):
+ super(NetLinLayer, self).__init__()
+
+ layers = [
+ nn.Dropout(),
+ ] if (use_dropout) else []
+ layers += [
+ nn.Conv2D(chn_in, chn_out, 1, stride=1, padding=0, bias_attr=False),
+ ]
+ self.model = nn.Sequential(*layers)
+
+
+class vgg16(nn.Layer):
+ def __init__(self, requires_grad=False, pretrained=True):
+ super(vgg16, self).__init__()
+ self.vgg16 = PerceptualVGG(['3', '8', '15', '22', '29'], 'vgg16', False,
+ VGG16_TORCHVISION_URL)
+
+ if not requires_grad:
+ for param in self.parameters():
+ param.trainable = False
+
+ def forward(self, x):
+ out = self.vgg16(x)
+ vgg_outputs = namedtuple(
+ "VggOutputs",
+ ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
+ out = vgg_outputs(out['3'], out['8'], out['15'], out['22'], out['29'])
+
+ return out
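A minimal usage sketch for the new metric; the image contents are random placeholders, and the VGG16 and linear-head weights are fetched from the URLs defined above on first use:

import numpy as np
from ppgan.metrics.lpips import LPIPSMetric

metric = LPIPSMetric(net='vgg', version='0.1')   # downloads VGG16 + calibration weights on first use

# update() expects uint8 HWC images in [0, 255]; random placeholders here
pred = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)
gt = np.random.randint(0, 256, (256, 256, 3), dtype=np.uint8)

metric.update(pred, gt)                          # scaled to [0, 1], then normalized to [-1, 1] internally
print(metric.name(), metric.accumulate())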
diff --git a/ppgan/metrics/psnr_ssim.py b/ppgan/metrics/psnr_ssim.py
index 5bd9dc8d4571a25ae2231f52b7ebbd1044a6ed9b..7ed288a295169670f79e3911c3b6aa15d34bcf81 100644
--- a/ppgan/metrics/psnr_ssim.py
+++ b/ppgan/metrics/psnr_ssim.py
@@ -30,19 +30,34 @@ class PSNR(paddle.metric.Metric):
def reset(self):
self.results = []
- def update(self, preds, gts):
+ def update(self, preds, gts, is_seq=False):
if not isinstance(preds, (list, tuple)):
preds = [preds]
if not isinstance(gts, (list, tuple)):
gts = [gts]
+ if is_seq:
+ single_seq = []
+
for pred, gt in zip(preds, gts):
value = calculate_psnr(pred, gt, self.crop_border, self.input_order,
self.test_y_channel)
- self.results.append(value)
+ if is_seq:
+ single_seq.append(value)
+ else:
+ self.results.append(value)
+
+ if is_seq:
+ self.results.append(np.mean(single_seq))
def accumulate(self):
+ if paddle.distributed.get_world_size() > 1:
+ results = paddle.to_tensor(self.results)
+ results_list = []
+ paddle.distributed.all_gather(results_list, results)
+ self.results = paddle.concat(results_list).numpy()
+
if len(self.results) <= 0:
return 0.
return np.mean(self.results)
@@ -53,17 +68,26 @@ class PSNR(paddle.metric.Metric):
@METRICS.register()
class SSIM(PSNR):
- def update(self, preds, gts):
+ def update(self, preds, gts, is_seq=False):
if not isinstance(preds, (list, tuple)):
preds = [preds]
if not isinstance(gts, (list, tuple)):
gts = [gts]
+ if is_seq:
+ single_seq = []
+
for pred, gt in zip(preds, gts):
value = calculate_ssim(pred, gt, self.crop_border, self.input_order,
self.test_y_channel)
- self.results.append(value)
+ if is_seq:
+ single_seq.append(value)
+ else:
+ self.results.append(value)
+
+ if is_seq:
+ self.results.append(np.mean(single_seq))
def name(self):
return 'SSIM'
@@ -188,8 +212,8 @@ def calculate_ssim(img1,
f'Wrong input_order {input_order}. Supported input_orders are '
'"HWC" and "CHW"')
- img1 = img1.copy().astype('float32')[..., ::-1]
- img2 = img2.copy().astype('float32')[..., ::-1]
+ img1 = img1.copy().astype('float32')
+ img2 = img2.copy().astype('float32')
img1 = reorder_image(img1, input_order=input_order)
img2 = reorder_image(img2, input_order=input_order)
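A sketch of the new per-sequence behaviour: with is_seq=True an entire clip contributes a single averaged entry, so clips are weighted equally regardless of their length. The PSNR constructor arguments below mirror the attributes referenced in update() and are an assumption:

import numpy as np
from ppgan.metrics.psnr_ssim import PSNR

# assumed constructor signature, matching the attributes used in update()
psnr = PSNR(crop_border=0, input_order='HWC', test_y_channel=False)

clip_out = [np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) for _ in range(5)]
clip_gt = [np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8) for _ in range(5)]

psnr.update(clip_out, clip_gt, is_seq=True)   # stores one value: the mean over the 5 frames
print(len(psnr.results), psnr.accumulate())   # 1, <clip-average PSNR>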
diff --git a/ppgan/models/__init__.py b/ppgan/models/__init__.py
index 77b7c454852dfbf4434a315f0e4374ad4f6a6539..6b116a71ea334b034da33819de081f1d1347d2cb 100644
--- a/ppgan/models/__init__.py
+++ b/ppgan/models/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .base_model import BaseModel
+from .base_model import BaseModel, apply_to_static
from .gan_model import GANModel
from .cycle_gan_model import CycleGANModel
from .pix2pix_model import Pix2PixModel
@@ -26,3 +26,20 @@ from .animeganv2_model import AnimeGANV2Model, AnimeGANV2PreTrainModel
from .styleganv2_model import StyleGAN2Model
from .wav2lip_model import Wav2LipModel
from .wav2lip_hq_model import Wav2LipModelHq
+from .starganv2_model import StarGANv2Model
+from .edvr_model import EDVRModel
+from .firstorder_model import FirstOrderModel
+from .lapstyle_model import LapStyleDraModel, LapStyleRevFirstModel, LapStyleRevSecondModel
+from .basicvsr_model import BasicVSRModel
+from .mpr_model import MPRModel
+from .photopen_model import PhotoPenModel
+from .msvsr_model import MultiStageVSRModel
+from .singan_model import SinGANModel
+from .rcan_model import RCANModel
+from .prenet_model import PReNetModel
+from .gpen_model import GPENModel
+from .swinir_model import SwinIRModel
+from .gfpgan_model import GFPGANModel
+from .invdn_model import InvDNModel
+from .nafnet_model import NAFNetModel
+from .aotgan_model import AOTGANModel
diff --git a/ppgan/models/animeganv2_model.py b/ppgan/models/animeganv2_model.py
index 9f768c7abf125350a678935e7c2f3fcd2ee71105..c2ee5de2e6bbecd0f4e7e9c45608e36ca61d5058 100644
--- a/ppgan/models/animeganv2_model.py
+++ b/ppgan/models/animeganv2_model.py
@@ -13,62 +13,70 @@
#limitations under the License.
import paddle
-from paddle import nn
+import paddle.nn as nn
from .base_model import BaseModel
from .builder import MODELS
from .generators.builder import build_generator
from .discriminators.builder import build_discriminator
-from .criterions.gan_loss import GANLoss
+from .criterions import build_criterion
from ..modules.caffevgg import CaffeVGG19
-from ..solver import build_optimizer
from ..modules.init import init_weights
from ..utils.filesystem import load
@MODELS.register()
class AnimeGANV2Model(BaseModel):
- def __init__(self, cfg):
+ """ This class implements the AnimeGANV2 model.
+ """
+ def __init__(self,
+ generator,
+ discriminator=None,
+ gan_criterion=None,
+ pretrain_ckpt=None,
+ g_adv_weight=300.,
+ d_adv_weight=300.,
+ con_weight=1.5,
+ sty_weight=2.5,
+ color_weight=10.,
+ tv_weight=1.):
"""Initialize the AnimeGANV2 class.
- Parameters:
- opt (config dict)-- stores all the experiment flags; needs to be a subclass of Dict
+ Args:
+ generator (dict): config of generator.
+ discriminator (dict): config of discriminator.
+ gan_criterion (dict): config of gan criterion.
"""
- super(AnimeGANV2Model, self).__init__(cfg)
+ super(AnimeGANV2Model, self).__init__()
+ self.g_adv_weight = g_adv_weight
+ self.d_adv_weight = d_adv_weight
+ self.con_weight = con_weight
+ self.sty_weight = sty_weight
+ self.color_weight = color_weight
+ self.tv_weight = tv_weight
# define networks (both generator and discriminator)
- self.nets['netG'] = build_generator(cfg.model.generator)
+ self.nets['netG'] = build_generator(generator)
init_weights(self.nets['netG'])
- # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
+ # define a discriminator
if self.is_train:
- self.nets['netD'] = build_discriminator(cfg.model.discriminator)
+ self.nets['netD'] = build_discriminator(discriminator)
init_weights(self.nets['netD'])
self.pretrained = CaffeVGG19()
self.losses = {}
# define loss functions
- self.criterionGAN = GANLoss(cfg.model.gan_mode)
+ self.criterionGAN = build_criterion(gan_criterion)
self.criterionL1 = nn.L1Loss()
self.criterionHub = nn.SmoothL1Loss()
- # build optimizers
- self.build_lr_scheduler()
- self.optimizers['optimizer_G'] = build_optimizer(
- cfg.optimizer,
- self.lr_scheduler,
- parameter_list=self.nets['netG'].parameters())
- self.optimizers['optimizer_D'] = build_optimizer(
- cfg.optimizer,
- self.lr_scheduler,
- parameter_list=self.nets['netD'].parameters())
-
- if self.cfg.pretrain_ckpt:
- state_dicts = load(self.cfg.pretrain_ckpt)
+ if pretrain_ckpt:
+ state_dicts = load(pretrain_ckpt)
self.nets['netG'].set_state_dict(state_dicts['netG'])
- print('Load pretrained generator from', self.cfg.pretrain_ckpt)
+ print('Load pretrained generator from', pretrain_ckpt)
- def set_input(self, input):
+ def setup_input(self, input):
"""Unpack input data from the dataloader and perform necessary pre-processing steps.
"""
@@ -79,18 +87,18 @@ class AnimeGANV2Model(BaseModel):
self.smooth_gray = paddle.to_tensor(input['smooth_gray'])
else:
self.real = paddle.to_tensor(input['A'])
- self.image_paths = input['A_paths']
+ self.image_paths = input['A_path']
def forward(self):
"""Run forward pass; called by both functions and ."""
- self.fake = self.nets['netG'](self.real) # G(A)
+ self.fake = self.nets['netG'](self.real)
# put items to visual dict
self.visual_items['real'] = self.real
self.visual_items['fake'] = self.fake
def test(self):
- self.fake = self.nets['netG'](self.real) # G(A)
+ self.fake = self.nets['netG'](self.real)
# put items to visual dict
self.visual_items['real'] = self.real
@@ -152,13 +160,13 @@ class AnimeGANV2Model(BaseModel):
fake_logit = self.nets['netD'](self.fake.detach())
smooth_logit = self.nets['netD'](self.smooth_gray)
- d_real_loss = (self.cfg.d_adv_weight * 1.2 *
+ d_real_loss = (self.d_adv_weight * 1.2 *
self.criterionGAN(real_logit, True))
- d_gray_loss = (self.cfg.d_adv_weight * 1.2 *
+ d_gray_loss = (self.d_adv_weight * 1.2 *
self.criterionGAN(gray_logit, False))
- d_fake_loss = (self.cfg.d_adv_weight * 1.2 *
+ d_fake_loss = (self.d_adv_weight * 1.2 *
self.criterionGAN(fake_logit, False))
- d_blur_loss = (self.cfg.d_adv_weight * 0.8 *
+ d_blur_loss = (self.d_adv_weight * 0.8 *
self.criterionGAN(smooth_logit, False))
self.loss_D = d_real_loss + d_gray_loss + d_fake_loss + d_blur_loss
@@ -175,11 +183,11 @@ class AnimeGANV2Model(BaseModel):
fake_logit = self.nets['netD'](self.fake)
c_loss, s_loss = self.con_sty_loss(self.real, self.anime_gray,
self.fake)
- c_loss = self.cfg.con_weight * c_loss
- s_loss = self.cfg.sty_weight * s_loss
- tv_loss = self.cfg.tv_weight * self.variation_loss(self.fake)
- col_loss = self.cfg.color_weight * self.color_loss(self.real, self.fake)
- g_loss = (self.cfg.g_adv_weight * self.criterionGAN(fake_logit, True))
+ c_loss = self.con_weight * c_loss
+ s_loss = self.sty_weight * s_loss
+ tv_loss = self.tv_weight * self.variation_loss(self.fake)
+ col_loss = self.color_weight * self.color_loss(self.real, self.fake)
+ g_loss = (self.g_adv_weight * self.criterionGAN(fake_logit, True))
self.loss_G = c_loss + s_loss + col_loss + g_loss + tv_loss
@@ -191,7 +199,7 @@ class AnimeGANV2Model(BaseModel):
self.losses['col_loss'] = col_loss
self.losses['tv_loss'] = tv_loss
- def optimize_parameters(self):
+ def train_iter(self, optimizers=None):
# compute fake images: G(A)
self.forward()
@@ -212,11 +220,11 @@ class AnimeGANV2PreTrainModel(AnimeGANV2Model):
real_feature_map = self.pretrained(self.real)
fake_feature_map = self.pretrained(self.fake)
init_c_loss = self.criterionL1(real_feature_map, fake_feature_map)
- loss = self.cfg.con_weight * init_c_loss
+ loss = self.con_weight * init_c_loss
loss.backward()
self.losses['init_c_loss'] = init_c_loss
- def optimize_parameters(self):
+ def train_iter(self, optimizers=None):
self.forward()
# update G
self.optimizers['optimizer_G'].clear_grad()
diff --git a/ppgan/models/aotgan_model.py b/ppgan/models/aotgan_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab70f1f913723f1b809e4871f9f83bd4b76abe30
--- /dev/null
+++ b/ppgan/models/aotgan_model.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from .base_model import BaseModel
+
+from .builder import MODELS
+from .generators.builder import build_generator
+from .criterions import build_criterion
+from .discriminators.builder import build_discriminator
+
+from ..modules.init import init_weights
+from ..solver import build_optimizer
+
+# gaussian blur on the mask
+def gaussian_blur(input, kernel_size, sigma):
+ def get_gaussian_kernel(kernel_size: int, sigma: float) -> paddle.Tensor:
+ def gauss_fcn(x, window_size, sigma):
+ return -(x - window_size // 2)**2 / float(2 * sigma**2)
+        gauss = paddle.stack([
+            paddle.exp(paddle.to_tensor(gauss_fcn(x, kernel_size, sigma)))
+            for x in range(kernel_size)
+        ])
+ return gauss / gauss.sum()
+
+
+ b, c, h, w = input.shape
+ ksize_x, ksize_y = kernel_size
+ sigma_x, sigma_y = sigma
+ kernel_x = get_gaussian_kernel(ksize_x, sigma_x)
+ kernel_y = get_gaussian_kernel(ksize_y, sigma_y)
+ kernel_2d = paddle.matmul(kernel_x, kernel_y, transpose_y=True)
+ kernel = kernel_2d.reshape([1, 1, ksize_x, ksize_y])
+ kernel = kernel.repeat_interleave(c, 0)
+ padding = [(k - 1) // 2 for k in kernel_size]
+ return F.conv2d(input, kernel, padding=padding, stride=1, groups=c)
+
+# GAN Loss
+class Adversal():
+ def __init__(self, ksize=71):
+ self.ksize = ksize
+ self.loss_fn = nn.MSELoss()
+
+ def __call__(self, netD, fake, real, masks):
+ fake_detach = fake.detach()
+
+ g_fake = netD(fake)
+ d_fake = netD(fake_detach)
+ d_real = netD(real)
+
+ _, _, h, w = g_fake.shape
+ b, c, ht, wt = masks.shape
+
+ # align image shape with mask
+ if h != ht or w != wt:
+ g_fake = F.interpolate(g_fake, size=(ht, wt), mode='bilinear', align_corners=True)
+ d_fake = F.interpolate(d_fake, size=(ht, wt), mode='bilinear', align_corners=True)
+ d_real = F.interpolate(d_real, size=(ht, wt), mode='bilinear', align_corners=True)
+ d_fake_label = gaussian_blur(masks, (self.ksize, self.ksize), (10, 10)).detach()
+ d_real_label = paddle.zeros_like(d_real)
+ g_fake_label = paddle.ones_like(g_fake)
+
+ dis_loss = [self.loss_fn(d_fake, d_fake_label).mean(), self.loss_fn(d_real, d_real_label).mean()]
+ gen_loss = (self.loss_fn(g_fake, g_fake_label) * masks / paddle.mean(masks)).mean()
+
+ return dis_loss, gen_loss
+
+@MODELS.register()
+class AOTGANModel(BaseModel):
+ def __init__(self,
+ generator,
+ discriminator,
+ criterion,
+ l1_weight,
+ perceptual_weight,
+ style_weight,
+ adversal_weight,
+ img_size,
+ ):
+
+ super(AOTGANModel, self).__init__()
+
+ # define nets
+ self.nets['netG'] = build_generator(generator)
+ self.nets['netD'] = build_discriminator(discriminator)
+ self.net_vgg = build_criterion(criterion)
+
+ self.adv_loss = Adversal()
+
+ self.l1_weight = l1_weight
+ self.perceptual_weight = perceptual_weight
+ self.style_weight = style_weight
+ self.adversal_weight = adversal_weight
+ self.img_size = img_size
+
+ def setup_input(self, input):
+ self.img = input['img']
+ self.mask = input['mask']
+ self.img_masked = (self.img * (1 - self.mask)) + self.mask
+ self.img_paths = input['img_path']
+
+ def forward(self):
+ input_x = paddle.concat([self.img_masked, self.mask], 1)
+ self.pred_img = self.nets['netG'](input_x)
+ self.comp_img = (1 - self.mask) * self.img + self.mask * self.pred_img
+ self.visual_items['pred_img'] = self.pred_img.detach()
+
+ def train_iter(self, optimizers=None):
+ self.forward()
+ l1_loss, perceptual_loss, style_loss = self.net_vgg(self.img, self.pred_img, self.img_size)
+ self.losses['l1'] = l1_loss * self.l1_weight
+ self.losses['perceptual'] = perceptual_loss * self.perceptual_weight
+ self.losses['style'] = style_loss * self.style_weight
+ dis_loss, gen_loss = self.adv_loss(self.nets['netD'], self.comp_img, self.img, self.mask)
+ self.losses['adv_g'] = gen_loss * self.adversal_weight
+ loss_d_fake = dis_loss[0]
+ loss_d_real = dis_loss[1]
+ self.losses['adv_d'] = loss_d_fake + loss_d_real
+
+ loss_g = self.losses['l1'] + self.losses['perceptual'] + self.losses['style'] + self.losses['adv_g']
+ loss_d = self.losses['adv_d']
+
+ self.optimizers['optimG'].clear_grad()
+ self.optimizers['optimD'].clear_grad()
+ loss_g.backward()
+ loss_d.backward()
+ self.optimizers['optimG'].step()
+ self.optimizers['optimD'].step()
+
+ def test_iter(self, metrics=None):
+ self.eval()
+ with paddle.no_grad():
+ self.forward()
+ self.train()
+
+ def setup_optimizers(self, lr, cfg):
+ for opt_name, opt_cfg in cfg.items():
+ if opt_name == 'lr':
+ learning_rate = opt_cfg
+ continue
+ cfg_ = opt_cfg.copy()
+ net_names = cfg_.pop('net_names')
+ parameters = []
+ for net_name in net_names:
+ parameters += self.nets[net_name].parameters()
+ if opt_name == 'optimG':
+ lr = learning_rate * 4
+ else:
+ lr = learning_rate
+ self.optimizers[opt_name] = build_optimizer(
+ cfg_, lr, parameters)
+
+ return self.optimizers
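A toy restatement of the masking conventions used in setup_input() and forward(), with random placeholder tensors (mask value 1 marks a hole to be inpainted):

import paddle

img = paddle.rand([1, 3, 64, 64])                             # ground-truth image
pred_img = paddle.rand([1, 3, 64, 64])                        # generator output
mask = (paddle.rand([1, 1, 64, 64]) > 0.5).astype('float32')  # 1 = hole

img_masked = img * (1 - mask) + mask                          # generator input: holes filled with 1
comp_img = (1 - mask) * img + mask * pred_img                 # composite scored by the losses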
diff --git a/ppgan/models/backbones/__init__.py b/ppgan/models/backbones/__init__.py
index 1c6f371833742fbd303e301d850c2204d50e8db4..9e83ec486fc6232d90d965425588645bc1204386 100644
--- a/ppgan/models/backbones/__init__.py
+++ b/ppgan/models/backbones/__init__.py
@@ -11,5 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
-from .resnet_backbone import resnet18, resnet34, resnet50, resnet101, resnet152
diff --git a/ppgan/models/base_model.py b/ppgan/models/base_model.py
old mode 100644
new mode 100755
index e70524715328491cef9430823fc97b6514add7fa..ec6d0336118681a2494859a438018d72f1189e21
--- a/ppgan/models/base_model.py
+++ b/ppgan/models/base_model.py
@@ -19,14 +19,17 @@ import numpy as np
from collections import OrderedDict
from abc import ABC, abstractmethod
+from paddle.jit import to_static
+from paddle.static import InputSpec
+
from .criterions.builder import build_criterion
from ..solver import build_lr_scheduler, build_optimizer
-from ..metrics import build_metric
from ..utils.visual import tensor2img
+from ..utils.logger import get_logger
class BaseModel(ABC):
- """This class is an abstract base class (ABC) for models.
+ r"""This class is an abstract base class (ABC) for models.
To create a subclass, you need to implement the following five functions:
-- <__init__>: initialize the class.
-- : unpack data from dataset and apply preprocessing.
@@ -95,6 +98,9 @@ class BaseModel(ABC):
"""Calculate losses, gradients, and update network weights; called in every training iteration"""
pass
+ def set_total_iter(self, total_iter):
+ self.total_iter = total_iter
+
def test_iter(self, metrics=None):
"""Calculate metrics; called in every test iteration"""
self.eval()
@@ -130,6 +136,7 @@ class BaseModel(ABC):
return self.optimizers
def setup_metrics(self, cfg):
+ from ..metrics import build_metric
if isinstance(list(cfg.values())[0], dict):
for metric_name, cfg_ in cfg.items():
self.metrics[metric_name] = build_metric(cfg_)
@@ -179,3 +186,48 @@ class BaseModel(ABC):
if net is not None:
for param in net.parameters():
param.trainable = requires_grad
+
+ def export_model(self, export_model, output_dir=None, inputs_size=[], export_serving_model=False, model_name=None):
+ inputs_num = 0
+ for net in export_model:
+ input_spec = [
+ paddle.static.InputSpec(shape=inputs_size[inputs_num + i],
+ dtype="float32")
+ for i in range(net["inputs_num"])
+ ]
+ inputs_num = inputs_num + net["inputs_num"]
+ self.nets[net["name"]].export_mode = True
+ static_model = paddle.jit.to_static(self.nets[net["name"]],
+ input_spec=input_spec)
+ if output_dir is None:
+ output_dir = 'inference_model'
+ if model_name is None:
+ model_name = '{}_{}'.format(self.__class__.__name__.lower(),
+ net["name"])
+ paddle.jit.save(
+ static_model,
+ os.path.join(
+ output_dir, model_name))
+ if export_serving_model:
+ from paddle_serving_client.io import inference_model_to_serving
+ model_name = '{}_{}'.format(self.__class__.__name__.lower(),
+ net["name"])
+
+ inference_model_to_serving(
+ dirname=output_dir,
+ serving_server="{}/{}/serving_server".format(output_dir,
+ model_name),
+ serving_client="{}/{}/serving_client".format(output_dir,
+ model_name),
+ model_filename="{}.pdmodel".format(model_name),
+ params_filename="{}.pdiparams".format(model_name))
+
+def apply_to_static(support_to_static, image_shape, model):
+ if support_to_static:
+ specs = None
+ if image_shape is not None:
+ specs = [InputSpec([None] + image_shape)]
+ model = to_static(model, input_spec=specs)
+ logger = get_logger('ppgan')
+ logger.info("Successfully to apply @to_static with specs: {}".format(specs))
+ return model
\ No newline at end of file
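A hedged sketch of the new static-graph helpers: the layer below is a stand-in for one of model.nets, and the export call is shown as a comment because it needs a fully built model; the net name, shapes, and output directory are placeholders:

import paddle.nn as nn
from ppgan.models import apply_to_static

net = nn.Conv2D(3, 3, 3, padding=1)               # stand-in for model.nets['netG']
net = apply_to_static(True, [3, 256, 256], net)   # wraps with paddle.jit.to_static and logs the InputSpec

# export_model() follows the same pattern, one InputSpec per entry of inputs_size:
# model.export_model(export_model=[{'name': 'netG', 'inputs_num': 1}],
#                    output_dir='inference_model',
#                    inputs_size=[[None, 3, 256, 256]])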
diff --git a/ppgan/models/basicvsr_model.py b/ppgan/models/basicvsr_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..54a9b5454842339259143991514cb392511c0014
--- /dev/null
+++ b/ppgan/models/basicvsr_model.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .sr_model import BaseSRModel
+from .generators.iconvsr import EDVRFeatureExtractor
+from .generators.basicvsr import ResidualBlockNoBN, PixelShufflePack, SPyNet
+from ..modules.init import reset_parameters
+from ..utils.visual import tensor2img
+
+
+@MODELS.register()
+class BasicVSRModel(BaseSRModel):
+ """BasicVSR Model.
+
+ Paper: BasicVSR: The Search for Essential Components in Video Super-Resolution and Beyond, CVPR, 2021
+ """
+ def __init__(self, generator, fix_iter, lr_mult, pixel_criterion=None):
+ """Initialize the BasicVSR class.
+
+ Args:
+ generator (dict): config of generator.
+ fix_iter (dict): config of fix_iter.
+ pixel_criterion (dict): config of pixel criterion.
+ """
+ super(BasicVSRModel, self).__init__(generator, pixel_criterion)
+ self.fix_iter = fix_iter
+ self.current_iter = 1
+ self.flag = True
+ self.lr_mult = lr_mult
+ init_basicvsr_weight(self.nets['generator'])
+
+ def setup_input(self, input):
+ self.lq = paddle.to_tensor(input['lq'])
+ self.visual_items['lq'] = self.lq[:, 0, :, :, :]
+ if 'gt' in input:
+ self.gt = paddle.to_tensor(input['gt'])
+ self.visual_items['gt'] = self.gt[:, 0, :, :, :]
+ self.image_paths = input['lq_path']
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_grad()
+ if self.fix_iter:
+ if self.current_iter == 1:
+ print('Train BasicVSR with fixed spynet for', self.fix_iter,
+ 'iters.')
+ for name, param in self.nets['generator'].named_parameters():
+ if 'spynet' in name or 'edvr' in name:
+ param.trainable = False
+ elif self.current_iter >= self.fix_iter + 1 and self.flag:
+ print('Train all the parameters.')
+ for name, param in self.nets['generator'].named_parameters():
+ param.trainable = True
+ if 'spynet' in name:
+ param.optimize_attr['learning_rate'] = self.lr_mult
+ self.flag = False
+ for net in self.nets.values():
+ net.find_unused_parameters = False
+
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output[:, 0, :, :, :]
+ # pixel loss
+ loss_pixel = self.pixel_criterion(self.output, self.gt)
+
+ loss_pixel.backward()
+ optims['optim'].step()
+
+ self.losses['loss_pixel'] = loss_pixel
+
+ self.current_iter += 1
+
+ def test_iter(self, metrics=None):
+ self.gt = self.gt.cpu()
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = output[:, 0, :, :, :].cpu()
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+
+ _, t, _, _, _ = self.gt.shape
+ for i in range(t):
+ out_tensor = output[0, i]
+ gt_tensor = self.gt[0, i]
+ out_img.append(tensor2img(out_tensor, (0., 1.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 1.)))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img, is_seq=True)
+
+
+def init_basicvsr_weight(net):
+ for m in net.children():
+ if hasattr(m,
+ 'weight') and not isinstance(m,
+ (nn.BatchNorm, nn.BatchNorm2D)):
+ reset_parameters(m)
+ continue
+
+ if (not isinstance(m, (ResidualBlockNoBN, PixelShufflePack, SPyNet,
+ EDVRFeatureExtractor))):
+ init_basicvsr_weight(m)
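What the fix_iter / lr_mult schedule in train_iter() boils down to, illustrated on a toy layer (the 0.25 value is illustrative, not taken from a shipped config):

import paddle.nn as nn

layer = nn.Conv2D(3, 3, 3)                        # stands in for the SPyNet / EDVR sub-module
for p in layer.parameters():
    p.trainable = False                           # frozen while current_iter <= fix_iter

for p in layer.parameters():
    p.trainable = True
    p.optimize_attr['learning_rate'] = 0.25       # lr_mult: reduced per-parameter LR once unfrozen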
diff --git a/ppgan/models/criterions/__init__.py b/ppgan/models/criterions/__init__.py
index 4c49542b201205ae4db830366d50e1553a5dc723..6d5bcd43a1fb1e49b178eb612adbc8b450760cbb 100644
--- a/ppgan/models/criterions/__init__.py
+++ b/ppgan/models/criterions/__init__.py
@@ -1,5 +1,14 @@
from .gan_loss import GANLoss
from .perceptual_loss import PerceptualLoss
-from .pixel_loss import L1Loss, MSELoss
+from .pixel_loss import L1Loss, MSELoss, CharbonnierLoss, \
+ CalcStyleEmdLoss, CalcContentReltLoss, \
+ CalcContentLoss, CalcStyleLoss, EdgeLoss, PSNRLoss
+from .photopen_perceptual_loss import PhotoPenPerceptualLoss
+from .gradient_penalty import GradientPenalty
from .builder import build_criterion
+
+from .ssim import SSIM
+from .id_loss import IDLoss
+from .gfpgan_loss import GFPGANGANLoss, GFPGANL1Loss, GFPGANPerceptualLoss
+from .aotgan_perceptual_loss import AOTGANCriterionLoss
diff --git a/ppgan/models/criterions/aotgan_perceptual_loss.py b/ppgan/models/criterions/aotgan_perceptual_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..48b86d9418bfb31cdd5f04997ab5a161b523e908
--- /dev/null
+++ b/ppgan/models/criterions/aotgan_perceptual_loss.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.utils import spectral_norm
+
+from ppgan.utils.download import get_path_from_url
+from .builder import CRITERIONS
+
+# VGG19(ImageNet pretrained)
+class VGG19F(nn.Layer):
+ def __init__(self):
+ super(VGG19F, self).__init__()
+
+ self.feature_0 = nn.Conv2D(3, 64, 3, 1, 1)
+ self.relu_1 = nn.ReLU()
+ self.feature_2 = nn.Conv2D(64, 64, 3, 1, 1)
+ self.relu_3 = nn.ReLU()
+
+ self.mp_4 = nn.MaxPool2D(2, 2, 0)
+ self.feature_5 = nn.Conv2D(64, 128, 3, 1, 1)
+ self.relu_6 = nn.ReLU()
+ self.feature_7 = nn.Conv2D(128, 128, 3, 1, 1)
+ self.relu_8 = nn.ReLU()
+
+ self.mp_9 = nn.MaxPool2D(2, 2, 0)
+ self.feature_10 = nn.Conv2D(128, 256, 3, 1, 1)
+ self.relu_11 = nn.ReLU()
+ self.feature_12 = nn.Conv2D(256, 256, 3, 1, 1)
+ self.relu_13 = nn.ReLU()
+ self.feature_14 = nn.Conv2D(256, 256, 3, 1, 1)
+ self.relu_15 = nn.ReLU()
+ self.feature_16 = nn.Conv2D(256, 256, 3, 1, 1)
+ self.relu_17 = nn.ReLU()
+
+ self.mp_18 = nn.MaxPool2D(2, 2, 0)
+ self.feature_19 = nn.Conv2D(256, 512, 3, 1, 1)
+ self.relu_20 = nn.ReLU()
+ self.feature_21 = nn.Conv2D(512, 512, 3, 1, 1)
+ self.relu_22 = nn.ReLU()
+ self.feature_23 = nn.Conv2D(512, 512, 3, 1, 1)
+ self.relu_24 = nn.ReLU()
+ self.feature_25 = nn.Conv2D(512, 512, 3, 1, 1)
+ self.relu_26 = nn.ReLU()
+
+ self.mp_27 = nn.MaxPool2D(2, 2, 0)
+ self.feature_28 = nn.Conv2D(512, 512, 3, 1, 1)
+ self.relu_29 = nn.ReLU()
+ self.feature_30 = nn.Conv2D(512, 512, 3, 1, 1)
+ self.relu_31 = nn.ReLU()
+ self.feature_32 = nn.Conv2D(512, 512, 3, 1, 1)
+ self.relu_33 = nn.ReLU()
+ self.feature_34 = nn.Conv2D(512, 512, 3, 1, 1)
+ self.relu_35 = nn.ReLU()
+
+ def forward(self, x):
+ x = self.stand(x)
+ feats = []
+ group = []
+ x = self.feature_0(x)
+ x = self.relu_1(x)
+ group.append(x)
+ x = self.feature_2(x)
+ x = self.relu_3(x)
+ group.append(x)
+ feats.append(group)
+
+ group = []
+ x = self.mp_4(x)
+ x = self.feature_5(x)
+ x = self.relu_6(x)
+ group.append(x)
+ x = self.feature_7(x)
+ x = self.relu_8(x)
+ group.append(x)
+ feats.append(group)
+
+ group = []
+ x = self.mp_9(x)
+ x = self.feature_10(x)
+ x = self.relu_11(x)
+ group.append(x)
+ x = self.feature_12(x)
+ x = self.relu_13(x)
+ group.append(x)
+ x = self.feature_14(x)
+ x = self.relu_15(x)
+ group.append(x)
+ x = self.feature_16(x)
+ x = self.relu_17(x)
+ group.append(x)
+ feats.append(group)
+
+ group = []
+ x = self.mp_18(x)
+ x = self.feature_19(x)
+ x = self.relu_20(x)
+ group.append(x)
+ x = self.feature_21(x)
+ x = self.relu_22(x)
+ group.append(x)
+ x = self.feature_23(x)
+ x = self.relu_24(x)
+ group.append(x)
+ x = self.feature_25(x)
+ x = self.relu_26(x)
+ group.append(x)
+ feats.append(group)
+
+ group = []
+ x = self.mp_27(x)
+ x = self.feature_28(x)
+ x = self.relu_29(x)
+ group.append(x)
+ x = self.feature_30(x)
+ x = self.relu_31(x)
+ group.append(x)
+ x = self.feature_32(x)
+ x = self.relu_33(x)
+ group.append(x)
+ x = self.feature_34(x)
+ x = self.relu_35(x)
+ group.append(x)
+ feats.append(group)
+
+ return feats
+
+ def stand(self, x):
+ mean = paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1])
+ std = paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1])
+ y = (x + 1.) / 2.
+ y = (y - mean) / std
+ return y
+
+# l1 loss
+class L1():
+ def __init__(self,):
+ self.calc = nn.L1Loss()
+
+ def __call__(self, x, y):
+ return self.calc(x, y)
+
+# perceptual loss
+class Perceptual():
+ def __init__(self, vgg, weights=[1.0, 1.0, 1.0, 1.0, 1.0]):
+ super(Perceptual, self).__init__()
+ self.vgg = vgg
+ self.criterion = nn.L1Loss()
+ self.weights = weights
+
+ def __call__(self, x, y, img_size):
+ x = F.interpolate(x, (img_size, img_size), mode='bilinear', align_corners=True)
+ y = F.interpolate(y, (img_size, img_size), mode='bilinear', align_corners=True)
+ x_features = self.vgg(x)
+ y_features = self.vgg(y)
+ content_loss = 0.0
+ for i in range(len(self.weights)):
+            # this pretrained vgg19 has no bn layers, so no extra per-layer rate is applied
+            content_loss += self.weights[i] * self.criterion(x_features[i][0], y_features[i][0])
+ return content_loss
+
+# style loss
+class Style():
+ def __init__(self, vgg):
+ super(Style, self).__init__()
+ self.vgg = vgg
+ self.criterion = nn.L1Loss()
+
+ def compute_gram(self, x):
+ b, c, h, w = x.shape
+ f = x.reshape([b, c, w * h])
+ f_T = f.transpose([0, 2, 1])
+ G = paddle.matmul(f, f_T) / (h * w * c)
+ return G
+
+ def __call__(self, x, y, img_size):
+ x = F.interpolate(x, (img_size, img_size), mode='bilinear', align_corners=True)
+ y = F.interpolate(y, (img_size, img_size), mode='bilinear', align_corners=True)
+ x_features = self.vgg(x)
+ y_features = self.vgg(y)
+ style_loss = 0.0
+ blocks = [2, 3, 4, 5]
+ layers = [2, 4, 4, 2]
+ for b, l in list(zip(blocks, layers)):
+ b = b - 1
+ l = l - 1
+ style_loss += self.criterion(self.compute_gram(x_features[b][l]), self.compute_gram(y_features[b][l]))
+ return style_loss
+
+# sum of weighted losses
+@CRITERIONS.register()
+class AOTGANCriterionLoss(nn.Layer):
+ def __init__(self,
+ pretrained,
+ ):
+ super(AOTGANCriterionLoss, self).__init__()
+ self.model = VGG19F()
+ weight_path = get_path_from_url(pretrained)
+ vgg_weight = paddle.load(weight_path)
+ self.model.set_state_dict(vgg_weight)
+ print('PerceptualVGG loaded pretrained weight.')
+ self.l1_loss = L1()
+ self.perceptual_loss = Perceptual(self.model)
+ self.style_loss = Style(self.model)
+
+ def forward(self, img_r, img_f, img_size):
+ l1_loss = self.l1_loss(img_r, img_f)
+ perceptual_loss = self.perceptual_loss(img_r, img_f, img_size)
+ style_loss = self.style_loss(img_r, img_f, img_size)
+
+ return l1_loss, perceptual_loss, style_loss
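A usage sketch for the combined criterion; the pretrained URL is a placeholder for whatever VGG19F-compatible weight file the config supplies:

import paddle
from ppgan.models.criterions import AOTGANCriterionLoss

crit = AOTGANCriterionLoss(pretrained='https://example.com/vgg19_features.pdparams')  # placeholder URL

img_real = paddle.rand([1, 3, 512, 512]) * 2 - 1   # stand() maps [-1, 1] inputs to ImageNet statistics
img_fake = paddle.rand([1, 3, 512, 512]) * 2 - 1
l1, perceptual, style = crit(img_real, img_fake, 512)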
diff --git a/ppgan/models/criterions/gfpgan_loss.py b/ppgan/models/criterions/gfpgan_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e66a93f29df8cf25e3fd8b459cfa20290b3185e
--- /dev/null
+++ b/ppgan/models/criterions/gfpgan_loss.py
@@ -0,0 +1,427 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import math
+import numpy as np
+from collections import OrderedDict
+import os
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.vision.models import vgg
+
+from .builder import CRITERIONS
+from ppgan.utils.download import get_path_from_url
+
+VGG_PRETRAIN_PATH = os.path.join(os.getcwd(), 'pretrain', 'vgg19' + '.pdparams')
+NAMES = {
+ 'vgg11': [
+ 'conv1_1', 'relu1_1', 'pool1', 'conv2_1', 'relu2_1', 'pool2', 'conv3_1',
+ 'relu3_1', 'conv3_2', 'relu3_2', 'pool3', 'conv4_1', 'relu4_1',
+ 'conv4_2', 'relu4_2', 'pool4', 'conv5_1', 'relu5_1', 'conv5_2',
+ 'relu5_2', 'pool5'
+ ],
+ 'vgg13': [
+ 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1',
+ 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', 'conv3_1', 'relu3_1',
+ 'conv3_2', 'relu3_2', 'pool3', 'conv4_1', 'relu4_1', 'conv4_2',
+ 'relu4_2', 'pool4', 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'pool5'
+ ],
+ 'vgg16': [
+ 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1',
+ 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', 'conv3_1', 'relu3_1',
+ 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'pool3', 'conv4_1',
+ 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'pool4',
+ 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3',
+ 'pool5'
+ ],
+ 'vgg19': [
+ 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1',
+ 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', 'conv3_1', 'relu3_1',
+ 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4',
+ 'pool3', 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3',
+ 'relu4_3', 'conv4_4', 'relu4_4', 'pool4', 'conv5_1', 'relu5_1',
+ 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4',
+ 'pool5'
+ ]
+}
+
+
+def insert_bn(names):
+ """Insert bn layer after each conv.
+
+ Args:
+ names (list): The list of layer names.
+
+ Returns:
+ list: The list of layer names with bn layers.
+ """
+ names_bn = []
+ for name in names:
+ names_bn.append(name)
+ if 'conv' in name:
+ position = name.replace('conv', '')
+ names_bn.append('bn' + position)
+ return names_bn
+
+
+class VGGFeatureExtractor(nn.Layer):
+ """VGG network for feature extraction.
+
+ In this implementation, we allow users to choose whether use normalization
+ in the input feature and the type of vgg network. Note that the pretrained
+ path must fit the vgg type.
+
+ Args:
+ layer_name_list (list[str]): Forward function returns the corresponding
+ features according to the layer_name_list.
+ Example: {'relu1_1', 'relu2_1', 'relu3_1'}.
+ vgg_type (str): Set the type of vgg network. Default: 'vgg19'.
+ use_input_norm (bool): If True, normalize the input image. Importantly,
+            the input feature must be in the range [0, 1]. Default: True.
+ range_norm (bool): If True, norm images with range [-1, 1] to [0, 1].
+ Default: False.
+ requires_grad (bool): If true, the parameters of VGG network will be
+ optimized. Default: False.
+ remove_pooling (bool): If true, the max pooling operations in VGG net
+ will be removed. Default: False.
+ pooling_stride (int): The stride of max pooling operation. Default: 2.
+ """
+ def __init__(
+ self,
+ layer_name_list,
+ vgg_type='vgg19',
+ use_input_norm=True,
+ range_norm=False,
+ requires_grad=False,
+ remove_pooling=False,
+ pooling_stride=2,
+ pretrained_url='https://paddlegan.bj.bcebos.com/models/vgg19.pdparams'
+ ):
+ super(VGGFeatureExtractor, self).__init__()
+ self.layer_name_list = layer_name_list
+ self.use_input_norm = use_input_norm
+ self.range_norm = range_norm
+ self.names = NAMES[vgg_type.replace('_bn', '')]
+ if 'bn' in vgg_type:
+ self.names = insert_bn(self.names)
+ max_idx = 0
+ for v in layer_name_list:
+ idx = self.names.index(v)
+ if idx > max_idx:
+ max_idx = idx
+ if os.path.exists(VGG_PRETRAIN_PATH):
+ vgg_net = getattr(vgg, vgg_type)(pretrained=False)
+ weight_path = get_path_from_url(pretrained_url)
+ state_dict = paddle.load(weight_path)
+ vgg_net.set_state_dict(state_dict)
+ else:
+ vgg_net = getattr(vgg, vgg_type)(pretrained=True)
+ features = vgg_net.features[:max_idx + 1]
+ self.vgg_layers = nn.Sequential()
+ for k, v in zip(self.names, features):
+ if 'pool' in k:
+ if remove_pooling:
+ continue
+ else:
+ self.vgg_layers.add_sublayer(
+ k, nn.MaxPool2D(kernel_size=2, stride=pooling_stride))
+ else:
+ self.vgg_layers.add_sublayer(k, v)
+
+ if not requires_grad:
+ self.vgg_layers.eval()
+ for param in self.parameters():
+ param.stop_gradient = True
+ else:
+ self.vgg_layers.train()
+ for param in self.parameters():
+ param.stop_gradient = False
+ if self.use_input_norm:
+ self.register_buffer(
+ 'mean',
+ paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1]))
+ self.register_buffer(
+ 'std',
+ paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1]))
+
+ def forward(self, x, rep=None):
+ """Forward function.
+
+ Args:
+ x (Tensor): Input tensor with shape (n, c, h, w).
+
+ Returns:
+ Tensor: Forward results.
+ """
+ if self.range_norm:
+ x = (x + 1) / 2
+ if self.use_input_norm:
+ x = (x - self.mean) / self.std
+ output = {}
+
+ for name, module in self.vgg_layers.named_children():
+ x = module(x)
+ if name in self.layer_name_list:
+ output[name] = x.clone()
+ return output
+
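+# Usage sketch (illustrative only; `img` is a placeholder NCHW tensor): the
+# extractor returns a dict keyed by the requested layer names.
+#
+#   extractor = VGGFeatureExtractor(layer_name_list=['relu1_1', 'relu2_1'])
+#   feats = extractor(img)  # {'relu1_1': Tensor, 'relu2_1': Tensor}
+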
+
+@CRITERIONS.register()
+class GFPGANPerceptualLoss(nn.Layer):
+ """Perceptual loss with commonly used style loss.
+
+ Args:
+ layer_weights (dict): The weight for each layer of vgg feature.
+ Here is an example: {'conv5_4': 1.}, which means the conv5_4
+ feature layer (before relu5_4) will be extracted with weight
+ 1.0 in calculating losses.
+ vgg_type (str): The type of vgg network used as feature extractor.
+ Default: 'vgg19'.
+ use_input_norm (bool): If True, normalize the input image in vgg.
+ Default: True.
+ range_norm (bool): If True, norm images with range [-1, 1] to [0, 1].
+ Default: False.
+        perceptual_weight (float): If `perceptual_weight > 0`, the perceptual
+            loss will be calculated and multiplied by the weight.
+            Default: 1.0.
+        style_weight (float): If `style_weight > 0`, the style loss will be
+            calculated and multiplied by the weight.
+            Default: 0.
+ criterion (str): Criterion used for perceptual loss. Default: 'l1'.
+ """
+ def __init__(self,
+ layer_weights,
+ vgg_type='vgg19',
+ use_input_norm=True,
+ range_norm=False,
+ perceptual_weight=1.0,
+ style_weight=0.0,
+ criterion='l1'):
+ super(GFPGANPerceptualLoss, self).__init__()
+ self.perceptual_weight = perceptual_weight
+ self.style_weight = style_weight
+ self.layer_weights = layer_weights
+ self.vgg = VGGFeatureExtractor(layer_name_list=list(
+ layer_weights.keys()),
+ vgg_type=vgg_type,
+ use_input_norm=use_input_norm,
+ range_norm=range_norm)
+ self.criterion_type = criterion
+ if self.criterion_type == 'l1':
+ self.criterion = paddle.nn.L1Loss()
+ elif self.criterion_type == 'fro':
+ self.criterion = None
+ else:
+ raise NotImplementedError(
+ f'{criterion} criterion has not been supported.')
+
+ def forward(self, x, gt, rep=None):
+ """Forward function.
+
+ Args:
+ x (Tensor): Input tensor with shape (n, c, h, w).
+ gt (Tensor): Ground-truth tensor with shape (n, c, h, w).
+
+ Returns:
+ Tensor: Forward results.
+ """
+ x_features = self.vgg(x, rep)
+ gt_features = self.vgg(gt.detach())
+ if self.perceptual_weight > 0:
+ percep_loss = 0
+ for k in x_features.keys():
+ if self.criterion_type == 'fro':
+ percep_loss += paddle.linalg.norm(
+ x_features[k] - gt_features[k],
+ p='fro') * self.layer_weights[k]
+ else:
+ percep_loss += self.criterion(
+ x_features[k], gt_features[k]) * self.layer_weights[k]
+ percep_loss *= self.perceptual_weight
+ else:
+ percep_loss = None
+ if self.style_weight > 0:
+ style_loss = 0
+ for k in x_features.keys():
+ if self.criterion_type == 'fro':
+ style_loss += paddle.linalg.norm(
+ self._gram_mat(x_features[k]) -
+ self._gram_mat(gt_features[k]),
+ p='fro') * self.layer_weights[k]
+ else:
+ style_loss += self.criterion(
+ self._gram_mat(x_features[k]),
+ self._gram_mat(gt_features[k])) * self.layer_weights[k]
+ style_loss *= self.style_weight
+ else:
+ style_loss = None
+ return percep_loss, style_loss
+
+ def _gram_mat(self, x):
+ """Calculate Gram matrix.
+
+ Args:
+            x (Tensor): Tensor with shape of (n, c, h, w).
+
+        Returns:
+            Tensor: Gram matrix.
+ """
+ (n, c, h, w) = x.shape
+ features = x.reshape([n, c, w * h])
+ features_t = features.transpose([0, 2, 1])
+ gram = features.bmm(features_t) / (c * h * w)
+ return gram
+
+
+@CRITERIONS.register()
+class GFPGANGANLoss(nn.Layer):
+ """Define GAN loss.
+
+ Args:
+ gan_type (str): Support 'vanilla', 'lsgan', 'wgan', 'hinge'.
+ real_label_val (float): The value for real label. Default: 1.0.
+ fake_label_val (float): The value for fake label. Default: 0.0.
+ loss_weight (float): Loss weight. Default: 1.0.
+            Note that loss_weight is only for generators; it is always 1.0
+            for discriminators.
+ """
+ def __init__(self,
+ gan_type,
+ real_label_val=1.0,
+ fake_label_val=0.0,
+ loss_weight=1.0):
+ super(GFPGANGANLoss, self).__init__()
+ self.gan_type = gan_type
+ self.loss_weight = loss_weight
+ self.real_label_val = real_label_val
+ self.fake_label_val = fake_label_val
+
+ if self.gan_type == 'vanilla':
+ self.loss = nn.BCEWithLogitsLoss()
+ elif self.gan_type == 'lsgan':
+ self.loss = nn.MSELoss()
+ elif self.gan_type == 'wgan':
+ self.loss = self._wgan_loss
+ elif self.gan_type == 'wgan_softplus':
+ self.loss = self._wgan_softplus_loss
+ elif self.gan_type == 'hinge':
+ self.loss = nn.ReLU()
+ else:
+ raise NotImplementedError(
+ f'GAN type {self.gan_type} is not implemented.')
+
+ def _wgan_loss(self, input, target):
+ """wgan loss.
+
+ Args:
+ input (Tensor): Input tensor.
+ target (bool): Target label.
+
+ Returns:
+ Tensor: wgan loss.
+ """
+ return -input.mean() if target else input.mean()
+
+ def _wgan_softplus_loss(self, input, target):
+ """wgan loss with soft plus. softplus is a smooth approximation to the
+ ReLU function.
+
+ In StyleGAN2, it is called:
+ Logistic loss for discriminator;
+ Non-saturating loss for generator.
+
+ Args:
+ input (Tensor): Input tensor.
+ target (bool): Target label.
+
+ Returns:
+ Tensor: wgan loss.
+ """
+
+ return F.softplus(-1.0 *
+ input).mean() if target else F.softplus(input).mean()
+
+ def get_target_label(self, input, target_is_real):
+ """Get target label.
+
+ Args:
+ input (Tensor): Input tensor.
+ target_is_real (bool): Whether the target is real or fake.
+
+ Returns:
+ (bool | Tensor): Target tensor. Return bool for wgan, otherwise,
+ return Tensor.
+ """
+
+ if self.gan_type in ['wgan', 'wgan_softplus']:
+ return target_is_real
+ target_val = (self.real_label_val
+ if target_is_real else self.fake_label_val)
+ return paddle.ones(input.shape, dtype=input.dtype) * target_val
+
+ def forward(self, input, target_is_real, is_disc=False):
+ """
+ Args:
+ input (Tensor): The input for the loss module, i.e., the network
+ prediction.
+            target_is_real (bool): Whether the target is real or fake.
+ is_disc (bool): Whether the loss for discriminators or not.
+ Default: False.
+
+ Returns:
+ Tensor: GAN loss value.
+ """
+ target_label = self.get_target_label(input, target_is_real)
+ if self.gan_type == 'hinge':
+ if is_disc: # for discriminators in hinge-gan
+ input = -input if target_is_real else input
+ loss = self.loss(1 + input).mean()
+ else: # for generators in hinge-gan
+ loss = -input.mean()
+ else: # other gan types
+ loss = self.loss(input, target_label)
+
+ # loss_weight is always 1.0 for discriminators
+ return loss if is_disc else loss * self.loss_weight
+
+
+@CRITERIONS.register()
+class GFPGANL1Loss(nn.Layer):
+ """L1 (mean absolute error, MAE) loss.
+
+ Args:
+ loss_weight (float): Loss weight for L1 loss. Default: 1.0.
+ reduction (str): Specifies the reduction to apply to the output.
+ Supported choices are 'none' | 'mean' | 'sum'. Default: 'mean'.
+ """
+ def __init__(self, loss_weight=1.0, reduction='mean'):
+ super(GFPGANL1Loss, self).__init__()
+ if reduction not in ['none', 'mean', 'sum']:
+ raise ValueError(
+ f'Unsupported reduction mode: {reduction}. Supported ones are: "none" | "mean" | "sum"'
+ )
+
+ self.loss_weight = loss_weight
+ self.l1_loss = paddle.nn.L1Loss(reduction)
+
+ def forward(self, pred, target):
+ """
+ Args:
+ pred (Tensor): of shape (N, C, H, W). Predicted tensor.
+ target (Tensor): of shape (N, C, H, W). Ground truth tensor.
+ """
+ return self.loss_weight * self.l1_loss(pred, target)
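+
+
+# Usage sketch (illustrative only; `fake`, `gt` and `d_pred_fake` are
+# placeholders and the weights shown are not mandated by this module): a
+# GFPGAN-style generator objective combines the three criteria above, e.g.
+#
+#   percep_criterion = GFPGANPerceptualLoss(layer_weights={'conv5_4': 1.0})
+#   gan_criterion = GFPGANGANLoss(gan_type='wgan_softplus')
+#   l1_criterion = GFPGANL1Loss()
+#
+#   percep_loss, style_loss = percep_criterion(fake, gt)
+#   g_loss = l1_criterion(fake, gt) + percep_loss
+#   g_loss = g_loss + gan_criterion(d_pred_fake, True, is_disc=False)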
diff --git a/ppgan/models/criterions/gradient_penalty.py b/ppgan/models/criterions/gradient_penalty.py
new file mode 100755
index 0000000000000000000000000000000000000000..22500ec6fb9a88780a5de2e88b224327e2781074
--- /dev/null
+++ b/ppgan/models/criterions/gradient_penalty.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+from .builder import CRITERIONS
+
+
+@CRITERIONS.register()
+class GradientPenalty():
+ def __init__(self, loss_weight=1.0):
+ self.loss_weight = loss_weight
+
+ def __call__(self, net, real, fake):
+ batch_size = real.shape[0]
+ alpha = paddle.rand([batch_size])
+ for _ in range(real.ndim - 1):
+ alpha = paddle.unsqueeze(alpha, -1)
+ interpolate = alpha * real + (1 - alpha) * fake
+ interpolate.stop_gradient = False
+ interpolate_pred = net(interpolate)
+ gradient = paddle.grad(outputs=interpolate_pred,
+ inputs=interpolate,
+ grad_outputs=paddle.ones_like(interpolate_pred),
+ create_graph=True,
+ retain_graph=True,
+ only_inputs=True)[0]
+ gradient_penalty = ((gradient.norm(2, 1) - 1) ** 2).mean()
+ return gradient_penalty * self.loss_weight
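+
+
+# Usage sketch (illustrative only; `disc`, `real_imgs` and `fake_imgs` are
+# placeholders): the penalty is evaluated on interpolates between real and
+# generated batches when training the discriminator, e.g.
+#
+#   gp_criterion = GradientPenalty(loss_weight=10.0)
+#   d_loss = d_loss_real + d_loss_fake + gp_criterion(disc, real_imgs,
+#                                                     fake_imgs.detach())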
diff --git a/ppgan/models/criterions/id_loss.py b/ppgan/models/criterions/id_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f76a3140b8d8847e1e5e1eeb6ce8fae3113f74e
--- /dev/null
+++ b/ppgan/models/criterions/id_loss.py
@@ -0,0 +1,255 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from collections import namedtuple
+
+import paddle
+import paddle.nn as nn
+from paddle.vision.transforms import Resize
+
+from .builder import CRITERIONS
+from ppgan.utils.download import get_path_from_url
+
+model_cfgs = {
+ 'model_urls':
+ 'https://paddlegan.bj.bcebos.com/models/model_ir_se50.pdparams',
+}
+
+class Flatten(nn.Layer):
+
+ def forward(self, input):
+ return paddle.reshape(input, [input.shape[0], -1])
+
+
+def l2_norm(input, axis=1):
+ norm = paddle.norm(input, 2, axis, True)
+ output = paddle.divide(input, norm)
+ return output
+
+
+class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])):
+ """ A named tuple describing a ResNet block. """
+
+
+def get_block(in_channel, depth, num_units, stride=2):
+ return [Bottleneck(in_channel, depth, stride)
+ ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]
+
+
+def get_blocks(num_layers):
+ if num_layers == 50:
+ blocks = [
+ get_block(in_channel=64, depth=64, num_units=3),
+ get_block(in_channel=64, depth=128, num_units=4),
+ get_block(in_channel=128, depth=256, num_units=14),
+ get_block(in_channel=256, depth=512, num_units=3)
+ ]
+ elif num_layers == 100:
+ blocks = [
+ get_block(in_channel=64, depth=64, num_units=3),
+ get_block(in_channel=64, depth=128, num_units=13),
+ get_block(in_channel=128, depth=256, num_units=30),
+ get_block(in_channel=256, depth=512, num_units=3)
+ ]
+ elif num_layers == 152:
+ blocks = [
+ get_block(in_channel=64, depth=64, num_units=3),
+ get_block(in_channel=64, depth=128, num_units=8),
+ get_block(in_channel=128, depth=256, num_units=36),
+ get_block(in_channel=256, depth=512, num_units=3)
+ ]
+ else:
+ raise ValueError(
+ "Invalid number of layers: {}. Must be one of [50, 100, 152]".
+ format(num_layers))
+ return blocks
+
+
+class SEModule(nn.Layer):
+
+ def __init__(self, channels, reduction):
+ super(SEModule, self).__init__()
+ self.avg_pool = nn.AdaptiveAvgPool2D(1)
+ self.fc1 = nn.Conv2D(channels,
+ channels // reduction,
+ kernel_size=1,
+ padding=0,
+ bias_attr=False)
+ self.relu = nn.ReLU()
+ self.fc2 = nn.Conv2D(channels // reduction,
+ channels,
+ kernel_size=1,
+ padding=0,
+ bias_attr=False)
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, x):
+ module_input = x
+ x = self.avg_pool(x)
+ x = self.fc1(x)
+ x = self.relu(x)
+ x = self.fc2(x)
+ x = self.sigmoid(x)
+ return module_input * x
+
+
+class bottleneck_IR(nn.Layer):
+
+ def __init__(self, in_channel, depth, stride):
+ super(bottleneck_IR, self).__init__()
+ if in_channel == depth:
+ self.shortcut_layer = nn.MaxPool2D(1, stride)
+ else:
+ self.shortcut_layer = nn.Sequential(
+ nn.Conv2D(in_channel, depth, (1, 1), stride, bias_attr=False),
+ nn.BatchNorm2D(depth))
+ self.res_layer = nn.Sequential(
+ nn.BatchNorm2D(in_channel),
+ nn.Conv2D(in_channel, depth, (3, 3), (1, 1), 1, bias_attr=False),
+ nn.PReLU(depth),
+ nn.Conv2D(depth, depth, (3, 3), stride, 1, bias_attr=False),
+ nn.BatchNorm2D(depth))
+
+ def forward(self, x):
+ shortcut = self.shortcut_layer(x)
+ res = self.res_layer(x)
+ return res + shortcut
+
+
+class bottleneck_IR_SE(nn.Layer):
+
+ def __init__(self, in_channel, depth, stride):
+ super(bottleneck_IR_SE, self).__init__()
+ if in_channel == depth:
+ self.shortcut_layer = nn.MaxPool2D(1, stride)
+ else:
+ self.shortcut_layer = nn.Sequential(
+ nn.Conv2D(in_channel, depth, (1, 1), stride, bias_attr=False),
+ nn.BatchNorm2D(depth))
+ self.res_layer = nn.Sequential(
+ nn.BatchNorm2D(in_channel),
+ nn.Conv2D(in_channel, depth, (3, 3), (1, 1), 1, bias_attr=False),
+ nn.PReLU(depth),
+ nn.Conv2D(depth, depth, (3, 3), stride, 1, bias_attr=False),
+ nn.BatchNorm2D(depth), SEModule(depth, 16))
+
+ def forward(self, x):
+ shortcut = self.shortcut_layer(x)
+ res = self.res_layer(x)
+ return res + shortcut
+
+"""
+Modified Backbone implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch)
+"""
+
+class Backbone(nn.Layer):
+
+ def __init__(self,
+ input_size,
+ num_layers,
+ mode='ir',
+ drop_ratio=0.4,
+ affine=True):
+ super(Backbone, self).__init__()
+ assert input_size in [112, 224], "input_size should be 112 or 224"
+ assert num_layers in [50, 100,
+ 152], "num_layers should be 50, 100 or 152"
+ assert mode in ['ir', 'ir_se'], "mode should be ir or ir_se"
+ blocks = get_blocks(num_layers)
+ if mode == 'ir':
+ unit_module = bottleneck_IR
+ elif mode == 'ir_se':
+ unit_module = bottleneck_IR_SE
+ self.input_layer = paddle.nn.Sequential(
+ nn.Conv2D(3, 64, (3, 3), 1, 1, bias_attr=False), nn.BatchNorm2D(64),
+ nn.PReLU(64))
+ if input_size == 112:
+ self.output_layer = nn.Sequential(nn.BatchNorm2D(512),
+ nn.Dropout(drop_ratio), Flatten(),
+ nn.Linear(512 * 7 * 7, 512),
+ nn.BatchNorm1D(512))
+ else:
+ self.output_layer = nn.Sequential(nn.BatchNorm2D(512),
+ nn.Dropout(drop_ratio), Flatten(),
+ nn.Linear(512 * 14 * 14, 512),
+ nn.BatchNorm1D(512))
+
+ modules = []
+ for block in blocks:
+ for bottleneck in block:
+ modules.append(
+ unit_module(bottleneck.in_channel, bottleneck.depth,
+ bottleneck.stride))
+ self.body = nn.Sequential(*modules)
+
+ def forward(self, x):
+ x = self.input_layer(x)
+ x = self.body(x)
+ x = self.output_layer(x)
+ return l2_norm(x)
+
+
+@CRITERIONS.register()
+class IDLoss(paddle.nn.Layer):
+
+ def __init__(self, base_dir='./'):
+ super(IDLoss, self).__init__()
+ print('Loading ResNet ArcFace')
+ self.facenet = Backbone(input_size=112,
+ num_layers=50,
+ drop_ratio=0.6,
+ mode='ir_se')
+
+ facenet_weights_path = os.path.join(base_dir, 'data/gpen/weights',
+ 'model_ir_se50.pdparams')
+
+ if not os.path.isfile(facenet_weights_path):
+ facenet_weights_path = get_path_from_url(model_cfgs['model_urls'])
+
+ self.facenet.load_dict(paddle.load(facenet_weights_path))
+
+ self.face_pool = paddle.nn.AdaptiveAvgPool2D((112, 112))
+ self.facenet.eval()
+
+ def extract_feats(self, x):
+ _, _, h, w = x.shape
+ assert h == w
+ ss = h // 256
+ x = x[:, :, 35 * ss:-33 * ss, 32 * ss:-36 * ss]
+ transform = Resize(size=(112, 112))
+
+ for num in range(x.shape[0]):
+ mid_feats = transform(x[num]).unsqueeze(0)
+ if num == 0:
+ x_feats = mid_feats
+ else:
+ x_feats = paddle.concat([x_feats, mid_feats], axis=0)
+
+ x_feats = self.facenet(x_feats)
+ return x_feats
+
+ def forward(self, y_hat, y, x):
+ n_samples = x.shape[0]
+ y_feats = self.extract_feats(y)
+ y_hat_feats = self.extract_feats(y_hat)
+ y_feats = y_feats.detach()
+ loss = 0
+ count = 0
+ for i in range(n_samples):
+ diff_target = y_hat_feats[i].dot(y_feats[i])
+ loss += 1 - diff_target
+ count += 1
+
+ return loss / count
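+
+
+# Usage sketch (illustrative only; `restored`, `gt` and `lq` are placeholder
+# batches of face images): IDLoss penalises the cosine dissimilarity between
+# ArcFace embeddings of the restored output and the ground truth.
+#
+#   id_criterion = IDLoss(base_dir='./')
+#   loss_id = id_criterion(restored, gt, lq)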
diff --git a/ppgan/models/criterions/perceptual_loss.py b/ppgan/models/criterions/perceptual_loss.py
index 2c57edc46590eff3cac358c22694aa91567f3ad5..194723829bee052e0244ca7429c3f09c8102bf1e 100644
--- a/ppgan/models/criterions/perceptual_loss.py
+++ b/ppgan/models/criterions/perceptual_loss.py
@@ -1,3 +1,4 @@
+# Copyright (c) MMEditing Authors.
import paddle
import paddle.nn as nn
import paddle.vision.models.vgg as vgg
@@ -120,7 +121,7 @@ class PerceptualLoss(nn.Layer):
perceptual_weight=1.0,
style_weight=1.0,
norm_img=True,
- pretrained='https://paddlegan.bj.bcebos.com/model/vgg19.pdparams',
+ pretrained='https://paddlegan.bj.bcebos.com/models/vgg19.pdparams',
criterion='l1'):
super(PerceptualLoss, self).__init__()
# when loss weight less than zero return None
diff --git a/ppgan/models/criterions/photopen_perceptual_loss.py b/ppgan/models/criterions/photopen_perceptual_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..f90ed465754caeb17627073b0403413b16ca7509
--- /dev/null
+++ b/ppgan/models/criterions/photopen_perceptual_loss.py
@@ -0,0 +1,150 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle.vision.models.vgg as vgg
+from paddle import ParamAttr
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+
+from ppgan.utils.download import get_path_from_url
+from .builder import CRITERIONS
+
+class ConvBlock(nn.Layer):
+ def __init__(self, input_channels, output_channels, groups, name=None):
+ super(ConvBlock, self).__init__()
+
+ self.groups = groups
+ self._conv_1 = Conv2D(
+ in_channels=input_channels,
+ out_channels=output_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False)
+ if groups == 2 or groups == 3 or groups == 4:
+ self._conv_2 = Conv2D(
+ in_channels=output_channels,
+ out_channels=output_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False)
+ if groups == 3 or groups == 4:
+ self._conv_3 = Conv2D(
+ in_channels=output_channels,
+ out_channels=output_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False)
+ if groups == 4:
+ self._conv_4 = Conv2D(
+ in_channels=output_channels,
+ out_channels=output_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False)
+
+ self._pool = MaxPool2D(kernel_size=2, stride=2, padding=0)
+
+ def forward(self, inputs):
+ x = self._conv_1(inputs)
+ x = F.relu(x)
+ if self.groups == 2 or self.groups == 3 or self.groups == 4:
+ x = self._conv_2(x)
+ x = F.relu(x)
+ if self.groups == 3 or self.groups == 4:
+ x = self._conv_3(x)
+ x = F.relu(x)
+ if self.groups == 4:
+ x = self._conv_4(x)
+ x = F.relu(x)
+ x = self._pool(x)
+ return x
+
+class VGG19(nn.Layer):
+ def __init__(self, layers=19, class_dim=1000):
+ super(VGG19, self).__init__()
+
+ self.layers = layers
+ self.vgg_configure = {
+ 11: [1, 1, 2, 2, 2],
+ 13: [2, 2, 2, 2, 2],
+ 16: [2, 2, 3, 3, 3],
+ 19: [2, 2, 4, 4, 4]
+ }
+ assert self.layers in self.vgg_configure.keys(), \
+ "supported layers are {} but input layer is {}".format(
+                self.vgg_configure.keys(), layers)
+ self.groups = self.vgg_configure[self.layers]
+
+ self._conv_block_1 = ConvBlock(3, 64, self.groups[0], name="conv1_")
+ self._conv_block_2 = ConvBlock(64, 128, self.groups[1], name="conv2_")
+ self._conv_block_3 = ConvBlock(128, 256, self.groups[2], name="conv3_")
+ self._conv_block_4 = ConvBlock(256, 512, self.groups[3], name="conv4_")
+ self._conv_block_5 = ConvBlock(512, 512, self.groups[4], name="conv5_")
+
+ self._drop = Dropout(p=0.5, mode="downscale_in_infer")
+ self._fc1 = Linear(
+ 7 * 7 * 512,
+ 4096,)
+ self._fc2 = Linear(
+ 4096,
+ 4096,)
+ self._out = Linear(
+ 4096,
+ class_dim,)
+
+ def forward(self, inputs):
+ features = []
+ features.append(inputs)
+ x = self._conv_block_1(inputs)
+ features.append(x)
+ x = self._conv_block_2(x)
+ features.append(x)
+ x = self._conv_block_3(x)
+ features.append(x)
+ x = self._conv_block_4(x)
+ features.append(x)
+ x = self._conv_block_5(x)
+
+ x = paddle.reshape(x, [0, -1])
+ x = self._fc1(x)
+ x = F.relu(x)
+ x = self._drop(x)
+ x = self._fc2(x)
+ x = F.relu(x)
+ x = self._drop(x)
+ x = self._out(x)
+ return x, features
+
+@CRITERIONS.register()
+class PhotoPenPerceptualLoss(nn.Layer):
+ def __init__(self,
+ crop_size,
+ lambda_vgg,
+# pretrained='test/vgg19pretrain.pdparams',
+ pretrained='https://paddlegan.bj.bcebos.com/models/vgg19pretrain.pdparams',
+ ):
+ super(PhotoPenPerceptualLoss, self).__init__()
+ self.model = VGG19()
+ weight_path = get_path_from_url(pretrained)
+ vgg_weight = paddle.load(weight_path)
+ self.model.set_state_dict(vgg_weight)
+ print('PerceptualVGG loaded pretrained weight.')
+ self.rates = [1.0 / 32, 1.0 / 16, 1.0 / 8, 1.0 / 4, 1.0]
+ self.crop_size = crop_size
+ self.lambda_vgg = lambda_vgg
+
+ def forward(self, img_r, img_f):
+ img_r = F.interpolate(img_r, (self.crop_size, self.crop_size))
+ img_f = F.interpolate(img_f, (self.crop_size, self.crop_size))
+ _, feat_r = self.model(img_r)
+ _, feat_f = self.model(img_f)
+ g_vggloss = paddle.to_tensor(0.)
+ for i in range(len(feat_r)):
+ g_vggloss += self.rates[i] * nn.L1Loss()(feat_r[i], feat_f[i])
+ g_vggloss *= self.lambda_vgg
+
+ return g_vggloss
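+
+
+# Usage sketch (illustrative only; the argument values are placeholders, not
+# defaults of this module):
+#
+#   vgg_criterion = PhotoPenPerceptualLoss(crop_size=224, lambda_vgg=10.0)
+#   g_vggloss = vgg_criterion(img_real, img_fake)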
diff --git a/ppgan/models/criterions/pixel_loss.py b/ppgan/models/criterions/pixel_loss.py
index 4c949766d8d023e001c8a7c1ebabb6b56897f488..1e103748fb8db16389cd37dc9c3a9d885e7e1544 100644
--- a/ppgan/models/criterions/pixel_loss.py
+++ b/ppgan/models/criterions/pixel_loss.py
@@ -13,9 +13,11 @@
# limitations under the License.
import numpy as np
+from ..generators.generater_lapstyle import calc_mean_std, mean_variance_norm
import paddle
import paddle.nn as nn
+import paddle.nn.functional as F
from .builder import CRITERIONS
@@ -29,6 +31,7 @@ class L1Loss():
loss_weight (float): Loss weight for L1 loss. Default: 1.0.
"""
+
def __init__(self, reduction='mean', loss_weight=1.0):
# when loss weight less than zero return None
if loss_weight <= 0:
@@ -49,6 +52,36 @@ class L1Loss():
return self.loss_weight * self._l1_loss(pred, target)
+@CRITERIONS.register()
+class CharbonnierLoss():
+ """Charbonnier Loss (L1).
+
+ Args:
+        eps (float): A small constant added for numerical stability.
+            Default: 1e-12.
+        reduction (str): Reduction applied to the output, 'mean' or 'sum'.
+            Default: 'sum'.
+    """
+
+ def __init__(self, eps=1e-12, reduction='sum'):
+ self.eps = eps
+ self.reduction = reduction
+
+ def __call__(self, pred, target, **kwargs):
+ """Forward Function.
+
+ Args:
+ pred (Tensor): of shape (N, C, H, W). Predicted tensor.
+ target (Tensor): of shape (N, C, H, W). Ground truth tensor.
+ """
+ if self.reduction == 'sum':
+ out = paddle.sum(paddle.sqrt((pred - target)**2 + self.eps))
+ elif self.reduction == 'mean':
+ out = paddle.mean(paddle.sqrt((pred - target)**2 + self.eps))
+ else:
+ raise NotImplementedError('CharbonnierLoss %s not implemented' %
+ self.reduction)
+ return out
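+
+# Note: the Charbonnier penalty sqrt((pred - target)**2 + eps) is a smooth,
+# everywhere-differentiable approximation of the L1 loss; eps only matters
+# for residuals near zero. For example, with eps = 1e-12 a residual of 0.1
+# contributes sqrt(0.01 + 1e-12) ~= 0.1, while a zero residual contributes
+# sqrt(eps) = 1e-6 instead of exactly 0.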
+
+
@CRITERIONS.register()
class MSELoss():
"""MSE (L2) loss.
@@ -59,6 +92,7 @@ class MSELoss():
loss_weight (float): Loss weight for MSE loss. Default: 1.0.
"""
+
def __init__(self, reduction='mean', loss_weight=1.0):
# when loss weight less than zero return None
if loss_weight <= 0:
@@ -88,6 +122,7 @@ class BCEWithLogitsLoss():
Supported choices are 'none' | 'mean' | 'sum'. Default: 'mean'.
loss_weight (float): Loss weight for MSE loss. Default: 1.0.
"""
+
def __init__(self, reduction='mean', loss_weight=1.0):
# when loss weight less than zero return None
if loss_weight <= 0:
@@ -106,3 +141,166 @@ class BCEWithLogitsLoss():
weights. Default: None.
"""
return self.loss_weight * self._bce_loss(pred, target)
+
+
+def calc_emd_loss(pred, target):
+ """calculate emd loss.
+
+ Args:
+ pred (Tensor): of shape (N, C, H, W). Predicted tensor.
+ target (Tensor): of shape (N, C, H, W). Ground truth tensor.
+ """
+ b, _, h, w = pred.shape
+ pred = pred.reshape([b, -1, w * h])
+ pred_norm = paddle.sqrt((pred**2).sum(1).reshape([b, -1, 1]))
+ pred = pred.transpose([0, 2, 1])
+ target_t = target.reshape([b, -1, w * h])
+ target_norm = paddle.sqrt((target**2).sum(1).reshape([b, 1, -1]))
+ similarity = paddle.bmm(pred, target_t) / pred_norm / target_norm
+ dist = 1. - similarity
+ return dist
+
+
+@CRITERIONS.register()
+class CalcStyleEmdLoss():
+ """Calc Style Emd Loss.
+ """
+
+ def __init__(self):
+ super(CalcStyleEmdLoss, self).__init__()
+
+ def __call__(self, pred, target):
+ """Forward Function.
+
+ Args:
+ pred (Tensor): of shape (N, C, H, W). Predicted tensor.
+ target (Tensor): of shape (N, C, H, W). Ground truth tensor.
+ """
+ CX_M = calc_emd_loss(pred, target)
+ m1 = CX_M.min(2)
+ m2 = CX_M.min(1)
+ m = paddle.concat([m1.mean(), m2.mean()])
+ loss_remd = paddle.max(m)
+ return loss_remd
+
+
+@CRITERIONS.register()
+class CalcContentReltLoss():
+ """Calc Content Relt Loss.
+ """
+
+ def __init__(self):
+ super(CalcContentReltLoss, self).__init__()
+
+ def __call__(self, pred, target):
+ """Forward Function.
+
+ Args:
+ pred (Tensor): of shape (N, C, H, W). Predicted tensor.
+ target (Tensor): of shape (N, C, H, W). Ground truth tensor.
+ """
+ dM = 1.
+ Mx = calc_emd_loss(pred, pred)
+ Mx = Mx / Mx.sum(1, keepdim=True)
+ My = calc_emd_loss(target, target)
+ My = My / My.sum(1, keepdim=True)
+ loss_content = paddle.abs(
+ dM * (Mx - My)).mean() * pred.shape[2] * pred.shape[3]
+ return loss_content
+
+
+@CRITERIONS.register()
+class CalcContentLoss():
+ """Calc Content Loss.
+ """
+
+ def __init__(self):
+ self.mse_loss = nn.MSELoss()
+
+ def __call__(self, pred, target, norm=False):
+ """Forward Function.
+
+ Args:
+ pred (Tensor): of shape (N, C, H, W). Predicted tensor.
+ target (Tensor): of shape (N, C, H, W). Ground truth tensor.
+            norm (bool): whether to apply mean_variance_norm to pred and target
+        """
+        if not norm:
+            return self.mse_loss(pred, target)
+        else:
+            return self.mse_loss(mean_variance_norm(pred),
+ mean_variance_norm(target))
+
+
+@CRITERIONS.register()
+class CalcStyleLoss():
+ """Calc Style Loss.
+ """
+
+ def __init__(self):
+ self.mse_loss = nn.MSELoss()
+
+ def __call__(self, pred, target):
+ """Forward Function.
+
+ Args:
+ pred (Tensor): of shape (N, C, H, W). Predicted tensor.
+ target (Tensor): of shape (N, C, H, W). Ground truth tensor.
+ """
+ pred_mean, pred_std = calc_mean_std(pred)
+ target_mean, target_std = calc_mean_std(target)
+ return self.mse_loss(pred_mean, target_mean) + self.mse_loss(
+ pred_std, target_std)
+
+
+@CRITERIONS.register()
+class EdgeLoss():
+
+ def __init__(self):
+ k = paddle.to_tensor([[.05, .25, .4, .25, .05]])
+ self.kernel = paddle.matmul(k.t(), k).unsqueeze(0).tile([3, 1, 1, 1])
+ self.loss = CharbonnierLoss()
+
+ def conv_gauss(self, img):
+ n_channels, _, kw, kh = self.kernel.shape
+ img = F.pad(img, [kw // 2, kh // 2, kw // 2, kh // 2], mode='replicate')
+ return F.conv2d(img, self.kernel, groups=n_channels)
+
+ def laplacian_kernel(self, current):
+ filtered = self.conv_gauss(current) # filter
+ down = filtered[:, :, ::2, ::2] # downsample
+ new_filter = paddle.zeros_like(filtered)
+ new_filter.stop_gradient = True
+ new_filter[:, :, ::2, ::2] = down * 4 # upsample
+ filtered = self.conv_gauss(new_filter) # filter
+ diff = current - filtered
+ return diff
+
+ def __call__(self, x, y):
+ y.stop_gradient = True
+ loss = self.loss(self.laplacian_kernel(x), self.laplacian_kernel(y))
+ return loss
+
+
+@CRITERIONS.register()
+class PSNRLoss(nn.Layer):
+
+ def __init__(self, loss_weight=1.0, reduction='mean', toY=False):
+ super(PSNRLoss, self).__init__()
+ assert reduction == 'mean'
+ self.loss_weight = loss_weight
+ self.scale = 10 / np.log(10)
+ self.toY = toY
+        self.coef = paddle.to_tensor(np.array([65.481, 128.553, 24.966]),
+                                     dtype='float32').reshape([1, 3, 1, 1])
+
+ def forward(self, pred, target):
+ if self.toY:
+ pred = (pred * self.coef).sum(axis=1).unsqueeze(axis=1) + 16.
+ target = (target * self.coef).sum(axis=1).unsqueeze(axis=1) + 16.
+
+            pred, target = pred / 255., target / 255.
+
+ return self.loss_weight * self.scale * paddle.log((
+ (pred - target)**2).mean(axis=[1, 2, 3]) + 1e-8).mean()
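+
+
+# Note: with scale = 10 / ln(10), the per-image value above equals
+# loss_weight * 10 * log10(MSE(pred, target) + 1e-8); for images in [0, 1]
+# this is the negative PSNR, so minimising this loss maximises PSNR.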
diff --git a/ppgan/models/criterions/ssim.py b/ppgan/models/criterions/ssim.py
new file mode 100644
index 0000000000000000000000000000000000000000..33c3c41babaa2b99648d089922cb46208f3a5210
--- /dev/null
+++ b/ppgan/models/criterions/ssim.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# code was heavily based on https://github.com/csdwren/PReNet
+# Users should be careful about adopting these functions in any commercial matters.
+
+import numpy as np
+from math import exp
+
+import paddle
+import paddle.nn.functional as F
+from .builder import CRITERIONS
+
+
+def gaussian(window_size, sigma):
+ gauss = paddle.to_tensor([
+ exp(-(x - window_size // 2)**2 / float(2 * sigma**2))
+ for x in range(window_size)
+ ])
+ return gauss / gauss.sum()
+
+
+
+def create_window(window_size, channel):
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
+ _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0)
+ window = paddle.to_tensor(paddle.expand(
+ _2D_window, (channel, 1, window_size, window_size)),
+ stop_gradient=False)
+ return window
+
+
+def _ssim(img1, img2, window, window_size, channel, size_average=True):
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
+
+ mu1_sq = mu1.pow(2)
+ mu2_sq = mu2.pow(2)
+ mu1_mu2 = mu1 * mu2
+
+ sigma1_sq = F.conv2d(
+ img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
+ sigma2_sq = F.conv2d(
+ img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
+ sigma12 = F.conv2d(
+ img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2
+
+ C1 = 0.01**2
+ C2 = 0.03**2
+
+ ssim_map = ((2 * mu1_mu2 + C1) *
+ (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
+ (sigma1_sq + sigma2_sq + C2))
+
+ if size_average:
+ return ssim_map.mean()
+ else:
+ return ssim_map.mean(1).mean(1).mean(1)
+
+
+@CRITERIONS.register()
+class SSIM(paddle.nn.Layer):
+
+ def __init__(self, window_size=11, size_average=True):
+ super(SSIM, self).__init__()
+ self.window_size = window_size
+ self.size_average = size_average
+ self.channel = 1
+ self.window = create_window(window_size, self.channel)
+
+ def forward(self, img1, img2):
+ (_, channel, _, _) = img1.shape
+
+ if channel == self.channel and self.window.dtype == img1.dtype:
+ window = self.window
+ else:
+ window = create_window(self.window_size, channel)
+ tt = img1.dtype
+ window = paddle.to_tensor(window, dtype=tt)
+ self.window = window
+ self.channel = channel
+
+ return _ssim(img1, img2, window, self.window_size, channel,
+ self.size_average)
+
+
+def ssim(img1, img2, window_size=11, size_average=True):
+    """Functional interface: build a Gaussian window matching img1 and compute SSIM."""
+    (_, channel, _, _) = img1.shape
+    window = create_window(window_size, channel)
+    # match the input dtype; paddle places tensors on the current device automatically
+    window = paddle.cast(window, img1.dtype)
+
+    return _ssim(img1, img2, window, window_size, channel, size_average)
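+
+
+# Usage sketch (illustrative only; `pred` and `gt` are placeholder image
+# batches): SSIM returns a similarity in (0, 1], so it is typically turned
+# into a loss as 1 - SSIM.
+#
+#   ssim_criterion = SSIM(window_size=11)
+#   loss_ssim = 1.0 - ssim_criterion(pred, gt)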
diff --git a/ppgan/models/cycle_gan_model.py b/ppgan/models/cycle_gan_model.py
index 6d1c3f0950b96dc9588ab598a572cc9497c302d3..fd7204f5ed8a60114dfe74354b1c032b7c7504c7 100644
--- a/ppgan/models/cycle_gan_model.py
+++ b/ppgan/models/cycle_gan_model.py
@@ -13,7 +13,7 @@
# limitations under the License.
import paddle
-from .base_model import BaseModel
+from .base_model import BaseModel, apply_to_static
from .builder import MODELS
from .generators.builder import build_generator
@@ -40,7 +40,9 @@ class CycleGANModel(BaseModel):
pool_size=50,
direction='a2b',
lambda_a=10.,
- lambda_b=10.):
+ lambda_b=10.,
+ to_static=False,
+ image_shape=None):
"""Initialize the CycleGAN class.
Args:
@@ -59,6 +61,9 @@ class CycleGANModel(BaseModel):
# Code (vs. paper): G_A (G), G_B (F), D_A (D_Y), D_B (D_X)
self.nets['netG_A'] = build_generator(generator)
self.nets['netG_B'] = build_generator(generator)
+ # set @to_static for benchmark, skip this by default.
+ apply_to_static(to_static, image_shape, self.nets['netG_A'])
+ apply_to_static(to_static, image_shape, self.nets['netG_B'])
init_weights(self.nets['netG_A'])
init_weights(self.nets['netG_B'])
@@ -66,6 +71,9 @@ class CycleGANModel(BaseModel):
if discriminator:
self.nets['netD_A'] = build_discriminator(discriminator)
self.nets['netD_B'] = build_discriminator(discriminator)
+ # set @to_static for benchmark, skip this by default.
+ apply_to_static(to_static, image_shape, self.nets['netD_A'])
+ apply_to_static(to_static, image_shape, self.nets['netD_B'])
init_weights(self.nets['netD_A'])
init_weights(self.nets['netD_B'])
@@ -114,6 +122,7 @@ class CycleGANModel(BaseModel):
def forward(self):
"""Run forward pass; called by both functions and ."""
if hasattr(self, 'real_A'):
+ self.real_A.stop_gradient = False
self.fake_B = self.nets['netG_A'](self.real_A) # G_A(A)
self.rec_A = self.nets['netG_B'](self.fake_B) # G_B(G_A(A))
@@ -221,14 +230,13 @@ class CycleGANModel(BaseModel):
# forward
# compute fake images and reconstruction images.
self.forward()
- # G_A and G_B
- # Ds require no gradients when optimizing Gs
- self.set_requires_grad([self.nets['netD_A'], self.nets['netD_B']],
- False)
# set G_A and G_B's gradients to zero
optimizers['optimG'].clear_grad()
# calculate gradients for G_A and G_B
self.backward_G()
+ # G_A and G_B
+ # Ds require no gradients when optimizing Gs
+ self.set_requires_grad([self.nets['netD_A'], self.nets['netD_B']], False)
# update G_A and G_B's weights
self.optimizers['optimG'].step()
# D_A and D_B
@@ -242,3 +250,13 @@ class CycleGANModel(BaseModel):
self.backward_D_B()
# update D_A and D_B's weights
optimizers['optimD'].step()
+
+
+ def test_iter(self, metrics=None):
+ self.nets['netG_A'].eval()
+ self.forward()
+ with paddle.no_grad():
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(self.fake_B, self.real_B)
+ self.nets['netG_A'].train()
diff --git a/ppgan/models/dc_gan_model.py b/ppgan/models/dc_gan_model.py
index b13e494af2d83d34aecc878ccaa8e505d7327796..220e05c0d0cf4ec690f4fabbbb3dd305c2b2f9ff 100644
--- a/ppgan/models/dc_gan_model.py
+++ b/ppgan/models/dc_gan_model.py
@@ -56,8 +56,9 @@ class DCGANModel(BaseModel):
input (dict): include the data itself and its metadata information.
"""
# get 1-channel gray image, or 3-channel color image
- self.real = paddle.to_tensor(input['A'])
- self.image_paths = input['A_path']
+ self.real = paddle.to_tensor(input['img'])
+ if 'img_path' in input:
+            self.image_paths = input['img_path']
def forward(self):
"""Run forward pass; called by both functions and ."""
diff --git a/ppgan/models/discriminators/__init__.py b/ppgan/models/discriminators/__init__.py
index cbdbc5eee41aba076fb3b7659e199901b9bd00e1..3fe48bcf5b8baa1e42bd52ea400eb43b8fedd39b 100644
--- a/ppgan/models/discriminators/__init__.py
+++ b/ppgan/models/discriminators/__init__.py
@@ -17,6 +17,13 @@ from .nlayers import NLayerDiscriminator, NLayerDiscriminatorWithClassification
from .discriminator_ugatit import UGATITDiscriminator
from .dcdiscriminator import DCDiscriminator
from .discriminator_animegan import AnimeDiscriminator
-from .discriminator_styleganv2 import StyleGANv2Discriminator
+from .discriminator_styleganv2 import StyleGANv2Discriminator, GPENDiscriminator
from .syncnet import SyncNetColor
from .wav2lip_disc_qual import Wav2LipDiscQual
+from .discriminator_starganv2 import StarGANv2Discriminator
+from .discriminator_firstorder import FirstOrderDiscriminator
+from .discriminator_lapstyle import LapStyleDiscriminator
+from .discriminator_photopen import MultiscaleDiscriminator
+from .discriminator_singan import SinGANDiscriminator
+from .arcface_arch_paddle import ResNetArcFace
+from .discriminator_aotgan import Discriminator
diff --git a/ppgan/models/backbones/resnet_backbone.py b/ppgan/models/discriminators/arcface_arch_paddle.py
similarity index 36%
rename from ppgan/models/backbones/resnet_backbone.py
rename to ppgan/models/discriminators/arcface_arch_paddle.py
index 6ba9c07a9105effe603cc535745f6710325830f6..5a8465306ab621e2550f8688d678aabe34cb7727 100644
--- a/ppgan/models/backbones/resnet_backbone.py
+++ b/ppgan/models/discriminators/arcface_arch_paddle.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,19 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
import paddle
import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .builder import DISCRIMINATORS
-__all__ = [
- 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152'
-]
+def conv3x3(inplanes, outplanes, stride=1):
+ """A simple wrapper for 3x3 convolution with padding.
-def conv3x3(in_planes, out_planes, stride=1):
- "3x3 convolution with padding"
- return nn.Conv2D(in_planes,
- out_planes,
+ Args:
+ inplanes (int): Channel number of inputs.
+ outplanes (int): Channel number of outputs.
+ stride (int): Stride in convolution. Default: 1.
+ """
+ return nn.Conv2D(inplanes,
+ outplanes,
kernel_size=3,
stride=stride,
padding=1,
@@ -31,57 +35,116 @@ def conv3x3(in_planes, out_planes, stride=1):
class BasicBlock(nn.Layer):
+ """Basic residual block used in the ResNetArcFace architecture.
+
+ Args:
+ inplanes (int): Channel number of inputs.
+ planes (int): Channel number of outputs.
+ stride (int): Stride in convolution. Default: 1.
+ downsample (nn.Module): The downsample module. Default: None.
+ """
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
- self.bn1 = nn.BatchNorm(planes)
+ self.bn1 = nn.BatchNorm2D(planes)
self.relu = nn.ReLU()
self.conv2 = conv3x3(planes, planes)
- self.bn2 = nn.BatchNorm(planes)
+ self.bn2 = nn.BatchNorm2D(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
-
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
-
out = self.conv2(out)
out = self.bn2(out)
-
if self.downsample is not None:
residual = self.downsample(x)
-
out += residual
out = self.relu(out)
+ return out
+
+
+class IRBlock(nn.Layer):
+ """Improved residual block (IR Block) used in the ResNetArcFace architecture.
+ Args:
+ inplanes (int): Channel number of inputs.
+ planes (int): Channel number of outputs.
+ stride (int): Stride in convolution. Default: 1.
+ downsample (nn.Module): The downsample module. Default: None.
+ use_se (bool): Whether use the SEBlock (squeeze and excitation block). Default: True.
+ """
+ expansion = 1
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ downsample=None,
+ use_se=True):
+ super(IRBlock, self).__init__()
+ self.bn0 = nn.BatchNorm2D(inplanes)
+ self.conv1 = conv3x3(inplanes, inplanes)
+ self.bn1 = nn.BatchNorm2D(inplanes)
+ self.prelu = PReLU_layer()
+ self.conv2 = conv3x3(inplanes, planes, stride)
+ self.bn2 = nn.BatchNorm2D(planes)
+ self.downsample = downsample
+ self.stride = stride
+ self.use_se = use_se
+ if self.use_se:
+ self.se = SEBlock(planes)
+
+ def forward(self, x):
+ residual = x
+ out = self.bn0(x)
+ out = self.conv1(out)
+ out = self.bn1(out)
+ out = self.prelu(out)
+ out = self.conv2(out)
+ out = self.bn2(out)
+ if self.use_se:
+ out = self.se(out)
+ if self.downsample is not None:
+ residual = self.downsample(x)
+ out += residual
+ out = self.prelu(out)
return out
class Bottleneck(nn.Layer):
+ """Bottleneck block used in the ResNetArcFace architecture.
+
+ Args:
+ inplanes (int): Channel number of inputs.
+ planes (int): Channel number of outputs.
+ stride (int): Stride in convolution. Default: 1.
+ downsample (nn.Module): The downsample module. Default: None.
+ """
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)
- self.bn1 = nn.BatchNorm(planes)
+ self.bn1 = nn.BatchNorm2D(planes)
self.conv2 = nn.Conv2D(planes,
planes,
kernel_size=3,
stride=stride,
padding=1,
bias_attr=False)
- self.bn2 = nn.BatchNorm(planes)
+ self.bn2 = nn.BatchNorm2D(planes)
self.conv3 = nn.Conv2D(planes,
- planes * 4,
+ planes * self.expansion,
kernel_size=1,
bias_attr=False)
- self.bn3 = nn.BatchNorm(planes * 4)
- self.relu = nn.ReLU()
+ self.bn3 = nn.BatchNorm2D(planes * self.expansion)
+        self.relu = nn.ReLU()
self.downsample = downsample
self.stride = stride
@@ -90,44 +153,103 @@ class Bottleneck(nn.Layer):
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
-
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
-
out = self.conv3(out)
out = self.bn3(out)
-
if self.downsample is not None:
residual = self.downsample(x)
-
out += residual
out = self.relu(out)
-
return out
-class ResNet(nn.Layer):
- def __init__(self, block, layers, num_classes=1000):
+class PReLU_layer(nn.Layer):
+ def __init__(self, init_value=0.25, num=1):
+ super(PReLU_layer, self).__init__()
+ x = self.create_parameter(
+ attr=None,
+ shape=[num],
+ dtype=paddle.get_default_dtype(),
+ is_bias=False,
+ default_initializer=nn.initializer.Constant(init_value))
+ self.add_parameter('weight', x)
+
+ def forward(self, x):
+ return F.prelu(x, self.weight)
+
+
+class SEBlock(nn.Layer):
+ """The squeeze-and-excitation block (SEBlock) used in the IRBlock.
+
+ Args:
+ channel (int): Channel number of inputs.
+ reduction (int): Channel reduction ration. Default: 16.
+ """
+ def __init__(self, channel, reduction=16):
+ super(SEBlock, self).__init__()
+ self.avg_pool = nn.AdaptiveAvgPool2D(1)
+ self.fc = nn.Sequential(nn.Linear(channel, channel // reduction),
+ nn.PReLU(),
+ nn.Linear(channel // reduction, channel),
+ nn.Sigmoid())
+
+ def forward(self, x):
+        b, c, _, _ = x.shape
+        y = self.avg_pool(x).reshape([b, c])
+        y = self.fc(y).reshape([b, c, 1, 1])
+ return x * y
+
+
+def constant_init(param, **kwargs):
+ initializer = nn.initializer.Constant(**kwargs)
+ initializer(param, param.block)
+
+
+@DISCRIMINATORS.register()
+class ResNetArcFace(nn.Layer):
+ """ArcFace with ResNet architectures.
+
+ Ref: ArcFace: Additive Angular Margin Loss for Deep Face Recognition.
+
+ Args:
+ block (str): Block used in the ArcFace architecture.
+ layers (tuple(int)): Block numbers in each layer.
+ use_se (bool): Whether use the SEBlock (squeeze and excitation block). Default: True.
+ """
+ def __init__(self, block, layers, use_se=True, reprod_logger=None):
+ if block == 'IRBlock':
+ block = IRBlock
self.inplanes = 64
- super(ResNet, self).__init__()
- self.conv1 = nn.Conv2D(3,
- 64,
- kernel_size=7,
- stride=2,
- padding=3,
- bias_attr=False)
- self.bn1 = nn.BatchNorm(64)
- self.relu = nn.ReLU()
- self.maxpool = nn.Pool2D(pool_size=3, pool_stride=2, pool_padding=1)
+ self.use_se = use_se
+ super(ResNetArcFace, self).__init__()
+ self.conv1 = nn.Conv2D(1, 64, kernel_size=3, padding=1, bias_attr=False)
+ self.bn1 = nn.BatchNorm2D(64)
+ self.maxpool = nn.MaxPool2D(kernel_size=2, stride=2)
+ self.prelu = PReLU_layer()
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
- self.avgpool = nn.Pool2D(7, pool_stride=1, pool_type='avg')
- self.fc = nn.Linear(512 * block.expansion, num_classes)
-
- def _make_layer(self, block, planes, blocks, stride=1):
+ self.bn4 = nn.BatchNorm2D(512)
+ self.dropout = nn.Dropout()
+ self.fc5 = nn.Linear(512 * 8 * 8, 512)
+ self.bn5 = nn.BatchNorm1D(512)
+ self.apply(self._init_weights)
+
+ def _init_weights(self, m):
+ if isinstance(m, paddle.nn.Conv2D):
+            nn.initializer.XavierNormal()(m.weight, m.weight.block)
+ elif isinstance(m, paddle.nn.BatchNorm2D) or isinstance(
+ m, paddle.nn.BatchNorm1D):
+ constant_init(m.weight, value=1.)
+ constant_init(m.bias, value=0.)
+ elif isinstance(m, paddle.nn.Linear):
+            nn.initializer.XavierNormal()(m.weight, m.weight.block)
+ constant_init(m.bias, value=0.)
+
+ def _make_layer(self, block, planes, num_blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
@@ -136,79 +258,28 @@ class ResNet(nn.Layer):
kernel_size=1,
stride=stride,
bias_attr=False),
- nn.BatchNorm(planes * block.expansion),
- )
-
+ nn.BatchNorm2D(planes * block.expansion))
layers = []
- layers.append(block(self.inplanes, planes, stride, downsample))
- self.inplanes = planes * block.expansion
- for _ in range(1, blocks):
- layers.append(block(self.inplanes, planes))
-
+ layers.append(
+ block(self.inplanes, planes, stride, downsample,
+ use_se=self.use_se))
+ self.inplanes = planes
+ for _ in range(1, num_blocks):
+ layers.append(block(self.inplanes, planes, use_se=self.use_se))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
- x = self.relu(x)
+ x = self.prelu(x)
x = self.maxpool(x)
-
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
- x = self.avgpool(x)
- x = paddle.reshape(x, (x.shape[0], -1))
- x = self.fc(x)
-
+ x = self.bn4(x)
+ x = self.dropout(x)
+ x = x.reshape([x.shape[0], -1])
+ x = self.fc5(x)
+ x = self.bn5(x)
return x
-
-
-def resnet18(pretrained=False, **kwargs):
- """Constructs a ResNet-18 model.
-
- Args:
- pretrained (bool): If True, returns a model pre-trained on ImageNet
- """
- model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
- return model
-
-
-def resnet34(pretrained=False, **kwargs):
- """Constructs a ResNet-34 model.
-
- Args:
- pretrained (bool): If True, returns a model pre-trained on ImageNet
- """
- model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
- return model
-
-
-def resnet50(pretrained=False, **kwargs):
- """Constructs a ResNet-50 model.
-
- Args:
- pretrained (bool): If True, returns a model pre-trained on ImageNet
- """
- model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
- return model
-
-
-def resnet101(pretrained=False, **kwargs):
- """Constructs a ResNet-101 model.
-
- Args:
- pretrained (bool): If True, returns a model pre-trained on ImageNet
- """
- model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
- return model
-
-
-def resnet152(pretrained=False, **kwargs):
- """Constructs a ResNet-152 model.
-
- Args:
- pretrained (bool): If True, returns a model pre-trained on ImageNet
- """
- model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
- return model
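+
+
+# Usage sketch (illustrative only; the block/layer configuration and input
+# size are placeholders, not fixed by this module): identity-oriented losses
+# typically build the network as
+#
+#   arcface = ResNetArcFace(block='IRBlock', layers=[2, 2, 2, 2], use_se=False)
+#   embedding = arcface(gray_faces)  # 1-channel 128x128 input -> 512-d vector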
diff --git a/ppgan/models/discriminators/dcdiscriminator.py b/ppgan/models/discriminators/dcdiscriminator.py
index f66b49a84ebca5b5d14e6f2f00ff065e035973bd..cd964cf91564ad0870c27874876a7d1cbff78633 100644
--- a/ppgan/models/discriminators/dcdiscriminator.py
+++ b/ppgan/models/discriminators/dcdiscriminator.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/aidotse/Team-Haste
+# MIT License
+# Copyright (c) 2020 AI Sweden
+
import paddle
import functools
import numpy as np
@@ -31,15 +35,14 @@ class DCDiscriminator(nn.Layer):
"""Construct a DCGAN discriminator
Parameters:
- input_nc (int) -- the number of channels in input images
- ndf (int) -- the number of filters in the last conv layer
- norm_type (str) -- normalization layer type
+ input_nc (int): the number of channels in input images
+ ndf (int): the number of filters in the last conv layer
+ norm_type (str): normalization layer type
"""
super(DCDiscriminator, self).__init__()
norm_layer = build_norm_layer(norm_type)
- if type(
- norm_layer
- ) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
+ if type(norm_layer) == functools.partial:
+ # no need to use bias as BatchNorm2d has affine parameters
use_bias = norm_layer.func == nn.BatchNorm2D
else:
use_bias = norm_layer == nn.BatchNorm2D
@@ -48,29 +51,30 @@ class DCDiscriminator(nn.Layer):
padw = 1
sequence = [
- nn.Conv2D(input_nc,
- ndf,
- kernel_size=kw,
- stride=2,
- padding=padw,
- bias_attr=use_bias),
- nn.LeakyReLU(0.2)
- ]
+ nn.Conv2D(input_nc,
+ ndf,
+ kernel_size=kw,
+ stride=2,
+ padding=padw,
+ bias_attr=use_bias),
+ nn.LeakyReLU(0.2)
+ ]
nf_mult = 1
nf_mult_prev = 1
n_downsampling = 4
- for n in range(1, n_downsampling): # gradually increase the number of filters
+ # gradually increase the number of filters
+ for n in range(1, n_downsampling):
nf_mult_prev = nf_mult
nf_mult = min(2**n, 8)
if norm_type == 'batch':
sequence += [
nn.Conv2D(ndf * nf_mult_prev,
- ndf * nf_mult,
- kernel_size=kw,
- stride=2,
- padding=padw),
+ ndf * nf_mult,
+ kernel_size=kw,
+ stride=2,
+ padding=padw),
BatchNorm2D(ndf * nf_mult),
nn.LeakyReLU(0.2)
]
@@ -88,13 +92,14 @@ class DCDiscriminator(nn.Layer):
nf_mult_prev = nf_mult
+ # output 1 channel prediction map
sequence += [
- nn.Conv2D(ndf * nf_mult_prev,
- 1,
- kernel_size=kw,
- stride=1,
- padding=0)
- ] # output 1 channel prediction map
+ nn.Conv2D(ndf * nf_mult_prev,
+ 1,
+ kernel_size=kw,
+ stride=1,
+ padding=0)
+ ]
self.model = nn.Sequential(*sequence)
diff --git a/ppgan/models/discriminators/discriminator_animegan.py b/ppgan/models/discriminators/discriminator_animegan.py
index d0c7badea1dabe325346eaca9d57fff573953932..c06ad72f7d8aa3a1af8d8a0e397c6039ada9e025 100644
--- a/ppgan/models/discriminators/discriminator_animegan.py
+++ b/ppgan/models/discriminators/discriminator_animegan.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# code was heavily based on https://github.com/TachibanaYoshino/AnimeGANv2
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/TachibanaYoshino/AnimeGANv2#license
import paddle.nn as nn
import paddle.nn.functional as F
diff --git a/ppgan/datasets/transforms/functional.py b/ppgan/models/discriminators/discriminator_aotgan.py
similarity index 37%
rename from ppgan/datasets/transforms/functional.py
rename to ppgan/models/discriminators/discriminator_aotgan.py
index 83350f58618e1fa60e613098900774acb9ae285e..57a34109364e58845877e0460ccec7f03b18bd61 100644
--- a/ppgan/datasets/transforms/functional.py
+++ b/ppgan/models/discriminators/discriminator_aotgan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
@@ -11,20 +11,29 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
-from __future__ import division
-from . import functional_cv2 as F_cv2
-from paddle.vision.transforms.functional import _is_numpy_image, _is_pil_image
+import paddle
+import paddle.nn as nn
+from paddle.nn.utils import spectral_norm
-__all__ = ['add']
+from .builder import DISCRIMINATORS
+@DISCRIMINATORS.register()
+class Discriminator(nn.Layer):
+ def __init__(self, inc = 3):
+ super(Discriminator, self).__init__()
+ self.conv = nn.Sequential(
+ spectral_norm(nn.Conv2D(inc, 64, 4, 2, 1, bias_attr=False)),
+ nn.LeakyReLU(0.2),
+ spectral_norm(nn.Conv2D(64, 128, 4, 2, 1, bias_attr=False)),
+ nn.LeakyReLU(0.2),
+ spectral_norm(nn.Conv2D(128, 256, 4, 2, 1, bias_attr=False)),
+ nn.LeakyReLU(0.2),
+ spectral_norm(nn.Conv2D(256, 512, 4, 1, 1, bias_attr=False)),
+ nn.LeakyReLU(0.2),
+ nn.Conv2D(512, 1, 4, 1, 1)
+ )
-def add(pic, value):
- if not (_is_pil_image(pic) or _is_numpy_image(pic)):
- raise TypeError('pic should be PIL Image or ndarray. Got {}'.format(
- type(pic)))
-
- if _is_pil_image(pic):
- raise NotImplementedError('add not support pil image')
- else:
- return F_cv2.add(pic, value)
+ def forward(self, x):
+ feat = self.conv(x)
+ return feat
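+
+
+# Usage sketch (illustrative only; the input size is a placeholder): the
+# discriminator is fully convolutional and returns a patch-level score map
+# rather than a single logit.
+#
+#   disc = Discriminator(inc=3)
+#   score_map = disc(paddle.randn([1, 3, 256, 256]))  # -> [1, 1, 30, 30]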
diff --git a/ppgan/models/discriminators/discriminator_firstorder.py b/ppgan/models/discriminators/discriminator_firstorder.py
new file mode 100755
index 0000000000000000000000000000000000000000..9a18d70e9d274c2c2cddd61beae1f251963c7770
--- /dev/null
+++ b/ppgan/models/discriminators/discriminator_firstorder.py
@@ -0,0 +1,177 @@
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .builder import DISCRIMINATORS
+from ...modules.first_order import ImagePyramide, detach_kp, kp2gaussian
+
+from ...modules.utils import spectral_norm
+
+
+@DISCRIMINATORS.register()
+class FirstOrderDiscriminator(nn.Layer):
+ """
+    Merge all discriminator-related updates into a single model for better multi-GPU usage.
+    Args:
+        discriminator_cfg:
+          scales (list): image pyramid scales at which features are extracted
+          block_expansion (int): block i outputs block_expansion * (2**i) features
+          max_features (int): maximum number of features used when encoding images
+          num_blocks (int): number of blocks for encoding images
+          sn (bool): whether to use spectral norm
+ common_params:
+ num_kp (int): number of keypoints
+ num_channels (int): image channels
+ estimate_jacobian (bool): whether to estimate jacobian values of keypoints
+ train_params:
+ loss_weights:
+ discriminator_gan (int): weight of discriminator loss
+ """
+ def __init__(self, discriminator_cfg, common_params, train_params):
+ super(FirstOrderDiscriminator, self).__init__()
+ self.discriminator = MultiScaleDiscriminator(**discriminator_cfg,
+ **common_params)
+ self.train_params = train_params
+ self.scales = self.discriminator.scales
+ self.pyramid = ImagePyramide(self.scales, common_params['num_channels'])
+ self.loss_weights = train_params['loss_weights']
+
+ def forward(self, x, generated):
+ pyramide_real = self.pyramid(x['driving'])
+ pyramide_generated = self.pyramid(generated['prediction'].detach())
+
+ kp_driving = generated['kp_driving']
+ discriminator_maps_generated = self.discriminator(
+ pyramide_generated, kp=detach_kp(kp_driving))
+ discriminator_maps_real = self.discriminator(pyramide_real,
+ kp=detach_kp(kp_driving))
+
+ loss_values = {}
+ value_total = 0
+ for scale in self.scales:
+ key = 'prediction_map_%s' % scale
+ value = (1 - discriminator_maps_real[key]
+ )**2 + discriminator_maps_generated[key]**2
+ value_total += self.loss_weights['discriminator_gan'] * value.mean()
+ loss_values['disc_gan'] = value_total
+
+ return loss_values
+
+
+class DownBlock2d(nn.Layer):
+ """
+ Simple block for processing video (encoder).
+ """
+ def __init__(self,
+ in_features,
+ out_features,
+ norm=False,
+ kernel_size=4,
+ pool=False,
+ sn=False):
+ super(DownBlock2d, self).__init__()
+ self.conv = nn.Conv2D(in_features,
+ out_features,
+ kernel_size=kernel_size)
+ if sn:
+ self.conv = spectral_norm(self.conv)
+ else:
+ self.sn = None
+ if norm:
+ self.norm = nn.InstanceNorm2D(num_features=out_features,
+ epsilon=1e-05)
+ else:
+ self.norm = None
+
+ self.pool = pool
+
+ def forward(self, x):
+
+ out = x
+ out = self.conv(out)
+ if self.norm is not None:
+ out = self.norm(out)
+ out = F.leaky_relu(out, 0.2)
+ if self.pool:
+ out = F.avg_pool2d(out, kernel_size=2, stride=2, ceil_mode=False)
+ return out
+
+
+class Discriminator(nn.Layer):
+ def __init__(self,
+ num_channels=3,
+ block_expansion=64,
+ num_blocks=4,
+ max_features=512,
+ sn=False,
+ use_kp=False,
+ num_kp=10,
+ kp_variance=0.01,
+ **kwargs):
+ super(Discriminator, self).__init__()
+
+ down_blocks = []
+ for i in range(num_blocks):
+ down_blocks.append(
+ DownBlock2d(num_channels + num_kp * use_kp if i == 0 else min(
+ max_features, block_expansion * (2**i)),
+ min(max_features, block_expansion * (2**(i + 1))),
+ norm=(i != 0),
+ kernel_size=4,
+ pool=(i != num_blocks - 1),
+ sn=sn))
+
+ self.down_blocks = nn.LayerList(down_blocks)
+ self.conv = nn.Conv2D(self.down_blocks[len(self.down_blocks) -
+ 1].conv.parameters()[0].shape[0],
+ 1,
+ kernel_size=1)
+ if sn:
+ self.conv = spectral_norm(self.conv)
+ else:
+ self.sn = None
+ self.use_kp = use_kp
+ self.kp_variance = kp_variance
+
+ def forward(self, x, kp=None):
+ feature_maps = []
+ out = x
+
+ if self.use_kp:
+ heatmap = kp2gaussian(kp, x.shape[2:], self.kp_variance)
+ out = paddle.concat([out, heatmap], axis=1)
+ for down_block in self.down_blocks:
+ out = down_block(out)
+ feature_maps.append(out)
+ out = feature_maps[-1]
+ prediction_map = self.conv(out)
+ return feature_maps, prediction_map
+
+
+class MultiScaleDiscriminator(nn.Layer):
+ """
+    Multi-scale discriminator that runs one Discriminator per image scale.
+ """
+ def __init__(self, scales=(), **kwargs):
+ super(MultiScaleDiscriminator, self).__init__()
+ self.scales = scales
+ self.discs = nn.LayerList()
+ self.nameList = []
+ for scale in scales:
+ self.discs.add_sublayer(
+ str(scale).replace('.', '-'), Discriminator(**kwargs))
+ self.nameList.append(str(scale).replace('.', '-'))
+
+ def forward(self, x, kp=None):
+ out_dict = {}
+ for scale, disc in zip(self.nameList, self.discs):
+ scale = str(scale).replace('-', '.')
+ key = 'prediction_' + scale
+ feature_maps, prediction_map = disc(x[key], kp)
+ out_dict['feature_maps_' + scale] = feature_maps
+ out_dict['prediction_map_' + scale] = prediction_map
+ return out_dict
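MultiScaleDiscriminator keys both its inputs and outputs by scale, matching the 'prediction_<scale>' entries that ImagePyramide feeds it in FirstOrderDiscriminator.forward(). A minimal sketch with a hand-built two-scale pyramid (the scale list and the 256/128 sizes are assumptions):

import paddle

from ppgan.models.discriminators.discriminator_firstorder import MultiScaleDiscriminator

disc = MultiScaleDiscriminator(scales=[1, 0.5], num_channels=3)
pyramid = {
    'prediction_1': paddle.rand([2, 3, 256, 256]),
    'prediction_0.5': paddle.rand([2, 3, 128, 128]),
}
out = disc(pyramid)
# one 'feature_maps_<scale>' and one 'prediction_map_<scale>' entry per scale
print(sorted(out.keys()))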
diff --git a/ppgan/models/discriminators/discriminator_lapstyle.py b/ppgan/models/discriminators/discriminator_lapstyle.py
new file mode 100644
index 0000000000000000000000000000000000000000..624cfd8695df6f9510b8e0332c6bade0413a4fd5
--- /dev/null
+++ b/ppgan/models/discriminators/discriminator_lapstyle.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .builder import DISCRIMINATORS
+
+
+@DISCRIMINATORS.register()
+class LapStyleDiscriminator(nn.Layer):
+ def __init__(self):
+ super(LapStyleDiscriminator, self).__init__()
+ num_layer = 3
+ num_channel = 32
+ self.head = nn.Sequential(
+ ('conv',
+ nn.Conv2D(3, num_channel, kernel_size=3, stride=1, padding=1)),
+ ('norm', nn.BatchNorm2D(num_channel)),
+ ('LeakyRelu', nn.LeakyReLU(0.2)))
+ self.body = nn.Sequential()
+ for i in range(num_layer - 2):
+ self.body.add_sublayer(
+ 'conv%d' % (i + 1),
+ nn.Conv2D(num_channel,
+ num_channel,
+ kernel_size=3,
+ stride=1,
+ padding=1))
+ self.body.add_sublayer('norm%d' % (i + 1),
+ nn.BatchNorm2D(num_channel))
+ self.body.add_sublayer('LeakyRelu%d' % (i + 1), nn.LeakyReLU(0.2))
+ self.tail = nn.Conv2D(num_channel,
+ 1,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ def forward(self, x):
+ x = self.head(x)
+ x = self.body(x)
+ x = self.tail(x)
+ return x
diff --git a/ppgan/models/discriminators/discriminator_photopen.py b/ppgan/models/discriminators/discriminator_photopen.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e378a41f660c92a4c1c3c0ce898a857594f25f3
--- /dev/null
+++ b/ppgan/models/discriminators/discriminator_photopen.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import re
+import copy
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.utils import spectral_norm
+
+from ppgan.utils.photopen import build_norm_layer, simam, Dict
+from .builder import DISCRIMINATORS
+
+
+
+class NLayersDiscriminator(nn.Layer):
+ def __init__(self, opt):
+ super(NLayersDiscriminator, self).__init__()
+
+ kw = 4
+ padw = int(np.ceil((kw - 1.0) / 2))
+ nf = opt.ndf
+ input_nc = self.compute_D_input_nc(opt)
+ layer_count = 0
+
+ layer = nn.Sequential(
+ nn.Conv2D(input_nc, nf, kw, 2, padw),
+ nn.GELU()
+ )
+ self.add_sublayer('block_'+str(layer_count), layer)
+ layer_count += 1
+
+ feat_size_prev = np.floor((opt.crop_size + padw * 2 - (kw - 2)) / 2).astype('int64')
+ InstanceNorm = build_norm_layer('instance')
+ for n in range(1, opt.n_layers_D):
+ nf_prev = nf
+ nf = min(nf * 2, 512)
+ stride = 1 if n == opt.n_layers_D - 1 else 2
+ feat_size = np.floor((feat_size_prev + padw * 2 - (kw - stride)) / stride).astype('int64')
+ feat_size_prev = feat_size
+ layer = nn.Sequential(
+ spectral_norm(nn.Conv2D(nf_prev, nf, kw, stride, padw,
+ weight_attr=None,
+ bias_attr=None)),
+ InstanceNorm(nf),
+ nn.GELU()
+ )
+ self.add_sublayer('block_'+str(layer_count), layer)
+ layer_count += 1
+
+ layer = nn.Conv2D(nf, 1, kw, 1, padw)
+ self.add_sublayer('block_'+str(layer_count), layer)
+ layer_count += 1
+
+ def forward(self, input):
+ output = []
+ for layer in self._sub_layers.values():
+ output.append(simam(layer(input)))
+ input = output[-1]
+
+ return output
+
+ def compute_D_input_nc(self, opt):
+ input_nc = opt.label_nc + opt.output_nc
+ if opt.contain_dontcare_label:
+ input_nc += 1
+ if not opt.no_instance:
+ input_nc += 1
+ return input_nc
+
+@DISCRIMINATORS.register()
+class MultiscaleDiscriminator(nn.Layer):
+ def __init__(self,
+ ndf,
+ num_D,
+ crop_size,
+ label_nc,
+ output_nc,
+ contain_dontcare_label,
+ no_instance,
+ n_layers_D,
+
+ ):
+ super(MultiscaleDiscriminator, self).__init__()
+
+ opt = {
+ 'ndf': ndf,
+ 'num_D': num_D,
+ 'crop_size': crop_size,
+ 'label_nc': label_nc,
+ 'output_nc': output_nc,
+ 'contain_dontcare_label': contain_dontcare_label,
+ 'no_instance': no_instance,
+ 'n_layers_D': n_layers_D,
+
+ }
+ opt = Dict(opt)
+
+ for i in range(opt.num_D):
+ sequence = []
+ crop_size_bkp = opt.crop_size
+ feat_size = opt.crop_size
+ for j in range(i):
+ sequence += [nn.AvgPool2D(3, 2, 1)]
+ feat_size = np.floor((feat_size + 1 * 2 - (3 - 2)) / 2).astype('int64')
+ opt.crop_size = feat_size
+ sequence += [NLayersDiscriminator(opt)]
+ opt.crop_size = crop_size_bkp
+ sequence = nn.Sequential(*sequence)
+ self.add_sublayer('nld_'+str(i), sequence)
+
+ def forward(self, input):
+ output = []
+ for layer in self._sub_layers.values():
+ output.append(layer(input))
+ return output
+
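compute_D_input_nc() decides how wide the discriminator input is: the semantic label map, the image itself, and optional dontcare/instance channels are concatenated. A worked example with assumed SPADE-style settings (20 classes, dontcare label on, instance map on):

label_nc, output_nc = 20, 3                  # assumed dataset settings
contain_dontcare_label, no_instance = True, False

input_nc = label_nc + output_nc              # semantic map + image
if contain_dontcare_label:
    input_nc += 1                            # extra "dontcare" channel
if not no_instance:
    input_nc += 1                            # instance edge map
print(input_nc)                              # 25 channels enter NLayersDiscriminator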
diff --git a/ppgan/models/discriminators/discriminator_singan.py b/ppgan/models/discriminators/discriminator_singan.py
new file mode 100755
index 0000000000000000000000000000000000000000..6a2a78bd59fc2bab6014fbeac5f4befe0baea38a
--- /dev/null
+++ b/ppgan/models/discriminators/discriminator_singan.py
@@ -0,0 +1,31 @@
+# code was based on https://github.com/tamarott/SinGAN
+
+import paddle.nn as nn
+
+from ..generators.generator_singan import ConvBlock
+from .builder import DISCRIMINATORS
+
+
+@DISCRIMINATORS.register()
+class SinGANDiscriminator(nn.Layer):
+ def __init__(self,
+ nfc=32,
+ min_nfc=32,
+ input_nc=3,
+ num_layers=5,
+ ker_size=3,
+ padd_size=0):
+ super(SinGANDiscriminator, self).__init__()
+ self.head = ConvBlock(input_nc, nfc, ker_size, padd_size, 1)
+ self.body = nn.Sequential()
+ for i in range(num_layers - 2):
+ N = int(nfc / pow(2, (i + 1)))
+ block = ConvBlock(max(2 * N, min_nfc), max(N, min_nfc), ker_size, padd_size, 1)
+ self.body.add_sublayer('block%d' % (i + 1), block)
+ self.tail = nn.Conv2D(max(N, min_nfc), 1, ker_size, 1, padd_size)
+
+ def forward(self, x):
+ x = self.head(x)
+ x = self.body(x)
+ x = self.tail(x)
+ return x
diff --git a/ppgan/models/discriminators/discriminator_starganv2.py b/ppgan/models/discriminators/discriminator_starganv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4525d4b95d93fac103b4f109a559f7f4643fb2bf
--- /dev/null
+++ b/ppgan/models/discriminators/discriminator_starganv2.py
@@ -0,0 +1,42 @@
+# code was heavily based on https://github.com/clovaai/stargan-v2
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/clovaai/stargan-v2#license
+
+import paddle.nn as nn
+import paddle
+
+from .builder import DISCRIMINATORS
+from ..generators.generator_starganv2 import ResBlk
+
+import numpy as np
+
+
+@DISCRIMINATORS.register()
+class StarGANv2Discriminator(nn.Layer):
+ def __init__(self, img_size=256, num_domains=2, max_conv_dim=512):
+ super().__init__()
+ dim_in = 2**14 // img_size
+ blocks = []
+ blocks += [nn.Conv2D(3, dim_in, 3, 1, 1)]
+
+ repeat_num = int(np.log2(img_size)) - 2
+ for _ in range(repeat_num):
+ dim_out = min(dim_in * 2, max_conv_dim)
+ blocks += [ResBlk(dim_in, dim_out, downsample=True)]
+ dim_in = dim_out
+
+ blocks += [nn.LeakyReLU(0.2)]
+ blocks += [nn.Conv2D(dim_out, dim_out, 4, 1, 0)]
+ blocks += [nn.LeakyReLU(0.2)]
+ blocks += [nn.Conv2D(dim_out, num_domains, 1, 1, 0)]
+ self.main = nn.Sequential(*blocks)
+
+ def forward(self, x, y):
+ out = self.main(x)
+ out = paddle.reshape(out, (out.shape[0], -1)) # (batch, num_domains)
+ idx = paddle.zeros_like(out)
+ for i in range(idx.shape[0]):
+ idx[i, y[i]] = 1
+ s = idx * out
+ s = paddle.sum(s, axis=1)
+ return s
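At the end of StarGANv2Discriminator.forward(), the loop builds a one-hot mask so each sample keeps only the logit of its target domain y. The same selection can be read as a one-hot/sum reduction; a sketch with made-up logits (F.one_hot is used here only for illustration, the patch builds the mask manually):

import paddle
import paddle.nn.functional as F

out = paddle.to_tensor([[0.3, -1.2], [0.8, 0.1]])   # (batch, num_domains) logits
y = paddle.to_tensor([1, 0], dtype='int64')          # target domain per sample

mask = F.one_hot(y, num_classes=out.shape[1])        # (batch, num_domains)
s = paddle.sum(mask * out, axis=1)                    # per-sample selected logit
print(s.numpy())                                      # ~[-1.2, 0.8]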
diff --git a/ppgan/models/discriminators/discriminator_styleganv2.py b/ppgan/models/discriminators/discriminator_styleganv2.py
index 038d39ab5f24374c9d1ec06675a44d8f94e80c32..80d6e5bf5f5e245ad3a16a1b0159f46b0461e3ff 100644
--- a/ppgan/models/discriminators/discriminator_styleganv2.py
+++ b/ppgan/models/discriminators/discriminator_styleganv2.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/rosinality/stylegan2-pytorch
+# MIT License
+# Copyright (c) 2019 Kim Seonghyeon
+
import math
import paddle
import paddle.nn as nn
@@ -24,6 +28,7 @@ from ...modules.upfirdn2d import Upfirdn2dBlur
class ConvLayer(nn.Sequential):
+
def __init__(
self,
in_channel,
@@ -68,6 +73,7 @@ class ConvLayer(nn.Sequential):
class ResBlock(nn.Layer):
+
def __init__(self, in_channel, out_channel, blur_kernel=[1, 3, 3, 1]):
super().__init__()
@@ -108,6 +114,7 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None):
@DISCRIMINATORS.register()
class StyleGANv2Discriminator(nn.Layer):
+
def __init__(self, size, channel_multiplier=2, blur_kernel=[1, 3, 3, 1]):
super().__init__()
@@ -167,3 +174,71 @@ class StyleGANv2Discriminator(nn.Layer):
out = self.final_linear(out)
return out
+
+
+@DISCRIMINATORS.register()
+class GPENDiscriminator(nn.Layer):
+
+ def __init__(self,
+ size,
+ channel_multiplier=1,
+ narrow=0.5,
+ blur_kernel=[1, 3, 3, 1]):
+ super().__init__()
+
+ channels = {
+ 4: int(512 * narrow),
+ 8: int(512 * narrow),
+ 16: int(512 * narrow),
+ 32: int(512 * narrow),
+ 64: int(256 * channel_multiplier * narrow),
+ 128: int(128 * channel_multiplier * narrow),
+ 256: int(64 * channel_multiplier * narrow),
+ 512: int(32 * channel_multiplier * narrow),
+ 1024: int(16 * channel_multiplier * narrow),
+ }
+
+ convs = [ConvLayer(3, channels[size], 1)]
+
+ log_size = int(math.log(size, 2))
+
+ in_channel = channels[size]
+
+ for i in range(log_size, 2, -1):
+ out_channel = channels[2**(i - 1)]
+
+ convs.append(ResBlock(in_channel, out_channel, blur_kernel))
+
+ in_channel = out_channel
+
+ self.convs = nn.Sequential(*convs)
+
+ self.stddev_group = 4
+ self.stddev_feat = 1
+
+ self.final_conv = ConvLayer(in_channel + 1, channels[4], 3)
+ self.final_linear = nn.Sequential(
+ EqualLinear(channels[4] * 4 * 4,
+ channels[4],
+ activation="fused_lrelu"),
+ EqualLinear(channels[4], 1),
+ )
+
+ def forward(self, input):
+ out = self.convs(input)
+
+ batch, channel, height, width = out.shape
+ group = min(batch, self.stddev_group)
+ stddev = out.reshape((group, -1, self.stddev_feat,
+ channel // self.stddev_feat, height, width))
+ stddev = paddle.sqrt(var(stddev, 0, unbiased=False) + 1e-8)
+ stddev = stddev.mean([2, 3, 4], keepdim=True).squeeze(2)
+ stddev = stddev.tile((group, 1, height, width))
+ out = paddle.concat([out, stddev], 1)
+
+ out = self.final_conv(out)
+
+ out = out.reshape((batch, -1))
+ out = self.final_linear(out)
+
+ return out
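GPENDiscriminator ends with StyleGAN2's minibatch standard-deviation trick: per-group feature statistics are appended as one extra channel before the final conv. A shape walk-through under assumed sizes (paddle.var stands in for the module-level var helper):

import paddle

batch, channel, height, width = 4, 512, 4, 4          # assumed final feature size
out = paddle.randn([batch, channel, height, width])
group = min(batch, 4)                                  # stddev_group

stddev = out.reshape((group, -1, 1, channel, height, width))
stddev = paddle.sqrt(paddle.var(stddev, axis=0, unbiased=False) + 1e-8)
stddev = stddev.mean([2, 3, 4], keepdim=True).squeeze(2)
stddev = stddev.tile((group, 1, height, width))        # (batch, 1, H, W)
out = paddle.concat([out, stddev], 1)                  # one extra stddev channel
print(out.shape)                                       # [4, 513, 4, 4]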
diff --git a/ppgan/models/discriminators/discriminator_ugatit.py b/ppgan/models/discriminators/discriminator_ugatit.py
index d08615925e17a58cf301b758f90b8752e806edcb..62e4da12771d6a4821629ef0930984216c7b6323 100644
--- a/ppgan/models/discriminators/discriminator_ugatit.py
+++ b/ppgan/models/discriminators/discriminator_ugatit.py
@@ -1,3 +1,5 @@
+# code was based on https://github.com/znxlwm/UGATIT-pytorch
+
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
diff --git a/ppgan/models/discriminators/nlayers.py b/ppgan/models/discriminators/nlayers.py
index 938b1f43ee297333e9b485f434ee242d6ba4ec05..2d8f4d46c9b1e8e5cf802e8606a1e3f8d8031c68 100644
--- a/ppgan/models/discriminators/nlayers.py
+++ b/ppgan/models/discriminators/nlayers.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/wtjiang98/PSGAN
+# MIT License
+# Copyright (c) 2020 Wentao Jiang
+
import paddle
import functools
import numpy as np
diff --git a/ppgan/models/discriminators/syncnet.py b/ppgan/models/discriminators/syncnet.py
index e597844d857c338b06f1dd79c9ed4bc2517fe67f..9fc3d26aa5794c6e771431c7f863bad8b5feaea8 100644
--- a/ppgan/models/discriminators/syncnet.py
+++ b/ppgan/models/discriminators/syncnet.py
@@ -1,3 +1,7 @@
+# code was heavily based on https://github.com/Rudrabha/Wav2Lip
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/Rudrabha/Wav2Lip#license-and-citation
+
import paddle
from paddle import nn
from paddle.nn import functional as F
diff --git a/ppgan/models/discriminators/vgg_discriminator.py b/ppgan/models/discriminators/vgg_discriminator.py
index 454e964446e67ee5de4fd388de3a0cb9fd6155ad..74b6112abdf0fb0d0e53ade8c010dc65d8d237f7 100644
--- a/ppgan/models/discriminators/vgg_discriminator.py
+++ b/ppgan/models/discriminators/vgg_discriminator.py
@@ -1,3 +1,5 @@
+# code was based on https://github.com/xinntao/ESRGAN
+
import paddle.nn as nn
from .builder import DISCRIMINATORS
diff --git a/ppgan/models/discriminators/wav2lip_disc_qual.py b/ppgan/models/discriminators/wav2lip_disc_qual.py
index 30dfa5d42669688b393adc6bd1687f6e05864da3..ea1f9c8fddc946aa0734b7239800d8aa03092379 100644
--- a/ppgan/models/discriminators/wav2lip_disc_qual.py
+++ b/ppgan/models/discriminators/wav2lip_disc_qual.py
@@ -1,3 +1,7 @@
+# code was heavily based on https://github.com/Rudrabha/Wav2Lip
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/Rudrabha/Wav2Lip#license-and-citation
+
import paddle
from paddle import nn
from paddle.nn import functional as F
diff --git a/ppgan/models/drn_model.py b/ppgan/models/drn_model.py
index f1c41a14be8dc07fb5e45bce590bfe7014ff9e52..ce44e9888c3c39b0c03482ad549e229652436f18 100644
--- a/ppgan/models/drn_model.py
+++ b/ppgan/models/drn_model.py
@@ -79,7 +79,7 @@ class DRN(BaseSRModel):
self.gan_criterion = build_criterion(gan_criterion)
def setup_input(self, input):
- self.lq = paddle.fluid.dygraph.to_variable(input['lq'])
+ self.lq = paddle.to_tensor(input['lq'])
self.visual_items['lq'] = self.lq
if isinstance(self.scale, (list, tuple)) and len(
@@ -87,7 +87,7 @@ class DRN(BaseSRModel):
self.lqx2 = input['lqx2']
if 'gt' in input:
- self.gt = paddle.fluid.dygraph.to_variable(input['gt'])
+ self.gt = paddle.to_tensor(input['gt'])
self.visual_items['gt'] = self.gt
self.image_paths = input['lq_path']
diff --git a/ppgan/models/edvr_model.py b/ppgan/models/edvr_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..387714275cf7cc9e42f9e3eb34bc8c74948124c3
--- /dev/null
+++ b/ppgan/models/edvr_model.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .base_model import apply_to_static
+from .builder import MODELS
+from .sr_model import BaseSRModel
+from .generators.edvr import ResidualBlockNoBN, DCNPack
+from ..modules.init import reset_parameters
+
+
+@MODELS.register()
+class EDVRModel(BaseSRModel):
+ """EDVR Model.
+
+ Paper: EDVR: Video Restoration with Enhanced Deformable Convolutional Networks.
+ """
+
+ def __init__(self, generator, tsa_iter, pixel_criterion=None, to_static=False,
+ image_shape=None):
+ """Initialize the EDVR class.
+
+ Args:
+ generator (dict): config of generator.
+            tsa_iter (int): number of warm-up iterations that train only the TSA module.
+ pixel_criterion (dict): config of pixel criterion.
+ """
+ super(EDVRModel, self).__init__(generator, pixel_criterion,
+ to_static=to_static,
+ image_shape=image_shape)
+ self.tsa_iter = tsa_iter
+ self.current_iter = 1
+ init_edvr_weight(self.nets['generator'])
+
+ def setup_input(self, input):
+ self.lq = input['lq']
+ self.visual_items['lq'] = self.lq[:, 2, :, :, :]
+ self.visual_items['lq-2'] = self.lq[:, 0, :, :, :]
+ self.visual_items['lq-1'] = self.lq[:, 1, :, :, :]
+ self.visual_items['lq+1'] = self.lq[:, 3, :, :, :]
+ self.visual_items['lq+2'] = self.lq[:, 4, :, :, :]
+ if 'gt' in input:
+ self.gt = input['gt'][:, 0, :, :, :]
+ self.visual_items['gt'] = self.gt
+ self.image_paths = input['lq_path']
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_grad()
+ if self.tsa_iter:
+ if self.current_iter == 1:
+ print('Only train TSA module for', self.tsa_iter, 'iters.')
+ for name, param in self.nets['generator'].named_parameters():
+ if 'TSAModule' not in name:
+ param.trainable = False
+ elif self.current_iter == self.tsa_iter + 1:
+ print('Train all the parameters.')
+ for param in self.nets['generator'].parameters():
+ param.trainable = True
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ # pixel loss
+ loss_pixel = self.pixel_criterion(self.output, self.gt)
+ self.losses['loss_pixel'] = loss_pixel
+
+ loss_pixel.backward()
+ optims['optim'].step()
+ self.current_iter += 1
+
+ # amp train with brute force implementation
+ def train_iter_amp(self, optims=None, scalers=None, amp_level='O1'):
+ optims['optim'].clear_grad()
+ if self.tsa_iter:
+ if self.current_iter == 1:
+ print('Only train TSA module for', self.tsa_iter, 'iters.')
+ for name, param in self.nets['generator'].named_parameters():
+ if 'TSAModule' not in name:
+ param.trainable = False
+ elif self.current_iter == self.tsa_iter + 1:
+ print('Train all the parameters.')
+ for param in self.nets['generator'].parameters():
+ param.trainable = True
+
+ # put loss computation in amp context
+ with paddle.amp.auto_cast(enable=True, level=amp_level):
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ # pixel loss
+ loss_pixel = self.pixel_criterion(self.output, self.gt)
+ self.losses['loss_pixel'] = loss_pixel
+
+ scaled_loss = scalers[0].scale(loss_pixel)
+ scaled_loss.backward()
+ scalers[0].minimize(optims['optim'], scaled_loss)
+
+ self.current_iter += 1
+
+
+def init_edvr_weight(net):
+
+ def reset_func(m):
+ if hasattr(m, 'weight') and (not isinstance(
+ m, (nn.BatchNorm, nn.BatchNorm2D))) and (
+ not isinstance(m, ResidualBlockNoBN) and
+ (not isinstance(m, DCNPack))):
+ reset_parameters(m)
+
+ net.apply(reset_func)
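The TSA warm-up in train_iter() freezes everything except the temporal-spatial attention module for the first tsa_iter steps. The same pattern, pulled out as a small helper for readability (a sketch, not part of the patch; 'TSAModule' is the substring the EDVR generator uses in its parameter names):

import paddle.nn as nn

def freeze_all_but_tsa(net: nn.Layer):
    # Keep only parameters whose names mention the TSA fusion module trainable.
    for name, param in net.named_parameters():
        param.trainable = 'TSAModule' in name

def unfreeze_all(net: nn.Layer):
    for param in net.parameters():
        param.trainable = True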
diff --git a/ppgan/models/esrgan_model.py b/ppgan/models/esrgan_model.py
index fe67cff05550dd777e35ea9ae65a706cffa32916..08c7b676a21c2c55b07601a627c8ea6630671062 100644
--- a/ppgan/models/esrgan_model.py
+++ b/ppgan/models/esrgan_model.py
@@ -29,6 +29,7 @@ class ESRGAN(BaseSRModel):
ESRGAN paper: https://arxiv.org/pdf/1809.00219.pdf
"""
+
def __init__(self,
generator,
discriminator=None,
@@ -127,3 +128,87 @@ class ESRGAN(BaseSRModel):
else:
l_total.backward()
optimizers['optimG'].step()
+
+ # amp training
+ def train_iter_amp(self, optimizers=None, scalers=None, amp_level='O1'):
+ optimizers['optimG'].clear_grad()
+ l_total = 0
+
+ # put loss computation in amp context
+ with paddle.amp.auto_cast(enable=True, level=amp_level):
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ # pixel loss
+ if self.pixel_criterion:
+ l_pix = self.pixel_criterion(self.output, self.gt)
+ l_total += l_pix
+ self.losses['loss_pix'] = l_pix
+ if self.perceptual_criterion:
+ l_g_percep, l_g_style = self.perceptual_criterion(
+ self.output, self.gt)
+ # l_total += l_pix
+ if l_g_percep is not None:
+ l_total += l_g_percep
+ self.losses['loss_percep'] = l_g_percep
+ if l_g_style is not None:
+ l_total += l_g_style
+ self.losses['loss_style'] = l_g_style
+
+ # gan loss (relativistic gan)
+ if hasattr(self, 'gan_criterion'):
+ self.set_requires_grad(self.nets['discriminator'], False)
+
+ # put fwd and loss computation in amp context
+ with paddle.amp.auto_cast(enable=True, level=amp_level):
+ real_d_pred = self.nets['discriminator'](self.gt).detach()
+ fake_g_pred = self.nets['discriminator'](self.output)
+ l_g_real = self.gan_criterion(real_d_pred -
+ paddle.mean(fake_g_pred),
+ False,
+ is_disc=False)
+ l_g_fake = self.gan_criterion(fake_g_pred -
+ paddle.mean(real_d_pred),
+ True,
+ is_disc=False)
+ l_g_gan = (l_g_real + l_g_fake) / 2
+
+ l_total += l_g_gan
+ self.losses['l_g_gan'] = l_g_gan
+
+ scaled_l_total = scalers[0].scale(l_total)
+ scaled_l_total.backward()
+            scalers[0].minimize(optimizers['optimG'], scaled_l_total)
+
+ self.set_requires_grad(self.nets['discriminator'], True)
+ optimizers['optimD'].clear_grad()
+
+ with paddle.amp.auto_cast(enable=True, level=amp_level):
+ # real
+ fake_d_pred = self.nets['discriminator'](self.output).detach()
+ real_d_pred = self.nets['discriminator'](self.gt)
+ l_d_real = self.gan_criterion(
+ real_d_pred - paddle.mean(fake_d_pred), True,
+ is_disc=True) * 0.5
+
+ # fake
+ fake_d_pred = self.nets['discriminator'](self.output.detach())
+ l_d_fake = self.gan_criterion(
+ fake_d_pred - paddle.mean(real_d_pred.detach()),
+ False,
+ is_disc=True) * 0.5
+
+ l_temp = l_d_real + l_d_fake
+ scaled_l_temp = scalers[1].scale(l_temp)
+ scaled_l_temp.backward()
+            scalers[1].minimize(optimizers['optimD'], scaled_l_temp)
+
+ self.losses['l_d_real'] = l_d_real
+ self.losses['l_d_fake'] = l_d_fake
+ self.losses['out_d_real'] = paddle.mean(real_d_pred.detach())
+ self.losses['out_d_fake'] = paddle.mean(fake_d_pred.detach())
+ else:
+ scaled_l_total = scalers[0].scale(l_total)
+ scaled_l_total.backward()
+            scalers[0].minimize(optimizers['optimG'], scaled_l_total)
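The AMP branch keeps ESRGAN's relativistic-average pairing: the generator is scored on D(real) - E[D(fake)] and D(fake) - E[D(real)], the discriminator on the reverse. A sketch of just that pairing with made-up discriminator outputs (gan_criterion is assumed to be a logits-based GANLoss):

import paddle

real_d_pred = paddle.to_tensor([1.5, 0.7])    # D(real), detached on the G step
fake_g_pred = paddle.to_tensor([-0.2, 0.4])   # D(fake)

# relativistic logits fed to gan_criterion in the generator update
logits_real_vs_fake = real_d_pred - paddle.mean(fake_g_pred)   # labelled False
logits_fake_vs_real = fake_g_pred - paddle.mean(real_d_pred)   # labelled True
print(logits_real_vs_fake.numpy(), logits_fake_vs_real.numpy())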
diff --git a/ppgan/models/firstorder_model.py b/ppgan/models/firstorder_model.py
new file mode 100755
index 0000000000000000000000000000000000000000..e1348620dfefdeeb921db8d7550f61d033227e20
--- /dev/null
+++ b/ppgan/models/firstorder_model.py
@@ -0,0 +1,522 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
+
+import paddle
+
+from .base_model import BaseModel
+from .builder import MODELS
+from .discriminators.builder import build_discriminator
+from .generators.builder import build_generator
+from ..modules.init import init_weights
+from ..solver import build_optimizer
+from paddle.optimizer.lr import MultiStepDecay
+from ..modules.init import reset_parameters, uniform_
+import paddle.nn as nn
+import numpy as np
+from paddle.utils import try_import
+import paddle.nn.functional as F
+import cv2
+import os
+
+
+def init_weight(net):
+ def reset_func(m):
+ if isinstance(m, (nn.BatchNorm, nn.BatchNorm2D, nn.SyncBatchNorm)):
+ m.weight = uniform_(m.weight, 0, 1)
+ elif hasattr(m, 'weight') and hasattr(m, 'bias'):
+ reset_parameters(m)
+
+ net.apply(reset_func)
+
+
+@MODELS.register()
+class FirstOrderModel(BaseModel):
+    """ This class implements the FirstOrderMotion model. Paper:
+ https://proceedings.neurips.cc/paper/2019/file/31c0b36aef265d9221af80872ceb62f9-Paper.pdf.
+ """
+ def __init__(self,
+ common_params,
+ train_params,
+ generator,
+ discriminator=None):
+ super(FirstOrderModel, self).__init__()
+
+ # def local var
+ self.input_data = None
+ self.generated = None
+ self.losses_generator = None
+ self.train_params = train_params
+ # define networks
+ generator_cfg = generator
+ generator_cfg.update({'common_params': common_params})
+ generator_cfg.update({'train_params': train_params})
+ generator_cfg.update(
+ {'dis_scales': discriminator.discriminator_cfg.scales})
+ self.nets['Gen_Full'] = build_generator(generator_cfg)
+ discriminator_cfg = discriminator
+ discriminator_cfg.update({'common_params': common_params})
+ discriminator_cfg.update({'train_params': train_params})
+ self.nets['Dis'] = build_discriminator(discriminator_cfg)
+ self.visualizer = Visualizer()
+ self.test_loss = []
+ self.is_train = False
+
+ def setup_lr_schedulers(self, lr_cfg):
+ self.kp_lr = MultiStepDecay(learning_rate=lr_cfg['lr_kp_detector'],
+ milestones=lr_cfg['epoch_milestones'],
+ gamma=0.1)
+ self.gen_lr = MultiStepDecay(learning_rate=lr_cfg['lr_generator'],
+ milestones=lr_cfg['epoch_milestones'],
+ gamma=0.1)
+ self.dis_lr = MultiStepDecay(learning_rate=lr_cfg['lr_discriminator'],
+ milestones=lr_cfg['epoch_milestones'],
+ gamma=0.1)
+ self.lr_scheduler = {
+ "kp_lr": self.kp_lr,
+ "gen_lr": self.gen_lr,
+ "dis_lr": self.dis_lr
+ }
+
+ def setup_net_parallel(self):
+ if isinstance(self.nets['Gen_Full'], paddle.DataParallel):
+ self.nets['kp_detector'] = self.nets[
+ 'Gen_Full']._layers.kp_extractor
+ self.nets['generator'] = self.nets['Gen_Full']._layers.generator
+ self.nets['discriminator'] = self.nets['Dis']._layers.discriminator
+ else:
+ self.nets['kp_detector'] = self.nets['Gen_Full'].kp_extractor
+ self.nets['generator'] = self.nets['Gen_Full'].generator
+ self.nets['discriminator'] = self.nets['Dis'].discriminator
+
+ def setup_optimizers(self, lr_cfg, optimizer):
+ self.setup_net_parallel()
+ # init params
+ init_weight(self.nets['kp_detector'])
+ init_weight(self.nets['generator'])
+ init_weight(self.nets['discriminator'])
+
+ # define loss functions
+ self.losses = {}
+
+ self.optimizers['optimizer_KP'] = build_optimizer(
+ optimizer,
+ self.kp_lr,
+ parameters=self.nets['kp_detector'].parameters())
+ self.optimizers['optimizer_Gen'] = build_optimizer(
+ optimizer,
+ self.gen_lr,
+ parameters=self.nets['generator'].parameters())
+ self.optimizers['optimizer_Dis'] = build_optimizer(
+ optimizer,
+ self.dis_lr,
+ parameters=self.nets['discriminator'].parameters())
+
+ def setup_input(self, input):
+ self.input_data = input
+
+ def forward(self):
+        """Run forward pass; called by train_iter()."""
+ self.losses_generator, self.generated = \
+ self.nets['Gen_Full'](self.input_data.copy(), self.nets['discriminator'])
+
+
+ def backward_G(self):
+ loss_values = [val.mean() for val in self.losses_generator.values()]
+ loss = paddle.add_n(loss_values)
+ self.losses = dict(zip(self.losses_generator.keys(), loss_values))
+ loss.backward()
+
+ def backward_D(self):
+ losses_discriminator = self.nets['Dis'](self.input_data.copy(),
+ self.generated)
+ loss_values = [val.mean() for val in losses_discriminator.values()]
+ loss = paddle.add_n(loss_values)
+ loss.backward()
+ self.losses.update(dict(zip(losses_discriminator.keys(), loss_values)))
+
+ def train_iter(self, optimizers=None):
+        self.is_train = True
+ self.forward()
+ # update G
+ self.set_requires_grad(self.nets['discriminator'], False)
+ self.optimizers['optimizer_KP'].clear_grad()
+ self.optimizers['optimizer_Gen'].clear_grad()
+ self.backward_G()
+ self.optimizers['optimizer_KP'].step()
+ self.optimizers['optimizer_Gen'].step()
+
+ # update D
+ if self.train_params['loss_weights']['generator_gan'] != 0:
+ self.set_requires_grad(self.nets['discriminator'], True)
+ self.optimizers['optimizer_Dis'].clear_grad()
+ self.backward_D()
+ self.optimizers['optimizer_Dis'].step()
+
+ def test_iter(self, metrics=None):
+ if not self.is_train:
+ self.is_train = True
+ self.setup_net_parallel()
+
+ self.nets['kp_detector'].eval()
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ kp_source = self.nets['kp_detector'](self.input_data['video'][:, :,
+ 0])
+ for frame_idx in range(self.input_data['video'].shape[2]):
+ source = self.input_data['video'][:, :, 0]
+ driving = self.input_data['video'][:, :, frame_idx]
+ kp_driving = self.nets['kp_detector'](driving)
+ out = self.nets['generator'](source,
+ kp_source=kp_source,
+ kp_driving=kp_driving)
+ out.update({'kp_source': kp_source, 'kp_driving': kp_driving})
+ loss = paddle.abs(out['prediction'] -
+ driving).mean().cpu().numpy()
+ self.test_loss.append(loss)
+ self.visual_items['driving_source_gen'] = self.visualizer.visualize(
+ driving, source, out)
+ print("Reconstruction loss: %s" % np.mean(self.test_loss))
+ self.nets['kp_detector'].train()
+ self.nets['generator'].train()
+
+ class InferGenerator(paddle.nn.Layer):
+ def set_generator(self, generator):
+ self.generator = generator
+
+ def forward(self, source, kp_source, kp_driving, kp_driving_initial):
+ kp_norm = {k: v for k, v in kp_driving.items()}
+
+ kp_value_diff = (kp_driving['value'] - kp_driving_initial['value'])
+ kp_norm['value'] = kp_value_diff + kp_source['value']
+
+ jacobian_diff = paddle.matmul(
+ kp_driving['jacobian'],
+ paddle.inverse(kp_driving_initial['jacobian']))
+ kp_norm['jacobian'] = paddle.matmul(jacobian_diff,
+ kp_source['jacobian'])
+ out = self.generator(source,
+ kp_source=kp_source,
+ kp_driving=kp_norm)
+ return out['prediction']
+
+ def export_model(self, export_model=None, output_dir=None, inputs_size=[], export_serving_model=False, model_name=None):
+
+ source = paddle.rand(shape=inputs_size[0], dtype='float32')
+ driving = paddle.rand(shape=inputs_size[1], dtype='float32')
+ value = paddle.rand(shape=inputs_size[2], dtype='float32')
+ j = paddle.rand(shape=inputs_size[3], dtype='float32')
+ value2 = paddle.rand(shape=inputs_size[2], dtype='float32')
+ j2 = paddle.rand(shape=inputs_size[3], dtype='float32')
+ driving1 = {'value': value, 'jacobian': j}
+ driving2 = {'value': value2, 'jacobian': j2}
+ driving3 = {'value': value, 'jacobian': j}
+
+ if output_dir is None:
+ output_dir = 'inference_model'
+ outpath = os.path.join(output_dir, "fom_dy2st")
+ if not os.path.exists(outpath):
+ os.makedirs(outpath)
+ paddle.jit.save(self.nets['Gen_Full'].kp_extractor,
+ os.path.join(outpath, "kp_detector"),
+ input_spec=[source])
+ infer_generator = self.InferGenerator()
+ infer_generator.set_generator(self.nets['Gen_Full'].generator)
+ paddle.jit.save(infer_generator,
+ os.path.join(outpath, "generator"),
+ input_spec=[source, driving1, driving2, driving3])
+
+
+@MODELS.register()
+class FirstOrderModelMobile(FirstOrderModel):
+    """ This class implements FirstOrderMotionMobile, a lightweight variant of the model from the FirstOrderMotion paper:
+ https://proceedings.neurips.cc/paper/2019/file/31c0b36aef265d9221af80872ceb62f9-Paper.pdf.
+ """
+ def __init__(self,
+ common_params,
+ train_params,
+ generator_ori,
+ generator,
+ mode,
+ kp_weight_path=None,
+ gen_weight_path=None,
+ discriminator=None):
+ super(FirstOrderModel, self).__init__()
+ modes = ["kp_detector", "generator", "both"]
+ assert mode in modes
+ # def local var
+ self.input_data = None
+ self.generated = None
+ self.losses_generator = None
+ self.train_params = train_params
+
+ # fix origin fom model for distill
+ generator_ori_cfg = generator_ori
+ generator_ori_cfg.update({'common_params': common_params})
+ generator_ori_cfg.update({'train_params': train_params})
+ generator_ori_cfg.update(
+ {'dis_scales': discriminator.discriminator_cfg.scales})
+ self.Gen_Full_ori = build_generator(generator_ori_cfg)
+ discriminator_cfg = discriminator
+ discriminator_cfg.update({'common_params': common_params})
+ discriminator_cfg.update({'train_params': train_params})
+ self.nets['Dis'] = build_discriminator(discriminator_cfg)
+
+ # define networks
+ generator_cfg = generator
+ generator_cfg.update({'common_params': common_params})
+ generator_cfg.update({'train_params': train_params})
+ generator_cfg.update(
+ {'dis_scales': discriminator.discriminator_cfg.scales})
+ if (mode == "kp_detector"):
+ print("just train kp_detector, fix generator")
+ generator_cfg.update(
+ {'generator_cfg': generator_ori_cfg['generator_cfg']})
+ elif mode == "generator":
+ print("just train generator, fix kp_detector")
+ generator_cfg.update(
+ {'kp_detector_cfg': generator_ori_cfg['kp_detector_cfg']})
+ elif mode == "both":
+ print("train both kp_detector and generator")
+ self.mode = mode
+ self.nets['Gen_Full'] = build_generator(generator_cfg)
+ self.kp_weight_path = kp_weight_path
+ self.gen_weight_path = gen_weight_path
+ self.visualizer = Visualizer()
+ self.test_loss = []
+ self.is_train = False
+
+
+ def setup_net_parallel(self):
+ if isinstance(self.nets['Gen_Full'], paddle.DataParallel):
+ self.nets['kp_detector'] = self.nets[
+ 'Gen_Full']._layers.kp_extractor
+ self.nets['generator'] = self.nets['Gen_Full']._layers.generator
+ self.nets['discriminator'] = self.nets['Dis']._layers.discriminator
+ else:
+ self.nets['kp_detector'] = self.nets['Gen_Full'].kp_extractor
+ self.nets['generator'] = self.nets['Gen_Full'].generator
+ self.nets['discriminator'] = self.nets['Dis'].discriminator
+ self.kp_detector_ori = self.Gen_Full_ori.kp_extractor
+ if self.is_train:
+ return
+
+ from ppgan.utils.download import get_path_from_url
+ vox_cpk_weight_url = 'https://paddlegan.bj.bcebos.com/applications/first_order_model/vox-cpk.pdparams'
+ weight_path = get_path_from_url(vox_cpk_weight_url)
+ checkpoint = paddle.load(weight_path)
+ if (self.mode == "kp_detector"):
+ print("load pretrained generator... ")
+ self.nets['generator'].set_state_dict(checkpoint['generator'])
+ for param in self.nets['generator'].parameters():
+ param.stop_gradient = True
+ elif self.mode == "generator":
+ print("load pretrained kp_detector... ")
+ self.nets['kp_detector'].set_state_dict(checkpoint['kp_detector'])
+ for param in self.nets['kp_detector'].parameters():
+ param.stop_gradient = True
+
+ def setup_optimizers(self, lr_cfg, optimizer):
+ self.setup_net_parallel()
+ # init params
+ init_weight(self.nets['discriminator'])
+ self.optimizers['optimizer_Dis'] = build_optimizer(
+ optimizer,
+ self.dis_lr,
+ parameters=self.nets['discriminator'].parameters())
+
+ if (self.mode == "kp_detector"):
+ init_weight(self.nets['kp_detector'])
+ self.optimizers['optimizer_KP'] = build_optimizer(
+ optimizer,
+ self.kp_lr,
+ parameters=self.nets['kp_detector'].parameters())
+ elif self.mode == "generator":
+ init_weight(self.nets['generator'])
+ self.optimizers['optimizer_Gen'] = build_optimizer(
+ optimizer,
+ self.gen_lr,
+ parameters=self.nets['generator'].parameters())
+ elif self.mode == "both":
+ super(FirstOrderModelMobile,
+ self).setup_optimizers(lr_cfg, optimizer)
+ print("load both pretrained kp_detector and generator")
+ checkpoint = paddle.load(self.kp_weight_path)
+ self.nets['kp_detector'].set_state_dict(checkpoint['kp_detector'])
+ checkpoint = paddle.load(self.gen_weight_path)
+ self.nets['generator'].set_state_dict(checkpoint['generator'])
+
+ # define loss functions
+ self.losses = {}
+
+ def forward(self):
+        """Run forward pass; called by train_iter()."""
+ if (self.mode == "kp_detector_distill"):
+ self.losses_generator, self.generated = \
+ self.nets['Gen_Full'](self.input_data.copy(), self.nets['discriminator'], self.kp_detector_ori)
+ else:
+ self.losses_generator, self.generated = \
+ self.nets['Gen_Full'](self.input_data.copy(), self.nets['discriminator'])
+
+ def train_iter(self, optimizers=None):
+ self.is_train = True
+ if (self.mode == "both"):
+ super(FirstOrderModelMobile, self).train_iter(optimizers=optimizers)
+ return
+ self.forward()
+ # update G
+ self.set_requires_grad(self.nets['discriminator'], False)
+ if (self.mode == "kp_detector"):
+ self.optimizers['optimizer_KP'].clear_grad()
+ self.backward_G()
+ self.optimizers['optimizer_KP'].step()
+ if (self.mode == "generator"):
+ self.optimizers['optimizer_Gen'].clear_grad()
+ self.backward_G()
+ self.optimizers['optimizer_Gen'].step()
+
+ # update D
+ if self.train_params['loss_weights']['generator_gan'] != 0:
+ self.set_requires_grad(self.nets['discriminator'], True)
+ self.optimizers['optimizer_Dis'].clear_grad()
+ self.backward_D()
+ self.optimizers['optimizer_Dis'].step()
+
+
+class Visualizer:
+ def __init__(self, kp_size=3, draw_border=False, colormap='gist_rainbow'):
+ plt = try_import('matplotlib.pyplot')
+ self.kp_size = kp_size
+ self.draw_border = draw_border
+ self.colormap = plt.get_cmap(colormap)
+
+ def draw_image_with_kp(self, image, kp_array):
+ image = np.copy(image)
+ spatial_size = np.array(image.shape[:2][::-1])[np.newaxis]
+ kp_array = spatial_size * (kp_array + 1) / 2
+ num_kp = kp_array.shape[0]
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ image = (image * 255).astype(np.uint8)
+ for kp_ind, kp in enumerate(kp_array):
+ color = cv2.applyColorMap(
+ np.array(kp_ind / num_kp * 255).astype(np.uint8),
+ cv2.COLORMAP_JET)[0][0]
+ color = (int(color[0]), int(color[1]), int(color[2]))
+ image = cv2.circle(image, (int(kp[1]), int(kp[0])), self.kp_size,
+ color, 3)
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR).astype('float32') / 255.0
+ return image
+
+ def create_image_column_with_kp(self, images, kp):
+ image_array = np.array(
+ [self.draw_image_with_kp(v, k) for v, k in zip(images, kp)])
+ return self.create_image_column(image_array)
+
+ def create_image_column(self, images, draw_border=False):
+ if draw_border:
+ images = np.copy(images)
+ images[:, :, [0, -1]] = (1, 1, 1)
+ return np.concatenate(list(images), axis=0)
+
+ def create_image_grid(self, *args):
+ out = []
+ for arg in args:
+ if type(arg) == tuple:
+ out.append(self.create_image_column_with_kp(arg[0], arg[1]))
+ else:
+ out.append(self.create_image_column(arg))
+ return np.concatenate(out, axis=1)
+
+ def visualize(self, driving, source, out):
+ images = []
+ # Source image with keypoints
+ source = source.cpu().numpy()
+ kp_source = out['kp_source']['value'].cpu().numpy()
+ source = np.transpose(source, [0, 2, 3, 1])
+ images.append((source, kp_source))
+
+ # Equivariance visualization
+ if 'transformed_frame' in out:
+ transformed = out['transformed_frame'].cpu().numpy()
+ transformed = np.transpose(transformed, [0, 2, 3, 1])
+ transformed_kp = out['transformed_kp']['value'].cpu().numpy()
+ images.append((transformed, transformed_kp))
+
+ # Driving image with keypoints
+ kp_driving = out['kp_driving']['value'].cpu().numpy()
+ driving = driving.cpu().numpy()
+ driving = np.transpose(driving, [0, 2, 3, 1])
+ images.append((driving, kp_driving))
+
+ # Deformed image
+ if 'deformed' in out:
+ deformed = out['deformed'].cpu().numpy()
+ deformed = np.transpose(deformed, [0, 2, 3, 1])
+ images.append(deformed)
+
+ # Result with and without keypoints
+ prediction = out['prediction'].cpu().numpy()
+ prediction = np.transpose(prediction, [0, 2, 3, 1])
+ if 'kp_norm' in out:
+ kp_norm = out['kp_norm']['value'].cpu().numpy()
+ images.append((prediction, kp_norm))
+ images.append(prediction)
+
+ ## Occlusion map
+ if 'occlusion_map' in out:
+ occlusion_map = out['occlusion_map'].cpu().tile([1, 3, 1, 1])
+ occlusion_map = F.interpolate(occlusion_map,
+ size=source.shape[1:3]).numpy()
+ occlusion_map = np.transpose(occlusion_map, [0, 2, 3, 1])
+ images.append(occlusion_map)
+
+ # Deformed images according to each individual transform
+ if 'sparse_deformed' in out:
+ full_mask = []
+ for i in range(out['sparse_deformed'].shape[1]):
+ image = out['sparse_deformed'][:, i].cpu()
+ image = F.interpolate(image, size=source.shape[1:3])
+ mask = out['mask'][:, i:(i + 1)].cpu().tile([1, 3, 1, 1])
+ mask = F.interpolate(mask, size=source.shape[1:3])
+ image = np.transpose(image.numpy(), (0, 2, 3, 1))
+ mask = np.transpose(mask.numpy(), (0, 2, 3, 1))
+
+ if i != 0:
+ color = np.array(
+ self.colormap(
+ (i - 1) /
+ (out['sparse_deformed'].shape[1] - 1)))[:3]
+ else:
+ color = np.array((0, 0, 0))
+
+ color = color.reshape((1, 1, 1, 3))
+
+ images.append(image)
+ if i != 0:
+ images.append(mask * color)
+ else:
+ images.append(mask)
+
+ full_mask.append(mask * color)
+
+ images.append(sum(full_mask))
+
+ image = self.create_image_grid(*images)
+ image = (255 * image).astype(np.uint8)
+ return image
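export_model() above writes dygraph-to-static programs with paddle.jit.save under <output_dir>/fom_dy2st. A rough reload sketch, assuming the default 'inference_model' output directory and that the frame shape matches the InputSpec used at export time:

import paddle

kp_detector = paddle.jit.load('inference_model/fom_dy2st/kp_detector')
kp_detector.eval()

frame = paddle.rand([1, 3, 256, 256])    # hypothetical frame; must match export shape
kp_outputs = kp_detector(frame)           # keypoint values (and jacobians) for the frame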
diff --git a/ppgan/models/gan_model.py b/ppgan/models/gan_model.py
index 6c82488bd0fe495b3ee0fa13ea8c28795b559261..cfe157d2fb90773bac36e250316d03eb95644a73 100644
--- a/ppgan/models/gan_model.py
+++ b/ppgan/models/gan_model.py
@@ -83,7 +83,11 @@ class GANModel(BaseModel):
input = {'img': input}
self.D_real_inputs = [paddle.to_tensor(input['img'])]
if 'class_id' in input: # n class input
- self.n_class = self.nets['netG'].n_class
+ if isinstance(self.nets['netG'], paddle.DataParallel):
+ self.n_class = self.nets['netG']._layers.n_class
+ else:
+ self.n_class = self.nets['netG'].n_class
+
self.D_real_inputs += [
paddle.to_tensor(input['class_id'], dtype='int64')
]
@@ -91,7 +95,12 @@ class GANModel(BaseModel):
self.n_class = 0
batch_size = self.D_real_inputs[0].shape[0]
- self.G_inputs = self.nets['netG'].random_inputs(batch_size)
+
+ if isinstance(self.nets['netG'], paddle.DataParallel):
+ self.G_inputs = self.nets['netG']._layers.random_inputs(batch_size)
+ else:
+ self.G_inputs = self.nets['netG'].random_inputs(batch_size)
+
if not isinstance(self.G_inputs, (list, tuple)):
self.G_inputs = [self.G_inputs]
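The two isinstance(..., paddle.DataParallel) branches added here exist only to reach attributes of the wrapped layer. A tiny helper expressing the same idea (a sketch; the patch keeps the explicit branches):

import paddle

def unwrap(net):
    # Return the underlying layer whether or not it is wrapped by DataParallel.
    return net._layers if isinstance(net, paddle.DataParallel) else net

# e.g. n_class = unwrap(nets['netG']).n_class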
diff --git a/ppgan/models/generators/__init__.py b/ppgan/models/generators/__init__.py
index c017baf9759672345ce9f7b7be1e1e0dcd8a5227..76af90ac8c5462039d8c00260ee06858c1e5caeb 100644
--- a/ppgan/models/generators/__init__.py
+++ b/ppgan/models/generators/__init__.py
@@ -26,3 +26,25 @@ from .resnet_ugatit_p2c import ResnetUGATITP2CGenerator
from .generator_styleganv2 import StyleGANv2Generator
from .generator_pixel2style2pixel import Pixel2Style2Pixel
from .drn import DRNGenerator
+from .generator_starganv2 import StarGANv2Generator, StarGANv2Style, StarGANv2Mapping, FAN
+from .edvr import EDVRNet
+from .generator_firstorder import FirstOrderGenerator
+from .generater_lapstyle import DecoderNet, Encoder, RevisionNet
+from .basicvsr import BasicVSRNet
+from .mpr import MPRNet
+from .iconvsr import IconVSR
+from .gpen import GPEN
+from .pan import PAN
+from .generater_photopen import SPADEGenerator
+from .basicvsr_plus_plus import BasicVSRPlusPlus
+from .msvsr import MSVSR
+from .generator_singan import SinGANGenerator
+from .rcan import RCAN
+from .prenet import PReNet
+from .generator_gpen import GPENGenerator
+from .swinir import SwinIR
+from .gfpganv1_clean_arch import GFPGANv1Clean
+from .gfpganv1_arch import GFPGANv1, StyleGAN2DiscriminatorGFPGAN
+from .invdn import InvDN
+from .nafnet import NAFNet, NAFNetLocal
+from .generater_aotgan import InpaintGenerator
diff --git a/ppgan/models/generators/basicvsr.py b/ppgan/models/generators/basicvsr.py
new file mode 100644
index 0000000000000000000000000000000000000000..b57290e90860306057f1270183f7ce8adc40e401
--- /dev/null
+++ b/ppgan/models/generators/basicvsr.py
@@ -0,0 +1,696 @@
+# Copyright (c) MMEditing Authors.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.vision.ops import DeformConv2D
+from ...utils.download import get_path_from_url
+from ...modules.init import kaiming_normal_, constant_
+from .builder import GENERATORS
+
+
+@paddle.no_grad()
+def default_init_weights(layer_list, scale=1, bias_fill=0, **kwargs):
+ """Initialize network weights.
+
+ Args:
+ layer_list (list[nn.Layer] | nn.Layer): Layers to be initialized.
+ scale (float): Scale initialized weights, especially for residual
+ blocks. Default: 1.
+ bias_fill (float): The value to fill bias. Default: 0
+ kwargs (dict): Other arguments for initialization function.
+ """
+ if not isinstance(layer_list, list):
+ layer_list = [layer_list]
+ for m in layer_list:
+ if isinstance(m, nn.Conv2D):
+ kaiming_normal_(m.weight, **kwargs)
+ scale_weight = scale * m.weight
+ m.weight.set_value(scale_weight)
+ if m.bias is not None:
+ constant_(m.bias, bias_fill)
+ elif isinstance(m, nn.Linear):
+ kaiming_normal_(m.weight, **kwargs)
+ scale_weight = scale * m.weight
+ m.weight.set_value(scale_weight)
+ if m.bias is not None:
+ constant_(m.bias, bias_fill)
+ elif isinstance(m, nn.BatchNorm):
+ constant_(m.weight, 1)
+
+
+class PixelShufflePack(nn.Layer):
+ """ Pixel Shuffle upsample layer.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ scale_factor (int): Upsample ratio.
+ upsample_kernel (int): Kernel size of Conv layer to expand channels.
+
+ Returns:
+ Upsampled feature map.
+ """
+ def __init__(self, in_channels, out_channels, scale_factor,
+ upsample_kernel):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.scale_factor = scale_factor
+ self.upsample_kernel = upsample_kernel
+ self.upsample_conv = nn.Conv2D(self.in_channels,
+ self.out_channels * scale_factor *
+ scale_factor,
+ self.upsample_kernel,
+ padding=(self.upsample_kernel - 1) // 2)
+ self.pixel_shuffle = nn.PixelShuffle(self.scale_factor)
+ self.init_weights()
+
+ def init_weights(self):
+ """Initialize weights for PixelShufflePack.
+ """
+ default_init_weights(self, 1)
+
+ def forward(self, x):
+ """Forward function for PixelShufflePack.
+
+ Args:
+            x (Tensor): Input tensor with shape (n, in_channels, h, w).
+
+        Returns:
+            Tensor with shape (n, out_channels, scale_factor*h, scale_factor*w).
+ """
+ x = self.upsample_conv(x)
+ x = self.pixel_shuffle(x)
+ return x
+
+
+def MakeMultiBlocks(func, num_layers, nf=64):
+ """Make layers by stacking the same blocks.
+
+ Args:
+ func (nn.Layer): nn.Layer class for basic block.
+ num_layers (int): number of blocks.
+
+ Returns:
+ nn.Sequential: Stacked blocks in nn.Sequential.
+ """
+ Blocks = nn.Sequential()
+ for i in range(num_layers):
+ Blocks.add_sublayer('block%d' % i, func(nf))
+ return Blocks
+
+
+class ResidualBlockNoBN(nn.Layer):
+ """Residual block without BN.
+
+ It has a style of:
+ ---Conv-ReLU-Conv-+-
+ |________________|
+
+ Args:
+ nf (int): Channel number of intermediate features.
+ Default: 64.
+ res_scale (float): Residual scale. Default: 1.0.
+ """
+ def __init__(self, nf=64, res_scale=1.0):
+ super(ResidualBlockNoBN, self).__init__()
+ self.nf = nf
+ self.res_scale = res_scale
+ self.conv1 = nn.Conv2D(self.nf, self.nf, 3, 1, 1)
+ self.conv2 = nn.Conv2D(self.nf, self.nf, 3, 1, 1)
+ self.relu = nn.ReLU()
+ if self.res_scale == 1.0:
+ default_init_weights([self.conv1, self.conv2], 0.1)
+
+ def forward(self, x):
+ """Forward function.
+
+ Args:
+ x (Tensor): Input tensor with shape (n, c, h, w).
+
+ Returns:
+ Tensor with shape (n, c, h, w).
+ """
+ identity = x
+ out = self.conv2(self.relu(self.conv1(x)))
+ return identity + out * self.res_scale
+
+
+def flow_warp(x,
+ flow,
+ interpolation='bilinear',
+ padding_mode='zeros',
+ align_corners=True):
+ """Warp an image or a feature map with optical flow.
+
+ Args:
+ x (Tensor): Tensor with size (n, c, h, w).
+        flow (Tensor): Tensor with size (n, h, w, 2). The last dimension holds
+            two channels, denoting the horizontal and vertical relative offsets.
+ Note that the values are not normalized to [-1, 1].
+ interpolation (str): Interpolation mode: 'nearest' or 'bilinear'.
+ Default: 'bilinear'.
+ padding_mode (str): Padding mode: 'zeros' or 'border' or 'reflection'.
+ Default: 'zeros'.
+ align_corners (bool): Whether align corners. Default: True.
+
+ Returns:
+ Tensor: Warped image or feature map.
+ """
+ x_h, x_w = x.shape[-2:]
+ flow_h, flow_w = flow.shape[1:3]
+ if x_h != flow_h or x_w != flow_w:
+ raise ValueError(f'The spatial sizes of input ({x.shape[-2:]}) and '
+ f'flow ({flow.shape[1:3]}) are not the same.')
+ _, _, h, w = x.shape
+ # create mesh grid
+ grid_y, grid_x = paddle.meshgrid(paddle.arange(0, h), paddle.arange(0, w))
+ grid = paddle.stack((grid_x, grid_y), axis=2) # (w, h, 2)
+ grid = paddle.cast(grid, 'float32')
+ grid.stop_gradient = True
+
+ grid_flow = grid + flow
+ # scale grid_flow to [-1,1]
+ grid_flow_x = 2.0 * grid_flow[:, :, :, 0] / max(w - 1, 1) - 1.0
+ grid_flow_y = 2.0 * grid_flow[:, :, :, 1] / max(h - 1, 1) - 1.0
+ grid_flow = paddle.stack((grid_flow_x, grid_flow_y), axis=3)
+ output = F.grid_sample(x,
+ grid_flow,
+ mode=interpolation,
+ padding_mode=padding_mode,
+ align_corners=align_corners)
+ return output
+
+
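flow_warp() adds the flow to an integer mesh grid and rescales it to the [-1, 1] coordinate convention of F.grid_sample. A tiny worked example of that rescaling for a single row (width 5 is an arbitrary choice):

w = 5
x, flow_x = 2, 1.0                                   # pixel 2 displaced one pixel right
grid_flow_x = 2.0 * (x + flow_x) / max(w - 1, 1) - 1.0
print(grid_flow_x)                                    # 0.5; pixel 4 would map to +1.0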
+class SPyNetBasicModule(nn.Layer):
+ """Basic Module for SPyNet.
+
+ Paper:
+ Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017
+ """
+ def __init__(self):
+ super().__init__()
+
+ self.conv1 = nn.Conv2D(in_channels=8,
+ out_channels=32,
+ kernel_size=7,
+ stride=1,
+ padding=3)
+ self.conv2 = nn.Conv2D(in_channels=32,
+ out_channels=64,
+ kernel_size=7,
+ stride=1,
+ padding=3)
+ self.conv3 = nn.Conv2D(in_channels=64,
+ out_channels=32,
+ kernel_size=7,
+ stride=1,
+ padding=3)
+ self.conv4 = nn.Conv2D(in_channels=32,
+ out_channels=16,
+ kernel_size=7,
+ stride=1,
+ padding=3)
+ self.conv5 = nn.Conv2D(in_channels=16,
+ out_channels=2,
+ kernel_size=7,
+ stride=1,
+ padding=3)
+ self.relu = nn.ReLU()
+
+ def forward(self, tensor_input):
+ """
+ Args:
+ tensor_input (Tensor): Input tensor with shape (b, 8, h, w).
+ 8 channels contain:
+ [reference image (3), neighbor image (3), initial flow (2)].
+
+ Returns:
+ Tensor: Refined flow with shape (b, 2, h, w)
+ """
+ out = self.relu(self.conv1(tensor_input))
+ out = self.relu(self.conv2(out))
+ out = self.relu(self.conv3(out))
+ out = self.relu(self.conv4(out))
+ out = self.conv5(out)
+ return out
+
+
+class SPyNet(nn.Layer):
+ """SPyNet network structure.
+
+    The differences from the SPyNet in the paper are that
+        1. more SPyNetBasicModules are used in this version, and
+ 2. no batch normalization is used in this version.
+
+ Paper:
+ Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017
+
+ """
+ def __init__(self):
+ super().__init__()
+
+ self.basic_module0 = SPyNetBasicModule()
+ self.basic_module1 = SPyNetBasicModule()
+ self.basic_module2 = SPyNetBasicModule()
+ self.basic_module3 = SPyNetBasicModule()
+ self.basic_module4 = SPyNetBasicModule()
+ self.basic_module5 = SPyNetBasicModule()
+
+ self.register_buffer(
+ 'mean',
+ paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1]))
+ self.register_buffer(
+ 'std',
+ paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1]))
+
+ def compute_flow(self, ref, supp):
+ """Compute flow from ref to supp.
+
+ Note that in this function, the images are already resized to a
+ multiple of 32.
+
+ Args:
+ ref (Tensor): Reference image with shape of (n, 3, h, w).
+ supp (Tensor): Supporting image with shape of (n, 3, h, w).
+
+ Returns:
+ Tensor: Estimated optical flow: (n, 2, h, w).
+ """
+
+ n, _, h, w = ref.shape
+
+ # normalize the input images
+ ref = [(ref - self.mean) / self.std]
+ supp = [(supp - self.mean) / self.std]
+
+ # generate downsampled frames
+ for level in range(5):
+ ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2))
+ supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2))
+ ref = ref[::-1]
+ supp = supp[::-1]
+
+ # flow computation
+ flow = paddle.zeros([n, 2, h // 32, w // 32])
+
+ # level=0
+ flow_up = flow
+ flow = flow_up + self.basic_module0(
+ paddle.concat([
+ ref[0],
+ flow_warp(supp[0],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ], 1))
+
+ # level=1
+ flow_up = F.interpolate(
+ flow, scale_factor=2, mode='bilinear', align_corners=True) * 2.0
+ flow = flow_up + self.basic_module1(
+ paddle.concat([
+ ref[1],
+ flow_warp(supp[1],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ], 1))
+
+ # level=2
+ flow_up = F.interpolate(
+ flow, scale_factor=2, mode='bilinear', align_corners=True) * 2.0
+ flow = flow_up + self.basic_module2(
+ paddle.concat([
+ ref[2],
+ flow_warp(supp[2],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ], 1))
+
+ # level=3
+ flow_up = F.interpolate(
+ flow, scale_factor=2, mode='bilinear', align_corners=True) * 2.0
+ flow = flow_up + self.basic_module3(
+ paddle.concat([
+ ref[3],
+ flow_warp(supp[3],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ], 1))
+
+ # level=4
+ flow_up = F.interpolate(
+ flow, scale_factor=2, mode='bilinear', align_corners=True) * 2.0
+ flow = flow_up + self.basic_module4(
+ paddle.concat([
+ ref[4],
+ flow_warp(supp[4],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ], 1))
+
+ # level=5
+ flow_up = F.interpolate(
+ flow, scale_factor=2, mode='bilinear', align_corners=True) * 2.0
+ flow = flow_up + self.basic_module5(
+ paddle.concat([
+ ref[5],
+ flow_warp(supp[5],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ], 1))
+
+ return flow
+
+ def forward(self, ref, supp):
+ """Forward function of SPyNet.
+
+ This function computes the optical flow from ref to supp.
+
+ Args:
+ ref (Tensor): Reference image with shape of (n, 3, h, w).
+ supp (Tensor): Supporting image with shape of (n, 3, h, w).
+
+ Returns:
+ Tensor: Estimated optical flow: (n, 2, h, w).
+ """
+
+ # upsize to a multiple of 32
+ h, w = ref.shape[2:4]
+ w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1)
+ h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1)
+ ref = F.interpolate(ref,
+ size=(h_up, w_up),
+ mode='bilinear',
+ align_corners=False)
+ supp = F.interpolate(supp,
+ size=(h_up, w_up),
+ mode='bilinear',
+ align_corners=False)
+ ref.stop_gradient = False
+ supp.stop_gradient = False
+ # compute flow, and resize back to the original resolution
+ flow_up = self.compute_flow(ref, supp)
+ flow = F.interpolate(flow_up,
+ size=(h, w),
+ mode='bilinear',
+ align_corners=False)
+
+        # adjust the flow values back to the original resolution.
+        # Note: in-place slice assignment such as
+        #     flow[:, 0, :, :] *= (float(w) / float(w_up))
+        # currently breaks gradient computation in Paddle, so the two flow
+        # channels are scaled separately and concatenated instead.
+
+ flow_x = flow[:, 0:1, :, :] * (float(w) / float(w_up))
+ flow_y = flow[:, 1:2, :, :] * (float(h) / float(h_up))
+ flow = paddle.concat([flow_x, flow_y], 1)
+
+ return flow
+
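+
+# Illustrative sketch (not part of the original file): estimating flow between
+# two toy frames with SPyNet. The shapes are arbitrary; forward() internally
+# resizes the inputs to a multiple of 32 and rescales the flow back, so h and w
+# do not need to be multiples of 32 themselves. The module below is randomly
+# initialized (BasicVSRNet loads pretrained SPyNet weights separately).
+def _spynet_example():
+    spynet = SPyNet()
+    ref = paddle.rand([1, 3, 80, 96])    # reference frame (n, 3, h, w)
+    supp = paddle.rand([1, 3, 80, 96])   # supporting frame (n, 3, h, w)
+    flow = spynet(ref, supp)             # estimated flow from ref to supp
+    return flow.shape                    # [1, 2, 80, 96]
+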
+
+class ResidualBlocksWithInputConv(nn.Layer):
+ """Residual blocks with a convolution in front.
+
+ Args:
+ in_channels (int): Number of input channels of the first conv.
+ out_channels (int): Number of channels of the residual blocks.
+ Default: 64.
+ num_blocks (int): Number of residual blocks. Default: 30.
+ """
+ def __init__(self, in_channels, out_channels=64, num_blocks=30):
+ super().__init__()
+
+ # a convolution used to match the channels of the residual blocks
+ self.covn1 = nn.Conv2D(in_channels, out_channels, 3, 1, 1)
+ self.Leaky_relu = nn.LeakyReLU(negative_slope=0.1)
+
+ # residual blocks
+ self.ResidualBlocks = MakeMultiBlocks(ResidualBlockNoBN,
+ num_blocks,
+ nf=out_channels)
+
+ def forward(self, feat):
+ """
+ Forward function for ResidualBlocksWithInputConv.
+
+ Args:
+ feat (Tensor): Input feature with shape (n, in_channels, h, w)
+
+ Returns:
+ Tensor: Output feature with shape (n, out_channels, h, w)
+ """
+ out = self.Leaky_relu(self.covn1(feat))
+ out = self.ResidualBlocks(out)
+ return out
+
+
+@GENERATORS.register()
+class BasicVSRNet(nn.Layer):
+ """BasicVSR network structure for video super-resolution.
+
+    Supports only x4 upsampling.
+ Paper:
+ BasicVSR: The Search for Essential Components in Video Super-Resolution
+ and Beyond, CVPR, 2021
+
+ Args:
+ mid_channels (int): Channel number of the intermediate features.
+ Default: 64.
+ num_blocks (int): Number of residual blocks in each propagation branch.
+ Default: 30.
+ """
+ def __init__(self, mid_channels=64, num_blocks=30):
+
+ super().__init__()
+
+ self.mid_channels = mid_channels
+
+ # optical flow network for feature alignment
+ self.spynet = SPyNet()
+ weight_path = get_path_from_url(
+ 'https://paddlegan.bj.bcebos.com/models/spynet.pdparams')
+ self.spynet.set_state_dict(paddle.load(weight_path))
+
+ # propagation branches
+ self.backward_resblocks = ResidualBlocksWithInputConv(
+ mid_channels + 3, mid_channels, num_blocks)
+ self.forward_resblocks = ResidualBlocksWithInputConv(
+ mid_channels + 3, mid_channels, num_blocks)
+
+ # upsample
+ self.fusion = nn.Conv2D(mid_channels * 2, mid_channels, 1, 1, 0)
+ self.upsample1 = PixelShufflePack(mid_channels,
+ mid_channels,
+ 2,
+ upsample_kernel=3)
+ self.upsample2 = PixelShufflePack(mid_channels,
+ 64,
+ 2,
+ upsample_kernel=3)
+ self.conv_hr = nn.Conv2D(64, 64, 3, 1, 1)
+ self.conv_last = nn.Conv2D(64, 3, 3, 1, 1)
+ self.img_upsample = nn.Upsample(scale_factor=4,
+ mode='bilinear',
+ align_corners=False)
+
+ # activation function
+ self.lrelu = nn.LeakyReLU(negative_slope=0.1)
+
+ def check_if_mirror_extended(self, lrs):
+ """Check whether the input is a mirror-extended sequence.
+
+ If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the
+ (t-1-i)-th frame.
+
+ Args:
+ lrs (tensor): Input LR images with shape (n, t, c, h, w)
+ """
+
+ self.is_mirror_extended = False
+ if lrs.shape[1] % 2 == 0:
+ lrs_1, lrs_2 = paddle.chunk(lrs, 2, axis=1)
+ lrs_2 = paddle.flip(lrs_2, [1])
+ if paddle.norm(lrs_1 - lrs_2) == 0:
+ self.is_mirror_extended = True
+
+ def compute_flow(self, lrs):
+ """Compute optical flow using SPyNet for feature warping.
+
+        Note that if the input is a mirror-extended sequence, 'flows_forward'
+ is not needed, since it is equal to 'flows_backward.flip(1)'.
+
+ Args:
+ lrs (tensor): Input LR images with shape (n, t, c, h, w)
+
+ Return:
+ tuple(Tensor): Optical flow. 'flows_forward' corresponds to the
+ flows used for forward-time propagation (current to previous).
+ 'flows_backward' corresponds to the flows used for
+ backward-time propagation (current to next).
+ """
+
+ n, t, c, h, w = lrs.shape
+
+ lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w])
+ lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w])
+
+ flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w])
+
+ if self.is_mirror_extended: # flows_forward = flows_backward.flip(1)
+ flows_forward = None
+ else:
+ flows_forward = self.spynet(lrs_2,
+ lrs_1).reshape([n, t - 1, 2, h, w])
+
+ return flows_forward, flows_backward
+
+ def compute_flow_export(self, lrs):
+ """export version of compute_flow
+ """
+
+ n, t, c, h, w = lrs.shape
+
+ lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w])
+ lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w])
+
+ flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w])
+
+ flows_forward = self.spynet(lrs_2,
+ lrs_1).reshape([n, t - 1, 2, h, w])
+
+ return flows_forward, flows_backward
+
+ def forward(self, lrs):
+ """Forward function for BasicVSR.
+
+ Args:
+ lrs (Tensor): Input LR sequence with shape (n, t, c, h, w).
+
+ Returns:
+ Tensor: Output HR sequence with shape (n, t, c, 4h, 4w).
+ """
+
+ n, t, c, h, w = lrs.shape
+ t = paddle.to_tensor(t)
+ assert h >= 64 and w >= 64, (
+ 'The height and width of inputs should be at least 64, '
+ f'but got {h} and {w}.')
+
+ # check whether the input is an extended sequence
+ self.check_if_mirror_extended(lrs)
+
+ # compute optical flow
+ if hasattr(self, 'export_mode') and self.export_mode is True:
+ flows_forward, flows_backward = self.compute_flow_export(lrs)
+ else:
+ flows_forward, flows_backward = self.compute_flow(lrs)
+
+        # backward-time propagation
+ outputs = []
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+ for i in range(t - 1, -1, -1):
+ if i < t - 1: # no warping required for the last timestep
+ flow1 = flows_backward[:, i, :, :, :]
+ feat_prop = flow_warp(feat_prop, flow1.transpose([0, 2, 3, 1]))
+
+ feat_prop = paddle.concat([lrs[:, i, :, :, :], feat_prop], axis=1)
+ feat_prop = self.backward_resblocks(feat_prop)
+
+ outputs.append(feat_prop)
+ outputs = outputs[::-1]
+
+ # forward-time propagation and upsampling
+ feat_prop = paddle.zeros_like(feat_prop)
+ for i in range(0, t):
+ lr_curr = lrs[:, i, :, :, :]
+ if i > 0: # no warping required for the first timestep
+ if flows_forward is not None:
+ flow = flows_forward[:, i - 1, :, :, :]
+ else:
+ flow = flows_backward[:, -i, :, :, :]
+ feat_prop = flow_warp(feat_prop, flow.transpose([0, 2, 3, 1]))
+
+ feat_prop = paddle.concat([lr_curr, feat_prop], axis=1)
+ feat_prop = self.forward_resblocks(feat_prop)
+
+ # upsampling given the backward and forward features
+ out = paddle.concat([outputs[i], feat_prop], axis=1)
+ out = self.lrelu(self.fusion(out))
+ out = self.lrelu(self.upsample1(out))
+ out = self.lrelu(self.upsample2(out))
+ out = self.lrelu(self.conv_hr(out))
+ out = self.conv_last(out)
+ base = self.img_upsample(lr_curr)
+ out += base
+ outputs[i] = out
+
+ return paddle.stack(outputs, axis=1)
+
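+
+# Illustrative sketch (not part of the original file): running BasicVSRNet on a
+# tiny dummy sequence. Note that constructing the model downloads the pretrained
+# SPyNet weights, and the input height and width must be at least 64.
+def _basicvsr_example():
+    model = BasicVSRNet(mid_channels=64, num_blocks=30)
+    lrs = paddle.rand([1, 2, 3, 64, 64])   # (n, t, c, h, w) LR sequence
+    with paddle.no_grad():
+        sr = model(lrs)                    # (n, t, 3, 4h, 4w)
+    return sr.shape                        # [1, 2, 3, 256, 256]
+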
+
+class SecondOrderDeformableAlignment(nn.Layer):
+ """Second-order deformable alignment module.
+
+ Args:
+        in_channels (int): Same as nn.Conv2D.
+        out_channels (int): Same as nn.Conv2D.
+        kernel_size (int or tuple[int]): Same as nn.Conv2D.
+        stride (int or tuple[int]): Same as nn.Conv2D.
+        padding (int or tuple[int]): Same as nn.Conv2D.
+        dilation (int or tuple[int]): Same as nn.Conv2D.
+        groups (int): Same as nn.Conv2D.
+        deformable_groups (int): Number of deformable groups. Default: 16.
+ """
+ def __init__(self,
+ in_channels=128,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ dilation=1,
+ groups=1,
+ deformable_groups=16):
+ super(SecondOrderDeformableAlignment, self).__init__()
+
+ self.conv_offset = nn.Sequential(
+ nn.Conv2D(3 * out_channels + 4, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1),
+ )
+ self.dcn = DeformConv2D(in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ deformable_groups=deformable_groups)
+ self.init_offset()
+
+ def init_offset(self):
+ constant_(self.conv_offset[-1].weight, 0)
+ constant_(self.conv_offset[-1].bias, 0)
+
+ def forward(self, x, extra_feat, flow_1, flow_2):
+ extra_feat = paddle.concat([extra_feat, flow_1, flow_2], axis=1)
+ out = self.conv_offset(extra_feat)
+ o1, o2, mask = paddle.chunk(out, 3, axis=1)
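+
+        # Note: for the 3x3 deformable kernel, conv_offset predicts
+        # 27 * deformable_groups channels, chunked into three equal parts of
+        # 9 * deformable_groups each. o1 and o2 together hold the offset
+        # residuals (split below into offset_1 / offset_2, one per flow), and
+        # mask is the modulation term, passed through a sigmoid before the
+        # deformable convolution.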
+
+ # offset
+ offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1))
+ offset_1, offset_2 = paddle.chunk(offset, 2, axis=1)
+ offset_1 = offset_1 + flow_1.flip(1).tile(
+ [1, offset_1.shape[1] // 2, 1, 1])
+ offset_2 = offset_2 + flow_2.flip(1).tile(
+ [1, offset_2.shape[1] // 2, 1, 1])
+ offset = paddle.concat([offset_1, offset_2], axis=1)
+
+ # mask
+ mask = F.sigmoid(mask)
+
+ out = self.dcn(x, offset, mask)
+ return out
diff --git a/ppgan/models/generators/basicvsr_plus_plus.py b/ppgan/models/generators/basicvsr_plus_plus.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0783258a7355ae1322a6b039e54d37545a323fa
--- /dev/null
+++ b/ppgan/models/generators/basicvsr_plus_plus.py
@@ -0,0 +1,439 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ...utils.download import get_path_from_url
+from .basicvsr import PixelShufflePack, flow_warp, SPyNet, \
+ ResidualBlocksWithInputConv, SecondOrderDeformableAlignment
+from .builder import GENERATORS
+
+
+@GENERATORS.register()
+class BasicVSRPlusPlus(nn.Layer):
+ """BasicVSR++ network structure.
+    Supports either x4 upsampling or same-size output. Since DCN is used in this
+ model, it can only be used with CUDA enabled.
+ Paper:
+ BasicVSR++: Improving Video Super-Resolution with Enhanced Propagation
+ and Alignment
+
+ Adapted from 'https://github.com/open-mmlab/mmediting'
+ 'mmediting/blob/master/mmedit/models/backbones/sr_backbones/basicvsr_pp.py'
+ Copyright (c) MMEditing Authors.
+
+ Args:
+ mid_channels (int, optional): Channel number of the intermediate
+ features. Default: 64.
+ num_blocks (int, optional): The number of residual blocks in each
+ propagation branch. Default: 7.
+ is_low_res_input (bool, optional): Whether the input is low-resolution
+ or not. If False, the output resolution is equal to the input
+ resolution. Default: True.
+ """
+ def __init__(self, mid_channels=64, num_blocks=7, is_low_res_input=True):
+
+ super().__init__()
+
+ self.mid_channels = mid_channels
+ self.is_low_res_input = is_low_res_input
+
+ # optical flow
+ self.spynet = SPyNet()
+ weight_path = get_path_from_url(
+ 'https://paddlegan.bj.bcebos.com/models/spynet.pdparams')
+ self.spynet.set_state_dict(paddle.load(weight_path))
+
+ # feature extraction module
+ if is_low_res_input:
+ self.feat_extract = ResidualBlocksWithInputConv(3, mid_channels, 5)
+ else:
+ self.feat_extract = nn.Sequential(
+ nn.Conv2D(3, mid_channels, 3, 2, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(mid_channels, mid_channels, 3, 2, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ ResidualBlocksWithInputConv(mid_channels, mid_channels, 5))
+
+ # propagation branches
+ self.deform_align_backward_1 = SecondOrderDeformableAlignment(
+ 2 * mid_channels, mid_channels, 3, padding=1, deformable_groups=16)
+ self.deform_align_forward_1 = SecondOrderDeformableAlignment(
+ 2 * mid_channels, mid_channels, 3, padding=1, deformable_groups=16)
+ self.deform_align_backward_2 = SecondOrderDeformableAlignment(
+ 2 * mid_channels, mid_channels, 3, padding=1, deformable_groups=16)
+ self.deform_align_forward_2 = SecondOrderDeformableAlignment(
+ 2 * mid_channels, mid_channels, 3, padding=1, deformable_groups=16)
+ self.backbone_backward_1 = ResidualBlocksWithInputConv(
+ 2 * mid_channels, mid_channels, num_blocks)
+ self.backbone_forward_1 = ResidualBlocksWithInputConv(
+ 3 * mid_channels, mid_channels, num_blocks)
+ self.backbone_backward_2 = ResidualBlocksWithInputConv(
+ 4 * mid_channels, mid_channels, num_blocks)
+ self.backbone_forward_2 = ResidualBlocksWithInputConv(
+ 5 * mid_channels, mid_channels, num_blocks)
+
+ # upsampling module
+ self.reconstruction = ResidualBlocksWithInputConv(
+ 5 * mid_channels, mid_channels, 5)
+ self.upsample1 = PixelShufflePack(mid_channels,
+ mid_channels,
+ 2,
+ upsample_kernel=3)
+ self.upsample2 = PixelShufflePack(mid_channels,
+ 64,
+ 2,
+ upsample_kernel=3)
+ self.conv_hr = nn.Conv2D(64, 64, 3, 1, 1)
+ self.conv_last = nn.Conv2D(64, 3, 3, 1, 1)
+ self.img_upsample = nn.Upsample(scale_factor=4,
+ mode='bilinear',
+ align_corners=False)
+
+ # activation function
+ self.lrelu = nn.LeakyReLU(negative_slope=0.1)
+
+ def check_if_mirror_extended(self, lrs):
+ """Check whether the input is a mirror-extended sequence.
+ If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the
+ (t-1-i)-th frame.
+ Args:
+            lrs (tensor): Input LR images with shape (n, t, c, h, w)
+ """
+
+ with paddle.no_grad():
+ self.is_mirror_extended = False
+ if lrs.shape[1] % 2 == 0:
+ lrs_1, lrs_2 = paddle.chunk(lrs, 2, axis=1)
+ lrs_2 = paddle.flip(lrs_2, [1])
+ if paddle.norm(lrs_1 - lrs_2) == 0:
+ self.is_mirror_extended = True
+
+ def compute_flow(self, lrs):
+ """Compute optical flow using SPyNet for feature alignment.
+        Note that if the input is a mirror-extended sequence, 'flows_forward'
+ is not needed, since it is equal to 'flows_backward.flip(1)'.
+ Args:
+            lrs (tensor): Input LR images with shape (n, t, c, h, w)
+ Return:
+ tuple(Tensor): Optical flow. 'flows_forward' corresponds to the
+ flows used for forward-time propagation (current to previous).
+ 'flows_backward' corresponds to the flows used for
+ backward-time propagation (current to next).
+ """
+
+ n, t, c, h, w = lrs.shape
+
+ lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w])
+ lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w])
+
+ flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w])
+
+ if self.is_mirror_extended:
+ flows_forward = flows_backward.flip(1)
+ else:
+ flows_forward = self.spynet(lrs_2,
+ lrs_1).reshape([n, t - 1, 2, h, w])
+
+ return flows_forward, flows_backward
+
+ def upsample(self, lqs, feats):
+ """Compute the output image given the features.
+ Args:
+ lqs (tensor): Input LR images with shape (n, t, c, h, w).
+            feats (dict): The features from the propagation branches.
+ Returns:
+ Tensor: Output HR sequence with shape (n, t, c, 4h, 4w).
+ """
+
+ outputs = []
+ num_outputs = len(feats['spatial'])
+
+ mapping_idx = list(range(0, num_outputs))
+ mapping_idx += mapping_idx[::-1]
+
+ for i in range(0, lqs.shape[1]):
+ hr = [feats[k].pop(0) for k in feats if k != 'spatial']
+ hr.insert(0, feats['spatial'][mapping_idx[i]])
+ hr = paddle.concat(hr, axis=1)
+
+ hr = self.reconstruction(hr)
+ hr = self.lrelu(self.upsample1(hr))
+ hr = self.lrelu(self.upsample2(hr))
+ hr = self.lrelu(self.conv_hr(hr))
+ hr = self.conv_last(hr)
+ if self.is_low_res_input:
+ hr += self.img_upsample(lqs[:, i, :, :, :])
+ else:
+ hr += lqs[:, i, :, :, :]
+
+ outputs.append(hr)
+
+ return paddle.stack(outputs, axis=1)
+
+ def forward(self, lqs):
+ """Forward function for BasicVSR++.
+ Args:
+ lqs (Tensor): Input LR sequence with shape (n, t, c, h, w).
+ Returns:
+ Tensor: Output HR sequence with shape (n, t, c, 4h, 4w).
+ """
+
+ n, t, c, h, w = lqs.shape
+
+ if self.is_low_res_input:
+ lqs_downsample = lqs
+ else:
+ lqs_downsample = F.interpolate(lqs.reshape([-1, c, h, w]),
+ scale_factor=0.25,
+ mode='bicubic').reshape(
+ [n, t, c, h // 4, w // 4])
+
+ # check whether the input is an extended sequence
+ self.check_if_mirror_extended(lqs)
+
+ feats = {}
+ feats_ = self.feat_extract(lqs.reshape([-1, c, h, w]))
+ h, w = feats_.shape[2:]
+ feats_ = feats_.reshape([n, t, -1, h, w])
+ feats['spatial'] = [feats_[:, i, :, :, :] for i in range(0, t)]
+
+ # compute optical flow using the low-res inputs
+ assert lqs_downsample.shape[3] >= 64 and lqs_downsample.shape[4] >= 64, (
+ 'The height and width of low-res inputs must be at least 64, '
+ f'but got {h} and {w}.')
+ flows_forward, flows_backward = self.compute_flow(lqs_downsample)
+
+        # feature propagation
+
+ # backward_1
+ feats['backward_1'] = []
+ flows = flows_backward
+
+ n, t, _, h, w = flows.shape
+
+ frame_idx = range(t, -1, -1)
+ flow_idx = range(t, -1, -1)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+
+ if i > 0:
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+ cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1]))
+
+ # initialize second-order features
+ feat_n2 = paddle.zeros_like(feat_prop)
+ flow_n2 = paddle.zeros_like(flow_n1)
+ cond_n2 = paddle.zeros_like(cond_n1)
+
+ if i > 1: # second-order features
+ feat_n2 = feats['backward_1'][-2]
+
+ flow_n2 = flows[:, flow_idx[i - 1], :, :, :]
+
+ flow_n2 = flow_n1 + flow_warp(
+ flow_n2, flow_n1.transpose([0, 2, 3, 1]))
+
+ cond_n2 = flow_warp(feat_n2, flow_n2.transpose([0, 2, 3,
+ 1]))
+
+ # flow-guided deformable convolution
+ cond = paddle.concat([cond_n1, feat_current, cond_n2], axis=1)
+ feat_prop = paddle.concat([feat_prop, feat_n2], axis=1)
+
+ feat_prop = self.deform_align_backward_1(
+ feat_prop, cond, flow_n1, flow_n2)
+
+ # concatenate and residual blocks
+ feat = [feat_current] + [
+ feats[k][idx]
+ for k in feats if k not in ['spatial', 'backward_1']
+ ] + [feat_prop]
+
+ feat = paddle.concat(feat, axis=1)
+ feat_prop = feat_prop + self.backbone_backward_1(feat)
+ feats['backward_1'].append(feat_prop)
+
+ feats['backward_1'] = feats['backward_1'][::-1]
+
+ # forward_1
+ feats['forward_1'] = []
+ flows = flows_forward
+
+ n, t, _, h, w = flows.shape
+
+ frame_idx = range(0, t + 1)
+ flow_idx = range(-1, t)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+
+ if i > 0:
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+ cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1]))
+
+ # initialize second-order features
+ feat_n2 = paddle.zeros_like(feat_prop)
+ flow_n2 = paddle.zeros_like(flow_n1)
+ cond_n2 = paddle.zeros_like(cond_n1)
+
+ if i > 1: # second-order features
+ feat_n2 = feats['forward_1'][-2]
+
+ flow_n2 = flows[:, flow_idx[i - 1], :, :, :]
+
+ flow_n2 = flow_n1 + flow_warp(
+ flow_n2, flow_n1.transpose([0, 2, 3, 1]))
+
+ cond_n2 = flow_warp(feat_n2, flow_n2.transpose([0, 2, 3,
+ 1]))
+
+ # flow-guided deformable convolution
+ cond = paddle.concat([cond_n1, feat_current, cond_n2], axis=1)
+ feat_prop = paddle.concat([feat_prop, feat_n2], axis=1)
+
+ feat_prop = self.deform_align_forward_1(feat_prop, cond,
+ flow_n1, flow_n2)
+
+ # concatenate and residual blocks
+ feat = [feat_current] + [
+ feats[k][idx]
+ for k in feats if k not in ['spatial', 'forward_1']
+ ] + [feat_prop]
+
+ feat = paddle.concat(feat, axis=1)
+ feat_prop = feat_prop + self.backbone_forward_1(feat)
+ feats['forward_1'].append(feat_prop)
+
+ # backward_2
+ feats['backward_2'] = []
+ flows = flows_backward
+
+ n, t, _, h, w = flows.shape
+
+ frame_idx = range(t, -1, -1)
+ flow_idx = range(t, -1, -1)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+
+ if i > 0:
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+ cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1]))
+
+ # initialize second-order features
+ feat_n2 = paddle.zeros_like(feat_prop)
+ flow_n2 = paddle.zeros_like(flow_n1)
+ cond_n2 = paddle.zeros_like(cond_n1)
+
+ if i > 1: # second-order features
+ feat_n2 = feats['backward_2'][-2]
+
+ flow_n2 = flows[:, flow_idx[i - 1], :, :, :]
+
+ flow_n2 = flow_n1 + flow_warp(
+ flow_n2, flow_n1.transpose([0, 2, 3, 1]))
+
+ cond_n2 = flow_warp(feat_n2, flow_n2.transpose([0, 2, 3,
+ 1]))
+
+ # flow-guided deformable convolution
+ cond = paddle.concat([cond_n1, feat_current, cond_n2], axis=1)
+ feat_prop = paddle.concat([feat_prop, feat_n2], axis=1)
+
+ feat_prop = self.deform_align_backward_2(
+ feat_prop, cond, flow_n1, flow_n2)
+
+ # concatenate and residual blocks
+ feat = [feat_current] + [
+ feats[k][idx]
+ for k in feats if k not in ['spatial', 'backward_2']
+ ] + [feat_prop]
+
+ feat = paddle.concat(feat, axis=1)
+ feat_prop = feat_prop + self.backbone_backward_2(feat)
+ feats['backward_2'].append(feat_prop)
+
+ feats['backward_2'] = feats['backward_2'][::-1]
+
+ # forward_2
+ feats['forward_2'] = []
+ flows = flows_forward
+
+ n, t, _, h, w = flows.shape
+
+ frame_idx = range(0, t + 1)
+ flow_idx = range(-1, t)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+
+ if i > 0:
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+ cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1]))
+
+ # initialize second-order features
+ feat_n2 = paddle.zeros_like(feat_prop)
+ flow_n2 = paddle.zeros_like(flow_n1)
+ cond_n2 = paddle.zeros_like(cond_n1)
+
+ if i > 1: # second-order features
+ feat_n2 = feats['forward_2'][-2]
+
+ flow_n2 = flows[:, flow_idx[i - 1], :, :, :]
+
+ flow_n2 = flow_n1 + flow_warp(
+ flow_n2, flow_n1.transpose([0, 2, 3, 1]))
+
+ cond_n2 = flow_warp(feat_n2, flow_n2.transpose([0, 2, 3,
+ 1]))
+
+ # flow-guided deformable convolution
+ cond = paddle.concat([cond_n1, feat_current, cond_n2], axis=1)
+ feat_prop = paddle.concat([feat_prop, feat_n2], axis=1)
+
+ feat_prop = self.deform_align_forward_2(feat_prop, cond,
+ flow_n1, flow_n2)
+
+ # concatenate and residual blocks
+ feat = [feat_current] + [
+ feats[k][idx]
+ for k in feats if k not in ['spatial', 'forward_2']
+ ] + [feat_prop]
+
+ feat = paddle.concat(feat, axis=1)
+ feat_prop = feat_prop + self.backbone_forward_2(feat)
+ feats['forward_2'].append(feat_prop)
+
+ return self.upsample(lqs, feats)
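+
+
+# Illustrative sketch (not part of the original file): running BasicVSR++ on a
+# tiny dummy clip. Constructing the model downloads the pretrained SPyNet
+# weights, and because SecondOrderDeformableAlignment relies on DeformConv2D,
+# this is expected to run only with a CUDA-enabled build (see the class
+# docstring above). Shapes below are arbitrary toy values.
+def _basicvsr_plusplus_example():
+    model = BasicVSRPlusPlus(mid_channels=64, num_blocks=7, is_low_res_input=True)
+    lqs = paddle.rand([1, 2, 3, 64, 64])   # (n, t, c, h, w) LR sequence
+    with paddle.no_grad():
+        out = model(lqs)                   # (n, t, 3, 4h, 4w) for low-res input
+    return out.shape                       # [1, 2, 3, 256, 256]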
diff --git a/ppgan/models/generators/dcgenerator.py b/ppgan/models/generators/dcgenerator.py
index 5bbdbb33e84358583d0d30f3596c0880e2a52711..abbc633b9cbbad8d85971eb4606dd7c2889bac34 100644
--- a/ppgan/models/generators/dcgenerator.py
+++ b/ppgan/models/generators/dcgenerator.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/aidotse/Team-Haste
+# MIT License
+# Copyright (c) 2020 AI Sweden
+
import paddle
import paddle.nn as nn
import functools
@@ -26,7 +30,6 @@ from .builder import GENERATORS
class DCGenerator(nn.Layer):
"""Resnet-based generator that consists of Resnet blocks between a few downsampling/upsampling operations.
- code and idea from Justin Johnson's neural style transfer project(https://github.com/jcjohnson/fast-neural-style)
"""
def __init__(self,
input_nz,
@@ -38,12 +41,12 @@ class DCGenerator(nn.Layer):
"""Construct a DCGenerator generator
Args:
- input_nz (int) -- the number of dimension in input noise
- input_nc (int) -- the number of channels in input images
- output_nc (int) -- the number of channels in output images
- ngf (int) -- the number of filters in the last conv layer
- norm_layer -- normalization layer
- padding_type (str) -- the name of padding layer in conv layers: reflect | replicate | zero
+ input_nz (int): the number of dimension in input noise
+ input_nc (int): the number of channels in input images
+ output_nc (int): the number of channels in output images
+ ngf (int): the number of filters in the last conv layer
+ norm_layer: normalization layer
+ padding_type (str): the name of padding layer in conv layers: reflect | replicate | zero
"""
super(DCGenerator, self).__init__()
@@ -59,65 +62,66 @@ class DCGenerator(nn.Layer):
if norm_type == 'batch':
model = [
nn.Conv2DTranspose(input_nz,
- ngf * mult,
- kernel_size=4,
- stride=1,
- padding=0,
- bias_attr=use_bias),
+ ngf * mult,
+ kernel_size=4,
+ stride=1,
+ padding=0,
+ bias_attr=use_bias),
BatchNorm2D(ngf * mult),
nn.ReLU()
]
else:
model = [
nn.Conv2DTranspose(input_nz,
- ngf * mult,
- kernel_size=4,
- stride=1,
- padding=0,
- bias_attr=use_bias),
+ ngf * mult,
+ kernel_size=4,
+ stride=1,
+ padding=0,
+ bias_attr=use_bias),
norm_layer(ngf * mult),
nn.ReLU()
]
- for i in range(1,n_downsampling): # add upsampling layers
+ # add upsampling layers
+ for i in range(1, n_downsampling):
mult = 2**(n_downsampling - i)
- output_size = 2**(i+2)
+ output_size = 2**(i + 2)
if norm_type == 'batch':
model += [
- nn.Conv2DTranspose(ngf * mult,
- ngf * mult//2,
- kernel_size=4,
- stride=2,
- padding=1,
- bias_attr=use_bias),
- BatchNorm2D(ngf * mult//2),
- nn.ReLU()
- ]
+ nn.Conv2DTranspose(ngf * mult,
+ ngf * mult // 2,
+ kernel_size=4,
+ stride=2,
+ padding=1,
+ bias_attr=use_bias),
+ BatchNorm2D(ngf * mult // 2),
+ nn.ReLU()
+ ]
else:
model += [
nn.Conv2DTranspose(ngf * mult,
- int(ngf * mult//2),
- kernel_size=4,
- stride=2,
- padding=1,
- bias_attr=use_bias),
+ int(ngf * mult // 2),
+ kernel_size=4,
+ stride=2,
+ padding=1,
+ bias_attr=use_bias),
norm_layer(int(ngf * mult // 2)),
nn.ReLU()
]
output_size = 2**(6)
model += [
- nn.Conv2DTranspose(ngf ,
- output_nc,
- kernel_size=4,
- stride=2,
- padding=1,
- bias_attr=use_bias),
- nn.Tanh()
- ]
+ nn.Conv2DTranspose(ngf,
+ output_nc,
+ kernel_size=4,
+ stride=2,
+ padding=1,
+ bias_attr=use_bias),
+ nn.Tanh()
+ ]
self.model = nn.Sequential(*model)
def forward(self, x):
"""Standard forward"""
- return self.model(x)
\ No newline at end of file
+ return self.model(x)
diff --git a/ppgan/models/generators/deep_conv.py b/ppgan/models/generators/deep_conv.py
index 9712c9f6b1c505d9a981bc4d8c45db53739b0188..0e757cbecbe8c797734767704e008f740800408a 100644
--- a/ppgan/models/generators/deep_conv.py
+++ b/ppgan/models/generators/deep_conv.py
@@ -21,32 +21,33 @@ from .builder import GENERATORS
@GENERATORS.register()
class DeepConvGenerator(nn.Layer):
- """Create a Deep Convolutional generator"""
+ """Create a Deep Convolutional generator
+ Refer to https://arxiv.org/abs/1511.06434
+ """
def __init__(self, latent_dim, output_nc, size=64, ngf=64):
"""Construct a Deep Convolutional generator
Args:
- latent_dim (int) -- the number of latent dimension
- output_nc (int) -- the number of channels in output images
- size (int) -- size of output tensor
- ngf (int) -- the number of filters in the last conv layer
-
- Refer to https://arxiv.org/abs/1511.06434
+ latent_dim (int): the number of latent dimension
+ output_nc (int): the number of channels in output images
+ size (int): size of output tensor
+ ngf (int): the number of filters in the last conv layer
"""
super(DeepConvGenerator, self).__init__()
self.latent_dim = latent_dim
self.ngf = ngf
self.init_size = size // 4
- self.l1 = nn.Sequential(nn.Linear(latent_dim, ngf*2 * self.init_size ** 2))
+ self.l1 = nn.Sequential(
+ nn.Linear(latent_dim, ngf * 2 * self.init_size**2))
self.conv_blocks = nn.Sequential(
- nn.BatchNorm2D(ngf*2),
+ nn.BatchNorm2D(ngf * 2),
nn.Upsample(scale_factor=2),
- nn.Conv2D(ngf*2, ngf*2, 3, stride=1, padding=1),
- nn.BatchNorm2D(ngf*2, 0.2),
+ nn.Conv2D(ngf * 2, ngf * 2, 3, stride=1, padding=1),
+ nn.BatchNorm2D(ngf * 2, 0.2),
nn.LeakyReLU(0.2),
nn.Upsample(scale_factor=2),
- nn.Conv2D(ngf*2, ngf, 3, stride=1, padding=1),
+ nn.Conv2D(ngf * 2, ngf, 3, stride=1, padding=1),
nn.BatchNorm2D(ngf, 0.2),
nn.LeakyReLU(0.2),
nn.Conv2D(ngf, output_nc, 3, stride=1, padding=1),
@@ -55,24 +56,36 @@ class DeepConvGenerator(nn.Layer):
def random_inputs(self, batch_size):
return paddle.randn([batch_size, self.latent_dim])
-
+
def forward(self, z):
out = self.l1(z)
- out = out.reshape([out.shape[0], self.ngf * 2, self.init_size, self.init_size])
+ out = out.reshape(
+ [out.shape[0], self.ngf * 2, self.init_size, self.init_size])
img = self.conv_blocks(out)
return img
@GENERATORS.register()
class ConditionalDeepConvGenerator(DeepConvGenerator):
+ """Create a Conditional Deep Convolutional generator
+ """
def __init__(self, latent_dim, output_nc, n_class=10, **kwargs):
- super(ConditionalDeepConvGenerator, self).__init__(latent_dim + n_class, output_nc, **kwargs)
+ """Construct a Conditional Deep Convolutional generator
+ Args:
+ latent_dim (int): the number of latent dimension
+ output_nc (int): the number of channels in output images
+ n_class (int): the number of class
+ """
+ super(ConditionalDeepConvGenerator,
+ self).__init__(latent_dim + n_class, output_nc, **kwargs)
self.n_class = n_class
self.latent_dim = latent_dim
-
+
def random_inputs(self, batch_size):
- return_list = [super(ConditionalDeepConvGenerator, self).random_inputs(batch_size)]
+ return_list = [
+ super(ConditionalDeepConvGenerator, self).random_inputs(batch_size)
+ ]
class_id = paddle.randint(0, self.n_class, [batch_size])
return return_list + [class_id]
@@ -82,5 +95,5 @@ class ConditionalDeepConvGenerator(DeepConvGenerator):
class_id = F.one_hot(class_id, self.n_class).astype('float32')
class_id = class_id.reshape([x.shape[0], -1])
x = paddle.concat([x, class_id], 1)
-
+
return super(ConditionalDeepConvGenerator, self).forward(x)
diff --git a/ppgan/models/generators/deoldify.py b/ppgan/models/generators/deoldify.py
index b04f39df7e3bbfc22b8baa85e0ff2b5d8ce4b4df..1b0a9f8652dc848d74e3e2977a75627e3e12e555 100644
--- a/ppgan/models/generators/deoldify.py
+++ b/ppgan/models/generators/deoldify.py
@@ -1,16 +1,5 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# MIT License
+# Copyright (c) 2018 Jason Antic
import numpy as np
import paddle
diff --git a/ppgan/models/generators/drn.py b/ppgan/models/generators/drn.py
index 8c85ad0f7f07c4c6cf23eb35b0ffc95b7eef45ae..70de3ed77f6b490a76dfa374fe6cd08efedff96b 100644
--- a/ppgan/models/generators/drn.py
+++ b/ppgan/models/generators/drn.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# MIT License
+# Copyright (c) 2020 Yong Guo
+# code was based on https://github.com/guoyongcs/DRN
import math
import paddle
diff --git a/ppgan/models/generators/edvr.py b/ppgan/models/generators/edvr.py
new file mode 100644
index 0000000000000000000000000000000000000000..88107767bc21e31ba14c7198d6b484fff941a23c
--- /dev/null
+++ b/ppgan/models/generators/edvr.py
@@ -0,0 +1,794 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+from paddle.vision.ops import DeformConv2D
+
+from ...modules.init import kaiming_normal_, constant_, constant_init
+
+from .builder import GENERATORS
+
+
+@paddle.no_grad()
+def default_init_weights(module_list, scale=1, bias_fill=0, **kwargs):
+ """Initialize network weights.
+
+ Args:
+ module_list (list[nn.Module] | nn.Module): Modules to be initialized.
+ scale (float): Scale initialized weights, especially for residual
+ blocks. Default: 1.
+ bias_fill (float): The value to fill bias. Default: 0
+ kwargs (dict): Other arguments for initialization function.
+ """
+ if not isinstance(module_list, list):
+ module_list = [module_list]
+ for m in module_list:
+ if isinstance(m, nn.Conv2D):
+ kaiming_normal_(m.weight, **kwargs)
+ scale_weight = scale * m.weight
+ m.weight.set_value(scale_weight)
+ if m.bias is not None:
+ constant_(m.bias, bias_fill)
+ elif isinstance(m, nn.Linear):
+ kaiming_normal_(m.weight, **kwargs)
+ scale_weight = scale * m.weight
+ m.weight.set_value(scale_weight)
+ if m.bias is not None:
+ constant_(m.bias, bias_fill)
+
+
+class ResidualBlockNoBN(nn.Layer):
+ """Residual block without BN.
+
+ It has a style of:
+ ---Conv-ReLU-Conv-+-
+ |________________|
+
+ Args:
+ nf (int): Channel number of intermediate features.
+ Default: 64.
+ """
+
+ def __init__(self, nf=64):
+ super(ResidualBlockNoBN, self).__init__()
+ self.nf = nf
+ self.conv1 = nn.Conv2D(self.nf, self.nf, 3, 1, 1)
+ self.conv2 = nn.Conv2D(self.nf, self.nf, 3, 1, 1)
+ self.relu = nn.ReLU()
+ default_init_weights([self.conv1, self.conv2], 0.1)
+
+ def forward(self, x):
+ identity = x
+ out = self.conv2(self.relu(self.conv1(x)))
+ return identity + out
+
+
+def MakeMultiBlocks(func, num_layers, nf=64):
+ """Make layers by stacking the same blocks.
+
+ Args:
+ func (nn.Layer): nn.Layer class for basic block.
+ num_layers (int): number of blocks.
+
+ Returns:
+ nn.Sequential: Stacked blocks in nn.Sequential.
+ """
+ Blocks = nn.Sequential()
+ for i in range(num_layers):
+ Blocks.add_sublayer('block%d' % i, func(nf))
+ return Blocks
+
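+
+# Illustrative sketch (not part of the original file): stacking a few
+# ResidualBlockNoBN layers with MakeMultiBlocks. Each block is residual and
+# channel-preserving, so the output shape equals the input shape.
+def _make_multi_blocks_example():
+    blocks = MakeMultiBlocks(ResidualBlockNoBN, num_layers=3, nf=32)
+    x = paddle.rand([1, 32, 16, 16])
+    y = blocks(x)
+    return y.shape                         # [1, 32, 16, 16]
+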
+
+class PredeblurResNetPyramid(nn.Layer):
+ """Pre-dublur module.
+
+ Args:
+ in_nf (int): Channel number of input image. Default: 3.
+ nf (int): Channel number of intermediate features. Default: 64.
+ HR_in (bool): Whether the input has high resolution. Default: False.
+ """
+
+ def __init__(self, in_nf=3, nf=64, HR_in=False):
+ super(PredeblurResNetPyramid, self).__init__()
+ self.in_nf = in_nf
+ self.nf = nf
+ self.HR_in = True if HR_in else False
+ self.Leaky_relu = nn.LeakyReLU(negative_slope=0.1)
+ if self.HR_in:
+ self.conv_first_1 = nn.Conv2D(in_channels=self.in_nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.conv_first_2 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ self.conv_first_3 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ else:
+ self.conv_first = nn.Conv2D(in_channels=self.in_nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.RB_L1_1 = ResidualBlockNoBN(nf=self.nf)
+ self.RB_L1_2 = ResidualBlockNoBN(nf=self.nf)
+ self.RB_L1_3 = ResidualBlockNoBN(nf=self.nf)
+ self.RB_L1_4 = ResidualBlockNoBN(nf=self.nf)
+ self.RB_L1_5 = ResidualBlockNoBN(nf=self.nf)
+ self.RB_L2_1 = ResidualBlockNoBN(nf=self.nf)
+ self.RB_L2_2 = ResidualBlockNoBN(nf=self.nf)
+ self.RB_L3_1 = ResidualBlockNoBN(nf=self.nf)
+ self.deblur_L2_conv = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ self.deblur_L3_conv = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ self.upsample = nn.Upsample(scale_factor=2,
+ mode="bilinear",
+ align_corners=False,
+ align_mode=0)
+
+ def forward(self, x):
+ if self.HR_in:
+ L1_fea = self.Leaky_relu(self.conv_first_1(x))
+ L1_fea = self.Leaky_relu(self.conv_first_2(L1_fea))
+ L1_fea = self.Leaky_relu(self.conv_first_3(L1_fea))
+ else:
+ L1_fea = self.Leaky_relu(self.conv_first(x))
+ L2_fea = self.deblur_L2_conv(L1_fea)
+ L2_fea = self.Leaky_relu(L2_fea)
+ L3_fea = self.deblur_L3_conv(L2_fea)
+ L3_fea = self.Leaky_relu(L3_fea)
+ L3_fea = self.RB_L3_1(L3_fea)
+ L3_fea = self.upsample(L3_fea)
+ L2_fea = self.RB_L2_1(L2_fea) + L3_fea
+ L2_fea = self.RB_L2_2(L2_fea)
+ L2_fea = self.upsample(L2_fea)
+ L1_fea = self.RB_L1_1(L1_fea)
+ L1_fea = self.RB_L1_2(L1_fea) + L2_fea
+ out = self.RB_L1_3(L1_fea)
+ out = self.RB_L1_4(out)
+ out = self.RB_L1_5(out)
+ return out
+
+
+class TSAFusion(nn.Layer):
+ """Temporal Spatial Attention (TSA) fusion module.
+
+ Temporal: Calculate the correlation between center frame and
+ neighboring frames;
+ Spatial: It has 3 pyramid levels, the attention is similar to SFT.
+ (SFT: Recovering realistic texture in image super-resolution by deep
+ spatial feature transform.)
+
+ Args:
+ nf (int): Channel number of middle features. Default: 64.
+ nframes (int): Number of frames. Default: 5.
+ center (int): The index of center frame. Default: 2.
+ """
+
+ def __init__(self, nf=64, nframes=5, center=2):
+ super(TSAFusion, self).__init__()
+ self.nf = nf
+ self.nframes = nframes
+ self.center = center
+ self.sigmoid = nn.Sigmoid()
+ self.Leaky_relu = nn.LeakyReLU(negative_slope=0.1)
+ self.tAtt_2 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.tAtt_1 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.fea_fusion = nn.Conv2D(in_channels=self.nf * self.nframes,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.sAtt_1 = nn.Conv2D(in_channels=self.nf * self.nframes,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.max_pool = nn.MaxPool2D(3, stride=2, padding=1)
+ self.avg_pool = nn.AvgPool2D(3, stride=2, padding=1, exclusive=False)
+ self.sAtt_2 = nn.Conv2D(in_channels=2 * self.nf,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.sAtt_3 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.sAtt_4 = nn.Conv2D(
+ in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ )
+ self.sAtt_5 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.sAtt_add_1 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.sAtt_add_2 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.sAtt_L1 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self.sAtt_L2 = nn.Conv2D(
+ in_channels=2 * self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ )
+ self.sAtt_L3 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.upsample = nn.Upsample(scale_factor=2,
+ mode="bilinear",
+ align_corners=False,
+ align_mode=0)
+
+ def forward(self, aligned_fea):
+ """
+ Args:
+            aligned_fea (Tensor): Aligned features with shape (b, n, c, h, w).
+
+ Returns:
+ Tensor: Features after TSA with the shape (b, c, h, w).
+ """
+ B, N, C, H, W = aligned_fea.shape
+ x_center = aligned_fea[:, self.center, :, :, :]
+ emb_rf = self.tAtt_2(x_center)
+ emb = aligned_fea.reshape([-1, C, H, W])
+ emb = self.tAtt_1(emb)
+ emb = emb.reshape([-1, N, self.nf, H, W])
+ cor_l = []
+ for i in range(N):
+ emb_nbr = emb[:, i, :, :, :] #[B,C,W,H]
+ cor_tmp = paddle.sum(emb_nbr * emb_rf, axis=1)
+ cor_tmp = paddle.unsqueeze(cor_tmp, axis=1)
+ cor_l.append(cor_tmp)
+ cor_prob = paddle.concat(cor_l, axis=1) #[B,N,H,W]
+
+ cor_prob = self.sigmoid(cor_prob)
+ cor_prob = paddle.unsqueeze(cor_prob, axis=2) #[B,N,1,H,W]
+ cor_prob = paddle.expand(cor_prob, [B, N, self.nf, H, W]) #[B,N,C,H,W]
+ cor_prob = cor_prob.reshape([B, -1, H, W])
+ aligned_fea = aligned_fea.reshape([B, -1, H, W])
+ aligned_fea = aligned_fea * cor_prob
+
+ fea = self.fea_fusion(aligned_fea)
+ fea = self.Leaky_relu(fea)
+
+ #spatial fusion
+ att = self.sAtt_1(aligned_fea)
+ att = self.Leaky_relu(att)
+ att_max = self.max_pool(att)
+ att_avg = self.avg_pool(att)
+ att_pool = paddle.concat([att_max, att_avg], axis=1)
+ att = self.sAtt_2(att_pool)
+ att = self.Leaky_relu(att)
+
+ #pyramid
+ att_L = self.sAtt_L1(att)
+ att_L = self.Leaky_relu(att_L)
+ att_max = self.max_pool(att_L)
+ att_avg = self.avg_pool(att_L)
+ att_pool = paddle.concat([att_max, att_avg], axis=1)
+ att_L = self.sAtt_L2(att_pool)
+ att_L = self.Leaky_relu(att_L)
+ att_L = self.sAtt_L3(att_L)
+ att_L = self.Leaky_relu(att_L)
+ att_L = self.upsample(att_L)
+
+ att = self.sAtt_3(att)
+ att = self.Leaky_relu(att)
+ att = att + att_L
+ att = self.sAtt_4(att)
+ att = self.Leaky_relu(att)
+ att = self.upsample(att)
+ att = self.sAtt_5(att)
+ att_add = self.sAtt_add_1(att)
+ att_add = self.Leaky_relu(att_add)
+ att_add = self.sAtt_add_2(att_add)
+ att = self.sigmoid(att)
+
+ fea = fea * att * 2 + att_add
+ return fea
+
+
+class DCNPack(nn.Layer):
+ """Modulated deformable conv for deformable alignment.
+
+ Ref:
+ Delving Deep into Deformable Alignment in Video Super-Resolution.
+ """
+
+ def __init__(self,
+ num_filters=64,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ dilation=1,
+ deformable_groups=8,
+ extra_offset_mask=True):
+ super(DCNPack, self).__init__()
+ self.extra_offset_mask = extra_offset_mask
+ self.deformable_groups = deformable_groups
+ self.num_filters = num_filters
+ if isinstance(kernel_size, int):
+ self.kernel_size = [kernel_size, kernel_size]
+ self.conv_offset_mask = nn.Conv2D(in_channels=self.num_filters,
+ out_channels=self.deformable_groups *
+ 3 * self.kernel_size[0] *
+ self.kernel_size[1],
+ kernel_size=self.kernel_size,
+ stride=stride,
+ padding=padding)
+ self.total_channels = self.deformable_groups * 3 * self.kernel_size[
+ 0] * self.kernel_size[1]
+ self.split_channels = self.total_channels // 3
+ self.dcn = DeformConv2D(in_channels=self.num_filters,
+ out_channels=self.num_filters,
+ kernel_size=self.kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ deformable_groups=self.deformable_groups)
+ self.sigmoid = nn.Sigmoid()
+ # init conv offset
+ constant_init(self.conv_offset_mask, 0., 0.)
+
+ def forward(self, fea_and_offset):
+ out = None
+ x = None
+ if self.extra_offset_mask:
+ out = self.conv_offset_mask(fea_and_offset[1])
+ x = fea_and_offset[0]
+ o1 = out[:, 0:self.split_channels, :, :]
+ o2 = out[:, self.split_channels:2 * self.split_channels, :, :]
+ mask = out[:, 2 * self.split_channels:, :, :]
+ offset = paddle.concat([o1, o2], axis=1)
+ mask = self.sigmoid(mask)
+ y = self.dcn(x, offset, mask)
+ return y
+
+
+class PCDAlign(nn.Layer):
+ """Alignment module using Pyramid, Cascading and Deformable convolution
+ (PCD). It is used in EDVR.
+
+ Ref:
+ EDVR: Video Restoration with Enhanced Deformable Convolutional Networks
+
+ Args:
+ nf (int): Channel number of middle features. Default: 64.
+        groups (int): Deformable groups. Default: 8.
+ """
+
+ def __init__(self, nf=64, groups=8):
+ super(PCDAlign, self).__init__()
+ self.nf = nf
+ self.groups = groups
+ self.Leaky_relu = nn.LeakyReLU(negative_slope=0.1)
+ self.upsample = nn.Upsample(scale_factor=2,
+ mode="bilinear",
+ align_corners=False,
+ align_mode=0)
+ # Pyramid has three levels:
+ # L3: level 3, 1/4 spatial size
+ # L2: level 2, 1/2 spatial size
+ # L1: level 1, original spatial size
+
+ # L3
+ self.PCD_Align_L3_offset_conv1 = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L3_offset_conv2 = nn.Conv2D(in_channels=nf,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L3_dcn = DCNPack(num_filters=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ deformable_groups=groups)
+ #L2
+ self.PCD_Align_L2_offset_conv1 = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L2_offset_conv2 = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L2_offset_conv3 = nn.Conv2D(in_channels=nf,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L2_dcn = DCNPack(num_filters=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ deformable_groups=groups)
+ self.PCD_Align_L2_fea_conv = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ #L1
+ self.PCD_Align_L1_offset_conv1 = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L1_offset_conv2 = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L1_offset_conv3 = nn.Conv2D(in_channels=nf,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_L1_dcn = DCNPack(num_filters=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ deformable_groups=groups)
+ self.PCD_Align_L1_fea_conv = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ #cascade
+ self.PCD_Align_cas_offset_conv1 = nn.Conv2D(in_channels=nf * 2,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_cas_offset_conv2 = nn.Conv2D(in_channels=nf,
+ out_channels=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.PCD_Align_cascade_dcn = DCNPack(num_filters=nf,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ deformable_groups=groups)
+
+ def forward(self, nbr_fea_l, ref_fea_l):
+ """Align neighboring frame features to the reference frame features.
+
+ Args:
+ nbr_fea_l (list[Tensor]): Neighboring feature list. It
+ contains three pyramid levels (L1, L2, L3),
+ each with shape (b, c, h, w).
+ ref_fea_l (list[Tensor]): Reference feature list. It
+ contains three pyramid levels (L1, L2, L3),
+ each with shape (b, c, h, w).
+
+ Returns:
+ Tensor: Aligned features.
+ """
+ #L3
+ L3_offset = paddle.concat([nbr_fea_l[2], ref_fea_l[2]], axis=1)
+ L3_offset = self.PCD_Align_L3_offset_conv1(L3_offset)
+ L3_offset = self.Leaky_relu(L3_offset)
+ L3_offset = self.PCD_Align_L3_offset_conv2(L3_offset)
+ L3_offset = self.Leaky_relu(L3_offset)
+
+ L3_fea = self.PCD_Align_L3_dcn([nbr_fea_l[2], L3_offset])
+ L3_fea = self.Leaky_relu(L3_fea)
+ #L2
+ L2_offset = paddle.concat([nbr_fea_l[1], ref_fea_l[1]], axis=1)
+ L2_offset = self.PCD_Align_L2_offset_conv1(L2_offset)
+ L2_offset = self.Leaky_relu(L2_offset)
+ L3_offset = self.upsample(L3_offset)
+ L2_offset = paddle.concat([L2_offset, L3_offset * 2], axis=1)
+ L2_offset = self.PCD_Align_L2_offset_conv2(L2_offset)
+ L2_offset = self.Leaky_relu(L2_offset)
+ L2_offset = self.PCD_Align_L2_offset_conv3(L2_offset)
+ L2_offset = self.Leaky_relu(L2_offset)
+ L2_fea = self.PCD_Align_L2_dcn([nbr_fea_l[1], L2_offset])
+ L3_fea = self.upsample(L3_fea)
+ L2_fea = paddle.concat([L2_fea, L3_fea], axis=1)
+ L2_fea = self.PCD_Align_L2_fea_conv(L2_fea)
+ L2_fea = self.Leaky_relu(L2_fea)
+ #L1
+ L1_offset = paddle.concat([nbr_fea_l[0], ref_fea_l[0]], axis=1)
+ L1_offset = self.PCD_Align_L1_offset_conv1(L1_offset)
+ L1_offset = self.Leaky_relu(L1_offset)
+ L2_offset = self.upsample(L2_offset)
+ L1_offset = paddle.concat([L1_offset, L2_offset * 2], axis=1)
+ L1_offset = self.PCD_Align_L1_offset_conv2(L1_offset)
+ L1_offset = self.Leaky_relu(L1_offset)
+ L1_offset = self.PCD_Align_L1_offset_conv3(L1_offset)
+ L1_offset = self.Leaky_relu(L1_offset)
+ L1_fea = self.PCD_Align_L1_dcn([nbr_fea_l[0], L1_offset])
+ L2_fea = self.upsample(L2_fea)
+ L1_fea = paddle.concat([L1_fea, L2_fea], axis=1)
+ L1_fea = self.PCD_Align_L1_fea_conv(L1_fea)
+ #cascade
+ offset = paddle.concat([L1_fea, ref_fea_l[0]], axis=1)
+ offset = self.PCD_Align_cas_offset_conv1(offset)
+ offset = self.Leaky_relu(offset)
+ offset = self.PCD_Align_cas_offset_conv2(offset)
+ offset = self.Leaky_relu(offset)
+ L1_fea = self.PCD_Align_cascade_dcn([L1_fea, offset])
+ L1_fea = self.Leaky_relu(L1_fea)
+
+ return L1_fea
+
+
+@GENERATORS.register()
+class EDVRNet(nn.Layer):
+ """EDVR network structure for video super-resolution.
+
+    Only the x4 upsampling factor is currently supported.
+ Paper:
+ EDVR: Video Restoration with Enhanced Deformable Convolutional Networks
+
+ Args:
+ in_nf (int): Channel number of input image. Default: 3.
+ out_nf (int): Channel number of output image. Default: 3.
+ scale_factor (int): Scale factor from input image to output image. Default: 4.
+ nf (int): Channel number of intermediate features. Default: 64.
+ nframes (int): Number of input frames. Default: 5.
+        groups (int): Deformable groups. Default: 8.
+        front_RBs (int): Number of blocks for feature extraction. Default: 5.
+        back_RBs (int): Number of blocks for reconstruction. Default: 10.
+        center (int): The index of the center frame, counting from 0. Default: None.
+        predeblur (bool): Whether to use the predeblur module. Default: False.
+        HR_in (bool): Whether the input has high resolution. Default: False.
+        w_TSA (bool): Whether to use the TSA module. Default: True.
+ """
+
+ def __init__(self,
+ in_nf=3,
+ out_nf=3,
+ scale_factor=4,
+ nf=64,
+ nframes=5,
+ groups=8,
+ front_RBs=5,
+ back_RBs=10,
+ center=None,
+ predeblur=False,
+ HR_in=False,
+ w_TSA=True):
+ super(EDVRNet, self).__init__()
+ self.in_nf = in_nf
+ self.out_nf = out_nf
+ self.scale_factor = scale_factor
+ self.nf = nf
+ self.nframes = nframes
+ self.groups = groups
+ self.front_RBs = front_RBs
+ self.back_RBs = back_RBs
+ self.center = nframes // 2 if center is None else center
+ self.predeblur = True if predeblur else False
+ self.HR_in = True if HR_in else False
+ self.w_TSA = True if w_TSA else False
+
+ self.Leaky_relu = nn.LeakyReLU(negative_slope=0.1)
+ if self.predeblur:
+ self.pre_deblur = PredeblurResNetPyramid(in_nf=self.in_nf,
+ nf=self.nf,
+ HR_in=self.HR_in)
+ self.cov_1 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1)
+ else:
+ self.conv_first = nn.Conv2D(in_channels=self.in_nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ #feature extraction module
+ self.feature_extractor = MakeMultiBlocks(ResidualBlockNoBN,
+ self.front_RBs, self.nf)
+ self.fea_L2_conv1 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=2,
+ padding=1)
+ self.fea_L2_conv2 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.fea_L3_conv1 = nn.Conv2D(
+ in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ )
+ self.fea_L3_conv2 = nn.Conv2D(in_channels=self.nf,
+ out_channels=self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ #PCD alignment module
+ self.PCDModule = PCDAlign(nf=self.nf, groups=self.groups)
+
+ #TSA Fusion module
+ if self.w_TSA:
+ self.TSAModule = TSAFusion(nf=self.nf,
+ nframes=self.nframes,
+ center=self.center)
+ else:
+ self.TSAModule = nn.Conv2D(in_channels=self.nframes * self.nf,
+ out_channels=self.nf,
+ kernel_size=1,
+ stride=1)
+
+ #reconstruction module
+ self.reconstructor = MakeMultiBlocks(ResidualBlockNoBN, self.back_RBs,
+ self.nf)
+ self.upconv1 = nn.Conv2D(in_channels=self.nf,
+ out_channels=4 * self.nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+
+ self.pixel_shuffle = nn.PixelShuffle(2)
+ self.upconv2 = nn.Conv2D(in_channels=self.nf,
+ out_channels=4 * 64,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.HRconv = nn.Conv2D(in_channels=64,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ self.conv_last = nn.Conv2D(in_channels=64,
+ out_channels=self.out_nf,
+ kernel_size=3,
+ stride=1,
+ padding=1)
+ if self.scale_factor == 4:
+ self.upsample = nn.Upsample(scale_factor=self.scale_factor,
+ mode="bilinear",
+ align_corners=False,
+ align_mode=0)
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): Input features with shape (b, n, c, h, w).
+
+ Returns:
+ Tensor: Features after EDVR with the shape (b, c, scale_factor*h, scale_factor*w).
+ """
+ B, N, C, H, W = x.shape
+ x_center = x[:, self.center, :, :, :]
+ L1_fea = x.reshape([-1, C, H, W]) #[B*N,C,W,H]
+ if self.predeblur:
+ L1_fea = self.pre_deblur(L1_fea)
+ L1_fea = self.cov_1(L1_fea)
+ if self.HR_in:
+ H, W = H // 4, W // 4
+ else:
+ L1_fea = self.conv_first(L1_fea)
+ L1_fea = self.Leaky_relu(L1_fea)
+
+ # feature extraction and create Pyramid
+ L1_fea = self.feature_extractor(L1_fea)
+ # L2
+ L2_fea = self.fea_L2_conv1(L1_fea)
+ L2_fea = self.Leaky_relu(L2_fea)
+ L2_fea = self.fea_L2_conv2(L2_fea)
+ L2_fea = self.Leaky_relu(L2_fea)
+ # L3
+ L3_fea = self.fea_L3_conv1(L2_fea)
+ L3_fea = self.Leaky_relu(L3_fea)
+ L3_fea = self.fea_L3_conv2(L3_fea)
+ L3_fea = self.Leaky_relu(L3_fea)
+
+ L1_fea = L1_fea.reshape([-1, N, self.nf, H, W])
+ L2_fea = L2_fea.reshape([-1, N, self.nf, H // 2, W // 2])
+ L3_fea = L3_fea.reshape([-1, N, self.nf, H // 4, W // 4])
+
+ # pcd align
+ ref_fea_l = [
+ L1_fea[:, self.center, :, :, :], L2_fea[:, self.center, :, :, :],
+ L3_fea[:, self.center, :, :, :]
+ ]
+
+ aligned_fea = [
+ self.PCDModule([
+ L1_fea[:, i, :, :, :], L2_fea[:, i, :, :, :], L3_fea[:,
+ i, :, :, :]
+ ], ref_fea_l) for i in range(N)
+ ]
+
+ # TSA Fusion
+ aligned_fea = paddle.stack(aligned_fea, axis=1) # [B, N, C, H, W]
+ fea = None
+ if not self.w_TSA:
+ aligned_fea = aligned_fea.reshape([B, -1, H, W])
+ fea = self.TSAModule(aligned_fea) # [B, N, C, H, W]
+
+ #Reconstruct
+ out = self.reconstructor(fea)
+
+ out = self.upconv1(out)
+ out = self.pixel_shuffle(out)
+ out = self.Leaky_relu(out)
+ out = self.upconv2(out)
+ out = self.pixel_shuffle(out)
+ out = self.Leaky_relu(out)
+
+ out = self.HRconv(out)
+ out = self.Leaky_relu(out)
+ out = self.conv_last(out)
+
+ if self.HR_in:
+ base = x_center
+ else:
+ base = self.upsample(x_center)
+
+ out += base
+ return out
diff --git a/ppgan/models/generators/generater_animegan.py b/ppgan/models/generators/generater_animegan.py
index a2b09fe453e890420102c84cb2a643eb8d5ad667..2d3f3aa037cc76d2ad51c706e5376f8269e53da0 100644
--- a/ppgan/models/generators/generater_animegan.py
+++ b/ppgan/models/generators/generater_animegan.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# code was heavily based on https://github.com/TachibanaYoshino/AnimeGANv2
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/TachibanaYoshino/AnimeGANv2#license
import paddle
import paddle.nn as nn
diff --git a/ppgan/models/generators/generater_aotgan.py b/ppgan/models/generators/generater_aotgan.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b670b23a08600d55a39c1fb01176118b53e84d8
--- /dev/null
+++ b/ppgan/models/generators/generater_aotgan.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.utils import spectral_norm
+
+from .builder import GENERATORS
+
+# Aggregated Contextual Transformations Block
+class AOTBlock(nn.Layer):
+ def __init__(self, dim, rates):
+ super(AOTBlock, self).__init__()
+
+ self.rates = rates
+ for i, rate in enumerate(rates):
+ self.__setattr__(
+ 'block{}'.format(str(i).zfill(2)),
+ nn.Sequential(
+ nn.Pad2D(rate, mode='reflect'),
+ nn.Conv2D(dim, dim//4, 3, 1, 0, dilation=int(rate)),
+ nn.ReLU()))
+ self.fuse = nn.Sequential(
+ nn.Pad2D(1, mode='reflect'),
+ nn.Conv2D(dim, dim, 3, 1, 0, dilation=1))
+ self.gate = nn.Sequential(
+ nn.Pad2D(1, mode='reflect'),
+ nn.Conv2D(dim, dim, 3, 1, 0, dilation=1))
+
+ def forward(self, x):
+ out = [self.__getattr__(f'block{str(i).zfill(2)}')(x) for i in range(len(self.rates))]
+ out = paddle.concat(out, 1)
+ out = self.fuse(out)
+ mask = my_layer_norm(self.gate(x))
+ mask = F.sigmoid(mask)
+ return x * (1 - mask) + out * mask
+
+def my_layer_norm(feat):
+ mean = feat.mean((2, 3), keepdim=True)
+ std = feat.std((2, 3), keepdim=True) + 1e-9
+ feat = 2 * (feat - mean) / std - 1
+ feat = 5 * feat
+ return feat
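+# my_layer_norm standardizes the gate features per channel over the spatial
+# dimensions and rescales them before the sigmoid, yielding the soft spatial
+# mask that AOTBlock.forward uses to blend the input with the fused
+# dilated-convolution features.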
+
+class UpConv(nn.Layer):
+ def __init__(self, inc, outc, scale=2):
+ super(UpConv, self).__init__()
+ self.scale = scale
+ self.conv = nn.Conv2D(inc, outc, 3, 1, 1)
+
+ def forward(self, x):
+        return self.conv(F.interpolate(x, scale_factor=self.scale, mode='bilinear', align_corners=True))
+
+# generator
+@GENERATORS.register()
+class InpaintGenerator(nn.Layer):
+ def __init__(self, rates, block_num):
+ super(InpaintGenerator, self).__init__()
+
+ self.encoder = nn.Sequential(
+ nn.Pad2D(3, mode='reflect'),
+ nn.Conv2D(4, 64, 7, 1, 0),
+ nn.ReLU(),
+ nn.Conv2D(64, 128, 4, 2, 1),
+ nn.ReLU(),
+ nn.Conv2D(128, 256, 4, 2, 1),
+ nn.ReLU()
+ )
+
+ self.middle = nn.Sequential(*[AOTBlock(256, rates) for _ in range(block_num)])
+
+ self.decoder = nn.Sequential(
+ UpConv(256, 128),
+ nn.ReLU(),
+ UpConv(128, 64),
+ nn.ReLU(),
+ nn.Conv2D(64, 3, 3, 1, 1)
+ )
+
+ def forward(self, x):
+ x = self.encoder(x)
+ x = self.middle(x)
+ x = self.decoder(x)
+ x = paddle.tanh(x)
+
+ return x
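+
+# A minimal usage sketch (illustrative settings; the actual rates and block_num
+# come from the AOT-GAN config). The encoder expects a 4-channel input: the
+# masked image concatenated with its binary mask.
+#   gen = InpaintGenerator(rates=[1, 2, 4, 8], block_num=8)
+#   img = paddle.rand([1, 3, 256, 256])
+#   mask = paddle.ones([1, 1, 256, 256])
+#   out = gen(paddle.concat([img * (1 - mask), mask], axis=1))  # [1, 3, 256, 256]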
diff --git a/ppgan/models/generators/generater_lapstyle.py b/ppgan/models/generators/generater_lapstyle.py
new file mode 100644
index 0000000000000000000000000000000000000000..20108d7cd06804602140e16782bb90af5489093e
--- /dev/null
+++ b/ppgan/models/generators/generater_lapstyle.py
@@ -0,0 +1,314 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import paddle
+import paddle.nn as nn
+from ...utils.download import get_path_from_url
+
+from .builder import GENERATORS
+
+
+def calc_mean_std(feat, eps=1e-5):
+ """calculate mean and standard deviation.
+
+ Args:
+ feat (Tensor): Tensor with shape (N, C, H, W).
+ eps (float): Default: 1e-5.
+
+ Return:
+ mean and std of feat
+ shape: [N, C, 1, 1]
+ """
+ size = feat.shape
+ assert (len(size) == 4)
+ N, C = size[:2]
+ feat_var = feat.reshape([N, C, -1])
+ feat_var = paddle.var(feat_var, axis=2) + eps
+ feat_std = paddle.sqrt(feat_var)
+ feat_std = feat_std.reshape([N, C, 1, 1])
+ feat_mean = feat.reshape([N, C, -1])
+ feat_mean = paddle.mean(feat_mean, axis=2)
+ feat_mean = feat_mean.reshape([N, C, 1, 1])
+ return feat_mean, feat_std
+
+
+def mean_variance_norm(feat):
+ """mean_variance_norm.
+
+ Args:
+ feat (Tensor): Tensor with shape (N, C, H, W).
+
+ Return:
+ Normalized feat with shape (N, C, H, W)
+ """
+ size = feat.shape
+ mean, std = calc_mean_std(feat)
+ normalized_feat = (feat - mean.expand(size)) / std.expand(size)
+ return normalized_feat
+
+
+def adaptive_instance_normalization(content_feat, style_feat):
+ """adaptive_instance_normalization.
+
+ Args:
+ content_feat (Tensor): Tensor with shape (N, C, H, W).
+ style_feat (Tensor): Tensor with shape (N, C, H, W).
+
+ Return:
+ Normalized content_feat with shape (N, C, H, W)
+ """
+ assert (content_feat.shape[:2] == style_feat.shape[:2])
+ size = content_feat.shape
+ style_mean, style_std = calc_mean_std(style_feat)
+ content_mean, content_std = calc_mean_std(content_feat)
+
+ normalized_feat = (content_feat -
+ content_mean.expand(size)) / content_std.expand(size)
+ return normalized_feat * style_std.expand(size) + style_mean.expand(size)
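+
+# Shape-only example: AdaIN re-normalizes each channel of the content features
+# to match the per-channel mean/std of the style features.
+#   cF = paddle.rand([1, 512, 32, 32])
+#   sF = paddle.rand([1, 512, 32, 32])
+#   out = adaptive_instance_normalization(cF, sF)  # shape [1, 512, 32, 32]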
+
+
+class ResnetBlock(nn.Layer):
+ """Residual block.
+
+ It has a style of:
+ ---Pad-Conv-ReLU-Pad-Conv-+-
+ |________________________|
+
+ Args:
+ dim (int): Channel number of intermediate features.
+ """
+ def __init__(self, dim):
+ super(ResnetBlock, self).__init__()
+ self.conv_block = nn.Sequential(nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(dim, dim, (3, 3)), nn.ReLU(),
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(dim, dim, (3, 3)))
+
+ def forward(self, x):
+ out = x + self.conv_block(x)
+ return out
+
+
+class ConvBlock(nn.Layer):
+ """convolution block.
+
+ It has a style of:
+ ---Pad-Conv-ReLU---
+
+ Args:
+ dim1 (int): Channel number of input features.
+ dim2 (int): Channel number of output features.
+ """
+ def __init__(self, dim1, dim2):
+ super(ConvBlock, self).__init__()
+ self.conv_block = nn.Sequential(nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(dim1, dim2, (3, 3)),
+ nn.ReLU())
+
+ def forward(self, x):
+ out = self.conv_block(x)
+ return out
+
+
+@GENERATORS.register()
+class DecoderNet(nn.Layer):
+ """Decoder of Drafting module.
+ Paper:
+ Drafting and Revision: Laplacian Pyramid Network for Fast High-Quality
+ Artistic Style Transfer.
+ """
+ def __init__(self):
+ super(DecoderNet, self).__init__()
+
+ self.resblock_41 = ResnetBlock(512)
+ self.convblock_41 = ConvBlock(512, 256)
+ self.resblock_31 = ResnetBlock(256)
+ self.convblock_31 = ConvBlock(256, 128)
+
+ self.convblock_21 = ConvBlock(128, 128)
+ self.convblock_22 = ConvBlock(128, 64)
+
+ self.convblock_11 = ConvBlock(64, 64)
+ self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
+
+ self.final_conv = nn.Sequential(nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(64, 3, (3, 3)))
+
+ def forward(self, cF, sF):
+
+ out = adaptive_instance_normalization(cF['r41'], sF['r41'])
+ out = self.resblock_41(out)
+ out = self.convblock_41(out)
+
+ out = self.upsample(out)
+ out += adaptive_instance_normalization(cF['r31'], sF['r31'])
+ out = self.resblock_31(out)
+ out = self.convblock_31(out)
+
+ out = self.upsample(out)
+ out += adaptive_instance_normalization(cF['r21'], sF['r21'])
+ out = self.convblock_21(out)
+ out = self.convblock_22(out)
+
+ out = self.upsample(out)
+ out = self.convblock_11(out)
+ out = self.final_conv(out)
+ return out
+
+
+@GENERATORS.register()
+class Encoder(nn.Layer):
+ """Encoder of Drafting module.
+ Paper:
+ Drafting and Revision: Laplacian Pyramid Network for Fast High-Quality
+ Artistic Style Transfer.
+ """
+ def __init__(self):
+ super(Encoder, self).__init__()
+ vgg_net = nn.Sequential(
+ nn.Conv2D(3, 3, (1, 1)),
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(3, 64, (3, 3)),
+ nn.ReLU(), # relu1-1
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(64, 64, (3, 3)),
+ nn.ReLU(), # relu1-2
+ nn.MaxPool2D((2, 2), (2, 2), (0, 0), ceil_mode=True),
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(64, 128, (3, 3)),
+ nn.ReLU(), # relu2-1
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(128, 128, (3, 3)),
+ nn.ReLU(), # relu2-2
+ nn.MaxPool2D((2, 2), (2, 2), (0, 0), ceil_mode=True),
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(128, 256, (3, 3)),
+ nn.ReLU(), # relu3-1
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(256, 256, (3, 3)),
+ nn.ReLU(), # relu3-2
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(256, 256, (3, 3)),
+ nn.ReLU(), # relu3-3
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(256, 256, (3, 3)),
+ nn.ReLU(), # relu3-4
+ nn.MaxPool2D((2, 2), (2, 2), (0, 0), ceil_mode=True),
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(256, 512, (3, 3)),
+ nn.ReLU(), # relu4-1, this is the last layer used
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(512, 512, (3, 3)),
+ nn.ReLU(), # relu4-2
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(512, 512, (3, 3)),
+ nn.ReLU(), # relu4-3
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(512, 512, (3, 3)),
+ nn.ReLU(), # relu4-4
+ nn.MaxPool2D((2, 2), (2, 2), (0, 0), ceil_mode=True),
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(512, 512, (3, 3)),
+ nn.ReLU(), # relu5-1
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(512, 512, (3, 3)),
+ nn.ReLU(), # relu5-2
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(512, 512, (3, 3)),
+ nn.ReLU(), # relu5-3
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(512, 512, (3, 3)),
+ nn.ReLU() # relu5-4
+ )
+ weight_path = get_path_from_url(
+ 'https://paddlegan.bj.bcebos.com/models/vgg_normalised.pdparams')
+ vgg_net.set_dict(paddle.load(weight_path))
+ self.enc_1 = nn.Sequential(*list(
+ vgg_net.children())[:4]) # input -> relu1_1
+ self.enc_2 = nn.Sequential(*list(
+ vgg_net.children())[4:11]) # relu1_1 -> relu2_1
+ self.enc_3 = nn.Sequential(*list(
+ vgg_net.children())[11:18]) # relu2_1 -> relu3_1
+ self.enc_4 = nn.Sequential(*list(
+ vgg_net.children())[18:31]) # relu3_1 -> relu4_1
+ self.enc_5 = nn.Sequential(*list(
+ vgg_net.children())[31:44]) # relu4_1 -> relu5_1
+
+ def forward(self, x):
+ out = {}
+ x = self.enc_1(x)
+ out['r11'] = x
+ x = self.enc_2(x)
+ out['r21'] = x
+ x = self.enc_3(x)
+ out['r31'] = x
+ x = self.enc_4(x)
+ out['r41'] = x
+ x = self.enc_5(x)
+ out['r51'] = x
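+        # r11..r51 are the activations right after relu1_1 ... relu5_1;
+        # DecoderNet consumes the 'r41', 'r31' and 'r21' entries of these dicts.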
+ return out
+
+
+@GENERATORS.register()
+class RevisionNet(nn.Layer):
+ """RevisionNet of Revision module.
+ Paper:
+ Drafting and Revision: Laplacian Pyramid Network for Fast High-Quality
+ Artistic Style Transfer.
+ """
+ def __init__(self, input_nc=6):
+ super(RevisionNet, self).__init__()
+ DownBlock = []
+ DownBlock += [
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(input_nc, 64, (3, 3)),
+ nn.ReLU()
+ ]
+ DownBlock += [
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(64, 64, (3, 3), stride=2),
+ nn.ReLU()
+ ]
+
+ self.resblock = ResnetBlock(64)
+
+ UpBlock = []
+ UpBlock += [
+ nn.Upsample(scale_factor=2, mode='nearest'),
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(64, 64, (3, 3)),
+ nn.ReLU()
+ ]
+ UpBlock += [
+ nn.Pad2D([1, 1, 1, 1], mode='reflect'),
+ nn.Conv2D(64, 3, (3, 3))
+ ]
+
+ self.DownBlock = nn.Sequential(*DownBlock)
+ self.UpBlock = nn.Sequential(*UpBlock)
+
+ def forward(self, input):
+ """
+ Args:
+            input (Tensor): (b, 6, 256, 256), the concatenation of the
+                upsampled previous result and the current Laplacian component.
+
+ Returns:
+ Tensor: (b, 3, 256, 256).
+ """
+ out = self.DownBlock(input)
+ out = self.resblock(out)
+ out = self.UpBlock(out)
+ return out
diff --git a/ppgan/models/generators/generater_photopen.py b/ppgan/models/generators/generater_photopen.py
new file mode 100644
index 0000000000000000000000000000000000000000..210b9672c74651fd9678604c3b8367c31f54d176
--- /dev/null
+++ b/ppgan/models/generators/generater_photopen.py
@@ -0,0 +1,279 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import re
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.utils import spectral_norm
+
+from ppgan.utils.photopen import build_norm_layer, simam, Dict
+from .builder import GENERATORS
+
+class SPADE(nn.Layer):
+ def __init__(self, config_text, norm_nc, label_nc):
+ super(SPADE, self).__init__()
+
+ parsed = re.search(r'spade(\D+)(\d)x\d', config_text)
+ param_free_norm_type = str(parsed.group(1))
+ ks = int(parsed.group(2))
+
+ self.param_free_norm = build_norm_layer(param_free_norm_type)(norm_nc)
+
+ # The dimension of the intermediate embedding space. Yes, hardcoded.
+ nhidden = 128
+
+ pw = ks // 2
+ self.mlp_shared = nn.Sequential(*[
+ nn.Conv2D(label_nc, nhidden, ks, 1, pw),
+ nn.GELU(),
+ ])
+ self.mlp_gamma = nn.Conv2D(nhidden, norm_nc, ks, 1, pw)
+ self.mlp_beta = nn.Conv2D(nhidden, norm_nc, ks, 1, pw)
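+        # mlp_shared embeds the (resized) segmentation map; mlp_gamma/mlp_beta
+        # then predict a per-pixel scale and bias that modulate the
+        # parameter-free normalization in forward().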
+
+ def forward(self, x, segmap):
+ # Part 1. generate parameter-free normalized activations
+ normalized = self.param_free_norm(x)
+
+ # Part 2. produce scaling and bias conditioned on semantic map
+ segmap = F.interpolate(segmap, x.shape[2:])
+ actv = self.mlp_shared(segmap)
+ gamma = self.mlp_gamma(actv)
+ beta = self.mlp_beta(actv)
+
+ # apply scale and bias
+ out = normalized * (1 + gamma) + beta
+
+ return out
+
+class SPADEResnetBlock(nn.Layer):
+ def __init__(self, fin, fout, opt):
+ super(SPADEResnetBlock, self).__init__()
+
+ # Attributes
+ self.learned_shortcut = (fin != fout)
+ fmiddle = min(fin, fout)
+
+ # define spade layers
+ spade_config_str = opt.norm_G.replace('spectral', '')
+ self.spade_0 = SPADE(spade_config_str, fin, opt.semantic_nc)
+ self.spade_1 = SPADE(spade_config_str, fmiddle, opt.semantic_nc)
+ if self.learned_shortcut:
+ self.spade_s = SPADE(spade_config_str, fin, opt.semantic_nc)
+
+ # define act_conv layers
+ self.act_conv_0 = nn.Sequential(*[
+ nn.GELU(),
+ spectral_norm(nn.Conv2D(fin, fmiddle, 3, 1, 1,
+ weight_attr=None,
+ bias_attr=None)),
+ ])
+ self.act_conv_1 = nn.Sequential(*[
+ nn.GELU(),
+ spectral_norm(nn.Conv2D(fmiddle, fout, 3, 1, 1,
+ weight_attr=None,
+ bias_attr=None)),
+ ])
+ if self.learned_shortcut:
+ self.act_conv_s = nn.Sequential(*[
+ spectral_norm(nn.Conv2D(fin, fout, 1, 1, 0, bias_attr=False,
+ weight_attr=None)),
+ ])
+
+
+ def forward(self, x, seg):
+ x_s = self.shortcut(x, seg)
+
+ dx = self.act_conv_0(self.spade_0(x, seg))
+ dx = self.act_conv_1(self.spade_1(dx, seg))
+
+ return simam(dx + x_s)
+
+ def shortcut(self, x, seg):
+ if self.learned_shortcut:
+ x_s = self.act_conv_s(self.spade_s(x, seg))
+ else:
+ x_s = x
+ return x_s
+
+@GENERATORS.register()
+class SPADEGenerator(nn.Layer):
+ def __init__(self,
+ ngf,
+ num_upsampling_layers,
+ crop_size,
+ aspect_ratio,
+ norm_G,
+ semantic_nc,
+ use_vae,
+ nef,
+ ):
+ super(SPADEGenerator, self).__init__()
+
+ opt = {
+ 'ngf': ngf,
+ 'num_upsampling_layers': num_upsampling_layers,
+ 'crop_size': crop_size,
+ 'aspect_ratio': aspect_ratio,
+ 'norm_G': norm_G,
+ 'semantic_nc': semantic_nc,
+ 'use_vae': use_vae,
+ 'nef': nef,
+ }
+ self.opt = Dict(opt)
+
+ nf = self.opt.ngf
+ self.sw, self.sh = self.compute_latent_vector_size(self.opt)
+
+ if self.opt.use_vae:
+ self.fc = nn.Linear(opt.z_dim, 16 * opt.nef * self.sw * self.sh)
+ self.head_0 = SPADEResnetBlock(16 * opt.nef, 16 * nf, self.opt)
+ else:
+ self.fc = nn.Conv2D(self.opt.semantic_nc, 16 * nf, 3, 1, 1)
+ self.head_0 = SPADEResnetBlock(16 * nf, 16 * nf, self.opt)
+
+ self.G_middle_0 = SPADEResnetBlock(16 * nf, 16 * nf, self.opt)
+ self.G_middle_1 = SPADEResnetBlock(16 * nf, 16 * nf, self.opt)
+
+ self.up_0 = SPADEResnetBlock(16 * nf, 8 * nf, self.opt)
+ self.up_1 = SPADEResnetBlock(8 * nf, 4 * nf, self.opt)
+ self.up_2 = SPADEResnetBlock(4 * nf, 2 * nf, self.opt)
+ self.up_3 = SPADEResnetBlock(2 * nf, 1 * nf, self.opt)
+
+ final_nc = nf
+
+ if self.opt.num_upsampling_layers == 'most':
+ self.up_4 = SPADEResnetBlock(1 * nf, nf // 2, self.opt)
+ final_nc = nf // 2
+
+ self.conv_img = nn.Conv2D(final_nc, 3, 3, 1, 1)
+
+ self.up = nn.Upsample(scale_factor=2)
+
+ def forward(self, input, z=None):
+ seg = input
+ if self.opt.use_vae:
+ x = self.fc(z)
+ x = paddle.reshape(x, [-1, 16 * self.opt.nef, self.sh, self.sw])
+ else:
+ x = F.interpolate(seg, (self.sh, self.sw))
+ x = self.fc(x)
+ x = self.head_0(x, seg)
+
+ x = self.up(x)
+ x = self.G_middle_0(x, seg)
+
+ if self.opt.num_upsampling_layers == 'more' or \
+ self.opt.num_upsampling_layers == 'most':
+ x = self.up(x)
+
+ x = self.G_middle_1(x, seg)
+
+ x = self.up(x)
+ x = self.up_0(x, seg)
+ x = self.up(x)
+ x = self.up_1(x, seg)
+ x = self.up(x)
+ x = self.up_2(x, seg)
+ x = self.up(x)
+ x = self.up_3(x, seg)
+
+ if self.opt.num_upsampling_layers == 'most':
+ x = self.up(x)
+ x = self.up_4(x, seg)
+
+ x = self.conv_img(F.gelu(x))
+ x = F.tanh(x)
+
+ return x
+
+ def compute_latent_vector_size(self, opt):
+ if opt.num_upsampling_layers == 'normal':
+ num_up_layers = 5
+ elif opt.num_upsampling_layers == 'more':
+ num_up_layers = 6
+ elif opt.num_upsampling_layers == 'most':
+ num_up_layers = 7
+ else:
+ raise ValueError('opt.num_upsampling_layers [%s] not recognized' %
+ opt.num_upsampling_layers)
+
+ sw = opt.crop_size // (2**num_up_layers)
+ sh = round(sw / opt.aspect_ratio)
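+        # e.g. crop_size=256, num_upsampling_layers='normal' (5 up layers) and
+        # aspect_ratio=1 give sw = 256 // 32 = 8 and sh = 8, so generation
+        # starts from an 8x8 map and is upsampled back to 256x256.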
+
+ return sw, sh
+
+class VAE_Encoder(nn.Layer):
+ def __init__(self, opt):
+ super(VAE_Encoder, self).__init__()
+
+ kw = 3
+ pw = int(np.ceil((kw - 1.0) / 2))
+ ndf = opt.nef
+
+ InstanceNorm = build_norm_layer('instance')
+ model = [
+ spectral_norm(nn.Conv2D(3, ndf, kw, 2, pw,
+ weight_attr=None,
+ bias_attr=None)),
+ InstanceNorm(ndf),
+
+ nn.GELU(),
+ spectral_norm(nn.Conv2D(ndf * 1, ndf * 2, kw, 2, pw,
+ weight_attr=None,
+ bias_attr=None)),
+ InstanceNorm(ndf * 2),
+
+ nn.GELU(),
+ spectral_norm(nn.Conv2D(ndf * 2, ndf * 4, kw, 2, pw,
+ weight_attr=None,
+ bias_attr=None)),
+ InstanceNorm(ndf * 4),
+
+ nn.GELU(),
+ spectral_norm(nn.Conv2D(ndf * 4, ndf * 8, kw, 2, pw,
+ weight_attr=None,
+ bias_attr=None)),
+ InstanceNorm(ndf * 8),
+
+ nn.GELU(),
+ spectral_norm(nn.Conv2D(ndf * 8, ndf * 8, kw, 2, pw,
+ weight_attr=None,
+ bias_attr=None)),
+ InstanceNorm(ndf * 8),
+ ]
+ if opt.crop_size >= 256:
+ model += [
+ nn.GELU(),
+ spectral_norm(nn.Conv2D(ndf * 8, ndf * 8, kw, 2, pw,
+ weight_attr=None,
+ bias_attr=None)),
+ InstanceNorm(ndf * 8),
+ ]
+ model += [nn.GELU(),]
+
+ self.flatten = nn.Flatten(1, -1)
+ self.so = 4
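+        # so=4 assumes the encoder output is a 4x4 map, e.g. crop_size 256 with
+        # the extra downsampling block above (256 / 2**6 = 4).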
+ self.fc_mu = nn.Linear(ndf * 8 * self.so * self.so, opt.z_dim)
+ self.fc_var = nn.Linear(ndf * 8 * self.so * self.so, opt.z_dim)
+
+ self.model = nn.Sequential(*model)
+
+ def forward(self, x):
+ x = self.model(x)
+
+ x = self.flatten(x)
+
+ return self.fc_mu(x), self.fc_var(x)
+
diff --git a/ppgan/models/generators/generator_firstorder.py b/ppgan/models/generators/generator_firstorder.py
new file mode 100755
index 0000000000000000000000000000000000000000..61f488b568b09e71bcaa3c5da7557a5d275f6d1f
--- /dev/null
+++ b/ppgan/models/generators/generator_firstorder.py
@@ -0,0 +1,297 @@
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from ppgan.models.generators.builder import GENERATORS
+from .occlusion_aware import OcclusionAwareGenerator
+from ...modules.first_order import make_coordinate_grid, ImagePyramide, detach_kp
+from ...modules.keypoint_detector import KPDetector
+
+import paddle.vision.models.vgg as vgg
+from ppgan.utils.download import get_path_from_url
+
+
+@GENERATORS.register()
+class FirstOrderGenerator(nn.Layer):
+ """
+ Args:
+        kp_detector_cfg:
+          temperature (float): temperature parameter of the softmax
+          block_expansion (int): block_expansion * (2**i) output features for each block i
+          max_features (int): input features cannot be larger than max_features when encoding images
+          num_blocks (int): number of blocks for encoding images
+        generator_cfg:
+          block_expansion (int): block_expansion * (2**i) output features for each block i
+          max_features (int): input features cannot be larger than max_features when encoding images
+          num_down_blocks (int): number of downsampling blocks in the encoder
+          num_bottleneck_blocks (int): number of blocks in the decoder
+          estimate_occlusion_map (bool): whether to estimate the occlusion map
+ common_params:
+ num_kp (int): number of keypoints
+ num_channels (int): image channels
+ estimate_jacobian (bool): whether to estimate jacobian values of keypoints
+ train_params:
+        transform_params: parameters of the random transform applied to keypoints and their jacobians
+        scales: scales of the image pyramid used to extract features
+ loss_weights: weight of [generator, discriminator, feature_matching, perceptual,
+ equivariance_value, equivariance_jacobian]
+
+ """
+ def __init__(self, generator_cfg, kp_detector_cfg, common_params,
+ train_params, dis_scales):
+ super(FirstOrderGenerator, self).__init__()
+ self.kp_extractor = KPDetector(**kp_detector_cfg, **common_params)
+ self.generator = OcclusionAwareGenerator(**generator_cfg,
+ **common_params)
+ self.train_params = train_params
+ self.scales = train_params['scales']
+ self.disc_scales = dis_scales
+ self.pyramid = ImagePyramide(self.scales, self.generator.num_channels)
+ self.loss_weights = train_params['loss_weights']
+ if sum(self.loss_weights['perceptual']) != 0:
+ self.vgg = VGG19()
+
+ def forward(self, x, discriminator, kp_extractor_ori=None):
+ kp_source = self.kp_extractor(x['source'])
+ kp_driving = self.kp_extractor(x['driving'])
+ generated = self.generator(x['source'],
+ kp_source=kp_source,
+ kp_driving=kp_driving)
+ generated.update({'kp_source': kp_source, 'kp_driving': kp_driving})
+
+ loss_values = {}
+
+ pyramide_real = self.pyramid(x['driving'])
+ pyramide_generated = self.pyramid(generated['prediction'])
+ # VGG19 perceptual Loss
+ if sum(self.loss_weights['perceptual']) != 0:
+ value_total = 0
+ for scale in self.scales:
+ x_vgg = self.vgg(pyramide_generated['prediction_' + str(scale)])
+ y_vgg = self.vgg(pyramide_real['prediction_' + str(scale)])
+
+ for i, weight in enumerate(self.loss_weights['perceptual']):
+ value = paddle.abs(x_vgg[i] - y_vgg[i].detach()).mean()
+ value_total += self.loss_weights['perceptual'][i] * value
+ loss_values['perceptual'] = value_total
+
+ # Generator Loss
+ if self.loss_weights['generator_gan'] != 0:
+ discriminator_maps_generated = discriminator(
+ pyramide_generated, kp=detach_kp(kp_driving))
+ discriminator_maps_real = discriminator(pyramide_real,
+ kp=detach_kp(kp_driving))
+ value_total = 0
+ for scale in self.disc_scales:
+ key = 'prediction_map_%s' % scale
+ value = ((1 - discriminator_maps_generated[key])**2).mean()
+ value_total += self.loss_weights['generator_gan'] * value
+ loss_values['gen_gan'] = value_total
+ # Feature matching Loss
+ if sum(self.loss_weights['feature_matching']) != 0:
+ value_total = 0
+ for scale in self.disc_scales:
+ key = 'feature_maps_%s' % scale
+ for i, (a, b) in enumerate(
+ zip(discriminator_maps_real[key],
+ discriminator_maps_generated[key])):
+
+ if self.loss_weights['feature_matching'][i] == 0:
+ continue
+ value = paddle.abs(a - b).mean()
+ value_total += self.loss_weights['feature_matching'][
+ i] * value
+ loss_values['feature_matching'] = value_total
+ if (self.loss_weights['equivariance_value'] +
+ self.loss_weights['equivariance_jacobian']) != 0:
+ transform = Transform(x['driving'].shape[0],
+ **self.train_params['transform_params'])
+ transformed_frame = transform.transform_frame(x['driving'])
+ transformed_kp = self.kp_extractor(transformed_frame)
+ generated['transformed_frame'] = transformed_frame
+ generated['transformed_kp'] = transformed_kp
+
+ # Value loss part
+ if self.loss_weights['equivariance_value'] != 0:
+ value = paddle.abs(
+ kp_driving['value'] -
+ transform.warp_coordinates(transformed_kp['value'])).mean()
+ loss_values['equivariance_value'] = self.loss_weights[
+ 'equivariance_value'] * value
+
+ # jacobian loss part
+ if self.loss_weights['equivariance_jacobian'] != 0:
+ jacobian_transformed = paddle.matmul(
+ *broadcast(transform.jacobian(transformed_kp['value']),
+ transformed_kp['jacobian']))
+ normed_driving = paddle.inverse(kp_driving['jacobian'])
+ normed_transformed = jacobian_transformed
+ value = paddle.matmul(
+ *broadcast(normed_driving, normed_transformed))
+ eye = paddle.tensor.eye(2, dtype='float32').reshape(
+ (1, 1, 2, 2))
+ eye = paddle.tile(eye, [1, value.shape[1], 1, 1])
+ value = paddle.abs(eye - value).mean()
+ loss_values['equivariance_jacobian'] = self.loss_weights[
+ 'equivariance_jacobian'] * value
+
+ if kp_extractor_ori is not None:
+ recon_loss = paddle.nn.loss.L1Loss()
+
+ kp_distillation_loss_source = recon_loss(
+ kp_extractor_ori(x['source'])['value'],
+ self.kp_extractor(x['source'])['value'])
+ kp_distillation_loss_driving = recon_loss(
+ kp_extractor_ori(x['driving'])['value'],
+ self.kp_extractor(x['driving'])['value'])
+ loss_values[
+ "kp_distillation_loss"] = kp_distillation_loss_source + kp_distillation_loss_driving
+
+ return loss_values, generated
+
+
+class VGG19(nn.Layer):
+ """
+ Vgg19 network for perceptual loss. See Sec 3.3.
+ """
+ def __init__(self, requires_grad=False):
+ super(VGG19, self).__init__()
+ pretrained_url = 'https://paddlegan.bj.bcebos.com/models/vgg19.pdparams'
+ weight_path = get_path_from_url(pretrained_url)
+ state_dict = paddle.load(weight_path)
+ _vgg = getattr(vgg, 'vgg19')()
+ _vgg.load_dict(state_dict)
+ vgg_pretrained_features = _vgg.features
+ self.slice1 = paddle.nn.Sequential()
+ self.slice2 = paddle.nn.Sequential()
+ self.slice3 = paddle.nn.Sequential()
+ self.slice4 = paddle.nn.Sequential()
+ self.slice5 = paddle.nn.Sequential()
+ for x in range(2):
+ self.slice1.add_sublayer(str(x), vgg_pretrained_features[x])
+ for x in range(2, 7):
+ self.slice2.add_sublayer(str(x), vgg_pretrained_features[x])
+ for x in range(7, 12):
+ self.slice3.add_sublayer(str(x), vgg_pretrained_features[x])
+ for x in range(12, 21):
+ self.slice4.add_sublayer(str(x), vgg_pretrained_features[x])
+ for x in range(21, 30):
+ self.slice5.add_sublayer(str(x), vgg_pretrained_features[x])
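+            # slice1..slice5 end right after relu1_1, relu2_1, relu3_1, relu4_1
+            # and relu5_1; their outputs are the feature levels weighted by
+            # loss_weights['perceptual'].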
+
+ self.register_buffer(
+ 'mean',
+ paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1]))
+ # the std is for image with range [-1, 1]
+ self.register_buffer(
+ 'std',
+ paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1]))
+ if not requires_grad:
+ for param in self.parameters():
+ param.stop_gradient = True
+
+ def forward(self, x):
+ x = (x - self.mean) / self.std
+ h_relu1 = self.slice1(x)
+ h_relu2 = self.slice2(h_relu1)
+ h_relu3 = self.slice3(h_relu2)
+ h_relu4 = self.slice4(h_relu3)
+ h_relu5 = self.slice5(h_relu4)
+ out = [h_relu1, h_relu2, h_relu3, h_relu4, h_relu5]
+ return out
+
+
+class Transform:
+ """
+    Random TPS (thin-plate spline) transformation for the equivariance constraints. See Sec 3.3
+ """
+ def __init__(self, bs, **kwargs):
+ noise = paddle.distribution.Normal(loc=[0],
+ scale=[kwargs['sigma_affine']
+ ]).sample([bs, 2, 3])
+ noise = noise.reshape((bs, 2, 3))
+ self.theta = noise + paddle.tensor.eye(2, 3, dtype='float32').reshape(
+ (1, 2, 3))
+ self.bs = bs
+
+ if ('sigma_tps' in kwargs) and ('points_tps' in kwargs):
+ self.tps = True
+ self.control_points = make_coordinate_grid(
+ (kwargs['points_tps'], kwargs['points_tps'])).unsqueeze(0)
+ buf = paddle.distribution.Normal(
+ loc=[0], scale=[kwargs['sigma_tps']
+ ]).sample([bs, 1, kwargs['points_tps']**2])
+ self.control_params = buf.reshape((bs, 1, kwargs['points_tps']**2))
+ else:
+ self.tps = False
+
+ def transform_frame(self, frame):
+ grid = make_coordinate_grid(frame.shape[2:], 'float32').unsqueeze(0)
+ grid = grid.reshape((1, frame.shape[2] * frame.shape[3], 2))
+ grid = self.warp_coordinates(grid).reshape(
+ (self.bs, frame.shape[2], frame.shape[3], 2))
+ return F.grid_sample(frame,
+ grid,
+ mode='bilinear',
+ padding_mode='reflection',
+ align_corners=True)
+
+ def warp_coordinates(self, coordinates):
+ theta = self.theta.astype('float32')
+ theta = theta.unsqueeze(1)
+ coordinates = coordinates.unsqueeze(-1)
+
+        # If x1: (1, 5, 2, 2) and x2: (10, 100, 2, 1),
+        # torch.matmul can broadcast x1 and x2 to (10, 100, ...).
+        # In Paddle this broadcasting must be done manually (see broadcast()).
+ theta_part_a = theta[:, :, :, :2]
+ theta_part_b = theta[:, :, :, 2:]
+
+ transformed = paddle.matmul(
+ *broadcast(theta_part_a, coordinates)) + theta_part_b #M*p + m0
+ transformed = transformed.squeeze(-1)
+ if self.tps:
+ control_points = self.control_points.astype('float32')
+ control_params = self.control_params.astype('float32')
+ distances = coordinates.reshape(
+ (coordinates.shape[0], -1, 1, 2)) - control_points.reshape(
+ (1, 1, -1, 2))
+ distances = distances.abs().sum(-1)
+
+ result = distances * distances
+ result = result * paddle.log(distances + 1e-6)
+ result = result * control_params
+ result = result.sum(2).reshape((self.bs, coordinates.shape[1], 1))
+ transformed = transformed + result
+ return transformed
+
+ def jacobian(self, coordinates):
+ new_coordinates = self.warp_coordinates(coordinates)
+ assert len(new_coordinates.shape) == 3
+ grad_x = paddle.grad(new_coordinates[:, :, 0].sum(),
+ coordinates,
+ create_graph=True)
+ grad_y = paddle.grad(new_coordinates[:, :, 1].sum(),
+ coordinates,
+ create_graph=True)
+ jacobian = paddle.concat(
+ [grad_x[0].unsqueeze(-2), grad_y[0].unsqueeze(-2)], axis=-2)
+ return jacobian
+
+
+def broadcast(x, y):
+ """
+ Broadcast before matmul
+ """
+ if len(x.shape) != len(y.shape):
+ raise ValueError(x.shape, '!=', y.shape)
+ *dim_x, _, _ = x.shape
+ *dim_y, _, _ = y.shape
+ max_shape = np.max(np.stack([dim_x, dim_y], axis=0), axis=0)
+ x_bc = paddle.broadcast_to(x, (*max_shape, x.shape[-2], x.shape[-1]))
+ y_bc = paddle.broadcast_to(y, (*max_shape, y.shape[-2], y.shape[-1]))
+ return x_bc, y_bc
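+
+
+# Shape-only example of the helper above, mirroring torch.matmul's implicit
+# batch broadcasting:
+#   a = paddle.randn([1, 5, 2, 2])
+#   b = paddle.randn([10, 5, 2, 1])
+#   out = paddle.matmul(*broadcast(a, b))  # shape [10, 5, 2, 1]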
diff --git a/ppgan/models/generators/generator_gpen.py b/ppgan/models/generators/generator_gpen.py
new file mode 100644
index 0000000000000000000000000000000000000000..d73690bfd07a8d98cd7b10eb26d7f01ca326d8af
--- /dev/null
+++ b/ppgan/models/generators/generator_gpen.py
@@ -0,0 +1,538 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# code was heavily based on https://github.com/rosinality/stylegan2-pytorch
+# MIT License
+# Copyright (c) 2019 Kim Seonghyeon
+
+import math
+import random
+import itertools
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppgan.models.generators.builder import GENERATORS
+from ppgan.models.discriminators.discriminator_styleganv2 import ConvLayer
+from ppgan.modules.equalized import EqualLinear_gpen as EqualLinear
+from ppgan.modules.fused_act import FusedLeakyReLU
+from ppgan.modules.upfirdn2d import Upfirdn2dUpsample, Upfirdn2dBlur
+
+
+class PixelNorm(nn.Layer):
+
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, inputs):
+ return inputs * paddle.rsqrt(
+ paddle.mean(inputs * inputs, 1, keepdim=True) + 1e-8)
+
+
+class ModulatedConv2D(nn.Layer):
+
+ def __init__(
+ self,
+ in_channel,
+ out_channel,
+ kernel_size,
+ style_dim,
+ demodulate=True,
+ upsample=False,
+ downsample=False,
+ blur_kernel=[1, 3, 3, 1],
+ ):
+ super().__init__()
+
+ self.eps = 1e-8
+ self.kernel_size = kernel_size
+ self.in_channel = in_channel
+ self.out_channel = out_channel
+ self.upsample = upsample
+ self.downsample = downsample
+
+ if upsample:
+ factor = 2
+ p = (len(blur_kernel) - factor) - (kernel_size - 1)
+ pad0 = (p + 1) // 2 + factor - 1
+ pad1 = p // 2 + 1
+
+ self.blur = Upfirdn2dBlur(blur_kernel,
+ pad=(pad0, pad1),
+ upsample_factor=factor)
+
+ if downsample:
+ factor = 2
+ p = (len(blur_kernel) - factor) + (kernel_size - 1)
+ pad0 = (p + 1) // 2
+ pad1 = p // 2
+
+ self.blur = Upfirdn2dBlur(blur_kernel, pad=(pad0, pad1))
+
+ fan_in = in_channel * (kernel_size * kernel_size)
+ self.scale = 1 / math.sqrt(fan_in)
+ self.padding = kernel_size // 2
+
+ self.weight = self.create_parameter(
+ (1, out_channel, in_channel, kernel_size, kernel_size),
+ default_initializer=nn.initializer.Normal())
+
+ self.modulation = EqualLinear(style_dim, in_channel, bias_init=1)
+
+ self.demodulate = demodulate
+
+ def __repr__(self):
+ return (
+ f"{self.__class__.__name__}({self.in_channel}, {self.out_channel}, {self.kernel_size}, "
+ f"upsample={self.upsample}, downsample={self.downsample})")
+
+ def forward(self, inputs, style):
+ batch, in_channel, height, width = inputs.shape
+
+ style = self.modulation(style).reshape((batch, 1, in_channel, 1, 1))
+ weight = self.scale * self.weight * style
+
+ if self.demodulate:
+ demod = paddle.rsqrt((weight * weight).sum([2, 3, 4]) + 1e-8)
+ weight = weight * demod.reshape((batch, self.out_channel, 1, 1, 1))
+
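+        # Fold the batch dimension into the output channels so a single grouped
+        # convolution (groups=batch) applies a different style-modulated kernel
+        # to every sample.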
+ weight = weight.reshape((batch * self.out_channel, in_channel,
+ self.kernel_size, self.kernel_size))
+
+ if self.upsample:
+ inputs = inputs.reshape((1, batch * in_channel, height, width))
+ weight = weight.reshape((batch, self.out_channel, in_channel,
+ self.kernel_size, self.kernel_size))
+ weight = weight.transpose((0, 2, 1, 3, 4)).reshape(
+ (batch * in_channel, self.out_channel, self.kernel_size,
+ self.kernel_size))
+ out = F.conv2d_transpose(inputs,
+ weight,
+ padding=0,
+ stride=2,
+ groups=batch)
+ _, _, height, width = out.shape
+ out = out.reshape((batch, self.out_channel, height, width))
+ out = self.blur(out)
+
+ elif self.downsample:
+ inputs = self.blur(inputs)
+ _, _, height, width = inputs.shape
+ inputs = inputs.reshape((1, batch * in_channel, height, width))
+ out = F.conv2d(inputs, weight, padding=0, stride=2, groups=batch)
+ _, _, height, width = out.shape
+ out = out.reshape((batch, self.out_channel, height, width))
+
+ else:
+ inputs = inputs.reshape((1, batch * in_channel, height, width))
+ out = F.conv2d(inputs, weight, padding=self.padding, groups=batch)
+ _, _, height, width = out.shape
+ out = out.reshape((batch, self.out_channel, height, width))
+
+ return out
+
+
+class NoiseInjection(nn.Layer):
+
+ def __init__(self, is_concat=False):
+ super().__init__()
+
+ self.weight = self.create_parameter(
+ (1, ), default_initializer=nn.initializer.Constant(0.0))
+ self.is_concat = is_concat
+
+ def forward(self, image, noise=None):
+ if noise is None:
+ batch, _, height, width = image.shape
+ noise = paddle.randn((batch, 1, height, width))
+ if self.is_concat:
+ return paddle.concat([image, self.weight * noise], axis=1)
+ else:
+ return image + self.weight * noise
+
+
+class ConstantInput(nn.Layer):
+
+ def __init__(self, channel, size=4):
+ super().__init__()
+
+ self.input = self.create_parameter(
+ (1, channel, size, size),
+ default_initializer=nn.initializer.Normal())
+
+ def forward(self, inputs):
+ batch = inputs.shape[0]
+ out = self.input.tile((batch, 1, 1, 1))
+
+ return out
+
+
+class StyledConv(nn.Layer):
+
+ def __init__(self,
+ in_channel,
+ out_channel,
+ kernel_size,
+ style_dim,
+ upsample=False,
+ blur_kernel=[1, 3, 3, 1],
+ demodulate=True,
+ is_concat=False):
+ super().__init__()
+
+ self.conv = ModulatedConv2D(
+ in_channel,
+ out_channel,
+ kernel_size,
+ style_dim,
+ upsample=upsample,
+ blur_kernel=blur_kernel,
+ demodulate=demodulate,
+ )
+
+ self.noise = NoiseInjection(is_concat=is_concat)
+ self.activate = FusedLeakyReLU(out_channel *
+ 2 if is_concat else out_channel)
+
+ def forward(self, inputs, style, noise=None):
+ out = self.conv(inputs, style)
+ out = self.noise(out, noise=noise)
+ out = self.activate(out)
+
+ return out
+
+
+class ToRGB(nn.Layer):
+
+ def __init__(self,
+ in_channel,
+ style_dim,
+ upsample=True,
+ blur_kernel=[1, 3, 3, 1]):
+ super().__init__()
+
+ if upsample:
+ self.upsample = Upfirdn2dUpsample(blur_kernel)
+
+ self.conv = ModulatedConv2D(in_channel,
+ 3,
+ 1,
+ style_dim,
+ demodulate=False)
+ self.bias = self.create_parameter((1, 3, 1, 1),
+ nn.initializer.Constant(0.0))
+
+ def forward(self, inputs, style, skip=None):
+ out = self.conv(inputs, style)
+ out = out + self.bias
+
+ if skip is not None:
+ skip = self.upsample(skip)
+
+ out = out + skip
+
+ return out
+
+
+class StyleGANv2Generator(nn.Layer):
+
+ def __init__(self,
+ size,
+ style_dim,
+ n_mlp,
+ channel_multiplier=1,
+ narrow=0.5,
+ blur_kernel=[1, 3, 3, 1],
+ lr_mlp=0.01,
+ is_concat=True):
+ super().__init__()
+
+ self.size = size
+
+        self.style_dim = style_dim
+        self.n_mlp = n_mlp
+
+ layers = [PixelNorm()]
+
+ for i in range(n_mlp):
+ layers.append(
+ EqualLinear(style_dim,
+ style_dim,
+ lr_mul=lr_mlp,
+ activation="fused_lrelu"))
+
+ self.style = nn.Sequential(*layers)
+
+ self.channels = {
+ 4: int(512 * narrow),
+ 8: int(512 * narrow),
+ 16: int(512 * narrow),
+ 32: int(512 * narrow),
+ 64: int(256 * channel_multiplier * narrow),
+ 128: int(128 * channel_multiplier * narrow),
+ 256: int(64 * channel_multiplier * narrow),
+ 512: int(32 * channel_multiplier * narrow),
+ 1024: int(16 * channel_multiplier * narrow),
+ 2048: int(8 * channel_multiplier * narrow)
+ }
+
+ self.input = ConstantInput(self.channels[4])
+ self.conv1 = StyledConv(self.channels[4],
+ self.channels[4],
+ 3,
+ style_dim,
+ blur_kernel=blur_kernel,
+ is_concat=is_concat)
+ self.to_rgb1 = ToRGB(self.channels[4] *
+ 2 if is_concat else self.channels[4],
+ style_dim,
+ upsample=False)
+
+ self.log_size = int(math.log(size, 2))
+ self.num_layers = (self.log_size - 2) * 2 + 1
+
+ self.convs = nn.LayerList()
+ self.upsamples = nn.LayerList()
+ self.to_rgbs = nn.LayerList()
+ self.noises = nn.Layer()
+
+ in_channel = self.channels[4]
+
+ for layer_idx in range(self.num_layers):
+ res = (layer_idx + 5) // 2
+ shape = [1, 1, 2**res, 2**res]
+ self.noises.register_buffer(f"noise_{layer_idx}",
+ paddle.randn(shape))
+
+ for i in range(3, self.log_size + 1):
+ out_channel = self.channels[2**i]
+
+ self.convs.append(
+ StyledConv(
+ in_channel * 2 if is_concat else in_channel,
+ out_channel,
+ 3,
+ style_dim,
+ upsample=True,
+ blur_kernel=blur_kernel,
+ is_concat=is_concat,
+ ))
+
+ self.convs.append(
+ StyledConv(out_channel * 2 if is_concat else out_channel,
+ out_channel,
+ 3,
+ style_dim,
+ blur_kernel=blur_kernel,
+ is_concat=is_concat))
+
+ self.to_rgbs.append(
+ ToRGB(out_channel * 2 if is_concat else out_channel, style_dim))
+
+ in_channel = out_channel
+
+ self.n_latent = self.log_size * 2 - 2
+ self.is_concat = is_concat
+
+ def make_noise(self):
+ noises = [paddle.randn((1, 1, 2**2, 2**2))]
+
+ for i in range(3, self.log_size + 1):
+ for _ in range(2):
+ noises.append(paddle.randn((1, 1, 2**i, 2**i)))
+
+ return noises
+
+ def mean_latent(self, n_latent):
+ latent_in = paddle.randn((n_latent, self.style_dim))
+ latent = self.style(latent_in).mean(0, keepdim=True)
+
+ return latent
+
+ def get_latent(self, inputs):
+ return self.style(inputs)
+
+ def get_mean_style(self):
+ mean_style = None
+ with paddle.no_grad():
+ for i in range(10):
+ style = self.mean_latent(1024)
+ if mean_style is None:
+ mean_style = style
+ else:
+ mean_style += style
+
+ mean_style /= 10
+ return mean_style
+
+ def forward(
+ self,
+ styles,
+ return_latents=False,
+ inject_index=None,
+ truncation=1,
+ truncation_latent=None,
+ input_is_latent=False,
+ noise=None,
+ ):
+ if not input_is_latent:
+ styles = [self.style(s) for s in styles]
+
+ if noise is None:
+ '''
+ noise = [None] * (2 * (self.log_size - 2) + 1)
+ '''
+ noise = []
+ batch = styles[0].shape[0]
+ for i in range(self.n_mlp + 1):
+ size = 2**(i + 2)
+ noise.append(
+ paddle.create_parameter(
+ [batch, self.channels[size], size, size],
+ dtype='float32',
+ attr=paddle.ParamAttr(
+ initializer=nn.initializer.Constant(0),
+ trainable=True)))
+
+ if truncation < 1:
+ style_t = []
+
+ for style in styles:
+ style_t.append(truncation_latent + truncation *
+ (style - truncation_latent))
+
+ styles = style_t
+
+ if len(styles) < 2:
+ inject_index = self.n_latent
+
+ latent = styles[0].unsqueeze(1)
+ latent = paddle.tile(latent, repeat_times=[1, inject_index, 1])
+ else:
+ if inject_index is None:
+ inject_index = random.randint(1, self.n_latent - 1)
+
+ latent = paddle.tile(styles[0].unsqueeze(1),
+ repeat_times=[1, inject_index, 1])
+ latent2 = paddle.tile(
+ styles[1].unsqueeze(1),
+ repeat_times=[1, self.n_latent - inject_index, 1])
+
+ latent = paddle.concat([latent, latent2], 1)
+
+ out = self.input(latent)
+ out = self.conv1(out, latent[:, 0], noise=noise[0])
+
+ skip = self.to_rgb1(out, latent[:, 1])
+
+ i = 1
+ for conv1, conv2, noise1, noise2, to_rgb in zip(self.convs[::2],
+ self.convs[1::2],
+ noise[1::2],
+ noise[2::2],
+ self.to_rgbs):
+ out = conv1(out, latent[:, i], noise=noise1)
+ out = conv2(out, latent[:, i + 1], noise=noise2)
+ skip = to_rgb(out, latent[:, i + 2], skip)
+
+ i += 2
+
+ image = skip
+
+ if return_latents:
+ return image, latent
+
+ else:
+ return image, None
+
+@GENERATORS.register()
+class GPENGenerator(nn.Layer):
+
+ def __init__(
+ self,
+ size,
+ style_dim,
+ n_mlp,
+ channel_multiplier=2,
+ narrow=1,
+ blur_kernel=[1, 3, 3, 1],
+ lr_mlp=0.01,
+ is_concat=True,
+ ):
+ super(GPENGenerator, self).__init__()
+ channels = {
+ 4: int(512 * narrow),
+ 8: int(512 * narrow),
+ 16: int(512 * narrow),
+ 32: int(512 * narrow),
+ 64: int(256 * channel_multiplier * narrow),
+ 128: int(128 * channel_multiplier * narrow),
+ 256: int(64 * channel_multiplier * narrow),
+ 512: int(32 * channel_multiplier * narrow),
+ 1024: int(16 * channel_multiplier * narrow),
+ 2048: int(8 * channel_multiplier * narrow)
+ }
+ self.log_size = int(math.log(size, 2))
+ self.generator = StyleGANv2Generator(
+ size,
+ style_dim,
+ n_mlp,
+ channel_multiplier=channel_multiplier,
+ narrow=narrow,
+ blur_kernel=blur_kernel,
+ lr_mlp=lr_mlp,
+ is_concat=is_concat)
+
+ conv = [ConvLayer(3, channels[size], 1)]
+ self.ecd0 = nn.Sequential(*conv)
+ in_channel = channels[size]
+
+ self.names = ['ecd%d' % i for i in range(self.log_size - 1)]
+ for i in range(self.log_size, 2, -1):
+ out_channel = channels[2**(i - 1)]
+ conv = [ConvLayer(in_channel, out_channel, 3, downsample=True)]
+ setattr(self, self.names[self.log_size - i + 1],
+ nn.Sequential(*conv))
+ in_channel = out_channel
+ self.final_linear = nn.Sequential(
+ EqualLinear(channels[4] * 4 * 4,
+ style_dim,
+ activation='fused_lrelu'))
+
+ def forward(
+ self,
+ inputs,
+ return_latents=False,
+ inject_index=None,
+ truncation=1,
+ truncation_latent=None,
+ input_is_latent=False,
+ ):
+ noise = []
+ for i in range(self.log_size - 1):
+ ecd = getattr(self, self.names[i])
+ inputs = ecd(inputs)
+ noise.append(inputs)
+ inputs = inputs.reshape([inputs.shape[0], -1])
+ outs = self.final_linear(inputs)
+ noise = list(
+ itertools.chain.from_iterable(
+ itertools.repeat(x, 2) for x in noise))[::-1]
+ outs = self.generator([outs],
+ return_latents,
+ inject_index,
+ truncation,
+ truncation_latent,
+ input_is_latent,
+ noise=noise[1:])
+ return outs
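+
+# A minimal usage sketch (illustrative values; real settings come from the GPEN
+# config). The encoder turns the degraded face into a latent code plus
+# per-resolution feature maps, which replace the decoder's random noise inputs.
+#   gen = GPENGenerator(size=256, style_dim=512, n_mlp=8)
+#   restored, _ = gen(paddle.rand([1, 3, 256, 256]))  # [1, 3, 256, 256]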
diff --git a/ppgan/models/generators/generator_pixel2style2pixel.py b/ppgan/models/generators/generator_pixel2style2pixel.py
index 1651cc54c01b45df3a837d51e53a295f4a45b199..04f57ee82a04c6caa89cffa0bd96bb35aec27a4f 100644
--- a/ppgan/models/generators/generator_pixel2style2pixel.py
+++ b/ppgan/models/generators/generator_pixel2style2pixel.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/eladrich/pixel2style2pixel
+# MIT License
+# Copyright (c) 2020 Elad Richardson, Yuval Alaluf
+
import math
import numpy as np
import paddle
@@ -41,7 +45,8 @@ class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])):
def get_block(in_channel, depth, num_units, stride=2):
- return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]
+ return [Bottleneck(in_channel, depth, stride)
+ ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]
def get_blocks(num_layers):
@@ -67,7 +72,9 @@ def get_blocks(num_layers):
get_block(in_channel=256, depth=512, num_units=3)
]
else:
- raise ValueError("Invalid number of layers: {}. Must be one of [50, 100, 152]".format(num_layers))
+ raise ValueError(
+ "Invalid number of layers: {}. Must be one of [50, 100, 152]".
+ format(num_layers))
return blocks
@@ -75,9 +82,17 @@ class SEModule(nn.Layer):
def __init__(self, channels, reduction):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2D(1)
- self.fc1 = nn.Conv2D(channels, channels // reduction, kernel_size=1, padding=0, bias_attr=False)
+ self.fc1 = nn.Conv2D(channels,
+ channels // reduction,
+ kernel_size=1,
+ padding=0,
+ bias_attr=False)
self.relu = nn.ReLU()
- self.fc2 = nn.Conv2D(channels // reduction, channels, kernel_size=1, padding=0, bias_attr=False)
+ self.fc2 = nn.Conv2D(channels // reduction,
+ channels,
+ kernel_size=1,
+ padding=0,
+ bias_attr=False)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
@@ -98,13 +113,13 @@ class BottleneckIR(nn.Layer):
else:
self.shortcut_layer = nn.Sequential(
nn.Conv2D(in_channel, depth, (1, 1), stride, bias_attr=False),
- nn.BatchNorm2D(depth)
- )
+ nn.BatchNorm2D(depth))
self.res_layer = nn.Sequential(
nn.BatchNorm2D(in_channel),
- nn.Conv2D(in_channel, depth, (3, 3), (1, 1), 1, bias_attr=False), nn.PReLU(depth),
- nn.Conv2D(depth, depth, (3, 3), stride, 1, bias_attr=False), nn.BatchNorm2D(depth)
- )
+ nn.Conv2D(in_channel, depth, (3, 3), (1, 1), 1, bias_attr=False),
+ nn.PReLU(depth),
+ nn.Conv2D(depth, depth, (3, 3), stride, 1, bias_attr=False),
+ nn.BatchNorm2D(depth))
def forward(self, x):
shortcut = self.shortcut_layer(x)
@@ -120,16 +135,13 @@ class BottleneckIRSE(nn.Layer):
else:
self.shortcut_layer = nn.Sequential(
nn.Conv2D(in_channel, depth, (1, 1), stride, bias_attr=False),
- nn.BatchNorm2D(depth)
- )
+ nn.BatchNorm2D(depth))
self.res_layer = nn.Sequential(
nn.BatchNorm2D(in_channel),
nn.Conv2D(in_channel, depth, (3, 3), (1, 1), 1, bias_attr=False),
nn.PReLU(depth),
nn.Conv2D(depth, depth, (3, 3), stride, 1, bias_attr=False),
- nn.BatchNorm2D(depth),
- SEModule(depth, 16)
- )
+ nn.BatchNorm2D(depth), SEModule(depth, 16))
def forward(self, x):
shortcut = self.shortcut_layer(x)
@@ -144,8 +156,10 @@ class GradualStyleBlock(nn.Layer):
self.spatial = spatial
num_pools = int(np.log2(spatial))
modules = []
- modules += [nn.Conv2D(in_c, out_c, kernel_size=3, stride=2, padding=1),
- nn.LeakyReLU()]
+ modules += [
+ nn.Conv2D(in_c, out_c, kernel_size=3, stride=2, padding=1),
+ nn.LeakyReLU()
+ ]
for i in range(num_pools - 1):
modules += [
nn.Conv2D(out_c, out_c, kernel_size=3, stride=2, padding=1),
@@ -164,22 +178,23 @@ class GradualStyleBlock(nn.Layer):
class GradualStyleEncoder(nn.Layer):
def __init__(self, num_layers, mode='ir', opts=None):
super(GradualStyleEncoder, self).__init__()
- assert num_layers in [50, 100, 152], 'num_layers should be 50,100, or 152'
+ assert num_layers in [50, 100,
+ 152], 'num_layers should be 50,100, or 152'
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
blocks = get_blocks(num_layers)
if mode == 'ir':
unit_module = BottleneckIR
elif mode == 'ir_se':
unit_module = BottleneckIRSE
- self.input_layer = nn.Sequential(nn.Conv2D(opts.input_nc, 64, (3, 3), 1, 1, bias_attr=False),
- nn.BatchNorm2D(64),
- nn.PReLU(64))
+ self.input_layer = nn.Sequential(
+ nn.Conv2D(opts.input_nc, 64, (3, 3), 1, 1, bias_attr=False),
+ nn.BatchNorm2D(64), nn.PReLU(64))
modules = []
for block in blocks:
for bottleneck in block:
- modules.append(unit_module(bottleneck.in_channel,
- bottleneck.depth,
- bottleneck.stride))
+ modules.append(
+ unit_module(bottleneck.in_channel, bottleneck.depth,
+ bottleneck.stride))
self.body = nn.Sequential(*modules)
self.styles = nn.LayerList()
@@ -214,7 +229,8 @@ class GradualStyleEncoder(nn.Layer):
So we choose bilinear upsample which supports arbitrary output sizes.
'''
_, _, H, W = y.shape
- return F.interpolate(x, size=(H, W), mode='bilinear', align_corners=True) + y
+ return F.interpolate(
+ x, size=(H, W), mode='bilinear', align_corners=True) + y
def forward(self, x):
x = self.input_layer(x)
@@ -249,24 +265,25 @@ class BackboneEncoderUsingLastLayerIntoW(nn.Layer):
def __init__(self, num_layers, mode='ir', opts=None):
super(BackboneEncoderUsingLastLayerIntoW, self).__init__()
print('Using BackboneEncoderUsingLastLayerIntoW')
- assert num_layers in [50, 100, 152], 'num_layers should be 50,100, or 152'
+ assert num_layers in [50, 100,
+ 152], 'num_layers should be 50,100, or 152'
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
blocks = get_blocks(num_layers)
if mode == 'ir':
unit_module = BottleneckIR
elif mode == 'ir_se':
unit_module = BottleneckIRSE
- self.input_layer = nn.Sequential(nn.Conv2D(opts.input_nc, 64, (3, 3), 1, 1, bias_attr=False),
- nn.BatchNorm2D(64),
- nn.PReLU(64))
+ self.input_layer = nn.Sequential(
+ nn.Conv2D(opts.input_nc, 64, (3, 3), 1, 1, bias_attr=False),
+ nn.BatchNorm2D(64), nn.PReLU(64))
self.output_pool = nn.AdaptiveAvgPool2D((1, 1))
self.linear = EqualLinear(512, 512, lr_mul=1)
modules = []
for block in blocks:
for bottleneck in block:
- modules.append(unit_module(bottleneck.in_channel,
- bottleneck.depth,
- bottleneck.stride))
+ modules.append(
+ unit_module(bottleneck.in_channel, bottleneck.depth,
+ bottleneck.stride))
self.body = nn.Sequential(*modules)
def forward(self, x):
@@ -282,16 +299,17 @@ class BackboneEncoderUsingLastLayerIntoWPlus(nn.Layer):
def __init__(self, num_layers, mode='ir', opts=None):
super(BackboneEncoderUsingLastLayerIntoWPlus, self).__init__()
print('Using BackboneEncoderUsingLastLayerIntoWPlus')
- assert num_layers in [50, 100, 152], 'num_layers should be 50,100, or 152'
+ assert num_layers in [50, 100,
+ 152], 'num_layers should be 50,100, or 152'
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
blocks = get_blocks(num_layers)
if mode == 'ir':
unit_module = BottleneckIR
elif mode == 'ir_se':
unit_module = BottleneckIRSE
- self.input_layer = nn.Sequential(nn.Conv2D(opts.input_nc, 64, (3, 3), 1, 1, bias_attr=False),
- nn.BatchNorm2D(64),
- nn.PReLU(64))
+ self.input_layer = nn.Sequential(
+ nn.Conv2D(opts.input_nc, 64, (3, 3), 1, 1, bias_attr=False),
+ nn.BatchNorm2D(64), nn.PReLU(64))
self.output_layer_2 = nn.Sequential(nn.BatchNorm2D(512),
nn.AdaptiveAvgPool2D((7, 7)),
Flatten(),
@@ -300,9 +318,9 @@ class BackboneEncoderUsingLastLayerIntoWPlus(nn.Layer):
modules = []
for block in blocks:
for bottleneck in block:
- modules.append(unit_module(bottleneck.in_channel,
- bottleneck.depth,
- bottleneck.stride))
+ modules.append(
+ unit_module(bottleneck.in_channel, bottleneck.depth,
+ bottleneck.stride))
self.body = nn.Sequential(*modules)
def forward(self, x):
@@ -321,15 +339,19 @@ class Pixel2Style2Pixel(nn.Layer):
self.set_opts(opts)
# Define architecture
self.encoder = self.set_encoder()
- self.decoder = StyleGANv2Generator(opts.size, opts.style_dim, opts.n_mlp, opts.channel_multiplier)
+ self.decoder = StyleGANv2Generator(opts.size, opts.style_dim,
+ opts.n_mlp, opts.channel_multiplier)
self.face_pool = nn.AdaptiveAvgPool2D((256, 256))
self.style_dim = self.decoder.style_dim
self.n_latent = self.decoder.n_latent
if self.opts.start_from_latent_avg:
if self.opts.learn_in_w:
- self.register_buffer('latent_avg', paddle.zeros([1, self.style_dim]))
+ self.register_buffer('latent_avg',
+ paddle.zeros([1, self.style_dim]))
else:
- self.register_buffer('latent_avg', paddle.zeros([1, self.n_latent, self.style_dim]))
+ self.register_buffer(
+ 'latent_avg',
+ paddle.zeros([1, self.n_latent, self.style_dim]))
def set_encoder(self):
if self.opts.encoder_type == 'GradualStyleEncoder':
@@ -337,13 +359,22 @@ class Pixel2Style2Pixel(nn.Layer):
elif self.opts.encoder_type == 'BackboneEncoderUsingLastLayerIntoW':
encoder = BackboneEncoderUsingLastLayerIntoW(50, 'ir_se', self.opts)
elif self.opts.encoder_type == 'BackboneEncoderUsingLastLayerIntoWPlus':
- encoder = BackboneEncoderUsingLastLayerIntoWPlus(50, 'ir_se', self.opts)
+ encoder = BackboneEncoderUsingLastLayerIntoWPlus(
+ 50, 'ir_se', self.opts)
else:
- raise Exception('{} is not a valid encoders'.format(self.opts.encoder_type))
+ raise Exception('{} is not a valid encoders'.format(
+ self.opts.encoder_type))
return encoder
- def forward(self, x, resize=True, latent_mask=None, input_code=False, randomize_noise=True,
- inject_latent=None, return_latents=False, alpha=None):
+ def forward(self,
+ x,
+ resize=True,
+ latent_mask=None,
+ input_code=False,
+ randomize_noise=True,
+ inject_latent=None,
+ return_latents=False,
+ alpha=None):
if input_code:
codes = x
else:
@@ -355,12 +386,12 @@ class Pixel2Style2Pixel(nn.Layer):
else:
codes = codes + self.latent_avg.tile([codes.shape[0], 1, 1])
-
if latent_mask is not None:
for i in latent_mask:
if inject_latent is not None:
if alpha is not None:
- codes[:, i] = alpha * inject_latent[:, i] + (1 - alpha) * codes[:, i]
+ codes[:, i] = alpha * inject_latent[:, i] + (
+ 1 - alpha) * codes[:, i]
else:
codes[:, i] = inject_latent[:, i]
else:
diff --git a/ppgan/models/generators/generator_singan.py b/ppgan/models/generators/generator_singan.py
new file mode 100755
index 0000000000000000000000000000000000000000..596384653f6611eb7df2f44934027eda633d04c3
--- /dev/null
+++ b/ppgan/models/generators/generator_singan.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# code was based on https://github.com/tamarott/SinGAN
+
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from .builder import GENERATORS
+
+
+class ConvBlock(nn.Sequential):
+ def __init__(self, in_channel, out_channel, ker_size, padd, stride):
+        super(ConvBlock, self).__init__()
+        self.add_sublayer('conv', nn.Conv2D(in_channel, out_channel, ker_size, stride, padd))
+        self.add_sublayer('norm', nn.BatchNorm2D(out_channel))
+        self.add_sublayer('LeakyRelu', nn.LeakyReLU(0.2))
+
+class GeneratorConcatSkip2CleanAdd(nn.Layer):
+ def __init__(self, nfc=32, min_nfc=32, input_nc=3, num_layers=5, ker_size=3, padd_size=0):
+ super(GeneratorConcatSkip2CleanAdd, self).__init__()
+ self.head = ConvBlock(input_nc, nfc, ker_size, padd_size, 1)
+ self.body = nn.Sequential()
+ for i in range(num_layers - 2):
+ N = int(nfc / pow(2, i + 1))
+ block = ConvBlock(max(2 * N, min_nfc), max(N, min_nfc), ker_size, padd_size, 1)
+ self.body.add_sublayer('block%d' % (i + 1), block)
+ self.tail = nn.Sequential(
+ nn.Conv2D(max(N, min_nfc), input_nc, ker_size, 1, padd_size),
+ nn.Tanh())
+ def forward(self, x, y):
+ x = self.head(x)
+ x = self.body(x)
+ x = self.tail(x)
+ ind = int((y.shape[2] - x.shape[2]) / 2)
+ y = y[:, :, ind: (y.shape[2] - ind), ind: (y.shape[3] - ind)]
+ return x + y
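+
+# Note: because the convolutions above use no padding, the residual branch
+# shrinks by (ker_size - 1) * num_layers pixels per spatial dimension, so `y`
+# is center-cropped to the output size of `x` before the skip addition.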
+
+@GENERATORS.register()
+class SinGANGenerator(nn.Layer):
+ def __init__(self,
+ scale_num,
+ coarsest_shape,
+ nfc_init=32,
+ min_nfc_init=32,
+ input_nc=3,
+ num_layers=5,
+ ker_size=3,
+ noise_zero_pad=True):
+ super().__init__()
+ nfc_list = [min(nfc_init * pow(2, math.floor(i / 4)), 128) for i in range(scale_num)]
+ min_nfc_list = [min(min_nfc_init * pow(2, math.floor(i / 4)), 128) for i in range(scale_num)]
+ self.generators = nn.LayerList([
+ GeneratorConcatSkip2CleanAdd(
+ nfc, min_nfc, input_nc, num_layers,
+ ker_size, 0
+ ) for nfc, min_nfc in zip(nfc_list, min_nfc_list)])
+ self._scale_num = scale_num
+ self._pad_size = int((ker_size - 1) / 2 * num_layers)
+ self.noise_pad = nn.Pad2D(self._pad_size if noise_zero_pad else 0)
+ self.image_pad = nn.Pad2D(self._pad_size)
+ self._noise_zero_pad = noise_zero_pad
+ self._coarsest_shape = coarsest_shape
+ self.register_buffer('scale_num', paddle.to_tensor(scale_num, 'int32'), True)
+ self.register_buffer('coarsest_shape', paddle.to_tensor(coarsest_shape, 'int32'), True)
+ self.register_buffer('nfc_init', paddle.to_tensor(nfc_init, 'int32'), True)
+ self.register_buffer('min_nfc_init', paddle.to_tensor(min_nfc_init, 'int32'), True)
+ self.register_buffer('num_layers', paddle.to_tensor(num_layers, 'int32'), True)
+ self.register_buffer('ker_size', paddle.to_tensor(ker_size, 'int32'), True)
+ self.register_buffer('noise_zero_pad', paddle.to_tensor(noise_zero_pad, 'bool'), True)
+ self.register_buffer('sigma', paddle.ones([scale_num]), True)
+ self.register_buffer('scale_factor', paddle.ones([1]), True)
+ self.register_buffer(
+ 'z_fixed',
+ paddle.randn(
+ F.pad(
+ paddle.zeros(coarsest_shape),
+ [0 if noise_zero_pad else self._pad_size] * 4).shape), True)
+
+ def forward(self, z_pyramid, x_prev, stop_scale, start_scale=0):
+ stop_scale %= self._scale_num
+ start_scale %= self._scale_num
+ for i, scale in enumerate(range(start_scale, stop_scale + 1)):
+ x_prev = self.image_pad(x_prev)
+ z = self.noise_pad(z_pyramid[i] * self.sigma[scale]) + x_prev
+ x_prev = self.generators[scale](
+ z.detach(),
+ x_prev.detach()
+ )
+ if scale < stop_scale:
+ x_prev = F.interpolate(x_prev,
+ F.pad(z_pyramid[i + 1], [0 if self._noise_zero_pad else -self._pad_size] * 4).shape[-2:],
+ None, 'bicubic')
+ return x_prev
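+
+# Minimal usage sketch (the scale count and shapes below are assumed for
+# illustration, not prescribed by this file); kept as a comment so the module
+# stays import-safe:
+#
+#     gen = SinGANGenerator(scale_num=8, coarsest_shape=[1, 3, 26, 26])
+#     x_prev = paddle.zeros([1, 3, 26, 26])
+#     z_pyramid = [paddle.randn([1, 3, 26, 26])]   # one noise map per used scale
+#     out = gen(z_pyramid, x_prev, stop_scale=0)   # coarsest-scale image, [1, 3, 26, 26]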
diff --git a/ppgan/models/generators/generator_starganv2.py b/ppgan/models/generators/generator_starganv2.py
new file mode 100755
index 0000000000000000000000000000000000000000..636df792e23970229453ab161ce98261bc398ae1
--- /dev/null
+++ b/ppgan/models/generators/generator_starganv2.py
@@ -0,0 +1,389 @@
+# code was heavily based on https://github.com/clovaai/stargan-v2
+# Users should be careful about adopting these functions for any commercial purposes.
+# https://github.com/clovaai/stargan-v2#license
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+
+from .builder import GENERATORS
+import numpy as np
+import math
+
+from ppgan.modules.wing import CoordConvTh, ConvBlock, HourGlass, preprocess
+
+from ppgan.utils.download import get_path_from_url
+
+FAN_WEIGHT_URL = "https://paddlegan.bj.bcebos.com/models/wing.pdparams"
+
+
+class ResBlk(nn.Layer):
+ def __init__(self,
+ dim_in,
+ dim_out,
+ actv=nn.LeakyReLU(0.2),
+ normalize=False,
+ downsample=False):
+ super().__init__()
+ self.actv = actv
+ self.normalize = normalize
+ self.downsample = downsample
+ self.learned_sc = dim_in != dim_out
+ self._build_weights(dim_in, dim_out)
+ self.maxpool = nn.AvgPool2D(kernel_size=2)
+
+ def _build_weights(self, dim_in, dim_out):
+ self.conv1 = nn.Conv2D(dim_in, dim_in, 3, 1, 1)
+ self.conv2 = nn.Conv2D(dim_in, dim_out, 3, 1, 1)
+ if self.normalize:
+ self.norm1 = nn.InstanceNorm2D(dim_in,
+ weight_attr=True,
+ bias_attr=True)
+ self.norm2 = nn.InstanceNorm2D(dim_in,
+ weight_attr=True,
+ bias_attr=True)
+ if self.learned_sc:
+ self.conv1x1 = nn.Conv2D(dim_in, dim_out, 1, 1, 0, bias_attr=False)
+
+ def _shortcut(self, x):
+ if self.learned_sc:
+ x = self.conv1x1(x)
+ if self.downsample:
+ x = self.maxpool(x)
+ return x
+
+ def _residual(self, x):
+ if self.normalize:
+ x = self.norm1(x)
+ x = self.actv(x)
+ x = self.conv1(x)
+ if self.downsample:
+ x = self.maxpool(x)
+ if self.normalize:
+ x = self.norm2(x)
+ x = self.actv(x)
+ x = self.conv2(x)
+ return x
+
+ def forward(self, x):
+ x = self._shortcut(x) + self._residual(x)
+ return x / math.sqrt(2) # unit variance
+
+
+class AdaIN(nn.Layer):
+ def __init__(self, style_dim, num_features):
+ super().__init__()
+ self.norm = nn.InstanceNorm2D(num_features,
+ weight_attr=False,
+ bias_attr=False)
+ self.fc = nn.Linear(style_dim, num_features * 2)
+
+ def forward(self, x, s):
+ h = self.fc(s)
+ # h = h.view(h.size(0), h.size(1), 1, 1)
+ h = paddle.reshape(h, (h.shape[0], h.shape[1], 1, 1))
+ gamma, beta = paddle.chunk(h, chunks=2, axis=1)
+ return (1 + gamma) * self.norm(x) + beta
+
+
+class AdainResBlk(nn.Layer):
+ def __init__(self,
+ dim_in,
+ dim_out,
+ style_dim=64,
+ w_hpf=0,
+ actv=nn.LeakyReLU(0.2),
+ upsample=False):
+ super().__init__()
+ self.w_hpf = w_hpf
+ self.actv = actv
+ self.upsample = upsample
+ self.learned_sc = dim_in != dim_out
+ self._build_weights(dim_in, dim_out, style_dim)
+
+ def _build_weights(self, dim_in, dim_out, style_dim=64):
+ self.conv1 = nn.Conv2D(dim_in, dim_out, 3, 1, 1)
+ self.conv2 = nn.Conv2D(dim_out, dim_out, 3, 1, 1)
+ self.norm1 = AdaIN(style_dim, dim_in)
+ self.norm2 = AdaIN(style_dim, dim_out)
+ if self.learned_sc:
+ self.conv1x1 = nn.Conv2D(dim_in, dim_out, 1, 1, 0, bias_attr=False)
+
+ def _shortcut(self, x):
+ if self.upsample:
+ x = F.interpolate(x, scale_factor=2, mode='nearest')
+ if self.learned_sc:
+ x = self.conv1x1(x)
+ return x
+
+ def _residual(self, x, s):
+ x = self.norm1(x, s)
+ x = self.actv(x)
+ if self.upsample:
+ x = F.interpolate(x, scale_factor=2, mode='nearest')
+ x = self.conv1(x)
+ x = self.norm2(x, s)
+ x = self.actv(x)
+ x = self.conv2(x)
+ return x
+
+ def forward(self, x, s):
+ out = self._residual(x, s)
+ if self.w_hpf == 0:
+ out = (out + self._shortcut(x)) / math.sqrt(2)
+ return out
+
+
+class HighPass(nn.Layer):
+ def __init__(self, w_hpf):
+ super(HighPass, self).__init__()
+ self.filter = paddle.to_tensor([[-1, -1, -1], [-1, 8., -1],
+ [-1, -1, -1]]) / w_hpf
+
+ def forward(self, x):
+ # filter = self.filter.unsqueeze(0).unsqueeze(1).repeat(x.size(1), 1, 1, 1)
+ filter = self.filter.unsqueeze(0).unsqueeze(1).tile(
+ [x.shape[1], 1, 1, 1])
+ return F.conv2d(x, filter, padding=1, groups=x.shape[1])
+
+
+@GENERATORS.register()
+class StarGANv2Generator(nn.Layer):
+ def __init__(self, img_size=256, style_dim=64, max_conv_dim=512, w_hpf=1):
+ super().__init__()
+ dim_in = 2**14 // img_size
+ self.img_size = img_size
+ self.from_rgb = nn.Conv2D(3, dim_in, 3, 1, 1)
+ self.encode = nn.LayerList()
+ self.decode = nn.LayerList()
+ self.to_rgb = nn.Sequential(
+ nn.InstanceNorm2D(dim_in, weight_attr=True, bias_attr=True),
+ nn.LeakyReLU(0.2), nn.Conv2D(dim_in, 3, 1, 1, 0))
+
+ # down/up-sampling blocks
+ repeat_num = int(np.log2(img_size)) - 4
+ if w_hpf > 0:
+ repeat_num += 1
+ for _ in range(repeat_num):
+ dim_out = min(dim_in * 2, max_conv_dim)
+ self.encode.append(
+ ResBlk(dim_in, dim_out, normalize=True, downsample=True))
+ if len(self.decode) == 0:
+ self.decode.append(
+ AdainResBlk(dim_out,
+ dim_in,
+ style_dim,
+ w_hpf=w_hpf,
+ upsample=True))
+ else:
+ self.decode.insert(0,
+ AdainResBlk(dim_out,
+ dim_in,
+ style_dim,
+ w_hpf=w_hpf,
+ upsample=True)) # stack-like
+ dim_in = dim_out
+
+ # bottleneck blocks
+ for _ in range(2):
+ self.encode.append(ResBlk(dim_out, dim_out, normalize=True))
+ self.decode.insert(
+ 0, AdainResBlk(dim_out, dim_out, style_dim, w_hpf=w_hpf))
+
+ if w_hpf > 0:
+ self.hpf = HighPass(w_hpf)
+
+ def forward(self, x, s, masks=None):
+ x = self.from_rgb(x)
+ cache = {}
+ for block in self.encode:
+ if (masks is not None) and (x.shape[2] in [32, 64, 128]):
+ cache[x.shape[2]] = x
+ x = block(x)
+ for block in self.decode:
+ x = block(x, s)
+ if (masks is not None) and (x.shape[2] in [32, 64, 128]):
+ mask = masks[0] if x.shape[2] in [32] else masks[1]
+ mask = F.interpolate(mask,
+ size=[x.shape[2], x.shape[2]],
+ mode='bilinear')
+ x = x + self.hpf(mask * cache[x.shape[2]])
+ return self.to_rgb(x)
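+
+# Note: when `masks` is provided, encoder features at resolutions 32/64/128 are
+# cached and re-injected through the high-pass filter during decoding, which
+# preserves fine facial structure from the source image.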
+
+
+@GENERATORS.register()
+class StarGANv2Mapping(nn.Layer):
+ def __init__(self, latent_dim=16, style_dim=64, num_domains=2):
+ super().__init__()
+ layers = []
+ layers += [nn.Linear(latent_dim, 512)]
+ layers += [nn.ReLU()]
+ for _ in range(3):
+ layers += [nn.Linear(512, 512)]
+ layers += [nn.ReLU()]
+ self.shared = nn.Sequential(*layers)
+
+ self.unshared = nn.LayerList()
+ for _ in range(num_domains):
+ self.unshared.append(
+ nn.Sequential(nn.Linear(512, 512),
+ nn.ReLU(), nn.Linear(512, 512), nn.ReLU(),
+ nn.Linear(512, 512), nn.ReLU(),
+ nn.Linear(512, style_dim)))
+
+ def forward(self, z, y):
+ h = self.shared(z)
+ out = []
+ for layer in self.unshared:
+ out += [layer(h)]
+ out = paddle.stack(out, axis=1) # (batch, num_domains, style_dim)
+ idx = paddle.to_tensor(np.array(range(y.shape[0]))).astype('int')
+ s = []
+ for i in range(idx.shape[0]):
+ s += [
+ out[idx[i].numpy().astype(np.int_).tolist()[0],
+ y[i].numpy().astype(np.int_).tolist()[0]]
+ ]
+ s = paddle.stack(s)
+ s = paddle.reshape(s, (s.shape[0], -1))
+ return s
+
+
+@GENERATORS.register()
+class StarGANv2Style(nn.Layer):
+ def __init__(self,
+ img_size=256,
+ style_dim=64,
+ num_domains=2,
+ max_conv_dim=512):
+ super().__init__()
+ dim_in = 2**14 // img_size
+ blocks = []
+ blocks += [nn.Conv2D(3, dim_in, 3, 1, 1)]
+
+ repeat_num = int(np.log2(img_size)) - 2
+ for _ in range(repeat_num):
+ dim_out = min(dim_in * 2, max_conv_dim)
+ blocks += [ResBlk(dim_in, dim_out, downsample=True)]
+ dim_in = dim_out
+
+ blocks += [nn.LeakyReLU(0.2)]
+ blocks += [nn.Conv2D(dim_out, dim_out, 4, 1, 0)]
+ blocks += [nn.LeakyReLU(0.2)]
+ self.shared = nn.Sequential(*blocks)
+
+ self.unshared = nn.LayerList()
+ for _ in range(num_domains):
+ self.unshared.append(nn.Linear(dim_out, style_dim))
+
+ def forward(self, x, y):
+ h = self.shared(x)
+ h = paddle.reshape(h, (h.shape[0], -1))
+ out = []
+ for layer in self.unshared:
+ out += [layer(h)]
+ out = paddle.stack(out, axis=1) # (batch, num_domains, style_dim)
+ idx = paddle.to_tensor(np.array(range(y.shape[0]))).astype('int')
+ s = []
+ for i in range(idx.shape[0]):
+ s += [
+ out[idx[i].numpy().astype(np.int_).tolist()[0],
+ y[i].numpy().astype(np.int_).tolist()[0]]
+ ]
+ s = paddle.stack(s)
+ s = paddle.reshape(s, (s.shape[0], -1))
+ return s
+
+
+@GENERATORS.register()
+class FAN(nn.Layer):
+ def __init__(self,
+ num_modules=1,
+ end_relu=False,
+ num_landmarks=98,
+ fname_pretrained=None):
+ super(FAN, self).__init__()
+ self.num_modules = num_modules
+ self.end_relu = end_relu
+
+ # Base part
+ self.conv1 = CoordConvTh(256,
+ 256,
+ True,
+ False,
+ in_channels=3,
+ out_channels=64,
+ kernel_size=7,
+ stride=2,
+ padding=3)
+ self.bn1 = nn.BatchNorm2D(64)
+ self.conv2 = ConvBlock(64, 128)
+ self.conv3 = ConvBlock(128, 128)
+ self.conv4 = ConvBlock(128, 256)
+
+ # Stacking part
+ self.add_sublayer('m0', HourGlass(1, 4, 256, first_one=True))
+ self.add_sublayer('top_m_0', ConvBlock(256, 256))
+ self.add_sublayer('conv_last0', nn.Conv2D(256, 256, 1, 1, 0))
+ self.add_sublayer('bn_end0', nn.BatchNorm2D(256))
+ self.add_sublayer('l0', nn.Conv2D(256, num_landmarks + 1, 1, 1, 0))
+
+ if fname_pretrained is not None:
+ self.load_pretrained_weights(fname_pretrained)
+ else:
+ weight_path = get_path_from_url(FAN_WEIGHT_URL)
+ self.load_pretrained_weights(weight_path)
+
+ def load_pretrained_weights(self, fname):
+ import pickle
+ import six
+
+ with open(fname, 'rb') as f:
+ checkpoint = pickle.load(f) if six.PY2 else pickle.load(
+ f, encoding='latin1')
+
+ model_weights = self.state_dict()
+ model_weights.update({
+ k: v
+ for k, v in checkpoint['state_dict'].items() if k in model_weights
+ })
+ self.set_state_dict(model_weights)
+
+ def forward(self, x):
+ x, _ = self.conv1(x)
+ x = F.relu(self.bn1(x), True)
+ x = F.avg_pool2d(self.conv2(x), 2, stride=2)
+ x = self.conv3(x)
+ x = self.conv4(x)
+
+ outputs = []
+ boundary_channels = []
+ tmp_out = None
+ ll, boundary_channel = self._sub_layers['m0'](x, tmp_out)
+ ll = self._sub_layers['top_m_0'](ll)
+ ll = F.relu(
+ self._sub_layers['bn_end0'](self._sub_layers['conv_last0'](ll)),
+ True)
+
+ # Predict heatmaps
+ tmp_out = self._sub_layers['l0'](ll)
+ if self.end_relu:
+ tmp_out = F.relu(tmp_out) # HACK: Added relu
+ outputs.append(tmp_out)
+ boundary_channels.append(boundary_channel)
+ return outputs, boundary_channels
+
+ @paddle.no_grad()
+ def get_heatmap(self, x, b_preprocess=True):
+ ''' outputs 0-1 normalized heatmap '''
+ x = F.interpolate(x, size=[256, 256], mode='bilinear')
+ x_01 = x * 0.5 + 0.5
+ outputs, _ = self(x_01)
+ heatmaps = outputs[-1][:, :-1, :, :]
+ scale_factor = x.shape[2] // heatmaps.shape[2]
+ if b_preprocess:
+ heatmaps = F.interpolate(heatmaps,
+ scale_factor=scale_factor,
+ mode='bilinear',
+ align_corners=True)
+ heatmaps = preprocess(heatmaps)
+ return heatmaps
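+
+# Minimal usage sketch (input shape assumed; FAN() without `fname_pretrained`
+# downloads the wing.pdparams weights):
+#
+#     fan = FAN()
+#     x = paddle.rand([1, 3, 256, 256]) * 2 - 1     # image in [-1, 1]
+#     heatmaps = fan.get_heatmap(x)                 # 0-1 normalized landmark heatmaps
+#
+# In the StarGAN v2 pipeline, heatmaps of this kind are used to build the
+# `masks` passed to StarGANv2Generator when w_hpf > 0.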
diff --git a/ppgan/models/generators/generator_styleganv2.py b/ppgan/models/generators/generator_styleganv2.py
index cabfe340e2dd38e74e1c24a01b4d8a979b24388d..0965a51091589b0cd0915457c195327b1870d0f9 100644
--- a/ppgan/models/generators/generator_styleganv2.py
+++ b/ppgan/models/generators/generator_styleganv2.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/rosinality/stylegan2-pytorch
+# MIT License
+# Copyright (c) 2019 Kim Seonghyeon
+
import math
import random
import paddle
@@ -28,9 +32,9 @@ class PixelNorm(nn.Layer):
def __init__(self):
super().__init__()
- def forward(self, input):
- return input * paddle.rsqrt(
- paddle.mean(input * input, 1, keepdim=True) + 1e-8)
+ def forward(self, inputs):
+ return inputs * paddle.rsqrt(
+ paddle.mean(inputs * inputs, 1, keepdim=True) + 1e-8)
class ModulatedConv2D(nn.Layer):
@@ -89,11 +93,13 @@ class ModulatedConv2D(nn.Layer):
f"{self.__class__.__name__}({self.in_channel}, {self.out_channel}, {self.kernel_size}, "
f"upsample={self.upsample}, downsample={self.downsample})")
- def forward(self, input, style):
- batch, in_channel, height, width = input.shape
+ def forward(self, inputs, style, apply_modulation=False):
+ batch, in_channel, height, width = inputs.shape
- style = self.modulation(style).reshape((batch, 1, in_channel, 1, 1))
+ if apply_modulation: style = self.modulation(style)
+ style = style.reshape((batch, 1, in_channel, 1, 1))
weight = self.scale * self.weight * style
+ del style
if self.demodulate:
demod = paddle.rsqrt((weight * weight).sum([2, 3, 4]) + 1e-8)
@@ -103,13 +109,13 @@ class ModulatedConv2D(nn.Layer):
self.kernel_size, self.kernel_size))
if self.upsample:
- input = input.reshape((1, batch * in_channel, height, width))
+ inputs = inputs.reshape((1, batch * in_channel, height, width))
weight = weight.reshape((batch, self.out_channel, in_channel,
self.kernel_size, self.kernel_size))
weight = weight.transpose((0, 2, 1, 3, 4)).reshape(
(batch * in_channel, self.out_channel, self.kernel_size,
self.kernel_size))
- out = F.conv2d_transpose(input,
+ out = F.conv2d_transpose(inputs,
weight,
padding=0,
stride=2,
@@ -119,16 +125,16 @@ class ModulatedConv2D(nn.Layer):
out = self.blur(out)
elif self.downsample:
- input = self.blur(input)
- _, _, height, width = input.shape
- input = input.reshape((1, batch * in_channel, height, width))
- out = F.conv2d(input, weight, padding=0, stride=2, groups=batch)
+ inputs = self.blur(inputs)
+ _, _, height, width = inputs.shape
+ inputs = inputs.reshape((1, batch * in_channel, height, width))
+ out = F.conv2d(inputs, weight, padding=0, stride=2, groups=batch)
_, _, height, width = out.shape
out = out.reshape((batch, self.out_channel, height, width))
else:
- input = input.reshape((1, batch * in_channel, height, width))
- out = F.conv2d(input, weight, padding=self.padding, groups=batch)
+ inputs = inputs.reshape((1, batch * in_channel, height, width))
+ out = F.conv2d(inputs, weight, padding=self.padding, groups=batch)
_, _, height, width = out.shape
out = out.reshape((batch, self.out_channel, height, width))
@@ -136,18 +142,21 @@ class ModulatedConv2D(nn.Layer):
class NoiseInjection(nn.Layer):
- def __init__(self):
+ def __init__(self, is_concat=False):
super().__init__()
self.weight = self.create_parameter(
(1, ), default_initializer=nn.initializer.Constant(0.0))
+ self.is_concat = is_concat
def forward(self, image, noise=None):
if noise is None:
batch, _, height, width = image.shape
noise = paddle.randn((batch, 1, height, width))
-
- return image + self.weight * noise
+ if self.is_concat:
+ return paddle.concat([image, self.weight * noise], axis=1)
+ else:
+ return image + self.weight * noise
class ConstantInput(nn.Layer):
@@ -158,24 +167,22 @@ class ConstantInput(nn.Layer):
(1, channel, size, size),
default_initializer=nn.initializer.Normal())
- def forward(self, input):
- batch = input.shape[0]
+ def forward(self, batch):
out = self.input.tile((batch, 1, 1, 1))
return out
class StyledConv(nn.Layer):
- def __init__(
- self,
- in_channel,
- out_channel,
- kernel_size,
- style_dim,
- upsample=False,
- blur_kernel=[1, 3, 3, 1],
- demodulate=True,
- ):
+ def __init__(self,
+ in_channel,
+ out_channel,
+ kernel_size,
+ style_dim,
+ upsample=False,
+ blur_kernel=[1, 3, 3, 1],
+ demodulate=True,
+ is_concat=False):
super().__init__()
self.conv = ModulatedConv2D(
@@ -188,11 +195,12 @@ class StyledConv(nn.Layer):
demodulate=demodulate,
)
- self.noise = NoiseInjection()
- self.activate = FusedLeakyReLU(out_channel)
+ self.noise = NoiseInjection(is_concat=is_concat)
+ self.activate = FusedLeakyReLU(out_channel *
+ 2 if is_concat else out_channel)
- def forward(self, input, style, noise=None):
- out = self.conv(input, style)
+ def forward(self, inputs, style, noise=None):
+ out = self.conv(inputs, style)
out = self.noise(out, noise=noise)
out = self.activate(out)
@@ -218,8 +226,8 @@ class ToRGB(nn.Layer):
self.bias = self.create_parameter((1, 3, 1, 1),
nn.initializer.Constant(0.0))
- def forward(self, input, style, skip=None):
- out = self.conv(input, style)
+ def forward(self, inputs, style, skip=None):
+ out = self.conv(inputs, style)
out = out + self.bias
if skip is not None:
@@ -232,20 +240,20 @@ class ToRGB(nn.Layer):
@GENERATORS.register()
class StyleGANv2Generator(nn.Layer):
- def __init__(
- self,
- size,
- style_dim,
- n_mlp,
- channel_multiplier=2,
- blur_kernel=[1, 3, 3, 1],
- lr_mlp=0.01,
- ):
+ def __init__(self,
+ size,
+ style_dim,
+ n_mlp,
+ channel_multiplier=2,
+ blur_kernel=[1, 3, 3, 1],
+ lr_mlp=0.01,
+ is_concat=False):
super().__init__()
self.size = size
-
self.style_dim = style_dim
+ self.log_size = int(math.log(size, 2))
+ self.num_layers = (self.log_size - 2) * 2 + 1
layers = [PixelNorm()]
@@ -269,17 +277,46 @@ class StyleGANv2Generator(nn.Layer):
512: 32 * channel_multiplier,
1024: 16 * channel_multiplier,
}
+ self.channels_lst = []
+ self.w_idx_lst = [
+            0, 1,  # 4
+            1, 2, 3,  # 8
+            3, 4, 5,  # 16
+            5, 6, 7,  # 32
+            7, 8, 9,  # 64
+            9, 10, 11,  # 128
+            11, 12, 13,  # 256
+            13, 14, 15,  # 512
+            15, 16, 17,  # 1024
+        ]
+        self.style_layers = [
+            0,  # 1,
+            2, 3,  # 4,
+            5, 6,  # 7,
+            8, 9,  # 10,
+            11, 12,  # 13,
+            14, 15,  # 16,
+            17, 18,  # 19,
+            20, 21,  # 22,
+            23, 24,  # 25
+        ]
+
+ if self.log_size != 10:
+ self.w_idx_lst = self.w_idx_lst[:-(3 * (10 - self.log_size))]
+ self.style_layers = self.style_layers[:-(2 * (10 - self.log_size))]
self.input = ConstantInput(self.channels[4])
self.conv1 = StyledConv(self.channels[4],
self.channels[4],
3,
style_dim,
- blur_kernel=blur_kernel)
- self.to_rgb1 = ToRGB(self.channels[4], style_dim, upsample=False)
-
- self.log_size = int(math.log(size, 2))
- self.num_layers = (self.log_size - 2) * 2 + 1
+ blur_kernel=blur_kernel,
+ is_concat=is_concat)
+ self.to_rgb1 = ToRGB(self.channels[4] *
+ 2 if is_concat else self.channels[4],
+ style_dim,
+ upsample=False)
+ self.channels_lst.extend([self.channels[4], self.channels[4]])
self.convs = nn.LayerList()
self.upsamples = nn.LayerList()
@@ -299,26 +336,31 @@ class StyleGANv2Generator(nn.Layer):
self.convs.append(
StyledConv(
- in_channel,
+ in_channel * 2 if is_concat else in_channel,
out_channel,
3,
style_dim,
upsample=True,
blur_kernel=blur_kernel,
+ is_concat=is_concat,
))
self.convs.append(
- StyledConv(out_channel,
+ StyledConv(out_channel * 2 if is_concat else out_channel,
out_channel,
3,
style_dim,
- blur_kernel=blur_kernel))
+ blur_kernel=blur_kernel,
+ is_concat=is_concat))
- self.to_rgbs.append(ToRGB(out_channel, style_dim))
+ self.to_rgbs.append(
+ ToRGB(out_channel * 2 if is_concat else out_channel, style_dim))
+ self.channels_lst.extend([in_channel, out_channel, out_channel])
in_channel = out_channel
self.n_latent = self.log_size * 2 - 2
+ self.is_concat = is_concat
def make_noise(self):
noises = [paddle.randn((1, 1, 2**2, 2**2))]
@@ -335,39 +377,139 @@ class StyleGANv2Generator(nn.Layer):
return latent
- def get_latent(self, input):
- return self.style(input)
+ def get_latent(self, inputs):
+ return self.style(inputs)
- def forward(
+ def get_latents(
self,
- styles,
- return_latents=False,
- inject_index=None,
- truncation=1,
+ inputs,
+ truncation=1.0,
+ truncation_cutoff=None,
truncation_latent=None,
input_is_latent=False,
- noise=None,
- randomize_noise=True,
):
+ assert truncation >= 0, "truncation should be a float in range [0, 1]"
+
if not input_is_latent:
- styles = [self.style(s) for s in styles]
+ style = self.style(inputs)
+ if truncation < 1.0:
+ if truncation_latent is None:
+ truncation_latent = self.get_mean_style()
+ cutoff = truncation_cutoff
+ if truncation_cutoff is None:
+ style = truncation_latent + \
+ truncation * (style - truncation_latent)
+ else:
+ style[:, :cutoff] = truncation_latent[:, :cutoff] + \
+ truncation * (style[:, :cutoff] - truncation_latent[:, :cutoff])
+ return style
+
+ @paddle.no_grad()
+ def get_mean_style(self, n_sample=10, n_latent=1024):
+ mean_style = None
+ for i in range(n_sample):
+ style = self.mean_latent(n_latent)
+ if mean_style is None:
+ mean_style = style
+ else:
+ mean_style += style
+
+ mean_style /= n_sample
+ return mean_style
+
+ def get_latent_S(self, inputs):
+ return self.style_affine(self.style(inputs))
+ def style_affine(self, latent):
+ if latent.ndim < 3:
+ latent = latent.unsqueeze(1).tile((1, self.n_latent, 1))
+ latent_ = []
+ latent_.append(self.conv1.conv.modulation(latent[:, 0]))
+ latent_.append(self.to_rgb1.conv.modulation(latent[:, 1]))
+
+ i = 1
+ for conv1, conv2, to_rgb in zip(self.convs[::2], self.convs[1::2],
+ self.to_rgbs):
+ latent_.append(conv1.conv.modulation(latent[:, i + 0]))
+ latent_.append(conv2.conv.modulation(latent[:, i + 1]))
+ latent_.append(to_rgb.conv.modulation(latent[:, i + 2]))
+ i += 2
+ return latent_ #paddle.concat(latent_, axis=1)
+
+ def synthesis(self,
+ latent,
+ noise=None,
+ randomize_noise=True,
+ is_w_latent=False):
+ out = self.input(latent[0].shape[0])
if noise is None:
if randomize_noise:
noise = [None] * self.num_layers
+ #noise = [paddle.randn(getattr(self.noises, f"noise_{i}").shape) for i in range(self.num_layers)]
else:
noise = [
getattr(self.noises, f"noise_{i}")
for i in range(self.num_layers)
]
- if truncation < 1:
- style_t = []
+ out = self.conv1(out, latent[0], noise=noise[0])
- for style in styles:
- style_t.append(truncation_latent + truncation *
- (style - truncation_latent))
+ skip = self.to_rgb1(out, latent[1])
+ i = 2
+ if self.is_concat:
+ noise_i = 1
+
+ for conv1, conv2, to_rgb in zip(self.convs[::2], self.convs[1::2],
+ self.to_rgbs):
+ out = conv1(out, latent[i],
+ noise=noise[(noise_i + 1) // 2]) ### 1 for 2
+ out = conv2(out, latent[i + 1],
+ noise=noise[(noise_i + 2) // 2]) ### 1 for 2
+ skip = to_rgb(out, latent[i + 2], skip)
+
+ i += 3
+ noise_i += 2
+ else:
+ for conv1, conv2, noise1, noise2, to_rgb in zip(
+ self.convs[::2], self.convs[1::2], noise[1::2], noise[2::2],
+ self.to_rgbs):
+ out = conv1(out, latent[i], noise=noise1)
+ out = conv2(out, latent[i + 1], noise=noise2)
+ skip = to_rgb(out, latent[i + 2], skip)
+
+ i += 3
+
+ return skip #image = skip
+
+ def forward(
+ self,
+ styles,
+ return_latents=False,
+ inject_index=None,
+ truncation=1.0,
+ truncation_cutoff=None,
+ truncation_latent=None,
+ input_is_latent=False,
+ noise=None,
+ randomize_noise=True,
+ ):
+ if not input_is_latent:
+ styles = [self.style(s) for s in styles]
+
+ if truncation < 1.0:
+ style_t = []
+ if truncation_latent is None:
+ truncation_latent = self.get_mean_style()
+ cutoff = truncation_cutoff
+ for style in styles:
+ if truncation_cutoff is None:
+ style = truncation_latent + \
+ truncation * (style - truncation_latent)
+ else:
+ style[:, :cutoff] = truncation_latent[:, :cutoff] + \
+ truncation * (style[:, :cutoff] - truncation_latent[:, :cutoff])
+ style_t.append(style)
styles = style_t
if len(styles) < 2:
@@ -389,27 +531,12 @@ class StyleGANv2Generator(nn.Layer):
latent = paddle.concat([latent, latent2], 1)
- out = self.input(latent)
- out = self.conv1(out, latent[:, 0], noise=noise[0])
+ #if not input_is_affined_latent:
+ styles = self.style_affine(latent)
- skip = self.to_rgb1(out, latent[:, 1])
-
- i = 1
- for conv1, conv2, noise1, noise2, to_rgb in zip(self.convs[::2],
- self.convs[1::2],
- noise[1::2],
- noise[2::2],
- self.to_rgbs):
- out = conv1(out, latent[:, i], noise=noise1)
- out = conv2(out, latent[:, i + 1], noise=noise2)
- skip = to_rgb(out, latent[:, i + 2], skip)
-
- i += 2
-
- image = skip
+ image = self.synthesis(styles, noise, randomize_noise)
if return_latents:
return image, latent
-
else:
return image, None
diff --git a/ppgan/models/generators/gfpganv1_arch.py b/ppgan/models/generators/gfpganv1_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9c334574b31585913ee3be250988d54d13c09ba
--- /dev/null
+++ b/ppgan/models/generators/gfpganv1_arch.py
@@ -0,0 +1,1418 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import random
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+
+from ppgan.models.discriminators.builder import DISCRIMINATORS
+from ppgan.models.generators.builder import GENERATORS
+from ppgan.utils.download import get_path_from_url
+
+
+class StyleGAN2Generator(nn.Layer):
+ """StyleGAN2 Generator.
+
+ Args:
+ out_size (int): The spatial size of outputs.
+ num_style_feat (int): Channel number of style features. Default: 512.
+ num_mlp (int): Layer number of MLP style layers. Default: 8.
+ channel_multiplier (int): Channel multiplier for large networks of
+ StyleGAN2. Default: 2.
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+            magnitude. A cross production will be applied to extend the 1D
+            resample kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
+ lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
+ narrow (float): Narrow ratio for channels. Default: 1.0.
+ """
+ def __init__(self,
+ out_size,
+ num_style_feat=512,
+ num_mlp=8,
+ channel_multiplier=2,
+ resample_kernel=(1, 3, 3, 1),
+ lr_mlp=0.01,
+ narrow=1):
+ super(StyleGAN2Generator, self).__init__()
+ self.num_style_feat = num_style_feat
+ style_mlp_layers = [NormStyleCode()]
+ for i in range(num_mlp):
+ style_mlp_layers.append(
+ EqualLinear(num_style_feat,
+ num_style_feat,
+ bias=True,
+ bias_init_val=0,
+ lr_mul=lr_mlp,
+ activation='fused_lrelu'))
+ self.style_mlp = nn.Sequential(*style_mlp_layers)
+ channels = {
+ '4': int(512 * narrow),
+ '8': int(512 * narrow),
+ '16': int(512 * narrow),
+ '32': int(512 * narrow),
+ '64': int(256 * channel_multiplier * narrow),
+ '128': int(128 * channel_multiplier * narrow),
+ '256': int(64 * channel_multiplier * narrow),
+ '512': int(32 * channel_multiplier * narrow),
+ '1024': int(16 * channel_multiplier * narrow)
+ }
+ self.channels = channels
+ self.constant_input = ConstantInput(channels['4'], size=4)
+ self.style_conv1 = StyleConv(channels['4'],
+ channels['4'],
+ kernel_size=3,
+ num_style_feat=num_style_feat,
+ demodulate=True,
+ sample_mode=None,
+ resample_kernel=resample_kernel)
+ self.to_rgb1 = ToRGB(channels['4'],
+ num_style_feat,
+ upsample=False,
+ resample_kernel=resample_kernel)
+ self.log_size = int(math.log(out_size, 2))
+ self.num_layers = (self.log_size - 2) * 2 + 1
+ self.num_latent = self.log_size * 2 - 2
+ self.style_convs = nn.LayerList()
+ self.to_rgbs = nn.LayerList()
+ self.noises = nn.Layer()
+ in_channels = channels['4']
+ for layer_idx in range(self.num_layers):
+ resolution = 2**((layer_idx + 5) // 2)
+ shape = [1, 1, resolution, resolution]
+ x = paddle.ones(shape=shape, dtype='float32')
+ self.noises.register_buffer(f'noise{layer_idx}', x)
+ for i in range(3, self.log_size + 1):
+ out_channels = channels[f'{2 ** i}']
+ self.style_convs.append(
+ StyleConv(in_channels,
+ out_channels,
+ kernel_size=3,
+ num_style_feat=num_style_feat,
+ demodulate=True,
+ sample_mode='upsample',
+ resample_kernel=resample_kernel))
+ self.style_convs.append(
+ StyleConv(out_channels,
+ out_channels,
+ kernel_size=3,
+ num_style_feat=num_style_feat,
+ demodulate=True,
+ sample_mode=None,
+ resample_kernel=resample_kernel))
+ self.to_rgbs.append(
+ ToRGB(out_channels,
+ num_style_feat,
+ upsample=True,
+ resample_kernel=resample_kernel))
+ in_channels = out_channels
+
+ def make_noise(self):
+ """Make noise for noise injection."""
+ device = self.constant_input.weight.device
+ x = paddle.ones(shape=[1, 1, 4, 4], dtype='float32')
+ noises = [x]
+ for i in range(3, self.log_size + 1):
+ for _ in range(2):
+ x = paddle.ones(shape=[1, 1, 2**i, 2**i], dtype='float32')
+ noises.append(x)
+ return noises
+
+ def get_latent(self, x):
+ return self.style_mlp(x)
+
+ def mean_latent(self, num_latent):
+ x = paddle.ones(shape=[num_latent, self.num_style_feat],
+ dtype='float32')
+ latent_in = x
+ latent = self.style_mlp(latent_in).mean(0, keepdim=True)
+ return latent
+
+ def forward(self,
+ styles,
+ input_is_latent=False,
+ noise=None,
+ randomize_noise=True,
+ truncation=1,
+ truncation_latent=None,
+ inject_index=None,
+ return_latents=False):
+ """Forward function for StyleGAN2Generator.
+
+ Args:
+ styles (list[Tensor]): Sample codes of styles.
+ input_is_latent (bool): Whether input is latent style.
+ Default: False.
+ noise (Tensor | None): Input noise or None. Default: None.
+ randomize_noise (bool): Randomize noise, used when 'noise' is
+ False. Default: True.
+            truncation (float): The truncation ratio. Default: 1.
+            truncation_latent (Tensor | None): The truncation latent tensor.
+                Default: None.
+ inject_index (int | None): The injection index for mixing noise.
+ Default: None.
+ return_latents (bool): Whether to return style latents.
+ Default: False.
+ """
+ if not input_is_latent:
+ styles = [self.style_mlp(s) for s in styles]
+ if noise is None:
+ if randomize_noise:
+ noise = [None] * self.num_layers
+ else:
+ noise = [
+ getattr(self.noises, f'noise{i}')
+ for i in range(self.num_layers)
+ ]
+ if truncation < 1:
+ style_truncation = []
+ for style in styles:
+ style_truncation.append(truncation_latent + truncation *
+ (style - truncation_latent))
+ styles = style_truncation
+ if len(styles) == 1:
+ inject_index = self.num_latent
+ if styles[0].ndim < 3:
+ latent = styles[0].unsqueeze(1)
+ latent = paddle.tile(latent, repeat_times=[1, inject_index, 1])
+ else:
+ latent = styles[0]
+ elif len(styles) == 2:
+ if inject_index is None:
+ inject_index = random.randint(1, self.num_latent - 1)
+ latent1 = styles[0].unsqueeze(1)
+            latent1 = paddle.tile(latent1, repeat_times=[1, inject_index, 1])
+
+ latent2 = styles[1].unsqueeze(1)
+ latent2 = paddle.tile(
+ latent2, repeat_times=[1, self.num_latent - inject_index, 1])
+ latent = paddle.concat([latent1, latent2], 1)
+ out = self.constant_input(latent.shape[0])
+ out = self.style_conv1(out, latent[:, 0], noise=noise[0])
+ skip = self.to_rgb1(out, latent[:, 1])
+ i = 1
+ for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2],
+ self.style_convs[1::2],
+ noise[1::2],
+ noise[2::2],
+ self.to_rgbs):
+ out = conv1(out, latent[:, i], noise=noise1)
+ out = conv2(out, latent[:, i + 1], noise=noise2)
+ skip = to_rgb(out, latent[:, i + 2], skip)
+ i += 2
+ image = skip
+ if return_latents:
+ return image, latent
+ else:
+ return image, None
+
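+# Minimal usage sketch for the decoder above (output size and batch assumed):
+#
+#     g = StyleGAN2Generator(out_size=512, num_style_feat=512, channel_multiplier=1)
+#     z = paddle.randn([4, 512])                    # one latent code per image
+#     imgs, _ = g([z], input_is_latent=False)       # imgs: [4, 3, 512, 512]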
+
+def var(x, axis=None, unbiased=True, keepdim=False, name=None):
+
+ u = paddle.mean(x, axis, True, name)
+ out = paddle.sum((x - u) * (x - u), axis, keepdim=keepdim, name=name)
+
+ n = paddle.cast(paddle.numel(x), x.dtype) \
+ / paddle.cast(paddle.numel(out), x.dtype)
+ if unbiased:
+ one_const = paddle.ones([1], x.dtype)
+ n = paddle.where(n > one_const, n - 1., one_const)
+ out /= n
+ return out
+
+
+@DISCRIMINATORS.register()
+class StyleGAN2DiscriminatorGFPGAN(nn.Layer):
+ """StyleGAN2 Discriminator.
+
+ Args:
+ out_size (int): The spatial size of outputs.
+ channel_multiplier (int): Channel multiplier for large networks of
+ StyleGAN2. Default: 2.
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+            magnitude. A cross production will be applied to extend the 1D
+            resample kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
+ stddev_group (int): For group stddev statistics. Default: 4.
+ narrow (float): Narrow ratio for channels. Default: 1.0.
+ """
+ def __init__(self,
+ out_size,
+ channel_multiplier=2,
+ resample_kernel=(1, 3, 3, 1),
+ stddev_group=4,
+ narrow=1):
+ super(StyleGAN2DiscriminatorGFPGAN, self).__init__()
+ channels = {
+ '4': int(512 * narrow),
+ '8': int(512 * narrow),
+ '16': int(512 * narrow),
+ '32': int(512 * narrow),
+ '64': int(256 * channel_multiplier * narrow),
+ '128': int(128 * channel_multiplier * narrow),
+ '256': int(64 * channel_multiplier * narrow),
+ '512': int(32 * channel_multiplier * narrow),
+ '1024': int(16 * channel_multiplier * narrow)
+ }
+ log_size = int(math.log(out_size, 2))
+ conv_body = [
+ ConvLayer(3, channels[f'{out_size}'], 1, bias=True, activate=True)
+ ]
+ in_channels = channels[f'{out_size}']
+ for i in range(log_size, 2, -1):
+ out_channels = channels[f'{2 ** (i - 1)}']
+ conv_body.append(
+ ResBlock(in_channels, out_channels, resample_kernel))
+ in_channels = out_channels
+ self.conv_body = nn.Sequential(*conv_body)
+ self.final_conv = ConvLayer(in_channels + 1,
+ channels['4'],
+ 3,
+ bias=True,
+ activate=True)
+ self.final_linear = nn.Sequential(
+ EqualLinear(channels['4'] * 4 * 4,
+ channels['4'],
+ bias=True,
+ bias_init_val=0,
+ lr_mul=1,
+ activation='fused_lrelu'),
+ EqualLinear(channels['4'],
+ 1,
+ bias=True,
+ bias_init_val=0,
+ lr_mul=1,
+ activation=None))
+ self.stddev_group = stddev_group
+ self.stddev_feat = 1
+
+ def forward(self, x):
+ out = self.conv_body(x)
+ b, c, h, w = out.shape
+ group = min(b, self.stddev_group)
+ stddev = out.reshape(
+ [group, -1, self.stddev_feat, c // self.stddev_feat, h, w])
+ stddev = paddle.sqrt(var(stddev, 0, unbiased=False) + 1e-08)
+ stddev = stddev.mean(axis=[2, 3, 4], keepdim=True).squeeze(2)
+
+ stddev = paddle.tile(stddev, repeat_times=[group, 1, h, w])
+ out = paddle.concat([out, stddev], 1)
+ out = self.final_conv(out)
+ out = out.reshape([b, -1])
+ out = self.final_linear(out)
+ return out
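+
+# Note: the reshape / stddev block above is the StyleGAN2 minibatch standard
+# deviation trick: the per-group feature stddev is averaged into a single map
+# and concatenated as an extra channel, which is why `final_conv` takes
+# `in_channels + 1` input channels.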
+
+
+class StyleGAN2GeneratorSFT(StyleGAN2Generator):
+ """StyleGAN2 Generator with SFT modulation (Spatial Feature Transform).
+
+ Args:
+ out_size (int): The spatial size of outputs.
+ num_style_feat (int): Channel number of style features. Default: 512.
+ num_mlp (int): Layer number of MLP style layers. Default: 8.
+ channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2.
+ resample_kernel (list[int]): A list indicating the 1D resample kernel magnitude. A cross production will be
+            applied to extend the 1D resample kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
+ lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
+ narrow (float): The narrow ratio for channels. Default: 1.
+ sft_half (bool): Whether to apply SFT on half of the input channels. Default: False.
+ """
+ def __init__(self,
+ out_size,
+ num_style_feat=512,
+ num_mlp=8,
+ channel_multiplier=2,
+ resample_kernel=(1, 3, 3, 1),
+ lr_mlp=0.01,
+ narrow=1,
+ sft_half=False):
+ super(StyleGAN2GeneratorSFT,
+ self).__init__(out_size,
+ num_style_feat=num_style_feat,
+ num_mlp=num_mlp,
+ channel_multiplier=channel_multiplier,
+ resample_kernel=resample_kernel,
+ lr_mlp=lr_mlp,
+ narrow=narrow)
+ self.sft_half = sft_half
+
+ def forward(self,
+ styles,
+ conditions,
+ input_is_latent=False,
+ noise=None,
+ randomize_noise=True,
+ truncation=1,
+ truncation_latent=None,
+ inject_index=None,
+ return_latents=False):
+ """Forward function for StyleGAN2GeneratorSFT.
+
+ Args:
+ styles (list[Tensor]): Sample codes of styles.
+ conditions (list[Tensor]): SFT conditions to generators.
+ input_is_latent (bool): Whether input is latent style. Default: False.
+ noise (Tensor | None): Input noise or None. Default: None.
+ randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True.
+ truncation (float): The truncation ratio. Default: 1.
+ truncation_latent (Tensor | None): The truncation latent tensor. Default: None.
+ inject_index (int | None): The injection index for mixing noise. Default: None.
+ return_latents (bool): Whether to return style latents. Default: False.
+ """
+ if not input_is_latent:
+ styles = [self.style_mlp(s) for s in styles]
+ if noise is None:
+ if randomize_noise:
+ noise = [None] * self.num_layers
+ else:
+ noise = [
+ getattr(self.noises, f'noise{i}')
+ for i in range(self.num_layers)
+ ]
+ if truncation < 1:
+ style_truncation = []
+ for style in styles:
+ style_truncation.append(truncation_latent + truncation *
+ (style - truncation_latent))
+ styles = style_truncation
+ if len(styles) == 1:
+ inject_index = self.num_latent
+ if styles[0].ndim < 3:
+ latent = paddle.tile(styles[0].unsqueeze(1),
+ repeat_times=[1, inject_index, 1])
+ else:
+ latent = styles[0]
+ elif len(styles) == 2:
+ if inject_index is None:
+ inject_index = random.randint(1, self.num_latent - 1)
+ latent1 = styles[0].unsqueeze(1)
+            latent1 = paddle.tile(latent1, repeat_times=[1, inject_index, 1])
+
+ latent2 = styles[1].unsqueeze(1)
+ latent2 = paddle.tile(
+ latent2, repeat_times=[1, self.num_latent - inject_index, 1])
+ latent = paddle.concat([latent1, latent2], 1)
+ out = self.constant_input(latent.shape[0])
+ out = self.style_conv1(out, latent[:, 0], noise=noise[0])
+ skip = self.to_rgb1(out, latent[:, 1])
+ i = 1
+ for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2],
+ self.style_convs[1::2],
+ noise[1::2],
+ noise[2::2],
+ self.to_rgbs):
+ out = conv1(out, latent[:, i], noise=noise1)
+ if i < len(conditions):
+ if self.sft_half:
+ out_same, out_sft = paddle.split(out, 2, axis=1)
+ out_sft = out_sft * conditions[i - 1] + conditions[i]
+ out = paddle.concat([out_same, out_sft], axis=1)
+ else:
+ out = out * conditions[i - 1] + conditions[i]
+ out = conv2(out, latent[:, i + 1], noise=noise2)
+ skip = to_rgb(out, latent[:, i + 2], skip)
+ i += 2
+ image = skip
+ if return_latents:
+ return image, latent
+ else:
+ return image, None
+
+
+@GENERATORS.register()
+class GFPGANv1(nn.Layer):
+ """The GFPGAN architecture: Unet + StyleGAN2 decoder with SFT.
+
+ Ref: GFP-GAN: Towards Real-World Blind Face Restoration with Generative Facial Prior.
+
+ Args:
+ out_size (int): The spatial size of outputs.
+ num_style_feat (int): Channel number of style features. Default: 512.
+ channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2.
+ resample_kernel (list[int]): A list indicating the 1D resample kernel magnitude. A cross production will be
+            applied to extend the 1D resample kernel to a 2D resample kernel. Default: (1, 3, 3, 1).
+ decoder_load_path (str): The path to the pre-trained decoder model (usually, the StyleGAN2). Default: None.
+ fix_decoder (bool): Whether to fix the decoder. Default: True.
+
+ num_mlp (int): Layer number of MLP style layers. Default: 8.
+ lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
+ input_is_latent (bool): Whether input is latent style. Default: False.
+ different_w (bool): Whether to use different latent w for different layers. Default: False.
+ narrow (float): The narrow ratio for channels. Default: 1.
+ sft_half (bool): Whether to apply SFT on half of the input channels. Default: False.
+ """
+ def __init__(self,
+ out_size,
+ num_style_feat=512,
+ channel_multiplier=1,
+ resample_kernel=(1, 3, 3, 1),
+ decoder_load_path=None,
+ fix_decoder=True,
+ num_mlp=8,
+ lr_mlp=0.01,
+ input_is_latent=False,
+ different_w=False,
+ narrow=1,
+ sft_half=False):
+ super(GFPGANv1, self).__init__()
+ self.input_is_latent = input_is_latent
+ self.different_w = different_w
+ self.num_style_feat = num_style_feat
+ unet_narrow = narrow * 0.5
+ channels = {
+ '4': int(512 * unet_narrow),
+ '8': int(512 * unet_narrow),
+ '16': int(512 * unet_narrow),
+ '32': int(512 * unet_narrow),
+ '64': int(256 * channel_multiplier * unet_narrow),
+ '128': int(128 * channel_multiplier * unet_narrow),
+ '256': int(64 * channel_multiplier * unet_narrow),
+ '512': int(32 * channel_multiplier * unet_narrow),
+ '1024': int(16 * channel_multiplier * unet_narrow)
+ }
+ self.log_size = int(math.log(out_size, 2))
+ first_out_size = 2**int(math.log(out_size, 2))
+ self.conv_body_first = ConvLayer(3,
+ channels[f'{first_out_size}'],
+ 1,
+ bias=True,
+ activate=True)
+ in_channels = channels[f'{first_out_size}']
+ self.conv_body_down = nn.LayerList()
+ for i in range(self.log_size, 2, -1):
+ out_channels = channels[f'{2 ** (i - 1)}']
+ self.conv_body_down.append(
+ ResBlock(in_channels, out_channels, resample_kernel))
+ in_channels = out_channels
+ self.final_conv = ConvLayer(in_channels,
+ channels['4'],
+ 3,
+ bias=True,
+ activate=True)
+ in_channels = channels['4']
+ self.conv_body_up = nn.LayerList()
+ for i in range(3, self.log_size + 1):
+ out_channels = channels[f'{2 ** i}']
+ self.conv_body_up.append(ResUpBlock(in_channels, out_channels))
+ in_channels = out_channels
+ self.toRGB = nn.LayerList()
+ for i in range(3, self.log_size + 1):
+ self.toRGB.append(
+ EqualConv2d(channels[f'{2 ** i}'],
+ 3,
+ 1,
+ stride=1,
+ padding=0,
+ bias=True,
+ bias_init_val=0))
+ if different_w:
+ linear_out_channel = (int(math.log(out_size, 2)) * 2 -
+ 2) * num_style_feat
+ else:
+ linear_out_channel = num_style_feat
+ self.final_linear = EqualLinear(channels['4'] * 4 * 4,
+ linear_out_channel,
+ bias=True,
+ bias_init_val=0,
+ lr_mul=1,
+ activation=None)
+ self.stylegan_decoder = StyleGAN2GeneratorSFT(
+ out_size=out_size,
+ num_style_feat=num_style_feat,
+ num_mlp=num_mlp,
+ channel_multiplier=channel_multiplier,
+ resample_kernel=resample_kernel,
+ lr_mlp=lr_mlp,
+ narrow=narrow,
+ sft_half=sft_half)
+ if decoder_load_path:
+ decoder_load_path = get_path_from_url(decoder_load_path)
+ self.stylegan_decoder.set_state_dict(paddle.load(decoder_load_path))
+
+ if fix_decoder:
+ for _, param in self.stylegan_decoder.named_parameters():
+ param.stop_gradient = True
+ self.condition_scale = nn.LayerList()
+ self.condition_shift = nn.LayerList()
+ for i in range(3, self.log_size + 1):
+ out_channels = channels[f'{2 ** i}']
+ if sft_half:
+ sft_out_channels = out_channels
+ else:
+ sft_out_channels = out_channels * 2
+ self.condition_scale.append(
+ nn.Sequential(
+ EqualConv2d(out_channels,
+ out_channels,
+ 3,
+ stride=1,
+ padding=1,
+ bias=True,
+ bias_init_val=0), ScaledLeakyReLU(0.2),
+ EqualConv2d(out_channels,
+ sft_out_channels,
+ 3,
+ stride=1,
+ padding=1,
+ bias=True,
+ bias_init_val=1)))
+ self.condition_shift.append(
+ nn.Sequential(
+ EqualConv2d(out_channels,
+ out_channels,
+ 3,
+ stride=1,
+ padding=1,
+ bias=True,
+ bias_init_val=0), ScaledLeakyReLU(0.2),
+ EqualConv2d(out_channels,
+ sft_out_channels,
+ 3,
+ stride=1,
+ padding=1,
+ bias=True,
+ bias_init_val=0)))
+
+ def forward(self,
+ x,
+ return_latents=False,
+ return_rgb=True,
+ randomize_noise=False):
+ """Forward function for GFPGANv1.
+
+ Args:
+ x (Tensor): Input images.
+ return_latents (bool): Whether to return style latents. Default: False.
+ return_rgb (bool): Whether return intermediate rgb images. Default: True.
+            randomize_noise (bool): Randomize noise, used when 'noise' is
+                None. Default: False.
+ """
+ conditions = []
+ unet_skips = []
+ out_rgbs = []
+
+ feat = self.conv_body_first(x)
+
+ for i in range(self.log_size - 2):
+ feat = self.conv_body_down[i](feat)
+ unet_skips.insert(0, feat)
+ feat = self.final_conv(feat)
+ style_code = self.final_linear(feat.reshape([feat.shape[0], -1]))
+ if self.different_w:
+ style_code = style_code.reshape(
+ [style_code.shape[0], -1, self.num_style_feat])
+
+ for i in range(self.log_size - 2):
+ feat = feat + unet_skips[i]
+ feat = self.conv_body_up[i](feat)
+ scale = self.condition_scale[i](feat)
+ conditions.append(scale.clone())
+ shift = self.condition_shift[i](feat)
+ conditions.append(shift.clone())
+ if return_rgb:
+ out_rgbs.append(self.toRGB[i](feat))
+ image, _ = self.stylegan_decoder([style_code],
+ conditions,
+ return_latents=return_latents,
+ input_is_latent=self.input_is_latent,
+ randomize_noise=randomize_noise)
+ return image, out_rgbs
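+
+# Minimal usage sketch (the 512 output size and the SFT/latent settings below
+# are assumed for illustration, not prescribed by this file):
+#
+#     gfpgan = GFPGANv1(out_size=512, num_style_feat=512, channel_multiplier=1,
+#                       decoder_load_path=None, fix_decoder=True,
+#                       input_is_latent=True, different_w=True, sft_half=True)
+#     lq = paddle.rand([1, 3, 512, 512]) * 2 - 1    # degraded face in [-1, 1]
+#     restored, out_rgbs = gfpgan(lq, return_rgb=True)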
+
+
+class FacialComponentDiscriminator(nn.Layer):
+    """Facial component (eyes, mouth, nose) discriminator used in GFPGAN.
+ """
+ def __init__(self):
+ super(FacialComponentDiscriminator, self).__init__()
+ self.conv1 = ConvLayer(3,
+ 64,
+ 3,
+ downsample=False,
+ resample_kernel=(1, 3, 3, 1),
+ bias=True,
+ activate=True)
+ self.conv2 = ConvLayer(64,
+ 128,
+ 3,
+ downsample=True,
+ resample_kernel=(1, 3, 3, 1),
+ bias=True,
+ activate=True)
+ self.conv3 = ConvLayer(128,
+ 128,
+ 3,
+ downsample=False,
+ resample_kernel=(1, 3, 3, 1),
+ bias=True,
+ activate=True)
+ self.conv4 = ConvLayer(128,
+ 256,
+ 3,
+ downsample=True,
+ resample_kernel=(1, 3, 3, 1),
+ bias=True,
+ activate=True)
+ self.conv5 = ConvLayer(256,
+ 256,
+ 3,
+ downsample=False,
+ resample_kernel=(1, 3, 3, 1),
+ bias=True,
+ activate=True)
+ self.final_conv = ConvLayer(256, 1, 3, bias=True, activate=False)
+
+ def forward(self, x, return_feats=False):
+ """Forward function for FacialComponentDiscriminator.
+
+ Args:
+ x (Tensor): Input images.
+ return_feats (bool): Whether to return intermediate features. Default: False.
+ """
+ feat = self.conv1(x)
+ feat = self.conv3(self.conv2(feat))
+ rlt_feats = []
+ if return_feats:
+ rlt_feats.append(feat.clone())
+ feat = self.conv5(self.conv4(feat))
+ if return_feats:
+ rlt_feats.append(feat.clone())
+ out = self.final_conv(feat)
+ if return_feats:
+ return out, rlt_feats
+ else:
+ return out, None
+
+
+class ConvUpLayer(nn.Layer):
+ """Convolutional upsampling layer. It uses bilinear upsampler + Conv.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ kernel_size (int): Size of the convolving kernel.
+ stride (int): Stride of the convolution. Default: 1
+ padding (int): Zero-padding added to both sides of the input. Default: 0.
+ bias (bool): If ``True``, adds a learnable bias to the output. Default: ``True``.
+ bias_init_val (float): Bias initialized value. Default: 0.
+        activate (bool): Whether to use activation. Default: True.
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ bias=True,
+ bias_init_val=0,
+ activate=True):
+ super(ConvUpLayer, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.padding = padding
+ self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
+ self.weight = paddle.create_parameter(
+ shape=[out_channels, in_channels, kernel_size, kernel_size],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Normal())
+ if bias and not activate:
+ self.bias = paddle.create_parameter(
+ shape=[out_channels],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Constant(
+ bias_init_val))
+ else:
+            self.bias = None
+ if activate:
+ if bias:
+ self.activation = FusedLeakyReLU(out_channels)
+ else:
+ self.activation = ScaledLeakyReLU(0.2)
+ else:
+ self.activation = None
+
+ def forward(self, x):
+ out = F.interpolate(x,
+ scale_factor=2,
+ mode='bilinear',
+ align_corners=False)
+ out = F.conv2d(out,
+ self.weight * self.scale,
+ bias=self.bias,
+ stride=self.stride,
+ padding=self.padding)
+ if self.activation is not None:
+ out = self.activation(out)
+ return out
+
+
+class ResUpBlock(nn.Layer):
+ """Residual block with upsampling.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ """
+ def __init__(self, in_channels, out_channels):
+ super(ResUpBlock, self).__init__()
+ self.conv1 = ConvLayer(in_channels,
+ in_channels,
+ 3,
+ bias=True,
+ activate=True)
+ self.conv2 = ConvUpLayer(in_channels,
+ out_channels,
+ 3,
+ stride=1,
+ padding=1,
+ bias=True,
+ activate=True)
+ self.skip = ConvUpLayer(in_channels,
+ out_channels,
+ 1,
+ bias=False,
+ activate=False)
+
+ def forward(self, x):
+ out = self.conv1(x)
+ out = self.conv2(out)
+ skip = self.skip(x)
+ out = (out + skip) / math.sqrt(2)
+ return out
+
+
+def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1,
+ pad_y0, pad_y1):
+ _, channel, in_h, in_w = input.shape
+ input = input.reshape((-1, in_h, in_w, 1))
+ _, in_h, in_w, minor = input.shape
+ kernel_h, kernel_w = kernel.shape
+ out = input.reshape((-1, in_h, 1, in_w, 1, minor))
+ out = out.transpose((0, 1, 3, 5, 2, 4))
+ out = out.reshape((-1, 1, 1, 1))
+ out = F.pad(out, [0, up_x - 1, 0, up_y - 1])
+ out = out.reshape((-1, in_h, in_w, minor, up_y, up_x))
+ out = out.transpose((0, 3, 1, 4, 2, 5))
+ out = out.reshape((-1, minor, in_h * up_y, in_w * up_x))
+ out = F.pad(
+ out, [max(pad_x0, 0),
+ max(pad_x1, 0),
+ max(pad_y0, 0),
+ max(pad_y1, 0)])
+ out = out[:, :,
+ max(-pad_y0, 0):out.shape[2] - max(-pad_y1, 0),
+ max(-pad_x0, 0):out.shape[3] - max(-pad_x1, 0)]
+ out = out.reshape(
+ [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1])
+ w = paddle.flip(kernel, [0, 1]).reshape((1, 1, kernel_h, kernel_w))
+ out = F.conv2d(out, w)
+ out = out.reshape((-1, minor, in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1,
+ in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1))
+ out = out.transpose((0, 2, 3, 1))
+ out = out[:, ::down_y, ::down_x, :]
+ out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
+ out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
+ return out.reshape((-1, channel, out_h, out_w))
+
+
+def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
+ out = upfirdn2d_native(input, kernel, up, up, down, down, pad[0], pad[1],
+ pad[0], pad[1])
+ return out
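+
+# Note: for an input of height H, upfirdn2d produces an output of height
+#     (H * up + pad[0] + pad[1] - kernel_h) // down + 1
+# e.g. with the UpFirDnUpsample defaults (factor 2, 4-tap kernel, pad (2, 1)),
+# a 16-pixel input becomes (16 * 2 + 2 + 1 - 4) // 1 + 1 = 32 pixels.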
+
+
+class NormStyleCode(nn.Layer):
+ def forward(self, x):
+ """Normalize the style codes.
+
+ Args:
+ x (Tensor): Style codes with shape (b, c).
+
+ Returns:
+ Tensor: Normalized tensor.
+ """
+ return x * paddle.rsqrt(paddle.mean(x**2, axis=1, keepdim=True) + 1e-08)
+
+
+def make_resample_kernel(k):
+ """Make resampling kernel for UpFirDn.
+
+ Args:
+ k (list[int]): A list indicating the 1D resample kernel magnitude.
+
+ Returns:
+ Tensor: 2D resampled kernel.
+ """
+ k = paddle.to_tensor(k, dtype="float32")
+ if k.ndim == 1:
+ k = k[None, :] * k[:, None]
+ k /= k.sum()
+ return k
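+
+# Example: for the default k = [1, 3, 3, 1], the outer product k[None, :] * k[:, None]
+# yields the separable 4x4 filter
+#     [[1, 3, 3, 1],
+#      [3, 9, 9, 3],
+#      [3, 9, 9, 3],
+#      [1, 3, 3, 1]]
+# which is then divided by its sum (64) so that filtering preserves the DC level.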
+
+
+class UpFirDnUpsample(nn.Layer):
+    """Upsample, FIR filter, and downsample (upsample version).
+
+ References:
+ 1. https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.upfirdn.html # noqa: E501
+ 2. http://www.ece.northwestern.edu/local-apps/matlabhelp/toolbox/signal/upfirdn.html # noqa: E501
+
+ Args:
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+ magnitude.
+ factor (int): Upsampling scale factor. Default: 2.
+ """
+ def __init__(self, resample_kernel, factor=2):
+ super(UpFirDnUpsample, self).__init__()
+ self.kernel = make_resample_kernel(resample_kernel) * factor**2
+ self.factor = factor
+ pad = self.kernel.shape[0] - factor
+ self.pad = (pad + 1) // 2 + factor - 1, pad // 2
+
+ def forward(self, x):
+ out = upfirdn2d(x, self.kernel, up=self.factor, down=1, pad=self.pad)
+ return out
+
+ def __repr__(self):
+ return f'{self.__class__.__name__}(factor={self.factor})'
+
+
+class UpFirDnDownsample(nn.Layer):
+    """Upsample, FIR filter, and downsample (downsample version).
+
+ Args:
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+ magnitude.
+ factor (int): Downsampling scale factor. Default: 2.
+ """
+ def __init__(self, resample_kernel, factor=2):
+ super(UpFirDnDownsample, self).__init__()
+ self.kernel = make_resample_kernel(resample_kernel)
+ self.factor = factor
+ pad = self.kernel.shape[0] - factor
+ self.pad = (pad + 1) // 2, pad // 2
+
+ def forward(self, x):
+ out = upfirdn2d(x, self.kernel, up=1, down=self.factor, pad=self.pad)
+ return out
+
+ def __repr__(self):
+ return f'{self.__class__.__name__}(factor={self.factor})'
+
+
+class UpFirDnSmooth(nn.Layer):
+ """Upsample, FIR filter, and downsample (smooth version).
+
+ Args:
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+ magnitude.
+ upsample_factor (int): Upsampling scale factor. Default: 1.
+ downsample_factor (int): Downsampling scale factor. Default: 1.
+        kernel_size (int): Kernel size. Default: 1.
+ """
+ def __init__(self,
+ resample_kernel,
+ upsample_factor=1,
+ downsample_factor=1,
+ kernel_size=1):
+ super(UpFirDnSmooth, self).__init__()
+ self.upsample_factor = upsample_factor
+ self.downsample_factor = downsample_factor
+ self.kernel = make_resample_kernel(resample_kernel)
+ if upsample_factor > 1:
+ self.kernel = self.kernel * upsample_factor**2
+ if upsample_factor > 1:
+ pad = self.kernel.shape[0] - upsample_factor - (kernel_size - 1)
+ self.pad = (pad + 1) // 2 + upsample_factor - 1, pad // 2 + 1
+ elif downsample_factor > 1:
+ pad = self.kernel.shape[0] - downsample_factor + (kernel_size - 1)
+ self.pad = (pad + 1) // 2, pad // 2
+ else:
+ raise NotImplementedError
+
+ def forward(self, x):
+ out = upfirdn2d(x, self.kernel, up=1, down=1, pad=self.pad)
+ return out
+
+ def __repr__(self):
+ return (
+ f'{self.__class__.__name__}(upsample_factor={self.upsample_factor}, \
+ downsample_factor={self.downsample_factor})')
+
+
+class EqualLinear(nn.Layer):
+ """This linear layer class stabilizes the learning rate changes of its parameters.
+ Equalizing learning rate keeps the weights in the network at a similar scale during training.
+ """
+ def __init__(self,
+ in_dim,
+ out_dim,
+ bias=True,
+ bias_init_val=0,
+ lr_mul=1,
+ activation=None):
+ super().__init__()
+
+ self.weight = paddle.create_parameter(
+ (in_dim, out_dim),
+ default_initializer=nn.initializer.Normal(),
+ dtype='float32')
+ self.weight.set_value((self.weight / lr_mul))
+
+ if bias:
+ self.bias = self.create_parameter(
+ (out_dim, ), nn.initializer.Constant(bias_init_val))
+
+ else:
+ self.bias = None
+
+ self.activation = activation
+
+ self.scale = (1 / math.sqrt(in_dim)) * lr_mul
+ self.lr_mul = lr_mul
+
+ def forward(self, input):
+ if self.activation:
+ out = F.linear(input, self.weight * self.scale)
+ out = fused_leaky_relu(out, self.bias * self.lr_mul)
+
+ else:
+ out = F.linear(input,
+ self.weight * self.scale,
+ bias=self.bias * self.lr_mul)
+
+ return out
+
+ def __repr__(self):
+ return (
+ f"{self.__class__.__name__}({self.weight.shape[0]}, {self.weight.shape[1]})"
+ )
+
+
+class ModulatedConv2d(nn.Layer):
+ """Modulated Conv2d used in StyleGAN2.
+
+ There is no bias in ModulatedConv2d.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ kernel_size (int): Size of the convolving kernel.
+ num_style_feat (int): Channel number of style features.
+ demodulate (bool): Whether to demodulate in the conv layer.
+ Default: True.
+ sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
+ Default: None.
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+ magnitude. Default: (1, 3, 3, 1).
+ eps (float): A value added to the denominator for numerical stability.
+ Default: 1e-8.
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ num_style_feat,
+ demodulate=True,
+ sample_mode=None,
+ resample_kernel=(1, 3, 3, 1),
+ eps=1e-08):
+ super(ModulatedConv2d, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.demodulate = demodulate
+ self.sample_mode = sample_mode
+ self.eps = eps
+ if self.sample_mode == 'upsample':
+ self.smooth = UpFirDnSmooth(resample_kernel,
+ upsample_factor=2,
+ downsample_factor=1,
+ kernel_size=kernel_size)
+ elif self.sample_mode == 'downsample':
+ self.smooth = UpFirDnSmooth(resample_kernel,
+ upsample_factor=1,
+ downsample_factor=2,
+ kernel_size=kernel_size)
+ elif self.sample_mode is None:
+ pass
+ else:
+ raise ValueError(
+ f"Wrong sample mode {self.sample_mode}, supported ones are ['upsample', 'downsample', None]."
+ )
+ self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
+ self.modulation = EqualLinear(num_style_feat,
+ in_channels,
+ bias=True,
+ bias_init_val=1,
+ lr_mul=1,
+ activation=None)
+ self.weight = paddle.create_parameter(
+ shape=[1, out_channels, in_channels, kernel_size, kernel_size],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Normal())
+ self.padding = kernel_size // 2
+
+ def forward(self, x, style):
+ """Forward function.
+
+ Args:
+ x (Tensor): Tensor with shape (b, c, h, w).
+ style (Tensor): Tensor with shape (b, num_style_feat).
+
+ Returns:
+ Tensor: Modulated tensor after convolution.
+ """
+ b, c, h, w = x.shape
+ style = self.modulation(style).reshape([b, 1, c, 1, 1])
+ weight = self.scale * self.weight * style
+ if self.demodulate:
+ demod = paddle.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps)
+ weight = weight * demod.reshape([b, self.out_channels, 1, 1, 1])
+ weight = weight.reshape(
+ [b * self.out_channels, c, self.kernel_size, self.kernel_size])
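+ # Grouped-convolution trick: the batch is folded into the channel axis and
+ # groups=b is used below, so each sample is convolved with its own
+ # per-sample modulated weight in a single conv call.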
+ if self.sample_mode == 'upsample':
+ x = x.reshape([1, b * c, h, w])
+ weight = weight.reshape(
+ [b, self.out_channels, c, self.kernel_size, self.kernel_size])
+ weight = weight.transpose([0, 2, 1, 3, 4]).reshape(
+ [b * c, self.out_channels, self.kernel_size, self.kernel_size])
+ out = F.conv2d_transpose(x, weight, padding=0, stride=2, groups=b)
+ out = out.reshape([b, self.out_channels, *out.shape[2:4]])
+ out = self.smooth(out)
+ elif self.sample_mode == 'downsample':
+ x = self.smooth(x)
+ x = x.reshape([1, b * c, *x.shape[2:4]])
+ out = F.conv2d(x, weight, padding=0, stride=2, groups=b)
+ out = out.reshape([b, self.out_channels, *out.shape[2:4]])
+ else:
+ x = x.reshape([1, b * c, h, w])
+ out = F.conv2d(x, weight, padding=self.padding, groups=b)
+ out = out.reshape([b, self.out_channels, *out.shape[2:4]])
+ return out
+
+ def __repr__(self):
+ return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
+ f'out_channels={self.out_channels}, '
+ f'kernel_size={self.kernel_size}, '
+ f'demodulate={self.demodulate}, '
+ f'sample_mode={self.sample_mode})')
+
+
+class StyleConv(nn.Layer):
+ """Style conv.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ kernel_size (int): Size of the convolving kernel.
+ num_style_feat (int): Channel number of style features.
+ demodulate (bool): Whether demodulate in the conv layer. Default: True.
+ sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
+ Default: None.
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+ magnitude. Default: (1, 3, 3, 1).
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ num_style_feat,
+ demodulate=True,
+ sample_mode=None,
+ resample_kernel=(1, 3, 3, 1)):
+ super(StyleConv, self).__init__()
+ self.modulated_conv = ModulatedConv2d(in_channels,
+ out_channels,
+ kernel_size,
+ num_style_feat,
+ demodulate=demodulate,
+ sample_mode=sample_mode,
+ resample_kernel=resample_kernel)
+ self.weight = paddle.create_parameter(
+ shape=[1],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Constant(0.))
+ self.activate = FusedLeakyReLU(out_channels)
+
+ def forward(self, x, style, noise=None):
+ out = self.modulated_conv(x, style)
+ if noise is None:
+ b, _, h, w = out.shape
+ noise = paddle.normal(shape=[b, 1, h, w])
+ out = out + self.weight * noise
+ out = self.activate(out)
+ return out
+
+
+class ToRGB(nn.Layer):
+ """To RGB from features.
+
+ Args:
+ in_channels (int): Channel number of input.
+ num_style_feat (int): Channel number of style features.
+ upsample (bool): Whether to upsample. Default: True.
+ resample_kernel (list[int]): A list indicating the 1D resample kernel
+ magnitude. Default: (1, 3, 3, 1).
+ """
+ def __init__(self,
+ in_channels,
+ num_style_feat,
+ upsample=True,
+ resample_kernel=(1, 3, 3, 1)):
+ super(ToRGB, self).__init__()
+ if upsample:
+ self.upsample = UpFirDnUpsample(resample_kernel, factor=2)
+ else:
+ self.upsample = None
+ self.modulated_conv = ModulatedConv2d(in_channels,
+ 3,
+ kernel_size=1,
+ num_style_feat=num_style_feat,
+ demodulate=False,
+ sample_mode=None)
+ self.bias = paddle.create_parameter(
+ shape=[1, 3, 1, 1],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Constant(0))
+
+ def forward(self, x, style, skip=None):
+ """Forward function.
+
+ Args:
+ x (Tensor): Feature tensor with shape (b, c, h, w).
+ style (Tensor): Tensor with shape (b, num_style_feat).
+ skip (Tensor): Base/skip tensor. Default: None.
+
+ Returns:
+ Tensor: RGB images.
+ """
+ out = self.modulated_conv(x, style)
+ out = out + self.bias
+ if skip is not None:
+ if self.upsample:
+ skip = self.upsample(skip)
+ out = out + skip
+ return out
+
+
+class ConstantInput(nn.Layer):
+ """Constant input.
+
+ Args:
+ num_channel (int): Channel number of constant input.
+ size (int): Spatial size of constant input.
+ """
+ def __init__(self, num_channel, size):
+ super(ConstantInput, self).__init__()
+ self.weight = paddle.create_parameter(
+ shape=[1, num_channel, size, size],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Normal())
+
+ def forward(self, batch):
+ out = paddle.tile(self.weight, repeat_times=[batch, 1, 1, 1])
+ return out
+
+
+class FusedLeakyReLU(nn.Layer):
+ def __init__(self, channel, bias=True, negative_slope=0.2, scale=2**0.5):
+ super().__init__()
+ if bias:
+ self.bias = self.create_parameter(
+ (channel, ), default_initializer=nn.initializer.Constant(0.0))
+ else:
+ self.bias = None
+ self.negative_slope = negative_slope
+ self.scale = scale
+
+ def forward(self, input):
+ return fused_leaky_relu(input, self.bias, self.negative_slope,
+ self.scale)
+
+
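+# Plain-Paddle replacement for the fused bias + LeakyReLU op in the reference
+# StyleGAN2 code: add the bias, apply LeakyReLU, then multiply by scale
+# (sqrt(2) by default) to roughly preserve the activation variance.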
+def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5):
+ if bias is not None:
+ rest_dim = [1] * (len(input.shape) - len(bias.shape) - 1)
+ return F.leaky_relu(input + bias.reshape([1, bias.shape[0], *rest_dim]),
+ negative_slope=negative_slope) * scale
+ else:
+ return F.leaky_relu(input, negative_slope=negative_slope) * scale
+
+
+class ScaledLeakyReLU(nn.Layer):
+ """Scaled LeakyReLU.
+
+ Args:
+ negative_slope (float): Negative slope. Default: 0.2.
+ """
+ def __init__(self, negative_slope=0.2):
+ super(ScaledLeakyReLU, self).__init__()
+ self.negative_slope = negative_slope
+
+ def forward(self, x):
+ out = F.leaky_relu(x, negative_slope=self.negative_slope)
+ return out * math.sqrt(2)
+
+
+class EqualConv2d(nn.Layer):
+ """Equalized Linear as StyleGAN2.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ kernel_size (int): Size of the convolving kernel.
+ stride (int): Stride of the convolution. Default: 1
+ padding (int): Zero-padding added to both sides of the input.
+ Default: 0.
+ bias (bool): If ``True``, adds a learnable bias to the output.
+ Default: ``True``.
+ bias_init_val (float): Bias initialized value. Default: 0.
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ bias=True,
+ bias_init_val=0):
+ super(EqualConv2d, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.padding = padding
+ self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
+ self.weight = paddle.create_parameter(
+ shape=[out_channels, in_channels, kernel_size, kernel_size],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Normal())
+ if bias:
+ self.bias = paddle.create_parameter(
+ shape=[out_channels],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Constant(
+ bias_init_val))
+ else:
+ self.bias = None
+
+ def forward(self, x):
+ out = F.conv2d(x,
+ self.weight * self.scale,
+ bias=self.bias,
+ stride=self.stride,
+ padding=self.padding)
+ return out
+
+ def __repr__(self):
+ return (f'{self.__class__.__name__}(in_channels={self.in_channels}, '
+ f'out_channels={self.out_channels}, kernel_size={self.kernel_size}, '
+ f'stride={self.stride}, padding={self.padding}, '
+ f'bias={self.bias is not None})')
+
+
+class ConvLayer(nn.Sequential):
+ """Conv Layer used in StyleGAN2 Discriminator.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ kernel_size (int): Kernel size.
+ downsample (bool): Whether downsample by a factor of 2.
+ Default: False.
+ resample_kernel (list[int]): A list indicating the 1D resample
+ kernel magnitude. An outer product is applied to extend the
+ 1D resample kernel to a 2D resample kernel.
+ Default: (1, 3, 3, 1).
+ bias (bool): Whether to use bias. Default: True.
+ activate (bool): Whether to use activation. Default: True.
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ downsample=False,
+ resample_kernel=(1, 3, 3, 1),
+ bias=True,
+ activate=True):
+ layers = []
+ if downsample:
+ layers.append(
+ UpFirDnSmooth(resample_kernel,
+ upsample_factor=1,
+ downsample_factor=2,
+ kernel_size=kernel_size))
+ stride = 2
+ self.padding = 0
+ else:
+ stride = 1
+ self.padding = kernel_size // 2
+ layers.append(
+ EqualConv2d(in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=self.padding,
+ bias=bias and not activate))
+ if activate:
+ if bias:
+ layers.append(FusedLeakyReLU(out_channels))
+ else:
+ layers.append(ScaledLeakyReLU(0.2))
+ super(ConvLayer, self).__init__(*layers)
+
+
+class ResBlock(nn.Layer):
+ """Residual block used in StyleGAN2 Discriminator.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ resample_kernel (list[int]): A list indicating the 1D resample
+ kernel magnitude. An outer product is applied to extend the
+ 1D resample kernel to a 2D resample kernel.
+ Default: (1, 3, 3, 1).
+ """
+ def __init__(self, in_channels, out_channels, resample_kernel=(1, 3, 3, 1)):
+ super(ResBlock, self).__init__()
+ self.conv1 = ConvLayer(in_channels,
+ in_channels,
+ 3,
+ bias=True,
+ activate=True)
+ self.conv2 = ConvLayer(in_channels,
+ out_channels,
+ 3,
+ downsample=True,
+ resample_kernel=resample_kernel,
+ bias=True,
+ activate=True)
+ self.skip = ConvLayer(in_channels,
+ out_channels,
+ 1,
+ downsample=True,
+ resample_kernel=resample_kernel,
+ bias=False,
+ activate=False)
+
+ def forward(self, x):
+ out = self.conv1(x)
+ out = self.conv2(out)
+ skip = self.skip(x)
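+ # Scale the sum of the two branches by 1/sqrt(2) so the output variance
+ # stays comparable to that of a single branch.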
+ out = (out + skip) / math.sqrt(2.)
+ return out
diff --git a/ppgan/models/generators/gfpganv1_clean_arch.py b/ppgan/models/generators/gfpganv1_clean_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..6568f62641864efcc7b3022c91c9bf8482c8d8cf
--- /dev/null
+++ b/ppgan/models/generators/gfpganv1_clean_arch.py
@@ -0,0 +1,329 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import random
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+from ppgan.models.generators.stylegan2_clean_arch import StyleGAN2GeneratorClean
+from ppgan.models.generators.builder import GENERATORS
+
+
+class StyleGAN2GeneratorCSFT(StyleGAN2GeneratorClean):
+ """StyleGAN2 Generator with SFT modulation (Spatial Feature Transform).
+
+ It is the clean version without custom compiled CUDA extensions used in StyleGAN2.
+
+ Args:
+ out_size (int): The spatial size of outputs.
+ num_style_feat (int): Channel number of style features. Default: 512.
+ num_mlp (int): Layer number of MLP style layers. Default: 8.
+ channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2.
+ narrow (float): The narrow ratio for channels. Default: 1.
+ sft_half (bool): Whether to apply SFT on half of the input channels. Default: False.
+ """
+ def __init__(self,
+ out_size,
+ num_style_feat=512,
+ num_mlp=8,
+ channel_multiplier=2,
+ narrow=1,
+ sft_half=False):
+ super(StyleGAN2GeneratorCSFT,
+ self).__init__(out_size,
+ num_style_feat=num_style_feat,
+ num_mlp=num_mlp,
+ channel_multiplier=channel_multiplier,
+ narrow=narrow)
+ self.sft_half = sft_half
+
+ def forward(self,
+ styles,
+ conditions,
+ input_is_latent=False,
+ noise=None,
+ randomize_noise=True,
+ truncation=1,
+ truncation_latent=None,
+ inject_index=None,
+ return_latents=False):
+ """Forward function for StyleGAN2GeneratorCSFT.
+
+ Args:
+ styles (list[Tensor]): Sample codes of styles.
+ conditions (list[Tensor]): SFT conditions to generators.
+ input_is_latent (bool): Whether input is latent style. Default: False.
+ noise (Tensor | None): Input noise or None. Default: None.
+ randomize_noise (bool): Randomize noise, used when 'noise' is None. Default: True.
+ truncation (float): The truncation ratio. Default: 1.
+ truncation_latent (Tensor | None): The truncation latent tensor. Default: None.
+ inject_index (int | None): The injection index for mixing noise. Default: None.
+ return_latents (bool): Whether to return style latents. Default: False.
+ """
+ if not input_is_latent:
+ styles = [self.style_mlp(s) for s in styles]
+ if noise is None:
+ if randomize_noise:
+ noise = [None] * self.num_layers
+ else:
+ noise = [
+ getattr(self.noises, f'noise{i}')
+ for i in range(self.num_layers)
+ ]
+ if truncation < 1:
+ style_truncation = []
+ for style in styles:
+ style_truncation.append(truncation_latent + truncation *
+ (style - truncation_latent))
+ styles = style_truncation
+ if len(styles) == 1:
+ inject_index = self.num_latent
+ if styles[0].ndim < 3:
+ latent = paddle.tile(styles[0].unsqueeze(1),
+ repeat_times=[1, inject_index, 1])
+ else:
+ latent = styles[0]
+ elif len(styles) == 2:
+ if inject_index is None:
+ inject_index = random.randint(1, self.num_latent - 1)
+ latent1 = paddle.tile(styles[0].unsqueeze(1),
+ repeat_times=[1, inject_index, 1])
+ latent2 = paddle.tile(
+ styles[1].unsqueeze(1),
+ repeat_times=[1, self.num_latent - inject_index, 1])
+ latent = paddle.concat([latent1, latent2], axis=1)
+ out = self.constant_input(latent.shape[0])
+ out = self.style_conv1(out, latent[:, 0], noise=noise[0])
+ skip = self.to_rgb1(out, latent[:, 1])
+ i = 1
+ for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2],
+ self.style_convs[1::2],
+ noise[1::2],
+ noise[2::2],
+ self.to_rgbs):
+ out = conv1(out, latent[:, i], noise=noise1)
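+ # SFT modulation: conditions come from the GFPGAN U-Net as alternating
+ # (scale, shift) maps, so conditions[i - 1] scales and conditions[i] shifts
+ # the features; with sft_half only half of the channels are modulated.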
+ if i < len(conditions):
+ if self.sft_half:
+ out_same, out_sft = paddle.split(out, 2, axis=1)
+
+ out_sft = out_sft * conditions[i - 1] + conditions[i]
+ out = paddle.concat([out_same, out_sft], axis=1)
+ else:
+ out = out * conditions[i - 1] + conditions[i]
+ out = conv2(out, latent[:, i + 1], noise=noise2)
+ skip = to_rgb(out, latent[:, i + 2], skip)
+ i += 2
+ image = skip
+ if return_latents:
+
+ return image, latent
+ else:
+ return image, None
+
+
+class ResBlock(nn.Layer):
+ """Residual block with bilinear upsampling/downsampling.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ mode (str): Upsampling/downsampling mode. Options: down | up. Default: down.
+ """
+ def __init__(self, in_channels, out_channels, mode='down'):
+ super(ResBlock, self).__init__()
+ self.conv1 = nn.Conv2D(in_channels, in_channels, 3, 1, 1)
+ self.conv2 = nn.Conv2D(in_channels, out_channels, 3, 1, 1)
+ self.skip = nn.Conv2D(in_channels, out_channels, 1, bias_attr=False)
+ if mode == 'down':
+ self.scale_factor = 0.5
+ elif mode == 'up':
+ self.scale_factor = 2
+
+ def forward(self, x):
+ out = paddle.nn.functional.leaky_relu(self.conv1(x), negative_slope=0.2)
+ out = F.interpolate(out, scale_factor=self.scale_factor, mode=\
+ 'bilinear', align_corners=False)
+ out = paddle.nn.functional.leaky_relu(self.conv2(out),
+ negative_slope=0.2)
+ x = F.interpolate(x, scale_factor=self.scale_factor, mode=\
+ 'bilinear', align_corners=False)
+ skip = self.skip(x)
+ out = out + skip
+ return out
+
+
+def debug(x):
+ print(type(x))
+ if isinstance(x, list):
+ for i, v in enumerate(x):
+ print(i, v.shape)
+ else:
+ print(0, x.shape)
+
+
+@GENERATORS.register()
+class GFPGANv1Clean(nn.Layer):
+ """The GFPGAN architecture: Unet + StyleGAN2 decoder with SFT.
+
+ It is the clean version without custom compiled CUDA extensions used in StyleGAN2.
+
+ Ref: GFP-GAN: Towards Real-World Blind Face Restoration with Generative Facial Prior.
+
+ Args:
+ out_size (int): The spatial size of outputs.
+ num_style_feat (int): Channel number of style features. Default: 512.
+ channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2.
+ decoder_load_path (str): The path to the pre-trained decoder model (usually, the StyleGAN2). Default: None.
+ fix_decoder (bool): Whether to fix the decoder. Default: True.
+
+ num_mlp (int): Layer number of MLP style layers. Default: 8.
+ input_is_latent (bool): Whether input is latent style. Default: False.
+ different_w (bool): Whether to use different latent w for different layers. Default: False.
+ narrow (float): The narrow ratio for channels. Default: 1.
+ sft_half (bool): Whether to apply SFT on half of the input channels. Default: False.
+ """
+ def __init__(self,
+ out_size,
+ num_style_feat=512,
+ channel_multiplier=1,
+ decoder_load_path=None,
+ fix_decoder=True,
+ num_mlp=8,
+ input_is_latent=False,
+ different_w=False,
+ narrow=1,
+ sft_half=False):
+ super(GFPGANv1Clean, self).__init__()
+ self.input_is_latent = input_is_latent
+ self.different_w = different_w
+ self.num_style_feat = num_style_feat
+ unet_narrow = narrow * 0.5
+ print("unet_narrow", unet_narrow, "channel_multiplier",
+ channel_multiplier)
+ channels = {
+ '4': int(512 * unet_narrow),
+ '8': int(512 * unet_narrow),
+ '16': int(512 * unet_narrow),
+ '32': int(512 * unet_narrow),
+ '64': int(256 * channel_multiplier * unet_narrow),
+ '128': int(128 * channel_multiplier * unet_narrow),
+ '256': int(64 * channel_multiplier * unet_narrow),
+ '512': int(32 * channel_multiplier * unet_narrow),
+ '1024': int(16 * channel_multiplier * unet_narrow)
+ }
+
+ self.log_size = int(math.log(out_size, 2))
+ first_out_size = 2**int(math.log(out_size, 2))
+ self.conv_body_first = nn.Conv2D(3, channels[f'{first_out_size}'], 1)
+ in_channels = channels[f'{first_out_size}']
+ self.conv_body_down = nn.LayerList()
+ for i in range(self.log_size, 2, -1):
+ out_channels = channels[f'{2 ** (i - 1)}']
+ self.conv_body_down.append(
+ ResBlock(in_channels, out_channels, mode='down'))
+ in_channels = out_channels
+ self.final_conv = nn.Conv2D(in_channels, channels['4'], 3, 1, 1)
+ in_channels = channels['4']
+ self.conv_body_up = nn.LayerList()
+ for i in range(3, self.log_size + 1):
+ out_channels = channels[f'{2 ** i}']
+ self.conv_body_up.append(
+ ResBlock(in_channels, out_channels, mode='up'))
+ in_channels = out_channels
+ self.toRGB = nn.LayerList()
+ for i in range(3, self.log_size + 1):
+ self.toRGB.append(nn.Conv2D(channels[f'{2 ** i}'], 3, 1))
+ if different_w:
+ linear_out_channel = (int(math.log(out_size, 2)) * 2 -
+ 2) * num_style_feat
+ else:
+ linear_out_channel = num_style_feat
+ self.final_linear = nn.Linear(channels['4'] * 4 * 4, linear_out_channel)
+ self.stylegan_decoder = StyleGAN2GeneratorCSFT(out_size=out_size,
+ num_style_feat=num_style_feat, num_mlp=num_mlp,
+ channel_multiplier=channel_multiplier, narrow=narrow, sft_half=\
+ sft_half)
+ if decoder_load_path:
+ self.stylegan_decoder.load_state_dict(
+ paddle.load(decoder_load_path)['params_ema'])
+ if fix_decoder:
+ for _, param in self.stylegan_decoder.named_parameters():
+ param.requires_grad = False
+ self.condition_scale = nn.LayerList()
+ self.condition_shift = nn.LayerList()
+ for i in range(3, self.log_size + 1):
+ out_channels = channels[f'{2 ** i}']
+ if sft_half:
+ sft_out_channels = out_channels
+ else:
+ sft_out_channels = out_channels * 2
+ self.condition_scale.append(
+ nn.Sequential(
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(0.2, True),
+ nn.Conv2D(out_channels, sft_out_channels, 3, 1, 1)))
+ self.condition_shift.append(
+ nn.Sequential(
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(0.2, True),
+ nn.Conv2D(out_channels, sft_out_channels, 3, 1, 1)))
+
+ def forward(self,
+ x,
+ return_latents=False,
+ return_rgb=True,
+ randomize_noise=True):
+ """Forward function for GFPGANv1Clean.
+
+ Args:
+ x (Tensor): Input images.
+ return_latents (bool): Whether to return style latents. Default: False.
+ return_rgb (bool): Whether return intermediate rgb images. Default: True.
+ randomize_noise (bool): Randomize noise, used when 'noise' is None. Default: True.
+ """
+ conditions = []
+ unet_skips = []
+ out_rgbs = []
+ feat = paddle.nn.functional.leaky_relu(self.conv_body_first(x),
+ negative_slope=0.2)
+ for i in range(self.log_size - 2):
+ feat = self.conv_body_down[i](feat)
+ unet_skips.insert(0, feat)
+ feat = paddle.nn.functional.leaky_relu(self.final_conv(feat),
+ negative_slope=0.2)
+ style_code = self.final_linear(feat.reshape([feat.shape[0], -1]))
+ if self.different_w:
+ style_code = style_code.reshape(
+ [style_code.shape[0], -1, self.num_style_feat])
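+ # U-Net decoder: fuse the stored encoder skips and, at each resolution,
+ # predict the SFT scale and shift maps (appended to `conditions` in that
+ # order) that modulate the StyleGAN2 decoder below.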
+ for i in range(self.log_size - 2):
+ feat = feat + unet_skips[i]
+ feat = self.conv_body_up[i](feat)
+ scale = self.condition_scale[i](feat)
+ conditions.append(scale.clone())
+ shift = self.condition_shift[i](feat)
+ conditions.append(shift.clone())
+ if return_rgb:
+ out_rgbs.append(self.toRGB[i](feat))
+
+ image, _ = self.stylegan_decoder(styles=[style_code],
+ conditions=conditions,
+ return_latents=return_latents,
+ input_is_latent=self.input_is_latent,
+ randomize_noise=randomize_noise)
+ if return_latents:
+ return image, _
+ else:
+ return image, out_rgbs
diff --git a/ppgan/models/generators/gpen.py b/ppgan/models/generators/gpen.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcf34d4c2938fdf80e3bd811a6178ce6a63320de
--- /dev/null
+++ b/ppgan/models/generators/gpen.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# code was heavily based on https://github.com/yangxy/GPEN
+
+import math
+import paddle
+import paddle.nn as nn
+from ppgan.models.generators.builder import GENERATORS
+from ppgan.models.generators import StyleGANv2Generator
+from ppgan.models.discriminators.discriminator_styleganv2 import ConvLayer
+from ppgan.modules.equalized import EqualLinear
+
+class GPEN(nn.Layer):
+
+ def __init__(
+ self,
+ size,
+ style_dim,
+ n_mlp,
+ channel_multiplier=2,
+ blur_kernel=[1, 3, 3, 1],
+ lr_mlp=0.01,
+ is_concat=True,
+ ):
+ super(GPEN, self).__init__()
+ channels = {
+ 4: 512,
+ 8: 512,
+ 16: 512,
+ 32: 512,
+ 64: 256 * channel_multiplier,
+ 128: 128 * channel_multiplier,
+ 256: 64 * channel_multiplier,
+ 512: 32 * channel_multiplier,
+ 1024: 16 * channel_multiplier,
+ }
+ self.log_size = int(math.log(size, 2))
+ self.generator = StyleGANv2Generator(
+ size,
+ style_dim,
+ n_mlp,
+ channel_multiplier=channel_multiplier,
+ blur_kernel=blur_kernel,
+ lr_mlp=lr_mlp,
+ is_concat=is_concat)
+
+ conv = [ConvLayer(3, channels[size], 1)]
+ self.ecd0 = nn.Sequential(*conv)
+ in_channel = channels[size]
+
+ self.names = ['ecd%d' % i for i in range(self.log_size - 1)]
+ for i in range(self.log_size, 2, -1):
+ out_channel = channels[2**(i - 1)]
+ conv = [ConvLayer(in_channel, out_channel, 3, downsample=True)]
+ setattr(self, self.names[self.log_size - i + 1],
+ nn.Sequential(*conv))
+ in_channel = out_channel
+ self.final_linear = nn.Sequential(
+ EqualLinear(channels[4] * 4 * 4,
+ style_dim,
+ activation='fused_lrelu'))
+
+ def forward(
+ self,
+ inputs,
+ return_latents=False,
+ inject_index=None,
+ truncation=1,
+ truncation_latent=None,
+ input_is_latent=False,
+ ):
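+ # GPEN encoder: each downsampling stage stores its feature map; the flattened
+ # deepest feature is mapped to a style code by final_linear, and the stored
+ # features are passed to the StyleGANv2 generator as spatial noise in
+ # coarse-to-fine order (noise[::-1]).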
+ noise = []
+ for i in range(self.log_size - 1):
+ ecd = getattr(self, self.names[i])
+ inputs = ecd(inputs)
+ noise.append(inputs)
+ inputs = inputs.reshape([inputs.shape[0], -1])
+ outs = self.final_linear(inputs)
+ outs = self.generator([outs], return_latents, inject_index, truncation,
+ truncation_latent, input_is_latent,
+ noise=noise[::-1])
+ return outs
diff --git a/ppgan/models/generators/hook.py b/ppgan/models/generators/hook.py
index ba1bcd4819096a4f7eb77a036a897ddf7122e3a2..0ff12f439de6f67a58d672f29ac91fc8610e7041 100644
--- a/ppgan/models/generators/hook.py
+++ b/ppgan/models/generators/hook.py
@@ -1,16 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was based on https://github.com/fastai/fastai
import numpy as np
@@ -91,7 +79,7 @@ class Hooks():
def _hook_inner(m, i, o):
return o if isinstance(
- o, paddle.fluid.framework.Variable) else o if is_listy(o) else list(o)
+ o, paddle.static.Variable) else o if is_listy(o) else list(o)
def hook_output(module, detach=True, grad=False):
diff --git a/ppgan/models/generators/iconvsr.py b/ppgan/models/generators/iconvsr.py
new file mode 100644
index 0000000000000000000000000000000000000000..f97931befc3485ed89adce576c5f0ac333b4b5f6
--- /dev/null
+++ b/ppgan/models/generators/iconvsr.py
@@ -0,0 +1,423 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# basicvsr and iconvsr code are heavily based on mmedit
+import paddle
+import numpy as np
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .builder import GENERATORS
+from .edvr import PCDAlign, TSAFusion
+from .basicvsr import SPyNet, PixelShufflePack, ResidualBlockNoBN, \
+ ResidualBlocksWithInputConv, flow_warp
+from ...utils.download import get_path_from_url
+
+
+@GENERATORS.register()
+class IconVSR(nn.Layer):
+ """BasicVSR network structure for video super-resolution.
+
+ Support only x4 upsampling.
+ Paper:
+ BasicVSR: The Search for Essential Components in Video Super-Resolution
+ and Beyond, CVPR, 2021
+
+ Args:
+ mid_channels (int): Channel number of the intermediate features.
+ Default: 64.
+ num_blocks (int): Number of residual blocks in each propagation branch.
+ Default: 30.
+ padding (int): Number of frames to be padded at two ends of the
+ sequence. 2 for REDS and 3 for Vimeo-90K. Default: 2.
+ keyframe_stride (int): Number determining the keyframes. If stride=5,
+ then the (0, 5, 10, 15, ...)-th frame will be the keyframes.
+ Default: 5.
+ """
+ def __init__(self,
+ mid_channels=64,
+ num_blocks=30,
+ padding=2,
+ keyframe_stride=5):
+
+ super().__init__()
+
+ self.mid_channels = mid_channels
+ self.padding = padding
+ self.keyframe_stride = keyframe_stride
+
+ # optical flow network for feature alignment
+ self.spynet = SPyNet()
+ weight_path = get_path_from_url(
+ 'https://paddlegan.bj.bcebos.com/models/spynet.pdparams')
+ self.spynet.set_state_dict(paddle.load(weight_path))
+
+ # information-refill
+ self.edvr = EDVRFeatureExtractor(num_frames=padding * 2 + 1,
+ center_frame_idx=padding)
+
+ edvr_weight_path = get_path_from_url(
+ 'https://paddlegan.bj.bcebos.com/models/edvrm.pdparams')
+ self.edvr.set_state_dict(paddle.load(edvr_weight_path))
+
+ self.backward_fusion = nn.Conv2D(2 * mid_channels,
+ mid_channels,
+ 3,
+ 1,
+ 1,
+ bias_attr=True)
+ self.forward_fusion = nn.Conv2D(2 * mid_channels,
+ mid_channels,
+ 3,
+ 1,
+ 1,
+ bias_attr=True)
+
+ # propagation branches
+ self.backward_resblocks = ResidualBlocksWithInputConv(
+ mid_channels + 3, mid_channels, num_blocks)
+ self.forward_resblocks = ResidualBlocksWithInputConv(
+ 2 * mid_channels + 3, mid_channels, num_blocks)
+
+ # upsample
+ # self.fusion = nn.Conv2D(mid_channels * 2, mid_channels, 1, 1, 0)
+ self.upsample1 = PixelShufflePack(mid_channels,
+ mid_channels,
+ 2,
+ upsample_kernel=3)
+ self.upsample2 = PixelShufflePack(mid_channels,
+ 64,
+ 2,
+ upsample_kernel=3)
+ self.conv_hr = nn.Conv2D(64, 64, 3, 1, 1)
+ self.conv_last = nn.Conv2D(64, 3, 3, 1, 1)
+ self.img_upsample = nn.Upsample(scale_factor=4,
+ mode='bilinear',
+ align_corners=False)
+
+ # activation function
+ self.lrelu = nn.LeakyReLU(negative_slope=0.1)
+
+ def spatial_padding(self, lrs):
+ """ Apply pdding spatially.
+
+ Since the PCD module in EDVR requires that the resolution is a multiple
+ of 4, we apply padding to the input LR images if their resolution is
+ not divisible by 4.
+
+ Args:
+ lrs (Tensor): Input LR sequence with shape (n, t, c, h, w).
+
+ Returns:
+ Tensor: Padded LR sequence with shape (n, t, c, h_pad, w_pad).
+
+ """
+ n, t, c, h, w = lrs.shape
+
+ pad_h = (4 - h % 4) % 4
+ pad_w = (4 - w % 4) % 4
+
+ # padding
+ lrs = lrs.reshape([-1, c, h, w])
+ lrs = F.pad(lrs, [0, pad_w, 0, pad_h], mode='reflect')
+
+ return lrs.reshape([n, t, c, h + pad_h, w + pad_w])
+
+ def check_if_mirror_extended(self, lrs):
+ """Check whether the input is a mirror-extended sequence.
+
+ If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the
+ (t-1-i)-th frame.
+
+ Args:
+ lrs (tensor): Input LR images with shape (n, t, c, h, w)
+
+ Returns:
+ bool: whether the input is a mirror-extended sequence
+ """
+
+ self.is_mirror_extended = False
+ if lrs.shape[1] % 2 == 0:
+ lrs_1, lrs_2 = paddle.chunk(lrs, 2, axis=1)
+ lrs_2 = paddle.flip(lrs_2, [1])
+ if paddle.norm(lrs_1 - lrs_2) == 0:
+ self.is_mirror_extended = True
+
+ def compute_refill_features(self, lrs, keyframe_idx):
+ """ Compute keyframe features for information-refill.
+ Since EDVR-M is used, padding is performed before feature computation.
+ Args:
+ lrs (Tensor): Input LR images with shape (n, t, c, h, w)
+ keyframe_idx (list(int)): The indices specifying the keyframes.
+ Return:
+ dict(Tensor): The keyframe features. Each key corresponds to the
+ indices in keyframe_idx.
+ """
+
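+ # Reflect the frames at both ends of the sequence so that every keyframe has
+ # a full temporal window of 2 * padding + 1 frames for the EDVR extractor.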
+ if self.padding == 2:
+ lrs = [
+ lrs[:, 4:5, :, :], lrs[:, 3:4, :, :], lrs, lrs[:, -4:-3, :, :],
+ lrs[:, -5:-4, :, :]
+ ]
+ elif self.padding == 3:
+ lrs = [lrs[:, [6, 5, 4]], lrs, lrs[:, [-5, -6, -7]]]
+ lrs = paddle.concat(lrs, axis=1)
+
+ num_frames = 2 * self.padding + 1
+ feats_refill = {}
+ for i in keyframe_idx:
+ feats_refill[i] = self.edvr(lrs[:, i:i + num_frames])
+ return feats_refill
+
+ def compute_flow(self, lrs):
+ """Compute optical flow using SPyNet for feature warping.
+
+ Note that if the input is a mirror-extended sequence, 'flows_forward'
+ is not needed, since it is equal to 'flows_backward.flip(1)'.
+
+ Args:
+ lrs (tensor): Input LR images with shape (n, t, c, h, w)
+
+ Return:
+ tuple(Tensor): Optical flow. 'flows_forward' corresponds to the
+ flows used for forward-time propagation (current to previous).
+ 'flows_backward' corresponds to the flows used for
+ backward-time propagation (current to next).
+ """
+
+ n, t, c, h, w = lrs.shape
+
+ lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w])
+ lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w])
+
+ flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w])
+
+ if self.is_mirror_extended: # flows_forward = flows_backward.flip(1)
+ flows_forward = None
+ else:
+ flows_forward = self.spynet(lrs_2,
+ lrs_1).reshape([n, t - 1, 2, h, w])
+
+ return flows_forward, flows_backward
+
+ def forward(self, lrs):
+ """Forward function for BasicVSR.
+
+ Args:
+ lrs (Tensor): Input LR sequence with shape (n, t, c, h, w).
+
+ Returns:
+ Tensor: Output HR sequence with shape (n, t, c, 4h, 4w).
+ """
+
+ n, t, c, h_input, w_input = lrs.shape
+ assert h_input >= 64 and w_input >= 64, (
+ 'The height and width of inputs should be at least 64, '
+ f'but got {h_input} and {w_input}.')
+
+ # check whether the input is an extended sequence
+ self.check_if_mirror_extended(lrs)
+
+ lrs = self.spatial_padding(lrs)
+ h, w = lrs.shape[3], lrs.shape[4]
+
+ # get the keyframe indices for information-refill
+ keyframe_idx = list(range(0, t, self.keyframe_stride))
+ if keyframe_idx[-1] != t - 1:
+ keyframe_idx.append(t - 1) # the last frame must be a keyframe
+
+ # compute optical flow and the keyframe features for information-refill
+ flows_forward, flows_backward = self.compute_flow(lrs)
+ feats_refill = self.compute_refill_features(lrs, keyframe_idx)
+
+ # backward-time propagation
+ outputs = []
+
+ feat_prop = paddle.to_tensor(
+ np.zeros([n, self.mid_channels, h, w], 'float32'))
+ for i in range(t - 1, -1, -1):
+ # no warping required for the last timestep
+ if i < t - 1:
+ flow = flows_backward[:, i, :, :, :]
+ feat_prop = flow_warp(feat_prop, flow.transpose([0, 2, 3, 1]))
+
+ # information refill
+ if i in keyframe_idx:
+ feat_prop = paddle.concat([feat_prop, feats_refill[i]], axis=1)
+ feat_prop = self.backward_fusion(feat_prop)
+
+ feat_prop = paddle.concat([lrs[:, i, :, :, :], feat_prop], axis=1)
+ feat_prop = self.backward_resblocks(feat_prop)
+
+ outputs.append(feat_prop)
+ outputs = outputs[::-1]
+
+ # forward-time propagation and upsampling
+ feat_prop = paddle.zeros_like(feat_prop)
+ for i in range(0, t):
+ lr_curr = lrs[:, i, :, :, :]
+ if i > 0: # no warping required for the first timestep
+ if flows_forward is not None:
+ flow = flows_forward[:, i - 1, :, :, :]
+ else:
+ flow = flows_backward[:, -i, :, :, :]
+ feat_prop = flow_warp(feat_prop, flow.transpose([0, 2, 3, 1]))
+
+ # information refill
+ if i in keyframe_idx:
+ feat_prop = paddle.concat([feat_prop, feats_refill[i]], axis=1)
+ feat_prop = self.forward_fusion(feat_prop)
+
+ feat_prop = paddle.concat([lr_curr, outputs[i], feat_prop], axis=1)
+ feat_prop = self.forward_resblocks(feat_prop)
+
+ # upsampling given the backward and forward features
+ out = self.lrelu(self.upsample1(feat_prop))
+ out = self.lrelu(self.upsample2(out))
+ out = self.lrelu(self.conv_hr(out))
+ out = self.conv_last(out)
+ base = self.img_upsample(lr_curr)
+ out += base
+ outputs[i] = out
+
+ return paddle.stack(outputs, axis=1)
+
+
+class EDVRFeatureExtractor(nn.Layer):
+ """EDVR feature extractor for information-refill in IconVSR.
+
+ We use EDVR-M in IconVSR. The pretrained EDVR-M weights are loaded
+ in IconVSR via `set_state_dict`.
+
+ Paper:
+ EDVR: Video Restoration with Enhanced Deformable Convolutional Networks.
+ Args:
+ in_channels (int): Channel number of inputs.
+ out_channels (int): Channel number of outputs.
+ mid_channels (int): Channel number of intermediate features.
+ Default: 64.
+ num_frames (int): Number of input frames. Default: 5.
+ deform_groups (int): Deformable groups. Defaults: 8.
+ num_blocks_extraction (int): Number of blocks for feature extraction.
+ Default: 5.
+ num_blocks_reconstruction (int): Number of blocks for reconstruction.
+ Default: 10.
+ center_frame_idx (int): The index of center frame. Frame counting from
+ 0. Default: 2.
+ with_tsa (bool): Whether to use TSA module. Default: True.
+ """
+ def __init__(self,
+ in_channels=3,
+ out_channel=3,
+ mid_channels=64,
+ num_frames=5,
+ deform_groups=8,
+ num_blocks_extraction=5,
+ num_blocks_reconstruction=10,
+ center_frame_idx=2,
+ with_tsa=True):
+
+ super().__init__()
+
+ self.center_frame_idx = center_frame_idx
+ self.with_tsa = with_tsa
+
+ self.conv_first = nn.Conv2D(in_channels, mid_channels, 3, 1, 1)
+ self.feature_extraction = make_layer(ResidualBlockNoBN,
+ num_blocks_extraction,
+ nf=mid_channels)
+
+ # generate pyramid features
+ self.feat_l2_conv1 = nn.Conv2D(mid_channels, mid_channels, 3, 2, 1)
+ self.feat_l2_conv2 = nn.Conv2D(mid_channels, mid_channels, 3, 1, 1)
+ self.feat_l3_conv1 = nn.Conv2D(mid_channels, mid_channels, 3, 2, 1)
+ self.feat_l3_conv2 = nn.Conv2D(mid_channels, mid_channels, 3, 1, 1)
+
+ # pcd alignment
+ self.pcd_alignment = PCDAlign(nf=mid_channels, groups=deform_groups)
+ # fusion
+ if self.with_tsa:
+ self.fusion = TSAFusion(nf=mid_channels,
+ nframes=num_frames,
+ center=self.center_frame_idx)
+ else:
+ self.fusion = nn.Conv2D(num_frames * mid_channels, mid_channels, 1,
+ 1)
+
+ # activation function
+ self.lrelu = nn.LeakyReLU(negative_slope=0.1)
+
+ def forward(self, x):
+ """Forward function for EDVRFeatureExtractor.
+ Args:
+ x (Tensor): Input tensor with shape (n, t, 3, h, w).
+ Returns:
+ Tensor: Intermediate feature with shape (n, mid_channels, h, w).
+ """
+
+ n, t, c, h, w = x.shape
+
+ # extract LR features
+ # L1
+ l1_feat = self.lrelu(self.conv_first(x.reshape([-1, c, h, w])))
+ l1_feat = self.feature_extraction(l1_feat)
+ # L2
+ l2_feat = self.lrelu(
+ self.feat_l2_conv2(self.lrelu(self.feat_l2_conv1(l1_feat))))
+ # L3
+ l3_feat = self.lrelu(
+ self.feat_l3_conv2(self.lrelu(self.feat_l3_conv1(l2_feat))))
+
+ l1_feat = l1_feat.reshape([n, t, -1, h, w])
+ l2_feat = l2_feat.reshape([n, t, -1, h // 2, w // 2])
+ l3_feat = l3_feat.reshape([n, t, -1, h // 4, w // 4])
+
+ # pcd alignment
+ ref_feats = [ # reference feature list
+ l1_feat[:, self.center_frame_idx, :, :, :].clone(),
+ l2_feat[:, self.center_frame_idx, :, :, :].clone(),
+ l3_feat[:, self.center_frame_idx, :, :, :].clone()
+ ]
+ aligned_feat = []
+ for i in range(t):
+ neighbor_feats = [
+ l1_feat[:, i, :, :, :].clone(), l2_feat[:, i, :, :, :].clone(),
+ l3_feat[:, i, :, :, :].clone()
+ ]
+ aligned_feat.append(self.pcd_alignment(neighbor_feats, ref_feats))
+ aligned_feat = paddle.stack(aligned_feat, axis=1) # (n, t, c, h, w)
+
+ if self.with_tsa:
+ feat = self.fusion(aligned_feat)
+ else:
+ aligned_feat = aligned_feat.reshape([n, -1, h, w])
+ feat = self.fusion(aligned_feat)
+
+ return feat
+
+
+def make_layer(block, num_blocks, **kwarg):
+ """Make layers by stacking the same blocks.
+ Args:
+ block (nn.Layer): nn.module class for basic block.
+ num_blocks (int): number of blocks.
+ Returns:
+ nn.Sequential: Stacked blocks in nn.Sequential.
+ """
+ layers = []
+ for _ in range(num_blocks):
+ layers.append(block(**kwarg))
+ return nn.Sequential(*layers)
diff --git a/ppgan/models/generators/invdn.py b/ppgan/models/generators/invdn.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dc64eb9e21c860dea091e37326b41d18ac0c645
--- /dev/null
+++ b/ppgan/models/generators/invdn.py
@@ -0,0 +1,199 @@
+# code was heavily based on https://github.com/Yang-Liu1082/InvDN
+
+from itertools import repeat
+import collections.abc
+import math
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .builder import GENERATORS
+
+
+class ResBlock(nn.Layer):
+ def __init__(self, channel_in, channel_out):
+ super(ResBlock, self).__init__()
+ feature = 64
+ weight_attr, bias_attr = self._init_weights()
+ self.conv1 = nn.Conv2D(channel_in,
+ feature,
+ kernel_size=3,
+ padding=1,
+ weight_attr=weight_attr,
+ bias_attr=bias_attr)
+ self.relu1 = nn.LeakyReLU(negative_slope=0.2)
+ self.conv2 = nn.Conv2D(feature,
+ feature,
+ kernel_size=3,
+ padding=1,
+ weight_attr=weight_attr,
+ bias_attr=bias_attr)
+ self.conv3 = nn.Conv2D((feature + channel_in),
+ channel_out,
+ kernel_size=3,
+ padding=1,
+ weight_attr=weight_attr,
+ bias_attr=bias_attr)
+
+ def forward(self, x):
+ residual = self.relu1(self.conv1(x))
+ residual = self.relu1(self.conv2(residual))
+ input = paddle.concat((x, residual), 1)
+ out = self.conv3(input)
+ return out
+
+ def _init_weights(self):
+ weight_attr = paddle.ParamAttr(
+ initializer=paddle.nn.initializer.KaimingUniform(
+ negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
+ bias_attr = paddle.ParamAttr(
+ initializer=paddle.nn.initializer.KaimingUniform(
+ negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
+ return weight_attr, bias_attr
+
+
+class InvBlockExp(nn.Layer):
+ def __init__(self,
+ subnet_constructor,
+ channel_num,
+ channel_split_num,
+ clamp=1.):
+ super(InvBlockExp, self).__init__()
+
+ self.split_len1 = channel_split_num #3
+ self.split_len2 = channel_num - channel_split_num #12-3
+
+ self.clamp = clamp
+
+ self.F = subnet_constructor(self.split_len2, self.split_len1) #9->3
+ self.G = subnet_constructor(self.split_len1, self.split_len2) #3->9
+ self.H = subnet_constructor(self.split_len1, self.split_len2) #3->9
+
+ def forward(self, x, rev=False):
+ x1 = paddle.slice(x, [1], [0], [self.split_len1]) # low-resolution image
+ x2 = paddle.slice(x, [1], [self.split_len1],
+ [self.split_len1 + self.split_len2]) # high-frequency component
+
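+ # Affine coupling: y1 = x1 + F(x2) and y2 = x2 * exp(s) + G(y1), which can be
+ # inverted exactly by running the same operations in reverse when rev=True.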
+ if not rev:
+ y1 = x1 + self.F(x2)
+ self.s = self.clamp * (F.sigmoid(self.H(y1)) * 2 - 1)
+ y2 = x2.multiply(paddle.exp(self.s)) + self.G(y1)
+ else:
+ self.s = self.clamp * (F.sigmoid(self.H(x1)) * 2 - 1)
+ y2 = (x2 - self.G(x1)).divide(paddle.exp(self.s))
+ y1 = x1 - self.F(y2)
+
+ return paddle.concat((y1, y2), 1)
+
+
+class HaarDownsampling(nn.Layer):
+ def __init__(self, channel_in):
+ super(HaarDownsampling, self).__init__()
+ self.channel_in = channel_in
+
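+ # Fixed (non-trainable) 2x2 Haar filters: one averaging filter and three
+ # difference filters (horizontal, vertical, diagonal), giving a 4x-channel,
+ # half-resolution wavelet decomposition of the input.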
+ self.haar_weights = paddle.ones([4, 1, 2, 2])
+
+ self.haar_weights[1, 0, 0, 1] = -1
+ self.haar_weights[1, 0, 1, 1] = -1
+
+ self.haar_weights[2, 0, 1, 0] = -1
+ self.haar_weights[2, 0, 1, 1] = -1
+
+ self.haar_weights[3, 0, 1, 0] = -1
+ self.haar_weights[3, 0, 0, 1] = -1
+
+ self.haar_weights = paddle.concat([self.haar_weights] * self.channel_in,
+ 0)
+ self.haar_weights = paddle.create_parameter(
+ shape=self.haar_weights.shape,
+ dtype=str(self.haar_weights.numpy().dtype),
+ default_initializer=paddle.nn.initializer.Assign(self.haar_weights))
+ self.haar_weights.stop_gradient = True
+
+ def forward(self, x, rev=False):
+ if not rev:
+ self.elements = x.shape[1] * x.shape[2] * x.shape[3]
+
+ out = F.conv2d(x,
+ self.haar_weights,
+ bias=None,
+ stride=2,
+ groups=self.channel_in) / 4.0
+ out = out.reshape([
+ x.shape[0], self.channel_in, 4, x.shape[2] // 2, x.shape[3] // 2
+ ])
+ out = paddle.transpose(out, [0, 2, 1, 3, 4])
+
+ out = out.reshape([
+ x.shape[0], self.channel_in * 4, x.shape[2] // 2,
+ x.shape[3] // 2
+ ])
+
+ return out
+ else:
+ self.elements = x.shape[1] * x.shape[2] * x.shape[3]
+
+ out = x.reshape(
+ [x.shape[0], 4, self.channel_in, x.shape[2], x.shape[3]])
+ out = paddle.transpose(out, [0, 2, 1, 3, 4])
+ out = out.reshape(
+ [x.shape[0], self.channel_in * 4, x.shape[2], x.shape[3]])
+ return F.conv2d_transpose(out,
+ self.haar_weights,
+ bias=None,
+ stride=2,
+ groups=self.channel_in)
+
+
+@GENERATORS.register()
+class InvDN(nn.Layer):
+ def __init__(self,
+ channel_in=3,
+ channel_out=3,
+ block_num=[8, 8],
+ scale=4,
+ down_num=2):
+ super(InvDN, self).__init__()
+
+ operations = []
+
+ current_channel = channel_in
+
+ subnet_constructor = constructor
+
+ self.down_num = int(math.log(scale, 2))
+ assert self.down_num == down_num
+
+ for i in range(self.down_num):
+ b = HaarDownsampling(current_channel)
+ operations.append(b)
+ current_channel *= 4
+ for j in range(block_num[i]):
+ b = InvBlockExp(subnet_constructor, current_channel,
+ channel_out)
+ operations.append(b)
+
+ self.operations = nn.LayerList(operations)
+
+ def forward(self, x, noise):
+
+ #forward
+ out = x
+ for op in self.operations:
+ out = op.forward(out, False)
+ lq = out
+
+ #backward
+ _, _, H, W = lq.shape
+ noise = noise[:, :, :H, :W]
+ out = paddle.concat((out[:, :3, :, :], noise), axis=1)
+ for op in reversed(self.operations):
+ out = op.forward(out, True)
+
+ return out, lq
+
+
+def constructor(channel_in, channel_out):
+ return ResBlock(channel_in, channel_out)
diff --git a/ppgan/models/generators/lesrcnn.py b/ppgan/models/generators/lesrcnn.py
index 19befcf4125ce0c1e6ea23e74fc53b2a591d31bd..7bb2a4e99b05da1edb407a5515a8f15d364d0cce 100644
--- a/ppgan/models/generators/lesrcnn.py
+++ b/ppgan/models/generators/lesrcnn.py
@@ -1,3 +1,4 @@
+# code was based on https://github.com/hellloxiaotian/LESRCNN
import math
import numpy as np
diff --git a/ppgan/models/generators/makeup.py b/ppgan/models/generators/makeup.py
index 5f80ae4f4106f6f0bb07af840757d2b02ca3685d..4ac8178fb982a18d932e30a2b767733ffaa67f05 100644
--- a/ppgan/models/generators/makeup.py
+++ b/ppgan/models/generators/makeup.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/wtjiang98/PSGAN
+# MIT License
+# Copyright (c) 2020 Wentao Jiang
+
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
diff --git a/ppgan/models/generators/mpr.py b/ppgan/models/generators/mpr.py
new file mode 100644
index 0000000000000000000000000000000000000000..9be802a19b437805a3ca0e988b9591d37dceda8f
--- /dev/null
+++ b/ppgan/models/generators/mpr.py
@@ -0,0 +1,514 @@
+# code was based on https://github.com/swz30/MPRNet
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/swz30/MPRNet/blob/main/LICENSE.md
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ...modules.init import kaiming_normal_, constant_
+
+from .builder import GENERATORS
+
+
+def conv(in_channels, out_channels, kernel_size, bias_attr=False, stride=1):
+ return nn.Conv2D(in_channels,
+ out_channels,
+ kernel_size,
+ padding=(kernel_size // 2),
+ bias_attr=bias_attr,
+ stride=stride)
+
+
+## Channel Attention Layer
+class CALayer(nn.Layer):
+ def __init__(self, channel, reduction=16, bias_attr=False):
+ super(CALayer, self).__init__()
+ # global average pooling: feature --> point
+ self.avg_pool = nn.AdaptiveAvgPool2D(1)
+ # feature channel downscale and upscale --> channel weight
+ self.conv_du = nn.Sequential(
+ nn.Conv2D(channel,
+ channel // reduction,
+ 1,
+ padding=0,
+ bias_attr=bias_attr), nn.ReLU(),
+ nn.Conv2D(channel // reduction,
+ channel,
+ 1,
+ padding=0,
+ bias_attr=bias_attr), nn.Sigmoid())
+
+ def forward(self, x):
+ y = self.avg_pool(x)
+ y = self.conv_du(y)
+ return x * y
+
+
+## Channel Attention Block (CAB)
+class CAB(nn.Layer):
+ def __init__(self, n_feat, kernel_size, reduction, bias_attr, act):
+ super(CAB, self).__init__()
+ modules_body = []
+ modules_body.append(
+ conv(n_feat, n_feat, kernel_size, bias_attr=bias_attr))
+ modules_body.append(act)
+ modules_body.append(
+ conv(n_feat, n_feat, kernel_size, bias_attr=bias_attr))
+
+ self.CA = CALayer(n_feat, reduction, bias_attr=bias_attr)
+ self.body = nn.Sequential(*modules_body)
+
+ def forward(self, x):
+ res = self.body(x)
+ res = self.CA(res)
+ res += x
+ return res
+
+
+##---------- Resizing Modules ----------
+class DownSample(nn.Layer):
+ def __init__(self, in_channels, s_factor):
+ super(DownSample, self).__init__()
+ self.down = nn.Sequential(
+ nn.Upsample(scale_factor=0.5, mode='bilinear', align_corners=False),
+ nn.Conv2D(in_channels,
+ in_channels + s_factor,
+ 1,
+ stride=1,
+ padding=0,
+ bias_attr=False))
+
+ def forward(self, x):
+ x = self.down(x)
+ return x
+
+
+class UpSample(nn.Layer):
+ def __init__(self, in_channels, s_factor):
+ super(UpSample, self).__init__()
+ self.up = nn.Sequential(
+ nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
+ nn.Conv2D(in_channels + s_factor,
+ in_channels,
+ 1,
+ stride=1,
+ padding=0,
+ bias_attr=False))
+
+ def forward(self, x):
+ x = self.up(x)
+ return x
+
+
+class SkipUpSample(nn.Layer):
+ def __init__(self, in_channels, s_factor):
+ super(SkipUpSample, self).__init__()
+ self.up = nn.Sequential(
+ nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
+ nn.Conv2D(in_channels + s_factor,
+ in_channels,
+ 1,
+ stride=1,
+ padding=0,
+ bias_attr=False))
+
+ def forward(self, x, y):
+ x = self.up(x)
+ x = x + y
+ return x
+
+
+##########################################################################
+## U-Net
+class Encoder(nn.Layer):
+ def __init__(self, n_feat, kernel_size, reduction, act, bias_attr,
+ scale_unetfeats, csff):
+ super(Encoder, self).__init__()
+
+ self.encoder_level1 = [
+ CAB(n_feat, kernel_size, reduction, bias_attr=bias_attr, act=act)
+ for _ in range(2)
+ ]
+ self.encoder_level2 = [
+ CAB(n_feat + scale_unetfeats,
+ kernel_size,
+ reduction,
+ bias_attr=bias_attr,
+ act=act) for _ in range(2)
+ ]
+ self.encoder_level3 = [
+ CAB(n_feat + (scale_unetfeats * 2),
+ kernel_size,
+ reduction,
+ bias_attr=bias_attr,
+ act=act) for _ in range(2)
+ ]
+
+ self.encoder_level1 = nn.Sequential(*self.encoder_level1)
+ self.encoder_level2 = nn.Sequential(*self.encoder_level2)
+ self.encoder_level3 = nn.Sequential(*self.encoder_level3)
+
+ self.down12 = DownSample(n_feat, scale_unetfeats)
+ self.down23 = DownSample(n_feat + scale_unetfeats, scale_unetfeats)
+
+ # Cross Stage Feature Fusion (CSFF)
+ if csff:
+ self.csff_enc1 = nn.Conv2D(n_feat,
+ n_feat,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.csff_enc2 = nn.Conv2D(n_feat + scale_unetfeats,
+ n_feat + scale_unetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.csff_enc3 = nn.Conv2D(n_feat + (scale_unetfeats * 2),
+ n_feat + (scale_unetfeats * 2),
+ kernel_size=1,
+ bias_attr=bias_attr)
+
+ self.csff_dec1 = nn.Conv2D(n_feat,
+ n_feat,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.csff_dec2 = nn.Conv2D(n_feat + scale_unetfeats,
+ n_feat + scale_unetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.csff_dec3 = nn.Conv2D(n_feat + (scale_unetfeats * 2),
+ n_feat + (scale_unetfeats * 2),
+ kernel_size=1,
+ bias_attr=bias_attr)
+
+ def forward(self, x, encoder_outs=None, decoder_outs=None):
+ enc1 = self.encoder_level1(x)
+ if (encoder_outs is not None) and (decoder_outs is not None):
+ enc1 = enc1 + self.csff_enc1(encoder_outs[0]) + self.csff_dec1(
+ decoder_outs[0])
+
+ x = self.down12(enc1)
+
+ enc2 = self.encoder_level2(x)
+ if (encoder_outs is not None) and (decoder_outs is not None):
+ enc2 = enc2 + self.csff_enc2(encoder_outs[1]) + self.csff_dec2(
+ decoder_outs[1])
+
+ x = self.down23(enc2)
+
+ enc3 = self.encoder_level3(x)
+ if (encoder_outs is not None) and (decoder_outs is not None):
+ enc3 = enc3 + self.csff_enc3(encoder_outs[2]) + self.csff_dec3(
+ decoder_outs[2])
+
+ return [enc1, enc2, enc3]
+
+
+class Decoder(nn.Layer):
+ def __init__(self, n_feat, kernel_size, reduction, act, bias_attr,
+ scale_unetfeats):
+ super(Decoder, self).__init__()
+
+ self.decoder_level1 = [
+ CAB(n_feat, kernel_size, reduction, bias_attr=bias_attr, act=act)
+ for _ in range(2)
+ ]
+ self.decoder_level2 = [
+ CAB(n_feat + scale_unetfeats,
+ kernel_size,
+ reduction,
+ bias_attr=bias_attr,
+ act=act) for _ in range(2)
+ ]
+ self.decoder_level3 = [
+ CAB(n_feat + (scale_unetfeats * 2),
+ kernel_size,
+ reduction,
+ bias_attr=bias_attr,
+ act=act) for _ in range(2)
+ ]
+
+ self.decoder_level1 = nn.Sequential(*self.decoder_level1)
+ self.decoder_level2 = nn.Sequential(*self.decoder_level2)
+ self.decoder_level3 = nn.Sequential(*self.decoder_level3)
+
+ self.skip_attn1 = CAB(n_feat,
+ kernel_size,
+ reduction,
+ bias_attr=bias_attr,
+ act=act)
+ self.skip_attn2 = CAB(n_feat + scale_unetfeats,
+ kernel_size,
+ reduction,
+ bias_attr=bias_attr,
+ act=act)
+
+ self.up21 = SkipUpSample(n_feat, scale_unetfeats)
+ self.up32 = SkipUpSample(n_feat + scale_unetfeats, scale_unetfeats)
+
+ def forward(self, outs):
+ enc1, enc2, enc3 = outs
+ dec3 = self.decoder_level3(enc3)
+
+ x = self.up32(dec3, self.skip_attn2(enc2))
+ dec2 = self.decoder_level2(x)
+
+ x = self.up21(dec2, self.skip_attn1(enc1))
+ dec1 = self.decoder_level1(x)
+
+ return [dec1, dec2, dec3]
+
+
+## Original Resolution Block (ORB)
+class ORB(nn.Layer):
+ def __init__(self, n_feat, kernel_size, reduction, act, bias_attr, num_cab):
+ super(ORB, self).__init__()
+ modules_body = []
+ modules_body = [
+ CAB(n_feat, kernel_size, reduction, bias_attr=bias_attr, act=act)
+ for _ in range(num_cab)
+ ]
+ modules_body.append(conv(n_feat, n_feat, kernel_size))
+ self.body = nn.Sequential(*modules_body)
+
+ def forward(self, x):
+ res = self.body(x)
+ res += x
+ return res
+
+
+class ORSNet(nn.Layer):
+ def __init__(self, n_feat, scale_orsnetfeats, kernel_size, reduction, act,
+ bias_attr, scale_unetfeats, num_cab):
+ super(ORSNet, self).__init__()
+
+ self.orb1 = ORB(n_feat + scale_orsnetfeats, kernel_size, reduction, act,
+ bias_attr, num_cab)
+ self.orb2 = ORB(n_feat + scale_orsnetfeats, kernel_size, reduction, act,
+ bias_attr, num_cab)
+ self.orb3 = ORB(n_feat + scale_orsnetfeats, kernel_size, reduction, act,
+ bias_attr, num_cab)
+
+ self.up_enc1 = UpSample(n_feat, scale_unetfeats)
+ self.up_dec1 = UpSample(n_feat, scale_unetfeats)
+
+ self.up_enc2 = nn.Sequential(
+ UpSample(n_feat + scale_unetfeats, scale_unetfeats),
+ UpSample(n_feat, scale_unetfeats))
+ self.up_dec2 = nn.Sequential(
+ UpSample(n_feat + scale_unetfeats, scale_unetfeats),
+ UpSample(n_feat, scale_unetfeats))
+
+ self.conv_enc1 = nn.Conv2D(n_feat,
+ n_feat + scale_orsnetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.conv_enc2 = nn.Conv2D(n_feat,
+ n_feat + scale_orsnetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.conv_enc3 = nn.Conv2D(n_feat,
+ n_feat + scale_orsnetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+
+ self.conv_dec1 = nn.Conv2D(n_feat,
+ n_feat + scale_orsnetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.conv_dec2 = nn.Conv2D(n_feat,
+ n_feat + scale_orsnetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+ self.conv_dec3 = nn.Conv2D(n_feat,
+ n_feat + scale_orsnetfeats,
+ kernel_size=1,
+ bias_attr=bias_attr)
+
+ def forward(self, x, encoder_outs, decoder_outs):
+ x = self.orb1(x)
+ x = x + self.conv_enc1(encoder_outs[0]) + self.conv_dec1(
+ decoder_outs[0])
+
+ x = self.orb2(x)
+ x = x + self.conv_enc2(self.up_enc1(encoder_outs[1])) + self.conv_dec2(
+ self.up_dec1(decoder_outs[1]))
+
+ x = self.orb3(x)
+ x = x + self.conv_enc3(self.up_enc2(encoder_outs[2])) + self.conv_dec3(
+ self.up_dec2(decoder_outs[2]))
+
+ return x
+
+
+# Supervised Attention Module
+class SAM(nn.Layer):
+ def __init__(self, n_feat, kernel_size, bias_attr):
+ super(SAM, self).__init__()
+ self.conv1 = conv(n_feat, n_feat, kernel_size, bias_attr=bias_attr)
+ self.conv2 = conv(n_feat, 3, kernel_size, bias_attr=bias_attr)
+ self.conv3 = conv(3, n_feat, kernel_size, bias_attr=bias_attr)
+
+ def forward(self, x, x_img):
+ x1 = self.conv1(x)
+ img = self.conv2(x) + x_img
+ x2 = F.sigmoid(self.conv3(img))
+ x1 = x1 * x2
+ x1 = x1 + x
+ return x1, img
+
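+# Editor's note: a minimal, hedged shape sketch for SAM (not part of the
+# original model code). SAM takes the stage's decoder features and the stage
+# input image, and returns attention-refined features together with that
+# stage's restored image.
+def _sam_shape_demo(n_feat=96):
+    sam = SAM(n_feat, kernel_size=1, bias_attr=False)
+    feat = paddle.rand([1, n_feat, 64, 64])
+    img = paddle.rand([1, 3, 64, 64])
+    out_feat, out_img = sam(feat, img)
+    # out_feat keeps the feature shape, out_img is the 3-channel stage output.
+    return out_feat.shape, out_img.shape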
+
+@GENERATORS.register()
+class MPRNet(nn.Layer):
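+    """MPRNet network structure for image restoration.
+
+    Refer to: Multi-Stage Progressive Image Restoration, CVPR, 2021.
+
+    Args:
+        in_c (int): Channel number of the input image. Default: 3.
+        out_c (int): Channel number of the output image. Default: 3.
+        n_feat (int): Channel number of the shallow features. Default: 96.
+        scale_unetfeats (int): Channel growth per level in the U-Net encoder/decoder. Default: 48.
+        scale_orsnetfeats (int): Extra channels used in ORSNet. Default: 32.
+        num_cab (int): Number of CABs in each ORB. Default: 8.
+        kernel_size (int): Kernel size of the convolutions. Default: 3.
+        reduction (int): Channel reduction ratio of the channel attention. Default: 4.
+        bias_attr (bool): Whether to use bias in the convolutions. Default: False.
+    """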
+ def __init__(self,
+ in_c=3,
+ out_c=3,
+ n_feat=96,
+ scale_unetfeats=48,
+ scale_orsnetfeats=32,
+ num_cab=8,
+ kernel_size=3,
+ reduction=4,
+ bias_attr=False):
+ super(MPRNet, self).__init__()
+ act = nn.PReLU()
+ self.shallow_feat1 = nn.Sequential(
+ conv(in_c, n_feat, kernel_size, bias_attr=bias_attr),
+ CAB(n_feat, kernel_size, reduction, bias_attr=bias_attr, act=act))
+ self.shallow_feat2 = nn.Sequential(
+ conv(in_c, n_feat, kernel_size, bias_attr=bias_attr),
+ CAB(n_feat, kernel_size, reduction, bias_attr=bias_attr, act=act))
+ self.shallow_feat3 = nn.Sequential(
+ conv(in_c, n_feat, kernel_size, bias_attr=bias_attr),
+ CAB(n_feat, kernel_size, reduction, bias_attr=bias_attr, act=act))
+
+ # Cross Stage Feature Fusion (CSFF)
+ self.stage1_encoder = Encoder(n_feat,
+ kernel_size,
+ reduction,
+ act,
+ bias_attr,
+ scale_unetfeats,
+ csff=False)
+ self.stage1_decoder = Decoder(n_feat, kernel_size, reduction, act,
+ bias_attr, scale_unetfeats)
+
+ self.stage2_encoder = Encoder(n_feat,
+ kernel_size,
+ reduction,
+ act,
+ bias_attr,
+ scale_unetfeats,
+ csff=True)
+ self.stage2_decoder = Decoder(n_feat, kernel_size, reduction, act,
+ bias_attr, scale_unetfeats)
+
+ self.stage3_orsnet = ORSNet(n_feat, scale_orsnetfeats, kernel_size,
+ reduction, act, bias_attr, scale_unetfeats,
+ num_cab)
+
+ self.sam12 = SAM(n_feat, kernel_size=1, bias_attr=bias_attr)
+ self.sam23 = SAM(n_feat, kernel_size=1, bias_attr=bias_attr)
+
+ self.concat12 = conv(n_feat * 2,
+ n_feat,
+ kernel_size,
+ bias_attr=bias_attr)
+ self.concat23 = conv(n_feat * 2,
+ n_feat + scale_orsnetfeats,
+ kernel_size,
+ bias_attr=bias_attr)
+ self.tail = conv(n_feat + scale_orsnetfeats,
+ out_c,
+ kernel_size,
+ bias_attr=bias_attr)
+
+ def forward(self, x3_img):
+ # Original-resolution Image for Stage 3
+ H = x3_img.shape[2]
+ W = x3_img.shape[3]
+
+ # Multi-Patch Hierarchy: Split Image into four non-overlapping patches
+
+ # Two Patches for Stage 2
+ x2top_img = x3_img[:, :, 0:int(H / 2), :]
+ x2bot_img = x3_img[:, :, int(H / 2):H, :]
+
+ # Four Patches for Stage 1
+ x1ltop_img = x2top_img[:, :, :, 0:int(W / 2)]
+ x1rtop_img = x2top_img[:, :, :, int(W / 2):W]
+ x1lbot_img = x2bot_img[:, :, :, 0:int(W / 2)]
+ x1rbot_img = x2bot_img[:, :, :, int(W / 2):W]
+
+ ##-------------------------------------------
+ ##-------------- Stage 1---------------------
+ ##-------------------------------------------
+ ## Compute Shallow Features
+ x1ltop = self.shallow_feat1(x1ltop_img)
+ x1rtop = self.shallow_feat1(x1rtop_img)
+ x1lbot = self.shallow_feat1(x1lbot_img)
+ x1rbot = self.shallow_feat1(x1rbot_img)
+
+ ## Process features of all 4 patches with Encoder of Stage 1
+ feat1_ltop = self.stage1_encoder(x1ltop)
+ feat1_rtop = self.stage1_encoder(x1rtop)
+ feat1_lbot = self.stage1_encoder(x1lbot)
+ feat1_rbot = self.stage1_encoder(x1rbot)
+
+ ## Concat deep features
+ feat1_top = [
+ paddle.concat((k, v), 3) for k, v in zip(feat1_ltop, feat1_rtop)
+ ]
+ feat1_bot = [
+ paddle.concat((k, v), 3) for k, v in zip(feat1_lbot, feat1_rbot)
+ ]
+
+ ## Pass features through Decoder of Stage 1
+ res1_top = self.stage1_decoder(feat1_top)
+ res1_bot = self.stage1_decoder(feat1_bot)
+
+ ## Apply Supervised Attention Module (SAM)
+ x2top_samfeats, stage1_img_top = self.sam12(res1_top[0], x2top_img)
+ x2bot_samfeats, stage1_img_bot = self.sam12(res1_bot[0], x2bot_img)
+
+ ## Output image at Stage 1
+ stage1_img = paddle.concat([stage1_img_top, stage1_img_bot], 2)
+ ##-------------------------------------------
+ ##-------------- Stage 2---------------------
+ ##-------------------------------------------
+ ## Compute Shallow Features
+ x2top = self.shallow_feat2(x2top_img)
+ x2bot = self.shallow_feat2(x2bot_img)
+
+ ## Concatenate SAM features of Stage 1 with shallow features of Stage 2
+ x2top_cat = self.concat12(paddle.concat([x2top, x2top_samfeats], 1))
+ x2bot_cat = self.concat12(paddle.concat([x2bot, x2bot_samfeats], 1))
+
+ ## Process features of both patches with Encoder of Stage 2
+ feat2_top = self.stage2_encoder(x2top_cat, feat1_top, res1_top)
+ feat2_bot = self.stage2_encoder(x2bot_cat, feat1_bot, res1_bot)
+
+ ## Concat deep features
+ feat2 = [paddle.concat((k, v), 2) for k, v in zip(feat2_top, feat2_bot)]
+
+ ## Pass features through Decoder of Stage 2
+ res2 = self.stage2_decoder(feat2)
+
+ ## Apply SAM
+ x3_samfeats, stage2_img = self.sam23(res2[0], x3_img)
+
+ ##-------------------------------------------
+ ##-------------- Stage 3---------------------
+ ##-------------------------------------------
+ ## Compute Shallow Features
+ x3 = self.shallow_feat3(x3_img)
+
+ ## Concatenate SAM features of Stage 2 with shallow features of Stage 3
+ x3_cat = self.concat23(paddle.concat([x3, x3_samfeats], 1))
+
+ x3_cat = self.stage3_orsnet(x3_cat, feat2, res2)
+
+ stage3_img = self.tail(x3_cat)
+
+ return [stage3_img + x3_img, stage2_img, stage1_img]
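+
+
+# Editor's note: a hedged smoke test for MPRNet (not part of the original
+# code), assuming an input whose height and width are multiples of 8 so that
+# the two-level patch split and the U-Net downsampling divide evenly.
+if __name__ == '__main__':
+    net = MPRNet()
+    x_img = paddle.rand([1, 3, 128, 128])
+    stage3_out, stage2_out, stage1_out = net(x_img)
+    # Each stage predicts a full-resolution restored image.
+    print(stage3_out.shape, stage2_out.shape, stage1_out.shape)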
diff --git a/ppgan/models/generators/msvsr.py b/ppgan/models/generators/msvsr.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e841cf282245161fc2a02191534ceb1fefb3ca
--- /dev/null
+++ b/ppgan/models/generators/msvsr.py
@@ -0,0 +1,1108 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.vision.ops import DeformConv2D
+
+from .basicvsr import PixelShufflePack, flow_warp, SPyNet, ResidualBlocksWithInputConv
+from ...utils.download import get_path_from_url
+from ...modules.init import kaiming_normal_, constant_
+from .builder import GENERATORS
+
+
+@GENERATORS.register()
+class MSVSR(nn.Layer):
+ """PP-MSVSR network structure for video super-resolution.
+
+ Support only x4 upsampling.
+ Paper:
+ PP-MSVSR: Multi-Stage Video Super-Resolution, 2021
+
+ Args:
+ mid_channels (int): Channel number of the intermediate features.
+ Default: 32.
+ num_init_blocks (int): Number of residual blocks in feat_extract.
+ Default: 2.
+ num_blocks (int): Number of residual blocks in each propagation branch.
+ Default: 3.
+ num_reconstruction_blocks (int): Number of residual blocks in reconstruction.
+ Default: 2.
+ only_last (bool): Whether the hr feature only does the last convolution.
+ Default: True.
+ use_tiny_spynet (bool): Whether to use the tiny SPyNet.
+ Default: True.
+ deform_groups (int): Number of deformable_groups in DeformConv2D in stage2 and stage3.
+ Default: 4.
+ stage1_groups (int): Number of deformable_groups in DeformConv2D in stage1.
+ Default: 8.
+ auxiliary_loss (bool): Whether to use the auxiliary loss.
+ Default: True.
+ use_refine_align (bool): Whether to use refined alignment.
+ Default: True.
+ aux_reconstruction_blocks (int): Number of residual blocks in auxiliary reconstruction.
+ Default: 1.
+ use_local_connnect (bool): Whether to add the feature of stage1 after upsampling.
+ Default: True.
+ """
+ def __init__(self,
+ mid_channels=32,
+ num_init_blocks=2,
+ num_blocks=3,
+ num_reconstruction_blocks=2,
+ only_last=True,
+ use_tiny_spynet=True,
+ deform_groups=4,
+ stage1_groups=8,
+ auxiliary_loss=True,
+ use_refine_align=True,
+ aux_reconstruction_blocks=1,
+ use_local_connnect=True):
+
+ super().__init__()
+
+ self.mid_channels = mid_channels
+ self.only_last = only_last
+ self.deform_groups = deform_groups
+ self.auxiliary_loss = auxiliary_loss
+ self.use_refine_align = use_refine_align
+ self.use_local_connnect = use_local_connnect
+
+ # optical flow module
+ if use_tiny_spynet:
+ self.spynet = ModifiedSPyNet(num_blocks=3, use_tiny_block=True)
+ weight_path = get_path_from_url(
+ 'https://paddlegan.bj.bcebos.com/models/modified_spynet_tiny.pdparams'
+ )
+ self.spynet.set_state_dict(paddle.load(weight_path))
+ else:
+ self.spynet = ModifiedSPyNet(num_blocks=6, use_tiny_block=False)
+ weight_path = get_path_from_url(
+ 'https://paddlegan.bj.bcebos.com/models/modified_spynet.pdparams'
+ )
+ self.spynet.set_state_dict(paddle.load(weight_path))
+
+ # feature extraction module
+ self.feat_extract = ResidualBlocksWithInputConv(3, mid_channels,
+ num_init_blocks)
+
+ # propagation branches module for stage2 and stage3
+ self.deform_align = nn.LayerDict()
+ self.backbone = nn.LayerDict()
+
+ prop_names = [
+ 'stage2_backward', 'stage2_forward', 'stage3_backward',
+ 'stage3_forward'
+ ]
+
+ for i, layer in enumerate(prop_names):
+ if i > 1 and self.use_refine_align:
+ self.deform_align[layer] = ReAlignmentModule(
+ mid_channels,
+ mid_channels,
+ 3,
+ padding=1,
+ deformable_groups=deform_groups)
+ else:
+ self.deform_align[layer] = AlignmentModule(
+ mid_channels,
+ mid_channels,
+ 3,
+ padding=1,
+ deformable_groups=deform_groups)
+
+ self.backbone[layer] = ResidualBlocksWithInputConv(
+ (3 + i) * mid_channels, mid_channels, num_blocks)
+
+ # stage1
+ self.stage1_align = AlignmentModule(mid_channels,
+ mid_channels,
+ 3,
+ padding=1,
+ deformable_groups=stage1_groups)
+ self.stage1_blocks = ResidualBlocksWithInputConv(
+ 3 * mid_channels, mid_channels, 3)
+
+ # upsampling module
+ self.reconstruction = ResidualBlocksWithInputConv(
+ 6 * mid_channels, mid_channels, num_reconstruction_blocks)
+
+ self.upsample1 = PixelShufflePack(mid_channels,
+ mid_channels,
+ 2,
+ upsample_kernel=3)
+ self.upsample2 = PixelShufflePack(mid_channels,
+ mid_channels,
+ 2,
+ upsample_kernel=3)
+ if self.only_last:
+ self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
+ else:
+ self.conv_hr = nn.Conv2D(mid_channels, mid_channels, 3, 1, 1)
+ self.conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
+ self.img_upsample = nn.Upsample(scale_factor=4,
+ mode='bilinear',
+ align_corners=False)
+
+ # activation function
+ self.lrelu = nn.LeakyReLU(negative_slope=0.1)
+
+ # auxiliary loss
+ if self.auxiliary_loss:
+ self.aux_fusion = nn.Conv2D(mid_channels * 2, mid_channels, 3, 1, 1)
+
+ self.aux_reconstruction = ResidualBlocksWithInputConv(
+ 4 * mid_channels, mid_channels, aux_reconstruction_blocks)
+
+ self.aux_block_down1 = nn.Sequential(
+ nn.Conv2D(3 + mid_channels, mid_channels, 3, 2, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(mid_channels, mid_channels, 3, 1, 1))
+ self.aux_block_down2 = nn.Sequential(
+ nn.Conv2D(mid_channels * 2, mid_channels, 3, 2, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(mid_channels, mid_channels, 3, 1, 1))
+
+ self.aux_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
+
+ self.aux_upsample1 = PixelShufflePack(mid_channels,
+ mid_channels,
+ 2,
+ upsample_kernel=3)
+ self.aux_upsample2 = PixelShufflePack(mid_channels,
+ mid_channels,
+ 2,
+ upsample_kernel=3)
+ self.hybrid_conv_last = nn.Conv2D(mid_channels, 3, 3, 1, 1)
+
+ def check_if_mirror_extended(self, lrs):
+ """Check whether the input is a mirror-extended sequence.
+ If mirror-extended, the i-th (i=0, ..., t-1) frame is equal to the
+ (t-1-i)-th frame.
+ Args:
+ lrs (tensor): Input LR images with shape (n, t, c, h, w)
+
+ Returns:
+ Bool: Whether the input is a mirror-extended sequence.
+ """
+
+ with paddle.no_grad():
+ self.is_mirror_extended = False
+ if lrs.shape[1] % 2 == 0:
+ lrs_1, lrs_2 = paddle.chunk(lrs, 2, axis=1)
+ lrs_2 = paddle.flip(lrs_2, [1])
+ if paddle.norm(lrs_1 - lrs_2) == 0:
+ self.is_mirror_extended = True
+
+ def compute_flow(self, lrs):
+ """Compute optical flow using a pretrained flow network for feature alignment.
+ Args:
+ lrs (tensor): Input LR images with shape (n, t, c, h, w)
+
+ Returns:
+ Tuple: Tensor of forward optical flow and backward optical flow with shape (n, t-1, 2, h, w).
+ """
+ n, t, c, h, w = lrs.shape
+
+ lrs_1 = lrs[:, :-1, :, :, :].reshape([-1, c, h, w])
+ lrs_2 = lrs[:, 1:, :, :, :].reshape([-1, c, h, w])
+
+ flows_backward = self.spynet(lrs_1, lrs_2).reshape([n, t - 1, 2, h, w])
+
+ if self.is_mirror_extended:
+ flows_forward = flows_backward.flip(1)
+ else:
+ flows_forward = self.spynet(lrs_2,
+ lrs_1).reshape([n, t - 1, 2, h, w])
+
+ return flows_forward, flows_backward
+
+ def stage1(self, feats, flows, flows_forward=None):
+ """Stage1 of PP-MSVSR network.
+ Args:
+ feats (dict): Dict with key 'spatial', the value is Array of tensor after feature extraction with shape (n, c, h, w).
+ flows (tensor): Backward optical flow with shape (n, t-1, 2, h, w).
+ flows_forward (tensor): Forward optical flow with shape (n, t-1, 2, h, w).
+
+ Returns:
+ Dict: The input dict with new keys 'feat_stage1', the value of 'feat_stage1' is Array of tensor after Local Fusion Module with shape (n, c, h, w).
+ """
+
+ n, t, _, h, w = flows.shape
+
+ frame_idx = range(t, -1, -1)
+ flow_idx = range(t, -1, -1)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ # Local Fusion Module
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+
+ # get aligned right adjacent frames
+ if i > 0:
+ feat_prop = feats['spatial'][mapping_idx[idx + 1]]
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+ cond_n1 = flow_warp(feat_prop, flow_n1.transpose([0, 2, 3, 1]))
+ cond = paddle.concat([cond_n1, feat_current], axis=1)
+ feat_prop, _, _ = self.stage1_align(feat_prop, cond, flow_n1)
+ else:
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+
+ # get aligned left adjacent frames
+ if i < t:
+ feat_back = feats['spatial'][mapping_idx[idx - 1]]
+ flow_n1_ = flows_forward[:, flow_idx[i] - 1, :, :, :]
+ cond_n1_ = flow_warp(feat_back, flow_n1_.transpose([0, 2, 3,
+ 1]))
+ cond_ = paddle.concat([cond_n1_, feat_current], axis=1)
+ feat_back, _, _ = self.stage1_align(feat_back, cond_, flow_n1_)
+ else:
+ feat_back = paddle.zeros([n, self.mid_channels, h, w])
+
+ # concatenate and residual blocks
+ feat = [feat_current] + [feat_prop] + [feat_back]
+ feat = paddle.concat(feat, axis=1)
+ feat = self.stage1_blocks(feat)
+
+ feats['feat_stage1'].append(feat)
+
+ feats['feat_stage1'] = feats['feat_stage1'][::-1]
+
+ return feats
+
+ def stage2(self, feats, flows):
+ """Stage2 of PP-MSVSR network.
+ Args:
+ feats (dict): Dict with key 'spatial' and 'feat_stage1' after stage1.
+ flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w).
+
+ Returns:
+ feats (dict): The input dict with new keys 'stage2_backward' and 'stage2_forward', the value of both is Array of feature after stage2 with shape (n, c, h, w).
+ pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w).
+ pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w).
+ """
+ flows_backward, flows_forward = flows
+ n, t, _, h, w = flows_backward.shape
+
+ pre_offset = {}
+ pre_mask = {}
+
+ # propagation branches module
+ prop_names = ['stage2_backward', 'stage2_forward']
+ for index in range(2):
+ prop_name = prop_names[index]
+ pre_offset[prop_name] = [0 for _ in range(t)]
+ pre_mask[prop_name] = [0 for _ in range(t)]
+ feats[prop_name] = []
+ frame_idx = range(0, t + 1)
+ flow_idx = range(-1, t)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ if 'backward' in prop_name:
+ frame_idx = frame_idx[::-1]
+ flow_idx = frame_idx
+ flows = flows_backward
+ else:
+ flows = flows_forward
+
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+
+ if i > 0:
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+
+ cond_n1 = flow_warp(feat_prop,
+ flow_n1.transpose([0, 2, 3, 1]))
+ cond = paddle.concat([cond_n1, feat_current], axis=1)
+
+ feat_prop, offset, mask = self.deform_align[prop_name](
+ feat_prop, cond, flow_n1)
+ pre_offset[prop_name][flow_idx[i]] = offset
+ pre_mask[prop_name][flow_idx[i]] = mask
+
+ # concatenate and residual blocks
+ feat = [feat_current] + [
+ feats[k][idx]
+ for k in feats if k not in ['spatial', prop_name]
+ ] + [feat_prop]
+
+ feat = paddle.concat(feat, axis=1)
+ feat_prop = feat_prop + self.backbone[prop_name](feat)
+
+ feats[prop_name].append(feat_prop)
+
+ if 'backward' in prop_name:
+ feats[prop_name] = feats[prop_name][::-1]
+
+ return feats, pre_offset, pre_mask
+
+ def stage3(self,
+ feats,
+ flows,
+ aux_feats=None,
+ pre_offset=None,
+ pre_mask=None):
+ """Stage3 of PP-MSVSR network.
+ Args:
+ feats (dict): Dict of features after stage2.
+ flows (tuple): Tensor of backward optical flow and forward optical flow with shape (n, t-1, 2, h, w).
+ aux_feats (dict): Dict with keys 'outs' and 'feats', the value is Array of tensor after auxiliary_stage with shape (n, 3, 4*h, 4*w) and (n, c, h, w), separately.
+ pre_offset (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of offset in stage2 with shape (n, 18*deform_groups, h, w).
+ pre_mask (dict): Dict with keys 'stage2_backward' and 'stage2_forward', the value of both is Array of mask in stage2 with shape (n, 9*deform_groups, h, w).
+
+ Returns:
+ feats (dict): The input feats dict with new keys 'stage3_backward' and 'stage3_forward', the value of both is Array of feature after stage3 with shape (n, c, h, w).
+ """
+ flows_backward, flows_forward = flows
+ n, t, _, h, w = flows_backward.shape
+
+ # propagation branches module
+ prop_names = ['stage3_backward', 'stage3_forward']
+ for index in range(2):
+ prop_name = prop_names[index]
+ feats[prop_name] = []
+ frame_idx = range(0, t + 1)
+ flow_idx = range(-1, t)
+ mapping_idx = list(range(0, len(feats['spatial'])))
+ mapping_idx += mapping_idx[::-1]
+
+ if 'backward' in prop_name:
+ frame_idx = frame_idx[::-1]
+ flow_idx = frame_idx
+ flows = flows_backward
+ pre_stage_name = 'stage2_backward'
+ else:
+ flows = flows_forward
+ pre_stage_name = 'stage2_forward'
+
+ feat_prop = paddle.zeros([n, self.mid_channels, h, w])
+ for i, idx in enumerate(frame_idx):
+ feat_current = feats['spatial'][mapping_idx[idx]]
+ if aux_feats is not None and 'feats' in aux_feats:
+ feat_current = aux_feats['feats'][mapping_idx[idx]]
+
+ if i > 0:
+ flow_n1 = flows[:, flow_idx[i], :, :, :]
+
+ cond_n1 = flow_warp(feat_prop,
+ flow_n1.transpose([0, 2, 3, 1]))
+ cond = paddle.concat([cond_n1, feat_current], axis=1)
+
+ feat_prop = self.deform_align[prop_name](
+ feat_prop, cond, flow_n1, feat_current,
+ pre_offset[pre_stage_name][flow_idx[i]],
+ pre_mask[pre_stage_name][flow_idx[i]])
+
+ # concatenate and residual blocks
+ feat = [feat_current] + [
+ feats[k][idx]
+ for k in feats if k not in ['spatial', prop_name]
+ ] + [feat_prop]
+
+ feat = paddle.concat(feat, axis=1)
+ feat_prop = feat_prop + self.backbone[prop_name](feat)
+
+ feats[prop_name].append(feat_prop)
+
+ if 'backward' in prop_name:
+ feats[prop_name] = feats[prop_name][::-1]
+
+ return feats
+
+ def auxiliary_stage(self, feats, lqs):
+ """Compute the output image and auxiliary feature for Auxiliary Loss in stage2.
+ Args:
+ feats (dict): Dict of features after stage2.
+ lqs (tensor): Input LR images with shape (n, t, c, h, w)
+
+ Returns:
+ dict: Dict with keys 'outs' and 'feats', the value is Array of tensor after auxiliary_stage with shape (n, 3, 4*h, 4*w) and (n, c, h, w), separately.
+ """
+ aux_feats = {}
+ aux_feats['outs'] = []
+ aux_feats['feats'] = []
+ num_outputs = len(feats['spatial'])
+
+ mapping_idx = list(range(0, num_outputs))
+ mapping_idx += mapping_idx[::-1]
+
+ t = lqs.shape[1]
+ for i in range(0, t):
+ hr = [feats[k][i] for k in feats if (k != 'spatial')]
+ feat_current = feats['spatial'][mapping_idx[i]]
+ hr.insert(0, feat_current)
+ hr = paddle.concat(hr, axis=1)
+
+ hr_low = self.aux_reconstruction(hr)
+ hr_mid = self.lrelu(self.aux_upsample1(hr_low))
+ hr_high = self.lrelu(self.aux_upsample2(hr_mid))
+
+ hr = self.aux_conv_last(hr_high)
+ hr += self.img_upsample(lqs[:, i, :, :, :])
+
+ # output tensor of auxiliary_stage with shape (n, 3, 4*h, 4*w)
+ aux_feats['outs'].append(hr)
+
+ aux_feat = self.aux_block_down1(paddle.concat([hr, hr_high],
+ axis=1))
+ aux_feat = self.aux_block_down2(
+ paddle.concat([aux_feat, hr_mid], axis=1))
+ aux_feat = self.aux_fusion(paddle.concat([aux_feat, hr_low],
+ axis=1))
+
+ # out feature of auxiliary_stage with shape (n, c, h, w)
+ aux_feats['feats'].append(aux_feat)
+
+ return aux_feats
+
+ def upsample(self, lqs, feats, aux_feats=None):
+ """Compute the output image given the features.
+ Args:
+ lqs (tensor): Input LR images with shape (n, t, c, h, w).
+ feats (dict): Dict of features after stage3.
+ aux_feats (dict): Dict with keys 'outs' and 'feats', the value is Array of tensor after auxiliary_stage with shape (n, 3, 4*h, 4*w) and (n, c, h, w), separately.
+
+ Returns:
+ Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w).
+ """
+
+ outputs = []
+ num_outputs = len(feats['spatial'])
+
+ mapping_idx = list(range(0, num_outputs))
+ mapping_idx += mapping_idx[::-1]
+
+ t = lqs.shape[1]
+ for i in range(0, t):
+ hr = [
+ feats[k].pop(0) for k in feats
+ if (k != 'spatial' and k != 'feat_stage1')
+ ]
+ if 'feat_stage1' in feats:
+ local_feat = feats['feat_stage1'].pop(0)
+ hr.insert(0, local_feat)
+ hr.insert(0, feats['spatial'][mapping_idx[i]])
+ hr = paddle.concat(hr, axis=1)
+
+ hr = self.reconstruction(hr)
+
+ hr = self.lrelu(self.upsample1(hr))
+ hr = self.lrelu(self.upsample2(hr))
+ if self.only_last:
+ hr = self.conv_last(hr)
+ else:
+ hr = self.lrelu(self.conv_hr(hr))
+ hr = self.conv_last(hr)
+
+ hr += self.img_upsample(lqs[:, i, :, :, :])
+ if self.use_local_connnect:
+ local_head = self.lrelu(self.aux_upsample1(local_feat))
+ local_head = self.lrelu(self.aux_upsample2(local_head))
+ hr = self.hybrid_conv_last(local_head) + hr
+
+ outputs.append(hr)
+
+ if self.auxiliary_loss:
+ return paddle.stack(aux_feats['outs'],
+ axis=1), paddle.stack(outputs, axis=1)
+ return paddle.stack(outputs, axis=1)
+
+ def forward(self, lqs):
+ """Forward function for PP-MSVSR.
+ Args:
+ lqs (Tensor): Input LR sequence with shape (n, t, c, h, w).
+ Returns:
+ Tensor: Output HR sequence with shape (n, t, 3, 4*h, 4*w).
+ """
+
+ n, t, c, h, w = lqs.shape
+
+ lqs_downsample = lqs
+
+ # check whether the input is an extended sequence
+ self.check_if_mirror_extended(lqs)
+
+ feats = {}
+ feats_ = self.feat_extract(lqs.reshape([-1, c, h, w]))
+
+ h, w = feats_.shape[2:]
+ feats_ = feats_.reshape([n, t, -1, h, w])
+ feats['spatial'] = [feats_[:, i, :, :, :] for i in range(0, t)]
+
+ # compute optical flow using the low-res inputs
+ assert lqs_downsample.shape[3] >= 64 and lqs_downsample.shape[4] >= 64, (
+ 'The height and width of low-res inputs must be at least 64, '
+ f'but got {h} and {w}.')
+
+ flows_forward, flows_backward = self.compute_flow(lqs_downsample)
+
+ # feature propagation
+ feats['feat_stage1'] = []
+ feats = self.stage1(feats, flows_backward, flows_forward)
+
+ feats, pre_offset, pre_mask = self.stage2(
+ feats, (flows_backward, flows_forward))
+
+ if self.auxiliary_loss:
+ aux_feats = self.auxiliary_stage(feats, lqs)
+
+ feats = self.stage3(feats, (flows_backward, flows_forward), aux_feats,
+ pre_offset, pre_mask)
+
+ return self.upsample(lqs, feats, aux_feats=aux_feats)
+
+
+class AlignmentModule(nn.Layer):
+ """Deformable alignment module.
+ Args:
+ in_channels (int): Same as nn.Conv2D.
+ out_channels (int): Same as nn.Conv2D.
+ kernel_size (int or tuple[int]): Same as nn.Conv2D.
+ stride (int or tuple[int]): Same as nn.Conv2D.
+ padding (int or tuple[int]): Same as nn.Conv2D.
+ dilation (int or tuple[int]): Same as nn.Conv2D.
+ groups (int): Same as nn.Conv2D.
+ deformable_groups (int): Number of deformable_groups in DeformConv2D.
+ """
+ def __init__(self,
+ in_channels=128,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ dilation=1,
+ groups=1,
+ deformable_groups=16):
+ super(AlignmentModule, self).__init__()
+
+ self.conv_offset = nn.Sequential(
+ nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1),
+ )
+ self.dcn = DeformConv2D(in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ deformable_groups=deformable_groups)
+
+ self.init_offset()
+
+ def init_offset(self):
+ constant_(self.conv_offset[-1].weight, 0)
+ constant_(self.conv_offset[-1].bias, 0)
+
+ def forward(self, x, extra_feat, flow_1):
+ extra_feat = paddle.concat([extra_feat, flow_1], axis=1)
+ out = self.conv_offset(extra_feat)
+ o1, o2, mask = paddle.chunk(out, 3, axis=1)
+
+ # offset
+ offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1))
+ offset = offset + flow_1.flip(1).tile([1, offset.shape[1] // 2, 1, 1])
+
+ # mask
+ mask = F.sigmoid(mask)
+ out = self.dcn(x, offset, mask)
+ return out, offset, mask
+
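+# Editor's note: a hedged shape sketch for the flow-guided alignment above
+# (not part of the original code). conv_offset predicts 27 * deformable_groups
+# channels, split into x/y offsets (18 * deformable_groups) and a modulation
+# mask (9 * deformable_groups) for the 3x3 deformable convolution.
+def _alignment_module_shape_demo():
+    align = AlignmentModule(in_channels=64, out_channels=64,
+                            deformable_groups=16)
+    feat = paddle.rand([1, 64, 32, 32])
+    cond = paddle.rand([1, 128, 32, 32])  # 2 * out_channels; flow is appended inside
+    flow = paddle.rand([1, 2, 32, 32])
+    out, offset, mask = align(feat, cond, flow)
+    # out: [1, 64, 32, 32], offset: [1, 288, 32, 32], mask: [1, 144, 32, 32]
+    return out.shape, offset.shape, mask.shape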
+
+class ReAlignmentModule(nn.Layer):
+ """Refined deformable alignment module.
+ Args:
+ in_channels (int): Same as nn.Conv2D.
+ out_channels (int): Same as nn.Conv2D.
+ kernel_size (int or tuple[int]): Same as nn.Conv2D.
+ stride (int or tuple[int]): Same as nn.Conv2D.
+ padding (int or tuple[int]): Same as nn.Conv2D.
+ dilation (int or tuple[int]): Same as nn.Conv2D.
+ groups (int): Same as nn.Conv2D.
+ deformable_groups (int): Number of deformable_groups in DeformConv2D.
+ """
+ def __init__(self,
+ in_channels=128,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ dilation=1,
+ groups=1,
+ deformable_groups=16):
+ super(ReAlignmentModule, self).__init__()
+
+ self.mdconv = DeformConv2D(in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ deformable_groups=deformable_groups)
+ self.conv_offset = nn.Sequential(
+ nn.Conv2D(2 * out_channels + 2, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, out_channels, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.1),
+ nn.Conv2D(out_channels, 27 * deformable_groups, 3, 1, 1),
+ )
+ self.dcn = DeformConv2D(in_channels,
+ out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ deformable_groups=deformable_groups)
+
+ self.init_offset()
+
+ def init_offset(self):
+ constant_(self.conv_offset[-1].weight, 0)
+ constant_(self.conv_offset[-1].bias, 0)
+
+ def forward(self,
+ x,
+ extra_feat,
+ flow_1,
+ feat_current,
+ pre_stage_flow=None,
+ pre_stage_mask=None):
+ if pre_stage_flow is not None:
+ pre_feat = self.mdconv(x, pre_stage_flow, pre_stage_mask)
+ extra_feat = paddle.concat([pre_feat, feat_current, flow_1], axis=1)
+ else:
+ extra_feat = paddle.concat([extra_feat, flow_1], axis=1)
+ out = self.conv_offset(extra_feat)
+ o1, o2, mask = paddle.chunk(out, 3, axis=1)
+
+ # offset
+ offset = 10 * paddle.tanh(paddle.concat((o1, o2), axis=1))
+ if pre_stage_flow is not None:
+ offset = offset + pre_stage_flow
+ else:
+ offset = offset + flow_1.flip(1).tile(
+ [1, offset.shape[1] // 2, 1, 1])
+
+ # mask
+ if pre_stage_mask is not None:
+ mask = (F.sigmoid(mask) + pre_stage_mask) / 2.0
+ else:
+ mask = F.sigmoid(mask)
+ out = self.dcn(x, offset, mask)
+ return out
+
+
+class ModifiedSPyNet(nn.Layer):
+ """Modified SPyNet network structure.
+
+ The differences from the SPyNet in the paper are that:
+ 1. convolutions with kernel_size=7 are replaced by convolutions with kernel_size=3 in this version,
+ 2. fewer SPyNet basic modules are used in this version,
+ 3. no BN is used in this version.
+
+ Paper:
+ Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017
+
+ Args:
+ act_cfg (dict): Activation function.
+ Default: dict(name='LeakyReLU').
+ num_blocks (int): Number of SPyNetBlock.
+ Default: 6.
+ use_tiny_block (bool): Whether to use the tiny SPyNet blocks.
+ Default: False.
+ """
+ def __init__(self,
+ act_cfg=dict(name='LeakyReLU'),
+ num_blocks=6,
+ use_tiny_block=False):
+ super().__init__()
+ self.num_blocks = num_blocks
+ self.basic_module = nn.LayerList([
+ SPyNetBlock(act_cfg=act_cfg, use_tiny_block=use_tiny_block)
+ for _ in range(num_blocks)
+ ])
+
+ self.register_buffer(
+ 'mean',
+ paddle.to_tensor([0.485, 0.456, 0.406]).reshape([1, 3, 1, 1]))
+ self.register_buffer(
+ 'std',
+ paddle.to_tensor([0.229, 0.224, 0.225]).reshape([1, 3, 1, 1]))
+
+ def compute_flow(self, ref, supp):
+ """Compute flow from ref to supp.
+
+ Note that in this function, the images are already resized to a
+ multiple of 32.
+
+ Args:
+ ref (Tensor): Reference image with shape of (n, 3, h, w).
+ supp (Tensor): Supporting image with shape of (n, 3, h, w).
+
+ Returns:
+ Tensor: Estimated optical flow: (n, 2, h, w).
+ """
+ n, _, h, w = ref.shape
+
+ # normalize the input images
+ ref = [(ref - self.mean) / self.std]
+ supp = [(supp - self.mean) / self.std]
+
+ # generate downsampled frames
+ for level in range(self.num_blocks - 1):
+ ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2))
+ supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2))
+ ref = ref[::-1]
+ supp = supp[::-1]
+
+ # flow computation
+ flow = paddle.to_tensor(
+ np.zeros([
+ n, 2, h // (2**(self.num_blocks - 1)), w //
+ (2**(self.num_blocks - 1))
+ ], 'float32'))
+
+ for level in range(len(ref)):
+ if level == 0:
+ flow_up = flow
+ else:
+ flow_up = F.interpolate(
+ flow, scale_factor=2, mode='bilinear',
+ align_corners=True) * 2.0
+
+ # add the residue to the upsampled flow
+ flow = flow_up + self.basic_module[level](paddle.concat([
+ ref[level],
+ flow_warp(supp[level],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ],
+ axis=1))
+
+ return flow
+
+ def compute_flow_list(self, ref, supp):
+ n, _, h, w = ref.shape
+
+ # normalize the input images
+ ref = [(ref - self.mean) / self.std]
+ supp = [(supp - self.mean) / self.std]
+
+ # generate downsampled frames
+ for level in range(self.num_blocks - 1):
+ ref.append(F.avg_pool2d(ref[-1], kernel_size=2, stride=2))
+ supp.append(F.avg_pool2d(supp[-1], kernel_size=2, stride=2))
+ ref = ref[::-1]
+ supp = supp[::-1]
+
+ # flow computation
+ flow_list = []
+ flow = paddle.to_tensor(
+ np.zeros([
+ n, 2, h // (2**(self.num_blocks - 1)), w //
+ (2**(self.num_blocks - 1))
+ ], 'float32'))
+ for level in range(len(ref)):
+ if level == 0:
+ flow_up = flow
+ else:
+ flow_up = F.interpolate(
+ flow, scale_factor=2, mode='bilinear',
+ align_corners=True) * 2.0
+
+ # add the residue to the upsampled flow
+ flow = flow_up + self.basic_module[level](paddle.concat([
+ ref[level],
+ flow_warp(supp[level],
+ flow_up.transpose([0, 2, 3, 1]),
+ padding_mode='border'), flow_up
+ ],
+ axis=1))
+ flow_list.append(flow)
+ return flow_list
+
+ def forward(self, ref, supp):
+ """Forward function of Modified SPyNet.
+
+ This function computes the optical flow from ref to supp.
+
+ Args:
+ ref (Tensor): Reference image with shape of (n, 3, h, w).
+ supp (Tensor): Supporting image with shape of (n, 3, h, w).
+
+ Returns:
+ Tensor: Estimated optical flow: (n, 2, h, w).
+ """
+
+ # upsize to a multiple of 32
+ h, w = ref.shape[2:4]
+ w_up = w if (w % 32) == 0 else 32 * (w // 32 + 1)
+ h_up = h if (h % 32) == 0 else 32 * (h // 32 + 1)
+ ref = F.interpolate(ref,
+ size=(h_up, w_up),
+ mode='bilinear',
+ align_corners=False)
+
+ supp = F.interpolate(supp,
+ size=(h_up, w_up),
+ mode='bilinear',
+ align_corners=False)
+
+ ref.stop_gradient = False
+ supp.stop_gradient = False
+
+ # compute flow, and resize back to the original resolution
+ flow = F.interpolate(self.compute_flow(ref, supp),
+ size=(h, w),
+ mode='bilinear',
+ align_corners=False)
+
+ # adjust the flow values
+ flow[:, 0, :, :] *= float(w) / float(w_up)
+ flow[:, 1, :, :] *= float(h) / float(h_up)
+
+ return flow
+
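+# Editor's note: a hedged usage sketch for ModifiedSPyNet (not part of the
+# original code). Arbitrary input sizes are handled by the internal resize to
+# a multiple of 32, and the flow is resized back to the input resolution.
+def _modified_spynet_demo():
+    spynet = ModifiedSPyNet(num_blocks=6, use_tiny_block=False)
+    ref = paddle.rand([1, 3, 100, 180])
+    supp = paddle.rand([1, 3, 100, 180])
+    flow = spynet(ref, supp)
+    # flow: [1, 2, 100, 180], the estimated optical flow from ref to supp.
+    return flow.shape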
+
+class SPyNetBlock(nn.Layer):
+ """Basic Block of Modified SPyNet.
+ Refer to: Optical Flow Estimation using a Spatial Pyramid Network, CVPR, 2017.
+ """
+ def __init__(self, act_cfg=dict(name='LeakyReLU'), use_tiny_block=False):
+ super().__init__()
+ if use_tiny_block:
+ self.basic_module = nn.Sequential(
+ ConvLayer(in_channels=8,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=8,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=8,
+ out_channels=8,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=8,
+ out_channels=2,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=None))
+ else:
+ self.basic_module = nn.Sequential(
+ ConvLayer(in_channels=8,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=64,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=64,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=32,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=32,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=16,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=act_cfg),
+ ConvLayer(in_channels=16,
+ out_channels=2,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ act_cfg=None))
+
+ def forward(self, tensor_input):
+ """Forward function of SPyNetBlock.
+ Args:
+ tensor_input (Tensor): Input tensor with shape (b, 8, h, w).
+ 8 channels contain:
+ [reference image (3), neighbor image (3), initial flow (2)].
+
+ Returns:
+ Tensor: Refined flow with shape (b, 2, h, w)
+ """
+ return self.basic_module(tensor_input)
+
+
+class ConvLayer(nn.Layer):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ act_cfg=dict(name='ReLU')):
+ super(ConvLayer, self).__init__()
+ self.act_cfg = act_cfg
+ self.with_activation = act_cfg is not None
+
+ self.conv = nn.Conv2D(in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups)
+
+ if self.with_activation:
+ if act_cfg['name'] == 'ReLU':
+ self.act = paddle.nn.ReLU()
+ elif act_cfg['name'] == 'LeakyReLU':
+ self.act = nn.LeakyReLU(negative_slope=0.1)
+
+ def forward(self, tensor_input):
+ out = self.conv(tensor_input)
+ if self.with_activation:
+ out = self.act(out)
+ return out
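+
+
+# Editor's note: a hedged smoke test for MSVSR (not part of the original
+# code). The constructor downloads pretrained Modified SPyNet weights, and the
+# forward pass requires the input height and width to be at least 64.
+if __name__ == '__main__':
+    net = MSVSR(mid_channels=32, use_tiny_spynet=True)
+    lqs = paddle.rand([1, 5, 3, 64, 64])
+    aux_out, out = net(lqs)
+    # With auxiliary_loss=True the network returns the stage2 auxiliary clip
+    # and the final stage3 clip, both upscaled by a factor of 4.
+    print(aux_out.shape, out.shape)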
diff --git a/ppgan/models/generators/nafnet.py b/ppgan/models/generators/nafnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa17bcdfe25544a72396c73587726484a6ea5002
--- /dev/null
+++ b/ppgan/models/generators/nafnet.py
@@ -0,0 +1,407 @@
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# Modified from BasicSR (https://github.com/xinntao/BasicSR)
+# Copyright 2018-2020 BasicSR Authors
+
+import paddle
+from paddle import nn as nn
+import paddle.nn.functional as F
+from paddle.autograd import PyLayer
+
+from .builder import GENERATORS
+
+
+class LayerNormFunction(PyLayer):
+
+ @staticmethod
+ def forward(ctx, x, weight, bias, eps):
+ ctx.eps = eps
+ N, C, H, W = x.shape
+ mu = x.mean(1, keepdim=True)
+ var = (x - mu).pow(2).mean(1, keepdim=True)
+ y = (x - mu) / (var + eps).sqrt()
+ ctx.save_for_backward(y, var, weight)
+ y = weight.reshape([1, C, 1, 1]) * y + bias.reshape([1, C, 1, 1])
+ return y
+
+ @staticmethod
+ def backward(ctx, grad_output):
+ eps = ctx.eps
+
+ N, C, H, W = grad_output.shape
+ y, var, weight = ctx.saved_tensor()
+ g = grad_output * weight.reshape([1, C, 1, 1])
+ mean_g = g.mean(axis=1, keepdim=True)
+
+ mean_gy = (g * y).mean(axis=1, keepdim=True)
+ gx = 1. / paddle.sqrt(var + eps) * (g - y * mean_gy - mean_g)
+ return gx, (grad_output * y).sum(axis=3).sum(axis=2).sum(
+ axis=0), grad_output.sum(axis=3).sum(axis=2).sum(axis=0)
+
+
+class LayerNorm2D(nn.Layer):
+
+ def __init__(self, channels, eps=1e-6):
+ super(LayerNorm2D, self).__init__()
+ self.add_parameter(
+ 'weight',
+ self.create_parameter(
+ [channels],
+ default_initializer=paddle.nn.initializer.Constant(value=1.0)))
+ self.add_parameter(
+ 'bias',
+ self.create_parameter(
+ [channels],
+ default_initializer=paddle.nn.initializer.Constant(value=0.0)))
+ self.eps = eps
+
+ def forward(self, x):
+ if self.training:
+ y = LayerNormFunction.apply(x, self.weight, self.bias, self.eps)
+ else:
+ N, C, H, W = x.shape
+ mu = x.mean(1, keepdim=True)
+ var = (x - mu).pow(2).mean(1, keepdim=True)
+ y = (x - mu) / (var + self.eps).sqrt()
+ y = self.weight.reshape([1, C, 1, 1]) * y + self.bias.reshape(
+ [1, C, 1, 1])
+
+ return y
+
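+# Editor's note: a hedged consistency sketch (not part of the original code).
+# The custom PyLayer used in training and the plain computation used in eval
+# implement the same channel-wise layer normalization, so the two paths
+# should agree up to numerical precision.
+def _layer_norm_2d_demo():
+    norm = LayerNorm2D(8)
+    x = paddle.rand([2, 8, 16, 16])
+    norm.train()
+    y_train = norm(x)
+    norm.eval()
+    y_eval = norm(x)
+    return float((y_train - y_eval).abs().max())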
+
+class AvgPool2D(nn.Layer):
+
+ def __init__(self,
+ kernel_size=None,
+ base_size=None,
+ auto_pad=True,
+ fast_imp=False,
+ train_size=None):
+ super().__init__()
+ self.kernel_size = kernel_size
+ self.base_size = base_size
+ self.auto_pad = auto_pad
+
+ # only used for fast implementation
+ self.fast_imp = fast_imp
+ self.rs = [5, 4, 3, 2, 1]
+ self.max_r1 = self.rs[0]
+ self.max_r2 = self.rs[0]
+ self.train_size = train_size
+
+ def extra_repr(self) -> str:
+ return 'kernel_size={}, base_size={}, stride={}, fast_imp={}'.format(
+ self.kernel_size, self.base_size, self.kernel_size, self.fast_imp)
+
+ def forward(self, x):
+ if self.kernel_size is None and self.base_size:
+ train_size = self.train_size
+ if isinstance(self.base_size, int):
+ self.base_size = (self.base_size, self.base_size)
+ self.kernel_size = list(self.base_size)
+ self.kernel_size[
+ 0] = x.shape[2] * self.base_size[0] // train_size[-2]
+ self.kernel_size[
+ 1] = x.shape[3] * self.base_size[1] // train_size[-1]
+
+ # only used for fast implementation
+ self.max_r1 = max(1, self.rs[0] * x.shape[2] // train_size[-2])
+ self.max_r2 = max(1, self.rs[0] * x.shape[3] // train_size[-1])
+
+ if self.kernel_size[0] >= x.shape[-2] and self.kernel_size[
+ 1] >= x.shape[-1]:
+ return F.adaptive_avg_pool2d(x, 1)
+
+ if self.fast_imp: # Non-equivalent implementation but faster
+ h, w = x.shape[2:]
+ if self.kernel_size[0] >= h and self.kernel_size[1] >= w:
+ out = F.adaptive_avg_pool2d(x, 1)
+ else:
+ r1 = [r for r in self.rs if h % r == 0][0]
+ r2 = [r for r in self.rs if w % r == 0][0]
+ # reduction_constraint
+ r1 = min(self.max_r1, r1)
+ r2 = min(self.max_r2, r2)
+ s = x[:, :, ::r1, ::r2].cumsum(axis=-1).cumsum(axis=-2)
+ n, c, h, w = s.shape
+ k1, k2 = min(h - 1, self.kernel_size[0] // r1), min(
+ w - 1, self.kernel_size[1] // r2)
+ out = (s[:, :, :-k1, :-k2] - s[:, :, :-k1, k2:] -
+ s[:, :, k1:, :-k2] + s[:, :, k1:, k2:]) / (k1 * k2)
+ out = paddle.nn.functional.interpolate(out,
+ scale_factor=(r1, r2))
+ else:
+ n, c, h, w = x.shape
+ s = x.cumsum(axis=-1).cumsum(axis=-2)
+ s = paddle.nn.functional.pad(s,
+ [1, 0, 1, 0]) # pad 0 for convenience
+ k1, k2 = min(h, self.kernel_size[0]), min(w, self.kernel_size[1])
+ s1, s2, s3, s4 = s[:, :, :-k1, :-k2], s[:, :, :-k1,
+ k2:], s[:, :,
+ k1:, :-k2], s[:, :,
+ k1:,
+ k2:]
+ out = s4 + s1 - s2 - s3
+ out = out / (k1 * k2)
+
+ if self.auto_pad:
+ n, c, h, w = x.shape
+ _h, _w = out.shape[2:]
+ pad2d = [(w - _w) // 2, (w - _w + 1) // 2, (h - _h) // 2,
+ (h - _h + 1) // 2]
+ out = paddle.nn.functional.pad(out, pad2d, mode='replicate')
+
+ return out
+
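+# Editor's note: a hedged sketch of the TLSC-style pooling above (not part of
+# the original code). The kernel size is derived from base_size / train_size
+# on the first call; when it covers the whole feature map the layer acts as
+# global average pooling, on larger test inputs it averages local windows.
+def _tlsc_avg_pool_demo():
+    pool = AvgPool2D(base_size=16, train_size=(1, 3, 16, 16))
+    small = paddle.rand([1, 8, 16, 16])
+    large = paddle.rand([1, 8, 32, 32])
+    # shapes: [1, 8, 1, 1] for the small input, [1, 8, 32, 32] for the large one
+    return pool(small).shape, pool(large).shape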
+
+def replace_layers(model, base_size, train_size, fast_imp, **kwargs):
+ for n, m in model.named_children():
+ if len(list(m.children())) > 0:
+ ## compound module, go inside it
+ replace_layers(m, base_size, train_size, fast_imp, **kwargs)
+
+ if isinstance(m, nn.AdaptiveAvgPool2D):
+ pool = AvgPool2D(base_size=base_size,
+ fast_imp=fast_imp,
+ train_size=train_size)
+ assert m._output_size == 1
+ setattr(model, n, pool)
+
+
+'''
+ref.
+@article{chu2021tlsc,
+ title={Revisiting Global Statistics Aggregation for Improving Image Restoration},
+ author={Chu, Xiaojie and Chen, Liangyu and Chen, Chengpeng and Lu, Xin},
+ journal={arXiv preprint arXiv:2112.04491},
+ year={2021}
+}
+'''
+
+
+class Local_Base():
+
+ def convert(self, *args, train_size, **kwargs):
+ replace_layers(self, *args, train_size=train_size, **kwargs)
+ imgs = paddle.rand(train_size)
+ with paddle.no_grad():
+ self.forward(imgs)
+
+
+class SimpleGate(nn.Layer):
+
+ def forward(self, x):
+ x1, x2 = x.chunk(2, axis=1)
+ return x1 * x2
+
+
+class NAFBlock(nn.Layer):
+
+ def __init__(self, c, DW_Expand=2, FFN_Expand=2, drop_out_rate=0.):
+ super().__init__()
+ dw_channel = c * DW_Expand
+ self.conv1 = nn.Conv2D(in_channels=c,
+ out_channels=dw_channel,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ groups=1,
+ bias_attr=True)
+ self.conv2 = nn.Conv2D(in_channels=dw_channel,
+ out_channels=dw_channel,
+ kernel_size=3,
+ padding=1,
+ stride=1,
+ groups=dw_channel,
+ bias_attr=True)
+ self.conv3 = nn.Conv2D(in_channels=dw_channel // 2,
+ out_channels=c,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ groups=1,
+ bias_attr=True)
+
+ # Simplified Channel Attention
+ self.sca = nn.Sequential(
+ nn.AdaptiveAvgPool2D(1),
+ nn.Conv2D(in_channels=dw_channel // 2,
+ out_channels=dw_channel // 2,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ groups=1,
+ bias_attr=True),
+ )
+
+ # SimpleGate
+ self.sg = SimpleGate()
+
+ ffn_channel = FFN_Expand * c
+ self.conv4 = nn.Conv2D(in_channels=c,
+ out_channels=ffn_channel,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ groups=1,
+ bias_attr=True)
+ self.conv5 = nn.Conv2D(in_channels=ffn_channel // 2,
+ out_channels=c,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ groups=1,
+ bias_attr=True)
+
+ self.norm1 = LayerNorm2D(c)
+ self.norm2 = LayerNorm2D(c)
+
+ self.drop_out_rate = drop_out_rate
+
+ self.dropout1 = nn.Dropout(
+ drop_out_rate) if drop_out_rate > 0. else None
+ self.dropout2 = nn.Dropout(
+ drop_out_rate) if drop_out_rate > 0. else None
+
+ self.add_parameter(
+ "beta",
+ self.create_parameter(
+ [1, c, 1, 1],
+ default_initializer=paddle.nn.initializer.Constant(value=0.0)))
+ self.add_parameter(
+ "gamma",
+ self.create_parameter(
+ [1, c, 1, 1],
+ default_initializer=paddle.nn.initializer.Constant(value=0.0)))
+
+ def forward(self, inp):
+ x = inp
+
+ x = self.norm1(x)
+
+ x = self.conv1(x)
+ x = self.conv2(x)
+ x = self.sg(x)
+ x = x * self.sca(x)
+ x = self.conv3(x)
+
+ if self.drop_out_rate > 0:
+ x = self.dropout1(x)
+
+ y = inp + x * self.beta
+
+ x = self.conv4(self.norm2(y))
+ x = self.sg(x)
+ x = self.conv5(x)
+
+ if self.drop_out_rate > 0:
+ x = self.dropout2(x)
+
+ return y + x * self.gamma
+
+
+@GENERATORS.register()
+class NAFNet(nn.Layer):
+
+ def __init__(self,
+ img_channel=3,
+ width=16,
+ middle_blk_num=1,
+ enc_blk_nums=[],
+ dec_blk_nums=[]):
+ super().__init__()
+
+ self.intro = nn.Conv2D(in_channels=img_channel,
+ out_channels=width,
+ kernel_size=3,
+ padding=1,
+ stride=1,
+ groups=1,
+ bias_attr=True)
+ self.ending = nn.Conv2D(in_channels=width,
+ out_channels=img_channel,
+ kernel_size=3,
+ padding=1,
+ stride=1,
+ groups=1,
+ bias_attr=True)
+
+ self.encoders = nn.LayerList()
+ self.decoders = nn.LayerList()
+ self.middle_blks = nn.LayerList()
+ self.ups = nn.LayerList()
+ self.downs = nn.LayerList()
+
+ chan = width
+ for num in enc_blk_nums:
+ self.encoders.append(
+ nn.Sequential(*[NAFBlock(chan) for _ in range(num)]))
+ self.downs.append(nn.Conv2D(chan, 2 * chan, 2, 2))
+ chan = chan * 2
+
+ self.middle_blks = \
+ nn.Sequential(
+ *[NAFBlock(chan) for _ in range(middle_blk_num)]
+ )
+
+ for num in dec_blk_nums:
+ self.ups.append(
+ nn.Sequential(nn.Conv2D(chan, chan * 2, 1, bias_attr=False),
+ nn.PixelShuffle(2)))
+ chan = chan // 2
+ self.decoders.append(
+ nn.Sequential(*[NAFBlock(chan) for _ in range(num)]))
+
+ self.padder_size = 2**len(self.encoders)
+
+ def forward(self, inp):
+ B, C, H, W = inp.shape
+ inp = self.check_image_size(inp)
+
+ x = self.intro(inp)
+
+ encs = []
+
+ for encoder, down in zip(self.encoders, self.downs):
+ x = encoder(x)
+ encs.append(x)
+ x = down(x)
+
+ x = self.middle_blks(x)
+
+ for decoder, up, enc_skip in zip(self.decoders, self.ups, encs[::-1]):
+ x = up(x)
+ x = x + enc_skip
+ x = decoder(x)
+
+ x = self.ending(x)
+ x = x + inp
+
+ return x[:, :, :H, :W]
+
+ def check_image_size(self, x):
+ _, _, h, w = x.shape
+ mod_pad_h = (self.padder_size - h % self.padder_size) % self.padder_size
+ mod_pad_w = (self.padder_size - w % self.padder_size) % self.padder_size
+ x = F.pad(x, [0, mod_pad_w, 0, mod_pad_h])
+ return x
+
+
+@GENERATORS.register()
+class NAFNetLocal(Local_Base, NAFNet):
+
+ def __init__(self,
+ *args,
+ train_size=(1, 3, 256, 256),
+ fast_imp=False,
+ **kwargs):
+ Local_Base.__init__(self)
+ NAFNet.__init__(self, *args, **kwargs)
+
+ N, C, H, W = train_size
+ base_size = (int(H * 1.5), int(W * 1.5))
+
+ self.eval()
+ with paddle.no_grad():
+ self.convert(base_size=base_size,
+ train_size=train_size,
+ fast_imp=fast_imp)
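+
+
+# Editor's note: a hedged smoke test (not part of the original code). NAFNet
+# pads the input to a multiple of 2 ** len(enc_blk_nums) internally, so
+# arbitrary spatial sizes are accepted; NAFNetLocal additionally replaces the
+# global average pooling with the local TLSC pooling defined above.
+if __name__ == '__main__':
+    net = NAFNet(img_channel=3, width=16, middle_blk_num=1,
+                 enc_blk_nums=[1, 1, 1, 2], dec_blk_nums=[1, 1, 1, 1])
+    x = paddle.rand([1, 3, 125, 99])
+    y = net(x)
+    print(y.shape)  # same spatial size as the input: [1, 3, 125, 99]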
diff --git a/ppgan/models/generators/occlusion_aware.py b/ppgan/models/generators/occlusion_aware.py
index 41a68fa58818ec67aa48e5531452a2b07b114b30..7558caca664c220cdba71ff6738da696c0e0ab29 100644
--- a/ppgan/models/generators/occlusion_aware.py
+++ b/ppgan/models/generators/occlusion_aware.py
@@ -1,22 +1,15 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
import paddle
from paddle import nn
import paddle.nn.functional as F
-from ...modules.first_order import ResBlock2d, SameBlock2d, UpBlock2d, DownBlock2d
+from ...modules.first_order import ResBlock2d, SameBlock2d, UpBlock2d, DownBlock2d, make_coordinate_grid
+from ...modules.first_order import MobileResBlock2d, MobileUpBlock2d, MobileDownBlock2d
from ...modules.dense_motion import DenseMotionNetwork
+import numpy as np
+import cv2
class OcclusionAwareGenerator(nn.Layer):
@@ -33,7 +26,9 @@ class OcclusionAwareGenerator(nn.Layer):
num_bottleneck_blocks,
estimate_occlusion_map=False,
dense_motion_params=None,
- estimate_jacobian=False):
+ estimate_jacobian=False,
+ inference=False,
+ mobile_net=False):
super(OcclusionAwareGenerator, self).__init__()
if dense_motion_params is not None:
@@ -41,52 +36,128 @@ class OcclusionAwareGenerator(nn.Layer):
num_kp=num_kp,
num_channels=num_channels,
estimate_occlusion_map=estimate_occlusion_map,
- **dense_motion_params)
+ **dense_motion_params,
+ mobile_net=mobile_net)
else:
self.dense_motion_network = None
- self.first = SameBlock2d(num_channels,
+ if mobile_net:
+ self.first = nn.Sequential(
+ SameBlock2d(num_channels,
+ num_channels,
+ kernel_size=3,
+ padding=1,
+ mobile_net=mobile_net),
+ SameBlock2d(num_channels,
+ num_channels,
+ kernel_size=3,
+ padding=1,
+ mobile_net=mobile_net),
+ SameBlock2d(num_channels,
+ block_expansion,
+ kernel_size=3,
+ padding=1,
+ mobile_net=mobile_net)
+ )
+ else:
+ self.first = SameBlock2d(num_channels,
block_expansion,
kernel_size=(7, 7),
- padding=(3, 3))
+ padding=(3, 3),
+ mobile_net=mobile_net)
down_blocks = []
- for i in range(num_down_blocks):
- in_features = min(max_features, block_expansion * (2**i))
- out_features = min(max_features, block_expansion * (2**(i + 1)))
- down_blocks.append(
- DownBlock2d(in_features,
- out_features,
- kernel_size=(3, 3),
- padding=(1, 1)))
+ if mobile_net:
+ for i in range(num_down_blocks):
+ in_features = min(max_features, block_expansion * (2**i))
+ out_features = min(max_features, block_expansion * (2**(i + 1)))
+ down_blocks.append(
+ MobileDownBlock2d(in_features,
+ out_features,
+ kernel_size=(3, 3),
+ padding=(1, 1)))
+ else:
+ for i in range(num_down_blocks):
+ in_features = min(max_features, block_expansion * (2**i))
+ out_features = min(max_features, block_expansion * (2**(i + 1)))
+ down_blocks.append(
+ DownBlock2d(in_features,
+ out_features,
+ kernel_size=(3, 3),
+ padding=(1, 1)))
self.down_blocks = nn.LayerList(down_blocks)
up_blocks = []
- for i in range(num_down_blocks):
- in_features = min(max_features,
- block_expansion * (2**(num_down_blocks - i)))
- out_features = min(max_features,
- block_expansion * (2**(num_down_blocks - i - 1)))
- up_blocks.append(
- UpBlock2d(in_features,
- out_features,
- kernel_size=(3, 3),
- padding=(1, 1)))
+ if mobile_net:
+ for i in range(num_down_blocks):
+ in_features = min(max_features,
+ block_expansion * (2**(num_down_blocks - i)))
+ out_features = min(
+ max_features,
+ block_expansion * (2**(num_down_blocks - i - 1)))
+ up_blocks.append(
+ MobileUpBlock2d(in_features,
+ out_features,
+ kernel_size=(3, 3),
+ padding=(1, 1)))
+ else:
+ for i in range(num_down_blocks):
+ in_features = min(max_features,
+ block_expansion * (2**(num_down_blocks - i)))
+ out_features = min(
+ max_features,
+ block_expansion * (2**(num_down_blocks - i - 1)))
+ up_blocks.append(
+ UpBlock2d(in_features,
+ out_features,
+ kernel_size=(3, 3),
+ padding=(1, 1)))
self.up_blocks = nn.LayerList(up_blocks)
self.bottleneck = paddle.nn.Sequential()
in_features = min(max_features, block_expansion * (2**num_down_blocks))
- for i in range(num_bottleneck_blocks):
- self.bottleneck.add_sublayer(
- 'r' + str(i),
- ResBlock2d(in_features, kernel_size=(3, 3), padding=(1, 1)))
-
- self.final = nn.Conv2D(block_expansion,
+ if mobile_net:
+ for i in range(num_bottleneck_blocks):
+ self.bottleneck.add_sublayer(
+ 'r' + str(i),
+ MobileResBlock2d(in_features,
+ kernel_size=(3, 3),
+ padding=(1, 1)))
+ else:
+ for i in range(num_bottleneck_blocks):
+ self.bottleneck.add_sublayer(
+ 'r' + str(i),
+ ResBlock2d(in_features, kernel_size=(3, 3), padding=(1, 1)))
+ if mobile_net:
+ self.final = nn.Sequential(
+ nn.Conv2D(block_expansion,
+ block_expansion,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=1),
+ nn.ReLU(),
+ nn.Conv2D(block_expansion,
+ block_expansion,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=1),
+ nn.ReLU(),
+ nn.Conv2D(block_expansion,
+ num_channels,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=1)
+ )
+ else:
+ self.final = nn.Conv2D(block_expansion,
num_channels,
kernel_size=(7, 7),
padding=(3, 3))
self.estimate_occlusion_map = estimate_occlusion_map
self.num_channels = num_channels
+ self.inference = inference
+ self.pad = 5
+ self.mobile_net = mobile_net
def deform_input(self, inp, deformation):
_, h_old, w_old, _ = deformation.shape
@@ -98,7 +169,25 @@ class OcclusionAwareGenerator(nn.Layer):
mode='bilinear',
align_corners=False)
deformation = deformation.transpose([0, 2, 3, 1])
- return F.grid_sample(inp, deformation, align_corners=False)
+ if self.inference:
+ identity_grid = make_coordinate_grid((h, w), type=inp.dtype)
+ identity_grid = identity_grid.reshape([1, h, w, 2])
+ visualization_matrix = np.zeros((h, w)).astype("float32")
+ visualization_matrix[self.pad:h - self.pad,
+ self.pad:w - self.pad] = 1.0
+ gauss_kernel = paddle.to_tensor(
+ cv2.GaussianBlur(visualization_matrix, (9, 9),
+ 0.0,
+ borderType=cv2.BORDER_ISOLATED))
+ gauss_kernel = gauss_kernel.unsqueeze(0).unsqueeze(-1)
+ deformation = gauss_kernel * deformation + (
+ 1 - gauss_kernel) * identity_grid
+
+ return F.grid_sample(inp,
+ deformation,
+ mode='bilinear',
+ padding_mode='zeros',
+ align_corners=True)
def forward(self, source_image, kp_driving, kp_source):
# Encoding (downsampling) part
@@ -130,6 +219,12 @@ class OcclusionAwareGenerator(nn.Layer):
size=out.shape[2:],
mode='bilinear',
align_corners=False)
+ if self.inference and not self.mobile_net:
+ h, w = occlusion_map.shape[2:]
+ occlusion_map[:, :, 0:self.pad, :] = 1.0
+ occlusion_map[:, :, :, 0:self.pad] = 1.0
+ occlusion_map[:, :, h - self.pad:h, :] = 1.0
+ occlusion_map[:, :, :, w - self.pad:w] = 1.0
out = out * occlusion_map
output_dict["deformed"] = self.deform_input(source_image,
diff --git a/ppgan/models/generators/pan.py b/ppgan/models/generators/pan.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e11975643d1c8ae80a4fee8b19a42b17ed5377e
--- /dev/null
+++ b/ppgan/models/generators/pan.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .builder import GENERATORS
+
+
+def make_multi_blocks(func, num_layers):
+ """Make layers by stacking the same blocks.
+
+ Args:
+ func (nn.Layer): nn.Layer class for basic block.
+ num_layers (int): number of blocks.
+
+ Returns:
+ nn.Sequential: Stacked blocks in nn.Sequential.
+ """
+ Blocks = nn.Sequential()
+ for i in range(num_layers):
+ Blocks.add_sublayer('block%d' % i, func())
+ return Blocks
+
+
+class PA(nn.Layer):
+ '''PA is pixel attention'''
+ def __init__(self, nf):
+
+ super(PA, self).__init__()
+ self.conv = nn.Conv2D(nf, nf, 1)
+ self.sigmoid = nn.Sigmoid()
+
+ def forward(self, x):
+
+ y = self.conv(x)
+ y = self.sigmoid(y)
+ out = x * y
+
+ return out
+
+
+class PAConv(nn.Layer):
+ def __init__(self, nf, k_size=3):
+
+ super(PAConv, self).__init__()
+ self.k2 = nn.Conv2D(nf, nf, 1) # 1x1 convolution nf->nf
+ self.sigmoid = nn.Sigmoid()
+ self.k3 = nn.Conv2D(nf,
+ nf,
+ kernel_size=k_size,
+ padding=(k_size - 1) // 2,
+ bias_attr=False) # 3x3 convolution
+ self.k4 = nn.Conv2D(nf,
+ nf,
+ kernel_size=k_size,
+ padding=(k_size - 1) // 2,
+ bias_attr=False) # 3x3 convolution
+
+ def forward(self, x):
+
+ y = self.k2(x)
+ y = self.sigmoid(y)
+
+ out = self.k3(x) * y
+ out = self.k4(out)
+
+ return out
+
+
+class SCPA(nn.Layer):
+ """
+ SCPA is modified from SCNet (Jiang-Jiang Liu et al. Improving Convolutional Networks with Self-Calibrated Convolutions. In CVPR, 2020)
+ """
+ def __init__(self, nf, reduction=2, stride=1, dilation=1):
+ super(SCPA, self).__init__()
+ group_width = nf // reduction
+
+ self.conv1_a = nn.Conv2D(nf,
+ group_width,
+ kernel_size=1,
+ bias_attr=False)
+ self.conv1_b = nn.Conv2D(nf,
+ group_width,
+ kernel_size=1,
+ bias_attr=False)
+
+ self.k1 = nn.Sequential(
+ nn.Conv2D(group_width,
+ group_width,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ dilation=dilation,
+ bias_attr=False))
+
+ self.PAConv = PAConv(group_width)
+
+ self.conv3 = nn.Conv2D(group_width * reduction,
+ nf,
+ kernel_size=1,
+ bias_attr=False)
+
+ self.lrelu = nn.LeakyReLU(negative_slope=0.2)
+
+ def forward(self, x):
+ residual = x
+
+ out_a = self.conv1_a(x)
+ out_b = self.conv1_b(x)
+ out_a = self.lrelu(out_a)
+ out_b = self.lrelu(out_b)
+
+ out_a = self.k1(out_a)
+ out_b = self.PAConv(out_b)
+ out_a = self.lrelu(out_a)
+ out_b = self.lrelu(out_b)
+
+ out = self.conv3(paddle.concat([out_a, out_b], axis=1))
+ out += residual
+
+ return out
+
+
+@GENERATORS.register()
+class PAN(nn.Layer):
+ def __init__(self, in_nc, out_nc, nf, unf, nb, scale=4):
+ super(PAN, self).__init__()
+ # SCPA
+ SCPA_block_f = functools.partial(SCPA, nf=nf, reduction=2)
+ self.scale = scale
+
+ ### first convolution
+ self.conv_first = nn.Conv2D(in_nc, nf, 3, 1, 1)
+
+ ### main blocks
+ self.SCPA_trunk = make_multi_blocks(SCPA_block_f, nb)
+ self.trunk_conv = nn.Conv2D(nf, nf, 3, 1, 1)
+
+ #### upsampling
+ self.upconv1 = nn.Conv2D(nf, unf, 3, 1, 1)
+ self.att1 = PA(unf)
+ self.HRconv1 = nn.Conv2D(unf, unf, 3, 1, 1)
+
+ if self.scale == 4:
+ self.upconv2 = nn.Conv2D(unf, unf, 3, 1, 1)
+ self.att2 = PA(unf)
+ self.HRconv2 = nn.Conv2D(unf, unf, 3, 1, 1)
+
+ self.conv_last = nn.Conv2D(unf, out_nc, 3, 1, 1)
+ self.lrelu = nn.LeakyReLU(negative_slope=0.2)
+
+ def forward(self, x):
+
+ fea = self.conv_first(x)
+ trunk = self.trunk_conv(self.SCPA_trunk(fea))
+ fea = fea + trunk
+
+ if self.scale == 2 or self.scale == 3:
+ fea = self.upconv1(
+ F.interpolate(fea, scale_factor=self.scale, mode='nearest'))
+ fea = self.lrelu(self.att1(fea))
+ fea = self.lrelu(self.HRconv1(fea))
+ elif self.scale == 4:
+ fea = self.upconv1(
+ F.interpolate(fea, scale_factor=2, mode='nearest'))
+ fea = self.lrelu(self.att1(fea))
+ fea = self.lrelu(self.HRconv1(fea))
+ fea = self.upconv2(
+ F.interpolate(fea, scale_factor=2, mode='nearest'))
+ fea = self.lrelu(self.att2(fea))
+ fea = self.lrelu(self.HRconv2(fea))
+
+ out = self.conv_last(fea)
+
+ ILR = F.interpolate(x,
+ scale_factor=self.scale,
+ mode='bilinear',
+ align_corners=False)
+ out = out + ILR
+ return out
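+
+
+# A minimal usage sketch (hypothetical layer sizes, not taken from the repo's
+# configs):
+#     net = PAN(in_nc=3, out_nc=3, nf=40, unf=24, nb=16, scale=4)
+#     sr = net(paddle.rand([1, 3, 64, 64]))  # -> [1, 3, 256, 256]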
diff --git a/ppgan/models/generators/prenet.py b/ppgan/models/generators/prenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd08271434b7abe8ef0c67666b311017295792f5
--- /dev/null
+++ b/ppgan/models/generators/prenet.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# code was heavily based on https://github.com/csdwren/PReNet
+# Users should be careful when adopting these functions for any commercial use.
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .builder import GENERATORS
+
+
+def convWithBias(in_channels, out_channels, kernel_size, stride, padding):
+ """ Obtain a 2d convolution layer with bias and initialized by KaimingUniform
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ kernel_size (int): Convolution kernel size
+ stride (int): Convolution stride
+ padding (int|tuple): Convolution padding.
+ """
+ if isinstance(kernel_size, int):
+ fan_in = kernel_size * kernel_size * in_channels
+ else:
+ fan_in = kernel_size[0] * kernel_size[1] * in_channels
+ bound = 1 / math.sqrt(fan_in)
+ bias_attr = paddle.framework.ParamAttr(
+ initializer=nn.initializer.Uniform(-bound, bound))
+ weight_attr = paddle.framework.ParamAttr(
+ initializer=nn.initializer.KaimingUniform(fan_in=6 * fan_in))
+ conv = nn.Conv2D(in_channels,
+ out_channels,
+ kernel_size,
+ stride,
+ padding,
+ weight_attr=weight_attr,
+ bias_attr=bias_attr)
+ return conv
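+
+# Note: passing fan_in=6 * fan_in to KaimingUniform is presumably meant to
+# reproduce PyTorch's default Conv2D initialization (kaiming_uniform_ with
+# a=sqrt(5)), which bounds the weights by 1/sqrt(fan_in) and so matches the
+# bias bound computed above.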
+
+
+@GENERATORS.register()
+class PReNet(nn.Layer):
+ """
+ Args:
+ recurrent_iter (int): Number of iterations.
+ Default: 6.
+        use_GPU (bool): whether to use GPU or not.
+ Default: True.
+ """
+
+ def __init__(self, recurrent_iter=6, use_GPU=True):
+ super(PReNet, self).__init__()
+ self.iteration = recurrent_iter
+ self.use_GPU = use_GPU
+
+ self.conv0 = nn.Sequential(convWithBias(6, 32, 3, 1, 1), nn.ReLU())
+ self.res_conv1 = nn.Sequential(convWithBias(32, 32, 3, 1, 1), nn.ReLU(),
+ convWithBias(32, 32, 3, 1, 1), nn.ReLU())
+ self.res_conv2 = nn.Sequential(convWithBias(32, 32, 3, 1, 1), nn.ReLU(),
+ convWithBias(32, 32, 3, 1, 1), nn.ReLU())
+ self.res_conv3 = nn.Sequential(convWithBias(32, 32, 3, 1, 1), nn.ReLU(),
+ convWithBias(32, 32, 3, 1, 1), nn.ReLU())
+ self.res_conv4 = nn.Sequential(convWithBias(32, 32, 3, 1, 1), nn.ReLU(),
+ convWithBias(32, 32, 3, 1, 1), nn.ReLU())
+ self.res_conv5 = nn.Sequential(convWithBias(32, 32, 3, 1, 1), nn.ReLU(),
+ convWithBias(32, 32, 3, 1, 1), nn.ReLU())
+ self.conv_i = nn.Sequential(convWithBias(32 + 32, 32, 3, 1, 1),
+ nn.Sigmoid())
+ self.conv_f = nn.Sequential(convWithBias(32 + 32, 32, 3, 1, 1),
+ nn.Sigmoid())
+ self.conv_g = nn.Sequential(convWithBias(32 + 32, 32, 3, 1, 1),
+ nn.Tanh())
+ self.conv_o = nn.Sequential(convWithBias(32 + 32, 32, 3, 1, 1),
+ nn.Sigmoid())
+ self.conv = nn.Sequential(convWithBias(32, 3, 3, 1, 1), )
+
+ def forward(self, input):
+ batch_size, row, col = input.shape[0], input.shape[2], input.shape[3]
+
+ x = input
+
+ h = paddle.to_tensor(paddle.zeros(shape=(batch_size, 32, row, col),
+ dtype='float32'),
+ stop_gradient=False)
+ c = paddle.to_tensor(paddle.zeros(shape=(batch_size, 32, row, col),
+ dtype='float32'),
+ stop_gradient=False)
+
+ x_list = []
+ for _ in range(self.iteration):
+ x = paddle.concat((input, x), 1)
+ x = self.conv0(x)
+
+ x = paddle.concat((x, h), 1)
+ i = self.conv_i(x)
+ f = self.conv_f(x)
+ g = self.conv_g(x)
+ o = self.conv_o(x)
+ c = f * c + i * g
+ h = o * paddle.tanh(c)
+
+ x = h
+ resx = x
+ x = F.relu(self.res_conv1(x) + resx)
+ resx = x
+ x = F.relu(self.res_conv2(x) + resx)
+ resx = x
+ x = F.relu(self.res_conv3(x) + resx)
+ resx = x
+ x = F.relu(self.res_conv4(x) + resx)
+ resx = x
+ x = F.relu(self.res_conv5(x) + resx)
+ x = self.conv(x)
+
+ x = x + input
+ x_list.append(x)
+ return x
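+
+
+# A minimal usage sketch (hypothetical input size):
+#     net = PReNet(recurrent_iter=6)
+#     derained = net(paddle.rand([1, 3, 128, 128]))  # -> [1, 3, 128, 128]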
diff --git a/ppgan/models/generators/rcan.py b/ppgan/models/generators/rcan.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fe989b32e27821301f19ff2f00544ae44056f6d
--- /dev/null
+++ b/ppgan/models/generators/rcan.py
@@ -0,0 +1,202 @@
+# based on https://github.com/kongdebug/RCAN-Paddle
+import math
+import paddle
+import paddle.nn as nn
+
+from .builder import GENERATORS
+
+
+def default_conv(in_channels, out_channels, kernel_size, bias=True):
+ weight_attr = paddle.ParamAttr(
+ initializer=paddle.nn.initializer.XavierUniform(), need_clip=True)
+ return nn.Conv2D(in_channels,
+ out_channels,
+ kernel_size,
+ padding=(kernel_size // 2),
+ weight_attr=weight_attr,
+ bias_attr=bias)
+
+
+class MeanShift(nn.Conv2D):
+
+ def __init__(self, rgb_range, rgb_mean, rgb_std, sign=-1):
+ super(MeanShift, self).__init__(3, 3, kernel_size=1)
+ std = paddle.to_tensor(rgb_std)
+ self.weight.set_value(paddle.eye(3).reshape([3, 3, 1, 1]))
+ self.weight.set_value(self.weight / (std.reshape([3, 1, 1, 1])))
+
+ mean = paddle.to_tensor(rgb_mean)
+ self.bias.set_value(sign * rgb_range * mean / std)
+
+ self.weight.trainable = False
+ self.bias.trainable = False
+
+
+## Channel Attention (CA) Layer
+class CALayer(nn.Layer):
+
+ def __init__(self, channel, reduction=16):
+ super(CALayer, self).__init__()
+ # global average pooling: feature --> point
+ self.avg_pool = nn.AdaptiveAvgPool2D(1)
+ # feature channel downscale and upscale --> channel weight
+ self.conv_du = nn.Sequential(
+ nn.Conv2D(channel,
+ channel // reduction,
+ 1,
+ padding=0,
+ bias_attr=True), nn.ReLU(),
+ nn.Conv2D(channel // reduction,
+ channel,
+ 1,
+ padding=0,
+ bias_attr=True), nn.Sigmoid())
+
+ def forward(self, x):
+ y = self.avg_pool(x)
+ y = self.conv_du(y)
+ return x * y
+
+
+class RCAB(nn.Layer):
+
+ def __init__(self,
+ conv,
+ n_feat,
+ kernel_size,
+ reduction=16,
+ bias=True,
+ bn=False,
+ act=nn.ReLU(),
+ res_scale=1):
+ super(RCAB, self).__init__()
+ modules_body = []
+ for i in range(2):
+ modules_body.append(conv(n_feat, n_feat, kernel_size, bias=bias))
+ if bn: modules_body.append(nn.BatchNorm2D(n_feat))
+ if i == 0: modules_body.append(act)
+ modules_body.append(CALayer(n_feat, reduction))
+ self.body = nn.Sequential(*modules_body)
+ self.res_scale = res_scale
+
+ def forward(self, x):
+ res = self.body(x)
+ res += x
+ return res
+
+
+## Residual Group (RG)
+class ResidualGroup(nn.Layer):
+
+ def __init__(self, conv, n_feat, kernel_size, reduction, act, res_scale,
+ n_resblocks):
+ super(ResidualGroup, self).__init__()
+ modules_body = []
+ modules_body = [
+ RCAB(
+ conv, n_feat, kernel_size, reduction, bias=True, bn=False, act=nn.ReLU(), res_scale=1) \
+ for _ in range(n_resblocks)]
+ modules_body.append(conv(n_feat, n_feat, kernel_size))
+ self.body = nn.Sequential(*modules_body)
+
+ def forward(self, x):
+ res = self.body(x)
+ res += x
+ return res
+
+
+class Upsampler(nn.Sequential):
+
+ def __init__(self, conv, scale, n_feats, bn=False, act=False, bias=True):
+ m = []
+ if (scale & (scale - 1)) == 0: # Is scale = 2^n?
+ for _ in range(int(math.log(scale, 2))):
+ m.append(conv(n_feats, 4 * n_feats, 3, bias))
+ m.append(nn.PixelShuffle(2))
+ if bn: m.append(nn.BatchNorm2D(n_feats))
+
+ if act == 'relu':
+ m.append(nn.ReLU())
+ elif act == 'prelu':
+ m.append(nn.PReLU(n_feats))
+
+ elif scale == 3:
+ m.append(conv(n_feats, 9 * n_feats, 3, bias))
+ m.append(nn.PixelShuffle(3))
+ if bn: m.append(nn.BatchNorm2D(n_feats))
+
+ if act == 'relu':
+ m.append(nn.ReLU())
+ elif act == 'prelu':
+ m.append(nn.PReLU(n_feats))
+ else:
+ raise NotImplementedError
+
+ super(Upsampler, self).__init__(*m)
+
+
+@GENERATORS.register()
+class RCAN(nn.Layer):
+
+ def __init__(
+ self,
+ scale,
+ n_resgroups,
+ n_resblocks,
+ n_feats=64,
+ n_colors=3,
+ rgb_range=255,
+ kernel_size=3,
+ reduction=16,
+ conv=default_conv,
+ ):
+ super(RCAN, self).__init__()
+ self.scale = scale
+ act = nn.ReLU()
+
+ rgb_mean = (0.4488, 0.4371, 0.4040)
+ rgb_std = (1.0, 1.0, 1.0)
+ self.sub_mean = MeanShift(rgb_range, rgb_mean, rgb_std)
+
+ # define head module
+ modules_head = [conv(n_colors, n_feats, kernel_size)]
+
+ # define body module
+ modules_body = [
+ ResidualGroup(
+ conv, n_feats, kernel_size, reduction, act=act, res_scale= 1, n_resblocks=n_resblocks) \
+ for _ in range(n_resgroups)]
+
+ modules_body.append(conv(n_feats, n_feats, kernel_size))
+
+ # define tail module
+ modules_tail = [
+ Upsampler(conv, scale, n_feats, act=False),
+ conv(n_feats, n_colors, kernel_size)
+ ]
+
+ self.head = nn.Sequential(*modules_head)
+ self.body = nn.Sequential(*modules_body)
+ self.tail = nn.Sequential(*modules_tail)
+
+ self.add_mean = MeanShift(rgb_range, rgb_mean, rgb_std, 1)
+
+ def forward(self, x):
+ x = self.sub_mean(x)
+ x = self.head(x)
+
+ res = self.body(x)
+ res += x
+
+ x = self.tail(res)
+ x = self.add_mean(x)
+
+ return x
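+
+
+# A minimal usage sketch (hypothetical group/block counts; inputs are expected
+# in the default rgb_range of 0-255):
+#     net = RCAN(scale=4, n_resgroups=10, n_resblocks=20)
+#     sr = net(paddle.rand([1, 3, 48, 48]) * 255)  # -> [1, 3, 192, 192]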
diff --git a/ppgan/models/generators/remaster.py b/ppgan/models/generators/remaster.py
index 8de4bd07e05dd583f881fe1b2c8b62ea419d6765..80340216645da0d092ef80ffb68dcbeb504941e9 100644
--- a/ppgan/models/generators/remaster.py
+++ b/ppgan/models/generators/remaster.py
@@ -91,16 +91,14 @@ class UpsampleConcat(nn.Layer):
class SourceReferenceAttention(nn.Layer):
"""
Source-Reference Attention Layer
+
+
+ Args:
+ in_planes_s (int): Number of input source feature vector channels.
+ in_planes_r (int): Number of input reference feature vector channels.
+
"""
def __init__(self, in_planes_s, in_planes_r):
- """
- Parameters
- ----------
- in_planes_s: int
- Number of input source feature vector channels.
- in_planes_r: int
- Number of input reference feature vector channels.
- """
super(SourceReferenceAttention, self).__init__()
self.query_conv = nn.Conv3D(in_channels=in_planes_s,
out_channels=in_planes_s // 8,
diff --git a/ppgan/models/generators/resnet.py b/ppgan/models/generators/resnet.py
index 5e83e7116817c217908104d5f211a4f711d12584..3c30d1aedff6714ecb1eef53ea7ecc5b9c133f2a 100644
--- a/ppgan/models/generators/resnet.py
+++ b/ppgan/models/generators/resnet.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was based on https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
+
import paddle
import paddle.nn as nn
import functools
@@ -26,6 +28,16 @@ class ResnetGenerator(nn.Layer):
"""Resnet-based generator that consists of Resnet blocks between a few downsampling/upsampling operations.
code and idea from Justin Johnson's neural style transfer project(https://github.com/jcjohnson/fast-neural-style)
+
+ Args:
+ input_nc (int): the number of channels in input images
+ output_nc (int): the number of channels in output images
+ ngf (int): the number of filters in the last conv layer
+ norm_type (str): the name of the normalization layer: batch | instance | none
+ use_dropout (bool): if use dropout layers
+ n_blocks (int): the number of ResNet blocks
+ padding_type (str): the name of padding layer in conv layers: reflect | replicate | zero
+
"""
def __init__(self,
input_nc,
@@ -35,17 +47,7 @@ class ResnetGenerator(nn.Layer):
use_dropout=False,
n_blocks=6,
padding_type='reflect'):
- """Construct a Resnet-based generator
- Args:
- input_nc (int) -- the number of channels in input images
- output_nc (int) -- the number of channels in output images
- ngf (int) -- the number of filters in the last conv layer
- norm_layer -- normalization layer
- use_dropout (bool) -- if use dropout layers
- n_blocks (int) -- the number of ResNet blocks
- padding_type (str) -- the name of padding layer in conv layers: reflect | replicate | zero
- """
assert (n_blocks >= 0)
super(ResnetGenerator, self).__init__()
@@ -133,12 +135,12 @@ class ResnetBlock(nn.Layer):
use_bias):
"""Construct a convolutional block.
- Parameters:
- dim (int) -- the number of channels in the conv layer.
- padding_type (str) -- the name of padding layer: reflect | replicate | zero
- norm_layer -- normalization layer
- use_dropout (bool) -- if use dropout layers.
- use_bias (bool) -- if the conv layer uses bias or not
+ Args:
+ dim (int): the number of channels in the conv layer.
+ padding_type (str): the name of padding layer: reflect | replicate | zero.
+ norm_layer (paddle.nn.Layer): normalization layer.
+ use_dropout (bool): whether to use dropout layers.
+ use_bias (bool): whether to use the conv layer bias or not.
Returns a conv block (with a conv layer, a normalization layer, and a non-linearity layer (ReLU))
"""
diff --git a/ppgan/models/generators/resnet_ugatit.py b/ppgan/models/generators/resnet_ugatit.py
index 187caf4ac15e3e47b0fc5a7b12b4c303ee6cd831..d5991ae4e5c997d4bd2b97600e05867bc791eedf 100644
--- a/ppgan/models/generators/resnet_ugatit.py
+++ b/ppgan/models/generators/resnet_ugatit.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# MIT License
+# Copyright (c) 2019 Hyeonwoo Kang
+# code was based on https://github.com/znxlwm/UGATIT-pytorch
import functools
import paddle
diff --git a/ppgan/models/generators/resnet_ugatit_p2c.py b/ppgan/models/generators/resnet_ugatit_p2c.py
index 865fd9ca5d8ea08d55292fc6db967414d031adbd..e7874c8bccfb97ad05f9b04aea000889e78c9805 100644
--- a/ppgan/models/generators/resnet_ugatit_p2c.py
+++ b/ppgan/models/generators/resnet_ugatit_p2c.py
@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/znxlwm/UGATIT-pytorch
+# MIT License
+# Copyright (c) 2019 Hyeonwoo Kang
import paddle
import paddle.nn as nn
@@ -45,72 +48,82 @@ class ResnetUGATITP2CGenerator(nn.Layer):
nn.ReLU()
]
- DownBlock += [
- HourGlass(ngf, ngf),
- HourGlass(ngf, ngf)
- ]
+ DownBlock += [HourGlass(ngf, ngf), HourGlass(ngf, ngf)]
# Down-Sampling
n_downsampling = 2
for i in range(n_downsampling):
- mult = 2 ** i
+ mult = 2**i
DownBlock += [
nn.Pad2D([1, 1, 1, 1], 'reflect'),
- nn.Conv2D(ngf*mult, ngf*mult*2, kernel_size=3, stride=2, bias_attr=False),
- nn.InstanceNorm2D(ngf*mult*2, weight_attr=False, bias_attr=False),
+ nn.Conv2D(ngf * mult,
+ ngf * mult * 2,
+ kernel_size=3,
+ stride=2,
+ bias_attr=False),
+ nn.InstanceNorm2D(ngf * mult * 2,
+ weight_attr=False,
+ bias_attr=False),
nn.ReLU()
]
# Encoder Bottleneck
- mult = 2 ** n_downsampling
+ mult = 2**n_downsampling
for i in range(n_blocks):
- setattr(self, 'EncodeBlock'+str(i+1), ResnetBlock(ngf*mult))
+ setattr(self, 'EncodeBlock' + str(i + 1), ResnetBlock(ngf * mult))
# Class Activation Map
- self.gap_fc = nn.Linear(ngf*mult, 1, bias_attr=False)
- self.gmp_fc = nn.Linear(ngf*mult, 1, bias_attr=False)
- self.conv1x1 = nn.Conv2D(ngf*mult*2, ngf*mult, kernel_size=1, stride=1)
+ self.gap_fc = nn.Linear(ngf * mult, 1, bias_attr=False)
+ self.gmp_fc = nn.Linear(ngf * mult, 1, bias_attr=False)
+ self.conv1x1 = nn.Conv2D(ngf * mult * 2,
+ ngf * mult,
+ kernel_size=1,
+ stride=1)
self.relu = nn.ReLU()
# Gamma, Beta block
FC = []
if self.light:
FC += [
- nn.Linear(ngf*mult, ngf*mult, bias_attr=False),
+ nn.Linear(ngf * mult, ngf * mult, bias_attr=False),
nn.ReLU(),
- nn.Linear(ngf*mult, ngf*mult, bias_attr=False),
+ nn.Linear(ngf * mult, ngf * mult, bias_attr=False),
nn.ReLU()
]
else:
FC += [
- nn.Linear(img_size//mult*img_size//mult*ngf*mult, ngf*mult, bias_attr=False),
+ nn.Linear(img_size // mult * img_size // mult * ngf * mult,
+ ngf * mult,
+ bias_attr=False),
nn.ReLU(),
- nn.Linear(ngf*mult, ngf*mult, bias_attr=False),
+ nn.Linear(ngf * mult, ngf * mult, bias_attr=False),
nn.ReLU()
]
# Decoder Bottleneck
- mult = 2 ** n_downsampling
+ mult = 2**n_downsampling
for i in range(n_blocks):
- setattr(self, 'DecodeBlock'+str(i + 1), ResnetSoftAdaLINBlock(ngf*mult))
+ setattr(self, 'DecodeBlock' + str(i + 1),
+ ResnetSoftAdaLINBlock(ngf * mult))
# Up-Sampling
UpBlock = []
for i in range(n_downsampling):
- mult = 2 ** (n_downsampling - i)
+ mult = 2**(n_downsampling - i)
UpBlock += [
nn.Upsample(scale_factor=2),
nn.Pad2D([1, 1, 1, 1], 'reflect'),
- nn.Conv2D(ngf*mult, ngf*mult//2, kernel_size=3, stride=1, bias_attr=False),
- LIN(ngf*mult//2),
+ nn.Conv2D(ngf * mult,
+ ngf * mult // 2,
+ kernel_size=3,
+ stride=1,
+ bias_attr=False),
+ LIN(ngf * mult // 2),
nn.ReLU()
]
- UpBlock += [
- HourGlass(ngf, ngf),
- HourGlass(ngf, ngf, False)
- ]
+ UpBlock += [HourGlass(ngf, ngf), HourGlass(ngf, ngf, False)]
UpBlock += [
nn.Pad2D([3, 3, 3, 3], 'reflect'),
@@ -129,8 +142,9 @@ class ResnetUGATITP2CGenerator(nn.Layer):
content_features = []
for i in range(self.n_blocks):
- x = getattr(self, 'EncodeBlock'+str(i+1))(x)
- content_features.append(F.adaptive_avg_pool2d(x, 1).reshape([bs, -1]))
+ x = getattr(self, 'EncodeBlock' + str(i + 1))(x)
+ content_features.append(
+ F.adaptive_avg_pool2d(x, 1).reshape([bs, -1]))
gap = F.adaptive_avg_pool2d(x, 1)
gap_logit = self.gap_fc(gap.reshape([bs, -1]))
@@ -155,7 +169,10 @@ class ResnetUGATITP2CGenerator(nn.Layer):
style_features = self.FC(x.reshape([bs, -1]))
for i in range(self.n_blocks):
- x = getattr(self, 'DecodeBlock'+str(i+1))(x, content_features[4-i-1], style_features)
+ x = getattr(self,
+ 'DecodeBlock' + str(i + 1))(x,
+ content_features[4 - i - 1],
+ style_features)
out = self.UpBlock(x)
@@ -168,25 +185,27 @@ class ConvBlock(nn.Layer):
self.dim_in = dim_in
self.dim_out = dim_out
- self.conv_block1 = self.__convblock(dim_in, dim_out//2)
- self.conv_block2 = self.__convblock(dim_out//2, dim_out//4)
- self.conv_block3 = self.__convblock(dim_out//4, dim_out//4)
+ self.conv_block1 = self.__convblock(dim_in, dim_out // 2)
+ self.conv_block2 = self.__convblock(dim_out // 2, dim_out // 4)
+ self.conv_block3 = self.__convblock(dim_out // 4, dim_out // 4)
if self.dim_in != self.dim_out:
self.conv_skip = nn.Sequential(
nn.InstanceNorm2D(dim_in, weight_attr=False, bias_attr=False),
nn.ReLU(),
- nn.Conv2D(dim_in, dim_out, kernel_size=1, stride=1, bias_attr=False)
- )
+ nn.Conv2D(dim_in,
+ dim_out,
+ kernel_size=1,
+ stride=1,
+ bias_attr=False))
@staticmethod
def __convblock(dim_in, dim_out):
return nn.Sequential(
nn.InstanceNorm2D(dim_in, weight_attr=False, bias_attr=False),
- nn.ReLU(),
- nn.Pad2D([1, 1, 1, 1], 'reflect'),
- nn.Conv2D(dim_in, dim_out, kernel_size=3, stride=1, bias_attr=False)
- )
+ nn.ReLU(), nn.Pad2D([1, 1, 1, 1], 'reflect'),
+ nn.Conv2D(dim_in, dim_out, kernel_size=3, stride=1,
+ bias_attr=False))
def forward(self, x):
residual = x
@@ -210,24 +229,25 @@ class HourGlassBlock(nn.Layer):
self.n_block = 9
for i in range(self.n_skip):
- setattr(self, 'ConvBlockskip'+str(i+1), ConvBlock(dim_in, dim_in))
+ setattr(self, 'ConvBlockskip' + str(i + 1),
+ ConvBlock(dim_in, dim_in))
for i in range(self.n_block):
- setattr(self, 'ConvBlock'+str(i+1), ConvBlock(dim_in, dim_in))
+ setattr(self, 'ConvBlock' + str(i + 1), ConvBlock(dim_in, dim_in))
def forward(self, x):
skips = []
for i in range(self.n_skip):
- skips.append(getattr(self, 'ConvBlockskip'+str(i+1))(x))
+ skips.append(getattr(self, 'ConvBlockskip' + str(i + 1))(x))
x = F.avg_pool2d(x, 2)
- x = getattr(self, 'ConvBlock'+str(i+1))(x)
+ x = getattr(self, 'ConvBlock' + str(i + 1))(x)
x = self.ConvBlock5(x)
for i in range(self.n_skip):
- x = getattr(self, 'ConvBlock'+str(i+6))(x)
+ x = getattr(self, 'ConvBlock' + str(i + 6))(x)
x = F.upsample(x, scale_factor=2)
- x = skips[self.n_skip-i-1] + x
+ x = skips[self.n_skip - i - 1] + x
return x
@@ -238,12 +258,14 @@ class HourGlass(nn.Layer):
self.use_res = use_res
self.HG = nn.Sequential(
- HourGlassBlock(dim_in),
- ConvBlock(dim_out, dim_out),
- nn.Conv2D(dim_out, dim_out, kernel_size=1, stride=1, bias_attr=False),
+ HourGlassBlock(dim_in), ConvBlock(dim_out, dim_out),
+ nn.Conv2D(dim_out,
+ dim_out,
+ kernel_size=1,
+ stride=1,
+ bias_attr=False),
nn.InstanceNorm2D(dim_out, weight_attr=False, bias_attr=False),
- nn.ReLU()
- )
+ nn.ReLU())
self.Conv1 = nn.Conv2D(dim_out, 3, kernel_size=1, stride=1)
@@ -292,12 +314,20 @@ class ResnetSoftAdaLINBlock(nn.Layer):
def __init__(self, dim, use_bias=False):
super(ResnetSoftAdaLINBlock, self).__init__()
self.pad1 = nn.Pad2D([1, 1, 1, 1], 'reflect')
- self.conv1 = nn.Conv2D(dim, dim, kernel_size=3, stride=1, bias_attr=use_bias)
+ self.conv1 = nn.Conv2D(dim,
+ dim,
+ kernel_size=3,
+ stride=1,
+ bias_attr=use_bias)
self.norm1 = SoftAdaLIN(dim)
self.relu1 = nn.ReLU()
self.pad2 = nn.Pad2D([1, 1, 1, 1], 'reflect')
- self.conv2 = nn.Conv2D(dim, dim, kernel_size=3, stride=1, bias_attr=use_bias)
+ self.conv2 = nn.Conv2D(dim,
+ dim,
+ kernel_size=3,
+ stride=1,
+ bias_attr=use_bias)
self.norm2 = SoftAdaLIN(dim)
def forward(self, x, content_features, style_features):
@@ -317,23 +347,28 @@ class SoftAdaLIN(nn.Layer):
super(SoftAdaLIN, self).__init__()
self.norm = AdaLIN(num_features, eps)
- self.w_gamma = self.create_parameter([1, num_features], default_initializer=nn.initializer.Constant(0.))
- self.w_beta = self.create_parameter([1, num_features], default_initializer=nn.initializer.Constant(0.))
-
- self.c_gamma = nn.Sequential(nn.Linear(num_features, num_features, bias_attr=False),
- nn.ReLU(),
- nn.Linear(num_features, num_features, bias_attr=False))
- self.c_beta = nn.Sequential(nn.Linear(num_features, num_features, bias_attr=False),
- nn.ReLU(),
- nn.Linear(num_features, num_features, bias_attr=False))
+ self.w_gamma = self.create_parameter(
+ [1, num_features], default_initializer=nn.initializer.Constant(0.))
+ self.w_beta = self.create_parameter(
+ [1, num_features], default_initializer=nn.initializer.Constant(0.))
+
+ self.c_gamma = nn.Sequential(
+ nn.Linear(num_features, num_features, bias_attr=False), nn.ReLU(),
+ nn.Linear(num_features, num_features, bias_attr=False))
+ self.c_beta = nn.Sequential(
+ nn.Linear(num_features, num_features, bias_attr=False), nn.ReLU(),
+ nn.Linear(num_features, num_features, bias_attr=False))
self.s_gamma = nn.Linear(num_features, num_features, bias_attr=False)
self.s_beta = nn.Linear(num_features, num_features, bias_attr=False)
def forward(self, x, content_features, style_features):
- content_gamma, content_beta = self.c_gamma(content_features), self.c_beta(content_features)
- style_gamma, style_beta = self.s_gamma(style_features), self.s_beta(style_features)
+ content_gamma, content_beta = self.c_gamma(
+ content_features), self.c_beta(content_features)
+ style_gamma, style_beta = self.s_gamma(style_features), self.s_beta(
+ style_features)
- w_gamma_, w_beta_ = self.w_gamma.expand([x.shape[0], -1]), self.w_beta.expand([x.shape[0], -1])
+ w_gamma_, w_beta_ = self.w_gamma.expand(
+ [x.shape[0], -1]), self.w_beta.expand([x.shape[0], -1])
soft_gamma = (1. - w_gamma_) * style_gamma + w_gamma_ * content_gamma
soft_beta = (1. - w_beta_) * style_beta + w_beta_ * content_beta
@@ -345,16 +380,25 @@ class AdaLIN(nn.Layer):
def __init__(self, num_features, eps=1e-5):
super(AdaLIN, self).__init__()
self.eps = eps
- self.rho = self.create_parameter([1, num_features, 1, 1], default_initializer=nn.initializer.Constant(0.9))
+ self.rho = self.create_parameter(
+ [1, num_features, 1, 1],
+ default_initializer=nn.initializer.Constant(0.9))
def forward(self, x, gamma, beta):
- in_mean, in_var = paddle.mean(x, axis=[2, 3], keepdim=True), paddle.var(x, axis=[2, 3], keepdim=True)
+ in_mean, in_var = paddle.mean(x, axis=[2, 3],
+ keepdim=True), paddle.var(x,
+ axis=[2, 3],
+ keepdim=True)
out_in = (x - in_mean) / paddle.sqrt(in_var + self.eps)
- ln_mean, ln_var = paddle.mean(x, axis=[1, 2, 3], keepdim=True), paddle.var(x, axis=[1, 2, 3], keepdim=True)
+ ln_mean, ln_var = paddle.mean(x, axis=[1, 2, 3],
+ keepdim=True), paddle.var(x,
+ axis=[1, 2, 3],
+ keepdim=True)
out_ln = (x - ln_mean) / paddle.sqrt(ln_var + self.eps)
out = self.rho.expand([x.shape[0], -1, -1, -1]) * out_in + \
(1-self.rho.expand([x.shape[0], -1, -1, -1])) * out_ln
- out = out * gamma.unsqueeze(2).unsqueeze(3) + beta.unsqueeze(2).unsqueeze(3)
+ out = out * gamma.unsqueeze(2).unsqueeze(3) + beta.unsqueeze(
+ 2).unsqueeze(3)
return out
@@ -363,17 +407,31 @@ class LIN(nn.Layer):
def __init__(self, num_features, eps=1e-5):
super(LIN, self).__init__()
self.eps = eps
- self.rho = self.create_parameter([1, num_features, 1, 1], default_initializer=nn.initializer.Constant(0.))
- self.gamma = self.create_parameter([1, num_features, 1, 1], default_initializer=nn.initializer.Constant(1.))
- self.beta = self.create_parameter([1, num_features, 1, 1], default_initializer=nn.initializer.Constant(0.))
+ self.rho = self.create_parameter(
+ [1, num_features, 1, 1],
+ default_initializer=nn.initializer.Constant(0.))
+ self.gamma = self.create_parameter(
+ [1, num_features, 1, 1],
+ default_initializer=nn.initializer.Constant(1.))
+ self.beta = self.create_parameter(
+ [1, num_features, 1, 1],
+ default_initializer=nn.initializer.Constant(0.))
def forward(self, x):
- in_mean, in_var = paddle.mean(x, axis=[2, 3], keepdim=True), paddle.var(x, axis=[2, 3], keepdim=True)
+ in_mean, in_var = paddle.mean(x, axis=[2, 3],
+ keepdim=True), paddle.var(x,
+ axis=[2, 3],
+ keepdim=True)
out_in = (x - in_mean) / paddle.sqrt(in_var + self.eps)
- ln_mean, ln_var = paddle.mean(x, axis=[1, 2, 3], keepdim=True), paddle.var(x, axis=[1, 2, 3], keepdim=True)
+ ln_mean, ln_var = paddle.mean(x, axis=[1, 2, 3],
+ keepdim=True), paddle.var(x,
+ axis=[1, 2, 3],
+ keepdim=True)
out_ln = (x - ln_mean) / paddle.sqrt(ln_var + self.eps)
out = self.rho.expand([x.shape[0], -1, -1, -1]) * out_in + \
(1-self.rho.expand([x.shape[0], -1, -1, -1])) * out_ln
- out = out * self.gamma.expand([x.shape[0], -1, -1, -1]) + self.beta.expand([x.shape[0], -1, -1, -1])
+ out = out * self.gamma.expand([x.shape[0], -1, -1, -1
+ ]) + self.beta.expand(
+ [x.shape[0], -1, -1, -1])
return out
diff --git a/ppgan/models/generators/rrdb_net.py b/ppgan/models/generators/rrdb_net.py
index 7381e149a8e104acab6fb392f7ea12fef18715a6..c0d5f73a71a56638803f3f2a70dfcf267529b12e 100644
--- a/ppgan/models/generators/rrdb_net.py
+++ b/ppgan/models/generators/rrdb_net.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was based on https://github.com/xinntao/ESRGAN
+
import functools
import paddle
import paddle.nn as nn
@@ -20,6 +22,26 @@ import paddle.nn.functional as F
from .builder import GENERATORS
+def pixel_unshuffle(x, scale):
+ """ Pixel unshuffle function.
+
+ Args:
+ x (paddle.Tensor): Input feature.
+ scale (int): Downsample ratio.
+
+ Returns:
+ paddle.Tensor: the pixel unshuffled feature.
+ """
+ b, c, h, w = x.shape
+ out_channel = c * (scale**2)
+ assert h % scale == 0 and w % scale == 0
+ hh = h // scale
+ ww = w // scale
+ x_reshaped = x.reshape([b, c, hh, scale, ww, scale])
+ return x_reshaped.transpose([0, 1, 3, 5, 2,
+ 4]).reshape([b, out_channel, hh, ww])
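+
+# Shape sketch: pixel_unshuffle trades spatial resolution for channels, e.g.
+#     x = paddle.rand([1, 3, 64, 64])
+#     pixel_unshuffle(x, scale=2).shape  # [1, 12, 32, 32]
+#     pixel_unshuffle(x, scale=4).shape  # [1, 48, 16, 16]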
+
+
class ResidualDenseBlock_5C(nn.Layer):
def __init__(self, nf=64, gc=32, bias=True):
super(ResidualDenseBlock_5C, self).__init__()
@@ -64,13 +86,21 @@ def make_layer(block, n_layers):
@GENERATORS.register()
class RRDBNet(nn.Layer):
- def __init__(self, in_nc, out_nc, nf, nb, gc=32):
+ def __init__(self, in_nc, out_nc, nf, nb, gc=32, scale=4):
super(RRDBNet, self).__init__()
+
+ self.scale = scale
+ if scale == 2:
+ in_nc = in_nc * 4
+ elif scale == 1:
+ in_nc = in_nc * 16
+
RRDB_block_f = functools.partial(RRDB, nf=nf, gc=gc)
self.conv_first = nn.Conv2D(in_nc, nf, 3, 1, 1, bias_attr=True)
self.RRDB_trunk = make_layer(RRDB_block_f, nb)
self.trunk_conv = nn.Conv2D(nf, nf, 3, 1, 1, bias_attr=True)
+
#### upsampling
self.upconv1 = nn.Conv2D(nf, nf, 3, 1, 1, bias_attr=True)
self.upconv2 = nn.Conv2D(nf, nf, 3, 1, 1, bias_attr=True)
@@ -80,7 +110,14 @@ class RRDBNet(nn.Layer):
self.lrelu = nn.LeakyReLU(negative_slope=0.2)
def forward(self, x):
- fea = self.conv_first(x)
+ if self.scale == 2:
+ fea = pixel_unshuffle(x, scale=2)
+ elif self.scale == 1:
+ fea = pixel_unshuffle(x, scale=4)
+ else:
+ fea = x
+
+ fea = self.conv_first(fea)
trunk = self.trunk_conv(self.RRDB_trunk(fea))
fea = fea + trunk
diff --git a/ppgan/models/generators/stylegan2_clean_arch.py b/ppgan/models/generators/stylegan2_clean_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..50f66d9f6ed0e4be81a55e72df360c578de999b8
--- /dev/null
+++ b/ppgan/models/generators/stylegan2_clean_arch.py
@@ -0,0 +1,396 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import random
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+
+class NormStyleCode(nn.Layer):
+ def forward(self, x):
+ """Normalize the style codes.
+
+ Args:
+ x (Tensor): Style codes with shape (b, c).
+
+ Returns:
+ Tensor: Normalized tensor.
+ """
+        return x * paddle.rsqrt(
+            paddle.mean(x**2, axis=1, keepdim=True) + 1e-08)
+
+
+class ModulatedConv2d(nn.Layer):
+ """Modulated Conv2d used in StyleGAN2.
+
+ There is no bias in ModulatedConv2d.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ kernel_size (int): Size of the convolving kernel.
+ num_style_feat (int): Channel number of style features.
+ demodulate (bool): Whether to demodulate in the conv layer. Default: True.
+ sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None.
+ eps (float): A value added to the denominator for numerical stability. Default: 1e-8.
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ num_style_feat,
+ demodulate=True,
+ sample_mode=None,
+ eps=1e-08):
+ super(ModulatedConv2d, self).__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.kernel_size = kernel_size
+ self.demodulate = demodulate
+ self.sample_mode = sample_mode
+ self.eps = eps
+ self.modulation = nn.Linear(num_style_feat, in_channels, bias_attr=True)
+ # default_init_weights(self.modulation, scale=1, bias_fill=1, a=0,
+ # mode='fan_in', nonlinearity='linear')
+        x = paddle.randn(
+            shape=[1, out_channels, in_channels, kernel_size, kernel_size],
+            dtype='float32') / math.sqrt(in_channels * kernel_size**2)
+
+ self.weight = paddle.create_parameter(
+ shape=x.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(x))
+ self.weight.stop_gradient = False
+ self.padding = kernel_size // 2
+
+ def forward(self, x, style):
+ """Forward function.
+
+ Args:
+ x (Tensor): Tensor with shape (b, c, h, w).
+ style (Tensor): Tensor with shape (b, num_style_feat).
+
+ Returns:
+ Tensor: Modulated tensor after convolution.
+ """
+ b, c, h, w = x.shape
+ style = self.modulation(style).reshape([b, 1, c, 1, 1])
+ weight = self.weight * style
+ if self.demodulate:
+ demod = paddle.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps)
+ weight = weight * demod.reshape([b, self.out_channels, 1, 1, 1])
+ weight = weight.reshape(
+ [b * self.out_channels, c, self.kernel_size, self.kernel_size])
+ if self.sample_mode == 'upsample':
+ x = F.interpolate(x,
+ scale_factor=2,
+ mode='bilinear',
+ align_corners=False)
+ elif self.sample_mode == 'downsample':
+ x = F.interpolate(x,
+ scale_factor=0.5,
+ mode='bilinear',
+ align_corners=False)
+ b, c, h, w = x.shape
+ x = x.reshape([1, b * c, h, w])
+ out = paddle.nn.functional.conv2d(x,
+ weight,
+ padding=self.padding,
+ groups=b)
+ out = out.reshape([b, self.out_channels, *out.shape[2:4]])
+ return out
+
+    def __repr__(self):
+        return (f'{self.__class__.__name__}('
+                f'in_channels={self.in_channels}, '
+                f'out_channels={self.out_channels}, '
+                f'kernel_size={self.kernel_size}, '
+                f'demodulate={self.demodulate}, '
+                f'sample_mode={self.sample_mode})')
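+
+# Sketch of the modulation math above: for each sample b the base kernel W is
+# scaled per input channel by the style vector s_b (w' = W * s_b); with
+# demodulate=True every output filter is then rescaled so the sum of w'^2 over
+# (c_in, k, k) is ~1, and the per-sample kernels are applied in one grouped
+# conv2d with groups=b.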
+
+
+class StyleConv(nn.Layer):
+ """Style conv used in StyleGAN2.
+
+ Args:
+ in_channels (int): Channel number of the input.
+ out_channels (int): Channel number of the output.
+ kernel_size (int): Size of the convolving kernel.
+ num_style_feat (int): Channel number of style features.
+ demodulate (bool): Whether demodulate in the conv layer. Default: True.
+ sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None.
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ num_style_feat,
+ demodulate=True,
+ sample_mode=None):
+ super(StyleConv, self).__init__()
+ self.modulated_conv = ModulatedConv2d(in_channels,
+ out_channels,
+ kernel_size,
+ num_style_feat,
+ demodulate=demodulate,
+ sample_mode=sample_mode)
+
+ x = paddle.zeros([1], dtype="float32")
+ self.weight = paddle.create_parameter(
+ x.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(
+ x)) # for noise injection
+ x = paddle.zeros([1, out_channels, 1, 1], dtype="float32")
+ self.bias = paddle.create_parameter(
+ x.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(x))
+ self.activate = nn.LeakyReLU(negative_slope=0.2)
+
+ def forward(self, x, style, noise=None):
+ out = self.modulated_conv(x, style) * 2**0.5
+ if noise is None:
+ b, _, h, w = out.shape
+ noise = paddle.normal(shape=[b, 1, h, w])
+ out = out + self.weight * noise
+ out = out + self.bias
+ out = self.activate(out)
+ return out
+
+
+class ToRGB(nn.Layer):
+ """To RGB (image space) from features.
+
+ Args:
+ in_channels (int): Channel number of input.
+ num_style_feat (int): Channel number of style features.
+ upsample (bool): Whether to upsample. Default: True.
+ """
+ def __init__(self, in_channels, num_style_feat, upsample=True):
+ super(ToRGB, self).__init__()
+ self.upsample = upsample
+ self.modulated_conv = ModulatedConv2d(in_channels,
+ 3,
+ kernel_size=1,
+ num_style_feat=num_style_feat,
+ demodulate=False,
+ sample_mode=None)
+ x = paddle.zeros(shape=[1, 3, 1, 1], dtype='float32')
+ self.bias = paddle.create_parameter(
+ shape=x.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(x))
+ self.bias.stop_gradient = False
+
+ def forward(self, x, style, skip=None):
+ """Forward function.
+
+ Args:
+ x (Tensor): Feature tensor with shape (b, c, h, w).
+ style (Tensor): Tensor with shape (b, num_style_feat).
+ skip (Tensor): Base/skip tensor. Default: None.
+
+ Returns:
+ Tensor: RGB images.
+ """
+ out = self.modulated_conv(x, style)
+ out = out + self.bias
+ if skip is not None:
+ if self.upsample:
+ skip = F.interpolate(skip,
+ scale_factor=2,
+ mode='bilinear',
+ align_corners=False)
+ out = out + skip
+ return out
+
+
+class ConstantInput(nn.Layer):
+ """Constant input.
+
+ Args:
+ num_channel (int): Channel number of constant input.
+ size (int): Spatial size of constant input.
+ """
+ def __init__(self, num_channel, size):
+ super(ConstantInput, self).__init__()
+ x = paddle.randn(shape=[1, num_channel, size, size], dtype='float32')
+ self.weight = paddle.create_parameter(
+ shape=x.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(x))
+ self.weight.stop_gradient = False
+
+ def forward(self, batch):
+ out = paddle.tile(self.weight, repeat_times=[batch, 1, 1, 1])
+ return out
+
+
+class StyleGAN2GeneratorClean(nn.Layer):
+ """Clean version of StyleGAN2 Generator.
+
+ Args:
+ out_size (int): The spatial size of outputs.
+ num_style_feat (int): Channel number of style features. Default: 512.
+ num_mlp (int): Layer number of MLP style layers. Default: 8.
+ channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2.
+ narrow (float): Narrow ratio for channels. Default: 1.0.
+ """
+ def __init__(self,
+ out_size,
+ num_style_feat=512,
+ num_mlp=8,
+ channel_multiplier=2,
+ narrow=1):
+ super(StyleGAN2GeneratorClean, self).__init__()
+ self.num_style_feat = num_style_feat
+ style_mlp_layers = [NormStyleCode()]
+ for i in range(num_mlp):
+ style_mlp_layers.extend([
+ nn.Linear(num_style_feat, num_style_feat, bias_attr=True),
+ nn.LeakyReLU(negative_slope=0.2)
+ ])
+ self.style_mlp = nn.Sequential(*style_mlp_layers)
+ # default_init_weights(self.style_mlp, scale=1, bias_fill=0, a=0.2,
+ # mode='fan_in', nonlinearity='leaky_relu')
+ channels = {
+ '4': int(512 * narrow),
+ '8': int(512 * narrow),
+ '16': int(512 * narrow),
+ '32': int(512 * narrow),
+ '64': int(256 * channel_multiplier * narrow),
+ '128': int(128 * channel_multiplier * narrow),
+ '256': int(64 * channel_multiplier * narrow),
+ '512': int(32 * channel_multiplier * narrow),
+ '1024': int(16 * channel_multiplier * narrow)
+ }
+ self.channels = channels
+ self.constant_input = ConstantInput(channels['4'], size=4)
+ self.style_conv1 = StyleConv(channels['4'],
+ channels['4'],
+ kernel_size=3,
+ num_style_feat=num_style_feat,
+ demodulate=True,
+ sample_mode=None)
+ self.to_rgb1 = ToRGB(channels['4'], num_style_feat, upsample=False)
+ self.log_size = int(math.log(out_size, 2))
+ self.num_layers = (self.log_size - 2) * 2 + 1
+ self.num_latent = self.log_size * 2 - 2
+ self.style_convs = nn.LayerList()
+ self.to_rgbs = nn.LayerList()
+ self.noises = nn.Layer()
+ in_channels = channels['4']
+ for layer_idx in range(self.num_layers):
+ resolution = 2**((layer_idx + 5) // 2)
+ shape = [1, 1, resolution, resolution]
+ self.noises.register_buffer(f'noise{layer_idx}',
+ paddle.randn(shape=shape))
+ for i in range(3, self.log_size + 1):
+ out_channels = channels[f'{2 ** i}']
+            self.style_convs.append(
+                StyleConv(in_channels, out_channels, kernel_size=3,
+                          num_style_feat=num_style_feat, demodulate=True,
+                          sample_mode='upsample'))
+            self.style_convs.append(
+                StyleConv(out_channels, out_channels, kernel_size=3,
+                          num_style_feat=num_style_feat, demodulate=True,
+                          sample_mode=None))
+ self.to_rgbs.append(
+ ToRGB(out_channels, num_style_feat, upsample=True))
+ in_channels = out_channels
+
+ def make_noise(self):
+ """Make noise for noise injection."""
+ noises = [paddle.randn(shape=[1, 1, 4, 4])]
+ for i in range(3, self.log_size + 1):
+ for _ in range(2):
+ noises.append(paddle.randn(shape=[1, 1, 2**i, 2**i]))
+ return noises
+
+ def get_latent(self, x):
+ return self.style_mlp(x)
+
+ def mean_latent(self, num_latent):
+ latent_in = paddle.randn(shape=[num_latent, self.num_style_feat])
+ latent = self.style_mlp(latent_in).mean(0, keepdim=True)
+ return latent
+
+ def forward(self,
+ styles,
+ input_is_latent=False,
+ noise=None,
+ randomize_noise=True,
+ truncation=1,
+ truncation_latent=None,
+ inject_index=None,
+ return_latents=False):
+ """Forward function for StyleGAN2GeneratorClean.
+
+ Args:
+ styles (list[Tensor]): Sample codes of styles.
+ input_is_latent (bool): Whether input is latent style. Default: False.
+ noise (Tensor | None): Input noise or None. Default: None.
+ randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True.
+ truncation (float): The truncation ratio. Default: 1.
+ truncation_latent (Tensor | None): The truncation latent tensor. Default: None.
+ inject_index (int | None): The injection index for mixing noise. Default: None.
+ return_latents (bool): Whether to return style latents. Default: False.
+ """
+ if not input_is_latent:
+ styles = [self.style_mlp(s) for s in styles]
+ if noise is None:
+ if randomize_noise:
+ noise = [None] * self.num_layers
+ else:
+ noise = [
+ getattr(self.noises, f'noise{i}')
+ for i in range(self.num_layers)
+ ]
+ if truncation < 1:
+ style_truncation = []
+ for style in styles:
+ style_truncation.append(truncation_latent + truncation *
+ (style - truncation_latent))
+ styles = style_truncation
+ if len(styles) == 1:
+ inject_index = self.num_latent
+ if styles[0].ndim < 3:
+                latent = paddle.tile(styles[0].unsqueeze(1),
+                                     repeat_times=[1, inject_index, 1])
+ else:
+ latent = styles[0]
+ elif len(styles) == 2:
+ if inject_index is None:
+ inject_index = random.randint(1, self.num_latent - 1)
+            latent1 = paddle.tile(styles[0].unsqueeze(1),
+                                  repeat_times=[1, inject_index, 1])
+            latent2 = paddle.tile(
+                styles[1].unsqueeze(1),
+                repeat_times=[1, self.num_latent - inject_index, 1])
+ latent = paddle.concat([latent1, latent2], axis=1)
+ out = self.constant_input(latent.shape[0])
+ out = self.style_conv1(out, latent[:, 0], noise=noise[0])
+ skip = self.to_rgb1(out, latent[:, 1])
+ i = 1
+ for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2],
+ self.style_convs[1::2],
+ noise[1::2],
+ noise[2::2],
+ self.to_rgbs):
+ out = conv1(out, latent[:, i], noise=noise1)
+ out = conv2(out, latent[:, i + 1], noise=noise2)
+ skip = to_rgb(out, latent[:, i + 2], skip)
+ i += 2
+ image = skip
+ if return_latents:
+ return image, latent
+ else:
+ return image, None
diff --git a/ppgan/models/generators/swinir.py b/ppgan/models/generators/swinir.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6c553a0baba270869f40170222d2758bc37b56c
--- /dev/null
+++ b/ppgan/models/generators/swinir.py
@@ -0,0 +1,1060 @@
+# code was heavily based on https://github.com/cszn/KAIR
+# MIT License
+# Copyright (c) 2019 Kai Zhang
+"""
+DropPath, reimplemented from https://github.com/yueatsprograms/Stochastic_Depth
+"""
+from itertools import repeat
+import collections.abc
+import math
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from .builder import GENERATORS
+
+
+def _ntuple(n):
+
+ def parse(x):
+ if isinstance(x, collections.abc.Iterable):
+ return x
+ return tuple(repeat(x, n))
+
+ return parse
+
+
+class DropPath(nn.Layer):
+ """DropPath class"""
+
+ def __init__(self, drop_prob=None):
+ super(DropPath, self).__init__()
+ self.drop_prob = drop_prob
+
+ def drop_path(self, inputs):
+ """drop path op
+ Args:
+            inputs: tensor with arbitrary shape; the drop probability is taken
+                from self.drop_prob and dropping is applied only in training
+                mode
+ Returns:
+ output: output tensor after drop path
+ """
+ # if prob is 0 or eval mode, return original input
+ if self.drop_prob == 0. or not self.training:
+ return inputs
+ keep_prob = 1 - self.drop_prob
+ keep_prob = paddle.to_tensor(keep_prob, dtype='float32')
+ shape = (
+ inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1)
+ random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype)
+ random_tensor = random_tensor.floor() # mask
+ output = inputs.divide(
+ keep_prob
+ ) * random_tensor # divide is to keep same output expectation
+ return output
+
+ def forward(self, inputs):
+ return self.drop_path(inputs)
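+
+# Behaviour sketch: with drop_prob=0 or in eval mode DropPath is the identity;
+# in training mode each sample in the batch is zeroed with probability
+# drop_prob and the survivors are scaled by 1 / (1 - drop_prob), keeping the
+# expected output unchanged.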
+
+
+to_2tuple = _ntuple(2)
+
+
+@paddle.jit.not_to_static
+def swapdim(x, dim1, dim2):
+ a = list(range(len(x.shape)))
+ a[dim1], a[dim2] = a[dim2], a[dim1]
+ return x.transpose(a)
+
+
+class Identity(nn.Layer):
+ """ Identity layer
+ The output of this layer is the input without any change.
+    Use this layer to avoid an if condition in some forward methods
+ """
+
+ def __init__(self):
+ super(Identity, self).__init__()
+
+ def forward(self, x):
+ return x
+
+
+class Mlp(nn.Layer):
+
+ def __init__(self, in_features, hidden_features, dropout):
+ super(Mlp, self).__init__()
+ w_attr_1, b_attr_1 = self._init_weights()
+ self.fc1 = nn.Linear(in_features,
+ hidden_features,
+ weight_attr=w_attr_1,
+ bias_attr=b_attr_1)
+
+ w_attr_2, b_attr_2 = self._init_weights()
+ self.fc2 = nn.Linear(hidden_features,
+ in_features,
+ weight_attr=w_attr_2,
+ bias_attr=b_attr_2)
+ self.act = nn.GELU()
+ self.dropout = nn.Dropout(dropout)
+
+ def _init_weights(self):
+ weight_attr = paddle.ParamAttr(
+ initializer=paddle.nn.initializer.XavierUniform())
+ bias_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Normal(
+ std=1e-6))
+ return weight_attr, bias_attr
+
+ def forward(self, x):
+ x = self.fc1(x)
+ x = self.act(x)
+ x = self.dropout(x)
+ x = self.fc2(x)
+ x = self.dropout(x)
+ return x
+
+
+class WindowAttention(nn.Layer):
+ """Window based multihead attention, with relative position bias.
+ Both shifted window and non-shifted window are supported.
+ Args:
+ dim (int): input dimension (channels)
+ window_size (int): height and width of the window
+ num_heads (int): number of attention heads
+ qkv_bias (bool): if True, enable learnable bias to q,k,v, default: True
+ qk_scale (float): override default qk scale head_dim**-0.5 if set, default: None
+ attention_dropout (float): dropout of attention
+ dropout (float): dropout for output
+ """
+
+ def __init__(self,
+ dim,
+ window_size,
+ num_heads,
+ qkv_bias=True,
+ qk_scale=None,
+ attention_dropout=0.,
+ dropout=0.):
+ super(WindowAttention, self).__init__()
+ self.window_size = window_size
+ self.num_heads = num_heads
+ self.dim = dim
+ self.dim_head = dim // num_heads
+ self.scale = qk_scale or self.dim_head**-0.5
+
+ self.relative_position_bias_table = paddle.create_parameter(
+ shape=[(2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+ num_heads],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02))
+
+ weight_attr, bias_attr = self._init_weights()
+
+ # relative position index for each token inside window
+ coords_h = paddle.arange(self.window_size[0])
+ coords_w = paddle.arange(self.window_size[1])
+ coords = paddle.stack(paddle.meshgrid([coords_h, coords_w
+ ])) # [2, window_h, window_w]
+ coords_flatten = paddle.flatten(coords, 1) # [2, window_h * window_w]
+        # 2, window_h * window_w, window_h * window_w
+ relative_coords = coords_flatten.unsqueeze(
+ 2) - coords_flatten.unsqueeze(1)
+        # window_h*window_w, window_h*window_w, 2
+ relative_coords = relative_coords.transpose([1, 2, 0])
+ relative_coords[:, :, 0] += self.window_size[0] - 1
+ relative_coords[:, :, 1] += self.window_size[1] - 1
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+ # [window_size * window_size, window_size*window_size]
+ relative_position_index = relative_coords.sum(-1)
+ self.register_buffer("relative_position_index", relative_position_index)
+
+ self.qkv = nn.Linear(dim,
+ dim * 3,
+ weight_attr=weight_attr,
+ bias_attr=bias_attr if qkv_bias else False)
+ self.attn_dropout = nn.Dropout(attention_dropout)
+ self.proj = nn.Linear(dim,
+ dim,
+ weight_attr=weight_attr,
+ bias_attr=bias_attr)
+ self.proj_dropout = nn.Dropout(dropout)
+ self.softmax = nn.Softmax(axis=-1)
+
+ def transpose_multihead(self, x):
+ tensor_shape = list(x.shape[:-1])
+ new_shape = tensor_shape + [self.num_heads, self.dim_head]
+ x = x.reshape(new_shape)
+ x = x.transpose([0, 2, 1, 3])
+ return x
+
+ def get_relative_pos_bias_from_pos_index(self):
+ # relative_position_bias_table is a ParamBase object
+ # https://github.com/PaddlePaddle/Paddle/blob/067f558c59b34dd6d8626aad73e9943cf7f5960f/python/paddle/fluid/framework.py#L5727
+ table = self.relative_position_bias_table # N x num_heads
+ # index is a tensor
+ index = self.relative_position_index.reshape(
+ [-1]) # window_h*window_w * window_h*window_w
+ # NOTE: paddle does NOT support indexing Tensor by a Tensor
+ relative_position_bias = paddle.index_select(x=table, index=index)
+ return relative_position_bias
+
+ def forward(self, x, mask=None):
+ qkv = self.qkv(x).chunk(3, axis=-1)
+ q, k, v = map(self.transpose_multihead, qkv)
+ q = q * self.scale
+ attn = paddle.matmul(q, k, transpose_y=True)
+ relative_position_bias = self.get_relative_pos_bias_from_pos_index()
+ relative_position_bias = relative_position_bias.reshape([
+ self.window_size[0] * self.window_size[1],
+ self.window_size[0] * self.window_size[1], -1
+ ])
+ # nH, window_h*window_w, window_h*window_w
+ relative_position_bias = relative_position_bias.transpose([2, 0, 1])
+ attn = attn + relative_position_bias.unsqueeze(0)
+ if mask is not None:
+ nW = mask.shape[0]
+ attn = attn.reshape(
+ [x.shape[0] // nW, nW, self.num_heads, x.shape[1], x.shape[1]])
+ attn += mask.unsqueeze(1).unsqueeze(0)
+ attn = attn.reshape([-1, self.num_heads, x.shape[1], x.shape[1]])
+ attn = self.softmax(attn)
+ else:
+ attn = self.softmax(attn)
+
+ attn = self.attn_dropout(attn)
+
+ z = paddle.matmul(attn, v)
+ z = z.transpose([0, 2, 1, 3])
+ tensor_shape = list(z.shape[:-2])
+ new_shape = tensor_shape + [self.dim]
+ z = z.reshape(new_shape)
+ z = self.proj(z)
+ z = self.proj_dropout(z)
+
+ return z
+
+ def _init_weights(self):
+ weight_attr = paddle.ParamAttr(
+ initializer=nn.initializer.TruncatedNormal(std=.02))
+ bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0))
+ return weight_attr, bias_attr
+
+ def extra_repr(self) -> str:
+ return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
+
+ def flops(self, N):
+ # calculate flops for 1 window with token length of N
+ flops = 0
+ flops += N * self.dim * 3 * self.dim
+ flops += self.num_heads * N * (self.dim // self.num_heads) * N
+ flops += self.num_heads * N * N * (self.dim // self.num_heads)
+ flops += N * self.dim * self.dim
+ return flops
+
+
+def windows_partition(x, window_size):
+ """
+ Args:
+ x: (B, H, W, C)
+ window_size (int): window size
+
+ Returns:
+ windows: (num_windows*B, window_size, window_size, C)
+ """
+ B, H, W, C = x.shape
+ x = x.reshape(
+ [B, H // window_size, window_size, W // window_size, window_size, C])
+ windows = x.transpose([0, 1, 3, 2, 4,
+ 5]).reshape([-1, window_size, window_size, C])
+ return windows
+
+
+def windows_reverse(windows, window_size, H, W):
+ """
+ Args:
+ windows: (num_windows*B, window_size, window_size, C)
+ window_size (int): Window size
+ H (int): Height of image
+ W (int): Width of image
+
+ Returns:
+ x: (B, H, W, C)
+ """
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
+ x = windows.reshape(
+ [B, H // window_size, W // window_size, window_size, window_size, -1])
+ x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, H, W, -1])
+ return x
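+
+# Round-trip sketch (assumes H and W are multiples of window_size):
+#     x = paddle.rand([2, 8, 8, 32])
+#     w = windows_partition(x, 4)      # -> [8, 4, 4, 32]
+#     y = windows_reverse(w, 4, 8, 8)  # -> [2, 8, 8, 32], equal to x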
+
+
+class SwinTransformerBlock(nn.Layer):
+ """Swin transformer block
+    Contains window multi-head self-attention, droppath, mlp, norm and residual.
+    Attributes:
+        dim: int, input dimension (channels)
+        input_resolution: tuple[int], input resolution
+        num_heads: int, number of attention heads
+        window_size: int, window size, default: 7
+ shift_size: int, shift size for SW-MSA, default: 0
+ mlp_ratio: float, ratio of mlp hidden dim and input embedding dim, default: 4.
+ qkv_bias: bool, if True, enable learnable bias to q,k,v, default: True
+ qk_scale: float, override default qk scale head_dim**-0.5 if set, default: None
+ dropout: float, dropout for output, default: 0.
+ attention_dropout: float, dropout of attention, default: 0.
+ droppath: float, drop path rate, default: 0.
+ """
+
+ def __init__(self,
+ dim,
+ input_resolution,
+ num_heads,
+ window_size=7,
+ shift_size=0,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ qk_scale=None,
+ dropout=0.,
+ attention_dropout=0.,
+ droppath=0.):
+ super(SwinTransformerBlock, self).__init__()
+ self.dim = dim
+ self.input_resolution = input_resolution
+ self.num_heads = num_heads
+ self.window_size = window_size
+ self.shift_size = shift_size
+ self.mlp_ratio = mlp_ratio
+ if min(self.input_resolution) <= self.window_size:
+ self.shift_size = 0
+ self.window_size = min(self.input_resolution)
+
+ self.norm1 = nn.LayerNorm(dim)
+ self.attn = WindowAttention(dim,
+ window_size=to_2tuple(self.window_size),
+ num_heads=num_heads,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ attention_dropout=attention_dropout,
+ dropout=dropout)
+ self.drop_path = DropPath(droppath) if droppath > 0. else Identity()
+ self.norm2 = nn.LayerNorm(dim)
+ self.mlp = Mlp(in_features=dim,
+ hidden_features=int(dim * mlp_ratio),
+ dropout=dropout)
+
+ attn_mask = self.calculate_mask(self.input_resolution)
+
+ self.register_buffer("attn_mask", attn_mask)
+
+ def calculate_mask(self, x_size):
+ if self.shift_size > 0:
+ # calculate attention mask for SW-MSA
+ H, W = x_size
+ img_mask = paddle.zeros((1, H, W, 1))
+ h_slices = (slice(0, -self.window_size),
+ slice(-self.window_size,
+ -self.shift_size), slice(-self.shift_size, None))
+ w_slices = (slice(0, -self.window_size),
+ slice(-self.window_size,
+ -self.shift_size), slice(-self.shift_size, None))
+ cnt = 0
+
+ for h in h_slices:
+ for w in w_slices:
+ img_mask[:, h, w, :] = cnt
+ cnt += 1
+
+ mask_windows = windows_partition(
+ img_mask, self.window_size) # nW, window_size, window_size, 1
+ mask_windows = mask_windows.reshape(
+ [-1, self.window_size * self.window_size])
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+
+ huns = -100.0 * paddle.ones_like(attn_mask)
+ attn_mask = huns * (attn_mask != 0).astype("float32")
+
+ return attn_mask
+ else:
+ return None
+
+ def forward(self, x, x_size):
+ H, W = x_size
+ B, L, C = x.shape
+
+ shortcut = x
+ x = self.norm1(x)
+ x = x.reshape([B, H, W, C])
+
+ # cyclic shift
+ if self.shift_size > 0:
+ shifted_x = paddle.roll(x,
+ shifts=(-self.shift_size, -self.shift_size),
+ axis=(1, 2))
+ else:
+ shifted_x = x
+
+ # partition windows
+ x_windows = windows_partition(shifted_x, self.window_size)
+ x_windows = x_windows.reshape(
+ [-1, self.window_size * self.window_size, C])
+
+ # W-MSA/SW-MSA (recompute the attention mask when the test image size differs from the training resolution)
+
+ if self.input_resolution == x_size:
+ attn_windows = self.attn(x_windows, mask=self.attn_mask)
+ else:
+ attn_windows = self.attn(x_windows,
+ mask=self.calculate_mask(x_size))
+
+ # merge windows
+ attn_windows = attn_windows.reshape(
+ [-1, self.window_size, self.window_size, C])
+ shifted_x = windows_reverse(attn_windows, self.window_size, H, W)
+
+ # reverse cyclic shift
+ if self.shift_size > 0:
+ x = paddle.roll(shifted_x,
+ shifts=(self.shift_size, self.shift_size),
+ axis=(1, 2))
+ else:
+ x = shifted_x
+
+ x = x.reshape([B, H * W, C])
+
+ # FFN
+ x = shortcut + self.drop_path(x)
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+ return x
+
+ def extra_repr(self) -> str:
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
+
+ def flops(self):
+ flops = 0
+ H, W = self.input_resolution
+ # norm1
+ flops += self.dim * H * W
+ # W-MSA/SW-MSA
+ nW = H * W / self.window_size / self.window_size
+ flops += nW * self.attn.flops(self.window_size * self.window_size)
+ # mlp
+ flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
+ # norm2
+ flops += self.dim * H * W
+ return flops
+
+
+class PatchMerging(nn.Layer):
+ """ Patch Merging class
+ Merge multiple patch into one path and keep the out dim.
+ Spefically, merge adjacent 2x2 patches(dim=C) into 1 patch.
+ The concat dim 4*C is rescaled to 2*C
+ Args:
+ input_resolution (tuple | ints): the size of input
+ dim: dimension of single patch
+ reduction: nn.Linear which maps 4C to 2C dim
+ norm: nn.LayerNorm, applied after linear layer.
+ """
+
+ def __init__(self, input_resolution, dim):
+ super(PatchMerging, self).__init__()
+ self.input_resolution = input_resolution
+ self.dim = dim
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
+ self.norm = nn.LayerNorm(4 * dim)
+
+ def forward(self, x):
+ h, w = self.input_resolution
+ b, _, c = x.shape
+ x = x.reshape([b, h, w, c])
+
+ x0 = x[:, 0::2, 0::2, :] # [B, H/2, W/2, C]
+ x1 = x[:, 1::2, 0::2, :] # [B, H/2, W/2, C]
+ x2 = x[:, 0::2, 1::2, :] # [B, H/2, W/2, C]
+ x3 = x[:, 1::2, 1::2, :] # [B, H/2, W/2, C]
+ x = paddle.concat([x0, x1, x2, x3], -1) #[B, H/2, W/2, 4*C]
+ x = x.reshape([b, -1, 4 * c]) # [B, H/2*W/2, 4*C]
+
+ x = self.norm(x)
+ x = self.reduction(x)
+
+ return x
+
+ def extra_repr(self) -> str:
+ return f"input_resolution={self.input_resolution}, dim={self.dim}"
+
+ def flops(self):
+ H, W = self.input_resolution
+ flops = H * W * self.dim
+ flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
+ return flops
+
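+# Shape sketch (editor's note): with input_resolution=(56, 56) and dim=96,
+# a token map x of shape (B, 56*56, 96) is regrouped into four (B, 28, 28, 96)
+# sub-grids, concatenated to (B, 28*28, 384), normalized, and reduced by the
+# linear layer to (B, 784, 192), i.e. spatial size halves and channels double.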
+
+class BasicLayer(nn.Layer):
+ """ A basic Swin Transformer layer for one stage.
+
+ Args:
+ dim (int): Number of input channels.
+ input_resolution (tuple[int]): Input resolution.
+ depth (int): Number of blocks.
+ num_heads (int): Number of attention heads.
+ window_size (int): Local window size.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+ dropout (float, optional): Dropout rate. Default: 0.0
+ attention_dropout (float, optional): Attention dropout rate. Default: 0.0
+ droppath (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+ downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+ """
+
+ def __init__(self,
+ dim,
+ input_resolution,
+ depth,
+ num_heads,
+ window_size,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ qk_scale=None,
+ dropout=0.,
+ attention_dropout=0.,
+ droppath=0.,
+ downsample=None):
+ super(BasicLayer, self).__init__()
+ self.dim = dim
+ self.input_resolution = input_resolution
+ self.depth = depth
+
+ self.blocks = nn.LayerList()
+ for i in range(depth):
+ self.blocks.append(
+ SwinTransformerBlock(dim=dim,
+ input_resolution=input_resolution,
+ num_heads=num_heads,
+ window_size=window_size,
+ shift_size=0 if
+ (i % 2 == 0) else window_size // 2,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ dropout=dropout,
+ attention_dropout=attention_dropout,
+ droppath=droppath[i] if isinstance(
+ droppath, list) else droppath))
+
+ if downsample is not None:
+ self.downsample = downsample(input_resolution, dim=dim)
+ else:
+ self.downsample = None
+
+ def forward(self, x, x_size):
+ for block in self.blocks:
+ x = block(x, x_size)
+ if self.downsample is not None:
+ x = self.downsample(x)
+ return x
+
+ def extra_repr(self) -> str:
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
+
+ def flops(self):
+ flops = 0
+ for blk in self.blocks:
+ flops += blk.flops()
+ if self.downsample is not None:
+ flops += self.downsample.flops()
+ return flops
+
+
+class RSTB(nn.Layer):
+ """Residual Swin Transformer Block (RSTB).
+
+ Args:
+ dim (int): Number of input channels.
+ input_resolution (tuple[int]): Input resolution.
+ depth (int): Number of blocks.
+ num_heads (int): Number of attention heads.
+ window_size (int): Local window size.
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+ drop (float, optional): Dropout rate. Default: 0.0
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+ downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+ img_size: Input image size.
+ patch_size: Patch size.
+ resi_connection: The convolutional block before residual connection.
+ """
+
+ def __init__(self,
+ dim,
+ input_resolution,
+ depth,
+ num_heads,
+ window_size,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ qk_scale=None,
+ drop=0.,
+ attn_drop=0.,
+ drop_path=0.,
+ downsample=None,
+ img_size=224,
+ patch_size=4,
+ resi_connection='1conv'):
+ super(RSTB, self).__init__()
+
+ self.dim = dim
+ self.input_resolution = input_resolution
+
+ self.residual_group = BasicLayer(dim=dim,
+ input_resolution=input_resolution,
+ depth=depth,
+ num_heads=num_heads,
+ window_size=window_size,
+ mlp_ratio=mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ dropout=drop,
+ attention_dropout=attn_drop,
+ droppath=drop_path,
+ downsample=downsample)
+
+ if resi_connection == '1conv':
+ self.conv = nn.Conv2D(dim, dim, 3, 1, 1)
+ elif resi_connection == '3conv':
+ # to save parameters and memory
+ self.conv = nn.Sequential(nn.Conv2D(dim, dim // 4, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.2),
+ nn.Conv2D(dim // 4, dim // 4, 1, 1, 0),
+ nn.LeakyReLU(negative_slope=0.2),
+ nn.Conv2D(dim // 4, dim, 3, 1, 1))
+
+ self.patch_embed = PatchEmbed(img_size=img_size,
+ patch_size=patch_size,
+ in_chans=0,
+ embed_dim=dim,
+ norm_layer=None)
+
+ self.patch_unembed = PatchUnEmbed(img_size=img_size,
+ patch_size=patch_size,
+ in_chans=0,
+ embed_dim=dim,
+ norm_layer=None)
+
+ def forward(self, x, x_size):
+ return self.patch_embed(
+ self.conv(self.patch_unembed(self.residual_group(x, x_size),
+ x_size))) + x
+
+ def flops(self):
+ flops = 0
+ flops += self.residual_group.flops()
+ H, W = self.input_resolution
+ flops += H * W * self.dim * self.dim * 9
+ flops += self.patch_embed.flops()
+ flops += self.patch_unembed.flops()
+
+ return flops
+
+
+class PatchEmbed(nn.Layer):
+ r""" Image to Patch Embedding
+
+ Args:
+ img_size (int): Image size. Default: 224.
+ patch_size (int): Patch token size. Default: 4.
+ in_chans (int): Number of input image channels. Default: 3.
+ embed_dim (int): Number of linear projection output channels. Default: 96.
+ norm_layer (nn.Layer, optional): Normalization layer. Default: None
+ """
+
+ def __init__(self,
+ img_size=224,
+ patch_size=4,
+ in_chans=3,
+ embed_dim=96,
+ norm_layer=None):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ patches_resolution = [
+ img_size[0] // patch_size[0], img_size[1] // patch_size[1]
+ ]
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.patches_resolution = patches_resolution
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
+
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+
+ if norm_layer is not None:
+ self.norm = norm_layer(embed_dim)
+ else:
+ self.norm = None
+
+ def forward(self, x):
+ x = x.flatten(2).transpose([0, 2, 1]) # B Ph*Pw C
+ if self.norm is not None:
+ x = self.norm(x)
+ return x
+
+ def flops(self):
+ flops = 0
+ H, W = self.img_size
+ if self.norm is not None:
+ flops += H * W * self.embed_dim
+ return flops
+
+
+class PatchUnEmbed(nn.Layer):
+ r""" Image to Patch Unembedding
+
+ Args:
+ img_size (int): Image size. Default: 224.
+ patch_size (int): Patch token size. Default: 4.
+ in_chans (int): Number of input image channels. Default: 3.
+ embed_dim (int): Number of linear projection output channels. Default: 96.
+ norm_layer (nn.Layer, optional): Normalization layer. Default: None
+ """
+
+ def __init__(self,
+ img_size=224,
+ patch_size=4,
+ in_chans=3,
+ embed_dim=96,
+ norm_layer=None):
+ super().__init__()
+ img_size = to_2tuple(img_size)
+ patch_size = to_2tuple(patch_size)
+ patches_resolution = [
+ img_size[0] // patch_size[0], img_size[1] // patch_size[1]
+ ]
+ self.img_size = img_size
+ self.patch_size = patch_size
+ self.patches_resolution = patches_resolution
+ self.num_patches = patches_resolution[0] * patches_resolution[1]
+
+ self.in_chans = in_chans
+ self.embed_dim = embed_dim
+
+ def forward(self, x, x_size):
+ B, HW, C = x.shape
+ x = x.transpose([0, 2, 1]).reshape(
+ [B, self.embed_dim, x_size[0], x_size[1]]) # B C Ph Pw
+ return x
+
+ def flops(self):
+ flops = 0
+ return flops
+
+
+class Upsample(nn.Sequential):
+ """Upsample module.
+
+ Args:
+ scale (int): Scale factor. Supported scales: 2^n and 3.
+ num_feat (int): Channel number of intermediate features.
+ """
+
+ def __init__(self, scale, num_feat):
+ m = []
+ if (scale & (scale - 1)) == 0: # scale = 2^n
+ for _ in range(int(math.log(scale, 2))):
+ m.append(nn.Conv2D(num_feat, 4 * num_feat, 3, 1, 1))
+ m.append(nn.PixelShuffle(2))
+ elif scale == 3:
+ m.append(nn.Conv2D(num_feat, 9 * num_feat, 3, 1, 1))
+ m.append(nn.PixelShuffle(3))
+ else:
+ raise ValueError(f'scale {scale} is not supported. '
+ 'Supported scales: 2^n and 3.')
+ super(Upsample, self).__init__(*m)
+
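+# Channel/scale sketch (editor's note): for scale=4 and num_feat=64 the module
+# stacks two rounds of Conv2D(64 -> 256) + PixelShuffle(2); each round doubles
+# the spatial size and restores 64 channels, so (1, 64, 48, 48) becomes
+# (1, 64, 192, 192). For scale=3 a single Conv2D(64 -> 576) + PixelShuffle(3)
+# is used instead.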
+
+class UpsampleOneStep(nn.Sequential):
+ """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
+ Used in lightweight SR to save parameters.
+
+ Args:
+ scale (int): Scale factor. Supported scales: 2^n and 3.
+ num_feat (int): Channel number of intermediate features.
+
+ """
+
+ def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
+ self.num_feat = num_feat
+ self.input_resolution = input_resolution
+ m = []
+ m.append(nn.Conv2D(num_feat, (scale**2) * num_out_ch, 3, 1, 1))
+ m.append(nn.PixelShuffle(scale))
+ super(UpsampleOneStep, self).__init__(*m)
+
+ def flops(self):
+ H, W = self.input_resolution
+ flops = H * W * self.num_feat * 3 * 9
+ return flops
+
+
+@GENERATORS.register()
+class SwinIR(nn.Layer):
+ r""" SwinIR
+ A PaddlePaddle impl of: `SwinIR: Image Restoration Using Swin Transformer`, based on Swin Transformer.
+
+ Args:
+ img_size (int | tuple(int)): Input image size. Default 64
+ patch_size (int | tuple(int)): Patch size. Default: 1
+ in_chans (int): Number of input image channels. Default: 3
+ embed_dim (int): Patch embedding dimension. Default: 96
+ depths (tuple(int)): Depth of each Swin Transformer layer.
+ num_heads (tuple(int)): Number of attention heads in different layers.
+ window_size (int): Window size. Default: 7
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+ qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
+ drop_rate (float): Dropout rate. Default: 0
+ attn_drop_rate (float): Attention dropout rate. Default: 0
+ drop_path_rate (float): Stochastic depth rate. Default: 0.1
+ norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+ ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
+ patch_norm (bool): If True, add normalization after patch embedding. Default: True
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
+ upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compression artifact reduction
+ img_range: Image range. 1. or 255.
+ upsampler: The reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
+ resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
+ """
+
+ def __init__(self,
+ img_size=64,
+ patch_size=1,
+ in_chans=3,
+ embed_dim=96,
+ depths=[6, 6, 6, 6],
+ num_heads=[6, 6, 6, 6],
+ window_size=7,
+ mlp_ratio=4.,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0.,
+ attn_drop_rate=0.,
+ drop_path_rate=0.1,
+ norm_layer=nn.LayerNorm,
+ ape=False,
+ patch_norm=True,
+ upscale=2,
+ img_range=1.,
+ upsampler='',
+ resi_connection='1conv'):
+ super(SwinIR, self).__init__()
+ num_in_ch = in_chans
+ num_out_ch = in_chans
+ num_feat = 64
+ self.img_range = img_range
+ if in_chans == 3:
+ rgb_mean = np.array([0.4488, 0.4371, 0.4040], dtype=np.float32)
+ self.mean = paddle.to_tensor(rgb_mean).reshape([1, 3, 1, 1])
+ else:
+ self.mean = paddle.zeros([1, 1, 1, 1], dtype='float32')
+ self.upscale = upscale
+ self.upsampler = upsampler
+ self.window_size = window_size
+
+ # 1. shallow feature extraction
+ self.conv_first = nn.Conv2D(num_in_ch, embed_dim, 3, 1, 1)
+
+ # 2. deep feature extraction
+ self.num_layers = len(depths)
+ self.embed_dim = embed_dim
+ self.ape = ape
+ self.patch_norm = patch_norm
+ self.num_features = embed_dim
+ self.mlp_ratio = mlp_ratio
+
+ # split image into non-overlapping patches
+ self.patch_embed = PatchEmbed(
+ img_size=img_size,
+ patch_size=patch_size,
+ in_chans=embed_dim,
+ embed_dim=embed_dim,
+ norm_layer=norm_layer if self.patch_norm else None)
+ num_patches = self.patch_embed.num_patches
+ patches_resolution = self.patch_embed.patches_resolution
+ self.patches_resolution = patches_resolution
+
+ # merge non-overlapping patches into image
+ self.patch_unembed = PatchUnEmbed(
+ img_size=img_size,
+ patch_size=patch_size,
+ in_chans=embed_dim,
+ embed_dim=embed_dim,
+ norm_layer=norm_layer if self.patch_norm else None)
+
+ # absolute position embedding
+ if self.ape:
+ self.absolute_pos_embed = paddle.nn.ParameterList([
+ paddle.create_parameter(
+ shape=[1, num_patches, embed_dim],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.TruncatedNormal(
+ std=.02))
+ ])
+
+ self.pos_drop = nn.Dropout(p=drop_rate)
+
+ # stochastic depth
+ dpr = [
+ x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths))
+ ] # stochastic depth decay rule
+
+ # build Residual Swin Transformer blocks (RSTB)
+ self.layers = nn.LayerList()
+ for i_layer in range(self.num_layers):
+ layer = RSTB(
+ dim=embed_dim,
+ input_resolution=(patches_resolution[0], patches_resolution[1]),
+ depth=depths[i_layer],
+ num_heads=num_heads[i_layer],
+ window_size=window_size,
+ mlp_ratio=self.mlp_ratio,
+ qkv_bias=qkv_bias,
+ qk_scale=qk_scale,
+ drop=drop_rate,
+ attn_drop=attn_drop_rate,
+ drop_path=dpr[sum(depths[:i_layer]
+ ):sum(depths[:i_layer +
+ 1])], # no impact on SR results
+ downsample=None,
+ img_size=img_size,
+ patch_size=patch_size,
+ resi_connection=resi_connection)
+ self.layers.append(layer)
+ self.norm = norm_layer(self.num_features)
+
+ # build the last conv layer in deep feature extraction
+ if resi_connection == '1conv':
+ self.conv_after_body = nn.Conv2D(embed_dim, embed_dim, 3, 1, 1)
+ elif resi_connection == '3conv':
+ # to save parameters and memory
+ self.conv_after_body = nn.Sequential(
+ nn.Conv2D(embed_dim, embed_dim // 4, 3, 1, 1),
+ nn.LeakyReLU(negative_slope=0.2),
+ nn.Conv2D(embed_dim // 4, embed_dim // 4, 1, 1, 0),
+ nn.LeakyReLU(negative_slope=0.2),
+ nn.Conv2D(embed_dim // 4, embed_dim, 3, 1, 1))
+
+ # 3. high quality image reconstruction
+ if self.upsampler == 'pixelshuffle':
+ # for classical SR
+ self.conv_before_upsample = nn.Sequential(
+ nn.Conv2D(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU())
+ self.upsample = Upsample(upscale, num_feat)
+ self.conv_last = nn.Conv2D(num_feat, num_out_ch, 3, 1, 1)
+ elif self.upsampler == 'pixelshuffledirect':
+ # for lightweight SR (to save parameters)
+ self.upsample = UpsampleOneStep(
+ upscale, embed_dim, num_out_ch,
+ (patches_resolution[0], patches_resolution[1]))
+ elif self.upsampler == 'nearest+conv':
+ # for real-world SR (less artifacts)
+ assert self.upscale == 4, 'only support x4 now.'
+ self.conv_before_upsample = nn.Sequential(
+ nn.Conv2D(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU())
+ self.conv_up1 = nn.Conv2D(num_feat, num_feat, 3, 1, 1)
+ self.conv_up2 = nn.Conv2D(num_feat, num_feat, 3, 1, 1)
+ self.conv_hr = nn.Conv2D(num_feat, num_feat, 3, 1, 1)
+ self.conv_last = nn.Conv2D(num_feat, num_out_ch, 3, 1, 1)
+ self.lrelu = nn.LeakyReLU(negative_slope=0.2)
+ else:
+ # for image denoising and JPEG compression artifact reduction
+ self.conv_last = nn.Conv2D(embed_dim, num_out_ch, 3, 1, 1)
+
+ def no_weight_decay(self):
+ return {'absolute_pos_embed'}
+
+ def no_weight_decay_keywords(self):
+ return {'relative_position_bias_table'}
+
+ def check_image_size(self, x):
+ _, _, h, w = x.shape
+ mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
+ mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
+ x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect')
+ return x
+
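+ # Padding sketch (editor's note): with the default window_size=7, an input of
+ # 100x101 gets mod_pad_h = (7 - 100 % 7) % 7 = 5 and mod_pad_w = 4, i.e. it is
+ # reflection-padded to 105x105 so every Swin window is full; forward() crops
+ # the result back to H*upscale x W*upscale at the end.
+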
+ def forward_features(self, x):
+ x_size = (x.shape[2], x.shape[3])
+ x = self.patch_embed(x)
+ if self.ape:
+ x = x + self.absolute_pos_embed[0]
+ x = self.pos_drop(x)
+
+ for layer in self.layers:
+ x = layer(x, x_size)
+
+ x = self.norm(x) # B L C
+ x = self.patch_unembed(x, x_size)
+
+ return x
+
+ def forward(self, x):
+ H, W = x.shape[2:]
+ x = self.check_image_size(x)
+
+ x = (x - self.mean) * self.img_range
+
+ if self.upsampler == 'pixelshuffle':
+ # for classical SR
+ x = self.conv_first(x)
+ x = self.conv_after_body(self.forward_features(x)) + x
+ x = self.conv_before_upsample(x)
+ x = self.conv_last(self.upsample(x))
+ elif self.upsampler == 'pixelshuffledirect':
+ # for lightweight SR
+ x = self.conv_first(x)
+ x = self.conv_after_body(self.forward_features(x)) + x
+ x = self.upsample(x)
+ elif self.upsampler == 'nearest+conv':
+ # for real-world SR
+ x = self.conv_first(x)
+ x = self.conv_after_body(self.forward_features(x)) + x
+ x = self.conv_before_upsample(x)
+ x = self.lrelu(
+ self.conv_up1(
+ paddle.nn.functional.interpolate(x,
+ scale_factor=2,
+ mode='nearest')))
+ x = self.lrelu(
+ self.conv_up2(
+ paddle.nn.functional.interpolate(x,
+ scale_factor=2,
+ mode='nearest')))
+ x = self.conv_last(self.lrelu(self.conv_hr(x)))
+ else:
+ # for image denoising and JPEG compression artifact reduction
+ x_first = self.conv_first(x)
+ res = self.conv_after_body(self.forward_features(x_first)) + x_first
+ x = x + self.conv_last(res)
+
+ x = x / self.img_range + self.mean
+
+ return x[:, :, :H * self.upscale, :W * self.upscale]
+
+ def flops(self):
+ flops = 0
+ H, W = self.patches_resolution
+ flops += H * W * 3 * self.embed_dim * 9
+ flops += self.patch_embed.flops()
+ for i, layer in enumerate(self.layers):
+ flops += layer.flops()
+ flops += H * W * 3 * self.embed_dim * self.embed_dim
+ flops += self.upsample.flops()
+ return flops
diff --git a/ppgan/models/generators/unet.py b/ppgan/models/generators/unet.py
index 38d294a686fe884951b4ff657ad790bc97874ce7..f8c2a1b17a95a643dc5bc22f9b725eacec337c5e 100644
--- a/ppgan/models/generators/unet.py
+++ b/ppgan/models/generators/unet.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was based on https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
import functools
import paddle
import paddle.nn as nn
@@ -30,17 +31,19 @@ class UnetGenerator(nn.Layer):
ngf=64,
norm_type='batch',
use_dropout=False):
- """Construct a Unet generator
+ """
+ Construct a Unet generator.
+ The U-Net is built from the innermost layer to the outermost layer;
+ it is a recursive process.
+
Args:
- input_nc (int) -- the number of channels in input images
- output_nc (int) -- the number of channels in output images
- num_downs (int) -- the number of downsamplings in UNet. For example, # if |num_downs| == 7,
- image of size 128x128 will become of size 1x1 # at the bottleneck
- ngf (int) -- the number of filters in the last conv layer
- norm_layer -- normalization layer
+ input_nc (int): the number of channels in input images.
+ output_nc (int): the number of channels in output images.
+ num_downs (int): the number of downsamplings in UNet. For example, if |num_downs| == 7,
+ an image of size 128x128 will become of size 1x1 at the bottleneck.
+ ngf (int): the number of filters in the last conv layer.
+ norm_type (str): normalization type, default: 'batch'.
- We construct the U-Net from the innermost layer to the outermost layer.
- It is a recursive process.
"""
super(UnetGenerator, self).__init__()
norm_layer = build_norm_layer(norm_type)
@@ -105,15 +108,15 @@ class UnetSkipConnectionBlock(nn.Layer):
use_dropout=False):
"""Construct a Unet submodule with skip connections.
- Parameters:
- outer_nc (int) -- the number of filters in the outer conv layer
- inner_nc (int) -- the number of filters in the inner conv layer
- input_nc (int) -- the number of channels in input images/features
- submodule (UnetSkipConnectionBlock) -- previously defined submodules
- outermost (bool) -- if this module is the outermost module
- innermost (bool) -- if this module is the innermost module
- norm_layer -- normalization layer
- use_dropout (bool) -- if use dropout layers.
+ Args:
+ outer_nc (int): the number of filters in the outer conv layer
+ inner_nc (int): the number of filters in the inner conv layer
+ input_nc (int): the number of channels in input images/features
+ submodule (UnetSkipConnectionBlock): previously defined submodules
+ outermost (bool): if this module is the outermost module
+ innermost (bool): if this module is the innermost module
+ norm_layer (paddle.nn.Layer): normalization layer
+ use_dropout (bool): whether to use dropout layers.
"""
super(UnetSkipConnectionBlock, self).__init__()
self.outermost = outermost
@@ -173,5 +176,6 @@ class UnetSkipConnectionBlock(nn.Layer):
def forward(self, x):
if self.outermost:
return self.model(x)
- else: # add skip connections
+ # add skip connections
+ else:
return paddle.concat([x, self.model(x)], 1)
diff --git a/ppgan/models/generators/wav2lip.py b/ppgan/models/generators/wav2lip.py
index c4b9fc3c16d928398c449e6628a021693a82c3e1..5c8b0c9438a0819ec26aa4a58a64622092ff95ba 100644
--- a/ppgan/models/generators/wav2lip.py
+++ b/ppgan/models/generators/wav2lip.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/Rudrabha/Wav2Lip
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/Rudrabha/Wav2Lip#license-and-citation
import paddle
from paddle import nn
@@ -28,9 +18,9 @@ class Wav2Lip(nn.Layer):
self.face_encoder_blocks = nn.LayerList([
nn.Sequential(ConvBNRelu(6, 16, kernel_size=7, stride=1,
- padding=3)), # 96,96
+ padding=3)),
nn.Sequential(
- ConvBNRelu(16, 32, kernel_size=3, stride=2, padding=1), # 48,48
+ ConvBNRelu(16, 32, kernel_size=3, stride=2, padding=1),
ConvBNRelu(32,
32,
kernel_size=3,
@@ -44,7 +34,7 @@ class Wav2Lip(nn.Layer):
padding=1,
residual=True)),
nn.Sequential(
- ConvBNRelu(32, 64, kernel_size=3, stride=2, padding=1), # 24,24
+ ConvBNRelu(32, 64, kernel_size=3, stride=2, padding=1),
ConvBNRelu(64,
64,
kernel_size=3,
@@ -64,8 +54,7 @@ class Wav2Lip(nn.Layer):
padding=1,
residual=True)),
nn.Sequential(
- ConvBNRelu(64, 128, kernel_size=3, stride=2,
- padding=1), # 12,12
+ ConvBNRelu(64, 128, kernel_size=3, stride=2, padding=1),
ConvBNRelu(128,
128,
kernel_size=3,
@@ -79,7 +68,7 @@ class Wav2Lip(nn.Layer):
padding=1,
residual=True)),
nn.Sequential(
- ConvBNRelu(128, 256, kernel_size=3, stride=2, padding=1), # 6,6
+ ConvBNRelu(128, 256, kernel_size=3, stride=2, padding=1),
ConvBNRelu(256,
256,
kernel_size=3,
@@ -93,7 +82,7 @@ class Wav2Lip(nn.Layer):
padding=1,
residual=True)),
nn.Sequential(
- ConvBNRelu(256, 512, kernel_size=3, stride=2, padding=1), # 3,3
+ ConvBNRelu(256, 512, kernel_size=3, stride=2, padding=1),
ConvBNRelu(512,
512,
kernel_size=3,
@@ -102,8 +91,7 @@ class Wav2Lip(nn.Layer):
residual=True),
),
nn.Sequential(
- ConvBNRelu(512, 512, kernel_size=3, stride=1,
- padding=0), # 1, 1
+ ConvBNRelu(512, 512, kernel_size=3, stride=1, padding=0),
ConvBNRelu(512, 512, kernel_size=1, stride=1, padding=0)),
])
@@ -166,7 +154,7 @@ class Wav2Lip(nn.Layer):
512,
kernel_size=3,
stride=1,
- padding=0), # 3,3
+ padding=0),
ConvBNRelu(512,
512,
kernel_size=3,
@@ -193,7 +181,7 @@ class Wav2Lip(nn.Layer):
stride=1,
padding=1,
residual=True),
- ), # 6, 6
+ ),
nn.Sequential(
Conv2dTransposeRelu(768,
384,
@@ -213,7 +201,7 @@ class Wav2Lip(nn.Layer):
stride=1,
padding=1,
residual=True),
- ), # 12, 12
+ ),
nn.Sequential(
Conv2dTransposeRelu(512,
256,
@@ -233,7 +221,7 @@ class Wav2Lip(nn.Layer):
stride=1,
padding=1,
residual=True),
- ), # 24, 24
+ ),
nn.Sequential(
Conv2dTransposeRelu(320,
128,
@@ -253,7 +241,7 @@ class Wav2Lip(nn.Layer):
stride=1,
padding=1,
residual=True),
- ), # 48, 48
+ ),
nn.Sequential(
Conv2dTransposeRelu(160,
64,
@@ -274,14 +262,13 @@ class Wav2Lip(nn.Layer):
padding=1,
residual=True),
),
- ]) # 96,96
+ ])
self.output_block = nn.Sequential(
ConvBNRelu(80, 32, kernel_size=3, stride=1, padding=1),
nn.Conv2D(32, 3, kernel_size=1, stride=1, padding=0), nn.Sigmoid())
def forward(self, audio_sequences, face_sequences):
- # audio_sequences = (B, T, 1, 80, 16)
B = audio_sequences.shape[0]
input_dim_size = len(face_sequences.shape)
@@ -295,7 +282,7 @@ class Wav2Lip(nn.Layer):
],
axis=0)
- audio_embedding = self.audio_encoder(audio_sequences) # B, 512, 1, 1
+ audio_embedding = self.audio_encoder(audio_sequences)
feats = []
x = face_sequences
@@ -318,8 +305,8 @@ class Wav2Lip(nn.Layer):
x = self.output_block(x)
if input_dim_size > 4:
- x = paddle.split(x, int(x.shape[0] / B), axis=0) # [(B, C, H, W)]
- outputs = paddle.stack(x, axis=2) # (B, C, T, H, W)
+ x = paddle.split(x, int(x.shape[0] / B), axis=0)
+ outputs = paddle.stack(x, axis=2)
else:
outputs = x
diff --git a/ppgan/models/gfpgan_model.py b/ppgan/models/gfpgan_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bde5e48b8b6f946f21bb1d273698223d3294620
--- /dev/null
+++ b/ppgan/models/gfpgan_model.py
@@ -0,0 +1,552 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import sys
+import paddle
+from paddle.nn import functional as F
+from paddle import autograd
+
+from .base_model import BaseModel
+from .builder import MODELS
+from .generators.builder import build_generator
+from .discriminators.builder import build_discriminator
+from .criterions.builder import build_criterion
+from ..modules.init import init_weights
+from collections import OrderedDict
+from ..solver import build_lr_scheduler, build_optimizer
+from ppgan.utils.visual import *
+from ppgan.models.generators.gfpganv1_arch import FacialComponentDiscriminator
+from ppgan.utils.download import get_path_from_url
+
+
+@MODELS.register()
+class GFPGANModel(BaseModel):
+ """ This class implements the gfpgan model.
+
+ """
+ def __init__(self, **opt):
+
+ super(GFPGANModel, self).__init__()
+ self.opt = opt
+ train_opt = opt
+ if 'image_visual' in self.opt['path']:
+ self.image_paths = self.opt['path']['image_visual']
+ self.current_iter = 0
+ self.nets['net_g'] = build_generator(opt['network_g'])
+ self.log_size = int(math.log(self.opt['network_g']['out_size'], 2))
+ # define networks (both generator and discriminator)
+ self.nets['net_g_ema'] = build_generator(self.opt['network_g'])
+ self.nets['net_d'] = build_discriminator(self.opt['network_d'])
+ self.nets['net_g_ema'].eval()
+ pretrain_network_g = self.opt['path'].get('pretrain_network_g', None)
+ if pretrain_network_g is not None:
+ t_weight = get_path_from_url(pretrain_network_g)
+ t_weight = paddle.load(t_weight)
+ if 'net_g' in t_weight:
+ self.nets['net_g'].set_state_dict(t_weight['net_g'])
+ self.nets['net_g_ema'].set_state_dict(t_weight['net_g_ema'])
+ else:
+ self.nets['net_g'].set_state_dict(t_weight)
+ self.nets['net_g_ema'].set_state_dict(t_weight)
+
+ del t_weight
+
+ self.nets['net_d'].train()
+ self.nets['net_g'].train()
+ if ('network_d_left_eye' in self.opt
+ and 'network_d_right_eye' in self.opt
+ and 'network_d_mouth' in self.opt):
+ self.use_facial_disc = True
+ else:
+ self.use_facial_disc = False
+
+ if self.use_facial_disc:
+ # facial component discriminators: left eye, right eye, mouth
+ self.nets['net_d_left_eye'] = FacialComponentDiscriminator()
+ self.nets['net_d_right_eye'] = FacialComponentDiscriminator()
+ self.nets['net_d_mouth'] = FacialComponentDiscriminator()
+ load_path = self.opt['path'].get('pretrain_network_d_left_eye')
+ if load_path is not None:
+ load_val = get_path_from_url(load_path)
+ load_val = paddle.load(load_val)
+ self.nets['net_d_left_eye'].set_state_dict(load_val)
+ self.nets['net_d_right_eye'].set_state_dict(load_val)
+ self.nets['net_d_mouth'].set_state_dict(load_val)
+ del load_val
+ self.nets['net_d_left_eye'].train()
+ self.nets['net_d_right_eye'].train()
+ self.nets['net_d_mouth'].train()
+ self.cri_component = build_criterion(train_opt['gan_component_opt'])
+
+ if train_opt.get('pixel_opt'):
+ self.cri_pix = build_criterion(train_opt['pixel_opt'])
+ else:
+ self.cri_pix = None
+
+ # perceptual loss
+ if train_opt.get('perceptual_opt'):
+ self.cri_perceptual = build_criterion(train_opt['perceptual_opt'])
+ else:
+ self.cri_perceptual = None
+
+ # L1 loss is used in pyramid loss, component style loss and identity loss
+ self.cri_l1 = build_criterion(train_opt['L1_opt'])
+
+ # gan loss (wgan)
+ self.cri_gan = build_criterion(train_opt['gan_opt'])
+
+ # ----------- define identity loss ----------- #
+ if 'network_identity' in self.opt:
+ self.use_identity = True
+ else:
+ self.use_identity = False
+
+ if self.use_identity:
+ # define identity network
+ self.network_identity = build_discriminator(
+ self.opt['network_identity'])
+ load_path = self.opt['path'].get('pretrain_network_identity')
+ if load_path is not None:
+ load_val = get_path_from_url(load_path)
+ load_val = paddle.load(load_val)
+ self.network_identity.set_state_dict(load_val)
+ del load_val
+ self.network_identity.eval()
+ for param in self.network_identity.parameters():
+ param.stop_gradient = True
+
+ # regularization weights
+ self.r1_reg_weight = train_opt['r1_reg_weight'] # for discriminator
+ self.net_d_iters = train_opt.get('net_d_iters', 1)
+ self.net_d_init_iters = train_opt.get('net_d_init_iters', 0)
+ self.net_d_reg_every = train_opt['net_d_reg_every']
+
+ def setup_input(self, data):
+ self.lq = data['lq']
+
+ if 'gt' in data:
+ self.gt = data['gt']
+
+ if 'loc_left_eye' in data:
+ # get facial component locations, shape (batch, 4)
+ self.loc_left_eyes = data['loc_left_eye'].astype('float32')
+ self.loc_right_eyes = data['loc_right_eye'].astype('float32')
+ self.loc_mouths = data['loc_mouth'].astype('float32')
+
+ def forward(self, test_mode=False, regularize=False):
+ pass
+
+ def train_iter(self, optimizers=None):
+ # optimize nets['net_g']
+ for p in self.nets['net_d'].parameters():
+ p.stop_gradient = True
+ self.optimizers['optim_g'].clear_grad(set_to_zero=False)
+
+ # do not update facial component net_d
+ if self.use_facial_disc:
+ for p in self.nets['net_d_left_eye'].parameters():
+ p.stop_gradient = True
+ for p in self.nets['net_d_right_eye'].parameters():
+ p.stop_gradient = True
+ for p in self.nets['net_d_mouth'].parameters():
+ p.stop_gradient = True
+
+ # image pyramid loss weight
+ pyramid_loss_weight = self.opt.get('pyramid_loss_weight', 0)
+ if pyramid_loss_weight > 0 and self.current_iter > self.opt.get(
+ 'remove_pyramid_loss', float('inf')):
+ pyramid_loss_weight = 1e-12 # very small weight to avoid unused param error
+ if pyramid_loss_weight > 0:
+ self.output, out_rgbs = self.nets['net_g'](self.lq, return_rgb=True)
+ pyramid_gt = self.construct_img_pyramid()
+ else:
+ self.output, out_rgbs = self.nets['net_g'](self.lq,
+ return_rgb=False)
+
+ # get roi-align regions
+ if self.use_facial_disc:
+ self.get_roi_regions(eye_out_size=80, mouth_out_size=120)
+ l_g_total = 0
+ if (self.current_iter % self.net_d_iters == 0
+ and self.current_iter > self.net_d_init_iters):
+ # pixel loss
+ if self.cri_pix:
+ l_g_pix = self.cri_pix(self.output, self.gt)
+ l_g_total += l_g_pix
+ self.losses['l_g_pix'] = l_g_pix
+
+ # image pyramid loss
+ if pyramid_loss_weight > 0:
+ for i in range(0, self.log_size - 2):
+ l_pyramid = self.cri_l1(out_rgbs[i],
+ pyramid_gt[i]) * pyramid_loss_weight
+ l_g_total += l_pyramid
+ self.losses[f'l_p_{2**(i+3)}'] = l_pyramid
+
+ # perceptual loss
+ if self.cri_perceptual:
+ l_g_percep, l_g_style = self.cri_perceptual(
+ self.output, self.gt)
+ if l_g_percep is not None:
+ l_g_total += l_g_percep
+ self.losses['l_g_percep'] = l_g_percep
+ if l_g_style is not None:
+ l_g_total += l_g_style
+ self.losses['l_g_style'] = l_g_style
+
+ # gan loss
+ fake_g_pred = self.nets['net_d'](self.output)
+ l_g_gan = self.cri_gan(fake_g_pred, True, is_disc=False)
+ l_g_total += l_g_gan
+ self.losses['l_g_gan'] = l_g_gan
+
+ # facial component loss
+ if self.use_facial_disc:
+ # left eye
+ fake_left_eye, fake_left_eye_feats = self.nets[
+ 'net_d_left_eye'](self.left_eyes, return_feats=True)
+ l_g_gan = self.cri_component(fake_left_eye, True, is_disc=False)
+ l_g_total += l_g_gan
+ self.losses['l_g_gan_left_eye'] = l_g_gan
+ # right eye
+ fake_right_eye, fake_right_eye_feats = self.nets[
+ 'net_d_right_eye'](self.right_eyes, return_feats=True)
+ l_g_gan = self.cri_component(fake_right_eye,
+ True,
+ is_disc=False)
+ l_g_total += l_g_gan
+ self.losses['l_g_gan_right_eye'] = l_g_gan
+ # mouth
+ fake_mouth, fake_mouth_feats = self.nets['net_d_mouth'](
+ self.mouths, return_feats=True)
+ l_g_gan = self.cri_component(fake_mouth, True, is_disc=False)
+ l_g_total += l_g_gan
+ self.losses['l_g_gan_mouth'] = l_g_gan
+
+ if self.opt.get('comp_style_weight', 0) > 0:
+ # get gt feat
+ _, real_left_eye_feats = self.nets['net_d_left_eye'](
+ self.left_eyes_gt, return_feats=True)
+ _, real_right_eye_feats = self.nets['net_d_right_eye'](
+ self.right_eyes_gt, return_feats=True)
+ _, real_mouth_feats = self.nets['net_d_mouth'](
+ self.mouths_gt, return_feats=True)
+
+ def _comp_style(feat, feat_gt, criterion):
+ return (criterion(self._gram_mat(feat[0]),
+ self._gram_mat(feat_gt[0].detach())) * 0.5 +
+ criterion(self._gram_mat(feat[1]),
+ self._gram_mat(feat_gt[1].detach())))
+
+ # facial component style loss
+ comp_style_loss = 0
+ comp_style_loss += _comp_style(fake_left_eye_feats,
+ real_left_eye_feats,
+ self.cri_l1)
+ comp_style_loss += _comp_style(fake_right_eye_feats,
+ real_right_eye_feats,
+ self.cri_l1)
+ comp_style_loss += _comp_style(fake_mouth_feats,
+ real_mouth_feats,
+ self.cri_l1)
+ comp_style_loss = comp_style_loss * self.opt[
+ 'comp_style_weight']
+ l_g_total += comp_style_loss
+ self.losses['l_g_comp_style_loss'] = comp_style_loss
+
+ # identity loss
+ if self.use_identity:
+ identity_weight = self.opt['identity_weight']
+ # get gray images and resize
+ out_gray = self.gray_resize_for_identity(self.output)
+ gt_gray = self.gray_resize_for_identity(self.gt)
+
+ identity_gt = self.network_identity(gt_gray).detach()
+ identity_out = self.network_identity(out_gray)
+ l_identity = self.cri_l1(identity_out,
+ identity_gt) * identity_weight
+ l_g_total += l_identity
+ self.losses['l_identity'] = l_identity
+
+ l_g_total.backward()
+ self.optimizers['optim_g'].step()
+ # EMA
+ self.accumulate(self.nets['net_g_ema'],
+ self.nets['net_g'],
+ decay=0.5**(32 / (10 * 1000)))
+
+ # ----------- optimize net_d ----------- #
+ for p in self.nets['net_d'].parameters():
+ p.stop_gradient = False
+ self.optimizers['optim_d'].clear_grad(set_to_zero=False)
+ if self.use_facial_disc:
+ for p in self.nets['net_d_left_eye'].parameters():
+ p.stop_gradient = False
+ for p in self.nets['net_d_right_eye'].parameters():
+ p.stop_gradient = False
+ for p in self.nets['net_d_mouth'].parameters():
+ p.stop_gradient = False
+ self.optimizers['optim_net_d_left_eye'].clear_grad(
+ set_to_zero=False)
+ self.optimizers['optim_net_d_right_eye'].clear_grad(
+ set_to_zero=False)
+ self.optimizers['optim_net_d_mouth'].clear_grad(set_to_zero=False)
+ fake_d_pred = self.nets['net_d'](self.output.detach())
+ real_d_pred = self.nets['net_d'](self.gt)
+
+ l_d = self.cri_gan(real_d_pred, True, is_disc=True) + self.cri_gan(
+ fake_d_pred, False, is_disc=True)
+ self.losses['l_d'] = l_d
+ # In WGAN, real_score should be positive and fake_score should be negative
+ self.losses['real_score'] = real_d_pred.detach().mean()
+ self.losses['fake_score'] = fake_d_pred.detach().mean()
+ l_d.backward()
+ if self.current_iter % self.net_d_reg_every == 0:
+ self.gt.stop_gradient = False
+ real_pred = self.nets['net_d'](self.gt)
+ l_d_r1 = r1_penalty(real_pred, self.gt)
+ l_d_r1 = (self.r1_reg_weight / 2 * l_d_r1 * self.net_d_reg_every +
+ 0 * real_pred[0])
+ self.losses['l_d_r1'] = l_d_r1.detach().mean()
+ l_d_r1.backward()
+
+ self.optimizers['optim_d'].step()
+
+ # optimize facial component discriminators
+ if self.use_facial_disc:
+ # left eye
+ fake_d_pred, _ = self.nets['net_d_left_eye'](
+ self.left_eyes.detach())
+ real_d_pred, _ = self.nets['net_d_left_eye'](self.left_eyes_gt)
+ l_d_left_eye = self.cri_component(
+ real_d_pred, True, is_disc=True) + self.cri_gan(
+ fake_d_pred, False, is_disc=True)
+ self.losses['l_d_left_eye'] = l_d_left_eye
+ l_d_left_eye.backward()
+ # right eye
+ fake_d_pred, _ = self.nets['net_d_right_eye'](
+ self.right_eyes.detach())
+ real_d_pred, _ = self.nets['net_d_right_eye'](self.right_eyes_gt)
+ l_d_right_eye = self.cri_component(
+ real_d_pred, True, is_disc=True) + self.cri_gan(
+ fake_d_pred, False, is_disc=True)
+ self.losses['l_d_right_eye'] = l_d_right_eye
+ l_d_right_eye.backward()
+ # mouth
+ fake_d_pred, _ = self.nets['net_d_mouth'](self.mouths.detach())
+ real_d_pred, _ = self.nets['net_d_mouth'](self.mouths_gt)
+ l_d_mouth = self.cri_component(real_d_pred, True,
+ is_disc=True) + self.cri_gan(
+ fake_d_pred, False, is_disc=True)
+ self.losses['l_d_mouth'] = l_d_mouth
+ l_d_mouth.backward()
+
+ self.optimizers['optim_net_d_left_eye'].step()
+ self.optimizers['optim_net_d_right_eye'].step()
+ self.optimizers['optim_net_d_mouth'].step()
+
+ def test_iter(self, metrics=None):
+ self.nets['net_g_ema'].eval()
+ self.fake_img, _ = self.nets['net_g_ema'](self.lq)
+ self.visual_items['cur_fake'] = self.fake_img[0]
+ self.visual_items['cur_gt'] = self.gt[0]
+ self.visual_items['cur_lq'] = self.lq[0]
+ with paddle.no_grad():
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(self.fake_img.detach().numpy(),
+ self.gt.detach().numpy())
+
+ def setup_lr_schedulers(self, cfg):
+ self.lr_scheduler = OrderedDict()
+ self.lr_scheduler['_g'] = build_lr_scheduler(cfg)
+ self.lr_scheduler['_component'] = build_lr_scheduler(cfg)
+ cfg_d = cfg.copy()
+ net_d_reg_ratio = self.net_d_reg_every / (self.net_d_reg_every + 1)
+ cfg_d['learning_rate'] *= net_d_reg_ratio
+ self.lr_scheduler['_d'] = build_lr_scheduler(cfg_d)
+ return self.lr_scheduler
+
+ def setup_optimizers(self, lr, cfg):
+ # ----------- optimizer g ----------- #
+ net_g_reg_ratio = 1
+ parameters = []
+ parameters += self.nets['net_g'].parameters()
+ cfg['optim_g']['beta1'] = 0**net_g_reg_ratio
+ cfg['optim_g']['beta2'] = 0.99**net_g_reg_ratio
+
+ self.optimizers['optim_g'] = build_optimizer(cfg['optim_g'],
+ self.lr_scheduler['_g'],
+ parameters)
+
+ # ----------- optimizer d ----------- #
+ net_d_reg_ratio = self.net_d_reg_every / (self.net_d_reg_every + 1)
+ parameters = []
+ parameters += self.nets['net_d'].parameters()
+ cfg['optim_d']['beta1'] = 0**net_d_reg_ratio
+ cfg['optim_d']['beta2'] = 0.99**net_d_reg_ratio
+
+ self.optimizers['optim_d'] = build_optimizer(cfg['optim_d'],
+ self.lr_scheduler['_d'],
+ parameters)
+
+ # ----------- optimizers for facial component networks ----------- #
+ if self.use_facial_disc:
+ parameters = []
+ parameters += self.nets['net_d_left_eye'].parameters()
+
+ self.optimizers['optim_net_d_left_eye'] = build_optimizer(
+ cfg['optim_component'], self.lr_scheduler['_component'],
+ parameters)
+
+ parameters = []
+ parameters += self.nets['net_d_right_eye'].parameters()
+
+ self.optimizers['optim_net_d_right_eye'] = build_optimizer(
+ cfg['optim_component'], self.lr_scheduler['_component'],
+ parameters)
+
+ parameters = []
+ parameters += self.nets['net_d_mouth'].parameters()
+
+ self.optimizers['optim_net_d_mouth'] = build_optimizer(
+ cfg['optim_component'], self.lr_scheduler['_component'],
+ parameters)
+
+ return self.optimizers
+
+ def construct_img_pyramid(self):
+ """Construct image pyramid for intermediate restoration loss"""
+ pyramid_gt = [self.gt]
+ down_img = self.gt
+ for _ in range(0, self.log_size - 3):
+ down_img = F.interpolate(down_img,
+ scale_factor=0.5,
+ mode='bilinear',
+ align_corners=False)
+ pyramid_gt.insert(0, down_img)
+ return pyramid_gt
+
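+ # Pyramid sketch (editor's note): for out_size=512, log_size=9, so the loop
+ # downsamples the GT 6 times and the returned list holds 7 images ordered
+ # coarse-to-fine: 8, 16, 32, 64, 128, 256 and 512 pixels. These pair with
+ # out_rgbs from net_g in train_iter and the l_p_{size} loss entries.
+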
+ def get_roi_regions(self, eye_out_size=80, mouth_out_size=120):
+ from paddle.vision.ops import roi_align
+ face_ratio = int(self.opt['network_g']['out_size'] / 512)
+ eye_out_size *= face_ratio
+ mouth_out_size *= face_ratio
+
+ rois_eyes = []
+ rois_mouths = []
+ num_eye = []
+ num_mouth = []
+ for b in range(self.loc_left_eyes.shape[0]): # loop for batch size
+ # left eye and right eye
+
+ img_inds = paddle.ones([2, 1], dtype=self.loc_left_eyes.dtype) * b
+ bbox = paddle.stack(
+ [self.loc_left_eyes[b, :], self.loc_right_eyes[b, :]],
+ axis=0) # shape: (2, 4)
+ # rois = paddle.concat([img_inds, bbox], axis=-1) # shape: (2, 5)
+ rois_eyes.append(bbox)
+ # mouth
+ img_inds = paddle.ones([1, 1], dtype=self.loc_left_eyes.dtype) * b
+ num_eye.append(2)
+ num_mouth.append(1)
+ # rois = paddle.concat([img_inds, self.loc_mouths[b:b + 1, :]], axis=-1) # shape: (1, 5)
+ rois_mouths.append(self.loc_mouths[b:b + 1, :])
+ rois_eyes = paddle.concat(rois_eyes, 0)
+ rois_mouths = paddle.concat(rois_mouths, 0)
+ # real images
+ num_eye = paddle.to_tensor(num_eye, dtype='int32')
+ num_mouth = paddle.to_tensor(num_mouth, dtype='int32')
+
+ all_eyes = roi_align(self.gt,
+ boxes=rois_eyes,
+ boxes_num=num_eye,
+ output_size=eye_out_size,
+ aligned=False) * face_ratio
+ self.left_eyes_gt = all_eyes[0::2, :, :, :]
+ self.right_eyes_gt = all_eyes[1::2, :, :, :]
+ self.mouths_gt = roi_align(self.gt,
+ boxes=rois_mouths,
+ boxes_num=num_mouth,
+ output_size=mouth_out_size,
+ aligned=False) * face_ratio
+ # output
+ all_eyes = roi_align(self.output,
+ boxes=rois_eyes,
+ boxes_num=num_eye,
+ output_size=eye_out_size,
+ aligned=False) * face_ratio
+ self.left_eyes = all_eyes[0::2, :, :, :]
+ self.right_eyes = all_eyes[1::2, :, :, :]
+ self.mouths = roi_align(self.output,
+ boxes=rois_mouths,
+ boxes_num=num_mouth,
+ output_size=mouth_out_size,
+ aligned=False) * face_ratio
+
+ def _gram_mat(self, x):
+ """Calculate Gram matrix.
+
+ Args:
+ x (paddle.Tensor): Tensor with shape of (n, c, h, w).
+
+ Returns:
+ paddle.Tensor: Gram matrix.
+ """
+ n, c, h, w = x.shape
+ features = x.reshape((n, c, w * h))
+ features_t = features.transpose([0, 2, 1])
+ gram = features.bmm(features_t) / (c * h * w)
+ return gram
+
+ def gray_resize_for_identity(self, out, size=128):
+ out_gray = (0.2989 * out[:, 0, :, :] + 0.5870 * out[:, 1, :, :] +
+ 0.1140 * out[:, 2, :, :])
+ out_gray = out_gray.unsqueeze(1)
+ out_gray = F.interpolate(out_gray, (size, size),
+ mode='bilinear',
+ align_corners=False)
+ return out_gray
+
+ def accumulate(self, model1, model2, decay=0.999):
+ par1 = dict(model1.state_dict())
+ par2 = dict(model2.state_dict())
+
+ for k in par1.keys():
+ par1[k] = par1[k] * decay + par2[k] * (1 - decay)
+
+ model1.load_dict(par1)
+
+
+def r1_penalty(real_pred, real_img):
+ """R1 regularization for discriminator. The core idea is to
+ penalize the gradient on real data alone: when the
+ generator distribution produces the true data distribution
+ and the discriminator is equal to 0 on the data manifold, the
+ gradient penalty ensures that the discriminator cannot create
+ a non-zero gradient orthogonal to the data manifold without
+ suffering a loss in the GAN game.
+
+ Ref:
+ Eq. 9 in "Which Training Methods for GANs Do Actually Converge?" (ICML 2018).
+ """
+ grad_real = paddle.grad(outputs=real_pred.sum(),
+ inputs=real_img,
+ create_graph=True)[0]
+ grad_penalty = grad_real.pow(2).reshape(
+ (grad_real.shape[0], -1)).sum(1).mean()
+ return grad_penalty
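+
+# Usage sketch (editor's note, hypothetical tensors): given a discriminator
+# net_d that returns a realness score,
+#   real_img.stop_gradient = False
+#   real_pred = net_d(real_img)
+#   penalty = r1_penalty(real_pred, real_img)
+# train_iter above applies it lazily, every net_d_reg_every iterations, scaled
+# by r1_reg_weight / 2 * net_d_reg_every, with an extra 0 * real_pred[0] term.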
diff --git a/ppgan/models/gpen_model.py b/ppgan/models/gpen_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccd9f5260511f73e339375e284fd1060f52cf954
--- /dev/null
+++ b/ppgan/models/gpen_model.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+from .base_model import BaseModel
+
+from .builder import MODELS
+from .generators.builder import build_generator
+from .discriminators.builder import build_discriminator
+from ..modules.init import init_weights
+
+from .criterions.id_loss import IDLoss
+from paddle.nn import functional as F
+from paddle import autograd
+import math
+
+
+def d_logistic_loss(real_pred, fake_pred):
+ real_loss = F.softplus(-real_pred)
+ fake_loss = F.softplus(fake_pred)
+
+ return real_loss.mean() + fake_loss.mean()
+
+
+def d_r1_loss(real_pred, real_img):
+ grad_real, = autograd.grad(outputs=real_pred.sum(),
+ inputs=real_img,
+ create_graph=True)
+ grad_penalty = grad_real.pow(2).reshape([grad_real.shape[0],
+ -1]).sum(1).mean()
+
+ return grad_penalty
+
+
+def g_nonsaturating_loss(fake_pred,
+ loss_funcs=None,
+ fake_img=None,
+ real_img=None,
+ input_img=None):
+ smooth_l1_loss, id_loss = loss_funcs
+
+ loss = F.softplus(-fake_pred).mean()
+ loss_l1 = smooth_l1_loss(fake_img, real_img)
+ loss_id = id_loss(fake_img, real_img, input_img)
+ loss += 1.0 * loss_l1 + 1.0 * loss_id
+
+ return loss
+
+
+def g_path_regularize(fake_img, latents, mean_path_length, decay=0.01):
+ noise = paddle.randn(fake_img.shape) / math.sqrt(
+ fake_img.shape[2] * fake_img.shape[3])
+ grad, = autograd.grad(outputs=(fake_img * noise).sum(),
+ inputs=latents,
+ create_graph=True)
+ path_lengths = paddle.sqrt(grad.pow(2).sum(2).mean(1))
+
+ path_mean = mean_path_length + decay * (path_lengths.mean() -
+ mean_path_length)
+
+ path_penalty = (path_lengths - path_mean).pow(2).mean()
+
+ return path_penalty, path_mean.detach(), path_lengths
+
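+# Editor's note: this is StyleGAN2-style path length regularization. train_iter
+# below applies it lazily: only when current_iter % 4 == 0, scaled by 2 * 4
+# (path weight times the regularization interval), mirroring how the R1 penalty
+# on the discriminator is applied every 24 iterations.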
+
+@MODELS.register()
+class GPENModel(BaseModel):
+ """ This class implements the gpen model.
+
+ """
+
+ def __init__(self, generator, discriminator=None, direction='a2b'):
+
+ super(GPENModel, self).__init__()
+
+ self.direction = direction
+ # define networks (both generator and discriminator)
+ self.nets['netG'] = build_generator(generator)
+ self.nets['g_ema'] = build_generator(generator)
+ self.nets['g_ema'].eval()
+
+ if discriminator:
+ self.nets['netD'] = build_discriminator(discriminator)
+
+ self.accum = 0.5**(32 / (10 * 1000))
+ self.mean_path_length = 0
+
+ self.gan_criterions = []
+ self.gan_criterions.append(paddle.nn.SmoothL1Loss())
+ self.gan_criterions.append(IDLoss())
+ self.current_iter = 0
+
+ def setup_input(self, input):
+
+ self.degraded_img = paddle.to_tensor(input[0])
+ self.real_img = paddle.to_tensor(input[1])
+
+ def forward(self, test_mode=False, regularize=False):
+ if test_mode:
+ self.fake_img, _ = self.nets['g_ema'](self.degraded_img) # G(A)
+ else:
+ if regularize:
+ self.fake_img, self.latents = self.nets['netG'](
+ self.degraded_img, return_latents=True)
+ else:
+ self.fake_img, _ = self.nets['netG'](self.degraded_img)
+
+ def backward_D(self, regularize=False):
+ """Calculate GAN loss for the discriminator"""
+ if regularize:
+ self.real_img.stop_gradient = False
+ real_pred = self.nets['netD'](self.real_img)
+ r1_loss = d_r1_loss(real_pred, self.real_img)
+ (10 / 2 * r1_loss * 16).backward()
+ else:
+ fake_pred = self.nets['netD'](self.fake_img)
+ real_pred = self.nets['netD'](self.real_img)
+ self.loss_D = d_logistic_loss(real_pred, fake_pred)
+ self.loss_D.backward()
+ self.losses['D_loss'] = self.loss_D
+
+ def backward_G(self, regularize):
+ """Calculate GAN and L1 loss for the generator"""
+
+ if regularize:
+ path_loss, self.mean_path_length, path_lengths = g_path_regularize(
+ self.fake_img, self.latents, self.mean_path_length)
+ weighted_path_loss = 2 * 4 * path_loss
+ weighted_path_loss.backward()
+ else:
+ fake_pred = self.nets['netD'](self.fake_img)
+ self.loss_G = g_nonsaturating_loss(fake_pred, self.gan_criterions,
+ self.fake_img, self.real_img,
+ self.degraded_img)
+ self.loss_G.backward()
+ self.losses['G_loss'] = self.loss_G
+
+ def train_iter(self, optimizers=None):
+
+ self.current_iter += 1
+ # update D
+ self.set_requires_grad(self.nets['netD'], True)
+ self.set_requires_grad(self.nets['netG'], False)
+ self.forward(test_mode=False)
+ optimizers['optimD'].clear_grad()
+ self.backward_D(regularize=False)
+ optimizers['optimD'].step()
+
+ d_regularize = self.current_iter % 24 == 0
+ if d_regularize:
+ optimizers['optimD'].clear_grad()
+ self.backward_D(regularize=True)
+ optimizers['optimD'].step()
+ # update G
+ self.set_requires_grad(self.nets['netD'], False)
+ self.set_requires_grad(self.nets['netG'], True)
+ self.forward(test_mode=False)
+ optimizers['optimG'].clear_grad()
+ self.backward_G(regularize=False)
+ optimizers['optimG'].step()
+
+ g_regularize = self.current_iter % 4 == 0
+ if g_regularize:
+ self.forward(test_mode=False, regularize=True)
+ optimizers['optimG'].clear_grad()
+ self.backward_G(regularize=True)
+ optimizers['optimG'].step()
+
+ self.accumulate(self.nets['g_ema'], self.nets['netG'], self.accum)
+
+ def test_iter(self, metrics=None):
+ self.nets['g_ema'].eval()
+ self.forward(test_mode=True)
+
+ with paddle.no_grad():
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(self.fake_img, self.real_img)
+
+ def accumulate(self, model1, model2, decay=0.999):
+ par1 = dict(model1.state_dict())
+ par2 = dict(model2.state_dict())
+
+ for k in par1.keys():
+ par1[k] = par1[k] * decay + par2[k] * (1 - decay)
+
+ model1.load_dict(par1)
diff --git a/ppgan/models/invdn_model.py b/ppgan/models/invdn_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e700469aee8e775692be54f00d6eeb8eefb31dd
--- /dev/null
+++ b/ppgan/models/invdn_model.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .base_model import BaseModel
+from .generators.builder import build_generator
+from .criterions.builder import build_criterion
+from ppgan.utils.visual import tensor2img
+from ..solver import build_lr_scheduler, build_optimizer
+
+
+@MODELS.register()
+class InvDNModel(BaseModel):
+ """InvDN Model.
+ Invertible Denoising Network: A Light Solution for Real Noise Removal (CVPR 2021)
+ Originally Written by Liu, Yang and Qin, Zhenyue.
+ """
+ def __init__(self, generator):
+ """Initialize the the class.
+
+ Args:
+ generator (dict): config of generator.
+ """
+ super(InvDNModel, self).__init__(generator)
+ self.current_iter = 1
+
+ self.nets['generator'] = build_generator(generator)
+
+ self.generator_cfg = generator
+
+ def setup_input(self, input):
+ self.noisy = input[0]
+ self.gt = input[1]
+ self.lq = input[2]
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_gradients()
+
+ noise_channel = 3 * 4**(self.generator_cfg.down_num) - 3
+ noise = paddle.randn((self.noisy.shape[0], noise_channel,
+ self.noisy.shape[2], self.noisy.shape[3]))
+ output_hq, output_lq = self.nets['generator'](self.noisy, noise)
+ output_hq = output_hq[:, :3, :, :]
+ output_lq = output_lq[:, :3, :, :]
+
+ self.lq = self.lq.detach()
+ l_forw_fit = 16.0 * paddle.mean(
+ paddle.sum((output_lq - self.lq)**2, (1, 2, 3)))
+ l_back_rec = paddle.mean(
+ paddle.sum(
+ paddle.sqrt((self.gt - output_hq) * (self.gt - output_hq) +
+ 1e-3), (1, 2, 3)))
+
+ l_total = l_forw_fit + l_back_rec
+
+ l_total.backward()
+ optims['optim'].step()
+ self.losses['loss'] = l_total.numpy()
+
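+ # Editor's note: noise_channel = 3 * 4**down_num - 3 is the number of latent
+ # channels produced by the invertible down-scaling (e.g. down_num=2 gives 45).
+ # l_forw_fit is an MSE between the forward low-resolution branch and self.lq
+ # (weighted by 16), while l_back_rec is a Charbonnier-style penalty
+ # sqrt((gt - out)^2 + 1e-3) on the reconstructed high-quality image.
+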
+ def setup_optimizers(self, lr, cfg):
+ if cfg.get('name', None):
+ cfg_ = cfg.copy()
+ net_names = cfg_.pop('net_names')
+ parameters = []
+ for net_name in net_names:
+ parameters += self.nets[net_name].parameters()
+
+ cfg_['grad_clip'] = nn.ClipGradByNorm(cfg_['clip_grad_norm'])
+ cfg_.pop('clip_grad_norm')
+
+ self.optimizers['optim'] = build_optimizer(cfg_, lr, parameters)
+ else:
+ for opt_name, opt_cfg in cfg.items():
+ cfg_ = opt_cfg.copy()
+ net_names = cfg_.pop('net_names')
+ parameters = []
+ for net_name in net_names:
+ parameters += self.nets[net_name].parameters()
+ self.optimizers[opt_name] = build_optimizer(
+ cfg_, lr, parameters)
+
+ return self.optimizers
+
+ def forward(self):
+ pass
+
+ def test_iter(self, metrics=None):
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+
+ noise_channel = 3 * 4**(self.generator_cfg.down_num) - 3
+ noise = paddle.randn((self.noisy.shape[0], noise_channel,
+ self.noisy.shape[2], self.noisy.shape[3]))
+ output_hq, _ = self.nets['generator'](self.noisy, noise)
+ output_hq = output_hq[:, :3, :, :]
+
+ self.output = output_hq
+ self.visual_items['output'] = self.output
+
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+ for out_tensor, gt_tensor in zip(self.output, self.gt):
+ out_img.append(tensor2img(out_tensor, (0., 1.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 1.)))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img)
+
+ def export_model(self,
+ export_model=None,
+ output_dir=None,
+ inputs_size=None,
+ export_serving_model=False,
+ model_name=None):
+ shape = inputs_size[0]
+ new_model = self.nets['generator']
+ new_model.eval()
+
+ noise_channel = 3 * 4**(self.generator_cfg.down_num) - 3
+ noise_shape = (shape[0], noise_channel, shape[2], shape[3])
+ input_spec = [
+ paddle.static.InputSpec(shape=shape, dtype="float32"),
+ paddle.static.InputSpec(shape=noise_shape, dtype="float32")
+ ]
+
+ static_model = paddle.jit.to_static(new_model, input_spec=input_spec)
+
+ if output_dir is None:
+ output_dir = 'inference_model'
+ if model_name is None:
+ model_name = '{}_{}'.format(self.__class__.__name__.lower(),
+ export_model[0]['name'])
+
+ paddle.jit.save(static_model, os.path.join(output_dir, model_name))
diff --git a/ppgan/models/lapstyle_model.py b/ppgan/models/lapstyle_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3a644e8dbffc8c87f8496a53dae3fe7e00489a4
--- /dev/null
+++ b/ppgan/models/lapstyle_model.py
@@ -0,0 +1,445 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from .base_model import BaseModel
+
+from .builder import MODELS
+from .generators.builder import build_generator
+from .criterions import build_criterion
+from .discriminators.builder import build_discriminator
+
+from ..modules.init import init_weights
+
+
+@MODELS.register()
+class LapStyleDraModel(BaseModel):
+ def __init__(self,
+ generator_encode,
+ generator_decode,
+ calc_style_emd_loss=None,
+ calc_content_relt_loss=None,
+ calc_content_loss=None,
+ calc_style_loss=None,
+ content_layers=['r11', 'r21', 'r31', 'r41', 'r51'],
+ style_layers=['r11', 'r21', 'r31', 'r41', 'r51'],
+ content_weight=1.0,
+ style_weight=3.0):
+
+ super(LapStyleDraModel, self).__init__()
+
+ # define generators
+ self.nets['net_enc'] = build_generator(generator_encode)
+ self.nets['net_dec'] = build_generator(generator_decode)
+ init_weights(self.nets['net_dec'])
+ self.set_requires_grad([self.nets['net_enc']], False)
+
+ # define loss functions
+ self.calc_style_emd_loss = build_criterion(calc_style_emd_loss)
+ self.calc_content_relt_loss = build_criterion(calc_content_relt_loss)
+ self.calc_content_loss = build_criterion(calc_content_loss)
+ self.calc_style_loss = build_criterion(calc_style_loss)
+
+ self.content_layers = content_layers
+ self.style_layers = style_layers
+ self.content_weight = content_weight
+ self.style_weight = style_weight
+
+ def setup_input(self, input):
+ self.ci = paddle.to_tensor(input['ci'])
+ self.visual_items['ci'] = self.ci
+ self.si = paddle.to_tensor(input['si'])
+ self.visual_items['si'] = self.si
+ self.image_paths = input['ci_path']
+
+ def forward(self):
+ """Run forward pass."""
+ self.cF = self.nets['net_enc'](self.ci)
+ self.sF = self.nets['net_enc'](self.si)
+ self.stylized = self.nets['net_dec'](self.cF, self.sF)
+ self.visual_items['stylized'] = self.stylized
+
+ def backward_Dec(self):
+ self.tF = self.nets['net_enc'](self.stylized)
+ """content loss"""
+ self.loss_c = 0
+ for layer in self.content_layers:
+ self.loss_c += self.calc_content_loss(self.tF[layer],
+ self.cF[layer],
+ norm=True)
+ self.losses['loss_c'] = self.loss_c
+ """style loss"""
+ self.loss_s = 0
+ for layer in self.style_layers:
+ self.loss_s += self.calc_style_loss(self.tF[layer], self.sF[layer])
+ self.losses['loss_s'] = self.loss_s
+ """identity losses"""
+ self.Icc = self.nets['net_dec'](self.cF, self.cF)
+ self.l_identity1 = self.calc_content_loss(self.Icc, self.ci)
+ self.Fcc = self.nets['net_enc'](self.Icc)
+ self.l_identity2 = 0
+ for layer in self.content_layers:
+ self.l_identity2 += self.calc_content_loss(self.Fcc[layer],
+ self.cF[layer])
+ self.losses['l_identity1'] = self.l_identity1
+ self.losses['l_identity2'] = self.l_identity2
+ """relative loss"""
+ self.loss_style_remd = self.calc_style_emd_loss(
+ self.tF['r31'], self.sF['r31']) + self.calc_style_emd_loss(
+ self.tF['r41'], self.sF['r41'])
+ self.loss_content_relt = self.calc_content_relt_loss(
+ self.tF['r31'], self.cF['r31']) + self.calc_content_relt_loss(
+ self.tF['r41'], self.cF['r41'])
+ self.losses['loss_style_remd'] = self.loss_style_remd
+ self.losses['loss_content_relt'] = self.loss_content_relt
+
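+ # Weighted sum of all terms: content and style losses use the configured
+ # weights, while the constants (50, 1, 10, 16) weight the two identity
+ # losses, the rEMD style loss and the relative content loss of this
+ # draft-network stage.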
+ self.loss = self.loss_c * self.content_weight + self.loss_s * self.style_weight +\
+ self.l_identity1 * 50 + self.l_identity2 * 1 + self.loss_style_remd * 10 + \
+ self.loss_content_relt * 16
+ self.loss.backward()
+
+ return self.loss
+
+ def train_iter(self, optimizers=None):
+ """Calculate losses, gradients, and update network weights"""
+ self.forward()
+ optimizers['optimG'].clear_grad()
+ self.backward_Dec()
+ self.optimizers['optimG'].step()
+
+
+def tensor_resample(tensor, dst_size, mode='bilinear'):
+ return F.interpolate(tensor, dst_size, mode=mode, align_corners=False)
+
+
+def laplacian(x):
+ """
+ Laplacian
+
+ return:
+ x - upsample(downsample(x))
+ """
+ return x - tensor_resample(
+ tensor_resample(x, [x.shape[2] // 2, x.shape[3] // 2]),
+ [x.shape[2], x.shape[3]])
+
+
+def make_laplace_pyramid(x, levels):
+ """
+ Make Laplacian Pyramid
+ """
+ pyramid = []
+ current = x
+ for i in range(levels):
+ pyramid.append(laplacian(current))
+ current = tensor_resample(
+ current,
+ (max(current.shape[2] // 2, 1), max(current.shape[3] // 2, 1)))
+ pyramid.append(current)
+ return pyramid
+
+
+def fold_laplace_pyramid(pyramid):
+ """
+ Fold Laplacian Pyramid
+ """
+ current = pyramid[-1]
+ for i in range(len(pyramid) - 2, -1, -1): # iterate from len-2 to 0
+ up_h, up_w = pyramid[i].shape[2], pyramid[i].shape[3]
+ current = pyramid[i] + tensor_resample(current, (up_h, up_w))
+ return current
+
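+# Minimal usage sketch (hypothetical 1x3x256x256 input): make_laplace_pyramid(x, 2)
+# returns [lap_256, lap_128, low_64]; fold_laplace_pyramid on that list upsamples
+# low_64 back through 128x128 and 256x256, adding the stored residual at each
+# level, and recovers x (up to floating-point rounding).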
+
+@MODELS.register()
+class LapStyleRevFirstModel(BaseModel):
+ def __init__(self,
+ revnet_generator,
+ revnet_discriminator,
+ draftnet_encode,
+ draftnet_decode,
+ calc_style_emd_loss=None,
+ calc_content_relt_loss=None,
+ calc_content_loss=None,
+ calc_style_loss=None,
+ gan_criterion=None,
+ content_layers=['r11', 'r21', 'r31', 'r41', 'r51'],
+ style_layers=['r11', 'r21', 'r31', 'r41', 'r51'],
+ content_weight=1.0,
+ style_weight=3.0):
+
+ super(LapStyleRevFirstModel, self).__init__()
+
+ # define draftnet params
+ self.nets['net_enc'] = build_generator(draftnet_encode)
+ self.nets['net_dec'] = build_generator(draftnet_decode)
+
+ self.set_requires_grad([self.nets['net_enc']], False)
+ self.set_requires_grad([self.nets['net_dec']], False)
+
+ # define revision-net params
+ self.nets['net_rev'] = build_generator(revnet_generator)
+ init_weights(self.nets['net_rev'])
+ self.nets['netD'] = build_discriminator(revnet_discriminator)
+ init_weights(self.nets['netD'])
+
+ # define loss functions
+ self.calc_style_emd_loss = build_criterion(calc_style_emd_loss)
+ self.calc_content_relt_loss = build_criterion(calc_content_relt_loss)
+ self.calc_content_loss = build_criterion(calc_content_loss)
+ self.calc_style_loss = build_criterion(calc_style_loss)
+ self.gan_criterion = build_criterion(gan_criterion)
+
+ self.content_layers = content_layers
+ self.style_layers = style_layers
+ self.content_weight = content_weight
+ self.style_weight = style_weight
+
+ def setup_input(self, input):
+ self.ci = paddle.to_tensor(input['ci'])
+ self.visual_items['ci'] = self.ci
+ self.si = paddle.to_tensor(input['si'])
+ self.visual_items['si'] = self.si
+ self.image_paths = input['ci_path']
+
+ self.pyr_ci = make_laplace_pyramid(self.ci, 1)
+ self.pyr_si = make_laplace_pyramid(self.si, 1)
+ self.pyr_ci.append(self.ci)
+ self.pyr_si.append(self.si)
+
+ def forward(self):
+ """Run forward pass."""
+
+ cF = self.nets['net_enc'](self.pyr_ci[1])
+ sF = self.nets['net_enc'](self.pyr_si[1])
+
+ stylized_small = self.nets['net_dec'](cF, sF)
+ self.visual_items['stylized_small'] = stylized_small
+ stylized_up = F.interpolate(stylized_small, scale_factor=2)
+
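+ # The revision network refines the next pyramid level: it takes the content
+ # image's Laplacian band concatenated with the 2x-upsampled draft result
+ # (6 channels in total), predicts a high-frequency residual, and that
+ # residual is folded with the small stylized image into the full output.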
+ revnet_input = paddle.concat(x=[self.pyr_ci[0], stylized_up], axis=1)
+ stylized_rev_lap = self.nets['net_rev'](revnet_input)
+ stylized_rev = fold_laplace_pyramid([stylized_rev_lap, stylized_small])
+
+ self.stylized = stylized_rev
+ self.visual_items['stylized'] = self.stylized
+
+ def backward_G(self):
+ self.tF = self.nets['net_enc'](self.stylized)
+ self.cF = self.nets['net_enc'](self.pyr_ci[2])
+ self.sF = self.nets['net_enc'](self.pyr_si[2])
+ """content loss"""
+ self.loss_c = 0
+ for layer in self.content_layers:
+ self.loss_c += self.calc_content_loss(self.tF[layer],
+ self.cF[layer],
+ norm=True)
+ self.losses['loss_c'] = self.loss_c
+ """style loss"""
+ self.loss_s = 0
+ for layer in self.style_layers:
+ self.loss_s += self.calc_style_loss(self.tF[layer], self.sF[layer])
+ self.losses['loss_s'] = self.loss_s
+ """relative loss"""
+ self.loss_style_remd = self.calc_style_emd_loss(
+ self.tF['r31'], self.sF['r31']) + self.calc_style_emd_loss(
+ self.tF['r41'], self.sF['r41'])
+ self.loss_content_relt = self.calc_content_relt_loss(
+ self.tF['r31'], self.cF['r31']) + self.calc_content_relt_loss(
+ self.tF['r41'], self.cF['r41'])
+ self.losses['loss_style_remd'] = self.loss_style_remd
+ self.losses['loss_content_relt'] = self.loss_content_relt
+ """gan loss"""
+ pred_fake = self.nets['netD'](self.stylized)
+ self.loss_G_GAN = self.gan_criterion(pred_fake, True)
+ self.losses['loss_gan_G'] = self.loss_G_GAN
+
+ self.loss = self.loss_G_GAN + self.loss_c * self.content_weight + self.loss_s * self.style_weight +\
+ self.loss_style_remd * 10 + self.loss_content_relt * 16
+ self.loss.backward()
+ return self.loss
+
+ def backward_D(self):
+ """Calculate GAN loss for the discriminator"""
+ pred_fake = self.nets['netD'](self.stylized.detach())
+ self.loss_D_fake = self.gan_criterion(pred_fake, False)
+ pred_real = self.nets['netD'](self.pyr_si[2])
+ self.loss_D_real = self.gan_criterion(pred_real, True)
+ self.loss_D = (self.loss_D_fake + self.loss_D_real) * 0.5
+
+ self.loss_D.backward()
+
+ self.losses['D_fake_loss'] = self.loss_D_fake
+ self.losses['D_real_loss'] = self.loss_D_real
+
+ def train_iter(self, optimizers=None):
+ # compute fake images: G(A)
+ self.forward()
+ # update D
+ self.set_requires_grad(self.nets['netD'], True)
+ optimizers['optimD'].clear_grad()
+ self.backward_D()
+ optimizers['optimD'].step()
+
+ # update G
+ self.set_requires_grad(self.nets['netD'], False)
+ optimizers['optimG'].clear_grad()
+ self.backward_G()
+ optimizers['optimG'].step()
+
+
+@MODELS.register()
+class LapStyleRevSecondModel(BaseModel):
+ def __init__(self,
+ revnet_generator,
+ revnet_discriminator,
+ draftnet_encode,
+ draftnet_decode,
+ calc_style_emd_loss=None,
+ calc_content_relt_loss=None,
+ calc_content_loss=None,
+ calc_style_loss=None,
+ gan_criterion=None,
+ content_layers=['r11', 'r21', 'r31', 'r41', 'r51'],
+ style_layers=['r11', 'r21', 'r31', 'r41', 'r51'],
+ content_weight=1.0,
+ style_weight=3.0):
+
+ super(LapStyleRevSecondModel, self).__init__()
+
+ # define draftnet params
+ self.nets['net_enc'] = build_generator(draftnet_encode)
+ self.nets['net_dec'] = build_generator(draftnet_decode)
+ self.set_requires_grad([self.nets['net_enc']], False)
+ self.set_requires_grad([self.nets['net_dec']], False)
+
+ # define the first revnet params
+ self.nets['net_rev'] = build_generator(revnet_generator)
+ self.set_requires_grad([self.nets['net_rev']], False)
+
+ # define the second revnet params
+ self.nets['net_rev_2'] = build_generator(revnet_generator)
+ init_weights(self.nets['net_rev_2'])
+ self.nets['netD'] = build_discriminator(revnet_discriminator)
+ init_weights(self.nets['netD'])
+
+ # define loss functions
+ self.calc_style_emd_loss = build_criterion(calc_style_emd_loss)
+ self.calc_content_relt_loss = build_criterion(calc_content_relt_loss)
+ self.calc_content_loss = build_criterion(calc_content_loss)
+ self.calc_style_loss = build_criterion(calc_style_loss)
+ self.gan_criterion = build_criterion(gan_criterion)
+
+ self.content_layers = content_layers
+ self.style_layers = style_layers
+ self.content_weight = content_weight
+ self.style_weight = style_weight
+
+ def setup_input(self, input):
+ self.ci = paddle.to_tensor(input['ci'])
+ self.visual_items['ci'] = self.ci
+ self.si = paddle.to_tensor(input['si'])
+ self.visual_items['si'] = self.si
+ self.image_paths = input['ci_path']
+
+ self.pyr_ci = make_laplace_pyramid(self.ci, 2)
+ self.pyr_si = make_laplace_pyramid(self.si, 2)
+ self.pyr_ci.append(self.ci)
+ self.pyr_si.append(self.si)
+
+ def forward(self):
+ """Run forward pass."""
+
+ cF = self.nets['net_enc'](self.pyr_ci[2])
+ sF = self.nets['net_enc'](self.pyr_si[2])
+
+ stylized_small = self.nets['net_dec'](cF, sF)
+ self.visual_items['stylized_small'] = stylized_small
+ stylized_up = F.interpolate(stylized_small, scale_factor=2)
+
+ revnet_input = paddle.concat(x=[self.pyr_ci[1], stylized_up], axis=1)
+ stylized_rev_lap = self.nets['net_rev'](revnet_input)
+ stylized_rev = fold_laplace_pyramid([stylized_rev_lap, stylized_small])
+ self.visual_items['stylized_rev_first'] = stylized_rev
+ stylized_up = F.interpolate(stylized_rev, scale_factor=2)
+
+ revnet_input = paddle.concat(x=[self.pyr_ci[0], stylized_up], axis=1)
+ stylized_rev_lap_second = self.nets['net_rev_2'](revnet_input)
+ stylized_rev_second = fold_laplace_pyramid(
+ [stylized_rev_lap_second, stylized_rev_lap, stylized_small])
+
+ self.stylized = stylized_rev_second
+ self.visual_items['stylized'] = self.stylized
+
+ def backward_G(self):
+ self.tF = self.nets['net_enc'](self.stylized)
+ self.cF = self.nets['net_enc'](self.pyr_ci[3])
+ self.sF = self.nets['net_enc'](self.pyr_si[3])
+ """content loss"""
+ self.loss_c = 0
+ for layer in self.content_layers:
+ self.loss_c += self.calc_content_loss(self.tF[layer],
+ self.cF[layer],
+ norm=True)
+ self.losses['loss_c'] = self.loss_c
+ """style loss"""
+ self.loss_s = 0
+ for layer in self.style_layers:
+ self.loss_s += self.calc_style_loss(self.tF[layer], self.sF[layer])
+ self.losses['loss_s'] = self.loss_s
+ """relative loss"""
+ self.loss_style_remd = self.calc_style_emd_loss(self.tF['r41'],
+ self.sF['r41'])
+ self.loss_content_relt = self.calc_content_relt_loss(
+ self.tF['r41'], self.cF['r41'])
+ self.losses['loss_style_remd'] = self.loss_style_remd
+ self.losses['loss_content_relt'] = self.loss_content_relt
+ """gan loss"""
+ pred_fake = self.nets['netD'](self.stylized)
+ self.loss_G_GAN = self.gan_criterion(pred_fake, True)
+ self.losses['loss_gan_G'] = self.loss_G_GAN
+
+ self.loss = self.loss_G_GAN + self.loss_c * self.content_weight + self.loss_s * self.style_weight +\
+ self.loss_style_remd * 10 + self.loss_content_relt * 16
+ self.loss.backward()
+ return self.loss
+
+ def backward_D(self):
+ """Calculate GAN loss for the discriminator"""
+ pred_fake = self.nets['netD'](self.stylized.detach())
+ self.loss_D_fake = self.gan_criterion(pred_fake, False)
+ pred_real = self.nets['netD'](self.pyr_si[3])
+ self.loss_D_real = self.gan_criterion(pred_real, True)
+ self.loss_D = (self.loss_D_fake + self.loss_D_real) * 0.5
+
+ self.loss_D.backward()
+
+ self.losses['D_fake_loss'] = self.loss_D_fake
+ self.losses['D_real_loss'] = self.loss_D_real
+
+ def train_iter(self, optimizers=None):
+ # compute fake images: G(A)
+ self.forward()
+ # update D
+
+ self.set_requires_grad(self.nets['netD'], True)
+ optimizers['optimD'].clear_grad()
+ self.backward_D()
+ optimizers['optimD'].step()
+
+ # update G
+ self.set_requires_grad(self.nets['netD'], False)
+ optimizers['optimG'].clear_grad()
+ self.backward_G()
+ optimizers['optimG'].step()
diff --git a/ppgan/models/makeup_model.py b/ppgan/models/makeup_model.py
index 947191b053690115968aa768b8a5815076088f81..63868cb8b6d0b03095247a384ca325b3d93de5ba 100644
--- a/ppgan/models/makeup_model.py
+++ b/ppgan/models/makeup_model.py
@@ -11,6 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+
+# code was heavily based on https://github.com/wtjiang98/PSGAN
+# MIT License
+# Copyright (c) 2020 Wentao Jiang
+
import os
import numpy as np
@@ -141,6 +146,13 @@ class MakeupModel(BaseModel):
self.visual_items['fake_A'] = self.fake_A
self.visual_items['rec_B'] = self.rec_B
+ def test(self, input):
+ with paddle.no_grad():
+ return self.nets['netG'](input['image_A'], input['image_B'],
+ input['P_A'], input['P_B'],
+ input['consis_mask'], input['mask_A_aug'],
+ input['mask_B_aug'])
+
def backward_D_basic(self, netD, real, fake):
"""Calculate GAN loss for the discriminator
@@ -229,13 +241,13 @@ class MakeupModel(BaseModel):
mask_B_lip_np = mask_B_lip.numpy().squeeze()
mask_A_lip_np, mask_B_lip_np, index_A_lip, index_B_lip = mask_preprocess(
mask_A_lip_np, mask_B_lip_np)
- real_A = paddle.nn.clip((self.real_A + 1.0) / 2.0, 0.0, 1.0) * 255.0
+ real_A = paddle.clip((self.real_A + 1.0) / 2.0, 0.0, 1.0) * 255.0
real_A_np = real_A.numpy().squeeze()
- real_B = paddle.nn.clip((self.real_B + 1.0) / 2.0, 0.0, 1.0) * 255.0
+ real_B = paddle.clip((self.real_B + 1.0) / 2.0, 0.0, 1.0) * 255.0
real_B_np = real_B.numpy().squeeze()
- fake_A = paddle.nn.clip((self.fake_A + 1.0) / 2.0, 0.0, 1.0) * 255.0
+ fake_A = paddle.clip((self.fake_A + 1.0) / 2.0, 0.0, 1.0) * 255.0
fake_A_np = fake_A.numpy().squeeze()
- fake_B = paddle.nn.clip((self.fake_B + 1.0) / 2.0, 0.0, 1.0) * 255.0
+ fake_B = paddle.clip((self.fake_B + 1.0) / 2.0, 0.0, 1.0) * 255.0
fake_B_np = fake_B.numpy().squeeze()
fake_match_lip_A = hisMatch(fake_A_np, real_B_np, mask_A_lip_np,
diff --git a/ppgan/models/mpr_model.py b/ppgan/models/mpr_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..426b9c3eec16f33990dec2b85e48b8a77b1c445c
--- /dev/null
+++ b/ppgan/models/mpr_model.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .base_model import BaseModel
+from .generators.builder import build_generator
+from .criterions.builder import build_criterion
+from ..modules.init import reset_parameters, init_weights
+from ..utils.visual import tensor2img
+
+
+@MODELS.register()
+class MPRModel(BaseModel):
+ """MPR Model.
+
+ Paper: Multi-Stage Progressive Image Restoration (CVPR 2021).
+ https://arxiv.org/abs/2102.02808
+ """
+ def __init__(self, generator, char_criterion=None, edge_criterion=None):
+ """Initialize the MPR class.
+
+ Args:
+ generator (dict): config of generator.
+ char_criterion (dict): config of char criterion.
+ edge_criterion (dict): config of edge criterion.
+ """
+ super(MPRModel, self).__init__(generator)
+ self.current_iter = 1
+
+ self.nets['generator'] = build_generator(generator)
+ init_weights(self.nets['generator'])
+
+ if char_criterion:
+ self.char_criterion = build_criterion(char_criterion)
+ if edge_criterion:
+ self.edge_criterion = build_criterion(edge_criterion)
+
+ def setup_input(self, input):
+ self.target = input[0]
+ self.lq = input[1]
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_gradients()
+
+ restored = self.nets['generator'](self.lq)
+
+ loss_char = []
+ loss_edge = []
+
+ for i in range(len(restored)):
+ loss_char.append(self.char_criterion(restored[i], self.target))
+ loss_edge.append(self.edge_criterion(restored[i], self.target))
+ loss_char = paddle.stack(loss_char)
+ loss_edge = paddle.stack(loss_edge)
+ loss_char = paddle.sum(loss_char)
+ loss_edge = paddle.sum(loss_edge)
+
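+ # Total objective: the per-stage char losses are summed, and the per-stage
+ # edge losses are summed and weighted by 0.05.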
+ loss = (loss_char) + (0.05 * loss_edge)
+
+ loss.backward()
+ optims['optim'].step()
+ self.losses['loss'] = loss.numpy()
+
+ def forward(self):
+ pass
+
+ def test_iter(self, metrics=None):
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ self.output = self.nets['generator'](self.lq)[0]
+ self.visual_items['output'] = self.output
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+ for out_tensor, gt_tensor in zip(self.output, self.target):
+ out_img.append(tensor2img(out_tensor, (0., 1.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 1.)))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img)
diff --git a/ppgan/models/msvsr_model.py b/ppgan/models/msvsr_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7fe9142f4d5f5158770141ced9787b385fb509b
--- /dev/null
+++ b/ppgan/models/msvsr_model.py
@@ -0,0 +1,180 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .sr_model import BaseSRModel
+from .generators.basicvsr import ResidualBlockNoBN, PixelShufflePack, SPyNet
+from .generators.msvsr import ModifiedSPyNet
+from ..modules.init import reset_parameters
+from ..utils.visual import tensor2img
+
+
+@MODELS.register()
+class MultiStageVSRModel(BaseSRModel):
+ """PP-MSVSR Model.
+
+ Paper:
+ PP-MSVSR: Multi-Stage Video Super-Resolution, 2021
+ """
+
+ def __init__(self, generator, fix_iter, pixel_criterion=None, to_static=False,
+ image_shape=None):
+ """Initialize the PP-MSVSR class.
+
+ Args:
+ generator (dict): config of generator.
+ fix_iter (int): number of initial iterations during which the flow network (SPyNet) is kept fixed.
+ pixel_criterion (dict): config of pixel criterion.
+ """
+ super(MultiStageVSRModel, self).__init__(generator, pixel_criterion,
+ to_static=to_static,
+ image_shape=image_shape)
+ self.fix_iter = fix_iter
+ self.current_iter = 1
+ self.flag = True
+ init_basicvsr_weight(self.nets['generator'])
+ if not self.fix_iter:
+ print('Train all the parameters from the start.')
+ for name, param in self.nets['generator'].named_parameters():
+ param.trainable = True
+ if 'spynet' in name:
+ param.optimize_attr['learning_rate'] = 0.25
+
+ def setup_input(self, input):
+ self.lq = paddle.to_tensor(input['lq'])
+ self.visual_items['lq'] = self.lq[:, 0, :, :, :]
+ if 'gt' in input:
+ self.gt = paddle.to_tensor(input['gt'])
+ self.visual_items['gt'] = self.gt[:, 0, :, :, :]
+ self.image_paths = input['lq_path']
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_grad()
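+ # For the first fix_iter iterations the optical-flow module (SPyNet) is kept
+ # frozen; afterwards every parameter is trained, with the SPyNet learning
+ # rate scaled down to 0.25x.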
+ if self.fix_iter:
+ if self.current_iter == 1:
+ print('Train MSVSR with fixed spynet for', self.fix_iter,
+ 'iters.')
+ for name, param in self.nets['generator'].named_parameters():
+ if 'spynet' in name:
+ param.trainable = False
+ elif self.current_iter >= self.fix_iter + 1 and self.flag:
+ print('Train all the parameters.')
+ for name, param in self.nets['generator'].named_parameters():
+ param.trainable = True
+ if 'spynet' in name:
+ param.optimize_attr['learning_rate'] = 0.25
+ self.flag = False
+ for net in self.nets.values():
+ net.find_unused_parameters = False
+
+ output = self.nets['generator'](self.lq)
+ if isinstance(output, (list, tuple)):
+ out_stage2, output = output
+ loss_pix_stage2 = self.pixel_criterion(out_stage2, self.gt)
+ self.losses['loss_pix_stage2'] = loss_pix_stage2
+ self.visual_items['output'] = output[:, 0, :, :, :]
+ # pixel loss
+ loss_pix = self.pixel_criterion(output, self.gt)
+ self.losses['loss_pix'] = loss_pix
+
+ self.loss = sum(_value for _key, _value in self.losses.items()
+ if 'loss_pix' in _key)
+ self.losses['loss'] = self.loss
+
+ self.loss.backward()
+ optims['optim'].step()
+
+ self.current_iter += 1
+
+ # amp train with brute force implementation
+ def train_iter_amp(self, optims=None, scalers=None, amp_level='O1'):
+ optims['optim'].clear_grad()
+ if self.fix_iter:
+ if self.current_iter == 1:
+ print('Train MSVSR with fixed spynet for', self.fix_iter,
+ 'iters.')
+ for name, param in self.nets['generator'].named_parameters():
+ if 'spynet' in name:
+ param.trainable = False
+ elif self.current_iter >= self.fix_iter + 1 and self.flag:
+ print('Train all the parameters.')
+ for name, param in self.nets['generator'].named_parameters():
+ param.trainable = True
+ if 'spynet' in name:
+ param.optimize_attr['learning_rate'] = 0.25
+ self.flag = False
+ for net in self.nets.values():
+ net.find_unused_parameters = False
+
+ # put loss computation in amp context
+ with paddle.amp.auto_cast(enable=True, custom_black_list={'sqrt','scale'}, level=amp_level):
+ output = self.nets['generator'](self.lq)
+ if isinstance(output, (list, tuple)):
+ out_stage2, output = output
+ loss_pix_stage2 = self.pixel_criterion(out_stage2, self.gt)
+ self.losses['loss_pix_stage2'] = loss_pix_stage2
+ self.visual_items['output'] = output[:, 0, :, :, :]
+ # pixel loss
+ loss_pix = self.pixel_criterion(output, self.gt)
+ self.losses['loss_pix'] = loss_pix
+
+ self.loss = sum(_value for _key, _value in self.losses.items()
+ if 'loss_pix' in _key)
+ self.losses['loss'] = self.loss
+
+ scaled_loss = scalers[0].scale(self.loss)
+ scaled_loss.backward()
+ scalers[0].minimize(optims['optim'], scaled_loss)
+
+ self.current_iter += 1
+
+ def test_iter(self, metrics=None):
+ self.gt = self.gt.cpu()
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ output = self.nets['generator'](self.lq)
+ if isinstance(output, (list, tuple)):
+ out_stage1, output = output
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+
+ _, t, _, _, _ = self.gt.shape
+ for i in range(t):
+ out_tensor = output[0, i]
+ gt_tensor = self.gt[0, i]
+ out_img.append(tensor2img(out_tensor, (0., 1.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 1.)))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img, is_seq=True)
+
+
+def init_basicvsr_weight(net):
+ for m in net.children():
+ if hasattr(m,
+ 'weight') and not isinstance(m,
+ (nn.BatchNorm, nn.BatchNorm2D)):
+ reset_parameters(m)
+ continue
+
+ if (not isinstance(
+ m,
+ (ResidualBlockNoBN, PixelShufflePack, SPyNet, ModifiedSPyNet))):
+ init_basicvsr_weight(m)
diff --git a/ppgan/models/nafnet_model.py b/ppgan/models/nafnet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4c7cf193110f36a0027c5d3fe5afe58bdb6fa19
--- /dev/null
+++ b/ppgan/models/nafnet_model.py
@@ -0,0 +1,104 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .base_model import BaseModel
+from .generators.builder import build_generator
+from .criterions.builder import build_criterion
+from ..utils.visual import tensor2img
+
+
+@MODELS.register()
+class NAFNetModel(BaseModel):
+ """NAFNet Model.
+
+ Paper: Simple Baselines for Image Restoration
+ https://arxiv.org/pdf/2204.04676
+ """
+
+ def __init__(self, generator, psnr_criterion=None):
+ """Initialize the NAFNet class.
+
+ Args:
+ generator (dict): config of generator.
+ psnr_criterion (dict): config of psnr criterion.
+ """
+ super(NAFNetModel, self).__init__(generator)
+ self.current_iter = 1
+
+ self.nets['generator'] = build_generator(generator)
+
+ if psnr_criterion:
+ self.psnr_criterion = build_criterion(psnr_criterion)
+
+ def setup_input(self, input):
+ self.target = input[0]
+ self.lq = input[1]
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_gradients()
+
+ restored = self.nets['generator'](self.lq)
+
+ loss = self.psnr_criterion(restored, self.target)
+
+ loss.backward()
+ optims['optim'].step()
+ self.losses['loss'] = loss.numpy()
+
+ def forward(self):
+ pass
+
+ def test_iter(self, metrics=None):
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+ for out_tensor, gt_tensor in zip(self.output, self.target):
+ out_img.append(tensor2img(out_tensor, (0., 1.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 1.)))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img)
+
+ def export_model(self,
+ export_model=None,
+ output_dir=None,
+ inputs_size=None,
+ export_serving_model=False,
+ model_name=None):
+ shape = inputs_size[0]
+ new_model = self.nets['generator']
+ new_model.eval()
+ input_spec = [paddle.static.InputSpec(shape=shape, dtype="float32")]
+
+ static_model = paddle.jit.to_static(new_model, input_spec=input_spec)
+
+ if output_dir is None:
+ output_dir = 'inference_model'
+ if model_name is None:
+ model_name = '{}_{}'.format(self.__class__.__name__.lower(),
+ export_model[0]['name'])
+
+ paddle.jit.save(static_model, os.path.join(output_dir, model_name))
diff --git a/ppgan/models/photopen_model.py b/ppgan/models/photopen_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..af1fab4e7bd105d72ed6c19a14ec153a614f226f
--- /dev/null
+++ b/ppgan/models/photopen_model.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+from .base_model import BaseModel
+
+from .builder import MODELS
+from .generators.builder import build_generator
+from .criterions import build_criterion
+from .discriminators.builder import build_discriminator
+
+from ..modules.init import init_weights
+from ..solver import build_optimizer
+from ppgan.utils.photopen import data_onehot_pro, Dict
+
+
+@MODELS.register()
+class PhotoPenModel(BaseModel):
+ def __init__(self,
+ generator,
+ discriminator,
+ criterion,
+ label_nc,
+ contain_dontcare_label,
+ batchSize,
+ crop_size,
+ lambda_feat,
+ ):
+
+ super(PhotoPenModel, self).__init__()
+
+ opt = {
+ 'label_nc': label_nc,
+ 'contain_dontcare_label': contain_dontcare_label,
+ 'batchSize': batchSize,
+ 'crop_size': crop_size,
+ 'lambda_feat': lambda_feat,
+# 'semantic_nc': semantic_nc,
+# 'use_vae': use_vae,
+# 'nef': nef,
+ }
+ self.opt = Dict(opt)
+
+
+ # define nets
+ self.nets['net_gen'] = build_generator(generator)
+# init_weights(self.nets['net_gen'])
+ self.nets['net_des'] = build_discriminator(discriminator)
+# init_weights(self.nets['net_des'])
+ self.net_vgg = build_criterion(criterion)
+
+ def setup_input(self, input):
+ if 'img' in input.keys():
+ self.img = paddle.to_tensor(input['img'])
+ self.ins = paddle.to_tensor(input['ins'])
+ self.img_paths = input['img_path']
+
+ def forward(self):
+ self.one_hot = data_onehot_pro(self.ins, self.opt)
+ self.img_f = self.nets['net_gen'](self.one_hot)
+ self.visual_items['img_f'] = self.img_f
+
+ def backward_G(self):
+ fake_data = paddle.concat((self.one_hot, self.img_f), 1)
+ real_data = paddle.concat((self.one_hot, self.img), 1)
+ fake_and_real_data = paddle.concat((fake_data, real_data), 0)
+ pred = self.nets['net_des'](fake_and_real_data)
+
+ """generator hinge (GAN) loss"""
+ g_ganloss = 0.
+ for i in range(len(pred)):
+ pred_i = pred[i][-1][:self.opt.batchSize]
+ new_loss = -pred_i.mean() # hinge loss
+ g_ganloss += new_loss
+ g_ganloss /= len(pred)
+
+ g_featloss = 0.
+ for i in range(len(pred)):
+ for j in range(len(pred[i]) - 1):  # intermediate feature maps of every discriminator layer except the last
+ unweighted_loss = (pred[i][j][:self.opt.batchSize] - pred[i][j][self.opt.batchSize:]).abs().mean() # L1 loss
+ g_featloss += unweighted_loss * self.opt.lambda_feat / len(pred)
+
+ g_vggloss = self.net_vgg(self.img, self.img_f)
+ self.g_loss = g_ganloss + g_featloss + g_vggloss
+
+ self.g_loss.backward()
+ self.losses['g_ganloss'] = g_ganloss
+ self.losses['g_featloss'] = g_featloss
+ self.losses['g_vggloss'] = g_vggloss
+
+
+ def backward_D(self):
+ fake_data = paddle.concat((self.one_hot, self.img_f), 1)
+ real_data = paddle.concat((self.one_hot, self.img), 1)
+ fake_and_real_data = paddle.concat((fake_data, real_data), 0)
+ pred = self.nets['net_des'](fake_and_real_data)
+
+ """discriminator hinge (GAN) loss"""
+ df_ganloss = 0.
+ for i in range(len(pred)):
+ pred_i = pred[i][-1][:self.opt.batchSize]
+ new_loss = -paddle.minimum(-pred_i - 1, paddle.zeros_like(pred_i)).mean() # hinge loss
+ df_ganloss += new_loss
+ df_ganloss /= len(pred)
+
+ dr_ganloss = 0.
+ for i in range(len(pred)):
+ pred_i = pred[i][-1][self.opt.batchSize:]
+ new_loss = -paddle.minimum(pred_i - 1, paddle.zeros_like(pred_i)).mean() # hinge loss
+ dr_ganloss += new_loss
+ dr_ganloss /= len(pred)
+
+ self.d_loss = df_ganloss + dr_ganloss
+ self.d_loss.backward()
+ self.losses['df_ganloss'] = df_ganloss
+ self.losses['dr_ganloss'] = dr_ganloss
+
+
+ def train_iter(self, optimizers=None):
+ self.forward()
+ self.optimizers['optimG'].clear_grad()
+ self.backward_G()
+ self.optimizers['optimG'].step()
+
+ self.forward()
+ self.optimizers['optimD'].clear_grad()
+ self.backward_D()
+ self.optimizers['optimD'].step()
+
+ def test_iter(self, metrics=None):
+ self.eval()
+ with paddle.no_grad():
+ self.forward()
+ self.train()
+
+ def setup_optimizers(self, lr, cfg):
+ for opt_name, opt_cfg in cfg.items():
+ if opt_name == 'lr':
+ learning_rate = opt_cfg
+ continue
+ cfg_ = opt_cfg.copy()
+ net_names = cfg_.pop('net_names')
+ parameters = []
+ for net_name in net_names:
+ parameters += self.nets[net_name].parameters()
+ if opt_name == 'optimG':
+ lr = learning_rate * 4
+ else:
+ lr = learning_rate
+ self.optimizers[opt_name] = build_optimizer(
+ cfg_, lr, parameters)
+
+ return self.optimizers
diff --git a/ppgan/models/pix2pix_model.py b/ppgan/models/pix2pix_model.py
index bfb3d0b849933909fd84a851aa456cc50d45b83e..a5784e8c8b0effcc3f6834e33c193946d8bcd17a 100644
--- a/ppgan/models/pix2pix_model.py
+++ b/ppgan/models/pix2pix_model.py
@@ -13,7 +13,7 @@
# limitations under the License.
import paddle
-from .base_model import BaseModel
+from .base_model import BaseModel, apply_to_static
from .builder import MODELS
from .generators.builder import build_generator
@@ -36,7 +36,9 @@ class Pix2PixModel(BaseModel):
discriminator=None,
pixel_criterion=None,
gan_criterion=None,
- direction='a2b'):
+ direction='a2b',
+ to_static=False,
+ image_shape=None):
"""Initialize the pix2pix class.
Args:
@@ -51,11 +53,15 @@ class Pix2PixModel(BaseModel):
# define networks (both generator and discriminator)
self.nets['netG'] = build_generator(generator)
init_weights(self.nets['netG'])
+ # set @to_static for benchmark, skip this by default.
+ apply_to_static(to_static, image_shape, self.nets['netG'])
# define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
if discriminator:
self.nets['netD'] = build_discriminator(discriminator)
init_weights(self.nets['netD'])
+ # set @to_static for benchmark, skip this by default.
+ apply_to_static(to_static, image_shape, self.nets['netD'])
if pixel_criterion:
self.pixel_criterion = build_criterion(pixel_criterion)
@@ -74,10 +80,8 @@ class Pix2PixModel(BaseModel):
AtoB = self.direction == 'AtoB'
- self.real_A = paddle.fluid.dygraph.to_variable(
- input['A' if AtoB else 'B'])
- self.real_B = paddle.fluid.dygraph.to_variable(
- input['B' if AtoB else 'A'])
+ self.real_A = paddle.to_tensor(input['A' if AtoB else 'B'])
+ self.real_B = paddle.to_tensor(input['B' if AtoB else 'A'])
self.image_paths = input['A_path' if AtoB else 'B_path']
@@ -141,3 +145,12 @@ class Pix2PixModel(BaseModel):
optimizers['optimG'].clear_grad()
self.backward_G()
optimizers['optimG'].step()
+
+ def test_iter(self, metrics=None):
+ self.nets['netG'].eval()
+ self.forward()
+ with paddle.no_grad():
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(self.fake_B, self.real_B)
+ self.nets['netG'].train()
diff --git a/ppgan/models/prenet_model.py b/ppgan/models/prenet_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..2926f5f7a3f32b7e917de60919092a3114b959f4
--- /dev/null
+++ b/ppgan/models/prenet_model.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .sr_model import BaseSRModel
+from .generators.iconvsr import EDVRFeatureExtractor
+from .generators.basicvsr import ResidualBlockNoBN, PixelShufflePack, SPyNet
+from ..modules.init import reset_parameters
+from ..utils.visual import tensor2img
+
+
+@MODELS.register()
+class PReNetModel(BaseSRModel):
+ """PReNet Model.
+
+ Paper: Progressive Image Deraining Networks: A Better and Simpler Baseline (CVPR 2019)
+ """
+
+ def __init__(self, generator, pixel_criterion=None):
+ """Initialize the PReNet class.
+
+ Args:
+ generator (dict): config of generator.
+ pixel_criterion (dict): config of pixel criterion.
+ """
+ super(PReNetModel, self).__init__(generator, pixel_criterion)
+ self.current_iter = 1
+ self.flag = True
+
+ def setup_input(self, input):
+ self.lq = input['lq']
+ self.visual_items['lq'] = self.lq[0, :, :, :]
+ if 'gt' in input:
+ self.gt = input['gt']
+ self.visual_items['gt'] = self.gt[0, :, :, :]
+ self.image_paths = input['lq_path']
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_grad()
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output[0, :, :, :]
+ # pixel loss
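+ # Note: the pixel criterion is assumed to be a similarity score (e.g. SSIM,
+ # as in the reference PReNet config), so its negative is minimized here.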
+ loss_pixel = -self.pixel_criterion(self.output, self.gt)
+ loss_pixel.backward()
+ optims['optim'].step()
+
+ self.losses['loss_pixel'] = loss_pixel
+ self.current_iter += 1
+
+ def test_iter(self, metrics=None):
+ self.gt = self.gt.cpu()
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = output[0, :, :, :].cpu()
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+
+ out_tensor = output[0]
+ gt_tensor = self.gt[0]
+ out_img = tensor2img(out_tensor, (0., 1.))
+ gt_img = tensor2img(gt_tensor, (0., 1.))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img, is_seq=True)
diff --git a/ppgan/models/rcan_model.py b/ppgan/models/rcan_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..45061f4a7d913c815166af6abd9b9ec96d74b527
--- /dev/null
+++ b/ppgan/models/rcan_model.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from .generators.builder import build_generator
+from .criterions.builder import build_criterion
+from .base_model import BaseModel
+from .builder import MODELS
+from ..utils.visual import tensor2img
+from ..modules.init import reset_parameters
+
+
+@MODELS.register()
+class RCANModel(BaseModel):
+ """RCAN model for single image super-resolution.
+ """
+
+ def __init__(self, generator, pixel_criterion=None, use_init_weight=False):
+ """
+ Args:
+ generator (dict): config of generator.
+ pixel_criterion (dict): config of pixel criterion.
+ """
+ super(RCANModel, self).__init__()
+
+ self.nets['generator'] = build_generator(generator)
+ self.error_last = 1e8
+ self.batch = 0
+ if pixel_criterion:
+ self.pixel_criterion = build_criterion(pixel_criterion)
+ if use_init_weight:
+ init_sr_weight(self.nets['generator'])
+
+ def setup_input(self, input):
+ self.lq = paddle.to_tensor(input['lq'])
+ self.visual_items['lq'] = self.lq
+ if 'gt' in input:
+ self.gt = paddle.to_tensor(input['gt'])
+ self.visual_items['gt'] = self.gt
+ self.image_paths = input['lq_path']
+
+ def forward(self):
+ pass
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_grad()
+
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ # pixel loss
+ loss_pixel = self.pixel_criterion(self.output, self.gt)
+ self.losses['loss_pixel'] = loss_pixel
+
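+ # Skip the parameter update for batches whose loss explodes relative to the
+ # reference error_last, which is refreshed every 1000 batches.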
+ skip_threshold = 1e6
+
+ if loss_pixel.item() < skip_threshold * self.error_last:
+ loss_pixel.backward()
+ optims['optim'].step()
+ else:
+ print('Skip this batch {}! (Loss: {})'.format(
+ self.batch + 1, loss_pixel.item()))
+ self.batch += 1
+
+ if self.batch % 1000 == 0:
+ self.error_last = loss_pixel.item() / 1000
+ print("update error_last:{}".format(self.error_last))
+
+ def test_iter(self, metrics=None):
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+ for out_tensor, gt_tensor in zip(self.output, self.gt):
+ out_img.append(tensor2img(out_tensor, (0., 255.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 255.)))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img)
+
+
+def init_sr_weight(net):
+
+ def reset_func(m):
+ if hasattr(m, 'weight') and (not isinstance(
+ m, (nn.BatchNorm, nn.BatchNorm2D))):
+ reset_parameters(m)
+
+ net.apply(reset_func)
diff --git a/ppgan/models/singan_model.py b/ppgan/models/singan_model.py
new file mode 100755
index 0000000000000000000000000000000000000000..005d474c0ee836be52754550eb5562e45603e152
--- /dev/null
+++ b/ppgan/models/singan_model.py
@@ -0,0 +1,337 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import math
+import warnings
+from collections import OrderedDict
+from sklearn.cluster import KMeans
+
+import paddle
+import paddle.nn.functional as F
+import paddle.vision.transforms as T
+
+from .base_model import BaseModel
+from .builder import MODELS
+from .generators.builder import build_generator
+from .criterions.builder import build_criterion
+from .discriminators.builder import build_discriminator
+from ..solver import build_lr_scheduler, build_optimizer
+
+warnings.filterwarnings('ignore', category=DeprecationWarning)
+warnings.filterwarnings('ignore', category=UserWarning)
+
+
+def pad_shape(shape, pad_size):
+ shape[-2] += 2 * pad_size
+ shape[-1] += 2 * pad_size
+ return shape
+
+
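+# Color quantization via k-means over RGB pixels; used by the finetune
+# (paint-to-image style) stage to map images onto a palette of color_num centers.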
+def quant(x, num):
+ n, c, h, w = x.shape
+ kmeans = KMeans(num, random_state=0).fit(
+ x.transpose([0, 2, 3, 1]).reshape([-1, c]))
+ centers = kmeans.cluster_centers_
+ x = centers[kmeans.labels_].reshape([n, h, w, c]).transpose([0, 3, 1, 2])
+ return paddle.to_tensor(x, 'float32'), centers
+
+
+def quant_to_centers(x, centers):
+ n, c, h, w = x.shape
+ num = centers.shape[0]
+ kmeans = KMeans(num, init=centers,
+ n_init=1).fit(x.transpose([0, 2, 3, 1]).reshape([-1, c]))
+ x = centers[kmeans.labels_].reshape([n, h, w, c]).transpose([0, 3, 1, 2])
+ return paddle.to_tensor(x, 'float32')
+
+
+@MODELS.register()
+class SinGANModel(BaseModel):
+
+ def __init__(self,
+ generator,
+ discriminator,
+ gan_criterion=None,
+ recon_criterion=None,
+ gp_criterion=None,
+ train_image=None,
+ scale_factor=0.75,
+ min_size=25,
+ is_finetune=False,
+ finetune_scale=1,
+ color_num=5,
+ gen_iters=3,
+ disc_iters=3,
+ noise_amp_init=0.1):
+ super(SinGANModel, self).__init__()
+
+ # setup config
+ self.gen_iters = gen_iters
+ self.disc_iters = disc_iters
+ self.min_size = min_size
+ self.is_finetune = is_finetune
+ self.noise_amp_init = noise_amp_init
+ self.train_image = T.Compose([T.Transpose(),
+ T.Normalize(127.5, 127.5)])(cv2.cvtColor(
+ cv2.imread(train_image,
+ cv2.IMREAD_COLOR),
+ cv2.COLOR_BGR2RGB))
+ self.train_image = paddle.to_tensor(self.train_image).unsqueeze(0)
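+ # Build the scale pyramid: the number of scales is chosen so the shortest
+ # side of the coarsest image is about min_size pixels, and the scale factor
+ # is then re-derived so the geometric progression from the full-resolution
+ # image ends exactly at min_size.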
+ self.scale_num = math.ceil(
+ math.log(self.min_size / min(self.train_image.shape[-2:]),
+ scale_factor)) + 1
+ self.scale_factor = math.pow(
+ self.min_size / min(self.train_image.shape[-2:]),
+ 1 / (self.scale_num - 1))
+ self.reals = [
+ F.interpolate(self.train_image, None, self.scale_factor**i,
+ 'bicubic') for i in range(self.scale_num - 1, -1, -1)
+ ]
+
+ # build generator
+ generator['scale_num'] = self.scale_num
+ generator['coarsest_shape'] = self.reals[0].shape
+ self.nets['netG'] = build_generator(generator)
+ self.niose_pad_size = 0 if generator.get('noise_zero_pad', True) \
+ else self.nets['netG']._pad_size
+ self.nets['netG'].scale_factor = paddle.to_tensor(
+ self.scale_factor, 'float32')
+
+ # build discriminator
+ nfc_init = discriminator.pop('nfc_init', 32)
+ min_nfc_init = discriminator.pop('min_nfc_init', 32)
+ for i in range(self.scale_num):
+ discriminator['nfc'] = min(nfc_init * pow(2, math.floor(i / 4)),
+ 128)
+ discriminator['min_nfc'] = min(
+ min_nfc_init * pow(2, math.floor(i / 4)), 128)
+ self.nets[f'netD{i}'] = build_discriminator(discriminator)
+
+ # build criterion
+ self.gan_criterion = build_criterion(gan_criterion)
+ self.recon_criterion = build_criterion(recon_criterion)
+ self.gp_criterion = build_criterion(gp_criterion)
+
+ if self.is_finetune:
+ self.finetune_scale = finetune_scale
+ self.quant_real, self.quant_centers = quant(
+ self.reals[finetune_scale], color_num)
+
+ # setup training config
+ self.lr_schedulers = OrderedDict()
+ self.current_scale = (finetune_scale if self.is_finetune else 0) - 1
+ self.current_iter = 0
+
+ def set_total_iter(self, total_iter):
+ super().set_total_iter(total_iter)
+ if self.is_finetune:
+ self.scale_iters = total_iter
+ else:
+ self.scale_iters = math.ceil(total_iter / self.scale_num)
+
+ def setup_lr_schedulers(self, cfg):
+ for i in range(self.scale_num):
+ self.lr_schedulers[f"lr{i}"] = build_lr_scheduler(cfg)
+ return self.lr_schedulers
+
+ def setup_optimizers(self, lr_schedulers, cfg):
+ for i in range(self.scale_num):
+ self.optimizers[f'optim_netG{i}'] = build_optimizer(
+ cfg['optimizer_G'], lr_schedulers[f"lr{i}"],
+ self.nets[f'netG'].generators[i].parameters())
+ self.optimizers[f'optim_netD{i}'] = build_optimizer(
+ cfg['optimizer_D'], lr_schedulers[f"lr{i}"],
+ self.nets[f'netD{i}'].parameters())
+ return self.optimizers
+
+ def setup_input(self, input):
+ pass
+
+ def backward_D(self):
+ self.loss_D_real = self.gan_criterion(self.pred_real, True, True)
+ self.loss_D_fake = self.gan_criterion(self.pred_fake, False, True)
+ self.loss_D_gp = self.gp_criterion(
+ self.nets[f'netD{self.current_scale}'], self.real_img,
+ self.fake_img)
+ self.loss_D = self.loss_D_real + self.loss_D_fake + self.loss_D_gp
+ self.loss_D.backward()
+
+ self.losses[f'scale{self.current_scale}/D_total_loss'] = self.loss_D
+ self.losses[f'scale{self.current_scale}/D_real_loss'] = self.loss_D_real
+ self.losses[f'scale{self.current_scale}/D_fake_loss'] = self.loss_D_fake
+ self.losses[
+ f'scale{self.current_scale}/D_gradient_penalty'] = self.loss_D_gp
+
+ def backward_G(self):
+ self.loss_G_gan = self.gan_criterion(self.pred_fake, True, False)
+ self.loss_G_recon = self.recon_criterion(self.recon_img, self.real_img)
+ self.loss_G = self.loss_G_gan + self.loss_G_recon
+ self.loss_G.backward()
+
+ self.losses[f'scale{self.current_scale}/G_adv_loss'] = self.loss_G_gan
+ self.losses[
+ f'scale{self.current_scale}/G_recon_loss'] = self.loss_G_recon
+
+ def scale_prepare(self):
+ self.real_img = self.reals[self.current_scale]
+ self.lr_scheduler = self.lr_schedulers[f"lr{self.current_scale}"]
+ for i in range(self.current_scale):
+ self.optimizers.pop(f'optim_netG{i}', None)
+ self.optimizers.pop(f'optim_netD{i}', None)
+ self.losses.clear()
+ self.visual_items.clear()
+ self.visual_items[f'real_img_scale{self.current_scale}'] = self.real_img
+ if self.is_finetune:
+ self.visual_items['quant_real'] = self.quant_real
+
+ self.recon_prev = paddle.zeros_like(self.reals[0])
+ if self.current_scale > 0:
+ z_pyramid = []
+ for i in range(self.current_scale):
+ if i == 0:
+ z = self.nets['netG'].z_fixed
+ else:
+ z = paddle.zeros(
+ pad_shape(self.reals[i].shape, self.niose_pad_size))
+ z_pyramid.append(z)
+ self.recon_prev = self.nets['netG'](z_pyramid, self.recon_prev,
+ self.current_scale - 1,
+ 0).detach()
+ self.recon_prev = F.interpolate(self.recon_prev,
+ self.real_img.shape[-2:], None,
+ 'bicubic')
+ if self.is_finetune:
+ self.recon_prev = quant_to_centers(self.recon_prev,
+ self.quant_centers)
+ self.nets['netG'].sigma[self.current_scale] = F.mse_loss(
+ self.real_img, self.recon_prev).sqrt() * self.noise_amp_init
+
+ for i in range(self.scale_num):
+ self.set_requires_grad(self.nets['netG'].generators[i],
+ i == self.current_scale)
+
+ def forward(self):
+ if not self.is_finetune:
+ self.fake_img = self.nets['netG'](self.z_pyramid,
+ paddle.zeros(
+ pad_shape(
+ self.z_pyramid[0].shape,
+ -self.niose_pad_size)),
+ self.current_scale, 0)
+ else:
+ x_prev = self.nets['netG'](self.z_pyramid[:self.finetune_scale],
+ paddle.zeros(
+ pad_shape(self.z_pyramid[0].shape,
+ -self.niose_pad_size)),
+ self.finetune_scale - 1, 0)
+ x_prev = F.interpolate(
+ x_prev, self.z_pyramid[self.finetune_scale].shape[-2:], None,
+ 'bicubic')
+ x_prev_quant = quant_to_centers(x_prev, self.quant_centers)
+ self.fake_img = self.nets['netG'](
+ self.z_pyramid[self.finetune_scale:], x_prev_quant,
+ self.current_scale, self.finetune_scale)
+
+ self.recon_img = self.nets['netG'](
+ [(paddle.randn if self.current_scale == 0 else paddle.zeros)(
+ pad_shape(self.real_img.shape, self.niose_pad_size))],
+ self.recon_prev, self.current_scale, self.current_scale)
+
+ self.pred_real = self.nets[f'netD{self.current_scale}'](self.real_img)
+ self.pred_fake = self.nets[f'netD{self.current_scale}'](
+ self.fake_img.detach() if self.update_D else self.fake_img)
+
+ self.visual_items[f'fake_img_scale{self.current_scale}'] = self.fake_img
+ self.visual_items[
+ f'recon_img_scale{self.current_scale}'] = self.recon_img
+ if self.is_finetune:
+ self.visual_items[f'prev_img_scale{self.current_scale}'] = x_prev
+ self.visual_items[
+ f'quant_prev_img_scale{self.current_scale}'] = x_prev_quant
+
+ def train_iter(self, optimizers=None):
+ if self.current_iter % self.scale_iters == 0:
+ self.current_scale += 1
+ self.scale_prepare()
+
+ self.z_pyramid = [
+ paddle.randn(pad_shape(self.reals[i].shape, self.niose_pad_size))
+ for i in range(self.current_scale + 1)
+ ]
+
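+ # Within each scale, alternate disc_iters discriminator steps with gen_iters
+ # generator steps; current_iter modulo the combined cycle decides which side
+ # is updated this iteration.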
+ self.update_D = (self.current_iter %
+ (self.disc_iters + self.gen_iters) < self.disc_iters)
+ self.set_requires_grad(self.nets[f'netD{self.current_scale}'],
+ self.update_D)
+ self.forward()
+ if self.update_D:
+ optimizers[f'optim_netD{self.current_scale}'].clear_grad()
+ self.backward_D()
+ optimizers[f'optim_netD{self.current_scale}'].step()
+ else:
+ optimizers[f'optim_netG{self.current_scale}'].clear_grad()
+ self.backward_G()
+ optimizers[f'optim_netG{self.current_scale}'].step()
+
+ self.current_iter += 1
+
+ def test_iter(self, metrics=None):
+ z_pyramid = [
+ paddle.randn(pad_shape(self.reals[i].shape, self.niose_pad_size))
+ for i in range(self.scale_num)
+ ]
+ self.nets['netG'].eval()
+ fake_img = self.nets['netG'](z_pyramid,
+ paddle.zeros(
+ pad_shape(z_pyramid[0].shape,
+ -self.niose_pad_size)),
+ self.scale_num - 1, 0)
+ self.visual_items['fake_img_test'] = fake_img
+ with paddle.no_grad():
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(fake_img, self.train_image)
+ self.nets['netG'].train()
+
+ class InferGenerator(paddle.nn.Layer):
+
+ def set_config(self, generator, noise_shapes, scale_num):
+ self.generator = generator
+ self.noise_shapes = noise_shapes
+ self.scale_num = scale_num
+
+ def forward(self, x):
+ coarsest_shape = self.generator._coarsest_shape
+ z_pyramid = [paddle.randn(shp) for shp in self.noise_shapes]
+ x_init = paddle.zeros(coarsest_shape)
+ out = self.generator(z_pyramid, x_init, self.scale_num - 1, 0)
+ return out
+
+ def export_model(self,
+ export_model=None,
+ output_dir=None,
+ inputs_size=None,
+ export_serving_model=False,
+ model_name=None):
+ noise_shapes = [
+ pad_shape(x.shape, self.niose_pad_size) for x in self.reals
+ ]
+ infer_generator = self.InferGenerator()
+ infer_generator.set_config(self.nets['netG'], noise_shapes,
+ self.scale_num)
+ paddle.jit.save(infer_generator,
+ os.path.join(output_dir, "singan_random_sample"),
+ input_spec=[1])
diff --git a/ppgan/models/sr_model.py b/ppgan/models/sr_model.py
index 565dc649f6a49d67e67b0ae6fdc5a25f25cf9d2e..7a0db5513bd52e3071ea95ed84a9e8b1c61fc40f 100644
--- a/ppgan/models/sr_model.py
+++ b/ppgan/models/sr_model.py
@@ -17,16 +17,19 @@ import paddle.nn as nn
from .generators.builder import build_generator
from .criterions.builder import build_criterion
-from .base_model import BaseModel
+from .base_model import BaseModel, apply_to_static
from .builder import MODELS
from ..utils.visual import tensor2img
+from ..modules.init import reset_parameters
@MODELS.register()
class BaseSRModel(BaseModel):
"""Base SR model for single image super-resolution.
"""
- def __init__(self, generator, pixel_criterion=None):
+
+ def __init__(self, generator, pixel_criterion=None, use_init_weight=False, to_static=False,
+ image_shape=None):
"""
Args:
generator (dict): config of generator.
@@ -35,15 +38,19 @@ class BaseSRModel(BaseModel):
super(BaseSRModel, self).__init__()
self.nets['generator'] = build_generator(generator)
+ # set @to_static for benchmark, skip this by default.
+ apply_to_static(to_static, image_shape, self.nets['generator'])
if pixel_criterion:
self.pixel_criterion = build_criterion(pixel_criterion)
+ if use_init_weight:
+ init_sr_weight(self.nets['generator'])
def setup_input(self, input):
- self.lq = paddle.fluid.dygraph.to_variable(input['lq'])
+ self.lq = paddle.to_tensor(input['lq'])
self.visual_items['lq'] = self.lq
if 'gt' in input:
- self.gt = paddle.fluid.dygraph.to_variable(input['gt'])
+ self.gt = paddle.to_tensor(input['gt'])
self.visual_items['gt'] = self.gt
self.image_paths = input['lq_path']
@@ -62,6 +69,22 @@ class BaseSRModel(BaseModel):
loss_pixel.backward()
optims['optim'].step()
+ # amp training
+ def train_iter_amp(self, optims=None, scalers=None, amp_level='O1'):
+ optims['optim'].clear_grad()
+
+ # put fwd and loss computation in amp context
+ with paddle.amp.auto_cast(enable=True, level=amp_level):
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ # pixel loss
+ loss_pixel = self.pixel_criterion(self.output, self.gt)
+ self.losses['loss_pixel'] = loss_pixel
+
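+        # scale the loss before backward to avoid fp16 gradient underflow; scaler.minimize unscales and applies the step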
+ scaled_loss_pixel = scalers[0].scale(loss_pixel)
+ scaled_loss_pixel.backward()
+ scalers[0].minimize(optims['optim'], scaled_loss_pixel)
+
def test_iter(self, metrics=None):
self.nets['generator'].eval()
with paddle.no_grad():
@@ -78,3 +101,13 @@ class BaseSRModel(BaseModel):
if metrics is not None:
for metric in metrics.values():
metric.update(out_img, gt_img)
+
+
+def init_sr_weight(net):
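+    # re-initialize every layer that has a weight (BatchNorm layers excluded) via reset_parameters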
+
+ def reset_func(m):
+ if hasattr(m, 'weight') and (not isinstance(
+ m, (nn.BatchNorm, nn.BatchNorm2D))):
+ reset_parameters(m)
+
+ net.apply(reset_func)
diff --git a/ppgan/models/starganv2_model.py b/ppgan/models/starganv2_model.py
new file mode 100755
index 0000000000000000000000000000000000000000..3203264a8eec7f489d4e6fbb667915f44e2d153f
--- /dev/null
+++ b/ppgan/models/starganv2_model.py
@@ -0,0 +1,379 @@
+# code was heavily based on https://github.com/clovaai/stargan-v2
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/clovaai/stargan-v2#license
+
+from .base_model import BaseModel
+
+from paddle import nn
+import paddle
+import paddle.nn.functional as F
+from .builder import MODELS
+from .generators.builder import build_generator
+from .discriminators.builder import build_discriminator
+from ..modules.init import kaiming_normal_, constant_
+from ppgan.utils.visual import make_grid, tensor2img
+
+import numpy as np
+
+
+def translate_using_reference(nets, w_hpf, x_src, x_ref, y_ref):
+ N, C, H, W = x_src.shape
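+    # an all-ones (white) image is prepended so the top-left cell of the output grid stays blank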
+ wb = paddle.to_tensor(np.ones((1, C, H, W))).astype('float32')
+ x_src_with_wb = paddle.concat([wb, x_src], axis=0)
+
+ masks = nets['fan'].get_heatmap(x_src) if w_hpf > 0 else None
+ s_ref = nets['style_encoder'](x_ref, y_ref)
+ s_ref_list = paddle.unsqueeze(s_ref, axis=[1])
+ s_ref_lists = []
+ for _ in range(N):
+ s_ref_lists.append(s_ref_list)
+ s_ref_list = paddle.stack(s_ref_lists, axis=1)
+ s_ref_list = paddle.reshape(
+ s_ref_list,
+ (s_ref_list.shape[0], s_ref_list.shape[1], s_ref_list.shape[3]))
+ x_concat = [x_src_with_wb]
+ for i, s_ref in enumerate(s_ref_list):
+ x_fake = nets['generator'](x_src, s_ref, masks=masks)
+ x_fake_with_ref = paddle.concat([x_ref[i:i + 1], x_fake], axis=0)
+ x_concat += [x_fake_with_ref]
+
+ x_concat = paddle.concat(x_concat, axis=0)
+ img = tensor2img(make_grid(x_concat, nrow=N + 1, range=(0, 1)))
+ del x_concat
+ return img
+
+
+def compute_d_loss(nets,
+ lambda_reg,
+ x_real,
+ y_org,
+ y_trg,
+ z_trg=None,
+ x_ref=None,
+ masks=None):
+ assert (z_trg is None) != (x_ref is None)
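+    # exactly one of z_trg (latent code) or x_ref (reference image) must provide the target style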
+ # with real images
+ x_real.stop_gradient = False
+ out = nets['discriminator'](x_real, y_org)
+ loss_real = adv_loss(out, 1)
+ loss_reg = r1_reg(out, x_real)
+
+ # with fake images
+ with paddle.no_grad():
+ if z_trg is not None:
+ s_trg = nets['mapping_network'](z_trg, y_trg)
+ else: # x_ref is not None
+ s_trg = nets['style_encoder'](x_ref, y_trg)
+
+ x_fake = nets['generator'](x_real, s_trg, masks=masks)
+ out = nets['discriminator'](x_fake, y_trg)
+ loss_fake = adv_loss(out, 0)
+
+ loss = loss_real + loss_fake + lambda_reg * loss_reg
+ return loss, {
+ 'real': loss_real.numpy(),
+ 'fake': loss_fake.numpy(),
+ 'reg': loss_reg.numpy()
+ }
+
+
+def adv_loss(logits, target):
+ assert target in [1, 0]
+ targets = paddle.full_like(logits, fill_value=target)
+ loss = F.binary_cross_entropy_with_logits(logits, targets)
+ return loss
+
+
+def r1_reg(d_out, x_in):
+ # zero-centered gradient penalty for real images
+ batch_size = x_in.shape[0]
+ grad_dout = paddle.grad(outputs=d_out.sum(),
+ inputs=x_in,
+ create_graph=True,
+ retain_graph=True,
+ only_inputs=True)[0]
+ grad_dout2 = grad_dout.pow(2)
+ assert (grad_dout2.shape == x_in.shape)
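+    # R1 penalty: 0.5 * E[ ||grad_x D(x)||^2 ] averaged over the batch of real images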
+ reg = 0.5 * paddle.reshape(grad_dout2, (batch_size, -1)).sum(1).mean(0)
+ return reg
+
+
+def soft_update(source, target, beta=1.0):
+ assert 0.0 <= beta <= 1.0
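+    # moving-average update: target <- beta * source + (1 - beta) * target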
+
+ if isinstance(source, paddle.DataParallel):
+ source = source._layers
+
+ target_model_map = dict(target.named_parameters())
+ for param_name, source_param in source.named_parameters():
+ target_param = target_model_map[param_name]
+ target_param.set_value(beta * source_param +
+ (1.0 - beta) * target_param)
+
+
+def dump_model(model):
+ params = {}
+ for k in model.state_dict().keys():
+ if k.endswith('.scale'):
+ params[k] = model.state_dict()[k].shape
+ return params
+
+
+def compute_g_loss(nets,
+ w_hpf,
+ lambda_sty,
+ lambda_ds,
+ lambda_cyc,
+ x_real,
+ y_org,
+ y_trg,
+ z_trgs=None,
+ x_refs=None,
+ masks=None):
+ assert (z_trgs is None) != (x_refs is None)
+ if z_trgs is not None:
+ z_trg, z_trg2 = z_trgs
+ if x_refs is not None:
+ x_ref, x_ref2 = x_refs
+
+ # adversarial loss
+ if z_trgs is not None:
+ s_trg = nets['mapping_network'](z_trg, y_trg)
+ else:
+ s_trg = nets['style_encoder'](x_ref, y_trg)
+
+ x_fake = nets['generator'](x_real, s_trg, masks=masks)
+ out = nets['discriminator'](x_fake, y_trg)
+ loss_adv = adv_loss(out, 1)
+
+ # style reconstruction loss
+ s_pred = nets['style_encoder'](x_fake, y_trg)
+ loss_sty = paddle.mean(paddle.abs(s_pred - s_trg))
+
+ # diversity sensitive loss
+ if z_trgs is not None:
+ s_trg2 = nets['mapping_network'](z_trg2, y_trg)
+ else:
+ s_trg2 = nets['style_encoder'](x_ref2, y_trg)
+ x_fake2 = nets['generator'](x_real, s_trg2, masks=masks)
+ loss_ds = paddle.mean(paddle.abs(x_fake - x_fake2))
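+    # loss_ds is subtracted in the total loss below, pushing the generator towards diverse outputs per style code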
+
+ # cycle-consistency loss
+ if w_hpf > 0:
+ if isinstance(nets['fan'], paddle.DataParallel):
+ masks = nets['fan']._layers.get_heatmap(x_fake)
+ else:
+ masks = nets['fan'].get_heatmap(x_fake)
+ else:
+ masks = None
+
+ s_org = nets['style_encoder'](x_real, y_org)
+ x_rec = nets['generator'](x_fake, s_org, masks=masks)
+ loss_cyc = paddle.mean(paddle.abs(x_rec - x_real))
+
+ loss = loss_adv + lambda_sty * loss_sty \
+ - lambda_ds * loss_ds + lambda_cyc * loss_cyc
+ return loss, {
+ 'adv': loss_adv.numpy(),
+ 'sty': loss_sty.numpy(),
+        'ds': loss_ds.numpy(),
+ 'cyc': loss_cyc.numpy()
+ }
+
+
+def he_init(module):
+ if isinstance(module, nn.Conv2D):
+ kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
+ if module.bias is not None:
+ constant_(module.bias, 0)
+ if isinstance(module, nn.Linear):
+ kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
+ if module.bias is not None:
+ constant_(module.bias, 0)
+
+
+@MODELS.register()
+class StarGANv2Model(BaseModel):
+ def __init__(
+ self,
+ generator,
+ style=None,
+ mapping=None,
+ discriminator=None,
+ fan=None,
+ latent_dim=16,
+ lambda_reg=1,
+ lambda_sty=1,
+ lambda_ds=1,
+ lambda_cyc=1,
+ ):
+ super(StarGANv2Model, self).__init__()
+ self.w_hpf = generator['w_hpf']
+ self.nets_ema = {}
+ self.nets['generator'] = build_generator(generator)
+ self.nets_ema['generator'] = build_generator(generator)
+ self.nets['style_encoder'] = build_generator(style)
+ self.nets_ema['style_encoder'] = build_generator(style)
+ self.nets['mapping_network'] = build_generator(mapping)
+ self.nets_ema['mapping_network'] = build_generator(mapping)
+ if discriminator:
+ self.nets['discriminator'] = build_discriminator(discriminator)
+ if self.w_hpf > 0:
+ fan_model = build_generator(fan)
+ fan_model.eval()
+ self.nets['fan'] = fan_model
+ self.nets_ema['fan'] = fan_model
+ self.latent_dim = latent_dim
+ self.lambda_reg = lambda_reg
+ self.lambda_sty = lambda_sty
+ self.lambda_ds = lambda_ds
+ self.lambda_cyc = lambda_cyc
+
+ self.nets['generator'].apply(he_init)
+ self.nets['style_encoder'].apply(he_init)
+ self.nets['mapping_network'].apply(he_init)
+ self.nets['discriminator'].apply(he_init)
+
+ # remember the initial value of ds weight
+ self.initial_lambda_ds = self.lambda_ds
+
+ def setup_input(self, input):
+ """Unpack input data from the dataloader and perform necessary pre-processing steps.
+
+ Args:
+ input (dict): include the data itself and its metadata information.
+        """
+ self.input = input
+ self.input['z_trg'] = paddle.randn(
+ (input['src'].shape[0], self.latent_dim))
+ self.input['z_trg2'] = paddle.randn(
+ (input['src'].shape[0], self.latent_dim))
+
+ def forward(self):
+ """Run forward pass; called by both functions and ."""
+ pass
+
+ def _reset_grad(self, optims):
+ for optim in optims.values():
+ optim.clear_gradients()
+
+ def train_iter(self, optimizers=None):
+ #TODO
+ x_real, y_org = self.input['src'], self.input['src_cls']
+ x_ref, x_ref2, y_trg = self.input['ref'], self.input[
+ 'ref2'], self.input['ref_cls']
+ z_trg, z_trg2 = self.input['z_trg'], self.input['z_trg2']
+
+ if self.w_hpf > 0:
+ if isinstance(self.nets['fan'], paddle.DataParallel):
+ masks = self.nets['fan']._layers.get_heatmap(x_real)
+ else:
+ masks = self.nets['fan'].get_heatmap(x_real)
+ else:
+ masks = None
+
+ # train the discriminator
+ d_loss, d_losses_latent = compute_d_loss(self.nets,
+ self.lambda_reg,
+ x_real,
+ y_org,
+ y_trg,
+ z_trg=z_trg,
+ masks=masks)
+ self._reset_grad(optimizers)
+ d_loss.backward()
+        optimizers['discriminator'].step()
+
+ d_loss, d_losses_ref = compute_d_loss(self.nets,
+ self.lambda_reg,
+ x_real,
+ y_org,
+ y_trg,
+ x_ref=x_ref,
+ masks=masks)
+ self._reset_grad(optimizers)
+ d_loss.backward()
+ optimizers['discriminator'].step()
+
+ # train the generator
+ g_loss, g_losses_latent = compute_g_loss(self.nets,
+ self.w_hpf,
+ self.lambda_sty,
+ self.lambda_ds,
+ self.lambda_cyc,
+ x_real,
+ y_org,
+ y_trg,
+ z_trgs=[z_trg, z_trg2],
+ masks=masks)
+ self._reset_grad(optimizers)
+ g_loss.backward()
+ optimizers['generator'].step()
+ optimizers['mapping_network'].step()
+ optimizers['style_encoder'].step()
+
+ g_loss, g_losses_ref = compute_g_loss(self.nets,
+ self.w_hpf,
+ self.lambda_sty,
+ self.lambda_ds,
+ self.lambda_cyc,
+ x_real,
+ y_org,
+ y_trg,
+ x_refs=[x_ref, x_ref2],
+ masks=masks)
+ self._reset_grad(optimizers)
+ g_loss.backward()
+ optimizers['generator'].step()
+
+ # compute moving average of network parameters
+ soft_update(self.nets['generator'],
+ self.nets_ema['generator'],
+ beta=0.999)
+ soft_update(self.nets['mapping_network'],
+ self.nets_ema['mapping_network'],
+ beta=0.999)
+ soft_update(self.nets['style_encoder'],
+ self.nets_ema['style_encoder'],
+ beta=0.999)
+
+ # decay weight for diversity sensitive loss
+ if self.lambda_ds > 0:
+ self.lambda_ds -= (self.initial_lambda_ds / self.total_iter)
+
+ for loss, prefix in zip(
+ [d_losses_latent, d_losses_ref, g_losses_latent, g_losses_ref],
+ ['D/latent_', 'D/ref_', 'G/latent_', 'G/ref_']):
+ for key, value in loss.items():
+ self.losses[prefix + key] = value
+ self.losses['G/lambda_ds'] = self.lambda_ds
+ self.losses['Total iter'] = int(self.total_iter)
+
+ def test_iter(self, metrics=None):
+ #TODO
+ self.nets_ema['generator'].eval()
+ self.nets_ema['style_encoder'].eval()
+ soft_update(self.nets['generator'],
+ self.nets_ema['generator'],
+ beta=0.999)
+ soft_update(self.nets['mapping_network'],
+ self.nets_ema['mapping_network'],
+ beta=0.999)
+ soft_update(self.nets['style_encoder'],
+ self.nets_ema['style_encoder'],
+ beta=0.999)
+ src_img = self.input['src']
+ ref_img = self.input['ref']
+ ref_label = self.input['ref_cls']
+ with paddle.no_grad():
+ img = translate_using_reference(
+ self.nets_ema, self.w_hpf,
+ paddle.to_tensor(src_img).astype('float32'),
+ paddle.to_tensor(ref_img).astype('float32'),
+ paddle.to_tensor(ref_label).astype('float32'))
+ self.visual_items['reference'] = img
+ self.nets_ema['generator'].train()
+ self.nets_ema['style_encoder'].train()
diff --git a/ppgan/models/styleganv2_model.py b/ppgan/models/styleganv2_model.py
index 1f10ed0393d9eacce79b1e03350bca7dc1da96eb..28452587b2eb524fdf4048c439dfc0a6a5ae4d90 100644
--- a/ppgan/models/styleganv2_model.py
+++ b/ppgan/models/styleganv2_model.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
import math
import random
import paddle
@@ -24,6 +25,7 @@ from .discriminators.builder import build_discriminator
from ..solver import build_lr_scheduler, build_optimizer
+
def r1_penalty(real_pred, real_img):
"""
R1 regularization for discriminator. The core idea is to
@@ -79,7 +81,8 @@ class StyleGAN2Model(BaseModel):
r1_reg_weight=10.,
path_reg_weight=2.,
path_batch_shrink=2.,
- params=None):
+ params=None,
+ max_eval_steps=50000):
"""Initialize the CycleGAN class.
Args:
@@ -107,6 +110,7 @@ class StyleGAN2Model(BaseModel):
self.mean_path_length = 0
self.nets['gen'] = build_generator(generator)
+ self.max_eval_steps = max_eval_steps
# define discriminators
if discriminator:
@@ -180,7 +184,7 @@ class StyleGAN2Model(BaseModel):
input (dict): include the data itself and its metadata information.
"""
- self.real_img = paddle.fluid.dygraph.to_variable(input['A'])
+ self.real_img = paddle.to_tensor(input['A'])
def forward(self):
"""Run forward pass; called by both functions and ."""
@@ -193,7 +197,6 @@ class StyleGAN2Model(BaseModel):
noises = []
for _ in range(num_noise):
noises.append(paddle.randn([batch, self.num_style_feat]))
-
return noises
def mixing_noise(self, batch, prob):
@@ -256,7 +259,7 @@ class StyleGAN2Model(BaseModel):
l_g_total += l_g
if current_iter % self.gen_iters == 0:
- path_batch_size = max(1, batch // self.path_batch_shrink)
+ path_batch_size = max(1, int(batch // self.path_batch_shrink))
noise = self.mixing_noise(path_batch_size, self.mixing_prob)
fake_img, latents = self.nets['gen'](noise, return_latents=True)
l_g_path, path_lengths, self.mean_path_length = g_path_regularize(
@@ -280,3 +283,43 @@ class StyleGAN2Model(BaseModel):
self.visual_items['fake_img_ema'] = sample
self.current_iter += 1
+
+ def test_iter(self, metrics=None):
+ self.nets['gen_ema'].eval()
+ batch = self.real_img.shape[0]
+ noises = [paddle.randn([batch, self.num_style_feat])]
+ fake_img, _ = self.nets['gen_ema'](noises)
+ with paddle.no_grad():
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(fake_img, self.real_img)
+ self.nets['gen_ema'].train()
+
+ class InferGenerator(paddle.nn.Layer):
+ def set_generator(self, generator):
+ self.generator = generator
+
+ def forward(self, style, truncation):
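+            # use the generator's mean style as the truncation anchor (truncation trick)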
+ truncation_latent = self.generator.get_mean_style()
+ out = self.generator(styles=style,
+ truncation=truncation,
+ truncation_latent=truncation_latent)
+ return out[0]
+
+ def export_model(self,
+ export_model=None,
+ output_dir=None,
+ inputs_size=[[1, 1, 512], [1, 1]],
+ export_serving_model=False,
+ model_name=None):
+ infer_generator = self.InferGenerator()
+ infer_generator.set_generator(self.nets['gen'])
+ style = paddle.rand(shape=inputs_size[0], dtype='float32')
+ truncation = paddle.rand(shape=inputs_size[1], dtype='float32')
+ if output_dir is None:
+ output_dir = 'inference_model'
+ if model_name is None:
+ model_name = "stylegan2model_gen"
+ paddle.jit.save(infer_generator,
+ os.path.join(output_dir, model_name),
+ input_spec=[style, truncation])
diff --git a/ppgan/models/swinir_model.py b/ppgan/models/swinir_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fc8c4e1435fd6e2e726c35b8ab83117e68d7c60
--- /dev/null
+++ b/ppgan/models/swinir_model.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import paddle
+import paddle.nn as nn
+
+from .builder import MODELS
+from .base_model import BaseModel
+from .generators.builder import build_generator
+from .criterions.builder import build_criterion
+from ppgan.utils.visual import tensor2img
+
+
+@MODELS.register()
+class SwinIRModel(BaseModel):
+ """SwinIR Model.
+ SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257
+ Originally Written by Ze Liu, Modified by Jingyun Liang.
+ """
+
+ def __init__(self, generator, char_criterion=None):
+ """Initialize the MPR class.
+
+ Args:
+ generator (dict): config of generator.
+ char_criterion (dict): config of char criterion.
+ """
+ super(SwinIRModel, self).__init__(generator)
+ self.current_iter = 1
+
+ self.nets['generator'] = build_generator(generator)
+
+ if char_criterion:
+ self.char_criterion = build_criterion(char_criterion)
+
+ def setup_input(self, input):
+ self.target = input[0]
+ self.lq = input[1]
+
+ def train_iter(self, optims=None):
+ optims['optim'].clear_gradients()
+
+ restored = self.nets['generator'](self.lq)
+
+ loss = self.char_criterion(restored, self.target)
+
+ loss.backward()
+ optims['optim'].step()
+ self.losses['loss'] = loss.numpy()
+
+ def forward(self):
+ pass
+
+ def test_iter(self, metrics=None):
+ self.nets['generator'].eval()
+ with paddle.no_grad():
+ self.output = self.nets['generator'](self.lq)
+ self.visual_items['output'] = self.output
+ self.nets['generator'].train()
+
+ out_img = []
+ gt_img = []
+ for out_tensor, gt_tensor in zip(self.output, self.target):
+ out_img.append(tensor2img(out_tensor, (0., 1.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 1.)))
+
+ if metrics is not None:
+ for metric in metrics.values():
+ metric.update(out_img, gt_img)
+
+ def export_model(self,
+ export_model=None,
+ output_dir=None,
+ inputs_size=None,
+ export_serving_model=False,
+ model_name=None):
+ shape = inputs_size[0]
+ new_model = self.nets['generator']
+ new_model.eval()
+ input_spec = [paddle.static.InputSpec(shape=shape, dtype="float32")]
+
+ static_model = paddle.jit.to_static(new_model, input_spec=input_spec)
+
+ if output_dir is None:
+ output_dir = 'inference_model'
+ if model_name is None:
+ model_name = '{}_{}'.format(self.__class__.__name__.lower(),
+ export_model[0]['name'])
+
+ paddle.jit.save(static_model, os.path.join(output_dir, model_name))
diff --git a/ppgan/models/ugatit_model.py b/ppgan/models/ugatit_model.py
index 8488b1ba3f6143866cdc7162cd41945d8e27be35..007dde19df991b2a57037171691747ee80a1230c 100644
--- a/ppgan/models/ugatit_model.py
+++ b/ppgan/models/ugatit_model.py
@@ -48,8 +48,18 @@ class UGATITModel(BaseModel):
cam_weight=1000.0):
"""Initialize the CycleGAN class.
- Parameters:
- opt (config)-- stores all the experiment flags; needs to be a subclass of Dict
+ Args:
+ generator (dict): config of generator.
+ discriminator_g (dict): config of discriminator_g.
+ discriminator_l (dict): config of discriminator_l.
+ l1_criterion (dict): config of l1_criterion.
+ mse_criterion (dict): config of mse_criterion.
+ bce_criterion (dict): config of bce_criterion.
+ direction (str): direction of dataset, default: 'a2b'.
+            adv_weight (float): adversarial loss weight, default: 1.0.
+ cycle_weight (float): cycle loss weight, default: 10.0.
+ identity_weight (float): identity loss weight, default: 10.0.
+ cam_weight (float): cam loss weight, default: 1000.0.
"""
super(UGATITModel, self).__init__()
self.adv_weight = adv_weight
diff --git a/ppgan/models/wav2lip_hq_model.py b/ppgan/models/wav2lip_hq_model.py
index 034e81f9ffbd399fd8a716d8f8d8aa7f2905de0e..5661b44ef4c1c5ea3117a511db30f3c2b3e95ce0 100644
--- a/ppgan/models/wav2lip_hq_model.py
+++ b/ppgan/models/wav2lip_hq_model.py
@@ -103,8 +103,7 @@ class Wav2LipModelHq(BaseModel):
self.l1_loss = self.recon_loss(self.g, self.y)
if self.disc_wt > 0.:
- if isinstance(self.nets['netDH'], paddle.DataParallel
- ): #paddle.fluid.dygraph.parallel.DataParallel)
+ if isinstance(self.nets['netDH'], paddle.DataParallel):
self.perceptual_loss = self.nets[
'netDH']._layers.perceptual_forward(self.g)
else:
@@ -175,8 +174,7 @@ class Wav2LipModelHq(BaseModel):
self.eval_recon_losses.append(l1loss.numpy().item())
if self.disc_wt > 0.:
- if isinstance(self.nets['netDH'], paddle.DataParallel
- ): #paddle.fluid.dygraph.parallel.DataParallel)
+ if isinstance(self.nets['netDH'], paddle.DataParallel):
perceptual_loss = self.nets[
'netDH']._layers.perceptual_forward(
self.g).numpy().item()
diff --git a/ppgan/modules/caffevgg.py b/ppgan/modules/caffevgg.py
index bf40f5bb30208eac6d6d410b15486ec9f3bb215f..b0780899ecceb8f65d3e4d690babb9a43c92dd59 100644
--- a/ppgan/modules/caffevgg.py
+++ b/ppgan/modules/caffevgg.py
@@ -2,6 +2,7 @@ import paddle
import paddle.nn as nn
import numpy as np
from ppgan.utils.download import get_path_from_url
+
model_urls = {
'caffevgg19': ('https://paddlegan.bj.bcebos.com/models/vgg19_no_fc.npy',
'8ea1ef2374f8684b6cea9f300849be81')
@@ -29,10 +30,13 @@ class CaffeVGG19(nn.Layer):
self.mean = mean.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
def _process(self, x):
- rgb = (x * 0.5 + 0.5) * 255 # value to 255
+ # value to 255
+ rgb = (x * 0.5 + 0.5) * 255
+ # rgb to bgr
bgr = paddle.stack((rgb[:, 2, :, :], rgb[:, 1, :, :], rgb[:, 0, :, :]),
- 1) # rgb to bgr
- return bgr - self.mean # vgg norm
+ 1)
+ # vgg norm
+ return bgr - self.mean
def _forward_impl(self, x):
x = self._process(x)
diff --git a/ppgan/modules/dense_motion.py b/ppgan/modules/dense_motion.py
index dce20e278f1aaacc70821c9d515666d64dd2d3bd..131d6f4b2e780b553358582a8f5076f3d0800a9f 100644
--- a/ppgan/modules/dense_motion.py
+++ b/ppgan/modules/dense_motion.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
import paddle
import paddle.nn as nn
@@ -31,21 +21,64 @@ class DenseMotionNetwork(nn.Layer):
num_channels,
estimate_occlusion_map=False,
scale_factor=1,
- kp_variance=0.01):
+ kp_variance=0.01,
+ mobile_net=False):
super(DenseMotionNetwork, self).__init__()
self.hourglass = Hourglass(block_expansion=block_expansion,
in_features=(num_kp + 1) *
(num_channels + 1),
max_features=max_features,
- num_blocks=num_blocks)
-
- self.mask = nn.Conv2D(self.hourglass.out_filters,
+ num_blocks=num_blocks,
+ mobile_net=mobile_net)
+
+ if mobile_net:
+ self.mask = nn.Sequential(
+ nn.Conv2D(self.hourglass.out_filters,
+ self.hourglass.out_filters,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=1),
+ nn.ReLU(),
+ nn.Conv2D(self.hourglass.out_filters,
+ self.hourglass.out_filters,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=1),
+ nn.ReLU(),
+ nn.Conv2D(self.hourglass.out_filters,
+ num_kp + 1,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=1))
+ else:
+ self.mask = nn.Conv2D(self.hourglass.out_filters,
num_kp + 1,
kernel_size=(7, 7),
padding=(3, 3))
if estimate_occlusion_map:
- self.occlusion = nn.Conv2D(self.hourglass.out_filters,
+ if mobile_net:
+ self.occlusion = nn.Sequential(
+ nn.Conv2D(self.hourglass.out_filters,
+ self.hourglass.out_filters,
+ kernel_size=3,
+ padding=1,
+ weight_attr=nn.initializer.KaimingUniform()),
+ nn.ReLU(),
+ nn.Conv2D(self.hourglass.out_filters,
+ self.hourglass.out_filters,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=1),
+ nn.ReLU(),
+ nn.Conv2D(self.hourglass.out_filters,
+ 1,
+ kernel_size=3,
+ padding=1,
+ weight_attr=nn.initializer.KaimingUniform())
+ )
+ else:
+ self.occlusion = nn.Conv2D(self.hourglass.out_filters,
1,
kernel_size=(7, 7),
padding=(3, 3))
@@ -58,7 +91,8 @@ class DenseMotionNetwork(nn.Layer):
if self.scale_factor != 1:
self.down = AntiAliasInterpolation2d(num_channels,
- self.scale_factor)
+ self.scale_factor,
+ mobile_net=mobile_net)
def create_heatmap_representations(self, source_image, kp_driving,
kp_source):
@@ -96,9 +130,15 @@ class DenseMotionNetwork(nn.Layer):
jacobian = paddle.matmul(kp_source['jacobian'],
paddle.inverse(kp_driving['jacobian']))
jacobian = jacobian.unsqueeze(-3).unsqueeze(-3)
- jacobian = paddle.tile(jacobian, [1, 1, h, w, 1, 1])
- coordinate_grid = paddle.matmul(jacobian,
+ # Todo: fix bug of paddle.tile
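+            # flatten each 2x2 jacobian to 4 values, tile it over the h x w grid, then restore the 2x2 shape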
+ p_jacobian = jacobian.reshape([bs, self.num_kp, 1, 1, 4])
+ paddle_jacobian = paddle.tile(p_jacobian, [1, 1, h, w, 1])
+ paddle_jacobian = paddle_jacobian.reshape(
+ [bs, self.num_kp, h, w, 2, 2])
+
+ coordinate_grid = paddle.matmul(paddle_jacobian,
coordinate_grid.unsqueeze(-1))
+
coordinate_grid = coordinate_grid.squeeze(-1)
driving_to_source = coordinate_grid + kp_source['value'].reshape(
@@ -125,7 +165,9 @@ class DenseMotionNetwork(nn.Layer):
(bs * (self.num_kp + 1), h, w, -1))
sparse_deformed = F.grid_sample(source_repeat,
sparse_motions,
- align_corners=False)
+ mode='bilinear',
+ padding_mode='zeros',
+ align_corners=True)
sparse_deformed = sparse_deformed.reshape(
(bs, self.num_kp + 1, -1, h, w))
return sparse_deformed
@@ -145,10 +187,10 @@ class DenseMotionNetwork(nn.Layer):
source_image, sparse_motion)
out_dict['sparse_deformed'] = deformed_source
- input = paddle.concat([heatmap_representation, deformed_source], axis=2)
- input = input.reshape([bs, -1, h, w])
+ temp = paddle.concat([heatmap_representation, deformed_source], axis=2)
+ temp = temp.reshape([bs, -1, h, w])
- prediction = self.hourglass(input)
+ prediction = self.hourglass(temp)
mask = self.mask(prediction)
mask = F.softmax(mask, axis=1)
diff --git a/ppgan/modules/equalized.py b/ppgan/modules/equalized.py
index 7d2eef17ba6febb1c12a0ff2d5e685191b7d5e2f..9d1440245d00806448b48ebc84401c93029183c6 100644
--- a/ppgan/modules/equalized.py
+++ b/ppgan/modules/equalized.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/rosinality/stylegan2-pytorch
+# MIT License
+# Copyright (c) 2019 Kim Seonghyeon
+
import math
import paddle
import paddle.nn as nn
@@ -24,6 +28,7 @@ class EqualConv2D(nn.Layer):
"""This convolutional layer class stabilizes the learning rate changes of its parameters.
Equalizing learning rate keeps the weights in the network at a similar scale during training.
"""
+
def __init__(self,
in_channel,
out_channel,
@@ -70,6 +75,7 @@ class EqualLinear(nn.Layer):
"""This linear layer class stabilizes the learning rate changes of its parameters.
Equalizing learning rate keeps the weights in the network at a similar scale during training.
"""
+
def __init__(self,
in_dim,
out_dim,
@@ -111,3 +117,50 @@ class EqualLinear(nn.Layer):
return (
f"{self.__class__.__name__}({self.weight.shape[0]}, {self.weight.shape[1]})"
)
+
+
+class EqualLinear_gpen(nn.Layer):
+ """This linear layer class stabilizes the learning rate changes of its parameters.
+ Equalizing learning rate keeps the weights in the network at a similar scale during training.
+ """
+
+ def __init__(self,
+ in_dim,
+ out_dim,
+ bias=True,
+ bias_init=0,
+ lr_mul=1,
+ activation=None):
+ super().__init__()
+
+ self.weight = self.create_parameter(
+ (out_dim, in_dim), default_initializer=nn.initializer.Normal())
+ self.weight.set_value((self.weight / lr_mul))
+
+ if bias:
+ self.bias = self.create_parameter(
+ (out_dim, ), nn.initializer.Constant(bias_init))
+
+ else:
+ self.bias = None
+
+ self.activation = activation
+
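+        # equalized learning rate: parameters are stored at unit scale and rescaled by ~1/sqrt(fan_in) at runtime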
+ self.scale = (1 / math.sqrt(in_dim)) * lr_mul
+ self.lr_mul = lr_mul
+
+ def forward(self, input):
+ if self.activation:
+ out = F.linear(input, (self.weight * self.scale).t())
+ out = fused_leaky_relu(out, self.bias * self.lr_mul)
+
+ else:
+ out = F.linear(input, (self.weight * self.scale).t(),
+ bias=self.bias * self.lr_mul)
+
+ return out
+
+ def __repr__(self):
+ return (
+ f'{self.__class__.__name__}({self.weight.shape[1]}, {self.weight.shape[0]})'
+ )
diff --git a/ppgan/modules/first_order.py b/ppgan/modules/first_order.py
index d38f38a2d3f261c9eff9dcb2813778b4671d7705..bf9bb9029f3668ae9f0c962639094109a95f3609 100644
--- a/ppgan/modules/first_order.py
+++ b/ppgan/modules/first_order.py
@@ -1,22 +1,45 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
+def SyncBatchNorm(*args, **kwargs):
+ if paddle.distributed.get_world_size() > 1:
+ return nn.SyncBatchNorm(*args, **kwargs)
+ else:
+ return nn.BatchNorm(*args, **kwargs)
+
+
+class ImagePyramide(nn.Layer):
+ """
+    Create an image pyramid for computing the pyramid perceptual loss. See Sec 3.3.
+ """
+ def __init__(self, scales, num_channels):
+ super(ImagePyramide, self).__init__()
+ self.downs = paddle.nn.LayerList()
+ self.name_list = []
+ for scale in scales:
+ self.downs.add_sublayer(
+ str(scale).replace('.', '-'),
+ AntiAliasInterpolation2d(num_channels, scale))
+ self.name_list.append(str(scale).replace('.', '-'))
+
+ def forward(self, x):
+ out_dict = {}
+ for scale, down_module in zip(self.name_list, self.downs):
+ out_dict['prediction_' +
+ str(scale).replace('-', '.')] = down_module(x)
+ return out_dict
+
+
+def detach_kp(kp):
+ return {key: value.detach() for key, value in kp.items()}
+
+
def kp2gaussian(kp, spatial_size, kp_variance):
"""
Transform a keypoint into gaussian like representation
@@ -26,9 +49,9 @@ def kp2gaussian(kp, spatial_size, kp_variance):
coordinate_grid = make_coordinate_grid(spatial_size, mean.dtype)
number_of_leading_dimensions = len(mean.shape) - 1
shape = (1, ) * number_of_leading_dimensions + tuple(coordinate_grid.shape)
- coordinate_grid = coordinate_grid.reshape([*shape])
repeats = tuple(mean.shape[:number_of_leading_dimensions]) + (1, 1, 1)
- coordinate_grid = paddle.tile(coordinate_grid, [*repeats])
+ coordinate_grid = coordinate_grid.reshape(shape)
+ coordinate_grid = coordinate_grid.tile(repeats)
# Preprocess kp shape
shape = tuple(mean.shape[:number_of_leading_dimensions]) + (1, 1, 2)
@@ -41,7 +64,7 @@ def kp2gaussian(kp, spatial_size, kp_variance):
return out
-def make_coordinate_grid(spatial_size, type):
+def make_coordinate_grid(spatial_size, type='float32'):
"""
Create a meshgrid [-1,1] x [-1,1] of given spatial_size.
"""
@@ -74,8 +97,8 @@ class ResBlock2d(nn.Layer):
out_channels=in_features,
kernel_size=kernel_size,
padding=padding)
- self.norm1 = nn.BatchNorm2D(in_features)
- self.norm2 = nn.BatchNorm2D(in_features)
+ self.norm1 = SyncBatchNorm(in_features)
+ self.norm2 = SyncBatchNorm(in_features)
def forward(self, x):
out = self.norm1(x)
@@ -88,6 +111,50 @@ class ResBlock2d(nn.Layer):
return out
+class MobileResBlock2d(nn.Layer):
+ """
+ Res block, preserve spatial resolution.
+ """
+ def __init__(self, in_features, kernel_size, padding):
+ super(MobileResBlock2d, self).__init__()
+ out_features = in_features * 2
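+        # inverted-residual style block: 1x1 expansion -> depthwise 3x3 -> 1x1 linear projection, plus identity skip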
+ self.conv_pw = nn.Conv2D(in_channels=in_features,
+ out_channels=out_features,
+ kernel_size=1,
+ padding=0,
+ bias_attr=False)
+ self.conv_dw = nn.Conv2D(in_channels=out_features,
+ out_channels=out_features,
+ kernel_size=kernel_size,
+ padding=padding,
+ groups=out_features,
+ bias_attr=False)
+ self.conv_pw_linear = nn.Conv2D(in_channels=out_features,
+ out_channels=in_features,
+ kernel_size=1,
+ padding=0,
+ bias_attr=False)
+ self.norm1 = SyncBatchNorm(in_features)
+ self.norm_pw = SyncBatchNorm(out_features)
+ self.norm_dw = SyncBatchNorm(out_features)
+ self.norm_pw_linear = SyncBatchNorm(in_features)
+
+ def forward(self, x):
+ out = self.norm1(x)
+ out = F.relu(out)
+ out = self.conv_pw(out)
+ out = self.norm_pw(out)
+
+ out = self.conv_dw(out)
+ out = self.norm_dw(out)
+ out = F.relu(out)
+
+ out = self.conv_pw_linear(out)
+ out = self.norm_pw_linear(out)
+ out += x
+ return out
+
+
class UpBlock2d(nn.Layer):
"""
Upsampling block for use in decoder.
@@ -105,13 +172,50 @@ class UpBlock2d(nn.Layer):
kernel_size=kernel_size,
padding=padding,
groups=groups)
- self.norm = nn.BatchNorm2D(out_features)
+ self.norm = SyncBatchNorm(out_features)
+
+ def forward(self, x):
+ out = F.interpolate(x, scale_factor=2)
+ out = self.conv(out)
+ out = self.norm(out)
+ out = F.relu(out)
+ return out
+
+
+class MobileUpBlock2d(nn.Layer):
+ """
+ Upsampling block for use in decoder.
+ """
+ def __init__(self,
+ in_features,
+ out_features,
+ kernel_size=3,
+ padding=1,
+ groups=1):
+ super(MobileUpBlock2d, self).__init__()
+
+ self.conv = nn.Conv2D(in_channels=in_features,
+ out_channels=in_features,
+ kernel_size=kernel_size,
+ padding=padding,
+ groups=in_features,
+ bias_attr=False)
+ self.conv1 = nn.Conv2D(in_channels=in_features,
+ out_channels=out_features,
+ kernel_size=1,
+ padding=0,
+ bias_attr=False)
+ self.norm = SyncBatchNorm(in_features)
+ self.norm1 = SyncBatchNorm(out_features)
def forward(self, x):
out = F.interpolate(x, scale_factor=2)
out = self.conv(out)
out = self.norm(out)
out = F.relu(out)
+ out = self.conv1(out)
+ out = self.norm1(out)
+ out = F.relu(out)
return out
@@ -131,7 +235,7 @@ class DownBlock2d(nn.Layer):
kernel_size=kernel_size,
padding=padding,
groups=groups)
- self.norm = nn.BatchNorm2D(out_features)
+ self.norm = SyncBatchNorm(out_features)
self.pool = nn.AvgPool2D(kernel_size=(2, 2))
def forward(self, x):
@@ -142,6 +246,45 @@ class DownBlock2d(nn.Layer):
return out
+class MobileDownBlock2d(nn.Layer):
+ """
+ Downsampling block for use in encoder.
+ """
+ def __init__(self,
+ in_features,
+ out_features,
+ kernel_size=3,
+ padding=1,
+ groups=1):
+ super(MobileDownBlock2d, self).__init__()
+ self.conv = nn.Conv2D(in_channels=in_features,
+ out_channels=in_features,
+ kernel_size=kernel_size,
+ padding=padding,
+ groups=in_features,
+ bias_attr=False)
+ self.norm = SyncBatchNorm(in_features)
+ self.pool = nn.AvgPool2D(kernel_size=(2, 2))
+
+ self.conv1 = nn.Conv2D(in_features,
+ out_features,
+ kernel_size=1,
+ padding=0,
+ stride=1,
+ bias_attr=False)
+ self.norm1 = SyncBatchNorm(out_features)
+
+ def forward(self, x):
+ out = self.conv(x)
+ out = self.norm(out)
+ out = F.relu(out)
+ out = self.conv1(out)
+ out = self.norm1(out)
+ out = F.relu(out)
+ out = self.pool(out)
+ return out
+
+
class SameBlock2d(nn.Layer):
"""
Simple block, preserve spatial resolution.
@@ -151,14 +294,17 @@ class SameBlock2d(nn.Layer):
out_features,
groups=1,
kernel_size=3,
- padding=1):
+ padding=1,
+ mobile_net=False):
super(SameBlock2d, self).__init__()
self.conv = nn.Conv2D(in_channels=in_features,
out_channels=out_features,
kernel_size=kernel_size,
padding=padding,
- groups=groups)
- self.norm = nn.BatchNorm2D(out_features)
+ groups=groups,
+                              bias_attr=not mobile_net,
+ weight_attr=nn.initializer.KaimingUniform())
+ self.norm = SyncBatchNorm(out_features)
def forward(self, x):
out = self.conv(x)
@@ -175,17 +321,28 @@ class Encoder(nn.Layer):
block_expansion,
in_features,
num_blocks=3,
- max_features=256):
+ max_features=256,
+ mobile_net=False):
super(Encoder, self).__init__()
down_blocks = []
for i in range(num_blocks):
- down_blocks.append(
- DownBlock2d(in_features if i == 0 else min(
- max_features, block_expansion * (2**i)),
- min(max_features, block_expansion * (2**(i + 1))),
- kernel_size=3,
- padding=1))
+ if mobile_net:
+ down_blocks.append(
+ MobileDownBlock2d(in_features if i == 0 else min(
+ max_features, block_expansion * (2**i)),
+ min(max_features,
+ block_expansion * (2**(i + 1))),
+ kernel_size=3,
+ padding=1))
+ else:
+ down_blocks.append(
+ DownBlock2d(in_features if i == 0 else min(
+ max_features, block_expansion * (2**i)),
+ min(max_features,
+ block_expansion * (2**(i + 1))),
+ kernel_size=3,
+ padding=1))
self.down_blocks = nn.LayerList(down_blocks)
def forward(self, x):
@@ -203,17 +360,28 @@ class Decoder(nn.Layer):
block_expansion,
in_features,
num_blocks=3,
- max_features=256):
+ max_features=256,
+ mobile_net=False):
super(Decoder, self).__init__()
up_blocks = []
for i in range(num_blocks)[::-1]:
- in_filters = (1 if i == num_blocks - 1 else 2) * min(
- max_features, block_expansion * (2**(i + 1)))
out_filters = min(max_features, block_expansion * (2**i))
- up_blocks.append(
- UpBlock2d(in_filters, out_filters, kernel_size=3, padding=1))
+ if mobile_net:
+ in_filters = (1 if i == num_blocks - 1 else 2) * min(
+ max_features, block_expansion * (2**(i + 1)))
+ up_blocks.append(
+ MobileUpBlock2d(in_filters,
+ out_filters,
+ kernel_size=3,
+ padding=1))
+ else:
+ in_filters = (1 if i == num_blocks - 1 else 2) * min(
+ max_features, block_expansion * (2**(i + 1)))
+ up_blocks.append(
+ UpBlock2d(in_filters, out_filters, kernel_size=3,
+ padding=1))
self.up_blocks = nn.LayerList(up_blocks)
self.out_filters = block_expansion + in_features
@@ -235,12 +403,19 @@ class Hourglass(nn.Layer):
block_expansion,
in_features,
num_blocks=3,
- max_features=256):
+ max_features=256,
+ mobile_net=False):
super(Hourglass, self).__init__()
- self.encoder = Encoder(block_expansion, in_features, num_blocks,
- max_features)
- self.decoder = Decoder(block_expansion, in_features, num_blocks,
- max_features)
+ self.encoder = Encoder(block_expansion,
+ in_features,
+ num_blocks,
+ max_features,
+ mobile_net=mobile_net)
+ self.decoder = Decoder(block_expansion,
+ in_features,
+ num_blocks,
+ max_features,
+ mobile_net=mobile_net)
self.out_filters = self.decoder.out_filters
def forward(self, x):
@@ -251,10 +426,14 @@ class AntiAliasInterpolation2d(nn.Layer):
"""
Band-limited downsampling, for better preservation of the input signal.
"""
- def __init__(self, channels, scale):
+ def __init__(self, channels, scale, mobile_net=False):
super(AntiAliasInterpolation2d, self).__init__()
- sigma = (1 / scale - 1) / 2
- kernel_size = 2 * round(sigma * 4) + 1
+ if mobile_net:
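+            # mobile variant: fixed lightweight 3x3 Gaussian blur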
+ sigma = 0.25
+ kernel_size = 3
+ else:
+ sigma = (1 / scale - 1) / 2
+ kernel_size = 2 * round(sigma * 4) + 1
self.ka = kernel_size // 2
self.kb = self.ka - 1 if kernel_size % 2 == 0 else self.ka
@@ -267,7 +446,7 @@ class AntiAliasInterpolation2d(nn.Layer):
[paddle.arange(size, dtype='float32') for size in kernel_size])
for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
mean = (size - 1) / 2
- kernel *= paddle.exp(-(mgrid - mean)**2 / (2 * std**2))
+ kernel *= paddle.exp(-(mgrid - mean)**2 / (2 * std**2 + 1e-9))
# Make sure sum of values in gaussian kernel equals 1.
kernel = kernel / paddle.sum(kernel)
@@ -285,6 +464,12 @@ class AntiAliasInterpolation2d(nn.Layer):
out = F.pad(input, [self.ka, self.kb, self.ka, self.kb])
out = F.conv2d(out, weight=self.weight, groups=self.groups)
- out = F.interpolate(out, scale_factor=[self.scale, self.scale])
-
+ out.stop_gradient = False
+ inv_scale = 1 / self.scale
+ int_inv_scale = int(inv_scale)
+ assert (inv_scale == int_inv_scale)
+ # lite: fluid resize_nearest
+ # out = paddle.fluid.layers.resize_nearest(out, scale=self.scale)
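+        # strided slicing keeps every int_inv_scale-th pixel, i.e. nearest-neighbour downsampling by an integer factor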
+ out = out[:, :, ::int_inv_scale, ::int_inv_scale]
+ # patch end
return out
diff --git a/ppgan/modules/fused_act.py b/ppgan/modules/fused_act.py
index 8723af36c2799c5f3e82d6d4b2baccf70a347cce..0bf89f00d0aac718e04537bc2669b7dd3334068f 100644
--- a/ppgan/modules/fused_act.py
+++ b/ppgan/modules/fused_act.py
@@ -12,37 +12,40 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/rosinality/stylegan2-pytorch
+# MIT License
+# Copyright (c) 2019 Kim Seonghyeon
+
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
-
-
+
+
class FusedLeakyReLU(nn.Layer):
- def __init__(self, channel, bias=True, negative_slope=0.2, scale=2 ** 0.5):
+ def __init__(self, channel, bias=True, negative_slope=0.2, scale=2**0.5):
super().__init__()
-
+
if bias:
- self.bias = self.create_parameter((channel,), default_initializer=nn.initializer.Constant(0.0))
-
+ self.bias = self.create_parameter(
+ (channel, ), default_initializer=nn.initializer.Constant(0.0))
+
else:
self.bias = None
-
+
self.negative_slope = negative_slope
self.scale = scale
-
+
def forward(self, input):
- return fused_leaky_relu(input, self.bias, self.negative_slope, self.scale)
-
-
-def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2 ** 0.5):
+ return fused_leaky_relu(input, self.bias, self.negative_slope,
+ self.scale)
+
+
+def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5):
if bias is not None:
- rest_dim = [1] * (input.ndim - bias.ndim - 1)
- return (
- F.leaky_relu(
- input + bias.reshape((1, bias.shape[0], *rest_dim)), negative_slope=0.2
- )
- * scale
- )
-
+ rest_dim = [1] * (len(input.shape) - len(bias.shape) - 1)
+ return (F.leaky_relu(input + bias.reshape(
+ (1, bias.shape[0], *rest_dim)),
+ negative_slope=0.2) * scale)
+
else:
return F.leaky_relu(input, negative_slope=0.2) * scale
diff --git a/ppgan/modules/init.py b/ppgan/modules/init.py
index 91dfd0677215ac96c9ad9934001ae1345c882045..12784ce1207a9711175a8ba6a1493682e3194433 100644
--- a/ppgan/modules/init.py
+++ b/ppgan/modules/init.py
@@ -1,16 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was based on torch init module
import math
import numpy as np
@@ -324,3 +312,11 @@ def init_weights(net,
logger = get_logger()
logger.debug('initialize network with %s' % init_type)
net.apply(init_func) # apply the initialization function
+
+
+def reset_parameters(m):
+ kaiming_uniform_(m.weight, a=math.sqrt(5))
+ if m.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(m.weight)
+ bound = 1 / math.sqrt(fan_in)
+ uniform_(m.bias, -bound, bound)
diff --git a/ppgan/modules/keypoint_detector.py b/ppgan/modules/keypoint_detector.py
index 9d5a7688910992d4c05343a50413d248b7b78d53..809e64d491d43dd4ce5c3fbd64555b7e0dfd703a 100644
--- a/ppgan/modules/keypoint_detector.py
+++ b/ppgan/modules/keypoint_detector.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/AliaksandrSiarohin/first-order-model/blob/master/LICENSE.md
import paddle
import paddle.nn as nn
@@ -32,27 +22,74 @@ class KPDetector(nn.Layer):
estimate_jacobian=False,
scale_factor=1,
single_jacobian_map=False,
- pad=0):
+ pad=0,
+ mobile_net=False):
super(KPDetector, self).__init__()
self.predictor = Hourglass(block_expansion,
in_features=num_channels,
max_features=max_features,
- num_blocks=num_blocks)
-
- self.kp = nn.Conv2D(in_channels=self.predictor.out_filters,
+ num_blocks=num_blocks,
+ mobile_net=mobile_net)
+ if mobile_net:
+ self.kp = nn.Sequential(
+ nn.Conv2D(in_channels=self.predictor.out_filters,
+ out_channels=self.predictor.out_filters,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=pad),
+ nn.Conv2D(in_channels=self.predictor.out_filters,
+ out_channels=self.predictor.out_filters,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=pad),
+ nn.Conv2D(in_channels=self.predictor.out_filters,
+ out_channels=num_kp,
+ kernel_size=3,
+ weight_attr=nn.initializer.KaimingUniform(),
+ padding=pad))
+ else:
+ self.kp = nn.Conv2D(in_channels=self.predictor.out_filters,
out_channels=num_kp,
kernel_size=(7, 7),
padding=pad)
if estimate_jacobian:
self.num_jacobian_maps = 1 if single_jacobian_map else num_kp
- self.jacobian = nn.Conv2D(in_channels=self.predictor.out_filters,
+ if mobile_net:
+ self.jacobian = nn.Sequential(
+ nn.Conv2D(in_channels=self.predictor.out_filters,
+ out_channels=self.predictor.out_filters,
+ kernel_size=3,
+ padding=pad),
+ nn.Conv2D(in_channels=self.predictor.out_filters,
+ out_channels=self.predictor.out_filters,
+ kernel_size=3,
+ padding=pad),
+ nn.Conv2D(in_channels=self.predictor.out_filters,
+ out_channels=4 * self.num_jacobian_maps,
+ kernel_size=3,
+ padding=pad),
+ )
+ self.jacobian[0].weight.set_value(
+ paddle.zeros(self.jacobian[0].weight.shape, dtype='float32'))
+ self.jacobian[1].weight.set_value(
+ paddle.zeros(self.jacobian[1].weight.shape, dtype='float32'))
+ self.jacobian[2].weight.set_value(
+ paddle.zeros(self.jacobian[2].weight.shape, dtype='float32'))
+ self.jacobian[2].bias.set_value(
+ paddle.to_tensor([1, 0, 0, 1] *
+ self.num_jacobian_maps).astype('float32'))
+ else:
+ self.jacobian = nn.Conv2D(in_channels=self.predictor.out_filters,
out_channels=4 * self.num_jacobian_maps,
kernel_size=(7, 7),
padding=pad)
- # self.jacobian.weight.data.zero_()
- # self.jacobian.bias.data.copy_(paddle.tensor([1, 0, 0, 1] * self.num_jacobian_maps, dtype='float32'))
+ self.jacobian.weight.set_value(
+ paddle.zeros(self.jacobian.weight.shape, dtype='float32'))
+ self.jacobian.bias.set_value(
+ paddle.to_tensor([1, 0, 0, 1] *
+ self.num_jacobian_maps).astype('float32'))
else:
self.jacobian = None
@@ -60,7 +97,8 @@ class KPDetector(nn.Layer):
self.scale_factor = scale_factor
if self.scale_factor != 1:
self.down = AntiAliasInterpolation2d(num_channels,
- self.scale_factor)
+ self.scale_factor,
+ mobile_net=mobile_net)
def gaussian2kp(self, heatmap):
"""
@@ -68,26 +106,21 @@ class KPDetector(nn.Layer):
"""
shape = heatmap.shape
heatmap = heatmap.unsqueeze(-1)
- grid = make_coordinate_grid(shape[2:],
- heatmap.dtype).unsqueeze(0).unsqueeze(0)
+ grid = make_coordinate_grid(shape[2:]).unsqueeze([0, 1])
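+        # soft-argmax: the keypoint is the expectation of the coordinate grid under the softmaxed heatmap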
value = (heatmap * grid).sum(axis=(2, 3))
-
kp = {'value': value}
-
return kp
def forward(self, x):
if self.scale_factor != 1:
x = self.down(x)
-
feature_map = self.predictor(x)
prediction = self.kp(feature_map)
final_shape = prediction.shape
heatmap = prediction.reshape([final_shape[0], final_shape[1], -1])
heatmap = F.softmax(heatmap / self.temperature, axis=2)
- heatmap = heatmap.reshape([*final_shape])
-
+ heatmap = heatmap.reshape(final_shape)
out = self.gaussian2kp(heatmap)
if self.jacobian is not None:
@@ -97,7 +130,7 @@ class KPDetector(nn.Layer):
final_shape[3]
])
heatmap = heatmap.unsqueeze(2)
-
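+            # replicate the heatmap across the 4 jacobian entries so the elementwise product below is shape-aligned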
+ heatmap = paddle.tile(heatmap, [1, 1, 4, 1, 1])
jacobian = heatmap * jacobian_map
jacobian = jacobian.reshape([final_shape[0], final_shape[1], 4, -1])
jacobian = jacobian.sum(axis=-1)
diff --git a/ppgan/modules/upfirdn2d.py b/ppgan/modules/upfirdn2d.py
index ac34a889b279a1cf439b866192807ffad5f04571..ca5972d93cddcf627da7226d14bc11bf1ff37e6c 100644
--- a/ppgan/modules/upfirdn2d.py
+++ b/ppgan/modules/upfirdn2d.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/rosinality/stylegan2-pytorch
+# MIT License
+# Copyright (c) 2019 Kim Seonghyeon
+
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
diff --git a/ppgan/modules/wing.py b/ppgan/modules/wing.py
new file mode 100755
index 0000000000000000000000000000000000000000..4cdc1826aef4b145548360017e71f3a8270cef8a
--- /dev/null
+++ b/ppgan/modules/wing.py
@@ -0,0 +1,308 @@
+# code was heavily based on https://github.com/clovaai/stargan-v2
+# Users should be careful about adopting these functions in any commercial matters.
+# https://github.com/clovaai/stargan-v2#license
+
+from collections import namedtuple
+from copy import deepcopy
+from functools import partial
+
+from munch import Munch
+import numpy as np
+import cv2
+from skimage.filters import gaussian
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppgan.models.generators.builder import GENERATORS
+
+
+class HourGlass(nn.Layer):
+ def __init__(self, num_modules, depth, num_features, first_one=False):
+ super(HourGlass, self).__init__()
+ self.num_modules = num_modules
+ self.depth = depth
+ self.features = num_features
+ self.coordconv = CoordConvTh(64,
+ 64,
+ True,
+ True,
+ 256,
+ first_one,
+ out_channels=256,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+ self._generate_network(self.depth)
+
+ def _generate_network(self, level):
+ self.add_sublayer('b1_' + str(level), ConvBlock(256, 256))
+ self.add_sublayer('b2_' + str(level), ConvBlock(256, 256))
+ if level > 1:
+ self._generate_network(level - 1)
+ else:
+ self.add_sublayer('b2_plus_' + str(level), ConvBlock(256, 256))
+ self.add_sublayer('b3_' + str(level), ConvBlock(256, 256))
+
+ def _forward(self, level, inp):
+ up1 = inp
+ up1 = self._sub_layers['b1_' + str(level)](up1)
+ low1 = F.avg_pool2d(inp, 2, stride=2)
+ low1 = self._sub_layers['b2_' + str(level)](low1)
+
+ if level > 1:
+ low2 = self._forward(level - 1, low1)
+ else:
+ low2 = low1
+ low2 = self._sub_layers['b2_plus_' + str(level)](low2)
+ low3 = low2
+ low3 = self._sub_layers['b3_' + str(level)](low3)
+ up2 = F.interpolate(low3, scale_factor=2, mode='nearest')
+
+ return up1 + up2
+
+ def forward(self, x, heatmap):
+ x, last_channel = self.coordconv(x, heatmap)
+ return self._forward(self.depth, x), last_channel
+
+
+class AddCoordsTh(nn.Layer):
+ def __init__(self, height=64, width=64, with_r=False, with_boundary=False):
+ super(AddCoordsTh, self).__init__()
+ self.with_r = with_r
+ self.with_boundary = with_boundary
+
+ with paddle.no_grad():
+ x_coords = paddle.arange(height).unsqueeze(1).expand(
+ (height, width)).astype('float32')
+ y_coords = paddle.arange(width).unsqueeze(0).expand(
+ (height, width)).astype('float32')
+ x_coords = (x_coords / (height - 1)) * 2 - 1
+ y_coords = (y_coords / (width - 1)) * 2 - 1
+ coords = paddle.stack([x_coords, y_coords],
+ axis=0) # (2, height, width)
+
+ if self.with_r:
+ rr = paddle.sqrt(
+ paddle.pow(x_coords, 2) +
+ paddle.pow(y_coords, 2)) # (height, width)
+ rr = (rr / paddle.max(rr)).unsqueeze(0)
+ coords = paddle.concat([coords, rr], axis=0)
+
+ self.coords = coords.unsqueeze(0) # (1, 2 or 3, height, width)
+ self.x_coords = x_coords
+ self.y_coords = y_coords
+
+ def forward(self, x, heatmap=None):
+ """
+ x: (batch, c, x_dim, y_dim)
+ """
+ coords = self.coords.tile((x.shape[0], 1, 1, 1))
+
+ if self.with_boundary and heatmap is not None:
+ boundary_channel = paddle.clip(heatmap[:, -1:, :, :], 0.0, 1.0)
+ zero_tensor = paddle.zeros_like(self.x_coords)
+ xx_boundary_channel = paddle.where(boundary_channel > 0.05,
+ self.x_coords, zero_tensor)
+ yy_boundary_channel = paddle.where(boundary_channel > 0.05,
+ self.y_coords, zero_tensor)
+ coords = paddle.concat(
+ [coords, xx_boundary_channel, yy_boundary_channel], axis=1)
+
+ x_and_coords = paddle.concat([x, coords], axis=1)
+ return x_and_coords
+
+
+class CoordConvTh(nn.Layer):
+ """CoordConv layer as in the paper."""
+ def __init__(self,
+ height,
+ width,
+ with_r,
+ with_boundary,
+ in_channels,
+ first_one=False,
+ *args,
+ **kwargs):
+ super(CoordConvTh, self).__init__()
+ self.addcoords = AddCoordsTh(height, width, with_r, with_boundary)
+ in_channels += 2
+ if with_r:
+ in_channels += 1
+ if with_boundary and not first_one:
+ in_channels += 2
+ self.conv = nn.Conv2D(in_channels=in_channels, *args, **kwargs)
+
+ def forward(self, input_tensor, heatmap=None):
+ ret = self.addcoords(input_tensor, heatmap)
+ last_channel = ret[:, -2:, :, :]
+ ret = self.conv(ret)
+ return ret, last_channel
+
+
+class ConvBlock(nn.Layer):
+ def __init__(self, in_planes, out_planes):
+ super(ConvBlock, self).__init__()
+ self.bn1 = nn.BatchNorm2D(in_planes)
+ conv3x3 = partial(nn.Conv2D,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False,
+ dilation=1)
+ self.conv1 = conv3x3(in_planes, int(out_planes / 2))
+ self.bn2 = nn.BatchNorm2D(int(out_planes / 2))
+ self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4))
+ self.bn3 = nn.BatchNorm2D(int(out_planes / 4))
+ self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4))
+
+ self.downsample = None
+ if in_planes != out_planes:
+ self.downsample = nn.Sequential(
+ nn.BatchNorm2D(in_planes), nn.ReLU(True),
+ nn.Conv2D(in_planes, out_planes, 1, 1, bias_attr=False))
+
+ def forward(self, x):
+ residual = x
+
+ out1 = self.bn1(x)
+ out1 = F.relu(out1, True)
+ out1 = self.conv1(out1)
+
+ out2 = self.bn2(out1)
+ out2 = F.relu(out2, True)
+ out2 = self.conv2(out2)
+
+ out3 = self.bn3(out2)
+ out3 = F.relu(out3, True)
+ out3 = self.conv3(out3)
+
+ out3 = paddle.concat((out1, out2, out3), 1)
+ if self.downsample is not None:
+ residual = self.downsample(residual)
+ out3 += residual
+ return out3
+
+
+# ========================== #
+# Mask related functions #
+# ========================== #
+
+
+def normalize(x, eps=1e-6):
+ """Apply min-max normalization."""
+ # x = x.contiguous()
+ N, C, H, W = x.shape
+ x_ = paddle.reshape(x, (N * C, -1))
+    max_val = paddle.max(x_, axis=1, keepdim=True)
+    min_val = paddle.min(x_, axis=1, keepdim=True)
+ x_ = (x_ - min_val) / (max_val - min_val + eps)
+ out = paddle.reshape(x_, (N, C, H, W))
+ return out
+
+
+def truncate(x, thres=0.1):
+ """Remove small values in heatmaps."""
+ return paddle.where(x < thres, paddle.zeros_like(x), x)
+
+
+def resize(x, p=2):
+ """Resize heatmaps."""
+ return x**p
+
+
+def shift(x, N):
+ """Shift N pixels up or down."""
+ x = x.numpy()
+ up = N >= 0
+ N = abs(N)
+ _, _, H, W = x.shape
+
+ if up:
+ head = np.arange(H - N) + N
+ tail = np.arange(N)
+ else:
+ head = np.arange(N) + (H - N)
+ tail = np.arange(H - N)
+
+ # permutation indices
+ perm = np.concatenate([head, tail])
+ out = x[:, :, perm, :]
+ out = paddle.to_tensor(out)
+ return out
+
+
+IDXPAIR = namedtuple('IDXPAIR', 'start end')
+index_map = Munch(chin=IDXPAIR(0 + 8, 33 - 8),
+ eyebrows=IDXPAIR(33, 51),
+ eyebrowsedges=IDXPAIR(33, 46),
+ nose=IDXPAIR(51, 55),
+ nostrils=IDXPAIR(55, 60),
+ eyes=IDXPAIR(60, 76),
+ lipedges=IDXPAIR(76, 82),
+ lipupper=IDXPAIR(77, 82),
+ liplower=IDXPAIR(83, 88),
+ lipinner=IDXPAIR(88, 96))
+OPPAIR = namedtuple('OPPAIR', 'shift resize')
+
+
+def preprocess(x):
+ """Preprocess 98-dimensional heatmaps."""
+ N, C, H, W = x.shape
+ x = truncate(x)
+ x = normalize(x)
+
+ sw = H // 256
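+    # the pixel shifts below are tuned for 256x256 heatmaps; sw rescales them to the actual resolution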
+ operations = Munch(chin=OPPAIR(0, 3),
+ eyebrows=OPPAIR(-7 * sw, 2),
+ nostrils=OPPAIR(8 * sw, 4),
+ lipupper=OPPAIR(-8 * sw, 4),
+ liplower=OPPAIR(8 * sw, 4),
+ lipinner=OPPAIR(-2 * sw, 3))
+
+ for part, ops in operations.items():
+ start, end = index_map[part]
+ x[:, start:end] = resize(shift(x[:, start:end], ops.shift), ops.resize)
+
+ zero_out = paddle.concat([
+ paddle.arange(0, index_map.chin.start),
+ paddle.arange(index_map.chin.end, 33),
+ paddle.to_tensor([
+ index_map.eyebrowsedges.start, index_map.eyebrowsedges.end,
+ index_map.lipedges.start, index_map.lipedges.end
+ ])
+ ])
+ x = x.numpy()
+ zero_out = zero_out.numpy()
+ x[:, zero_out] = 0
+ x = paddle.to_tensor(x)
+
+ start, end = index_map.nose
+ x[:, start + 1:end] = shift(x[:, start + 1:end], 4 * sw)
+ x[:, start:end] = resize(x[:, start:end], 1)
+
+ start, end = index_map.eyes
+ x[:, start:end] = resize(x[:, start:end], 1)
+ x[:, start:end] = resize(shift(x[:, start:end], -8), 3) + \
+ shift(x[:, start:end], -24)
+
+ # Second-level mask
+ x2 = deepcopy(x)
+ x2[:, index_map.chin.start:index_map.chin.end] = 0 # start:end was 0:33
+ x2[:, index_map.lipedges.start:index_map.lipinner.
+ end] = 0 # start:end was 76:96
+ x2[:, index_map.eyebrows.start:index_map.eyebrows.
+ end] = 0 # start:end was 33:51
+
+ x = paddle.sum(x, axis=1, keepdim=True) # (N, 1, H, W)
+ x2 = paddle.sum(x2, axis=1, keepdim=True) # mask without faceline and mouth
+
+ x = x.numpy()
+ x2 = x2.numpy()
+ x[x != x] = 0 # set nan to zero
+ x2[x != x] = 0 # set nan to zero
+ x = paddle.to_tensor(x)
+ x2 = paddle.to_tensor(x2)
+ return x.clip(0, 1), x2.clip(0, 1)
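+# A minimal usage sketch (illustrative, not taken from the callers in this
+# repo): `heatmaps` is assumed to be an (N, 98, H, W) paddle.Tensor of facial
+# landmark heatmaps.
+#
+#   face_mask, inner_mask = preprocess(heatmaps)
+#   # face_mask:  (N, 1, H, W) mask in [0, 1] covering all landmark regions
+#   # inner_mask: (N, 1, H, W) mask without the face outline and mouth region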
diff --git a/ppgan/solver/__init__.py b/ppgan/solver/__init__.py
index 1b4d1fc7b586773978d80c0a397592b0ca7af5de..41df0560513a1bf2ec33b743df7c69aed61a2d85 100644
--- a/ppgan/solver/__init__.py
+++ b/ppgan/solver/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .lr_scheduler import CosineAnnealingRestartLR, LinearDecay
+from .lr_scheduler import CosineAnnealingRestartLR, LinearDecay, NonLinearDecay
from .optimizer import *
from .builder import build_lr_scheduler
from .builder import build_optimizer
diff --git a/ppgan/solver/lr_scheduler.py b/ppgan/solver/lr_scheduler.py
index aa7cc3de1ddeb95434eafd26b1c2bb8c7c8b8e0b..ca68528a2dc53dccea1799145fb0f328202851cb 100644
--- a/ppgan/solver/lr_scheduler.py
+++ b/ppgan/solver/lr_scheduler.py
@@ -21,6 +21,17 @@ from .builder import LRSCHEDULERS
LRSCHEDULERS.register(MultiStepDecay)
+@LRSCHEDULERS.register()
+class NonLinearDecay(LRScheduler):
+ def __init__(self, learning_rate, lr_decay, last_epoch=-1):
+ self.lr_decay = lr_decay
+ super(NonLinearDecay, self).__init__(learning_rate, last_epoch)
+
+ def get_lr(self):
+ lr = self.base_lr / (1.0 + self.lr_decay * self.last_epoch)
+ return lr
+
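+# Illustrative numbers for NonLinearDecay: with learning_rate=2e-4 and
+# lr_decay=0.01 (made-up values), get_lr() returns 2e-4 at step 0 and
+# 2e-4 / (1 + 0.01 * 100) = 1e-4 at step 100.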
+
@LRSCHEDULERS.register()
class LinearDecay(LambdaDecay):
def __init__(self, learning_rate, start_epoch, decay_epochs,
@@ -34,46 +45,26 @@ class LinearDecay(LambdaDecay):
super().__init__(learning_rate, lambda_rule)
-def get_position_from_periods(iteration, cumulative_period):
- """Get the position from a period list.
-
- It will return the index of the right-closest number in the period list.
- For example, the cumulative_period = [100, 200, 300, 400],
- if iteration == 50, return 0;
- if iteration == 210, return 2;
- if iteration == 300, return 2.
-
- Args:
- iteration (int): Current iteration.
- cumulative_period (list[int]): Cumulative period list.
-
- Returns:
- int: The position of the right-closest number in the period list.
- """
- for i, period in enumerate(cumulative_period):
- if iteration <= period:
- return i
-
-
@LRSCHEDULERS.register()
class CosineAnnealingRestartLR(LRScheduler):
""" Cosine annealing with restarts learning rate scheme.
- An example of config:
- periods = [10, 10, 10, 10]
- restart_weights = [1, 0.5, 0.5, 0.5]
- eta_min=1e-7
+ An example config from configs/edvr_l_blur_wo_tsa.yaml:
+ learning_rate: !!float 4e-4
+ periods: [150000, 150000, 150000, 150000]
+ restart_weights: [1, 1, 1, 1]
+ eta_min: !!float 1e-7
- It has four cycles, each has 10 iterations. At 10th, 20th, 30th, the
- scheduler will restart with the weights in restart_weights.
+    It has four cycles, each with 150000 iterations. At the 150000th, 300000th,
+    and 450000th iterations, the scheduler restarts with the weights in restart_weights.
Args:
- learning_rate (float|paddle.nn.optimizer): PaddlePaddle optimizer.
+ learning_rate (float): Base learning rate.
periods (list): Period for each cosine anneling cycle.
restart_weights (list): Restart weights at each restart iteration.
Default: [1].
- eta_min (float): The mimimum lr. Default: 0.
- last_epoch (int): Used in _LRScheduler. Default: -1.
+        eta_min (float): The minimum learning rate of the cosine annealing cycle. Default: 0.
+        last_epoch (int): Used in paddle.optimizer.lr.LRScheduler. Default: -1.
"""
def __init__(self,
learning_rate,
@@ -93,10 +84,14 @@ class CosineAnnealingRestartLR(LRScheduler):
last_epoch)
def get_lr(self):
- idx = get_position_from_periods(self.last_epoch, self.cumulative_period)
- current_weight = self.restart_weights[idx]
- nearest_restart = 0 if idx == 0 else self.cumulative_period[idx - 1]
- current_period = self.periods[idx]
+ for i, period in enumerate(self.cumulative_period):
+ if self.last_epoch <= period:
+ index = i
+ break
+
+ current_weight = self.restart_weights[index]
+ nearest_restart = 0 if index == 0 else self.cumulative_period[index - 1]
+ current_period = self.periods[index]
lr = self.eta_min + current_weight * 0.5 * (
self.base_lr - self.eta_min) * (1 + math.cos(math.pi * (
diff --git a/ppgan/solver/optimizer.py b/ppgan/solver/optimizer.py
index 36345b54cffdcd89e2349ac13a05a6b24f86f1fd..bc9c48d0c9167c22279c95ff504d4fa45f9d7bcc 100644
--- a/ppgan/solver/optimizer.py
+++ b/ppgan/solver/optimizer.py
@@ -21,3 +21,4 @@ OPTIMIZERS.register(paddle.optimizer.Adam)
OPTIMIZERS.register(paddle.optimizer.SGD)
OPTIMIZERS.register(paddle.optimizer.Momentum)
OPTIMIZERS.register(paddle.optimizer.RMSProp)
+OPTIMIZERS.register(paddle.optimizer.AdamW)
diff --git a/ppgan/utils/animate.py b/ppgan/utils/animate.py
index 414120a044f642529efb49db8ef198f576d8efcf..e25e35ea7e66b81be31d49646473aa8e62469cfa 100644
--- a/ppgan/utils/animate.py
+++ b/ppgan/utils/animate.py
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# code was heavily based on https://github.com/AliaksandrSiarohin/first-order-model
+
import numpy as np
from scipy.spatial import ConvexHull
diff --git a/ppgan/utils/audio.py b/ppgan/utils/audio.py
index 9cf1e74208e9552062bf0922b26e4b6433e3b5b2..432305bdc669c9203b4e227918d0fbe9f3077581 100644
--- a/ppgan/utils/audio.py
+++ b/ppgan/utils/audio.py
@@ -1,3 +1,7 @@
+# code was heavily based on https://github.com/Rudrabha/Wav2Lip
+# Users should be careful when adopting these functions for any commercial use.
+# https://github.com/Rudrabha/Wav2Lip#license-and-citation
+
import numpy as np
from scipy import signal
from scipy.io import wavfile
diff --git a/ppgan/utils/audio_config.py b/ppgan/utils/audio_config.py
index a104499424c426e022455a0a9415655a057479be..bb1a3888d615de5839d8d02de8e1212c3dd16a7c 100644
--- a/ppgan/utils/audio_config.py
+++ b/ppgan/utils/audio_config.py
@@ -1,28 +1,28 @@
-from easydict import EasyDict as edict
+from .config import AttrDict
-_C = edict()
+_audio_cfg = AttrDict()
-_C.num_mels = 80
-_C.rescale = True
-_C.rescaling_max = 0.9
-_C.use_lws = False
-_C.n_fft = 800
-_C.hop_size = 200
-_C.win_size = 800
-_C.sample_rate = 16000
-_C.frame_shift_ms = None
-_C.signal_normalization = True
-_C.allow_clipping_in_normalization = True
-_C.symmetric_mels = True
-_C.max_abs_value = 4.
-_C.preemphasize = True
-_C.preemphasis = 0.97
-_C.min_level_db = -100
-_C.ref_level_db = 20
-_C.fmin = 55
-_C.fmax = 7600
-_C.fps = 25
+_audio_cfg.num_mels = 80
+_audio_cfg.rescale = True
+_audio_cfg.rescaling_max = 0.9
+_audio_cfg.use_lws = False
+_audio_cfg.n_fft = 800
+_audio_cfg.hop_size = 200
+_audio_cfg.win_size = 800
+_audio_cfg.sample_rate = 16000
+_audio_cfg.frame_shift_ms = None
+_audio_cfg.signal_normalization = True
+_audio_cfg.allow_clipping_in_normalization = True
+_audio_cfg.symmetric_mels = True
+_audio_cfg.max_abs_value = 4.
+_audio_cfg.preemphasize = True
+_audio_cfg.preemphasis = 0.97
+_audio_cfg.min_level_db = -100
+_audio_cfg.ref_level_db = 20
+_audio_cfg.fmin = 55
+_audio_cfg.fmax = 7600
+_audio_cfg.fps = 25
def get_audio_config():
- return _C
+ return _audio_cfg
diff --git a/ppgan/utils/config.py b/ppgan/utils/config.py
index 1c0e3e8ebb0d63e3e6e648c67a80e58c3121540b..c98e9fc26afa4f1204b28da25b60afc2dcee7f62 100644
--- a/ppgan/utils/config.py
+++ b/ppgan/utils/config.py
@@ -19,7 +19,6 @@ __all__ = ['get_config']
class AttrDict(dict):
def __getattr__(self, key):
- # return self[key]
try:
return self[key]
except KeyError:
@@ -125,3 +124,13 @@ def get_config(fname, overrides=None, show=True):
override_config(config, overrides)
return config
+
+
+def cfg2dict(cfg):
+ if isinstance(cfg, AttrDict):
+ cfg = dict(cfg)
+ for k in cfg.keys():
+ if isinstance(cfg[k], AttrDict):
+ cfg[k] = cfg2dict(cfg[k])
+ return cfg
+
\ No newline at end of file
diff --git a/ppgan/utils/download.py b/ppgan/utils/download.py
index 016358404180a29471da3cc55992f2becffd3867..3261d5a8dde5a40fa2d07fb9fce0b3f64fbe1433 100644
--- a/ppgan/utils/download.py
+++ b/ppgan/utils/download.py
@@ -29,7 +29,7 @@ from tqdm import tqdm
from .logger import get_logger
-PPGAN_HOME = os.path.expanduser("~/.cache/ppgan/")
+PPGAN_HOME = os.path.expanduser(os.path.join('~', '.cache', 'ppgan'))
DOWNLOAD_RETRY_LIMIT = 3
@@ -64,7 +64,7 @@ def get_path_from_url(url, md5sum=None, check_exist=True):
str: a local path to save downloaded models & weights & datasets.
"""
- from paddle.fluid.dygraph.parallel import ParallelEnv
+ from paddle.distributed import ParallelEnv
assert is_url(url), "downloading from {} not a url".format(url)
root_dir = PPGAN_HOME
@@ -243,14 +243,14 @@ def _uncompress_file_tar(filepath, mode="r:*"):
def _is_a_single_file(file_list):
- if len(file_list) == 1 and file_list[0].find(os.sep) < -1:
+    if len(file_list) == 1 and file_list[0].find('/') < 0:
return True
return False
def _is_a_single_dir(file_list):
- file_name = file_list[0].split(os.sep)[0]
+ file_name = file_list[0].split('/')[0]
for i in range(1, len(file_list)):
- if file_name != file_list[i].split(os.sep)[0]:
+ if file_name != file_list[i].split('/')[0]:
return False
return True
diff --git a/ppgan/utils/filesystem.py b/ppgan/utils/filesystem.py
index 43774dcc8c83d13cc2491c1064387f78a2880848..a43c18f985bc623d31790ff4bbea172687ad4ff7 100644
--- a/ppgan/utils/filesystem.py
+++ b/ppgan/utils/filesystem.py
@@ -34,7 +34,7 @@ def save(state_dicts, file_name):
for k, v in state_dict.items():
if isinstance(
v,
- (paddle.fluid.framework.Variable, paddle.fluid.core.VarBase)):
+ (paddle.static.Variable, paddle.Tensor)):
model_dict[k] = v.numpy()
else:
model_dict[k] = v
@@ -45,7 +45,7 @@ def save(state_dicts, file_name):
for k, v in state_dicts.items():
if isinstance(
v,
- (paddle.fluid.framework.Variable, paddle.fluid.core.VarBase)):
+ (paddle.static.Variable, paddle.Tensor)):
final_dict = convert(state_dicts)
break
elif isinstance(v, dict):
@@ -53,8 +53,7 @@ def save(state_dicts, file_name):
else:
final_dict[k] = v
- with open(file_name, 'wb') as f:
- pickle.dump(final_dict, f, protocol=2)
+ paddle.save(final_dict, file_name)
def load(file_name):
diff --git a/ppgan/utils/gfpgan_tools.py b/ppgan/utils/gfpgan_tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..80d59b7def928ef853616f5238b9b50718b890f1
--- /dev/null
+++ b/ppgan/utils/gfpgan_tools.py
@@ -0,0 +1,1127 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import math
+import numpy as np
+import random
+import os
+import os.path as osp
+from abc import ABCMeta
+from abc import abstractmethod
+import paddle
+import paddle.nn.functional as F
+from paddle.vision.transforms.functional import normalize
+
+
+def _blend(img1, img2, ratio):
+ ratio = float(ratio)
+ bound = 1.0 if paddle.is_floating_point(img1) else 255.0
+ return (ratio * img1 + (1.0 - ratio) * img2).clip(0, bound)
+
+
+def _get_image_num_channels(img):
+ if img.ndim == 2:
+ return 1
+ elif img.ndim > 2:
+ return img.shape[-3]
+
+ raise TypeError("Input ndim should be 2 or more. Got {}".format(img.ndim))
+
+
+def _rgb2hsv(img):
+ r, g, b = img.unbind(axis=-3)
+
+ # Implementation is based on https://github.com/python-pillow/Pillow/blob/4174d4267616897df3746d315d5a2d0f82c656ee/
+ # src/libImaging/Convert.c#L330
+ maxc = paddle.max(img, axis=-3)
+ minc = paddle.min(img, axis=-3)
+
+ # The algorithm erases S and H channel where `maxc = minc`. This avoids NaN
+ # from happening in the results, because
+ # + S channel has division by `maxc`, which is zero only if `maxc = minc`
+ # + H channel has division by `(maxc - minc)`.
+ #
+    # Instead of overwriting NaN afterwards, we just prevent it from occurring so
+ # we don't need to deal with it in case we save the NaN in a buffer in
+ # backprop, if it is ever supported, but it doesn't hurt to do so.
+ eqc = maxc == minc
+
+ cr = maxc - minc
+ # Since `eqc => cr = 0`, replacing denominator with 1 when `eqc` is fine.
+ ones = paddle.ones_like(maxc)
+ s = cr / paddle.where(eqc, ones, maxc)
+ # Note that `eqc => maxc = minc = r = g = b`. So the following calculation
+ # of `h` would reduce to `bc - gc + 2 + rc - bc + 4 + rc - bc = 6` so it
+ # would not matter what values `rc`, `gc`, and `bc` have here, and thus
+ # replacing denominator with 1 when `eqc` is fine.
+ cr_divisor = paddle.where(eqc, ones, cr)
+ rc = (maxc - r) / cr_divisor
+ gc = (maxc - g) / cr_divisor
+ bc = (maxc - b) / cr_divisor
+
+ t_zero = paddle.zeros_like(bc)
+ hr = paddle.where(maxc == r, (bc - gc), t_zero)
+ hg = paddle.where((maxc == g) & (maxc != r), (2.0 + rc - bc), t_zero)
+ hb = paddle.where((maxc != g) & (maxc != r), (4.0 + gc - rc), t_zero)
+
+ h = (hr + hg + hb)
+ h = paddle.mod((h / 6.0 + 1.0), paddle.to_tensor([1.0]))
+ return paddle.stack((h, s, maxc), axis=-3)
+
+
+def _hsv2rgb(img):
+ h, s, v = img.unbind(axis=-3)
+ i = paddle.floor(h * 6.0)
+ f = (h * 6.0) - i
+ i = paddle.cast(i, dtype='int32')
+
+ p = paddle.clip((v * (1.0 - s)), 0.0, 1.0)
+ q = paddle.clip((v * (1.0 - s * f)), 0.0, 1.0)
+ t = paddle.clip((v * (1.0 - s * (1.0 - f))), 0.0, 1.0)
+ i = i % 6
+
+ mask = i.unsqueeze(axis=-3) == paddle.arange(6).reshape([-1, 1, 1])
+
+ a1 = paddle.stack((v, q, p, p, t, v), axis=-3)
+ a2 = paddle.stack((t, v, v, q, p, p), axis=-3)
+ a3 = paddle.stack((p, p, t, v, v, q), axis=-3)
+ a4 = paddle.stack((a1, a2, a3), axis=-4)
+ t_zero = paddle.zeros_like(mask, dtype='float32')
+ t_ones = paddle.ones_like(mask, dtype='float32')
+ mask = paddle.where(mask, t_ones, t_zero)
+ return paddle.einsum("...ijk, ...xijk -> ...xjk", mask, a4)
+
+
+def rgb_to_grayscale(img, num_output_channels=1):
+ if img.ndim < 3:
+ raise TypeError(
+            "Input image tensor should have at least 3 dimensions, but found {}"
+ .format(img.ndim))
+
+ if num_output_channels not in (1, 3):
+ raise ValueError('num_output_channels should be either 1 or 3')
+
+ r, g, b = img.unbind(axis=-3)
+ l_img = (0.2989 * r + 0.587 * g + 0.114 * b)
+ l_img = l_img.unsqueeze(axis=-3)
+
+ if num_output_channels == 3:
+ return l_img.expand(img.shape)
+
+ return l_img
+
+
+def adjust_brightness(img, brightness_factor):
+ if brightness_factor < 0:
+ raise ValueError('brightness_factor ({}) is not non-negative.'.format(
+ brightness_factor))
+
+ return _blend(img, paddle.zeros_like(img), brightness_factor)
+
+
+def adjust_contrast(img, contrast_factor):
+ if contrast_factor < 0:
+ raise ValueError(
+ 'contrast_factor ({}) is not non-negative.'.format(contrast_factor))
+
+ dtype = img.dtype if paddle.is_floating_point(img) else paddle.float32
+ mean = paddle.mean(paddle.cast(rgb_to_grayscale(img), dtype=dtype),
+ axis=(-3, -2, -1),
+ keepdim=True)
+
+ return _blend(img, mean, contrast_factor)
+
+
+def adjust_hue(img, hue_factor):
+ if not (-0.5 <= hue_factor <= 0.5):
+ raise ValueError(
+ 'hue_factor ({}) is not in [-0.5, 0.5].'.format(hue_factor))
+
+ if not (isinstance(img, paddle.Tensor)):
+ raise TypeError('Input img should be Tensor image')
+
+ if _get_image_num_channels(img) == 1: # Match PIL behaviour
+ return img
+
+ orig_dtype = img.dtype
+ if img.dtype == paddle.uint8:
+ img = paddle.cast(img, dtype='float32') / 255.0
+
+ img = _rgb2hsv(img)
+ h, s, v = img.unbind(axis=-3)
+ h = (h + hue_factor) % 1.0
+ img = paddle.stack((h, s, v), axis=-3)
+ img_hue_adj = _hsv2rgb(img)
+
+ if orig_dtype == paddle.uint8:
+ img_hue_adj = paddle.cast(img_hue_adj * 255.0, dtype=orig_dtype)
+
+ return img_hue_adj
+
+
+def adjust_saturation(img, saturation_factor):
+ if saturation_factor < 0:
+ raise ValueError('saturation_factor ({}) is not non-negative.'.format(
+ saturation_factor))
+
+ return _blend(img, rgb_to_grayscale(img), saturation_factor)
+
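+# A small usage sketch for the color-jitter helpers above (illustrative only;
+# `img` is assumed to be a (3, H, W) float32 paddle.Tensor in [0, 1]):
+#
+#   img = adjust_brightness(img, 1.2)
+#   img = adjust_contrast(img, 0.8)
+#   img = adjust_saturation(img, 1.1)
+#   img = adjust_hue(img, 0.05)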
+
+def generate_gaussian_noise(img, sigma=10, gray_noise=False):
+ """Generate Gaussian noise.
+
+ Args:
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
+ sigma (float): Noise scale (measured in range 255). Default: 10.
+
+ Returns:
+ (Numpy array): Returned noisy image, shape (h, w, c), range[0, 1],
+ float32.
+ """
+ if gray_noise:
+ noise = np.float32(np.random.randn(*img.shape[0:2])) * sigma / 255.0
+ noise = np.expand_dims(noise, axis=2).repeat(3, axis=2)
+ else:
+ noise = np.float32(np.random.randn(*img.shape)) * sigma / 255.0
+ return noise
+
+
+def random_generate_gaussian_noise(img, sigma_range=(0, 10), gray_prob=0):
+ sigma = np.random.uniform(sigma_range[0], sigma_range[1])
+ if np.random.uniform() < gray_prob:
+ gray_noise = True
+ else:
+ gray_noise = False
+ return generate_gaussian_noise(img, sigma, gray_noise)
+
+
+def random_add_gaussian_noise(img, sigma_range=(0, 1.0), gray_prob=0,
+                              clip=True, rounds=False):
+ noise = random_generate_gaussian_noise(img, sigma_range, gray_prob)
+ out = img + noise
+ if clip and rounds:
+ out = np.clip((out * 255.0).round(), 0, 255) / 255.0
+ elif clip:
+ out = np.clip(out, 0, 1)
+ elif rounds:
+ out = (out * 255.0).round() / 255.0
+ return out
+
+
+def add_jpg_compression(img, quality=90):
+ """Add JPG compression artifacts.
+
+ Args:
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
+ quality (float): JPG compression quality. 0 for lowest quality, 100 for
+ best quality. Default: 90.
+
+ Returns:
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
+ float32.
+ """
+ img = np.clip(img, 0, 1)
+ encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), quality]
+ _, encimg = cv2.imencode('.jpg', img * 255.0, encode_param)
+ img = np.float32(cv2.imdecode(encimg, 1)) / 255.0
+ return img
+
+
+def random_add_jpg_compression(img, quality_range=(90, 100)):
+ """Randomly add JPG compression artifacts.
+
+ Args:
+ img (Numpy array): Input image, shape (h, w, c), range [0, 1], float32.
+ quality_range (tuple[float] | list[float]): JPG compression quality
+ range. 0 for lowest quality, 100 for best quality.
+ Default: (90, 100).
+
+ Returns:
+ (Numpy array): Returned image after JPG, shape (h, w, c), range[0, 1],
+ float32.
+ """
+ quality = int(np.random.uniform(quality_range[0], quality_range[1]))
+ return add_jpg_compression(img, quality)
+
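+# A minimal degradation sketch combining the noise and JPEG helpers above
+# (illustrative; `img` is assumed to be an HWC float32 numpy image in [0, 1]):
+#
+#   img = random_add_gaussian_noise(img, sigma_range=(0, 10), gray_prob=0.4)
+#   img = random_add_jpg_compression(img, quality_range=(60, 95))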
+
+def random_mixed_kernels(kernel_list,
+ kernel_prob,
+ kernel_size=21,
+ sigma_x_range=(0.6, 5),
+ sigma_y_range=(0.6, 5),
+ rotation_range=(-math.pi, math.pi),
+ betag_range=(0.5, 8),
+ betap_range=(0.5, 8),
+ noise_range=None):
+ """Randomly generate mixed kernels.
+
+ Args:
+        kernel_list (tuple): a list of kernel type names; only 'iso' and
+            'aniso' are implemented in this function
+ kernel_prob (tuple): corresponding kernel probability for each
+ kernel type
+ kernel_size (int):
+ sigma_x_range (tuple): [0.6, 5]
+ sigma_y_range (tuple): [0.6, 5]
+        rotation_range (tuple): [-math.pi, math.pi]
+        betag_range (tuple): [0.5, 8]
+        betap_range (tuple): [0.5, 8]
+ noise_range(tuple, optional): multiplicative kernel noise,
+ [0.75, 1.25]. Default: None
+
+ Returns:
+ kernel (ndarray):
+ """
+ kernel_type = random.choices(kernel_list, kernel_prob)[0]
+ if kernel_type == 'iso':
+ kernel = random_bivariate_Gaussian(kernel_size,
+ sigma_x_range,
+ sigma_y_range,
+ rotation_range,
+ noise_range=noise_range,
+ isotropic=True)
+ elif kernel_type == 'aniso':
+ kernel = random_bivariate_Gaussian(kernel_size,
+ sigma_x_range,
+ sigma_y_range,
+ rotation_range,
+ noise_range=noise_range,
+ isotropic=False)
+ return kernel
+
+
+def random_bivariate_Gaussian(kernel_size,
+ sigma_x_range,
+ sigma_y_range,
+ rotation_range,
+ noise_range=None,
+ isotropic=True):
+ """Randomly generate bivariate isotropic or anisotropic Gaussian kernels.
+
+    In the isotropic mode, only `sigma_x_range` is used. `sigma_y_range` and `rotation_range` are ignored.
+
+ Args:
+ kernel_size (int):
+ sigma_x_range (tuple): [0.6, 5]
+ sigma_y_range (tuple): [0.6, 5]
+        rotation_range (tuple): [-math.pi, math.pi]
+ noise_range(tuple, optional): multiplicative kernel noise,
+ [0.75, 1.25]. Default: None
+
+ Returns:
+ kernel (ndarray):
+ """
+ assert kernel_size % 2 == 1, 'Kernel size must be an odd number.'
+ assert sigma_x_range[0] < sigma_x_range[1], 'Wrong sigma_x_range.'
+ sigma_x = np.random.uniform(sigma_x_range[0], sigma_x_range[1])
+ if isotropic is False:
+ assert sigma_y_range[0] < sigma_y_range[1], 'Wrong sigma_y_range.'
+ assert rotation_range[0] < rotation_range[1], 'Wrong rotation_range.'
+ sigma_y = np.random.uniform(sigma_y_range[0], sigma_y_range[1])
+ rotation = np.random.uniform(rotation_range[0], rotation_range[1])
+ else:
+ sigma_y = sigma_x
+ rotation = 0
+ kernel = bivariate_Gaussian(kernel_size,
+ sigma_x,
+ sigma_y,
+ rotation,
+ isotropic=isotropic)
+ if noise_range is not None:
+ assert noise_range[0] < noise_range[1], 'Wrong noise range.'
+        noise = np.random.uniform(noise_range[0], noise_range[1],
+                                  size=kernel.shape)
+ kernel = kernel * noise
+ kernel = kernel / np.sum(kernel)
+ return kernel
+
+
+def bivariate_Gaussian(kernel_size,
+ sig_x,
+ sig_y,
+ theta,
+ grid=None,
+ isotropic=True):
+ """Generate a bivariate isotropic or anisotropic Gaussian kernel.
+
+    In the isotropic mode, only `sig_x` is used. `sig_y` and `theta` are ignored.
+
+ Args:
+ kernel_size (int):
+ sig_x (float):
+ sig_y (float):
+ theta (float): Radian measurement.
+ grid (ndarray, optional): generated by :func:`mesh_grid`,
+ with the shape (K, K, 2), K is the kernel size. Default: None
+ isotropic (bool):
+
+ Returns:
+ kernel (ndarray): normalized kernel.
+ """
+ if grid is None:
+ grid, _, _ = mesh_grid(kernel_size)
+ if isotropic:
+ sigma_matrix = np.array([[sig_x**2, 0], [0, sig_x**2]])
+ else:
+ sigma_matrix = sigma_matrix2(sig_x, sig_y, theta)
+ kernel = pdf2(sigma_matrix, grid)
+ kernel = kernel / np.sum(kernel)
+ return kernel
+
+
+def sigma_matrix2(sig_x, sig_y, theta):
+ """Calculate the rotated sigma matrix (two dimensional matrix).
+
+ Args:
+ sig_x (float):
+ sig_y (float):
+ theta (float): Radian measurement.
+
+ Returns:
+ ndarray: Rotated sigma matrix.
+ """
+ d_matrix = np.array([[sig_x**2, 0], [0, sig_y**2]])
+ u_matrix = np.array([[np.cos(theta), -np.sin(theta)],
+ [np.sin(theta), np.cos(theta)]])
+ return np.dot(u_matrix, np.dot(d_matrix, u_matrix.T))
+
+
+def mesh_grid(kernel_size):
+ """Generate the mesh grid, centering at zero.
+
+ Args:
+ kernel_size (int):
+
+ Returns:
+ xy (ndarray): with the shape (kernel_size, kernel_size, 2)
+ xx (ndarray): with the shape (kernel_size, kernel_size)
+ yy (ndarray): with the shape (kernel_size, kernel_size)
+ """
+ ax = np.arange(-kernel_size // 2 + 1.0, kernel_size // 2 + 1.0)
+ xx, yy = np.meshgrid(ax, ax)
+ xy = np.hstack((xx.reshape((kernel_size * kernel_size, 1)),
+ yy.reshape(kernel_size * kernel_size,
+ 1))).reshape(kernel_size, kernel_size, 2)
+ return xy, xx, yy
+
+
+def pdf2(sigma_matrix, grid):
+ """Calculate PDF of the bivariate Gaussian distribution.
+
+ Args:
+ sigma_matrix (ndarray): with the shape (2, 2)
+ grid (ndarray): generated by :func:`mesh_grid`,
+ with the shape (K, K, 2), K is the kernel size.
+
+ Returns:
+        kernel (ndarray): un-normalized kernel.
+ """
+ inverse_sigma = np.linalg.inv(sigma_matrix)
+ kernel = np.exp(-0.5 * np.sum(np.dot(grid, inverse_sigma) * grid, 2))
+ return kernel
+
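+# Illustrative use of the kernel helpers above: build a 21x21 isotropic
+# Gaussian blur kernel and convolve an HWC float32 image with it (`img` is
+# assumed to exist in the caller):
+#
+#   kernel = bivariate_Gaussian(21, sig_x=2.0, sig_y=2.0, theta=0, isotropic=True)
+#   blurred = cv2.filter2D(img, -1, kernel)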
+
+def paths_from_folder(folder):
+ """Generate paths from folder.
+
+ Args:
+ folder (str): Folder path.
+
+ Returns:
+ list[str]: Returned path list.
+ """
+ paths = list(scandir(folder))
+ paths = [osp.join(folder, path) for path in paths]
+ return paths
+
+
+def scandir(dir_path, suffix=None, recursive=False, full_path=False):
+ """Scan a directory to find the interested files.
+
+ Args:
+ dir_path (str): Path of the directory.
+ suffix (str | tuple(str), optional): File suffix that we are
+ interested in. Default: None.
+ recursive (bool, optional): If set to True, recursively scan the
+ directory. Default: False.
+ full_path (bool, optional): If set to True, include the dir_path.
+ Default: False.
+
+ Returns:
+ A generator for all the interested files with relative paths.
+ """
+ if suffix is not None and not isinstance(suffix, (str, tuple)):
+ raise TypeError('"suffix" must be a string or tuple of strings')
+ root = dir_path
+
+ def _scandir(dir_path, suffix, recursive):
+ for entry in os.scandir(dir_path):
+ if not entry.name.startswith('.') and entry.is_file():
+ if full_path:
+ return_path = entry.path
+ else:
+ return_path = osp.relpath(entry.path, root)
+ if suffix is None:
+ yield return_path
+ elif return_path.endswith(suffix):
+ yield return_path
+ elif recursive:
+                yield from _scandir(entry.path, suffix=suffix,
+                                    recursive=recursive)
+ else:
+ continue
+
+ return _scandir(dir_path, suffix=suffix, recursive=recursive)
+
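+# Illustrative usage of scandir (the directory path is hypothetical):
+#
+#   png_paths = list(scandir('datasets/DIV2K/train_HR', suffix='.png',
+#                            recursive=True))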
+
+class BaseStorageBackend(metaclass=ABCMeta):
+ """Abstract class of storage backends.
+
+ All backends need to implement two apis: ``get()`` and ``get_text()``.
+ ``get()`` reads the file as a byte stream and ``get_text()`` reads the file
+ as texts.
+ """
+ @abstractmethod
+ def get(self, filepath):
+ pass
+
+ @abstractmethod
+ def get_text(self, filepath):
+ pass
+
+
+class MemcachedBackend(BaseStorageBackend):
+ """Memcached storage backend.
+
+ Attributes:
+ server_list_cfg (str): Config file for memcached server list.
+ client_cfg (str): Config file for memcached client.
+ sys_path (str | None): Additional path to be appended to `sys.path`.
+ Default: None.
+ """
+ def __init__(self, server_list_cfg, client_cfg, sys_path=None):
+ if sys_path is not None:
+ import sys
+ sys.path.append(sys_path)
+ try:
+ import mc
+ except ImportError:
+ raise ImportError(
+ 'Please install memcached to enable MemcachedBackend.')
+ self.server_list_cfg = server_list_cfg
+ self.client_cfg = client_cfg
+ self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg,
+ self.client_cfg)
+ self._mc_buffer = mc.pyvector()
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ import mc
+ self._client.Get(filepath, self._mc_buffer)
+ value_buf = mc.ConvertBuffer(self._mc_buffer)
+ return value_buf
+
+ def get_text(self, filepath):
+ raise NotImplementedError
+
+
+class HardDiskBackend(BaseStorageBackend):
+ """Raw hard disks storage backend."""
+ def get(self, filepath):
+ filepath = str(filepath)
+ with open(filepath, 'rb') as f:
+ value_buf = f.read()
+ return value_buf
+
+ def get_text(self, filepath):
+ filepath = str(filepath)
+ with open(filepath, 'r') as f:
+ value_buf = f.read()
+ return value_buf
+
+
+class LmdbBackend(BaseStorageBackend):
+ """Lmdb storage backend.
+
+ Args:
+ db_paths (str | list[str]): Lmdb database paths.
+ client_keys (str | list[str]): Lmdb client keys. Default: 'default'.
+ readonly (bool, optional): Lmdb environment parameter. If True,
+ disallow any write operations. Default: True.
+ lock (bool, optional): Lmdb environment parameter. If False, when
+ concurrent access occurs, do not lock the database. Default: False.
+ readahead (bool, optional): Lmdb environment parameter. If False,
+ disable the OS filesystem readahead mechanism, which may improve
+ random read performance when a database is larger than RAM.
+ Default: False.
+
+ Attributes:
+ db_paths (list): Lmdb database path.
+ _client (list): A list of several lmdb envs.
+ """
+ def __init__(self,
+ db_paths,
+ client_keys='default',
+ readonly=True,
+ lock=False,
+ readahead=False,
+ **kwargs):
+ try:
+ import lmdb
+ except ImportError:
+ raise ImportError('Please install lmdb to enable LmdbBackend.')
+ if isinstance(client_keys, str):
+ client_keys = [client_keys]
+ if isinstance(db_paths, list):
+ self.db_paths = [str(v) for v in db_paths]
+ elif isinstance(db_paths, str):
+ self.db_paths = [str(db_paths)]
+ assert len(client_keys) == len(
+ self.db_paths
+ ), f'client_keys and db_paths should have the same length, but received {len(client_keys)} and {len(self.db_paths)}.'
+ self._client = {}
+ for client, path in zip(client_keys, self.db_paths):
+            self._client[client] = lmdb.open(path,
+                                             readonly=readonly,
+                                             lock=lock,
+                                             readahead=readahead,
+                                             **kwargs)
+
+ def get(self, filepath, client_key):
+ """Get values according to the filepath from one lmdb named client_key.
+
+ Args:
+ filepath (str | obj:`Path`): Here, filepath is the lmdb key.
+ client_key (str): Used for distinguishing different lmdb envs.
+ """
+ filepath = str(filepath)
+ assert client_key in self._client, f'client_key {client_key} is not in lmdb clients.'
+ client = self._client[client_key]
+ with client.begin(write=False) as txn:
+ value_buf = txn.get(filepath.encode('ascii'))
+ return value_buf
+
+ def get_text(self, filepath):
+ raise NotImplementedError
+
+
+class FileClient(object):
+ """A general file client to access files in different backend.
+
+    The client loads a file or text from a specified backend given its path,
+    and returns it as a binary stream. It can also register other backend
+    accessors with a given name and backend class.
+
+ Attributes:
+ backend (str): The storage backend type. Options are "disk",
+ "memcached" and "lmdb".
+ client (:obj:`BaseStorageBackend`): The backend object.
+ """
+ _backends = {
+ 'disk': HardDiskBackend,
+ 'memcached': MemcachedBackend,
+ 'lmdb': LmdbBackend
+ }
+
+ def __init__(self, backend='disk', **kwargs):
+ if backend not in self._backends:
+ raise ValueError(
+ f'Backend {backend} is not supported. Currently supported ones are {list(self._backends.keys())}'
+ )
+ self.backend = backend
+ self.client = self._backends[backend](**kwargs)
+
+ def get(self, filepath, client_key='default'):
+ if self.backend == 'lmdb':
+ return self.client.get(filepath, client_key)
+ else:
+ return self.client.get(filepath)
+
+ def get_text(self, filepath):
+ return self.client.get_text(filepath)
+
+
+def imfrombytes(content, flag='color', float32=False):
+ """Read an image from bytes.
+
+ Args:
+ content (bytes): Image bytes got from files or other streams.
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are `color`, `grayscale` and `unchanged`.
+        float32 (bool): Whether to change to float32. If True, the result will
+            also be normalized to [0, 1]. Default: False.
+
+ Returns:
+ ndarray: Loaded image array.
+ """
+ img_np = np.frombuffer(content, np.uint8)
+ imread_flags = {
+ 'color': cv2.IMREAD_COLOR,
+ 'grayscale': cv2.IMREAD_GRAYSCALE,
+ 'unchanged': cv2.IMREAD_UNCHANGED
+ }
+ img = cv2.imdecode(img_np, imread_flags[flag])
+ if float32:
+ img = img.astype(np.float32) / 255.0
+ return img
+
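+# Illustrative usage of FileClient together with imfrombytes (the file path
+# is hypothetical):
+#
+#   file_client = FileClient('disk')
+#   img_bytes = file_client.get('datasets/DIV2K/train_HR/0001.png')
+#   img = imfrombytes(img_bytes, float32=True)  # HWC BGR float32 in [0, 1]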
+
+def img2tensor(imgs, bgr2rgb=True, float32=True):
+ """Numpy array to tensor.
+
+ Args:
+ imgs (list[ndarray] | ndarray): Input images.
+ bgr2rgb (bool): Whether to change bgr to rgb.
+ float32 (bool): Whether to change to float32.
+
+    Returns:
+        list[ndarray] | ndarray: CHW images (conversion to a paddle Tensor is
+            left to the caller). If the result only has one element, the single
+            array is returned directly.
+ """
+ def _totensor(img, bgr2rgb, float32):
+ if img.shape[2] == 3 and bgr2rgb:
+ if img.dtype == 'float64':
+ img = img.astype('float32')
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+
+ return img.transpose(2, 0, 1)
+
+ if isinstance(imgs, list):
+ return [_totensor(img, bgr2rgb, float32) for img in imgs]
+ else:
+ return _totensor(imgs, bgr2rgb, float32)
+
+
+def mod_crop(img, scale):
+ """Mod crop images, used during testing.
+
+ Args:
+ img (ndarray): Input image.
+ scale (int): Scale factor.
+
+ Returns:
+ ndarray: Result image.
+ """
+ img = img.copy()
+ if img.ndim in (2, 3):
+ h, w = img.shape[0], img.shape[1]
+ h_remainder, w_remainder = h % scale, w % scale
+ img = img[:h - h_remainder, :w - w_remainder, ...]
+ else:
+ raise ValueError(f'Wrong img ndim: {img.ndim}.')
+ return img
+
+
+def paired_random_crop(img_gts, img_lqs, gt_patch_size, scale, gt_path=None):
+ """Paired random crop. Support Numpy array and Tensor inputs.
+
+ It crops lists of lq and gt images with corresponding locations.
+
+ Args:
+ img_gts (list[ndarray] | ndarray | list[Tensor] | Tensor): GT images. Note that all images
+ should have the same shape. If the input is an ndarray, it will
+ be transformed to a list containing itself.
+ img_lqs (list[ndarray] | ndarray): LQ images. Note that all images
+ should have the same shape. If the input is an ndarray, it will
+ be transformed to a list containing itself.
+ gt_patch_size (int): GT patch size.
+ scale (int): Scale factor.
+ gt_path (str): Path to ground-truth. Default: None.
+
+ Returns:
+ list[ndarray] | ndarray: GT images and LQ images. If returned results
+ only have one element, just return ndarray.
+ """
+ if not isinstance(img_gts, list):
+ img_gts = [img_gts]
+ if not isinstance(img_lqs, list):
+ img_lqs = [img_lqs]
+ input_type = 'Tensor' if isinstance(img_gts[0], paddle.Tensor) else 'Numpy'
+ if input_type == 'Tensor':
+        h_lq, w_lq = img_lqs[0].shape[-2:]
+        h_gt, w_gt = img_gts[0].shape[-2:]
+ else:
+ h_lq, w_lq = img_lqs[0].shape[0:2]
+ h_gt, w_gt = img_gts[0].shape[0:2]
+ lq_patch_size = gt_patch_size // scale
+ if h_gt != h_lq * scale or w_gt != w_lq * scale:
+ raise ValueError(
+ f'Scale mismatches. GT ({h_gt}, {w_gt}) is not {scale}x ',
+ f'multiplication of LQ ({h_lq}, {w_lq}).')
+ if h_lq < lq_patch_size or w_lq < lq_patch_size:
+ raise ValueError(
+ f'LQ ({h_lq}, {w_lq}) is smaller than patch size ({lq_patch_size}, {lq_patch_size}). Please remove {gt_path}.'
+ )
+ top = random.randint(0, h_lq - lq_patch_size)
+ left = random.randint(0, w_lq - lq_patch_size)
+ if input_type == 'Tensor':
+ img_lqs = [
+ v[:, :, top:top + lq_patch_size, left:left + lq_patch_size]
+ for v in img_lqs
+ ]
+ else:
+ img_lqs = [
+ v[top:top + lq_patch_size, left:left + lq_patch_size, ...]
+ for v in img_lqs
+ ]
+ top_gt, left_gt = int(top * scale), int(left * scale)
+ if input_type == 'Tensor':
+ img_gts = [
+ v[:, :, top_gt:top_gt + gt_patch_size,
+ left_gt:left_gt + gt_patch_size] for v in img_gts
+ ]
+ else:
+ img_gts = [
+ v[top_gt:top_gt + gt_patch_size, left_gt:left_gt + gt_patch_size,
+ ...] for v in img_gts
+ ]
+ if len(img_gts) == 1:
+ img_gts = img_gts[0]
+ if len(img_lqs) == 1:
+ img_lqs = img_lqs[0]
+ return img_gts, img_lqs
+
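+# Illustrative usage of paired_random_crop on numpy images (shapes are made
+# up for the example):
+#
+#   gt = np.random.rand(256, 256, 3).astype(np.float32)  # HR image
+#   lq = np.random.rand(64, 64, 3).astype(np.float32)    # 4x downscaled LR image
+#   gt_patch, lq_patch = paired_random_crop(gt, lq, gt_patch_size=128, scale=4)
+#   # gt_patch: (128, 128, 3), lq_patch: (32, 32, 3)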
+
+def augment(imgs, hflip=True, rotation=True, flows=None, return_status=False):
+ """Augment: horizontal flips OR rotate (0, 90, 180, 270 degrees).
+
+ We use vertical flip and transpose for rotation implementation.
+ All the images in the list use the same augmentation.
+
+ Args:
+ imgs (list[ndarray] | ndarray): Images to be augmented. If the input
+ is an ndarray, it will be transformed to a list.
+ hflip (bool): Horizontal flip. Default: True.
+        rotation (bool): Rotation. Default: True.
+        flows (list[ndarray]): Flows to be augmented. If the input is an
+            ndarray, it will be transformed to a list.
+            Dimension is (h, w, 2). Default: None.
+ return_status (bool): Return the status of flip and rotation.
+ Default: False.
+
+ Returns:
+ list[ndarray] | ndarray: Augmented images and flows. If returned
+ results only have one element, just return ndarray.
+
+ """
+ hflip = hflip and random.random() < 0.5
+ vflip = rotation and random.random() < 0.5
+ rot90 = rotation and random.random() < 0.5
+
+ def _augment(img):
+ if hflip:
+ cv2.flip(img, 1, img)
+ if vflip:
+ cv2.flip(img, 0, img)
+ if rot90:
+ img = img.transpose(1, 0, 2)
+ return img
+
+ def _augment_flow(flow):
+ if hflip:
+ cv2.flip(flow, 1, flow)
+ flow[:, :, 0] *= -1
+ if vflip:
+ cv2.flip(flow, 0, flow)
+ flow[:, :, 1] *= -1
+ if rot90:
+ flow = flow.transpose(1, 0, 2)
+ flow = flow[:, :, [1, 0]]
+ return flow
+
+ if not isinstance(imgs, list):
+ imgs = [imgs]
+ imgs = [_augment(img) for img in imgs]
+ if len(imgs) == 1:
+ imgs = imgs[0]
+ if flows is not None:
+ if not isinstance(flows, list):
+ flows = [flows]
+ flows = [_augment_flow(flow) for flow in flows]
+ if len(flows) == 1:
+ flows = flows[0]
+ return imgs, flows
+ elif return_status:
+ return imgs, (hflip, vflip, rot90)
+ else:
+ return imgs
+
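+# Illustrative usage of augment, continuing the sketch above (copies are used
+# because cv2.flip writes in place):
+#
+#   imgs, status = augment([gt_patch.copy(), lq_patch.copy()],
+#                          hflip=True, rotation=True, return_status=True)
+#   # status: (hflip, vflip, rot90) booleans shared by both patches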
+
+def img_rotate(img, angle, center=None, scale=1.0):
+ """Rotate image.
+
+ Args:
+ img (ndarray): Image to be rotated.
+ angle (float): Rotation angle in degrees. Positive values mean
+ counter-clockwise rotation.
+ center (tuple[int]): Rotation center. If the center is None,
+ initialize it as the center of the image. Default: None.
+ scale (float): Isotropic scale factor. Default: 1.0.
+ """
+ h, w = img.shape[:2]
+ if center is None:
+ center = w // 2, h // 2
+ matrix = cv2.getRotationMatrix2D(center, angle, scale)
+ rotated_img = cv2.warpAffine(img, matrix, (w, h))
+ return rotated_img
+
+
+def _convert_input_type_range(img):
+ """Convert the type and range of the input image.
+
+ It converts the input image to np.float32 type and range of [0, 1].
+ It is mainly used for pre-processing the input image in colorspace
+ conversion functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with type of np.float32 and range of
+ [0, 1].
+ """
+ img_type = img.dtype
+ img = img.astype(np.float32)
+ if img_type == np.float32:
+ pass
+ elif img_type == np.uint8:
+ img /= 255.
+ else:
+ raise TypeError(
+ f'The img type should be np.float32 or np.uint8, but got {img_type}'
+ )
+ return img
+
+
+def _convert_output_type_range(img, dst_type):
+ """Convert the type and range of the image according to dst_type.
+
+ It converts the image to desired type and range. If `dst_type` is np.uint8,
+ images will be converted to np.uint8 type with range [0, 255]. If
+ `dst_type` is np.float32, it converts the image to np.float32 type with
+ range [0, 1].
+ It is mainly used for post-processing images in colorspace conversion
+ functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The image to be converted with np.float32 type and
+ range [0, 255].
+ dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it
+ converts the image to np.uint8 type with range [0, 255]. If
+ dst_type is np.float32, it converts the image to np.float32 type
+ with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with desired type and range.
+ """
+ if dst_type not in (np.uint8, np.float32):
+ raise TypeError(
+ f'The dst_type should be np.float32 or np.uint8, but got {dst_type}'
+ )
+ if dst_type == np.uint8:
+ img = img.round()
+ else:
+ img /= 255.
+ return img.astype(dst_type)
+
+
+def bgr2ycbcr(img, y_only=False):
+ """Convert a BGR image to YCbCr image.
+
+ The bgr version of rgb2ycbcr.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
+ [65.481, -37.797, 112.0]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def paired_paths_from_folder(folders, keys, filename_tmpl):
+ """Generate paired paths from folders.
+
+ Args:
+ folders (list[str]): A list of folder path. The order of list should
+ be [input_folder, gt_folder].
+ keys (list[str]): A list of keys identifying folders. The order should
+            be consistent with folders, e.g., ['lq', 'gt'].
+ filename_tmpl (str): Template for each filename. Note that the
+ template excludes the file extension. Usually the filename_tmpl is
+ for files in the input folder.
+
+ Returns:
+ list[str]: Returned path list.
+ """
+ assert len(folders) == 2, (
+ 'The len of folders should be 2 with [input_folder, gt_folder]. '
+ f'But got {len(folders)}')
+ assert len(
+ keys
+ ) == 2, f'The len of keys should be 2 with [input_key, gt_key]. But got {len(keys)}'
+ input_folder, gt_folder = folders
+ input_key, gt_key = keys
+
+ input_paths = list(scandir(input_folder))
+ gt_paths = list(scandir(gt_folder))
+ assert len(input_paths) == len(gt_paths), (
+ f'{input_key} and {gt_key} datasets have different number of images: '
+ f'{len(input_paths)}, {len(gt_paths)}.')
+ paths = []
+ for gt_path in gt_paths:
+ basename, ext = osp.splitext(osp.basename(gt_path))
+ input_name = f'{filename_tmpl.format(basename)}{ext}'
+ input_path = osp.join(input_folder, input_name)
+ assert input_name in input_paths, f'{input_name} is not in {input_key}_paths.'
+ gt_path = osp.join(gt_folder, gt_path)
+ paths.append(
+ dict([(f'{input_key}_path', input_path),
+ (f'{gt_key}_path', gt_path)]))
+ return paths
+
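+# Illustrative usage of paired_paths_from_folder (the folder layout is
+# hypothetical): if the GT folder contains `0001.png` and the LQ folder
+# contains `0001_x4.png`, the template '{}_x4' pairs them:
+#
+#   paths = paired_paths_from_folder(['datasets/lq', 'datasets/gt'],
+#                                    ['lq', 'gt'], filename_tmpl='{}_x4')
+#   # -> [{'lq_path': 'datasets/lq/0001_x4.png', 'gt_path': 'datasets/gt/0001.png'}]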
+
+def paired_paths_from_lmdb(folders, keys):
+ """Generate paired paths from lmdb files.
+
+ Contents of lmdb. Taking the `lq.lmdb` for example, the file structure is:
+
+ lq.lmdb
+ ├── data.mdb
+ ├── lock.mdb
+ ├── meta_info.txt
+
+ The data.mdb and lock.mdb are standard lmdb files and you can refer to
+ https://lmdb.readthedocs.io/en/release/ for more details.
+
+ The meta_info.txt is a specified txt file to record the meta information
+ of our datasets. It will be automatically created when preparing
+ datasets by our provided dataset tools.
+ Each line in the txt file records
+ 1)image name (with extension),
+ 2)image shape,
+ 3)compression level, separated by a white space.
+ Example: `baboon.png (120,125,3) 1`
+
+ We use the image name without extension as the lmdb key.
+ Note that we use the same key for the corresponding lq and gt images.
+
+ Args:
+ folders (list[str]): A list of folder path. The order of list should
+ be [input_folder, gt_folder].
+ keys (list[str]): A list of keys identifying folders. The order should
+            be consistent with folders, e.g., ['lq', 'gt'].
+ Note that this key is different from lmdb keys.
+
+ Returns:
+ list[str]: Returned path list.
+ """
+ assert len(folders) == 2, (
+ 'The len of folders should be 2 with [input_folder, gt_folder]. '
+ f'But got {len(folders)}')
+ assert len(
+ keys
+ ) == 2, f'The len of keys should be 2 with [input_key, gt_key]. But got {len(keys)}'
+ input_folder, gt_folder = folders
+ input_key, gt_key = keys
+
+ if not (input_folder.endswith('.lmdb') and gt_folder.endswith('.lmdb')):
+ raise ValueError(
+ f'{input_key} folder and {gt_key} folder should both in lmdb '
+ f'formats. But received {input_key}: {input_folder}; '
+ f'{gt_key}: {gt_folder}')
+ # ensure that the two meta_info files are the same
+ with open(osp.join(input_folder, 'meta_info.txt')) as fin:
+ input_lmdb_keys = [line.split('.')[0] for line in fin]
+ with open(osp.join(gt_folder, 'meta_info.txt')) as fin:
+ gt_lmdb_keys = [line.split('.')[0] for line in fin]
+ if set(input_lmdb_keys) != set(gt_lmdb_keys):
+ raise ValueError(
+ f'Keys in {input_key}_folder and {gt_key}_folder are different.')
+ else:
+ paths = []
+ for lmdb_key in sorted(input_lmdb_keys):
+ paths.append(
+ dict([(f'{input_key}_path', lmdb_key),
+ (f'{gt_key}_path', lmdb_key)]))
+ return paths
+
+
+def paired_paths_from_meta_info_file(folders, keys, meta_info_file,
+ filename_tmpl):
+    """Generate paired paths from a meta information file.
+
+ Each line in the meta information file contains the image names and
+ image shape (usually for gt), separated by a white space.
+
+    Example of a meta information file:
+ ```
+ 0001_s001.png (480,480,3)
+ 0001_s002.png (480,480,3)
+ ```
+
+ Args:
+ folders (list[str]): A list of folder path. The order of list should
+ be [input_folder, gt_folder].
+ keys (list[str]): A list of keys identifying folders. The order should
+            be consistent with folders, e.g., ['lq', 'gt'].
+ meta_info_file (str): Path to the meta information file.
+ filename_tmpl (str): Template for each filename. Note that the
+ template excludes the file extension. Usually the filename_tmpl is
+ for files in the input folder.
+
+ Returns:
+ list[str]: Returned path list.
+ """
+ assert len(folders) == 2, (
+ 'The len of folders should be 2 with [input_folder, gt_folder]. '
+ f'But got {len(folders)}')
+ assert len(
+ keys
+ ) == 2, f'The len of keys should be 2 with [input_key, gt_key]. But got {len(keys)}'
+ input_folder, gt_folder = folders
+ input_key, gt_key = keys
+
+ with open(meta_info_file, 'r') as fin:
+ gt_names = [line.strip().split(' ')[0] for line in fin]
+
+ paths = []
+ for gt_name in gt_names:
+ basename, ext = osp.splitext(osp.basename(gt_name))
+ input_name = f'{filename_tmpl.format(basename)}{ext}'
+ input_path = osp.join(input_folder, input_name)
+ gt_path = osp.join(gt_folder, gt_name)
+ paths.append(
+ dict([(f'{input_key}_path', input_path),
+ (f'{gt_key}_path', gt_path)]))
+ return paths
diff --git a/ppgan/utils/image_pool.py b/ppgan/utils/image_pool.py
index cd12f62f823e0bf3e2fcc17d9579817260f73271..e76dceadb69234d0c434704fd3745b084e3659d9 100644
--- a/ppgan/utils/image_pool.py
+++ b/ppgan/utils/image_pool.py
@@ -21,51 +21,48 @@ class ImagePool():
This buffer enables us to update discriminators using a history of generated images
rather than the ones produced by the latest generators.
- """
- def __init__(self, pool_size):
- """Initialize the ImagePool class
- Parameters:
- pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created
- """
+ Args:
+        pool_size (int) -- the size of image buffer, if pool_size=0, no buffer will be created
+        prob (float) -- the probability of returning a previously stored image instead of the current one. Default: 0.5
+ """
+ def __init__(self, pool_size, prob=0.5):
self.pool_size = pool_size
- if self.pool_size > 0: # create an empty pool
+ self.prob = prob
+
+ if self.pool_size > 0:
self.num_imgs = 0
self.images = []
def query(self, images):
"""Return an image from the pool.
- Parameters:
- images: the latest generated images from the generator
+ Args:
+ images (paddle.Tensor): the latest generated images from the generator
Returns images from the buffer.
-
- By 50/100, the buffer will return input images.
- By 50/100, the buffer will return images previously stored in the buffer,
- and insert the current images to the buffer.
"""
- if self.pool_size == 0: # if the buffer size is 0, do nothing
+ # if the buffer size is 0, do nothing
+ if self.pool_size == 0:
return images
return_images = []
for image in images:
image = paddle.unsqueeze(image, 0)
- if self.num_imgs < self.pool_size: # if the buffer is not full; keep inserting current images to the buffer
+ # if the buffer is not full; keep inserting current images to the buffer
+ if self.num_imgs < self.pool_size:
self.num_imgs = self.num_imgs + 1
self.images.append(image)
return_images.append(image)
else:
p = random.uniform(0, 1)
- if p > 0.5: # by 50% chance, the buffer will return a previously stored image, and insert the current image into the buffer
- random_id = random.randint(0, self.pool_size -
- 1) # randint is inclusive
- # FIXME: clone
- # tmp = (self.images[random_id]).detach() #.clone()
- tmp = self.images[random_id] #.clone()
+                # with probability (1 - prob), return a previously stored image and insert the current image into the buffer
+ if p > self.prob:
+ random_id = random.randint(0, self.pool_size - 1)
+ tmp = self.images[random_id].clone()
self.images[random_id] = image
return_images.append(tmp)
- else: # by another 50% chance, the buffer will return the current image
+ else:
+                    # otherwise (with probability prob), return the current image
return_images.append(image)
- return_images = paddle.concat(return_images,
- 0) # collect all the images and return
+ # collect all the images and return
+ return_images = paddle.concat(return_images, 0)
return return_images
diff --git a/ppgan/utils/logger.py b/ppgan/utils/logger.py
index 6fb9701f89b27920d826fcdad699264f29d79585..c14a1036c06fecfa941ca9e3493c3b13e97a2ed4 100644
--- a/ppgan/utils/logger.py
+++ b/ppgan/utils/logger.py
@@ -61,10 +61,9 @@ def setup_logger(output=None, name="ppgan"):
if local_rank > 0:
filename = filename + ".rank{}".format(local_rank)
- # PathManager.mkdirs(os.path.dirname(filename))
+ # make dir if path not exist
os.makedirs(os.path.dirname(filename), exist_ok=True)
- # fh = logging.StreamHandler(_cached_log_stream(filename)
fh = logging.FileHandler(filename, mode='a')
fh.setLevel(logging.DEBUG)
fh.setFormatter(plain_formatter)
diff --git a/ppgan/utils/options.py b/ppgan/utils/options.py
index e87477af0ca867e1d53d0d64ae9249e9ca2f1a1d..d371d63316ff28451dd506fc4b3eee4df5cd66e8 100644
--- a/ppgan/utils/options.py
+++ b/ppgan/utils/options.py
@@ -45,10 +45,10 @@ def parse_args():
default=False,
help='skip validation during training')
# config options
- parser.add_argument('opts',
- help='See config for all options',
- default=None,
- nargs=argparse.REMAINDER)
+ parser.add_argument("-o",
+ "--opt",
+ nargs='+',
+ help="set configuration options")
#for inference
parser.add_argument("--source_path",
@@ -60,6 +60,31 @@ def parse_args():
help="path to reference images")
parser.add_argument("--model_path", default=None, help="model for loading")
+ # for profiler
+ parser.add_argument(
+ '-p',
+ '--profiler_options',
+ type=str,
+ default=None,
+ help=
+ 'The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
+ )
+ # fix random numbers by setting seed
+ parser.add_argument('--seed',
+ type=int,
+ default=None,
+                        help='fix random numbers by setting seed.')
+
+ # add for amp training
+ parser.add_argument('--amp',
+ action='store_true',
+ default=False,
+ help='whether to enable amp training')
+ parser.add_argument('--amp_level',
+ type=str,
+ default='O1',
+ choices=['O1', 'O2'],
+                        help='level of amp training; O2 represents pure fp16')
args = parser.parse_args()
return args
diff --git a/ppgan/utils/photopen.py b/ppgan/utils/photopen.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8fdf263443491a6398b97c5ee092918851ed42c
--- /dev/null
+++ b/ppgan/utils/photopen.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import math
+import random
+import functools
+
+import numpy as np
+from PIL import Image, ImageOps
+
+import paddle
+import paddle.nn as nn
+from paddle.io import Dataset, DataLoader
+from paddle.nn import Conv1DTranspose, Conv2DTranspose, Conv3DTranspose, Linear
+
+# Process image data: crop, horizontally flip, reshape, and normalize
+def data_transform(img, resize_w, resize_h, load_size=286, pos=[0, 0, 256, 256], flip=True, is_image=True):
+ if is_image:
+ resized = img.resize((resize_w, resize_h), Image.BICUBIC)
+ else:
+ resized = img.resize((resize_w, resize_h), Image.NEAREST)
+ croped = resized.crop((pos[0], pos[1], pos[2], pos[3]))
+ fliped = ImageOps.mirror(croped) if flip else croped
+ fliped = np.array(fliped) # transform to numpy array
+ expanded = np.expand_dims(fliped, 2) if len(fliped.shape) < 3 else fliped
+ transposed = np.transpose(expanded, (2, 0, 1)).astype('float32')
+ if is_image:
+ normalized = transposed / 255. * 2. - 1.
+ else:
+ normalized = transposed
+ return normalized
+
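+# A minimal usage sketch for data_transform (the file name and sizes are
+# illustrative):
+#
+#   pil_img = Image.open('example.jpg').convert('RGB')
+#   arr = data_transform(pil_img, resize_w=286, resize_h=286,
+#                        pos=[0, 0, 256, 256], flip=False, is_image=True)
+#   # arr: float32 CHW array in [-1, 1] with shape (3, 256, 256)
+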
+# Define the COCO dataset object
+class COCODateset(Dataset):
+ def __init__(self, opt):
+ super(COCODateset, self).__init__()
+ inst_dir = opt.dataroot+'train_inst/'
+ _, _, inst_list = next(os.walk(inst_dir))
+ self.inst_list = np.sort(inst_list)
+ self.opt = opt
+
+ def __getitem__(self, idx):
+ ins = Image.open(self.opt.dataroot+'train_inst/'+self.inst_list[idx])
+ img = Image.open(self.opt.dataroot+'train_img/'+self.inst_list[idx].replace(".png", ".jpg"))
+ img = img.convert('RGB')
+
+ w, h = img.size
+ resize_w, resize_h = 0, 0
+ if w < h:
+ resize_w, resize_h = self.opt.load_size, int(h * self.opt.load_size / w)
+ else:
+ resize_w, resize_h = int(w * self.opt.load_size / h), self.opt.load_size
+ left = random.randint(0, resize_w - self.opt.crop_size)
+ top = random.randint(0, resize_h - self.opt.crop_size)
+ flip = False
+
+        img = data_transform(img, resize_w, resize_h, load_size=self.opt.load_size,
+                             pos=[left, top, left + self.opt.crop_size, top + self.opt.crop_size],
+                             flip=flip, is_image=True)
+        ins = data_transform(ins, resize_w, resize_h, load_size=self.opt.load_size,
+                             pos=[left, top, left + self.opt.crop_size, top + self.opt.crop_size],
+                             flip=flip, is_image=False)
+
+ return img, ins, self.inst_list[idx]
+
+ def __len__(self):
+ return len(self.inst_list)
+
+
+def data_onehot_pro(instance, opt):
+ shape = instance.shape
+ nc = opt.label_nc + 1 if opt.contain_dontcare_label \
+ else opt.label_nc
+ shape[1] = nc
+ semantics = paddle.nn.functional.one_hot(instance.astype('int64'). \
+ reshape([opt.batchSize, opt.crop_size, opt.crop_size]), nc). \
+ transpose((0, 3, 1, 2))
+
+ # edge
+ edge = np.zeros(instance.shape, 'int64')
+ t = instance.numpy()
+ edge[:, :, :, 1:] = edge[:, :, :, 1:] | (t[:, :, :, 1:] != t[:, :, :, :-1])
+ edge[:, :, :, :-1] = edge[:, :, :, :-1] | (t[:, :, :, 1:] != t[:, :, :, :-1])
+ edge[:, :, 1:, :] = edge[:, :, 1:, :] | (t[:, :, 1:, :] != t[:, :, :-1, :])
+ edge[:, :, :-1, :] = edge[:, :, :-1, :] | (t[:, :, 1:, :] != t[:, :, :-1, :])
+ edge = paddle.to_tensor(edge).astype('float32')
+
+ semantics = paddle.concat([semantics, edge], 1)
+ return semantics
+
+# Build normalization layers other than SPADE
+def build_norm_layer(norm_type='instance'):
+ """Return a normalization layer
+
+ Args:
+ norm_type (str) -- the name of the normalization layer: batch | instance | none
+
+ For BatchNorm, we do not use learnable affine parameters and track running statistics (mean/stddev).
+ For InstanceNorm, we do not use learnable affine parameters. We do not track running statistics.
+ """
+ if norm_type == 'batch':
+ norm_layer = functools.partial(
+ nn.BatchNorm2D,
+ weight_attr=False,
+ bias_attr=False)
+ elif norm_type == 'syncbatch':
+ norm_layer = functools.partial(
+ nn.SyncBatchNorm,
+ weight_attr=False,
+ bias_attr=False)
+ elif norm_type == 'instance':
+ norm_layer = functools.partial(
+ nn.InstanceNorm2D,)
+ elif norm_type == 'spectral':
+ norm_layer = functools.partial(Spectralnorm)
+ elif norm_type == 'none':
+
+ def norm_layer(x):
+ return Identity()
+ else:
+ raise NotImplementedError('normalization layer [%s] is not found' %
+ norm_type)
+ return norm_layer
+
+def simam(x, e_lambda=1e-4):
+ b, c, h, w = x.shape
+ n = w * h - 1
+ x_minus_mu_square = (x - x.mean(axis=[2, 3], keepdim=True)) ** 2
+ y = x_minus_mu_square / (4 * (x_minus_mu_square.sum(axis=[2, 3], keepdim=True) / n + e_lambda)) + 0.5
+ return x * nn.functional.sigmoid(y)
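+# Illustrative usage of the parameter-free SimAM attention above (the shape
+# is made up):
+#
+#   feat = paddle.randn([4, 64, 32, 32])
+#   feat = simam(feat)  # same shape, reweighted channel/spatial responses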
+
+class Dict(dict):
+ __setattr__ = dict.__setitem__
+ __getattr__ = dict.__getitem__
+
diff --git a/ppgan/utils/preprocess.py b/ppgan/utils/preprocess.py
index 10613c79211ca8c9ce3ddd7077a7a6687a73e85f..28fc34d870e4c63ab5a7c5738e1d8cf6e24fbce4 100644
--- a/ppgan/utils/preprocess.py
+++ b/ppgan/utils/preprocess.py
@@ -1,16 +1,6 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# code was heavily based on https://github.com/wtjiang98/PSGAN
+# MIT License
+# Copyright (c) 2020 Wentao Jiang
import cv2
import numpy as np
diff --git a/ppgan/utils/profiler.py b/ppgan/utils/profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7359739fa48ec847891f034187b81d25406ff81
--- /dev/null
+++ b/ppgan/utils/profiler.py
@@ -0,0 +1,111 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle
+
+# A global variable to record the number of calling times for profiler
+# functions. It is used to specify the tracing range of training steps.
+_profiler_step_id = 0
+
+# A global variable to avoid parsing from string every time.
+_profiler_options = None
+
+
+class ProfilerOptions(object):
+ '''
+ Use a string to initialize a ProfilerOptions.
+    The string should be in the format: "key1=value1;key2=value2;key3=value3".
+ For example:
+ "profile_path=model.profile"
+ "batch_range=[50, 60]; profile_path=model.profile"
+ "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+    ProfilerOptions supports the following key-value pairs:
+      batch_range      - an integer list, e.g. [100, 110].
+      state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
+      sorted_key       - a string, the optional values are 'calls', 'total',
+                         'max', 'min' or 'ave'.
+ tracer_option - a string, the optional values are 'Default', 'OpDetail',
+ 'AllOpDetail'.
+ profile_path - a string, the path to save the serialized profile data,
+ which can be used to generate a timeline.
+ exit_on_finished - a boolean.
+ '''
+
+ def __init__(self, options_str):
+ assert isinstance(options_str, str)
+
+ self._options = {
+ 'batch_range': [10, 20],
+ 'state': 'All',
+ 'sorted_key': 'total',
+ 'tracer_option': 'Default',
+ 'profile_path': '/tmp/profile',
+ 'exit_on_finished': True
+ }
+ self._parse_from_string(options_str)
+
+ def _parse_from_string(self, options_str):
+ for kv in options_str.replace(' ', '').split(';'):
+ key, value = kv.split('=')
+ if key == 'batch_range':
+ value_list = value.replace('[', '').replace(']', '').split(',')
+ value_list = list(map(int, value_list))
+ if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
+ 1] > value_list[0]:
+ self._options[key] = value_list
+ elif key == 'exit_on_finished':
+ self._options[key] = value.lower() in ("yes", "true", "t", "1")
+ elif key in [
+ 'state', 'sorted_key', 'tracer_option', 'profile_path'
+ ]:
+ self._options[key] = value
+
+ def __getitem__(self, name):
+ if self._options.get(name, None) is None:
+ raise ValueError(
+ "ProfilerOptions does not have an option named %s." % name)
+ return self._options[name]
+
+
+def add_profiler_step(options_str=None):
+ '''
+ Enable the operator-level timing using PaddlePaddle's profiler.
+    The profiler uses an independent variable to count the profiler steps.
+ One call of this function is treated as a profiler step.
+
+ Args:
+ options_str - a string to initialize the ProfilerOptions.
+                      Default is None, in which case the profiler is disabled.
+ '''
+ if options_str is None:
+ return
+
+ global _profiler_step_id
+ global _profiler_options
+
+ if _profiler_options is None:
+ _profiler_options = ProfilerOptions(options_str)
+
+ if _profiler_step_id == _profiler_options['batch_range'][0]:
+ paddle.utils.profiler.start_profiler(
+ _profiler_options['state'], _profiler_options['tracer_option'])
+ elif _profiler_step_id == _profiler_options['batch_range'][1]:
+ paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
+ _profiler_options['profile_path'])
+ if _profiler_options['exit_on_finished']:
+ sys.exit(0)
+
+ _profiler_step_id += 1
+
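
For context, `add_profiler_step` above is meant to be called once per training iteration; profiling starts and stops automatically inside the configured `batch_range`. A minimal usage sketch (the option string and the `range(30)` loop are illustrative stand-ins for a real training loop):

```python
from ppgan.utils.profiler import add_profiler_step

# profile steps 10-20 on GPU and keep training afterwards
options = "batch_range=[10, 20]; state=GPU; profile_path=/tmp/model.profile; exit_on_finished=False"

for step in range(30):          # stand-in for the real training loop
    add_profiler_step(options)  # one call counts as one profiler step
    # forward / backward / optimizer.step() would run here
```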
diff --git a/ppgan/utils/registry.py b/ppgan/utils/registry.py
index 3287854d5ed0b131cae5ab23569fa144faeba943..11941be44f7fcce180bbab6b331d09c4d8947b52 100644
--- a/ppgan/utils/registry.py
+++ b/ppgan/utils/registry.py
@@ -77,6 +77,8 @@ class Registry(object):
return ret
+# code was based on mmcv
+# Copyright (c) OpenMMLab.
def build_from_config(cfg, registry, default_args=None):
"""Build a class from config dict.
@@ -96,7 +98,7 @@ def build_from_config(cfg, registry, default_args=None):
'`cfg` or `default_args` must contain the key "name", '
f'but got {cfg}\n{default_args}')
if not isinstance(registry, Registry):
- raise TypeError('registry must be an mmcv.Registry object, '
+        raise TypeError('registry must be a ppgan.utils.Registry object, '
f'but got {type(registry)}')
if not (isinstance(default_args, dict) or default_args is None):
raise TypeError('default_args must be a dict or None, '
diff --git a/ppgan/utils/setup.py b/ppgan/utils/setup.py
index e37bde59793e33160edee56368ce9c817223d3de..a439d3e4744064e47ad054b94108f520b677678f 100644
--- a/ppgan/utils/setup.py
+++ b/ppgan/utils/setup.py
@@ -14,17 +14,24 @@
import os
import time
+import yaml
import paddle
-
+import numpy as np
+import random
+from .config import cfg2dict
from .logger import setup_logger
-
def setup(args, cfg):
if args.evaluate_only:
cfg.is_train = False
else:
cfg.is_train = True
+ if args.profiler_options:
+ cfg.profiler_options = args.profiler_options
+ else:
+ cfg.profiler_options = None
+
cfg.timestamp = time.strftime('-%Y-%m-%d-%H-%M', time.localtime())
cfg.output_dir = os.path.join(
cfg.output_dir,
@@ -33,9 +40,23 @@ def setup(args, cfg):
logger = setup_logger(cfg.output_dir)
- logger.info('Configs: {}'.format(cfg))
+ logger.info('Configs: \n{}'.format(yaml.dump(cfg2dict(cfg))))
if paddle.is_compiled_with_cuda():
paddle.set_device('gpu')
+ elif paddle.is_compiled_with_npu():
+ paddle.set_device('npu')
+ elif paddle.is_compiled_with_xpu():
+ paddle.set_device('xpu')
else:
paddle.set_device('cpu')
+
+ if args.seed:
+ paddle.seed(args.seed)
+ random.seed(args.seed)
+ np.random.seed(args.seed)
+ paddle.framework.random._manual_program_seed(args.seed)
+
+ # add amp and amp_level args into cfg
+ cfg['amp'] = args.amp
+ cfg['amp_level'] = args.amp_level
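
The seeding added to `setup()` above has to cover every random source the training pipeline draws from to make runs reproducible. A minimal sketch of the same pattern (the `seed_everything` helper is illustrative; the diff additionally calls the static-graph seeder `paddle.framework.random._manual_program_seed`):

```python
import random

import numpy as np
import paddle


def seed_everything(seed: int):
    """Illustrative helper mirroring the seeding performed in setup()."""
    paddle.seed(seed)     # Paddle's global generator (dynamic graph)
    random.seed(seed)     # Python's built-in RNG
    np.random.seed(seed)  # NumPy RNG used by data pipelines and augmentation


seed_everything(42)
```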
diff --git a/ppgan/utils/video.py b/ppgan/utils/video.py
index da8a8b7aacefa40a0942731b67a5c126eb0b447f..4204da936c91ba73b48650e14c60f589f0765477 100644
--- a/ppgan/utils/video.py
+++ b/ppgan/utils/video.py
@@ -24,14 +24,14 @@ def video2frames(video_path, outpath, **kargs):
return cmd_str
ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error ']
- vid_name = video_path.split('/')[-1].split('.')[0]
+ vid_name = os.path.basename(video_path).split('.')[0]
out_full_path = os.path.join(outpath, vid_name)
if not os.path.exists(out_full_path):
os.makedirs(out_full_path)
# video file name
- outformat = out_full_path + '/%08d.png'
+ outformat = os.path.join(out_full_path, '%08d.png')
cmd = ffmpeg
cmd = ffmpeg + [' -i ', video_path, ' -start_number ', ' 0 ', outformat]
diff --git a/ppgan/utils/visual.py b/ppgan/utils/visual.py
index dccb7a9d62f8fc42432fe27be451662aa4490ebd..6982634661b896348dec932a67d82e5fefad3b64 100644
--- a/ppgan/utils/visual.py
+++ b/ppgan/utils/visual.py
@@ -45,18 +45,23 @@ def make_grid(tensor, nrow=8, normalize=False, range=None, scale_each=False):
if isinstance(tensor, list):
tensor = paddle.stack(tensor, 0)
- if tensor.dim() == 2: # single image H x W
+ # single image H x W
+ if tensor.dim() == 2:
tensor = tensor.unsqueeze(0)
- if tensor.dim() == 3: # single image
- if tensor.shape[0] == 1: # if single-channel, convert to 3-channel
+ # single image
+ if tensor.dim() == 3:
+ # if single-channel, convert to 3-channel
+ if tensor.shape[0] == 1:
tensor = paddle.concat([tensor, tensor, tensor], 0)
tensor = tensor.unsqueeze(0)
- if tensor.dim() == 4 and tensor.shape[1] == 1: # single-channel images
+ # single-channel images
+ if tensor.dim() == 4 and tensor.shape[1] == 1:
tensor = paddle.concat([tensor, tensor, tensor], 1)
if normalize is True:
- tensor = tensor.astype(tensor.dtype) # avoid modifying tensor in-place
+ # avoid modifying tensor in-place
+ tensor = tensor.astype(tensor.dtype)
if range is not None:
assert isinstance(range, tuple), \
"range has to be a tuple (min, max) if specified. min and max are numbers"
@@ -72,7 +77,8 @@ def make_grid(tensor, nrow=8, normalize=False, range=None, scale_each=False):
norm_ip(t, float(t.min()), float(t.max()))
if scale_each is True:
- for t in tensor: # loop over mini-batch dimension
+ # loop over mini-batch dimension
+ for t in tensor:
norm_range(t, range)
else:
norm_range(tensor, range)
@@ -103,27 +109,31 @@ def tensor2img(input_image, min_max=(-1., 1.), image_num=1, imtype=np.uint8):
""""Converts a Tensor array into a numpy image array.
Parameters:
- input_image (tensor) -- the input image tensor array
- image_num (int) -- the convert iamge numbers
- imtype (type) -- the desired type of the converted numpy array
+ input_image (tensor): the input image tensor array
+        image_num (int): the number of images to convert
+ imtype (type): the desired type of the converted numpy array
"""
def processing(img, transpose=True):
""""processing one numpy image.
Parameters:
- im (tensor) -- the input image numpy array
+            img (numpy.ndarray): the input image array
"""
- if img.shape[0] == 1: # grayscale to RGB
+ # grayscale to RGB
+ if img.shape[0] == 1:
img = np.tile(img, (3, 1, 1))
img = img.clip(min_max[0], min_max[1])
img = (img - min_max[0]) / (min_max[1] - min_max[0])
if imtype == np.uint8:
- img = img * 255.0 # scaling
- img = np.transpose(img, (1, 2, 0)) if transpose else img # tranpose
+ # scaling
+ img = img * 255.0
+            # transpose
+ img = np.transpose(img, (1, 2, 0)) if transpose else img
return img
if not isinstance(input_image, np.ndarray):
- image_numpy = input_image.numpy() # convert it into a numpy array
+ # convert it into a numpy array
+ image_numpy = input_image.numpy()
ndim = image_numpy.ndim
if ndim == 4:
image_numpy = image_numpy[0:image_num]
@@ -144,7 +154,8 @@ def tensor2img(input_image, min_max=(-1., 1.), image_num=1, imtype=np.uint8):
image_numpy = np.stack(
[processing(im, transpose=False) for im in image_numpy])
- else: # if it is a numpy array, do nothing
+ else:
+ # if it is a numpy array, do nothing
image_numpy = input_image
image_numpy = image_numpy.round()
return image_numpy.astype(imtype)
diff --git a/ppgan/version.py b/ppgan/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d8d23ad327be9de3fafacc3747d2ad217fa23a3
--- /dev/null
+++ b/ppgan/version.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ppgan_version = '2.1.0'
diff --git a/python b/python
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/requirements.txt b/requirements.txt
index d1446d98bd21e6c74642bf98b352df25a477cb30..4df4b5b2a3f3e64aca2261850813eeff55b3c77f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,8 +2,12 @@ tqdm
PyYAML>=5.1
scikit-image>=0.14.0
scipy>=1.1.0
-opencv-python
+opencv-python<=4.6.0.66
+imageio==2.9.0
imageio-ffmpeg
-librosa==0.7.0
-numba==0.48
+librosa==0.8.1
+numba
easydict
+munch
+natsort
+matplotlib
diff --git a/setup.py b/setup.py
index ff5d222dcb7c7b126d5135680068a8285a9f7ede..50bdb31eb583ca1ff0b8ce8628cc163ac9299a6d 100644
--- a/setup.py
+++ b/setup.py
@@ -13,8 +13,11 @@
# limitations under the License.
from setuptools import setup
+from setuptools import find_packages
from io import open
+from ppgan import __version__
+
with open('requirements.txt', encoding="utf-8-sig") as f:
requirements = f.readlines()
@@ -27,11 +30,11 @@ def readme():
setup(
name='ppgan',
- packages=['ppgan'],
+ packages=find_packages(),
include_package_data=True,
entry_points={"console_scripts": ["paddlegan= paddlegan.paddlegan:main"]},
author='PaddlePaddle Author',
- version='0.1.0',
+ version=__version__,
install_requires=requirements,
license='Apache License 2.0',
description='Awesome GAN toolkits based on PaddlePaddle',
diff --git a/test/coco_stuff/test_inst/ADE_train_00000569.png b/test/coco_stuff/test_inst/ADE_train_00000569.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b4635a3923c52f5d484c3cc909a12e5c4058a3f
Binary files /dev/null and b/test/coco_stuff/test_inst/ADE_train_00000569.png differ
diff --git a/test/coco_stuff/test_inst/ADE_train_00000583.png b/test/coco_stuff/test_inst/ADE_train_00000583.png
new file mode 100644
index 0000000000000000000000000000000000000000..0a1e77ce713c7bc7c210701f1807a709ad4911c1
Binary files /dev/null and b/test/coco_stuff/test_inst/ADE_train_00000583.png differ
diff --git a/test/coco_stuff/test_inst/ADE_train_00000955.png b/test/coco_stuff/test_inst/ADE_train_00000955.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c205ede2dad762ddbb937ec43da9bc1f3eba72d
Binary files /dev/null and b/test/coco_stuff/test_inst/ADE_train_00000955.png differ
diff --git a/test/coco_stuff/test_inst/ADE_train_00002089.png b/test/coco_stuff/test_inst/ADE_train_00002089.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d11ead9081776c36e8d674aa8e74e146fa9b0df
Binary files /dev/null and b/test/coco_stuff/test_inst/ADE_train_00002089.png differ
diff --git a/test/coco_stuff/train_img/ADE_train_00000569.jpg b/test/coco_stuff/train_img/ADE_train_00000569.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9108a9b99132668eef7f51b9ee528ca237617c21
Binary files /dev/null and b/test/coco_stuff/train_img/ADE_train_00000569.jpg differ
diff --git a/test/coco_stuff/train_img/ADE_train_00000583.jpg b/test/coco_stuff/train_img/ADE_train_00000583.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d5a5902012fe328eee9291253d581ac62d6b886c
Binary files /dev/null and b/test/coco_stuff/train_img/ADE_train_00000583.jpg differ
diff --git a/test/coco_stuff/train_img/ADE_train_00000955.jpg b/test/coco_stuff/train_img/ADE_train_00000955.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..566415773c2e555f1801c700c1c7e82d85917e27
Binary files /dev/null and b/test/coco_stuff/train_img/ADE_train_00000955.jpg differ
diff --git a/test/coco_stuff/train_img/ADE_train_00002089.jpg b/test/coco_stuff/train_img/ADE_train_00002089.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..82e88fe6f989c602c6b17cc3c32ec33733425215
Binary files /dev/null and b/test/coco_stuff/train_img/ADE_train_00002089.jpg differ
diff --git a/test/coco_stuff/train_inst/ADE_train_00000569.png b/test/coco_stuff/train_inst/ADE_train_00000569.png
new file mode 100644
index 0000000000000000000000000000000000000000..9b4635a3923c52f5d484c3cc909a12e5c4058a3f
Binary files /dev/null and b/test/coco_stuff/train_inst/ADE_train_00000569.png differ
diff --git a/test/coco_stuff/train_inst/ADE_train_00000583.png b/test/coco_stuff/train_inst/ADE_train_00000583.png
new file mode 100644
index 0000000000000000000000000000000000000000..0a1e77ce713c7bc7c210701f1807a709ad4911c1
Binary files /dev/null and b/test/coco_stuff/train_inst/ADE_train_00000583.png differ
diff --git a/test/coco_stuff/train_inst/ADE_train_00000955.png b/test/coco_stuff/train_inst/ADE_train_00000955.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c205ede2dad762ddbb937ec43da9bc1f3eba72d
Binary files /dev/null and b/test/coco_stuff/train_inst/ADE_train_00000955.png differ
diff --git a/test/coco_stuff/train_inst/ADE_train_00002089.png b/test/coco_stuff/train_inst/ADE_train_00002089.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d11ead9081776c36e8d674aa8e74e146fa9b0df
Binary files /dev/null and b/test/coco_stuff/train_inst/ADE_train_00002089.png differ
diff --git a/test/pic_.jpg b/test/pic_.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9b2ddf24935109c5be24096591e7ca1b5aa2ac57
Binary files /dev/null and b/test/pic_.jpg differ
diff --git a/test/readme.txt b/test/readme.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d70402ec4c3a535493f5d5ff40600ad3381b6a35
--- /dev/null
+++ b/test/readme.txt
@@ -0,0 +1,20 @@
+## All files under this directory are used to test the training and prediction of the photopen model.
+
+# Predict with the pretrained model
+python applications/tools/photopen.py --semantic_label_path test/sem.png --weight_path test/generator.pdparams --output_path output_dir/pic.jpg --config-file configs/photopen.yaml
+
+# Predict from a checkpoint
+python applications/tools/photopen.py --semantic_label_path test/sem.png --weight_path output_dir/photopen-2021-10-05-14-38/iter_1_weight.pdparams --output_path output_dir/pic.jpg --config-file configs/photopen.yaml
+
+
+# Train
+python -u tools/main.py --config-file configs/photopen.yaml
+
+# Resume training
+python -u tools/main.py --config-file configs/photopen.yaml --resume output_dir/photopen-2021-09-30-15-59/iter_3_checkpoint.pdparams
+
+# Train, overriding config options
+python -u tools/main.py --config-file configs/photopen.yaml --o model.generator.ngf=1 model.discriminator.ndf=1
+
+# Evaluate
+python -u tools/main.py --config-file configs/photopen.yaml --evaluate-only --load output_dir/photopen-2021-11-06-20-59/iter_1_checkpoint.pdparams
diff --git a/test/sem.png b/test/sem.png
new file mode 100644
index 0000000000000000000000000000000000000000..345e230a0763449cd160f886a134604f5b8d72b8
Binary files /dev/null and b/test/sem.png differ
diff --git a/test_tipc/README.md b/test_tipc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..27c67291df5dd3044d570f4aca63fdf996e2eb25
--- /dev/null
+++ b/test_tipc/README.md
@@ -0,0 +1,75 @@
+
+# PaddlePaddle Training and Inference Pipeline Certification (TIPC)
+
+## 1. Introduction
+
+Besides basic model training and prediction, PaddlePaddle provides high-performance inference deployment tools for multiple devices and platforms. This document collects the Training and Inference Pipeline Certification (TIPC) information and test tools for every model in PaddleGAN, so users can look up which training/inference/deployment paths are supported for each model and run one-click tests.
+
+## 2. Summary
+
+The supported pipelines are summarized below. Filled-in entries can be tested with this tool in one click; empty entries are still being worked on.
+
+**Column descriptions:**
+- Basic training & inference: model training and Paddle Inference Python prediction.
+- More training modes: multi-machine multi-GPU and mixed precision.
+- Model compression: pruning, offline/online quantization, and distillation.
+- Other deployment: Paddle Inference C++ prediction, Paddle Serving deployment, Paddle-Lite deployment, etc.
+
+More detailed support for inference acceleration features such as MKL-DNN and TensorRT can be found in the [more tutorials](#more) of each test tool.
+
+| Paper | Model | Category | Basic<br>training & inference | More<br>training modes | Model compression | Other deployment |
+| :--- | :--- | :----: | :--------: | :---- | :---- | :---- |
+| Pix2Pix | Pix2Pix | Generation | Supported | multi-machine multi-GPU | | |
+| CycleGAN | CycleGAN | Generation | Supported | multi-machine multi-GPU | | |
+| StyleGAN2 | StyleGAN2 | Generation | Supported | multi-machine multi-GPU | | |
+| FOMM | FOMM | Generation | Supported | multi-machine multi-GPU | | |
+| BasicVSR | BasicVSR | Super-resolution | Supported | multi-machine multi-GPU | | |
+| PP-MSVSR | PP-MSVSR | Super-resolution | | | | |
+| SinGAN | SinGAN | Generation | Supported | | | |
+
+
+
+
+## 3. Using the one-click test tool
+### Directory layout
+
+```shell
+test_tipc/
+├── configs/                              # configuration files
+    ├── basicvsr_reds.yaml                # yaml file for testing basicvsr training
+    ├── cyclegan_horse2zebra.yaml         # yaml file for testing cyclegan training
+    ├── firstorder_vox_256.yaml           # yaml file for testing fomm training
+    ├── pix2pix_facedes.yaml              # yaml file for testing pix2pix training
+    ├── stylegan_v2_256_ffhq.yaml         # yaml file for testing stylegan training
+
+    ├── ...
+├── results/                              # pre-saved prediction results, used for accuracy comparison against actual predictions
+    ├── python_basicvsr_results_fp32.txt  # pre-saved fp32 python prediction results of basicvsr
+    ├── python_cyclegan_results_fp32.txt  # pre-saved fp32 python prediction results of cyclegan
+    ├── python_pix2pix_results_fp32.txt   # pre-saved fp32 python prediction results of pix2pix
+    ├── python_stylegan2_results_fp32.txt # pre-saved fp32 python prediction results of stylegan2
+    ├── ...
+├── prepare.sh                            # downloads the data and models needed to run test_*.sh
+├── test_train_inference_python.sh        # main program for testing python training and inference
+├── compare_results.py                    # checks whether the prediction results in the logs match the pre-saved results within the allowed error
+└── readme.md                             # this document
+```
+
+### Test workflow
+With this tool you can test which features are supported and whether the prediction results are aligned. The workflow is as follows:
+
+
+
+1. Run prepare.sh to download the data and models required by the tests;
+2. Run the test script `test_*.sh` for the feature you want to test; the produced logs show whether each configuration ran successfully;
+3. Use `compare_results.py` to compare the predictions in the logs with the pre-saved results under the results directory and check that the accuracy is within the expected error range.
+
+Among them, there are 4 main test programs with the following functions:
+- `test_train_inference_python.sh`: tests basic Python-based model training, evaluation and inference.
+
+
+
+#### More tutorials
+The feature tests cover training-related options such as mixed precision, pruning and quantization, as well as inference-related options such as MKL-DNN and TensorRT. Click the corresponding links below for more details and tutorials:
+- [test_train_inference_python usage](docs/test_train_inference_python.md): tests basic Python-based model training and inference.
+- [test_inference_cpp usage](docs/test_inference_cpp.md): tests C++-based model inference.
diff --git a/test_tipc/benchmark_train.sh b/test_tipc/benchmark_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d5e225d1238d6775d95ec3c6df4eceaddbb62d27
--- /dev/null
+++ b/test_tipc/benchmark_train.sh
@@ -0,0 +1,256 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+# set env
+python=python
+export model_branch=`git symbolic-ref HEAD 2>/dev/null | cut -d"/" -f 3`
+export model_commit=$(git log|head -n1|awk '{print $2}')
+export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
+export frame_version=${str_tmp%%.post*}
+export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`)
+
+# run benchmark sh
+# Usage:
+# bash run_benchmark_train.sh config.txt params
+# or
+# bash run_benchmark_train.sh config.txt
+
+function func_parser_params(){
+ strs=$1
+ IFS="="
+ array=(${strs})
+ tmp=${array[1]}
+ echo ${tmp}
+}
+
+function func_sed_params(){
+ filename=$1
+ line=$2
+ param_value=$3
+ params=`sed -n "${line}p" $filename`
+ IFS=":"
+ array=(${params})
+ key=${array[0]}
+ value=${array[1]}
+ new_params="${key}:${param_value}"
+ IFS=";"
+ cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'"
+ eval $cmd
+}
+
+function set_gpu_id(){
+ string=$1
+ _str=${string:1:6}
+ IFS="C"
+ arr=(${_str})
+ M=${arr[0]}
+ P=${arr[1]}
+ gn=`expr $P - 1`
+ gpu_num=`expr $gn / $M`
+ seq=`seq -s "," 0 $gpu_num`
+ echo $seq
+}
+
+function get_repo_name(){
+ IFS=";"
+ cur_dir=$(pwd)
+ IFS="/"
+ arr=(${cur_dir})
+ echo ${arr[-1]}
+}
+
+FILENAME=$1
+# copy FILENAME as new
+new_filename="./test_tipc/benchmark_train.txt"
+cmd=`yes|cp $FILENAME $new_filename`
+FILENAME=$new_filename
+# MODE must be one of ['benchmark_train']
+MODE=$2
+PARAMS=$3
+
+IFS=$'\n'
+# parse params from train_benchmark.txt
+dataline=`cat $FILENAME`
+# parse params
+IFS=$'\n'
+lines=(${dataline})
+model_name=$(func_parser_value "${lines[1]}")
+
+# get the line number where train_benchmark_params starts
+line_num=`grep -n "train_benchmark_params" $FILENAME | cut -d ":" -f 1`
+# for train log parser
+batch_size=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+fp_items=$(func_parser_value "${lines[line_num]}")
+line_num=`expr $line_num + 1`
+epoch=$(func_parser_value "${lines[line_num]}")
+
+line_num=`expr $line_num + 1`
+profile_option_key=$(func_parser_key "${lines[line_num]}")
+profile_option_params=$(func_parser_value "${lines[line_num]}")
+profile_option="${profile_option_key}:${profile_option_params}"
+
+line_num=`expr $line_num + 1`
+flags_value=$(func_parser_value "${lines[line_num]}")
+# set flags
+IFS=";"
+flags_list=(${flags_value})
+for _flag in ${flags_list[*]}; do
+ cmd="export ${_flag}"
+ eval $cmd
+done
+
+# set log_name
+repo_name=$(get_repo_name )
+SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log
+mkdir -p "${SAVE_LOG}/benchmark_log/"
+status_log="${SAVE_LOG}/benchmark_log/results.log"
+
+# The number of lines in which train params can be replaced.
+line_python=3
+line_gpuid=4
+line_precision=6
+line_epoch=7
+line_batchsize=9
+line_profile=13
+line_eval_py=24
+line_export_py=30
+
+func_sed_params "$FILENAME" "${line_eval_py}" "null"
+func_sed_params "$FILENAME" "${line_export_py}" "null"
+func_sed_params "$FILENAME" "${line_python}" "$python"
+
+# if params
+if [ ! -n "$PARAMS" ] ;then
+    # PARAMS is empty; fall back to the default benchmark settings.
+ IFS="|"
+ batch_size_list=(${batch_size})
+ fp_items_list=(${fp_items})
+ device_num_list=(N1C4)
+ run_mode="DP"
+else
+    # parse params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
+ IFS="_"
+ params_list=(${PARAMS})
+ model_type=${params_list[0]}
+ batch_size=${params_list[1]}
+ batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
+ precision=${params_list[2]}
+ run_mode=${params_list[3]}
+ device_num=${params_list[4]}
+ IFS=";"
+
+ if [ ${precision} = "null" ];then
+ precision="fp32"
+ fi
+
+ fp_items_list=($precision)
+ batch_size_list=($batch_size)
+ device_num_list=($device_num)
+fi
+
+# for log name
+to_static=""
+# parse "to_static" options and modify trainer into "to_static_trainer"
+if [[ ${model_type} = "dynamicTostatic" ]];then
+ to_static="d2sT_"
+ sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME
+fi
+
+IFS="|"
+for batch_size in ${batch_size_list[*]}; do
+ for precision in ${fp_items_list[*]}; do
+ for device_num in ${device_num_list[*]}; do
+ # sed batchsize and precision
+ func_sed_params "$FILENAME" "${line_precision}" "$precision"
+ func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size"
+ func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch"
+ gpu_id=$(set_gpu_id $device_num)
+
+ if [ ${#gpu_id} -le 1 ];then
+ log_path="$SAVE_LOG/profiling_log"
+ mkdir -p $log_path
+ log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling"
+ func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
+ # set profile_option params
+ tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
+
+ # run test_train_inference_python.sh
+ cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+ echo $cmd
+ eval $cmd
+ eval "cat ${log_path}/${log_name}"
+
+ # without profile
+ log_path="$SAVE_LOG/train_log"
+ speed_log_path="$SAVE_LOG/index"
+ mkdir -p $log_path
+ mkdir -p $speed_log_path
+ log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
+ speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
+ func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
+ cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+ echo $cmd
+ job_bt=`date '+%Y%m%d%H%M%S'`
+ eval $cmd
+ job_et=`date '+%Y%m%d%H%M%S'`
+ export model_run_time=$((${job_et}-${job_bt}))
+ eval "cat ${log_path}/${log_name}"
+
+            # parse log
+ _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
+ cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
+ --speed_log_file '${speed_log_path}/${speed_log_name}' \
+ --model_name ${_model_name} \
+ --base_batch_size ${batch_size} \
+ --run_mode ${run_mode} \
+ --fp_item ${precision} \
+ --keyword ips: \
+ --skip_steps 2 \
+ --device_num ${device_num} \
+ --speed_unit samples/s \
+ --convergence_key loss: "
+ echo $cmd
+ eval $cmd
+ last_status=${PIPESTATUS[0]}
+ status_check $last_status "${cmd}" "${status_log}"
+ else
+ IFS=";"
+ unset_env=`unset CUDA_VISIBLE_DEVICES`
+ log_path="$SAVE_LOG/train_log"
+ speed_log_path="$SAVE_LOG/index"
+ mkdir -p $log_path
+ mkdir -p $speed_log_path
+ log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
+ speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
+ func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
+ func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
+ cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+ echo $cmd
+ job_bt=`date '+%Y%m%d%H%M%S'`
+ eval $cmd
+ job_et=`date '+%Y%m%d%H%M%S'`
+ export model_run_time=$((${job_et}-${job_bt}))
+ eval "cat ${log_path}/${log_name}"
+            # parse log
+ _model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
+
+ cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
+ --speed_log_file '${speed_log_path}/${speed_log_name}' \
+ --model_name ${_model_name} \
+ --base_batch_size ${batch_size} \
+ --run_mode ${run_mode} \
+ --fp_item ${precision} \
+ --keyword ips: \
+ --skip_steps 2 \
+ --device_num ${device_num} \
+ --speed_unit images/s \
+ --convergence_key loss: "
+ echo $cmd
+ eval $cmd
+ last_status=${PIPESTATUS[0]}
+ status_check $last_status "${cmd}" "${status_log}"
+ fi
+ done
+ done
+done
diff --git a/test_tipc/common_func.sh b/test_tipc/common_func.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7e349f0e2ff079f715a3ad1211fd6cd7432c0283
--- /dev/null
+++ b/test_tipc/common_func.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+function func_parser_key(){
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ tmp=${array[0]}
+ echo ${tmp}
+}
+
+function func_parser_value(){
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ tmp=${array[1]}
+ echo ${tmp}
+}
+
+function func_set_params(){
+ key=$1
+ value=$2
+ if [ ${key}x = "null"x ];then
+ echo " "
+ elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then
+ echo " "
+ else
+ echo "${key}=${value}"
+ fi
+}
+
+function func_parser_params(){
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ key=${array[0]}
+ tmp=${array[1]}
+ IFS="|"
+ res=""
+ for _params in ${tmp[*]}; do
+ IFS="="
+ array=(${_params})
+ mode=${array[0]}
+ value=${array[1]}
+ if [[ ${mode} = ${MODE} ]]; then
+ IFS="|"
+ #echo $(func_set_params "${mode}" "${value}")
+ echo $value
+ break
+ fi
+ IFS="|"
+ done
+ echo ${res}
+}
+
+function status_check(){
+ last_status=$1 # the exit code
+ run_command=$2
+ run_log=$3
+ model_name=$4
+ log_path=$5
+ if [ $last_status -eq 0 ]; then
+ echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log}
+ else
+ echo -e "\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log}
+ fi
+}
diff --git a/test_tipc/compare_results.py b/test_tipc/compare_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d19197e89bccdb6b0beaf84bc2b83cc22ef5c48
--- /dev/null
+++ b/test_tipc/compare_results.py
@@ -0,0 +1,55 @@
+import numpy as np
+import os
+import subprocess
+import json
+import argparse
+import glob
+
+
+def init_args():
+ parser = argparse.ArgumentParser()
+ # params for testing assert allclose
+ parser.add_argument("--atol", type=float, default=1e-3)
+ parser.add_argument("--rtol", type=float, default=1e-3)
+ parser.add_argument("--gt_file", type=str, default="")
+ parser.add_argument("--log_file", type=str, default="")
+ parser.add_argument("--precision", type=str, default="fp32")
+ return parser
+
+def parse_args():
+ parser = init_args()
+ return parser.parse_args()
+
+def load_from_file(gt_file):
+ if not os.path.exists(gt_file):
+        raise ValueError("The log file {} does not exist!".format(gt_file))
+    with open(gt_file, 'r') as f:
+        data = f.readlines()
+ parser_gt = {}
+ for line in data:
+ metric_name, result = line.strip("\n").split(":")
+ parser_gt[metric_name] = float(result)
+ return parser_gt
+
+if __name__ == "__main__":
+ # Usage:
+ # python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/*.txt --log_file=./test_tipc/output/*/*.txt
+
+ args = parse_args()
+
+ gt_collection = load_from_file(args.gt_file)
+ pre_collection = load_from_file(args.log_file)
+
+ for metric in pre_collection.keys():
+ try:
+ np.testing.assert_allclose(
+ np.array(pre_collection[metric]), np.array(gt_collection[metric]), atol=args.atol, rtol=args.rtol)
+ print(
+ "Assert allclose passed! The results of {} are consistent!".
+ format(metric))
+ except Exception as E:
+ print(E)
+ raise ValueError(
+ "The results of {} are inconsistent!".
+ format(metric))
\ No newline at end of file
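
For reference, `load_from_file` above expects one `metric_name:value` pair per line, and the check uses `np.testing.assert_allclose` with the given tolerances. A minimal sketch of the expected file contents and the comparison (metric names and values are illustrative):

```python
import numpy as np

# a results file holds one "<metric_name>:<value>" pair per line
gt_text = "psnr:28.13\nssim:0.812\n"   # stands in for --gt_file
log_text = "psnr:28.14\nssim:0.811\n"  # stands in for --log_file


def parse(text):
    return {name: float(value)
            for name, value in (line.split(":") for line in text.strip().splitlines())}


gt, pred = parse(gt_text), parse(log_text)
for metric, value in pred.items():
    np.testing.assert_allclose(value, gt[metric], atol=1e-3, rtol=1e-3)
print("Assert allclose passed! The results are consistent.")
```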
diff --git a/test_tipc/configs/CycleGAN/train_infer_python.txt b/test_tipc/configs/CycleGAN/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cd27a2218cd1c4b2e577e6b4f33f6df2b99264d9
--- /dev/null
+++ b/test_tipc/configs/CycleGAN/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:CycleGAN
+python:python3.7
+gpu_list:0|0,1
+##
+auto_cast:null
+epochs:lite_train_lite_infer=1|lite_train_whole_infer=1|whole_train_whole_infer=200
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=1|whole_train_whole_infer=1
+pretrained_model:null
+train_model_name:cyclegan_horse2zebra*/*checkpoint.pdparams
+train_infer_img_dir:./data/horse2zebra/test
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/cyclegan_horse2zebra.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=1
+pact_train:null
+fpgm_train:null
+distill_train:null
+to_static_train:model.to_static=True
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/cyclegan_horse2zebra.yaml --inputs_size="-1,3,-1,-1;-1,3,-1,-1" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/cyclegan_horse2zebra/cycleganmodel_netG_A
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type cyclegan --seed 123 -c configs/cyclegan_horse2zebra.yaml --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:1
+fp_items:fp32
+epoch:1
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,256,256]}]
\ No newline at end of file
diff --git a/test_tipc/configs/FOMM/train_infer_python.txt b/test_tipc/configs/FOMM/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..50b522f249102eeb668deedc364171279004b88d
--- /dev/null
+++ b/test_tipc/configs/FOMM/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:FOMM
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+epochs:lite_train_lite_infer=1|lite_train_whole_infer=1|whole_train_whole_infer=100
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=8|whole_train_whole_infer=8
+pretrained_model:null
+train_model_name:firstorder_vox_256*/*checkpoint.pdparams
+train_infer_img_dir:./data/firstorder_vox_256/test
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/firstorder_vox_256.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=1 dataset.train.num_repeats=1 dataset.train.id_sampling=False
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/firstorder_vox_256.yaml --inputs_size="1,3,256,256;1,3,256,256;1,10,2;1,10,2,2" --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:fom_dy2st
+train_model:./inference/fom_dy2st/
+infer_export:null
+infer_quant:False
+inference:tools/fom_infer.py --driving_path data/first_order/Voxceleb/test --output_path test_tipc/output/fom/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:16
+fp_items:fp32
+epoch:15
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_cudnn_exhaustive_search=1
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,256,256]}]
diff --git a/test_tipc/configs/GFPGAN/train_infer_python.txt b/test_tipc/configs/GFPGAN/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e38a7eac883ed608afc613730a03090f4ee7fe1f
--- /dev/null
+++ b/test_tipc/configs/GFPGAN/train_infer_python.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:GFPGAN
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=3
+pretrained_model:null
+train_model_name:gfpgan_ffhq1024*/*checkpoint.pdparams
+train_infer_img_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/gfpgan_ffhq1024.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=10
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/gfpgan_ffhq1024.yaml --inputs_size="1,3,512,512" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/stylegan2/stylegan2model_gen
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type gfpgan --seed 123 -c configs/gfpgan_ffhq1024.yaml --output_path test_tipc/output/ -o validate=None
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:False
+null:null
diff --git a/test_tipc/configs/Pix2pix/train_infer_python.txt b/test_tipc/configs/Pix2pix/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e1aed5ebb886a926b60a1635c6e3df758200708d
--- /dev/null
+++ b/test_tipc/configs/Pix2pix/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:Pix2pix
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+epochs:lite_train_lite_infer=10|lite_train_whole_infer=10|whole_train_whole_infer=200
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=1|whole_train_whole_infer=1
+pretrained_model:null
+train_model_name:pix2pix_facades*/*checkpoint.pdparams
+train_infer_img_dir:./data/facades/test
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/pix2pix_facades.yaml --seed 123 -o log_config.interval=1
+pact_train:null
+fpgm_train:null
+distill_train:null
+to_static_train:model.to_static=True
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/pix2pix_facades.yaml --inputs_size="-1,3,-1,-1" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/pix2pix_facade/pix2pixmodel_netG
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type pix2pix --seed 123 -c configs/pix2pix_facades.yaml --output_path test_tipc/output/
+--device:cpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:1
+fp_items:fp32
+epoch:10
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[3,256,256]}]
diff --git a/test_tipc/configs/StyleGANv2/train_infer_python.txt b/test_tipc/configs/StyleGANv2/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..182214ecdb9ae42da47fdc4898f1f96984df736a
--- /dev/null
+++ b/test_tipc/configs/StyleGANv2/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:StyleGANv2
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10|lite_train_whole_infer=10|whole_train_whole_infer=800
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=3|whole_train_whole_infer=3
+pretrained_model:null
+train_model_name:stylegan_v2_256_ffhq*/*checkpoint.pdparams
+train_infer_img_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/stylegan_v2_256_ffhq.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=10
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/stylegan_v2_256_ffhq.yaml --inputs_size="1,1,512;1,1" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/stylegan2/stylegan2model_gen
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type stylegan2 --seed 123 -c configs/stylegan_v2_256_ffhq.yaml --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:8
+fp_items:fp32
+epoch:100
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_cudnn_exhaustive_search=1
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[1, 512]}, {float32,[1]}]
diff --git a/test_tipc/configs/aotgan/train_infer_python.txt b/test_tipc/configs/aotgan/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a5d82f988862ff58f4a41196609f3dc52b017fd6
--- /dev/null
+++ b/test_tipc/configs/aotgan/train_infer_python.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:aotgan
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+epochs:lite_train_lite_infer=10|lite_train_whole_infer=10|whole_train_whole_infer=200
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=1|whole_train_whole_infer=1
+pretrained_model:null
+train_model_name:aotgan*/*checkpoint.pdparams
+train_infer_img_dir:./data/aotgan
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/aotgan.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=1
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/aotgan.yaml --inputs_size="-1,4,-1,-1" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/aotgan/aotganmodel_netG
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type aotgan --seed 123 -c configs/aotgan.yaml --output_path test_tipc/output/
+--device:cpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
diff --git a/test_tipc/configs/basicvsr/train_infer_python.txt b/test_tipc/configs/basicvsr/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..484aecbad64f3ba038ad5c0d3271f132248fcf57
--- /dev/null
+++ b/test_tipc/configs/basicvsr/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:basicvsr
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10|lite_train_whole_infer=10|whole_train_whole_infer=200
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=1|whole_train_whole_infer=1
+pretrained_model:null
+train_model_name:basicvsr_reds*/*checkpoint.pdparams
+train_infer_img_dir:./data/basicvsr_reds/test
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/basicvsr_reds.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=5
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/basicvsr_reds.yaml --inputs_size="1,6,3,180,320" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/basicvsr/basicvsrmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type basicvsr -c configs/basicvsr_reds.yaml --seed 123 -o dataset.test.num_frames=6 --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:2|4
+fp_items:fp32
+total_iters:50
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_cudnn_exhaustive_search=1
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[6,3,180,320]}]
diff --git a/test_tipc/configs/edvr/train_infer_python.txt b/test_tipc/configs/edvr/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9379f9c4e4c1cdffa0c31f1aef09b13c7971aadf
--- /dev/null
+++ b/test_tipc/configs/edvr/train_infer_python.txt
@@ -0,0 +1,57 @@
+===========================train_params===========================
+model_name:edvr
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=100
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=4
+pretrained_model:null
+train_model_name:edvr_m_wo_tsa*/*checkpoint.pdparams
+train_infer_img_dir:./data/basicvsr_reds/test
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/edvr_m_wo_tsa.yaml --seed 123 -o log_config.interval=5 snapshot_config.interval=25
+pact_train:null
+fpgm_train:null
+distill_train:null
+to_static_train:model.to_static=True
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/edvr_m_wo_tsa.yaml --inputs_size="1,5,3,180,320" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/edvr/edvrmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type edvr -c configs/edvr_m_wo_tsa.yaml --seed 123 -o dataset.test.num_frames=5 --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:64
+fp_items:fp32|fp16
+total_iters:100
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_cudnn_exhaustive_search=0
diff --git a/test_tipc/configs/edvr/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt b/test_tipc/configs/edvr/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a9ed70507c734dfeae8b54c00c2183309da6f343
--- /dev/null
+++ b/test_tipc/configs/edvr/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt
@@ -0,0 +1,26 @@
+===========================train_params===========================
+model_name:edvr
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=100
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=4
+pretrained_model:null
+train_model_name:basicvsr_reds*/*checkpoint.pdparams
+train_infer_img_dir:./data/basicvsr_reds/test
+null:null
+##
+trainer:amp_train
+amp_train:tools/main.py --amp --amp_level O2 -c configs/edvr_m_wo_tsa.yaml --seed 123 -o log_config.interval=5
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
diff --git a/test_tipc/configs/esrgan/train_infer_python.txt b/test_tipc/configs/esrgan/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c70b93d27a3437b1bfdeb6b27dc807b82c1c4340
--- /dev/null
+++ b/test_tipc/configs/esrgan/train_infer_python.txt
@@ -0,0 +1,57 @@
+===========================train_params===========================
+model_name:esrgan
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=100
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=2
+pretrained_model:null
+train_model_name:esrgan_psnr_x4_div2k*/*checkpoint.pdparams
+train_infer_img_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/esrgan_psnr_x4_div2k.yaml --seed 123 -o log_config.interval=10 snapshot_config.interval=25
+pact_train:null
+fpgm_train:null
+distill_train:null
+to_static_train:model.to_static=True
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/esrgan_psnr_x4_div2k.yaml --inputs_size="1,3,128,128" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/esrgan/esrganmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type esrgan -c configs/esrgan_psnr_x4_div2k.yaml --seed 123 --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:32|64
+fp_items:fp32
+total_iters:500
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_cudnn_exhaustive_search=0
diff --git a/test_tipc/configs/invdn/train_infer_python.txt b/test_tipc/configs/invdn/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..78df03128754596ae4db64596a7ed6854463a9a1
--- /dev/null
+++ b/test_tipc/configs/invdn/train_infer_python.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:invdn
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10
+output_dir:./output/
+snapshot_config.interval:lite_train_lite_infer=10
+pretrained_model:null
+train_model_name:invdn*/*checkpoint.pdparams
+train_infer_img_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/invdn_denoising.yaml --seed 100 -o log_config.interval=1
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/invdn_denoising.yaml --inputs_size=1,3,256,256 --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/invdn/invdnmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type invdn --seed 100 -c configs/invdn_denoising.yaml --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
diff --git a/test_tipc/configs/msvsr/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt b/test_tipc/configs/msvsr/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d96065b549fa5d3e33812b4eb5292321c1114a65
--- /dev/null
+++ b/test_tipc/configs/msvsr/model_linux_gpu_normal_normal_infer_cpp_linux_gpu_cpu.txt
@@ -0,0 +1,12 @@
+===========================cpp_infer_params===========================
+model_name:msvsr
+inference:./deploy/cpp_infer/build/vsr
+--infer_model_path:./inference/msvsr/multistagevsrmodel_generator.pdmodel
+--infer_param_path:./inference/msvsr/multistagevsrmodel_generator.pdiparams
+--video_path:./data/low_res.mp4
+--output_dir:./test_tipc/output/msvsr
+--frame_num:2
+--device:GPU
+--gpu_id:1
+--use_mkldnn:True
+--cpu_threads:1
diff --git a/test_tipc/configs/msvsr/train_infer_python.txt b/test_tipc/configs/msvsr/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..21539838a2bb69ea55df4b9b070d96d42d2294c6
--- /dev/null
+++ b/test_tipc/configs/msvsr/train_infer_python.txt
@@ -0,0 +1,59 @@
+===========================train_params===========================
+model_name:msvsr
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10|lite_train_whole_infer=10|whole_train_whole_infer=200
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=1|whole_train_whole_infer=1
+pretrained_model:null
+train_model_name:msvsr_reds*/*checkpoint.pdparams
+train_infer_img_dir:./data/msvsr_reds/test
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/msvsr_reds.yaml --seed 123 -o log_config.interval=1 snapshot_config.interval=5 dataset.train.dataset.num_frames=15
+pact_train:null
+fpgm_train:null
+distill_train:null
+to_static_train:model.to_static=True
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/msvsr_reds.yaml --inputs_size="1,2,3,180,320" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/msvsr/multistagevsrmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type msvsr -c configs/msvsr_reds.yaml --seed 123 -o dataset.test.num_frames=2 --output_path test_tipc/output/
+--device:cpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================train_benchmark_params==========================
+batch_size:2|4
+fp_items:fp32|fp16
+total_iters:60
+--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
+flags:FLAGS_cudnn_exhaustive_search=0
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[2,3,180,320]}]
diff --git a/test_tipc/configs/msvsr/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt b/test_tipc/configs/msvsr/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2de77fb11980d56441a266ad001016032f8d4fa0
--- /dev/null
+++ b/test_tipc/configs/msvsr/train_linux_gpu_normal_amp_infer_python_linux_gpu_cpu.txt
@@ -0,0 +1,53 @@
+===========================train_params===========================
+model_name:msvsr
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10|lite_train_whole_infer=10|whole_train_whole_infer=200
+output_dir:./output/
+dataset.train.batch_size:lite_train_lite_infer=1|whole_train_whole_infer=1
+pretrained_model:null
+train_model_name:msvsr_reds*/*checkpoint.pdparams
+train_infer_img_dir:./data/msvsr_reds/test
+null:null
+##
+trainer:amp_train
+amp_train:tools/main.py --amp --amp_level O1 -c configs/msvsr_reds.yaml --seed 123 -o dataset.train.num_workers=0 log_config.interval=1 snapshot_config.interval=5 dataset.train.dataset.num_frames=2
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/msvsr_reds.yaml --inputs_size="1,2,3,180,320" --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/msvsr/multistagevsrmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type msvsr -c configs/msvsr_reds.yaml --seed 123 -o dataset.test.num_frames=2 --output_path test_tipc/output/
+--device:cpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
+===========================infer_benchmark_params==========================
+random_infer_input:[{float32,[2,3,180,320]}]
diff --git a/test_tipc/configs/nafnet/train_infer_python.txt b/test_tipc/configs/nafnet/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8cd3ddc16ebbe1ef62067831c8ca8770589f7a5a
--- /dev/null
+++ b/test_tipc/configs/nafnet/train_infer_python.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:nafnet
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10
+output_dir:./output/
+snapshot_config.interval:lite_train_lite_infer=10
+pretrained_model:null
+train_model_name:nafnet*/*checkpoint.pdparams
+train_infer_img_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/nafnet_denoising.yaml --seed 100 -o log_config.interval=1
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/nafnet_denoising.yaml --inputs_size=1,3,256,256 --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/nafnet/nafnetmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type nafnet --seed 100 -c configs/nafnet_denoising.yaml --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
diff --git a/test_tipc/configs/singan/train_infer_python.txt b/test_tipc/configs/singan/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1d7f13da0921f51d6e194021294d38012c99ae19
--- /dev/null
+++ b/test_tipc/configs/singan/train_infer_python.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:singan
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=100|whole_train_whole_infer=100000
+output_dir:./output/
+snapshot_config.interval:lite_train_lite_infer=25|whole_train_whole_infer=10000
+pretrained_model:null
+train_model_name:singan*/*checkpoint.pdparams
+train_infer_img_dir:./data/stone
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/singan_universal.yaml --seed 123 -o log_config.interval=50
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/singan_universal.yaml --inputs_size=1 --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:singan_random_sample
+train_model:./inference/singan/singan_random_sample
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type singan --seed 123 -c configs/singan_universal.yaml --output_path test_tipc/output/
+--device:cpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
\ No newline at end of file
diff --git a/test_tipc/configs/swinir/train_infer_python.txt b/test_tipc/configs/swinir/train_infer_python.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f41cec2d4338c3c02f9e34190eb9595ac7944b11
--- /dev/null
+++ b/test_tipc/configs/swinir/train_infer_python.txt
@@ -0,0 +1,51 @@
+===========================train_params===========================
+model_name:swinir
+python:python3.7
+gpu_list:0
+##
+auto_cast:null
+total_iters:lite_train_lite_infer=10
+output_dir:./output/
+snapshot_config.interval:lite_train_lite_infer=10
+pretrained_model:null
+train_model_name:swinir*/*checkpoint.pdparams
+train_infer_img_dir:null
+null:null
+##
+trainer:norm_train
+norm_train:tools/main.py -c configs/swinir_denoising.yaml --seed 100 -o log_config.interval=1
+pact_train:null
+fpgm_train:null
+distill_train:null
+null:null
+null:null
+##
+===========================eval_params===========================
+eval:null
+null:null
+##
+===========================infer_params===========================
+--output_dir:./output/
+load:null
+norm_export:tools/export_model.py -c configs/swinir_denoising.yaml --inputs_size=1,3,128,128 --model_name inference --load
+quant_export:null
+fpgm_export:null
+distill_export:null
+export1:null
+export2:null
+inference_dir:inference
+train_model:./inference/swinir/swinirmodel_generator
+infer_export:null
+infer_quant:False
+inference:tools/inference.py --model_type swinir --seed 100 -c configs/swinir_denoising.yaml --output_path test_tipc/output/
+--device:gpu
+null:null
+null:null
+null:null
+null:null
+null:null
+--model_path:
+null:null
+null:null
+--benchmark:True
+null:null
diff --git a/test_tipc/docs/benchmark_train.md b/test_tipc/docs/benchmark_train.md
new file mode 100644
index 0000000000000000000000000000000000000000..acd85d66370f7273b2bd8341f3ad3b598b018529
--- /dev/null
+++ b/test_tipc/docs/benchmark_train.md
@@ -0,0 +1,52 @@
+
+# TIPC Linux Benchmark Test Guide
+
+This document describes the benchmark test. The main program for benchmark testing is `benchmark_train.sh`, which is used to verify and monitor model training performance.
+
+## 1. Test workflow
+### 1.1 Prepare data and set up the environment
+Run `test_tipc/prepare.sh` to prepare the training data and install the environment.
+
+```shell
+# usage: bash test_tipc/prepare.sh train_benchmark.txt mode
+bash test_tipc/prepare.sh test_tipc/configs/msvsr/train_infer_python.txt benchmark_train
+```
+
+### 1.2 Functional test
+Run `test_tipc/benchmark_train.sh` to train the model and parse the training logs.
+
+```shell
+# usage: bash test_tipc/benchmark_train.sh train_benchmark.txt mode
+bash test_tipc/benchmark_train.sh test_tipc/configs/msvsr/train_infer_python.txt benchmark_train
+```
+
+`test_tipc/benchmark_train.sh` also accepts a third argument to run only a single training configuration, for example:
+```shell
+# usage: bash test_tipc/benchmark_train.sh train_benchmark.txt mode params
+bash test_tipc/benchmark_train.sh test_tipc/configs/msvsr/train_infer_python.txt benchmark_train dynamic_bs4_fp32_DP_N1C1
+```
+`dynamic_bs4_fp32_DP_N1C1` is the extra argument passed to `test_tipc/benchmark_train.sh`, in the format
+`${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}`
+It encodes the model type, the batch size, the training precision (fp32, fp16, etc.), the distributed run mode, and the machines used for distributed training, e.g. single machine with a single card (N1C1).
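+
+As a concrete illustration (a sketch only: valid values are the ones listed under `train_benchmark_params` in the config, e.g. `batch_size:2|4` and `fp_items:fp32|fp16` for msvsr), a second single-card configuration could be run as:
+
+```shell
+# run only the dynamic-graph, batch-size-2, fp16, data-parallel, single-machine single-card configuration
+bash test_tipc/benchmark_train.sh test_tipc/configs/msvsr/train_infer_python.txt benchmark_train dynamic_bs2_fp16_DP_N1C1
+```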
+
+
+## 2. Log output
+
+After the run, the training log and the parsed log are saved. With the `test_tipc/configs/basicvsr/train_benchmark.txt` parameter file, the parsed training log looks like:
+
+```
+{"model_branch": "dygaph", "model_commit": "7c39a1996b19087737c05d883fd346d2f39dbcc0", "model_name": "basicvsr_bs4_fp32_SingleP_DP", "batch_size": 4, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "5.413110", "convergence_key": "loss:", "ips": 19.333, "speed_unit": "samples/s", "device_num": "N1C1", "model_run_time": "0", "frame_commit": "8cc09552473b842c651ead3b9848d41827a3dbab", "frame_version": "0.0.0"}
+```
+
+The training logs and the parsed results are saved under the benchmark_log directory, organized as follows:
+```
+train_log/
+├── index
+│ ├── PaddleGAN_msvsr_bs4_fp32_SingleP_DP_N1C1_speed
+│ └── PaddleGAN_msvsr_bs4_fp32_SingleP_DP_N1C4_speed
+├── profiling_log
+│ └── PaddleGAN_msvsr_bs4_fp32_SingleP_DP_N1C1_profiling
+└── train_log
+ ├── PaddleGAN_msvsr_bs4_fp32_SingleP_DP_N1C1_log
+ └── PaddleGAN_msvsr_bs4_fp32_MultiP_DP_N1C4_log
+```
diff --git a/test_tipc/docs/compare_right.png b/test_tipc/docs/compare_right.png
new file mode 100644
index 0000000000000000000000000000000000000000..af5766a30f0972e29405fbf5a9a36a8f35aece80
Binary files /dev/null and b/test_tipc/docs/compare_right.png differ
diff --git a/test_tipc/docs/compare_wrong.png b/test_tipc/docs/compare_wrong.png
new file mode 100644
index 0000000000000000000000000000000000000000..3454b54b0a0a722806fb65f76c01a4efdd9b9444
Binary files /dev/null and b/test_tipc/docs/compare_wrong.png differ
diff --git a/test_tipc/docs/test.png b/test_tipc/docs/test.png
new file mode 100644
index 0000000000000000000000000000000000000000..f99f23d7050eb61879cf317c0d7728ef14531b08
Binary files /dev/null and b/test_tipc/docs/test.png differ
diff --git a/test_tipc/docs/test_inference_cpp.md b/test_tipc/docs/test_inference_cpp.md
new file mode 100644
index 0000000000000000000000000000000000000000..5051b3e4be1bcd2fc698ade20cfa4e5a85bf1dd5
--- /dev/null
+++ b/test_tipc/docs/test_inference_cpp.md
@@ -0,0 +1,49 @@
+# C++ Inference Test
+
+The main program for the C++ inference test is `test_inference_cpp.sh`, which tests model inference based on the C++ inference library.
+
+## 1. Summary of test results
+
+| Model type | Device | Batch size | TensorRT | MKL-DNN | CPU multi-threading |
+| :----: | :----: | :----: | :----: | :----: | :----: |
+| Normal model | GPU | 1 | - | - | - |
+| Normal model | CPU | 1 | - | fp32 | Supported |
+
+## 2. Test workflow
+To set up the runtime environment, follow the [installation guide](../../docs/zh_CN/install.md) to install PaddleGAN. The environment recommended by TIPC is:
+- PaddlePaddle=2.3.1
+- CUDA=10.2
+- cuDNN=7.6.5
+
+### 2.1 Functional test
+First run `prepare.sh` to prepare the data and model, then run `test_inference_cpp.sh` to test. The concrete test for the msvsr model is as follows:
+
+```bash
+# prepare the model and data
+bash test_tipc/prepare.sh test_tipc/configs/msvsr/inference_cpp.txt cpp_infer
+# run cpp inference; edit inference_cpp.txt to test the inference results under other configurations
+bash test_tipc/test_inference_cpp.sh test_tipc/configs/msvsr/inference_cpp.txt
+```
+
+After running the inference command, the run logs and output results are saved automatically under the `test_tipc/output` folder, including the following files:
+
+```shell
+test_tipc/output
+  ├── infer_cpp/results_cpp_infer.log                  # log of the run status of each command
+  ├── infer_cpp/infer_cpp_GPU.log                      # log of the GPU inference test
+  ├── infer_cpp/infer_cpp_CPU_use_mkldnn_threads_1.log # log of the CPU inference test with mkldnn enabled and 1 thread
+  ├── output.mp4                                       # video super-resolution result
+......
+```
+`results_cpp_infer.log` contains the run status of each command. If a command runs successfully, it prints:
+
+```
+Run successfully with command - ./deploy/cpp_infer/build/vsr --model_path=./inference/msvsr/multistagevsrmodel_generator.pdmodel --param_path=./inference/msvsr/multistagevsrmodel_generator.pdiparams --video_path=./data/low_res.mp4 --output_dir=./test_tipc/output/msvsr --frame_num=2 --device=GPU --gpu_id=1 --use_mkldnn=True --cpu_threads=1 > ./test_tipc/output/infer_cpp/infer_cpp_GPU.log 2>&1!
+......
+```
+If a command fails, it prints:
+```
+Run failed with command - ./deploy/cpp_infer/build/vsr --model_path=./inference/msvsr/multistagevsrmodel_generator.pdmodel --param_path=./inference/msvsr/multistagevsrmodel_generator.pdiparams --video_path=./data/low_res.mp4 --output_dir=./test_tipc/output/msvsr --frame_num=2 --device=GPU --gpu_id=1 --use_mkldnn=True --cpu_threads=1 > ./test_tipc/output/infer_cpp/infer_cpp_GPU.log 2>&1!
+......
+```
+You can use the content of results_cpp_infer.log to determine which command failed.
diff --git a/test_tipc/docs/test_train_inference_python.md b/test_tipc/docs/test_train_inference_python.md
new file mode 100644
index 0000000000000000000000000000000000000000..0090fe606ace8e59e100cb46d69f4bd515c9c2d9
--- /dev/null
+++ b/test_tipc/docs/test_train_inference_python.md
@@ -0,0 +1,129 @@
+# Linux Basic Training and Inference Test
+
+The main program for the basic Linux training and inference test is `test_train_inference_python.sh`, which tests Python-based model training, evaluation, inference, and other basic features.
+
+
+## 1. Summary of test results
+
+- Training:
+
+| Paper | Model | Task | Basic training & inference | More training modes | Model compression | Other deployment |
+| :--- | :--- | :----: | :--------: | :---- | :---- | :---- |
+| Pix2Pix | Pix2Pix | Generation | Supported | multi-machine multi-card | | |
+| CycleGAN | CycleGAN | Generation | Supported | multi-machine multi-card | | |
+| StyleGAN2 | StyleGAN2 | Generation | Supported | multi-machine multi-card | | |
+| FOMM | FOMM | Generation | Supported | multi-machine multi-card | | |
+| BasicVSR | BasicVSR | Super resolution | Supported | multi-machine multi-card | | |
+| PP-MSVSR | PP-MSVSR | Super resolution | | | | |
+| edvr | edvr | Super resolution | Supported | | | |
+| esrgan | esrgan | Super resolution | Supported | | | |
+
+- Inference: the supported inference features are summarized below.
+
+| Model type | Device | Batch size | TensorRT | MKL-DNN | CPU multi-threading |
+| ---- | ---- | ---- | :----: | :----: | :----: |
+| Normal model | GPU | 1/6 | fp32 | - | - |
+
+
+
+## 2. Test workflow
+
+To configure the runtime environment, follow the [installation guide](../../docs/zh_CN/install.md).
+
+### 2.1 Install dependencies
+- Install PaddlePaddle >= 2.1
+- Install the PaddleGAN dependencies
+ ```
+ pip install -v -e .
+ ```
+- Install autolog (a tool for standardized log output)
+ ```
+ git clone https://github.com/LDOUBLEV/AutoLog
+ cd AutoLog
+ pip3 install -r requirements.txt
+ python3 setup.py bdist_wheel
+ pip3 install ./dist/auto_log-1.0.0-py3-none-any.whl
+ cd ../
+ ```
+
+
+### 2.2 Functional test
+First run `prepare.sh` to prepare the data and model, then run `test_train_inference_python.sh` to test. Log files named like `python_infer_*.log` are generated under the `test_tipc/output` directory.
+
+
+`test_train_inference_python.sh` supports four run modes. Each mode uses a different amount of data and is used to test either speed or accuracy:
+
+- Mode 1: lite_train_lite_infer, train with a small amount of data; used to quickly verify that the training-to-inference pipeline runs end to end, without checking accuracy or speed;
+```shell
+bash test_tipc/prepare.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'lite_train_lite_infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'lite_train_lite_infer'
+```
+
+- Mode 2: lite_train_whole_infer, train with a small amount of data and infer with a moderate amount of data; used to verify that the trained model can run inference and that the inference speed is reasonable;
+```shell
+bash test_tipc/prepare.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'lite_train_whole_infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'lite_train_whole_infer'
+```
+
+- Mode 3: whole_infer, no training, inference with the full data; runs the released model through evaluation and dynamic-to-static conversion, and checks the inference model's prediction time and accuracy;
+```shell
+bash test_tipc/prepare.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'whole_infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'whole_infer'
+```
+
+- Mode 4: whole_train_whole_infer, CE: train and infer with the full data; verifies training accuracy, inference accuracy, and inference speed;
+```shell
+bash test_tipc/prepare.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'whole_train_whole_infer'
+bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'whole_train_whole_infer'
+```
+
+After running the corresponding command, the run logs are saved automatically under the `test_tipc/output` folder. For example, in 'lite_train_lite_infer' mode the training + inference chain is run, so the `test_tipc/output` folder contains the following files:
+```
+test_tipc/output/
+|- results_python.log                  # log of the run status of each command
+|- norm_train_gpus_0_autocast_null/    # training logs and saved models from normal training on GPU card 0
+......
+```
+
+`results_python.log` contains the run status of each command. If a command runs successfully, it prints:
+```
+Run successfully with command - python3.7 tools/main.py -c configs/basicvsr_reds.yaml -o dataset.train.dataset.num_clips=2 output_dir=./test_tipc/output/norm_train_gpus_0_autocast_null total_iters=5 dataset.train.batch_size=1 !
+-=Run successfully with command - python3.7 tools/export_model.py -c configs/basicvsr_reds.yaml --inputs_size="1,6,3,180,320" --load ./test_tipc/output/norm_train_gpus_0_autocast_null/basicvsr_reds-2021-11-22-07-18/iter_1_checkpoint.pdparams --output_dir ./test_tipc/output/norm_train_gpus_0_autocast_null!
+......
+```
+If a command fails, it prints:
+```
+Run failed with command - python3.7 tools/main.py -c configs/basicvsr_reds.yaml -o dataset.train.dataset.num_clips=2 output_dir=./test_tipc/output/norm_train_gpus_0_autocast_null total_iters=5 dataset.train.batch_size=1 ! !
+Run failed with command - python3.7 tools/export_model.py -c configs/basicvsr_reds.yaml --inputs_size="1,6,3,180,320" --load ./test_tipc/output/norm_train_gpus_0_autocast_null/basicvsr_reds-2021-11-22-07-18/iter_1_checkpoint.pdparams --output_dir ./test_tipc/output/norm_train_gpus_0_autocast_null!
+......
+```
+You can easily determine which command failed from the content of `results_python.log`.
+
+
+### 2.3 Accuracy test
+
+Use the compare_results.py script to check whether the model predictions match the expected results. The main steps are:
+- extract the prediction results from the logs;
+- load the saved reference results from local files;
+- compare the two; an error is raised when the difference exceeds the configured tolerance.
+
+#### Usage
+Run:
+```shell
+python3.7 test_tipc/compare_results.py --gt_file=./test_tipc/results/*.txt --log_file=./test_tipc/output/*/*.txt --atol=1e-3 --rtol=1e-3
+```
+
+Parameters:
+- gt_file: path to the previously saved reference results; *.txt patterns are supported and *.txt files are indexed automatically; by default the files are stored under the test_tipc/results/ folder
+- log_file: path to the prediction logs produced by the infer mode of test_tipc/test_train_inference_python.sh; the prediction results are printed in these logs
+- atol: absolute tolerance
+- rtol: relative tolerance
+
+#### Expected output
+
+A successful comparison looks like the figure below:
+
+![](./compare_right.png)
+
+The output when the results do not match:
+
+![](./compare_wrong.png)
diff --git a/test_tipc/prepare.sh b/test_tipc/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c43542c34d10e2a5a4697ce9140217af508afce0
--- /dev/null
+++ b/test_tipc/prepare.sh
@@ -0,0 +1,202 @@
+#!/bin/bash
+FILENAME=$1
+
+# MODE must be one of ['lite_train_lite_infer', 'lite_train_whole_infer', 'whole_train_whole_infer',
+#                      'whole_infer', 'benchmark_train', 'cpp_infer']
+
+MODE=$2
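+# Usage (see test_tipc/docs): bash test_tipc/prepare.sh <config_txt> <mode>
+# e.g. bash test_tipc/prepare.sh test_tipc/configs/msvsr/train_infer_python.txt lite_train_lite_infer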
+
+dataline=$(cat ${FILENAME})
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+function func_parser_key(){
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ tmp=${array[0]}
+ echo ${tmp}
+}
+function func_parser_value(){
+ strs=$1
+ IFS=":"
+ array=(${strs})
+ tmp=${array[1]}
+ echo ${tmp}
+}
+IFS=$'\n'
+# The training params
+model_name=$(func_parser_value "${lines[1]}")
+trainer_list=$(func_parser_value "${lines[14]}")
+
+if [ ${MODE} = "benchmark_train" ];then
+ pip install -r requirements.txt
+ MODE="lite_train_lite_infer"
+fi
+
+if [ ${MODE} = "lite_train_lite_infer" ];then
+
+ case ${model_name} in
+ Pix2pix)
+ rm -rf ./data/pix2pix*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/pix2pix_facade_lite.tar --no-check-certificate
+ cd ./data/ && tar xf pix2pix_facade_lite.tar && cd ../ ;;
+ CycleGAN)
+ rm -rf ./data/cyclegan*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/cyclegan_horse2zebra_lite.tar --no-check-certificate
+ cd ./data/ && tar xf cyclegan_horse2zebra_lite.tar && cd ../ ;;
+ StyleGANv2)
+ rm -rf ./data/ffhq*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/ffhq_256.tar --no-check-certificate
+ cd ./data/ && tar xf ffhq_256.tar && cd ../ ;;
+ FOMM)
+ rm -rf ./data/fom_lite*
+        wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/fom_lite.tar --no-check-certificate
+ cd ./data/ && tar xf fom_lite.tar && cd ../ ;;
+ edvr|basicvsr|msvsr)
+ rm -rf ./data/reds*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/reds_lite.tar --no-check-certificate
+ cd ./data/ && tar xf reds_lite.tar && cd ../ ;;
+ esrgan)
+ rm -rf ./data/DIV2K*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/DIV2KandSet14paddle.tar --no-check-certificate
+ cd ./data/ && tar xf DIV2KandSet14paddle.tar && cd ../ ;;
+ swinir)
+ rm -rf ./data/*sets
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/swinir_data.zip --no-check-certificate
+ cd ./data/ && unzip -q swinir_data.zip && cd ../ ;;
+ invdn)
+ rm -rf ./data/SIDD_*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/SIDD_mini.zip --no-check-certificate
+ cd ./data/ && unzip -q SIDD_mini.zip && cd ../ ;;
+ nafnet)
+ rm -rf ./data/SIDD*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/SIDD_mini.zip --no-check-certificate
+ cd ./data/ && unzip -q SIDD_mini.zip && mkdir -p SIDD && mv ./SIDD_Medium_Srgb_Patches_512/* ./SIDD/ \
+ && mv ./SIDD_Valid_Srgb_Patches_256/* ./SIDD/ && mv ./SIDD/valid ./SIDD/val \
+ && mv ./SIDD/train/GT ./SIDD/train/target && mv ./SIDD/train/Noisy ./SIDD/train/input \
+ && mv ./SIDD/val/Noisy ./SIDD/val/input && mv ./SIDD/val/GT ./SIDD/val/target && cd ../ ;;
+ singan)
+ rm -rf ./data/singan*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/singan-official_images.zip --no-check-certificate
+ cd ./data/ && unzip -q singan-official_images.zip && cd ../
+ mkdir -p ./data/singan
+ mv ./data/SinGAN-official_images/Images/stone.png ./data/singan ;;
+ GFPGAN)
+ rm -rf ./data/gfpgan*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/gfpgan_tipc_data.zip --no-check-certificate
+ mkdir -p ./data/gfpgan_data
+ cd ./data/ && unzip -q gfpgan_tipc_data.zip -d gfpgan_data/ && cd ../ ;;
+ aotgan)
+ rm -rf ./data/aotgan*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/aotgan.zip --no-check-certificate
+ cd ./data/ && unzip -q aotgan.zip && cd ../ ;;
+ esac
+elif [ ${MODE} = "whole_train_whole_infer" ];then
+ if [ ${model_name} == "Pix2pix" ]; then
+ rm -rf ./data/facades*
+ wget -nc -P ./data/ http://efrosgans.eecs.berkeley.edu/pix2pix/datasets/facades.tar.gz --no-check-certificate
+ cd ./data/ && tar -xzf facades.tar.gz && cd ../
+ elif [ ${model_name} == "CycleGAN" ]; then
+ rm -rf ./data/horse2zebra*
+ wget -nc -P ./data/ https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/horse2zebra.zip --no-check-certificate
+ cd ./data/ && unzip horse2zebra.zip && cd ../
+ elif [ ${model_name} == "singan" ]; then
+ rm -rf ./data/singan*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/singan-official_images.zip --no-check-certificate
+ cd ./data/ && unzip -q singan-official_images.zip && cd ../
+ mkdir -p ./data/singan
+ mv ./data/SinGAN-official_images/Images/stone.png ./data/singan
+ fi
+elif [ ${MODE} = "lite_train_whole_infer" ];then
+ if [ ${model_name} == "Pix2pix" ]; then
+ rm -rf ./data/facades*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/pix2pix_facade_lite.tar --no-check-certificate
+ cd ./data/ && tar xf pix2pix_facade_lite.tar && cd ../
+ elif [ ${model_name} == "CycleGAN" ]; then
+ rm -rf ./data/horse2zebra*
+        wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/cyclegan_horse2zebra_lite.tar --no-check-certificate
+ cd ./data/ && tar xf cyclegan_horse2zebra_lite.tar && cd ../
+ elif [ ${model_name} == "FOMM" ]; then
+ rm -rf ./data/first_order*
+        wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/fom_lite.tar --no-check-certificate
+ cd ./data/ && tar xf fom_lite.tar && cd ../
+ elif [ ${model_name} == "StyleGANv2" ]; then
+ rm -rf ./data/ffhq*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/ffhq_256.tar --no-check-certificate
+ cd ./data/ && tar xf ffhq_256.tar && cd ../
+ elif [ ${model_name} == "basicvsr" ]; then
+ rm -rf ./data/reds*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/reds_lite.tar --no-check-certificate
+ cd ./data/ && tar xf reds_lite.tar && cd ../
+ elif [ ${model_name} == "msvsr" ]; then
+ rm -rf ./data/reds*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/reds_lite.tar --no-check-certificate
+ cd ./data/ && tar xf reds_lite.tar && cd ../
+ elif [ ${model_name} == "singan" ]; then
+ rm -rf ./data/singan*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/singan-official_images.zip --no-check-certificate
+ cd ./data/ && unzip -q singan-official_images.zip && cd ../
+ mkdir -p ./data/singan
+ mv ./data/SinGAN-official_images/Images/stone.png ./data/singan
+ fi
+elif [ ${MODE} = "whole_infer" ];then
+ if [ ${model_name} = "Pix2pix" ]; then
+ rm -rf ./data/facades*
+ rm -rf ./inference/pix2pix*
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/static_model/pix2pix_facade.tar --no-check-certificate
+ wget -nc -P ./data https://paddlegan.bj.bcebos.com/datasets/facades_test.tar --no-check-certificate
+ cd ./data && tar xf facades_test.tar && mv facades_test facades && cd ../
+ cd ./inference && tar xf pix2pix_facade.tar && cd ../
+ elif [ ${model_name} = "CycleGAN" ]; then
+ rm -rf ./data/cyclegan*
+ rm -rf ./inference/cyclegan*
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/static_model/cyclegan_horse2zebra.tar --no-check-certificate
+ wget -nc -P ./data https://paddlegan.bj.bcebos.com/datasets/cyclegan_horse2zebra_test.tar --no-check-certificate
+ cd ./data && tar xf cyclegan_horse2zebra_test.tar && mv cyclegan_test horse2zebra && cd ../
+ cd ./inference && tar xf cyclegan_horse2zebra.tar && cd ../
+ elif [ ${model_name} == "FOMM" ]; then
+ rm -rf ./data/first_order*
+ rm -rf ./inference/fom_dy2st*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/fom_lite_test.tar --no-check-certificate
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/static_model/fom_dy2st.tar --no-check-certificate
+ cd ./data/ && tar xf fom_lite_test.tar && cd ../
+ cd ./inference && tar xf fom_dy2st.tar && cd ../
+ elif [ ${model_name} == "StyleGANv2" ]; then
+ rm -rf ./data/ffhq*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/ffhq_256.tar --no-check-certificate
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/static_model/stylegan2_1024.tar --no-check-certificate
+ cd ./inference && tar xf stylegan2_1024.tar && cd ../
+ cd ./data/ && tar xf ffhq_256.tar && cd ../
+ elif [ ${model_name} == "basicvsr" ]; then
+ rm -rf ./data/reds*
+ rm -rf ./inference/basic*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/reds_lite.tar --no-check-certificate
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/static_model/basicvsr.tar --no-check-certificate
+ cd ./inference && tar xf basicvsr.tar && cd ../
+ cd ./data/ && tar xf reds_lite.tar && cd ../
+ elif [ ${model_name} == "msvsr" ]; then
+ rm -rf ./data/reds*
+ rm -rf ./inference/msvsr*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/reds_lite.tar --no-check-certificate
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/static_model/msvsr.tar --no-check-certificate
+ cd ./inference && tar xf msvsr.tar && cd ../
+ cd ./data/ && tar xf reds_lite.tar && cd ../
+ elif [ ${model_name} == "singan" ]; then
+ rm -rf ./data/singan*
+ wget -nc -P ./data/ https://paddlegan.bj.bcebos.com/datasets/singan-official_images.zip --no-check-certificate
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/datasets/singan.zip --no-check-certificate
+ cd ./data/ && unzip -q singan-official_images.zip && cd ../
+ cd ./inference/ && unzip -q singan.zip && cd ../
+ mkdir -p ./data/singan
+ mv ./data/SinGAN-official_images/Images/stone.png ./data/singan
+ fi
+elif [ ${MODE} = "cpp_infer" ]; then
+ if [ ${model_name} == "msvsr" ]; then
+ rm -rf ./inference/msvsr*
+ wget -nc -P ./inference https://paddlegan.bj.bcebos.com/static_model/msvsr.tar --no-check-certificate
+ cd ./inference && tar xf msvsr.tar && cd ../
+ wget -nc -P ./data https://paddlegan.bj.bcebos.com/datasets/low_res.mp4 --no-check-certificate
+ fi
+fi
diff --git a/test_tipc/results/python_basicvsr_results_fp32.txt b/test_tipc/results/python_basicvsr_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1f0df64217ef2a8454c914a5398bc2f5279e7c7f
--- /dev/null
+++ b/test_tipc/results/python_basicvsr_results_fp32.txt
@@ -0,0 +1,2 @@
+Metric psnr: 27.0864
+Metric ssim: 0.7835
diff --git a/test_tipc/results/python_cyclegan_results_fp32.txt b/test_tipc/results/python_cyclegan_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..beffc319e03dee64c28d333f777e12ff4209b7fa
--- /dev/null
+++ b/test_tipc/results/python_cyclegan_results_fp32.txt
@@ -0,0 +1 @@
+Metric fid: 67.9814
diff --git a/test_tipc/results/python_fom_results_fp32.txt b/test_tipc/results/python_fom_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8515822f73d267204eef9f89c9bd51d372c32b56
--- /dev/null
+++ b/test_tipc/results/python_fom_results_fp32.txt
@@ -0,0 +1 @@
+Metric l1 loss: 0.1210
diff --git a/test_tipc/results/python_msvsr_results_fp32.txt b/test_tipc/results/python_msvsr_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..01b8907f04d574feeda65c0e8e6c11c1e5cb2fd6
--- /dev/null
+++ b/test_tipc/results/python_msvsr_results_fp32.txt
@@ -0,0 +1,3 @@
+Metric psnr: 23.6020
+Metric ssim: 0.5636
+
diff --git a/test_tipc/results/python_pix2pix_results_fp32.txt b/test_tipc/results/python_pix2pix_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4950de48cdae119a71628a37cbcad4e5e8db9ba8
--- /dev/null
+++ b/test_tipc/results/python_pix2pix_results_fp32.txt
@@ -0,0 +1 @@
+Metric fid: 139.3846
diff --git a/test_tipc/results/python_singan_results_fp32.txt b/test_tipc/results/python_singan_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..575aabd0cec4551b638ee62f66b87d0d8a8fef5c
--- /dev/null
+++ b/test_tipc/results/python_singan_results_fp32.txt
@@ -0,0 +1 @@
+Metric fid: 124.0369
diff --git a/test_tipc/results/python_stylegan2_results_fp32.txt b/test_tipc/results/python_stylegan2_results_fp32.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3b384beb7205595a16e380ed2acffb7fd0e591ee
--- /dev/null
+++ b/test_tipc/results/python_stylegan2_results_fp32.txt
@@ -0,0 +1 @@
+Metric fid: 153.9647
diff --git a/test_tipc/test_inference_cpp.sh b/test_tipc/test_inference_cpp.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c5f533c9ade38e3e419fa1e69e8e23b4c1cb1f4a
--- /dev/null
+++ b/test_tipc/test_inference_cpp.sh
@@ -0,0 +1,141 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+FILENAME=$1
+MODE=$2
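+# Usage (see test_tipc/docs/test_inference_cpp.md); MODE only affects the log directory below:
+#   bash test_tipc/test_inference_cpp.sh test_tipc/configs/msvsr/inference_cpp.txt [mode]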
+dataline=$(awk 'NR==1, NR==18{print}' $FILENAME)
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+
+# parser cpp inference params
+model_name=$(func_parser_value "${lines[1]}")
+infer_cmd=$(func_parser_value "${lines[2]}")
+model_path=$(func_parser_value "${lines[3]}")
+param_path=$(func_parser_value "${lines[4]}")
+video_path=$(func_parser_value "${lines[5]}")
+output_dir=$(func_parser_value "${lines[6]}")
+frame_num=$(func_parser_value "${lines[7]}")
+device=$(func_parser_value "${lines[8]}")
+gpu_id=$(func_parser_value "${lines[9]}")
+use_mkldnn=$(func_parser_value "${lines[10]}")
+cpu_threads=$(func_parser_value "${lines[11]}")
+
+# only fp32 and batch_size=1 are supported; TensorRT is not supported yet.
+precision="fp32"
+use_trt=false
+batch_size=1
+
+LOG_PATH="./test_tipc/output/${model_name}/${MODE}"
+mkdir -p ${LOG_PATH}
+status_log="${LOG_PATH}/results_cpp.log"
+
+function func_cpp_inference(){
+ # set log
+ if [ ${device} = "GPU" ]; then
+ _save_log_path="${LOG_PATH}/cpp_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
+ elif [ ${device} = "CPU" ]; then
+ _save_log_path="${LOG_PATH}/cpp_infer_cpu_usemkldnn_${use_mkldnn}_threads_${cpu_threads}_precision_${precision}_batchsize_${batch_size}.log"
+ fi
+
+ # set params
+ set_model_path=$(func_set_params "--model_path" "${model_path}")
+ set_param_path=$(func_set_params "--param_path" "${param_path}")
+ set_video_path=$(func_set_params "--video_path" "${video_path}")
+ set_output_dir=$(func_set_params "--output_dir" "${output_dir}")
+ set_frame_num=$(func_set_params "--frame_num" "${frame_num}")
+ set_device=$(func_set_params "--device" "${device}")
+ set_gpu_id=$(func_set_params "--gpu_id" "${gpu_id}")
+ set_use_mkldnn=$(func_set_params "--use_mkldnn" "${use_mkldnn}")
+ set_cpu_threads=$(func_set_params "--cpu_threads" "${cpu_threads}")
+
+ # run infer
+ cmd="${infer_cmd} ${set_model_path} ${set_param_path} ${set_video_path} ${set_output_dir} ${set_frame_num} ${set_device} ${set_gpu_id} ${set_use_mkldnn} ${set_cpu_threads} > ${_save_log_path} 2>&1"
+ eval $cmd
+ last_status=${PIPESTATUS[0]}
+ status_check $last_status "${cmd}" "${status_log}" "${model_name}"
+}
+
+cd deploy/cpp_infer
+if [ -d "opencv-3.4.7/opencv3/" ] && [ $(md5sum opencv-3.4.7.tar.gz | awk -F ' ' '{print $1}') = "faa2b5950f8bee3f03118e600c74746a" ];then
+ echo "################### build opencv skipped ###################"
+else
+ echo "################### building opencv ###################"
+ rm -rf opencv-3.4.7.tar.gz opencv-3.4.7/
+ wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/opencv-3.4.7.tar.gz
+ tar -xf opencv-3.4.7.tar.gz
+
+ cd opencv-3.4.7/
+ install_path=$(pwd)/opencv3
+
+ rm -rf build
+ mkdir build
+ cd build
+
+ cmake .. \
+ -DCMAKE_INSTALL_PREFIX=${install_path} \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DWITH_IPP=OFF \
+ -DBUILD_IPP_IW=OFF \
+ -DWITH_LAPACK=OFF \
+ -DWITH_EIGEN=OFF \
+ -DCMAKE_INSTALL_LIBDIR=lib64 \
+ -DWITH_ZLIB=ON \
+ -DBUILD_ZLIB=ON \
+ -DWITH_JPEG=ON \
+ -DBUILD_JPEG=ON \
+ -DWITH_PNG=ON \
+ -DBUILD_PNG=ON \
+ -DWITH_TIFF=ON \
+ -DBUILD_TIFF=ON \
+ -DWITH_FFMPEG=ON
+
+ make -j
+ make install
+ cd ../../
+ echo "################### building opencv finished ###################"
+fi
+
+if [ -d "paddle_inference" ]; then
+ echo "################### download inference lib skipped ###################"
+else
+ echo "################### downloading inference lib ###################"
+ wget -nc https://paddle-inference-lib.bj.bcebos.com/2.3.1/cxx_c/Linux/GPU/x86-64_gcc8.2_avx_mkl_cuda10.1_cudnn7.6.5_trt6.0.1.5/paddle_inference.tgz
+ tar -xf paddle_inference.tgz
+ echo "################### downloading inference lib finished ###################"
+fi
+
+echo "################### building PaddleGAN demo ####################"
+OPENCV_DIR=$(pwd)/opencv-3.4.7/opencv3
+LIB_DIR=$(pwd)/paddle_inference
+CUDA_LIB_DIR=$(dirname `find /usr -name libcudart.so`)
+CUDNN_LIB_DIR=$(dirname `find /usr -name libcudnn.so`)
+TENSORRT_DIR=''
+
+export LD_LIBRARY_PATH=$(dirname `find ${PWD} -name libonnxruntime.so.1.11.1`):"$LD_LIBRARY_PATH"
+export LD_LIBRARY_PATH=$(dirname `find ${PWD} -name libpaddle2onnx.so.0.9.9`):"$LD_LIBRARY_PATH"
+
+BUILD_DIR=build
+rm -rf ${BUILD_DIR}
+mkdir ${BUILD_DIR}
+cd ${BUILD_DIR}
+cmake .. \
+ -DPADDLE_LIB=${LIB_DIR} \
+ -DWITH_MKL=ON \
+ -DWITH_GPU=ON \
+ -DWITH_STATIC_LIB=OFF \
+ -DWITH_TENSORRT=OFF \
+ -DOPENCV_DIR=${OPENCV_DIR} \
+ -DCUDNN_LIB=${CUDNN_LIB_DIR} \
+ -DCUDA_LIB=${CUDA_LIB_DIR} \
+ -DTENSORRT_DIR=${TENSORRT_DIR}
+
+make -j
+cd ../
+echo "################### building PaddleGAN demo finished ###################"
+
+echo "################### running test ###################"
+cd ../../
+func_cpp_inference
diff --git a/test_tipc/test_train_inference_python.sh b/test_tipc/test_train_inference_python.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c11e9b9a22d43628fad832e68f18beff9803516e
--- /dev/null
+++ b/test_tipc/test_train_inference_python.sh
@@ -0,0 +1,306 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+FILENAME=$1
+# MODE must be one of ['lite_train_lite_infer', 'lite_train_whole_infer', 'whole_train_whole_infer', 'whole_infer']
+MODE=$2
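+# Usage (see test_tipc/docs/test_train_inference_python.md):
+#   bash test_tipc/test_train_inference_python.sh ./test_tipc/configs/basicvsr/train_infer_python.txt 'lite_train_lite_infer'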
+
+dataline=$(awk 'NR==1, NR==51{print}' $FILENAME)
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+# The training params
+model_name=$(func_parser_value "${lines[1]}")
+python=$(func_parser_value "${lines[2]}")
+gpu_list=$(func_parser_value "${lines[3]}")
+
+autocast_list=$(func_parser_value "${lines[5]}")
+epoch_key=$(func_parser_key "${lines[6]}")
+epoch_num=$(func_parser_params "${lines[6]}")
+save_model_key=$(func_parser_key "${lines[7]}")
+train_batch_key=$(func_parser_key "${lines[8]}")
+train_batch_value=$(func_parser_params "${lines[8]}")
+pretrain_model_key=$(func_parser_key "${lines[9]}")
+pretrain_model_value=$(func_parser_value "${lines[9]}")
+train_model_name=$(func_parser_value "${lines[10]}")
+train_infer_img_dir=$(func_parser_value "${lines[11]}")
+train_param_key1=$(func_parser_key "${lines[12]}")
+train_param_value1=$(func_parser_value "${lines[12]}")
+
+trainer_list=$(func_parser_value "${lines[14]}")
+
+trainer_norm=$(func_parser_key "${lines[15]}")
+norm_trainer=$(func_parser_value "${lines[15]}")
+
+to_static_key=$(func_parser_key "${lines[19]}")
+to_static_trainer=$(func_parser_value "${lines[19]}")
+trainer_key2=$(func_parser_key "${lines[20]}")
+trainer_value2=$(func_parser_value "${lines[20]}")
+
+eval_py=$(func_parser_value "${lines[23]}")
+eval_key1=$(func_parser_key "${lines[24]}")
+eval_value1=$(func_parser_value "${lines[24]}")
+
+save_infer_key=$(func_parser_key "${lines[27]}")
+export_weight=$(func_parser_value "${lines[28]}")
+norm_export=$(func_parser_value "${lines[29]}")
+
+inference_dir=$(func_parser_value "${lines[35]}")
+
+# parser inference model
+infer_model_dir_list=$(func_parser_value "${lines[36]}")
+infer_export_list=$(func_parser_value "${lines[37]}")
+infer_is_quant=$(func_parser_value "${lines[38]}")
+# parser inference
+inference_py=$(func_parser_value "${lines[39]}")
+use_gpu_key=$(func_parser_key "${lines[40]}")
+use_gpu_list=$(func_parser_value "${lines[40]}")
+use_mkldnn_key=$(func_parser_key "${lines[41]}")
+use_mkldnn_list=$(func_parser_value "${lines[41]}")
+cpu_threads_key=$(func_parser_key "${lines[42]}")
+cpu_threads_list=$(func_parser_value "${lines[42]}")
+batch_size_key=$(func_parser_key "${lines[43]}")
+batch_size_list=$(func_parser_value "${lines[43]}")
+use_trt_key=$(func_parser_key "${lines[44]}")
+use_trt_list=$(func_parser_value "${lines[44]}")
+precision_key=$(func_parser_key "${lines[45]}")
+precision_list=$(func_parser_value "${lines[45]}")
+infer_model_key=$(func_parser_key "${lines[46]}")
+image_dir_key=$(func_parser_key "${lines[47]}")
+infer_img_dir=$(func_parser_value "${lines[47]}")
+save_log_key=$(func_parser_key "${lines[48]}")
+infer_key1=$(func_parser_key "${lines[50]}")
+infer_value1=$(func_parser_value "${lines[50]}")
+
+LOG_PATH="./test_tipc/output/${model_name}/${MODE}"
+mkdir -p ${LOG_PATH}
+status_log="${LOG_PATH}/results_python.log"
+
+function func_inference(){
+ IFS='|'
+ _python=$1
+ _script=$2
+ _model_dir=$3
+ _log_path=$4
+ _img_dir=$5
+ _flag_quant=$6
+ # inference
+ for use_gpu in ${use_gpu_list[*]}; do
+ if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
+ for use_mkldnn in ${use_mkldnn_list[*]}; do
+ if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
+ continue
+ fi
+ for threads in ${cpu_threads_list[*]}; do
+ for batch_size in ${batch_size_list[*]}; do
+ for precision in ${precision_list[*]}; do
+ set_precision=$(func_set_params "${precision_key}" "${precision}")
+
+ _save_log_path="${_log_path}/python_infer_cpu_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log"
+ set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
+ set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
+ set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
+ set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}")
+ set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
+ set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}")
+ command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_model_dir} > ${_save_log_path} 2>&1 "
+ eval $command
+ last_status=${PIPESTATUS[0]}
+ eval "cat ${_save_log_path}"
+ status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}"
+ done
+ done
+ done
+ done
+ elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then
+ for use_trt in ${use_trt_list[*]}; do
+ for precision in ${precision_list[*]}; do
+ if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then
+ continue
+ fi
+ if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then
+ continue
+ fi
+ if [[ ${use_trt} = "False" || ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
+ continue
+ fi
+ for batch_size in ${batch_size_list[*]}; do
+ _save_log_path="${_log_path}/python_infer_gpu_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
+ set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
+ set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
+ set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
+ set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}")
+ set_precision=$(func_set_params "${precision_key}" "${precision}")
+ set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
+ set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}")
+ command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} > ${_save_log_path} 2>&1 "
+ eval $command
+ last_status=${PIPESTATUS[0]}
+ eval "cat ${_save_log_path}"
+ status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}"
+
+ done
+ done
+ done
+ else
+            echo "Currently, devices other than CPU and GPU are not supported!"
+ fi
+ done
+}
+
+if [ ${MODE} = "whole_infer" ]; then
+ GPUID=$3
+ if [ ${#GPUID} -le 0 ];then
+ env=" "
+ else
+ env="export CUDA_VISIBLE_DEVICES=${GPUID}"
+ fi
+ # set CUDA_VISIBLE_DEVICES
+ eval $env
+ export Count=0
+ IFS="|"
+ infer_run_exports=(${infer_export_list})
+ infer_quant_flag=(${infer_is_quant})
+ for infer_model in ${infer_model_dir_list[*]}; do
+ # run export
+ if [ ${infer_run_exports[Count]} != "null" ];then
+ save_infer_dir=$(dirname $infer_model)
+ set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
+ set_save_infer_key="${save_infer_key} ${save_infer_dir}"
+ export_log_path="${LOG_PATH}_export_${Count}.log"
+ export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1"
+ echo ${infer_run_exports[Count]}
+ echo $export_cmd
+ eval $export_cmd
+ status_export=$?
+ status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}"
+ else
+ save_infer_dir=${infer_model}
+ fi
+ #run inference
+ func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}"
+ Count=$(($Count + 1))
+ done
+else
+ IFS="|"
+ export Count=0
+ USE_GPU_KEY=(${train_use_gpu_value})
+ for gpu in ${gpu_list[*]}; do
+ train_use_gpu=${USE_GPU_KEY[Count]}
+ Count=$(($Count + 1))
+ ips=""
+ if [ ${gpu} = "-1" ];then
+ env=""
+ elif [ ${#gpu} -le 1 ];then
+ env="export CUDA_VISIBLE_DEVICES=${gpu}"
+ eval ${env}
+ elif [ ${#gpu} -le 15 ];then
+ IFS=","
+ array=(${gpu})
+ env="export CUDA_VISIBLE_DEVICES=${gpu}"
+ IFS="|"
+ else
+ IFS=";"
+ array=(${gpu})
+ ips=${array[0]}
+ gpu=${array[1]}
+ IFS="|"
+ env=" "
+ fi
+ for autocast in ${autocast_list[*]}; do
+ if [ ${autocast} = "fp16" ]; then
+ set_amp_config="--amp"
+ set_amp_level="--amp_level=O2"
+ else
+ set_amp_config=" "
+ set_amp_level=" "
+ fi
+ for trainer in ${trainer_list[*]}; do
+ flag_quant=False
+            # In case of @to_static, we reuse norm_trainer,
+            # but append the to_static option (e.g. "model.to_static=True")
+            # to trigger the "apply_to_static" logic in 'engine.py'
+ if [ ${trainer} = "${to_static_key}" ]; then
+ run_train="${norm_trainer} ${to_static_trainer}"
+ run_export=${norm_export}
+ else
+ run_train=${norm_trainer}
+ run_export=${norm_export}
+ fi
+
+ if [ ${run_train} = "null" ]; then
+ continue
+ fi
+ set_autocast=$(func_set_params "${autocast_key}" "${autocast}")
+ set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
+ set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
+ set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
+ set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
+ set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}")
+ if [ ${#ips} -le 26 ];then
+ save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
+ nodes=1
+ else
+ IFS=","
+ ips_array=(${ips})
+ IFS="|"
+ nodes=${#ips_array[@]}
+ save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}"
+ fi
+ set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
+ if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
+ cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_train_params1} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_amp_level}"
+ elif [ ${#ips} -le 26 ];then # train with multi-gpu
+ cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_train_params1} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_amp_level}"
+ else # train with multi-machine
+ cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_train_params1} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_amp_config} ${set_amp_level}"
+ fi
+ # run train
+ export FLAGS_cudnn_deterministic=True
+ eval $cmd
+ echo $cmd
+ log_name=${train_model_name/checkpoint.pdparams/.txt}
+ train_log_path=$( echo "${save_log}/${log_name}")
+ eval "cat ${train_log_path} >> ${save_log}.log"
+ status_check $? "${cmd}" "${status_log}" "${model_name}" "${save_log}.log"
+
+ set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}")
+ # save norm trained models to set pretrain for pact training and fpgm training
+
+ # run eval
+ if [ ${eval_py} != "null" ]; then
+ set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}")
+ eval_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log"
+ eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1"
+ eval $eval_cmd
+ status_check $? "${eval_cmd}" "${status_log}" "${model_name}" "${eval_log_path}"
+ fi
+ # run export model
+ if [ ${run_export} != "null" ]; then
+ # run export model
+ save_infer_path="${save_log}"
+ set_export_weight="${save_log}/${train_model_name}"
+ set_export_weight_path=$( echo ${set_export_weight})
+ set_save_infer_key="${save_infer_key} ${save_infer_path}"
+ export_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log"
+ export_cmd="${python} ${run_export} ${set_export_weight_path} ${set_save_infer_key} > ${export_log_path} 2>&1"
+ eval "$export_cmd"
+ status_check $? "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}"
+
+ #run inference
+ eval $env
+ save_infer_path="${save_log}"
+ if [ ${inference_dir} != "null" ] && [ ${inference_dir} != '##' ]; then
+ infer_model_dir="${save_infer_path}/${inference_dir}"
+ else
+ infer_model_dir=${save_infer_path}
+ fi
+ func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}"
+
+ eval "unset CUDA_VISIBLE_DEVICES"
+ fi
+ done # done with: for trainer in ${trainer_list[*]}; do
+ done # done with: for autocast in ${autocast_list[*]}; do
+ done # done with: for gpu in ${gpu_list[*]}; do
+fi  # end if [ ${MODE} = "whole_infer" ]
diff --git a/test_tipc/test_train_inference_python_npu.sh b/test_tipc/test_train_inference_python_npu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a1e597236c490770ea6282c319c526d2543d1542
--- /dev/null
+++ b/test_tipc/test_train_inference_python_npu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+function readlinkf() {
+ perl -MCwd -e 'print Cwd::abs_path shift' "$1";
+}
+
+function func_parser_config() {
+ strs=$1
+ IFS=" "
+ array=(${strs})
+ tmp=${array[2]}
+ echo ${tmp}
+}
+
+BASEDIR=$(dirname "$0")
+REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../)
+
+FILENAME=$1
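+# $1: tipc config txt; $2: run mode. Both are forwarded to test_tipc/test_train_inference_python.sh below.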
+
+# change gpu to npu in tipc txt configs
+sed -i "s/state=GPU/state=NPU/g" $FILENAME
+sed -i "s/--device:gpu/--device:npu/g" $FILENAME
+sed -i "s/--benchmark:True/--benchmark:False/g" $FILENAME
+dataline=`cat $FILENAME`
+
+# change gpu to npu in execution script
+sed -i 's/\"gpu\"/\"npu\"/g' test_tipc/test_train_inference_python.sh
+sed -i 's/--gpus/--npus/g' test_tipc/test_train_inference_python.sh
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+
+# pass parameters to test_train_inference_python.sh
+cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $2"
+echo $cmd
+eval $cmd
diff --git a/test_tipc/test_train_inference_python_xpu.sh b/test_tipc/test_train_inference_python_xpu.sh
new file mode 100644
index 0000000000000000000000000000000000000000..78dd55683efb4c3916e104f6a81cfc365cae9373
--- /dev/null
+++ b/test_tipc/test_train_inference_python_xpu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+source test_tipc/common_func.sh
+
+function readlinkf() {
+ perl -MCwd -e 'print Cwd::abs_path shift' "$1";
+}
+
+function func_parser_config() {
+ strs=$1
+ IFS=" "
+ array=(${strs})
+ tmp=${array[2]}
+ echo ${tmp}
+}
+
+BASEDIR=$(dirname "$0")
+REPO_ROOT_PATH=$(readlinkf ${BASEDIR}/../)
+
+FILENAME=$1
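+# $1: tipc config txt; $2: run mode. Both are forwarded to test_tipc/test_train_inference_python.sh below.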
+
+# change gpu to xpu in tipc txt configs
+sed -i "s/state=GPU/state=XPU/g" $FILENAME
+sed -i "s/--device:gpu/--device:xpu/g" $FILENAME
+sed -i "s/--benchmark:True/--benchmark:False/g" $FILENAME
+dataline=`cat $FILENAME`
+
+# change gpu to xpu in execution script
+sed -i 's/\"gpu\"/\"xpu\"/g' test_tipc/test_train_inference_python.sh
+sed -i 's/--gpus/--xpus/g' test_tipc/test_train_inference_python.sh
+
+# parser params
+IFS=$'\n'
+lines=(${dataline})
+
+# pass parameters to test_train_inference_python.sh
+cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} $2"
+echo $cmd
+eval $cmd
diff --git a/tools/export_model.py b/tools/export_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..42c5425505003d195de8e0da35a8f9dcb1b7db45
--- /dev/null
+++ b/tools/export_model.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import argparse
+
+sys.path.append(".")
+
+import ppgan
+from ppgan.utils.config import get_config
+from ppgan.utils.setup import setup
+from ppgan.engine.trainer import Trainer
+
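+# Example usage (a sketch mirroring the norm_export entries in test_tipc/configs/*/train_infer_python.txt;
+# the checkpoint path below is illustrative):
+#   python tools/export_model.py -c configs/msvsr_reds.yaml --inputs_size="1,2,3,180,320" \
+#       --model_name inference --load output_dir/msvsr_reds-xxxx/iter_5_checkpoint.pdparams --output_dir inference/msvsr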
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-c',
+ '--config-file',
+ metavar="FILE",
+ required=True,
+ help="config file path")
+ parser.add_argument("--load",
+ type=str,
+ default=None,
+ required=True,
+                        help="path of the checkpoint (.pdparams) to load for export")
+ # config options
+ parser.add_argument("-o",
+ "--opt",
+ nargs="+",
+ help="set configuration options")
+ parser.add_argument("-s",
+ "--inputs_size",
+ type=str,
+ default=None,
+ required=True,
+                        help="input tensor size(s), e.g. 1,3,256,256; separate multiple inputs with ';'")
+ parser.add_argument(
+ "--output_dir",
+ default=None,
+ type=str,
+ help="The path prefix of inference model to be used.",
+ )
+ parser.add_argument(
+ "--export_serving_model",
+ default=False,
+ type=bool,
+ help="export serving model.",
+ )
+ parser.add_argument(
+ "--model_name",
+ default=None,
+ type=str,
+ help="model_name.",
+ )
+ args = parser.parse_args()
+ return args
+
+
+def main(args, cfg):
+ inputs_size = [[int(size) for size in input_size.split(',')]
+ for input_size in args.inputs_size.split(';')]
+ model = ppgan.models.builder.build_model(cfg.model)
+ model.setup_train_mode(is_train=False)
+ state_dicts = ppgan.utils.filesystem.load(args.load)
+ for net_name, net in model.nets.items():
+ if net_name in state_dicts:
+ net.set_state_dict(state_dicts[net_name])
+ model.export_model(cfg.export_model, args.output_dir, inputs_size,
+ args.export_serving_model, args.model_name)
+
+
+if __name__ == "__main__":
+ args = parse_args()
+ cfg = get_config(args.config_file, args.opt)
+ main(args, cfg)
diff --git a/tools/extract_weight.py b/tools/extract_weight.py
new file mode 100644
index 0000000000000000000000000000000000000000..56dffa9f1570fe8bed6bf9e510d9fec66a5de60c
--- /dev/null
+++ b/tools/extract_weight.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import argparse
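+
+# Example usage (a sketch; the net name is illustrative and must match a key in the checkpoint dict,
+# e.g. 'generator'):
+#   python tools/extract_weight.py output_dir/model/iter_100_checkpoint.pdparams \
+#       --net-name generator --output generator.pdparams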
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='This script extracts weights from a checkpoint')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument('--net-name',
+ type=str,
+ help='net name in checkpoint dict')
+ parser.add_argument('--output', type=str, help='destination file name')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+ assert args.output.endswith(".pdparams")
+ ckpt = paddle.load(args.checkpoint)
+ state_dict = ckpt[args.net_name]
+ paddle.save(state_dict, args.output)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/fom_infer.py b/tools/fom_infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3f0969a30e0b42fd49c561da23ff4def9c3d04e
--- /dev/null
+++ b/tools/fom_infer.py
@@ -0,0 +1,245 @@
+import paddle.inference as paddle_infer
+import argparse
+import numpy as np
+import cv2
+import imageio
+import time
+from tqdm import tqdm
+import os
+from functools import reduce
+import paddle
+from ppgan.utils.filesystem import makedirs
+from pathlib import Path
+
+
+def read_img(path):
+ img = imageio.imread(path)
+ if img.ndim == 2:
+ img = np.expand_dims(img, axis=2)
+    # some images have 4 channels
+ if img.shape[2] > 3:
+ img = img[:, :, :3]
+ return img
+
+
+def read_video(path):
+ reader = imageio.get_reader(path)
+ fps = reader.get_meta_data()['fps']
+ driving_video = []
+ try:
+ for im in reader:
+ driving_video.append(im)
+ except RuntimeError:
+        print("Failed to read the driving video!")
+ pass
+ reader.close()
+ return driving_video, fps
+
+
+def face_detection(img_ori, weight_path):
+ config = paddle_infer.Config(os.path.join(weight_path, '__model__'),
+ os.path.join(weight_path, '__params__'))
+ config.disable_gpu()
+ # disable print log when predict
+ config.disable_glog_info()
+ # enable shared memory
+ config.enable_memory_optim()
+ # disable feed, fetch OP, needed by zero_copy_run
+ config.switch_use_feed_fetch_ops(False)
+ predictor = paddle_infer.create_predictor(config)
+
+ img = img_ori.astype(np.float32)
+ mean = np.array([123, 117, 104])[np.newaxis, np.newaxis, :]
+ std = np.array([127.502231, 127.502231, 127.502231])[np.newaxis,
+ np.newaxis, :]
+ img -= mean
+ img /= std
+ img = img[:, :, [2, 1, 0]]
+ img = img[np.newaxis].transpose([0, 3, 1, 2])
+
+ input_names = predictor.get_input_names()
+ input_tensor = predictor.get_input_handle(input_names[0])
+ input_tensor.copy_from_cpu(img)
+ predictor.run()
+ output_names = predictor.get_output_names()
+ boxes_tensor = predictor.get_output_handle(output_names[0])
+ np_boxes = boxes_tensor.copy_to_cpu()
+ if reduce(lambda x, y: x * y, np_boxes.shape) < 6:
+        print('[WARNING] No object detected.')
+ exit()
+    # img is in NCHW layout here, so shape[2:] is (height, width)
+    h, w = img.shape[2:]
+    np_boxes[:, 2] *= w
+    np_boxes[:, 3] *= h
+    np_boxes[:, 4] *= w
+    np_boxes[:, 5] *= h
+ expect_boxes = (np_boxes[:, 1] > 0.5) & (np_boxes[:, 0] > -1)
+ rect = np_boxes[expect_boxes, :][0][2:]
+ bh = rect[3] - rect[1]
+ bw = rect[2] - rect[0]
+ cy = rect[1] + int(bh / 2)
+ cx = rect[0] + int(bw / 2)
+ margin = max(bh, bw)
+ y1 = max(0, cy - margin)
+ x1 = max(0, cx - int(0.8 * margin))
+ y2 = min(h, cy + margin)
+ x2 = min(w, cx + int(0.8 * margin))
+ return int(y1), int(y2), int(x1), int(x2)
+
+
+def main():
+ args = parse_args()
+
+ source_path = args.source_path
+ driving_path = Path(args.driving_path)
+ makedirs(args.output_path)
+ if driving_path.is_dir():
+ driving_paths = list(driving_path.iterdir())
+ else:
+ driving_paths = [driving_path]
+
+    # create the inference configs
+ kp_detector_config = paddle_infer.Config(
+ os.path.join(args.model_path, "kp_detector.pdmodel"),
+ os.path.join(args.model_path, "kp_detector.pdiparams"))
+ generator_config = paddle_infer.Config(
+ os.path.join(args.model_path, "generator.pdmodel"),
+ os.path.join(args.model_path, "generator.pdiparams"))
+ if args.device == "gpu":
+ kp_detector_config.enable_use_gpu(100, 0)
+ generator_config.enable_use_gpu(100, 0)
+ elif args.device == "xpu":
+ kp_detector_config.enable_xpu()
+ generator_config.enable_xpu()
+ elif args.device == "npu":
+ kp_detector_config.enable_npu()
+ generator_config.enable_npu()
+ else:
+ kp_detector_config.set_mkldnn_cache_capacity(10)
+ kp_detector_config.enable_mkldnn()
+ generator_config.set_mkldnn_cache_capacity(10)
+ generator_config.enable_mkldnn()
+ kp_detector_config.disable_gpu()
+ kp_detector_config.set_cpu_math_library_num_threads(6)
+ generator_config.disable_gpu()
+ generator_config.set_cpu_math_library_num_threads(6)
+    # create predictors from the configs
+ kp_detector_predictor = paddle_infer.create_predictor(kp_detector_config)
+ generator_predictor = paddle_infer.create_predictor(generator_config)
+ test_loss = []
+ for k in range(len(driving_paths)):
+ driving_path = driving_paths[k]
+ driving_video, fps = read_video(driving_path)
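+        # resize every driving frame to 256x256, scale to [0, 1] and lay the
+        # clip out as NCHW float32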
+ driving_video = [
+ cv2.resize(frame, (256, 256)) / 255.0 for frame in driving_video
+ ]
+ driving_len = len(driving_video)
+ driving_video = np.array(driving_video).astype(np.float32).transpose(
+ [0, 3, 1, 2])
+
+        if source_path is None:
+ source = driving_video[0:1]
+ else:
+ source_img = read_img(source_path)
+            # TODO: add blazeface static model for face cropping
+            # left, right, up, bottom = face_detection(source_img, "/workspace/PaddleDetection/static/inference_model/blazeface/")
+            source = source_img  # [left:right, up:bottom]
+ source = cv2.resize(source, (256, 256)) / 255.0
+ source = source[np.newaxis].astype(np.float32).transpose(
+ [0, 3, 1, 2])
+
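+        # run the keypoint detector once on the source image and once on the
+        # first driving frame; both keypoint sets are reused for every frame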
+        # get the input handle of the keypoint detector
+ kp_detector_input_names = kp_detector_predictor.get_input_names()
+ kp_detector_input_handle = kp_detector_predictor.get_input_handle(
+ kp_detector_input_names[0])
+
+ kp_detector_input_handle.reshape([args.batch_size, 3, 256, 256])
+ kp_detector_input_handle.copy_from_cpu(source)
+ kp_detector_predictor.run()
+ kp_detector_output_names = kp_detector_predictor.get_output_names()
+ kp_detector_output_handle = kp_detector_predictor.get_output_handle(
+ kp_detector_output_names[0])
+ source_j = kp_detector_output_handle.copy_to_cpu()
+ kp_detector_output_handle = kp_detector_predictor.get_output_handle(
+ kp_detector_output_names[1])
+ source_v = kp_detector_output_handle.copy_to_cpu()
+
+ kp_detector_input_handle.reshape([args.batch_size, 3, 256, 256])
+ kp_detector_input_handle.copy_from_cpu(driving_video[0:1])
+ kp_detector_predictor.run()
+ kp_detector_output_names = kp_detector_predictor.get_output_names()
+ kp_detector_output_handle = kp_detector_predictor.get_output_handle(
+ kp_detector_output_names[0])
+ driving_init_j = kp_detector_output_handle.copy_to_cpu()
+ kp_detector_output_handle = kp_detector_predictor.get_output_handle(
+ kp_detector_output_names[1])
+ driving_init_v = kp_detector_output_handle.copy_to_cpu()
+ start_time = time.time()
+ results = []
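+        # per-frame loop: detect keypoints on the current driving frame and
+        # feed them, together with the source image/keypoints and the initial
+        # driving keypoints, to the generator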
+ for i in tqdm(range(0, driving_len)):
+ kp_detector_input_handle.copy_from_cpu(driving_video[i:i + 1])
+ kp_detector_predictor.run()
+ kp_detector_output_names = kp_detector_predictor.get_output_names()
+ kp_detector_output_handle = kp_detector_predictor.get_output_handle(
+ kp_detector_output_names[0])
+ driving_j = kp_detector_output_handle.copy_to_cpu()
+ kp_detector_output_handle = kp_detector_predictor.get_output_handle(
+ kp_detector_output_names[1])
+ driving_v = kp_detector_output_handle.copy_to_cpu()
+ generator_inputs = [
+ source, source_j, source_v, driving_j, driving_v,
+ driving_init_j, driving_init_v
+ ]
+ generator_input_names = generator_predictor.get_input_names()
+            for j in range(len(generator_input_names)):
+                generator_input_handle = generator_predictor.get_input_handle(
+                    generator_input_names[j])
+                generator_input_handle.copy_from_cpu(generator_inputs[j])
+ generator_predictor.run()
+ generator_output_names = generator_predictor.get_output_names()
+ generator_output_handle = generator_predictor.get_output_handle(
+ generator_output_names[0])
+ output_data = generator_output_handle.copy_to_cpu()
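+            # L1 error against the current driving frame; this is mainly
+            # meaningful in the default reconstruction setting where the source
+            # is the first driving frame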
+ loss = paddle.abs(
+ paddle.to_tensor(output_data) -
+ paddle.to_tensor(driving_video[i])).mean().cpu().numpy()
+ test_loss.append(loss)
+ output_data = np.transpose(output_data, [0, 2, 3, 1])[0] * 255.0
+
+ #Todo:add blazeface static model
+ #frame = source_img.copy()
+ #frame[left:right, up:bottom] = cv2.resize(output_data.astype(np.uint8), (bottom - up, right - left), cv2.INTER_AREA)
+ results.append(output_data.astype(np.uint8))
+ print(time.time() - start_time)
+        imageio.mimsave(os.path.join(args.output_path,
+                                     "result_" + str(k) + ".mp4"),
+                        results,
+                        fps=fps)
+    metric_file = os.path.join(args.output_path, "metric.txt")
+    loss_string = "Metric {}: {:.4f}".format("l1 loss", np.mean(test_loss))
+    with open(metric_file, 'a') as log_file:
+        print(loss_string, file=log_file)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", type=str, help="model filename prefix")
+ parser.add_argument("--batch_size", type=int, default=1, help="batch size")
+ parser.add_argument("--source_path",
+ type=str,
+ default=None,
+                        help="path to the source image; defaults to the first driving frame")
+ parser.add_argument("--driving_path",
+ type=str,
+ default=None,
+                        help="path to a driving video or a directory of driving videos")
+ parser.add_argument("--output_path",
+ type=str,
+ default="infer_output/fom/",
+ help="output_path")
+ parser.add_argument("--device", type=str, default="gpu", help="device")
+
+ return parser.parse_args()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/inference.py b/tools/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..11ea4f4026e83e49672fc5732ca1eb835d12294b
--- /dev/null
+++ b/tools/inference.py
@@ -0,0 +1,471 @@
+import paddle
+import argparse
+import numpy as np
+import random
+import os
+from collections import OrderedDict
+import sys
+import cv2
+
+sys.path.append(".")
+
+from ppgan.utils.config import get_config
+from ppgan.datasets.builder import build_dataloader
+from ppgan.engine.trainer import IterLoader
+from ppgan.utils.visual import save_image
+from ppgan.utils.visual import tensor2img
+from ppgan.utils.filesystem import makedirs
+from ppgan.metrics import build_metric
+
+
+MODEL_CLASSES = ["pix2pix", "cyclegan", "wav2lip", "esrgan", \
+ "edvr", "fom", "stylegan2", "basicvsr", "msvsr", \
+ "singan", "swinir", "invdn", "aotgan", "nafnet"]
+
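+# Example invocation (model and config paths are illustrative):
+#   python tools/inference.py --model_type pix2pix \
+#       --model_path inference_model/pix2pixmodel_netG \
+#       -c configs/pix2pix_cityscapes.yaml --output_path infer_output
+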
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--model_path",
+ default=None,
+ type=str,
+ required=True,
+ help="The path prefix of inference model to be used.",
+ )
+ parser.add_argument("--model_type",
+ default=None,
+ type=str,
+ required=True,
+ help="Model type selected in the list: " +
+ ", ".join(MODEL_CLASSES))
+ parser.add_argument(
+ "--device",
+ default="gpu",
+ type=str,
+ choices=["cpu", "gpu", "xpu", "npu"],
+        help="The device used for inference; must be one of cpu/gpu/xpu/npu.")
+ parser.add_argument('-c',
+ '--config-file',
+ metavar="FILE",
+ help='config file path')
+ parser.add_argument("--output_path",
+ type=str,
+ default="infer_output",
+ help="output_path")
+ # config options
+ parser.add_argument("-o",
+ "--opt",
+ nargs='+',
+ help="set configuration options")
+ # fix random numbers by setting seed
+ parser.add_argument('--seed',
+ type=int,
+ default=None,
+                        help='fix random numbers by setting a seed')
+ # for tensorRT
+ parser.add_argument("--run_mode",
+ default="fluid",
+ type=str,
+ choices=["fluid", "trt_fp32", "trt_fp16"],
+                        help="run mode (fluid/trt_fp32/trt_fp16)")
+ parser.add_argument("--trt_min_shape",
+ default=1,
+ type=int,
+ help="trt_min_shape for tensorRT")
+ parser.add_argument("--trt_max_shape",
+ default=1280,
+ type=int,
+ help="trt_max_shape for tensorRT")
+ parser.add_argument("--trt_opt_shape",
+ default=640,
+ type=int,
+ help="trt_opt_shape for tensorRT")
+ parser.add_argument("--min_subgraph_size",
+ default=3,
+ type=int,
+                        help="min_subgraph_size for tensorRT")
+ parser.add_argument("--batch_size",
+ default=1,
+ type=int,
+ help="batch_size for tensorRT")
+ parser.add_argument("--use_dynamic_shape",
+ dest="use_dynamic_shape",
+ action="store_true",
+ help="use_dynamic_shape for tensorRT")
+ parser.add_argument("--trt_calib_mode",
+ dest="trt_calib_mode",
+ action="store_true",
+ help="trt_calib_mode for tensorRT")
+ args = parser.parse_args()
+ return args
+
+
+def create_predictor(model_path,
+ device="gpu",
+ run_mode='fluid',
+ batch_size=1,
+ min_subgraph_size=3,
+ use_dynamic_shape=False,
+ trt_min_shape=1,
+ trt_max_shape=1280,
+ trt_opt_shape=640,
+ trt_calib_mode=False):
+ config = paddle.inference.Config(model_path + ".pdmodel",
+ model_path + ".pdiparams")
+ if device == "gpu":
+ config.enable_use_gpu(100, 0)
+ elif device == "cpu":
+ config.disable_gpu()
+ elif device == "npu":
+ config.enable_npu()
+ elif device == "xpu":
+ config.enable_xpu()
+ else:
+ config.disable_gpu()
+
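+    # optionally offload supported subgraphs to TensorRT; numeric precision
+    # follows run_mode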
+ precision_map = {
+ 'trt_int8': paddle.inference.Config.Precision.Int8,
+ 'trt_fp32': paddle.inference.Config.Precision.Float32,
+ 'trt_fp16': paddle.inference.Config.Precision.Half
+ }
+ if run_mode in precision_map.keys():
+ config.enable_tensorrt_engine(workspace_size=1 << 25,
+ max_batch_size=batch_size,
+ min_subgraph_size=min_subgraph_size,
+ precision_mode=precision_map[run_mode],
+ use_static=False,
+ use_calib_mode=trt_calib_mode)
+
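+        # dynamic-shape ranges are registered under the input name 'image';
+        # adjust the key if the exported model uses a different input name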
+ if use_dynamic_shape:
+ min_input_shape = {
+ 'image': [batch_size, 3, trt_min_shape, trt_min_shape]
+ }
+ max_input_shape = {
+ 'image': [batch_size, 3, trt_max_shape, trt_max_shape]
+ }
+ opt_input_shape = {
+ 'image': [batch_size, 3, trt_opt_shape, trt_opt_shape]
+ }
+ config.set_trt_dynamic_shape_info(min_input_shape, max_input_shape,
+ opt_input_shape)
+ print('trt set dynamic shape done!')
+
+ predictor = paddle.inference.create_predictor(config)
+ return predictor
+
+
+def setup_metrics(cfg):
+ metrics = OrderedDict()
+ if isinstance(list(cfg.values())[0], dict):
+ for metric_name, cfg_ in cfg.items():
+ metrics[metric_name] = build_metric(cfg_)
+ else:
+ metric = build_metric(cfg)
+ metrics[metric.__class__.__name__] = metric
+
+ return metrics
+
+
+def main():
+ args = parse_args()
+    if args.seed is not None:
+ paddle.seed(args.seed)
+ random.seed(args.seed)
+ np.random.seed(args.seed)
+ cfg = get_config(args.config_file, args.opt)
+ predictor = create_predictor(args.model_path, args.device, args.run_mode,
+ args.batch_size, args.min_subgraph_size,
+ args.use_dynamic_shape, args.trt_min_shape,
+ args.trt_max_shape, args.trt_opt_shape,
+ args.trt_calib_mode)
+ input_handles = [
+ predictor.get_input_handle(name)
+ for name in predictor.get_input_names()
+ ]
+
+ output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
+ test_dataloader = build_dataloader(cfg.dataset.test,
+ is_train=False,
+ distributed=False)
+
+ max_eval_steps = len(test_dataloader)
+ iter_loader = IterLoader(test_dataloader)
+ min_max = cfg.get('min_max', None)
+ if min_max is None:
+ min_max = (-1., 1.)
+
+ model_type = args.model_type
+ makedirs(os.path.join(args.output_path, model_type))
+
+ validate_cfg = cfg.get('validate', None)
+ metrics = None
+ if validate_cfg and 'metrics' in validate_cfg:
+ metrics = setup_metrics(validate_cfg['metrics'])
+ for metric in metrics.values():
+ metric.reset()
+
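+    # each branch below mirrors one model's preprocessing: copy the prepared
+    # inputs into the predictor, run it, turn the output tensor back into an
+    # image, and update the configured metrics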
+ for i in range(max_eval_steps):
+ data = next(iter_loader)
+ if model_type == "pix2pix":
+ real_A = data['B'].numpy()
+ input_handles[0].copy_from_cpu(real_A)
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ image_numpy = tensor2img(prediction[0], min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, "pix2pix/{}.png".format(i)))
+ metric_file = os.path.join(args.output_path, "pix2pix/metric.txt")
+ real_B = paddle.to_tensor(data['A'])
+ for metric in metrics.values():
+ metric.update(prediction, real_B)
+
+ elif model_type == "cyclegan":
+ real_A = data['A'].numpy()
+ input_handles[0].copy_from_cpu(real_A)
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ image_numpy = tensor2img(prediction[0], min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, "cyclegan/{}.png".format(i)))
+ metric_file = os.path.join(args.output_path, "cyclegan/metric.txt")
+ real_B = paddle.to_tensor(data['B'])
+ for metric in metrics.values():
+ metric.update(prediction, real_B)
+
+ elif model_type == "wav2lip":
+ indiv_mels, x = data['indiv_mels'].numpy()[0], data['x'].numpy()[0]
+ x = x.transpose([1, 0, 2, 3])
+ input_handles[0].copy_from_cpu(indiv_mels)
+ input_handles[1].copy_from_cpu(x)
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ for j in range(prediction.shape[0]):
+ prediction[j] = prediction[j][::-1, :, :]
+ image_numpy = paddle.to_tensor(prediction[j])
+ image_numpy = tensor2img(image_numpy, (0, 1))
+                save_image(
+                    image_numpy,
+                    os.path.join(args.output_path,
+                                 "wav2lip/{}_{}.png".format(i, j)))
+
+ elif model_type == "esrgan":
+ lq = data['lq'].numpy()
+ input_handles[0].copy_from_cpu(lq)
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction[0])
+ image_numpy = tensor2img(prediction, min_max)
+ gt_numpy = tensor2img(data['gt'][0], min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, "esrgan/{}.png".format(i)))
+ metric_file = os.path.join(args.output_path, model_type,
+ "metric.txt")
+ for metric in metrics.values():
+ metric.update(image_numpy, gt_numpy)
+ break
+ elif model_type == "edvr":
+ lq = data['lq'].numpy()
+ input_handles[0].copy_from_cpu(lq)
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction[0])
+ image_numpy = tensor2img(prediction, min_max)
+ gt_numpy = tensor2img(data['gt'][0, 0], min_max)
+ save_image(image_numpy,
+ os.path.join(args.output_path, "edvr/{}.png".format(i)))
+ metric_file = os.path.join(args.output_path, model_type,
+ "metric.txt")
+ for metric in metrics.values():
+ metric.update(image_numpy, gt_numpy)
+ break
+ elif model_type == "stylegan2":
+ noise = paddle.randn([1, 1, 512]).cpu().numpy()
+ input_handles[0].copy_from_cpu(noise)
+ input_handles[1].copy_from_cpu(np.array([0.7]).astype('float32'))
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ image_numpy = tensor2img(prediction[0], min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, "stylegan2/{}.png".format(i)))
+ metric_file = os.path.join(args.output_path, "stylegan2/metric.txt")
+ real_img = paddle.to_tensor(data['A'])
+ for metric in metrics.values():
+ metric.update(prediction, real_img)
+ elif model_type in ["basicvsr", "msvsr"]:
+ lq = data['lq'].numpy()
+ input_handles[0].copy_from_cpu(lq)
+ predictor.run()
+ if len(predictor.get_output_names()) > 1:
+ output_handle = predictor.get_output_handle(
+ predictor.get_output_names()[-1])
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ _, t, _, _, _ = prediction.shape
+
+ out_img = []
+ gt_img = []
+ for ti in range(t):
+ out_tensor = prediction[0, ti]
+ gt_tensor = data['gt'][0, ti]
+ out_img.append(tensor2img(out_tensor, (0., 1.)))
+ gt_img.append(tensor2img(gt_tensor, (0., 1.)))
+
+ image_numpy = tensor2img(prediction[0], min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, model_type, "{}.png".format(i)))
+
+ metric_file = os.path.join(args.output_path, model_type,
+ "metric.txt")
+ for metric in metrics.values():
+ metric.update(out_img, gt_img, is_seq=True)
+ elif model_type == "singan":
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ image_numpy = tensor2img(prediction, min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, "singan/{}.png".format(i)))
+ metric_file = os.path.join(args.output_path, "singan/metric.txt")
+ for metric in metrics.values():
+ metric.update(prediction, data['A'])
+ elif model_type == 'gfpgan':
+ input_handles[0].copy_from_cpu(data['lq'].numpy())
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ image_numpy = tensor2img(prediction, min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, "gfpgan/{}.png".format(i)))
+ elif model_type == "swinir":
+ lq = data[1].numpy()
+ _, _, h_old, w_old = lq.shape
+ window_size = 8
+ tile = 128
+ tile_overlap = 32
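+            # tiled inference: run the model on overlapping tiles, accumulate
+            # the outputs in E and the per-pixel hit counts in W, then average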
+            # mirror-pad the input so its height and width are multiples of window_size
+ h_pad = (h_old // window_size + 1) * window_size - h_old
+ w_pad = (w_old // window_size + 1) * window_size - w_old
+ lq = np.concatenate([lq, np.flip(lq, 2)],
+ axis=2)[:, :, :h_old + h_pad, :]
+ lq = np.concatenate([lq, np.flip(lq, 3)],
+ axis=3)[:, :, :, :w_old + w_pad]
+ lq = lq.astype("float32")
+
+ b, c, h, w = lq.shape
+ tile = min(tile, h, w)
+ assert tile % window_size == 0, "tile size should be a multiple of window_size"
+ sf = 1 # scale
+ stride = tile - tile_overlap
+ h_idx_list = list(range(0, h - tile, stride)) + [h - tile]
+ w_idx_list = list(range(0, w - tile, stride)) + [w - tile]
+ E = np.zeros([b, c, h * sf, w * sf], dtype=np.float32)
+ W = np.zeros_like(E)
+
+ for h_idx in h_idx_list:
+ for w_idx in w_idx_list:
+ in_patch = lq[..., h_idx:h_idx + tile, w_idx:w_idx + tile]
+ input_handles[0].copy_from_cpu(in_patch)
+ predictor.run()
+ out_patch = output_handle.copy_to_cpu()
+ out_patch_mask = np.ones_like(out_patch)
+
+ E[..., h_idx * sf:(h_idx + tile) * sf,
+ w_idx * sf:(w_idx + tile) * sf] += out_patch
+ W[..., h_idx * sf:(h_idx + tile) * sf,
+ w_idx * sf:(w_idx + tile) * sf] += out_patch_mask
+
+ output = np.true_divide(E, W)
+ prediction = output[..., :h_old * sf, :w_old * sf]
+
+ prediction = paddle.to_tensor(prediction)
+ target = tensor2img(data[0], (0., 1.))
+ prediction = tensor2img(prediction, (0., 1.))
+
+ metric_file = os.path.join(args.output_path, model_type,
+ "metric.txt")
+ for metric in metrics.values():
+ metric.update(prediction, target)
+
+ lq = tensor2img(data[1], (0., 1.))
+
+ sample_result = np.concatenate((lq, prediction, target), 1)
+ sample = cv2.cvtColor(sample_result, cv2.COLOR_RGB2BGR)
+ file_name = os.path.join(args.output_path, model_type,
+ "{}.png".format(i))
+ cv2.imwrite(file_name, sample)
+ elif model_type == "invdn":
+ noisy = data[0].numpy()
+ noise_channel = 3 * 4**(cfg.model.generator.down_num) - 3
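+            # InvDN takes a second input: a random latent whose channel count
+            # matches the invertible downsampling (3 * 4**down_num - 3), as
+            # implied by the generator config above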
+ input_handles[0].copy_from_cpu(noisy)
+ input_handles[1].copy_from_cpu(
+ np.random.randn(noisy.shape[0], noise_channel, noisy.shape[2],
+ noisy.shape[3]).astype(np.float32))
+ predictor.run()
+ output_handles = [
+ predictor.get_output_handle(name)
+ for name in predictor.get_output_names()
+ ]
+ prediction = output_handles[0].copy_to_cpu()
+ prediction = paddle.to_tensor(prediction[0])
+ image_numpy = tensor2img(prediction, min_max)
+ gt_numpy = tensor2img(data[1], min_max)
+ save_image(image_numpy,
+ os.path.join(args.output_path, "invdn/{}.png".format(i)))
+ metric_file = os.path.join(args.output_path, model_type,
+ "metric.txt")
+ for metric in metrics.values():
+ metric.update(image_numpy, gt_numpy)
+ break
+
+ elif model_type == "nafnet":
+ lq = data[1].numpy()
+ input_handles[0].copy_from_cpu(lq)
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ target = tensor2img(data[0], (0., 1.))
+ prediction = tensor2img(prediction, (0., 1.))
+
+ metric_file = os.path.join(args.output_path, model_type,
+ "metric.txt")
+ for metric in metrics.values():
+ metric.update(prediction, target)
+
+ lq = tensor2img(data[1], (0., 1.))
+
+ sample_result = np.concatenate((lq, prediction, target), 1)
+ sample = cv2.cvtColor(sample_result, cv2.COLOR_RGB2BGR)
+ file_name = os.path.join(args.output_path, model_type,
+ "{}.png".format(i))
+ cv2.imwrite(file_name, sample)
+ elif model_type == 'aotgan':
+ input_data = paddle.concat((data['img'], data['mask']),
+ axis=1).numpy()
+ input_handles[0].copy_from_cpu(input_data)
+ predictor.run()
+ prediction = output_handle.copy_to_cpu()
+ prediction = paddle.to_tensor(prediction)
+ image_numpy = tensor2img(prediction, min_max)
+ save_image(
+ image_numpy,
+ os.path.join(args.output_path, "aotgan/{}.png".format(i)))
+
+ if metrics:
+ log_file = open(metric_file, 'a')
+ for metric_name, metric in metrics.items():
+ loss_string = "Metric {}: {:.4f}".format(metric_name,
+ metric.accumulate())
+ print(loss_string, file=log_file)
+ log_file.close()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/main.py b/tools/main.py
index 7d40c11c5b4b93ae9cd4678909efc8b4526dedde..b7e2b82607401d87fa6b2309528e776d2f0f2f92 100644
--- a/tools/main.py
+++ b/tools/main.py
@@ -51,6 +51,6 @@ def main(args, cfg):
if __name__ == '__main__':
args = parse_args()
- cfg = get_config(args.config_file)
+ cfg = get_config(args.config_file, args.opt)
main(args, cfg)
diff --git a/tools/styleclip_getf.py b/tools/styleclip_getf.py
new file mode 100644
index 0000000000000000000000000000000000000000..57767dc7e129cd23825ef9edd6aeff5f78231e50
--- /dev/null
+++ b/tools/styleclip_getf.py
@@ -0,0 +1,89 @@
+import argparse
+from tqdm import tqdm
+import paddle
+import numpy as np
+
+from ppgan.apps.styleganv2_predictor import StyleGANv2Predictor
+
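+# Sample 100000 latents, map them to style space S via style_affine, dump the
+# styles of the first 2000 samples to S-<dataset_name>.pdparams, and save the
+# per-layer mean/std to stylegan2-<dataset_name>-styleclip-stats.pdparams.
+# Example invocation (the dataset name must match an available predictor weight):
+#   python tools/styleclip_getf.py --dataset_name ffhq-config-f --seed 1234
+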
+
+def concat_style_paddle(s_lst, n_layers):
+ result = [list() for _ in range(n_layers)]
+ assert n_layers == len(s_lst[0])
+ for i in range(n_layers):
+ for s_ in s_lst:
+ result[i].append(s_[i])
+ for i in range(n_layers):
+ result[i] = paddle.concat(result[i])
+ return result
+
+
+def to_np(s_lst):
+ for i in range(len(s_lst)):
+ s_lst[i] = s_lst[i].numpy()
+ return s_lst
+
+
+def concat_style_np(s_lst, n_layers):
+ result = [list() for _ in range(n_layers)]
+ assert n_layers == len(s_lst[0])
+ for i in range(n_layers):
+ for s_ in s_lst:
+ result[i].append(s_[i])
+ for i in range(n_layers):
+ result[i] = np.concatenate(result[i])
+ return result
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--dataset_name', type=str, default='ffhq-config-f')
+ parser.add_argument('--seed', type=int, default=1234)
+ args = parser.parse_args()
+
+ dataset_name = args.dataset_name
+ G = StyleGANv2Predictor(model_type=dataset_name).generator
+ w_idx_lst = G.w_idx_lst
+
+ with paddle.no_grad():
+        # get the intermediate latents (W) of 100000 samples, in 1000 batches of 100
+ w_lst = list()
+ z = paddle.to_tensor(
+ np.random.RandomState(args.seed).randn(
+ 1000, 100, G.style_dim).astype('float32'))
+ #z = paddle.randn([1000, 100, G.style_dim])
+        for i in tqdm(range(1000)):  # 1000 batches of 100 samples
+ # apply truncation_psi=.7 truncation_cutoff=8
+ w_lst.append(
+ G.get_latents(z[i], truncation=0.7, truncation_cutoff=8))
+ #paddle.save(paddle.concat(w_lst[:20]), f'W-{dataset_name}.pdparams')
+
+ s_lst = []
+        # get the styles of the first 2000 samples in W (20 batches of 100)
+        for i in tqdm(range(20)):
+ s_ = G.style_affine(w_lst[i])
+ s_lst.append(s_)
+ paddle.save(concat_style_paddle(s_lst, len(w_idx_lst)),
+ f'S-{dataset_name}.pdparams')
+
+        for i in tqdm(range(20)):  # convert the first 20 batches to numpy
+ s_lst[i] = to_np(s_lst[i])
+
+        # accumulate the styles of the remaining batches, then compute the
+        # per-layer mean and std over all 100000 samples
+        for i in tqdm(range(20, 1000)):
+ s_ = G.style_affine(w_lst[i])
+ s_lst.append(to_np(s_))
+ del w_lst, z, s_, G
+ s_lst = concat_style_np(s_lst, len(w_idx_lst))
+ s_mean = [
+ paddle.mean(paddle.to_tensor(s_lst[i]), axis=0)
+ for i in range(len(w_idx_lst))
+ ]
+ s_std = [
+ paddle.std(paddle.to_tensor(s_lst[i]), axis=0)
+ for i in range(len(w_idx_lst))
+ ]
+ paddle.save({
+ 'mean': s_mean,
+ 'std': s_std
+ }, f'stylegan2-{dataset_name}-styleclip-stats.pdparams')
+ print("Done.")