first commit

张仪 2024-08-21 22:15:12 +08:00
commit b4244090ae
775 changed files with 134703 additions and 0 deletions

29
.dockerignore Normal file

@@ -0,0 +1,29 @@
Dockerfile
**/publish.py
my
.git
.refresh
__pycache__
.ipynb_checkpoints/
.vscode/
__res/
perf.data
perf.data.old
*.swp
*.ipynb
*.pdf
*.zip
*.tgz
test.py
extern/mkl/mkldnn_lnx*/*
data/
build/
venv/
*.md
!*.src.md
!README.md
!README.cn.md
python/jittor.egg-info
dist/
!doc/source/*
__data__

57
.github/workflows/main.yml vendored Normal file

@@ -0,0 +1,57 @@
# This is a basic workflow to help you get started with Actions
name: CI
# Controls when the action will run. Triggers the workflow on push or pull request
# events but only for the master branch
on: [ push ]
# push:
# branches: [ master ]
# pull_request:
# branches: [ master ]
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  test_clang_8_cuda_10:
    # The type of runner that the job will run on
    runs-on: self-hosted
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v2
      - name: test
        run: |
          export cache_name=github_${GITHUB_REF##*/}
          export cc_path="clang++-8"
          export cc_flags=" -g "
          export log_sync=0
          export log_v=0
          export PYTHONIOENCODING=utf8
          export PYTHONPATH=`pwd`/python
          export nvcc_path=/usr/local/cuda/bin/nvcc
          python3.7 -c "import jittor"
          python3.7 -m jittor.test -v
  test_gcc:
    # The type of runner that the job will run on
    runs-on: self-hosted
    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v2
      - name: test
        run: |
          export cache_name=github_${GITHUB_REF##*/}
          export cc_path="g++"
          export cc_flags=" -g "
          export log_sync=0
          export log_v=0
          export PYTHONIOENCODING=utf8
          export PYTHONPATH=`pwd`/python
          export nvcc_path=
          python3.7 -c "import jittor"
          python3.7 -m jittor.test -v

30
.gitignore vendored Normal file

@@ -0,0 +1,30 @@
my
.refresh
.DS_Store
__pycache__
.ipynb_checkpoints/
.vscode/
__res/
perf.data
perf.data.old
*.swp
*.ipynb
*.pdf
*.zip
*.tgz
*.obj
test.py
extern/mkl/mkldnn_lnx*/*
data/
build/
venv/
*.md
!*.src.md
!README.md
!README.cn.md
!CHANGELOG.md
python/jittor.egg-info
dist/
!doc/source/*
core
__data__

46
.gitlab-ci.yml Normal file

@@ -0,0 +1,46 @@
test_clang_8_cuda_10:
  tags:
    - clang
    - cuda
  script:
    - export cache_name=$CI_COMMIT_REF_NAME
    - export cc_path="clang-8"
    - export cc_flags=" -g "
    - export log_sync=0
    - export log_v=0
    - export PYTHONIOENCODING=utf8
    - export PYTHONPATH=`pwd`/python
    - export nvcc_path=/usr/local/cuda/bin/nvcc
    - python3.7 -c "import jittor"
    - python3.7 -m jittor.test -v
# test_icc_19:
#   tags:
#     - icc
#   script:
#     - export cache_name=$CI_COMMIT_REF_NAME
#     - export cc_path="/opt/intel/system_studio_2019/bin/icc"
#     - export cc_flags=" -g "
#     - export log_sync=0
#     - export log_v=0
#     - export PYTHONIOENCODING=utf8
#     - export PYTHONPATH=`pwd`/python
#     - export LD_LIBRARY_PATH="/opt/intel/system_studio_2019/compilers_and_libraries/linux/lib/intel64"
#     - python3.7 -c "import jittor"
#     - python3.7 -m jittor.test -v
test_g++:
  tags:
    - gcc
  script:
    - export cache_name=$CI_COMMIT_REF_NAME
    - export cc_path="g++"
    - export cc_flags=" -g "
    - export log_sync=0
    - export log_v=0
    - export PYTHONIOENCODING=utf8
    - export PYTHONPATH=`pwd`/python
    - export nvcc_path=
    - python3.7 -c "import jittor"
    - python3.7 -m jittor.test -v

75
CHANGELOG.md Normal file

@@ -0,0 +1,75 @@
# CHANGELOG
### Jittor 1.1.5.5
* Added the numpy code operator: custom operators can now be written directly with numpy. Example:
```python
import jittor as jt
def forward_code(np, data):
    a = data["inputs"][0]
    b = data["outputs"][0]
    np.add(a, a, out=b)

def backward_code(np, data):
    dout = data["dout"]
    out = data["outputs"][0]
    np.copyto(out, dout*2.0)

a = jt.random((5,1))
b = jt.numpy_code(
    a.shape,
    a.dtype,
    [a],
    forward_code,
    [backward_code],
)
```
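A hedged sanity check of this operator (our addition, continuing the snippet above; it assumes `Var.sum` and `jt.grad` behave as in the Function example below): the forward computes `b = a + a`, so the gradient with respect to `a` should be 2 everywhere.
```python
print(b.data)            # equals 2 * a.data
da = jt.grad(b.sum(), a)
print(da.data)           # expected: all elements are 2.0
```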
* Added the Function module, which lets users define custom backward (gradient) computation. Example:
```python
import jittor as jt
from jittor import Function
class MyFunc(Function):
    def execute(self, x, y):
        self.x = x
        self.y = y
        return x*y, x/y

    def grad(self, grad0, grad1):
        return grad0 * self.y, grad1 * self.x

a = jt.array(3.0)
b = jt.array(4.0)
func = MyFunc()
c, d = func(a, b)
da, db = jt.grad(c+d*3, [a, b])
assert da.data == 4
assert db.data == 9
```
* Added the no_grad scope: all variables created inside it stop gradients:
```python
import jittor as jt
with jt.no_grad():
    ...
```
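A minimal sketch of the effect (our example, not from the changelog; it assumes `jt.grad` returns a zero gradient for variables created under no_grad):
```python
import jittor as jt
x = jt.array(3.0)
with jt.no_grad():
    y = x * x        # y is created inside no_grad, so it stops gradients
dx = jt.grad(y, x)
print(dx.data)       # expected: 0.0, since no graph was recorded
```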
* Added bmm (batch matrix multiply) support:
```python
import jittor as jt
from jittor import nn
batch, n, m, k = 100, 5, 6, 7
a = jt.random((batch, n, m))
b = jt.random((batch, m, k))
c = nn.bmm(a, b)
```
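To make the semantics concrete, a self-contained, hedged check (our addition) comparing `nn.bmm` against per-slice numpy matmul:
```python
import numpy as np
import jittor as jt
from jittor import nn

batch, n, m, k = 100, 5, 6, 7
a = jt.random((batch, n, m))
b = jt.random((batch, m, k))
c = nn.bmm(a, b)
# bmm multiplies each batch slice independently: c[i] = a[i] @ b[i]
ref = np.stack([a.data[i] @ b.data[i] for i in range(batch)])
assert np.allclose(c.data, ref, atol=1e-4)
```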
* Fixed unsqueeze

50
Dockerfile Normal file

@@ -0,0 +1,50 @@
# docker build commands
ARG FROM_IMAGE=ubuntu:18.04
FROM ${FROM_IMAGE}
RUN apt update && apt install ca-certificates -y
# change tsinghua mirror
RUN echo \
"deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic main restricted universe multiverse\n\
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-updates main restricted universe multiverse\n\
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-backports main restricted universe multiverse\n\
deb [trusted=yes] https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ bionic-security main restricted universe multiverse" > /etc/apt/sources.list
RUN apt update && apt install wget \
python3.7 python3.7-dev \
g++ build-essential openssh-server -y
WORKDIR /usr/src/jittor
RUN apt download python3-distutils && dpkg-deb -x ./python3-distutils* / \
&& wget -O - https://bootstrap.pypa.io/get-pip.py | python3.7
ENV PYTHONIOENCODING utf8
# change tsinghua mirror
RUN pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
RUN pip3 install \
numpy \
tqdm \
pillow \
astunparse \
notebook
RUN pip3 install matplotlib
RUN apt install openmpi-bin openmpi-common libopenmpi-dev -y
RUN pip3 install jittor --timeout 100 && python3.7 -m jittor.test.test_example
RUN pip3 uninstall jittor -y
COPY . .
RUN pip3 install . --timeout 100
RUN python3.7 -m jittor.test.test_example
CMD python3.7 -m jittor.notebook --allow-root --ip=0.0.0.0
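A typical way to build and run this image (the tag `jittor:latest` is our choice, not part of the repository):
```bash
docker build -t jittor:latest .
docker run -it -p 8888:8888 jittor:latest   # serves the notebook defined by CMD
```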

203
LICENSE.txt Normal file

@@ -0,0 +1,203 @@
Copyright (c) 2023 Jittor. All Rights Reserved
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2023 Jittor. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

5
MANIFEST.in Normal file

@@ -0,0 +1,5 @@
exclude __data__
exclude __pycache__
prune **/__data__/
prune **/__pycache__
global-exclude *.pyc

422
README.cn.md Normal file

@@ -0,0 +1,422 @@
# Jittor: a Just-in-time (JIT) deep learning framework
![Jittor Logo](https://cg.cs.tsinghua.edu.cn/jittor/favicon_package_v0/JittorLogo_Final1220.svg)
[Quickstart](#quickstart) | [Install](#install) | [Tutorial](#tutorial) | [English](./README.md)
Jittor is a high-performance deep learning framework based on JIT compiling and meta-operators. The whole framework and meta-operators are compiled just-in-time. A powerful op compiler and tuner are integrated into Jittor, allowing it to generate specialized high-performance code for your model. Jittor also contains a rich collection of high-performance model libraries, covering image recognition, detection, segmentation, generation, differentiable rendering, geometric learning, reinforcement learning, and more.
The front-end language is Python, using a module design and dynamic graph execution, currently the most popular interface design among deep learning frameworks. The back-end is implemented in high-performance languages such as CUDA and C++.
Related links:
* [Jittor website](https://cg.cs.tsinghua.edu.cn/jittor/)
* [Jittor tutorials](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/)
* [Jittor models](https://cg.cs.tsinghua.edu.cn/jittor/resources/)
* [Jittor documents](https://cg.cs.tsinghua.edu.cn/jittor/assets/docs/index.html)
* [Github](https://github.com/jittor/jittor), [GitLink](https://www.gitlink.org.cn/jittor/jittor), [Gitee](https://gitee.com/jittor/jittor)
* [Jittor forum](https://discuss.jittor.org/)
* [Awesome Jittor list](https://github.com/Jittor/jittor/blob/master/AWESOME-JITTOR-LIST.md)
* IM: QQ Group(761222083)
The following example shows how to model a two-layer neural network step by step and train it from scratch in a few lines of Python code.
```python
import jittor as jt
from jittor import Module
from jittor import nn
import numpy as np
class Model(Module):
    def __init__(self):
        self.layer1 = nn.Linear(1, 10)
        self.relu = nn.Relu()
        self.layer2 = nn.Linear(10, 1)

    def execute(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

def get_data(n):  # generate random data for training test
    for i in range(n):
        x = np.random.rand(batch_size, 1)
        y = x*x
        yield jt.float32(x), jt.float32(y)

learning_rate = 0.1
batch_size = 50
n = 1000

model = Model()
optim = nn.SGD(model.parameters(), learning_rate)

for i, (x, y) in enumerate(get_data(n)):
    pred_y = model(x)
    dy = pred_y - y
    loss = dy * dy
    loss_mean = loss.mean()
    optim.step(loss_mean)
    print(f"step {i}, loss = {loss_mean.data.sum()}")
```
## Contents
* [Quickstart](#quickstart)
* [Install](#install)
* [Tutorial](#tutorial)
* [Contributing](#contributing)
* [The Team](#the-team)
* [License](#license)
## Quickstart
We provide some jupyter notebooks to help you quickly get started with Jittor.
- [Example: Model definition and training][1]
- [Basics: Op, Var][2]
- [Meta-operator: Implement your own convolution with Meta-operator][3]
## Install
Jittor environment requirements:
| OS | CPU | Python | Compiler | (Optional) GPU platform |
|--------------------------------------------------------|-------------------------------------|--------|--------------|---------------------------------------------|
| Linux<br>(Ubuntu, CentOS, Arch, <br>UOS, KylinOS, ...) | x86 <br>x86_64 <br>ARM <br>loongson | >= 3.7 | g++ >=5.4 | Nvidia CUDA >= 10.0, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar) <br> or [AMD ROCm](https://docs.amd.com/) >= 4.0 <br> or [Hygon DCU DTK](https://tycloud.hpccube.com/doc/1.0.6/11277/general-handbook/software-tutorial/jittor.html) >= 22.04 |
| macOS <br>(>= 10.14 Mojave) | intel<br>Apple Silicon | >= 3.7 | clang >= 8.0 | - |
| Windows 10 & 11 | x86_64 | [>= 3.8](https://www.python.org/downloads/windows/) | - | Nvidia CUDA >= 10.2 [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#install-windows) |
Jittor offers three installation methods: pip, docker, and manual.
## Pip install
The commands below are for Ubuntu. If you are using another Linux distribution (e.g. CentOS), please install the dependencies first (Python >= 3.7, g++ >= 5.4) or use the **docker install** instead. If you already have a compiler and a matching Python version, we strongly recommend this method
(if GitHub is not accessible, Jittor can be downloaded from the Jittor homepage):
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
If the test passes, congratulations, the installation is complete.
Jittor automatically searches the PATH for a suitable compiler. If you want to specify a compiler manually, use the environment variables `cc_path` and `nvcc_path` (optional).
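For example (the values below are placeholders; adjust them to your system):
```bash
export cc_path="g++"
export nvcc_path="/usr/local/cuda/bin/nvcc"
python3.7 -m jittor.test.test_example
```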
### macOS install
On macOS, please install the additional dependencies with [homebrew](https://brew.sh).
```bash
brew install libomp
```
Then you can install jittor through pip and test that it runs successfully.
```bash
python3.7 -m pip install jittor
python3.7 -m jittor.test.test_example
```
Currently jittor only supports CPU computation on macOS.
### Windows install
On Windows, please prepare Python >= 3.8; install instructions are listed below (conda needs extra steps):
```bash
# check your python version(>=3.8)
python --version
python -m pip install jittor
# if conda is used
conda install pywin32
```
On Windows, jittor will automatically detect your GPU and install the matching CUDA. Please make sure your NVIDIA driver supports CUDA 10.2 or above. You can also manually let Jittor install CUDA with the following command:
```bash
python -m jittor_utils.install_cuda
```
## Docker install
We provide a Docker installation method to save you from configuring the environment. It works as follows:
```
# CPU only(Linux)
docker run -it --network host jittor/jittor
# CPU and CUDA(Linux)
docker run -it --network host --gpus all jittor/jittor-cuda
# CPU only(Mac and Windows)
docker run -it -p 8888:8888 jittor/jittor
```
For a detailed Docker tutorial, see [Install Jittor via Docker on Windows/Mac/Linux](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/2020-5-15-00-00-docker/).
## Manual install
We will show how to install Jittor on Ubuntu 16.04 step by step. Other Linux distributions may use similar commands.
### Step 1: Choose your back-end compiler
```bash
# g++
sudo apt install g++ build-essential libomp-dev
# OR clang++-8
wget -O - https://raw.githubusercontent.com/Jittor/jittor/master/script/install_llvm.sh > /tmp/llvm.sh
bash /tmp/llvm.sh 8
```
### Step 2: Install Python and python-dev
Jittor needs Python >= 3.7.
```bash
sudo apt install python3.7 python3.7-dev
```
### Step 3: Run Jittor
The whole framework is compiled just-in-time. Let's install jittor via pip:
```bash
git clone https://github.com/Jittor/jittor.git
sudo pip3.7 install ./jittor
export cc_path="clang++-8"
# if other compiler is used, change cc_path
# export cc_path="g++"
# export cc_path="icc"
# run a simple test
python3.7 -m jittor.test.test_example
```
If the test passes, your Jittor is ready.
### Optional Step 4: Enable CUDA
Using CUDA in Jittor is very simple: just set the environment variable `nvcc_path`.
```bash
# replace this var with your nvcc location
export nvcc_path="/usr/local/cuda/bin/nvcc"
# run a simple cuda test
python3.7 -m jittor.test.test_cuda
```
If the test passes, you can enable CUDA in Jittor by setting the `use_cuda` flag.
```python
import jittor as jt
jt.flags.use_cuda = 1
```
### Optional Step 5: Test Resnet18 training
To check the integrity of Jittor, you can run the Resnet18 training test. Note: this test requires 6 GB of GPU RAM.
```bash
python3.7 -m jittor.test.test_resnet
```
If these tests fail, please report the bug to us; contributions to Jittor are very welcome ^_^
## Tutorial
In the tutorial section, we will briefly explain the basic concepts of Jittor.
To train your model with Jittor, there are two main concepts you need to know:
* Var: basic data type of jittor
* Operations: Jittor's operations are similar to numpy's
### Var
First, let's get started with Var. Var is the basic data type of jittor. For efficiency, computation in Jittor is asynchronous; if you want to access the data, `Var.data` can be used for synchronous data access.
```python
import jittor as jt
a = jt.float32([1,2,3])
print (a)
print (a.data)
# Output: float32[3,]
# Output: [ 1. 2. 3.]
```
We can also give a variable a name.
```python
a.name('a')
print(a.name())
# Output: a
```
### Operations
Jittor's operations are similar to numpy's. Let's try some operations. We create Var `a` and `b` via the operation `jt.float32`, and multiply them. Printing those variables shows they have the same shape and dtype.
```python
import jittor as jt
a = jt.float32([1,2,3])
b = jt.float32([4,5,6])
c = a*b
print(a,b,c)
print(type(a), type(b), type(c))
# Output: float32[3,] float32[3,] float32[3,]
# Output: <class 'jittor_core.Var'> <class 'jittor_core.Var'> <class 'jittor_core.Var'>
```
Besides that, all operators of the form `jt.xxx(Var, ...)` have an alias `Var.xxx(...)`. For example:
```python
c.max() # alias of jt.max(c)
c.add(a) # alias of jt.add(c, a)
c.min(keepdims=True) # alias of jt.min(c, keepdims=True)
```
If you want to know all the operations Jittor supports, run `help(jt.ops)`. All operations found under `jt.ops.xxx` can also be used via the alias `jt.xxx`.
```python
help(jt.ops)
# Output:
# abs(x: core.Var) -> core.Var
# add(x: core.Var, y: core.Var) -> core.Var
# array(data: array) -> core.Var
# binary(x: core.Var, y: core.Var, op: str) -> core.Var
# ......
```
### More
If you want to know more about Jittor, please check out the notebooks below:
* Quickstart
    * [Example: Model definition and training][1]
    * [Basics: Op, Var][2]
    * [Meta-operator: Implement your own convolution with Meta-operator][3]
* Advanced
    * [Custom Op: write your operator with C++ and CUDA and JIT compile it][4]
    * [Profiler: Profiling your model][5]
    * Jtune: Tool for performance tuning
[1]: python/jittor/notebook/example.src.md "example"
[2]: python/jittor/notebook/basics.src.md "basics"
[3]: python/jittor/notebook/meta_op.src.md "meta_op"
[4]: python/jittor/notebook/custom_op.src.md "custom_op"
[5]: python/jittor/notebook/profiler.src.md "profiler"
These notebooks can be run on your own computer with `python3.7 -m jittor.notebook`.
## Contributing
Jittor is still young. It may contain bugs and issues. Please report them in our bug tracking system. Contributions are welcome. Besides, if you have any ideas about Jittor, please let us know.
You can help Jittor in the following ways:
* Cite Jittor in your paper
* Recommend Jittor to your friends
* Contribute code
* Contribute tutorials and documentation
* File an issue
* Answer jittor-related questions
* Star the repository
* Keep an eye on jittor
* ......
## Contact Us
Website: http://cg.cs.tsinghua.edu.cn/jittor/
Email: jittor@qq.com
File an issue: https://github.com/Jittor/jittor/issues
QQ Group: 761222083
## The Team
Jittor is currently maintained by the [Tsinghua CSCG Group](https://cg.cs.tsinghua.edu.cn/). If you are also interested in Jittor and want to improve it, please join us!
## Citation
```
@article{hu2020jittor,
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
journal={Science China Information Sciences},
volume={63},
number={222103},
pages={1--21},
year={2020}
}
```
## License
Jittor is Apache 2.0 licensed, as found in the LICENSE.txt file.

416
README.md Normal file

@@ -0,0 +1,416 @@
# Jittor: a Just-in-time (JIT) deep learning framework
![Jittor Logo](https://cg.cs.tsinghua.edu.cn/jittor/favicon_package_v0/JittorLogo_Final1220.svg)
[Quickstart](#quickstart) | [Install](#install) | [Tutorial](#tutorial) | [简体中文](./README.cn.md)
Jittor is a high-performance deep learning framework based on JIT compiling and meta-operators. The whole framework and meta-operators are compiled just-in-time. A powerful op compiler and tuner are integrated into Jittor, allowing it to generate specialized high-performance code for your model. Jittor also contains a rich collection of high-performance model libraries, covering image recognition, detection, segmentation, generation, differentiable rendering, geometric learning, reinforcement learning, and more.
The front-end language is Python, using a module design and dynamic graph execution, currently the most popular interface design among deep learning frameworks. The back-end is implemented in high-performance languages such as CUDA and C++.
Related Links:
* [Jittor Website](https://cg.cs.tsinghua.edu.cn/jittor/)
* [Jittor Tutorials](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/)
* [Jittor Models](https://cg.cs.tsinghua.edu.cn/jittor/resources/)
* [Jittor Documents](https://cg.cs.tsinghua.edu.cn/jittor/assets/docs/index.html)
* [Github](https://github.com/jittor/jittor), [GitLink](https://www.gitlink.org.cn/jittor/jittor), [Gitee](https://gitee.com/jittor/jittor)
* [Jittor Forum](https://discuss.jittor.org/)
* [Awesome Jittor List](https://github.com/Jittor/jittor/blob/master/AWESOME-JITTOR-LIST.md)
* IM: QQ Group(761222083)
The following example shows how to model a two-layer neural network step by step and train it from scratch in a few lines of Python code.
```python
import jittor as jt
from jittor import Module
from jittor import nn
import numpy as np
class Model(Module):
    def __init__(self):
        self.layer1 = nn.Linear(1, 10)
        self.relu = nn.Relu()
        self.layer2 = nn.Linear(10, 1)

    def execute(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

def get_data(n):  # generate random data for training test
    for i in range(n):
        x = np.random.rand(batch_size, 1)
        y = x*x
        yield jt.float32(x), jt.float32(y)

learning_rate = 0.1
batch_size = 50
n = 1000

model = Model()
optim = nn.SGD(model.parameters(), learning_rate)

for i, (x, y) in enumerate(get_data(n)):
    pred_y = model(x)
    dy = pred_y - y
    loss = dy * dy
    loss_mean = loss.mean()
    optim.step(loss_mean)
    print(f"step {i}, loss = {loss_mean.data.sum()}")
```
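As a quick follow-up (our addition, continuing the example above), the trained model can be evaluated on a fresh input; since the target function is y = x*x, the prediction for 0.5 should approach 0.25:
```python
test_x = jt.float32([[0.5]])
print(model(test_x).data)   # expected to be close to [[0.25]] after training
```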
## Contents
* [Quickstart](#quickstart)
* [Install](#install)
* [Tutorial](#tutorial)
* [Contributing](#contributing)
* [The Team](#the-team)
* [License](#license)
## Quickstart
We provide some jupyter notebooks to help you quickly get started with Jittor.
- [Example: Model definition and training][1]
- [Basics: Op, Var][2]
- [Meta-operator: Implement your own convolution with Meta-operator][3]
## Install
Jittor environment requirements:
| OS | CPU | Python | Compiler | (Optional) GPU platform |
|--------------------------------------------------------|-------------------------------------|--------|--------------|---------------------------------------------|
| Linux<br>(Ubuntu, CentOS, Arch, <br>UOS, KylinOS, ...) | x86 <br>x86_64 <br>ARM <br>loongson | >= 3.7 | g++ >=5.4 | Nvidia CUDA >= 10.0, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar) <br> or [AMD ROCm](https://docs.amd.com/) >= 4.0 <br> or [Hygon DCU DTK](https://tycloud.hpccube.com/doc/1.0.6/11277/general-handbook/software-tutorial/jittor.html) >= 22.04 |
| macOS <br>(>= 10.14 Mojave) | intel<br>Apple Silicon | >= 3.7 | clang >= 8.0 | - |
| Windows 10 & 11 | x86_64 | [>= 3.8](https://www.python.org/downloads/windows/) | - | Nvidia CUDA >= 10.2 [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#install-windows) |
Jittor offers three ways to install: pip, docker, or manual.
## Pip install
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
### macOS install
Please first install additional dependencies with [homebrew](https://brew.sh).
```bash
brew install libomp
```
Then you can install jittor through pip and run the example.
```bash
python3.7 -m pip install jittor
python3.7 -m jittor.test.test_example
```
Currently jittor only supports CPU on macOS.
### Windows install
```bash
# check your python version(>=3.8)
python --version
python -m pip install jittor
# if conda is used
conda install pywin32
```
On Windows, jittor will automatically detect and install CUDA; please make sure your NVIDIA driver supports CUDA 10.2 or above, or manually let jittor install CUDA for you:
```bash
python -m jittor_utils.install_cuda
```
## Docker Install
We provide a Docker installation method to save you from configuring the environment. The Docker installation method is as follows:
```
# CPU only(Linux)
docker run -it --network host jittor/jittor
# CPU and CUDA(Linux)
docker run -it --network host --gpus all jittor/jittor-cuda
# CPU only(Mac and Windows)
docker run -it -p 8888:8888 jittor/jittor
```
## Manual install
We will show how to install Jittor on Ubuntu 16.04 step by step. Other Linux distributions may use similar commands.
### Step 1: Choose your back-end compiler
```bash
# g++
sudo apt install g++ build-essential libomp-dev
# OR clang++-8
wget -O - https://raw.githubusercontent.com/Jittor/jittor/master/script/install_llvm.sh > /tmp/llvm.sh
bash /tmp/llvm.sh 8
```
### Step 2: Install Python and python-dev
Jittor needs Python >= 3.7.
```bash
sudo apt install python3.7 python3.7-dev
```
### Step 3: Run Jittor
The whole framework is compiled just-in-time. Let's install jittor via pip:
```bash
git clone https://github.com/Jittor/jittor.git
sudo pip3.7 install ./jittor
export cc_path="clang++-8"
# if other compiler is used, change cc_path
# export cc_path="g++"
# export cc_path="icc"
# run a simple test
python3.7 -m jittor.test.test_example
```
If the test passes, your Jittor is ready.
### Optional Step 4: Enable CUDA
Using CUDA in Jittor is very simple: just set the environment variable `nvcc_path`.
```bash
# replace this var with your nvcc location
export nvcc_path="/usr/local/cuda/bin/nvcc"
# run a simple cuda test
python3.7 -m jittor.test.test_cuda
```
If the test passes, you can enable CUDA in Jittor by setting the `use_cuda` flag.
```python
import jittor as jt
jt.flags.use_cuda = 1
```
### Optional Step 5: Test Resnet18 training
To check the integrity of Jittor, you can run the Resnet18 training test. Note: this test requires 6 GB of GPU RAM.
```bash
python3.7 -m jittor.test.test_resnet
```
If these tests fail, please report the bug to us, and feel free to contribute ^_^
## Tutorial
In the tutorial section, we will briefly explain the basic concepts of Jittor.
To train your model with Jittor, there are two main concepts you need to know:
* Var: basic data type of jittor
* Operations: Jittor's operations are similar to numpy's
### Var
First, let's get started with Var. Var is the basic data type of jittor. For efficiency, computation in Jittor is asynchronous; if you want to access the data, `Var.data` can be used for synchronous data access.
```python
import jittor as jt
a = jt.float32([1,2,3])
print (a)
print (a.data)
# Output: float32[3,]
# Output: [ 1. 2. 3.]
```
We can also give a variable a name.
```python
a.name('a')
print(a.name())
# Output: a
```
### Operations
Jittor's operations are similar to numpy's. Let's try some operations. We create Var `a` and `b` via the operation `jt.float32`, and multiply them. Printing those variables shows they have the same shape and dtype.
```python
import jittor as jt
a = jt.float32([1,2,3])
b = jt.float32([4,5,6])
c = a*b
print(a,b,c)
print(type(a), type(b), type(c))
# Output: float32[3,] float32[3,] float32[3,]
# Output: <class 'jittor_core.Var'> <class 'jittor_core.Var'> <class 'jittor_core.Var'>
```
Besides that, all operators of the form `jt.xxx(Var, ...)` have an alias `Var.xxx(...)`. For example:
```python
c.max() # alias of jt.max(c)
c.add(a) # alias of jt.add(c, a)
c.min(keepdims=True) # alias of jt.min(c, keepdims=True)
```
If you want to know all the operations Jittor supports, try `help(jt.ops)`. All operations found under `jt.ops.xxx` can be used via the alias `jt.xxx`.
```python
help(jt.ops)
# Output:
# abs(x: core.Var) -> core.Var
# add(x: core.Var, y: core.Var) -> core.Var
# array(data: array) -> core.Var
# binary(x: core.Var, y: core.Var, op: str) -> core.Var
# ......
```
### More
If you want to know more about Jittor, please check out the notebooks below:
* Quickstart
    - [Example: Model definition and training][1]
    - [Basics: Op, Var][2]
    - [Meta-operator: Implement your own convolution with Meta-operator][3]
* Advanced
    - [Custom Op: write your operator with C++ and CUDA and JIT compile it][4]
    - [Profiler: Profiling your model][5]
    - Jtune: Tool for performance tuning
[1]: python/jittor/notebook/example.src.md "example"
[2]: python/jittor/notebook/basics.src.md "basics"
[3]: python/jittor/notebook/meta_op.src.md "meta_op"
[4]: python/jittor/notebook/custom_op.src.md "custom_op"
[5]: python/jittor/notebook/profiler.src.md "profiler"
Those notebooks can be run on your own computer with `python3.7 -m jittor.notebook`.
## Contributing
Jittor is still young. It may contain bugs and issues. Please report them in our bug tracking system. Contributions are welcome. Besides, if you have any ideas about Jittor, please let us know.
You can help Jittor in the following ways:
* Cite Jittor in your paper
* Recommend Jittor to your friends
* Contribute code
* Contribute tutorials and documentation
* File an issue
* Answer jittor-related questions
* Star the repository
* Keep an eye on jittor
* ......
## Contact Us
Website: http://cg.cs.tsinghua.edu.cn/jittor/
Email: jittor@qq.com
File an issue: https://github.com/Jittor/jittor/issues
QQ Group: 836860279
<img src="https://github.com/Jittor/jittor/assets/62846124/8dd830bd-b31c-4e4f-9a78-5fd7a3409145" width="200"/>
## The Team
Jittor is currently maintained by the [Tsinghua CSCG Group](https://cg.cs.tsinghua.edu.cn/). If you are also interested in Jittor and want to improve it, please join us!
## Citation
```
@article{hu2020jittor,
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
journal={Science China Information Sciences},
volume={63},
number={222103},
pages={1--21},
year={2020}
}
```
## License
Jittor is Apache 2.0 licensed, as found in the LICENSE.txt file.

524
README.src.md Normal file

@@ -0,0 +1,524 @@
# Jittor: a Just-in-time (JIT) deep learning framework
![Jittor Logo](https://cg.cs.tsinghua.edu.cn/jittor/favicon_package_v0/JittorLogo_Final1220.svg)
[Quickstart](#quickstart) | [Install](#install) | [Tutorial](#tutorial) | [Chinese](./README.cn.md)
Jittor is a high-performance deep learning framework based on JIT compiling and meta-operators. The whole framework and meta-operators are compiled just-in-time. A powerful op compiler and tuner are integrated into Jittor, allowing it to generate specialized high-performance code for your model. Jittor also contains a rich collection of high-performance model libraries, covering image recognition, detection, segmentation, generation, differentiable rendering, geometric learning, reinforcement learning, and more.
The front-end language is Python, using a module design and dynamic graph execution, currently the most popular interface design among deep learning frameworks. The back-end is implemented in high-performance languages such as CUDA and C++.
Related Links:
* [Jittor Website](https://cg.cs.tsinghua.edu.cn/jittor/)
* [Jittor Tutorials](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/)
* [Jittor Models](https://cg.cs.tsinghua.edu.cn/jittor/resources/)
* [Jittor Documents](https://cg.cs.tsinghua.edu.cn/jittor/assets/docs/index.html)
* [Github](https://github.com/jittor/jittor), [GitLink](https://www.gitlink.org.cn/jittor/jittor), [Gitee](https://gitee.com/jittor/jittor)
* [Jittor Forum](https://discuss.jittor.org/)
* [Awesome Jittor List](https://github.com/Jittor/jittor/blob/master/AWESOME-JITTOR-LIST.md)
* IM: QQ Group(761222083)
The following example shows how to model a two-layer neural network step by step and train it from scratch in a few lines of Python code.
```python
import jittor as jt
from jittor import Module
from jittor import nn
import numpy as np
class Model(Module):
    def __init__(self):
        self.layer1 = nn.Linear(1, 10)
        self.relu = nn.Relu()
        self.layer2 = nn.Linear(10, 1)

    def execute(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

def get_data(n):  # generate random data for training test
    for i in range(n):
        x = np.random.rand(batch_size, 1)
        y = x*x
        yield jt.float32(x), jt.float32(y)

learning_rate = 0.1
batch_size = 50
n = 1000

model = Model()
optim = nn.SGD(model.parameters(), learning_rate)

for i, (x, y) in enumerate(get_data(n)):
    pred_y = model(x)
    dy = pred_y - y
    loss = dy * dy
    loss_mean = loss.mean()
    optim.step(loss_mean)
    print(f"step {i}, loss = {loss_mean.data.sum()}")
```
## Contents
* [Quickstart](#quickstart)
* [Install](#install)
* [Tutorial](#tutorial)
* [Contributing](#contributing)
* [The Team](#the-team)
* [License](#license)
## Quickstart
We provide some jupyter notebooks to help you quickly get started with Jittor.
- [Example: Model definition and training][1]
- [Basics: Op, Var][2]
- [Meta-operator: Implement your own convolution with Meta-operator][3]
## Install
Jittor environment requirements:
| OS | CPU | Python | Compiler | (Optional) GPU platform |
|--------------------------------------------------------|-------------------------------------|--------|--------------|---------------------------------------------|
| Linux<br>(Ubuntu, CentOS, Arch, <br>UOS, KylinOS, ...) | x86 <br>x86_64 <br>ARM <br>loongson | >= 3.7 | g++ >=5.4 | Nvidia CUDA >= 10.0, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar) <br> or [AMD ROCm](https://docs.amd.com/) >= 4.0 <br> or [Hygon DCU DTK](https://tycloud.hpccube.com/doc/1.0.6/11277/general-handbook/software-tutorial/jittor.html) >= 22.04 |
| macOS <br>(>= 10.14 Mojave) | intel<br>Apple Silicon | >= 3.7 | clang >= 8.0 | - |
| Windows 10 & 11 | x86_64 | [>= 3.8](https://www.python.org/downloads/windows/) | - | Nvidia CUDA >= 10.2 [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#install-windows) |
Jittor offers three ways to install: pip, docker, or manual.
## Pip install
The commands below are for Ubuntu. If you are using another Linux distribution (e.g. CentOS), please install the dependencies first (Python >= 3.7, g++ >= 5.4) or use the **docker install** instead. If you already have a compiler and a matching Python version, we strongly recommend this method
(if GitHub is not accessible, Jittor can be downloaded from the Jittor homepage):
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
If the test passes, congratulations, the installation is complete.
Jittor automatically searches the PATH for a suitable compiler. If you want to specify a compiler manually, use the environment variables `cc_path` and `nvcc_path` (optional).
### macOS install
Please first install the additional dependencies with [homebrew](https://brew.sh).
```bash
brew install libomp
```
Then you can install jittor through pip and run the example.
```bash
python3.7 -m pip install jittor
python3.7 -m jittor.test.test_example
```
Currently jittor only supports CPU on macOS.
### Windows install
Windows users should prepare Python >= 3.8; install instructions are listed below (conda needs extra steps):
```bash
# check your python version(>=3.8)
python --version
python -m pip install jittor
# if conda is used
conda install pywin32
```
On Windows, jittor will automatically detect your GPU and install the matching CUDA. Please make sure your NVIDIA driver supports CUDA 10.2 or above, or manually let jittor install CUDA for you:
```bash
python -m jittor_utils.install_cuda
```
## Docker Install
We provide a Docker installation method to save you from configuring the environment. It works as follows:
```
# CPU only(Linux)
docker run -it --network host jittor/jittor
# CPU and CUDA(Linux)
docker run -it --network host --gpus all jittor/jittor-cuda
# CPU only(Mac and Windows)
docker run -it -p 8888:8888 jittor/jittor
```
For a detailed Docker tutorial, see [Install Jittor via Docker on Windows/Mac/Linux](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/2020-5-15-00-00-docker/).
## Manual install
We will show how to install Jittor on Ubuntu 16.04 step by step. Other Linux distributions may use similar commands.
### Step 1: Choose your back-end compiler
```bash
# g++
sudo apt install g++ build-essential libomp-dev
# OR clang++-8
wget -O - https://raw.githubusercontent.com/Jittor/jittor/master/script/install_llvm.sh > /tmp/llvm.sh
bash /tmp/llvm.sh 8
```
### Step 2: Install Python and python-dev
Jittor needs Python >= 3.7.
```bash
sudo apt install python3.7 python3.7-dev
```
### Step 3: Run Jittor
The whole framework is compiled just-in-time. Let's install jittor via pip:
```bash
git clone https://github.com/Jittor/jittor.git
sudo pip3.7 install ./jittor
export cc_path="clang++-8"
# if other compiler is used, change cc_path
# export cc_path="g++"
# export cc_path="icc"
# run a simple test
python3.7 -m jittor.test.test_example
```
If the test passes, your Jittor is ready.
### Optional Step 4: Enable CUDA
Using CUDA in Jittor is very simple: just set the environment variable `nvcc_path`.
```bash
# replace this var with your nvcc location
export nvcc_path="/usr/local/cuda/bin/nvcc"
# run a simple cuda test
python3.7 -m jittor.test.test_cuda
```
If the test passes, you can enable CUDA in Jittor by setting the `use_cuda` flag.
```python
import jittor as jt
jt.flags.use_cuda = 1
```
### Optional Step 5: Test Resnet18 training
To check the integrity of Jittor, you can run the Resnet18 training test. Note: this test requires 6 GB of GPU RAM.
```bash
python3.7 -m jittor.test.test_resnet
```
If these tests fail, please report the bug to us, and feel free to contribute ^_^
## Tutorial
In the tutorial section, we will briefly explain the basic concepts of Jittor.
To train your model with Jittor, there are two main concepts you need to know:
* Var: basic data type of jittor
* Operations: Jittor's operations are similar to numpy's
### Var
First, let's get started with Var. Var is the basic data type of jittor. Computation in Jittor is asynchronous for efficiency; if you want to access the data, `Var.data` can be used for synchronous data access.
```python
import jittor as jt
a = jt.float32([1,2,3])
print (a)
print (a.data)
# Output: float32[3,]
# Output: [ 1. 2. 3.]
```
We can also give a variable a name.
```python
a.name('a')
print(a.name())
# Output: a
```
### Operations
Jittor's operations are similar to numpy's. Let's try some operations. We create Var `a` and `b` via the operation `jt.float32`, and multiply them. Printing those variables shows they have the same shape and dtype.
```python
import jittor as jt
a = jt.float32([1,2,3])
b = jt.float32([4,5,6])
c = a*b
print(a,b,c)
print(type(a), type(b), type(c))
# Output: float32[3,] float32[3,] float32[3,]
# Output: <class 'jittor_core.Var'> <class 'jittor_core.Var'> <class 'jittor_core.Var'>
```
Besides that, all operators of the form `jt.xxx(Var, ...)` have an alias `Var.xxx(...)`. For example:
```python
c.max() # alias of jt.max(c)
c.add(a) # alias of jt.add(c, a)
c.min(keepdims=True) # alias of jt.min(c, keepdims=True)
```
If you want to know all the operations Jittor supports, try `help(jt.ops)`. All operations found under `jt.ops.xxx` can be used via the alias `jt.xxx`.
```python
help(jt.ops)
# Output:
# abs(x: core.Var) -> core.Var
# add(x: core.Var, y: core.Var) -> core.Var
# array(data: array) -> core.Var
# binary(x: core.Var, y: core.Var, op: str) -> core.Var
# ......
```
### More
If you want to know more about Jittor, please check out the notebooks below:
* Quickstart
    - [Example: Model definition and training][1]
    - [Basics: Op, Var][2]
    - [Meta-operator: Implement your own convolution with Meta-operator][3]
* Advanced
    - [Custom Op: write your operator with C++ and CUDA and JIT compile it][4]
    - [Profiler: Profiling your model][5]
    - Jtune: Tool for performance tuning
[1]: python/jittor/notebook/example.src.md "example"
[2]: python/jittor/notebook/basics.src.md "basics"
[3]: python/jittor/notebook/meta_op.src.md "meta_op"
[4]: python/jittor/notebook/custom_op.src.md "custom_op"
[5]: python/jittor/notebook/profiler.src.md "profiler"
Those notebooks can be run on your own computer with `python3.7 -m jittor.notebook`.
## Contributing
Jittor is still young. It may contain bugs and issues. Please report them in our bug tracking system. Contributions are welcome. Besides, if you have any ideas about Jittor, please let us know.
You can help Jittor in the following ways:
* Cite Jittor in your paper
* Recommend Jittor to your friends
* Contribute code
* Contribute tutorials and documentation
* File an issue
* Answer jittor-related questions
* Star the repository
* Keep an eye on jittor
* ......
## Contact Us
Website: http://cg.cs.tsinghua.edu.cn/jittor/
Email: jittor@qq.com
File an issue: https://github.com/Jittor/jittor/issues
QQ Group: 761222083
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/news/2020-12-8-21-19-1_2_2/fig4.png" width="200"/>
## The Team
Jittor is currently maintained by the [Tsinghua CSCG Group](https://cg.cs.tsinghua.edu.cn/). If you are also interested in Jittor and want to improve it, please join us!
## Citation
```
@article{hu2020jittor,
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
journal={Science China Information Sciences},
volume={63},
number={222103},
pages={1--21},
year={2020}
}
```
## License
Jittor is Apache 2.0 licensed, as found in the LICENSE.txt file.

20
doc/Makefile Normal file

@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
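Typical usage of this Makefile, assuming sphinx-build is installed:
```bash
make help   # list the available Sphinx targets
make html   # build the HTML documentation into build/
```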

18
doc/build_doc.sh Executable file

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
# sudo python3.7 -m pip install \
# recommonmark \
# sphinx sphinx-autobuild sphinx_rtd_theme \
# sphinx-autobuild \
# --timeout 100
bpath=$(readlink -f "${BASH_SOURCE[0]}")
bpath=$(dirname "${bpath}")
jittor_path=$(readlink -f "${bpath}/..")
echo "[doc path] $bpath"
echo "[jittor path] $jittor_path"
export PYTHONPATH=$jittor_path/python
cd $bpath
sphinx-autobuild -b html source build -H 0.0.0.0 -p 8890

BIN
doc/logo.png Normal file (binary, 30 KiB; not shown)


@@ -0,0 +1,176 @@
Benchmarking Jittor and comparing with other frameworks
=====================
The code below uses ResNet-50 as an example to demonstrate the correct way to benchmark Jittor:
```python
import time
import jittor as jt
from jittor.models import resnet50
jt.flags.use_cuda = jt.has_cuda

warmup = 10
rerun = 100
batch_size = 8
data = jt.random((batch_size, 3, 224, 224))
model = resnet50()
model.eval()

# Warm up jittor so the timing below is accurate
jt.sync_all(True)
for i in range(warmup):
    pred = model(data)
    # sync sends the computation graph to the device
    pred.sync()
# sync_all(True) sends the computation graph to the device and synchronizes.
# Only after jt.sync_all(True) has run is the computation really finished,
# so it must be called both before and after the timed forward passes.
jt.sync_all(True)

# Start timing
start = time.time()
for i in range(rerun):
    pred = model(data)
    pred.sync()
jt.sync_all(True)
end = time.time()
print("Jittor FPS:", (rerun*batch_size)/(end-start))
```
In this code we define several parameters: `batch_size`, `warmup`, and `rerun`. `batch_size` is the batch size, `warmup` is the number of warm-up iterations, and `rerun` is the number of timed iterations; the final output is the FPS. The keys to benchmarking Jittor correctly are the warm-up part and the synchronization part: warm-up ensures the measured time is stable and excludes compilation time, while synchronization ensures the computation has actually finished, because jittor is an asynchronous framework and only a synchronization operation guarantees completion.
The result of running the code above (RTX Titan, batch size 8):
```
Compiling Operators(8/8) used: 7.35s eta: 0s
Compiling Operators(13/13) used: 8.36s eta: 0s
Jittor FPS: 908.9853866375396
```
We can test PyTorch's performance with similar code:
```python
import time
import torch
from torchvision.models import resnet50
warmup = 10
rerun = 100
batch_size = 8
data = torch.randn((batch_size, 3, 224, 224)).cuda()
model = resnet50()
model.cuda()
model.eval()

# Warm up pytorch so the timing below is accurate
torch.cuda.synchronize()
for i in range(warmup):
    pred = model(data)
# synchronize makes sure PyTorch's computation has finished
torch.cuda.synchronize()

# Start timing
start = time.time()
for i in range(rerun):
    pred = model(data)
torch.cuda.synchronize()
end = time.time()
print("PyTorch FPS:", (rerun*batch_size)/(end-start))
```
The result (RTX Titan, batch size 8):
```
PyTorch FPS: 807.4806873965665
```
We can also merge the two snippets and compare the consistency of their results:
```python
import time
import jittor as jt
from jittor.models import resnet50
jt.flags.use_cuda = jt.has_cuda

warmup = 100
rerun = 1000
batch_size = 8
data = jt.random((batch_size, 3, 224, 224))
model = resnet50()
model.eval()

# Warm up jittor so the timing below is accurate
jt.sync_all(True)
for i in range(warmup):
    pred = model(data)
    # sync sends the computation graph to the device
    pred.sync()
# sync_all(True) sends the computation graph to the device and synchronizes,
# so it must be called both before and after the timed forward passes.
jt.sync_all(True)

# Start timing
start = time.time()
for i in range(rerun):
    pred = model(data)
    pred.sync()
jt.sync_all(True)
end = time.time()
print("Jittor FPS:", (rerun*batch_size)/(end-start))

# Export jittor data and parameters to numpy and torch formats
jittor_data = pred.numpy()
jittor_param = model.state_dict(to="torch")

import numpy as np
import torch
from torchvision.models import resnet50

data = torch.Tensor(data.numpy()).cuda()
model = resnet50()
# Load the jittor parameters
model.load_state_dict(jittor_param)
model.cuda()
model.eval()

# Warm up pytorch so the timing below is accurate
torch.cuda.synchronize()
for i in range(warmup):
    pred = model(data)
# synchronize makes sure PyTorch's computation has finished
torch.cuda.synchronize()

# Start timing
start = time.time()
for i in range(rerun):
    pred = model(data)
torch.cuda.synchronize()
end = time.time()
print("PyTorch FPS:", (rerun*batch_size)/(end-start))

pytorch_data = pred.detach().cpu().numpy()
err = np.mean(np.abs(pytorch_data - jittor_data))
print("mean error:", err)
```
The output of the merged code:
```
Jittor FPS: 908.9853866375396
PyTorch FPS: 807.4806873965665
mean error: 1e-5
```
The mean error is 1e-5, which is within the acceptable range. The key points for correct benchmarking and comparison are:
1. Warm up thoroughly, to exclude the framework's preparation time.
2. Run many iterations, so the measured time is stable.
3. Add synchronization statements, so the measured time is accurate.
4. Make sure GPU memory is sufficient: when it is not, jittor falls back to unified memory, which costs performance; keep a close eye on the output of `nvidia-smi`.
5. Keep the compared models identical and check that the outputs agree.
If you have any questions about the benchmark results or need performance tuning, feel free to contact the Jittor development team.
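The checklist above can be packaged into a small helper; a minimal sketch (the function name `benchmark` is ours, not a Jittor API):
```python
import time
import jittor as jt
from jittor.models import resnet50

def benchmark(model, data, warmup=10, rerun=100):
    # warm up so compilation time is excluded from the measurement
    jt.sync_all(True)
    for _ in range(warmup):
        model(data).sync()
    jt.sync_all(True)
    # timed, synchronized runs
    start = time.time()
    for _ in range(rerun):
        model(data).sync()
    jt.sync_all(True)
    return rerun * data.shape[0] / (time.time() - start)

model = resnet50()
model.eval()
print("FPS:", benchmark(model, jt.random((8, 3, 224, 224))))
```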


@@ -0,0 +1,75 @@
Optimizing GPU and CPU memory in Jittor
=====================
You can reduce memory consumption in two main ways:
1. Optimize the variables that consume the most memory.
2. Use Jittor's automatic swapping, which moves variables between GPU memory, CPU memory, and disk, lowering the requirements for running and deploying models.
## Optimizing the variables that consume the most memory
You can use jittor's memory profiler to find the code that consumes the most memory, and optimize that code specifically. Usage:
```
net = jt.models.resnet18()
with jt.flag_scope(trace_py_var=3, profile_memory_enable=1):
    imgs = jt.randn((1,3,224,224))
    net(imgs).sync()
jt.get_max_memory_treemap()
```
The output looks like:
```
|
├─./python/jittor/test/test_memory_profiler.py:100(test_sample)
| [19.03 MB; 29.67%]
| ./python/jittor/test/test_memory_profiler.py:100
| |
| └─./python/jittor/__init__.py:730(__call__)
| [19.03 MB; 29.67%]
| ./python/jittor/__init__.py:730
| |
| └─./python/jittor/models/resnet.py:152(execute)
| [19.03 MB; 29.67%]
| ./python/jittor/models/resnet.py:152
| |
| ├─./python/jittor/models/resnet.py:142(_forward_impl)
| | [6.13 MB; 9.55%]
| | ./python/jittor/models/resnet.py:142
| | |
```
## Using automatic swapping
This technique keeps Jittor running at a reasonable speed even when GPU or CPU memory is insufficient.
To save memory, please install Jittor version 1.3.7.5 or later and add the following environment variables:
```bash
export JT_SAVE_MEM=1
# limit cpu memory usage to at most 16 GB
export cpu_mem_limit=16000000000
# limit device memory (gpu, tpu, etc.) to at most 8 GB
export device_mem_limit=8000000000
# Windows users: use powershell instead
# $env:JT_SAVE_MEM="1"
# $env:cpu_mem_limit="16000000000"
# $env:device_mem_limit="8000000000"
```
You are free to choose the cpu and device memory limits; if you do not want to limit a resource, set its value to `-1`.
```bash
# do not limit cpu memory
export cpu_mem_limit=-1
# do not limit device memory (gpu, tpu, etc.)
export device_mem_limit=-1
# Windows users: use powershell instead
# $env:JT_SAVE_MEM="1"
# $env:cpu_mem_limit="-1"
# $env:device_mem_limit="-1"
```
To clean up the disk swap files, run:
```bash
python3 -m jittor_utils.clean_cache swap
```


@@ -0,0 +1,90 @@
Jittor debugging tips
=====================
This document collects debugging methods and tips for several failure modes.
## NaN and Inf values
During training, numerical instability may produce NaN or Inf values. To help you locate the code responsible, set the following environment variables:
```bash
export JT_CHECK_NAN=1
export trace_py_var=3
```
The environment variable `JT_CHECK_NAN=1` makes jittor raise an error and stop as soon as an operator outputs an abnormal floating-point value; `trace_py_var=3` reports the Python source line of the corresponding operator (3 is the highest verbosity level).
Note that enabling these two features slows jittor down significantly and triggers recompilation; do not enable this mode in training or production environments, and do not leave it on for long periods.
## Inaccurate error locations
By default, Jittor uses lazy execution for speed: operators are created and executed asynchronously, which can make error messages point to the wrong place. You can disable lazy execution and switch to eager execution with the following environment variable:
```bash
export lazy_execution=0
```
or turn it off from Python via a flag:
```python
jt.flags.lazy_execution=0
```
## Out of memory
When Jittor cannot run because of memory-related problems, it reports its memory usage to you. Out-of-memory failures come in two forms:
1. The model is too large, and the program crashes within a single iteration.
2. Memory usage keeps growing over many iterations until it is finally exhausted.
**For the first case**, you may need to reduce the model or data size, or use [multi-GPU training](jittor.mpi). In addition, you can force Jittor to reclaim memory inside each iteration:
```python
for ...:
    ...
    jt.sync_all()
    jt.gc()
```
If you are using CUDA and convolutions, the temporary workspace consumed by convolution may also be too large. In that case you can disable cuDNN's workspace allocation by inserting the following at the very beginning of your program:
```python
jt.cudnn.set_max_workspace_ratio(0.0)
```
**For the second case**, there may be a memory leak. Check whether some global variable is never released, or whether a global variable's gradient is not stopped, causing the computation graph to grow without bound. To check, insert the following debugging code inside each iteration:
```python
for ...:
    ...
    jt.sync_all()
    jt.display_memory_info()
```
Jittor will print the memory consumption, the size of the computation graph (`lived_var, lived_op`), and the number of variables held by the user (`hold_var`). If the graph keeps growing, check your code, or contact us by opening a GitHub issue with the error log and a script that reproduces the problem.
## Segmentation faults
If Jittor hits a segmentation fault, we recommend contacting us via a GitHub issue with the error log and a reproduction script. You can also diagnose the program and the framework with the following environment variables:
```bash
export debug=1
export gdb_attach=1
```
The environment variable `debug=1` enables Jittor's debug mode, which is much slower but preserves debugging information; `gdb_attach=1` automatically attaches gdb to the main Jittor process so that you can single-step through it. For gdb usage, see the [GDB Cheat Sheet](https://darkdust.net/files/GDB%20Cheat%20Sheet.pdf).
## Managing the Jittor cache
Jittor creates a cache under the `~/.cache/jittor` directory. It may contain the core (kernel), the CUDA compiler, CUDA libraries, datasets, pretrained weights, and so on. In some situations the cache can become stale, e.g. after a system or driver update, and you may need to clear it manually:
```
python3 -m jittor_utils.clean_cache all
```
The command above clears all of Jittor's caches. If you do not want to clear everything, see the command-line help:
```
python3 -m jittor_utils.clean_cache help
```

1
doc/source/README.cn.md Normal file
View File

@ -0,0 +1 @@
../../README.cn.md

101
doc/source/conf.py Normal file
View File

@ -0,0 +1,101 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
jittor_path = os.path.abspath('../../python')
print(f"[jittor_path] {jittor_path}")
sys.path.insert(0, jittor_path)
import jittor
# -- Project information -----------------------------------------------------
project = 'Jittor'
copyright = '2020, Jittor'
author = 'Jittor'
# The full version, including alpha/beta/rc tags
release = jittor.__version__
# fix AttributeError for "typing.get_type_hints(jt.Var)"
jittor.Var.__module__ = "jittor_core"
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = 'zh_CN'
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
# 'recommonmark',
'myst_parser',
'sphinx.ext.autodoc',
# Auto-generate section labels.
'sphinx.ext.autosectionlabel',
'sphinx.ext.viewcode',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
import sphinx_rtd_theme
html_theme = "sphinx_rtd_theme"
html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
source_suffix = {
'.rst': 'restructuredtext',
'.txt': 'markdown',
'.md': 'markdown',
}
import recommonmark
from recommonmark.transform import AutoStructify
# At the bottom of conf.py
def setup(app):
app.add_config_value('recommonmark_config', {
# 'url_resolver': lambda url: github_doc_root + url,
'auto_toc_tree_section': 'Contents',
}, True)
app.add_transform(AutoStructify)
# Prefix document path to section labels, otherwise autogenerated labels would look like 'heading'
# rather than 'path/to/file:heading'
autosectionlabel_prefix_document = True

60
doc/source/index.rst Normal file
View File

@ -0,0 +1,60 @@
.. Jittor documentation master file, created by
sphinx-quickstart on Mon May 18 23:05:53 2020.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to the Jittor documentation
====================================
.. toctree::
   :maxdepth: 2
   :caption: Contents:

   README.cn.md

.. toctree::
   :maxdepth: 2
   :caption: Module API:

   jittor
   jittor.nn
   jittor.models
   jittor.optim
   jittor.init
   jittor.contrib
   jittor.dataset
   jittor.transform
   jittor.mpi
   jittor.linalg
   jittor.console
   jittor.distributions
   jittor.attention
   jittor.loss3d

.. toctree::
   :maxdepth: 2
   :caption: Jittor model zoo:

   JDet
   segmentation-jittor
   InstanceSegmentation-jittor
   gan-jittor
   PointCloudLib
   jrender

.. toctree::
   :maxdepth: 1
   :caption: Miscellaneous:

   Jittor调试技巧
   Jittor性能测试与对比方法
   Jittor显存以及内存优化方法
   Tutorials <https://cg.cs.tsinghua.edu.cn/jittor/tutorial/>
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

View File

@ -0,0 +1,10 @@
jittor.attention
=====================
This is the API documentation for Jittor's attention module. You can access it via `from jittor import attention`.
```eval_rst
.. automodule:: jittor.attention
:members:
:undoc-members:
```
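A minimal self-attention sketch, matching the `MultiheadAttention` implementation shipped in `python/jittor/attention.py` (input layout `(seq_len, batch, embed_dim)`):
```python
import jittor as jt
from jittor.attention import MultiheadAttention

attn = MultiheadAttention(embed_dim=64, num_heads=4, self_attention=True)
x = jt.random((10, 2, 64))   # (seq_len, batch, embed_dim)
out, weights = attn(x)       # out: (10, 2, 64); weights averaged over heads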

View File

@ -0,0 +1,237 @@
jittor.console
=====================
This is the API documentation for the Jittor console. The console is aimed at C/C++ users who want to use Jittor from C++: it optimizes data transfer between C++ arrays and the Jittor kernel and avoids extra Python overhead, providing a high-performance C++ interface to Jittor.
It requires Jittor version 1.2.2.17 or later and a compiler with C++17 support.
## Quick tutorial
We provide a complete example that you can compile and run with a few commands:
```bash
# generate the C++ example source file
python3.7 -m jittor_utils.config --cxx-example > example.cc
# compile the example with g++ (requires -std=c++17 support)
g++ example.cc $(python3.7 -m jittor_utils.config --include-flags --libs-flags --cxx-flags) -o example
# run the example
./example
```
The output should look like:
```bash
hello jt console
1
hello
1 2 3 4
jt.Var([[-1 5 4]
[ 3 2 1]], dtype=int32)
2 3
1 25 16
9 4 1
pred.shape 2 1000
```
用户可以打开 example.cc, 修改成所需的应用,接下来我们会为大家讲解 example.cc 中的细节。
打开example.cc, 我们可以看到如下代码:
```cpp
#include <pyjt/pyjt_console.h>
#include <iostream>
using namespace std;
int main() {
    ...
}
```
Here we include the header required for the console, `pyjt/pyjt_console.h`.
Next, a Jittor console is instantiated, and Python's print is used to output "hello jt console":
```cpp
jittor::Console console;
// run python code in console
console.run("print('hello jt console', flush=True)");
```
Output:
```
hello jt console
```
Note that we pass the flush keyword to Python's print so that the Python output stream stays in step with the C++ output stream and the two do not get interleaved incorrectly.
Next we call `console.set<T>(name, data)` and `console.get<T>(name)` to store an int variable a in the console and read it back out.
```cpp
// set a python value: a = 1
console.set<int>("a", 1);
// get a python value
cout << console.get<int>("a") << endl;
```
Output:
```
1
```
In the same way, we also set a `string` and a `vector<int>`, as shown below:
```cpp
// set a python string
console.set<string>("b", "hello");
cout << console.get<string>("b") << endl;
// set a python array
vector<int> x{1,2,3,4};
console.set("x", x);
auto x2 = console.get<std::vector<int>>("x");
for (auto a : x2) cout << a << " "; cout << endl;
```
Output:
```
hello
1 2 3 4
```
We can also store Jittor variables in the console. Here we use several new interfaces:
1. `jittor::array<T, NDIM>(shape, data)`: creates a Jittor array with element type `T`, `NDIM` dimensions, and shape `shape` (the length of shape must equal `NDIM`); the last argument is the initial data, which can be a vector or a pointer.
2. `console.set_array(name, arr)`: stores the Jittor array in the console under the name `name`.
3. `console.get_array<T, NDIM>(name)`: retrieves a Jittor array with element type `T` and `NDIM` dimensions from the console; the type and dimension count must match the variable in the console, otherwise an exception is thrown.
4. `arr(i,j)`: reads or writes an element of a Jittor array.
5. `arr.shape[i]`: gets the size of dimension i of a Jittor array.
In the code below, we first create a 2x3 matrix, modify one of its values, store it into the Python console, and then read it back and print it:
```cpp
// set and get a jittor array
jittor::array<int, 2> arr2({2,3}, {6,5,4,3,2,1});
arr2(0,0) = -1;
console.set_array("arr2", arr2);
console.run("print(arr2, flush=True); arr3 = arr2**2;");
auto arr3 = console.get_array<int, 2>("arr3");
cout << arr3.shape[0] << ' ' << arr3.shape[1] << endl;
for (int i=0; i<arr3.shape[0]; i++) {
    for (int j=0; j<arr3.shape[1]; j++)
        cout << arr3(i,j) << ' ';
    cout << endl;
}
```
The output is:
```
jt.Var([[-1 5 4]
[ 3 2 1]], dtype=int32)
2 3
1 25 16
9 4 1
```
Finally, we demonstrate importing `resnet` from `jittor.models` and fetching the result from the console.
```cpp
jittor::array<float, 4> input({2, 3, 224, 224});
memset(input.data.get(), 0, input.nbyte());
console.set_array("input", input);
console.run(R"(
import jittor as jt
from jittor.models import resnet
model = resnet.resnet18()
pred = model(input)
)");
auto pred = console.get_array<float, 2>("pred");
cout << "pred.shape " << pred.shape[0] << ' ' << pred.shape[1] << endl;
```
We print the shape of the retrieved variable; the result is:
```
pred.shape 2 1000
```
## jittor array interface overview
`jittor::array` is the array type used to exchange data between C++ and the Jittor console. It is defined as follows:
```cpp
// T: element type, N: number of dimensions
template<class T, int N>
struct array {
    // shape of the N dimensions
    int64 shape[N];
    // data pointer
    unique_ptr<T[]> data;
    // whether the element type is floating point
    bool is_float();
    // whether the element type is unsigned
    bool is_unsigned();
    // total number of elements, i.e. the product of the entries of shape
    int64 size();
    // total size in bytes
    int64 nbyte();
    // string representation of the data type
    string dtype();
    // number of dimensions, same as N
    int ndim();
    // constructor: shape gives the shape, the data is left uninitialized
    array(const vector<int64>& shape);
    // constructor: shape gives the shape, the data is copied from the pointer data
    array(const vector<int64>& shape, const T* data);
    // constructor: shape gives the shape, the data is copied from the vector data
    array(const vector<int64>& shape, const vector<T>& data);
    // element access
    T& operator()(...);
};
```
## Console interface overview
The console interface has three parts: setting variables, retrieving variables, and running scripts.
```cpp
struct Console {
    // run a piece of Python code
    void run(const string& src);
    // set a variable named s to the value data
    template<class T>
    void set(const string& s, const T& data);
    // get the variable named s
    template<class T>
    T get(const string& s);
    // set a jittor array variable named s
    template<class T, int N>
    void set_array(const string& s, const array<T,N>& data);
    // get a jittor array with element type T and N dimensions; the type and
    // dimension count must match the variable in the console, otherwise an
    // exception is thrown
    template<class T, int N>
    array<T,N> get_array(const string& s);
};
```
`get` and `set` support the common C++ types:
1. int, uint, int64, uint64, float, double
2. string
3. vector
4. map, unordered_map

View File

@ -0,0 +1,10 @@
jittor.contrib
=====================
This is the API documentation for Jittor's contributed-code module. The code in this module may not be fully mature yet; it will keep being improved in future development iterations. You can access it via `from jittor import contrib`.
```eval_rst
.. automodule:: jittor.contrib
:members:
:undoc-members:
```

View File

@ -0,0 +1,11 @@
jittor.dataset
=====================
This is the API documentation for Jittor's dataset module. You can access it via `from jittor import dataset`.
```eval_rst
.. automodule:: jittor.dataset
:imported-members:
:members:
:undoc-members:
```
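A minimal custom-dataset sketch (`set_attrs` configures the total length, batch size, and shuffling; names follow the standard `jittor.dataset.Dataset` interface):
```python
import numpy as np
from jittor.dataset import Dataset

class ToyData(Dataset):
    def __init__(self):
        super().__init__()
        self.set_attrs(total_len=100, batch_size=8, shuffle=True)

    def __getitem__(self, k):
        # one sample: a feature and its label
        return np.float32([k]), np.float32([2 * k])

for x, y in ToyData():
    print(x.shape, y.shape)   # batched automatically: (8, 1) (8, 1)
    break
```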

View File

@ -0,0 +1,10 @@
jittor.distributions
=====================
This is the API documentation for Jittor's probability-distribution module. You can access it via `from jittor import distributions`.
```eval_rst
.. automodule:: jittor.distributions
:members:
:undoc-members:
```
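A small sketch, assuming the module exposes a `Normal` distribution with `sample` and `log_prob` in the usual style (the exact constructor and signatures are in the generated listing above):
```python
from jittor.distributions import Normal

dist = Normal(0.0, 1.0)   # mean 0, std 1 (constructor layout is an assumption)
x = dist.sample()         # draw a sample
print(dist.log_prob(x))   # log-density at the sample
```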

10
doc/source/jittor.init.md Normal file
View File

@ -0,0 +1,10 @@
jittor.init
=====================
This is the API documentation for Jittor's parameter-initialization module. You can access it via `from jittor import init`.
```eval_rst
.. automodule:: jittor.init
:members:
:undoc-members:
```
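A short sketch using `init.constant`, mirroring its usage in the synchronized batch-norm example in the MPI documentation:
```python
from jittor import init

weight = init.constant((4,), "float32", 1.0)   # length-4 vector of ones
bias = init.constant((4,), "float32", 0.0)     # length-4 vector of zeros
print(weight.shape, bias.shape)
```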

View File

@ -0,0 +1,57 @@
jittor.linalg
=====================
This is the API documentation for Jittor's linear-algebra functions. You can access the module via `from jittor import linalg`.
## Overview of the basic functions
#### Basic linear-algebra APIs
- linalg.inv(a)
  computes the inverse of a.
- linalg.pinv(a)
  computes the generalized (Moore-Penrose) pseudo-inverse of a; a is not required to be invertible.
- linalg.slogdet(a)
  computes the slogdet of a, returning the log-determinant together with its sign.
- linalg.det(a)
  computes the determinant of a.
- linalg.solve(a,b)
  solves the linear system Ax=b.
#### Decomposition APIs
- linalg.cholesky(a)
  computes the Cholesky decomposition of a.
- linalg.qr(a)
  computes the QR decomposition of a.
- linalg.svd
  computes the singular value decomposition of a.
#### Eigenvalue APIs
- linalg.eig(a)
  computes the eigenvalues and eigenvectors of a.
- linalg.eigh(a)
  computes the eigenvalues and eigenvectors of a Hermitian or symmetric matrix.
The currently supported linalg APIs are:
```eval_rst
.. automodule:: jittor.linalg
:members:
:undoc-members:
```
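A quick sketch exercising a few of the routines above (`a` is symmetric positive definite, so every call is well-defined):
```python
import jittor as jt

a = jt.array([[4.0, 2.0], [2.0, 3.0]])
b = jt.array([1.0, 2.0])
print(jt.linalg.inv(a))        # matrix inverse
print(jt.linalg.det(a))        # determinant: 8.0
print(jt.linalg.solve(a, b))   # x such that a @ x = b
u, s, v = jt.linalg.svd(a)     # singular value decomposition
```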

View File

@ -0,0 +1,10 @@
jittor.loss3d
=====================
This is the API documentation for Jittor's 3D loss-function module. You can access it via `from jittor import loss3d`.
```eval_rst
.. automodule:: jittor.loss3d
:members: chamfer_loss, ChamferLoss, earth_mover_distance, EarthMoverDistance
:undoc-members:
```
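A hedged sketch of the Chamfer loss on random point clouds (a `(batch, n_points, 3)` layout is assumed; see the generated signatures above for the exact options):
```python
import jittor as jt
from jittor.loss3d import chamfer_loss

p1 = jt.random((8, 1024, 3))   # batch of point clouds, (B, N, 3) layout assumed
p2 = jt.random((8, 1024, 3))
print(chamfer_loss(p1, p2))
```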

54
doc/source/jittor.md Normal file
View File

@ -0,0 +1,54 @@
jittor
=====================
## jittor
This is the API documentation for the main Jittor module. You can access it via `import jittor`.
```eval_rst
.. automodule:: jittor
:members:
:undoc-members:
```
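A few lines showing the basics of the main module (array creation, elementwise operators, and conversion back to numpy):
```python
import jittor as jt

x = jt.array([1.0, 2.0, 3.0])
y = (x * 2 + 1).sqrt()   # elementwise ops build a lazy graph
print(y.numpy())         # evaluation happens when the data is fetched
```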
## jittor.core
Below are Jittor's core APIs, which can be accessed directly as `jittor.core.XXX` or `jittor.XXX`.
```eval_rst
.. automodule:: jittor_core
:imported-members:
:members:
:undoc-members:
```
## jittor.ops
This is the API documentation for Jittor's basic operator module. These APIs can be accessed directly as `jittor.ops.XXX` or `jittor.XXX`.
```eval_rst
.. automodule:: jittor_core.ops
:members:
:undoc-members:
```
## jittor.Var
This is the API documentation for Jittor's basic variable class. These APIs can be accessed directly as `my_jittor_var.XXX`.
```eval_rst
.. automodule:: jittor_core.Var
:members:
:undoc-members:
```
## jittor.Misc
This is the API documentation for Jittor's miscellaneous operators. These APIs can be accessed directly as `jittor.misc.XXX` or `jittor.XXX`.
```eval_rst
.. automodule:: jittor.misc
:members:
:undoc-members:
```

View File

@ -0,0 +1,14 @@
jittor.models
=====================
This is the API documentation for Jittor's backbone-network (models) module. You can access it via `from jittor import models`.
```eval_rst
.. automodule:: jittor.models
:members:
:imported-members:
:undoc-members:
:exclude-members: ResNet,ShuffleNetV2,SqueezeNet,VGG
```
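Loading a backbone and running one forward pass, mirroring the resnet usage in the memory-profiling document above:
```python
import jittor as jt
from jittor import models

net = models.resnet18()
x = jt.randn((1, 3, 224, 224))
y = net(x)
print(y.shape)   # [1, 1000]
```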

215
doc/source/jittor.mpi.md Normal file
View File

@ -0,0 +1,215 @@
jittor.mpi
=====================
Jittor's distributed training is based on MPI (Message Passing Interface). This document explains how to do multi-GPU and distributed training with Jittor MPI.
## Installing Jittor MPI
Jittor depends on `OpenMPI`, which can be installed with the following command:
```bash
sudo apt install openmpi-bin openmpi-common libopenmpi-dev
```
You can also build and install it yourself by following the [OpenMPI documentation](https://www.open-mpi.org/faq/?category=building#easy-build).
Jittor automatically checks whether `mpicc` is present in the environment. If `mpicc` is detected successfully, Jittor prints the following message:
```
[i 0502 14:09:55.758481 24 __init__.py:203] Found mpicc(1.10.2) at /usr/bin/mpicc
```
If Jittor does not find MPI in the environment, you can tell it where mpicc is by setting an environment variable: `export mpicc_path=/you/mpicc/path`
Once `OpenMPI` is installed, no code changes are required: simply change the launch command, and Jittor will automatically run in a data-parallel fashion.
```bash
# single-GPU training
python3.7 -m jittor.test.test_resnet
# distributed multi-GPU training
mpirun -np 4 python3.7 -m jittor.test.test_resnet
# multi-GPU training on specific GPUs
CUDA_VISIBLE_DEVICES="2,3" mpirun -np 2 python3.7 -m jittor.test.test_resnet
```
This convenience is backed by Jittor's distributed operators, and the supported MPI operators are further accelerated by an NCCL backend. All of Jittor's distributed algorithms are developed in the Python frontend, which makes them far more flexible and much easier to develop.
## Adapting single-GPU code to multiple GPUs
When launched with `mpirun`, the following modules detect the MPI environment and switch to their multi-GPU versions automatically:
* jittor.optimizer: synchronizes gradients automatically
* jittor.nn.BatchNorm*: synchronizes the batch-norm statistics
* jittor.dataset: parallelizes data loading automatically
During MPI distributed training, Jittor's internal Dataset class distributes the data across nodes automatically. Note that the batch size configured on the Dataset is **the sum of the batch sizes of all nodes**, i.e. the total batch size, not the batch size received by a single node.
In most cases, single-GPU training code can be run distributed on multiple GPUs with `mpirun` as-is. The following situations, however, require code changes:
1. Writing to disk (saving models, saving curves)
2. Aggregating global statistics (e.g. the global accuracy on the validation set)
### Writing to disk
For the first case, suppose your original code looks like this:
```python
for i, (images, labels) in enumerate(dataset):
    output = model(images)
    loss = nn.cross_entropy_loss(output, labels)
    acc1 = accuracy(output, labels)
    SGD.step(loss)
    loss_data = loss.data
    writer.add_scalar("Train/loss")
```
The modified code:
```python
for i, (images, labels) in enumerate(dataset):
    output = model(images)
    loss = nn.cross_entropy_loss(output, labels)
    acc1 = accuracy(output, labels)
    SGD.step(loss)
    loss_data = loss.data
    if jt.rank == 0:
        writer.add_scalar("Train/loss")
```
Here we use jt.rank so that only the first process is allowed to write the loss. This code also works on a single GPU, where jt.rank is 0. Note that the code inside the `if jt.rank == 0` block must not call any Jittor API: doing so is very likely to make the API calls inconsistent across processes and cause a **deadlock**!
### Aggregating global statistics
There are two ways to aggregate global statistics. The first is to use the provided MPI ops, as in the following validation code:
```python
def val(epoch):
    global min_error
    model.eval()
    correct_nums = 0
    for i, (images, labels) in enumerate(valdataset):
        output = model(images)
        correct_nums += top1error(output, labels)
        correct_nums.sync()
    top1_error = (valdataset.total_len - correct_nums.data[0]) / valdataset.total_len
    if top1_error < min_error:
        print("[*] Best model is updated ...")
        model.save('model_best.pkl')
```
The modified version:
```python
def val(epoch):
    global min_error
    model.eval()
    correct_nums = 0
    for i, (images, labels) in enumerate(valdataset):
        output = model(images)
        correct_nums += top1error(output, labels)
        correct_nums.sync()
    if jt.in_mpi:
        correct_nums = correct_nums.mpi_all_reduce()
    top1_error = (valdataset.total_len - correct_nums.data[0]) / valdataset.total_len
    if jt.rank == 0 and top1_error < min_error:
        print("[*] Best model is updated ...")
        model.save('model_best.pkl')
```
Notice that we first use `mpi_all_reduce` to aggregate the number of correct predictions across the GPUs (mpi_all_reduce sums the results of all MPI processes), and the model is only updated when `jt.rank == 0`.
The second way is to use `@jt.single_process_scope()`: the decorated function is executed by a single process only, so no multi-GPU handling is needed.
```python
@jt.single_process_scope()
def val(epoch):
    ......
```
## MPI interface
Below is the Jittor MPI API reference.
The currently exposed MPI interfaces are:
* `jt.in_mpi`: when Jittor is not running inside MPI, `jt.mpi == False`; you can use this to test whether you are in an MPI environment.
* `jt.world_size`: the total number of processes; 1 when MPI is not in use.
* `jt.rank`: the index of the current process, ranging from `0` to `jt.world_size-1`; 0 when MPI is not in use.
* `jt.mpi`: Jittor's MPI module.
* `jt.Module.mpi_param_broadcast(root=0)`: broadcast the module's parameters from the root node to all other nodes.
* `jt.mpi.mpi_reduce(x, op='add', root=0)`: reduce the variable x from all nodes to the root node with operator op. If op is 'add' or 'sum', the variables are summed; if op is 'mean', the mean is taken.
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/tutorial/2020-5-2-16-44-distributed/mpi_reduce.png">
* `jt.mpi.mpi_broadcast(x, root=0)`: broadcast the variable x from the root node to all nodes.
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/tutorial/2020-5-2-16-44-distributed/mpi_broadcast.png">
* `jt.mpi.mpi_all_reduce(x, op='add')`: reduce the variable x across all nodes and broadcast the result back to every node. If op is 'add' or 'sum', the variables are summed; if op is 'mean', the mean is taken.
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/tutorial/2020-5-2-16-44-distributed/mpi_all_reduce.png">
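A minimal runnable sketch of `mpi_all_reduce` (launch with e.g. `mpirun -np 2 python3.7 demo.py`, where `demo.py` is a hypothetical file containing this code):
```python
import jittor as jt

# each process contributes its rank; after all_reduce every process
# holds the sum of all ranks
x = jt.array([float(jt.rank)])
if jt.in_mpi:
    x = x.mpi_all_reduce("add")
print(jt.rank, x.data)
```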
```eval_rst
.. automodule:: jittor_mpi_core
:members:
:undoc-members:
.. automodule:: jittor_mpi_core.ops
:members:
:undoc-members:
```
## Example: distributed synchronized batch normalization with MPI
The code below implements a distributed synchronized batch-normalization layer with Jittor. Starting from the ordinary batch-norm layer, adding just three lines makes batch norm distributed:
```python
# synchronize the mean and variance across all nodes via all_reduce
if self.sync and jt.mpi:
    xmean = xmean.mpi_all_reduce("mean")
    x2mean = x2mean.mpi_all_reduce("mean")
```
> Note: Jittor already ships a synchronized batch-normalization layer internally; you do not need to implement it yourself.
The complete code of the distributed synchronized batch-normalization layer:
```python
class BatchNorm(Module):
    def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=None, is_train=True, sync=True):
        assert affine == None
        self.sync = sync
        self.num_features = num_features
        self.is_train = is_train
        self.eps = eps
        self.momentum = momentum
        self.weight = init.constant((num_features,), "float32", 1.0)
        self.bias = init.constant((num_features,), "float32", 0.0)
        self.running_mean = init.constant((num_features,), "float32", 0.0).stop_grad()
        self.running_var = init.constant((num_features,), "float32", 1.0).stop_grad()

    def execute(self, x):
        if self.is_train:
            xmean = jt.mean(x, dims=[0,2,3], keepdims=1)
            x2mean = jt.mean(x*x, dims=[0,2,3], keepdims=1)
            # synchronize the mean and variance across all nodes via all_reduce
            if self.sync and jt.mpi:
                xmean = xmean.mpi_all_reduce("mean")
                x2mean = x2mean.mpi_all_reduce("mean")
            xvar = x2mean-xmean*xmean
            norm_x = (x-xmean)/jt.sqrt(xvar+self.eps)
            self.running_mean += (xmean.sum([0,2,3])-self.running_mean)*self.momentum
            self.running_var += (xvar.sum([0,2,3])-self.running_var)*self.momentum
        else:
            running_mean = self.running_mean.broadcast(x, [0,2,3])
            running_var = self.running_var.broadcast(x, [0,2,3])
            norm_x = (x-running_mean)/jt.sqrt(running_var+self.eps)
        w = self.weight.broadcast(x, [0,2,3])
        b = self.bias.broadcast(x, [0,2,3])
        return norm_x * w + b
```

24
doc/source/jittor.nn.md Normal file
View File

@ -0,0 +1,24 @@
jittor.nn
=====================
This is the API documentation for Jittor's neural-network module. You can access it via `from jittor import nn`.
```eval_rst
.. automodule:: jittor.nn
:members:
:undoc-members:
.. automodule:: jittor.nn
:imported-members:
:members: Pool, pool, AdaptiveAvgPool2d, Pool3d, AdaptiveMaxPool2d, AdaptiveAvgPool3d, AdaptiveMaxPool2d, pool3d, AvgPool2d, AvgPool3d, avg_pool2d, MaxPool2d, MaxPool3d, max_pool2d, max_pool3d, MaxUnpool2d, MaxUnpool3d
:undoc-members:
.. autoclass:: jittor.nn.ReLU
:members:
.. autoclass:: jittor.nn.ReLU6
:members:
.. autoclass:: jittor.nn.LeakyReLU
:members:
.. autoclass:: jittor.nn.Softmax
:members:
```
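A tiny sketch combining a couple of the listed building blocks:
```python
import jittor as jt
from jittor import nn

x = jt.random((1, 3, 32, 32))
pool = nn.Pool(2, stride=2, op="maximum")   # 2x2 max pooling
relu = nn.ReLU()
y = relu(pool(x))
print(y.shape)   # [1, 3, 16, 16]
```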

View File

@ -0,0 +1,18 @@
jittor.optim
=====================
This is the API documentation for Jittor's optimizer module. You can access it via `from jittor import optim`.
```eval_rst
.. automodule:: jittor.optim
:members:
:undoc-members:
```
Below is the API documentation for Jittor's learning-rate scheduling module, which must be used together with an optimizer. You can access it via `from jittor import lr_scheduler`.
```eval_rst
.. automodule:: jittor.lr_scheduler
:members:
:undoc-members:
```
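A minimal optimizer-plus-scheduler loop (assuming `StepLR` is among the available schedulers listed above; note that Jittor optimizers take the loss directly in `step`):
```python
import jittor as jt
from jittor import nn, optim, lr_scheduler

model = nn.Linear(10, 1)
opt = optim.SGD(model.parameters(), lr=0.1)
sched = lr_scheduler.StepLR(opt, step_size=10, gamma=0.5)   # halve the lr every 10 epochs

for epoch in range(30):
    x = jt.random((4, 10))
    loss = ((model(x) - 1) ** 2).mean()
    opt.step(loss)   # backward pass and parameter update in one call
    sched.step()
```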

View File

@ -0,0 +1,10 @@
jittor.transform
=====================
This is the API documentation for Jittor's data-transform module. You can access it via `from jittor import transform`.
```eval_rst
.. automodule:: jittor.transform
:members:
:undoc-members:
```
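A small sketch composing common transforms (assuming torchvision-style members such as `Compose`, `Resize`, and `ToTensor`; the authoritative list is generated above):
```python
from PIL import Image
from jittor import transform

tfm = transform.Compose([
    transform.Resize(224),
    transform.ToTensor(),   # PIL image -> float array scaled to [0, 1]
])
img = Image.new("RGB", (256, 256))
x = tfm(img)
print(x.shape)   # (3, 224, 224)
```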

12
doc/source/todo.md Normal file
View File

@ -0,0 +1,12 @@
TODO
=====================
## Documentation
* Standardize the documentation syntax
* Add tutorial links to the documentation
* MPI interface documentation
* Automatic documentation updates
* Link from the home page to the documentation
* Documentation for the model zoo (GAN, segmentation, detection, ...)
* Complete the documentation, adding usage examples to the important classes

2179
python/jittor/__init__.py Normal file

File diff suppressed because it is too large Load Diff

7995
python/jittor/__init__.pyi Normal file

File diff suppressed because it is too large Load Diff

176
python/jittor/attention.py Normal file
View File

@ -0,0 +1,176 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Guowei Yang <471184555@qq.com>
# Dun Liang <randonlang@gmail.com>.
#
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import jittor as jt
from jittor import init, Module, nn
import numpy as np
import math

class MultiheadAttention(Module):
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
        self.num_heads = num_heads
        assert dropout==0, "TODO: dropout>0"
        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5
        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention
        assert not self.self_attention or self.qkv_same_dim, ("Self-attention requires query, key and " "value to be of the same size")
        #TODO: quant_noise
        self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        assert not add_bias_kv, "TODO: add_bias_kv=True"
        self.bias_k = self.bias_v = None
        self.add_zero_attn = add_zero_attn
        self.reset_parameters()
        self.onnx_trace = False
        self.tpu = False

    def reset_parameters(self):
        if self.qkv_same_dim:
            # Empirically observed the convergence to be much better with
            # the scaled initialization
            init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
            init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
            init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
        else:
            init.xavier_uniform_(self.k_proj.weight)
            init.xavier_uniform_(self.v_proj.weight)
            init.xavier_uniform_(self.q_proj.weight)
        # init.xavier_uniform_(self.out_proj.weight)
        if self.out_proj.bias is not None:
            init.constant_(self.out_proj.bias, 0.)
        if self.bias_k is not None:
            init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            init.xavier_normal_(self.bias_v)

    def execute(
        self,
        query,
        key = None,
        value = None,
        key_padding_mask = None,
        incremental_state = None,
        need_weights = True,
        static_kv = False,
        attn_mask = None,
        before_softmax = False,
        need_head_weights = False,
    ):
        if need_head_weights:
            need_weights = True
        tgt_len, bsz, embed_dim = query.shape
        assert embed_dim == self.embed_dim
        assert list(query.shape) == [tgt_len, bsz, embed_dim]
        assert incremental_state is None, "TODO: incremental_state is not None"
        saved_state = None
        if self.self_attention:
            q = self.q_proj(query)
            k = self.k_proj(query)
            v = self.v_proj(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.q_proj(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)
        else:
            assert key is not None and value is not None
            q = self.q_proj(query)
            k = self.k_proj(key)
            v = self.v_proj(value)
        q = q*self.scaling
        assert self.bias_k is None, "TODO: self.bias_k is not None:"
        q = q.view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2)
        if k is not None:
            k = k.view(-1, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2)
        if v is not None:
            v = v.view(-1, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2)
        assert saved_state is None, "TODO: saved_state is not None"
        assert k is not None
        src_len = k.shape[1]
        assert key_padding_mask is None, "TODO: key_padding_mask is not None"
        assert not self.add_zero_attn, "TODO: self.add_zero_attn=True"
        attn_weights = nn.bmm(q, k.transpose(0, 2, 1))
        assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
        assert attn_mask is None, "TODO: attn_mask is not None"
        assert key_padding_mask is None, "TODO: key_padding_mask is not None"
        if before_softmax:
            return attn_weights, v
        attn_weights_float = nn.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights_float.type_as(attn_weights)
        assert v is not None
        attn = nn.bmm(attn_weights, v)
        assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
        if self.onnx_trace and attn.shape[1] == 1:
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.view(tgt_len, bsz, embed_dim)
        else:
            attn = attn.transpose(1, 0, 2).view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)
        attn_weights = None
        if need_weights:
            attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0, 2, 3)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dims=[0])
        return attn, attn_weights

View File

@ -0,0 +1,3 @@
from .ccl_2d import ccl_2d
from .ccl_3d import ccl_3d
from .ccl_link import ccl_link

177
python/jittor/ccl/ccl_2d.py Normal file
View File

@ -0,0 +1,177 @@
import jittor as jt
def ccl_2d(data_2d):
'''
2D connected component labelling, original code from https://github.com/DanielPlayne/playne-equivalence-algorithm
Args:
[in]param data_2d: binary two-dimensional vector
type data_2d: jittor array
Returns:
[out]result: labeled two-dimensional vector
Example:
>>> import jittor as jt
>>> jt.flags.use_cuda = 1
>>> import cv2
>>> import numpy as np
>>> img = cv2.imread('testImg.png', 0)
>>> a = img.mean()
>>> img[img <= a] = 0
>>> img[img > a] = 1
>>> img = jt.Var(img)
>>> result = ccl_2d(img)
>>> print(jt.unique(result, return_counts=True, return_inverse=True)[0], jt.unique(result, return_counts=True, return_inverse=True)[2])
>>> cv2.imwrite('testImg_result.png', result.numpy().astype(np.uint8) * 50)
'''
data_2d = data_2d.astype(jt.uint32)
cY = data_2d.shape[0]
cX = data_2d.shape[1]
data_2d_copy = data_2d.clone()
changed = jt.ones([1], dtype=jt.uint32)
data_2d = data_2d.reshape(cX * cY)
result = jt.code(data_2d.shape,
data_2d.dtype, [data_2d, changed],
cuda_header='''
@alias(g_image, in0)
@alias(g_labels, out)
''',
cuda_src=r'''
__global__ void init_labels(@ARGS_DEF, const int cX, const int cY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
@g_labels(iy*cX + ix) = iy*cX + ix;
}
__device__ __inline__ unsigned int find_root(@ARGS_DEF, unsigned int label) {
// Resolve Label
unsigned int next = @g_labels(label);
// Follow chain
while(label != next) {
// Move to next
label = next;
next = @g_labels(label);
}
// Return label
return label;
}
__global__ void resolve_labels(@ARGS_DEF, const int cX, const int cY) {
@PRECALC
// Calculate index
const unsigned int id = ((blockIdx.y * blockDim.y) + threadIdx.y) * cX +
((blockIdx.x * blockDim.x) + threadIdx.x);
// Check Thread Range
if(id < cX*cY) {
// Resolve Label
@g_labels(id) = find_root(@ARGS, @g_labels(id));
}
}
__global__ void label_equivalence(@ARGS_DEF, const int cX, const int cY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
// Check Thread Range
if((ix < cX) && (iy < cY)) {
// Get image and label values
const unsigned char cyx = @g_image( iy*cX + ix);
// Get neighbour labels
const unsigned int lym1x = (iy > 0) ? @g_labels((iy-1)*cX + ix) : 0;
const unsigned int lyxm1 = (ix > 0) ? @g_labels(iy *cX + ix-1) : 0;
const unsigned int lyx = @g_labels(iy *cX + ix);
const unsigned int lyxp1 = (ix < cX-1) ? @g_labels(iy *cX + ix+1) : 0;
const unsigned int lyp1x = (iy < cY-1) ? @g_labels((iy+1)*cX + ix) : 0;
const unsigned int lym1xm1 = (iy > 0 && ix > 0 ) ? @g_labels((iy-1)*cX + ix-1) : 0;
const unsigned int lym1xp1 = (iy > 0 && ix < cX-1) ? @g_labels((iy-1)*cX + ix+1) : 0;
const unsigned int lyp1xm1 = (iy < cY-1 && ix > 0 ) ? @g_labels((iy+1)*cX + ix-1) : 0;
const unsigned int lyp1xp1 = (iy < cY-1 && ix < cX-1) ? @g_labels((iy+1)*cX + ix+1) : 0;
const bool nym1x = (iy > 0) ? (cyx == (@g_image((iy-1)*cX + ix))) : false;
const bool nyxm1 = (ix > 0) ? (cyx == (@g_image(iy *cX + ix-1))) : false;
const bool nyxp1 = (ix < cX-1) ? (cyx == (@g_image(iy *cX + ix+1))) : false;
const bool nyp1x = (iy > cY-1) ? (cyx == (@g_image((iy+1)*cX + ix))) : false;
const bool nym1xm1 = (iy > 0 && ix > 0 ) ? (cyx == (@g_image((iy-1)*cX + ix-1))) : false;
const bool nym1xp1 = (iy > 0 && ix < cX-1) ? (cyx == (@g_image((iy-1)*cX + ix+1))) : false;
const bool nyp1xm1 = (iy < cY-1 && ix > 0 ) ? (cyx == (@g_image((iy+1)*cX + ix-1))) : false;
const bool nyp1xp1 = (iy < cY-1 && ix < cX-1) ? (cyx == (@g_image((iy+1)*cX + ix+1))) : false;
// Lowest label
unsigned int label = lyx;
// Find lowest neighbouring label
label = ((nym1x) && (lym1x < label)) ? lym1x : label;
label = ((nyxm1) && (lyxm1 < label)) ? lyxm1 : label;
label = ((nyxp1) && (lyxp1 < label)) ? lyxp1 : label;
label = ((nyp1x) && (lyp1x < label)) ? lyp1x : label;
label = ((nym1xm1) && (lym1xm1 < label)) ? lym1xm1 : label;
label = ((nym1xp1) && (lym1xp1 < label)) ? lym1xp1 : label;
label = ((nyp1xm1) && (lyp1xm1 < label)) ? lyp1xm1 : label;
label = ((nyp1xp1) && (lyp1xp1 < label)) ? lyp1xp1 : label;
// If labels are different, resolve them
if(label < lyx) {
// Update label
// Nonatomic write may overwrite another label but on average seems to give faster results
@g_labels(lyx) = label;
// Record the change
@in1(0) = 1;
}
}
}
''' + f'''
dim3 block(32, 32);
const int cX= {cX};
const int cY= {cY};''' + '''
dim3 grid(ceil(cX/(float)block.x), ceil(cY/(float)block.y));
dim3 resolve_block(32, 32);
dim3 resolve_grid(ceil(cX/(float)resolve_block.x), ceil(cY/(float)resolve_block.y));
// Initialise labels
init_labels <<< grid, block >>>(@ARGS, cX, cY);
// Resolve the labels
resolve_labels <<< resolve_grid, resolve_block >>>(@ARGS, cX, cY);
// Changed Flag
int32 changed = 1;
// While labels have changed
while(changed) {
// Copy changed to device
cudaMemsetAsync(in1_p, 0, 4);
// Label image
label_equivalence <<< grid, block >>>(@ARGS, cX, cY);
// Copy changed flag to host
cudaMemcpy(&changed, in1_p, sizeof(int32), cudaMemcpyDeviceToHost);
// Resolve the labels
resolve_labels <<< resolve_grid, resolve_block>>>(@ARGS, cX, cY);
}
''')
result = result.reshape((cY, cX)) * data_2d_copy
value = jt.unique(result)
value = value[value != 0]
map_result = jt.zeros((int(value.max().numpy()[0]) + 1), dtype=jt.uint32)
map_result[value] = jt.index(value.shape)[0] + 1
result = map_result[result]
return result

196
python/jittor/ccl/ccl_3d.py Normal file
View File

@ -0,0 +1,196 @@
import jittor as jt
def ccl_3d(data_3d):
'''
3D connected component labelling, original code from https://github.com/DanielPlayne/playne-equivalence-algorithm
Args:
[in]param data_3d: binary three-dimensional vector
type data_3d: jittor array
Returns:
[out]result : labeled three-dimensional vector
Example:
>>> import jittor as jt
>>> jt.flags.use_cuda = 1
>>> data_3d = jt.zeros((10, 11, 12), dtype=jt.uint32)
>>> data_3d[2:4, :, :] = 1
>>> data_3d[5:7, :, :] = 1
>>> result = ccl_3d(data_3d)
>>> print(result[:, 0, 0])
>>> print(
jt.unique(result, return_counts=True, return_inverse=True)[0],
jt.unique(result, return_counts=True, return_inverse=True)[2])
'''
data_3d = data_3d.astype(jt.uint32)
cX = data_3d.shape[0]
cY = data_3d.shape[1]
cZ = data_3d.shape[2]
changed = jt.ones([1], dtype=jt.uint32)
data_3d_copy = data_3d.copy()
data_3d = data_3d.reshape(cX * cY * cZ)
result = jt.code(data_3d.shape,
data_3d.dtype, [data_3d, changed],
cuda_header='''
@alias(g_image, in0)
@alias(g_labels, out)
''',
cuda_src=r'''
__global__ void init_labels(@ARGS_DEF, const int cX, const int cY, const int cZ, const int pX, const int pY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
const unsigned int iz = (blockIdx.z * blockDim.z) + threadIdx.z;
if((ix < cX) && (iy < cY) && (iz < cZ)) {
const unsigned char pzyx = @g_image(iz*pY + iy*pX + ix);
// Neighbour Connections
const bool nzm1yx = (iz > 0) ? (pzyx == @g_image((iz-1)*pY + iy *pX + ix )) : false;
const bool nzym1x = (iy > 0) ? (pzyx == @g_image( iz *pY + (iy-1)*pX + ix )) : false;
const bool nzyxm1 = (ix > 0) ? (pzyx == @g_image( iz *pY + iy *pX + ix-1)) : false;
// Label
unsigned int label;
// Initialise Label
label = (nzyxm1) ? ( iz*pY + iy*pX + ix-1) : (iz*pY + iy*pX + ix);
label = (nzym1x) ? ( iz*pY + (iy-1)*pX + ix) : label;
label = (nzm1yx) ? ((iz-1)*pY + iy*pX + ix) : label;
// Write to Global Memory
@g_labels(iz*pY + iy*pX + ix) = label;
}
}
__device__ __inline__ unsigned int find_root(@ARGS_DEF, unsigned int label) {
// Resolve Label
unsigned int next = @g_labels(label);
// Follow chain
while(label != next) {
// Move to next
label = next;
next = @g_labels(label);
}
// Return label
return label;
}
__global__ void resolve_labels(@ARGS_DEF, const int cX, const int cY, const int cZ, const int pX, const int pY) {
@PRECALC
// Calculate index
const unsigned int id = ((blockIdx.z * blockDim.z) + threadIdx.z) * pY +
((blockIdx.y * blockDim.y) + threadIdx.y) * pX +
((blockIdx.x * blockDim.x) + threadIdx.x);
// Check Thread Range
if(id < cX*cY*cZ) {
// Resolve Label
@g_labels(id) = find_root(@ARGS, @g_labels(id));
}
}
__global__ void label_equivalence(@ARGS_DEF, const int cX, const int cY, const int cZ, const int pX, const int pY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
const unsigned int iz = (blockIdx.z * blockDim.z) + threadIdx.z;
// Check Thread Range
if((ix < cX) && (iy < cY) && (iz < cZ)) {
// Get image and label values
const unsigned char pzyx = @g_image(iz*pY + iy*pX + ix);
// Neighbouring indexes
const unsigned int xm1 = ix-1;
const unsigned int xp1 = ix+1;
const unsigned int ym1 = iy-1;
const unsigned int yp1 = iy+1;
const unsigned int zm1 = iz-1;
const unsigned int zp1 = iz+1;
// Get neighbour labels
const unsigned int lzm1yx = (iz > 0) ? @g_labels(zm1*pY + iy*pX + ix) : 0;
const unsigned int lzym1x = (iy > 0) ? @g_labels( iz*pY + ym1*pX + ix) : 0;
const unsigned int lzyxm1 = (ix > 0) ? @g_labels( iz*pY + iy*pX + xm1) : 0;
const unsigned int lzyx = @g_labels( iz*pY + iy*pX + ix);
const unsigned int lzyxp1 = (ix < cX-1) ? @g_labels( iz*pY + iy*pX + xp1) : 0;
const unsigned int lzyp1x = (iy < cY-1) ? @g_labels( iz*pY + yp1*pX + ix) : 0;
const unsigned int lzp1yx = (iz < cZ-1) ? @g_labels(zp1*pY + iy*pX + ix) : 0;
const bool nzm1yx = (iz > 0) ? (pzyx == @g_image(zm1*pY + iy*pX + ix)) : false;
const bool nzym1x = (iy > 0) ? (pzyx == @g_image( iz*pY + ym1*pX + ix)) : false;
const bool nzyxm1 = (ix > 0) ? (pzyx == @g_image( iz*pY + iy*pX + xm1)) : false;
const bool nzyxp1 = (ix < cX-1) ? (pzyx == @g_image( iz*pY + iy*pX + xp1)) : false;
const bool nzyp1x = (iy < cY-1) ? (pzyx == @g_image( iz*pY + yp1*pX + ix)) : false;
const bool nzp1yx = (iz < cZ-1) ? (pzyx == @g_image(zp1*pY + iy*pX + ix)) : false;
// Lowest label
unsigned int label = lzyx;
// Find lowest neighbouring label
label = ((nzm1yx) && (lzm1yx < label)) ? lzm1yx : label;
label = ((nzym1x) && (lzym1x < label)) ? lzym1x : label;
label = ((nzyxm1) && (lzyxm1 < label)) ? lzyxm1 : label;
label = ((nzyxp1) && (lzyxp1 < label)) ? lzyxp1 : label;
label = ((nzyp1x) && (lzyp1x < label)) ? lzyp1x : label;
label = ((nzp1yx) && (lzp1yx < label)) ? lzp1yx : label;
// If labels are different, resolve them
if(label < lzyx) {
// Update label
// Nonatomic write may overwrite another label but on average seems to give faster results
@g_labels(lzyx) = label;
// Record the change
@in1(0) = 1;
}
}
}
''' + f'''
dim3 block(32, 4, 4);
const int cX= {cX};
const int cY= {cY};
const int cZ= {cZ};
const int pX= cX;
const int pY= cX*cY;''' + '''
dim3 grid(ceil(cX/(float)block.x), ceil(cY/(float)block.y), ceil(cZ/(float)block.z));
// Initialise labels
init_labels <<< grid, block >>>(@ARGS, cX, cY, cZ, pX, pY);
// Resolve the labels
resolve_labels <<< grid, block >>>(@ARGS, cX, cY, cZ, pX, pY);
// Changed Flag
int32 changed = 1;
// While labels have changed
while(changed) {
// Copy changed to device
cudaMemsetAsync(in1_p, 0, 4);
// Label image
label_equivalence <<< grid, block >>>(@ARGS, cX, cY, cZ, pX, pY);
// Copy changed flag to host
cudaMemcpy(&changed, in1_p, sizeof(int32), cudaMemcpyDeviceToHost);
// Resolve the labels
resolve_labels <<< grid, block>>>(@ARGS, cX, cY, cZ, pX, pY);
}
''')
result = result.reshape((cX, cY, cZ)) * data_3d_copy
value = jt.unique(result)
value = value[value != 0]
map_result = jt.zeros((int(value.max().numpy()[0]) + 1), dtype=jt.uint32)
map_result[value] = jt.index(value.shape)[0] + 1
result = map_result[result]
return result

View File

@ -0,0 +1,195 @@
import jittor as jt
def ccl_link(score_map, link_map, result_comp_area_thresh=6):
"""
Find components in score map and link them with link map, original code from https://github.com/DanielPlayne/playne-equivalence-algorithm.
Args:
[in]param score_map: binary two-dimensional vector
type score_map: jittor array
[in]param link_map: two-dimensional vector with 8 channels
type link_map: jittor array
[in]param result_comp_area_thresh: threshold of component area
type result_comp_area_thresh: int
Returns:
[out]result: labeled two-dimensional vector
Example:
>>> import jittor as jt
>>> jt.flags.use_cuda = 1
>>> import cv2
>>> import numpy as np
>>> score_map = jt.Var(np.load("score_map.npy"))
>>> link_map = jt.Var(np.load("link_map.npy"))
>>> score_map = score_map >= 0.5
>>> link_map = link_map >= 0.8
>>> for i in range(8):
>>> link_map[:, :, i] = link_map[:, :, i] & score_map
>>> result = ccl_link(score_map, link_map)
>>> cv2.imwrite('pixellink.png', result.numpy().astype(np.uint8) * 50)
"""
score_map = score_map.astype(jt.uint32)
link_map = link_map.astype(jt.uint32)
cY = score_map.shape[0]
cX = score_map.shape[1]
changed = jt.ones([1], dtype=jt.uint32)
score_map = score_map.reshape(cX * cY)
result = jt.code(score_map.shape,
score_map.dtype, [score_map, link_map, changed],
cuda_header='''
@alias(score_map, in0)
@alias(link_map, in1)
@alias(g_labels, out)
''',
cuda_src=r'''
__global__ void init_labels(@ARGS_DEF, const int cX, const int cY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
@g_labels(iy*cX + ix) = iy*cX + ix;
}
__device__ __inline__ unsigned int find_root(@ARGS_DEF, unsigned int label) {
// Resolve Label
unsigned int next = @g_labels(label);
// Follow chain
while(label != next) {
// Move to next
label = next;
next = @g_labels(label);
}
// Return label
return label;
}
__global__ void resolve_labels(@ARGS_DEF, const int cX, const int cY) {
@PRECALC
// Calculate index
const unsigned int id = ((blockIdx.y * blockDim.y) + threadIdx.y) * cX +
((blockIdx.x * blockDim.x) + threadIdx.x);
// Check Thread Range
if(id < cX*cY) {
// Resolve Label
@g_labels(id) = find_root(@ARGS, @g_labels(id));
}
}
__global__ void label_equivalence(@ARGS_DEF, const int cX, const int cY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
// Check Thread Range
if((ix < cX) && (iy < cY)) {
// Get image and label values
const unsigned char cyx = @score_map( iy*cX + ix);
// Get neighbour labels
const unsigned int lym1x = (iy > 0) ? @g_labels((iy-1)*cX + ix) : 0;
const unsigned int lyxm1 = (ix > 0) ? @g_labels(iy *cX + ix-1) : 0;
const unsigned int lyx = @g_labels(iy *cX + ix);
const unsigned int lyxp1 = (ix < cX-1) ? @g_labels(iy *cX + ix+1) : 0;
const unsigned int lyp1x = (iy < cY-1) ? @g_labels((iy+1)*cX + ix) : 0;
const unsigned int lym1xm1 = (iy > 0 && ix > 0 ) ? @g_labels((iy-1)*cX + ix-1) : 0;
const unsigned int lym1xp1 = (iy > 0 && ix < cX-1) ? @g_labels((iy-1)*cX + ix+1) : 0;
const unsigned int lyp1xm1 = (iy < cY-1 && ix > 0 ) ? @g_labels((iy+1)*cX + ix-1) : 0;
const unsigned int lyp1xp1 = (iy < cY-1 && ix < cX-1) ? @g_labels((iy+1)*cX + ix+1) : 0;
bool nym1x, nyxm1, nyxp1, nyp1x, nym1xm1, nym1xp1, nyp1xm1, nyp1xp1;
if(cyx) {
nym1x = (iy > 0) ? ((cyx == (@score_map((iy-1)*cX + ix))) && (@link_map(iy, ix, 6) || @link_map(iy-1, ix, 7))) : false; // up
nyxm1 = (ix > 0) ? ((cyx == (@score_map(iy *cX + ix-1))) && (@link_map(iy, ix, 0) || @link_map(iy-1, ix-1, 3))) : false; // left
nyxp1 = (ix < cX-1) ? ((cyx == (@score_map(iy *cX + ix+1))) && (@link_map(iy, ix, 3) || @link_map(iy, ix+1, 0))) : false; // right
nyp1x = (iy > cY-1) ? ((cyx == (@score_map((iy+1)*cX + ix))) && (@link_map(iy, ix, 7) || @link_map(iy+1, ix, 6))) : false; // down
nym1xm1 = (iy > 0 && ix > 0 ) ? ((cyx == (@score_map((iy-1)*cX + ix-1))) && (@link_map(iy, ix, 2) || @link_map(iy-1, ix-1, 4))) : false; // up-left
nym1xp1 = (iy > 0 && ix < cX-1) ? ((cyx == (@score_map((iy-1)*cX + ix+1))) && (@link_map(iy, ix, 5) || @link_map(iy-1, ix+1, 1))) : false; // up-right
nyp1xm1 = (iy < cY-1 && ix > 0 ) ? ((cyx == (@score_map((iy+1)*cX + ix-1))) && (@link_map(iy, ix, 1) || @link_map(iy+1, ix-1, 5))) : false; // down-left
nyp1xp1 = (iy < cY-1 && ix < cX-1) ? ((cyx == (@score_map((iy+1)*cX + ix+1))) && (@link_map(iy, ix, 4) || @link_map(iy+1, ix+1, 2))) : false; // down-right
}
else {
nym1x = (iy > 0) ? (cyx == (@score_map((iy-1)*cX + ix))) : false; // up
nyxm1 = (ix > 0) ? (cyx == (@score_map(iy *cX + ix-1))) : false; // left
nyxp1 = (ix < cX-1) ? (cyx == (@score_map(iy *cX + ix+1))) : false; // right
nyp1x = (iy > cY-1) ? (cyx == (@score_map((iy+1)*cX + ix))) : false; // down
nym1xm1 = (iy > 0 && ix > 0 ) ? (cyx == (@score_map((iy-1)*cX + ix-1))) : false; // up-left
nym1xp1 = (iy > 0 && ix < cX-1) ? (cyx == (@score_map((iy-1)*cX + ix+1))) : false; // up-right
nyp1xm1 = (iy < cY-1 && ix > 0 ) ? (cyx == (@score_map((iy+1)*cX + ix-1))) : false; // down-left
nyp1xp1 = (iy < cY-1 && ix < cX-1) ? (cyx == (@score_map((iy+1)*cX + ix+1))) : false; // down-right
}
// Lowest label
unsigned int label = lyx;
// Find lowest neighbouring label
label = ((nym1x) && (lym1x < label)) ? lym1x : label;
label = ((nyxm1) && (lyxm1 < label)) ? lyxm1 : label;
label = ((nyxp1) && (lyxp1 < label)) ? lyxp1 : label;
label = ((nyp1x) && (lyp1x < label)) ? lyp1x : label;
label = ((nym1xm1) && (lym1xm1 < label)) ? lym1xm1 : label;
label = ((nym1xp1) && (lym1xp1 < label)) ? lym1xp1 : label;
label = ((nyp1xm1) && (lyp1xm1 < label)) ? lyp1xm1 : label;
label = ((nyp1xp1) && (lyp1xp1 < label)) ? lyp1xp1 : label;
// If labels are different, resolve them
if(label < lyx) {
// Update label
// Nonatomic write may overwrite another label but on average seems to give faster results
@g_labels(lyx) = label;
// Record the change
@in2(0) = 1;
}
}
}
''' + f'''
dim3 block(32, 32);
const int cX= {cX};
const int cY= {cY};''' + '''
dim3 grid(ceil(cX/(float)block.x), ceil(cY/(float)block.y));
dim3 resolve_block(32, 32);
dim3 resolve_grid(ceil(cX/(float)resolve_block.x), ceil(cY/(float)resolve_block.y));
// Initialise labels
init_labels <<< grid, block >>>(@ARGS, cX, cY);
// Resolve the labels
resolve_labels <<< resolve_grid, resolve_block >>>(@ARGS, cX, cY);
// Changed Flag
int32 changed = 1;
// While labels have changed
while(changed) {
// Copy changed to device
cudaMemsetAsync(in2_p, 0, 4);
// Label image
label_equivalence <<< grid, block >>>(@ARGS, cX, cY);
// Copy changed flag to host
cudaMemcpy(&changed, in2_p, sizeof(int32), cudaMemcpyDeviceToHost);
// Resolve the labels
resolve_labels <<< resolve_grid, resolve_block >>>(@ARGS, cX, cY);
}
''')
result = result.reshape((cY, cX))
value, _, cnt = jt.unique(result, return_inverse=True, return_counts=True)
value = (cnt > result_comp_area_thresh) * value
value = value[value != 0]
map_result = jt.zeros((int(value.max().numpy()[0]) + 1), dtype=jt.uint32)
map_result[value] = jt.index(value.shape)[0] + 1
result = map_result[result]
return result

View File

@ -0,0 +1,430 @@
# import os
# os.environ["FIX_TORCH_ERROR"] = "0"
# import jittor as jt
# from jittor import *
# from typing import Tuple
# org_int = int = type(1)
# org_float = float = type(1.0)
# org_bool = bool = type(True)
# import jtorch.compiler
# import jtorch_core
# from jtorch_core import *
# device.__reduce__ = lambda self: (device, (self.type,))
# device.__module__ = "jtorch"
# jt.jittor_core.device = device
# def handle_dtype(args, kw, dtype):
# def convert(x):
# if isinstance(x, jt.Var):
# return x.cast(dtype)
# return x
# if dtype is not None:
# if args is not None:
# if isinstance(args, (tuple,list)):
# args = [ convert(a) for a in args ]
# else:
# args = convert(x)
# if kw is not None:
# kw = { k:convert(v) for k,v in kw.items() }
# return args, kw
# def get_args_names(func):
# import inspect
# spec = inspect.getfullargspec(func)
# return spec[0] + spec[4]
# def wrapper(func):
# has_dtype = False
# if hasattr(func, "__code__"):
# has_dtype = "dtype" in get_args_names(func)
# def inner(*args, **kw):
# requires_grad = None
# dtype = None
# if "requires_grad" in kw:
# requires_grad = kw["requires_grad"]
# del kw["requires_grad"]
# if not has_dtype and "dtype" in kw:
# dtype = kw["dtype"]
# del kw["dtype"]
# if "device" in kw:
# del kw["device"]
# if 'pin_memory' in kw:
# del kw['pin_memory']
# args, kw = handle_dtype(args, kw, dtype)
# ret = func(*args, **kw)
# if isinstance(ret, jt.Var):
# if requires_grad is not None:
# ret.requires_grad = requires_grad
# if dtype is not None:
# ret.astype(dtype)
# return ret
# return inner
# import inspect
# _wrapper_keys = set(["shape", "start", "size"])
# _wrapper_keys.add("x")
# for k,v in list(globals().items()):
# if callable(v) and not isinstance(v, type):
# try:
# spec = inspect.getfullargspec(v)
# args_name = spec[0]
# if len(args_name) and args_name[0] in _wrapper_keys:
# globals()[k] = wrapper(v)
# elif spec.varargs in _wrapper_keys:
# globals()[k] = wrapper(v)
# except:
# pass
# def empty(*size, dtype=jt.float32, device=None, requires_grad=False):
# if len(size) == 1 and not isinstance(size[0], org_int):
# size = size[0]
# return jt.empty(size, dtype)
# Tensor = Var
# Tensor.backward = lambda x: jtorch_core.backward(x)
# Tensor.grad = property(grad_get, grad_set, grad_del)
# Tensor.retains_grad = property(retain_grad_get, retain_grad_set)
# def retain_grad(x:Tensor, value:bool=True):
# x.retains_grad = value
# return value
# Tensor.retain_grad = retain_grad
# Tensor.dim = lambda self: self.ndim
# Tensor.ndimension = lambda self: self.ndim
# Tensor.nelement = lambda self: self.numel()
# Tensor.cuda = lambda self: self
# def device_get(x:Tensor):
# return device("cpu") if not jt.has_cuda or not jt.flags.use_cuda else device("cuda")
# Tensor.device = property(device_get)
# def argmax(x: Var, dim=None, keepdim: bool = False):
# return jt.argmax(x, dim, keepdim)[0]
# Tensor.argmax = argmax
# def tensor_type(x: Var, dtype=None, **kwargs):
# if dtype:
# return x.astype(dtype)
# else:
# return x.dtype
# Tensor.type = tensor_type
# def is_floating_point(x: Var):
# return "float" in str(x.dtype)
# Tensor.is_floating_point = is_floating_point
# from . import autograd
# from .autograd import *
# def tensor(data, *, dtype=None, device=None, requires_grad=False, pin_memory=False):
# if isinstance(data,list):
# data_list = []
# check = True
# for p in data:
# if isinstance(p, Tensor) and p.numel()==1:
# data_list.append(p.item())
# elif isinstance(p, (org_int,org_float)):
# data_list.append(p)
# else:
# check = False
# break
# if check:
# data = data_list
# return wrapper(array)(data, dtype=dtype, device=device, requires_grad=requires_grad, pin_memory=pin_memory)
# # tensor = wrapper(array)
# from_numpy = wrapper(array)
# strided = None
# def mod_zero_grad(self):
# for p in self.parameters():
# p.grad = None
# Module.zero_grad = mod_zero_grad
# class ModuleMisc:
# def parameters(self):
# return iter(super().parameters())
# def load_state_dict(self, state_dict, strict=False):
# return super().load_state_dict(state_dict)
# def to(self, device=None,dtype=None):
# ''' do nothing but return its self'''
# return self
# def register_parameter(self,name,data):
# self.name = data
# def buffers(self):
# for _, buf in self.named_buffers():
# yield buf
# def make_module(cls):
# class TMod(ModuleMisc, cls):
# def __init__(self, *args, **kw):
# dtype = None
# if "dtype" in kw:
# dtype = kw["dtype"]
# del kw["dtype"]
# self._dtype = dtype
# with jt.flag_scope(th_mode=0):
# if "device" in kw:
# del kw["device"]
# super().__init__(*args, **kw)
# for k,v in self.__dict__.items():
# if not k.startswith("_") and isinstance(v, Var) \
# and v.requires_grad:
# v.retain_grad()
# if dtype is not None and isinstance(v, Var):
# v.assign(v.cast(dtype))
# def __call__(self, *args, **kw):
# args, kw = handle_dtype(args, kw, self._dtype)
# # if forward is override by user, call forward
# if self.__class__.forward is not TMod.forward:
# return self.forward(*args, **kw)
# return self.execute(*args, **kw)
# def forward(self, *args, **kw):
# args, kw = handle_dtype(args, kw, self._dtype)
# return self.execute(*args, **kw)
# @property
# def training(self):
# if not hasattr(self, "is_train"):
# self.is_train = True
# return self.is_train
# @training.setter
# def training(self, value):
# self.is_train = value
# TMod.__name__ = cls.__name__
# return TMod
# import jtorch.cuda
# import jtorch.nn
# from jtorch.nn import Module, Parameter
# import jtorch.optim
# from jtorch.utils.dtype import Dtype, get_string_dtype
# def frombuffer(buffer: bytearray,
# *,
# dtype: Dtype,
# count: int = -1,
# offset: int = 0,
# requires_grad: bool = True) -> Tensor:
# dtype = get_string_dtype(dtype)
# tensor = jt.array(np.frombuffer(buffer, dtype, count=count, offset=offset))
# if requires_grad and tensor.dtype.is_float():
# tensor.requires_grad = True
# return tensor
# def conflict_wrapper(origin_func, new_func):
# def wrapper(*args, **kw):
# if jt.flags.th_mode:
# return new_func(*args, **kw)
# else:
# return origin_func(*args, **kw)
# return wrapper
# def min(*args, **kw):
# dim = None
# if len(args) >= 2 and isinstance(args[1], org_int):
# dim = args[1]
# elif "dim" in kw and isinstance(kw["dim"], org_int):
# dim = kw["dim"]
# if dim is not None:
# k, v = jt.argmin(*args, **kw)
# return v, k
# elif len(args) == 2 and isinstance(args[1], jt.Var):
# return jt.minimum(args[0], args[1])
# else:
# return jt.min(*args, **kw)
# Tensor.min = conflict_wrapper(jt.min, min)
# def max(*args, **kw):
# dim = None
# if "dim" in kw:
# x = kw["dim"]
# if len(args) >= 2 and isinstance(args[1], org_int):
# dim = args[1]
# elif "dim" in kw and isinstance(kw["dim"], org_int):
# dim = kw["dim"]
# if dim is not None:
# k, v = jt.argmax(*args, **kw)
# return v, k
# elif len(args) == 2 and isinstance(args[1], jt.Var):
# return jt.maximum(args[0], args[1])
# else:
# return jt.max(*args, **kw)
# Tensor.max = conflict_wrapper(jt.max, max)
# def argsort(*args, **kw):
# k, v = jt.argsort(*args, **kw)
# return k
# Tensor.argsort = conflict_wrapper(jt.argsort, argsort)
# LongTensor = jt.int64
# FloatTensor = jt.float
# HalfTensor = jt.float16
# BoolTensor = jt.bool
# IntTensor = jt.int32
# class JDType:
# def __init__(self, func, str):
# self.func = func
# self.str = str
# self.__name__ = str.split(".")[-1]
# def __call__(self, *args, **kw):
# return self.func(*args, **kw)
# def __str__(self):
# return self.str
# def is_floating_point(self):
# return "float" in str(self.str)
# int8 = JDType(jt.int8, "torch.int8")
# int16 = JDType(jt.int16, "torch.int16")
# int = int32 = JDType(jt.int32, "torch.int32")
# long = int64 = JDType(jt.int64, "torch.int64")
# half = float16 = JDType(jt.float16, "torch.float16")
# float = float32 = JDType(jt.float32, "torch.float32")
# double = float64 = JDType(jt.float64, "torch.float64")
# bfloat16 = "bfloat16" # TODO
# complex64 = "complex64" # TODO
# complex128 = "complex128" # TODO
# def get_JDtype(dtype):
# if dtype=='float32' or dtype == jt.float32:
# return float32
# elif dtype=='float64' or dtype == jt.float64:
# return float64
# elif dtype=='float16' or dtype == jt.float16:
# return float16
# elif dtype=='int32' or dtype == jt.int32:
# return int32
# elif dtype=='int64' or dtype == jt.int64:
# return int64
# elif dtype=='int16' or dtype == jt.int16:
# return int16
# elif dtype=='int8' or dtype == jt.int8:
# return int8
# else:
# raise Exception("dtype {} not supported".format(dtype))
# def load(path,**kwargs):
# def _to_jittor(data):
# if isinstance(data,dict):
# return {k:_to_jittor(d) for k,d in data.items()}
# if isinstance(data,list):
# return [_to_jittor(d) for d in data]
# if isinstance(data,np.ndarray):
# return jt.array(data)
# return data
# data = jt.load(path)
# return _to_jittor(data)
# def is_tensor(x):
# return isinstance(x, Tensor)
# manual_seed = jt.set_global_seed
# jt.flags.amp_level = 3
# Size = jt.NanoVector
# class Generator:
# def __init__(self,*args,**kw) -> None:
# self.seed = None
# def manual_seed(self,seed):
# self.seed = seed
# from . import fx
# _default_type = "float32"
# def get_default_dtype():
# return _default_type
# def set_default_dtype(dtype):
# global _default_type
# _default_type = dtype
# dtype = JDType
# def div(x,y,rounding_mode="floor"):
# assert rounding_mode == "floor"
# z = (x / y)
# if rounding_mode == "floor":
# z = z.floor()
# if x.dtype == "int32" and (isinstance(y,org_int) or y.dtype == "int32"):
# z = z.int32()
# return z
# def randn(*args,**kw):
# wrap_randn = wrapper(jt.randn)
# generator = kw.get('generator',None)
# kw.pop('generator',None)
# if 'layout' in kw:
# del kw['layout']
# if generator is not None and generator.seed is not None:
# jt.set_global_seed(generator.seed)
# return wrap_randn(*args,**kw)
# def rand(*args,**kw):
# print("rand")
# wrap_rand = wrapper(jt.rand)
# generator = kw.get('generator',None)
# kw.pop('generator',None)
# if 'layout' in kw:
# del kw['layout']
# if generator is not None and generator.seed is not None:
# jt.set_global_seed(generator.seed)
# return wrap_rand(*args,**kw)
# def set_default_tensor_type(t: type or str):
# if isinstance(t, str):
# info = t.split(".")
# if len(info) == 3 and info[1] == 'cuda':
# jt.flags.use_cuda = 1
# #TODO: type
# def clamp(x, min=None, max=None):
# return jt.clamp(x, min, max)
# def to(x,*args,**kw):
# device = None
# if len(args) == 1:
# device = args[0]
# if isinstance(device, jt.NanoString) or callable(device):
# return jt.to(x,*args,**kw)
# if 'cpu' in str(device):
# args = []
# device = kw.get("device",None)
# if 'cpu' in str(device):
# kw.pop('device',None)
# print("to cpu")
# # print(kw)
# return jt.to(x,*args,**kw)
# Tensor.to = conflict_wrapper(jt.to, to)
# mm = wrapper(jt.matmul)
# def _data_get(x):
# return x
# def _data_set(x, value):
# x.assign(value)
# Tensor.data = property(_data_get, _data_set)
# Tensor.layout = None

View File

@ -0,0 +1,134 @@
import jittor as jt
from jittor import Var
from collections.abc import Sequence, Mapping
Variable = Var
class FunctionContext:
def save_for_backward(self, *args):
self.saved_tensors = args
class Function:
''' Function Module for customized backward operations
Example 1 (Function can have multiple input and multiple output, and user
can store value for backward computation)::
import jtorch
from jtorch import Function
class MyFunc(Function):
@staticmethod
def forward(self, x, y):
self.x = x
self.y = y
return x*y, x/y
@staticmethod
def backward(self, grad0, grad1):
return grad0 * self.y, grad1 * self.x
a = jtorch.array(3.0)
a.requires_grad = True
b = jtorch.array(4.0)
b.requires_grad = True
func = MyFunc.apply
c,d = func(a, b)
(c+d*3).backward()
assert a.grad.data == 4
assert b.grad.data == 9
Example 2(Function can return None for no gradiant, and gradiant
can also be None)::
import jtorch
from jtorch import Function
class MyFunc(Function):
@staticmethod
def forward(self, x, y):
self.x = x
self.y = y
return x*y, x/y
@staticmethod
def backward(self, grad0, grad1):
assert grad1 is None
return grad0 * self.y, None
a = jt.array(3.0)
a.requires_grad = True
b = jt.array(4.0)
b.requires_grad = True
func = MyFunc.apply
c,d = func(a, b)
d.stop_grad()
da, db = jt.grad(c+d*3, [a, b])
assert da.data == 4
assert db.data == 0
'''
def __call__(self, *args):
backup = args
args = list(args)
taped_inputs = []
taped_outputs = []
input_mask = [-1] * len(args)
for i,v in enumerate(args):
if isinstance(v, Var):
if v.is_stop_grad():
# -2 in input_mask means this input is stop_grad
input_mask[i] = -2
continue
v = v.tape()
input_mask[i] = len(taped_inputs)
args[i] = v
taped_inputs.append(v)
ctx = FunctionContext()
ori_res = self.forward(ctx, *args)
# ori_res = self.execute(*args)
if not isinstance(ori_res, Sequence):
res = [ori_res]
else:
res = list(ori_res)
output_mask = [-1] * len(res)
for i,v in enumerate(res):
if isinstance(v, Var):
v = v.tape()
output_mask[i] = len(taped_outputs)
res[i] = v
taped_outputs.append(v)
ctx.input_mask = input_mask
ctx.output_mask = output_mask
# tape outputs and inputs together so that
# backward treats them as one operator
jt.tape_together(taped_inputs, taped_outputs,
lambda *args: self._grad(ctx, self, *args))
if isinstance(ori_res, Sequence):
return res
else:
return res[0]
@staticmethod
def _grad(ctx, func, *args):
new_args = ( (args[i] if i>=0 else None) for i in ctx.output_mask )
ret = func.backward(ctx, *new_args)
if not isinstance(ret, Sequence):
ret = (ret,)
new_ret = []
for i, r in enumerate(ret):
j = ctx.input_mask[i]
if j<0:
# -2 in input_mask means this input is stop_grad
assert r is None or j==-2, f"{type(func)}'s {i}-th returned grad should be None, "\
"because the input value is not a jittor variable."
else:
new_ret.append(r)
return new_ret
def dfs(self, parents, k, callback, callback_leave=None):
pass
@classmethod
def apply(cls, *args, **kw):
func = cls()
return func(*args, **kw)

View File

@ -0,0 +1,39 @@
import jittor as jt
import jittor_utils
import glob
import os
from jittor import pyjt_compiler
import sys
from jittor_utils import lock
jtorch_path = os.path.dirname(__file__)
cache_path = os.path.join(jt.compiler.cache_path, "jtorch")
# os.makedirs(cache_path, exist_ok=True)
os.makedirs(os.path.join(cache_path, "gen"), exist_ok=True)
with lock.lock_scope():
pyjt_gen_src = pyjt_compiler.compile(cache_path, jtorch_path)
ext_args = 'c[cu]' if jt.has_cuda else 'cc'
files = glob.glob(jtorch_path+"/src/**/*."+ext_args, recursive=True)
files += pyjt_gen_src
cc_flags = " -I\""+os.path.join(jtorch_path, "src")+"\" "
if os.environ.get("use_data_o", "1") == "1":
files += glob.glob(jtorch_path+"/src/**/*.o", recursive=True)
files = [f for f in files if "__data__" not in f]
with lock.lock_scope():
jt.compiler.compile(
jt.compiler.cc_path,
jt.compiler.cc_flags+jt.compiler.opt_flags+ cc_flags,
files,
"jtorch_core"+jt.compiler.extension_suffix,
obj_dirname="jtorch_objs")
with jittor_utils.import_scope(jt.compiler.import_flags):
import jtorch_core as core
jt.flags.th_mode = 1
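# --- Hedged usage note (not part of the original build script) ---
# The use_data_o environment variable read above gates linking of the prebuilt
# *.o files; setting it before the first import should force a from-source build:
#   import os
#   os.environ["use_data_o"] = "0"  # must be set before importing jtorch
#   import jtorch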

View File

@ -0,0 +1,64 @@
import jittor as jt
import jtorch
def is_available():
return jt.has_cuda
def device_count():
return int(jt.has_cuda)
def set_device(device=None):
pass
def get_rng_state(device=None):
pass
def current_device():
return jtorch.device("cuda")
def mem_get_info(i):
return ("75GB",)
class Generator:
def __init__(self):
pass
def set_state(self, state):
self.state = state
default_generators = [Generator()]
_lazy_call = lambda func: func()
device = None
LongTensor = jt.int64
FloatTensor = jt.float
HalfTensor = jt.float16
BoolTensor = jt.bool
manual_seed = jt.set_global_seed
manual_seed_all = jt.set_global_seed
def synchronize():
jt.sync_all(True)
class Event:
pass
class Stream:
pass
from typing import Any
from .gradscaler import GradScaler
class autocast:
def __init__(self,**kwargs):
pass
def __enter__(self,):
pass
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any):
pass
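# --- Hedged usage sketch, assuming jtorch exposes this module as torch.cuda ---
#   if is_available():          # jt.has_cuda
#       manual_seed(0)          # jt.set_global_seed
#       synchronize()           # jt.sync_all(True)
#   with autocast():            # no-op context manager in this port
#       pass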

View File

@ -0,0 +1,53 @@
import datetime
from enum import Enum
import jittor as jt
def is_initialized():
return True
def get_rank(group=None):
return 0
def get_world_size(group=None):
return 1
def get_backend(group=None):
return "nccl"
def new_group(ranks=None, timeout=datetime.timedelta(seconds=1800), backend=None, pg_options=None):
return 1
def barrier():
pass
def is_available():
return True
def is_built():
return True
class ReduceOp:
SUM = 0
class GroupMember:
WORLD = 0
class ProcessGroup:
pass
class Join:
pass
dist_backend = Enum("dist_backend", ("GLOO", "MPI", "NCCL"))
_backend = dist_backend.NCCL
def is_mpi_available():
return jt.in_mpi
def DistributedDataParallel(model, *args, **kw):
return model
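# --- Hedged usage sketch: single-process stand-ins for torch.distributed ---
#   model = DistributedDataParallel(model)      # returns the model unchanged
#   if is_initialized() and get_rank() == 0:    # always True / 0 here
#       print("world size:", get_world_size())  # always 1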

View File

@ -0,0 +1,15 @@
import jittor as jt
class RelaxedBernoulli:
def __init__(self, temperature, probs=None, logits=None):
self.temperature = temperature
self.probs = probs
if logits is None and probs is not None:
# rsample() works on logits, so derive them when only probs is given
logits = jt.log(probs) - jt.log(1 - probs)
self.logits = logits
def rsample(self):
noise = jt.rand_like(self.logits)
eps = 1e-20
noise = jt.clamp(noise, eps, 1.0 - eps)
logit_noise = jt.log(noise) - jt.log(1 - noise)
sample = (self.logits + logit_noise) / self.temperature
return jt.sigmoid(sample)
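# --- Hedged usage sketch ---
# rsample() draws a Gumbel-sigmoid ("Concrete") relaxation of a Bernoulli,
# differentiable w.r.t. the logits:
#   dist = RelaxedBernoulli(temperature=0.5, logits=jt.array([0.0, 2.0]))
#   s = dist.rsample()   # values strictly inside (0, 1)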

View File

@ -0,0 +1,5 @@
#TODO: Implement FFT and IFFT
fftn = None
fftshift = None
ifftn = None
ifftshift = None

View File

@ -0,0 +1,2 @@
class Proxy:
pass

View File

@ -0,0 +1,519 @@
from collections import defaultdict, abc
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, cast
import inspect
import warnings
import jittor as jt
# import torch
class OptState(Enum):
READY = 0
UNSCALED = 1
STEPPED = 2
def _refresh_per_optimizer_state():
return {"stage": OptState.READY, "found_inf_per_device": {}}
class GradScaler:
_scale: Optional[jt.Var]
_growth_tracker: Optional[jt.Var]
_per_optimizer_states: Dict[int, Dict[str, Any]]
"""
An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling
conveniently.
* ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor.
* ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
* ``scaler.update()`` updates ``scaler``'s scale factor.
Example::
# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()
for epoch in epochs:
for input, target in data:
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
# Scales loss. Calls backward() on scaled loss to create scaled gradients.
scaler.scale(loss).backward()
# scaler.step() first unscales gradients of the optimizer's params.
# If gradients don't contain infs/NaNs, optimizer.step() is then called,
# otherwise, optimizer.step() is skipped.
scaler.step(optimizer)
# Updates the scale for next iteration.
scaler.update()
See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage
(along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty,
and multiple losses/optimizers.
``scaler`` dynamically estimates the scale factor each iteration. To minimize gradient underflow,
a large scale factor should be used. However, ``float16`` values can "overflow" (become inf or NaN) if
the scale factor is too large. Therefore, the optimal scale factor is the largest factor that can be used
without incurring inf or NaN gradient values.
``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every
``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`).
* If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params
themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``.
* If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual.
If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by
``growth_factor``.
The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its
value calibrates. ``scaler.step`` will skip the underlying ``optimizer.step()`` for these
iterations. After that, step skipping should occur rarely (once every few hundred or thousand iterations).
Args:
init_scale (float, optional, default=2.**16): Initial scale factor.
growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
:meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
:meth:`update` if inf/NaN gradients occur in an iteration.
growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
that must occur for the scale to be multiplied by ``growth_factor``.
enabled (bool, optional): If ``False``, disables gradient scaling. :meth:`step` simply
invokes the underlying ``optimizer.step()``, and other methods become no-ops.
Default: ``True``
"""
def __init__(self,
init_scale=2.**16,
growth_factor=2.0,
backoff_factor=0.5,
growth_interval=2000,
enabled=True):
self._enabled = enabled
if self._enabled:
assert growth_factor > 1.0, "The growth factor must be > 1.0."
assert backoff_factor < 1.0, "The backoff factor must be < 1.0."
self._init_scale = init_scale
# self._scale will be lazily initialized during the first call to scale()
self._scale = None
self._growth_factor = growth_factor
self._backoff_factor = backoff_factor
self._growth_interval = growth_interval
self._init_growth_tracker = 0
# self._growth_tracker will be lazily initialized during the first call to scale()
self._growth_tracker = None
self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
def _check_scale_growth_tracker(self, funcname) -> Tuple[jt.Var, jt.Var]:
fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
assert self._scale is not None, "Attempted {} but _scale is None. ".format(funcname) + fix
assert self._growth_tracker is not None, "Attempted {} but _growth_tracker is None. ".format(funcname) + fix
return (self._scale, self._growth_tracker)
def _lazy_init_scale_growth_tracker(self):
assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
self._scale = self._init_scale
self._growth_tracker = self._init_growth_tracker
def scale(self, outputs):
"""
Multiplies ('scales') a tensor or list of tensors by the scale factor.
Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned
unmodified.
Args:
outputs (Tensor or iterable of Tensors): Outputs to scale.
"""
if not self._enabled:
return outputs
# Short-circuit for the common case.
if isinstance(outputs, jt.Var):
assert jt.flags.use_cuda == 1
if self._scale is None:
self._lazy_init_scale_growth_tracker()
assert self._scale is not None
return outputs * self._scale
def apply_scale(val):
if isinstance(val, jt.Var):
assert jt.flags.use_cuda == 1
if self._scale is None:
self._lazy_init_scale_growth_tracker()
assert self._scale is not None
return val * self._scale
elif isinstance(val, abc.Iterable):
iterable = map(apply_scale, val)
if isinstance(val, (list, tuple)):
return type(val)(iterable)
else:
return iterable
else:
raise ValueError("outputs must be a Tensor or an iterable of Tensors")
return apply_scale(outputs)
def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
with jt.no_grad():
optimizer.pre_step()
for group in optimizer.param_groups:
for to_unscale in group["grads"]:
if to_unscale is None or isinstance(to_unscale,(int,float)):
continue
if (not allow_fp16) and str(to_unscale.dtype) == "float16":
raise ValueError("Attempting to unscale FP16 gradients.")
if not (to_unscale.isinf().any()):
if inv_scale != 1.0:
to_unscale.update(to_unscale*inv_scale)
else:
found_inf = 1.0
return found_inf
def unscale_(self, optimizer):
"""
Divides ("unscales") the optimizer's gradient tensors by the scale factor.
:meth:`unscale_` is optional, serving cases where you need to
:ref:`modify or inspect gradients<working-with-unscaled-gradients>`
between the backward pass(es) and :meth:`step`.
If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
...
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
scaler.step(optimizer)
scaler.update()
Args:
optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.
.. note::
:meth:`unscale_` does not incur a CPU-GPU sync.
.. warning::
:meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
and only after all gradients for that optimizer's assigned parameters have been accumulated.
Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
.. warning::
:meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
"""
if not self._enabled:
return
self._check_scale_growth_tracker("unscale_")
optimizer_state = self._per_optimizer_states[id(optimizer)]
if hasattr(optimizer,"get_find_inf"):
return
# FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
assert self._scale is not None
inv_scale = 1.0 / self._scale
found_inf = 0.0
optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
def step(self, optimizer, *args, **kwargs):
"""
:meth:`step` carries out the following two operations:
1. Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
earlier in the iteration). As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
2. If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
gradients. Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.
``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.
Returns the return value of ``optimizer.step(*args, **kwargs)``.
Args:
optimizer (torch.optim.Optimizer): Optimizer that applies the gradients.
args: Any arguments.
kwargs: Any keyword arguments.
.. warning::
Closure use is not currently supported.
"""
if (not self._enabled):
return optimizer.step(*args, **kwargs)
if "closure" in kwargs:
raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.")
self._check_scale_growth_tracker("step")
optimizer_state = self._per_optimizer_states[id(optimizer)]
retval = None
if (hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling):
# This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
# The contract with custom optimizers is that their step() should accept an additional,
# optional grad_scaler kwarg. We append self to the kwargs so the custom optimizer has full information:
# it can query its own state, invoke unscale_ on itself, etc
# The contract above is being deprecated to avoid introducing `grad_scaler: GradScaler` argument
# to `Optimizer.step`. The new behavior is going to add two Tensor attributes of `grad_scale`
# and `found_inf` to the passed optimizer so that the optimizer can utilize those
# to skip the parameter updates or unscale gradients before updating parameters in
# the fused kernel, e.g. `FusedAdamMathFunctor`.
# In this behavior, `GradScaler._check_inf_per_device` is called if `OptState.READY`,
# while the method is expected to be called by users side, i.e. their optimizers.
kwargs_ = kwargs
has_grad_scaler_kwarg = "grad_scaler" in inspect.signature(optimizer.step).parameters
if has_grad_scaler_kwarg:
warnings.warn(
"GradScaler is going to stop passing itself as a keyword argument to the passed "
"optimizer. In the near future GradScaler registers `grad_scale: Tensor` and "
"`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.",
FutureWarning)
kwargs_.update({"grad_scaler": self})
else:
if optimizer_state["stage"] is OptState.READY:
self._check_inf_per_device(optimizer)
scaler = self._get_scale_async()
# found_inf_per_device is a plain float in this port, not a per-device dict
found_inf = optimizer_state["found_inf_per_device"]
optimizer.grad_scale = None if optimizer_state["stage"] == OptState.UNSCALED else scaler
optimizer.found_inf = found_inf
retval = optimizer.step(*args, **kwargs_)
optimizer_state["stage"] = OptState.STEPPED
if not has_grad_scaler_kwarg:
del optimizer.grad_scale
del optimizer.found_inf
return retval
if hasattr(optimizer,"get_find_inf"):
optimizer.set_grad_scale(self._scale)
optimizer.step()
optimizer_state["found_inf_per_device"] = optimizer.get_find_inf()
return
retval = None
if not optimizer_state["found_inf_per_device"]:
retval = optimizer.step(*args, **kwargs)
else:
optimizer.post_step()
return retval
def update(self, new_scale=None):
"""
Updates the scale factor.
If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
the scale is multiplied by ``growth_factor`` to increase it.
Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
used directly, it's used to fill GradScaler's internal scale tensor. So if
``new_scale`` was a tensor, later in-place changes to that tensor will not further
affect the scale GradScaler uses internally.)
Args:
new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None): New scale factor.
.. warning::
:meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
been invoked for all optimizers used this iteration.
"""
if not self._enabled:
return
_scale, _growth_tracker = self._check_scale_growth_tracker("update")
if new_scale is not None:
# Accept a new user-defined scale.
if isinstance(new_scale, float):
# _scale is a plain Python float in this port, so assign rather than fill_
self._scale = new_scale
else:
reason = "new_scale should be a float or a 1-element jt.Var with requires_grad=False."
assert isinstance(new_scale, jt.Var), reason
assert new_scale.numel() == 1, reason
assert new_scale.requires_grad is False, reason
self._scale = float(new_scale.item())
else:
# Consume shared inf/nan data collected from optimizers to update the scale.
# If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
found_infs = [state["found_inf_per_device"]
for state in self._per_optimizer_states.values()
]
assert len(found_infs) > 0, "No inf checks were recorded prior to update."
found_inf_combined = found_infs[0]
if len(found_infs) > 1:
for i in range(1, len(found_infs)):
found_inf_combined += found_infs[i]
current_scale = _scale
if found_inf_combined:
current_scale *= self._backoff_factor
_growth_tracker = 0
else:
successful = _growth_tracker + 1
if successful == self._growth_interval:
new_scale = current_scale * self._growth_factor
if new_scale < 1e9:
current_scale = new_scale
_growth_tracker = 0
else:
_growth_tracker = successful
self._scale, self._growth_tracker = current_scale, _growth_tracker
# To prepare for next iteration, clear the data collected from optimizers this iteration.
self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
def _get_scale_async(self):
return self._scale
def get_scale(self):
"""
Returns a Python float containing the current scale, or 1.0 if scaling is disabled.
.. warning::
:meth:`get_scale` incurs a CPU-GPU sync.
"""
if self._enabled:
return self._init_scale if self._scale is None else self._get_scale_async()
else:
return 1.0
def get_growth_factor(self):
r"""
Returns a Python float containing the scale growth factor.
"""
return self._growth_factor
def set_growth_factor(self, new_factor):
r"""
Args:
new_scale (float): Value to use as the new scale growth factor.
"""
self._growth_factor = new_factor
def get_backoff_factor(self):
r"""
Returns a Python float containing the scale backoff factor.
"""
return self._backoff_factor
def set_backoff_factor(self, new_factor):
r"""
Args:
new_scale (float): Value to use as the new scale backoff factor.
"""
self._backoff_factor = new_factor
def get_growth_interval(self):
r"""
Returns a Python int containing the growth interval.
"""
return self._growth_interval
def set_growth_interval(self, new_interval):
r"""
Args:
new_interval (int): Value to use as the new growth interval.
"""
self._growth_interval = new_interval
def _get_growth_tracker(self):
if self._enabled:
return self._init_growth_tracker if self._growth_tracker is None else int(self._growth_tracker)
else:
return 0
def is_enabled(self):
r"""
Returns a bool indicating whether this instance is enabled.
"""
return self._enabled
def state_dict(self):
r"""
Returns the state of the scaler as a :class:`dict`. It contains five entries:
* ``"scale"`` - a Python float containing the current scale
* ``"growth_factor"`` - a Python float containing the current growth factor
* ``"backoff_factor"`` - a Python float containing the current backoff factor
* ``"growth_interval"`` - a Python int containing the current growth interval
* ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.
If this instance is not enabled, returns an empty dict.
.. note::
If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
should be called after :meth:`update`.
"""
return {"scale": self.get_scale(),
"growth_factor": self._growth_factor,
"backoff_factor": self._backoff_factor,
"growth_interval": self._growth_interval,
"_growth_tracker": self._get_growth_tracker()} if self._enabled else {}
def load_state_dict(self, state_dict):
r"""
Loads the scaler state. If this instance is disabled, :meth:`load_state_dict` is a no-op.
Args:
state_dict(dict): scaler state. Should be an object returned from a call to :meth:`state_dict`.
"""
if not self._enabled:
return
if len(state_dict) == 0:
raise RuntimeError("The source state dict is empty, possibly because it was saved "
"from a disabled instance of GradScaler.")
self._init_scale = state_dict["scale"]
if self._scale is not None:
self._scale = float(state_dict["scale"])
self._growth_factor = state_dict["growth_factor"]
self._backoff_factor = state_dict["backoff_factor"]
self._growth_interval = state_dict["growth_interval"]
self._init_growth_tracker = state_dict["_growth_tracker"]
if self._growth_tracker is not None:
self._growth_tracker = int(state_dict["_growth_tracker"])
def __getstate__(self):
state = self.__dict__.copy()
if self._enabled:
assert len(self._per_optimizer_states) == 0, "A GradScaler instance may only be pickled at the beginning "\
"of an iteration, or at the end after scaler.update()."
# Pickling _scale and _growth_tracker Tensors directly triggers
# "warnings.warn("pickle support for Storage will be removed in 1.5..."
# so instead, we set the unpickled instance up to reinitialize them lazily.
state['_init_scale'] = self.get_scale()
state['_init_growth_tracker'] = self._get_growth_tracker()
state['_scale'] = None
state['_growth_tracker'] = None
return state
def __setstate__(self, state):
self.__dict__.update(state)
def _check_inf_per_device(self, optimizer):
_scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")
dummy_inv_scale = 1.0
found_inf = 0.0
self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = \
self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
def _found_inf_per_device(self, optimizer):
return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]

View File

@ -0,0 +1,556 @@
from collections import defaultdict, abc
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, cast
import inspect
import warnings
import jittor as jt
# import torch
__all__ = ["OptState", "GradScaler"]
# Defines default_factory for GradScaler's _per_optimizer_states defaultdict,
# as well as associated "enum" values. Prefers defining these at top level because
# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory.
# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler
# causes a circular reference, which we'd rather avoid.
class OptState(Enum):
READY = 0
UNSCALED = 1
STEPPED = 2
def _refresh_per_optimizer_state():
return {"stage": OptState.READY, "found_inf_per_device": {}}
class GradScaler:
_scale: Optional[jt.Var]
_growth_tracker: Optional[jt.Var]
_per_optimizer_states: Dict[int, Dict[str, Any]]
"""
An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling
conveniently.
* ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor.
* ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
* ``scaler.update()`` updates ``scaler``'s scale factor.
Example::
# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()
for epoch in epochs:
for input, target in data:
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
# Scales loss. Calls backward() on scaled loss to create scaled gradients.
scaler.scale(loss).backward()
# scaler.step() first unscales gradients of the optimizer's params.
# If gradients don't contain infs/NaNs, optimizer.step() is then called,
# otherwise, optimizer.step() is skipped.
scaler.step(optimizer)
# Updates the scale for next iteration.
scaler.update()
See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage
(along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty,
and multiple losses/optimizers.
``scaler`` dynamically estimates the scale factor each iteration. To minimize gradient underflow,
a large scale factor should be used. However, ``float16`` values can "overflow" (become inf or NaN) if
the scale factor is too large. Therefore, the optimal scale factor is the largest factor that can be used
without incurring inf or NaN gradient values.
``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every
``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`).
* If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params
themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``.
* If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual.
If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by
``growth_factor``.
The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its
value calibrates. ``scaler.step`` will skip the underlying ``optimizer.step()`` for these
iterations. After that, step skipping should occur rarely (once every few hundred or thousand iterations).
Args:
init_scale (float, optional, default=2.**16): Initial scale factor.
growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
:meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
:meth:`update` if inf/NaN gradients occur in an iteration.
growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
that must occur for the scale to be multiplied by ``growth_factor``.
enabled (bool, optional): If ``False``, disables gradient scaling. :meth:`step` simply
invokes the underlying ``optimizer.step()``, and other methods become no-ops.
Default: ``True``
"""
def __init__(self,
init_scale=2.**16,
growth_factor=2.0,
backoff_factor=0.5,
growth_interval=2000,
enabled=True):
self._enabled = enabled
if self._enabled:
assert growth_factor > 1.0, "The growth factor must be > 1.0."
assert backoff_factor < 1.0, "The backoff factor must be < 1.0."
self._init_scale = init_scale
# self._scale will be lazily initialized during the first call to scale()
self._scale = None
self._growth_factor = growth_factor
self._backoff_factor = backoff_factor
self._growth_interval = growth_interval
self._init_growth_tracker = 0
# self._growth_tracker will be lazily initialized during the first call to scale()
self._growth_tracker = None
self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
def _check_scale_growth_tracker(self, funcname) -> Tuple[jt.Var, jt.Var]:
fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
assert self._scale is not None, "Attempted {} but _scale is None. ".format(funcname) + fix
assert self._growth_tracker is not None, "Attempted {} but _growth_tracker is None. ".format(funcname) + fix
return (self._scale, self._growth_tracker)
def _lazy_init_scale_growth_tracker(self):
assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
self._scale = self._init_scale
self._growth_tracker = self._init_growth_tracker
def scale(self, outputs):
"""
Multiplies ('scales') a tensor or list of tensors by the scale factor.
Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned
unmodified.
Args:
outputs (Tensor or iterable of Tensors): Outputs to scale.
"""
print("scale")
if not self._enabled:
return outputs
# Short-circuit for the common case.
if isinstance(outputs, jt.Var):
assert jt.flags.use_cuda == 1
if self._scale is None:
self._lazy_init_scale_growth_tracker()
assert self._scale is not None
return outputs * self._scale
def apply_scale(val):
if isinstance(val, jt.Var):
assert jt.flags.use_cuda == 1
if self._scale is None:
self._lazy_init_scale_growth_tracker()
assert self._scale is not None
return val * self._scale
elif isinstance(val, abc.Iterable):
iterable = map(apply_scale, val)
if isinstance(val, (list, tuple)):
return type(val)(iterable)
else:
return iterable
else:
raise ValueError("outputs must be a Tensor or an iterable of Tensors")
return apply_scale(outputs)
def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
# To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
# There could be hundreds of grads, so we'd like to iterate through them just once.
# However, we don't know their devices or dtypes in advance.
# https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
# Google says mypy struggles with defaultdicts type annotations.
with jt.no_grad():
optimizer.pre_step()
for group in optimizer.param_groups:
for to_unscale in group["grads"]:
if to_unscale is None or isinstance(to_unscale,(int,float)):
continue
if (not allow_fp16) and str(to_unscale.dtype) == "float16":
raise ValueError("Attempting to unscale FP16 gradients.")
if not (to_unscale.isinf().any()):
if inv_scale != 1.0:
to_unscale.update(to_unscale*inv_scale)
else:
found_inf = 1.0
return found_inf
def unscale_(self, optimizer):
"""
Divides ("unscales") the optimizer's gradient tensors by the scale factor.
:meth:`unscale_` is optional, serving cases where you need to
:ref:`modify or inspect gradients<working-with-unscaled-gradients>`
between the backward pass(es) and :meth:`step`.
If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
...
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
scaler.step(optimizer)
scaler.update()
Args:
optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.
.. note::
:meth:`unscale_` does not incur a CPU-GPU sync.
.. warning::
:meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
and only after all gradients for that optimizer's assigned parameters have been accumulated.
Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
.. warning::
:meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
"""
if not self._enabled:
return
self._check_scale_growth_tracker("unscale_")
optimizer_state = self._per_optimizer_states[id(optimizer)]
if optimizer_state["stage"] is OptState.UNSCALED:
raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
elif optimizer_state["stage"] is OptState.STEPPED:
raise RuntimeError("unscale_() is being called after step().")
# FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
assert self._scale is not None
inv_scale = 1.0 / self._scale
found_inf = 0.0
optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
optimizer_state["stage"] = OptState.UNSCALED
def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs):
retval = None
if not optimizer_state["found_inf_per_device"]:
retval = optimizer.step(*args, **kwargs)
else:
optimizer.post_step()
return retval
def step(self, optimizer, *args, **kwargs):
"""
:meth:`step` carries out the following two operations:
1. Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
earlier in the iteration). As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
2. If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
gradients. Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.
``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.
Returns the return value of ``optimizer.step(*args, **kwargs)``.
Args:
optimizer (torch.optim.Optimizer): Optimizer that applies the gradients.
args: Any arguments.
kwargs: Any keyword arguments.
.. warning::
Closure use is not currently supported.
"""
if (not self._enabled):
return optimizer.step(*args, **kwargs)
if "closure" in kwargs:
raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.")
self._check_scale_growth_tracker("step")
optimizer_state = self._per_optimizer_states[id(optimizer)]
if optimizer_state["stage"] is OptState.STEPPED:
raise RuntimeError("step() has already been called since the last update().")
retval = None
if (hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling):
# This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
# The contract with custom optimizers is that their step() should accept an additional,
# optional grad_scaler kwarg. We append self to the kwargs so the custom optimizer has full information:
# it can query its own state, invoke unscale_ on itself, etc
# The contract above is being deprecated to avoid introducing `grad_scaler: GradScaler` argument
# to `Optimizer.step`. The new behavior is going to add two Tensor attributes of `grad_scale`
# and `found_inf` to the passed optimizer so that the optimizer can utilize those
# to skip the parameter updates or unscale gradients before updating parameters in
# the fused kernel, e.g. `FusedAdamMathFunctor`.
# In this behavior, `GradScaler._check_inf_per_device` is called if `OptState.READY`,
# while the method is expected to be called by users side, i.e. their optimizers.
kwargs_ = kwargs
has_grad_scaler_kwarg = "grad_scaler" in inspect.signature(optimizer.step).parameters
if has_grad_scaler_kwarg:
warnings.warn(
"GradScaler is going to stop passing itself as a keyword argument to the passed "
"optimizer. In the near future GradScaler registers `grad_scale: Tensor` and "
"`found_inf: Tensor` to the passed optimizer and let the optimizer use them directly.",
FutureWarning)
kwargs_.update({"grad_scaler": self})
else:
if optimizer_state["stage"] is OptState.READY:
self._check_inf_per_device(optimizer)
scaler = self._get_scale_async()
# found_inf_per_device is a plain float in this port, not a per-device dict
found_inf = optimizer_state["found_inf_per_device"]
optimizer.grad_scale = None if optimizer_state["stage"] == OptState.UNSCALED else scaler
optimizer.found_inf = found_inf
retval = optimizer.step(*args, **kwargs_)
optimizer_state["stage"] = OptState.STEPPED
if not has_grad_scaler_kwarg:
del optimizer.grad_scale
del optimizer.found_inf
return retval
if optimizer_state["stage"] is OptState.READY:
self.unscale_(optimizer)
assert "found_inf_per_device" in optimizer_state, "No inf checks were recorded for this optimizer."
retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
optimizer_state["stage"] = OptState.STEPPED
return retval
def update(self, new_scale=None):
"""
Updates the scale factor.
If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
the scale is multiplied by ``growth_factor`` to increase it.
Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
used directly, it's used to fill GradScaler's internal scale tensor. So if
``new_scale`` was a tensor, later in-place changes to that tensor will not further
affect the scale GradScaler uses internally.)
Args:
new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None): New scale factor.
.. warning::
:meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
been invoked for all optimizers used this iteration.
"""
if not self._enabled:
return
_scale, _growth_tracker = self._check_scale_growth_tracker("update")
if new_scale is not None:
# Accept a new user-defined scale.
if isinstance(new_scale, float):
# _scale is a plain Python float in this port, so assign rather than fill_
self._scale = new_scale
else:
reason = "new_scale should be a float or a 1-element jt.Var with requires_grad=False."
assert isinstance(new_scale, jt.Var), reason
assert new_scale.numel() == 1, reason
assert new_scale.requires_grad is False, reason
self._scale = float(new_scale.item())
else:
# Consume shared inf/nan data collected from optimizers to update the scale.
# If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
found_infs = [state["found_inf_per_device"]
for state in self._per_optimizer_states.values()
]
assert len(found_infs) > 0, "No inf checks were recorded prior to update."
found_inf_combined = found_infs[0]
if len(found_infs) > 1:
for i in range(1, len(found_infs)):
found_inf_combined += found_infs[i]
current_scale = _scale
if found_inf_combined:
current_scale *= self._backoff_factor
_growth_tracker = 0
else:
successful = _growth_tracker + 1
if successful == self._growth_interval:
new_scale = current_scale * self._growth_factor
if new_scale < 1e9:
current_scale = new_scale
_growth_tracker = 0
else:
_growth_tracker = successful
self._scale, self._growth_tracker = current_scale, _growth_tracker
# To prepare for next iteration, clear the data collected from optimizers this iteration.
self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
def _get_scale_async(self):
return self._scale
def get_scale(self):
"""
Returns a Python float containing the current scale, or 1.0 if scaling is disabled.
.. warning::
:meth:`get_scale` incurs a CPU-GPU sync.
"""
if self._enabled:
return self._init_scale if self._scale is None else self._get_scale_async()
else:
return 1.0
def get_growth_factor(self):
r"""
Returns a Python float containing the scale growth factor.
"""
return self._growth_factor
def set_growth_factor(self, new_factor):
r"""
Args:
new_scale (float): Value to use as the new scale growth factor.
"""
self._growth_factor = new_factor
def get_backoff_factor(self):
r"""
Returns a Python float containing the scale backoff factor.
"""
return self._backoff_factor
def set_backoff_factor(self, new_factor):
r"""
Args:
new_scale (float): Value to use as the new scale backoff factor.
"""
self._backoff_factor = new_factor
def get_growth_interval(self):
r"""
Returns a Python int containing the growth interval.
"""
return self._growth_interval
def set_growth_interval(self, new_interval):
r"""
Args:
new_interval (int): Value to use as the new growth interval.
"""
self._growth_interval = new_interval
def _get_growth_tracker(self):
if self._enabled:
return self._init_growth_tracker if self._growth_tracker is None else int(self._growth_tracker)
else:
return 0
def is_enabled(self):
r"""
Returns a bool indicating whether this instance is enabled.
"""
return self._enabled
def state_dict(self):
r"""
Returns the state of the scaler as a :class:`dict`. It contains five entries:
* ``"scale"`` - a Python float containing the current scale
* ``"growth_factor"`` - a Python float containing the current growth factor
* ``"backoff_factor"`` - a Python float containing the current backoff factor
* ``"growth_interval"`` - a Python int containing the current growth interval
* ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.
If this instance is not enabled, returns an empty dict.
.. note::
If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
should be called after :meth:`update`.
"""
return {"scale": self.get_scale(),
"growth_factor": self._growth_factor,
"backoff_factor": self._backoff_factor,
"growth_interval": self._growth_interval,
"_growth_tracker": self._get_growth_tracker()} if self._enabled else {}
def load_state_dict(self, state_dict):
r"""
Loads the scaler state. If this instance is disabled, :meth:`load_state_dict` is a no-op.
Args:
state_dict(dict): scaler state. Should be an object returned from a call to :meth:`state_dict`.
"""
if not self._enabled:
return
if len(state_dict) == 0:
raise RuntimeError("The source state dict is empty, possibly because it was saved "
"from a disabled instance of GradScaler.")
self._init_scale = state_dict["scale"]
if self._scale is not None:
self._scale = float(state_dict["scale"])
self._growth_factor = state_dict["growth_factor"]
self._backoff_factor = state_dict["backoff_factor"]
self._growth_interval = state_dict["growth_interval"]
self._init_growth_tracker = state_dict["_growth_tracker"]
if self._growth_tracker is not None:
self._growth_tracker = int(state_dict["_growth_tracker"])
def __getstate__(self):
state = self.__dict__.copy()
if self._enabled:
assert len(self._per_optimizer_states) == 0, "A GradScaler instance may only be pickled at the beginning "\
"of an iteration, or at the end after scaler.update()."
# Pickling _scale and _growth_tracker Tensors directly triggers
# "warnings.warn("pickle support for Storage will be removed in 1.5..."
# so instead, we set the unpickled instance up to reinitialize them lazily.
state['_init_scale'] = self.get_scale()
state['_init_growth_tracker'] = self._get_growth_tracker()
state['_scale'] = None
state['_growth_tracker'] = None
return state
def __setstate__(self, state):
self.__dict__.update(state)
def _check_inf_per_device(self, optimizer):
_scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")
dummy_inv_scale = 1.0
found_inf = 0.0
self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = \
self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
def _found_inf_per_device(self, optimizer):
return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
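# --- Hedged usage sketch (checkpointing a scaler) ---
#   scaler = GradScaler()
#   state = scaler.state_dict()      # plain dict of floats/ints, safe to pickle
#   restored = GradScaler()
#   restored.load_state_dict(state)  # restores scale and growth bookkeeping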

View File

@ -0,0 +1,12 @@
import math
def _jit_set_profiling_mode(x): pass
def _jit_set_profiling_executor(x): pass
def _jit_override_can_fuse_on_cpu(x): pass
def _jit_override_can_fuse_on_gpu(x): pass
def script(func):
return func
inf = math.inf
nan = math.nan
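# --- Hedged usage sketch: script() is an identity decorator ---
#   @script
#   def f(x):
#       return x + 1
#   assert f(1) == 2   # runs eagerly; no TorchScript compilation happens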

View File

@ -0,0 +1,281 @@
import jtorch
from typing import List, Optional, Tuple, Iterable, Iterator, Mapping, Any, overload, TypeVar, Dict
from typing_extensions import Self
import jittor as jt
from jtorch import make_module, Tensor, ModuleMisc, wrapper
#from . import init
from jittor import Function
import operator
import warnings
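# Re-export jt.nn: plain callables get the Tensor-compat wrapper below, and
# Module subclasses are then re-bound via make_module (the second loop wins).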
for k,v in jt.nn.__dict__.items():
if callable(v):
globals()[k] = wrapper(v)
for k,v in jt.nn.__dict__.items():
if isinstance(v, type) and issubclass(v, jt.Module):
globals()[k] = make_module(v)
from collections import OrderedDict
from collections import abc as container_abcs
class Module(ModuleMisc, jt.Module):
def __call__(self, *args, **kw):
return self.execute(*args, **kw)
def execute(self, *args, **kw):
return self.forward(*args, **kw)
def get_submodule(self, target: str):
if target == "":
return self
atoms: List[str] = target.split(".")
mod: jt.nn.Module = self
for item in atoms:
if not hasattr(mod, item):
raise AttributeError(mod._get_name() + " has no "
"attribute `" + item + "`")
mod = getattr(mod, item)
if not isinstance(mod, jt.nn.Module):
raise AttributeError("`" + item + "` is not "
"an nn.Module")
return mod
def Parameter(x:Tensor, requires_grad:bool=True) -> Tensor:
x = x.clone()
x.requires_grad = requires_grad
x.retains_grad = requires_grad
return x
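# --- Hedged usage sketch ---
#   w = Parameter(jt.randn(3, 3))                         # cloned, requires_grad=True
#   frozen = Parameter(jt.zeros(3), requires_grad=False)  # non-trainable copy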
def embedding(input, weight, padding_idx=None, max_norm=None, norm_type=2.0, scale_grad_by_freq=False, sparse=False):
return jt.nn.embedding(input, weight)
def dropout(x, p=0.5, training=False):
return jt.nn.dropout(x, p, training)
class Flatten(Module):
''' Flattens the contiguous range of dimensions in a Var.
:param start_dim: the first dimension to be flattened. Defaults: 1.
:type start_dim: int
:param end_dim: the last dimension to be flattened. Defaults: -1.
:type end_dim: int
'''
def __init__(self, start_dim=1, end_dim=-1):
self.start_dim = start_dim
self.end_dim = end_dim
def forward(self, x) -> jt.Var:
return x.flatten(self.start_dim, self.end_dim)
class _IncompatibleKeys:
def __init__(self, missing_keys, unexpected_keys):
self.missing_keys = missing_keys
self.unexpected_keys = unexpected_keys
_BatchNorm = None
#from . import utils
normalize = wrapper(jt.normalize)
T = TypeVar('T', bound=Module)
class ModuleDict(Module):
_modules: Dict[str, Module] # type: ignore[assignment]
def __init__(self, modules: Optional[Mapping[str, Module]] = None) -> None:
super().__init__()
if modules is not None:
self.update(modules)
def __getitem__(self, key: str) -> Module:
return self._modules[key]
def __setitem__(self, key: str, module: Module) -> None:
self.add_module(key, module)
def __delitem__(self, key: str) -> None:
del self._modules[key]
def __len__(self) -> int:
return len(self._modules)
def __iter__(self) -> Iterator[str]:
return iter(self._modules)
def __contains__(self, key: str) -> bool:
return key in self._modules
def clear(self) -> None:
"""Remove all items from the ModuleDict."""
self._modules.clear()
def pop(self, key: str) -> Module:
r"""Remove key from the ModuleDict and return its module.
Args:
key (str): key to pop from the ModuleDict
"""
v = self[key]
del self[key]
return v
def keys(self) -> Iterable[str]:
r"""Return an iterable of the ModuleDict keys."""
return self._modules.keys()
def items(self) -> Iterable[Tuple[str, Module]]:
r"""Return an iterable of the ModuleDict key/value pairs."""
return self._modules.items()
def values(self) -> Iterable[Module]:
r"""Return an iterable of the ModuleDict values."""
return self._modules.values()
def update(self, modules: Mapping[str, Module]) -> None:
r"""Update the :class:`~torch.nn.ModuleDict` with key-value pairs from a mapping, overwriting existing keys.
.. note::
If :attr:`modules` is an ``OrderedDict``, a :class:`~torch.nn.ModuleDict`, or
an iterable of key-value pairs, the order of new elements in it is preserved.
Args:
modules (iterable): a mapping (dictionary) from string to :class:`~torch.nn.Module`,
or an iterable of key-value pairs of type (string, :class:`~torch.nn.Module`)
"""
if not isinstance(modules, container_abcs.Iterable):
raise TypeError("ModuleDict.update should be called with an "
"iterable of key/value pairs, but got " +
type(modules).__name__)
if isinstance(modules, (OrderedDict, ModuleDict, container_abcs.Mapping)):
for key, module in modules.items():
self[key] = module
else:
# modules here can be a list with two items
for j, m in enumerate(modules):
if not isinstance(m, container_abcs.Iterable):
raise TypeError("ModuleDict update sequence element "
"#" + str(j) + " should be Iterable; is" +
type(m).__name__)
if not len(m) == 2:
raise ValueError("ModuleDict update sequence element "
"#" + str(j) + " has length " + str(len(m)) +
"; 2 is required")
# modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
# that's too cumbersome to type correctly with overloads, so we add an ignore here
self[m[0]] = m[1] # type: ignore[assignment]
# remove forward altogether to fall back on Module's _forward_unimplemented
class ParameterList(Module):
def __init__(self, values: Optional[Iterable[Any]] = None) -> None:
super().__init__()
self._size = 0
if values is not None:
self += values
def _get_abs_string_index(self, idx):
"""Get the absolute index for the list of modules."""
idx = operator.index(idx)
if not (-len(self) <= idx < len(self)):
raise IndexError(f'index {idx} is out of range')
if idx < 0:
idx += len(self)
return str(idx)
@overload
def __getitem__(self, idx: int) -> Any:
...
@overload
def __getitem__(self: T, idx: slice) -> T:
...
def __getitem__(self, idx):
if isinstance(idx, slice):
start, stop, step = idx.indices(len(self))
out = self.__class__()
for i in range(start, stop, step):
out.append(self[i])
return out
else:
idx = self._get_abs_string_index(idx)
return getattr(self, str(idx))
def __setitem__(self, idx: int, param: Any) -> None:
# Note that all other function that add an entry to the list part of
# the ParameterList end up here. So this is the only place where we need
# to wrap things into Parameter if needed.
# Objects added via setattr() are not in the list part and thus won't
# call into this function.
idx = self._get_abs_string_index(idx)
# Parameter is a function in this port, not a type, so isinstance() cannot be
# used; wrap any Var that is not already marked as requiring grad
if isinstance(param, jt.Var) and not getattr(param, "requires_grad", False):
param = Parameter(param)
return setattr(self, str(idx), param)
def __len__(self) -> int:
return self._size
def __iter__(self) -> Iterator[Any]:
return iter(self[i] for i in range(len(self)))
def __iadd__(self, parameters: Iterable[Any]) -> Self:
return self.extend(parameters)
def __dir__(self):
keys = super().__dir__()
keys = [key for key in keys if not key.isdigit()]
return keys
def append(self, value: Any) -> 'ParameterList':
"""Append a given value at the end of the list.
Args:
value (Any): value to append
"""
new_idx = len(self)
self._size += 1
self[new_idx] = value
return self
def extend(self, values: Iterable[Any]) -> Self:
"""Append values from a Python iterable to the end of the list.
Args:
values (iterable): iterable of values to append
"""
# Tensor is an iterable but we never want to unpack it here
if not isinstance(values, container_abcs.Iterable) or isinstance(values, jt.Var):
raise TypeError("ParameterList.extend should be called with an "
"iterable, but got " + type(values).__name__)
for value in values:
self.append(value)
return self
def extra_repr(self) -> str:
child_lines = []
for k, p in enumerate(self):
if isinstance(p, jt.Var):
size_str = 'x'.join(str(size) for size in p.size())
parastr = '{} containing: [{} of size {}{}]'.format(
"Parameter" if isinstance(p, Parameter) else "Tensor",
p.dtype, size_str, "cuda" if jt.flags.use_cuda else "cpu")
child_lines.append(' (' + str(k) + '): ' + parastr)
else:
child_lines.append(' (' + str(k) + '): Object of type: ' + type(p).__name__)
tmpstr = '\n'.join(child_lines)
return tmpstr
def __call__(self, *args, **kwargs):
raise RuntimeError('ParameterList should not be called.')
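# --- Hedged usage sketch ---
#   plist = ParameterList([jt.randn(4), jt.randn(4)])
#   plist.append(jt.randn(4))
#   total = sum(p.sum() for p in plist)   # iterates the three stored parameters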

View File

@ -0,0 +1,16 @@
import jittor as jt
for k,v in jt.nn.init.__dict__.items():
if callable(v):
globals()[k] = v
normal = gauss
normal_ = gauss_
xavier_normal = xavier_gauss
xavier_normal_ = xavier_gauss_
zeros_ = zero_
jt.Var.normal_ = normal_
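# --- Hedged usage sketch, assuming jittor's gauss_/xavier_gauss_ default args ---
#   w = jt.zeros((3, 3))
#   normal_(w)           # alias of jt.nn.init.gauss_
#   xavier_normal_(w)    # alias of jt.nn.init.xavier_gauss_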

View File

@ -0,0 +1 @@
from . import rnn

View File

@ -0,0 +1,20 @@
import jittor as jt
PackedSequence = None
def pad_sequence(sequences,batch_first=False,padding_value=0.0):
max_f = max([len(s) for s in sequences])
# max_f = 512
b = len(sequences)
if batch_first:
ret = sequences[0].new_full([b,max_f,]+list(sequences[0].shape[1:]),padding_value)
for i,s in enumerate(sequences):
ret[i,:len(s)] = s
else:
ret = sequences[0].new_full([max_f,b,]+list(sequences[0].shape[1:]),padding_value)
for i,s in enumerate(sequences):
ret[:len(s),i] = s
# print(ret.shape)
# ret = ret[:,:406]
return ret
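# --- Hedged usage sketch, assuming Vars expose new_full as used above ---
#   a, b = jt.ones(3, 5), jt.ones(2, 5)
#   out = pad_sequence([a, b], batch_first=True, padding_value=0.0)
#   # out.shape == [2, 3, 5]; out[1, 2] is all zeros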

File diff suppressed because it is too large

View File

@ -0,0 +1,102 @@
#include "pyjt/py_obj_holder.h"
#include "utils/str_utils.h"
#include "jtorch_core.h"
#include "graph.h"
#include "grad.h"
#include "ops/op_register.h"
namespace jittor {
void pyjt_def_all(PyObject* m);
EXTERN_LIB void setter_use_cuda(int value);
Device::Device(const string& name, int ordinal) : name(name) {
if (startswith(name, "cpu"))
setter_use_cuda(0);
else
setter_use_cuda(1);
}
unordered_map<int64, VarPtr> grad_backup;
EXTERN_LIB void (*_var_free_hook)(Var*);
EXTERN_LIB unordered_map<int64, VarPtr>* _grad_backup_ptr;
void jtorch_var_free_hook(Var* v) {
auto iter = grad_backup.find(v->id);
if (iter != grad_backup.end()) {
grad_backup.erase(iter);
}
}
void jtorch_init() {
_var_free_hook = &jtorch_var_free_hook;
_grad_backup_ptr = &grad_backup;
}
inline static VarPtr& get_grad(Var* v) {
return grad_backup[v->id];
}
static auto make_binary = get_op_info("binary")
.get_constructor<VarPtr, Var*, Var*, NanoString>();
inline static void add_grad(VarPtr& a, VarPtr&& b) {
if (!a) a = move(b);
else {
a = make_binary(a, b, ns_add);
}
}
void grad_set(VarHolder* x, Maybe<VarHolder> v) {
if (!v) {
grad_del(x);
return;
}
grad_backup[x->var->id] = v.ptr->var;
}
Maybe<VarHolder> grad_get(VarHolder* x) {
auto iter = grad_backup.find(x->var->id);
if (iter != grad_backup.end()) {
if (!iter->second.ptr) return nullptr;
return new VarHolder(iter->second.ptr);
}
return nullptr;
}
void grad_del(VarHolder* x) {
auto iter = grad_backup.find(x->var->id);
if (iter != grad_backup.end())
grad_backup.erase(iter);
}
void backward(VarHolder* x) {
vector<Node*> gnodes({x->var});
bfs_backward(gnodes, [&](Node* node) {
if (node->is_stop_grad())
return false;
return true;
});
vector<Var*> targets;
for (auto* node : gnodes) {
if (node->is_var() && node->flags.get(NodeFlags::_th_require_grad))
targets.push_back(node->var());
}
auto grads = grad(x->var, targets);
for (int i=0; i<targets.size(); i++) {
auto& gptr = get_grad(targets[i]);
add_grad(gptr, move(grads[i]));
}
}
}
static void init_module(PyModuleDef* mdef, PyObject* m) {
jittor::jtorch_init();
mdef->m_doc = "Inner c++ core of jtorch";
jittor::pyjt_def_all(m);
}
PYJT_MODULE_INIT(jtorch_core);

View File

@ -0,0 +1,40 @@
#pragma once
#include "common.h"
#include "var_holder.h"
#include "misc/fast_shared_ptr.h"
namespace jittor {
// @pyjt(device)
// @attrs(heaptype)
struct Device {
string name;
// @pyjt(__init__)
Device(const string& name, int ordinal=0);
// @pyjt(__get__type, __str__)
inline string get_type() {return name;}
// @pyjt(__get__index)
inline int index() {return 0;}
};
// @pyjt(backward)
void backward(VarHolder* x);
// @pyjt(grad_set)
void grad_set(VarHolder* x, Maybe<VarHolder> v);
// @pyjt(grad_get)
Maybe<VarHolder> grad_get(VarHolder* x);
// @pyjt(grad_del)
void grad_del(VarHolder* x);
// @pyjt(retain_grad_set)
inline void retain_grad_set(VarHolder* x, bool v) {
x->var->flags.set(NodeFlags::_th_require_grad, v);
}
// @pyjt(retain_grad_get)
inline bool retain_grad_get(VarHolder* x) {
return x->var->flags.get(NodeFlags::_th_require_grad);
}
}
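// --- Hedged usage sketch (assumption): the @pyjt annotations above suggest
// these entry points surface in Python roughly as:
//   import jtorch_core as core
//   core.retain_grad_set(x, True)  # mark x so backward() accumulates its grad
//   core.backward(loss)            # reverse pass from loss
//   g = core.grad_get(x)           # accumulated gradient, or None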

View File

@ -0,0 +1,25 @@
import unittest
import numpy as np
import torch
import jittor as jt
class TestConflictFunc(unittest.TestCase):
def test_max(self):
a = torch.Tensor([1,4,2])
assert a.max() == 4
v, k = a.max(dim=0)
assert v==4 and k==1
def test_argsort(self):
a = torch.Tensor([1,4,2])
k = a.argsort()
assert jt.all_equal(k, [0,2,1])
with jt.flag_scope(th_mode=0):
k, v = a.argsort()
assert jt.all_equal(k, [0,2,1])
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,58 @@
import unittest
import numpy as np
import torch
class TestFunction(unittest.TestCase):
def test_example1(self):
import jtorch
from jtorch import Function
class MyFunc(Function):
@staticmethod
def forward(self, x, y):
self.x = x
self.y = y
return x*y, x/y
@staticmethod
def backward(self, grad0, grad1):
return grad0 * self.y, grad1 * self.x
a = jtorch.array(3.0)
a.requires_grad = True
b = jtorch.array(4.0)
b.requires_grad = True
func = MyFunc.apply
c,d = func(a, b)
(c+d*3).backward()
assert a.grad.data == 4
assert b.grad.data == 9
def test_example2(self):
import jtorch as jt
from jtorch import Function
class MyFunc(Function):
@staticmethod
def forward(self, x, y):
self.x = x
self.y = y
return x*y, x/y
@staticmethod
def backward(self, grad0, grad1):
assert grad1 is None
return grad0 * self.y, None
a = jt.array(3.0)
a.requires_grad = True
b = jt.array(4.0)
b.requires_grad = True
func = MyFunc.apply
c,d = func(a, b)
d.stop_grad()
da, db = jt.grad(c+d*3, [a, b])
assert da.data == 4
assert db.data == 0
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,24 @@
import unittest
import numpy as np
import torch
class TestMisc(unittest.TestCase):
def test_update_grad(self):
class Net(torch.nn.Module):
def __init__(self):
super().__init__()
self.a = torch.nn.Parameter(torch.Tensor([1.0, 2.0]))
net = Net()
assert(net.a.requires_grad)
net.load_state_dict({"a": torch.Tensor([3.0, 4.0])})
assert(net.a.requires_grad)
def test_reshape(self):
a = torch.ones(3,3)
a.requires_grad = True
b = torch.reshape(a, [9])
assert b.requires_grad == True
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,56 @@
import unittest
import numpy as np
import os
import subprocess as sp
import sys
def check_two(cmd, parser=None, checker=None):
jtorch_out = sp.getoutput(cmd)
print("=========JTORCH OUT==========")
print(jtorch_out)
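    # Clearing PYTHONPATH drops the jtorch shim from the import path, so the
    # same script runs against the real torch installation as a reference.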
torch_out = sp.getoutput("PYTHONPATH= "+cmd)
print("=========TORCH OUT==========")
print(torch_out)
if parser:
torch_out = parser(torch_out)
jtorch_out = parser(jtorch_out)
if checker:
checker(torch_out, jtorch_out)
else:
assert torch_out == jtorch_out
return jtorch_out, torch_out
jtorch_path = os.path.join(os.path.dirname(__file__), "..")
# adapted from https://pytorch.org/tutorials/beginner/pytorch_with_examples.html
class TestTutorial(unittest.TestCase):
def test_auto_grad1(self):
check_two(f"{sys.executable} {jtorch_path}/tutorial/auto_grad1.py",
parser=lambda s: np.array(s.split())[[-10,-8,-5,-2]].astype(float),
checker=lambda a,b: np.testing.assert_allclose(a, b, atol=1e-4))
def test_auto_grad2(self):
check_two(f"{sys.executable} {jtorch_path}/tutorial/auto_grad2.py",
parser=lambda s: np.array(s.split())[[-10,-8,-5,-2]].astype(float),
checker=lambda a,b: np.testing.assert_allclose(a, b, atol=1e-4))
def test_auto_grad3(self):
check_two(f"{sys.executable} {jtorch_path}/tutorial/auto_grad3.py",
parser=lambda s: np.array(s.split())[[-9,-7,-4,-2]].astype(float),
checker=lambda a,b: np.testing.assert_allclose(a, b, atol=1e-4))
def test_auto_grad4(self):
check_two(f"{sys.executable} {jtorch_path}/tutorial/auto_grad4.py",
parser=lambda s: np.array(s.split())[[-10,-8,-5,-2]].astype(float),
checker=lambda a,b: np.testing.assert_allclose(a, b, atol=1e-4))
def test_auto_grad5(self):
check_two(f"{sys.executable} {jtorch_path}/tutorial/auto_grad5_optim.py",
parser=lambda s: np.array(s.split())[[-10,-8,-5,-2]].astype(float),
checker=lambda a,b: np.testing.assert_allclose(a, b, atol=1e-2))
def test_auto_grad6(self):
check_two(f"{sys.executable} {jtorch_path}/tutorial/auto_grad6_module.py",
parser=lambda s: np.array(s.split())[[-10,-8,-5,-2]].astype(float),
checker=lambda a,b: np.testing.assert_allclose(a, b, atol=1e-4))
def test_auto_grad7(self):
check_two(f"{sys.executable} {jtorch_path}/tutorial/auto_grad7_dynet.py",
parser=lambda s: np.array(s.split())[[-13,-10,-7,-3]].astype(float),
checker=lambda a,b: np.testing.assert_allclose(a, b, atol=1e-2))
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,44 @@
import torch
import math
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# Create random input and output data
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)
# Randomly initialize weights
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)
learning_rate = 1e-6
for t in range(20000):
# Forward pass: compute predicted y
y_pred = a + b * x + c * x ** 2 + d * x ** 3
# Compute and print loss
loss = (y_pred - y).pow(2).sum().item()
if t % 1000 == 999:
print(t, loss)
# Backprop to compute gradients of a, b, c, d with respect to loss
grad_y_pred = 2.0 * (y_pred - y)
grad_a = grad_y_pred.sum()
grad_b = (grad_y_pred * x).sum()
grad_c = (grad_y_pred * x ** 2).sum()
grad_d = (grad_y_pred * x ** 3).sum()
# Update weights using gradient descent
a -= learning_rate * grad_a
b -= learning_rate * grad_b
c -= learning_rate * grad_c
d -= learning_rate * grad_d
# print(t, torch.liveness_info())
# torch.sync_all()
print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

View File

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
import torch
import math
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)
# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)
learning_rate = 1e-6
for t in range(20000):
# Forward pass: compute predicted y using operations on Tensors.
y_pred = a + b * x + c * x ** 2 + d * x ** 3
# print(y_pred.requires_grad)
# y_pred.requires_grad = False
# Compute and print loss using operations on Tensors.
# Now loss is a Tensor of shape (1,)
# loss.item() gets the scalar value held in the loss.
loss = (y_pred - y).pow(2).sum()
    if t % 1000 == 999:
print(t, loss.item())
# Use autograd to compute the backward pass. This call will compute the
# gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad, c.grad and d.grad will be Tensors holding
# the gradient of the loss with respect to a, b, c, d respectively.
# torch.backward(loss)
loss.backward()
# Manually update weights using gradient descent. Wrap in torch.no_grad()
# because weights have requires_grad=True, but we don't need to track this
# in autograd.
with torch.no_grad():
a -= learning_rate * a.grad
b -= learning_rate * b.grad
c -= learning_rate * c.grad
d -= learning_rate * d.grad
# Manually zero the gradients after updating weights
a.grad = None
b.grad = None
c.grad = None
d.grad = None
print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

View File

@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
import torch
import math
class LegendrePolynomial3(torch.autograd.Function):
"""
We can implement our own custom autograd Functions by subclassing
torch.autograd.Function and implementing the forward and backward passes
which operate on Tensors.
"""
@staticmethod
def forward(ctx, input):
"""
In the forward pass we receive a Tensor containing the input and return
a Tensor containing the output. ctx is a context object that can be used
to stash information for backward computation. You can cache arbitrary
objects for use in the backward pass using the ctx.save_for_backward method.
"""
ctx.save_for_backward(input)
return 0.5 * (5 * input ** 3 - 3 * input)
@staticmethod
def backward(ctx, grad_output):
"""
In the backward pass we receive a Tensor containing the gradient of the loss
with respect to the output, and we need to compute the gradient of the loss
with respect to the input.
"""
input, = ctx.saved_tensors
return grad_output * 1.5 * (5 * input ** 2 - 1)
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU
# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)
# Create random Tensors for weights. For this example, we need
# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized
# not too far from the correct result to ensure convergence.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)
learning_rate = 5e-6
for t in range(2000):
# To apply our Function, we use Function.apply method. We alias this as 'P3'.
P3 = LegendrePolynomial3.apply
# Forward pass: compute predicted y using operations; we compute
# P3 using our custom autograd operation.
y_pred = a + b * P3(c + d * x)
# Compute and print loss
loss = (y_pred - y).pow(2).sum()
if t % 100 == 99:
print(t, loss.item())
# Use autograd to compute the backward pass.
loss.backward()
# Update weights using gradient descent
with torch.no_grad():
a -= learning_rate * a.grad
b -= learning_rate * b.grad
c -= learning_rate * c.grad
d -= learning_rate * d.grad
# Manually zero the gradients after updating weights
a.grad = None
b.grad = None
c.grad = None
d.grad = None
print(f'Result: y = {a.item()} + {b.item()} * P3( {c.item()} + {d.item()} x)')

View File

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
import torch
import math
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)
# For this example, the output y is a linear function of (x, x^2, x^3), so
# we can consider it as a linear layer neural network. Let's prepare the
# tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)
# In the above code, x.unsqueeze(-1) has shape (2000, 1), and p has shape
# (3,), for this case, broadcasting semantics will apply to obtain a tensor
# of shape (2000, 3)
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. The Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# The Flatten layer flattens the output of the linear layer to a 1D tensor,
# to match the shape of `y`.
model = torch.nn.Sequential(
torch.nn.Linear(3, 1),
torch.nn.Flatten(0, 1)
)
# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')
# print(model[0].weight.requires_grad)
learning_rate = 1e-6
for t in range(8000):
# Forward pass: compute predicted y by passing x to the model. Module objects
# override the __call__ operator so you can call them like functions. When
# doing so you pass a Tensor of input data to the Module and it produces
# a Tensor of output data.
y_pred = model(xx)
# Compute and print loss. We pass Tensors containing the predicted and true
# values of y, and the loss function returns a Tensor containing the
# loss.
loss = loss_fn(y_pred, y)
if t % 1000 == 999:
print(t, loss.item())
# Zero the gradients before running the backward pass.
model.zero_grad()
# Backward pass: compute gradient of the loss with respect to all the learnable
# parameters of the model. Internally, the parameters of each Module are stored
# in Tensors with requires_grad=True, so this call will compute gradients for
# all learnable parameters in the model.
loss.backward()
# Update the weights using gradient descent. Each parameter is a Tensor, so
# we can access its gradients like we did before.
with torch.no_grad():
for param in model.parameters():
param -= learning_rate * param.grad
# You can access the first layer of `model` like accessing the first item of a list
linear_layer = model[0]
# For linear layer, its parameters are stored as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

View File

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
import torch
import math
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)
# Prepare the input tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)
# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
torch.nn.Linear(3, 1),
torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')
# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use RMSprop; the optim package contains many other
# optimization algorithms. The first argument to the RMSprop constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(8000):
# Forward pass: compute predicted y by passing x to the model.
y_pred = model(xx)
# Compute and print loss.
loss = loss_fn(y_pred, y)
if t % 1000 == 999:
print(t, loss.item())
# Before the backward pass, use the optimizer object to zero all of the
# gradients for the variables it will update (which are the learnable
# weights of the model). This is because by default, gradients are
    # accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. Check out the docs of torch.autograd.backward for more details.
optimizer.zero_grad()
# Backward pass: compute gradient of the loss with respect to model
# parameters
loss.backward()
# Calling the step function on an Optimizer makes an update to its
# parameters
optimizer.step()
linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

View File

@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
import torch
import math
class Polynomial3(torch.nn.Module):
def __init__(self):
"""
In the constructor we instantiate four parameters and assign them as
member parameters.
"""
super().__init__()
self.a = torch.nn.Parameter(torch.randn(()))
self.b = torch.nn.Parameter(torch.randn(()))
self.c = torch.nn.Parameter(torch.randn(()))
self.d = torch.nn.Parameter(torch.randn(()))
def forward(self, x):
"""
In the forward function we accept a Tensor of input data and we must return
a Tensor of output data. We can use Modules defined in the constructor as
well as arbitrary operators on Tensors.
"""
return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
def string(self):
"""
Just like any class in Python, you can also define custom method on PyTorch modules
"""
return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)
# Construct our model by instantiating the class defined above
model = Polynomial3()
# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters (defined
# with torch.nn.Parameter) which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(8000):
# Forward pass: Compute predicted y by passing x to the model
y_pred = model(x)
# Compute and print loss
loss = criterion(y_pred, y)
if t % 1000 == 999:
print(t, loss.item())
# Zero gradients, perform a backward pass, and update the weights.
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(f'Result: {model.string()}')

View File

@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-
import random
import torch
import math
class DynamicNet(torch.nn.Module):
def __init__(self):
"""
In the constructor we instantiate five parameters and assign them as members.
"""
super().__init__()
self.a = torch.nn.Parameter(torch.randn(()))
self.b = torch.nn.Parameter(torch.randn(()))
self.c = torch.nn.Parameter(torch.randn(()))
self.d = torch.nn.Parameter(torch.randn(()))
self.e = torch.nn.Parameter(torch.randn(()))
def forward(self, x):
"""
        For the forward pass of the model, we randomly choose a polynomial
        order of 3, 4, or 5, reusing the e parameter to compute the
        contribution of the higher-order terms.
Since each forward pass builds a dynamic computation graph, we can use normal
Python control-flow operators like loops or conditional statements when
defining the forward pass of the model.
Here we also see that it is perfectly safe to reuse the same parameter many
times when defining a computational graph.
"""
y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
for exp in range(4, random.randint(4, 6)):
y = y + self.e * x ** exp
return y
def string(self):
"""
Just like any class in Python, you can also define custom method on PyTorch modules
"""
return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'
# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)
# Construct our model by instantiating the class defined above
model = DynamicNet()
# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
for t in range(60000):
# Forward pass: Compute predicted y by passing x to the model
y_pred = model(x)
# Compute and print loss
loss = criterion(y_pred, y)
if t % 2000 == 1999:
print(t, loss.item())
# Zero gradients, perform a backward pass, and update the weights.
optimizer.zero_grad()
loss.backward()
optimizer.step()
# print(torch.liveness_info())
print(f'Result: {model.string()}')

View File

@ -0,0 +1,106 @@
import torch
from torch import nn
# from jtorch.utils import DataLoader
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
# Download training data from open datasets.
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor(),
)
# Download test data from open datasets.
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor(),
)
batch_size = 64
# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
print(len(train_dataloader))
for X, y in test_dataloader:
print(f"Shape of X [N, C, H, W]: {X.shape}")
print(f"Shape of y: {y.shape} {y.dtype}")
break
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
# Define model
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10)
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
model = NeuralNetwork().to(device)
print(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
model.train()
for batch, (X, y) in enumerate(dataloader):
X, y = X.to(device), y.to(device)
# Compute prediction error
pred = model(X)
loss = loss_fn(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to(device), y.to(device)
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
epochs = 5
test(test_dataloader, model, loss_fn)
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(train_dataloader, model, loss_fn, optimizer)
test(test_dataloader, model, loss_fn)
print("Done!")

View File

@ -0,0 +1,5 @@
cpp_extension = None
_flatten_dense_tensors = None
_unflatten_dense_tensors = None
tensorboard = None

View File

@ -0,0 +1,3 @@
#TODO: Implement this
_register_pytree_node = None
_dict_flatten = None

View File

@ -0,0 +1,8 @@
detach_variable = None
def checkpoint(
*args,
**kwargs
):
pass

View File

@ -0,0 +1,137 @@
import jittor as jt
import jittor.dataset
from jittor.dataset import Dataset as JDataset
from collections import namedtuple
from typing import Any, Callable, Iterable, Optional, Sequence, Union
class Dataset:
def __getitem__(self, index):
raise NotImplementedError
class IterableDataset:
def __iter__(self):
raise NotImplementedError
class DataLoader(JDataset):
def __init__(self, dataset,
batch_size: Optional[int] = 1,
shuffle: Optional[bool] = False,
sampler = None,
batch_sampler = None,
num_workers: int = 0,
collate_fn = None,
pin_memory: bool = False,
drop_last: bool = False,
timeout: float = 0,
worker_init_fn = None,
multiprocessing_context=None,
generator=None,
*, prefetch_factor: int = 2,
persistent_workers: bool = False,
pin_memory_device: str = "") -> None:
super().__init__(batch_size=batch_size,
shuffle=shuffle,
num_workers=num_workers,
drop_last=drop_last)
unsupported_kwargs = {
"batch_sampler": batch_sampler,
"pin_memory": pin_memory,
"timeout": timeout,
"worker_init_fn": worker_init_fn,
"multiprocessing_context": multiprocessing_context,
"generator": generator,
"persistent_workers": persistent_workers,
"pin_memory_device": pin_memory_device
}
for kwarg, value in unsupported_kwargs.items():
if value:
jt.LOG.w(f"Not implemented Dataloader kwarg: {kwarg}")
self.dataset = dataset
self.collate_fn = collate_fn
self.sampler = sampler
if not isinstance(dataset, IterableDataset):
self.total_len = len(dataset)
else:
# TODO: support multiple worker for iterable dataset
assert(num_workers == 0)
def collate_batch(self, batch):
if self.collate_fn is not None:
return self.collate_fn(batch)
else:
return super().collate_batch(batch)
def __getitem__(self, i):
return self.dataset[i]
def __iter__(self):
if isinstance(self.dataset, IterableDataset):
return self.inner_iter()
else:
return super().__iter__()
def inner_iter(self):
current_batch = []
if jt.world_size > 1:
assert self.batch_size % jt.world_size == 0, \
f"IterableDataset does not support a batch size ({self.batch_size}) that is not evenly divisible by the number of processes f{jt.world_size}"
real_batch_size = int(self.batch_size / jt.world_size)
else:
real_batch_size = self.batch_size
for element in self.dataset:
current_batch.append(element)
if len(current_batch) == real_batch_size:
current_batch = self.collate_batch(current_batch)
current_batch = self.to_jittor(current_batch)
yield current_batch
current_batch = []
if not self.drop_last and len(current_batch) > 0:
current_batch = self.collate_batch(current_batch)
yield self.to_jittor(current_batch)
# def get_worker_info():
# # always return the fake worker info
# return namedtuple('WorkerInfo', 'id num_workers')(0, 1)
# class RandomSampler(jt.dataset.RandomSampler):
# def __init__(self, dataset, generator=None, **kwargs):
# super().__init__(dataset, **kwargs)
# def __iter__(self):
# if getattr(self.dataset, "support_random_access", True):
# return super().__iter__()
# else:
# self.dataset.shuffle()
# return iter(range(self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()))
# class DistributedSampler(jt.dataset.Sampler):
# def __init__(self, sampler: RandomSampler):
# assert(isinstance(sampler, RandomSampler))
# self.sampler = sampler
# def set_epoch(self, epoch: int):
# ### do nothing, let jittor's inner dataset handle
# pass
# def __iter__(self):
# return self.sampler.__iter__()
# def __len__(self):
# return self.sampler.__len__()
# BatchSampler = jt.dataset.BatchSampler
# Sampler = jt.dataset.Sampler
# SequentialSampler = jt.dataset.SequentialSampler
# SubsetRandomSampler = jt.dataset.SubsetRandomSampler
# TensorDataset = Dataset
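A rough usage sketch for the wrapper above (Squares is a hypothetical dataset written only for illustration; DataLoader and IterableDataset are the classes defined in this file):

class Squares(IterableDataset):
    def __iter__(self):
        for i in range(10):
            yield i * i

# yields batches of 4, 4, and the 2 leftover elements, since drop_last=False
loader = DataLoader(Squares(), batch_size=4, drop_last=False)
for batch in loader:
    print(batch)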

View File

@ -0,0 +1,9 @@
from typing import Callable, Union
Dtype = Union[Callable, str]
def get_string_dtype(dtype):
if callable(dtype):
dtype = dtype.__name__
if not isinstance(dtype, str):
raise ValueError(f"dtype is expected to be str, python type function, or jittor type function, but got {dtype}.")
return dtype
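For illustration, the normalization this helper performs (assuming get_string_dtype is imported from this module):

print(get_string_dtype(float))      # -> "float": callables map to their __name__
print(get_string_dtype("float32"))  # -> "float32": strings pass through
# get_string_dtype(42) raises ValueError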

View File

@ -0,0 +1,34 @@
import os
import glob
import shutil
import sys
home_path = os.path.join(os.path.dirname(__file__), "..", "..", "..")
home_path = os.path.abspath(home_path)
def callback(func, path, exc_info):
print(f"remove \"{path}\" failed.")
def rmtree(path):
if os.path.isdir(path):
print(f"remove \"{path}\" recursive.")
shutil.rmtree(path, onerror=callback)
def remove_tmpfile():
dist_file = home_path+"/dist"
egg_file = glob.glob(home_path+"/**/*egg-info")
rmtree(dist_file)
for e in egg_file:
rmtree(e)
def run_cmd(cmd):
print("[CMD]", cmd)
assert os.system(cmd)==0
os.chdir(home_path)
remove_tmpfile()
run_cmd(f"{sys.executable} ./setup.py sdist")
run_cmd(f"{sys.executable} -m twine upload dist/*")
remove_tmpfile()

View File

@ -0,0 +1,46 @@
import importlib.machinery
import os
def _download_file_from_remote_location(fpath: str, url: str) -> None:
pass
def _is_remote_location_available() -> bool:
return False
def _get_extension_path(lib_name):
lib_dir = os.path.dirname(__file__)
if os.name == "nt":
# Register the main torchvision library location on the default DLL path
import ctypes
import sys
kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True)
with_load_library_flags = hasattr(kernel32, "AddDllDirectory")
prev_error_mode = kernel32.SetErrorMode(0x0001)
if with_load_library_flags:
kernel32.AddDllDirectory.restype = ctypes.c_void_p
if sys.version_info >= (3, 8):
os.add_dll_directory(lib_dir)
elif with_load_library_flags:
res = kernel32.AddDllDirectory(lib_dir)
if res is None:
err = ctypes.WinError(ctypes.get_last_error())
err.strerror += f' Error adding "{lib_dir}" to the DLL directories.'
raise err
kernel32.SetErrorMode(prev_error_mode)
loader_details = (importlib.machinery.ExtensionFileLoader, importlib.machinery.EXTENSION_SUFFIXES)
extfinder = importlib.machinery.FileFinder(lib_dir, loader_details)
ext_specs = extfinder.find_spec(lib_name)
if ext_specs is None:
raise ImportError
return ext_specs.origin

View File

@ -0,0 +1,9 @@
from .mnist import EMNIST, FashionMNIST, KMNIST, MNIST, QMNIST
__all__ = (
"EMNIST",
"FashionMNIST",
"QMNIST",
"MNIST",
"KMNIST",
)

View File

@ -0,0 +1,558 @@
import codecs
import os
import os.path
import shutil
import string
import sys
import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple
from urllib.error import URLError
import numpy as np
import torch
from PIL import Image
from .utils import check_integrity, download_and_extract_archive, extract_archive, verify_str_arg
from .vision import VisionDataset
class MNIST(VisionDataset):
"""`MNIST <http://yann.lecun.com/exdb/mnist/>`_ Dataset.
Args:
root (string): Root directory of dataset where ``MNIST/raw/train-images-idx3-ubyte``
and ``MNIST/raw/t10k-images-idx3-ubyte`` exist.
train (bool, optional): If True, creates dataset from ``train-images-idx3-ubyte``,
otherwise from ``t10k-images-idx3-ubyte``.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in root directory. If dataset is already downloaded, it is not
downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
"""
mirrors = [
"http://yann.lecun.com/exdb/mnist/",
"https://ossci-datasets.s3.amazonaws.com/mnist/",
]
resources = [
("train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
("train-labels-idx1-ubyte.gz", "d53e105ee54ea40749a09fcbcd1e9432"),
("t10k-images-idx3-ubyte.gz", "9fb629c4189551a2d022fa330f9573f3"),
("t10k-labels-idx1-ubyte.gz", "ec29112dd5afa0611ce80d1b7f02629c"),
]
training_file = "training.pt"
test_file = "test.pt"
classes = [
"0 - zero",
"1 - one",
"2 - two",
"3 - three",
"4 - four",
"5 - five",
"6 - six",
"7 - seven",
"8 - eight",
"9 - nine",
]
@property
def train_labels(self):
warnings.warn("train_labels has been renamed targets")
return self.targets
@property
def test_labels(self):
warnings.warn("test_labels has been renamed targets")
return self.targets
@property
def train_data(self):
warnings.warn("train_data has been renamed data")
return self.data
@property
def test_data(self):
warnings.warn("test_data has been renamed data")
return self.data
def __init__(
self,
root: str,
train: bool = True,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
download: bool = False,
) -> None:
super().__init__(root, transform=transform, target_transform=target_transform)
self.train = train # training set or test set
if self._check_legacy_exist():
self.data, self.targets = self._load_legacy_data()
return
if download:
self.download()
if not self._check_exists():
raise RuntimeError("Dataset not found. You can use download=True to download it")
self.data, self.targets = self._load_data()
def _check_legacy_exist(self):
processed_folder_exists = os.path.exists(self.processed_folder)
if not processed_folder_exists:
return False
return all(
check_integrity(os.path.join(self.processed_folder, file)) for file in (self.training_file, self.test_file)
)
def _load_legacy_data(self):
# This is for BC only. We no longer cache the data in a custom binary, but simply read from the raw data
# directly.
data_file = self.training_file if self.train else self.test_file
return torch.load(os.path.join(self.processed_folder, data_file))
def _load_data(self):
image_file = f"{'train' if self.train else 't10k'}-images-idx3-ubyte"
data = read_image_file(os.path.join(self.raw_folder, image_file))
label_file = f"{'train' if self.train else 't10k'}-labels-idx1-ubyte"
targets = read_label_file(os.path.join(self.raw_folder, label_file))
return data, targets
def __getitem__(self, index: int) -> Tuple[Any, Any]:
"""
Args:
index (int): Index
Returns:
tuple: (image, target) where target is index of the target class.
"""
img, target = self.data[index], int(self.targets[index])
# doing this so that it is consistent with all other datasets
# to return a PIL Image
img = Image.fromarray(img.numpy(), mode="L")
if self.transform is not None:
img = self.transform(img)
if self.target_transform is not None:
target = self.target_transform(target)
return img, target
def __len__(self) -> int:
return len(self.data)
@property
def raw_folder(self) -> str:
return os.path.join(self.root, self.__class__.__name__, "raw")
@property
def processed_folder(self) -> str:
return os.path.join(self.root, self.__class__.__name__, "processed")
@property
def class_to_idx(self) -> Dict[str, int]:
return {_class: i for i, _class in enumerate(self.classes)}
def _check_exists(self) -> bool:
return all(
check_integrity(os.path.join(self.raw_folder, os.path.splitext(os.path.basename(url))[0]))
for url, _ in self.resources
)
def download(self) -> None:
"""Download the MNIST data if it doesn't exist already."""
if self._check_exists():
return
os.makedirs(self.raw_folder, exist_ok=True)
# download files
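        # Try each mirror in turn: break on the first success; the for-else
        # raises only when every mirror failed for this file.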
for filename, md5 in self.resources:
for mirror in self.mirrors:
url = f"{mirror}{filename}"
try:
print(f"Downloading {url}")
download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)
except URLError as error:
print(f"Failed to download (trying next):\n{error}")
continue
finally:
print()
break
else:
raise RuntimeError(f"Error downloading {filename}")
def extra_repr(self) -> str:
split = "Train" if self.train is True else "Test"
return f"Split: {split}"
class FashionMNIST(MNIST):
"""`Fashion-MNIST <https://github.com/zalandoresearch/fashion-mnist>`_ Dataset.
Args:
root (string): Root directory of dataset where ``FashionMNIST/raw/train-images-idx3-ubyte``
and ``FashionMNIST/raw/t10k-images-idx3-ubyte`` exist.
train (bool, optional): If True, creates dataset from ``train-images-idx3-ubyte``,
otherwise from ``t10k-images-idx3-ubyte``.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in root directory. If dataset is already downloaded, it is not
downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
"""
mirrors = ["http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"]
resources = [
("train-images-idx3-ubyte.gz", "8d4fb7e6c68d591d4c3dfef9ec88bf0d"),
("train-labels-idx1-ubyte.gz", "25c81989df183df01b3e8a0aad5dffbe"),
("t10k-images-idx3-ubyte.gz", "bef4ecab320f06d8554ea6380940ec79"),
("t10k-labels-idx1-ubyte.gz", "bb300cfdad3c16e7a12a480ee83cd310"),
]
classes = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
class KMNIST(MNIST):
"""`Kuzushiji-MNIST <https://github.com/rois-codh/kmnist>`_ Dataset.
Args:
root (string): Root directory of dataset where ``KMNIST/raw/train-images-idx3-ubyte``
and ``KMNIST/raw/t10k-images-idx3-ubyte`` exist.
train (bool, optional): If True, creates dataset from ``train-images-idx3-ubyte``,
otherwise from ``t10k-images-idx3-ubyte``.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in root directory. If dataset is already downloaded, it is not
downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
"""
mirrors = ["http://codh.rois.ac.jp/kmnist/dataset/kmnist/"]
resources = [
("train-images-idx3-ubyte.gz", "bdb82020997e1d708af4cf47b453dcf7"),
("train-labels-idx1-ubyte.gz", "e144d726b3acfaa3e44228e80efcd344"),
("t10k-images-idx3-ubyte.gz", "5c965bf0a639b31b8f53240b1b52f4d7"),
("t10k-labels-idx1-ubyte.gz", "7320c461ea6c1c855c0b718fb2a4b134"),
]
classes = ["o", "ki", "su", "tsu", "na", "ha", "ma", "ya", "re", "wo"]
class EMNIST(MNIST):
"""`EMNIST <https://www.westernsydney.edu.au/bens/home/reproducible_research/emnist>`_ Dataset.
Args:
root (string): Root directory of dataset where ``EMNIST/raw/train-images-idx3-ubyte``
and ``EMNIST/raw/t10k-images-idx3-ubyte`` exist.
split (string): The dataset has 6 different splits: ``byclass``, ``bymerge``,
``balanced``, ``letters``, ``digits`` and ``mnist``. This argument specifies
which one to use.
train (bool, optional): If True, creates dataset from ``training.pt``,
otherwise from ``test.pt``.
download (bool, optional): If True, downloads the dataset from the internet and
puts it in root directory. If dataset is already downloaded, it is not
downloaded again.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
"""
url = "https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip"
md5 = "58c8d27c78d21e728a6bc7b3cc06412e"
splits = ("byclass", "bymerge", "balanced", "letters", "digits", "mnist")
    # Merged classes assume the same structure for both uppercase and lowercase versions
_merged_classes = {"c", "i", "j", "k", "l", "m", "o", "p", "s", "u", "v", "w", "x", "y", "z"}
_all_classes = set(string.digits + string.ascii_letters)
classes_split_dict = {
"byclass": sorted(list(_all_classes)),
"bymerge": sorted(list(_all_classes - _merged_classes)),
"balanced": sorted(list(_all_classes - _merged_classes)),
"letters": ["N/A"] + list(string.ascii_lowercase),
"digits": list(string.digits),
"mnist": list(string.digits),
}
def __init__(self, root: str, split: str, **kwargs: Any) -> None:
self.split = verify_str_arg(split, "split", self.splits)
self.training_file = self._training_file(split)
self.test_file = self._test_file(split)
super().__init__(root, **kwargs)
self.classes = self.classes_split_dict[self.split]
@staticmethod
def _training_file(split) -> str:
return f"training_{split}.pt"
@staticmethod
def _test_file(split) -> str:
return f"test_{split}.pt"
@property
def _file_prefix(self) -> str:
return f"emnist-{self.split}-{'train' if self.train else 'test'}"
@property
def images_file(self) -> str:
return os.path.join(self.raw_folder, f"{self._file_prefix}-images-idx3-ubyte")
@property
def labels_file(self) -> str:
return os.path.join(self.raw_folder, f"{self._file_prefix}-labels-idx1-ubyte")
def _load_data(self):
return read_image_file(self.images_file), read_label_file(self.labels_file)
def _check_exists(self) -> bool:
return all(check_integrity(file) for file in (self.images_file, self.labels_file))
def download(self) -> None:
"""Download the EMNIST data if it doesn't exist already."""
if self._check_exists():
return
os.makedirs(self.raw_folder, exist_ok=True)
download_and_extract_archive(self.url, download_root=self.raw_folder, md5=self.md5)
gzip_folder = os.path.join(self.raw_folder, "gzip")
for gzip_file in os.listdir(gzip_folder):
if gzip_file.endswith(".gz"):
extract_archive(os.path.join(gzip_folder, gzip_file), self.raw_folder)
shutil.rmtree(gzip_folder)
class QMNIST(MNIST):
"""`QMNIST <https://github.com/facebookresearch/qmnist>`_ Dataset.
Args:
root (string): Root directory of dataset whose ``raw``
subdir contains binary files of the datasets.
what (string,optional): Can be 'train', 'test', 'test10k',
'test50k', or 'nist' for respectively the mnist compatible
training set, the 60k qmnist testing set, the 10k qmnist
examples that match the mnist testing set, the 50k
remaining qmnist testing examples, or all the nist
digits. The default is to select 'train' or 'test'
according to the compatibility argument 'train'.
compat (bool,optional): A boolean that says whether the target
for each example is class number (for compatibility with
the MNIST dataloader) or a torch vector containing the
full qmnist information. Default=True.
download (bool, optional): If True, downloads the dataset from
the internet and puts it in root directory. If dataset is
already downloaded, it is not downloaded again.
transform (callable, optional): A function/transform that
            takes in a PIL image and returns a transformed
            version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform
that takes in the target and transforms it.
train (bool,optional,compatibility): When argument 'what' is
not specified, this boolean decides whether to load the
            training set or the testing set. Default: True.
"""
subsets = {"train": "train", "test": "test", "test10k": "test", "test50k": "test", "nist": "nist"}
resources: Dict[str, List[Tuple[str, str]]] = { # type: ignore[assignment]
"train": [
(
"https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-train-images-idx3-ubyte.gz",
"ed72d4157d28c017586c42bc6afe6370",
),
(
"https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-train-labels-idx2-int.gz",
"0058f8dd561b90ffdd0f734c6a30e5e4",
),
],
"test": [
(
"https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-test-images-idx3-ubyte.gz",
"1394631089c404de565df7b7aeaf9412",
),
(
"https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-test-labels-idx2-int.gz",
"5b5b05890a5e13444e108efe57b788aa",
),
],
"nist": [
(
"https://raw.githubusercontent.com/facebookresearch/qmnist/master/xnist-images-idx3-ubyte.xz",
"7f124b3b8ab81486c9d8c2749c17f834",
),
(
"https://raw.githubusercontent.com/facebookresearch/qmnist/master/xnist-labels-idx2-int.xz",
"5ed0e788978e45d4a8bd4b7caec3d79d",
),
],
}
classes = [
"0 - zero",
"1 - one",
"2 - two",
"3 - three",
"4 - four",
"5 - five",
"6 - six",
"7 - seven",
"8 - eight",
"9 - nine",
]
def __init__(
self, root: str, what: Optional[str] = None, compat: bool = True, train: bool = True, **kwargs: Any
) -> None:
if what is None:
what = "train" if train else "test"
self.what = verify_str_arg(what, "what", tuple(self.subsets.keys()))
self.compat = compat
self.data_file = what + ".pt"
self.training_file = self.data_file
self.test_file = self.data_file
super().__init__(root, train, **kwargs)
@property
def images_file(self) -> str:
(url, _), _ = self.resources[self.subsets[self.what]]
return os.path.join(self.raw_folder, os.path.splitext(os.path.basename(url))[0])
@property
def labels_file(self) -> str:
_, (url, _) = self.resources[self.subsets[self.what]]
return os.path.join(self.raw_folder, os.path.splitext(os.path.basename(url))[0])
def _check_exists(self) -> bool:
return all(check_integrity(file) for file in (self.images_file, self.labels_file))
def _load_data(self):
data = read_sn3_pascalvincent_tensor(self.images_file)
if data.dtype != torch.uint8:
raise TypeError(f"data should be of dtype torch.uint8 instead of {data.dtype}")
if data.ndimension() != 3:
raise ValueError("data should have 3 dimensions instead of {data.ndimension()}")
targets = read_sn3_pascalvincent_tensor(self.labels_file).long()
if targets.ndimension() != 2:
raise ValueError(f"targets should have 2 dimensions instead of {targets.ndimension()}")
if self.what == "test10k":
data = data[0:10000, :, :].clone()
targets = targets[0:10000, :].clone()
elif self.what == "test50k":
data = data[10000:, :, :].clone()
targets = targets[10000:, :].clone()
return data, targets
def download(self) -> None:
"""Download the QMNIST data if it doesn't exist already.
Note that we only download what has been asked for (argument 'what').
"""
if self._check_exists():
return
os.makedirs(self.raw_folder, exist_ok=True)
split = self.resources[self.subsets[self.what]]
for url, md5 in split:
download_and_extract_archive(url, self.raw_folder, md5=md5)
def __getitem__(self, index: int) -> Tuple[Any, Any]:
# redefined to handle the compat flag
img, target = self.data[index], self.targets[index]
img = Image.fromarray(img.numpy(), mode="L")
if self.transform is not None:
img = self.transform(img)
if self.compat:
target = int(target[0])
if self.target_transform is not None:
target = self.target_transform(target)
return img, target
def extra_repr(self) -> str:
return f"Split: {self.what}"
def get_int(b: bytes) -> int:
return int(codecs.encode(b, "hex"), 16)
SN3_PASCALVINCENT_BITSMAP = {
8: torch.uint8,
9: torch.int8,
11: torch.int16,
12: torch.int32,
13: torch.float32,
14: torch.float64,
}
TORCH_TYPE_BITS = {
torch.uint8: 8,
torch.int8: 8,
torch.int16: 16,
torch.int32: 32,
torch.float32: 32,
torch.float64: 64,
}
def read_sn3_pascalvincent_tensor(path: str, strict: bool = True) -> torch.Tensor:
"""Read a SN3 file in "Pascal Vincent" format (Lush file 'libidx/idx-io.lsh').
Argument may be a filename, compressed filename, or file object.
"""
# read
with open(path, "rb") as f:
data = f.read()
# parse
magic = get_int(data[0:4])
nd = magic % 256
ty = magic // 256
assert 1 <= nd <= 3
assert 8 <= ty <= 14
torch_type = SN3_PASCALVINCENT_BITSMAP[ty]
s = [get_int(data[4 * (i + 1) : 4 * (i + 2)]) for i in range(nd)]
num_bytes_per_value = TORCH_TYPE_BITS[torch_type] // 8
# The MNIST format uses the big endian byte order. If the system uses little endian byte order by default,
# we need to reverse the bytes before we can read them with torch.frombuffer().
needs_byte_reversal = sys.byteorder == "little" and num_bytes_per_value > 1
parsed = torch.frombuffer(bytearray(data), dtype=torch_type, offset=(4 * (nd + 1)))
if needs_byte_reversal:
parsed = parsed.flip(0)
assert parsed.shape[0] == np.prod(s) or not strict
return parsed.view(*s)
def read_label_file(path: str) -> torch.Tensor:
x = read_sn3_pascalvincent_tensor(path, strict=False)
if x.dtype != torch.uint8:
raise TypeError(f"x should be of dtype torch.uint8 instead of {x.dtype}")
if x.ndimension() != 1:
raise ValueError(f"x should have 1 dimension instead of {x.ndimension()}")
return x.long()
def read_image_file(path: str) -> torch.Tensor:
x = read_sn3_pascalvincent_tensor(path, strict=False)
if x.dtype != torch.uint8:
raise TypeError(f"x should be of dtype torch.uint8 instead of {x.dtype}")
if x.ndimension() != 3:
raise ValueError(f"x should have 3 dimension instead of {x.ndimension()}")
return x
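To make the header arithmetic above concrete, a small sketch (not part of this commit) checking the MNIST image magic number:

import struct

# SN3 magic for a 3-D uint8 tensor (MNIST images): 0x00000803
magic = struct.unpack(">i", bytes([0, 0, 8, 3]))[0]
assert magic == 2051
assert magic % 256 == 3    # nd: number of dimensions
assert magic // 256 == 8   # ty: dtype code 8 -> torch.uint8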

View File

@ -0,0 +1,522 @@
import bz2
import contextlib
import gzip
import hashlib
import itertools
import lzma
import os
import os.path
import pathlib
import re
import sys
import tarfile
import urllib
import urllib.error
import urllib.request
import warnings
import zipfile
from typing import Any, Callable, Dict, IO, Iterable, Iterator, List, Optional, Tuple, TypeVar
from urllib.parse import urlparse
import numpy as np
import requests
import torch
from tqdm import tqdm
from .._internally_replaced_utils import _download_file_from_remote_location, _is_remote_location_available
USER_AGENT = "pytorch/vision"
def _save_response_content(
content: Iterator[bytes],
destination: str,
length: Optional[int] = None,
) -> None:
with open(destination, "wb") as fh, tqdm(total=length) as pbar:
for chunk in content:
# filter out keep-alive new chunks
if not chunk:
continue
fh.write(chunk)
pbar.update(len(chunk))
def _urlretrieve(url: str, filename: str, chunk_size: int = 1024 * 32) -> None:
with urllib.request.urlopen(urllib.request.Request(url, headers={"User-Agent": USER_AGENT})) as response:
_save_response_content(iter(lambda: response.read(chunk_size), b""), filename, length=response.length)
def gen_bar_updater() -> Callable[[int, int, int], None]:
warnings.warn("The function `gen_bar_update` is deprecated since 0.13 and will be removed in 0.15.")
pbar = tqdm(total=None)
def bar_update(count, block_size, total_size):
if pbar.total is None and total_size:
pbar.total = total_size
progress_bytes = count * block_size
pbar.update(progress_bytes - pbar.n)
return bar_update
def calculate_md5(fpath: str, chunk_size: int = 1024 * 1024) -> str:
# Setting the `usedforsecurity` flag does not change anything about the functionality, but indicates that we are
# not using the MD5 checksum for cryptography. This enables its usage in restricted environments like FIPS. Without
# it torchvision.datasets is unusable in these environments since we perform a MD5 check everywhere.
if sys.version_info >= (3, 9):
md5 = hashlib.md5(usedforsecurity=False)
else:
md5 = hashlib.md5()
with open(fpath, "rb") as f:
for chunk in iter(lambda: f.read(chunk_size), b""):
md5.update(chunk)
return md5.hexdigest()
def check_md5(fpath: str, md5: str, **kwargs: Any) -> bool:
return md5 == calculate_md5(fpath, **kwargs)
def check_integrity(fpath: str, md5: Optional[str] = None) -> bool:
if not os.path.isfile(fpath):
return False
if md5 is None:
return True
return check_md5(fpath, md5)
def _get_redirect_url(url: str, max_hops: int = 3) -> str:
initial_url = url
headers = {"Method": "HEAD", "User-Agent": USER_AGENT}
for _ in range(max_hops + 1):
with urllib.request.urlopen(urllib.request.Request(url, headers=headers)) as response:
if response.url == url or response.url is None:
return url
url = response.url
else:
raise RecursionError(
f"Request to {initial_url} exceeded {max_hops} redirects. The last redirect points to {url}."
)
def _get_google_drive_file_id(url: str) -> Optional[str]:
parts = urlparse(url)
if re.match(r"(drive|docs)[.]google[.]com", parts.netloc) is None:
return None
match = re.match(r"/file/d/(?P<id>[^/]*)", parts.path)
if match is None:
return None
return match.group("id")
def download_url(
url: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None, max_redirect_hops: int = 3
) -> None:
"""Download a file from a url and place it in root.
Args:
url (str): URL to download file from
root (str): Directory to place downloaded file in
filename (str, optional): Name to save the file under. If None, use the basename of the URL
md5 (str, optional): MD5 checksum of the download. If None, do not check
max_redirect_hops (int, optional): Maximum number of redirect hops allowed
"""
root = os.path.expanduser(root)
if not filename:
filename = os.path.basename(url)
fpath = os.path.join(root, filename)
os.makedirs(root, exist_ok=True)
# check if file is already present locally
if check_integrity(fpath, md5):
print("Using downloaded and verified file: " + fpath)
return
if _is_remote_location_available():
_download_file_from_remote_location(fpath, url)
else:
# expand redirect chain if needed
url = _get_redirect_url(url, max_hops=max_redirect_hops)
# check if file is located on Google Drive
file_id = _get_google_drive_file_id(url)
if file_id is not None:
return download_file_from_google_drive(file_id, root, filename, md5)
# download the file
try:
print("Downloading " + url + " to " + fpath)
_urlretrieve(url, fpath)
except (urllib.error.URLError, OSError) as e: # type: ignore[attr-defined]
if url[:5] == "https":
url = url.replace("https:", "http:")
print("Failed download. Trying https -> http instead. Downloading " + url + " to " + fpath)
_urlretrieve(url, fpath)
else:
raise e
# check integrity of downloaded file
if not check_integrity(fpath, md5):
raise RuntimeError("File not found or corrupted.")
def list_dir(root: str, prefix: bool = False) -> List[str]:
"""List all directories at a given root
Args:
root (str): Path to directory whose folders need to be listed
prefix (bool, optional): If true, prepends the path to each result, otherwise
only returns the name of the directories found
"""
root = os.path.expanduser(root)
directories = [p for p in os.listdir(root) if os.path.isdir(os.path.join(root, p))]
if prefix is True:
directories = [os.path.join(root, d) for d in directories]
return directories
def list_files(root: str, suffix: str, prefix: bool = False) -> List[str]:
"""List all files ending with a suffix at a given root
Args:
root (str): Path to directory whose folders need to be listed
suffix (str or tuple): Suffix of the files to match, e.g. '.png' or ('.jpg', '.png').
It uses the Python "str.endswith" method and is passed directly
prefix (bool, optional): If true, prepends the path to each result, otherwise
only returns the name of the files found
"""
root = os.path.expanduser(root)
files = [p for p in os.listdir(root) if os.path.isfile(os.path.join(root, p)) and p.endswith(suffix)]
if prefix is True:
files = [os.path.join(root, d) for d in files]
return files
def _extract_gdrive_api_response(response, chunk_size: int = 32 * 1024) -> Tuple[bytes, Iterator[bytes]]:
content = response.iter_content(chunk_size)
first_chunk = None
# filter out keep-alive new chunks
while not first_chunk:
first_chunk = next(content)
content = itertools.chain([first_chunk], content)
try:
match = re.search("<title>Google Drive - (?P<api_response>.+?)</title>", first_chunk.decode())
api_response = match["api_response"] if match is not None else None
except UnicodeDecodeError:
api_response = None
return api_response, content
def download_file_from_google_drive(file_id: str, root: str, filename: Optional[str] = None, md5: Optional[str] = None):
"""Download a Google Drive file from and place it in root.
Args:
file_id (str): id of file to be downloaded
root (str): Directory to place downloaded file in
filename (str, optional): Name to save the file under. If None, use the id of the file.
md5 (str, optional): MD5 checksum of the download. If None, do not check
"""
# Based on https://stackoverflow.com/questions/38511444/python-download-files-from-google-drive-using-url
root = os.path.expanduser(root)
if not filename:
filename = file_id
fpath = os.path.join(root, filename)
os.makedirs(root, exist_ok=True)
if check_integrity(fpath, md5):
print(f"Using downloaded {'and verified ' if md5 else ''}file: {fpath}")
return
url = "https://drive.google.com/uc"
params = dict(id=file_id, export="download")
with requests.Session() as session:
response = session.get(url, params=params, stream=True)
for key, value in response.cookies.items():
if key.startswith("download_warning"):
token = value
break
else:
api_response, content = _extract_gdrive_api_response(response)
token = "t" if api_response == "Virus scan warning" else None
if token is not None:
response = session.get(url, params=dict(params, confirm=token), stream=True)
api_response, content = _extract_gdrive_api_response(response)
if api_response == "Quota exceeded":
raise RuntimeError(
f"The daily quota of the file {filename} is exceeded and it "
f"can't be downloaded. This is a limitation of Google Drive "
f"and can only be overcome by trying again later."
)
_save_response_content(content, fpath)
# In case we deal with an unhandled GDrive API response, the file should be smaller than 10kB and contain only text
if os.stat(fpath).st_size < 10 * 1024:
with contextlib.suppress(UnicodeDecodeError), open(fpath) as fh:
text = fh.read()
# Regular expression to detect HTML. Copied from https://stackoverflow.com/a/70585604
if re.search(r"</?\s*[a-z-][^>]*\s*>|(&(?:[\w\d]+|#\d+|#x[a-f\d]+);)", text):
warnings.warn(
f"We detected some HTML elements in the downloaded file. "
f"This most likely means that the download triggered an unhandled API response by GDrive. "
f"Please report this to torchvision at https://github.com/pytorch/vision/issues including "
f"the response:\n\n{text}"
)
if md5 and not check_md5(fpath, md5):
raise RuntimeError(
f"The MD5 checksum of the download file {fpath} does not match the one on record."
f"Please delete the file and try again. "
f"If the issue persists, please report this to torchvision at https://github.com/pytorch/vision/issues."
)
def _extract_tar(from_path: str, to_path: str, compression: Optional[str]) -> None:
with tarfile.open(from_path, f"r:{compression[1:]}" if compression else "r") as tar:
tar.extractall(to_path)
_ZIP_COMPRESSION_MAP: Dict[str, int] = {
".bz2": zipfile.ZIP_BZIP2,
".xz": zipfile.ZIP_LZMA,
}
def _extract_zip(from_path: str, to_path: str, compression: Optional[str]) -> None:
with zipfile.ZipFile(
from_path, "r", compression=_ZIP_COMPRESSION_MAP[compression] if compression else zipfile.ZIP_STORED
) as zip:
zip.extractall(to_path)
_ARCHIVE_EXTRACTORS: Dict[str, Callable[[str, str, Optional[str]], None]] = {
".tar": _extract_tar,
".zip": _extract_zip,
}
_COMPRESSED_FILE_OPENERS: Dict[str, Callable[..., IO]] = {
".bz2": bz2.open,
".gz": gzip.open,
".xz": lzma.open,
}
_FILE_TYPE_ALIASES: Dict[str, Tuple[Optional[str], Optional[str]]] = {
".tbz": (".tar", ".bz2"),
".tbz2": (".tar", ".bz2"),
".tgz": (".tar", ".gz"),
}
def _detect_file_type(file: str) -> Tuple[str, Optional[str], Optional[str]]:
"""Detect the archive type and/or compression of a file.
Args:
file (str): the filename
Returns:
(tuple): tuple of suffix, archive type, and compression
Raises:
RuntimeError: if file has no suffix or suffix is not supported
"""
suffixes = pathlib.Path(file).suffixes
if not suffixes:
raise RuntimeError(
f"File '{file}' has no suffixes that could be used to detect the archive type and compression."
)
suffix = suffixes[-1]
# check if the suffix is a known alias
if suffix in _FILE_TYPE_ALIASES:
return (suffix, *_FILE_TYPE_ALIASES[suffix])
# check if the suffix is an archive type
if suffix in _ARCHIVE_EXTRACTORS:
return suffix, suffix, None
# check if the suffix is a compression
if suffix in _COMPRESSED_FILE_OPENERS:
# check for suffix hierarchy
if len(suffixes) > 1:
suffix2 = suffixes[-2]
# check if the suffix2 is an archive type
if suffix2 in _ARCHIVE_EXTRACTORS:
return suffix2 + suffix, suffix2, suffix
return suffix, None, suffix
valid_suffixes = sorted(set(_FILE_TYPE_ALIASES) | set(_ARCHIVE_EXTRACTORS) | set(_COMPRESSED_FILE_OPENERS))
raise RuntimeError(f"Unknown compression or archive type: '{suffix}'.\nKnown suffixes are: '{valid_suffixes}'.")
def _decompress(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str:
r"""Decompress a file.
The compression is automatically detected from the file name.
Args:
from_path (str): Path to the file to be decompressed.
to_path (str): Path to the decompressed file. If omitted, ``from_path`` without compression extension is used.
remove_finished (bool): If ``True``, remove the file after the extraction.
Returns:
(str): Path to the decompressed file.
"""
suffix, archive_type, compression = _detect_file_type(from_path)
if not compression:
raise RuntimeError(f"Couldn't detect a compression from suffix {suffix}.")
if to_path is None:
to_path = from_path.replace(suffix, archive_type if archive_type is not None else "")
# We don't need to check for a missing key here, since this was already done in _detect_file_type()
compressed_file_opener = _COMPRESSED_FILE_OPENERS[compression]
with compressed_file_opener(from_path, "rb") as rfh, open(to_path, "wb") as wfh:
wfh.write(rfh.read())
if remove_finished:
os.remove(from_path)
return to_path
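# Example usage (a sketch; file names are hypothetical):
#   _decompress("sample.ppm.gz")  -> writes and returns "sample.ppm"
#   _decompress("model.tar.xz")   -> writes and returns "model.tar" (the tar itself is kept)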
def extract_archive(from_path: str, to_path: Optional[str] = None, remove_finished: bool = False) -> str:
"""Extract an archive.
The archive type and a possible compression are automatically detected from the file name. If the file is compressed
but not an archive, the call is dispatched to :func:`_decompress`.
Args:
from_path (str): Path to the file to be extracted.
to_path (str): Path to the directory the file will be extracted to. If omitted, the directory of the file is
used.
remove_finished (bool): If ``True``, remove the file after the extraction.
Returns:
(str): Path to the directory the file was extracted to.
"""
if to_path is None:
to_path = os.path.dirname(from_path)
suffix, archive_type, compression = _detect_file_type(from_path)
if not archive_type:
return _decompress(
from_path,
os.path.join(to_path, os.path.basename(from_path).replace(suffix, "")),
remove_finished=remove_finished,
)
# We don't need to check for a missing key here, since this was already done in _detect_file_type()
extractor = _ARCHIVE_EXTRACTORS[archive_type]
extractor(from_path, to_path, compression)
if remove_finished:
os.remove(from_path)
return to_path
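# Example usage (a sketch; paths are hypothetical):
#   extract_archive("/tmp/cifar-10-python.tar.gz")       # extracts into /tmp
#   extract_archive("/tmp/images.zip", to_path="/data")  # extracts into /data
#   extract_archive("/tmp/labels.csv.gz")                # no archive type -> dispatched to _decompress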
def download_and_extract_archive(
url: str,
download_root: str,
extract_root: Optional[str] = None,
filename: Optional[str] = None,
md5: Optional[str] = None,
remove_finished: bool = False,
) -> None:
download_root = os.path.expanduser(download_root)
if extract_root is None:
extract_root = download_root
if not filename:
filename = os.path.basename(url)
download_url(url, download_root, filename, md5)
archive = os.path.join(download_root, filename)
print(f"Extracting {archive} to {extract_root}")
extract_archive(archive, extract_root, remove_finished)
def iterable_to_str(iterable: Iterable) -> str:
return "'" + "', '".join([str(item) for item in iterable]) + "'"
T = TypeVar("T", str, bytes)
def verify_str_arg(
value: T,
arg: Optional[str] = None,
valid_values: Optional[Iterable[T]] = None,
custom_msg: Optional[str] = None,
) -> T:
if not isinstance(value, str):
if arg is None:
msg = "Expected type str, but got type {type}."
else:
msg = "Expected type str for argument {arg}, but got type {type}."
msg = msg.format(type=type(value), arg=arg)
raise ValueError(msg)
if valid_values is None:
return value
if value not in valid_values:
if custom_msg is not None:
msg = custom_msg
else:
msg = "Unknown value '{value}' for argument {arg}. Valid values are {{{valid_values}}}."
msg = msg.format(value=value, arg=arg, valid_values=iterable_to_str(valid_values))
raise ValueError(msg)
return value
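# Example usage (a sketch):
#   split = verify_str_arg("train", "split", ("train", "test"))   # returns "train"
#   verify_str_arg("val", "split", ("train", "test"))             # raises ValueError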
def _read_pfm(file_name: str, slice_channels: int = 2) -> np.ndarray:
"""Read file in .pfm format. Might contain either 1 or 3 channels of data.
Args:
file_name (str): Path to the file.
slice_channels (int): Number of channels to slice out of the file.
Useful for reading different data formats stored in .pfm files: Optical Flows, Stereo Disparity Maps, etc.
"""
with open(file_name, "rb") as f:
header = f.readline().rstrip()
if header not in [b"PF", b"Pf"]:
raise ValueError("Invalid PFM file")
dim_match = re.match(rb"^(\d+)\s(\d+)\s$", f.readline())
if not dim_match:
raise Exception("Malformed PFM header.")
w, h = (int(dim) for dim in dim_match.groups())
scale = float(f.readline().rstrip())
if scale < 0: # little-endian
endian = "<"
scale = -scale
else:
endian = ">" # big-endian
data = np.fromfile(f, dtype=endian + "f")
pfm_channels = 3 if header == b"PF" else 1
data = data.reshape(h, w, pfm_channels).transpose(2, 0, 1)
data = np.flip(data, axis=1) # flip on h dimension
data = data[:slice_channels, :, :]
return data.astype(np.float32)
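# Example (a sketch; the file name is hypothetical): a PFM file storing optical flow
# comes back as a float32 array of shape (slice_channels, H, W):
#   flow = _read_pfm("frame_0001.pfm")   # shape (2, H, W) with the default slice_channels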

View File

@ -0,0 +1,104 @@
import os
from typing import Any, Callable, List, Optional, Tuple
import torch
import torch.utils.data as data
from ..utils import _log_api_usage_once
class VisionDataset(data.Dataset):
"""
Base class for making datasets which are compatible with torchvision.
It is necessary to override the ``__getitem__`` and ``__len__`` methods.
Args:
root (string): Root directory of dataset.
transforms (callable, optional): A function/transform that takes in
an image and a label and returns the transformed versions of both.
transform (callable, optional): A function/transform that takes in a PIL image
and returns a transformed version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
.. note::
:attr:`transforms` and the combination of :attr:`transform` and :attr:`target_transform` are mutually exclusive.
"""
_repr_indent = 4
def __init__(
self,
root: str,
transforms: Optional[Callable] = None,
transform: Optional[Callable] = None,
target_transform: Optional[Callable] = None,
) -> None:
self.root = root
has_transforms = transforms is not None
has_separate_transform = transform is not None or target_transform is not None
if has_transforms and has_separate_transform:
raise ValueError("Only transforms or transform/target_transform can be passed as argument")
# for backwards-compatibility
self.transform = transform
self.target_transform = target_transform
if has_separate_transform:
transforms = StandardTransform(transform, target_transform)
self.transforms = transforms
def __getitem__(self, index: int) -> Any:
"""
Args:
index (int): Index
Returns:
(Any): Sample and meta data, optionally transformed by the respective transforms.
"""
raise NotImplementedError
def __len__(self) -> int:
raise NotImplementedError
def __repr__(self) -> str:
head = "Dataset " + self.__class__.__name__
body = [f"Number of datapoints: {self.__len__()}"]
if self.root is not None:
body.append(f"Root location: {self.root}")
body += self.extra_repr().splitlines()
if hasattr(self, "transforms") and self.transforms is not None:
body += [repr(self.transforms)]
lines = [head] + [" " * self._repr_indent + line for line in body]
return "\n".join(lines)
def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
lines = transform.__repr__().splitlines()
return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
def extra_repr(self) -> str:
return ""
class StandardTransform:
def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None:
self.transform = transform
self.target_transform = target_transform
def __call__(self, input: Any, target: Any) -> Tuple[Any, Any]:
if self.transform is not None:
input = self.transform(input)
if self.target_transform is not None:
target = self.target_transform(target)
return input, target
def _format_transform_repr(self, transform: Callable, head: str) -> List[str]:
lines = transform.__repr__().splitlines()
return [f"{head}{lines[0]}"] + ["{}{}".format(" " * len(head), line) for line in lines[1:]]
def __repr__(self) -> str:
body = [self.__class__.__name__]
if self.transform is not None:
body += self._format_transform_repr(self.transform, "Transform: ")
if self.target_transform is not None:
body += self._format_transform_repr(self.target_transform, "Target transform: ")
return "\n".join(body)

View File

@ -0,0 +1 @@
from jittor.transform import *

View File

@ -0,0 +1,582 @@
import collections
import math
import pathlib
import warnings
from itertools import repeat
from types import FunctionType
from typing import Any, BinaryIO, List, Optional, Tuple, Union
import numpy as np
import torch
from PIL import Image, ImageColor, ImageDraw, ImageFont
__all__ = [
"make_grid",
"save_image",
"draw_bounding_boxes",
"draw_segmentation_masks",
"draw_keypoints",
"flow_to_image",
]
@torch.no_grad()
def make_grid(
tensor: Union[torch.Tensor, List[torch.Tensor]],
nrow: int = 8,
padding: int = 2,
normalize: bool = False,
value_range: Optional[Tuple[int, int]] = None,
scale_each: bool = False,
pad_value: float = 0.0,
**kwargs,
) -> torch.Tensor:
"""
Make a grid of images.
Args:
tensor (Tensor or list): 4D mini-batch Tensor of shape (B x C x H x W)
or a list of images all of the same size.
nrow (int, optional): Number of images displayed in each row of the grid.
The final grid size is ``(B / nrow, nrow)``. Default: ``8``.
padding (int, optional): amount of padding. Default: ``2``.
normalize (bool, optional): If True, shift the image to the range (0, 1),
by the min and max values specified by ``value_range``. Default: ``False``.
value_range (tuple, optional): tuple (min, max) where min and max are numbers,
then these numbers are used to normalize the image. By default, min and max
are computed from the tensor.
range (tuple, optional):
.. warning::
This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``value_range``
instead.
scale_each (bool, optional): If ``True``, scale each image in the batch of
images separately rather than the (min, max) over all images. Default: ``False``.
pad_value (float, optional): Value for the padded pixels. Default: ``0``.
Returns:
grid (Tensor): the tensor containing grid of images.
"""
if not torch.jit.is_scripting() and not torch.jit.is_tracing():
_log_api_usage_once(make_grid)
if not torch.is_tensor(tensor):
if isinstance(tensor, list):
for t in tensor:
if not torch.is_tensor(t):
raise TypeError(f"tensor or list of tensors expected, got a list containing {type(t)}")
else:
raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}")
if "range" in kwargs.keys():
warnings.warn(
"The parameter 'range' is deprecated since 0.12 and will be removed in 0.14. "
"Please use 'value_range' instead."
)
value_range = kwargs["range"]
# if list of tensors, convert to a 4D mini-batch Tensor
if isinstance(tensor, list):
tensor = torch.stack(tensor, dim=0)
if tensor.dim() == 2: # single image H x W
tensor = tensor.unsqueeze(0)
if tensor.dim() == 3: # single image
if tensor.size(0) == 1: # if single-channel, convert to 3-channel
tensor = torch.cat((tensor, tensor, tensor), 0)
tensor = tensor.unsqueeze(0)
if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images
tensor = torch.cat((tensor, tensor, tensor), 1)
if normalize is True:
tensor = tensor.clone() # avoid modifying tensor in-place
if value_range is not None and not isinstance(value_range, tuple):
raise TypeError("value_range has to be a tuple (min, max) if specified. min and max are numbers")
def norm_ip(img, low, high):
img.clamp_(min=low, max=high)
img.sub_(low).div_(max(high - low, 1e-5))
def norm_range(t, value_range):
if value_range is not None:
norm_ip(t, value_range[0], value_range[1])
else:
norm_ip(t, float(t.min()), float(t.max()))
if scale_each is True:
for t in tensor: # loop over mini-batch dimension
norm_range(t, value_range)
else:
norm_range(tensor, value_range)
if not isinstance(tensor, torch.Tensor):
raise TypeError("tensor should be of type torch.Tensor")
if tensor.size(0) == 1:
return tensor.squeeze(0)
# make the mini-batch of images into a grid
nmaps = tensor.size(0)
xmaps = min(nrow, nmaps)
ymaps = int(math.ceil(float(nmaps) / xmaps))
height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding)
num_channels = tensor.size(1)
grid = tensor.new_full((num_channels, height * ymaps + padding, width * xmaps + padding), pad_value)
k = 0
for y in range(ymaps):
for x in range(xmaps):
if k >= nmaps:
break
# Tensor.copy_() is a valid method but seems to be missing from the stubs
# https://pytorch.org/docs/stable/tensors.html#torch.Tensor.copy_
grid.narrow(1, y * height + padding, height - padding).narrow( # type: ignore[attr-defined]
2, x * width + padding, width - padding
).copy_(tensor[k])
k = k + 1
return grid
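# Example usage (a sketch): tile a batch of 16 images into a 2-row grid.
#   batch = torch.rand(16, 3, 32, 32)
#   grid = make_grid(batch, nrow=8, padding=2, normalize=True)
#   # grid.shape == (3, 2*32 + 3*2, 8*32 + 9*2) == (3, 70, 274)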
@torch.no_grad()
def save_image(
tensor: Union[torch.Tensor, List[torch.Tensor]],
fp: Union[str, pathlib.Path, BinaryIO],
format: Optional[str] = None,
**kwargs,
) -> None:
"""
Save a given Tensor into an image file.
Args:
tensor (Tensor or list): Image to be saved. If given a mini-batch tensor,
saves the tensor as a grid of images by calling ``make_grid``.
fp (string or file object): A filename or a file object
format (Optional): If omitted, the format to use is determined from the filename extension.
If a file object was used instead of a filename, this parameter should always be used.
**kwargs: Other arguments are documented in ``make_grid``.
"""
if not torch.jit.is_scripting() and not torch.jit.is_tracing():
_log_api_usage_once(save_image)
grid = make_grid(tensor, **kwargs)
# Add 0.5 after unnormalizing to [0, 255] to round to nearest integer
ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
im = Image.fromarray(ndarr)
im.save(fp, format=format)
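# Example usage (a sketch; the output path is hypothetical):
#   save_image(torch.rand(8, 3, 64, 64), "samples.png", nrow=4)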
@torch.no_grad()
def draw_bounding_boxes(
image: torch.Tensor,
boxes: torch.Tensor,
labels: Optional[List[str]] = None,
colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None,
fill: Optional[bool] = False,
width: int = 1,
font: Optional[str] = None,
font_size: Optional[int] = None,
) -> torch.Tensor:
"""
Draws bounding boxes on given image.
The values of the input image should be uint8 between 0 and 255.
If fill is True, the resulting Tensor should be saved as a PNG image.
Args:
image (Tensor): Tensor of shape (C x H x W) and dtype uint8.
boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax) format. Note that
the boxes are absolute coordinates with respect to the image. In other words: `0 <= xmin < xmax < W` and
`0 <= ymin < ymax < H`.
labels (List[str]): List containing the labels of bounding boxes.
colors (color or list of colors, optional): List containing the colors
of the boxes or single color for all boxes. The color can be represented as
PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
By default, random colors are generated for boxes.
fill (bool): If `True` fills the bounding box with specified color.
width (int): Width of bounding box.
font (str): A filename containing a TrueType font. If the file is not found in this filename, the loader may
also search in other directories, such as the `fonts/` directory on Windows or `/Library/Fonts/`,
`/System/Library/Fonts/` and `~/Library/Fonts/` on macOS.
font_size (int): The requested font size in points.
Returns:
img (Tensor[C, H, W]): Image Tensor of dtype uint8 with bounding boxes plotted.
"""
if not torch.jit.is_scripting() and not torch.jit.is_tracing():
_log_api_usage_once(draw_bounding_boxes)
if not isinstance(image, torch.Tensor):
raise TypeError(f"Tensor expected, got {type(image)}")
elif image.dtype != torch.uint8:
raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
elif image.dim() != 3:
raise ValueError("Pass individual images, not batches")
elif image.size(0) not in {1, 3}:
raise ValueError("Only grayscale and RGB images are supported")
elif (boxes[:, 0] > boxes[:, 2]).any() or (boxes[:, 1] > boxes[:, 3]).any():
raise ValueError(
"Boxes need to be in (xmin, ymin, xmax, ymax) format. Use torchvision.ops.box_convert to convert them"
)
num_boxes = boxes.shape[0]
if num_boxes == 0:
warnings.warn("boxes doesn't contain any box. No box was drawn")
return image
if labels is None:
labels: Union[List[str], List[None]] = [None] * num_boxes # type: ignore[no-redef]
elif len(labels) != num_boxes:
raise ValueError(
f"Number of boxes ({num_boxes}) and labels ({len(labels)}) mismatch. Please specify labels for each box."
)
if colors is None:
colors = _generate_color_palette(num_boxes)
elif isinstance(colors, list):
if len(colors) < num_boxes:
raise ValueError(f"Number of colors ({len(colors)}) is less than number of boxes ({num_boxes}). ")
else: # colors specifies a single color for all boxes
colors = [colors] * num_boxes
colors = [(ImageColor.getrgb(color) if isinstance(color, str) else color) for color in colors]
if font is None:
if font_size is not None:
warnings.warn("Argument 'font_size' will be ignored since 'font' is not set.")
txt_font = ImageFont.load_default()
else:
txt_font = ImageFont.truetype(font=font, size=font_size or 10)
# Handle Grayscale images
if image.size(0) == 1:
image = torch.tile(image, (3, 1, 1))
ndarr = image.permute(1, 2, 0).cpu().numpy()
img_to_draw = Image.fromarray(ndarr)
img_boxes = boxes.to(torch.int64).tolist()
if fill:
draw = ImageDraw.Draw(img_to_draw, "RGBA")
else:
draw = ImageDraw.Draw(img_to_draw)
for bbox, color, label in zip(img_boxes, colors, labels): # type: ignore[arg-type]
if fill:
fill_color = color + (100,)
draw.rectangle(bbox, width=width, outline=color, fill=fill_color)
else:
draw.rectangle(bbox, width=width, outline=color)
if label is not None:
margin = width + 1
draw.text((bbox[0] + margin, bbox[1] + margin), label, fill=color, font=txt_font)
return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8)
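# Example usage (a sketch; all tensors are hypothetical):
#   img = torch.randint(0, 256, (3, 100, 100), dtype=torch.uint8)
#   boxes = torch.tensor([[10, 10, 50, 50], [20, 60, 70, 90]], dtype=torch.float)
#   out = draw_bounding_boxes(img, boxes, labels=["cat", "dog"], colors="red", width=2)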
@torch.no_grad()
def draw_segmentation_masks(
image: torch.Tensor,
masks: torch.Tensor,
alpha: float = 0.8,
colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None,
) -> torch.Tensor:
"""
Draws segmentation masks on given RGB image.
The values of the input image should be uint8 between 0 and 255.
Args:
image (Tensor): Tensor of shape (3, H, W) and dtype uint8.
masks (Tensor): Tensor of shape (num_masks, H, W) or (H, W) and dtype bool.
alpha (float): Float number between 0 and 1 denoting the transparency of the masks.
0 means full transparency, 1 means no transparency.
colors (color or list of colors, optional): List containing the colors
of the masks or single color for all masks. The color can be represented as
PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
By default, random colors are generated for each mask.
Returns:
img (Tensor[C, H, W]): Image Tensor, with segmentation masks drawn on top.
"""
if not torch.jit.is_scripting() and not torch.jit.is_tracing():
_log_api_usage_once(draw_segmentation_masks)
if not isinstance(image, torch.Tensor):
raise TypeError(f"The image must be a tensor, got {type(image)}")
elif image.dtype != torch.uint8:
raise ValueError(f"The image dtype must be uint8, got {image.dtype}")
elif image.dim() != 3:
raise ValueError("Pass individual images, not batches")
elif image.size()[0] != 3:
raise ValueError("Pass an RGB image. Other Image formats are not supported")
if masks.ndim == 2:
masks = masks[None, :, :]
if masks.ndim != 3:
raise ValueError("masks must be of shape (H, W) or (batch_size, H, W)")
if masks.dtype != torch.bool:
raise ValueError(f"The masks must be of dtype bool. Got {masks.dtype}")
if masks.shape[-2:] != image.shape[-2:]:
raise ValueError("The image and the masks must have the same height and width")
num_masks = masks.size()[0]
if colors is not None and num_masks > len(colors):
raise ValueError(f"There are more masks ({num_masks}) than colors ({len(colors)})")
if num_masks == 0:
warnings.warn("masks doesn't contain any mask. No mask was drawn")
return image
if colors is None:
colors = _generate_color_palette(num_masks)
if not isinstance(colors, list):
colors = [colors]
if not isinstance(colors[0], (tuple, str)):
raise ValueError("colors must be a tuple or a string, or a list thereof")
if isinstance(colors[0], tuple) and len(colors[0]) != 3:
raise ValueError("It seems that you passed a tuple of colors instead of a list of colors")
out_dtype = torch.uint8
colors_ = []
for color in colors:
if isinstance(color, str):
color = ImageColor.getrgb(color)
colors_.append(torch.tensor(color, dtype=out_dtype))
img_to_draw = image.detach().clone()
# TODO: There might be a way to vectorize this
for mask, color in zip(masks, colors_):
img_to_draw[:, mask] = color[:, None]
out = image * (1 - alpha) + img_to_draw * alpha
return out.to(out_dtype)
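# Example usage (a sketch, reusing a hypothetical uint8 `img` of shape (3, 100, 100)):
#   masks = torch.zeros(2, 100, 100, dtype=torch.bool)
#   masks[0, :50] = True
#   out = draw_segmentation_masks(img, masks, alpha=0.6, colors=["blue", "green"])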
@torch.no_grad()
def draw_keypoints(
image: torch.Tensor,
keypoints: torch.Tensor,
connectivity: Optional[List[Tuple[int, int]]] = None,
colors: Optional[Union[str, Tuple[int, int, int]]] = None,
radius: int = 2,
width: int = 3,
) -> torch.Tensor:
"""
Draws Keypoints on given RGB image.
The values of the input image should be uint8 between 0 and 255.
Args:
image (Tensor): Tensor of shape (3, H, W) and dtype uint8.
keypoints (Tensor): Tensor of shape (num_instances, K, 2), giving the K keypoint locations for each of the
num_instances instances, in the format [x, y].
connectivity (List[Tuple[int, int]]): A list of tuples, where
each tuple contains a pair of keypoints to be connected.
colors (str, Tuple): The color can be represented as
PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``.
radius (int): Integer denoting radius of keypoint.
width (int): Integer denoting width of line connecting keypoints.
Returns:
img (Tensor[C, H, W]): Image Tensor of dtype uint8 with keypoints drawn.
"""
if not torch.jit.is_scripting() and not torch.jit.is_tracing():
_log_api_usage_once(draw_keypoints)
if not isinstance(image, torch.Tensor):
raise TypeError(f"The image must be a tensor, got {type(image)}")
elif image.dtype != torch.uint8:
raise ValueError(f"The image dtype must be uint8, got {image.dtype}")
elif image.dim() != 3:
raise ValueError("Pass individual images, not batches")
elif image.size()[0] != 3:
raise ValueError("Pass an RGB image. Other Image formats are not supported")
if keypoints.ndim != 3:
raise ValueError("keypoints must be of shape (num_instances, K, 2)")
ndarr = image.permute(1, 2, 0).cpu().numpy()
img_to_draw = Image.fromarray(ndarr)
draw = ImageDraw.Draw(img_to_draw)
img_kpts = keypoints.to(torch.int64).tolist()
for inst_id, kpt_inst in enumerate(img_kpts):
for kpt_id, kpt in enumerate(kpt_inst):
x1 = kpt[0] - radius
x2 = kpt[0] + radius
y1 = kpt[1] - radius
y2 = kpt[1] + radius
draw.ellipse([x1, y1, x2, y2], fill=colors, outline=None, width=0)
if connectivity:
for connection in connectivity:
start_pt_x = kpt_inst[connection[0]][0]
start_pt_y = kpt_inst[connection[0]][1]
end_pt_x = kpt_inst[connection[1]][0]
end_pt_y = kpt_inst[connection[1]][1]
draw.line(
((start_pt_x, start_pt_y), (end_pt_x, end_pt_y)),
width=width,
)
return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8)
# Flow visualization code adapted from https://github.com/tomrunia/OpticalFlow_Visualization
@torch.no_grad()
def flow_to_image(flow: torch.Tensor) -> torch.Tensor:
"""
Converts a flow to an RGB image.
Args:
flow (Tensor): Flow of shape (N, 2, H, W) or (2, H, W) and dtype torch.float.
Returns:
img (Tensor): Image Tensor of dtype uint8 where each color corresponds
to a given flow direction. Shape is (N, 3, H, W) or (3, H, W) depending on the input.
"""
if flow.dtype != torch.float:
raise ValueError(f"Flow should be of dtype torch.float, got {flow.dtype}.")
orig_shape = flow.shape
if flow.ndim == 3:
flow = flow[None] # Add batch dim
if flow.ndim != 4 or flow.shape[1] != 2:
raise ValueError(f"Input flow should have shape (2, H, W) or (N, 2, H, W), got {orig_shape}.")
max_norm = torch.sum(flow**2, dim=1).sqrt().max()
epsilon = torch.finfo(flow.dtype).eps
normalized_flow = flow / (max_norm + epsilon)
img = _normalized_flow_to_image(normalized_flow)
if len(orig_shape) == 3:
img = img[0] # Remove batch dim
return img
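# Example usage (a sketch):
#   flow = torch.randn(2, 64, 64)   # a single (2, H, W) flow field
#   rgb = flow_to_image(flow)       # uint8 image of shape (3, 64, 64)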
@torch.no_grad()
def _normalized_flow_to_image(normalized_flow: torch.Tensor) -> torch.Tensor:
"""
Converts a batch of normalized flow to an RGB image.
Args:
normalized_flow (torch.Tensor): Normalized flow tensor of shape (N, 2, H, W)
Returns:
img (Tensor(N, 3, H, W)): Flow visualization image of dtype uint8.
"""
N, _, H, W = normalized_flow.shape
device = normalized_flow.device
flow_image = torch.zeros((N, 3, H, W), dtype=torch.uint8, device=device)
colorwheel = _make_colorwheel().to(device) # shape [55x3]
num_cols = colorwheel.shape[0]
norm = torch.sum(normalized_flow**2, dim=1).sqrt()
a = torch.atan2(-normalized_flow[:, 1, :, :], -normalized_flow[:, 0, :, :]) / torch.pi
fk = (a + 1) / 2 * (num_cols - 1)
k0 = torch.floor(fk).to(torch.long)
k1 = k0 + 1
k1[k1 == num_cols] = 0
f = fk - k0
for c in range(colorwheel.shape[1]):
tmp = colorwheel[:, c]
col0 = tmp[k0] / 255.0
col1 = tmp[k1] / 255.0
col = (1 - f) * col0 + f * col1
col = 1 - norm * (1 - col)
flow_image[:, c, :, :] = torch.floor(255 * col)
return flow_image
def _make_colorwheel() -> torch.Tensor:
"""
Generates a color wheel for optical flow visualization as presented in:
Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007)
URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf.
Returns:
colorwheel (Tensor[55, 3]): Colorwheel Tensor.
"""
RY = 15
YG = 6
GC = 4
CB = 11
BM = 13
MR = 6
ncols = RY + YG + GC + CB + BM + MR
colorwheel = torch.zeros((ncols, 3))
col = 0
# RY
colorwheel[0:RY, 0] = 255
colorwheel[0:RY, 1] = torch.floor(255 * torch.arange(0, RY) / RY)
col = col + RY
# YG
colorwheel[col : col + YG, 0] = 255 - torch.floor(255 * torch.arange(0, YG) / YG)
colorwheel[col : col + YG, 1] = 255
col = col + YG
# GC
colorwheel[col : col + GC, 1] = 255
colorwheel[col : col + GC, 2] = torch.floor(255 * torch.arange(0, GC) / GC)
col = col + GC
# CB
colorwheel[col : col + CB, 1] = 255 - torch.floor(255 * torch.arange(CB) / CB)
colorwheel[col : col + CB, 2] = 255
col = col + CB
# BM
colorwheel[col : col + BM, 2] = 255
colorwheel[col : col + BM, 0] = torch.floor(255 * torch.arange(0, BM) / BM)
col = col + BM
# MR
colorwheel[col : col + MR, 2] = 255 - torch.floor(255 * torch.arange(MR) / MR)
colorwheel[col : col + MR, 0] = 255
return colorwheel
def _generate_color_palette(num_objects: int):
palette = torch.tensor([2**25 - 1, 2**15 - 1, 2**21 - 1])
return [tuple((i * palette) % 255) for i in range(num_objects)]
def _log_api_usage_once(obj: Any) -> None:
"""
Logs API usage (module and name) within an organization.
In a large ecosystem, it's often useful to track PyTorch and
TorchVision API usage. This API provides functionality similar to the
logging module in the Python stdlib. It can be used for debugging purposes
to log which methods are used; by default it is inactive, unless the user
manually subscribes a logger via the `SetAPIUsageLogger method <https://github.com/pytorch/pytorch/blob/eb3b9fe719b21fae13c7a7cf3253f970290a573e/c10/util/Logging.cpp#L114>`_.
Please note it is triggered only once for the same API call within a process.
It does not collect any data from open-source users since it is a no-op by default.
For more information, please refer to
* PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging;
* Logging policy: https://github.com/pytorch/vision/issues/5052;
Args:
obj (class instance or method): an object to extract info from.
"""
pass
def _make_ntuple(x: Any, n: int) -> Tuple[Any, ...]:
"""
Make an n-tuple from input x. If x is an iterable, we simply convert it to a tuple.
Otherwise, we make a tuple of length n with every element equal to x.
reference: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/utils.py#L8
Args:
x (Any): input value
n (int): length of the resulting tuple
"""
if isinstance(x, collections.abc.Iterable):
return tuple(x)
return tuple(repeat(x, n))
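# Example (a sketch):
#   _make_ntuple(3, 2)      -> (3, 3)
#   _make_ntuple((1, 2), 2) -> (1, 2)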

View File

@ -0,0 +1,715 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os, sys, shutil
import platform
from .compiler import *
from jittor_utils import run_cmd, get_version, get_int_version
from jittor_utils.misc import download_url_to_local
import jittor_utils as jit_utils
def search_file(dirs, name, prefer_version=()):
if os.name == 'nt':
if name.startswith("lib"):
name = name[3:].replace(".so", "64*.dll")
for d in dirs:
fname = os.path.join(d, name)
if os.name == 'nt':
lname = os.path.join(d, name)
names = glob.glob(lname)
if len(names):
return names[0]
continue
prefer_version = tuple( str(p) for p in prefer_version )
for i in range(len(prefer_version),-1,-1):
vname = ".".join((fname,)+prefer_version[:i])
if os.path.isfile(vname):
LOG.v(f"found {vname}")
return vname
LOG.f(f"file {name} not found in {dirs}")
def install_mkl(root_folder):
# origin url is
# url = "https://github.com/intel/mkl-dnn/releases/download/v1.0.2/mkldnn_lnx_1.0.2_cpu_gomp.tgz"
import platform
url = None
if platform.system()=="Linux":
if platform.machine()=='x86_64':
filename = "dnnl_lnx_2.2.0_cpu_gomp.tgz"
md5 = "35bbbdf550a9d8ad54db798e372000f6"
elif platform.machine()=='aarch64':
filename = "dnnl_lnx_2.2.0_cpu_gomp_aarch64.tgz"
md5 = "72cf9b0b8fd6c3c786d35a9daaee22b8"
else:
raise RuntimeError(f"platform.machine()=={platform.machine()} not support yet,"
" Please contact us on https://github.com/jittor/jittor ")
elif os.name == "nt":
# url = "https://github.com/oneapi-src/oneDNN/releases/download/v2.2/dnnl_win_2.2.0_cpu_iomp.zip"
# url = "https://github.com/oneapi-src/oneDNN/releases/download/v2.2/dnnl_win_2.2.0_cpu_vcomp.zip"
filename = "dnnl_win_2.2.0_cpu_vcomp.zip"
md5 = "fa12c693b2ec07700d174e1e99d60a7e"
elif platform.system() == "Darwin":
if platform.machine() == "arm64":
filename = "dnnl_mac_2.2.0_cpu_omp_arm64.tgz"
md5 = "d8fdf56d3cf618685d22d18f08119f88"
else:
filename = "dnnl_mac_2.2.0_cpu_omp_x86_64.tgz"
md5 = "6e2f065d6a589c82081536b684768fe6"
else:
raise RuntimeError(f"platform.machine()=={platform.machine()} not support yet,"
" Please contact us on https://github.com/jittor/jittor ")
if not url:
url = "https://cg.cs.tsinghua.edu.cn/jittor/assets/" + filename
fullname = os.path.join(root_folder, filename)
dirname = os.path.join(root_folder, filename.rsplit(".",1)[0])
if not (os.path.isfile(os.path.join(dirname, "lib", "libmkldnn.so")) or
os.path.isfile(os.path.join(dirname, "bin", "dnnl.dll")) or
os.path.isfile(os.path.join(dirname, "lib", "libmkldnn.dylib"))):
LOG.i("Downloading mkl...")
download_url_to_local(url, filename, root_folder, md5)
if fullname.endswith(".zip"):
import zipfile
with zipfile.ZipFile(fullname, "r") as f:
f.extractall(root_folder)
else:
import tarfile
with tarfile.open(fullname, "r") as tar:
tar.extractall(root_folder)
if os.name == 'nt':
# this env is used to execute the example/test below
bin_path = os.path.join(dirname, "bin")
sys.path.append(bin_path)
os.environ["PATH"] = os.environ.get("PATH", "") + ";" + bin_path
cmd = f"cd /d {dirname}/examples && {cc_path} {dirname}/examples/cnn_inference_f32.cpp -I{dirname}/include -Fe: {dirname}/examples/test.exe {fix_cl_flags(cc_flags).replace('-LD', '')} {dirname}/lib/mkldnn.lib"
assert 0 == os.system(cmd)
assert 0 == os.system(f"{dirname}/examples/test")
elif platform.system() == "Darwin":
assert 0 == os.system(f"cd {dirname}/examples && "
f"{cc_path} -std=c++14 cnn_inference_f32.cpp -Ofast -lmkldnn -I ../include -L ../lib -o test && DYLD_LIBRARY_PATH=../lib/ ./test")
else:
assert 0 == os.system(f"cd {dirname}/examples && "
f"{cc_path} -std=c++14 cnn_inference_f32.cpp -Ofast -lmkldnn -I ../include -L ../lib -o test && LD_LIBRARY_PATH=../lib/ ./test")
def setup_mkl():
global mkl_ops, use_mkl
use_mkl = os.environ.get("use_mkl", "1")=="1"
mkl_ops = None
if not use_mkl: return
# pytorch's mkl conflicts with jittor's mkl,
# yielding errors such as "free: invalid size" or
# "mmap error"
# importing pytorch(>1.8) first can fix this problem
# try:
# # jt.dirty_fix_pytorch_runtime_error()
# import torch
# from torch import nn
# except:
# torch = None
mkl_include_path = os.environ.get("mkl_include_path")
mkl_lib_path = os.environ.get("mkl_lib_path")
if mkl_lib_path is None or mkl_include_path is None:
LOG.v("setup mkl...")
# mkl_path = os.path.join(cache_path, "mkl")
# mkl_path decoupled from cc_path
mkl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "mkl")
make_cache_dir(mkl_path)
install_mkl(mkl_path)
mkl_home = ""
for name in os.listdir(mkl_path):
if name.startswith("dnnl") and os.path.isdir(os.path.join(mkl_path, name)):
mkl_home = os.path.join(mkl_path, name)
break
assert mkl_home!=""
mkl_include_path = os.path.join(mkl_home, "include")
mkl_lib_path = os.path.join(mkl_home, "lib")
mkl_lib_name = os.path.join(mkl_lib_path, "libmkldnn.so")
extra_flags = f" -I\"{mkl_include_path}\" -L\"{mkl_lib_path}\" -lmkldnn "
if os.name == 'nt':
mkl_lib_name = os.path.join(mkl_home, 'bin', 'dnnl.dll')
mkl_bin_path = os.path.join(mkl_home, 'bin')
extra_flags = f" -I\"{mkl_include_path}\" -L\"{mkl_lib_path}\" -L\"{mkl_bin_path}\" -ldnnl "
elif platform.system() == "Darwin":
mkl_lib_name = os.path.join(mkl_lib_path, "libmkldnn.dylib")
assert os.path.isdir(mkl_include_path)
assert os.path.isdir(mkl_lib_path)
assert os.path.isfile(mkl_lib_name)
LOG.v(f"mkl_include_path: {mkl_include_path}")
LOG.v(f"mkl_lib_path: {mkl_lib_path}")
LOG.v(f"mkl_lib_name: {mkl_lib_name}")
# We do not link manually; linking happens in custom ops
# ctypes.CDLL(mkl_lib_name, dlopen_flags)
mkl_op_dir = os.path.join(jittor_path, "extern", "mkl", "ops")
mkl_op_files = [os.path.join(mkl_op_dir, name) for name in os.listdir(mkl_op_dir)]
mkl_ops = compile_custom_ops(mkl_op_files, extra_flags=extra_flags)
LOG.vv("Get mkl_ops: "+str(dir(mkl_ops)))
def install_cub(root_folder):
url = "https://github.com/NVIDIA/cub/archive/1.11.0.tar.gz"
url = "https://codeload.github.com/NVIDIA/cub/tar.gz/1.11.0"
filename = "cub-1.11.0.tgz"
md5 = "97196a885598e40592100e1caaf3d5ea"
fullname = os.path.join(root_folder, filename)
dirname = os.path.join(root_folder, filename.replace(".tgz",""))
if not os.path.isfile(os.path.join(dirname, "examples", "device/example_device_radix_sort.cu")):
LOG.i("Downloading cub...")
download_url_to_local(url, filename, root_folder, md5)
import tarfile
with tarfile.open(fullname, "r") as tar:
tar.extractall(root_folder)
# assert 0 == os.system(f"cd {dirname}/examples && "
# f"{nvcc_path} --cudart=shared -ccbin=\"{cc_path}\" device/example_device_radix_sort.cu -O2 -I.. -std=c++14 -o test")
# if core.get_device_count():
# assert 0 == os.system(f"cd {dirname}/examples && ./test")
return dirname
def setup_cub():
global cub_home
cub_home = ""
cub_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cub")
cuda_version = int(get_version(nvcc_path)[1:-1].split('.')[0])
extra_flags = ""
if cuda_version < 11:
cub_home = install_cub(cub_path)
extra_flags = f"-I{cub_home}"
cub_home += "/"
setup_cuda_lib("cub", link=False, extra_flags=extra_flags)
def setup_cuda_extern():
if not has_cuda: return
def split(a): return a.replace(";",":").split(":")
check_ld_path = split(os.environ.get("LD_LIBRARY_PATH", "")) + \
split(os.environ.get("PATH", ""))
for cp in check_ld_path:
cp = cp.lower()
if "cuda" in cp and \
"lib" in cp and \
"jtcuda" not in cp:
LOG.w(f"CUDA related path found in LD_LIBRARY_PATH or PATH, "
"This path may cause jittor found the wrong libs, "
"please unset LD_LIBRARY_PATH and remove cuda lib path in Path. \n"
"Or you can let jittor install cuda for you: `python3.x -m jittor_utils.install_cuda`")
break
LOG.vv("setup cuda extern...")
cache_path_cuda = os.path.join(cache_path, "cuda")
cuda_include = os.path.join(jittor_path, "extern", "cuda", "inc")
make_cache_dir(cache_path_cuda)
cuda_extern_src = os.path.join(jittor_path, "extern", "cuda", "src")
cuda_extern_files = [os.path.join(cuda_extern_src, name)
for name in os.listdir(cuda_extern_src)]
so_name = os.path.join(cache_path_cuda, "libcuda_extern"+so)
compile(cc_path, cc_flags+f" -I\"{cuda_include}\" ", cuda_extern_files, so_name)
link_cuda_extern = f" -L\"{cache_path_cuda}\" -llibcuda_extern "
ctypes.CDLL(so_name, dlopen_flags)
try:
setup_cub()
except Exception as e:
import traceback
line = traceback.format_exc()
LOG.w(f"CUDA found but cub is not loaded:\n{line}")
libs = ["cublas", "cudnn", "curand", "cufft"]
# in cuda 11.4, module memory consumption:
# default context: 259 MB
# cublas: 340 MB
# cudnn: 340 MB
if int(os.environ.get("conv_opt", "0")):
libs = ["cublas", "curand"]
for lib_name in libs:
try:
setup_cuda_lib(lib_name, extra_flags=link_cuda_extern)
except Exception as e:
msg = f"CUDA found but {lib_name} is not loaded:\n"
if lib_name == "cudnn":
msg += """Develop version of CUDNN not found,
please refer to CUDA offical tar file installation:
https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar"""
if platform.machine() in ["x86_64", "AMD64"]:
msg += f"""
or you can let jittor install cuda and cudnn for you:
>>> python3.{sys.version_info.minor} -m jittor_utils.install_cuda
"""
LOG.f(msg)
def setup_cuda_lib(lib_name, link=True, extra_flags=""):
arch_key = "x86_64"
if platform.machine() not in ["x86_64", "AMD64"]:
arch_key = "aarch64"
globals()[lib_name+"_ops"] = None
globals()[lib_name] = None
if not has_cuda: return
LOG.v(f"setup {lib_name}...")
culib_path = os.path.join(cuda_lib, f"lib{lib_name}.so")
jt_cuda_include = os.path.join(jittor_path, "extern", "cuda", "inc")
jt_culib_include = os.path.join(jittor_path, "extern", "cuda", lib_name, "inc")
link_flags = ""
if link:
extra_include_path = os.path.abspath(os.path.join(cuda_include, "..", f"targets/{arch_key}-linux/include"))
extra_lib_path = os.path.abspath(os.path.join(cuda_lib, "..", f"targets/{arch_key}-linux/lib"))
cuda_include_name = search_file([cuda_include, extra_include_path, "/usr/include"], lib_name+".h")
# cuda11 prefers cudnn 8
nvcc_version = get_int_version(nvcc_path)
if has_corex:
nvcc_version = (10,2,89)
prefer_version = ()
if nvcc_version[0] == 11:
prefer_version = ("8",)
culib_path = search_file([cuda_bin, cuda_lib, extra_lib_path, f"/usr/lib/{arch_key}-linux-gnu", "/usr/lib"], f"lib{lib_name}.so", prefer_version)
if lib_name == "cublas" and nvcc_version[0] >= 10:
# manually link libcublasLt.so
try:
cublas_lt_lib_path = search_file([cuda_bin, cuda_lib, extra_lib_path, f"/usr/lib/{arch_key}-linux-gnu", "/usr/lib"], f"libcublasLt.so", nvcc_version)
ctypes.CDLL(cublas_lt_lib_path, dlopen_flags)
except:
# some aarch64 OSes, such as UOS with an FT2000 CPU,
# ship a CUDA 10 that doesn't have libcublasLt.so
pass
if lib_name == "cudnn":
# cudnn cannot find libcudnn_cnn_train.so.8, so we manually link it.
if nvcc_version >= (11,0,0):
libs = ["libcudnn_ops_infer.so", "libcudnn_ops_train.so", "libcudnn_cnn_infer.so", "libcudnn_cnn_train.so"]
for l in libs:
ex_cudnn_path = search_file([cuda_bin, cuda_lib, extra_lib_path, f"/usr/lib/{arch_key}-linux-gnu", "/usr/lib"], l, prefer_version)
ctypes.CDLL(ex_cudnn_path, dlopen_flags)
# dynamic link cuda library
# ctypes.CDLL(culib_path, dlopen_flags)
# link_flags = f"-l{lib_name} -L\"{cuda_lib}\""
link_flags = f"-l{lib_name} -L\"{os.path.dirname(culib_path)}\""
# print("link_flags", link_flags, culib_path)
# find all source files
culib_src_dir = os.path.join(jittor_path, "extern", "cuda", lib_name)
culib_src_files = []
for r, _, f in os.walk(culib_src_dir):
for fname in f:
culib_src_files.append(os.path.join(r, fname))
if len(culib_src_files) == 0:
return
# compile and get operators
culib = compile_custom_ops(culib_src_files, return_module=True,
extra_flags=f" -I\"{jt_cuda_include}\" -I\"{jt_culib_include}\" {link_flags} {extra_flags} ")
culib_ops = culib.ops
globals()[lib_name+"_ops"] = culib_ops
globals()[lib_name] = culib
LOG.vv(f"Get {lib_name}_ops: "+str(dir(culib_ops)))
def _setup_fake_cuda_lib(lib_name=None, link=True, extra_flags=""):
if lib_name is None:
lib_names = ["cudnn", "cublas", "curand", "cufft", "cub", "cutt", "cutlass"]
for lib_name in lib_names:
_setup_fake_cuda_lib(lib_name, link, extra_flags)
return
arch_key = "x86_64"
if platform.machine() not in ["x86_64", "AMD64"]:
arch_key = "aarch64"
globals()[lib_name+"_ops"] = None
globals()[lib_name] = None
LOG.v(f"setup {lib_name}...")
jt_cuda_include = os.path.join(jittor_path, "extern", "cuda", "inc")
jt_culib_include = os.path.join(jittor_path, "extern", "cuda", lib_name, "inc")
# find all source files
culib_src_dir = os.path.join(jittor_path, "extern", "cuda", lib_name, "ops")
culib_src_files = []
for r, _, f in os.walk(culib_src_dir):
for fname in f:
if fname.endswith("op.cc") or fname.endswith("op.h"):
culib_src_files.append(os.path.join(r, fname))
if len(culib_src_files) == 0:
return
# compile and get operators
culib = compile_custom_ops(culib_src_files, return_module=True,
extra_flags=f" -I\"{jt_cuda_include}\" -I\"{jt_culib_include}\" {extra_flags} ")
culib_ops = culib.ops
globals()[lib_name+"_ops"] = culib_ops
globals()[lib_name] = culib
LOG.vv(f"Get {lib_name}_ops: "+str(dir(culib_ops)))
if setup_fake_cuda_lib:
_setup_fake_cuda_lib()
def install_cutt(root_folder):
# Modified from: https://github.com/ap-hynninen/cutt
url = "https://codeload.github.com/Jittor/cutt/zip/v1.2"
filename = "cutt-1.2.zip"
fullname = os.path.join(root_folder, filename)
dirname = os.path.join(root_folder, filename.replace(".zip",""))
true_md5 = "14d0fd1132c8cd657dc3cf29ce4db931"
if os.path.exists(fullname):
from jittor_utils.misc import calculate_md5
md5 = calculate_md5(fullname)
if md5 != true_md5:
os.remove(fullname)
shutil.rmtree(dirname)
CUTT_PATH = os.environ.get("CUTT_PATH", "")
if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)) or CUTT_PATH:
if CUTT_PATH:
dirname = CUTT_PATH
else:
LOG.i("Downloading cutt...")
download_url_to_local(url, filename, root_folder, true_md5)
import zipfile
zf = zipfile.ZipFile(fullname)
try:
zf.extractall(path=root_folder)
except RuntimeError as e:
print(e)
raise
zf.close()
LOG.i("installing cutt...")
# -Xptxas -dlcm=ca actually does not work
arch_flag = " -Xptxas -dlcm=ca "
if len(flags.cuda_archs):
arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
cutt_include = f" -I\"{dirname}/include\" -I\"{dirname}/src\" "
files = glob.glob(dirname+"/src/*.c*", recursive=True)
files2 = []
for f in files:
if f.endswith("cutt_bench.cpp") or \
f.endswith("cutt_test.cpp"):
continue
files2.append(f)
cutt_flags = cc_flags+opt_flags+cutt_include
compile(cc_path, cutt_flags, files2, cache_path+"/libcutt"+so, cuda_flags=arch_flag)
return dirname
def setup_cutt():
global cutt_ops, use_cutt
if not has_cuda:
use_cutt = False
return
use_cutt = os.environ.get("use_cutt", "1")=="1"
cutt_ops = None
if not use_cutt: return
cutt_include_path = os.environ.get("cutt_include_path")
cutt_lib_path = os.environ.get("cutt_lib_path")
if cutt_lib_path is None or cutt_include_path is None:
LOG.v("setup cutt...")
# cutt_path decoupled from cc_path
cutt_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cutt")
make_cache_dir(cutt_path)
install_cutt(cutt_path)
cutt_home = os.path.join(cutt_path, "cutt-1.2")
cutt_include_path = os.path.join(cutt_home, "src")
cutt_lib_path = cache_path
cutt_lib_name = os.path.join(cutt_lib_path, "libcutt"+so)
assert os.path.isdir(cutt_include_path)
assert os.path.isdir(cutt_lib_path)
assert os.path.isfile(cutt_lib_name), cutt_lib_name
LOG.v(f"cutt_include_path: {cutt_include_path}")
LOG.v(f"cutt_lib_path: {cutt_lib_path}")
LOG.v(f"cutt_lib_name: {cutt_lib_name}")
# We do not link manually; linking happens in custom ops
ctypes.CDLL(cutt_lib_name, dlopen_flags)
cutt_op_dir = os.path.join(jittor_path, "extern", "cuda", "cutt", "ops")
cutt_op_files = [os.path.join(cutt_op_dir, name) for name in os.listdir(cutt_op_dir)]
cutt_ops = compile_custom_ops(cutt_op_files,
extra_flags=f" -I\"{cutt_include_path}\" -L\"{cutt_lib_path}\" -llibcutt ")
LOG.vv("Get cutt_ops: "+str(dir(cutt_ops)))
def install_cutlass(root_folder):
# Modified from: https://github.com/NVIDIA/cutlass
url = "https://cloud.tsinghua.edu.cn/f/171e49e5825549548bc4/?dl=1"
filename = "cutlass.zip"
fullname = os.path.join(root_folder, filename)
dirname = os.path.join(root_folder, filename.replace(".zip",""))
true_md5 = "999ecb7e217e40c497bc3d0ded6643f0"
if os.path.exists(fullname):
from jittor_utils.misc import calculate_md5
md5 = calculate_md5(fullname)
if md5 != true_md5:
os.remove(fullname)
shutil.rmtree(dirname)
CUTLASS_PATH = os.environ.get("CUTLASS_PATH", "")
if not os.path.isfile(os.path.join(jit_utils.home(), ".cache/jittor/cutlass/cutlass/include/cutlass/cutlass.h")) or CUTLASS_PATH:
if CUTLASS_PATH:
dirname = CUTLASS_PATH
else:
LOG.i("Downloading cutlass...")
download_url_to_local(url, filename, root_folder, true_md5)
import zipfile
zf = zipfile.ZipFile(fullname)
try:
zf.extractall(path=root_folder)
except RuntimeError as e:
print(e)
raise
zf.close()
# LOG.i("installing cutlass...")
# # -Xptxas -dlcm=ca actually not work
# arch_flag = " -Xptxas -dlcm=ca "
# if len(flags.cuda_archs):
# arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
# arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
# cutlass_include = f" -I\"{dirname}/include\" -I\"{dirname}/src\" "
# files = glob.glob(dirname+"/src/*.c*", recursive=True)
# files2 = []
# for f in files:
# if f.endswith("cutlass_bench.cpp") or \
# f.endswith("cutlass_test.cpp"):
# continue
# files2.append(f)
# cutlass_flags = cc_flags+opt_flags+cutlass_include
# compile(cc_path, cutlass_flags, files2, cache_path+"/libcutlass"+so, cuda_flags=arch_flag)
return dirname
def setup_cutlass():
global cutlass_ops, use_cutlass
if not has_cuda:
use_cutlass = False
return
use_cutlass = os.environ.get("use_cutlass", "1")=="1"
cutlass_ops = None
if not use_cutlass: return
cutlass_include_path = os.environ.get("cutlass_include_path")
if cutlass_include_path is None:
LOG.v("setup cutlass...")
# cutlass_path decoupled from cc_path
cutlass_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cutlass")
make_cache_dir(cutlass_path)
install_cutlass(cutlass_path)
def install_nccl(root_folder):
url = "https://github.com/NVIDIA/nccl/archive/v2.8.4-1.tar.gz"
url = "https://codeload.github.com/NVIDIA/nccl/tar.gz/v2.8.4-1"
filename = "nccl.tgz"
fullname = os.path.join(root_folder, filename)
dirname = os.path.join(root_folder, "nccl-2.8.4-1")
true_md5 = "900666558c5bc43e0a5e84045b88a06f"
if os.path.exists(fullname):
md5 = run_cmd('md5sum '+fullname).split()[0]
if md5 != true_md5:
os.remove(fullname)
if os.path.isdir(dirname):
shutil.rmtree(dirname)
if not os.path.isfile(os.path.join(dirname, "build", "lib", "libnccl.so")):
if not os.path.isfile(os.path.join(root_folder, filename)):
LOG.i("Downloading nccl...")
download_url_to_local(url, filename, root_folder, true_md5)
if core.get_device_count() == 0:
return
if not inside_mpi():
return
import tarfile
with tarfile.open(fullname, "r") as tar:
tar.extractall(root_folder)
LOG.i("installing nccl...")
arch_flag = ""
if len(flags.cuda_archs):
arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
run_cmd(f"CC=\"{cc_path}\" CXX=\"{cc_path}\" make -j8 src.build CUDA_HOME='{cuda_home}' NVCC_GENCODE='{arch_flag} --cudart=shared ' ", cwd=dirname)
return dirname
def setup_nccl():
global nccl, nccl_ops, use_nccl
use_nccl = os.environ.get("use_nccl", "1")=="1"
nccl = None
nccl_ops = None
if not has_cuda or not has_mpi:
use_nccl = False
return
if not use_nccl: return
nccl_include_path = os.environ.get("nccl_include_path")
nccl_lib_path = os.environ.get("nccl_lib_path")
if nccl_lib_path is None or nccl_include_path is None:
LOG.v("setup nccl...")
# nccl_path decoupled from cc_path
nccl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "nccl")
make_cache_dir(nccl_path)
nccl_home = install_nccl(nccl_path)
if nccl_home is None: return
nccl_include_path = os.path.join(nccl_home, "build", "include")
nccl_lib_path = os.path.join(nccl_home, "build", "lib")
if not inside_mpi():
return
nccl_lib_name = os.path.join(nccl_lib_path, "libnccl.so")
assert os.path.isdir(nccl_include_path)
assert os.path.isdir(nccl_lib_path)
assert os.path.isfile(nccl_lib_name), nccl_lib_name
LOG.v(f"nccl_include_path: {nccl_include_path}")
LOG.v(f"nccl_lib_path: {nccl_lib_path}")
LOG.v(f"nccl_lib_name: {nccl_lib_name}")
# We do not link manually; linking happens in custom ops
ctypes.CDLL(nccl_lib_name, dlopen_flags)
nccl_src_dir = os.path.join(jittor_path, "extern", "cuda", "nccl")
nccl_src_files = []
for r, _, f in os.walk(nccl_src_dir):
for fname in f:
nccl_src_files.append(os.path.join(r, fname))
nccl = compile_custom_ops(nccl_src_files,
extra_flags=f" -I\"{nccl_include_path}\" {mpi_compile_flags} ",
return_module=True, dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW,
gen_name_="jittor_nccl_core")
nccl_ops = nccl.ops
LOG.vv("Get nccl_ops: "+str(dir(nccl_ops)))
def manual_link(flags):
lib_dirs = []
libs = []
for f in flags.split():
if f.startswith("-l"):
libs.append(f[2:])
elif f.startswith("-L"):
lib_dirs.append(f[2:])
LOG.v("manual_link:", flags)
LOG.v("lib_dirs:", lib_dirs)
LOG.v("libs:", libs)
for lib in libs:
for d in lib_dirs:
libname = os.path.join(d, f"lib{lib}.so")
if os.path.isfile(libname):
LOG.v("link:", libname)
ctypes.CDLL(libname, dlopen_flags)
break
def inside_mpi():
return "OMPI_COMM_WORLD_SIZE" in os.environ
def setup_mpi():
global mpi_ops, mpi, use_mpi
global mpicc_path, has_mpi
use_mpi = os.environ.get("use_mpi", "1")=="1"
mpi_ops = None
mpi = None
has_mpi = False
if not use_mpi: return
mpicc_path = env_or_try_find('mpicc_path', 'mpicc')
if mpicc_path == "":
# LOG.i("mpicc not found, distribution disabled.")
use_mpi = False
else:
use_mpi = True
has_mpi = True
if not use_mpi:
return
global mpi_compile_flags, mpi_link_flags, mpi_flags
mpi_compile_flags = run_cmd(mpicc_path+" --showme:compile")
mpi_link_flags = run_cmd(mpicc_path+" --showme:link")
mpi_flags = mpi_compile_flags + " " + mpi_link_flags
LOG.v("mpi_flags: "+mpi_flags)
# find all source files
mpi_src_dir = os.path.join(jittor_path, "extern", "mpi")
mpi_src_files = []
for r, _, f in os.walk(mpi_src_dir):
for fname in f:
mpi_src_files.append(os.path.join(r, fname))
# add mpi compile flags for nccl
mpi_compile_flags += f" -I\"{os.path.join(mpi_src_dir, 'inc')}\" "
mpi_compile_flags = mpi_compile_flags.replace("-pthread", "")
mpi_version = get_version(mpicc_path)
if mpi_version.startswith("(1.") or mpi_version.startswith("(2."):
# mpi version 1.x needs to be linked like this
manual_link(mpi_flags)
# mpi(4.x) cannot use deepbind; it needs to
# share the 'environ' symbol.
mpi = compile_custom_ops(mpi_src_files,
extra_flags=f" {mpi_flags} ", return_module=True,
dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW, gen_name_="jittor_mpi_core")
mpi_ops = mpi.ops
LOG.vv("Get mpi: "+str(mpi.__dict__.keys()))
LOG.vv("Get mpi_ops: "+str(mpi_ops.__dict__.keys()))
def wrapper(func):
def inner(self, *args, **kw):
return func(self, *args, **kw)
inner.__doc__ = func.__doc__
return inner
for k in mpi_ops.__dict__:
if not k.startswith("mpi_"): continue
if k == "mpi_test": continue
setattr(core.Var, k, wrapper(mpi_ops.__dict__[k]))
in_mpi = inside_mpi()
FIX_TORCH_ERROR = 0
if os.name != 'nt' and not in_mpi:
FIX_TORCH_ERROR = 1
if "FIX_TORCH_ERROR" in os.environ:
FIX_TORCH_ERROR = os.environ["FIX_TORCH_ERROR"] != "0"
if FIX_TORCH_ERROR:
try:
import torch
from jittor_utils import dirty_fix_pytorch_runtime_error
dirty_fix_pytorch_runtime_error()
except:
pass
cudnn = cublas = curand = cufft = None
setup_mpi()
rank = mpi.world_rank() if in_mpi else 0
world_size = mpi.world_size() if in_mpi else 1
setup_nccl()
setup_cutt()
setup_cutlass()
# try:
setup_mkl()
# except Exception as e:
# LOG.w("MKL install failed, msg:", e)
setup_cuda_extern()
# install backend extern library
for mod in jit_utils.backends:
if mod.install_extern():
break

1431
python/jittor/compiler.py Normal file

File diff suppressed because it is too large

274
python/jittor/contrib.py Normal file
View File

@ -0,0 +1,274 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Guowei Yang <471184555@qq.com>
# Guoye Yang <498731903@qq.com>
# Dun Liang <randonlang@gmail.com>.
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import jittor as jt
import numpy as np
from jittor import pool
from collections.abc import Sequence
def argmax_pool(x, size, stride, padding=0):
if stride<=0:
raise RuntimeError(f"stride must be > 0, but got {stride}")
return pool.pool(x, size, 'maximum', padding, stride)
def concat(arr, dim):
'''Concat Operator can concat a list of jt Var at a specific dimension.
* [in] x: input var list for concat
* [in] dim: the dimension to concat along
* [out] out: concat result
Example::
>>> jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=1)
jt.Var([[1 2]
[2 2]], dtype=int32)
'''
# TODO: low performance when concatenating lots of vars
total_dim = 0
if dim < 0: dim += len(arr[0].shape)
for a in arr:
total_dim += a.shape[dim]
cdim = 0
s = None
indexes = [ f"i{i}" for i in range(len(a.shape)) ]
for a in arr:
shape = list(a.shape)
shape[dim] = total_dim
indexes[dim] = f"i{dim}-{cdim}"
b = a.reindex(shape, indexes)
# ugly fix for preventing large fused op
if len(arr)>=100:
b.stop_fuse()
if s is None:
s = b
else:
s += b
cdim += a.shape[dim]
return s
def check(bc):
bc = np.array(bc)
if ((bc != 1) * (bc != bc.max(0))).sum() > 0:
raise Exception(f"Shape not match.")
else:
return bc.max(0)
def slice_var_index(x, slices):
if not isinstance(slices, tuple):
slices = (slices,)
if isinstance(slices[0], jt.Var):
if len(slices) == 1 and slices[0].dtype == "bool":
return slice_var_index(x, tuple(slices[0].where()))
bc = []
ml = -1
for idx, s in enumerate(slices):
if isinstance(s, jt.Var):
shape = s.shape
elif isinstance(s, np.ndarray):
shape = list(s.shape)
elif isinstance(s, list):
shape = list(np.array(s).shape)
else:
continue
if len(shape) >= ml:
ml = len(shape)
bc.append(shape)
for idx, shape in enumerate(bc):
if len(shape) < ml:
shape = (ml - len(shape)) * [1] + shape
bc[idx] = shape
if len(bc) >= 1:
bc_shape = check(bc)
ss = []
for idx, s in enumerate(slices):
if isinstance(s, np.ndarray) or isinstance(s, list):
ss.append(jt.array(s).broadcast(bc_shape.tolist()))
elif isinstance(s, jt.Var):
ss.append(s.broadcast(bc_shape.tolist()))
else:
ss.append(s)
slices = ss
out_shape = []
out_index = []
shape = x.shape
cnt_list = 0
extras_idx = []
extras = []
has_ellipse = 0
ellipse_index = 0
for s,i in zip(slices,range(len(slices))):
if isinstance(s,type(...)):
has_ellipse+=1
ellipse_index = i
if has_ellipse>1:
raise Exception(f"There are more than one ...")
elif has_ellipse==1:
slices = list(slices)
del slices[ellipse_index]
while len(slices)<len(shape):
slices.insert(ellipse_index,slice(None))
for i in range(len(shape)):
if i>=len(slices):
s = slice(None)
else:
s = slices[i]
sp = shape[i]
j = len(out_shape)
if isinstance(s, int):
if s<0: s += sp
out_index.append(str(s))
elif isinstance(s, slice):
if s == slice(None):
out_shape.append(sp)
out_index.append(f"i{j}")
continue
start = 0 if s.start is None else s.start
stop = sp if s.stop is None else s.stop
step = 1 if s.step is None else s.step
if start<0: start += sp
if stop<0: stop += sp
if stop>sp+1: stop = sp
out_shape.append(1+int(max(0, (stop-start-1)//step)))
out_index.append(f"{start}+i{j}*{step}")
elif isinstance(s, jt.Var):
if cnt_list == 0:
for idx in range(len(bc_shape)):
extras_idx.append(f"i{len(out_shape) + idx}")
out_shape += bc_shape.tolist()
out_index.append(f"@e{cnt_list}("+ ",".join(extras_idx) + ")")
cnt_list += 1
extras.append(s)
else:
raise Exception(f"Not support slice {s}")
if len(out_shape)==0:
out_shape = [1]
# Stop fuse both input and output, prevent recompile
x.stop_fuse()
return (out_shape, out_index, 0, [], extras)
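# Example (a sketch): for x of shape [4, 5],
#   slice_var_index(x, (slice(1, 3), slice(None, None, 2)))
# yields out_shape [2, 3] and out_index like ["1+i0*1", "0+i1*2"],
# which are then fed to x.reindex by the callers below.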
def _slice_var_old(x, slices):
reindex_args = slice_var_index(x, slices)
x.stop_fuse()
return x.reindex(*reindex_args).stop_fuse()
def _setitem_old(x, slices, value):
reindex_args = slice_var_index(x, slices)
reindex_reduce_args = (x.shape, reindex_args[1]) + reindex_args[3:]
xslice = x.stop_fuse().reindex(*reindex_args).stop_fuse()
value = jt.broadcast(value, xslice)
value = value.cast(x.dtype)
one = jt.broadcast(1, xslice)
if not isinstance(reindex_args[0][0], jt.Var):
reindex_args = (x.shape,) + reindex_args[1:]
mask = one.reindex_reduce("add", *reindex_reduce_args)
data = value.reindex_reduce("add", *reindex_reduce_args)
# Stop fuse both input and output, prevent recompile
out = mask.ternary(data, x).stop_fuse()
x.assign(out)
return x
# PATCH
def getitem(x, slices):
if isinstance(slices, jt.Var) and slices.dtype == "bool":
return getitem(x, slices.where())
if isinstance(slices, tuple):
ss = []
for s in slices:
if isinstance(s, jt.Var) and s.dtype == "bool":
ss.extend(s.where())
else:
ss.append(s)
slices = tuple(ss)
return x.getitem(slices)
def setitem(x, slices, value):
if isinstance(slices, jt.Var) and slices.dtype == "bool":
if slices.shape == x.shape:
if isinstance(value, (int, float)):
value = jt.array(value).broadcast(x.shape)
return x.assign(slices.ternary(value, x))
elif isinstance(value, jt.Var) and value.shape == [1,]:
value = jt.broadcast(value, x.shape)
return x.assign(slices.ternary(value, x))
slices = slices.where()
elif isinstance(slices, tuple):
ss = []
for s in slices:
if isinstance(s, jt.Var) and s.dtype == "bool":
ss.extend(s.where())
else:
ss.append(s)
slices = tuple(ss)
return x.check_cascade_setitem(x.setitem(slices, value))
jt.Var.__getitem__ = jt.Var.slice_var = getitem
jt.Var.__setitem__ = setitem
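# A minimal usage sketch of the patched indexing above (boolean masks are
# routed through where(); a full-shape boolean setitem uses ternary):
#   x = jt.array([1, 2, 3, 4])
#   y = x[x > 2]       # -> [3, 4]
#   x[x > 2] = 0       # x becomes [1, 2, 0, 0]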
def _merge_dtypes(dtypes):
dtype = dtypes[0]
for i in range(1, len(dtypes)):
dtype = jt.binary_dtype_infer("add", dtype, dtypes[i])
return dtype
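# For example, _merge_dtypes(["int32", "float32"]) returns "float32":
# dtypes are promoted pairwise, as if the arrays were added together.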
@jt.flag_scope(amp_reg=4) # _custom_flag
def concat(arr, dim=0):
    '''Concat Operator can concat a list of jt Var along a specific dimension.
    * [in] arr: list of jt Vars to concat
    * [in] dim: dimension along which to concat
* return: concat result
Example::
jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=1)
# return jt.Var([[1,2],[2,2]],dtype=int32)
'''
if not isinstance(arr, Sequence):
raise TypeError("concat arr needs to be a tuple or list")
if len(arr) == 0:
raise ValueError("need at least one array to concat")
total_dim = 0
base_dim = len(arr[0].shape)
if dim < 0: dim += base_dim
if dim < 0 or dim >= base_dim:
raise IndexError(f"Dimension out of range (expected to be in range of [{-base_dim}, {base_dim-1}], but got {dim})")
dtypes = []
for a in arr:
if len(a.shape) != base_dim:
raise RuntimeError(f"get different number of dimensions of {base_dim} and {len(a.shape)}")
for i in range(base_dim):
if i != dim and a.shape[i] != arr[0].shape[i]:
raise RuntimeError(f"Sizes of vars must match except in dimension {dim}. Expected size {arr[0].shape[i]} but got size {a.shape[i]} for dimension number {i} in the list.")
total_dim += a.shape[dim]
dtypes.append(str(a.dtype))
cdim = 0
shape = list(a.shape)
shape[dim] = total_dim
s = jt.empty(shape, dtype = _merge_dtypes(dtypes))
slices = [slice(None)]*len(a.shape)
for a in arr:
if a.shape[dim] == 0:
continue
slices[dim] = slice(cdim, cdim+a.shape[dim])
# print(slices, type(a))
s = s.setitem(tuple(slices), a)
# s = jt.setitem(s, tuple(slices), a)
cdim += a.shape[dim]
return s
cat = concat

View File

@ -0,0 +1,6 @@
from .dataset import Dataset, ImageFolder, dataset_root, TensorDataset, VarDataset, DataLoader
from .mnist import MNIST
from .cifar import CIFAR10, CIFAR100
from .voc import VOC
from .sampler import *

View File

@ -0,0 +1,189 @@
import os
from jittor_utils.misc import download_and_extract_archive, check_integrity
from PIL import Image
import sys, pickle
import numpy as np
from jittor.dataset import Dataset, dataset_root
class CIFAR10(Dataset):
"""`CIFAR10 <https://www.cs.toronto.edu/~kriz/cifar.html>`_ Dataset.
Args:
root (string): Root directory of dataset where directory
``cifar-10-batches-py`` exists or will be saved to if download is set to True.
train (bool, optional): If True, creates dataset from training set, otherwise
creates from test set.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g., ``transforms.RandomCrop``
target_transform (callable, optional): A function/transform that takes in the
target and transforms it.
download (bool, optional): If true, downloads the dataset from the internet and
puts it in root directory. If dataset is already downloaded, it is not
downloaded again.
Example::
from jittor.dataset.cifar import CIFAR10
a = CIFAR10()
a.set_attrs(batch_size=16)
for imgs, labels in a:
print(imgs.shape, labels.shape)
break
"""
base_folder = 'cifar-10-batches-py'
url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
filename = "cifar-10-python.tar.gz"
tgz_md5 = 'c58f30108f718f92721af3b95e74349a'
train_list = [
['data_batch_1', 'c99cafc152244af753f735de768cd75f'],
['data_batch_2', 'd4bba439e000b95fd0a9bffe97cbabec'],
['data_batch_3', '54ebc095f3ab1f0389bbae665268c751'],
['data_batch_4', '634d18415352ddfa80567beed471001a'],
['data_batch_5', '482c414d41f54cd18b22e5b47cb7c3cb'],
]
test_list = [
['test_batch', '40351d587109b95175f43aff81a1287e'],
]
meta = {
'filename': 'batches.meta',
'key': 'label_names',
'md5': '5ff9c542aee3614f3951f8cda6e48888',
}
def __init__(self, root=dataset_root+"/cifar_data/", train=True, transform=None, target_transform=None,
download=True):
super(CIFAR10, self).__init__()
self.root = root
self.transform=transform
self.target_transform=target_transform
self.train = train # training set or test set
if download:
self.download()
if not self._check_integrity():
raise RuntimeError('Dataset not found or corrupted.' +
' You can use download=True to download it')
if self.train:
downloaded_list = self.train_list
else:
downloaded_list = self.test_list
self.data = []
self.targets = []
        # now load the pickled numpy arrays
for file_name, checksum in downloaded_list:
file_path = os.path.join(self.root, self.base_folder, file_name)
with open(file_path, 'rb') as f:
if sys.version_info[0] == 2:
entry = pickle.load(f)
else:
entry = pickle.load(f, encoding='latin1')
self.data.append(entry['data'])
if 'labels' in entry:
self.targets.extend(entry['labels'])
else:
self.targets.extend(entry['fine_labels'])
self.data = np.vstack(self.data).reshape(-1, 3, 32, 32)
self.data = self.data.transpose((0, 2, 3, 1)) # convert to HWC
self._load_meta()
def _load_meta(self):
path = os.path.join(self.root, self.base_folder, self.meta['filename'])
if not check_integrity(path, self.meta['md5']):
raise RuntimeError('Dataset metadata file not found or corrupted.' +
' You can use download=True to download it')
with open(path, 'rb') as infile:
if sys.version_info[0] == 2:
data = pickle.load(infile)
else:
data = pickle.load(infile, encoding='latin1')
self.classes = data[self.meta['key']]
self.class_to_idx = {_class: i for i, _class in enumerate(self.classes)}
def __getitem__(self, index):
"""
Args:
index (int): Index
Returns:
tuple: (image, target) where target is index of the target class.
"""
img, target = self.data[index], self.targets[index]
# doing this so that it is consistent with all other datasets
# to return a PIL Image
img = Image.fromarray(img)
if self.transform is not None:
img = self.transform(img)
if self.target_transform is not None:
target = self.target_transform(target)
return img, target
def __len__(self):
return len(self.data)
def _check_integrity(self):
root = self.root
for fentry in (self.train_list + self.test_list):
filename, md5 = fentry[0], fentry[1]
fpath = os.path.join(root, self.base_folder, filename)
if not check_integrity(fpath, md5):
return False
return True
def download(self):
if self._check_integrity():
print('Files already downloaded and verified')
return
download_and_extract_archive(self.url, self.root, filename=self.filename, md5=self.tgz_md5)
def extra_repr(self):
return "Split: {}".format("Train" if self.train is True else "Test")
class CIFAR100(CIFAR10):
"""`CIFAR100 <https://www.cs.toronto.edu/~kriz/cifar.html>`_ Dataset.
This is a subclass of the `CIFAR10` Dataset.
Example::
from jittor.dataset.cifar import CIFAR100
a = CIFAR100()
a.set_attrs(batch_size=16)
for imgs, labels in a:
print(imgs.shape, labels.shape)
break
"""
base_folder = 'cifar-100-python'
url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
filename = "cifar-100-python.tar.gz"
tgz_md5 = 'eb9058c3a382ffc7106e4002c42a8d85'
train_list = [
['train', '16019d7e3df5f24257cddd939b257f8d'],
]
test_list = [
['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'],
]
meta = {
'filename': 'meta',
'key': 'fine_label_names',
'md5': '7973b15100ade9c7d40fb424638fde48',
}

View File

@ -0,0 +1,728 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Meng-Hao Guo <guomenghao1997@gmail.com>
# Dun Liang <randonlang@gmail.com>.
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import numpy as np
from urllib import request
import gzip
import pickle
import os
from jittor.dataset.utils import get_random_list, get_order_list, collate_batch, HookTimer
from collections.abc import Sequence, Mapping
import pathlib
from PIL import Image
import multiprocessing as mp
import signal
from jittor_utils import LOG
import jittor as jt
import time
import jittor_utils as jit_utils
dataset_root = os.path.join(jit_utils.home(), ".cache", "jittor", "dataset")
mp_log_v = os.environ.get("mp_log_v", 0)
mpi = jt.mpi
img_open_hook = HookTimer(Image, "open")
CHECK_MEMORY = int(os.environ.get("CHECK_MEMORY", "0"))
if os.name == "nt":
from multiprocessing import shared_memory
class RingBuffer:
def __init__(self, size, shm=None):
for i in range(100):
if (1<<i) >= size: break
size = 1<<i
init = False
if shm is None:
init = True
shm = shared_memory.SharedMemory(create=True, size=size+1024)
rb = jt.core.RingBuffer(size, id(shm.buf), init)
self.size = size
self.shm = shm
self.rb = rb
def __reduce__(self):
return (RingBuffer, (self.size, self.shm))
def __del__(self):
del self.rb
del self.shm
def push(self, obj): self.send(obj)
def pop(self): return self.recv()
def send(self, obj): self.rb.push(obj)
def recv(self): return self.rb.pop()
def clear(self): return self.rb.clear()
def stop(self): return self.rb.stop()
def is_stop(self): return self.rb.is_stop()
def total_pop(self): return self.rb.total_pop()
def total_push(self): return self.rb.total_push()
def __repr__(self): return repr(self.rb)
def keep_numpy_array(self, keep): self.rb.keep_numpy_array(keep)
jt.RingBuffer = RingBuffer
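# A minimal usage sketch of the ring buffer (the wrapper above is used on
# Windows; on other platforms jt.RingBuffer is assumed to provide the same
# push/pop interface):
#   rb = jt.RingBuffer(1024 * 1024)
#   rb.push({"img": np.ones(3)})
#   batch = rb.pop()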
class Worker:
def __init__(self, target, args, buffer_size, keep_numpy_array=False):
self.buffer = jt.RingBuffer(buffer_size)
self.buffer.keep_numpy_array(keep_numpy_array)
self.status = mp.Array('f', 5, lock=False)
self.p = mp.Process(target=target, args=args+(self.buffer,self.status))
self.p.daemon = True
self.p.start()
class Dataset(object):
'''
Base class for reading data.
Args::
[in] batch_size(int): batch size, default 16.
[in] shuffle(bool): shuffle at each epoch, default False.
        [in] drop_last(bool): if true, drop the last incomplete batch; otherwise the last batch may be smaller than batch_size, default False.
[in] num_workers(int): number of workers for loading data.
[in] buffer_size(int): buffer size for each worker in bytes, default(512MB).
[in] keep_numpy_array(bool): return numpy array rather than jittor array, default(False).
[in] endless(bool): will this dataset yield data forever, default(False).
Example::
class YourDataset(Dataset):
def __init__(self):
super().__init__()
self.set_attrs(total_len=1024)
def __getitem__(self, k):
return k, k*k
dataset = YourDataset().set_attrs(batch_size=256, shuffle=True)
for x, y in dataset:
......
'''
def __init__(self,
batch_size = 16,
shuffle = False,
drop_last = False,
num_workers = 0,
buffer_size = 512*1024*1024,
stop_grad = True,
keep_numpy_array = False,
endless = False):
super().__init__()
if os.environ.get("DISABLE_MULTIPROCESSING", '0') == '1':
num_workers = 0
self.total_len = None
self.batch_size = batch_size
self.shuffle = shuffle
self.drop_last = drop_last
self.num_workers = num_workers
self.buffer_size = buffer_size
self.stop_grad = stop_grad
self.keep_numpy_array = keep_numpy_array
self.endless = endless
self.epoch_id = 0
self.sampler = None
self._disable_workers = False
self._shuffle_rng = np.random.default_rng(1)
self.dataset = self
def __getitem__(self, index):
raise NotImplementedError
def __batch_len__(self):
assert self.total_len >= 0
assert self.batch_size > 0
if self.drop_last:
return self.total_len // self.batch_size
return (self.total_len-1) // self.batch_size + 1
def __len__(self):
return self.__batch_len__()
def set_attrs(self, **kw):
'''
        You can set dataset attributes using the set_attrs function, including total_len, batch_size, shuffle, drop_last, num_workers and buffer_size.
Example::
dataset = YourDataset().set_attrs(batch_size=256, shuffle=True)
Attrs:
* batch_size(int): batch size, default 16.
            * total_len(int): total length.
            * shuffle(bool): shuffle at each epoch, default False.
            * drop_last(bool): if true, drop the last incomplete batch; otherwise the last batch may be smaller than batch_size, default False.
* num_workers: number of workers for loading data
* buffer_size: buffer size for each worker in bytes, default(512MB).
* stop_grad: stop grad for data, default(True).
'''
for k,v in kw.items():
assert hasattr(self, k), k
setattr(self, k, v)
self.reset()
return self
def to_jittor(self, batch):
'''
        Convert batch data (e.g. np.ndarray, int, float) to jittor arrays.
'''
if self.keep_numpy_array: return batch
if isinstance(batch, jt.Var): return batch
to_jt = lambda x: jt.array(x).stop_grad() \
if self.stop_grad else jt.array(x)
if isinstance(batch, np.ndarray):
return to_jt(batch)
if isinstance(batch, dict):
new_batch = {}
for k,v in batch.items():
new_batch[k] = self.to_jittor(v)
return new_batch
if not isinstance(batch, (list, tuple)):
return batch
new_batch = []
for a in batch:
if isinstance(a, np.ndarray):
new_batch.append(to_jt(a))
else:
new_batch.append(self.to_jittor(a))
return new_batch
def collate_batch(self, batch):
'''
Puts each data field into a tensor with outer dimension batch size.
Args::
[in] batch(list): A list of variables, such as jt.var, Image.Image, np.ndarray, int, float, str and so on.
'''
return collate_batch(batch)
def terminate(self):
'''
        Terminate the multi-process workers used for reading data.
'''
if hasattr(self, "workers"):
for w in self.workers:
w.p.terminate()
def _worker_main(self, worker_id, buffer, status):
import jittor_utils
jt.flags.use_cuda_host_allocator = 0
jittor_utils.cc.init_subprocess()
jt.jt_init_subprocess()
seed = jt.get_seed()
wseed = (seed ^ (worker_id*1167)) ^ 1234
jt.set_global_seed(wseed)
        # parallel_op_compiler is still problematic:
        # it does not work on ubuntu 16.04, but works on ubuntu 20.04.
        # It seems the static state of the parallel compiler
        # is not correctly initialized in subprocesses.
jt.flags.use_parallel_op_compiler = 0
import time
try:
gid_obj = self.gid.get_obj()
gid_lock = self.gid.get_lock()
start = time.time()
while True:
# get id
with gid_lock:
while buffer.is_stop() or self.idqueue.is_stop() or \
gid_obj.value >= self.batch_len:
self.num_idle.value += 1
self.num_idle_c.notify()
self.gidc.wait()
self.num_idle.value -= 1
cid = gid_obj.value
batch_index_list = self.index_list_numpy[
cid*self.real_batch_size:
min(self.real_len, (cid+1)*self.real_batch_size)
].copy()
gid_obj.value += 1
with self.idqueue_lock:
self.idqueue.push(worker_id)
now = time.time()
other_time = now - start
start = now
# load and transform data
batch = []
if mp_log_v:
print(f"#{worker_id} {os.getpid()} load batch", cid*self.real_batch_size, min(self.real_len, (cid+1)*self.real_batch_size))
for i in batch_index_list:
batch.append(self[i])
batch = self.collate_batch(batch)
now = time.time()
data_time = now - start
start = now
# send data to main process
if mp_log_v:
print(f"#{worker_id} {os.getpid()} send", type(batch).__name__, [ type(b).__name__ for b in batch ], buffer)
try:
buffer.send(batch)
except:
if buffer.is_stop():
continue
raise
now = time.time()
send_time = now - start
start = now
status[0], status[1], status[2], status[3], status[4] = \
other_time, data_time, send_time, \
other_time + data_time + send_time, \
img_open_hook.duration
img_open_hook.duration = 0.0
except:
import traceback
line = traceback.format_exc()
print(line)
os.kill(os.getppid(), signal.SIGINT)
exit(0)
def display_worker_status(self):
        ''' Display dataset worker status. When dataset.num_workers > 0, it displays information like below:
.. code-block:: console
progress:479/5005
batch(s): 0.302 wait(s):0.000
recv(s): 0.069 to_jittor(s):0.021
recv_raw_call: 6720.0
last 10 workers: [6, 7, 3, 0, 2, 4, 7, 5, 6, 1]
ID wait(s) load(s) send(s) total
#0 0.000 1.340 2.026 3.366 Buffer(free=0.000% l=462425368 r=462425368 size=536870912)
#1 0.000 1.451 3.607 5.058 Buffer(free=0.000% l=462425368 r=462425368 size=536870912)
#2 0.000 1.278 1.235 2.513 Buffer(free=0.000% l=462425368 r=462425368 size=536870912)
#3 0.000 1.426 1.927 3.353 Buffer(free=0.000% l=462425368 r=462425368 size=536870912)
#4 0.000 1.452 1.074 2.526 Buffer(free=0.000% l=462425368 r=462425368 size=536870912)
#5 0.000 1.422 3.204 4.625 Buffer(free=0.000% l=462425368 r=462425368 size=536870912)
#6 0.000 1.445 1.953 3.398 Buffer(free=0.000% l=462425368 r=462425368 size=536870912)
#7 0.000 1.582 0.507 2.090 Buffer(free=0.000% l=308283552 r=308283552 size=536870912)
Meaning of the outputs:
* progress: dataset loading progress (current/total)
* batch: batch time, exclude data loading time
* wait: time of main proc wait worker proc
* recv: time of recv batch data
* to_jittor: time of batch data to jittor variable
* recv_raw_call: total number of underlying recv_raw called
* last 10 workers: id of last 10 workers which main proc load from.
* table meaning
* ID: worker id
* wait: worker wait time
* open: worker image open time
* load: worker load time
* buffer: ring buffer status, such as how many free space, left index, right index, total size(bytes).
Example::
from jittor.dataset import Dataset
class YourDataset(Dataset):
pass
dataset = YourDataset().set_attrs(num_workers=8)
for x, y in dataset:
dataset.display_worker_status()
'''
if not hasattr(self, "workers"):
return
msg = [""]
msg.append(f"progress:{self.batch_id}/{self.batch_len}")
msg.append(f"batch(s): {self.batch_time:.3f}\twait(s):{self.wait_time:.3f}")
msg.append(f"recv(s): {self.recv_time:.3f}\tto_jittor(s):{self.to_jittor_time:.3f}")
msg.append(f"last 10 workers: {self.last_ids}")
msg.append(f"ID\twait(s)\topen(s)\tload(s)\tsend(s)\ttotal(s)")
for i in range(self.num_workers):
w = self.workers[i]
s = w.status
msg.append(f"#{i}\t{s[0]:.3f}\t{s[4]:.3f}\t{s[1]:.3f}\t{s[2]:.3f}\t{s[3]:.3f}\t{w.buffer}")
LOG.i('\n'.join(msg))
def _stop_all_workers(self):
# stop workers
for w in self.workers:
w.buffer.stop()
self.idqueue.stop()
# wait until all workers idle
if self.num_idle.value < self.num_workers:
with self.gid.get_lock():
self.gid.get_obj().value = self.batch_len
if mp_log_v:
print("idle num", self.num_idle.value)
while self.num_idle.value < self.num_workers:
self.num_idle_c.wait()
if mp_log_v:
print("idle num", self.num_idle.value)
# clean workers' buffer
for w in self.workers:
w.buffer.clear()
self.idqueue.clear()
self.gid.value = 0
def _init_workers(self, index_list):
jt.migrate_all_to_cpu()
jt.clean()
jt.gc()
self.index_list = mp.Array('i', self.real_len, lock=False)
workers = []
# get worker id
self.idqueue = jt.RingBuffer(2048)
self.idqueue_lock = mp.Lock()
# global token index
self.gid = mp.Value('i', self.batch_len)
self.gid.value = 0
# global token index condition
self.gidc = mp.Condition(self.gid.get_lock())
# number of idle workers
self.num_idle = mp.Value('i', 0, lock=False)
# number of idle workers condition
self.num_idle_c = mp.Condition(self.gid.get_lock())
self.index_list_numpy = np.ndarray(dtype='int32', shape=self.real_len, buffer=self.index_list)
self.index_list_numpy[:] = index_list
for i in range(self.num_workers):
w = Worker(target=self._worker_main, args=(i,),
buffer_size=self.buffer_size,
keep_numpy_array=self.keep_numpy_array)
workers.append(w)
self.workers = workers
def reset(self):
if not hasattr(self, "workers"):
return
self._stop_all_workers()
self.terminate()
del self.index_list
del self.idqueue
del self.idqueue_lock
del self.gid
del self.gidc
del self.num_idle
del self.num_idle_c
del self.workers
del self.index_list_numpy
def __del__(self):
if mp_log_v:
print("dataset deleted")
try:
self.terminate()
except:
pass
def __deepcopy__(self, memo=None, _nil=[]):
from copy import deepcopy
if memo is None:
memo = {}
d = id(self)
y = memo.get(d, _nil)
if y is not _nil:
return y
obj = self.__class__.__new__(self.__class__)
memo[d] = id(obj)
        exclude_key = {"index_list", "idqueue", "idqueue_lock", "gid", "gidc", "num_idle", "num_idle_c", "workers", "index_list_numpy", "dataset"}
for k,v in self.__dict__.items():
if k in exclude_key: continue
obj.__setattr__(k, deepcopy(v))
obj.dataset = obj
return obj
def __real_len__(self):
if self.total_len is None:
self.total_len = len(self)
return self.total_len
def _get_index_list(self):
if self.total_len is None:
self.total_len = len(self)
# maybe rewrite by sampler
total_len = self.total_len
if self.sampler:
index_list = list(self.sampler.__iter__())
total_len = len(index_list)
# check is not batch sampler
if len(index_list):
assert not isinstance(index_list[0], (list,tuple)), "Batch sampler not support yet."
elif self.shuffle == False:
index_list = get_order_list(self.total_len)
else:
            # use _shuffle_rng to generate a shuffle list that is
            # consistent across multiple processes
            # index_list = get_random_list(self.total_len)
index_list = self._shuffle_rng.permutation(range(self.total_len))
# scatter index_list for all mpi process
# scatter rule:
# batch 1 batch 2
# [........] [........] ...
# 00011122 00011122
# if last batch is smaller than world_size
# pad to world_size
# last batch
# [.] -> [012]
if jt.in_mpi:
world_size = mpi.world_size()
world_rank = mpi.world_rank()
index_list = np.int32(index_list)
# TODO: mpi broadcast in subprocess has bug, fix it
# mpi.broadcast(index_list, 0)
assert self.batch_size >= world_size, \
f"Batch size({self.batch_size}) is smaller than MPI world_size({world_size})"
real_batch_size = (self.batch_size-1) // world_size + 1
if real_batch_size * world_size != self.batch_size:
LOG.w("Batch size is not divisible by MPI world size, "
"The distributed version may be different from "
"the single-process version.")
fix_batch = total_len // self.batch_size
last_batch = total_len - fix_batch * self.batch_size
fix_batch_l = index_list[0:fix_batch*self.batch_size] \
.reshape(-1,self.batch_size)
fix_batch_l = fix_batch_l[
:,real_batch_size*world_rank:real_batch_size*(world_rank+1)]
real_batch_size = fix_batch_l.shape[1]
fix_batch_l = fix_batch_l.flatten()
if not self.drop_last and last_batch > 0:
last_batch_l = index_list[-last_batch:]
real_last_batch = (last_batch-1)//world_size+1
l = real_last_batch * world_rank
r = l + real_last_batch
if r > last_batch:
r = last_batch
l = r-real_last_batch
index_list = np.concatenate([fix_batch_l, last_batch_l[l:r]])
else:
index_list = fix_batch_l
self.real_len = len(index_list)
self.real_batch_size = real_batch_size
# assert total_len // self.batch_size == \
# self.real_len // self.real_batch_size, f"Number of batches({total_len // self.batch_size}!={self.real_len // self.real_batch_size}) not match, total_len: {total_len}, batch_size: {self.batch_size}, real_len: {self.real_len}, real_batch_size: {self.real_batch_size}"
# print(f"Number of batches({total_len // self.batch_size}!={self.real_len // self.real_batch_size}) not match, total_len: {total_len}, batch_size: {self.batch_size}, real_len: {self.real_len}, real_batch_size: {self.real_batch_size}")
# print("mpi dataset init ")
else:
self.real_len = len(index_list)
self.real_batch_size = self.batch_size
if self.drop_last:
self.batch_len = self.real_len // self.real_batch_size
else:
self.batch_len = (self.real_len-1) // self.real_batch_size + 1
return index_list
def _epochs(self):
if self.endless:
while True:
yield
self.epoch_id += 1
else:
yield
def __iter__(self):
if self._disable_workers:
self.num_workers = 0
index_list = self._get_index_list()
if not hasattr(self, "workers") and self.num_workers:
self._init_workers(index_list)
self.last_ids = [-1] * 10
if self.num_workers:
start = time.time()
self.batch_time = 0
gid_obj = self.gid.get_obj()
gid_lock = self.gid.get_lock()
for _ in self._epochs():
with gid_lock:
if self.num_idle.value:
self.gidc.notify_all()
for i in range(self.batch_len):
if self.num_idle.value:
with gid_lock:
if self.num_idle.value and \
gid_obj.value >= self.batch_len:
index_list = self._get_index_list()
self.index_list_numpy[:] = index_list
gid_obj.value = 0
self.gidc.notify_all()
# get which worker has this batch
worker_id = self.idqueue.pop()
now = time.time()
self.wait_time = now - start
start = now
self.last_ids[i%10] = worker_id
self.batch_id = i
w = self.workers[worker_id]
if mp_log_v:
print(f"#{worker_id} {os.getpid()} recv buffer", w.buffer)
batch = w.buffer.recv()
now = time.time()
self.recv_time = now - start
start = now
if mp_log_v:
print(f"#{worker_id} {os.getpid()} recv", type(batch).__name__, [ type(b).__name__ for b in batch ])
batch = self.to_jittor(batch)
now = time.time()
self.to_jittor_time = now - start
start = now
yield batch
now = time.time()
self.batch_time = now - start
start = now
if CHECK_MEMORY and self.batch_id % CHECK_MEMORY == 0:
jt.display_memory_info()
else:
for _ in self._epochs():
self.batch_id = 0
batch_data = []
for idx in index_list:
batch_data.append(self[int(idx)])
if len(batch_data) == self.real_batch_size:
batch_data = self.collate_batch(batch_data)
                        batch_data = self.to_jittor(batch_data)
yield batch_data
self.batch_id += 1
if CHECK_MEMORY and self.batch_id % CHECK_MEMORY == 0:
jt.display_memory_info()
batch_data = []
# depend on drop_last
if not self.drop_last and len(batch_data) > 0:
batch_data = self.collate_batch(batch_data)
batch_data = self.to_jittor(batch_data)
self.batch_id += 1
yield batch_data
def DataLoader(dataset: Dataset, *args, **kargs):
""" Simple dataloader.
Example::
train_dir = './data/celebA_train'
train_dataset = ImageFolder(train_dir)
dataloader = jt.dataset.DataLoader(train_dataset, batch_size=8)
"""
return dataset.set_attrs(*args, **kargs)
class ImageFolder(Dataset):
"""
    An image classification dataset; loads images and labels from a directory::
* root/label1/img1.png
* root/label1/img2.png
* ...
* root/label2/img1.png
* root/label2/img2.png
* ...
Args::
[in] root(string): Root directory path.
Attributes::
* classes(list): List of the class names.
* class_to_idx(dict): map from class_name to class_index.
* imgs(list): List of (image_path, class_index) tuples
Example::
train_dir = './data/celebA_train'
train_loader = ImageFolder(train_dir).set_attrs(batch_size=batch_size, shuffle=True)
for batch_idx, (x_, target) in enumerate(train_loader):
...
"""
def __init__(self, root, transform=None):
super().__init__()
self.root = root
self.transform = transform
self.classes = sorted([d.name for d in os.scandir(root) if d.is_dir()])
self.class_to_idx = {v:k for k,v in enumerate(self.classes)}
self.imgs = []
image_exts = set(('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff'))
for i, class_name in enumerate(self.classes):
class_dir = os.path.join(root, class_name)
for dname, _, fnames in sorted(os.walk(class_dir, followlinks=True)):
for fname in sorted(fnames):
if os.path.splitext(fname)[-1].lower() in image_exts:
path = os.path.join(class_dir, fname)
self.imgs.append((path, i))
LOG.i(f"Found {len(self.classes)} classes and {len(self.imgs)} images.")
self.set_attrs(total_len=len(self.imgs))
def __getitem__(self, k):
with open(self.imgs[k][0], 'rb') as f:
img = Image.open(f).convert('RGB')
if self.transform:
img = self.transform(img)
return img, self.imgs[k][1]
class VarDataset(Dataset):
""" Dataset using Var directly, TensorDataset is alias of VarDataset, Example::
import jittor as jt
from jittor.dataset import VarDataset
x = jt.array([1,2,3])
y = jt.array([4,5,6])
z = jt.array([7,8,9])
dataset = VarDataset(x, y, z)
dataset.set_attrs(batch_size=1)
for a,b,c in dataset:
print(a,b,c)
# will print
# 1,4,7
# 2,5,8
# 3,6,9
"""
def __init__(self, *args):
super().__init__()
self.args = args
self._disable_workers = True
        assert len(args), "At least one argument is required"
        l = len(args[0])
        for a in args:
            assert l == len(a), "All arguments should have the same length"
self.set_attrs(total_len=l)
def __getitem__(self, idx):
return [ a[idx] for a in self.args ]
def collate_batch(self, batch):
b = collate_batch(batch)
for i in range(len(self.args)):
x = b[i]
if jt.is_var(self.args[i]) and self.args[i].ndim == 1:
x.assign(x.squeeze(-1))
return b
TensorDataset = VarDataset

View File

@ -0,0 +1,200 @@
# ***************************************************************
# Copyright(c) 2019
# Meng-Hao Guo <guomenghao1997@gmail.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os
import string
import numpy as np
import gzip
from PIL import Image
# our lib jittor import
from jittor.dataset.dataset import Dataset, dataset_root
from jittor_utils.misc import ensure_dir, download_url_to_local
import jittor as jt
import jittor.transform as trans
class MNIST(Dataset):
'''
Jittor's own class for loading MNIST dataset.
Args::
[in] data_root(str): your data root.
        [in] train(bool): choose the train or validation split.
[in] download(bool): Download data automatically if download is True.
[in] batch_size(int): Data batch size.
[in] shuffle(bool): Shuffle data if true.
[in] transform(jittor.transform): transform data.
Example::
from jittor.dataset.mnist import MNIST
train_loader = MNIST(train=True).set_attrs(batch_size=16, shuffle=True)
for i, (imgs, target) in enumerate(train_loader):
...
'''
def __init__(self, data_root=dataset_root+"/mnist_data/",
train=True,
download=True,
batch_size = 16,
shuffle = False,
transform=None):
        # to test nets like ResNet, images are loaded with 3 channels (RGB), since those nets expect 3 input channels
super().__init__()
self.data_root = data_root
self.is_train = train
self.transform = transform
self.batch_size = batch_size
self.shuffle = shuffle
if download == True:
self.download_url()
filesname = [
"train-images-idx3-ubyte.gz",
"t10k-images-idx3-ubyte.gz",
"train-labels-idx1-ubyte.gz",
"t10k-labels-idx1-ubyte.gz"
]
self.mnist = {}
if self.is_train:
with gzip.open(data_root + filesname[0], 'rb') as f:
self.mnist["images"] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28, 28)
with gzip.open(data_root + filesname[2], 'rb') as f:
self.mnist["labels"] = np.frombuffer(f.read(), np.uint8, offset=8)
else:
with gzip.open(data_root + filesname[1], 'rb') as f:
self.mnist["images"] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28, 28)
with gzip.open(data_root + filesname[3], 'rb') as f:
self.mnist["labels"] = np.frombuffer(f.read(), np.uint8, offset=8)
assert(self.mnist["images"].shape[0] == self.mnist["labels"].shape[0])
self.total_len = self.mnist["images"].shape[0]
# this function must be called
self.set_attrs(total_len = self.total_len)
def __getitem__(self, index):
img = Image.fromarray(self.mnist['images'][index]).convert('RGB')
if self.transform:
img = self.transform(img)
return trans.to_tensor(img), self.mnist['labels'][index]
def download_url(self):
'''
        Download the MNIST dataset. This function is called when download is True.
'''
resources = [
("https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
("https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz", "d53e105ee54ea40749a09fcbcd1e9432"),
("https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz", "9fb629c4189551a2d022fa330f9573f3"),
("https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz", "ec29112dd5afa0611ce80d1b7f02629c")
]
for url, md5 in resources:
filename = url.rpartition('/')[2]
download_url_to_local(url, filename, self.data_root, md5)
class EMNIST(Dataset):
'''
Jittor's own class for loading EMNIST dataset.
Args::
[in] data_root(str): your data root.
[in] split(str): one of 'byclass', 'bymerge', 'balanced', 'letters', 'digits', 'mnist'.
        [in] train(bool): choose the train or validation split.
[in] download(bool): Download data automatically if download is True.
[in] batch_size(int): Data batch size.
[in] shuffle(bool): Shuffle data if true.
[in] transform(jittor.transform): transform data.
Example::
from jittor.dataset.mnist import EMNIST
train_loader = EMNIST(train=True).set_attrs(batch_size=16, shuffle=True)
for i, (imgs, target) in enumerate(train_loader):
...
'''
_merged_classes = {'c', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 's', 'u', 'v', 'w', 'x', 'y', 'z'}
_all_classes = set(string.digits + string.ascii_letters)
classes_split_dict = {
'byclass': sorted(list(_all_classes)),
'bymerge': sorted(list(_all_classes - _merged_classes)),
'balanced': sorted(list(_all_classes - _merged_classes)),
'letters': ['N/A'] + list(string.ascii_lowercase),
'digits': list(string.digits),
'mnist': list(string.digits),
}
def __init__(self, data_root=dataset_root+"/emnist_data/",
split='byclass',
train=True,
download=True,
batch_size = 16,
shuffle = False,
transform=None):
        # to test nets like ResNet, images are loaded with 3 channels (RGB), since those nets expect 3 input channels
super().__init__()
self.data_root = data_root
self.is_train = train
self.transform = transform
self.batch_size = batch_size
self.shuffle = shuffle
if download == True:
self.download_url()
data_root = os.path.join(data_root, "gzip")
filesname = [
f"emnist-{split}-train-images-idx3-ubyte.gz",
f"emnist-{split}-t10k-images-idx3-ubyte.gz",
f"emnist-{split}-train-labels-idx1-ubyte.gz",
f"emnist-{split}-t10k-labels-idx1-ubyte.gz"
]
for i in range(4):
filesname[i] = os.path.join(data_root, filesname[i])
self.mnist = {}
if self.is_train:
with gzip.open(filesname[0], 'rb') as f:
self.mnist["images"] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28, 28).transpose(0,2,1)
with gzip.open(filesname[2], 'rb') as f:
self.mnist["labels"] = np.frombuffer(f.read(), np.uint8, offset=8)
else:
with gzip.open(filesname[1], 'rb') as f:
self.mnist["images"] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28, 28).transpose(0,2,1)
with gzip.open(filesname[3], 'rb') as f:
self.mnist["labels"] = np.frombuffer(f.read(), np.uint8, offset=8)
assert(self.mnist["images"].shape[0] == self.mnist["labels"].shape[0])
self.total_len = self.mnist["images"].shape[0]
# this function must be called
self.set_attrs(total_len = self.total_len)
def __getitem__(self, index):
img = Image.fromarray(self.mnist['images'][index]).convert('RGB')
if self.transform:
img = self.transform(img)
return trans.to_tensor(img), self.mnist['labels'][index]
def download_url(self):
'''
        Download the EMNIST dataset. This function is called when download is True.
'''
resources = [
("https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip", "58c8d27c78d21e728a6bc7b3cc06412e"),
]
for url, md5 in resources:
filename = "emnist.zip"
download_url_to_local(url, filename, self.data_root, md5)
import zipfile
zf = zipfile.ZipFile(os.path.join(self.data_root, filename))
try:
zf.extractall(path=self.data_root)
except RuntimeError as e:
print(e)
raise
zf.close()

View File

@ -0,0 +1,126 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Hao-Yang Peng
# Dun Liang <randonlang@gmail.com>.
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import jittor as jt
from .dataset import Dataset
import numpy as np
from PIL import Image
class Sampler():
def __init__(self, dataset):
self.dataset = dataset
# MUST set sampler here
dataset.sampler = self
def __iter__(self):
raise NotImplementedError
def __len__(self):
raise NotImplementedError
class SequentialSampler(Sampler):
def __init__(self, dataset):
# MUST set sampler here
dataset.sampler = self
self.dataset = dataset
def __iter__(self):
return iter(range(self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()))
def __len__(self):
return self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()
class RandomSampler(Sampler):
def __init__(self, dataset, replacement=False, num_samples=None):
# MUST set sampler here
dataset.sampler = self
self.dataset = dataset
self.rep = replacement
self._num_samples = num_samples
self._shuffle_rng = np.random.default_rng(1)
@property
def num_samples(self):
if self._num_samples is None:
return self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()
return self._num_samples
def __len__(self):
return self.num_samples
def __iter__(self):
n = self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()
if self.rep:
return iter(self._shuffle_rng.integers(low=0, high=n, size=(self.num_samples,), dtype=np.int64).tolist())
return iter(self._shuffle_rng.permutation(n).tolist())
class SkipFirstBatchesSampler(Sampler):
def __init__(self, sampler, num_skip_batches):
# MUST set sampler here
sampler.dataset.sampler = self
self.sampler = sampler
self.num_skip_batches = num_skip_batches
def __len__(self):
return len(self.sampler) - self.num_skip_batches
def __iter__(self):
return iter(list(iter(self.sampler))[self.num_skip_batches:])
class SubsetRandomSampler(Sampler):
def __init__(self, dataset, indice):
'''
testdataset = TestSamplerDataset()
subsetsampler = SubsetRandomSampler(testdataset, (20, 30))
for i, data in enumerate(testdataset):
            # data indices between 20 and 29
......
'''
# MUST set sampler here
dataset.sampler = self
self.dataset = dataset
self.indices = indice
dlen = dataset.__real_len__() if hasattr(dataset,"__real_len__") else dataset.__len__()
assert indice[0] >= 0 and indice[1] < dlen and indice[0] < indice[1]
def __iter__(self):
return (int(i) + self.indices[0] for i in np.random.permutation(self.indices[1] - self.indices[0]))
def __len__(self):
return self.indices[1] - self.indices[0]
class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last):
self.sampler = sampler
self.batch_size = batch_size
self.drop_last = drop_last
def __iter__(self):
batch = []
for idx in self.sampler:
batch.append(idx)
if len(batch) == self.batch_size:
yield batch
batch = []
if len(batch) > 0 and not self.drop_last:
yield batch
def __len__(self):
if self.drop_last:
return len(self.sampler) // self.batch_size
else:
return (len(self.sampler) + self.batch_size - 1) // self.batch_size
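# A minimal usage sketch: wrap a sampler and iterate over index batches
# (dataset is assumed to be an existing Dataset instance):
#   sampler = SequentialSampler(dataset)
#   for batch_indices in BatchSampler(sampler, batch_size=4, drop_last=False):
#       ...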

View File

@ -0,0 +1,68 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Meng-Hao Guo <guomenghao1997@gmail.com>
# Dun Liang <randonlang@gmail.com>.
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import jittor as jt
import numpy as np
from collections.abc import Sequence, Mapping
from PIL import Image
import time
def get_random_list(n):
return list(np.random.permutation(range(n)))
def get_order_list(n):
return [i for i in range(n)]
def collate_batch(batch):
r"""Puts each data field into a tensor with outer dimension batch size"""
real_size = len(batch)
elem = batch[0]
elem_type = type(elem)
if isinstance(elem, jt.Var):
temp_data = jt.stack([data for data in batch], 0)
return temp_data
if elem_type is np.ndarray:
temp_data = np.stack([data for data in batch], 0)
return temp_data
elif np.issubdtype(elem_type, np.integer):
return np.int32(batch)
elif isinstance(elem, int):
return np.int32(batch)
elif isinstance(elem, float):
return np.float32(batch)
elif isinstance(elem, str):
return batch
elif isinstance(elem, Mapping):
return {key: collate_batch([d[key] for d in batch]) for key in elem}
elif isinstance(elem, tuple):
transposed = zip(*batch)
return tuple(collate_batch(samples) for samples in transposed)
elif isinstance(elem, Sequence):
transposed = zip(*batch)
return [collate_batch(samples) for samples in transposed]
elif isinstance(elem, Image.Image):
temp_data = np.stack([np.array(data) for data in batch], 0)
return temp_data
else:
raise TypeError(f"Not support type <{elem_type.__name__}>")
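# For example (illustrative): collating [(img0, 0), (img1, 1)], where each
# img is an np.ndarray of shape (3, 32, 32), yields a tuple of an
# np.ndarray of shape (2, 3, 32, 32) and an np.int32 array of shape (2,).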
class HookTimer:
def __init__(self, obj, attr):
self.origin = getattr(obj, attr)
self.duration = 0.0
setattr(obj, attr, self)
def __call__(self, *args, **kw):
start = time.time()
rt = self.origin(*args, **kw)
self.duration += time.time() - start
return rt
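# A minimal usage sketch: HookTimer replaces a callable attribute and
# accumulates the time spent inside it (dataset.py uses this to time
# Image.open):
#   timer = HookTimer(Image, "open")
#   Image.open("img.png")   # hypothetical file
#   print(timer.duration)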

View File

@ -0,0 +1,70 @@
# ***************************************************************
# Copyright(c) 2019
# Meng-Hao Guo <guomenghao1997@gmail.com>
# Dun Liang <randonlang@gmail.com>.
# All Rights Reserved.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import numpy as np
import os
from PIL import Image
from .dataset import Dataset, dataset_root
class VOC(Dataset):
'''
Jittor's own class for loading VOC dataset.
Args::
[in] data_root(str): your data root.
[in] split(str): which split you want to use, train or val.
Attribute::
NUM_CLASSES: Number of total categories, default is 21.
Example::
from jittor.dataset.voc import VOC
train_loader = VOC(data_root='...').set_attrs(batch_size=16, shuffle=True)
for i, (imgs, target) in enumerate(train_loader):
...
'''
NUM_CLASSES = 21
def __init__(self, data_root=dataset_root+'/voc/', split='train'):
super().__init__()
        # total_len, batch_size and shuffle must be set
self.data_root = data_root
self.split = split
self.image_root = os.path.join(data_root, 'JPEGImages')
self.label_root = os.path.join(data_root, 'SegmentationClass')
self.data_list_path = os.path.join(self.data_root, 'ImageSets', 'Segmentation', self.split + '.txt')
self.image_path = []
self.label_path = []
with open(self.data_list_path, "r") as f:
lines = f.read().splitlines()
for idx, line in enumerate(lines):
_img_path = os.path.join(self.image_root, line + '.jpg')
_label_path = os.path.join(self.label_root, line + '.png')
assert os.path.isfile(_img_path)
assert os.path.isfile(_label_path)
self.image_path.append(_img_path)
self.label_path.append(_label_path)
self.set_attrs(total_len = len(self.image_path))
def __getitem__(self, index):
_img = Image.open(self.image_path[index])
_label = Image.open(self.label_path[index])
_img = _img.resize((513, 513))
_label = _label.resize((513, 513))
_img = np.array(_img)
_label = np.array(_label)
_img = _img.transpose(2,0,1)
return _img, _label

View File

@ -0,0 +1,107 @@
import jittor as jt
from jittor import nn
import numpy as np
# import pylab as pl
# Length of the latent vector
latent_dim = 100
# Number of classes
n_classes = 10
# Image size
img_size = 32
# Number of image channels
channels = 1
# Shape of the image tensor
img_shape = (channels, img_size, img_size)
class Generator(nn.Module):
def __init__(self):
super(Generator, self).__init__()
self.label_emb = nn.Embedding(n_classes, n_classes)
def block(in_feat, out_feat, normalize=True):
layers = [nn.Linear(in_feat, out_feat)]
if normalize:
layers.append(nn.BatchNorm1d(out_feat, 0.8))
layers.append(nn.LeakyReLU(0.2))
return layers
self.model = nn.Sequential(
*block((latent_dim + n_classes), 128, normalize=False),
*block(128, 256),
*block(256, 512),
*block(512, 1024),
nn.Linear(1024, int(np.prod(img_shape))),
nn.Tanh())
def execute(self, noise, labels):
gen_input = jt.concat((self.label_emb(labels), noise), dim=1)
img = self.model(gen_input)
img = img.view((img.shape[0], *img_shape))
return img
class Discriminator(nn.Module):
def __init__(self):
super(Discriminator, self).__init__()
self.label_embedding = nn.Embedding(n_classes, n_classes)
self.model = nn.Sequential(
nn.Linear((n_classes + int(np.prod(img_shape))), 512),
nn.LeakyReLU(0.2),
nn.Linear(512, 512),
nn.Dropout(0.4),
nn.LeakyReLU(0.2),
nn.Linear(512, 512),
nn.Dropout(0.4),
nn.LeakyReLU(0.2),
nn.Linear(512, 1))
def execute(self, img, labels):
d_in = jt.concat((img.view((img.shape[0], (- 1))), self.label_embedding(labels)), dim=1)
validity = self.model(d_in)
return validity
# Define the models
generator = Generator()
discriminator = Discriminator()
generator.eval()
discriminator.eval()
# Load pretrained parameters
generator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/generator_last.pkl')
discriminator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/discriminator_last.pkl')
def gen_img(number):
print(number, type(number))
n_row = len(number)
z = jt.array(np.random.normal(0, 1, (n_row, latent_dim))).float32().stop_grad()
labels = jt.array(np.array([int(number[num]) for num in range(n_row)])).float32().stop_grad()
gen_imgs = generator(z,labels)
gen_imgs = gen_imgs.transpose((1,2,0,3)).reshape(gen_imgs.shape[2], -1)
gen_imgs = gen_imgs[:,:,None].broadcast(gen_imgs.shape+(3,)) # .uint8()
gen_imgs = (gen_imgs - gen_imgs.min()) / (gen_imgs.max() - gen_imgs.min()) * 255
gen_imgs = gen_imgs.uint8()
# print(gen_imgs.shape, gen_imgs.max(), gen_imgs.min())
    return gen_imgs.numpy()
from PIL import Image
import pywebio as pw
# Define the digit string to generate
number = "201962517"
# gen_img(number)
Image.fromarray(gen_img(number))
# pl.imshow()
# pl.show()
# print("done")
def web_server():
    pw.pin.put_input("number", label="Enter the digits to generate (powered by Jittor)")
pw.output.put_buttons(['Gen image'],
lambda _: pw.output.put_image(Image.fromarray(gen_img(pw.pin.pin.number))))
pw.start_server(web_server, port=8123)

View File

@ -0,0 +1,325 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Guoye Yang <498731903@qq.com>
# Dun Liang <randonlang@gmail.com>.
#
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import jittor as jt
from jittor import init
from jittor import nn
from jittor import Function
class DepthwiseConv(Function):
def __init__(self, stride=1, padding=0, dilation=1):
self.stride = stride if isinstance(stride, tuple) else (stride, stride)
self.padding = padding if isinstance(padding, tuple) else (padding, padding)
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation)
def execute(self, x, weight):
if not jt.flags.use_cuda or not jt.compiler.is_cuda:
return nn.conv2d(x, weight, None, self.stride, self.padding, self.dilation, x.shape[1])
self.save_vars = x, weight
N,C,H,W = x.shape
o,i,Kh,Kw = weight.shape
assert(o == C)
oh = (H+self.padding[0]*2-Kh*self.dilation[0]+self.dilation[0]-1)//self.stride[0]+1
ow = (W+self.padding[1]*2-Kw*self.dilation[1]+self.dilation[1]-1)//self.stride[1]+1
filter_height, filter_width = Kh, Kw
self.Khw = Kh, Kw
assert oh>0 and ow>0
output = jt.code(
[N, C, oh, ow],
x.dtype,
[x, weight],
cuda_header = """
template <typename T,
int filter_height,
int filter_width,
int stride_height,
int stride_width>
__global__ void KernelDepthwiseConv(
const T *const input_data, const T *const filter_data, const int batch_size,
const int output_channels, const int output_height,
const int output_width, const int input_channels,
const int input_height, const int input_width,
const int padding_height, const int padding_width,
const int dilate_height, const int dilate_width, T *const output_data) {
  const int kWeightSize = filter_height * filter_width;
  T r_weight[kWeightSize];
const int batch = blockIdx.y;
const int c_out = blockIdx.x;
const T* weight = filter_data + c_out * filter_height * filter_width;
for (int i = 0; i < filter_height * filter_width; i++) r_weight[i] = weight[i];
for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) {
for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) {
const int batch = blockIdx.y;
const int c_out = blockIdx.x;
const int c_in = c_out;
T value = 0;
const int h_in_start = -padding_height + h_out * stride_height;
const int w_in_start = -padding_width + w_out * stride_width;
const int h_in_end = h_in_start + filter_height * dilate_height;
const int w_in_end = w_in_start + filter_width * dilate_width;
const int in_offset =
((batch * input_channels + c_in) * input_height) * input_width;
const int h_end = h_in_end < input_height ? h_in_end : input_height;
const int w_end = w_in_end < input_width ? w_in_end : input_width;
const int h_start = h_in_start > 0 ? h_in_start : 0;
const int w_start = w_in_start > 0 ? w_in_start : 0;
for (int h_in = h_in_start, h_f = 0; h_f < filter_height;
h_in += dilate_height, h_f++) {
for (int w_in = w_in_start, w_f = 0; w_f < filter_width;
w_in += dilate_width, w_f++) {
if (h_in >= 0 && h_in < input_height && w_in >= 0 &&
w_in < input_width) {
const int offset = in_offset + h_in * input_width + w_in;
value += r_weight[h_f * filter_width + w_f] * input_data[offset];
}
}
}
int index =
((batch * gridDim.x + c_out) * output_height + h_out) * output_width +
w_out;
output_data[index] = value;
}
}
}
""",
cuda_src=f"""
@alias(input, in0)
@alias(filter, in1)
@alias(output, out)
const int batch_size = input_shape0;
const int input_channels = input_shape1;
const int input_height = input_shape2;
const int input_width = input_shape3;
const int output_channels = output_shape1;
const int output_height = output_shape2;
const int output_width = output_shape3;
const int ksize_height = {Kh};
const int ksize_width = {Kw};
const int stride_height = {self.stride[0]};
const int stride_width = {self.stride[1]};
const int padding_height = {self.padding[0]};
const int padding_width = {self.padding[1]};
const int dilate_height = {self.dilation[0]};
const int dilate_width = {self.dilation[1]};
int thread = 512;
if (output_width > 1024 && output_width <= 2048)
thread = (output_width - 1) / 2 + 1;
else if (output_width > 512 && output_width <= 1024)
thread = output_width;
int blocks = std::min(std::max(thread / output_width, 1), output_height);
dim3 threads(std::min(output_width, thread), blocks, 1);
dim3 grid(output_channels, batch_size, 1);
KernelDepthwiseConv<
input_type, ksize_height, ksize_width,
stride_height, stride_width>
<<<grid, threads>>>(
input_p, filter_p, batch_size, output_channels, output_height,
output_width, input_channels, input_height, input_width,
padding_height, padding_width, dilate_height,
dilate_width, output_p);
"""
)
return output
def grad(self, grad):
x, weight = self.save_vars
Kh, Kw = self.Khw
return jt.code([x.shape, weight.shape], [x.dtype, weight.dtype], [x, weight, grad],
cuda_header = f"#include <{jt.compile_extern.cub_home}cub/cub.cuh>"+"""
template <typename T>
__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) {
typedef cub::WarpReduce<T> WarpReduce;
typename WarpReduce::TempStorage temp_storage;
value = WarpReduce(temp_storage).Sum(value);
if (cub::LaneId() == 0)
atomicAdd(sum, value);
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <typename T,
int filter_height,
int filter_width,
int stride_height,
int stride_width>
__global__ void KernelDepthwiseConvInputGradCFilter(
const T *const input_data, const T *const output_grad_data,
const T *const filter_data, const int batch_size,
const int output_channels, const int output_height,
const int output_width, const int input_channels,
const int input_height, const int input_width,
const int padding_height, const int padding_width,
const int dilate_height, const int dilate_width,
T *const input_grad_data) {
  const int kWeightSize = filter_height * filter_width + 1;
  T r_weight[kWeightSize];
const int batch = blockIdx.y;
const int c_in = blockIdx.x;
const T* weight = filter_data + c_in * filter_height * filter_width;
for (int i = 0; i < filter_height * filter_width; i++)
r_weight[i] =
weight[filter_height * filter_width - i - 1];
for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) {
for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) {
const int batch = blockIdx.y;
const int c_in = blockIdx.x;
int h_out_start = h_in - (filter_height - 1) * dilate_height + padding_height;
int w_out_start = w_in - (filter_width - 1) * dilate_width + padding_width;
T value = 0;
int index =
((batch * gridDim.x + c_in) * input_height + h_in) * input_width +
w_in;
for (int h_out = h_out_start, h_f = 0; h_f < filter_height;
h_out += dilate_height, h_f++) {
for (int w_out = w_out_start, w_f = 0; w_f < filter_width;
w_out += dilate_width, w_f++) {
int s_h_out = h_out / stride_height;
int s_w_out = w_out / stride_width;
if (h_out % stride_height == 0 && w_out % stride_width == 0 &&
s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 &&
s_w_out < output_width) {
const int output_grad_offset =
((batch * output_channels + c_in) * output_height +
s_h_out) *
output_width +
s_w_out;
value +=
output_grad_data[output_grad_offset] *
r_weight[h_f * filter_width + w_f];
}
}
}
input_grad_data[index] = value;
}
}
}
// Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
template <typename T>
__global__ void KernelDepthwiseConvFilterGrad(
const T* output_grad_data, const T* input_data, const int num,
const int output_channels, const int output_height, const int output_width,
const int input_channels, const int input_height, const int input_width,
const int filter_height,
const int filter_width, const int stride_height, const int stride_width,
const int padding_height, const int padding_width, const int dilate_height,
const int dilate_width, T* filter_grad_data) {
T s = 0;
int gbid = (((blockIdx.z * blockDim.z + threadIdx.z) * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x;
for (int image_w = threadIdx.x; image_w < output_width;
image_w += blockDim.x) {
for (int bid = 0; bid < num; bid++) {
//for (int bid = threadIdx.z; bid < num; bid+=blockDim.z) {
for (int image_h = threadIdx.y; image_h < output_height;
image_h += blockDim.y) {
int kernel_id = blockIdx.z;
int kernel_h = blockIdx.y * dilate_height - padding_height;
int kernel_w = blockIdx.x * dilate_width - padding_width;
int image_hk = image_h * stride_height + kernel_h;
int image_wk = image_w * stride_width + kernel_w;
if (image_hk < 0 || image_hk >= input_height) continue;
if (image_wk < 0 || image_wk >= input_width) continue;
#define gaid(N, C, H, W) \
((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W))
int input_id = ((bid * gridDim.z +
kernel_id) *
input_height +
image_hk) *
input_width +
image_wk;
s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] *
input_data[input_id];
#undef gaid
}
}
}
CudaAtomicAddWithWarp(&filter_grad_data[gbid], s);
}
""",
cuda_src=f"""
// source for backward to data
@alias(input, in0)
@alias(filter, in1)
@alias(output_grad, in2)
@alias(input_grad, out0)
@alias(filter_grad, out1)
const int batch_size = input_shape0;
const int input_channels = input_shape1;
const int input_height = input_shape2;
const int input_width = input_shape3;
const int output_channels = output_grad_shape1;
const int output_height = output_grad_shape2;
const int output_width = output_grad_shape3;
const int ksize_height = {Kh};
const int ksize_width = {Kw};
const int stride_height = {self.stride[0]};
const int stride_width = {self.stride[1]};
const int padding_height = {self.padding[0]};
const int padding_width = {self.padding[1]};
const int dilate_height = {self.dilation[0]};
const int dilate_width = {self.dilation[1]};
int thread = 512;
if (input_width > 1024 && input_width <= 2048)
thread = (input_width - 1) / 2 + 1;
else if (input_width > 512 && input_width <= 1024)
thread = input_width;
int blocks = std::min(std::max(thread / input_width, 1), input_height);
dim3 threads(std::min(input_width, thread), blocks, 1);
dim3 grid(input_channels, batch_size, 1);
KernelDepthwiseConvInputGradCFilter<
input_type, ksize_height, ksize_width
, stride_height, stride_width>
<<<grid, threads, 0>>>(
input_p, output_grad_p, filter_p, batch_size,
output_channels, output_height, output_width, input_channels,
input_height, input_width, padding_height,
padding_width, dilate_height, dilate_width, input_grad_p);
// source for backward to filter
int block_size = 512;
if (output_width > 1024 && output_width <= 2048)
block_size = (output_width - 1) / 2 + 1;
else if (output_width > 512 && output_width <= 1024)
block_size = output_width;
int crop_output_height =
std::min(std::max(block_size / output_width, 1), output_height);
grid = dim3(ksize_width, ksize_height, output_channels);
threads = dim3(std::min(output_width, block_size), crop_output_height, 1);
cudaMemsetAsync(filter_grad_p, 0, filter_grad->size);
KernelDepthwiseConvFilterGrad<
input_type><<<grid, threads, 0>>>(
output_grad_p, input_p, batch_size, output_channels,
output_height, output_width, input_channels, input_height,
input_width, ksize_height, ksize_width,
stride_height, stride_width, padding_height, padding_width,
dilate_height, dilate_width, filter_grad_p);
"""
)
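# A minimal usage sketch (the CUDA path above is taken when jt.flags.use_cuda
# is set; otherwise execute falls back to nn.conv2d):
#   conv = DepthwiseConv(stride=1, padding=1)
#   x = jt.random((2, 8, 16, 16))   # NCHW input
#   w = jt.random((8, 1, 3, 3))     # one 3x3 filter per channel
#   y = conv(x, w)                  # output shape (2, 8, 16, 16)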

View File

@ -0,0 +1,190 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Haoyang Peng <2247838039@qq.com>
# Dun Liang <randonlang@gmail.com>.
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import math
import os
import numpy as np
import jittor as jt
from jittor import nn
from jittor.nn import binary_cross_entropy_with_logits
from jittor import lgamma, igamma
from jittor.math_util.gamma import gamma_grad, sample_gamma
def simple_presum(x):
src = '''
__inline_static__
@python.jittor.auto_parallel(1)
void kernel(int n0, int i0, in0_type* x, in0_type* out, int nl) {
out[i0*(nl+1)] = 0;
for (int i=0; i<nl; i++)
out[i0*(nl+1)+i+1] = out[i0*(nl+1)+i] + x[i0*nl+i];
}
kernel(in0->num/in0->shape[in0->shape.size()-1], 0, in0_p, out0_p, in0->shape[in0->shape.size()-1]);
'''
return jt.code(x.shape[:-1]+(x.shape[-1]+1,), x.dtype, [x],
cpu_src=src, cuda_src=src)
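# Usage sketch (illustrative values): simple_presum prepends a zero and returns
# inclusive prefix sums along the last axis, so shape (..., n) -> (..., n + 1).
#   simple_presum(jt.array([[0.2, 0.3, 0.5]]))  # -> [[0.0, 0.2, 0.5, 1.0]]
# Categorical below slices this into left/right cumulative bounds per class.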
class OneHotCategorical:
def __init__(self, probs=None, logits=None):
Categorical.__init__(self, probs, logits)
def sample(self, sample_shape=[]):
shape = sample_shape + self.probs.shape[:-1] + (1,)
rand = jt.rand(shape)
one_hot = jt.logical_and(self.cum_probs_l < rand, rand <= self.cum_probs_r).float()
return one_hot
def log_prob(self, x):
x = jt.argmax(x, dim=-1)[0]
return Categorical.log_prob(self, x)
def entropy(self):
p_log_p = self.logits * self.probs
return -p_log_p.sum(-1)
class Categorical:
def __init__(self, probs=None, logits=None):
assert not (probs is None and logits is None)
if probs is None:
            # cannot align to pytorch, which derives probs from logits via softmax
probs = jt.sigmoid(logits)
probs = probs / probs.sum(-1, True)
if logits is None:
logits = jt.safe_log(probs)
with jt.no_grad():
self.probs = probs
self.logits = logits
self.cum_probs = simple_presum(self.probs)
self.cum_probs_l = self.cum_probs[..., :-1]
self.cum_probs_r = self.cum_probs[..., 1:]
def sample(self, sample_shape=()):
shape = sample_shape + self.probs.shape[:-1] + (1,)
rand = jt.rand(shape)
one_hot = jt.logical_and(self.cum_probs_l < rand, rand <= self.cum_probs_r)
index = one_hot.index(one_hot.ndim - 1)
return (one_hot * index).sum(-1)
def log_prob(self, x):
a = self.probs.ndim
b = x.ndim
indexes = tuple( f'i{i}' for i in range(b-a+1, b) )
indexes = indexes + (x,)
return jt.safe_log(self.probs).getitem(indexes)
def entropy(self):
p_log_p = self.logits * self.probs
return -p_log_p.sum(-1)
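# Usage sketch (hypothetical values, assuming the interface above):
#   c = Categorical(probs=jt.array([0.1, 0.2, 0.7]))
#   idx = c.sample((4,))    # class indices drawn by bracketing a uniform
#                           # sample between cum_probs_l and cum_probs_r
#   lp = c.log_prob(idx)    # log of the probabilities at those indices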
class Normal:
def __init__(self, mu, sigma):
self.mu = mu
self.sigma = sigma
    def sample(self, sample_shape=None):
        return jt.normal(jt.array(self.mu), jt.array(self.sigma), size=sample_shape)
def log_prob(self, x):
var = self.sigma**2
log_scale = jt.safe_log(self.sigma)
return -((x-self.mu)**2) / (2*var) - log_scale-np.log(np.sqrt(2*np.pi))
def entropy(self):
return 0.5+0.5*np.log(2*np.pi)+jt.safe_log(self.sigma)
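# Sanity-check sketch: for Normal(0., 1.), log_prob(0.) reduces to
# -log(sqrt(2*pi)) ~ -0.9189, and entropy() to 0.5 + 0.5*log(2*pi) ~ 1.4189.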
class Uniform:
def __init__(self,low,high):
self.low = low
self.high = high
assert high > low
def sample(self,sample_shape):
return jt.uniform(self.low,self.high,sample_shape)
def log_prob(self,x):
        if x < self.low or x >= self.high:
            # outside the support the density is 0, so the log-prob is -inf
            return -math.inf
        return -jt.safe_log(self.high - self.low)
def entropy(self):
return jt.safe_log(self.high - self.low)
class Geometric:
    def __init__(self,p=None,logits=None):
        assert (p is not None) or (logits is not None)
        if p is None:
            self.prob = jt.sigmoid(logits)
            self.logits = logits
        elif logits is None:
            # validate p only when it is given (logits-only construction
            # leaves p as None, so asserting unconditionally would fail)
            assert 0 < p and p < 1
            self.prob = p
            self.logits = -jt.safe_log(1. / p - 1)
    def sample(self, sample_shape):
        u = jt.rand(sample_shape)
        # inverse-CDF draw using self.prob
        return (jt.safe_log(u) / (jt.safe_log(-self.prob+1))).floor_int()
def log_prob(self, x):
return x*jt.safe_log(-self.prob+1)+jt.safe_log(self.prob)
def entropy(self):
return binary_cross_entropy_with_logits(jt.array(self.logits),jt.array(self.prob)) / self.prob
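# Sampling note (illustrative): floor(log(u) / log(1 - p)) for u ~ Uniform(0, 1)
# is the standard inverse-CDF draw for a Geometric(p) counting the number of
# failures before the first success, which is what sample() implements above.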
class GammaDistribution:
    '''
    For now, only the gamma distribution is supported.
    '''
def __init__(self, concentration, rate):
self.concentration = concentration
self.rate = rate
self.lgamma_alpha = lgamma.apply(jt.array([concentration,]))
def sample(self, shape):
return sample_gamma(self.concentration, shape)
def cdf(self, value):
return igamma(self.concentration, value)
def log_prob(self, value):
return (self.concentration * jt.log(self.rate) +
(self.concentration - 1) * jt.log(value) -
self.rate * value - self.lgamma_alpha)
def mean(self):
return self.concentration / self.rate
    def mode(self):
        # the gamma mode is (concentration - 1) / rate, clamped at 0 when
        # concentration < 1 (where the density peaks at the origin)
        return np.maximum((self.concentration - 1) / self.rate, 0)
def variance(self):
return self.concentration / (self.rate * self.rate)
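# Quick numeric check (illustrative): for concentration=2.0, rate=3.0 the
# closed forms above give mean 2/3, mode 1/3 and variance 2/9.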
def kl_divergence(cur_dist, old_dist):
assert isinstance(cur_dist, type(old_dist))
if isinstance(cur_dist, Normal):
vr = (cur_dist.sigma / old_dist.sigma)**2
t1 = ((cur_dist.mu - old_dist.mu) / old_dist.sigma)**2
return 0.5*(vr+t1-1-jt.safe_log(vr))
if isinstance(cur_dist, Categorical) or isinstance(cur_dist,OneHotCategorical):
t = cur_dist.probs * (cur_dist.logits-old_dist.logits)
return t.sum(-1)
if isinstance(cur_dist, Uniform):
res = jt.safe_log((old_dist.high - old_dist.low) / (cur_dist.high - cur_dist.low))
if old_dist.low > cur_dist.low or old_dist.high < cur_dist.high:
res = math.inf
return res
if isinstance(cur_dist, Geometric):
return -cur_dist.entropy() - jt.safe_log(-old_dist.prob+1) / cur_dist.prob - old_dist.logits
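# Cross-check sketch for the Normal branch above: with vr = (s1/s2)**2 and
# t1 = ((m1-m2)/s2)**2, 0.5*(vr + t1 - 1 - log(vr)) expands to the textbook
# KL(N(m1,s1) || N(m2,s2)) = log(s2/s1) + (s1**2 + (m1-m2)**2)/(2*s2**2) - 0.5.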

View File

@ -0,0 +1,8 @@
class EinopsError(RuntimeError):
""" Runtime error thrown by einops """
pass
__all__ = ['rearrange', 'reduce', 'repeat', 'parse_shape', 'asnumpy', 'EinopsError']
from jittor.einops.einops import rearrange, reduce, repeat, parse_shape, asnumpy
