Compare commits

...

1116 Commits

Author SHA1 Message Date
lidongyang 845c24c9f8 update to 1.3.10.0 2025-07-28 18:50:28 +08:00
DongYang Li d892a83d1c
Merge pull request #654 from Jittor/hyx
merge hw backend
2025-07-28 18:36:24 +08:00
lidongyang 4017b161d2 fix master 2025-07-28 18:33:35 +08:00
uyzhang c78db2a794 enable cuda and acl 2025-07-19 11:05:30 +08:00
uyzhang f8e44de79d merge by JittorHW 2025-07-19 08:59:51 +08:00
lidongyang b79ac22b05 add updated code 2025-07-15 20:05:00 +08:00
lidongyang 2f37158e3e revert huawei support code 2025-07-15 19:51:30 +08:00
Yuxuan Han daf04e9fb5
Merge pull request #646 from Jittor/fixHW
adjust aclnn.h reference
2025-06-17 13:34:28 +08:00
Yuxuan Han 3cf5d7f2a4 adjust aclnn.h reference 2025-06-16 11:05:05 +08:00
Zikai Xiao 330dec69d2 Merge pull request #623 from 514flowey/master
Fix Get Item Problem. Warning: This change has not passed a completely check.
2025-06-10 21:52:37 +08:00
514flowey 58192fc7ef fix cutlass.zip url 2025-06-10 21:50:39 +08:00
Zikai Xiao 8f5048882f Merge branch 'Jittor:master' into master 2025-06-10 21:48:20 +08:00
Yi Zhang 330ef620f7 Merge pull request #626 from fleurs03/master
fix unqualified call to 'std::move'
2025-05-14 11:27:01 +08:00
Yi Zhang b04e197c22 Merge pull request #638 from Jittor/JittorHW
Update Huawei ACL
2025-05-14 11:26:33 +08:00
Yuxuan Han 91190b949e temporarily unable hccl 2025-05-13 09:09:37 +08:00
Yuxuan Han e8f94f4003 update from HuaWei ACL 2025-05-12 15:28:18 +08:00
DongYang Li 0abdc60b77 Merge pull request #630 from Exusial/rdkit
debug rdkit
2025-04-22 20:20:49 +08:00
Exusial 45465befd3 fix histc. 2025-04-22 20:16:43 +08:00
Yuxuan Han 6407252044 fix bug: Conv2dBackward gradBias shape 2025-04-13 23:14:18 +08:00
Exusial 917d122e96 Merge branch 'master' of https://github.com/Jittor/jittor 2025-03-30 09:18:43 +08:00
CHEN Xinsheng 9b53d2b5a7 Merge pull request #632 from CHEN-Xinsheng/reduce-memory
reduce SFRL large block size to 5242880
2025-03-18 20:09:33 +08:00
Xinsheng Chen 95a17684fa reduce memory alloc (in certain cases) 2025-03-18 16:04:39 +08:00
DongYang Li 4c75b24cc9 Update README.md 2025-03-05 19:46:54 +08:00
Exusial 449874356d debug. 2025-02-26 16:21:30 +08:00
CHEN Xinsheng 8a74e9e78a support `jt.any` with argument `dim` 2025-02-18 23:29:05 +08:00
CHEN Xinsheng 23abcda711 copy `jittor.attention` from jittor official repo 2025-02-18 22:06:54 +08:00
CHEN Xinsheng cc3b402913 allow the input of `concat` to be tuple` 2025-02-18 20:56:57 +08:00
zjp_shadow e4be9b1f78 Update HCCL to support multi npus 2025-02-10 16:18:34 +08:00
DongYang Li 86841e858d fix arch90 2025-02-09 02:30:35 +08:00
DongYang Li b166e4e385 Update attention.py fix parameter_name error 2025-02-08 18:03:34 +08:00
MenghaoGuo 646a0346fb Merge pull request #627 from plutoZZZZ/master
update Cusparse op
2025-01-03 17:23:03 +08:00
Yuxuan Han 7ba878bf49 Merge pull request #17 from CSCG-Lab/splits
fix conv2dbackward
2025-01-02 14:40:25 +08:00
Yuxuan Han 2a193eb836 fix conv2dbackward 2025-01-02 14:32:02 +08:00
lusz ece4e3efaa update cusparse trans 2024-12-29 17:25:18 +08:00
lusz 02c3173def update cusparse trans 2024-12-29 17:23:49 +08:00
lusz 1bf6f73d4c update cusparse trans 2024-12-29 16:48:34 +08:00
Yuxuan Han 4e462a6b85 Merge pull request #16 from CSCG-Lab/splits
delete acl_op.h
2024-12-24 10:09:13 +08:00
Exusial 6483f2710b delete acl_op.h 2024-12-24 10:04:44 +08:00
Yuxuan Han 71b990590d Merge pull request #15 from CSCG-Lab/splits
fix conv,relu, split expand
2024-12-23 22:05:31 +08:00
Exusial 9679b992a5 fix conv,relu, split expand 2024-12-23 17:24:09 +08:00
Yuxuan Han 5ce5b45c58 Merge pull request #14 from CSCG-Lab/splits
split triu,embedding,batchnorm
2024-12-23 15:57:30 +08:00
Yuxuan Han 36cc2b33d6 Merge branch 'main' into splits 2024-12-23 15:57:22 +08:00
Exusial fb89c96cc4 split triu,embedding,batchnorm 2024-12-23 15:49:07 +08:00
Exusial 4c2c9bc8e1 polish code 2024-12-23 13:43:21 +08:00
Yi Zhang 093d562aeb Update acl_op_exec.cc 2024-12-23 12:32:31 +08:00
Yi Zhang 572f4301c2 Update acl_compiler.py 2024-12-20 22:44:49 +08:00
Yuxuan Han 1c752fbd83 Merge pull request #13 from CSCG-Lab/splits
split stack,rope,nantonum
2024-12-19 19:42:16 +08:00
Exusial 130f02814d split stack,rope,nantonum 2024-12-19 19:40:37 +08:00
lidongyang 8419709e31 update version 1.3.9.14 2024-12-19 16:33:10 +08:00
lidongyang 14f000f867 tmp fix zipfile for jittorllama 2024-12-19 16:32:37 +08:00
DongYang Li 30bb3dbf22 Merge pull request #625 from Exusial/master
fix duplicate definition in cudnnops
2024-12-19 16:29:32 +08:00
Yuxuan Han c3a6df6682 Merge pull request #12 from CSCG-Lab/splits
split silu,sigmoid,softmax
2024-12-19 10:18:26 +08:00
Exusial 144b7bc57d split silu,sigmoid,softmax 2024-12-19 10:17:06 +08:00
Yuxuan Han 4219d445a4 Merge pull request #11 from CSCG-Lab/splits
split relu,dropout,transpose,flashattention
2024-12-19 09:37:32 +08:00
Exusial bfe1ceb82b split relu,dropout,transpose,flashattention 2024-12-19 09:36:32 +08:00
hjc21 67f9b7ad61 fix unqualified call to 'std::move' 2024-12-18 17:09:24 +08:00
Exusial 9ce77dfb82 fix duplicate definition in cudnnops 2024-12-18 14:19:04 +08:00
CHEN Xinsheng 2a67644b0d fix lack of import 2024-12-17 19:44:35 +08:00
514flowey 4225804df2 Merge branch 'Jittor:master' into master 2024-12-16 22:25:47 +08:00
514flowey eefd57c0f4 Merge branch 'master' of github.com:514flowey/jittor 2024-12-16 22:20:20 +08:00
514flowey 9e7e479df2 Add Index Check for Get Item. Warning: It may slow down the speed, and has not passed a fully check! 2024-12-16 22:19:55 +08:00
Yuxuan Han fa89429a21 Merge pull request #10 from CSCG-Lab/splits
split where,scatter,floor
2024-12-14 10:37:15 +08:00
Exusial 8762352c64 split where,scatter,floor 2024-12-14 10:36:37 +08:00
MenghaoGuo ac78f57a7e Merge pull request #622 from plutoZZZZ/master
add cuda extern: Cusparse
2024-12-13 17:08:36 +08:00
Yuxuan Han 1554d416b0 Merge pull request #9 from CSCG-Lab/splits
split cumsum,gather,index
2024-12-13 14:32:01 +08:00
Exusial 064af9d543 split cumsum,gather,index 2024-12-13 14:31:29 +08:00
Yuxuan Han 5a30cd334f Merge pull request #8 from CSCG-Lab/splits
split maxpool,flip,concat
2024-12-12 19:52:31 +08:00
Exusial f8c8f7e8d7 split maxpool,flip,concat 2024-12-12 19:49:06 +08:00
Exusial 64e3ceb59e shut off sync except reduce op 2024-12-12 17:02:51 +08:00
Exusial c3b1f380eb Merge branch 'main' of https://github.com/CSCG-Lab/JittorHW 2024-12-12 16:31:40 +08:00
Exusial 14af6f0980 Merge branch 'ddd' 2024-12-12 16:27:07 +08:00
Exusial 2b63a07aa0 update base. 2024-12-12 16:26:38 +08:00
Exusial c9c02508d4 Debug nan. 2024-12-12 14:49:18 +08:00
lusz a5fdfd1408 cusparse 2024-12-11 21:38:17 +08:00
lusz 2d93b36cbb cusparse 2024-12-11 21:33:33 +08:00
Exusial 722cb8e3fc add sync in broadcast_to when shape is [1] 2024-12-10 19:20:18 +08:00
Exusial da6acc6cc3 Add flags for sync. 2024-12-10 10:17:58 +08:00
Exusial d1b313bf1d add random 2024-12-09 17:18:27 +08:00
Exusial ce533cbeb3 fix setitem 2024-12-09 16:22:37 +08:00
Exusial 99413285cb modify getitem & setitem 2024-12-07 18:52:34 +08:00
邓一轩 1db9bc2993 split matmul and bmm from acl_op 2024-12-07 11:16:49 +08:00
Exusial 420f94f283 update 2024-12-06 14:23:08 +08:00
Exusial 86331a8d8f add setitem & getitem op 2024-12-06 11:21:55 +08:00
Exusial fb00b8a558 add conv_forward op. 2024-12-04 15:45:48 +08:00
Exusial 15a7fba3da add conv_op. 2024-12-04 15:41:38 +08:00
Exusial 5544147573 Merge branch 'main' of https://github.com/CSCG-Lab/JittorHW 2024-12-04 15:40:05 +08:00
Exusial d71e59b262 fixed bug of cpp 2024-12-04 15:36:25 +08:00
CHEN Xinsheng 135446ca59 improve reduce op output 2024-12-03 11:12:56 +08:00
Exusial 3bea663698 fixed the bug of not recompile 2024-12-02 22:24:01 +08:00
Exusial f7edd32327 fix bug 2024-12-02 17:53:42 +08:00
Exusial e24a37f5ce add base op class 2024-12-01 23:36:28 +08:00
DongYang Li 63d9392e49 update version 1.3.9.13 2024-11-28 22:13:06 +08:00
CHEN Xinsheng acf5d1a05e add `jt.Var.isnan` and `jt.Var.isinf` 2024-11-28 22:04:42 +08:00
514flowey 7638ab5ffb Merge pull request #604 from 514flowey/master
Fix RNN code op bug
2024-11-28 19:41:29 +08:00
514flowey 421d5a4fa4 Merge branch 'Jittor:master' into master 2024-11-28 19:40:55 +08:00
514flowey 9ee61d26f1 fix rnn op bug 2024-11-28 19:40:08 +08:00
Exusial 352bb8d6a7 update reduce. 2024-11-28 16:24:07 +08:00
Exusial ca712e241b update 2024-11-28 15:45:15 +08:00
Exusial a1add64d6c fix compile include aclops.h in aclops 2024-11-28 13:24:47 +08:00
Exusial 0dc84ebed8 update get_dtype 2024-11-27 20:21:34 +08:00
Exusial 8e5ee574f5 merge reduce. 2024-11-27 19:31:48 +08:00
Exusial edf2755cb5 Merge branch 'main' into dev 2024-11-27 11:18:47 +08:00
Exusial 8c33770036 update. 2024-11-27 11:18:19 +08:00
张仪 89f5b98741 split binary and unary op by hy 2024-11-26 23:22:41 +08:00
张仪 e8ae65d797 update 2024-11-26 21:27:06 +08:00
CHEN Xinsheng d4793e2146 Merge pull request #603 from CHEN-Xinsheng/master
fix `nn.Dropout` (dtype convert)
2024-11-26 19:08:17 +08:00
Xinsheng Chen 2bab0bb8dd fix `nn.Dropout` (dtype convert) 2024-11-26 19:07:20 +08:00
CHEN Xinsheng b37fae105b fix `nn.Dropout` (dtype) 2024-11-26 18:59:12 +08:00
Yuxuan Han 8ee6a45d5c Merge pull request #7 from CSCG-Lab/unittest
add isnan, isinf
2024-11-26 10:05:03 +08:00
Yuxuan Han 2580a98710 add isnan, isinf 2024-11-26 10:01:58 +08:00
Yuxuan Han 8a55cfe5b3 Merge pull request #5 from CSCG-Lab/unittest
add stack
2024-11-24 09:47:29 +08:00
Yuxuan Han 66e18b85a8 add stack 2024-11-24 09:46:23 +08:00
张仪 2e137f73b0 big op and recompile bug 2024-11-21 23:59:41 +08:00
Yuxuan Han c8b76acece Merge pull request #4 from CSCG-Lab/unittest
Fix relu grad, flip. Add more unittest
2024-11-21 16:01:35 +08:00
Yuxuan Han d6917eda4c fix relu grad, add more unittest 2024-11-21 15:59:40 +08:00
Yuxuan Han 0ff1deccb7 fix flip, add softmax 2024-11-21 15:12:12 +08:00
CHEN Xinsheng 8159093262 fix `getitem` (list case) 2024-11-18 20:15:39 +08:00
CHEN Xinsheng d3f2dc5606 add: support numpy int as an index for `getitem` 2024-11-18 17:36:22 +08:00
张仪 47f0c8acda update unit test 2024-11-18 11:54:32 +08:00
CHEN Xinsheng 0b637852f1 Merge pull request #601 from CHEN-Xinsheng/master
fix `stack`
2024-11-14 19:31:45 +08:00
CHEN Xinsheng 1d0602ae32 fix `stack` 2024-11-14 19:30:22 +08:00
CHEN Xinsheng 19b7bbbe57 fix `stack` 2024-11-14 18:01:25 +08:00
CHEN Xinsheng a5b16925e8 add unit test for `any` 2024-11-14 17:35:25 +08:00
CHEN Xinsheng 9b6fd17e20 add `jt.Var.cumsum` and `jt.Var.cub_cumsum` 2024-11-14 17:34:48 +08:00
DongYang Li c10acf34bc Update version 1.3.9.12 2024-11-14 16:17:33 +08:00
CHEN Xinsheng 3d06d25077 add ACL op `any` 2024-11-12 10:32:07 +08:00
CHEN Xinsheng a0dfdc5ff0 add `getitem` (`None` case) 2024-11-11 21:33:55 +08:00
邓一轩 e0537e5c1a concat 2024-11-11 19:58:31 +08:00
邓一轩 58fc5a9b35 fix conv 2024-11-11 19:43:56 +08:00
邓一轩 495a26a458 add sync at end of all op 2024-11-11 17:09:22 +08:00
邓一轩 b9986ac53b use switch 2024-11-11 15:52:35 +08:00
CHEN Xinsheng c747053a54 fix `concat` 2024-11-09 20:13:47 +08:00
514flowey 1474ebe608 Merge pull request #600 from 514flowey/master
update optimizer
2024-11-05 14:18:37 +08:00
514flowey 382bd3f0e5 update optimizer 2024-11-05 14:14:10 +08:00
CHEN Xinsheng 4e4e67dfd5 fix `nonzero` 2024-11-05 12:46:00 +08:00
dengyx21 810af5953b Revert "sync only on broadcast_to from [1]"
This reverts commit 1439a03fca.
2024-11-05 11:29:10 +08:00
CHEN Xinsheng 33bd28fdb3 add ACL op `where` (unary case) 2024-11-04 22:41:16 +08:00
CHEN Xinsheng 19d2e2e912 add ACL op `nonzero`, a temporary implementation, a bit slow 2024-11-04 22:38:29 +08:00
dengyx21 1902dab9c5 sync only on broadcast_to from [1] 2024-11-04 19:42:09 +08:00
CHEN Xinsheng f79e2908ed Merge pull request #3 from CSCG-Lab/jtorch
fix some bugs for jtorch
2024-10-30 12:34:08 +08:00
dengyx21 158ec0756c shut off a stream 2024-10-29 20:14:41 +08:00
CHEN Xinsheng b279960344 Merge branch 'main' 2024-10-29 15:42:39 +08:00
DongYang Li 56bc5f65be fix jt.index error 2024-10-24 02:32:02 +08:00
CHEN Xinsheng f34e1beafa fix warp (class case) 2024-10-21 20:39:15 +08:00
CHEN Xinsheng 1776dd4da9 fix warp (class case) 2024-10-21 17:08:01 +08:00
CHEN Xinsheng 8a31c402de fix `jt.index` 2024-10-18 17:18:06 +08:00
CHEN Xinsheng 811dc241d4 fix `jt.Var.triu_` 2024-10-18 11:13:32 +08:00
CHEN Xinsheng 17048da065 fix finfo & iinfo 2024-10-17 22:00:22 +08:00
CHEN Xinsheng cf4ce2c95e fix finfo bug in jittor 2024-10-17 21:36:00 +08:00
CHEN Xinsheng 89010f5475 fix `Var.triu` & `Var.triu_` 2024-10-17 21:24:19 +08:00
CHEN Xinsheng 6edc1f74a3 add cub_cumsum & cumprod 2024-10-17 21:23:32 +08:00
CHEN Xinsheng c886f01b53 fix warp (class case) 2024-10-17 21:22:26 +08:00
lidongyang fc1fff8c0e update version to 1.3.9.11 2024-10-08 23:03:24 +08:00
lidongyang 72be1396d9 fix: jupyter restart error 2024-10-08 23:02:27 +08:00
zjp_shadow 8966ca4320 fix transpose 2024-10-06 21:20:12 +08:00
uyzhang a078268e18 polish 2024-10-01 19:34:51 +08:00
uyzhang 33898421e4 Merge branch 'main' of https://github.com/CSCG-Lab/JittorHW into main 2024-10-01 18:16:47 +08:00
uyzhang 4c6d726a4c Refactor transpose_acl function and fix bug in matmul_acl 2024-10-01 18:14:08 +08:00
张仪 cb75c8dedd format 2024-09-29 13:47:22 +08:00
uyzhang c268a0bfaf Refactor aclnn.h and acl_op.h to add support for FlashAttention and FlashAttentionBackward 2024-09-29 12:29:41 +08:00
uyzhang 146574d7d1 Refactor transpose_acl function and fix bug in matmul_acl 2024-09-27 19:47:13 +08:00
uyzhang 4329f3b287 Refactor transpose_acl function and fix bug in matmul_acl 2024-09-27 19:44:04 +08:00
zjp_shadow b48d8664a1 add transpose 2024-09-27 19:37:19 +08:00
uyzhang c7c7326456 fixed the bug in matmul 2024-09-27 16:54:43 +08:00
Yi Zhang 810530b3cc Merge pull request #1 from CSCG-Lab/concat
Update concat
2024-09-25 12:19:26 +08:00
zjp_shadow d648713ec5 Update concat 2024-09-25 00:37:36 +08:00
uyzhang 934885c96e Merge branch 'main' of https://github.com/CSCG-Lab/JittorHW into main 2024-09-23 23:12:49 +08:00
uyzhang dc29fa69dc FEAT! opt transpose in matmul and bmm 2024-09-23 23:12:44 +08:00
uyzhang c3df41e77b Refactor acl_compiler.py to handle gradient accumulation in bmm_acl and matmul_acl functions 2024-09-23 23:11:04 +08:00
uyzhang d092b83d0b Merge branch 'main' of https://github.com/CSCG-Lab/JittorHW into main 2024-09-23 22:43:47 +08:00
uyzhang 74aa4e68c2 Refactor acl_compiler.py to handle gradient accumulation in bmm_acl and matmul_acl functions 2024-09-23 22:43:44 +08:00
uyzhang 7fa22e2e32 add Ellipsis 2024-09-23 22:27:44 +08:00
uyzhang 9578e30972 Refactor acl_compiler.py to handle gradient accumulation in bmm_acl and matmul_acl functions 2024-09-23 20:40:53 +08:00
uyzhang 37671ccec1 Refactor acl_compiler.py to handle gradient accumulation in bmm_acl and matmul_acl functions 2024-09-23 20:26:46 +08:00
uyzhang 657687e0c0 Refactor acl_compiler.py to handle gradient accumulation in bmm_acl and matmul_acl functions 2024-09-23 16:09:42 +08:00
uyzhang 2a142ae73d fix bug of setitem cpu when use acl 2024-09-23 15:34:45 +08:00
uyzhang 9907aad7de fix getitem&setitem slice bug 2024-09-23 13:58:37 +08:00
uyzhang 2c2e8abe59 fix slice setitem 2024-09-23 13:18:49 +08:00
uyzhang 0d5035443e fix setitem not in graph 2024-09-23 03:26:12 +08:00
uyzhang fa288cb4d9 Refactor acl_op.h to use __fp16 for alphaValue in the case of ACL_FLOAT16 dtype 2024-09-22 18:06:38 +08:00
uyzhang 9ff62acf7d Refactor acl_op.h to use __fp16 for alphaValue in the case of ACL_FLOAT16 dtype
Refactor grad method for improved performance and synchronization
Index indices to int32
Fix getitem bug
Add getitem&setitem mask
2024-09-22 16:41:43 +08:00
lidongyang 8888b25ea7 fix getitem bug 2024-09-22 02:30:16 +08:00
lidongyang 464009af42 add getitem&setitem mask 2024-09-21 22:57:54 +08:00
uyzhang a357a7913d Refactor acl_op.h to use __fp16 for alphaValue in the case of ACL_FLOAT16 dtype 2024-09-21 17:17:47 +08:00
uyzhang 631a9a3aaa Refactor grad method for improved performance and synchronization 2024-09-21 14:20:10 +08:00
lidongyang 0705ed9d8f index indices to int32 2024-09-20 22:10:15 +08:00
uyzhang 015bd10210 Refactor flip and squeeze operations for improved performance and synchronization 2024-09-20 21:54:49 +08:00
lidongyang 898ec600b4 polish getitem&setitem 2024-09-20 21:44:47 +08:00
lidongyang babd92a002 polish getitem&setitem -1 2024-09-20 20:01:45 +08:00
lidongyang cdad66c01d polish output dtype 2024-09-20 19:43:56 +08:00
张仪 18afb843ad Fix synchronization issue in acl_op.h 2024-09-19 19:52:25 +08:00
张仪 4006f242de fixed bugs 2024-09-18 17:33:23 +08:00
张仪 e47a74a497 Fix broadcasting issue in acl_compiler.py and add support for setting item in jt.Var 2024-09-14 16:00:15 +08:00
lidongyang 651b24e634 add sigmoid embedding silu 2024-09-13 03:19:25 +08:00
lidongyang 0641a50a5d change op file to acl_op.h 2024-09-12 22:29:20 +08:00
lidongyang e00e4f099c add getitem&setitem 2024-09-12 20:25:48 +08:00
张仪 c55d49a8de add new aclop 2024-09-12 20:14:22 +08:00
张仪 3beeec78b1 add new aclop & fixed some bugs 2024-09-12 17:11:23 +08:00
张仪 eb89ae19ed add new aclop 2024-09-07 22:11:39 +08:00
张仪 21580ce80e update aclnn 2024-09-07 18:18:00 +08:00
514flowey 593519203b Merge pull request #586 from fansunqi/dim
fix dim=3 error
2024-09-05 20:18:02 +08:00
范孙奇 2c141fa996 fix dim=3 error 2024-09-05 20:14:36 +08:00
DongYang Li 4b907d493c Merge pull request #584 from liylo/module
Make forward hook modify the inputs and outputs
2024-09-04 16:33:04 +08:00
DongYang Li 79527c40e9 Merge pull request #583 from liylo/func
Add support for block diag function
2024-09-04 16:32:45 +08:00
DongYang Li a1fcd0f337 Merge pull request #503 from 514flowey/attention_mask
add attention mask
2024-09-04 16:19:46 +08:00
DongYang Li 96b97ccf55 Merge pull request #549 from fansunqi/bilinear
check input1 and input2 shape in jt.nn.Bilinear()
2024-09-04 16:16:31 +08:00
DongYang Li 818edc962e Merge pull request #558 from fansunqi/Upsample
check input shape and scale factor's positiveness in jt.nn.Upsample
2024-09-04 16:16:18 +08:00
DongYang Li 60d4f5a2ef Merge pull request #582 from liylo/master
fix load_parameter for Parameterlist issue Jittor#581
2024-09-04 16:15:49 +08:00
lidongyang 30b8a637de remove compatibility 2024-09-04 16:11:51 +08:00
liylo df442516ab forward hooks now could modify inputs and outputs 2024-08-28 21:35:12 +08:00
liylo 949c6ed676 init 2024-08-28 21:27:02 +08:00
liylo 1c5519acf2 simple implementation for block diag with proper grad 2024-08-28 21:18:56 +08:00
liylo c8ca6d30eb simple implementation for block diag 2024-08-28 21:13:00 +08:00
liylo ddaf3520e3 fix load 2024-08-28 20:50:37 +08:00
514flowey dc6e888d19 Merge pull request #579 from 514flowey/complex
Add Complex Operators
2024-08-22 12:55:05 +08:00
514flowey 1fbd56bb6d fix unique bug 2024-08-22 12:53:03 +08:00
张仪 b4244090ae first commit 2024-08-21 22:15:12 +08:00
514flowey 822955ac00 add several functions 2024-08-20 15:08:19 +08:00
Yi Zhang c124023085 Merge pull request #567 from Hanyx2021/master
complement of test_aclop
2024-08-12 19:51:07 +08:00
Yuxuan Han 1c0cf4c2e4 complement of test_aclop: error of scatter()-multiple and where() 2024-08-12 19:50:29 +08:00
Yuxuan Han b46264b9f8 complement of test_aclop 2024-08-12 19:28:01 +08:00
Yuxuan Han f353b18472 complement of test_aclop 2024-08-01 16:00:00 +08:00
Yuxuan Han 4deb69c4e5 Merge pull request #1 from Jittor/master
Fixed the BUG of ACL op memory
2024-07-26 21:20:43 +08:00
Yuxuan Han 550ca96a75 complement of test_aclop 2024-07-26 21:16:09 +08:00
张仪 c25ac3a4e8 Fixed the BUG of ACL op memory 2024-07-25 15:54:57 +08:00
hanyx 69b6dd3b42 Merge remote-tracking branch 'upstream/master' 2024-07-24 21:18:52 +08:00
Yi Zhang 496b771211 Update acl_compiler.py 2024-07-24 16:20:02 +08:00
Yi Zhang 29f2fbd853 Update compile_extern.py 2024-07-24 15:43:19 +08:00
张仪 53327feff2 feat: enable ACL optimization in split function 2024-07-24 15:25:10 +08:00
Yi Zhang f2a471c2ec Merge pull request #575 from dengyx21/dev-dyx
FEAT! add floor_int
2024-07-24 15:20:15 +08:00
邓一轩 a755d64f9e FEAT! add floor_int 2024-07-24 15:13:57 +08:00
Yi Zhang 279e4113f3 Update compile_extern.py 2024-07-24 15:01:30 +08:00
Yi Zhang 140b17b824 Update acl_compiler.py 2024-07-24 14:53:04 +08:00
Yi Zhang 67d79a66d4 Merge pull request #572 from dengyx21/dev-dyx
FEAT! add aclop unittest
2024-07-19 17:05:00 +08:00
Yi Zhang 8a9c10d615 Format test_aclop.py 2024-07-19 17:04:42 +08:00
邓一轩 2b12e55447 FEAT! add aclop unittest 2024-07-19 17:01:17 +08:00
Yi Zhang f71c00c3d5 Merge pull request #571 from CHEN-Xinsheng/dev-cross_entropy_loss
fix dtype mismatch in `nn.cross_entropy_loss`
2024-07-19 16:46:41 +08:00
CHEN Xinsheng 9758b18c7d fix dtype mismatch in `nn.cross_entropy_loss` 2024-07-19 16:42:55 +08:00
Yi Zhang 54bc8484e9 Merge pull request #570 from dengyx21/dev-dyx
FEAT! where,scatter,cumsum,gather,flip
2024-07-18 20:08:37 +08:00
邓一轩 8f6563cba9 FEAT! where,scatter,cumsum,gather,flip 2024-07-18 20:04:40 +08:00
lidongyang 121fee583d add no gpu device error 2024-07-12 15:07:35 +08:00
Jiapeng Zhang f7bc197200 fix load bugs
fix load bugs of state
2024-07-10 19:58:07 +08:00
hanyx fa8b332f32 ComplexNumber:polar,view_as_complex,view_as_real 2024-07-09 22:27:28 +08:00
Yi Zhang 3f0814b482 Update acl_compiler.py 2024-07-09 21:48:35 +08:00
张仪 2ae2f1d453 update acl 2024-07-09 19:50:35 +08:00
lidongyang 3b2ca1c2c0 Merge branch 'master' of https://github.com/Jittor/jittor 2024-07-09 14:28:57 +08:00
lidongyang a58c8c7988 polish nn.Sequential __getattr__ 2024-07-09 14:28:17 +08:00
DongYang Li 914cd170b4 Merge pull request #548 from fansunqi/binary_cross_entropy_with_logits
check target and output shape in jt.nn.binary_cross_entropy_with_logits
2024-07-08 17:16:26 +08:00
DongYang Li 6736ce68e3 Merge pull request #553 from fansunqi/conv_transpose3d
modify stride positive check in jt.nn.conv_transpose3d/jt.nn.conv_transpose; add input shape check in jt.nn.conv_transpose3d/jt.nn.conv_transpose
2024-07-08 17:15:38 +08:00
DongYang Li dde745407e Merge pull request #554 from fansunqi/ConvTranspose
check stride positiveness and input shape in jt.nn.ConvTranspose
2024-07-08 17:14:05 +08:00
DongYang Li 133307627e Update nn.py 2024-07-08 17:13:32 +08:00
DongYang Li 9983779d7a Merge pull request #551 from fansunqi/Conv1d_sp
check input shape in jt.nn.Conv1d_sp
2024-07-08 17:08:33 +08:00
DongYang Li bdd6bb6de5 Merge pull request #550 from fansunqi/Conv1d
check input shape in jt.nn.Conv1d
2024-07-08 17:07:57 +08:00
DongYang Li 2b57b2d988 Merge pull request #555 from fansunqi/Dropout2d
check input shape in nn.Dropout2d
2024-07-08 17:05:49 +08:00
DongYang Li c669b1219a Merge pull request #556 from fansunqi/zeroPad2d
check input shape in jt.nn.ZeroPad2d
2024-07-08 17:05:22 +08:00
JittorRepos 596368ae7c Merge pull request #557 from fansunqi/ReplicationPad2d
check input shape in jt.nn.ReplicationPad2d
2024-07-08 17:04:21 +08:00
JittorRepos 98d7c2d0fa Merge pull request #562 from fansunqi/unfold
check parameter's positive in jt.nn.Unfold
2024-07-08 17:03:02 +08:00
lidongyang c47549e673 add isin 2024-07-05 18:12:43 +08:00
DongYang Li dcd6c6b2be update version 2024-07-02 20:02:02 +08:00
DongYang Li 7a2b94a91d Merge pull request #561 from fansunqi/fold
check parameters' positive in jt.nn.fold
2024-07-02 20:01:27 +08:00
fansunqi f4d4c9d55c check parameter's positive in jt.nn.unfold 2024-07-01 15:41:24 +08:00
Sunqi Fan c45dac35e6 Merge branch 'Jittor:master' into fold 2024-07-01 12:26:35 +08:00
fansunqi 45ccf3d2ac check parameters' positive in jt.nn.fold 2024-07-01 12:23:03 +08:00
Sunqi Fan dfec39c2b8 Merge branch 'Jittor:master' into master 2024-07-01 11:00:18 +08:00
DongYang Li 4196cb8154 update version 2024-06-25 16:49:44 +08:00
DongYang Li d8ce49cd70 Update setup.py
fix numpy version
2024-06-25 16:47:09 +08:00
范孙奇 f358fb7518 check input shape and scale factor's positiveness in jt.nn.Upsample 2024-06-10 19:27:29 +08:00
范孙奇 969d810f55 resume 2024-06-10 19:26:40 +08:00
范孙奇 78b7cf091b check input shape and scale factor's positiveness in jt.nn.Upsample 2024-06-10 19:25:53 +08:00
范孙奇 c4480b7e3b check input shape in jt.nn.ReplicationPad2d 2024-06-10 19:08:53 +08:00
范孙奇 d31b0a244d check input shape in jt.nn.ZeroPad2d 2024-06-10 19:05:49 +08:00
范孙奇 1fba329474 check input shape in nn.Dropout2d 2024-06-10 17:02:05 +08:00
范孙奇 958708ed60 modify error information 2024-06-10 16:48:27 +08:00
范孙奇 e6e5949765 add stride check in jt.nn.ConvTranspose 2024-06-10 16:45:53 +08:00
范孙奇 2266d21a8b remove 3D(unbatch) description 2024-06-10 16:43:22 +08:00
范孙奇 db8fcb33da modify stride positive check in jt.nn.conv_transpose; add input shape check in jt.nn.conv_transpose 2024-06-10 16:39:05 +08:00
范孙奇 baf6b45cf1 add input shape check in jt.nn.transpose3d 2024-06-10 16:29:58 +08:00
范孙奇 8fd834465c modify stride positive check in jt.nn.transpose3d 2024-06-10 16:08:48 +08:00
范孙奇 ae0e52dca5 check input shape in jt.nn.ConvTranspose 2024-06-06 20:55:46 +08:00
范孙奇 d895cb9d36 jt.nn.Conv1d in_channels and out_channels must be positive 2024-06-06 20:39:10 +08:00
范孙奇 b4155d8021 jt.nn.Conv1d_sp in_channels and out_channels must be positive 2024-06-06 20:35:05 +08:00
范孙奇 a42198705b check input shape in jt.nn.Conv1d_sp 2024-06-06 20:25:29 +08:00
范孙奇 4d11325634 check input shape in jt.nn.Conv1d 2024-06-06 20:18:31 +08:00
范孙奇 7f6beb58b9 check input1 and input2 shape in jt.nn.Bilinear() 2024-06-06 20:04:35 +08:00
范孙奇 2f11e3bbbe check target shape and output shape in jt.nn.binary_cross_entropy_with_logits 2024-06-06 18:02:29 +08:00
lidongyang 393684f196 polish nn.Sequential attribute 2024-06-05 22:31:20 +08:00
DongYang Li c49be7cf79 Merge pull request #546 from Hanyx2021/fix-expand
fix: some function&class input illegal paramters
2024-06-05 21:37:59 +08:00
DongYang Li 1baf90dd1b Update README.md 2024-06-04 17:55:59 +08:00
Hanyuxuan f20ea9dcf1 fix illegal parameters of ConvTranspose and Pool,issue #478,#480,#481,#482,#483 2024-05-31 15:23:42 +08:00
Hanyuxuan 26963bc70f fix Pad2d with illegal padding,issue #464,#465,#466,#467 2024-05-31 14:50:47 +08:00
Hanyuxuan 64c6400070 check x.shape and kernel_size of Pool and Pool3d,issue #461,#463 2024-05-31 14:33:07 +08:00
Hanyuxuan c7e31604c2 fix illegal parameters of PixelShuffle of issue #458,fix validity of concat of issue #459 2024-05-30 19:03:17 +08:00
Hanyuxuan 981f60c381 fix illegal parameters of Conv2d issue #471,#472,#473,#474,#475,#476,#477 2024-05-30 16:10:06 +08:00
Hanyuxuan 102a689fee fix illegal parameters of Pool and Pool3d of issue #451,#453,#456,#457 2024-05-30 15:19:19 +08:00
Hanyuxuan 681174a606 a ValueError fix of issue #450 2024-05-30 14:29:34 +08:00
Hanyuxuan fe8fb30136 a IndexError fix of issue #448 2024-05-30 14:00:27 +08:00
Hanyuxuan 317defa7a1 fix: jt.Var.expand with valid index -1 2024-05-29 11:05:53 +08:00
DongYang Li a83eea318d Merge pull request #545 from zhc7/patch-1
fix: fix for issue #544
2024-05-28 20:12:33 +08:00
zhc7 fce14c8d9d fix: a minimal quick fix for issue #544 2024-05-22 11:08:28 +08:00
Yi Zhang 426c83a8d4 Merge pull request #533 from uyzhang/master
Update ACL library and fix bugs in ACL integration
2024-05-21 12:50:05 +08:00
Yi Zhang 2c1a5e14f1 Merge branch 'master' into master 2024-05-21 12:48:58 +08:00
DongYang Li f8fde94c3f Update version to 1.3.9.8 2024-05-20 21:43:48 +08:00
DongYang Li 4e41e6b070 Merge pull request #539 from fansunqi/issue523_branch
fix issue 523;update jt.nn.Conv1d/Conv3d/conv2d/conv3d
2024-05-20 21:42:16 +08:00
DongYang Li 21eaa919b9 Merge pull request #540 from fansunqi/issue522_branch
fix issue 522,520,519,516; update jt.Pool/Pool3d
2024-05-20 21:41:52 +08:00
DongYang Li f3744aa47d Merge pull request #541 from fansunqi/issue521_branch
fix issue 521;update jt.nn.MaxUnpool2d/MaxUnpool3d
2024-05-20 21:41:34 +08:00
DongYang Li b8df8e0098 Merge pull request #543 from LDYang694/master
polish rocm support
2024-05-20 21:37:53 +08:00
lidongyang 9190180d8d polish rocm support 2024-05-20 21:34:26 +08:00
范孙奇 1bd014ca9b fix issue 521;update jt.nn.MaxUnpool2d/MaxUnpool3d
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-17 11:34:46 +00:00
范孙奇 e4981653e3 fix issue 522;update jt.Pool/Pool3d
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-17 11:22:12 +00:00
范孙奇 dd9ac69eec fix issue 523;update jt.nn.Conv1d/Conv3d/conv2d/conv3d
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-17 09:35:12 +00:00
uyzhang 871ed92fc4 fix: Add conditional import for change_function in __init__.py 2024-05-16 16:03:09 +08:00
DongYang Li 4efbbbf75c Merge pull request #443 from co63oc/patch-1
Update mnist.py
2024-05-16 15:20:07 +08:00
DongYang Li 9943ddf8de Merge pull request #536 from fansunqi/issue528_branch
fix issue 528;update conv_transpose
2024-05-16 15:19:19 +08:00
DongYang Li 5934b20720 Merge pull request #535 from fansunqi/issue529_branch
fix issue 529;update contrib.argmax_pool()
2024-05-16 15:18:14 +08:00
DongYang Li 9370896b35 Merge pull request #537 from fansunqi/issue527_branch
fix issue 527,526;update jt.zeros/ones/full/randn/randint/random
2024-05-16 14:47:53 +08:00
DongYang Li ee3c68ce7a Merge pull request #538 from fansunqi/issue525_branch
fix issue 525;update jt.nn.Reflection2d/Replication2d
2024-05-16 14:43:28 +08:00
DongYang Li 136a710775 polish PixelShuffle in nn.py 2024-05-16 14:35:21 +08:00
DongYang Li 75429f83b7 Merge pull request #534 from fansunqi/master
fix issue 531,530;update jt.nn.PixelShuffle/jt.histc
2024-05-16 14:27:24 +08:00
514flowey 96fda6dee5 Merge pull request #518 from 514flowey/complex
add complex matmul, inv, qr, eig, and svd
2024-05-16 13:38:53 +08:00
范孙奇 82595dc766 fix issue 525;update jt.nn.Reflection2d/Replication2d
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 13:49:26 +00:00
范孙奇 69fc229912 fix issue 526;update jt.randn/random/randint
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 13:34:06 +00:00
范孙奇 9f4c156e12 fix issue 527;update jt.zeros/ones/full
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 13:23:30 +00:00
范孙奇 9baccaed4d fix issue 528;update conv_transpose
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 12:21:21 +00:00
范孙奇 fc252af9a2 fix issue 529;update contrib.argmax_pool()-2
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 12:04:38 +00:00
范孙奇 72f72900d6 fix issue 529;update contrib.argmax_pool()
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 12:01:08 +00:00
范孙奇 0a1a8d738b fix issue 530; update jt.histc
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 11:31:28 +00:00
范孙奇 ab39f8283c fix issue 531
Signed-off-by: 范孙奇 <fansq20@mails.tsinghua.edu.cn>
2024-05-15 11:14:24 +00:00
uyzhang d587961209 Update ACL library and fix bugs in ACL integration 2024-05-15 19:03:04 +08:00
514flowey 290f2aec60 add complex matmul, inv, qr, eig, and svd 2024-05-08 21:09:07 +08:00
DongYang Li 4808184bee
Update version to 1.3.9.6 2024-05-07 10:33:25 +08:00
lidongyang 4c048d185a polish stack 2024-05-06 14:44:14 +08:00
DongYang Li c8092d61f5
Merge pull request #513 from LDYang694/master
fix stack bug
2024-05-06 01:39:30 +08:00
DongYang Li ce783d2950
Merge branch 'Jittor:master' into master 2024-05-06 01:38:45 +08:00
lidongyang 6945c035c5 fix stack grad bug 2024-05-06 01:38:10 +08:00
MenghaoGuo 80004c2928
Merge pull request #509 from uyzhang/master
Fix memory leak in TempAllocator::free()
2024-04-25 21:53:02 +08:00
uyzhang 82b3f99f51 Fix memory leak in TempAllocator::free() 2024-04-24 18:17:43 +08:00
MenghaoGuo 63022e143d
Merge pull request #506 from LDYang694/master
add new cuda12.2 for g++11
2024-04-23 17:13:37 +08:00
lidongyang 3092bba175 polish atomicCAS use 2024-04-23 00:43:53 +08:00
lidongyang 185ae5d7eb add new cuda12.2 for g++11 2024-04-11 00:39:57 +08:00
MenghaoGuo 0287f749e9
Merge pull request #505 from LDYang694/master
fix fp16 include
2024-04-10 12:12:47 +08:00
DongYang Li aab6b0f784
Update version 1.3.9.5 2024-04-10 02:07:32 +08:00
lidongyang 8c8c5007d8 fix fp16 include 2024-04-10 02:03:00 +08:00
514flowey e5bfdfb162 add attention mask 2024-04-08 19:26:47 +08:00
MenghaoGuo af07705726
Merge pull request #502 from LDYang694/master
merge jdiff
2024-04-08 19:10:58 +08:00
DongYang Li c9c08f5cde
Update __init__.py 2024-04-08 18:56:08 +08:00
DongYang Li c2716512cf
Merge pull request #1 from JittorRepos/master
update compatibility
2024-04-08 18:48:07 +08:00
zjp_shadow 4f2fe9c4d1 Update compatibility 2024-04-02 16:38:12 +08:00
zhangjiapeng f045e196c1 Update Load 2024-04-01 18:00:34 +08:00
Dun Liang 2744992946 add fallback_func for static graph compiler 2024-01-03 03:57:14 +08:00
Dun Liang 4b9d777570 add doc 2023-12-29 03:55:56 +08:00
lidongyang 4ae9171578 fix merge conflict 2023-12-28 16:10:57 +08:00
Dun Liang 2a995d6cc4 update version to 1.3.9.2 2023-12-27 23:20:56 +08:00
Dun Liang fa40284b60 Merge branch 'master' of https://github.com/Jittor/jittor 2023-12-27 23:18:17 +08:00
Dun Liang 6e1327ba07 fix vcompiler bug 2023-12-27 15:16:06 +08:00
Dun Liang e260f2b6bd fitten code 2023-12-05 01:34:26 +08:00
lidongyang 9f5224819e Merge branch 'master' of github.com:Jittor/jittor 2023-12-04 15:47:49 +08:00
lidongyang 1f30881c6e polish GroupNorm&fix different op 2023-12-04 15:46:03 +08:00
Dun Liang 2bd05ce04e fix pyobj memleak 2023-11-29 14:10:16 +08:00
Dun Liang 881f769ba7 add tile and ne 2023-10-06 06:59:55 +08:00
Dun Liang e4272cdd4a polish computing graph liveness 2023-09-25 14:48:05 +08:00
Dun Liang ea9026aa6b fix memleak 2023-09-25 10:43:07 +08:00
lzhengning 6e78553016 remove unnecessary prints 2023-09-18 02:41:27 +08:00
lzhengning d21dfe65e6 Add SkipFirstBatchesSampler 2023-09-18 02:06:49 +08:00
Dun Liang 77a5f37f12 fix random sampler 2023-09-16 01:56:48 +08:00
Dun Liang 47d1483ca8 dont deep copy optim state dict 2023-09-15 13:49:13 +08:00
Dun Liang 40368453e8 polish bfloat16 2023-09-15 13:12:42 +08:00
Dun Liang c485fdc07b madd bf16 2023-09-08 22:43:31 +08:00
Dun Liang 2457c5ce4a mem opt for llm 2023-09-03 02:14:09 +08:00
lzhengning 9c69783f05 Merge branch 'master' of github.com:Jittor/jittor 2023-08-25 18:04:58 +08:00
Dun Liang 88582fa854 add reuse_np_array 2023-08-25 17:50:22 +08:00
zhouwy19 ca05be86f0
Update vcompiler.cc free unused output directly 2023-08-15 18:15:44 +08:00
Dun Liang dea40b30a3 bug fix for getitem 2023-08-10 03:19:57 +08:00
Dun Liang a6fca86633 polish nccl wrapper 2023-08-08 01:03:22 +08:00
Dun Liang c48216fb75 Merge branch 'master' of github.com:Jittor/jittor 2023-08-06 18:27:53 +08:00
Dun Liang 3a65eb3c43 add mpi barrier 2023-08-06 18:26:50 +08:00
yang guo ye 21b551411e
Merge pull request #486 from 514flowey/complex_number
add complex matmul
2023-08-06 17:05:52 +08:00
514flowey 14c39a7a53 add complex matmul 2023-08-06 16:58:50 +08:00
Dun Liang aa2dc39317 Merge branch 'master' of github.com:Jittor/jittor into HEAD 2023-08-02 00:15:43 +08:00
Dun Liang f03556533c MPI half type 2023-08-02 00:14:39 +08:00
lzhengning 6df06213e3 polish mpi dataset 2023-08-01 21:35:03 +08:00
Gword 7ea3c5530d save&load 2023-07-27 21:03:05 +08:00
Dun Liang f65c600508 remove print 2023-07-26 14:45:32 +08:00
Dun Liang 54112e0e2a Merge branch 'master' of github.com:Jittor/jittor 2023-07-26 00:35:39 +08:00
Dun Liang 39cff908a7 add lock for mpi compile_module 2023-07-26 00:34:34 +08:00
cxjyxxme 89ef365c1a install cutlass 2023-07-25 15:27:27 +08:00
Dun Liang 3f2d0313ed support detial nan checker 2023-07-24 12:41:24 +08:00
Dun Liang 4b35e70b65 polish code 2023-07-17 13:47:58 +08:00
Dun Liang 9a73639a09 polish acl interface 2023-07-13 16:07:48 +08:00
Dun Liang b8854f93d1 add acl support [version 1.3.8.7] 2023-07-13 14:36:57 +08:00
Dun Liang c99546586d update version to 1.3.8.6 2023-07-07 22:09:15 +08:00
Dun Liang 5b6c26ede7 Merge commit '1018e3b2' into HEAD 2023-07-07 22:08:03 +08:00
Dun Liang 1018e3b237 add save pytorch format 2023-07-07 19:18:51 +08:00
Dun Liang 192a11c6b6 add ccec support 2023-06-30 23:31:34 +08:00
Dun Liang f2fdf89fdc add fp16 acc 2023-06-27 22:22:00 +08:00
li-xl 640bfd9c6c fix save checkpoint bugs 2023-06-26 18:36:48 +08:00
Dun Liang a92051ec23 polish version 3.9 2023-06-25 22:58:20 +08:00
Dun Liang df4e6dd5cf add static compiler 2023-06-25 22:55:42 +08:00
li-xl 100d1bdd12 support scalar in getitem && fix bugs for jtorch 2023-06-19 16:06:19 +08:00
Dun Liang b9e34290c0 Merge branch 'master' of github.com:Jittor/jittor 2023-06-10 02:36:30 +08:00
Haoyang Peng ccbe047257
fix load pth with certain format. (#445)
* fix load pth with certain format.

* fix load 64bit.

* add comments.
2023-06-10 09:31:35 +08:00
Dun Liang 4d7c7ee748 add __deepcopy__ for dataset method 2023-06-10 02:34:17 +08:00
Dun Liang 39c09a3bc2 add add_module 2023-06-10 01:16:20 +08:00
Dun Liang d595c8b00f Merge branch 'master' of github.com:Jittor/jittor 2023-06-10 01:01:30 +08:00
Dun Liang 2d4a82d5e5 polish mish activation, thanks! 2023-06-10 00:59:39 +08:00
li-xl ae3c480c21 merge 2023-06-05 16:46:50 +08:00
li-xl d4691b6f07 update jittor 2023-06-05 16:40:02 +08:00
Dun Liang 48fd6185b9 add sfrl large block setting 2023-06-05 05:30:13 +08:00
li-xl 125ce3e491 fix small bugs 2023-05-27 22:53:22 +08:00
li-xl 5480280429 fix start grad bugs 2023-05-24 07:21:44 +08:00
li-xl b721d851b5 add transformers support 2023-05-24 06:25:25 +08:00
Dun Liang c527badcf1 polish cuda flag set 2023-05-24 04:37:16 +08:00
co63oc 11e29147db Update mnist.py 2023-05-22 13:18:08 +08:00
Dun Liang b0ba0d682e LLM support 2023-05-22 11:28:08 +08:00
Dun Liang f26fdfea22 LLM training tuning 2023-05-16 23:11:05 +08:00
yang guo ye 2b1647f1c9
Merge pull request #436 from 514flowey/complex_number
add complex number
2023-05-16 16:42:11 +08:00
Dun Liang 18f054333d fix grad for torch compatible 2023-05-11 19:30:42 +08:00
514flowey 10d9fb1431 add complex number 2023-05-02 18:20:52 +08:00
Dun Liang 66119cbcb5 update version to 1.3.7.15 2023-05-01 16:06:42 +08:00
Dun Liang 289f43e431 Merge branch 'master' of github.com:Jittor/jittor 2023-05-01 16:04:48 +08:00
Dun Liang dd215b0470 polish windows error msg 2023-05-01 16:04:33 +08:00
lzhengning b824ab83c0 update copyright to 2023; update version for PR https://github.com/Jittor/jittor/pull/435 2023-04-25 14:28:38 +08:00
Haoyang Peng 4825cead21
add Gamma Distribution on Cuda (#435)
* add digamma.

* add tdir.

* add gamma distribution.

* add test for gamma distribution.

* update location.

* add api directly into jt.__init__
2023-04-25 14:25:10 +08:00
lzhengning 6bf14ee650 pip publish for ADAN optimizer, related PR https://github.com/Jittor/jittor/pull/430 2023-04-10 14:03:04 +08:00
xingyuxie ab2e2e11d4 add adan optimizer 2023-04-10 13:59:57 +08:00
zhouwy19 9c4924881d
Update AWESOME-JITTOR-LIST.md 2023-04-07 11:27:35 +08:00
Dun Liang e395ae7791 update version with weight norm 2023-04-05 02:16:23 +08:00
Haoyang Peng 2e67d0eb8f
add support for weight_norm and histc. (#420)
* add support for weight_norm and histc.

* add docs.

* fix typo.

---------

Co-authored-by: Exusial <Exusial>
2023-04-05 21:59:20 +08:00
Dun Liang d11d2c816e polish include 2023-04-05 02:12:17 +08:00
GanymedeNil 2b79a2c496
Update swap.cc (#429)
fix getpid not found
2023-04-05 21:46:21 +08:00
Dun Liang 0464661920 update version 2023-04-04 23:49:10 +08:00
Dun Liang 45f617cb54 add pid location for swap 2023-04-04 23:15:42 +08:00
Dun Liang 5cf9c10c92 fix windows virtual env bug 2023-04-04 19:30:22 +08:00
Dun Liang 3bcdfc97a9 update version 2023-04-04 17:37:24 +08:00
Dun Liang 57d5b52b18 Merge branch 'master' of github.com:Jittor/jittor 2023-04-04 17:37:07 +08:00
lzhengning 10be18fbc5 support python3.11 on mac 2023-04-04 22:59:36 +08:00
Dun Liang 0036671536 add mem opt doc 2023-04-04 17:36:56 +08:00
Dun Liang 33b091e041 support python3.11 2023-04-04 14:54:40 +08:00
Dun Liang b88ffdbdf5 fix windows load pth bug 2023-04-04 13:12:32 +08:00
Dun Liang 2ea015061c polish swap iter 2023-04-03 12:55:01 +08:00
Dun Liang 7a83dcf09e fix windows mkl bug 2023-04-03 07:56:44 +08:00
Dun Liang 16b7966a9a set default amp level 2023-04-03 01:27:55 +08:00
Dun Liang df8628a3a5 Merge branch 'master' of github.com:Jittor/jittor 2023-04-02 07:18:24 +08:00
Haoyang Peng 779274653f
Gamma (#427)
* add digamma.

* add tdir.
2023-04-02 13:57:09 +08:00
Dun Liang a19936f954 new feature: save_mem 2023-04-02 07:17:55 +08:00
Dun Liang 00b7783c46 Merge branch 'master' of github.com:Jittor/jittor 2023-03-31 21:14:20 +08:00
Dun Liang e969103399 polish to_float interface 2023-03-31 21:13:59 +08:00
Dun Liang 3300a3e4f3 polish fp16 amp_level3 2023-03-31 21:07:43 +08:00
lzhengning 463fdcc46d rollback version 2023-03-31 19:45:07 +08:00
lzhengning 1119f3cc51 update readme && nn.linear 2023-03-31 19:45:07 +08:00
lzhengning 8a9f0bb904 update load_pytorch.py && add linear 2023-03-31 19:45:07 +08:00
Dun Liang b451a1eb85 Merge branch 'master' of github.com:Jittor/jittor 2023-03-31 13:12:08 +08:00
Dun Liang faa5386d82 chatglm optimize v2 2023-03-31 13:07:47 +08:00
Exusial f79f2d37e8 add digamma. 2023-03-31 11:05:06 +08:00
Dun Liang 1e00883ed1 add attr_dict for jittor var 2023-03-24 01:53:30 +08:00
Dun Liang 5638de9eca LLM support and add version 2023-03-22 19:51:11 +08:00
Dun Liang 07f8bd4aff Merge branch 'master' of github.com:Jittor/jittor 2023-03-22 19:47:26 +08:00
Dun Liang db8679748c fp32_guard && ternary broadcast 2023-03-22 19:41:36 +08:00
lzhengning 50c859f60c support build triu/tril of an inf matrix
The original implementation results in nan because zeros * inf = nan.
Suggest to use jt.ternary op instead of multiply a mask matrix.
2023-03-22 19:19:36 +08:00
lzhengning 576b0c9e03 feat: jt.outer & jt.nn.silu 2023-03-21 15:56:21 +08:00
Dun Liang 90850d36ad LLM support with jtorch 2023-03-20 22:33:12 +08:00
lzhengning 2f0e243e01 feat: support dim=None in Var.squeeze to squeeze all dimensions of size 1 2023-03-07 11:33:16 +08:00
LiDongyang 3703fe4421 polish optimzers save&load 2023-01-15 13:38:01 +08:00
Exusial daa0862896 fix multiple sequence load. 2023-01-15 13:37:07 +08:00
Brian Pugh 5dba84ee67 Allow for nn.Sequential to be sliced. 2023-01-15 13:36:29 +08:00
Exusial d1eeec5f89 Add nan fp16 support. 2023-01-15 13:35:10 +08:00
Dun Liang b25e62f1bb polish dataloader 2023-01-15 13:30:03 +08:00
Dun Liang 6fbffb7c62 jtorch compatible 2023-01-06 20:48:04 +08:00
Dun Liang 437a720500 add multinomial operator 2023-01-06 19:39:58 +08:00
Dun Liang 5a4ae74a3a add index_select 2023-01-06 17:08:30 +08:00
lzhengning 2b24d436e3 update version 2023-01-05 11:05:35 +08:00
lzhengning eace68f368 Merge branch 'fix-macOS' 2023-01-05 10:12:07 +08:00
lzhengning 948b9796ad support onednn on macOS with intel chips 2023-01-05 10:11:35 +08:00
lzhengning 5a17e3d265 support builtin download of onednn because homebrew no longer provides onednn 2.x 2023-01-05 09:38:03 +08:00
lzhengning 919e863e55 fix depthwise conv error on ROCm 2023-01-04 14:09:53 +08:00
li-xl cc0e4f120a polish cuda thread 2023-01-03 17:47:25 +08:00
lzhengning 24fe48451a update version to fix Windows CUDA bug 2022-12-25 23:49:47 +08:00
Exusial 35763e6c2c fix windows cuda. 2022-12-25 23:45:52 +08:00
lzhengning 6b7d1a27d4 polish corex_compiler.py 2022-12-17 23:33:50 +08:00
lzhengning d50fe5e754 1. fix compilation failure of fp16_compute.h under rocm platform
2. add jt.compiler.is_cuda to identify if the current device is CUDA
2022-12-17 21:36:54 +08:00
Dun Liang 2e74df517b fix zeros_like dtype select 2022-12-09 20:24:04 +08:00
Dun Liang 2a06801681 add cascade setitem test 2022-12-08 17:36:19 +08:00
Dun Liang 7c8cfa2016 polish ones interface 2022-12-07 15:34:40 +08:00
Dun Liang dc7cf4abb0 Merge branch 'master' of github.com:Jittor/jittor 2022-12-05 14:44:07 +08:00
Dun Liang c7e604af1a cascade_setitem v[a][b][c] = x -> v[a,b,c] = x 2022-12-05 14:42:15 +08:00
lzhengning a698c88981 improved compatibility with homebrew libomp >= 15.0.6 on macOS 2022-12-04 22:00:40 +08:00
Dun Liang 499d3ee99c add triu and tril function 2022-12-04 20:47:38 +08:00
Dun Liang 44fdc718ab fix assign var in th_mode 2022-12-04 18:22:12 +08:00
Dun Liang 87489d719e add klo bench 2022-12-02 22:37:48 +08:00
zhouwy19 a76c664c65
Merge pull request #411 from uyzhang/master
add ccl
2022-12-02 22:05:26 +08:00
zhouwy19 fe6970efc8
Merge pull request #415 from Jittor/from_torch
add from_torch
2022-12-02 22:00:39 +08:00
zhouwy19 725615872a add from_torch 2022-12-02 17:21:18 +08:00
Dun Liang be8faf4dfc add grad memory check 2022-11-30 13:12:10 +08:00
Dun Liang e014f4f25c polish cuda env setup 2022-11-30 11:05:21 +08:00
Dun Liang 607d13079f add data interface for code op without recompile 2022-11-29 13:42:42 +08:00
Dun Liang f7ba3cab31 polish flag scope for profile_mark 2022-11-20 23:39:12 +08:00
Dun Liang ef55bd378f fix isnan check for int 2022-11-19 10:55:04 +08:00
cxjyxx_me fd5bd4aba9 polish ternary 2022-11-17 08:18:25 -05:00
Dun Liang ec2eef1fd9 add profiler mark for easy range profiling 2022-11-16 16:05:27 +08:00
Dun Liang 5bc160b19c dataset and dataloader interface polish 2022-11-16 11:31:55 +08:00
Dun Liang 1f06bbf22e fix dropout2d 2022-11-13 16:36:45 +08:00
Dun Liang 74932f3c32 add dropout 2d 2022-11-13 16:17:48 +08:00
Dun Liang f4f327bd12 update version 2022-11-07 12:02:35 +08:00
lidongyang b5f03f996b fix pr#355&add unittest 2022-11-07 12:01:23 +08:00
liuruiyang98 13f9eaafc0 Support einops for Jittor 2022-11-07 12:01:23 +08:00
uyzhang de6724692f add ccl 2022-11-06 15:20:54 +08:00
Dun Liang 8c9bfb639d update version 2022-11-04 23:56:20 +08:00
Dun Liang 89998ebc60 polish corex compatible 2022-11-04 23:55:55 +08:00
Dun Liang 20357caf42 update corex backend 2022-11-04 14:45:02 +08:00
Dun Liang 7e40d7831c polish import message 2022-10-27 15:45:39 +08:00
Dun Liang bc7467dbff polish backend compiler 2022-10-26 20:25:14 +08:00
Dun Liang 344e13948c add device id interface 2022-10-26 16:29:00 +08:00
Dun Liang 49a0f8ba43 polish backends 2022-10-26 14:39:38 +08:00
Dun Liang 1ccc22caab update version && fit float64 dtype 2022-10-26 14:31:56 +08:00
Dun Liang 9f36d52575 Merge branch 'master' of github.com:Jittor/jittor 2022-10-26 14:31:17 +08:00
Dun Liang 003cdf6b16 polish backend 2022-10-26 14:29:53 +08:00
lzhengning a4eb197938 fix: polish use_[device] 2022-10-18 00:28:36 +08:00
lzhengning db81cc938f fix: default to use CUDA when both CUDA and ROCm are installed 2022-10-17 12:09:36 +08:00
Dun Liang 335a2e5c1d support conv 2022-10-13 01:20:14 +08:00
Dun Liang 042c3610a3 improve cuda restart issue 2022-10-10 17:31:53 +08:00
Dun Liang c12549020f Merge branch 'master' of github.com:Jittor/jittor 2022-10-10 17:30:36 +08:00
Dun Liang 524564b763 fix some acl issue 2022-10-07 14:20:18 +08:00
Zheng-Ning Liu 7827c45047
Merge pull request #378 from Exusial/npth
Polish loading weights for PyTorch .pth files
2022-10-06 16:50:35 +08:00
Exusial 17db747ae4 merge. 2022-10-06 16:42:08 +08:00
Dun Liang a57de764f6 fix issue #401, improve atan and atan2 2022-10-05 17:21:51 +08:00
Dun Liang 9a5e7ea6f5 code op support more than 10 args 2022-10-05 16:43:00 +08:00
Exusial b21ee40360 fix wrong transpose. 2022-10-03 18:38:14 +08:00
lzhengning b2fb32aa52 feats: defaults to keep the first dim in nn.Flatten 2022-09-29 22:14:29 +08:00
lidongyang 601101ea44 add new inplace function document 2022-09-26 14:35:45 +08:00
lidongyang 59e49b064d fix trunc_normal_ & add some inplace function 2022-09-26 14:35:45 +08:00
Zheng-Ning Liu 7164b1cc0f Polish document 2022-09-22 00:56:15 +08:00
lzhengning a44c016408 Add a flag 2022-09-20 16:23:23 +08:00
lzhengning 1a91f1dd01 update version to 1.3.5.15:
1. pretty-print jt.Var
2. support area mode in interpolate
3. support reshape empty var with uncertain dimension
2022-09-19 21:46:53 +08:00
lzhengning e050dd6dc9 pretty print jt.Var 2022-09-19 21:44:48 +08:00
Exusial b258cf3a84 add documentation and tests. 2022-09-18 23:41:57 +08:00
Exusial 56255578f9 Add interpolate area support. 2022-09-18 23:41:57 +08:00
lzhengning 05ed6c7e34 fix: support reshape empty var with uncertain dimension 2022-09-16 15:36:33 +08:00
Dun Liang e661f19e20 Merge branch 'master' of github.com:Jittor/jittor 2022-09-16 01:01:20 +08:00
Dun Liang c0ed98cbd6 polish mem leak problem 2022-09-16 00:59:29 +08:00
uyzhang e0bd748ff1 fix droppath 2022-09-15 16:08:02 +08:00
uyzhang 658ab32bda add droppath in jt.nn 2022-09-14 16:35:38 +08:00
Exusial 48997cd2b3 fix linspace zero division. 2022-09-14 16:14:36 +08:00
lzhengning 46e2dcbefd 1. polish index_op doc; 2. polish pyi files 2022-09-14 16:09:14 +08:00
Xiang-Li Li 95db8b310e
Merge pull request #389 from Jittor/kldiv
feat: add KLDivLoss in jittor.nn
2022-09-13 21:30:47 +08:00
lzhengning 9d176643a0 feat: add KLDivLoss in jittor.nn 2022-09-13 21:22:51 +08:00
lzhengning 3cce690218 feat: add one_hot in jt.nn 2022-09-07 21:48:53 +08:00
root 2c8279ebe8 update installation in README 2022-09-07 15:32:40 +08:00
lixl19 83622edc97 polish unique 2022-08-30 20:17:30 +08:00
lixl19 c3efd2bd9d fix unique 2022-08-30 19:31:45 +08:00
Xiang-Li Li b54ce7f51b
Merge pull request #374 from zjp-shadow/zjp
Update the unique (add new features)
2022-08-30 19:20:26 +08:00
root 87f6c4296e Update compilation and Add the test of cuda 2022-08-30 16:28:08 +08:00
Exusial 5eaccf538d add epillsis (numpy version) support. 2022-08-29 14:09:16 +08:00
Exusial 6339f62f56 fix einsum space interpret error. 2022-08-27 22:25:00 +08:00
Exusial b6c9421a9a fix multiple read. 2022-08-27 11:11:22 +08:00
Exusial e58e0d2247 fix index. 2022-08-27 00:03:34 +08:00
Zheng-Ning Liu 1025d94ffa
Update version 2022-08-25 00:25:21 +08:00
Exusial 880441f193 allclose, add log. 2022-08-25 00:23:28 +08:00
Exusial 125adc7a04 add support for non-zip pytorch pth format. 2022-08-25 00:23:28 +08:00
root 5ffa2747a8 update the unique 2022-08-23 01:53:51 +08:00
Exusial 3b814405ce allclose, add log. 2022-08-19 21:57:28 +08:00
Exusial 92b87448a2 add support for non-zip pytorch pth format. 2022-08-18 14:47:34 +08:00
Zheng-Ning Liu 89bf097445
Update version 2022-08-17 13:54:59 +08:00
Zheng-Ning Liu ae5988f512 Update cutt_transpose_op.cc 2022-08-17 13:51:55 +08:00
lzhengning 72eb7c76bd fix: cutt tranpose error when input is empty 2022-08-17 13:51:55 +08:00
Dun Liang d2b5c281b2 polish self update issue 2022-08-17 13:51:15 +08:00
Dun Liang 9e599ebf9d polish win bug 2022-08-13 23:41:28 +08:00
Dun Liang 64a30cc6fe add reuse array optimization 2022-08-13 23:26:31 +08:00
Dun Liang 51037cbe9f polish keepdim and keepdims 2022-08-13 14:15:33 +08:00
Dun Liang a8f4a97994 update version 2022-08-13 14:00:30 +08:00
Dun Liang 268bdbea80 polish win space support 2022-08-13 13:47:32 +08:00
HinGwenWoong 93c00f884e Update README.cn.md 2022-08-13 11:50:01 +08:00
HinGwenWoong fc60ad1a36 Update README.md 2022-08-13 11:50:01 +08:00
lidongyang c59351f171 polish linspace 2022-08-13 11:49:09 +08:00
Dun Liang 19ec9d0a4e polish conda lib conflict 2022-08-09 19:25:51 +08:00
yang guo ye 20da1fe7ac
Merge pull request #366 from Jittor/gopt_zero
polish gopt
2022-08-07 19:44:41 +08:00
cxjyxx_me 8507ee4330 update version 2022-08-07 07:43:43 -04:00
cxjyxx_me 3dcee91262 Merge branch 'master' into gopt_zero 2022-08-07 07:43:13 -04:00
lzhengning 3d4a9240a2 update version 2022-08-06 01:26:42 +08:00
lzhengning 7b8f51b868 fix: no long assume default parallel compilers 2022-08-06 01:08:39 +08:00
lzhengning 75b77e411f feature: official ROCm support 2022-08-06 00:44:33 +08:00
cxjyxx_me ae9273605d polish gopt 2022-08-04 22:04:56 -04:00
lzhengning 2246519eaf support jt.where (cub & rocprim) 2022-08-01 10:37:49 +08:00
Dun Liang 2c91bc1405 polish transform compose 2022-07-31 20:18:53 +08:00
lzhengning 23c4de4901 fix: unable to launch ROCm after upgrade 2022-07-29 15:36:52 +08:00
lzhengning 130aee53d4 Merge branch 'master' into hip 2022-07-28 12:36:58 +08:00
lzhengning 056fbf1f71 :Merge branch 'hip' of https://github.com/Jittor/jittor into hip 2022-07-28 12:36:28 +08:00
cxjyxx_me 767a7d9774 polish cufft 2022-07-27 23:33:01 -04:00
Dun Liang a7496d751c polish load pth 2022-07-28 11:13:51 +08:00
Dun Liang 6f178e8b10 update version 2022-07-28 11:13:51 +08:00
Dun Liang 2efa11997d polish arm support 2022-07-28 11:13:51 +08:00
lzhengning 39a866552b fix: occasional segmentation fault when return value is not provided in c++ lambda function 2022-07-27 01:19:59 +08:00
zhouwy19 1925031877
Merge pull request #352 from LetianLee/master
Make the function full_like and zeros_like can specify the dtype
2022-07-23 20:05:40 +08:00
Letian Li 22881c40f4 Make the function full_like and zeros_like can specify the dtype 2022-07-08 03:02:49 +01:00
Dun Liang 3dc86c7552 update jittor_offline 2022-07-06 13:24:42 +08:00
Zheng-Ning Liu d1c73d3878
fix: dtype in rand_like is not used 2022-07-06 10:00:42 +08:00
Dun Liang dfc80a12e0 polish jittor_offline version dependency 2022-07-05 00:12:52 +08:00
Dun Liang 9ac0376005 add offline jittor 2022-07-05 00:11:21 +08:00
Exusial 0691e2e615 update load function. 2022-07-04 23:01:02 +08:00
Exusial 41aa22a7d3 add pth support. 2022-07-04 23:01:02 +08:00
lzhengning b10178a325 Support AMD ROCm and HIP backend 2022-06-29 00:00:09 +08:00
Dun Liang 569af2a138 polish expr type cast 2022-06-23 21:16:19 +08:00
lzhengning ee12b80007 polish mpi document 2022-06-23 11:53:21 +08:00
Zheng-Ning Liu 47d938b2f3
Update post_step document 2022-06-22 19:47:27 +08:00
Exusial 348c0050b9
add dlink. (#334)
* add dlink.

* fix if.

* split if.

* delete useless flags.
2022-06-19 13:55:11 +08:00
lzhengning e316f511c3 Support AMD ROCm and HIP backend 2022-06-11 15:19:11 +08:00
Dun Liang 46a03098f9 update version 2022-06-04 23:21:44 +08:00
Dun Liang 05464a8227 Merge branch 'master' of github.com:Jittor/jittor 2022-06-04 23:21:17 +08:00
Dun Liang 92c86fde75 fix use_cuda_managed_allocator 2022-06-04 23:21:03 +08:00
Gword d5849ac54b update version 2022-06-02 11:16:56 +08:00
Gword fa935e1a6f getlasterror in cutt_transpose 2022-06-02 11:11:50 +08:00
Zheng-Ning Liu ee3993199f
Update jittor.mpi.md
add documents for manually building openmpi
2022-05-30 16:34:07 +08:00
lzhengning 3835a56b5b fix: jt.sync_al() after load_parameters to stop fusion array & broadcast 2022-05-28 11:38:00 +08:00
Dun Liang bc945bae94 polish win bug 2022-05-27 21:03:34 +08:00
Dun Liang 7de460536c update version, add support for jtorch 2022-05-26 15:00:06 +08:00
Dun Liang 8b1d620b22 Merge branch 'master' of github.com:Jittor/jittor 2022-05-26 14:58:33 +08:00
Dun Liang bfa6cd47ae polish test 2022-05-26 14:56:08 +08:00
Dun Liang 5a1fe489cb update data gz 2022-05-26 14:29:54 +08:00
Dun Liang 31fbabfb6f add th_mode 2022-05-25 17:13:43 +08:00
zhouwy19 a011b8a7ed polish unified memory 2022-05-25 12:19:26 +08:00
lzhengning e66d22f25c fix: wrong cumsum results with the specified dimension is larger than 4096 2022-05-24 17:05:23 +08:00
Zheng-Ning Liu 2ae5e019df feat: add dozens of docs 2022-05-24 15:58:55 +08:00
Dun Liang c5ccdaf330 polish jit key 2022-05-23 14:23:02 +08:00
Dun Liang 91fe1fac85 polish win bug 2022-05-20 14:34:54 +08:00
Dun Liang 643ca5bbb4 polish setitem grad nullptr 2022-05-19 11:34:50 +08:00
Dun Liang ba266fa99c polish log_softmax precision 2022-05-18 15:37:09 +08:00
Dun Liang 4c5ac0fda9 polish JITTOR_HOME env 2022-05-18 12:32:57 +08:00
Your Name 772bdfcdf2 fix abs. 2022-05-16 21:02:56 +08:00
Exusial 24f73cfdf2 fix fp16 abs error. Add cublas acc support. 2022-05-16 21:02:56 +08:00
Gword f946599e9e
fix bug: no recompiler #include"" 2022-05-15 21:36:49 +08:00
li-xl 9b7a184808 polish cross_entropy_loss 2022-05-13 20:04:50 +08:00
Dun Liang c113eabcca add th_mode 2022-05-10 18:04:35 +08:00
Dun Liang 88815e8dd3 add global var support for code op 2022-05-09 16:17:46 +08:00
cxjyxx_me 7aefa6b83d Merge branch 'master' of github.com:Jittor/jittor 2022-05-09 09:05:15 +08:00
cxjyxx_me f0198a2678 support multi gpu 2022-05-09 09:04:51 +08:00
Dun Liang 0d592abc4a add auto git tag 2022-05-08 19:48:51 +08:00
li-xl 59ec0c54f0 polish keep numpy array in multiple worker 2022-05-08 17:00:29 +08:00
Dun Liang f815a8cd95 polish migrate_all_to_cpu 2022-05-06 14:46:50 +08:00
Dun Liang 16bffae407 update version 2022-05-06 13:14:36 +08:00
Dun Liang 41fef77707 Merge branch 'master' of github.com:Jittor/jittor 2022-05-06 12:59:54 +08:00
Dun Liang 88bb84255f add node_order control execute order 2022-05-06 12:57:47 +08:00
zhouwy19 7a2b7f9182
Merge pull request #319 from Jittor/knn
add knn op
2022-05-03 15:09:13 +08:00
zhouwy19 5dcc1392e4 update test_knn 2022-05-03 15:08:44 +08:00
zhouwy19 228dbd9583 Merge branch 'master' of github.com:Jittor/jittor 2022-04-29 21:21:24 +08:00
zhouwy19 a59d5b7ce6 add knn op 2022-04-29 21:18:22 +08:00
Dun Liang eda20ee15e polish windows cuda bug 2022-04-28 22:30:00 +08:00
lzhengning 34740f9e52 fix: encountered nullptr in RNN 2022-04-27 15:55:17 +08:00
Dun Liang 0280f141e2 FIX_TORCH_ERROR by default 2022-04-26 14:27:05 +08:00
lzhengning a1322782ae improve compatibility with miniforge 2022-04-25 12:48:59 +08:00
Dun Liang 3e6fb4cad8 polish win log memptr 2022-04-24 14:25:14 +08:00
Dun Liang 9d899dcf23 polish reindex memory optimize 2022-04-24 13:57:36 +08:00
Dun Liang 0b13930ed3 add slice broadcast 2022-04-23 19:00:15 +08:00
Dun Liang ab30a15eae update version 2022-04-23 18:09:17 +08:00
Dun Liang 8f017cf57c polish fuser 2022-04-23 18:09:05 +08:00
Dun Liang cc6cb28d46 polish dataset use jittor with cuda 2022-04-23 17:32:12 +08:00
Dun Liang 5e559cf8fe remove cupy require for cumprod 2022-04-23 16:17:14 +08:00
Dun Liang 9e58fac49f polish windows install error 2022-04-22 21:40:58 +08:00
Dun Liang 3d4861265d polish fuser 2022-04-22 21:01:17 +08:00
lzhengning cac2b83a1c update copyright to 2022 2022-04-22 18:01:02 +08:00
lzhengning de8a193e9a add support jt.var / Var.var to compute variance.
Acknowledgement:
    Thanks fangtiancheng https://discuss.jittor.org/t/topic/193/3 for a demo implementation.
2022-04-22 17:49:51 +08:00
Letian 9b50f6370c Fix _upload
- Modify jkey permission
2022-04-22 17:33:04 +08:00
Dun Liang 8e703b83b1 Merge branch 'master' of github.com:Jittor/jittor 2022-04-22 15:04:53 +08:00
Dun Liang 0666456a2f v 1.3.3 memory optimization 2022-04-22 15:04:06 +08:00
Dun Liang a8880935a3 polish mem info 2022-04-22 13:51:06 +08:00
Dun Liang 0bb901ed9a polish mem info 2022-04-22 12:49:18 +08:00
Dun Liang 7c90e60424 add nvcc_path= check in install_cuda 2022-04-14 12:22:21 +08:00
Dun Liang 9048f3fd41 polish migrate to cpu 2022-04-07 16:42:22 +08:00
Dun Liang db88d73ed1 polish inplace size 2022-04-05 19:00:23 +08:00
Dun Liang 2ff5eba3a1 polish einsum(phy) 2022-04-05 17:19:14 +08:00
Dun Liang 385ab261e7 Merge branch 'einsum' of https://github.com/Exusial/jittor 2022-04-05 16:56:41 +08:00
Dun Liang 5a8b6c9710 Merge branch 'master' of https://github.com/Exusial/jittor 2022-04-05 16:55:22 +08:00
Dun Liang 2721e9fb55 polish torch issue 2022-04-05 16:55:01 +08:00
Dun Liang 2901e578dc add where(cond,x,y) alias 2022-04-05 12:42:51 +08:00
Dun Liang 9c74699707 optimize concat and split 2022-04-04 22:53:20 +08:00
Dun Liang 54fb38caed polish fp16 compute 2022-04-03 21:50:00 +08:00
Exusial 1aa77b9feb fix numpy. 2022-04-03 04:18:19 -04:00
Exusial fca1688bf1 Merge branch 'master' of github.com:Exusial/jittor 2022-04-02 22:56:16 -04:00
Exusial 1bd1ca768c add einsum and test. 2022-04-02 22:55:14 -04:00
Exusial 347c646634
Merge branch 'Jittor:master' into master 2022-04-03 10:53:30 +08:00
Dun Liang 18795bd02f polish gopt setitem concat 2022-04-02 23:02:58 +08:00
Exusial d615f3b22c
Merge branch 'Jittor:master' into master 2022-04-02 16:22:48 +08:00
Dun Liang da0dc2cfba add bug 42 test 2022-04-02 14:19:08 +08:00
Dun Liang 14f0e9be73 polish numpy code op 2022-04-02 14:15:22 +08:00
Dun Liang 47391bf07c polish logo readme 2022-04-02 12:13:23 +08:00
Dun Liang 0c66be5540 polish fp16 compute without if constexpr 2022-04-01 21:50:01 +08:00
Dun Liang 6ad04632ab add clean graph operation 2022-03-31 21:05:03 +08:00
Dun Liang cfb8f0c4a8 use -O2 for nan check 2022-03-31 20:28:29 +08:00
Dun Liang 3948cba176 add isnan and isinf check 2022-03-31 20:21:35 +08:00
Dun Liang baf9a91e0c add acl test for linear 2022-03-31 20:21:35 +08:00
yang guo ye 52127befec
Merge pull request #303 from Jittor/fix_bug
fix bug of   bool xxop bool
2022-03-31 04:02:41 -05:00
Dun Liang 22948ba07a polish transpose and matmul 2022-03-31 12:40:59 +08:00
cxjyxx_me 6858913536 Merge branch 'master' into fix_bug 2022-03-30 03:09:31 -04:00
cxjyxx_me 4e3f980c0c test bool reduce 2022-03-30 03:08:08 -04:00
cxjyxx_me 4ac5c66b67 bool reduce test 2022-03-30 03:05:36 -04:00
cxjyxx_me 9cdd36a3fa bool binary test 2022-03-30 03:05:30 -04:00
cxjyxx_me 16f9eba32e fix bool 2022-03-30 03:05:15 -04:00
Dun Liang b8c3c82c40 Merge branch 'master' of github.com:Jittor/jittor 2022-03-30 14:36:09 +08:00
Dun Liang 64e7ee9d63 polish bool reduce 2022-03-30 14:35:49 +08:00
Dun Liang 6644cf890b add seperate test 2022-03-30 13:50:14 +08:00
yang guo ye cbcfc57775
Merge pull request #302 from Jittor/fft
Fft
2022-03-30 00:30:12 -05:00
cxjyxx_me 45df1a4eee fft polish 2022-03-30 01:29:25 -04:00
cxjyxx_me 592ead713c fix bool xx bool 2022-03-29 09:45:14 -04:00
cxjyxx_me 65832ac10f clear cache 2022-03-28 09:41:46 -04:00
cxjyxx_me 0b5e367cb1 add cufft plan map 2022-03-28 09:36:10 -04:00
cxjyxx_me aee89141a8 fix bug of uint8.any_() 2022-03-28 07:53:25 -04:00
cxjyxx_me 254b8609fe add cufft wrapper 2022-03-28 07:45:33 -04:00
Dun Liang 2fe3ba696e force unverified ssl context 2022-03-28 14:51:50 +08:00
cxjyxx_me b642b8f1d1 Merge branch 'master' into fft 2022-03-26 23:30:35 -04:00
cxjyxx_me b04ad0ccb4 Merge branch 'master' into fft 2022-03-26 23:22:55 -04:00
cxjyxx_me fa62b3a217 support float64 fft 2022-03-26 23:20:16 -04:00
cxjyxx_me e087a56d86 update fft op test 2022-03-26 22:27:01 -04:00
Dun Liang 65e9ff1265 add amp alias 2022-03-26 15:51:47 +08:00
Dun Liang e13215bfd5 add module float64 2022-03-23 20:41:51 +08:00
Dun Liang 4b0c6a032b change rtol 2022-03-23 20:28:48 +08:00
Dun Liang fbc38c33ab prevent trace py var memleak 2022-03-23 17:29:27 +08:00
Dun Liang 0c1151305e update version 2022-03-22 22:49:47 +08:00
Dun Liang 5062b2d6e6 polish fused cpu and gpu op 2022-03-22 22:48:20 +08:00
Xiang-Li Li 1987728950
Merge pull request #300 from Jittor/jittor_home
add jittor_home
2022-03-22 21:12:39 +08:00
li-xl bd48b925dc add jittor_home 2022-03-22 20:58:39 +08:00
li-xl 107910ce07 add jittor_home 2022-03-22 20:53:05 +08:00
liuruiyang98 5625147244 Update trunc_normal_ with var = var.add(XX) and var = var.clamp(XX) 2022-03-22 20:48:41 +08:00
liuruiyang98 3dd187f34c Update var = var.erfinv() and var = var.multiply(XX) 2022-03-22 20:48:41 +08:00
liuruiyang98 4480351396 Update trunc_normal_ 2022-03-22 20:48:41 +08:00
li-xl 4d1e8f6fd8 add jittor_home 2022-03-22 20:42:49 +08:00
li-xl 90ecdc6729 add jittor_home 2022-03-22 20:38:59 +08:00
li-xl aa45703143 add jittor_home 2022-03-22 20:28:43 +08:00
lzhengning 10c51c6823 fix a compatible issue with onednn 2.5 2022-03-22 17:18:31 +08:00
lzhengning de2fceab22 Transitioning from Intel MKL-DNN to oneDNN 2022-03-22 17:10:45 +08:00
Dun Liang 7d9cff9e24 add cgan demo 2022-03-22 16:14:43 +08:00
Dun Liang 6f4e95e55f add simple_cgan demo 2022-03-22 16:14:43 +08:00
SilenceLi 3c8ed89d5e Update __init__.pyi 2022-03-22 15:56:10 +08:00
Dun Liang d4e8092c93 add test_modules 2022-03-22 14:38:50 +08:00
Dun Liang 31fbf4af9c add _modules and _parameters property 2022-03-22 14:37:44 +08:00
Dun Liang 99d6d6b6e1 Merge branch 'master' of github.com:Jittor/jittor 2022-03-21 17:49:24 +08:00
Dun Liang e7bb2545d3 add acl backend 2022-03-21 17:48:50 +08:00
lixl19 1fa89771c4 polish matmul 2022-03-18 16:21:34 +08:00
lixl19 ad57ec890f polish fp16 2022-03-18 12:56:24 +08:00
Exusial 3954ca172d
Merge pull request #10 from Jittor/master
update
2022-03-17 13:32:43 +08:00
Dun Liang 5efb222dd3 add half alias 2022-03-15 22:02:38 +08:00
Dun Liang 438c8ba692 Merge branch 'master' of github.com:Jittor/jittor 2022-03-15 21:18:08 +08:00
Dun Liang 39ecdd84fd add fp16 support 2022-03-15 17:45:39 +08:00
Dun Liang 15f162d4f3 cuda11.4 in windows disabled 2022-03-10 17:11:55 +08:00
Dun Liang 56af8b392f update version 2022-03-10 14:02:11 +08:00
Dun 70c462502a polish windows encoding support 2022-03-10 05:56:56 +08:00
Dun Liang 495d78ad20 fix windows encoding bugs 2022-03-09 18:23:42 +08:00
Dun Liang 53b377ee7d fix windows encoding error, thanks liduan for reporting this bug 2022-03-08 16:40:24 +08:00
Dun Liang 61ea95b76f polish windows notebook 2022-03-08 15:14:37 +08:00
Dun Liang f796ad0d7f fix some jupyter related bugs 2022-03-07 17:15:17 +08:00
Zheng-Ning Liu ca2ec40a59
fix: cumsum not available on CPU 2022-03-02 18:05:35 +08:00
Dun Liang 7cf6165a10 profiler polish 2022-02-26 16:14:52 +08:00
lzhengning 9c04a01143 fix: improve compatibility of Windows and python 10 2022-02-26 14:48:53 +08:00
Dun Liang c5acaa2474 polish example 2022-02-24 21:21:04 +08:00
Dun Liang 120057640c customize code op compilation flags 2022-02-24 21:08:29 +08:00
Xiang-Li Li 83e4f53af5
Update contrib.py
update concat docs
2022-02-24 18:18:16 +08:00
Xiang-Li Li 43d3a47964
Update __init__.pyi
polish docs of arg_reduce
2022-02-24 17:11:57 +08:00
Zheng-Ning Liu 0c8b2e3bb9
Update compiler.py
fix-bug: explicitly set log_v when querying cuda to avoid wrong output of jittor_utils.
2022-02-23 21:32:32 +08:00
Xiang-Li Li 4383886feb
Update misc.py
polish cub_cumsum to support 'range' item assignment
2022-02-22 22:34:14 +08:00
Dun Liang 7c7b856dcf fix op stat 2022-02-20 21:27:31 +08:00
Dun Liang 58e8c650eb rename common_op_type 2022-02-20 21:15:56 +08:00
Dun Liang 2dd70a7002 check crlf 2022-02-20 21:06:09 +08:00
Dun Liang bcb57086c3 fp16 support, opbytype interface 2022-02-20 20:34:49 +08:00
Dun Liang 81b847e6f0 optimize softmax 2022-02-18 16:45:08 +08:00
lzhengning 8c0f66c638 polish nvcc_flags 2022-02-18 14:55:00 +08:00
yang guo ye e9aca0444c
Update nn.py
fix typo
2022-01-30 11:59:38 +08:00
cxjyxx_me e9f681de53 fft 2022-01-25 19:32:06 +08:00
Dun Liang 5d191e6247 add nvcc_path check 2022-01-18 16:32:23 +08:00
Dun Liang 6a372f5f4f polish reindex reduce fuse op 2022-01-12 16:43:39 +08:00
Dun Liang 70ccdb1f17 polish loop var pass 2022-01-11 21:03:37 +08:00
Dun Liang 6d1b5e42bc polish fuser 2022-01-11 17:25:00 +08:00
Dun Liang 5b4576c4dd polish tensorcore of cublas in cuda 10 2022-01-10 14:35:30 +08:00
Dun Liang f36693c797 add bool setitem alias and vector_to_tuple 2022-01-10 12:36:28 +08:00
Dun Liang 0c35e1a29b update version 2022-01-04 17:30:37 +08:00
Dun Liang 8f6928315e Merge branch 'master' of github.com:Jittor/jittor 2022-01-04 17:30:14 +08:00
Dun Liang 85ba4898cf add function version conv transpose 2022-01-04 17:29:58 +08:00
tupig-7 5fd65c72cb fix one bug in win10 jittor\src\misc\cpu_math.cc(47): error C2065: 'M_PI': undeclared identifier 2022-01-04 16:10:29 +08:00
liuruiyang98 ef86da29f1 Add init.trunc_normal_ 2022-01-04 16:09:57 +08:00
lidongyang 5c1164233e polish make_grid 2022-01-04 16:09:28 +08:00
Dun Liang f20fa98e44 add use_tensorcore flags 2021-12-30 20:59:48 +08:00
Dun Liang 7b9113f828 support tensorcore 2021-12-30 16:28:21 +08:00
li-xl 928f7ae5be Merge branch 'master' of github.com:jittor/jittor 2021-12-30 11:05:23 +08:00
Dun Liang d6b8349203 polish reindex fuse 2021-12-29 17:50:20 +08:00
Dun Liang 3a31e32a1a add disable lock flags 2021-12-28 12:59:43 +08:00
Dun Liang 8c44329def update version 2021-12-27 13:40:54 +08:00
Dun Liang 0d371ebbe4 add __setattr__ and __getattr__ 2021-12-27 13:40:19 +08:00
cxjyxx_me e19c9aef7a adapt ptrace missed 2021-12-27 10:52:55 +08:00
Dun Liang ac66897047 polish conv transpose group 2021-12-17 17:54:41 +08:00
Dun Liang 922c0d8246 remove some tests 2021-12-17 10:30:14 +08:00
Dun Liang 1f2b100be0 add mpi in py example 2021-12-16 17:37:16 +08:00
Dun Liang c82f5146c9 catch more out of memory error 2021-12-16 16:28:39 +08:00
Dun Liang 9013c70ddf update where document 2021-12-16 16:20:00 +08:00
Dun Liang bc083360e7 test array reindex fuse 2021-12-15 15:38:49 +08:00
Zheng-Ning Liu 03dd698e65
fix-doc: concat example 2021-12-15 10:20:58 +08:00
Dun Liang b913cd8254 polish atan2 2021-12-09 14:41:41 +08:00
Dun Liang 6bc17cb99c polish interpolate 2021-12-06 12:21:18 +08:00
lzhengning eae0357224 doc: fix the wrong example of arg_reduce_op 2021-11-25 11:03:55 +08:00
Dun Liang 9af14f4f55 polish rnn grad 2021-11-22 13:29:04 +08:00
Dun Liang 1e1fe66a30 update version 2021-11-22 13:17:31 +08:00
Dun Liang 5cebf93e31 polish rnn grad 2021-11-22 13:16:48 +08:00
lzhengning 38694a1b6e fix: wrong grad of varslice like [::-1] 2021-11-21 16:50:45 +08:00
Exusial eca21d5bba
Merge pull request #9 from Jittor/master
update
2021-11-21 11:18:45 +08:00
Zheng-Ning Liu 4bd0c4a2f6
fix: to_tensor applied on a jt.Var
Allow jt.transform.to_tensor applied on a jt.Var (do nothing but return the inputs).
2021-11-09 23:00:57 +08:00
Dun Liang b3ee4a54e4 polish cu key 2021-11-04 20:51:58 +08:00
Dun Liang 0274c22afd polish cu key 2021-11-04 20:51:31 +08:00
Dun Liang 5695a2e61f polish windows and erfinv 2021-11-04 20:43:54 +08:00
li-xl d08afa840d Merge branch 'master' of github.com:jittor/jittor 2021-11-02 17:45:34 +08:00
li-xl 40ed259665 support tensorcore mode 2021-11-02 17:45:08 +08:00
Dun Liang a1a82f6b2c update version 2021-11-01 12:20:45 +08:00
Dun Liang 0a2ca63cbc rename wrapper 2021-11-01 12:01:01 +08:00
Dun Liang 08cc8a0451 fix typo 2021-11-01 11:42:38 +08:00
Dun Liang 47c2f65749 polish data migrate 2021-10-28 19:59:54 +08:00
Dun Liang b488cc7c61 add erfinv 2021-10-28 17:23:45 +08:00
Dun Liang 26db9872c4 polish error msg 2021-10-28 14:56:41 +08:00
lzhengning 5257284740 update version 2021-10-28 13:44:44 +08:00
lzhengning 205bd011e7 polish embedding 2021-10-28 13:44:08 +08:00
lzhengning a778458831 update version 2021-10-26 20:20:28 +08:00
lzhengning 3147634775 polish 2021-10-26 20:14:17 +08:00
lzhengning ea497ce80a polish gen_pyi 2021-10-26 20:14:17 +08:00
Dun Liang db3b20d886 polish gen pyi 2021-10-26 20:14:17 +08:00
lzhengning b5331c9026 gen pyi files 2021-10-26 20:14:17 +08:00
Dun Liang 6c980c2146 add performance test tutorial 2021-10-25 16:12:27 +08:00
Dun Liang 856f201d9f polish init doc 2021-10-22 16:11:05 +08:00
Dun Liang 60f45b6adf polish init document 2021-10-22 16:05:07 +08:00
Dun Liang 10e28e2151 update res2net50 url 2021-10-19 20:01:20 +08:00
Dun Liang 537487695a polish download error msg 2021-10-19 17:48:14 +08:00
Dun Liang 849fb796ce polish no device found error 2021-10-19 13:10:04 +08:00
Dun Liang e24dd4f14a transpose fit pt 2021-10-18 16:02:02 +08:00
Dun Liang 34a9dab5fb polish error msg 2021-10-18 15:37:02 +08:00
Dun Liang 6c64a20493 update version 2021-10-17 15:58:36 +08:00
Dun Liang 57b4973ac1 polish error msg 2021-10-17 15:58:36 +08:00
Dun Liang b077d6c185 polish cuda install 2021-10-17 15:56:44 +08:00
Dun Liang 4a47390964 add cuda 11.0 11.2 pack 2021-10-17 15:09:58 +08:00
Dun Liang f65b5df1a6 polish msvc install 2021-10-17 14:26:11 +08:00
Dun Liang 878cb365ac add dependency for mpi ops 2021-10-15 16:22:56 +08:00
Dun Liang f9f02df3d6 polish reduce doc and pass_manager 2021-10-15 12:20:50 +08:00
Zheng-Ning Liu e6e7423631
Merge pull request #272 from Jittor/meminfo
polish meminfo
2021-10-14 16:51:31 +08:00
li-xl 2b16f80289 polish meminfo 2021-10-14 16:43:28 +08:00
Dun Liang eee405669f fix issue #271 2021-10-14 16:36:04 +08:00
Dun Liang 7cfd216372 update version 2021-10-13 20:16:32 +08:00
Dun Liang b40975d4df polish clean cache 2021-10-13 20:15:54 +08:00
Dun Liang c05193408a cutt for seperate cache 2021-10-13 20:15:17 +08:00
lzhengning 45481adb3b mac cpu check 2021-10-13 19:54:54 +08:00
Dun Liang 102ffa31a5 better cache key 2021-10-13 19:17:19 +08:00
zhouwy19 5683d338e8 update awesome 2021-10-12 16:52:49 +08:00
zhouwy19 4663589aea update readme 2021-10-12 16:49:03 +08:00
zhouwy19 30adcf41f5 add awesome jittor list 2021-10-12 16:42:01 +08:00
Dun Liang 030b30727d add pip_publish 2021-10-12 14:29:13 +08:00
Dun Liang e0d046f449 polish install from source 2021-10-12 14:29:13 +08:00
lzhengning ca2b4f20d0 polish doc for conv, rnn, and activation 2021-10-11 11:54:02 +08:00
Dun Liang 9e93a671c7 add clean cache script 2021-10-11 11:13:43 +08:00
Dun Liang aba912eb22 polish error message 2021-10-09 16:46:53 +08:00
Dun Liang 2951214deb polish getitem win support 2021-10-09 16:07:48 +08:00
Dun Liang b02b3192bc polish space with path 2021-10-09 09:36:06 +08:00
Dun Liang 66d9df2f82 polish win path order 2021-10-05 19:59:30 +08:00
Dun Liang ad015f13f5 polish win path handle 2021-10-05 19:18:35 +08:00
Dun Liang 203d2f4de8 polish readme 2021-10-03 23:33:50 +08:00
Dun Liang 2bf1b89c5e polish mkl and cutt install 2021-10-03 22:42:18 +08:00
Dun Liang 7bc620f274 add cuda arch recommend 2021-10-01 20:31:57 +08:00
Dun Liang 6300b0908f lower arch support for 30 2021-10-01 19:56:47 +08:00
Dun Liang 0fa0584fd3 polish files and add cuda arch hint 2021-10-01 19:41:04 +08:00
Dun Liang a78c3b4f12 Merge branch 'master' of github.com:Jittor/jittor into win_cuda 2021-10-01 19:16:25 +08:00
Dun Liang 2866c64876 update version 2021-10-01 19:15:33 +08:00
Dun Liang 9914e0918b polish msvc crt shared link with -MD 2021-10-01 19:15:27 +08:00
lzhengning 03b892d473 update logo 2021-09-30 17:50:49 +08:00
lzhengning 1be676caad Merge branch 'master' of https://github.com/Jittor/jittor 2021-09-30 17:47:57 +08:00
Dun Liang 1ccf45a03d update version 2021-09-30 17:42:58 +08:00
Dun Liang 11bb7bc884 check python version 2021-09-30 17:39:29 +08:00
Dun Liang 121c633aa1 add python windows install recommend and ssl polish 2021-09-30 16:46:40 +08:00
lzhengning e57f0e4879 updated docs 2021-09-30 16:15:32 +08:00
Zheng-Ning Liu 6ab86a1c72
Merge pull request #270 from Jittor/cudnn_rnn
Polish RNN, GRU, and LSTM
2021-09-30 15:36:35 +08:00
Dun Liang 752a5c62ed polish readme and add test 2021-09-30 15:33:21 +08:00
Dun Liang 2e6c6079cb Merge branch 'win_cuda' of github.com:Jittor/jittor into win_cuda 2021-09-30 15:20:40 +08:00
Dun Liang f43edfb665 polish readme 2021-09-30 15:20:28 +08:00
Dun Liang 28985cfea6 polish event queue 2021-09-30 14:41:40 +08:00
Dun Liang 0136d24d83 Merge branch 'win_cuda' of github.com:Jittor/jittor into win_cuda 2021-09-30 14:11:54 +08:00
lzhengning 6938c25e19 polish mac support for win_cuda 2021-09-30 13:42:51 +08:00
Dun Liang cfbadbbfb0 polish windows 2021-09-29 20:03:33 +08:00
lzhengning 9ec300f2aa Polish RNN, GRU, and LSTM
1. Use Cudnn to speed rnn
2. Fix: document string not correctly rendered
3. Fix: RNN cannot accept relu nonlinearity
4. feat: add default hidden state when executing rnn/gru/lstm
2021-09-28 17:54:56 +08:00
Dun Liang 9293045993 Merge branch 'master' of github.com:Jittor/jittor into win_cuda 2021-09-28 17:23:11 +08:00
Dun Liang 5c2fb6bfce update version 2021-09-28 17:17:20 +08:00
Dun Liang 740b9c8552 polish win_cuda in linux 2021-09-28 17:13:03 +08:00
cxjyxx_me 9ab20a5536 Merge branch 'master' of github.com:Jittor/jittor 2021-09-27 23:21:03 -04:00
cxjyxx_me 36faa8b617 polish shape=1 array fuse 2021-09-27 23:20:20 -04:00
cxjyxx_me 1f51e985d0 polish bool add/substract 2021-09-27 23:19:44 -04:00
Dun Liang ec80112709 polish link problem 2021-09-27 12:12:08 +08:00
Dun Liang c1ee6d9ed3 polish win_cuda on linux 2021-09-26 19:48:22 +08:00
Dun Liang e77f1ea7cb add nvcc exe suffix 2021-09-26 15:57:08 +08:00
Dun Liang 123e915bb3 support cuda win 2021-09-26 15:35:18 +08:00
Xiang-Li Li 4e7dbd327f
Update __init__.py
update version for nccl & mpi uint8
2021-09-25 20:53:40 +08:00
Xiang-Li Li 6eb1c77474
Update mpi_broadcast_op.cc
support uint8 in mpi
2021-09-25 20:49:29 +08:00
Xiang-Li Li beebddafd5
Update nccl_broadcast_op.cc
support uint8 in nccl
2021-09-25 20:48:18 +08:00
cxjyxx_me 764c8425af polish named_parameters 2021-09-22 22:27:13 -04:00
Exusial ac0648190a
Merge pull request #7 from Jittor/master
update
2021-09-16 16:58:30 +08:00
Dun Liang d85af13024 polish initialize order 2021-09-16 13:51:28 +08:00
Dun Liang 40cb853e21 update version 103 for windows 2021-09-15 17:48:59 +08:00
Dun Liang 500dfc6ee3 Merge branch 'master' of github.com:Jittor/jittor 2021-09-15 17:45:37 +08:00
Dun Liang c3938e14bf msvc support 2021-09-15 17:34:50 +08:00
Exusial 93ea100b7a
Merge pull request #4 from Jittor/master
update
2021-09-15 16:02:35 +08:00
Dun Liang 4fb462e1b9 polish sub module save load 2021-09-13 11:03:28 +08:00
Dun Liang 42dfaaed2e polish transform 2021-09-09 11:32:16 +08:00
Dun Liang f807a28e6b add ctcloss 2021-09-08 17:45:17 +08:00
Dun Liang 057dd95658 fix typo 2021-09-08 13:46:51 +08:00
Dun Liang 36f09282fd optimize repeat 2021-09-08 10:38:20 +08:00
lzhengning 40ce5e85cd polish repeat: support higher dimension 2021-09-07 20:15:33 +08:00
Dun Liang 69325deb45 add emnist dataset and mac polish 2021-09-04 14:37:49 +08:00
Dun Liang 4e712f283f optimize cumsum with unroll 2021-09-02 17:22:46 +08:00
cxjyxx_me 546860e19e fix cumprod 2021-09-02 04:10:17 -04:00
cxjyxx_me 8a2e7a1881 fix 2021-09-02 04:09:29 -04:00
cxjyxx_me 010d306163 fix 2021-09-02 04:05:38 -04:00
cxjyxx_me 393b83e22c test cub cumsum 2021-09-02 04:02:40 -04:00
cxjyxx_me 5b830dcf66 cub cumsum 2021-09-02 04:02:31 -04:00
cxjyxx_me c53d473d2a fix cumsum 2021-09-02 04:02:22 -04:00
Jittor ef83872459
Update res2net.py 2021-09-02 13:24:01 +08:00
Jittor c90954a325
Update res2net.py 2021-09-02 12:25:32 +08:00
Dun Liang 655362efd7 better memory profiler 2021-09-02 11:20:25 +08:00
Jittor 8dd2520adc
Update res2net.py 2021-09-02 11:17:40 +08:00
Dun Liang aa749dd0e0 polish exit cleanup 2021-08-31 11:42:12 +08:00
Exusial 91f9ef9feb
Merge pull request #2 from Jittor/master
update
2021-08-28 12:27:35 +08:00
cxjyxx_me 7349284663 Merge branch 'master' of github.com:Jittor/jittor 2021-08-26 22:28:06 -04:00
cxjyxx_me b9f97430c6 set different seed for different process&worker 2021-08-26 22:27:50 -04:00
Exusial 0f6b05f579
Merge pull request #1 from Jittor/master
update
2021-08-26 12:38:01 +08:00
Dun Liang 72ae3d669b polish doc 2021-08-25 12:25:31 +08:00
Dun Liang 810781eae7 doc polish and normalize polish 2021-08-25 12:11:06 +08:00
cxjyxx_me 76090168d0 Merge branch 'master' of github.com:Jittor/jittor 2021-08-24 10:59:41 -04:00
cxjyxx_me 46955965d9 set_global_seed for worker 2021-08-24 10:58:09 -04:00
Dun Liang 1f16af33a2 polish mpi dataset shuffle list 2021-08-24 17:22:39 +08:00
Dun Liang 4e38190483 Merge branch 'master' of https://github.com/Jittor/jittor 2021-08-17 13:35:18 +08:00
Dun Liang 4fd355202c add naive windows support 2021-08-15 19:58:18 +08:00
Dun Liang 4217359f86 add fuseable transpose op 2021-08-12 20:56:03 +08:00
Dun Liang 1c2668cdb8 add float64 atomic 2021-08-11 19:16:15 +08:00
Dun Liang 692cbddb8e polish world rank and world size 2021-08-05 20:09:25 +08:00
Dun Liang 39165cccfb polish bmm 2021-08-04 19:36:02 +08:00
Dun Liang 86a3feeaab polish pool interface 2021-08-04 19:28:54 +08:00
li-xl 317e07907f polish transpose 2021-08-04 16:49:45 +08:00
li-xl aab4bda835 polish dataset 2021-08-04 16:49:45 +08:00
Dun Liang 1f0ea3b796 add mac mem_info support 2021-07-31 14:33:41 +08:00
lzhengning 71f862b898 add mac meminfo 2021-07-31 14:25:42 +08:00
Dun Liang 11c2812b5c prune pyc 2021-07-31 14:22:39 +08:00
Dun Liang b7fff3072a Merge branch 'master' of https://github.com/Jittor/jittor 2021-07-30 20:40:31 +08:00
Dun Liang d482161be0 floor keep type and better mem_info 2021-07-30 20:40:21 +08:00
Zheng-Ning Liu 43b150a6c8
Update auto_diff.py
fixed: symbol not found when running "from jittor_utils import auto_diff; import jittor as jt"
2021-07-30 15:52:52 +08:00
li-xl 4d9d2d8601 Merge branch 'master' of github.com:jittor/jittor 2021-07-29 21:47:51 +08:00
li-xl abccb16248 polish xavier_uniform 2021-07-29 21:47:41 +08:00
Dun Liang 08eeb67de5 fix g++-10 line error 2021-07-29 20:02:14 +08:00
Dun Liang d563826b2b add cuda aarch64 support 2021-07-28 11:41:13 +08:00
Dun Liang 0748fc1854 polish cutt transpose 2021-07-27 11:13:39 +08:00
Dun Liang 01974db52d add exp lr and remove warning as error 2021-07-26 21:21:20 +08:00
Dun Liang ee020b60f7 data gz and register_hook and requires_grads_ 2021-07-26 20:58:15 +08:00
Dun Liang 0cf77ea11c add bilinear module 2021-07-23 23:31:21 +08:00
Dun Liang 7052251098 polish slice ellipsis_with_none 2021-07-23 14:38:45 +08:00
Dun Liang 51a28574c8 add jt_sync env 2021-07-23 14:17:29 +08:00
Dun Liang 31c56a1017 add var_dataset 2021-07-23 13:36:58 +08:00
Dun Liang b0a8943404 add TensorDataset 2021-07-23 13:30:16 +08:00
Dun Liang 87e639730c update version 2021-07-22 22:15:53 +08:00
Dun Liang e3181e706f fix console error 2021-07-22 22:14:51 +08:00
Dun Liang 399060e08c ssl polish 2021-07-21 17:08:25 +08:00
Dun Liang 08fcf01d62 add sw_64 support 2021-07-21 17:04:48 +08:00
Dun Liang c82db520b1 Merge branch 'master' of https://github.com/Jittor/jittor 2021-07-20 21:17:33 +08:00
Dun Liang 69979f71e4 polish setitem optimize 2021-07-20 21:14:55 +08:00
cxjyxx_me d89f1419f9 Merge branch 'master' of github.com:Jittor/jittor 2021-07-20 14:16:49 +08:00
cxjyxx_me b65286072e add TODO 2021-07-20 14:16:22 +08:00
cxjyxx_me 4e80c2118d adamw 2021-07-20 14:16:11 +08:00
Dun Liang e1472a7a8f polish clip_grad_norm update queue leak 2021-07-19 16:40:02 +08:00
Dun Liang cbc5a98bcf Merge branch 'setitem_inplace' of https://github.com/Jittor/jittor 2021-07-19 16:30:54 +08:00
Dun Liang cf6299009e polish auto diff with backward name 2021-07-19 16:30:05 +08:00
zhouwy19 b5e2780ba7 polish setitem data inplace 2021-07-19 16:27:20 +08:00
Dun Liang 62bbdcd7d9 Merge branch 'auto_diff' of https://github.com/lzhengning/jittor 2021-07-19 11:22:00 +08:00
Dun Liang 150604a1a2 polish mod float negative 2021-07-18 20:14:41 +08:00
Dun Liang 9bf604934e gopt_disable && JT_macro fix && mod grad 2021-07-18 17:18:45 +08:00
Dun Liang 242084349f add more test for setitem 2021-07-17 19:55:53 +08:00
Dun Liang b4cb572c90 polish setitem inplace opt 2021-07-17 19:48:20 +08:00
Dun Liang 9c03fd4f75 polish conv3d bias 2021-07-17 16:57:18 +08:00
lzhengning b61065e8b3 Polish auto_diff.py
- Use the layer name as file name to be cached.
- Now it can compare input and output of layers with the same name, even if the networks are not identical.
- Add a convenient way to provide identical input data (save_input, load_input).
2021-07-17 13:58:31 +08:00
Zheng-Ning Liu e371a9d887
Merge pull request #240 from lzhengning/doc
doc: set seed
2021-07-17 13:34:06 +08:00
Dun Liang ee5900bd90 add functional normalize 2021-07-16 18:18:02 +08:00
Dun Liang db9d2a808a fix notebook 2021-07-16 15:47:58 +08:00
Dun Liang 6d4437043f polish cross_entropy_loss 2021-07-15 15:05:07 +08:00
Dun Liang 5a73db73e1 naive opt state_dict 2021-07-14 21:51:52 +08:00
Dun Liang 7ff46370e3 add learn jittor in 60 min 2021-07-14 21:51:52 +08:00
uyzhang 304d5034fd add reduction parameter to cross_entropy_loss 2021-07-14 21:42:21 +08:00
zhouwy19 9491b68031 add StepLR 2021-07-13 15:53:24 +08:00
Dun Liang 39981b896e polish prelu param 2021-07-09 15:27:44 +08:00
cxjyxx_me e7d8d8e2fd Merge branch 'master' of github.com:Jittor/jittor 2021-07-06 15:31:26 +08:00
cxjyxx_me 82eca30ee6 tf_resize 2021-07-06 15:31:20 +08:00
Dun Liang 2db62c17a6 polish pool interface 2021-07-05 21:20:10 +08:00
Dun Liang 9f58355341 add no_fuse and polish dataset 2021-07-05 21:17:57 +08:00
Dun Liang 8cab7d2a3e support different stride and kernel for unpool 2021-07-05 17:11:43 +08:00
Dun Liang f00ffa3861 add unpool doc 2021-07-05 10:58:01 +08:00
Dun Liang 95d85f29ef add backward hook 2021-07-04 17:35:09 +08:00
lzhengning 5b8a6dba0d doc: add loss3d to documents 2021-07-02 22:20:23 +08:00
Dun Liang cf6a73c6a5 fix typo 2021-07-02 15:50:14 +08:00
Dun Liang 93940ebb31 add expand -1 interface 2021-07-02 15:42:11 +08:00
Dun Liang b4d6f9880d add lock for dl_open 2021-07-02 14:47:18 +08:00
lzhengning 3a5a78544f doc: set seed 2021-07-02 12:09:17 +08:00
Dun Liang 7a253e4a4a polish ddp with multiple workers 2021-07-01 21:16:45 +08:00
Dun Liang 085074b625 allow dict convert in dataset.to_jittor 2021-07-01 20:41:12 +08:00
Dun Liang 5d4912b6df ddp polish with endless dataset 2021-07-01 18:09:09 +08:00
Dun Liang 4828cdc896 add endless dataset 2021-07-01 17:42:46 +08:00
Dun Liang 8fcdb63236 fix resnet test 2021-06-30 17:38:22 +08:00
lzhengning 7df4142542 fix: apple-a14 not supported by clang 12.0.0 2021-06-30 16:11:06 +08:00
lzhengning c23d11354e update version 2021-06-30 15:31:49 +08:00
lzhengning 52246efbcc loss3d: chamfer & emd 2021-06-30 15:22:43 +08:00
yang guo ye ec62338d45
Update README.md 2021-06-30 10:07:16 +08:00
Dun Liang fe11a83155 resize nearest polish 2021-06-25 23:44:50 +08:00
Dun Liang 7cd5f93f19 show jit utils error 2021-06-25 23:44:50 +08:00
li-xl ff7bb3c58d Merge branch 'master' of github.com:Jittor/jittor 2021-06-25 16:27:18 +08:00
li-xl 4285d5d61d polish concat 2021-06-25 16:26:48 +08:00
Jittor 7f4a098e8a Update issue templates 2021-06-25 15:39:24 +08:00
Dun Liang aaf97d5f58 add ce loss test 2021-06-25 14:36:33 +08:00
li-xl ef66d6d832 update Function doc 2021-06-25 14:01:10 +08:00
Dun Liang 8908e9d9ad fix os key 2021-06-25 11:27:02 +08:00
Dun Liang 1246c37692 fix ce 2021-06-24 21:39:43 +08:00
Dun Liang 385d60a147 update version 2021-06-24 17:17:43 +08:00
Zheng-Ning Liu 34b68c8754
Merge pull request #233 from lzhengning/master
cross entropy fix bug & new feature
2021-06-24 17:03:13 +08:00
lzhengning 4e1ee052f7 cross entropy fix & feat
fix: wrong cross entropy when using ignore_index
feat: support weight of labels
2021-06-24 16:50:39 +08:00
Dun Liang 575cbf4612 remove todo index 2021-06-24 11:21:56 +08:00
Dun Liang acd55b4f1b add attention doc 2021-06-24 11:18:31 +08:00
Dun Liang 8aa478fa5e add fake Parameter and backward interface 2021-06-23 21:45:11 +08:00
Dun Liang 1d0df10f13 pytorch make jittor segfault, wtf 2021-06-23 20:59:03 +08:00
Dun Liang 69550e3efd fix cop hash 2021-06-23 20:35:58 +08:00
Dun Liang 9c030a1329 fix nccl compile flags 2021-06-23 16:20:23 +08:00
Dun Liang 36af798634 add distributions doc 2021-06-22 21:40:12 +08:00
Dun Liang ca9b94f590 param list test polish 2021-06-22 21:35:56 +08:00
Dun Liang 698fc6fe88 add parameter list and dict 2021-06-22 21:25:38 +08:00
Dun Liang 1acf6492f4 DISABLE_MULTIPROCESSING flags 2021-06-22 17:01:50 +08:00
Dun Liang 66c3ab8c5a fix md5 2021-06-22 15:13:16 +08:00
Dun Liang 6906df6d18 Merge branch 'master' of https://github.com/Jittor/jittor 2021-06-22 15:08:32 +08:00
Dun Liang ab3ea18742 update md5 2021-06-22 15:02:44 +08:00
Zheng-Ning Liu ae2eafd878
Merge pull request #226 from lzhengning/macOS
Mac os support
2021-06-21 21:05:11 +08:00
lzhengning aacc387c49 Merge branch 'master' into macOS 2021-06-21 21:03:09 +08:00
lzhengning 4f793d11ad Merge remote-tracking branch 'upstream/master' 2021-06-21 21:02:53 +08:00
Dun Liang 8f495ffdc0 fix readme link 2021-06-21 17:38:23 +08:00
lzhengning 1e9b04c828 Merge branch 'master' into macOS 2021-06-21 12:03:53 +08:00
lzhengning 746794064a Merge remote-tracking branch 'upstream/master' 2021-06-21 12:01:09 +08:00
Dun Liang 3f169e5a66 resnet tune batch size 2021-06-20 16:53:10 +08:00
Dun Liang 28308c3b5c support aarch64 uos 2021-06-20 15:03:28 +08:00
lzhengning 5fd890afa2 avoid unknown cc 2021-06-18 16:58:30 +08:00
lzhengning 28ab13d804 update test_trace_var 2021-06-18 16:25:27 +08:00
lzhengning dcbf8b76f2 update test_searchsorted_op 2021-06-18 16:09:51 +08:00
lzhengning 556f0cfb1e avoid import error when no torch installed 2021-06-18 15:56:13 +08:00
lzhengning df0ea12d7e Merge branch 'master' into macOS 2021-06-18 10:30:57 +08:00
lzhengning d4b12d443d Merge remote-tracking branch 'upstream/master' 2021-06-18 10:30:36 +08:00
lzhengning c9eadd9b2b Merge branch 'master' of github.com:lzhengning/jittor 2021-06-18 10:30:02 +08:00
lzhengning bde5be81d0 update 2021-06-17 22:16:41 +08:00
lzhengning aea2fdfe9e update 2021-06-17 22:15:44 +08:00
lzhengning a866cc2387 update int64-type w.r.t OS 2021-06-17 22:13:39 +08:00
lzhengning e095821843 Merge branch 'macOS' of github.com:lzhengning/jittor into macOS 2021-06-17 22:06:01 +08:00
lzhengning c75c856d2f updated os specific codes 2021-06-17 22:04:24 +08:00
Dun Liang 4ec2bfacb2 polish distributions 2021-06-17 21:45:48 +08:00
Dun Liang 9c0f3cfdf4 polish cifar 2021-06-16 20:22:12 +08:00
Dun Liang e4089ecc4a polish cifar 2021-06-16 20:14:27 +08:00
Dun Liang e160a83a7e add cifar document 2021-06-16 17:53:24 +08:00
Dun Liang 16af6e7f8e add cifar dataset 2021-06-16 17:49:52 +08:00
Dun Liang a7ced77f69 polish to_tensor 2021-06-14 15:15:34 +08:00
lzhengning 533238cd5d polish 2021-06-11 21:11:05 +08:00
lzhengning 9edb1890b6 updated ld path 2021-06-11 21:05:42 +08:00
lzhengning 8bf235bb85 polish 2021-06-11 20:52:42 +08:00
Jittor ebb07a1efa support M1 chip 2021-06-11 20:27:16 +08:00
Dun Liang 80000c6941 add jt_check_nan flag 2021-06-11 17:32:31 +08:00
Jittor 815c3ed005 Merge branch 'master' into macOS 2021-06-11 14:38:55 +08:00
Jittor ee570e056c ignore annoying .DS_Store on macOS 2021-06-11 14:36:33 +08:00
lzhengning ed7f0d6c9d pretend clang to be g++ 2021-06-11 14:27:59 +08:00
Dun Liang 826513a156 add safe_clip and safe_log 2021-06-11 14:11:58 +08:00
lzhengning 72c22c1216 change default cc on mac to clang 2021-06-11 14:03:53 +08:00
lzhengning 7702715944 update version 2021-06-11 10:24:50 +08:00
lzhengning 6b6fefb8e2 Merge branch 'master' into macOS 2021-06-10 21:42:35 +08:00
lzhengning 577f407288 Merge branch 'macOS' of github.com:lzhengning/jittor into macOS 2021-06-10 21:41:39 +08:00
lzhengning 42da3afd36 improve Linux compatibility 2021-06-10 21:40:27 +08:00
lzhengning a750d1dcea improve Linux compatibility 2021-06-10 17:33:57 +08:00
Dun Liang fd798d9925 conv3d optimization 2021-06-10 15:27:12 +08:00
Dun Liang b6fe53e984 optimize conv3d 2021-06-10 15:11:03 +08:00
lzhengning 582bdfe52b updated polish 2021-06-09 21:57:03 +08:00
lzhengning 3fdda2f91d passed most test cases 2021-06-09 21:01:32 +08:00
lzhengning 5935f77fbb Merge branch 'master' into macOS 2021-06-09 20:40:43 +08:00
Dun Liang 77b293b6b8 bug fix 2021-06-09 15:52:44 +08:00
Dun Liang dc8415f61a Merge branch 'add_transforms' of https://github.com/yaox12/jittor 2021-06-08 20:33:54 +08:00
Dun Liang b14a60ad74 add roll 2021-06-05 23:05:01 +08:00
Dun Liang a07eb6bc12 add nvcc search path 2021-06-05 11:46:07 +08:00
Dun Liang fd7d68e6aa add /usr/lib as cuda lib search path 2021-06-05 10:59:06 +08:00
Dun Liang 2c2f5b156d python trace data 2021-06-04 22:39:29 +08:00
Dun Liang 1931f2fb41 update cuda md5 2021-06-04 14:25:51 +08:00
Dun Liang 88483fedbc add layernorm3d 2021-06-04 14:07:07 +08:00
Dun Liang 23eb540b7d support pool3d 2021-06-04 14:03:10 +08:00
Dun Liang 599963527a Merge branch 'master' of https://github.com/Jittor/jittor 2021-06-04 14:01:23 +08:00
Dun Liang 6eafa4dd66 Merge branch 'master' of https://github.com/jwzxgy2007/jittor 2021-06-04 14:01:11 +08:00
lzhengning 23106e0606 Merge branch 'master' into macOS 2021-06-04 13:26:29 +08:00
lzhengning cf171bf577 support macOS 2021-06-04 13:26:24 +08:00
Zheng-Ning Liu e394a54026
Merge pull request #224 from Jittor/fix-array_op
fix: array_op changes input
2021-06-03 20:59:55 +08:00
lzhengning 36eba44602 fix: array_op changes input 2021-06-03 20:53:49 +08:00
Dun Liang 1a2c9e724d fix compile error 2021-05-31 14:38:52 +08:00
Dun Liang 7981efd3f1 auto cuda install boost 2021-05-29 21:35:09 +08:00
Dun Liang a6830de216 polish array env loading 2021-05-29 20:20:52 +08:00
Dun Liang 07eb4a7a0e auto cuda downloader 2021-05-28 15:46:05 +08:00
Naiyang Lin 3ac160cb9a
Add fallback to find nvcc in PATH (#222) 2021-05-28 11:32:43 +08:00
Dun Liang b721afeac3 support package installed cuda 2021-05-27 19:36:17 +08:00
Siwei Chen c4a937cd32
fix typos in Chinese README (#221)
* fix typo in README.src.md

jupyterr -> jupyter

* fix typo in README.cn.md

jupyterr -> jupyter
2021-05-27 14:17:43 +08:00
Dun Liang b011bc6380 fix manifest 2021-05-24 13:46:50 +08:00
Dun Liang 558400e527 os_key environment 2021-05-23 21:31:05 +08:00
Dun Liang 6d69be7943 move files 2021-05-16 16:08:31 +08:00
Dun Liang 01b9608f27 remove soft link 2021-05-16 16:04:42 +08:00
Dun Liang c6404e29f3 update os install 2021-05-15 15:59:58 +08:00
Dun Liang 8e4e2fd9fa many linux support 2021-05-14 16:18:34 +08:00
Dun Liang 67e8e9a1ea os polish 2021-05-14 15:16:16 +08:00
Dun Liang 504442e47a update version 2021-05-14 15:04:54 +08:00
Dun Liang 6d1fc47a4a add diff os type 2021-05-14 15:01:34 +08:00
Dun Liang 8f2274059c fallback to mpi if nccl not available 2021-05-14 13:45:38 +08:00
Dun Liang 21dad69e51 system log to stderr 2021-05-14 12:36:20 +08:00
Dun Liang 5a66739efa add clip_grad_norm 2021-05-13 22:03:20 +08:00
Dun Liang ed159c8c79 add centos support 2021-05-13 21:25:13 +08:00
Dun Liang 0753f06713 update version 2021-05-12 21:52:21 +08:00
wwhio 422878ee36
update lr according to param_group["lr"] (#210)
* update lr according to param_group["lr"]

* tiny fix

Co-authored-by: wwhio <i.wwh@qq.com>
2021-05-12 21:51:38 +08:00
jwzxgy2007 9f8da7507c fix bug 2021-02-05 16:45:16 +08:00
jwzxgy2007 81d8c528db add pool3d 2020-12-16 22:32:36 +08:00
jwzxgy2007 b026c3b702 add conv3d 2020-12-13 21:06:34 +08:00
yaox12 672df91385 declare the input format for to_tensor and to_pil_image 2020-10-07 16:22:33 +08:00
yaox12 130bc3db5a add test for transforms 2020-10-07 16:13:37 +08:00
yaox12 513507c6a0 add to_pil_image 2020-10-06 16:30:41 +08:00
yaox12 3d07af7917 add more transforming modules 2020-10-03 16:59:56 +08:00
yaox12 2f722bd877 update basic transforms 2020-10-01 11:29:06 +08:00
845 changed files with 74682 additions and 7158 deletions

25
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file
View File

@ -0,0 +1,25 @@
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
## Describe the bug
A clear and concise description of what the bug is. 使用中文也可以。
## Full Log
Provide a full log of Jittor execution, Jittor will log environment information which help us to locate your bugs. Provide a screenshot is also acceptable.
## Minimal Reproduce
Reproduce this error with a file or several lines of code.
If it is not possible, leave it blank.
## Expected behavior
A clear and concise description of what you expected to happen.
If you are submitting an issue for the first time, please refer to [our guideline](https://github.com/Jittor/jittor/issues/238)

2
.gitignore vendored
View File

@ -1,5 +1,6 @@
my
.refresh
.DS_Store
__pycache__
.ipynb_checkpoints/
.vscode/
@ -11,6 +12,7 @@ perf.data.old
*.pdf
*.zip
*.tgz
*.obj
test.py
extern/mkl/mkldnn_lnx*/*
data/

29
AWESOME-JITTOR-LIST.md Normal file
View File

@ -0,0 +1,29 @@
# Awesome Jittor List
- [JittorLLMs](https://github.com/Jittor/JittorLLMs): 随着ChatGPT的推出大模型正在快速地发展国内同样涌现出一大批优秀的大模型研究然而大模型高昂的配置、复杂的环境要求让人望而却步。非十科技领衔与清华大学可视媒体研究中心合作研发了大模型推理库JittorLLMs希望为国内大模型的研究提供软硬件的支撑。[缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/download/jittorllms-0.jpg)
- [GAN](https://github.com/Jittor/gan-jittor): Jittor GAN模型库一共包括了从2014到2019最主流的27种GAN模型。这27个GAN总计被引用60953次每篇文章平均引用次数为2176。[缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/resources/jittor-gan/gan-all.png)
- [实例分割](https://github.com/Jittor/InstanceSegmentation-jittor): Jittor实例分割模型库一共包含了6种Backbone和11类检测分割模型包含最经典的Mask RCNN系列实时实例分割网络以及人体分割网络等等。[缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/resources/jittor-seg/fenge.png)
- [语义分割](https://github.com/Jittor/segmentation-jittor): 目前Jittor已经支持了目前主流的语义分割算法。其中包含了三种经典的 Backbone ,以及六种常见的分割模型。[缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/resources/jittor-is/fenge.png)
- [点云](https://github.com/Jittor/PointCloudLib): 计图框架本次发布的点云模型库包括几种最有影响力的模型PointNet、PointNet++、PointCNN、DGCNN 和PointConv ,支持分类和分割。[缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/resources/jittor-point/dianyun.png)
- [可微渲染](https://github.com/Jittor/jrender): 可微渲染目前被广泛地应用于三维重建同时在人体重建、人脸重建、三维属性估计等应用。目前Jrender已经支持可微表面渲染和可微体渲染特性。[缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/tutorial/2020-10-17-22-00-dr/dr.png)
- [遥感检测](https://github.com/Jittor/JDet): JDet是基于Jittor的遥感目标检测算法库。JDet目前提供了4个主流遥感目标检测S2ANet、Gliding、RetinaNet和Faster R-CNN其他主流模型陆续添加中。[缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images//download/jdet.png)
- [医学图像分割](https://github.com/THU-CVlab/JMedSeg): Jittor Medical Segmentation Lib -- The assignment of Pattern Recognition course (2021 Spring) in Tsinghua University. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/download/JMedSeg-0.jpg)
- [PCT](https://github.com/MenghaoGuo/PCT): This is a Jittor implementation of PCT: Point Cloud Transformer. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/download/pct-0.jpg)
- [DeepFaceDrawing](https://github.com/IGLICT/DeepFaceDrawing-Jittor): One version of our system is implemented using the Jittor, and you need to install Jittor first. We will also provide a version in pytorch. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/papers/2020-9-10-DeepFaceDrawing.jpg)
- [JittorVis](https://github.com/thu-vis/JittorVis): JittorVis is an open-source library for understanding the inner workings of Jittor models by visually illustrating their dataflow graphs. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/download/jittorvis.png)
- [PraNet](https://github.com/DengPingFan/PraNet): Parallel Reverse Attention Network for Polyp Segmentation. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/download/PraNet.png)
- [DeepFaceEditing](https://github.com/IGLICT/DeepFaceEditing-Jittor): Deep Face Generation and Editing with Disentangled Geometry and Appearance Control. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/images/download/deepfaceediting-0.jpg)
- [SINet-V2](https://github.com/GewelsJI/SINet-V2): Concealed Object Detection (SINet-V2, IEEE TPAMI 2021). Code using Jittor Framework is available. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/SINet-V2.png)
- [hlagcn](https://github.com/shedy-pub/hlagcn-jittor): Jittor implementation of the paper "Hierarchical Layout-Aware Graph Convolutional Network for Unified Aesthetics Assessment". [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/hlagcn-jittor.jpg)
- [Jittor-Image-Models](https://github.com/Jittor-Image-Models/Jittor-Image-Models): About
Jittor Image Models is a library for pulling together a wide variety of SOTA deep learning models in the Jittor framework. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/white.png)
- [LearnJittorBasicIn60Min](https://github.com/Jittor/LearnJittorBasicIn60Min): 计图零基础快速入门教程60分钟) [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/white.png)
- [di-fusion-network-jittor](https://github.com/heiwang1997/di-fusion-network-jittor): Jittor implementation of the network architecture in DI-Fusion: Online Implicit 3D Reconstruction with Deep Priors. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/white.png)
- [Zhusuan](https://github.com/McGrady00H/Zhusuan-Jittor): Zhusuan with backend Jittor. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/white.png)
- [PRS-Net](https://github.com/IGLICT/PRS-NET-Jittor): This repository is code release for PRS-Net: Planar Reflective Symmetry Detection Net for 3D Models. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/PRS-Net.png)
- [PFSegNets](https://github.com/Jittor/PFSegNets-Jittor): This repo contains the the implementation of CVPR-2021 work: PointFlow: Flowing Semantics Through Points for Aerial Image Segmentation by Jittor. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/PFSegNets.jpg)
- [APDrawingGAN](https://github.com/yiranran/APDrawingGAN-Jittor): We provide Jittor implementations for our CVPR 2019 oral paper "APDrawingGAN: Generating Artistic Portrait Drawings from Face Photos with Hierarchical GANs". [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/APDrawingGAN.png)
- [APDrawingGAN2](https://github.com/yiranran/APDrawingGAN2-Jittor): We provide Jittor implementations for our TPAMI 2020 paper "Line Drawings for Face Portraits from Photos using Global and Local Structure based GANs". [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/APDrawingGAN2.png)
- [CMIC-Retrieval](https://github.com/IGLICT/IBSR_jittor): Code for Single Image 3D Shape Retrieval via Cross-Modal Instance and Category Contrastive Learning. ICCV 2021. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/CMIC-Retrieval.png)
- [Unpaired-Portrait-Drawing](https://github.com/yiranran/Unpaired-Portrait-Drawing-Jittor): We provide Jittor implementations for our CVPR 2020 paper "Unpaired Portrait Drawing Generation via Asymmetric Cycle Mapping". [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/Unpaired-Portrait-Drawing.jpg)
- [Jittor-MLP](https://github.com/liuruiyang98/Jittor-MLP): Unofficial Implementation of MLP-Mixer, gMLP, resMLP, Vision Permutator, S2MLPv2, ConvMLP, ConvMixer in Jittor. [缩略图](https://cg.cs.tsinghua.edu.cn/jittor/assets/images/white.png)

View File

@ -1,4 +1,4 @@
Copyright (c) 2021 Jittor. All Rights Reserved
Copyright (c) 2023 Jittor. All Rights Reserved
Apache License
Version 2.0, January 2004
@ -188,7 +188,7 @@ Copyright (c) 2021 Jittor. All Rights Reserved
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2021 Jittor. All Rights Reserved.
Copyright (c) 2023 Jittor. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.

View File

@ -1,2 +1,5 @@
exclude __data__
exclude __pycache__
exclude __pycache__
prune **/__data__/
prune **/__pycache__
prune *.pyc

View File

@ -1,7 +1,9 @@
# Jittor: 即时编译深度学习框架
![Jittor Logo](https://cg.cs.tsinghua.edu.cn/jittor/favicon_package_v0/JittorLogo_Final1220.svg)
[快速开始](#快速开始) | [安装](#安装) | [教程](#教程)
[快速开始](#快速开始) | [安装](#安装) | [教程](#教程) | [English](./README.md)
Jittor 是一个基于即时编译和元算子的高性能深度学习框架整个框架在即时编译的同时还集成了强大的Op编译器和调优器为您的模型生成定制化的高性能代码。Jittor还包含了丰富的高性能模型库涵盖范围包括图像识别检测分割生成可微渲染几何学习强化学习等等。
@ -15,7 +17,10 @@ Jittor前端语言为Python。前端使用了模块化和动态图执行的设
* [Jittor教程](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/)
* [Jittor模型库](https://cg.cs.tsinghua.edu.cn/jittor/resources/)
* [Jittor文档](https://cg.cs.tsinghua.edu.cn/jittor/assets/docs/index.html)
* [Github](https://github.com/jittor/jittor) [Gitee](https://gitee.com/jittor/jittor)
* [Github](https://github.com/jittor/jittor) [GitLink](https://www.gitlink.org.cn/jittor/jittor) [Gitee](https://gitee.com/jittor/jittor)
* [Jittor 论坛](https://discuss.jittor.org/)
* [Jittor 精选仓库](https://github.com/Jittor/jittor/blob/master/AWESOME-JITTOR-LIST.md)
* 即时通信: QQ Group(761222083)
@ -76,7 +81,7 @@ for i,(x,y) in enumerate(get_data(n)):
## 快速开始
我们提供了一些jupyterr notebooks来帮助您快速入门Jittor。
我们提供了一些jupyter notebooks来帮助您快速入门Jittor。
- [示例:模型定义与训练][1]
- [基础Op, Var][2]
@ -85,30 +90,77 @@ for i,(x,y) in enumerate(get_data(n)):
## 安装
Jittor框架对环境要求如下:
* 操作系统: **Ubuntu** >= 16.04 或 **Windows Subsystem of LinuxWSL**
* Python版本 >= 3.7
* C++编译器 (需要下列至少一个)
- g++ >=5.4.0
- clang >=8.0
* GPU 编译器可选nvcc >=10.0
* GPU 加速库可选cudnn-dev (cudnn开发版, 推荐使用tar安装方法[参考链接](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar))
| OS | CPU | Python | Compiler | (Optional) GPU platform |
|--------------------------------------------------------|-------------------------------------|--------|--------------|---------------------------------------------|
| Linux<br>(Ubuntu, CentOS, Arch, <br>UOS, KylinOS, ...) | x86 <br>x86_64 <br>ARM <br>loongson | >= 3.7 | g++ >=5.4 | Nvidia CUDA >= 10.0, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar) <br> or [AMD ROCm](https://docs.amd.com/) >= 4.0 <br> or [Hygon DCU DTK](https://tycloud.hpccube.com/doc/1.0.6/11277/general-handbook/software-tutorial/jittor.html) >= 22.04 |
| macOS <br>(>= 10.14 Mojave) | intel<br>Apple Silicon | >= 3.7 | clang >= 8.0 | - |
| Windows 10 & 11 | x86_64 | [>= 3.8](https://www.python.org/downloads/windows/) | - | Nvidia CUDA >= 10.2 [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#install-windows) |
如果您不希望手动配置环境,我们推荐使用 Docker 进行安装。
除此之外,您还可以使用 pip 安装和手动安装。
注意目前Jittor通过WSL的方式在Windows操作系统上运行WSL的安装方法请参考[微软官网](https://docs.microsoft.com/en-us/windows/wsl/install-win10)WSL版本目前尚不支持CUDA。
Jittor 提供了三种安装方法dockerpip和手动安装
Jittor 提供了三种安装方法pip、docker和手动安装
## Pip 安装
下面将展示Ubuntu的安装命令如果您在使用其他Linux操作系统如CentOS 请安装好依赖Python>=3.7, g++>=5.4)或者使用**docker安装** 如果您已经装好编译器和对应版本的Python,我们强烈推荐您使用这种方法
(如果无法访问github, 可以通过Jittor主页下载):
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
如果测试运行通过,恭喜你已经安装完成.
jittor会自动在路径中寻找合适的编译器, 如果您希望手动指定编译器, 请使用环境变量 `cc_path``nvcc_path`(可选).
### macOS 安装
macOS 请使用 [homebrew](https://brew.sh) 安装额外的依赖。
```bash
brew install libomp
```
之后您可以通过 pip 安装 jittor并测试是否可以成功运行。
```bash
python3.7 -m pip install jittor
python3.7 -m jittor.test.test_example
```
目前在 macOS 中jittor 只支持 CPU 计算。
### Windows安装
Windows 请准备好Python>=3.8,安装方法如下(conda安装需要额外命令)
Windows users please prepare Python>=3.8; install instructions are listed below (conda needs extra instructions)
```bash
# check your python version(>=3.8)
python --version
python -m pip install jittor
# if conda is used
conda install pywin32
```
Windows 下jittor会自动检测显卡并安装对应的 CUDA 请确保您的NVIDIA驱动支持CUDA 10.2 以上您还可以使用如下命令手动为Jittor安装CUDA
```bash
python -m jittor_utils.install_cuda
```
@ -128,23 +180,6 @@ docker run -it -p 8888:8888 jittor/jittor
关于Docker安装的详细教程可以参考[Windows/Mac/Linux通过Docker安装计图](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/2020-5-15-00-00-docker/)
## Pip 安装
如果您没有准备好环境或者使用的不是Ubuntu操作系统 推荐使用**docker安装** 如果您已经装好编译器和对应版本的Python,我们强烈推荐您使用这种方法
(如果无法访问github, 可以通过jittor主页下载):
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
如果测试运行通过,恭喜你已经安装完成.
jittor会自动在路径中寻找合适的编译器, 如果您希望手动指定编译器, 请使用环境变量 `cc_path``nvcc_path`(可选).
## 手动安装
@ -313,11 +348,11 @@ help(jt.ops)
[1]: notebook/example.src.md "示例"
[2]: notebook/basics.src.md "基本概念"
[3]: notebook/meta_op.src.md "元算子"
[4]: notebook/custom_op.src.md "自定义算子"
[5]: notebook/profiler.src.md "性能分析器"
[1]: python/jittor/notebook/example.src.md "示例"
[2]: python/jittor/notebook/basics.src.md "基本概念"
[3]: python/jittor/notebook/meta_op.src.md "元算子"
[4]: python/jittor/notebook/custom_op.src.md "自定义算子"
[5]: python/jittor/notebook/profiler.src.md "性能分析器"
这些notebooks可以通过python3.7 -m jittor.notebook在您自己的计算机中运行。
@ -371,7 +406,7 @@ Jittor目前由[清华大学计算机图形学组](https://cg.cs.tsinghua.edu.cn
@article{hu2020jittor,
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
journal={Information Sciences},
journal={Science China Information Sciences},
volume={63},
number={222103},
pages={1--21},
@ -384,3 +419,4 @@ Jittor目前由[清华大学计算机图形学组](https://cg.cs.tsinghua.edu.cn
如LICENSE.txt文件中所示Jittor使用Apache 2.0版权协议。

View File

@ -1,6 +1,8 @@
# Jittor: a Just-in-time(JIT) deep learning framework
[Quickstart](#quickstart) | [Install](#install) | [Tutorial](#tutorial) | [Chinese](./README.cn.md)
![Jittor Logo](https://cg.cs.tsinghua.edu.cn/jittor/favicon_package_v0/JittorLogo_Final1220.svg)
[Quickstart](#quickstart) | [Install](#install) | [Tutorial](#tutorial) | [简体中文](./README.cn.md)
Jittor is a high-performance deep learning framework based on JIT compiling and meta-operators. The whole framework and meta-operators are compiled just-in-time. A powerful op compiler and tuner are integrated into Jittor. It allows us to generate high-performance code specialized for your model. Jittor also contains a wealth of high-performance model libraries, including: image recognition, detection, segmentation, generation, differentiable rendering, geometric learning, reinforcement learning, etc.
@ -14,7 +16,10 @@ Related Links:
* [Jittor Tutorials](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/)
* [Jittor Models](https://cg.cs.tsinghua.edu.cn/jittor/resources/)
* [Jittor Documents](https://cg.cs.tsinghua.edu.cn/jittor/assets/docs/index.html)
* [Github](https://github.com/jittor/jittor), [Gitee](https://gitee.com/jittor/jittor)
* [Github](https://github.com/jittor/jittor), [GitLink](https://www.gitlink.org.cn/jittor/jittor), [Gitee](https://gitee.com/jittor/jittor)
* [Jittor Forum](https://discuss.jittor.org/)
* [Awesome Jittor List](https://github.com/Jittor/jittor/blob/master/AWESOME-JITTOR-LIST.md)
* IM: QQ Group(761222083)
@ -86,28 +91,50 @@ We provide some jupyter notebooks to help you quick start with Jittor.
Jittor environment requirements:
* System: **Ubuntu** >= 16.04 (or **Windows** Subsystem of Linux)
* Python version >= 3.7
* CPU compiler (require at least one of the following)
* g++ (>=5.4.0)
* clang (>=8.0)
* GPU compiler (optional)
* nvcc (>=10.0 for g++ or >=10.2 for clang)
* GPU library: cudnn-dev (recommend tar file installation, [reference link](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar))
| OS | CPU | Python | Compiler | (Optional) GPU platform |
|--------------------------------------------------------|-------------------------------------|--------|--------------|---------------------------------------------|
| Linux<br>(Ubuntu, CentOS, Arch, <br>UOS, KylinOS, ...) | x86 <br>x86_64 <br>ARM <br>loongson | >= 3.7 | g++ >=5.4 | Nvidia CUDA >= 10.0, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar) <br> or [AMD ROCm](https://docs.amd.com/) >= 4.0 <br> or [Hygon DCU DTK](https://tycloud.hpccube.com/doc/1.0.6/11277/general-handbook/software-tutorial/jittor.html) >= 22.04 |
| Windows 10 & 11 | x86_64 | [>= 3.8](https://www.python.org/downloads/windows/) | - | Nvidia CUDA >= 10.2 [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#install-windows) |
Jittor offers three ways to install: pip, docker, or manual.
## Pip install
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
Note: Currently Jittor runs on the Windows operating system through WSL. For the installation method of WSL, please refer to [Microsoft official website](https://docs.microsoft.com/en-us/windows/wsl/install-win10). WSL does not yet support CUDA.
Jittor offers three ways to install: docker, pip, or manual.
### Windows install
```bash
# check your python version(>=3.8)
python --version
python -m pip install jittor
# if conda is used
conda install pywin32
```
In Windows, jittor will automatically detect and install CUDA, please make sure your NVIDIA driver support CUDA 10.2 or above, or you can manually let jittor install CUDA for you:
```bash
python -m jittor_utils.install_cuda
```
## Docker Install
@ -126,19 +153,6 @@ docker run -it -p 8888:8888 jittor/jittor
```
## Pip install
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
## manual install
We will show how to install Jittor in Ubuntu 16.04 step by step, Other Linux distributions may have similar commands.
@ -256,7 +270,7 @@ print(a.name())
### Operations
Jittor'op is similar with numpy. Let's try some operations. We create Var `a` and `b` via operation `jt.float32`, and add them. Printing those variables shows they have the same shape and dtype.
Jittor's op is similar with numpy. Let's try some operations. We create Var `a` and `b` via operation `jt.float32`, and add them. Printing those variables shows they have the same shape and dtype.
```python
@ -307,11 +321,11 @@ If you want to know more about Jittor, please check out the notebooks below:
[1]: notebook/example.src.md "example"
[2]: notebook/basics.src.md "basics"
[3]: notebook/meta_op.src.md "meta_op"
[4]: notebook/custom_op.src.md "custom_op"
[5]: notebook/profiler.src.md "profiler"
[1]: python/jittor/notebook/example.src.md "example"
[2]: python/jittor/notebook/basics.src.md "basics"
[3]: python/jittor/notebook/meta_op.src.md "meta_op"
[4]: python/jittor/notebook/custom_op.src.md "custom_op"
[5]: python/jittor/notebook/profiler.src.md "profiler"
Those notebooks can be started in your own computer by `python3.7 -m jittor.notebook`
@ -348,10 +362,10 @@ Email: jittor@qq.com
File an issue: https://github.com/Jittor/jittor/issues
QQ Group: 761222083
QQ Group: 836860279
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/news/2020-12-8-21-19-1_2_2/fig4.png" width="200"/>
<img src="https://github.com/Jittor/jittor/assets/62846124/8dd830bd-b31c-4e4f-9a78-5fd7a3409145" width="200"/>
## The Team
@ -366,7 +380,7 @@ Jittor is currently maintained by the [Tsinghua CSCG Group](https://cg.cs.tsingh
@article{hu2020jittor,
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
journal={Information Sciences},
journal={Science China Information Sciences},
volume={63},
number={222103},
pages={1--21},
@ -379,3 +393,4 @@ Jittor is currently maintained by the [Tsinghua CSCG Group](https://cg.cs.tsingh
Jittor is Apache 2.0 licensed, as found in the LICENSE.txt file.

View File

@ -1,9 +1,11 @@
# Jittor: a Just-in-time(JIT) deep learning framework
# Jittor: 即时编译深度学习框架
![Jittor Logo](https://cg.cs.tsinghua.edu.cn/jittor/favicon_package_v0/JittorLogo_Final1220.svg)
[Quickstart](#quickstart) | [Install](#install) | [Tutorial](#tutorial) | [Chinese](./README.cn.md)
[快速开始](#快速开始) | [安装](#安装) | [教程](#教程)
[快速开始](#快速开始) | [安装](#安装) | [教程](#教程) | [English](./README.md)
Jittor is a high-performance deep learning framework based on JIT compiling and meta-operators. The whole framework and meta-operators are compiled just-in-time. A powerful op compiler and tuner are integrated into Jittor. It allows us to generate high-performance code specialized for your model. Jittor also contains a wealth of high-performance model libraries, including: image recognition, detection, segmentation, generation, differentiable rendering, geometric learning, reinforcement learning, etc.
@ -18,14 +20,20 @@ Related Links:
* [Jittor Tutorials](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/)
* [Jittor Models](https://cg.cs.tsinghua.edu.cn/jittor/resources/)
* [Jittor Documents](https://cg.cs.tsinghua.edu.cn/jittor/assets/docs/index.html)
* [Github](https://github.com/jittor/jittor), [Gitee](https://gitee.com/jittor/jittor)
* [Github](https://github.com/jittor/jittor), [GitLink](https://www.gitlink.org.cn/jittor/jittor), [Gitee](https://gitee.com/jittor/jittor)
* [Jittor Forum](https://discuss.jittor.org/)
* [Awesome Jittor List](https://github.com/Jittor/jittor/blob/master/AWESOME-JITTOR-LIST.md)
* IM: QQ Group(761222083)
相关链接:
* [Jittor官网](https://cg.cs.tsinghua.edu.cn/jittor/)
* [Jittor教程](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/)
* [Jittor模型库](https://cg.cs.tsinghua.edu.cn/jittor/resources/)
* [Jittor文档](https://cg.cs.tsinghua.edu.cn/jittor/assets/docs/index.html)
* [Github](https://github.com/jittor/jittor) [Gitee](https://gitee.com/jittor/jittor)
* [Github](https://github.com/jittor/jittor) [GitLink](https://www.gitlink.org.cn/jittor/jittor) [Gitee](https://gitee.com/jittor/jittor)
* [Jittor 论坛](https://discuss.jittor.org/)
* [Jittor 精选仓库](https://github.com/Jittor/jittor/blob/master/AWESOME-JITTOR-LIST.md)
* 即时通信: QQ Group(761222083)
The following example shows how to model a two-layer neural network step by step and train from scratch In a few lines of Python code.
@ -96,7 +104,7 @@ for i,(x,y) in enumerate(get_data(n)):
We provide some jupyter notebooks to help you quick start with Jittor.
我们提供了一些jupyterr notebooks来帮助您快速入门Jittor。
我们提供了一些jupyter notebooks来帮助您快速入门Jittor。
- [Example: Model definition and training][1]
- [示例:模型定义与训练][1]
@ -109,41 +117,86 @@ We provide some jupyter notebooks to help you quick start with Jittor.
## 安装
Jittor框架对环境要求如下:
* 操作系统: **Ubuntu** >= 16.04 或 **Windows Subsystem of LinuxWSL**
* Python版本 >= 3.7
* C++编译器 (需要下列至少一个)
- g++ >=5.4.0
- clang >=8.0
* GPU 编译器可选nvcc >=10.0
* GPU 加速库可选cudnn-dev (cudnn开发版, 推荐使用tar安装方法[参考链接](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar))
如果您不希望手动配置环境,我们推荐使用 Docker 进行安装。
除此之外,您还可以使用 pip 安装和手动安装。
注意目前Jittor通过WSL的方式在Windows操作系统上运行WSL的安装方法请参考[微软官网](https://docs.microsoft.com/en-us/windows/wsl/install-win10)WSL版本目前尚不支持CUDA。
Jittor 提供了三种安装方法dockerpip和手动安装
Jittor environment requirements:
* System: **Ubuntu** >= 16.04 (or **Windows** Subsystem of Linux)
* Python version >= 3.7
* CPU compiler (require at least one of the following)
* g++ (>=5.4.0)
* clang (>=8.0)
* GPU compiler (optional)
* nvcc (>=10.0 for g++ or >=10.2 for clang)
* GPU library: cudnn-dev (recommend tar file installation, [reference link](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar))
| OS | CPU | Python | Compiler | (Optional) GPU platform |
|--------------------------------------------------------|-------------------------------------|--------|--------------|---------------------------------------------|
| Linux<br>(Ubuntu, CentOS, Arch, <br>UOS, KylinOS, ...) | x86 <br>x86_64 <br>ARM <br>loongson | >= 3.7 | g++ >=5.4 | Nvidia CUDA >= 10.0, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar) <br> or [AMD ROCm](https://docs.amd.com/) >= 4.0 <br> or [Hygon DCU DTK](https://tycloud.hpccube.com/doc/1.0.6/11277/general-handbook/software-tutorial/jittor.html) >= 22.04 |
| macOS <br>(>= 10.14 Mojave) | intel<br>Apple Silicon | >= 3.7 | clang >= 8.0 | - |
| Windows 10 & 11 | x86_64 | [>= 3.8](https://www.python.org/downloads/windows/) | - | Nvidia CUDA >= 10.2 [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#install-windows) |
Jittor 提供了三种安装方法pip、docker和手动安装
Jittor offers three ways to install: pip, docker, or manual.
Note: Currently Jittor runs on the Windows operating system through WSL. For the installation method of WSL, please refer to [Microsoft official website](https://docs.microsoft.com/en-us/windows/wsl/install-win10). WSL does not yet support CUDA.
## Pip 安装
Jittor offers three ways to install: docker, pip, or manual.
## Pip install
下面将展示Ubuntu的安装命令如果您在使用其他Linux操作系统如CentOS 请安装好依赖Python>=3.7, g++>=5.4)或者使用**docker安装** 如果您已经装好编译器和对应版本的Python,我们强烈推荐您使用这种方法
(如果无法访问github, 可以通过Jittor主页下载):
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
如果测试运行通过,恭喜你已经安装完成.
jittor会自动在路径中寻找合适的编译器, 如果您希望手动指定编译器, 请使用环境变量 `cc_path``nvcc_path`(可选).
### macOS 安装
### macOS install
macOS 请使用 [homebrew](https://brew.sh) 安装额外的依赖。
Please first install additional dependencies with [homebrew](https://brew.sh).
```bash
brew install libomp
```
之后您可以通过 pip 安装 jittor并测试是否可以成功运行。
Then you can install jittor through pip and run the example.
```bash
python3.7 -m pip install jittor
python3.7 -m jittor.test.test_example
```
目前在 macOS 中jittor 只支持 CPU 计算。
Currently jittor only supports CPU on macOS.
### Windows安装
### Windows install
Windows 请准备好Python>=3.8,安装方法如下(conda安装需要额外命令)
Windows users please prepare Python>=3.8; install instructions are listed below (conda needs extra instructions)
```bash
# check your python version(>=3.8)
python --version
python -m pip install jittor
# if conda is used
conda install pywin32
```
Windows 下jittor会自动检测显卡并安装对应的 CUDA 请确保您的NVIDIA驱动支持CUDA 10.2 以上您还可以使用如下命令手动为Jittor安装CUDA
In Windows, jittor will automatically detect and install CUDA, please make sure your NVIDIA driver support CUDA 10.2 or above, or you can manually let jittor install CUDA for you:
```bash
python -m jittor_utils.install_cuda
```
## Docker Install
@ -165,24 +218,6 @@ docker run -it -p 8888:8888 jittor/jittor
关于Docker安装的详细教程可以参考[Windows/Mac/Linux通过Docker安装计图](https://cg.cs.tsinghua.edu.cn/jittor/tutorial/2020-5-15-00-00-docker/)
## Pip 安装
## Pip install
如果您没有准备好环境或者使用的不是Ubuntu操作系统 推荐使用**docker安装** 如果您已经装好编译器和对应版本的Python,我们强烈推荐您使用这种方法
(如果无法访问github, 可以通过jittor主页下载):
```bash
sudo apt install python3.7-dev libomp-dev
python3.7 -m pip install jittor
# or install from github(latest version)
# python3.7 -m pip install git+https://github.com/Jittor/jittor.git
python3.7 -m jittor.test.test_example
```
如果测试运行通过,恭喜你已经安装完成.
jittor会自动在路径中寻找合适的编译器, 如果您希望手动指定编译器, 请使用环境变量 `cc_path``nvcc_path`(可选).
## 手动安装
## manual install
@ -387,16 +422,16 @@ If you want to know more about Jittor, please check out the notebooks below:
[1]: notebook/example.src.md "example"
[2]: notebook/basics.src.md "basics"
[3]: notebook/meta_op.src.md "meta_op"
[4]: notebook/custom_op.src.md "custom_op"
[5]: notebook/profiler.src.md "profiler"
[1]: notebook/example.src.md "示例"
[2]: notebook/basics.src.md "基本概念"
[3]: notebook/meta_op.src.md "元算子"
[4]: notebook/custom_op.src.md "自定义算子"
[5]: notebook/profiler.src.md "性能分析器"
[1]: python/jittor/notebook/example.src.md "example"
[2]: python/jittor/notebook/basics.src.md "basics"
[3]: python/jittor/notebook/meta_op.src.md "meta_op"
[4]: python/jittor/notebook/custom_op.src.md "custom_op"
[5]: python/jittor/notebook/profiler.src.md "profiler"
[1]: python/jittor/notebook/example.src.md "示例"
[2]: python/jittor/notebook/basics.src.md "基本概念"
[3]: python/jittor/notebook/meta_op.src.md "元算子"
[4]: python/jittor/notebook/custom_op.src.md "自定义算子"
[5]: python/jittor/notebook/profiler.src.md "性能分析器"
Those notebooks can be started in your own computer by `python3.7 -m jittor.notebook`
@ -472,7 +507,7 @@ Jittor目前由[清华大学计算机图形学组](https://cg.cs.tsinghua.edu.cn
@article{hu2020jittor,
title={Jittor: a novel deep learning framework with meta-operators and unified graph execution},
author={Hu, Shi-Min and Liang, Dun and Yang, Guo-Ye and Yang, Guo-Wei and Zhou, Wen-Yang},
journal={Information Sciences},
journal={Science China Information Sciences},
volume={63},
number={222103},
pages={1--21},

BIN
doc/logo.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

View File

@ -0,0 +1,176 @@
Jittor性能测试与对比方法
=====================
下面代码以 ResNet-50 为例,用于演示 Jittor 性能测试的正确方法:
```python
import time
import jittor as jt
from jittor.models import resnet50
jt.flags.use_cuda = jt.has_cuda
warmup = 10
rerun = 100
batch_size = 8
data = jt.random((batch_size, 3, 224, 224))
model = resnet50()
model.eval()
# 此段代码对jittor进行热身确保时间测试准确
jt.sync_all(True)
for i in range(warmup):
pred = model(data)
# sync是把计算图发送到计算设备上
pred.sync()
# sync_all(true)是把计算图发射到计算设备上,并且同步。
# 只有运行了jt.sync_all(True)才会真正地运行时间才是有效的因此执行forward前后都要执行这句话
jt.sync_all(True)
# 开始测试运行时间
start = time.time()
for i in range(rerun):
pred = model(data)
pred.sync()
jt.sync_all(True)
end = time.time()
print("Jittor FPS:", (rerun*batch_size)/(end-start))
```
在这段代码中,我们定义了几个参数`batch_size`, `warmup`, `rerun`, batch_size代表批大小warmup是用于热身的循环次数而rerun是用于测速的循环次数最终输出FPS对Jittor进行正确测速的关键是 热身部分和同步部分热身部分确保测试时间稳定没有包含编译用的时间而同步部分确保计算完成因为jittor是一个异步框架只有同步操作能保证计算完成。
以上代码的运行结果如下RTX Titanbatch 8
```
Compiling Operators(8/8) used: 7.35s eta: 0s
Compiling Operators(13/13) used: 8.36s eta: 0s
Jittor FPS: 908.9853866375396
```
我们还可以使用类似的代码测试 PyTorch的性能
```python
import time
import torch
from torchvision.models import resnet50
warmup = 10
rerun = 100
batch_size = 8
data = torch.randn((batch_size, 3, 224, 224)).cuda()
model = resnet50()
model.cuda()
model.eval()
# 此段代码对pytorch进行热身确保时间测试准确
torch.cuda.synchronize()
for i in range(warmup):
pred = model(data)
# synchronize用于确保PyTorch计算完成
torch.cuda.synchronize()
# 开始测试运行时间
start = time.time()
for i in range(rerun):
pred = model(data)
torch.cuda.synchronize()
end = time.time()
print("PyTorch FPS:", (rerun*batch_size)/(end-start))
```
以上代码的运行结果如下RTX Titanbatch 8
```
PyTorch FPS: 807.4806873965665
```
我们还可以对这两段代码合并,并对比结果的一致性:
```python
import time
import jittor as jt
from jittor.models import resnet50
jt.flags.use_cuda = jt.has_cuda
warmup = 100
rerun = 1000
batch_size = 8
data = jt.random((batch_size, 3, 224, 224))
model = resnet50()
model.eval()
# 此段代码对jittor进行热身确保时间测试准确
jt.sync_all(True)
for i in range(warmup):
pred = model(data)
# sync是把计算图发送到计算设备上
pred.sync()
# sync_all(true)是把计算图发射到计算设备上,并且同步。
# 只有运行了jt.sync_all(True)才会真正地运行时间才是有效的因此执行forward前后都要执行这句话
jt.sync_all(True)
# 开始测试运行时间
start = time.time()
for i in range(rerun):
pred = model(data)
pred.sync()
jt.sync_all(True)
end = time.time()
print("Jittor FPS:", (rerun*batch_size)/(end-start))
# 将 jittor 数据和参数导出为 numpy 和 torch 格式
jittor_data = pred.numpy()
jittor_param = model.state_dict(to="torch")
import numpy as np
import torch
from torchvision.models import resnet50
data = torch.Tensor(data.numpy()).cuda()
model = resnet50()
# 加载 jittor 参数
model.load_state_dict(jittor_param)
model.cuda()
model.eval()
# 此段代码对pytorch进行热身确保时间测试准确
torch.cuda.synchronize()
for i in range(warmup):
pred = model(data)
# synchronize用于确保PyTorch计算完成
torch.cuda.synchronize()
# 开始测试运行时间
start = time.time()
for i in range(rerun):
pred = model(data)
torch.cuda.synchronize()
end = time.time()
print("PyTorch FPS:", (rerun*batch_size)/(end-start))
pytorch_data = pred.detach().cpu().numpy()
err = np.mean(np.abs(pytorch_data - jittor_data))
print("mean error:", err)
```
以上代码运行结果如下:
```
Jittor FPS: 908.9853866375396
PyTorch FPS: 807.4806873965665
mean error: 1e-5
```
误差输出为1e-5, 在可接受范围内。正确测速与对比的几大关键点为:
1. 充分热身,除去框架的准备时间。
2. 多次运行,确保测试时间稳定。
3. 加上同步语句,确保测试时间准确。
4. 保证显存充足在显存不足时jittor会调用统一内存来弥补会产生性能损失请密切关注`nvidia-smi`的输出结果。
5. 保证对比模型的一致性,检查输出结果的一致。
如果您对测试结果有疑问或者有优化需求欢迎随时联系Jittor开发团队。

View File

@ -0,0 +1,75 @@
Jittor显存以及内存优化方法
=====================
您可以主要通过两种方法,来改进内存消耗:
1. 优化消耗内存比较大的变量
2. 使用Jittor自动交换技术将变量在显存-内存-硬盘之间交换,降低运行部署门槛。
## 优化消耗内存比较大的变量
您可以使用jittor的memory profiler来分析显存消耗较大的代码并且针对特定代码进行优化。使用方法如下
```
net = jt.models.resnet18()
with jt.flag_scope(trace_py_var=3, profile_memory_enable=1):
imgs = jt.randn((1,3,224,224))
net(imgs).sync()
jt.get_max_memory_treemap()
```
输出如下:
```
|
├─./python/jittor/test/test_memory_profiler.py:100(test_sample)
| [19.03 MB; 29.67%]
| ./python/jittor/test/test_memory_profiler.py:100
| |
| └─./python/jittor/__init__.py:730(__call__)
| [19.03 MB; 29.67%]
| ./python/jittor/__init__.py:730
| |
| └─./python/jittor/models/resnet.py:152(execute)
| [19.03 MB; 29.67%]
| ./python/jittor/models/resnet.py:152
| |
| ├─./python/jittor/models/resnet.py:142(_forward_impl)
| | [6.13 MB; 9.55%]
| | ./python/jittor/models/resnet.py:142
| | |
```
## 使用自动交换技术
该技术确保Jittor在显存或者内存不足的情况下都能以一定速度运行。
节省内存方法请安装Jittor版本大于1.3.7.5,并添加如下环境变量:
```bash
export JT_SAVE_MEM=1
# 限制cpu最多使用16G
export cpu_mem_limit=16000000000
# 限制device内存如gpu、tpu等最多使用8G
export device_mem_limit=8000000000
# windows 用户请使用powershell
# $env:JT_SAVE_MEM="1"
# $env:cpu_mem_limit="16000000000"
# $env:device_mem_limit="8000000000"
```
用户可以自由设定cpu和设备内存的使用量如果不希望对内存进行限制可以设置为`-1`。
```bash
# 不限制cpu内存使用
export cpu_mem_limit=-1
# 不限制device内存如gpu、tpu等使用
export device_mem_limit=-1
# windows 用户请使用powershell
# $env:JT_SAVE_MEM="1"
# $env:cpu_mem_limit="-1"
# $env:device_mem_limit="-1"
```
如果想要清理磁盘交换文件,可以运行如下命令
```bash
python3 -m jittor_utils.clean_cache swap
```

View File

@ -0,0 +1,90 @@
Jittor调试技巧
=====================
该文档包含了几种异常情况的调试方法和技巧。
## 爆Nan、Inf
在模型训练的过程中可能因为数值不稳定而出现Nan或者Inf为了帮助您定位出现nan的代码您可以设置如下环境变量
```bash
export JT_CHECK_NAN=1
export trace_py_var=3
```
其中,环境变量`JT_CHECK_NAN=1`的用途是:当算子的输出出现异常浮点数时,自动报错并停止程序,环境变量`trace_py_var=3`的用途是输出算子对应的Python代码行数3代表输出的详细等级为最高等级。
需要注意的是开启这两个特性之后jittor速度会大幅下降并且触发重编译请不要在训练环境或者生产环境开启该模式也不要长时间开启该模式。
## 错误信息定位不准确
Jittor框架默认采用延迟执行Lazy execution的方式进行加速算子的执行和创建是不同步的这可能导致报错信息定位不准确您可以手动关闭延迟执行采取立刻执行eager execution的模式使用如下环境变量即可
```bash
export lazy_execution=0
```
或者在python代码中通过flag关闭
```python
jt.flags.lazy_execution=0
```
## 内存不足
当您发现Jittor由于内存相关问题无法运行时Jittor会向您报告内存使用情况内存不足可能有两种情况
1. 训练模型过大,一个迭代就崩溃报错。
2. 多次迭代的过程中,内存占用不断增长,直到最后内存耗尽报错。
**对于第一种情况** ,您可能需要调整模型或者数据大小,或者使用[多卡训练](jittor.mpi)此外您还可以在每个迭代内部让Jittor强制回收内存
```python
for ...:
...
jt.sync_all()
jt.gc()
```
如果您使用到了CUDA和卷积还有可能是卷积消耗的临时空间过大在这种情况下可以关闭cudnn的临时内存申请请将如下代码插入到最开始
```python
jt.cudnn.set_max_workspace_ratio(0.0)
```
**对于第二种情况**,可能是存在内存泄漏,请检查您是否存在全局变量没有释放,或者全局变量没有停止梯度,导致计算图不断增加,检查方法如下,您可以在每个迭代内部,插入如下调试代码:
```python
for ...:
...
jt.sync_all()
jt.display_memory_info()
```
Jittor会输出内存消耗以及计算图的大小`lived_var,lived_op`,以及用户持有的变量数`hold_var`, 如果计算图规模不断增大请检查代码或者提交github issue联系我们并且附上错误日志和代码复现脚本。
## 段错误
如果Jittor出现了段错误建议您将错误提交github issue联系我们并且附上错误日志和代码复现脚本。您也可以使用如下环境变量对程序以及框架进行诊断
```bash
export debug=1
export gdb_attach=1
```
其中,环境变量`debug=1`代表开启jittor的debug模式性能会大幅下降但会保留调试信息`gdb_attach=1`将会自动将gdb贴在jittor的主进程上方便您进行单步调试。关于gdb的使用您可以参考[GDB Cheat Sheet](https://darkdust.net/files/GDB%20Cheat%20Sheet.pdf)
## 管理Jittor cache
Jittor会在`/.cache/jittor`目录下创建cache cache里面可能包括 core内核、cuda编译器、cuda库、数据集dataset、预训练参数等等在某些情况下cache可能失效如系统更新、驱动更新等等这种情况可能需要用户手动清除cache 清除的方法如下:
```
python3 -m jittor_utils.clean_cache all
```
以上命令会清除jittor的所有cache如果您不想全部清除可以参考命令行帮助
```
python3 -m jittor_utils.clean_cache help
```

View File

@ -1 +0,0 @@
../../README.cn.md

1
doc/source/README.cn.md Normal file
View File

@ -0,0 +1 @@
../../README.cn.md

View File

@ -45,7 +45,8 @@ language = 'zh_CN'
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'recommonmark',
# 'recommonmark',
'myst_parser',
'sphinx.ext.autodoc',
# Auto-generate section labels.
'sphinx.ext.autosectionlabel',

View File

@ -27,14 +27,30 @@
jittor.mpi
jittor.linalg
jittor.console
jittor.distributions
jittor.attention
jittor.loss3d
.. toctree::
:maxdepth: 2
:caption: 计图模型库:
JDet
segmentation-jittor
InstanceSegmentation-jittor
gan-jittor
PointCloudLib
jrender
.. toctree::
:maxdepth: 1
:caption: 其他:
Jittor调试技巧
Jittor性能测试与对比方法
Jittor显存以及内存优化方法
教程 <https://cg.cs.tsinghua.edu.cn/jittor/tutorial/>
todo
Indices and tables
==================

View File

@ -0,0 +1,10 @@
jittor.attention
=====================
这里是Jittor的 注意力 模块的API文档您可以通过`from jittor import attention`来获取该模块。
```eval_rst
.. automodule:: jittor.attention
:members:
:undoc-members:
```

View File

@ -0,0 +1,10 @@
jittor.distributions
=====================
这里是Jittor的随机分布模块的API文档您可以通过`from jittor import distributions`来获取该模块。
```eval_rst
.. automodule:: jittor.distributions
:members:
:undoc-members:
```

View File

@ -0,0 +1,10 @@
jittor.loss3d
=====================
这里是Jittor的 3d 损失函数 模块的API文档您可以通过`from jittor import loss3d`来获取该模块。
```eval_rst
.. automodule:: jittor.loss3d
:members: chamfer_loss, ChamferLoss, earth_mover_distance, EarthMoverDistance
:undoc-members:
```

View File

@ -1,7 +1,39 @@
jittor.mpi
=====================
这里是Jittor的MPI模块的API文档您可以通过`from jittor import mpi`来获取该模块。
计图分布式基于MPIMessage Passing Interface本文档主要阐述使用计图MPI进行多卡和分布式训练的教程。
## 计图MPI安装
计图依赖`OpenMPI`,用户可以使用如下命令安装`OpenMPI`
```bash
sudo apt install openmpi-bin openmpi-common libopenmpi-dev
```
也可以参考 [OpenMPI 文档](https://www.open-mpi.org/faq/?category=building#easy-build),自行编译安装。
计图会自动检测环境变量中是否包含`mpicc`,如果计图成功的检测到了`mpicc`,那么会输出如下信息:
```
[i 0502 14:09:55.758481 24 __init__.py:203] Found mpicc(1.10.2) at /usr/bin/mpicc
```
如果计图没有在环境变量中找到mpi用户也可以手动指定mpicc的路径告诉计图添加环境变量即可`export mpicc_path=/you/mpicc/path`
`OpenMPI`安装完成以后,用户无需修改代码,需要做的仅仅是修改启动命令行,计图就会用数据并行的方式自动完成并行操作。
```bash
# 单卡训练代码
python3.7 -m jittor.test.test_resnet
# 分布式多卡训练代码
mpirun -np 4 python3.7 -m jittor.test.test_resnet
# 指定特定显卡的多卡训练代码
CUDA_VISIBLE_DEVICES="2,3" mpirun -np 2 python3.7 -m jittor.test.test_resnet
```
这种便捷性的背后是计图的分布式算子的支撑计图支持的mpi算子后端会使用nccl进行进一步的加速。计图所有分布式算法的开发均在Python前端完成这让分布式算法的灵活度增强开发分布式算法的难度也大大降低。
## 如何从单卡代码适配多卡代码
@ -11,6 +43,8 @@ jittor.mpi
* jittor.nn.BatchNorm* 同步batch norm
* jittor.dataset 自动数据并行
用户在使用MPI进行分布式训练时计图内部的Dataset类会自动并行分发数据需要注意的是Dataset类中设置的Batch size是**所有节点的batch size之和**也就是总batch size 不是单个节点接收到的batch size。
大部分情况下,单卡训练的代码可以直接使用`mpirun`实现分布式多卡运行。 但仍然如下几种情况下,需要对代码进行调整:
1. 对硬盘进行写操作(保存模型,保存曲线)
@ -93,7 +127,30 @@ def val(epoch):
......
```
## MPI接口
下面是 jittor 的 mpi api reference.
目前MPI开放接口如下
* `jt.in_mpi`: 当计图不在MPI环境下时`jt.mpi == False` 用户可以用这个判断是否在mpi环境下。
* `jt.world_size`: 获取当前进程总数量如果没有用mpi则为1。
* `jt.rank`: 获取当前进程的编号,区间为`0`到`jt.world_size-1` 如果没有用mpi则为0。
* `jt.mpi`: 计图的MPI模块。
* `jt.Module.mpi_param_broadcast(root=0)`: 将模块的参数从root节点广播给其他节点。
* `jt.mpi.mpi_reduce(x, op='add', root=0)`: 将所有节点的变量x使用算子opreduce到root节点。如果op是'add'或者'sum'该接口会把所有变量求和如果op是'mean',该接口会取均值。
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/tutorial/2020-5-2-16-44-distributed/mpi_reduce.png">
* `jt.mpi.mpi_broadcast(x, root=0)`: 将变量x从root节点广播到所有节点。
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/tutorial/2020-5-2-16-44-distributed/mpi_broadcast.png">
* `jt.mpi.mpi_all_reduce(x, op='add')`: 将所有节点的变量x一起进行all reduce并且把reduce的结果再次广播到所有节点。如果op是'add'或者'sum'该接口会把所有变量求和如果op是'mean',该接口会取均值。
<img src="https://cg.cs.tsinghua.edu.cn/jittor/images/tutorial/2020-5-2-16-44-distributed/mpi_all_reduce.png">
```eval_rst
.. automodule:: jittor_mpi_core
@ -103,3 +160,56 @@ def val(epoch):
:members:
:undoc-members:
```
## 实例MPI实现分布式同步批归一化层
下面的代码是使用计图实现分布式同步批归一化层的实例代码在原来批归一化层的基础上只需增加三行代码就可以实现分布式的batch norm添加的代码如下
```python
# 将均值和方差通过all reduce同步到所有节点
if self.sync and jt.mpi:
xmean = xmean.mpi_all_reduce("mean")
x2mean = x2mean.mpi_all_reduce("mean")
```
> 注:计图内部已经实现了同步的批归一化层,用户不需要自己实现
分布式同步批归一化层的完整代码:
```python
class BatchNorm(Module):
def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=None, is_train=True, sync=True):
assert affine == None
self.sync = sync
self.num_features = num_features
self.is_train = is_train
self.eps = eps
self.momentum = momentum
self.weight = init.constant((num_features,), "float32", 1.0)
self.bias = init.constant((num_features,), "float32", 0.0)
self.running_mean = init.constant((num_features,), "float32", 0.0).stop_grad()
self.running_var = init.constant((num_features,), "float32", 1.0).stop_grad()
def execute(self, x):
if self.is_train:
xmean = jt.mean(x, dims=[0,2,3], keepdims=1)
x2mean = jt.mean(x*x, dims=[0,2,3], keepdims=1)
# 将均值和方差通过all reduce同步到所有节点
if self.sync and jt.mpi:
xmean = xmean.mpi_all_reduce("mean")
x2mean = x2mean.mpi_all_reduce("mean")
xvar = x2mean-xmean*xmean
norm_x = (x-xmean)/jt.sqrt(xvar+self.eps)
self.running_mean += (xmean.sum([0,2,3])-self.running_mean)*self.momentum
self.running_var += (xvar.sum([0,2,3])-self.running_var)*self.momentum
else:
running_mean = self.running_mean.broadcast(x, [0,2,3])
running_var = self.running_var.broadcast(x, [0,2,3])
norm_x = (x-running_mean)/jt.sqrt(running_var+self.eps)
w = self.weight.broadcast(x, [0,2,3])
b = self.bias.broadcast(x, [0,2,3])
return norm_x * w + b
```

View File

@ -10,7 +10,7 @@ jittor.nn
.. automodule:: jittor.nn
:imported-members:
:members: Pool, pool, AdaptiveAvgPool2d
:members: Pool, pool, AdaptiveAvgPool2d, Pool3d, AdaptiveMaxPool2d, AdaptiveAvgPool3d, AdaptiveMaxPool2d, pool3d, AvgPool2d, AvgPool3d, avg_pool2d, MaxPool2d, MaxPool3d, max_pool2d, max_pool3d, MaxUnpool2d, MaxUnpool3d
:undoc-members:
.. autoclass:: jittor.nn.ReLU

View File

@ -1,84 +0,0 @@
// ***************************************************************
// Copyright (c) 2021 Jittor. All Rights Reserved.
// Maintainers:
// Guowei Yang <471184555@qq.com>
// Dun Liang <randonlang@gmail.com>.
//
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "var.h"
#include "cublas_matmul_op.h"
#include "cublas_warper.h"
using namespace std;
namespace jittor {
#ifndef JIT
// Construct a 2-D cuBLAS matmul op computing c = op(a) * op(b), where
// op(.) is an optional transpose selected by trans_a / trans_b.
CublasMatmulOp::CublasMatmulOp(Var* a, Var* b, bool trans_a, bool trans_b)
    : a(a), b(b), trans_a(trans_a), trans_b(trans_b) {
    // TODO: support int8 * int8
    // Bug fix: this assertion checks that both inputs are floating point,
    // but its message previously claimed the inputs merely had to match.
    ASSERT(a->dtype().is_float() && b->dtype().is_float()) << "inputs of cublas matmul should be float type";
    // TODO: support different input type
    ASSERT(a->dtype().dsize() == b->dtype().dsize()) << "type of two inputs should be the same";
    // Output inherits a's dtype; its shape is filled in by infer_shape().
    c = create_output(nullptr, a->dtype());
}
// Derive the output shape of the 2-D matmul, honoring the transpose flags.
void CublasMatmulOp::infer_shape() {
    // Only 2-D operands are supported by this op.
    ASSERTop(a->shape.size(),==,2);
    ASSERTop(b->shape.size(),==,2);
    int n = a->shape[0], m = a->shape[1];
    int m_ = b->shape[0], k = b->shape[1];
    if (trans_a) {
        swap(n, m);
    }
    if (trans_b) {
        swap(m_, k);
    }
    // Inner dimensions must agree: op(a) is [n,m], op(b) is [m,k].
    ASSERTop(m,==,m_);
    c->set_shape({n, k});
}
// Build the JIT specialization key: element type, transpose flags, and the
// gemm variant ('S'ingle vs 'D'ouble precision, chosen by element size).
void CublasMatmulOp::jit_prepare(JK& jk) {
    jk << _CS("[T:") << a->dtype();
    jk << _CS("][Trans_a:") << (trans_a ? 'T' : 'N');
    jk << _CS("][Trans_b:") << (trans_b ? 'T' : 'N');
    jk << _CS("][op:") << (a->dtype().dsize() == 4 ? 'S' : 'D');
    jk << ']';
}
#else // JIT
#ifdef JIT_cpu
#pragma clang diagnostic ignored "-Wtautological-compare"
// JIT-specialized matmul via cuBLAS. @T, @Trans_a, @Trans_b and @op are
// placeholders substituted from the key built in jit_prepare().
void CublasMatmulOp::jit_run() {
    cublasHandle_t& handle_ = cublas_handle;
    const T alpha = 1.0f;
    const T beta = 0.0f;
    const auto& as = a->shape;
    const auto& bs = b->shape;
    auto n = as[0];
    auto m = as[1];
    auto k = bs[1];
    // Effective (post-transpose) dimensions.
    if ('@Trans_a'=='T') {
        n = as[1];
        m = as[0];
    }
    if ('@Trans_b'=='T') {
        k = bs[0];
    }
    // a: [n,m], b: [m,k], c: [n,k]
    // cuBLAS is column-major, so c^T = op(b)^T * op(a)^T is computed by
    // swapping the operand order and transpose flags.
    checkCudaErrors(cublas@op@@gemm(handle_,
    CUBLAS_OP_@Trans_b, CUBLAS_OP_@Trans_a,
    k, n, m, &alpha,
    b->ptr<T>(), '@Trans_b' == 'N' ? k : m,
    a->ptr<T>(), '@Trans_a' == 'N' ? m : n, &beta,
    c->ptr<T>(), k));
}
#endif
#endif // JIT
} // jittor

View File

@ -1,42 +0,0 @@
/**
* Copyright 2014 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include "helper_cuda.h"
#include "fp16_dev.h"
#define BLOCK_SIZE 128
// CUDA kernel: convert one element per thread from value_type to half,
// using round-to-nearest-even.
template <class value_type>
__global__ void float2half_rn_kernel(int size, const value_type *buffIn, half1 *buffOut)
{
    const int idx = BLOCK_SIZE*blockIdx.x+threadIdx.x;
    // Guard the tail block: threads past the end do nothing.
    if (idx >= size) {
        return;
    }
#if CUDART_VERSION < 9000
    // Before CUDA 9, half was a plain struct with an .x storage member.
    half1 val;
    val.x = __float2half_rn(float(buffIn[idx]));
#else
    half1 val = __float2half_rn(float(buffIn[idx]));
#endif
    buffOut[idx] = val;
}
// Host-side launcher: convert `size` elements to half on the GPU and block
// until the conversion has finished.
template <class value_type>
void gpu_float2half_rn(int size, const value_type *buffIn, half1 *buffOut)
{
    // Round the grid up so every element is covered.
    int grid_size = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
    float2half_rn_kernel<value_type><<<grid_size, BLOCK_SIZE>>> (size, buffIn, buffOut);
    checkCudaErrors(cudaDeviceSynchronize());
}
template void gpu_float2half_rn<float> (int, const float*, half1*);
template void gpu_float2half_rn<double> (int, const double*, half1*);

View File

@ -1,113 +0,0 @@
// ***************************************************************
// Copyright (c) 2021 Jittor.
// All Rights Reserved.
// Maintainers:
// Dun Liang <randonlang@gmail.com>.
//
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include "mpi_warper.h"
#include "common.h"
#include "ops/array_op.h"
char jt_mpi_err_buffer[MPI_MAX_ERROR_STRING];
// Translate an MPI error code into a fatal Jittor log message that includes
// the failing call, file and line. Intended for use by the MPI_CHECK macro.
void throw_mpi_error(int result,
    char const *const func, const char *const file, int const line) {
    int resultlen;
    MPI_Error_string(result, jt_mpi_err_buffer, &resultlen);
    LOGf << "MPI error at " >> file >> ":" >> line << "code="
        >> result >> '(' >> jt_mpi_err_buffer >> ')' << func;
}
namespace jittor {
int mpi_world_size = 1;
int mpi_world_rank = 0;
int mpi_local_rank = 0;
bool inside_mpi = false;
bool mpi_enabled = false;
// Return the MPI world size, or 1 when running outside an MPI launch.
int _mpi_world_size() {
    return mpi_enabled ? mpi_world_size : 1;
}

// Return this process's global rank, or 0 when running outside MPI.
int _mpi_world_rank() {
    return mpi_enabled ? mpi_world_rank : 0;
}

// Return this process's rank within its host, or 0 when running outside MPI.
int _mpi_local_rank() {
    return mpi_enabled ? mpi_local_rank : 0;
}
// Broadcast the raw bytes of `args` from rank `root` to every rank.
// No-op when not running under MPI.
void _mpi_broadcast(ArrayArgs&& args, int root) {
    if (!mpi_enabled) return;
    // Byte count = element size times the product of all shape dims.
    int64 size = args.dtype.dsize();
    for (auto j : args.shape)
        size *= j;
    MPI_CHECK(MPI_Bcast((void *)args.ptr, size, MPI_BYTE, root, MPI_COMM_WORLD));
}
// Hash a hostname so ranks can cheaply compare which host they live on.
static uint64_t getHostHash(const char* string) {
    // Based on DJB2, result = result * 33 + char
    uint64_t result = 5381;
    for (int c = 0; string[c] != '\0'; c++){
        result = ((result << 5) + result) + string[c];
    }
    return result;
}
// Copy this machine's hostname into `hostname`, truncating at the first '.'
// so FQDN and short forms of the same host compare equal.
static void getHostName(char* hostname, int maxlen) {
    gethostname(hostname, maxlen);
    for (int i=0; i< maxlen; i++) {
        if (hostname[i] == '.') {
            hostname[i] = '\0';
            return;
        }
    }
}
// Static-lifetime helper: initializes MPI when the process was launched by
// mpirun (detected via OMPI_COMM_WORLD_SIZE) and finalizes it at exit.
struct mpi_initer {

mpi_initer() {
    inside_mpi = !!getenv("OMPI_COMM_WORLD_SIZE");
    if (!inside_mpi) return;
    mpi_enabled = true;
    LOGvv << "MPI init...";
    MPI_CHECK(MPI_Init(NULL, NULL));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpi_world_size));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_world_rank));
    //calculating localRank based on hostname which is used in selecting a GPU
    // Bug fix: the hash array needs one slot per rank. It was previously
    // sized with mpi_world_rank, which under-allocates (rank < size): the
    // write below and the MPI_Allgather that fills all mpi_world_size slots
    // both overran the buffer.
    uint64_t hostHashs[mpi_world_size];
    char hostname[1024];
    getHostName(hostname, 1024);
    hostHashs[mpi_world_rank] = getHostHash(hostname);
    // Exchange every rank's host hash; local rank = number of lower-ranked
    // processes that share this host.
    MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
    mpi_local_rank = 0;
    for (int p=0; p<mpi_world_size; p++) {
        if (p == mpi_world_rank) break;
        if (hostHashs[p] == hostHashs[mpi_world_rank]) mpi_local_rank++;
    }
    LOGv << "MPI init finished: local" << mpi_local_rank
        << "global" << mpi_world_rank
        << "size" << mpi_world_size;
}

~mpi_initer() {
    if (!inside_mpi) return;
    MPI_CHECK(MPI_Finalize());
}

};
static mpi_initer mpi_init;
} // jittor

View File

@ -1,7 +0,0 @@
# Entry point: copy the converted Jittor notebooks into the notebook
# directory and launch a Jupyter server there, forwarding CLI arguments.
from .md_to_ipynb import dirname, notebook_dir
import os
import sys
# NOTE(review): sys.argv is interpolated into a shell string unquoted — this
# looks vulnerable to shell injection if untrusted arguments are passed;
# confirm this entry point only ever receives trusted local CLI args.
cmd = f"cp -r {dirname}/* {notebook_dir}/ && cd {notebook_dir} && jupyter notebook {' '.join(sys.argv[1:])}"
print("run cmd:", cmd)
os.system(cmd)

File diff suppressed because it is too large Load Diff

7995
python/jittor/__init__.pyi Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Guowei Yang <471184555@qq.com>
# Dun Liang <randonlang@gmail.com>.
@ -9,168 +9,575 @@
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import jittor as jt
from jittor import init, Module, nn
import numpy as np
from typing import Optional, Tuple, List
import warnings
import math
import jittor as jt
from jittor import Var
from jittor.nn import Module, Linear, softmax, pad, linear, dropout
from jittor.init import xavier_uniform_, xavier_gauss_, constant_
def _canonical_mask(
    mask: Optional[Var],
    mask_name: str,
    other_type,
    other_name: str,
    target_type,
    check_other: bool = True,
) -> Optional[Var]:
    """Validate an attention mask and canonicalize it to a float mask.

    Accepts either a boolean mask or a floating additive mask. Boolean
    masks are converted to ``target_type`` with ``-inf`` written at the
    ``True`` positions so the result can be added to attention scores.

    Args:
        mask: the mask to canonicalize, or None.
        mask_name: name used in error/warning messages.
        other_type: dtype of a companion mask to cross-check, or None.
        other_name: companion mask's name for the mismatch warning.
        target_type: float dtype for the converted mask.
        check_other: whether to warn on mask/companion dtype mismatch.

    Returns:
        The canonicalized mask, or None if ``mask`` was None.
    """
    if mask is not None:
        _mask_dtype = mask.dtype
        _mask_is_float = mask.dtype == jt.float16 or mask.dtype == jt.float32 or mask.dtype == jt.float64
        if _mask_dtype != jt.bool and not _mask_is_float:
            raise AssertionError(
                f"only bool and floating types of {mask_name} are supported")
        if check_other and other_type is not None:
            if _mask_dtype != other_type:
                warnings.warn(
                    f"Support for mismatched {mask_name} and {other_name} "
                    "is deprecated. Use same type for both instead."
                )
        if not _mask_is_float:
            # bool -> additive float mask: 0 where attended, -inf where True.
            # WARNING(514flowey): Check Here
            new_mask = jt.zeros_like(mask, dtype=target_type)
            new_mask[mask] = float("-inf")
            mask = new_mask
    return mask
def _none_or_dtype(input: Optional[Var]):
    """Return ``input.dtype``, or None when ``input`` is None.

    Raises:
        RuntimeError: if ``input`` is neither None nor a ``jt.Var``.
    """
    if input is None:
        return None
    elif isinstance(input, jt.Var):
        return input.dtype
    # Bug fix: this is a Jittor module, but the error message previously
    # referred to torch.Tensor (copied from the PyTorch original).
    raise RuntimeError("input to _none_or_dtype() must be None or jt.Var")
def baddbmm(input_var:jt.Var, batch1:jt.Var, batch2:jt.Var, beta=1, alpha=1) -> jt.Var:
    """Batched matmul with an additive term, mirroring ``torch.baddbmm``:
    ``beta * input_var + alpha * (batch1 @ batch2)``.

    Note: ``beta == 0`` is not special-cased; ``input_var`` always
    participates in the computation.
    """
    # WARNING(514flowey): Check here
    scaled_input = beta * input_var
    scaled_product = alpha * (batch1 @ batch2)
    return scaled_input + scaled_product
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> jt.Var:
    """Compute ``softmax(q @ k^T * scale + bias) @ v``.

    Args:
        query, key, value: attention inputs, shape (..., seq, head_dim).
        attn_mask: optional mask; a bool mask marks positions to KEEP
            (torch semantics), a float mask is added to the scores.
        dropout_p: dropout probability applied to attention weights.
        is_causal: apply a lower-triangular causal mask
            (``attn_mask`` must be None in that case).
        scale: optional override for the default 1/sqrt(head_dim) factor.
    """
    # Efficient implementation equivalent to the following:
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_bias = jt.zeros(L, S, dtype=query.dtype)
    if is_causal:
        assert attn_mask is None
        temp_mask = jt.ones(L, S, dtype=jt.bool).tril(diagonal=0)
        attn_bias[jt.logical_not(temp_mask)] = float("-inf")
        # attn_bias.to(query.dtype)
        attn_bias = jt.array(attn_bias, query.dtype)
    if attn_mask is not None:
        if attn_mask.dtype == jt.bool:
            # Bug fix: this previously masked with `temp_mask`, which is only
            # defined on the is_causal path (NameError here) and is the wrong
            # mask anyway. A boolean attn_mask marks positions to keep, so
            # mask out its logical complement.
            attn_bias[jt.logical_not(attn_mask)] = float("-inf")
        else:
            attn_bias += attn_mask
    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    attn_weight += attn_bias
    attn_weight = softmax(attn_weight, dim=-1)
    attn_weight = dropout(attn_weight, dropout_p, is_train=True)
    return attn_weight @ value
def _mha_shape_check(query: Var, key: Var, value: Var,
                     key_padding_mask: Optional[Var], attn_mask: Optional[Var], num_heads: int):
    """Check that multi-head-attention inputs have consistent ranks.

    Returns:
        bool: True when ``query`` is batched (3-D), False when it is
        unbatched (2-D).

    Raises:
        AssertionError: when any of key/value/key_padding_mask/attn_mask
        has a rank inconsistent with ``query``'s.
    """
    if query.dim() == 3:
        # Batched inputs: (seq, batch, embed).
        is_batched = True
        assert key.dim() == 3 and value.dim() == 3, \
            ("For batched (3-D) `query`, expected `key` and `value` to be 3-D"
             f" but found {key.dim()}-D and {value.dim()}-D tensors respectively")
        if key_padding_mask is not None:
            assert key_padding_mask.dim() == 2, \
                ("For batched (3-D) `query`, expected `key_padding_mask` to be `None` or 2-D"
                 f" but found {key_padding_mask.dim()}-D tensor instead")
        if attn_mask is not None:
            assert attn_mask.dim() in (2, 3), \
                ("For batched (3-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
                 f" but found {attn_mask.dim()}-D tensor instead")
    elif query.dim() == 2:
        # Unbatched inputs: (seq, embed).
        is_batched = False
        assert key.dim() == 2 and value.dim() == 2, \
            ("For unbatched (2-D) `query`, expected `key` and `value` to be 2-D"
             f" but found {key.dim()}-D and {value.dim()}-D tensors respectively")
        if key_padding_mask is not None:
            assert key_padding_mask.dim() == 1, \
                ("For unbatched (2-D) `query`, expected `key_padding_mask` to be `None` or 1-D"
                 f" but found {key_padding_mask.dim()}-D tensor instead")
        if attn_mask is not None:
            assert attn_mask.dim() in (2, 3), \
                ("For unbatched (2-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
                 f" but found {attn_mask.dim()}-D tensor instead")
            if attn_mask.dim() == 3:
                # A 3-D mask on unbatched input must be per-head.
                expected_shape = (num_heads, query.shape[0], key.shape[0])
                assert attn_mask.shape == expected_shape, \
                    (f"Expected `attn_mask` shape to be {expected_shape} but got {attn_mask.shape}")
    else:
        raise AssertionError(
            f"query should be unbatched 2D or batched 3D tensor but received {query.dim()}-D query tensor")
    return is_batched
def _in_projection_packed(
    q: Var,
    k: Var,
    v: Var,
    w: Var,
    b: Optional[Var] = None,
) -> List[Var]:
    """Project q, k and v with a single packed weight.

    Specializes the projection for self-attention (``q is k is v``, one
    matmul) and encoder-decoder attention (``k is v``, two matmuls) to
    avoid redundant work.

    Args:
        q, k, v: input tensors whose last dim is the embedding size E.
        w: packed projection weight of shape (3E, E).
        b: optional packed projection bias of shape (3E,).

    Returns:
        The projected [q, k, v] tensors.
    """
    E = q.size(-1)
    if k is v:
        if q is k:
            # self-attention
            proj = linear(q, w, b)
            # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk()
            # proj = proj.unflatten(-1, (3, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
            nshape = proj.shape[:-1] + (3, E)
            proj = proj.reshape(nshape).unsqueeze(0).transpose(0, -2).squeeze(-2)
            return proj[0], proj[1], proj[2]
        else:
            # encoder-decoder attention
            w_q, w_kv = w.split([E, E * 2])
            if b is None:
                b_q = b_kv = None
            else:
                b_q, b_kv = b.split([E, E * 2])
            q_proj = linear(q, w_q, b_q)
            kv_proj = linear(k, w_kv, b_kv)
            # reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk()
            # kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).transpose(0, -2).squeeze(-2).contiguous()
            nshape = kv_proj.shape[:-1] + (2, E)
            kv_proj = kv_proj.reshape(nshape).unsqueeze(0).transpose(0, -2).squeeze(-2)
            return (q_proj, kv_proj[0], kv_proj[1])
    else:
        # Fully distinct q/k/v: split the packed weight into three parts.
        w_q, w_k, w_v = w.chunk(3)
        if b is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = b.chunk(3)
        return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
def _in_projection(
    q: Var,
    k: Var,
    v: Var,
    w_q: Var,
    w_k: Var,
    w_v: Var,
    b_q: Optional[Var] = None,
    b_k: Optional[Var] = None,
    b_v: Optional[Var] = None,
) -> Tuple[Var, Var, Var]:
    """Project q, k and v with separate weights (and optional biases).

    Shape contract (checked below): ``w_q`` is (Eq, Eq), ``w_k`` is
    (Eq, Ek), ``w_v`` is (Eq, Ev), each bias is (Eq,) — every projection
    maps into the query embedding dimension Eq.

    Returns:
        The projected (q, k, v) tensors.
    """
    Eq, Ek, Ev = q.size(-1), k.size(-1), v.size(-1)
    assert w_q.shape == (Eq, Eq), f"expecting query weights shape of {(Eq, Eq)}, but got {w_q.shape}"
    assert w_k.shape == (Eq, Ek), f"expecting key weights shape of {(Eq, Ek)}, but got {w_k.shape}"
    assert w_v.shape == (Eq, Ev), f"expecting value weights shape of {(Eq, Ev)}, but got {w_v.shape}"
    assert b_q is None or b_q.shape == (Eq,), f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
    assert b_k is None or b_k.shape == (Eq,), f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
    assert b_v is None or b_v.shape == (Eq,), f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
    return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
def multi_head_attention_forward(
    query: Var,
    key: Var,
    value: Var,
    embed_dim_to_check: int,
    num_heads: int,
    in_proj_weight: Optional[Var],
    in_proj_bias: Optional[Var],
    bias_k: Optional[Var],
    bias_v: Optional[Var],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Var,
    out_proj_bias: Optional[Var],
    training: bool = True,
    key_padding_mask: Optional[Var] = None,
    need_weights: bool = True,
    attn_mask: Optional[Var] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Var] = None,
    k_proj_weight: Optional[Var] = None,
    v_proj_weight: Optional[Var] = None,
    static_k: Optional[Var] = None,
    static_v: Optional[Var] = None,
    average_attn_weights: bool = True,
    is_causal: bool = False,
) -> Tuple[Var, Optional[Var]]:
    """Jittor port of ``torch.nn.functional.multi_head_attention_forward``.

    Projects query/key/value, runs (optionally masked/causal) scaled
    dot-product attention over ``num_heads`` heads, then applies the
    output projection. Inputs are sequence-first: (len, bsz, embed_dim);
    2-D unbatched inputs are accepted and a batch dim is faked.

    Returns:
        (attn_output, attn_output_weights) — weights are None when
        ``need_weights`` is False, and are averaged over heads when
        ``average_attn_weights`` is True.
    """
    is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
    # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
    # is batched, run the computation and before returning squeeze the
    # batch dimension so that the output doesn't carry this temporary batch dimension.
    if not is_batched:
        # unsqueeze if the input is unbatched
        query = query.unsqueeze(1)
        key = key.unsqueeze(1)
        value = value.unsqueeze(1)
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask.unsqueeze(0)
    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape
    # canonicalize key_padding_mask to an additive float mask
    key_padding_mask = _canonical_mask(
        mask=key_padding_mask,
        mask_name="key_padding_mask",
        other_type=_none_or_dtype(attn_mask),
        other_name="attn_mask",
        target_type=query.dtype
    )
    if is_causal and attn_mask is None:
        raise RuntimeError(
            "Need attn_mask if specifying the is_causal hint. "
            "You may use the Transformer module method "
            "`generate_square_subsequent_mask` to create this mask."
        )
    if is_causal and key_padding_mask is None and not need_weights:
        # when we have a kpm or need weights, we need attn_mask
        # Otherwise, we use the is_causal hint go as is_causal
        # indicator to SDPA.
        attn_mask = None
    else:
        attn_mask = _canonical_mask(
            mask=attn_mask,
            mask_name="attn_mask",
            other_type=None,
            other_name="",
            target_type=query.dtype,
            check_other=False,
        )
        if key_padding_mask is not None:
            # We have the attn_mask, and use that to merge kpm into it.
            # Turn off use of is_causal hint, as the merged mask is no
            # longer causal.
            is_causal = False
    assert embed_dim == embed_dim_to_check, \
        f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
    if isinstance(embed_dim, jt.Var):
        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
    else:
        head_dim = embed_dim // num_heads
    assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
    if use_separate_proj_weight:
        # allow MHA to have different embedding dimensions when separate projection weights are used
        assert key.shape[:2] == value.shape[:2], \
            f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
    else:
        assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
    #
    # compute in-projection
    #
    if not use_separate_proj_weight:
        assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
        q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
    else:
        assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
        assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
        assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
        if in_proj_bias is None:
            b_q = b_k = b_v = None
        else:
            b_q, b_k, b_v = in_proj_bias.chunk(3)
        q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
    # prep attention mask
    if attn_mask is not None:
        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            if attn_mask.shape != correct_2d_size:
                raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
            attn_mask = attn_mask.unsqueeze(0)
        elif attn_mask.dim() == 3:
            correct_3d_size = (bsz * num_heads, tgt_len, src_len)
            if attn_mask.shape != correct_3d_size:
                raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
        else:
            raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
        k = jt.concat([k, bias_k.repeat(1, bsz, 1)])
        v = jt.concat([v, bias_v.repeat(1, bsz, 1)])
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))
    else:
        assert bias_k is None
        assert bias_v is None
    #
    # reshape q, k, v for multihead attention and make em batch first
    #
    q = q.view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
    else:
        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
        assert static_k.size(0) == bsz * num_heads, \
            f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
        assert static_k.size(2) == head_dim, \
            f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
        k = static_k
    if static_v is None:
        v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
    else:
        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
        assert static_v.size(0) == bsz * num_heads, \
            f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
        assert static_v.size(2) == head_dim, \
            f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
        v = static_v
    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * num_heads, 1, head_dim)
        k = jt.concat([k, jt.zeros(zero_attn_shape, dtype=k.dtype)], dim=1)
        v = jt.concat([v, jt.zeros(zero_attn_shape, dtype=v.dtype)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))
    # update source sequence length after adjustments
    src_len = k.size(1)
    # merge key padding and attention masks
    if key_padding_mask is not None:
        assert key_padding_mask.shape == (bsz, src_len), \
            f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
        key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \
            expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
        if attn_mask is None:
            attn_mask = key_padding_mask
        else:
            attn_mask = attn_mask + key_padding_mask
    # adjust dropout probability
    if not training:
        dropout_p = 0.0
    #
    # (deep breath) calculate attention and out projection
    #
    if need_weights:
        # explicit path: materialize the attention matrix so it can be
        # returned to the caller
        B, Nt, E = q.shape
        q_scaled = q / math.sqrt(E)
        assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
        if attn_mask is not None:
            attn_output_weights = baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
        else:
            attn_output_weights = jt.bmm(q_scaled, k.transpose(-2, -1))
        attn_output_weights = softmax(attn_output_weights, dim=-1)
        if dropout_p > 0.0:
            attn_output_weights = dropout(attn_output_weights, p=dropout_p)
        attn_output = jt.bmm(attn_output_weights, v)
        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
        attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
        # optionally average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        if average_attn_weights:
            attn_output_weights = attn_output_weights.mean(dim=1)
        if not is_batched:
            # squeeze the output if input was unbatched
            attn_output = attn_output.squeeze(1)
            attn_output_weights = attn_output_weights.squeeze(0)
        return attn_output, attn_output_weights
    else:
        # attn_mask can be either (L,S) or (N*num_heads, L, S)
        # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S)
        # in order to match the input for SDPA of (N, num_heads, L, S)
        if attn_mask is not None:
            if attn_mask.size(0) == 1 and attn_mask.dim() == 3:
                attn_mask = attn_mask.unsqueeze(0)
            else:
                attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
        q = q.view(bsz, num_heads, tgt_len, head_dim)
        k = k.view(bsz, num_heads, src_len, head_dim)
        v = v.view(bsz, num_heads, src_len, head_dim)
        attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
        attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
        attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
        attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
        if not is_batched:
            # squeeze the output if input was unbatched
            attn_output = attn_output.squeeze(1)
        return attn_output, None
# NOTE(review): as captured, this class is a diff-view overlay of TWO different
# implementations (a fairseq-style one with self_attention/encoder_decoder_attention
# and a torch-style one with batch_first/average_attn_weights). Evidence below:
# two `def __init__` signatures, `out_proj` assigned twice, both
# `reset_parameters()` and `_reset_parameters()` called, and a stray second
# signature fragment inside `execute`. The duplicated halves must be
# disentangled against the repository history before this block is trusted.
class MultiheadAttention(Module):
# fairseq-style constructor signature (first of two -- see note above).
def __init__(
self,
embed_dim,
num_heads,
kdim=None,
vdim=None,
dropout=0.0,
bias=True,
add_bias_kv=False,
add_zero_attn=False,
self_attention=False,
encoder_decoder_attention=False,
q_noise=0.0,
qn_block_size=8,
):
# torch-style class-level declarations (belong to the second variant).
__constants__ = ['batch_first']
bias_k: Optional[jt.Var]
bias_v: Optional[jt.Var]
# torch-style constructor signature (second of two; in a merged file this
# definition would shadow the one above).
def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False,
kdim=None, vdim=None, batch_first=False, dtype=jt.float32) -> None:
if embed_dim <= 0 or num_heads <= 0:
raise ValueError(
f"embed_dim and num_heads must be greater than 0,"
f" got embed_dim={embed_dim} and num_heads={num_heads} instead"
)
factory_kwargs = {'dtype': dtype}
super().__init__()
self.embed_dim = embed_dim
# kdim/vdim default to embed_dim when not given.
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
# NOTE(review): qkv_same_dim and _qkv_same_embed_dim are the same predicate
# computed twice -- one flag per merged variant.
self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
self.num_heads = num_heads
assert dropout==0, "TODO: dropout>0"
self.dropout = dropout
self.batch_first = batch_first
self.head_dim = embed_dim // num_heads
# Divisibility asserted twice (once per merged variant).
assert (self.head_dim * num_heads == self.embed_dim), "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.self_attention = self_attention
self.encoder_decoder_attention = encoder_decoder_attention
# Separate q/k/v projection weights when key/value dims differ from
# embed_dim; otherwise a single packed in_proj_weight of shape (3E, E).
if not self._qkv_same_embed_dim:
self.q_proj_weight = jt.empty((embed_dim, embed_dim), **factory_kwargs)
self.k_proj_weight = jt.empty((embed_dim, self.kdim), **factory_kwargs)
self.v_proj_weight = jt.empty((embed_dim, self.vdim), **factory_kwargs)
self.in_proj_weight = None
else:
self.q_proj_weight = None
self.k_proj_weight = None
self.v_proj_weight = None
self.in_proj_weight = jt.empty((3 * embed_dim, embed_dim), **factory_kwargs)
assert not self.self_attention or self.qkv_same_dim, ("Self-attention requires query, key and " "value to be of the same size")
if bias:
self.in_proj_bias = jt.empty(3 * embed_dim, **factory_kwargs)
else:
self.in_proj_bias = None
# NOTE(review): out_proj is assigned twice below (Linear vs nn.Linear) --
# one assignment per merged variant; only the second would take effect.
self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
#TODO: quant_noise
self.k_proj = nn.Linear(self.kdim, embed_dim, bias=bias)
self.v_proj = nn.Linear(self.vdim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
assert not add_bias_kv, "TODO: add_bias_kv=True"
# NOTE(review): bias_k/bias_v are set to None, then conditionally
# re-created, then set to None again -- interleaved old/new lines.
self.bias_k = self.bias_v = None
if add_bias_kv:
self.bias_k = jt.empty((1, 1, embed_dim), **factory_kwargs)
self.bias_v = jt.empty((1, 1, embed_dim), **factory_kwargs)
else:
self.bias_k = self.bias_v = None
self.add_zero_attn = add_zero_attn
# Both variants' initialisers are invoked here (duplicate work).
self.reset_parameters()
self._reset_parameters()
self.onnx_trace = False
self.tpu = False
def reset_parameters(self):
# fairseq-style initialisation (scaled Xavier for packed q/k/v).
if self.qkv_same_dim:
# Empirically observed the convergence to be much better with
# the scaled initialization
init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
def _reset_parameters(self):
# torch-style initialisation; NOTE(review): the else-branch below mixes
# init.xavier_uniform_ on proj layer weights with bare xavier_uniform_ on
# the separate weight Vars -- again two variants overlaid.
if self._qkv_same_embed_dim:
xavier_uniform_(self.in_proj_weight)
else:
init.xavier_uniform_(self.k_proj.weight)
init.xavier_uniform_(self.v_proj.weight)
init.xavier_uniform_(self.q_proj.weight)
xavier_uniform_(self.q_proj_weight)
xavier_uniform_(self.k_proj_weight)
xavier_uniform_(self.v_proj_weight)
# init.xavier_uniform_(self.out_proj.weight)
if self.out_proj.bias is not None:
init.constant_(self.out_proj.bias, 0.)
if self.in_proj_bias is not None:
constant_(self.in_proj_bias, 0.)
constant_(self.out_proj.bias, 0.)
if self.bias_k is not None:
init.xavier_normal_(self.bias_k)
xavier_gauss_(self.bias_k)
if self.bias_v is not None:
init.xavier_normal_(self.bias_v)
xavier_gauss_(self.bias_v)
def __setstate__(self, state):
# Support loading old MultiheadAttention checkpoints generated by v1.1.0
if '_qkv_same_embed_dim' not in state:
state['_qkv_same_embed_dim'] = True
super().__setstate__(state)
# fairseq-style execute signature; the torch-style signature appears as a
# stray fragment a few lines below (diff overlay).
def execute(
self,
query,
key = None,
value = None,
key_padding_mask = None,
incremental_state = None,
need_weights = True,
static_kv = False,
attn_mask = None,
before_softmax = False,
need_head_weights = False,
):
if need_head_weights:
need_weights = True
# NOTE(review): the following nine lines are the torch-style signature of
# the second variant, left dangling inside the body by the diff overlay.
self,
query: Var,
key: Var,
value: Var,
key_padding_mask: Optional[Var] = None,
need_weights: bool = True,
attn_mask: Optional[Var] = None,
average_attn_weights: bool = True,
is_causal : bool = False) -> Tuple[Var, Optional[Var]]:
# Expects (tgt_len, bsz, embed_dim) layout here.
tgt_len, bsz, embed_dim = query.shape
assert embed_dim == self.embed_dim
assert list(query.shape) == [tgt_len, bsz, embed_dim]
#####
# Fast Path is not Supported.
#####
assert incremental_state is None, "TODO: incremental_state is not None"
saved_state = None
is_batched = query.dim() == 3
# fairseq-style projection dispatch.
if self.self_attention:
q = self.q_proj(query)
k = self.k_proj(query)
v = self.v_proj(query)
elif self.encoder_decoder_attention:
# encoder-decoder attention
q = self.q_proj(query)
if key is None:
assert value is None
k = v = None
# torch-style mask canonicalisation (second variant).
key_padding_mask = _canonical_mask(
mask=key_padding_mask,
mask_name="key_padding_mask",
other_type=_none_or_dtype(attn_mask),
other_name="attn_mask",
target_type=query.dtype
)
attn_mask = _canonical_mask(
mask=attn_mask,
mask_name="attn_mask",
other_type=None,
other_name="",
target_type=query.dtype,
check_other=False,
)
if self.batch_first and is_batched:
# make sure that the transpose op does not affect the "is" property
if key is value:
if query is key:
query = key = value = query.transpose(1, 0)
else:
query, key = (x.transpose(1, 0) for x in (query, key))
value = key
else:
k = self.k_proj(key)
v = self.v_proj(key)
query, key, value = (x.transpose(1, 0) for x in (query, key, value))
# torch-style delegation to multi_head_attention_forward with separate
# projection weights.
if not self._qkv_same_embed_dim:
attn_output, attn_output_weights = multi_head_attention_forward(
query, key, value, self.embed_dim, self.num_heads,
self.in_proj_weight, self.in_proj_bias,
self.bias_k, self.bias_v, self.add_zero_attn,
self.dropout, self.out_proj.weight, self.out_proj.bias,
training=self.is_training(),
key_padding_mask=key_padding_mask, need_weights=need_weights,
attn_mask=attn_mask,
use_separate_proj_weight=True,
q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
v_proj_weight=self.v_proj_weight,
average_attn_weights=average_attn_weights,
is_causal=is_causal)
else:
assert key is not None and value is not None
# fairseq-style manual attention path (scaled q, bmm over merged
# batch*heads dimension), interleaved with the torch path below.
q = self.q_proj(query)
k = self.k_proj(key)
v = self.v_proj(value)
q = q*self.scaling
assert self.bias_k is None, "TODO: self.bias_k is not None:"
q = q.view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2)
if k is not None:
k = k.view(-1, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2)
if v is not None:
v = v.view(-1, bsz * self.num_heads, self.head_dim).transpose(1, 0, 2)
assert saved_state is None, "TODO: saved_state is not None"
assert k is not None
src_len = k.shape[1]
assert key_padding_mask is None, "TODO: key_padding_mask is not None"
assert not self.add_zero_attn, "TODO: self.add_zero_attn=True"
attn_weights = nn.bmm(q, k.transpose(0, 2, 1))
assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
assert attn_mask is None, "TODO: attn_mask is not None"
assert key_padding_mask is None, "TODO: key_padding_mask is not None"
if before_softmax:
return attn_weights, v
attn_weights_float = nn.softmax(attn_weights, dim=-1)
attn_weights = attn_weights_float.type_as(attn_weights)
assert v is not None
attn = nn.bmm(attn_weights, v)
assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
if self.onnx_trace and attn.shape[1] == 1:
# when ONNX tracing a single decoder step (sequence length == 1)
# the transpose is a no-op copy before view, thus unnecessary
attn = attn.view(tgt_len, bsz, embed_dim)
# torch-style delegation with packed in_proj weights (second variant).
attn_output, attn_output_weights = multi_head_attention_forward(
query, key, value, self.embed_dim, self.num_heads,
self.in_proj_weight, self.in_proj_bias,
self.bias_k, self.bias_v, self.add_zero_attn,
self.dropout, self.out_proj.weight, self.out_proj.bias,
training=self.is_training(),
key_padding_mask=key_padding_mask,
need_weights=need_weights,
attn_mask=attn_mask,
average_attn_weights=average_attn_weights,
is_causal=is_causal)
if self.batch_first and is_batched:
return attn_output.transpose(1, 0), attn_output_weights
else:
attn = attn.transpose(1, 0, 2).view(tgt_len, bsz, embed_dim)
attn = self.out_proj(attn)
attn_weights = None
if need_weights:
attn_weights = attn_weights_float.view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0, 2, 3)
if not need_head_weights:
# average attention weights over heads
attn_weights = attn_weights.mean(dims=[0])
return attn, attn_weights
return attn_output, attn_output_weights

View File

@ -0,0 +1,3 @@
from .ccl_2d import ccl_2d
from .ccl_3d import ccl_3d
from .ccl_link import ccl_link

177
python/jittor/ccl/ccl_2d.py Normal file
View File

@ -0,0 +1,177 @@
import jittor as jt
def ccl_2d(data_2d):
    '''
    2D connected component labelling, original code from https://github.com/DanielPlayne/playne-equivalence-algorithm

    GPU label-equivalence algorithm: every pixel starts as its own label,
    then each pixel repeatedly adopts the smallest label among its
    8-connected neighbours with the same image value (with find_root
    flattening label chains) until a full pass changes nothing.

    Args:
        [in]param data_2d: binary two-dimensional vector
        type data_2d: jittor array
    Returns:
        [out]result: labeled two-dimensional vector; background stays 0,
        surviving components are renumbered consecutively from 1
    Example:
        >>> import jittor as jt
        >>> jt.flags.use_cuda = 1
        >>> import cv2
        >>> import numpy as np
        >>> img = cv2.imread('testImg.png', 0)
        >>> a = img.mean()
        >>> img[img <= a] = 0
        >>> img[img > a] = 1
        >>> img = jt.Var(img)
        >>> result = ccl_2d(img)
        >>> print(jt.unique(result, return_counts=True, return_inverse=True)[0], jt.unique(result, return_counts=True, return_inverse=True)[2])
        >>> cv2.imwrite('testImg_result.png', result.numpy().astype(np.uint8) * 50)
    '''
    data_2d = data_2d.astype(jt.uint32)
    cY = data_2d.shape[0]
    cX = data_2d.shape[1]
    data_2d_copy = data_2d.clone()
    # in1 doubles as a device-side "changed" flag polled by the host loop.
    changed = jt.ones([1], dtype=jt.uint32)
    data_2d = data_2d.reshape(cX * cY)
    result = jt.code(data_2d.shape,
        data_2d.dtype, [data_2d, changed],
        cuda_header='''
    @alias(g_image, in0)
    @alias(g_labels, out)
    ''',
        cuda_src=r'''
    __global__ void init_labels(@ARGS_DEF, const int cX, const int cY) {
    @PRECALC
    // Calculate index
    const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
    const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
    // Fix: the launch grid is rounded up to block-size multiples, so edge
    // threads must not write (previously an out-of-bounds store).
    if((ix < cX) && (iy < cY)) {
    @g_labels(iy*cX + ix) = iy*cX + ix;
    }
    }
    __device__ __inline__ unsigned int find_root(@ARGS_DEF, unsigned int label) {
    // Resolve Label
    unsigned int next = @g_labels(label);
    // Follow chain
    while(label != next) {
    // Move to next
    label = next;
    next = @g_labels(label);
    }
    // Return label
    return label;
    }
    __global__ void resolve_labels(@ARGS_DEF, const int cX, const int cY) {
    @PRECALC
    // Calculate index
    const unsigned int id = ((blockIdx.y * blockDim.y) + threadIdx.y) * cX +
    ((blockIdx.x * blockDim.x) + threadIdx.x);
    // Check Thread Range
    if(id < cX*cY) {
    // Resolve Label
    @g_labels(id) = find_root(@ARGS, @g_labels(id));
    }
    }
    __global__ void label_equivalence(@ARGS_DEF, const int cX, const int cY) {
    @PRECALC
    // Calculate index
    const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
    const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
    // Check Thread Range
    if((ix < cX) && (iy < cY)) {
    // Get image and label values
    const unsigned char cyx = @g_image( iy*cX + ix);
    // Get neighbour labels
    const unsigned int lym1x = (iy > 0) ? @g_labels((iy-1)*cX + ix) : 0;
    const unsigned int lyxm1 = (ix > 0) ? @g_labels(iy *cX + ix-1) : 0;
    const unsigned int lyx = @g_labels(iy *cX + ix);
    const unsigned int lyxp1 = (ix < cX-1) ? @g_labels(iy *cX + ix+1) : 0;
    const unsigned int lyp1x = (iy < cY-1) ? @g_labels((iy+1)*cX + ix) : 0;
    const unsigned int lym1xm1 = (iy > 0 && ix > 0 ) ? @g_labels((iy-1)*cX + ix-1) : 0;
    const unsigned int lym1xp1 = (iy > 0 && ix < cX-1) ? @g_labels((iy-1)*cX + ix+1) : 0;
    const unsigned int lyp1xm1 = (iy < cY-1 && ix > 0 ) ? @g_labels((iy+1)*cX + ix-1) : 0;
    const unsigned int lyp1xp1 = (iy < cY-1 && ix < cX-1) ? @g_labels((iy+1)*cX + ix+1) : 0;
    const bool nym1x = (iy > 0) ? (cyx == (@g_image((iy-1)*cX + ix))) : false;
    const bool nyxm1 = (ix > 0) ? (cyx == (@g_image(iy *cX + ix-1))) : false;
    const bool nyxp1 = (ix < cX-1) ? (cyx == (@g_image(iy *cX + ix+1))) : false;
    // Fix: was (iy > cY-1), which is never true for an in-range row and
    // silently disabled merging with the neighbour below.
    const bool nyp1x = (iy < cY-1) ? (cyx == (@g_image((iy+1)*cX + ix))) : false;
    const bool nym1xm1 = (iy > 0 && ix > 0 ) ? (cyx == (@g_image((iy-1)*cX + ix-1))) : false;
    const bool nym1xp1 = (iy > 0 && ix < cX-1) ? (cyx == (@g_image((iy-1)*cX + ix+1))) : false;
    const bool nyp1xm1 = (iy < cY-1 && ix > 0 ) ? (cyx == (@g_image((iy+1)*cX + ix-1))) : false;
    const bool nyp1xp1 = (iy < cY-1 && ix < cX-1) ? (cyx == (@g_image((iy+1)*cX + ix+1))) : false;
    // Lowest label
    unsigned int label = lyx;
    // Find lowest neighbouring label
    label = ((nym1x) && (lym1x < label)) ? lym1x : label;
    label = ((nyxm1) && (lyxm1 < label)) ? lyxm1 : label;
    label = ((nyxp1) && (lyxp1 < label)) ? lyxp1 : label;
    label = ((nyp1x) && (lyp1x < label)) ? lyp1x : label;
    label = ((nym1xm1) && (lym1xm1 < label)) ? lym1xm1 : label;
    label = ((nym1xp1) && (lym1xp1 < label)) ? lym1xp1 : label;
    label = ((nyp1xm1) && (lyp1xm1 < label)) ? lyp1xm1 : label;
    label = ((nyp1xp1) && (lyp1xp1 < label)) ? lyp1xp1 : label;
    // If labels are different, resolve them
    if(label < lyx) {
    // Update label
    // Nonatomic write may overwrite another label but on average seems to give faster results
    @g_labels(lyx) = label;
    // Record the change
    @in1(0) = 1;
    }
    }
    }
    ''' + f'''
    dim3 block(32, 32);
    const int cX= {cX};
    const int cY= {cY};''' + '''
    dim3 grid(ceil(cX/(float)block.x), ceil(cY/(float)block.y));
    dim3 resolve_block(32, 32);
    dim3 resolve_grid(ceil(cX/(float)resolve_block.x), ceil(cY/(float)resolve_block.y));
    // Initialise labels
    init_labels <<< grid, block >>>(@ARGS, cX, cY);
    // Resolve the labels
    resolve_labels <<< resolve_grid, resolve_block >>>(@ARGS, cX, cY);
    // Changed Flag
    int32 changed = 1;
    // While labels have changed
    while(changed) {
    // Copy changed to device
    cudaMemsetAsync(in1_p, 0, 4);
    // Label image
    label_equivalence <<< grid, block >>>(@ARGS, cX, cY);
    // Copy changed flag to host
    cudaMemcpy(&changed, in1_p, sizeof(int32), cudaMemcpyDeviceToHost);
    // Resolve the labels
    resolve_labels <<< resolve_grid, resolve_block>>>(@ARGS, cX, cY);
    }
    ''')
    # Zero out background, then renumber surviving labels to consecutive 1..N.
    result = result.reshape((cY, cX)) * data_2d_copy
    value = jt.unique(result)
    value = value[value != 0]
    # NOTE(review): an all-zero input leaves `value` empty, so value.max()
    # would fail here -- confirm whether callers guarantee foreground pixels.
    map_result = jt.zeros((int(value.max().numpy()[0]) + 1), dtype=jt.uint32)
    map_result[value] = jt.index(value.shape)[0] + 1
    result = map_result[result]
    return result

196
python/jittor/ccl/ccl_3d.py Normal file
View File

@ -0,0 +1,196 @@
import jittor as jt
def ccl_3d(data_3d):
'''
3D connected component labelling, original code from https://github.com/DanielPlayne/playne-equivalence-algorithm
Args:
[in]param data_3d: binary three-dimensional vector
type data_3d: jittor array
Returns:
[out]result : labeled three-dimensional vector
Example:
>>> import jittor as jt
>>> jt.flags.use_cuda = 1
>>> data_3d = jt.zeros((10, 11, 12), dtype=jt.uint32)
>>> data_3d[2:4, :, :] = 1
>>> data_3d[5:7, :, :] = 1
>>> result = ccl_3d(data_3d)
>>> print(result[:, 0, 0])
>>> print(
jt.unique(result, return_counts=True, return_inverse=True)[0],
jt.unique(result, return_counts=True, return_inverse=True)[2])
'''
data_3d = data_3d.astype(jt.uint32)
cX = data_3d.shape[0]
cY = data_3d.shape[1]
cZ = data_3d.shape[2]
# in1 doubles as a device-side "changed" flag polled by the host loop.
changed = jt.ones([1], dtype=jt.uint32)
data_3d_copy = data_3d.copy()
data_3d = data_3d.reshape(cX * cY * cZ)
result = jt.code(data_3d.shape,
data_3d.dtype, [data_3d, changed],
cuda_header='''
@alias(g_image, in0)
@alias(g_labels, out)
''',
cuda_src=r'''
// pX/pY are the row and slice pitches (pX = cX, pY = cX*cY).
__global__ void init_labels(@ARGS_DEF, const int cX, const int cY, const int cZ, const int pX, const int pY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
const unsigned int iz = (blockIdx.z * blockDim.z) + threadIdx.z;
if((ix < cX) && (iy < cY) && (iz < cZ)) {
const unsigned char pzyx = @g_image(iz*pY + iy*pX + ix);
// Neighbour Connections
const bool nzm1yx = (iz > 0) ? (pzyx == @g_image((iz-1)*pY + iy *pX + ix )) : false;
const bool nzym1x = (iy > 0) ? (pzyx == @g_image( iz *pY + (iy-1)*pX + ix )) : false;
const bool nzyxm1 = (ix > 0) ? (pzyx == @g_image( iz *pY + iy *pX + ix-1)) : false;
// Label
unsigned int label;
// Initialise Label
// Seed each voxel with the index of an already-equal predecessor
// (x-1, then y-1, then z-1 taking precedence) or its own index.
label = (nzyxm1) ? ( iz*pY + iy*pX + ix-1) : (iz*pY + iy*pX + ix);
label = (nzym1x) ? ( iz*pY + (iy-1)*pX + ix) : label;
label = (nzm1yx) ? ((iz-1)*pY + iy*pX + ix) : label;
// Write to Global Memory
@g_labels(iz*pY + iy*pX + ix) = label;
}
}
__device__ __inline__ unsigned int find_root(@ARGS_DEF, unsigned int label) {
// Resolve Label
unsigned int next = @g_labels(label);
// Follow chain
while(label != next) {
// Move to next
label = next;
next = @g_labels(label);
}
// Return label
return label;
}
__global__ void resolve_labels(@ARGS_DEF, const int cX, const int cY, const int cZ, const int pX, const int pY) {
@PRECALC
// Calculate index
const unsigned int id = ((blockIdx.z * blockDim.z) + threadIdx.z) * pY +
((blockIdx.y * blockDim.y) + threadIdx.y) * pX +
((blockIdx.x * blockDim.x) + threadIdx.x);
// Check Thread Range
if(id < cX*cY*cZ) {
// Resolve Label
@g_labels(id) = find_root(@ARGS, @g_labels(id));
}
}
__global__ void label_equivalence(@ARGS_DEF, const int cX, const int cY, const int cZ, const int pX, const int pY) {
@PRECALC
// Calculate index
const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
const unsigned int iz = (blockIdx.z * blockDim.z) + threadIdx.z;
// Check Thread Range
if((ix < cX) && (iy < cY) && (iz < cZ)) {
// Get image and label values
const unsigned char pzyx = @g_image(iz*pY + iy*pX + ix);
// Neighbouring indexes
const unsigned int xm1 = ix-1;
const unsigned int xp1 = ix+1;
const unsigned int ym1 = iy-1;
const unsigned int yp1 = iy+1;
const unsigned int zm1 = iz-1;
const unsigned int zp1 = iz+1;
// Get neighbour labels (6-connectivity; out-of-range sides read 0)
const unsigned int lzm1yx = (iz > 0) ? @g_labels(zm1*pY + iy*pX + ix) : 0;
const unsigned int lzym1x = (iy > 0) ? @g_labels( iz*pY + ym1*pX + ix) : 0;
const unsigned int lzyxm1 = (ix > 0) ? @g_labels( iz*pY + iy*pX + xm1) : 0;
const unsigned int lzyx = @g_labels( iz*pY + iy*pX + ix);
const unsigned int lzyxp1 = (ix < cX-1) ? @g_labels( iz*pY + iy*pX + xp1) : 0;
const unsigned int lzyp1x = (iy < cY-1) ? @g_labels( iz*pY + yp1*pX + ix) : 0;
const unsigned int lzp1yx = (iz < cZ-1) ? @g_labels(zp1*pY + iy*pX + ix) : 0;
const bool nzm1yx = (iz > 0) ? (pzyx == @g_image(zm1*pY + iy*pX + ix)) : false;
const bool nzym1x = (iy > 0) ? (pzyx == @g_image( iz*pY + ym1*pX + ix)) : false;
const bool nzyxm1 = (ix > 0) ? (pzyx == @g_image( iz*pY + iy*pX + xm1)) : false;
const bool nzyxp1 = (ix < cX-1) ? (pzyx == @g_image( iz*pY + iy*pX + xp1)) : false;
const bool nzyp1x = (iy < cY-1) ? (pzyx == @g_image( iz*pY + yp1*pX + ix)) : false;
const bool nzp1yx = (iz < cZ-1) ? (pzyx == @g_image(zp1*pY + iy*pX + ix)) : false;
// Lowest label
unsigned int label = lzyx;
// Find lowest neighbouring label
label = ((nzm1yx) && (lzm1yx < label)) ? lzm1yx : label;
label = ((nzym1x) && (lzym1x < label)) ? lzym1x : label;
label = ((nzyxm1) && (lzyxm1 < label)) ? lzyxm1 : label;
label = ((nzyxp1) && (lzyxp1 < label)) ? lzyxp1 : label;
label = ((nzyp1x) && (lzyp1x < label)) ? lzyp1x : label;
label = ((nzp1yx) && (lzp1yx < label)) ? lzp1yx : label;
// If labels are different, resolve them
if(label < lzyx) {
// Update label
// Nonatomic write may overwrite another label but on average seems to give faster results
@g_labels(lzyx) = label;
// Record the change
@in1(0) = 1;
}
}
}
''' + f'''
dim3 block(32, 4, 4);
const int cX= {cX};
const int cY= {cY};
const int cZ= {cZ};
const int pX= cX;
const int pY= cX*cY;''' + '''
dim3 grid(ceil(cX/(float)block.x), ceil(cY/(float)block.y), ceil(cZ/(float)block.z));
// Initialise labels
init_labels <<< grid, block >>>(@ARGS, cX, cY, cZ, pX, pY);
// Resolve the labels
resolve_labels <<< grid, block >>>(@ARGS, cX, cY, cZ, pX, pY);
// Changed Flag
int32 changed = 1;
// While labels have changed
while(changed) {
// Copy changed to device
cudaMemsetAsync(in1_p, 0, 4);
// Label image
label_equivalence <<< grid, block >>>(@ARGS, cX, cY, cZ, pX, pY);
// Copy changed flag to host
cudaMemcpy(&changed, in1_p, sizeof(int32), cudaMemcpyDeviceToHost);
// Resolve the labels
resolve_labels <<< grid, block>>>(@ARGS, cX, cY, cZ, pX, pY);
}
''')
# Zero out background, then renumber surviving labels to consecutive 1..N.
result = result.reshape((cX, cY, cZ)) * data_3d_copy
value = jt.unique(result)
value = value[value != 0]
# NOTE(review): an all-zero input leaves `value` empty, so value.max()
# would fail here -- confirm whether callers guarantee foreground voxels.
map_result = jt.zeros((int(value.max().numpy()[0]) + 1), dtype=jt.uint32)
map_result[value] = jt.index(value.shape)[0] + 1
result = map_result[result]
return result

View File

@ -0,0 +1,195 @@
import jittor as jt
def ccl_link(score_map, link_map, result_comp_area_thresh=6):
    """
    Find components in score map and link them with link map, original code from https://github.com/DanielPlayne/playne-equivalence-algorithm.

    PixelLink-style labelling: foreground pixels merge with a neighbour only
    when the per-direction link map connects them (in either direction);
    background pixels merge like plain 8-connectivity. Components whose area
    does not exceed ``result_comp_area_thresh`` are discarded.

    Args:
        [in]param score_map: binary two-dimensional vector
        type score_map: jittor array
        [in]param link_map: two-dimensional vector with 8 channels
        type link_map: jittor array
        [in]param result_comp_area_thresh: threshold of component area
        type result_comp_area_thresh: int
    Returns:
        [out]result: labeled two-dimensional vector
    Example:
        >>> import jittor as jt
        >>> jt.flags.use_cuda = 1
        >>> import cv2
        >>> import numpy as np
        >>> score_map = jt.Var(np.load("score_map.npy"))
        >>> link_map = jt.Var(np.load("link_map.npy"))
        >>> score_map = score_map >= 0.5
        >>> link_map = link_map >= 0.8
        >>> for i in range(8):
        >>>     link_map[:, :, i] = link_map[:, :, i] & score_map
        >>> result = ccl_link(score_map, link_map)
        >>> cv2.imwrite('pixellink.png', result.numpy().astype(np.uint8) * 50)
    """
    score_map = score_map.astype(jt.uint32)
    link_map = link_map.astype(jt.uint32)
    cY = score_map.shape[0]
    cX = score_map.shape[1]
    # in2 doubles as a device-side "changed" flag polled by the host loop.
    changed = jt.ones([1], dtype=jt.uint32)
    score_map = score_map.reshape(cX * cY)
    result = jt.code(score_map.shape,
        score_map.dtype, [score_map, link_map, changed],
        cuda_header='''
    @alias(score_map, in0)
    @alias(link_map, in1)
    @alias(g_labels, out)
    ''',
        cuda_src=r'''
    __global__ void init_labels(@ARGS_DEF, const int cX, const int cY) {
    @PRECALC
    // Calculate index
    const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
    const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
    // Fix: the launch grid is rounded up to block-size multiples, so edge
    // threads must not write (previously an out-of-bounds store).
    if((ix < cX) && (iy < cY)) {
    @g_labels(iy*cX + ix) = iy*cX + ix;
    }
    }
    __device__ __inline__ unsigned int find_root(@ARGS_DEF, unsigned int label) {
    // Resolve Label
    unsigned int next = @g_labels(label);
    // Follow chain
    while(label != next) {
    // Move to next
    label = next;
    next = @g_labels(label);
    }
    // Return label
    return label;
    }
    __global__ void resolve_labels(@ARGS_DEF, const int cX, const int cY) {
    @PRECALC
    // Calculate index
    const unsigned int id = ((blockIdx.y * blockDim.y) + threadIdx.y) * cX +
    ((blockIdx.x * blockDim.x) + threadIdx.x);
    // Check Thread Range
    if(id < cX*cY) {
    // Resolve Label
    @g_labels(id) = find_root(@ARGS, @g_labels(id));
    }
    }
    __global__ void label_equivalence(@ARGS_DEF, const int cX, const int cY) {
    @PRECALC
    // Calculate index
    const unsigned int ix = (blockIdx.x * blockDim.x) + threadIdx.x;
    const unsigned int iy = (blockIdx.y * blockDim.y) + threadIdx.y;
    // Check Thread Range
    if((ix < cX) && (iy < cY)) {
    // Get image and label values
    const unsigned char cyx = @score_map( iy*cX + ix);
    // Get neighbour labels
    const unsigned int lym1x = (iy > 0) ? @g_labels((iy-1)*cX + ix) : 0;
    const unsigned int lyxm1 = (ix > 0) ? @g_labels(iy *cX + ix-1) : 0;
    const unsigned int lyx = @g_labels(iy *cX + ix);
    const unsigned int lyxp1 = (ix < cX-1) ? @g_labels(iy *cX + ix+1) : 0;
    const unsigned int lyp1x = (iy < cY-1) ? @g_labels((iy+1)*cX + ix) : 0;
    const unsigned int lym1xm1 = (iy > 0 && ix > 0 ) ? @g_labels((iy-1)*cX + ix-1) : 0;
    const unsigned int lym1xp1 = (iy > 0 && ix < cX-1) ? @g_labels((iy-1)*cX + ix+1) : 0;
    const unsigned int lyp1xm1 = (iy < cY-1 && ix > 0 ) ? @g_labels((iy+1)*cX + ix-1) : 0;
    const unsigned int lyp1xp1 = (iy < cY-1 && ix < cX-1) ? @g_labels((iy+1)*cX + ix+1) : 0;
    bool nym1x, nyxm1, nyxp1, nyp1x, nym1xm1, nym1xp1, nyp1xm1, nyp1xp1;
    if(cyx) {
    // Foreground: neighbours connect only through the link channels.
    // NOTE(review): nyxm1 reads @link_map(iy-1, ix-1, 3) while every other
    // direction pairs (iy, ix, d) with the mirrored neighbour cell --
    // expected @link_map(iy, ix-1, 3); confirm against the PixelLink
    // direction convention before changing.
    nym1x = (iy > 0) ? ((cyx == (@score_map((iy-1)*cX + ix))) && (@link_map(iy, ix, 6) || @link_map(iy-1, ix, 7))) : false; // up
    nyxm1 = (ix > 0) ? ((cyx == (@score_map(iy *cX + ix-1))) && (@link_map(iy, ix, 0) || @link_map(iy-1, ix-1, 3))) : false; // left
    nyxp1 = (ix < cX-1) ? ((cyx == (@score_map(iy *cX + ix+1))) && (@link_map(iy, ix, 3) || @link_map(iy, ix+1, 0))) : false; // right
    // Fix: was (iy > cY-1), which is never true for an in-range row and
    // silently disabled downward links.
    nyp1x = (iy < cY-1) ? ((cyx == (@score_map((iy+1)*cX + ix))) && (@link_map(iy, ix, 7) || @link_map(iy+1, ix, 6))) : false; // down
    nym1xm1 = (iy > 0 && ix > 0 ) ? ((cyx == (@score_map((iy-1)*cX + ix-1))) && (@link_map(iy, ix, 2) || @link_map(iy-1, ix-1, 4))) : false; // up-left
    nym1xp1 = (iy > 0 && ix < cX-1) ? ((cyx == (@score_map((iy-1)*cX + ix+1))) && (@link_map(iy, ix, 5) || @link_map(iy-1, ix+1, 1))) : false; // up-right
    nyp1xm1 = (iy < cY-1 && ix > 0 ) ? ((cyx == (@score_map((iy+1)*cX + ix-1))) && (@link_map(iy, ix, 1) || @link_map(iy+1, ix-1, 5))) : false; // down-left
    nyp1xp1 = (iy < cY-1 && ix < cX-1) ? ((cyx == (@score_map((iy+1)*cX + ix+1))) && (@link_map(iy, ix, 4) || @link_map(iy+1, ix+1, 2))) : false; // down-right
    }
    else {
    // Background: plain 8-connectivity on equal score values.
    nym1x = (iy > 0) ? (cyx == (@score_map((iy-1)*cX + ix))) : false; // up
    nyxm1 = (ix > 0) ? (cyx == (@score_map(iy *cX + ix-1))) : false; // left
    nyxp1 = (ix < cX-1) ? (cyx == (@score_map(iy *cX + ix+1))) : false; // right
    // Fix: was (iy > cY-1) here as well.
    nyp1x = (iy < cY-1) ? (cyx == (@score_map((iy+1)*cX + ix))) : false; // down
    nym1xm1 = (iy > 0 && ix > 0 ) ? (cyx == (@score_map((iy-1)*cX + ix-1))) : false; // up-left
    nym1xp1 = (iy > 0 && ix < cX-1) ? (cyx == (@score_map((iy-1)*cX + ix+1))) : false; // up-right
    nyp1xm1 = (iy < cY-1 && ix > 0 ) ? (cyx == (@score_map((iy+1)*cX + ix-1))) : false; // down-left
    nyp1xp1 = (iy < cY-1 && ix < cX-1) ? (cyx == (@score_map((iy+1)*cX + ix+1))) : false; // down-right
    }
    // Lowest label
    unsigned int label = lyx;
    // Find lowest neighbouring label
    label = ((nym1x) && (lym1x < label)) ? lym1x : label;
    label = ((nyxm1) && (lyxm1 < label)) ? lyxm1 : label;
    label = ((nyxp1) && (lyxp1 < label)) ? lyxp1 : label;
    label = ((nyp1x) && (lyp1x < label)) ? lyp1x : label;
    label = ((nym1xm1) && (lym1xm1 < label)) ? lym1xm1 : label;
    label = ((nym1xp1) && (lym1xp1 < label)) ? lym1xp1 : label;
    label = ((nyp1xm1) && (lyp1xm1 < label)) ? lyp1xm1 : label;
    label = ((nyp1xp1) && (lyp1xp1 < label)) ? lyp1xp1 : label;
    // If labels are different, resolve them
    if(label < lyx) {
    // Update label
    // Nonatomic write may overwrite another label but on average seems to give faster results
    @g_labels(lyx) = label;
    // Record the change
    @in2(0) = 1;
    }
    }
    }
    ''' + f'''
    dim3 block(32, 32);
    const int cX= {cX};
    const int cY= {cY};''' + '''
    dim3 grid(ceil(cX/(float)block.x), ceil(cY/(float)block.y));
    dim3 resolve_block(32, 32);
    dim3 resolve_grid(ceil(cX/(float)resolve_block.x), ceil(cY/(float)resolve_block.y));
    // Initialise labels
    init_labels <<< grid, block >>>(@ARGS, cX, cY);
    // Resolve the labels
    resolve_labels <<< resolve_grid, resolve_block >>>(@ARGS, cX, cY);
    // Changed Flag
    int32 changed = 1;
    // While labels have changed
    while(changed) {
    // Copy changed to device
    cudaMemsetAsync(in2_p, 0, 4);
    // Label image
    label_equivalence <<< grid, block >>>(@ARGS, cX, cY);
    // Copy changed flag to host
    cudaMemcpy(&changed, in2_p, sizeof(int32), cudaMemcpyDeviceToHost);
    // Resolve the labels
    resolve_labels <<< resolve_grid, resolve_block >>>(@ARGS, cX, cY);
    }
    ''')
    result = result.reshape((cY, cX))
    # Drop components whose pixel count is at or below the area threshold,
    # then renumber the survivors to consecutive ids 1..N.
    value, _, cnt = jt.unique(result, return_inverse=True, return_counts=True)
    value = (cnt > result_comp_area_thresh) * value
    value = value[value != 0]
    # NOTE(review): if every component is filtered out, `value` is empty and
    # value.max() would fail -- confirm caller expectations.
    map_result = jt.zeros((int(value.max().numpy()[0]) + 1), dtype=jt.uint32)
    map_result[value] = jt.index(value.shape)[0] + 1
    result = map_result[result]
    return result

View File

@ -1,71 +1,147 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os, sys, shutil
import platform
from .compiler import *
from jittor_utils import run_cmd, get_version, get_int_version
from jittor.utils.misc import download_url_to_local
from jittor_utils.misc import download_url_to_local
import jittor_utils as jit_utils
# NOTE(review): this function as captured contains interleaved old/new diff
# lines (duplicate os.path.join and both LOG.i/LOG.v calls below).
def search_file(dirs, name, prefer_version=()):
# On Windows, map a Unix-style "libfoo.so" request to a "foo64*.dll" glob.
if os.name == 'nt':
if name.startswith("lib"):
name = name[3:].replace(".so", "64*.dll")
for d in dirs:
fname = os.path.join(d, name)
# Windows branch: glob for the dll and return the first match.
if os.name == 'nt':
# NOTE(review): lname duplicates fname above -- diff residue.
lname = os.path.join(d, name)
names = glob.glob(lname)
if len(names):
return names[0]
continue
# Try the most specific versioned filename first, e.g. name.1.2, name.1, name.
prefer_version = tuple( str(p) for p in prefer_version )
for i in range(len(prefer_version),-1,-1):
vname = ".".join((fname,)+prefer_version[:i])
if os.path.isfile(vname):
# NOTE(review): both LOG.i (old) and LOG.v (new) lines are present --
# diff residue; only one logging call belongs here.
LOG.i(f"found {vname}")
LOG.v(f"found {vname}")
return vname
# Fatal log if nothing matched in any directory.
LOG.f(f"file {name} not found in {dirs}")
# NOTE(review): this function as captured overlays the pre-2.2 mkldnn download
# flow with the dnnl 2.2 flow (duplicate url/filename, duplicate dirname, an
# old download+build block retained before the new one). Untangle against the
# repository history before trusting it.
def install_mkl(root_folder):
# origin url is
# url = "https://github.com/intel/mkl-dnn/releases/download/v1.0.2/mkldnn_lnx_1.0.2_cpu_gomp.tgz"
# Old (mkldnn 1.0.2) url/filename -- superseded by the per-platform
# selection below; diff residue.
url = "https://cloud.tsinghua.edu.cn/f/da02bf62b55b4aa3b8ee/?dl=1"
filename = "mkldnn_lnx_1.0.2_cpu_gomp.tgz"
import platform
url = None
# Select the dnnl 2.2.0 archive and its md5 per platform/architecture.
if platform.system()=="Linux":
if platform.machine()=='x86_64':
filename = "dnnl_lnx_2.2.0_cpu_gomp.tgz"
md5 = "35bbbdf550a9d8ad54db798e372000f6"
elif platform.machine()=='aarch64':
filename = "dnnl_lnx_2.2.0_cpu_gomp_aarch64.tgz"
md5 = "72cf9b0b8fd6c3c786d35a9daaee22b8"
else:
raise RuntimeError(f"platform.machine()=={platform.machine()} not support yet,"
" Please contact us on https://github.com/jittor/jittor ")
elif os.name == "nt":
# url = "https://github.com/oneapi-src/oneDNN/releases/download/v2.2/dnnl_win_2.2.0_cpu_iomp.zip"
# url = "https://github.com/oneapi-src/oneDNN/releases/download/v2.2/dnnl_win_2.2.0_cpu_vcomp.zip"
filename = "dnnl_win_2.2.0_cpu_vcomp.zip"
md5 = "fa12c693b2ec07700d174e1e99d60a7e"
elif platform.system() == "Darwin":
if platform.machine() == "arm64":
filename = "dnnl_mac_2.2.0_cpu_omp_arm64.tgz"
md5 = "d8fdf56d3cf618685d22d18f08119f88"
else:
filename = "dnnl_mac_2.2.0_cpu_omp_x86_64.tgz"
md5 = "6e2f065d6a589c82081536b684768fe6"
else:
raise RuntimeError(f"platform.machine()=={platform.machine()} not support yet,"
" Please contact us on https://github.com/jittor/jittor ")
# Default mirror when no explicit url was chosen above.
if not url:
url = "https://cg.cs.tsinghua.edu.cn/jittor/assets/" + filename
fullname = os.path.join(root_folder, filename)
# NOTE(review): dirname assigned twice (old .tgz-strip vs new rsplit) --
# diff residue.
dirname = os.path.join(root_folder, filename.replace(".tgz",""))
dirname = os.path.join(root_folder, filename.rsplit(".",1)[0])
# Old (1.0.2) freshness check -- diff residue; the new check follows.
if not os.path.isfile(os.path.join(dirname, "examples", "test")):
# Skip download when a library file is already extracted for this platform.
if not (os.path.isfile(os.path.join(dirname, "lib", "libmkldnn.so")) or
os.path.isfile(os.path.join(dirname, "bin", "dnnl.dll")) or
os.path.isfile(os.path.join(dirname, "lib", "libmkldnn.dylib"))):
LOG.i("Downloading mkl...")
# NOTE(review): the download/tar/assert below uses the old fixed md5
# and cpu_cnn_inference_f32.cpp -- diff residue; the new flow starts
# at the download_url_to_local(url, filename, root_folder, md5) call.
download_url_to_local(url, filename, root_folder, "47187284ede27ad3bd64b5f0e7d5e730")
import tarfile
with tarfile.open(fullname, "r") as tar:
tar.extractall(root_folder)
assert 0 == os.system(f"cd {dirname}/examples && "
f"{cc_path} -std=c++14 cpu_cnn_inference_f32.cpp -Ofast -lmkldnn -I ../include -L ../lib -o test && LD_LIBRARY_PATH=../lib/ ./test")
download_url_to_local(url, filename, root_folder, md5)
# Extract .zip (Windows) or .tgz (Linux/macOS) archives.
if fullname.endswith(".zip"):
import zipfile
with zipfile.ZipFile(fullname, "r") as f:
f.extractall(root_folder)
else:
import tarfile
with tarfile.open(fullname, "r") as tar:
tar.extractall(root_folder)
# Smoke-test the extracted library by compiling and running an example.
if os.name == 'nt':
# this env is used for execute example/text
bin_path = os.path.join(dirname, "bin")
sys.path.append(bin_path)
os.environ["PATH"] = os.environ.get("PATH", "") + ";" + bin_path
cmd = f"cd /d {dirname}/examples && {cc_path} {dirname}/examples/cnn_inference_f32.cpp -I{dirname}/include -Fe: {dirname}/examples/test.exe {fix_cl_flags(cc_flags).replace('-LD', '')} {dirname}/lib/mkldnn.lib"
assert 0 == os.system(cmd)
assert 0 == os.system(f"{dirname}/examples/test")
elif platform.system() == "Darwin":
assert 0 == os.system(f"cd {dirname}/examples && "
f"{cc_path} -std=c++14 cnn_inference_f32.cpp -Ofast -lmkldnn -I ../include -L ../lib -o test && DYLD_LIBRARY_PATH=../lib/ ./test")
else:
assert 0 == os.system(f"cd {dirname}/examples && "
f"{cc_path} -std=c++14 cnn_inference_f32.cpp -Ofast -lmkldnn -I ../include -L ../lib -o test && LD_LIBRARY_PATH=../lib/ ./test")
def setup_mkl():
global mkl_ops, use_mkl
use_mkl = os.environ.get("use_mkl", "1")=="1"
mkl_ops = None
if not use_mkl: return
# pytorch mkl is conflict with jittor mkl
# yield error "free: invalide size" or
# "mmap error"
# import pytorch(>1.8) first can fix this problem
# try:
# # jt.dirty_fix_pytorch_runtime_error()
# import torch
# from torch import nn
# except:
# torch = None
mkl_include_path = os.environ.get("mkl_include_path")
mkl_lib_path = os.environ.get("mkl_lib_path")
if mkl_lib_path is None or mkl_include_path is None:
mkl_install_sh = os.path.join(jittor_path, "script", "install_mkl.sh")
LOG.v("setup mkl...")
# mkl_path = os.path.join(cache_path, "mkl")
# mkl_path decouple with cc_path
from pathlib import Path
mkl_path = os.path.join(str(Path.home()), ".cache", "jittor", "mkl")
mkl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "mkl")
make_cache_dir(mkl_path)
install_mkl(mkl_path)
mkl_home = ""
for name in os.listdir(mkl_path):
if name.startswith("mkldnn_lnx") and os.path.isdir(os.path.join(mkl_path, name)):
if name.startswith("dnnl") and os.path.isdir(os.path.join(mkl_path, name)):
mkl_home = os.path.join(mkl_path, name)
break
assert mkl_home!=""
mkl_include_path = os.path.join(mkl_home, "include")
mkl_lib_path = os.path.join(mkl_home, "lib")
mkl_include_path = os.path.join(mkl_home, "include")
mkl_lib_path = os.path.join(mkl_home, "lib")
mkl_lib_name = os.path.join(mkl_lib_path, "libmkldnn.so")
extra_flags = f" -I\"{mkl_include_path}\" -L\"{mkl_lib_path}\" -lmkldnn "
if os.name == 'nt':
mkl_lib_name = os.path.join(mkl_home, 'bin', 'dnnl.dll')
mkl_bin_path = os.path.join(mkl_home, 'bin')
extra_flags = f" -I\"{mkl_include_path}\" -L\"{mkl_lib_path}\" -L\"{mkl_bin_path}\" -ldnnl "
elif platform.system() == "Darwin":
mkl_lib_name = os.path.join(mkl_lib_path, "libmkldnn.dylib")
assert os.path.isdir(mkl_include_path)
assert os.path.isdir(mkl_lib_path)
assert os.path.isfile(mkl_lib_name)
@ -77,8 +153,7 @@ def setup_mkl():
mkl_op_dir = os.path.join(jittor_path, "extern", "mkl", "ops")
mkl_op_files = [os.path.join(mkl_op_dir, name) for name in os.listdir(mkl_op_dir)]
mkl_ops = compile_custom_ops(mkl_op_files,
extra_flags=f" -I'{mkl_include_path}' -L'{mkl_lib_path}' -lmkldnn -Wl,-rpath='{mkl_lib_path}' ")
mkl_ops = compile_custom_ops(mkl_op_files, extra_flags=extra_flags)
LOG.vv("Get mkl_ops: "+str(dir(mkl_ops)))
@ -90,24 +165,23 @@ def install_cub(root_folder):
fullname = os.path.join(root_folder, filename)
dirname = os.path.join(root_folder, filename.replace(".tgz",""))
if not os.path.isfile(os.path.join(dirname, "examples", "test")):
if not os.path.isfile(os.path.join(dirname, "examples", "device/example_device_radix_sort.cu")):
LOG.i("Downloading cub...")
download_url_to_local(url, filename, root_folder, md5)
import tarfile
with tarfile.open(fullname, "r") as tar:
tar.extractall(root_folder)
assert 0 == os.system(f"cd {dirname}/examples && "
f"{nvcc_path} device/example_device_radix_sort.cu -O2 -I.. -std=c++14 -o test")
if core.get_device_count():
assert 0 == os.system(f"cd {dirname}/examples && ./test")
# assert 0 == os.system(f"cd {dirname}/examples && "
# f"{nvcc_path} --cudart=shared -ccbin=\"{cc_path}\" device/example_device_radix_sort.cu -O2 -I.. -std=c++14 -o test")
# if core.get_device_count():
# assert 0 == os.system(f"cd {dirname}/examples && ./test")
return dirname
def setup_cub():
global cub_home
cub_home = ""
from pathlib import Path
cub_path = os.path.join(str(Path.home()), ".cache", "jittor", "cub")
cub_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cub")
cuda_version = int(get_version(nvcc_path)[1:-1].split('.')[0])
extra_flags = ""
if cuda_version < 11:
@ -118,6 +192,19 @@ def setup_cub():
def setup_cuda_extern():
if not has_cuda: return
def split(a): return a.replace(";",":").split(":")
check_ld_path = split(os.environ.get("LD_LIBRARY_PATH", "")) + \
split(os.environ.get("PATH", ""))
for cp in check_ld_path:
cp = cp.lower()
if "cuda" in cp and \
"lib" in cp and \
"jtcuda" not in cp:
LOG.w(f"CUDA related path found in LD_LIBRARY_PATH or PATH, "
"This path may cause jittor found the wrong libs, "
"please unset LD_LIBRARY_PATH and remove cuda lib path in Path. \n"
"Or you can let jittor install cuda for you: `python3.x -m jittor_utils.install_cuda`")
break
LOG.vv("setup cuda extern...")
cache_path_cuda = os.path.join(cache_path, "cuda")
cuda_include = os.path.join(jittor_path, "extern", "cuda", "inc")
@ -125,8 +212,9 @@ def setup_cuda_extern():
cuda_extern_src = os.path.join(jittor_path, "extern", "cuda", "src")
cuda_extern_files = [os.path.join(cuda_extern_src, name)
for name in os.listdir(cuda_extern_src)]
so_name = os.path.join(cache_path_cuda, "cuda_extern.so")
compile(cc_path, cc_flags+f" -I'{cuda_include}' ", cuda_extern_files, so_name)
so_name = os.path.join(cache_path_cuda, "libcuda_extern"+so)
compile(cc_path, cc_flags+f" -I\"{cuda_include}\" ", cuda_extern_files, so_name)
link_cuda_extern = f" -L\"{cache_path_cuda}\" -llibcuda_extern "
ctypes.CDLL(so_name, dlopen_flags)
try:
@ -136,20 +224,36 @@ def setup_cuda_extern():
line = traceback.format_exc()
LOG.w(f"CUDA found but cub is not loaded:\n{line}")
libs = ["cublas", "cudnn", "curand"]
libs = ["cublas", "cudnn", "curand", "cufft", "cusparse"]
# in cuda 11.4, module memory comsumptions:
# default context: 259 MB
# cublas: 340 MB
# cudnn: 340 MB
if int(os.environ.get("conv_opt", "0")):
libs = ["cublas", "curand"]
for lib_name in libs:
try:
setup_cuda_lib(lib_name)
setup_cuda_lib(lib_name, extra_flags=link_cuda_extern)
except Exception as e:
import traceback
line = traceback.format_exc()
LOG.w(f"CUDA found but {lib_name} is not loaded:\n{line}")
msg = f"CUDA found but {lib_name} is not loaded:\n"
if lib_name == "cudnn":
LOG.w(f"Develop version of CUDNN not found, "
"please refer to CUDA offical tar file installation: "
"https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar")
msg += """Develop version of CUDNN not found,
please refer to CUDA offical tar file installation:
https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar"""
if lib_name == "cusparse":
msg += """CUSPARSE library is not loaded,
please ensure it is installed along with the CUDA toolkit."""
if platform.machine() in ["x86_64", "AMD64"]:
msg += f"""
or you can let jittor install cuda and cudnn for you:
>>> python3.{sys.version_info.minor} -m jittor_utils.install_cuda
"""
LOG.f(msg)
def setup_cuda_lib(lib_name, link=True, extra_flags=""):
arch_key = "x86_64"
if platform.machine() not in ["x86_64", "AMD64"]:
arch_key = "aarch64"
globals()[lib_name+"_ops"] = None
globals()[lib_name] = None
if not has_cuda: return
@ -161,27 +265,50 @@ def setup_cuda_lib(lib_name, link=True, extra_flags=""):
link_flags = ""
if link:
extra_include_path = os.path.abspath(os.path.join(cuda_include, "..", "targets/x86_64-linux/include"))
extra_lib_path = os.path.abspath(os.path.join(cuda_lib, "..", "targets/x86_64-linux/lib"))
extra_include_path = os.path.abspath(os.path.join(cuda_include, "..", f"targets/{arch_key}-linux/include"))
extra_lib_path = os.path.abspath(os.path.join(cuda_lib, "..", f"targets/{arch_key}-linux/lib"))
cuda_include_name = search_file([cuda_include, extra_include_path, "/usr/include"], lib_name+".h")
# cuda11 prefer cudnn 8
nvcc_version = get_int_version(nvcc_path)
if has_corex:
nvcc_version = (10,2,89)
prefer_version = ()
if nvcc_version[0] == 11:
prefer_version = ("8",)
culib_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu"], f"lib{lib_name}.so", prefer_version)
culib_path = search_file([cuda_bin, cuda_lib, extra_lib_path, f"/usr/lib/{arch_key}-linux-gnu", "/usr/lib"], f"lib{lib_name}.so", prefer_version)
if lib_name == "cublas" and nvcc_version[0] >= 10:
# manual link libcublasLt.so
try:
cublas_lt_lib_path = search_file([cuda_bin, cuda_lib, extra_lib_path, f"/usr/lib/{arch_key}-linux-gnu", "/usr/lib"], f"libcublasLt.so", nvcc_version)
ctypes.CDLL(cublas_lt_lib_path, dlopen_flags)
except:
# some aarch64 os, such as uos with FT2000 cpu,
# it's cuda 10 doesn't have libcublasLt.so
pass
if lib_name == "cudnn":
# cudnn cannot found libcudnn_cnn_train.so.8, we manual link for it.
if nvcc_version >= (11,0,0):
libs = ["libcudnn_ops_infer.so", "libcudnn_ops_train.so", "libcudnn_cnn_infer.so", "libcudnn_cnn_train.so"]
for l in libs:
ex_cudnn_path = search_file([cuda_lib, extra_lib_path, "/usr/lib/x86_64-linux-gnu"], l, prefer_version)
ex_cudnn_path = search_file([cuda_bin, cuda_lib, extra_lib_path, f"/usr/lib/{arch_key}-linux-gnu", "/usr/lib"], l, prefer_version)
ctypes.CDLL(ex_cudnn_path, dlopen_flags)
# dynamic link cuda library
ctypes.CDLL(culib_path, dlopen_flags)
link_flags = f"-l{lib_name} -L'{cuda_lib}'"
# ctypes.CDLL(culib_path, dlopen_flags)
# link_flags = f"-l{lib_name} -L\"{cuda_lib}\""
link_flags = f"-l{lib_name} -L\"{os.path.dirname(culib_path)}\""
# print("link_flags", link_flags, culib_path)
if lib_name == "cusparse" :
try:
cusparse_spmv_path = search_file([cuda_lib, extra_lib_path], "libcusparse.so")
ctypes.CDLL(cusparse_spmv_path, dlopen_flags)
except:
LOG.w("Failed to load cusparse-specific shared libraries.")
# find all source files
culib_src_dir = os.path.join(jittor_path, "extern", "cuda", lib_name)
@ -194,46 +321,99 @@ def setup_cuda_lib(lib_name, link=True, extra_flags=""):
# compile and get operators
culib = compile_custom_ops(culib_src_files, return_module=True,
extra_flags=f" -I'{jt_cuda_include}' -I'{jt_culib_include}' {link_flags} {extra_flags} ")
extra_flags=f" -I\"{jt_cuda_include}\" -I\"{jt_culib_include}\" {link_flags} {extra_flags} ")
culib_ops = culib.ops
globals()[lib_name+"_ops"] = culib_ops
globals()[lib_name] = culib
LOG.vv(f"Get {lib_name}_ops: "+str(dir(culib_ops)))
def _setup_fake_cuda_lib(lib_name=None, link=True, extra_flags=""):
    """Build the CPU fallback ("fake") operators for a CUDA library.

    With ``lib_name=None`` every known library is processed in turn.  For a
    single library the module globals ``<lib_name>`` and ``<lib_name>_ops``
    are first reset to None and then, when matching op sources exist, bound
    to the freshly compiled module and its operator namespace.
    """
    if lib_name is None:
        for each in ("cudnn", "cublas", "curand", "cufft", "cub", "cutt", "cutlass"):
            _setup_fake_cuda_lib(each, link, extra_flags)
        return
    arch_key = "aarch64" if platform.machine() not in ["x86_64", "AMD64"] else "x86_64"
    globals()[lib_name + "_ops"] = None
    globals()[lib_name] = None
    LOG.v(f"setup {lib_name}...")
    jt_cuda_include = os.path.join(jittor_path, "extern", "cuda", "inc")
    jt_culib_include = os.path.join(jittor_path, "extern", "cuda", lib_name, "inc")
    # collect every *op.cc / *op.h below the library's ops directory
    src_root = os.path.join(jittor_path, "extern", "cuda", lib_name, "ops")
    sources = [
        os.path.join(parent, fname)
        for parent, _, files in os.walk(src_root)
        for fname in files
        if fname.endswith("op.cc") or fname.endswith("op.h")
    ]
    if not sources:
        return
    # compile the collected sources and expose the resulting operators
    module = compile_custom_ops(sources, return_module=True,
        extra_flags=f" -I\"{jt_cuda_include}\" -I\"{jt_culib_include}\" {extra_flags} ")
    globals()[lib_name + "_ops"] = module.ops
    globals()[lib_name] = module
    LOG.vv(f"Get {lib_name}_ops: " + str(dir(module.ops)))
if setup_fake_cuda_lib:
_setup_fake_cuda_lib()
def install_cutt(root_folder):
# Modified from: https://github.com/ap-hynninen/cutt
url = "https://codeload.github.com/Jittor/cutt/zip/v1.1"
url = "https://codeload.github.com/Jittor/cutt/zip/v1.2"
filename = "cutt-1.1.zip"
filename = "cutt-1.2.zip"
fullname = os.path.join(root_folder, filename)
dirname = os.path.join(root_folder, filename.replace(".zip",""))
true_md5 = "7bb71cf7c49dbe57772539bf043778f7"
true_md5 = "14d0fd1132c8cd657dc3cf29ce4db931"
if os.path.exists(fullname):
md5 = run_cmd('md5sum '+fullname).split()[0]
from jittor_utils.misc import calculate_md5
md5 = calculate_md5(fullname)
if md5 != true_md5:
os.remove(fullname)
shutil.rmtree(dirname)
if not os.path.isfile(os.path.join(dirname, "bin", "cutt_test")):
LOG.i("Downloading cutt...")
download_url_to_local(url, filename, root_folder, true_md5)
CUTT_PATH = os.environ.get("CUTT_PATH", "")
if not os.path.isfile(os.path.join(cache_path, "libcutt"+so)) or CUTT_PATH:
if CUTT_PATH:
dirname = CUTT_PATH
else:
LOG.i("Downloading cutt...")
download_url_to_local(url, filename, root_folder, true_md5)
import zipfile
import zipfile
zf = zipfile.ZipFile(fullname)
try:
zf.extractall(path=root_folder)
except RuntimeError as e:
print(e)
raise
zf.close()
zf = zipfile.ZipFile(fullname)
try:
zf.extractall(path=root_folder)
except RuntimeError as e:
print(e)
raise
zf.close()
LOG.i("installing cutt...")
arch_flag = ""
# -Xptxas -dlcm=ca actually not work
arch_flag = " -Xptxas -dlcm=ca "
if len(flags.cuda_archs):
arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
run_cmd(f"make NVCC_GENCODE='{arch_flag}' nvcc_path='{nvcc_path}'", cwd=dirname)
cutt_include = f" -I\"{dirname}/include\" -I\"{dirname}/src\" "
files = glob.glob(dirname+"/src/*.c*", recursive=True)
files2 = []
for f in files:
if f.endswith("cutt_bench.cpp") or \
f.endswith("cutt_test.cpp"):
continue
files2.append(f)
cutt_flags = cc_flags+opt_flags+cutt_include
compile(cc_path, cutt_flags, files2, cache_path+"/libcutt"+so, cuda_flags=arch_flag)
return dirname
def setup_cutt():
@ -250,16 +430,15 @@ def setup_cutt():
if cutt_lib_path is None or cutt_include_path is None:
LOG.v("setup cutt...")
# cutt_path decouple with cc_path
from pathlib import Path
cutt_path = os.path.join(str(Path.home()), ".cache", "jittor", "cutt")
cutt_path = os.path.join(jit_utils.home(), ".cache", "jittor", "cutt")
make_cache_dir(cutt_path)
install_cutt(cutt_path)
cutt_home = os.path.join(cutt_path, "cutt-1.1")
cutt_home = os.path.join(cutt_path, "cutt-1.2")
cutt_include_path = os.path.join(cutt_home, "src")
cutt_lib_path = os.path.join(cutt_home, "lib")
cutt_lib_path = cache_path
cutt_lib_name = os.path.join(cutt_lib_path, "libcutt.so")
cutt_lib_name = os.path.join(cutt_lib_path, "libcutt"+so)
assert os.path.isdir(cutt_include_path)
assert os.path.isdir(cutt_lib_path)
assert os.path.isfile(cutt_lib_name), cutt_lib_name
@ -272,9 +451,79 @@ def setup_cutt():
cutt_op_dir = os.path.join(jittor_path, "extern", "cuda", "cutt", "ops")
cutt_op_files = [os.path.join(cutt_op_dir, name) for name in os.listdir(cutt_op_dir)]
cutt_ops = compile_custom_ops(cutt_op_files,
extra_flags=f" -I'{cutt_include_path}'")
extra_flags=f" -I\"{cutt_include_path}\" -L\"{cutt_lib_path}\" -llibcutt ")
LOG.vv("Get cutt_ops: "+str(dir(cutt_ops)))
def install_cutlass(root_folder):
    """Ensure the CUTLASS headers are available under *root_folder*.

    Honors the ``CUTLASS_PATH`` environment variable: when set, that
    directory is used as-is and nothing is downloaded.  Otherwise a cached
    ``cutlass.zip`` is verified by md5 (a mismatching archive and any stale
    extraction are removed), downloaded if needed, and unpacked.

    Returns the directory containing the CUTLASS sources.
    """
    # Upstream project: https://github.com/NVIDIA/cutlass (mirrored zip below)
    url = "https://cg.cs.tsinghua.edu.cn/jittor/assets/cutlass.zip"
    filename = "cutlass.zip"
    fullname = os.path.join(root_folder, filename)
    dirname = os.path.join(root_folder, filename.replace(".zip",""))
    true_md5 = "999ecb7e217e40c497bc3d0ded6643f0"
    if os.path.exists(fullname):
        from jittor_utils.misc import calculate_md5
        md5 = calculate_md5(fullname)
        if md5 != true_md5:
            # corrupted or outdated archive: drop it and any partial extraction
            os.remove(fullname)
            # guard: shutil.rmtree raises FileNotFoundError when dirname
            # never existed (e.g. the zip was cached but never extracted)
            if os.path.isdir(dirname):
                shutil.rmtree(dirname)
    CUTLASS_PATH = os.environ.get("CUTLASS_PATH", "")
    installed_header = os.path.join(jit_utils.home(),
        ".cache/jittor/cutlass/cutlass/include/cutlass/cutlass.h")
    if not os.path.isfile(installed_header) or CUTLASS_PATH:
        if CUTLASS_PATH:
            dirname = CUTLASS_PATH
        else:
            LOG.i("Downloading cutlass...")
            download_url_to_local(url, filename, root_folder, true_md5)
            import zipfile
            # context manager closes the archive even when extraction fails
            with zipfile.ZipFile(fullname) as zf:
                try:
                    zf.extractall(path=root_folder)
                except RuntimeError as e:
                    print(e)
                    raise
    return dirname
def setup_cutlass():
    """Prepare the CUTLASS headers, downloading them on first use.

    Disabled entirely when CUDA is absent or ``use_cutlass=0`` is set in the
    environment; an explicit ``cutlass_include_path`` skips the install step.
    """
    global cutlass_ops, use_cutlass
    # Without CUDA there is nothing to set up.
    if not has_cuda:
        use_cutlass = False
        return
    use_cutlass = os.environ.get("use_cutlass", "1") == "1"
    cutlass_ops = None
    if not use_cutlass:
        return
    include_override = os.environ.get("cutlass_include_path")
    if include_override is not None:
        return
    LOG.v("setup cutlass...")
    # cache directory is decoupled from cc_path on purpose
    cache_dir = os.path.join(jit_utils.home(), ".cache", "jittor", "cutlass")
    make_cache_dir(cache_dir)
    install_cutlass(cache_dir)
def install_nccl(root_folder):
url = "https://github.com/NVIDIA/nccl/archive/v2.8.4-1.tar.gz"
@ -292,7 +541,8 @@ def install_nccl(root_folder):
if os.path.isdir(dirname):
shutil.rmtree(dirname)
if not os.path.isfile(os.path.join(dirname, "build", "lib", "libnccl.so")):
LOG.i("Downloading nccl...")
if not os.path.isfile(os.path.join(root_folder, filename)):
LOG.i("Downloading nccl...")
download_url_to_local(url, filename, root_folder, true_md5)
if core.get_device_count() == 0:
@ -309,12 +559,13 @@ def install_nccl(root_folder):
if len(flags.cuda_archs):
arch_flag = f" -arch=compute_{min(flags.cuda_archs)} "
arch_flag += ''.join(map(lambda x:f' -code=sm_{x} ', flags.cuda_archs))
run_cmd(f"make -j8 src.build CUDA_HOME='{cuda_home}' NVCC_GENCODE='{arch_flag}' ", cwd=dirname)
run_cmd(f"CC=\"{cc_path}\" CXX=\"{cc_path}\" make -j8 src.build CUDA_HOME='{cuda_home}' NVCC_GENCODE='{arch_flag} --cudart=shared ' ", cwd=dirname)
return dirname
def setup_nccl():
global nccl_ops, use_nccl
global nccl, nccl_ops, use_nccl
use_nccl = os.environ.get("use_nccl", "1")=="1"
nccl = None
nccl_ops = None
if not has_cuda or not has_mpi:
use_nccl = False
@ -326,8 +577,7 @@ def setup_nccl():
if nccl_lib_path is None or nccl_include_path is None:
LOG.v("setup nccl...")
# nccl_path decouple with cc_path
from pathlib import Path
nccl_path = os.path.join(str(Path.home()), ".cache", "jittor", "nccl")
nccl_path = os.path.join(jit_utils.home(), ".cache", "jittor", "nccl")
make_cache_dir(nccl_path)
nccl_home = install_nccl(nccl_path)
@ -354,10 +604,33 @@ def setup_nccl():
for fname in f:
nccl_src_files.append(os.path.join(r, fname))
nccl_ops = compile_custom_ops(nccl_src_files,
extra_flags=f" -I'{nccl_include_path}' {mpi_compile_flags} ")
nccl = compile_custom_ops(nccl_src_files,
extra_flags=f" -I\"{nccl_include_path}\" {mpi_compile_flags} ",
return_module=True, dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW,
gen_name_="jittor_nccl_core")
nccl_ops = nccl.ops
LOG.vv("Get nccl_ops: "+str(dir(nccl_ops)))
def setup_hccl():
    """Compile and load the HCCL (Ascend collective communication) ops.

    Requires the ``ASCEND_TOOLKIT_HOME`` environment variable to point at the
    Ascend toolkit.  On success the module global ``hccl_ops`` is bound to
    the compiled operator namespace.
    """
    global hccl_ops

    ascend_home = os.environ.get("ASCEND_TOOLKIT_HOME")
    # Fail fast with an actionable message; os.path.join(None, ...) would
    # otherwise raise an opaque TypeError.
    if not ascend_home:
        raise RuntimeError(
            "setup_hccl: environment variable ASCEND_TOOLKIT_HOME is not set")

    # collect every source file under extern/acl/hccl
    hccl_src_dir = os.path.join(jittor_path, "extern", "acl", "hccl")
    hccl_src_files = []
    for r, _, f in os.walk(hccl_src_dir):
        for fname in f:
            hccl_src_files.append(os.path.join(r, fname))

    # NOTE(review): the aarch64-linux subdirectory is hard-coded; confirm the
    # toolkit layout on non-aarch64 hosts before relying on this elsewhere.
    hccl_include_path = os.path.join(ascend_home, "aarch64-linux/include/hccl")
    hccl_lib_name = os.path.join(ascend_home, "aarch64-linux/lib64/libhccl.so")
    # load libhccl so symbols are available when the ops module is opened
    ctypes.CDLL(hccl_lib_name, dlopen_flags)

    hccl = compile_custom_ops(hccl_src_files,
        extra_flags=f" -I\"{hccl_include_path}\" {mpi_compile_flags} ",
        return_module=True, dlopen_flags=os.RTLD_GLOBAL | os.RTLD_NOW,
        gen_name_="jittor_hccl_core")
    hccl_ops = hccl.ops
    LOG.vv("Get hccl_ops: "+str(dir(hccl_ops)))
def manual_link(flags):
lib_dirs = []
libs = []
@ -387,9 +660,10 @@ def setup_mpi():
mpi_ops = None
mpi = None
has_mpi = False
if not use_mpi: return
mpicc_path = env_or_try_find('mpicc_path', 'mpicc')
if mpicc_path == "":
LOG.i("mpicc not found, distribution disabled.")
# LOG.i("mpicc not found, distribution disabled.")
use_mpi = False
else:
use_mpi = True
@ -411,7 +685,7 @@ def setup_mpi():
mpi_src_files.append(os.path.join(r, fname))
# mpi compile flags add for nccl
mpi_compile_flags += f" -I'{os.path.join(mpi_src_dir, 'inc')}' "
mpi_compile_flags += f" -I\"{os.path.join(mpi_src_dir, 'inc')}\" "
mpi_compile_flags = mpi_compile_flags.replace("-pthread", "")
mpi_version = get_version(mpicc_path)
@ -426,7 +700,7 @@ def setup_mpi():
mpi_ops = mpi.ops
LOG.vv("Get mpi: "+str(mpi.__dict__.keys()))
LOG.vv("Get mpi_ops: "+str(mpi_ops.__dict__.keys()))
def warper(func):
def wrapper(func):
def inner(self, *args, **kw):
return func(self, *args, **kw)
inner.__doc__ = func.__doc__
@ -434,14 +708,45 @@ def setup_mpi():
for k in mpi_ops.__dict__:
if not k.startswith("mpi_"): continue
if k == "mpi_test": continue
setattr(core.Var, k, warper(mpi_ops.__dict__[k]))
setattr(core.Var, k, wrapper(mpi_ops.__dict__[k]))
setup_mpi()
in_mpi = inside_mpi()
rank = mpi.world_rank() if in_mpi else 0
setup_nccl()
FIX_TORCH_ERROR = 0
if os.name != 'nt' and not in_mpi:
FIX_TORCH_ERROR = 1
if "FIX_TORCH_ERROR" in os.environ:
FIX_TORCH_ERROR = os.environ["FIX_TORCH_ERROR"] != "0"
if FIX_TORCH_ERROR:
try:
import torch
from jittor_utils import dirty_fix_pytorch_runtime_error
dirty_fix_pytorch_runtime_error()
except:
pass
cudnn = cublas = curand = cufft = cusparse = None
setup_mpi()
rank = mpi.world_rank() if in_mpi else 0
world_size = mpi.world_size() if in_mpi else 1
# if has_acl:
# setup_hccl()
# elif has_cuda:
# setup_nccl()
# setup_cutt()
# setup_cutlass()
setup_nccl()
setup_cutt()
setup_cutlass()
# try:
setup_mkl()
# except Exception as e:
# LOG.w("MKL install failed, msg:", e)
setup_cuda_extern()
# install backend extern library
for mod in jit_utils.backends:
if mod.install_extern():
break

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Guowei Yang <471184555@qq.com>
# Guoye Yang <498731903@qq.com>
@ -15,6 +15,8 @@ from collections.abc import Sequence
def argmax_pool(x, size, stride, padding=0):
    '''Max pooling over *x*: thin wrapper delegating to ``pool.pool`` with
    the ``'maximum'`` reduction.  *stride* must be a positive integer.'''
    if not stride > 0:
        raise RuntimeError(f"stride must be > 0, but got {stride}")
    return pool.pool(x, size, 'maximum', padding, stride)
def concat(arr, dim):
@ -28,8 +30,9 @@ def concat(arr, dim):
Example::
jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=1)
# return [[1],[2],[2],[2]]
>>> jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=1)
jt.Var([[1 2]
[2 2]], dtype=int32)
'''
# TODO: low performance when concat lots of vars
total_dim = 0
@ -179,7 +182,7 @@ def _setitem_old(x, slices, value):
# PATCH
def getitem(x, slices):
if isinstance(slices, jt.Var) and slices.dtype == "bool":
return getitem(x, tuple(slices.where()))
return getitem(x, slices.where())
if isinstance(slices, tuple):
ss = []
for s in slices:
@ -192,7 +195,14 @@ def getitem(x, slices):
def setitem(x, slices, value):
if isinstance(slices, jt.Var) and slices.dtype == "bool":
slices = tuple(slices.where())
if slices.shape == x.shape:
if isinstance(value, (int, float)):
value = jt.array(value).broadcast(x.shape)
return x.assign(slices.ternary(value, x))
elif isinstance(value, jt.Var) and value.shape == [1,]:
value = jt.broadcast(value, x.shape)
return x.assign(slices.ternary(value, x))
slices = slices.where()
elif isinstance(slices, tuple):
ss = []
for s in slices:
@ -201,11 +211,19 @@ def setitem(x, slices, value):
else:
ss.append(s)
slices = tuple(ss)
return x.assign(x.setitem(slices, value))
return x.check_cascade_setitem(x.setitem(slices, value))
jt.Var.__getitem__ = jt.Var.slice_var = getitem
jt.Var.__setitem__ = setitem
def _merge_dtypes(dtypes):
    '''Fold a non-empty list of dtype strings into the single dtype that
    binary "add" promotion yields across all of them.'''
    merged = dtypes[0]
    for d in dtypes[1:]:
        merged = jt.binary_dtype_infer("add", merged, d)
    return merged
@jt.flag_scope(amp_reg=4) # _custom_flag
def concat(arr, dim=0):
'''Concat Operator can concat a list of jt Var at a specfic dimension.
@ -218,20 +236,30 @@ def concat(arr, dim=0):
Example::
jt.concat([jt.array([[1],[2]]), jt.array([[2],[2]])], dim=1)
# return [[1],[2],[2],[2]]
# return jt.Var([[1,2],[2,2]],dtype=int32)
'''
if not isinstance(arr, Sequence):
raise TypeError("concat arr needs to be a tuple or list")
if len(arr) == 0:
raise ValueError("need at least one array to concat")
total_dim = 0
if dim < 0: dim += len(arr[0].shape)
base_dim = len(arr[0].shape)
if dim < 0: dim += base_dim
if dim < 0 or dim >= base_dim:
raise IndexError(f"Dimension out of range (expected to be in range of [{-base_dim}, {base_dim-1}], but got {dim})")
dtypes = []
for a in arr:
if len(a.shape) != base_dim:
raise RuntimeError(f"get different number of dimensions of {base_dim} and {len(a.shape)}")
for i in range(base_dim):
if i != dim and a.shape[i] != arr[0].shape[i]:
raise RuntimeError(f"Sizes of vars must match except in dimension {dim}. Expected size {arr[0].shape[i]} but got size {a.shape[i]} for dimension number {i} in the list.")
total_dim += a.shape[dim]
dtypes.append(str(a.dtype))
cdim = 0
shape = list(a.shape)
shape[dim] = total_dim
s = jt.empty(shape, a.dtype)
s = jt.empty(shape, dtype = _merge_dtypes(dtypes))
slices = [slice(None)]*len(a.shape)
for a in arr:
if a.shape[dim] == 0:
@ -242,3 +270,5 @@ Example::
# s = jt.setitem(s, tuple(slices), a)
cdim += a.shape[dim]
return s
cat = concat

View File

@ -1,5 +1,6 @@
from .dataset import Dataset, ImageFolder
from .dataset import Dataset, ImageFolder, dataset_root, TensorDataset, VarDataset, DataLoader
from .mnist import MNIST
from .cifar import CIFAR10, CIFAR100
from .voc import VOC
from .sampler import *

View File

@ -0,0 +1,189 @@
import os
from jittor_utils.misc import download_and_extract_archive, check_integrity
from PIL import Image
import sys, pickle
import numpy as np
from jittor.dataset import Dataset, dataset_root
class CIFAR10(Dataset):
    """`CIFAR10 <https://www.cs.toronto.edu/~kriz/cifar.html>`_ Dataset.

    Args:
        root (string): Root directory of dataset where directory
            ``cifar-10-batches-py`` exists or will be saved to if download is set to True.
        train (bool, optional): If True, creates dataset from training set, otherwise
            creates from test set.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    Example::

        from jittor.dataset.cifar import CIFAR10
        a = CIFAR10()
        a.set_attrs(batch_size=16)
        for imgs, labels in a:
            print(imgs.shape, labels.shape)
            break
    """
    # Archive layout and checksums of the official CIFAR-10 python release.
    base_folder = 'cifar-10-batches-py'
    url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
    filename = "cifar-10-python.tar.gz"
    tgz_md5 = 'c58f30108f718f92721af3b95e74349a'
    # [file name inside the archive, expected md5] pairs.
    train_list = [
        ['data_batch_1', 'c99cafc152244af753f735de768cd75f'],
        ['data_batch_2', 'd4bba439e000b95fd0a9bffe97cbabec'],
        ['data_batch_3', '54ebc095f3ab1f0389bbae665268c751'],
        ['data_batch_4', '634d18415352ddfa80567beed471001a'],
        ['data_batch_5', '482c414d41f54cd18b22e5b47cb7c3cb'],
    ]
    test_list = [
        ['test_batch', '40351d587109b95175f43aff81a1287e'],
    ]
    # Metadata file mapping numeric labels to class names.
    meta = {
        'filename': 'batches.meta',
        'key': 'label_names',
        'md5': '5ff9c542aee3614f3951f8cda6e48888',
    }

    def __init__(self, root=dataset_root+"/cifar_data/", train=True, transform=None, target_transform=None,
                 download=True):
        super(CIFAR10, self).__init__()
        self.root = root
        self.transform=transform
        self.target_transform=target_transform
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError('Dataset not found or corrupted.' +
                               ' You can use download=True to download it')

        if self.train:
            downloaded_list = self.train_list
        else:
            downloaded_list = self.test_list

        self.data = []
        self.targets = []

        # now load the picked numpy arrays
        for file_name, checksum in downloaded_list:
            file_path = os.path.join(self.root, self.base_folder, file_name)
            with open(file_path, 'rb') as f:
                if sys.version_info[0] == 2:
                    entry = pickle.load(f)
                else:
                    # latin1 is needed to decode Python-2-era pickles on Python 3
                    entry = pickle.load(f, encoding='latin1')
                self.data.append(entry['data'])
                # CIFAR-10 batches store 'labels'; CIFAR-100 stores 'fine_labels'
                if 'labels' in entry:
                    self.targets.extend(entry['labels'])
                else:
                    self.targets.extend(entry['fine_labels'])

        # Stack all batches into one (N, 3, 32, 32) array, then move channels
        # last so Image.fromarray accepts each sample.
        self.data = np.vstack(self.data).reshape(-1, 3, 32, 32)
        self.data = self.data.transpose((0, 2, 3, 1))  # convert to HWC

        self._load_meta()

    def _load_meta(self):
        # Read the class-name list and build the name -> index mapping.
        path = os.path.join(self.root, self.base_folder, self.meta['filename'])
        if not check_integrity(path, self.meta['md5']):
            raise RuntimeError('Dataset metadata file not found or corrupted.' +
                               ' You can use download=True to download it')
        with open(path, 'rb') as infile:
            if sys.version_info[0] == 2:
                data = pickle.load(infile)
            else:
                data = pickle.load(infile, encoding='latin1')
            self.classes = data[self.meta['key']]
        self.class_to_idx = {_class: i for i, _class in enumerate(self.classes)}

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        img, target = self.data[index], self.targets[index]

        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return len(self.data)

    def _check_integrity(self):
        # Verify every train/test batch file against its recorded md5.
        root = self.root
        for fentry in (self.train_list + self.test_list):
            filename, md5 = fentry[0], fentry[1]
            fpath = os.path.join(root, self.base_folder, filename)
            if not check_integrity(fpath, md5):
                return False
        return True

    def download(self):
        # Skip the download when all files are already present and valid.
        if self._check_integrity():
            print('Files already downloaded and verified')
            return
        download_and_extract_archive(self.url, self.root, filename=self.filename, md5=self.tgz_md5)

    def extra_repr(self):
        return "Split: {}".format("Train" if self.train is True else "Test")
class CIFAR100(CIFAR10):
    """`CIFAR100 <https://www.cs.toronto.edu/~kriz/cifar.html>`_ Dataset.

    This is a subclass of the `CIFAR10` Dataset.  Only the archive location,
    checksums and metadata keys differ; all loading logic is inherited.

    Example::

        from jittor.dataset.cifar import CIFAR100
        a = CIFAR100()
        a.set_attrs(batch_size=16)
        for imgs, labels in a:
            print(imgs.shape, labels.shape)
            break
    """
    base_folder = 'cifar-100-python'
    url = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz"
    filename = "cifar-100-python.tar.gz"
    tgz_md5 = 'eb9058c3a382ffc7106e4002c42a8d85'
    # CIFAR-100 ships a single 'train' and a single 'test' batch file.
    train_list = [
        ['train', '16019d7e3df5f24257cddd939b257f8d'],
    ]
    test_list = [
        ['test', 'f0ef6b0ae62326f3e7ffdfab6717acfc'],
    ]
    # 'fine_label_names' selects the 100 fine-grained class names.
    meta = {
        'filename': 'meta',
        'key': 'fine_label_names',
        'md5': '7973b15100ade9c7d40fb424638fde48',
    }

View File

@ -1,5 +1,5 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Meng-Hao Guo <guomenghao1997@gmail.com>
# Dun Liang <randonlang@gmail.com>.
@ -21,11 +21,50 @@ import signal
from jittor_utils import LOG
import jittor as jt
import time
import jittor_utils as jit_utils
dataset_root = os.path.join(pathlib.Path.home(), ".cache", "jittor", "dataset")
dataset_root = os.path.join(jit_utils.home(), ".cache", "jittor", "dataset")
mp_log_v = os.environ.get("mp_log_v", 0)
mpi = jt.mpi
img_open_hook = HookTimer(Image, "open")
CHECK_MEMORY = int(os.environ.get("CHECK_MEMORY", "0"))
if os.name == "nt":
    # Windows-only replacement ring buffer backed by multiprocessing shared
    # memory (NOTE(review): presumably because the default fork-based data
    # path is unavailable on Windows — confirm).
    from multiprocessing import shared_memory

    class RingBuffer:
        """Wrapper around ``jt.core.RingBuffer`` on top of a ``SharedMemory``
        segment; ``__reduce__`` makes it pickle-able by re-attaching to the
        same segment on the receiving side."""

        def __init__(self, size, shm=None):
            # round size up to the next power of two
            for i in range(100):
                if (1<<i) >= size: break
            size = 1<<i
            init = False
            if shm is None:
                # this process creates the segment and initializes the buffer
                init = True
                shm = shared_memory.SharedMemory(create=True, size=size+1024)
            # id(shm.buf) is handed to the core ring buffer; the extra 1024
            # bytes beyond `size` are reserved (purpose not visible here)
            rb = jt.core.RingBuffer(size, id(shm.buf), init)
            self.size = size
            self.shm = shm
            self.rb = rb

        def __reduce__(self):
            # unpickling re-attaches to the existing shared memory segment
            return (RingBuffer, (self.size, self.shm))

        def __del__(self):
            # drop the core buffer before the shared memory backing it
            del self.rb
            del self.shm

        # thin delegations to the underlying core ring buffer
        def push(self, obj): self.send(obj)
        def pop(self): return self.recv()
        def send(self, obj): self.rb.push(obj)
        def recv(self): return self.rb.pop()
        def clear(self): return self.rb.clear()
        def stop(self): return self.rb.stop()
        def is_stop(self): return self.rb.is_stop()
        def total_pop(self): return self.rb.total_pop()
        def total_push(self): return self.rb.total_push()
        def __repr__(self): return repr(self.rb)
        def keep_numpy_array(self, keep): self.rb.keep_numpy_array(keep)

    jt.RingBuffer = RingBuffer
class Worker:
def __init__(self, target, args, buffer_size, keep_numpy_array=False):
@ -48,6 +87,8 @@ class Dataset(object):
[in] drop_last(bool): if true, the last batch of dataset might smaller than batch_size, default True.
[in] num_workers(int): number of workers for loading data.
[in] buffer_size(int): buffer size for each worker in bytes, default(512MB).
[in] keep_numpy_array(bool): return numpy array rather than jittor array, default(False).
[in] endless(bool): will this dataset yield data forever, default(False).
Example::
@ -70,8 +111,11 @@ class Dataset(object):
num_workers = 0,
buffer_size = 512*1024*1024,
stop_grad = True,
keep_numpy_array = False):
keep_numpy_array = False,
endless = False):
super().__init__()
if os.environ.get("DISABLE_MULTIPROCESSING", '0') == '1':
num_workers = 0
self.total_len = None
self.batch_size = batch_size
self.shuffle = shuffle
@ -80,7 +124,12 @@ class Dataset(object):
self.buffer_size = buffer_size
self.stop_grad = stop_grad
self.keep_numpy_array = keep_numpy_array
self.endless = endless
self.epoch_id = 0
self.sampler = None
self._disable_workers = False
self._shuffle_rng = np.random.default_rng(1)
self.dataset = self
def __getitem__(self, index):
raise NotImplementedError
@ -129,13 +178,16 @@ class Dataset(object):
if self.stop_grad else jt.array(x)
if isinstance(batch, np.ndarray):
return to_jt(batch)
if isinstance(batch, dict):
new_batch = {}
for k,v in batch.items():
new_batch[k] = self.to_jittor(v)
return new_batch
if not isinstance(batch, (list, tuple)):
return batch
new_batch = []
for a in batch:
if isinstance(a, np.ndarray) or \
isinstance(a, int) or \
isinstance(a, float):
if isinstance(a, np.ndarray):
new_batch.append(to_jt(a))
else:
new_batch.append(self.to_jittor(a))
@ -162,11 +214,13 @@ class Dataset(object):
def _worker_main(self, worker_id, buffer, status):
import jittor_utils
jt.flags.use_cuda_host_allocator = 0
jittor_utils.cc.init_subprocess()
jt.jt_init_subprocess()
seed = jt.get_seed()
wseed = (seed ^ worker_id) ^ 1234
jt.set_seed(wseed)
wseed = (seed ^ (worker_id*1167)) ^ 1234
jt.set_global_seed(wseed)
# parallel_op_compiler still problematic,
# it is not work on ubuntu 16.04. but worked on ubuntu 20.04
# it seems like the static value of parallel compiler
@ -180,15 +234,20 @@ class Dataset(object):
while True:
# get id
with gid_lock:
while gid_obj.value >= self.batch_len or buffer.is_stop():
while buffer.is_stop() or self.idqueue.is_stop() or \
gid_obj.value >= self.batch_len:
self.num_idle.value += 1
self.num_idle_c.notify()
self.gidc.wait()
self.num_idle.value -= 1
cid = gid_obj.value
self.idmap[cid] = worker_id
batch_index_list = self.index_list_numpy[
cid*self.real_batch_size:
min(self.real_len, (cid+1)*self.real_batch_size)
].copy()
gid_obj.value += 1
self.gidc.notify()
with self.idqueue_lock:
self.idqueue.push(worker_id)
now = time.time()
other_time = now - start
start = now
@ -197,8 +256,8 @@ class Dataset(object):
batch = []
if mp_log_v:
print(f"#{worker_id} {os.getpid()} load batch", cid*self.real_batch_size, min(self.real_len, (cid+1)*self.real_batch_size))
for i in range(cid*self.real_batch_size, min(self.real_len, (cid+1)*self.real_batch_size)):
batch.append(self[self.index_list[i]])
for i in batch_index_list:
batch.append(self[i])
batch = self.collate_batch(batch)
now = time.time()
data_time = now - start
@ -276,10 +335,10 @@ Example::
if not hasattr(self, "workers"):
return
msg = [""]
msg.append(f"progress:{self.last_id}/{self.batch_len}")
msg.append(f"progress:{self.batch_id}/{self.batch_len}")
msg.append(f"batch(s): {self.batch_time:.3f}\twait(s):{self.wait_time:.3f}")
msg.append(f"recv(s): {self.recv_time:.3f}\tto_jittor(s):{self.to_jittor_time:.3f}")
msg.append(f"last 10 workers: {self.idmap[max(0, self.last_id-9):self.last_id+1]}")
msg.append(f"last 10 workers: {self.last_ids}")
msg.append(f"ID\twait(s)\topen(s)\tload(s)\tsend(s)\ttotal(s)")
for i in range(self.num_workers):
w = self.workers[i]
@ -291,6 +350,7 @@ Example::
# stop workers
for w in self.workers:
w.buffer.stop()
self.idqueue.stop()
# wait until all workers idle
if self.num_idle.value < self.num_workers:
with self.gid.get_lock():
@ -304,29 +364,35 @@ Example::
# clean workers' buffer
for w in self.workers:
w.buffer.clear()
self.idqueue.clear()
self.gid.value = 0
def _init_workers(self):
def _init_workers(self, index_list):
jt.migrate_all_to_cpu()
jt.clean()
jt.gc()
self.index_list = mp.Array('i', self.real_len, lock=False)
workers = []
# batch id to worker id
self.idmap = mp.Array('i', self.batch_len, lock=False)
# get worker id
self.idqueue = jt.RingBuffer(2048)
self.idqueue_lock = mp.Lock()
# global token index
self.gid = mp.Value('i', self.batch_len)
self.gid.value = 0
# global token index condition
self.gidc = mp.Condition(self.gid.get_lock())
# number of idle workers
self.num_idle = mp.Value('i', 0, lock=False)
# number of idle workers condition
self.num_idle_c = mp.Condition(self.gid.get_lock())
self.index_list_numpy = np.ndarray(dtype='int32', shape=self.real_len, buffer=self.index_list)
self.index_list_numpy[:] = index_list
for i in range(self.num_workers):
w = Worker(target=self._worker_main, args=(i,),
buffer_size=self.buffer_size,
keep_numpy_array=self.keep_numpy_array)
workers.append(w)
self.workers = workers
self.index_list_numpy = np.ndarray(dtype='int32', shape=self.real_len, buffer=self.index_list)
def reset(self):
if not hasattr(self, "workers"):
@ -334,7 +400,8 @@ Example::
self._stop_all_workers()
self.terminate()
del self.index_list
del self.idmap
del self.idqueue
del self.idqueue_lock
del self.gid
del self.gidc
del self.num_idle
@ -345,14 +412,35 @@ Example::
def __del__(self):
if mp_log_v:
print("dataset deleted")
self.terminate()
try:
self.terminate()
except:
pass
def __deepcopy__(self, memo=None, _nil=[]):
from copy import deepcopy
if memo is None:
memo = {}
d = id(self)
y = memo.get(d, _nil)
if y is not _nil:
return y
obj = self.__class__.__new__(self.__class__)
memo[d] = id(obj)
exclude_key = {"index_list", "idqueue", "idqueue_lock", "gid", "gidc", "num_idle", "num_idle_c", "workers", "index_list_numpy", "dataset", "idqueue", "idqueue_lock"}
for k,v in self.__dict__.items():
if k in exclude_key: continue
obj.__setattr__(k, deepcopy(v))
obj.dataset = obj
return obj
def __real_len__(self):
if self.total_len is None:
self.total_len = len(self)
return self.total_len
def __iter__(self):
def _get_index_list(self):
if self.total_len is None:
self.total_len = len(self)
# maybe rewrite by sampler
@ -366,7 +454,10 @@ Example::
elif self.shuffle == False:
index_list = get_order_list(self.total_len)
else:
index_list = get_random_list(self.total_len)
# using _shuffle_rng to generate multiprocess
# consist shuffle list
# index_list = get_random_list(self.total_len)
index_list = self._shuffle_rng.permutation(range(self.total_len))
# scatter index_list for all mpi process
# scatter rule:
@ -381,7 +472,8 @@ Example::
world_size = mpi.world_size()
world_rank = mpi.world_rank()
index_list = np.int32(index_list)
mpi.broadcast(index_list, 0)
# TODO: mpi broadcast in subprocess has bug, fix it
# mpi.broadcast(index_list, 0)
assert self.batch_size >= world_size, \
f"Batch size({self.batch_size}) is smaller than MPI world_size({world_size})"
@ -403,85 +495,138 @@ Example::
real_last_batch = (last_batch-1)//world_size+1
l = real_last_batch * world_rank
r = l + real_last_batch
if r > last_batch: r = last_batch
if l >= r: l = r-1
if r > last_batch:
r = last_batch
l = r-real_last_batch
index_list = np.concatenate([fix_batch_l, last_batch_l[l:r]])
else:
index_list = fix_batch_l
self.real_len = len(index_list)
self.real_batch_size = real_batch_size
assert total_len // self.batch_size == \
self.real_len // self.real_batch_size, f"Number of batches({total_len // self.batch_size}!={self.real_len // self.real_batch_size}) not match, total_len: {total_len}, batch_size: {self.batch_size}, real_len: {self.real_len}, real_batch_size: {self.real_batch_size}"
# assert total_len // self.batch_size == \
# self.real_len // self.real_batch_size, f"Number of batches({total_len // self.batch_size}!={self.real_len // self.real_batch_size}) not match, total_len: {total_len}, batch_size: {self.batch_size}, real_len: {self.real_len}, real_batch_size: {self.real_batch_size}"
# print(f"Number of batches({total_len // self.batch_size}!={self.real_len // self.real_batch_size}) not match, total_len: {total_len}, batch_size: {self.batch_size}, real_len: {self.real_len}, real_batch_size: {self.real_batch_size}")
# print("mpi dataset init ")
else:
self.real_len = self.total_len
self.real_len = len(index_list)
self.real_batch_size = self.batch_size
self.batch_len = self.__batch_len__()
if self.drop_last:
self.batch_len = self.real_len // self.real_batch_size
else:
self.batch_len = (self.real_len-1) // self.real_batch_size + 1
return index_list
def _epochs(self):
if self.endless:
while True:
yield
self.epoch_id += 1
else:
yield
def __iter__(self):
if self._disable_workers:
self.num_workers = 0
index_list = self._get_index_list()
if not hasattr(self, "workers") and self.num_workers:
self._init_workers()
self._init_workers(index_list)
self.last_ids = [-1] * 10
if self.num_workers:
self._stop_all_workers()
self.index_list_numpy[:] = index_list
gid_obj = self.gid.get_obj()
gid_lock = self.gid.get_lock()
with gid_lock:
gid_obj.value = 0
self.gidc.notify_all()
start = time.time()
self.batch_time = 0
for i in range(self.batch_len):
# try not get lock first
if gid_obj.value <= i:
with gid_lock:
if gid_obj.value <= i:
if mp_log_v:
print("wait")
self.gidc.wait()
now = time.time()
self.wait_time = now - start
start = now
gid_obj = self.gid.get_obj()
gid_lock = self.gid.get_lock()
self.last_id = i
worker_id = self.idmap[i]
w = self.workers[worker_id]
if mp_log_v:
print(f"#{worker_id} {os.getpid()} recv buffer", w.buffer)
batch = w.buffer.recv()
now = time.time()
self.recv_time = now - start
start = now
for _ in self._epochs():
with gid_lock:
if self.num_idle.value:
self.gidc.notify_all()
if mp_log_v:
print(f"#{worker_id} {os.getpid()} recv", type(batch).__name__, [ type(b).__name__ for b in batch ])
batch = self.to_jittor(batch)
now = time.time()
self.to_jittor_time = now - start
start = now
for i in range(self.batch_len):
if self.num_idle.value:
with gid_lock:
if self.num_idle.value and \
gid_obj.value >= self.batch_len:
index_list = self._get_index_list()
self.index_list_numpy[:] = index_list
gid_obj.value = 0
self.gidc.notify_all()
yield batch
# get which worker has this batch
worker_id = self.idqueue.pop()
now = time.time()
self.batch_time = now - start
start = now
now = time.time()
self.wait_time = now - start
start = now
self.last_ids[i%10] = worker_id
self.batch_id = i
w = self.workers[worker_id]
if mp_log_v:
print(f"#{worker_id} {os.getpid()} recv buffer", w.buffer)
batch = w.buffer.recv()
now = time.time()
self.recv_time = now - start
start = now
if mp_log_v:
print(f"#{worker_id} {os.getpid()} recv", type(batch).__name__, [ type(b).__name__ for b in batch ])
batch = self.to_jittor(batch)
now = time.time()
self.to_jittor_time = now - start
start = now
yield batch
now = time.time()
self.batch_time = now - start
start = now
if CHECK_MEMORY and self.batch_id % CHECK_MEMORY == 0:
jt.display_memory_info()
else:
batch_data = []
for idx in index_list:
batch_data.append(self[int(idx)])
if len(batch_data) == self.real_batch_size:
for _ in self._epochs():
self.batch_id = 0
batch_data = []
for idx in index_list:
batch_data.append(self[int(idx)])
if len(batch_data) == self.real_batch_size:
batch_data = self.collate_batch(batch_data)
tmp = batch_data
batch_data = self.to_jittor(batch_data)
# breakpoint()
yield batch_data
self.batch_id += 1
if CHECK_MEMORY and self.batch_id % CHECK_MEMORY == 0:
jt.display_memory_info()
batch_data = []
# depend on drop_last
if not self.drop_last and len(batch_data) > 0:
batch_data = self.collate_batch(batch_data)
batch_data = self.to_jittor(batch_data)
self.batch_id += 1
yield batch_data
batch_data = []
# depend on drop_last
if not self.drop_last and len(batch_data) > 0:
batch_data = self.collate_batch(batch_data)
batch_data = self.to_jittor(batch_data)
yield batch_data
def DataLoader(dataset: Dataset, *args, **kargs):
    """ Simple dataloader.

    Thin compatibility wrapper: Jittor datasets iterate directly, so this
    merely forwards all arguments to ``dataset.set_attrs`` and returns the
    same (mutated) dataset object rather than a separate loader.

    Example::

        train_dir = './data/celebA_train'
        train_dataset = ImageFolder(train_dir)
        dataloader = jt.dataset.DataLoader(train_dataset, batch_size=8)
    """
    return dataset.set_attrs(*args, **kargs)
class ImageFolder(Dataset):
"""
@ -537,3 +682,47 @@ class ImageFolder(Dataset):
if self.transform:
img = self.transform(img)
return img, self.imgs[k][1]
class VarDataset(Dataset):
    """ Dataset using Var directly, TensorDataset is alias of VarDataset, Example::

    import jittor as jt
    from jittor.dataset import VarDataset

    x = jt.array([1,2,3])
    y = jt.array([4,5,6])
    z = jt.array([7,8,9])
    dataset = VarDataset(x, y, z)
    dataset.set_attrs(batch_size=1)

    for a,b,c in dataset:
        print(a,b,c)
    # will print
    # 1,4,7
    # 2,5,8
    # 3,6,9
    """
    def __init__(self, *args):
        """Store the given vars; all must share the same first-dim length."""
        super().__init__()
        self.args = args
        # Vars are served in-process; multiprocess workers are disabled.
        self._disable_workers = True
        # Fixed error-message typo ("At lease one args").
        assert len(args), "At least one arg is required"
        l = len(args[0])
        for a in args:
            assert l == len(a), "Len should be the same"
        self.set_attrs(total_len=l)

    def __getitem__(self, idx):
        # One sample: the idx-th element of every stored var.
        return [ a[idx] for a in self.args ]

    def collate_batch(self, batch):
        """Collate with the module-level helper, then squeeze the trailing
        axis for entries backed by 1-D vars so batches keep the source rank."""
        b = collate_batch(batch)
        for i in range(len(self.args)):
            x = b[i]
            if jt.is_var(self.args[i]) and self.args[i].ndim == 1:
                x.assign(x.squeeze(-1))
        return b
TensorDataset = VarDataset

View File

@ -7,12 +7,14 @@
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os
import string
import numpy as np
import gzip
from PIL import Image
# our lib jittor import
from jittor.dataset.dataset import Dataset, dataset_root
from jittor.utils.misc import ensure_dir, download_url_to_local
from jittor_utils.misc import ensure_dir, download_url_to_local
import jittor as jt
import jittor.transform as trans
@ -24,7 +26,7 @@ class MNIST(Dataset):
[in] data_root(str): your data root.
[in] train(bool): choose model train or val.
[in] download(bool): Download data automatically if download is Ture.
[in] download(bool): Download data automatically if download is True.
[in] batch_size(int): Data batch size.
[in] shuffle(bool): Shuffle data if true.
[in] transform(jittor.transform): transform data.
@ -94,3 +96,105 @@ class MNIST(Dataset):
for url, md5 in resources:
filename = url.rpartition('/')[2]
download_url_to_local(url, filename, self.data_root, md5)
class EMNIST(Dataset):
    '''
    Jittor's own class for loading EMNIST dataset.

    Args::

        [in] data_root(str): your data root.
        [in] split(str): one of 'byclass', 'bymerge', 'balanced', 'letters', 'digits', 'mnist'.
        [in] train(bool): choose model train or val.
        [in] download(bool): Download data automatically if download is True.
        [in] batch_size(int): Data batch size.
        [in] shuffle(bool): Shuffle data if true.
        [in] transform(jittor.transform): transform data.

    Example::

        from jittor.dataset.mnist import EMNIST
        train_loader = EMNIST(train=True).set_attrs(batch_size=16, shuffle=True)
        for i, (imgs, target) in enumerate(train_loader):
            ...
    '''
    # Lowercase letters that the 'bymerge'/'balanced' splits fold into their
    # uppercase counterparts.
    _merged_classes = {'c', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 's', 'u', 'v', 'w', 'x', 'y', 'z'}
    _all_classes = set(string.digits + string.ascii_letters)
    # split name -> ordered list of human-readable class labels.
    classes_split_dict = {
        'byclass': sorted(list(_all_classes)),
        'bymerge': sorted(list(_all_classes - _merged_classes)),
        'balanced': sorted(list(_all_classes - _merged_classes)),
        'letters': ['N/A'] + list(string.ascii_lowercase),
        'digits': list(string.digits),
        'mnist': list(string.digits),
    }

    def __init__(self, data_root=dataset_root+"/emnist_data/",
                 split='byclass',
                 train=True,
                 download=True,
                 batch_size = 16,
                 shuffle = False,
                 transform=None):
        # if you want to test resnet etc you should set input_channel = 3, because the net set 3 as the input dimensions
        super().__init__()
        self.data_root = data_root
        self.is_train = train
        self.transform = transform
        self.batch_size = batch_size
        self.shuffle = shuffle
        if download == True:
            self.download_url()

        # The downloaded archive extracts into a "gzip" subdirectory.
        data_root = os.path.join(data_root, "gzip")
        filesname = [
            f"emnist-{split}-train-images-idx3-ubyte.gz",
            f"emnist-{split}-t10k-images-idx3-ubyte.gz",
            f"emnist-{split}-train-labels-idx1-ubyte.gz",
            f"emnist-{split}-t10k-labels-idx1-ubyte.gz"
        ]
        for i in range(4):
            filesname[i] = os.path.join(data_root, filesname[i])
        self.mnist = {}
        if self.is_train:
            # idx3-ubyte layout: 16-byte header, then raw 28x28 uint8 images.
            # Each image is transposed — EMNIST stores images transposed
            # relative to MNIST's orientation.
            with gzip.open(filesname[0], 'rb') as f:
                self.mnist["images"] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28, 28).transpose(0,2,1)
            # idx1-ubyte layout: 8-byte header, then one uint8 label per image.
            with gzip.open(filesname[2], 'rb') as f:
                self.mnist["labels"] = np.frombuffer(f.read(), np.uint8, offset=8)
        else:
            with gzip.open(filesname[1], 'rb') as f:
                self.mnist["images"] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1,28, 28).transpose(0,2,1)
            with gzip.open(filesname[3], 'rb') as f:
                self.mnist["labels"] = np.frombuffer(f.read(), np.uint8, offset=8)
        assert(self.mnist["images"].shape[0] == self.mnist["labels"].shape[0])
        self.total_len = self.mnist["images"].shape[0]
        # this function must be called
        self.set_attrs(total_len = self.total_len)

    def __getitem__(self, index):
        # Convert the single-channel uint8 image to RGB so 3-channel models
        # (e.g. ResNet) can consume it directly.
        img = Image.fromarray(self.mnist['images'][index]).convert('RGB')
        if self.transform:
            img = self.transform(img)
        return trans.to_tensor(img), self.mnist['labels'][index]

    def download_url(self):
        '''
        Download the EMNIST archive and extract it under ``self.data_root``.
        This function will be called from ``__init__`` when download is True.
        '''
        resources = [
            ("https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip", "58c8d27c78d21e728a6bc7b3cc06412e"),
        ]

        for url, md5 in resources:
            filename = "emnist.zip"
            download_url_to_local(url, filename, self.data_root, md5)
            import zipfile
            zf = zipfile.ZipFile(os.path.join(self.data_root, filename))
            try:
                zf.extractall(path=self.data_root)
            except RuntimeError as e:
                # Surface extraction failures instead of swallowing them.
                print(e)
                raise
            zf.close()

View File

@ -1,5 +1,5 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Hao-Yang Peng
# Dun Liang <randonlang@gmail.com>.
@ -33,10 +33,10 @@ class SequentialSampler(Sampler):
self.dataset = dataset
def __iter__(self):
return iter(range(self.dataset.__real_len__()))
return iter(range(self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()))
def __len__(self):
return self.dataset.__real_len__()
return self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()
class RandomSampler(Sampler):
@ -46,21 +46,36 @@ class RandomSampler(Sampler):
self.dataset = dataset
self.rep = replacement
self._num_samples = num_samples
self._shuffle_rng = np.random.default_rng(1)
@property
def num_samples(self):
if self._num_samples is None:
return self.dataset.__real_len__()
return self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()
return self._num_samples
def __len__(self):
return self.num_samples
def __iter__(self):
n = self.dataset.__real_len__()
n = self.dataset.__real_len__() if hasattr(self.dataset,"__real_len__") else self.dataset.__len__()
if self.rep:
return iter(np.random.randint(low=0, high=n, size=(self.num_samples,), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist())
return iter(self._shuffle_rng.integers(low=0, high=n, size=(self.num_samples,), dtype=np.int64).tolist())
return iter(self._shuffle_rng.permutation(n).tolist())
class SkipFirstBatchesSampler(Sampler):
    """Wraps another sampler and skips its first ``num_skip_batches`` indices.

    Iteration yields exactly the sequence the wrapped sampler would have
    produced, minus the leading ``num_skip_batches`` entries — useful for
    resuming a partially consumed epoch.
    """
    def __init__(self, sampler, num_skip_batches):
        # Registering on the dataset is mandatory: the Dataset consults
        # its ``sampler`` attribute when building the index list.
        sampler.dataset.sampler = self
        self.sampler = sampler
        self.num_skip_batches = num_skip_batches

    def __len__(self):
        remaining = len(self.sampler) - self.num_skip_batches
        return remaining

    def __iter__(self):
        all_indices = list(self.sampler)
        return iter(all_indices[self.num_skip_batches:])
class SubsetRandomSampler(Sampler):
@ -78,7 +93,8 @@ class SubsetRandomSampler(Sampler):
dataset.sampler = self
self.dataset = dataset
self.indices = indice
assert indice[0] >= 0 and indice[1] < dataset.__real_len__() and indice[0] < indice[1]
dlen = dataset.__real_len__() if hasattr(dataset,"__real_len__") else dataset.__len__()
assert indice[0] >= 0 and indice[1] < dlen and indice[0] < indice[1]
def __iter__(self):
return (int(i) + self.indices[0] for i in np.random.permutation(self.indices[1] - self.indices[0]))

View File

@ -1,5 +1,5 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Meng-Hao Guo <guomenghao1997@gmail.com>
# Dun Liang <randonlang@gmail.com>.
@ -27,8 +27,7 @@ def collate_batch(batch):
elem = batch[0]
elem_type = type(elem)
if isinstance(elem, jt.Var):
# TODO: use jittor
temp_data = np.stack([data.data for data in batch], 0)
temp_data = jt.stack([data for data in batch], 0)
return temp_data
if elem_type is np.ndarray:
temp_data = np.stack([data for data in batch], 0)

View File

@ -0,0 +1,107 @@
import jittor as jt
from jittor import nn
import numpy as np
# import pylab as pl
# 隐空间向量长度
latent_dim = 100
# 类别数量
n_classes = 10
# 图片大小
img_size = 32
# 图片通道数量
channels = 1
# 图片张量的形状
img_shape = (channels, img_size, img_size)
class Generator(nn.Module):
    """Conditional GAN generator: maps (noise, class label) to an image.

    The label is embedded into an ``n_classes``-dim vector and concatenated
    with the latent noise before an MLP; the output is Tanh-activated and
    reshaped to ``img_shape`` (1 x 32 x 32 per the module-level constants).
    """
    def __init__(self):
        super(Generator, self).__init__()
        # Learnable per-class embedding used to condition the generator.
        self.label_emb = nn.Embedding(n_classes, n_classes)

        def block(in_feat, out_feat, normalize=True):
            # Linear -> (optional BatchNorm) -> LeakyReLU building block.
            layers = [nn.Linear(in_feat, out_feat)]
            if normalize:
                layers.append(nn.BatchNorm1d(out_feat, 0.8))
            layers.append(nn.LeakyReLU(0.2))
            return layers
        self.model = nn.Sequential(
            *block((latent_dim + n_classes), 128, normalize=False),
            *block(128, 256),
            *block(256, 512),
            *block(512, 1024),
            nn.Linear(1024, int(np.prod(img_shape))),
            nn.Tanh())

    def execute(self, noise, labels):
        # Condition on the class by concatenating its embedding with the noise.
        gen_input = jt.concat((self.label_emb(labels), noise), dim=1)
        img = self.model(gen_input)
        # Reshape the flat MLP output back to (batch, C, H, W).
        img = img.view((img.shape[0], *img_shape))
        return img
class Discriminator(nn.Module):
    """Conditional GAN discriminator: scores an (image, class label) pair.

    The flattened image and the label embedding are concatenated and passed
    through a Dropout-regularized MLP; the output is a single validity score
    (one unactivated logit per sample).
    """
    def __init__(self):
        super(Discriminator, self).__init__()
        # Learnable per-class embedding used to condition the discriminator.
        self.label_embedding = nn.Embedding(n_classes, n_classes)

        self.model = nn.Sequential(
            nn.Linear((n_classes + int(np.prod(img_shape))), 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 512),
            nn.Dropout(0.4),
            nn.LeakyReLU(0.2),
            nn.Linear(512, 512),
            nn.Dropout(0.4),
            nn.LeakyReLU(0.2),
            # Single output logit; no final activation here.
            nn.Linear(512, 1))

    def execute(self, img, labels):
        # Flatten the image and append the label embedding along features.
        d_in = jt.concat((img.view((img.shape[0], (- 1))), self.label_embedding(labels)), dim=1)
        validity = self.model(d_in)
        return validity
# 定义模型
generator = Generator()
discriminator = Discriminator()
generator.eval()
discriminator.eval()
# 加载参数
generator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/generator_last.pkl')
discriminator.load('https://cg.cs.tsinghua.edu.cn/jittor/assets/build/discriminator_last.pkl')
def gen_img(number):
    """Generate a horizontal strip image of the digits in ``number``.

    Args:
        number: string (or sequence) of digit characters, one generated
            digit per character.

    Returns:
        uint8 numpy array of shape (H, len(number)*W, 3): the generated
        digits laid out side by side, replicated to 3 channels and min-max
        scaled to [0, 255].
    """
    # Removed: leftover debug print and unreachable code after the return.
    n_row = len(number)
    # One latent vector per requested digit.
    z = jt.array(np.random.normal(0, 1, (n_row, latent_dim))).float32().stop_grad()
    # NOTE(review): labels are cast to float32 before nn.Embedding — confirm
    # jittor's Embedding accepts float indices.
    labels = jt.array(np.array([int(number[num]) for num in range(n_row)])).float32().stop_grad()
    gen_imgs = generator(z, labels)
    # (N, C, H, W) -> (C, H, N, W) -> (H, N*W): digits side by side.
    gen_imgs = gen_imgs.transpose((1,2,0,3)).reshape(gen_imgs.shape[2], -1)
    # Replicate the single channel to RGB.
    gen_imgs = gen_imgs[:,:,None].broadcast(gen_imgs.shape+(3,))
    # Min-max normalize to [0, 255] for display.
    gen_imgs = (gen_imgs - gen_imgs.min()) / (gen_imgs.max() - gen_imgs.min()) * 255
    gen_imgs = gen_imgs.uint8()
    return gen_imgs.numpy()
from PIL import Image
import pywebio as pw
# 定义一串数字
number = "201962517"
# gen_img(number)
Image.fromarray(gen_img(number))
# pl.imshow()
# pl.show()
# print("done")
def web_server():
    # Build a minimal PyWebIO page: one text input plus a button whose
    # callback renders the image produced by gen_img() for the entered
    # digits. (The input label string is user-facing Chinese UI text.)
    pw.pin.put_input("number", label="输入用于生成的数字(由计图框架支持)")
    pw.output.put_buttons(['Gen image'],
        lambda _: pw.output.put_image(Image.fromarray(gen_img(pw.pin.pin.number))))
pw.start_server(web_server, port=8123)

View File

@ -1,5 +1,5 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Guoye Yang <498731903@qq.com>
# Dun Liang <randonlang@gmail.com>.
@ -20,6 +20,8 @@ class DepthwiseConv(Function):
self.dilation = dilation if isinstance(dilation, tuple) else (dilation, dilation)
def execute(self, x, weight):
if not jt.flags.use_cuda or not jt.compiler.is_cuda:
return nn.conv2d(x, weight, None, self.stride, self.padding, self.dilation, x.shape[1])
self.save_vars = x, weight
N,C,H,W = x.shape
o,i,Kh,Kw = weight.shape

View File

@ -1,5 +1,5 @@
# ***************************************************************
# Copyright (c) 2021 Jittor. All Rights Reserved.
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers:
# Haoyang Peng <2247838039@qq.com>
# Dun Liang <randonlang@gmail.com>.
@ -8,9 +8,13 @@
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import math
import os
import numpy as np
import jittor as jt
from jittor import nn
from jittor.nn import binary_cross_entropy_with_logits
from jittor import lgamma, igamma
from jittor.math_util.gamma import gamma_grad, sample_gamma
def simple_presum(x):
src = '''
@ -29,18 +33,7 @@ kernel(in0->num/in0->shape[in0->shape.size()-1], 0, in0_p, out0_p, in0->shape[in
class OneHotCategorical:
def __init__(self, probs=None, logits=None):
assert not (probs is None and logits is None)
if probs is None:
# cannot align to pytorch
probs = jt.sigmoid(logits)
elif logits is None:
logits = jt.log(probs)
with jt.no_grad():
self.probs = probs / probs.sum(-1, True)
self.cum_probs = simple_presum(self.probs)
self.cum_probs_l = self.cum_probs[..., :-1]
self.cum_probs_r = self.cum_probs[..., 1:]
self.logits = logits
Categorical.__init__(self, probs, logits)
def sample(self, sample_shape=[]):
shape = sample_shape + self.probs.shape[:-1] + (1,)
@ -48,17 +41,12 @@ class OneHotCategorical:
one_hot = jt.logical_and(self.cum_probs_l < rand, rand <= self.cum_probs_r).float()
return one_hot
def log_prob(self,x):
if len(x.shape) == 1:
x = x.unsqueeze(0)
logits = self.logits.broadcast(x.shape)
indices = jt.argmax(x, dim=-1)[0]
return logits.gather(1, indices.unsqueeze(-1)).reshape(-1)
def log_prob(self, x):
x = jt.argmax(x, dim=-1)[0]
return Categorical.log_prob(self, x)
def entropy(self):
min_real = -(math.pow(2,23)-1) / math.pow(2,22) * math.pow(2,127)
logits = jt.clamp(self.logits,min_v=min_real)
p_log_p = logits * self.probs
p_log_p = self.logits * self.probs
return -p_log_p.sum(-1)
@ -68,29 +56,32 @@ class Categorical:
if probs is None:
# cannot align to pytorch
probs = jt.sigmoid(logits)
elif logits is None:
logits = jt.log(probs)
probs = probs / probs.sum(-1, True)
if logits is None:
logits = jt.safe_log(probs)
with jt.no_grad():
self.probs = probs / probs.sum(-1, True)
self.probs = probs
self.logits = logits
self.cum_probs = simple_presum(probs)
self.cum_probs = simple_presum(self.probs)
self.cum_probs_l = self.cum_probs[..., :-1]
self.cum_probs_r = self.cum_probs[..., 1:]
def sample(self, sample_shape=[]):
def sample(self, sample_shape=()):
shape = sample_shape + self.probs.shape[:-1] + (1,)
rand = jt.rand(shape)
one_hot = jt.logical_and(self.cum_probs_l < rand, rand <= self.cum_probs_r)
index = one_hot.index(one_hot.ndim-1)
index = one_hot.index(one_hot.ndim - 1)
return (one_hot * index).sum(-1)
def log_prob(self, x):
return jt.log(self.probs)[0,x]
a = self.probs.ndim
b = x.ndim
indexes = tuple( f'i{i}' for i in range(b-a+1, b) )
indexes = indexes + (x,)
return jt.safe_log(self.probs).getitem(indexes)
def entropy(self):
min_real = -(math.pow(2,23)-1) / math.pow(2,22) * math.pow(2,127)
logits = jt.clamp(self.logits,min_v=min_real)
p_log_p = logits * self.probs
p_log_p = self.logits * self.probs
return -p_log_p.sum(-1)
@ -104,11 +95,11 @@ class Normal:
def log_prob(self, x):
var = self.sigma**2
log_scale = jt.log(self.sigma)
log_scale = jt.safe_log(self.sigma)
return -((x-self.mu)**2) / (2*var) - log_scale-np.log(np.sqrt(2*np.pi))
def entropy(self):
return 0.5+0.5*np.log(2*np.pi)+jt.log(self.sigma)
return 0.5+0.5*np.log(2*np.pi)+jt.safe_log(self.sigma)
class Uniform:
@ -123,10 +114,10 @@ class Uniform:
def log_prob(self,x):
if x < self.low or x >= self.high:
return math.inf
return -jt.log(self.high - self.low)
return -jt.safe_log(self.high - self.low)
def entropy(self):
return jt.log(self.high - self.low)
return jt.safe_log(self.high - self.low)
class Geometric:
@ -138,35 +129,62 @@ class Geometric:
self.logits = logits
elif logits is None:
self.prob = p
self.logits = -jt.log(1. / p - 1)
self.logits = -jt.safe_log(1. / p - 1)
def sample(self, sample_shape):
tiny = jt.info(self.probs.dtype).tiny
u = jt.clamp(jt.rand(sample_shape),min_v=tiny)
return (jt.log(u) / (jt.log(-self.probs+1))).floor()
u = jt.rand(sample_shape)
return (jt.safe_log(u) / (jt.safe_log(-self.probs+1))).floor_int()
def log_prob(self, x):
return x*jt.log(-self.prob+1)+jt.log(self.prob)
return x*jt.safe_log(-self.prob+1)+jt.safe_log(self.prob)
def entropy(self):
return binary_cross_entropy_with_logits(jt.array(self.logits),jt.array(self.prob)) / self.prob
class GammaDistribution:
    '''
    Gamma distribution parameterized by ``concentration`` (alpha) and
    ``rate`` (beta).

    For now only support gamma distribution.
    '''
    def __init__(self, concentration, rate):
        self.concentration = concentration
        self.rate = rate
        # Precompute log(Gamma(alpha)) for log_prob.
        self.lgamma_alpha = lgamma.apply(jt.array([concentration,]))

    def sample(self, shape):
        # NOTE(review): sample_gamma is called with concentration only — the
        # rate/scale appears unused here; confirm against sample_gamma's API.
        return sample_gamma(self.concentration, shape)

    def cdf(self, value):
        # Regularized lower incomplete gamma function.
        return igamma(self.concentration, value)

    def log_prob(self, value):
        # log p(x) = a*log(b) + (a-1)*log(x) - b*x - log(Gamma(a))
        return (self.concentration * jt.log(self.rate) +
                (self.concentration - 1) * jt.log(value) -
                self.rate * value - self.lgamma_alpha)

    def mean(self):
        return self.concentration / self.rate

    def mode(self):
        # Mode of a gamma distribution is (alpha-1)/beta for alpha >= 1 and 0
        # otherwise, so clamp at zero. The previous np.minimum(..., 1)
        # wrongly capped the mode at 1.
        return np.maximum((self.concentration - 1) / self.rate, 0)

    def variance(self):
        return self.concentration / (self.rate * self.rate)
def kl_divergence(cur_dist, old_dist):
assert isinstance(cur_dist, type(old_dist))
if isinstance(cur_dist, Normal):
vr = (cur_dist.sigma / old_dist.sigma)**2
t1 = ((cur_dist.mu - old_dist.mu) / old_dist.sigma)**2
return 0.5*(vr+t1-1-jt.log(vr))
return 0.5*(vr+t1-1-jt.safe_log(vr))
if isinstance(cur_dist, Categorical) or isinstance(cur_dist,OneHotCategorical):
t = cur_dist.probs * (cur_dist.logits-old_dist.logits)
t[jt.array((old_dist.probs == 0))] = math.inf
t[jt.array((cur_dist.probs == 0))] = 0
return t.sum(-1)
if isinstance(cur_dist, Uniform):
res = jt.log((old_dist.high - old_dist.low) / (cur_dist.high - cur_dist.low))
res = jt.safe_log((old_dist.high - old_dist.low) / (cur_dist.high - cur_dist.low))
if old_dist.low > cur_dist.low or old_dist.high < cur_dist.high:
res = math.inf
return res
if isinstance(cur_dist, Geometric):
return -cur_dist.entropy() - jt.log(-old_dist.prob+1) / cur_dist.prob - old_dist.logits
return -cur_dist.entropy() - jt.safe_log(-old_dist.prob+1) / cur_dist.prob - old_dist.logits

View File

@ -0,0 +1,8 @@
class EinopsError(RuntimeError):
    """Runtime error raised by einops."""
__all__ = ['rearrange', 'reduce', 'repeat', 'parse_shape', 'asnumpy', 'EinopsError']
from jittor.einops.einops import rearrange, reduce, repeat, parse_shape, asnumpy

View File

@ -0,0 +1,264 @@
"""
Backends in `einops` are organized to meet the following requirements
- backends are not imported unless those are actually needed, because
- backends may not be installed
- importing all available backends will drive to significant memory footprint
- backends may by present but installed with errors (but never used),
importing may drive to crashes
- backend should be either symbolic or imperative (tensorflow is for both, but that causes problems)
- this determines which methods (from_numpy/to_numpy or create_symbol/eval_symbol) should be defined
- if backend can't (temporarily) provide symbols for shape dimensions, UnknownSize objects are used
"""
import sys
import warnings
__author__ = 'Alex Rogozhnikov, RuiYang Liu'
_backends = {}
_debug_importing = False
def get_backend(tensor) -> 'AbstractBackend':
    """
    Takes a correct backend (e.g. numpy backend if tensor is numpy.ndarray) for a tensor.
    If needed, imports package and creates backend
    """
    # Fast path: one of the cached backends already recognizes this tensor type.
    # (The framework-name key is not needed here, so iterate values only.)
    for backend in _backends.values():
        if backend.is_appropriate_type(tensor):
            return backend

    # Slow path: collect all AbstractBackend subclasses recursively, so that
    # subclasses of subclasses are also discovered.
    backend_subclasses = []
    backends = AbstractBackend.__subclasses__()
    while backends:
        backend = backends.pop()
        backends += backend.__subclasses__()
        backend_subclasses.append(backend)

    for BackendSubclass in backend_subclasses:
        if _debug_importing:
            print('Testing for subclass of ', BackendSubclass)
        if BackendSubclass.framework_name not in _backends:
            # check that module was already imported. Otherwise it can't be imported
            if BackendSubclass.framework_name in sys.modules:
                if _debug_importing:
                    print('Imported backend for ', BackendSubclass.framework_name)
                backend = BackendSubclass()
                _backends[backend.framework_name] = backend
                if backend.is_appropriate_type(tensor):
                    return backend

    raise RuntimeError('Tensor type unknown to einops {}'.format(type(tensor)))
class AbstractBackend:
    """ Base backend class, major part of methods are only for debugging purposes. """
    # Name of the framework module this backend wraps (e.g. 'numpy'); None marks the abstract base.
    framework_name = None

    def is_appropriate_type(self, tensor):
        """ helper method should recognize tensors it can handle """
        raise NotImplementedError()

    def from_numpy(self, x):
        raise NotImplementedError("framework doesn't support imperative execution")

    def to_numpy(self, x):
        raise NotImplementedError("framework doesn't support imperative execution")

    def create_symbol(self, shape):
        raise NotImplementedError("framework doesn't support symbolic computations")

    def eval_symbol(self, symbol, input_dict):
        raise NotImplementedError("framework doesn't support symbolic computations")

    def arange(self, start, stop):
        # supplementary method used only in testing, so should implement CPU version
        raise NotImplementedError("framework doesn't implement arange")

    def shape(self, x):
        """shape should return a tuple with integers or "shape symbols" (which will evaluate to actual size)"""
        return x.shape

    def reshape(self, x, shape):
        return x.reshape(shape)

    def transpose(self, x, axes):
        return x.transpose(axes)

    def reduce(self, x, operation, axes):
        # Dispatch to the tensor's own reduction method, e.g. x.sum(axis=axes).
        return getattr(x, operation)(axis=axes)

    def stack_on_zeroth_dimension(self, tensors: list):
        raise NotImplementedError()

    def add_axis(self, x, new_position):
        raise NotImplementedError()

    def add_axes(self, x, n_axes, pos2len):
        # Insert length-1 axes at the requested positions, then grow them to the
        # requested lengths with tile(); untouched axes keep repeat factor 1.
        repeats = [1] * n_axes
        for axis_position, axis_length in pos2len.items():
            x = self.add_axis(x, axis_position)
            repeats[axis_position] = axis_length
        return self.tile(x, tuple(repeats))

    def tile(self, x, repeats):
        """repeats -- a tuple with a repetition factor for each axis of x"""
        raise NotImplementedError()

    def is_float_type(self, x):
        # Decided to drop average for all backends if type is not floating
        raise NotImplementedError()

    def layers(self):
        raise NotImplementedError("backend does not provide layers")

    def __repr__(self):
        return "<einops backend for {}>".format(self.framework_name)

    def einsum(self, pattern, *x):
        raise NotImplementedError("backend does not support einsum")
class UnknownSize:
    """Placeholder for symbolic frameworks that provide no symbols for shape elements."""

    def __floordiv__(self, other):
        # Dividing an unknown size still yields an unknown size.
        return self

    def __eq__(self, other):
        # Actual size is unknown, so it compares equal to anything.
        return True

    def __mul__(self, other):
        return self

    def __rmul__(self, other):
        return self

    def __hash__(self):
        # Keep instances hashable despite the custom __eq__.
        return hash(None)
class NumpyBackend(AbstractBackend):
    """Imperative backend wrapping numpy; conversions to/from numpy are identities."""
    framework_name = 'numpy'

    def __init__(self):
        # Imported lazily so numpy is only required when this backend is used.
        import numpy
        self.np = numpy

    def is_appropriate_type(self, tensor):
        return isinstance(tensor, self.np.ndarray)

    def from_numpy(self, x):
        return x

    def to_numpy(self, x):
        return x

    def arange(self, start, stop):
        return self.np.arange(start, stop)

    def stack_on_zeroth_dimension(self, tensors: list):
        return self.np.stack(tensors)

    def add_axis(self, x, new_position):
        return self.np.expand_dims(x, new_position)

    def tile(self, x, repeats):
        return self.np.tile(x, repeats)

    def is_float_type(self, x):
        # numpy dtypes compare equal to their string names.
        return x.dtype in ('float16', 'float32', 'float64', 'float128', 'bfloat16')

    def einsum(self, pattern, *x):
        return self.np.einsum(pattern, *x)
class HashableTuple:
    """Overcomes non-hashability of symbolic elements"""

    def __init__(self, elements: tuple):
        self.elements = elements

    def __iter__(self):
        # Delegate straight to the wrapped tuple's iterator.
        return iter(self.elements)

    def __len__(self):
        return len(self.elements)

    def __getitem__(self, item):
        return self.elements[item]
class JittorBackend(AbstractBackend):
    """Imperative backend wrapping jittor.Var tensors."""
    framework_name = 'jittor'

    def __init__(self):
        # Imported lazily so jittor is only required when this backend is used.
        import jittor
        self.jittor = jittor

    def is_appropriate_type(self, tensor):
        return isinstance(tensor, self.jittor.Var)

    def from_numpy(self, x):
        variable = self.jittor.array(x)
        return variable

    def to_numpy(self, x):
        return x.detach().numpy()

    def arange(self, start, stop):
        return self.jittor.arange(start, stop, dtype='int64')

    def shape(self, x):
        return tuple(x.shape)

    def reshape(self, x, shape):
        if len(shape) == 0:
            return x  # NOTE(review): empty target shape returns x unchanged — confirm jittor lacks 0-d reshape
        return self.jittor.reshape(x, shape)

    def reduce(self, x, operation, reduced_axes):
        if operation == 'prod':
            # avoid overflow
            return x.prod(reduced_axes)
        # Reduce one axis at a time, highest index first so remaining indices stay valid.
        for axis in sorted(reduced_axes, reverse=True):
            if operation == 'min':
                x = x.min(dim=axis)
            elif operation == 'max':
                x = x.max(dim=axis)
            elif operation in ['sum', 'mean']:
                x = getattr(x, operation)(dim=axis)
            else:
                raise NotImplementedError('Unknown reduction ', operation)
        return x

    def transpose(self, x, axes):
        return x.permute(axes)

    def stack_on_zeroth_dimension(self, tensors: list):
        return self.jittor.stack(tensors)

    def add_axes(self, x, n_axes, pos2len):
        # Overrides the base tile()-based version with a broadcast via expand();
        # -1 keeps the existing length of an axis.
        repeats = [-1] * n_axes
        for axis_position, axis_length in pos2len.items():
            x = self.add_axis(x, axis_position)
            repeats[axis_position] = axis_length
        return x.expand(repeats)

    def tile(self, x, repeats):
        return x.repeat(repeats)

    def add_axis(self, x, new_position):
        return self.jittor.unsqueeze(x, new_position)

    def is_float_type(self, x):
        return x.dtype in ["float16", "bfloat16", "float32", "float64"]

    def layers(self):
        from jittor.einops.layers import jittor
        return jittor

    def einsum(self, pattern, *x):
        return self.jittor.linalg.einsum(pattern, *x)

View File

@ -0,0 +1,782 @@
import functools
import itertools
import string
import typing
from collections import OrderedDict
from typing import Tuple, List, Dict, Union, Callable, Optional, TypeVar
if typing.TYPE_CHECKING:
import numpy as np
from jittor.einops import EinopsError
from jittor.einops._backends import get_backend
from jittor.einops.parsing import ParsedExpression, _ellipsis, AnonymousAxis
Tensor = TypeVar('Tensor')
ReductionCallable = Callable[[Tensor, List[int]], Tensor]
Reduction = Union[str, ReductionCallable]
_reductions = ('min', 'max', 'sum', 'mean', 'prod')
_ellipsis_not_in_parenthesis: List[int] = [-999]
_unknown_axis_length = -999999
def is_ellipsis_not_in_parenthesis(group: List[int]) -> bool:
    """Tell whether *group* is the marker for a bare (non-parenthesized) ellipsis."""
    # The marker is exactly one element holding the sentinel value -999.
    return len(group) == 1 and group[0] == -999
def _product(sequence: List[int]) -> int:
""" minimalistic product that works both with numbers and symbols. Supports empty lists """
result = 1
for element in sequence:
result *= element
return result
def _reduce_axes(tensor, reduction_type: Reduction, reduced_axes: List[int], backend):
    """Apply *reduction_type* over *reduced_axes* of *tensor* through *backend*."""
    reduced_axes = tuple(reduced_axes)
    if callable(reduction_type):
        # custom callable
        return reduction_type(tensor, reduced_axes)
    # one of built-in operations; nothing to reduce means identity
    if not reduced_axes:
        return tensor
    assert reduction_type in _reductions
    if reduction_type == 'mean' and not backend.is_float_type(tensor):
        raise NotImplementedError('reduce_mean is not available for non-floating tensors')
    return backend.reduce(tensor, reduction_type, reduced_axes)
def _optimize_transformation(init_shapes, reduced_axes, axes_reordering, final_shapes):
    # 'collapses' neighboring axes if those participate in the result pattern in the same order
    # TODO add support for added_axes
    assert len(axes_reordering) + len(reduced_axes) == len(init_shapes)

    # joining consecutive axes that will be reduced
    # possibly we can skip this if all backends can optimize this (not sure)
    reduced_axes = tuple(sorted(reduced_axes))
    # Walk right-to-left so indices to the left of a removed axis stay valid.
    for i in range(len(reduced_axes) - 1)[::-1]:
        if reduced_axes[i] + 1 == reduced_axes[i + 1]:
            removed_axis = reduced_axes[i + 1]
            removed_length = init_shapes[removed_axis]
            init_shapes = init_shapes[:removed_axis] + init_shapes[removed_axis + 1:]
            # Fold the removed axis's length into its left neighbor.
            init_shapes[removed_axis - 1] *= removed_length
            reduced_axes = reduced_axes[:i + 1] + tuple(axis - 1 for axis in reduced_axes[i + 2:])

    # removing axes that are moved together during reshape
    def build_mapping():
        # Maps each initial axis to its final position after reordering (None if reduced away).
        init_to_final = {}
        for axis in range(len(init_shapes)):
            if axis in reduced_axes:
                init_to_final[axis] = None
            else:
                after_reduction = sum(x is not None for x in init_to_final.values())
                init_to_final[axis] = list(axes_reordering).index(after_reduction)
        return init_to_final

    init_axis_to_final_axis = build_mapping()

    # Merge adjacent axes that remain adjacent (in the same order) after reordering.
    for init_axis in range(len(init_shapes) - 1)[::-1]:
        if init_axis_to_final_axis[init_axis] is None:
            continue
        if init_axis_to_final_axis[init_axis + 1] is None:
            continue
        if init_axis_to_final_axis[init_axis] + 1 == init_axis_to_final_axis[init_axis + 1]:
            removed_axis = init_axis + 1
            removed_length = init_shapes[removed_axis]
            removed_axis_after_reduction = sum(x not in reduced_axes for x in range(removed_axis))

            reduced_axes = tuple(axis if axis < removed_axis else axis - 1 for axis in reduced_axes)
            init_shapes = init_shapes[:removed_axis] + init_shapes[removed_axis + 1:]
            init_shapes[removed_axis - 1] *= removed_length
            # Renumber the reordering to account for the removed axis.
            old_reordering = axes_reordering
            axes_reordering = []
            for axis in old_reordering:
                if axis == removed_axis_after_reduction:
                    pass
                elif axis < removed_axis_after_reduction:
                    axes_reordering.append(axis)
                else:
                    axes_reordering.append(axis - 1)
            init_axis_to_final_axis = build_mapping()

    return init_shapes, reduced_axes, axes_reordering, final_shapes
CookedRecipe = Tuple[List[int], List[int], List[int], Dict[int, int], List[int]]
class TransformRecipe:
    """
    Recipe describes actual computation pathway.
    Recipe can be applied to a tensor or variable.
    """

    # structure is non-mutable. In future, this can be non-mutable dataclass (python 3.7+)
    def __init__(self,
                 # list of expressions (or just sizes) for elementary axes as they appear in left expression.
                 # this is what (after computing unknown parts) will be a shape after first transposition.
                 # If ellipsis is present, it forms one dimension here (in the right position).
                 elementary_axes_lengths: List[int],
                 # each dimension in input can help to reconstruct length of one elementary axis
                 # or verify one of dimensions. Each element points to element of elementary_axes_lengths
                 input_composite_axes: List[Tuple[List[int], List[int]]],
                 # indices of axes to be squashed
                 reduced_elementary_axes: List[int],
                 # in which order should axes be reshuffled after reduction
                 axes_permutation: List[int],
                 # at which positions which of elementary axes should appear
                 added_axes: Dict[int, int],
                 # ids of axes as they appear in result, again pointers to elementary_axes_lengths,
                 # only used to infer result dimensions
                 output_composite_axes: List[List[int]],
                 # positions of ellipsis in lhs and rhs of expression
                 ellipsis_position_in_lhs: Optional[int] = None,
                 ):
        self.elementary_axes_lengths: List[int] = elementary_axes_lengths
        self.input_composite_axes: List[Tuple[List[int], List[int]]] = input_composite_axes
        self.output_composite_axes: List[List[int]] = output_composite_axes
        self.axes_permutation: List[int] = axes_permutation
        self.added_axes: Dict[int, int] = added_axes
        # This is redundant information, but more convenient to use
        self.reduced_elementary_axes: List[int] = reduced_elementary_axes
        # setting to a large number to avoid handling Nones in reconstruct_from_shape
        self.ellipsis_position_in_lhs: int = ellipsis_position_in_lhs if ellipsis_position_in_lhs is not None else 10000
def _reconstruct_from_shape_uncached(self: TransformRecipe, shape: List[int]) -> CookedRecipe:
    """
    Reconstruct all actual parameters using shape.
    Shape is a tuple that may contain integers, shape symbols (tf, keras, theano) and UnknownSize (keras, mxnet)
    known axes can be integers or symbols, but not Nones.
    """
    axes_lengths: List[int] = list(self.elementary_axes_lengths)
    if self.ellipsis_position_in_lhs != 10000:
        # Ellipsis present: the input may have more dims than composite axes (ellipsis absorbs the extras).
        if len(shape) < len(self.input_composite_axes) - 1:
            raise EinopsError('Expected at least {} dimensions, got {}'.format(
                len(self.input_composite_axes) - 1, len(shape)))
    else:
        if len(shape) != len(self.input_composite_axes):
            raise EinopsError('Expected {} dimensions, got {}'.format(len(self.input_composite_axes), len(shape)))
    ellipsis_shape: List[int] = []
    for input_axis, (known_axes, unknown_axes) in enumerate(self.input_composite_axes):
        before_ellipsis = input_axis
        after_ellipsis = input_axis + len(shape) - len(self.input_composite_axes)
        if input_axis == self.ellipsis_position_in_lhs:
            # Ellipsis: one "unknown axis" holding the product of all absorbed dimensions.
            assert len(known_axes) == 0 and len(unknown_axes) == 1
            unknown_axis, = unknown_axes
            ellipsis_shape = shape[before_ellipsis:after_ellipsis + 1]
            for d in ellipsis_shape:
                if d is None:
                    raise EinopsError("Couldn't infer shape for one or more axes represented by ellipsis")
            total_dim_size: int = _product(ellipsis_shape)
            axes_lengths[unknown_axis] = total_dim_size
        else:
            # Pick the matching input dimension, offset when we're past the ellipsis.
            if input_axis < self.ellipsis_position_in_lhs:
                length = shape[before_ellipsis]
            else:
                length = shape[after_ellipsis]
            known_product = 1
            for axis in known_axes:
                known_product *= axes_lengths[axis]
            if len(unknown_axes) == 0:
                if isinstance(length, int) and isinstance(known_product, int) and length != known_product:
                    raise EinopsError('Shape mismatch, {} != {}'.format(length, known_product))
            # this is enforced when recipe is created
            # elif len(unknown_axes) > 1:
            #     raise EinopsError(
            #         "Lengths of two or more axes in parenthesis not provided (dim={}), can't infer dimensions".
            #             format(known_product)
            #     )
            else:
                # Infer the single unknown axis by dividing out the known ones.
                if isinstance(length, int) and isinstance(known_product, int) and length % known_product != 0:
                    raise EinopsError("Shape mismatch, can't divide axis of length {} in chunks of {}".format(
                        length, known_product))
                unknown_axis: int = unknown_axes[0]
                inferred_length: int = length // known_product
                axes_lengths[unknown_axis] = inferred_length
    # at this point all axes_lengths are computed (either have values or variables, but not Nones)
    # TODO more readable expression
    init_shapes = axes_lengths[:len(axes_lengths) - len(self.added_axes)]
    final_shapes: List[int] = []
    for output_axis, grouping in enumerate(self.output_composite_axes):
        if is_ellipsis_not_in_parenthesis(grouping):
            final_shapes.extend(ellipsis_shape)
        else:
            lengths = [axes_lengths[elementary_axis] for elementary_axis in grouping]
            final_shapes.append(_product(lengths))
    reduced_axes = self.reduced_elementary_axes
    axes_reordering = self.axes_permutation
    added_axes: Dict[int, int] = {
        pos: axes_lengths[pos_in_elementary] for pos, pos_in_elementary in self.added_axes.items()}
    # if optimize:
    #     assert len(self.added_axes) == 0
    #     return _optimize_transformation(init_shapes, reduced_axes, axes_reordering, final_shapes)
    return init_shapes, reduced_axes, axes_reordering, added_axes, final_shapes


# Cached wrapper: a recipe is reused across calls, so shape reconstruction is memoized.
_reconstruct_from_shape = functools.lru_cache(1024)(_reconstruct_from_shape_uncached)
def _apply_recipe(recipe: TransformRecipe, tensor: Tensor, reduction_type: Reduction) -> Tensor:
    """Execute a prepared recipe on a concrete tensor: reshape, reduce, transpose, expand, reshape."""
    # this method works for all backends but not compilable with
    backend = get_backend(tensor)
    cooked = _reconstruct_from_shape(recipe, backend.shape(tensor))
    init_shapes, reduced_axes, axes_reordering, added_axes, final_shapes = cooked

    tensor = backend.reshape(tensor, init_shapes)
    tensor = _reduce_axes(tensor, reduction_type=reduction_type, reduced_axes=reduced_axes, backend=backend)
    tensor = backend.transpose(tensor, axes_reordering)
    if added_axes:
        # New axes (repeat) are inserted and broadcast to their target lengths.
        tensor = backend.add_axes(tensor, n_axes=len(axes_reordering) + len(added_axes), pos2len=added_axes)
    return backend.reshape(tensor, final_shapes)
@functools.lru_cache(256)
def _prepare_transformation_recipe(pattern: str,
                                   operation: Reduction,
                                   axes_lengths: Tuple[Tuple, ...]) -> TransformRecipe:
    """ Perform initial parsing of pattern and provided supplementary info
    axes_lengths is a tuple of tuples (axis_name, axis_length)
    """
    left, rght = pattern.split('->')
    left = ParsedExpression(left)
    rght = ParsedExpression(rght)

    # checking that axes are in agreement - new axes appear only in repeat, while disappear only in reduction
    if not left.has_ellipsis and rght.has_ellipsis:
        raise EinopsError('Ellipsis found in right side, but not left side of a pattern {}'.format(pattern))
    if left.has_ellipsis and left.has_ellipsis_parenthesized:
        raise EinopsError('Ellipsis is parenthesis in the left side is not allowed: {}'.format(pattern))
    if operation == 'rearrange':
        # rearrange: both sides must use exactly the same identifiers.
        difference = set.symmetric_difference(left.identifiers, rght.identifiers)
        if left.has_non_unitary_anonymous_axes or rght.has_non_unitary_anonymous_axes:
            raise EinopsError('Non-unitary anonymous axes are not supported in rearrange (exception is length 1)')
        if len(difference) > 0:
            raise EinopsError('Identifiers only on one side of expression (should be on both): {}'.format(difference))
    elif operation == 'repeat':
        # repeat: the right side may introduce new axes, but must keep every left-side axis.
        difference = set.difference(left.identifiers, rght.identifiers)
        if len(difference) > 0:
            raise EinopsError('Unexpected identifiers on the left side of repeat: {}'.format(difference))
        axes_without_size = set.difference({ax for ax in rght.identifiers if not isinstance(ax, AnonymousAxis)},
                                           {*left.identifiers, *(ax for ax, _ in axes_lengths)})
        if len(axes_without_size) > 0:
            raise EinopsError('Specify sizes for new axes in repeat: {}'.format(axes_without_size))
    elif operation in _reductions or callable(operation):
        # reduce: the right side may only drop axes, never introduce them.
        difference = set.difference(rght.identifiers, left.identifiers)
        if len(difference) > 0:
            raise EinopsError('Unexpected identifiers on the right side of reduce {}: {}'.format(operation, difference))
    else:
        raise EinopsError('Unknown reduction {}. Expect one of {}.'.format(operation, _reductions))

    # parsing all dimensions to find out lengths
    axis_name2known_length = OrderedDict()
    for composite_axis in left.composition:
        for axis_name in composite_axis:
            if isinstance(axis_name, AnonymousAxis):
                axis_name2known_length[axis_name] = axis_name.value
            else:
                axis_name2known_length[axis_name] = _unknown_axis_length

    # axis_ids_after_first_reshape = range(len(axis_name2known_length)) at this point
    repeat_axes_names = []
    for axis_name in rght.identifiers:
        if axis_name not in axis_name2known_length:
            if isinstance(axis_name, AnonymousAxis):
                axis_name2known_length[axis_name] = axis_name.value
            else:
                axis_name2known_length[axis_name] = _unknown_axis_length
            repeat_axes_names.append(axis_name)

    axis_name2position = {name: position for position, name in enumerate(axis_name2known_length)}
    # Axes present on the left but absent on the right are the reduced ones.
    reduced_axes: List[int] = [position for axis, position in axis_name2position.items() if
                               axis not in rght.identifiers]
    reduced_axes: List[int] = list(sorted(reduced_axes))

    # Apply user-supplied axis lengths, validating names against the pattern.
    for elementary_axis, axis_length in axes_lengths:
        if not ParsedExpression.check_axis_name(elementary_axis):
            raise EinopsError('Invalid name for an axis', elementary_axis)
        if elementary_axis not in axis_name2known_length:
            raise EinopsError('Axis {} is not used in transform'.format(elementary_axis))
        axis_name2known_length[elementary_axis] = axis_length

    input_axes_known_unknown = []
    # some of shapes will be inferred later - all information is prepared for faster inference
    for composite_axis in left.composition:
        known = {axis for axis in composite_axis if axis_name2known_length[axis] != _unknown_axis_length}
        unknown = {axis for axis in composite_axis if axis_name2known_length[axis] == _unknown_axis_length}
        if len(unknown) > 1:
            raise EinopsError('Could not infer sizes for {}'.format(unknown))
        assert len(unknown) + len(known) == len(composite_axis)
        input_axes_known_unknown.append(
            ([axis_name2position[axis] for axis in known],
             [axis_name2position[axis] for axis in unknown])
        )

    axis_position_after_reduction = {}
    for axis_name in itertools.chain(*left.composition):
        if axis_name in rght.identifiers:
            axis_position_after_reduction[axis_name] = len(axis_position_after_reduction)

    result_axes_grouping: List[List[int]] = []
    for composite_axis in rght.composition:
        if composite_axis == _ellipsis:
            result_axes_grouping.append(_ellipsis_not_in_parenthesis)
        else:
            result_axes_grouping.append([axis_name2position[axis] for axis in composite_axis])

    ordered_axis_right = list(itertools.chain(*rght.composition))
    axes_permutation = [
        axis_position_after_reduction[axis] for axis in ordered_axis_right if axis in left.identifiers]
    # Axes on the right that do not come from the left are newly added (repeat).
    added_axes = {i: axis_name2position[axis_name] for i, axis_name in enumerate(ordered_axis_right)
                  if axis_name not in left.identifiers}

    ellipsis_left = None if _ellipsis not in left.composition else left.composition.index(_ellipsis)

    return TransformRecipe(
        elementary_axes_lengths=list(axis_name2known_length.values()),
        input_composite_axes=input_axes_known_unknown,
        reduced_elementary_axes=reduced_axes,
        axes_permutation=axes_permutation,
        added_axes=added_axes,
        output_composite_axes=result_axes_grouping,
        ellipsis_position_in_lhs=ellipsis_left,
    )
def reduce(tensor: Tensor, pattern: str, reduction: Reduction, **axes_lengths: int) -> Tensor:
    """
    einops.reduce provides combination of reordering and reduction using reader-friendly notation.
    Examples for reduce operation:
    ```python
    >>> x = np.random.randn(100, 32, 64)
    # perform max-reduction on the first axis
    >>> y = reduce(x, 't b c -> b c', 'max')
    # same as previous, but with clearer axes meaning
    >>> y = reduce(x, 'time batch channel -> batch channel', 'max')
    >>> x = np.random.randn(10, 20, 30, 40)
    # 2d max-pooling with kernel size = 2 * 2 for image processing
    >>> y1 = reduce(x, 'b c (h1 h2) (w1 w2) -> b c h1 w1', 'max', h2=2, w2=2)
    # if one wants to go back to the original height and width, depth-to-space trick can be applied
    >>> y2 = rearrange(y1, 'b (c h2 w2) h1 w1 -> b c (h1 h2) (w1 w2)', h2=2, w2=2)
    >>> assert parse_shape(x, 'b _ h w') == parse_shape(y2, 'b _ h w')
    # Adaptive 2d max-pooling to 3 * 4 grid
    >>> reduce(x, 'b c (h1 h2) (w1 w2) -> b c h1 w1', 'max', h1=3, w1=4).shape
    (10, 20, 3, 4)
    # Global average pooling
    >>> reduce(x, 'b c h w -> b c', 'mean').shape
    (10, 20)
    # Subtracting mean over batch for each channel
    >>> y = x - reduce(x, 'b c h w -> () c () ()', 'mean')
    # Subtracting per-image mean for each channel
    >>> y = x - reduce(x, 'b c h w -> b c () ()', 'mean')
    ```
    Parameters:
        tensor: tensor: tensor of any supported library (e.g. numpy.ndarray, jittor.Var).
            list of tensors is also accepted, those should be of the same type and shape
        pattern: string, reduction pattern
        reduction: one of available reductions ('min', 'max', 'sum', 'mean', 'prod'), case-sensitive
            alternatively, a callable f(tensor, reduced_axes) -> tensor can be provided.
        axes_lengths: any additional specifications for dimensions
    Returns:
        tensor of the same type as input
    """
    try:
        # Sorted tuple form makes axes_lengths hashable so the recipe builder's lru_cache can key on it.
        hashable_axes_lengths = tuple(sorted(axes_lengths.items()))
        recipe = _prepare_transformation_recipe(pattern, reduction, axes_lengths=hashable_axes_lengths)
        return _apply_recipe(recipe, tensor, reduction_type=reduction)
    except EinopsError as e:
        # Re-raise with context (pattern, input shape, supplied axis lengths) for easier debugging.
        message = ' Error while processing {}-reduction pattern "{}".'.format(reduction, pattern)
        if not isinstance(tensor, list):
            message += '\n Input tensor shape: {}. '.format(get_backend(tensor).shape(tensor))
        else:
            message += '\n Input is list. '
        message += 'Additional info: {}.'.format(axes_lengths)
        raise EinopsError(message + '\n {}'.format(e))
def rearrange(tensor: Union[Tensor, List[Tensor]], pattern: str, **axes_lengths) -> Tensor:
    """
    einops.rearrange is a reader-friendly smart element reordering for multidimensional tensors.
    This operation includes functionality of transpose (axes permutation), reshape (view), squeeze, unsqueeze,
    stack, concatenate and other operations.
    Examples for rearrange operation:
    ```python
    # suppose we have a set of 32 images in "h w c" format (height-width-channel)
    >>> images = [np.random.randn(30, 40, 3) for _ in range(32)]
    # stack along first (batch) axis, output is a single array
    >>> rearrange(images, 'b h w c -> b h w c').shape
    (32, 30, 40, 3)
    # concatenate images along height (vertical axis), 960 = 32 * 30
    >>> rearrange(images, 'b h w c -> (b h) w c').shape
    (960, 40, 3)
    # concatenated images along horizontal axis, 1280 = 32 * 40
    >>> rearrange(images, 'b h w c -> h (b w) c').shape
    (30, 1280, 3)
    # reordered axes to "b c h w" format for deep learning
    >>> rearrange(images, 'b h w c -> b c h w').shape
    (32, 3, 30, 40)
    # flattened each image into a vector, 3600 = 30 * 40 * 3
    >>> rearrange(images, 'b h w c -> b (c h w)').shape
    (32, 3600)
    # split each image into 4 smaller (top-left, top-right, bottom-left, bottom-right), 128 = 32 * 2 * 2
    >>> rearrange(images, 'b (h1 h) (w1 w) c -> (b h1 w1) h w c', h1=2, w1=2).shape
    (128, 15, 20, 3)
    # space-to-depth operation
    >>> rearrange(images, 'b (h h1) (w w1) c -> b h w (c h1 w1)', h1=2, w1=2).shape
    (32, 15, 20, 12)
    ```
    When composing axes, C-order enumeration used (consecutive elements have different last axis)
    Find more examples in einops tutorial.
    Parameters:
        tensor: tensor of any supported library (e.g. numpy.ndarray, jittor.Var).
            list of tensors is also accepted, those should be of the same type and shape
        pattern: string, rearrangement pattern
        axes_lengths: any additional specifications for dimensions
    Returns:
        tensor of the same type as input. If possible, a view to the original tensor is returned.
    """
    if isinstance(tensor, list):
        if len(tensor) == 0:
            raise TypeError("Rearrange can't be applied to an empty list")
        # A list of tensors is first stacked along a new zeroth (batch) axis.
        tensor = get_backend(tensor[0]).stack_on_zeroth_dimension(tensor)
    # rearrange is a reduction with the special non-reducing 'rearrange' operation.
    return reduce(tensor, pattern, reduction='rearrange', **axes_lengths)
def repeat(tensor: Tensor, pattern: str, **axes_lengths) -> Tensor:
    """
    einops.repeat allows reordering elements and repeating them in arbitrary combinations.
    This operation includes functionality of repeat, tile, broadcast functions.
    Examples for repeat operation:
    ```python
    # a grayscale image (of shape height x width)
    >>> image = np.random.randn(30, 40)
    # change it to RGB format by repeating in each channel
    >>> repeat(image, 'h w -> h w c', c=3).shape
    (30, 40, 3)
    # repeat image 2 times along height (vertical axis)
    >>> repeat(image, 'h w -> (repeat h) w', repeat=2).shape
    (60, 40)
    # repeat image 2 time along height and 3 times along width
    >>> repeat(image, 'h w -> (h2 h) (w3 w)', h2=2, w3=3).shape
    (60, 120)
    # convert each pixel to a small square 2x2. Upsample image by 2x
    >>> repeat(image, 'h w -> (h h2) (w w2)', h2=2, w2=2).shape
    (60, 80)
    # pixelate image first by downsampling by 2x, then upsampling
    >>> downsampled = reduce(image, '(h h2) (w w2) -> h w', 'mean', h2=2, w2=2)
    >>> repeat(downsampled, 'h w -> (h h2) (w w2)', h2=2, w2=2).shape
    (30, 40)
    ```
    When composing axes, C-order enumeration used (consecutive elements have different last axis)
    Find more examples in einops tutorial.
    Parameters:
        tensor: tensor of any supported library (e.g. numpy.ndarray, jittor.Var).
            list of tensors is also accepted, those should be of the same type and shape
        pattern: string, rearrangement pattern
        axes_lengths: any additional specifications for dimensions
    Returns:
        Tensor of the same type as input. If possible, a view to the original tensor is returned.
    """
    # repeat is a reduction with the special axis-adding 'repeat' operation.
    return reduce(tensor, pattern, reduction='repeat', **axes_lengths)
def parse_shape(x, pattern: str) -> dict:
    """
    Parse a tensor shape to dictionary mapping axes names to their lengths.
    ```python
    # Use underscore to skip the dimension in parsing.
    >>> x = np.zeros([2, 3, 5, 7])
    >>> parse_shape(x, 'batch _ h w')
    {'batch': 2, 'h': 5, 'w': 7}
    # `parse_shape` output can be used to specify axes_lengths for other operations:
    >>> y = np.zeros([700])
    >>> rearrange(y, '(b c h w) -> b c h w', **parse_shape(x, 'b _ h w')).shape
    (2, 10, 5, 7)
    ```
    For symbolic frameworks may return symbols, not integers.
    Parameters:
        x: tensor of any of supported frameworks
        pattern: str, space separated names for axes, underscore means skip axis
    Returns:
        dict, maps axes names to their lengths
    """
    exp = ParsedExpression(pattern, allow_underscore=True)
    shape = get_backend(x).shape(x)
    if exp.has_composed_axes():
        raise RuntimeError("Can't parse shape with composite axes: {pattern} {shape}".format(
            pattern=pattern, shape=shape))
    n_dims, n_axes = len(shape), len(exp.composition)
    if n_dims != n_axes:
        # Without an ellipsis the dimension counts must match exactly;
        # an ellipsis allows any number >= n_axes - 1.
        if not exp.has_ellipsis:
            raise RuntimeError("Can't parse shape with different number of dimensions: {pattern} {shape}".format(
                pattern=pattern, shape=shape))
        if n_dims < n_axes - 1:
            raise RuntimeError("Can't parse shape with this number of dimensions: {pattern} {shape}".format(
                pattern=pattern, shape=shape))
    composition = exp.composition
    if exp.has_ellipsis:
        # Replace the ellipsis with as many skipped ('_') axes as it spans.
        ellipsis_idx = composition.index(_ellipsis)
        composition = (composition[:ellipsis_idx]
                       + ['_'] * (n_dims - n_axes + 1)
                       + composition[ellipsis_idx + 1:])
    return {axis_name: axis_length
            for (axis_name,), axis_length in zip(composition, shape)
            if axis_name != '_'}
# this one is probably not needed in the public API
def _enumerate_directions(x):
    """
    For an n-dimensional tensor, returns tensors to enumerate each axis.
    ```python
    x = np.zeros([2, 3, 4]) # or any other tensor
    i, j, k = _enumerate_directions(x)
    result = i + 2*j + 3*k
    ```
    `result[i, j, k] = i + 2j + 3k`, and also has the same shape as result
    Works very similarly to numpy.ogrid (open indexing grid)
    """
    backend = get_backend(x)
    shape = backend.shape(x)
    result = []
    for axis_id, axis_length in enumerate(shape):
        # Use a distinct name instead of rebinding `shape` (which is still being
        # iterated by enumerate above): length 1 everywhere except the enumerated
        # axis, so the returned tensors broadcast against each other.
        axis_shape = [1] * len(shape)
        axis_shape[axis_id] = axis_length
        result.append(backend.reshape(backend.arange(0, axis_length), axis_shape))
    return result
def asnumpy(tensor) -> 'numpy.ndarray':
    """
    Convert a tensor of an imperative framework (i.e. numpy/jittor.) to `numpy.ndarray`
    Parameters:
        tensor: tensor of any of known imperative framework
    Returns:
        `numpy.ndarray`, converted to numpy
    """
    backend = get_backend(tensor)
    return backend.to_numpy(tensor)
def _validate_einsum_axis_name(axis_name):
    """Validate one composite axis group from a parsed einsum pattern.

    Raises NotImplementedError for unsupported-but-valid einops constructs
    (singleton axes, composed axes, anonymous axes) and RuntimeError for
    malformed axis names.
    """
    if len(axis_name) == 0:
        raise NotImplementedError("Singleton () axes are not yet supported in einsum.")
    if len(axis_name) > 1:
        raise NotImplementedError("Shape rearrangement is not yet supported in einsum.")

    axis_name = axis_name[0]

    if isinstance(axis_name, AnonymousAxis):
        raise NotImplementedError("Anonymous axes are not yet supported in einsum.")
    # Check the type BEFORE calling len() on the element: on a non-sized,
    # non-string value len() would raise TypeError instead of the intended
    # RuntimeError below.
    if not isinstance(axis_name, str):
        raise RuntimeError("Axis name in einsum must be a string.")
    if len(axis_name) == 0:
        raise RuntimeError("Encountered empty axis name in einsum.")
@functools.lru_cache(256)
def _compactify_pattern_for_einsum(pattern: str) -> str:
    """Translate an einops-style einsum pattern into a compact single-letter
    pattern for the backend, e.g. 'batch chan, chan -> batch' -> 'ab,b->a'."""
    if "->" not in pattern:
        # numpy allows this, so make sure users
        # don't accidentally do something like this.
        raise ValueError("Einsum pattern must contain '->'.")
    lefts, right = pattern.split('->')
    lefts = lefts.split(',')
    lefts = [
        ParsedExpression(left, allow_underscore=True, allow_duplicates=True)
        for left in lefts
    ]
    right = ParsedExpression(right, allow_underscore=True)

    # Pool of output letters: 'a'-'z' then 'A'-'Z' (string.ascii_letters order).
    output_axis_names = string.ascii_letters
    i = 0
    axis_name_mapping = {}

    left_patterns = []
    for left in lefts:
        left_pattern = ""
        for raw_axis_name in left.composition:
            if raw_axis_name == _ellipsis:
                left_pattern += '...'
                continue

            _validate_einsum_axis_name(raw_axis_name)
            axis_name = raw_axis_name[0]
            # First occurrence of an axis name claims the next free letter.
            if axis_name not in axis_name_mapping:
                if i >= len(output_axis_names):
                    raise RuntimeError("Too many axes in einsum.")
                axis_name_mapping[axis_name] = output_axis_names[i]
                i += 1

            left_pattern += axis_name_mapping[axis_name]
        left_patterns.append(left_pattern)

    compact_pattern = ",".join(left_patterns) + "->"

    for raw_axis_name in right.composition:
        if raw_axis_name == _ellipsis:
            compact_pattern += '...'
            continue

        _validate_einsum_axis_name(raw_axis_name)
        axis_name = raw_axis_name[0]
        # The right-hand side may only reference axes already seen on the left.
        if axis_name not in axis_name_mapping:
            raise EinopsError(f"Unknown axis {axis_name} on right side of einsum {pattern}.")

        compact_pattern += axis_name_mapping[axis_name]

    return compact_pattern
# Typed overloads for the variadic einsum() defined below:
# one to four tensors, always followed by the pattern string as the last argument.
@typing.overload
def einsum(tensor: Tensor, pattern: str) -> Tensor: ...
@typing.overload
def einsum(tensor1: Tensor, tensor2: Tensor, pattern: str) -> Tensor: ...
@typing.overload
def einsum(tensor1: Tensor, tensor2: Tensor, tensor3: Tensor, pattern: str) -> Tensor: ...
@typing.overload
def einsum(tensor1: Tensor, tensor2: Tensor, tensor3: Tensor, tensor4: Tensor, pattern: str) -> Tensor: ...
def einsum(*tensors_and_pattern: List[Union[Tensor, str]]) -> Tensor:
"""
einops.einsum calls einsum operations with einops-style named
axes indexing, computing tensor products with an arbitrary
number of tensors. Unlike typical einsum syntax, here you must
pass tensors first, and then the pattern.
Also, note that rearrange operations such as `"(batch chan) out"`,
or singleton axes `()`, are not currently supported.
Examples:
For a given pattern such as:
```python
>>> x, y, z = np.random.randn(3, 20, 20, 20)
>>> output = einsum(x, y, z, "a b c, c b d, a g k -> a b k")
```
the following formula is computed:
```tex
output[a, b, k] =
\sum_{c, d, g} x[a, b, c] * y[c, b, d] * z[a, g, k]
```
where the summation over `c`, `d`, and `g` is performed
because those axes names do not appear on the right-hand side.
Let's see some additional examples:
```python
# Filter a set of images:
>>> batched_images = np.random.randn(128, 16, 16)
>>> filters = np.random.randn(16, 16, 30)
>>> result = einsum(batched_images, filters,
... "batch h w, h w channel -> batch channel")
>>> result.shape
(128, 30)
# Matrix multiplication, with an unknown input shape:
>>> batch_shape = (50, 30)
>>> data = np.random.randn(*batch_shape, 20)
>>> weights = np.random.randn(10, 20)
>>> result = einsum(weights, data,
... "out_dim in_dim, ... in_dim -> ... out_dim")
>>> result.shape
(50, 30, 10)
# Matrix trace on a single tensor:
>>> matrix = np.random.randn(10, 10)
>>> result = einsum(matrix, "i i ->")
>>> result.shape
()
```
Parameters:
tensors: tensors of any supported library (numpy, jittor).
pattern: string, einsum pattern, with commas
separating specifications for each tensor.
Returns:
Tensor of the same type as input, after processing with einsum.
"""
if len(tensors_and_pattern) <= 1:
raise ValueError(
"`einops.einsum` takes at minimum two arguments: the tensors (at least one),"
" followed by the pattern."
)
pattern = tensors_and_pattern[-1]
if not isinstance(pattern, str):
raise ValueError(
"The last argument passed to `einops.einsum` must be a string,"
" representing the einsum pattern."
)
tensors = tensors_and_pattern[:-1]
pattern = _compactify_pattern_for_einsum(pattern)
return get_backend(tensors[0]).einsum(pattern, *tensors)

View File

@ -0,0 +1,393 @@
"""
Indexing one array with the other(s).
Concept for discussion.
Notation targets hard cases, not simple ones, like indexing of 1d-array with another 1d-array
(notation supports that, but you can't simplify arr[ind], and there is no reason to)
Examples
1. query for every token in sequence a token in the image. Images and sequences are paired
einindex('b t c <- b h w c, [h, w] b t', arr_bhwc, [h_indices_bt, w_indices_bt])
this is equivalent, so you can pass indexers independently or together
einindex('b t c <- b h w c, [h, w] b t', arr_bhwc, np.asarray([h_indices_bt, w_indices_bt]))
after some thinking I decided that having first axis for indexing variable is not too restrictive,
but should simplify mapping of such cases.
For this reason [...] part should always go first in indexer.
This makes the largest difference with einindex https://github.com/malmaud/einindex,
which has almost identical grammar, but puts special dimension last, while we put it first.
This trick allows naturally decomposing multiindex into individual dimensions or vice versa.
2. query for every token in the video the most suitable word in a (matching) sentence
einindex('b t h w <- seq b, [seq] t b h w', arr_tbc, [t_indices_bhw])
note, that only one indexer is used, but still it has to be enclosed in the list.
That's a price for being generic. Alternatively leading singleton dimension can be added.
3. (not supported now, future planning)
for every timeframe in a video, find the token with the highest norm (across h and w), and compose a new stack of them
indices_2bt = argmax(x_bthwc.norm(dim=-1), 'b t h w -> [h, w] b t')
selected_embeddings_btc = einindex('b t c <- b t h w c, [h, w] b t', x_bthwc, indices_2bt)
while currently question is around 'how do we index',
it is important to pre-align that with a question 'what are natural ways to get indices'.
Most common are min/max. less common options: topk (works here), random sampling.
Some important properties of this notation:
- support for multiple indexers, including using a single tensor to keep multiple indexers
- 'batch' indexing, when some axes of indexer and array should be matched
- universal (one-indexing-to-rule-them-all)
- extensible for (named) ellipses, including variadic number of indexers
- extensible for einops-style compositions and decompositions
- extensible for outer indexing when indexers are not aligned
Current implementation based on python array api and uses loops,
because no appropriate indexing available in the standard.
"""
from typing import List, Union, TypeVar, Tuple
from jittor.einops import EinopsError
T = TypeVar('T')
class CompositionDecomposition:
def __init__(
self,
decomposed_shape: List[str],
composed_shape: List[List[str]],
):
flat_shape = []
for x in composed_shape:
flat_shape.extend(x)
self.compose_transposition: Tuple[int] = tuple([decomposed_shape.index(x) for x in flat_shape])
self.decompose_transposition: Tuple[int] = tuple([flat_shape.index(x) for x in decomposed_shape])
self.composed_shape = composed_shape
self.decomposed_shape = decomposed_shape
def decompose(self, x, known_axes_lengths: dict[str, int]):
xp = x.__array_namespace__()
shape = x.shape
flat_shape = []
for i, axis_group in enumerate(self.composed_shape):
unknown_axis_name = None
known_sizes_prod = 1
for axis_name in axis_group:
if axis_name in known_axes_lengths:
known_sizes_prod *= known_axes_lengths[axis_name]
else:
if unknown_axis_name is None:
unknown_axis_name = axis_name
else:
raise EinopsError("Can't infer the size")
if unknown_axis_name is None:
assert shape[i] == known_sizes_prod
else:
known_axes_lengths[unknown_axis_name] = shape[i] // known_sizes_prod
for axis in axis_group:
flat_shape.append(known_axes_lengths[axis])
x = xp.reshape(x, flat_shape)
return xp.permute_dims(x, self.decompose_transposition)
def compose(self, x, known_axes_lengths: dict[str, int]):
xp = x.__array_namespace__()
for axis_len, axis_name in zip(x.shape, self.decomposed_shape):
if axis_name in known_axes_lengths:
assert known_axes_lengths[axis_name] == axis_len
else:
known_axes_lengths[axis_name] = axis_len
x = xp.permute_dims(x, self.compose_transposition)
new_shape = []
for axis_group in self.composed_shape:
composed_axis_size = 1
for axis_name in axis_group:
composed_axis_size *= known_axes_lengths[axis_name]
new_shape.append(composed_axis_size)
return xp.reshape(x, tuple(new_shape))
def arange_at_position(xp, n_axes, axis, axis_len, device=None):
x = xp.arange(axis_len, dtype=xp.int64, device=device)
shape = [1] * n_axes
shape[axis] = axis_len
x = xp.reshape(x, shape)
return x
class IndexingFormula:
    """Parses an einindex pattern and evaluates it on (array, indexers) pairs."""

    def __init__(self, pattern: str):
        """
        :param pattern: example 'b t c <- b hsel wsel c, [hsel, wsel] b t'
        """
        self.pattern = pattern
        # left of '<-' is the result pattern; right is '<array pattern>, <indexer pattern>'
        left, right = pattern.split('<-')
        arg_split = right.index(',')
        arr_pattern, ind_pattern = right[:arg_split], right[arg_split + 1:]
        ind_pattern = ind_pattern.strip()
        assert ind_pattern.startswith('['), 'composition axis should go first in indexer (second argument) [h w] i j k'
        # '[h, w] b t' -> composition 'h, w' and remaining indexer axes 'b t'
        composition_start = ind_pattern.index('[')
        composition_end = ind_pattern.index(']')
        composition = ind_pattern[composition_start + 1: composition_end]
        ind_other_axes = ind_pattern[composition_end + 1:]
        self.result_axes_names = left.split()
        self.array_axes_names = arr_pattern.split()
        self.indexing_axes_names = [x.strip() for x in composition.split(',')]
        self.indexer_other_axes_names = ind_other_axes.split()
        # no axis may appear twice within any one group
        for group_name, group in [
            ('result', self.result_axes_names),
            ('array', self.array_axes_names),
            ('indexer', self.indexing_axes_names + self.indexer_other_axes_names),
        ]:
            if len(set(group)) != len(group):
                # need more verbosity, which axis, raise
                raise EinopsError(f'{group_name} pattern ({group}) contains a duplicated axis')
        axis_groups = [
            self.result_axes_names,
            self.array_axes_names,
            self.indexing_axes_names,
            self.indexer_other_axes_names,
        ]
        all_axes = set()
        for group in axis_groups:
            all_axes.update(group)
        # classify every axis by the combination of groups it appears in:
        self.indexer_axes = []           # array axes that are addressed by indexer values
        self.batch_axes = []             # shared by result, array and indexer: matched pairwise
        self.result_and_index_axes = []  # indexer axes that survive into the result
        self.result_and_array_axes = []  # array axes passed through to the result untouched
        for axis in all_axes:
            presence = tuple(axis in g for g in axis_groups)
            # want match-case here. sweet dreams
            if presence == (False, True, True, False):
                self.indexer_axes.append(axis)
            elif presence[2]:
                # any other combination involving the indexing group is invalid
                raise EinopsError(f'Wrong usage of indexer variable {axis}')
            elif presence == (True, True, False, True):
                self.batch_axes.append(axis)
            elif presence == (True, False, False, True):
                self.result_and_index_axes.append(axis)
            elif presence == (True, True, False, False):
                self.result_and_array_axes.append(axis)
            else:
                # TODO better categorization of wrong usage patterns
                raise EinopsError(f'{axis} is used incorrectly in {pattern}')
        assert set(self.indexer_axes) == set(self.indexing_axes_names)
        # order of these variables matters, since we can't lose mapping here
        self.indexer_axes = self.indexing_axes_names
        # array -> 2d: [batch + indexed axes, pass-through axes]
        self.array_composition = CompositionDecomposition(
            decomposed_shape=self.array_axes_names,
            composed_shape=[self.batch_axes + self.indexer_axes, self.result_and_array_axes],
        )
        # indexer -> 1d flat index over [batch + result axes]
        self.index_composition = CompositionDecomposition(
            decomposed_shape=self.indexer_other_axes_names,
            # single axis after composition
            composed_shape=[self.batch_axes + self.result_and_index_axes],
        )
        # result <- 2d: [batch + result axes, pass-through axes]
        self.result_composition = CompositionDecomposition(
            decomposed_shape=self.result_axes_names,
            composed_shape=[self.batch_axes + self.result_and_index_axes, self.result_and_array_axes],
        )

    def apply_to_array_api(self, arr: T, ind: Union[T, List[T]]):
        """Evaluate the formula for an array-API compliant `arr` and indexer(s)
        `ind` (a list of arrays, or one array whose first axis stacks the indexers)."""
        known_axes_sizes: dict[str, int] = {}
        xp = arr.__array_namespace__()
        if not isinstance(ind, list):
            # single stacked indexer tensor: split along its leading axis
            ind = [ind[i, ...] for i in range(ind.shape[0])]
        for indexer in ind:
            assert len(indexer.shape) == len(self.indexer_other_axes_names)
        # step 1. transpose, reshapes of arr; learn its dimensions
        arr_2d = self.array_composition.compose(arr, known_axes_sizes)
        # step 2. compute shifts and create an actual indexing array
        # (row-major linearization of the composed [batch + indexed] axes)
        shift = 1
        full_index = xp.zeros([1] * len(ind[0].shape), dtype=xp.int64, device=arr.device)
        # original order: [*batch-like axes, *indexing_axes,]
        # now we need to traverse them in the opposite direction
        for axis_name, indexer in list(zip(self.indexing_axes_names, ind))[::-1]:
            # `% size` guards against out-of-range indexer values
            full_index = full_index + shift * (indexer % known_axes_sizes[axis_name])
            shift *= known_axes_sizes[axis_name]
        for axis_name in self.batch_axes[::-1]:
            axis_id = self.indexer_other_axes_names.index(axis_name)
            full_index = full_index + arange_at_position(
                xp, len(self.indexer_other_axes_names), axis=axis_id, axis_len=known_axes_sizes[axis_name],
                device=arr.device,
            ) * shift
            shift *= known_axes_sizes[axis_name]
        # sanity: shifts must cover exactly the first dim of the 2d array
        assert shift == arr_2d.shape[0]
        # step 3. Flatten index
        full_index = self.index_composition.compose(full_index, known_axes_sizes)
        # step 4. indexing
        # python array api lacks any integer indexing, so... I use loops.
        # did you know that there is conceptual programming ... just like art?
        # result_2d = arr_2d[full_index]
        result_2d = xp.stack([arr_2d[full_index[i], :] for i in range(full_index.shape[0])])
        # step 5. doing resulting
        result = self.result_composition.decompose(result_2d, known_axes_sizes)
        return result
def einindex(pattern: str, arr: T, /, ind: Union[T, List[T]]):
    """
    Demonstrates how einindex should work.
    Supports data-api compliant arrays.
    """
    # parse the pattern once, then evaluate it on the inputs
    return IndexingFormula(pattern).apply_to_array_api(arr, ind)
def test_composition_and_decomposition():
    """Round-trip and shape checks for CompositionDecomposition."""
    # numpy.array_api was provisional and removed in NumPy 2.0; the main
    # namespace is array-API compliant there, so fall back to it.
    try:
        import numpy.array_api as np
    except ImportError:
        import numpy as np
    x = np.arange(2 * 3 * 5 * 7)
    x = np.reshape(x, (2, 3, 5, 7))
    comp = CompositionDecomposition(
        decomposed_shape=['a', 'b', 'c', 'd'],
        composed_shape=[['a', 'b'], ['c', 'd']],
    )
    assert comp.compose(x, known_axes_lengths={}).shape == (2 * 3, 5 * 7)
    # an empty group composes to a singleton dimension
    y = CompositionDecomposition(
        decomposed_shape=['a', 'b', 'c', 'd'],
        composed_shape=[['a', 'b'], [], ['c', 'd']],
    ).compose(x, {})
    assert y.shape == (2 * 3, 1, 5 * 7)
    assert np.all(np.reshape(x, (-1,)) == np.reshape(y, (-1,)))
    # compose followed by decompose must be the identity
    comp = CompositionDecomposition(
        decomposed_shape=['a', 'b', 'e', 'c', 'd'],
        composed_shape=[['e', 'c'], ['b'], ['a', 'd']],
    )
    x = np.arange(2 * 3 * 5 * 7 * 3)
    x = np.reshape(x, (2, 3, 5, 7, 3))
    axes = {}
    y = comp.compose(x, axes)
    x2 = comp.decompose(y, axes)
    assert np.all(x == x2)
def test_simple_indexing():
    """einindex on a 2d array with a single 1d indexer."""
    # numpy.array_api was provisional and removed in NumPy 2.0; the main
    # namespace is array-API compliant there, so fall back to it.
    try:
        import numpy.array_api as np
    except ImportError:
        import numpy as np
    # simple 2d test
    arr = np.reshape(np.arange(5 * 7), (5, 7))
    ind = np.arange(7) % 5
    x = einindex('j <- i j, [i] j', arr, [ind])
    for j, i in enumerate(ind):
        assert arr[i, j] == x[j]
    # same selection with the array axes transposed
    y = einindex('j <- j i, [i] j', np.permute_dims(arr, (1, 0)), [ind])
    for j, i in enumerate(ind):
        assert arr[i, j] == y[j]
def test_multidimensional_indexing():
    """einindex with two indexers (list and stacked-tensor forms) vs a manual loop."""
    # numpy.array_api was provisional and removed in NumPy 2.0; the main
    # namespace is array-API compliant there, so fall back to it.
    try:
        import numpy.array_api as np
    except ImportError:
        import numpy as np
    # encode each coordinate in a separate decimal digit for easy checking
    embedding_bhwc = (
        + arange_at_position(np, 4, 0, 2) * 1000
        + arange_at_position(np, 4, 1, 3) * 100
        + arange_at_position(np, 4, 2, 5) * 10
        + arange_at_position(np, 4, 3, 7) * 1
    )
    hindices_bt = np.reshape(np.arange(6), (2, 3)) % 3
    windices_bt = np.reshape(np.arange(6), (2, 3)) % 5
    # imagine that you have pairs of image <> sentence
    # your goal is to get most suitable token from image for every token in sentence
    # thus for every token in sentence you compute best k and v
    result = einindex('c t b <- b h w c, [h, w] b t', embedding_bhwc, [hindices_bt, windices_bt])
    # example of using a single array for indexing multiple axes
    hw_indices_bt = np.stack([hindices_bt, windices_bt])
    result2 = einindex('c t b <- b h w c, [h, w] b t', embedding_bhwc, hw_indices_bt)
    assert np.all(result == result2)
    # check vs manual element computation
    result_manual = result * 0
    for b in range(2):
        for t in range(3):
            for c in range(7):
                h = hindices_bt[b, t]
                w = windices_bt[b, t]
                result_manual[c, t, b] = embedding_bhwc[b, h, w, c]
    assert np.all(result == result_manual)
def test_reverse_indexing():
    """einindex where the indexed axis sits between batch-matched axes."""
    # numpy.array_api was provisional and removed in NumPy 2.0; the main
    # namespace is array-API compliant there, so fall back to it.
    try:
        import numpy.array_api as np
    except ImportError:
        import numpy as np
    C, T, B = 2, 3, 5
    # G = GPU, batch-like variable
    G = 4
    H = 7
    W = 9
    # encode each coordinate in a separate decimal digit for easy checking
    arr_gtbc = (
        + arange_at_position(np, 4, 0, G) * 1000
        + arange_at_position(np, 4, 1, T) * 100
        + arange_at_position(np, 4, 2, B) * 10
        + arange_at_position(np, 4, 3, C) * 1
    )
    t_indices_gbhw = np.reshape(np.arange(G * B * H * W), (G, B, H, W)) % T
    result = einindex('g b c h w <- g t b c, [t] g b h w', arr_gtbc, [t_indices_gbhw])
    # check vs manual element computation
    result_manual = result * 0
    for g in range(G):
        for b in range(B):
            for c in range(C):
                for h in range(H):
                    for w in range(W):
                        t = t_indices_gbhw[g, b, h, w]
                        result_manual[g, b, c, h, w] = arr_gtbc[g, t, b, c]
    assert np.all(result == result_manual)

View File

@ -0,0 +1,79 @@
__author__ = 'Alex Rogozhnikov'
import functools
from jittor.einops.einops import _apply_recipe
from jittor.einops.einops import TransformRecipe, _prepare_transformation_recipe
from jittor.einops import EinopsError
class RearrangeMixin:
    """
    Rearrange layer behaves identically to einops.rearrange operation.

    :param pattern: str, rearrangement pattern
    :param axes_lengths: any additional specification of dimensions

    See einops.rearrange for source_examples.
    """

    def __init__(self, pattern, **axes_lengths):
        super().__init__()
        self.pattern = pattern
        self.axes_lengths = axes_lengths
        self._recipe = self.recipe()  # checking parameters

    def __repr__(self):
        params = repr(self.pattern)
        for axis, length in self.axes_lengths.items():
            params += ', {}={}'.format(axis, length)
        return '{}({})'.format(self.__class__.__name__, params)

    @functools.lru_cache(maxsize=1024)
    def recipe(self) -> TransformRecipe:
        """Build (and cache) the transformation recipe for this layer's pattern.

        NOTE: lru_cache on an instance method keys on `self` and keeps the layer
        alive for the cache's lifetime; tolerable for long-lived layers, but do
        not copy this pattern elsewhere.
        """
        try:
            hashable_lengths = tuple(sorted(self.axes_lengths.items()))
            return _prepare_transformation_recipe(self.pattern, operation='rearrange', axes_lengths=hashable_lengths)
        except EinopsError as e:
            # chain the original error so the root cause stays in the traceback
            raise EinopsError(' Error while preparing {!r}\n {}'.format(self, e)) from e

    def _apply_recipe(self, x):
        # forward pass: apply the pre-built recipe to the input tensor
        return _apply_recipe(self._recipe, x, reduction_type='rearrange')
class ReduceMixin:
    """
    Reduce layer behaves identically to einops.reduce operation.

    :param pattern: str, rearrangement pattern
    :param reduction: one of available reductions ('min', 'max', 'sum', 'mean', 'prod'), case-sensitive
    :param axes_lengths: any additional specification of dimensions

    See einops.reduce for source_examples.
    """

    def __init__(self, pattern, reduction, **axes_lengths):
        super().__init__()
        self.pattern = pattern
        self.reduction = reduction
        self.axes_lengths = axes_lengths
        self._recipe = self.recipe()  # checking parameters

    def __repr__(self):
        params = '{!r}, {!r}'.format(self.pattern, self.reduction)
        for axis, length in self.axes_lengths.items():
            params += ', {}={}'.format(axis, length)
        return '{}({})'.format(self.__class__.__name__, params)

    @functools.lru_cache(maxsize=1024)
    def recipe(self) -> TransformRecipe:
        """Build (and cache) the transformation recipe for this layer's pattern.

        NOTE: lru_cache on an instance method keys on `self` and keeps the layer
        alive for the cache's lifetime; tolerable for long-lived layers, but do
        not copy this pattern elsewhere.
        """
        try:
            hashable_lengths = tuple(sorted(self.axes_lengths.items()))
            return _prepare_transformation_recipe(
                self.pattern, operation=self.reduction, axes_lengths=hashable_lengths)
        except EinopsError as e:
            # chain the original error so the root cause stays in the traceback
            raise EinopsError(' Error while preparing {!r}\n {}'.format(self, e)) from e

    def _apply_recipe(self, x):
        # forward pass: apply the pre-built recipe with this layer's reduction
        return _apply_recipe(self._recipe, x, reduction_type=self.reduction)

View File

@ -0,0 +1,176 @@
from typing import Optional, Dict
from jittor.einops import EinopsError
from jittor.einops.parsing import ParsedExpression
import warnings
import string
from jittor.einops.einops import _product
def _report_axes(axes: set, report_message: str):
if len(axes) > 0:
raise EinopsError(report_message.format(axes))
class _EinmixMixin:
    def __init__(self, pattern, weight_shape, bias_shape=None, **axes_lengths):
        """
        EinMix - Einstein summation with automated tensor management and axis packing/unpacking.
        EinMix is an advanced tool, helpful tutorial:
        https://github.com/arogozhnikov/einops/blob/master/docs/3-einmix-layer.ipynb
        Imagine taking einsum with two arguments, one of each input, and one - tensor with weights
        >>> einsum('time batch channel_in, channel_in channel_out -> time batch channel_out', input, weight)
        This layer manages weights for you, syntax highlights separate role of weight matrix
        >>> EinMix('time batch channel_in -> time batch channel_out', weight_shape='channel_in channel_out')
        But otherwise it is the same einsum under the hood.
        Simple linear layer with bias term (you have one like that in your framework)
        >>> EinMix('t b cin -> t b cout', weight_shape='cin cout', bias_shape='cout', cin=10, cout=20)
        There is restriction to mix the last axis. Let's mix along height
        >>> EinMix('h w c-> hout w c', weight_shape='h hout', bias_shape='hout', h=32, hout=32)
        Channel-wise multiplication (like one used in normalizations)
        >>> EinMix('t b c -> t b c', weight_shape='c', c=128)
        Separate dense layer within each head, no connection between different heads
        >>> EinMix('t b (head cin) -> t b (head cout)', weight_shape='head cin cout', ...)
        ... ah yes, you need to specify all dimensions of weight shape/bias shape in parameters.
        Use cases:
        - when channel dimension is not last, use EinMix, not transposition
        - patch/segment embeddings
        - when need only within-group connections to reduce number of weights and computations
        - perfect as a part of sequential models
        - next-gen MLPs (follow tutorial to learn more)
        Uniform He initialization is applied to weight tensor and encounters for number of elements mixed.
        Parameters
        :param pattern: transformation pattern, left side - dimensions of input, right side - dimensions of output
        :param weight_shape: axes of weight. A tensor of this shape is created, stored, and optimized in a layer
        :param bias_shape: axes of bias added to output. Weights of this shape are created and stored. If `None` (the default), no bias is added.
        :param axes_lengths: dimensions of weight tensor
        """
        super().__init__()
        self.pattern = pattern
        self.weight_shape = weight_shape
        self.bias_shape = bias_shape
        self.axes_lengths = axes_lengths
        self.initialize_einmix(pattern=pattern, weight_shape=weight_shape, bias_shape=bias_shape, axes_lengths=axes_lengths)

    def initialize_einmix(self, pattern, weight_shape, bias_shape, axes_lengths):
        """Validate the pattern, build optional pre/post Rearrange layers,
        allocate weight/bias, and precompute the letter-based einsum pattern."""
        left_pattern, right_pattern = pattern.split('->')
        left = ParsedExpression(left_pattern)
        right = ParsedExpression(right_pattern)
        weight = ParsedExpression(weight_shape)
        # every output axis must come from the input or the weight
        _report_axes(
            set.difference(right.identifiers, {*left.identifiers, *weight.identifiers}),
            'Unrecognized identifiers on the right side of EinMix {}'
        )
        if left.has_ellipsis or right.has_ellipsis or weight.has_ellipsis:
            raise EinopsError('Ellipsis is not supported in EinMix (right now)')
        if any(x.has_non_unitary_anonymous_axes for x in [left, right, weight]):
            raise EinopsError('Anonymous axes (numbers) are not allowed in EinMix')
        if '(' in weight_shape or ')' in weight_shape:
            raise EinopsError(f'Parenthesis is not allowed in weight shape: {weight_shape}')
        pre_reshape_pattern = None
        pre_reshape_lengths = None
        post_reshape_pattern = None
        # if the input has composed axes, flatten them with a Rearrange before the einsum
        if any(len(group) != 1 for group in left.composition):
            names = []
            for group in left.composition:
                names += group
            composition = ' '.join(names)
            pre_reshape_pattern = f'{left_pattern}->{composition}'
            pre_reshape_lengths = {name: length for name, length in axes_lengths.items() if name in names}
        # if the output has composed axes, regroup them with a Rearrange after the einsum
        if any(len(group) != 1 for group in right.composition):
            names = []
            for group in right.composition:
                names += group
            composition = ' '.join(names)
            post_reshape_pattern = f'{composition}->{right_pattern}'
        self._create_rearrange_layers(pre_reshape_pattern, pre_reshape_lengths, post_reshape_pattern, {})
        for axis in weight.identifiers:
            if axis not in axes_lengths:
                raise EinopsError('Dimension {} of weight should be specified'.format(axis))
        _report_axes(
            set.difference(set(axes_lengths), {*left.identifiers, *weight.identifiers}),
            'Axes {} are not used in pattern',
        )
        _report_axes(
            set.difference(weight.identifiers, {*left.identifiers, *right.identifiers}),
            'Weight axes {} are redundant'
        )
        if len(weight.identifiers) == 0:
            warnings.warn('EinMix: weight has no dimensions (means multiplication by a number)')
        # `for axis,` unpacks the single-element groups of a parenthesis-free expression
        _weight_shape = [axes_lengths[axis] for axis, in weight.composition]
        # single output element is a combination of fan_in input elements
        _fan_in = _product([axes_lengths[axis] for axis, in weight.composition if axis not in right.identifiers])
        if bias_shape is not None:
            if not isinstance(bias_shape, str):
                raise EinopsError('bias shape should be string specifying which axes bias depends on')
            bias = ParsedExpression(bias_shape)
            _report_axes(
                set.difference(bias.identifiers, right.identifiers),
                'Bias axes {} not present in output'
            )
            _report_axes(
                set.difference(bias.identifiers, set(axes_lengths)),
                'Sizes not provided for bias axes {}',
            )
            # bias broadcasts over output axes it does not depend on (size-1 dims)
            _bias_shape = []
            for axes in right.composition:
                for axis in axes:
                    if axis in bias.identifiers:
                        _bias_shape.append(axes_lengths[axis])
                    else:
                        _bias_shape.append(1)
        else:
            _bias_shape = None
        # uniform He-style bounds: weight in +-sqrt(3/fan_in), bias in +-sqrt(1/fan_in)
        weight_bound = (3 / _fan_in) ** 0.5
        bias_bound = (1 / _fan_in) ** 0.5
        self._create_parameters(_weight_shape, weight_bound, _bias_shape, bias_bound)
        # rewrite einsum expression with single-letter latin identifiers so that
        # expression will be understood by any framework
        mapping2letters = {*left.identifiers, *right.identifiers, *weight.identifiers}
        mapping2letters = {k: letter for letter, k in zip(string.ascii_lowercase, mapping2letters)}

        def write_flat(axes: list):
            # translate a list of named axes to its single-letter spelling
            return ''.join(mapping2letters[axis] for axis in axes)

        self.einsum_pattern: str = '{},{}->{}'.format(
            write_flat(left.flat_axes_order()),
            write_flat(weight.flat_axes_order()),
            write_flat(right.flat_axes_order()),
        )

    def _create_rearrange_layers(self,
                                 pre_reshape_pattern: Optional[str],
                                 pre_reshape_lengths: Optional[Dict],
                                 post_reshape_pattern: Optional[str],
                                 post_reshape_lengths: Optional[Dict]):
        # framework-specific subclasses build their Rearrange layers here
        raise NotImplementedError('Should be defined in framework implementations')

    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
        """ Shape and implementations """
        # framework-specific subclasses allocate and initialize weight/bias here
        raise NotImplementedError('Should be defined in framework implementations')

    def __repr__(self):
        params = repr(self.pattern)
        params += f", '{self.weight_shape}'"
        if self.bias_shape is not None:
            params += f", '{self.bias_shape}'"
        for axis, length in self.axes_lengths.items():
            params += ', {}={}'.format(axis, length)
        return '{}({})'.format(self.__class__.__name__, params)

View File

@ -0,0 +1,55 @@
from typing import Optional, Dict
import jittor as jt
from jittor import nn
import numpy as np
from jittor.einops.layers import RearrangeMixin, ReduceMixin
from jittor.einops.layers._einmix import _EinmixMixin
__author__ = 'Ruiyang Liu'
class Rearrange(RearrangeMixin, jt.nn.Module):
    """jittor layer for einops.rearrange; pattern validation happens in RearrangeMixin.__init__."""
    def execute(self, input):
        # jittor modules implement the forward pass as `execute`
        return self._apply_recipe(input)
class Reduce(ReduceMixin, jt.nn.Module):
    """jittor layer for einops.reduce; pattern/reduction validation happens in ReduceMixin.__init__."""
    def execute(self, input):
        # jittor modules implement the forward pass as `execute`
        return self._apply_recipe(input)
class EinMix(_EinmixMixin, jt.nn.Module):
    """jittor implementation of EinMix: einsum with managed weight/bias tensors."""

    def _create_parameters(self, weight_shape, weight_bound, bias_shape, bias_bound):
        # weight and (optional) bias are drawn uniformly from [-bound, bound]
        self.weight = jt.zeros(weight_shape)
        nn.init.uniform_(self.weight, low=-weight_bound, high=weight_bound)
        if bias_shape is None:
            self.bias = None
        else:
            self.bias = jt.zeros(bias_shape)
            nn.init.uniform_(self.bias, low=-bias_bound, high=bias_bound)

    def _create_rearrange_layers(self,
                                 pre_reshape_pattern: Optional[str],
                                 pre_reshape_lengths: Optional[Dict],
                                 post_reshape_pattern: Optional[str],
                                 post_reshape_lengths: Optional[Dict],
                                 ):
        # only build the rearranges actually needed for composed axes
        self.pre_rearrange = (
            Rearrange(pre_reshape_pattern, **pre_reshape_lengths)
            if pre_reshape_pattern is not None else None
        )
        self.post_rearrange = (
            Rearrange(post_reshape_pattern, **post_reshape_lengths)
            if post_reshape_pattern is not None else None
        )

    def execute(self, input):
        # optional flatten -> einsum with weight -> optional bias -> optional regroup
        x = input
        if self.pre_rearrange is not None:
            x = self.pre_rearrange(x)
        result = jt.linalg.einsum(self.einsum_pattern, x, self.weight)
        if self.bias is not None:
            result += self.bias
        if self.post_rearrange is not None:
            result = self.post_rearrange(result)
        return result

View File

@ -0,0 +1,147 @@
from jittor.einops import EinopsError
import keyword
import warnings
from typing import List, Optional, Set, Tuple
_ellipsis: str = '' # NB, this is a single unicode symbol. String is used as it is not a list, but can be iterated
class AnonymousAxis(object):
    """Axis of a fixed numeric length with no name.

    Important: instances are deliberately never equal to each other
    (default identity-based equality), so each occurrence is distinct.
    """

    def __init__(self, value: str):
        self.value = int(value)
        if self.value > 1:
            return
        # guard clauses for degenerate lengths
        if self.value == 1:
            raise EinopsError('No need to create anonymous axis of length 1. Report this as an issue')
        raise EinopsError('Anonymous axis should have positive length, not {}'.format(self.value))

    def __repr__(self):
        return "{}-axis".format(str(self.value))
class ParsedExpression:
    """
    non-mutable structure that contains information about one side of expression (e.g. 'b c (h w)')
    and keeps some information important for downstream
    """

    def __init__(self, expression, *, allow_underscore: bool = False, allow_duplicates: bool = False):
        """Parse one side of an einops expression into identifiers and composition.

        :param expression: e.g. 'b c (h w)'; may contain one '...' ellipsis
        :param allow_underscore: permit the '_' placeholder axis (possibly repeated)
        :param allow_duplicates: permit the same axis name to appear twice
        """
        self.has_ellipsis: bool = False
        self.has_ellipsis_parenthesized: Optional[bool] = None
        self.identifiers: Set[str] = set()
        # that's axes like 2, 3, 4 or 5. Axes with size 1 are exceptional and replaced with empty composition
        self.has_non_unitary_anonymous_axes: bool = False
        # composition keeps structure of composite axes, see how different corner cases are handled in tests
        self.composition = []
        if '.' in expression:
            if '...' not in expression:
                raise EinopsError('Expression may contain dots only inside ellipsis (...)')
            if str.count(expression, '...') != 1 or str.count(expression, '.') != 3:
                raise EinopsError(
                    'Expression may contain dots only inside ellipsis (...); only one ellipsis for tensor ')
            # replace the three dots with a single-character marker for uniform parsing
            expression = expression.replace('...', _ellipsis)
            self.has_ellipsis = True
        bracket_group = None

        def add_axis_name(x):
            # flush the identifier collected so far into the current composition group
            if x is not None:
                if x in self.identifiers:
                    if not (allow_underscore and x == "_") and not allow_duplicates:
                        raise EinopsError('Indexing expression contains duplicate dimension "{}"'.format(x))
                if x == _ellipsis:
                    self.identifiers.add(_ellipsis)
                    if bracket_group is None:
                        self.composition.append(_ellipsis)
                        self.has_ellipsis_parenthesized = False
                    else:
                        bracket_group.append(_ellipsis)
                        self.has_ellipsis_parenthesized = True
                else:
                    is_number = str.isdecimal(x)
                    if is_number and int(x) == 1:
                        # handling the case of anonymous axis of length 1
                        if bracket_group is None:
                            self.composition.append([])
                        else:
                            pass  # no need to think about 1s inside parenthesis
                        return
                    is_axis_name, reason = self.check_axis_name_return_reason(x, allow_underscore=allow_underscore)
                    if not (is_number or is_axis_name):
                        raise EinopsError('Invalid axis identifier: {}\n{}'.format(x, reason))
                    if is_number:
                        x = AnonymousAxis(x)
                    self.identifiers.add(x)
                    if is_number:
                        self.has_non_unitary_anonymous_axes = True
                    if bracket_group is None:
                        self.composition.append([x])
                    else:
                        bracket_group.append(x)

        current_identifier = None
        for char in expression:
            if char in '() ':
                # space or bracket terminates the identifier being collected
                add_axis_name(current_identifier)
                current_identifier = None
                if char == '(':
                    if bracket_group is not None:
                        raise EinopsError("Axis composition is one-level (brackets inside brackets not allowed)")
                    bracket_group = []
                elif char == ')':
                    if bracket_group is None:
                        raise EinopsError('Brackets are not balanced')
                    self.composition.append(bracket_group)
                    bracket_group = None
            elif str.isalnum(char) or char in ['_', _ellipsis]:
                if current_identifier is None:
                    current_identifier = char
                else:
                    current_identifier += char
            else:
                raise EinopsError("Unknown character '{}'".format(char))
        if bracket_group is not None:
            raise EinopsError('Imbalanced parentheses in expression: "{}"'.format(expression))
        # flush the trailing identifier (expression may not end with a delimiter)
        add_axis_name(current_identifier)

    def flat_axes_order(self) -> List:
        """Return all axes in left-to-right order, with composition flattened."""
        result = []
        for composed_axis in self.composition:
            assert isinstance(composed_axis, list), 'does not work with ellipsis'
            for axis in composed_axis:
                result.append(axis)
        return result

    def has_composed_axes(self) -> bool:
        """True if any group contains more than one axis."""
        # this will ignore 1 inside brackets
        for axes in self.composition:
            if isinstance(axes, list) and len(axes) > 1:
                return True
        return False

    @staticmethod
    def check_axis_name_return_reason(name: str, allow_underscore: bool = False) -> Tuple[bool, str]:
        """Return (is_valid, reason); reason is '' when the name is valid."""
        if not str.isidentifier(name):
            return False, 'not a valid python identifier'
        elif name[0] == '_' or name[-1] == '_':
            if name == '_' and allow_underscore:
                return True, ''
            # fixed duplicated word in the message ("should should")
            return False, 'axis name should not start or end with underscore'
        else:
            if keyword.iskeyword(name):
                warnings.warn("It is discouraged to use axes names that are keywords: {}".format(name), RuntimeWarning)
            if name in ['axis']:
                warnings.warn("It is discouraged to use 'axis' as an axis name "
                              "and will raise an error in future", FutureWarning)
            return True, ''

    @staticmethod
    def check_axis_name(name: str) -> bool:
        """
        Valid axes names are python identifiers except keywords,
        and additionally should not start or end with underscore
        """
        is_valid, _reason = ParsedExpression.check_axis_name_return_reason(name)
        return is_valid

View File

@ -1 +0,0 @@
../../extern

696
python/jittor/extern/acl/acl_compiler.py vendored Normal file
View File

@ -0,0 +1,696 @@
# ***************************************************************
# Copyright (c) 2023 Jittor. All Rights Reserved.
# Maintainers: Dun Liang <randonlang@gmail.com>.
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
# ***************************************************************
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def _ntuple(n):
def parse(x):
if isinstance(x, Iterable):
return x
return tuple([x] * n)
return parse
_pair = _ntuple(2)
# Module-level backend state, published to jittor.compiler below.
has_acl = 0  # set to 1 by install() once the ACL toolchain loads successfully
cc_flags = ""  # extra compiler/linker flags accumulated by install()
# Path to the Ascend C compiler; the 'tikcc_path' env var wins, otherwise
# look for 'ccec' on PATH. Empty/None means the backend is unavailable.
tikcc_path = env_or_try_find('tikcc_path', 'ccec')
# Resolve symbols eagerly and export them globally so the ACL shared
# libraries can see each other's symbols.
dlopen_flags = os.RTLD_NOW | os.RTLD_GLOBAL
compiler.has_acl = has_acl
# export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/tools/aoe/lib64:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/opskernel:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/plugin/nnengine:/usr/local/Ascend/ascend-toolkit/latest/runtime/lib64:/usr/local/Ascend/ascend-toolkit/latest/compiler/lib64/stub:/usr/local/Ascend/ascend-toolkit/latest/tools/tikicpulib/lib/Ascend910A:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/simulator/Ascend910A/lib:/opt/AXESMI/lib64:/usr/local/Ascend/driver/lib64/driver/
# export PYTHONPATH=/home/cjld/new_jittor/jittor/python
# export tikcc_path=g++
# conda activate cann
# source /usr/local/Ascend/ascend-toolkit/set_env.sh
# export PYTHONPATH=/home/cjld/new_jittor/jittor/python:/home/cjld/new_jittor/jittor/my/jtorch/python:$PYTHONPATH
# export TASK_QUEUE_ENABLE=0
# python3 -m jittor.test.test_acl -k array
# jittor: conda activate cann && source /usr/local/Ascend/ascend-toolkit/set_env.sh && PYTHONPATH=/home/cjld/new_jittor/jittor/python:/home/cjld/new_jittor/jittor/my/jtorch/python:$PYTHONPATH && cd /home/cjld/new_jittor/jittor/my/mm_benchmark
# python3 -m jittor.test.test_acl -k test_sum
# export ASCEND_SLOG_PRINT_TO_STDOUT=0
# ASCEND_GLOBAL_LOG_LEVEL
# export DUMP_GE_GRAPH=1
# export DUMP_GRAPH_LEVEL=1
# build pytorch-npu
# bash ./ci/build.sh
# python3 -m pip install ./dist/torch_npu-1.11.0.post1-cp37-cp37m-linux_x86_64.whl --force-reinstall
# pytorch: conda activate cann && source /usr/local/Ascend/ascend-toolkit/set_env.sh && export TASK_QUEUE_ENABLE=0 && cd /home/cjld/new_jittor/jittor/my/mm_benchmark
# python3 ./mm_bench_pt_npu.py
def install():
    """Compile the ACL backend sources and load them into jittor.

    Collects the .cc sources shipped next to this file, builds them against
    the Ascend toolkit headers/libraries, registers the resulting module via
    jittor_utils, and sets the module-level ``has_acl`` flag on success.
    Any compilation/loading error propagates to the caller (``check``
    catches it and disables the backend).
    """
    import jittor.compiler as compiler
    global has_acl, cc_flags
    acl_compiler_home = os.path.dirname(__file__)
    cc_files = sorted(glob.glob(acl_compiler_home + "/**/*.cc",
                                recursive=True))
    cc_files2 = []
    for name in cc_files:
        # Skip files in hccl directory
        if "hccl" in name:
            continue
        # if "acl_op_exec" in name or "_op_acl.cc" in name:
        # These sources are compiled into jittor's core instead of the
        # extension module built below.
        if "acl_op_exec" in name or "_op_acl.cc" in name or "utils.cc" in name:
            compiler.extra_core_files.append(name)
        else:
            cc_files2.append(name)
    cc_files = cc_files2
    # Root of the Ascend CANN toolkit (set by set_env.sh).
    ascend_toolkit_home = os.getenv('ASCEND_TOOLKIT_HOME')
    #print(ascend_toolkit_home)
    #print(acl_compiler_home)
    cc_flags += f" -MD -DHAS_CUDA -DIS_ACL \
    -I{ascend_toolkit_home}/include/ \
    -I{ascend_toolkit_home}/include/acl/ \
    -I{ascend_toolkit_home}/include/aclnn/ \
    -I{ascend_toolkit_home}/include/aclnnop/ \
    -I{acl_compiler_home} -lascendcl -lacl_op_compiler \
    -I{acl_compiler_home}/aclnn \
    -I{acl_compiler_home}/aclops \
    -L{ascend_toolkit_home}/lib64/"
    # NOTE(review): '-llib*' asks the linker for 'liblib*.so' — looks
    # suspicious, but left as-is; confirm against the installed libraries.
    cc_flags += " -llibascendcl "
    cc_flags += " -llibnnopbase "
    cc_flags += " -llibopapi "
    #pdb.set_trace()
    # Preload the ACL runtime globally so later dlopens resolve its symbols.
    ctypes.CDLL("libascendcl.so", dlopen_flags)
    # NOTE(review): the f-string below is an unused expression (dead code),
    # apparently kept as a reference of previously-used flags.
    f'''
    -ltikc_runtime
    -I/usr/local/Ascend/driver/include/ \
    -L{ascend_toolkit_home}/compiler/lib64/ \
    -L{ascend_toolkit_home}/runtime/lib64/ \
    '''
    jittor_utils.LOG.i("ACL detected")
    global mod
    # Build the pyjt-bridged module exposing process()/init_acl_ops().
    mod = jittor_utils.compile_module(
        '''
#include "common.h"
namespace jittor {
// @pyjt(process)
string process_acl(const string& src, const string& name, const map<string,string>& kargs);
// @pyjt(init_acl_ops)
void init_acl_ops();
}''', compiler.cc_flags + " " + " ".join(cc_files) + cc_flags)
    # Let the ACL source processor rewrite jittor's generated sources.
    jittor_utils.process_jittor_source("acl", mod.process)
    has_acl = 1
    # ACL replaces the MKL path and fakes the CUDA library layer.
    os.environ["use_mkl"] = "0"
    compiler.setup_fake_cuda_lib = True
def install_extern():
    """The ACL backend ships no extern packages; always report False."""
    return False
def check():
    """Detect and initialize the ACL backend.

    Attempts ``install()`` when a tikcc/ccec compiler was found; failures are
    logged and disable the backend instead of raising. Publishes the outcome
    to ``jittor.compiler`` and returns True iff ACL is usable.
    """
    import jittor.compiler as compiler
    global has_acl, cc_flags
    if tikcc_path:
        try:
            install()
        except Exception as e:
            jittor_utils.LOG.w(f"load ACL failed, exception: {e}")
            has_acl = 0
    # Publish the result to the main compiler module either way.
    compiler.has_acl = has_acl
    compiler.tikcc_path = tikcc_path
    if not has_acl: return False
    compiler.cc_flags += cc_flags
    # tikcc doubles as this backend's "nvcc"; -std=c++14 is stripped,
    # presumably because tikcc rejects it — TODO confirm.
    compiler.nvcc_path = tikcc_path
    compiler.nvcc_flags = compiler.cc_flags.replace("-std=c++14", "")
    return True
def post_process():
    """Apply runtime tweaks after the ACL backend has been loaded."""
    if has_acl:
        from jittor import pool
        # Use the dedicated ACL pooling op instead of the generic code-op.
        pool.pool_use_code_op = False
        import jittor as jt
        jt.flags.use_cuda_host_allocator = 1
        # The parallel op compiler is disabled for this backend.
        jt.flags.use_parallel_op_compiler = 0
        jt.flags.amp_reg |= 32 + 4  # 32 keep float16, 4 keep reduce type
        # Register the ACL operator implementations compiled in install().
        mod.init_acl_ops()
def change_function():
    """Monkey-patch jittor's public API with ACL (Ascend) implementations.

    Every replacement goes through ``warp`` (defined below), which selects
    the ACL version only when ``jt.flags.use_acl`` is set, so the patched
    API keeps working on non-ACL devices. The patch-up assignments at the
    bottom are order-sensitive (some capture the original attribute into a
    ``fake_*`` local before overwriting it).
    """
    import jittor as jt
    from jittor import Function
    # ACL operator implementations (jittor Function subclasses).
    from .aclops.flashattention_op import FlashAttentionACL
    from .aclops.conv_op import ConvACL
    from .aclops.pool_op import PoolACL
    from .aclops.nantonum_op import NanToNumACL
    from .aclops.stack_op import StackACL
    from .aclops.rope_op import RopeACL
    from .aclops.softmax_op import SoftmaxACL
    from .aclops.sigmoid_op import SigmoidACL
    from .aclops.silu_op import SiLUACL
    from .aclops.dropout_op import DropoutACL
    from .aclops.relu_op import LeakyReLUACL
    from .aclops.flip_op import FlipACL
    from .aclops.concat_op import ConcatACL
    from .aclops.gather_scatter_op import GatherACL
    from .aclops.cumsum_op import CumsumACL
    from .aclops.index_op import IndexACL
    from .aclops.gather_scatter_op import ScatterACL
    from .aclops.where_op import WhereACL
    from .aclops.where_op import NonzeroACL
    from .aclops.floor_op import FloorIntACL
    from .aclops.getitem_op import GetItemACL
    from .aclops.setitem_op import SetItemACL
    from .aclops.bmm_op import BmmACL
    from .aclops.matmul_op import MatmulACL
    from .aclops.transpose_op import TransPoseACL
    from .aclops.triu_op import TriuACL

    def triu_acl(x, diagonal=0):
        return TriuACL()(x, diagonal)

    from .aclops.conv_op import ConvACL

    def conv_acl(x,
                 weight,
                 bias=None,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1):
        return ConvACL()(x, weight, bias, stride, padding, dilation, groups)

    class Conv2D(jt.nn.Module):
        # Drop-in replacement for jt.nn.Conv2d that executes via ConvACL.
        def __init__(self,
                     in_channels,
                     out_channels,
                     kernel_size,
                     stride=1,
                     padding=0,
                     dilation=1,
                     groups=1,
                     bias=True):
            # Validate every hyper-parameter up front with explicit errors.
            if in_channels <= 0:
                raise ValueError(
                    f"in_channels must be greater than zero, got {in_channels}"
                )
            if out_channels <= 0:
                raise ValueError(
                    f"out_channels must be greater than zero, got {out_channels}"
                )
            if groups <= 0:
                raise ValueError(
                    f"groups must must be greater than zero, got {groups}")
            assert in_channels % groups == 0, 'in_channels must be divisible by groups'
            assert out_channels % groups == 0, 'out_channels must be divisible by groups'
            if isinstance(kernel_size, tuple):
                for size in kernel_size:
                    if size <= 0:
                        raise ValueError(
                            f"kernel_size must be greater than zero, got {kernel_size}"
                        )
            else:
                if kernel_size <= 0:
                    raise ValueError(
                        f"kernel_size must be greater than zero, got {kernel_size}"
                    )
            if isinstance(stride, tuple):
                for size in stride:
                    if size <= 0:
                        raise ValueError(
                            f"stride must be greater than zero, got {stride}")
            else:
                if stride <= 0:
                    raise ValueError(
                        f"stride must be greater than zero, got {stride}")
            if isinstance(padding, tuple):
                for size in padding:
                    if size < 0:
                        raise ValueError(
                            f"padding must be nonnegative, got {padding}")
            else:
                if padding < 0:
                    raise ValueError(
                        f"padding must be nonnegative, got {padding}")
            if isinstance(dilation, tuple):
                for size in dilation:
                    if size <= 0:
                        raise ValueError(
                            f"dilation must be greater than zero, got {dilation}"
                        )
            else:
                if dilation <= 0:
                    raise ValueError(
                        f"dilation must be greater than zero, got {dilation}")
            self.in_channels = in_channels
            self.out_channels = out_channels
            # Normalize scalar hyper-parameters to (h, w) pairs.
            self.kernel_size = kernel_size if isinstance(
                kernel_size, tuple) else (kernel_size, kernel_size)
            self.stride = stride if isinstance(stride, tuple) else (stride, stride)
            self.padding = padding if isinstance(padding, tuple) else (padding, padding)
            self.dilation = dilation if isinstance(
                dilation, tuple) else (dilation, dilation)
            self.groups = groups
            # groups == in_channels == out_channels is a depthwise conv.
            self.is_depthwise_conv = self.groups == self.out_channels and self.groups == self.in_channels
            if self.is_depthwise_conv and jt.flags.use_cuda and jt.compiler.is_cuda:
                self.depthwise_conv = jt.nn.DepthwiseConv(
                    stride, padding, dilation)
            Kh, Kw = self.kernel_size
            # self.weight = init.relu_invariant_gauss([out_channels, in_channels//groups, Kh, Kw], dtype="float", mode="fan_out")
            self.weight = jt.init.invariant_uniform(
                [out_channels, in_channels // groups, Kh, Kw], dtype="float")
            if bias:
                # Uniform bias init with bound 1/sqrt(fan_in), fan_in taken
                # over all non-output weight dims.
                fan = 1
                for i in self.weight.shape[1:]:
                    fan *= i
                bound = 1 / math.sqrt(fan)
                self.bias = jt.init.uniform([out_channels],
                                            dtype="float",
                                            low=-bound,
                                            high=bound)
            else:
                self.bias = None

        def execute(self, x):
            # jt.nn.conv2d is itself patched below, so this dispatches to
            # conv_acl when ACL is active.
            ret = jt.nn.conv2d(x, self.weight, self.bias, self.stride,
                               self.padding, self.dilation, self.groups)
            return ret

    from .aclops.flip_op import FlipACL

    def flip_acl(x, dim):
        return FlipACL()(x, dim)

    from .aclops.concat_op import ConcatACL

    def concat(x, dim=0):
        return ConcatACL()(x, dim)

    from .aclops.gather_scatter_op import GatherACL

    def gather_acl(input, dim, index):
        return GatherACL()(input, dim, index)

    def any_acl(input, dim=None):
        # No dedicated ACL op: emulate any() with a nonzero-count reduction.
        if dim is None:
            if jt.sum(input != 0).item() > 0:
                return jt.array([True])
            else:
                return jt.array([False])
        else:
            return jt.sum(input != 0, dim=dim) > 0

    from .aclops.cumsum_op import CumsumACL

    def cumsum_acl(input, dim=-1):
        return CumsumACL()(input, dim)

    def cumprod_acl(x, dim=None):
        # cumprod via exp(cumsum(log(x))).
        # NOTE(review): valid only for strictly positive x — confirm callers.
        x = jt.log(x)
        x = cumsum_acl(x, dim=dim)
        return jt.exp(x)

    from .aclops.index_op import IndexACL

    def index_acl(inshape: Union[jt.Var, list], dim=None, dtype="int32"):
        if isinstance(inshape, jt.Var):
            inshape = inshape.shape
        return IndexACL()(inshape, dim, dtype)

    from .aclops.gather_scatter_op import ScatterACL

    def scatter_acl(input, dim, index, src, reduce='void'):
        return ScatterACL()(input, dim, index, src, reduce)

    from .aclops.where_op import WhereACL

    def where_acl(condition, x=None, y=None):
        return WhereACL()(condition, x, y)

    from .aclops.where_op import NonzeroACL

    def nonzero_acl(x):
        return NonzeroACL()(x)

    from .aclops.floor_op import FloorIntACL

    def floor_int_acl(x):
        return FloorIntACL()(x)

    from .aclops.getitem_op import GetItemACL

    def getitem_acl(x, slices, return_x=None):
        # Transform numpy int to int
        if isinstance(slices, (np.int8, np.int16, np.int32, np.int64)):
            slices = int(slices)
        if hasattr(np, 'int128') and isinstance(slices, np.int128):
            slices = int(slices)
        if hasattr(np, 'int256') and isinstance(slices, np.int256):
            slices = int(slices)
        ## If not related to `None`, directly use `GetItemACL`
        if slices is not None and (not isinstance(slices, Iterable)
                                   or all([s is not None for s in slices])):
            return GetItemACL()(x, slices, return_x)
        ## If related to `None`, filter out `None` first, then use `GetItemACL`, and finally insert `None` (new dimensions) back
        # Transform to tuple
        if isinstance(slices, int) or isinstance(slices, slice):
            slices = (slices, )
        assert isinstance(slices, tuple)

        def get_insert_positions(slices):
            # Compute the output positions where None introduced new axes.
            result = []
            pos = 0
            not_none_cnt = len(slices) - slices.count(None)
            for s in slices:
                if isinstance(s, int):
                    # Integer indexing drops a dimension: no output position.
                    continue
                elif s is None:
                    result.append(pos)
                    pos += 1
                elif s == Ellipsis:
                    # Ellipsis expands to the remaining untouched dims.
                    pos += 1 + x.ndim - not_none_cnt
                else:
                    pos += 1
            return result

        insert_positions = get_insert_positions(slices)
        slices_without_none = tuple(s for s in slices if s is not None)
        result = GetItemACL()(x, slices_without_none, return_x)
        for i in insert_positions:
            result = result.unsqueeze(i)
        return result

    from .aclops.setitem_op import SetItemACL

    def setitem_acl(x, slices, value):
        # In-place semantics: assign the computed result back into x.
        res = SetItemACL()(x, slices, value)
        return x.assign(res)

    from .aclops.bmm_op import BmmACL

    def bmm_acl(x1, x2):
        return BmmACL()(x1, x2)

    def bmm_transpose_acl(x1, x2):
        return BmmACL(True)(x1, x2)

    from .aclops.matmul_op import MatmulACL

    def matmul_acl(x1, x2):
        return MatmulACL()(x1, x2)

    def matmul_transpose_acl(x1, x2):
        return MatmulACL(True)(x1, x2)

    from .aclops.transpose_op import TransPoseACL

    def transpose_acl(x, *dim):
        return TransPoseACL()(x, *dim)

    from .aclops.relu_op import ReLUACL

    class ReLU(jt.nn.Module):
        def __init__(self):
            super(ReLU, self).__init__()

        def execute(self, x):
            return ReLUACL()(x)

    def relu(x):
        return ReLUACL()(x)

    from .aclops.relu_op import LeakyReLUACL

    class LeakyReLU(jt.nn.Module):
        def __init__(self, negative_slope=0.01):
            super(LeakyReLU, self).__init__()
            self.negative_slope = negative_slope

        def execute(self, x):
            return LeakyReLUACL()(x, self.negative_slope)

    def leaky_relu(x, scale=0.01):
        return LeakyReLUACL()(x, scale)

    from .aclops.dropout_op import DropoutACL

    class Dropout(jt.nn.Module):
        def __init__(self, p=0.5, is_train=False):
            super(Dropout, self).__init__()
            self.p = p
            self.is_train = is_train

        def execute(self, x):
            return DropoutACL()(x, self.p, self.is_train)

    def dropout_acl(x, p=0.5, is_train=False):
        return DropoutACL()(x, p, is_train)

    from .aclops.silu_op import SiLUACL

    def silu_acl(x):
        return SiLUACL()(x)

    class SiLU(jt.nn.Module):
        def __init__(self):
            super(SiLU, self).__init__()

        def execute(self, x):
            return SiLUACL()(x)

    from .aclops.sigmoid_op import SigmoidACL

    def sigmoid_acl(x):
        return SigmoidACL()(x)

    class Sigmoid(jt.nn.Module):
        def __init__(self):
            super(Sigmoid, self).__init__()

        def execute(self, x):
            return SigmoidACL()(x)

    # class Embedding(jt.nn.Module):
    #     def __init__(self,
    #                  num_embeddings,
    #                  embedding_dim,
    #                  padding_idx=None,
    #                  dtype="float32"):
    #         self.num_embeddings = num_embeddings
    #         self.embedding_dim = embedding_dim
    #         self.padding_idx = padding_idx
    #         self.weight = jt.init.gauss(
    #             [self.num_embeddings, self.embedding_dim], dtype)
    #         if padding_idx is not None:
    #             self.weight[padding_idx] = 0
    #     def execute(self, x):
    #         res = embedding_acl(x, self.weight)
    #         return res

    class Softmax(jt.nn.Module):
        def __init__(self):
            super(Softmax, self).__init__()

        def execute(self, x, dim):
            return SoftmaxACL()(x, dim)

    def softmax_acl(x, dim):
        return SoftmaxACL()(x, dim)

    from .aclops.rope_op import RopeACL

    def rope_acl(xq, xk, freqs_cis=None, freq_sin=None, freq_cos=None):
        return RopeACL()(xq, xk, freqs_cis, freq_sin, freq_cos)

    from .aclops.stack_op import StackACL

    def stack_acl(x, dim=0):
        return StackACL()(x, dim)

    from .aclops.nantonum_op import NanToNumACL

    def isnan_acl(x):
        # x is NaN iff it differs from its nan-replaced copy; the extra
        # not_equal(tonum, 1) guard excludes the replacement value itself.
        tonum = NanToNumACL()(x, -1.0)
        return jt.not_equal(x, tonum).logical_and(
            jt.not_equal(tonum, jt.ones_like(x)))

    def isinf_acl(x):
        tonum = NanToNumACL()(x, 1.0)
        return jt.not_equal(x, tonum).logical_and(
            jt.not_equal(tonum, jt.ones_like(x)))

    def warp(origin_func, new_func, name=None):
        # Wrap (sic) an original jittor callable/Module so the ACL variant
        # is used only when jt.flags.use_acl is on. name='setitem' marks
        # in-place semantics: the original's result is assigned back.
        if isinstance(origin_func, type):
            class WrappedClass(origin_func, new_func):
                def __init__(self, *args, **kwargs):
                    if jt.flags.use_acl:
                        new_func.__init__(self, *args, **kwargs)
                    else:
                        origin_func.__init__(self, *args, **kwargs)

                def execute(self, *args, **kwargs):
                    if jt.flags.use_acl:
                        return new_func.execute(self, *args, **kwargs)
                    elif name == 'setitem':
                        return args[0].assign(origin_func(*args, **kwargs))
                    else:
                        return origin_func.execute(self, *args, **kwargs)
            return WrappedClass
        else:
            def warpper(*args, **kwargs):
                if jt.flags.use_acl:
                    return new_func(*args, **kwargs)
                elif name == 'setitem':
                    return args[0].assign(origin_func(*args, **kwargs))
                else:
                    return origin_func(*args, **kwargs)
            return warpper

    # ---- patch the public jittor API (order matters: fake_* locals must
    # capture originals before the attribute is overwritten) ----
    jt.triu = warp(jt.triu, triu_acl)
    jt.triu_ = warp(jt.triu, triu_acl)
    jt.Var.triu = jt.triu
    jt.Var.triu_ = lambda x, diagonal=0: x.assign(x.triu(diagonal))
    jt.nn.conv2d = warp(jt.nn.conv2d, conv_acl)
    jt.nn.Conv2d = warp(jt.nn.Conv2d, Conv2D)
    jt.nn.Conv = warp(jt.nn.Conv, Conv2D)
    jt.nn.Pool = warp(jt.nn.Pool, PoolACL)
    jt.flip = warp(jt.flip, flip_acl)
    jt.Var.flip = lambda x, dim_vector=0: jt.flip(x, dim_vector)
    jt.concat = warp(jt.concat, concat)
    jt.stack = warp(jt.stack, stack_acl)
    jt.gather = warp(jt.gather, gather_acl)
    jt.any = warp(jt.any, any_acl)
    jt.Var.any = jt.any
    jt.cumsum = warp(jt.cumsum, cumsum_acl)
    jt.cub_cumsum = jt.cumsum
    jt.Var.cumsum = jt.cumsum
    jt.Var.cub_cumsum = jt.cumsum
    jt.cumprod = warp(jt.cumprod, cumprod_acl)
    jt.index = warp(jt.index, index_acl)
    jt.Var.index = jt.index
    jt.scatter = warp(jt.scatter, scatter_acl)
    jt.Var.scatter = lambda x, dim, index, src, reduce="void": jt.scatter(
        x, dim, index, src, reduce)
    jt.where = warp(jt.where, where_acl)
    jt.nonzero = warp(jt.nonzero, nonzero_acl)
    jt.misc.nonzero = warp(jt.misc.nonzero, nonzero_acl)
    jt.Var.nonzero = jt.misc.nonzero
    jt.floor_int = warp(jt.floor_int, floor_int_acl)
    jt.Var.floor_int = lambda x: jt.floor_int(x)
    jt.getitem = warp(jt.contrib.getitem, getitem_acl)
    fake_getitem = jt.Var.getitem
    jt.Var.getitem = lambda x, slices, return_x=None: warp(
        fake_getitem, getitem_acl)(x, slices)
    jt.Var.slice_var = lambda x, slices, return_x=None: warp(
        fake_getitem, getitem_acl)(x, slices)
    jt.Var.__getitem__ = lambda x, slices, return_x=None: warp(
        fake_getitem, getitem_acl)(x, slices)
    jt.setitem = warp(jt.contrib.setitem, setitem_acl)
    fake_setitem = jt.Var.setitem
    jt.Var.setitem = lambda x, slices, value: warp(
        fake_setitem, setitem_acl, name='setitem')(x, slices, value)
    jt.Var.__setitem__ = lambda x, slices, value: warp(
        fake_setitem, setitem_acl, name='setitem')(x, slices, value)
    fake_matmul = jt.Var.matmul
    jt.nn.bmm = warp(jt.nn.bmm, bmm_acl)
    jt.bmm = warp(jt.bmm, bmm_acl)
    jt.nn.matmul = warp(jt.matmul, matmul_acl)
    jt.matmul = warp(jt.matmul, matmul_acl)
    jt.nn.matmul_transpose = warp(jt.nn.matmul_transpose, matmul_transpose_acl)
    jt.nn.bmm_transpose = warp(jt.nn.bmm_transpose, bmm_transpose_acl)
    jt.bmm_transpose = warp(jt.bmm_transpose, bmm_transpose_acl)
    jt.Var.__matmul__ = lambda x, y: warp(fake_matmul, matmul_acl)(x, y)
    jt.transpose = warp(jt.transpose, transpose_acl)
    fake_transpose = jt.transpose
    jt.Var.transpose = lambda x, *dim: warp(fake_transpose, transpose_acl)(x, *dim)
    # jt.Var.permute = lambda x: warp(fake_transpose, transpose_acl)(x)
    # jt.Var.t = lambda x: warp(fake_transpose, transpose_acl)(x)
    jt.nn.relu = warp(jt.nn.relu, relu)
    jt.nn.ReLU = warp(jt.nn.ReLU, ReLU)
    jt.nn.leaky_relu = warp(jt.nn.leaky_relu, leaky_relu)
    jt.nn.LeakyReLU = warp(jt.nn.LeakyReLU, LeakyReLU)
    # jt.nn.silu = warp(jt.nn.silu, silu_acl)
    # jt.nn.SiLU = warp(jt.nn.SiLU, SiLU)
    jt.sigmoid = warp(jt.sigmoid, sigmoid_acl)
    jt.nn.Sigmoid = warp(jt.nn.Sigmoid, Sigmoid)
    # from .aclops.embedding_op import EmbeddingACL
    # def embedding_acl(indices, weight):
    #     return EmbeddingACL()(indices, weight)
    # jt.nn.embedding = warp(jt.nn.embedding, embedding_acl)
    # jt.nn.Embedding = warp(jt.nn.Embedding, Embedding)
    jt.nn.dropout = warp(jt.nn.dropout, dropout_acl)
    jt.nn.Dropout = warp(jt.nn.Dropout, Dropout)
    jt.nn.softmax = warp(jt.nn.softmax, softmax_acl)
    # from .aclops.norms_op import BatchNormACL,LayerNormACL
    # jt.nn.BatchNorm = warp(jt.nn.BatchNorm, BatchNormACL)
    # jt.nn.LayerNorm = warp(jt.nn.LayerNorm, LayerNormACL)
    jt.nn.FlashAttention = warp(jt.nn.FlashAttention, FlashAttentionACL)
    jt.isnan = warp(jt.isnan, isnan_acl)
    jt.isinf = warp(jt.isinf, isinf_acl)
    jt.Var.isnan = jt.isnan
    jt.Var.isinf = jt.isinf
    # rope has no non-ACL counterpart: installed unconditionally.
    jt.nn.rotary_emb = rope_acl

View File

@ -0,0 +1,232 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
using std::string;
using std::unordered_map;
typedef int aclError;
// Parse a blob of C-style constant definitions of the shape
// "... ACL_NAME = 12345;" into a code -> name lookup table.
// Lines without a ';' (comments, blanks) are ignored; names are expected
// to start with "ACL_" and be preceded by a space.
static inline unordered_map<aclError, string> gen_map(string s)
{
    unordered_map<aclError, string> smap;
    for (int i = 0; i < s.size(); i++)
    {
        if (s[i] == ';')
        {
            // j: space immediately before the numeric value.
            int j = s.rfind(" ", i);
            int code = std::stoi(s.substr(j + 1, i - j - 1));
            // k: space before '='; l: space preceding the "ACL_..." name.
            int k = s.rfind(" ", j - 1);
            int l = s.rfind(" ACL_", k - 1);
            smap[code] = s.substr(l + 1, k - l - 1);
        }
    }
    return smap;
}
// Translate an ACL/GE/RT error code to its symbolic constant name.
// The table is built once (thread-safe static init) by parsing the constant
// definitions embedded below, which were copied verbatim from the CANN
// headers. Unknown codes yield "unknown <code>".
string acl_error_to_string(aclError error)
{
    static unordered_map<aclError, string> acl_error_map = gen_map(R"(
// from acl_base.h
static const int ACL_ERROR_INVALID_PARAM = 100000;
static const int ACL_ERROR_UNINITIALIZE = 100001;
static const int ACL_ERROR_REPEAT_INITIALIZE = 100002;
static const int ACL_ERROR_INVALID_FILE = 100003;
static const int ACL_ERROR_WRITE_FILE = 100004;
static const int ACL_ERROR_INVALID_FILE_SIZE = 100005;
static const int ACL_ERROR_PARSE_FILE = 100006;
static const int ACL_ERROR_FILE_MISSING_ATTR = 100007;
static const int ACL_ERROR_FILE_ATTR_INVALID = 100008;
static const int ACL_ERROR_INVALID_DUMP_CONFIG = 100009;
static const int ACL_ERROR_INVALID_PROFILING_CONFIG = 100010;
static const int ACL_ERROR_INVALID_MODEL_ID = 100011;
static const int ACL_ERROR_DESERIALIZE_MODEL = 100012;
static const int ACL_ERROR_PARSE_MODEL = 100013;
static const int ACL_ERROR_READ_MODEL_FAILURE = 100014;
static const int ACL_ERROR_MODEL_SIZE_INVALID = 100015;
static const int ACL_ERROR_MODEL_MISSING_ATTR = 100016;
static const int ACL_ERROR_MODEL_INPUT_NOT_MATCH = 100017;
static const int ACL_ERROR_MODEL_OUTPUT_NOT_MATCH = 100018;
static const int ACL_ERROR_MODEL_NOT_DYNAMIC = 100019;
static const int ACL_ERROR_OP_TYPE_NOT_MATCH = 100020;
static const int ACL_ERROR_OP_INPUT_NOT_MATCH = 100021;
static const int ACL_ERROR_OP_OUTPUT_NOT_MATCH = 100022;
static const int ACL_ERROR_OP_ATTR_NOT_MATCH = 100023;
static const int ACL_ERROR_OP_NOT_FOUND = 100024;
static const int ACL_ERROR_OP_LOAD_FAILED = 100025;
static const int ACL_ERROR_UNSUPPORTED_DATA_TYPE = 100026;
static const int ACL_ERROR_FORMAT_NOT_MATCH = 100027;
static const int ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED = 100028;
static const int ACL_ERROR_KERNEL_NOT_FOUND = 100029;
static const int ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED = 100030;
static const int ACL_ERROR_KERNEL_ALREADY_REGISTERED = 100031;
static const int ACL_ERROR_INVALID_QUEUE_ID = 100032;
static const int ACL_ERROR_REPEAT_SUBSCRIBE = 100033;
static const int ACL_ERROR_STREAM_NOT_SUBSCRIBE = 100034;
static const int ACL_ERROR_THREAD_NOT_SUBSCRIBE = 100035;
static const int ACL_ERROR_WAIT_CALLBACK_TIMEOUT = 100036;
static const int ACL_ERROR_REPEAT_FINALIZE = 100037;
static const int ACL_ERROR_NOT_STATIC_AIPP = 100038;
static const int ACL_ERROR_COMPILING_STUB_MODE = 100039;
static const int ACL_ERROR_GROUP_NOT_SET = 100040;
static const int ACL_ERROR_GROUP_NOT_CREATE = 100041;
static const int ACL_ERROR_PROF_ALREADY_RUN = 100042;
static const int ACL_ERROR_PROF_NOT_RUN = 100043;
static const int ACL_ERROR_DUMP_ALREADY_RUN = 100044;
static const int ACL_ERROR_DUMP_NOT_RUN = 100045;
static const int ACL_ERROR_PROF_REPEAT_SUBSCRIBE = 148046;
static const int ACL_ERROR_PROF_API_CONFLICT = 148047;
static const int ACL_ERROR_INVALID_MAX_OPQUEUE_NUM_CONFIG = 148048;
static const int ACL_ERROR_INVALID_OPP_PATH = 148049;
static const int ACL_ERROR_OP_UNSUPPORTED_DYNAMIC = 148050;
static const int ACL_ERROR_RELATIVE_RESOURCE_NOT_CLEARED = 148051;
static const int ACL_ERROR_BAD_ALLOC = 200000;
static const int ACL_ERROR_API_NOT_SUPPORT = 200001;
static const int ACL_ERROR_INVALID_DEVICE = 200002;
static const int ACL_ERROR_MEMORY_ADDRESS_UNALIGNED = 200003;
static const int ACL_ERROR_RESOURCE_NOT_MATCH = 200004;
static const int ACL_ERROR_INVALID_RESOURCE_HANDLE = 200005;
static const int ACL_ERROR_FEATURE_UNSUPPORTED = 200006;
static const int ACL_ERROR_PROF_MODULES_UNSUPPORTED = 200007;
static const int ACL_ERROR_STORAGE_OVER_LIMIT = 300000;
static const int ACL_ERROR_INTERNAL_ERROR = 500000;
static const int ACL_ERROR_FAILURE = 500001;
static const int ACL_ERROR_GE_FAILURE = 500002;
static const int ACL_ERROR_RT_FAILURE = 500003;
static const int ACL_ERROR_DRV_FAILURE = 500004;
static const int ACL_ERROR_PROFILING_FAILURE = 500005;
// from ge_error_codes.h
static const uint32_t ACL_ERROR_GE_PARAM_INVALID = 145000U;
static const uint32_t ACL_ERROR_GE_EXEC_NOT_INIT = 145001U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_PATH_INVALID = 145002U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ID_INVALID = 145003U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_DATA_SIZE_INVALID = 145006U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_ADDR_INVALID = 145007U;
static const uint32_t ACL_ERROR_GE_EXEC_MODEL_QUEUE_ID_INVALID = 145008U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_REPEATED = 145009U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_ADDR_INVALID = 145011U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_INPUT_LENGTH_INVALID = 145012U;
static const uint32_t ACL_ERROR_GE_DYNAMIC_BATCH_SIZE_INVALID = 145013U;
static const uint32_t ACL_ERROR_GE_AIPP_BATCH_EMPTY = 145014U;
static const uint32_t ACL_ERROR_GE_AIPP_NOT_EXIST = 145015U;
static const uint32_t ACL_ERROR_GE_AIPP_MODE_INVALID = 145016U;
static const uint32_t ACL_ERROR_GE_OP_TASK_TYPE_INVALID = 145017U;
static const uint32_t ACL_ERROR_GE_OP_KERNEL_TYPE_INVALID = 145018U;
static const uint32_t ACL_ERROR_GE_PLGMGR_PATH_INVALID = 145019U;
static const uint32_t ACL_ERROR_GE_FORMAT_INVALID = 145020U;
static const uint32_t ACL_ERROR_GE_SHAPE_INVALID = 145021U;
static const uint32_t ACL_ERROR_GE_DATATYPE_INVALID = 145022U;
static const uint32_t ACL_ERROR_GE_MEMORY_ALLOCATION = 245000U;
static const uint32_t ACL_ERROR_GE_MEMORY_OPERATE_FAILED = 245001U;
static const uint32_t ACL_ERROR_GE_INTERNAL_ERROR = 545000U;
static const uint32_t ACL_ERROR_GE_LOAD_MODEL = 545001U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_MODEL_PARTITION_FAILED = 545002U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_WEIGHT_PARTITION_FAILED = 545003U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_TASK_PARTITION_FAILED = 545004U;
static const uint32_t ACL_ERROR_GE_EXEC_LOAD_KERNEL_PARTITION_FAILED = 545005U;
static const uint32_t ACL_ERROR_GE_EXEC_RELEASE_MODEL_DATA = 545006U;
static const uint32_t ACL_ERROR_GE_COMMAND_HANDLE = 545007U;
static const uint32_t ACL_ERROR_GE_GET_TENSOR_INFO = 545008U;
static const uint32_t ACL_ERROR_GE_UNLOAD_MODEL = 545009U;
static const int32_t ACL_ERROR_RT_PARAM_INVALID = 107000; // param invalid
static const int32_t ACL_ERROR_RT_INVALID_DEVICEID = 107001; // invalid device id
static const int32_t ACL_ERROR_RT_CONTEXT_NULL = 107002; // current context null
static const int32_t ACL_ERROR_RT_STREAM_CONTEXT = 107003; // stream not in current context
static const int32_t ACL_ERROR_RT_MODEL_CONTEXT = 107004; // model not in current context
static const int32_t ACL_ERROR_RT_STREAM_MODEL = 107005; // stream not in model
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID = 107006; // event timestamp invalid
static const int32_t ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL = 107007; // event timestamp reversal
static const int32_t ACL_ERROR_RT_ADDR_UNALIGNED = 107008; // memory address unaligned
static const int32_t ACL_ERROR_RT_FILE_OPEN = 107009; // open file failed
static const int32_t ACL_ERROR_RT_FILE_WRITE = 107010; // write file failed
static const int32_t ACL_ERROR_RT_STREAM_SUBSCRIBE = 107011; // error subscribe stream
static const int32_t ACL_ERROR_RT_THREAD_SUBSCRIBE = 107012; // error subscribe thread
static const int32_t ACL_ERROR_RT_GROUP_NOT_SET = 107013; // group not set
static const int32_t ACL_ERROR_RT_GROUP_NOT_CREATE = 107014; // group not create
static const int32_t ACL_ERROR_RT_STREAM_NO_CB_REG = 107015; // callback not register to stream
static const int32_t ACL_ERROR_RT_INVALID_MEMORY_TYPE = 107016; // invalid memory type
static const int32_t ACL_ERROR_RT_INVALID_HANDLE = 107017; // invalid handle
static const int32_t ACL_ERROR_RT_INVALID_MALLOC_TYPE = 107018; // invalid malloc type
static const int32_t ACL_ERROR_RT_WAIT_TIMEOUT = 107019; // wait timeout
static const int32_t ACL_ERROR_RT_FEATURE_NOT_SUPPORT = 207000; // feature not support
static const int32_t ACL_ERROR_RT_MEMORY_ALLOCATION = 207001; // memory allocation error
static const int32_t ACL_ERROR_RT_MEMORY_FREE = 207002; // memory free error
static const int32_t ACL_ERROR_RT_AICORE_OVER_FLOW = 207003; // aicore over flow
static const int32_t ACL_ERROR_RT_NO_DEVICE = 207004; // no device
static const int32_t ACL_ERROR_RT_RESOURCE_ALLOC_FAIL = 207005; // resource alloc fail
static const int32_t ACL_ERROR_RT_NO_PERMISSION = 207006; // no permission
static const int32_t ACL_ERROR_RT_NO_EVENT_RESOURCE = 207007; // no event resource
static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE = 207008; // no stream resource
static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE = 207009; // no notify resource
static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE = 207010; // no model resource
static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011; // no cdq resource
static const int32_t ACL_ERROR_RT_OVER_LIMIT = 207012; // over limit
static const int32_t ACL_ERROR_RT_QUEUE_EMPTY = 207013; // queue is empty
static const int32_t ACL_ERROR_RT_QUEUE_FULL = 207014; // queue is full
static const int32_t ACL_ERROR_RT_REPEATED_INIT = 207015; // repeated init
static const int32_t ACL_ERROR_RT_AIVEC_OVER_FLOW = 207016; // aivec over flow
static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000; // runtime internal error
static const int32_t ACL_ERROR_RT_TS_ERROR = 507001; // ts internel error
static const int32_t ACL_ERROR_RT_STREAM_TASK_FULL = 507002; // task full in stream
static const int32_t ACL_ERROR_RT_STREAM_TASK_EMPTY = 507003; // task empty in stream
static const int32_t ACL_ERROR_RT_STREAM_NOT_COMPLETE = 507004; // stream not complete
static const int32_t ACL_ERROR_RT_END_OF_SEQUENCE = 507005; // end of sequence
static const int32_t ACL_ERROR_RT_EVENT_NOT_COMPLETE = 507006; // event not complete
static const int32_t ACL_ERROR_RT_CONTEXT_RELEASE_ERROR = 507007; // context release error
static const int32_t ACL_ERROR_RT_SOC_VERSION = 507008; // soc version error
static const int32_t ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT = 507009; // task type not support
static const int32_t ACL_ERROR_RT_LOST_HEARTBEAT = 507010; // ts lost heartbeat
static const int32_t ACL_ERROR_RT_MODEL_EXECUTE = 507011; // model execute failed
static const int32_t ACL_ERROR_RT_REPORT_TIMEOUT = 507012; // report timeout
static const int32_t ACL_ERROR_RT_SYS_DMA = 507013; // sys dma error
static const int32_t ACL_ERROR_RT_AICORE_TIMEOUT = 507014; // aicore timeout
static const int32_t ACL_ERROR_RT_AICORE_EXCEPTION = 507015; // aicore exception
static const int32_t ACL_ERROR_RT_AICORE_TRAP_EXCEPTION = 507016; // aicore trap exception
static const int32_t ACL_ERROR_RT_AICPU_TIMEOUT = 507017; // aicpu timeout
static const int32_t ACL_ERROR_RT_AICPU_EXCEPTION = 507018; // aicpu exception
static const int32_t ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR = 507019; // aicpu datadump response error
static const int32_t ACL_ERROR_RT_AICPU_MODEL_RSP_ERR = 507020; // aicpu model operate response error
static const int32_t ACL_ERROR_RT_PROFILING_ERROR = 507021; // profiling error
static const int32_t ACL_ERROR_RT_IPC_ERROR = 507022; // ipc error
static const int32_t ACL_ERROR_RT_MODEL_ABORT_NORMAL = 507023; // model abort normal
static const int32_t ACL_ERROR_RT_KERNEL_UNREGISTERING = 507024; // kernel unregistering
static const int32_t ACL_ERROR_RT_RINGBUFFER_NOT_INIT = 507025; // ringbuffer not init
static const int32_t ACL_ERROR_RT_RINGBUFFER_NO_DATA = 507026; // ringbuffer no data
static const int32_t ACL_ERROR_RT_KERNEL_LOOKUP = 507027; // kernel lookup error
static const int32_t ACL_ERROR_RT_KERNEL_DUPLICATE = 507028; // kernel register duplicate
static const int32_t ACL_ERROR_RT_DEBUG_REGISTER_FAIL = 507029; // debug register failed
static const int32_t ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL = 507030; // debug unregister failed
static const int32_t ACL_ERROR_RT_LABEL_CONTEXT = 507031; // label not in current context
static const int32_t ACL_ERROR_RT_PROGRAM_USE_OUT = 507032; // program register num use out
static const int32_t ACL_ERROR_RT_DEV_SETUP_ERROR = 507033; // device setup error
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT = 507034; // vector core timeout
static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION = 507035; // vector core exception
static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception
static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL = 507037; // cdq alloc batch abnormal
static const int32_t ACL_ERROR_RT_DIE_MODE_CHANGE_ERROR = 507038; // can not change die mode
static const int32_t ACL_ERROR_RT_DIE_SET_ERROR = 507039; // single die mode can not set die
static const int32_t ACL_ERROR_RT_INVALID_DIEID = 507040; // invalid die id
static const int32_t ACL_ERROR_RT_DIE_MODE_NOT_SET = 507041; // die mode not set
static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR = 507899; // drv internal error
static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR = 507900; // aicpu internal error
static const int32_t ACL_ERROR_RT_SOCKET_CLOSE = 507901; // hdc disconnect
)");
    // Single lookup instead of count() + operator[] (two hash probes).
    auto it = acl_error_map.find(error);
    if (it != acl_error_map.end())
        return it->second;
    return "unknown " + std::to_string((int)error);
}

320
python/jittor/extern/acl/acl_jittor.cc vendored Normal file
View File

@ -0,0 +1,320 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "utils/str_utils.h"
#include <chrono>
#include <thread>
#include "aclnn/aclnn.h"
namespace jittor
{
uint64_t acl_jittor_tid;           // thread id of the (currently disabled) report-processing thread
int acl_jittor_thread_running = 0; // flag polled by the report thread's loop; 0 requests shutdown
aclrtStream aclstream;             // global stream on which all ACL work is enqueued
void *workspaceAddr = nullptr;     // shared scratch buffer for aclnn workspace requests
uint64_t nowWorkSpaceSize = 0;     // current capacity of workspaceAddr, in bytes
// Abort (via ASSERTop) when an ACL call returns a non-zero status code.
#define CHECK_ACL(x) ASSERTop(x, ==, 0)
void mallocWorkSpace(uint64_t size)
{
uint64_t alloc_size = size + 32;
alloc_size = ((alloc_size - 1) / 32 + 1) * 32;
if (alloc_size > nowWorkSpaceSize)
{
aclrtFree(workspaceAddr);
nowWorkSpaceSize = alloc_size;
auto ret = aclrtMalloc(&workspaceAddr, nowWorkSpaceSize, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return);
}
}
// Worker-thread entry point: drain the ACL report queue until asked to stop.
// Sets acl_jittor_thread_running while alive and clears it on exit.
static void *acl_jittor_process_callback(void *)
{
    acl_jittor_thread_running = 1;
    for (;;)
    {
        if (!acl_jittor_thread_running)
            break;
        // LOGir << "acl_jittor_process_callback";
        auto status = aclrtProcessReport(1000);
        if (status == 0)
            continue;
        // Timeouts and "thread not subscribed" statuses are expected during
        // shutdown; anything else is worth surfacing in the log.
        if (acl_jittor_thread_running && status != ACL_ERROR_RT_REPORT_TIMEOUT && status != ACL_ERROR_RT_THREAD_SUBSCRIBE)
            LOGir << "aclrtProcessReport:" << status << acl_error_to_string(status);
        break;
    }
    acl_jittor_thread_running = 0;
    return (void *)0;
}
// RAII singleton that initializes the ACL runtime at load time and tears it
// down at process exit. A single static instance (_acl_jittor_initer) owns
// device 0 and the global stream `aclstream`.
struct acl_jittor_initer
{
    int32_t deviceId; // device claimed by this process; currently fixed to 0
    acl_jittor_initer()
    {
        CHECK_ACL(aclInit(nullptr));
        uint device_count = 0;
        deviceId = 0;
        // Query how many ACL devices are available.
        CHECK_ACL(aclrtGetDeviceCount(&device_count));
        LOGi << "Found ACL device number:" << device_count;
        CHECK_ACL(aclrtSetDevice(deviceId));
        CHECK_ACL(aclrtCreateStream(&aclstream));
        // pthread_create(&acl_jittor_tid, nullptr, acl_jittor_process_callback, 0);
    }
    ~acl_jittor_initer()
    {
        acl_jittor_thread_running = 0;
        // CHECK_ACL(aclrtUnSubscribeReport(acl_jittor_tid, 0));
        // Release device memory while the runtime is still initialized:
        // freeing after aclFinalize() is invalid.
        if (workspaceAddr != nullptr)
        {
            aclrtFree(workspaceAddr);
            workspaceAddr = nullptr;
        }
        aclrtDestroyStream(aclstream);
        aclrtResetDevice(deviceId);
        CHECK_ACL(aclFinalize());
    }
} _acl_jittor_initer;
// Translate CUDA-flavoured source text into its ACL (Huawei Ascend)
// equivalent by token-level rewriting: header names, API calls, enum values
// and a few call-site idioms are mapped onto their aclrt counterparts.
//   src   - original source text
//   name  - file name, used to special-case a few known files
//   kargs - extra keyword arguments (currently unused)
// Returns `src` unchanged when no token required editing.
string process_acl(const string &src, const string &name, const map<string, string> &kargs)
{
    // ACL-specific sources are already in the target dialect.
    if (endswith(name, "_jittor.cc"))
        return src;
    // static vector<string> dont_compile = {"fp16_emu.cc"};
    // for (auto& s : dont_compile)
    //     if (endswith(name, s))
    //         return " ";
    // CUDA header names whose #include is redirected to "acl_jittor".
    static unordered_set<string> cuda_headers = {
        "cuda_runtime", "cudnn", "driver_types",
        "cuda_fp16", "cuda_runtime_api", "fp16_emu",
        "cudnn_rnn_descriptor", "cublas_v2", "cublas_wrapper",
        "curand", "curand_wrapper", "cufft", "cufftXt",
        "CudaUtils", "cutt", "cudnn_wrapper", "cuda_bf16"};
    // cuDNN/cuFFT handle types with no ACL counterpart; degraded to plain int.
    static unordered_set<string> fake_class = {
        "cudnnHandle_t", "cudnnConvolutionBwdFilterAlgo_t",
        "cudnnConvolutionBwdDataAlgo_t", "cudnnConvolutionFwdAlgo_t",
        "cufftHandle"};
    try
    {
        auto tokens = token_split(src);
        int edit = 0; // count of rewritten tokens; 0 means src is returned as-is
        for (int i = 0; i < tokens.size(); i++)
        {
            auto &token = tokens[i];
            if (cuda_headers.count(token))
                token = "acl_jittor", edit++;
            else if (fake_class.count(token))
                token = "int", edit++;
            else if (token == "CUDA")
                token = "ACL", edit++;
            else if (startswith(token, "cuda"))
            {
                // Only rewrite camelCase CUDA API names (cudaXxx...); other
                // identifiers that merely start with "cuda" are left alone.
                if (token.size() >= 5 && token[4] >= 'A' && token[4] <= 'Z')
                {
                    if (token == "cudaGetDeviceCount")
                    {
                        // aclrtGetDeviceCount takes a uint* instead of int*.
                        token_replace(tokens, i, "($1);", "((uint*)$1);");
                    }
                    else if (token == "cudaLaunchHostFunc")
                    {
                        // ACL_CALLBACK_BLOCK for 310
                        // Argument order differs: stream moves to the end.
                        token_replace(tokens, i, "LaunchHostFunc($1,$2,$3)",
                                      "LaunchCallback($2,$3,ACL_CALLBACK_NO_BLOCK,$1)");
                    }
                    else if (token == "cudaMemcpy")
                        // aclrtMemcpy takes (dst, destMax, src, count, kind);
                        // the byte count is duplicated as destMax.
                        token_replace(tokens, i, "cudaMemcpy($1,$2,$3,",
                                      "aclrtMemcpy($1,$3,$2,$3,");
                    else if (token == "cudaMemcpyAsync")
                        token_replace(tokens, i, "cudaMemcpyAsync($1,$2,$3,",
                                      "aclrtMemcpyAsync($1,$3,$2,$3,");
                    else if (token == "cudaMemcpyDeviceToHost")
                        token = "ACL_MEMCPY_DEVICE_TO_HOST";
                    else if (token == "cudaMemcpyDefault")
                        token = "ACL_MEMCPY_HOST_TO_DEVICE";
                    else if (token == "cudaMemcpyHostToDevice")
                        token = "ACL_MEMCPY_HOST_TO_DEVICE";
                    else if (token == "cudaMemcpyDeviceToDevice")
                        token = "ACL_MEMCPY_DEVICE_TO_DEVICE";
                    else if (token == "cudaMallocManaged" || token == "cudaMalloc")
                    {
                        // unified address not supported
                        token = "aclrtMalloc";
                        token_replace(tokens, i, "($1,$2)",
                                      "($1,$2,ACL_MEM_MALLOC_HUGE_FIRST)");
                    }
                    else if (token == "cudaMemGetInfo")
                        token_replace(tokens, i, "cudaMemGetInfo($1,$2)",
                                      "aclrtGetMemInfo(ACL_DDR_MEM,$1,$2)");
                    else if (token == "cudaGetLastError")
                        // No ACL equivalent; treat as always-success.
                        token_replace(tokens, i, "cudaGetLastError()", "0");
                    else if (token == "cudaStreamCreateWithFlags")
                        token_replace(tokens, i - 1,
                                      "(cudaStreamCreateWithFlags($1,$2));",
                                      "(aclrtCreateStream($1)); checkAclErrors(aclrtSubscribeReport(acl_jittor_tid,*$1));");
                    else if (token == "cudaEventCreate")
                        token_replace(tokens, i,
                                      "cudaEventCreate($1,$2)",
                                      "aclrtCreateEvent($1)");
                    else if (token == "cudaDeviceSynchronize")
                        token = "aclrtSynchronizeDevice";
                    else if (token == "cudaStreamDestroy")
                        token_replace(tokens, i, "cudaStreamDestroy($1)",
                                      "(aclrtUnSubscribeReport(acl_jittor_tid,$1), aclrtDestroyStream($1))");
                    else if (token == "cudaEventDestroy")
                        token = "aclrtDestroyEvent";
                    else if (token == "cudaEventRecord")
                        token = "aclrtRecordEvent";
                    else if (token == "cudaStreamWaitEvent")
                        // ACL has no flags argument.
                        token_replace(tokens, i,
                                      "cudaStreamWaitEvent($1,$2,$3)",
                                      "aclrtStreamWaitEvent($1,$2)");
                    // Fallback: any remaining cudaXxx name becomes aclrtXxx,
                    // and trailing "_t" is stripped from type names.
                    if (token.size() && token[0] == 'c')
                        token = "aclrt" + token.substr(4);
                    if (endswith(token, "_t"))
                        token = token.substr(0, token.size() - 2);
                    edit++;
                }
            }
            else if (token == "_cudaGetErrorEnum")
            {
                token_replace(tokens, i, "_cudaGetErrorEnum($1)", "(acl_error_to_string($1))");
                edit++;
            }
            else if (token == "checkCudaErrors")
                token = "checkAclErrors";
            else if (token == "JPU")
            {
                // JPU(...) is Jittor's portability hook; expand the two
                // variants this backend understands.
                edit++;
                string new_code;
                if (tokens[i + 2] == "op_compiler")
                    token_replace(tokens, i,
                                  "JPU(op_compiler($1,$2,$3))",
                                  "acl_jittor_op_compiler($1,$2,$3)");
                else if (tokens[i + 2] == "header")
                    new_code = "#include \"acl_jittor.h\"";
                if (new_code.size())
                    token_replace(tokens, i, "JPU($1)", new_code);
            }
            else if (token == "use_cuda_managed_allocator" && tokens[i + 1][0] == ',')
            {
                tokens[i + 2] = "0"; // disable unified address
            }
        }
        if (!edit)
            return src;
        string new_src = join(tokens, "");
        // if (name == "executor.cc") {
        //     new_src = string("#include <Python.h>\n#include <pystate.h>\n#include <common.h>\n")+
        //     "namespace jittor { void acl_op_exec(Op*); }\n" +
        //     replace(new_src, "op->do_run_after_prepare(jkl);",
        //     R"({
        //     acl_op_exec(op);
        //     })");
        // }
        if (name == "profiler.cc")
        {
            // Profiler compiles kernels itself; point it at tikcc sources.
            new_src = token_replace_all(new_src, ".cc", ".tikcc");
        }
        // LOGir << name << (name == "pass_manager.cc");
        if (name == "pass_manager.cc")
        {
            LOGir << "replace" << name;
            // NOTE(review): replaces the FloatAtomicFixPass invocation with an
            // undefined token, effectively disabling that pass — confirm intent.
            new_src = token_replace_all(new_src, "run_pass<FloatAtomicFixPass>();", "WTF");
        }
        return new_src;
    }
    catch (const std::exception &e)
    {
        LOGe << "process acl error:" << e.what();
        LOGe << "name:" << name;
        throw;
    }
}
// Rewrite a JIT-generated CUDA kernel (in `src`, in place) into source that
// the tikcc compiler for Ascend accepts: strip CUDA-only attributes, force a
// single-thread launch configuration, and lower constructs (ternaries,
// ::max, atomicAdd, sqrtf) that tikcc cannot compile directly.
//   filename    - output file name (unused here, part of the JPU hook ABI)
//   src         - kernel source, replaced with the transformed text
//   is_acl      - no-op unless compiling for the ACL backend
//   extra_flags - extra compiler flags (unused here, part of the hook ABI)
void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags)
{
    if (!is_acl)
        return;
    // First apply the generic CUDA->ACL token mapping.
    string new_src = process_acl(src, "", {});
    new_src = replace(new_src, R"(#include "misc/cuda_atomic.h")", "");
    new_src = replace(new_src, R"(#include "misc/cuda_limits.h")", "");
    new_src = replace(new_src, "__global__", "__ai_device_entry__");
    new_src = token_replace_all(new_src, "__launch_bounds__($1)", "");
    // Collapse the launch configuration to a single thread.
    new_src = token_replace_all(new_src, "int thread_num = $1;", "int thread_num = 1;");
    new_src = token_replace_all(new_src, "tn0=std::max(tn0, $1);", "");
    new_src = token_replace_all(new_src, "<<<$1>>>", "<<<1,0>>>");
    new_src = token_replace_all(new_src, "int thread_id = $1;", "int thread_id = 1;");
    // for inc error
    new_src = token_replace_all(new_src, "for ($1+=$2)", "for ($1++)");
    // bit op error
    new_src = token_replace_all(new_src, "int tnum$1;", "");
    new_src = token_replace_all(new_src, "int p1$1;", "");
    new_src = token_replace_all(new_src, "int p2$1;", "");
    new_src = token_replace_all(new_src, "int tn$1=$2;", "int tn$1=0;");
    new_src = token_replace_all(new_src, "int tid$1=$2;", "int tid$1=0;");
    // NOTE(review): this assignment is dead — src is unconditionally
    // overwritten by the final `src = new_src;` below. Kept as-is.
    src = new_src;
    // Single-threaded execution makes atomics unnecessary.
    new_src = token_replace_all(new_src, "atomicAdd(&$1,$2);", "$1=$1+$2;");
    // new_src = token_replace_all(new_src, "bool", "int8");
    // tikcc lacks numeric_limits helpers; substitute large finite constants.
    new_src = token_replace_all(new_src, "::numeric_min<float32>()", "-1e30");
    new_src = token_replace_all(new_src, "::numeric_max<float32>()", "1e30");
    // TODO: support max
    // Map of scalar functions to their tikcc intrinsics (out-parameter style).
    unordered_map<string, string> opmap = {
        // {"::max","tikcc::scalar_max"},
        {"::sqrtf", "tikcc::scalar_sqrt"}};
    // Process statement by statement so each pattern rewrite stays local.
    auto ss = split(new_src, ";");
    for (auto &s : ss)
    {
        if (s.find("?") != string::npos)
        {
            // Lower the ternary operator, which tikcc cannot parse.
            s = token_replace_all(s + ";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
        }
        if (s.find("::max") != string::npos)
        {
            // Lower ::max into an explicit compare-and-assign; the two cases
            // differ only in whether the target variable is being declared.
            if (s.find("auto") == string::npos)
            {
                s = token_replace_all(s + ";", " $1=$4::max($2,$3);", " $1=$2;if ($2 < $3) $1=$3;");
            }
            else
            {
                s = token_replace_all(s + ";", "auto $1=$4::max($2,$3);", "auto $1=$2;if ($2 < $3) $1=$3;");
            }
        }
        for (auto &kv : opmap)
        {
            if (s.find(kv.first) != string::npos)
            {
                if (s.find("auto") == string::npos)
                {
                    // $1 = op($2) --> op($1, $2)
                    s = token_replace_all(s + ";", " $1= " + kv.first + "($2);", kv.second + "($1, $2);");
                }
                else
                {
                    // auto $1 = op($2) --> float32 $1; op($1, $2);
                    s = token_replace_all(s + ";", "auto $1= " + kv.first + "($2);", "float32 $1; " + kv.second + "($1, $2);");
                }
            }
        }
        // s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
        // s = token_replace_all(s+";", "auto $1=$2?$3:$4;", "auto $1=$3;if (!($2)) $1=$4;");
        // if (s.find("::max") != string::npos) {
        //     s = token_replace_all(s+";", " $1= ::max($2);", "tikcc::scalar_max($1, $2);");
        // }
    }
    new_src = join(ss, ";");
    src = new_src;
}
}

700
python/jittor/extern/acl/acl_jittor.h vendored Normal file
View File

@ -0,0 +1,700 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#pragma once
#include "common.h"
#include "aclnn/aclnn.h"
#include <acl/acl.h>
// Convert an ACL runtime status code into a human-readable message.
std::string acl_error_to_string(aclError error);
namespace jittor
{
EXTERN_LIB uint64_t acl_jittor_tid; // report-thread id (thread currently disabled)
EXTERN_LIB aclrtStream aclstream;   // global stream on which all ACL work is enqueued
EXTERN_LIB void *workspaceAddr;     // shared scratch buffer for aclnn workspace requests
// Grow the shared workspace buffer to hold at least `size` bytes.
void mallocWorkSpace(uint64_t size);
// JPU hook: rewrite a generated CUDA kernel into tikcc-compatible source.
void acl_jittor_op_compiler(string &filename, string &src, bool is_acl, string &extra_flags);
// Type-erased holder for one aclnn operator: a "get workspace size" function
// (whose signature varies per operator family, hence the many slots and
// overloaded constructors) plus the matching execute function. Exactly one
// getWorkspaceSizeFunc* slot is populated per instance, chosen by which
// constructor overload the registration call resolves to.
// NOTE(review): the "Falsh" (sic) spelling in the flash-attention members is
// preserved because these member names are referenced elsewhere.
struct AclOpFunctions
{
    // for Unary and Nonzero
    std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncUnaryNonzero;
    // for Cast
    std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCast;
    // for Binary
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBinary;
    // for Add and Sub
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAdd;
    // for Expand, permute, flip
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncExpand;
    // for bmm and matmul
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMatmul;
    // for conv
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConv;
    // for reducesum, mean
    std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncReduceSum;
    // for amax and amin
    std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAmax;
    // for conv backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConvBackward;
    // for proddim
    std::function<aclnnStatus(aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncProdDim;
    // for select, where
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSelect;
    // for random_uniform and random_normal
    std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRandom;
    // for maxpool
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPool;
    // for maxpool backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncMaxPoolBackward;
    // for avgpool
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPool;
    // for avgpool backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncAvgPoolBackward;
    // for concat
    std::function<aclnnStatus(aclTensorList *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncConcat;
    // for gather
    std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncGather;
    // for cumsum
    std::function<aclnnStatus(aclTensor *, uint64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncCumsum;
    // for scatter
    std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncScatter;
    // for index
    std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndex;
    // for stridesliceassignv2
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncStridedSliceAssignV2;
    // for slicev2
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSliceV2;
    // for indexputimpl
    std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncIndexPutImpl;
    // for range
    std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncRange;
    // for leaky_relu
    std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyRelu;
    // for leaky_relu backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLeakyReluBackward;
    // for dropout
    std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropout;
    // for dropout backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncDropoutBackward;
    // for split with size
    std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSplitWithSize;
    // for silu
    // std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSilu;
    // for silu backward
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSiluBackward;
    // for sigmoid
    // std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoid;
    // for sigmoid backward
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncSigmoidBackward;
    // for embedding
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbedding;
    // for embedding backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncEmbeddingBackward;
    // for InplaceMaskedScatter MaskedSelect
    // std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncInplaceMaskedScatter;
    // Second-phase launcher shared by every operator family:
    // (workspace, workspaceSize, executor, stream).
    std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> executeFunc;
    // for flashattention
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                              aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                              aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncFalshAttention;
    // for flashattention backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                              aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                              aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncFalshAttentionBackward;
    // for batchnorm
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNorm;
    // for batchnorm backward
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncBatchNormBackward;
    // for layernorm
    std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> getWorkspaceSizeFuncLayerNorm;
    // for ROPE
    std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, int64_t, uint64_t *, aclOpExecutor **)>
        getWorkspaceSizeFuncRotaryPosEmb;
    // Default constructor: leaves every function slot empty.
    AclOpFunctions() = default;
    // for Unary and Nonzero
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
        : getWorkspaceSizeFuncUnaryNonzero(gwsf), executeFunc(execf) {}
    // for Cast
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, aclrtStream)> execf)
        : getWorkspaceSizeFuncCast(gwsf), executeFunc(execf) {}
    // for Binary
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBinary(gwsf), executeFunc(execf) {}
    // for Add and Sub
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAdd(gwsf), executeFunc(execf) {}
    // for Expand, flip
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncExpand(gwsf), executeFunc(execf) {}
    // for Matmul
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMatmul(gwsf), executeFunc(execf) {}
    // for conv
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int64_t, aclTensor *, int8_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConv(gwsf), executeFunc(execf) {}
    // for reducesum, mean
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncReduceSum(gwsf), executeFunc(execf) {}
    // for amax amin
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAmax(gwsf), executeFunc(execf) {}
    // for conv backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclIntArray *, int, aclBoolArray *, int8_t, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConvBackward(gwsf), executeFunc(execf) {}
    // for proddim
    AclOpFunctions(std::function<aclnnStatus(const aclTensor *, float, float, float, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncProdDim(gwsf), executeFunc(execf) {}
    // for select, where
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSelect(gwsf), executeFunc(execf) {}
    // for random_normal
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, int64_t, int64_t, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRandom(gwsf), executeFunc(execf) {}
    // for maxpool
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMaxPool(gwsf), executeFunc(execf) {}
    // for maxpool backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncMaxPoolBackward(gwsf), executeFunc(execf) {}
    // for avgpool
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAvgPool(gwsf), executeFunc(execf) {}
    // for avgpool backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, bool, bool, int64_t, int8_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncAvgPoolBackward(gwsf), executeFunc(execf) {}
    // for concat
    AclOpFunctions(std::function<aclnnStatus(aclTensorList *, int64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncConcat(gwsf), executeFunc(execf) {}
    // for gather
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncGather(gwsf), executeFunc(execf) {}
    // for cumsum
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, int64_t, aclDataType, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncCumsum(gwsf), executeFunc(execf) {}
    // for scatter
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, uint64_t, aclTensor *, aclTensor *, uint64_t, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncScatter(gwsf), executeFunc(execf) {}
    // for index
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncIndex(gwsf), executeFunc(execf) {}
    // for stridesliceassignv2
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncStridedSliceAssignV2(gwsf), executeFunc(execf) {}
    // for slicev2
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclIntArray *, aclIntArray *, aclIntArray *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSliceV2(gwsf), executeFunc(execf) {}
    // for indexputimpl
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensorList *, aclTensor *, bool, bool, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncIndexPutImpl(gwsf), executeFunc(execf) {}
    // for range
    AclOpFunctions(std::function<aclnnStatus(aclScalar *, aclScalar *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRange(gwsf), executeFunc(execf) {}
    // for leaky_relu
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclScalar *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLeakyRelu(gwsf), executeFunc(execf) {}
    // for leaky_relu backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclScalar *, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLeakyReluBackward(gwsf), executeFunc(execf) {}
    // for dropout
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, double, bool, int64_t, int64_t, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncDropout(gwsf), executeFunc(execf) {}
    // for dropout backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, double, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncDropoutBackward(gwsf), executeFunc(execf) {}
    // for embedding backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, uint64_t, uint64_t, bool, aclTensor *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncEmbeddingBackward(gwsf), executeFunc(execf) {}
    // for split with size
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, int64_t, aclTensorList *, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncSplitWithSize(gwsf), executeFunc(execf) {}
    // for flash attention
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                                             aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                                             aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncFalshAttention(gwsf), executeFunc(execf) {}
    // for flash attention backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *,
                                             aclIntArray *, aclIntArray *, aclIntArray *, double, double, int64_t, int64_t, int64_t, char *, int64_t, int64_t, int64_t,
                                             aclTensor *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncFalshAttentionBackward(gwsf), executeFunc(execf) {}
    // for batchnorm
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBatchNorm(gwsf), executeFunc(execf) {}
    // for batchnorm backward
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, aclTensor *, bool, double, aclBoolArray *, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncBatchNormBackward(gwsf), executeFunc(execf) {}
    // for layernorm
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclIntArray *, aclTensor *, aclTensor *, double, aclTensor *, aclTensor *, aclTensor *, uint64_t *, aclOpExecutor **)>
                       gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncLayerNorm(gwsf), executeFunc(execf) {}
    // for ROPE
    AclOpFunctions(std::function<aclnnStatus(aclTensor *, aclTensor *, const aclTensor *, const aclTensor *, int64_t, uint64_t *, aclOpExecutor **)> gwsf,
                   std::function<aclnnStatus(void *, uint64_t, aclOpExecutor *, const aclrtStream)> execf)
        : getWorkspaceSizeFuncRotaryPosEmb(gwsf), executeFunc(execf) {}
};
// Registry mapping the backend's operator names to the matching aclnn
// GetWorkspaceSize/execute function pair. Looked up by the op runners
// (see BaseOpRunner::run).
//
// Fix: the original initializer listed "Floor" and "Sigmoid" twice; for an
// unordered_map initializer-list, later duplicate keys are silently
// discarded, so the extra entries were dead code and are removed here.
// "Select"/"Where" intentionally alias the same aclnnSWhere kernel, and the
// two IndexPutImpl keys intentionally share one implementation.
static std::unordered_map<std::string, AclOpFunctions> aclOpFuncMap = {
{"Abs", AclOpFunctions(aclnnAbsGetWorkspaceSize, aclnnAbs)},
{"Exp", AclOpFunctions(aclnnExpGetWorkspaceSize, aclnnExp)},
{"Log", AclOpFunctions(aclnnLogGetWorkspaceSize, aclnnLog)},
{"Sqrt", AclOpFunctions(aclnnSqrtGetWorkspaceSize, aclnnSqrt)},
{"Ceil", AclOpFunctions(aclnnCeilGetWorkspaceSize, aclnnCeil)},
{"Floor", AclOpFunctions(aclnnFloorGetWorkspaceSize, aclnnFloor)},
{"Round", AclOpFunctions(aclnnRoundGetWorkspaceSize, aclnnRound)},
{"Sin", AclOpFunctions(aclnnSinGetWorkspaceSize, aclnnSin)},
{"Cos", AclOpFunctions(aclnnCosGetWorkspaceSize, aclnnCos)},
{"Tan", AclOpFunctions(aclnnTanGetWorkspaceSize, aclnnTan)},
{"Asin", AclOpFunctions(aclnnAsinGetWorkspaceSize, aclnnAsin)},
{"Acos", AclOpFunctions(aclnnAcosGetWorkspaceSize, aclnnAcos)},
{"Atan", AclOpFunctions(aclnnAtanGetWorkspaceSize, aclnnAtan)},
{"Sinh", AclOpFunctions(aclnnSinhGetWorkspaceSize, aclnnSinh)},
{"Cosh", AclOpFunctions(aclnnCoshGetWorkspaceSize, aclnnCosh)},
{"Tanh", AclOpFunctions(aclnnTanhGetWorkspaceSize, aclnnTanh)},
{"Asinh", AclOpFunctions(aclnnAsinhGetWorkspaceSize, aclnnAsinh)},
{"Acosh", AclOpFunctions(aclnnAcoshGetWorkspaceSize, aclnnAcosh)},
{"Atanh", AclOpFunctions(aclnnAtanhGetWorkspaceSize, aclnnAtanh)},
{"Sigmoid", AclOpFunctions(aclnnSigmoidGetWorkspaceSize, aclnnSigmoid)},
{"Erf", AclOpFunctions(aclnnErfGetWorkspaceSize, aclnnErf)},
{"Erfinv", AclOpFunctions(aclnnErfinvGetWorkspaceSize, aclnnErfinv)},
{"LogicalNot", AclOpFunctions(aclnnLogicalNotGetWorkspaceSize, aclnnLogicalNot)},
{"BitwiseNot", AclOpFunctions(aclnnBitwiseNotGetWorkspaceSize, aclnnBitwiseNot)},
{"Neg", AclOpFunctions(aclnnNegGetWorkspaceSize, aclnnNeg)},
{"Cast", AclOpFunctions(aclnnCastGetWorkspaceSize, aclnnCast)},
{"Maximum", AclOpFunctions(aclnnMaximumGetWorkspaceSize, aclnnMaximum)},
{"Minimum", AclOpFunctions(aclnnMinimumGetWorkspaceSize, aclnnMinimum)},
{"Add", AclOpFunctions(aclnnAddGetWorkspaceSize, aclnnAdd)},
{"Sub", AclOpFunctions(aclnnSubGetWorkspaceSize, aclnnSub)},
{"Mul", AclOpFunctions(aclnnMulGetWorkspaceSize, aclnnMul)},
{"RealDiv", AclOpFunctions(aclnnDivGetWorkspaceSize, aclnnDiv)},
{"FloorDiv", AclOpFunctions(aclnnFloorDivideGetWorkspaceSize, aclnnFloorDivide)},
{"LessEqual", AclOpFunctions(aclnnLeTensorGetWorkspaceSize, aclnnLeTensor)},
{"Less", AclOpFunctions(aclnnLtTensorGetWorkspaceSize, aclnnLtTensor)},
{"GreaterEqual", AclOpFunctions(aclnnGeTensorGetWorkspaceSize, aclnnGeTensor)},
{"Greater", AclOpFunctions(aclnnGtTensorGetWorkspaceSize, aclnnGtTensor)},
{"Equal", AclOpFunctions(aclnnEqTensorGetWorkspaceSize, aclnnEqTensor)},
{"NotEqual", AclOpFunctions(aclnnNeTensorGetWorkspaceSize, aclnnNeTensor)},
{"LogicalAnd", AclOpFunctions(aclnnLogicalAndGetWorkspaceSize, aclnnLogicalAnd)},
{"LogicalOr", AclOpFunctions(aclnnLogicalOrGetWorkspaceSize, aclnnLogicalOr)},
{"LogicalXor", AclOpFunctions(aclnnLogicalXorGetWorkspaceSize, aclnnLogicalXor)},
{"BitwiseAnd", AclOpFunctions(aclnnBitwiseAndTensorGetWorkspaceSize, aclnnBitwiseAndTensor)},
{"BitwiseOr", AclOpFunctions(aclnnBitwiseOrTensorGetWorkspaceSize, aclnnBitwiseOrTensor)},
{"BitwiseXor", AclOpFunctions(aclnnBitwiseXorTensorGetWorkspaceSize, aclnnBitwiseXorTensor)},
{"Pow", AclOpFunctions(aclnnPowTensorTensorGetWorkspaceSize, aclnnPowTensorTensor)},
{"Expand", AclOpFunctions(aclnnExpandGetWorkspaceSize, aclnnExpand)},
{"MatMul", AclOpFunctions(aclnnMatmulGetWorkspaceSize, aclnnMatmul)},
{"BatchMatMul", AclOpFunctions(aclnnBatchMatMulGetWorkspaceSize, aclnnBatchMatMul)},
{"ReduceMax", AclOpFunctions(aclnnAmaxGetWorkspaceSize, aclnnAmax)},
{"ReduceMin", AclOpFunctions(aclnnAminGetWorkspaceSize, aclnnAmin)},
{"ReduceSum", AclOpFunctions(aclnnReduceSumGetWorkspaceSize, aclnnReduceSum)},
{"Triu", AclOpFunctions(aclnnTriuGetWorkspaceSize, aclnnTriu)},
{"Conv2d", AclOpFunctions(aclnnConvolutionGetWorkspaceSize, aclnnConvolution)},
{"Conv2dBackward", AclOpFunctions(aclnnConvolutionBackwardGetWorkspaceSize, aclnnConvolutionBackward)},
{"ReduceMean", AclOpFunctions(aclnnMeanGetWorkspaceSize, aclnnMean)},
// {"ReduceProd", AclOpFunctions(aclnnProdDimGetWorkspaceSize, aclnnProdDim)},
{"Select", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
{"RandomUniform", AclOpFunctions(aclnnInplaceUniformGetWorkspaceSize, aclnnInplaceUniform)},
{"RandomNormal", AclOpFunctions(aclnnInplaceNormalGetWorkspaceSize, aclnnInplaceNormal)},
{"Transpose", AclOpFunctions(aclnnPermuteGetWorkspaceSize, aclnnPermute)},
{"Maxpool", AclOpFunctions(aclnnMaxPool2dWithIndicesGetWorkspaceSize, aclnnMaxPool2dWithIndices)},
{"MaxpoolBackward", AclOpFunctions(aclnnMaxPool2dWithIndicesBackwardGetWorkspaceSize, aclnnMaxPool2dWithIndicesBackward)},
{"Avgpool", AclOpFunctions(aclnnAvgPool2dGetWorkspaceSize, aclnnAvgPool2d)},
{"AvgpoolBackward", AclOpFunctions(aclnnAvgPool2dBackwardGetWorkspaceSize, aclnnAvgPool2dBackward)},
{"Flip", AclOpFunctions(aclnnFlipGetWorkspaceSize, aclnnFlip)},
{"Concat", AclOpFunctions(aclnnCatGetWorkspaceSize, aclnnCat)},
{"Gather", AclOpFunctions(aclnnGatherGetWorkspaceSize, aclnnGather)},
{"Cumsum", AclOpFunctions(aclnnCumsumGetWorkspaceSize, aclnnCumsum)},
{"Index", AclOpFunctions(aclnnIndexGetWorkspaceSize, aclnnIndex)},
{"Scatter", AclOpFunctions(aclnnScatterGetWorkspaceSize, aclnnScatter)},
{"Nonzero", AclOpFunctions(aclnnNonzeroGetWorkspaceSize, aclnnNonzero)},
{"Where", AclOpFunctions(aclnnSWhereGetWorkspaceSize, aclnnSWhere)},
{"StridedSliceAssignV2", AclOpFunctions(aclnnStridedSliceAssignV2GetWorkspaceSize, aclnnStridedSliceAssignV2)},
{"SliceV2", AclOpFunctions(aclnnSliceV2GetWorkspaceSize, aclnnSliceV2)},
{"IndexPutImpl", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
{"IndexPutImplAccumulate", AclOpFunctions(aclnnIndexPutImplGetWorkspaceSize, aclnnIndexPutImpl)},
{"Range", AclOpFunctions(aclnnRangeGetWorkspaceSize, aclnnRange)},
{"ReLU", AclOpFunctions(aclnnReluGetWorkspaceSize, aclnnRelu)},
{"LeakyReLU", AclOpFunctions(aclnnLeakyReluGetWorkspaceSize, aclnnLeakyRelu)},
{"LeakyReLUBackward", AclOpFunctions(aclnnLeakyReluBackwardGetWorkspaceSize, aclnnLeakyReluBackward)},
{"Dropout", AclOpFunctions(aclnnDropoutGetWorkspaceSize, aclnnDropout)},
{"DropoutBackward", AclOpFunctions(aclnnDropoutBackwardGetWorkspaceSize, aclnnDropoutBackward)},
{"SiLU", AclOpFunctions(aclnnSiluGetWorkspaceSize, aclnnSilu)},
{"SiLUBackward", AclOpFunctions(aclnnSiluBackwardGetWorkspaceSize, aclnnSiluBackward)},
{"SigmoidBackward", AclOpFunctions(aclnnSigmoidBackwardGetWorkspaceSize, aclnnSigmoidBackward)},
{"Embedding", AclOpFunctions(aclnnEmbeddingGetWorkspaceSize, aclnnEmbedding)},
{"EmbeddingBackward", AclOpFunctions(aclnnEmbeddingDenseBackwardGetWorkspaceSize, aclnnEmbeddingDenseBackward)},
{"InplaceMaskedScatter", AclOpFunctions(aclnnInplaceMaskedScatterGetWorkspaceSize, aclnnInplaceMaskedScatter)},
{"MaskedSelect", AclOpFunctions(aclnnMaskedSelectGetWorkspaceSize, aclnnMaskedSelect)},
{"SplitWithSize", AclOpFunctions(aclnnSplitWithSizeGetWorkspaceSize, aclnnSplitWithSize)},
{"Softmax", AclOpFunctions(aclnnSoftmaxGetWorkspaceSize, aclnnSoftmax)},
{"SoftmaxBackward", AclOpFunctions(aclnnSoftmaxBackwardGetWorkspaceSize, aclnnSoftmaxBackward)},
{"FlashAttention", AclOpFunctions(aclnnFlashAttentionScoreV2GetWorkspaceSize, aclnnFlashAttentionScoreV2)},
{"FlashAttentionBackward", AclOpFunctions(aclnnFlashAttentionScoreGradV2GetWorkspaceSize, aclnnFlashAttentionScoreGradV2)},
{"BatchNorm", AclOpFunctions(aclnnBatchNormGetWorkspaceSize, aclnnBatchNorm)},
{"BatchNormBackward", AclOpFunctions(aclnnBatchNormBackwardGetWorkspaceSize, aclnnBatchNormBackward)},
{"LayerNorm", AclOpFunctions(aclnnLayerNormGetWorkspaceSize, aclnnLayerNorm)},
{"RotaryPosEmb", AclOpFunctions(aclnnApplyRotaryPosEmbGetWorkspaceSize, aclnnApplyRotaryPosEmb)},
{"Stack", AclOpFunctions(aclnnStackGetWorkspaceSize, aclnnStack)},
{"NanToNum", AclOpFunctions(aclnnNanToNumGetWorkspaceSize, aclnnNanToNum)},
};
// Polymorphic base for per-operator attribute bags. A virtual destructor is
// required so op runners can delete derived attrs through an AclOpAttr*.
struct AclOpAttr
{
    virtual ~AclOpAttr() = default;
};
// Attributes for convolution / convolution-backward ops.
struct ConvAttr : AclOpAttr
{
    vector<int64_t> convStrides;
    vector<int64_t> convPads;
    vector<int64_t> convOutPads;   // presumably output padding for the transposed case — confirm against conv runner
    vector<int64_t> convDilations;
    bool convWithBias;    // true when a bias tensor is supplied
    bool is_transposed;   // transposed-convolution flavor
    int64_t group;        // group count
    // Rule of zero: the vector members release their storage automatically;
    // the hand-written destructor that only called clear() was redundant
    // and has been removed.
};
// Attributes for reduction ops (sum/mean/amax/amin/prod).
struct ReduceAttr : AclOpAttr
{
    vector<int64_t> axes;  // axes to reduce over
    int64_t prod_dim;      // single axis for the ProdDim variant
    bool keepdims;         // keep reduced axes as size-1 dims
    // Rule of zero: the vector frees itself; the explicit destructor that
    // only called axes.clear() was redundant and has been removed.
};
// Attributes for random-number ops: RNG seed and stream offset.
struct RandomAttr : AclOpAttr
{
    int64_t seed, offset;
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for the Triu op: diagonal offset.
struct TriuAttr : AclOpAttr
{
    int64_t diagonal;
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for pooling ops (MaxPool2dWithIndices / AvgPool2d and backwards).
struct PoolAttr : AclOpAttr
{
    vector<int64_t> kernel_size;
    vector<int64_t> poolStrides;
    vector<int64_t> poolPads;
    vector<int64_t> poolDilations;
    bool poolCeil;          // presumably ceil_mode for output-size rounding — confirm against pool runner
    bool countIncludePad;   // presumably include padding in the averaging count — confirm
    // divisorOverride (int64 input): divisor used when averaging. The default
    // value 0 means the feature is disabled.
    // https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
    int64_t divisorOverride = 0;
    // cubeMathType (int8 host-side input): selects the Cube unit's compute
    // mode. Unless stated otherwise the original input dtype is kept:
    //   0 KEEP_DTYPE: keep the input dtype (FLOAT raises an error on Atlas
    //     training series and Atlas inference series (Ascend 310P) devices);
    //   1 ALLOW_FP32_DOWN_PRECISION: allow down-precision compute (FLOAT may
    //     be converted to FLOAT16 on the devices above);
    //   2 USE_FP16: allow converting to FLOAT16 (FLOAT inputs are converted);
    //   3 USE_HF32: allow converting to HFLOAT32 (FLOAT raises an error on
    //     the Atlas devices listed in the reference, including A2/800I A2).
    // https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/apiref/appdevgapi/context/aclnnAvgPool2d.md
    int8_t cubeMathType = 0;
    // Rule of zero: the vectors free themselves; the hand-written destructor
    // that only called clear() was redundant and has been removed.
};
// Attributes for Concat: number of input tensors and the concat dimension.
struct ConcatAttr : AclOpAttr
{
    int64_t tensorNum;
    int64_t dim;
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for Gather: the gather dimension.
struct GatherAttr : AclOpAttr
{
    int64_t dim;
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for Scatter: target axis and reduction mode code.
struct ScatterAttr : AclOpAttr
{
    int64_t axis;
    int64_t reduction;  // presumably an aclnnScatter reduce-mode enum value — confirm against scatter runner
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for strided slice / slice-assign: per-axis begin/end/step and
// the axes they apply to.
struct StrideAttr : AclOpAttr
{
    vector<int64_t> begins;
    vector<int64_t> ends;
    vector<int64_t> steps;
    vector<int64_t> axes;
    // Rule of zero: the vectors free themselves; the hand-written destructor
    // that only called clear() on each was redundant and has been removed.
};
// Attributes for Range: start/end/step of the generated sequence.
struct RangeAttr : AclOpAttr
{
    int64_t start;
    int64_t end;
    int64_t step;
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for LeakyReLU forward/backward.
struct LeakyReluAttr : AclOpAttr
{
    float negativeSlope;
    bool selfIsResult;  // backward-only flag; NOTE(review): semantics follow aclnnLeakyReluBackward — confirm
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for Dropout forward/backward.
struct DropoutAttr : AclOpAttr
{
    float p;        // drop probability
    bool train;     // training mode flag
    int64_t seed;   // RNG seed
    int64_t offset; // RNG stream offset
    float scale;    // backward rescale factor
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for Embedding / EmbeddingDenseBackward.
struct EmbeddingAttr : AclOpAttr
{
    int64_t numEmbeddings;
    int64_t paddingIdx;
    bool scaleGradByFreq;
    // Removed: commented-out fields (embeddingDim, sparse, isSparse, isDense)
    // and the empty destructor; the implicit destructor suffices.
};
// Attributes for SplitWithSize: per-chunk sizes and the split dimension.
struct SplitWithSizeAttr : AclOpAttr
{
    vector<int64_t> splitSize;
    int64_t dim;
    // Rule of zero: the vector frees itself; the explicit destructor that
    // only called splitSize.clear() was redundant and has been removed.
};
// Attributes for Softmax / SoftmaxBackward: the softmax dimension.
struct SoftmaxAttr : AclOpAttr
{
    int64_t dim;
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for BatchNorm / BatchNormBackward.
struct BatchNormAttr : AclOpAttr
{
    bool is_train;   // training vs. inference statistics
    float momentum;  // running-stats update momentum
    float eps;       // numerical-stability epsilon
    // Empty user-defined destructor removed; the implicit one suffices.
};
// Attributes for LayerNorm.
struct LayerNormAttr : AclOpAttr
{
    float eps;                       // numerical-stability epsilon
    vector<int64_t> normalizedShape; // trailing dims to normalize over
    int64_t size;                    // NOTE(review): presumably numel of normalizedShape — confirm against layernorm runner
    // Rule of zero: the vector frees itself; the explicit destructor that
    // only called normalizedShape.clear() was redundant and has been removed.
};
// Attributes for FlashAttentionScoreV2 forward/backward. Field meanings
// follow the aclnnFlashAttentionScoreV2 parameter list.
struct FlashAttentionAttr : AclOpAttr
{
    vector<int64_t> prefix;
    vector<int64_t> qStartIdx;
    vector<int64_t> kvStartIdx;
    float scale;            // attention score scale factor
    float keepProb;         // 1 - dropout probability
    int64_t preToken;
    int64_t nextToken;
    int64_t headNum;
    string inputLayout;     // layout tag passed to the kernel (e.g. the char* layout argument)
    int64_t innerPrecise;
    int64_t sparseMode;
    int64_t psetype;
    bool hasRealshift;      // which optional tensors are present
    bool hasDropmask;
    bool hasPaddingmask;
    bool hasAttentmask;
    // Rule of zero: the vectors free themselves; the hand-written destructor
    // that only called clear() on each was redundant and has been removed.
};
// Attributes for NanToNum: replacement values for NaN, +inf and -inf.
struct NanToNumAttr : AclOpAttr
{
    float nan;
    float posinf;
    float neginf;
    // Empty user-defined destructor removed; the implicit one suffices.
};
}

502
python/jittor/extern/acl/acl_op_exec.cc vendored Normal file
View File

@ -0,0 +1,502 @@
// ***************************************************************
// Copyright (c) 2023 Jittor. All Rights Reserved.
// Maintainers: Dun Liang <randonlang@gmail.com>.
// This file is subject to the terms and conditions defined in
// file 'LICENSE.txt', which is part of this source code package.
// ***************************************************************
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "aclops/aclops.h"
namespace jittor
{
void free_var_mem(Var *v);
// Maps a jittor elementwise op kind (ns_* NanoString hash) to the ACL
// operator name used as the lookup key into aclOpFuncMap.
// NOTE(review): "Mod", "LeftShift" and "RightShift" appear here but have no
// entry in aclOpFuncMap above — a runner lookup for them would fail; verify
// whether those kernels are registered elsewhere.
unordered_map<uint32, string> opname_map = {
// unary op
{ns_cast, "Cast"},
{ns_negative, "Neg"},
{ns_abs, "Abs"},
{ns_exp, "Exp"},
{ns_log, "Log"},
{ns_sqrt, "Sqrt"},
{ns_ceil, "Ceil"},
{ns_floor, "Floor"},
{ns_round, "Round"},
// m(round_int)
// m(floor_int)
// m(ceil_int)
{ns_sin, "Sin"},
{ns_cos, "Cos"},
{ns_tan, "Tan"},
{ns_asin, "Asin"},
{ns_acos, "Acos"},
{ns_atan, "Atan"},
{ns_sinh, "Sinh"},
{ns_cosh, "Cosh"},
{ns_tanh, "Tanh"},
{ns_asinh, "Asinh"},
{ns_acosh, "Acosh"},
{ns_atanh, "Atanh"},
{ns_sigmoid, "Sigmoid"},
{ns_erf, "Erf"},
{ns_erfinv, "Erfinv"},
{ns_logical_not, "LogicalNot"},
{ns_bitwise_not, "BitwiseNot"},
// binary op
{ns_pow, "Pow"},
{ns_maximum, "Maximum"},
{ns_minimum, "Minimum"},
{ns_add, "Add"},
{ns_subtract, "Sub"},
{ns_multiply, "Mul"},
{ns_divide, "RealDiv"},
{ns_floor_divide, "FloorDiv"},
{ns_mod, "Mod"},
{ns_less, "Less"},
{ns_less_equal, "LessEqual"},
{ns_greater, "Greater"},
{ns_greater_equal, "GreaterEqual"},
{ns_equal, "Equal"},
{ns_not_equal, "NotEqual"},
{ns_left_shift, "LeftShift"},
{ns_right_shift, "RightShift"},
{ns_logical_and, "LogicalAnd"},
{ns_logical_or, "LogicalOr"},
{ns_logical_xor, "LogicalXor"},
{ns_bitwise_and, "BitwiseAnd"},
{ns_bitwise_or, "BitwiseOr"},
{ns_bitwise_xor, "BitwiseXor"},
};
// Re-run an op on the CPU after an ACL failure: migrate its operands off the
// device, flip the op (and any fused children) to CPU, execute, then restore
// the global use_cuda flag.
void fallback_cpu(Op *op)
{
    LOGy << "!!! fallback_cpu " << op;
    use_cuda = 0;
    auto to_host = [](Var *v)
    {
        if (v->mem_ptr && v->allocator->is_cuda())
            migrate_to_cpu(v, exe.allocator);
    };
    for (auto v : op->inputs())
        to_host(v);
    for (auto v : op->outputs())
        to_host(v);
    op->flags.set(NodeFlags::_cpu);
    op->flags.set(NodeFlags::_cuda, 0);
    if (op->name() == string("fused"))
    {
        // A fused op only runs on CPU if every child is retargeted too.
        for (auto child : ((FusedOp *)op)->ops)
        {
            child->flags.set(NodeFlags::_cpu);
            child->flags.set(NodeFlags::_cuda, 0);
        }
    }
    op->do_run();
    use_cuda = 1;
}
/*
check compile
if compiled: exec
else: compile
check is fused
check is relay
else
compile func = try exec
if failed: fallback_cpu
else
try compile
if failed: fallback_cpu
*/
extern jit_op_entry_t (*do_compile_hook)(Op *);
jit_op_entry_t do_compile_inner(Op *op);
// Execute every child op of a fused op through its ACL runner, in topological
// order. Outputs are allocated lazily and intermediates are freed as soon as
// their last consumer ran. On any exception, everything allocated here is
// freed and the whole fused op falls back to the CPU path.
//
// Fix: the original asserted `in->mem_ptr` on the *outer* `op` (the fused op)
// before popping the queue — the local `op` was declared afterwards, so the
// check ran on the wrong node. The assertion now checks the dequeued op's
// inputs, which must be materialized by topological order. Shadowed `op`
// variables were renamed and the unused `len`/`total` counters removed.
void try_exec_and_fallback_cpu(Op *op)
{
    aclrtSynchronizeStream(aclstream);
    auto fop = (FusedOp *)op;
    std::set<Var *> new_alloced;  // outputs allocated here; freed when dead or on error
    map<Op *, int> op_indeg;      // unresolved producer count per child op
    map<Var *, int> var_outdeg;   // remaining consumers per intermediate var
    std::queue<Op *> queue;
    for (Op *child : fop->ops)
        op_indeg[child] = 0;
    map<Op *, vector<Op *>> out_map;   // producer -> consumer ops
    map<Var *, vector<Op *>> from;     // var -> consumer ops inside this fused op
    for (Op *consumer : fop->ops)
        for (auto in : consumer->inputs())
            from[in].push_back(consumer);
    for (Op *producer : fop->ops)
    {
        for (auto out : producer->outputs())
        {
            if (from.find(out) != from.end())
            {
                for (auto consumer : from[out])
                {
                    ++op_indeg[consumer];
                    ++var_outdeg[out];
                    out_map[producer].push_back(consumer);
                }
            }
        }
    }
    for (Op *child : fop->ops)
        if (op_indeg[child] == 0)
            queue.push(child);
    int fallback = 0;
    try
    {
        while (!queue.empty())
        {
            auto cur = queue.front();
            queue.pop();
            // Topological order guarantees all producers ran, so every
            // input of the dequeued op must already be materialized.
            for (auto in : cur->inputs())
                ASSERT(in->mem_ptr);
            for (auto out : cur->outputs())
            {
                if (out->mem_ptr)
                    continue;
                out->alloc(exe.allocator);
                new_alloced.insert(out);
            }
            for (auto succ : out_map[cur])
            {
                --op_indeg[succ];
                if (op_indeg[succ] == 0)
                    queue.push(succ);
            }
            if (cur->name() == string("unary"))
            {
                auto uop = (UnaryOp *)cur;
                UnaryOpRunner runner;
                runner.add(uop->x, true);
                runner.add(uop->y, false);
                auto iter = opname_map.find(uop->ns);
                ASSERT(iter != opname_map.end()) << "op " << uop->ns << " not found";
                runner.name = iter->second;
                runner.jt_name = uop->name();
                runner.run();
            }
            else if (cur->name() == string("binary"))
            {
                auto bop = (BinaryOp *)cur;
                BinaryOpRunner runner;
                runner.add(bop->x, true);
                runner.add(bop->y, true);
                runner.add(bop->z, false);
                auto iter = opname_map.find(bop->ns);
                ASSERT(iter != opname_map.end()) << "op " << bop->ns << " not found";
                runner.name = iter->second;
                runner.jt_name = bop->name();
                if (bop->x->dtype() == ns_bool and bop->y->dtype() == ns_bool)
                {
                    // BitwiseOr, BitwiseAnd, BitwiseXor -> LogicalOr, LogicalAnd, LogicalXor
                    if (bop->ns == ns_bitwise_or)
                        runner.name = "LogicalOr";
                    else if (bop->ns == ns_bitwise_and)
                        runner.name = "LogicalAnd";
                    else if (bop->ns == ns_bitwise_xor)
                        runner.name = "LogicalXor";
                }
                runner.run();
            }
            else if (cur->name() == string("ternary"))
            {
                auto top = (TernaryOp *)cur;
                TernaryOpRunner runner;
                runner.add(top->cond, true);
                runner.add(top->x, true);
                runner.add(top->y, true);
                runner.add(top->z, false);
                runner.run();
            }
            else if (cur->name() == string("array"))
            {
                // Array ops just upload host data into the output buffer.
                auto aop = (ArrayOp *)cur;
                aclrtMemcpy(aop->output->mem_ptr, aop->output->size, aop->ptr<void>(), aop->output->size, ACL_MEMCPY_HOST_TO_DEVICE);
            }
            else if (cur->name() == string("reduce"))
            {
                auto rop = (ReduceOp *)cur;
                ReduceOpRunner runner;
                if (rop->ns == ns_add)
                    runner.op_idx = 9;
                else if (rop->ns == ns_multiply)
                    // TODO unsupported the multi dim
                    runner.op_idx = 999;
                else if (rop->ns == ns_maximum)
                    runner.op_idx = 11;
                else if (rop->ns == ns_minimum)
                    runner.op_idx = 12;
                else if (rop->ns == ns_mean)
                    runner.op_idx = 10;
                else
                    LOGf << "op " << rop->ns << " not supported";
                runner.add(rop->x, true);
                ReduceAttr *attr = new ReduceAttr();
                for (int i = 0; i < rop->x->shape.size(); i++)
                    if (rop->reduce_mask & (1 << i))
                        attr->axes.push_back(i);
                // Same rank in and out means the reduced dims were kept.
                attr->keepdims = rop->x->shape.size() == rop->y->shape.size();
                runner.op_attr.reset(attr);
                runner.add(rop->y, false);
                runner.run();
                aclrtSynchronizeStream(aclstream);
            }
            else if (cur->name() == string("broadcast_to"))
            {
                auto bop = (BroadcastToOp *)cur;
                ExpandOpRunner runner;
                runner.jt_name = "expand";
                // Temporarily give x a rank-matched shape (broadcast dims set
                // to 1) so the ACL expand sees compatible ranks; the original
                // shape is restored right after the run.
                NanoVector xshape, xshape_bk = bop->x->shape;
                NanoVector zshape = bop->z->shape;
                for (int i = 0; i < zshape.size(); i++)
                {
                    if (bop->bcast_mask & (1 << i))
                        xshape.push_back(1);
                    else
                        xshape.push_back(zshape[i]);
                }
                bop->x->shape = xshape;
                runner.add(bop->x, true);
                runner.add(bop->z, false);
                runner.run();
                bop->x->shape = xshape_bk;
                aclrtSynchronizeStream(aclstream);
            }
            else if (cur->name() == string("fuse_transpose"))
            {
                // replace fuse_transpose with transpose
                auto top = (TransposeOp *)cur;
                TransposeOpRunner runner;
                runner.add(top->x, true);
                runner.add(top->y, false);
                runner.jt_name = "transpose";
                ReduceAttr *attr = new ReduceAttr();
                for (int i = 0; i < top->axes.size(); i++)
                    attr->axes.push_back(top->axes[i]);
                runner.op_attr.reset(attr);
                runner.run();
            }
            else
            {
                LOGf << "op " << cur->name() << " not supported";
            }
            // Free intermediates whose last consumer just ran.
            for (auto in : cur->inputs())
            {
                --var_outdeg[in];
                if (var_outdeg[in] == 0 && new_alloced.count(in))
                {
                    free_var_mem(in);
                    new_alloced.erase(in);
                }
            }
        }
    }
    catch (std::exception &e)
    {
        fallback = 1;
        LOGir << "fallback cpu" << e.what();
    }
    // Release anything still held (fused-op outputs, or leftovers on error).
    for (auto v : new_alloced)
        free_var_mem(v);
    if (fallback)
        fallback_cpu(op);
}
extern int current_seed;
extern int64 current_offset;
// Handlers for cu* ops that have a direct ACL implementation.
// Fix: the original captured `current_seed`/`current_offset` by reference,
// but capturing a variable with static storage duration in a lambda
// capture list is ill-formed; globals are referenced directly instead.
static unordered_map<string, std::function<void(Op *)>> acl_ops = {
    {"curand_random", [](Op *op)
     {
         auto _op = (RandomOp *)op;
         RandomOpRunner runner(_op->type == ns_uniform ? "RandomUniform" : "RandomNormal");
         auto out = op->output(0);
         RandomAttr *attr = new RandomAttr();
         attr->seed = current_seed;
         attr->offset = current_offset;
         runner.jt_name = "random";
         runner.op_attr.reset(attr);
         runner.add(out, false);
         runner.run();
         // Advance the RNG stream so the next random op gets fresh numbers.
         current_offset += out->numel();
     }},
};
// Dispatch an op to its registered ACL handler; fatal-log if none exists.
static void exec_mapped_acl_ops(Op *op)
{
    auto handler = acl_ops.find(op->name());
    if (handler == acl_ops.end())
    {
        LOGf << "op " << op->name() << " not supported";
        return;
    }
    LOGv << "exec acl op " << op->name() << op;
    handler->second(op);
}
// Compile hook installed by init_acl_ops (replaces do_compile_hook).
// Decides, per op, whether to JIT-compile the generated source or to return
// one of the interpreted ACL entry points instead.
static jit_op_entry_t acl_do_compile(Op *op)
{
    LOGv << "compile" << op;
    OpCompiler oc(op);
    string *src = &oc.src;
    for (auto op_type : op_types)
        op_type->post_pass(&oc);
    string src_after_passes;
    // if is fused op
    if (oc.op)
    {
        TunerManager tm(&oc);
        src_after_passes = tm.tune();
        src = &src_after_passes;
    }
    op->compile_optimize(*src);
    // CPU ops always go through the normal JIT compiler.
    if (!op->flags.get(NodeFlags::_cuda))
    {
        LOGv << "compile cpu";
        return oc.compile(op->get_jit_key(get_jk()), *src);
    }
    if (op->name() == string("fused"))
    {
        FusedOp *fop = (FusedOp *)op;
        // if is a relayed op
        if (fop->context->vrm.relay_groups.size())
        {
            LOGv << "relay fused op";
            return oc.compile(op->get_jit_key(get_jk()), *src);
        }
        else
        {
            // Plain fused ops are interpreted op-by-op via ACL runners.
            return &try_exec_and_fallback_cpu;
        }
    }
    else if (op->name() == string("code"))
    {
        CodeOp *cop = (CodeOp *)op;
        // Only code ops whose cuda_src mentions "acl" carry compilable
        // ACL source; everything else is dispatched through the map.
        if (cop->cuda_src.find("acl") != string::npos)
        {
            LOGv << "compile acl op";
            return oc.compile(op->get_jit_key(get_jk()), *src);
        }
        else
        {
            return &exec_mapped_acl_ops;
        }
    }
    else if (strncmp(op->name(), "hccl", 4) == 0)
    {
        LOGv << "Compiling HCCL op: " << op->name();
        return oc.compile(op->get_jit_key(get_jk()), *src);
    }
    else
    {
        // Any other CUDA-flagged op falls back to the ACL dispatch table.
        LOGv << "compile finish" << op;
        return &exec_mapped_acl_ops;
    }
    // NOTE(review): unreachable — every branch above returns; kept for safety.
    return do_compile_inner(op);
}
// from op_register.cc
extern unordered_map<string, OpInfo> op_info_map;
void init_acl_ops()
{
do_compile_hook = acl_do_compile;
vector<string> to_erase;
for (auto &kv : op_info_map)
{
if (startswith(kv.first, "cu") && acl_ops.count(kv.first) == 0)
{
to_erase.push_back(kv.first);
}
}
for (auto &k : to_erase)
{
LOGv << "op not supported: " << k << ", erase it.";
op_info_map.erase(k);
}
}
} // jittor

58
python/jittor/extern/acl/aclnn/aclnn.cc vendored Normal file
View File

@ -0,0 +1,58 @@
#include <iostream>
#include <vector>
#include "aclnn.h"
// Number of elements in a tensor of the given shape.
// An empty shape denotes a scalar and yields 1.
int64_t GetShapeSize(const std::vector<int64_t>& shape) {
    int64_t total = 1;
    for (size_t idx = 0; idx < shape.size(); ++idx) {
        total *= shape[idx];
    }
    return total;
}
// Debug helper: copy a device buffer back to the host and print each element.
// Assumes the device buffer holds 32-bit ints (resultData is vector<int> and
// the copy/print use sizeof(int) / %d) — TODO confirm for non-int outputs.
void PrintOutResult(std::vector<int64_t> &shape, void** deviceAddr) {
    auto size = GetShapeSize(shape);
    std::vector<int> resultData(size, 0);
    auto ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]),
                           *deviceAddr, size * sizeof(resultData[0]), ACL_MEMCPY_DEVICE_TO_HOST);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret); return);
    for (int64_t i = 0; i < size; i++) {
        LOG_PRINT("mean result[%ld] is: %d\n", i, resultData[i]);
    }
}
/*int Init(int32_t deviceId) {
// 固定写法AscendCL初始化
auto ret = aclInit(nullptr);
CHECK_RET(ret == ACL_SUCCESS or ret == 100002, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
//ret = aclrtCreateStream(stream);
//CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
return 0;
}*/
/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
aclDataType dataType, aclTensor** tensor) {
auto size = GetShapeSize(shape) * sizeof(T);
// 调用aclrtMalloc申请device侧内存
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// 调用aclrtMemcpy将host侧数据拷贝到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
shape.data(), shape.size(), *deviceAddr);
return 0;
}*/

134
python/jittor/extern/acl/aclnn/aclnn.h vendored Normal file
View File

@ -0,0 +1,134 @@
#include <iostream>
#include <vector>
#include "acl.h"
// unary
#include "aclnnop/aclnn_abs.h"
#include "aclnnop/aclnn_neg.h"
#include "aclnnop/aclnn_exp.h"
#include "aclnnop/aclnn_log.h"
#include "aclnnop/aclnn_sqrt.h"
#include "aclnnop/aclnn_ceil.h"
#include "aclnnop/aclnn_floor.h"
#include "aclnnop/aclnn_round.h"
#include "aclnnop/aclnn_sin.h"
#include "aclnnop/aclnn_cos.h"
#include "aclnnop/aclnn_tan.h"
#include "aclnnop/aclnn_asin.h"
#include "aclnnop/aclnn_acos.h"
#include "aclnnop/aclnn_atan.h"
#include "aclnnop/aclnn_sinh.h"
#include "aclnnop/aclnn_cosh.h"
#include "aclnnop/aclnn_tanh.h"
#include "aclnnop/aclnn_asinh.h"
#include "aclnnop/aclnn_acosh.h"
#include "aclnnop/aclnn_atanh.h"
#include "aclnnop/aclnn_sigmoid.h"
#include "aclnnop/aclnn_erf.h"
#include "aclnnop/aclnn_erfinv.h"
#include "aclnnop/aclnn_logical_not.h"
#include "aclnnop/aclnn_bitwise_not.h"
#include "aclnnop/aclnn_cast.h"
#include "aclnnop/aclnn_nonzero.h"
// binary
#include "aclnnop/aclnn_maximum.h"
#include "aclnnop/aclnn_minimum.h"
#include "aclnnop/aclnn_add.h"
#include "aclnnop/aclnn_sub.h"
#include "aclnnop/aclnn_mul.h"
#include "aclnnop/aclnn_div.h"
#include "aclnnop/aclnn_floor_divide.h"
#include "aclnnop/aclnn_le_tensor.h"
#include "aclnnop/aclnn_lt_tensor.h"
#include "aclnnop/aclnn_ge_tensor.h"
#include "aclnnop/aclnn_gt_tensor.h"
#include "aclnnop/aclnn_eq_tensor.h"
#include "aclnnop/aclnn_ne_tensor.h"
#include "aclnnop/aclnn_logical_and.h"
#include "aclnnop/aclnn_logical_or.h"
#include "aclnnop/aclnn_logical_xor.h"
#include "aclnnop/aclnn_bitwise_and_tensor.h"
#include "aclnnop/aclnn_bitwise_or_tensor.h"
#include "aclnnop/aclnn_bitwise_xor_tensor.h"
#include "aclnnop/aclnn_pow_tensor_tensor.h"
#include "aclnnop/aclnn_expand.h"
#include "aclnnop/aclnn_matmul.h"
#include "aclnnop/aclnn_batch_matmul.h"
#include "aclnnop/aclnn_convolution.h"
#include "aclnnop/aclnn_convolution_backward.h"
#include "aclnnop/aclnn_reduce_sum.h"
#include "aclnnop/aclnn_amax.h"
#include "aclnnop/aclnn_amin.h"
#include "aclnnop/aclnn_mean.h"
#include "aclnnop/aclnn_prod.h"
#include "aclnnop/aclnn_triu.h"
#include "aclnnop/aclnn_s_where.h"
#include "aclnnop/aclnn_random.h"
#include "aclnnop/aclnn_normal.h"
#include "aclnnop/aclnn_permute.h"
#include "aclnnop/aclnn_max_pool2d_with_indices.h"
#include "aclnnop/aclnn_max_pool2d_with_indices_backward.h"
#include "aclnnop/aclnn_avgpool2d.h"
#include "aclnnop/aclnn_avgpool2d_backward.h"
#include "aclnnop/aclnn_flip.h"
#include "aclnnop/aclnn_cat.h"
#include "aclnnop/aclnn_gather.h"
#include "aclnnop/aclnn_cumsum.h"
#include "aclnnop/aclnn_index.h"
#include "aclnnop/aclnn_scatter.h"
#include "aclnnop/aclnn_index.h"
#include "aclnnop/aclnn_strided_slice_assign_v2.h"
#include "aclnnop/aclnn_slice_v2.h"
#include "aclnnop/aclnn_index_put_impl.h"
#include "aclnnop/aclnn_range.h"
#include "aclnnop/aclnn_relu.h"
#include "aclnnop/aclnn_dropout.h"
#include "aclnnop/aclnn_dropout_backward.h"
#include "aclnnop/aclnn_leaky_relu.h"
#include "aclnnop/aclnn_leaky_relu_backward.h"
#include "aclnnop/aclnn_uniform.h"
#include "aclnnop/aclnn_silu.h"
#include "aclnnop/aclnn_silu_backward.h"
#include "aclnnop/aclnn_sigmoid.h"
#include "aclnnop/aclnn_sigmoid_backward.h"
#include "aclnnop/aclnn_embedding.h"
#include "aclnnop/aclnn_embedding_dense_backward.h"
#include "aclnnop/aclnn_masked_scatter.h"
#include "aclnnop/aclnn_masked_select.h"
#include "aclnnop/aclnn_split_with_size.h"
#include "aclnnop/aclnn_flash_attention_score.h"
#include "aclnnop/aclnn_flash_attention_score_grad.h"
#include "aclnnop/aclnn_softmax.h"
#include "aclnnop/aclnn_softmax_backward.h"
#include "aclnnop/aclnn_batch_norm.h"
#include "aclnnop/aclnn_batch_norm_backward.h"
#include "aclnnop/aclnn_layer_norm.h"
#include "aclnnop/aclnn_apply_rotary_pos_emb.h"
#include "aclnnop/aclnn_stack.h"
#include "aclnnop/aclnn_nan_to_num.h"
// CHECK_RET(cond, return_expr): if cond is false, evaluate return_expr
// (typically a `return ...;` statement, optionally preceded by a LOG_PRINT).
// Follows the Ascend sample-code error-handling convention.
#define CHECK_RET(cond, return_expr) \
do \
{ \
if (!(cond)) \
{ \
return_expr; \
} \
} while (0)
// LOG_PRINT(fmt, ...): printf-style logging used by the CHECK_RET call sites.
#define LOG_PRINT(message, ...) \
do \
{ \
printf(message, ##__VA_ARGS__); \
} while (0)
// Number of elements implied by a shape vector (defined in aclnn.cc).
int64_t GetShapeSize(const std::vector<int64_t> &shape);
// Debug helper: copy a device buffer to host and print it (defined in aclnn.cc).
void PrintOutResult(std::vector<int64_t> &shape, void **deviceAddr);
//int Init(int32_t deviceId);
/*
template <typename T>
int CreateAclTensor(const std::vector<T>& hostData, const std::vector<int64_t>& shape, void** deviceAddr,
aclDataType dataType, aclTensor** tensor);
*/

View File

View File

@ -0,0 +1,33 @@
#pragma once
#include <acl/aclops/binary_op_acl.h>
#include <acl/aclops/unary_op_acl.h>
#include <acl/aclops/conv_op_acl.h>
#include <acl/aclops/ternary_op_acl.h>
#include <acl/aclops/reduce_op_acl.h>
#include <acl/aclops/expand_op_acl.h>
#include <acl/aclops/getitem_op_acl.h>
#include <acl/aclops/setitem_op_acl.h>
#include <acl/aclops/matmul_op_acl.h>
#include <acl/aclops/random_op_acl.h>
#include <acl/aclops/bmm_op_acl.h>
#include <acl/aclops/pool_op_acl.h>
#include <acl/aclops/flip_op_acl.h>
#include <acl/aclops/concat_op_acl.h>
#include <acl/aclops/gather_scatter_op_acl.h>
#include <acl/aclops/cumsum_op_acl.h>
#include <acl/aclops/index_op_acl.h>
#include <acl/aclops/where_op_acl.h>
#include <acl/aclops/floor_op_acl.h>
#include <acl/aclops/transpose_op_acl.h>
#include <acl/aclops/flashattention_op_acl.h>
#include <acl/aclops/relu_op_acl.h>
#include <acl/aclops/dropout_op_acl.h>
#include <acl/aclops/silu_op_acl.h>
#include <acl/aclops/sigmoid_op_acl.h>
#include <acl/aclops/softmax_op_acl.h>
#include <acl/aclops/stack_op_acl.h>
#include <acl/aclops/nantonum_op_acl.h>
#include <acl/aclops/rope_op_acl.h>
#include <acl/aclops/triu_op_acl.h>
#include <acl/aclops/embedding_op_acl.h>
#include <acl/aclops/norms_op_acl.h>

View File

@ -0,0 +1,56 @@
#pragma once
#include "utils.h"
#include "acl_jittor.h"
namespace jittor
{
extern int sync_run;
// Common driver for a single ACL operator call: collects input/output Vars,
// wraps their buffers in aclTensor descriptors, looks the operator up in
// aclOpFuncMap and invokes its GetWorkspaceSize/execute pair.
class BaseOpRunner
{
protected:
    vector<Var *> in_;      // input Vars, in the order they were add()ed
    vector<Var *> out_;     // output Vars, in the order they were add()ed
    int ret = -1;           // last ACL status code
    uint64_t workspaceSize = 0;     // scratch size reported by GetWorkspaceSize
    aclOpExecutor *executor;        // executor handle returned by GetWorkspaceSize
    bool is_group_op = false;       // true when the op dispatches through aclOpFuncMap directly
    std::vector<std::vector<int64_t>> inputShapes;   // per-input dims snapshot
    std::vector<std::vector<int64_t>> outputShapes;  // per-output dims snapshot
    std::vector<aclTensor *> inputTensors;           // aclTensor wrapper per input
    std::vector<aclTensor *> outputTensors;          // aclTensor wrapper per output
public:
    string name;     // key into aclOpFuncMap (e.g. "Add", "MatMul")
    string jt_name;  // originating jittor op name, for logging/special cases
    std::unique_ptr<AclOpAttr> op_attr;  // optional per-op attribute bag
    bool use_nchw = false;               // create tensors in NCHW instead of ND layout
    BaseOpRunner(const string &name = "") : name(name) {}
    virtual ~BaseOpRunner() = default;
    // Common functionality for adding input/output variables
    void add(Var *v, bool is_input);
    virtual void setupInputDesc();
    void cleanupDesc();
    virtual void setupOutputDesc();
    virtual void syncRun();
    void checkRet(aclnnStatus ret);
    // Base run method with common operator lookup logic
    void run();
protected:
    // Virtual method for specific operator execution
    virtual void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) = 0;
    void cleanupAttr();
};
}

View File

@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"
#include "base_op.h"
namespace jittor
{
extern int sync_run;
// Common functionality for adding input/output variables
// Register a Var as either an operator input or an operator output.
void BaseOpRunner::add(Var *v, bool is_input)
{
    auto &slot = is_input ? in_ : out_;
    slot.push_back(v);
}
void BaseOpRunner::setupInputDesc()
{
auto input_num = in_.size();
for (int input_idx = 0; input_idx < input_num; input_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < in_[input_idx]->shape.size(); j++)
{
shape.push_back(in_[input_idx]->shape[j]);
}
inputShapes.push_back(shape);
}
for (int idx = 0; idx < input_num; idx++)
{
inputTensors.push_back(nullptr);
auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
void BaseOpRunner::cleanupDesc()
{
auto input_num = in_.size();
auto output_num = out_.size();
for (int idx = 0; idx < input_num; idx++)
{
aclDestroyTensor(inputTensors[idx]);
}
for (int idx = 0; idx < output_num; idx++)
{
aclDestroyTensor(outputTensors[idx]);
}
}
void BaseOpRunner::setupOutputDesc()
{
auto output_num = out_.size();
for (int output_idx = 0; output_idx < output_num; output_idx++)
{
std::vector<int64_t> shape;
for (int j = 0; j < out_[output_idx]->shape.size(); j++)
{
shape.push_back(out_[output_idx]->shape[j]);
}
outputShapes.push_back(shape);
}
for (int idx = 0; idx < output_num; idx++)
{
outputTensors.push_back(nullptr);
auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
CHECK_RET(ret == ACL_SUCCESS, return);
}
}
// Optionally synchronize the ACL stream after an op launch.
// NOTE(review): the actual aclrtSynchronizeStream call is commented out,
// so this is currently a no-op even when sync_run is set — confirm intent.
void BaseOpRunner::syncRun()
{
    if (sync_run)
    {
        // ret = aclrtSynchronizeStream(aclstream);
        // CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclrtSynchronizeStream failed. ERROR: %d\n", name.c_str(), ret); return);
    }
}
// Log diagnostics when a ...GetWorkspaceSize call failed. This only logs;
// it does not abort the calling executeOp().
void BaseOpRunner::checkRet(aclnnStatus ret)
{
    if (ret == ACL_SUCCESS)
        return;
    LOGir << name << ", " << aclGetRecentErrMsg();
    LOG_PRINT("%s: aclnnxxxGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret);
}
// Base run method with common operator lookup logic.
// Group ops must have an entry in aclOpFuncMap; specialized runners ignore
// `it` and call their aclnn entry points directly, so for them a missing
// entry is not an error.
void BaseOpRunner::run()
{
    auto it = aclOpFuncMap.find(name);
    if (is_group_op && it == aclOpFuncMap.end())
    {
        LOGir << "aclOpFuncMap Not supported op: " << name;
        throw std::runtime_error("Unsupported operation type.");
    }
    setupInputDesc();
    setupOutputDesc();
    executeOp(it);
    cleanupDesc();
}
}

View File

@ -0,0 +1,124 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "binary_op_acl.h"
namespace jittor
{
// Elementwise binary ops are "group" ops: executeOp() uses function
// pointers looked up in aclOpFuncMap rather than a hard-coded aclnn call.
// The concrete op name (e.g. "Add") is presumably assigned by the Python
// wrapper before run() — TODO confirm.
BinaryOpRunner::BinaryOpRunner() : BaseOpRunner("binary")
{
    use_nchw = false;
    is_group_op = true;
}
// Execute an elementwise binary op through the function pointers in `it`.
// Add/Sub additionally take an `alpha` scalar (fixed to 1) that must match
// the first input's dtype; presumably it scales the second operand as in
// aclnnAdd's out = self + alpha * other — TODO confirm against aclnn docs.
void BinaryOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
    aclScalar *alpha = nullptr;
    if (name == string("Add") || name == string("Sub"))
    {
        // Create alpha = 1 with the same dtype as input 0.
        if (get_dtype(in_[0]->dtype()) == ACL_FLOAT)
        {
            float alphaValue = 1.0;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_FLOAT16)
        {
            __fp16 alphaValue = 1.0;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_INT64)
        {
            int64_t alphaValue = 1;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_INT32)
        {
            int alphaValue = 1;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_INT8)
        {
            int8_t alphaValue = 1;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_INT16)
        {
            int16_t alphaValue = 1;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_UINT8)
        {
            uint8_t alphaValue = 1;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_UINT16)
        {
            uint16_t alphaValue = 1;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_UINT32)
        {
            uint32_t alphaValue = 1;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else if (get_dtype(in_[0]->dtype()) == ACL_BOOL)
        {
            bool alphaValue = true;
            alpha = aclCreateScalar(&alphaValue, get_dtype(in_[0]->dtype()));
        }
        else
        {
            LOGf << "Not supported dtype: " << in_[0]->dtype();
        }
        CHECK_RET(alpha != nullptr, return);
        ret = it->second.getWorkspaceSizeFuncAdd(inputTensors[0], inputTensors[1], alpha, outputTensors[0], &workspaceSize, &executor);
    }
    else
    {
        ret = it->second.getWorkspaceSizeFuncBinary(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
    }
    // NOTE(review): checkRet only logs on failure — execution still proceeds,
    // matching the other runners in this backend.
    checkRet(ret);
    if (workspaceSize > 0)
    {
        mallocWorkSpace(workspaceSize);
    }
    ret = it->second.executeFunc(workspaceAddr, workspaceSize, executor, aclstream);
    if (ret != ACL_SUCCESS)
    {
        LOG_PRINT("%s: aclnnxxx failed. ERROR: %d\n", name.c_str(), ret);
        // Bug fix: the previous code returned here without releasing `alpha`,
        // leaking the scalar on every failed Add/Sub launch.
        aclDestroyScalar(alpha);
        return;
    }
    syncRun();
    aclDestroyScalar(alpha);
    return;
}
}

View File

@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Runner for elementwise binary ops dispatched through aclOpFuncMap
// (is_group_op is set in the constructor).
struct BinaryOpRunner : public BaseOpRunner
{
    BinaryOpRunner();

protected:
    // Creates the alpha scalar for Add/Sub and invokes the looked-up
    // aclnn GetWorkspaceSize/execute function pair.
    void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
};
}

View File

@ -0,0 +1,128 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def acl_cmd(name: str,
            inputs: list,
            output_dtypes: list = None,
            output_shapes: list = None,
            attr_code: str = "",
            attr_header: str = "",
            outputs: list = None,
            extra_data: dict = None):
    """Dispatch an ACL ``{name}OpRunner`` through ``jt.code``.

    Args:
        name: runner class prefix, e.g. ``"BatchMatMul"`` -> ``BatchMatMulOpRunner``.
        inputs: jittor input variables, registered as ``in0..inN``.
        output_dtypes/output_shapes: used to allocate outputs when ``outputs``
            is not supplied; both required and of equal length in that case.
        attr_code: C++ snippet executed before ``op.run()`` (sets jt_name/attrs).
        attr_header: extra declarations injected into ``namespace jittor``.
        outputs: pre-allocated output variables (overrides dtypes/shapes).
        extra_data: forwarded to ``jt.code(data=...)``.

    Returns:
        The output variable list produced by ``jt.code``.
    """
    # Bug fix: `extra_data` previously defaulted to a mutable `{}` that was
    # shared across all calls; use a None sentinel instead.
    if extra_data is None:
        extra_data = {}
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"
    # Consistency fix: use {name} like the sibling *_cmd helpers instead of a
    # hard-coded "BatchMatMul" (identical result for every existing caller,
    # which always passes name="BatchMatMul").
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    op.add(out0, false);
    {attr_code}
    op.run();""",
                   data=extra_data)
class BmmACL(jt.Function):
    """Batch matrix multiply backed by the ACL BatchMatMul runner.

    ``trans_x2=True`` treats the second operand as transposed on its last
    two dims; the transpose itself is realized on the C++ side via the
    ``bmm_trans_*`` jt_name tags (stride-swapped tensor descriptors).
    """

    def __init__(self, trans_x2=False):
        super(BmmACL, self).__init__()
        self.trans_x2 = trans_x2

    def execute(self, x1, x2):
        # Save operands for grad().
        self.input = [x1, x2]
        # Output shape: batch dims of x1 plus the appropriate matrix dim of x2.
        result = acl_cmd("BatchMatMul", [x1, x2],
                         output_dtypes=[x1.dtype],
                         output_shapes=[
                             x1.shape[:-1] + x2.shape[-2:-1] if self.trans_x2
                             else x1.shape[:-1] + x2.shape[-1:]
                         ],
                         attr_code="op.jt_name=\"bmm_trans_1\";"
                         if self.trans_x2 else "op.jt_name=\"bmm\";")[0]
        return result

    def grad(self, grad_output):
        x1, x2 = self.input
        # NOTE(review): len() of a jt.Var here presumably compares leading/batch
        # ranks to decide whether grad_x2 must be computed as a flattened 2-D
        # matmul — confirm jt.Var.__len__ semantics.
        if len(x1) != len(x2):
            reshape_grad_x2 = True
        else:
            reshape_grad_x2 = False
        # grad wrt x1: grad_output @ x2 (transposed as needed).
        grad_x1 = acl_cmd(
            "BatchMatMul", [grad_output, x2],
            output_dtypes=[x1.dtype],
            output_shapes=[
                grad_output.shape[:-1] + x2.shape[-2:-1] if not self.trans_x2
                else grad_output.shape[:-1] + x1.shape[-1:]
            ],
            attr_code="op.jt_name=\"bmm_trans_1\";"
            if not self.trans_x2 else "op.jt_name=\"bmm\";")[0]
        # grad wrt x2: uses the "bmm_trans_0" variant (first operand transposed),
        # flattening batch dims when the operand ranks differ.
        if self.trans_x2:
            if reshape_grad_x2:
                output_shape = grad_output.shape[1:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [
                    grad_output.reshape(-1, grad_output.shape[-1]),
                    x1.reshape(-1, x1.shape[-1])
                ],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
            else:
                output_shape = grad_output.shape[:-2] + grad_output.shape[
                    -1:] + x1.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [grad_output, x1],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
        else:
            if reshape_grad_x2:
                output_shape = x1.shape[1:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [
                    x1.reshape(-1, x1.shape[-1]),
                    grad_output.reshape(-1, grad_output.shape[-1])
                ],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
            else:
                output_shape = x1.shape[:-2] + x1.shape[
                    -1:] + grad_output.shape[-1:]
                grad_x2 = acl_cmd("BatchMatMul", [x1, grad_output],
                                  output_dtypes=[x2.dtype],
                                  output_shapes=[output_shape],
                                  attr_code="op.jt_name=\"bmm_trans_0\";")[0]
        # Sum over broadcasted batch dims when the grad picked up an extra axis.
        if len(grad_x1.shape) > len(x1.shape):
            grad_x1 = grad_x1.sum(0)
        if len(grad_x2.shape) > len(x2.shape):
            grad_x2 = grad_x2.sum(0)
        return grad_x1, grad_x2

View File

@ -0,0 +1,77 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "bmm_op_acl.h"
namespace jittor
{
// NOTE(review): the registered name "BatchMatMulMatMul" looks like a
// concatenation typo, but this runner calls aclnnBatchMatMul directly and
// never consults aclOpFuncMap, so the string is inert.
BatchMatMulOpRunner::BatchMatMulOpRunner() : BaseOpRunner("BatchMatMulMatMul")
{
}

// Build input descriptors. For the "bmm_trans_*" variants one operand is
// described via CreateFakeTransAclTensor — presumably a transposed view of
// the last two dims built from strides, without copying — TODO confirm.
void BatchMatMulOpRunner::setupInputDesc()
{
    auto input_num = in_.size();
    for (int input_idx = 0; input_idx < input_num; input_idx++)
    {
        std::vector<int64_t> shape;
        for (int j = 0; j < in_[input_idx]->shape.size(); j++)
        {
            shape.push_back(in_[input_idx]->shape[j]);
        }
        inputShapes.push_back(shape);
    }
    for (int idx = 0; idx < input_num; idx++)
    {
        inputTensors.push_back(nullptr);
        // bmm_trans_1 transposes operand 1; bmm_trans_0 transposes operand 0.
        if ((jt_name == "bmm_trans_1" && idx == 1) || (jt_name == "bmm_trans_0" && idx == 0))
        {
            auto ret = CreateFakeTransAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
        else
        {
            auto ret = CreateAclTensor(inputShapes[idx], in_[idx]->mem_ptr, in_[idx]->size, get_dtype(in_[idx]->dtype()), &inputTensors[idx], use_nchw);
            CHECK_RET(ret == ACL_SUCCESS, return);
        }
    }
}

// `it` is unused: the aclnn API is called directly.
void BatchMatMulOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
    // The literal 1 is presumably the cubeMathType argument — TODO confirm
    // against the aclnnBatchMatMul signature.
    ret = aclnnBatchMatMulGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], 1, &workspaceSize, &executor);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnBatchMatmulGetWorkspaceSize failed. ERROR: %d\n", name.c_str(), ret); return);
    if (workspaceSize > 0)
    {
        mallocWorkSpace(workspaceSize);
    }
    ret = aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnbatchMatmul failed. ERROR: %d\n", name.c_str(), ret); return);
    syncRun();
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Runner for (batched) matrix multiply via aclnnBatchMatMul; overrides
// setupInputDesc() to support stride-based "fake transpose" operands.
class BatchMatMulOpRunner : public BaseOpRunner
{
protected:
    void setupInputDesc() override;
    void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

public:
    BatchMatMulOpRunner();
};
}

View File

@ -0,0 +1,186 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def concat_cmd(name: str,
               inputs: list,
               output_dtypes: list = None,
               output_shapes: list = None,
               attr_code: str = "",
               attr_header: str = "",
               outputs: list = None):
    """Dispatch an ACL ``{name}OpRunner`` through ``jt.code``.

    Outputs are either the supplied ``outputs`` list or freshly allocated
    from ``output_shapes``/``output_dtypes`` (both required in that case).
    Returns the output variable list produced by ``jt.code``.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [
            jt.empty(output_shapes[i], output_dtypes[i])
            for i in range(len(output_shapes))
        ]
    input_code = ''.join(f"op.add(in{i}, true);\n"
                         for i in range(len(inputs)))
    output_code = ''.join(f"op.add(out{i}, false);\n"
                          for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
class ConcatACL(jt.Function):
    """Concatenation along a dimension via aclnnCat, with a custom __call__
    that tapes inputs/outputs so a list-of-Vars argument stays differentiable.
    """

    def __init__(self):
        super(ConcatACL, self).__init__()

    def __call__(self, *args):
        # args = (list_of_vars, dim). The default jt.Function.__call__ cannot
        # tape a list argument, so the taping is re-implemented here.
        assert isinstance(args[0], (list, tuple))
        assert isinstance(args[1], int)
        if jt.flags.no_grad:
            return self.execute(*args)
        backup = args
        args = list(args)
        taped_inputs = []
        taped_outputs = []
        # input_mask[i]: index into taped_inputs, or -1 (untracked) / -2 (stop_grad).
        input_mask = [-1] * (len(args[0]) + 1)
        newargs = [list(), args[1]]
        for i, v in enumerate(args[0]):
            if isinstance(v, jt.Var):
                if v.is_stop_grad():
                    # -2 in input_mask represents it is stop_grad
                    input_mask[i] = -2
                    newargs[0].append(v)
                    continue
                v = v.tape()
                newargs[0].append(v)
                input_mask[i] = len(taped_inputs)
                taped_inputs.append(v)
        ori_res = self.execute(*newargs)
        if not isinstance(ori_res, Sequence):
            res = [ori_res]
        else:
            res = list(ori_res)
        output_mask = [-1] * len(res)
        for i, v in enumerate(res):
            if isinstance(v, jt.Var):
                v = v.tape()
                output_mask[i] = len(taped_outputs)
                res[i] = v
                taped_outputs.append(v)
        self.input_mask = input_mask
        self.output_mask = output_mask
        # tape output and input together so
        # backward treat them as one operator
        jt.tape_together(taped_inputs, taped_outputs, self._grad)
        if isinstance(ori_res, Sequence):
            return res
        else:
            return res[0]

    def execute(self, input_tensors, dim=0):
        # Validate dim and enforce matching dtypes/shapes (except along dim).
        for _ in input_tensors:
            if not (-_.ndim <= dim < _.ndim):
                print(_.shape, dim)
                raise ValueError("dim out of range")
        if dim < 0:
            dim += input_tensors[0].ndim
        self.input = input_tensors
        self.dim = dim
        for i in range(len(input_tensors)):
            if input_tensors[i].dtype != input_tensors[0].dtype:
                raise ValueError("All input tensors must have the same dtype")
            if input_tensors[i].shape[:dim] != input_tensors[
                    0].shape[:dim] or input_tensors[i].shape[
                        dim + 1:] != input_tensors[0].shape[dim + 1:]:
                raise ValueError("All input tensors must have the same shape")
        attr_code = f"""
        op.jt_name = "concat";
        ConcatAttr *attr = new ConcatAttr();
        attr->tensorNum = {len(input_tensors)};
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = concat_cmd(
            "Concat",
            input_tensors,
            output_dtypes=[input_tensors[0].dtype],
            output_shapes=[
                jt.empty(self.calculate_output_shape(input_tensors, dim)).shape
            ],
            attr_code=attr_code)[0]
        return result

    def _grad(self, *args):
        # Map taped output grads back to grad()'s positional arguments,
        # then filter the returned grads through input_mask.
        new_args = ((args[i] if i >= 0 else None) for i in self.output_mask)
        ret = self.grad(*new_args)
        new_ret = []
        for i, r in enumerate(ret):
            j = self.input_mask[i]
            if j < 0:
                # -2 in input_mask represents it is stop_grad
                assert r is None or j==-2, f"{type(self)}'s {i}-th returned grad should be None, "\
                    "because the input value is not jittor variable."
            else:
                new_ret.append(r)
        return new_ret

    def grad(self, grad_output):
        # Concat backward = split the output grad back into the input sizes.
        grad_inputs = self.split_grad(grad_output, self.input, self.dim)
        return grad_inputs

    def calculate_output_shape(self, input_tensors, axis):
        # Sum the concat axis; all other dims match input 0.
        shape = list(input_tensors[0].shape)
        for tensor in input_tensors[1:]:
            shape[axis] += tensor.shape[axis]
        return tuple(shape)

    def split_grad(self, grad_output, input_tensors, axis):
        # Split sizes along `axis` are each input's original extent.
        offset = []
        shapeVec = []
        dtypeVec = []
        for tensor in input_tensors:
            offset.append(tensor.shape[axis])
            dtypeVec.append(tensor.dtype)
            shapeVec.append(tensor.shape)
        attr_code = f"""
        op.jt_name = "splitwithsize";
        auto *attr = new SplitWithSizeAttr();
        attr->splitSize = {{ {", ".join(map(str, offset))} }};
        attr->dim = {axis};
        op.op_attr.reset(attr);
        """
        result = concat_cmd("SplitWithSize", [grad_output],
                            output_dtypes=dtypeVec,
                            output_shapes=shapeVec,
                            attr_code=attr_code)
        return result

View File

@ -0,0 +1,89 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "concat_op_acl.h"
namespace jittor
{
ConcatOpRunner::ConcatOpRunner() : BaseOpRunner("Concat")
{
}

// Concatenate all inputs along attr->dim via aclnnCat. `it` is unused.
void ConcatOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
    auto input_num = in_.size();
    std::vector<aclTensor *> concatTensorList = {};
    for (int i = 0; i < input_num; i++)
    {
        concatTensorList.push_back(inputTensors[i]);
    }
    auto concatTensorListInput = aclCreateTensorList(&concatTensorList[0], input_num);
    // op_attr is a ConcatAttr set by the Python wrapper (tensorNum, dim).
    auto attr = dynamic_cast<ConcatAttr *>(op_attr.get());
    ret = aclnnCatGetWorkspaceSize(concatTensorListInput, attr->dim, outputTensors[0], &workspaceSize, &executor);
    checkRet(ret);
    if (workspaceSize > 0)
    {
        mallocWorkSpace(workspaceSize);
    }
    ret = aclnnCat(workspaceAddr, workspaceSize, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCat failed. ERROR: %d\n", name.c_str(), ret); return);
    syncRun();
    // NOTE(review): concatTensorListInput is never destroyed — possible leak
    // if aclCreateTensorList allocates; confirm ownership semantics.
    return;
}

SplitWithSizeOpRunner::SplitWithSizeOpRunner() : BaseOpRunner("SplitWithSize")
{
}

// Split input 0 into out_.size() chunks of attr->splitSize along attr->dim.
void SplitWithSizeOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
    auto output_num = out_.size();
    auto attr = dynamic_cast<SplitWithSizeAttr *>(op_attr.get());
    auto splitSize = aclCreateIntArray(attr->splitSize.data(), attr->splitSize.size());
    auto tensorList = aclCreateTensorList(&outputTensors[0], output_num);
    ret = aclnnSplitWithSizeGetWorkspaceSize(inputTensors[0], splitSize, attr->dim, tensorList, &workspaceSize, &executor);
    checkRet(ret);
    if (workspaceSize > 0)
    {
        mallocWorkSpace(workspaceSize);
    }
    ret = aclnnSplitWithSize(workspaceAddr, workspaceSize, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnSplitWithSize failed. ERROR: %d\n", name.c_str(), ret); return);
    syncRun();
    return;
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Runner for concatenation along a dim via aclnnCat.
class ConcatOpRunner : public BaseOpRunner
{
protected:
    void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

public:
    ConcatOpRunner();
};

// Runner for splitting a tensor into sized chunks via aclnnSplitWithSize
// (used as the backward of Concat).
class SplitWithSizeOpRunner : public BaseOpRunner
{
protected:
    void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

public:
    SplitWithSizeOpRunner();
};
}

View File

@ -0,0 +1,160 @@
import os
import jittor_utils
from jittor_utils import env_or_try_find
import ctypes
import glob
import jittor as jt
import jittor.compiler as compiler
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def _ntuple(n):
def parse(x):
if isinstance(x, Iterable):
return x
return tuple([x] * n)
return parse
_pair = _ntuple(2)
def conv_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    """Dispatch an ACL ``{name}OpRunner`` through ``jt.code``.

    Outputs are either the supplied ``outputs`` list or freshly allocated
    from ``output_shapes``/``output_dtypes`` (both required in that case).
    Returns the output variable list produced by ``jt.code``.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [
            jt.empty(output_shapes[i], output_dtypes[i])
            for i in range(len(output_shapes))
        ]
    input_code = ''.join(f"op.add(in{i}, true);\n"
                         for i in range(len(inputs)))
    output_code = ''.join(f"op.add(out{i}, false);\n"
                          for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
class ConvACL(jt.Function):
    """2-D (grouped) convolution via aclnnConvolution, with backward via
    aclnnConvolutionBackward."""

    def execute(self,
                x,
                weight,
                bias=None,
                stride=1,
                padding=0,
                dilation=1,
                groups=1):
        # Save tensors and hyper-parameters for grad().
        self.input = x
        self.weight = weight
        self.bias = bias
        padding = _pair(padding)
        stride = _pair(stride)
        dilation = _pair(dilation)
        out_channels = weight.shape[0]
        if groups <= 0:
            raise ValueError("groups must be a positive integer")
        self.padding = padding
        self.stride = stride
        self.dilation = dilation
        self.groups = groups
        attr_code = f"""
        op.jt_name = "conv2d";
        ConvAttr *attr = new ConvAttr();
        attr->convStrides = {{ {stride[0]}, {stride[1]} }};
        attr->convPads = {{ {padding[0]}, {padding[1]} }};
        attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
        attr->group = {groups};
        attr->convOutPads = {{1,1}};
        op.op_attr.reset(attr);
        """
        # Standard conv output-size formula (floor division).
        input_height, input_width = x.shape[-2:]
        kernel_height, kernel_width = weight.shape[-2:]
        output_height = (input_height + 2 * padding[0] - dilation[0] *
                         (kernel_height - 1) - 1) // stride[0] + 1
        output_width = (input_width + 2 * padding[1] - dilation[1] *
                        (kernel_width - 1) - 1) // stride[1] + 1
        output_shape = (x.shape[0], out_channels, output_height, output_width)
        inputs = [x, weight]
        if bias is not None:
            inputs.append(bias)
        result = conv_cmd(
            "Conv2d",
            inputs,
            output_dtypes=[x.dtype],
            output_shapes=[output_shape],
            attr_code=attr_code,
        )[0]
        return result

    def grad(self, grad_output):
        x = self.input
        weight = self.weight
        bias = self.bias
        inputs = [grad_output, x, weight]
        if bias is not None:
            inputs.append(bias)
        # Three outputs are always allocated: grad_x, grad_weight, grad_bias.
        # When there is no bias, a [out_channels] placeholder is still passed
        # (the C++ side masks the bias gradient off in that case).
        output_shapes = [x.shape, weight.shape]
        output_dtypes = [x.dtype, weight.dtype]
        if bias is not None:
            output_shapes.append(bias.shape)
            output_dtypes.append(bias.dtype)
        else:
            output_shapes.append([weight.shape[0]])
            output_dtypes.append(x.dtype)
        padding = self.padding
        stride = self.stride
        dilation = self.dilation
        groups = self.groups
        attr_code = f"""
        op.jt_name = "conv2dbackward";
        ConvAttr *attr = new ConvAttr();
        attr->convStrides = {{ {stride[0]}, {stride[1]} }};
        attr->convPads = {{ {padding[0]}, {padding[1]} }};
        attr->convDilations = {{ {dilation[0]}, {dilation[1]} }};
        attr->group = {groups};
        attr->convOutPads = {{ 1,1}};
        op.op_attr.reset(attr);
        """
        results = conv_cmd("Conv2dBackward",
                           inputs,
                           output_dtypes=output_dtypes,
                           output_shapes=output_shapes,
                           attr_code=attr_code)
        # Drop the placeholder bias gradient when no bias was used.
        if self.bias is None:
            return results[0], results[1]
        return results

View File

@ -0,0 +1,152 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "conv_op_acl.h"
namespace jittor
{
Conv2dOpRunner::Conv2dOpRunner() : BaseOpRunner("Conv2d")
{
    // Convolution tensors are described in NCHW layout.
    use_nchw = true;
}

// Run aclnnConvolution with strides/pads/dilations taken from ConvAttr.
// `it` is unused: the aclnn API is called directly.
void Conv2dOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
    // for conv
    aclIntArray *strides = nullptr;
    aclIntArray *pads = nullptr;
    aclIntArray *outPads = nullptr;
    aclIntArray *dilations = nullptr;
    auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
    strides = aclCreateIntArray(attr->convStrides.data(), 2);
    pads = aclCreateIntArray(attr->convPads.data(), 2);
    outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
    dilations = aclCreateIntArray(attr->convDilations.data(), 2);
    // Bias is the optional third input (see the Python wrapper).
    aclTensor *bias = nullptr;
    auto input_num = in_.size();
    if (input_num == 3)
        bias = inputTensors[2];
    // `false` is presumably the transposed flag and the trailing 0 the
    // cubeMathType — TODO confirm against the aclnnConvolution signature.
    ret = aclnnConvolutionGetWorkspaceSize(inputTensors[0], inputTensors[1], bias, strides, pads, dilations, false, outPads, attr->group, outputTensors[0], 0, &workspaceSize, &executor);
    checkRet(ret);
    if (workspaceSize > 0)
    {
        mallocWorkSpace(workspaceSize);
    }
    ret = aclnnConvolution(workspaceAddr, workspaceSize, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolution failed. ERROR: %d\n", name.c_str(), ret); return);
    syncRun();
    aclDestroyIntArray(strides);
    aclDestroyIntArray(pads);
    aclDestroyIntArray(outPads);
    aclDestroyIntArray(dilations);
    return;
}
Conv2dBackwardOpRunner::Conv2dBackwardOpRunner() : BaseOpRunner("Conv2dBackward")
{
    use_nchw = true;
}

// Outputs are [grad_input, grad_weight, grad_bias]. The first two are
// described in NCHW layout, the bias gradient in plain ND format.
void Conv2dBackwardOpRunner::setupOutputDesc()
{
    auto output_num = out_.size();
    for (int output_idx = 0; output_idx < output_num; output_idx++)
    {
        std::vector<int64_t> shape;
        for (int j = 0; j < out_[output_idx]->shape.size(); j++)
        {
            shape.push_back(out_[output_idx]->shape[j]);
        }
        outputShapes.push_back(shape);
    }
    for (int idx = 0; idx < 2; idx++)
    {
        outputTensors.push_back(nullptr);
        auto ret = CreateAclTensor(outputShapes[idx], out_[idx]->mem_ptr, out_[idx]->size, get_dtype(out_[idx]->dtype()), &outputTensors[idx], use_nchw);
        CHECK_RET(ret == ACL_SUCCESS, return);
    }
    // biasgrad nd format
    {
        outputTensors.push_back(nullptr);
        auto ret = CreateAclTensor(outputShapes[2], out_[2]->mem_ptr, out_[2]->size, get_dtype(out_[2]->dtype()), &outputTensors[2], false);
        CHECK_RET(ret == ACL_SUCCESS, return);
    }
}

// Run aclnnConvolutionBackward. Inputs are [grad_output, x, weight] plus an
// optional bias (see the Python wrapper), so input_num == 3 means "no bias"
// and the bias-gradient entry of the output mask is switched off.
void Conv2dBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
    // for conv
    aclIntArray *strides = nullptr;
    aclIntArray *pads = nullptr;
    aclIntArray *outPads = nullptr;
    aclIntArray *dilations = nullptr;
    auto attr = dynamic_cast<ConvAttr *>(op_attr.get());
    strides = aclCreateIntArray(attr->convStrides.data(), 2);
    pads = aclCreateIntArray(attr->convPads.data(), 2);
    outPads = aclCreateIntArray(attr->convOutPads.data(), 2);
    dilations = aclCreateIntArray(attr->convDilations.data(), 2);
    bool outputMask[3] = {true, true, true};
    auto input_num = in_.size();
    if (input_num == 3)
    {
        outputMask[2] = false;
    }
    aclBoolArray *outMask = aclCreateBoolArray(outputMask, 3);
    // The bias-grad shape doubles as the biasSizes argument.
    auto biasSizes = aclCreateIntArray(&outputShapes[2][0], outputShapes[2].size());
    ret = aclnnConvolutionBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], biasSizes, strides, pads, dilations, false, outPads, attr->group, outMask, 0, outputTensors[0], outputTensors[1], outputTensors[2], &workspaceSize, &executor);
    checkRet(ret);
    if (workspaceSize > 0)
    {
        mallocWorkSpace(workspaceSize);
    }
    ret = aclnnConvolutionBackward(workspaceAddr, workspaceSize, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnConvolutionBackward failed. ERROR: %d\n", name.c_str(), ret); return);
    syncRun();
    aclDestroyIntArray(strides);
    aclDestroyIntArray(pads);
    aclDestroyIntArray(outPads);
    aclDestroyIntArray(dilations);
    return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Runner for 2-D convolution forward via aclnnConvolution.
class Conv2dOpRunner : public BaseOpRunner
{
protected:
    void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

public:
    Conv2dOpRunner();
};

// Runner for 2-D convolution backward via aclnnConvolutionBackward; the
// bias gradient uses an ND descriptor, hence the setupOutputDesc override.
class Conv2dBackwardOpRunner : public BaseOpRunner
{
protected:
    void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
    void setupOutputDesc() override;

public:
    Conv2dBackwardOpRunner();
};
}

View File

@ -0,0 +1,101 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def cumsum_cmd(name: str,
               inputs: list,
               output_dtypes: list = None,
               output_shapes: list = None,
               attr_code: str = "",
               attr_header: str = "",
               outputs: list = None):
    """Dispatch an ACL ``{name}OpRunner`` through ``jt.code``.

    Outputs are either the supplied ``outputs`` list or freshly allocated
    from ``output_shapes``/``output_dtypes`` (both required in that case).
    Returns the output variable list produced by ``jt.code``.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
    #include "acl/aclops/aclops.h"
    '''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [
            jt.empty(output_shapes[i], output_dtypes[i])
            for i in range(len(output_shapes))
        ]
    input_code = ''.join(f"op.add(in{i}, true);\n"
                         for i in range(len(inputs)))
    output_code = ''.join(f"op.add(out{i}, false);\n"
                          for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
    // aclop
    {name}OpRunner op;
    {input_code}
    {output_code}
    {attr_code}
    op.run();""")
class CumsumACL(jt.Function):
    """Cumulative sum along ``dim`` via aclnnCumsum."""

    def __init__(self):
        super(CumsumACL, self).__init__()

    def execute(self, input, dim=-1):
        self.dim = dim
        # GatherAttr is reused here purely as a carrier for `dim`.
        attr_code = f"""
        op.jt_name = "cumsum";
        GatherAttr *attr = new GatherAttr();
        attr->dim = {dim};
        op.op_attr.reset(attr);
        """
        result = cumsum_cmd("Cumsum", [input],
                            output_dtypes=[input.dtype],
                            output_shapes=[input.shape],
                            attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        # Backward of cumsum is a reversed cumsum along the same dim:
        # flip -> cumsum -> flip.
        cumsum_attr_code = f"""
        op.jt_name = "cumsum";
        GatherAttr *attr = new GatherAttr();
        attr->dim = {self.dim};
        op.op_attr.reset(attr);
        """
        # ReduceAttr is likewise reused just to carry the flip axis.
        flip_attr_code = f"""
        op.jt_name = "flip";
        ReduceAttr *attr = new ReduceAttr();
        attr->axes = {{{self.dim}}};
        attr->prod_dim = {{{1}}};
        op.op_attr.reset(attr);
        """
        flipped_grad_output = cumsum_cmd("Flip", [grad_output],
                                         output_dtypes=[grad_output.dtype],
                                         output_shapes=[grad_output.shape],
                                         attr_code=flip_attr_code)[0]
        cumulative_grad = cumsum_cmd("Cumsum", [flipped_grad_output],
                                     output_dtypes=[grad_output.dtype],
                                     output_shapes=[grad_output.shape],
                                     attr_code=cumsum_attr_code)[0]
        grad_input = cumsum_cmd("Flip", [cumulative_grad],
                                output_dtypes=[grad_output.dtype],
                                output_shapes=[grad_output.shape],
                                attr_code=flip_attr_code)[0]
        return grad_input

View File

@ -0,0 +1,57 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "cumsum_op_acl.h"
namespace jittor
{
CumsumOpRunner::CumsumOpRunner() : BaseOpRunner("Cumsum")
{
}

// Run aclnnCumsum along attr->dim. GatherAttr is reused as the carrier of
// `dim` (see the Python wrapper). `it` is unused.
void CumsumOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
    auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
    // The output dtype is passed explicitly so the accumulation type matches
    // the output variable.
    ret = aclnnCumsumGetWorkspaceSize(inputTensors[0], attr->dim, get_dtype(out_[0]->dtype()), outputTensors[0], &workspaceSize, &executor);
    checkRet(ret);
    if (workspaceSize > 0)
    {
        mallocWorkSpace(workspaceSize);
    }
    ret = aclnnCumsum(workspaceAddr, workspaceSize, executor, aclstream);
    CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnCumsum failed. ERROR: %d\n", name.c_str(), ret); return);
    syncRun();
    return;
}
}

View File

@ -0,0 +1,17 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Runner for cumulative sum along a dim via aclnnCumsum.
class CumsumOpRunner : public BaseOpRunner
{
protected:
    void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

public:
    CumsumOpRunner();
};
}

View File

@ -0,0 +1,94 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def dropout_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    """Run the ACL ``{name}OpRunner`` for the dropout family via ``jt.code``.

    Either ``outputs`` is given directly, or ``output_dtypes``/``output_shapes``
    (same length) are used to allocate empty output Vars. ``attr_code`` is C++
    source injected after the tensors are registered, typically installing an
    attribute struct on the runner.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [jt.empty(s, d) for s, d in zip(output_shapes, output_dtypes)]
    # Generated C++: register each input (true) and output (false) with the runner.
    input_code = "".join(f"op.add(in{i}, true);\n" for i in range(len(inputs)))
    output_code = "".join(f"op.add(out{i}, false);\n" for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class DropoutACL(jt.Function):
    """Dropout on Ascend via aclnnDropout, caching the bit mask for backward."""

    def __init__(self):
        super(DropoutACL, self).__init__()

    def execute(self, x, p=0.5, is_train=False):
        self.input = x
        # The ACL kernel emits a keep-mask packed one bit per element, with the
        # element count padded up to a multiple of 128 (so 16-byte aligned).
        padded = (x.numel() + 127) // 128 * 128
        mask_shape = (padded // 8, )
        train_flag = "true" if is_train else "false"
        attr_code = f"""
op.jt_name = "dropout";
DropoutAttr *attr = new DropoutAttr();
attr->p = {p};
attr->train = {train_flag};
attr->seed = 0;
attr->offset = 0;
op.op_attr.reset(attr);
"""
        out, mask = dropout_cmd("Dropout", [x],
                                output_dtypes=[x.dtype, "uint8"],
                                output_shapes=[x.shape, mask_shape],
                                attr_code=attr_code)
        self.maskout = mask
        return out

    def grad(self, grad_output):
        # NOTE(review): scale is fixed at 1.0 here; inverted dropout normally
        # rescales by 1/(1-p) — confirm against aclnnDropoutBackward's contract.
        attr_code = f"""
op.jt_name = "dropoutbackward";
DropoutAttr *attr = new DropoutAttr();
attr->scale = 1.0;
op.op_attr.reset(attr);
"""
        return dropout_cmd("DropoutBackward",
                           [grad_output, self.maskout],
                           output_dtypes=[grad_output.dtype],
                           output_shapes=[grad_output.shape],
                           attr_code=attr_code)[0]

View File

@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "dropout_op_acl.h"
namespace jittor
{
// Forward dropout runner: wraps aclnnDropout.
DropoutOpRunner::DropoutOpRunner() : BaseOpRunner("Dropout")
{
}
// out[0] = dropout(in[0]) with drop probability attr->p; out[1] receives the
// bit-packed keep mask consumed by the backward runner.
void DropoutOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
// Two-phase aclnn launch: size query, optional workspace alloc, execute.
ret = aclnnDropoutGetWorkspaceSize(inputTensors[0], attr->p, attr->train, attr->seed, attr->offset, outputTensors[0], outputTensors[1], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnDropout(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropout failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
// Backward dropout runner: wraps aclnnDropoutBackward.
DropoutBackwardOpRunner::DropoutBackwardOpRunner() : BaseOpRunner("DropoutBackward")
{
}
// out[0] = in[0] (upstream grad) masked by in[1] and scaled by attr->scale.
void DropoutBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<DropoutAttr *>(op_attr.get());
ret = aclnnDropoutBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], attr->scale, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnDropoutBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnDropoutBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Forward dropout runner declaration; see dropout_op_acl.cc.
class DropoutOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
DropoutOpRunner();
};
// Backward dropout runner: applies the saved mask to the upstream gradient.
class DropoutBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
DropoutBackwardOpRunner();
};
}

View File

@ -0,0 +1,91 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def embedding_cmd(name: str,
                  inputs: list,
                  output_dtypes: list = None,
                  output_shapes: list = None,
                  attr_code: str = "",
                  attr_header: str = "",
                  outputs: list = None):
    """Dispatch an embedding-family ACL op through ``jt.code``.

    ``outputs`` may be pre-allocated; otherwise matching ``output_dtypes`` and
    ``output_shapes`` lists describe the Vars to allocate. ``attr_code`` is raw
    C++ appended to the generated source to configure the runner.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [jt.empty(s, d) for s, d in zip(output_shapes, output_dtypes)]
    # Register inputs as const (true) and outputs as writable (false).
    input_code = "".join(f"op.add(in{i}, true);\n" for i in range(len(inputs)))
    output_code = "".join(f"op.add(out{i}, false);\n" for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class EmbeddingACL(jt.Function):
    """Embedding lookup on Ascend (aclnnEmbedding / aclnnEmbeddingDenseBackward)."""

    def __init__(self):
        super(EmbeddingACL, self).__init__()

    def execute(self, indices, weight):
        self.indices = indices
        self.weight_shape = weight.shape
        # Output is indices.shape followed by the embedding feature dims.
        out_shape = list(indices.shape) + list(weight.shape[1:])
        attr_code = f"""
op.jt_name = "embedding";
"""
        return embedding_cmd("Embedding",
                             inputs=[weight, indices],
                             outputs=[jt.empty(out_shape, weight.dtype)],
                             attr_code=attr_code)[0]

    def grad(self, grad_output):
        attr_code = f"""
op.jt_name = "embeddingbackward";
EmbeddingAttr *attr = new EmbeddingAttr();
attr->numEmbeddings = {self.weight_shape[0]};
op.op_attr.reset(attr);
"""
        grad_weight = embedding_cmd("EmbeddingBackward",
                                    inputs=[grad_output, self.indices],
                                    outputs=[jt.empty(self.weight_shape, grad_output.dtype)],
                                    attr_code=attr_code)[0]
        # Integer indices get no gradient.
        return None, grad_weight

View File

@ -0,0 +1,82 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "embedding_op_acl.h"
namespace jittor
{
// Forward embedding lookup runner: wraps aclnnEmbedding.
EmbeddingOpRunner::EmbeddingOpRunner() : BaseOpRunner("Embedding")
{
}
// out[0] = in[0] (weight table) indexed by in[1] (indices).
void EmbeddingOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnEmbeddingGetWorkspaceSize(inputTensors[0], inputTensors[1], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnEmbedding(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbedding failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
// Backward runner: accumulates in[0] (upstream grad) into a dense weight
// gradient using in[1] (indices).
EmbeddingBackwardOpRunner::EmbeddingBackwardOpRunner() : BaseOpRunner("EmbeddingBackward")
{
}
void EmbeddingBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<EmbeddingAttr *>(op_attr.get());
auto numEmbeddings = attr->numEmbeddings;
// NOTE(review): the literal args `0, false` presumably map to paddingIdx and
// scaleGradByFreq; paddingIdx=0 would zero the gradient of row 0 — confirm
// against the aclnnEmbeddingDenseBackward signature.
ret = aclnnEmbeddingDenseBackwardGetWorkspaceSize(inputTensors[0], inputTensors[1], numEmbeddings, 0, false, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnEmbeddingDenseBackward(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnEmbeddingDenseBackward failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,25 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Forward embedding lookup runner declaration; see embedding_op_acl.cc.
class EmbeddingOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
EmbeddingOpRunner();
};
// Dense embedding-gradient runner (aclnnEmbeddingDenseBackward).
class EmbeddingBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
EmbeddingBackwardOpRunner();
};
}

View File

@ -0,0 +1,58 @@
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "expand_op_acl.h"
namespace jittor
{
// Runner wrapping the ACL broadcast/expand kernel (aclnnExpand).
// NOTE(review): registered under the name "ternary" — looks like a copy-paste
// from another runner; kept unchanged in case the name is used as a dispatch
// or logging key elsewhere.
ExpandOpRunner::ExpandOpRunner() : BaseOpRunner("ternary")
{
use_nchw = false;
}
// Broadcasts inputTensors[0] to outputShapes[0], writing outputTensors[0].
void ExpandOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
// Target shape handed to ACL as an aclIntArray; must be destroyed on every
// exit path.
aclIntArray *size = nullptr;
size = aclCreateIntArray(&outputShapes[0][0], outputShapes[0].size());
ret = aclnnExpandGetWorkspaceSize(inputTensors[0], size, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnExpand(workspaceAddr, workspaceSize, executor, aclstream);
// Fix: release `size` on the failure path as well — the early return inside
// CHECK_RET previously leaked the aclIntArray.
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnExpand failed. ERROR: %d\n", name.c_str(), ret); aclDestroyIntArray(size); return);
aclDestroyIntArray(size);
return;
}
}

View File

@ -0,0 +1,14 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Broadcast/expand runner declaration; see expand_op_acl.cc.
struct ExpandOpRunner : public BaseOpRunner
{
ExpandOpRunner();
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
};
}

View File

@ -0,0 +1,209 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def flashattention_cmd(name: str,
                       inputs: list,
                       output_dtypes: list = None,
                       output_shapes: list = None,
                       attr_code: str = "",
                       attr_header: str = "",
                       outputs: list = None):
    """Dispatch a FlashAttention-family ACL op through ``jt.code``.

    Outputs are either supplied directly via ``outputs`` or allocated from the
    parallel ``output_dtypes``/``output_shapes`` lists. ``attr_code`` is C++
    source that installs the runner's attribute struct.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [jt.empty(s, d) for s, d in zip(output_shapes, output_dtypes)]
    # Register inputs as const (true) and outputs as writable (false).
    input_code = "".join(f"op.add(in{i}, true);\n" for i in range(len(inputs)))
    output_code = "".join(f"op.add(out{i}, false);\n" for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class FlashAttentionACL(jt.Function):
    """FlashAttention on Ascend via aclnnFlashAttentionScoreV2 (+GradV2).

    Supports BSH / SBH / BSND / BNSD layouts. Optional masks (realshift,
    dropMask, paddingMask, attenMask) default to zero/one placeholder tensors
    that are passed to the kernel but flagged as absent via has* attributes.
    """

    def __init__(self,
                 headnum,
                 layout="BNSD",
                 prefix=None,
                 qstart=None,
                 kvstart=None,
                 scale=1.0,
                 prob=1.0,
                 pretokens=2147483647,
                 nexttokens=2147483647,
                 innerprecise=0,
                 sparsemode=0,
                 psetype=1):
        self.headnum = headnum
        self.layout = layout
        self.scale = scale
        self.prob = prob
        self.pretokens = pretokens
        self.nexttokens = nexttokens
        self.innerprecise = innerprecise
        self.sparsemode = sparsemode
        self.psetype = psetype
        self.prefix = prefix
        self.qstart = qstart
        self.kvstart = kvstart

    def _build_attr_code(self, jt_name):
        """Render the C++ attribute-setup snippet shared by fwd and bwd."""
        return f"""
op.jt_name = "{jt_name}";
FlashAttentionAttr *attr = new FlashAttentionAttr();
attr->scale = {self.scale};
attr->keepProb = {self.prob};
attr->preToken = {self.pretokens};
attr->nextToken = {self.nexttokens};
attr->headNum = {self.headnum};
attr->inputLayout = "{self.layout}";
attr->innerPrecise = {self.innerprecise};
attr->sparseMode = {self.sparsemode};
attr->psetype = {self.psetype};
attr->prefix = {{ {", ".join(map(str, self.prefix))} }};
attr->qStartIdx = {{ {", ".join(map(str, self.qstart))} }};
attr->kvStartIdx = {{ {", ".join(map(str, self.kvstart))} }};
attr->hasRealshift = {"true" if self.hasRealshift else "false"};
attr->hasDropmask = {"true" if self.hasDropmask else "false"};
attr->hasPaddingmask = {"true" if self.hasPaddingmask else "false"};
attr->hasAttentmask = {"true" if self.hasAttenmask else "false"};
op.op_attr.reset(attr);
"""

    def execute(
        self,
        q,
        k,
        v,
        realshift=None,
        dropMask=None,
        paddingMask=None,
        attenMask=None,
    ):
        if self.layout == 'BSH':
            B, SQ, H = q.shape
            SKV = k.shape[1]
            N = self.headnum
            # Fix: head dim must be an integer (was float division H / N).
            D = H // N
        elif self.layout == 'SBH':
            SQ, B, H = q.shape
            SKV = k.shape[0]
            N = self.headnum
            D = H // N  # fix: integer division, see above
        elif self.layout == 'BSND':
            B, SQ, N, D = q.shape
            SKV = k.shape[1]
        elif self.layout == 'BNSD':
            B, N, SQ, D = q.shape
            SKV = k.shape[2]
        else:
            raise ValueError(f"got invalid input layout {self.layout}")
        # Softmax max/sum statistics are (B, N, SQ, 8) float tensors.
        output_shape = (B, N, SQ, 8)
        self.q = q
        self.k = k
        self.v = v
        self.prefix = self.prefix if self.prefix else [0 for _ in range(B)]
        self.qstart = self.qstart if self.qstart else [0 for _ in range(B)]
        self.kvstart = self.kvstart if self.kvstart else [0 for _ in range(B)]
        # Fix: identity checks instead of `not x == None` — `==` on Vars can
        # build an elementwise comparison rather than a Python bool.
        self.hasRealshift = realshift is not None
        self.hasDropmask = dropMask is not None
        self.hasPaddingmask = paddingMask is not None
        self.hasAttenmask = attenMask is not None
        # TODO: placeholder tensors are materialized even when the mask is
        # absent (the C++ runner then passes nullptr instead).
        self.realshift = realshift if realshift is not None else jt.zeros(B, N, SQ, SKV)
        self.dropMask = dropMask if dropMask is not None else jt.ones(B, N, SQ, SKV)
        self.paddingMask = paddingMask if paddingMask is not None else jt.zeros(
            B, N, SQ, SKV)
        self.attenMask = attenMask if attenMask is not None else jt.zeros(SQ, SKV)
        attr_code = self._build_attr_code("flashattention")
        inputs = [
            q, k, v, self.realshift, self.dropMask, self.paddingMask,
            self.attenMask
        ]
        result = flashattention_cmd(
            "FlashAttention",
            inputs,
            output_dtypes=["float", "float", q.dtype],
            output_shapes=[output_shape, output_shape, q.shape],
            attr_code=attr_code)
        # Cache softmax stats and the attention output for the backward pass.
        self.maxout = result[0]
        self.sumout = result[1]
        self.attenout = result[2]
        return self.attenout

    def grad(self, dy):
        attr_code = self._build_attr_code("flashattentionbackward")
        inputs = [
            self.q, self.k, self.v, dy, self.realshift, self.dropMask,
            self.paddingMask, self.attenMask, self.maxout, self.sumout,
            self.attenout
        ]
        # Returns (dq, dk, dv); the optional mask inputs get no gradient.
        result = flashattention_cmd(
            "FlashAttentionBackward",
            inputs,
            output_dtypes=[self.q.dtype, self.k.dtype, self.v.dtype],
            output_shapes=[self.q.shape, self.k.shape, self.v.shape],
            attr_code=attr_code)
        return result

View File

@ -0,0 +1,88 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flashattention_op_acl.h"
namespace jittor
{
// Forward FlashAttention runner: wraps aclnnFlashAttentionScoreV2.
FlashAttentionOpRunner::FlashAttentionOpRunner() : BaseOpRunner("FlashAttention")
{
}
// Inputs: [q, k, v, realshift, dropMask, paddingMask, attenMask].
// Outputs: [softmaxMax, softmaxSum, attentionOut].
void FlashAttentionOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
// NOTE(review): these aclIntArrays are never destroyed — leaked on every call.
auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
char *layout = const_cast<char *>(attr->inputLayout.data());
// Optional masks are passed only when flagged present; the padding-mask slot
// is always nullptr here (inputTensors[5] is ignored).
ret = aclnnFlashAttentionScoreV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], attr->hasRealshift ? inputTensors[3] : nullptr, attr->hasDropmask ? inputTensors[4] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[6] : nullptr, prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], nullptr, outputTensors[2], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFlashAttentionScoreV2(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreV2 failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
// Backward FlashAttention runner: wraps aclnnFlashAttentionScoreGradV2.
FlashAttentionBackwardOpRunner::FlashAttentionBackwardOpRunner() : BaseOpRunner("FlashAttentionBackward")
{
}
// Inputs: [q, k, v, dy, realshift, dropMask, paddingMask, attenMask,
// softmaxMax, softmaxSum, attentionOut]; outputs: [dq, dk, dv].
void FlashAttentionBackwardOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<FlashAttentionAttr *>(op_attr.get());
// NOTE(review): same aclIntArray leak as the forward runner.
auto prefix = aclCreateIntArray(attr->prefix.data(), attr->prefix.size());
auto qstart = aclCreateIntArray(attr->qStartIdx.data(), attr->qStartIdx.size());
auto kvstart = aclCreateIntArray(attr->kvStartIdx.data(), attr->kvStartIdx.size());
char *layout = const_cast<char *>(attr->inputLayout.data());
ret = aclnnFlashAttentionScoreGradV2GetWorkspaceSize(inputTensors[0], inputTensors[1], inputTensors[2], inputTensors[3], attr->hasRealshift ? inputTensors[4] : nullptr, attr->hasDropmask ? inputTensors[5] : nullptr, nullptr, attr->hasAttentmask ? inputTensors[7] : nullptr, inputTensors[8], inputTensors[9], nullptr, inputTensors[10], prefix, qstart, kvstart, attr->scale, attr->keepProb, attr->preToken, attr->nextToken, attr->headNum, layout, attr->innerPrecise, attr->sparseMode, attr->psetype, outputTensors[0], outputTensors[1], outputTensors[2], nullptr, &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFlashAttentionScoreGradV2(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlashAttentionScoreGradV2 failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Forward FlashAttention runner declaration; see flashattention_op_acl.cc.
class FlashAttentionOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FlashAttentionOpRunner();
};
// Backward FlashAttention runner producing dq/dk/dv.
class FlashAttentionBackwardOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FlashAttentionBackwardOpRunner();
};
}

View File

@ -0,0 +1,85 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def flip_cmd(name: str,
             inputs: list,
             output_dtypes: list = None,
             output_shapes: list = None,
             attr_code: str = "",
             attr_header: str = "",
             outputs: list = None):
    """Dispatch a flip-family ACL op through ``jt.code``.

    ``outputs`` may be supplied directly; otherwise the parallel
    ``output_dtypes``/``output_shapes`` lists are used to allocate them.
    ``attr_code`` is C++ injected to configure the runner.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [jt.empty(s, d) for s, d in zip(output_shapes, output_dtypes)]
    # Register inputs as const (true) and outputs as writable (false).
    input_code = "".join(f"op.add(in{i}, true);\n" for i in range(len(inputs)))
    output_code = "".join(f"op.add(out{i}, false);\n" for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class FlipACL(jt.Function):
    """Reverse a tensor along the given axes via aclnnFlip.

    Flip is an involution, so the backward pass re-applies the same flip to
    the upstream gradient (the attr_code is cached for reuse).
    """

    def __init__(self):
        super(FlipACL, self).__init__()

    def execute(self, input, dim):
        # Normalize `dim` into a list of axes (accepts int, tuple, or list).
        axes = list(dim) if type(dim) is tuple else dim
        if type(axes) is not list:
            axes = [axes]
        self.attr_code = f"""
op.jt_name = "flip";
ReduceAttr *attr = new ReduceAttr();
attr->axes = {{{', '.join(map(str, (list(axes))))}}};
attr->prod_dim = {len(axes)};
op.op_attr.reset(attr);
"""
        return flip_cmd("Flip", [input],
                        output_dtypes=[input.dtype],
                        output_shapes=[input.shape],
                        attr_code=self.attr_code)[0]

    def grad(self, grad_output):
        # d/dx flip(x) applied to the upstream grad is just another flip.
        return flip_cmd("Flip", [grad_output],
                        output_dtypes=[grad_output.dtype],
                        output_shapes=[grad_output.shape],
                        attr_code=self.attr_code)[0]

View File

@ -0,0 +1,58 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "flip_op_acl.h"
namespace jittor
{
// Runner wrapping the ACL tensor-reverse kernel (aclnnFlip).
FlipOpRunner::FlipOpRunner() : BaseOpRunner("Flip")
{
}
// out[0] = in[0] reversed along the axes stored in a ReduceAttr.
void FlipOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
// Reuses ReduceAttr for its `axes` vector (no dedicated flip attribute).
auto attr = dynamic_cast<ReduceAttr *>(op_attr.get());
// NOTE(review): `dim` is never released with aclDestroyIntArray — leaked.
auto dim = aclCreateIntArray(attr->axes.data(), attr->axes.size());
ret = aclnnFlipGetWorkspaceSize(inputTensors[0], dim, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFlip(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFlip failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Flip runner declaration; see flip_op_acl.cc.
class FlipOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FlipOpRunner();
};
}

View File

@ -0,0 +1,70 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def floor_cmd(name: str,
              inputs: list,
              output_dtypes: list = None,
              output_shapes: list = None,
              attr_code: str = "",
              attr_header: str = "",
              outputs: list = None):
    """Dispatch a floor-family ACL op through ``jt.code``.

    Outputs are either passed in via ``outputs`` or allocated from the matching
    ``output_dtypes``/``output_shapes`` lists; ``attr_code`` configures the
    generated runner.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [jt.empty(s, d) for s, d in zip(output_shapes, output_dtypes)]
    # Register inputs as const (true) and outputs as writable (false).
    input_code = "".join(f"op.add(in{i}, true);\n" for i in range(len(inputs)))
    output_code = "".join(f"op.add(out{i}, false);\n" for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class FloorIntACL(jt.Function):
    """Elementwise floor via aclnnFloor; its gradient is zero everywhere."""

    def __init__(self):
        super(FloorIntACL, self).__init__()

    def execute(self, input):
        # Remember the shape so grad() can emit a matching zero tensor.
        self.shape = input.shape
        return floor_cmd("Floor", [input],
                         output_dtypes=[input.dtype],
                         output_shapes=[input.shape],
                         attr_code="op.jt_name=\"floor\";")[0]

    def grad(self, grad_output):
        # floor is piecewise constant, so the derivative is 0 almost everywhere.
        return jt.zeros(self.shape, dtype=grad_output.dtype)

View File

@ -0,0 +1,56 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "floor_op_acl.h"
namespace jittor
{
// Runner wrapping the ACL elementwise floor kernel (aclnnFloor).
FloorOpRunner::FloorOpRunner() : BaseOpRunner("Floor")
{
}
// out[0] = floor(in[0]); no attributes are consumed.
void FloorOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
ret = aclnnFloorGetWorkspaceSize(inputTensors[0], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnFloor(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnFloor failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,16 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
// Floor runner declaration; see floor_op_acl.cc.
class FloorOpRunner : public BaseOpRunner
{
protected:
void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;
public:
FloorOpRunner();
};
}

View File

@ -0,0 +1,126 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def gather_scatter_cmd(name: str,
                       inputs: list,
                       output_dtypes: list = None,
                       output_shapes: list = None,
                       attr_code: str = "",
                       attr_header: str = "",
                       outputs: list = None):
    """Dispatch a gather/scatter-family ACL op through ``jt.code``.

    Outputs come from ``outputs`` when given, otherwise from the parallel
    ``output_dtypes``/``output_shapes`` lists. ``attr_code`` is C++ that
    installs the runner's attribute struct (GatherAttr / ScatterAttr).
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [jt.empty(s, d) for s, d in zip(output_shapes, output_dtypes)]
    # Register inputs as const (true) and outputs as writable (false).
    input_code = "".join(f"op.add(in{i}, true);\n" for i in range(len(inputs)))
    output_code = "".join(f"op.add(out{i}, false);\n" for i in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
class GatherACL(jt.Function):
    """Gather along a dimension via aclnnGather; backward scatter-adds the
    upstream gradient back into the input's positions."""

    def __init__(self):
        super(GatherACL, self).__init__()

    def execute(self, input, dim, index):
        self.dim = dim
        self.index = index
        # Fix: remember the INPUT's shape — the gradient w.r.t. `input` must
        # have input's shape, not index's shape (they differ in general).
        self.input_shape = input.shape
        attr_code = f"""
op.jt_name = "gather";
GatherAttr *attr = new GatherAttr();
attr->dim = {dim};
op.op_attr.reset(attr);
"""
        result = gather_scatter_cmd("Gather", [input, index],
                                    output_dtypes=[input.dtype],
                                    output_shapes=[index.shape],
                                    attr_code=attr_code)[0]
        return result

    def grad(self, grad_output):
        # Fix: scatter-add grad_output into zeros of the input's shape
        # (previously self.index.shape, which is only correct when the two
        # shapes coincide).
        tmp = jt.zeros(self.input_shape, dtype=grad_output.dtype)
        attr_code = f"""
op.jt_name = "scatter";
ScatterAttr *attr = new ScatterAttr();
attr->axis = {self.dim};
attr->reduction = {1};
op.op_attr.reset(attr);
"""
        grad_input = gather_scatter_cmd("Scatter",
                                        [tmp, self.index, grad_output],
                                        output_dtypes=[grad_output.dtype],
                                        output_shapes=[tmp.shape],
                                        attr_code=attr_code)[0]
        return grad_input
class ScatterACL(jt.Function):
    """Scatter ``src`` into ``input`` along an axis via aclnnScatter; the
    backward pass gathers the upstream gradient at the scattered positions."""

    def __init__(self):
        super(ScatterACL, self).__init__()

    def execute(self, input, dim, index, src, reduce='void'):
        self.dim = dim
        self.index = index
        self.reduce = reduce
        # Reduction code expected by the kernel: 0 = replace, 1 = add, 2 = mul.
        reduction = {'add': 1, 'mul': 2}.get(reduce, 0)
        attr_code = f"""
op.jt_name = "scatter";
ScatterAttr *attr = new ScatterAttr();
attr->axis = {dim};
attr->reduction = {reduction};
op.op_attr.reset(attr);
"""
        return gather_scatter_cmd("Scatter", [input, self.index, src],
                                  output_dtypes=[input.dtype],
                                  output_shapes=[input.shape],
                                  attr_code=attr_code)[0]

    def grad(self, grad_output):
        attr_code = f"""
op.jt_name = "gather";
GatherAttr *attr = new GatherAttr();
attr->dim = {self.dim};
op.op_attr.reset(attr);
"""
        # Gradient w.r.t. src: gather grad_output at the scattered positions.
        # NOTE(review): four gradients are returned for five forward args —
        # confirm jittor pads the missing `reduce` slot with None.
        grad_src = gather_scatter_cmd("Gather", [grad_output, self.index],
                                      output_dtypes=[grad_output.dtype],
                                      output_shapes=[self.index.shape],
                                      attr_code=attr_code)[0]
        return grad_output, None, None, grad_src

View File

@ -0,0 +1,80 @@
#pragma once
#include <acl/acl.h>
#include <acl/acl_op_compiler.h>
#include <Python.h>
#include <pystate.h>
#include <algorithm>
#include <queue>
#include <set>
#include "common.h"
#include "op.h"
#include "acl_jittor.h"
#include "ops/random_op.h"
#include "ops/reduce_op.h"
#include "ops/binary_op.h"
#include "ops/broadcast_to_op.h"
#include "ops/transpose_op.h"
#include "ops/array_op.h"
#include "ops/code_op.h"
#include "fused_op.h"
#include "ops/unary_op.h"
#include "ops/ternary_op.h"
#include "executor.h"
#include "misc/cuda_flags.h"
#include "mem/allocator.h"
#include "op_compiler.h"
#include "ops/op_register.h"
#include "opt/tuner_manager.h"
#include "utils/str_utils.h"
#include "aclnn/aclnn.h"
#include "gather_scatter_op_acl.h"
namespace jittor
{
// Gather runner: wraps aclnnGather.
GatherOpRunner::GatherOpRunner() : BaseOpRunner("Gather")
{
}
// out[0] = in[0] gathered along attr->dim using indices in in[1].
void GatherOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<GatherAttr *>(op_attr.get());
ret = aclnnGatherGetWorkspaceSize(inputTensors[0], attr->dim, inputTensors[1], outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnGather(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnGather failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
// Scatter runner: wraps aclnnScatter.
ScatterOpRunner::ScatterOpRunner() : BaseOpRunner("Scatter")
{
}
// out[0] = in[0] with in[2] (src) scattered along attr->axis at indices in[1],
// combined per attr->reduction (set by the Python wrapper: 0/1/2).
void ScatterOpRunner::executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it)
{
auto attr = dynamic_cast<ScatterAttr *>(op_attr.get());
ret = aclnnScatterGetWorkspaceSize(inputTensors[0], attr->axis, inputTensors[1], inputTensors[2], attr->reduction, outputTensors[0], &workspaceSize, &executor);
checkRet(ret);
if (workspaceSize > 0)
{
mallocWorkSpace(workspaceSize);
}
ret = aclnnScatter(workspaceAddr, workspaceSize, executor, aclstream);
CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("%s: aclnnScatter failed. ERROR: %d\n", name.c_str(), ret); return);
syncRun();
return;
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include "utils.h"
#include "base_op.h"
namespace jittor
{
    // Runner executing the ACL Gather operator; see the matching
    // definitions in gather_scatter_op_acl.cc.
    class GatherOpRunner : public BaseOpRunner
    {
    protected:
        // Launches aclnnGather; `it` is the op-function table entry
        // (unused by this runner's implementation).
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        GatherOpRunner();
    };

    // Runner executing the ACL Scatter operator.
    class ScatterOpRunner : public BaseOpRunner
    {
    protected:
        // Launches aclnnScatter; `it` is unused by the implementation.
        void executeOp(std::unordered_map<string, AclOpFunctions>::iterator &it) override;

    public:
        ScatterOpRunner();
    };
}

View File

@ -0,0 +1,419 @@
import os
from jittor_utils import env_or_try_find
import jittor_utils
import ctypes
import glob
import jittor.compiler as compiler
import jittor as jt
import math
import numpy as np
from typing import Union
from collections.abc import Sequence, Iterable
def getitem_cmd(name: str,
                inputs: list,
                output_dtypes: list = None,
                output_shapes: list = None,
                attr_code: str = "",
                attr_header: str = "",
                outputs: list = None):
    """Build and launch an ACL operator runner through jt.code.

    Either pass ready-made ``outputs`` Vars, or pass matching
    ``output_dtypes``/``output_shapes`` lists and the outputs are
    allocated with ``jt.empty``. ``attr_code`` is C++ snippet text run
    after the inputs/outputs are registered on the ``{name}OpRunner``.
    Returns the list of output Vars produced by ``jt.code``.
    """
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        outputs_ = [
            jt.empty(shape, dtype)
            for shape, dtype in zip(output_shapes, output_dtypes)
        ]
    # Register every input (true) and output (false) on the runner.
    input_code = "".join(f"op.add(in{k}, true);\n"
                         for k in range(len(inputs)))
    output_code = "".join(f"op.add(out{k}, false);\n"
                          for k in range(len(outputs_)))
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""")
def getitem_forward(name: str,
                    inputs: list,
                    output_dtypes: list = None,
                    output_shapes: list = None,
                    attr_code: str = "",
                    attr_header: str = "",
                    outputs: list = None,
                    extra_data: dict = None):
    """Launch an ACL operator via jt.code, forwarding ``extra_data``.

    Same contract as ``getitem_cmd`` plus ``extra_data``, which is passed
    through as jt.code's ``data`` dict so the generated C++ can read
    runtime scalars (e.g. slice start/stop/step).

    Fixes vs. the previous version:
      - ``extra_data`` defaulted to a shared mutable ``{}``; use None
        as the sentinel instead.
      - the generated cuda_src hardcoded ``op.add(out0, false);`` while
        ``output_code`` was built and then ignored, so only the first
        output was ever registered; now all outputs are registered
        (identical behavior for the single-output callers in this file).
    """
    if extra_data is None:
        extra_data = {}
    attr_header = "\nnamespace jittor{" + attr_header + "}\n"
    cuda_header = '''
#include "acl/aclops/aclops.h"
'''
    outputs_ = []
    if outputs is not None:
        outputs_ = outputs
    else:
        assert output_dtypes is not None
        assert output_shapes is not None
        assert len(output_dtypes) == len(output_shapes)
        for i in range(len(output_shapes)):
            outputs_.append(jt.empty(output_shapes[i], output_dtypes[i]))
    # Register every input (true) and output (false) on the runner.
    input_code = ''
    for i in range(len(inputs)):
        input_code += f"op.add(in{i}, true);\n"
    output_code = ''
    for i in range(len(outputs_)):
        output_code += f"op.add(out{i}, false);\n"
    return jt.code(outputs=outputs_,
                   inputs=inputs,
                   cuda_header=attr_header + cuda_header,
                   cuda_src=f"""
// aclop
{name}OpRunner op;
{input_code}
{output_code}
{attr_code}
op.run();""",
                   data=extra_data)
def caculate_shape(tensors):
    """Infer the (nested) shape of an index argument.

    Accepts a jt.Var (returns its shape), a Python scalar (shape ``[]``),
    or a nested list/tuple, whose shape is derived recursively from its
    first element; assumes the nesting is rectangular.

    Fix vs. the previous version: an empty list/tuple raised IndexError
    on ``tensors[0]``; it now yields shape ``[0]``.
    """
    if isinstance(tensors, jt.Var):
        # tensors = tensors[0]
        return tensors.shape
    elif isinstance(tensors, (int, float)):
        return []
    elif isinstance(tensors, (list, tuple)):
        # return [caculate_shape(tensor) for tensor in tensors]
        if len(tensors) == 0:
            # An empty sequence has one dimension of length 0.
            return [0]
        sub_shape = caculate_shape(tensors[0])
        return [len(tensors)] + sub_shape
    else:
        assert False, f"not implemented for {type(tensors)}"
def can_broadcast_and_shape(shape1, shape2):
    """Check whether two shapes are broadcast-compatible (NumPy rules).

    Args:
        shape1: shape of the first tensor (tuple or list).
        shape2: shape of the second tensor (tuple or list).

    Returns:
        A pair ``(can_broadcast, broadcast_shape)``: ``(True, shape)``
        with the broadcast result as a tuple when compatible, otherwise
        ``(False, None)``.
    """
    lhs, rhs = tuple(shape1), tuple(shape2)
    # Left-pad the shorter shape with 1s so both have the same rank.
    rank = max(len(lhs), len(rhs))
    lhs = (1, ) * (rank - len(lhs)) + lhs
    rhs = (1, ) * (rank - len(rhs)) + rhs
    merged = []
    for a, b in zip(lhs, rhs):
        if a == b or b == 1:
            merged.append(a)
        elif a == 1:
            merged.append(b)
        else:
            # Dimensions differ and neither is 1: not broadcastable.
            return False, None
    return True, tuple(merged)
class GetItemACL(jt.Function):
    """ACL (Huawei Ascend) implementation of tensor indexing.

    ``execute`` dispatches to one of three paths and records which one
    ran in ``self.type_`` so ``grad`` can mirror it:
      - 'mask':    boolean-mask selection via the MaskedSelect ACL op;
      - 'index':   advanced (integer/tensor) indexing via the Index ACL op;
      - 'slicev2': basic slicing via the SliceV2 ACL op.
    """

    def __init__(self):
        # Which forward path ran; consulted by grad().
        self.type_ = 'notype'

    def stride(self, x, dim):
        # Row-major stride of `x` at dimension `dim` (product of the
        # trailing dimension sizes). Only referenced from commented-out
        # code below.
        stride = 1
        for i in range(dim + 1, len(x.shape)):
            stride *= x.shape[i]
        return stride

    def execute(self, x, slices, return_x=None):
        # NOTE(review): `return_x` is accepted but never used.
        # --- Path 1: boolean mask ---------------------------------------
        if isinstance(slices, jt.Var) and slices.dtype == 'bool':
            # assert False, "not support bool type now"
            # TODO: optimize
            assert x.shape == slices.shape, "shape not match"
            output_len = slices.sum().item()
            # output = jt.empty((output_len,),dtype=x.dtype)
            # The kernel may write up to x.numel() elements; the result is
            # truncated to the true selected count afterwards.
            x_len = x.numel()
            output = jt.empty((x_len), dtype=x.dtype)
            outputs = [output]
            inputs = [x, slices]
            # print(inputs,outputs)
            # print(output.shape)
            self.mask = slices
            self.type_ = 'mask'
            attr_code = f"""
op.jt_name = "maskedselect";
"""
            result = getitem_cmd("MaskedSelect",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result = result[:output_len]
            result.sync()
            return result
        self.x_shape = x.shape
        # Normalize `slices` to a tuple and wrap negative integer indices.
        if not isinstance(slices, tuple):
            slices = (slices, )
        slices = list(slices)
        for i, s in enumerate(slices):
            if isinstance(s, int) and s < 0:
                slices[i] = s + x.shape[i]
        slices = tuple(slices)
        slices_list = list(slices)
        # if not isinstance(slices[0], slice):
        # check slices contains slice type
        contains_slice = False
        for s in slices:
            if not isinstance(s, jt.Var) and (isinstance(s, slice)
                                              or s == Ellipsis):
                contains_slice = True
                break
        # --- Path 2: advanced (integer / tensor) indexing ---------------
        if not contains_slice:
            indices = []
            output_shape = []
            slices_len = len(slices)
            # Broadcast all index arguments to a common shape; the result
            # shape is that broadcast shape followed by the untouched
            # trailing dimensions of x.
            boardcast_shape = caculate_shape(slices_list[0])
            for ii in range(1, len(slices)):
                dd, boardcast_shape = can_broadcast_and_shape(
                    boardcast_shape, caculate_shape(slices_list[ii]))
                assert dd is True, "can not broadcast"
            output_shape = boardcast_shape
            output_shape += x.shape[slices_len:]
            if output_shape == []:
                output_shape = [1]
            for ii in slices:
                indices.append(jt.Var(ii).int32())
            if isinstance(slices[0],
                          jt.Var) or isinstance(slices[0], int) or isinstance(
                              slices[0], list) or isinstance(slices[0], tuple):
                self.indices = indices
                inputs = [x] + indices
                attr_code = f"""
op.jt_name = "index";
"""
                self.type_ = 'index'
                result = getitem_cmd("Index",
                                     inputs=inputs,
                                     output_dtypes=[x.dtype],
                                     output_shapes=[output_shape],
                                     attr_code=attr_code)[0]
                result.sync()
                return result
        # Any other index type falls through to this assert.
        assert contains_slice, "slice type error"
        # --- Path 3: basic slicing (SliceV2) ----------------------------
        x_dim = len(x.shape)
        slices = list(slices)
        # Expand a single Ellipsis into the equivalent run of full slices.
        for s in slices:
            if not isinstance(s, jt.Var) and s == Ellipsis:
                slices = slices[:slices.index(s)] + [
                    slice(None, None, None)
                ] * (x_dim - len(slices) + 1) + slices[slices.index(s) + 1:]
                break
        slices = tuple(slices)
        # Pad with full slices so every dimension of x is covered.
        if len(slices) < x_dim:
            slices += (slice(None, None, None), ) * (x_dim - len(slices))
        inputs = [x]
        sizes = []
        begins = []
        ends = []
        steps = []
        dims = []
        squeeze_dims = []
        # Slice parameters travel to the kernel via jt.code's `data`
        # dict: "a" holds the number of sliced dims (-1 for none), and
        # keys str(dim*3), str(dim*3+1), str(dim*3+2) hold start/stop/step.
        extra_data = {}
        if len(slices):
            extra_data["a"] = len(slices)
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    # Integer index: a length-1 slice, squeezed afterwards.
                    s = slice(s, s + 1, 1)
                    squeeze_dims.append(dim)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(x.size(dim))
                size = (stop - start - 1) // step + 1
                # stride = self.stride(x, dim) * step
                sizes.append(size)
                extra_data[str(dim * 3)] = start
                extra_data[str(dim * 3 + 1)] = stop
                extra_data[str(dim * 3 + 2)] = step
                steps.append(step)
                begins.append(start)
                ends.append(stop)
                dims.append(dim)
        else:
            extra_data["a"] = -1
            sizes = [1]
            steps = [1]
        self.type_ = 'slicev2'
        # for backward
        self.begins = begins
        self.ends = ends
        self.steps = steps
        self.dims = dims
        self.slices = slices
        attr_code = """
op.jt_name = "slicev2";
StrideAttr *attr = new StrideAttr();
int slice_dim = data["a"];
if(slice_dim == -1) {
attr->begins = {};
attr->ends = {};
attr->steps = {1};
attr->axes = {};
} else {
vector<long int> begins;
vector<long int> ends;
vector<long int> steps;
vector<long int> dims;
for(int dim = 0; dim < slice_dim; dim++) {
dims.push_back(dim);
begins.push_back(data[std::to_string(dim*3)]);
ends.push_back(data[std::to_string(dim*3+1)]);
steps.push_back(data[std::to_string(dim*3+2)]);
}
attr->begins = begins;
attr->ends = ends;
attr->steps = steps;
attr->axes = dims;
}
op.op_attr.reset(attr);
"""
        result = getitem_forward("SliceV2",
                                 inputs,
                                 output_dtypes=[x.dtype],
                                 output_shapes=[jt.empty(sizes).shape],
                                 attr_code=attr_code,
                                 extra_data=extra_data)[0]
        self.squeeze_dims = squeeze_dims
        # Drop the length-1 dims produced by integer indices.
        for dim in squeeze_dims[::-1]:
            result = jt.squeeze(result, dim)
        result.sync()
        return result

    def grad(self, grad_output):
        """Backward pass; dispatches on the path recorded by execute()."""
        if self.type_ == 'index':
            # Scatter-accumulate grad_output back to the gathered positions
            # of a zero tensor shaped like the forward input.
            indices = self.indices
            inputs = [grad_output] + indices
            attr_code = f"""
op.jt_name = "indexputimplaccumulate";
"""
            outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
            # breakpoint()
            result = getitem_cmd("IndexPutImplAccumulate",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result.sync()
            return result, None
        elif self.type_ == 'slicev2':
            begins = self.begins
            ends = self.ends
            steps = self.steps
            dims = self.dims
            slices = self.slices
            # The forward pass may have squeezed dimensions; restore them
            # so grad_output matches the sliced shape again.
            for dim in self.squeeze_dims:
                grad_output = jt.unsqueeze(grad_output, dim)
            # Work around a Huawei ACL requirement that the step of the
            # last dimension must be 1: append an extra full dimension
            # (and unsqueeze grad_output) when it is not, stripping the
            # dimension from the result afterwards.
            expand_dim = False
            if isinstance(slices[-1], slice):
                if slices[-1].step is not None and slices[-1].step != 1:
                    slices = slices + (slice(None, None, None), )
                    expand_dim = True
            elif isinstance(slices[-1], int):
                # The last dimension is an integer index.
                slices = list(slices)
                slices[-1] = slice(slices[-1], slices[-1] + 1, 1)
                slices = tuple(slices)
                slices = slices + (slice(None, None, None), )
                expand_dim = True
            else:
                assert False, "not supported"
            # x = x.unsqueeze(-1)
            if expand_dim:
                grad_output = grad_output.unsqueeze(-1)
                self.x_shape = self.x_shape + (1, )
            sizes = []
            begins = []
            ends = []
            steps = []
            dims = []
            # Recompute per-dimension start/stop/step against the full
            # input shape, mirroring the forward pass.
            for dim, s in enumerate(slices):
                if isinstance(s, int):
                    s = slice(s, s + 1, 1)
                    # squeeze_dims.append(dim)
                if isinstance(s, jt.Var):
                    assert False, "jt.Var not supported"
                start, stop, step = s.indices(self.x_shape[dim])
                size = (stop - start - 1) // step + 1
                # stride = self.stride(x, dim) * step
                sizes.append(size)
                steps.append(step)
                begins.append(start)
                ends.append(stop)
                dims.append(dim)
            if not sizes:
                sizes = [1]
                steps = [1]
            attr_code = f"""
op.jt_name = "stridedsliceassignv2";
StrideAttr *attr = new StrideAttr();
attr->begins = {{ {", ".join(map(str, begins))} }};
attr->ends = {{ {", ".join(map(str, ends))} }};
attr->steps = {{ {", ".join(map(str, steps))} }};
attr->axes = {{ {", ".join(map(str, dims))} }};
op.op_attr.reset(attr);
"""
            inputs = [grad_output]
            # Gradient w.r.t. x: grad_output strided-assigned into a zero
            # tensor of the original input shape.
            outputs = [jt.zeros(self.x_shape, dtype=grad_output.dtype)]
            result = getitem_cmd("StridedSliceAssignV2",
                                 inputs=inputs,
                                 outputs=outputs,
                                 attr_code=attr_code)[0]
            result.sync()
            if expand_dim:
                result = result.squeeze(-1)
            return result, None
        elif self.type_ == 'mask':
            # NOTE(review): unlike the other branches this returns a single
            # value (the mask itself) rather than (grad, None), and it
            # ignores grad_output entirely -- confirm intended semantics.
            return self.mask.float()
            pass
        else:
            assert False, f"grad not implemented for {self.type_}"

Some files were not shown because too many files have changed in this diff Show More