存算联动提交接口修改

This commit is contained in:
tzwang 2023-10-27 17:08:39 +08:00
parent 0ad85477e4
commit 6b5a30defe
4 changed files with 33 additions and 7 deletions

View File

@ -2,11 +2,13 @@ package storeLink
import (
"context"
"errors"
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils/timeutils"
"gitlink.org.cn/jcce-pcm/pcm-participant-ac/hpcAC"
"strings"
"time"
)
@ -16,6 +18,16 @@ type ShuguangAi struct {
participant *models.ScParticipantPhyInfo
}
// Worker sizing passed to the Worker* fields of the PyTorch task request
// built in SubmitTask.
const (
	WORKER_NUMBER     = 1
	WORKER_CPU_NUMBER = 5
	WORKER_GPU_NUMBER = 1
	WORKER_RAM_SIZE   = 10 * 1024 // 10G (presumably expressed in MB — confirm against the hpcAC API)
)

// Tokens used when assembling the Python argument string ("--key=value ").
const (
	PY_PARAM_PREFIX = "--"
	SPACE           = " "
)

// SHUGUANGAI_RESOURCE_ID is the resource ID that SubmitTask compares its
// resourceId argument against; it is also published as the SpecId of the
// resource spec returned by ConvertType.
const SHUGUANGAI_RESOURCE_ID = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
// NewShuguangAi returns a ShuguangAi that holds the given request context,
// service context and participant record for use by its methods.
func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, participant *models.ScParticipantPhyInfo) *ShuguangAi {
	ai := new(ShuguangAi)
	ai.ctx = ctx
	ai.svcCtx = svcCtx
	ai.participant = participant
	return ai
}
@ -51,6 +63,11 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) {
func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, resourceId string) (interface{}, error) {
// shuguangAi提交任务
//判断是否resourceId匹配自定义资源Id
if resourceId != SHUGUANGAI_RESOURCE_ID {
return nil, errors.New("shuguangAi资源Id不存在")
}
//根据imageId获取imagePath, version
imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId}
imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq)
@ -60,6 +77,13 @@ func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, res
dateStr := timeutils.UnixTimeToString(time.Now().Unix())
// Build the Python argument string. Each entry of params is split on COMMA
// into a key and a value and rendered as "--key=value ".
var pythonArg string
for _, param := range params {
	// Named kv rather than s so the method receiver s is not shadowed.
	kv := strings.Split(param, COMMA)
	// Without the separator Split yields a single element and kv[1] would
	// panic with index out of range; reject the malformed param instead.
	if len(kv) < 2 {
		return nil, errors.New("invalid python param (expected key" + COMMA + "value): " + param)
	}
	pythonArg += PY_PARAM_PREFIX + kv[0] + "=" + kv[1] + SPACE
}
req := &hpcAC.SubmitPytorchTaskReq{
Params: &hpcAC.SubmitPytorchTaskParams{
TaskName: TASK_PYTORCH_PREFIX + "_" + utils.RandomString(7) + dateStr,
@ -70,13 +94,14 @@ func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, res
AcceleratorType: DCU,
Version: imageResp.Image.Version,
ImagePath: imageResp.Image.Path,
WorkerNumber: 1,
WorkerCpuNumber: "1",
WorkerGpuNumber: 1,
WorkerRamSize: 1024,
WorkerNumber: WORKER_NUMBER,
WorkerCpuNumber: WORKER_CPU_NUMBER,
WorkerGpuNumber: WORKER_GPU_NUMBER,
WorkerRamSize: WORKER_RAM_SIZE,
ResourceGroup: RESOURCE_GROUP,
TimeoutLimit: TimeoutLimit,
PythonCodePath: PythonCodePath,
PythonArg: pythonArg,
},
}
resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)

View File

@ -315,6 +315,7 @@ func ConvertType[T any](in *T, participant *models.ScParticipantPhyInfo) (interf
resp.Success = true
spec.ParticipantName = participant.Name
spec.ParticipantId = strconv.FormatInt(participant.Id, 10)
spec.SpecId = SHUGUANGAI_RESOURCE_ID
resp.ResourceSpecs = append(resp.ResourceSpecs, &spec)
}
return resp, nil

2
go.mod
View File

@ -21,7 +21,7 @@ require (
github.com/shopspring/decimal v1.3.1
github.com/zeromicro/go-queue v1.1.8
github.com/zeromicro/go-zero v1.5.5
gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231026084523-f76f3da5525d
gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835
gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef
gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1

4
go.sum
View File

@ -1033,8 +1033,8 @@ github.com/zeromicro/go-zero v1.4.3/go.mod h1:UmDjuW7LHd9j7+nnnPBcXF0HLNmjJw6OjH
github.com/zeromicro/go-zero v1.5.1/go.mod h1:bGYm4XWsGN9GhDsO2O2BngpVoWjf3Eog2a5hUOMhlXs=
github.com/zeromicro/go-zero v1.5.3 h1:9poyd+raeL7gSMUu6P19N7bssTppieR2j7Oos2j1yFQ=
github.com/zeromicro/go-zero v1.5.3/go.mod h1:dmoBpgJTxt9KWmgrNGpv06XxZRPXMakrxUVgROFAR3g=
gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231026084523-f76f3da5525d h1:CY4pWM8JVRXBtD5CdVZC0fe4xUxjHmQegdwpHBaOBes=
gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231026084523-f76f3da5525d/go.mod h1:DY45tXlPBWBptj9YjCHWnAK5LshvJ33PjFkE5/vtd4o=
gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31 h1:SppjTZvObJgqliPk1wSeuezQu1k/tMGcyVaMVEaDIUU=
gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31/go.mod h1:DY45tXlPBWBptj9YjCHWnAK5LshvJ33PjFkE5/vtd4o=
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835 h1:WDCPqD8IrepGJXankkpG14Ny6inh9AldB0RX9WWa+ck=
gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835/go.mod h1:r/KLzUpupCV5jdxSfgDhc2pVjP0fBi3VhAWRttsBn30=
gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef h1:s7JfXjka2MhGaDjKMJ57fj0k3XuDB6w+UlYHFLyJlUY=