存算联动调整2

This commit is contained in:
tzwang 2023-12-07 18:05:34 +08:00
parent eb90e99d47
commit 8bdfaa2320
2 changed files with 126 additions and 24 deletions

View File

@ -6,6 +6,7 @@ import (
"gitlink.org.cn/jcce-pcm/pcm-ac/hpcAC"
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
"strings"
)
@ -16,9 +17,95 @@ type ShuguangHpc struct {
}
// Fixed values used when assembling a Shuguang HPC job-submission request.
const (
	// Identifiers and naming.
	SHUGUANGHPC_CUSTOM_RESOURCE_ID = "10240 // 10G"
	TASK_SHUGUANG_PREFIX           = "ShuguangHPC"
	JOBNAME                        = "JOBNAME" // placeholder token that appears in the path templates below

	// Text fragments used when rendering environment variables.
	NEWLINE = "\n"
	EXPORT  = "export"

	// Application, job manager and queue settings.
	Apptype           = "BASIC"
	StrJobManagerID   = 1637920656
	GAP_APPNAME       = "BASE"
	GAP_QUEUE         = "wzhdtest"
	GAP_CMD_FILE      = "cmd"
	GAP_WALL_TIME_24H = "24:00:00"

	// Path templates; each contains the JOBNAME token.
	GAP_WORK_DIR     = "/work/home/acgnnmfbwo/BASE/JOBNAME"
	GAP_STD_OUT_FILE = "/work/home/acgnnmfbwo/BASE/JOBNAME/std.out.%j"
	GAP_STD_ERR_FILE = "/work/home/acgnnmfbwo/BASE/JOBNAME/std.err.%j"

	// Default resource values.
	GAP_NNODE       = "1" // number of nodes
	GAP_NPROC       = "1"
	GAP_NDCU        = "1"
	GAP_NODE_STRING = ""
	GAP_EXCLUSIVE   = ""
	GAP_PPN         = ""
	GAP_NGPU        = ""
	GAP_MULTI_SUB   = ""
)
// RESOURCEMAP maps a resource id to the node, process and DCU counts
// that are written into a job-submission request for that id.
var RESOURCEMAP = map[string]ResourceSpec{
	// Single-node specifications.
	"Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": {GAP_NNODE: "1", GAP_NPROC: "2", GAP_NDCU: "1"},
	"uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": {GAP_NNODE: "1", GAP_NPROC: "4", GAP_NDCU: "2"},
	"D71OZQYrRabJc2nfL2GDWOdLEfbiMzYH": {GAP_NNODE: "1", GAP_NPROC: "8", GAP_NDCU: "4"},
	"sXUMrGmgMDFJaLi6dPiB9LkHjFb3lvL5": {GAP_NNODE: "1", GAP_NPROC: "16", GAP_NDCU: "4"},
	"ZfCKQKbNbQl9RPwlSyWLah1Gf7Ti7uJA": {GAP_NNODE: "1", GAP_NPROC: "32", GAP_NDCU: "4"},

	// Two-node specifications.
	"cfEI4ulTNo2gYUozzdG59URByUjwLl3x": {GAP_NNODE: "2", GAP_NPROC: "4", GAP_NDCU: "2"},
	"vtbkaks8bErhpLRkUDiPDUHq6ssotFpD": {GAP_NNODE: "2", GAP_NPROC: "8", GAP_NDCU: "4"},
	"QJXZFJSReVWWQfkvQjGyEq1JpDHN55Oh": {GAP_NNODE: "2", GAP_NPROC: "16", GAP_NDCU: "4"},
	"79xSdy48yLbVLl9DqEV6tQ2J6jaHe5KO": {GAP_NNODE: "2", GAP_NPROC: "32", GAP_NDCU: "8"},
}
// RESOURCESPECS maps a resource id to a human-readable summary of the
// specification (node count, CPU count, DCU count).
var RESOURCESPECS = map[string]string{
	// Single-node specifications.
	"Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": "1*NODE, CPU:2, 1*DCU",
	"uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": "1*NODE, CPU:4, 2*DCU",
	"D71OZQYrRabJc2nfL2GDWOdLEfbiMzYH": "1*NODE, CPU:8, 4*DCU",
	"sXUMrGmgMDFJaLi6dPiB9LkHjFb3lvL5": "1*NODE, CPU:16, 4*DCU",
	"ZfCKQKbNbQl9RPwlSyWLah1Gf7Ti7uJA": "1*NODE, CPU:32, 4*DCU",

	// Two-node specifications.
	"cfEI4ulTNo2gYUozzdG59URByUjwLl3x": "2*NODE, CPU:4, 2*DCU",
	"vtbkaks8bErhpLRkUDiPDUHq6ssotFpD": "2*NODE, CPU:8, 4*DCU",
	"QJXZFJSReVWWQfkvQjGyEq1JpDHN55Oh": "2*NODE, CPU:16, 4*DCU",
	"79xSdy48yLbVLl9DqEV6tQ2J6jaHe5KO": "2*NODE, CPU:32, 8*DCU",
}
// ResourceSpec holds the per-resource values that are copied into the
// matching fields of a job-submission request. All values are kept as
// strings.
type ResourceSpec struct {
	// GAP_NNODE is the number of nodes.
	GAP_NNODE string
	// GAP_NPROC is the process count (listed as "CPU" in RESOURCESPECS).
	GAP_NPROC string
	// GAP_NDCU is the number of DCUs.
	GAP_NDCU string
}
// NewShuguangHpc returns a ShuguangHpc that holds the given request context,
// service context and participant.
func NewShuguangHpc(ctx context.Context, svcCtx *svc.ServiceContext, participant *models.StorelinkCenter) *ShuguangHpc {
	hpc := new(ShuguangHpc)
	hpc.ctx = ctx
	hpc.svcCtx = svcCtx
	hpc.participant = participant
	return hpc
}
@ -39,7 +126,8 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param
// shuguangHpc提交任务
//判断是否resourceId匹配自定义资源Id
if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
_, isMapContainsKey := RESOURCESPECS[resourceId]
if !isMapContainsKey {
return nil, errors.New("shuguangHpc资源Id不存在")
}
@ -47,37 +135,43 @@ func (s ShuguangHpc) SubmitTask(imageId string, cmd string, envs []string, param
var env string
for _, e := range envs {
s := strings.Split(e, COMMA)
env += s[0] + "=" + s[1] + SPACE
env += EXPORT + SPACE + s[0] + EQUAL + s[1] + NEWLINE
}
//请求
taskName := TASK_SHUGUANG_PREFIX + UNDERSCORE + UNDERSCORE + utils.RandomString(10)
GAP_WORK_DIR := strings.Replace(GAP_WORK_DIR, JOBNAME, taskName, -1)
GAP_STD_OUT_FILE := strings.Replace(GAP_STD_OUT_FILE, JOBNAME, taskName, -1)
GAP_STD_ERR_FILE := strings.Replace(GAP_STD_ERR_FILE, JOBNAME, taskName, -1)
req := &hpcAC.SubmitJobReq{
Apptype: "",
Appname: "",
StrJobManagerID: 0,
Apptype: Apptype,
Appname: GAP_APPNAME,
StrJobManagerID: StrJobManagerID,
MapAppJobInfo: &hpcAC.MapAppJobInfo{
GAP_CMD_FILE: "echo $TESTDIR; echo $TESTENV; sleep 30",
GAP_NNODE: "1",
GAP_NODE_STRING: "",
GAP_SUBMIT_TYPE: "cmd",
GAP_JOB_NAME: "testSlurmjob1",
GAP_WORK_DIR: "/work/home/acgnnmfbwo/BASE/testSlurmjob1",
GAP_QUEUE: "wzhdtest",
GAP_NPROC: "1",
GAP_PPN: "",
GAP_NGPU: "",
GAP_NDCU: "1",
GAP_WALL_TIME: "01:00:00",
GAP_EXCLUSIVE: "",
GAP_APPNAME: "BASE",
GAP_MULTI_SUB: "",
GAP_STD_OUT_FILE: "/work/home/acgnnmfbwo/BASE/testSlurmjob1/std.out.%j",
GAP_STD_ERR_FILE: "/work/home/acgnnmfbwo/BASE/testSlurmjob1/std.err.%j",
GAP_SCHEDULER_OPT_WEB: "export TESTDIR=/bin/emacs\nexport TESTENV=12345",
GAP_CMD_FILE: cmd,
GAP_NNODE: GAP_NNODE,
GAP_NODE_STRING: GAP_NODE_STRING,
GAP_SUBMIT_TYPE: GAP_CMD_FILE,
GAP_JOB_NAME: taskName,
GAP_WORK_DIR: GAP_WORK_DIR,
GAP_QUEUE: GAP_QUEUE,
GAP_NPROC: GAP_NPROC,
GAP_PPN: GAP_PPN,
GAP_NGPU: GAP_NGPU,
GAP_NDCU: GAP_NDCU,
GAP_WALL_TIME: GAP_WALL_TIME_24H,
GAP_EXCLUSIVE: GAP_EXCLUSIVE,
GAP_APPNAME: GAP_APPNAME,
GAP_MULTI_SUB: GAP_MULTI_SUB,
GAP_STD_OUT_FILE: GAP_STD_OUT_FILE,
GAP_STD_ERR_FILE: GAP_STD_ERR_FILE,
GAP_SCHEDULER_OPT_WEB: env,
},
}
updateRequestByResouceId(resourceId, req)
resp, err := s.svcCtx.ACRpc.SubmitJob(s.ctx, req)
if err != nil {
return nil, err
@ -107,3 +201,10 @@ func (s ShuguangHpc) DeleteTask(taskId string) (interface{}, error) {
//TODO implement me
panic("implement me")
}
// updateRequestByResouceId overrides the node, process and DCU counts in
// req.MapAppJobInfo with the values registered in RESOURCEMAP for resourceId.
//
// If resourceId has no entry in RESOURCEMAP, req is left untouched. Without
// this check the lookup yields a zero-value ResourceSpec and the values the
// caller already placed in the request would be replaced by empty strings.
// A nil req or a nil req.MapAppJobInfo is also ignored instead of panicking.
func updateRequestByResouceId(resourceId string, req *hpcAC.SubmitJobReq) {
	if req == nil || req.MapAppJobInfo == nil {
		return
	}

	spec, found := RESOURCEMAP[resourceId]
	if !found {
		// Unknown id: keep whatever the caller has already set.
		return
	}

	info := req.MapAppJobInfo
	info.GAP_NNODE = spec.GAP_NNODE
	info.GAP_NPROC = spec.GAP_NPROC
	info.GAP_NDCU = spec.GAP_NDCU
}

View File

@ -41,6 +41,7 @@ const (
PY_PARAM_PREFIX = "--"
SPACE = " "
UNDERSCORE = "_"
EQUAL = "="
COMMA = ","
TYPE_OCTOPUS = "1"
TYPE_MODELARTS = "2"