Updated the run-task logic

This commit is contained in:
tzwang 2024-12-27 17:40:12 +08:00
parent 82bc8681c0
commit 3cf1521eca
10 changed files with 116 additions and 39 deletions

View File

@ -70,7 +70,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
utils.Convert(&req, &opt)
sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.JOINT_CLOUD_MODE, nil)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
return err

View File

@ -63,7 +63,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
return nil, err
}
// 3、Return scheduling results
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.JOINT_CLOUD_MODE, nil)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
return nil, err

View File

@ -24,7 +24,11 @@ func NewScheduleCancelTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext)
}
// ScheduleCancelTask looks up the task identified by req.TaskId.
//
// If the lookup fails, the lookup error is returned together with a nil
// response. Otherwise both results are nil: this block performs no
// cancellation work beyond verifying that the lookup succeeds.
func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) {
	// TODO: add the remaining cancel logic here; for now only the task lookup is done.
	// The looked-up task itself is discarded; only the error is inspected.
	if _, err = l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskId); err != nil {
		return nil, err
	}

	// resp is never assigned in this function, so a successful lookup yields (nil, nil).
	return nil, nil
}

View File

@ -6,12 +6,16 @@ import (
"errors"
"fmt"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gopkg.in/yaml.v2"
"strings"
)
type ScheduleRunTaskLogic struct {
@ -49,8 +53,9 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
return nil, err
}
_ = &option.AiOption{
opt := &option.AiOption{
AdapterId: ADAPTERID,
TaskName: task.Name,
}
// update assignedClusters
err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas)
@ -58,35 +63,52 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
return nil, err
}
//aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
//if err != nil {
// return nil, err
//}
//
//results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.STORAGE_SCHEDULE_MODE, clusters)
//if err != nil {
// return nil, err
//}
aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
if err != nil {
return nil, err
}
//adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(ADAPTERID)
//if err != nil {
// return nil, err
//}
//
//for _, i := range clusters {
// clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(i.ClusterID)
//
// opt := &option.AiOption{}
//
// err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, i.ClusterID, clusterName, "", constants.Saved, "")
// if err != nil {
// return nil, errors.New("database add failed: " + err.Error())
// }
//}
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_STORAGE_SCHEDULE, clusters)
if err != nil {
return nil, err
}
rs := (results).([]*schedulers.AiResult)
err = l.SaveResult(task, rs, opt)
if err != nil {
return nil, err
}
return
}
// SaveResult stores one AI-task record for every scheduling result and then
// adds a "create" notice for it.
//
// opt is mutated: before each save, opt.ComputeCard is overwritten with the
// upper-cased card name of the current result. Processing stops at the first
// error returned by the adapter-name lookup or by SaveAiTask, and that error is
// returned as-is; no rollback of earlier iterations is attempted in this
// function. It returns nil once every result has been processed.
func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error {
	for idx := range results {
		res := results[idx]

		// The saved record carries the card name in upper case.
		opt.ComputeCard = strings.ToUpper(res.Card)

		adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(res.AdapterId)
		if err != nil {
			return err
		}

		// The cluster-name lookup error is deliberately discarded here: whatever
		// name comes back is used for both the saved record and the notice.
		clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(res.ClusterId)

		if err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, res.ClusterId, clusterName, res.JobId, constants.Saved, res.Msg); err != nil {
			return err
		}

		// Notice text reads "task is being created"; it is only emitted after a successful save.
		l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(res.AdapterId, adapterName, res.ClusterId, clusterName, res.TaskName, "create", "任务创建中")
	}

	return nil
}
func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error {
for _, cluster := range *assignedClusters {
for _, data := range scheduledDatas {

View File

@ -52,7 +52,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
return nil, err
}
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
return nil, err
}

View File

@ -41,7 +41,7 @@ func (l *AiQueue) Consume(val string) error {
aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil)
// 调度算法
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
if err != nil {
return err
}

View File

@ -30,8 +30,8 @@ import (
)
const (
JOINT_CLOUD_MODE = iota + 1
STORAGE_SCHEDULE_MODE
SUBMIT_MODE_JOINT_CLOUD = iota + 1
SUBMIT_MODE_STORAGE_SCHEDULE
)
type Scheduler struct {
@ -134,7 +134,7 @@ func (s *Scheduler) TempAssign() error {
func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) {
var result interface{}
switch mode {
case JOINT_CLOUD_MODE:
case SUBMIT_MODE_JOINT_CLOUD:
//choose strategy
strategy, err := ss.PickOptimalStrategy()
if err != nil {
@ -155,7 +155,7 @@ func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters
result = resp
case STORAGE_SCHEDULE_MODE:
case SUBMIT_MODE_STORAGE_SCHEDULE:
//assign tasks to clusters
resp, err := ss.AssignTask(assignedClusters, mode)

View File

@ -175,7 +175,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int
opt, _ := cloneAiOption(as.option)
// decide opt params by mode
updateAiOptionByMode(c, opt, scheduler.STORAGE_SCHEDULE_MODE)
updateAiOptionByMode(c, opt, mode)
resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode)
if err != nil {
@ -282,7 +282,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int
func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) {
switch mode {
case scheduler.STORAGE_SCHEDULE_MODE:
case scheduler.SUBMIT_MODE_STORAGE_SCHEDULE:
opt.Cmd = cluster.Cmd
opt.Envs = cluster.Envs
opt.Params = cluster.Params
@ -290,6 +290,8 @@ func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOptio
opt.ImageId = cluster.ImageId
opt.AlgorithmId = cluster.CodeId
opt.DatasetsId = cluster.DatasetId
opt.ResourcesRequired = cluster.ResourcesRequired
default:
}

View File

@ -32,6 +32,8 @@ type AiOption struct {
AlgorithmCode string
Image string
Model interface{}
ResourcesRequired []map[string]interface{}
}
func (a AiOption) GetOptionType() string {

View File

@ -179,6 +179,7 @@ func (s *ShuguangAi) SubmitPytorchTask(ctx context.Context, imageId string, cmd
workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2]
codePath = workPath + FORWARD_SLASH + TRAIN_FILE
} else {
// storage schedule submit mode
codePath = algorithmId
paths = strings.Split(algorithmId, FORWARD_SLASH)
last := paths[len(paths)-1]
@ -602,10 +603,56 @@ func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*colle
}
func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
err := s.GenerateSubmitParams(ctx, option)
if err != nil {
return nil, err
switch mode {
case 1:
err := s.GenerateSubmitParams(ctx, option)
if err != nil {
return nil, err
}
case 2:
var dcuNum int64
for _, res := range option.ResourcesRequired {
typeName, ok := res["type"]
if !ok {
continue
}
switch typeName {
case DCU:
num, ok := res["number"]
if !ok {
continue
}
n := common.ConvertTypeToString(num)
val, err := strconv.ParseInt(n, 10, 64)
if err != nil {
return nil, err
}
dcuNum = val
}
}
for k, v := range RESOURCESGAIMAP {
if dcuNum == v.GPU {
option.ResourceId = k
break
}
if dcuNum == 0 && v.GPU == 1 {
option.ResourceId = k
break
}
if dcuNum >= 5 && v.GPU == 5 {
option.ResourceId = k
break
}
}
option.ComputeCard = DCU
default:
return nil, errors.New("failed to choose submit mode")
}
task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
if err != nil {
return nil, err