forked from JointCloud/pcm-coordinator
updated runtasklogic
This commit is contained in:
parent
82bc8681c0
commit
3cf1521eca
|
@ -70,7 +70,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
|
|||
utils.Convert(&req, &opt)
|
||||
sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient)
|
||||
|
||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.JOINT_CLOUD_MODE, nil)
|
||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||
if err != nil {
|
||||
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
|
||||
return err
|
||||
|
|
|
@ -63,7 +63,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
|
|||
return nil, err
|
||||
}
|
||||
// 3、Return scheduling results
|
||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.JOINT_CLOUD_MODE, nil)
|
||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||
if err != nil {
|
||||
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
|
||||
return nil, err
|
||||
|
|
|
@ -24,7 +24,11 @@ func NewScheduleCancelTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext)
|
|||
}
|
||||
|
||||
func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) {
|
||||
// todo: add your logic here and delete this line
|
||||
// find task
|
||||
_, err = l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskId)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
|
|
@ -6,12 +6,16 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"github.com/zeromicro/go-zero/core/logx"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
||||
"gopkg.in/yaml.v2"
|
||||
"strings"
|
||||
)
|
||||
|
||||
type ScheduleRunTaskLogic struct {
|
||||
|
@ -49,8 +53,9 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
|
|||
return nil, err
|
||||
}
|
||||
|
||||
_ = &option.AiOption{
|
||||
opt := &option.AiOption{
|
||||
AdapterId: ADAPTERID,
|
||||
TaskName: task.Name,
|
||||
}
|
||||
// update assignedClusters
|
||||
err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas)
|
||||
|
@ -58,35 +63,52 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
|
|||
return nil, err
|
||||
}
|
||||
|
||||
//aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
|
||||
//if err != nil {
|
||||
// return nil, err
|
||||
//}
|
||||
//
|
||||
//results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.STORAGE_SCHEDULE_MODE, clusters)
|
||||
//if err != nil {
|
||||
// return nil, err
|
||||
//}
|
||||
aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
//adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(ADAPTERID)
|
||||
//if err != nil {
|
||||
// return nil, err
|
||||
//}
|
||||
//
|
||||
//for _, i := range clusters {
|
||||
// clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(i.ClusterID)
|
||||
//
|
||||
// opt := &option.AiOption{}
|
||||
//
|
||||
// err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, i.ClusterID, clusterName, "", constants.Saved, "")
|
||||
// if err != nil {
|
||||
// return nil, errors.New("database add failed: " + err.Error())
|
||||
// }
|
||||
//}
|
||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_STORAGE_SCHEDULE, clusters)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
rs := (results).([]*schedulers.AiResult)
|
||||
|
||||
err = l.SaveResult(task, rs, opt)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error {
|
||||
|
||||
for _, r := range results {
|
||||
|
||||
opt.ComputeCard = strings.ToUpper(r.Card)
|
||||
|
||||
adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(r.AdapterId)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId)
|
||||
|
||||
err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(r.AdapterId, adapterName, r.ClusterId, clusterName, r.TaskName, "create", "任务创建中")
|
||||
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
}
|
||||
|
||||
func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error {
|
||||
for _, cluster := range *assignedClusters {
|
||||
for _, data := range scheduledDatas {
|
||||
|
|
|
@ -52,7 +52,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
|
|||
return nil, err
|
||||
}
|
||||
|
||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
|
||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
@ -41,7 +41,7 @@ func (l *AiQueue) Consume(val string) error {
|
|||
aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil)
|
||||
|
||||
// 调度算法
|
||||
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
|
||||
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -30,8 +30,8 @@ import (
|
|||
)
|
||||
|
||||
// Submit modes accepted by Scheduler.AssignAndSchedule and forwarded to the
// sub-schedulers and executors.
//
// The numeric values must stay 1 and 2: ShuguangAi.Execute switches on these
// literal numbers.
const (
	// SUBMIT_MODE_JOINT_CLOUD picks a strategy and schedules across clusters.
	SUBMIT_MODE_JOINT_CLOUD = iota + 1
	// SUBMIT_MODE_STORAGE_SCHEDULE submits to clusters that were already
	// assigned by the caller.
	SUBMIT_MODE_STORAGE_SCHEDULE
)

// Former names of the submit modes, kept as aliases with identical values so
// existing call sites keep compiling.
const (
	// Deprecated: use SUBMIT_MODE_JOINT_CLOUD instead.
	JOINT_CLOUD_MODE = SUBMIT_MODE_JOINT_CLOUD
	// Deprecated: use SUBMIT_MODE_STORAGE_SCHEDULE instead.
	STORAGE_SCHEDULE_MODE = SUBMIT_MODE_STORAGE_SCHEDULE
)
|
||||
|
||||
type Scheduler struct {
|
||||
|
@ -134,7 +134,7 @@ func (s *Scheduler) TempAssign() error {
|
|||
func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) {
|
||||
var result interface{}
|
||||
switch mode {
|
||||
case JOINT_CLOUD_MODE:
|
||||
case SUBMIT_MODE_JOINT_CLOUD:
|
||||
//choose strategy
|
||||
strategy, err := ss.PickOptimalStrategy()
|
||||
if err != nil {
|
||||
|
@ -155,7 +155,7 @@ func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters
|
|||
|
||||
result = resp
|
||||
|
||||
case STORAGE_SCHEDULE_MODE:
|
||||
case SUBMIT_MODE_STORAGE_SCHEDULE:
|
||||
|
||||
//assign tasks to clusters
|
||||
resp, err := ss.AssignTask(assignedClusters, mode)
|
||||
|
|
|
@ -175,7 +175,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int
|
|||
opt, _ := cloneAiOption(as.option)
|
||||
|
||||
// decide opt params by mode
|
||||
updateAiOptionByMode(c, opt, scheduler.STORAGE_SCHEDULE_MODE)
|
||||
updateAiOptionByMode(c, opt, mode)
|
||||
|
||||
resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode)
|
||||
if err != nil {
|
||||
|
@ -282,7 +282,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int
|
|||
|
||||
func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) {
|
||||
switch mode {
|
||||
case scheduler.STORAGE_SCHEDULE_MODE:
|
||||
case scheduler.SUBMIT_MODE_STORAGE_SCHEDULE:
|
||||
opt.Cmd = cluster.Cmd
|
||||
opt.Envs = cluster.Envs
|
||||
opt.Params = cluster.Params
|
||||
|
@ -290,6 +290,8 @@ func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOptio
|
|||
opt.ImageId = cluster.ImageId
|
||||
opt.AlgorithmId = cluster.CodeId
|
||||
opt.DatasetsId = cluster.DatasetId
|
||||
|
||||
opt.ResourcesRequired = cluster.ResourcesRequired
|
||||
default:
|
||||
|
||||
}
|
||||
|
|
|
@ -32,6 +32,8 @@ type AiOption struct {
|
|||
AlgorithmCode string
|
||||
Image string
|
||||
Model interface{}
|
||||
|
||||
ResourcesRequired []map[string]interface{}
|
||||
}
|
||||
|
||||
func (a AiOption) GetOptionType() string {
|
||||
|
|
|
@ -179,6 +179,7 @@ func (s *ShuguangAi) SubmitPytorchTask(ctx context.Context, imageId string, cmd
|
|||
workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2]
|
||||
codePath = workPath + FORWARD_SLASH + TRAIN_FILE
|
||||
} else {
|
||||
// storage schedule submit mode
|
||||
codePath = algorithmId
|
||||
paths = strings.Split(algorithmId, FORWARD_SLASH)
|
||||
last := paths[len(paths)-1]
|
||||
|
@ -602,10 +603,56 @@ func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*colle
|
|||
}
|
||||
|
||||
func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
|
||||
err := s.GenerateSubmitParams(ctx, option)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
switch mode {
|
||||
case 1:
|
||||
err := s.GenerateSubmitParams(ctx, option)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
case 2:
|
||||
var dcuNum int64
|
||||
for _, res := range option.ResourcesRequired {
|
||||
typeName, ok := res["type"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
switch typeName {
|
||||
case DCU:
|
||||
num, ok := res["number"]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
n := common.ConvertTypeToString(num)
|
||||
val, err := strconv.ParseInt(n, 10, 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
dcuNum = val
|
||||
}
|
||||
}
|
||||
for k, v := range RESOURCESGAIMAP {
|
||||
if dcuNum == v.GPU {
|
||||
option.ResourceId = k
|
||||
break
|
||||
}
|
||||
|
||||
if dcuNum == 0 && v.GPU == 1 {
|
||||
option.ResourceId = k
|
||||
break
|
||||
}
|
||||
|
||||
if dcuNum >= 5 && v.GPU == 5 {
|
||||
option.ResourceId = k
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
option.ComputeCard = DCU
|
||||
|
||||
default:
|
||||
return nil, errors.New("failed to choose submit mode")
|
||||
}
|
||||
|
||||
task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
|
Loading…
Reference in New Issue