updated runtasklogic
This commit is contained in:
parent
82bc8681c0
commit
3cf1521eca
|
@ -70,7 +70,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
|
||||||
utils.Convert(&req, &opt)
|
utils.Convert(&req, &opt)
|
||||||
sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient)
|
sc, _ := schedulers.NewCloudScheduler(l.ctx, "", l.svcCtx.Scheduler, opt, tx, l.svcCtx.PromClient)
|
||||||
|
|
||||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.JOINT_CLOUD_MODE, nil)
|
results, err := l.svcCtx.Scheduler.AssignAndSchedule(sc, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
|
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -63,7 +63,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
// 3、Return scheduling results
|
// 3、Return scheduling results
|
||||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.JOINT_CLOUD_MODE, nil)
|
results, err := l.svcCtx.Scheduler.AssignAndSchedule(vmSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
|
logx.Errorf("AssignAndSchedule() => execution error: %v", err)
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
|
@ -24,7 +24,11 @@ func NewScheduleCancelTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) {
|
func (l *ScheduleCancelTaskLogic) ScheduleCancelTask(req *types.CancelTaskReq) (resp *types.CancelTaskResp, err error) {
|
||||||
// todo: add your logic here and delete this line
|
// find task
|
||||||
|
_, err = l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskId)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,12 +6,16 @@ import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/zeromicro/go-zero/core/logx"
|
"github.com/zeromicro/go-zero/core/logx"
|
||||||
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler"
|
||||||
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
||||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
||||||
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
||||||
"gopkg.in/yaml.v2"
|
"gopkg.in/yaml.v2"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
type ScheduleRunTaskLogic struct {
|
type ScheduleRunTaskLogic struct {
|
||||||
|
@ -49,8 +53,9 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
_ = &option.AiOption{
|
opt := &option.AiOption{
|
||||||
AdapterId: ADAPTERID,
|
AdapterId: ADAPTERID,
|
||||||
|
TaskName: task.Name,
|
||||||
}
|
}
|
||||||
// update assignedClusters
|
// update assignedClusters
|
||||||
err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas)
|
err = updateClustersByScheduledDatas(task.Id, &clusters, req.ScheduledDatas)
|
||||||
|
@ -58,35 +63,52 @@ func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *typ
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
//aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
|
aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
|
||||||
//if err != nil {
|
if err != nil {
|
||||||
// return nil, err
|
return nil, err
|
||||||
//}
|
}
|
||||||
//
|
|
||||||
//results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.STORAGE_SCHEDULE_MODE, clusters)
|
|
||||||
//if err != nil {
|
|
||||||
// return nil, err
|
|
||||||
//}
|
|
||||||
|
|
||||||
//adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(ADAPTERID)
|
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_STORAGE_SCHEDULE, clusters)
|
||||||
//if err != nil {
|
if err != nil {
|
||||||
// return nil, err
|
return nil, err
|
||||||
//}
|
}
|
||||||
//
|
|
||||||
//for _, i := range clusters {
|
rs := (results).([]*schedulers.AiResult)
|
||||||
// clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(i.ClusterID)
|
|
||||||
//
|
err = l.SaveResult(task, rs, opt)
|
||||||
// opt := &option.AiOption{}
|
if err != nil {
|
||||||
//
|
return nil, err
|
||||||
// err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, i.ClusterID, clusterName, "", constants.Saved, "")
|
}
|
||||||
// if err != nil {
|
|
||||||
// return nil, errors.New("database add failed: " + err.Error())
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error {
|
||||||
|
|
||||||
|
for _, r := range results {
|
||||||
|
|
||||||
|
opt.ComputeCard = strings.ToUpper(r.Card)
|
||||||
|
|
||||||
|
adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(r.AdapterId)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId)
|
||||||
|
|
||||||
|
err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(r.AdapterId, adapterName, r.ClusterId, clusterName, r.TaskName, "create", "任务创建中")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error {
|
func updateClustersByScheduledDatas(taskId int64, assignedClusters *[]*strategy.AssignedCluster, scheduledDatas []*types.DataScheduleResults) error {
|
||||||
for _, cluster := range *assignedClusters {
|
for _, cluster := range *assignedClusters {
|
||||||
for _, data := range scheduledDatas {
|
for _, data := range scheduledDatas {
|
||||||
|
|
|
@ -52,7 +52,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
|
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ func (l *AiQueue) Consume(val string) error {
|
||||||
aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil)
|
aiSchdl, _ := schedulers.NewAiScheduler(l.ctx, val, l.svcCtx.Scheduler, nil)
|
||||||
|
|
||||||
// 调度算法
|
// 调度算法
|
||||||
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.JOINT_CLOUD_MODE, nil)
|
_, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, scheduler.SUBMIT_MODE_JOINT_CLOUD, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,8 +30,8 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
JOINT_CLOUD_MODE = iota + 1
|
SUBMIT_MODE_JOINT_CLOUD = iota + 1
|
||||||
STORAGE_SCHEDULE_MODE
|
SUBMIT_MODE_STORAGE_SCHEDULE
|
||||||
)
|
)
|
||||||
|
|
||||||
type Scheduler struct {
|
type Scheduler struct {
|
||||||
|
@ -134,7 +134,7 @@ func (s *Scheduler) TempAssign() error {
|
||||||
func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) {
|
func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters []*strategy.AssignedCluster) (interface{}, error) {
|
||||||
var result interface{}
|
var result interface{}
|
||||||
switch mode {
|
switch mode {
|
||||||
case JOINT_CLOUD_MODE:
|
case SUBMIT_MODE_JOINT_CLOUD:
|
||||||
//choose strategy
|
//choose strategy
|
||||||
strategy, err := ss.PickOptimalStrategy()
|
strategy, err := ss.PickOptimalStrategy()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -155,7 +155,7 @@ func (s *Scheduler) AssignAndSchedule(ss SubSchedule, mode int, assignedClusters
|
||||||
|
|
||||||
result = resp
|
result = resp
|
||||||
|
|
||||||
case STORAGE_SCHEDULE_MODE:
|
case SUBMIT_MODE_STORAGE_SCHEDULE:
|
||||||
|
|
||||||
//assign tasks to clusters
|
//assign tasks to clusters
|
||||||
resp, err := ss.AssignTask(assignedClusters, mode)
|
resp, err := ss.AssignTask(assignedClusters, mode)
|
||||||
|
|
|
@ -175,7 +175,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int
|
||||||
opt, _ := cloneAiOption(as.option)
|
opt, _ := cloneAiOption(as.option)
|
||||||
|
|
||||||
// decide opt params by mode
|
// decide opt params by mode
|
||||||
updateAiOptionByMode(c, opt, scheduler.STORAGE_SCHEDULE_MODE)
|
updateAiOptionByMode(c, opt, mode)
|
||||||
|
|
||||||
resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode)
|
resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -282,7 +282,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int
|
||||||
|
|
||||||
func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) {
|
func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) {
|
||||||
switch mode {
|
switch mode {
|
||||||
case scheduler.STORAGE_SCHEDULE_MODE:
|
case scheduler.SUBMIT_MODE_STORAGE_SCHEDULE:
|
||||||
opt.Cmd = cluster.Cmd
|
opt.Cmd = cluster.Cmd
|
||||||
opt.Envs = cluster.Envs
|
opt.Envs = cluster.Envs
|
||||||
opt.Params = cluster.Params
|
opt.Params = cluster.Params
|
||||||
|
@ -290,6 +290,8 @@ func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOptio
|
||||||
opt.ImageId = cluster.ImageId
|
opt.ImageId = cluster.ImageId
|
||||||
opt.AlgorithmId = cluster.CodeId
|
opt.AlgorithmId = cluster.CodeId
|
||||||
opt.DatasetsId = cluster.DatasetId
|
opt.DatasetsId = cluster.DatasetId
|
||||||
|
|
||||||
|
opt.ResourcesRequired = cluster.ResourcesRequired
|
||||||
default:
|
default:
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,6 +32,8 @@ type AiOption struct {
|
||||||
AlgorithmCode string
|
AlgorithmCode string
|
||||||
Image string
|
Image string
|
||||||
Model interface{}
|
Model interface{}
|
||||||
|
|
||||||
|
ResourcesRequired []map[string]interface{}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a AiOption) GetOptionType() string {
|
func (a AiOption) GetOptionType() string {
|
||||||
|
|
|
@ -179,6 +179,7 @@ func (s *ShuguangAi) SubmitPytorchTask(ctx context.Context, imageId string, cmd
|
||||||
workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2]
|
workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2]
|
||||||
codePath = workPath + FORWARD_SLASH + TRAIN_FILE
|
codePath = workPath + FORWARD_SLASH + TRAIN_FILE
|
||||||
} else {
|
} else {
|
||||||
|
// storage schedule submit mode
|
||||||
codePath = algorithmId
|
codePath = algorithmId
|
||||||
paths = strings.Split(algorithmId, FORWARD_SLASH)
|
paths = strings.Split(algorithmId, FORWARD_SLASH)
|
||||||
last := paths[len(paths)-1]
|
last := paths[len(paths)-1]
|
||||||
|
@ -602,10 +603,56 @@ func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*colle
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
|
func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
|
||||||
err := s.GenerateSubmitParams(ctx, option)
|
switch mode {
|
||||||
if err != nil {
|
case 1:
|
||||||
return nil, err
|
err := s.GenerateSubmitParams(ctx, option)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
case 2:
|
||||||
|
var dcuNum int64
|
||||||
|
for _, res := range option.ResourcesRequired {
|
||||||
|
typeName, ok := res["type"]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch typeName {
|
||||||
|
case DCU:
|
||||||
|
num, ok := res["number"]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n := common.ConvertTypeToString(num)
|
||||||
|
val, err := strconv.ParseInt(n, 10, 64)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
dcuNum = val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for k, v := range RESOURCESGAIMAP {
|
||||||
|
if dcuNum == v.GPU {
|
||||||
|
option.ResourceId = k
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if dcuNum == 0 && v.GPU == 1 {
|
||||||
|
option.ResourceId = k
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
if dcuNum >= 5 && v.GPU == 5 {
|
||||||
|
option.ResourceId = k
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
option.ComputeCard = DCU
|
||||||
|
|
||||||
|
default:
|
||||||
|
return nil, errors.New("failed to choose submit mode")
|
||||||
}
|
}
|
||||||
|
|
||||||
task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
|
task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
Loading…
Reference in New Issue