forked from JointCloud/pcm-coordinator
320 lines
9.3 KiB
Go
320 lines
9.3 KiB
Go
package schedule
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"github.com/zeromicro/go-zero/core/logx"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
|
"gopkg.in/yaml.v2"
|
|
"strings"
|
|
)
|
|
|
|
type ScheduleRunTaskLogic struct {
|
|
logx.Logger
|
|
ctx context.Context
|
|
svcCtx *svc.ServiceContext
|
|
}
|
|
|
|
func NewScheduleRunTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleRunTaskLogic {
|
|
return &ScheduleRunTaskLogic{
|
|
Logger: logx.WithContext(ctx),
|
|
ctx: ctx,
|
|
svcCtx: svcCtx,
|
|
}
|
|
}
|
|
|
|
func (l *ScheduleRunTaskLogic) ScheduleRunTask(req *types.RunTaskReq) (resp *types.RunTaskResp, err error) {
|
|
// find task
|
|
task, err := l.svcCtx.Scheduler.AiStorages.GetTaskById(req.TaskID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if task == nil {
|
|
return nil, errors.New("task not found ")
|
|
}
|
|
|
|
if task.Status != constants.Saved {
|
|
switch task.Status {
|
|
case constants.Cancelled:
|
|
return nil, errors.New("task has been cancelled ")
|
|
case constants.Failed:
|
|
return nil, errors.New("task was already failed ")
|
|
case constants.Running:
|
|
return nil, errors.New("task is running ")
|
|
case constants.Succeeded:
|
|
return nil, errors.New("task is completed ")
|
|
default:
|
|
return nil, fmt.Errorf("task is being: %s", task.Status)
|
|
}
|
|
}
|
|
|
|
var clustersWithDataDistributes ClustersWithDataDistributes
|
|
err = yaml.Unmarshal([]byte(task.YamlString), &clustersWithDataDistributes)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
opt := &option.AiOption{
|
|
AdapterId: ADAPTERID,
|
|
TaskName: task.Name,
|
|
TaskId: task.Id,
|
|
StrategyName: "",
|
|
}
|
|
|
|
// update assignedClusters
|
|
assignedClusters, err := updateClustersByScheduledDatas(task.Id, &clustersWithDataDistributes, req.ScheduledDatas)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
aiSchdl, err := schedulers.NewAiScheduler(l.ctx, "", l.svcCtx.Scheduler, opt)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
results, err := l.svcCtx.Scheduler.AssignAndSchedule(aiSchdl, executor.SUBMIT_MODE_STORAGE_SCHEDULE, assignedClusters)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
rs := (results).([]*schedulers.AiResult)
|
|
|
|
err = l.SaveResult(task, rs, opt)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func (l *ScheduleRunTaskLogic) SaveResult(task *models.Task, results []*schedulers.AiResult, opt *option.AiOption) error {
|
|
|
|
for _, r := range results {
|
|
|
|
opt.ComputeCard = strings.ToUpper(r.Card)
|
|
opt.Replica = r.Replica
|
|
opt.Output = r.Output
|
|
|
|
adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(r.AdapterId)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(r.ClusterId)
|
|
|
|
err = l.svcCtx.Scheduler.AiStorages.SaveAiTask(task.Id, opt, adapterName, r.ClusterId, clusterName, r.JobId, constants.Saved, r.Msg)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
l.svcCtx.Scheduler.AiStorages.AddNoticeInfo(r.AdapterId, adapterName, r.ClusterId, clusterName, r.TaskName, "create", "任务创建中")
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
func updateClustersByScheduledDatas(taskId int64, clustersWithDataDistributes *ClustersWithDataDistributes, scheduledDatas []*types.DataScheduleResults) ([]*strategy.AssignedCluster, error) {
|
|
assignedClusters := make([]*strategy.AssignedCluster, 0)
|
|
|
|
if len(scheduledDatas) == 0 {
|
|
for _, cluster := range clustersWithDataDistributes.Clusters {
|
|
assignedClusters = append(assignedClusters, cluster)
|
|
}
|
|
} else {
|
|
// handle pass-in scheduledDatas
|
|
for _, cluster := range clustersWithDataDistributes.Clusters {
|
|
for _, data := range scheduledDatas {
|
|
switch data.DataType {
|
|
case "dataset":
|
|
for _, result := range data.Results {
|
|
if !result.Status {
|
|
continue
|
|
}
|
|
for _, c := range result.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "dataset")
|
|
}
|
|
cluster.DatasetId = jsonData.Id
|
|
}
|
|
}
|
|
}
|
|
case "image":
|
|
for _, result := range data.Results {
|
|
if !result.Status {
|
|
continue
|
|
}
|
|
for _, c := range result.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "image")
|
|
}
|
|
cluster.ImageId = jsonData.Id
|
|
}
|
|
}
|
|
}
|
|
case "code":
|
|
for _, result := range data.Results {
|
|
if !result.Status {
|
|
continue
|
|
}
|
|
for _, c := range result.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "code")
|
|
}
|
|
cluster.CodeId = jsonData.Id
|
|
}
|
|
}
|
|
}
|
|
case "model":
|
|
for _, result := range data.Results {
|
|
if !result.Status {
|
|
continue
|
|
}
|
|
for _, c := range result.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("pass-in jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "model")
|
|
}
|
|
cluster.ModelId = jsonData.Id
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
assignedClusters = append(assignedClusters, cluster)
|
|
}
|
|
}
|
|
|
|
// handle db yaml clustersWithDataDistributes
|
|
for _, cluster := range assignedClusters {
|
|
if cluster.DatasetId == "" {
|
|
for _, distribute := range clustersWithDataDistributes.DataDistributes.Dataset {
|
|
for _, c := range distribute.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("db yaml jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "dataset")
|
|
}
|
|
cluster.DatasetId = jsonData.Id
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if cluster.ImageId == "" {
|
|
for _, distribute := range clustersWithDataDistributes.DataDistributes.Image {
|
|
for _, c := range distribute.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("db yaml jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "image")
|
|
}
|
|
cluster.ImageId = jsonData.Id
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
//if cluster.CodeId == "" {
|
|
for _, distribute := range clustersWithDataDistributes.DataDistributes.Code {
|
|
for _, c := range distribute.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
|
|
cluster.Output = distribute.Output
|
|
|
|
if cluster.CodeId == "" {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("db yaml jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "code")
|
|
}
|
|
cluster.CodeId = jsonData.Id
|
|
}
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
if cluster.ModelId == "" {
|
|
for _, distribute := range clustersWithDataDistributes.DataDistributes.Model {
|
|
for _, c := range distribute.Clusters {
|
|
if cluster.ClusterId == c.ClusterID {
|
|
if c.JsonData == "" {
|
|
continue
|
|
}
|
|
jsonData := entity.JsonData{}
|
|
err := json.Unmarshal([]byte(c.JsonData), &jsonData)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskId, cluster.ClusterId, "model")
|
|
}
|
|
cluster.ModelId = jsonData.Id
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// check empty data
|
|
for _, cluster := range assignedClusters {
|
|
if cluster.DatasetId == "" {
|
|
return nil, fmt.Errorf("failed to run task %d, cluster %s cannot find %s", taskId, cluster.ClusterId, "DatasetId")
|
|
}
|
|
|
|
if cluster.ImageId == "" {
|
|
return nil, fmt.Errorf("failed to run task %d, cluster %s cannot find %s", taskId, cluster.ClusterId, "ImageId")
|
|
}
|
|
|
|
if cluster.CodeId == "" {
|
|
return nil, fmt.Errorf("failed to run task %d, cluster %s cannot find %s", taskId, cluster.ClusterId, "CodeId")
|
|
}
|
|
}
|
|
|
|
return assignedClusters, nil
|
|
}
|