JCC-CSScheduler/manager/internal/jobmgr/job/state/multiInstance_running.go

162 lines
4.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package state
import (
"context"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
"strings"
"sync"
)
// MultiInstanceRunning is the job state in which a multi-instance job serves
// instance operations (create / update) for its sub-jobs.
type MultiInstanceRunning struct {
// preScheduler is handed to createInstance, which calls its ScheduleJob
// method for every new instance.
preScheduler prescheduler.PreScheduler
}
// NewMultiInstanceRunning builds a MultiInstanceRunning state that uses the
// given pre-scheduler when creating instances.
func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInstanceRunning {
	state := new(MultiInstanceRunning)
	state.preScheduler = preScheduler
	return state
}
// Run executes the state for the given job by delegating to do.
//
// The job parameter is named jo (as in do) so that it does not shadow the
// imported job package.
func (s *MultiInstanceRunning) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	s.do(rtx, jo)
}
// do is the event loop of the running state: it repeatedly waits for an
// InstanceOperate event on rtx.EventSet and dispatches it to createInstance
// or updateInstance depending on the concrete type of the event's Info.
//
// jo.Body must be a *job.MultiInstanceJob; the type assertion panics otherwise.
//
// NOTE(review): the receive below is not combined with ctx.Done(), so the
// Cancel watcher only cancels ctx and never breaks this loop. Returning on
// cancellation probably also requires a state transition; confirm against
// the other job states before changing it.
func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Watcher: cancels ctx once the Cancel wait returns.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	parent := jo.Body.(*job.MultiInstanceJob)

	pending := event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
	for {
		received := <-pending.Chan()
		// NOTE(review): unchecked assertion; panics if Value does not hold
		// a *event.InstanceOperate.
		operate := received.Value.(*event.InstanceOperate)
		resultFut := operate.Result
		logger.Info("wait a event happened")

		// Re-arm the wait before handling the current operation.
		pending = event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)

		switch detail := operate.Info.(type) {
		case *event.InstanceCreateInfo:
			createInstance(rtx, detail, s.preScheduler, jo, parent, resultFut)

		case *event.InstanceUpdateInfo:
			updateInstance(rtx, detail, parent, resultFut)
		}
	}
}
// updateInstance posts an update event to every sub-job of parentJob, waits
// for all of them to answer, and reports the aggregate outcome through
// updateInstanceFuture.
//
// On full success the future receives an OperateInstanceResult with a nil
// Err. Otherwise OperateResult holds the comma-separated IDs of the
// instances whose update failed (in SubJobs order) and Err is non-nil.
//
// Each goroutine records its outcome in its own slot of a pre-sized slice,
// so no two goroutines write the same memory and no lock is needed.
func updateInstance(rtx jobmgr.JobStateRunContext, updateInfo *event.InstanceUpdateInfo, parentJob *job.MultiInstanceJob, updateInstanceFuture event.OperateInstanceFuture) {
	// Update strategy requested by the caller; currently only logged.
	strategy := updateInfo.Info.UpdateStrategy
	logger.Info("update strategy: " + strategy)

	instances := parentJob.SubJobs
	failed := make([]bool, len(instances))

	var wg sync.WaitGroup
	for i := range instances {
		// Per-iteration copies: safe to capture on toolchains older than Go 1.22.
		idx, instanceID := i, instances[i]

		wg.Add(1)
		go func() {
			defer wg.Done()

			// Ask the instance to update itself and wait for its answer.
			fut := future.NewSetValue[event.UpdateResult]()
			rtx.Mgr.PostEvent(instanceID, event.NewUpdate("update", fut))

			if _, err := fut.Wait(context.TODO()); err != nil {
				logger.Error("update instance " + string(instanceID) + ": " + err.Error())
				failed[idx] = true
			}
		}()
	}
	wg.Wait()

	var failJobs []string
	for i, isFailed := range failed {
		if isFailed {
			failJobs = append(failJobs, string(instances[i]))
		}
	}

	if len(failJobs) == 0 {
		updateInstanceFuture.SetValue(event.OperateInstanceResult{
			Err: nil,
		})
		return
	}

	// Report the instances whose update failed.
	result := strings.Join(failJobs, ",")
	updateInstanceFuture.SetValue(event.OperateInstanceResult{
		OperateResult: result,
		Err:           fmt.Errorf("update failed for instances: %s", result),
	})
}
// createInstance creates one new instance (sub-job) of the multi-instance job
// and reports the outcome through resultFut.
//
// Steps: derive an InstanceJobInfo from the parent job, ask preScheduler for
// a schedule and a file-upload scheme, register the new job with the manager
// under the parent's job set, record its ID in multInstJob.SubJobs, and
// finally publish the new job ID together with the upload scheme.
// If pre-scheduling fails, the error is set on resultFut and nothing is
// created.
//
// The last parameter was previously named "future", which shadowed the
// imported future package inside this function.
func createInstance(rtx jobmgr.JobStateRunContext, info *event.InstanceCreateInfo, preScheduler prescheduler.PreScheduler, jo *jobmgr.Job, multInstJob *job.MultiInstanceJob, resultFut event.OperateInstanceFuture) {
	// NOTE(review): this used to start from info.DataSet and switch to the
	// parent's dataset under the guard `&multInstJob.Info.ModelJobInfo != nil`
	// ("model scale-out jobs reuse the parent job's files"). The address of a
	// field is never nil, so that guard was always true and info.DataSet was
	// never used. The effective behavior is kept and made explicit here; if
	// non-model jobs should honour info.DataSet, a real condition on
	// ModelJobInfo is needed.
	dataSet := multInstJob.Info.Files.Dataset

	// Build the InstanceJobInfo from the parent job's description.
	infoFiles := schsdk.JobFilesInfo{
		Dataset: dataSet,
		Code:    multInstJob.Info.Files.Code,
		Image:   multInstJob.Info.Files.Image,
	}

	// The parent's local ID plus a random suffix gives each instance its own local ID.
	newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID())

	instJobInfo := &schsdk.InstanceJobInfo{
		Type:         schsdk.JobTypeInstance,
		LocalJobID:   newLocalJobID,
		Files:        infoFiles,
		Runtime:      multInstJob.Info.Runtime,
		Resources:    multInstJob.Info.Resources,
		ModelJobInfo: multInstJob.Info.ModelJobInfo,
	}

	// Only code and image are copied from the parent's resolved files.
	files := jobmod.JobFiles{
		Code:  multInstJob.Files.Code,
		Image: multInstJob.Files.Image,
	}

	// Produce the pre-schedule plan and the file-upload scheme.
	jobSchedule, filesUploadScheme, err := preScheduler.ScheduleJob(instJobInfo)
	if err != nil {
		resultFut.SetError(err)
		return
	}

	// Create the instance job and start it in the pre-scheduling state.
	instanceJob := job.NewInstanceJob(*instJobInfo, files)
	jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreSchuduling(*jobSchedule))

	// Track the new instance in the multi-instance job.
	multInstJob.SubJobs = append(multInstJob.SubJobs, jobID)

	// Hand the instance ID and the upload scheme back to the requester.
	resultFut.SetValue(event.OperateInstanceResult{
		JobID:             jobID,
		FilesUploadScheme: *filesUploadScheme,
	})
}
// Dump returns the dump representation of this state. It always returns an
// empty MultiInstCreateRunningDump; neither argument is read.
//
// Parameters are named rtx and jo, matching the other methods of this type,
// so that they do not shadow the imported job package or suggest a
// context.Context.
func (s *MultiInstanceRunning) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MultiInstCreateRunningDump{}
}