162 lines
4.6 KiB
Go
162 lines
4.6 KiB
Go
package state
|
||
|
||
import (
	"context"
	"fmt"
	"reflect"
	"strings"
	"sync"

	"gitlink.org.cn/cloudream/common/pkgs/future"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler"
	"gitlink.org.cn/cloudream/scheduler/common/utils"
	"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
|
||
|
||
type MultiInstanceRunning struct {
|
||
preScheduler prescheduler.PreScheduler
|
||
}
|
||
|
||
func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInstanceRunning {
|
||
return &MultiInstanceRunning{
|
||
preScheduler: preScheduler,
|
||
}
|
||
}
|
||
|
||
func (s *MultiInstanceRunning) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
|
||
s.do(rtx, job)
|
||
}
|
||
|
||
func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||
|
||
ctx, cancel := context.WithCancel(context.Background())
|
||
defer cancel()
|
||
|
||
go func() {
|
||
event.WaitType[*event.Cancel](ctx, rtx.EventSet)
|
||
cancel()
|
||
}()
|
||
|
||
multInstJob := jo.Body.(*job.MultiInstanceJob)
|
||
|
||
waitFut := event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
|
||
for {
|
||
chanValue := <-waitFut.Chan()
|
||
instanceInfo := chanValue.Value.(*event.InstanceOperate)
|
||
instanceFuture := instanceInfo.Result
|
||
logger.Info("wait a event happened")
|
||
waitFut = event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
|
||
|
||
switch info := instanceInfo.Info.(type) {
|
||
case *event.InstanceCreateInfo:
|
||
createInstance(rtx, info, s.preScheduler, jo, multInstJob, instanceFuture)
|
||
case *event.InstanceUpdateInfo:
|
||
updateInstance(rtx, info, multInstJob, instanceFuture)
|
||
}
|
||
|
||
}
|
||
}
|
||
|
||
func updateInstance(rtx jobmgr.JobStateRunContext, updateInfo *event.InstanceUpdateInfo, parentJob *job.MultiInstanceJob, updateInstanceFuture event.OperateInstanceFuture) {
|
||
|
||
// 更新策略
|
||
strategy := updateInfo.Info.UpdateStrategy
|
||
println("update strategy: " + strategy)
|
||
|
||
var failJobs []string
|
||
var wg sync.WaitGroup
|
||
|
||
for i := 0; i < len(parentJob.SubJobs); i++ {
|
||
// 发送请求进行任务更新
|
||
instanceID := parentJob.SubJobs[i]
|
||
wg.Add(1)
|
||
go func() {
|
||
defer wg.Done()
|
||
fut := future.NewSetValue[event.UpdateResult]()
|
||
rtx.Mgr.PostEvent(instanceID, event.NewUpdate("update", fut))
|
||
_, err := fut.Wait(context.TODO())
|
||
|
||
if err != nil {
|
||
logger.Error(err.Error())
|
||
failJobs = append(failJobs, string(instanceID))
|
||
}
|
||
println()
|
||
}()
|
||
}
|
||
|
||
wg.Wait()
|
||
|
||
if len(failJobs) == 0 {
|
||
updateInstanceFuture.SetValue(event.OperateInstanceResult{
|
||
Err: nil,
|
||
})
|
||
return
|
||
}
|
||
|
||
// 返回更新失败的instance
|
||
result := strings.Join(failJobs, ",")
|
||
updateInstanceFuture.SetValue(event.OperateInstanceResult{
|
||
OperateResult: result,
|
||
Err: fmt.Errorf("error"),
|
||
})
|
||
}
|
||
|
||
func createInstance(rtx jobmgr.JobStateRunContext, info *event.InstanceCreateInfo, preScheduler prescheduler.PreScheduler, jo *jobmgr.Job, multInstJob *job.MultiInstanceJob, future event.OperateInstanceFuture) {
|
||
dataSet := info.DataSet
|
||
|
||
//如果是模型扩容任务,直接使用父Job的资源文件
|
||
if &multInstJob.Info.ModelJobInfo != nil {
|
||
dataSet = multInstJob.Info.Files.Dataset
|
||
}
|
||
|
||
// 构建InstanceJobInfo
|
||
infoFiles := schsdk.JobFilesInfo{
|
||
Dataset: dataSet,
|
||
Code: multInstJob.Info.Files.Code,
|
||
Image: multInstJob.Info.Files.Image,
|
||
}
|
||
|
||
newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID())
|
||
|
||
instJobInfo := &schsdk.InstanceJobInfo{
|
||
Type: schsdk.JobTypeInstance,
|
||
LocalJobID: newLocalJobID,
|
||
Files: infoFiles,
|
||
Runtime: multInstJob.Info.Runtime,
|
||
Resources: multInstJob.Info.Resources,
|
||
ModelJobInfo: multInstJob.Info.ModelJobInfo,
|
||
}
|
||
|
||
files := jobmod.JobFiles{
|
||
Code: multInstJob.Files.Code,
|
||
Image: multInstJob.Files.Image,
|
||
}
|
||
|
||
// 生成预调度方案和文件上传方案
|
||
jobSchedule, filesUploadScheme, err := preScheduler.ScheduleJob(instJobInfo)
|
||
if err != nil {
|
||
future.SetError(err)
|
||
return
|
||
}
|
||
|
||
// 创建实例并运行
|
||
instanceJob := job.NewInstanceJob(*instJobInfo, files)
|
||
jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreSchuduling(*jobSchedule))
|
||
|
||
// 在多实例任务中新增这个实例的任务ID
|
||
multInstJob.SubJobs = append(multInstJob.SubJobs, jobID)
|
||
|
||
// 将实例ID和文件上传方案返回
|
||
future.SetValue(event.OperateInstanceResult{
|
||
JobID: jobID,
|
||
FilesUploadScheme: *filesUploadScheme,
|
||
})
|
||
}
|
||
|
||
func (s *MultiInstanceRunning) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump {
|
||
return &jobmod.MultiInstCreateRunningDump{}
|
||
}
|