JCC-CSScheduler/manager/internal/mq/job.go

232 lines
7.3 KiB
Go

package mq
import (
"context"
"errors"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/consts/errorcode"
"gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/pkgs/mq"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job/state"
)
// SubmitJobSet converts every job description in the submitted job set into a
// job body plus its initial state, then hands the whole batch to the job
// manager.
//
// Handling per job type:
//   - NormalJobInfo, DataPreprocessJobInfo and FinetuningJobInfo become normal
//     jobs (distinguished by SubType) that start in the pre-scheduling state,
//     built from the pre-schedule scheme registered under their LocalJobID.
//   - DataReturnJobInfo starts in the wait-target-complete state.
//   - MultiInstanceJobInfo is created with its pre-schedule scheme and starts
//     in the multi-instance init state.
//   - UpdateMultiInstanceJobInfo starts in the multi-instance update state,
//     which is given the dump of the multi-instance job found in the
//     referenced job set.
//
// Job descriptions of any other type are ignored.
//
// It replies with OperationFailed when a required pre-schedule scheme is
// missing, when the job set referenced by an update job does not exist, or when
// that job set contains no multi-instance job.
func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) {
	logger.Debugf("submitting job")

	var jobs []jobmgr.SubmittingJob
	for _, jobInfo := range msg.JobSet.Jobs {
		switch info := jobInfo.(type) {
		case *schsdk.NormalJobInfo:
			preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
			}

			jo := job.NewNormalJob(*info)
			jo.SubType = schsdk.JobTypeNormal
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewPreSchuduling(preSch),
			})

		case *schsdk.DataReturnJobInfo:
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      job.NewDataReturnJob(*info),
				InitState: state.NewWaitTargetComplete(),
			})

		case *schsdk.MultiInstanceJobInfo:
			// Validate the lookup before the scheme is used to build the job.
			preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
			}

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      job.NewMultiInstanceJob(*info, preSch),
				InitState: state.NewMultiInstanceInit(),
			})

		case *schsdk.UpdateMultiInstanceJobInfo:
			modelJob := job.NewUpdateMultiInstanceJob(*info)

			instanceJobSets := svc.jobMgr.DumpJobSet(modelJob.Info.MultiInstanceJobSetID)
			if len(instanceJobSets) == 0 {
				return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("job set %s is not found", modelJob.Info.MultiInstanceJobSetID))
			}

			// Locate the multi-instance job itself inside the dumped job set.
			var multiInstanceJobDump jobmod.JobDump
			found := false
			for _, jobDump := range instanceJobSets {
				if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
					multiInstanceJobDump = jobDump
					found = true
					break
				}
			}
			// Without this check a zero-value dump would be passed to the
			// update state when the job set holds no multi-instance job.
			if !found {
				return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("multi instance job is not found in job set %s", modelJob.Info.MultiInstanceJobSetID))
			}

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      modelJob,
				InitState: state.NewMultiInstanceUpdate(multiInstanceJobDump),
			})

		case *schsdk.DataPreprocessJobInfo:
			preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
			}

			// The rest of the scheduling flow is the same as for a normal job,
			// so the description is repackaged as a NormalJobInfo.
			jo := job.NewNormalJob(schsdk.NormalJobInfo{
				Type:        schsdk.JobTypeNormal,
				JobInfoBase: info.JobInfoBase,
				Files:       info.Files,
				Runtime:     info.Runtime,
				Services:    info.Services,
				Resources:   info.Resources,
			})
			jo.SubType = schsdk.JobTypeDataPreprocess
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewPreSchuduling(preSch),
			})

		case *schsdk.FinetuningJobInfo:
			preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
			}

			// The rest of the scheduling flow is the same as for a normal job,
			// so the description is repackaged as a NormalJobInfo.
			jo := job.NewNormalJob(schsdk.NormalJobInfo{
				Type:         schsdk.JobTypeNormal,
				Files:        info.Files,
				JobInfoBase:  info.JobInfoBase,
				Runtime:      info.Runtime,
				Services:     info.Services,
				Resources:    info.Resources,
				ModelJobInfo: info.ModelJobInfo,
			})
			jo.SubType = schsdk.JobTypeFinetuning
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewPreSchuduling(preSch),
			})
		}
	}

	return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs)))
}
// CreateInstance asks the multi-instance job of the given job set to create a
// new instance and waits for the outcome.
//
// It dumps the job set identified by instInfo.JobSetID, finds the job whose
// body is a MultiInstanceJobDump, posts an instance-operate event carrying
// instInfo.DataSet to that job, and blocks until the event's future is
// completed.
//
// It replies with OperationFailed when the job set does not exist, when it
// contains no multi-instance job, or when waiting on the future returns an
// error. On success the reply carries the job ID and files-upload scheme taken
// from the operation result.
func (svc *Service) CreateInstance(instInfo *mgrmq.CreateInstance) (*mgrmq.CreateInstanceResp, *mq.CodeMessage) {
	logger.Debugf("start create instance")

	instanceJobSets := svc.jobMgr.DumpJobSet(instInfo.JobSetID)
	if len(instanceJobSets) == 0 {
		return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("job set %s is not found", instInfo.JobSetID))
	}

	// Locate the multi-instance job itself inside the dumped job set.
	var jobID schsdk.JobID
	found := false
	for _, jobDump := range instanceJobSets {
		if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
			jobID = jobDump.JobID
			found = true
			break
		}
	}
	// Fail fast instead of posting the event to a zero-value job ID: the wait
	// below uses context.TODO(), which has no deadline, so an event that no
	// job answers would leave this handler blocked.
	if !found {
		return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("multi instance job is not found in job set %s", instInfo.JobSetID))
	}

	fut := future.NewSetValue[event.OperateInstanceResult]()
	info := event.InstanceCreateInfo{
		DataSet: instInfo.DataSet,
	}
	svc.jobMgr.PostEvent(jobID, event.NewInstanceOperate(&info, fut))

	result, err := fut.Wait(context.TODO())
	if err != nil {
		return nil, mq.Failed(errorcode.OperationFailed, err.Error())
	}

	return mq.ReplyOK(mgrmq.NewCreateInstanceResp(result.JobID, result.FilesUploadScheme))
}
// JobSetLocalFileUploaded handles the notification that the upload of one local
// file belonging to a job set has finished.
//
// A non-empty msg.Error is turned into an error value; otherwise the error is
// nil. The local path, that error and the package ID are then broadcast as a
// local-file-uploaded event to the job set identified by msg.JobSetID. The
// reply is always OK.
func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded) (*mgrmq.JobSetLocalFileUploadedResp, *mq.CodeMessage) {
	entry := logger.WithField("LocalPath", msg.LocalPath).WithField("PackageID", msg.PackageID)
	entry.Debugf("local file uploaded")

	// Stays nil unless the message carries an error description.
	var uploadErr error
	if msg.Error != "" {
		uploadErr = errors.New(msg.Error)
	}

	uploaded := event.NewLocalFileUploaded(msg.LocalPath, uploadErr, msg.PackageID)
	svc.jobMgr.BroadcastEvent(msg.JobSetID, uploaded)

	return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp())
}
// GetJobSetDump replies with the dump of every job in the job set identified by
// msg.JobSetID. When the job manager returns no jobs for that ID, it replies
// with OperationFailed and the message "job set not found".
func (svc *Service) GetJobSetDump(msg *mgrmq.GetJobSetDump) (*mgrmq.GetJobSetDumpResp, *mq.CodeMessage) {
	if dumps := svc.jobMgr.DumpJobSet(msg.JobSetID); len(dumps) > 0 {
		return mq.ReplyOK(mgrmq.RespGetJobSetDump(dumps))
	}

	return nil, mq.Failed(errorcode.OperationFailed, "job set not found")
}
// GetServiceList collects the service ports declared by the normal jobs of the
// job set identified by msg.JobSetID.
//
// Only jobs whose body is a NormalJobDump are considered. For each such job,
// one JobSetServiceInfo is produced per entry in its Services.ServicePortInfos.
// If the job's state is a NormalJobExecutingDump, the storage ID is taken from
// the computing center identified by the job's TargetCCID; otherwise the
// storage ID is left at its zero value.
//
// It replies with OperationFailed when the computing-center lookup fails. A job
// set with no matching jobs yields an OK reply with an empty list.
func (svc *Service) GetServiceList(msg *mgrmq.GetServiceList) (*mgrmq.GetServiceListResp, *mq.CodeMessage) {
	jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)

	var jobSetServiceInfos []schsdk.JobSetServiceInfo
	for _, jo := range jobs {
		norJob, ok := jo.Body.(*jobmod.NormalJobDump)
		if !ok {
			continue
		}

		// Use a checked assertion: an unchecked one would panic the handler if
		// the dump ever carried a different info type. Doing it before the
		// database lookup also avoids a query for a job that gets skipped.
		norJobInfo, ok := jo.Info.(*schsdk.NormalJobInfo)
		if !ok {
			logger.WithField("JobID", jo.JobID).
				Debugf("normal job has unexpected info type %T, skipping its services", jo.Info)
			continue
		}

		// The storage ID is only resolved while the job is executing.
		var cdsNodeID cdssdk.StorageID
		if _, executing := jo.State.(*jobmod.NormalJobExecutingDump); executing {
			computingCenter, err := svc.db.ComputingCenter().GetByID(svc.db.SQLCtx(), norJob.TargetCCID)
			if err != nil {
				return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("get cdsNodeID failed by CCID: %s", err.Error()))
			}
			cdsNodeID = computingCenter.CDSStorageID
		}

		for _, servicePortInfo := range norJobInfo.Services.ServicePortInfos {
			jobSetServiceInfos = append(jobSetServiceInfos, schsdk.JobSetServiceInfo{
				Name:         servicePortInfo.Name,
				Port:         servicePortInfo.Port,
				CDSStorageID: cdsNodeID,
				LocalJobID:   norJobInfo.LocalJobID,
			})
		}
	}

	return mq.ReplyOK(mgrmq.NewGetServiceListResp(jobSetServiceInfos))
}