236 lines
7.4 KiB
Go
236 lines
7.4 KiB
Go
package mq
|
||
|
||
import (
|
||
"context"
|
||
"errors"
|
||
"fmt"
|
||
|
||
"gitlink.org.cn/cloudream/common/pkgs/future"
|
||
|
||
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job/state"
|
||
)
|
||
|
||
// 提交任务集
|
||
func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) {
|
||
logger.Debugf("submitting job")
|
||
|
||
var jobs []jobmgr.SubmittingJob
|
||
for _, jobInfo := range msg.JobSet.Jobs {
|
||
switch info := jobInfo.(type) {
|
||
case *schsdk.NormalJobInfo:
|
||
jo := job.NewNormalJob(*info)
|
||
jo.SubType = schsdk.JobTypeNormal
|
||
|
||
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
||
if !ok {
|
||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
||
}
|
||
|
||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||
Body: jo,
|
||
InitState: state.NewPreSchuduling(preSch),
|
||
})
|
||
|
||
case *schsdk.DataReturnJobInfo:
|
||
jo := job.NewDataReturnJob(*info)
|
||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||
Body: jo,
|
||
InitState: state.NewWaitTargetComplete(),
|
||
})
|
||
|
||
case *schsdk.MultiInstanceJobInfo:
|
||
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
||
|
||
jo := job.NewMultiInstanceJob(*info, preSch)
|
||
|
||
if !ok {
|
||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
||
}
|
||
|
||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||
Body: jo,
|
||
InitState: state.NewMultiInstanceInit(),
|
||
})
|
||
|
||
case *schsdk.UpdateMultiInstanceJobInfo:
|
||
modelJob := job.NewUpdateMultiInstanceJob(*info)
|
||
instanceJobSets := svc.jobMgr.DumpJobSet(modelJob.Info.MultiInstanceJobSetID)
|
||
if len(instanceJobSets) == 0 {
|
||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("job set %s is not found", modelJob.Info.MultiInstanceJobSetID))
|
||
}
|
||
|
||
// 找到多实例任务本身
|
||
var multiInstanceJobDump jobmod.JobDump
|
||
for i := 0; i < len(instanceJobSets); i++ {
|
||
jobDump := instanceJobSets[i]
|
||
if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
|
||
multiInstanceJobDump = jobDump
|
||
break
|
||
}
|
||
}
|
||
|
||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||
Body: modelJob,
|
||
InitState: state.NewMultiInstanceUpdate(multiInstanceJobDump),
|
||
})
|
||
|
||
case *schsdk.DataPreprocessJobInfo:
|
||
// 后续的调度流程跟NormalJob是一致的
|
||
normalJobInfo := &schsdk.NormalJobInfo{
|
||
Type: schsdk.JobTypeNormal,
|
||
JobInfoBase: info.JobInfoBase,
|
||
Files: info.Files,
|
||
Runtime: info.Runtime,
|
||
Services: info.Services,
|
||
Resources: info.Resources,
|
||
}
|
||
jo := job.NewNormalJob(*normalJobInfo)
|
||
jo.SubType = schsdk.JobTypeDataPreprocess
|
||
|
||
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
||
if !ok {
|
||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
||
}
|
||
|
||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||
Body: jo,
|
||
InitState: state.NewPreSchuduling(preSch),
|
||
})
|
||
|
||
case *schsdk.FinetuningJobInfo:
|
||
// 后续的调度流程跟NormalJob是一致的
|
||
normalJobInfo := &schsdk.NormalJobInfo{
|
||
Type: schsdk.JobTypeNormal,
|
||
Files: info.Files,
|
||
JobInfoBase: info.JobInfoBase,
|
||
Runtime: info.Runtime,
|
||
Services: info.Services,
|
||
Resources: info.Resources,
|
||
ModelJobInfo: info.ModelJobInfo,
|
||
}
|
||
jo := job.NewNormalJob(*normalJobInfo)
|
||
jo.SubType = schsdk.JobTypeFinetuning
|
||
|
||
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
||
if !ok {
|
||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
||
}
|
||
|
||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||
Body: jo,
|
||
InitState: state.NewPreSchuduling(preSch),
|
||
})
|
||
}
|
||
}
|
||
|
||
return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs)))
|
||
}
|
||
|
||
func (svc *Service) CreateInstance(instInfo *mgrmq.CreateInstance) (*mgrmq.CreateInstanceResp, *mq.CodeMessage) {
|
||
logger.Debugf("start create instance")
|
||
|
||
fut := future.NewSetValue[event.OperateInstanceResult]()
|
||
info := event.InstanceCreateInfo{
|
||
DataSet: instInfo.DataSet,
|
||
}
|
||
instanceJobSets := svc.jobMgr.DumpJobSet(instInfo.JobSetID)
|
||
if len(instanceJobSets) == 0 {
|
||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("job set %s is not found", instInfo.JobSetID))
|
||
}
|
||
|
||
// 找到多实例任务本身
|
||
var jobID schsdk.JobID
|
||
for i := 0; i < len(instanceJobSets); i++ {
|
||
jobDump := instanceJobSets[i]
|
||
if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
|
||
jobID = jobDump.JobID
|
||
break
|
||
}
|
||
}
|
||
svc.jobMgr.PostEvent(jobID, event.NewInstanceOperate(&info, fut))
|
||
|
||
result, err := fut.Wait(context.TODO())
|
||
|
||
if err != nil {
|
||
return nil, mq.Failed(errorcode.OperationFailed, err.Error())
|
||
}
|
||
|
||
return mq.ReplyOK(mgrmq.NewCreateInstanceResp(result.JobID, result.FilesUploadScheme))
|
||
}
|
||
|
||
// 任务集中某个文件上传完成
|
||
func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded) (*mgrmq.JobSetLocalFileUploadedResp, *mq.CodeMessage) {
|
||
logger.WithField("LocalPath", msg.LocalPath).
|
||
WithField("PackageID", msg.PackageID).
|
||
Debugf("local file uploaded")
|
||
|
||
var err error
|
||
if msg.Error != "" {
|
||
err = errors.New(msg.Error)
|
||
}
|
||
|
||
svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, err, msg.PackageID))
|
||
return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp())
|
||
}
|
||
|
||
func (svc *Service) GetJobSetDump(msg *mgrmq.GetJobSetDump) (*mgrmq.GetJobSetDumpResp, *mq.CodeMessage) {
|
||
jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)
|
||
if len(jobs) == 0 {
|
||
return nil, mq.Failed(errorcode.OperationFailed, "job set not found")
|
||
}
|
||
|
||
return mq.ReplyOK(mgrmq.RespGetJobSetDump(jobs))
|
||
}
|
||
|
||
func (svc *Service) GetServiceList(msg *mgrmq.GetServiceList) (*mgrmq.GetServiceListResp, *mq.CodeMessage) {
|
||
jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)
|
||
|
||
var jobSetServiceInfos []schsdk.JobSetServiceInfo
|
||
|
||
for _, jo := range jobs {
|
||
var cdsNodeID *cdssdk.NodeID
|
||
|
||
norJob, ok := jo.Body.(*jobmod.NormalJobDump)
|
||
if !ok {
|
||
continue
|
||
}
|
||
|
||
_, ok = jo.State.(*jobmod.NormalJobExecutingDump)
|
||
if ok {
|
||
computingCenter, err := svc.db.ComputingCenter().GetByID(svc.db.SQLCtx(), norJob.TargetCCID)
|
||
if err != nil {
|
||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("get cdsNodeID failed by CCID: %s", err.Error()))
|
||
}
|
||
|
||
cdsNodeID = &computingCenter.CDSNodeID
|
||
|
||
} else {
|
||
//返回空指针,表明查询任务不在执行状态,没有id
|
||
cdsNodeID = nil
|
||
}
|
||
|
||
norJobInfo := jo.Info.(*schsdk.NormalJobInfo)
|
||
for _, servicePortInfo := range norJobInfo.Services.ServicePortInfos {
|
||
jobSetServiceInfo := schsdk.JobSetServiceInfo{
|
||
Name: servicePortInfo.Name,
|
||
Port: servicePortInfo.Port,
|
||
CDSNodeID: cdsNodeID,
|
||
LocalJobID: norJobInfo.LocalJobID,
|
||
}
|
||
jobSetServiceInfos = append(jobSetServiceInfos, jobSetServiceInfo)
|
||
}
|
||
}
|
||
|
||
return mq.ReplyOK(mgrmq.NewGetServiceListResp(jobSetServiceInfos))
|
||
}
|