232 lines
7.3 KiB
Go
232 lines
7.3 KiB
Go
package mq
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
|
|
"gitlink.org.cn/cloudream/common/pkgs/future"
|
|
|
|
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
|
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job/state"
|
|
)
|
|
|
|
// 提交任务集
|
|
func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) {
|
|
logger.Debugf("submitting job")
|
|
|
|
var jobs []jobmgr.SubmittingJob
|
|
for _, jobInfo := range msg.JobSet.Jobs {
|
|
switch info := jobInfo.(type) {
|
|
case *schsdk.NormalJobInfo:
|
|
jo := job.NewNormalJob(*info)
|
|
jo.SubType = schsdk.JobTypeNormal
|
|
|
|
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
|
if !ok {
|
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
|
}
|
|
|
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
|
Body: jo,
|
|
InitState: state.NewPreSchuduling(preSch),
|
|
})
|
|
|
|
case *schsdk.DataReturnJobInfo:
|
|
jo := job.NewDataReturnJob(*info)
|
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
|
Body: jo,
|
|
InitState: state.NewWaitTargetComplete(),
|
|
})
|
|
|
|
case *schsdk.MultiInstanceJobInfo:
|
|
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
|
|
|
jo := job.NewMultiInstanceJob(*info, preSch)
|
|
|
|
if !ok {
|
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
|
}
|
|
|
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
|
Body: jo,
|
|
InitState: state.NewMultiInstanceInit(),
|
|
})
|
|
|
|
case *schsdk.UpdateMultiInstanceJobInfo:
|
|
modelJob := job.NewUpdateMultiInstanceJob(*info)
|
|
instanceJobSets := svc.jobMgr.DumpJobSet(modelJob.Info.MultiInstanceJobSetID)
|
|
if len(instanceJobSets) == 0 {
|
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("job set %s is not found", modelJob.Info.MultiInstanceJobSetID))
|
|
}
|
|
|
|
// 找到多实例任务本身
|
|
var multiInstanceJobDump jobmod.JobDump
|
|
for i := 0; i < len(instanceJobSets); i++ {
|
|
jobDump := instanceJobSets[i]
|
|
if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
|
|
multiInstanceJobDump = jobDump
|
|
break
|
|
}
|
|
}
|
|
|
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
|
Body: modelJob,
|
|
InitState: state.NewMultiInstanceUpdate(multiInstanceJobDump),
|
|
})
|
|
|
|
case *schsdk.DataPreprocessJobInfo:
|
|
// 后续的调度流程跟NormalJob是一致的
|
|
normalJobInfo := &schsdk.NormalJobInfo{
|
|
Type: schsdk.JobTypeNormal,
|
|
JobInfoBase: info.JobInfoBase,
|
|
Files: info.Files,
|
|
Runtime: info.Runtime,
|
|
Services: info.Services,
|
|
Resources: info.Resources,
|
|
}
|
|
jo := job.NewNormalJob(*normalJobInfo)
|
|
jo.SubType = schsdk.JobTypeDataPreprocess
|
|
|
|
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
|
if !ok {
|
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
|
}
|
|
|
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
|
Body: jo,
|
|
InitState: state.NewPreSchuduling(preSch),
|
|
})
|
|
|
|
case *schsdk.FinetuningJobInfo:
|
|
// 后续的调度流程跟NormalJob是一致的
|
|
normalJobInfo := &schsdk.NormalJobInfo{
|
|
Type: schsdk.JobTypeNormal,
|
|
Files: info.Files,
|
|
JobInfoBase: info.JobInfoBase,
|
|
Runtime: info.Runtime,
|
|
Services: info.Services,
|
|
Resources: info.Resources,
|
|
ModelJobInfo: info.ModelJobInfo,
|
|
}
|
|
jo := job.NewNormalJob(*normalJobInfo)
|
|
jo.SubType = schsdk.JobTypeFinetuning
|
|
|
|
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
|
if !ok {
|
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
|
}
|
|
|
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
|
Body: jo,
|
|
InitState: state.NewPreSchuduling(preSch),
|
|
})
|
|
}
|
|
}
|
|
|
|
return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs)))
|
|
}
|
|
|
|
func (svc *Service) CreateInstance(instInfo *mgrmq.CreateInstance) (*mgrmq.CreateInstanceResp, *mq.CodeMessage) {
|
|
logger.Debugf("start create instance")
|
|
|
|
fut := future.NewSetValue[event.OperateInstanceResult]()
|
|
info := event.InstanceCreateInfo{
|
|
DataSet: instInfo.DataSet,
|
|
}
|
|
instanceJobSets := svc.jobMgr.DumpJobSet(instInfo.JobSetID)
|
|
if len(instanceJobSets) == 0 {
|
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("job set %s is not found", instInfo.JobSetID))
|
|
}
|
|
|
|
// 找到多实例任务本身
|
|
var jobID schsdk.JobID
|
|
for i := 0; i < len(instanceJobSets); i++ {
|
|
jobDump := instanceJobSets[i]
|
|
if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
|
|
jobID = jobDump.JobID
|
|
break
|
|
}
|
|
}
|
|
svc.jobMgr.PostEvent(jobID, event.NewInstanceOperate(&info, fut))
|
|
|
|
result, err := fut.Wait(context.TODO())
|
|
|
|
if err != nil {
|
|
return nil, mq.Failed(errorcode.OperationFailed, err.Error())
|
|
}
|
|
|
|
return mq.ReplyOK(mgrmq.NewCreateInstanceResp(result.JobID, result.FilesUploadScheme))
|
|
}
|
|
|
|
// 任务集中某个文件上传完成
|
|
func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded) (*mgrmq.JobSetLocalFileUploadedResp, *mq.CodeMessage) {
|
|
logger.WithField("LocalPath", msg.LocalPath).
|
|
WithField("PackageID", msg.PackageID).
|
|
Debugf("local file uploaded")
|
|
|
|
var err error
|
|
if msg.Error != "" {
|
|
err = errors.New(msg.Error)
|
|
}
|
|
|
|
svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, err, msg.PackageID))
|
|
return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp())
|
|
}
|
|
|
|
func (svc *Service) GetJobSetDump(msg *mgrmq.GetJobSetDump) (*mgrmq.GetJobSetDumpResp, *mq.CodeMessage) {
|
|
jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)
|
|
if len(jobs) == 0 {
|
|
return nil, mq.Failed(errorcode.OperationFailed, "job set not found")
|
|
}
|
|
|
|
return mq.ReplyOK(mgrmq.RespGetJobSetDump(jobs))
|
|
}
|
|
|
|
func (svc *Service) GetServiceList(msg *mgrmq.GetServiceList) (*mgrmq.GetServiceListResp, *mq.CodeMessage) {
|
|
jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)
|
|
|
|
var jobSetServiceInfos []schsdk.JobSetServiceInfo
|
|
|
|
for _, jo := range jobs {
|
|
var cdsNodeID cdssdk.StorageID
|
|
|
|
norJob, ok := jo.Body.(*jobmod.NormalJobDump)
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
_, ok = jo.State.(*jobmod.NormalJobExecutingDump)
|
|
if ok {
|
|
computingCenter, err := svc.db.ComputingCenter().GetByID(svc.db.SQLCtx(), norJob.TargetCCID)
|
|
if err != nil {
|
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("get cdsNodeID failed by CCID: %s", err.Error()))
|
|
}
|
|
|
|
cdsNodeID = computingCenter.CDSStorageID
|
|
}
|
|
|
|
norJobInfo := jo.Info.(*schsdk.NormalJobInfo)
|
|
for _, servicePortInfo := range norJobInfo.Services.ServicePortInfos {
|
|
jobSetServiceInfo := schsdk.JobSetServiceInfo{
|
|
Name: servicePortInfo.Name,
|
|
Port: servicePortInfo.Port,
|
|
CDSStorageID: cdsNodeID,
|
|
LocalJobID: norJobInfo.LocalJobID,
|
|
}
|
|
jobSetServiceInfos = append(jobSetServiceInfos, jobSetServiceInfo)
|
|
}
|
|
}
|
|
|
|
return mq.ReplyOK(mgrmq.NewGetServiceListResp(jobSetServiceInfos))
|
|
}
|