重构manager模块

This commit is contained in:
Sydonian 2024-04-26 09:27:10 +08:00
parent 49a80a693c
commit 1e1c8dd691
54 changed files with 1515 additions and 2553 deletions

View File

@ -38,7 +38,7 @@ const (
var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait") var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait")
type Scheduler interface { type Scheduler interface {
Schedule(info *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error)
} }
type candidate struct { type candidate struct {
@ -129,7 +129,7 @@ func NewDefaultSchedule() *DefaultScheduler {
return &DefaultScheduler{} return &DefaultScheduler{}
} }
func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) { func (s *DefaultScheduler) Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error) {
mgrCli, err := schglb.ManagerMQPool.Acquire() mgrCli, err := schglb.ManagerMQPool.Acquire()
if err != nil { if err != nil {
return nil, fmt.Errorf("new collector client: %w", err) return nil, fmt.Errorf("new collector client: %w", err)
@ -151,17 +151,17 @@ func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleS
for _, cc := range allCC.ComputingCenters { for _, cc := range allCC.ComputingCenters {
allCCs[cc.CCID] = &candidate{ allCCs[cc.CCID] = &candidate{
CC: cc, CC: cc,
IsPreScheduled: cc.CCID == job.TargetCCID, IsPreScheduled: cc.CCID == status.TargetCCID,
} }
} }
// 计算 // 计算
err = s.calcFileScore(job.Files, allCCs) err = s.calcFileScore(status.Files, allCCs)
if err != nil { if err != nil {
return nil, err return nil, err
} }
err = s.calcResourceScore(job, allCCs) err = s.calcResourceScore(info, allCCs)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -204,9 +204,9 @@ func (s *DefaultScheduler) makeSchemeForNode(targetCC *candidate) jobmod.JobSche
return scheme return scheme
} }
func (s *DefaultScheduler) calcResourceScore(job *jobmod.NormalJob, allCCs map[schsdk.CCID]*candidate) error { func (s *DefaultScheduler) calcResourceScore(info *schsdk.NormalJobInfo, allCCs map[schsdk.CCID]*candidate) error {
for _, cc := range allCCs { for _, cc := range allCCs {
res, err := s.calcOneResourceScore(job.Info.Resources, &cc.CC) res, err := s.calcOneResourceScore(info.Resources, &cc.CC)
if err != nil { if err != nil {
return err return err
} }

View File

@ -8,6 +8,7 @@ import (
"github.com/samber/lo" "github.com/samber/lo"
"gitlink.org.cn/cloudream/common/pkgs/future" "gitlink.org.cn/cloudream/common/pkgs/future"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
) )
@ -30,7 +31,7 @@ func NewService(scheduler Scheduler) *Service {
} }
} }
func (s *Service) MakeScheme(job jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) { func (s *Service) MakeScheme(job schsdk.NormalJobInfo) (*jobmod.JobScheduleScheme, error) {
s.lock.Lock() s.lock.Lock()
callback := future.NewSetValue[*jobmod.JobScheduleScheme]() callback := future.NewSetValue[*jobmod.JobScheduleScheme]()
s.jobs = append(s.jobs, &schedulingJob{ s.jobs = append(s.jobs, &schedulingJob{

View File

@ -4,7 +4,7 @@ import (
"gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/consts/errorcode"
"gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/pkgs/mq" "gitlink.org.cn/cloudream/common/pkgs/mq"
"gitlink.org.cn/cloudream/common/utils/reflect" "gitlink.org.cn/cloudream/common/utils/reflect2"
myglbs "gitlink.org.cn/cloudream/scheduler/advisor/internal/globals" myglbs "gitlink.org.cn/cloudream/scheduler/advisor/internal/globals"
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor" advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
) )
@ -12,7 +12,7 @@ import (
func (svc *Service) StartTask(msg *advmq.StartTask) (*advmq.StartTaskResp, *mq.CodeMessage) { func (svc *Service) StartTask(msg *advmq.StartTask) (*advmq.StartTaskResp, *mq.CodeMessage) {
tsk, err := svc.taskManager.StartByInfo(msg.Info) tsk, err := svc.taskManager.StartByInfo(msg.Info)
if err != nil { if err != nil {
logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()). logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()).
Warnf("starting task by info: %s", err.Error()) Warnf("starting task by info: %s", err.Error())
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed") return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
} }

View File

@ -39,7 +39,7 @@ func (t *MakeScheduleScheme) Execute(task *task.Task[TaskContext], ctx TaskConte
} }
func (t *MakeScheduleScheme) do(taskID string, ctx TaskContext) (*jobmod.JobScheduleScheme, error) { func (t *MakeScheduleScheme) do(taskID string, ctx TaskContext) (*jobmod.JobScheduleScheme, error) {
scheme, err := ctx.scheduleSvc.MakeScheme(t.Job) scheme, err := ctx.scheduleSvc.MakeScheme(t.JobInfo)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -5,7 +5,7 @@ import (
"reflect" "reflect"
"gitlink.org.cn/cloudream/common/pkgs/task" "gitlink.org.cn/cloudream/common/pkgs/task"
myreflect "gitlink.org.cn/cloudream/common/utils/reflect" "gitlink.org.cn/cloudream/common/utils/reflect2"
reporter "gitlink.org.cn/cloudream/scheduler/advisor/internal/reporter" reporter "gitlink.org.cn/cloudream/scheduler/advisor/internal/reporter"
"gitlink.org.cn/cloudream/scheduler/advisor/internal/scheduler" "gitlink.org.cn/cloudream/scheduler/advisor/internal/scheduler"
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task" advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
@ -40,7 +40,7 @@ func NewManager(reporter *reporter.Reporter, scheduleSvc *scheduler.Service) Man
} }
func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) { func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) {
infoType := myreflect.TypeOfValue(info) infoType := reflect2.TypeOfValue(info)
ctor, ok := taskFromInfoCtors[infoType] ctor, ok := taskFromInfoCtors[infoType]
if !ok { if !ok {
@ -53,7 +53,7 @@ func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) {
var taskFromInfoCtors map[reflect.Type]func(advtsk.TaskInfo) TaskBody = make(map[reflect.Type]func(advtsk.TaskInfo) task.TaskBody[TaskContext]) var taskFromInfoCtors map[reflect.Type]func(advtsk.TaskInfo) TaskBody = make(map[reflect.Type]func(advtsk.TaskInfo) task.TaskBody[TaskContext])
func Register[TInfo advtsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) { func Register[TInfo advtsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody { taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody {
return ctor(info.(TInfo)) return ctor(info.(TInfo))
} }
} }

View File

@ -171,7 +171,7 @@ func (s *DefaultPreScheduler) Schedule(info *schsdk.JobSetInfo) (*jobmod.JobSetP
if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok { if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok {
j.Afters = append(j.Afters, resFile.ResourceLocalJobID) j.Afters = append(j.Afters, resFile.ResourceLocalJobID)
} }
} else if resJob, ok := job.(*schsdk.ResourceJobInfo); ok { } else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok {
j.Afters = append(j.Afters, resJob.TargetLocalJobID) j.Afters = append(j.Afters, resJob.TargetLocalJobID)
} }
@ -270,7 +270,7 @@ func (s *DefaultPreScheduler) scheduleForNormalJob(jobSet *schsdk.JobSetInfo, jo
// 检查此节点是否是它所引用的任务所选的节点 // 检查此节点是否是它所引用的任务所选的节点
for _, af := range job.Afters { for _, af := range job.Afters {
resJob := findJobInfo[*schsdk.ResourceJobInfo](jobSet.Jobs, af) resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af)
if resJob == nil { if resJob == nil {
return nil, fmt.Errorf("resource job %s not found in the job set", af) return nil, fmt.Errorf("resource job %s not found in the job set", af)
} }

View File

@ -1,10 +1,8 @@
package jobmod package jobmod
import ( import (
"github.com/samber/lo"
"gitlink.org.cn/cloudream/common/pkgs/types"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"gitlink.org.cn/cloudream/common/utils/serder" cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
) )
type FileScheduleAction string type FileScheduleAction string
@ -34,66 +32,41 @@ type JobSetPreScheduleScheme struct {
JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID
} }
// 任务集 type JobFiles struct {
type JobSet struct { Dataset PackageJobFile `json:"dataset"`
JobSetID schsdk.JobSetID `json:"jobSetID"` // 全局唯一的任务集ID Code PackageJobFile `json:"code"`
JobRefs []JobSetJobRef `json:"jobRefs"` // 任务集中包含的任务,只是一个引用 Image ImageJobFile `json:"image"`
PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
}
type JobSetJobRef struct {
JobID schsdk.JobID `json:"jobID"` // 任务ID
LocalJobID string `json:"localJobID"` // 在当前任务集内的任务ID
} }
func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet { type PackageJobFile struct {
return &JobSet{ PackageID cdssdk.PackageID `json:"packageID"`
JobSetID: jobSetID, FullPath string `json:"fullPath"` // Load之后的完整文件路径
JobRefs: jobRefs,
PreScheduleScheme: preScheduleScheme,
}
} }
func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef { type ImageJobFile struct {
ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID }) PackageID *cdssdk.PackageID `json:"packageID"`
if !ok { ImageID schsdk.ImageID `json:"imageID"`
return nil
}
return &ref
} }
// 任务 type JobStatus struct {
type Job interface { JobID schsdk.JobID `json:"jobID"`
GetJobSetID() schsdk.JobSetID JobSetID schsdk.JobSetID `json:"jobSetID"`
GetJobID() schsdk.JobID Info schsdk.JobInfo `json:"info"`
GetState() JobState Body JobBodyStatus `json:"body"`
SetState(state JobState) State JobStateStatus `json:"state"`
Clone() Job
} }
var JobTypeUnion = types.NewTypeUnion[Job]( type JobBodyStatus interface {
(*NormalJob)(nil),
(*ResourceJob)(nil),
)
var _ = serder.UseTypeUnionExternallyTagged(&JobTypeUnion)
// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobTypeUnion, "Type", "type")
type JobBase struct {
JobSetID schsdk.JobSetID `json:"jobSetID"` // 任务集ID
JobID schsdk.JobID `json:"jobID"` // 全局唯一任务ID
State JobState `json:"state"` // 任务当前的状态。包含当前在状态下执行操作所需的数据
} }
func (j *JobBase) GetJobSetID() schsdk.JobSetID { type NormalJobStatus struct {
return j.JobSetID TargetCCID schsdk.CCID `json:"targetCCID"`
Files JobFiles `json:"files"`
} }
func (j *JobBase) GetJobID() schsdk.JobID {
return j.JobID type DataReturnJobStatus struct {
DataReturnPackageID cdssdk.PackageID `json:"dataReturnPackageID"`
} }
func (j *JobBase) GetState() JobState {
return j.State type JobStateStatus interface {
}
func (j *JobBase) SetState(state JobState) {
j.State = state
} }

View File

@ -1,46 +0,0 @@
package jobmod
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
)
type NormalJob struct {
JobBase
Info schsdk.NormalJobInfo `json:"info"` // 提交任务时提供的任务描述信息
Files JobFiles `json:"files"` // 任务需要的文件
TargetCCID schsdk.CCID `json:"targetSlwNodeID"` // 将要运行此任务的算力中心ID
OutputFullPath string `json:"outputFullPath"` // 程序结果的完整输出路径
}
func NewNormalJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.NormalJobInfo) *NormalJob {
return &NormalJob{
JobBase: JobBase{
JobSetID: jobSetID,
JobID: jobID,
},
Info: info,
}
}
func (j *NormalJob) Clone() Job {
tmp := *j
tmp.State = tmp.State.Clone()
return &tmp
}
type JobFiles struct {
Dataset PackageJobFile `json:"dataset"`
Code PackageJobFile `json:"code"`
Image ImageJobFile `json:"image"`
}
type PackageJobFile struct {
PackageID cdssdk.PackageID `json:"packageID"`
FullPath string `json:"fullPath"` // Load之后的完整文件路径
}
type ImageJobFile struct {
PackageID *cdssdk.PackageID `json:"packageID"`
ImageID schsdk.ImageID `json:"imageID"`
}

View File

@ -1,28 +0,0 @@
package jobmod
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
)
type ResourceJob struct {
JobBase
Info schsdk.ResourceJobInfo `json:"info"`
ResourcePackageID cdssdk.PackageID `json:"resourcePackageID"` // 回源之后得到的PackageID
}
func NewResourceJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.ResourceJobInfo) *ResourceJob {
return &ResourceJob{
JobBase: JobBase{
JobSetID: jobSetID,
JobID: jobID,
},
Info: info,
}
}
func (j *ResourceJob) Clone() Job {
tmp := *j
tmp.State = tmp.State.Clone()
return &tmp
}

View File

@ -1,183 +0,0 @@
package jobmod
import (
"gitlink.org.cn/cloudream/common/pkgs/types"
"gitlink.org.cn/cloudream/common/utils/serder"
)
type JobState interface {
Clone() JobState
}
type JobStateBase struct{}
var JobStateTypeUnion = types.NewTypeUnion[JobState](
(*StatePreScheduling)(nil),
(*StateReadyToAdjust)(nil),
(*StateMakingAdjustScheme)(nil),
(*StateAdjusting)(nil),
(*StateReadyToExecute)(nil),
(*StateExecuting)(nil),
(*StateFailed)(nil),
(*StateSuccess)(nil),
)
var _ = serder.UseTypeUnionExternallyTagged(&JobStateTypeUnion)
// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobStateTypeUnion, "Type", "type")
type FileSchedulingStep string
const (
StepBegin FileSchedulingStep = "Begin" // 准备开始调度
StepUploading FileSchedulingStep = "Uploading" // 正在等待文件上传
StepUploaded FileSchedulingStep = "Uploaded" // 文件上传完成
StepMoving FileSchedulingStep = "Moving" // 正在移动缓存
StepLoading FileSchedulingStep = "Loading" // 正在加载
StepImageImporting FileSchedulingStep = "ImageImporting" // 正在导入镜像
StepCompleted FileSchedulingStep = "Completed" // 完成
)
type FileSchedulingState struct {
Step FileSchedulingStep `json:"step"`
Error string `json:"error"`
FullTaskID string `json:"fullTaskID"`
}
type StatePreScheduling struct {
JobStateBase
Scheme JobScheduleScheme `json:"scheme"`
Dataset FileSchedulingState `json:"dataset"`
Code FileSchedulingState `json:"code"`
Image FileSchedulingState `json:"image"`
}
func NewStatePreScheduling(scheme JobScheduleScheme) *StatePreScheduling {
return &StatePreScheduling{
Scheme: scheme,
Dataset: FileSchedulingState{
Step: StepBegin,
},
Code: FileSchedulingState{
Step: StepBegin,
},
Image: FileSchedulingState{
Step: StepBegin,
},
}
}
func (s *StatePreScheduling) Clone() JobState {
tmp := *s
return &tmp
}
type StateReadyToAdjust struct {
JobStateBase
}
func NewStateReadyToAdjust() *StateReadyToAdjust {
return &StateReadyToAdjust{}
}
func (s *StateReadyToAdjust) Clone() JobState {
tmp := *s
return &tmp
}
type StateMakingAdjustScheme struct {
JobStateBase
FullTaskID string `json:"fullTaskID"`
}
func NewStateMakingAdjustScheme() *StateMakingAdjustScheme {
return &StateMakingAdjustScheme{}
}
func (s *StateMakingAdjustScheme) Clone() JobState {
tmp := *s
return &tmp
}
type StateAdjusting struct {
JobStateBase
Scheme JobScheduleScheme `json:"scheme"`
Dataset FileSchedulingState `json:"dataset"`
Code FileSchedulingState `json:"code"`
Image FileSchedulingState `json:"image"`
}
func NewStateAdjusting(scheme JobScheduleScheme) *StateAdjusting {
return &StateAdjusting{
Scheme: scheme,
Dataset: FileSchedulingState{
Step: StepBegin,
},
Code: FileSchedulingState{
Step: StepBegin,
},
Image: FileSchedulingState{
Step: StepBegin,
},
}
}
func (s *StateAdjusting) Clone() JobState {
tmp := *s
return &tmp
}
type StateReadyToExecute struct {
JobStateBase
}
func NewStateReadyToExecute() *StateReadyToExecute {
return &StateReadyToExecute{}
}
func (s *StateReadyToExecute) Clone() JobState {
tmp := *s
return &tmp
}
type StateExecuting struct {
JobStateBase
FullTaskID string `json:"fullTaskID"`
}
func NewStateExecuting() *StateExecuting {
return &StateExecuting{}
}
func (s *StateExecuting) Clone() JobState {
tmp := *s
return &tmp
}
type StateFailed struct {
JobStateBase
Error string `json:"error"`
LastState JobState `json:"lastState"`
}
func NewStateFailed(err string, lastState JobState) *StateFailed {
return &StateFailed{
Error: err,
LastState: lastState,
}
}
func (s *StateFailed) Clone() JobState {
tmp := *s
return &tmp
}
type StateSuccess struct {
JobStateBase
}
func NewStateSuccess() *StateSuccess {
return &StateSuccess{}
}
func (s *StateSuccess) Clone() JobState {
tmp := *s
return &tmp
}

View File

@ -8,7 +8,7 @@ import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops" uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
myreflect "gitlink.org.cn/cloudream/common/utils/reflect" "gitlink.org.cn/cloudream/common/utils/reflect2"
"gitlink.org.cn/cloudream/common/utils/serder" "gitlink.org.cn/cloudream/common/utils/serder"
) )
@ -78,7 +78,7 @@ type CCResourceInfo struct {
func (i *CCResourceInfo) Scan(src interface{}) error { func (i *CCResourceInfo) Scan(src interface{}) error {
data, ok := src.([]uint8) data, ok := src.([]uint8)
if !ok { if !ok {
return fmt.Errorf("unknow src type: %v", myreflect.TypeOfValue(data).String()) return fmt.Errorf("unknow src type: %v", reflect2.TypeOfValue(data).String())
} }
return serder.JSONToObject(data, i) return serder.JSONToObject(data, i)

View File

@ -1,17 +1,20 @@
package task package task
import ( import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
) )
type MakeAdjustScheme struct { type MakeAdjustScheme struct {
TaskInfoBase TaskInfoBase
Job jobmod.NormalJob `json:"job"` JobInfo schsdk.NormalJobInfo `json:"jobInfo"`
JobStatus jobmod.NormalJobStatus `json:"jobStatus"`
} }
func NewMakeAdjustScheme(job jobmod.NormalJob) *MakeAdjustScheme { func NewMakeAdjustScheme(jobInfo schsdk.NormalJobInfo, jobStatus jobmod.NormalJobStatus) *MakeAdjustScheme {
return &MakeAdjustScheme{ return &MakeAdjustScheme{
Job: job, JobInfo: jobInfo,
JobStatus: jobStatus,
} }
} }

View File

@ -2,7 +2,7 @@ package task
import ( import (
"gitlink.org.cn/cloudream/common/pkgs/types" "gitlink.org.cn/cloudream/common/pkgs/types"
myreflect "gitlink.org.cn/cloudream/common/utils/reflect" "gitlink.org.cn/cloudream/common/utils/reflect2"
"gitlink.org.cn/cloudream/common/utils/serder" "gitlink.org.cn/cloudream/common/utils/serder"
) )
@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {}
// 只能在init函数中调用因为包级变量初始化会比init函数调用先进行 // 只能在init函数中调用因为包级变量初始化会比init函数调用先进行
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any { func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]()) TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]())
TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]()) TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]())
return nil return nil
} }

View File

@ -2,7 +2,7 @@ package task
import ( import (
"gitlink.org.cn/cloudream/common/pkgs/types" "gitlink.org.cn/cloudream/common/pkgs/types"
myreflect "gitlink.org.cn/cloudream/common/utils/reflect" "gitlink.org.cn/cloudream/common/utils/reflect2"
"gitlink.org.cn/cloudream/common/utils/serder" "gitlink.org.cn/cloudream/common/utils/serder"
) )
@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {}
// 只能在init函数中调用因为包级变量初始化会比init函数调用先进行 // 只能在init函数中调用因为包级变量初始化会比init函数调用先进行
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any { func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]()) TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]())
TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]()) TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]())
return nil return nil
} }

View File

@ -12,9 +12,7 @@ type JobService interface {
JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded) (*JobSetLocalFileUploadedResp, *mq.CodeMessage) JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded) (*JobSetLocalFileUploadedResp, *mq.CodeMessage)
GetJob(msg *GetJob) (*GetJobResp, *mq.CodeMessage) GetJobSetStatus(msg *GetJobSetStatus) (*GetJobSetStatusResp, *mq.CodeMessage)
// GetJobSetJobs(msg *GetJobSetJobs) (*GetJobSetJobsResp, *mq.CodeMessage)
} }
// 提交任务集 // 提交任务集
@ -74,52 +72,28 @@ func (c *Client) JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded, opts ...m
return mq.Request(Service.JobSetLocalFileUploaded, c.roundTripper, msg, opts...) return mq.Request(Service.JobSetLocalFileUploaded, c.roundTripper, msg, opts...)
} }
// 获取任务数据 var _ = Register(Service.GetJobSetStatus)
type GetJob struct {
// 获取任务集的状态
type GetJobSetStatus struct {
mq.MessageBodyBase mq.MessageBodyBase
JobID schsdk.JobID `json:"jobID"` JobSetID schsdk.JobSetID `json:"jobSetID"`
} }
type GetJobResp struct { type GetJobSetStatusResp struct {
mq.MessageBodyBase mq.MessageBodyBase
Job jobmod.Job `json:"job"` Jobs []jobmod.JobStatus `json:"jobs"`
} }
func NewGetJob(jobID schsdk.JobID) *GetJob { func ReqGetJobSetStatus(jobSetID schsdk.JobSetID) *GetJobSetStatus {
return &GetJob{ return &GetJobSetStatus{
JobID: jobID,
}
}
func NewGetJobResp(job jobmod.Job) *GetJobResp {
return &GetJobResp{
Job: job,
}
}
func (c *Client) GetJob(msg *GetJob, opts ...mq.RequestOption) (*GetJobResp, error) {
return mq.Request(Service.GetJob, c.roundTripper, msg, opts...)
}
/*
// 获取指定任务集中的所有任务数据
type GetJobSetJobs struct {
mq.MessageBodyBase
JobSetID string `json:"jobSetID"`
}
type GetJobSetJobsResp struct {
mq.MessageBodyBase
Jobs []jobmod.Job `json:"jobs"`
}
func NewGetJobSetJobs(jobSetID string) *GetJobSetJobs {
return &GetJobSetJobs{
JobSetID: jobSetID, JobSetID: jobSetID,
} }
} }
func NewGetJobSetJobsResp(jobs []jobmod.Job) *GetJobSetJobsResp { func RespGetJobSetStatus(jobs []jobmod.JobStatus) *GetJobSetStatusResp {
return &GetJobSetJobsResp{ return &GetJobSetStatusResp{
Jobs: jobs, Jobs: jobs,
} }
} }
func (c *Client) GetJobSetJobs(msg *GetJobSetJobs, opts ...mq.RequestOption) (*GetJobSetJobsResp, error) { func (c *Client) GetJob(msg *GetJobSetStatus, opts ...mq.RequestOption) (*GetJobSetStatusResp, error) {
return mq.Request(Service.GetJobSetJobs, c.rabbitCli, msg, opts...) return mq.Request(Service.GetJobSetStatus, c.roundTripper, msg, opts...)
} }
*/

View File

@ -4,7 +4,7 @@ import (
"gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/consts/errorcode"
"gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/pkgs/mq" "gitlink.org.cn/cloudream/common/pkgs/mq"
"gitlink.org.cn/cloudream/common/utils/reflect" "gitlink.org.cn/cloudream/common/utils/reflect2"
execmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor" execmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
myglbs "gitlink.org.cn/cloudream/scheduler/executor/internal/globals" myglbs "gitlink.org.cn/cloudream/scheduler/executor/internal/globals"
) )
@ -12,7 +12,7 @@ import (
func (svc *Service) StartTask(msg *execmq.StartTask) (*execmq.StartTaskResp, *mq.CodeMessage) { func (svc *Service) StartTask(msg *execmq.StartTask) (*execmq.StartTaskResp, *mq.CodeMessage) {
tsk, err := svc.taskManager.StartByInfo(msg.Info) tsk, err := svc.taskManager.StartByInfo(msg.Info)
if err != nil { if err != nil {
logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()). logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()).
Warnf("starting task by info: %s", err.Error()) Warnf("starting task by info: %s", err.Error())
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed") return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
} }

View File

@ -81,7 +81,7 @@ func (t *PCMSubmitTask) do(taskID string, ctx TaskContext) error {
return nil return nil
} }
if tsResp.TaskStatus == pcmsdk.TaskStatuFailed { if tsResp.TaskStatus == pcmsdk.TaskStatusFailed {
// TODO 返回更详细的信息 // TODO 返回更详细的信息
return fmt.Errorf("task failed") return fmt.Errorf("task failed")
} }

View File

@ -5,7 +5,7 @@ import (
"reflect" "reflect"
"gitlink.org.cn/cloudream/common/pkgs/task" "gitlink.org.cn/cloudream/common/pkgs/task"
myreflect "gitlink.org.cn/cloudream/common/utils/reflect" "gitlink.org.cn/cloudream/common/utils/reflect2"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
reporter "gitlink.org.cn/cloudream/scheduler/executor/internal/reporter" reporter "gitlink.org.cn/cloudream/scheduler/executor/internal/reporter"
) )
@ -37,7 +37,7 @@ func NewManager(reporter *reporter.Reporter) Manager {
} }
func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) { func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) {
infoType := myreflect.TypeOfValue(info) infoType := reflect2.TypeOfValue(info)
ctor, ok := taskFromInfoCtors[infoType] ctor, ok := taskFromInfoCtors[infoType]
if !ok { if !ok {
@ -50,7 +50,7 @@ func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) {
var taskFromInfoCtors map[reflect.Type]func(exectsk.TaskInfo) TaskBody = make(map[reflect.Type]func(exectsk.TaskInfo) task.TaskBody[TaskContext]) var taskFromInfoCtors map[reflect.Type]func(exectsk.TaskInfo) TaskBody = make(map[reflect.Type]func(exectsk.TaskInfo) task.TaskBody[TaskContext])
func Register[TInfo exectsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) { func Register[TInfo exectsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody { taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody {
return ctor(info.(TInfo)) return ctor(info.(TInfo))
} }
} }

View File

@ -5,7 +5,7 @@ import (
"sync" "sync"
"time" "time"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" "gitlink.org.cn/cloudream/common/utils/sync2"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals" schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models" schmod "gitlink.org.cn/cloudream/scheduler/common/models"
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor" advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
@ -13,29 +13,23 @@ import (
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
) )
type jobTask struct { type task struct {
JobID schsdk.JobID statusChan *sync2.Channel[advtsk.TaskStatus]
TaskID string
FullTaskID string
} }
type AdvisorInfo struct { type AdvisorInfo struct {
advisorID schmod.AdvisorID advisorID schmod.AdvisorID
jobTasks map[string]jobTask // key 为 TaskID tasks map[string]task // key 为 TaskID
lastReportTime time.Time lastReportTime time.Time
} }
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
type Manager struct { type Manager struct {
advisors map[schmod.AdvisorID]*AdvisorInfo advisors map[schmod.AdvisorID]*AdvisorInfo
lock sync.Mutex lock sync.Mutex
advCli *advmq.Client advCli *advmq.Client
onTaskUpdated OnTaskUpdatedCallbackFn
onTaskTimeout OnTimeoutCallbackFn
reportTimeout time.Duration reportTimeout time.Duration
} }
@ -52,83 +46,66 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) {
}, nil }, nil
} }
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
m.onTaskUpdated = callback
}
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
m.onTaskTimeout = callback
}
func (m *Manager) Report(advID schmod.AdvisorID, taskStatus []mgrmq.AdvisorTaskStatus) { func (m *Manager) Report(advID schmod.AdvisorID, taskStatus []mgrmq.AdvisorTaskStatus) {
m.lock.Lock() m.lock.Lock()
defer m.lock.Unlock() defer m.lock.Unlock()
info, ok := m.advisors[advID] adv, ok := m.advisors[advID]
if !ok { if !ok {
info = &AdvisorInfo{ adv = &AdvisorInfo{
advisorID: advID, advisorID: advID,
jobTasks: make(map[string]jobTask), tasks: make(map[string]task),
} }
m.advisors[advID] = info m.advisors[advID] = adv
} }
info.lastReportTime = time.Now() adv.lastReportTime = time.Now()
for _, s := range taskStatus { for _, s := range taskStatus {
tsk, ok := info.jobTasks[s.TaskID] tsk, ok := adv.tasks[s.TaskID]
if !ok { if !ok {
continue continue
} }
m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status) // TODO 考虑主动检测channel是否关闭然后取消task
if tsk.statusChan.Send(s.Status) != nil {
delete(adv.tasks, s.TaskID)
if len(adv.tasks) == 0 {
delete(m.advisors, advID)
}
}
} }
} }
// 启动一个Task并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID // 启动一个Task
func (m *Manager) StartTask(jobID schsdk.JobID, info advtsk.TaskInfo) (string, error) { func (m *Manager) StartTask(info advtsk.TaskInfo) *sync2.Channel[advtsk.TaskStatus] {
m.lock.Lock() m.lock.Lock()
defer m.lock.Unlock() defer m.lock.Unlock()
ch := sync2.NewChannel[advtsk.TaskStatus]()
resp, err := m.advCli.StartTask(advmq.NewStartTask(info)) resp, err := m.advCli.StartTask(advmq.NewStartTask(info))
if err != nil { if err != nil {
return "", err ch.CloseWithError(fmt.Errorf("start task: %w", err))
return ch
} }
fullTaskID := fmt.Sprintf("%s-%s", resp.AdvisorID, resp.TaskID)
exeInfo, ok := m.advisors[resp.AdvisorID] exeInfo, ok := m.advisors[resp.AdvisorID]
if !ok { if !ok {
exeInfo = &AdvisorInfo{ exeInfo = &AdvisorInfo{
advisorID: resp.AdvisorID, advisorID: resp.AdvisorID,
jobTasks: make(map[string]jobTask), tasks: make(map[string]task),
lastReportTime: time.Now(), lastReportTime: time.Now(),
} }
m.advisors[resp.AdvisorID] = exeInfo m.advisors[resp.AdvisorID] = exeInfo
} }
exeInfo.jobTasks[resp.TaskID] = jobTask{ exeInfo.tasks[resp.TaskID] = task{
JobID: jobID, statusChan: ch,
TaskID: resp.TaskID,
FullTaskID: fullTaskID,
} }
return fullTaskID, nil return ch
}
// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新
func (m *Manager) ForgetTask(fullTaskID string) {
m.lock.Lock()
defer m.lock.Unlock()
for _, exe := range m.advisors {
for _, tsk := range exe.jobTasks {
if tsk.FullTaskID == fullTaskID {
delete(exe.jobTasks, fullTaskID)
return
}
}
}
} }
func (m *Manager) Serve() error { func (m *Manager) Serve() error {
@ -150,8 +127,8 @@ func (m *Manager) Serve() error {
continue continue
} }
for _, tsk := range exeInfo.jobTasks { for _, tsk := range exeInfo.tasks {
m.onTaskTimeout(tsk.JobID, tsk.FullTaskID) tsk.statusChan.CloseWithError(ErrWaitReportTimeout)
} }
delete(m.advisors, exeID) delete(m.advisors, exeID)

View File

@ -5,7 +5,7 @@ import (
"sync" "sync"
"time" "time"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" "gitlink.org.cn/cloudream/common/utils/sync2"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals" schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models" schmod "gitlink.org.cn/cloudream/scheduler/common/models"
@ -14,29 +14,22 @@ import (
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
) )
type jobTask struct { type task struct {
JobID schsdk.JobID statusChan *sync2.Channel[exetsk.TaskStatus]
TaskID string
FullTaskID string
} }
type ExecutorStatus struct {
type ExecutorInfo struct {
executorID schmod.ExecutorID executorID schmod.ExecutorID
jobTasks map[string]jobTask // key 为 TaskID tasks map[string]task // key 为 TaskID
lastReportTime time.Time lastReportTime time.Time
} }
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus exetsk.TaskStatus) var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
type Manager struct { type Manager struct {
executors map[schmod.ExecutorID]*ExecutorInfo executors map[schmod.ExecutorID]*ExecutorStatus
lock sync.Mutex lock sync.Mutex
exeCli *exemq.Client exeCli *exemq.Client
onTaskUpdated OnTaskUpdatedCallbackFn
onTaskTimeout OnTimeoutCallbackFn
reportTimeout time.Duration reportTimeout time.Duration
} }
@ -47,89 +40,71 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) {
} }
return &Manager{ return &Manager{
executors: make(map[schmod.ExecutorID]*ExecutorInfo), executors: make(map[schmod.ExecutorID]*ExecutorStatus),
exeCli: exeCli, exeCli: exeCli,
reportTimeout: reportTimeout, reportTimeout: reportTimeout,
}, nil }, nil
} }
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
m.onTaskUpdated = callback
}
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
m.onTaskTimeout = callback
}
func (m *Manager) Report(execID schmod.ExecutorID, taskStatus []mgrmq.ExecutorTaskStatus) { func (m *Manager) Report(execID schmod.ExecutorID, taskStatus []mgrmq.ExecutorTaskStatus) {
m.lock.Lock() m.lock.Lock()
defer m.lock.Unlock() defer m.lock.Unlock()
info, ok := m.executors[execID] exec, ok := m.executors[execID]
if !ok { if !ok {
info = &ExecutorInfo{ exec = &ExecutorStatus{
executorID: execID, executorID: execID,
jobTasks: make(map[string]jobTask), tasks: make(map[string]task),
} }
m.executors[execID] = info m.executors[execID] = exec
} }
info.lastReportTime = time.Now() exec.lastReportTime = time.Now()
for _, s := range taskStatus { for _, s := range taskStatus {
tsk, ok := info.jobTasks[s.TaskID] tsk, ok := exec.tasks[s.TaskID]
if !ok { if !ok {
continue continue
} }
m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status) // TODO 考虑主动检测channel是否关闭然后取消task
if tsk.statusChan.Send(s.Status) != nil {
delete(exec.tasks, s.TaskID)
if len(exec.tasks) == 0 {
delete(m.executors, execID)
}
}
} }
} }
// 启动一个Task并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID // 启动一个Task
func (m *Manager) StartTask(jobID schsdk.JobID, info exetsk.TaskInfo) (string, error) { func (m *Manager) StartTask(info exetsk.TaskInfo) *sync2.Channel[exetsk.TaskStatus] {
m.lock.Lock() m.lock.Lock()
defer m.lock.Unlock() defer m.lock.Unlock()
ch := sync2.NewChannel[exetsk.TaskStatus]()
resp, err := m.exeCli.StartTask(exemq.NewStartTask(info)) resp, err := m.exeCli.StartTask(exemq.NewStartTask(info))
if err != nil { if err != nil {
return "", err ch.CloseWithError(fmt.Errorf("start task: %w", err))
return ch
} }
fullTaskID := fmt.Sprintf("%s-%s", resp.ExecutorID, resp.TaskID)
exeInfo, ok := m.executors[resp.ExecutorID] exeInfo, ok := m.executors[resp.ExecutorID]
if !ok { if !ok {
exeInfo = &ExecutorInfo{ exeInfo = &ExecutorStatus{
executorID: resp.ExecutorID, executorID: resp.ExecutorID,
jobTasks: make(map[string]jobTask), tasks: make(map[string]task),
lastReportTime: time.Now(), lastReportTime: time.Now(),
} }
m.executors[resp.ExecutorID] = exeInfo m.executors[resp.ExecutorID] = exeInfo
} }
exeInfo.jobTasks[resp.TaskID] = jobTask{ exeInfo.tasks[resp.TaskID] = task{
JobID: jobID, statusChan: ch,
TaskID: resp.TaskID,
FullTaskID: fullTaskID,
} }
return fullTaskID, nil return ch
}
// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新
func (m *Manager) ForgetTask(fullTaskID string) {
m.lock.Lock()
defer m.lock.Unlock()
for _, exe := range m.executors {
for _, tsk := range exe.jobTasks {
if tsk.FullTaskID == fullTaskID {
delete(exe.jobTasks, fullTaskID)
return
}
}
}
} }
func (m *Manager) Serve() error { func (m *Manager) Serve() error {
@ -151,8 +126,8 @@ func (m *Manager) Serve() error {
continue continue
} }
for _, tsk := range exeInfo.jobTasks { for _, tsk := range exeInfo.tasks {
m.onTaskTimeout(tsk.JobID, tsk.FullTaskID) tsk.statusChan.CloseWithError(ErrWaitReportTimeout)
} }
delete(m.executors, exeID) delete(m.executors, exeID)

View File

@ -1,371 +0,0 @@
package jobmgr
import (
"fmt"
"reflect"
"time"
"gitlink.org.cn/cloudream/common/pkgs/actor"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// adjustingJob bundles a normal job with its adjusting state and the
// computing center record looked up for it.
type adjustingJob struct {
	job    *jobmod.NormalJob
	state  *jobmod.StateAdjusting
	ccInfo schmod.ComputingCenter
}
// AdjustingHandler tracks jobs that are in the adjusting state, keyed by job ID.
// Work is submitted as closures through cmdChan and executed by Serve.
type AdjustingHandler struct {
	mgr  *Manager
	jobs map[schsdk.JobID]*adjustingJob
	cmdChan actor.CommandChannel
}
// NewAdjustingHandler creates a handler bound to mgr with an empty job table
// and a fresh command channel.
func NewAdjustingHandler(mgr *Manager) *AdjustingHandler {
	return &AdjustingHandler{
		mgr:     mgr,
		jobs:    make(map[schsdk.JobID]*adjustingJob),
		cmdChan: *actor.NewCommandChannel(),
	}
}
// Handle enqueues a command that validates the job, resolves the target
// computing center and its storage, fills in the job's target and output path,
// registers the job in h.jobs and then drives the first scheduling step.
// Any failure along the way moves the job to a failed state.
func (h *AdjustingHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		// Only a NormalJob whose state is StateAdjusting is accepted.
		norJob, ok := job.(*jobmod.NormalJob)
		if !ok {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState()))
			return
		}
		adjustingState, ok := norJob.GetState().(*jobmod.StateAdjusting)
		if !ok {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
			return
		}
		// NOTE(review): colCli is acquired and released but not otherwise used in this function.
		colCli, err := schglb.CollectorMQPool.Acquire()
		if err != nil {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.GetState()))
			return
		}
		defer schglb.CollectorMQPool.Release(colCli)
		ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), adjustingState.Scheme.TargetCCID)
		if err != nil {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.GetState()))
			return
		}
		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new cloudream storage client: %s", err.Error()), job.GetState()))
			return
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)
		stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{
			StorageID: ccInfo.CDSStorageID,
		})
		if err != nil {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting cloudream storage info: %s", err.Error()), job.GetState()))
			return
		}
		norJob.TargetCCID = adjustingState.Scheme.TargetCCID
		// TODO: UserID (the literal 1 below is a placeholder user ID)
		norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, norJob.JobID)
		adjJob := &adjustingJob{
			job:    norJob,
			state:  adjustingState,
			ccInfo: ccInfo,
		}
		h.jobs[job.GetJobID()] = adjJob
		// A nil event kicks off the first step of each file's scheduling.
		h.onJobEvent(nil, adjJob)
	})
}
// onJobEvent advances the dataset, code and image scheduling of one job in
// response to evt. A CloneJob event is answered with a clone of the job and
// nothing else happens. If any of the three scheduling steps returns an error,
// the error is recorded on that file's state and the job is failed.
func (h *AdjustingHandler) onJobEvent(evt event.Event, job *adjustingJob) {
	if cloneEvt, ok := evt.(*event.CloneJob); ok {
		cloneEvt.Callback.SetValue(job.job.Clone())
		return
	}
	err := h.doPackageScheduling(evt, job,
		job.job.Info.Files.Dataset, &job.job.Files.Dataset,
		&job.state.Scheme.Dataset, &job.state.Dataset,
	)
	if err != nil {
		job.state.Dataset.Error = err.Error()
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}
	err = h.doPackageScheduling(evt, job,
		job.job.Info.Files.Code, &job.job.Files.Code,
		&job.state.Scheme.Code, &job.state.Code,
	)
	if err != nil {
		job.state.Code.Error = err.Error()
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}
	err = h.doImageScheduling(evt, job,
		job.job.Info.Files.Image, &job.job.Files.Image,
		&job.state.Scheme.Image, &job.state.Image,
	)
	if err != nil {
		job.state.Image.Error = err.Error()
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}
	// Once all three kinds of files have finished scheduling, the job can move on to the next stage.
	if job.state.Dataset.Step == jobmod.StepCompleted &&
		job.state.Code.Step == jobmod.StepCompleted &&
		job.state.Image.Step == jobmod.StepCompleted {
		h.changeJobState(job.job, jobmod.NewStateReadyToExecute())
	}
}
// changeJobState sets the job's state, drops the job from this handler's
// table and hands it back to the manager's handleState while holding pubLock.
func (h *AdjustingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)
	delete(h.jobs, job.GetJobID())
	h.mgr.pubLock.Lock()
	h.mgr.handleState(job)
	h.mgr.pubLock.Unlock()
}
// doPackageScheduling advances the scheduling step machine of one package file
// (dataset or code). Depending on scheme.Action it either completes at once,
// starts a cache-move task, or starts a storage-load task; on later calls it
// inspects evt for the status of the task recorded in state.FullTaskID.
// It returns an error when a task cannot be started, times out, reports an
// error, or when the action is not valid for a package.
func (h *AdjustingHandler) doPackageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
	// StepBegin falls straight through to StepUploaded here.
	if state.Step == jobmod.StepBegin {
		state.Step = jobmod.StepUploaded
	}
	if state.Step == jobmod.StepUploaded {
		if scheme.Action == jobmod.ActionNo {
			state.Step = jobmod.StepCompleted
			return nil
		}
		if scheme.Action == jobmod.ActionMove {
			fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID))
			if err != nil {
				return fmt.Errorf("starting cache move package: %w", err)
			}
			state.Step = jobmod.StepMoving
			state.FullTaskID = fullTaskID
			return nil
		}
		if scheme.Action == jobmod.ActionLoad {
			fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID))
			if err != nil {
				return fmt.Errorf("starting stroage load package: %w", err)
			}
			state.Step = jobmod.StepLoading
			state.FullTaskID = fullTaskID
			return nil
		}
		return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
	}
	if state.Step == jobmod.StepMoving {
		moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
		// Events that do not concern this task are ignored.
		if err == event.ErrUnconcernedTask {
			return nil
		}
		if err == event.ErrTaskTimeout {
			return fmt.Errorf("cache move package timeout")
		}
		// A status for this task arrived, so stop tracking it.
		h.mgr.execMgr.ForgetTask(state.FullTaskID)
		if moveRet.Error != "" {
			return fmt.Errorf("cache move pacakge: %s", moveRet.Error)
		}
		state.Step = jobmod.StepCompleted
		return nil
	}
	if state.Step == jobmod.StepLoading {
		loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID)
		if err == event.ErrUnconcernedTask {
			return nil
		}
		if err == event.ErrTaskTimeout {
			return fmt.Errorf("storage load package timeout")
		}
		h.mgr.execMgr.ForgetTask(state.FullTaskID)
		if loadRet.Error != "" {
			return fmt.Errorf("storage load package: %s", loadRet.Error)
		}
		// Record where the package was loaded to.
		file.FullPath = loadRet.FullPath
		state.Step = jobmod.StepCompleted
		return nil
	}
	return nil
}
// doImageScheduling advances the scheduling step machine of the job's image.
// For ActionImportImage it first moves the image's package into the cache,
// then starts an upload-image task for the package's single object, and
// finally records the resulting PCM image in the database.
// It returns an error when a task cannot be started, times out or reports an
// error, when the image has no package, or when the action is invalid.
func (h *AdjustingHandler) doImageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
	if state.Step == jobmod.StepBegin {
		state.Step = jobmod.StepUploaded
	}
	if state.Step == jobmod.StepUploaded {
		if scheme.Action == jobmod.ActionNo {
			state.Step = jobmod.StepCompleted
			return nil
		}
		// To import an image, it must first be moved into the cache of the target node.
		if scheme.Action == jobmod.ActionImportImage {
			if file.PackageID == nil {
				return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID)
			}
			fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID))
			if err != nil {
				return fmt.Errorf("starting cache move package: %w", err)
			}
			state.Step = jobmod.StepMoving
			state.FullTaskID = fullTaskID
			return nil
		}
		return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
	}
	if state.Step == jobmod.StepMoving {
		cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
		// Events that do not concern this task are ignored.
		if err == event.ErrUnconcernedTask {
			return nil
		}
		if err == event.ErrTaskTimeout {
			return fmt.Errorf("cache move package timeout")
		}
		h.mgr.execMgr.ForgetTask(state.FullTaskID)
		if cacheMoveRet.Error != "" {
			return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error)
		}
		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)
		pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}
		// The package to import must contain exactly one object.
		if len(pkgObjs.Objects) != 1 {
			return fmt.Errorf("there must be only 1 object in the package that will be imported")
		}
		fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
		if err != nil {
			return fmt.Errorf("starting import image: %w", err)
		}
		state.Step = jobmod.StepImageImporting
		state.FullTaskID = fullTaskID
		return nil
	}
	if state.Step == jobmod.StepImageImporting {
		uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID)
		if err == event.ErrUnconcernedTask {
			return nil
		}
		if err == event.ErrTaskTimeout {
			return fmt.Errorf("import image timeout")
		}
		h.mgr.execMgr.ForgetTask(state.FullTaskID)
		if uploadImageRet.Error != "" {
			return fmt.Errorf("import image: %s", uploadImageRet.Error)
		}
		// The image is never replaced during adjusting, so ImageID does not change.
		err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now())
		if err != nil {
			return fmt.Errorf("creating pcm image info: %w", err)
		}
		state.Step = jobmod.StepCompleted
		return nil
	}
	return nil
}
// OnEvent queues delivery of evt to the jobs selected by broadcast: every
// tracked job, the jobs of one job set, or a single job by ID.
func (h *AdjustingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		switch {
		case broadcast.ToAll():
			for _, tracked := range h.jobs {
				h.onJobEvent(evt, tracked)
			}

		case broadcast.ToJobSet():
			for _, tracked := range h.jobs {
				if tracked.job.GetJobSetID() == broadcast.JobSetID {
					h.onJobEvent(evt, tracked)
				}
			}

		case broadcast.ToJob():
			tracked, found := h.jobs[broadcast.JobID]
			if found {
				h.onJobEvent(evt, tracked)
			}
		}
	})
}
// Serve receives queued commands from the command channel and runs them one
// at a time, forever.
func (h *AdjustingHandler) Serve() {
	commands := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		run := <-commands
		run()
	}
}
// Stop is currently a no-op.
func (h *AdjustingHandler) Stop() {
	// TODO: support STOP
}

View File

@ -1,63 +0,0 @@
package jobmgr
import (
"fmt"
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/logger"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// CompleteHandler handles jobs that have reached a success or failed state.
type CompleteHandler struct {
	mgr *Manager
}
// NewCompleteHandler creates a CompleteHandler bound to mgr.
func NewCompleteHandler(mgr *Manager) *CompleteHandler {
	return &CompleteHandler{
		mgr: mgr,
	}
}
// Handle dispatches a finished job by its state: success and failed states go
// to their respective handlers; any other state is converted into a failed
// state first and then handled as a failure.
func (h *CompleteHandler) Handle(job jobmod.Job) {
	// TODO: consider persisting the execution record to the database
	switch cur := job.GetState().(type) {
	case *jobmod.StateSuccess:
		h.handleSuccess(job, cur)

	case *jobmod.StateFailed:
		h.handleFailed(job, cur)

	default:
		reason := fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState()))
		failed := jobmod.NewStateFailed(reason, job.GetState())
		job.SetState(failed)
		h.handleFailed(job, failed)
	}
}
// handleSuccess logs the successful completion and publishes a JobCompleted
// event to the job's job set.
func (h *CompleteHandler) handleSuccess(job jobmod.Job, state *jobmod.StateSuccess) {
	logger.WithField("JobID", job.GetJobID()).Infof("job completed successfuly")
	h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job))
}
// handleFailed logs the failure together with the type of the state the job
// was in before failing, then publishes a JobCompleted event to the job's job set.
func (h *CompleteHandler) handleFailed(job jobmod.Job, state *jobmod.StateFailed) {
	logger.
		WithField("JobID", job.GetJobID()).
		WithField("LastState", reflect.TypeOf(state.LastState).String()).
		Infof("job failed with: %v", state.Error)
	h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job))
}
// OnEvent answers CloneJob events with a "job not found" error; this handler
// keeps no jobs to clone. All other events are ignored.
func (h *CompleteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	if cloneEvt, ok := evt.(*event.CloneJob); ok {
		cloneEvt.Callback.SetError(fmt.Errorf("job not found"))
		return
	}
}
// Serve is a no-op for this handler.
func (h *CompleteHandler) Serve() {
}
// Stop is a no-op for this handler.
func (h *CompleteHandler) Stop() {
}

View File

@ -1,50 +0,0 @@
package jobmgr
import (
"gitlink.org.cn/cloudream/common/pkgs/logger"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// DefaultHandler fails any job it is asked to handle; see Handle.
type DefaultHandler struct {
	mgr *Manager
}
// NewDefaultHandler creates a DefaultHandler bound to mgr.
func NewDefaultHandler(mgr *Manager) *DefaultHandler {
	return &DefaultHandler{
		mgr: mgr,
	}
}
// Handle processes a job. The global lock is held for the duration of the call.
// A job with a nil state, or with any state other than StateFailed, is moved
// to a failed state and passed back to the manager. A job that is already
// failed only produces a warning.
func (h *DefaultHandler) Handle(job jobmod.Job) {
	switch cur := job.GetState().(type) {
	case nil:
		job.SetState(jobmod.NewStateFailed("unexpected nil state", nil))
		h.mgr.handleState(job)

	case *jobmod.StateFailed:
		logger.Warnf("state failed should not be handled by default handler")

	default:
		job.SetState(jobmod.NewStateFailed("no handler for this state", cur))
		h.mgr.handleState(job)
	}
}
// OnEvent is called when an external event occurs. This handler ignores all events.
func (h *DefaultHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
}
// Serve runs the handler. This handler has nothing to run.
func (h *DefaultHandler) Serve() {
}
// Stop stops the handler. This handler has nothing to stop.
func (h *DefaultHandler) Stop() {
}

View File

@ -1,4 +1,4 @@
package event package jobmgr
import ( import (
"errors" "errors"
@ -11,6 +11,8 @@ var ErrUnconcernedTask = errors.New("unconcerned task")
var ErrTaskTimeout = errors.New("task timeout") var ErrTaskTimeout = errors.New("task timeout")
var ErrJobCancelled = errors.New("job cancelled")
type Event interface{} type Event interface{}
type BroadcastType string type BroadcastType string

View File

@ -1,12 +0,0 @@
package event
// AdvisorTaskTimeout reports that a task running on an advisor timed out.
type AdvisorTaskTimeout struct {
	FullTaskID string
}
// NewAdvisorTaskTimeout creates a timeout event for the given task.
func NewAdvisorTaskTimeout(fullTaskID string) *AdvisorTaskTimeout {
	return &AdvisorTaskTimeout{
		FullTaskID: fullTaskID,
	}
}

View File

@ -1,46 +0,0 @@
package event
import advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
// AdvisorTaskUpdated carries a task progress report sent by an advisor.
type AdvisorTaskUpdated struct {
	FullTaskID string
	TaskStatus advtsk.TaskStatus
}
// NewAdvisorTaskUpdated creates a progress event for the given task and status.
func NewAdvisorTaskUpdated(fullTaskID string, taskStatus advtsk.TaskStatus) *AdvisorTaskUpdated {
	return &AdvisorTaskUpdated{
		FullTaskID: fullTaskID,
		TaskStatus: taskStatus,
	}
}
// AssertAdvisorTaskStatus extracts a status of type T for the task fullTaskID
// from evt.
//
// It returns the status and a nil error when evt is an AdvisorTaskUpdated for
// that task whose status has type T, ErrTaskTimeout when evt is an
// AdvisorTaskTimeout for that task, and ErrUnconcernedTask in every other
// case (including a nil evt).
func AssertAdvisorTaskStatus[T advtsk.TaskStatus](evt Event, fullTaskID string) (T, error) {
	var zero T

	switch e := evt.(type) {
	case *AdvisorTaskUpdated:
		if e.FullTaskID != fullTaskID {
			return zero, ErrUnconcernedTask
		}
		if typed, matches := e.TaskStatus.(T); matches {
			return typed, nil
		}
		return zero, ErrUnconcernedTask

	case *AdvisorTaskTimeout:
		if e.FullTaskID != fullTaskID {
			return zero, ErrUnconcernedTask
		}
		return zero, ErrTaskTimeout

	default:
		return zero, ErrUnconcernedTask
	}
}

View File

@ -0,0 +1,4 @@
package event
// Cancel is an event type with no fields.
// NOTE(review): presumably posted to request job cancellation; confirm against its users.
type Cancel struct {
}

View File

@ -1,14 +0,0 @@
package event
import (
"gitlink.org.cn/cloudream/common/pkgs/future"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// CloneJob is an event whose Callback future receives a job value, or an
// error, from whichever handler answers it.
type CloneJob struct {
	Callback future.SetValueFuture[jobmod.Job]
}
// NewCloneJob creates a CloneJob event with a zero-value Callback.
func NewCloneJob() *CloneJob {
	return &CloneJob{}
}

View File

@ -1,12 +0,0 @@
package event
// ExecutorTaskTimeout reports that a task running on an executor timed out.
type ExecutorTaskTimeout struct {
	FullTaskID string
}
// NewExecutorTaskTimeout creates a timeout event for the given task.
func NewExecutorTaskTimeout(fullTaskID string) *ExecutorTaskTimeout {
	return &ExecutorTaskTimeout{
		FullTaskID: fullTaskID,
	}
}

View File

@ -1,48 +0,0 @@
package event
import (
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
)
// ExecutorTaskUpdated carries a task progress report sent by an executor.
type ExecutorTaskUpdated struct {
	FullTaskID string
	TaskStatus exectsk.TaskStatus
}
// NewExecutorTaskUpdated creates a progress event for the given task and status.
func NewExecutorTaskUpdated(fullTaskID string, taskStatus exectsk.TaskStatus) *ExecutorTaskUpdated {
	return &ExecutorTaskUpdated{
		FullTaskID: fullTaskID,
		TaskStatus: taskStatus,
	}
}
// AssertExecutorTaskStatus extracts a status of type T for the task fullTaskID
// from evt.
//
// It returns the status and a nil error when evt is an ExecutorTaskUpdated for
// that task whose status has type T, ErrTaskTimeout when evt is an
// ExecutorTaskTimeout for that task, and ErrUnconcernedTask in every other
// case (including a nil evt).
func AssertExecutorTaskStatus[T exectsk.TaskStatus](evt Event, fullTaskID string) (T, error) {
	var zero T

	switch e := evt.(type) {
	case *ExecutorTaskUpdated:
		if e.FullTaskID != fullTaskID {
			return zero, ErrUnconcernedTask
		}
		if typed, matches := e.TaskStatus.(T); matches {
			return typed, nil
		}
		return zero, ErrUnconcernedTask

	case *ExecutorTaskTimeout:
		if e.FullTaskID != fullTaskID {
			return zero, ErrUnconcernedTask
		}
		return zero, ErrTaskTimeout

	default:
		return zero, ErrUnconcernedTask
	}
}

View File

@ -1,16 +1,18 @@
package event package event
import ( import (
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
) )
// 任务结束,包括成功或者失败 // 任务结束,包括成功或者失败
type JobCompleted struct { type JobCompleted struct {
Job jobmod.Job Job *jobmgr.Job
Err error
} }
func NewJobCompleted(job jobmod.Job) *JobCompleted { func NewJobCompleted(job *jobmgr.Job, err error) *JobCompleted {
return &JobCompleted{ return &JobCompleted{
Job: job, Job: job,
Err: err,
} }
} }

View File

@ -1,21 +1,18 @@
package event package event
import ( import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
) )
// 本地文件上传结束 // 本地文件上传结束
type LocalFileUploaded struct { type LocalFileUploaded struct {
JobSetID schsdk.JobSetID
LocalPath string LocalPath string
Error string Error error
PackageID cdssdk.PackageID PackageID cdssdk.PackageID
} }
func NewLocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) *LocalFileUploaded { func NewLocalFileUploaded(localPath string, err error, packageID cdssdk.PackageID) *LocalFileUploaded {
return &LocalFileUploaded{ return &LocalFileUploaded{
JobSetID: jobSetID,
LocalPath: localPath, LocalPath: localPath,
Error: err, Error: err,
PackageID: packageID, PackageID: packageID,

View File

@ -0,0 +1,27 @@
package event
import (
"context"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
)
// WaitType waits on set for an event whose dynamic type is T.
//
// It returns the event and true when one is obtained. When the wait does not
// produce an event (set.Wait reports false, e.g. because ctx ended), it
// returns the zero value of T and false.
func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, bool) {
	ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
		_, ok := evt.(T)
		return ok
	})
	if !ok {
		// ret is nil here; a single-value type assertion on it would panic.
		var zero T
		return zero, false
	}
	return ret.(T), true
}
// WaitTypeAnd waits on set for an event whose dynamic type is T and for which
// cond returns true.
//
// It returns the event and true when one is obtained. When the wait does not
// produce an event (set.Wait reports false, e.g. because ctx ended), it
// returns the zero value of T and false.
func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond func(val T) bool) (T, bool) {
	ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
		e, ok := evt.(T)
		if !ok {
			return false
		}
		return cond(e)
	})
	if !ok {
		// ret is nil here; a single-value type assertion on it would panic.
		var zero T
		return zero, false
	}
	return ret.(T), true
}

View File

@ -0,0 +1,72 @@
package jobmgr
import (
"context"
"sync"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/utils/lo2"
)
// EventWaitCondition reports whether evt is the event a waiter is looking for.
type EventWaitCondition func(evt Event) bool
// EventWaiter is a pending Wait call: the condition it is waiting on and the
// future through which a matching event is delivered.
type EventWaiter struct {
	condition EventWaitCondition
	future    *future.SetValueFuture[Event]
}
// EventSet holds events that have been posted but not yet consumed, plus the
// waiters that are still pending. Both lists are guarded by lock.
type EventSet struct {
	events  []Event
	waiters []EventWaiter
	lock    sync.Mutex
}
// NewEventSet returns an empty EventSet.
// The struct contains a sync.Mutex, so it should not be copied once in use.
func NewEventSet() EventSet {
	return EventSet{}
}
// Post delivers evt to every pending waiter whose condition accepts it and
// removes those waiters. If no waiter accepts the event, it is buffered so a
// later Wait can pick it up.
func (s *EventSet) Post(evt Event) {
	s.lock.Lock()
	defer s.lock.Unlock()

	// A single event can wake multiple waiters.
	//
	// Waiters are filtered into the same backing array instead of being
	// removed by index while ranging: removing during the range shifts the
	// remaining elements and makes later indices stale.
	used := false
	kept := s.waiters[:0]
	for _, waiter := range s.waiters {
		if waiter.condition(evt) {
			waiter.future.SetValue(evt)
			used = true
			continue
		}
		kept = append(kept, waiter)
	}
	// Zero the unused tail so fulfilled waiters are not kept alive by the array.
	for i := len(kept); i < len(s.waiters); i++ {
		s.waiters[i] = EventWaiter{}
	}
	s.waiters = kept

	if !used {
		s.events = append(s.events, evt)
	}
}
// Wait returns the first buffered event accepted by cond, removing it from the
// buffer. If none is buffered, it registers a waiter and blocks until Post
// delivers a matching event or ctx ends.
//
// It returns (event, true) on success and (nil, false) when the wait ended
// without an event.
func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) {
	s.lock.Lock()

	// A waiter consumes only one event.
	for i, evt := range s.events {
		if cond(evt) {
			s.events = lo2.RemoveAt(s.events, i)
			s.lock.Unlock()
			return evt, true
		}
	}

	fut := future.NewSetValue[Event]()
	// Register in the waiter list (not the event buffer) so Post can find it.
	s.waiters = append(s.waiters, EventWaiter{
		condition: cond,
		future:    fut,
	})
	// The lock must be released before blocking; Post needs it to deliver.
	s.lock.Unlock()

	val, err := fut.WaitValue(ctx)
	if err == nil {
		return val, true
	}

	// The wait failed. Unregister the waiter so Post does not hand a later
	// event to a future nobody is listening on.
	s.lock.Lock()
	stillPending := false
	for i, w := range s.waiters {
		if w.future == fut {
			s.waiters = lo2.RemoveAt(s.waiters, i)
			stillPending = true
			break
		}
	}
	s.lock.Unlock()

	if stillPending {
		return nil, false
	}

	// Post removed the waiter concurrently, which means it has already set the
	// value on the future; take it rather than dropping the event.
	// NOTE(review): assumes WaitValue returns at once for an already-set future — confirm.
	val, err = fut.WaitValue(context.Background())
	if err != nil {
		return nil, false
	}
	return val, true
}

View File

@ -1,264 +0,0 @@
package jobmgr
import (
"fmt"
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/actor"
"gitlink.org.cn/cloudream/common/pkgs/logger"
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// executingJob pairs a job with its executing state.
type executingJob struct {
	job   jobmod.Job
	state *jobmod.StateExecuting
}
// ExecutingHandler tracks jobs that are in the executing state, keyed by job ID.
// Work is submitted as closures through cmdChan and executed by Serve.
type ExecutingHandler struct {
	mgr  *Manager
	jobs map[schsdk.JobID]*executingJob
	cmdChan actor.CommandChannel
}
// NewExecutingHandler creates a handler bound to mgr with an empty job table
// and a fresh command channel.
func NewExecutingHandler(mgr *Manager) *ExecutingHandler {
	return &ExecutingHandler{
		mgr:     mgr,
		jobs:    make(map[schsdk.JobID]*executingJob),
		cmdChan: *actor.NewCommandChannel(),
	}
}
// Handle enqueues a command that registers the job with this handler and
// drives its first step. A job whose state is not StateExecuting is failed.
func (h *ExecutingHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		state, ok := job.GetState().(*jobmod.StateExecuting)
		if !ok {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
			return
		}
		rjob := &executingJob{
			job:   job,
			state: state,
		}
		h.jobs[job.GetJobID()] = rjob
		// A nil event triggers the initial task submission.
		h.onJobEvent(nil, rjob)
	})
}
// onJobEvent answers CloneJob events with a clone of the job; any other event
// is routed by the concrete job type to the normal-job or resource-job logic.
// Jobs of other types are ignored.
func (h *ExecutingHandler) onJobEvent(evt event.Event, job *executingJob) {
	if clone, isClone := evt.(*event.CloneJob); isClone {
		clone.Callback.SetValue(job.job.Clone())
		return
	}

	switch body := job.job.(type) {
	case *jobmod.NormalJob:
		h.onNormalJobEvent(evt, job, body)
	case *jobmod.ResourceJob:
		h.onResourceJobEvent(evt, job, body)
	}
}
// onNormalJobEvent drives a normal job through execution. If no task has been
// started yet it looks up the PCM image, computing center and resources for
// the job's target center and submits a task. It then inspects evt for a
// status of that task and moves the job to success or failed accordingly.
func (h *ExecutingHandler) onNormalJobEvent(evt event.Event, job *executingJob, norJob *jobmod.NormalJob) {
	// An empty FullTaskID means the task has not been submitted yet.
	if job.state.FullTaskID == "" {
		pcmImgInfo, err := h.mgr.db.PCMImage().GetByImageIDAndCCID(h.mgr.db.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed("getting pcm image info: "+err.Error(), job.state))
			return
		}
		ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state))
			return
		}
		// TODO: add environment variables such as DATA_IN and DATA_OUT, taken from the job's info
		ress, err := h.mgr.db.CCResource().GetByCCID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center resource info: %s", err.Error()), job.state))
			return
		}
		if len(ress) == 0 {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("there is no resource at computing center %v", norJob.TargetCCID), job.state))
			return
		}
		fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(),
			exetsk.NewSubmitTask(
				ccInfo.PCMParticipantID,
				pcmImgInfo.PCMImageID,
				// TODO: algorithm for choosing a resource (the first one is used for now)
				ress[0].PCMResourceID,
				norJob.Info.Runtime.Command,
				norJob.Info.Runtime.Envs,
			))
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
			return
		}
		job.state.FullTaskID = fullTaskID
	}
	// Anything other than ErrUnconcernedTask means evt is about this job's task.
	if execRet, err := event.AssertExecutorTaskStatus[*exetsk.SubmitTaskStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
		if err == event.ErrTaskTimeout {
			h.changeJobState(job.job, jobmod.NewStateFailed("schedule task timeout", job.state))
			return
		}
		logger.WithField("JobID", job.job.GetJobID()).
			WithField("State", reflect.TypeOf(job.state).String()).
			Infof("pcm task state change to: %s", execRet.Status)
		// Only terminal statuses end the job; other statuses leave it executing.
		if execRet.Status == pcmsdk.TaskStatusSuccess {
			h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
			h.changeJobState(job.job, jobmod.NewStateSuccess())
		} else if execRet.Status == pcmsdk.TaskStatuFailed {
			h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
			h.changeJobState(job.job, jobmod.NewStateFailed(execRet.Error, job.state))
		}
	}
}
// onResourceJobEvent drives a resource job through execution. If no task has
// been started yet it resolves the target normal job inside the same job set,
// looks up that job's computing center and starts a storage-create-package
// task for the target job's output path. It then inspects evt for the status
// of that task, records the created package ID and finishes the job.
func (h *ExecutingHandler) onResourceJobEvent(evt event.Event, job *executingJob, resJob *jobmod.ResourceJob) {
	// An empty FullTaskID means the task has not been submitted yet.
	if job.state.FullTaskID == "" {
		// pubLock guards the manager's jobSets and jobs tables; it is released
		// manually on every path below before changeJobState is called, since
		// changeJobState takes pubLock itself.
		h.mgr.pubLock.Lock()
		jobSet, ok := h.mgr.jobSets[resJob.GetJobSetID()]
		if !ok {
			h.mgr.pubLock.Unlock()
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", resJob.GetJobSetID()), job.state))
			return
		}
		ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID)
		if ref == nil {
			h.mgr.pubLock.Unlock()
			h.changeJobState(job.job, jobmod.NewStateFailed(
				fmt.Sprintf("job %s not found in job set %s",
					resJob.Info.TargetLocalJobID,
					resJob.GetJobSetID()),
				job.state,
			))
			return
		}
		targetJob, ok := h.mgr.jobs[ref.JobID]
		h.mgr.pubLock.Unlock()
		if !ok {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state))
			return
		}
		tarNorJob, ok := targetJob.Job.(*jobmod.NormalJob)
		if !ok {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job(%v) %s is not a Normal job", reflect.TypeOf(targetJob), ref.JobID), job.state))
			return
		}
		// NOTE(review): colCli is acquired and released but not otherwise used in this function.
		colCli, err := schglb.CollectorMQPool.Acquire()
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.state))
			return
		}
		defer schglb.CollectorMQPool.Release(colCli)
		ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), tarNorJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state))
			return
		}
		fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(), exetsk.NewStorageCreatePackage(
			1, // TODO: user ID
			ccInfo.CDSStorageID,
			tarNorJob.OutputFullPath,
			resJob.Info.BucketID,
			utils.MakeResourcePackageName(resJob.JobID),
		))
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
			return
		}
		job.state.FullTaskID = fullTaskID
	}
	// Anything other than ErrUnconcernedTask means evt is about this job's task.
	if createRet, err := event.AssertExecutorTaskStatus[*exetsk.StorageCreatePackageStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
		if err == event.ErrTaskTimeout {
			h.changeJobState(job.job, jobmod.NewStateFailed("storage create package timeout", job.state))
			return
		}
		h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
		if createRet.Error != "" {
			h.changeJobState(job.job, jobmod.NewStateFailed(createRet.Error, job.state))
			return
		}
		resJob.ResourcePackageID = createRet.PackageID
		h.changeJobState(job.job, jobmod.NewStateSuccess())
	}
}
// changeJobState sets the job's state, drops the job from this handler's
// table and hands it back to the manager's handleState while holding pubLock.
func (h *ExecutingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)
	delete(h.jobs, job.GetJobID())
	h.mgr.pubLock.Lock()
	h.mgr.handleState(job)
	h.mgr.pubLock.Unlock()
}
// OnEvent queues delivery of evt to the jobs selected by broadcast: every
// tracked job, the jobs of one job set, or a single job by ID.
func (h *ExecutingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		switch {
		case broadcast.ToAll():
			for _, tracked := range h.jobs {
				h.onJobEvent(evt, tracked)
			}

		case broadcast.ToJobSet():
			for _, tracked := range h.jobs {
				if tracked.job.GetJobSetID() == broadcast.JobSetID {
					h.onJobEvent(evt, tracked)
				}
			}

		case broadcast.ToJob():
			tracked, found := h.jobs[broadcast.JobID]
			if found {
				h.onJobEvent(evt, tracked)
			}
		}
	})
}
// Serve receives queued commands from the command channel and runs them one
// at a time, forever.
func (h *ExecutingHandler) Serve() {
	commands := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		run := <-commands
		run()
	}
}
// Stop is currently a no-op.
func (h *ExecutingHandler) Stop() {
	// TODO: support STOP
}

View File

@ -0,0 +1,88 @@
package jobmgr
import (
"github.com/samber/lo"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// FileScheduleAction names the operation needed to make a file available.
type FileScheduleAction string
// File scheduling actions.
const (
	ActionNo          FileScheduleAction = "No"          // no operation needed
	ActionMove        FileScheduleAction = "Move"        // a cache must be created on the designated node
	ActionLoad        FileScheduleAction = "Load"        // the file must be loaded into Storage
	ActionImportImage FileScheduleAction = "ImportImage" // the image must be imported
)
// FileScheduleScheme is the scheduling scheme for a single file.
type FileScheduleScheme struct {
	Action FileScheduleAction `json:"action"`
}
// JobScheduleScheme is the scheduling scheme for a job: the target computing
// center and the action for each of its dataset, code and image files.
type JobScheduleScheme struct {
	TargetCCID schsdk.CCID        `json:"targetCCID"`
	Dataset    FileScheduleScheme `json:"dataset"`
	Code       FileScheduleScheme `json:"code"`
	Image      FileScheduleScheme `json:"image"`
}
// JobSetPreScheduleScheme is the pre-scheduling scheme of a job set.
type JobSetPreScheduleScheme struct {
	JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // pre-scheduling scheme per job; the key is the LocalJobID
}
// JobSet is a set of jobs.
type JobSet struct {
	JobSetID          schsdk.JobSetID         `json:"jobSetID"` // globally unique job set ID
	JobRefs           []JobSetJobRef          `json:"jobRefs"`  // jobs contained in the set, held only as references
	PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
}
// JobSetJobRef references a job from within a job set.
type JobSetJobRef struct {
	JobID      schsdk.JobID `json:"jobID"`      // job ID
	LocalJobID string       `json:"localJobID"` // ID of the job within the current job set
}
// NewJobSet creates a JobSet from the given ID, job references and
// pre-scheduling scheme.
func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet {
	return &JobSet{
		JobSetID:          jobSetID,
		JobRefs:           jobRefs,
		PreScheduleScheme: preScheduleScheme,
	}
}
// FindRefByLocalJobID looks up the first job reference whose LocalJobID equals
// localJobID. It returns a pointer to a copy of that reference, or nil when
// the set has no such job.
func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef {
	sameLocalID := func(candidate JobSetJobRef) bool {
		return candidate.LocalJobID == localJobID
	}

	if match, found := lo.Find(j.JobRefs, sameLocalID); found {
		return &match
	}
	return nil
}
// 任务
type Job struct {
JobSetID schsdk.JobSetID // 任务集ID
JobID schsdk.JobID // 全局唯一任务ID
Body JobBody // 具体任务
}
func (j *Job) GetInfo() schsdk.JobInfo {
return j.Body.GetInfo()
}
func (j *Job) Dump(ctx JobStateRunContext, job *Job, curState JobState) jobmod.JobStatus {
return jobmod.JobStatus{
JobID: j.JobID,
JobSetID: j.JobSetID,
Info: j.GetInfo(),
Body: job.Body.Dump(),
State: curState.Dump(ctx, job),
}
}
// JobBody is the concrete part of a Job.
type JobBody interface {
	// GetInfo returns the job info of the body.
	GetInfo() schsdk.JobInfo
	// Dump returns the status of the body.
	Dump() jobmod.JobBodyStatus
}

View File

@ -0,0 +1,30 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// DataReturnJob is the body of a data-return job.
type DataReturnJob struct {
	Info schsdk.DataReturnJobInfo
	// TargetJobCCID is the ID of the computing center where the target job is located.
	TargetJobCCID schsdk.CCID
	// TargetJobOutputFullPath is the full output path of the target job's result.
	TargetJobOutputFullPath string
	// DataReturnPackageID is the PackageID obtained after the data return.
	DataReturnPackageID cdssdk.PackageID
}
// NewResourceJob creates a DataReturnJob that carries the given info; every
// other field is left at its zero value.
func NewResourceJob(info schsdk.DataReturnJobInfo) *DataReturnJob {
	created := new(DataReturnJob)
	created.Info = info
	return created
}
// GetInfo returns a pointer to the job's own Info field.
func (j *DataReturnJob) GetInfo() schsdk.JobInfo {
	info := &j.Info
	return info
}
// Dump returns a DataReturnJobStatus value holding the job's DataReturnPackageID.
func (j *DataReturnJob) Dump() jobmod.JobBodyStatus {
	var status jobmod.DataReturnJobStatus
	status.DataReturnPackageID = j.DataReturnPackageID
	return status
}

View File

@ -0,0 +1,30 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// NormalJob is the body of a normal job.
type NormalJob struct {
	// Info is the job description supplied when the job was submitted.
	Info schsdk.NormalJobInfo
	// Files are the files the job needs.
	Files jobmod.JobFiles
	// TargetCCID is the ID of the computing center that will run this job.
	TargetCCID schsdk.CCID
	// OutputFullPath is the full output path of the program's result.
	OutputFullPath string
}
// NewNormalJob creates a NormalJob that carries the given info; every other
// field is left at its zero value.
func NewNormalJob(info schsdk.NormalJobInfo) *NormalJob {
	created := new(NormalJob)
	created.Info = info
	return created
}
// GetInfo returns a pointer to the job's own Info field.
func (j *NormalJob) GetInfo() schsdk.JobInfo {
	info := &j.Info
	return info
}
// Dump returns a pointer to a NormalJobStatus holding the job's Files and
// TargetCCID.
func (j *NormalJob) Dump() jobmod.JobBodyStatus {
	status := new(jobmod.NormalJobStatus)
	status.Files = j.Files
	status.TargetCCID = j.TargetCCID
	return status
}

View File

@ -0,0 +1,271 @@
package state
import (
"context"
"errors"
"fmt"
"sync"
"time"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// Adjusting is the job state that applies an adjust scheme to a normal job.
type Adjusting struct {
	// scheme is the schedule scheme to apply.
	scheme jobmod.JobScheduleScheme
	// targetCCInfo is the computing center named by scheme.TargetCCID; it is
	// filled in while the state runs.
	targetCCInfo schmod.ComputingCenter
}
// NewAdjusting creates an Adjusting state that will apply the given scheme.
func NewAdjusting(scheme jobmod.JobScheduleScheme) *Adjusting {
	state := new(Adjusting)
	state.scheme = scheme
	return state
}
// Run performs the adjustment, then moves the job to the failure state when
// it returned an error and to NormalJobReadyToExecute otherwise.
func (s *Adjusting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if failure := s.do(rtx, jo); failure != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(failure))
		return
	}

	rtx.Mgr.ChangeState(jo, NewNormalJobReadyToExecute())
}
// Dump is not implemented yet and always returns nil.
func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}
// do applies the adjust scheme to the job.
//
// It looks up the target computing center, derives the job's output path
// from that center's storage directory, and then schedules the dataset, code
// and image files concurrently. A Cancel event on rtx.EventSet, or a failure
// of any of the three schedulings, cancels the shared context so that the
// remaining work stops early. The returned error joins the errors of the
// three schedulings and is nil only when all of them succeeded.
func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	norJob := jo.Body.(*job.NormalJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for the cancel event.
	go func() {
		event.WaitType[event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}
	s.targetCCInfo = ccInfo

	stgCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(stgCli)

	// The final target computing center is known now, so the result output
	// path can be generated.
	stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{
		StorageID: ccInfo.CDSStorageID,
	})
	if err != nil {
		return fmt.Errorf("getting cds storage info: %w", err)
	}
	// TODO UserID
	norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID)

	wg := sync.WaitGroup{}
	wg.Add(3)

	var e1, e2, e3 error
	go func() {
		defer wg.Done()
		e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset)
		if e1 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code)
		if e2 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image)
		if e3 != nil {
			cancel()
		}
	}()

	// The goroutines write e1..e3, so all of them must have finished before
	// the errors are read. Returning earlier would also run the deferred
	// cancel and abort the schedulings that are still in progress.
	wg.Wait()

	return errors.Join(e1, e2, e3)
}
// doPackageScheduling resolves the package behind one job file (dataset or
// code) and applies the scheme's action to it.
//
// Resolving the package depends on the kind of fileInfo:
//   - local file: wait for the LocalFileUploaded event with the same local
//     path and take the PackageID from it;
//   - package: take the PackageID from the info;
//   - resource: nothing to do, return nil immediately;
//   - anything else: return an error.
//
// Afterwards ActionMove starts a cache-move task and ActionLoad a
// storage-load task, each waiting for the task's status. Any other action
// needs no work. An error is returned when the upload event never arrives or
// carries an error, when receiving a task status fails, or when the task
// reports an error.
func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		file.PackageID = evt.PackageID

	case *schsdk.PackageJobFileInfo:
		file.PackageID = info.PackageID

	case *schsdk.ResourceJobFileInfo:
		return nil

	default:
		return fmt.Errorf("unknown dataset type: %T", info)
	}

	if scheme.Action == jobmod.ActionMove {
		// TODO UserID
		wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID))
		defer wt.Close()

		status, err := wt.Receive(ctx)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		moveStatus := status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		return nil
	}

	if scheme.Action == jobmod.ActionLoad {
		// TODO UserID
		wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID))
		defer wt.Close()

		status, err := wt.Receive(ctx)
		if err != nil {
			return fmt.Errorf("loading package: %w", err)
		}

		// The task started above is a StorageLoadPackage, so its status is
		// asserted as the matching status type; asserting the cache-move
		// status type here would panic.
		loadStatus := status.(*exectsk.StorageLoadPackageStatus)
		if loadStatus.Error != "" {
			return fmt.Errorf("loading package: %s", loadStatus.Error)
		}

		return nil
	}

	return nil
}
// doImageScheduling resolves the image behind the job's image file and, when
// the scheme asks for ActionImportImage, imports it into the target
// computing center.
//
// For a local file it waits for the matching LocalFileUploaded event and
// creates a new image record for the uploaded package; for an image info it
// loads the existing image record. Any other kind of fileInfo leaves file
// untouched.
//
// Importing moves the image's package to the target node, requires the
// package to contain exactly one object, uploads that object as an image and
// records the resulting PCM image. Every failing step is returned as an
// error.
func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		// The upload has finished, so a new, empty image record can be created.
		// TODO image name
		// Unix() returns an int64, so it must be formatted with %d; %s would
		// yield "UPLOAD@%!s(int64=...)".
		imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%d", time.Now().Unix()), time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		// Fill in ImageID and PackageID.
		file.ImageID = imgID
		file.PackageID = &evt.PackageID

	case *schsdk.ImageJobFileInfo:
		imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
		if err != nil {
			return fmt.Errorf("getting image info: %w", err)
		}

		file.ImageID = imageInfo.ImageID
		file.PackageID = imageInfo.CDSPackageID
	}

	if scheme.Action == jobmod.ActionImportImage {
		if file.PackageID == nil {
			return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		}

		// TODO UserID
		wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID))
		defer wt.Close()

		status, err := wt.Receive(ctx)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		moveStatus := status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		// TODO UserID
		pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}

		if len(pkgObjs.Objects) == 0 {
			return fmt.Errorf("no object in the package which will be imported")
		}

		if len(pkgObjs.Objects) > 1 {
			return fmt.Errorf("there must be only 1 object in the package which will be imported")
		}

		wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
		defer wt2.Close()

		status2, err := wt2.Receive(ctx)
		if err != nil {
			return fmt.Errorf("uploading image: %w", err)
		}

		uploadStatus := status2.(*exectsk.UploadImageStatus)
		if uploadStatus.Error != "" {
			return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		}

		// TODO image name
		// NOTE(review): the record is keyed by job.TargetCCID while the image
		// was uploaded to s.targetCCInfo — confirm both always name the same
		// computing center.
		err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, job.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		return nil
	}

	return nil
}

View File

@ -0,0 +1,48 @@
package state
import (
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/logger"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// Completed is the final state of a job.
type Completed struct {
	// err is nil for a successful job and holds the cause for a failed one.
	err error
}
// SuccessComplete creates a Completed state that carries no error.
func SuccessComplete() *Completed {
	return new(Completed)
}
// FailureComplete creates a Completed state that carries err.
func FailureComplete(err error) *Completed {
	failed := new(Completed)
	failed.err = err
	return failed
}
// Run dispatches to the success or the failure handler depending on whether
// the state carries an error.
func (c *Completed) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO consider persisting the execution record to the database
	switch c.err {
	case nil:
		c.handleSuccess(rtx, jo)
	default:
		c.handleFailed(rtx, jo)
	}
}
// Dump is not implemented yet and always returns nil.
func (c *Completed) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}
// handleSuccess logs the completion and broadcasts a JobCompleted event to
// the job's job set.
func (c *Completed) handleSuccess(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	log := logger.WithField("JobID", job.JobID)
	log.Infof("job completed successfuly")

	completed := event.NewJobCompleted(job, c.err)
	rtx.Mgr.BroadcastEvent(job.JobSetID, completed)
}
// handleFailed logs the failure together with the type of the state the job
// was in before, then broadcasts a JobCompleted event carrying the error to
// the job's job set.
func (c *Completed) handleFailed(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	// reflect.TypeOf returns nil for a nil interface, and calling String on
	// that result panics. LastState is nil when this is the job's first
	// state, so guard against it.
	lastState := "<nil>"
	if rtx.LastState != nil {
		lastState = reflect.TypeOf(rtx.LastState).String()
	}

	logger.
		WithField("JobID", job.JobID).
		WithField("LastState", lastState).
		Infof("job failed with: %v", c.err)

	rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
}

View File

@ -0,0 +1,154 @@
package state
import (
"context"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// NormalJobExecuting is the job state that submits a normal job and follows
// the submitted task until it ends.
type NormalJobExecuting struct {
	// lastStatus is the most recently received task status.
	lastStatus pcmsdk.TaskStatus
}
// NewNormalJobExecuting creates a NormalJobExecuting state whose last status
// starts as "Begin".
func NewNormalJobExecuting() *NormalJobExecuting {
	state := new(NormalJobExecuting)
	state.lastStatus = "Begin"
	return state
}
// Run executes the job, then moves it to the failure state when the
// execution returned an error and to the success state otherwise.
func (s *NormalJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if failure := s.do(rtx, jo); failure != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(failure))
		return
	}

	rtx.Mgr.ChangeState(jo, SuccessComplete())
}
// Dump is not implemented yet and always returns nil.
func (s *NormalJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}
// do submits the job as a task to its target computing center and follows
// the task's status updates.
//
// It loads the PCM image, the computing center and the center's resources,
// submits the task using the first resource, and then receives status
// updates in a loop. It returns nil once the task reports success, and an
// error when a lookup fails, no resource exists, receiving fails, or the
// task reports an error or failure.
func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	norJob := jo.Body.(*job.NormalJob)

	log := logger.WithType[NormalJobExecuting]("State").WithField("JobID", jo.JobID)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	store := rtx.Mgr.DB

	pcmImgInfo, err := store.PCMImage().GetByImageIDAndCCID(store.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting pcm image info: %w", err)
	}

	ccInfo, err := store.ComputingCenter().GetByID(store.SQLCtx(), norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	// TODO environment variables such as DATA_IN and DATA_OUT need to be
	// added; their values come from the job's info
	ress, err := store.CCResource().GetByCCID(store.SQLCtx(), norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center resource: %w", err)
	}
	if len(ress) == 0 {
		return fmt.Errorf("no resource found at computing center %v", norJob.TargetCCID)
	}

	// TODO algorithm for choosing the resource; the first one is used for now
	submit := exetsk.NewSubmitTask(
		ccInfo.PCMParticipantID,
		pcmImgInfo.PCMImageID,
		ress[0].PCMResourceID,
		norJob.Info.Runtime.Command,
		norJob.Info.Runtime.Envs,
	)
	wt := rtx.Mgr.ExecMgr.StartTask(submit)
	defer wt.Close()

	for {
		received, err := wt.Receive(ctx)
		if err != nil {
			return err
		}

		update := received.(*exetsk.SubmitTaskStatus)
		if update.Error != "" {
			return fmt.Errorf("submitting task: %s", update.Error)
		}

		// Log only the transitions, not every repeated status.
		if previous := s.lastStatus; update.Status != previous {
			log.Infof("task %s -> %s", previous, update.Status)
		}
		s.lastStatus = update.Status

		if update.Status == pcmsdk.TaskStatusSuccess {
			return nil
		}
		if update.Status == pcmsdk.TaskStatusFailed {
			return fmt.Errorf("task failed")
		}
	}
}
// DataReturnJobExecuting is the job state that executes a data-return job.
type DataReturnJobExecuting struct{}
// NewDataReturnJobExecuting creates a DataReturnJobExecuting state.
func NewDataReturnJobExecuting() *DataReturnJobExecuting {
	return new(DataReturnJobExecuting)
}
// Run executes the data return, then moves the job to the failure state when
// it returned an error and to the success state otherwise.
func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if failure := s.do(rtx, jo); failure != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(failure))
		return
	}

	rtx.Mgr.ChangeState(jo, SuccessComplete())
}
// Dump is not implemented yet and always returns nil.
func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}
// do creates a package from the target job's output directory.
//
// It looks up the computing center of the target job, starts a
// storage-create-package task for the target job's output path and stores
// the resulting PackageID in the job body. It returns an error when the
// lookup fails, receiving the task status fails, or the task reports an
// error.
func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	reJob := jo.Body.(*job.DataReturnJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	store := rtx.Mgr.DB
	ccInfo, err := store.ComputingCenter().GetByID(store.SQLCtx(), reJob.TargetJobCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	createPackage := exetsk.NewStorageCreatePackage(
		1, // TODO user ID
		ccInfo.CDSStorageID,
		reJob.TargetJobOutputFullPath,
		reJob.Info.BucketID,
		utils.MakeResourcePackageName(jo.JobID),
	)
	wt := rtx.Mgr.ExecMgr.StartTask(createPackage)
	defer wt.Close()

	received, err := wt.Receive(ctx)
	if err != nil {
		return err
	}

	result := received.(*exetsk.StorageCreatePackageStatus)
	if result.Error != "" {
		return fmt.Errorf("creating package: %s", result.Error)
	}

	reJob.DataReturnPackageID = result.PackageID
	return nil
}

View File

@ -0,0 +1,61 @@
package state
import (
"context"
"fmt"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// MakingAdjustScheme is the job state that asks the advisor for an adjust
// scheme.
type MakingAdjustScheme struct{}
// NewMakeingAdjustScheme creates a MakingAdjustScheme state.
func NewMakeingAdjustScheme() *MakingAdjustScheme {
	return new(MakingAdjustScheme)
}
// Run requests the adjust scheme, then moves the job to the failure state on
// error and to Adjusting with the received scheme otherwise.
func (s *MakingAdjustScheme) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	norJob := jo.Body.(*job.NormalJob)

	scheme, failure := s.do(rtx, norJob)
	if failure != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(failure))
		return
	}

	rtx.Mgr.ChangeState(jo, NewAdjusting(*scheme))
}
// do starts a make-adjust-scheme task on the advisor and returns the scheme
// it produces.
//
// A Cancel event on rtx.EventSet cancels the wait. An error is returned when
// receiving the task status fails or the task reports an error.
func (s *MakingAdjustScheme) do(rtx jobmgr.JobStateRunContext, norJob *job.NormalJob) (*jobmod.JobScheduleScheme, error) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for the cancel event.
	go func() {
		event.WaitType[event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	current := jobmod.NormalJobStatus{
		TargetCCID: norJob.TargetCCID,
		Files:      norJob.Files,
	}
	wt := rtx.Mgr.AdvMgr.StartTask(advtsk.NewMakeAdjustScheme(norJob.Info, current))
	defer wt.Close()

	received, err := wt.Receive(ctx)
	if err != nil {
		return nil, fmt.Errorf("making adjust scheme: %w", err)
	}

	result := received.(*advtsk.MakeAdjustSchemeStatus)
	if result.Error != "" {
		return nil, fmt.Errorf("making adjust scheme: %s", result.Error)
	}

	return &result.Scheme, nil
}
// Dump is not implemented yet and always returns nil.
func (s *MakingAdjustScheme) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}

View File

@ -0,0 +1,251 @@
package state
import (
"context"
"errors"
"fmt"
"sync"
"time"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// PreScheduling is the job state that applies a pre-schedule scheme to a
// normal job.
type PreScheduling struct {
	// scheme is the schedule scheme to apply.
	scheme jobmod.JobScheduleScheme
	// targetCCInfo is the computing center named by scheme.TargetCCID; it is
	// filled in while the state runs.
	targetCCInfo schmod.ComputingCenter
}
// NewPreSchuduling creates a PreScheduling state that will apply the given
// scheme.
func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling {
	state := new(PreScheduling)
	state.scheme = scheme
	return state
}
// Run applies the pre-schedule scheme to the job.
//
// It looks up the target computing center and then schedules the dataset,
// code and image files concurrently. A Cancel event on rtx.EventSet, or a
// failure of any of the three schedulings, cancels the shared context so
// that the remaining work stops early. The job moves to the failure state
// with the joined errors when anything failed, and to ReadyToAdjust
// otherwise.
func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	norJob := jo.Body.(*job.NormalJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for the cancel event.
	go func() {
		event.WaitType[event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
		return
	}
	s.targetCCInfo = ccInfo

	wg := sync.WaitGroup{}
	wg.Add(3)

	var e1, e2, e3 error
	go func() {
		defer wg.Done()
		e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset)
		if e1 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code)
		if e2 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image)
		if e3 != nil {
			cancel()
		}
	}()

	// The goroutines write e1..e3, so all of them must have finished before
	// the errors are read. Returning earlier would also run the deferred
	// cancel and abort the schedulings that are still in progress.
	wg.Wait()

	// Report the joined scheduling errors; err is always nil at this point.
	if allErr := errors.Join(e1, e2, e3); allErr != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(allErr))
		return
	}

	rtx.Mgr.ChangeState(jo, NewReadyToAdjust())
}
// Dump is not implemented yet and always returns nil.
func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}
// doPackageScheduling resolves the package behind one job file (dataset or
// code) and applies the scheme's action to it.
//
// Resolving the package depends on the kind of fileInfo:
//   - local file: wait for the LocalFileUploaded event with the same local
//     path and take the PackageID from it;
//   - package: take the PackageID from the info;
//   - resource: nothing to do, return nil immediately;
//   - anything else: return an error.
//
// Afterwards ActionMove starts a cache-move task and ActionLoad a
// storage-load task, each waiting for the task's status. Any other action
// needs no work. An error is returned when the upload event never arrives or
// carries an error, when receiving a task status fails, or when the task
// reports an error.
func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		file.PackageID = evt.PackageID

	case *schsdk.PackageJobFileInfo:
		file.PackageID = info.PackageID

	case *schsdk.ResourceJobFileInfo:
		return nil

	default:
		return fmt.Errorf("unknown dataset type: %T", info)
	}

	if scheme.Action == jobmod.ActionMove {
		// TODO UserID
		wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID))
		defer wt.Close()

		status, err := wt.Receive(ctx)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		moveStatus := status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		return nil
	}

	if scheme.Action == jobmod.ActionLoad {
		// TODO UserID
		wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID))
		defer wt.Close()

		status, err := wt.Receive(ctx)
		if err != nil {
			return fmt.Errorf("loading package: %w", err)
		}

		// The task started above is a StorageLoadPackage, so its status is
		// asserted as the matching status type; asserting the cache-move
		// status type here would panic.
		loadStatus := status.(*exectsk.StorageLoadPackageStatus)
		if loadStatus.Error != "" {
			return fmt.Errorf("loading package: %s", loadStatus.Error)
		}

		return nil
	}

	return nil
}
// doImageScheduling resolves the image behind the job's image file and, when
// the scheme asks for ActionImportImage, imports it into the target
// computing center.
//
// For a local file it waits for the matching LocalFileUploaded event and
// creates a new image record for the uploaded package; for an image info it
// loads the existing image record. Any other kind of fileInfo leaves file
// untouched.
//
// Importing moves the image's package to the target node, requires the
// package to contain exactly one object, uploads that object as an image and
// records the resulting PCM image. Every failing step is returned as an
// error.
func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		// The upload has finished, so a new, empty image record can be created.
		// TODO image name
		// Unix() returns an int64, so it must be formatted with %d; %s would
		// yield "UPLOAD@%!s(int64=...)".
		imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%d", time.Now().Unix()), time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		// Fill in ImageID and PackageID.
		file.ImageID = imgID
		file.PackageID = &evt.PackageID

	case *schsdk.ImageJobFileInfo:
		imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
		if err != nil {
			return fmt.Errorf("getting image info: %w", err)
		}

		file.ImageID = imageInfo.ImageID
		file.PackageID = imageInfo.CDSPackageID
	}

	if scheme.Action == jobmod.ActionImportImage {
		if file.PackageID == nil {
			return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		}

		// TODO UserID
		wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID))
		defer wt.Close()

		status, err := wt.Receive(ctx)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		moveStatus := status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		// TODO UserID
		pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}

		if len(pkgObjs.Objects) == 0 {
			return fmt.Errorf("no object in the package which will be imported")
		}

		if len(pkgObjs.Objects) > 1 {
			return fmt.Errorf("there must be only 1 object in the package which will be imported")
		}

		wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
		defer wt2.Close()

		status2, err := wt2.Receive(ctx)
		if err != nil {
			return fmt.Errorf("uploading image: %w", err)
		}

		uploadStatus := status2.(*exectsk.UploadImageStatus)
		if uploadStatus.Error != "" {
			return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		}

		// TODO image name
		// NOTE(review): the record is keyed by norJob.TargetCCID while the
		// image was uploaded to s.targetCCInfo — confirm both always name the
		// same computing center.
		err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, norJob.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		return nil
	}

	return nil
}

View File

@ -0,0 +1,65 @@
package state
import (
"context"
"fmt"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// ReadyToAdjust is the job state a normal job is in before its adjust scheme
// is made.
type ReadyToAdjust struct{}
// NewReadyToAdjust creates a ReadyToAdjust state.
func NewReadyToAdjust() *ReadyToAdjust {
	return new(ReadyToAdjust)
}
// Run waits for the job's dependency, then moves the job to the failure
// state on error and to MakingAdjustScheme otherwise.
func (s *ReadyToAdjust) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if failure := s.do(rtx, jo); failure != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(failure))
		return
	}

	rtx.Mgr.ChangeState(jo, NewMakeingAdjustScheme())
}
// do resolves the dataset of a job whose dataset is produced by another job.
//
// When the dataset is a resource file info, it waits for the JobCompleted
// event of the job with the referenced local job ID and copies that job's
// DataReturnPackageID into the dataset. Other dataset kinds need no work.
//
// It returns jobmgr.ErrJobCancelled when the wait ends without an event, and
// an error when the depended job failed or is not a DataReturn job.
func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	norJob := jo.Body.(*job.NormalJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for the cancel event.
	go func() {
		event.WaitType[event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	if rt, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok {
		evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool {
			return val.Job.GetInfo().GetLocalJobID() == rt.ResourceLocalJobID
		})
		if !ok {
			return jobmgr.ErrJobCancelled
		}
		if evt.Err != nil {
			return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
		}

		rtJob, ok := evt.Job.Body.(*job.DataReturnJob)
		if !ok {
			// Report the type of the body that failed the assertion; the
			// type of evt.Job itself is always the job wrapper.
			return fmt.Errorf("job %s is not a DataReturn job(which is %T)", evt.Job.JobID, evt.Job.Body)
		}

		norJob.Files.Dataset.PackageID = rtJob.DataReturnPackageID
	}

	return nil
}
// Dump is not implemented yet and always returns nil.
func (s *ReadyToAdjust) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}

View File

@ -0,0 +1,40 @@
package state
import (
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
)
// NormalJobReadyToExecute is the job state of a normal job that is ready to
// be executed.
type NormalJobReadyToExecute struct{}
// NewNormalJobReadyToExecute creates a NormalJobReadyToExecute state.
func NewNormalJobReadyToExecute() *NormalJobReadyToExecute {
	return new(NormalJobReadyToExecute)
}
// Run moves the job straight to the NormalJobExecuting state.
func (s *NormalJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO for now the execution is started directly
	executing := NewNormalJobExecuting()
	rtx.Mgr.ChangeState(jo, executing)
}
// Dump is not implemented yet and always returns nil.
func (s *NormalJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}
// DataReturnJobReadyToExecute is the job state of a data-return job that is
// ready to be executed.
type DataReturnJobReadyToExecute struct{}
// NewDataReturnJobReadyToExecute creates a DataReturnJobReadyToExecute state.
func NewDataReturnJobReadyToExecute() *DataReturnJobReadyToExecute {
	return new(DataReturnJobReadyToExecute)
}
// Run moves the job straight to the DataReturnJobExecuting state.
func (s *DataReturnJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO for now the execution is started directly
	executing := NewDataReturnJobExecuting()
	rtx.Mgr.ChangeState(jo, executing)
}
// Dump is not implemented yet and always returns nil.
func (s *DataReturnJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}

View File

@ -0,0 +1,62 @@
package state
import (
"context"
"fmt"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// WaitTargetComplete is the job state in which a data-return job waits for
// its target job to complete.
type WaitTargetComplete struct{}
// NewWaitTargetComplete creates a WaitTargetComplete state.
func NewWaitTargetComplete() *WaitTargetComplete {
	return new(WaitTargetComplete)
}
// Run waits for the target job, then moves the job to the failure state on
// error and to DataReturnJobReadyToExecute otherwise.
func (s *WaitTargetComplete) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if failure := s.do(rtx, jo); failure != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(failure))
		return
	}

	rtx.Mgr.ChangeState(jo, NewDataReturnJobReadyToExecute())
}
// do waits for the JobCompleted event of the job named by the data-return
// job's TargetLocalJobID, then copies that job's target computing center and
// output path into the data-return job.
//
// It returns jobmgr.ErrJobCancelled when the wait ends without an event, and
// an error when the depended job failed or is not a Normal job.
func (s *WaitTargetComplete) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	reJob := jo.Body.(*job.DataReturnJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for the cancel event.
	go func() {
		event.WaitType[event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool {
		return val.Job.GetInfo().GetLocalJobID() == reJob.Info.TargetLocalJobID
	})
	if !ok {
		return jobmgr.ErrJobCancelled
	}
	if evt.Err != nil {
		return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
	}

	norJob, ok := evt.Job.Body.(*job.NormalJob)
	if !ok {
		// Report the type of the body that failed the assertion; the type of
		// evt.Job itself is always the job wrapper.
		return fmt.Errorf("job %s is not a Normal job(which is %T)", evt.Job.JobID, evt.Job.Body)
	}

	reJob.TargetJobCCID = norJob.TargetCCID
	reJob.TargetJobOutputFullPath = norJob.OutputFullPath

	return nil
}
// Dump is not implemented yet and always returns nil.
func (s *WaitTargetComplete) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO: report the status of this state
	return nil
}

View File

@ -0,0 +1,14 @@
package jobmgr
import jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
// JobStateRunContext is what a job state receives when it runs.
type JobStateRunContext struct {
	// Mgr is the job manager.
	Mgr *Manager
	// EventSet is the event set the state waits on.
	EventSet *EventSet
	// LastState is the state the job was in before the current one.
	LastState JobState
}
// JobState is one state of a job.
type JobState interface {
	// Run executes the state for the given job.
	Run(ctx JobStateRunContext, job *Job)
	// Dump returns the status of the state for the given job.
	Dump(ctx JobStateRunContext, job *Job) jobmod.JobStateStatus
}

View File

@ -1,283 +1,169 @@
package jobmgr package jobmgr
import ( import (
"context"
"fmt" "fmt"
"reflect"
"sync" "sync"
"time" "time"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db" "gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
) )
type mgrJob struct { type mgrJob struct {
Job jobmod.Job job Job
Handler StateHandler eventSet EventSet
state JobState
}
type mgrJobSet struct {
jobs map[schsdk.JobID]*mgrJob
} }
type Manager struct { type Manager struct {
// 任何修改job、jobset的操作都需要加这个锁 // 任何修改job、jobset的操作都需要加这个锁
pubLock sync.Mutex pubLock sync.Mutex
execMgr *executormgr.Manager ExecMgr *executormgr.Manager
advMgr *advisormgr.Manager AdvMgr *advisormgr.Manager
db *db.DB DB *db.DB
handlers map[reflect.Type]StateHandler
defaultHandler StateHandler
jobSetIDIndex int jobSetIDIndex int
jobSets map[schsdk.JobSetID]*jobmod.JobSet jobSets map[schsdk.JobSetID]*mgrJobSet
jobIDIndex int jobIDIndex int
jobs map[schsdk.JobID]*mgrJob jobs map[schsdk.JobID]*mgrJob
} }
func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) { func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) {
mgr := &Manager{ mgr := &Manager{
execMgr: execMgr, ExecMgr: execMgr,
advMgr: advMgr, AdvMgr: advMgr,
db: db, DB: db,
jobSets: make(map[schsdk.JobSetID]*mgrJobSet),
handlers: make(map[reflect.Type]StateHandler), jobs: make(map[schsdk.JobID]*mgrJob),
jobSets: make(map[schsdk.JobSetID]*jobmod.JobSet),
jobs: make(map[schsdk.JobID]*mgrJob),
} }
execMgr.OnTaskUpdated(mgr.executorTaskUpdated)
execMgr.OnTaskTimeout(mgr.executorTaskTimeout)
advMgr.OnTaskUpdated(mgr.advisorTaskUpdated)
advMgr.OnTaskTimeout(mgr.advisorTaskTimeout)
// TODO 考虑优化这部分逻辑
mgr.handlers[myreflect.TypeOf[*jobmod.StatePreScheduling]()] = NewPreSchedulingHandler(mgr)
mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToAdjust]()] = NewReadyToAdjustHandler(mgr)
mgr.handlers[myreflect.TypeOf[*jobmod.StateMakingAdjustScheme]()] = NewMakingAdjustSchemeHandler(mgr)
mgr.handlers[myreflect.TypeOf[*jobmod.StateAdjusting]()] = NewAdjustingHandler(mgr)
mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToExecute]()] = NewReadyToExecuteHandler(mgr)
mgr.handlers[myreflect.TypeOf[*jobmod.StateExecuting]()] = NewExecutingHandler(mgr)
compHder := NewCompleteHandler(mgr)
mgr.handlers[myreflect.TypeOf[*jobmod.StateFailed]()] = compHder
mgr.handlers[myreflect.TypeOf[*jobmod.StateSuccess]()] = compHder
mgr.defaultHandler = NewDefaultHandler(mgr)
return mgr, nil return mgr, nil
} }
func (m *Manager) Serve() error { func (m *Manager) Serve() error {
for _, h := range m.handlers {
go h.Serve()
}
go m.defaultHandler.Serve()
ticker := time.NewTicker(time.Minute) ticker := time.NewTicker(time.Minute)
defer ticker.Stop() defer ticker.Stop()
for {
select {
case <-ticker.C:
// 每一分钟产生一个空事件,防止无限等待
m.pubLock.Lock()
m.onEvent(event.ToAll(), nil)
m.pubLock.Unlock()
}
}
return nil return nil
} }
func (m *Manager) Stop() { func (m *Manager) Stop() {
for _, h := range m.handlers {
h.Stop()
}
m.defaultHandler.Stop()
} }
func (m *Manager) SubmitJobSet(jobSetInfo schsdk.JobSetInfo, preScheduleScheme jobmod.JobSetPreScheduleScheme) (*jobmod.JobSet, error) { func (m *Manager) ChangeState(job *Job, state JobState) {
m.pubLock.Lock()
defer m.pubLock.Unlock()
mgrJob, ok := m.jobs[job.JobID]
if !ok {
return
}
lastState := mgrJob.state
mgrJob.state = state
go func() {
state.Run(JobStateRunContext{
Mgr: m,
EventSet: &mgrJob.eventSet,
LastState: lastState,
}, job)
}()
}
// PostEvent posts evt to the event set of the job with the given ID. Unknown
// job IDs are ignored. The post itself runs in its own goroutine.
func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	if target, known := m.jobs[jobID]; known {
		go func() {
			target.eventSet.Post(evt)
		}()
	}
}
// BroadcastEvent posts evt to the event set of every job in the job set with
// the given ID. Unknown job set IDs are ignored. Each post runs in its own
// goroutine.
func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	jobSet, ok := m.jobSets[jobSetID]
	if !ok {
		return
	}

	for _, mgrJob := range jobSet.jobs {
		// Hand the job to the goroutine as an argument: before Go 1.22 the
		// range variable is shared by all iterations, so a closure capturing
		// it could post the event to the wrong job.
		go func(target *mgrJob) {
			target.eventSet.Post(evt)
		}(mgrJob)
	}
}
// SubmittingJob describes one job handed to SubmitJobSet.
type SubmittingJob struct {
	// Body is the concrete job.
	Body JobBody
	// InitState is the state the job is put into when it is submitted.
	InitState JobState
}
func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID {
m.pubLock.Lock() m.pubLock.Lock()
defer m.pubLock.Unlock() defer m.pubLock.Unlock()
jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex)) jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))
var jobs []jobmod.Job
var normalJobs []*jobmod.NormalJob
var resJobs []*jobmod.ResourceJob
var jobRefs []jobmod.JobSetJobRef
for i, jobInfo := range jobSetInfo.Jobs {
jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
switch info := jobInfo.(type) {
case *schsdk.NormalJobInfo:
job := jobmod.NewNormalJob(jobSetID, jobID, *info)
jobs = append(jobs, job)
normalJobs = append(normalJobs, job)
jobRefs = append(jobRefs, jobmod.JobSetJobRef{
LocalJobID: info.LocalJobID,
JobID: jobID,
})
preSch, ok := preScheduleScheme.JobSchemes[info.LocalJobID]
if !ok {
return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
}
job.State = jobmod.NewStatePreScheduling(preSch)
job.TargetCCID = preSch.TargetCCID
case *schsdk.ResourceJobInfo:
job := jobmod.NewResourceJob(jobSetID, jobID, *info)
jobs = append(jobs, job)
resJobs = append(resJobs, job)
jobRefs = append(jobRefs, jobmod.JobSetJobRef{
LocalJobID: info.LocalJobID,
JobID: jobID,
})
// 回源任务不需要预调度,所以直接是进入待调整状态
job.State = jobmod.NewStateReadyToAdjust()
}
}
// TODO 可以考虑检查一下有依赖的任务的信息所描述依赖的LocalJobID是不是有效的
jobSet := jobmod.NewJobSet(jobSetID, jobRefs, preScheduleScheme)
m.jobSets[jobSetID] = jobSet
for _, job := range jobs {
m.jobs[job.GetJobID()] = &mgrJob{
Job: job,
}
m.handleState(job)
}
m.jobSetIDIndex += 1 m.jobSetIDIndex += 1
m.jobIDIndex += len(jobSetInfo.Jobs)
return jobSet, nil jobSet := &mgrJobSet{
jobs: make(map[schsdk.JobID]*mgrJob),
}
m.jobSets[jobSetID] = jobSet
for i, subJob := range jobs {
jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
job := &mgrJob{
job: Job{
JobSetID: jobSetID,
JobID: jobID,
Body: subJob.Body,
},
eventSet: NewEventSet(),
}
jobSet.jobs[jobID] = job
m.ChangeState(&job.job, subJob.InitState)
}
m.jobIDIndex += len(jobs)
return jobSetID
} }
func (m *Manager) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) error { func (m *Manager) DumpJobSet(jobSetID schsdk.JobSetID) []jobmod.JobStatus {
m.pubLock.Lock() m.pubLock.Lock()
defer m.pubLock.Unlock() defer m.pubLock.Unlock()
for _, h := range m.handlers { jobSet, ok := m.jobSets[jobSetID]
h.OnEvent(event.ToJobSet(jobSetID), event.NewLocalFileUploaded(jobSetID, localPath, err, packageID))
}
return nil
}
func (m *Manager) executorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus exectsk.TaskStatus) {
m.pubLock.Lock()
defer m.pubLock.Unlock()
job, ok := m.jobs[jobID]
if !ok { if !ok {
return return nil
} }
job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskUpdated(fullTaskID, taskStatus)) var jobStatuses []jobmod.JobStatus
} for _, mgrJob := range jobSet.jobs {
jobStatuses = append(jobStatuses, mgrJob.job.Dump(JobStateRunContext{
func (m *Manager) executorTaskTimeout(jobID schsdk.JobID, fullTaskID string) { Mgr: m,
m.pubLock.Lock() EventSet: &mgrJob.eventSet,
defer m.pubLock.Unlock() LastState: mgrJob.state,
}, &mgrJob.job, mgrJob.state))
job, ok := m.jobs[jobID] }
if !ok {
return return jobStatuses
}
job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskTimeout(fullTaskID))
}
// advisorTaskUpdated forwards an advisor task status update to the handler
// currently attached to the job. Unknown job IDs are ignored.
func (m *Manager) advisorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	if entry, found := m.jobs[jobID]; found {
		entry.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskUpdated(fullTaskID, taskStatus))
	}
}
// advisorTaskTimeout forwards an advisor task timeout to the handler
// currently attached to the job. Unknown job IDs are ignored.
func (m *Manager) advisorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	if entry, found := m.jobs[jobID]; found {
		entry.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskTimeout(fullTaskID))
	}
}
// CloneJob sends a clone event to the handler currently attached to the job
// and waits for the value set on the event's callback. It returns an error
// when the job ID is unknown.
//
// pubLock is held only while the event is dispatched; it is released before
// waiting on the callback.
func (m *Manager) CloneJob(jobID schsdk.JobID) (jobmod.Job, error) {
	m.pubLock.Lock()

	entry, found := m.jobs[jobID]
	if found {
		cloneEvt := event.NewCloneJob()
		entry.Handler.OnEvent(event.ToJob(jobID), cloneEvt)
		m.pubLock.Unlock()

		return cloneEvt.Callback.WaitValue(context.Background())
	}

	m.pubLock.Unlock()
	return nil, fmt.Errorf("job not found")
}
// handleState picks the handler registered for the concrete type of the job's
// current state and lets it handle the job. The default handler is used when
// the state is nil or when no handler is registered for its type. The chosen
// handler is recorded on the manager's entry for the job. Jobs the manager
// does not know are ignored.
//
// The caller must hold m.pubLock.
func (m *Manager) handleState(job jobmod.Job) {
	state := job.GetState()

	// reflect.TypeOf returns a nil Type for a nil state. Formatting it with %v
	// prints "<nil>" in that case, whereas calling String() on it would panic
	// before the nil-state fallback below is ever reached.
	logger.WithField("JobID", job.GetJobID()).
		WithField("State", fmt.Sprintf("%v", reflect.TypeOf(state))).
		Debugf("job state changed")

	runtime, ok := m.jobs[job.GetJobID()]
	if !ok {
		return
	}

	if state != nil {
		if handler, ok := m.handlers[reflect.TypeOf(state)]; ok {
			runtime.Handler = handler
			handler.Handle(job)
			return
		}
	}

	// No state, or no handler registered for this kind of state.
	runtime.Handler = m.defaultHandler
	m.defaultHandler.Handle(job)
}
func (m *Manager) onEvent(broadcast event.Broadcast, evt event.Event) {
for _, h := range m.handlers {
h.OnEvent(broadcast, evt)
}
} }

View File

@ -1,139 +0,0 @@
package jobmgr
import (
"fmt"
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/actor"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// makingAdjustSchemeJob is the handler's bookkeeping entry for one job: the
// normal job together with its StateMakingAdjustScheme state.
type makingAdjustSchemeJob struct {
job *jobmod.NormalJob
state *jobmod.StateMakingAdjustScheme
}
// MakingAdjustSchemeHandler handles jobs whose state is
// StateMakingAdjustScheme.
type MakingAdjustSchemeHandler struct {
mgr *Manager
// jobs holds the jobs currently tracked by this handler, keyed by job ID.
jobs map[schsdk.JobID]*makingAdjustSchemeJob
// cmdChan carries the closures queued by Handle and OnEvent; Serve runs them.
cmdChan actor.CommandChannel
}
// NewMakingAdjustSchemeHandler creates a handler bound to mgr with an empty
// job table and a fresh command channel.
func NewMakingAdjustSchemeHandler(mgr *Manager) *MakingAdjustSchemeHandler {
	handler := new(MakingAdjustSchemeHandler)
	handler.mgr = mgr
	handler.jobs = make(map[schsdk.JobID]*makingAdjustSchemeJob)
	handler.cmdChan = *actor.NewCommandChannel()
	return handler
}
// Handle queues the job on the handler's command channel. Once the queued
// closure runs, the job is registered with the handler and processed once
// with a nil event. A job that is not a normal job, or whose state is not
// StateMakingAdjustScheme, is moved to the failed state instead.
func (h *MakingAdjustSchemeHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		// fail moves the job to the failed state, recording the state it
		// failed in.
		fail := func(reason string) {
			h.changeJobState(job, jobmod.NewStateFailed(reason, job.GetState()))
		}

		normal, isNormal := job.(*jobmod.NormalJob)
		if !isNormal {
			fail(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)))
			return
		}

		making, isMaking := job.GetState().(*jobmod.StateMakingAdjustScheme)
		if !isMaking {
			fail(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())))
			return
		}

		entry := &makingAdjustSchemeJob{job: normal, state: making}
		h.jobs[job.GetJobID()] = entry
		h.onJobEvent(nil, entry)
	})
}
// onJobEvent advances one tracked job in response to evt (which may be nil).
//
// A clone event is answered with a clone of the job. Otherwise the advisor
// task that makes the adjust scheme is started if the state has no task ID
// yet, and evt is then checked against that task: a timeout or a reported
// error fails the job, a result moves it to the adjusting state.
func (h *MakingAdjustSchemeHandler) onJobEvent(evt event.Event, job *makingAdjustSchemeJob) {
	if cloneEvt, isClone := evt.(*event.CloneJob); isClone {
		cloneEvt.Callback.SetValue(job.job.Clone())
		return
	}

	// fail moves the job to the failed state with the given reason.
	fail := func(reason string) {
		h.changeJobState(job.job, jobmod.NewStateFailed(reason, job.state))
	}

	if job.state.FullTaskID == "" {
		taskID, err := h.mgr.advMgr.StartTask(job.job.GetJobID(), advtsk.NewMakeAdjustScheme(*job.job))
		if err != nil {
			fail(err.Error())
			return
		}
		job.state.FullTaskID = taskID
	}

	result, err := event.AssertAdvisorTaskStatus[*advtsk.MakeAdjustSchemeStatus](evt, job.state.FullTaskID)
	if err == event.ErrUnconcernedTask {
		return
	}
	if err == event.ErrTaskTimeout {
		fail("make adjust scheme timeout")
		return
	}

	h.mgr.advMgr.ForgetTask(job.state.FullTaskID)

	if result.Error != "" {
		fail(result.Error)
		return
	}

	h.changeJobState(job.job, jobmod.NewStateAdjusting(result.Scheme))
}
// changeJobState sets the job's new state, drops the job from this handler's
// table and asks the manager, under its pubLock, to dispatch the job to the
// handler for the new state.
func (h *MakingAdjustSchemeHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)
	delete(h.jobs, job.GetJobID())

	mgr := h.mgr
	mgr.pubLock.Lock()
	mgr.handleState(job)
	mgr.pubLock.Unlock()
}
// OnEvent queues evt for the jobs selected by broadcast: every tracked job,
// the tracked jobs of one job set, or a single tracked job.
func (h *MakingAdjustSchemeHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		switch {
		case broadcast.ToAll():
			for _, tracked := range h.jobs {
				h.onJobEvent(evt, tracked)
			}

		case broadcast.ToJobSet():
			for _, tracked := range h.jobs {
				if tracked.job.GetJobSetID() == broadcast.JobSetID {
					h.onJobEvent(evt, tracked)
				}
			}

		case broadcast.ToJob():
			if tracked, found := h.jobs[broadcast.JobID]; found {
				h.onJobEvent(evt, tracked)
			}
		}
	})
}
// Serve receives the closures queued on the command channel and runs them one
// at a time. The loop has no exit condition.
func (h *MakingAdjustSchemeHandler) Serve() {
	commands := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		run := <-commands
		run()
	}
}
// Stop is meant to stop the handler; it currently does nothing.
func (h *MakingAdjustSchemeHandler) Stop() {
// TODO: support stopping
}

View File

@ -1,442 +0,0 @@
package jobmgr
import (
"fmt"
"reflect"
"time"
"gitlink.org.cn/cloudream/common/pkgs/actor"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// ErrPreScheduleFailed is a sentinel error for a failed pre-schedule.
// NOTE(review): nothing in this file refers to it; confirm it is used elsewhere.
var ErrPreScheduleFailed = fmt.Errorf("pre schedule failed")
// preSchedulingJob is the handler's bookkeeping entry for one job.
type preSchedulingJob struct {
job *jobmod.NormalJob
state *jobmod.StatePreScheduling
// ccInfo is the computing center named by the pre-schedule scheme's
// TargetCCID, loaded from the database when the job is handled.
ccInfo schmod.ComputingCenter
}
// PreSchedulingHandler handles jobs whose state is StatePreScheduling.
type PreSchedulingHandler struct {
mgr *Manager
// jobs holds the jobs currently tracked by this handler, keyed by job ID.
jobs map[schsdk.JobID]*preSchedulingJob
// cmdChan carries the closures queued by Handle and OnEvent; Serve runs them.
cmdChan actor.CommandChannel
}
// NewPreSchedulingHandler creates a handler bound to mgr with an empty job
// table and a fresh command channel.
func NewPreSchedulingHandler(mgr *Manager) *PreSchedulingHandler {
	handler := new(PreSchedulingHandler)
	handler.mgr = mgr
	handler.jobs = make(map[schsdk.JobID]*preSchedulingJob)
	handler.cmdChan = *actor.NewCommandChannel()
	return handler
}
// Handle queues the job on the handler's command channel. Once the queued
// closure runs, it loads the computing center named by the pre-schedule
// scheme, records it as the job's target, registers the job with the handler
// and processes it once with a nil event. Any failure along the way moves the
// job to the failed state.
func (h *PreSchedulingHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		// fail moves the job to the failed state, recording the state it
		// failed in.
		fail := func(reason string) {
			h.changeJobState(job, jobmod.NewStateFailed(reason, job.GetState()))
		}

		normal, isNormal := job.(*jobmod.NormalJob)
		if !isNormal {
			fail(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)))
			return
		}

		preSch, isPreSch := normal.GetState().(*jobmod.StatePreScheduling)
		if !isPreSch {
			fail(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())))
			return
		}

		// NOTE(review): the collector client is acquired and released but
		// never used in this function — confirm whether it is still needed.
		collector, err := schglb.CollectorMQPool.Acquire()
		if err != nil {
			fail(fmt.Sprintf("new collector client: %s", err))
			return
		}
		defer schglb.CollectorMQPool.Release(collector)

		center, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), preSch.Scheme.TargetCCID)
		if err != nil {
			fail(fmt.Sprintf("getting computing center info: %s", err.Error()))
			return
		}

		normal.TargetCCID = preSch.Scheme.TargetCCID

		entry := &preSchedulingJob{job: normal, state: preSch, ccInfo: center}
		h.jobs[job.GetJobID()] = entry
		h.onJobEvent(nil, entry)
	})
}
// onJobEvent advances one tracked job in response to evt (which may be nil).
//
// A clone event is answered with a clone of the job. Otherwise the dataset,
// code and image files are scheduled in that order; the first error is stored
// on the corresponding file state and fails the job. When all three files
// have reached StepCompleted the job moves to the ready-to-adjust state.
func (h *PreSchedulingHandler) onJobEvent(evt event.Event, job *preSchedulingJob) {
	if cloneEvt, isClone := evt.(*event.CloneJob); isClone {
		cloneEvt.Callback.SetValue(job.job.Clone())
		return
	}

	normal, st := job.job, job.state

	// fail moves the job to the failed state with err as the reason.
	fail := func(err error) {
		h.changeJobState(normal, jobmod.NewStateFailed(err.Error(), st))
	}

	if err := h.doPackageScheduling(evt, job,
		normal.Info.Files.Dataset, &normal.Files.Dataset,
		&st.Scheme.Dataset, &st.Dataset,
	); err != nil {
		st.Dataset.Error = err.Error()
		fail(err)
		return
	}

	if err := h.doPackageScheduling(evt, job,
		normal.Info.Files.Code, &normal.Files.Code,
		&st.Scheme.Code, &st.Code,
	); err != nil {
		st.Code.Error = err.Error()
		fail(err)
		return
	}

	if err := h.doImageScheduling(evt, job,
		normal.Info.Files.Image, &normal.Files.Image,
		&st.Scheme.Image, &st.Image,
	); err != nil {
		st.Image.Error = err.Error()
		fail(err)
		return
	}

	// Once all three kinds of files are scheduled, move on to the next stage.
	allDone := st.Dataset.Step == jobmod.StepCompleted &&
		st.Code.Step == jobmod.StepCompleted &&
		st.Image.Step == jobmod.StepCompleted
	if allDone {
		h.changeJobState(normal, jobmod.NewStateReadyToAdjust())
	}
}
// changeJobState sets the job's new state, drops the job from this handler's
// table and asks the manager, under its pubLock, to dispatch the job to the
// handler for the new state.
func (h *PreSchedulingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)
	delete(h.jobs, job.GetJobID())

	mgr := h.mgr
	mgr.pubLock.Lock()
	mgr.handleState(job)
	mgr.pubLock.Unlock()
}
// doPackageScheduling advances the scheduling of one package file (dataset or
// code) by as many steps as the current event allows.
//
// state.Step records the progress; the step checks below run in sequence, so
// a single call can fall through several steps. fileInfo is the file as
// described by the job, file receives the resolved package ID / full path,
// and scheme is the pre-scheduled action for this file. A nil return means
// either progress was made or the event did not concern this file; a non-nil
// error means the scheduling of this file failed.
func (h *PreSchedulingHandler) doPackageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
// TODO: consider splitting this into several functions
// Begin: choose the starting step from the kind of file info.
if state.Step == jobmod.StepBegin {
switch info := fileInfo.(type) {
case *schsdk.LocalJobFileInfo:
state.Step = jobmod.StepUploading
case *schsdk.PackageJobFileInfo:
file.PackageID = info.PackageID
state.Step = jobmod.StepUploaded
case *schsdk.ResourceJobFileInfo:
state.Step = jobmod.StepCompleted
default:
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
}
}
// Uploading: wait for the LocalFileUploaded event whose path matches this file.
if state.Step == jobmod.StepUploading {
if evt == nil {
return nil
}
localFileCmd, ok := evt.(*event.LocalFileUploaded)
if !ok {
return nil
}
// Only StepBegin sets StepUploading, and only for *LocalJobFileInfo, which
// is what this unchecked assertion relies on.
if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath {
return nil
}
if localFileCmd.Error != "" {
return fmt.Errorf("local file uploading: %s", localFileCmd.Error)
}
file.PackageID = localFileCmd.PackageID
state.Step = jobmod.StepUploaded
}
// Uploaded: the package ID is known, so carry out the scheduled action.
if state.Step == jobmod.StepUploaded {
if scheme.Action == jobmod.ActionNo {
state.Step = jobmod.StepCompleted
return nil
}
if scheme.Action == jobmod.ActionMove {
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID))
if err != nil {
return fmt.Errorf("starting cache move package: %w", err)
}
state.Step = jobmod.StepMoving
state.FullTaskID = fullTaskID
return nil
}
if scheme.Action == jobmod.ActionLoad {
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID))
if err != nil {
return fmt.Errorf("starting stroage load package: %w", err)
}
state.Step = jobmod.StepLoading
state.FullTaskID = fullTaskID
return nil
}
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
}
// Moving: wait for the cache-move task started above.
if state.Step == jobmod.StepMoving {
moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
// presumably ErrUnconcernedTask means evt is not about this task — verify against the event package
if err == event.ErrUnconcernedTask {
return nil
}
if err == event.ErrTaskTimeout {
return fmt.Errorf("cache move package timeout")
}
h.mgr.execMgr.ForgetTask(state.FullTaskID)
if moveRet.Error != "" {
return fmt.Errorf("cache move pacakge: %s", moveRet.Error)
}
state.Step = jobmod.StepCompleted
return nil
}
// Loading: wait for the storage-load task started above and keep its path.
if state.Step == jobmod.StepLoading {
loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID)
if err == event.ErrUnconcernedTask {
return nil
}
if err == event.ErrTaskTimeout {
return fmt.Errorf("storage load package timeout")
}
h.mgr.execMgr.ForgetTask(state.FullTaskID)
if loadRet.Error != "" {
return fmt.Errorf("storage load package: %s", loadRet.Error)
}
file.FullPath = loadRet.FullPath
state.Step = jobmod.StepCompleted
return nil
}
return nil
}
// doImageScheduling advances the scheduling of the job's image file by as
// many steps as the current event allows.
//
// state.Step records the progress; the step checks below run in sequence, so
// a single call can fall through several steps. fileInfo is the image as
// described by the job, file receives the resolved image and package IDs, and
// scheme is the pre-scheduled action for the image. A nil return means either
// progress was made or the event did not concern this file; a non-nil error
// means the scheduling of the image failed.
func (h *PreSchedulingHandler) doImageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
	// TODO: consider splitting this into several functions

	// Begin: choose the starting step from the kind of file info.
	if state.Step == jobmod.StepBegin {
		switch info := fileInfo.(type) {
		case *schsdk.LocalJobFileInfo:
			state.Step = jobmod.StepUploading

		case *schsdk.ImageJobFileInfo:
			imageInfo, err := h.mgr.db.Image().GetByID(h.mgr.db.SQLCtx(), info.ImageID)
			if err != nil {
				return fmt.Errorf("getting image info: %w", err)
			}

			file.ImageID = imageInfo.ImageID
			file.PackageID = imageInfo.CDSPackageID
			state.Step = jobmod.StepUploaded

		default:
			return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(info))
		}
	}

	// Uploading: wait for the LocalFileUploaded event whose path matches.
	if state.Step == jobmod.StepUploading {
		if evt == nil {
			return nil
		}

		localFileCmd, ok := evt.(*event.LocalFileUploaded)
		if !ok {
			return nil
		}

		// Only StepBegin sets StepUploading, and only for *LocalJobFileInfo,
		// which is what this unchecked assertion relies on.
		if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath {
			return nil
		}

		if localFileCmd.Error != "" {
			return fmt.Errorf("local file uploading: %s", localFileCmd.Error)
		}

		// The upload is finished, so a new image record can be created.
		// TODO: image name
		// Unix() is an int64, so it must be formatted with %d; %s would
		// produce a name like "UPLOAD@%!s(int64=...)".
		imgID, err := h.mgr.db.Image().Create(h.mgr.db.SQLCtx(), &localFileCmd.PackageID, fmt.Sprintf("UPLOAD@%d", time.Now().Unix()), time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		// Fill in the image ID and the package ID.
		file.ImageID = imgID
		file.PackageID = &localFileCmd.PackageID
		state.Step = jobmod.StepUploaded
	}

	// Uploaded: the image record is known, so carry out the scheduled action.
	if state.Step == jobmod.StepUploaded {
		if scheme.Action == jobmod.ActionNo {
			state.Step = jobmod.StepCompleted
			return nil
		}

		// To import the image, its package must first be moved into the cache
		// of the target node.
		if scheme.Action == jobmod.ActionImportImage {
			if file.PackageID == nil {
				return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID)
			}

			fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID))
			if err != nil {
				return fmt.Errorf("starting cache move package: %w", err)
			}

			state.Step = jobmod.StepMoving
			state.FullTaskID = fullTaskID
			return nil
		}

		return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
	}

	// Moving: wait for the cache-move task, then start the image upload task
	// for the single object contained in the package.
	if state.Step == jobmod.StepMoving {
		cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
		if err == event.ErrUnconcernedTask {
			return nil
		}
		if err == event.ErrTaskTimeout {
			return fmt.Errorf("cache move package timeout")
		}

		h.mgr.execMgr.ForgetTask(state.FullTaskID)

		if cacheMoveRet.Error != "" {
			return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error)
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}

		if len(pkgObjs.Objects) == 0 {
			return fmt.Errorf("no object in the package which will be imported")
		}
		if len(pkgObjs.Objects) > 1 {
			return fmt.Errorf("there must be only 1 object in the package which will be imported")
		}

		fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
		if err != nil {
			return fmt.Errorf("starting import image: %w", err)
		}

		state.Step = jobmod.StepImageImporting
		state.FullTaskID = fullTaskID
		return nil
	}

	// Importing: wait for the upload task and record the imported image.
	if state.Step == jobmod.StepImageImporting {
		uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID)
		if err == event.ErrUnconcernedTask {
			return nil
		}
		if err == event.ErrTaskTimeout {
			return fmt.Errorf("import image timeout")
		}

		h.mgr.execMgr.ForgetTask(state.FullTaskID)

		if uploadImageRet.Error != "" {
			return fmt.Errorf("import image: %s", uploadImageRet.Error)
		}

		err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now())
		if err != nil {
			return fmt.Errorf("adding image importing info: %w", err)
		}

		state.Step = jobmod.StepCompleted
		return nil
	}

	return nil
}
// OnEvent queues evt for the jobs selected by broadcast: every tracked job,
// the tracked jobs of one job set, or a single tracked job.
func (h *PreSchedulingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		switch {
		case broadcast.ToAll():
			for _, tracked := range h.jobs {
				h.onJobEvent(evt, tracked)
			}

		case broadcast.ToJobSet():
			for _, tracked := range h.jobs {
				if tracked.job.JobSetID == broadcast.JobSetID {
					h.onJobEvent(evt, tracked)
				}
			}

		case broadcast.ToJob():
			if tracked, found := h.jobs[broadcast.JobID]; found {
				h.onJobEvent(evt, tracked)
			}
		}
	})
}
// Serve receives the closures queued on the command channel and runs them one
// at a time. The loop has no exit condition.
func (h *PreSchedulingHandler) Serve() {
	commands := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		run := <-commands
		run()
	}
}
// Stop is meant to stop the handler; it currently does nothing.
func (h *PreSchedulingHandler) Stop() {
// TODO: support stopping
}

View File

@ -1,214 +0,0 @@
package jobmgr
import (
"fmt"
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/actor"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// readyToAdjustJob is the handler's bookkeeping entry for one job: the job
// (normal or resource) together with its StateReadyToAdjust state.
type readyToAdjustJob struct {
job jobmod.Job
state *jobmod.StateReadyToAdjust
}
// ReadyToAdjustHandler handles jobs whose state is StateReadyToAdjust.
type ReadyToAdjustHandler struct {
mgr *Manager
// jobs holds the jobs currently tracked by this handler, keyed by job ID.
jobs map[schsdk.JobID]*readyToAdjustJob
// cmdChan carries the closures queued by Handle and OnEvent; Serve runs them.
cmdChan actor.CommandChannel
}
// NewReadyToAdjustHandler creates a handler bound to mgr with an empty job
// table and a fresh command channel.
func NewReadyToAdjustHandler(mgr *Manager) *ReadyToAdjustHandler {
	handler := new(ReadyToAdjustHandler)
	handler.mgr = mgr
	handler.jobs = make(map[schsdk.JobID]*readyToAdjustJob)
	handler.cmdChan = *actor.NewCommandChannel()
	return handler
}
// Handle queues the job on the handler's command channel. Once the queued
// closure runs, the job is registered with the handler and processed once
// with a nil event. A job whose state is not StateReadyToAdjust is moved to
// the failed state instead.
func (h *ReadyToAdjustHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		ready, isReady := job.GetState().(*jobmod.StateReadyToAdjust)
		if !isReady {
			reason := fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState()))
			h.changeJobState(job, jobmod.NewStateFailed(reason, job.GetState()))
			return
		}

		entry := &readyToAdjustJob{job: job, state: ready}
		h.jobs[job.GetJobID()] = entry
		h.onJobEvent(nil, entry)
	})
}
// onJobEvent answers a clone event with a clone of the job; any other event
// (including nil) is routed by the concrete job type to the normal-job or
// resource-job logic. Other job types are ignored.
func (h *ReadyToAdjustHandler) onJobEvent(evt event.Event, job *readyToAdjustJob) {
	if cloneEvt, isClone := evt.(*event.CloneJob); isClone {
		cloneEvt.Callback.SetValue(job.job.Clone())
		return
	}

	switch concrete := job.job.(type) {
	case *jobmod.NormalJob:
		h.onNormalJobEvent(evt, job, concrete)
	case *jobmod.ResourceJob:
		h.onResourceJobEvent(evt, job, concrete)
	}
}
// onNormalJobEvent decides whether a normal job may leave the ready-to-adjust
// state.
//
// A job whose dataset is not produced by another job moves on immediately.
// Otherwise the producing job is looked up in the job set: if it succeeded,
// its resource package becomes this job's dataset and the job moves on; if it
// failed, this job fails too; in any other state the job keeps waiting for
// the next event. The event itself is not inspected.
func (h *ReadyToAdjustHandler) onNormalJobEvent(evt event.Event, job *readyToAdjustJob, norJob *jobmod.NormalJob) {
	// fail moves the job to the failed state with the given reason.
	fail := func(reason string) {
		h.changeJobState(job.job, jobmod.NewStateFailed(reason, job.state))
	}

	h.mgr.pubLock.Lock()
	jobSet, ok := h.mgr.jobSets[job.job.GetJobSetID()]
	h.mgr.pubLock.Unlock()
	if !ok {
		fail(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()))
		return
	}

	resFile, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo)
	if !ok {
		// The dataset does not depend on another job: nothing to wait for.
		h.changeJobState(job.job, jobmod.NewStateMakingAdjustScheme())
		return
	}

	// Whatever the event, check the status of the prerequisite job.
	ref := jobSet.FindRefByLocalJobID(resFile.ResourceLocalJobID)
	if ref == nil {
		fail(fmt.Sprintf("job %s not found in job set %s", resFile.ResourceLocalJobID, jobSet.JobSetID))
		return
	}

	h.mgr.pubLock.Lock()
	waitJob := h.mgr.jobs[ref.JobID]
	h.mgr.pubLock.Unlock()
	if waitJob == nil {
		fail(fmt.Sprintf("job %s not found", ref.JobID))
		return
	}

	switch waitJob.Job.GetState().(type) {
	case *jobmod.StateSuccess:
		waitResJob, ok := waitJob.Job.(*jobmod.ResourceJob)
		if !ok {
			// waitResJob is nil on this path, so the ID and the type must be
			// taken from the job itself rather than from waitResJob.
			fail(fmt.Sprintf("job(%v) %s is not a resource job", reflect.TypeOf(waitJob.Job), waitJob.Job.GetJobID()))
			return
		}

		norJob.Files.Dataset.PackageID = waitResJob.ResourcePackageID
		h.changeJobState(job.job, jobmod.NewStateMakingAdjustScheme())

	case *jobmod.StateFailed:
		fail(fmt.Sprintf("job %s is failed", waitJob.Job.GetJobID()))

	default:
		// The awaited job has neither failed nor succeeded yet: keep waiting.
	}
}
// onResourceJobEvent decides whether a resource job may leave the
// ready-to-adjust state. It looks up the job named by the resource job's
// TargetLocalJobID: if that job failed, this job fails too; if it succeeded,
// this job becomes ready to execute; otherwise it keeps waiting for the next
// event. The event itself is not inspected.
func (h *ReadyToAdjustHandler) onResourceJobEvent(evt event.Event, job *readyToAdjustJob, resJob *jobmod.ResourceJob) {
	// fail moves the job to the failed state with the given reason.
	fail := func(reason string) {
		h.changeJobState(job.job, jobmod.NewStateFailed(reason, job.state))
	}

	h.mgr.pubLock.Lock()
	jobSet, found := h.mgr.jobSets[job.job.GetJobSetID()]
	h.mgr.pubLock.Unlock()
	if !found {
		fail(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()))
		return
	}

	ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID)
	if ref == nil {
		fail(fmt.Sprintf("job %s not found in job set %s", resJob.Info.TargetLocalJobID, jobSet.JobSetID))
		return
	}

	h.mgr.pubLock.Lock()
	target := h.mgr.jobs[ref.JobID]
	h.mgr.pubLock.Unlock()
	if target == nil {
		fail(fmt.Sprintf("job %s not found", ref.JobID))
		return
	}

	// Whatever the event, check the status of the prerequisite job.
	if _, failed := target.Job.GetState().(*jobmod.StateFailed); failed {
		fail(fmt.Sprintf("job %s is failed", target.Job.GetJobID()))
		return
	}

	if _, succeeded := target.Job.GetState().(*jobmod.StateSuccess); succeeded {
		h.changeJobState(job.job, jobmod.NewStateReadyToExecute())
	}
}
// changeJobState sets the job's new state, drops the job from this handler's
// table and asks the manager, under its pubLock, to dispatch the job to the
// handler for the new state.
func (h *ReadyToAdjustHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)
	delete(h.jobs, job.GetJobID())

	mgr := h.mgr
	mgr.pubLock.Lock()
	mgr.handleState(job)
	mgr.pubLock.Unlock()
}
// OnEvent queues evt for the jobs selected by broadcast: every tracked job,
// the tracked jobs of one job set, or a single tracked job.
func (h *ReadyToAdjustHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		switch {
		case broadcast.ToAll():
			for _, tracked := range h.jobs {
				h.onJobEvent(evt, tracked)
			}

		case broadcast.ToJobSet():
			for _, tracked := range h.jobs {
				if tracked.job.GetJobSetID() == broadcast.JobSetID {
					h.onJobEvent(evt, tracked)
				}
			}

		case broadcast.ToJob():
			if tracked, found := h.jobs[broadcast.JobID]; found {
				h.onJobEvent(evt, tracked)
			}
		}
	})
}
// Serve receives the closures queued on the command channel and runs them one
// at a time. The loop has no exit condition.
func (h *ReadyToAdjustHandler) Serve() {
	commands := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		run := <-commands
		run()
	}
}
// Stop is meant to stop the handler; it currently does nothing.
func (h *ReadyToAdjustHandler) Stop() {
// TODO: support stopping
}

View File

@ -1,122 +0,0 @@
package jobmgr
import (
"fmt"
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/actor"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// readyToExecuteJob is the handler's bookkeeping entry for one job: the job
// (normal or resource) together with its StateReadyToExecute state.
type readyToExecuteJob struct {
job jobmod.Job
state *jobmod.StateReadyToExecute
}
// ReadyToExecuteHandler handles jobs whose state is StateReadyToExecute.
type ReadyToExecuteHandler struct {
mgr *Manager
// jobs holds the jobs currently tracked by this handler, keyed by job ID.
jobs map[schsdk.JobID]*readyToExecuteJob
// cmdChan carries the closures queued by Handle and OnEvent; Serve runs them.
cmdChan actor.CommandChannel
}
// NewReadyToExecuteHandler creates a handler bound to mgr with an empty job
// table and a fresh command channel.
func NewReadyToExecuteHandler(mgr *Manager) *ReadyToExecuteHandler {
	handler := new(ReadyToExecuteHandler)
	handler.mgr = mgr
	handler.jobs = make(map[schsdk.JobID]*readyToExecuteJob)
	handler.cmdChan = *actor.NewCommandChannel()
	return handler
}
// Handle queues the job on the handler's command channel. Once the queued
// closure runs, the job is registered with the handler and processed once
// with a nil event. A job whose state is not StateReadyToExecute is moved to
// the failed state instead.
func (h *ReadyToExecuteHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		ready, isReady := job.GetState().(*jobmod.StateReadyToExecute)
		if !isReady {
			reason := fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState()))
			h.changeJobState(job, jobmod.NewStateFailed(reason, job.GetState()))
			return
		}

		entry := &readyToExecuteJob{job: job, state: ready}
		h.jobs[job.GetJobID()] = entry
		h.onJobEvent(nil, entry)
	})
}
// onJobEvent answers a clone event with a clone of the job; any other event
// (including nil) is routed by the concrete job type to the normal-job or
// resource-job logic. Other job types are ignored.
func (h *ReadyToExecuteHandler) onJobEvent(evt event.Event, job *readyToExecuteJob) {
	if cloneEvt, isClone := evt.(*event.CloneJob); isClone {
		cloneEvt.Callback.SetValue(job.job.Clone())
		return
	}

	switch concrete := job.job.(type) {
	case *jobmod.NormalJob:
		h.onNormalJobEvent(evt, job, concrete)
	case *jobmod.ResourceJob:
		h.onResourceJobEvent(evt, job, concrete)
	}
}
// onNormalJobEvent moves a normal job straight to the executing state,
// whatever the event.
func (h *ReadyToExecuteHandler) onNormalJobEvent(evt event.Event, job *readyToExecuteJob, norJob *jobmod.NormalJob) {
	// TODO: for now execution is started immediately
	next := jobmod.NewStateExecuting()
	h.changeJobState(job.job, next)
}
// onResourceJobEvent moves a resource job straight to the executing state,
// whatever the event.
func (h *ReadyToExecuteHandler) onResourceJobEvent(evt event.Event, job *readyToExecuteJob, resJob *jobmod.ResourceJob) {
	// TODO: for now execution is started immediately
	next := jobmod.NewStateExecuting()
	h.changeJobState(job.job, next)
}
// changeJobState sets the job's new state, drops the job from this handler's
// table and asks the manager, under its pubLock, to dispatch the job to the
// handler for the new state.
func (h *ReadyToExecuteHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)
	delete(h.jobs, job.GetJobID())

	mgr := h.mgr
	mgr.pubLock.Lock()
	mgr.handleState(job)
	mgr.pubLock.Unlock()
}
// OnEvent queues evt for the jobs selected by broadcast: every tracked job,
// the tracked jobs of one job set, or a single tracked job.
func (h *ReadyToExecuteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		switch {
		case broadcast.ToAll():
			for _, tracked := range h.jobs {
				h.onJobEvent(evt, tracked)
			}

		case broadcast.ToJobSet():
			for _, tracked := range h.jobs {
				if tracked.job.GetJobSetID() == broadcast.JobSetID {
					h.onJobEvent(evt, tracked)
				}
			}

		case broadcast.ToJob():
			if tracked, found := h.jobs[broadcast.JobID]; found {
				h.onJobEvent(evt, tracked)
			}
		}
	})
}
// Serve receives the closures queued on the command channel and runs them one
// at a time. The loop has no exit condition.
func (h *ReadyToExecuteHandler) Serve() {
	commands := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		run := <-commands
		run()
	}
}
// Stop is meant to stop the handler; it currently does nothing.
func (h *ReadyToExecuteHandler) Stop() {
// TODO: support stopping
}

View File

@ -1,17 +0,0 @@
package jobmgr
import (
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// StateHandler is implemented by the per-state job handlers that the manager
// dispatches jobs and events to.
type StateHandler interface {
// Handle processes the job. The global lock is held for the duration of the call.
Handle(job jobmod.Job)
// OnEvent notifies the handler that an external event has occurred.
OnEvent(broadcast event.Broadcast, evt event.Event)
// Serve runs the handler.
Serve()
// Stop stops the handler.
Stop()
}

View File

@ -1,23 +1,50 @@
package mq package mq
import ( import (
"errors"
"fmt"
"gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/consts/errorcode"
"gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/pkgs/mq" "gitlink.org.cn/cloudream/common/pkgs/mq"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job/state"
) )
// 提交任务集 // 提交任务集
func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) { func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) {
logger.Debugf("submitting job") logger.Debugf("submitting job")
jobSet, err := svc.jobMgr.SubmitJobSet(msg.JobSet, msg.PreScheduleScheme) var jobs []jobmgr.SubmittingJob
if err != nil { for _, jobInfo := range msg.JobSet.Jobs {
logger.Warnf("submitting job set: %s", err.Error()) switch info := jobInfo.(type) {
return nil, mq.Failed(errorcode.OperationFailed, "submit job set failed") case *schsdk.NormalJobInfo:
job := job.NewNormalJob(*info)
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
if !ok {
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
}
jobs = append(jobs, jobmgr.SubmittingJob{
Body: job,
InitState: state.NewPreSchuduling(preSch),
})
case *schsdk.DataReturnJobInfo:
job := job.NewResourceJob(*info)
jobs = append(jobs, jobmgr.SubmittingJob{
Body: job,
InitState: state.NewWaitTargetComplete(),
})
}
} }
return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(jobSet.JobSetID)) return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs)))
} }
// 任务集中某个文件上传完成 // 任务集中某个文件上传完成
@ -26,16 +53,15 @@ func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded)
WithField("PackageID", msg.PackageID). WithField("PackageID", msg.PackageID).
Debugf("local file uploaded") Debugf("local file uploaded")
svc.jobMgr.LocalFileUploaded(msg.JobSetID, msg.LocalPath, msg.Error, msg.PackageID) svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, errors.New(msg.Error), msg.PackageID))
return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp()) return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp())
} }
func (svc *Service) GetJob(msg *mgrmq.GetJob) (*mgrmq.GetJobResp, *mq.CodeMessage) { func (svc *Service) GetJobSetStatus(msg *mgrmq.GetJobSetStatus) (*mgrmq.GetJobSetStatusResp, *mq.CodeMessage) {
job, err := svc.jobMgr.CloneJob(msg.JobID) jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)
if err != nil { if len(jobs) == 0 {
logger.WithField("JobID", msg.JobID).Warnf("cloning job: %s", err.Error()) return nil, mq.Failed(errorcode.OperationFailed, "job set not found")
return nil, mq.Failed(errorcode.OperationFailed, "get job failed")
} }
return mq.ReplyOK(mgrmq.NewGetJobResp(job)) return mq.ReplyOK(mgrmq.RespGetJobSetStatus(jobs))
} }