From 1e1c8dd6917b580840c79e45386ba69bae591da6 Mon Sep 17 00:00:00 2001 From: Sydonian <794346190@qq.com> Date: Fri, 26 Apr 2024 09:27:10 +0800 Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E6=9E=84manager=E6=A8=A1=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- advisor/internal/scheduler/scheduler.go | 14 +- advisor/internal/scheduler/service.go | 3 +- advisor/internal/services/task.go | 4 +- advisor/internal/task/schedule_scheme.go | 2 +- advisor/internal/task/task.go | 6 +- .../prescheduler/default_prescheduler.go | 4 +- common/models/job/job.go | 79 ++-- common/models/job/normal_job.go | 46 -- common/models/job/resource_job.go | 28 -- common/models/job/state.go | 183 -------- common/models/models.go | 4 +- .../mq/advisor/task/make_adjust_scheme.go | 9 +- common/pkgs/mq/advisor/task/task.go | 6 +- common/pkgs/mq/executor/task/task.go | 6 +- common/pkgs/mq/manager/job.go | 54 +-- executor/internal/services/task.go | 4 +- executor/internal/task/pcm_schedule_task.go | 2 +- executor/internal/task/task.go | 6 +- manager/internal/advisormgr/advisormgr.go | 85 ++-- manager/internal/executormgr/executormgr.go | 93 ++-- manager/internal/jobmgr/adjusting_handler.go | 371 --------------- manager/internal/jobmgr/complete_handler.go | 63 --- manager/internal/jobmgr/default_handler.go | 50 -- manager/internal/jobmgr/{event => }/event.go | 4 +- .../jobmgr/event/advisor_task_timeout.go | 12 - .../jobmgr/event/advisor_task_updated.go | 46 -- manager/internal/jobmgr/event/cancel.go | 4 + manager/internal/jobmgr/event/clone_job.go | 14 - .../jobmgr/event/executor_task_timeout.go | 12 - .../jobmgr/event/executor_task_updated.go | 48 -- .../internal/jobmgr/event/job_completed.go | 8 +- .../jobmgr/event/local_file_uploaded.go | 7 +- manager/internal/jobmgr/event/utils.go | 27 ++ manager/internal/jobmgr/event_set.go | 72 +++ manager/internal/jobmgr/executing_handler.go | 264 ----------- manager/internal/jobmgr/job.go | 88 ++++ .../internal/jobmgr/job/data_return_job.go | 30 ++ manager/internal/jobmgr/job/normal_job.go | 30 ++ .../internal/jobmgr/job/state/adjusting.go | 271 +++++++++++ manager/internal/jobmgr/job/state/complete.go | 48 ++ .../internal/jobmgr/job/state/executing.go | 154 ++++++ .../jobmgr/job/state/making_adjust_scheme.go | 61 +++ .../jobmgr/job/state/prescheduling.go | 251 ++++++++++ .../jobmgr/job/state/ready_to_adjust.go | 65 +++ .../jobmgr/job/state/ready_to_execute.go | 40 ++ .../jobmgr/job/state/wait_target_complete.go | 62 +++ manager/internal/jobmgr/job_state.go | 14 + manager/internal/jobmgr/jobmgr.go | 330 +++++-------- .../jobmgr/making_adjust_scheme_handler.go | 139 ------ .../internal/jobmgr/prescheduling_handler.go | 442 ------------------ .../jobmgr/ready_to_adjust_handler.go | 214 --------- .../jobmgr/ready_to_execute_handler.go | 122 ----- manager/internal/jobmgr/state_handler.go | 17 - manager/internal/mq/job.go | 50 +- 54 files changed, 1515 insertions(+), 2553 deletions(-) delete mode 100644 common/models/job/normal_job.go delete mode 100644 common/models/job/resource_job.go delete mode 100644 common/models/job/state.go delete mode 100644 manager/internal/jobmgr/adjusting_handler.go delete mode 100644 manager/internal/jobmgr/complete_handler.go delete mode 100644 manager/internal/jobmgr/default_handler.go rename manager/internal/jobmgr/{event => }/event.go (93%) delete mode 100644 manager/internal/jobmgr/event/advisor_task_timeout.go delete mode 100644 manager/internal/jobmgr/event/advisor_task_updated.go create mode 100644 manager/internal/jobmgr/event/cancel.go delete mode 100644 manager/internal/jobmgr/event/clone_job.go delete mode 100644 manager/internal/jobmgr/event/executor_task_timeout.go delete mode 100644 manager/internal/jobmgr/event/executor_task_updated.go create mode 100644 manager/internal/jobmgr/event/utils.go create mode 100644 manager/internal/jobmgr/event_set.go delete mode 100644 manager/internal/jobmgr/executing_handler.go create mode 100644 manager/internal/jobmgr/job.go create mode 100644 manager/internal/jobmgr/job/data_return_job.go create mode 100644 manager/internal/jobmgr/job/normal_job.go create mode 100644 manager/internal/jobmgr/job/state/adjusting.go create mode 100644 manager/internal/jobmgr/job/state/complete.go create mode 100644 manager/internal/jobmgr/job/state/executing.go create mode 100644 manager/internal/jobmgr/job/state/making_adjust_scheme.go create mode 100644 manager/internal/jobmgr/job/state/prescheduling.go create mode 100644 manager/internal/jobmgr/job/state/ready_to_adjust.go create mode 100644 manager/internal/jobmgr/job/state/ready_to_execute.go create mode 100644 manager/internal/jobmgr/job/state/wait_target_complete.go create mode 100644 manager/internal/jobmgr/job_state.go delete mode 100644 manager/internal/jobmgr/making_adjust_scheme_handler.go delete mode 100644 manager/internal/jobmgr/prescheduling_handler.go delete mode 100644 manager/internal/jobmgr/ready_to_adjust_handler.go delete mode 100644 manager/internal/jobmgr/ready_to_execute_handler.go delete mode 100644 manager/internal/jobmgr/state_handler.go diff --git a/advisor/internal/scheduler/scheduler.go b/advisor/internal/scheduler/scheduler.go index 2678d92..481e06a 100644 --- a/advisor/internal/scheduler/scheduler.go +++ b/advisor/internal/scheduler/scheduler.go @@ -38,7 +38,7 @@ const ( var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait") type Scheduler interface { - Schedule(info *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) + Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error) } type candidate struct { @@ -129,7 +129,7 @@ func NewDefaultSchedule() *DefaultScheduler { return &DefaultScheduler{} } -func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) { +func (s *DefaultScheduler) Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error) { mgrCli, err := schglb.ManagerMQPool.Acquire() if err != nil { return nil, fmt.Errorf("new collector client: %w", err) @@ -151,17 +151,17 @@ func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleS for _, cc := range allCC.ComputingCenters { allCCs[cc.CCID] = &candidate{ CC: cc, - IsPreScheduled: cc.CCID == job.TargetCCID, + IsPreScheduled: cc.CCID == status.TargetCCID, } } // 计算 - err = s.calcFileScore(job.Files, allCCs) + err = s.calcFileScore(status.Files, allCCs) if err != nil { return nil, err } - err = s.calcResourceScore(job, allCCs) + err = s.calcResourceScore(info, allCCs) if err != nil { return nil, err } @@ -204,9 +204,9 @@ func (s *DefaultScheduler) makeSchemeForNode(targetCC *candidate) jobmod.JobSche return scheme } -func (s *DefaultScheduler) calcResourceScore(job *jobmod.NormalJob, allCCs map[schsdk.CCID]*candidate) error { +func (s *DefaultScheduler) calcResourceScore(info *schsdk.NormalJobInfo, allCCs map[schsdk.CCID]*candidate) error { for _, cc := range allCCs { - res, err := s.calcOneResourceScore(job.Info.Resources, &cc.CC) + res, err := s.calcOneResourceScore(info.Resources, &cc.CC) if err != nil { return err } diff --git a/advisor/internal/scheduler/service.go b/advisor/internal/scheduler/service.go index 3c76d5b..ea6f905 100644 --- a/advisor/internal/scheduler/service.go +++ b/advisor/internal/scheduler/service.go @@ -8,6 +8,7 @@ import ( "github.com/samber/lo" "gitlink.org.cn/cloudream/common/pkgs/future" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" ) @@ -30,7 +31,7 @@ func NewService(scheduler Scheduler) *Service { } } -func (s *Service) MakeScheme(job jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) { +func (s *Service) MakeScheme(job schsdk.NormalJobInfo) (*jobmod.JobScheduleScheme, error) { s.lock.Lock() callback := future.NewSetValue[*jobmod.JobScheduleScheme]() s.jobs = append(s.jobs, &schedulingJob{ diff --git a/advisor/internal/services/task.go b/advisor/internal/services/task.go index d59a4c2..9feac33 100644 --- a/advisor/internal/services/task.go +++ b/advisor/internal/services/task.go @@ -4,7 +4,7 @@ import ( "gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/common/pkgs/mq" - "gitlink.org.cn/cloudream/common/utils/reflect" + "gitlink.org.cn/cloudream/common/utils/reflect2" myglbs "gitlink.org.cn/cloudream/scheduler/advisor/internal/globals" advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor" ) @@ -12,7 +12,7 @@ import ( func (svc *Service) StartTask(msg *advmq.StartTask) (*advmq.StartTaskResp, *mq.CodeMessage) { tsk, err := svc.taskManager.StartByInfo(msg.Info) if err != nil { - logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()). + logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()). Warnf("starting task by info: %s", err.Error()) return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed") } diff --git a/advisor/internal/task/schedule_scheme.go b/advisor/internal/task/schedule_scheme.go index d22088a..5e59b6a 100644 --- a/advisor/internal/task/schedule_scheme.go +++ b/advisor/internal/task/schedule_scheme.go @@ -39,7 +39,7 @@ func (t *MakeScheduleScheme) Execute(task *task.Task[TaskContext], ctx TaskConte } func (t *MakeScheduleScheme) do(taskID string, ctx TaskContext) (*jobmod.JobScheduleScheme, error) { - scheme, err := ctx.scheduleSvc.MakeScheme(t.Job) + scheme, err := ctx.scheduleSvc.MakeScheme(t.JobInfo) if err != nil { return nil, err } diff --git a/advisor/internal/task/task.go b/advisor/internal/task/task.go index 03feb0a..db1a6ce 100644 --- a/advisor/internal/task/task.go +++ b/advisor/internal/task/task.go @@ -5,7 +5,7 @@ import ( "reflect" "gitlink.org.cn/cloudream/common/pkgs/task" - myreflect "gitlink.org.cn/cloudream/common/utils/reflect" + "gitlink.org.cn/cloudream/common/utils/reflect2" reporter "gitlink.org.cn/cloudream/scheduler/advisor/internal/reporter" "gitlink.org.cn/cloudream/scheduler/advisor/internal/scheduler" advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task" @@ -40,7 +40,7 @@ func NewManager(reporter *reporter.Reporter, scheduleSvc *scheduler.Service) Man } func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) { - infoType := myreflect.TypeOfValue(info) + infoType := reflect2.TypeOfValue(info) ctor, ok := taskFromInfoCtors[infoType] if !ok { @@ -53,7 +53,7 @@ func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) { var taskFromInfoCtors map[reflect.Type]func(advtsk.TaskInfo) TaskBody = make(map[reflect.Type]func(advtsk.TaskInfo) task.TaskBody[TaskContext]) func Register[TInfo advtsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) { - taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody { + taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody { return ctor(info.(TInfo)) } } diff --git a/client/internal/prescheduler/default_prescheduler.go b/client/internal/prescheduler/default_prescheduler.go index 8d73152..65f0b76 100644 --- a/client/internal/prescheduler/default_prescheduler.go +++ b/client/internal/prescheduler/default_prescheduler.go @@ -171,7 +171,7 @@ func (s *DefaultPreScheduler) Schedule(info *schsdk.JobSetInfo) (*jobmod.JobSetP if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok { j.Afters = append(j.Afters, resFile.ResourceLocalJobID) } - } else if resJob, ok := job.(*schsdk.ResourceJobInfo); ok { + } else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok { j.Afters = append(j.Afters, resJob.TargetLocalJobID) } @@ -270,7 +270,7 @@ func (s *DefaultPreScheduler) scheduleForNormalJob(jobSet *schsdk.JobSetInfo, jo // 检查此节点是否是它所引用的任务所选的节点 for _, af := range job.Afters { - resJob := findJobInfo[*schsdk.ResourceJobInfo](jobSet.Jobs, af) + resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af) if resJob == nil { return nil, fmt.Errorf("resource job %s not found in the job set", af) } diff --git a/common/models/job/job.go b/common/models/job/job.go index 12bb888..33131bc 100644 --- a/common/models/job/job.go +++ b/common/models/job/job.go @@ -1,10 +1,8 @@ package jobmod import ( - "github.com/samber/lo" - "gitlink.org.cn/cloudream/common/pkgs/types" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - "gitlink.org.cn/cloudream/common/utils/serder" + cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" ) type FileScheduleAction string @@ -34,66 +32,41 @@ type JobSetPreScheduleScheme struct { JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID } -// 任务集 -type JobSet struct { - JobSetID schsdk.JobSetID `json:"jobSetID"` // 全局唯一的任务集ID - JobRefs []JobSetJobRef `json:"jobRefs"` // 任务集中包含的任务,只是一个引用 - PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"` -} -type JobSetJobRef struct { - JobID schsdk.JobID `json:"jobID"` // 任务ID - LocalJobID string `json:"localJobID"` // 在当前任务集内的任务ID +type JobFiles struct { + Dataset PackageJobFile `json:"dataset"` + Code PackageJobFile `json:"code"` + Image ImageJobFile `json:"image"` } -func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet { - return &JobSet{ - JobSetID: jobSetID, - JobRefs: jobRefs, - PreScheduleScheme: preScheduleScheme, - } +type PackageJobFile struct { + PackageID cdssdk.PackageID `json:"packageID"` + FullPath string `json:"fullPath"` // Load之后的完整文件路径 } -func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef { - ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID }) - if !ok { - return nil - } - - return &ref +type ImageJobFile struct { + PackageID *cdssdk.PackageID `json:"packageID"` + ImageID schsdk.ImageID `json:"imageID"` } -// 任务 -type Job interface { - GetJobSetID() schsdk.JobSetID - GetJobID() schsdk.JobID - GetState() JobState - SetState(state JobState) - Clone() Job +type JobStatus struct { + JobID schsdk.JobID `json:"jobID"` + JobSetID schsdk.JobSetID `json:"jobSetID"` + Info schsdk.JobInfo `json:"info"` + Body JobBodyStatus `json:"body"` + State JobStateStatus `json:"state"` } -var JobTypeUnion = types.NewTypeUnion[Job]( - (*NormalJob)(nil), - (*ResourceJob)(nil), -) -var _ = serder.UseTypeUnionExternallyTagged(&JobTypeUnion) - -// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobTypeUnion, "Type", "type") - -type JobBase struct { - JobSetID schsdk.JobSetID `json:"jobSetID"` // 任务集ID - JobID schsdk.JobID `json:"jobID"` // 全局唯一任务ID - State JobState `json:"state"` // 任务当前的状态。包含当前在状态下执行操作所需的数据 +type JobBodyStatus interface { } -func (j *JobBase) GetJobSetID() schsdk.JobSetID { - return j.JobSetID +type NormalJobStatus struct { + TargetCCID schsdk.CCID `json:"targetCCID"` + Files JobFiles `json:"files"` } -func (j *JobBase) GetJobID() schsdk.JobID { - return j.JobID + +type DataReturnJobStatus struct { + DataReturnPackageID cdssdk.PackageID `json:"dataReturnPackageID"` } -func (j *JobBase) GetState() JobState { - return j.State -} -func (j *JobBase) SetState(state JobState) { - j.State = state + +type JobStateStatus interface { } diff --git a/common/models/job/normal_job.go b/common/models/job/normal_job.go deleted file mode 100644 index bb1dea1..0000000 --- a/common/models/job/normal_job.go +++ /dev/null @@ -1,46 +0,0 @@ -package jobmod - -import ( - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" -) - -type NormalJob struct { - JobBase - Info schsdk.NormalJobInfo `json:"info"` // 提交任务时提供的任务描述信息 - Files JobFiles `json:"files"` // 任务需要的文件 - TargetCCID schsdk.CCID `json:"targetSlwNodeID"` // 将要运行此任务的算力中心ID - OutputFullPath string `json:"outputFullPath"` // 程序结果的完整输出路径 -} - -func NewNormalJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.NormalJobInfo) *NormalJob { - return &NormalJob{ - JobBase: JobBase{ - JobSetID: jobSetID, - JobID: jobID, - }, - Info: info, - } -} - -func (j *NormalJob) Clone() Job { - tmp := *j - tmp.State = tmp.State.Clone() - return &tmp -} - -type JobFiles struct { - Dataset PackageJobFile `json:"dataset"` - Code PackageJobFile `json:"code"` - Image ImageJobFile `json:"image"` -} - -type PackageJobFile struct { - PackageID cdssdk.PackageID `json:"packageID"` - FullPath string `json:"fullPath"` // Load之后的完整文件路径 -} - -type ImageJobFile struct { - PackageID *cdssdk.PackageID `json:"packageID"` - ImageID schsdk.ImageID `json:"imageID"` -} diff --git a/common/models/job/resource_job.go b/common/models/job/resource_job.go deleted file mode 100644 index 8cf02b5..0000000 --- a/common/models/job/resource_job.go +++ /dev/null @@ -1,28 +0,0 @@ -package jobmod - -import ( - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" -) - -type ResourceJob struct { - JobBase - Info schsdk.ResourceJobInfo `json:"info"` - ResourcePackageID cdssdk.PackageID `json:"resourcePackageID"` // 回源之后得到的PackageID -} - -func NewResourceJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.ResourceJobInfo) *ResourceJob { - return &ResourceJob{ - JobBase: JobBase{ - JobSetID: jobSetID, - JobID: jobID, - }, - Info: info, - } -} - -func (j *ResourceJob) Clone() Job { - tmp := *j - tmp.State = tmp.State.Clone() - return &tmp -} diff --git a/common/models/job/state.go b/common/models/job/state.go deleted file mode 100644 index 91f9365..0000000 --- a/common/models/job/state.go +++ /dev/null @@ -1,183 +0,0 @@ -package jobmod - -import ( - "gitlink.org.cn/cloudream/common/pkgs/types" - "gitlink.org.cn/cloudream/common/utils/serder" -) - -type JobState interface { - Clone() JobState -} -type JobStateBase struct{} - -var JobStateTypeUnion = types.NewTypeUnion[JobState]( - (*StatePreScheduling)(nil), - (*StateReadyToAdjust)(nil), - (*StateMakingAdjustScheme)(nil), - (*StateAdjusting)(nil), - (*StateReadyToExecute)(nil), - (*StateExecuting)(nil), - (*StateFailed)(nil), - (*StateSuccess)(nil), -) -var _ = serder.UseTypeUnionExternallyTagged(&JobStateTypeUnion) - -// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobStateTypeUnion, "Type", "type") - -type FileSchedulingStep string - -const ( - StepBegin FileSchedulingStep = "Begin" // 准备开始调度 - StepUploading FileSchedulingStep = "Uploading" // 正在等待文件上传 - StepUploaded FileSchedulingStep = "Uploaded" // 文件上传完成 - StepMoving FileSchedulingStep = "Moving" // 正在移动缓存 - StepLoading FileSchedulingStep = "Loading" // 正在加载 - StepImageImporting FileSchedulingStep = "ImageImporting" // 正在导入镜像 - StepCompleted FileSchedulingStep = "Completed" // 完成 -) - -type FileSchedulingState struct { - Step FileSchedulingStep `json:"step"` - Error string `json:"error"` - FullTaskID string `json:"fullTaskID"` -} - -type StatePreScheduling struct { - JobStateBase - Scheme JobScheduleScheme `json:"scheme"` - Dataset FileSchedulingState `json:"dataset"` - Code FileSchedulingState `json:"code"` - Image FileSchedulingState `json:"image"` -} - -func NewStatePreScheduling(scheme JobScheduleScheme) *StatePreScheduling { - return &StatePreScheduling{ - Scheme: scheme, - Dataset: FileSchedulingState{ - Step: StepBegin, - }, - Code: FileSchedulingState{ - Step: StepBegin, - }, - Image: FileSchedulingState{ - Step: StepBegin, - }, - } -} -func (s *StatePreScheduling) Clone() JobState { - tmp := *s - return &tmp -} - -type StateReadyToAdjust struct { - JobStateBase -} - -func NewStateReadyToAdjust() *StateReadyToAdjust { - return &StateReadyToAdjust{} -} - -func (s *StateReadyToAdjust) Clone() JobState { - tmp := *s - return &tmp -} - -type StateMakingAdjustScheme struct { - JobStateBase - FullTaskID string `json:"fullTaskID"` -} - -func NewStateMakingAdjustScheme() *StateMakingAdjustScheme { - return &StateMakingAdjustScheme{} -} - -func (s *StateMakingAdjustScheme) Clone() JobState { - tmp := *s - return &tmp -} - -type StateAdjusting struct { - JobStateBase - Scheme JobScheduleScheme `json:"scheme"` - Dataset FileSchedulingState `json:"dataset"` - Code FileSchedulingState `json:"code"` - Image FileSchedulingState `json:"image"` -} - -func NewStateAdjusting(scheme JobScheduleScheme) *StateAdjusting { - return &StateAdjusting{ - Scheme: scheme, - Dataset: FileSchedulingState{ - Step: StepBegin, - }, - Code: FileSchedulingState{ - Step: StepBegin, - }, - Image: FileSchedulingState{ - Step: StepBegin, - }, - } -} - -func (s *StateAdjusting) Clone() JobState { - tmp := *s - return &tmp -} - -type StateReadyToExecute struct { - JobStateBase -} - -func NewStateReadyToExecute() *StateReadyToExecute { - return &StateReadyToExecute{} -} - -func (s *StateReadyToExecute) Clone() JobState { - tmp := *s - return &tmp -} - -type StateExecuting struct { - JobStateBase - FullTaskID string `json:"fullTaskID"` -} - -func NewStateExecuting() *StateExecuting { - return &StateExecuting{} -} - -func (s *StateExecuting) Clone() JobState { - tmp := *s - return &tmp -} - -type StateFailed struct { - JobStateBase - Error string `json:"error"` - LastState JobState `json:"lastState"` -} - -func NewStateFailed(err string, lastState JobState) *StateFailed { - return &StateFailed{ - Error: err, - LastState: lastState, - } -} - -func (s *StateFailed) Clone() JobState { - tmp := *s - return &tmp -} - -type StateSuccess struct { - JobStateBase -} - -func NewStateSuccess() *StateSuccess { - return &StateSuccess{} -} - -func (s *StateSuccess) Clone() JobState { - tmp := *s - return &tmp -} diff --git a/common/models/models.go b/common/models/models.go index 14b8435..5fd3fa6 100644 --- a/common/models/models.go +++ b/common/models/models.go @@ -8,7 +8,7 @@ import ( schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops" - myreflect "gitlink.org.cn/cloudream/common/utils/reflect" + "gitlink.org.cn/cloudream/common/utils/reflect2" "gitlink.org.cn/cloudream/common/utils/serder" ) @@ -78,7 +78,7 @@ type CCResourceInfo struct { func (i *CCResourceInfo) Scan(src interface{}) error { data, ok := src.([]uint8) if !ok { - return fmt.Errorf("unknow src type: %v", myreflect.TypeOfValue(data).String()) + return fmt.Errorf("unknow src type: %v", reflect2.TypeOfValue(data).String()) } return serder.JSONToObject(data, i) diff --git a/common/pkgs/mq/advisor/task/make_adjust_scheme.go b/common/pkgs/mq/advisor/task/make_adjust_scheme.go index 1d073d4..60b639c 100644 --- a/common/pkgs/mq/advisor/task/make_adjust_scheme.go +++ b/common/pkgs/mq/advisor/task/make_adjust_scheme.go @@ -1,17 +1,20 @@ package task import ( + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" ) type MakeAdjustScheme struct { TaskInfoBase - Job jobmod.NormalJob `json:"job"` + JobInfo schsdk.NormalJobInfo `json:"jobInfo"` + JobStatus jobmod.NormalJobStatus `json:"jobStatus"` } -func NewMakeAdjustScheme(job jobmod.NormalJob) *MakeAdjustScheme { +func NewMakeAdjustScheme(jobInfo schsdk.NormalJobInfo, jobStatus jobmod.NormalJobStatus) *MakeAdjustScheme { return &MakeAdjustScheme{ - Job: job, + JobInfo: jobInfo, + JobStatus: jobStatus, } } diff --git a/common/pkgs/mq/advisor/task/task.go b/common/pkgs/mq/advisor/task/task.go index 0e1b289..6167387 100644 --- a/common/pkgs/mq/advisor/task/task.go +++ b/common/pkgs/mq/advisor/task/task.go @@ -2,7 +2,7 @@ package task import ( "gitlink.org.cn/cloudream/common/pkgs/types" - myreflect "gitlink.org.cn/cloudream/common/utils/reflect" + "gitlink.org.cn/cloudream/common/utils/reflect2" "gitlink.org.cn/cloudream/common/utils/serder" ) @@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {} // 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行 func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any { - TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]()) + TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]()) - TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]()) + TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]()) return nil } diff --git a/common/pkgs/mq/executor/task/task.go b/common/pkgs/mq/executor/task/task.go index 73045b5..db2e1b6 100644 --- a/common/pkgs/mq/executor/task/task.go +++ b/common/pkgs/mq/executor/task/task.go @@ -2,7 +2,7 @@ package task import ( "gitlink.org.cn/cloudream/common/pkgs/types" - myreflect "gitlink.org.cn/cloudream/common/utils/reflect" + "gitlink.org.cn/cloudream/common/utils/reflect2" "gitlink.org.cn/cloudream/common/utils/serder" ) @@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {} // 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行 func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any { - TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]()) + TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]()) - TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]()) + TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]()) return nil } diff --git a/common/pkgs/mq/manager/job.go b/common/pkgs/mq/manager/job.go index 0fed86d..a07a03a 100644 --- a/common/pkgs/mq/manager/job.go +++ b/common/pkgs/mq/manager/job.go @@ -12,9 +12,7 @@ type JobService interface { JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded) (*JobSetLocalFileUploadedResp, *mq.CodeMessage) - GetJob(msg *GetJob) (*GetJobResp, *mq.CodeMessage) - - // GetJobSetJobs(msg *GetJobSetJobs) (*GetJobSetJobsResp, *mq.CodeMessage) + GetJobSetStatus(msg *GetJobSetStatus) (*GetJobSetStatusResp, *mq.CodeMessage) } // 提交任务集 @@ -74,52 +72,28 @@ func (c *Client) JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded, opts ...m return mq.Request(Service.JobSetLocalFileUploaded, c.roundTripper, msg, opts...) } -// 获取任务数据 -type GetJob struct { +var _ = Register(Service.GetJobSetStatus) + +// 获取任务集的状态 +type GetJobSetStatus struct { mq.MessageBodyBase - JobID schsdk.JobID `json:"jobID"` + JobSetID schsdk.JobSetID `json:"jobSetID"` } -type GetJobResp struct { +type GetJobSetStatusResp struct { mq.MessageBodyBase - Job jobmod.Job `json:"job"` + Jobs []jobmod.JobStatus `json:"jobs"` } -func NewGetJob(jobID schsdk.JobID) *GetJob { - return &GetJob{ - JobID: jobID, - } -} -func NewGetJobResp(job jobmod.Job) *GetJobResp { - return &GetJobResp{ - Job: job, - } -} -func (c *Client) GetJob(msg *GetJob, opts ...mq.RequestOption) (*GetJobResp, error) { - return mq.Request(Service.GetJob, c.roundTripper, msg, opts...) -} - -/* -// 获取指定任务集中的所有任务数据 -type GetJobSetJobs struct { - mq.MessageBodyBase - JobSetID string `json:"jobSetID"` -} -type GetJobSetJobsResp struct { - mq.MessageBodyBase - Jobs []jobmod.Job `json:"jobs"` -} - -func NewGetJobSetJobs(jobSetID string) *GetJobSetJobs { - return &GetJobSetJobs{ +func ReqGetJobSetStatus(jobSetID schsdk.JobSetID) *GetJobSetStatus { + return &GetJobSetStatus{ JobSetID: jobSetID, } } -func NewGetJobSetJobsResp(jobs []jobmod.Job) *GetJobSetJobsResp { - return &GetJobSetJobsResp{ +func RespGetJobSetStatus(jobs []jobmod.JobStatus) *GetJobSetStatusResp { + return &GetJobSetStatusResp{ Jobs: jobs, } } -func (c *Client) GetJobSetJobs(msg *GetJobSetJobs, opts ...mq.RequestOption) (*GetJobSetJobsResp, error) { - return mq.Request(Service.GetJobSetJobs, c.rabbitCli, msg, opts...) +func (c *Client) GetJob(msg *GetJobSetStatus, opts ...mq.RequestOption) (*GetJobSetStatusResp, error) { + return mq.Request(Service.GetJobSetStatus, c.roundTripper, msg, opts...) } -*/ diff --git a/executor/internal/services/task.go b/executor/internal/services/task.go index 02657ed..32de7e1 100644 --- a/executor/internal/services/task.go +++ b/executor/internal/services/task.go @@ -4,7 +4,7 @@ import ( "gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/common/pkgs/mq" - "gitlink.org.cn/cloudream/common/utils/reflect" + "gitlink.org.cn/cloudream/common/utils/reflect2" execmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor" myglbs "gitlink.org.cn/cloudream/scheduler/executor/internal/globals" ) @@ -12,7 +12,7 @@ import ( func (svc *Service) StartTask(msg *execmq.StartTask) (*execmq.StartTaskResp, *mq.CodeMessage) { tsk, err := svc.taskManager.StartByInfo(msg.Info) if err != nil { - logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()). + logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()). Warnf("starting task by info: %s", err.Error()) return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed") } diff --git a/executor/internal/task/pcm_schedule_task.go b/executor/internal/task/pcm_schedule_task.go index 73f55f0..6f4f4e7 100644 --- a/executor/internal/task/pcm_schedule_task.go +++ b/executor/internal/task/pcm_schedule_task.go @@ -81,7 +81,7 @@ func (t *PCMSubmitTask) do(taskID string, ctx TaskContext) error { return nil } - if tsResp.TaskStatus == pcmsdk.TaskStatuFailed { + if tsResp.TaskStatus == pcmsdk.TaskStatusFailed { // TODO 返回更详细的信息 return fmt.Errorf("task failed") } diff --git a/executor/internal/task/task.go b/executor/internal/task/task.go index 63bbf29..664cc46 100644 --- a/executor/internal/task/task.go +++ b/executor/internal/task/task.go @@ -5,7 +5,7 @@ import ( "reflect" "gitlink.org.cn/cloudream/common/pkgs/task" - myreflect "gitlink.org.cn/cloudream/common/utils/reflect" + "gitlink.org.cn/cloudream/common/utils/reflect2" exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" reporter "gitlink.org.cn/cloudream/scheduler/executor/internal/reporter" ) @@ -37,7 +37,7 @@ func NewManager(reporter *reporter.Reporter) Manager { } func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) { - infoType := myreflect.TypeOfValue(info) + infoType := reflect2.TypeOfValue(info) ctor, ok := taskFromInfoCtors[infoType] if !ok { @@ -50,7 +50,7 @@ func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) { var taskFromInfoCtors map[reflect.Type]func(exectsk.TaskInfo) TaskBody = make(map[reflect.Type]func(exectsk.TaskInfo) task.TaskBody[TaskContext]) func Register[TInfo exectsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) { - taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody { + taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody { return ctor(info.(TInfo)) } } diff --git a/manager/internal/advisormgr/advisormgr.go b/manager/internal/advisormgr/advisormgr.go index 0ba7231..09512a8 100644 --- a/manager/internal/advisormgr/advisormgr.go +++ b/manager/internal/advisormgr/advisormgr.go @@ -5,7 +5,7 @@ import ( "sync" "time" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + "gitlink.org.cn/cloudream/common/utils/sync2" schglb "gitlink.org.cn/cloudream/scheduler/common/globals" schmod "gitlink.org.cn/cloudream/scheduler/common/models" advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor" @@ -13,29 +13,23 @@ import ( mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" ) -type jobTask struct { - JobID schsdk.JobID - TaskID string - FullTaskID string +type task struct { + statusChan *sync2.Channel[advtsk.TaskStatus] } type AdvisorInfo struct { advisorID schmod.AdvisorID - jobTasks map[string]jobTask // key 为 TaskID + tasks map[string]task // key 为 TaskID lastReportTime time.Time } -type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) -type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string) +var ErrWaitReportTimeout = fmt.Errorf("wait report timeout") type Manager struct { advisors map[schmod.AdvisorID]*AdvisorInfo lock sync.Mutex advCli *advmq.Client - onTaskUpdated OnTaskUpdatedCallbackFn - onTaskTimeout OnTimeoutCallbackFn - reportTimeout time.Duration } @@ -52,83 +46,66 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) { }, nil } -func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) { - m.onTaskUpdated = callback -} - -func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) { - m.onTaskTimeout = callback -} - func (m *Manager) Report(advID schmod.AdvisorID, taskStatus []mgrmq.AdvisorTaskStatus) { m.lock.Lock() defer m.lock.Unlock() - info, ok := m.advisors[advID] + adv, ok := m.advisors[advID] if !ok { - info = &AdvisorInfo{ + adv = &AdvisorInfo{ advisorID: advID, - jobTasks: make(map[string]jobTask), + tasks: make(map[string]task), } - m.advisors[advID] = info + m.advisors[advID] = adv } - info.lastReportTime = time.Now() + adv.lastReportTime = time.Now() for _, s := range taskStatus { - tsk, ok := info.jobTasks[s.TaskID] + tsk, ok := adv.tasks[s.TaskID] if !ok { continue } - m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status) + // TODO 考虑主动检测channel是否关闭,然后取消task + if tsk.statusChan.Send(s.Status) != nil { + delete(adv.tasks, s.TaskID) + + if len(adv.tasks) == 0 { + delete(m.advisors, advID) + } + } } } -// 启动一个Task,并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID -func (m *Manager) StartTask(jobID schsdk.JobID, info advtsk.TaskInfo) (string, error) { +// 启动一个Task +func (m *Manager) StartTask(info advtsk.TaskInfo) *sync2.Channel[advtsk.TaskStatus] { m.lock.Lock() defer m.lock.Unlock() + ch := sync2.NewChannel[advtsk.TaskStatus]() + resp, err := m.advCli.StartTask(advmq.NewStartTask(info)) if err != nil { - return "", err + ch.CloseWithError(fmt.Errorf("start task: %w", err)) + return ch } - fullTaskID := fmt.Sprintf("%s-%s", resp.AdvisorID, resp.TaskID) - exeInfo, ok := m.advisors[resp.AdvisorID] if !ok { exeInfo = &AdvisorInfo{ advisorID: resp.AdvisorID, - jobTasks: make(map[string]jobTask), + tasks: make(map[string]task), lastReportTime: time.Now(), } m.advisors[resp.AdvisorID] = exeInfo } - exeInfo.jobTasks[resp.TaskID] = jobTask{ - JobID: jobID, - TaskID: resp.TaskID, - FullTaskID: fullTaskID, + exeInfo.tasks[resp.TaskID] = task{ + statusChan: ch, } - return fullTaskID, nil -} - -// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新 -func (m *Manager) ForgetTask(fullTaskID string) { - m.lock.Lock() - defer m.lock.Unlock() - - for _, exe := range m.advisors { - for _, tsk := range exe.jobTasks { - if tsk.FullTaskID == fullTaskID { - delete(exe.jobTasks, fullTaskID) - return - } - } - } + return ch } func (m *Manager) Serve() error { @@ -150,8 +127,8 @@ func (m *Manager) Serve() error { continue } - for _, tsk := range exeInfo.jobTasks { - m.onTaskTimeout(tsk.JobID, tsk.FullTaskID) + for _, tsk := range exeInfo.tasks { + tsk.statusChan.CloseWithError(ErrWaitReportTimeout) } delete(m.advisors, exeID) diff --git a/manager/internal/executormgr/executormgr.go b/manager/internal/executormgr/executormgr.go index 33a5edb..b09e4b9 100644 --- a/manager/internal/executormgr/executormgr.go +++ b/manager/internal/executormgr/executormgr.go @@ -5,7 +5,7 @@ import ( "sync" "time" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + "gitlink.org.cn/cloudream/common/utils/sync2" schglb "gitlink.org.cn/cloudream/scheduler/common/globals" schmod "gitlink.org.cn/cloudream/scheduler/common/models" @@ -14,29 +14,22 @@ import ( mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" ) -type jobTask struct { - JobID schsdk.JobID - TaskID string - FullTaskID string +type task struct { + statusChan *sync2.Channel[exetsk.TaskStatus] } - -type ExecutorInfo struct { +type ExecutorStatus struct { executorID schmod.ExecutorID - jobTasks map[string]jobTask // key 为 TaskID + tasks map[string]task // key 为 TaskID lastReportTime time.Time } -type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus exetsk.TaskStatus) -type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string) +var ErrWaitReportTimeout = fmt.Errorf("wait report timeout") type Manager struct { - executors map[schmod.ExecutorID]*ExecutorInfo + executors map[schmod.ExecutorID]*ExecutorStatus lock sync.Mutex exeCli *exemq.Client - onTaskUpdated OnTaskUpdatedCallbackFn - onTaskTimeout OnTimeoutCallbackFn - reportTimeout time.Duration } @@ -47,89 +40,71 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) { } return &Manager{ - executors: make(map[schmod.ExecutorID]*ExecutorInfo), + executors: make(map[schmod.ExecutorID]*ExecutorStatus), exeCli: exeCli, reportTimeout: reportTimeout, }, nil } -func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) { - m.onTaskUpdated = callback -} - -func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) { - m.onTaskTimeout = callback -} - func (m *Manager) Report(execID schmod.ExecutorID, taskStatus []mgrmq.ExecutorTaskStatus) { m.lock.Lock() defer m.lock.Unlock() - info, ok := m.executors[execID] + exec, ok := m.executors[execID] if !ok { - info = &ExecutorInfo{ + exec = &ExecutorStatus{ executorID: execID, - jobTasks: make(map[string]jobTask), + tasks: make(map[string]task), } - m.executors[execID] = info + m.executors[execID] = exec } - info.lastReportTime = time.Now() + exec.lastReportTime = time.Now() for _, s := range taskStatus { - tsk, ok := info.jobTasks[s.TaskID] + tsk, ok := exec.tasks[s.TaskID] if !ok { continue } - m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status) + // TODO 考虑主动检测channel是否关闭,然后取消task + if tsk.statusChan.Send(s.Status) != nil { + delete(exec.tasks, s.TaskID) + + if len(exec.tasks) == 0 { + delete(m.executors, execID) + } + } } } -// 启动一个Task,并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID -func (m *Manager) StartTask(jobID schsdk.JobID, info exetsk.TaskInfo) (string, error) { +// 启动一个Task +func (m *Manager) StartTask(info exetsk.TaskInfo) *sync2.Channel[exetsk.TaskStatus] { m.lock.Lock() defer m.lock.Unlock() + ch := sync2.NewChannel[exetsk.TaskStatus]() resp, err := m.exeCli.StartTask(exemq.NewStartTask(info)) if err != nil { - return "", err + ch.CloseWithError(fmt.Errorf("start task: %w", err)) + return ch } - fullTaskID := fmt.Sprintf("%s-%s", resp.ExecutorID, resp.TaskID) - exeInfo, ok := m.executors[resp.ExecutorID] if !ok { - exeInfo = &ExecutorInfo{ + exeInfo = &ExecutorStatus{ executorID: resp.ExecutorID, - jobTasks: make(map[string]jobTask), + tasks: make(map[string]task), lastReportTime: time.Now(), } m.executors[resp.ExecutorID] = exeInfo } - exeInfo.jobTasks[resp.TaskID] = jobTask{ - JobID: jobID, - TaskID: resp.TaskID, - FullTaskID: fullTaskID, + exeInfo.tasks[resp.TaskID] = task{ + statusChan: ch, } - return fullTaskID, nil -} - -// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新 -func (m *Manager) ForgetTask(fullTaskID string) { - m.lock.Lock() - defer m.lock.Unlock() - - for _, exe := range m.executors { - for _, tsk := range exe.jobTasks { - if tsk.FullTaskID == fullTaskID { - delete(exe.jobTasks, fullTaskID) - return - } - } - } + return ch } func (m *Manager) Serve() error { @@ -151,8 +126,8 @@ func (m *Manager) Serve() error { continue } - for _, tsk := range exeInfo.jobTasks { - m.onTaskTimeout(tsk.JobID, tsk.FullTaskID) + for _, tsk := range exeInfo.tasks { + tsk.statusChan.CloseWithError(ErrWaitReportTimeout) } delete(m.executors, exeID) diff --git a/manager/internal/jobmgr/adjusting_handler.go b/manager/internal/jobmgr/adjusting_handler.go deleted file mode 100644 index 3219a4a..0000000 --- a/manager/internal/jobmgr/adjusting_handler.go +++ /dev/null @@ -1,371 +0,0 @@ -package jobmgr - -import ( - "fmt" - "reflect" - "time" - - "gitlink.org.cn/cloudream/common/pkgs/actor" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" - - schglb "gitlink.org.cn/cloudream/scheduler/common/globals" - schmod "gitlink.org.cn/cloudream/scheduler/common/models" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" - "gitlink.org.cn/cloudream/scheduler/common/utils" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type adjustingJob struct { - job *jobmod.NormalJob - state *jobmod.StateAdjusting - ccInfo schmod.ComputingCenter -} - -type AdjustingHandler struct { - mgr *Manager - - jobs map[schsdk.JobID]*adjustingJob - - cmdChan actor.CommandChannel -} - -func NewAdjustingHandler(mgr *Manager) *AdjustingHandler { - return &AdjustingHandler{ - mgr: mgr, - jobs: make(map[schsdk.JobID]*adjustingJob), - cmdChan: *actor.NewCommandChannel(), - } -} - -func (h *AdjustingHandler) Handle(job jobmod.Job) { - h.cmdChan.Send(func() { - norJob, ok := job.(*jobmod.NormalJob) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState())) - return - } - - adjustingState, ok := norJob.GetState().(*jobmod.StateAdjusting) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())) - return - } - - colCli, err := schglb.CollectorMQPool.Acquire() - if err != nil { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.GetState())) - return - } - defer schglb.CollectorMQPool.Release(colCli) - - ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), adjustingState.Scheme.TargetCCID) - if err != nil { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.GetState())) - return - } - - stgCli, err := schglb.CloudreamStoragePool.Acquire() - if err != nil { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new cloudream storage client: %s", err.Error()), job.GetState())) - return - } - defer schglb.CloudreamStoragePool.Release(stgCli) - - stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{ - StorageID: ccInfo.CDSStorageID, - }) - if err != nil { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting cloudream storage info: %s", err.Error()), job.GetState())) - return - } - - norJob.TargetCCID = adjustingState.Scheme.TargetCCID - // TODO UserID - norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, norJob.JobID) - - adjJob := &adjustingJob{ - job: norJob, - state: adjustingState, - ccInfo: ccInfo, - } - h.jobs[job.GetJobID()] = adjJob - - h.onJobEvent(nil, adjJob) - }) -} - -func (h *AdjustingHandler) onJobEvent(evt event.Event, job *adjustingJob) { - if cloneEvt, ok := evt.(*event.CloneJob); ok { - cloneEvt.Callback.SetValue(job.job.Clone()) - return - } - - err := h.doPackageScheduling(evt, job, - job.job.Info.Files.Dataset, &job.job.Files.Dataset, - &job.state.Scheme.Dataset, &job.state.Dataset, - ) - if err != nil { - job.state.Dataset.Error = err.Error() - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - err = h.doPackageScheduling(evt, job, - job.job.Info.Files.Code, &job.job.Files.Code, - &job.state.Scheme.Code, &job.state.Code, - ) - if err != nil { - job.state.Code.Error = err.Error() - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - err = h.doImageScheduling(evt, job, - job.job.Info.Files.Image, &job.job.Files.Image, - &job.state.Scheme.Image, &job.state.Image, - ) - if err != nil { - job.state.Image.Error = err.Error() - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - // 如果三种文件都调度完成,则可以进入下个阶段了 - if job.state.Dataset.Step == jobmod.StepCompleted && - job.state.Code.Step == jobmod.StepCompleted && - job.state.Image.Step == jobmod.StepCompleted { - - h.changeJobState(job.job, jobmod.NewStateReadyToExecute()) - } -} - -func (h *AdjustingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) { - job.SetState(state) - - delete(h.jobs, job.GetJobID()) - - h.mgr.pubLock.Lock() - h.mgr.handleState(job) - h.mgr.pubLock.Unlock() -} - -func (h *AdjustingHandler) doPackageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error { - if state.Step == jobmod.StepBegin { - state.Step = jobmod.StepUploaded - } - - if state.Step == jobmod.StepUploaded { - if scheme.Action == jobmod.ActionNo { - state.Step = jobmod.StepCompleted - return nil - } - - if scheme.Action == jobmod.ActionMove { - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID)) - if err != nil { - return fmt.Errorf("starting cache move package: %w", err) - } - - state.Step = jobmod.StepMoving - state.FullTaskID = fullTaskID - return nil - - } - - if scheme.Action == jobmod.ActionLoad { - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID)) - if err != nil { - return fmt.Errorf("starting stroage load package: %w", err) - } - - state.Step = jobmod.StepLoading - state.FullTaskID = fullTaskID - return nil - } - - return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo)) - } - - if state.Step == jobmod.StepMoving { - moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("cache move package timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if moveRet.Error != "" { - return fmt.Errorf("cache move pacakge: %s", moveRet.Error) - } - - state.Step = jobmod.StepCompleted - return nil - } - - if state.Step == jobmod.StepLoading { - loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("storage load package timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if loadRet.Error != "" { - return fmt.Errorf("storage load package: %s", loadRet.Error) - } - - file.FullPath = loadRet.FullPath - - state.Step = jobmod.StepCompleted - return nil - } - return nil -} - -func (h *AdjustingHandler) doImageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error { - if state.Step == jobmod.StepBegin { - state.Step = jobmod.StepUploaded - } - - if state.Step == jobmod.StepUploaded { - if scheme.Action == jobmod.ActionNo { - state.Step = jobmod.StepCompleted - return nil - } - - // 要导入镜像,则需要先将镜像移动到指点节点的缓存中 - if scheme.Action == jobmod.ActionImportImage { - if file.PackageID == nil { - return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID) - } - - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID)) - if err != nil { - return fmt.Errorf("starting cache move package: %w", err) - } - - state.Step = jobmod.StepMoving - state.FullTaskID = fullTaskID - return nil - } - return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo)) - } - - if state.Step == jobmod.StepMoving { - cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("cache move package timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if cacheMoveRet.Error != "" { - return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error) - } - - stgCli, err := schglb.CloudreamStoragePool.Acquire() - if err != nil { - return fmt.Errorf("new cloudream storage client: %w", err) - } - defer schglb.CloudreamStoragePool.Release(stgCli) - - pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID}) - if err != nil { - return fmt.Errorf("getting package objects: %w", err) - } - - if len(pkgObjs.Objects) != 1 { - return fmt.Errorf("there must be only 1 object in the package that will be imported") - } - - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash))) - if err != nil { - return fmt.Errorf("starting import image: %w", err) - } - - state.Step = jobmod.StepImageImporting - state.FullTaskID = fullTaskID - return nil - } - - if state.Step == jobmod.StepImageImporting { - uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("import image timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if uploadImageRet.Error != "" { - return fmt.Errorf("import image: %s", uploadImageRet.Error) - } - - // 调整过程中不会更换镜像,所以ImageID不会发生变化 - err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now()) - if err != nil { - return fmt.Errorf("creating pcm image info: %w", err) - } - - state.Step = jobmod.StepCompleted - return nil - } - - return nil -} - -func (h *AdjustingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - h.cmdChan.Send(func() { - if broadcast.ToAll() { - for _, job := range h.jobs { - h.onJobEvent(evt, job) - } - - } else if broadcast.ToJobSet() { - for _, job := range h.jobs { - if job.job.GetJobSetID() != broadcast.JobSetID { - continue - } - - h.onJobEvent(evt, job) - } - } else if broadcast.ToJob() { - if job, ok := h.jobs[broadcast.JobID]; ok { - h.onJobEvent(evt, job) - } - } - }) -} - -func (h *AdjustingHandler) Serve() { - cmdChan := h.cmdChan.BeginChanReceive() - defer h.cmdChan.CloseChanReceive() - - for { - select { - case cmd := <-cmdChan: - cmd() - } - } -} - -func (h *AdjustingHandler) Stop() { - // TODO 支持STOP -} diff --git a/manager/internal/jobmgr/complete_handler.go b/manager/internal/jobmgr/complete_handler.go deleted file mode 100644 index dd56798..0000000 --- a/manager/internal/jobmgr/complete_handler.go +++ /dev/null @@ -1,63 +0,0 @@ -package jobmgr - -import ( - "fmt" - "reflect" - - "gitlink.org.cn/cloudream/common/pkgs/logger" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type CompleteHandler struct { - mgr *Manager -} - -func NewCompleteHandler(mgr *Manager) *CompleteHandler { - return &CompleteHandler{ - mgr: mgr, - } -} - -func (h *CompleteHandler) Handle(job jobmod.Job) { - // TODO 可以考虑将执行记录落库 - if state, ok := job.GetState().(*jobmod.StateSuccess); ok { - h.handleSuccess(job, state) - } else if state, ok := job.GetState().(*jobmod.StateFailed); ok { - h.handleFailed(job, state) - } else { - state := jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()) - job.SetState(state) - h.handleFailed(job, state) - } -} - -func (h *CompleteHandler) handleSuccess(job jobmod.Job, state *jobmod.StateSuccess) { - logger.WithField("JobID", job.GetJobID()).Infof("job completed successfuly") - - h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job)) -} - -func (h *CompleteHandler) handleFailed(job jobmod.Job, state *jobmod.StateFailed) { - logger. - WithField("JobID", job.GetJobID()). - WithField("LastState", reflect.TypeOf(state.LastState).String()). - Infof("job failed with: %v", state.Error) - - h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job)) -} - -func (h *CompleteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - if cloneEvt, ok := evt.(*event.CloneJob); ok { - cloneEvt.Callback.SetError(fmt.Errorf("job not found")) - return - } -} - -func (h *CompleteHandler) Serve() { - -} - -func (h *CompleteHandler) Stop() { - -} diff --git a/manager/internal/jobmgr/default_handler.go b/manager/internal/jobmgr/default_handler.go deleted file mode 100644 index 5b6bdf0..0000000 --- a/manager/internal/jobmgr/default_handler.go +++ /dev/null @@ -1,50 +0,0 @@ -package jobmgr - -import ( - "gitlink.org.cn/cloudream/common/pkgs/logger" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type DefaultHandler struct { - mgr *Manager -} - -func NewDefaultHandler(mgr *Manager) *DefaultHandler { - return &DefaultHandler{ - mgr: mgr, - } -} - -// 处理Job。在此期间全局锁已锁定 -func (h *DefaultHandler) Handle(job jobmod.Job) { - state := job.GetState() - if state == nil { - job.SetState(jobmod.NewStateFailed("unexpected nil state", nil)) - h.mgr.handleState(job) - return - } - - if _, ok := state.(*jobmod.StateFailed); ok { - logger.Warnf("state failed should not be handled by default handler") - return - } - - job.SetState(jobmod.NewStateFailed("no handler for this state", state)) - h.mgr.handleState(job) -} - -// 外部发生了一个事件 -func (h *DefaultHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - -} - -// 运行Handler -func (h *DefaultHandler) Serve() { - -} - -// 停止此Handler -func (h *DefaultHandler) Stop() { - -} diff --git a/manager/internal/jobmgr/event/event.go b/manager/internal/jobmgr/event.go similarity index 93% rename from manager/internal/jobmgr/event/event.go rename to manager/internal/jobmgr/event.go index 107db65..1b9f6d2 100644 --- a/manager/internal/jobmgr/event/event.go +++ b/manager/internal/jobmgr/event.go @@ -1,4 +1,4 @@ -package event +package jobmgr import ( "errors" @@ -11,6 +11,8 @@ var ErrUnconcernedTask = errors.New("unconcerned task") var ErrTaskTimeout = errors.New("task timeout") +var ErrJobCancelled = errors.New("job cancelled") + type Event interface{} type BroadcastType string diff --git a/manager/internal/jobmgr/event/advisor_task_timeout.go b/manager/internal/jobmgr/event/advisor_task_timeout.go deleted file mode 100644 index 0c156ee..0000000 --- a/manager/internal/jobmgr/event/advisor_task_timeout.go +++ /dev/null @@ -1,12 +0,0 @@ -package event - -// advisor的任务执行超时 -type AdvisorTaskTimeout struct { - FullTaskID string -} - -func NewAdvisorTaskTimeout(fullTaskID string) *AdvisorTaskTimeout { - return &AdvisorTaskTimeout{ - FullTaskID: fullTaskID, - } -} diff --git a/manager/internal/jobmgr/event/advisor_task_updated.go b/manager/internal/jobmgr/event/advisor_task_updated.go deleted file mode 100644 index 1e3931c..0000000 --- a/manager/internal/jobmgr/event/advisor_task_updated.go +++ /dev/null @@ -1,46 +0,0 @@ -package event - -import advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task" - -// advisor上报任务进度 -type AdvisorTaskUpdated struct { - FullTaskID string - TaskStatus advtsk.TaskStatus -} - -func NewAdvisorTaskUpdated(fullTaskID string, taskStatus advtsk.TaskStatus) *AdvisorTaskUpdated { - return &AdvisorTaskUpdated{ - FullTaskID: fullTaskID, - TaskStatus: taskStatus, - } -} - -func AssertAdvisorTaskStatus[T advtsk.TaskStatus](evt Event, fullTaskID string) (T, error) { - var ret T - if evt == nil { - return ret, ErrUnconcernedTask - } - - if reportTaskStatus, ok := evt.(*AdvisorTaskUpdated); ok { - if reportTaskStatus.FullTaskID != fullTaskID { - return ret, ErrUnconcernedTask - } - - status, ok := reportTaskStatus.TaskStatus.(T) - if !ok { - return ret, ErrUnconcernedTask - } - - return status, nil - } - - if taskTimeout, ok := evt.(*AdvisorTaskTimeout); ok { - if taskTimeout.FullTaskID != fullTaskID { - return ret, ErrUnconcernedTask - } - - return ret, ErrTaskTimeout - } - - return ret, ErrUnconcernedTask -} diff --git a/manager/internal/jobmgr/event/cancel.go b/manager/internal/jobmgr/event/cancel.go new file mode 100644 index 0000000..1eede5b --- /dev/null +++ b/manager/internal/jobmgr/event/cancel.go @@ -0,0 +1,4 @@ +package event + +type Cancel struct { +} diff --git a/manager/internal/jobmgr/event/clone_job.go b/manager/internal/jobmgr/event/clone_job.go deleted file mode 100644 index cbdf5bf..0000000 --- a/manager/internal/jobmgr/event/clone_job.go +++ /dev/null @@ -1,14 +0,0 @@ -package event - -import ( - "gitlink.org.cn/cloudream/common/pkgs/future" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" -) - -type CloneJob struct { - Callback future.SetValueFuture[jobmod.Job] -} - -func NewCloneJob() *CloneJob { - return &CloneJob{} -} diff --git a/manager/internal/jobmgr/event/executor_task_timeout.go b/manager/internal/jobmgr/event/executor_task_timeout.go deleted file mode 100644 index f4ade90..0000000 --- a/manager/internal/jobmgr/event/executor_task_timeout.go +++ /dev/null @@ -1,12 +0,0 @@ -package event - -// executor的任务执行超时 -type ExecutorTaskTimeout struct { - FullTaskID string -} - -func NewExecutorTaskTimeout(fullTaskID string) *ExecutorTaskTimeout { - return &ExecutorTaskTimeout{ - FullTaskID: fullTaskID, - } -} diff --git a/manager/internal/jobmgr/event/executor_task_updated.go b/manager/internal/jobmgr/event/executor_task_updated.go deleted file mode 100644 index fe299d6..0000000 --- a/manager/internal/jobmgr/event/executor_task_updated.go +++ /dev/null @@ -1,48 +0,0 @@ -package event - -import ( - exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" -) - -// executor上报任务进度 -type ExecutorTaskUpdated struct { - FullTaskID string - TaskStatus exectsk.TaskStatus -} - -func NewExecutorTaskUpdated(fullTaskID string, taskStatus exectsk.TaskStatus) *ExecutorTaskUpdated { - return &ExecutorTaskUpdated{ - FullTaskID: fullTaskID, - TaskStatus: taskStatus, - } -} - -func AssertExecutorTaskStatus[T exectsk.TaskStatus](evt Event, fullTaskID string) (T, error) { - var ret T - if evt == nil { - return ret, ErrUnconcernedTask - } - - if reportTaskStatus, ok := evt.(*ExecutorTaskUpdated); ok { - if reportTaskStatus.FullTaskID != fullTaskID { - return ret, ErrUnconcernedTask - } - - status, ok := reportTaskStatus.TaskStatus.(T) - if !ok { - return ret, ErrUnconcernedTask - } - - return status, nil - } - - if taskTimeout, ok := evt.(*ExecutorTaskTimeout); ok { - if taskTimeout.FullTaskID != fullTaskID { - return ret, ErrUnconcernedTask - } - - return ret, ErrTaskTimeout - } - - return ret, ErrUnconcernedTask -} diff --git a/manager/internal/jobmgr/event/job_completed.go b/manager/internal/jobmgr/event/job_completed.go index aa94584..2452134 100644 --- a/manager/internal/jobmgr/event/job_completed.go +++ b/manager/internal/jobmgr/event/job_completed.go @@ -1,16 +1,18 @@ package event import ( - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" ) // 任务结束,包括成功或者失败 type JobCompleted struct { - Job jobmod.Job + Job *jobmgr.Job + Err error } -func NewJobCompleted(job jobmod.Job) *JobCompleted { +func NewJobCompleted(job *jobmgr.Job, err error) *JobCompleted { return &JobCompleted{ Job: job, + Err: err, } } diff --git a/manager/internal/jobmgr/event/local_file_uploaded.go b/manager/internal/jobmgr/event/local_file_uploaded.go index 4c261ea..9b81ad3 100644 --- a/manager/internal/jobmgr/event/local_file_uploaded.go +++ b/manager/internal/jobmgr/event/local_file_uploaded.go @@ -1,21 +1,18 @@ package event import ( - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" ) // 本地文件上传结束 type LocalFileUploaded struct { - JobSetID schsdk.JobSetID LocalPath string - Error string + Error error PackageID cdssdk.PackageID } -func NewLocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) *LocalFileUploaded { +func NewLocalFileUploaded(localPath string, err error, packageID cdssdk.PackageID) *LocalFileUploaded { return &LocalFileUploaded{ - JobSetID: jobSetID, LocalPath: localPath, Error: err, PackageID: packageID, diff --git a/manager/internal/jobmgr/event/utils.go b/manager/internal/jobmgr/event/utils.go new file mode 100644 index 0000000..c2c803d --- /dev/null +++ b/manager/internal/jobmgr/event/utils.go @@ -0,0 +1,27 @@ +package event + +import ( + "context" + + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" +) + +func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, bool) { + ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool { + _, ok := evt.(T) + return ok + }) + return ret.(T), ok +} + +func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond func(val T) bool) (T, bool) { + ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool { + e, ok := evt.(T) + if !ok { + return false + } + + return cond(e) + }) + return ret.(T), ok +} diff --git a/manager/internal/jobmgr/event_set.go b/manager/internal/jobmgr/event_set.go new file mode 100644 index 0000000..efcce6f --- /dev/null +++ b/manager/internal/jobmgr/event_set.go @@ -0,0 +1,72 @@ +package jobmgr + +import ( + "context" + "sync" + + "gitlink.org.cn/cloudream/common/pkgs/future" + "gitlink.org.cn/cloudream/common/utils/lo2" +) + +type EventWaitCondition func(evt Event) bool + +type EventWaiter struct { + condition EventWaitCondition + future *future.SetValueFuture[Event] +} + +type EventSet struct { + events []Event + waiters []EventWaiter + lock sync.Mutex +} + +func NewEventSet() EventSet { + return EventSet{} +} + +func (s *EventSet) Post(evt Event) { + s.lock.Lock() + defer s.lock.Unlock() + + // 一个事件能唤醒多个等待者 + used := false + for i, waiter := range s.waiters { + if waiter.condition(evt) { + s.waiters = lo2.RemoveAt(s.waiters, i) + waiter.future.SetValue(evt) + used = true + } + } + + if !used { + s.events = append(s.events, evt) + } +} + +func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) { + s.lock.Lock() + defer s.lock.Unlock() + + // 一个等待者只能等待一个事件 + for i, evt := range s.events { + if cond(evt) { + s.events = lo2.RemoveAt(s.events, i) + return evt, true + } + } + + fut := future.NewSetValue[Event]() + waiter := EventWaiter{ + condition: cond, + future: fut, + } + s.events = append(s.events, waiter) + + val, err := fut.WaitValue(ctx) + if err != nil { + return nil, false + } + + return val, true +} diff --git a/manager/internal/jobmgr/executing_handler.go b/manager/internal/jobmgr/executing_handler.go deleted file mode 100644 index c1f37ef..0000000 --- a/manager/internal/jobmgr/executing_handler.go +++ /dev/null @@ -1,264 +0,0 @@ -package jobmgr - -import ( - "fmt" - "reflect" - - "gitlink.org.cn/cloudream/common/pkgs/actor" - "gitlink.org.cn/cloudream/common/pkgs/logger" - pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - - schglb "gitlink.org.cn/cloudream/scheduler/common/globals" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" - "gitlink.org.cn/cloudream/scheduler/common/utils" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type executingJob struct { - job jobmod.Job - state *jobmod.StateExecuting -} - -type ExecutingHandler struct { - mgr *Manager - - jobs map[schsdk.JobID]*executingJob - - cmdChan actor.CommandChannel -} - -func NewExecutingHandler(mgr *Manager) *ExecutingHandler { - return &ExecutingHandler{ - mgr: mgr, - jobs: make(map[schsdk.JobID]*executingJob), - cmdChan: *actor.NewCommandChannel(), - } -} - -func (h *ExecutingHandler) Handle(job jobmod.Job) { - h.cmdChan.Send(func() { - state, ok := job.GetState().(*jobmod.StateExecuting) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())) - return - } - - rjob := &executingJob{ - job: job, - state: state, - } - h.jobs[job.GetJobID()] = rjob - - h.onJobEvent(nil, rjob) - }) -} - -func (h *ExecutingHandler) onJobEvent(evt event.Event, job *executingJob) { - if cloneEvt, ok := evt.(*event.CloneJob); ok { - cloneEvt.Callback.SetValue(job.job.Clone()) - return - } - - if norJob, ok := job.job.(*jobmod.NormalJob); ok { - h.onNormalJobEvent(evt, job, norJob) - } else if resJob, ok := job.job.(*jobmod.ResourceJob); ok { - h.onResourceJobEvent(evt, job, resJob) - } -} - -func (h *ExecutingHandler) onNormalJobEvent(evt event.Event, job *executingJob, norJob *jobmod.NormalJob) { - if job.state.FullTaskID == "" { - pcmImgInfo, err := h.mgr.db.PCMImage().GetByImageIDAndCCID(h.mgr.db.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID) - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed("getting pcm image info: "+err.Error(), job.state)) - return - } - - ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), norJob.TargetCCID) - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state)) - return - } - - // TODO 需要添加DATA_IN、DATA_OUT等环境变量,这些数据从Job的信息中来获取 - ress, err := h.mgr.db.CCResource().GetByCCID(h.mgr.db.SQLCtx(), norJob.TargetCCID) - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center resource info: %s", err.Error()), job.state)) - return - } - if len(ress) == 0 { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("there is no resource at computing center %v", norJob.TargetCCID), job.state)) - return - } - - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(), - exetsk.NewSubmitTask( - ccInfo.PCMParticipantID, - pcmImgInfo.PCMImageID, - // TODO 选择资源的算法 - ress[0].PCMResourceID, - norJob.Info.Runtime.Command, - norJob.Info.Runtime.Envs, - )) - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - job.state.FullTaskID = fullTaskID - } - - if execRet, err := event.AssertExecutorTaskStatus[*exetsk.SubmitTaskStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask { - if err == event.ErrTaskTimeout { - h.changeJobState(job.job, jobmod.NewStateFailed("schedule task timeout", job.state)) - return - } - - logger.WithField("JobID", job.job.GetJobID()). - WithField("State", reflect.TypeOf(job.state).String()). - Infof("pcm task state change to: %s", execRet.Status) - - if execRet.Status == pcmsdk.TaskStatusSuccess { - h.mgr.execMgr.ForgetTask(job.state.FullTaskID) - h.changeJobState(job.job, jobmod.NewStateSuccess()) - - } else if execRet.Status == pcmsdk.TaskStatuFailed { - h.mgr.execMgr.ForgetTask(job.state.FullTaskID) - h.changeJobState(job.job, jobmod.NewStateFailed(execRet.Error, job.state)) - } - } -} - -func (h *ExecutingHandler) onResourceJobEvent(evt event.Event, job *executingJob, resJob *jobmod.ResourceJob) { - if job.state.FullTaskID == "" { - h.mgr.pubLock.Lock() - jobSet, ok := h.mgr.jobSets[resJob.GetJobSetID()] - if !ok { - h.mgr.pubLock.Unlock() - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", resJob.GetJobSetID()), job.state)) - return - } - - ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID) - if ref == nil { - h.mgr.pubLock.Unlock() - h.changeJobState(job.job, jobmod.NewStateFailed( - fmt.Sprintf("job %s not found in job set %s", - resJob.Info.TargetLocalJobID, - resJob.GetJobSetID()), - job.state, - )) - return - } - - targetJob, ok := h.mgr.jobs[ref.JobID] - h.mgr.pubLock.Unlock() - - if !ok { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state)) - return - } - - tarNorJob, ok := targetJob.Job.(*jobmod.NormalJob) - if !ok { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job(%v) %s is not a Normal job", reflect.TypeOf(targetJob), ref.JobID), job.state)) - return - } - - colCli, err := schglb.CollectorMQPool.Acquire() - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.state)) - return - } - defer schglb.CollectorMQPool.Release(colCli) - - ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), tarNorJob.TargetCCID) - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state)) - return - } - - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(), exetsk.NewStorageCreatePackage( - 1, // TOOD 用户ID - ccInfo.CDSStorageID, - tarNorJob.OutputFullPath, - resJob.Info.BucketID, - utils.MakeResourcePackageName(resJob.JobID), - )) - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - job.state.FullTaskID = fullTaskID - } - - if createRet, err := event.AssertExecutorTaskStatus[*exetsk.StorageCreatePackageStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask { - if err == event.ErrTaskTimeout { - h.changeJobState(job.job, jobmod.NewStateFailed("storage create package timeout", job.state)) - return - } - - h.mgr.execMgr.ForgetTask(job.state.FullTaskID) - - if createRet.Error != "" { - h.changeJobState(job.job, jobmod.NewStateFailed(createRet.Error, job.state)) - return - } - - resJob.ResourcePackageID = createRet.PackageID - - h.changeJobState(job.job, jobmod.NewStateSuccess()) - } -} - -func (h *ExecutingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) { - job.SetState(state) - - delete(h.jobs, job.GetJobID()) - - h.mgr.pubLock.Lock() - h.mgr.handleState(job) - h.mgr.pubLock.Unlock() -} - -func (h *ExecutingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - h.cmdChan.Send(func() { - if broadcast.ToAll() { - for _, job := range h.jobs { - h.onJobEvent(evt, job) - } - - } else if broadcast.ToJobSet() { - for _, job := range h.jobs { - if job.job.GetJobSetID() != broadcast.JobSetID { - continue - } - - h.onJobEvent(evt, job) - } - } else if broadcast.ToJob() { - if job, ok := h.jobs[broadcast.JobID]; ok { - h.onJobEvent(evt, job) - } - } - }) -} - -func (h *ExecutingHandler) Serve() { - cmdChan := h.cmdChan.BeginChanReceive() - defer h.cmdChan.CloseChanReceive() - - for { - select { - case cmd := <-cmdChan: - cmd() - } - } -} - -func (h *ExecutingHandler) Stop() { - // TODO 支持STOP -} diff --git a/manager/internal/jobmgr/job.go b/manager/internal/jobmgr/job.go new file mode 100644 index 0000000..66a5349 --- /dev/null +++ b/manager/internal/jobmgr/job.go @@ -0,0 +1,88 @@ +package jobmgr + +import ( + "github.com/samber/lo" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" +) + +type FileScheduleAction string + +// 文件调度方案 +const ( + ActionNo FileScheduleAction = "No" // 不需要操作 + ActionMove FileScheduleAction = "Move" // 需要在指定节点上建立缓存 + ActionLoad FileScheduleAction = "Load" // 需要加载到Storage + ActionImportImage FileScheduleAction = "ImportImage" // 需要导入镜像 +) + +type FileScheduleScheme struct { + Action FileScheduleAction `json:"action"` +} + +// 任务调度方案 +type JobScheduleScheme struct { + TargetCCID schsdk.CCID `json:"targetCCID"` + Dataset FileScheduleScheme `json:"dataset"` + Code FileScheduleScheme `json:"code"` + Image FileScheduleScheme `json:"image"` +} + +// 任务集的预调度方案 +type JobSetPreScheduleScheme struct { + JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID +} + +// 任务集 +type JobSet struct { + JobSetID schsdk.JobSetID `json:"jobSetID"` // 全局唯一的任务集ID + JobRefs []JobSetJobRef `json:"jobRefs"` // 任务集中包含的任务,只是一个引用 + PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"` +} +type JobSetJobRef struct { + JobID schsdk.JobID `json:"jobID"` // 任务ID + LocalJobID string `json:"localJobID"` // 在当前任务集内的任务ID +} + +func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet { + return &JobSet{ + JobSetID: jobSetID, + JobRefs: jobRefs, + PreScheduleScheme: preScheduleScheme, + } +} + +func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef { + ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID }) + if !ok { + return nil + } + + return &ref +} + +// 任务 +type Job struct { + JobSetID schsdk.JobSetID // 任务集ID + JobID schsdk.JobID // 全局唯一任务ID + Body JobBody // 具体任务 +} + +func (j *Job) GetInfo() schsdk.JobInfo { + return j.Body.GetInfo() +} + +func (j *Job) Dump(ctx JobStateRunContext, job *Job, curState JobState) jobmod.JobStatus { + return jobmod.JobStatus{ + JobID: j.JobID, + JobSetID: j.JobSetID, + Info: j.GetInfo(), + Body: job.Body.Dump(), + State: curState.Dump(ctx, job), + } +} + +type JobBody interface { + GetInfo() schsdk.JobInfo + Dump() jobmod.JobBodyStatus +} diff --git a/manager/internal/jobmgr/job/data_return_job.go b/manager/internal/jobmgr/job/data_return_job.go new file mode 100644 index 0000000..9585743 --- /dev/null +++ b/manager/internal/jobmgr/job/data_return_job.go @@ -0,0 +1,30 @@ +package job + +import ( + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" +) + +type DataReturnJob struct { + Info schsdk.DataReturnJobInfo + TargetJobCCID schsdk.CCID // 目标任务所在计算中心的ID + TargetJobOutputFullPath string // 目标任务的结果输出全路径 + DataReturnPackageID cdssdk.PackageID // 回源之后得到的PackageID +} + +func NewResourceJob(info schsdk.DataReturnJobInfo) *DataReturnJob { + return &DataReturnJob{ + Info: info, + } +} + +func (j *DataReturnJob) GetInfo() schsdk.JobInfo { + return &j.Info +} + +func (j *DataReturnJob) Dump() jobmod.JobBodyStatus { + return jobmod.DataReturnJobStatus{ + DataReturnPackageID: j.DataReturnPackageID, + } +} diff --git a/manager/internal/jobmgr/job/normal_job.go b/manager/internal/jobmgr/job/normal_job.go new file mode 100644 index 0000000..c8e5c04 --- /dev/null +++ b/manager/internal/jobmgr/job/normal_job.go @@ -0,0 +1,30 @@ +package job + +import ( + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" +) + +type NormalJob struct { + Info schsdk.NormalJobInfo // 提交任务时提供的任务描述信息 + Files jobmod.JobFiles // 任务需要的文件 + TargetCCID schsdk.CCID // 将要运行此任务的算力中心ID + OutputFullPath string // 程序结果的完整输出路径 +} + +func NewNormalJob(info schsdk.NormalJobInfo) *NormalJob { + return &NormalJob{ + Info: info, + } +} + +func (j *NormalJob) GetInfo() schsdk.JobInfo { + return &j.Info +} + +func (j *NormalJob) Dump() jobmod.JobBodyStatus { + return &jobmod.NormalJobStatus{ + Files: j.Files, + TargetCCID: j.TargetCCID, + } +} diff --git a/manager/internal/jobmgr/job/state/adjusting.go b/manager/internal/jobmgr/job/state/adjusting.go new file mode 100644 index 0000000..19a9246 --- /dev/null +++ b/manager/internal/jobmgr/job/state/adjusting.go @@ -0,0 +1,271 @@ +package state + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" + schglb "gitlink.org.cn/cloudream/scheduler/common/globals" + schmod "gitlink.org.cn/cloudream/scheduler/common/models" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" + "gitlink.org.cn/cloudream/scheduler/common/utils" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type Adjusting struct { + scheme jobmod.JobScheduleScheme + targetCCInfo schmod.ComputingCenter +} + +func NewAdjusting(scheme jobmod.JobScheduleScheme) *Adjusting { + return &Adjusting{ + scheme: scheme, + } +} + +func (s *Adjusting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + err := s.do(rtx, jo) + if err != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(err)) + } else { + rtx.Mgr.ChangeState(jo, NewNormalJobReadyToExecute()) + } +} + +func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} + +func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { + norJob := jo.Body.(*job.NormalJob) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // 监听取消事件 + go func() { + event.WaitType[event.Cancel](ctx, rtx.EventSet) + cancel() + }() + + ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID) + if err != nil { + return fmt.Errorf("getting computing center info: %w", err) + } + s.targetCCInfo = ccInfo + + stgCli, err := schglb.CloudreamStoragePool.Acquire() + if err != nil { + return fmt.Errorf("new cds client: %w", err) + } + defer schglb.CloudreamStoragePool.Release(stgCli) + + // 已经确定最终执行的目标计算中心,则可以生成结果输出路径了 + stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{ + StorageID: ccInfo.CDSStorageID, + }) + if err != nil { + return fmt.Errorf("getting cds storage info: %w", err) + } + // TODO UserID + norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID) + + wg := sync.WaitGroup{} + wg.Add(3) + + var e1, e2, e3 error + + go func() { + defer wg.Done() + e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset) + if e1 != nil { + cancel() + } + }() + + go func() { + defer wg.Done() + e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code) + if e2 != nil { + cancel() + } + }() + + go func() { + defer wg.Done() + e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image) + if e3 != nil { + cancel() + } + }() + + return errors.Join(e1, e2, e3) +} + +func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { + switch info := fileInfo.(type) { + case *schsdk.LocalJobFileInfo: + evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + return e.LocalPath == info.LocalPath + }) + if !ok { + return fmt.Errorf("local file %s not uploaded", info.LocalPath) + } + if evt.Error != nil { + return evt.Error + } + + file.PackageID = evt.PackageID + + case *schsdk.PackageJobFileInfo: + file.PackageID = info.PackageID + + case *schsdk.ResourceJobFileInfo: + return nil + + default: + return fmt.Errorf("unknown dataset type: %T", info) + } + + if scheme.Action == jobmod.ActionMove { + wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID)) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return fmt.Errorf("moving package: %w", err) + } + + moveStatus := status.(*exectsk.CacheMovePackageStatus) + if moveStatus.Error != "" { + return fmt.Errorf("moving package: %s", moveStatus.Error) + } + + return nil + } + + if scheme.Action == jobmod.ActionLoad { + wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID)) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return fmt.Errorf("moving package: %w", err) + } + + moveStatus := status.(*exectsk.CacheMovePackageStatus) + if moveStatus.Error != "" { + return fmt.Errorf("moving package: %s", moveStatus.Error) + } + + return nil + } + + return nil +} + +func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { + switch info := fileInfo.(type) { + case *schsdk.LocalJobFileInfo: + evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + return e.LocalPath == info.LocalPath + }) + if !ok { + return fmt.Errorf("local file %s not uploaded", info.LocalPath) + } + if evt.Error != nil { + return evt.Error + } + + // 上传完毕,则可以新建一个空的镜像的记录 + // TODO 镜像名称 + imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now()) + if err != nil { + return fmt.Errorf("creating image info: %w", err) + } + + // 填充ImageID和PackageID + file.ImageID = imgID + file.PackageID = &evt.PackageID + + case *schsdk.ImageJobFileInfo: + imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID) + if err != nil { + return fmt.Errorf("getting image info: %w", err) + } + + file.ImageID = imageInfo.ImageID + file.PackageID = imageInfo.CDSPackageID + } + + if scheme.Action == jobmod.ActionImportImage { + if file.PackageID == nil { + return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID) + } + + // TODO UserID + wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID)) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return fmt.Errorf("moving package: %w", err) + } + + moveStatus := status.(*exectsk.CacheMovePackageStatus) + if moveStatus.Error != "" { + return fmt.Errorf("moving package: %s", moveStatus.Error) + } + + stgCli, err := schglb.CloudreamStoragePool.Acquire() + if err != nil { + return fmt.Errorf("new cloudream storage client: %w", err) + } + defer schglb.CloudreamStoragePool.Release(stgCli) + + // TODO UserID + pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID}) + if err != nil { + return fmt.Errorf("getting package objects: %w", err) + } + + if len(pkgObjs.Objects) == 0 { + return fmt.Errorf("no object in the package which will be imported") + } + + if len(pkgObjs.Objects) > 1 { + return fmt.Errorf("there must be only 1 object in the package which will be imported") + } + + wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash))) + defer wt2.Close() + + status2, err := wt2.Receive(ctx) + if err != nil { + return fmt.Errorf("uploading image: %w", err) + } + + uploadStatus := status2.(*exectsk.UploadImageStatus) + if uploadStatus.Error != "" { + return fmt.Errorf("uploading image: %s", uploadStatus.Error) + } + + // TODO 镜像名称 + err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, job.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now()) + if err != nil { + return fmt.Errorf("creating image info: %w", err) + } + + return nil + } + + return nil +} diff --git a/manager/internal/jobmgr/job/state/complete.go b/manager/internal/jobmgr/job/state/complete.go new file mode 100644 index 0000000..47512ca --- /dev/null +++ b/manager/internal/jobmgr/job/state/complete.go @@ -0,0 +1,48 @@ +package state + +import ( + "reflect" + + "gitlink.org.cn/cloudream/common/pkgs/logger" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" +) + +type Completed struct { + err error +} + +func SuccessComplete() *Completed { + return &Completed{} +} +func FailureComplete(err error) *Completed { + return &Completed{err: err} +} + +func (c *Completed) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + // TODO 可以考虑将执行记录落库 + if c.err == nil { + c.handleSuccess(rtx, jo) + } else { + c.handleFailed(rtx, jo) + } +} + +func (s *Completed) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} + +func (c *Completed) handleSuccess(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { + logger.WithField("JobID", job.JobID).Infof("job completed successfuly") + rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err)) +} + +func (c *Completed) handleFailed(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { + logger. + WithField("JobID", job.JobID). + WithField("LastState", reflect.TypeOf(rtx.LastState).String()). + Infof("job failed with: %v", c.err) + rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err)) +} diff --git a/manager/internal/jobmgr/job/state/executing.go b/manager/internal/jobmgr/job/state/executing.go new file mode 100644 index 0000000..79534cb --- /dev/null +++ b/manager/internal/jobmgr/job/state/executing.go @@ -0,0 +1,154 @@ +package state + +import ( + "context" + "fmt" + + "gitlink.org.cn/cloudream/common/pkgs/logger" + pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" + "gitlink.org.cn/cloudream/scheduler/common/utils" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type NormalJobExecuting struct { + lastStatus pcmsdk.TaskStatus +} + +func NewNormalJobExecuting() *NormalJobExecuting { + return &NormalJobExecuting{ + lastStatus: "Begin", + } +} + +func (s *NormalJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + err := s.do(rtx, jo) + if err != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(err)) + } else { + rtx.Mgr.ChangeState(jo, SuccessComplete()) + } +} + +func (s *NormalJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} + +func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { + norJob := jo.Body.(*job.NormalJob) + + log := logger.WithType[NormalJobExecuting]("State").WithField("JobID", jo.JobID) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID) + if err != nil { + return fmt.Errorf("getting pcm image info: %w", err) + } + + ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID) + if err != nil { + return fmt.Errorf("getting computing center info: %w", err) + } + + // TODO 需要添加DATA_IN、DATA_OUT等环境变量,这些数据从Job的信息中来获取 + ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID) + if err != nil { + return fmt.Errorf("getting computing center resource: %w", err) + } + if len(ress) == 0 { + return fmt.Errorf("no resource found at computing center %v", norJob.TargetCCID) + } + + wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask( + ccInfo.PCMParticipantID, + pcmImgInfo.PCMImageID, + // TODO 选择资源的算法 + ress[0].PCMResourceID, + norJob.Info.Runtime.Command, + norJob.Info.Runtime.Envs, + )) + defer wt.Close() + + for { + status, err := wt.Receive(ctx) + if err != nil { + return err + } + tskStatus := status.(*exetsk.SubmitTaskStatus) + if tskStatus.Error != "" { + return fmt.Errorf("submitting task: %s", tskStatus.Error) + } + + if tskStatus.Status != s.lastStatus { + log.Infof("task %s -> %s", s.lastStatus, tskStatus.Status) + } + s.lastStatus = tskStatus.Status + + switch tskStatus.Status { + case pcmsdk.TaskStatusSuccess: + return nil + + case pcmsdk.TaskStatusFailed: + return fmt.Errorf("task failed") + } + } +} + +type DataReturnJobExecuting struct { +} + +func NewDataReturnJobExecuting() *DataReturnJobExecuting { + return &DataReturnJobExecuting{} +} + +func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + err := s.do(rtx, jo) + if err != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(err)) + } else { + rtx.Mgr.ChangeState(jo, SuccessComplete()) + } +} + +func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} + +func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { + reJob := jo.Body.(*job.DataReturnJob) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), reJob.TargetJobCCID) + if err != nil { + return fmt.Errorf("getting computing center info: %w", err) + } + + wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage( + 1, // TOOD 用户ID + ccInfo.CDSStorageID, + reJob.TargetJobOutputFullPath, + reJob.Info.BucketID, + utils.MakeResourcePackageName(jo.JobID), + )) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return err + } + tskStatus := status.(*exetsk.StorageCreatePackageStatus) + if tskStatus.Error != "" { + return fmt.Errorf("creating package: %s", tskStatus.Error) + } + + reJob.DataReturnPackageID = tskStatus.PackageID + return nil +} diff --git a/manager/internal/jobmgr/job/state/making_adjust_scheme.go b/manager/internal/jobmgr/job/state/making_adjust_scheme.go new file mode 100644 index 0000000..115cf70 --- /dev/null +++ b/manager/internal/jobmgr/job/state/making_adjust_scheme.go @@ -0,0 +1,61 @@ +package state + +import ( + "context" + "fmt" + + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type MakingAdjustScheme struct { +} + +func NewMakeingAdjustScheme() *MakingAdjustScheme { + return &MakingAdjustScheme{} +} + +func (s *MakingAdjustScheme) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + scheme, err := s.do(rtx, jo.Body.(*job.NormalJob)) + if err != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(err)) + } else { + rtx.Mgr.ChangeState(jo, NewAdjusting(*scheme)) + } +} + +func (s *MakingAdjustScheme) do(rtx jobmgr.JobStateRunContext, norJob *job.NormalJob) (*jobmod.JobScheduleScheme, error) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + event.WaitType[event.Cancel](ctx, rtx.EventSet) + cancel() + }() + + wt := rtx.Mgr.AdvMgr.StartTask(advtsk.NewMakeAdjustScheme(norJob.Info, jobmod.NormalJobStatus{ + TargetCCID: norJob.TargetCCID, + Files: norJob.Files, + })) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return nil, fmt.Errorf("making adjust scheme: %w", err) + } + + mkStatus := status.(*advtsk.MakeAdjustSchemeStatus) + if mkStatus.Error != "" { + return nil, fmt.Errorf("making adjust scheme: %s", mkStatus.Error) + } + + return &mkStatus.Scheme, nil +} + +func (s *MakingAdjustScheme) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} diff --git a/manager/internal/jobmgr/job/state/prescheduling.go b/manager/internal/jobmgr/job/state/prescheduling.go new file mode 100644 index 0000000..fd50cb8 --- /dev/null +++ b/manager/internal/jobmgr/job/state/prescheduling.go @@ -0,0 +1,251 @@ +package state + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" + schglb "gitlink.org.cn/cloudream/scheduler/common/globals" + schmod "gitlink.org.cn/cloudream/scheduler/common/models" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type PreScheduling struct { + scheme jobmod.JobScheduleScheme + targetCCInfo schmod.ComputingCenter +} + +func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling { + return &PreScheduling{ + scheme: scheme, + } +} + +func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + norJob := jo.Body.(*job.NormalJob) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // 监听取消事件 + go func() { + event.WaitType[event.Cancel](ctx, rtx.EventSet) + cancel() + }() + + ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID) + if err != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err))) + return + } + s.targetCCInfo = ccInfo + + wg := sync.WaitGroup{} + wg.Add(3) + + var e1, e2, e3 error + + go func() { + defer wg.Done() + e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset) + if e1 != nil { + cancel() + } + }() + + go func() { + defer wg.Done() + e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code) + if e2 != nil { + cancel() + } + }() + + go func() { + defer wg.Done() + e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image) + if e3 != nil { + cancel() + } + }() + + allErr := errors.Join(e1, e2, e3) + if allErr != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(err)) + } else { + rtx.Mgr.ChangeState(jo, NewReadyToAdjust()) + } +} + +func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} + +func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { + switch info := fileInfo.(type) { + case *schsdk.LocalJobFileInfo: + evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + return e.LocalPath == info.LocalPath + }) + if !ok { + return fmt.Errorf("local file %s not uploaded", info.LocalPath) + } + if evt.Error != nil { + return evt.Error + } + + file.PackageID = evt.PackageID + + case *schsdk.PackageJobFileInfo: + file.PackageID = info.PackageID + + case *schsdk.ResourceJobFileInfo: + return nil + + default: + return fmt.Errorf("unknown dataset type: %T", info) + } + + if scheme.Action == jobmod.ActionMove { + wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID)) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return fmt.Errorf("moving package: %w", err) + } + + moveStatus := status.(*exectsk.CacheMovePackageStatus) + if moveStatus.Error != "" { + return fmt.Errorf("moving package: %s", moveStatus.Error) + } + + return nil + } + + if scheme.Action == jobmod.ActionLoad { + wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID)) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return fmt.Errorf("moving package: %w", err) + } + + moveStatus := status.(*exectsk.CacheMovePackageStatus) + if moveStatus.Error != "" { + return fmt.Errorf("moving package: %s", moveStatus.Error) + } + + return nil + } + + return nil +} + +func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { + switch info := fileInfo.(type) { + case *schsdk.LocalJobFileInfo: + evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + return e.LocalPath == info.LocalPath + }) + if !ok { + return fmt.Errorf("local file %s not uploaded", info.LocalPath) + } + if evt.Error != nil { + return evt.Error + } + + // 上传完毕,则可以新建一个空的镜像的记录 + // TODO 镜像名称 + imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now()) + if err != nil { + return fmt.Errorf("creating image info: %w", err) + } + + // 填充ImageID和PackageID + file.ImageID = imgID + file.PackageID = &evt.PackageID + + case *schsdk.ImageJobFileInfo: + imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID) + if err != nil { + return fmt.Errorf("getting image info: %w", err) + } + + file.ImageID = imageInfo.ImageID + file.PackageID = imageInfo.CDSPackageID + } + + if scheme.Action == jobmod.ActionImportImage { + if file.PackageID == nil { + return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID) + } + + // TODO UserID + wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID)) + defer wt.Close() + + status, err := wt.Receive(ctx) + if err != nil { + return fmt.Errorf("moving package: %w", err) + } + + moveStatus := status.(*exectsk.CacheMovePackageStatus) + if moveStatus.Error != "" { + return fmt.Errorf("moving package: %s", moveStatus.Error) + } + + stgCli, err := schglb.CloudreamStoragePool.Acquire() + if err != nil { + return fmt.Errorf("new cloudream storage client: %w", err) + } + defer schglb.CloudreamStoragePool.Release(stgCli) + + // TODO UserID + pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID}) + if err != nil { + return fmt.Errorf("getting package objects: %w", err) + } + + if len(pkgObjs.Objects) == 0 { + return fmt.Errorf("no object in the package which will be imported") + } + + if len(pkgObjs.Objects) > 1 { + return fmt.Errorf("there must be only 1 object in the package which will be imported") + } + + wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash))) + defer wt2.Close() + + status2, err := wt2.Receive(ctx) + if err != nil { + return fmt.Errorf("uploading image: %w", err) + } + + uploadStatus := status2.(*exectsk.UploadImageStatus) + if uploadStatus.Error != "" { + return fmt.Errorf("uploading image: %s", uploadStatus.Error) + } + + // TODO 镜像名称 + err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, norJob.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now()) + if err != nil { + return fmt.Errorf("creating image info: %w", err) + } + + return nil + } + + return nil +} diff --git a/manager/internal/jobmgr/job/state/ready_to_adjust.go b/manager/internal/jobmgr/job/state/ready_to_adjust.go new file mode 100644 index 0000000..d8333ff --- /dev/null +++ b/manager/internal/jobmgr/job/state/ready_to_adjust.go @@ -0,0 +1,65 @@ +package state + +import ( + "context" + "fmt" + + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type ReadyToAdjust struct { +} + +func NewReadyToAdjust() *ReadyToAdjust { + return &ReadyToAdjust{} +} + +func (s *ReadyToAdjust) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + err := s.do(rtx, jo) + if err != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(err)) + } else { + rtx.Mgr.ChangeState(jo, NewMakeingAdjustScheme()) + } +} + +func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { + norJob := jo.Body.(*job.NormalJob) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + event.WaitType[event.Cancel](ctx, rtx.EventSet) + cancel() + }() + + if rt, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok { + evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool { + return val.Job.GetInfo().GetLocalJobID() == rt.ResourceLocalJobID + }) + if !ok { + return jobmgr.ErrJobCancelled + } + if evt.Err != nil { + return fmt.Errorf("depended job %s was failed", evt.Job.JobID) + } + rtJob, ok := evt.Job.Body.(*job.DataReturnJob) + if !ok { + return fmt.Errorf("job %s is not a DataReturn job(which is %T)", evt.Job.JobID, evt.Job) + } + + norJob.Files.Dataset.PackageID = rtJob.DataReturnPackageID + } + + return nil +} + +func (s *ReadyToAdjust) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} diff --git a/manager/internal/jobmgr/job/state/ready_to_execute.go b/manager/internal/jobmgr/job/state/ready_to_execute.go new file mode 100644 index 0000000..d72212f --- /dev/null +++ b/manager/internal/jobmgr/job/state/ready_to_execute.go @@ -0,0 +1,40 @@ +package state + +import ( + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" +) + +type NormalJobReadyToExecute struct { +} + +func NewNormalJobReadyToExecute() *NormalJobReadyToExecute { + return &NormalJobReadyToExecute{} +} + +func (s *NormalJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + // TODO 目前直接启动执行 + rtx.Mgr.ChangeState(jo, NewNormalJobExecuting()) +} + +func (s *NormalJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} + +type DataReturnJobReadyToExecute struct { +} + +func NewDataReturnJobReadyToExecute() *DataReturnJobReadyToExecute { + return &DataReturnJobReadyToExecute{} +} + +func (s *DataReturnJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + // TODO 目前直接启动执行 + rtx.Mgr.ChangeState(jo, NewDataReturnJobExecuting()) +} + +func (s *DataReturnJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} diff --git a/manager/internal/jobmgr/job/state/wait_target_complete.go b/manager/internal/jobmgr/job/state/wait_target_complete.go new file mode 100644 index 0000000..19ffc5e --- /dev/null +++ b/manager/internal/jobmgr/job/state/wait_target_complete.go @@ -0,0 +1,62 @@ +package state + +import ( + "context" + "fmt" + + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type WaitTargetComplete struct { +} + +func NewWaitTargetComplete() *WaitTargetComplete { + return &WaitTargetComplete{} +} + +func (s *WaitTargetComplete) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + err := s.do(rtx, jo) + if err != nil { + rtx.Mgr.ChangeState(jo, FailureComplete(err)) + } else { + rtx.Mgr.ChangeState(jo, NewDataReturnJobReadyToExecute()) + } +} + +func (s *WaitTargetComplete) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { + reJob := jo.Body.(*job.DataReturnJob) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + event.WaitType[event.Cancel](ctx, rtx.EventSet) + cancel() + }() + + evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool { + return val.Job.GetInfo().GetLocalJobID() == reJob.Info.TargetLocalJobID + }) + if !ok { + return jobmgr.ErrJobCancelled + } + if evt.Err != nil { + return fmt.Errorf("depended job %s was failed", evt.Job.JobID) + } + norJob, ok := evt.Job.Body.(*job.NormalJob) + if !ok { + return fmt.Errorf("job %s is not a Normal job(which is %T)", evt.Job.JobID, evt.Job) + } + + reJob.TargetJobCCID = norJob.TargetCCID + reJob.TargetJobOutputFullPath = norJob.OutputFullPath + return nil +} + +func (s *WaitTargetComplete) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus { + // TODO + return nil +} diff --git a/manager/internal/jobmgr/job_state.go b/manager/internal/jobmgr/job_state.go new file mode 100644 index 0000000..da0881a --- /dev/null +++ b/manager/internal/jobmgr/job_state.go @@ -0,0 +1,14 @@ +package jobmgr + +import jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + +type JobStateRunContext struct { + Mgr *Manager + EventSet *EventSet + LastState JobState +} + +type JobState interface { + Run(ctx JobStateRunContext, job *Job) + Dump(ctx JobStateRunContext, job *Job) jobmod.JobStateStatus +} diff --git a/manager/internal/jobmgr/jobmgr.go b/manager/internal/jobmgr/jobmgr.go index 5393159..cd5d218 100644 --- a/manager/internal/jobmgr/jobmgr.go +++ b/manager/internal/jobmgr/jobmgr.go @@ -1,283 +1,169 @@ package jobmgr import ( - "context" "fmt" - "reflect" "sync" "time" - "gitlink.org.cn/cloudream/common/pkgs/logger" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" - myreflect "gitlink.org.cn/cloudream/common/utils/reflect" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" "gitlink.org.cn/cloudream/scheduler/common/pkgs/db" - advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task" - exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" "gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" ) type mgrJob struct { - Job jobmod.Job - Handler StateHandler + job Job + eventSet EventSet + state JobState +} + +type mgrJobSet struct { + jobs map[schsdk.JobID]*mgrJob } type Manager struct { // 任何修改job、jobset的操作,都需要加这个锁 pubLock sync.Mutex - execMgr *executormgr.Manager - advMgr *advisormgr.Manager - db *db.DB - - handlers map[reflect.Type]StateHandler - defaultHandler StateHandler + ExecMgr *executormgr.Manager + AdvMgr *advisormgr.Manager + DB *db.DB jobSetIDIndex int - jobSets map[schsdk.JobSetID]*jobmod.JobSet + jobSets map[schsdk.JobSetID]*mgrJobSet jobIDIndex int jobs map[schsdk.JobID]*mgrJob } func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) { mgr := &Manager{ - execMgr: execMgr, - advMgr: advMgr, - db: db, - - handlers: make(map[reflect.Type]StateHandler), - jobSets: make(map[schsdk.JobSetID]*jobmod.JobSet), - jobs: make(map[schsdk.JobID]*mgrJob), + ExecMgr: execMgr, + AdvMgr: advMgr, + DB: db, + jobSets: make(map[schsdk.JobSetID]*mgrJobSet), + jobs: make(map[schsdk.JobID]*mgrJob), } - execMgr.OnTaskUpdated(mgr.executorTaskUpdated) - execMgr.OnTaskTimeout(mgr.executorTaskTimeout) - - advMgr.OnTaskUpdated(mgr.advisorTaskUpdated) - advMgr.OnTaskTimeout(mgr.advisorTaskTimeout) - - // TODO 考虑优化这部分逻辑 - - mgr.handlers[myreflect.TypeOf[*jobmod.StatePreScheduling]()] = NewPreSchedulingHandler(mgr) - mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToAdjust]()] = NewReadyToAdjustHandler(mgr) - mgr.handlers[myreflect.TypeOf[*jobmod.StateMakingAdjustScheme]()] = NewMakingAdjustSchemeHandler(mgr) - mgr.handlers[myreflect.TypeOf[*jobmod.StateAdjusting]()] = NewAdjustingHandler(mgr) - mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToExecute]()] = NewReadyToExecuteHandler(mgr) - mgr.handlers[myreflect.TypeOf[*jobmod.StateExecuting]()] = NewExecutingHandler(mgr) - - compHder := NewCompleteHandler(mgr) - mgr.handlers[myreflect.TypeOf[*jobmod.StateFailed]()] = compHder - mgr.handlers[myreflect.TypeOf[*jobmod.StateSuccess]()] = compHder - - mgr.defaultHandler = NewDefaultHandler(mgr) - return mgr, nil } func (m *Manager) Serve() error { - for _, h := range m.handlers { - go h.Serve() - } - - go m.defaultHandler.Serve() ticker := time.NewTicker(time.Minute) defer ticker.Stop() - for { - select { - case <-ticker.C: - // 每一分钟产生一个空事件,防止无限等待 - m.pubLock.Lock() - m.onEvent(event.ToAll(), nil) - m.pubLock.Unlock() - } - } - return nil } func (m *Manager) Stop() { - for _, h := range m.handlers { - h.Stop() - } - m.defaultHandler.Stop() } -func (m *Manager) SubmitJobSet(jobSetInfo schsdk.JobSetInfo, preScheduleScheme jobmod.JobSetPreScheduleScheme) (*jobmod.JobSet, error) { +func (m *Manager) ChangeState(job *Job, state JobState) { + m.pubLock.Lock() + defer m.pubLock.Unlock() + + mgrJob, ok := m.jobs[job.JobID] + if !ok { + return + } + + lastState := mgrJob.state + mgrJob.state = state + + go func() { + state.Run(JobStateRunContext{ + Mgr: m, + EventSet: &mgrJob.eventSet, + LastState: lastState, + }, job) + }() +} + +func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) { + m.pubLock.Lock() + defer m.pubLock.Unlock() + + mgrJob, ok := m.jobs[jobID] + if !ok { + return + } + + go func() { + mgrJob.eventSet.Post(evt) + }() +} + +func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) { + m.pubLock.Lock() + defer m.pubLock.Unlock() + + jobSet, ok := m.jobSets[jobSetID] + if !ok { + return + } + + for _, mgrJob := range jobSet.jobs { + go func() { + mgrJob.eventSet.Post(evt) + }() + } +} + +type SubmittingJob struct { + Body JobBody + InitState JobState +} + +func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { m.pubLock.Lock() defer m.pubLock.Unlock() jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex)) - - var jobs []jobmod.Job - var normalJobs []*jobmod.NormalJob - var resJobs []*jobmod.ResourceJob - var jobRefs []jobmod.JobSetJobRef - for i, jobInfo := range jobSetInfo.Jobs { - jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i)) - - switch info := jobInfo.(type) { - case *schsdk.NormalJobInfo: - job := jobmod.NewNormalJob(jobSetID, jobID, *info) - jobs = append(jobs, job) - normalJobs = append(normalJobs, job) - jobRefs = append(jobRefs, jobmod.JobSetJobRef{ - LocalJobID: info.LocalJobID, - JobID: jobID, - }) - - preSch, ok := preScheduleScheme.JobSchemes[info.LocalJobID] - if !ok { - return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID) - } - - job.State = jobmod.NewStatePreScheduling(preSch) - job.TargetCCID = preSch.TargetCCID - - case *schsdk.ResourceJobInfo: - job := jobmod.NewResourceJob(jobSetID, jobID, *info) - jobs = append(jobs, job) - resJobs = append(resJobs, job) - jobRefs = append(jobRefs, jobmod.JobSetJobRef{ - LocalJobID: info.LocalJobID, - JobID: jobID, - }) - - // 回源任务不需要预调度,所以直接是进入待调整状态 - job.State = jobmod.NewStateReadyToAdjust() - } - } - - // TODO 可以考虑检查一下有依赖的任务的信息所描述依赖的LocalJobID是不是有效的 - - jobSet := jobmod.NewJobSet(jobSetID, jobRefs, preScheduleScheme) - m.jobSets[jobSetID] = jobSet - for _, job := range jobs { - m.jobs[job.GetJobID()] = &mgrJob{ - Job: job, - } - - m.handleState(job) - } - m.jobSetIDIndex += 1 - m.jobIDIndex += len(jobSetInfo.Jobs) - return jobSet, nil + jobSet := &mgrJobSet{ + jobs: make(map[schsdk.JobID]*mgrJob), + } + m.jobSets[jobSetID] = jobSet + + for i, subJob := range jobs { + jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i)) + job := &mgrJob{ + job: Job{ + JobSetID: jobSetID, + JobID: jobID, + Body: subJob.Body, + }, + eventSet: NewEventSet(), + } + jobSet.jobs[jobID] = job + + m.ChangeState(&job.job, subJob.InitState) + } + m.jobIDIndex += len(jobs) + + return jobSetID } -func (m *Manager) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) error { +func (m *Manager) DumpJobSet(jobSetID schsdk.JobSetID) []jobmod.JobStatus { m.pubLock.Lock() defer m.pubLock.Unlock() - for _, h := range m.handlers { - h.OnEvent(event.ToJobSet(jobSetID), event.NewLocalFileUploaded(jobSetID, localPath, err, packageID)) - } - - return nil -} - -func (m *Manager) executorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus exectsk.TaskStatus) { - m.pubLock.Lock() - defer m.pubLock.Unlock() - - job, ok := m.jobs[jobID] + jobSet, ok := m.jobSets[jobSetID] if !ok { - return + return nil } - job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskUpdated(fullTaskID, taskStatus)) -} - -func (m *Manager) executorTaskTimeout(jobID schsdk.JobID, fullTaskID string) { - m.pubLock.Lock() - defer m.pubLock.Unlock() - - job, ok := m.jobs[jobID] - if !ok { - return - } - - job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskTimeout(fullTaskID)) -} - -func (m *Manager) advisorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) { - m.pubLock.Lock() - defer m.pubLock.Unlock() - - job, ok := m.jobs[jobID] - if !ok { - return - } - - job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskUpdated(fullTaskID, taskStatus)) -} - -func (m *Manager) advisorTaskTimeout(jobID schsdk.JobID, fullTaskID string) { - m.pubLock.Lock() - defer m.pubLock.Unlock() - - job, ok := m.jobs[jobID] - if !ok { - return - } - - job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskTimeout(fullTaskID)) -} - -func (m *Manager) CloneJob(jobID schsdk.JobID) (jobmod.Job, error) { - m.pubLock.Lock() - - job, ok := m.jobs[jobID] - if !ok { - m.pubLock.Unlock() - return nil, fmt.Errorf("job not found") - } - - evt := event.NewCloneJob() - job.Handler.OnEvent(event.ToJob(jobID), evt) - m.pubLock.Unlock() - - return evt.Callback.WaitValue(context.Background()) -} - -// 根据job状态选择handler进行处理。需要加锁 -func (m *Manager) handleState(job jobmod.Job) { - logger.WithField("JobID", job.GetJobID()). - WithField("State", reflect.TypeOf(job.GetState()).String()). - Debugf("job state changed") - - runtime, ok := m.jobs[job.GetJobID()] - if !ok { - return - } - - state := job.GetState() - if state == nil { - runtime.Handler = m.defaultHandler - m.defaultHandler.Handle(job) - return - } - - stateType := reflect.TypeOf(state) - handler, ok := m.handlers[stateType] - if !ok { - runtime.Handler = m.defaultHandler - m.defaultHandler.Handle(job) - return - } - - runtime.Handler = handler - handler.Handle(job) -} - -func (m *Manager) onEvent(broadcast event.Broadcast, evt event.Event) { - for _, h := range m.handlers { - h.OnEvent(broadcast, evt) - } + var jobStatuses []jobmod.JobStatus + for _, mgrJob := range jobSet.jobs { + jobStatuses = append(jobStatuses, mgrJob.job.Dump(JobStateRunContext{ + Mgr: m, + EventSet: &mgrJob.eventSet, + LastState: mgrJob.state, + }, &mgrJob.job, mgrJob.state)) + } + + return jobStatuses } diff --git a/manager/internal/jobmgr/making_adjust_scheme_handler.go b/manager/internal/jobmgr/making_adjust_scheme_handler.go deleted file mode 100644 index 124385f..0000000 --- a/manager/internal/jobmgr/making_adjust_scheme_handler.go +++ /dev/null @@ -1,139 +0,0 @@ -package jobmgr - -import ( - "fmt" - "reflect" - - "gitlink.org.cn/cloudream/common/pkgs/actor" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type makingAdjustSchemeJob struct { - job *jobmod.NormalJob - state *jobmod.StateMakingAdjustScheme -} - -type MakingAdjustSchemeHandler struct { - mgr *Manager - - jobs map[schsdk.JobID]*makingAdjustSchemeJob - - cmdChan actor.CommandChannel -} - -func NewMakingAdjustSchemeHandler(mgr *Manager) *MakingAdjustSchemeHandler { - return &MakingAdjustSchemeHandler{ - mgr: mgr, - jobs: make(map[schsdk.JobID]*makingAdjustSchemeJob), - cmdChan: *actor.NewCommandChannel(), - } -} - -func (h *MakingAdjustSchemeHandler) Handle(job jobmod.Job) { - h.cmdChan.Send(func() { - norJob, ok := job.(*jobmod.NormalJob) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState())) - return - } - - state, ok := job.GetState().(*jobmod.StateMakingAdjustScheme) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())) - return - } - - rjob := &makingAdjustSchemeJob{ - job: norJob, - state: state, - } - h.jobs[job.GetJobID()] = rjob - - h.onJobEvent(nil, rjob) - }) -} - -func (h *MakingAdjustSchemeHandler) onJobEvent(evt event.Event, job *makingAdjustSchemeJob) { - if cloneEvt, ok := evt.(*event.CloneJob); ok { - cloneEvt.Callback.SetValue(job.job.Clone()) - return - } - - if job.state.FullTaskID == "" { - fullTaskID, err := h.mgr.advMgr.StartTask(job.job.GetJobID(), advtsk.NewMakeAdjustScheme(*job.job)) - if err != nil { - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - job.state.FullTaskID = fullTaskID - } - - if makingRet, err := event.AssertAdvisorTaskStatus[*advtsk.MakeAdjustSchemeStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask { - if err == event.ErrTaskTimeout { - h.changeJobState(job.job, jobmod.NewStateFailed("make adjust scheme timeout", job.state)) - return - } - - h.mgr.advMgr.ForgetTask(job.state.FullTaskID) - - if makingRet.Error != "" { - h.changeJobState(job.job, jobmod.NewStateFailed(makingRet.Error, job.state)) - return - } - - h.changeJobState(job.job, jobmod.NewStateAdjusting(makingRet.Scheme)) - } -} - -func (h *MakingAdjustSchemeHandler) changeJobState(job jobmod.Job, state jobmod.JobState) { - job.SetState(state) - - delete(h.jobs, job.GetJobID()) - - h.mgr.pubLock.Lock() - h.mgr.handleState(job) - h.mgr.pubLock.Unlock() -} - -func (h *MakingAdjustSchemeHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - h.cmdChan.Send(func() { - if broadcast.ToAll() { - for _, job := range h.jobs { - h.onJobEvent(evt, job) - } - - } else if broadcast.ToJobSet() { - for _, job := range h.jobs { - if job.job.GetJobSetID() != broadcast.JobSetID { - continue - } - - h.onJobEvent(evt, job) - } - } else if broadcast.ToJob() { - if job, ok := h.jobs[broadcast.JobID]; ok { - h.onJobEvent(evt, job) - } - } - }) -} - -func (h *MakingAdjustSchemeHandler) Serve() { - cmdChan := h.cmdChan.BeginChanReceive() - defer h.cmdChan.CloseChanReceive() - - for { - select { - case cmd := <-cmdChan: - cmd() - } - } -} - -func (h *MakingAdjustSchemeHandler) Stop() { - // TODO 支持STOP -} diff --git a/manager/internal/jobmgr/prescheduling_handler.go b/manager/internal/jobmgr/prescheduling_handler.go deleted file mode 100644 index d19a5cb..0000000 --- a/manager/internal/jobmgr/prescheduling_handler.go +++ /dev/null @@ -1,442 +0,0 @@ -package jobmgr - -import ( - "fmt" - "reflect" - "time" - - "gitlink.org.cn/cloudream/common/pkgs/actor" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" - - schglb "gitlink.org.cn/cloudream/scheduler/common/globals" - schmod "gitlink.org.cn/cloudream/scheduler/common/models" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -var ErrPreScheduleFailed = fmt.Errorf("pre schedule failed") - -type preSchedulingJob struct { - job *jobmod.NormalJob - state *jobmod.StatePreScheduling - ccInfo schmod.ComputingCenter -} - -type PreSchedulingHandler struct { - mgr *Manager - - jobs map[schsdk.JobID]*preSchedulingJob - - cmdChan actor.CommandChannel -} - -func NewPreSchedulingHandler(mgr *Manager) *PreSchedulingHandler { - return &PreSchedulingHandler{ - mgr: mgr, - jobs: make(map[schsdk.JobID]*preSchedulingJob), - cmdChan: *actor.NewCommandChannel(), - } -} - -func (h *PreSchedulingHandler) Handle(job jobmod.Job) { - h.cmdChan.Send(func() { - norJob, ok := job.(*jobmod.NormalJob) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState())) - return - } - - preSchState, ok := norJob.GetState().(*jobmod.StatePreScheduling) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())) - return - } - - colCli, err := schglb.CollectorMQPool.Acquire() - if err != nil { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err), job.GetState())) - return - } - defer schglb.CollectorMQPool.Release(colCli) - - ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), preSchState.Scheme.TargetCCID) - if err != nil { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.GetState())) - return - } - - norJob.TargetCCID = preSchState.Scheme.TargetCCID - preJob := &preSchedulingJob{ - job: norJob, - state: preSchState, - ccInfo: ccInfo, - } - h.jobs[job.GetJobID()] = preJob - - h.onJobEvent(nil, preJob) - }) -} - -func (h *PreSchedulingHandler) onJobEvent(evt event.Event, job *preSchedulingJob) { - if cloneEvt, ok := evt.(*event.CloneJob); ok { - cloneEvt.Callback.SetValue(job.job.Clone()) - return - } - - err := h.doPackageScheduling(evt, job, - job.job.Info.Files.Dataset, &job.job.Files.Dataset, - &job.state.Scheme.Dataset, &job.state.Dataset, - ) - if err != nil { - job.state.Dataset.Error = err.Error() - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - err = h.doPackageScheduling(evt, job, - job.job.Info.Files.Code, &job.job.Files.Code, - &job.state.Scheme.Code, &job.state.Code, - ) - if err != nil { - job.state.Code.Error = err.Error() - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - err = h.doImageScheduling(evt, job, - job.job.Info.Files.Image, &job.job.Files.Image, - &job.state.Scheme.Image, &job.state.Image, - ) - if err != nil { - job.state.Image.Error = err.Error() - h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state)) - return - } - - // 如果三种文件都调度完成,则可以进入下个阶段了 - if job.state.Dataset.Step == jobmod.StepCompleted && - job.state.Code.Step == jobmod.StepCompleted && - job.state.Image.Step == jobmod.StepCompleted { - - h.changeJobState(job.job, jobmod.NewStateReadyToAdjust()) - } -} - -func (h *PreSchedulingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) { - job.SetState(state) - - delete(h.jobs, job.GetJobID()) - - h.mgr.pubLock.Lock() - h.mgr.handleState(job) - h.mgr.pubLock.Unlock() -} - -func (h *PreSchedulingHandler) doPackageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error { - // TODO 考虑拆分成多个函数 - if state.Step == jobmod.StepBegin { - switch info := fileInfo.(type) { - case *schsdk.LocalJobFileInfo: - state.Step = jobmod.StepUploading - - case *schsdk.PackageJobFileInfo: - file.PackageID = info.PackageID - state.Step = jobmod.StepUploaded - - case *schsdk.ResourceJobFileInfo: - state.Step = jobmod.StepCompleted - - default: - return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo)) - } - } - - if state.Step == jobmod.StepUploading { - if evt == nil { - return nil - } - - localFileCmd, ok := evt.(*event.LocalFileUploaded) - if !ok { - return nil - } - - if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath { - return nil - } - - if localFileCmd.Error != "" { - return fmt.Errorf("local file uploading: %s", localFileCmd.Error) - } - - file.PackageID = localFileCmd.PackageID - state.Step = jobmod.StepUploaded - } - - if state.Step == jobmod.StepUploaded { - if scheme.Action == jobmod.ActionNo { - state.Step = jobmod.StepCompleted - return nil - } - - if scheme.Action == jobmod.ActionMove { - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID)) - if err != nil { - return fmt.Errorf("starting cache move package: %w", err) - } - - state.Step = jobmod.StepMoving - state.FullTaskID = fullTaskID - return nil - - } - - if scheme.Action == jobmod.ActionLoad { - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID)) - if err != nil { - return fmt.Errorf("starting stroage load package: %w", err) - } - - state.Step = jobmod.StepLoading - state.FullTaskID = fullTaskID - return nil - } - - return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo)) - } - - if state.Step == jobmod.StepMoving { - moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("cache move package timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if moveRet.Error != "" { - return fmt.Errorf("cache move pacakge: %s", moveRet.Error) - } - - state.Step = jobmod.StepCompleted - return nil - } - - if state.Step == jobmod.StepLoading { - loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("storage load package timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if loadRet.Error != "" { - return fmt.Errorf("storage load package: %s", loadRet.Error) - } - - file.FullPath = loadRet.FullPath - - state.Step = jobmod.StepCompleted - return nil - } - - return nil -} - -func (h *PreSchedulingHandler) doImageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error { - // TODO 考虑拆分成多个函数 - if state.Step == jobmod.StepBegin { - switch info := fileInfo.(type) { - case *schsdk.LocalJobFileInfo: - state.Step = jobmod.StepUploading - - case *schsdk.ImageJobFileInfo: - imageInfo, err := h.mgr.db.Image().GetByID(h.mgr.db.SQLCtx(), info.ImageID) - if err != nil { - return fmt.Errorf("getting image info: %w", err) - } - - file.ImageID = imageInfo.ImageID - file.PackageID = imageInfo.CDSPackageID - state.Step = jobmod.StepUploaded - - default: - return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(info)) - } - } - - if state.Step == jobmod.StepUploading { - if evt == nil { - return nil - } - - localFileCmd, ok := evt.(*event.LocalFileUploaded) - if !ok { - return nil - } - - if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath { - return nil - } - - if localFileCmd.Error != "" { - return fmt.Errorf("local file uploading: %s", localFileCmd.Error) - } - - // 上传完毕,则可以新建一个空的镜像的记录 - // TODO 镜像名称 - imgID, err := h.mgr.db.Image().Create(h.mgr.db.SQLCtx(), &localFileCmd.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now()) - if err != nil { - return fmt.Errorf("creating image info: %w", err) - } - - // 填充ImageID和PackageID - file.ImageID = imgID - file.PackageID = &localFileCmd.PackageID - state.Step = jobmod.StepUploaded - } - - if state.Step == jobmod.StepUploaded { - if scheme.Action == jobmod.ActionNo { - state.Step = jobmod.StepCompleted - return nil - } - - // 要导入镜像,则需要先将镜像移动到指点节点的缓存中 - if scheme.Action == jobmod.ActionImportImage { - if file.PackageID == nil { - return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID) - } - - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID)) - if err != nil { - return fmt.Errorf("starting cache move package: %w", err) - } - - state.Step = jobmod.StepMoving - state.FullTaskID = fullTaskID - return nil - } - return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo)) - } - - if state.Step == jobmod.StepMoving { - cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("cache move package timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if cacheMoveRet.Error != "" { - return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error) - } - - stgCli, err := schglb.CloudreamStoragePool.Acquire() - if err != nil { - return fmt.Errorf("new cloudream storage client: %w", err) - } - defer schglb.CloudreamStoragePool.Release(stgCli) - - pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID}) - if err != nil { - return fmt.Errorf("getting package objects: %w", err) - } - - if len(pkgObjs.Objects) == 0 { - return fmt.Errorf("no object in the package which will be imported") - } - - if len(pkgObjs.Objects) > 1 { - return fmt.Errorf("there must be only 1 object in the package which will be imported") - } - - fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash))) - if err != nil { - return fmt.Errorf("starting import image: %w", err) - } - - state.Step = jobmod.StepImageImporting - state.FullTaskID = fullTaskID - return nil - } - - if state.Step == jobmod.StepImageImporting { - uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID) - if err == event.ErrUnconcernedTask { - return nil - } - - if err == event.ErrTaskTimeout { - return fmt.Errorf("import image timeout") - } - - h.mgr.execMgr.ForgetTask(state.FullTaskID) - - if uploadImageRet.Error != "" { - return fmt.Errorf("import image: %s", uploadImageRet.Error) - } - - err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now()) - if err != nil { - return fmt.Errorf("adding image importing info: %w", err) - } - - state.Step = jobmod.StepCompleted - return nil - } - - return nil -} - -func (h *PreSchedulingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - h.cmdChan.Send(func() { - if broadcast.ToAll() { - for _, job := range h.jobs { - h.onJobEvent(evt, job) - } - - } else if broadcast.ToJobSet() { - for _, job := range h.jobs { - if job.job.JobSetID != broadcast.JobSetID { - continue - } - - h.onJobEvent(evt, job) - } - } else if broadcast.ToJob() { - if job, ok := h.jobs[broadcast.JobID]; ok { - h.onJobEvent(evt, job) - } - } - }) -} - -func (h *PreSchedulingHandler) Serve() { - cmdChan := h.cmdChan.BeginChanReceive() - defer h.cmdChan.CloseChanReceive() - - for { - select { - case cmd := <-cmdChan: - cmd() - } - } -} - -func (h *PreSchedulingHandler) Stop() { - // TODO 支持STOP -} diff --git a/manager/internal/jobmgr/ready_to_adjust_handler.go b/manager/internal/jobmgr/ready_to_adjust_handler.go deleted file mode 100644 index b837ef6..0000000 --- a/manager/internal/jobmgr/ready_to_adjust_handler.go +++ /dev/null @@ -1,214 +0,0 @@ -package jobmgr - -import ( - "fmt" - "reflect" - - "gitlink.org.cn/cloudream/common/pkgs/actor" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type readyToAdjustJob struct { - job jobmod.Job - state *jobmod.StateReadyToAdjust -} - -type ReadyToAdjustHandler struct { - mgr *Manager - - jobs map[schsdk.JobID]*readyToAdjustJob - - cmdChan actor.CommandChannel -} - -func NewReadyToAdjustHandler(mgr *Manager) *ReadyToAdjustHandler { - return &ReadyToAdjustHandler{ - mgr: mgr, - jobs: make(map[schsdk.JobID]*readyToAdjustJob), - cmdChan: *actor.NewCommandChannel(), - } -} - -func (h *ReadyToAdjustHandler) Handle(job jobmod.Job) { - h.cmdChan.Send(func() { - state, ok := job.GetState().(*jobmod.StateReadyToAdjust) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())) - return - } - - rjob := &readyToAdjustJob{ - job: job, - state: state, - } - h.jobs[job.GetJobID()] = rjob - - h.onJobEvent(nil, rjob) - }) -} - -func (h *ReadyToAdjustHandler) onJobEvent(evt event.Event, job *readyToAdjustJob) { - if cloneEvt, ok := evt.(*event.CloneJob); ok { - cloneEvt.Callback.SetValue(job.job.Clone()) - return - } - - if norJob, ok := job.job.(*jobmod.NormalJob); ok { - h.onNormalJobEvent(evt, job, norJob) - } else if resJob, ok := job.job.(*jobmod.ResourceJob); ok { - h.onResourceJobEvent(evt, job, resJob) - } -} - -func (h *ReadyToAdjustHandler) onNormalJobEvent(evt event.Event, job *readyToAdjustJob, norJob *jobmod.NormalJob) { - h.mgr.pubLock.Lock() - jobSet, ok := h.mgr.jobSets[job.job.GetJobSetID()] - h.mgr.pubLock.Unlock() - if !ok { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()), job.state)) - return - } - - needWait := false - - // 无论发生什么事件,都检查一下前置任务的状态 - if resFile, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok { - ref := jobSet.FindRefByLocalJobID(resFile.ResourceLocalJobID) - if ref == nil { - h.changeJobState(job.job, jobmod.NewStateFailed( - fmt.Sprintf("job %s not found in job set %s", resFile.ResourceLocalJobID, jobSet.JobSetID), - job.state, - )) - return - } - - h.mgr.pubLock.Lock() - waitJob := h.mgr.jobs[ref.JobID] - h.mgr.pubLock.Unlock() - if waitJob == nil { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state)) - return - } - - if _, ok = waitJob.Job.GetState().(*jobmod.StateSuccess); ok { - waitResJob, ok := waitJob.Job.(*jobmod.ResourceJob) - if !ok { - h.changeJobState(job.job, jobmod.NewStateFailed( - fmt.Sprintf("job(%v) %s is not a resource job", reflect.TypeOf(waitJob), waitResJob.JobID), - job.state, - )) - return - } - - norJob.Files.Dataset.PackageID = waitResJob.ResourcePackageID - } else if _, ok = waitJob.Job.GetState().(*jobmod.StateFailed); ok { - h.changeJobState(job.job, jobmod.NewStateFailed( - fmt.Sprintf("job %s is failed", waitJob.Job.GetJobID()), - job.state, - )) - return - } else { - // 等待的Job不是失败或者成功状态,则需要继续等待 - needWait = true - } - } - - if !needWait { - h.changeJobState(job.job, jobmod.NewStateMakingAdjustScheme()) - } -} - -func (h *ReadyToAdjustHandler) onResourceJobEvent(evt event.Event, job *readyToAdjustJob, resJob *jobmod.ResourceJob) { - h.mgr.pubLock.Lock() - jobSet, ok := h.mgr.jobSets[job.job.GetJobSetID()] - h.mgr.pubLock.Unlock() - if !ok { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()), job.state)) - return - } - - needWait := false - - ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID) - if ref == nil { - h.changeJobState(job.job, jobmod.NewStateFailed( - fmt.Sprintf("job %s not found in job set %s", resJob.Info.TargetLocalJobID, jobSet.JobSetID), - job.state, - )) - return - } - - h.mgr.pubLock.Lock() - waitJob := h.mgr.jobs[ref.JobID] - h.mgr.pubLock.Unlock() - if waitJob == nil { - h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state)) - return - } - - // 无论发生什么事件,都检查一下前置任务的状态 - if _, ok = waitJob.Job.GetState().(*jobmod.StateFailed); ok { - h.changeJobState(job.job, jobmod.NewStateFailed( - fmt.Sprintf("job %s is failed", waitJob.Job.GetJobID()), - job.state, - )) - return - } else if _, ok = waitJob.Job.GetState().(*jobmod.StateSuccess); !ok { - needWait = true - } - - if !needWait { - h.changeJobState(job.job, jobmod.NewStateReadyToExecute()) - } -} - -func (h *ReadyToAdjustHandler) changeJobState(job jobmod.Job, state jobmod.JobState) { - job.SetState(state) - - delete(h.jobs, job.GetJobID()) - - h.mgr.pubLock.Lock() - h.mgr.handleState(job) - h.mgr.pubLock.Unlock() -} - -func (h *ReadyToAdjustHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - h.cmdChan.Send(func() { - if broadcast.ToAll() { - for _, job := range h.jobs { - h.onJobEvent(evt, job) - } - - } else if broadcast.ToJobSet() { - for _, job := range h.jobs { - if job.job.GetJobSetID() != broadcast.JobSetID { - continue - } - - h.onJobEvent(evt, job) - } - } else if broadcast.ToJob() { - if job, ok := h.jobs[broadcast.JobID]; ok { - h.onJobEvent(evt, job) - } - } - }) -} - -func (h *ReadyToAdjustHandler) Serve() { - cmdChan := h.cmdChan.BeginChanReceive() - defer h.cmdChan.CloseChanReceive() - - for { - select { - case cmd := <-cmdChan: - cmd() - } - } -} - -func (h *ReadyToAdjustHandler) Stop() { - // TODO 支持STOP -} diff --git a/manager/internal/jobmgr/ready_to_execute_handler.go b/manager/internal/jobmgr/ready_to_execute_handler.go deleted file mode 100644 index 4fecfe1..0000000 --- a/manager/internal/jobmgr/ready_to_execute_handler.go +++ /dev/null @@ -1,122 +0,0 @@ -package jobmgr - -import ( - "fmt" - "reflect" - - "gitlink.org.cn/cloudream/common/pkgs/actor" - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type readyToExecuteJob struct { - job jobmod.Job - state *jobmod.StateReadyToExecute -} - -type ReadyToExecuteHandler struct { - mgr *Manager - - jobs map[schsdk.JobID]*readyToExecuteJob - - cmdChan actor.CommandChannel -} - -func NewReadyToExecuteHandler(mgr *Manager) *ReadyToExecuteHandler { - return &ReadyToExecuteHandler{ - mgr: mgr, - jobs: make(map[schsdk.JobID]*readyToExecuteJob), - cmdChan: *actor.NewCommandChannel(), - } -} - -func (h *ReadyToExecuteHandler) Handle(job jobmod.Job) { - h.cmdChan.Send(func() { - state, ok := job.GetState().(*jobmod.StateReadyToExecute) - if !ok { - h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())) - return - } - - rjob := &readyToExecuteJob{ - job: job, - state: state, - } - h.jobs[job.GetJobID()] = rjob - - h.onJobEvent(nil, rjob) - }) -} - -func (h *ReadyToExecuteHandler) onJobEvent(evt event.Event, job *readyToExecuteJob) { - if cloneEvt, ok := evt.(*event.CloneJob); ok { - cloneEvt.Callback.SetValue(job.job.Clone()) - return - } - - if norJob, ok := job.job.(*jobmod.NormalJob); ok { - h.onNormalJobEvent(evt, job, norJob) - } else if resJob, ok := job.job.(*jobmod.ResourceJob); ok { - h.onResourceJobEvent(evt, job, resJob) - } -} - -func (h *ReadyToExecuteHandler) onNormalJobEvent(evt event.Event, job *readyToExecuteJob, norJob *jobmod.NormalJob) { - // TODO 目前直接启动执行 - h.changeJobState(job.job, jobmod.NewStateExecuting()) -} - -func (h *ReadyToExecuteHandler) onResourceJobEvent(evt event.Event, job *readyToExecuteJob, resJob *jobmod.ResourceJob) { - // TODO 目前直接启动执行 - h.changeJobState(job.job, jobmod.NewStateExecuting()) -} - -func (h *ReadyToExecuteHandler) changeJobState(job jobmod.Job, state jobmod.JobState) { - job.SetState(state) - - delete(h.jobs, job.GetJobID()) - - h.mgr.pubLock.Lock() - h.mgr.handleState(job) - h.mgr.pubLock.Unlock() -} - -func (h *ReadyToExecuteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) { - h.cmdChan.Send(func() { - if broadcast.ToAll() { - for _, job := range h.jobs { - h.onJobEvent(evt, job) - } - - } else if broadcast.ToJobSet() { - for _, job := range h.jobs { - if job.job.GetJobSetID() != broadcast.JobSetID { - continue - } - - h.onJobEvent(evt, job) - } - } else if broadcast.ToJob() { - if job, ok := h.jobs[broadcast.JobID]; ok { - h.onJobEvent(evt, job) - } - } - }) -} - -func (h *ReadyToExecuteHandler) Serve() { - cmdChan := h.cmdChan.BeginChanReceive() - defer h.cmdChan.CloseChanReceive() - - for { - select { - case cmd := <-cmdChan: - cmd() - } - } -} - -func (h *ReadyToExecuteHandler) Stop() { - // TODO 支持STOP -} diff --git a/manager/internal/jobmgr/state_handler.go b/manager/internal/jobmgr/state_handler.go deleted file mode 100644 index 9209fa7..0000000 --- a/manager/internal/jobmgr/state_handler.go +++ /dev/null @@ -1,17 +0,0 @@ -package jobmgr - -import ( - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" -) - -type StateHandler interface { - // 处理Job。在此期间全局锁已锁定 - Handle(job jobmod.Job) - // 外部发生了一个事件 - OnEvent(broadcast event.Broadcast, evt event.Event) - // 运行Handler - Serve() - // 停止此Handler - Stop() -} diff --git a/manager/internal/mq/job.go b/manager/internal/mq/job.go index 334d495..eb105cf 100644 --- a/manager/internal/mq/job.go +++ b/manager/internal/mq/job.go @@ -1,23 +1,50 @@ package mq import ( + "errors" + "fmt" + "gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/common/pkgs/mq" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job/state" ) // 提交任务集 func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) { logger.Debugf("submitting job") - jobSet, err := svc.jobMgr.SubmitJobSet(msg.JobSet, msg.PreScheduleScheme) - if err != nil { - logger.Warnf("submitting job set: %s", err.Error()) - return nil, mq.Failed(errorcode.OperationFailed, "submit job set failed") + var jobs []jobmgr.SubmittingJob + for _, jobInfo := range msg.JobSet.Jobs { + switch info := jobInfo.(type) { + case *schsdk.NormalJobInfo: + job := job.NewNormalJob(*info) + + preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID] + if !ok { + return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID)) + } + + jobs = append(jobs, jobmgr.SubmittingJob{ + Body: job, + InitState: state.NewPreSchuduling(preSch), + }) + + case *schsdk.DataReturnJobInfo: + job := job.NewResourceJob(*info) + jobs = append(jobs, jobmgr.SubmittingJob{ + Body: job, + InitState: state.NewWaitTargetComplete(), + }) + } } - return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(jobSet.JobSetID)) + return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs))) } // 任务集中某个文件上传完成 @@ -26,16 +53,15 @@ func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded) WithField("PackageID", msg.PackageID). Debugf("local file uploaded") - svc.jobMgr.LocalFileUploaded(msg.JobSetID, msg.LocalPath, msg.Error, msg.PackageID) + svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, errors.New(msg.Error), msg.PackageID)) return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp()) } -func (svc *Service) GetJob(msg *mgrmq.GetJob) (*mgrmq.GetJobResp, *mq.CodeMessage) { - job, err := svc.jobMgr.CloneJob(msg.JobID) - if err != nil { - logger.WithField("JobID", msg.JobID).Warnf("cloning job: %s", err.Error()) - return nil, mq.Failed(errorcode.OperationFailed, "get job failed") +func (svc *Service) GetJobSetStatus(msg *mgrmq.GetJobSetStatus) (*mgrmq.GetJobSetStatusResp, *mq.CodeMessage) { + jobs := svc.jobMgr.DumpJobSet(msg.JobSetID) + if len(jobs) == 0 { + return nil, mq.Failed(errorcode.OperationFailed, "job set not found") } - return mq.ReplyOK(mgrmq.NewGetJobResp(job)) + return mq.ReplyOK(mgrmq.RespGetJobSetStatus(jobs)) }