重构manager模块
This commit is contained in:
parent
49a80a693c
commit
1e1c8dd691
|
@ -38,7 +38,7 @@ const (
|
|||
var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait")
|
||||
|
||||
type Scheduler interface {
|
||||
Schedule(info *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error)
|
||||
Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error)
|
||||
}
|
||||
|
||||
type candidate struct {
|
||||
|
@ -129,7 +129,7 @@ func NewDefaultSchedule() *DefaultScheduler {
|
|||
return &DefaultScheduler{}
|
||||
}
|
||||
|
||||
func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) {
|
||||
func (s *DefaultScheduler) Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error) {
|
||||
mgrCli, err := schglb.ManagerMQPool.Acquire()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new collector client: %w", err)
|
||||
|
@ -151,17 +151,17 @@ func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleS
|
|||
for _, cc := range allCC.ComputingCenters {
|
||||
allCCs[cc.CCID] = &candidate{
|
||||
CC: cc,
|
||||
IsPreScheduled: cc.CCID == job.TargetCCID,
|
||||
IsPreScheduled: cc.CCID == status.TargetCCID,
|
||||
}
|
||||
}
|
||||
|
||||
// 计算
|
||||
err = s.calcFileScore(job.Files, allCCs)
|
||||
err = s.calcFileScore(status.Files, allCCs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
err = s.calcResourceScore(job, allCCs)
|
||||
err = s.calcResourceScore(info, allCCs)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -204,9 +204,9 @@ func (s *DefaultScheduler) makeSchemeForNode(targetCC *candidate) jobmod.JobSche
|
|||
return scheme
|
||||
}
|
||||
|
||||
func (s *DefaultScheduler) calcResourceScore(job *jobmod.NormalJob, allCCs map[schsdk.CCID]*candidate) error {
|
||||
func (s *DefaultScheduler) calcResourceScore(info *schsdk.NormalJobInfo, allCCs map[schsdk.CCID]*candidate) error {
|
||||
for _, cc := range allCCs {
|
||||
res, err := s.calcOneResourceScore(job.Info.Resources, &cc.CC)
|
||||
res, err := s.calcOneResourceScore(info.Resources, &cc.CC)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"github.com/samber/lo"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/future"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
)
|
||||
|
||||
|
@ -30,7 +31,7 @@ func NewService(scheduler Scheduler) *Service {
|
|||
}
|
||||
}
|
||||
|
||||
func (s *Service) MakeScheme(job jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) {
|
||||
func (s *Service) MakeScheme(job schsdk.NormalJobInfo) (*jobmod.JobScheduleScheme, error) {
|
||||
s.lock.Lock()
|
||||
callback := future.NewSetValue[*jobmod.JobScheduleScheme]()
|
||||
s.jobs = append(s.jobs, &schedulingJob{
|
||||
|
|
|
@ -4,7 +4,7 @@ import (
|
|||
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||
myglbs "gitlink.org.cn/cloudream/scheduler/advisor/internal/globals"
|
||||
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
|
||||
)
|
||||
|
@ -12,7 +12,7 @@ import (
|
|||
func (svc *Service) StartTask(msg *advmq.StartTask) (*advmq.StartTaskResp, *mq.CodeMessage) {
|
||||
tsk, err := svc.taskManager.StartByInfo(msg.Info)
|
||||
if err != nil {
|
||||
logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()).
|
||||
logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()).
|
||||
Warnf("starting task by info: %s", err.Error())
|
||||
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
|
||||
}
|
||||
|
|
|
@ -39,7 +39,7 @@ func (t *MakeScheduleScheme) Execute(task *task.Task[TaskContext], ctx TaskConte
|
|||
}
|
||||
|
||||
func (t *MakeScheduleScheme) do(taskID string, ctx TaskContext) (*jobmod.JobScheduleScheme, error) {
|
||||
scheme, err := ctx.scheduleSvc.MakeScheme(t.Job)
|
||||
scheme, err := ctx.scheduleSvc.MakeScheme(t.JobInfo)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ import (
|
|||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/task"
|
||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||
reporter "gitlink.org.cn/cloudream/scheduler/advisor/internal/reporter"
|
||||
"gitlink.org.cn/cloudream/scheduler/advisor/internal/scheduler"
|
||||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||||
|
@ -40,7 +40,7 @@ func NewManager(reporter *reporter.Reporter, scheduleSvc *scheduler.Service) Man
|
|||
}
|
||||
|
||||
func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) {
|
||||
infoType := myreflect.TypeOfValue(info)
|
||||
infoType := reflect2.TypeOfValue(info)
|
||||
|
||||
ctor, ok := taskFromInfoCtors[infoType]
|
||||
if !ok {
|
||||
|
@ -53,7 +53,7 @@ func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) {
|
|||
var taskFromInfoCtors map[reflect.Type]func(advtsk.TaskInfo) TaskBody = make(map[reflect.Type]func(advtsk.TaskInfo) task.TaskBody[TaskContext])
|
||||
|
||||
func Register[TInfo advtsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
|
||||
taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody {
|
||||
taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody {
|
||||
return ctor(info.(TInfo))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -171,7 +171,7 @@ func (s *DefaultPreScheduler) Schedule(info *schsdk.JobSetInfo) (*jobmod.JobSetP
|
|||
if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok {
|
||||
j.Afters = append(j.Afters, resFile.ResourceLocalJobID)
|
||||
}
|
||||
} else if resJob, ok := job.(*schsdk.ResourceJobInfo); ok {
|
||||
} else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok {
|
||||
j.Afters = append(j.Afters, resJob.TargetLocalJobID)
|
||||
}
|
||||
|
||||
|
@ -270,7 +270,7 @@ func (s *DefaultPreScheduler) scheduleForNormalJob(jobSet *schsdk.JobSetInfo, jo
|
|||
|
||||
// 检查此节点是否是它所引用的任务所选的节点
|
||||
for _, af := range job.Afters {
|
||||
resJob := findJobInfo[*schsdk.ResourceJobInfo](jobSet.Jobs, af)
|
||||
resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af)
|
||||
if resJob == nil {
|
||||
return nil, fmt.Errorf("resource job %s not found in the job set", af)
|
||||
}
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
package jobmod
|
||||
|
||||
import (
|
||||
"github.com/samber/lo"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
)
|
||||
|
||||
type FileScheduleAction string
|
||||
|
@ -34,66 +32,41 @@ type JobSetPreScheduleScheme struct {
|
|||
JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID
|
||||
}
|
||||
|
||||
// 任务集
|
||||
type JobSet struct {
|
||||
JobSetID schsdk.JobSetID `json:"jobSetID"` // 全局唯一的任务集ID
|
||||
JobRefs []JobSetJobRef `json:"jobRefs"` // 任务集中包含的任务,只是一个引用
|
||||
PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
|
||||
}
|
||||
type JobSetJobRef struct {
|
||||
JobID schsdk.JobID `json:"jobID"` // 任务ID
|
||||
LocalJobID string `json:"localJobID"` // 在当前任务集内的任务ID
|
||||
type JobFiles struct {
|
||||
Dataset PackageJobFile `json:"dataset"`
|
||||
Code PackageJobFile `json:"code"`
|
||||
Image ImageJobFile `json:"image"`
|
||||
}
|
||||
|
||||
func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet {
|
||||
return &JobSet{
|
||||
JobSetID: jobSetID,
|
||||
JobRefs: jobRefs,
|
||||
PreScheduleScheme: preScheduleScheme,
|
||||
}
|
||||
type PackageJobFile struct {
|
||||
PackageID cdssdk.PackageID `json:"packageID"`
|
||||
FullPath string `json:"fullPath"` // Load之后的完整文件路径
|
||||
}
|
||||
|
||||
func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef {
|
||||
ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID })
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &ref
|
||||
type ImageJobFile struct {
|
||||
PackageID *cdssdk.PackageID `json:"packageID"`
|
||||
ImageID schsdk.ImageID `json:"imageID"`
|
||||
}
|
||||
|
||||
// 任务
|
||||
type Job interface {
|
||||
GetJobSetID() schsdk.JobSetID
|
||||
GetJobID() schsdk.JobID
|
||||
GetState() JobState
|
||||
SetState(state JobState)
|
||||
Clone() Job
|
||||
type JobStatus struct {
|
||||
JobID schsdk.JobID `json:"jobID"`
|
||||
JobSetID schsdk.JobSetID `json:"jobSetID"`
|
||||
Info schsdk.JobInfo `json:"info"`
|
||||
Body JobBodyStatus `json:"body"`
|
||||
State JobStateStatus `json:"state"`
|
||||
}
|
||||
|
||||
var JobTypeUnion = types.NewTypeUnion[Job](
|
||||
(*NormalJob)(nil),
|
||||
(*ResourceJob)(nil),
|
||||
)
|
||||
var _ = serder.UseTypeUnionExternallyTagged(&JobTypeUnion)
|
||||
|
||||
// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobTypeUnion, "Type", "type")
|
||||
|
||||
type JobBase struct {
|
||||
JobSetID schsdk.JobSetID `json:"jobSetID"` // 任务集ID
|
||||
JobID schsdk.JobID `json:"jobID"` // 全局唯一任务ID
|
||||
State JobState `json:"state"` // 任务当前的状态。包含当前在状态下执行操作所需的数据
|
||||
type JobBodyStatus interface {
|
||||
}
|
||||
|
||||
func (j *JobBase) GetJobSetID() schsdk.JobSetID {
|
||||
return j.JobSetID
|
||||
type NormalJobStatus struct {
|
||||
TargetCCID schsdk.CCID `json:"targetCCID"`
|
||||
Files JobFiles `json:"files"`
|
||||
}
|
||||
func (j *JobBase) GetJobID() schsdk.JobID {
|
||||
return j.JobID
|
||||
|
||||
type DataReturnJobStatus struct {
|
||||
DataReturnPackageID cdssdk.PackageID `json:"dataReturnPackageID"`
|
||||
}
|
||||
func (j *JobBase) GetState() JobState {
|
||||
return j.State
|
||||
}
|
||||
func (j *JobBase) SetState(state JobState) {
|
||||
j.State = state
|
||||
|
||||
type JobStateStatus interface {
|
||||
}
|
||||
|
|
|
@ -1,46 +0,0 @@
|
|||
package jobmod
|
||||
|
||||
import (
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
)
|
||||
|
||||
type NormalJob struct {
|
||||
JobBase
|
||||
Info schsdk.NormalJobInfo `json:"info"` // 提交任务时提供的任务描述信息
|
||||
Files JobFiles `json:"files"` // 任务需要的文件
|
||||
TargetCCID schsdk.CCID `json:"targetSlwNodeID"` // 将要运行此任务的算力中心ID
|
||||
OutputFullPath string `json:"outputFullPath"` // 程序结果的完整输出路径
|
||||
}
|
||||
|
||||
func NewNormalJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.NormalJobInfo) *NormalJob {
|
||||
return &NormalJob{
|
||||
JobBase: JobBase{
|
||||
JobSetID: jobSetID,
|
||||
JobID: jobID,
|
||||
},
|
||||
Info: info,
|
||||
}
|
||||
}
|
||||
|
||||
func (j *NormalJob) Clone() Job {
|
||||
tmp := *j
|
||||
tmp.State = tmp.State.Clone()
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type JobFiles struct {
|
||||
Dataset PackageJobFile `json:"dataset"`
|
||||
Code PackageJobFile `json:"code"`
|
||||
Image ImageJobFile `json:"image"`
|
||||
}
|
||||
|
||||
type PackageJobFile struct {
|
||||
PackageID cdssdk.PackageID `json:"packageID"`
|
||||
FullPath string `json:"fullPath"` // Load之后的完整文件路径
|
||||
}
|
||||
|
||||
type ImageJobFile struct {
|
||||
PackageID *cdssdk.PackageID `json:"packageID"`
|
||||
ImageID schsdk.ImageID `json:"imageID"`
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
package jobmod
|
||||
|
||||
import (
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
)
|
||||
|
||||
type ResourceJob struct {
|
||||
JobBase
|
||||
Info schsdk.ResourceJobInfo `json:"info"`
|
||||
ResourcePackageID cdssdk.PackageID `json:"resourcePackageID"` // 回源之后得到的PackageID
|
||||
}
|
||||
|
||||
func NewResourceJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.ResourceJobInfo) *ResourceJob {
|
||||
return &ResourceJob{
|
||||
JobBase: JobBase{
|
||||
JobSetID: jobSetID,
|
||||
JobID: jobID,
|
||||
},
|
||||
Info: info,
|
||||
}
|
||||
}
|
||||
|
||||
func (j *ResourceJob) Clone() Job {
|
||||
tmp := *j
|
||||
tmp.State = tmp.State.Clone()
|
||||
return &tmp
|
||||
}
|
|
@ -1,183 +0,0 @@
|
|||
package jobmod
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||
)
|
||||
|
||||
type JobState interface {
|
||||
Clone() JobState
|
||||
}
|
||||
type JobStateBase struct{}
|
||||
|
||||
var JobStateTypeUnion = types.NewTypeUnion[JobState](
|
||||
(*StatePreScheduling)(nil),
|
||||
(*StateReadyToAdjust)(nil),
|
||||
(*StateMakingAdjustScheme)(nil),
|
||||
(*StateAdjusting)(nil),
|
||||
(*StateReadyToExecute)(nil),
|
||||
(*StateExecuting)(nil),
|
||||
(*StateFailed)(nil),
|
||||
(*StateSuccess)(nil),
|
||||
)
|
||||
var _ = serder.UseTypeUnionExternallyTagged(&JobStateTypeUnion)
|
||||
|
||||
// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobStateTypeUnion, "Type", "type")
|
||||
|
||||
type FileSchedulingStep string
|
||||
|
||||
const (
|
||||
StepBegin FileSchedulingStep = "Begin" // 准备开始调度
|
||||
StepUploading FileSchedulingStep = "Uploading" // 正在等待文件上传
|
||||
StepUploaded FileSchedulingStep = "Uploaded" // 文件上传完成
|
||||
StepMoving FileSchedulingStep = "Moving" // 正在移动缓存
|
||||
StepLoading FileSchedulingStep = "Loading" // 正在加载
|
||||
StepImageImporting FileSchedulingStep = "ImageImporting" // 正在导入镜像
|
||||
StepCompleted FileSchedulingStep = "Completed" // 完成
|
||||
)
|
||||
|
||||
type FileSchedulingState struct {
|
||||
Step FileSchedulingStep `json:"step"`
|
||||
Error string `json:"error"`
|
||||
FullTaskID string `json:"fullTaskID"`
|
||||
}
|
||||
|
||||
type StatePreScheduling struct {
|
||||
JobStateBase
|
||||
Scheme JobScheduleScheme `json:"scheme"`
|
||||
Dataset FileSchedulingState `json:"dataset"`
|
||||
Code FileSchedulingState `json:"code"`
|
||||
Image FileSchedulingState `json:"image"`
|
||||
}
|
||||
|
||||
func NewStatePreScheduling(scheme JobScheduleScheme) *StatePreScheduling {
|
||||
return &StatePreScheduling{
|
||||
Scheme: scheme,
|
||||
Dataset: FileSchedulingState{
|
||||
Step: StepBegin,
|
||||
},
|
||||
Code: FileSchedulingState{
|
||||
Step: StepBegin,
|
||||
},
|
||||
Image: FileSchedulingState{
|
||||
Step: StepBegin,
|
||||
},
|
||||
}
|
||||
}
|
||||
func (s *StatePreScheduling) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type StateReadyToAdjust struct {
|
||||
JobStateBase
|
||||
}
|
||||
|
||||
func NewStateReadyToAdjust() *StateReadyToAdjust {
|
||||
return &StateReadyToAdjust{}
|
||||
}
|
||||
|
||||
func (s *StateReadyToAdjust) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type StateMakingAdjustScheme struct {
|
||||
JobStateBase
|
||||
FullTaskID string `json:"fullTaskID"`
|
||||
}
|
||||
|
||||
func NewStateMakingAdjustScheme() *StateMakingAdjustScheme {
|
||||
return &StateMakingAdjustScheme{}
|
||||
}
|
||||
|
||||
func (s *StateMakingAdjustScheme) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type StateAdjusting struct {
|
||||
JobStateBase
|
||||
Scheme JobScheduleScheme `json:"scheme"`
|
||||
Dataset FileSchedulingState `json:"dataset"`
|
||||
Code FileSchedulingState `json:"code"`
|
||||
Image FileSchedulingState `json:"image"`
|
||||
}
|
||||
|
||||
func NewStateAdjusting(scheme JobScheduleScheme) *StateAdjusting {
|
||||
return &StateAdjusting{
|
||||
Scheme: scheme,
|
||||
Dataset: FileSchedulingState{
|
||||
Step: StepBegin,
|
||||
},
|
||||
Code: FileSchedulingState{
|
||||
Step: StepBegin,
|
||||
},
|
||||
Image: FileSchedulingState{
|
||||
Step: StepBegin,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *StateAdjusting) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type StateReadyToExecute struct {
|
||||
JobStateBase
|
||||
}
|
||||
|
||||
func NewStateReadyToExecute() *StateReadyToExecute {
|
||||
return &StateReadyToExecute{}
|
||||
}
|
||||
|
||||
func (s *StateReadyToExecute) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type StateExecuting struct {
|
||||
JobStateBase
|
||||
FullTaskID string `json:"fullTaskID"`
|
||||
}
|
||||
|
||||
func NewStateExecuting() *StateExecuting {
|
||||
return &StateExecuting{}
|
||||
}
|
||||
|
||||
func (s *StateExecuting) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type StateFailed struct {
|
||||
JobStateBase
|
||||
Error string `json:"error"`
|
||||
LastState JobState `json:"lastState"`
|
||||
}
|
||||
|
||||
func NewStateFailed(err string, lastState JobState) *StateFailed {
|
||||
return &StateFailed{
|
||||
Error: err,
|
||||
LastState: lastState,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *StateFailed) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
||||
|
||||
type StateSuccess struct {
|
||||
JobStateBase
|
||||
}
|
||||
|
||||
func NewStateSuccess() *StateSuccess {
|
||||
return &StateSuccess{}
|
||||
}
|
||||
|
||||
func (s *StateSuccess) Clone() JobState {
|
||||
tmp := *s
|
||||
return &tmp
|
||||
}
|
|
@ -8,7 +8,7 @@ import (
|
|||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
|
||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||
)
|
||||
|
||||
|
@ -78,7 +78,7 @@ type CCResourceInfo struct {
|
|||
func (i *CCResourceInfo) Scan(src interface{}) error {
|
||||
data, ok := src.([]uint8)
|
||||
if !ok {
|
||||
return fmt.Errorf("unknow src type: %v", myreflect.TypeOfValue(data).String())
|
||||
return fmt.Errorf("unknow src type: %v", reflect2.TypeOfValue(data).String())
|
||||
}
|
||||
|
||||
return serder.JSONToObject(data, i)
|
||||
|
|
|
@ -1,17 +1,20 @@
|
|||
package task
|
||||
|
||||
import (
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
)
|
||||
|
||||
type MakeAdjustScheme struct {
|
||||
TaskInfoBase
|
||||
Job jobmod.NormalJob `json:"job"`
|
||||
JobInfo schsdk.NormalJobInfo `json:"jobInfo"`
|
||||
JobStatus jobmod.NormalJobStatus `json:"jobStatus"`
|
||||
}
|
||||
|
||||
func NewMakeAdjustScheme(job jobmod.NormalJob) *MakeAdjustScheme {
|
||||
func NewMakeAdjustScheme(jobInfo schsdk.NormalJobInfo, jobStatus jobmod.NormalJobStatus) *MakeAdjustScheme {
|
||||
return &MakeAdjustScheme{
|
||||
Job: job,
|
||||
JobInfo: jobInfo,
|
||||
JobStatus: jobStatus,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ package task
|
|||
|
||||
import (
|
||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||
)
|
||||
|
||||
|
@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {}
|
|||
|
||||
// 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行
|
||||
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
|
||||
TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]())
|
||||
TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]())
|
||||
|
||||
TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]())
|
||||
TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -2,7 +2,7 @@ package task
|
|||
|
||||
import (
|
||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||
)
|
||||
|
||||
|
@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {}
|
|||
|
||||
// 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行
|
||||
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
|
||||
TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]())
|
||||
TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]())
|
||||
|
||||
TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]())
|
||||
TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -12,9 +12,7 @@ type JobService interface {
|
|||
|
||||
JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded) (*JobSetLocalFileUploadedResp, *mq.CodeMessage)
|
||||
|
||||
GetJob(msg *GetJob) (*GetJobResp, *mq.CodeMessage)
|
||||
|
||||
// GetJobSetJobs(msg *GetJobSetJobs) (*GetJobSetJobsResp, *mq.CodeMessage)
|
||||
GetJobSetStatus(msg *GetJobSetStatus) (*GetJobSetStatusResp, *mq.CodeMessage)
|
||||
}
|
||||
|
||||
// 提交任务集
|
||||
|
@ -74,52 +72,28 @@ func (c *Client) JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded, opts ...m
|
|||
return mq.Request(Service.JobSetLocalFileUploaded, c.roundTripper, msg, opts...)
|
||||
}
|
||||
|
||||
// 获取任务数据
|
||||
type GetJob struct {
|
||||
var _ = Register(Service.GetJobSetStatus)
|
||||
|
||||
// 获取任务集的状态
|
||||
type GetJobSetStatus struct {
|
||||
mq.MessageBodyBase
|
||||
JobID schsdk.JobID `json:"jobID"`
|
||||
JobSetID schsdk.JobSetID `json:"jobSetID"`
|
||||
}
|
||||
type GetJobResp struct {
|
||||
type GetJobSetStatusResp struct {
|
||||
mq.MessageBodyBase
|
||||
Job jobmod.Job `json:"job"`
|
||||
Jobs []jobmod.JobStatus `json:"jobs"`
|
||||
}
|
||||
|
||||
func NewGetJob(jobID schsdk.JobID) *GetJob {
|
||||
return &GetJob{
|
||||
JobID: jobID,
|
||||
}
|
||||
}
|
||||
func NewGetJobResp(job jobmod.Job) *GetJobResp {
|
||||
return &GetJobResp{
|
||||
Job: job,
|
||||
}
|
||||
}
|
||||
func (c *Client) GetJob(msg *GetJob, opts ...mq.RequestOption) (*GetJobResp, error) {
|
||||
return mq.Request(Service.GetJob, c.roundTripper, msg, opts...)
|
||||
}
|
||||
|
||||
/*
|
||||
// 获取指定任务集中的所有任务数据
|
||||
type GetJobSetJobs struct {
|
||||
mq.MessageBodyBase
|
||||
JobSetID string `json:"jobSetID"`
|
||||
}
|
||||
type GetJobSetJobsResp struct {
|
||||
mq.MessageBodyBase
|
||||
Jobs []jobmod.Job `json:"jobs"`
|
||||
}
|
||||
|
||||
func NewGetJobSetJobs(jobSetID string) *GetJobSetJobs {
|
||||
return &GetJobSetJobs{
|
||||
func ReqGetJobSetStatus(jobSetID schsdk.JobSetID) *GetJobSetStatus {
|
||||
return &GetJobSetStatus{
|
||||
JobSetID: jobSetID,
|
||||
}
|
||||
}
|
||||
func NewGetJobSetJobsResp(jobs []jobmod.Job) *GetJobSetJobsResp {
|
||||
return &GetJobSetJobsResp{
|
||||
func RespGetJobSetStatus(jobs []jobmod.JobStatus) *GetJobSetStatusResp {
|
||||
return &GetJobSetStatusResp{
|
||||
Jobs: jobs,
|
||||
}
|
||||
}
|
||||
func (c *Client) GetJobSetJobs(msg *GetJobSetJobs, opts ...mq.RequestOption) (*GetJobSetJobsResp, error) {
|
||||
return mq.Request(Service.GetJobSetJobs, c.rabbitCli, msg, opts...)
|
||||
func (c *Client) GetJob(msg *GetJobSetStatus, opts ...mq.RequestOption) (*GetJobSetStatusResp, error) {
|
||||
return mq.Request(Service.GetJobSetStatus, c.roundTripper, msg, opts...)
|
||||
}
|
||||
*/
|
||||
|
|
|
@ -4,7 +4,7 @@ import (
|
|||
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||
execmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
|
||||
myglbs "gitlink.org.cn/cloudream/scheduler/executor/internal/globals"
|
||||
)
|
||||
|
@ -12,7 +12,7 @@ import (
|
|||
func (svc *Service) StartTask(msg *execmq.StartTask) (*execmq.StartTaskResp, *mq.CodeMessage) {
|
||||
tsk, err := svc.taskManager.StartByInfo(msg.Info)
|
||||
if err != nil {
|
||||
logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()).
|
||||
logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()).
|
||||
Warnf("starting task by info: %s", err.Error())
|
||||
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
|
||||
}
|
||||
|
|
|
@ -81,7 +81,7 @@ func (t *PCMSubmitTask) do(taskID string, ctx TaskContext) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
if tsResp.TaskStatus == pcmsdk.TaskStatuFailed {
|
||||
if tsResp.TaskStatus == pcmsdk.TaskStatusFailed {
|
||||
// TODO 返回更详细的信息
|
||||
return fmt.Errorf("task failed")
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ import (
|
|||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/task"
|
||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
reporter "gitlink.org.cn/cloudream/scheduler/executor/internal/reporter"
|
||||
)
|
||||
|
@ -37,7 +37,7 @@ func NewManager(reporter *reporter.Reporter) Manager {
|
|||
}
|
||||
|
||||
func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) {
|
||||
infoType := myreflect.TypeOfValue(info)
|
||||
infoType := reflect2.TypeOfValue(info)
|
||||
|
||||
ctor, ok := taskFromInfoCtors[infoType]
|
||||
if !ok {
|
||||
|
@ -50,7 +50,7 @@ func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) {
|
|||
var taskFromInfoCtors map[reflect.Type]func(exectsk.TaskInfo) TaskBody = make(map[reflect.Type]func(exectsk.TaskInfo) task.TaskBody[TaskContext])
|
||||
|
||||
func Register[TInfo exectsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
|
||||
taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody {
|
||||
taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody {
|
||||
return ctor(info.(TInfo))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ import (
|
|||
"sync"
|
||||
"time"
|
||||
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
"gitlink.org.cn/cloudream/common/utils/sync2"
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
|
||||
|
@ -13,29 +13,23 @@ import (
|
|||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||||
)
|
||||
|
||||
type jobTask struct {
|
||||
JobID schsdk.JobID
|
||||
TaskID string
|
||||
FullTaskID string
|
||||
type task struct {
|
||||
statusChan *sync2.Channel[advtsk.TaskStatus]
|
||||
}
|
||||
|
||||
type AdvisorInfo struct {
|
||||
advisorID schmod.AdvisorID
|
||||
jobTasks map[string]jobTask // key 为 TaskID
|
||||
tasks map[string]task // key 为 TaskID
|
||||
lastReportTime time.Time
|
||||
}
|
||||
|
||||
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus)
|
||||
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
|
||||
var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
|
||||
|
||||
type Manager struct {
|
||||
advisors map[schmod.AdvisorID]*AdvisorInfo
|
||||
lock sync.Mutex
|
||||
advCli *advmq.Client
|
||||
|
||||
onTaskUpdated OnTaskUpdatedCallbackFn
|
||||
onTaskTimeout OnTimeoutCallbackFn
|
||||
|
||||
reportTimeout time.Duration
|
||||
}
|
||||
|
||||
|
@ -52,83 +46,66 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) {
|
|||
}, nil
|
||||
}
|
||||
|
||||
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
|
||||
m.onTaskUpdated = callback
|
||||
}
|
||||
|
||||
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
|
||||
m.onTaskTimeout = callback
|
||||
}
|
||||
|
||||
func (m *Manager) Report(advID schmod.AdvisorID, taskStatus []mgrmq.AdvisorTaskStatus) {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
info, ok := m.advisors[advID]
|
||||
adv, ok := m.advisors[advID]
|
||||
if !ok {
|
||||
info = &AdvisorInfo{
|
||||
adv = &AdvisorInfo{
|
||||
advisorID: advID,
|
||||
jobTasks: make(map[string]jobTask),
|
||||
tasks: make(map[string]task),
|
||||
}
|
||||
m.advisors[advID] = info
|
||||
m.advisors[advID] = adv
|
||||
}
|
||||
|
||||
info.lastReportTime = time.Now()
|
||||
adv.lastReportTime = time.Now()
|
||||
|
||||
for _, s := range taskStatus {
|
||||
tsk, ok := info.jobTasks[s.TaskID]
|
||||
tsk, ok := adv.tasks[s.TaskID]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status)
|
||||
// TODO 考虑主动检测channel是否关闭,然后取消task
|
||||
if tsk.statusChan.Send(s.Status) != nil {
|
||||
delete(adv.tasks, s.TaskID)
|
||||
|
||||
if len(adv.tasks) == 0 {
|
||||
delete(m.advisors, advID)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 启动一个Task,并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID
|
||||
func (m *Manager) StartTask(jobID schsdk.JobID, info advtsk.TaskInfo) (string, error) {
|
||||
// 启动一个Task
|
||||
func (m *Manager) StartTask(info advtsk.TaskInfo) *sync2.Channel[advtsk.TaskStatus] {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
ch := sync2.NewChannel[advtsk.TaskStatus]()
|
||||
|
||||
resp, err := m.advCli.StartTask(advmq.NewStartTask(info))
|
||||
if err != nil {
|
||||
return "", err
|
||||
ch.CloseWithError(fmt.Errorf("start task: %w", err))
|
||||
return ch
|
||||
}
|
||||
|
||||
fullTaskID := fmt.Sprintf("%s-%s", resp.AdvisorID, resp.TaskID)
|
||||
|
||||
exeInfo, ok := m.advisors[resp.AdvisorID]
|
||||
if !ok {
|
||||
exeInfo = &AdvisorInfo{
|
||||
advisorID: resp.AdvisorID,
|
||||
jobTasks: make(map[string]jobTask),
|
||||
tasks: make(map[string]task),
|
||||
lastReportTime: time.Now(),
|
||||
}
|
||||
m.advisors[resp.AdvisorID] = exeInfo
|
||||
}
|
||||
|
||||
exeInfo.jobTasks[resp.TaskID] = jobTask{
|
||||
JobID: jobID,
|
||||
TaskID: resp.TaskID,
|
||||
FullTaskID: fullTaskID,
|
||||
exeInfo.tasks[resp.TaskID] = task{
|
||||
statusChan: ch,
|
||||
}
|
||||
|
||||
return fullTaskID, nil
|
||||
}
|
||||
|
||||
// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新
|
||||
func (m *Manager) ForgetTask(fullTaskID string) {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
for _, exe := range m.advisors {
|
||||
for _, tsk := range exe.jobTasks {
|
||||
if tsk.FullTaskID == fullTaskID {
|
||||
delete(exe.jobTasks, fullTaskID)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
return ch
|
||||
}
|
||||
|
||||
func (m *Manager) Serve() error {
|
||||
|
@ -150,8 +127,8 @@ func (m *Manager) Serve() error {
|
|||
continue
|
||||
}
|
||||
|
||||
for _, tsk := range exeInfo.jobTasks {
|
||||
m.onTaskTimeout(tsk.JobID, tsk.FullTaskID)
|
||||
for _, tsk := range exeInfo.tasks {
|
||||
tsk.statusChan.CloseWithError(ErrWaitReportTimeout)
|
||||
}
|
||||
|
||||
delete(m.advisors, exeID)
|
||||
|
|
|
@ -5,7 +5,7 @@ import (
|
|||
"sync"
|
||||
"time"
|
||||
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
"gitlink.org.cn/cloudream/common/utils/sync2"
|
||||
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
|
@ -14,29 +14,22 @@ import (
|
|||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||||
)
|
||||
|
||||
type jobTask struct {
|
||||
JobID schsdk.JobID
|
||||
TaskID string
|
||||
FullTaskID string
|
||||
type task struct {
|
||||
statusChan *sync2.Channel[exetsk.TaskStatus]
|
||||
}
|
||||
|
||||
type ExecutorInfo struct {
|
||||
type ExecutorStatus struct {
|
||||
executorID schmod.ExecutorID
|
||||
jobTasks map[string]jobTask // key 为 TaskID
|
||||
tasks map[string]task // key 为 TaskID
|
||||
lastReportTime time.Time
|
||||
}
|
||||
|
||||
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus exetsk.TaskStatus)
|
||||
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
|
||||
var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
|
||||
|
||||
type Manager struct {
|
||||
executors map[schmod.ExecutorID]*ExecutorInfo
|
||||
executors map[schmod.ExecutorID]*ExecutorStatus
|
||||
lock sync.Mutex
|
||||
exeCli *exemq.Client
|
||||
|
||||
onTaskUpdated OnTaskUpdatedCallbackFn
|
||||
onTaskTimeout OnTimeoutCallbackFn
|
||||
|
||||
reportTimeout time.Duration
|
||||
}
|
||||
|
||||
|
@ -47,89 +40,71 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) {
|
|||
}
|
||||
|
||||
return &Manager{
|
||||
executors: make(map[schmod.ExecutorID]*ExecutorInfo),
|
||||
executors: make(map[schmod.ExecutorID]*ExecutorStatus),
|
||||
exeCli: exeCli,
|
||||
reportTimeout: reportTimeout,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
|
||||
m.onTaskUpdated = callback
|
||||
}
|
||||
|
||||
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
|
||||
m.onTaskTimeout = callback
|
||||
}
|
||||
|
||||
func (m *Manager) Report(execID schmod.ExecutorID, taskStatus []mgrmq.ExecutorTaskStatus) {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
info, ok := m.executors[execID]
|
||||
exec, ok := m.executors[execID]
|
||||
if !ok {
|
||||
info = &ExecutorInfo{
|
||||
exec = &ExecutorStatus{
|
||||
executorID: execID,
|
||||
jobTasks: make(map[string]jobTask),
|
||||
tasks: make(map[string]task),
|
||||
}
|
||||
m.executors[execID] = info
|
||||
m.executors[execID] = exec
|
||||
}
|
||||
|
||||
info.lastReportTime = time.Now()
|
||||
exec.lastReportTime = time.Now()
|
||||
|
||||
for _, s := range taskStatus {
|
||||
tsk, ok := info.jobTasks[s.TaskID]
|
||||
tsk, ok := exec.tasks[s.TaskID]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
|
||||
m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status)
|
||||
// TODO 考虑主动检测channel是否关闭,然后取消task
|
||||
if tsk.statusChan.Send(s.Status) != nil {
|
||||
delete(exec.tasks, s.TaskID)
|
||||
|
||||
if len(exec.tasks) == 0 {
|
||||
delete(m.executors, execID)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 启动一个Task,并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID
|
||||
func (m *Manager) StartTask(jobID schsdk.JobID, info exetsk.TaskInfo) (string, error) {
|
||||
// 启动一个Task
|
||||
func (m *Manager) StartTask(info exetsk.TaskInfo) *sync2.Channel[exetsk.TaskStatus] {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
ch := sync2.NewChannel[exetsk.TaskStatus]()
|
||||
|
||||
resp, err := m.exeCli.StartTask(exemq.NewStartTask(info))
|
||||
if err != nil {
|
||||
return "", err
|
||||
ch.CloseWithError(fmt.Errorf("start task: %w", err))
|
||||
return ch
|
||||
}
|
||||
|
||||
fullTaskID := fmt.Sprintf("%s-%s", resp.ExecutorID, resp.TaskID)
|
||||
|
||||
exeInfo, ok := m.executors[resp.ExecutorID]
|
||||
if !ok {
|
||||
exeInfo = &ExecutorInfo{
|
||||
exeInfo = &ExecutorStatus{
|
||||
executorID: resp.ExecutorID,
|
||||
jobTasks: make(map[string]jobTask),
|
||||
tasks: make(map[string]task),
|
||||
lastReportTime: time.Now(),
|
||||
}
|
||||
m.executors[resp.ExecutorID] = exeInfo
|
||||
}
|
||||
|
||||
exeInfo.jobTasks[resp.TaskID] = jobTask{
|
||||
JobID: jobID,
|
||||
TaskID: resp.TaskID,
|
||||
FullTaskID: fullTaskID,
|
||||
exeInfo.tasks[resp.TaskID] = task{
|
||||
statusChan: ch,
|
||||
}
|
||||
|
||||
return fullTaskID, nil
|
||||
}
|
||||
|
||||
// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新
|
||||
func (m *Manager) ForgetTask(fullTaskID string) {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
for _, exe := range m.executors {
|
||||
for _, tsk := range exe.jobTasks {
|
||||
if tsk.FullTaskID == fullTaskID {
|
||||
delete(exe.jobTasks, fullTaskID)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
return ch
|
||||
}
|
||||
|
||||
func (m *Manager) Serve() error {
|
||||
|
@ -151,8 +126,8 @@ func (m *Manager) Serve() error {
|
|||
continue
|
||||
}
|
||||
|
||||
for _, tsk := range exeInfo.jobTasks {
|
||||
m.onTaskTimeout(tsk.JobID, tsk.FullTaskID)
|
||||
for _, tsk := range exeInfo.tasks {
|
||||
tsk.statusChan.CloseWithError(ErrWaitReportTimeout)
|
||||
}
|
||||
|
||||
delete(m.executors, exeID)
|
||||
|
|
|
@ -1,371 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type adjustingJob struct {
|
||||
job *jobmod.NormalJob
|
||||
state *jobmod.StateAdjusting
|
||||
ccInfo schmod.ComputingCenter
|
||||
}
|
||||
|
||||
type AdjustingHandler struct {
|
||||
mgr *Manager
|
||||
|
||||
jobs map[schsdk.JobID]*adjustingJob
|
||||
|
||||
cmdChan actor.CommandChannel
|
||||
}
|
||||
|
||||
func NewAdjustingHandler(mgr *Manager) *AdjustingHandler {
|
||||
return &AdjustingHandler{
|
||||
mgr: mgr,
|
||||
jobs: make(map[schsdk.JobID]*adjustingJob),
|
||||
cmdChan: *actor.NewCommandChannel(),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) Handle(job jobmod.Job) {
|
||||
h.cmdChan.Send(func() {
|
||||
norJob, ok := job.(*jobmod.NormalJob)
|
||||
if !ok {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState()))
|
||||
return
|
||||
}
|
||||
|
||||
adjustingState, ok := norJob.GetState().(*jobmod.StateAdjusting)
|
||||
if !ok {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
||||
return
|
||||
}
|
||||
|
||||
colCli, err := schglb.CollectorMQPool.Acquire()
|
||||
if err != nil {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.GetState()))
|
||||
return
|
||||
}
|
||||
defer schglb.CollectorMQPool.Release(colCli)
|
||||
|
||||
ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), adjustingState.Scheme.TargetCCID)
|
||||
if err != nil {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.GetState()))
|
||||
return
|
||||
}
|
||||
|
||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||
if err != nil {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new cloudream storage client: %s", err.Error()), job.GetState()))
|
||||
return
|
||||
}
|
||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||
|
||||
stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{
|
||||
StorageID: ccInfo.CDSStorageID,
|
||||
})
|
||||
if err != nil {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting cloudream storage info: %s", err.Error()), job.GetState()))
|
||||
return
|
||||
}
|
||||
|
||||
norJob.TargetCCID = adjustingState.Scheme.TargetCCID
|
||||
// TODO UserID
|
||||
norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, norJob.JobID)
|
||||
|
||||
adjJob := &adjustingJob{
|
||||
job: norJob,
|
||||
state: adjustingState,
|
||||
ccInfo: ccInfo,
|
||||
}
|
||||
h.jobs[job.GetJobID()] = adjJob
|
||||
|
||||
h.onJobEvent(nil, adjJob)
|
||||
})
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) onJobEvent(evt event.Event, job *adjustingJob) {
|
||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
||||
return
|
||||
}
|
||||
|
||||
err := h.doPackageScheduling(evt, job,
|
||||
job.job.Info.Files.Dataset, &job.job.Files.Dataset,
|
||||
&job.state.Scheme.Dataset, &job.state.Dataset,
|
||||
)
|
||||
if err != nil {
|
||||
job.state.Dataset.Error = err.Error()
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
||||
return
|
||||
}
|
||||
|
||||
err = h.doPackageScheduling(evt, job,
|
||||
job.job.Info.Files.Code, &job.job.Files.Code,
|
||||
&job.state.Scheme.Code, &job.state.Code,
|
||||
)
|
||||
if err != nil {
|
||||
job.state.Code.Error = err.Error()
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
||||
return
|
||||
}
|
||||
|
||||
err = h.doImageScheduling(evt, job,
|
||||
job.job.Info.Files.Image, &job.job.Files.Image,
|
||||
&job.state.Scheme.Image, &job.state.Image,
|
||||
)
|
||||
if err != nil {
|
||||
job.state.Image.Error = err.Error()
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
||||
return
|
||||
}
|
||||
|
||||
// 如果三种文件都调度完成,则可以进入下个阶段了
|
||||
if job.state.Dataset.Step == jobmod.StepCompleted &&
|
||||
job.state.Code.Step == jobmod.StepCompleted &&
|
||||
job.state.Image.Step == jobmod.StepCompleted {
|
||||
|
||||
h.changeJobState(job.job, jobmod.NewStateReadyToExecute())
|
||||
}
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
||||
job.SetState(state)
|
||||
|
||||
delete(h.jobs, job.GetJobID())
|
||||
|
||||
h.mgr.pubLock.Lock()
|
||||
h.mgr.handleState(job)
|
||||
h.mgr.pubLock.Unlock()
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) doPackageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
|
||||
if state.Step == jobmod.StepBegin {
|
||||
state.Step = jobmod.StepUploaded
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepUploaded {
|
||||
if scheme.Action == jobmod.ActionNo {
|
||||
state.Step = jobmod.StepCompleted
|
||||
return nil
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionMove {
|
||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID))
|
||||
if err != nil {
|
||||
return fmt.Errorf("starting cache move package: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepMoving
|
||||
state.FullTaskID = fullTaskID
|
||||
return nil
|
||||
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionLoad {
|
||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID))
|
||||
if err != nil {
|
||||
return fmt.Errorf("starting stroage load package: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepLoading
|
||||
state.FullTaskID = fullTaskID
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepMoving {
|
||||
moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
|
||||
if err == event.ErrUnconcernedTask {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err == event.ErrTaskTimeout {
|
||||
return fmt.Errorf("cache move package timeout")
|
||||
}
|
||||
|
||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
||||
|
||||
if moveRet.Error != "" {
|
||||
return fmt.Errorf("cache move pacakge: %s", moveRet.Error)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepCompleted
|
||||
return nil
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepLoading {
|
||||
loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID)
|
||||
if err == event.ErrUnconcernedTask {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err == event.ErrTaskTimeout {
|
||||
return fmt.Errorf("storage load package timeout")
|
||||
}
|
||||
|
||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
||||
|
||||
if loadRet.Error != "" {
|
||||
return fmt.Errorf("storage load package: %s", loadRet.Error)
|
||||
}
|
||||
|
||||
file.FullPath = loadRet.FullPath
|
||||
|
||||
state.Step = jobmod.StepCompleted
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) doImageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
|
||||
if state.Step == jobmod.StepBegin {
|
||||
state.Step = jobmod.StepUploaded
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepUploaded {
|
||||
if scheme.Action == jobmod.ActionNo {
|
||||
state.Step = jobmod.StepCompleted
|
||||
return nil
|
||||
}
|
||||
|
||||
// 要导入镜像,则需要先将镜像移动到指点节点的缓存中
|
||||
if scheme.Action == jobmod.ActionImportImage {
|
||||
if file.PackageID == nil {
|
||||
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID)
|
||||
}
|
||||
|
||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID))
|
||||
if err != nil {
|
||||
return fmt.Errorf("starting cache move package: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepMoving
|
||||
state.FullTaskID = fullTaskID
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepMoving {
|
||||
cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
|
||||
if err == event.ErrUnconcernedTask {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err == event.ErrTaskTimeout {
|
||||
return fmt.Errorf("cache move package timeout")
|
||||
}
|
||||
|
||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
||||
|
||||
if cacheMoveRet.Error != "" {
|
||||
return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error)
|
||||
}
|
||||
|
||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||
if err != nil {
|
||||
return fmt.Errorf("new cloudream storage client: %w", err)
|
||||
}
|
||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||
|
||||
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting package objects: %w", err)
|
||||
}
|
||||
|
||||
if len(pkgObjs.Objects) != 1 {
|
||||
return fmt.Errorf("there must be only 1 object in the package that will be imported")
|
||||
}
|
||||
|
||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
||||
if err != nil {
|
||||
return fmt.Errorf("starting import image: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepImageImporting
|
||||
state.FullTaskID = fullTaskID
|
||||
return nil
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepImageImporting {
|
||||
uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID)
|
||||
if err == event.ErrUnconcernedTask {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err == event.ErrTaskTimeout {
|
||||
return fmt.Errorf("import image timeout")
|
||||
}
|
||||
|
||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
||||
|
||||
if uploadImageRet.Error != "" {
|
||||
return fmt.Errorf("import image: %s", uploadImageRet.Error)
|
||||
}
|
||||
|
||||
// 调整过程中不会更换镜像,所以ImageID不会发生变化
|
||||
err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now())
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating pcm image info: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepCompleted
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
h.cmdChan.Send(func() {
|
||||
if broadcast.ToAll() {
|
||||
for _, job := range h.jobs {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
|
||||
} else if broadcast.ToJobSet() {
|
||||
for _, job := range h.jobs {
|
||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
||||
continue
|
||||
}
|
||||
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
} else if broadcast.ToJob() {
|
||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) Serve() {
|
||||
cmdChan := h.cmdChan.BeginChanReceive()
|
||||
defer h.cmdChan.CloseChanReceive()
|
||||
|
||||
for {
|
||||
select {
|
||||
case cmd := <-cmdChan:
|
||||
cmd()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (h *AdjustingHandler) Stop() {
|
||||
// TODO 支持STOP
|
||||
}
|
|
@ -1,63 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type CompleteHandler struct {
|
||||
mgr *Manager
|
||||
}
|
||||
|
||||
func NewCompleteHandler(mgr *Manager) *CompleteHandler {
|
||||
return &CompleteHandler{
|
||||
mgr: mgr,
|
||||
}
|
||||
}
|
||||
|
||||
func (h *CompleteHandler) Handle(job jobmod.Job) {
|
||||
// TODO 可以考虑将执行记录落库
|
||||
if state, ok := job.GetState().(*jobmod.StateSuccess); ok {
|
||||
h.handleSuccess(job, state)
|
||||
} else if state, ok := job.GetState().(*jobmod.StateFailed); ok {
|
||||
h.handleFailed(job, state)
|
||||
} else {
|
||||
state := jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())
|
||||
job.SetState(state)
|
||||
h.handleFailed(job, state)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *CompleteHandler) handleSuccess(job jobmod.Job, state *jobmod.StateSuccess) {
|
||||
logger.WithField("JobID", job.GetJobID()).Infof("job completed successfuly")
|
||||
|
||||
h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job))
|
||||
}
|
||||
|
||||
func (h *CompleteHandler) handleFailed(job jobmod.Job, state *jobmod.StateFailed) {
|
||||
logger.
|
||||
WithField("JobID", job.GetJobID()).
|
||||
WithField("LastState", reflect.TypeOf(state.LastState).String()).
|
||||
Infof("job failed with: %v", state.Error)
|
||||
|
||||
h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job))
|
||||
}
|
||||
|
||||
func (h *CompleteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
||||
cloneEvt.Callback.SetError(fmt.Errorf("job not found"))
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func (h *CompleteHandler) Serve() {
|
||||
|
||||
}
|
||||
|
||||
func (h *CompleteHandler) Stop() {
|
||||
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type DefaultHandler struct {
|
||||
mgr *Manager
|
||||
}
|
||||
|
||||
func NewDefaultHandler(mgr *Manager) *DefaultHandler {
|
||||
return &DefaultHandler{
|
||||
mgr: mgr,
|
||||
}
|
||||
}
|
||||
|
||||
// 处理Job。在此期间全局锁已锁定
|
||||
func (h *DefaultHandler) Handle(job jobmod.Job) {
|
||||
state := job.GetState()
|
||||
if state == nil {
|
||||
job.SetState(jobmod.NewStateFailed("unexpected nil state", nil))
|
||||
h.mgr.handleState(job)
|
||||
return
|
||||
}
|
||||
|
||||
if _, ok := state.(*jobmod.StateFailed); ok {
|
||||
logger.Warnf("state failed should not be handled by default handler")
|
||||
return
|
||||
}
|
||||
|
||||
job.SetState(jobmod.NewStateFailed("no handler for this state", state))
|
||||
h.mgr.handleState(job)
|
||||
}
|
||||
|
||||
// 外部发生了一个事件
|
||||
func (h *DefaultHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
|
||||
}
|
||||
|
||||
// 运行Handler
|
||||
func (h *DefaultHandler) Serve() {
|
||||
|
||||
}
|
||||
|
||||
// 停止此Handler
|
||||
func (h *DefaultHandler) Stop() {
|
||||
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
package event
|
||||
package jobmgr
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
@ -11,6 +11,8 @@ var ErrUnconcernedTask = errors.New("unconcerned task")
|
|||
|
||||
var ErrTaskTimeout = errors.New("task timeout")
|
||||
|
||||
var ErrJobCancelled = errors.New("job cancelled")
|
||||
|
||||
type Event interface{}
|
||||
|
||||
type BroadcastType string
|
|
@ -1,12 +0,0 @@
|
|||
package event
|
||||
|
||||
// advisor的任务执行超时
|
||||
type AdvisorTaskTimeout struct {
|
||||
FullTaskID string
|
||||
}
|
||||
|
||||
func NewAdvisorTaskTimeout(fullTaskID string) *AdvisorTaskTimeout {
|
||||
return &AdvisorTaskTimeout{
|
||||
FullTaskID: fullTaskID,
|
||||
}
|
||||
}
|
|
@ -1,46 +0,0 @@
|
|||
package event
|
||||
|
||||
import advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||||
|
||||
// advisor上报任务进度
|
||||
type AdvisorTaskUpdated struct {
|
||||
FullTaskID string
|
||||
TaskStatus advtsk.TaskStatus
|
||||
}
|
||||
|
||||
func NewAdvisorTaskUpdated(fullTaskID string, taskStatus advtsk.TaskStatus) *AdvisorTaskUpdated {
|
||||
return &AdvisorTaskUpdated{
|
||||
FullTaskID: fullTaskID,
|
||||
TaskStatus: taskStatus,
|
||||
}
|
||||
}
|
||||
|
||||
func AssertAdvisorTaskStatus[T advtsk.TaskStatus](evt Event, fullTaskID string) (T, error) {
|
||||
var ret T
|
||||
if evt == nil {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
if reportTaskStatus, ok := evt.(*AdvisorTaskUpdated); ok {
|
||||
if reportTaskStatus.FullTaskID != fullTaskID {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
status, ok := reportTaskStatus.TaskStatus.(T)
|
||||
if !ok {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
if taskTimeout, ok := evt.(*AdvisorTaskTimeout); ok {
|
||||
if taskTimeout.FullTaskID != fullTaskID {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
return ret, ErrTaskTimeout
|
||||
}
|
||||
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
package event
|
||||
|
||||
type Cancel struct {
|
||||
}
|
|
@ -1,14 +0,0 @@
|
|||
package event
|
||||
|
||||
import (
|
||||
"gitlink.org.cn/cloudream/common/pkgs/future"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
)
|
||||
|
||||
type CloneJob struct {
|
||||
Callback future.SetValueFuture[jobmod.Job]
|
||||
}
|
||||
|
||||
func NewCloneJob() *CloneJob {
|
||||
return &CloneJob{}
|
||||
}
|
|
@ -1,12 +0,0 @@
|
|||
package event
|
||||
|
||||
// executor的任务执行超时
|
||||
type ExecutorTaskTimeout struct {
|
||||
FullTaskID string
|
||||
}
|
||||
|
||||
func NewExecutorTaskTimeout(fullTaskID string) *ExecutorTaskTimeout {
|
||||
return &ExecutorTaskTimeout{
|
||||
FullTaskID: fullTaskID,
|
||||
}
|
||||
}
|
|
@ -1,48 +0,0 @@
|
|||
package event
|
||||
|
||||
import (
|
||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
)
|
||||
|
||||
// executor上报任务进度
|
||||
type ExecutorTaskUpdated struct {
|
||||
FullTaskID string
|
||||
TaskStatus exectsk.TaskStatus
|
||||
}
|
||||
|
||||
func NewExecutorTaskUpdated(fullTaskID string, taskStatus exectsk.TaskStatus) *ExecutorTaskUpdated {
|
||||
return &ExecutorTaskUpdated{
|
||||
FullTaskID: fullTaskID,
|
||||
TaskStatus: taskStatus,
|
||||
}
|
||||
}
|
||||
|
||||
func AssertExecutorTaskStatus[T exectsk.TaskStatus](evt Event, fullTaskID string) (T, error) {
|
||||
var ret T
|
||||
if evt == nil {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
if reportTaskStatus, ok := evt.(*ExecutorTaskUpdated); ok {
|
||||
if reportTaskStatus.FullTaskID != fullTaskID {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
status, ok := reportTaskStatus.TaskStatus.(T)
|
||||
if !ok {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
return status, nil
|
||||
}
|
||||
|
||||
if taskTimeout, ok := evt.(*ExecutorTaskTimeout); ok {
|
||||
if taskTimeout.FullTaskID != fullTaskID {
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
||||
|
||||
return ret, ErrTaskTimeout
|
||||
}
|
||||
|
||||
return ret, ErrUnconcernedTask
|
||||
}
|
|
@ -1,16 +1,18 @@
|
|||
package event
|
||||
|
||||
import (
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
)
|
||||
|
||||
// 任务结束,包括成功或者失败
|
||||
type JobCompleted struct {
|
||||
Job jobmod.Job
|
||||
Job *jobmgr.Job
|
||||
Err error
|
||||
}
|
||||
|
||||
func NewJobCompleted(job jobmod.Job) *JobCompleted {
|
||||
func NewJobCompleted(job *jobmgr.Job, err error) *JobCompleted {
|
||||
return &JobCompleted{
|
||||
Job: job,
|
||||
Err: err,
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,21 +1,18 @@
|
|||
package event
|
||||
|
||||
import (
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
)
|
||||
|
||||
// 本地文件上传结束
|
||||
type LocalFileUploaded struct {
|
||||
JobSetID schsdk.JobSetID
|
||||
LocalPath string
|
||||
Error string
|
||||
Error error
|
||||
PackageID cdssdk.PackageID
|
||||
}
|
||||
|
||||
func NewLocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) *LocalFileUploaded {
|
||||
func NewLocalFileUploaded(localPath string, err error, packageID cdssdk.PackageID) *LocalFileUploaded {
|
||||
return &LocalFileUploaded{
|
||||
JobSetID: jobSetID,
|
||||
LocalPath: localPath,
|
||||
Error: err,
|
||||
PackageID: packageID,
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
package event
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
)
|
||||
|
||||
func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, bool) {
|
||||
ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
|
||||
_, ok := evt.(T)
|
||||
return ok
|
||||
})
|
||||
return ret.(T), ok
|
||||
}
|
||||
|
||||
func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond func(val T) bool) (T, bool) {
|
||||
ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
|
||||
e, ok := evt.(T)
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
|
||||
return cond(e)
|
||||
})
|
||||
return ret.(T), ok
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/future"
|
||||
"gitlink.org.cn/cloudream/common/utils/lo2"
|
||||
)
|
||||
|
||||
type EventWaitCondition func(evt Event) bool
|
||||
|
||||
type EventWaiter struct {
|
||||
condition EventWaitCondition
|
||||
future *future.SetValueFuture[Event]
|
||||
}
|
||||
|
||||
type EventSet struct {
|
||||
events []Event
|
||||
waiters []EventWaiter
|
||||
lock sync.Mutex
|
||||
}
|
||||
|
||||
func NewEventSet() EventSet {
|
||||
return EventSet{}
|
||||
}
|
||||
|
||||
func (s *EventSet) Post(evt Event) {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
// 一个事件能唤醒多个等待者
|
||||
used := false
|
||||
for i, waiter := range s.waiters {
|
||||
if waiter.condition(evt) {
|
||||
s.waiters = lo2.RemoveAt(s.waiters, i)
|
||||
waiter.future.SetValue(evt)
|
||||
used = true
|
||||
}
|
||||
}
|
||||
|
||||
if !used {
|
||||
s.events = append(s.events, evt)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) {
|
||||
s.lock.Lock()
|
||||
defer s.lock.Unlock()
|
||||
|
||||
// 一个等待者只能等待一个事件
|
||||
for i, evt := range s.events {
|
||||
if cond(evt) {
|
||||
s.events = lo2.RemoveAt(s.events, i)
|
||||
return evt, true
|
||||
}
|
||||
}
|
||||
|
||||
fut := future.NewSetValue[Event]()
|
||||
waiter := EventWaiter{
|
||||
condition: cond,
|
||||
future: fut,
|
||||
}
|
||||
s.events = append(s.events, waiter)
|
||||
|
||||
val, err := fut.WaitValue(ctx)
|
||||
if err != nil {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
return val, true
|
||||
}
|
|
@ -1,264 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type executingJob struct {
|
||||
job jobmod.Job
|
||||
state *jobmod.StateExecuting
|
||||
}
|
||||
|
||||
type ExecutingHandler struct {
|
||||
mgr *Manager
|
||||
|
||||
jobs map[schsdk.JobID]*executingJob
|
||||
|
||||
cmdChan actor.CommandChannel
|
||||
}
|
||||
|
||||
func NewExecutingHandler(mgr *Manager) *ExecutingHandler {
|
||||
return &ExecutingHandler{
|
||||
mgr: mgr,
|
||||
jobs: make(map[schsdk.JobID]*executingJob),
|
||||
cmdChan: *actor.NewCommandChannel(),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ExecutingHandler) Handle(job jobmod.Job) {
|
||||
h.cmdChan.Send(func() {
|
||||
state, ok := job.GetState().(*jobmod.StateExecuting)
|
||||
if !ok {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
||||
return
|
||||
}
|
||||
|
||||
rjob := &executingJob{
|
||||
job: job,
|
||||
state: state,
|
||||
}
|
||||
h.jobs[job.GetJobID()] = rjob
|
||||
|
||||
h.onJobEvent(nil, rjob)
|
||||
})
|
||||
}
|
||||
|
||||
func (h *ExecutingHandler) onJobEvent(evt event.Event, job *executingJob) {
|
||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
||||
return
|
||||
}
|
||||
|
||||
if norJob, ok := job.job.(*jobmod.NormalJob); ok {
|
||||
h.onNormalJobEvent(evt, job, norJob)
|
||||
} else if resJob, ok := job.job.(*jobmod.ResourceJob); ok {
|
||||
h.onResourceJobEvent(evt, job, resJob)
|
||||
}
|
||||
}
|
||||
|
||||
// onNormalJobEvent drives a NormalJob through its remote PCM task:
// the first call (no task ID recorded yet) submits the task; subsequent
// executor status events move the job to Success or Failed.
func (h *ExecutingHandler) onNormalJobEvent(evt event.Event, job *executingJob, norJob *jobmod.NormalJob) {
	if job.state.FullTaskID == "" {
		// No task submitted yet: look up the PCM image registered for the
		// target computing center, the center itself, and its resources.
		pcmImgInfo, err := h.mgr.db.PCMImage().GetByImageIDAndCCID(h.mgr.db.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed("getting pcm image info: "+err.Error(), job.state))
			return
		}

		ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state))
			return
		}

		// TODO env vars such as DATA_IN / DATA_OUT need to be added here,
		// sourced from the job's info.
		ress, err := h.mgr.db.CCResource().GetByCCID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center resource info: %s", err.Error()), job.state))
			return
		}
		if len(ress) == 0 {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("there is no resource at computing center %v", norJob.TargetCCID), job.state))
			return
		}

		fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(),
			exetsk.NewSubmitTask(
				ccInfo.PCMParticipantID,
				pcmImgInfo.PCMImageID,
				// TODO resource-selection algorithm; currently the first resource is used
				ress[0].PCMResourceID,
				norJob.Info.Runtime.Command,
				norJob.Info.Runtime.Envs,
			))
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
			return
		}

		// Remember the task so later status events can be matched to it.
		job.state.FullTaskID = fullTaskID
	}

	// React only to status updates belonging to our task; events for
	// unrelated tasks yield ErrUnconcernedTask and are ignored.
	if execRet, err := event.AssertExecutorTaskStatus[*exetsk.SubmitTaskStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
		if err == event.ErrTaskTimeout {
			h.changeJobState(job.job, jobmod.NewStateFailed("schedule task timeout", job.state))
			return
		}

		logger.WithField("JobID", job.job.GetJobID()).
			WithField("State", reflect.TypeOf(job.state).String()).
			Infof("pcm task state change to: %s", execRet.Status)

		if execRet.Status == pcmsdk.TaskStatusSuccess {
			h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
			h.changeJobState(job.job, jobmod.NewStateSuccess())

			// NOTE(review): the constant name "TaskStatuFailed" looks
			// misspelled upstream in pcmsdk — confirm and fix at its source.
		} else if execRet.Status == pcmsdk.TaskStatuFailed {
			h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
			h.changeJobState(job.job, jobmod.NewStateFailed(execRet.Error, job.state))
		}
	}
}
|
||||
|
||||
// onResourceJobEvent drives a ResourceJob: the first call starts a
// StorageCreatePackage task that packages the target job's output into a
// bucket; later executor status events complete the job with the created
// package ID.
func (h *ExecutingHandler) onResourceJobEvent(evt event.Event, job *executingJob, resJob *jobmod.ResourceJob) {
	if job.state.FullTaskID == "" {
		// Resolve the target job through its job set. pubLock guards the
		// manager's jobSets/jobs maps; it is released manually on every
		// early-exit path below.
		h.mgr.pubLock.Lock()
		jobSet, ok := h.mgr.jobSets[resJob.GetJobSetID()]
		if !ok {
			h.mgr.pubLock.Unlock()
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", resJob.GetJobSetID()), job.state))
			return
		}

		ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID)
		if ref == nil {
			h.mgr.pubLock.Unlock()
			h.changeJobState(job.job, jobmod.NewStateFailed(
				fmt.Sprintf("job %s not found in job set %s",
					resJob.Info.TargetLocalJobID,
					resJob.GetJobSetID()),
				job.state,
			))
			return
		}

		targetJob, ok := h.mgr.jobs[ref.JobID]
		h.mgr.pubLock.Unlock()

		if !ok {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state))
			return
		}

		// Only NormalJobs produce output that can be packaged.
		tarNorJob, ok := targetJob.Job.(*jobmod.NormalJob)
		if !ok {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job(%v) %s is not a Normal job", reflect.TypeOf(targetJob), ref.JobID), job.state))
			return
		}

		// NOTE(review): colCli is acquired but never used below — confirm
		// whether the collector client is still needed here.
		colCli, err := schglb.CollectorMQPool.Acquire()
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.state))
			return
		}
		defer schglb.CollectorMQPool.Release(colCli)

		ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), tarNorJob.TargetCCID)
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state))
			return
		}

		fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(), exetsk.NewStorageCreatePackage(
			1, // TODO user ID
			ccInfo.CDSStorageID,
			tarNorJob.OutputFullPath,
			resJob.Info.BucketID,
			utils.MakeResourcePackageName(resJob.JobID),
		))
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
			return
		}

		// Remember the task so later status events can be matched to it.
		job.state.FullTaskID = fullTaskID
	}

	// React only to status updates belonging to our create-package task.
	if createRet, err := event.AssertExecutorTaskStatus[*exetsk.StorageCreatePackageStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
		if err == event.ErrTaskTimeout {
			h.changeJobState(job.job, jobmod.NewStateFailed("storage create package timeout", job.state))
			return
		}

		h.mgr.execMgr.ForgetTask(job.state.FullTaskID)

		if createRet.Error != "" {
			h.changeJobState(job.job, jobmod.NewStateFailed(createRet.Error, job.state))
			return
		}

		// Record the package produced by the task and finish successfully.
		resJob.ResourcePackageID = createRet.PackageID

		h.changeJobState(job.job, jobmod.NewStateSuccess())
	}
}
|
||||
|
||||
func (h *ExecutingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
||||
job.SetState(state)
|
||||
|
||||
delete(h.jobs, job.GetJobID())
|
||||
|
||||
h.mgr.pubLock.Lock()
|
||||
h.mgr.handleState(job)
|
||||
h.mgr.pubLock.Unlock()
|
||||
}
|
||||
|
||||
func (h *ExecutingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
h.cmdChan.Send(func() {
|
||||
if broadcast.ToAll() {
|
||||
for _, job := range h.jobs {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
|
||||
} else if broadcast.ToJobSet() {
|
||||
for _, job := range h.jobs {
|
||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
||||
continue
|
||||
}
|
||||
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
} else if broadcast.ToJob() {
|
||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (h *ExecutingHandler) Serve() {
|
||||
cmdChan := h.cmdChan.BeginChanReceive()
|
||||
defer h.cmdChan.CloseChanReceive()
|
||||
|
||||
for {
|
||||
select {
|
||||
case cmd := <-cmdChan:
|
||||
cmd()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Stop is intended to terminate the Serve loop.
// Currently a no-op: Serve never exits.
func (h *ExecutingHandler) Stop() {
	// TODO support stopping the Serve loop
}
|
|
@ -0,0 +1,88 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"github.com/samber/lo"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
)
|
||||
|
||||
// FileScheduleAction describes what must be done to make one of a job's
// files available at the target computing center.
type FileScheduleAction string

// File scheduling actions.
const (
	ActionNo          FileScheduleAction = "No"          // nothing to do
	ActionMove        FileScheduleAction = "Move"        // cache the package on the target node
	ActionLoad        FileScheduleAction = "Load"        // load the package into Storage
	ActionImportImage FileScheduleAction = "ImportImage" // import the image into the computing center
)

// FileScheduleScheme is the scheduling plan for a single job file.
type FileScheduleScheme struct {
	Action FileScheduleAction `json:"action"`
}

// JobScheduleScheme is the scheduling plan for a whole job: the chosen
// target computing center plus the per-file actions needed there.
type JobScheduleScheme struct {
	TargetCCID schsdk.CCID        `json:"targetCCID"`
	Dataset    FileScheduleScheme `json:"dataset"`
	Code       FileScheduleScheme `json:"code"`
	Image      FileScheduleScheme `json:"image"`
}
|
||||
|
||||
// JobSetPreScheduleScheme is the pre-scheduling plan for a job set.
type JobSetPreScheduleScheme struct {
	JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // per-job pre-schedule schemes, keyed by LocalJobID
}

// JobSet groups related jobs submitted together.
type JobSet struct {
	JobSetID          schsdk.JobSetID         `json:"jobSetID"` // globally unique job set ID
	JobRefs           []JobSetJobRef          `json:"jobRefs"`  // references to the jobs contained in this set
	PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
}

// JobSetJobRef links a job's global ID to its ID within the set.
type JobSetJobRef struct {
	JobID      schsdk.JobID `json:"jobID"`      // global job ID
	LocalJobID string       `json:"localJobID"` // job ID local to this job set
}
|
||||
|
||||
func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet {
|
||||
return &JobSet{
|
||||
JobSetID: jobSetID,
|
||||
JobRefs: jobRefs,
|
||||
PreScheduleScheme: preScheduleScheme,
|
||||
}
|
||||
}
|
||||
|
||||
func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef {
|
||||
ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID })
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
return &ref
|
||||
}
|
||||
|
||||
// Job is a scheduled unit of work: identity plus a type-specific body.
type Job struct {
	JobSetID schsdk.JobSetID // ID of the job set this job belongs to
	JobID    schsdk.JobID    // globally unique job ID
	Body     JobBody         // the concrete job (normal, data-return, ...)
}
|
||||
|
||||
// GetInfo returns the submission-time description of the job,
// delegating to the concrete body.
func (j *Job) GetInfo() schsdk.JobInfo {
	return j.Body.GetInfo()
}
|
||||
|
||||
// Dump captures a serializable snapshot of the job: identity, info,
// body status and the dump of its current state.
//
// NOTE(review): the method mixes the receiver `j` (JobID/JobSetID/Info)
// with the `job` parameter (Body, and the value passed to curState.Dump).
// If they are ever different instances the snapshot is inconsistent —
// confirm callers always pass the receiver itself.
func (j *Job) Dump(ctx JobStateRunContext, job *Job, curState JobState) jobmod.JobStatus {
	return jobmod.JobStatus{
		JobID:    j.JobID,
		JobSetID: j.JobSetID,
		Info:     j.GetInfo(),
		Body:     job.Body.Dump(),
		State:    curState.Dump(ctx, job),
	}
}
|
||||
|
||||
// JobBody is the contract every concrete job type implements:
// expose its submission info and produce a serializable status snapshot.
type JobBody interface {
	GetInfo() schsdk.JobInfo
	Dump() jobmod.JobBodyStatus
}
|
|
@ -0,0 +1,30 @@
|
|||
package job
|
||||
|
||||
import (
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
)
|
||||
|
||||
// DataReturnJob packages another job's output and returns it to the
// user's bucket as a new package.
type DataReturnJob struct {
	Info                    schsdk.DataReturnJobInfo
	TargetJobCCID           schsdk.CCID      // ID of the computing center where the target job ran
	TargetJobOutputFullPath string           // full path of the target job's output
	DataReturnPackageID     cdssdk.PackageID // PackageID produced by the data return
}
|
||||
|
||||
// NewResourceJob creates a DataReturnJob from its submission info.
//
// NOTE(review): the constructor name ("Resource") does not match the
// type it builds ("DataReturn") — consider renaming to NewDataReturnJob
// once all callers can be updated.
func NewResourceJob(info schsdk.DataReturnJobInfo) *DataReturnJob {
	return &DataReturnJob{
		Info: info,
	}
}
|
||||
|
||||
// GetInfo returns the job's submission-time description.
func (j *DataReturnJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}
|
||||
|
||||
// Dump returns a snapshot of the job's mutable result state.
//
// NOTE(review): this returns the status by value while NormalJob.Dump
// returns a pointer — confirm jobmod.JobBodyStatus is satisfied by the
// value type, otherwise this will not compile/behave consistently.
func (j *DataReturnJob) Dump() jobmod.JobBodyStatus {
	return jobmod.DataReturnJobStatus{
		DataReturnPackageID: j.DataReturnPackageID,
	}
}
|
|
@ -0,0 +1,30 @@
|
|||
package job
|
||||
|
||||
import (
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
)
|
||||
|
||||
// NormalJob is a computation job executed at a computing center.
type NormalJob struct {
	Info           schsdk.NormalJobInfo // submission-time description of the job
	Files          jobmod.JobFiles      // files the job needs (dataset, code, image)
	TargetCCID     schsdk.CCID          // ID of the computing center that will run the job
	OutputFullPath string               // full output path of the job's results
}
|
||||
|
||||
func NewNormalJob(info schsdk.NormalJobInfo) *NormalJob {
|
||||
return &NormalJob{
|
||||
Info: info,
|
||||
}
|
||||
}
|
||||
|
||||
// GetInfo returns the job's submission-time description.
func (j *NormalJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}
|
||||
|
||||
// Dump returns a snapshot of the job's mutable scheduling state
// (resolved files and the chosen target computing center).
func (j *NormalJob) Dump() jobmod.JobBodyStatus {
	return &jobmod.NormalJobStatus{
		Files:      j.Files,
		TargetCCID: j.TargetCCID,
	}
}
|
|
@ -0,0 +1,271 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||
)
|
||||
|
||||
// Adjusting is the job state that applies an adjust scheduling scheme:
// it moves/loads the job's files to the final target computing center.
type Adjusting struct {
	scheme       jobmod.JobScheduleScheme // the adjust scheme to apply
	targetCCInfo schmod.ComputingCenter   // resolved in do() from scheme.TargetCCID
}
|
||||
|
||||
func NewAdjusting(scheme jobmod.JobScheduleScheme) *Adjusting {
|
||||
return &Adjusting{
|
||||
scheme: scheme,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Adjusting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
err := s.do(rtx, jo)
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, NewNormalJobReadyToExecute())
|
||||
}
|
||||
}
|
||||
|
||||
// Dump returns a serializable snapshot of this state.
// TODO not implemented yet; returns nil.
func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
||||
|
||||
func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||
norJob := jo.Body.(*job.NormalJob)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// 监听取消事件
|
||||
go func() {
|
||||
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting computing center info: %w", err)
|
||||
}
|
||||
s.targetCCInfo = ccInfo
|
||||
|
||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||
if err != nil {
|
||||
return fmt.Errorf("new cds client: %w", err)
|
||||
}
|
||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||
|
||||
// 已经确定最终执行的目标计算中心,则可以生成结果输出路径了
|
||||
stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{
|
||||
StorageID: ccInfo.CDSStorageID,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting cds storage info: %w", err)
|
||||
}
|
||||
// TODO UserID
|
||||
norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID)
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(3)
|
||||
|
||||
var e1, e2, e3 error
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset)
|
||||
if e1 != nil {
|
||||
cancel()
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code)
|
||||
if e2 != nil {
|
||||
cancel()
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image)
|
||||
if e3 != nil {
|
||||
cancel()
|
||||
}
|
||||
}()
|
||||
|
||||
return errors.Join(e1, e2, e3)
|
||||
}
|
||||
|
||||
func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||
switch info := fileInfo.(type) {
|
||||
case *schsdk.LocalJobFileInfo:
|
||||
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||
return e.LocalPath == info.LocalPath
|
||||
})
|
||||
if !ok {
|
||||
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||
}
|
||||
if evt.Error != nil {
|
||||
return evt.Error
|
||||
}
|
||||
|
||||
file.PackageID = evt.PackageID
|
||||
|
||||
case *schsdk.PackageJobFileInfo:
|
||||
file.PackageID = info.PackageID
|
||||
|
||||
case *schsdk.ResourceJobFileInfo:
|
||||
return nil
|
||||
|
||||
default:
|
||||
return fmt.Errorf("unknown dataset type: %T", info)
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionMove {
|
||||
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||
defer wt.Close()
|
||||
|
||||
status, err := wt.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("moving package: %w", err)
|
||||
}
|
||||
|
||||
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||
if moveStatus.Error != "" {
|
||||
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionLoad {
|
||||
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID))
|
||||
defer wt.Close()
|
||||
|
||||
status, err := wt.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("moving package: %w", err)
|
||||
}
|
||||
|
||||
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||
if moveStatus.Error != "" {
|
||||
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||
switch info := fileInfo.(type) {
|
||||
case *schsdk.LocalJobFileInfo:
|
||||
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||
return e.LocalPath == info.LocalPath
|
||||
})
|
||||
if !ok {
|
||||
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||
}
|
||||
if evt.Error != nil {
|
||||
return evt.Error
|
||||
}
|
||||
|
||||
// 上传完毕,则可以新建一个空的镜像的记录
|
||||
// TODO 镜像名称
|
||||
imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating image info: %w", err)
|
||||
}
|
||||
|
||||
// 填充ImageID和PackageID
|
||||
file.ImageID = imgID
|
||||
file.PackageID = &evt.PackageID
|
||||
|
||||
case *schsdk.ImageJobFileInfo:
|
||||
imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting image info: %w", err)
|
||||
}
|
||||
|
||||
file.ImageID = imageInfo.ImageID
|
||||
file.PackageID = imageInfo.CDSPackageID
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionImportImage {
|
||||
if file.PackageID == nil {
|
||||
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
|
||||
}
|
||||
|
||||
// TODO UserID
|
||||
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||
defer wt.Close()
|
||||
|
||||
status, err := wt.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("moving package: %w", err)
|
||||
}
|
||||
|
||||
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||
if moveStatus.Error != "" {
|
||||
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||
}
|
||||
|
||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||
if err != nil {
|
||||
return fmt.Errorf("new cloudream storage client: %w", err)
|
||||
}
|
||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||
|
||||
// TODO UserID
|
||||
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting package objects: %w", err)
|
||||
}
|
||||
|
||||
if len(pkgObjs.Objects) == 0 {
|
||||
return fmt.Errorf("no object in the package which will be imported")
|
||||
}
|
||||
|
||||
if len(pkgObjs.Objects) > 1 {
|
||||
return fmt.Errorf("there must be only 1 object in the package which will be imported")
|
||||
}
|
||||
|
||||
wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
||||
defer wt2.Close()
|
||||
|
||||
status2, err := wt2.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("uploading image: %w", err)
|
||||
}
|
||||
|
||||
uploadStatus := status2.(*exectsk.UploadImageStatus)
|
||||
if uploadStatus.Error != "" {
|
||||
return fmt.Errorf("uploading image: %s", uploadStatus.Error)
|
||||
}
|
||||
|
||||
// TODO 镜像名称
|
||||
err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, job.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating image info: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
// Completed is the terminal state of a job. A nil err means the job
// finished successfully; a non-nil err records why it failed.
type Completed struct {
	err error
}

// SuccessComplete returns a Completed state marking a successful job.
func SuccessComplete() *Completed {
	return &Completed{}
}

// FailureComplete returns a Completed state carrying the failure cause.
func FailureComplete(err error) *Completed {
	return &Completed{err: err}
}
|
||||
|
||||
func (c *Completed) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
// TODO 可以考虑将执行记录落库
|
||||
if c.err == nil {
|
||||
c.handleSuccess(rtx, jo)
|
||||
} else {
|
||||
c.handleFailed(rtx, jo)
|
||||
}
|
||||
}
|
||||
|
||||
// Dump returns a serializable snapshot of this state.
// TODO not implemented yet; returns nil.
func (s *Completed) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
||||
|
||||
func (c *Completed) handleSuccess(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
|
||||
logger.WithField("JobID", job.JobID).Infof("job completed successfuly")
|
||||
rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
|
||||
}
|
||||
|
||||
func (c *Completed) handleFailed(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
|
||||
logger.
|
||||
WithField("JobID", job.JobID).
|
||||
WithField("LastState", reflect.TypeOf(rtx.LastState).String()).
|
||||
Infof("job failed with: %v", c.err)
|
||||
rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
|
||||
}
|
|
@ -0,0 +1,154 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||
)
|
||||
|
||||
// NormalJobExecuting is the state in which the job's PCM task is
// running; it polls executor status updates until a terminal status.
type NormalJobExecuting struct {
	lastStatus pcmsdk.TaskStatus // last observed status, used to log each transition only once
}

// NewNormalJobExecuting creates the state with a sentinel "Begin"
// status so the first real status is always logged as a transition.
func NewNormalJobExecuting() *NormalJobExecuting {
	return &NormalJobExecuting{
		lastStatus: "Begin",
	}
}
|
||||
|
||||
func (s *NormalJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
err := s.do(rtx, jo)
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, SuccessComplete())
|
||||
}
|
||||
}
|
||||
|
||||
// Dump returns a serializable snapshot of this state.
// TODO not implemented yet; returns nil.
func (s *NormalJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
||||
|
||||
// do submits the job's PCM task at the target computing center and
// polls executor status updates until the task reaches a terminal
// state. Returns nil on success, an error on submission or task failure.
func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	norJob := jo.Body.(*job.NormalJob)

	log := logger.WithType[NormalJobExecuting]("State").WithField("JobID", jo.JobID)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// The PCM image must already be registered at the target computing center.
	pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting pcm image info: %w", err)
	}

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	// TODO env vars such as DATA_IN / DATA_OUT need to be added here,
	// sourced from the job's info.
	ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center resource: %w", err)
	}
	if len(ress) == 0 {
		return fmt.Errorf("no resource found at computing center %v", norJob.TargetCCID)
	}

	wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask(
		ccInfo.PCMParticipantID,
		pcmImgInfo.PCMImageID,
		// TODO resource-selection algorithm; currently the first resource is used
		ress[0].PCMResourceID,
		norJob.Info.Runtime.Command,
		norJob.Info.Runtime.Envs,
	))
	defer wt.Close()

	// Poll status updates until the task succeeds or fails.
	for {
		status, err := wt.Receive(ctx)
		if err != nil {
			return err
		}
		tskStatus := status.(*exetsk.SubmitTaskStatus)
		if tskStatus.Error != "" {
			return fmt.Errorf("submitting task: %s", tskStatus.Error)
		}

		// Log only actual status transitions, not every poll.
		if tskStatus.Status != s.lastStatus {
			log.Infof("task %s -> %s", s.lastStatus, tskStatus.Status)
		}
		s.lastStatus = tskStatus.Status

		switch tskStatus.Status {
		case pcmsdk.TaskStatusSuccess:
			return nil

		case pcmsdk.TaskStatusFailed:
			return fmt.Errorf("task failed")
		}
	}
}
|
||||
|
||||
// DataReturnJobExecuting is the state that performs a data-return job:
// packaging a target job's output into the user's bucket.
type DataReturnJobExecuting struct {
}

// NewDataReturnJobExecuting creates the state.
func NewDataReturnJobExecuting() *DataReturnJobExecuting {
	return &DataReturnJobExecuting{}
}
|
||||
|
||||
func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
err := s.do(rtx, jo)
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, SuccessComplete())
|
||||
}
|
||||
}
|
||||
|
||||
// Dump returns a serializable snapshot of this state.
// TODO not implemented yet; returns nil.
func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
||||
|
||||
// do packages the target job's output directory into a new package in
// the user's bucket and records the resulting package ID on the job.
func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	reJob := jo.Body.(*job.DataReturnJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), reJob.TargetJobCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage(
		1, // TODO user ID
		ccInfo.CDSStorageID,
		reJob.TargetJobOutputFullPath,
		reJob.Info.BucketID,
		utils.MakeResourcePackageName(jo.JobID),
	))
	defer wt.Close()

	status, err := wt.Receive(ctx)
	if err != nil {
		return err
	}
	tskStatus := status.(*exetsk.StorageCreatePackageStatus)
	if tskStatus.Error != "" {
		return fmt.Errorf("creating package: %s", tskStatus.Error)
	}

	// Record the package the data return produced.
	reJob.DataReturnPackageID = tskStatus.PackageID
	return nil
}
|
|
@ -0,0 +1,61 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||
)
|
||||
|
||||
// MakingAdjustScheme is the state that asks the advisor to compute an
// adjust scheduling scheme for a NormalJob.
type MakingAdjustScheme struct {
}

// NewMakeingAdjustScheme creates the state.
//
// NOTE(review): the exported name contains a typo ("Makeing"); renaming
// to NewMakingAdjustScheme would break callers — fix project-wide.
func NewMakeingAdjustScheme() *MakingAdjustScheme {
	return &MakingAdjustScheme{}
}
|
||||
|
||||
func (s *MakingAdjustScheme) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
scheme, err := s.do(rtx, jo.Body.(*job.NormalJob))
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, NewAdjusting(*scheme))
|
||||
}
|
||||
}
|
||||
|
||||
// do asks the advisor to compute an adjust scheme from the job's info
// and its current pre-scheduled status (target center and files).
// The request can be aborted by a Cancel event.
func (s *MakingAdjustScheme) do(rtx jobmgr.JobStateRunContext, norJob *job.NormalJob) (*jobmod.JobScheduleScheme, error) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Abort the advisor task when a Cancel event arrives.
	go func() {
		event.WaitType[event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	wt := rtx.Mgr.AdvMgr.StartTask(advtsk.NewMakeAdjustScheme(norJob.Info, jobmod.NormalJobStatus{
		TargetCCID: norJob.TargetCCID,
		Files:      norJob.Files,
	}))
	defer wt.Close()

	status, err := wt.Receive(ctx)
	if err != nil {
		return nil, fmt.Errorf("making adjust scheme: %w", err)
	}

	mkStatus := status.(*advtsk.MakeAdjustSchemeStatus)
	if mkStatus.Error != "" {
		return nil, fmt.Errorf("making adjust scheme: %s", mkStatus.Error)
	}

	return &mkStatus.Scheme, nil
}
|
||||
|
||||
// Dump returns a serializable snapshot of this state.
// TODO not implemented yet; returns nil.
func (s *MakingAdjustScheme) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
|
@ -0,0 +1,251 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||
)
|
||||
|
||||
// PreScheduling is the job state that applies the pre-schedule scheme:
// it moves/loads the job's files toward the pre-selected computing
// center before the final adjust phase.
type PreScheduling struct {
	scheme       jobmod.JobScheduleScheme // the pre-schedule scheme to apply
	targetCCInfo schmod.ComputingCenter   // resolved in Run() from scheme.TargetCCID
}
|
||||
|
||||
// NewPreSchuduling creates the PreScheduling state for the given scheme.
//
// NOTE(review): the exported name contains a typo ("Schuduling");
// renaming would break callers — fix project-wide.
func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling {
	return &PreScheduling{
		scheme: scheme,
	}
}
|
||||
|
||||
func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
norJob := jo.Body.(*job.NormalJob)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
// 监听取消事件
|
||||
go func() {
|
||||
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
|
||||
return
|
||||
}
|
||||
s.targetCCInfo = ccInfo
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(3)
|
||||
|
||||
var e1, e2, e3 error
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset)
|
||||
if e1 != nil {
|
||||
cancel()
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code)
|
||||
if e2 != nil {
|
||||
cancel()
|
||||
}
|
||||
}()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image)
|
||||
if e3 != nil {
|
||||
cancel()
|
||||
}
|
||||
}()
|
||||
|
||||
allErr := errors.Join(e1, e2, e3)
|
||||
if allErr != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, NewReadyToAdjust())
|
||||
}
|
||||
}
|
||||
|
||||
// Dump returns a serializable snapshot of this state.
// TODO not implemented yet; returns nil.
func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
||||
|
||||
func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||
switch info := fileInfo.(type) {
|
||||
case *schsdk.LocalJobFileInfo:
|
||||
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||
return e.LocalPath == info.LocalPath
|
||||
})
|
||||
if !ok {
|
||||
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||
}
|
||||
if evt.Error != nil {
|
||||
return evt.Error
|
||||
}
|
||||
|
||||
file.PackageID = evt.PackageID
|
||||
|
||||
case *schsdk.PackageJobFileInfo:
|
||||
file.PackageID = info.PackageID
|
||||
|
||||
case *schsdk.ResourceJobFileInfo:
|
||||
return nil
|
||||
|
||||
default:
|
||||
return fmt.Errorf("unknown dataset type: %T", info)
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionMove {
|
||||
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||
defer wt.Close()
|
||||
|
||||
status, err := wt.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("moving package: %w", err)
|
||||
}
|
||||
|
||||
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||
if moveStatus.Error != "" {
|
||||
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionLoad {
|
||||
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID))
|
||||
defer wt.Close()
|
||||
|
||||
status, err := wt.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("moving package: %w", err)
|
||||
}
|
||||
|
||||
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||
if moveStatus.Error != "" {
|
||||
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||
switch info := fileInfo.(type) {
|
||||
case *schsdk.LocalJobFileInfo:
|
||||
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||
return e.LocalPath == info.LocalPath
|
||||
})
|
||||
if !ok {
|
||||
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||
}
|
||||
if evt.Error != nil {
|
||||
return evt.Error
|
||||
}
|
||||
|
||||
// 上传完毕,则可以新建一个空的镜像的记录
|
||||
// TODO 镜像名称
|
||||
imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating image info: %w", err)
|
||||
}
|
||||
|
||||
// 填充ImageID和PackageID
|
||||
file.ImageID = imgID
|
||||
file.PackageID = &evt.PackageID
|
||||
|
||||
case *schsdk.ImageJobFileInfo:
|
||||
imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting image info: %w", err)
|
||||
}
|
||||
|
||||
file.ImageID = imageInfo.ImageID
|
||||
file.PackageID = imageInfo.CDSPackageID
|
||||
}
|
||||
|
||||
if scheme.Action == jobmod.ActionImportImage {
|
||||
if file.PackageID == nil {
|
||||
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
|
||||
}
|
||||
|
||||
// TODO UserID
|
||||
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||
defer wt.Close()
|
||||
|
||||
status, err := wt.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("moving package: %w", err)
|
||||
}
|
||||
|
||||
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||
if moveStatus.Error != "" {
|
||||
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||
}
|
||||
|
||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||
if err != nil {
|
||||
return fmt.Errorf("new cloudream storage client: %w", err)
|
||||
}
|
||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||
|
||||
// TODO UserID
|
||||
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting package objects: %w", err)
|
||||
}
|
||||
|
||||
if len(pkgObjs.Objects) == 0 {
|
||||
return fmt.Errorf("no object in the package which will be imported")
|
||||
}
|
||||
|
||||
if len(pkgObjs.Objects) > 1 {
|
||||
return fmt.Errorf("there must be only 1 object in the package which will be imported")
|
||||
}
|
||||
|
||||
wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
||||
defer wt2.Close()
|
||||
|
||||
status2, err := wt2.Receive(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("uploading image: %w", err)
|
||||
}
|
||||
|
||||
uploadStatus := status2.(*exectsk.UploadImageStatus)
|
||||
if uploadStatus.Error != "" {
|
||||
return fmt.Errorf("uploading image: %s", uploadStatus.Error)
|
||||
}
|
||||
|
||||
// TODO 镜像名称
|
||||
err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, norJob.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating image info: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||
)
|
||||
|
||||
// ReadyToAdjust is the job state in which a normal job waits for its dependencies
// (e.g. a data-return job that provides its dataset) before an adjust scheme is made.
type ReadyToAdjust struct {
}

// NewReadyToAdjust creates the ReadyToAdjust state.
func NewReadyToAdjust() *ReadyToAdjust {
	return &ReadyToAdjust{}
}
|
||||
|
||||
func (s *ReadyToAdjust) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
err := s.do(rtx, jo)
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, NewMakeingAdjustScheme())
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||
norJob := jo.Body.(*job.NormalJob)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
go func() {
|
||||
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
if rt, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok {
|
||||
evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool {
|
||||
return val.Job.GetInfo().GetLocalJobID() == rt.ResourceLocalJobID
|
||||
})
|
||||
if !ok {
|
||||
return jobmgr.ErrJobCancelled
|
||||
}
|
||||
if evt.Err != nil {
|
||||
return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
|
||||
}
|
||||
rtJob, ok := evt.Job.Body.(*job.DataReturnJob)
|
||||
if !ok {
|
||||
return fmt.Errorf("job %s is not a DataReturn job(which is %T)", evt.Job.JobID, evt.Job)
|
||||
}
|
||||
|
||||
norJob.Files.Dataset.PackageID = rtJob.DataReturnPackageID
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Dump serializes this state for status reporting; not implemented yet.
func (s *ReadyToAdjust) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
|
@ -0,0 +1,40 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
)
|
||||
|
||||
// NormalJobReadyToExecute is the state of a normal job whose files are in place
// and which is ready to start running.
type NormalJobReadyToExecute struct {
}

// NewNormalJobReadyToExecute creates the state.
func NewNormalJobReadyToExecute() *NormalJobReadyToExecute {
	return &NormalJobReadyToExecute{}
}
|
||||
|
||||
// Run transitions the job into the executing state.
func (s *NormalJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO currently starts execution immediately
	rtx.Mgr.ChangeState(jo, NewNormalJobExecuting())
}
|
||||
|
||||
// Dump serializes this state for status reporting; not implemented yet.
func (s *NormalJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
||||
|
||||
// DataReturnJobReadyToExecute is the state of a data-return job that is ready to
// start returning its target job's output.
type DataReturnJobReadyToExecute struct {
}

// NewDataReturnJobReadyToExecute creates the state.
func NewDataReturnJobReadyToExecute() *DataReturnJobReadyToExecute {
	return &DataReturnJobReadyToExecute{}
}
|
||||
|
||||
// Run transitions the job into the executing state.
func (s *DataReturnJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO currently starts execution immediately
	rtx.Mgr.ChangeState(jo, NewDataReturnJobExecuting())
}
|
||||
|
||||
// Dump serializes this state for status reporting; not implemented yet.
func (s *DataReturnJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
|
@ -0,0 +1,62 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||
)
|
||||
|
||||
// WaitTargetComplete is the state of a data-return job while it waits for the job
// whose output it will return (the target job) to finish.
type WaitTargetComplete struct {
}

// NewWaitTargetComplete creates the state.
func NewWaitTargetComplete() *WaitTargetComplete {
	return &WaitTargetComplete{}
}
|
||||
|
||||
func (s *WaitTargetComplete) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
err := s.do(rtx, jo)
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, NewDataReturnJobReadyToExecute())
|
||||
}
|
||||
}
|
||||
|
||||
func (s *WaitTargetComplete) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||
reJob := jo.Body.(*job.DataReturnJob)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
go func() {
|
||||
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||
cancel()
|
||||
}()
|
||||
|
||||
evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool {
|
||||
return val.Job.GetInfo().GetLocalJobID() == reJob.Info.TargetLocalJobID
|
||||
})
|
||||
if !ok {
|
||||
return jobmgr.ErrJobCancelled
|
||||
}
|
||||
if evt.Err != nil {
|
||||
return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
|
||||
}
|
||||
norJob, ok := evt.Job.Body.(*job.NormalJob)
|
||||
if !ok {
|
||||
return fmt.Errorf("job %s is not a Normal job(which is %T)", evt.Job.JobID, evt.Job)
|
||||
}
|
||||
|
||||
reJob.TargetJobCCID = norJob.TargetCCID
|
||||
reJob.TargetJobOutputFullPath = norJob.OutputFullPath
|
||||
return nil
|
||||
}
|
||||
|
||||
// Dump serializes this state for status reporting; not implemented yet.
func (s *WaitTargetComplete) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
	// TODO
	return nil
}
|
|
@ -0,0 +1,14 @@
|
|||
package jobmgr
|
||||
|
||||
import jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
|
||||
// JobStateRunContext carries the dependencies a JobState needs while running.
type JobStateRunContext struct {
	Mgr       *Manager  // owning manager; used to change state and reach sub-managers
	EventSet  *EventSet // per-job event queue the state can wait on
	LastState JobState  // the state the job was in before the current one
}

// JobState is one state in a job's lifecycle state machine.
type JobState interface {
	// Run executes the state's logic; it is responsible for transitioning the job onward.
	Run(ctx JobStateRunContext, job *Job)
	// Dump serializes the state's progress for status reporting.
	Dump(ctx JobStateRunContext, job *Job) jobmod.JobStateStatus
}
|
|
@ -1,283 +1,169 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
|
||||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type mgrJob struct {
|
||||
Job jobmod.Job
|
||||
Handler StateHandler
|
||||
job Job
|
||||
eventSet EventSet
|
||||
state JobState
|
||||
}
|
||||
|
||||
type mgrJobSet struct {
|
||||
jobs map[schsdk.JobID]*mgrJob
|
||||
}
|
||||
|
||||
type Manager struct {
|
||||
// 任何修改job、jobset的操作,都需要加这个锁
|
||||
pubLock sync.Mutex
|
||||
|
||||
execMgr *executormgr.Manager
|
||||
advMgr *advisormgr.Manager
|
||||
db *db.DB
|
||||
|
||||
handlers map[reflect.Type]StateHandler
|
||||
defaultHandler StateHandler
|
||||
ExecMgr *executormgr.Manager
|
||||
AdvMgr *advisormgr.Manager
|
||||
DB *db.DB
|
||||
|
||||
jobSetIDIndex int
|
||||
jobSets map[schsdk.JobSetID]*jobmod.JobSet
|
||||
jobSets map[schsdk.JobSetID]*mgrJobSet
|
||||
jobIDIndex int
|
||||
jobs map[schsdk.JobID]*mgrJob
|
||||
}
|
||||
|
||||
func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) {
|
||||
mgr := &Manager{
|
||||
execMgr: execMgr,
|
||||
advMgr: advMgr,
|
||||
db: db,
|
||||
|
||||
handlers: make(map[reflect.Type]StateHandler),
|
||||
jobSets: make(map[schsdk.JobSetID]*jobmod.JobSet),
|
||||
jobs: make(map[schsdk.JobID]*mgrJob),
|
||||
ExecMgr: execMgr,
|
||||
AdvMgr: advMgr,
|
||||
DB: db,
|
||||
jobSets: make(map[schsdk.JobSetID]*mgrJobSet),
|
||||
jobs: make(map[schsdk.JobID]*mgrJob),
|
||||
}
|
||||
|
||||
execMgr.OnTaskUpdated(mgr.executorTaskUpdated)
|
||||
execMgr.OnTaskTimeout(mgr.executorTaskTimeout)
|
||||
|
||||
advMgr.OnTaskUpdated(mgr.advisorTaskUpdated)
|
||||
advMgr.OnTaskTimeout(mgr.advisorTaskTimeout)
|
||||
|
||||
// TODO 考虑优化这部分逻辑
|
||||
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StatePreScheduling]()] = NewPreSchedulingHandler(mgr)
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToAdjust]()] = NewReadyToAdjustHandler(mgr)
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateMakingAdjustScheme]()] = NewMakingAdjustSchemeHandler(mgr)
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateAdjusting]()] = NewAdjustingHandler(mgr)
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToExecute]()] = NewReadyToExecuteHandler(mgr)
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateExecuting]()] = NewExecutingHandler(mgr)
|
||||
|
||||
compHder := NewCompleteHandler(mgr)
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateFailed]()] = compHder
|
||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateSuccess]()] = compHder
|
||||
|
||||
mgr.defaultHandler = NewDefaultHandler(mgr)
|
||||
|
||||
return mgr, nil
|
||||
}
|
||||
|
||||
func (m *Manager) Serve() error {
|
||||
for _, h := range m.handlers {
|
||||
go h.Serve()
|
||||
}
|
||||
|
||||
go m.defaultHandler.Serve()
|
||||
|
||||
ticker := time.NewTicker(time.Minute)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
// 每一分钟产生一个空事件,防止无限等待
|
||||
m.pubLock.Lock()
|
||||
m.onEvent(event.ToAll(), nil)
|
||||
m.pubLock.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) Stop() {
|
||||
for _, h := range m.handlers {
|
||||
h.Stop()
|
||||
}
|
||||
|
||||
m.defaultHandler.Stop()
|
||||
}
|
||||
|
||||
func (m *Manager) SubmitJobSet(jobSetInfo schsdk.JobSetInfo, preScheduleScheme jobmod.JobSetPreScheduleScheme) (*jobmod.JobSet, error) {
|
||||
func (m *Manager) ChangeState(job *Job, state JobState) {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
mgrJob, ok := m.jobs[job.JobID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
lastState := mgrJob.state
|
||||
mgrJob.state = state
|
||||
|
||||
go func() {
|
||||
state.Run(JobStateRunContext{
|
||||
Mgr: m,
|
||||
EventSet: &mgrJob.eventSet,
|
||||
LastState: lastState,
|
||||
}, job)
|
||||
}()
|
||||
}
|
||||
|
||||
func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
mgrJob, ok := m.jobs[jobID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
go func() {
|
||||
mgrJob.eventSet.Post(evt)
|
||||
}()
|
||||
}
|
||||
|
||||
func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
jobSet, ok := m.jobSets[jobSetID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
for _, mgrJob := range jobSet.jobs {
|
||||
go func() {
|
||||
mgrJob.eventSet.Post(evt)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
type SubmittingJob struct {
|
||||
Body JobBody
|
||||
InitState JobState
|
||||
}
|
||||
|
||||
func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))
|
||||
|
||||
var jobs []jobmod.Job
|
||||
var normalJobs []*jobmod.NormalJob
|
||||
var resJobs []*jobmod.ResourceJob
|
||||
var jobRefs []jobmod.JobSetJobRef
|
||||
for i, jobInfo := range jobSetInfo.Jobs {
|
||||
jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
|
||||
|
||||
switch info := jobInfo.(type) {
|
||||
case *schsdk.NormalJobInfo:
|
||||
job := jobmod.NewNormalJob(jobSetID, jobID, *info)
|
||||
jobs = append(jobs, job)
|
||||
normalJobs = append(normalJobs, job)
|
||||
jobRefs = append(jobRefs, jobmod.JobSetJobRef{
|
||||
LocalJobID: info.LocalJobID,
|
||||
JobID: jobID,
|
||||
})
|
||||
|
||||
preSch, ok := preScheduleScheme.JobSchemes[info.LocalJobID]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
|
||||
}
|
||||
|
||||
job.State = jobmod.NewStatePreScheduling(preSch)
|
||||
job.TargetCCID = preSch.TargetCCID
|
||||
|
||||
case *schsdk.ResourceJobInfo:
|
||||
job := jobmod.NewResourceJob(jobSetID, jobID, *info)
|
||||
jobs = append(jobs, job)
|
||||
resJobs = append(resJobs, job)
|
||||
jobRefs = append(jobRefs, jobmod.JobSetJobRef{
|
||||
LocalJobID: info.LocalJobID,
|
||||
JobID: jobID,
|
||||
})
|
||||
|
||||
// 回源任务不需要预调度,所以直接是进入待调整状态
|
||||
job.State = jobmod.NewStateReadyToAdjust()
|
||||
}
|
||||
}
|
||||
|
||||
// TODO 可以考虑检查一下有依赖的任务的信息所描述依赖的LocalJobID是不是有效的
|
||||
|
||||
jobSet := jobmod.NewJobSet(jobSetID, jobRefs, preScheduleScheme)
|
||||
m.jobSets[jobSetID] = jobSet
|
||||
for _, job := range jobs {
|
||||
m.jobs[job.GetJobID()] = &mgrJob{
|
||||
Job: job,
|
||||
}
|
||||
|
||||
m.handleState(job)
|
||||
}
|
||||
|
||||
m.jobSetIDIndex += 1
|
||||
m.jobIDIndex += len(jobSetInfo.Jobs)
|
||||
|
||||
return jobSet, nil
|
||||
jobSet := &mgrJobSet{
|
||||
jobs: make(map[schsdk.JobID]*mgrJob),
|
||||
}
|
||||
m.jobSets[jobSetID] = jobSet
|
||||
|
||||
for i, subJob := range jobs {
|
||||
jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
|
||||
job := &mgrJob{
|
||||
job: Job{
|
||||
JobSetID: jobSetID,
|
||||
JobID: jobID,
|
||||
Body: subJob.Body,
|
||||
},
|
||||
eventSet: NewEventSet(),
|
||||
}
|
||||
jobSet.jobs[jobID] = job
|
||||
|
||||
m.ChangeState(&job.job, subJob.InitState)
|
||||
}
|
||||
m.jobIDIndex += len(jobs)
|
||||
|
||||
return jobSetID
|
||||
}
|
||||
|
||||
func (m *Manager) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) error {
|
||||
func (m *Manager) DumpJobSet(jobSetID schsdk.JobSetID) []jobmod.JobStatus {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
for _, h := range m.handlers {
|
||||
h.OnEvent(event.ToJobSet(jobSetID), event.NewLocalFileUploaded(jobSetID, localPath, err, packageID))
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) executorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus exectsk.TaskStatus) {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
job, ok := m.jobs[jobID]
|
||||
jobSet, ok := m.jobSets[jobSetID]
|
||||
if !ok {
|
||||
return
|
||||
return nil
|
||||
}
|
||||
|
||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskUpdated(fullTaskID, taskStatus))
|
||||
}
|
||||
|
||||
func (m *Manager) executorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
job, ok := m.jobs[jobID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskTimeout(fullTaskID))
|
||||
}
|
||||
|
||||
func (m *Manager) advisorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
job, ok := m.jobs[jobID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskUpdated(fullTaskID, taskStatus))
|
||||
}
|
||||
|
||||
func (m *Manager) advisorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
job, ok := m.jobs[jobID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskTimeout(fullTaskID))
|
||||
}
|
||||
|
||||
func (m *Manager) CloneJob(jobID schsdk.JobID) (jobmod.Job, error) {
|
||||
m.pubLock.Lock()
|
||||
|
||||
job, ok := m.jobs[jobID]
|
||||
if !ok {
|
||||
m.pubLock.Unlock()
|
||||
return nil, fmt.Errorf("job not found")
|
||||
}
|
||||
|
||||
evt := event.NewCloneJob()
|
||||
job.Handler.OnEvent(event.ToJob(jobID), evt)
|
||||
m.pubLock.Unlock()
|
||||
|
||||
return evt.Callback.WaitValue(context.Background())
|
||||
}
|
||||
|
||||
// 根据job状态选择handler进行处理。需要加锁
|
||||
func (m *Manager) handleState(job jobmod.Job) {
|
||||
logger.WithField("JobID", job.GetJobID()).
|
||||
WithField("State", reflect.TypeOf(job.GetState()).String()).
|
||||
Debugf("job state changed")
|
||||
|
||||
runtime, ok := m.jobs[job.GetJobID()]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
state := job.GetState()
|
||||
if state == nil {
|
||||
runtime.Handler = m.defaultHandler
|
||||
m.defaultHandler.Handle(job)
|
||||
return
|
||||
}
|
||||
|
||||
stateType := reflect.TypeOf(state)
|
||||
handler, ok := m.handlers[stateType]
|
||||
if !ok {
|
||||
runtime.Handler = m.defaultHandler
|
||||
m.defaultHandler.Handle(job)
|
||||
return
|
||||
}
|
||||
|
||||
runtime.Handler = handler
|
||||
handler.Handle(job)
|
||||
}
|
||||
|
||||
func (m *Manager) onEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
for _, h := range m.handlers {
|
||||
h.OnEvent(broadcast, evt)
|
||||
}
|
||||
var jobStatuses []jobmod.JobStatus
|
||||
for _, mgrJob := range jobSet.jobs {
|
||||
jobStatuses = append(jobStatuses, mgrJob.job.Dump(JobStateRunContext{
|
||||
Mgr: m,
|
||||
EventSet: &mgrJob.eventSet,
|
||||
LastState: mgrJob.state,
|
||||
}, &mgrJob.job, mgrJob.state))
|
||||
}
|
||||
|
||||
return jobStatuses
|
||||
}
|
||||
|
|
|
@ -1,139 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
// makingAdjustSchemeJob pairs a normal job with its making-adjust-scheme state.
type makingAdjustSchemeJob struct {
	job   *jobmod.NormalJob
	state *jobmod.StateMakingAdjustScheme
}

// MakingAdjustSchemeHandler drives jobs through StateMakingAdjustScheme by asking
// the advisor to produce an adjust scheme.
type MakingAdjustSchemeHandler struct {
	mgr *Manager

	// jobs currently owned by this handler, keyed by job ID.
	jobs map[schsdk.JobID]*makingAdjustSchemeJob

	// cmdChan serializes all handler operations onto the Serve goroutine (actor model).
	cmdChan actor.CommandChannel
}

// NewMakingAdjustSchemeHandler creates a handler bound to the given manager.
func NewMakingAdjustSchemeHandler(mgr *Manager) *MakingAdjustSchemeHandler {
	return &MakingAdjustSchemeHandler{
		mgr:     mgr,
		jobs:    make(map[schsdk.JobID]*makingAdjustSchemeJob),
		cmdChan: *actor.NewCommandChannel(),
	}
}
|
||||
|
||||
// Handle takes ownership of a job entering StateMakingAdjustScheme, validating its
// concrete type and state before registering it and running the first event round.
func (h *MakingAdjustSchemeHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		norJob, ok := job.(*jobmod.NormalJob)
		if !ok {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState()))
			return
		}

		state, ok := job.GetState().(*jobmod.StateMakingAdjustScheme)
		if !ok {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
			return
		}

		rjob := &makingAdjustSchemeJob{
			job:   norJob,
			state: state,
		}
		h.jobs[job.GetJobID()] = rjob

		// A nil event triggers the initial processing step.
		h.onJobEvent(nil, rjob)
	})
}
|
||||
|
||||
// onJobEvent advances one job in response to an event (nil means "initial tick"):
// it starts the advisor task once, then consumes its status events until the scheme
// is produced or the job fails.
func (h *MakingAdjustSchemeHandler) onJobEvent(evt event.Event, job *makingAdjustSchemeJob) {
	if cloneEvt, ok := evt.(*event.CloneJob); ok {
		cloneEvt.Callback.SetValue(job.job.Clone())
		return
	}

	// Start the advisor task once; a non-empty FullTaskID doubles as the "started" flag.
	if job.state.FullTaskID == "" {
		fullTaskID, err := h.mgr.advMgr.StartTask(job.job.GetJobID(), advtsk.NewMakeAdjustScheme(*job.job))
		if err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
			return
		}

		job.state.FullTaskID = fullTaskID
	}

	// Only react to status events that concern our task.
	if makingRet, err := event.AssertAdvisorTaskStatus[*advtsk.MakeAdjustSchemeStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
		if err == event.ErrTaskTimeout {
			h.changeJobState(job.job, jobmod.NewStateFailed("make adjust scheme timeout", job.state))
			return
		}

		h.mgr.advMgr.ForgetTask(job.state.FullTaskID)

		if makingRet.Error != "" {
			h.changeJobState(job.job, jobmod.NewStateFailed(makingRet.Error, job.state))
			return
		}

		h.changeJobState(job.job, jobmod.NewStateAdjusting(makingRet.Scheme))
	}
}
|
||||
|
||||
// changeJobState releases the job from this handler and re-dispatches it under the
// manager's lock so the handler for the new state can take over.
func (h *MakingAdjustSchemeHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)

	delete(h.jobs, job.GetJobID())

	h.mgr.pubLock.Lock()
	h.mgr.handleState(job)
	h.mgr.pubLock.Unlock()
}
|
||||
|
||||
// OnEvent fans an event out to the jobs addressed by the broadcast:
// all jobs, one job set, or a single job.
func (h *MakingAdjustSchemeHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		if broadcast.ToAll() {
			for _, job := range h.jobs {
				h.onJobEvent(evt, job)
			}

		} else if broadcast.ToJobSet() {
			for _, job := range h.jobs {
				if job.job.GetJobSetID() != broadcast.JobSetID {
					continue
				}

				h.onJobEvent(evt, job)
			}
		} else if broadcast.ToJob() {
			if job, ok := h.jobs[broadcast.JobID]; ok {
				h.onJobEvent(evt, job)
			}
		}
	})
}
|
||||
|
||||
// Serve runs the handler's actor loop, executing queued commands one at a time.
// It currently loops forever; see Stop.
func (h *MakingAdjustSchemeHandler) Serve() {
	cmdChan := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		select {
		case cmd := <-cmdChan:
			cmd()
		}
	}
}

// Stop is intended to terminate the Serve loop; not implemented yet.
func (h *MakingAdjustSchemeHandler) Stop() {
	// TODO support Stop
}
|
|
@ -1,442 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
// ErrPreScheduleFailed reports that the pre-scheduling phase could not complete.
var ErrPreScheduleFailed = fmt.Errorf("pre schedule failed")

// preSchedulingJob pairs a normal job with its pre-scheduling state and the
// target computing-center info resolved from the scheme.
type preSchedulingJob struct {
	job    *jobmod.NormalJob
	state  *jobmod.StatePreScheduling
	ccInfo schmod.ComputingCenter
}

// PreSchedulingHandler drives jobs through StatePreScheduling: it prepares the
// job's dataset, code and image files on the chosen computing center.
type PreSchedulingHandler struct {
	mgr *Manager

	// jobs currently owned by this handler, keyed by job ID.
	jobs map[schsdk.JobID]*preSchedulingJob

	// cmdChan serializes all handler operations onto the Serve goroutine (actor model).
	cmdChan actor.CommandChannel
}

// NewPreSchedulingHandler creates a handler bound to the given manager.
func NewPreSchedulingHandler(mgr *Manager) *PreSchedulingHandler {
	return &PreSchedulingHandler{
		mgr:     mgr,
		jobs:    make(map[schsdk.JobID]*preSchedulingJob),
		cmdChan: *actor.NewCommandChannel(),
	}
}
|
||||
|
||||
// Handle takes ownership of a job entering StatePreScheduling: it validates the
// job and state types, resolves the target computing center from the scheme,
// registers the job and runs the first processing round.
func (h *PreSchedulingHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		norJob, ok := job.(*jobmod.NormalJob)
		if !ok {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState()))
			return
		}

		preSchState, ok := norJob.GetState().(*jobmod.StatePreScheduling)
		if !ok {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
			return
		}

		// NOTE(review): colCli is acquired and released but never otherwise used in
		// this function — looks like dead code; confirm before removing.
		colCli, err := schglb.CollectorMQPool.Acquire()
		if err != nil {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err), job.GetState()))
			return
		}
		defer schglb.CollectorMQPool.Release(colCli)

		ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), preSchState.Scheme.TargetCCID)
		if err != nil {
			h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.GetState()))
			return
		}

		norJob.TargetCCID = preSchState.Scheme.TargetCCID
		preJob := &preSchedulingJob{
			job:    norJob,
			state:  preSchState,
			ccInfo: ccInfo,
		}
		h.jobs[job.GetJobID()] = preJob

		// A nil event triggers the initial processing step.
		h.onJobEvent(nil, preJob)
	})
}
|
||||
|
||||
// onJobEvent advances one job in response to an event (nil means "initial tick"):
// it steps the dataset, code and image file schedulings in turn, failing the job on
// the first error, and moves the job onward once all three files are completed.
func (h *PreSchedulingHandler) onJobEvent(evt event.Event, job *preSchedulingJob) {
	if cloneEvt, ok := evt.(*event.CloneJob); ok {
		cloneEvt.Callback.SetValue(job.job.Clone())
		return
	}

	err := h.doPackageScheduling(evt, job,
		job.job.Info.Files.Dataset, &job.job.Files.Dataset,
		&job.state.Scheme.Dataset, &job.state.Dataset,
	)
	if err != nil {
		job.state.Dataset.Error = err.Error()
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}

	err = h.doPackageScheduling(evt, job,
		job.job.Info.Files.Code, &job.job.Files.Code,
		&job.state.Scheme.Code, &job.state.Code,
	)
	if err != nil {
		job.state.Code.Error = err.Error()
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}

	err = h.doImageScheduling(evt, job,
		job.job.Info.Files.Image, &job.job.Files.Image,
		&job.state.Scheme.Image, &job.state.Image,
	)
	if err != nil {
		job.state.Image.Error = err.Error()
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}

	// If all three kinds of files have been scheduled, the job can enter the next phase.
	if job.state.Dataset.Step == jobmod.StepCompleted &&
		job.state.Code.Step == jobmod.StepCompleted &&
		job.state.Image.Step == jobmod.StepCompleted {

		h.changeJobState(job.job, jobmod.NewStateReadyToAdjust())
	}
}
|
||||
|
||||
// changeJobState releases the job from this handler and re-dispatches it under the
// manager's lock so the handler for the new state can take over.
func (h *PreSchedulingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)

	delete(h.jobs, job.GetJobID())

	h.mgr.pubLock.Lock()
	h.mgr.handleState(job)
	h.mgr.pubLock.Unlock()
}
|
||||
|
||||
// doPackageScheduling advances the scheduling of one package-type file through its
// step machine (Begin -> Uploading -> Uploaded -> Moving/Loading -> Completed),
// driven by incoming events. Each call may advance multiple steps; returning a
// non-nil error fails the whole job.
func (h *PreSchedulingHandler) doPackageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
	// TODO consider splitting into multiple functions
	if state.Step == jobmod.StepBegin {
		switch info := fileInfo.(type) {
		case *schsdk.LocalJobFileInfo:
			// Must wait for the client to upload the local file first.
			state.Step = jobmod.StepUploading

		case *schsdk.PackageJobFileInfo:
			file.PackageID = info.PackageID
			state.Step = jobmod.StepUploaded

		case *schsdk.ResourceJobFileInfo:
			// Resource files come from a depended job; nothing to do here.
			state.Step = jobmod.StepCompleted

		default:
			return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
		}
	}

	if state.Step == jobmod.StepUploading {
		// Only a matching LocalFileUploaded event can advance this step.
		if evt == nil {
			return nil
		}

		localFileCmd, ok := evt.(*event.LocalFileUploaded)
		if !ok {
			return nil
		}

		if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath {
			return nil
		}

		if localFileCmd.Error != "" {
			return fmt.Errorf("local file uploading: %s", localFileCmd.Error)
		}

		file.PackageID = localFileCmd.PackageID
		state.Step = jobmod.StepUploaded
	}

	if state.Step == jobmod.StepUploaded {
		if scheme.Action == jobmod.ActionNo {
			state.Step = jobmod.StepCompleted
			return nil
		}

		if scheme.Action == jobmod.ActionMove {
			// TODO UserID is hard-coded to 1.
			fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID))
			if err != nil {
				return fmt.Errorf("starting cache move package: %w", err)
			}

			state.Step = jobmod.StepMoving
			state.FullTaskID = fullTaskID
			return nil

		}

		if scheme.Action == jobmod.ActionLoad {
			// TODO UserID is hard-coded to 1.
			// NOTE(review): "stroage" in the error message below is a typo for "storage".
			fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID))
			if err != nil {
				return fmt.Errorf("starting stroage load package: %w", err)
			}

			state.Step = jobmod.StepLoading
			state.FullTaskID = fullTaskID
			return nil
		}

		return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
	}

	if state.Step == jobmod.StepMoving {
		// Wait for the cache-move task we started to report its status.
		moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
		if err == event.ErrUnconcernedTask {
			return nil
		}

		if err == event.ErrTaskTimeout {
			return fmt.Errorf("cache move package timeout")
		}

		h.mgr.execMgr.ForgetTask(state.FullTaskID)

		// NOTE(review): "pacakge" in the error message below is a typo for "package".
		if moveRet.Error != "" {
			return fmt.Errorf("cache move pacakge: %s", moveRet.Error)
		}

		state.Step = jobmod.StepCompleted
		return nil
	}

	if state.Step == jobmod.StepLoading {
		// Wait for the storage-load task we started to report its status.
		loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID)
		if err == event.ErrUnconcernedTask {
			return nil
		}

		if err == event.ErrTaskTimeout {
			return fmt.Errorf("storage load package timeout")
		}

		h.mgr.execMgr.ForgetTask(state.FullTaskID)

		if loadRet.Error != "" {
			return fmt.Errorf("storage load package: %s", loadRet.Error)
		}

		// Record where the loaded package lives on the storage.
		file.FullPath = loadRet.FullPath

		state.Step = jobmod.StepCompleted
		return nil
	}

	return nil
}
|
||||
|
||||
func (h *PreSchedulingHandler) doImageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
|
||||
// TODO 考虑拆分成多个函数
|
||||
if state.Step == jobmod.StepBegin {
|
||||
switch info := fileInfo.(type) {
|
||||
case *schsdk.LocalJobFileInfo:
|
||||
state.Step = jobmod.StepUploading
|
||||
|
||||
case *schsdk.ImageJobFileInfo:
|
||||
imageInfo, err := h.mgr.db.Image().GetByID(h.mgr.db.SQLCtx(), info.ImageID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting image info: %w", err)
|
||||
}
|
||||
|
||||
file.ImageID = imageInfo.ImageID
|
||||
file.PackageID = imageInfo.CDSPackageID
|
||||
state.Step = jobmod.StepUploaded
|
||||
|
||||
default:
|
||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(info))
|
||||
}
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepUploading {
|
||||
if evt == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
localFileCmd, ok := evt.(*event.LocalFileUploaded)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath {
|
||||
return nil
|
||||
}
|
||||
|
||||
if localFileCmd.Error != "" {
|
||||
return fmt.Errorf("local file uploading: %s", localFileCmd.Error)
|
||||
}
|
||||
|
||||
// 上传完毕,则可以新建一个空的镜像的记录
|
||||
// TODO 镜像名称
|
||||
imgID, err := h.mgr.db.Image().Create(h.mgr.db.SQLCtx(), &localFileCmd.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating image info: %w", err)
|
||||
}
|
||||
|
||||
// 填充ImageID和PackageID
|
||||
file.ImageID = imgID
|
||||
file.PackageID = &localFileCmd.PackageID
|
||||
state.Step = jobmod.StepUploaded
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepUploaded {
|
||||
if scheme.Action == jobmod.ActionNo {
|
||||
state.Step = jobmod.StepCompleted
|
||||
return nil
|
||||
}
|
||||
|
||||
// 要导入镜像,则需要先将镜像移动到指点节点的缓存中
|
||||
if scheme.Action == jobmod.ActionImportImage {
|
||||
if file.PackageID == nil {
|
||||
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID)
|
||||
}
|
||||
|
||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID))
|
||||
if err != nil {
|
||||
return fmt.Errorf("starting cache move package: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepMoving
|
||||
state.FullTaskID = fullTaskID
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepMoving {
|
||||
cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
|
||||
if err == event.ErrUnconcernedTask {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err == event.ErrTaskTimeout {
|
||||
return fmt.Errorf("cache move package timeout")
|
||||
}
|
||||
|
||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
||||
|
||||
if cacheMoveRet.Error != "" {
|
||||
return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error)
|
||||
}
|
||||
|
||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||
if err != nil {
|
||||
return fmt.Errorf("new cloudream storage client: %w", err)
|
||||
}
|
||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||
|
||||
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting package objects: %w", err)
|
||||
}
|
||||
|
||||
if len(pkgObjs.Objects) == 0 {
|
||||
return fmt.Errorf("no object in the package which will be imported")
|
||||
}
|
||||
|
||||
if len(pkgObjs.Objects) > 1 {
|
||||
return fmt.Errorf("there must be only 1 object in the package which will be imported")
|
||||
}
|
||||
|
||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
||||
if err != nil {
|
||||
return fmt.Errorf("starting import image: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepImageImporting
|
||||
state.FullTaskID = fullTaskID
|
||||
return nil
|
||||
}
|
||||
|
||||
if state.Step == jobmod.StepImageImporting {
|
||||
uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID)
|
||||
if err == event.ErrUnconcernedTask {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err == event.ErrTaskTimeout {
|
||||
return fmt.Errorf("import image timeout")
|
||||
}
|
||||
|
||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
||||
|
||||
if uploadImageRet.Error != "" {
|
||||
return fmt.Errorf("import image: %s", uploadImageRet.Error)
|
||||
}
|
||||
|
||||
err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now())
|
||||
if err != nil {
|
||||
return fmt.Errorf("adding image importing info: %w", err)
|
||||
}
|
||||
|
||||
state.Step = jobmod.StepCompleted
|
||||
return nil
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *PreSchedulingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
h.cmdChan.Send(func() {
|
||||
if broadcast.ToAll() {
|
||||
for _, job := range h.jobs {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
|
||||
} else if broadcast.ToJobSet() {
|
||||
for _, job := range h.jobs {
|
||||
if job.job.JobSetID != broadcast.JobSetID {
|
||||
continue
|
||||
}
|
||||
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
} else if broadcast.ToJob() {
|
||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (h *PreSchedulingHandler) Serve() {
|
||||
cmdChan := h.cmdChan.BeginChanReceive()
|
||||
defer h.cmdChan.CloseChanReceive()
|
||||
|
||||
for {
|
||||
select {
|
||||
case cmd := <-cmdChan:
|
||||
cmd()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (h *PreSchedulingHandler) Stop() {
|
||||
// TODO 支持STOP
|
||||
}
|
|
@ -1,214 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type readyToAdjustJob struct {
|
||||
job jobmod.Job
|
||||
state *jobmod.StateReadyToAdjust
|
||||
}
|
||||
|
||||
type ReadyToAdjustHandler struct {
|
||||
mgr *Manager
|
||||
|
||||
jobs map[schsdk.JobID]*readyToAdjustJob
|
||||
|
||||
cmdChan actor.CommandChannel
|
||||
}
|
||||
|
||||
func NewReadyToAdjustHandler(mgr *Manager) *ReadyToAdjustHandler {
|
||||
return &ReadyToAdjustHandler{
|
||||
mgr: mgr,
|
||||
jobs: make(map[schsdk.JobID]*readyToAdjustJob),
|
||||
cmdChan: *actor.NewCommandChannel(),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) Handle(job jobmod.Job) {
|
||||
h.cmdChan.Send(func() {
|
||||
state, ok := job.GetState().(*jobmod.StateReadyToAdjust)
|
||||
if !ok {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
||||
return
|
||||
}
|
||||
|
||||
rjob := &readyToAdjustJob{
|
||||
job: job,
|
||||
state: state,
|
||||
}
|
||||
h.jobs[job.GetJobID()] = rjob
|
||||
|
||||
h.onJobEvent(nil, rjob)
|
||||
})
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) onJobEvent(evt event.Event, job *readyToAdjustJob) {
|
||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
||||
return
|
||||
}
|
||||
|
||||
if norJob, ok := job.job.(*jobmod.NormalJob); ok {
|
||||
h.onNormalJobEvent(evt, job, norJob)
|
||||
} else if resJob, ok := job.job.(*jobmod.ResourceJob); ok {
|
||||
h.onResourceJobEvent(evt, job, resJob)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) onNormalJobEvent(evt event.Event, job *readyToAdjustJob, norJob *jobmod.NormalJob) {
|
||||
h.mgr.pubLock.Lock()
|
||||
jobSet, ok := h.mgr.jobSets[job.job.GetJobSetID()]
|
||||
h.mgr.pubLock.Unlock()
|
||||
if !ok {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()), job.state))
|
||||
return
|
||||
}
|
||||
|
||||
needWait := false
|
||||
|
||||
// 无论发生什么事件,都检查一下前置任务的状态
|
||||
if resFile, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok {
|
||||
ref := jobSet.FindRefByLocalJobID(resFile.ResourceLocalJobID)
|
||||
if ref == nil {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
||||
fmt.Sprintf("job %s not found in job set %s", resFile.ResourceLocalJobID, jobSet.JobSetID),
|
||||
job.state,
|
||||
))
|
||||
return
|
||||
}
|
||||
|
||||
h.mgr.pubLock.Lock()
|
||||
waitJob := h.mgr.jobs[ref.JobID]
|
||||
h.mgr.pubLock.Unlock()
|
||||
if waitJob == nil {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state))
|
||||
return
|
||||
}
|
||||
|
||||
if _, ok = waitJob.Job.GetState().(*jobmod.StateSuccess); ok {
|
||||
waitResJob, ok := waitJob.Job.(*jobmod.ResourceJob)
|
||||
if !ok {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
||||
fmt.Sprintf("job(%v) %s is not a resource job", reflect.TypeOf(waitJob), waitResJob.JobID),
|
||||
job.state,
|
||||
))
|
||||
return
|
||||
}
|
||||
|
||||
norJob.Files.Dataset.PackageID = waitResJob.ResourcePackageID
|
||||
} else if _, ok = waitJob.Job.GetState().(*jobmod.StateFailed); ok {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
||||
fmt.Sprintf("job %s is failed", waitJob.Job.GetJobID()),
|
||||
job.state,
|
||||
))
|
||||
return
|
||||
} else {
|
||||
// 等待的Job不是失败或者成功状态,则需要继续等待
|
||||
needWait = true
|
||||
}
|
||||
}
|
||||
|
||||
if !needWait {
|
||||
h.changeJobState(job.job, jobmod.NewStateMakingAdjustScheme())
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) onResourceJobEvent(evt event.Event, job *readyToAdjustJob, resJob *jobmod.ResourceJob) {
|
||||
h.mgr.pubLock.Lock()
|
||||
jobSet, ok := h.mgr.jobSets[job.job.GetJobSetID()]
|
||||
h.mgr.pubLock.Unlock()
|
||||
if !ok {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()), job.state))
|
||||
return
|
||||
}
|
||||
|
||||
needWait := false
|
||||
|
||||
ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID)
|
||||
if ref == nil {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
||||
fmt.Sprintf("job %s not found in job set %s", resJob.Info.TargetLocalJobID, jobSet.JobSetID),
|
||||
job.state,
|
||||
))
|
||||
return
|
||||
}
|
||||
|
||||
h.mgr.pubLock.Lock()
|
||||
waitJob := h.mgr.jobs[ref.JobID]
|
||||
h.mgr.pubLock.Unlock()
|
||||
if waitJob == nil {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state))
|
||||
return
|
||||
}
|
||||
|
||||
// 无论发生什么事件,都检查一下前置任务的状态
|
||||
if _, ok = waitJob.Job.GetState().(*jobmod.StateFailed); ok {
|
||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
||||
fmt.Sprintf("job %s is failed", waitJob.Job.GetJobID()),
|
||||
job.state,
|
||||
))
|
||||
return
|
||||
} else if _, ok = waitJob.Job.GetState().(*jobmod.StateSuccess); !ok {
|
||||
needWait = true
|
||||
}
|
||||
|
||||
if !needWait {
|
||||
h.changeJobState(job.job, jobmod.NewStateReadyToExecute())
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
||||
job.SetState(state)
|
||||
|
||||
delete(h.jobs, job.GetJobID())
|
||||
|
||||
h.mgr.pubLock.Lock()
|
||||
h.mgr.handleState(job)
|
||||
h.mgr.pubLock.Unlock()
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
h.cmdChan.Send(func() {
|
||||
if broadcast.ToAll() {
|
||||
for _, job := range h.jobs {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
|
||||
} else if broadcast.ToJobSet() {
|
||||
for _, job := range h.jobs {
|
||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
||||
continue
|
||||
}
|
||||
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
} else if broadcast.ToJob() {
|
||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) Serve() {
|
||||
cmdChan := h.cmdChan.BeginChanReceive()
|
||||
defer h.cmdChan.CloseChanReceive()
|
||||
|
||||
for {
|
||||
select {
|
||||
case cmd := <-cmdChan:
|
||||
cmd()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToAdjustHandler) Stop() {
|
||||
// TODO 支持STOP
|
||||
}
|
|
@ -1,122 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type readyToExecuteJob struct {
|
||||
job jobmod.Job
|
||||
state *jobmod.StateReadyToExecute
|
||||
}
|
||||
|
||||
type ReadyToExecuteHandler struct {
|
||||
mgr *Manager
|
||||
|
||||
jobs map[schsdk.JobID]*readyToExecuteJob
|
||||
|
||||
cmdChan actor.CommandChannel
|
||||
}
|
||||
|
||||
func NewReadyToExecuteHandler(mgr *Manager) *ReadyToExecuteHandler {
|
||||
return &ReadyToExecuteHandler{
|
||||
mgr: mgr,
|
||||
jobs: make(map[schsdk.JobID]*readyToExecuteJob),
|
||||
cmdChan: *actor.NewCommandChannel(),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) Handle(job jobmod.Job) {
|
||||
h.cmdChan.Send(func() {
|
||||
state, ok := job.GetState().(*jobmod.StateReadyToExecute)
|
||||
if !ok {
|
||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
||||
return
|
||||
}
|
||||
|
||||
rjob := &readyToExecuteJob{
|
||||
job: job,
|
||||
state: state,
|
||||
}
|
||||
h.jobs[job.GetJobID()] = rjob
|
||||
|
||||
h.onJobEvent(nil, rjob)
|
||||
})
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) onJobEvent(evt event.Event, job *readyToExecuteJob) {
|
||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
||||
return
|
||||
}
|
||||
|
||||
if norJob, ok := job.job.(*jobmod.NormalJob); ok {
|
||||
h.onNormalJobEvent(evt, job, norJob)
|
||||
} else if resJob, ok := job.job.(*jobmod.ResourceJob); ok {
|
||||
h.onResourceJobEvent(evt, job, resJob)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) onNormalJobEvent(evt event.Event, job *readyToExecuteJob, norJob *jobmod.NormalJob) {
|
||||
// TODO 目前直接启动执行
|
||||
h.changeJobState(job.job, jobmod.NewStateExecuting())
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) onResourceJobEvent(evt event.Event, job *readyToExecuteJob, resJob *jobmod.ResourceJob) {
|
||||
// TODO 目前直接启动执行
|
||||
h.changeJobState(job.job, jobmod.NewStateExecuting())
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
||||
job.SetState(state)
|
||||
|
||||
delete(h.jobs, job.GetJobID())
|
||||
|
||||
h.mgr.pubLock.Lock()
|
||||
h.mgr.handleState(job)
|
||||
h.mgr.pubLock.Unlock()
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
||||
h.cmdChan.Send(func() {
|
||||
if broadcast.ToAll() {
|
||||
for _, job := range h.jobs {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
|
||||
} else if broadcast.ToJobSet() {
|
||||
for _, job := range h.jobs {
|
||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
||||
continue
|
||||
}
|
||||
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
} else if broadcast.ToJob() {
|
||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
||||
h.onJobEvent(evt, job)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) Serve() {
|
||||
cmdChan := h.cmdChan.BeginChanReceive()
|
||||
defer h.cmdChan.CloseChanReceive()
|
||||
|
||||
for {
|
||||
select {
|
||||
case cmd := <-cmdChan:
|
||||
cmd()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (h *ReadyToExecuteHandler) Stop() {
|
||||
// TODO 支持STOP
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
package jobmgr
|
||||
|
||||
import (
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
)
|
||||
|
||||
type StateHandler interface {
|
||||
// 处理Job。在此期间全局锁已锁定
|
||||
Handle(job jobmod.Job)
|
||||
// 外部发生了一个事件
|
||||
OnEvent(broadcast event.Broadcast, evt event.Event)
|
||||
// 运行Handler
|
||||
Serve()
|
||||
// 停止此Handler
|
||||
Stop()
|
||||
}
|
|
@ -1,23 +1,50 @@
|
|||
package mq
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job/state"
|
||||
)
|
||||
|
||||
// 提交任务集
|
||||
func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) {
|
||||
logger.Debugf("submitting job")
|
||||
|
||||
jobSet, err := svc.jobMgr.SubmitJobSet(msg.JobSet, msg.PreScheduleScheme)
|
||||
if err != nil {
|
||||
logger.Warnf("submitting job set: %s", err.Error())
|
||||
return nil, mq.Failed(errorcode.OperationFailed, "submit job set failed")
|
||||
var jobs []jobmgr.SubmittingJob
|
||||
for _, jobInfo := range msg.JobSet.Jobs {
|
||||
switch info := jobInfo.(type) {
|
||||
case *schsdk.NormalJobInfo:
|
||||
job := job.NewNormalJob(*info)
|
||||
|
||||
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
||||
if !ok {
|
||||
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
||||
}
|
||||
|
||||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||||
Body: job,
|
||||
InitState: state.NewPreSchuduling(preSch),
|
||||
})
|
||||
|
||||
case *schsdk.DataReturnJobInfo:
|
||||
job := job.NewResourceJob(*info)
|
||||
jobs = append(jobs, jobmgr.SubmittingJob{
|
||||
Body: job,
|
||||
InitState: state.NewWaitTargetComplete(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(jobSet.JobSetID))
|
||||
return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs)))
|
||||
}
|
||||
|
||||
// 任务集中某个文件上传完成
|
||||
|
@ -26,16 +53,15 @@ func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded)
|
|||
WithField("PackageID", msg.PackageID).
|
||||
Debugf("local file uploaded")
|
||||
|
||||
svc.jobMgr.LocalFileUploaded(msg.JobSetID, msg.LocalPath, msg.Error, msg.PackageID)
|
||||
svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, errors.New(msg.Error), msg.PackageID))
|
||||
return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp())
|
||||
}
|
||||
|
||||
func (svc *Service) GetJob(msg *mgrmq.GetJob) (*mgrmq.GetJobResp, *mq.CodeMessage) {
|
||||
job, err := svc.jobMgr.CloneJob(msg.JobID)
|
||||
if err != nil {
|
||||
logger.WithField("JobID", msg.JobID).Warnf("cloning job: %s", err.Error())
|
||||
return nil, mq.Failed(errorcode.OperationFailed, "get job failed")
|
||||
func (svc *Service) GetJobSetStatus(msg *mgrmq.GetJobSetStatus) (*mgrmq.GetJobSetStatusResp, *mq.CodeMessage) {
|
||||
jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)
|
||||
if len(jobs) == 0 {
|
||||
return nil, mq.Failed(errorcode.OperationFailed, "job set not found")
|
||||
}
|
||||
|
||||
return mq.ReplyOK(mgrmq.NewGetJobResp(job))
|
||||
return mq.ReplyOK(mgrmq.RespGetJobSetStatus(jobs))
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue