重构manager模块
This commit is contained in:
parent
49a80a693c
commit
1e1c8dd691
|
@ -38,7 +38,7 @@ const (
|
||||||
var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait")
|
var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait")
|
||||||
|
|
||||||
type Scheduler interface {
|
type Scheduler interface {
|
||||||
Schedule(info *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error)
|
Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
type candidate struct {
|
type candidate struct {
|
||||||
|
@ -129,7 +129,7 @@ func NewDefaultSchedule() *DefaultScheduler {
|
||||||
return &DefaultScheduler{}
|
return &DefaultScheduler{}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) {
|
func (s *DefaultScheduler) Schedule(info *schsdk.NormalJobInfo, status jobmod.NormalJobStatus) (*jobmod.JobScheduleScheme, error) {
|
||||||
mgrCli, err := schglb.ManagerMQPool.Acquire()
|
mgrCli, err := schglb.ManagerMQPool.Acquire()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("new collector client: %w", err)
|
return nil, fmt.Errorf("new collector client: %w", err)
|
||||||
|
@ -151,17 +151,17 @@ func (s *DefaultScheduler) Schedule(job *jobmod.NormalJob) (*jobmod.JobScheduleS
|
||||||
for _, cc := range allCC.ComputingCenters {
|
for _, cc := range allCC.ComputingCenters {
|
||||||
allCCs[cc.CCID] = &candidate{
|
allCCs[cc.CCID] = &candidate{
|
||||||
CC: cc,
|
CC: cc,
|
||||||
IsPreScheduled: cc.CCID == job.TargetCCID,
|
IsPreScheduled: cc.CCID == status.TargetCCID,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 计算
|
// 计算
|
||||||
err = s.calcFileScore(job.Files, allCCs)
|
err = s.calcFileScore(status.Files, allCCs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = s.calcResourceScore(job, allCCs)
|
err = s.calcResourceScore(info, allCCs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
@ -204,9 +204,9 @@ func (s *DefaultScheduler) makeSchemeForNode(targetCC *candidate) jobmod.JobSche
|
||||||
return scheme
|
return scheme
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *DefaultScheduler) calcResourceScore(job *jobmod.NormalJob, allCCs map[schsdk.CCID]*candidate) error {
|
func (s *DefaultScheduler) calcResourceScore(info *schsdk.NormalJobInfo, allCCs map[schsdk.CCID]*candidate) error {
|
||||||
for _, cc := range allCCs {
|
for _, cc := range allCCs {
|
||||||
res, err := s.calcOneResourceScore(job.Info.Resources, &cc.CC)
|
res, err := s.calcOneResourceScore(info.Resources, &cc.CC)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,7 @@ import (
|
||||||
"github.com/samber/lo"
|
"github.com/samber/lo"
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/future"
|
"gitlink.org.cn/cloudream/common/pkgs/future"
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -30,7 +31,7 @@ func NewService(scheduler Scheduler) *Service {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Service) MakeScheme(job jobmod.NormalJob) (*jobmod.JobScheduleScheme, error) {
|
func (s *Service) MakeScheme(job schsdk.NormalJobInfo) (*jobmod.JobScheduleScheme, error) {
|
||||||
s.lock.Lock()
|
s.lock.Lock()
|
||||||
callback := future.NewSetValue[*jobmod.JobScheduleScheme]()
|
callback := future.NewSetValue[*jobmod.JobScheduleScheme]()
|
||||||
s.jobs = append(s.jobs, &schedulingJob{
|
s.jobs = append(s.jobs, &schedulingJob{
|
||||||
|
|
|
@ -4,7 +4,7 @@ import (
|
||||||
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
||||||
"gitlink.org.cn/cloudream/common/utils/reflect"
|
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||||
myglbs "gitlink.org.cn/cloudream/scheduler/advisor/internal/globals"
|
myglbs "gitlink.org.cn/cloudream/scheduler/advisor/internal/globals"
|
||||||
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
|
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
|
||||||
)
|
)
|
||||||
|
@ -12,7 +12,7 @@ import (
|
||||||
func (svc *Service) StartTask(msg *advmq.StartTask) (*advmq.StartTaskResp, *mq.CodeMessage) {
|
func (svc *Service) StartTask(msg *advmq.StartTask) (*advmq.StartTaskResp, *mq.CodeMessage) {
|
||||||
tsk, err := svc.taskManager.StartByInfo(msg.Info)
|
tsk, err := svc.taskManager.StartByInfo(msg.Info)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()).
|
logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()).
|
||||||
Warnf("starting task by info: %s", err.Error())
|
Warnf("starting task by info: %s", err.Error())
|
||||||
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
|
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ func (t *MakeScheduleScheme) Execute(task *task.Task[TaskContext], ctx TaskConte
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *MakeScheduleScheme) do(taskID string, ctx TaskContext) (*jobmod.JobScheduleScheme, error) {
|
func (t *MakeScheduleScheme) do(taskID string, ctx TaskContext) (*jobmod.JobScheduleScheme, error) {
|
||||||
scheme, err := ctx.scheduleSvc.MakeScheme(t.Job)
|
scheme, err := ctx.scheduleSvc.MakeScheme(t.JobInfo)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@ import (
|
||||||
"reflect"
|
"reflect"
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/task"
|
"gitlink.org.cn/cloudream/common/pkgs/task"
|
||||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||||
reporter "gitlink.org.cn/cloudream/scheduler/advisor/internal/reporter"
|
reporter "gitlink.org.cn/cloudream/scheduler/advisor/internal/reporter"
|
||||||
"gitlink.org.cn/cloudream/scheduler/advisor/internal/scheduler"
|
"gitlink.org.cn/cloudream/scheduler/advisor/internal/scheduler"
|
||||||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||||||
|
@ -40,7 +40,7 @@ func NewManager(reporter *reporter.Reporter, scheduleSvc *scheduler.Service) Man
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) {
|
func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) {
|
||||||
infoType := myreflect.TypeOfValue(info)
|
infoType := reflect2.TypeOfValue(info)
|
||||||
|
|
||||||
ctor, ok := taskFromInfoCtors[infoType]
|
ctor, ok := taskFromInfoCtors[infoType]
|
||||||
if !ok {
|
if !ok {
|
||||||
|
@ -53,7 +53,7 @@ func (m *Manager) StartByInfo(info advtsk.TaskInfo) (*Task, error) {
|
||||||
var taskFromInfoCtors map[reflect.Type]func(advtsk.TaskInfo) TaskBody = make(map[reflect.Type]func(advtsk.TaskInfo) task.TaskBody[TaskContext])
|
var taskFromInfoCtors map[reflect.Type]func(advtsk.TaskInfo) TaskBody = make(map[reflect.Type]func(advtsk.TaskInfo) task.TaskBody[TaskContext])
|
||||||
|
|
||||||
func Register[TInfo advtsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
|
func Register[TInfo advtsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
|
||||||
taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody {
|
taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info advtsk.TaskInfo) TaskBody {
|
||||||
return ctor(info.(TInfo))
|
return ctor(info.(TInfo))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -171,7 +171,7 @@ func (s *DefaultPreScheduler) Schedule(info *schsdk.JobSetInfo) (*jobmod.JobSetP
|
||||||
if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok {
|
if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok {
|
||||||
j.Afters = append(j.Afters, resFile.ResourceLocalJobID)
|
j.Afters = append(j.Afters, resFile.ResourceLocalJobID)
|
||||||
}
|
}
|
||||||
} else if resJob, ok := job.(*schsdk.ResourceJobInfo); ok {
|
} else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok {
|
||||||
j.Afters = append(j.Afters, resJob.TargetLocalJobID)
|
j.Afters = append(j.Afters, resJob.TargetLocalJobID)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -270,7 +270,7 @@ func (s *DefaultPreScheduler) scheduleForNormalJob(jobSet *schsdk.JobSetInfo, jo
|
||||||
|
|
||||||
// 检查此节点是否是它所引用的任务所选的节点
|
// 检查此节点是否是它所引用的任务所选的节点
|
||||||
for _, af := range job.Afters {
|
for _, af := range job.Afters {
|
||||||
resJob := findJobInfo[*schsdk.ResourceJobInfo](jobSet.Jobs, af)
|
resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af)
|
||||||
if resJob == nil {
|
if resJob == nil {
|
||||||
return nil, fmt.Errorf("resource job %s not found in the job set", af)
|
return nil, fmt.Errorf("resource job %s not found in the job set", af)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,8 @@
|
||||||
package jobmod
|
package jobmod
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/samber/lo"
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||||
)
|
)
|
||||||
|
|
||||||
type FileScheduleAction string
|
type FileScheduleAction string
|
||||||
|
@ -34,66 +32,41 @@ type JobSetPreScheduleScheme struct {
|
||||||
JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID
|
JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID
|
||||||
}
|
}
|
||||||
|
|
||||||
// 任务集
|
type JobFiles struct {
|
||||||
type JobSet struct {
|
Dataset PackageJobFile `json:"dataset"`
|
||||||
JobSetID schsdk.JobSetID `json:"jobSetID"` // 全局唯一的任务集ID
|
Code PackageJobFile `json:"code"`
|
||||||
JobRefs []JobSetJobRef `json:"jobRefs"` // 任务集中包含的任务,只是一个引用
|
Image ImageJobFile `json:"image"`
|
||||||
PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
|
|
||||||
}
|
|
||||||
type JobSetJobRef struct {
|
|
||||||
JobID schsdk.JobID `json:"jobID"` // 任务ID
|
|
||||||
LocalJobID string `json:"localJobID"` // 在当前任务集内的任务ID
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet {
|
type PackageJobFile struct {
|
||||||
return &JobSet{
|
PackageID cdssdk.PackageID `json:"packageID"`
|
||||||
JobSetID: jobSetID,
|
FullPath string `json:"fullPath"` // Load之后的完整文件路径
|
||||||
JobRefs: jobRefs,
|
|
||||||
PreScheduleScheme: preScheduleScheme,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef {
|
type ImageJobFile struct {
|
||||||
ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID })
|
PackageID *cdssdk.PackageID `json:"packageID"`
|
||||||
if !ok {
|
ImageID schsdk.ImageID `json:"imageID"`
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return &ref
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 任务
|
type JobStatus struct {
|
||||||
type Job interface {
|
JobID schsdk.JobID `json:"jobID"`
|
||||||
GetJobSetID() schsdk.JobSetID
|
JobSetID schsdk.JobSetID `json:"jobSetID"`
|
||||||
GetJobID() schsdk.JobID
|
Info schsdk.JobInfo `json:"info"`
|
||||||
GetState() JobState
|
Body JobBodyStatus `json:"body"`
|
||||||
SetState(state JobState)
|
State JobStateStatus `json:"state"`
|
||||||
Clone() Job
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var JobTypeUnion = types.NewTypeUnion[Job](
|
type JobBodyStatus interface {
|
||||||
(*NormalJob)(nil),
|
|
||||||
(*ResourceJob)(nil),
|
|
||||||
)
|
|
||||||
var _ = serder.UseTypeUnionExternallyTagged(&JobTypeUnion)
|
|
||||||
|
|
||||||
// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobTypeUnion, "Type", "type")
|
|
||||||
|
|
||||||
type JobBase struct {
|
|
||||||
JobSetID schsdk.JobSetID `json:"jobSetID"` // 任务集ID
|
|
||||||
JobID schsdk.JobID `json:"jobID"` // 全局唯一任务ID
|
|
||||||
State JobState `json:"state"` // 任务当前的状态。包含当前在状态下执行操作所需的数据
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j *JobBase) GetJobSetID() schsdk.JobSetID {
|
type NormalJobStatus struct {
|
||||||
return j.JobSetID
|
TargetCCID schsdk.CCID `json:"targetCCID"`
|
||||||
|
Files JobFiles `json:"files"`
|
||||||
}
|
}
|
||||||
func (j *JobBase) GetJobID() schsdk.JobID {
|
|
||||||
return j.JobID
|
type DataReturnJobStatus struct {
|
||||||
|
DataReturnPackageID cdssdk.PackageID `json:"dataReturnPackageID"`
|
||||||
}
|
}
|
||||||
func (j *JobBase) GetState() JobState {
|
|
||||||
return j.State
|
type JobStateStatus interface {
|
||||||
}
|
|
||||||
func (j *JobBase) SetState(state JobState) {
|
|
||||||
j.State = state
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,46 +0,0 @@
|
||||||
package jobmod
|
|
||||||
|
|
||||||
import (
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
|
||||||
)
|
|
||||||
|
|
||||||
type NormalJob struct {
|
|
||||||
JobBase
|
|
||||||
Info schsdk.NormalJobInfo `json:"info"` // 提交任务时提供的任务描述信息
|
|
||||||
Files JobFiles `json:"files"` // 任务需要的文件
|
|
||||||
TargetCCID schsdk.CCID `json:"targetSlwNodeID"` // 将要运行此任务的算力中心ID
|
|
||||||
OutputFullPath string `json:"outputFullPath"` // 程序结果的完整输出路径
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewNormalJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.NormalJobInfo) *NormalJob {
|
|
||||||
return &NormalJob{
|
|
||||||
JobBase: JobBase{
|
|
||||||
JobSetID: jobSetID,
|
|
||||||
JobID: jobID,
|
|
||||||
},
|
|
||||||
Info: info,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (j *NormalJob) Clone() Job {
|
|
||||||
tmp := *j
|
|
||||||
tmp.State = tmp.State.Clone()
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type JobFiles struct {
|
|
||||||
Dataset PackageJobFile `json:"dataset"`
|
|
||||||
Code PackageJobFile `json:"code"`
|
|
||||||
Image ImageJobFile `json:"image"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type PackageJobFile struct {
|
|
||||||
PackageID cdssdk.PackageID `json:"packageID"`
|
|
||||||
FullPath string `json:"fullPath"` // Load之后的完整文件路径
|
|
||||||
}
|
|
||||||
|
|
||||||
type ImageJobFile struct {
|
|
||||||
PackageID *cdssdk.PackageID `json:"packageID"`
|
|
||||||
ImageID schsdk.ImageID `json:"imageID"`
|
|
||||||
}
|
|
|
@ -1,28 +0,0 @@
|
||||||
package jobmod
|
|
||||||
|
|
||||||
import (
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
|
||||||
)
|
|
||||||
|
|
||||||
type ResourceJob struct {
|
|
||||||
JobBase
|
|
||||||
Info schsdk.ResourceJobInfo `json:"info"`
|
|
||||||
ResourcePackageID cdssdk.PackageID `json:"resourcePackageID"` // 回源之后得到的PackageID
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewResourceJob(jobSetID schsdk.JobSetID, jobID schsdk.JobID, info schsdk.ResourceJobInfo) *ResourceJob {
|
|
||||||
return &ResourceJob{
|
|
||||||
JobBase: JobBase{
|
|
||||||
JobSetID: jobSetID,
|
|
||||||
JobID: jobID,
|
|
||||||
},
|
|
||||||
Info: info,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (j *ResourceJob) Clone() Job {
|
|
||||||
tmp := *j
|
|
||||||
tmp.State = tmp.State.Clone()
|
|
||||||
return &tmp
|
|
||||||
}
|
|
|
@ -1,183 +0,0 @@
|
||||||
package jobmod
|
|
||||||
|
|
||||||
import (
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
|
||||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
|
||||||
)
|
|
||||||
|
|
||||||
type JobState interface {
|
|
||||||
Clone() JobState
|
|
||||||
}
|
|
||||||
type JobStateBase struct{}
|
|
||||||
|
|
||||||
var JobStateTypeUnion = types.NewTypeUnion[JobState](
|
|
||||||
(*StatePreScheduling)(nil),
|
|
||||||
(*StateReadyToAdjust)(nil),
|
|
||||||
(*StateMakingAdjustScheme)(nil),
|
|
||||||
(*StateAdjusting)(nil),
|
|
||||||
(*StateReadyToExecute)(nil),
|
|
||||||
(*StateExecuting)(nil),
|
|
||||||
(*StateFailed)(nil),
|
|
||||||
(*StateSuccess)(nil),
|
|
||||||
)
|
|
||||||
var _ = serder.UseTypeUnionExternallyTagged(&JobStateTypeUnion)
|
|
||||||
|
|
||||||
// TODO var _ = serder.RegisterNewTaggedTypeUnion(JobStateTypeUnion, "Type", "type")
|
|
||||||
|
|
||||||
type FileSchedulingStep string
|
|
||||||
|
|
||||||
const (
|
|
||||||
StepBegin FileSchedulingStep = "Begin" // 准备开始调度
|
|
||||||
StepUploading FileSchedulingStep = "Uploading" // 正在等待文件上传
|
|
||||||
StepUploaded FileSchedulingStep = "Uploaded" // 文件上传完成
|
|
||||||
StepMoving FileSchedulingStep = "Moving" // 正在移动缓存
|
|
||||||
StepLoading FileSchedulingStep = "Loading" // 正在加载
|
|
||||||
StepImageImporting FileSchedulingStep = "ImageImporting" // 正在导入镜像
|
|
||||||
StepCompleted FileSchedulingStep = "Completed" // 完成
|
|
||||||
)
|
|
||||||
|
|
||||||
type FileSchedulingState struct {
|
|
||||||
Step FileSchedulingStep `json:"step"`
|
|
||||||
Error string `json:"error"`
|
|
||||||
FullTaskID string `json:"fullTaskID"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type StatePreScheduling struct {
|
|
||||||
JobStateBase
|
|
||||||
Scheme JobScheduleScheme `json:"scheme"`
|
|
||||||
Dataset FileSchedulingState `json:"dataset"`
|
|
||||||
Code FileSchedulingState `json:"code"`
|
|
||||||
Image FileSchedulingState `json:"image"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStatePreScheduling(scheme JobScheduleScheme) *StatePreScheduling {
|
|
||||||
return &StatePreScheduling{
|
|
||||||
Scheme: scheme,
|
|
||||||
Dataset: FileSchedulingState{
|
|
||||||
Step: StepBegin,
|
|
||||||
},
|
|
||||||
Code: FileSchedulingState{
|
|
||||||
Step: StepBegin,
|
|
||||||
},
|
|
||||||
Image: FileSchedulingState{
|
|
||||||
Step: StepBegin,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func (s *StatePreScheduling) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type StateReadyToAdjust struct {
|
|
||||||
JobStateBase
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStateReadyToAdjust() *StateReadyToAdjust {
|
|
||||||
return &StateReadyToAdjust{}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *StateReadyToAdjust) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type StateMakingAdjustScheme struct {
|
|
||||||
JobStateBase
|
|
||||||
FullTaskID string `json:"fullTaskID"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStateMakingAdjustScheme() *StateMakingAdjustScheme {
|
|
||||||
return &StateMakingAdjustScheme{}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *StateMakingAdjustScheme) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type StateAdjusting struct {
|
|
||||||
JobStateBase
|
|
||||||
Scheme JobScheduleScheme `json:"scheme"`
|
|
||||||
Dataset FileSchedulingState `json:"dataset"`
|
|
||||||
Code FileSchedulingState `json:"code"`
|
|
||||||
Image FileSchedulingState `json:"image"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStateAdjusting(scheme JobScheduleScheme) *StateAdjusting {
|
|
||||||
return &StateAdjusting{
|
|
||||||
Scheme: scheme,
|
|
||||||
Dataset: FileSchedulingState{
|
|
||||||
Step: StepBegin,
|
|
||||||
},
|
|
||||||
Code: FileSchedulingState{
|
|
||||||
Step: StepBegin,
|
|
||||||
},
|
|
||||||
Image: FileSchedulingState{
|
|
||||||
Step: StepBegin,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *StateAdjusting) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type StateReadyToExecute struct {
|
|
||||||
JobStateBase
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStateReadyToExecute() *StateReadyToExecute {
|
|
||||||
return &StateReadyToExecute{}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *StateReadyToExecute) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type StateExecuting struct {
|
|
||||||
JobStateBase
|
|
||||||
FullTaskID string `json:"fullTaskID"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStateExecuting() *StateExecuting {
|
|
||||||
return &StateExecuting{}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *StateExecuting) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type StateFailed struct {
|
|
||||||
JobStateBase
|
|
||||||
Error string `json:"error"`
|
|
||||||
LastState JobState `json:"lastState"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStateFailed(err string, lastState JobState) *StateFailed {
|
|
||||||
return &StateFailed{
|
|
||||||
Error: err,
|
|
||||||
LastState: lastState,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *StateFailed) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
||||||
|
|
||||||
type StateSuccess struct {
|
|
||||||
JobStateBase
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewStateSuccess() *StateSuccess {
|
|
||||||
return &StateSuccess{}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *StateSuccess) Clone() JobState {
|
|
||||||
tmp := *s
|
|
||||||
return &tmp
|
|
||||||
}
|
|
|
@ -8,7 +8,7 @@ import (
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||||
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
|
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
|
||||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -78,7 +78,7 @@ type CCResourceInfo struct {
|
||||||
func (i *CCResourceInfo) Scan(src interface{}) error {
|
func (i *CCResourceInfo) Scan(src interface{}) error {
|
||||||
data, ok := src.([]uint8)
|
data, ok := src.([]uint8)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fmt.Errorf("unknow src type: %v", myreflect.TypeOfValue(data).String())
|
return fmt.Errorf("unknow src type: %v", reflect2.TypeOfValue(data).String())
|
||||||
}
|
}
|
||||||
|
|
||||||
return serder.JSONToObject(data, i)
|
return serder.JSONToObject(data, i)
|
||||||
|
|
|
@ -1,17 +1,20 @@
|
||||||
package task
|
package task
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MakeAdjustScheme struct {
|
type MakeAdjustScheme struct {
|
||||||
TaskInfoBase
|
TaskInfoBase
|
||||||
Job jobmod.NormalJob `json:"job"`
|
JobInfo schsdk.NormalJobInfo `json:"jobInfo"`
|
||||||
|
JobStatus jobmod.NormalJobStatus `json:"jobStatus"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewMakeAdjustScheme(job jobmod.NormalJob) *MakeAdjustScheme {
|
func NewMakeAdjustScheme(jobInfo schsdk.NormalJobInfo, jobStatus jobmod.NormalJobStatus) *MakeAdjustScheme {
|
||||||
return &MakeAdjustScheme{
|
return &MakeAdjustScheme{
|
||||||
Job: job,
|
JobInfo: jobInfo,
|
||||||
|
JobStatus: jobStatus,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@ package task
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
"gitlink.org.cn/cloudream/common/pkgs/types"
|
||||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {}
|
||||||
|
|
||||||
// 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行
|
// 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行
|
||||||
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
|
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
|
||||||
TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]())
|
TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]())
|
||||||
|
|
||||||
TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]())
|
TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]())
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,7 @@ package task
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/types"
|
"gitlink.org.cn/cloudream/common/pkgs/types"
|
||||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||||
"gitlink.org.cn/cloudream/common/utils/serder"
|
"gitlink.org.cn/cloudream/common/utils/serder"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -32,9 +32,9 @@ func (s *TaskStatusBase) Noop() {}
|
||||||
|
|
||||||
// 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行
|
// 只能在init函数中调用,因为包级变量初始化会比init函数调用先进行
|
||||||
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
|
func Register[TTaskInfo TaskInfo, TTaskStatus TaskStatus]() any {
|
||||||
TaskInfoTypeUnion.Add(myreflect.TypeOf[TTaskInfo]())
|
TaskInfoTypeUnion.Add(reflect2.TypeOf[TTaskInfo]())
|
||||||
|
|
||||||
TaskStatusTypeUnion.Add(myreflect.TypeOf[TTaskStatus]())
|
TaskStatusTypeUnion.Add(reflect2.TypeOf[TTaskStatus]())
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,9 +12,7 @@ type JobService interface {
|
||||||
|
|
||||||
JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded) (*JobSetLocalFileUploadedResp, *mq.CodeMessage)
|
JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded) (*JobSetLocalFileUploadedResp, *mq.CodeMessage)
|
||||||
|
|
||||||
GetJob(msg *GetJob) (*GetJobResp, *mq.CodeMessage)
|
GetJobSetStatus(msg *GetJobSetStatus) (*GetJobSetStatusResp, *mq.CodeMessage)
|
||||||
|
|
||||||
// GetJobSetJobs(msg *GetJobSetJobs) (*GetJobSetJobsResp, *mq.CodeMessage)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 提交任务集
|
// 提交任务集
|
||||||
|
@ -74,52 +72,28 @@ func (c *Client) JobSetLocalFileUploaded(msg *JobSetLocalFileUploaded, opts ...m
|
||||||
return mq.Request(Service.JobSetLocalFileUploaded, c.roundTripper, msg, opts...)
|
return mq.Request(Service.JobSetLocalFileUploaded, c.roundTripper, msg, opts...)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 获取任务数据
|
var _ = Register(Service.GetJobSetStatus)
|
||||||
type GetJob struct {
|
|
||||||
|
// 获取任务集的状态
|
||||||
|
type GetJobSetStatus struct {
|
||||||
mq.MessageBodyBase
|
mq.MessageBodyBase
|
||||||
JobID schsdk.JobID `json:"jobID"`
|
JobSetID schsdk.JobSetID `json:"jobSetID"`
|
||||||
}
|
}
|
||||||
type GetJobResp struct {
|
type GetJobSetStatusResp struct {
|
||||||
mq.MessageBodyBase
|
mq.MessageBodyBase
|
||||||
Job jobmod.Job `json:"job"`
|
Jobs []jobmod.JobStatus `json:"jobs"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewGetJob(jobID schsdk.JobID) *GetJob {
|
func ReqGetJobSetStatus(jobSetID schsdk.JobSetID) *GetJobSetStatus {
|
||||||
return &GetJob{
|
return &GetJobSetStatus{
|
||||||
JobID: jobID,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func NewGetJobResp(job jobmod.Job) *GetJobResp {
|
|
||||||
return &GetJobResp{
|
|
||||||
Job: job,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
func (c *Client) GetJob(msg *GetJob, opts ...mq.RequestOption) (*GetJobResp, error) {
|
|
||||||
return mq.Request(Service.GetJob, c.roundTripper, msg, opts...)
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
// 获取指定任务集中的所有任务数据
|
|
||||||
type GetJobSetJobs struct {
|
|
||||||
mq.MessageBodyBase
|
|
||||||
JobSetID string `json:"jobSetID"`
|
|
||||||
}
|
|
||||||
type GetJobSetJobsResp struct {
|
|
||||||
mq.MessageBodyBase
|
|
||||||
Jobs []jobmod.Job `json:"jobs"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewGetJobSetJobs(jobSetID string) *GetJobSetJobs {
|
|
||||||
return &GetJobSetJobs{
|
|
||||||
JobSetID: jobSetID,
|
JobSetID: jobSetID,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
func NewGetJobSetJobsResp(jobs []jobmod.Job) *GetJobSetJobsResp {
|
func RespGetJobSetStatus(jobs []jobmod.JobStatus) *GetJobSetStatusResp {
|
||||||
return &GetJobSetJobsResp{
|
return &GetJobSetStatusResp{
|
||||||
Jobs: jobs,
|
Jobs: jobs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
func (c *Client) GetJobSetJobs(msg *GetJobSetJobs, opts ...mq.RequestOption) (*GetJobSetJobsResp, error) {
|
func (c *Client) GetJob(msg *GetJobSetStatus, opts ...mq.RequestOption) (*GetJobSetStatusResp, error) {
|
||||||
return mq.Request(Service.GetJobSetJobs, c.rabbitCli, msg, opts...)
|
return mq.Request(Service.GetJobSetStatus, c.roundTripper, msg, opts...)
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ import (
|
||||||
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
||||||
"gitlink.org.cn/cloudream/common/utils/reflect"
|
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||||
execmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
|
execmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
|
||||||
myglbs "gitlink.org.cn/cloudream/scheduler/executor/internal/globals"
|
myglbs "gitlink.org.cn/cloudream/scheduler/executor/internal/globals"
|
||||||
)
|
)
|
||||||
|
@ -12,7 +12,7 @@ import (
|
||||||
func (svc *Service) StartTask(msg *execmq.StartTask) (*execmq.StartTaskResp, *mq.CodeMessage) {
|
func (svc *Service) StartTask(msg *execmq.StartTask) (*execmq.StartTaskResp, *mq.CodeMessage) {
|
||||||
tsk, err := svc.taskManager.StartByInfo(msg.Info)
|
tsk, err := svc.taskManager.StartByInfo(msg.Info)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logger.WithField("Info", reflect.TypeOfValue(msg.Info).Name()).
|
logger.WithField("Info", reflect2.TypeOfValue(msg.Info).Name()).
|
||||||
Warnf("starting task by info: %s", err.Error())
|
Warnf("starting task by info: %s", err.Error())
|
||||||
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
|
return nil, mq.Failed(errorcode.OperationFailed, "start task by info failed")
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,7 +81,7 @@ func (t *PCMSubmitTask) do(taskID string, ctx TaskContext) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if tsResp.TaskStatus == pcmsdk.TaskStatuFailed {
|
if tsResp.TaskStatus == pcmsdk.TaskStatusFailed {
|
||||||
// TODO 返回更详细的信息
|
// TODO 返回更详细的信息
|
||||||
return fmt.Errorf("task failed")
|
return fmt.Errorf("task failed")
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@ import (
|
||||||
"reflect"
|
"reflect"
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/task"
|
"gitlink.org.cn/cloudream/common/pkgs/task"
|
||||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
"gitlink.org.cn/cloudream/common/utils/reflect2"
|
||||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||||
reporter "gitlink.org.cn/cloudream/scheduler/executor/internal/reporter"
|
reporter "gitlink.org.cn/cloudream/scheduler/executor/internal/reporter"
|
||||||
)
|
)
|
||||||
|
@ -37,7 +37,7 @@ func NewManager(reporter *reporter.Reporter) Manager {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) {
|
func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) {
|
||||||
infoType := myreflect.TypeOfValue(info)
|
infoType := reflect2.TypeOfValue(info)
|
||||||
|
|
||||||
ctor, ok := taskFromInfoCtors[infoType]
|
ctor, ok := taskFromInfoCtors[infoType]
|
||||||
if !ok {
|
if !ok {
|
||||||
|
@ -50,7 +50,7 @@ func (m *Manager) StartByInfo(info exectsk.TaskInfo) (*Task, error) {
|
||||||
var taskFromInfoCtors map[reflect.Type]func(exectsk.TaskInfo) TaskBody = make(map[reflect.Type]func(exectsk.TaskInfo) task.TaskBody[TaskContext])
|
var taskFromInfoCtors map[reflect.Type]func(exectsk.TaskInfo) TaskBody = make(map[reflect.Type]func(exectsk.TaskInfo) task.TaskBody[TaskContext])
|
||||||
|
|
||||||
func Register[TInfo exectsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
|
func Register[TInfo exectsk.TaskInfo, TTaskBody TaskBody](ctor func(info TInfo) TTaskBody) {
|
||||||
taskFromInfoCtors[myreflect.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody {
|
taskFromInfoCtors[reflect2.TypeOf[TInfo]()] = func(info exectsk.TaskInfo) TaskBody {
|
||||||
return ctor(info.(TInfo))
|
return ctor(info.(TInfo))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,7 @@ import (
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
"gitlink.org.cn/cloudream/common/utils/sync2"
|
||||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||||
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
|
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
|
||||||
|
@ -13,29 +13,23 @@ import (
|
||||||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||||||
)
|
)
|
||||||
|
|
||||||
type jobTask struct {
|
type task struct {
|
||||||
JobID schsdk.JobID
|
statusChan *sync2.Channel[advtsk.TaskStatus]
|
||||||
TaskID string
|
|
||||||
FullTaskID string
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type AdvisorInfo struct {
|
type AdvisorInfo struct {
|
||||||
advisorID schmod.AdvisorID
|
advisorID schmod.AdvisorID
|
||||||
jobTasks map[string]jobTask // key 为 TaskID
|
tasks map[string]task // key 为 TaskID
|
||||||
lastReportTime time.Time
|
lastReportTime time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus)
|
var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
|
||||||
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
|
|
||||||
|
|
||||||
type Manager struct {
|
type Manager struct {
|
||||||
advisors map[schmod.AdvisorID]*AdvisorInfo
|
advisors map[schmod.AdvisorID]*AdvisorInfo
|
||||||
lock sync.Mutex
|
lock sync.Mutex
|
||||||
advCli *advmq.Client
|
advCli *advmq.Client
|
||||||
|
|
||||||
onTaskUpdated OnTaskUpdatedCallbackFn
|
|
||||||
onTaskTimeout OnTimeoutCallbackFn
|
|
||||||
|
|
||||||
reportTimeout time.Duration
|
reportTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -52,83 +46,66 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) {
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
|
|
||||||
m.onTaskUpdated = callback
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
|
|
||||||
m.onTaskTimeout = callback
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) Report(advID schmod.AdvisorID, taskStatus []mgrmq.AdvisorTaskStatus) {
|
func (m *Manager) Report(advID schmod.AdvisorID, taskStatus []mgrmq.AdvisorTaskStatus) {
|
||||||
m.lock.Lock()
|
m.lock.Lock()
|
||||||
defer m.lock.Unlock()
|
defer m.lock.Unlock()
|
||||||
|
|
||||||
info, ok := m.advisors[advID]
|
adv, ok := m.advisors[advID]
|
||||||
if !ok {
|
if !ok {
|
||||||
info = &AdvisorInfo{
|
adv = &AdvisorInfo{
|
||||||
advisorID: advID,
|
advisorID: advID,
|
||||||
jobTasks: make(map[string]jobTask),
|
tasks: make(map[string]task),
|
||||||
}
|
}
|
||||||
m.advisors[advID] = info
|
m.advisors[advID] = adv
|
||||||
}
|
}
|
||||||
|
|
||||||
info.lastReportTime = time.Now()
|
adv.lastReportTime = time.Now()
|
||||||
|
|
||||||
for _, s := range taskStatus {
|
for _, s := range taskStatus {
|
||||||
tsk, ok := info.jobTasks[s.TaskID]
|
tsk, ok := adv.tasks[s.TaskID]
|
||||||
if !ok {
|
if !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status)
|
// TODO 考虑主动检测channel是否关闭,然后取消task
|
||||||
|
if tsk.statusChan.Send(s.Status) != nil {
|
||||||
|
delete(adv.tasks, s.TaskID)
|
||||||
|
|
||||||
|
if len(adv.tasks) == 0 {
|
||||||
|
delete(m.advisors, advID)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 启动一个Task,并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID
|
// 启动一个Task
|
||||||
func (m *Manager) StartTask(jobID schsdk.JobID, info advtsk.TaskInfo) (string, error) {
|
func (m *Manager) StartTask(info advtsk.TaskInfo) *sync2.Channel[advtsk.TaskStatus] {
|
||||||
m.lock.Lock()
|
m.lock.Lock()
|
||||||
defer m.lock.Unlock()
|
defer m.lock.Unlock()
|
||||||
|
|
||||||
|
ch := sync2.NewChannel[advtsk.TaskStatus]()
|
||||||
|
|
||||||
resp, err := m.advCli.StartTask(advmq.NewStartTask(info))
|
resp, err := m.advCli.StartTask(advmq.NewStartTask(info))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
ch.CloseWithError(fmt.Errorf("start task: %w", err))
|
||||||
|
return ch
|
||||||
}
|
}
|
||||||
|
|
||||||
fullTaskID := fmt.Sprintf("%s-%s", resp.AdvisorID, resp.TaskID)
|
|
||||||
|
|
||||||
exeInfo, ok := m.advisors[resp.AdvisorID]
|
exeInfo, ok := m.advisors[resp.AdvisorID]
|
||||||
if !ok {
|
if !ok {
|
||||||
exeInfo = &AdvisorInfo{
|
exeInfo = &AdvisorInfo{
|
||||||
advisorID: resp.AdvisorID,
|
advisorID: resp.AdvisorID,
|
||||||
jobTasks: make(map[string]jobTask),
|
tasks: make(map[string]task),
|
||||||
lastReportTime: time.Now(),
|
lastReportTime: time.Now(),
|
||||||
}
|
}
|
||||||
m.advisors[resp.AdvisorID] = exeInfo
|
m.advisors[resp.AdvisorID] = exeInfo
|
||||||
}
|
}
|
||||||
|
|
||||||
exeInfo.jobTasks[resp.TaskID] = jobTask{
|
exeInfo.tasks[resp.TaskID] = task{
|
||||||
JobID: jobID,
|
statusChan: ch,
|
||||||
TaskID: resp.TaskID,
|
|
||||||
FullTaskID: fullTaskID,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return fullTaskID, nil
|
return ch
|
||||||
}
|
|
||||||
|
|
||||||
// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新
|
|
||||||
func (m *Manager) ForgetTask(fullTaskID string) {
|
|
||||||
m.lock.Lock()
|
|
||||||
defer m.lock.Unlock()
|
|
||||||
|
|
||||||
for _, exe := range m.advisors {
|
|
||||||
for _, tsk := range exe.jobTasks {
|
|
||||||
if tsk.FullTaskID == fullTaskID {
|
|
||||||
delete(exe.jobTasks, fullTaskID)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Serve() error {
|
func (m *Manager) Serve() error {
|
||||||
|
@ -150,8 +127,8 @@ func (m *Manager) Serve() error {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tsk := range exeInfo.jobTasks {
|
for _, tsk := range exeInfo.tasks {
|
||||||
m.onTaskTimeout(tsk.JobID, tsk.FullTaskID)
|
tsk.statusChan.CloseWithError(ErrWaitReportTimeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
delete(m.advisors, exeID)
|
delete(m.advisors, exeID)
|
||||||
|
|
|
@ -5,7 +5,7 @@ import (
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
"gitlink.org.cn/cloudream/common/utils/sync2"
|
||||||
|
|
||||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||||
|
@ -14,29 +14,22 @@ import (
|
||||||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||||||
)
|
)
|
||||||
|
|
||||||
type jobTask struct {
|
type task struct {
|
||||||
JobID schsdk.JobID
|
statusChan *sync2.Channel[exetsk.TaskStatus]
|
||||||
TaskID string
|
|
||||||
FullTaskID string
|
|
||||||
}
|
}
|
||||||
|
type ExecutorStatus struct {
|
||||||
type ExecutorInfo struct {
|
|
||||||
executorID schmod.ExecutorID
|
executorID schmod.ExecutorID
|
||||||
jobTasks map[string]jobTask // key 为 TaskID
|
tasks map[string]task // key 为 TaskID
|
||||||
lastReportTime time.Time
|
lastReportTime time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus exetsk.TaskStatus)
|
var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
|
||||||
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
|
|
||||||
|
|
||||||
type Manager struct {
|
type Manager struct {
|
||||||
executors map[schmod.ExecutorID]*ExecutorInfo
|
executors map[schmod.ExecutorID]*ExecutorStatus
|
||||||
lock sync.Mutex
|
lock sync.Mutex
|
||||||
exeCli *exemq.Client
|
exeCli *exemq.Client
|
||||||
|
|
||||||
onTaskUpdated OnTaskUpdatedCallbackFn
|
|
||||||
onTaskTimeout OnTimeoutCallbackFn
|
|
||||||
|
|
||||||
reportTimeout time.Duration
|
reportTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -47,89 +40,71 @@ func NewManager(reportTimeout time.Duration) (*Manager, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
return &Manager{
|
return &Manager{
|
||||||
executors: make(map[schmod.ExecutorID]*ExecutorInfo),
|
executors: make(map[schmod.ExecutorID]*ExecutorStatus),
|
||||||
exeCli: exeCli,
|
exeCli: exeCli,
|
||||||
reportTimeout: reportTimeout,
|
reportTimeout: reportTimeout,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
|
|
||||||
m.onTaskUpdated = callback
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
|
|
||||||
m.onTaskTimeout = callback
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) Report(execID schmod.ExecutorID, taskStatus []mgrmq.ExecutorTaskStatus) {
|
func (m *Manager) Report(execID schmod.ExecutorID, taskStatus []mgrmq.ExecutorTaskStatus) {
|
||||||
m.lock.Lock()
|
m.lock.Lock()
|
||||||
defer m.lock.Unlock()
|
defer m.lock.Unlock()
|
||||||
|
|
||||||
info, ok := m.executors[execID]
|
exec, ok := m.executors[execID]
|
||||||
if !ok {
|
if !ok {
|
||||||
info = &ExecutorInfo{
|
exec = &ExecutorStatus{
|
||||||
executorID: execID,
|
executorID: execID,
|
||||||
jobTasks: make(map[string]jobTask),
|
tasks: make(map[string]task),
|
||||||
}
|
}
|
||||||
m.executors[execID] = info
|
m.executors[execID] = exec
|
||||||
}
|
}
|
||||||
|
|
||||||
info.lastReportTime = time.Now()
|
exec.lastReportTime = time.Now()
|
||||||
|
|
||||||
for _, s := range taskStatus {
|
for _, s := range taskStatus {
|
||||||
tsk, ok := info.jobTasks[s.TaskID]
|
tsk, ok := exec.tasks[s.TaskID]
|
||||||
if !ok {
|
if !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status)
|
// TODO 考虑主动检测channel是否关闭,然后取消task
|
||||||
|
if tsk.statusChan.Send(s.Status) != nil {
|
||||||
|
delete(exec.tasks, s.TaskID)
|
||||||
|
|
||||||
|
if len(exec.tasks) == 0 {
|
||||||
|
delete(m.executors, execID)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 启动一个Task,并将其关联到指定的Job。返回一个在各Executor之间唯一的TaskID
|
// 启动一个Task
|
||||||
func (m *Manager) StartTask(jobID schsdk.JobID, info exetsk.TaskInfo) (string, error) {
|
func (m *Manager) StartTask(info exetsk.TaskInfo) *sync2.Channel[exetsk.TaskStatus] {
|
||||||
m.lock.Lock()
|
m.lock.Lock()
|
||||||
defer m.lock.Unlock()
|
defer m.lock.Unlock()
|
||||||
|
ch := sync2.NewChannel[exetsk.TaskStatus]()
|
||||||
|
|
||||||
resp, err := m.exeCli.StartTask(exemq.NewStartTask(info))
|
resp, err := m.exeCli.StartTask(exemq.NewStartTask(info))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
ch.CloseWithError(fmt.Errorf("start task: %w", err))
|
||||||
|
return ch
|
||||||
}
|
}
|
||||||
|
|
||||||
fullTaskID := fmt.Sprintf("%s-%s", resp.ExecutorID, resp.TaskID)
|
|
||||||
|
|
||||||
exeInfo, ok := m.executors[resp.ExecutorID]
|
exeInfo, ok := m.executors[resp.ExecutorID]
|
||||||
if !ok {
|
if !ok {
|
||||||
exeInfo = &ExecutorInfo{
|
exeInfo = &ExecutorStatus{
|
||||||
executorID: resp.ExecutorID,
|
executorID: resp.ExecutorID,
|
||||||
jobTasks: make(map[string]jobTask),
|
tasks: make(map[string]task),
|
||||||
lastReportTime: time.Now(),
|
lastReportTime: time.Now(),
|
||||||
}
|
}
|
||||||
m.executors[resp.ExecutorID] = exeInfo
|
m.executors[resp.ExecutorID] = exeInfo
|
||||||
}
|
}
|
||||||
|
|
||||||
exeInfo.jobTasks[resp.TaskID] = jobTask{
|
exeInfo.tasks[resp.TaskID] = task{
|
||||||
JobID: jobID,
|
statusChan: ch,
|
||||||
TaskID: resp.TaskID,
|
|
||||||
FullTaskID: fullTaskID,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return fullTaskID, nil
|
return ch
|
||||||
}
|
|
||||||
|
|
||||||
// 放弃对指定任务进度的等待。调用此函数不会停止任务执行,只是回调里不会再收到此任务的进度更新
|
|
||||||
func (m *Manager) ForgetTask(fullTaskID string) {
|
|
||||||
m.lock.Lock()
|
|
||||||
defer m.lock.Unlock()
|
|
||||||
|
|
||||||
for _, exe := range m.executors {
|
|
||||||
for _, tsk := range exe.jobTasks {
|
|
||||||
if tsk.FullTaskID == fullTaskID {
|
|
||||||
delete(exe.jobTasks, fullTaskID)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Serve() error {
|
func (m *Manager) Serve() error {
|
||||||
|
@ -151,8 +126,8 @@ func (m *Manager) Serve() error {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tsk := range exeInfo.jobTasks {
|
for _, tsk := range exeInfo.tasks {
|
||||||
m.onTaskTimeout(tsk.JobID, tsk.FullTaskID)
|
tsk.statusChan.CloseWithError(ErrWaitReportTimeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
delete(m.executors, exeID)
|
delete(m.executors, exeID)
|
||||||
|
|
|
@ -1,371 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"reflect"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
|
||||||
|
|
||||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
|
||||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
type adjustingJob struct {
|
|
||||||
job *jobmod.NormalJob
|
|
||||||
state *jobmod.StateAdjusting
|
|
||||||
ccInfo schmod.ComputingCenter
|
|
||||||
}
|
|
||||||
|
|
||||||
type AdjustingHandler struct {
|
|
||||||
mgr *Manager
|
|
||||||
|
|
||||||
jobs map[schsdk.JobID]*adjustingJob
|
|
||||||
|
|
||||||
cmdChan actor.CommandChannel
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewAdjustingHandler(mgr *Manager) *AdjustingHandler {
|
|
||||||
return &AdjustingHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
jobs: make(map[schsdk.JobID]*adjustingJob),
|
|
||||||
cmdChan: *actor.NewCommandChannel(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) Handle(job jobmod.Job) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
norJob, ok := job.(*jobmod.NormalJob)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
adjustingState, ok := norJob.GetState().(*jobmod.StateAdjusting)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
colCli, err := schglb.CollectorMQPool.Acquire()
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer schglb.CollectorMQPool.Release(colCli)
|
|
||||||
|
|
||||||
ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), adjustingState.Scheme.TargetCCID)
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new cloudream storage client: %s", err.Error()), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
|
||||||
|
|
||||||
stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{
|
|
||||||
StorageID: ccInfo.CDSStorageID,
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting cloudream storage info: %s", err.Error()), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
norJob.TargetCCID = adjustingState.Scheme.TargetCCID
|
|
||||||
// TODO UserID
|
|
||||||
norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, norJob.JobID)
|
|
||||||
|
|
||||||
adjJob := &adjustingJob{
|
|
||||||
job: norJob,
|
|
||||||
state: adjustingState,
|
|
||||||
ccInfo: ccInfo,
|
|
||||||
}
|
|
||||||
h.jobs[job.GetJobID()] = adjJob
|
|
||||||
|
|
||||||
h.onJobEvent(nil, adjJob)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) onJobEvent(evt event.Event, job *adjustingJob) {
|
|
||||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
|
||||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
err := h.doPackageScheduling(evt, job,
|
|
||||||
job.job.Info.Files.Dataset, &job.job.Files.Dataset,
|
|
||||||
&job.state.Scheme.Dataset, &job.state.Dataset,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
job.state.Dataset.Error = err.Error()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
err = h.doPackageScheduling(evt, job,
|
|
||||||
job.job.Info.Files.Code, &job.job.Files.Code,
|
|
||||||
&job.state.Scheme.Code, &job.state.Code,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
job.state.Code.Error = err.Error()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
err = h.doImageScheduling(evt, job,
|
|
||||||
job.job.Info.Files.Image, &job.job.Files.Image,
|
|
||||||
&job.state.Scheme.Image, &job.state.Image,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
job.state.Image.Error = err.Error()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// 如果三种文件都调度完成,则可以进入下个阶段了
|
|
||||||
if job.state.Dataset.Step == jobmod.StepCompleted &&
|
|
||||||
job.state.Code.Step == jobmod.StepCompleted &&
|
|
||||||
job.state.Image.Step == jobmod.StepCompleted {
|
|
||||||
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateReadyToExecute())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
|
||||||
job.SetState(state)
|
|
||||||
|
|
||||||
delete(h.jobs, job.GetJobID())
|
|
||||||
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) doPackageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
|
|
||||||
if state.Step == jobmod.StepBegin {
|
|
||||||
state.Step = jobmod.StepUploaded
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepUploaded {
|
|
||||||
if scheme.Action == jobmod.ActionNo {
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if scheme.Action == jobmod.ActionMove {
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting cache move package: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepMoving
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if scheme.Action == jobmod.ActionLoad {
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting stroage load package: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepLoading
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepMoving {
|
|
||||||
moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("cache move package timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if moveRet.Error != "" {
|
|
||||||
return fmt.Errorf("cache move pacakge: %s", moveRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepLoading {
|
|
||||||
loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("storage load package timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if loadRet.Error != "" {
|
|
||||||
return fmt.Errorf("storage load package: %s", loadRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
file.FullPath = loadRet.FullPath
|
|
||||||
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) doImageScheduling(evt event.Event, job *adjustingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
|
|
||||||
if state.Step == jobmod.StepBegin {
|
|
||||||
state.Step = jobmod.StepUploaded
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepUploaded {
|
|
||||||
if scheme.Action == jobmod.ActionNo {
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// 要导入镜像,则需要先将镜像移动到指点节点的缓存中
|
|
||||||
if scheme.Action == jobmod.ActionImportImage {
|
|
||||||
if file.PackageID == nil {
|
|
||||||
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID)
|
|
||||||
}
|
|
||||||
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting cache move package: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepMoving
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepMoving {
|
|
||||||
cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("cache move package timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if cacheMoveRet.Error != "" {
|
|
||||||
return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("new cloudream storage client: %w", err)
|
|
||||||
}
|
|
||||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
|
||||||
|
|
||||||
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("getting package objects: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(pkgObjs.Objects) != 1 {
|
|
||||||
return fmt.Errorf("there must be only 1 object in the package that will be imported")
|
|
||||||
}
|
|
||||||
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting import image: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepImageImporting
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepImageImporting {
|
|
||||||
uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("import image timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if uploadImageRet.Error != "" {
|
|
||||||
return fmt.Errorf("import image: %s", uploadImageRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
// 调整过程中不会更换镜像,所以ImageID不会发生变化
|
|
||||||
err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("creating pcm image info: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
if broadcast.ToAll() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if broadcast.ToJobSet() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
} else if broadcast.ToJob() {
|
|
||||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) Serve() {
|
|
||||||
cmdChan := h.cmdChan.BeginChanReceive()
|
|
||||||
defer h.cmdChan.CloseChanReceive()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case cmd := <-cmdChan:
|
|
||||||
cmd()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *AdjustingHandler) Stop() {
|
|
||||||
// TODO 支持STOP
|
|
||||||
}
|
|
|
@ -1,63 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"reflect"
|
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
type CompleteHandler struct {
|
|
||||||
mgr *Manager
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewCompleteHandler(mgr *Manager) *CompleteHandler {
|
|
||||||
return &CompleteHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *CompleteHandler) Handle(job jobmod.Job) {
|
|
||||||
// TODO 可以考虑将执行记录落库
|
|
||||||
if state, ok := job.GetState().(*jobmod.StateSuccess); ok {
|
|
||||||
h.handleSuccess(job, state)
|
|
||||||
} else if state, ok := job.GetState().(*jobmod.StateFailed); ok {
|
|
||||||
h.handleFailed(job, state)
|
|
||||||
} else {
|
|
||||||
state := jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState())
|
|
||||||
job.SetState(state)
|
|
||||||
h.handleFailed(job, state)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *CompleteHandler) handleSuccess(job jobmod.Job, state *jobmod.StateSuccess) {
|
|
||||||
logger.WithField("JobID", job.GetJobID()).Infof("job completed successfuly")
|
|
||||||
|
|
||||||
h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *CompleteHandler) handleFailed(job jobmod.Job, state *jobmod.StateFailed) {
|
|
||||||
logger.
|
|
||||||
WithField("JobID", job.GetJobID()).
|
|
||||||
WithField("LastState", reflect.TypeOf(state.LastState).String()).
|
|
||||||
Infof("job failed with: %v", state.Error)
|
|
||||||
|
|
||||||
h.mgr.onEvent(event.ToJobSet(job.GetJobSetID()), event.NewJobCompleted(job))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *CompleteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
|
||||||
cloneEvt.Callback.SetError(fmt.Errorf("job not found"))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *CompleteHandler) Serve() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *CompleteHandler) Stop() {
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,50 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
type DefaultHandler struct {
|
|
||||||
mgr *Manager
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewDefaultHandler(mgr *Manager) *DefaultHandler {
|
|
||||||
return &DefaultHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 处理Job。在此期间全局锁已锁定
|
|
||||||
func (h *DefaultHandler) Handle(job jobmod.Job) {
|
|
||||||
state := job.GetState()
|
|
||||||
if state == nil {
|
|
||||||
job.SetState(jobmod.NewStateFailed("unexpected nil state", nil))
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, ok := state.(*jobmod.StateFailed); ok {
|
|
||||||
logger.Warnf("state failed should not be handled by default handler")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
job.SetState(jobmod.NewStateFailed("no handler for this state", state))
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
}
|
|
||||||
|
|
||||||
// 外部发生了一个事件
|
|
||||||
func (h *DefaultHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// 运行Handler
|
|
||||||
func (h *DefaultHandler) Serve() {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// 停止此Handler
|
|
||||||
func (h *DefaultHandler) Stop() {
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,4 +1,4 @@
|
||||||
package event
|
package jobmgr
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"errors"
|
"errors"
|
||||||
|
@ -11,6 +11,8 @@ var ErrUnconcernedTask = errors.New("unconcerned task")
|
||||||
|
|
||||||
var ErrTaskTimeout = errors.New("task timeout")
|
var ErrTaskTimeout = errors.New("task timeout")
|
||||||
|
|
||||||
|
var ErrJobCancelled = errors.New("job cancelled")
|
||||||
|
|
||||||
type Event interface{}
|
type Event interface{}
|
||||||
|
|
||||||
type BroadcastType string
|
type BroadcastType string
|
|
@ -1,12 +0,0 @@
|
||||||
package event
|
|
||||||
|
|
||||||
// advisor的任务执行超时
|
|
||||||
type AdvisorTaskTimeout struct {
|
|
||||||
FullTaskID string
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewAdvisorTaskTimeout(fullTaskID string) *AdvisorTaskTimeout {
|
|
||||||
return &AdvisorTaskTimeout{
|
|
||||||
FullTaskID: fullTaskID,
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,46 +0,0 @@
|
||||||
package event
|
|
||||||
|
|
||||||
import advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
|
||||||
|
|
||||||
// advisor上报任务进度
|
|
||||||
type AdvisorTaskUpdated struct {
|
|
||||||
FullTaskID string
|
|
||||||
TaskStatus advtsk.TaskStatus
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewAdvisorTaskUpdated(fullTaskID string, taskStatus advtsk.TaskStatus) *AdvisorTaskUpdated {
|
|
||||||
return &AdvisorTaskUpdated{
|
|
||||||
FullTaskID: fullTaskID,
|
|
||||||
TaskStatus: taskStatus,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func AssertAdvisorTaskStatus[T advtsk.TaskStatus](evt Event, fullTaskID string) (T, error) {
|
|
||||||
var ret T
|
|
||||||
if evt == nil {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
if reportTaskStatus, ok := evt.(*AdvisorTaskUpdated); ok {
|
|
||||||
if reportTaskStatus.FullTaskID != fullTaskID {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
status, ok := reportTaskStatus.TaskStatus.(T)
|
|
||||||
if !ok {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
return status, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if taskTimeout, ok := evt.(*AdvisorTaskTimeout); ok {
|
|
||||||
if taskTimeout.FullTaskID != fullTaskID {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret, ErrTaskTimeout
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
package event
|
||||||
|
|
||||||
|
// Cancel is the event a job state waits for in order to learn that the job
// should be cancelled. It carries no payload.
type Cancel struct{}
|
|
@ -1,14 +0,0 @@
|
||||||
package event
|
|
||||||
|
|
||||||
import (
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/future"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
)
|
|
||||||
|
|
||||||
type CloneJob struct {
|
|
||||||
Callback future.SetValueFuture[jobmod.Job]
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewCloneJob() *CloneJob {
|
|
||||||
return &CloneJob{}
|
|
||||||
}
|
|
|
@ -1,12 +0,0 @@
|
||||||
package event
|
|
||||||
|
|
||||||
// executor的任务执行超时
|
|
||||||
type ExecutorTaskTimeout struct {
|
|
||||||
FullTaskID string
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewExecutorTaskTimeout(fullTaskID string) *ExecutorTaskTimeout {
|
|
||||||
return &ExecutorTaskTimeout{
|
|
||||||
FullTaskID: fullTaskID,
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,48 +0,0 @@
|
||||||
package event
|
|
||||||
|
|
||||||
import (
|
|
||||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
|
||||||
)
|
|
||||||
|
|
||||||
// executor上报任务进度
|
|
||||||
type ExecutorTaskUpdated struct {
|
|
||||||
FullTaskID string
|
|
||||||
TaskStatus exectsk.TaskStatus
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewExecutorTaskUpdated(fullTaskID string, taskStatus exectsk.TaskStatus) *ExecutorTaskUpdated {
|
|
||||||
return &ExecutorTaskUpdated{
|
|
||||||
FullTaskID: fullTaskID,
|
|
||||||
TaskStatus: taskStatus,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func AssertExecutorTaskStatus[T exectsk.TaskStatus](evt Event, fullTaskID string) (T, error) {
|
|
||||||
var ret T
|
|
||||||
if evt == nil {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
if reportTaskStatus, ok := evt.(*ExecutorTaskUpdated); ok {
|
|
||||||
if reportTaskStatus.FullTaskID != fullTaskID {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
status, ok := reportTaskStatus.TaskStatus.(T)
|
|
||||||
if !ok {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
return status, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if taskTimeout, ok := evt.(*ExecutorTaskTimeout); ok {
|
|
||||||
if taskTimeout.FullTaskID != fullTaskID {
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret, ErrTaskTimeout
|
|
||||||
}
|
|
||||||
|
|
||||||
return ret, ErrUnconcernedTask
|
|
||||||
}
|
|
|
@ -1,16 +1,18 @@
|
||||||
package event
|
package event
|
||||||
|
|
||||||
import (
|
import (
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
)
|
)
|
||||||
|
|
||||||
// 任务结束,包括成功或者失败
|
// 任务结束,包括成功或者失败
|
||||||
type JobCompleted struct {
|
type JobCompleted struct {
|
||||||
Job jobmod.Job
|
Job *jobmgr.Job
|
||||||
|
Err error
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewJobCompleted(job jobmod.Job) *JobCompleted {
|
func NewJobCompleted(job *jobmgr.Job, err error) *JobCompleted {
|
||||||
return &JobCompleted{
|
return &JobCompleted{
|
||||||
Job: job,
|
Job: job,
|
||||||
|
Err: err,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,21 +1,18 @@
|
||||||
package event
|
package event
|
||||||
|
|
||||||
import (
|
import (
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||||
)
|
)
|
||||||
|
|
||||||
// 本地文件上传结束
|
// 本地文件上传结束
|
||||||
type LocalFileUploaded struct {
|
type LocalFileUploaded struct {
|
||||||
JobSetID schsdk.JobSetID
|
|
||||||
LocalPath string
|
LocalPath string
|
||||||
Error string
|
Error error
|
||||||
PackageID cdssdk.PackageID
|
PackageID cdssdk.PackageID
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewLocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) *LocalFileUploaded {
|
func NewLocalFileUploaded(localPath string, err error, packageID cdssdk.PackageID) *LocalFileUploaded {
|
||||||
return &LocalFileUploaded{
|
return &LocalFileUploaded{
|
||||||
JobSetID: jobSetID,
|
|
||||||
LocalPath: localPath,
|
LocalPath: localPath,
|
||||||
Error: err,
|
Error: err,
|
||||||
PackageID: packageID,
|
PackageID: packageID,
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
package event
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
)
|
||||||
|
|
||||||
|
func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, bool) {
|
||||||
|
ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
|
||||||
|
_, ok := evt.(T)
|
||||||
|
return ok
|
||||||
|
})
|
||||||
|
return ret.(T), ok
|
||||||
|
}
|
||||||
|
|
||||||
|
func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond func(val T) bool) (T, bool) {
|
||||||
|
ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
|
||||||
|
e, ok := evt.(T)
|
||||||
|
if !ok {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
return cond(e)
|
||||||
|
})
|
||||||
|
return ret.(T), ok
|
||||||
|
}
|
|
@ -0,0 +1,72 @@
|
||||||
|
package jobmgr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"gitlink.org.cn/cloudream/common/pkgs/future"
|
||||||
|
"gitlink.org.cn/cloudream/common/utils/lo2"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EventWaitCondition reports whether an event is the one a waiter is
// interested in.
type EventWaitCondition func(Event) bool
|
||||||
|
|
||||||
|
type EventWaiter struct {
|
||||||
|
condition EventWaitCondition
|
||||||
|
future *future.SetValueFuture[Event]
|
||||||
|
}
|
||||||
|
|
||||||
|
type EventSet struct {
|
||||||
|
events []Event
|
||||||
|
waiters []EventWaiter
|
||||||
|
lock sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewEventSet() EventSet {
|
||||||
|
return EventSet{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *EventSet) Post(evt Event) {
|
||||||
|
s.lock.Lock()
|
||||||
|
defer s.lock.Unlock()
|
||||||
|
|
||||||
|
// 一个事件能唤醒多个等待者
|
||||||
|
used := false
|
||||||
|
for i, waiter := range s.waiters {
|
||||||
|
if waiter.condition(evt) {
|
||||||
|
s.waiters = lo2.RemoveAt(s.waiters, i)
|
||||||
|
waiter.future.SetValue(evt)
|
||||||
|
used = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !used {
|
||||||
|
s.events = append(s.events, evt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) {
|
||||||
|
s.lock.Lock()
|
||||||
|
defer s.lock.Unlock()
|
||||||
|
|
||||||
|
// 一个等待者只能等待一个事件
|
||||||
|
for i, evt := range s.events {
|
||||||
|
if cond(evt) {
|
||||||
|
s.events = lo2.RemoveAt(s.events, i)
|
||||||
|
return evt, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fut := future.NewSetValue[Event]()
|
||||||
|
waiter := EventWaiter{
|
||||||
|
condition: cond,
|
||||||
|
future: fut,
|
||||||
|
}
|
||||||
|
s.events = append(s.events, waiter)
|
||||||
|
|
||||||
|
val, err := fut.WaitValue(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
return val, true
|
||||||
|
}
|
|
@ -1,264 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"reflect"
|
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
|
||||||
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
|
|
||||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
type executingJob struct {
|
|
||||||
job jobmod.Job
|
|
||||||
state *jobmod.StateExecuting
|
|
||||||
}
|
|
||||||
|
|
||||||
type ExecutingHandler struct {
|
|
||||||
mgr *Manager
|
|
||||||
|
|
||||||
jobs map[schsdk.JobID]*executingJob
|
|
||||||
|
|
||||||
cmdChan actor.CommandChannel
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewExecutingHandler(mgr *Manager) *ExecutingHandler {
|
|
||||||
return &ExecutingHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
jobs: make(map[schsdk.JobID]*executingJob),
|
|
||||||
cmdChan: *actor.NewCommandChannel(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) Handle(job jobmod.Job) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
state, ok := job.GetState().(*jobmod.StateExecuting)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
rjob := &executingJob{
|
|
||||||
job: job,
|
|
||||||
state: state,
|
|
||||||
}
|
|
||||||
h.jobs[job.GetJobID()] = rjob
|
|
||||||
|
|
||||||
h.onJobEvent(nil, rjob)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) onJobEvent(evt event.Event, job *executingJob) {
|
|
||||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
|
||||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if norJob, ok := job.job.(*jobmod.NormalJob); ok {
|
|
||||||
h.onNormalJobEvent(evt, job, norJob)
|
|
||||||
} else if resJob, ok := job.job.(*jobmod.ResourceJob); ok {
|
|
||||||
h.onResourceJobEvent(evt, job, resJob)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) onNormalJobEvent(evt event.Event, job *executingJob, norJob *jobmod.NormalJob) {
|
|
||||||
if job.state.FullTaskID == "" {
|
|
||||||
pcmImgInfo, err := h.mgr.db.PCMImage().GetByImageIDAndCCID(h.mgr.db.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID)
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed("getting pcm image info: "+err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO 需要添加DATA_IN、DATA_OUT等环境变量,这些数据从Job的信息中来获取
|
|
||||||
ress, err := h.mgr.db.CCResource().GetByCCID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center resource info: %s", err.Error()), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if len(ress) == 0 {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("there is no resource at computing center %v", norJob.TargetCCID), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(),
|
|
||||||
exetsk.NewSubmitTask(
|
|
||||||
ccInfo.PCMParticipantID,
|
|
||||||
pcmImgInfo.PCMImageID,
|
|
||||||
// TODO 选择资源的算法
|
|
||||||
ress[0].PCMResourceID,
|
|
||||||
norJob.Info.Runtime.Command,
|
|
||||||
norJob.Info.Runtime.Envs,
|
|
||||||
))
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
job.state.FullTaskID = fullTaskID
|
|
||||||
}
|
|
||||||
|
|
||||||
if execRet, err := event.AssertExecutorTaskStatus[*exetsk.SubmitTaskStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed("schedule task timeout", job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.WithField("JobID", job.job.GetJobID()).
|
|
||||||
WithField("State", reflect.TypeOf(job.state).String()).
|
|
||||||
Infof("pcm task state change to: %s", execRet.Status)
|
|
||||||
|
|
||||||
if execRet.Status == pcmsdk.TaskStatusSuccess {
|
|
||||||
h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateSuccess())
|
|
||||||
|
|
||||||
} else if execRet.Status == pcmsdk.TaskStatuFailed {
|
|
||||||
h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(execRet.Error, job.state))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) onResourceJobEvent(evt event.Event, job *executingJob, resJob *jobmod.ResourceJob) {
|
|
||||||
if job.state.FullTaskID == "" {
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
jobSet, ok := h.mgr.jobSets[resJob.GetJobSetID()]
|
|
||||||
if !ok {
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", resJob.GetJobSetID()), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID)
|
|
||||||
if ref == nil {
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
|
||||||
fmt.Sprintf("job %s not found in job set %s",
|
|
||||||
resJob.Info.TargetLocalJobID,
|
|
||||||
resJob.GetJobSetID()),
|
|
||||||
job.state,
|
|
||||||
))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
targetJob, ok := h.mgr.jobs[ref.JobID]
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
tarNorJob, ok := targetJob.Job.(*jobmod.NormalJob)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job(%v) %s is not a Normal job", reflect.TypeOf(targetJob), ref.JobID), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
colCli, err := schglb.CollectorMQPool.Acquire()
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err.Error()), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer schglb.CollectorMQPool.Release(colCli)
|
|
||||||
|
|
||||||
ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), tarNorJob.TargetCCID)
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(), exetsk.NewStorageCreatePackage(
|
|
||||||
1, // TOOD 用户ID
|
|
||||||
ccInfo.CDSStorageID,
|
|
||||||
tarNorJob.OutputFullPath,
|
|
||||||
resJob.Info.BucketID,
|
|
||||||
utils.MakeResourcePackageName(resJob.JobID),
|
|
||||||
))
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
job.state.FullTaskID = fullTaskID
|
|
||||||
}
|
|
||||||
|
|
||||||
if createRet, err := event.AssertExecutorTaskStatus[*exetsk.StorageCreatePackageStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed("storage create package timeout", job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
|
|
||||||
|
|
||||||
if createRet.Error != "" {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(createRet.Error, job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
resJob.ResourcePackageID = createRet.PackageID
|
|
||||||
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateSuccess())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
|
||||||
job.SetState(state)
|
|
||||||
|
|
||||||
delete(h.jobs, job.GetJobID())
|
|
||||||
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
if broadcast.ToAll() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if broadcast.ToJobSet() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
} else if broadcast.ToJob() {
|
|
||||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) Serve() {
|
|
||||||
cmdChan := h.cmdChan.BeginChanReceive()
|
|
||||||
defer h.cmdChan.CloseChanReceive()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case cmd := <-cmdChan:
|
|
||||||
cmd()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ExecutingHandler) Stop() {
|
|
||||||
// TODO 支持STOP
|
|
||||||
}
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
package jobmgr
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/samber/lo"
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
// FileScheduleAction names the operation a file needs before a job can run.
type FileScheduleAction string

// File scheduling actions.
const (
	// ActionNo means no operation is needed.
	ActionNo FileScheduleAction = "No"
	// ActionMove means a cache has to be created on the specified node.
	ActionMove FileScheduleAction = "Move"
	// ActionLoad means the file has to be loaded into Storage.
	ActionLoad FileScheduleAction = "Load"
	// ActionImportImage means the image has to be imported.
	ActionImportImage FileScheduleAction = "ImportImage"
)
|
||||||
|
|
||||||
|
type FileScheduleScheme struct {
|
||||||
|
Action FileScheduleAction `json:"action"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// 任务调度方案
|
||||||
|
type JobScheduleScheme struct {
|
||||||
|
TargetCCID schsdk.CCID `json:"targetCCID"`
|
||||||
|
Dataset FileScheduleScheme `json:"dataset"`
|
||||||
|
Code FileScheduleScheme `json:"code"`
|
||||||
|
Image FileScheduleScheme `json:"image"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// 任务集的预调度方案
|
||||||
|
type JobSetPreScheduleScheme struct {
|
||||||
|
JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // 任务的预调度方案。Key为LocalJobID
|
||||||
|
}
|
||||||
|
|
||||||
|
// 任务集
|
||||||
|
type JobSet struct {
|
||||||
|
JobSetID schsdk.JobSetID `json:"jobSetID"` // 全局唯一的任务集ID
|
||||||
|
JobRefs []JobSetJobRef `json:"jobRefs"` // 任务集中包含的任务,只是一个引用
|
||||||
|
PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
|
||||||
|
}
|
||||||
|
type JobSetJobRef struct {
|
||||||
|
JobID schsdk.JobID `json:"jobID"` // 任务ID
|
||||||
|
LocalJobID string `json:"localJobID"` // 在当前任务集内的任务ID
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet {
|
||||||
|
return &JobSet{
|
||||||
|
JobSetID: jobSetID,
|
||||||
|
JobRefs: jobRefs,
|
||||||
|
PreScheduleScheme: preScheduleScheme,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef {
|
||||||
|
ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID })
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return &ref
|
||||||
|
}
|
||||||
|
|
||||||
|
// 任务
|
||||||
|
type Job struct {
|
||||||
|
JobSetID schsdk.JobSetID // 任务集ID
|
||||||
|
JobID schsdk.JobID // 全局唯一任务ID
|
||||||
|
Body JobBody // 具体任务
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *Job) GetInfo() schsdk.JobInfo {
|
||||||
|
return j.Body.GetInfo()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *Job) Dump(ctx JobStateRunContext, job *Job, curState JobState) jobmod.JobStatus {
|
||||||
|
return jobmod.JobStatus{
|
||||||
|
JobID: j.JobID,
|
||||||
|
JobSetID: j.JobSetID,
|
||||||
|
Info: j.GetInfo(),
|
||||||
|
Body: job.Body.Dump(),
|
||||||
|
State: curState.Dump(ctx, job),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type JobBody interface {
|
||||||
|
GetInfo() schsdk.JobInfo
|
||||||
|
Dump() jobmod.JobBodyStatus
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
package job
|
||||||
|
|
||||||
|
import (
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
|
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
type DataReturnJob struct {
|
||||||
|
Info schsdk.DataReturnJobInfo
|
||||||
|
TargetJobCCID schsdk.CCID // 目标任务所在计算中心的ID
|
||||||
|
TargetJobOutputFullPath string // 目标任务的结果输出全路径
|
||||||
|
DataReturnPackageID cdssdk.PackageID // 回源之后得到的PackageID
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewResourceJob(info schsdk.DataReturnJobInfo) *DataReturnJob {
|
||||||
|
return &DataReturnJob{
|
||||||
|
Info: info,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *DataReturnJob) GetInfo() schsdk.JobInfo {
|
||||||
|
return &j.Info
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *DataReturnJob) Dump() jobmod.JobBodyStatus {
|
||||||
|
return jobmod.DataReturnJobStatus{
|
||||||
|
DataReturnPackageID: j.DataReturnPackageID,
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
package job
|
||||||
|
|
||||||
|
import (
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
type NormalJob struct {
|
||||||
|
Info schsdk.NormalJobInfo // 提交任务时提供的任务描述信息
|
||||||
|
Files jobmod.JobFiles // 任务需要的文件
|
||||||
|
TargetCCID schsdk.CCID // 将要运行此任务的算力中心ID
|
||||||
|
OutputFullPath string // 程序结果的完整输出路径
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewNormalJob(info schsdk.NormalJobInfo) *NormalJob {
|
||||||
|
return &NormalJob{
|
||||||
|
Info: info,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *NormalJob) GetInfo() schsdk.JobInfo {
|
||||||
|
return &j.Info
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j *NormalJob) Dump() jobmod.JobBodyStatus {
|
||||||
|
return &jobmod.NormalJobStatus{
|
||||||
|
Files: j.Files,
|
||||||
|
TargetCCID: j.TargetCCID,
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,271 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
|
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||||
|
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||||
|
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Adjusting struct {
|
||||||
|
scheme jobmod.JobScheduleScheme
|
||||||
|
targetCCInfo schmod.ComputingCenter
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewAdjusting(scheme jobmod.JobScheduleScheme) *Adjusting {
|
||||||
|
return &Adjusting{
|
||||||
|
scheme: scheme,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Adjusting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
err := s.do(rtx, jo)
|
||||||
|
if err != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||||
|
} else {
|
||||||
|
rtx.Mgr.ChangeState(jo, NewNormalJobReadyToExecute())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||||
|
norJob := jo.Body.(*job.NormalJob)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// 监听取消事件
|
||||||
|
go func() {
|
||||||
|
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting computing center info: %w", err)
|
||||||
|
}
|
||||||
|
s.targetCCInfo = ccInfo
|
||||||
|
|
||||||
|
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("new cds client: %w", err)
|
||||||
|
}
|
||||||
|
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||||
|
|
||||||
|
// 已经确定最终执行的目标计算中心,则可以生成结果输出路径了
|
||||||
|
stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{
|
||||||
|
StorageID: ccInfo.CDSStorageID,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting cds storage info: %w", err)
|
||||||
|
}
|
||||||
|
// TODO UserID
|
||||||
|
norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID)
|
||||||
|
|
||||||
|
wg := sync.WaitGroup{}
|
||||||
|
wg.Add(3)
|
||||||
|
|
||||||
|
var e1, e2, e3 error
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset)
|
||||||
|
if e1 != nil {
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code)
|
||||||
|
if e2 != nil {
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image)
|
||||||
|
if e3 != nil {
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
return errors.Join(e1, e2, e3)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||||
|
switch info := fileInfo.(type) {
|
||||||
|
case *schsdk.LocalJobFileInfo:
|
||||||
|
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||||
|
return e.LocalPath == info.LocalPath
|
||||||
|
})
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||||
|
}
|
||||||
|
if evt.Error != nil {
|
||||||
|
return evt.Error
|
||||||
|
}
|
||||||
|
|
||||||
|
file.PackageID = evt.PackageID
|
||||||
|
|
||||||
|
case *schsdk.PackageJobFileInfo:
|
||||||
|
file.PackageID = info.PackageID
|
||||||
|
|
||||||
|
case *schsdk.ResourceJobFileInfo:
|
||||||
|
return nil
|
||||||
|
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("unknown dataset type: %T", info)
|
||||||
|
}
|
||||||
|
|
||||||
|
if scheme.Action == jobmod.ActionMove {
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("moving package: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||||
|
if moveStatus.Error != "" {
|
||||||
|
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if scheme.Action == jobmod.ActionLoad {
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("moving package: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||||
|
if moveStatus.Error != "" {
|
||||||
|
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||||
|
switch info := fileInfo.(type) {
|
||||||
|
case *schsdk.LocalJobFileInfo:
|
||||||
|
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||||
|
return e.LocalPath == info.LocalPath
|
||||||
|
})
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||||
|
}
|
||||||
|
if evt.Error != nil {
|
||||||
|
return evt.Error
|
||||||
|
}
|
||||||
|
|
||||||
|
// 上传完毕,则可以新建一个空的镜像的记录
|
||||||
|
// TODO 镜像名称
|
||||||
|
imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("creating image info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 填充ImageID和PackageID
|
||||||
|
file.ImageID = imgID
|
||||||
|
file.PackageID = &evt.PackageID
|
||||||
|
|
||||||
|
case *schsdk.ImageJobFileInfo:
|
||||||
|
imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting image info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
file.ImageID = imageInfo.ImageID
|
||||||
|
file.PackageID = imageInfo.CDSPackageID
|
||||||
|
}
|
||||||
|
|
||||||
|
if scheme.Action == jobmod.ActionImportImage {
|
||||||
|
if file.PackageID == nil {
|
||||||
|
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO UserID
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("moving package: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||||
|
if moveStatus.Error != "" {
|
||||||
|
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("new cloudream storage client: %w", err)
|
||||||
|
}
|
||||||
|
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||||
|
|
||||||
|
// TODO UserID
|
||||||
|
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting package objects: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(pkgObjs.Objects) == 0 {
|
||||||
|
return fmt.Errorf("no object in the package which will be imported")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(pkgObjs.Objects) > 1 {
|
||||||
|
return fmt.Errorf("there must be only 1 object in the package which will be imported")
|
||||||
|
}
|
||||||
|
|
||||||
|
wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
||||||
|
defer wt2.Close()
|
||||||
|
|
||||||
|
status2, err := wt2.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("uploading image: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
uploadStatus := status2.(*exectsk.UploadImageStatus)
|
||||||
|
if uploadStatus.Error != "" {
|
||||||
|
return fmt.Errorf("uploading image: %s", uploadStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO 镜像名称
|
||||||
|
err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, job.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("creating image info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,48 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Completed is the job state entered once a job has finished.
// err is nil for a successful job and holds the failure cause otherwise.
type Completed struct {
	err error
}
|
||||||
|
|
||||||
|
func SuccessComplete() *Completed {
|
||||||
|
return &Completed{}
|
||||||
|
}
|
||||||
|
func FailureComplete(err error) *Completed {
|
||||||
|
return &Completed{err: err}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Completed) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
// TODO 可以考虑将执行记录落库
|
||||||
|
if c.err == nil {
|
||||||
|
c.handleSuccess(rtx, jo)
|
||||||
|
} else {
|
||||||
|
c.handleFailed(rtx, jo)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Completed) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Completed) handleSuccess(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
|
||||||
|
logger.WithField("JobID", job.JobID).Infof("job completed successfuly")
|
||||||
|
rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Completed) handleFailed(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
|
||||||
|
logger.
|
||||||
|
WithField("JobID", job.JobID).
|
||||||
|
WithField("LastState", reflect.TypeOf(rtx.LastState).String()).
|
||||||
|
Infof("job failed with: %v", c.err)
|
||||||
|
rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
|
||||||
|
}
|
|
@ -0,0 +1,154 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||||
|
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
type NormalJobExecuting struct {
|
||||||
|
lastStatus pcmsdk.TaskStatus
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewNormalJobExecuting() *NormalJobExecuting {
|
||||||
|
return &NormalJobExecuting{
|
||||||
|
lastStatus: "Begin",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *NormalJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
err := s.do(rtx, jo)
|
||||||
|
if err != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||||
|
} else {
|
||||||
|
rtx.Mgr.ChangeState(jo, SuccessComplete())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *NormalJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||||
|
norJob := jo.Body.(*job.NormalJob)
|
||||||
|
|
||||||
|
log := logger.WithType[NormalJobExecuting]("State").WithField("JobID", jo.JobID)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting pcm image info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting computing center info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO 需要添加DATA_IN、DATA_OUT等环境变量,这些数据从Job的信息中来获取
|
||||||
|
ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting computing center resource: %w", err)
|
||||||
|
}
|
||||||
|
if len(ress) == 0 {
|
||||||
|
return fmt.Errorf("no resource found at computing center %v", norJob.TargetCCID)
|
||||||
|
}
|
||||||
|
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask(
|
||||||
|
ccInfo.PCMParticipantID,
|
||||||
|
pcmImgInfo.PCMImageID,
|
||||||
|
// TODO 选择资源的算法
|
||||||
|
ress[0].PCMResourceID,
|
||||||
|
norJob.Info.Runtime.Command,
|
||||||
|
norJob.Info.Runtime.Envs,
|
||||||
|
))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
for {
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tskStatus := status.(*exetsk.SubmitTaskStatus)
|
||||||
|
if tskStatus.Error != "" {
|
||||||
|
return fmt.Errorf("submitting task: %s", tskStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
if tskStatus.Status != s.lastStatus {
|
||||||
|
log.Infof("task %s -> %s", s.lastStatus, tskStatus.Status)
|
||||||
|
}
|
||||||
|
s.lastStatus = tskStatus.Status
|
||||||
|
|
||||||
|
switch tskStatus.Status {
|
||||||
|
case pcmsdk.TaskStatusSuccess:
|
||||||
|
return nil
|
||||||
|
|
||||||
|
case pcmsdk.TaskStatusFailed:
|
||||||
|
return fmt.Errorf("task failed")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DataReturnJobExecuting is the state in which a data-return job packages the
// output of its target job.
type DataReturnJobExecuting struct{}
|
||||||
|
|
||||||
|
func NewDataReturnJobExecuting() *DataReturnJobExecuting {
|
||||||
|
return &DataReturnJobExecuting{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
err := s.do(rtx, jo)
|
||||||
|
if err != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||||
|
} else {
|
||||||
|
rtx.Mgr.ChangeState(jo, SuccessComplete())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||||
|
reJob := jo.Body.(*job.DataReturnJob)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), reJob.TargetJobCCID)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting computing center info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage(
|
||||||
|
1, // TOOD 用户ID
|
||||||
|
ccInfo.CDSStorageID,
|
||||||
|
reJob.TargetJobOutputFullPath,
|
||||||
|
reJob.Info.BucketID,
|
||||||
|
utils.MakeResourcePackageName(jo.JobID),
|
||||||
|
))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tskStatus := status.(*exetsk.StorageCreatePackageStatus)
|
||||||
|
if tskStatus.Error != "" {
|
||||||
|
return fmt.Errorf("creating package: %s", tskStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
reJob.DataReturnPackageID = tskStatus.PackageID
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,61 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MakingAdjustScheme is the state in which an adjust scheme is requested for
// a normal job.
type MakingAdjustScheme struct{}
|
||||||
|
|
||||||
|
func NewMakeingAdjustScheme() *MakingAdjustScheme {
|
||||||
|
return &MakingAdjustScheme{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *MakingAdjustScheme) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
scheme, err := s.do(rtx, jo.Body.(*job.NormalJob))
|
||||||
|
if err != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||||
|
} else {
|
||||||
|
rtx.Mgr.ChangeState(jo, NewAdjusting(*scheme))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *MakingAdjustScheme) do(rtx jobmgr.JobStateRunContext, norJob *job.NormalJob) (*jobmod.JobScheduleScheme, error) {
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
wt := rtx.Mgr.AdvMgr.StartTask(advtsk.NewMakeAdjustScheme(norJob.Info, jobmod.NormalJobStatus{
|
||||||
|
TargetCCID: norJob.TargetCCID,
|
||||||
|
Files: norJob.Files,
|
||||||
|
}))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("making adjust scheme: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
mkStatus := status.(*advtsk.MakeAdjustSchemeStatus)
|
||||||
|
if mkStatus.Error != "" {
|
||||||
|
return nil, fmt.Errorf("making adjust scheme: %s", mkStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &mkStatus.Scheme, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *MakingAdjustScheme) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,251 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
|
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||||
|
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||||
|
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
type PreScheduling struct {
|
||||||
|
scheme jobmod.JobScheduleScheme
|
||||||
|
targetCCInfo schmod.ComputingCenter
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling {
|
||||||
|
return &PreScheduling{
|
||||||
|
scheme: scheme,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
norJob := jo.Body.(*job.NormalJob)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// 监听取消事件
|
||||||
|
go func() {
|
||||||
|
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
|
||||||
|
if err != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
s.targetCCInfo = ccInfo
|
||||||
|
|
||||||
|
wg := sync.WaitGroup{}
|
||||||
|
wg.Add(3)
|
||||||
|
|
||||||
|
var e1, e2, e3 error
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset)
|
||||||
|
if e1 != nil {
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code)
|
||||||
|
if e2 != nil {
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image)
|
||||||
|
if e3 != nil {
|
||||||
|
cancel()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
allErr := errors.Join(e1, e2, e3)
|
||||||
|
if allErr != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||||
|
} else {
|
||||||
|
rtx.Mgr.ChangeState(jo, NewReadyToAdjust())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||||
|
switch info := fileInfo.(type) {
|
||||||
|
case *schsdk.LocalJobFileInfo:
|
||||||
|
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||||
|
return e.LocalPath == info.LocalPath
|
||||||
|
})
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||||
|
}
|
||||||
|
if evt.Error != nil {
|
||||||
|
return evt.Error
|
||||||
|
}
|
||||||
|
|
||||||
|
file.PackageID = evt.PackageID
|
||||||
|
|
||||||
|
case *schsdk.PackageJobFileInfo:
|
||||||
|
file.PackageID = info.PackageID
|
||||||
|
|
||||||
|
case *schsdk.ResourceJobFileInfo:
|
||||||
|
return nil
|
||||||
|
|
||||||
|
default:
|
||||||
|
return fmt.Errorf("unknown dataset type: %T", info)
|
||||||
|
}
|
||||||
|
|
||||||
|
if scheme.Action == jobmod.ActionMove {
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("moving package: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||||
|
if moveStatus.Error != "" {
|
||||||
|
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if scheme.Action == jobmod.ActionLoad {
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("moving package: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||||
|
if moveStatus.Error != "" {
|
||||||
|
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
|
||||||
|
switch info := fileInfo.(type) {
|
||||||
|
case *schsdk.LocalJobFileInfo:
|
||||||
|
evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool {
|
||||||
|
return e.LocalPath == info.LocalPath
|
||||||
|
})
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("local file %s not uploaded", info.LocalPath)
|
||||||
|
}
|
||||||
|
if evt.Error != nil {
|
||||||
|
return evt.Error
|
||||||
|
}
|
||||||
|
|
||||||
|
// 上传完毕,则可以新建一个空的镜像的记录
|
||||||
|
// TODO 镜像名称
|
||||||
|
imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("creating image info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 填充ImageID和PackageID
|
||||||
|
file.ImageID = imgID
|
||||||
|
file.PackageID = &evt.PackageID
|
||||||
|
|
||||||
|
case *schsdk.ImageJobFileInfo:
|
||||||
|
imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting image info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
file.ImageID = imageInfo.ImageID
|
||||||
|
file.PackageID = imageInfo.CDSPackageID
|
||||||
|
}
|
||||||
|
|
||||||
|
if scheme.Action == jobmod.ActionImportImage {
|
||||||
|
if file.PackageID == nil {
|
||||||
|
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO UserID
|
||||||
|
wt := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSNodeID))
|
||||||
|
defer wt.Close()
|
||||||
|
|
||||||
|
status, err := wt.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("moving package: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
moveStatus := status.(*exectsk.CacheMovePackageStatus)
|
||||||
|
if moveStatus.Error != "" {
|
||||||
|
return fmt.Errorf("moving package: %s", moveStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("new cloudream storage client: %w", err)
|
||||||
|
}
|
||||||
|
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||||
|
|
||||||
|
// TODO UserID
|
||||||
|
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("getting package objects: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(pkgObjs.Objects) == 0 {
|
||||||
|
return fmt.Errorf("no object in the package which will be imported")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(pkgObjs.Objects) > 1 {
|
||||||
|
return fmt.Errorf("there must be only 1 object in the package which will be imported")
|
||||||
|
}
|
||||||
|
|
||||||
|
wt2 := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
||||||
|
defer wt2.Close()
|
||||||
|
|
||||||
|
status2, err := wt2.Receive(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("uploading image: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
uploadStatus := status2.(*exectsk.UploadImageStatus)
|
||||||
|
if uploadStatus.Error != "" {
|
||||||
|
return fmt.Errorf("uploading image: %s", uploadStatus.Error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO 镜像名称
|
||||||
|
err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, norJob.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("creating image info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,65 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ReadyToAdjust is the state a normal job is in before an adjust scheme is
// made for it.
type ReadyToAdjust struct{}
|
||||||
|
|
||||||
|
func NewReadyToAdjust() *ReadyToAdjust {
|
||||||
|
return &ReadyToAdjust{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *ReadyToAdjust) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
err := s.do(rtx, jo)
|
||||||
|
if err != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||||
|
} else {
|
||||||
|
rtx.Mgr.ChangeState(jo, NewMakeingAdjustScheme())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||||
|
norJob := jo.Body.(*job.NormalJob)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
if rt, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok {
|
||||||
|
evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool {
|
||||||
|
return val.Job.GetInfo().GetLocalJobID() == rt.ResourceLocalJobID
|
||||||
|
})
|
||||||
|
if !ok {
|
||||||
|
return jobmgr.ErrJobCancelled
|
||||||
|
}
|
||||||
|
if evt.Err != nil {
|
||||||
|
return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
|
||||||
|
}
|
||||||
|
rtJob, ok := evt.Job.Body.(*job.DataReturnJob)
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("job %s is not a DataReturn job(which is %T)", evt.Job.JobID, evt.Job)
|
||||||
|
}
|
||||||
|
|
||||||
|
norJob.Files.Dataset.PackageID = rtJob.DataReturnPackageID
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *ReadyToAdjust) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NormalJobReadyToExecute is the state of a normal job that is about to be
// executed.
type NormalJobReadyToExecute struct{}
|
||||||
|
|
||||||
|
func NewNormalJobReadyToExecute() *NormalJobReadyToExecute {
|
||||||
|
return &NormalJobReadyToExecute{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *NormalJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
// TODO 目前直接启动执行
|
||||||
|
rtx.Mgr.ChangeState(jo, NewNormalJobExecuting())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *NormalJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// DataReturnJobReadyToExecute is the state of a data-return job that is about
// to be executed.
type DataReturnJobReadyToExecute struct{}
|
||||||
|
|
||||||
|
func NewDataReturnJobReadyToExecute() *DataReturnJobReadyToExecute {
|
||||||
|
return &DataReturnJobReadyToExecute{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DataReturnJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
// TODO 目前直接启动执行
|
||||||
|
rtx.Mgr.ChangeState(jo, NewDataReturnJobExecuting())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DataReturnJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,62 @@
|
||||||
|
package state
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||||
|
)
|
||||||
|
|
||||||
|
// WaitTargetComplete is the state in which a data-return job waits for its
// target job to complete.
type WaitTargetComplete struct{}
|
||||||
|
|
||||||
|
func NewWaitTargetComplete() *WaitTargetComplete {
|
||||||
|
return &WaitTargetComplete{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *WaitTargetComplete) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||||
|
err := s.do(rtx, jo)
|
||||||
|
if err != nil {
|
||||||
|
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||||
|
} else {
|
||||||
|
rtx.Mgr.ChangeState(jo, NewDataReturnJobReadyToExecute())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *WaitTargetComplete) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||||
|
reJob := jo.Body.(*job.DataReturnJob)
|
||||||
|
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
event.WaitType[event.Cancel](ctx, rtx.EventSet)
|
||||||
|
cancel()
|
||||||
|
}()
|
||||||
|
|
||||||
|
evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool {
|
||||||
|
return val.Job.GetInfo().GetLocalJobID() == reJob.Info.TargetLocalJobID
|
||||||
|
})
|
||||||
|
if !ok {
|
||||||
|
return jobmgr.ErrJobCancelled
|
||||||
|
}
|
||||||
|
if evt.Err != nil {
|
||||||
|
return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
|
||||||
|
}
|
||||||
|
norJob, ok := evt.Job.Body.(*job.NormalJob)
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("job %s is not a Normal job(which is %T)", evt.Job.JobID, evt.Job)
|
||||||
|
}
|
||||||
|
|
||||||
|
reJob.TargetJobCCID = norJob.TargetCCID
|
||||||
|
reJob.TargetJobOutputFullPath = norJob.OutputFullPath
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *WaitTargetComplete) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateStatus {
|
||||||
|
// TODO
|
||||||
|
return nil
|
||||||
|
}
|
|
@ -0,0 +1,14 @@
|
||||||
|
package jobmgr
|
||||||
|
|
||||||
|
import jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
|
|
||||||
|
type JobStateRunContext struct {
|
||||||
|
Mgr *Manager
|
||||||
|
EventSet *EventSet
|
||||||
|
LastState JobState
|
||||||
|
}
|
||||||
|
|
||||||
|
type JobState interface {
|
||||||
|
Run(ctx JobStateRunContext, job *Job)
|
||||||
|
Dump(ctx JobStateRunContext, job *Job) jobmod.JobStateStatus
|
||||||
|
}
|
|
@ -1,283 +1,169 @@
|
||||||
package jobmgr
|
package jobmgr
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"reflect"
|
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
|
||||||
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||||
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
|
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
|
||||||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
|
||||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr"
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr"
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr"
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr"
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type mgrJob struct {
|
type mgrJob struct {
|
||||||
Job jobmod.Job
|
job Job
|
||||||
Handler StateHandler
|
eventSet EventSet
|
||||||
|
state JobState
|
||||||
|
}
|
||||||
|
|
||||||
|
type mgrJobSet struct {
|
||||||
|
jobs map[schsdk.JobID]*mgrJob
|
||||||
}
|
}
|
||||||
|
|
||||||
type Manager struct {
|
type Manager struct {
|
||||||
// 任何修改job、jobset的操作,都需要加这个锁
|
// 任何修改job、jobset的操作,都需要加这个锁
|
||||||
pubLock sync.Mutex
|
pubLock sync.Mutex
|
||||||
|
|
||||||
execMgr *executormgr.Manager
|
ExecMgr *executormgr.Manager
|
||||||
advMgr *advisormgr.Manager
|
AdvMgr *advisormgr.Manager
|
||||||
db *db.DB
|
DB *db.DB
|
||||||
|
|
||||||
handlers map[reflect.Type]StateHandler
|
|
||||||
defaultHandler StateHandler
|
|
||||||
|
|
||||||
jobSetIDIndex int
|
jobSetIDIndex int
|
||||||
jobSets map[schsdk.JobSetID]*jobmod.JobSet
|
jobSets map[schsdk.JobSetID]*mgrJobSet
|
||||||
jobIDIndex int
|
jobIDIndex int
|
||||||
jobs map[schsdk.JobID]*mgrJob
|
jobs map[schsdk.JobID]*mgrJob
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) {
|
func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) {
|
||||||
mgr := &Manager{
|
mgr := &Manager{
|
||||||
execMgr: execMgr,
|
ExecMgr: execMgr,
|
||||||
advMgr: advMgr,
|
AdvMgr: advMgr,
|
||||||
db: db,
|
DB: db,
|
||||||
|
jobSets: make(map[schsdk.JobSetID]*mgrJobSet),
|
||||||
handlers: make(map[reflect.Type]StateHandler),
|
jobs: make(map[schsdk.JobID]*mgrJob),
|
||||||
jobSets: make(map[schsdk.JobSetID]*jobmod.JobSet),
|
|
||||||
jobs: make(map[schsdk.JobID]*mgrJob),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
execMgr.OnTaskUpdated(mgr.executorTaskUpdated)
|
|
||||||
execMgr.OnTaskTimeout(mgr.executorTaskTimeout)
|
|
||||||
|
|
||||||
advMgr.OnTaskUpdated(mgr.advisorTaskUpdated)
|
|
||||||
advMgr.OnTaskTimeout(mgr.advisorTaskTimeout)
|
|
||||||
|
|
||||||
// TODO 考虑优化这部分逻辑
|
|
||||||
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StatePreScheduling]()] = NewPreSchedulingHandler(mgr)
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToAdjust]()] = NewReadyToAdjustHandler(mgr)
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateMakingAdjustScheme]()] = NewMakingAdjustSchemeHandler(mgr)
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateAdjusting]()] = NewAdjustingHandler(mgr)
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToExecute]()] = NewReadyToExecuteHandler(mgr)
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateExecuting]()] = NewExecutingHandler(mgr)
|
|
||||||
|
|
||||||
compHder := NewCompleteHandler(mgr)
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateFailed]()] = compHder
|
|
||||||
mgr.handlers[myreflect.TypeOf[*jobmod.StateSuccess]()] = compHder
|
|
||||||
|
|
||||||
mgr.defaultHandler = NewDefaultHandler(mgr)
|
|
||||||
|
|
||||||
return mgr, nil
|
return mgr, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Serve() error {
|
func (m *Manager) Serve() error {
|
||||||
for _, h := range m.handlers {
|
|
||||||
go h.Serve()
|
|
||||||
}
|
|
||||||
|
|
||||||
go m.defaultHandler.Serve()
|
|
||||||
|
|
||||||
ticker := time.NewTicker(time.Minute)
|
ticker := time.NewTicker(time.Minute)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ticker.C:
|
|
||||||
// 每一分钟产生一个空事件,防止无限等待
|
|
||||||
m.pubLock.Lock()
|
|
||||||
m.onEvent(event.ToAll(), nil)
|
|
||||||
m.pubLock.Unlock()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) Stop() {
|
func (m *Manager) Stop() {
|
||||||
for _, h := range m.handlers {
|
|
||||||
h.Stop()
|
|
||||||
}
|
|
||||||
|
|
||||||
m.defaultHandler.Stop()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) SubmitJobSet(jobSetInfo schsdk.JobSetInfo, preScheduleScheme jobmod.JobSetPreScheduleScheme) (*jobmod.JobSet, error) {
|
func (m *Manager) ChangeState(job *Job, state JobState) {
|
||||||
|
m.pubLock.Lock()
|
||||||
|
defer m.pubLock.Unlock()
|
||||||
|
|
||||||
|
mgrJob, ok := m.jobs[job.JobID]
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
lastState := mgrJob.state
|
||||||
|
mgrJob.state = state
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
state.Run(JobStateRunContext{
|
||||||
|
Mgr: m,
|
||||||
|
EventSet: &mgrJob.eventSet,
|
||||||
|
LastState: lastState,
|
||||||
|
}, job)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) {
|
||||||
|
m.pubLock.Lock()
|
||||||
|
defer m.pubLock.Unlock()
|
||||||
|
|
||||||
|
mgrJob, ok := m.jobs[jobID]
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
mgrJob.eventSet.Post(evt)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) {
|
||||||
|
m.pubLock.Lock()
|
||||||
|
defer m.pubLock.Unlock()
|
||||||
|
|
||||||
|
jobSet, ok := m.jobSets[jobSetID]
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, mgrJob := range jobSet.jobs {
|
||||||
|
go func() {
|
||||||
|
mgrJob.eventSet.Post(evt)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type SubmittingJob struct {
|
||||||
|
Body JobBody
|
||||||
|
InitState JobState
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID {
|
||||||
m.pubLock.Lock()
|
m.pubLock.Lock()
|
||||||
defer m.pubLock.Unlock()
|
defer m.pubLock.Unlock()
|
||||||
|
|
||||||
jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))
|
jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))
|
||||||
|
|
||||||
var jobs []jobmod.Job
|
|
||||||
var normalJobs []*jobmod.NormalJob
|
|
||||||
var resJobs []*jobmod.ResourceJob
|
|
||||||
var jobRefs []jobmod.JobSetJobRef
|
|
||||||
for i, jobInfo := range jobSetInfo.Jobs {
|
|
||||||
jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
|
|
||||||
|
|
||||||
switch info := jobInfo.(type) {
|
|
||||||
case *schsdk.NormalJobInfo:
|
|
||||||
job := jobmod.NewNormalJob(jobSetID, jobID, *info)
|
|
||||||
jobs = append(jobs, job)
|
|
||||||
normalJobs = append(normalJobs, job)
|
|
||||||
jobRefs = append(jobRefs, jobmod.JobSetJobRef{
|
|
||||||
LocalJobID: info.LocalJobID,
|
|
||||||
JobID: jobID,
|
|
||||||
})
|
|
||||||
|
|
||||||
preSch, ok := preScheduleScheme.JobSchemes[info.LocalJobID]
|
|
||||||
if !ok {
|
|
||||||
return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
|
|
||||||
}
|
|
||||||
|
|
||||||
job.State = jobmod.NewStatePreScheduling(preSch)
|
|
||||||
job.TargetCCID = preSch.TargetCCID
|
|
||||||
|
|
||||||
case *schsdk.ResourceJobInfo:
|
|
||||||
job := jobmod.NewResourceJob(jobSetID, jobID, *info)
|
|
||||||
jobs = append(jobs, job)
|
|
||||||
resJobs = append(resJobs, job)
|
|
||||||
jobRefs = append(jobRefs, jobmod.JobSetJobRef{
|
|
||||||
LocalJobID: info.LocalJobID,
|
|
||||||
JobID: jobID,
|
|
||||||
})
|
|
||||||
|
|
||||||
// 回源任务不需要预调度,所以直接是进入待调整状态
|
|
||||||
job.State = jobmod.NewStateReadyToAdjust()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO 可以考虑检查一下有依赖的任务的信息所描述依赖的LocalJobID是不是有效的
|
|
||||||
|
|
||||||
jobSet := jobmod.NewJobSet(jobSetID, jobRefs, preScheduleScheme)
|
|
||||||
m.jobSets[jobSetID] = jobSet
|
|
||||||
for _, job := range jobs {
|
|
||||||
m.jobs[job.GetJobID()] = &mgrJob{
|
|
||||||
Job: job,
|
|
||||||
}
|
|
||||||
|
|
||||||
m.handleState(job)
|
|
||||||
}
|
|
||||||
|
|
||||||
m.jobSetIDIndex += 1
|
m.jobSetIDIndex += 1
|
||||||
m.jobIDIndex += len(jobSetInfo.Jobs)
|
|
||||||
|
|
||||||
return jobSet, nil
|
jobSet := &mgrJobSet{
|
||||||
|
jobs: make(map[schsdk.JobID]*mgrJob),
|
||||||
|
}
|
||||||
|
m.jobSets[jobSetID] = jobSet
|
||||||
|
|
||||||
|
for i, subJob := range jobs {
|
||||||
|
jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
|
||||||
|
job := &mgrJob{
|
||||||
|
job: Job{
|
||||||
|
JobSetID: jobSetID,
|
||||||
|
JobID: jobID,
|
||||||
|
Body: subJob.Body,
|
||||||
|
},
|
||||||
|
eventSet: NewEventSet(),
|
||||||
|
}
|
||||||
|
jobSet.jobs[jobID] = job
|
||||||
|
|
||||||
|
m.ChangeState(&job.job, subJob.InitState)
|
||||||
|
}
|
||||||
|
m.jobIDIndex += len(jobs)
|
||||||
|
|
||||||
|
return jobSetID
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *Manager) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) error {
|
func (m *Manager) DumpJobSet(jobSetID schsdk.JobSetID) []jobmod.JobStatus {
|
||||||
m.pubLock.Lock()
|
m.pubLock.Lock()
|
||||||
defer m.pubLock.Unlock()
|
defer m.pubLock.Unlock()
|
||||||
|
|
||||||
for _, h := range m.handlers {
|
jobSet, ok := m.jobSets[jobSetID]
|
||||||
h.OnEvent(event.ToJobSet(jobSetID), event.NewLocalFileUploaded(jobSetID, localPath, err, packageID))
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) executorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus exectsk.TaskStatus) {
|
|
||||||
m.pubLock.Lock()
|
|
||||||
defer m.pubLock.Unlock()
|
|
||||||
|
|
||||||
job, ok := m.jobs[jobID]
|
|
||||||
if !ok {
|
if !ok {
|
||||||
return
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskUpdated(fullTaskID, taskStatus))
|
var jobStatuses []jobmod.JobStatus
|
||||||
}
|
for _, mgrJob := range jobSet.jobs {
|
||||||
|
jobStatuses = append(jobStatuses, mgrJob.job.Dump(JobStateRunContext{
|
||||||
func (m *Manager) executorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
|
Mgr: m,
|
||||||
m.pubLock.Lock()
|
EventSet: &mgrJob.eventSet,
|
||||||
defer m.pubLock.Unlock()
|
LastState: mgrJob.state,
|
||||||
|
}, &mgrJob.job, mgrJob.state))
|
||||||
job, ok := m.jobs[jobID]
|
}
|
||||||
if !ok {
|
|
||||||
return
|
return jobStatuses
|
||||||
}
|
|
||||||
|
|
||||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskTimeout(fullTaskID))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) advisorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) {
|
|
||||||
m.pubLock.Lock()
|
|
||||||
defer m.pubLock.Unlock()
|
|
||||||
|
|
||||||
job, ok := m.jobs[jobID]
|
|
||||||
if !ok {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskUpdated(fullTaskID, taskStatus))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) advisorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
|
|
||||||
m.pubLock.Lock()
|
|
||||||
defer m.pubLock.Unlock()
|
|
||||||
|
|
||||||
job, ok := m.jobs[jobID]
|
|
||||||
if !ok {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskTimeout(fullTaskID))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) CloneJob(jobID schsdk.JobID) (jobmod.Job, error) {
|
|
||||||
m.pubLock.Lock()
|
|
||||||
|
|
||||||
job, ok := m.jobs[jobID]
|
|
||||||
if !ok {
|
|
||||||
m.pubLock.Unlock()
|
|
||||||
return nil, fmt.Errorf("job not found")
|
|
||||||
}
|
|
||||||
|
|
||||||
evt := event.NewCloneJob()
|
|
||||||
job.Handler.OnEvent(event.ToJob(jobID), evt)
|
|
||||||
m.pubLock.Unlock()
|
|
||||||
|
|
||||||
return evt.Callback.WaitValue(context.Background())
|
|
||||||
}
|
|
||||||
|
|
||||||
// 根据job状态选择handler进行处理。需要加锁
|
|
||||||
func (m *Manager) handleState(job jobmod.Job) {
|
|
||||||
logger.WithField("JobID", job.GetJobID()).
|
|
||||||
WithField("State", reflect.TypeOf(job.GetState()).String()).
|
|
||||||
Debugf("job state changed")
|
|
||||||
|
|
||||||
runtime, ok := m.jobs[job.GetJobID()]
|
|
||||||
if !ok {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
state := job.GetState()
|
|
||||||
if state == nil {
|
|
||||||
runtime.Handler = m.defaultHandler
|
|
||||||
m.defaultHandler.Handle(job)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
stateType := reflect.TypeOf(state)
|
|
||||||
handler, ok := m.handlers[stateType]
|
|
||||||
if !ok {
|
|
||||||
runtime.Handler = m.defaultHandler
|
|
||||||
m.defaultHandler.Handle(job)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
runtime.Handler = handler
|
|
||||||
handler.Handle(job)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (m *Manager) onEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
for _, h := range m.handlers {
|
|
||||||
h.OnEvent(broadcast, evt)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,139 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"reflect"
|
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
type makingAdjustSchemeJob struct {
|
|
||||||
job *jobmod.NormalJob
|
|
||||||
state *jobmod.StateMakingAdjustScheme
|
|
||||||
}
|
|
||||||
|
|
||||||
type MakingAdjustSchemeHandler struct {
|
|
||||||
mgr *Manager
|
|
||||||
|
|
||||||
jobs map[schsdk.JobID]*makingAdjustSchemeJob
|
|
||||||
|
|
||||||
cmdChan actor.CommandChannel
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewMakingAdjustSchemeHandler(mgr *Manager) *MakingAdjustSchemeHandler {
|
|
||||||
return &MakingAdjustSchemeHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
jobs: make(map[schsdk.JobID]*makingAdjustSchemeJob),
|
|
||||||
cmdChan: *actor.NewCommandChannel(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *MakingAdjustSchemeHandler) Handle(job jobmod.Job) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
norJob, ok := job.(*jobmod.NormalJob)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
state, ok := job.GetState().(*jobmod.StateMakingAdjustScheme)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
rjob := &makingAdjustSchemeJob{
|
|
||||||
job: norJob,
|
|
||||||
state: state,
|
|
||||||
}
|
|
||||||
h.jobs[job.GetJobID()] = rjob
|
|
||||||
|
|
||||||
h.onJobEvent(nil, rjob)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *MakingAdjustSchemeHandler) onJobEvent(evt event.Event, job *makingAdjustSchemeJob) {
|
|
||||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
|
||||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if job.state.FullTaskID == "" {
|
|
||||||
fullTaskID, err := h.mgr.advMgr.StartTask(job.job.GetJobID(), advtsk.NewMakeAdjustScheme(*job.job))
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
job.state.FullTaskID = fullTaskID
|
|
||||||
}
|
|
||||||
|
|
||||||
if makingRet, err := event.AssertAdvisorTaskStatus[*advtsk.MakeAdjustSchemeStatus](evt, job.state.FullTaskID); err != event.ErrUnconcernedTask {
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed("make adjust scheme timeout", job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.advMgr.ForgetTask(job.state.FullTaskID)
|
|
||||||
|
|
||||||
if makingRet.Error != "" {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(makingRet.Error, job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateAdjusting(makingRet.Scheme))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *MakingAdjustSchemeHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
|
||||||
job.SetState(state)
|
|
||||||
|
|
||||||
delete(h.jobs, job.GetJobID())
|
|
||||||
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *MakingAdjustSchemeHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
if broadcast.ToAll() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if broadcast.ToJobSet() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
} else if broadcast.ToJob() {
|
|
||||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *MakingAdjustSchemeHandler) Serve() {
|
|
||||||
cmdChan := h.cmdChan.BeginChanReceive()
|
|
||||||
defer h.cmdChan.CloseChanReceive()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case cmd := <-cmdChan:
|
|
||||||
cmd()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *MakingAdjustSchemeHandler) Stop() {
|
|
||||||
// TODO 支持STOP
|
|
||||||
}
|
|
|
@ -1,442 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"reflect"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
|
||||||
|
|
||||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
|
||||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
var ErrPreScheduleFailed = fmt.Errorf("pre schedule failed")
|
|
||||||
|
|
||||||
type preSchedulingJob struct {
|
|
||||||
job *jobmod.NormalJob
|
|
||||||
state *jobmod.StatePreScheduling
|
|
||||||
ccInfo schmod.ComputingCenter
|
|
||||||
}
|
|
||||||
|
|
||||||
type PreSchedulingHandler struct {
|
|
||||||
mgr *Manager
|
|
||||||
|
|
||||||
jobs map[schsdk.JobID]*preSchedulingJob
|
|
||||||
|
|
||||||
cmdChan actor.CommandChannel
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewPreSchedulingHandler(mgr *Manager) *PreSchedulingHandler {
|
|
||||||
return &PreSchedulingHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
jobs: make(map[schsdk.JobID]*preSchedulingJob),
|
|
||||||
cmdChan: *actor.NewCommandChannel(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) Handle(job jobmod.Job) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
norJob, ok := job.(*jobmod.NormalJob)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow job: %v", reflect.TypeOf(job)), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
preSchState, ok := norJob.GetState().(*jobmod.StatePreScheduling)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
colCli, err := schglb.CollectorMQPool.Acquire()
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("new collector client: %s", err), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
defer schglb.CollectorMQPool.Release(colCli)
|
|
||||||
|
|
||||||
ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), preSchState.Scheme.TargetCCID)
|
|
||||||
if err != nil {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("getting computing center info: %s", err.Error()), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
norJob.TargetCCID = preSchState.Scheme.TargetCCID
|
|
||||||
preJob := &preSchedulingJob{
|
|
||||||
job: norJob,
|
|
||||||
state: preSchState,
|
|
||||||
ccInfo: ccInfo,
|
|
||||||
}
|
|
||||||
h.jobs[job.GetJobID()] = preJob
|
|
||||||
|
|
||||||
h.onJobEvent(nil, preJob)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) onJobEvent(evt event.Event, job *preSchedulingJob) {
|
|
||||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
|
||||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
err := h.doPackageScheduling(evt, job,
|
|
||||||
job.job.Info.Files.Dataset, &job.job.Files.Dataset,
|
|
||||||
&job.state.Scheme.Dataset, &job.state.Dataset,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
job.state.Dataset.Error = err.Error()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
err = h.doPackageScheduling(evt, job,
|
|
||||||
job.job.Info.Files.Code, &job.job.Files.Code,
|
|
||||||
&job.state.Scheme.Code, &job.state.Code,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
job.state.Code.Error = err.Error()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
err = h.doImageScheduling(evt, job,
|
|
||||||
job.job.Info.Files.Image, &job.job.Files.Image,
|
|
||||||
&job.state.Scheme.Image, &job.state.Image,
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
job.state.Image.Error = err.Error()
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// 如果三种文件都调度完成,则可以进入下个阶段了
|
|
||||||
if job.state.Dataset.Step == jobmod.StepCompleted &&
|
|
||||||
job.state.Code.Step == jobmod.StepCompleted &&
|
|
||||||
job.state.Image.Step == jobmod.StepCompleted {
|
|
||||||
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateReadyToAdjust())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
|
||||||
job.SetState(state)
|
|
||||||
|
|
||||||
delete(h.jobs, job.GetJobID())
|
|
||||||
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) doPackageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
|
|
||||||
// TODO 考虑拆分成多个函数
|
|
||||||
if state.Step == jobmod.StepBegin {
|
|
||||||
switch info := fileInfo.(type) {
|
|
||||||
case *schsdk.LocalJobFileInfo:
|
|
||||||
state.Step = jobmod.StepUploading
|
|
||||||
|
|
||||||
case *schsdk.PackageJobFileInfo:
|
|
||||||
file.PackageID = info.PackageID
|
|
||||||
state.Step = jobmod.StepUploaded
|
|
||||||
|
|
||||||
case *schsdk.ResourceJobFileInfo:
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepUploading {
|
|
||||||
if evt == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
localFileCmd, ok := evt.(*event.LocalFileUploaded)
|
|
||||||
if !ok {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if localFileCmd.Error != "" {
|
|
||||||
return fmt.Errorf("local file uploading: %s", localFileCmd.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
file.PackageID = localFileCmd.PackageID
|
|
||||||
state.Step = jobmod.StepUploaded
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepUploaded {
|
|
||||||
if scheme.Action == jobmod.ActionNo {
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if scheme.Action == jobmod.ActionMove {
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, file.PackageID, job.ccInfo.CDSNodeID))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting cache move package: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepMoving
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if scheme.Action == jobmod.ActionLoad {
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewStorageLoadPackage(1, file.PackageID, job.ccInfo.CDSStorageID))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting stroage load package: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepLoading
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepMoving {
|
|
||||||
moveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("cache move package timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if moveRet.Error != "" {
|
|
||||||
return fmt.Errorf("cache move pacakge: %s", moveRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepLoading {
|
|
||||||
loadRet, err := event.AssertExecutorTaskStatus[*exectsk.StorageLoadPackageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("storage load package timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if loadRet.Error != "" {
|
|
||||||
return fmt.Errorf("storage load package: %s", loadRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
file.FullPath = loadRet.FullPath
|
|
||||||
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) doImageScheduling(evt event.Event, job *preSchedulingJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme, state *jobmod.FileSchedulingState) error {
|
|
||||||
// TODO 考虑拆分成多个函数
|
|
||||||
if state.Step == jobmod.StepBegin {
|
|
||||||
switch info := fileInfo.(type) {
|
|
||||||
case *schsdk.LocalJobFileInfo:
|
|
||||||
state.Step = jobmod.StepUploading
|
|
||||||
|
|
||||||
case *schsdk.ImageJobFileInfo:
|
|
||||||
imageInfo, err := h.mgr.db.Image().GetByID(h.mgr.db.SQLCtx(), info.ImageID)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("getting image info: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
file.ImageID = imageInfo.ImageID
|
|
||||||
file.PackageID = imageInfo.CDSPackageID
|
|
||||||
state.Step = jobmod.StepUploaded
|
|
||||||
|
|
||||||
default:
|
|
||||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(info))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepUploading {
|
|
||||||
if evt == nil {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
localFileCmd, ok := evt.(*event.LocalFileUploaded)
|
|
||||||
if !ok {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if localFileCmd.LocalPath != fileInfo.(*schsdk.LocalJobFileInfo).LocalPath {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if localFileCmd.Error != "" {
|
|
||||||
return fmt.Errorf("local file uploading: %s", localFileCmd.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
// 上传完毕,则可以新建一个空的镜像的记录
|
|
||||||
// TODO 镜像名称
|
|
||||||
imgID, err := h.mgr.db.Image().Create(h.mgr.db.SQLCtx(), &localFileCmd.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("creating image info: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
// 填充ImageID和PackageID
|
|
||||||
file.ImageID = imgID
|
|
||||||
file.PackageID = &localFileCmd.PackageID
|
|
||||||
state.Step = jobmod.StepUploaded
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepUploaded {
|
|
||||||
if scheme.Action == jobmod.ActionNo {
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// 要导入镜像,则需要先将镜像移动到指点节点的缓存中
|
|
||||||
if scheme.Action == jobmod.ActionImportImage {
|
|
||||||
if file.PackageID == nil {
|
|
||||||
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, job.ccInfo.CCID)
|
|
||||||
}
|
|
||||||
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewCacheMovePackage(1, *file.PackageID, job.ccInfo.CDSNodeID))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting cache move package: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepMoving
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return fmt.Errorf("invalid schedule action %s for file info type %v", scheme.Action, reflect.TypeOf(fileInfo))
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepMoving {
|
|
||||||
cacheMoveRet, err := event.AssertExecutorTaskStatus[*exectsk.CacheMovePackageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("cache move package timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if cacheMoveRet.Error != "" {
|
|
||||||
return fmt.Errorf("cache move pacakge: %s", cacheMoveRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("new cloudream storage client: %w", err)
|
|
||||||
}
|
|
||||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
|
||||||
|
|
||||||
pkgObjs, err := stgCli.Object().GetPackageObjects(cdssdk.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("getting package objects: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(pkgObjs.Objects) == 0 {
|
|
||||||
return fmt.Errorf("no object in the package which will be imported")
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(pkgObjs.Objects) > 1 {
|
|
||||||
return fmt.Errorf("there must be only 1 object in the package which will be imported")
|
|
||||||
}
|
|
||||||
|
|
||||||
fullTaskID, err := h.mgr.execMgr.StartTask(job.job.JobID, exectsk.NewUploadImage(job.ccInfo.PCMParticipantID, cdssdk.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)))
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("starting import image: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepImageImporting
|
|
||||||
state.FullTaskID = fullTaskID
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if state.Step == jobmod.StepImageImporting {
|
|
||||||
uploadImageRet, err := event.AssertExecutorTaskStatus[*exectsk.UploadImageStatus](evt, state.FullTaskID)
|
|
||||||
if err == event.ErrUnconcernedTask {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
if err == event.ErrTaskTimeout {
|
|
||||||
return fmt.Errorf("import image timeout")
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.execMgr.ForgetTask(state.FullTaskID)
|
|
||||||
|
|
||||||
if uploadImageRet.Error != "" {
|
|
||||||
return fmt.Errorf("import image: %s", uploadImageRet.Error)
|
|
||||||
}
|
|
||||||
|
|
||||||
err = h.mgr.db.PCMImage().Create(h.mgr.db.SQLCtx(), file.ImageID, job.ccInfo.CCID, uploadImageRet.PCMImageID, uploadImageRet.Name, time.Now())
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("adding image importing info: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
state.Step = jobmod.StepCompleted
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
if broadcast.ToAll() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if broadcast.ToJobSet() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
if job.job.JobSetID != broadcast.JobSetID {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
} else if broadcast.ToJob() {
|
|
||||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) Serve() {
|
|
||||||
cmdChan := h.cmdChan.BeginChanReceive()
|
|
||||||
defer h.cmdChan.CloseChanReceive()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case cmd := <-cmdChan:
|
|
||||||
cmd()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *PreSchedulingHandler) Stop() {
|
|
||||||
// TODO 支持STOP
|
|
||||||
}
|
|
|
@ -1,214 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"reflect"
|
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
// readyToAdjustJob pairs a job with the StateReadyToAdjust value it carried
// when it was handed to ReadyToAdjustHandler.Handle.
type readyToAdjustJob struct {
|
|
||||||
// job is the job tracked by the handler.
job jobmod.Job
|
|
||||||
// state is the job's state as type-asserted in Handle.
state *jobmod.StateReadyToAdjust
|
|
||||||
}
|
|
||||||
|
|
||||||
// ReadyToAdjustHandler tracks jobs that are in the StateReadyToAdjust state
// and moves them to their next state once their prerequisite job allows it.
type ReadyToAdjustHandler struct {
|
|
||||||
// mgr is the owning Manager; its jobs, jobSets and pubLock are used by the handler.
mgr *Manager
|
|
||||||
|
|
||||||
// jobs holds the jobs currently tracked by this handler, keyed by job ID.
jobs map[schsdk.JobID]*readyToAdjustJob
|
|
||||||
|
|
||||||
// cmdChan carries the closures that Serve executes.
cmdChan actor.CommandChannel
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewReadyToAdjustHandler(mgr *Manager) *ReadyToAdjustHandler {
|
|
||||||
return &ReadyToAdjustHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
jobs: make(map[schsdk.JobID]*readyToAdjustJob),
|
|
||||||
cmdChan: *actor.NewCommandChannel(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToAdjustHandler) Handle(job jobmod.Job) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
state, ok := job.GetState().(*jobmod.StateReadyToAdjust)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
rjob := &readyToAdjustJob{
|
|
||||||
job: job,
|
|
||||||
state: state,
|
|
||||||
}
|
|
||||||
h.jobs[job.GetJobID()] = rjob
|
|
||||||
|
|
||||||
h.onJobEvent(nil, rjob)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToAdjustHandler) onJobEvent(evt event.Event, job *readyToAdjustJob) {
|
|
||||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
|
||||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if norJob, ok := job.job.(*jobmod.NormalJob); ok {
|
|
||||||
h.onNormalJobEvent(evt, job, norJob)
|
|
||||||
} else if resJob, ok := job.job.(*jobmod.ResourceJob); ok {
|
|
||||||
h.onResourceJobEvent(evt, job, resJob)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToAdjustHandler) onNormalJobEvent(evt event.Event, job *readyToAdjustJob, norJob *jobmod.NormalJob) {
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
jobSet, ok := h.mgr.jobSets[job.job.GetJobSetID()]
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
needWait := false
|
|
||||||
|
|
||||||
// 无论发生什么事件,都检查一下前置任务的状态
|
|
||||||
if resFile, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok {
|
|
||||||
ref := jobSet.FindRefByLocalJobID(resFile.ResourceLocalJobID)
|
|
||||||
if ref == nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
|
||||||
fmt.Sprintf("job %s not found in job set %s", resFile.ResourceLocalJobID, jobSet.JobSetID),
|
|
||||||
job.state,
|
|
||||||
))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
waitJob := h.mgr.jobs[ref.JobID]
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
if waitJob == nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if _, ok = waitJob.Job.GetState().(*jobmod.StateSuccess); ok {
|
|
||||||
waitResJob, ok := waitJob.Job.(*jobmod.ResourceJob)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
|
||||||
fmt.Sprintf("job(%v) %s is not a resource job", reflect.TypeOf(waitJob), waitResJob.JobID),
|
|
||||||
job.state,
|
|
||||||
))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
norJob.Files.Dataset.PackageID = waitResJob.ResourcePackageID
|
|
||||||
} else if _, ok = waitJob.Job.GetState().(*jobmod.StateFailed); ok {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
|
||||||
fmt.Sprintf("job %s is failed", waitJob.Job.GetJobID()),
|
|
||||||
job.state,
|
|
||||||
))
|
|
||||||
return
|
|
||||||
} else {
|
|
||||||
// 等待的Job不是失败或者成功状态,则需要继续等待
|
|
||||||
needWait = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !needWait {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateMakingAdjustScheme())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToAdjustHandler) onResourceJobEvent(evt event.Event, job *readyToAdjustJob, resJob *jobmod.ResourceJob) {
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
jobSet, ok := h.mgr.jobSets[job.job.GetJobSetID()]
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job set %s not found", job.job.GetJobSetID()), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
needWait := false
|
|
||||||
|
|
||||||
ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID)
|
|
||||||
if ref == nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
|
||||||
fmt.Sprintf("job %s not found in job set %s", resJob.Info.TargetLocalJobID, jobSet.JobSetID),
|
|
||||||
job.state,
|
|
||||||
))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
h.mgr.pubLock.Lock()
|
|
||||||
waitJob := h.mgr.jobs[ref.JobID]
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
if waitJob == nil {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(fmt.Sprintf("job %s not found", ref.JobID), job.state))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// 无论发生什么事件,都检查一下前置任务的状态
|
|
||||||
if _, ok = waitJob.Job.GetState().(*jobmod.StateFailed); ok {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateFailed(
|
|
||||||
fmt.Sprintf("job %s is failed", waitJob.Job.GetJobID()),
|
|
||||||
job.state,
|
|
||||||
))
|
|
||||||
return
|
|
||||||
} else if _, ok = waitJob.Job.GetState().(*jobmod.StateSuccess); !ok {
|
|
||||||
needWait = true
|
|
||||||
}
|
|
||||||
|
|
||||||
if !needWait {
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateReadyToExecute())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// changeJobState sets state on job, stops tracking the job in this handler,
// and passes the job to the manager's handleState while holding pubLock.
func (h *ReadyToAdjustHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
|
||||||
job.SetState(state)
|
|
||||||
|
|
||||||
// The job no longer belongs to this handler once its state has changed.
delete(h.jobs, job.GetJobID())
|
|
||||||
|
|
||||||
// handleState is called with the manager's public lock held.
h.mgr.pubLock.Lock()
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToAdjustHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
if broadcast.ToAll() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if broadcast.ToJobSet() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
} else if broadcast.ToJob() {
|
|
||||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToAdjustHandler) Serve() {
|
|
||||||
cmdChan := h.cmdChan.BeginChanReceive()
|
|
||||||
defer h.cmdChan.CloseChanReceive()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case cmd := <-cmdChan:
|
|
||||||
cmd()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stop is currently a no-op: the handler cannot be stopped yet.
func (h *ReadyToAdjustHandler) Stop() {
|
|
||||||
// TODO: support STOP
|
|
||||||
}
|
|
|
@ -1,122 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"reflect"
|
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/actor"
|
|
||||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
// readyToExecuteJob pairs a job with the StateReadyToExecute value it carried
// when it was handed to ReadyToExecuteHandler.Handle.
type readyToExecuteJob struct {
|
|
||||||
// job is the job tracked by the handler.
job jobmod.Job
|
|
||||||
// state is the job's state as type-asserted in Handle.
state *jobmod.StateReadyToExecute
|
|
||||||
}
|
|
||||||
|
|
||||||
// ReadyToExecuteHandler tracks jobs that are in the StateReadyToExecute state
// and moves them to the executing state.
type ReadyToExecuteHandler struct {
|
|
||||||
// mgr is the owning Manager; its pubLock and handleState are used by the handler.
mgr *Manager
|
|
||||||
|
|
||||||
// jobs holds the jobs currently tracked by this handler, keyed by job ID.
jobs map[schsdk.JobID]*readyToExecuteJob
|
|
||||||
|
|
||||||
// cmdChan carries the closures that Serve executes.
cmdChan actor.CommandChannel
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewReadyToExecuteHandler(mgr *Manager) *ReadyToExecuteHandler {
|
|
||||||
return &ReadyToExecuteHandler{
|
|
||||||
mgr: mgr,
|
|
||||||
jobs: make(map[schsdk.JobID]*readyToExecuteJob),
|
|
||||||
cmdChan: *actor.NewCommandChannel(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToExecuteHandler) Handle(job jobmod.Job) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
state, ok := job.GetState().(*jobmod.StateReadyToExecute)
|
|
||||||
if !ok {
|
|
||||||
h.changeJobState(job, jobmod.NewStateFailed(fmt.Sprintf("unknow state: %v", reflect.TypeOf(job.GetState())), job.GetState()))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
rjob := &readyToExecuteJob{
|
|
||||||
job: job,
|
|
||||||
state: state,
|
|
||||||
}
|
|
||||||
h.jobs[job.GetJobID()] = rjob
|
|
||||||
|
|
||||||
h.onJobEvent(nil, rjob)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToExecuteHandler) onJobEvent(evt event.Event, job *readyToExecuteJob) {
|
|
||||||
if cloneEvt, ok := evt.(*event.CloneJob); ok {
|
|
||||||
cloneEvt.Callback.SetValue(job.job.Clone())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if norJob, ok := job.job.(*jobmod.NormalJob); ok {
|
|
||||||
h.onNormalJobEvent(evt, job, norJob)
|
|
||||||
} else if resJob, ok := job.job.(*jobmod.ResourceJob); ok {
|
|
||||||
h.onResourceJobEvent(evt, job, resJob)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToExecuteHandler) onNormalJobEvent(evt event.Event, job *readyToExecuteJob, norJob *jobmod.NormalJob) {
|
|
||||||
// TODO 目前直接启动执行
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateExecuting())
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToExecuteHandler) onResourceJobEvent(evt event.Event, job *readyToExecuteJob, resJob *jobmod.ResourceJob) {
|
|
||||||
// TODO 目前直接启动执行
|
|
||||||
h.changeJobState(job.job, jobmod.NewStateExecuting())
|
|
||||||
}
|
|
||||||
|
|
||||||
// changeJobState sets state on job, stops tracking the job in this handler,
// and passes the job to the manager's handleState while holding pubLock.
func (h *ReadyToExecuteHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
|
|
||||||
job.SetState(state)
|
|
||||||
|
|
||||||
// The job no longer belongs to this handler once its state has changed.
delete(h.jobs, job.GetJobID())
|
|
||||||
|
|
||||||
// handleState is called with the manager's public lock held.
h.mgr.pubLock.Lock()
|
|
||||||
h.mgr.handleState(job)
|
|
||||||
h.mgr.pubLock.Unlock()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToExecuteHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
|
|
||||||
h.cmdChan.Send(func() {
|
|
||||||
if broadcast.ToAll() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
|
|
||||||
} else if broadcast.ToJobSet() {
|
|
||||||
for _, job := range h.jobs {
|
|
||||||
if job.job.GetJobSetID() != broadcast.JobSetID {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
} else if broadcast.ToJob() {
|
|
||||||
if job, ok := h.jobs[broadcast.JobID]; ok {
|
|
||||||
h.onJobEvent(evt, job)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *ReadyToExecuteHandler) Serve() {
|
|
||||||
cmdChan := h.cmdChan.BeginChanReceive()
|
|
||||||
defer h.cmdChan.CloseChanReceive()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case cmd := <-cmdChan:
|
|
||||||
cmd()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stop is currently a no-op: the handler cannot be stopped yet.
func (h *ReadyToExecuteHandler) Stop() {
|
|
||||||
// TODO: support STOP
|
|
||||||
}
|
|
|
@ -1,17 +0,0 @@
|
||||||
package jobmgr
|
|
||||||
|
|
||||||
import (
|
|
||||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
|
||||||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
|
||||||
)
|
|
||||||
|
|
||||||
// StateHandler is the set of operations a per-state job handler provides:
// accepting jobs, receiving events, running, and stopping.
type StateHandler interface {
|
|
||||||
// Handle processes a job. The global lock is held while it is called.
|
|
||||||
Handle(job jobmod.Job)
|
|
||||||
// OnEvent is called when an external event has occurred.
|
|
||||||
OnEvent(broadcast event.Broadcast, evt event.Event)
|
|
||||||
// Serve runs the handler.
|
|
||||||
Serve()
|
|
||||||
// Stop stops this handler.
|
|
||||||
Stop()
|
|
||||||
}
|
|
|
@ -1,23 +1,50 @@
|
||||||
package mq
|
package mq
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
"gitlink.org.cn/cloudream/common/consts/errorcode"
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
"gitlink.org.cn/cloudream/common/pkgs/mq"
|
||||||
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||||||
|
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job/state"
|
||||||
)
|
)
|
||||||
|
|
||||||
// 提交任务集
|
// 提交任务集
|
||||||
func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) {
|
func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetResp, *mq.CodeMessage) {
|
||||||
logger.Debugf("submitting job")
|
logger.Debugf("submitting job")
|
||||||
|
|
||||||
jobSet, err := svc.jobMgr.SubmitJobSet(msg.JobSet, msg.PreScheduleScheme)
|
var jobs []jobmgr.SubmittingJob
|
||||||
if err != nil {
|
for _, jobInfo := range msg.JobSet.Jobs {
|
||||||
logger.Warnf("submitting job set: %s", err.Error())
|
switch info := jobInfo.(type) {
|
||||||
return nil, mq.Failed(errorcode.OperationFailed, "submit job set failed")
|
case *schsdk.NormalJobInfo:
|
||||||
|
job := job.NewNormalJob(*info)
|
||||||
|
|
||||||
|
preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID]
|
||||||
|
if !ok {
|
||||||
|
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID))
|
||||||
|
}
|
||||||
|
|
||||||
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
||||||
|
Body: job,
|
||||||
|
InitState: state.NewPreSchuduling(preSch),
|
||||||
|
})
|
||||||
|
|
||||||
|
case *schsdk.DataReturnJobInfo:
|
||||||
|
job := job.NewResourceJob(*info)
|
||||||
|
jobs = append(jobs, jobmgr.SubmittingJob{
|
||||||
|
Body: job,
|
||||||
|
InitState: state.NewWaitTargetComplete(),
|
||||||
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(jobSet.JobSetID))
|
return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs)))
|
||||||
}
|
}
|
||||||
|
|
||||||
// 任务集中某个文件上传完成
|
// 任务集中某个文件上传完成
|
||||||
|
@ -26,16 +53,15 @@ func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded)
|
||||||
WithField("PackageID", msg.PackageID).
|
WithField("PackageID", msg.PackageID).
|
||||||
Debugf("local file uploaded")
|
Debugf("local file uploaded")
|
||||||
|
|
||||||
svc.jobMgr.LocalFileUploaded(msg.JobSetID, msg.LocalPath, msg.Error, msg.PackageID)
|
svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, errors.New(msg.Error), msg.PackageID))
|
||||||
return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp())
|
return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp())
|
||||||
}
|
}
|
||||||
|
|
||||||
func (svc *Service) GetJob(msg *mgrmq.GetJob) (*mgrmq.GetJobResp, *mq.CodeMessage) {
|
func (svc *Service) GetJobSetStatus(msg *mgrmq.GetJobSetStatus) (*mgrmq.GetJobSetStatusResp, *mq.CodeMessage) {
|
||||||
job, err := svc.jobMgr.CloneJob(msg.JobID)
|
jobs := svc.jobMgr.DumpJobSet(msg.JobSetID)
|
||||||
if err != nil {
|
if len(jobs) == 0 {
|
||||||
logger.WithField("JobID", msg.JobID).Warnf("cloning job: %s", err.Error())
|
return nil, mq.Failed(errorcode.OperationFailed, "job set not found")
|
||||||
return nil, mq.Failed(errorcode.OperationFailed, "get job failed")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return mq.ReplyOK(mgrmq.NewGetJobResp(job))
|
return mq.ReplyOK(mgrmq.RespGetJobSetStatus(jobs))
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue