JCC-CSScheduler/manager/internal/executormgr/executormgr.go

164 lines
3.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package executormgr
import (
"fmt"
"sync"
"time"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
exemq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)
// jobTask links a task started on an executor to the job it was started for.
type jobTask struct {
// JobID is the job the task was started for.
JobID schsdk.JobID
// TaskID is the task ID returned by the executor; it is the key used in ExecutorInfo.jobTasks.
TaskID string
// FullTaskID is "<executorID>-<taskID>", the ID that StartTask returns to its caller.
FullTaskID string
}
// ExecutorInfo holds what the Manager tracks for a single executor.
type ExecutorInfo struct {
executorID schmod.ExecutorID
jobTasks map[string]jobTask // keyed by TaskID
// lastReportTime is compared against Manager.reportTimeout by Serve to detect executors that stopped reporting.
lastReportTime time.Time
}
// OnTaskUpdatedCallbackFn is the signature of the callback registered with Manager.OnTaskUpdated.
// Report invokes it with the job ID, full task ID and reported status of each tracked task found in a report.
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus exetsk.TaskStatus)
// OnTimeoutCallbackFn is the signature of the callback registered with Manager.OnTaskTimeout.
// Serve invokes it for each tracked task of an executor whose last report is older than the report timeout.
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
// Manager tracks executors and the tasks started on them, forwards reported
// task statuses to a callback, and detects executors that stop reporting.
type Manager struct {
// executors maps each known executor to its tracking record.
executors map[schmod.ExecutorID]*ExecutorInfo
// lock is held by Report, StartTask, ForgetTask and the Serve loop while they touch executors.
lock sync.Mutex
// exeCli is the MQ client used to send StartTask requests to executors.
exeCli *exemq.Client
// NOTE(review): the two callbacks are assigned without taking lock; presumably
// they are registered before Serve/Report start running — confirm with callers.
onTaskUpdated OnTaskUpdatedCallbackFn
onTaskTimeout OnTimeoutCallbackFn
// reportTimeout is the age of lastReportTime at which Serve treats an executor as timed out.
reportTimeout time.Duration
}
// NewManager builds a Manager with an empty executor table.
//
// It acquires an executor MQ client from schglb.ExecutorMQPool; if that fails
// the error is wrapped and returned and no Manager is created. reportTimeout
// is the silence interval after which Serve considers an executor timed out.
func NewManager(reportTimeout time.Duration) (*Manager, error) {
	cli, acquireErr := schglb.ExecutorMQPool.Acquire()
	if acquireErr != nil {
		return nil, fmt.Errorf("new executor client: %w", acquireErr)
	}

	mgr := &Manager{
		executors:     map[schmod.ExecutorID]*ExecutorInfo{},
		exeCli:        cli,
		reportTimeout: reportTimeout,
	}
	return mgr, nil
}
// OnTaskUpdated registers the callback that Report invokes for every tracked
// task whose status appears in an executor report. A later call replaces the
// previously registered callback.
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
	m.onTaskUpdated = callback
}
// OnTaskTimeout registers the callback that Serve invokes for every tracked
// task of an executor that has timed out. A later call replaces the
// previously registered callback.
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
	m.onTaskTimeout = callback
}
// Report records a status report received from the executor execID.
//
// The executor's last-report time is refreshed; an executor seen for the first
// time gets a fresh, empty tracking record. For every entry in taskStatus
// whose TaskID is tracked for that executor, the OnTaskUpdated callback is
// invoked with the owning job ID, the full task ID and the reported status.
// Entries for untracked task IDs are skipped, and if no callback has been
// registered only the timestamp is refreshed.
//
// The callback runs while the manager's lock is held, so it must not call
// Manager methods that take the same lock.
func (m *Manager) Report(execID schmod.ExecutorID, taskStatus []mgrmq.ExecutorTaskStatus) {
	m.lock.Lock()
	defer m.lock.Unlock()

	info, ok := m.executors[execID]
	if !ok {
		// Assign to the outer variable. Declaring a new one with := here would
		// leave the outer info nil and panic on the dereference below.
		info = &ExecutorInfo{
			executorID: execID,
			jobTasks:   make(map[string]jobTask),
		}
		m.executors[execID] = info
	}
	info.lastReportTime = time.Now()

	if m.onTaskUpdated == nil {
		// Nobody is listening; refreshing the timestamp was all there is to do.
		return
	}

	for _, s := range taskStatus {
		tsk, ok := info.jobTasks[s.TaskID]
		if !ok {
			continue
		}
		m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status)
	}
}
// StartTask asks an executor to start the task described by info and
// associates the started task with jobID.
//
// On success it returns a full task ID of the form "<executorID>-<taskID>",
// intended to be unique across executors. If the request to the executor
// fails, the error is returned unchanged and nothing is recorded.
//
// The manager's lock is held for the whole call, including the request to the
// executor, so a report cannot be processed before the task is registered.
func (m *Manager) StartTask(jobID schsdk.JobID, info exetsk.TaskInfo) (string, error) {
	m.lock.Lock()
	defer m.lock.Unlock()

	resp, err := m.exeCli.StartTask(exemq.NewStartTask(info))
	if err != nil {
		return "", err
	}

	record := jobTask{
		JobID:      jobID,
		TaskID:     resp.TaskID,
		FullTaskID: fmt.Sprintf("%s-%s", resp.ExecutorID, resp.TaskID),
	}

	// Create the executor's tracking record the first time it is seen.
	owner, known := m.executors[resp.ExecutorID]
	if !known {
		owner = &ExecutorInfo{
			executorID:     resp.ExecutorID,
			jobTasks:       map[string]jobTask{},
			lastReportTime: time.Now(),
		}
		m.executors[resp.ExecutorID] = owner
	}
	owner.jobTasks[record.TaskID] = record

	return record.FullTaskID, nil
}
// ForgetTask stops tracking the task identified by fullTaskID.
//
// Calling it does not stop the task from running; it only means the callbacks
// will no longer be invoked for this task. An ID that is not tracked is
// silently ignored.
func (m *Manager) ForgetTask(fullTaskID string) {
	m.lock.Lock()
	defer m.lock.Unlock()

	for _, exe := range m.executors {
		for taskID, tsk := range exe.jobTasks {
			if tsk.FullTaskID != fullTaskID {
				continue
			}
			// jobTasks is keyed by the executor-local TaskID, not by the full
			// task ID, so the entry must be removed via its map key.
			delete(exe.jobTasks, taskID)
			return
		}
	}
}
// Serve runs the executor timeout check once per second.
//
// On every tick it scans all known executors under the manager's lock. An
// executor whose last report is at least reportTimeout old is treated as
// timed out: the OnTaskTimeout callback, if one is registered, is invoked for
// each of its tracked tasks, and the executor is then removed from the table.
//
// Serve blocks for as long as the ticker runs and has no stop mechanism, so
// callers would normally run it in its own goroutine. The callback runs while
// the lock is held and must not call Manager methods that take the same lock.
func (m *Manager) Serve() error {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for range ticker.C {
		// The closure scopes the deferred unlock to a single tick.
		func() {
			m.lock.Lock()
			defer m.lock.Unlock()

			now := time.Now()
			for exeID, exeInfo := range m.executors {
				if now.Sub(exeInfo.lastReportTime) < m.reportTimeout {
					continue
				}

				// Without a registered callback there is nobody to notify,
				// but the stale executor is still dropped.
				if m.onTaskTimeout != nil {
					for _, tsk := range exeInfo.jobTasks {
						m.onTaskTimeout(tsk.JobID, tsk.FullTaskID)
					}
				}

				// Removing the current key while ranging over a map is safe in Go.
				delete(m.executors, exeID)
			}
		}()
	}

	// Not reached in practice: a Ticker's channel is never closed.
	return nil
}