JCC-CSScheduler/manager/internal/advisormgr/advisormgr.go

163 lines
3.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package advisormgr
import (
"fmt"
"sync"
"time"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)
// jobTask links a task started on an advisor to the job it was started for.
type jobTask struct {
// JobID is the job the task was associated with when it was started.
JobID schsdk.JobID
// TaskID is the task ID returned by the advisor in its StartTask response.
TaskID string
// FullTaskID is "<AdvisorID>-<TaskID>", the ID handed back to StartTask callers.
FullTaskID string
}
// AdvisorInfo is the state the Manager keeps for a single advisor.
type AdvisorInfo struct {
// advisorID identifies the advisor this entry belongs to.
advisorID schmod.AdvisorID
jobTasks map[string]jobTask // keyed by TaskID
// lastReportTime is refreshed on every Report from the advisor and is
// initialised to the current time when StartTask creates the entry.
lastReportTime time.Time
}
// OnTaskUpdatedCallbackFn is called by Manager.Report for every reported task
// status whose task is currently tracked.
type OnTaskUpdatedCallbackFn func(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.AdvTaskStatus)
// OnTimeoutCallbackFn is called by Manager.Serve for every tracked task of an
// advisor whose last report is at least the report timeout old.
type OnTimeoutCallbackFn func(jobID schsdk.JobID, fullTaskID string)
// Manager starts tasks on advisors, keeps track of which job each task belongs
// to, and notifies registered callbacks about task status updates and about
// advisors that stopped reporting.
type Manager struct {
// advisors holds the per-advisor state, keyed by advisor ID.
advisors map[schmod.AdvisorID]*AdvisorInfo
// lock is held by Report, StartTask, ForgetTask and Serve while they touch advisors.
lock sync.Mutex
// advCli is the MQ client used to send StartTask requests.
advCli *advmq.Client
// onTaskUpdated is the callback set through OnTaskUpdated.
onTaskUpdated OnTaskUpdatedCallbackFn
// onTaskTimeout is the callback set through OnTaskTimeout.
onTaskTimeout OnTimeoutCallbackFn
// reportTimeout is the age of lastReportTime at which Serve drops an advisor.
reportTimeout time.Duration
}
// NewManager creates a Manager with an empty advisor table.
//
// reportTimeout is the age of an advisor's last report at which Serve treats
// the advisor as timed out. An advisor MQ client is acquired from
// schglb.AdvisorMQPool; if that fails, the error is wrapped and returned
// together with a nil Manager.
func NewManager(reportTimeout time.Duration) (*Manager, error) {
	advCli, err := schglb.AdvisorMQPool.Acquire()
	if err != nil {
		// The client comes from the advisor pool, so the message names the advisor.
		return nil, fmt.Errorf("new advisor client: %w", err)
	}

	return &Manager{
		advisors:      make(map[schmod.AdvisorID]*AdvisorInfo),
		advCli:        advCli,
		reportTimeout: reportTimeout,
	}, nil
}
// OnTaskUpdated registers the callback that Report invokes for each reported
// status of a tracked task. It replaces any previously registered callback.
// The field is assigned without taking the manager lock.
func (m *Manager) OnTaskUpdated(callback OnTaskUpdatedCallbackFn) {
m.onTaskUpdated = callback
}
// OnTaskTimeout registers the callback that Serve invokes for each tracked task
// of a timed-out advisor. It replaces any previously registered callback.
// The field is assigned without taking the manager lock.
func (m *Manager) OnTaskTimeout(callback OnTimeoutCallbackFn) {
m.onTaskTimeout = callback
}
// Report records a status report from the advisor identified by advID.
//
// An advisor seen for the first time is registered with an empty task table.
// The advisor's last-report time is set to the current time, and for every
// entry of taskStatus whose TaskID is tracked for this advisor the callback
// registered with OnTaskUpdated is invoked. Entries for untracked tasks are
// skipped. If no callback has been registered, the report only refreshes the
// last-report time.
//
// The callback runs while the manager lock is held, so it must not call
// Manager methods that take the lock themselves.
func (m *Manager) Report(advID schmod.AdvisorID, taskStatus []mgrmq.AdvisorTaskStatus) {
	m.lock.Lock()
	defer m.lock.Unlock()

	info, ok := m.advisors[advID]
	if !ok {
		info = &AdvisorInfo{
			advisorID: advID,
			jobTasks:  make(map[string]jobTask),
		}
		m.advisors[advID] = info
	}

	info.lastReportTime = time.Now()

	// Calling a nil function value panics; without a listener there is
	// nothing to notify.
	if m.onTaskUpdated == nil {
		return
	}

	for _, s := range taskStatus {
		tsk, ok := info.jobTasks[s.TaskID]
		if !ok {
			continue
		}

		m.onTaskUpdated(tsk.JobID, tsk.FullTaskID, s.Status)
	}
}
// StartTask sends a start-task request built from info through the advisor MQ
// client and associates the started task with jobID.
//
// On success it returns the full task ID, formatted as "<AdvisorID>-<TaskID>"
// from the response. If the request fails, the error is returned unchanged
// together with an empty ID and nothing is recorded.
//
// The manager lock is held for the whole call, including the request itself.
func (m *Manager) StartTask(jobID schsdk.JobID, info advtsk.AdvTaskInfo) (string, error) {
	m.lock.Lock()
	defer m.lock.Unlock()

	resp, err := m.advCli.StartTask(advmq.NewStartTask(info))
	if err != nil {
		return "", err
	}

	task := jobTask{
		JobID:      jobID,
		TaskID:     resp.TaskID,
		FullTaskID: fmt.Sprintf("%s-%s", resp.AdvisorID, resp.TaskID),
	}

	// Register the advisor on first contact so its task can be tracked.
	advInfo, known := m.advisors[resp.AdvisorID]
	if !known {
		advInfo = &AdvisorInfo{
			advisorID:      resp.AdvisorID,
			jobTasks:       make(map[string]jobTask),
			lastReportTime: time.Now(),
		}
		m.advisors[resp.AdvisorID] = advInfo
	}

	advInfo.jobTasks[task.TaskID] = task
	return task.FullTaskID, nil
}
// ForgetTask stops tracking the task identified by fullTaskID (the value
// returned by StartTask). It does not stop the task itself; the manager's
// callbacks simply no longer receive updates for it. An unknown fullTaskID
// is a no-op.
func (m *Manager) ForgetTask(fullTaskID string) {
	m.lock.Lock()
	defer m.lock.Unlock()

	for _, adv := range m.advisors {
		for taskID, tsk := range adv.jobTasks {
			if tsk.FullTaskID != fullTaskID {
				continue
			}

			// jobTasks is keyed by the advisor-local TaskID, not by the
			// full task ID, so the entry must be removed by its map key.
			delete(adv.jobTasks, taskID)
			return
		}
	}
}
// Serve runs the advisor timeout check once per second and never returns.
//
// On every tick, each advisor whose last report is at least reportTimeout old
// is removed from the manager. Before removal, the callback registered with
// OnTaskTimeout is invoked for each task still tracked for that advisor; if no
// callback has been registered, the advisor is removed without notification.
//
// The callback runs while the manager lock is held, so it must not call
// Manager methods that take the lock themselves.
func (m *Manager) Serve() error {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		<-ticker.C

		// Run each sweep in its own function so the deferred unlock fires
		// at the end of the sweep rather than at the end of Serve.
		func() {
			m.lock.Lock()
			defer m.lock.Unlock()

			now := time.Now()
			for advID, advInfo := range m.advisors {
				if now.Sub(advInfo.lastReportTime) < m.reportTimeout {
					continue
				}

				// Calling a nil function value panics, so only notify
				// when a listener has been registered.
				if m.onTaskTimeout != nil {
					for _, tsk := range advInfo.jobTasks {
						m.onTaskTimeout(tsk.JobID, tsk.FullTaskID)
					}
				}

				// Deleting the current key while ranging over a map is safe in Go.
				delete(m.advisors, advID)
			}
		}()
	}
}