JCC-CSScheduler/manager/internal/executormgr/executormgr.go

195 lines
4.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package executormgr
import (
"bufio"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/async"
log "gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/utils/serder"
jobTask "gitlink.org.cn/cloudream/scheduler/manager/internal/task"
"io"
"strings"
"sync"
"time"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
exemq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)
type task struct {
statusChan *async.UnboundChannel[mgrmq.ExecutorTaskStatus]
}
type ExecutorStatus struct {
executorID schmod.ExecutorID
tasks map[string]task // key 为 TaskID
}
var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
var ExecutorPool exemq.HttpPool
func InitExecutorPool() {
ExecutorPool = exemq.NewHttpPool(&exemq.Config{})
}
type Manager struct {
executors map[schmod.ExecutorID]*ExecutorStatus
lock sync.Mutex
exeCli *exemq.Client
reportTimeout time.Duration
}
func NewManager(reportTimeout time.Duration) (*Manager, error) {
exeCli, err := schglb.ExecutorMQPool.Acquire()
if err != nil {
return nil, fmt.Errorf("new executor client: %w", err)
}
return &Manager{
executors: make(map[schmod.ExecutorID]*ExecutorStatus),
exeCli: exeCli,
reportTimeout: reportTimeout,
}, nil
}
func (m *Manager) ReceiveExecutorTaskStatus(url string) (*mgrmq.ExecutorTaskStatus, error) {
client, err := ExecutorPool.AcquireByUrl(url)
if err != nil {
log.Error(err)
return &mgrmq.ExecutorTaskStatus{}, err
}
resp, err := client.GetReportInfo()
if err != nil {
log.Error(err)
return &mgrmq.ExecutorTaskStatus{}, err
}
reader := bufio.NewReader(resp.Body)
line, err := reader.ReadString('\n')
if err != nil && err != io.EOF {
log.Error("Error reading from response body:", err)
return &mgrmq.ExecutorTaskStatus{}, err
}
// TODO 第一次获取的值包含执行器所有任务,用于失败重试
executorInfo := convertLine(line)
// 将第一次的executor放入到池子中
exec := &ExecutorStatus{
executorID: executorInfo.ExecutorID,
tasks: make(map[string]task),
}
m.executors[executorInfo.ExecutorID] = exec
go func() {
for {
line, err = reader.ReadString('\n')
if err != nil {
if err != io.EOF {
log.Error("Error reading from response body:", err)
}
return
}
status := convertLine(line)
if status == nil {
continue
}
m.Report(*status)
}
}()
return executorInfo, nil
}
func convertLine(line string) *mgrmq.ExecutorTaskStatus {
if line == "" {
return nil
}
line = strings.TrimPrefix(line, "data: ")
line = strings.TrimSpace(line)
if len(line) == 0 {
return nil
}
readResp, err := serder.JSONToObjectEx[mgrmq.ExecutorTaskStatus]([]byte(line))
if err != nil {
log.Error(err)
return nil
}
return &readResp
}
func (m *Manager) Report(status mgrmq.ExecutorTaskStatus) {
m.lock.Lock()
defer m.lock.Unlock()
exec := m.executors[status.ExecutorID]
if exec == nil {
log.Error("Executor not found: ", status.ExecutorID)
return
}
// 由于先将task chan放入到池子中再执行的task所以这里的task必存在
tsk := exec.tasks[status.TaskID]
// TODO 考虑主动检测channel是否关闭然后取消task
if tsk.statusChan.Send(status) != nil {
delete(exec.tasks, status.TaskID)
if len(exec.tasks) == 0 {
delete(m.executors, exec.executorID)
}
}
}
// 启动一个Task
func (m *Manager) StartTask(info exetsk.TaskInfo, ccInfo schmod.ComputingCenter) (*jobTask.JobTask[mgrmq.ExecutorTaskStatus], error) {
m.lock.Lock()
defer m.lock.Unlock()
newJobTask := jobTask.NewJobTask[mgrmq.ExecutorTaskStatus]()
ch := newJobTask.Chan()
client, err := ExecutorPool.AcquireByUrl(ccInfo.ExecutorURL)
if err != nil {
ch.CloseWithError(fmt.Errorf("start task: %w", err))
return newJobTask, err
}
executorID := schmod.ExecutorID(ccInfo.ExecutorID)
// 检测是否连接过这个Executor如果第一次连则发送请求监听上报信息
_, ok := m.executors[executorID]
if !ok {
_, err = m.ReceiveExecutorTaskStatus(ccInfo.ExecutorURL)
if err != nil {
ch.CloseWithError(fmt.Errorf("start task: %w", err))
return newJobTask, err
}
}
// 上面已经将executor放入到池子中了这里的executor必存在
exeInfo := m.executors[executorID]
exeInfo.tasks[newJobTask.ID()] = task{
statusChan: ch,
}
_, err = client.SubmitTask(exemq.NewStartTask(newJobTask.ID(), info))
if err != nil {
ch.CloseWithError(fmt.Errorf("start task: %w", err))
return newJobTask, err
}
return newJobTask, nil
}
func (m *Manager) Serve() {
InitExecutorPool()
}