195 lines
4.7 KiB
Go
195 lines
4.7 KiB
Go
package executormgr
|
||
|
||
import (
|
||
"bufio"
|
||
"fmt"
|
||
"gitlink.org.cn/cloudream/common/pkgs/async"
|
||
log "gitlink.org.cn/cloudream/common/pkgs/logger"
|
||
"gitlink.org.cn/cloudream/common/utils/serder"
|
||
jobTask "gitlink.org.cn/cloudream/scheduler/manager/internal/task"
|
||
"io"
|
||
"strings"
|
||
"sync"
|
||
"time"
|
||
|
||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||
exemq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
|
||
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
|
||
)
|
||
|
||
type task struct {
|
||
statusChan *async.UnboundChannel[mgrmq.ExecutorTaskStatus]
|
||
}
|
||
type ExecutorStatus struct {
|
||
executorID schmod.ExecutorID
|
||
tasks map[string]task // key 为 TaskID
|
||
}
|
||
|
||
// ErrWaitReportTimeout is a sentinel error carrying the message
// "wait report timeout".
// NOTE(review): it is not referenced in the visible part of this file;
// presumably it signals that no executor report arrived within
// Manager.reportTimeout — confirm against callers.
var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")
|
||
|
||
// ExecutorPool is the package-level pool from which executor HTTP clients are
// acquired by URL (see ReceiveExecutorTaskStatus and StartTask). It holds its
// zero value until InitExecutorPool assigns it.
var ExecutorPool exemq.HttpPool
|
||
|
||
func InitExecutorPool() {
|
||
ExecutorPool = exemq.NewHttpPool(&exemq.Config{})
|
||
}
|
||
|
||
type Manager struct {
|
||
executors map[schmod.ExecutorID]*ExecutorStatus
|
||
lock sync.Mutex
|
||
exeCli *exemq.Client
|
||
|
||
reportTimeout time.Duration
|
||
}
|
||
|
||
func NewManager(reportTimeout time.Duration) (*Manager, error) {
|
||
exeCli, err := schglb.ExecutorMQPool.Acquire()
|
||
if err != nil {
|
||
return nil, fmt.Errorf("new executor client: %w", err)
|
||
}
|
||
|
||
return &Manager{
|
||
executors: make(map[schmod.ExecutorID]*ExecutorStatus),
|
||
exeCli: exeCli,
|
||
reportTimeout: reportTimeout,
|
||
}, nil
|
||
}
|
||
|
||
func (m *Manager) ReceiveExecutorTaskStatus(url string) (*mgrmq.ExecutorTaskStatus, error) {
|
||
|
||
client, err := ExecutorPool.AcquireByUrl(url)
|
||
if err != nil {
|
||
log.Error(err)
|
||
return &mgrmq.ExecutorTaskStatus{}, err
|
||
}
|
||
resp, err := client.GetReportInfo()
|
||
if err != nil {
|
||
log.Error(err)
|
||
return &mgrmq.ExecutorTaskStatus{}, err
|
||
}
|
||
|
||
reader := bufio.NewReader(resp.Body)
|
||
|
||
line, err := reader.ReadString('\n')
|
||
if err != nil && err != io.EOF {
|
||
log.Error("Error reading from response body:", err)
|
||
return &mgrmq.ExecutorTaskStatus{}, err
|
||
}
|
||
// TODO 第一次获取的值包含执行器所有任务,用于失败重试
|
||
executorInfo := convertLine(line)
|
||
// 将第一次的executor放入到池子中
|
||
exec := &ExecutorStatus{
|
||
executorID: executorInfo.ExecutorID,
|
||
tasks: make(map[string]task),
|
||
}
|
||
|
||
m.executors[executorInfo.ExecutorID] = exec
|
||
|
||
go func() {
|
||
for {
|
||
line, err = reader.ReadString('\n')
|
||
if err != nil {
|
||
if err != io.EOF {
|
||
log.Error("Error reading from response body:", err)
|
||
}
|
||
return
|
||
}
|
||
|
||
status := convertLine(line)
|
||
if status == nil {
|
||
continue
|
||
}
|
||
|
||
m.Report(*status)
|
||
}
|
||
}()
|
||
|
||
return executorInfo, nil
|
||
}
|
||
|
||
func convertLine(line string) *mgrmq.ExecutorTaskStatus {
|
||
if line == "" {
|
||
return nil
|
||
}
|
||
|
||
line = strings.TrimPrefix(line, "data: ")
|
||
line = strings.TrimSpace(line)
|
||
if len(line) == 0 {
|
||
return nil
|
||
}
|
||
|
||
readResp, err := serder.JSONToObjectEx[mgrmq.ExecutorTaskStatus]([]byte(line))
|
||
if err != nil {
|
||
log.Error(err)
|
||
return nil
|
||
}
|
||
|
||
return &readResp
|
||
}
|
||
|
||
func (m *Manager) Report(status mgrmq.ExecutorTaskStatus) {
|
||
m.lock.Lock()
|
||
defer m.lock.Unlock()
|
||
|
||
exec := m.executors[status.ExecutorID]
|
||
if exec == nil {
|
||
log.Error("Executor not found: ", status.ExecutorID)
|
||
return
|
||
}
|
||
// 由于先将task chan放入到池子中再执行的task,所以这里的task必存在
|
||
tsk := exec.tasks[status.TaskID]
|
||
|
||
// TODO 考虑主动检测channel是否关闭,然后取消task
|
||
if tsk.statusChan.Send(status) != nil {
|
||
delete(exec.tasks, status.TaskID)
|
||
|
||
if len(exec.tasks) == 0 {
|
||
delete(m.executors, exec.executorID)
|
||
}
|
||
}
|
||
}
|
||
|
||
// 启动一个Task
|
||
func (m *Manager) StartTask(info exetsk.TaskInfo, ccInfo schmod.ComputingCenter) (*jobTask.JobTask[mgrmq.ExecutorTaskStatus], error) {
|
||
m.lock.Lock()
|
||
defer m.lock.Unlock()
|
||
newJobTask := jobTask.NewJobTask[mgrmq.ExecutorTaskStatus]()
|
||
ch := newJobTask.Chan()
|
||
|
||
client, err := ExecutorPool.AcquireByUrl(ccInfo.ExecutorURL)
|
||
if err != nil {
|
||
ch.CloseWithError(fmt.Errorf("start task: %w", err))
|
||
return newJobTask, err
|
||
}
|
||
|
||
executorID := schmod.ExecutorID(ccInfo.ExecutorID)
|
||
// 检测是否连接过这个Executor,如果第一次连,则发送请求监听上报信息
|
||
_, ok := m.executors[executorID]
|
||
if !ok {
|
||
_, err = m.ReceiveExecutorTaskStatus(ccInfo.ExecutorURL)
|
||
if err != nil {
|
||
ch.CloseWithError(fmt.Errorf("start task: %w", err))
|
||
return newJobTask, err
|
||
}
|
||
}
|
||
|
||
// 上面已经将executor放入到池子中了,这里的executor必存在
|
||
exeInfo := m.executors[executorID]
|
||
exeInfo.tasks[newJobTask.ID()] = task{
|
||
statusChan: ch,
|
||
}
|
||
|
||
_, err = client.SubmitTask(exemq.NewStartTask(newJobTask.ID(), info))
|
||
if err != nil {
|
||
ch.CloseWithError(fmt.Errorf("start task: %w", err))
|
||
return newJobTask, err
|
||
}
|
||
|
||
return newJobTask, nil
|
||
}
|
||
|
||
func (m *Manager) Serve() {
|
||
InitExecutorPool()
|
||
}
|