181 lines
4.7 KiB
Go
181 lines
4.7 KiB
Go
package task
|
||
|
||
import (
|
||
"fmt"
|
||
"time"
|
||
|
||
"gitlink.org.cn/cloudream/common/models"
|
||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||
"gitlink.org.cn/cloudream/common/pkgs/task"
|
||
"gitlink.org.cn/cloudream/common/utils/convertto"
|
||
"gitlink.org.cn/cloudream/scheduler/common/globals"
|
||
"gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
|
||
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector"
|
||
)
|
||
|
||
type GetScheduleScheme struct {
|
||
Job job.NormalJob
|
||
preAdjustNodeID int64
|
||
}
|
||
|
||
func NewGetScheduleScheme() *GetScheduleScheme {
|
||
return &GetScheduleScheme{}
|
||
}
|
||
|
||
func (t *GetScheduleScheme) Execute(task *task.Task[TaskContext], ctx TaskContext, complete CompleteFn) {
|
||
log := logger.WithType[GetScheduleScheme]("Task")
|
||
log.Debugf("begin")
|
||
defer log.Debugf("end")
|
||
|
||
err := t.do(task.ID(), ctx)
|
||
if err != nil {
|
||
//TODO 若任务失败,上报的状态failed字段根据情况修改
|
||
ctx.reporter.Report(task.ID(), advtsk.NewTaskStatus("failed", err.Error(), true, advtsk.AdjustedScheme{}))
|
||
} else {
|
||
ctx.reporter.Report(task.ID(), advtsk.NewTaskStatus("failed", err.Error(), false, advtsk.AdjustedScheme{}))
|
||
}
|
||
ctx.reporter.ReportNow()
|
||
|
||
complete(err, CompleteOption{
|
||
RemovingDelay: time.Minute,
|
||
})
|
||
}
|
||
|
||
func (t *GetScheduleScheme) do(taskID string, ctx TaskContext) error {
|
||
isAvailable, err := t.CheckResourceAvailability()
|
||
if err != nil {
|
||
return err
|
||
}
|
||
|
||
if isAvailable {
|
||
// 确认code、dataset、image是否已经调度到该中心
|
||
} else {
|
||
// 重新执行预调度方案,寻找最优节点
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// 检查预调度节点资源是否足够
|
||
func (t *GetScheduleScheme) CheckResourceAvailability() (bool, error) {
|
||
colCli, err := globals.CollectorMQPool.Acquire()
|
||
if err != nil {
|
||
return false, fmt.Errorf("new collector client: %w", err)
|
||
}
|
||
defer colCli.Close()
|
||
|
||
neededCPU := t.Job.Info.Resources.CPU
|
||
if neededCPU > 0 {
|
||
resp, err := colCli.GetOneResourceData(collector.GetOneResourceData{
|
||
NodeId: t.preAdjustNodeID,
|
||
ResourceType: models.ResourceTypeCPU,
|
||
})
|
||
if err != nil {
|
||
return false, err
|
||
}
|
||
|
||
availCPU := resp.Data.(models.CPUResourceData).Available.Value
|
||
|
||
if float64(availCPU) < 1.5*neededCPU {
|
||
fmt.Printf("Schedule Scheme is wrong: Insufficient cpu")
|
||
return false, nil
|
||
}
|
||
}
|
||
|
||
neededNPU := t.Job.Info.Resources.NPU
|
||
if neededNPU > 0 {
|
||
resp, err := colCli.GetOneResourceData(collector.GetOneResourceData{
|
||
NodeId: t.preAdjustNodeID,
|
||
ResourceType: models.ResourceTypeNPU,
|
||
})
|
||
if err != nil {
|
||
return false, err
|
||
}
|
||
|
||
availNPU := resp.Data.(models.NPUResourceData).Available.Value
|
||
|
||
if float64(availNPU) < 1.5*neededNPU {
|
||
fmt.Printf("Schedule Scheme is wrong: Insufficient npu")
|
||
return false, nil
|
||
}
|
||
}
|
||
|
||
neededGPU := t.Job.Info.Resources.GPU
|
||
if neededGPU > 0 {
|
||
resp, err := colCli.GetOneResourceData(collector.GetOneResourceData{
|
||
NodeId: t.preAdjustNodeID,
|
||
ResourceType: models.ResourceTypeGPU,
|
||
})
|
||
if err != nil {
|
||
return false, err
|
||
}
|
||
|
||
availGPU := resp.Data.(models.GPUResourceData).Available.Value
|
||
|
||
if float64(availGPU) < 1.5*neededGPU {
|
||
fmt.Printf("Schedule Scheme is wrong: Insufficient gpu")
|
||
return false, nil
|
||
}
|
||
}
|
||
|
||
neededMLU := t.Job.Info.Resources.MLU
|
||
if neededMLU > 0 {
|
||
resp, err := colCli.GetOneResourceData(collector.GetOneResourceData{
|
||
NodeId: t.preAdjustNodeID,
|
||
ResourceType: models.ResourceTypeMLU,
|
||
})
|
||
if err != nil {
|
||
return false, err
|
||
}
|
||
|
||
availMLU := resp.Data.(models.MLUResourceData).Available.Value
|
||
|
||
if float64(availMLU) < 1.5*neededMLU {
|
||
fmt.Printf("Schedule Scheme is wrong: Insufficient mlu")
|
||
return false, nil
|
||
}
|
||
}
|
||
|
||
neededStorage := t.Job.Info.Resources.Storage
|
||
if neededStorage > 0 {
|
||
resp, err := colCli.GetOneResourceData(collector.GetOneResourceData{
|
||
NodeId: t.preAdjustNodeID,
|
||
ResourceType: models.ResourceTypeStorage,
|
||
})
|
||
if err != nil {
|
||
return false, err
|
||
}
|
||
|
||
availStorage := resp.Data.(models.StorageResourceData).Available.Value
|
||
|
||
bytesStorage := convertto.GBToBytes(availStorage)
|
||
|
||
if bytesStorage < int64(1.5*float64(neededStorage)) {
|
||
fmt.Printf("Schedule Scheme is wrong: Insufficient storage")
|
||
return false, nil
|
||
}
|
||
}
|
||
|
||
neededMemory := t.Job.Info.Resources.Memory
|
||
if neededMemory > 0 {
|
||
resp, err := colCli.GetOneResourceData(collector.GetOneResourceData{
|
||
NodeId: t.preAdjustNodeID,
|
||
ResourceType: models.ResourceTypeMemory,
|
||
})
|
||
if err != nil {
|
||
return false, err
|
||
}
|
||
|
||
availMemory := resp.Data.(models.MemoryResourceData).Available.Value
|
||
|
||
bytesMemory := convertto.GBToBytes(availMemory)
|
||
|
||
if bytesMemory < int64(1.5*float64(neededMemory)) {
|
||
fmt.Printf("Schedule Scheme is wrong: Insufficient memory")
|
||
return false, nil
|
||
}
|
||
}
|
||
return true, nil
|
||
|
||
}
|