JCC-CSScheduler/manager/internal/jobmgr/node_info.go

188 lines
4.6 KiB
Go

package jobmgr
import (
"github.com/patrickmn/go-cache"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"sort"
"strconv"
"strings"
"sync"
"time"
)
// NodeService is an in-memory registry of running models, the nodes serving
// them, and the usage reports received for each node.
type NodeService struct {
// RunningModels maps the key "<CustomModelName>_<ModelID>" to the running
// model's info, including the list of nodes registered for it.
RunningModels map[string]schsdk.RunningModelInfo
// NodeUsageCache maps a node's instance ID to a cache of raw usage reports.
// Each report is stored under the Unix-second timestamp (as a decimal string)
// at which it was recorded.
NodeUsageCache map[schsdk.JobID]*cache.Cache
// Lock is not acquired by any method visible in this section of the file.
// NOTE(review): presumably callers are expected to hold it around accesses
// to the maps above — confirm against the call sites.
Lock sync.Mutex
}
// NewNodeService returns a NodeService whose RunningModels and NodeUsageCache
// maps are allocated and empty, so the service can be written to immediately.
func NewNodeService() *NodeService {
	svc := new(NodeService)
	svc.RunningModels = map[string]schsdk.RunningModelInfo{}
	svc.NodeUsageCache = map[schsdk.JobID]*cache.Cache{}
	return svc
}
// SetNodeData registers node under the running model identified by
// modelJobInfo (key "<CustomModelName>_<ModelID>").
//
// If the model is already tracked, node is appended to its node list and the
// other stored fields (including JobSetID) are left as they were. Otherwise a
// new RunningModelInfo is created with node as its only entry.
func (s *NodeService) SetNodeData(jobSetID schsdk.JobSetID, modelJobInfo schsdk.ModelJobInfo, node schsdk.NodeInfo) {
	modelKey := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)

	// Model already tracked: just add the node.
	if info, found := s.RunningModels[modelKey]; found {
		info.Nodes = append(info.Nodes, node)
		s.RunningModels[modelKey] = info
		return
	}

	// First node for this model: create the entry.
	s.RunningModels[modelKey] = schsdk.RunningModelInfo{
		JobSetID: jobSetID,
		Nodes:    []schsdk.NodeInfo{node},
		ModelID:  modelJobInfo.ModelID,
		// The model name should be looked up from the database; it is left
		// empty here.
		ModelName:       "",
		CustomModelName: modelJobInfo.CustomModelName,
	}
}
// RemoveNodeFromRunningModels removes the first node whose InstanceID equals
// instanceID from the running model identified by modelJobInfo
// (key "<CustomModelName>_<ModelID>").
//
// It is a no-op when the model is not tracked or no node matches. The model
// entry itself is kept even if its node list becomes empty, and the node's
// entry in NodeUsageCache is not touched.
func (s *NodeService) RemoveNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID) {
	key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
	value, ok := s.RunningModels[key]
	if !ok {
		return
	}

	for i := range value.Nodes {
		if value.Nodes[i].InstanceID != instanceID {
			continue
		}

		// Build a fresh slice instead of deleting in place. An in-place
		// append(nodes[:i], nodes[i+1:]...) shifts elements inside the backing
		// array, which is shared with every RunningModelInfo copy previously
		// read from the map (GetAvailableNodes hands out the live map). Such
		// a copy would then see the removed node replaced and the last node
		// duplicated.
		remaining := make([]schsdk.NodeInfo, 0, len(value.Nodes)-1)
		remaining = append(remaining, value.Nodes[:i]...)
		remaining = append(remaining, value.Nodes[i+1:]...)

		value.Nodes = remaining
		s.RunningModels[key] = value
		logger.Info("remove node success from running models, job id: " + instanceID)
		return
	}
}
// UpdateNodeFromRunningModels sets the Status of the first node whose
// InstanceID equals instanceID within the running model identified by
// modelJobInfo (key "<CustomModelName>_<ModelID>").
//
// It is a no-op when the model is not tracked or no node matches.
func (s *NodeService) UpdateNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID, status string) {
	modelKey := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)

	info, found := s.RunningModels[modelKey]
	if !found {
		return
	}

	for idx := range info.Nodes {
		if info.Nodes[idx].InstanceID != instanceID {
			continue
		}

		logger.Info("update node success from running models, job id: " + instanceID)
		// The slice element is updated in place, then the model info is
		// stored back under the same key.
		info.Nodes[idx].Status = status
		s.RunningModels[modelKey] = info
		return
	}
}
// GetAvailableNodes returns the service's RunningModels map, keyed by
// "<CustomModelName>_<ModelID>". The returned map is the live internal map,
// not a copy, so changes made through it are visible to the service.
func (s *NodeService) GetAvailableNodes() map[string]schsdk.RunningModelInfo {
	models := s.RunningModels
	return models
}
// GetNodeUsageRateInfo collects the cached usage history of every node that
// belongs to the running model keyed by "<customModelName>_<modelID>".
//
// Nodes without an entry in NodeUsageCache are skipped. Each returned element
// carries the node's InstanceID and Address in addition to the usage series
// built by getCacheData. The result is nil when the model is not tracked or
// when none of its nodes has cached usage data.
func (s *NodeService) GetNodeUsageRateInfo(customModelName schsdk.ModelName, modelID schsdk.ModelID) []schsdk.NodeUsageRateInfo {
	model, found := s.RunningModels[string(customModelName)+"_"+string(modelID)]
	if !found {
		return nil
	}

	var result []schsdk.NodeUsageRateInfo
	for _, node := range model.Nodes {
		usage, cached := s.NodeUsageCache[node.InstanceID]
		if !cached {
			continue
		}

		info := getCacheData(usage)
		info.InstanceID = node.InstanceID
		info.Address = node.Address
		result = append(result, info)
	}
	return result
}
// SetNodeUsageRateInfo records a raw usage report (value) for the node
// instance identified by key.
//
// The report is stored in that node's cache under the current Unix time in
// seconds, formatted as a decimal string, so a second report within the same
// second replaces the first. A cache is created on the first report for a
// node; entries expire after one hour and the cleanup interval is one hour.
func (s *NodeService) SetNodeUsageRateInfo(key schsdk.JobID, value string) {
	stamp := strconv.FormatInt(time.Now().Unix(), 10)

	usage, exists := s.NodeUsageCache[key]
	if !exists {
		usage = cache.New(time.Hour, time.Hour)
		s.NodeUsageCache[key] = usage
	}
	usage.Set(stamp, value, cache.DefaultExpiration)
}
// getCacheData converts the raw usage reports held in c into per-metric
// series.
//
// Every cache item is expected to be a string made of newline-separated
// "key: value" lines, and its cache key is used as the sample timestamp.
// Lines that do not split into exactly two parts on ":" are ignored. Samples
// are grouped by key and sorted by timestamp (string comparison). Only the
// series for schsdk.MemoryUtilization, schsdk.GPUUtilization and
// schsdk.CPUUtilization are copied into the result. InstanceID and Address
// are left for the caller to fill in.
func getCacheData(c *cache.Cache) schsdk.NodeUsageRateInfo {
	series := map[string][]schsdk.UsageRate{}

	// Walk all items currently returned by the cache and split each report
	// into its key/value lines.
	for stamp, entry := range c.Items() {
		report := entry.Object.(string)
		for _, line := range strings.Split(report, "\n") {
			fields := strings.Split(line, ":")
			if len(fields) != 2 {
				continue
			}

			metric := strings.TrimSpace(fields[0])
			// Appending to a missing key starts a new series for that metric.
			series[metric] = append(series[metric], schsdk.UsageRate{
				Timestamp: stamp,
				Number:    strings.TrimSpace(fields[1]),
			})
		}
	}

	var result schsdk.NodeUsageRateInfo
	for metric, points := range series {
		// Map iteration order is random, so order each series by timestamp.
		sort.Slice(points, func(a, b int) bool {
			return points[a].Timestamp < points[b].Timestamp
		})

		switch metric {
		case schsdk.MemoryUtilization:
			result.MemoryUtilization = points
		case schsdk.GPUUtilization:
			result.GPUUtilization = points
		case schsdk.CPUUtilization:
			result.CPUUtilization = points
		}
	}
	return result
}