188 lines
4.6 KiB
Go
188 lines
4.6 KiB
Go
package jobmgr
|
|
|
|
import (
|
|
"github.com/patrickmn/go-cache"
|
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
|
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
type NodeService struct {
|
|
RunningModels map[string]schsdk.RunningModelInfo
|
|
NodeUsageCache map[schsdk.JobID]*cache.Cache
|
|
Lock sync.Mutex
|
|
}
|
|
|
|
func NewNodeService() *NodeService {
|
|
return &NodeService{
|
|
NodeUsageCache: make(map[schsdk.JobID]*cache.Cache),
|
|
RunningModels: make(map[string]schsdk.RunningModelInfo),
|
|
}
|
|
}
|
|
|
|
// SetNodeData 新增节点
|
|
func (s *NodeService) SetNodeData(jobSetID schsdk.JobSetID, modelJobInfo schsdk.ModelJobInfo, node schsdk.NodeInfo) {
|
|
key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
|
|
value, ok := s.RunningModels[key]
|
|
// 如果不存在
|
|
if !ok {
|
|
nodes := []schsdk.NodeInfo{node}
|
|
value = schsdk.RunningModelInfo{
|
|
JobSetID: jobSetID,
|
|
Nodes: nodes,
|
|
ModelID: modelJobInfo.ModelID,
|
|
// 这里的model name应该从数据库中查询
|
|
ModelName: "",
|
|
CustomModelName: modelJobInfo.CustomModelName,
|
|
}
|
|
s.RunningModels[key] = value
|
|
return
|
|
}
|
|
// 如果存在
|
|
value.Nodes = append(value.Nodes, node)
|
|
s.RunningModels[key] = value
|
|
}
|
|
|
|
// RemoveNodeFromRunningModels 移除节点
|
|
func (s *NodeService) RemoveNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID) {
|
|
key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
|
|
value, ok := s.RunningModels[key]
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
for i := 0; i < len(value.Nodes); i++ {
|
|
node := value.Nodes[i]
|
|
if node.InstanceID == instanceID {
|
|
value.Nodes = append(value.Nodes[:i], value.Nodes[i+1:]...)
|
|
s.RunningModels[key] = value
|
|
logger.Info("remove node success from running models, job id: " + instanceID)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
func (s *NodeService) UpdateNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID, status string) {
|
|
key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
|
|
value, ok := s.RunningModels[key]
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
for i := 0; i < len(value.Nodes); i++ {
|
|
node := value.Nodes[i]
|
|
if node.InstanceID == instanceID {
|
|
node.Status = status
|
|
logger.Info("update node success from running models, job id: " + instanceID)
|
|
value.Nodes[i] = node
|
|
s.RunningModels[key] = value
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
func (s *NodeService) GetAvailableNodes() map[string]schsdk.RunningModelInfo {
|
|
return s.RunningModels
|
|
}
|
|
|
|
func (s *NodeService) GetNodeUsageRateInfo(customModelName schsdk.ModelName, modelID schsdk.ModelID) []schsdk.NodeUsageRateInfo {
|
|
var rateInfos []schsdk.NodeUsageRateInfo
|
|
|
|
key := string(customModelName) + "_" + string(modelID)
|
|
value, ok := s.RunningModels[key]
|
|
if !ok {
|
|
return nil
|
|
}
|
|
for i := 0; i < len(value.Nodes); i++ {
|
|
node := value.Nodes[i]
|
|
c, ok := s.NodeUsageCache[node.InstanceID]
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
rateInfo := getCacheData(c)
|
|
rateInfo.InstanceID = node.InstanceID
|
|
rateInfo.Address = node.Address
|
|
|
|
rateInfos = append(rateInfos, rateInfo)
|
|
}
|
|
|
|
return rateInfos
|
|
}
|
|
|
|
func (s *NodeService) SetNodeUsageRateInfo(key schsdk.JobID, value string) {
|
|
timeStamp := strconv.FormatInt(time.Now().Unix(), 10)
|
|
ch, ok := s.NodeUsageCache[key]
|
|
if !ok {
|
|
ch = cache.New(time.Minute*60, time.Minute*60)
|
|
ch.Set(timeStamp, value, cache.DefaultExpiration)
|
|
s.NodeUsageCache[key] = ch
|
|
return
|
|
}
|
|
ch.Set(timeStamp, value, cache.DefaultExpiration)
|
|
}
|
|
|
|
func getCacheData(c *cache.Cache) schsdk.NodeUsageRateInfo {
|
|
|
|
var nodeUsageRateInfo schsdk.NodeUsageRateInfo
|
|
|
|
infoMap := make(map[string][]schsdk.UsageRate)
|
|
|
|
// 获取缓存中的所有项
|
|
|
|
items := c.Items()
|
|
|
|
// 遍历缓存项,将其放入 map 中
|
|
for tmstamp, item := range items {
|
|
|
|
msg := item.Object.(string)
|
|
arr1 := strings.Split(msg, "\n")
|
|
// 提取所有kv
|
|
for i := 0; i < len(arr1); i++ {
|
|
arr2 := strings.Split(arr1[i], ":")
|
|
if len(arr2) != 2 {
|
|
continue
|
|
}
|
|
key := strings.TrimSpace(arr2[0])
|
|
value := strings.TrimSpace(arr2[1])
|
|
rate, ok := infoMap[key]
|
|
if !ok {
|
|
infoMap[key] = []schsdk.UsageRate{
|
|
{
|
|
Timestamp: tmstamp,
|
|
Number: value,
|
|
},
|
|
}
|
|
continue
|
|
}
|
|
|
|
rate = append(rate, schsdk.UsageRate{
|
|
Timestamp: tmstamp,
|
|
Number: value,
|
|
})
|
|
infoMap[key] = rate
|
|
}
|
|
}
|
|
|
|
for k, v := range infoMap {
|
|
// 对v 进行排序
|
|
sort.Slice(v, func(i, j int) bool {
|
|
return v[i].Timestamp < v[j].Timestamp
|
|
})
|
|
switch k {
|
|
case schsdk.MemoryUtilization:
|
|
nodeUsageRateInfo.MemoryUtilization = v
|
|
case schsdk.GPUUtilization:
|
|
nodeUsageRateInfo.GPUUtilization = v
|
|
case schsdk.CPUUtilization:
|
|
nodeUsageRateInfo.CPUUtilization = v
|
|
}
|
|
}
|
|
|
|
return nodeUsageRateInfo
|
|
}
|