added cron updateClusterResources

Former-commit-id: f38bd68df8
This commit is contained in:
tzwang 2024-06-06 10:53:48 +08:00
parent 04f0eae608
commit b61e434f75
6 changed files with 150 additions and 21 deletions

View File

@ -327,3 +327,79 @@ func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[st
return executorMap, collectorMap
}
func UpdateClusterResource(svc *svc.ServiceContext) {
list, err := svc.Scheduler.AiStorages.GetAdaptersByType("1")
if err != nil {
return
}
var wg sync.WaitGroup
for _, adapter := range list {
clusters, err := svc.Scheduler.AiStorages.GetClustersByAdapterId(adapter.Id)
if err != nil {
continue
}
for _, cluster := range clusters.List {
c := cluster
clusterResource, err := svc.Scheduler.AiStorages.GetClusterResourcesById(c.Id)
if err != nil {
continue
}
wg.Add(1)
go func() {
_, ok := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id]
if !ok {
wg.Done()
return
}
h := http.Request{}
stat, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][c.Id].GetResourceStats(h.Context())
if err != nil {
wg.Done()
return
}
if stat == nil {
wg.Done()
return
}
clusterType, err := strconv.ParseInt(adapter.Type, 10, 64)
if err != nil {
wg.Done()
return
}
var cardTotal int64
var topsTotal float64
for _, card := range stat.CardsAvail {
cardTotal += int64(card.CardNum)
topsTotal += card.TOpsAtFp16 * float64(card.CardNum)
}
if (models.TClusterResource{} == *clusterResource) {
err = svc.Scheduler.AiStorages.SaveClusterResources(c.Id, c.Name, clusterType, float64(stat.CpuCoreAvail), float64(stat.CpuCoreTotal),
stat.MemAvail, stat.MemTotal, stat.DiskAvail, stat.DiskTotal, float64(stat.GpuAvail), float64(stat.GpuTotal), cardTotal, topsTotal)
if err != nil {
wg.Done()
return
}
} else {
clusterResource.CardTotal = cardTotal
clusterResource.CardTopsTotal = topsTotal
clusterResource.CpuAvail = float64(stat.CpuCoreAvail)
clusterResource.CpuTotal = float64(stat.CpuCoreTotal)
clusterResource.MemAvail = stat.MemAvail
clusterResource.MemTotal = stat.MemTotal
clusterResource.DiskAvail = stat.DiskAvail
clusterResource.DiskTotal = stat.DiskTotal
err := svc.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
if err != nil {
wg.Done()
return
}
}
wg.Done()
}()
}
}
wg.Wait()
}

View File

@ -44,4 +44,7 @@ func AddCronGroup(svc *svc.ServiceContext) {
UpdateAiAdapterMaps(svc)
})
svc.Cron.AddFunc("30 21 * * *", func() {
UpdateClusterResource(svc)
})
}

View File

@ -137,6 +137,13 @@ func (l *GetCenterOverviewLogic) updateClusterResource(mu *sync.RWMutex, ch chan
} else {
clusterResource.CardTotal = cardTotal
clusterResource.CardTopsTotal = topsTotal
clusterResource.CpuAvail = float64(stat.CpuCoreAvail)
clusterResource.CpuTotal = float64(stat.CpuCoreTotal)
clusterResource.MemAvail = stat.MemAvail
clusterResource.MemTotal = stat.MemTotal
clusterResource.DiskAvail = stat.DiskAvail
clusterResource.DiskTotal = stat.DiskTotal
err := l.svcCtx.Scheduler.AiStorages.UpdateClusterResources(clusterResource)
if err != nil {
mu.Unlock()

View File

@ -2,12 +2,11 @@ package core
import (
"context"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker"
"strconv"
"github.com/zeromicro/go-zero/core/logx"
tool "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
)
type SyncClusterLoadLogic struct {
@ -25,24 +24,10 @@ func NewSyncClusterLoadLogic(ctx context.Context, svcCtx *svc.ServiceContext) *S
}
func (l *SyncClusterLoadLogic) SyncClusterLoad(req *types.SyncClusterLoadReq) error {
if len(req.ClusterLoadRecords) != 0 {
for _, record := range req.ClusterLoadRecords {
tracker.ClusterCpuUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuUtilisation)
tracker.ClusterCpuAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuAvail)
tracker.ClusterCpuTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuTotal)
tracker.ClusterMemoryUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryUtilisation)
tracker.ClusterMemoryAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryAvail)
tracker.ClusterMemoryTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryTotal)
tracker.ClusterDiskUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskUtilisation)
tracker.ClusterDiskAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskAvail)
tracker.ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal)
tracker.ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation)
tracker.ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount))
tracker.ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal))
}
if nil != req.ClusterLoadRecords {
var param tracker.ClusterLoadRecord
tool.Convert(req, &param)
tracker.SyncClusterLoad(param)
}
return nil
}

View File

@ -6,6 +6,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker"
"gorm.io/gorm"
"strconv"
"time"
@ -211,6 +212,17 @@ func (s *AiStorage) SaveClusterResources(clusterId string, clusterName string, c
if tx.Error != nil {
return tx.Error
}
// prometheus
param := tracker.ClusterLoadRecord{
ClusterName: clusterName,
CpuAvail: cpuAvail,
CpuTotal: cpuTotal,
MemoryAvail: memAvail,
MemoryTotal: memTotal,
DiskAvail: diskAvail,
DiskTotal: diskTotal,
}
tracker.SyncClusterLoad(param)
return nil
}
@ -219,6 +231,17 @@ func (s *AiStorage) UpdateClusterResources(clusterResource *models.TClusterResou
if tx.Error != nil {
return tx.Error
}
// prometheus
param := tracker.ClusterLoadRecord{
ClusterName: clusterResource.ClusterName,
CpuAvail: clusterResource.CpuAvail,
CpuTotal: clusterResource.CpuTotal,
MemoryAvail: clusterResource.MemAvail,
MemoryTotal: clusterResource.MemTotal,
DiskAvail: clusterResource.DiskAvail,
DiskTotal: clusterResource.DiskTotal,
}
tracker.SyncClusterLoad(param)
return nil
}

View File

@ -107,6 +107,23 @@ var (
}
)
type ClusterLoadRecord struct {
AdapterId int64 `json:"adapterId,optional"`
ClusterName string `json:"clusterName,optional"`
CpuAvail float64 `json:"cpuAvail,optional"`
CpuTotal float64 `json:"cpuTotal,optional"`
CpuUtilisation float64 `json:"cpuUtilisation,optional"`
MemoryAvail float64 `json:"memoryAvail,optional"`
MemoryUtilisation float64 `json:"memoryUtilisation,optional"`
MemoryTotal float64 `json:"memoryTotal,optional"`
DiskAvail float64 `json:"diskAvail,optional"`
DiskTotal float64 `json:"diskTotal,optional"`
DiskUtilisation float64 `json:"diskUtilisation,optional"`
PodsUtilisation float64 `json:"podsUtilisation,optional"`
PodsCount int64 `json:"podsCount,optional"`
PodsTotal int64 `json:"podsTotal,optional"`
}
func init() {
prometheus.MustRegister(metrics...)
}
@ -302,3 +319,21 @@ func (p Prometheus) GetRawData(expr string, o QueryOption) (model.Value, error)
}
return value, nil
}
func SyncClusterLoad(record ClusterLoadRecord) {
ClusterCpuUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuUtilisation)
ClusterCpuAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuAvail)
ClusterCpuTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.CpuTotal)
ClusterMemoryUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryUtilisation)
ClusterMemoryAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryAvail)
ClusterMemoryTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.MemoryTotal)
ClusterDiskUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskUtilisation)
ClusterDiskAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskAvail)
ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal)
ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation)
ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount))
ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal))
}