Monitoring interface changes

zhangwei 2025-05-30 16:35:13 +08:00
parent 4e59d689e9
commit fe8d858cb3
14 changed files with 7094 additions and 7275 deletions

View File

@ -59,9 +59,9 @@ type (
Type int64 `json:"type"` // Tenant type (0 data computing, 1 supercomputing, 2 AI computing)
DeletedFlag int64 `json:"deletedFlag"` // Deleted or not
CreatedBy int64 `json:"createdBy"` // Created by
CreateTime string `json:"createdTime"` // Creation time
UpdatedBy int64 `json:"updatedBy"` // Updated by
UpdateTime string `json:"updated_time"` // Update time
}
UpdateTenantReq {
@ -103,6 +103,7 @@ type DataSet {
type cloudListResp {
Clouds []Cloud `json:"clouds"`
}
type Cloud {
Id int64 `json:"id"` // id
TaskId int64 `json:"taskId"` // Task ID
@ -115,6 +116,13 @@ type Cloud {
StartTime string `json:"startTime"` // Start time
RunningTime int64 `json:"runningTime"` // Running duration
CreatedBy int64 `json:"createdBy"` // Created by
CreateTime string `json:"createdTime"` // Creation time
Result string `json:"result"`
}
type PodsListReq {
ClusterName string `form:"clusterName"`
}
type PodsListResp {
Data []interface{} `json:"data"`
}
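The new PodsListReq/PodsListResp pair only carries a cluster name in and an untyped data slice out. Below is a minimal logic-layer sketch of how such an endpoint could be served with client-go; the function name, the way the per-cluster clientset is obtained, and the package layout are illustrative assumptions, not part of this commit.

package logic

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// listPods gathers all pods of one cluster into a generic slice, matching the
// []interface{} shape of PodsListResp.Data.
func listPods(ctx context.Context, client kubernetes.Interface) ([]interface{}, error) {
	pods, err := client.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	data := make([]interface{}, 0, len(pods.Items))
	for i := range pods.Items {
		data = append(data, pods.Items[i])
	}
	return data, nil
}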

View File

@ -128,6 +128,7 @@ type (
PodsUtilisation float64 `json:"podsUtilisation,optional"`
PodsCount int64 `json:"podsCount,optional"`
PodsTotal int64 `json:"podsTotal,optional"`
NodeCount float64 `json:"nodeCount,optional"`
}
)
@ -1455,7 +1456,7 @@ type EditResourceReq {
CostType string `json:"costType" gorm:"column:cost_type"` // Billing type: hourly, daily, monthly, perUse
Type string `json:"type,optional" gorm:"column:type"`
// Basic resource specification
StorageValue string `json:"storageValue,optional"`
StorageUnit string `json:"storageUnit,optional"`
CpuValue string `json:"cpuValue,optional"`

View File

@ -203,7 +203,7 @@ service pcm {
@doc "Delete resource specification"
@handler deleteResourceSpecHandler
delete /core/ai/resourceSpec/delete/:id (DeletePathId) returns (CommonResp)
// Cluster resource specification ----- end
}
// HPC secondary interfaces
@ -289,6 +289,9 @@ service pcm {
@handler podLogs
post /cloud/pod/logs (PodLogsReq) returns (string)
@handler podsList
get /cloud/pods/list (PodsListReq) returns (PodsListResp)
}
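For reference, a client-side sketch of calling the newly routed /cloud/pods/list endpoint follows; the host, port, and route prefix are assumptions, since the commit only shows the relative path.

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// clusterName is bound from the query string via the `form:"clusterName"` tag.
	resp, err := http.Get("http://localhost:8999/pcm/v1/cloud/pods/list?clusterName=demo-cluster")
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // expected shape: {"data":[ ... ]}
}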
// AI computing secondary interfaces
@ -441,7 +444,7 @@ service pcm {
@doc "Text recognition"
@handler ChatHandler
post /ai/chat (ChatReq) returns (ChatResult)
/******chat end***********/
}
// Screen interfaces
@ -1130,5 +1133,4 @@ service pcm {
@handler scheduleSituationHandler
get /monitoring/schedule/situation returns (scheduleSituationResp)
}
}

go.mod
View File

@ -12,6 +12,7 @@ require (
github.com/golang-jwt/jwt/v5 v5.2.2
github.com/jinzhu/copier v0.4.0
github.com/json-iterator/go v1.1.12
github.com/mitchellh/mapstructure v1.5.0
github.com/pkg/errors v0.9.1
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
github.com/prometheus/alertmanager v0.27.0
@ -35,6 +36,7 @@ require (
gorm.io/datatypes v1.2.0
gorm.io/driver/mysql v1.5.7
gorm.io/gorm v1.25.12
k8s.io/api v0.31.4
k8s.io/apimachinery v0.31.4
k8s.io/client-go v0.31.4
sigs.k8s.io/yaml v1.4.0
@ -120,7 +122,6 @@ require (
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/miekg/dns v1.1.58 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
@ -186,7 +187,6 @@ require (
google.golang.org/protobuf v1.36.5 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
k8s.io/api v0.31.4 // indirect
k8s.io/klog/v2 v2.130.1 // indirect
k8s.io/kube-openapi v0.0.0-20241127205056-99599406b04f // indirect
k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect

File diff suppressed because it is too large.

View File

@ -39,7 +39,7 @@ func NewCloudListLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CloudLi
func (l *CloudListLogic) CloudList() (resp *types.CloudListResp, err error) {
// Query the list of data-computing tasks from the database
var clouds []*models.Cloud
var clouds []*models.TaskCloud
tx := l.svcCtx.DbEngin.Find(&clouds)
if tx.Error != nil {
return nil, tx.Error

View File

@ -47,8 +47,8 @@ func (l *DeleteTaskLogic) DeleteTask(req *types.DeleteTaskReq) error {
return tx.Error
}
// Mark subtasks as pending deletion
tx = l.svcCtx.DbEngin.Model(&models.Cloud{}).Where("task_id", req.Id).Update("status", constants.WaitDelete)
l.svcCtx.DbEngin.Where("task_id = ?", req.Id).Delete(&models.Cloud{}, req.Id)
tx = l.svcCtx.DbEngin.Model(&models.TaskCloud{}).Where("task_id", req.Id).Update("status", constants.WaitDelete)
l.svcCtx.DbEngin.Where("task_id = ?", req.Id).Delete(&models.TaskCloud{}, req.Id)
if tx.Error != nil {
return tx.Error
}

View File

@ -19,10 +19,6 @@ import (
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/enum"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/httputils"
"k8s.io/apimachinery/pkg/util/json"
)
type JobTotalLogic struct {
@ -55,60 +51,60 @@ func NewJobTotalLogic(ctx context.Context, svcCtx *svc.ServiceContext) *JobTotal
}
func (l *JobTotalLogic) JobTotal() (resp *types.JobTotalResp, err error) {
// Get task timing information
resp = &types.JobTotalResp{}
bytes, err := httputils.HttpGet("GET", "http://grampus.openi.org.cn/openapi/v1/sharescreen/computepower/alljobinfo")
if err != nil {
return nil, err
}
json.Unmarshal(bytes, resp)
// Get other task information
jobs := &Job{}
jobBytes, err := httputils.HttpGet("GET", "http://grampus.openi.org.cn/openapi/v1/sharescreen/trainjob?pageIndex=1&pageSize=10")
if err != nil {
return nil, err
}
json.Unmarshal(jobBytes, jobs)
for _, job := range jobs.OtJobs {
trainJob := types.TrainJob{
Name: job.Name,
Status: enum.ExternalStatus(job.Status).String(),
Strategy: 0,
SynergyStatus: "未协同",
}
if job.Tasks[0].CenterName != nil {
trainJob.ParticipantName = job.Tasks[0].CenterName[0]
}
resp.TrainJobs = append(resp.TrainJobs, trainJob)
}
var tasks []models.Task
tx := l.svcCtx.DbEngin.Find(&tasks)
if tx.Error != nil {
logx.Error(err)
return nil, tx.Error
}
if len(tasks) == 0 {
return nil, nil
}
for _, task := range tasks {
var participantName string
tx := l.svcCtx.DbEngin.Raw("SELECT name from sc_participant_phy_info where id in (SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.participant_id) ,GROUP_CONCAT(DISTINCT a.participant_id) ,GROUP_CONCAT(DISTINCT c.participant_id))as service_name from task t left join hpc h on t.id = h.task_id left join cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?)", task.Id).Scan(&participantName)
if tx.Error != nil {
logx.Error(err)
return nil, tx.Error
}
// Convert participant (undertaker) information
resp.TrainJobs = append(resp.TrainJobs, types.TrainJob{
ParticipantName: participantName,
Name: task.Name,
Strategy: int(task.Strategy),
SynergyStatus: enum.SynergyStatus(task.SynergyStatus).String(),
Status: task.Status,
})
}
//// Get task timing information
//resp = &types.JobTotalResp{}
//bytes, err := httputils.HttpGet("http://grampus.openi.org.cn/openapi/v1/sharescreen/computepower/alljobinfo")
//if err != nil {
// return nil, err
//}
//json.Unmarshal(bytes, resp)
//
//// Get other task information
//jobs := &Job{}
//jobBytes, err := httputils.HttpGet("http://grampus.openi.org.cn/openapi/v1/sharescreen/trainjob?pageIndex=1&pageSize=10")
//if err != nil {
// return nil, err
//}
//json.Unmarshal(jobBytes, jobs)
//
//for _, job := range jobs.OtJobs {
// trainJob := types.TrainJob{
// Name: job.Name,
// Status: enum.ExternalStatus(job.Status).String(),
// Strategy: 0,
// SynergyStatus: "未协同",
// }
// if job.Tasks[0].CenterName != nil {
// trainJob.ParticipantName = job.Tasks[0].CenterName[0]
// }
// resp.TrainJobs = append(resp.TrainJobs, trainJob)
//}
//
//var tasks []models.Task
//tx := l.svcCtx.DbEngin.Find(&tasks)
//if tx.Error != nil {
// logx.Error(err)
// return nil, tx.Error
//}
//if len(tasks) == 0 {
// return nil, nil
//}
//for _, task := range tasks {
// var participantName string
// tx := l.svcCtx.DbEngin.Raw("SELECT name from sc_participant_phy_info where id in (SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.participant_id) ,GROUP_CONCAT(DISTINCT a.participant_id) ,GROUP_CONCAT(DISTINCT c.participant_id))as service_name from task t left join hpc h on t.id = h.task_id left join cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?)", task.Id).Scan(&participantName)
// if tx.Error != nil {
// logx.Error(err)
// return nil, tx.Error
// }
// // Convert participant (undertaker) information
// resp.TrainJobs = append(resp.TrainJobs, types.TrainJob{
// ParticipantName: participantName,
// Name: task.Name,
// Strategy: int(task.Strategy),
// SynergyStatus: enum.SynergyStatus(task.SynergyStatus).String(),
// Status: task.Status,
// })
//
//}
return resp, nil
}

View File

@ -27,7 +27,7 @@ func NewClustersLoadLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Clus
func (l *ClustersLoadLogic) ClustersLoad(req *types.ClustersLoadReq) (resp *types.ClustersLoadResp, err error) {
resp = &types.ClustersLoadResp{}
metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total", "cluster_pod_utilisation"}
metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total", "cluster_pod_utilisation", "cluster_node_count"}
result := l.svcCtx.PromClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{ClusterName: req.ClusterName})
resp.Data = result
return resp, nil

File diff suppressed because it is too large.

View File

@ -14,7 +14,7 @@ import (
)
var (
cloudFieldNames = builder.RawFieldNames(&Cloud{})
cloudFieldNames = builder.RawFieldNames(&TaskCloud{})
cloudRows = strings.Join(cloudFieldNames, ",")
cloudRowsExpectAutoSet = strings.Join(stringx.Remove(cloudFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",")
cloudRowsWithPlaceHolder = strings.Join(stringx.Remove(cloudFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?"
@ -22,10 +22,10 @@ var (
type (
cloudModel interface {
Insert(ctx context.Context, data *Cloud) (sql.Result, error)
FindOne(ctx context.Context, id int64) (*Cloud, error)
FindOneByNamespaceNameServiceName(ctx context.Context, namespace sql.NullString, name sql.NullString, serviceName sql.NullString) (*Cloud, error)
Update(ctx context.Context, data *Cloud) error
Insert(ctx context.Context, data *TaskCloud) (sql.Result, error)
FindOne(ctx context.Context, id int64) (*TaskCloud, error)
FindOneByNamespaceNameServiceName(ctx context.Context, namespace sql.NullString, name sql.NullString, serviceName sql.NullString) (*TaskCloud, error)
Update(ctx context.Context, data *TaskCloud) error
Delete(ctx context.Context, id int64) error
}
@ -34,7 +34,7 @@ type (
table string
}
Cloud struct {
TaskCloud struct {
Id int64 `db:"id"` // id
TaskId int64 `db:"task_id"` // Task ID
ParticipantId int64 `db:"participant_id"` // Cluster static info ID
@ -56,7 +56,7 @@ type (
func newCloudModel(conn sqlx.SqlConn) *defaultCloudModel {
return &defaultCloudModel{
conn: conn,
table: "`cloud`",
table: "`task_cloud`",
}
}
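Because the GORM-based logic above (CloudList, DeleteTask) now also references models.TaskCloud, the renamed struct presumably needs an explicit table mapping so GORM resolves it to task_cloud rather than a pluralized default. A hedged sketch of that mapping follows; the TaskCloud model itself already exists in the models package, only this method is illustrative.

package models

// TableName maps the renamed TaskCloud model to the task_cloud table,
// keeping GORM queries consistent with the sqlx model above.
func (TaskCloud) TableName() string {
	return "task_cloud"
}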

View File

@ -31,6 +31,7 @@ var promQLTemplates = map[string]string{
"cluster_memory_avail": "cluster_memory_avail{$1}",
"cluster_disk_avail": "cluster_disk_avail{$1}",
"cluster_pod_utilisation": "cluster_pod_utilisation{$1}",
"cluster_node_count": `cluster_pod_utilisation{$1}`,
// center
"center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})",

View File

@ -90,6 +90,10 @@ var (
Name: "cluster_gpu_avail",
Help: "Cluster Gpu Available.",
}, []string{"cluster_name", "adapter_id"})
ClusterNodeCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_node_count",
Help: "Cluster Node Count.",
}, []string{"cluster_name", "adapter_id"})
metrics = []prometheus.Collector{
ClusterCpuUtilisationGauge,
@ -104,6 +108,7 @@ var (
ClusterPodUtilisationGauge,
ClusterPodCountGauge,
ClusterPodTotalGauge,
ClusterNodeCountGauge,
}
)
@ -122,6 +127,7 @@ type ClusterLoadRecord struct {
PodsUtilisation float64 `json:"podsUtilisation,optional"`
PodsCount int64 `json:"podsCount,optional"`
PodsTotal int64 `json:"podsTotal,optional"`
NodeCount float64 `json:"nodeCount,optional"`
}
func init() {
@ -333,7 +339,9 @@ func SyncClusterLoad(record ClusterLoadRecord) {
ClusterDiskAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskAvail)
ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal)
ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation)
ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount))
ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal))
//ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation)
//ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount))
//ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal))
ClusterNodeCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.NodeCount)
}
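A hedged usage sketch of reporting the new node count through SyncClusterLoad; the import path, adapter id, and cluster name are illustrative assumptions, and real callers would also fill the CPU, memory, and disk fields of ClusterLoadRecord.

package main

import "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker"

func main() {
	// Only the identifying labels and the node count are set here; the
	// cluster_node_count gauge is then exposed with cluster_name and
	// adapter_id labels.
	tracker.SyncClusterLoad(tracker.ClusterLoadRecord{
		AdapterId:   1,              // illustrative adapter id
		ClusterName: "demo-cluster", // illustrative cluster name
		NodeCount:   3,
	})
}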

View File

@ -62,13 +62,6 @@ func NewHttpsClient() *resty.Client {
return c
}
func GetHttpRequest() *resty.Request {
client := resty.New()
request := client.R()
return request
}
func HttpClient(method string, url string, payload io.Reader, token string) ([]byte, error) {
request, err := http.NewRequest(method, url, payload)
request.Header.Add("Content-Type", "application/json")
@ -88,20 +81,17 @@ func HttpClient(method string, url string, payload io.Reader, token string) ([]b
return body, err
}
func HttpGet(method string, url string) ([]byte, error) {
request, err := http.NewRequest(method, url, nil)
client := &http.Client{}
res, err := client.Do(request)
if err != nil {
log.Fatal(err)
}
defer res.Body.Close()
body, err := io.ReadAll(res.Body)
if err != nil {
log.Fatal(err)
func HttpGetWithResult(params map[string]string, url string, result interface{}) error {
client := NewHttpsClient()
req := client.R()
// Add query parameters
for k, v := range params {
req.SetQueryParam(k, v)
}
return body, err
_, err := req.SetResult(result).Get(url)
return err
}
// Send a POST request
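A usage sketch of the new HttpGetWithResult helper, letting resty unmarshal the JSON body straight into a caller-supplied struct; the URL, query parameters, and response shape below are illustrative, not part of this commit.

package main

import (
	"fmt"

	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/httputils"
)

type versionResp struct {
	Version string `json:"version"`
}

func main() {
	var out versionResp
	params := map[string]string{"verbose": "false"}
	// The helper sets each entry of params as a query parameter and passes
	// &out to resty's SetResult for JSON decoding on a successful response.
	if err := httputils.HttpGetWithResult(params, "https://example.com/api/version", &out); err != nil {
		fmt.Println("request failed:", err)
		return
	}
	fmt.Println(out.Version)
}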