forked from JointCloud/pcm-coordinator
监控接口修改
This commit is contained in:
parent
4e59d689e9
commit
fe8d858cb3
|
@ -59,9 +59,9 @@ type (
|
|||
Type int64 `json:"type"` // 租户所属(0数算,1超算,2智算)
|
||||
DeletedFlag int64 `json:"deletedFlag"` // 是否删除
|
||||
CreatedBy int64 `json:"createdBy"` // 创建人
|
||||
CreateTime string `json:"createdTime"` // 创建时间
|
||||
CreateTime string `json:"createdTime"` // 创建时间
|
||||
UpdatedBy int64 `json:"updatedBy"` // 更新人
|
||||
UpdateTime string `json:"updated_time"` // 更新时间
|
||||
UpdateTime string `json:"updated_time"` // 更新时间
|
||||
}
|
||||
|
||||
UpdateTenantReq {
|
||||
|
@ -103,6 +103,7 @@ type DataSet {
|
|||
type cloudListResp {
|
||||
Clouds []Cloud `json:"clouds"`
|
||||
}
|
||||
|
||||
type Cloud {
|
||||
Id int64 `json:"id"` // id
|
||||
TaskId int64 `json:"taskId"` // 任务id
|
||||
|
@ -115,6 +116,13 @@ type Cloud {
|
|||
StartTime string `json:"startTime"` // 开始时间
|
||||
RunningTime int64 `json:"runningTime"` // 运行时长
|
||||
CreatedBy int64 `json:"createdBy"` // 创建人
|
||||
CreateTime string `json:"createdTime"` // 创建时间
|
||||
CreateTime string `json:"createdTime"` // 创建时间
|
||||
Result string `json:"result"`
|
||||
}
|
||||
|
||||
type PodsListReq {
|
||||
ClusterName string `form:"clusterName"`
|
||||
}
|
||||
type PodsListResp {
|
||||
Data []interface{} `json:"data"`
|
||||
}
|
|
@ -128,6 +128,7 @@ type (
|
|||
PodsUtilisation float64 `json:"podsUtilisation,optional"`
|
||||
PodsCount int64 `json:"podsCount,optional"`
|
||||
PodsTotal int64 `json:"podsTotal,optional"`
|
||||
NodeCount float64 `json:"nodeCount,optional"`
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -1455,7 +1456,7 @@ type EditResourceReq {
|
|||
CostType string `json:"costType" gorm:"column:cost_type"` //计费类型(hourly, daily, monthly,perUse)
|
||||
Type string `json:"type,optional" gorm:"column:type"`
|
||||
|
||||
// 基础资源规格
|
||||
// 基础资源规格
|
||||
StorageValue string `json:"storageValue,optional"`
|
||||
StorageUnit string `json:"storageUnit,optional"`
|
||||
CpuValue string `json:"cpuValue,optional"`
|
||||
|
|
10
desc/pcm.api
10
desc/pcm.api
|
@ -203,7 +203,7 @@ service pcm {
|
|||
@doc "删除资源规格"
|
||||
@handler deleteResourceSpecHandler
|
||||
delete /core/ai/resourceSpec/delete/:id (DeletePathId) returns (CommonResp)
|
||||
//集群资源规格----- 结束
|
||||
//集群资源规格----- 结束
|
||||
}
|
||||
|
||||
//hpc二级接口
|
||||
|
@ -289,6 +289,9 @@ service pcm {
|
|||
|
||||
@handler podLogs
|
||||
post /cloud/pod/logs (PodLogsReq) returns (string)
|
||||
|
||||
@handler podsList
|
||||
get /cloud/pods/list (PodsListReq) returns (PodsListResp)
|
||||
}
|
||||
|
||||
//智算二级接口
|
||||
|
@ -441,7 +444,7 @@ service pcm {
|
|||
@doc "文本识别"
|
||||
@handler ChatHandler
|
||||
post /ai/chat (ChatReq) returns (ChatResult)
|
||||
/******chat end***********/
|
||||
/******chat end***********/
|
||||
}
|
||||
|
||||
//screen接口
|
||||
|
@ -1130,5 +1133,4 @@ service pcm {
|
|||
|
||||
@handler scheduleSituationHandler
|
||||
get /monitoring/schedule/situation returns (scheduleSituationResp)
|
||||
}
|
||||
|
||||
}
|
4
go.mod
4
go.mod
|
@ -12,6 +12,7 @@ require (
|
|||
github.com/golang-jwt/jwt/v5 v5.2.2
|
||||
github.com/jinzhu/copier v0.4.0
|
||||
github.com/json-iterator/go v1.1.12
|
||||
github.com/mitchellh/mapstructure v1.5.0
|
||||
github.com/pkg/errors v0.9.1
|
||||
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.71.2
|
||||
github.com/prometheus/alertmanager v0.27.0
|
||||
|
@ -35,6 +36,7 @@ require (
|
|||
gorm.io/datatypes v1.2.0
|
||||
gorm.io/driver/mysql v1.5.7
|
||||
gorm.io/gorm v1.25.12
|
||||
k8s.io/api v0.31.4
|
||||
k8s.io/apimachinery v0.31.4
|
||||
k8s.io/client-go v0.31.4
|
||||
sigs.k8s.io/yaml v1.4.0
|
||||
|
@ -120,7 +122,6 @@ require (
|
|||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
|
||||
github.com/miekg/dns v1.1.58 // indirect
|
||||
github.com/mitchellh/mapstructure v1.5.0 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
|
@ -186,7 +187,6 @@ require (
|
|||
google.golang.org/protobuf v1.36.5 // indirect
|
||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||
gopkg.in/ini.v1 v1.67.0 // indirect
|
||||
k8s.io/api v0.31.4 // indirect
|
||||
k8s.io/klog/v2 v2.130.1 // indirect
|
||||
k8s.io/kube-openapi v0.0.0-20241127205056-99599406b04f // indirect
|
||||
k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -39,7 +39,7 @@ func NewCloudListLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CloudLi
|
|||
|
||||
func (l *CloudListLogic) CloudList() (resp *types.CloudListResp, err error) {
|
||||
// 查询数据库中数算任务列表
|
||||
var clouds []*models.Cloud
|
||||
var clouds []*models.TaskCloud
|
||||
tx := l.svcCtx.DbEngin.Find(&clouds)
|
||||
if tx.Error != nil {
|
||||
return nil, tx.Error
|
||||
|
|
|
@ -47,8 +47,8 @@ func (l *DeleteTaskLogic) DeleteTask(req *types.DeleteTaskReq) error {
|
|||
return tx.Error
|
||||
}
|
||||
// 将子任务状态修改为待删除
|
||||
tx = l.svcCtx.DbEngin.Model(&models.Cloud{}).Where("task_id", req.Id).Update("status", constants.WaitDelete)
|
||||
l.svcCtx.DbEngin.Where("task_id = ?", req.Id).Delete(&models.Cloud{}, req.Id)
|
||||
tx = l.svcCtx.DbEngin.Model(&models.TaskCloud{}).Where("task_id", req.Id).Update("status", constants.WaitDelete)
|
||||
// NOTE(review): req.Id is also passed as Delete's extra condition argument, which GORM
// treats as a primary-key filter, so this statement is restricted to rows whose id AND
// task_id both equal req.Id — confirm whether Delete(&models.TaskCloud{}) alone was intended.
// The result of this call is discarded; the tx.Error check that follows only reflects the
// preceding status Update.
l.svcCtx.DbEngin.Where("task_id = ?", req.Id).Delete(&models.TaskCloud{}, req.Id)
|
||||
if tx.Error != nil {
|
||||
return tx.Error
|
||||
}
|
||||
|
|
|
@ -19,10 +19,6 @@ import (
|
|||
"github.com/zeromicro/go-zero/core/logx"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/helper/enum"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/httputils"
|
||||
"k8s.io/apimachinery/pkg/util/json"
|
||||
)
|
||||
|
||||
type JobTotalLogic struct {
|
||||
|
@ -55,60 +51,60 @@ func NewJobTotalLogic(ctx context.Context, svcCtx *svc.ServiceContext) *JobTotal
|
|||
}
|
||||
|
||||
func (l *JobTotalLogic) JobTotal() (resp *types.JobTotalResp, err error) {
|
||||
// 获取任务时间信息
|
||||
resp = &types.JobTotalResp{}
|
||||
bytes, err := httputils.HttpGet("GET", "http://grampus.openi.org.cn/openapi/v1/sharescreen/computepower/alljobinfo")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
json.Unmarshal(bytes, resp)
|
||||
|
||||
// 获取其他任务信息
|
||||
jobs := &Job{}
|
||||
jobBytes, err := httputils.HttpGet("GET", "http://grampus.openi.org.cn/openapi/v1/sharescreen/trainjob?pageIndex=1&pageSize=10")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
json.Unmarshal(jobBytes, jobs)
|
||||
|
||||
for _, job := range jobs.OtJobs {
|
||||
trainJob := types.TrainJob{
|
||||
Name: job.Name,
|
||||
Status: enum.ExternalStatus(job.Status).String(),
|
||||
Strategy: 0,
|
||||
SynergyStatus: "未协同",
|
||||
}
|
||||
if job.Tasks[0].CenterName != nil {
|
||||
trainJob.ParticipantName = job.Tasks[0].CenterName[0]
|
||||
}
|
||||
resp.TrainJobs = append(resp.TrainJobs, trainJob)
|
||||
}
|
||||
|
||||
var tasks []models.Task
|
||||
tx := l.svcCtx.DbEngin.Find(&tasks)
|
||||
if tx.Error != nil {
|
||||
logx.Error(err)
|
||||
return nil, tx.Error
|
||||
}
|
||||
if len(tasks) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
for _, task := range tasks {
|
||||
var participantName string
|
||||
tx := l.svcCtx.DbEngin.Raw("SELECT name from sc_participant_phy_info where id in (SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.participant_id) ,GROUP_CONCAT(DISTINCT a.participant_id) ,GROUP_CONCAT(DISTINCT c.participant_id))as service_name from task t left join hpc h on t.id = h.task_id left join cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?)", task.Id).Scan(&participantName)
|
||||
if tx.Error != nil {
|
||||
logx.Error(err)
|
||||
return nil, tx.Error
|
||||
}
|
||||
// 承接方转义
|
||||
resp.TrainJobs = append(resp.TrainJobs, types.TrainJob{
|
||||
ParticipantName: participantName,
|
||||
Name: task.Name,
|
||||
Strategy: int(task.Strategy),
|
||||
SynergyStatus: enum.SynergyStatus(task.SynergyStatus).String(),
|
||||
Status: task.Status,
|
||||
})
|
||||
|
||||
}
|
||||
//// 获取任务时间信息
|
||||
//resp = &types.JobTotalResp{}
|
||||
//bytes, err := httputils.HttpGet("http://grampus.openi.org.cn/openapi/v1/sharescreen/computepower/alljobinfo")
|
||||
//if err != nil {
|
||||
// return nil, err
|
||||
//}
|
||||
//json.Unmarshal(bytes, resp)
|
||||
//
|
||||
//// 获取其他任务信息
|
||||
//jobs := &Job{}
|
||||
//jobBytes, err := httputils.HttpGet("http://grampus.openi.org.cn/openapi/v1/sharescreen/trainjob?pageIndex=1&pageSize=10")
|
||||
//if err != nil {
|
||||
// return nil, err
|
||||
//}
|
||||
//json.Unmarshal(jobBytes, jobs)
|
||||
//
|
||||
//for _, job := range jobs.OtJobs {
|
||||
// trainJob := types.TrainJob{
|
||||
// Name: job.Name,
|
||||
// Status: enum.ExternalStatus(job.Status).String(),
|
||||
// Strategy: 0,
|
||||
// SynergyStatus: "未协同",
|
||||
// }
|
||||
// if job.Tasks[0].CenterName != nil {
|
||||
// trainJob.ParticipantName = job.Tasks[0].CenterName[0]
|
||||
// }
|
||||
// resp.TrainJobs = append(resp.TrainJobs, trainJob)
|
||||
//}
|
||||
//
|
||||
//var tasks []models.Task
|
||||
//tx := l.svcCtx.DbEngin.Find(&tasks)
|
||||
//if tx.Error != nil {
|
||||
// logx.Error(err)
|
||||
// return nil, tx.Error
|
||||
//}
|
||||
//if len(tasks) == 0 {
|
||||
// return nil, nil
|
||||
//}
|
||||
//for _, task := range tasks {
|
||||
// var participantName string
|
||||
// tx := l.svcCtx.DbEngin.Raw("SELECT name from sc_participant_phy_info where id in (SELECT CONCAT_WS(',',GROUP_CONCAT(DISTINCT h.participant_id) ,GROUP_CONCAT(DISTINCT a.participant_id) ,GROUP_CONCAT(DISTINCT c.participant_id))as service_name from task t left join hpc h on t.id = h.task_id left join cloud c on t.id = c.task_id left join ai a on t.id = a.task_id where t.id = ?)", task.Id).Scan(&participantName)
|
||||
// if tx.Error != nil {
|
||||
// logx.Error(err)
|
||||
// return nil, tx.Error
|
||||
// }
|
||||
// // 承接方转义
|
||||
// resp.TrainJobs = append(resp.TrainJobs, types.TrainJob{
|
||||
// ParticipantName: participantName,
|
||||
// Name: task.Name,
|
||||
// Strategy: int(task.Strategy),
|
||||
// SynergyStatus: enum.SynergyStatus(task.SynergyStatus).String(),
|
||||
// Status: task.Status,
|
||||
// })
|
||||
//
|
||||
//}
|
||||
return resp, nil
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ func NewClustersLoadLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Clus
|
|||
|
||||
func (l *ClustersLoadLogic) ClustersLoad(req *types.ClustersLoadReq) (resp *types.ClustersLoadResp, err error) {
|
||||
resp = &types.ClustersLoadResp{}
|
||||
metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total", "cluster_pod_utilisation"}
|
||||
metrics := []string{"cluster_cpu_utilisation", "cluster_cpu_avail", "cluster_cpu_total", "cluster_memory_total", "cluster_memory_avail", "cluster_memory_utilisation", "cluster_disk_utilisation", "cluster_disk_avail", "cluster_disk_total", "cluster_pod_utilisation", "cluster_node_count"}
|
||||
result := l.svcCtx.PromClient.GetNamedMetrics(metrics, time.Now(), tracker.ClusterOption{ClusterName: req.ClusterName})
|
||||
resp.Data = result
|
||||
return resp, nil
|
||||
|
|
11453
internal/types/types.go
11453
internal/types/types.go
File diff suppressed because it is too large
Load Diff
|
@ -14,7 +14,7 @@ import (
|
|||
)
|
||||
|
||||
var (
|
||||
cloudFieldNames = builder.RawFieldNames(&Cloud{})
|
||||
cloudFieldNames = builder.RawFieldNames(&TaskCloud{})
|
||||
cloudRows = strings.Join(cloudFieldNames, ",")
|
||||
cloudRowsExpectAutoSet = strings.Join(stringx.Remove(cloudFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",")
|
||||
cloudRowsWithPlaceHolder = strings.Join(stringx.Remove(cloudFieldNames, "`id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?"
|
||||
|
@ -22,10 +22,10 @@ var (
|
|||
|
||||
type (
|
||||
cloudModel interface {
|
||||
Insert(ctx context.Context, data *Cloud) (sql.Result, error)
|
||||
FindOne(ctx context.Context, id int64) (*Cloud, error)
|
||||
FindOneByNamespaceNameServiceName(ctx context.Context, namespace sql.NullString, name sql.NullString, serviceName sql.NullString) (*Cloud, error)
|
||||
Update(ctx context.Context, data *Cloud) error
|
||||
Insert(ctx context.Context, data *TaskCloud) (sql.Result, error)
|
||||
FindOne(ctx context.Context, id int64) (*TaskCloud, error)
|
||||
FindOneByNamespaceNameServiceName(ctx context.Context, namespace sql.NullString, name sql.NullString, serviceName sql.NullString) (*TaskCloud, error)
|
||||
Update(ctx context.Context, data *TaskCloud) error
|
||||
Delete(ctx context.Context, id int64) error
|
||||
}
|
||||
|
||||
|
@ -34,7 +34,7 @@ type (
|
|||
table string
|
||||
}
|
||||
|
||||
Cloud struct {
|
||||
TaskCloud struct {
|
||||
Id int64 `db:"id"` // id
|
||||
TaskId int64 `db:"task_id"` // 任务id
|
||||
ParticipantId int64 `db:"participant_id"` // 集群静态信息id
|
||||
|
@ -56,7 +56,7 @@ type (
|
|||
func newCloudModel(conn sqlx.SqlConn) *defaultCloudModel {
|
||||
return &defaultCloudModel{
|
||||
conn: conn,
|
||||
table: "`cloud`",
|
||||
table: "`task_cloud`",
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@ var promQLTemplates = map[string]string{
|
|||
"cluster_memory_avail": "cluster_memory_avail{$1}",
|
||||
"cluster_disk_avail": "cluster_disk_avail{$1}",
|
||||
"cluster_pod_utilisation": "cluster_pod_utilisation{$1}",
|
||||
// Node count is exported as its own gauge (cluster_node_count), so query that series
// directly rather than the pod-utilisation series.
"cluster_node_count": "cluster_node_count{$1}",
|
||||
|
||||
// center
|
||||
"center_cpu_utilisation": "(sum by (adapter_id)(cluster_cpu_total{$1})-sum by (adapter_id)(cluster_cpu_avail{$1}))/sum by (adapter_id)(cluster_cpu_total{$1})",
|
||||
|
|
|
@ -90,6 +90,10 @@ var (
|
|||
Name: "cluster_gpu_avail",
|
||||
Help: "Cluster Gpu Available.",
|
||||
}, []string{"cluster_name", "adapter_id"})
|
||||
ClusterNodeCountGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "cluster_node_count",
|
||||
Help: "Cluster Node Count.",
|
||||
}, []string{"cluster_name", "adapter_id"})
|
||||
|
||||
metrics = []prometheus.Collector{
|
||||
ClusterCpuUtilisationGauge,
|
||||
|
@ -104,6 +108,7 @@ var (
|
|||
ClusterPodUtilisationGauge,
|
||||
ClusterPodCountGauge,
|
||||
ClusterPodTotalGauge,
|
||||
ClusterNodeCountGauge,
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -122,6 +127,7 @@ type ClusterLoadRecord struct {
|
|||
PodsUtilisation float64 `json:"podsUtilisation,optional"`
|
||||
PodsCount int64 `json:"podsCount,optional"`
|
||||
PodsTotal int64 `json:"podsTotal,optional"`
|
||||
NodeCount float64 `json:"nodeCount,optional"`
|
||||
}
|
||||
|
||||
func init() {
|
||||
|
@ -333,7 +339,9 @@ func SyncClusterLoad(record ClusterLoadRecord) {
|
|||
ClusterDiskAvailGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskAvail)
|
||||
ClusterDiskTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.DiskTotal)
|
||||
|
||||
ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation)
|
||||
ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount))
|
||||
ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal))
|
||||
//ClusterPodUtilisationGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.PodsUtilisation)
|
||||
//ClusterPodCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsCount))
|
||||
//ClusterPodTotalGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(float64(record.PodsTotal))
|
||||
|
||||
ClusterNodeCountGauge.WithLabelValues(record.ClusterName, strconv.FormatInt(record.AdapterId, 10)).Set(record.NodeCount)
|
||||
}
|
||||
|
|
|
@ -62,13 +62,6 @@ func NewHttpsClient() *resty.Client {
|
|||
return c
|
||||
}
|
||||
|
||||
func GetHttpRequest() *resty.Request {
|
||||
|
||||
client := resty.New()
|
||||
request := client.R()
|
||||
return request
|
||||
}
|
||||
|
||||
func HttpClient(method string, url string, payload io.Reader, token string) ([]byte, error) {
|
||||
request, err := http.NewRequest(method, url, payload)
|
||||
request.Header.Add("Content-Type", "application/json")
|
||||
|
@ -88,20 +81,17 @@ func HttpClient(method string, url string, payload io.Reader, token string) ([]b
|
|||
return body, err
|
||||
}
|
||||
|
||||
func HttpGet(method string, url string) ([]byte, error) {
|
||||
request, err := http.NewRequest(method, url, nil)
|
||||
client := &http.Client{}
|
||||
res, err := client.Do(request)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
body, err := io.ReadAll(res.Body)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
func HttpGetWithResult(params map[string]string, url string, result interface{}) error {
|
||||
client := NewHttpsClient()
|
||||
|
||||
req := client.R()
|
||||
// 添加查询参数
|
||||
for k, v := range params {
|
||||
req.SetQueryParam(k, v)
|
||||
}
|
||||
|
||||
return body, err
|
||||
_, err := req.SetResult(result).Get(url)
|
||||
return err
|
||||
}
|
||||
|
||||
// 发送POST请求
|
||||
|
|
Loading…
Reference in New Issue