HPC resource overview

Former-commit-id: cfb5c293ce
Author: zhouqunjie
Date: 2024-04-19 16:04:36 +08:00
Parent: 9766e68075
Commit: 58263827d1
5 changed files with 264 additions and 132 deletions

View File

@@ -1,137 +1,137 @@
syntax = "v1"
info(
title: "type title here"
desc: "type desc here"
author: "type author here"
email: "type email here"
version: "type version here"
title: "type title here"
desc: "type desc here"
author: "type author here"
email: "type email here"
version: "type version here"
)
type (
commitHpcTaskReq {
Name string `json:"name"` // paratera:jobName
Description string `json:"description,optional"`
-tenantId int64 `json:"tenantId,optional"`
+TenantId int64 `json:"tenantId,optional"`
TaskId int64 `json:"taskId,optional"`
AdapterId string `json:"adapterId,optional"`
MatchLabels map[string]string `json:"matchLabels,optional"`
CardCount int64 `json:"cardCount,optional"`
WorkDir string `json:"workDir,optional"` // paratera:workingDir
WallTime string `json:"wallTime,optional"`
CmdScript string `json:"cmdScript,optional"` // paratera:bootScript
AppType string `json:"appType,optional"`
AppName string `json:"appName,optional"` // paratera:jobGroupName ac:appname
Queue string `json:"queue,optional"`
NNode string `json:"nNode,optional"`
SubmitType string `json:"submitType,optional"`
StdOutFile string `json:"stdOutFile,optional"`
StdErrFile string `json:"stdErrFile,optional"`
StdInput string `json:"stdInput,optional"`
Environment map[string]string `json:"environment,optional"`
ClusterType string `json:"clusterType,optional"`
}
commitHpcTaskResp {
TaskId int64 `json:"taskId"`
Code int32 `json:"code"`
Msg string `json:"msg"`
}
)
type (
hpcOverViewReq {
}
hpcOverViewResp {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data HPCOverView `json:"data"`
}
HPCOverView {
AdapterCount int32 `json:"adapterCount"`
StackCount int32 `json:"stackCount"`
ClusterCount int32 `json:"clusterCount"`
TaskCount int32 `json:"taskCount"`
}
)
type (
hpcAdapterSummaryReq {
}
hpcAdapterSummaryResp {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data []HPCAdapterSummary `json:"data"`
}
HPCAdapterSummary {
AdapterName string `json:"adapterName"`
StackCount int32 `json:"stackCount"`
ClusterCount int32 `json:"clusterCount"`
TaskCount int32 `json:"taskCount"`
}
)
type (
hpcJobReq {
}
hpcJobResp {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data []Job `json:"data"`
}
Job {
JobName string `json:"jobName"`
JobDesc string `json:"jobDesc"`
SubmitTime string `json:"submitTime"`
JobStatus string `json:"jobStatus"`
AdapterName string `json:"adapterName"`
ClusterName string `json:"clusterName"`
ClusterType string `json:"clusterType"`
}
)
type (
hpcResourceReq {
}
hpcResourceResp {
Code int32 `json:"code"`
Msg string `json:"msg"`
HPCResource HPCResource `json:"hpcResource"`
}
HPCResource {
-GPUCardsTotal int32 `json:"gpuCoresTotal"`
-CPUCoresTotal int32 `json:"cpuCoresTotal"`
-RAMTotal int32 `json:"ramTotal"`
-GPUCardsUsed int32 `json:"gpuCoresUsed"`
-CPUCoresUsed int32 `json:"cpuCoresUsed"`
-RAMUsed int32 `json:"ramUsed"`
-GPURate float32 `json:"gpuRate"`
-CPURate float32 `json:"cpuRate"`
-RAMRate float32 `json:"ramRate"`
+GPUCardsTotal float64 `json:"gpuCoresTotal"`
+CPUCoresTotal float64 `json:"cpuCoresTotal"`
+RAMTotal float64 `json:"ramTotal"`
+GPUCardsUsed float64 `json:"gpuCoresUsed"`
+CPUCoresUsed float64 `json:"cpuCoresUsed"`
+RAMUsed float64 `json:"ramUsed"`
+GPURate float64 `json:"gpuRate"`
+CPURate float64 `json:"cpuRate"`
+RAMRate float64 `json:"ramRate"`
}
)
type QueueAssetsResp {
QueueAssets []QueueAsset `json:"queueAsset"`
}
type QueueAsset {
TenantName string `json:"tenantName"` // tenant name
ParticipantId int64 `json:"participantId"`
AclHosts string `json:"aclHosts"` // available nodes, comma-separated
QueNodes string `json:"queNodes"` // total number of nodes in the queue
QueMinNodect string `json:"queMinNodect,omitempty"` // minimum node count for the queue
QueMaxNgpus string `json:"queMaxNgpus,omitempty"` // maximum number of GPU cards in the queue
QueMaxPPN string `json:"queMaxPPN,omitempty"` // maximum CPU cores per job in this queue
QueChargeRate string `json:"queChargeRate,omitempty"` // charge rate
QueMaxNcpus string `json:"queMaxNcpus,omitempty"` // maximum CPU cores available to a user
QueMaxNdcus string `json:"queMaxNdcus,omitempty"` // total number of DCU cards in the queue
QueueName string `json:"queueName,omitempty"` // queue name
QueMinNcpus string `json:"queMinNcpus,omitempty"` // minimum CPU cores for the queue
QueFreeNodes string `json:"queFreeNodes,omitempty"` // number of idle nodes in the queue
QueMaxNodect string `json:"queMaxNodect,omitempty"` // maximum node count per job in the queue
QueMaxGpuPN string `json:"queMaxGpuPN,omitempty"` // maximum GPU cards per job in the queue
QueMaxWalltime string `json:"queMaxWalltime,omitempty"` // maximum walltime for jobs in the queue
QueMaxDcuPN string `json:"queMaxDcuPN,omitempty"` // maximum DCU cards per job in the queue
QueFreeNcpus string `json:"queFreeNcpus"` // number of idle CPU cores in the queue
QueNcpus string `json:"queNcpus"` // total CPU cores in the queue
}
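For reference, a minimal self-contained Go sketch (not part of this commit; the sample values are made up) that mirrors the HPCResource message above and marshals it, showing the exact JSON keys the API exposes; note that the gpuCoresTotal/gpuCoresUsed tags carry the GPU card counts.

package main

import (
	"encoding/json"
	"fmt"
)

// HPCResource mirrors the message defined in the .api file above; the struct and
// sample values are illustrative and not taken from the repository.
type HPCResource struct {
	GPUCardsTotal float64 `json:"gpuCoresTotal"`
	CPUCoresTotal float64 `json:"cpuCoresTotal"`
	RAMTotal      float64 `json:"ramTotal"`
	GPUCardsUsed  float64 `json:"gpuCoresUsed"`
	CPUCoresUsed  float64 `json:"cpuCoresUsed"`
	RAMUsed       float64 `json:"ramUsed"`
	GPURate       float64 `json:"gpuRate"`
	CPURate       float64 `json:"cpuRate"`
	RAMRate       float64 `json:"ramRate"`
}

func main() {
	// Marshal a sample payload to show the JSON keys the API exposes.
	out, _ := json.Marshal(HPCResource{CPUCoresTotal: 48, CPUCoresUsed: 12, CPURate: 0.25})
	fmt.Println(string(out))
}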

View File

@@ -2,7 +2,6 @@ package hpc
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
@@ -25,18 +24,29 @@ func NewResourceLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Resource
func (l *ResourceLogic) Resource(req *types.HpcResourceReq) (resp *types.HpcResourceResp, err error) {
l.svcCtx.DbEngin.Raw("SELECT th.NAME as job_name,t.description as job_desc,t.commit_time as submit_time,th.STATUS as job_status,ta.name as adapter_name,tc.name as cluster_name,tc.label as cluster_type FROM task_hpc th LEFT JOIN task t ON t.id = th.task_id JOIN t_cluster tc on th.cluster_id = tc.id JOIN t_adapter ta on tc.adapter_id = ta.id")
type hpcResourceOV struct {
CpuAvail float64 `json:"cpu_avail"`
CpuTotal float64 `json:"cpu_total"`
MemAvail float64 `json:"mem_avail"`
MemTotal float64 `json:"mem_total"`
DiskAvail float64 `json:"disk_avail"`
DiskTotal float64 `json:"disk_total"`
GpuAvail float64 `json:"gpu_avail"`
GpuTotal float64 `json:"gpu_total"`
}
var hrov hpcResourceOV
l.svcCtx.DbEngin.Raw("SELECT sum(cpu_avail) as cpu_avail,sum(cpu_total) as cpu_total,sum(mem_avail) as mem_avail,sum(mem_total) as mem_total,sum(disk_avail) as disk_avail,sum(disk_total) as disk_total,sum(gpu_avail) as gpu_avail,sum(gpu_total) as gpu_total FROM t_cluster_resource where cluster_type = 2").Scan(&hrov)
hpcResource := types.HPCResource{
-GPUCardsTotal: 0,
-CPUCoresTotal: 0,
-RAMTotal: 0,
-GPUCardsUsed: 0,
-CPUCoresUsed: 0,
-RAMUsed: 0,
-GPURate: 0,
-CPURate: 0,
-RAMRate: 0,
+GPUCardsTotal: hrov.GpuTotal,
+CPUCoresTotal: hrov.CpuTotal,
+RAMTotal: hrov.MemTotal,
+GPUCardsUsed: hrov.GpuTotal - hrov.GpuAvail,
+CPUCoresUsed: hrov.CpuTotal - hrov.CpuAvail,
+RAMUsed: hrov.MemTotal - hrov.MemAvail,
+GPURate: (hrov.GpuTotal - hrov.GpuAvail) / hrov.GpuTotal,
+CPURate: (hrov.CpuTotal - hrov.CpuAvail) / hrov.CpuTotal,
+RAMRate: (hrov.MemTotal - hrov.MemAvail) / hrov.MemTotal,
}
resp = &types.HpcResourceResp{
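The rates above are computed as used/total directly from the summed t_cluster_resource columns; with float64 fields, a zero total would yield NaN or Inf in the JSON response. Below is a minimal sketch of a guarded helper, offered as an assumption rather than code from this commit.

package main

import "fmt"

// rate returns used/total as a fraction and treats an empty pool as zero
// utilization, so the response never carries NaN or Inf values.
// This helper is hypothetical; the commit divides the sums directly.
func rate(used, total float64) float64 {
	if total <= 0 {
		return 0
	}
	return used / total
}

func main() {
	fmt.Println(rate(12, 48)) // 0.25
	fmt.Println(rate(0, 0))   // 0 instead of NaN
}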

View File

@@ -937,15 +937,15 @@ type HpcResourceResp struct {
}
type HPCResource struct {
-GPUCardsTotal int32 `json:"gpuCoresTotal"`
-CPUCoresTotal int32 `json:"cpuCoresTotal"`
-RAMTotal int32 `json:"ramTotal"`
-GPUCardsUsed int32 `json:"gpuCoresUsed"`
-CPUCoresUsed int32 `json:"cpuCoresUsed"`
-RAMUsed int32 `json:"ramUsed"`
-GPURate float32 `json:"gpuRate"`
-CPURate float32 `json:"cpuRate"`
-RAMRate float32 `json:"ramRate"`
+GPUCardsTotal float64 `json:"gpuCoresTotal"`
+CPUCoresTotal float64 `json:"cpuCoresTotal"`
+RAMTotal float64 `json:"ramTotal"`
+GPUCardsUsed float64 `json:"gpuCoresUsed"`
+CPUCoresUsed float64 `json:"cpuCoresUsed"`
+RAMUsed float64 `json:"ramUsed"`
+GPURate float64 `json:"gpuRate"`
+CPURate float64 `json:"cpuRate"`
+RAMRate float64 `json:"ramRate"`
}
type QueueAssetsResp struct {

View File

@@ -0,0 +1,29 @@
package models
import "github.com/zeromicro/go-zero/core/stores/sqlx"
var _ TClusterResourceModel = (*customTClusterResourceModel)(nil)
type (
// TClusterResourceModel is an interface to be customized, add more methods here,
// and implement the added methods in customTClusterResourceModel.
TClusterResourceModel interface {
tClusterResourceModel
withSession(session sqlx.Session) TClusterResourceModel
}
customTClusterResourceModel struct {
*defaultTClusterResourceModel
}
)
// NewTClusterResourceModel returns a model for the database table.
func NewTClusterResourceModel(conn sqlx.SqlConn) TClusterResourceModel {
return &customTClusterResourceModel{
defaultTClusterResourceModel: newTClusterResourceModel(conn),
}
}
func (m *customTClusterResourceModel) withSession(session sqlx.Session) TClusterResourceModel {
return NewTClusterResourceModel(sqlx.NewSqlConnFromSession(session))
}
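The unexported withSession hook rebinds the model to a transaction session. Below is a minimal sketch, usable only inside the models package and assuming go-zero's SqlConn.TransactCtx; the helper name and the update it performs are illustrative, not part of this commit.

package models

import (
	"context"

	"github.com/zeromicro/go-zero/core/stores/sqlx"
)

// updateGpuAvailTx is a hypothetical helper: it re-reads a cluster's resource row
// and writes a new gpu_avail value inside one transaction by rebinding the model
// to the transaction session via withSession.
func updateGpuAvailTx(ctx context.Context, conn sqlx.SqlConn, clusterId int64, gpuAvail float64) error {
	m := NewTClusterResourceModel(conn)
	return conn.TransactCtx(ctx, func(ctx context.Context, session sqlx.Session) error {
		tx := m.withSession(session)
		row, err := tx.FindOne(ctx, clusterId)
		if err != nil {
			return err
		}
		row.GpuAvail = gpuAvail
		return tx.Update(ctx, row)
	})
}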

View File

@@ -0,0 +1,93 @@
// Code generated by goctl. DO NOT EDIT.
package models
import (
"context"
"database/sql"
"fmt"
"strings"
"github.com/zeromicro/go-zero/core/stores/builder"
"github.com/zeromicro/go-zero/core/stores/sqlc"
"github.com/zeromicro/go-zero/core/stores/sqlx"
"github.com/zeromicro/go-zero/core/stringx"
)
var (
tClusterResourceFieldNames = builder.RawFieldNames(&TClusterResource{})
tClusterResourceRows = strings.Join(tClusterResourceFieldNames, ",")
tClusterResourceRowsExpectAutoSet = strings.Join(stringx.Remove(tClusterResourceFieldNames, "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), ",")
tClusterResourceRowsWithPlaceHolder = strings.Join(stringx.Remove(tClusterResourceFieldNames, "`cluster_id`", "`create_at`", "`create_time`", "`created_at`", "`update_at`", "`update_time`", "`updated_at`"), "=?,") + "=?"
)
type (
tClusterResourceModel interface {
Insert(ctx context.Context, data *TClusterResource) (sql.Result, error)
FindOne(ctx context.Context, clusterId int64) (*TClusterResource, error)
Update(ctx context.Context, data *TClusterResource) error
Delete(ctx context.Context, clusterId int64) error
}
defaultTClusterResourceModel struct {
conn sqlx.SqlConn
table string
}
TClusterResource struct {
ClusterId int64 `db:"cluster_id"`
ClusterName string `db:"cluster_name"`
ClusterType int64 `db:"cluster_type"` // type: 0 -> container, 1 -> intelligent computing, 2 -> HPC, 3 -> virtual machine
CpuAvail float64 `db:"cpu_avail"`
CpuTotal float64 `db:"cpu_total"`
MemAvail float64 `db:"mem_avail"`
MemTotal float64 `db:"mem_total"`
DiskAvail float64 `db:"disk_avail"`
DiskTotal float64 `db:"disk_total"`
GpuAvail float64 `db:"gpu_avail"`
GpuTotal float64 `db:"gpu_total"`
}
)
func newTClusterResourceModel(conn sqlx.SqlConn) *defaultTClusterResourceModel {
return &defaultTClusterResourceModel{
conn: conn,
table: "`t_cluster_resource`",
}
}
func (m *defaultTClusterResourceModel) Delete(ctx context.Context, clusterId int64) error {
query := fmt.Sprintf("delete from %s where `cluster_id` = ?", m.table)
_, err := m.conn.ExecCtx(ctx, query, clusterId)
return err
}
func (m *defaultTClusterResourceModel) FindOne(ctx context.Context, clusterId int64) (*TClusterResource, error) {
query := fmt.Sprintf("select %s from %s where `cluster_id` = ? limit 1", tClusterResourceRows, m.table)
var resp TClusterResource
err := m.conn.QueryRowCtx(ctx, &resp, query, clusterId)
switch err {
case nil:
return &resp, nil
case sqlc.ErrNotFound:
return nil, ErrNotFound
default:
return nil, err
}
}
func (m *defaultTClusterResourceModel) Insert(ctx context.Context, data *TClusterResource) (sql.Result, error) {
query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, tClusterResourceRowsExpectAutoSet)
ret, err := m.conn.ExecCtx(ctx, query, data.ClusterId, data.ClusterName, data.ClusterType, data.CpuAvail, data.CpuTotal, data.MemAvail, data.MemTotal, data.DiskAvail, data.DiskTotal, data.GpuAvail, data.GpuTotal)
return ret, err
}
func (m *defaultTClusterResourceModel) Update(ctx context.Context, data *TClusterResource) error {
query := fmt.Sprintf("update %s set %s where `cluster_id` = ?", m.table, tClusterResourceRowsWithPlaceHolder)
_, err := m.conn.ExecCtx(ctx, query, data.ClusterName, data.ClusterType, data.CpuAvail, data.CpuTotal, data.MemAvail, data.MemTotal, data.DiskAvail, data.DiskTotal, data.GpuAvail, data.GpuTotal, data.ClusterId)
return err
}
func (m *defaultTClusterResourceModel) tableName() string {
return m.table
}
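A short usage sketch for the generated model; the DSN, import path, and cluster id below are placeholders rather than values from this repository.

package main

import (
	"context"
	"fmt"

	"github.com/zeromicro/go-zero/core/stores/sqlx"

	// assumed import path for the models package shown above
	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/models"
)

func main() {
	conn := sqlx.NewMysql("user:pass@tcp(127.0.0.1:3306)/pcm") // placeholder DSN
	model := models.NewTClusterResourceModel(conn)

	// cluster_id = 1 is illustrative only.
	row, err := model.FindOne(context.Background(), 1)
	if err != nil {
		fmt.Println("query failed:", err)
		return
	}
	fmt.Printf("cluster %s: %.0f of %.0f CPU cores available\n", row.ClusterName, row.CpuAvail, row.CpuTotal)
}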