Signed-off-by: jagger <cossjie@foxmail.com>
This commit is contained in:
jagger 2025-04-21 11:45:02 +08:00
parent 8dd14a6421
commit 888833d5bd
11 changed files with 7391 additions and 6985 deletions

View File

@ -1369,6 +1369,7 @@ type (
Code int `json:"code,omitempty"`
Msg string `json:"msg,omitempty"`
Data interface{} `json:"data,omitempty"`
TraceId string `json:"traceId,omitempty"`
}
)

View File

@ -111,8 +111,9 @@ type (
)
type cancelJobReq {
ClusterId int64 `form:"clusterId"`
JobId string `form:"jobId"`
ClusterId int64 `form:"clusterId,optional"`
JobId string `form:"jobId,optional"`
taskId string `form:"taskId"`
}
type jobInfoReq {
@ -174,4 +175,18 @@ type (
}
/******************instance center*************************/
)
type(
HpcTaskLogResp{
Code int32 `json:"code"`
Msg string `json:"msg"`
Data interface{} `json:"data"`
}
)
type(
HpcTaskLogReq{
TaskId string `path:"taskId"`
}
)

View File

@ -13,18 +13,18 @@ import (
"inference/inference.api"
)
info(
title: "pcm api service"
desc: "type desc here"
author: "type author here"
email: "type email here"
info (
title: "pcm api service"
desc: "type desc here"
author: "type author here"
email: "type email here"
version: "type version here"
)
//core端接口
@server(
@server (
prefix: pcm/v1
group: core
group: core
)
service pcm {
@doc "查询P端服务列表"
@ -177,9 +177,9 @@ service pcm {
}
//hpc二级接口
@server(
@server (
prefix: pcm/v1
group: hpc
group: hpc
)
service pcm {
@doc "提交超算任务"
@ -216,13 +216,17 @@ service pcm {
@doc "查询超算应用中心列表"
@handler ListInstanceCenter
get /hpc/ListInstanceCenter(HpcInstanceCenterReq) returns (PageResult)
get /hpc/ListInstanceCenter (HpcInstanceCenterReq) returns (PageResult)
@doc "超算任务日志"
@handler getHpcTaskLogHandler
get /hpc/jobLogs/:taskId (HpcTaskLogReq) returns (HpcTaskLogResp)
}
//cloud二级接口
@server(
@server (
prefix: pcm/v1
group: cloud
group: cloud
)
service pcm {
@doc "云算任务列表"
@ -258,9 +262,9 @@ service pcm {
}
//智算二级接口
@server(
@server (
prefix: pcm/v1
group: ai
group: ai
)
service pcm {
@doc "训练任务统计"
@ -407,13 +411,13 @@ service pcm {
@doc "文本识别"
@handler ChatHandler
post /ai/chat (ChatReq) returns (ChatResult)
/******chat end***********/
/******chat end***********/
}
//screen接口
@server(
@server (
prefix: pcm/v1
group: storage
group: storage
)
service pcm {
@doc "日常算力查询"
@ -426,9 +430,9 @@ service pcm {
}
//openstack 接口
@server(
@server (
prefix: pcm/v1
group: vm
group: vm
)
service pcm {
@doc "openstack计算中心概览"
@ -825,9 +829,9 @@ service pcm {
}
//存算联动 接口
@server(
@server (
prefix: pcm/v1
group: storelink
group: storelink
)
service pcm {
@handler UploadLinkImageHandler
@ -856,9 +860,9 @@ service pcm {
}
// 接口
@server(
@server (
prefix: pcm/v1
group: adapters
group: adapters
)
service pcm {
@handler AdaptersListHandler
@ -901,9 +905,9 @@ service pcm {
get /adapter/getAdapterInfo (adapterInfoNameReq) returns (adapterInfoNameReqResp)
}
@server(
@server (
prefix: pcm/v1
group: schedule
group: schedule
)
service pcm {
@handler ScheduleGetAiResourceTypesHandler
@ -955,9 +959,9 @@ service pcm {
post /schedule/cancelTask (CancelTaskReq) returns (CancelTaskResp)
}
@server(
@server (
prefix: pcm/v1
group: inference
group: inference
)
service pcm {
@handler GetDeployInstanceHandler
@ -1018,9 +1022,9 @@ service pcm {
get /inference/getAdaptersByModel (GetAdaptersByModelReq) returns (GetAdaptersByModelResp)
}
@server(
@server (
prefix: pcm/v1
group: dictionary
group: dictionary
)
service pcm {
@handler GetDict
@ -1057,9 +1061,9 @@ service pcm {
get /dictItem/code/:dictCode (DictCodeReq) returns (PageResult)
}
@server(
@server (
prefix: pcm/v1
group: monitoring
group: monitoring
)
service pcm {
@handler CreateAlertRuleHandler
@ -1096,4 +1100,5 @@ service pcm {
@handler scheduleSituationHandler
get /monitoring/schedule/situation returns (scheduleSituationResp)
}
}

View File

@ -0,0 +1,24 @@
package hpc
import (
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/hpc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
func GetHpcTaskLogHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.HpcTaskLogReq
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}
l := hpc.NewGetHpcTaskLogLogic(r.Context(), svcCtx)
resp, err := l.GetHpcTaskLog(&req)
result.HttpResult(r, w, resp, err)
}
}

File diff suppressed because it is too large Load Diff

View File

@ -2,54 +2,101 @@ package hpc
import (
"context"
"github.com/go-resty/resty/v2"
"github.com/pkg/errors"
"fmt"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-hpc/slurm"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
)
type CancelJobLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
ctx context.Context
svcCtx *svc.ServiceContext
hpcService *service.HpcService
}
type TaskHPCResult struct {
ID uint `gorm:"column:id"` // 对应 t.id
JobID string `gorm:"column:job_id"` // 对应 hpc.job_id
AdapterId string `gorm:"column:adapter_id"` // 对应 hpc.adapter_id
}
func NewCancelJobLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CancelJobLogic {
cache := make(map[string]interface{}, 10)
hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
if err != nil {
return nil
}
return &CancelJobLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
hpcService: hpcService,
}
}
func (l *CancelJobLogic) CancelJob(req *types.CancelJobReq) error {
var clusterInfo *types.ClusterInfo
tx := l.svcCtx.DbEngin.Raw("select * from t_cluster where id = ?", req.ClusterId).Scan(&clusterInfo)
if tx.Error != nil {
return tx.Error
//var clusterInfo *types.ClusterInfo
//tx := l.svcCtx.DbEngin.Raw("select * from t_cluster where id = ?", req.ClusterId).Scan(&clusterInfo)
//if tx.Error != nil {
// return tx.Error
//}
//// 查询p端调用地址
//var adapterAddress string
//l.svcCtx.DbEngin.Raw("SELECT server FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterAddress)
//var jobResp slurm.GetJobResp
//httpClient := resty.New().R()
//_, err := httpClient.SetHeader("Content-Type", "application/json").
// SetQueryParams(map[string]string{
// "jobId": req.JobId,
// "server": clusterInfo.Server,
// "version": clusterInfo.Version,
// "token": clusterInfo.Token,
// "username": clusterInfo.Username,
// }).
// SetResult(&jobResp).
// Delete(adapterAddress + "/api/v1/job/cancel")
//if err != nil {
// return err
//}
//if len(jobResp.Errors) != 0 {
// return errors.Errorf(jobResp.Errors[0].Description)
//}
//return nil
var hpcR TaskHPCResult
tx := l.svcCtx.DbEngin.Raw(
"SELECT t.id, hpc.job_id ,hpc.adapter_id FROM task t "+
"INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
"WHERE adapter_type_dict = 2 AND t.id = ?",
req.TaskId,
).Scan(&hpcR).Error
if tx != nil {
return fmt.Errorf("数据库查询失败: %v", tx.Error)
}
// 查询p端调用地址
var adapterAddress string
l.svcCtx.DbEngin.Raw("SELECT server FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterAddress)
var jobResp slurm.GetJobResp
httpClient := resty.New().R()
_, err := httpClient.SetHeader("Content-Type", "application/json").
SetQueryParams(map[string]string{
"jobId": req.JobId,
"server": clusterInfo.Server,
"version": clusterInfo.Version,
"token": clusterInfo.Token,
"username": clusterInfo.Username,
}).
SetResult(&jobResp).
Delete(adapterAddress + "/api/v1/job/cancel")
if hpcR.ID == 0 || hpcR.JobID == "" {
return fmt.Errorf("作业不存在")
}
var adapterInfo types.AdapterInfo
l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", hpcR.AdapterId).Scan(&adapterInfo)
if adapterInfo.Id == "" {
return fmt.Errorf("adapter not found")
}
// 取消作业
err := l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].CancelTask(l.ctx, hpcR.JobID)
if err != nil {
return err
}
if len(jobResp.Errors) != 0 {
return errors.Errorf(jobResp.Errors[0].Description)
// 更新数据库状态
tx = l.svcCtx.DbEngin.Model(&types.Task{}).Where("id = ?", hpcR.ID).Update("status", "Canceled").Error
if tx != nil {
return fmt.Errorf("数据库更新失败: %v", tx.Error)
}
// 更新数据库状态
tx = l.svcCtx.DbEngin.Model(&models.TaskHpc{}).Where("task_id = ?", hpcR.ID).Update("status", "Canceled").Error
if tx != nil {
return fmt.Errorf("数据库更新失败: %v", tx.Error)
}
return nil
}

View File

@ -155,6 +155,7 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
return nil, updates.Error
}
resp.Data.JobInfo["taskId"] = strconv.FormatInt(taskModel.Id, 10)
logx.Infof("提交job到指定集群成功, resp: %v", resp)
return resp, nil
}

View File

@ -0,0 +1,59 @@
package hpc
import (
"context"
"fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type GetHpcTaskLogLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
hpcService *service.HpcService
}
// 超算任务日志
func NewGetHpcTaskLogLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetHpcTaskLogLogic {
cache := make(map[string]interface{}, 10)
hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
if err != nil {
return nil
}
return &GetHpcTaskLogLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
hpcService: hpcService,
}
}
func (l *GetHpcTaskLogLogic) GetHpcTaskLog(req *types.HpcTaskLogReq) (resp interface{}, err error) {
var hpcR TaskHPCResult
tx := l.svcCtx.DbEngin.Raw(
"SELECT t.id, hpc.job_id ,hpc.adapter_id FROM task t "+
"INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
"WHERE adapter_type_dict = 2 AND t.id = ?",
req.TaskId,
).Scan(&hpcR).Error
if tx != nil {
return nil, fmt.Errorf("数据库查询失败: %v", tx.Error)
}
var adapterInfo types.AdapterInfo
l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", hpcR.AdapterId).Scan(&adapterInfo)
if adapterInfo.Id == "" {
return nil, fmt.Errorf("adapter not found")
}
// 取消作业
resp, err = l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].GetTaskLogs(l.ctx, hpcR.JobID)
if err != nil {
return nil, err
}
return resp, nil
}

View File

@ -9,6 +9,8 @@ import (
type HPCCollector interface {
GetTask(ctx context.Context, taskId string) (*Task, error)
SubmitTask(ctx context.Context, req types.CommitHpcTaskReq) (*types.CommitHpcTaskResp, error)
CancelTask(ctx context.Context, jobId string) error
GetTaskLogs(ctx context.Context, jobId string) (interface{}, error)
}
type JobInfo struct {

View File

@ -19,8 +19,11 @@ type ParticipantHpc struct {
}
const (
BackendSlurm = "slurm"
JobDetailUrl = "/api/v1/jobs/detail/{backend}/{jobId}"
SubmitTaskUrl = "/api/v1/jobs"
CancelTaskUrl = "/api/v1/jobs/cancel/{backend}/{jobId}"
JobLogUrl = "/api/v1/jobs/logs/{backend}/{jobId}"
)
func NewHpc(host string, id int64, platform string) *ParticipantHpc {
@ -91,3 +94,41 @@ func (c *ParticipantHpc) SubmitTask(ctx context.Context, req types.CommitHpcTask
return &resp, nil
}
func (c *ParticipantHpc) CancelTask(ctx context.Context, jobId string) error {
reqUrl := c.host + CancelTaskUrl
resp := types.CommonResp{}
logx.WithContext(ctx).Infof("取消超算集群任务, url: %s, jobId: %s", reqUrl, jobId)
httpClient := resty.New().R()
_, err := httpClient.SetHeaders(
map[string]string{
"Content-Type": "application/json",
"traceId": result.TraceIDFromContext(ctx),
}).SetPathParams(map[string]string{
"backend": BackendSlurm,
"jobId": jobId,
}).SetResult(&resp).Delete(reqUrl)
if err != nil {
return err
}
return nil
}
func (c *ParticipantHpc) GetTaskLogs(ctx context.Context, jobId string) (interface{}, error) {
reqUrl := c.host + JobLogUrl
resp := types.CommonResp{}
logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", reqUrl, jobId)
httpClient := resty.New().R()
_, err := httpClient.SetHeaders(
map[string]string{
"Content-Type": "application/json",
"traceId": result.TraceIDFromContext(ctx),
}).SetPathParams(map[string]string{
"backend": BackendSlurm,
"jobId": jobId,
}).SetResult(&resp).Get(reqUrl)
if err != nil {
return nil, err
}
return resp, nil
}

File diff suppressed because it is too large Load Diff