forked from JointCloud/pcm-coordinator
parent
8dd14a6421
commit
888833d5bd
|
@ -1369,6 +1369,7 @@ type (
|
|||
Code int `json:"code,omitempty"`
|
||||
Msg string `json:"msg,omitempty"`
|
||||
Data interface{} `json:"data,omitempty"`
|
||||
TraceId string `json:"traceId,omitempty"`
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
@ -111,8 +111,9 @@ type (
|
|||
)
|
||||
|
||||
type cancelJobReq {
|
||||
ClusterId int64 `form:"clusterId"`
|
||||
JobId string `form:"jobId"`
|
||||
ClusterId int64 `form:"clusterId,optional"`
|
||||
JobId string `form:"jobId,optional"`
|
||||
taskId string `form:"taskId"`
|
||||
}
|
||||
|
||||
type jobInfoReq {
|
||||
|
@ -174,4 +175,18 @@ type (
|
|||
}
|
||||
|
||||
/******************instance center*************************/
|
||||
)
|
||||
|
||||
type(
|
||||
HpcTaskLogResp{
|
||||
Code int32 `json:"code"`
|
||||
Msg string `json:"msg"`
|
||||
Data interface{} `json:"data"`
|
||||
}
|
||||
)
|
||||
|
||||
type(
|
||||
HpcTaskLogReq{
|
||||
TaskId string `path:"taskId"`
|
||||
}
|
||||
)
|
69
desc/pcm.api
69
desc/pcm.api
|
@ -13,18 +13,18 @@ import (
|
|||
"inference/inference.api"
|
||||
)
|
||||
|
||||
info(
|
||||
title: "pcm api service"
|
||||
desc: "type desc here"
|
||||
author: "type author here"
|
||||
email: "type email here"
|
||||
info (
|
||||
title: "pcm api service"
|
||||
desc: "type desc here"
|
||||
author: "type author here"
|
||||
email: "type email here"
|
||||
version: "type version here"
|
||||
)
|
||||
|
||||
//core端接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: core
|
||||
group: core
|
||||
)
|
||||
service pcm {
|
||||
@doc "查询P端服务列表"
|
||||
|
@ -177,9 +177,9 @@ service pcm {
|
|||
}
|
||||
|
||||
//hpc二级接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: hpc
|
||||
group: hpc
|
||||
)
|
||||
service pcm {
|
||||
@doc "提交超算任务"
|
||||
|
@ -216,13 +216,17 @@ service pcm {
|
|||
|
||||
@doc "查询超算应用中心列表"
|
||||
@handler ListInstanceCenter
|
||||
get /hpc/ListInstanceCenter(HpcInstanceCenterReq) returns (PageResult)
|
||||
get /hpc/ListInstanceCenter (HpcInstanceCenterReq) returns (PageResult)
|
||||
|
||||
@doc "超算任务日志"
|
||||
@handler getHpcTaskLogHandler
|
||||
get /hpc/jobLogs/:taskId (HpcTaskLogReq) returns (HpcTaskLogResp)
|
||||
}
|
||||
|
||||
//cloud二级接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: cloud
|
||||
group: cloud
|
||||
)
|
||||
service pcm {
|
||||
@doc "云算任务列表"
|
||||
|
@ -258,9 +262,9 @@ service pcm {
|
|||
}
|
||||
|
||||
//智算二级接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: ai
|
||||
group: ai
|
||||
)
|
||||
service pcm {
|
||||
@doc "训练任务统计"
|
||||
|
@ -407,13 +411,13 @@ service pcm {
|
|||
@doc "文本识别"
|
||||
@handler ChatHandler
|
||||
post /ai/chat (ChatReq) returns (ChatResult)
|
||||
/******chat end***********/
|
||||
/******chat end***********/
|
||||
}
|
||||
|
||||
//screen接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: storage
|
||||
group: storage
|
||||
)
|
||||
service pcm {
|
||||
@doc "日常算力查询"
|
||||
|
@ -426,9 +430,9 @@ service pcm {
|
|||
}
|
||||
|
||||
//openstack 接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: vm
|
||||
group: vm
|
||||
)
|
||||
service pcm {
|
||||
@doc "openstack计算中心概览"
|
||||
|
@ -825,9 +829,9 @@ service pcm {
|
|||
}
|
||||
|
||||
//存算联动 接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: storelink
|
||||
group: storelink
|
||||
)
|
||||
service pcm {
|
||||
@handler UploadLinkImageHandler
|
||||
|
@ -856,9 +860,9 @@ service pcm {
|
|||
}
|
||||
|
||||
// 接口
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: adapters
|
||||
group: adapters
|
||||
)
|
||||
service pcm {
|
||||
@handler AdaptersListHandler
|
||||
|
@ -901,9 +905,9 @@ service pcm {
|
|||
get /adapter/getAdapterInfo (adapterInfoNameReq) returns (adapterInfoNameReqResp)
|
||||
}
|
||||
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: schedule
|
||||
group: schedule
|
||||
)
|
||||
service pcm {
|
||||
@handler ScheduleGetAiResourceTypesHandler
|
||||
|
@ -955,9 +959,9 @@ service pcm {
|
|||
post /schedule/cancelTask (CancelTaskReq) returns (CancelTaskResp)
|
||||
}
|
||||
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: inference
|
||||
group: inference
|
||||
)
|
||||
service pcm {
|
||||
@handler GetDeployInstanceHandler
|
||||
|
@ -1018,9 +1022,9 @@ service pcm {
|
|||
get /inference/getAdaptersByModel (GetAdaptersByModelReq) returns (GetAdaptersByModelResp)
|
||||
}
|
||||
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: dictionary
|
||||
group: dictionary
|
||||
)
|
||||
service pcm {
|
||||
@handler GetDict
|
||||
|
@ -1057,9 +1061,9 @@ service pcm {
|
|||
get /dictItem/code/:dictCode (DictCodeReq) returns (PageResult)
|
||||
}
|
||||
|
||||
@server(
|
||||
@server (
|
||||
prefix: pcm/v1
|
||||
group: monitoring
|
||||
group: monitoring
|
||||
)
|
||||
service pcm {
|
||||
@handler CreateAlertRuleHandler
|
||||
|
@ -1096,4 +1100,5 @@ service pcm {
|
|||
|
||||
@handler scheduleSituationHandler
|
||||
get /monitoring/schedule/situation returns (scheduleSituationResp)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
package hpc
|
||||
|
||||
import (
|
||||
"github.com/zeromicro/go-zero/rest/httpx"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/hpc"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
func GetHpcTaskLogHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
var req types.HpcTaskLogReq
|
||||
if err := httpx.Parse(r, &req); err != nil {
|
||||
result.ParamErrorResult(r, w, err)
|
||||
return
|
||||
}
|
||||
|
||||
l := hpc.NewGetHpcTaskLogLogic(r.Context(), svcCtx)
|
||||
resp, err := l.GetHpcTaskLog(&req)
|
||||
result.HttpResult(r, w, resp, err)
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -2,54 +2,101 @@ package hpc
|
|||
|
||||
import (
|
||||
"context"
|
||||
"github.com/go-resty/resty/v2"
|
||||
"github.com/pkg/errors"
|
||||
"fmt"
|
||||
"github.com/zeromicro/go-zero/core/logx"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
||||
"gitlink.org.cn/JointCloud/pcm-hpc/slurm"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
||||
)
|
||||
|
||||
type CancelJobLogic struct {
|
||||
logx.Logger
|
||||
ctx context.Context
|
||||
svcCtx *svc.ServiceContext
|
||||
ctx context.Context
|
||||
svcCtx *svc.ServiceContext
|
||||
hpcService *service.HpcService
|
||||
}
|
||||
|
||||
type TaskHPCResult struct {
|
||||
ID uint `gorm:"column:id"` // 对应 t.id
|
||||
JobID string `gorm:"column:job_id"` // 对应 hpc.job_id
|
||||
AdapterId string `gorm:"column:adapter_id"` // 对应 hpc.adapter_id
|
||||
}
|
||||
|
||||
func NewCancelJobLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CancelJobLogic {
|
||||
cache := make(map[string]interface{}, 10)
|
||||
hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &CancelJobLogic{
|
||||
Logger: logx.WithContext(ctx),
|
||||
ctx: ctx,
|
||||
svcCtx: svcCtx,
|
||||
Logger: logx.WithContext(ctx),
|
||||
ctx: ctx,
|
||||
svcCtx: svcCtx,
|
||||
hpcService: hpcService,
|
||||
}
|
||||
}
|
||||
|
||||
func (l *CancelJobLogic) CancelJob(req *types.CancelJobReq) error {
|
||||
var clusterInfo *types.ClusterInfo
|
||||
tx := l.svcCtx.DbEngin.Raw("select * from t_cluster where id = ?", req.ClusterId).Scan(&clusterInfo)
|
||||
if tx.Error != nil {
|
||||
return tx.Error
|
||||
//var clusterInfo *types.ClusterInfo
|
||||
//tx := l.svcCtx.DbEngin.Raw("select * from t_cluster where id = ?", req.ClusterId).Scan(&clusterInfo)
|
||||
//if tx.Error != nil {
|
||||
// return tx.Error
|
||||
//}
|
||||
//// 查询p端调用地址
|
||||
//var adapterAddress string
|
||||
//l.svcCtx.DbEngin.Raw("SELECT server FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterAddress)
|
||||
//var jobResp slurm.GetJobResp
|
||||
//httpClient := resty.New().R()
|
||||
//_, err := httpClient.SetHeader("Content-Type", "application/json").
|
||||
// SetQueryParams(map[string]string{
|
||||
// "jobId": req.JobId,
|
||||
// "server": clusterInfo.Server,
|
||||
// "version": clusterInfo.Version,
|
||||
// "token": clusterInfo.Token,
|
||||
// "username": clusterInfo.Username,
|
||||
// }).
|
||||
// SetResult(&jobResp).
|
||||
// Delete(adapterAddress + "/api/v1/job/cancel")
|
||||
//if err != nil {
|
||||
// return err
|
||||
//}
|
||||
//if len(jobResp.Errors) != 0 {
|
||||
// return errors.Errorf(jobResp.Errors[0].Description)
|
||||
//}
|
||||
//return nil
|
||||
var hpcR TaskHPCResult
|
||||
tx := l.svcCtx.DbEngin.Raw(
|
||||
"SELECT t.id, hpc.job_id ,hpc.adapter_id FROM task t "+
|
||||
"INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
|
||||
"WHERE adapter_type_dict = 2 AND t.id = ?",
|
||||
req.TaskId,
|
||||
).Scan(&hpcR).Error
|
||||
if tx != nil {
|
||||
return fmt.Errorf("数据库查询失败: %v", tx.Error)
|
||||
}
|
||||
// 查询p端调用地址
|
||||
var adapterAddress string
|
||||
l.svcCtx.DbEngin.Raw("SELECT server FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterAddress)
|
||||
var jobResp slurm.GetJobResp
|
||||
httpClient := resty.New().R()
|
||||
_, err := httpClient.SetHeader("Content-Type", "application/json").
|
||||
SetQueryParams(map[string]string{
|
||||
"jobId": req.JobId,
|
||||
"server": clusterInfo.Server,
|
||||
"version": clusterInfo.Version,
|
||||
"token": clusterInfo.Token,
|
||||
"username": clusterInfo.Username,
|
||||
}).
|
||||
SetResult(&jobResp).
|
||||
Delete(adapterAddress + "/api/v1/job/cancel")
|
||||
if hpcR.ID == 0 || hpcR.JobID == "" {
|
||||
return fmt.Errorf("作业不存在")
|
||||
}
|
||||
var adapterInfo types.AdapterInfo
|
||||
l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", hpcR.AdapterId).Scan(&adapterInfo)
|
||||
if adapterInfo.Id == "" {
|
||||
return fmt.Errorf("adapter not found")
|
||||
}
|
||||
// 取消作业
|
||||
err := l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].CancelTask(l.ctx, hpcR.JobID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(jobResp.Errors) != 0 {
|
||||
return errors.Errorf(jobResp.Errors[0].Description)
|
||||
// 更新数据库状态
|
||||
tx = l.svcCtx.DbEngin.Model(&types.Task{}).Where("id = ?", hpcR.ID).Update("status", "Canceled").Error
|
||||
if tx != nil {
|
||||
return fmt.Errorf("数据库更新失败: %v", tx.Error)
|
||||
}
|
||||
// 更新数据库状态
|
||||
tx = l.svcCtx.DbEngin.Model(&models.TaskHpc{}).Where("task_id = ?", hpcR.ID).Update("status", "Canceled").Error
|
||||
if tx != nil {
|
||||
return fmt.Errorf("数据库更新失败: %v", tx.Error)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -155,6 +155,7 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
|
|||
return nil, updates.Error
|
||||
}
|
||||
resp.Data.JobInfo["taskId"] = strconv.FormatInt(taskModel.Id, 10)
|
||||
logx.Infof("提交job到指定集群成功, resp: %v", resp)
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,59 @@
|
|||
package hpc
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
|
||||
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
||||
|
||||
"github.com/zeromicro/go-zero/core/logx"
|
||||
)
|
||||
|
||||
type GetHpcTaskLogLogic struct {
|
||||
logx.Logger
|
||||
ctx context.Context
|
||||
svcCtx *svc.ServiceContext
|
||||
hpcService *service.HpcService
|
||||
}
|
||||
|
||||
// 超算任务日志
|
||||
func NewGetHpcTaskLogLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetHpcTaskLogLogic {
|
||||
cache := make(map[string]interface{}, 10)
|
||||
hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &GetHpcTaskLogLogic{
|
||||
Logger: logx.WithContext(ctx),
|
||||
ctx: ctx,
|
||||
svcCtx: svcCtx,
|
||||
hpcService: hpcService,
|
||||
}
|
||||
}
|
||||
|
||||
func (l *GetHpcTaskLogLogic) GetHpcTaskLog(req *types.HpcTaskLogReq) (resp interface{}, err error) {
|
||||
var hpcR TaskHPCResult
|
||||
tx := l.svcCtx.DbEngin.Raw(
|
||||
"SELECT t.id, hpc.job_id ,hpc.adapter_id FROM task t "+
|
||||
"INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
|
||||
"WHERE adapter_type_dict = 2 AND t.id = ?",
|
||||
req.TaskId,
|
||||
).Scan(&hpcR).Error
|
||||
if tx != nil {
|
||||
return nil, fmt.Errorf("数据库查询失败: %v", tx.Error)
|
||||
}
|
||||
var adapterInfo types.AdapterInfo
|
||||
l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", hpcR.AdapterId).Scan(&adapterInfo)
|
||||
if adapterInfo.Id == "" {
|
||||
return nil, fmt.Errorf("adapter not found")
|
||||
}
|
||||
// 取消作业
|
||||
resp, err = l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id].GetTaskLogs(l.ctx, hpcR.JobID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
}
|
|
@ -9,6 +9,8 @@ import (
|
|||
type HPCCollector interface {
|
||||
GetTask(ctx context.Context, taskId string) (*Task, error)
|
||||
SubmitTask(ctx context.Context, req types.CommitHpcTaskReq) (*types.CommitHpcTaskResp, error)
|
||||
CancelTask(ctx context.Context, jobId string) error
|
||||
GetTaskLogs(ctx context.Context, jobId string) (interface{}, error)
|
||||
}
|
||||
|
||||
type JobInfo struct {
|
||||
|
|
|
@ -19,8 +19,11 @@ type ParticipantHpc struct {
|
|||
}
|
||||
|
||||
const (
|
||||
BackendSlurm = "slurm"
|
||||
JobDetailUrl = "/api/v1/jobs/detail/{backend}/{jobId}"
|
||||
SubmitTaskUrl = "/api/v1/jobs"
|
||||
CancelTaskUrl = "/api/v1/jobs/cancel/{backend}/{jobId}"
|
||||
JobLogUrl = "/api/v1/jobs/logs/{backend}/{jobId}"
|
||||
)
|
||||
|
||||
func NewHpc(host string, id int64, platform string) *ParticipantHpc {
|
||||
|
@ -91,3 +94,41 @@ func (c *ParticipantHpc) SubmitTask(ctx context.Context, req types.CommitHpcTask
|
|||
|
||||
return &resp, nil
|
||||
}
|
||||
|
||||
func (c *ParticipantHpc) CancelTask(ctx context.Context, jobId string) error {
|
||||
reqUrl := c.host + CancelTaskUrl
|
||||
resp := types.CommonResp{}
|
||||
logx.WithContext(ctx).Infof("取消超算集群任务, url: %s, jobId: %s", reqUrl, jobId)
|
||||
httpClient := resty.New().R()
|
||||
_, err := httpClient.SetHeaders(
|
||||
map[string]string{
|
||||
"Content-Type": "application/json",
|
||||
"traceId": result.TraceIDFromContext(ctx),
|
||||
}).SetPathParams(map[string]string{
|
||||
"backend": BackendSlurm,
|
||||
"jobId": jobId,
|
||||
}).SetResult(&resp).Delete(reqUrl)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *ParticipantHpc) GetTaskLogs(ctx context.Context, jobId string) (interface{}, error) {
|
||||
reqUrl := c.host + JobLogUrl
|
||||
resp := types.CommonResp{}
|
||||
logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", reqUrl, jobId)
|
||||
httpClient := resty.New().R()
|
||||
_, err := httpClient.SetHeaders(
|
||||
map[string]string{
|
||||
"Content-Type": "application/json",
|
||||
"traceId": result.TraceIDFromContext(ctx),
|
||||
}).SetPathParams(map[string]string{
|
||||
"backend": BackendSlurm,
|
||||
"jobId": jobId,
|
||||
}).SetResult(&resp).Get(reqUrl)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return resp, nil
|
||||
}
|
||||
|
|
11352
internal/types/types.go
11352
internal/types/types.go
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue