This commit is contained in:
zhangwei 2025-07-30 14:40:47 +08:00
commit 594dec4538
14 changed files with 409 additions and 85 deletions

View File

@ -947,6 +947,16 @@ type (
ProxyEnable string `json:"proxyEnable,omitempty" db:"proxy_enable"` ProxyEnable string `json:"proxyEnable,omitempty" db:"proxy_enable"`
Driver string `json:"driver,omitempty" db:"driver"` Driver string `json:"driver,omitempty" db:"driver"`
} }
ClusterBaseInfo {
Id string `json:"id,omitempty" db:"id"`
AdapterId int64 `json:"adapterId,omitempty,string" db:"adapter_id"`
Name string `json:"name,omitempty" db:"name"`
Nickname string `json:"nickname,omitempty" db:"nickname"`
Description string `json:"description,omitempty" db:"description"`
Server string `json:"server,omitempty" db:"server"`
Driver string `json:"driver,omitempty" db:"driver"`
}
) )
type ClusterDelReq { type ClusterDelReq {
@ -1416,6 +1426,7 @@ type ResourceSpecReq {
type FetchResourceSpecReq { type FetchResourceSpecReq {
ClusterId string `form:"clusterId,optional"` ClusterId string `form:"clusterId,optional"`
Tag string `form:"tag,optional"` Tag string `form:"tag,optional"`
UserId int64 `form:"userId,optional"`
} }
type IdReq { type IdReq {
@ -1479,8 +1490,10 @@ type EditResourceReq {
CpuUnit string `json:"cpuUnit,optional"` CpuUnit string `json:"cpuUnit,optional"`
MemoryValue string `json:"memoryValue,optional"` MemoryValue string `json:"memoryValue,optional"`
MemoryUnit string `json:"memoryUnit,optional"` MemoryUnit string `json:"memoryUnit,optional"`
UserId int64 `json:"userId,optional"`
} }
type SyncResourceReq { type SyncResourceReq {
Id string `json:"id"` Id string `json:"id"`
UserId int64 `json:"userId,optional"`
} }

View File

@ -338,7 +338,7 @@ service pcm {
@doc "创建数据集" @doc "创建数据集"
@handler CreateDataSetHandler @handler CreateDataSetHandler
post /ai/createDataSet/:projectId (CreateDataSetReq) returns (CreateDataSetResp) post /ai/createDataSet (CreateDataSetReq) returns (CreateDataSetResp)
@doc "删除数据集" @doc "删除数据集"
@handler DeleteDataSetHandler @handler DeleteDataSetHandler
@ -362,7 +362,7 @@ service pcm {
@doc "创建算法" @doc "创建算法"
@handler CreateAlgorithmHandler @handler CreateAlgorithmHandler
post /ai/CreateAlgorithm/:projectId (CreateAlgorithmReq) returns (CreateAlgorithmResp) post /ai/createAlgorithm (CreateAlgorithmReq) returns (CreateAlgorithmResp)
@doc "查询创建算法列表" @doc "查询创建算法列表"
@handler ListAlgorithms @handler ListAlgorithms
@ -948,6 +948,9 @@ service pcm {
@handler GetAdapterInfoHandler @handler GetAdapterInfoHandler
get /adapter/getAdapterInfo (adapterInfoNameReq) returns (adapterInfoNameReqResp) get /adapter/getAdapterInfo (adapterInfoNameReq) returns (adapterInfoNameReqResp)
@handler GetClusterBaseInfoHandler
get /adapter/cluster/getClusterBaseInfo (ClusterReq) returns (PageResult)
} }
@server ( @server (

View File

@ -0,0 +1,24 @@
package adapters
import (
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/logic/adapters"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
func GetClusterBaseInfoHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.ClusterReq
if err := httpx.Parse(r, &req); err != nil {
result.ParamErrorResult(r, w, err)
return
}
l := adapters.NewGetClusterBaseInfoLogic(r.Context(), svcCtx)
resp, err := l.GetClusterBaseInfo(&req)
result.HttpResult(r, w, resp, err)
}
}

View File

@ -6,6 +6,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"net/http" "net/http"
) )
@ -16,7 +17,14 @@ func CompareResourceSpecHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
result.ParamErrorResult(r, w, err) result.ParamErrorResult(r, w, err)
return return
} }
token := r.Header.Get("Authorization")
// 获取用户信息
jccUserInfo, err := utils.ParseTokenWithoutVerify(token)
if err != nil {
result.ParamErrorResult(r, w, err)
return
}
req.UserId = jccUserInfo.Id
l := core.NewCompareResourceSpecLogic(r.Context(), svcCtx) l := core.NewCompareResourceSpecLogic(r.Context(), svcCtx)
resp, err := l.CompareResourceSpec(&req) resp, err := l.CompareResourceSpec(&req)
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)

View File

@ -6,6 +6,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"net/http" "net/http"
) )
@ -17,6 +18,15 @@ func EditResourceSpecHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return return
} }
token := r.Header.Get("Authorization")
// 获取用户信息
jccUserInfo, err := utils.ParseTokenWithoutVerify(token)
if err != nil {
result.ParamErrorResult(r, w, err)
return
}
req.UserId = jccUserInfo.Id
l := core.NewEditResourceSpecLogic(r.Context(), svcCtx) l := core.NewEditResourceSpecLogic(r.Context(), svcCtx)
resp, err := l.EditResourceSpec(&req) resp, err := l.EditResourceSpec(&req)
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)

View File

@ -6,6 +6,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"net/http" "net/http"
) )
@ -17,6 +18,14 @@ func SyncResourceSpecHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return return
} }
token := r.Header.Get("Authorization")
// 获取用户信息
jccUserInfo, err := utils.ParseTokenWithoutVerify(token)
if err != nil {
result.ParamErrorResult(r, w, err)
return
}
req.UserId = jccUserInfo.Id
l := core.NewSyncResourceSpecLogic(r.Context(), svcCtx) l := core.NewSyncResourceSpecLogic(r.Context(), svcCtx)
resp, err := l.SyncResourceSpec(&req) resp, err := l.SyncResourceSpec(&req)
result.HttpResult(r, w, resp, err) result.HttpResult(r, w, resp, err)

View File

@ -39,6 +39,11 @@ func RegisterHandlers(server *rest.Server, serverCtx *svc.ServiceContext) {
Path: "/adapter/cluster/get", Path: "/adapter/cluster/get",
Handler: adapters.GetClusterHandler(serverCtx), Handler: adapters.GetClusterHandler(serverCtx),
}, },
{
Method: http.MethodGet,
Path: "/adapter/cluster/getClusterBaseInfo",
Handler: adapters.GetClusterBaseInfoHandler(serverCtx),
},
{ {
Method: http.MethodGet, Method: http.MethodGet,
Path: "/adapter/cluster/list", Path: "/adapter/cluster/list",

View File

@ -0,0 +1,84 @@
package adapters
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
type GetClusterBaseInfoLogic struct {
logx.Logger
ctx context.Context
svcCtx *svc.ServiceContext
}
func NewGetClusterBaseInfoLogic(ctx context.Context, svcCtx *svc.ServiceContext) *GetClusterBaseInfoLogic {
return &GetClusterBaseInfoLogic{
Logger: logx.WithContext(ctx),
ctx: ctx,
svcCtx: svcCtx,
}
}
func (l *GetClusterBaseInfoLogic) GetClusterBaseInfo(req *types.ClusterReq) (resp *types.PageResult, err error) {
limit := req.PageSize
offset := req.PageSize * (req.PageNum - 1)
resp = &types.PageResult{}
var list []types.ClusterBaseInfo
db := l.svcCtx.DbEngin.Model(&types.AdapterInfo{}).Table("t_cluster")
db = db.Joins("left join t_adapter on t_adapter.id = t_cluster.adapter_id").
Where("t_cluster.deleted_at is null")
if req.Name != "" {
db = db.Where("t_cluster.name LIKE ?", "%"+req.Name+"%")
}
if req.AdapterId != "" {
db = db.Where("t_cluster.adapter_id = ?", req.AdapterId)
}
if req.Nickname != "" {
db = db.Where("t_cluster.nickname LIKE ?", "%"+req.Nickname+"%")
}
if req.Label != "" {
db = db.Where("t_cluster.label = ?", req.Label)
}
if req.Version != "" {
db = db.Where("t_cluster.version = ?", req.Version)
}
if req.ProducerDict != "" {
db = db.Where("t_cluster.producer_dict = ?", req.ProducerDict)
}
if req.RegionDict != "" {
db = db.Where("t_cluster.region_dict = ?", req.RegionDict)
}
if req.Type != "" {
db = db.Where("t_adapter.type = ?", req.Type)
}
if req.ResourceType != "" {
db = db.Where("t_adapter.resource_type = ?", req.ResourceType)
}
if req.StorageSchedule != "" {
db = db.Where("t_cluster.storage_schedule = ?", req.StorageSchedule)
}
//count total
var total int64
err = db.Select("*").Count(&total).Error
if err != nil {
return resp, err
}
db = db.Limit(limit).Offset(offset)
err = db.Select("t_cluster.*").Order("t_cluster.create_time desc").Scan(&list).Error
if err != nil {
return resp, err
}
resp.List = list
resp.PageSize = req.PageSize
resp.PageNum = req.PageNum
resp.Total = total
return resp, nil
}

View File

@ -79,7 +79,7 @@ func (l *CompareResourceSpecLogic) CompareResourceSpec(req *types.FetchResourceS
} }
// 同步资源到数据库 // 同步资源到数据库
if err := l.syncResourcesToDB(apiResources); err != nil { if err := l.syncResourcesToDB(apiResources, req.UserId); err != nil {
return nil, fmt.Errorf("failed to sync resources: %w", err) return nil, fmt.Errorf("failed to sync resources: %w", err)
} }
@ -135,10 +135,10 @@ func decodeAPIResponse(input interface{}, output *[]APIResponse) error {
return nil return nil
} }
func (l *CompareResourceSpecLogic) syncResourcesToDB(apiResponses []APIResponse) error { func (l *CompareResourceSpecLogic) syncResourcesToDB(apiResponses []APIResponse, userId int64) error {
for _, response := range apiResponses { for _, response := range apiResponses {
// 转换API响应到数据库模型 // 转换API响应到数据库模型
dbSpecs, apiSpecs, err := l.processAPIResponse(response) dbSpecs, apiSpecs, err := l.processAPIResponse(response, userId)
if err != nil { if err != nil {
return err return err
} }
@ -151,7 +151,7 @@ func (l *CompareResourceSpecLogic) syncResourcesToDB(apiResponses []APIResponse)
return nil return nil
} }
func (l *CompareResourceSpecLogic) processAPIResponse(response APIResponse) ([]models.TResourceSpec, []models.TResourceSpec, error) { func (l *CompareResourceSpecLogic) processAPIResponse(response APIResponse, userId int64) ([]models.TResourceSpec, []models.TResourceSpec, error) {
ClusterId := utils.StringToInt64(response.ClusterId) ClusterId := utils.StringToInt64(response.ClusterId)
var dbSpecs []models.TResourceSpec var dbSpecs []models.TResourceSpec
if err := l.svcCtx.DbEngin.Model(models.TResourceSpec{}).Preload("BaseResourceSpecs"). if err := l.svcCtx.DbEngin.Model(models.TResourceSpec{}).Preload("BaseResourceSpecs").
@ -167,7 +167,7 @@ func (l *CompareResourceSpecLogic) processAPIResponse(response APIResponse) ([]m
if res.Resource.Name == "" || res.Resource.Type == "" { if res.Resource.Name == "" || res.Resource.Type == "" {
continue continue
} }
spec := l.convertToResourceSpec(ClusterId, res, response.Tag) spec := l.convertToResourceSpec(ClusterId, res, response.Tag, userId)
apiSpecs = append(apiSpecs, spec) apiSpecs = append(apiSpecs, spec)
} }
@ -333,7 +333,7 @@ func (l *CompareResourceSpecLogic) isSpecChanged(old, new models.TResourceSpec)
return len(oldBaseMap) > 0 return len(oldBaseMap) > 0
} }
func (l *CompareResourceSpecLogic) convertToResourceSpec(ClusterId int64, res Resource, tag string) models.TResourceSpec { func (l *CompareResourceSpecLogic) convertToResourceSpec(ClusterId int64, res Resource, tag string, userId int64) models.TResourceSpec {
spec := models.TResourceSpec{ spec := models.TResourceSpec{
SourceKey: resourceKey(res.Resource.Type, res.Resource.Name, tag), SourceKey: resourceKey(res.Resource.Type, res.Resource.Name, tag),
Type: res.Resource.Type, Type: res.Resource.Type,
@ -344,6 +344,7 @@ func (l *CompareResourceSpecLogic) convertToResourceSpec(ClusterId int64, res Re
ClusterId: ClusterId, ClusterId: ClusterId,
CreateTime: time.Now(), CreateTime: time.Now(),
UpdateTime: time.Now(), UpdateTime: time.Now(),
UserId: userId,
ChangeType: ChangeTypeNormal, ChangeType: ChangeTypeNormal,
} }
@ -355,6 +356,7 @@ func (l *CompareResourceSpecLogic) convertToResourceSpec(ClusterId int64, res Re
TotalUnit: br.Total.Unit, TotalUnit: br.Total.Unit,
AvailableValue: br.Available.Value, AvailableValue: br.Available.Value,
AvailableUnit: br.Available.Unit, AvailableUnit: br.Available.Unit,
UserId: userId,
CreateTime: time.Now(), CreateTime: time.Now(),
UpdateTime: time.Now(), UpdateTime: time.Now(),
}) })

View File

@ -58,7 +58,7 @@ func (l *EditResourceSpecLogic) EditResourceSpec(req *types.EditResourceReq) (re
costPerUnit := utils.StringToFloat64(req.CostPerUnit) costPerUnit := utils.StringToFloat64(req.CostPerUnit)
// 4. 更新主资源规格 // 4. 更新主资源规格
if err = updateMainResourceSpec(tx, req.Id, statusInt, req.CostType, costPerUnit); err != nil { if err = updateMainResourceSpec(tx, req.Id, statusInt, req.CostType, costPerUnit, req.UserId); err != nil {
return nil, err return nil, err
} }
@ -98,13 +98,14 @@ func validateRequestParams(req *types.EditResourceReq) error {
} }
// updateMainResourceSpec 更新主资源规格 // updateMainResourceSpec 更新主资源规格
func updateMainResourceSpec(tx *gorm.DB, id int64, status int64, costType string, costPerUnit float64) error { func updateMainResourceSpec(tx *gorm.DB, id int64, status int64, costType string, costPerUnit float64, userId int64) error {
return tx.Model(&models.TResourceSpec{}). return tx.Model(&models.TResourceSpec{}).
Where("id = ?", id). Where("id = ?", id).
Updates(map[string]interface{}{ Updates(map[string]interface{}{
"status": status, "status": status,
"cost_type": costType, "cost_type": costType,
"cost_per_unit": costPerUnit, "cost_per_unit": costPerUnit,
"user_id": userId,
}). }).
Error Error
} }

View File

@ -51,7 +51,7 @@ func (l *SyncResourceSpecLogic) SyncResourceSpec(req *types.SyncResourceReq) (re
} }
for _, response := range apiResources { for _, response := range apiResources {
// 转换API响应到数据库模型 // 转换API响应到数据库模型
_, apiSpecs, err := compareLogic.processAPIResponse(response) _, apiSpecs, err := compareLogic.processAPIResponse(response, req.UserId)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -1,6 +1,8 @@
package status package status
import ( import (
"context"
"fmt"
jsoniter "github.com/json-iterator/go" jsoniter "github.com/json-iterator/go"
"github.com/rs/zerolog/log" "github.com/rs/zerolog/log"
"github.com/zeromicro/go-zero/core/logx" "github.com/zeromicro/go-zero/core/logx"
@ -10,8 +12,9 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
"net/http" "gorm.io/gorm"
"strconv" "strconv"
"time"
) )
func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error { func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error {
@ -38,84 +41,226 @@ func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpc
} }
// UpdateHpcTaskStatus 更新超算任务状态,并通知中间件 // UpdateHpcTaskStatus 更新超算任务状态,并通知中间件
//func UpdateHpcTaskStatus(svc *svc.ServiceContext) {
// svc.Scheduler.HpcService.TaskSyncLock.Lock()
// defer svc.Scheduler.HpcService.TaskSyncLock.Unlock()
// taskHpcs := make([]*models.TaskHpc, 0)
// sqlStr := `SELECT *
// FROM task_hpc
// WHERE
// job_id != ''
// AND (
// status NOT IN ('Failed', 'Completed', 'Cancelled')
// OR start_time < created_time
// )
// ORDER BY created_time DESC
// LIMIT 10`
// db := svc.DbEngin.Raw(sqlStr).Scan(&taskHpcs)
// if db.Error != nil {
// logx.Errorf(db.Error.Error())
// return
// }
// for _, hpc := range taskHpcs {
// //更新task表的超算任务状态
// task := &types.TaskModel{}
// tx := svc.DbEngin.Model(models.Task{}).Where("id", hpc.TaskId).Scan(&task)
// if tx.Error != nil {
// logx.Errorf(tx.Error.Error())
// break
// }
// clusterId := utils.Int64ToString(hpc.ClusterId)
// h := http.Request{}
// hpcTask, err := svc.Scheduler.HpcService.HpcExecutorAdapterMap[strconv.FormatInt(hpc.AdapterId, 10)].GetTask(h.Context(), hpc.JobId, clusterId)
// if err != nil {
// logx.Errorf(err.Error())
// break
// }
// switch hpcTask.Status {
// case constants.Running:
// if hpc.Status != hpcTask.Status {
// svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "running", "任务运行中")
// hpc.Status = hpcTask.Status
// task.Status = hpcTask.Status
// }
// case constants.Failed:
// if hpc.Status != hpcTask.Status {
// svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "failed", "任务失败")
// hpc.Status = hpcTask.Status
// task.Status = hpcTask.Status
// logx.Infof("[%v]:任务执行失败,发送通知, 任务状态: [%v]", hpcTask, hpcTask.Status)
// _ = reportHpcStatusMessages(svc, task, hpc, false, "任务失败")
// }
// case constants.Completed:
// if hpc.Status != hpcTask.Status {
// svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "completed", "任务完成")
// hpc.Status = hpcTask.Status
// task.Status = hpcTask.Status
// logx.Infof("[%v]:任务执行完成,发送通知, 任务状态: [%v]", hpcTask, hpcTask.Status)
// _ = reportHpcStatusMessages(svc, task, hpc, true, "任务完成")
// }
// default:
// if hpc.Status != hpcTask.Status {
// svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "pending", "任务pending")
// hpc.Status = hpcTask.Status
// task.Status = hpcTask.Status
// }
// }
// //task.Id=hpcTask.
// task.StartTime = hpcTask.Start
// task.EndTime = hpcTask.End
// hpc.StartTime = hpcTask.Start
// hpc.EndTime = hpcTask.End
// logx.Info("# task 开始时间: %v, 结束时间: %v", task.StartTime, task.EndTime)
// err = svc.Scheduler.HpcStorages.UpdateTask(task)
// if err != nil {
// logx.Errorf(err.Error())
// break
// }
// err = svc.Scheduler.HpcStorages.UpdateHpcTask(hpc)
// if err != nil {
// logx.Errorf(err.Error())
// break
// }
// }
//}
// UpdateHpcTaskStatus HPC 任务状态同步函数
func UpdateHpcTaskStatus(svc *svc.ServiceContext) { func UpdateHpcTaskStatus(svc *svc.ServiceContext) {
svc.Scheduler.HpcService.TaskSyncLock.Lock() // 1. 查询需要同步的 HPC 任务
defer svc.Scheduler.HpcService.TaskSyncLock.Unlock() var hpcTasks []*models.TaskHpc
taskList := make([]*models.TaskHpc, 0) sqlStr := `SELECT * FROM task_hpc WHERE job_id != '' AND status NOT IN ('Failed', 'Completed', 'Cancelled') ORDER BY created_time DESC LIMIT 10`
sqlStr := `SELECT * if err := svc.DbEngin.Raw(sqlStr).Scan(&hpcTasks).Error; err != nil {
FROM task_hpc logx.Errorf("Failed to query HPC tasks for sync: %v", err)
WHERE
job_id != ''
AND (
status NOT IN ('Failed', 'Completed', 'Cancelled')
OR start_time < created_time
)
ORDER BY created_time DESC
LIMIT 10`
db := svc.DbEngin.Raw(sqlStr).Scan(&taskList)
if db.Error != nil {
logx.Errorf(db.Error.Error())
return return
} }
for _, hpc := range taskList {
//更新task表的超算任务状态 if len(hpcTasks) == 0 {
task := &types.TaskModel{} return
tx := svc.DbEngin.Model(models.Task{}).Where("id", hpc.TaskId).Scan(&task) }
if tx.Error != nil {
logx.Errorf(tx.Error.Error()) // 2. 批量获取关联的 Task 模型
break taskIDs := make([]int64, len(hpcTasks))
for i, hpc := range hpcTasks {
taskIDs[i] = hpc.TaskId
}
taskMap := make(map[int64]*types.TaskModel)
var tasks []*types.TaskModel
if err := svc.DbEngin.Model(&models.Task{}).Where("id IN ?", taskIDs).Find(&tasks).Error; err != nil {
logx.Errorf("Failed to batch query tasks: %v", err)
return
}
for _, task := range tasks {
taskMap[task.Id] = task
}
// 3. 遍历 HPC 任务并更新状态
for _, hpc := range hpcTasks {
task, ok := taskMap[hpc.TaskId]
if !ok {
logx.Errorf("Task with ID %d not found for HPC task %d, skipping", hpc.TaskId, hpc.Id)
continue
} }
clusterId := utils.Int64ToString(hpc.ClusterId)
h := http.Request{} // 使用带超时的 Context防止 API 调用阻塞
hpcTask, err := svc.Scheduler.HpcService.HpcExecutorAdapterMap[strconv.FormatInt(hpc.AdapterId, 10)].GetTask(h.Context(), hpc.JobId, clusterId) ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
adapterIDStr := strconv.FormatInt(hpc.AdapterId, 10)
adapter, adapterExists := svc.Scheduler.HpcService.HpcExecutorAdapterMap[adapterIDStr]
if !adapterExists {
logx.Errorf("HPC adapter with ID %s not found, skipping task %s", adapterIDStr, hpc.Name)
continue
}
// 4. 从 HPC 集群获取最新状态
hpcTaskInfo, err := adapter.GetTask(ctx, hpc.JobId, utils.Int64ToString(hpc.ClusterId))
if err != nil { if err != nil {
logx.Errorf(err.Error()) logx.Errorf("Failed to get task status from HPC executor for job %s: %v", hpc.JobId, err)
break continue // 继续处理下一个任务
} }
switch hpcTask.Status {
case constants.Running: // 如果状态没有变化,则跳过
if hpc.Status != hpcTask.Status { if hpc.Status == hpcTaskInfo.Status {
svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "running", "任务运行中") continue
hpc.Status = hpcTask.Status
task.Status = hpcTask.Status
}
case constants.Failed:
if hpc.Status != hpcTask.Status {
svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "failed", "任务失败")
hpc.Status = hpcTask.Status
task.Status = hpcTask.Status
logx.Infof("[%v]:任务执行失败,发送通知, 任务状态: [%v]", hpcTask, hpcTask.Status)
_ = reportHpcStatusMessages(svc, task, hpc, false, "任务失败")
}
case constants.Completed:
if hpc.Status != hpcTask.Status {
svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "completed", "任务完成")
hpc.Status = hpcTask.Status
task.Status = hpcTask.Status
logx.Infof("[%v]:任务执行完成,发送通知, 任务状态: [%v]", hpcTask, hpcTask.Status)
_ = reportHpcStatusMessages(svc, task, hpc, true, "任务完成")
}
default:
if hpc.Status != hpcTask.Status {
svc.Scheduler.HpcStorages.AddNoticeInfo(strconv.FormatInt(hpc.AdapterId, 10), hpc.AdapterName, strconv.FormatInt(hpc.ClusterId, 10), hpc.ClusterName, hpc.Name, "pending", "任务pending")
hpc.Status = hpcTask.Status
task.Status = hpcTask.Status
}
} }
task.StartTime = hpcTask.Start
task.EndTime = hpcTask.End // 5. 准备更新
hpc.StartTime = hpcTask.Start previousStatus := hpc.Status
hpc.EndTime = hpcTask.End hpc.Status = hpcTaskInfo.Status
logx.Info("# task 开始时间: %v, 结束时间: %v", task.StartTime, task.EndTime) hpc.StartTime = hpcTaskInfo.Start
err = svc.Scheduler.HpcStorages.UpdateTask(task) hpc.EndTime = hpcTaskInfo.End
task.Status = hpcTaskInfo.Status
task.StartTime = hpcTaskInfo.Start
task.EndTime = hpcTaskInfo.End
logx.Infof("HPC task status change detected for job %s: %s -> %s", hpc.JobId, previousStatus, hpc.Status)
// 6. 在事务中更新数据库
err = svc.DbEngin.Transaction(func(tx *gorm.DB) error {
task.UpdatedTime = time.Now().Format(constants.Layout)
if err := tx.Table("task").Updates(task).Error; err != nil {
return fmt.Errorf("failed to update task table: %w", err)
}
if err := tx.Table("task_hpc").Updates(hpc).Error; err != nil {
return fmt.Errorf("failed to update hpc_task table: %w", err)
}
return nil
})
if err != nil { if err != nil {
logx.Errorf(err.Error()) logx.Errorf("Failed to update database in transaction for job %s: %v", hpc.JobId, err)
break // 事务失败,回滚状态,继续处理下一个任务
hpc.Status = previousStatus
task.Status = previousStatus
continue
} }
err = svc.Scheduler.HpcStorages.UpdateHpcTask(hpc)
if err != nil { // 7. 根据新状态执行后续操作 (通知、报告等)
logx.Errorf(err.Error()) handleStatusChange(svc, task, hpc, hpcTaskInfo.Status)
break }
}
// handleStatusChange 根据新状态执行后续操作
func handleStatusChange(svc *svc.ServiceContext, task *types.TaskModel, hpc *models.TaskHpc, newStatus string) {
adapterIDStr := strconv.FormatInt(hpc.AdapterId, 10)
clusterIDStr := strconv.FormatInt(hpc.ClusterId, 10)
var noticeType, noticeMessage string
var reportSuccess bool
var shouldReport bool
switch newStatus {
case constants.Running:
noticeType = "running"
noticeMessage = "任务运行中"
case constants.Failed:
noticeType = "failed"
noticeMessage = "任务失败"
reportSuccess = false
shouldReport = true
case constants.Completed:
noticeType = "completed"
noticeMessage = "任务完成"
reportSuccess = true
shouldReport = true
case constants.Pending:
noticeType = "pending"
noticeMessage = "任务pending"
default:
// 对于其他未知状态,可以选择记录日志并返回
logx.Errorf("Unhandled HPC task status '%s' for job %s", newStatus, hpc.JobId)
return
}
// 发送通知
svc.Scheduler.HpcStorages.AddNoticeInfo(adapterIDStr, hpc.AdapterName, clusterIDStr, hpc.ClusterName, hpc.Name, noticeType, noticeMessage)
logx.Infof("[%s]: 任务状态变更为 [%s],发送通知。", hpc.Name, newStatus)
// 上报状态
if shouldReport {
if err := reportHpcStatusMessages(svc, task, hpc, reportSuccess, noticeMessage); err != nil {
logx.Errorf("Failed to report HPC status for job %s: %v", hpc.JobId, err)
} }
} }
} }

View File

@ -162,8 +162,14 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri
outputs := make([]*modelarts.OutputTraining, 0) outputs := make([]*modelarts.OutputTraining, 0)
outputValue := "" outputValue := ""
for _, env := range envs { for _, env := range envs {
s := strings.Split(env, COMMA) // 找到第一个逗号位置
environments[s[0]] = s[1] idx := strings.Index(env, COMMA)
if idx == -1 {
continue
}
key := strings.TrimSpace(env[:idx])
value := strings.TrimSpace(env[idx+1:])
environments[key] = value
} }
for _, param := range params { for _, param := range params {
s := strings.Split(param, COMMA) s := strings.Split(param, COMMA)

View File

@ -700,6 +700,16 @@ type ClusterAvail struct {
ClusterName string `json:"clusterName"` ClusterName string `json:"clusterName"`
} }
type ClusterBaseInfo struct {
Id string `json:"id,omitempty" db:"id"`
AdapterId int64 `json:"adapterId,omitempty,string" db:"adapter_id"`
Name string `json:"name,omitempty" db:"name"`
Nickname string `json:"nickname,omitempty" db:"nickname"`
Description string `json:"description,omitempty" db:"description"`
Server string `json:"server,omitempty" db:"server"`
Driver string `json:"driver,omitempty" db:"driver"`
}
type ClusterCreateReq struct { type ClusterCreateReq struct {
Id string `json:"id,optional"` Id string `json:"id,optional"`
AdapterId string `json:"adapterId,optional"` AdapterId string `json:"adapterId,optional"`
@ -2145,6 +2155,8 @@ type EditResourceReq struct {
CpuUnit string `json:"cpuUnit,optional"` CpuUnit string `json:"cpuUnit,optional"`
MemoryValue string `json:"memoryValue,optional"` MemoryValue string `json:"memoryValue,optional"`
MemoryUnit string `json:"memoryUnit,optional"` MemoryUnit string `json:"memoryUnit,optional"`
UserId int64 `json:"userId,optional"`
} }
type EndpointsReq struct { type EndpointsReq struct {
@ -2283,6 +2295,7 @@ type Fault struct {
type FetchResourceSpecReq struct { type FetchResourceSpecReq struct {
ClusterId string `form:"clusterId,optional"` ClusterId string `form:"clusterId,optional"`
Tag string `form:"tag,optional"` Tag string `form:"tag,optional"`
UserId int64 `form:"userId,optional"`
} }
type Fields struct { type Fields struct {
@ -5566,7 +5579,8 @@ type SyncClusterAlertReq struct {
} }
type SyncResourceReq struct { type SyncResourceReq struct {
Id string `json:"id"` Id string `json:"id"`
UserId int64 `json:"userId,optional"`
} }
type Tags struct { type Tags struct {