pcm-coordinator/internal/logic/hpc/commithpctasklogic.go

155 lines
4.2 KiB
Go

package hpc
import (
"context"
"errors"
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"gitlink.org.cn/JointCloud/pcm-hpc/slurm"
"strconv"
"time"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
// CommitHpcTaskLogic carries the per-request state used to commit an HPC task:
// a context-bound logger, the request context, and the shared service context.
type CommitHpcTaskLogic struct {
	// Logger is embedded so logging methods can be called directly on the logic value.
	logx.Logger
	// ctx is the context supplied to NewCommitHpcTaskLogic.
	ctx context.Context
	// svcCtx is the shared service context (its DbEngin is used for persistence).
	svcCtx *svc.ServiceContext
}
// NewCommitHpcTaskLogic returns a CommitHpcTaskLogic bound to ctx and svcCtx,
// with a logger derived from ctx.
func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CommitHpcTaskLogic {
	logic := new(CommitHpcTaskLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}
// CommitHpcTask records an HPC task for the cluster named in req and submits it
// as a job to that cluster.
//
// Steps, in order:
//  1. load the target cluster from t_cluster and its adapter name from t_adapter;
//  2. insert the parent task row and the HPC task row, both with status "Running";
//  3. submit the job through submitJob;
//  4. insert a "create" notice into t_notice (best effort: a failure is only logged).
//
// Both lookups are validated before anything is written, so a missing cluster or
// adapter no longer leaves an orphan task row behind.
//
// It returns the submitted job ID, or an error when the cluster or adapter
// cannot be found, an insert fails, or the submission fails.
func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) {
	// Resolve the target cluster. An empty Id means no row was loaded; the
	// underlying query error (if any) is logged so DB failures are not hidden
	// behind the generic "not found" message.
	var clusterInfo types.ClusterInfo
	clusterQuery := l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where id = ?", req.ClusterId).First(&clusterInfo)
	if len(clusterInfo.Id) == 0 {
		if clusterQuery.Error != nil {
			l.Errorf("query cluster %v: %v", req.ClusterId, clusterQuery.Error)
		}
		return nil, errors.New("cluster not found")
	}

	// Resolve the adapter name before persisting anything.
	var adapterName string
	adapterQuery := l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterName)
	if adapterName == "" {
		if adapterQuery.Error != nil {
			l.Errorf("query adapter %v: %v", clusterInfo.AdapterId, adapterQuery.Error)
		}
		return nil, errors.New("no corresponding adapter found")
	}

	// One timestamp for both rows so their creation times agree.
	now := time.Now()

	// Build and persist the parent task record.
	taskModel := models.Task{
		Name:            req.Name,
		Description:     req.Description,
		CommitTime:      now,
		Status:          "Running",
		AdapterTypeDict: "2",
	}
	if tx := l.svcCtx.DbEngin.Create(&taskModel); tx.Error != nil {
		return nil, tx.Error
	}

	// Build and persist the HPC-specific task record.
	hpcInfo := models.TaskHpc{
		TaskId:      taskModel.Id,
		AdapterId:   clusterInfo.AdapterId,
		AdapterName: adapterName,
		ClusterId:   req.ClusterId,
		ClusterName: clusterInfo.Name,
		Name:        taskModel.Name,
		CmdScript:   req.CmdScript,
		// NOTE(review): Time.String() is a debugging format and includes the
		// monotonic clock reading; kept as-is because consumers of this column
		// are not visible here. Confirm before switching to a fixed layout.
		StartTime: now.String(),
		CardCount: req.CardCount,
		// The job's working directory is the cluster's base directory followed
		// by the request-relative path (plain concatenation, no separator added).
		WorkDir:     clusterInfo.WorkDir + req.WorkDir,
		WallTime:    req.WallTime,
		AppType:     req.AppType,
		AppName:     req.AppName,
		Queue:       req.Queue,
		SubmitType:  req.SubmitType,
		NNode:       req.NNode,
		Account:     clusterInfo.Username,
		StdInput:    req.StdInput,
		Partition:   req.Partition,
		CreatedTime: now,
		UpdatedTime: now,
		Status:      "Running",
	}
	if tx := l.svcCtx.DbEngin.Create(&hpcInfo); tx.Error != nil {
		return nil, tx.Error
	}

	// Submit the job to the target cluster.
	// NOTE(review): if submission fails, the two rows above keep status
	// "Running"; presumably they should be marked failed or rolled back —
	// confirm the intended status vocabulary before changing this.
	jobId, err := submitJob(&hpcInfo, &clusterInfo)
	if err != nil {
		return nil, err
	}

	// Record the operation notice. This is best effort: the job has already
	// been submitted, so a failure here is logged rather than returned.
	noticeInfo := clientCore.NoticeInfo{
		AdapterId:   clusterInfo.AdapterId,
		AdapterName: adapterName,
		ClusterId:   req.ClusterId,
		ClusterName: clusterInfo.Name,
		NoticeType:  "create",
		TaskName:    req.Name,
		Incident:    "任务创建中",
		CreatedTime: time.Now(),
	}
	if result := l.svcCtx.DbEngin.Table("t_notice").Create(&noticeInfo); result.Error != nil {
		l.Errorf("Task creation failure, err: %v", result.Error)
	}

	return &types.CommitHpcTaskResp{JobId: jobId}, nil
}
// submitJob submits the task described by hpcInfo to the cluster described by
// clusterInfo through the slurm client and returns the resulting job ID as a
// decimal string.
//
// The job is submitted with a single task (NTasks = 1). Its stdout and stderr
// are written to "<work dir>/%j.out" and "<work dir>/%j.err", and PATH and
// LD_LIBRARY_PATH are taken from the cluster's environment settings.
func submitJob(hpcInfo *models.TaskHpc, clusterInfo *types.ClusterInfo) (string, error) {
	slurmClient, err := slurm.NewClient(slurm.ClientOptions{
		URL:           clusterInfo.Server,
		ClientVersion: clusterInfo.Version,
		RestUserName:  clusterInfo.Username,
		Token:         clusterInfo.Token,
	})
	if err != nil {
		return "", err
	}

	jobApi, err := slurmClient.Job(slurm.JobOptions{})
	if err != nil {
		return "", err
	}

	// Assemble the submission request; the output files live in the working directory.
	workDir := hpcInfo.WorkDir
	request := slurm.JobOptions{
		Script: hpcInfo.CmdScript,
		Job: &slurm.JobProperties{
			Account:                 hpcInfo.Account,
			Name:                    hpcInfo.Name,
			NTasks:                  1,
			CurrentWorkingDirectory: workDir,
			Partition:               hpcInfo.Partition,
			Environment: map[string]string{
				"PATH":            clusterInfo.EnvPath,
				"LD_LIBRARY_PATH": clusterInfo.EnvLdPath,
			},
			StandardOutput: workDir + "/%j.out",
			StandardError:  workDir + "/%j.err",
		},
	}

	submitted, err := jobApi.SubmitJob(request)
	if err != nil {
		return "", err
	}
	return strconv.Itoa(submitted.JobId), nil
}