forked from JointCloud/pcm-coordinator
155 lines
4.2 KiB
Go
155 lines
4.2 KiB
Go
package hpc
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
|
"gitlink.org.cn/JointCloud/pcm-hpc/slurm"
|
|
"strconv"
|
|
"time"
|
|
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
|
|
|
"github.com/zeromicro/go-zero/core/logx"
|
|
)
|
|
|
|
type CommitHpcTaskLogic struct {
|
|
logx.Logger
|
|
ctx context.Context
|
|
svcCtx *svc.ServiceContext
|
|
}
|
|
|
|
func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CommitHpcTaskLogic {
|
|
return &CommitHpcTaskLogic{
|
|
Logger: logx.WithContext(ctx),
|
|
ctx: ctx,
|
|
svcCtx: svcCtx,
|
|
}
|
|
}
|
|
|
|
func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) {
|
|
|
|
var clusterInfo types.ClusterInfo
|
|
l.svcCtx.DbEngin.Raw("SELECT * FROM `t_cluster` where id = ?", req.ClusterId).First(&clusterInfo)
|
|
|
|
if len(clusterInfo.Id) == 0 {
|
|
return resp, errors.New("cluster not found")
|
|
}
|
|
|
|
// 构建主任务结构体
|
|
taskModel := models.Task{
|
|
Name: req.Name,
|
|
Description: req.Description,
|
|
CommitTime: time.Now(),
|
|
Status: "Running",
|
|
AdapterTypeDict: "2",
|
|
}
|
|
|
|
// 保存任务数据到数据库
|
|
tx := l.svcCtx.DbEngin.Create(&taskModel)
|
|
if tx.Error != nil {
|
|
return nil, tx.Error
|
|
}
|
|
|
|
var adapterName string
|
|
l.svcCtx.DbEngin.Raw("SELECT name FROM `t_adapter` where id = ?", clusterInfo.AdapterId).Scan(&adapterName)
|
|
if len(adapterName) == 0 || adapterName == "" {
|
|
return nil, errors.New("no corresponding adapter found")
|
|
}
|
|
clusterId, err := strconv.ParseInt(req.ClusterId, 10, 64)
|
|
hpcInfo := models.TaskHpc{
|
|
TaskId: taskModel.Id,
|
|
AdapterId: clusterInfo.AdapterId,
|
|
AdapterName: adapterName,
|
|
ClusterId: clusterId,
|
|
ClusterName: clusterInfo.Name,
|
|
Name: taskModel.Name,
|
|
CmdScript: req.CmdScript,
|
|
StartTime: time.Now().String(),
|
|
CardCount: req.CardCount,
|
|
WorkDir: req.WorkDir,
|
|
WallTime: req.WallTime,
|
|
AppType: req.AppType,
|
|
AppName: req.AppName,
|
|
Queue: req.Queue,
|
|
SubmitType: req.SubmitType,
|
|
NNode: req.NNode,
|
|
Account: clusterInfo.Username,
|
|
StdInput: req.StdInput,
|
|
Partition: req.Partition,
|
|
CreatedTime: time.Now(),
|
|
UpdatedTime: time.Now(),
|
|
Status: "Running",
|
|
}
|
|
hpcInfo.WorkDir = clusterInfo.WorkDir + req.WorkDir
|
|
tx = l.svcCtx.DbEngin.Create(&hpcInfo)
|
|
if tx.Error != nil {
|
|
return nil, tx.Error
|
|
}
|
|
// 提交job到指定集群
|
|
jobId, err := submitJob(&hpcInfo, &clusterInfo)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// 保存操作记录
|
|
noticeInfo := clientCore.NoticeInfo{
|
|
AdapterId: clusterInfo.AdapterId,
|
|
AdapterName: adapterName,
|
|
ClusterId: clusterId,
|
|
ClusterName: clusterInfo.Name,
|
|
NoticeType: "create",
|
|
TaskName: req.Name,
|
|
Incident: "任务创建中",
|
|
CreatedTime: time.Now(),
|
|
}
|
|
result := l.svcCtx.DbEngin.Table("t_notice").Create(¬iceInfo)
|
|
if result.Error != nil {
|
|
logx.Errorf("Task creation failure, err: %v", result.Error)
|
|
}
|
|
resp = &types.CommitHpcTaskResp{
|
|
JobId: string(jobId),
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func submitJob(hpcInfo *models.TaskHpc, clusterInfo *types.ClusterInfo) (string, error) {
|
|
|
|
client, err := slurm.NewClient(slurm.ClientOptions{
|
|
URL: clusterInfo.Server,
|
|
ClientVersion: clusterInfo.Version,
|
|
RestUserName: clusterInfo.Username,
|
|
Token: clusterInfo.Token})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
job, err := client.Job(slurm.JobOptions{})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
// 封装请求参数
|
|
submitReq := slurm.JobOptions{
|
|
Script: hpcInfo.CmdScript,
|
|
Job: &slurm.JobProperties{
|
|
Account: hpcInfo.Account,
|
|
Name: hpcInfo.Name,
|
|
NTasks: 1,
|
|
CurrentWorkingDirectory: hpcInfo.WorkDir,
|
|
Partition: hpcInfo.Partition,
|
|
Environment: map[string]string{"PATH": clusterInfo.EnvPath,
|
|
"LD_LIBRARY_PATH": clusterInfo.EnvLdPath},
|
|
},
|
|
}
|
|
submitReq.Job.StandardOutput = submitReq.Job.CurrentWorkingDirectory + "/%j.out"
|
|
submitReq.Job.StandardError = submitReq.Job.CurrentWorkingDirectory + "/%j.err"
|
|
|
|
jobResp, err := job.SubmitJob(submitReq)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
jobId := strconv.Itoa(jobResp.JobId)
|
|
return jobId, nil
|
|
}
|