// pcm-coordinator/internal/logic/hpc/canceljoblogic.go
//
// 104 lines
// 3.3 KiB
// Go

package hpc
import (
"context"
"fmt"
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
)
// CancelJobLogic bundles the dependencies needed to cancel an HPC job for a
// single request. Instances are created by NewCancelJobLogic.
type CancelJobLogic struct {
logx.Logger // embedded logger, built from the request context in NewCancelJobLogic
ctx context.Context // context forwarded to the executor's CancelTask call
svcCtx *svc.ServiceContext // service context; CancelJob uses its DbEngin for queries and updates
hpcService *service.HpcService // HPC service whose HpcExecutorAdapterMap supplies the per-adapter executor
}
// TaskHPCResult receives one row of the task / task_hpc join query issued by
// CancelJob. Each field is bound to the selected column named in its gorm tag.
type TaskHPCResult struct {
ID uint `gorm:"column:id"` // maps to t.id
JobID string `gorm:"column:job_id"` // maps to hpc.job_id
AdapterId string `gorm:"column:adapter_id"` // maps to hpc.adapter_id
ClusterId string `gorm:"column:cluster_id"` // maps to hpc.cluster_id
}
// NewCancelJobLogic builds a CancelJobLogic bound to ctx and svcCtx.
//
// It creates an HPC service from the service configuration and the
// scheduler's HPC storages. If that creation fails, the error is logged and
// nil is returned, so callers must check the result for nil before using it.
func NewCancelJobLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CancelJobLogic {
	logger := logx.WithContext(ctx)

	cache := make(map[string]interface{}, 10)
	hpcService, err := service.NewHpcService(&svcCtx.Config, svcCtx.Scheduler.HpcStorages, cache)
	if err != nil {
		// The signature has no error return, so record the cause here
		// instead of dropping it silently.
		logger.Errorf("create hpc service for cancel job: %v", err)
		return nil
	}

	return &CancelJobLogic{
		Logger:     logger,
		ctx:        ctx,
		svcCtx:     svcCtx,
		hpcService: hpcService,
	}
}
// CancelJob cancels the HPC job bound to req.TaskId and marks it as canceled
// in the database.
//
// Steps:
//  1. Look up the task joined with its task_hpc row (adapter_type_dict = 2).
//  2. Load the adapter record referenced by that row.
//  3. Ask the executor registered for that adapter to cancel the job.
//  4. Set status to "Canceled" on both the task row and the task_hpc row.
//
// It returns an error when the logic or request is nil, a query fails, the
// job or adapter cannot be found, no executor is registered for the adapter,
// the cancel call fails, or a status update fails.
//
// NOTE(review): the two status updates are not wrapped in a transaction, so a
// failure of the second one leaves the task row already updated.
func (l *CancelJobLogic) CancelJob(req *types.CancelJobReq) error {
	// NewCancelJobLogic returns nil when the HPC service cannot be created;
	// fail with an error instead of a nil-pointer panic.
	if l == nil || l.hpcService == nil {
		return fmt.Errorf("cancel job logic is not initialized")
	}
	if req == nil {
		return fmt.Errorf("cancel job request is nil")
	}

	var hpcR TaskHPCResult
	err := l.svcCtx.DbEngin.Raw(
		"SELECT t.id, hpc.job_id ,hpc.adapter_id, hpc.cluster_id FROM task t "+
			"INNER JOIN task_hpc hpc ON t.id = hpc.task_id "+
			"WHERE adapter_type_dict = 2 AND t.id = ?",
		req.TaskId,
	).Scan(&hpcR).Error
	if err != nil {
		// Wrap the error value itself; formatting err.Error without calling
		// it would print a function address rather than the message.
		return fmt.Errorf("数据库查询失败: %w", err)
	}
	if hpcR.ID == 0 || hpcR.JobID == "" {
		return fmt.Errorf("作业不存在")
	}

	var adapterInfo types.AdapterInfo
	err = l.svcCtx.DbEngin.Raw("SELECT * FROM `t_adapter` where id = ?", hpcR.AdapterId).Scan(&adapterInfo).Error
	if err != nil {
		// Report a query failure as such rather than as a missing adapter.
		return fmt.Errorf("数据库查询失败: %w", err)
	}
	if adapterInfo.Id == "" {
		return fmt.Errorf("adapter not found")
	}

	// Resolve the executor explicitly: calling a method on a missing map
	// entry would panic.
	exec, ok := l.hpcService.HpcExecutorAdapterMap[adapterInfo.Id]
	if !ok {
		return fmt.Errorf("no executor registered for adapter %s", adapterInfo.Id)
	}

	// Cancel the job on the cluster.
	if err := exec.CancelTask(l.ctx, hpcR.JobID, hpcR.ClusterId); err != nil {
		return err
	}

	// Update the status of the task row.
	err = l.svcCtx.DbEngin.Model(&types.Task{}).Where("id = ?", hpcR.ID).Update("status", "Canceled").Error
	if err != nil {
		return fmt.Errorf("数据库更新失败: %w", err)
	}

	// Update the status of the task_hpc row.
	err = l.svcCtx.DbEngin.Model(&models.TaskHpc{}).Where("task_id = ?", hpcR.ID).Update("status", "Canceled").Error
	if err != nil {
		return fmt.Errorf("数据库更新失败: %w", err)
	}

	return nil
}