JCC-CSScheduler/manager/internal/jobmgr/job/state/executing.go

180 lines
4.8 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package state
import (
"context"
"fmt"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"gitlink.org.cn/cloudream/common/pkgs/logger"
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// NormalJobExecuting is the job state in which a job's task has been submitted
// for execution and its status is being tracked.
type NormalJobExecuting struct {
// lastStatus is the most recent task status received while executing; it is
// what Dump reports.
lastStatus pcmsdk.TaskStatus
}
// NewNormalJobExecuting returns a NormalJobExecuting state whose last known
// task status is the placeholder "Begin".
func NewNormalJobExecuting() *NormalJobExecuting {
	st := new(NormalJobExecuting)
	st.lastStatus = "Begin"
	return st
}
// Run executes the job and then moves it to its completion state:
// FailureComplete carrying the error if execution failed, SuccessComplete
// otherwise.
func (s *NormalJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if runErr := s.do(rtx, jo); runErr != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(runErr))
		return
	}

	rtx.Mgr.ChangeState(jo, SuccessComplete())
}
// Dump returns a snapshot of this state containing the last task status that
// was recorded.
func (s *NormalJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	dump := new(jobmod.NormalJobExecutingDump)
	dump.TaskStatus = s.lastStatus
	return dump
}
// do submits the job's task to the target computing center and blocks until
// the task reaches a terminal status.
//
// It reads the runtime info, job files and target computing center ID from
// jo.Body (a *job.NormalJob or *job.InstanceJob), looks up the PCM image, the
// computing center and its resources in the database, starts a submit task
// through the executor manager, and then keeps receiving status updates,
// recording the latest one in s.lastStatus.
//
// It returns nil once the task reports pcmsdk.TaskStatusSuccess. It returns an
// error if the job body has an unsupported type, a database lookup fails, the
// computing center has no resources, receiving a status fails, the status has
// an unexpected type, or the task reports an error or pcmsdk.TaskStatusFailed.
func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	var runtime *schsdk.JobRuntimeInfo
	var jobFiles *jobmod.JobFiles
	var targetCCID schsdk.CCID

	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		runtime = &runningJob.Info.Runtime
		jobFiles = &runningJob.Files
		targetCCID = runningJob.TargetCCID
	case *job.InstanceJob:
		runtime = &runningJob.Info.Runtime
		jobFiles = &runningJob.Files
		targetCCID = runningJob.TargetCCID
	default:
		// Without this branch runtime and jobFiles stay nil and the code
		// below panics on the first dereference.
		return fmt.Errorf("unsupported job body type %T", jo.Body)
	}

	log := logger.WithType[NormalJobExecuting]("State").WithField("JobID", jo.JobID)

	// NOTE(review): nothing cancels ctx before this function returns, so a
	// Cancel event in rtx.EventSet does not interrupt the wait below, unlike
	// DataReturnJobExecuting.do — confirm whether that is intended.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), jobFiles.Image.ImageID, targetCCID)
	if err != nil {
		return fmt.Errorf("getting pcm image info: %w", err)
	}

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), targetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	// TODO: add environment variables such as DATA_IN and DATA_OUT; their
	// values should be taken from the job's information.
	ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), targetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center resource: %w", err)
	}
	if len(ress) == 0 {
		return fmt.Errorf("no resource found at computing center %v", targetCCID)
	}

	wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask(
		ccInfo.PCMParticipantID,
		pcmImgInfo.PCMImageID,
		// TODO: implement a resource selection algorithm; for now the first
		// resource is used.
		ress[0].PCMResourceID,
		runtime.Command,
		runtime.Envs,
	))
	defer wt.Close()

	for {
		status, err := wt.Receive(ctx)
		if err != nil {
			return err
		}

		// Checked assertion: report an unexpected status type as an error
		// instead of panicking.
		tskStatus, ok := status.(*exetsk.SubmitTaskStatus)
		if !ok {
			return fmt.Errorf("unexpected submit task status type %T", status)
		}
		if tskStatus.Error != "" {
			return fmt.Errorf("submitting task: %s", tskStatus.Error)
		}

		// Only log when the status actually changes.
		if tskStatus.Status != s.lastStatus {
			log.Infof("task %s -> %s", s.lastStatus, tskStatus.Status)
		}
		s.lastStatus = tskStatus.Status

		switch tskStatus.Status {
		case pcmsdk.TaskStatusSuccess:
			return nil
		case pcmsdk.TaskStatusFailed:
			return fmt.Errorf("task failed")
		}
	}
}
// DataReturnJobExecuting is the job state in which a data-return job is being
// executed. It carries no fields.
type DataReturnJobExecuting struct {
}
// NewDataReturnJobExecuting returns a new, empty DataReturnJobExecuting state.
func NewDataReturnJobExecuting() *DataReturnJobExecuting {
	return new(DataReturnJobExecuting)
}
// Run performs the data-return job and switches the job to FailureComplete
// (with the error) when it fails, or to SuccessComplete when it succeeds.
func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	failure := s.do(rtx, jo)
	if failure == nil {
		rtx.Mgr.ChangeState(jo, SuccessComplete())
		return
	}

	rtx.Mgr.ChangeState(jo, FailureComplete(failure))
}
// Dump returns an empty DataReturnExecutingDump; this state has nothing to
// record.
func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	dump := new(jobmod.DataReturnExecutingDump)
	return dump
}
// do starts a storage "create package" task for the target job's output and
// waits for its single status reply.
//
// jo.Body must be a *job.DataReturnJob. The computing center identified by
// reJob.TargetJobCCID is looked up in the database, a create-package task is
// started through the executor manager, and on success the resulting package
// ID is stored in reJob.DataReturnPackageID.
//
// A Cancel event in rtx.EventSet cancels the context passed to wt.Receive.
//
// It returns an error if the job body or the received status has an unexpected
// type, the computing center lookup fails, receiving the status fails, or the
// task reports an error.
func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	// Checked assertion: report a wrong body type as an error instead of
	// panicking.
	reJob, ok := jo.Body.(*job.DataReturnJob)
	if !ok {
		return fmt.Errorf("unsupported job body type %T", jo.Body)
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for the cancel event. The deferred cancel above also ends this
	// goroutine's wait when the function returns.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), reJob.TargetJobCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	// %v for the ID prints correctly whatever CCID's underlying type is.
	logger.Infof("submited computer center name: %s, id: %v", ccInfo.Name, ccInfo.CCID)

	wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage(
		1, // TODO: user ID
		ccInfo.CDSStorageID,
		reJob.TargetJobOutputFullPath,
		reJob.Info.BucketID,
		utils.MakeResourcePackageName(jo.JobID),
	))
	defer wt.Close()

	status, err := wt.Receive(ctx)
	if err != nil {
		return err
	}

	tskStatus, ok := status.(*exetsk.StorageCreatePackageStatus)
	if !ok {
		return fmt.Errorf("unexpected create package status type %T", status)
	}
	if tskStatus.Error != "" {
		return fmt.Errorf("creating package: %s", tskStatus.Error)
	}

	reJob.DataReturnPackageID = tskStatus.PackageID
	return nil
}