180 lines
4.8 KiB
Go
180 lines
4.8 KiB
Go
package state
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||
|
||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
|
||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
|
||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
|
||
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
|
||
)
|
||
|
||
type NormalJobExecuting struct {
|
||
lastStatus pcmsdk.TaskStatus
|
||
}
|
||
|
||
func NewNormalJobExecuting() *NormalJobExecuting {
|
||
return &NormalJobExecuting{
|
||
lastStatus: "Begin",
|
||
}
|
||
}
|
||
|
||
func (s *NormalJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||
err := s.do(rtx, jo)
|
||
if err != nil {
|
||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||
} else {
|
||
rtx.Mgr.ChangeState(jo, SuccessComplete())
|
||
}
|
||
}
|
||
|
||
func (s *NormalJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
|
||
return &jobmod.NormalJobExecutingDump{
|
||
TaskStatus: s.lastStatus,
|
||
}
|
||
}
|
||
|
||
func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||
//norJob := jo.Body.(*job.NormalJob)
|
||
|
||
var runtime *schsdk.JobRuntimeInfo
|
||
var jobFiles *jobmod.JobFiles
|
||
var targetCCID schsdk.CCID
|
||
|
||
switch runningJob := jo.Body.(type) {
|
||
case *job.NormalJob:
|
||
runtime = &runningJob.Info.Runtime
|
||
jobFiles = &runningJob.Files
|
||
targetCCID = runningJob.TargetCCID
|
||
case *job.InstanceJob:
|
||
runtime = &runningJob.Info.Runtime
|
||
jobFiles = &runningJob.Files
|
||
targetCCID = runningJob.TargetCCID
|
||
}
|
||
|
||
log := logger.WithType[NormalJobExecuting]("State").WithField("JobID", jo.JobID)
|
||
|
||
ctx, cancel := context.WithCancel(context.Background())
|
||
defer cancel()
|
||
|
||
pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), jobFiles.Image.ImageID, targetCCID)
|
||
if err != nil {
|
||
return fmt.Errorf("getting pcm image info: %w", err)
|
||
}
|
||
|
||
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), targetCCID)
|
||
if err != nil {
|
||
return fmt.Errorf("getting computing center info: %w", err)
|
||
}
|
||
|
||
// TODO 需要添加DATA_IN、DATA_OUT等环境变量,这些数据从Job的信息中来获取
|
||
ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), targetCCID)
|
||
if err != nil {
|
||
return fmt.Errorf("getting computing center resource: %w", err)
|
||
}
|
||
if len(ress) == 0 {
|
||
return fmt.Errorf("no resource found at computing center %v", targetCCID)
|
||
}
|
||
|
||
wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask(
|
||
ccInfo.PCMParticipantID,
|
||
pcmImgInfo.PCMImageID,
|
||
// TODO 选择资源的算法
|
||
ress[0].PCMResourceID,
|
||
runtime.Command,
|
||
runtime.Envs,
|
||
))
|
||
defer wt.Close()
|
||
|
||
for {
|
||
status, err := wt.Receive(ctx)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
tskStatus := status.(*exetsk.SubmitTaskStatus)
|
||
if tskStatus.Error != "" {
|
||
return fmt.Errorf("submitting task: %s", tskStatus.Error)
|
||
}
|
||
|
||
if tskStatus.Status != s.lastStatus {
|
||
log.Infof("task %s -> %s", s.lastStatus, tskStatus.Status)
|
||
}
|
||
s.lastStatus = tskStatus.Status
|
||
|
||
switch tskStatus.Status {
|
||
case pcmsdk.TaskStatusSuccess:
|
||
return nil
|
||
|
||
case pcmsdk.TaskStatusFailed:
|
||
return fmt.Errorf("task failed")
|
||
}
|
||
}
|
||
}
|
||
|
||
// DataReturnJobExecuting is the job state that executes a DataReturnJob.
// It has no fields of its own.
type DataReturnJobExecuting struct{}
|
||
|
||
func NewDataReturnJobExecuting() *DataReturnJobExecuting {
|
||
return &DataReturnJobExecuting{}
|
||
}
|
||
|
||
func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||
err := s.do(rtx, jo)
|
||
if err != nil {
|
||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||
} else {
|
||
rtx.Mgr.ChangeState(jo, SuccessComplete())
|
||
}
|
||
}
|
||
|
||
func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
|
||
return &jobmod.DataReturnExecutingDump{}
|
||
}
|
||
|
||
func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||
reJob := jo.Body.(*job.DataReturnJob)
|
||
|
||
ctx, cancel := context.WithCancel(context.Background())
|
||
defer cancel()
|
||
|
||
// 监听取消事件
|
||
go func() {
|
||
event.WaitType[*event.Cancel](ctx, rtx.EventSet)
|
||
cancel()
|
||
}()
|
||
|
||
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), reJob.TargetJobCCID)
|
||
if err != nil {
|
||
return fmt.Errorf("getting computing center info: %w", err)
|
||
}
|
||
|
||
logger.Infof("submited computer center name: %s, id: %s", ccInfo.Name, ccInfo.CCID)
|
||
|
||
wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage(
|
||
1, // TOOD 用户ID
|
||
ccInfo.CDSStorageID,
|
||
reJob.TargetJobOutputFullPath,
|
||
reJob.Info.BucketID,
|
||
utils.MakeResourcePackageName(jo.JobID),
|
||
))
|
||
defer wt.Close()
|
||
|
||
status, err := wt.Receive(ctx)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
tskStatus := status.(*exetsk.StorageCreatePackageStatus)
|
||
if tskStatus.Error != "" {
|
||
return fmt.Errorf("creating package: %s", tskStatus.Error)
|
||
}
|
||
|
||
reJob.DataReturnPackageID = tskStatus.PackageID
|
||
return nil
|
||
}
|