JCC-CSScheduler/manager/internal/jobmgr/job/state/adjusting.go

240 lines
7.1 KiB
Go

package state
import (
"context"
"errors"
"fmt"
"sync"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// Adjusting is the job state that applies a schedule scheme to a job: it looks
// up the target computing center named by the scheme and then carries out the
// per-file actions (dataset, code, image) the scheme prescribes.
type Adjusting struct {
// scheme is the schedule scheme handed to NewAdjusting.
scheme jobmod.JobScheduleScheme
// targetCCInfo is the computing center record for scheme.TargetCCID; it is
// filled in by do before any file scheduling starts.
targetCCInfo schmod.ComputingCenter
}
// NewAdjusting creates an Adjusting state that will apply the given schedule
// scheme. The target computing center info is left at its zero value until the
// state is run.
func NewAdjusting(scheme jobmod.JobScheduleScheme) *Adjusting {
	adj := new(Adjusting)
	adj.scheme = scheme
	return adj
}
// Run performs the adjustment and then moves the job to its next state:
// FailureComplete(err) if the adjustment returned an error, otherwise the
// state produced by NewNormalJobReadyToExecute.
func (s *Adjusting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if err := s.do(rtx, jo); err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
		return
	}

	rtx.Mgr.ChangeState(jo, NewNormalJobReadyToExecute())
}
// Dump returns a jobmod.AdjustingDump that carries the schedule scheme held by
// this state. Neither rtx nor jo is consulted.
func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	dump := &jobmod.AdjustingDump{Scheme: s.scheme}
	return dump
}
// do applies the schedule scheme to the job.
//
// It loads the target computing center named by s.scheme.TargetCCID, records
// the target (and, for NormalJob and InstanceJob, the output path) on the job
// body, and then runs the dataset, code and image scheduling concurrently.
// The first scheduling step that fails cancels the shared context; a Cancel
// event received on rtx.EventSet cancels it as well.
//
// The returned error joins the errors of all three scheduling steps, and is
// nil when every step succeeded. An unsupported job body type is reported as
// an error before any scheduling starts.
func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	// TODO UserID: the user is hard-coded for now.
	userID := cdssdk.UserID(1)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for the cancel event. The deferred cancel above also unblocks
	// this goroutine once do returns, because WaitType is given ctx.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}
	s.targetCCInfo = ccInfo

	logger.WithField("JobID", jo.JobID).Infof("job is scheduled to %v(%v)", ccInfo.Name, ccInfo.CCID)

	// The final target computing center is now known, so the result output
	// path can be generated.
	outputPath := utils.MakeJobOutputPath(userID, jo.JobID)

	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles

	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
		runningJob.OutputPath = outputPath
	case *job.MultiInstanceJob:
		// Only the target is recorded here; no output path is assigned to a
		// multi-instance job.
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
		runningJob.OutputPath = outputPath
	default:
		// Without this guard jobFiles stays nil and the goroutines below would
		// dereference it, panicking outside of any recoverable call path.
		return fmt.Errorf("unsupported job body type %T", jo.Body)
	}

	wg := sync.WaitGroup{}
	wg.Add(3)

	// Each goroutine writes only its own error variable; they are read after
	// wg.Wait, so no further synchronization is needed.
	var e1, e2, e3 error
	go func() {
		defer wg.Done()
		e1 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset)
		if e1 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e2 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code)
		if e2 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e3 = s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image)
		if e3 != nil {
			cancel()
		}
	}()
	wg.Wait()

	return errors.Join(e1, e2, e3)
}
// doPackageScheduling carries out the scheduled action for one package file
// (the caller uses it for both the dataset and the code package).
//
// For jobmod.ActionMove it starts a cache-move-package task towards the target
// computing center's storage, and for jobmod.ActionLoad a storage-load-package
// task; in both cases it then waits for the task's status. Any other action is
// a no-op that returns nil.
//
// An error is returned when the task cannot be started, when ctx is cancelled
// while waiting (the wait is abandoned; nothing here stops the already started
// task), when the received status is not of the expected type, or when the
// status carries a non-empty Error string. fileInfo is currently unused.
func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch scheme.Action {
	case jobmod.ActionMove:
		logger.Debugf("begin move pacakge %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)

		// TODO UserID: the user ID is hard-coded to 1.
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		fut := taskStatus.Receive()
		select {
		case <-ctx.Done():
			// A sibling scheduling step failed or the job was cancelled.
			return fmt.Errorf("moving package: %w", ctx.Err())
		case status := <-fut.Chan():
			// Checked assertion: a bare one would panic in the caller's goroutine.
			moveStatus, ok := status.Value.Status.(*exectsk.CacheMovePackageStatus)
			if !ok {
				return fmt.Errorf("moving package: unexpected task status type %T", status.Value.Status)
			}
			if moveStatus.Error != "" {
				return fmt.Errorf("moving package: %s", moveStatus.Error)
			}
		}
		return nil

	case jobmod.ActionLoad:
		logger.Debugf("begin load pacakge %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)

		// TODO UserID: the user ID is hard-coded to 1.
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("loading package: %w", err)
		}

		fut := taskStatus.Receive()
		select {
		case <-ctx.Done():
			// A sibling scheduling step failed or the job was cancelled.
			return fmt.Errorf("loading package: %w", ctx.Err())
		case status := <-fut.Chan():
			// Checked assertion: a bare one would panic in the caller's goroutine.
			loadStatus, ok := status.Value.Status.(*exectsk.StorageLoadPackageStatus)
			if !ok {
				return fmt.Errorf("loading package: unexpected task status type %T", status.Value.Status)
			}
			if loadStatus.Error != "" {
				return fmt.Errorf("loading package: %s", loadStatus.Error)
			}
			// file.PackagePath = loadStatus.PackagePath
			// TODO: it is still undecided which side should produce this path.
		}
		return nil
	}

	return nil
}
// doImageScheduling carries out the scheduled action for the job's image file.
//
// Only jobmod.ActionImportImage is recognised, and importing is not available
// yet: that action always returns the error "not implemented yet". Every other
// action returns nil without doing anything.
//
// Everything after the early return is an unreachable draft of the import
// flow (move the image's package to the target storage, check that the
// package holds exactly one object, then upload it). It is kept so the draft
// keeps compiling; ctx, targetCCID and fileInfo are referenced only by the
// commented-out part or not at all.
func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	if scheme.Action != jobmod.ActionImportImage {
		return nil
	}

	// TODO: the location of the image file needs to be redesigned.
	return fmt.Errorf("not implemented yet")

	// ---- unreachable draft below ----

	if file.PackageID == nil {
		return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
	}

	// TODO UserID
	moveTask, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
	if err != nil {
		return fmt.Errorf("moving package: %w", err)
	}

	moveFut := moveTask.Receive()
	moveResult := <-moveFut.Chan()

	moveStatus := moveResult.Value.Status.(*exectsk.CacheMovePackageStatus)
	if moveStatus.Error != "" {
		return fmt.Errorf("moving package: %s", moveStatus.Error)
	}

	cli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cloudream storage client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cli)

	// TODO UserID
	objs, err := cli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
	if err != nil {
		return fmt.Errorf("getting package objects: %w", err)
	}

	// The package to import must contain exactly one object.
	switch count := len(objs.Objects); {
	case count == 0:
		return fmt.Errorf("no object in the package which will be imported")
	case count > 1:
		return fmt.Errorf("there must be only 1 object in the package which will be imported")
	}

	// taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
	// if err != nil {
	// return fmt.Errorf("moving package: %w", err)
	// }
	// fut2 := taskStatus2.Receive()
	// status2 := <-fut2.Chan()
	// if err != nil {
	// return fmt.Errorf("uploading image: %w", err)
	// }
	// uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
	// if uploadStatus.Error != "" {
	// return fmt.Errorf("uploading image: %s", uploadStatus.Error)
	// }
	// // TODO image name
	// err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
	// if err != nil {
	// return fmt.Errorf("creating image info: %w", err)
	// }

	return nil
}