JCC-CSScheduler/manager/internal/jobmgr/job/state/prescheduling.go

291 lines
8.4 KiB
Go

package state
import (
"context"
"errors"
"fmt"
"sync"
"time"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job"
)
// PreScheduling is the job state that prepares a job's files (dataset, code
// and image) according to a schedule scheme before the job moves on.
type PreScheduling struct {
// scheme is the schedule scheme supplied to the constructor; it carries the
// target computing center ID and the per-file schedule schemes read by Run.
scheme jobmod.JobScheduleScheme
// targetCCInfo is the computing center record that Run loads from the
// database for scheme.TargetCCID; it is unset until Run has fetched it.
targetCCInfo schmod.ComputingCenter
}
// NewPreSchuduling builds a PreScheduling state that will apply the given
// schedule scheme. The target computing center info is left at its zero value.
//
// NOTE(review): the name is misspelled ("Schuduling"); it is kept unchanged
// because callers outside this file may reference it.
func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling {
	state := new(PreScheduling)
	state.scheme = scheme
	return state
}
// Run executes the pre-scheduling state for the given job.
//
// It writes the scheme's target computing center ID onto the job body, loads
// that computing center's record from the database, and then runs the
// dataset, code and image scheduling steps concurrently. When every step
// succeeds the manager is asked to switch the job to NewReadyToAdjust;
// otherwise it is switched to FailureComplete with the joined step errors.
//
// A Cancel event received on rtx.EventSet cancels the context shared by all
// steps, and so does the first step that fails.
func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	logger.Info("start run preScheduling, jobID: " + jo.JobID)

	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles

	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID

	case *job.MultiInstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID

	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID

	default:
		// Any other body type would leave jobFiles nil, and taking the address
		// of its fields below would panic inside a goroutine. Fail the job
		// explicitly instead.
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("unknown job type: %T", jo.Body)))
		return
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for a cancel event. The deferred cancel above also ends this wait
	// once Run returns.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
		return
	}
	s.targetCCInfo = ccInfo

	var wg sync.WaitGroup
	var datasetErr, codeErr, imageErr error

	// runStep starts one scheduling step in its own goroutine and stores its
	// result in *errOut. A failing step cancels the shared context so that the
	// other steps stop waiting for events. Each step writes only to its own
	// error variable, and those are read only after wg.Wait returns.
	runStep := func(name string, errOut *error, step func() error) {
		wg.Add(1)
		go func() {
			defer wg.Done()

			*errOut = step()
			if *errOut != nil {
				cancel()
				logger.Debugf("%s scheduling done, err: %v", name, *errOut)
				return
			}
			logger.Debugf("%s scheduling done", name)
		}()
	}

	runStep("dataset", &datasetErr, func() error {
		return s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset)
	})
	runStep("code", &codeErr, func() error {
		return s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code)
	})
	runStep("image", &imageErr, func() error {
		return s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image)
	})

	wg.Wait()

	if allErr := errors.Join(datasetErr, codeErr, imageErr); allErr != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(allErr))
		return
	}

	rtx.Mgr.ChangeState(jo, NewReadyToAdjust())
}
// Dump returns a snapshot of this state as a PreSchedulingDump holding the
// schedule scheme. Neither rtx nor jo is consulted.
func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	dump := new(jobmod.PreSchedulingDump)
	dump.Scheme = s.scheme
	return dump
}
// doPackageScheduling resolves the package behind one job file (dataset or
// code) and then applies the file's schedule scheme to it.
//
// Resolution depends on the concrete type of fileInfo:
//   - LocalJobFileInfo: waits for the LocalFileUploaded event with the same
//     local path and takes the package ID from it;
//   - PackageJobFileInfo: uses the package ID given in the info;
//   - DataReturnJobFileInfo: nothing to do, returns nil immediately;
//   - anything else (including a nil fileInfo): returns an error.
//
// Afterwards, scheme.Action selects the task started on the target computing
// center: ActionMove starts a cache-move task, ActionLoad starts a
// storage-load task, and any other action does nothing. The call blocks until
// the started task reports its status.
//
// It returns an error if the upload wait ends without a matching event, the
// upload itself failed, the task cannot be started, the task reports an error,
// or the task status has an unexpected type.
func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}
		file.PackageID = evt.PackageID

	case *schsdk.PackageJobFileInfo:
		file.PackageID = info.PackageID

	case *schsdk.DataReturnJobFileInfo:
		return nil

	default:
		return fmt.Errorf("unknown dataset type: %T", info)
	}

	// NOTE(review): the receives below do not observe ctx, so a cancelled job
	// still waits for the executor to answer — confirm whether that is intended.
	// NOTE(review): the literal 1 passed to the task constructors is presumably
	// a user ID placeholder — confirm.
	switch scheme.Action {
	case jobmod.ActionMove:
		logger.Debugf("begin move package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)

		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		status := <-taskStatus.Receive().Chan()

		// Checked assertion: an unexpected status type becomes an error
		// instead of a panic.
		moveStatus, ok := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		if !ok {
			return fmt.Errorf("moving package: unexpected task status type %T", status.Value.Status)
		}
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

	case jobmod.ActionLoad:
		logger.Debugf("begin load package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)

		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("loading package: %w", err)
		}

		status := <-taskStatus.Receive().Chan()

		loadStatus, ok := status.Value.Status.(*exectsk.StorageLoadPackageStatus)
		if !ok {
			return fmt.Errorf("loading package: unexpected task status type %T", status.Value.Status)
		}
		if loadStatus.Error != "" {
			return fmt.Errorf("loading package: %s", loadStatus.Error)
		}
	}

	return nil
}
// doImageScheduling resolves the image behind the job's image file and then
// applies the image's schedule scheme.
//
// Resolution depends on the concrete type of fileInfo:
//   - LocalJobFileInfo: waits for the LocalFileUploaded event with the same
//     local path, creates a new image record for the uploaded package and
//     stores the new image ID and the package ID in file;
//   - ImageJobFileInfo: loads the image record by ID and copies its image ID
//     and package ID into file;
//   - any other type leaves file untouched.
//
// If scheme.Action is ActionImportImage the function currently always returns
// a "not implemented" error; every other action returns nil.
func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		// The upload has finished, so a new, empty image record can be created.
		// TODO image name
		// Unix() returns an int64, so it must be formatted with %d; %s would
		// produce "UPLOAD@%!s(int64=...)". One timestamp is used for both the
		// name and the creation time so the two agree.
		now := time.Now()
		imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%d", now.Unix()), now)
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		// Fill in ImageID and PackageID.
		file.ImageID = imgID
		file.PackageID = &evt.PackageID

	case *schsdk.ImageJobFileInfo:
		imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
		if err != nil {
			return fmt.Errorf("getting image info: %w", err)
		}

		file.ImageID = imageInfo.ImageID
		file.PackageID = imageInfo.CDSPackageID
	}

	if scheme.Action == jobmod.ActionImportImage {
		// TODO the image import flow needs to be redesigned.
		return fmt.Errorf("not implemented")

		// Everything below in this branch is intentionally unreachable: it is
		// the previous import flow, kept as a reference for the redesign. It
		// must stay compilable because it is the only code in this file that
		// uses the schglb and cdsapi imports.
		if file.PackageID == nil {
			return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		}

		// TODO UserID
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		fut := taskStatus.Receive()
		status := <-fut.Chan()

		moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		// TODO UserID
		pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}

		if len(pkgObjs.Objects) == 0 {
			return fmt.Errorf("no object in the package which will be imported")
		}

		if len(pkgObjs.Objects) > 1 {
			return fmt.Errorf("there must be only 1 object in the package which will be imported")
		}

		taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("uploading image: %w", err)
		}

		fut2 := taskStatus2.Receive()
		status2 := <-fut2.Chan()

		uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
		if uploadStatus.Error != "" {
			return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		}

		// TODO image name
		err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		return nil
	}

	return nil
}