From cd64f3da5b0f637cf612ea565b6be6cef2b70f61 Mon Sep 17 00:00:00 2001 From: JeshuaRen <270813223@qq.com> Date: Fri, 26 Apr 2024 11:35:43 +0800 Subject: [PATCH 01/10] =?UTF-8?q?=E5=B0=86resource=E9=87=8D=E5=91=BD?= =?UTF-8?q?=E5=90=8D=E4=B8=BAdatareture?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- manager/internal/jobmgr/job/state/adjusting.go | 2 +- manager/internal/jobmgr/job/state/prescheduling.go | 2 +- manager/internal/jobmgr/job/state/ready_to_adjust.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/manager/internal/jobmgr/job/state/adjusting.go b/manager/internal/jobmgr/job/state/adjusting.go index 7095cb5..0269baf 100644 --- a/manager/internal/jobmgr/job/state/adjusting.go +++ b/manager/internal/jobmgr/job/state/adjusting.go @@ -129,7 +129,7 @@ func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobState case *schsdk.PackageJobFileInfo: file.PackageID = info.PackageID - case *schsdk.ResourceJobFileInfo: + case *schsdk.DataReturnJobFileInfo: return nil default: diff --git a/manager/internal/jobmgr/job/state/prescheduling.go b/manager/internal/jobmgr/job/state/prescheduling.go index b20664c..3617e79 100644 --- a/manager/internal/jobmgr/job/state/prescheduling.go +++ b/manager/internal/jobmgr/job/state/prescheduling.go @@ -109,7 +109,7 @@ func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobS case *schsdk.PackageJobFileInfo: file.PackageID = info.PackageID - case *schsdk.ResourceJobFileInfo: + case *schsdk.DataReturnJobFileInfo: return nil default: diff --git a/manager/internal/jobmgr/job/state/ready_to_adjust.go b/manager/internal/jobmgr/job/state/ready_to_adjust.go index d649653..a14fbc8 100644 --- a/manager/internal/jobmgr/job/state/ready_to_adjust.go +++ b/manager/internal/jobmgr/job/state/ready_to_adjust.go @@ -39,7 +39,7 @@ func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error cancel() }() - if rt, ok := norJob.Info.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok { + if rt, ok := norJob.Info.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok { evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool { return val.Job.GetInfo().GetLocalJobID() == rt.ResourceLocalJobID }) From a4c43731ac80a1c8eb1e981e831d88604198a37c Mon Sep 17 00:00:00 2001 From: JeshuaRen <270813223@qq.com> Date: Tue, 30 Apr 2024 16:31:51 +0800 Subject: [PATCH 02/10] =?UTF-8?q?=E8=B0=83=E5=BA=A6=E7=B3=BB=E7=BB=9F?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=A4=9A=E5=AE=9E=E4=BE=8B=E4=BB=BB=E5=8A=A1?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- client/internal/http/job.go | 53 ++ client/internal/http/server.go | 1 + .../prescheduler/default_prescheduler.go | 721 ------------------ .../prescheduler/default_prescheduler_test.go | 117 --- client/internal/prescheduler/prescheduler.go | 10 - client/internal/services/job.go | 27 + client/internal/services/jobset.go | 2 +- client/internal/services/service.go | 4 +- client/main.go | 2 +- common/pkgs/mq/manager/job.go | 34 + .../internal/jobmgr/event/instance_create.go | 25 + manager/internal/jobmgr/event/utils.go | 20 +- manager/internal/jobmgr/event_set.go | 22 +- manager/internal/jobmgr/job/instance_job.go | 30 + .../internal/jobmgr/job/multiInstance_job.go | 30 + .../jobmgr/job/state/ready_to_adjust.go | 2 +- manager/internal/jobmgr/job/state/running.go | 79 ++ manager/internal/jobmgr/jobmgr.go | 55 +- manager/internal/mq/job.go | 31 + 19 files changed, 402 insertions(+), 863 deletions(-) create mode 100644 client/internal/http/job.go delete mode 100644 client/internal/prescheduler/default_prescheduler.go delete mode 100644 client/internal/prescheduler/default_prescheduler_test.go delete mode 100644 client/internal/prescheduler/prescheduler.go create mode 100644 client/internal/services/job.go create mode 100644 manager/internal/jobmgr/event/instance_create.go create mode 100644 manager/internal/jobmgr/job/instance_job.go create mode 100644 manager/internal/jobmgr/job/multiInstance_job.go create mode 100644 manager/internal/jobmgr/job/state/running.go diff --git a/client/internal/http/job.go b/client/internal/http/job.go new file mode 100644 index 0000000..8ea8185 --- /dev/null +++ b/client/internal/http/job.go @@ -0,0 +1,53 @@ +package http + +import ( + "github.com/gin-gonic/gin" + "gitlink.org.cn/cloudream/common/consts/errorcode" + "gitlink.org.cn/cloudream/common/pkgs/logger" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + "net/http" +) + +type JobService struct { + *Server +} + +type CreateInstanceResp struct { + InstanceID schsdk.JobID `json:"instanceID"` + UploadScheme schsdk.JobFilesUploadScheme `json:"uploadScheme"` +} + +type CreateInstanceReq struct { + LocalJobID string `json:"localJobID" binding:"required"` + LocalPath schsdk.JobFileInfo `json:"filePath" binding:"required"` +} + +func (s *Server) JobSvc() *JobService { + return &JobService{ + Server: s, + } +} + +func (s *JobService) CreateInstance(ctx *gin.Context) { + log := logger.WithField("HTTP", "JobSet.HTTP") + + var req CreateInstanceReq + if err := ctx.ShouldBindQuery(&req); err != nil { + log.Warnf("binding body: %s", err.Error()) + ctx.JSON(http.StatusBadRequest, Failed(errorcode.BadArgument, "missing argument or invalid argument")) + return + } + + jobID, filesUploadScheme, err := s.svc.JobSetSvc().CreateInstance(req.LocalJobID, req.LocalPath) + if err != nil { + log.Warnf("create job instance: %s", err.Error()) + ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "create job instance failed")) + return + } + + ctx.JSON(http.StatusOK, OK(CreateInstanceResp{ + InstanceID: jobID, + UploadScheme: filesUploadScheme, + })) + +} diff --git a/client/internal/http/server.go b/client/internal/http/server.go index 8e2d284..2b2eb38 100644 --- a/client/internal/http/server.go +++ b/client/internal/http/server.go @@ -39,6 +39,7 @@ func (s *Server) Serve() error { func (s *Server) initRouters() { s.engine.POST("/jobSet/submit", s.JobSetSvc().Submit) + s.engine.POST("/jobSet/submit", s.JobSvc().CreateInstance) s.engine.POST("/jobSet/localFileUploaded", s.JobSetSvc().LocalFileUploaded) s.engine.GET("/jobSet/getServiceList", s.JobSetSvc().GetServiceList) } diff --git a/client/internal/prescheduler/default_prescheduler.go b/client/internal/prescheduler/default_prescheduler.go deleted file mode 100644 index 65f0b76..0000000 --- a/client/internal/prescheduler/default_prescheduler.go +++ /dev/null @@ -1,721 +0,0 @@ -package prescheduler - -import ( - "fmt" - "sort" - - "github.com/inhies/go-bytesize" - "github.com/samber/lo" - - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" - uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops" - "gitlink.org.cn/cloudream/common/utils/math2" - - schglb "gitlink.org.cn/cloudream/scheduler/common/globals" - schmod "gitlink.org.cn/cloudream/scheduler/common/models" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" - "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector" - mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" -) - -const ( - //每个节点划分的资源等级: - // ResourceLevel1:表示所有资源类型均满足 大于等于1.5倍 - ResourceLevel1 = 1 - // ResourceLevel2:表示不满足Level1,但所有资源类型均满足 大于等于1倍 - ResourceLevel2 = 2 - // ResourceLevel3: 表示某些资源类型 小于一倍 - ResourceLevel3 = 3 - - CpuResourceWeight float64 = 1 - StgResourceWeight float64 = 1.2 - - CachingWeight float64 = 1 - LoadedWeight float64 = 2 -) - -var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait") - -type candidate struct { - CC schmod.ComputingCenter - IsReferencedJobTarget bool // 这个节点是否是所依赖的任务所选择的节点 - Resource resourcesDetail - Files filesDetail -} - -type resourcesDetail struct { - CPU resourceDetail - GPU resourceDetail - NPU resourceDetail - MLU resourceDetail - Storage resourceDetail - Memory resourceDetail - - TotalScore float64 - AvgScore float64 - MaxLevel int -} -type resourceDetail struct { - Level int - Score float64 -} - -type filesDetail struct { - Dataset fileDetail - Code fileDetail - Image fileDetail - - TotalScore float64 -} -type fileDetail struct { - CachingScore float64 - LoadingScore float64 - IsLoaded bool //表示storage是否已经调度到该节点, image表示镜像是否已经加载到该算力中心 -} - -type schedulingJob struct { - Job schsdk.JobInfo - Afters []string -} - -type CandidateArr []*candidate - -func (a CandidateArr) Len() int { return len(a) } -func (a CandidateArr) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a CandidateArr) Less(i, j int) bool { - n1 := a[i] - n2 := a[j] - - // 优先与所依赖的任务放到一起,但要求那个节点的资源足够 - if n1.IsReferencedJobTarget && n1.Resource.MaxLevel < ResourceLevel3 { - return true - } - if n2.IsReferencedJobTarget && n2.Resource.MaxLevel < ResourceLevel3 { - return true - } - - // 优先判断资源等级,资源等级越低,代表越满足需求 - if n1.Resource.MaxLevel < n2.Resource.MaxLevel { - return true - } - if n1.Resource.MaxLevel > n2.Resource.MaxLevel { - return false - } - - // 等级相同时,根据单项分值比较 - switch n1.Resource.MaxLevel { - case ResourceLevel1: - // 数据文件总分越高,代表此节点上拥有的数据文件越完整,则越优先考虑 - return n1.Files.TotalScore > n2.Files.TotalScore - - case ResourceLevel2: - // 资源分的平均值越高,代表资源越空余,则越优先考虑 - return n1.Resource.AvgScore > n2.Resource.AvgScore - - case ResourceLevel3: - // 资源分的平均值越高,代表资源越空余,则越优先考虑 - return n1.Resource.AvgScore > n2.Resource.AvgScore - } - - return false -} - -type DefaultPreScheduler struct { -} - -func NewDefaultPreScheduler() *DefaultPreScheduler { - return &DefaultPreScheduler{} -} - -func (s *DefaultPreScheduler) Schedule(info *schsdk.JobSetInfo) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) { - jobSetScheme := &jobmod.JobSetPreScheduleScheme{ - JobSchemes: make(map[string]jobmod.JobScheduleScheme), - } - filesUploadSchemes := make(map[string]schsdk.LocalFileUploadScheme) - - mgrCli, err := schglb.ManagerMQPool.Acquire() - if err != nil { - return nil, nil, fmt.Errorf("new collector client: %w", err) - } - defer schglb.ManagerMQPool.Release(mgrCli) - - // 查询有哪些算力中心可用 - - allCC, err := mgrCli.GetAllComputingCenter(mgrmq.NewGetAllComputingCenter()) - if err != nil { - return nil, nil, fmt.Errorf("getting all computing center info: %w", err) - } - - ccs := make(map[schsdk.CCID]schmod.ComputingCenter) - for _, node := range allCC.ComputingCenters { - ccs[node.CCID] = node - } - - if len(ccs) == 0 { - return nil, nil, ErrNoAvailableScheme - } - - // 先根据任务配置,收集它们依赖的任务的LocalID - var schJobs []*schedulingJob - for _, job := range info.Jobs { - j := &schedulingJob{ - Job: job, - } - - if norJob, ok := job.(*schsdk.NormalJobInfo); ok { - if resFile, ok := norJob.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok { - j.Afters = append(j.Afters, resFile.ResourceLocalJobID) - } - - if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok { - j.Afters = append(j.Afters, resFile.ResourceLocalJobID) - } - } else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok { - j.Afters = append(j.Afters, resJob.TargetLocalJobID) - } - - schJobs = append(schJobs, j) - } - - // 然后根据依赖进行排序 - schJobs, ok := s.orderByAfters(schJobs) - if !ok { - return nil, nil, fmt.Errorf("circular reference detected between jobs in the job set") - } - - // 经过排序后,按顺序生成调度方案 - for _, job := range schJobs { - if norJob, ok := job.Job.(*schsdk.NormalJobInfo); ok { - scheme, err := s.scheduleForNormalJob(info, job, ccs, jobSetScheme.JobSchemes) - if err != nil { - return nil, nil, err - } - - jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme - - // 检查数据文件的配置项,生成上传文件方案 - s.fillNormarlJobLocalUploadScheme(norJob, scheme.TargetCCID, filesUploadSchemes, ccs) - } - - // 回源任务目前不需要生成调度方案 - } - - return jobSetScheme, &schsdk.JobSetFilesUploadScheme{ - LocalFileSchemes: lo.Values(filesUploadSchemes), - }, nil -} - -func (s *DefaultPreScheduler) orderByAfters(jobs []*schedulingJob) ([]*schedulingJob, bool) { - type jobOrder struct { - Job *schedulingJob - Afters []string - } - - var jobOrders []*jobOrder - for _, job := range jobs { - od := &jobOrder{ - Job: job, - Afters: make([]string, len(job.Afters)), - } - - copy(od.Afters, job.Afters) - - jobOrders = append(jobOrders, od) - } - - // 然后排序 - var orderedJob []*schedulingJob - for { - rm := 0 - for i, jo := range jobOrders { - // 找到没有依赖的任务,然后将其取出 - if len(jo.Afters) == 0 { - orderedJob = append(orderedJob, jo.Job) - - // 删除其他任务对它的引用 - for _, job2 := range jobOrders { - job2.Afters = lo.Reject(job2.Afters, func(item string, idx int) bool { return item == jo.Job.Job.GetLocalJobID() }) - } - - rm++ - continue - } - - jobOrders[i-rm] = jobOrders[i] - } - - jobOrders = jobOrders[:len(jobOrders)-rm] - if len(jobOrders) == 0 { - break - } - - // 遍历一轮后没有找到无依赖的任务,那么就是存在循环引用,排序失败 - if rm == 0 { - return nil, false - } - } - - return orderedJob, true -} - -func (s *DefaultPreScheduler) scheduleForNormalJob(jobSet *schsdk.JobSetInfo, job *schedulingJob, ccs map[schsdk.CCID]schmod.ComputingCenter, jobSchemes map[string]jobmod.JobScheduleScheme) (*jobmod.JobScheduleScheme, error) { - allCCs := make(map[schsdk.CCID]*candidate) - - // 初始化备选节点信息 - for _, cc := range ccs { - caNode := &candidate{ - CC: cc, - } - - // 检查此节点是否是它所引用的任务所选的节点 - for _, af := range job.Afters { - resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af) - if resJob == nil { - return nil, fmt.Errorf("resource job %s not found in the job set", af) - } - - // 由于jobs已经按照引用排序,所以正常情况下这里肯定能取到值 - scheme, ok := jobSchemes[resJob.TargetLocalJobID] - if !ok { - continue - } - - if scheme.TargetCCID == cc.CCID { - caNode.IsReferencedJobTarget = true - break - } - } - - allCCs[cc.CCID] = caNode - } - - norJob := job.Job.(*schsdk.NormalJobInfo) - - // 计算文件占有量得分 - err := s.calcFileScore(norJob.Files, allCCs) - if err != nil { - return nil, err - } - - // 计算资源余量得分 - err = s.calcResourceScore(norJob, allCCs) - if err != nil { - return nil, err - } - - allCCsArr := lo.Values(allCCs) - sort.Sort(CandidateArr(allCCsArr)) - - targetNode := allCCsArr[0] - if targetNode.Resource.MaxLevel == ResourceLevel3 { - return nil, ErrNoAvailableScheme - } - - scheme := s.makeSchemeForNode(norJob, targetNode) - return &scheme, nil -} - -func (s *DefaultPreScheduler) fillNormarlJobLocalUploadScheme(norJob *schsdk.NormalJobInfo, targetCCID schsdk.CCID, schemes map[string]schsdk.LocalFileUploadScheme, ccs map[schsdk.CCID]schmod.ComputingCenter) { - if localFile, ok := norJob.Files.Dataset.(*schsdk.LocalJobFileInfo); ok { - if _, ok := schemes[localFile.LocalPath]; !ok { - cdsNodeID := ccs[targetCCID].CDSNodeID - schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{ - LocalPath: localFile.LocalPath, - UploadToCDSNodeID: &cdsNodeID, - } - } - } - - if localFile, ok := norJob.Files.Code.(*schsdk.LocalJobFileInfo); ok { - if _, ok := schemes[localFile.LocalPath]; !ok { - cdsNodeID := ccs[targetCCID].CDSNodeID - schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{ - LocalPath: localFile.LocalPath, - UploadToCDSNodeID: &cdsNodeID, - } - } - } - - if localFile, ok := norJob.Files.Image.(*schsdk.LocalJobFileInfo); ok { - if _, ok := schemes[localFile.LocalPath]; !ok { - cdsNodeID := ccs[targetCCID].CDSNodeID - schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{ - LocalPath: localFile.LocalPath, - UploadToCDSNodeID: &cdsNodeID, - } - } - } -} - -func (s *DefaultPreScheduler) makeSchemeForNode(job *schsdk.NormalJobInfo, targetCC *candidate) jobmod.JobScheduleScheme { - scheme := jobmod.JobScheduleScheme{ - TargetCCID: targetCC.CC.CCID, - } - - // TODO 根据实际情况选择Move或者Load - - if _, ok := job.Files.Dataset.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Dataset.IsLoaded { - scheme.Dataset.Action = jobmod.ActionLoad - } else { - scheme.Dataset.Action = jobmod.ActionNo - } - - if _, ok := job.Files.Code.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Code.IsLoaded { - scheme.Code.Action = jobmod.ActionLoad - } else { - scheme.Code.Action = jobmod.ActionNo - } - - if _, ok := job.Files.Image.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Image.IsLoaded { - scheme.Image.Action = jobmod.ActionImportImage - } else { - scheme.Image.Action = jobmod.ActionNo - } - - return scheme -} - -func (s *DefaultPreScheduler) calcResourceScore(job *schsdk.NormalJobInfo, allCCs map[schsdk.CCID]*candidate) error { - for _, cc := range allCCs { - res, err := s.calcOneResourceScore(job.Resources, &cc.CC) - if err != nil { - return err - } - - cc.Resource = *res - } - - return nil -} - -// 划分节点资源等级,并计算资源得分 -func (s *DefaultPreScheduler) calcOneResourceScore(requires schsdk.JobResourcesInfo, cc *schmod.ComputingCenter) (*resourcesDetail, error) { - colCli, err := schglb.CollectorMQPool.Acquire() - if err != nil { - return nil, fmt.Errorf("new collector client: %w", err) - } - defer schglb.CollectorMQPool.Release(colCli) - - getResDataResp, err := colCli.GetAllResourceData(collector.NewGetAllResourceData(cc.UOPSlwNodeID)) - if err != nil { - return nil, err - } - - var resDetail resourcesDetail - - //计算资源得分 - totalScore := 0.0 - maxLevel := 0 - resKinds := 0 - - if requires.CPU > 0 { - res := findResuorce[*uopsdk.CPUResourceData](getResDataResp.Datas) - if res == nil { - resDetail.CPU.Level = ResourceLevel3 - resDetail.CPU.Score = 0 - } else { - resDetail.CPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.CPU) - resDetail.CPU.Score = (float64(res.Available.Value) / requires.CPU) * CpuResourceWeight - } - - maxLevel = math2.Max(maxLevel, resDetail.CPU.Level) - totalScore += resDetail.CPU.Score - resKinds++ - } - - if requires.GPU > 0 { - res := findResuorce[*uopsdk.GPUResourceData](getResDataResp.Datas) - if res == nil { - resDetail.GPU.Level = ResourceLevel3 - resDetail.GPU.Score = 0 - } else { - resDetail.GPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.GPU) - resDetail.GPU.Score = (float64(res.Available.Value) / requires.GPU) * CpuResourceWeight - } - - maxLevel = math2.Max(maxLevel, resDetail.GPU.Level) - totalScore += resDetail.GPU.Score - resKinds++ - } - - if requires.NPU > 0 { - res := findResuorce[*uopsdk.NPUResourceData](getResDataResp.Datas) - if res == nil { - resDetail.NPU.Level = ResourceLevel3 - resDetail.NPU.Score = 0 - } else { - resDetail.NPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.NPU) - resDetail.NPU.Score = (float64(res.Available.Value) / requires.NPU) * CpuResourceWeight - } - - maxLevel = math2.Max(maxLevel, resDetail.NPU.Level) - totalScore += resDetail.NPU.Score - resKinds++ - } - - if requires.MLU > 0 { - res := findResuorce[*uopsdk.MLUResourceData](getResDataResp.Datas) - if res == nil { - resDetail.MLU.Level = ResourceLevel3 - resDetail.MLU.Score = 0 - } else { - resDetail.MLU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.MLU) - resDetail.MLU.Score = (float64(res.Available.Value) / requires.MLU) * CpuResourceWeight - } - - maxLevel = math2.Max(maxLevel, resDetail.MLU.Level) - totalScore += resDetail.MLU.Score - resKinds++ - } - - if requires.Storage > 0 { - res := findResuorce[*uopsdk.StorageResourceData](getResDataResp.Datas) - if res == nil { - resDetail.Storage.Level = ResourceLevel3 - resDetail.Storage.Score = 0 - } else { - bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit)) - if err != nil { - return nil, err - } - - resDetail.Storage.Level = s.calcResourceLevel(float64(bytes), float64(requires.Storage)) - resDetail.Storage.Score = (float64(bytes) / float64(requires.Storage)) * StgResourceWeight - } - - maxLevel = math2.Max(maxLevel, resDetail.Storage.Level) - totalScore += resDetail.Storage.Score - resKinds++ - } - - if requires.Memory > 0 { - res := findResuorce[*uopsdk.MemoryResourceData](getResDataResp.Datas) - if res == nil { - resDetail.Memory.Level = ResourceLevel3 - resDetail.Memory.Score = 0 - } else { - bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit)) - if err != nil { - return nil, err - } - - resDetail.Memory.Level = s.calcResourceLevel(float64(bytes), float64(requires.Memory)) - resDetail.Memory.Score = (float64(bytes) / float64(requires.Memory)) * StgResourceWeight - } - - maxLevel = math2.Max(maxLevel, resDetail.Memory.Level) - totalScore += resDetail.Memory.Score - resKinds++ - } - - if resKinds == 0 { - return &resDetail, nil - } - - resDetail.TotalScore = totalScore - resDetail.AvgScore = resDetail.AvgScore / float64(resKinds) - resDetail.MaxLevel = maxLevel - - return &resDetail, nil -} - -func (s *DefaultPreScheduler) calcResourceLevel(avai float64, need float64) int { - if avai >= 1.5*need { - return ResourceLevel1 - } - - if avai >= need { - return ResourceLevel2 - } - - return ResourceLevel3 -} - -// 计算节点得分情况 -func (s *DefaultPreScheduler) calcFileScore(files schsdk.JobFilesInfo, allCCs map[schsdk.CCID]*candidate) error { - // 只计算运控返回的可用计算中心上的存储服务的数据权重 - cdsNodeToCC := make(map[cdssdk.NodeID]*candidate) - for _, cc := range allCCs { - cdsNodeToCC[cc.CC.CDSNodeID] = cc - } - - //计算code相关得分 - if pkgFile, ok := files.Code.(*schsdk.PackageJobFileInfo); ok { - codeFileScores, err := s.calcPackageFileScore(pkgFile.PackageID, cdsNodeToCC) - if err != nil { - return fmt.Errorf("calc code file score: %w", err) - } - for id, score := range codeFileScores { - allCCs[id].Files.Code = *score - } - } - - //计算dataset相关得分 - if pkgFile, ok := files.Dataset.(*schsdk.PackageJobFileInfo); ok { - datasetFileScores, err := s.calcPackageFileScore(pkgFile.PackageID, cdsNodeToCC) - if err != nil { - return fmt.Errorf("calc dataset file score: %w", err) - } - for id, score := range datasetFileScores { - allCCs[id].Files.Dataset = *score - } - } - - //计算image相关得分 - if imgFile, ok := files.Image.(*schsdk.ImageJobFileInfo); ok { - //计算image相关得分 - imageFileScores, err := s.calcImageFileScore(imgFile.ImageID, allCCs, cdsNodeToCC) - if err != nil { - return fmt.Errorf("calc image file score: %w", err) - } - for id, score := range imageFileScores { - allCCs[id].Files.Image = *score - } - } - - for _, cc := range allCCs { - cc.Files.TotalScore = cc.Files.Code.CachingScore + - cc.Files.Code.LoadingScore + - cc.Files.Dataset.CachingScore + - cc.Files.Dataset.LoadingScore + - cc.Files.Image.CachingScore + - cc.Files.Image.LoadingScore - } - - return nil -} - -// 计算package在各节点的得分情况 -func (s *DefaultPreScheduler) calcPackageFileScore(packageID cdssdk.PackageID, cdsNodeToCC map[cdssdk.NodeID]*candidate) (map[schsdk.CCID]*fileDetail, error) { - colCli, err := schglb.CollectorMQPool.Acquire() - if err != nil { - return nil, fmt.Errorf("new collector client: %w", err) - } - defer schglb.CollectorMQPool.Release(colCli) - - ccFileScores := make(map[schsdk.CCID]*fileDetail) - - // TODO UserID - cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, packageID)) - if err != nil { - return nil, err - } - - for _, cdsNodeCacheInfo := range cachedResp.NodeInfos { - cc, ok := cdsNodeToCC[cdsNodeCacheInfo.NodeID] - if !ok { - continue - } - - ccFileScores[cc.CC.CCID] = &fileDetail{ - //TODO 根据缓存方式不同,可能会有不同的计算方式 - CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight, - } - } - - // TODO UserID - loadedResp, err := colCli.PackageGetLoadedStgNodes(collector.NewPackageGetLoadedStgNodes(1, packageID)) - if err != nil { - return nil, err - } - - for _, cdsNodeID := range loadedResp.StgNodeIDs { - cc, ok := cdsNodeToCC[cdsNodeID] - if !ok { - continue - } - - sfc, ok := ccFileScores[cc.CC.CCID] - if !ok { - sfc = &fileDetail{} - ccFileScores[cc.CC.CCID] = sfc - } - - sfc.LoadingScore = 1 * LoadedWeight - sfc.IsLoaded = true - } - - return ccFileScores, nil -} - -// 计算package在各节点的得分情况 -func (s *DefaultPreScheduler) calcImageFileScore(imageID schsdk.ImageID, allCCs map[schsdk.CCID]*candidate, cdsNodeToCC map[cdssdk.NodeID]*candidate) (map[schsdk.CCID]*fileDetail, error) { - colCli, err := schglb.CollectorMQPool.Acquire() - if err != nil { - return nil, fmt.Errorf("new collector client: %w", err) - } - defer schglb.CollectorMQPool.Release(colCli) - - magCli, err := schglb.ManagerMQPool.Acquire() - if err != nil { - return nil, fmt.Errorf("new manager client: %w", err) - } - defer schglb.ManagerMQPool.Release(magCli) - - imageInfoResp, err := magCli.GetImageInfo(mgrmq.NewGetImageInfo(imageID)) - if err != nil { - return nil, fmt.Errorf("getting image info: %w", err) - } - - ccFileScores := make(map[schsdk.CCID]*fileDetail) - - if imageInfoResp.Image.CDSPackageID != nil { - cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, *imageInfoResp.Image.CDSPackageID)) - if err != nil { - return nil, err - } - - for _, cdsNodeCacheInfo := range cachedResp.NodeInfos { - cc, ok := cdsNodeToCC[cdsNodeCacheInfo.NodeID] - if !ok { - continue - } - - ccFileScores[cc.CC.CCID] = &fileDetail{ - //TODO 根据缓存方式不同,可能会有不同的计算方式 - CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight, - } - } - } - - // 镜像的LoadingScore是判断是否导入到算力中心 - for _, pcmImg := range imageInfoResp.PCMImages { - _, ok := allCCs[pcmImg.CCID] - if !ok { - continue - } - - fsc, ok := ccFileScores[pcmImg.CCID] - if !ok { - fsc = &fileDetail{} - ccFileScores[pcmImg.CCID] = fsc - } - - fsc.LoadingScore = 1 * LoadedWeight - fsc.IsLoaded = true - } - - return ccFileScores, nil -} -func findResuorce[T uopsdk.ResourceData](all []uopsdk.ResourceData) T { - for _, data := range all { - if ret, ok := data.(T); ok { - return ret - } - } - - var def T - return def -} - -func findJobInfo[T schsdk.JobInfo](jobs []schsdk.JobInfo, localJobID string) T { - for _, job := range jobs { - if ret, ok := job.(T); ok && job.GetLocalJobID() == localJobID { - return ret - } - } - - var def T - return def -} diff --git a/client/internal/prescheduler/default_prescheduler_test.go b/client/internal/prescheduler/default_prescheduler_test.go deleted file mode 100644 index e10ae82..0000000 --- a/client/internal/prescheduler/default_prescheduler_test.go +++ /dev/null @@ -1,117 +0,0 @@ -package prescheduler - -import ( - "testing" - - "github.com/samber/lo" - . "github.com/smartystreets/goconvey/convey" - - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" -) - -func TestOrderByAfters(t *testing.T) { - cases := []struct { - title string - jobs []*schedulingJob - wants []string - }{ - { - title: "所有Job都有直接或间接的依赖关系", - jobs: []*schedulingJob{ - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, - Afters: []string{"2"}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, - Afters: []string{}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "3"}}, - Afters: []string{"1"}, - }, - }, - wants: []string{"2", "1", "3"}, - }, - - { - title: "部分Job之间无依赖关系", - jobs: []*schedulingJob{ - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, - Afters: []string{"2"}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, - Afters: []string{}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "3"}}, - Afters: []string{"1"}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "4"}}, - Afters: []string{"5"}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "5"}}, - Afters: []string{}, - }, - }, - wants: []string{"2", "5", "1", "3", "4"}, - }, - - { - title: "存在循环依赖", - jobs: []*schedulingJob{ - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, - Afters: []string{"2"}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, - Afters: []string{"1"}, - }, - }, - wants: nil, - }, - - { - title: "完全不依赖", - jobs: []*schedulingJob{ - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, - Afters: []string{}, - }, - - { - Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, - Afters: []string{}, - }, - }, - wants: []string{"1", "2"}, - }, - } - - sch := NewDefaultPreScheduler() - for _, c := range cases { - Convey(c.title, t, func() { - ordered, ok := sch.orderByAfters(c.jobs) - if c.wants == nil { - So(ok, ShouldBeFalse) - } else { - So(ok, ShouldBeTrue) - - ids := lo.Map(ordered, func(item *schedulingJob, idx int) string { return item.Job.GetLocalJobID() }) - So(ids, ShouldResemble, c.wants) - } - }) - } -} diff --git a/client/internal/prescheduler/prescheduler.go b/client/internal/prescheduler/prescheduler.go deleted file mode 100644 index a9dc5c6..0000000 --- a/client/internal/prescheduler/prescheduler.go +++ /dev/null @@ -1,10 +0,0 @@ -package prescheduler - -import ( - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" - jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" -) - -type PreScheduler interface { - Schedule(info *schsdk.JobSetInfo) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) -} diff --git a/client/internal/services/job.go b/client/internal/services/job.go new file mode 100644 index 0000000..688ca58 --- /dev/null +++ b/client/internal/services/job.go @@ -0,0 +1,27 @@ +package services + +import ( + "fmt" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + schglb "gitlink.org.cn/cloudream/scheduler/common/globals" + mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" +) + +// Create 创建多实例任务中的实例任务 +func (svc *JobSetService) CreateInstance(LocalJobID string, LocalPath schsdk.JobFileInfo) (schsdk.JobID, schsdk.JobFilesUploadScheme, error) { + + scheme := new(schsdk.JobFilesUploadScheme) + + mgrCli, err := schglb.ManagerMQPool.Acquire() + if err != nil { + return "", *scheme, fmt.Errorf("new manager client: %w", err) + } + defer schglb.ManagerMQPool.Release(mgrCli) + + resp, err := mgrCli.CreateInstance(mgrmq.NewCreateInstance(LocalJobID, LocalPath)) + if err != nil { + return "", *scheme, fmt.Errorf("submitting job set to manager: %w", err) + } + + return resp.InstanceID, resp.UploadScheme, nil +} diff --git a/client/internal/services/jobset.go b/client/internal/services/jobset.go index 50e3d31..f3f361e 100644 --- a/client/internal/services/jobset.go +++ b/client/internal/services/jobset.go @@ -25,7 +25,7 @@ func (svc *JobSetService) Submit(info schsdk.JobSetInfo) (schsdk.JobSetID, *schs } defer schglb.ManagerMQPool.Release(mgrCli) - schScheme, uploadScheme, err := svc.preScheduler.Schedule(&info) + schScheme, uploadScheme, err := svc.preScheduler.ScheduleJobSet(&info) if err != nil { return "", nil, fmt.Errorf("pre scheduling: %w", err) } diff --git a/client/internal/services/service.go b/client/internal/services/service.go index 1d2029c..06a73dd 100644 --- a/client/internal/services/service.go +++ b/client/internal/services/service.go @@ -1,6 +1,8 @@ package services -import "gitlink.org.cn/cloudream/scheduler/client/internal/prescheduler" +import ( + "gitlink.org.cn/cloudream/common/pkgs/prescheduler" +) type Service struct { preScheduler prescheduler.PreScheduler diff --git a/client/main.go b/client/main.go index 8a24c72..7dfbdf6 100644 --- a/client/main.go +++ b/client/main.go @@ -2,6 +2,7 @@ package main import ( "fmt" + "gitlink.org.cn/cloudream/common/pkgs/prescheduler" "os" _ "google.golang.org/grpc/balancer/grpclb" @@ -9,7 +10,6 @@ import ( "gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/scheduler/client/internal/cmdline" "gitlink.org.cn/cloudream/scheduler/client/internal/config" - "gitlink.org.cn/cloudream/scheduler/client/internal/prescheduler" "gitlink.org.cn/cloudream/scheduler/client/internal/services" schglb "gitlink.org.cn/cloudream/scheduler/common/globals" ) diff --git a/common/pkgs/mq/manager/job.go b/common/pkgs/mq/manager/job.go index 9e89540..d8394d6 100644 --- a/common/pkgs/mq/manager/job.go +++ b/common/pkgs/mq/manager/job.go @@ -15,11 +15,15 @@ type JobService interface { GetServiceList(msg *GetServiceList) (*GetServiceListResp, *mq.CodeMessage) GetJobSetDump(msg *GetJobSetDump) (*GetJobSetDumpResp, *mq.CodeMessage) + + CreateInstance(msg *CreateInstance) (*CreateInstanceResp, *mq.CodeMessage) } // 提交任务集 var _ = Register(Service.SubmitJobSet) +var _ = Register(Service.CreateInstance) + type SubmitJobSet struct { mq.MessageBodyBase JobSet schsdk.JobSetInfo `json:"jobSet"` @@ -45,6 +49,36 @@ func (c *Client) SubmitJobSet(msg *SubmitJobSet, opts ...mq.RequestOption) (*Sub return mq.Request(Service.SubmitJobSet, c.roundTripper, msg, opts...) } +type CreateInstance struct { + mq.MessageBodyBase + LocalJobID string + LocalPath schsdk.JobFileInfo +} + +type CreateInstanceResp struct { + mq.MessageBodyBase + InstanceID schsdk.JobID `json:"instanceID"` + UploadScheme schsdk.JobFilesUploadScheme `json:"uploadScheme"` +} + +func NewCreateInstance(LocalJobID string, LocalPath schsdk.JobFileInfo) *CreateInstance { + return &CreateInstance{ + LocalJobID: LocalJobID, + LocalPath: LocalPath, + } +} + +func NewCreateInstanceResp(InstanceID schsdk.JobID, UploadScheme schsdk.JobFilesUploadScheme) *CreateInstanceResp { + return &CreateInstanceResp{ + InstanceID: InstanceID, + UploadScheme: UploadScheme, + } +} + +func (c *Client) CreateInstance(instance *CreateInstance, opts ...mq.RequestOption) (*CreateInstanceResp, error) { + return mq.Request(Service.CreateInstance, c.roundTripper, instance, opts...) +} + // JobSet中需要使用的一个文件上传完成 var _ = Register(Service.JobSetLocalFileUploaded) diff --git a/manager/internal/jobmgr/event/instance_create.go b/manager/internal/jobmgr/event/instance_create.go new file mode 100644 index 0000000..4a2e59e --- /dev/null +++ b/manager/internal/jobmgr/event/instance_create.go @@ -0,0 +1,25 @@ +package event + +import ( + "gitlink.org.cn/cloudream/common/pkgs/future" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" +) + +type CreateInstanceFuture = *future.SetValueFuture[CreateInstanceResult] + +type InstanceCreate struct { + LocalPath schsdk.JobFileInfo + Result CreateInstanceFuture +} + +type CreateInstanceResult struct { + JobID schsdk.JobID + FilesUploadScheme schsdk.JobFilesUploadScheme +} + +func NewInstanceCreate(LocalPath schsdk.JobFileInfo, future CreateInstanceFuture) *InstanceCreate { + return &InstanceCreate{ + LocalPath: LocalPath, + Result: future, + } +} diff --git a/manager/internal/jobmgr/event/utils.go b/manager/internal/jobmgr/event/utils.go index c2c803d..dbfbc7d 100644 --- a/manager/internal/jobmgr/event/utils.go +++ b/manager/internal/jobmgr/event/utils.go @@ -6,22 +6,40 @@ import ( "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" ) +// WaitType 等待一个特定类型的事件。 +// 通过给定的上下文和事件集,这个函数会阻塞直到匹配指定类型的事件发生。 +// ctx: 用于控制等待过程的上下文,如果上下文被取消或到期,等待将被终止。 +// set: 指向一个事件集,这个事件集会被用来等待特定类型的事件。 +// 返回值 T: 等待到的事件,它会被强制转换为函数参数类型 T。 +// 返回值 bool: 表示等待操作是否成功。如果成功等到事件,返回 true;如果因为上下文被取消或到期而终止,返回 false。 func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, bool) { + // 使用 set.Wait 方法等待一个满足给定条件的事件。 + // 条件函数检查事件是否能被转换为类型 T。 ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool { _, ok := evt.(T) return ok }) + // 因为 set.Wait 返回的事件类型是 jobmgr.Event,这里将它转换为 T 类型,并返回转换结果及操作成功标志。 return ret.(T), ok } +// WaitTypeAnd 等待一个特定类型的事件并检查该事件是否满足给定的条件。 +// ctx: 上下文,用于控制等待过程的取消或超时。 +// set: 事件集合,从中等待事件发生。 +// cond: 一个函数,用于检查等待的事件是否满足特定条件。 +// 返回值为满足条件的事件和一个布尔值,指示获取事件是否成功。 func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond func(val T) bool) (T, bool) { + // 等待一个满足特定类型和条件的事件。 ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool { + // 尝试将事件断言为特定类型T,并检查断言是否成功。 e, ok := evt.(T) if !ok { - return false + return false // 如果事件不是期望的类型T,则返回false。 } + // 如果事件是类型T且满足给定条件,则返回true。 return cond(e) }) + // 断言返回的事件为类型T,并返回该事件和操作成功标志。 return ret.(T), ok } diff --git a/manager/internal/jobmgr/event_set.go b/manager/internal/jobmgr/event_set.go index efcce6f..a26ec55 100644 --- a/manager/internal/jobmgr/event_set.go +++ b/manager/internal/jobmgr/event_set.go @@ -25,20 +25,26 @@ func NewEventSet() EventSet { return EventSet{} } +// Post 函数用于向事件集合中发布一个事件。 +// 如果有等待该事件的协程,会唤醒它们并将事件传递给它们。 +// 参数: +// +// evt Event - 需要发布的事件对象。 func (s *EventSet) Post(evt Event) { - s.lock.Lock() - defer s.lock.Unlock() + s.lock.Lock() // 加锁保护事件集合 + defer s.lock.Unlock() // 确保在函数结束时释放锁 - // 一个事件能唤醒多个等待者 - used := false + // 遍历等待者列表,查找匹配的等待者。如果找到,从列表中移除,并设置其future的值。 + used := false // 标记当前事件是否已被使用(即是否唤醒了某个等待者) for i, waiter := range s.waiters { - if waiter.condition(evt) { - s.waiters = lo2.RemoveAt(s.waiters, i) - waiter.future.SetValue(evt) - used = true + if waiter.condition(evt) { // 检查当前事件是否满足等待条件 + s.waiters = lo2.RemoveAt(s.waiters, i) // 从等待者列表中移除当前等待者 + waiter.future.SetValue(evt) // 设置等待者的future值为当前事件 + used = true // 标记事件已被使用 } } + // 如果没有匹配的等待者,则将事件添加到事件列表中。 if !used { s.events = append(s.events, evt) } diff --git a/manager/internal/jobmgr/job/instance_job.go b/manager/internal/jobmgr/job/instance_job.go new file mode 100644 index 0000000..24e9243 --- /dev/null +++ b/manager/internal/jobmgr/job/instance_job.go @@ -0,0 +1,30 @@ +package job + +import ( + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" +) + +type InstanceJob struct { + Info schsdk.InstanceJobInfo // 提交任务时提供的任务描述信息 + Files jobmod.JobFiles // 任务需要的文件 + TargetCCID schsdk.CCID // 将要运行此任务的算力中心ID + OutputFullPath string // 程序结果的完整输出路径 +} + +func NewInstanceJob(info schsdk.InstanceJobInfo) *InstanceJob { + return &InstanceJob{ + Info: info, + } +} + +func (j *InstanceJob) GetInfo() schsdk.JobInfo { + return &j.Info +} + +func (j *InstanceJob) Dump() jobmod.JobBodyDump { + return &jobmod.NormalJobDump{ + Files: j.Files, + TargetCCID: j.TargetCCID, + } +} diff --git a/manager/internal/jobmgr/job/multiInstance_job.go b/manager/internal/jobmgr/job/multiInstance_job.go new file mode 100644 index 0000000..34325ad --- /dev/null +++ b/manager/internal/jobmgr/job/multiInstance_job.go @@ -0,0 +1,30 @@ +package job + +import ( + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" +) + +type MultiInstanceJob struct { + Info schsdk.MultiInstanceJobInfo + Files jobmod.JobFiles + TargetCCID schsdk.CCID + SubJobs []schsdk.JobID +} + +func NewMultiInstanceJob(info schsdk.MultiInstanceJobInfo) *MultiInstanceJob { + return &MultiInstanceJob{ + Info: info, + } +} + +func (j *MultiInstanceJob) GetInfo() schsdk.JobInfo { + return &j.Info +} + +func (j *MultiInstanceJob) Dump() jobmod.JobBodyDump { + return &jobmod.NormalJobDump{ + Files: j.Files, + TargetCCID: j.TargetCCID, + } +} diff --git a/manager/internal/jobmgr/job/state/ready_to_adjust.go b/manager/internal/jobmgr/job/state/ready_to_adjust.go index a14fbc8..9522ab1 100644 --- a/manager/internal/jobmgr/job/state/ready_to_adjust.go +++ b/manager/internal/jobmgr/job/state/ready_to_adjust.go @@ -41,7 +41,7 @@ func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error if rt, ok := norJob.Info.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok { evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool { - return val.Job.GetInfo().GetLocalJobID() == rt.ResourceLocalJobID + return val.Job.GetInfo().GetLocalJobID() == rt.DataReturnLocalJobID }) if !ok { return jobmgr.ErrJobCancelled diff --git a/manager/internal/jobmgr/job/state/running.go b/manager/internal/jobmgr/job/state/running.go new file mode 100644 index 0000000..fa829ea --- /dev/null +++ b/manager/internal/jobmgr/job/state/running.go @@ -0,0 +1,79 @@ +package state + +import ( + "context" + "gitlink.org.cn/cloudream/common/pkgs/prescheduler" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type CreateInstance struct { + preScheduler prescheduler.PreScheduler +} + +func NewCreateInstance(preScheduler prescheduler.PreScheduler) *CreateInstance { + return &CreateInstance{ + preScheduler: preScheduler, + } +} + +func (s *CreateInstance) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { + s.do(rtx, job) +} + +func (s *CreateInstance) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + event.WaitType[event.Cancel](ctx, rtx.EventSet) + cancel() + }() + + multInstJob := jo.Body.(*job.MultiInstanceJob) + + for { + // 监听创建实例事件 + ic, ok := event.WaitType[event.InstanceCreate](ctx, rtx.EventSet) + if !ok { + break + } + + // 构建InstanceJobInfo + files := schsdk.JobFilesInfo{ + Dataset: ic.LocalPath, + Code: multInstJob.Info.Files.Code, + Image: multInstJob.Info.Files.Image, + } + + instJobInfo := &schsdk.InstanceJobInfo{ + LocalJobID: multInstJob.Info.LocalJobID, + Files: files, + Runtime: multInstJob.Info.Runtime, + Resources: multInstJob.Info.Resources, + } + + // 生成预调度方案和文件上传方案 + jobSchedule, filesUploadScheme, err := s.preScheduler.ScheduleJob(instJobInfo) + if err != nil { + ic.Result.SetError(err) + continue + } + + // 创建实例并运行 + instanceJob := job.NewInstanceJob(*instJobInfo) + jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreSchuduling(*jobSchedule)) + + // 在多实例任务中新增这个实例的任务ID + multInstJob.SubJobs = append(multInstJob.SubJobs, jobID) + + // 将实例ID和文件上传方案返回 + ic.Result.SetValue(event.CreateInstanceResult{ + JobID: jobID, + FilesUploadScheme: *filesUploadScheme, + }) + + } +} diff --git a/manager/internal/jobmgr/jobmgr.go b/manager/internal/jobmgr/jobmgr.go index 62bd889..2a9f810 100644 --- a/manager/internal/jobmgr/jobmgr.go +++ b/manager/internal/jobmgr/jobmgr.go @@ -95,17 +95,25 @@ func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) { }() } +// BroadcastEvent 向所有属于指定 jobSet 的任务广播一个事件。 +// jobSetID: 代表作业集的唯一标识符。 +// evt: 需要广播的事件。 func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) { + // 加锁以确保发布事件时的线程安全 m.pubLock.Lock() - defer m.pubLock.Unlock() + defer m.pubLock.Unlock() // 确保函数退出时释放锁 + // 尝试从管理器的作业集中获取指定的作业集 jobSet, ok := m.jobSets[jobSetID] if !ok { + // 如果作业集不存在,则直接返回 return } + // 遍历作业集中的所有任务,并为每个任务发布事件 for _, mjob := range jobSet.jobs { go func(j *mgrJob) { + // 使用 goroutine 为每个任务发布事件,以异步方式处理,避免阻塞 j.eventSet.Post(evt) }(mjob) } @@ -116,18 +124,31 @@ type SubmittingJob struct { InitState JobState } +// SubmitJobSet 提交一个作业集,将一组提交作业转换为系统可识别的作业集,并为每个提交的作业创建一个唯一的作业ID。 +// +// 参数: +// +// jobs []SubmittingJob - 要提交的作业列表,每个作业包含作业的初始状态和内容。 +// +// 返回值: +// +// schsdk.JobSetID - 生成的作业集ID,用于标识这个作业集。 func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { + // 加锁以保护对作业集ID和作业ID索引的修改 m.pubLock.Lock() defer m.pubLock.Unlock() + // 生成一个新的作业集ID,并递增作业集ID索引 jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex)) m.jobSetIDIndex += 1 + // 创建一个新的作业集实例,并初始化其作业映射 jobSet := &mgrJobSet{ jobs: make(map[schsdk.JobID]*mgrJob), } m.jobSets[jobSetID] = jobSet + // 遍历提交的作业,为每个作业创建一个唯一的作业ID,初始化作业状态,并将其添加到作业集中 for i, subJob := range jobs { jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i)) job := &mgrJob{ @@ -140,10 +161,20 @@ func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { } jobSet.jobs[jobID] = job - m.ChangeState(&job.job, subJob.InitState) + // 更改作业的初始状态 + //m.ChangeState(&job.job, subJob.InitState) + go func() { + subJob.InitState.Run(JobStateRunContext{ + Mgr: m, + EventSet: &job.eventSet, + LastState: job.state, + }, &job.job) + }() } + // 更新作业ID索引,基于提交的作业数量 m.jobIDIndex += len(jobs) + // 返回生成的作业集ID return jobSetID } @@ -167,3 +198,23 @@ func (m *Manager) DumpJobSet(jobSetID schsdk.JobSetID) []jobmod.JobDump { return jobDumps } + +type PreSchedulerInstJob struct { + Body JobBody + InitState JobState +} + +// AddJob 添加一个作业到指定的作业集。 +func (m *Manager) AddJob(jobSetID schsdk.JobSetID, jobBody JobBody, State JobState) schsdk.JobID { + jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+1)) + + job := Job{ + JobSetID: jobSetID, + JobID: jobID, + Body: jobBody, + } + + m.ChangeState(&job, State) + + return jobID +} diff --git a/manager/internal/mq/job.go b/manager/internal/mq/job.go index 13f99e8..766c8f5 100644 --- a/manager/internal/mq/job.go +++ b/manager/internal/mq/job.go @@ -1,8 +1,10 @@ package mq import ( + "context" "errors" "fmt" + "gitlink.org.cn/cloudream/common/pkgs/future" "gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/pkgs/logger" @@ -43,12 +45,41 @@ func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetRe Body: job, InitState: state.NewWaitTargetComplete(), }) + + case *schsdk.MultiInstanceJobInfo: + job := job.NewMultiInstanceJob(*info) + + preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID] + if !ok { + return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID)) + } + + jobs = append(jobs, jobmgr.SubmittingJob{ + Body: job, + InitState: state.NewPreSchuduling(preSch), + }) + } } return mq.ReplyOK(mgrmq.NewSubmitJobSetResp(svc.jobMgr.SubmitJobSet(jobs))) } +func (svc *Service) CreateInstance(instInfo *mgrmq.CreateInstance) (*mgrmq.CreateInstanceResp, *mq.CodeMessage) { + logger.Debugf("start create instance") + + fut := future.NewSetValue[event.CreateInstanceResult]() + svc.jobMgr.PostEvent(schsdk.JobID(instInfo.LocalJobID), event.NewInstanceCreate(instInfo.LocalPath, fut)) + + result, err := fut.WaitValue(context.TODO()) + + if err != nil { + return nil, mq.Failed(errorcode.OperationFailed, err.Error()) + } + + return mq.ReplyOK(mgrmq.NewCreateInstanceResp(result.JobID, result.FilesUploadScheme)) +} + // 任务集中某个文件上传完成 func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded) (*mgrmq.JobSetLocalFileUploadedResp, *mq.CodeMessage) { logger.WithField("LocalPath", msg.LocalPath). From 0de174fd9aa401a580baaf6ca086850d19416f30 Mon Sep 17 00:00:00 2001 From: JeshuaRen <270813223@qq.com> Date: Tue, 7 May 2024 15:45:13 +0800 Subject: [PATCH 03/10] =?UTF-8?q?=E5=A4=9A=E5=AE=9E=E4=BE=8B=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1=E4=BB=A3=E7=A0=81=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- client/internal/http/job.go | 26 +++++++-- client/internal/http/server.go | 2 +- common/models/job/body.go | 22 ++++++++ manager/internal/jobmgr/event/utils.go | 8 +++ manager/internal/jobmgr/job/instance_job.go | 7 +-- .../internal/jobmgr/job/multiInstance_job.go | 16 +++--- .../internal/jobmgr/job/state/adjusting.go | 35 +++++++++--- .../internal/jobmgr/job/state/executing.go | 30 ++++++++--- .../jobmgr/job/state/multiInstance_init.go | 53 +++++++++++++++++++ .../{running.go => multiInstance_running.go} | 37 +++++++++---- .../jobmgr/job/state/prescheduling.go | 31 ++++++++--- .../jobmgr/job/state/ready_to_adjust.go | 16 ++++-- manager/internal/mq/job.go | 6 ++- 13 files changed, 236 insertions(+), 53 deletions(-) create mode 100644 manager/internal/jobmgr/job/state/multiInstance_init.go rename manager/internal/jobmgr/job/state/{running.go => multiInstance_running.go} (62%) diff --git a/client/internal/http/job.go b/client/internal/http/job.go index 8ea8185..b429c44 100644 --- a/client/internal/http/job.go +++ b/client/internal/http/job.go @@ -5,6 +5,8 @@ import ( "gitlink.org.cn/cloudream/common/consts/errorcode" "gitlink.org.cn/cloudream/common/pkgs/logger" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + "gitlink.org.cn/cloudream/common/utils/serder" + "io" "net/http" ) @@ -19,7 +21,7 @@ type CreateInstanceResp struct { type CreateInstanceReq struct { LocalJobID string `json:"localJobID" binding:"required"` - LocalPath schsdk.JobFileInfo `json:"filePath" binding:"required"` + LocalPath schsdk.JobFileInfo `json:"localPath" binding:"required"` } func (s *Server) JobSvc() *JobService { @@ -31,10 +33,24 @@ func (s *Server) JobSvc() *JobService { func (s *JobService) CreateInstance(ctx *gin.Context) { log := logger.WithField("HTTP", "JobSet.HTTP") - var req CreateInstanceReq - if err := ctx.ShouldBindQuery(&req); err != nil { - log.Warnf("binding body: %s", err.Error()) - ctx.JSON(http.StatusBadRequest, Failed(errorcode.BadArgument, "missing argument or invalid argument")) + //var req CreateInstanceReq + //if err := ctx.ShouldBindJSON(&req); err != nil { + // log.Warnf("binding body: %s", err.Error()) + // ctx.JSON(http.StatusBadRequest, Failed(errorcode.BadArgument, "missing argument or invalid argument")) + // return + //} + + bodyData, err := io.ReadAll(ctx.Request.Body) + if err != nil { + log.Warnf("reading request body: %s", err.Error()) + ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed")) + return + } + + req, err := serder.JSONToObjectEx[CreateInstanceReq](bodyData) + if err != nil { + log.Warnf("parsing request body: %s", err.Error()) + ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed")) return } diff --git a/client/internal/http/server.go b/client/internal/http/server.go index 2b2eb38..38eba6a 100644 --- a/client/internal/http/server.go +++ b/client/internal/http/server.go @@ -39,7 +39,7 @@ func (s *Server) Serve() error { func (s *Server) initRouters() { s.engine.POST("/jobSet/submit", s.JobSetSvc().Submit) - s.engine.POST("/jobSet/submit", s.JobSvc().CreateInstance) + s.engine.POST("/job/CreateInstance", s.JobSvc().CreateInstance) s.engine.POST("/jobSet/localFileUploaded", s.JobSetSvc().LocalFileUploaded) s.engine.GET("/jobSet/getServiceList", s.JobSetSvc().GetServiceList) } diff --git a/common/models/job/body.go b/common/models/job/body.go index 8b99381..1d91981 100644 --- a/common/models/job/body.go +++ b/common/models/job/body.go @@ -40,3 +40,25 @@ type DataReturnJobDump struct { func (d *DataReturnJobDump) getType() JobBodyDumpType { return d.Type } + +type InstanceJobDump struct { + serder.Metadata `union:"NormalJob"` + Type JobBodyDumpType `json:"type"` + TargetCCID schsdk.CCID `json:"targetCCID"` + Files JobFiles `json:"files"` +} + +func (d *InstanceJobDump) getType() JobBodyDumpType { + return d.Type +} + +type MultiInstanceJobDump struct { + serder.Metadata `union:"NormalJob"` + Type JobBodyDumpType `json:"type"` + TargetCCID schsdk.CCID `json:"targetCCID"` + Files JobFiles `json:"files"` +} + +func (d *MultiInstanceJobDump) getType() JobBodyDumpType { + return d.Type +} diff --git a/manager/internal/jobmgr/event/utils.go b/manager/internal/jobmgr/event/utils.go index dbfbc7d..496ef1f 100644 --- a/manager/internal/jobmgr/event/utils.go +++ b/manager/internal/jobmgr/event/utils.go @@ -19,6 +19,10 @@ func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, boo _, ok := evt.(T) return ok }) + if ret == nil { + var r T + return r, false // 如果事件为空,则返回false。 + } // 因为 set.Wait 返回的事件类型是 jobmgr.Event,这里将它转换为 T 类型,并返回转换结果及操作成功标志。 return ret.(T), ok } @@ -40,6 +44,10 @@ func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond // 如果事件是类型T且满足给定条件,则返回true。 return cond(e) }) + if ret == nil { + var r T + return r, false // 如果事件为空,则返回false。 + } // 断言返回的事件为类型T,并返回该事件和操作成功标志。 return ret.(T), ok } diff --git a/manager/internal/jobmgr/job/instance_job.go b/manager/internal/jobmgr/job/instance_job.go index 24e9243..350bf5a 100644 --- a/manager/internal/jobmgr/job/instance_job.go +++ b/manager/internal/jobmgr/job/instance_job.go @@ -12,9 +12,10 @@ type InstanceJob struct { OutputFullPath string // 程序结果的完整输出路径 } -func NewInstanceJob(info schsdk.InstanceJobInfo) *InstanceJob { +func NewInstanceJob(info schsdk.InstanceJobInfo, files jobmod.JobFiles) *InstanceJob { return &InstanceJob{ - Info: info, + Info: info, + Files: files, } } @@ -23,7 +24,7 @@ func (j *InstanceJob) GetInfo() schsdk.JobInfo { } func (j *InstanceJob) Dump() jobmod.JobBodyDump { - return &jobmod.NormalJobDump{ + return &jobmod.InstanceJobDump{ Files: j.Files, TargetCCID: j.TargetCCID, } diff --git a/manager/internal/jobmgr/job/multiInstance_job.go b/manager/internal/jobmgr/job/multiInstance_job.go index 34325ad..bf0827e 100644 --- a/manager/internal/jobmgr/job/multiInstance_job.go +++ b/manager/internal/jobmgr/job/multiInstance_job.go @@ -6,15 +6,17 @@ import ( ) type MultiInstanceJob struct { - Info schsdk.MultiInstanceJobInfo - Files jobmod.JobFiles - TargetCCID schsdk.CCID - SubJobs []schsdk.JobID + Info schsdk.MultiInstanceJobInfo + Files jobmod.JobFiles + TargetCCID schsdk.CCID + SubJobs []schsdk.JobID + PreScheduler jobmod.JobScheduleScheme } -func NewMultiInstanceJob(info schsdk.MultiInstanceJobInfo) *MultiInstanceJob { +func NewMultiInstanceJob(info schsdk.MultiInstanceJobInfo, preScheduler jobmod.JobScheduleScheme) *MultiInstanceJob { return &MultiInstanceJob{ - Info: info, + Info: info, + PreScheduler: preScheduler, } } @@ -23,7 +25,7 @@ func (j *MultiInstanceJob) GetInfo() schsdk.JobInfo { } func (j *MultiInstanceJob) Dump() jobmod.JobBodyDump { - return &jobmod.NormalJobDump{ + return &jobmod.MultiInstanceJobDump{ Files: j.Files, TargetCCID: j.TargetCCID, } diff --git a/manager/internal/jobmgr/job/state/adjusting.go b/manager/internal/jobmgr/job/state/adjusting.go index 0269baf..19d1ac1 100644 --- a/manager/internal/jobmgr/job/state/adjusting.go +++ b/manager/internal/jobmgr/job/state/adjusting.go @@ -46,7 +46,25 @@ func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.J } func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { - norJob := jo.Body.(*job.NormalJob) + //norJob := jo.Body.(*job.NormalJob) + var jobFilesInfo schsdk.JobFilesInfo + var jobFiles *jobmod.JobFiles + var targetCCID schsdk.CCID + + switch runningJob := jo.Body.(type) { + case *job.NormalJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + case *job.MultiInstanceJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + case *job.InstanceJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + } ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -77,7 +95,8 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { return fmt.Errorf("getting cds storage info: %w", err) } // TODO UserID - norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID) + //norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID) + utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID) wg := sync.WaitGroup{} wg.Add(3) @@ -86,7 +105,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { go func() { defer wg.Done() - e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset) + e1 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset) if e1 != nil { cancel() } @@ -94,7 +113,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { go func() { defer wg.Done() - e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code) + e2 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code) if e2 != nil { cancel() } @@ -102,7 +121,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { go func() { defer wg.Done() - e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image) + e3 = s.doImageScheduling(ctx, rtx, targetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image) if e3 != nil { cancel() } @@ -111,7 +130,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { return errors.Join(e1, e2, e3) } -func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { +func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { @@ -173,7 +192,7 @@ func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobState return nil } -func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, job *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { +func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { @@ -260,7 +279,7 @@ func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRu } // TODO 镜像名称 - err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, job.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now()) + err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now()) if err != nil { return fmt.Errorf("creating image info: %w", err) } diff --git a/manager/internal/jobmgr/job/state/executing.go b/manager/internal/jobmgr/job/state/executing.go index a8cc7f4..f77a02c 100644 --- a/manager/internal/jobmgr/job/state/executing.go +++ b/manager/internal/jobmgr/job/state/executing.go @@ -3,6 +3,7 @@ package state import ( "context" "fmt" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" "gitlink.org.cn/cloudream/common/pkgs/logger" pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm" @@ -40,30 +41,45 @@ func (s *NormalJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) } func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { - norJob := jo.Body.(*job.NormalJob) + //norJob := jo.Body.(*job.NormalJob) + + var runtime *schsdk.JobRuntimeInfo + var jobFiles *jobmod.JobFiles + var targetCCID schsdk.CCID + + switch runningJob := jo.Body.(type) { + case *job.NormalJob: + runtime = &runningJob.Info.Runtime + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + case *job.InstanceJob: + runtime = &runningJob.Info.Runtime + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + } log := logger.WithType[NormalJobExecuting]("State").WithField("JobID", jo.JobID) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID) + pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), jobFiles.Image.ImageID, targetCCID) if err != nil { return fmt.Errorf("getting pcm image info: %w", err) } - ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID) + ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), targetCCID) if err != nil { return fmt.Errorf("getting computing center info: %w", err) } // TODO 需要添加DATA_IN、DATA_OUT等环境变量,这些数据从Job的信息中来获取 - ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), norJob.TargetCCID) + ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), targetCCID) if err != nil { return fmt.Errorf("getting computing center resource: %w", err) } if len(ress) == 0 { - return fmt.Errorf("no resource found at computing center %v", norJob.TargetCCID) + return fmt.Errorf("no resource found at computing center %v", targetCCID) } wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask( @@ -71,8 +87,8 @@ func (s *NormalJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) e pcmImgInfo.PCMImageID, // TODO 选择资源的算法 ress[0].PCMResourceID, - norJob.Info.Runtime.Command, - norJob.Info.Runtime.Envs, + runtime.Command, + runtime.Envs, )) defer wt.Close() diff --git a/manager/internal/jobmgr/job/state/multiInstance_init.go b/manager/internal/jobmgr/job/state/multiInstance_init.go new file mode 100644 index 0000000..99c8785 --- /dev/null +++ b/manager/internal/jobmgr/job/state/multiInstance_init.go @@ -0,0 +1,53 @@ +package state + +import ( + "context" + "gitlink.org.cn/cloudream/common/pkgs/logger" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" + "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" +) + +type MultiInstanceInit struct { +} + +func (s *MultiInstanceInit) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { + s.do(rtx, job) +} + +func (s *MultiInstanceInit) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + multInstJob := jo.Body.(*job.MultiInstanceJob) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + event.WaitType[event.Cancel](ctx, rtx.EventSet) + cancel() + }() + + instJobInfo := &schsdk.InstanceJobInfo{ + LocalJobID: multInstJob.Info.LocalJobID, + Files: multInstJob.Info.Files, + Runtime: multInstJob.Info.Runtime, + Resources: multInstJob.Info.Resources, + } + + files := jobmod.JobFiles{ + Dataset: multInstJob.Files.Dataset, + Code: multInstJob.Files.Code, + Image: multInstJob.Files.Image, + } + + // 创建实例并运行 + instanceJob := job.NewInstanceJob(*instJobInfo, files) + jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreSchuduling(multInstJob.PreScheduler)) + logger.Info("Init instance success, jobID: " + jobID) + + // 在多实例任务中新增这个实例的任务ID + multInstJob.SubJobs = append(multInstJob.SubJobs, jobID) + + rtx.Mgr.ChangeState(jo, NewMultiInstanceRunning()) +} diff --git a/manager/internal/jobmgr/job/state/running.go b/manager/internal/jobmgr/job/state/multiInstance_running.go similarity index 62% rename from manager/internal/jobmgr/job/state/running.go rename to manager/internal/jobmgr/job/state/multiInstance_running.go index fa829ea..5786cac 100644 --- a/manager/internal/jobmgr/job/state/running.go +++ b/manager/internal/jobmgr/job/state/multiInstance_running.go @@ -2,28 +2,39 @@ package state import ( "context" + "gitlink.org.cn/cloudream/common/pkgs/logger" "gitlink.org.cn/cloudream/common/pkgs/prescheduler" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" ) -type CreateInstance struct { +type MultiInstanceRunning struct { preScheduler prescheduler.PreScheduler } -func NewCreateInstance(preScheduler prescheduler.PreScheduler) *CreateInstance { - return &CreateInstance{ - preScheduler: preScheduler, - } +func (s *MultiInstanceRunning) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump { + //TODO implement me + panic("implement me") } -func (s *CreateInstance) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { +//func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInstanceRunning { +// return &MultiInstanceRunning{ +// preScheduler: preScheduler, +// } +//} + +func NewMultiInstanceRunning() *MultiInstanceRunning { + return &MultiInstanceRunning{} +} + +func (s *MultiInstanceRunning) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { s.do(rtx, job) } -func (s *CreateInstance) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { +func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -42,7 +53,7 @@ func (s *CreateInstance) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { } // 构建InstanceJobInfo - files := schsdk.JobFilesInfo{ + infoFiles := schsdk.JobFilesInfo{ Dataset: ic.LocalPath, Code: multInstJob.Info.Files.Code, Image: multInstJob.Info.Files.Image, @@ -50,11 +61,16 @@ func (s *CreateInstance) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { instJobInfo := &schsdk.InstanceJobInfo{ LocalJobID: multInstJob.Info.LocalJobID, - Files: files, + Files: infoFiles, Runtime: multInstJob.Info.Runtime, Resources: multInstJob.Info.Resources, } + files := jobmod.JobFiles{ + Code: multInstJob.Files.Code, + Image: multInstJob.Files.Image, + } + // 生成预调度方案和文件上传方案 jobSchedule, filesUploadScheme, err := s.preScheduler.ScheduleJob(instJobInfo) if err != nil { @@ -63,8 +79,9 @@ func (s *CreateInstance) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { } // 创建实例并运行 - instanceJob := job.NewInstanceJob(*instJobInfo) + instanceJob := job.NewInstanceJob(*instJobInfo, files) jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreSchuduling(*jobSchedule)) + logger.Info("Create instance success, jobID: " + jobID) // 在多实例任务中新增这个实例的任务ID multInstJob.SubJobs = append(multInstJob.SubJobs, jobID) diff --git a/manager/internal/jobmgr/job/state/prescheduling.go b/manager/internal/jobmgr/job/state/prescheduling.go index 3617e79..efb246c 100644 --- a/manager/internal/jobmgr/job/state/prescheduling.go +++ b/manager/internal/jobmgr/job/state/prescheduling.go @@ -30,7 +30,24 @@ func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling { } func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { - norJob := jo.Body.(*job.NormalJob) + var jobFilesInfo schsdk.JobFilesInfo + var jobFiles *jobmod.JobFiles + var targetCCID schsdk.CCID + + switch runningJob := jo.Body.(type) { + case *job.NormalJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + case *job.MultiInstanceJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + case *job.InstanceJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + targetCCID = runningJob.TargetCCID + } ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -55,7 +72,7 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { go func() { defer wg.Done() - e1 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Dataset, &norJob.Files.Dataset, &s.scheme.Dataset) + e1 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset) if e1 != nil { cancel() } @@ -63,7 +80,7 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { go func() { defer wg.Done() - e2 = s.doPackageScheduling(ctx, rtx, norJob, norJob.Info.Files.Code, &norJob.Files.Code, &s.scheme.Code) + e2 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code) if e2 != nil { cancel() } @@ -71,7 +88,7 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { go func() { defer wg.Done() - e3 = s.doImageScheduling(ctx, rtx, norJob, norJob.Info.Files.Image, &norJob.Files.Image, &s.scheme.Image) + e3 = s.doImageScheduling(ctx, rtx, targetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image) if e3 != nil { cancel() } @@ -91,7 +108,7 @@ func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobm } } -func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { +func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { @@ -153,7 +170,7 @@ func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobS return nil } -func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, norJob *job.NormalJob, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { +func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { @@ -240,7 +257,7 @@ func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobSta } // TODO 镜像名称 - err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, norJob.TargetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now()) + err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now()) if err != nil { return fmt.Errorf("creating image info: %w", err) } diff --git a/manager/internal/jobmgr/job/state/ready_to_adjust.go b/manager/internal/jobmgr/job/state/ready_to_adjust.go index 9522ab1..d2914a1 100644 --- a/manager/internal/jobmgr/job/state/ready_to_adjust.go +++ b/manager/internal/jobmgr/job/state/ready_to_adjust.go @@ -28,7 +28,17 @@ func (s *ReadyToAdjust) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { } func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { - norJob := jo.Body.(*job.NormalJob) + var jobFilesInfo schsdk.JobFilesInfo + var jobFiles *jobmod.JobFiles + + switch runningJob := jo.Body.(type) { + case *job.NormalJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + case *job.InstanceJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + } ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -39,7 +49,7 @@ func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error cancel() }() - if rt, ok := norJob.Info.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok { + if rt, ok := jobFilesInfo.Dataset.(*schsdk.DataReturnJobFileInfo); ok { evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool { return val.Job.GetInfo().GetLocalJobID() == rt.DataReturnLocalJobID }) @@ -54,7 +64,7 @@ func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error return fmt.Errorf("job %s is not a DataReturn job(which is %T)", evt.Job.JobID, evt.Job) } - norJob.Files.Dataset.PackageID = rtJob.DataReturnPackageID + jobFiles.Dataset.PackageID = rtJob.DataReturnPackageID } return nil diff --git a/manager/internal/mq/job.go b/manager/internal/mq/job.go index 766c8f5..bd45610 100644 --- a/manager/internal/mq/job.go +++ b/manager/internal/mq/job.go @@ -47,9 +47,11 @@ func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetRe }) case *schsdk.MultiInstanceJobInfo: - job := job.NewMultiInstanceJob(*info) - preSch, ok := msg.PreScheduleScheme.JobSchemes[info.LocalJobID] + logger.Info(">>>localJobID: " + info.LocalJobID) + + job := job.NewMultiInstanceJob(*info, preSch) + if !ok { return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("pre schedule scheme for job %s is not found", info.LocalJobID)) } From eaff609f94d5902ed48bc56d48f20c6a73969c29 Mon Sep 17 00:00:00 2001 From: JeshuaRen <270813223@qq.com> Date: Thu, 9 May 2024 09:04:30 +0800 Subject: [PATCH 04/10] =?UTF-8?q?=E5=A4=9A=E5=AE=9E=E4=BE=8B=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1=E4=BB=A3=E7=A0=81=E6=B5=8B=E8=AF=95=E5=90=8E=E8=B0=83?= =?UTF-8?q?=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- advisor/internal/scheduler/scheduler.go | 30 +- client/internal/http/job.go | 6 +- client/internal/services/job.go | 4 +- client/internal/services/service.go | 2 +- client/main.go | 2 +- common/models/job/body.go | 6 +- common/models/job/state.go | 18 + common/pkgs/mq/manager/job.go | 10 +- common/pkgs/prescheduler/calc_score.go | 337 ++++++++++++ .../pkgs/prescheduler/default_prescheduler.go | 517 ++++++++++++++++++ .../prescheduler/default_prescheduler_test.go | 117 ++++ common/pkgs/prescheduler/prescheduler.go | 11 + manager/internal/jobmgr/event.go | 62 --- manager/internal/jobmgr/event/cancel.go | 3 + .../internal/jobmgr/event/instance_create.go | 3 + .../internal/jobmgr/event/job_completed.go | 3 + .../jobmgr/event/local_file_uploaded.go | 3 + manager/internal/jobmgr/event_set.go | 16 +- .../internal/jobmgr/job/state/adjusting.go | 17 +- .../internal/jobmgr/job/state/executing.go | 4 +- .../jobmgr/job/state/making_adjust_scheme.go | 2 +- .../jobmgr/job/state/multiInstance_init.go | 21 +- .../jobmgr/job/state/multiInstance_running.go | 32 +- .../jobmgr/job/state/prescheduling.go | 20 +- .../jobmgr/job/state/ready_to_adjust.go | 4 +- .../jobmgr/job/state/wait_target_complete.go | 4 +- manager/internal/jobmgr/jobmgr.go | 36 +- manager/internal/mq/job.go | 4 +- 28 files changed, 1160 insertions(+), 134 deletions(-) create mode 100644 common/pkgs/prescheduler/calc_score.go create mode 100644 common/pkgs/prescheduler/default_prescheduler.go create mode 100644 common/pkgs/prescheduler/default_prescheduler_test.go create mode 100644 common/pkgs/prescheduler/prescheduler.go delete mode 100644 manager/internal/jobmgr/event.go diff --git a/advisor/internal/scheduler/scheduler.go b/advisor/internal/scheduler/scheduler.go index 4685ebe..313a71e 100644 --- a/advisor/internal/scheduler/scheduler.go +++ b/advisor/internal/scheduler/scheduler.go @@ -130,9 +130,23 @@ func NewDefaultSchedule() *DefaultScheduler { } func (s *DefaultScheduler) Schedule(dump *jobmod.JobDump) (*jobmod.JobScheduleScheme, error) { - norJob, ok := dump.Body.(*jobmod.NormalJobDump) - if !ok { - return nil, fmt.Errorf("only normal job can be scheduled, but got %T", dump.Body) + //norJob, ok := dump.Body.(*jobmod.NormalJobDump) + + var jobResourceInfo schsdk.JobResourcesInfo + var jobFiles *jobmod.JobFiles + var targetCCID schsdk.CCID + + switch jobDump := dump.Body.(type) { + case *jobmod.NormalJobDump: + normalJobInfo := dump.Info.(*schsdk.NormalJobInfo) + jobResourceInfo = normalJobInfo.Resources + jobFiles = &jobDump.Files + targetCCID = jobDump.TargetCCID + case *jobmod.InstanceJobDump: + instanceJobInfo := dump.Info.(*schsdk.InstanceJobInfo) + jobResourceInfo = instanceJobInfo.Resources + jobFiles = &jobDump.Files + targetCCID = jobDump.TargetCCID } mgrCli, err := schglb.ManagerMQPool.Acquire() @@ -156,17 +170,17 @@ func (s *DefaultScheduler) Schedule(dump *jobmod.JobDump) (*jobmod.JobScheduleSc for _, cc := range allCC.ComputingCenters { allCCs[cc.CCID] = &candidate{ CC: cc, - IsPreScheduled: cc.CCID == norJob.TargetCCID, + IsPreScheduled: cc.CCID == targetCCID, } } // 计算 - err = s.calcFileScore(norJob.Files, allCCs) + err = s.calcFileScore(*jobFiles, allCCs) if err != nil { return nil, err } - err = s.calcResourceScore(dump.Info.(*schsdk.NormalJobInfo), allCCs) + err = s.calcResourceScore(jobResourceInfo, allCCs) if err != nil { return nil, err } @@ -209,9 +223,9 @@ func (s *DefaultScheduler) makeSchemeForNode(targetCC *candidate) jobmod.JobSche return scheme } -func (s *DefaultScheduler) calcResourceScore(info *schsdk.NormalJobInfo, allCCs map[schsdk.CCID]*candidate) error { +func (s *DefaultScheduler) calcResourceScore(jobResource schsdk.JobResourcesInfo, allCCs map[schsdk.CCID]*candidate) error { for _, cc := range allCCs { - res, err := s.calcOneResourceScore(info.Resources, &cc.CC) + res, err := s.calcOneResourceScore(jobResource, &cc.CC) if err != nil { return err } diff --git a/client/internal/http/job.go b/client/internal/http/job.go index b429c44..489f9cc 100644 --- a/client/internal/http/job.go +++ b/client/internal/http/job.go @@ -20,8 +20,8 @@ type CreateInstanceResp struct { } type CreateInstanceReq struct { - LocalJobID string `json:"localJobID" binding:"required"` - LocalPath schsdk.JobFileInfo `json:"localPath" binding:"required"` + JobID schsdk.JobID `json:"jobID" binding:"required"` + LocalPath schsdk.JobFileInfo `json:"localPath" binding:"required"` } func (s *Server) JobSvc() *JobService { @@ -54,7 +54,7 @@ func (s *JobService) CreateInstance(ctx *gin.Context) { return } - jobID, filesUploadScheme, err := s.svc.JobSetSvc().CreateInstance(req.LocalJobID, req.LocalPath) + jobID, filesUploadScheme, err := s.svc.JobSetSvc().CreateInstance(req.JobID, req.LocalPath) if err != nil { log.Warnf("create job instance: %s", err.Error()) ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "create job instance failed")) diff --git a/client/internal/services/job.go b/client/internal/services/job.go index 688ca58..c164b72 100644 --- a/client/internal/services/job.go +++ b/client/internal/services/job.go @@ -8,7 +8,7 @@ import ( ) // Create 创建多实例任务中的实例任务 -func (svc *JobSetService) CreateInstance(LocalJobID string, LocalPath schsdk.JobFileInfo) (schsdk.JobID, schsdk.JobFilesUploadScheme, error) { +func (svc *JobSetService) CreateInstance(jobID schsdk.JobID, LocalPath schsdk.JobFileInfo) (schsdk.JobID, schsdk.JobFilesUploadScheme, error) { scheme := new(schsdk.JobFilesUploadScheme) @@ -18,7 +18,7 @@ func (svc *JobSetService) CreateInstance(LocalJobID string, LocalPath schsdk.Job } defer schglb.ManagerMQPool.Release(mgrCli) - resp, err := mgrCli.CreateInstance(mgrmq.NewCreateInstance(LocalJobID, LocalPath)) + resp, err := mgrCli.CreateInstance(mgrmq.NewCreateInstance(jobID, LocalPath)) if err != nil { return "", *scheme, fmt.Errorf("submitting job set to manager: %w", err) } diff --git a/client/internal/services/service.go b/client/internal/services/service.go index 06a73dd..7eb124a 100644 --- a/client/internal/services/service.go +++ b/client/internal/services/service.go @@ -1,7 +1,7 @@ package services import ( - "gitlink.org.cn/cloudream/common/pkgs/prescheduler" + "gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler" ) type Service struct { diff --git a/client/main.go b/client/main.go index 7dfbdf6..34089fc 100644 --- a/client/main.go +++ b/client/main.go @@ -2,7 +2,7 @@ package main import ( "fmt" - "gitlink.org.cn/cloudream/common/pkgs/prescheduler" + "gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler" "os" _ "google.golang.org/grpc/balancer/grpclb" diff --git a/common/models/job/body.go b/common/models/job/body.go index 1d91981..9ac5a6a 100644 --- a/common/models/job/body.go +++ b/common/models/job/body.go @@ -18,6 +18,8 @@ type JobBodyDump interface { var _ = serder.UseTypeUnionExternallyTagged(types.Ref(types.NewTypeUnion[JobBodyDump]( (*NormalJobDump)(nil), (*DataReturnJobDump)(nil), + (*InstanceJobDump)(nil), + (*MultiInstanceJobDump)(nil), ))) type NormalJobDump struct { @@ -42,7 +44,7 @@ func (d *DataReturnJobDump) getType() JobBodyDumpType { } type InstanceJobDump struct { - serder.Metadata `union:"NormalJob"` + serder.Metadata `union:"InstanceJob"` Type JobBodyDumpType `json:"type"` TargetCCID schsdk.CCID `json:"targetCCID"` Files JobFiles `json:"files"` @@ -53,7 +55,7 @@ func (d *InstanceJobDump) getType() JobBodyDumpType { } type MultiInstanceJobDump struct { - serder.Metadata `union:"NormalJob"` + serder.Metadata `union:"MultiInstanceJob"` Type JobBodyDumpType `json:"type"` TargetCCID schsdk.CCID `json:"targetCCID"` Files JobFiles `json:"files"` diff --git a/common/models/job/state.go b/common/models/job/state.go index 68c827c..70f9038 100644 --- a/common/models/job/state.go +++ b/common/models/job/state.go @@ -49,6 +49,24 @@ func (dump *CompletedDump) getType() JobStateDumpType { return dump.Type } +type MultiInstCreateInitDump struct { + serder.Metadata `union:"MultiInstCreateInit"` + Type JobStateDumpType `json:"type"` +} + +func (dump *MultiInstCreateInitDump) getType() JobStateDumpType { + return dump.Type +} + +type MultiInstCreateRunningDump struct { + serder.Metadata `union:"MultiInstCreateRunning"` + Type JobStateDumpType `json:"type"` +} + +func (dump *MultiInstCreateRunningDump) getType() JobStateDumpType { + return dump.Type +} + // 普通任务执行中 type NormalJobExecutingDump struct { serder.Metadata `union:"NormalJobExecuting"` diff --git a/common/pkgs/mq/manager/job.go b/common/pkgs/mq/manager/job.go index d8394d6..dd60450 100644 --- a/common/pkgs/mq/manager/job.go +++ b/common/pkgs/mq/manager/job.go @@ -51,8 +51,8 @@ func (c *Client) SubmitJobSet(msg *SubmitJobSet, opts ...mq.RequestOption) (*Sub type CreateInstance struct { mq.MessageBodyBase - LocalJobID string - LocalPath schsdk.JobFileInfo + JobID schsdk.JobID + LocalPath schsdk.JobFileInfo } type CreateInstanceResp struct { @@ -61,10 +61,10 @@ type CreateInstanceResp struct { UploadScheme schsdk.JobFilesUploadScheme `json:"uploadScheme"` } -func NewCreateInstance(LocalJobID string, LocalPath schsdk.JobFileInfo) *CreateInstance { +func NewCreateInstance(jobID schsdk.JobID, LocalPath schsdk.JobFileInfo) *CreateInstance { return &CreateInstance{ - LocalJobID: LocalJobID, - LocalPath: LocalPath, + JobID: jobID, + LocalPath: LocalPath, } } diff --git a/common/pkgs/prescheduler/calc_score.go b/common/pkgs/prescheduler/calc_score.go new file mode 100644 index 0000000..68cc801 --- /dev/null +++ b/common/pkgs/prescheduler/calc_score.go @@ -0,0 +1,337 @@ +package prescheduler + +import ( + "fmt" + "github.com/inhies/go-bytesize" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" + uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops" + "gitlink.org.cn/cloudream/common/utils/math2" + schglb "gitlink.org.cn/cloudream/scheduler/common/globals" + schmod "gitlink.org.cn/cloudream/scheduler/common/models" + "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector" + mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" +) + +func (s *DefaultPreScheduler) calcResourceScore(jobResource schsdk.JobResourcesInfo, allCCs map[schsdk.CCID]*candidate) error { + for _, cc := range allCCs { + res, err := s.calcOneResourceScore(jobResource, &cc.CC) + if err != nil { + return err + } + + cc.Resource = *res + } + + return nil +} + +// 划分节点资源等级,并计算资源得分 +func (s *DefaultPreScheduler) calcOneResourceScore(requires schsdk.JobResourcesInfo, cc *schmod.ComputingCenter) (*resourcesDetail, error) { + colCli, err := schglb.CollectorMQPool.Acquire() + if err != nil { + return nil, fmt.Errorf("new collector client: %w", err) + } + defer schglb.CollectorMQPool.Release(colCli) + + getResDataResp, err := colCli.GetAllResourceData(collector.NewGetAllResourceData(cc.UOPSlwNodeID)) + if err != nil { + return nil, err + } + + var resDetail resourcesDetail + + //计算资源得分 + totalScore := 0.0 + maxLevel := 0 + resKinds := 0 + + if requires.CPU > 0 { + res := findResuorce[*uopsdk.CPUResourceData](getResDataResp.Datas) + if res == nil { + resDetail.CPU.Level = ResourceLevel3 + resDetail.CPU.Score = 0 + } else { + resDetail.CPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.CPU) + resDetail.CPU.Score = (float64(res.Available.Value) / requires.CPU) * CpuResourceWeight + } + + maxLevel = math2.Max(maxLevel, resDetail.CPU.Level) + totalScore += resDetail.CPU.Score + resKinds++ + } + + if requires.GPU > 0 { + res := findResuorce[*uopsdk.GPUResourceData](getResDataResp.Datas) + if res == nil { + resDetail.GPU.Level = ResourceLevel3 + resDetail.GPU.Score = 0 + } else { + resDetail.GPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.GPU) + resDetail.GPU.Score = (float64(res.Available.Value) / requires.GPU) * CpuResourceWeight + } + + maxLevel = math2.Max(maxLevel, resDetail.GPU.Level) + totalScore += resDetail.GPU.Score + resKinds++ + } + + if requires.NPU > 0 { + res := findResuorce[*uopsdk.NPUResourceData](getResDataResp.Datas) + if res == nil { + resDetail.NPU.Level = ResourceLevel3 + resDetail.NPU.Score = 0 + } else { + resDetail.NPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.NPU) + resDetail.NPU.Score = (float64(res.Available.Value) / requires.NPU) * CpuResourceWeight + } + + maxLevel = math2.Max(maxLevel, resDetail.NPU.Level) + totalScore += resDetail.NPU.Score + resKinds++ + } + + if requires.MLU > 0 { + res := findResuorce[*uopsdk.MLUResourceData](getResDataResp.Datas) + if res == nil { + resDetail.MLU.Level = ResourceLevel3 + resDetail.MLU.Score = 0 + } else { + resDetail.MLU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.MLU) + resDetail.MLU.Score = (float64(res.Available.Value) / requires.MLU) * CpuResourceWeight + } + + maxLevel = math2.Max(maxLevel, resDetail.MLU.Level) + totalScore += resDetail.MLU.Score + resKinds++ + } + + if requires.Storage > 0 { + res := findResuorce[*uopsdk.StorageResourceData](getResDataResp.Datas) + if res == nil { + resDetail.Storage.Level = ResourceLevel3 + resDetail.Storage.Score = 0 + } else { + bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit)) + if err != nil { + return nil, err + } + + resDetail.Storage.Level = s.calcResourceLevel(float64(bytes), float64(requires.Storage)) + resDetail.Storage.Score = (float64(bytes) / float64(requires.Storage)) * StgResourceWeight + } + + maxLevel = math2.Max(maxLevel, resDetail.Storage.Level) + totalScore += resDetail.Storage.Score + resKinds++ + } + + if requires.Memory > 0 { + res := findResuorce[*uopsdk.MemoryResourceData](getResDataResp.Datas) + if res == nil { + resDetail.Memory.Level = ResourceLevel3 + resDetail.Memory.Score = 0 + } else { + bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit)) + if err != nil { + return nil, err + } + + resDetail.Memory.Level = s.calcResourceLevel(float64(bytes), float64(requires.Memory)) + resDetail.Memory.Score = (float64(bytes) / float64(requires.Memory)) * StgResourceWeight + } + + maxLevel = math2.Max(maxLevel, resDetail.Memory.Level) + totalScore += resDetail.Memory.Score + resKinds++ + } + + if resKinds == 0 { + return &resDetail, nil + } + + resDetail.TotalScore = totalScore + resDetail.AvgScore = resDetail.AvgScore / float64(resKinds) + resDetail.MaxLevel = maxLevel + + return &resDetail, nil +} + +func (s *DefaultPreScheduler) calcResourceLevel(avai float64, need float64) int { + if avai >= 1.5*need { + return ResourceLevel1 + } + + if avai >= need { + return ResourceLevel2 + } + + return ResourceLevel3 +} + +// 计算节点得分情况 +func (s *DefaultPreScheduler) calcFileScore(files schsdk.JobFilesInfo, allCCs map[schsdk.CCID]*candidate) error { + // 只计算运控返回的可用计算中心上的存储服务的数据权重 + cdsNodeToCC := make(map[cdssdk.NodeID]*candidate) + for _, cc := range allCCs { + cdsNodeToCC[cc.CC.CDSNodeID] = cc + } + + //计算code相关得分 + if pkgFile, ok := files.Code.(*schsdk.PackageJobFileInfo); ok { + codeFileScores, err := s.calcPackageFileScore(pkgFile.PackageID, cdsNodeToCC) + if err != nil { + return fmt.Errorf("calc code file score: %w", err) + } + for id, score := range codeFileScores { + allCCs[id].Files.Code = *score + } + } + + //计算dataset相关得分 + if pkgFile, ok := files.Dataset.(*schsdk.PackageJobFileInfo); ok { + datasetFileScores, err := s.calcPackageFileScore(pkgFile.PackageID, cdsNodeToCC) + if err != nil { + return fmt.Errorf("calc dataset file score: %w", err) + } + for id, score := range datasetFileScores { + allCCs[id].Files.Dataset = *score + } + } + + //计算image相关得分 + if imgFile, ok := files.Image.(*schsdk.ImageJobFileInfo); ok { + //计算image相关得分 + imageFileScores, err := s.calcImageFileScore(imgFile.ImageID, allCCs, cdsNodeToCC) + if err != nil { + return fmt.Errorf("calc image file score: %w", err) + } + for id, score := range imageFileScores { + allCCs[id].Files.Image = *score + } + } + + for _, cc := range allCCs { + cc.Files.TotalScore = cc.Files.Code.CachingScore + + cc.Files.Code.LoadingScore + + cc.Files.Dataset.CachingScore + + cc.Files.Dataset.LoadingScore + + cc.Files.Image.CachingScore + + cc.Files.Image.LoadingScore + } + + return nil +} + +// 计算package在各节点的得分情况 +func (s *DefaultPreScheduler) calcPackageFileScore(packageID cdssdk.PackageID, cdsNodeToCC map[cdssdk.NodeID]*candidate) (map[schsdk.CCID]*fileDetail, error) { + colCli, err := schglb.CollectorMQPool.Acquire() + if err != nil { + return nil, fmt.Errorf("new collector client: %w", err) + } + defer schglb.CollectorMQPool.Release(colCli) + + ccFileScores := make(map[schsdk.CCID]*fileDetail) + + // TODO UserID + cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, packageID)) + if err != nil { + return nil, err + } + + for _, cdsNodeCacheInfo := range cachedResp.NodeInfos { + cc, ok := cdsNodeToCC[cdsNodeCacheInfo.NodeID] + if !ok { + continue + } + + ccFileScores[cc.CC.CCID] = &fileDetail{ + //TODO 根据缓存方式不同,可能会有不同的计算方式 + CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight, + } + } + + // TODO UserID + loadedResp, err := colCli.PackageGetLoadedStgNodes(collector.NewPackageGetLoadedStgNodes(1, packageID)) + if err != nil { + return nil, err + } + + for _, cdsNodeID := range loadedResp.StgNodeIDs { + cc, ok := cdsNodeToCC[cdsNodeID] + if !ok { + continue + } + + sfc, ok := ccFileScores[cc.CC.CCID] + if !ok { + sfc = &fileDetail{} + ccFileScores[cc.CC.CCID] = sfc + } + + sfc.LoadingScore = 1 * LoadedWeight + sfc.IsLoaded = true + } + + return ccFileScores, nil +} + +// 计算package在各节点的得分情况 +func (s *DefaultPreScheduler) calcImageFileScore(imageID schsdk.ImageID, allCCs map[schsdk.CCID]*candidate, cdsNodeToCC map[cdssdk.NodeID]*candidate) (map[schsdk.CCID]*fileDetail, error) { + colCli, err := schglb.CollectorMQPool.Acquire() + if err != nil { + return nil, fmt.Errorf("new collector client: %w", err) + } + defer schglb.CollectorMQPool.Release(colCli) + + magCli, err := schglb.ManagerMQPool.Acquire() + if err != nil { + return nil, fmt.Errorf("new manager client: %w", err) + } + defer schglb.ManagerMQPool.Release(magCli) + + imageInfoResp, err := magCli.GetImageInfo(mgrmq.NewGetImageInfo(imageID)) + if err != nil { + return nil, fmt.Errorf("getting image info: %w", err) + } + + ccFileScores := make(map[schsdk.CCID]*fileDetail) + + if imageInfoResp.Image.CDSPackageID != nil { + cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, *imageInfoResp.Image.CDSPackageID)) + if err != nil { + return nil, err + } + + for _, cdsNodeCacheInfo := range cachedResp.NodeInfos { + cc, ok := cdsNodeToCC[cdsNodeCacheInfo.NodeID] + if !ok { + continue + } + + ccFileScores[cc.CC.CCID] = &fileDetail{ + //TODO 根据缓存方式不同,可能会有不同的计算方式 + CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight, + } + } + } + + // 镜像的LoadingScore是判断是否导入到算力中心 + for _, pcmImg := range imageInfoResp.PCMImages { + _, ok := allCCs[pcmImg.CCID] + if !ok { + continue + } + + fsc, ok := ccFileScores[pcmImg.CCID] + if !ok { + fsc = &fileDetail{} + ccFileScores[pcmImg.CCID] = fsc + } + + fsc.LoadingScore = 1 * LoadedWeight + fsc.IsLoaded = true + } + + return ccFileScores, nil +} diff --git a/common/pkgs/prescheduler/default_prescheduler.go b/common/pkgs/prescheduler/default_prescheduler.go new file mode 100644 index 0000000..98f6e03 --- /dev/null +++ b/common/pkgs/prescheduler/default_prescheduler.go @@ -0,0 +1,517 @@ +package prescheduler + +import ( + "fmt" + "sort" + + "github.com/samber/lo" + + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops" + schglb "gitlink.org.cn/cloudream/scheduler/common/globals" + schmod "gitlink.org.cn/cloudream/scheduler/common/models" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager" +) + +const ( + //每个节点划分的资源等级: + // ResourceLevel1:表示所有资源类型均满足 大于等于1.5倍 + ResourceLevel1 = 1 + // ResourceLevel2:表示不满足Level1,但所有资源类型均满足 大于等于1倍 + ResourceLevel2 = 2 + // ResourceLevel3: 表示某些资源类型 小于一倍 + ResourceLevel3 = 3 + + CpuResourceWeight float64 = 1 + StgResourceWeight float64 = 1.2 + + CachingWeight float64 = 1 + LoadedWeight float64 = 2 +) + +var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait") + +type candidate struct { + CC schmod.ComputingCenter + IsReferencedJobTarget bool // 这个节点是否是所依赖的任务所选择的节点 + Resource resourcesDetail + Files filesDetail +} + +type resourcesDetail struct { + CPU resourceDetail + GPU resourceDetail + NPU resourceDetail + MLU resourceDetail + Storage resourceDetail + Memory resourceDetail + + TotalScore float64 + AvgScore float64 + MaxLevel int +} +type resourceDetail struct { + Level int + Score float64 +} + +type filesDetail struct { + Dataset fileDetail + Code fileDetail + Image fileDetail + + TotalScore float64 +} +type fileDetail struct { + CachingScore float64 + LoadingScore float64 + IsLoaded bool //表示storage是否已经调度到该节点, image表示镜像是否已经加载到该算力中心 +} + +type schedulingJob struct { + Job schsdk.JobInfo + Afters []string +} + +type CandidateArr []*candidate + +func (a CandidateArr) Len() int { return len(a) } +func (a CandidateArr) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a CandidateArr) Less(i, j int) bool { + n1 := a[i] + n2 := a[j] + + // 优先与所依赖的任务放到一起,但要求那个节点的资源足够 + if n1.IsReferencedJobTarget && n1.Resource.MaxLevel < ResourceLevel3 { + return true + } + if n2.IsReferencedJobTarget && n2.Resource.MaxLevel < ResourceLevel3 { + return true + } + + // 优先判断资源等级,资源等级越低,代表越满足需求 + if n1.Resource.MaxLevel < n2.Resource.MaxLevel { + return true + } + if n1.Resource.MaxLevel > n2.Resource.MaxLevel { + return false + } + + // 等级相同时,根据单项分值比较 + switch n1.Resource.MaxLevel { + case ResourceLevel1: + // 数据文件总分越高,代表此节点上拥有的数据文件越完整,则越优先考虑 + return n1.Files.TotalScore > n2.Files.TotalScore + + case ResourceLevel2: + // 资源分的平均值越高,代表资源越空余,则越优先考虑 + return n1.Resource.AvgScore > n2.Resource.AvgScore + + case ResourceLevel3: + // 资源分的平均值越高,代表资源越空余,则越优先考虑 + return n1.Resource.AvgScore > n2.Resource.AvgScore + } + + return false +} + +type DefaultPreScheduler struct { +} + +func NewDefaultPreScheduler() *DefaultPreScheduler { + return &DefaultPreScheduler{} +} + +// ScheduleJobSet 任务集预调度 +func (s *DefaultPreScheduler) ScheduleJobSet(info *schsdk.JobSetInfo) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) { + jobSetScheme := &jobmod.JobSetPreScheduleScheme{ + JobSchemes: make(map[string]jobmod.JobScheduleScheme), + } + filesUploadSchemes := make(map[string]schsdk.LocalFileUploadScheme) + + mgrCli, err := schglb.ManagerMQPool.Acquire() + if err != nil { + return nil, nil, fmt.Errorf("new collector client: %w", err) + } + defer schglb.ManagerMQPool.Release(mgrCli) + + // 查询有哪些算力中心可用 + + allCC, err := mgrCli.GetAllComputingCenter(mgrmq.NewGetAllComputingCenter()) + if err != nil { + return nil, nil, fmt.Errorf("getting all computing center info: %w", err) + } + + ccs := make(map[schsdk.CCID]schmod.ComputingCenter) + for _, node := range allCC.ComputingCenters { + ccs[node.CCID] = node + } + + if len(ccs) == 0 { + return nil, nil, ErrNoAvailableScheme + } + + // 先根据任务配置,收集它们依赖的任务的LocalID + var schJobs []*schedulingJob + for _, job := range info.Jobs { + j := &schedulingJob{ + Job: job, + } + + if norJob, ok := job.(*schsdk.NormalJobInfo); ok { + if resFile, ok := norJob.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok { + j.Afters = append(j.Afters, resFile.DataReturnLocalJobID) + } + + if resFile, ok := norJob.Files.Code.(*schsdk.DataReturnJobFileInfo); ok { + j.Afters = append(j.Afters, resFile.DataReturnLocalJobID) + } + } else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok { + j.Afters = append(j.Afters, resJob.TargetLocalJobID) + } + + schJobs = append(schJobs, j) + } + + // 然后根据依赖进行排序 + schJobs, ok := s.orderByAfters(schJobs) + if !ok { + return nil, nil, fmt.Errorf("circular reference detected between jobs in the job set") + } + + // 经过排序后,按顺序生成调度方案 + for _, job := range schJobs { + if norJob, ok := job.Job.(*schsdk.NormalJobInfo); ok { + scheme, err := s.scheduleForNormalOrMultiJob(info, job, ccs, jobSetScheme.JobSchemes) + if err != nil { + return nil, nil, err + } + + jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme + + // 检查数据文件的配置项,生成上传文件方案 + s.fillNormarlJobLocalUploadScheme(norJob.Files, scheme.TargetCCID, filesUploadSchemes, ccs) + } + + if mulJob, ok := job.Job.(*schsdk.MultiInstanceJobInfo); ok { + scheme, err := s.scheduleForNormalOrMultiJob(info, job, ccs, jobSetScheme.JobSchemes) + if err != nil { + return nil, nil, err + } + + jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme + + // 检查数据文件的配置项,生成上传文件方案 + s.fillNormarlJobLocalUploadScheme(mulJob.Files, scheme.TargetCCID, filesUploadSchemes, ccs) + } + + // 回源任务目前不需要生成调度方案 + } + + return jobSetScheme, &schsdk.JobSetFilesUploadScheme{ + LocalFileSchemes: lo.Values(filesUploadSchemes), + }, nil +} + +// ScheduleJob 单个任务预调度 +func (s *DefaultPreScheduler) ScheduleJob(instJobInfo *schsdk.InstanceJobInfo) (*jobmod.JobScheduleScheme, *schsdk.JobFilesUploadScheme, error) { + filesUploadSchemes := make(map[string]schsdk.LocalFileUploadScheme) + + mgrCli, err := schglb.ManagerMQPool.Acquire() + if err != nil { + return nil, nil, fmt.Errorf("new collector client: %w", err) + } + defer schglb.ManagerMQPool.Release(mgrCli) + + // 查询有哪些算力中心可用 + + allCC, err := mgrCli.GetAllComputingCenter(mgrmq.NewGetAllComputingCenter()) + if err != nil { + return nil, nil, fmt.Errorf("getting all computing center info: %w", err) + } + + ccs := make(map[schsdk.CCID]schmod.ComputingCenter) + for _, node := range allCC.ComputingCenters { + ccs[node.CCID] = node + } + + if len(ccs) == 0 { + return nil, nil, ErrNoAvailableScheme + } + + info := &schsdk.NormalJobInfo{ + Files: instJobInfo.Files, + Runtime: instJobInfo.Runtime, + Resources: instJobInfo.Resources, + } + + job := &schedulingJob{ + Job: info, + } + scheme, err := s.scheduleForSingleJob(job, ccs) + if err != nil { + return nil, nil, err + } + + // 检查数据文件的配置项,生成上传文件方案 + s.fillNormarlJobLocalUploadScheme(info.Files, scheme.TargetCCID, filesUploadSchemes, ccs) + + return scheme, &schsdk.JobFilesUploadScheme{ + LocalFileSchemes: lo.Values(filesUploadSchemes), + }, nil +} + +func (s *DefaultPreScheduler) orderByAfters(jobs []*schedulingJob) ([]*schedulingJob, bool) { + type jobOrder struct { + Job *schedulingJob + Afters []string + } + + var jobOrders []*jobOrder + for _, job := range jobs { + od := &jobOrder{ + Job: job, + Afters: make([]string, len(job.Afters)), + } + + copy(od.Afters, job.Afters) + + jobOrders = append(jobOrders, od) + } + + // 然后排序 + var orderedJob []*schedulingJob + for { + rm := 0 + for i, jo := range jobOrders { + // 找到没有依赖的任务,然后将其取出 + if len(jo.Afters) == 0 { + orderedJob = append(orderedJob, jo.Job) + + // 删除其他任务对它的引用 + for _, job2 := range jobOrders { + job2.Afters = lo.Reject(job2.Afters, func(item string, idx int) bool { return item == jo.Job.Job.GetLocalJobID() }) + } + + rm++ + continue + } + + jobOrders[i-rm] = jobOrders[i] + } + + jobOrders = jobOrders[:len(jobOrders)-rm] + if len(jobOrders) == 0 { + break + } + + // 遍历一轮后没有找到无依赖的任务,那么就是存在循环引用,排序失败 + if rm == 0 { + return nil, false + } + } + + return orderedJob, true +} + +func (s *DefaultPreScheduler) scheduleForNormalOrMultiJob(jobSet *schsdk.JobSetInfo, job *schedulingJob, ccs map[schsdk.CCID]schmod.ComputingCenter, jobSchemes map[string]jobmod.JobScheduleScheme) (*jobmod.JobScheduleScheme, error) { + allCCs := make(map[schsdk.CCID]*candidate) + + // 初始化备选节点信息 + for _, cc := range ccs { + caNode := &candidate{ + CC: cc, + } + + // 检查此节点是否是它所引用的任务所选的节点 + for _, af := range job.Afters { + resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af) + if resJob == nil { + return nil, fmt.Errorf("resource job %s not found in the job set", af) + } + + // 由于jobs已经按照引用排序,所以正常情况下这里肯定能取到值 + scheme, ok := jobSchemes[resJob.TargetLocalJobID] + if !ok { + continue + } + + if scheme.TargetCCID == cc.CCID { + caNode.IsReferencedJobTarget = true + break + } + } + + allCCs[cc.CCID] = caNode + } + + //norJob := job.Job.(*schsdk.NormalJobInfo) + + var jobFiles *schsdk.JobFilesInfo + var jobResource *schsdk.JobResourcesInfo + + switch runningJob := job.Job.(type) { + case *schsdk.NormalJobInfo: + jobFiles = &runningJob.Files + jobResource = &runningJob.Resources + case *schsdk.MultiInstanceJobInfo: + jobFiles = &runningJob.Files + jobResource = &runningJob.Resources + } + + // 计算文件占有量得分 + err := s.calcFileScore(*jobFiles, allCCs) + if err != nil { + return nil, err + } + + // 计算资源余量得分 + err = s.calcResourceScore(*jobResource, allCCs) + if err != nil { + return nil, err + } + + allCCsArr := lo.Values(allCCs) + sort.Sort(CandidateArr(allCCsArr)) + + targetNode := allCCsArr[0] + if targetNode.Resource.MaxLevel == ResourceLevel3 { + return nil, ErrNoAvailableScheme + } + + scheme := s.makeSchemeForNode(jobFiles, targetNode) + return &scheme, nil +} + +func (s *DefaultPreScheduler) scheduleForSingleJob(job *schedulingJob, ccs map[schsdk.CCID]schmod.ComputingCenter) (*jobmod.JobScheduleScheme, error) { + allCCs := make(map[schsdk.CCID]*candidate) + + // 初始化备选节点信息 + for _, cc := range ccs { + caNode := &candidate{ + CC: cc, + } + + allCCs[cc.CCID] = caNode + } + + //norJob := job.Job.(*schsdk.NormalJobInfo) + + var jobFiles *schsdk.JobFilesInfo + var jobResource *schsdk.JobResourcesInfo + + switch runningJob := job.Job.(type) { + case *schsdk.NormalJobInfo: + jobFiles = &runningJob.Files + jobResource = &runningJob.Resources + case *schsdk.MultiInstanceJobInfo: + jobFiles = &runningJob.Files + jobResource = &runningJob.Resources + } + + // 计算文件占有量得分 + err := s.calcFileScore(*jobFiles, allCCs) + if err != nil { + return nil, err + } + + // 计算资源余量得分 + err = s.calcResourceScore(*jobResource, allCCs) + if err != nil { + return nil, err + } + + allCCsArr := lo.Values(allCCs) + sort.Sort(CandidateArr(allCCsArr)) + + targetNode := allCCsArr[0] + if targetNode.Resource.MaxLevel == ResourceLevel3 { + return nil, ErrNoAvailableScheme + } + + scheme := s.makeSchemeForNode(jobFiles, targetNode) + return &scheme, nil +} + +func (s *DefaultPreScheduler) fillNormarlJobLocalUploadScheme(files schsdk.JobFilesInfo, targetCCID schsdk.CCID, schemes map[string]schsdk.LocalFileUploadScheme, ccs map[schsdk.CCID]schmod.ComputingCenter) { + if localFile, ok := files.Dataset.(*schsdk.LocalJobFileInfo); ok { + if _, ok := schemes[localFile.LocalPath]; !ok { + cdsNodeID := ccs[targetCCID].CDSNodeID + schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{ + LocalPath: localFile.LocalPath, + UploadToCDSNodeID: &cdsNodeID, + } + } + } + + if localFile, ok := files.Code.(*schsdk.LocalJobFileInfo); ok { + if _, ok := schemes[localFile.LocalPath]; !ok { + cdsNodeID := ccs[targetCCID].CDSNodeID + schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{ + LocalPath: localFile.LocalPath, + UploadToCDSNodeID: &cdsNodeID, + } + } + } + + if localFile, ok := files.Image.(*schsdk.LocalJobFileInfo); ok { + if _, ok := schemes[localFile.LocalPath]; !ok { + cdsNodeID := ccs[targetCCID].CDSNodeID + schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{ + LocalPath: localFile.LocalPath, + UploadToCDSNodeID: &cdsNodeID, + } + } + } +} + +func (s *DefaultPreScheduler) makeSchemeForNode(jobFiles *schsdk.JobFilesInfo, targetCC *candidate) jobmod.JobScheduleScheme { + scheme := jobmod.JobScheduleScheme{ + TargetCCID: targetCC.CC.CCID, + } + + // TODO 根据实际情况选择Move或者Load + + if _, ok := jobFiles.Dataset.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Dataset.IsLoaded { + scheme.Dataset.Action = jobmod.ActionLoad + } else { + scheme.Dataset.Action = jobmod.ActionNo + } + + if _, ok := jobFiles.Code.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Code.IsLoaded { + scheme.Code.Action = jobmod.ActionLoad + } else { + scheme.Code.Action = jobmod.ActionNo + } + + if _, ok := jobFiles.Image.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Image.IsLoaded { + scheme.Image.Action = jobmod.ActionImportImage + } else { + scheme.Image.Action = jobmod.ActionNo + } + + return scheme +} + +func findResuorce[T uopsdk.ResourceData](all []uopsdk.ResourceData) T { + for _, data := range all { + if ret, ok := data.(T); ok { + return ret + } + } + + var def T + return def +} + +func findJobInfo[T schsdk.JobInfo](jobs []schsdk.JobInfo, localJobID string) T { + for _, job := range jobs { + if ret, ok := job.(T); ok && job.GetLocalJobID() == localJobID { + return ret + } + } + + var def T + return def +} diff --git a/common/pkgs/prescheduler/default_prescheduler_test.go b/common/pkgs/prescheduler/default_prescheduler_test.go new file mode 100644 index 0000000..e10ae82 --- /dev/null +++ b/common/pkgs/prescheduler/default_prescheduler_test.go @@ -0,0 +1,117 @@ +package prescheduler + +import ( + "testing" + + "github.com/samber/lo" + . "github.com/smartystreets/goconvey/convey" + + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" +) + +func TestOrderByAfters(t *testing.T) { + cases := []struct { + title string + jobs []*schedulingJob + wants []string + }{ + { + title: "所有Job都有直接或间接的依赖关系", + jobs: []*schedulingJob{ + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, + Afters: []string{"2"}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, + Afters: []string{}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "3"}}, + Afters: []string{"1"}, + }, + }, + wants: []string{"2", "1", "3"}, + }, + + { + title: "部分Job之间无依赖关系", + jobs: []*schedulingJob{ + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, + Afters: []string{"2"}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, + Afters: []string{}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "3"}}, + Afters: []string{"1"}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "4"}}, + Afters: []string{"5"}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "5"}}, + Afters: []string{}, + }, + }, + wants: []string{"2", "5", "1", "3", "4"}, + }, + + { + title: "存在循环依赖", + jobs: []*schedulingJob{ + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, + Afters: []string{"2"}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, + Afters: []string{"1"}, + }, + }, + wants: nil, + }, + + { + title: "完全不依赖", + jobs: []*schedulingJob{ + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "1"}}, + Afters: []string{}, + }, + + { + Job: &schsdk.NormalJobInfo{JobInfoBase: schsdk.JobInfoBase{LocalJobID: "2"}}, + Afters: []string{}, + }, + }, + wants: []string{"1", "2"}, + }, + } + + sch := NewDefaultPreScheduler() + for _, c := range cases { + Convey(c.title, t, func() { + ordered, ok := sch.orderByAfters(c.jobs) + if c.wants == nil { + So(ok, ShouldBeFalse) + } else { + So(ok, ShouldBeTrue) + + ids := lo.Map(ordered, func(item *schedulingJob, idx int) string { return item.Job.GetLocalJobID() }) + So(ids, ShouldResemble, c.wants) + } + }) + } +} diff --git a/common/pkgs/prescheduler/prescheduler.go b/common/pkgs/prescheduler/prescheduler.go new file mode 100644 index 0000000..7aa3f18 --- /dev/null +++ b/common/pkgs/prescheduler/prescheduler.go @@ -0,0 +1,11 @@ +package prescheduler + +import ( + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" + jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" +) + +type PreScheduler interface { + ScheduleJobSet(info *schsdk.JobSetInfo) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) + ScheduleJob(info *schsdk.InstanceJobInfo) (*jobmod.JobScheduleScheme, *schsdk.JobFilesUploadScheme, error) +} diff --git a/manager/internal/jobmgr/event.go b/manager/internal/jobmgr/event.go deleted file mode 100644 index 1b9f6d2..0000000 --- a/manager/internal/jobmgr/event.go +++ /dev/null @@ -1,62 +0,0 @@ -package jobmgr - -import ( - "errors" - - schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" -) - -// 不是所关注的Task上报的进度 -var ErrUnconcernedTask = errors.New("unconcerned task") - -var ErrTaskTimeout = errors.New("task timeout") - -var ErrJobCancelled = errors.New("job cancelled") - -type Event interface{} - -type BroadcastType string - -const ( - BroadcastAll BroadcastType = "All" - BroadcastJobSet BroadcastType = "JobSet" - BroadcastJob BroadcastType = "Job" -) - -type Broadcast struct { - Type BroadcastType - JobSetID schsdk.JobSetID - JobID schsdk.JobID -} - -func (b *Broadcast) ToAll() bool { - return b.Type == BroadcastAll -} - -func (b *Broadcast) ToJobSet() bool { - return b.Type == BroadcastJobSet -} - -func (b *Broadcast) ToJob() bool { - return b.Type == BroadcastJob -} - -func ToAll() Broadcast { - return Broadcast{ - Type: BroadcastAll, - } -} - -func ToJobSet(jobSetID schsdk.JobSetID) Broadcast { - return Broadcast{ - Type: BroadcastJobSet, - JobSetID: jobSetID, - } -} - -func ToJob(jobID schsdk.JobID) Broadcast { - return Broadcast{ - Type: BroadcastJob, - JobID: jobID, - } -} diff --git a/manager/internal/jobmgr/event/cancel.go b/manager/internal/jobmgr/event/cancel.go index 1eede5b..7341a7b 100644 --- a/manager/internal/jobmgr/event/cancel.go +++ b/manager/internal/jobmgr/event/cancel.go @@ -2,3 +2,6 @@ package event type Cancel struct { } + +func (s *Cancel) Noop() { +} diff --git a/manager/internal/jobmgr/event/instance_create.go b/manager/internal/jobmgr/event/instance_create.go index 4a2e59e..8242b91 100644 --- a/manager/internal/jobmgr/event/instance_create.go +++ b/manager/internal/jobmgr/event/instance_create.go @@ -23,3 +23,6 @@ func NewInstanceCreate(LocalPath schsdk.JobFileInfo, future CreateInstanceFuture Result: future, } } + +func (s *InstanceCreate) Noop() { +} diff --git a/manager/internal/jobmgr/event/job_completed.go b/manager/internal/jobmgr/event/job_completed.go index 2452134..e9231ad 100644 --- a/manager/internal/jobmgr/event/job_completed.go +++ b/manager/internal/jobmgr/event/job_completed.go @@ -16,3 +16,6 @@ func NewJobCompleted(job *jobmgr.Job, err error) *JobCompleted { Err: err, } } + +func (s *JobCompleted) Noop() { +} diff --git a/manager/internal/jobmgr/event/local_file_uploaded.go b/manager/internal/jobmgr/event/local_file_uploaded.go index 9b81ad3..6ecebb8 100644 --- a/manager/internal/jobmgr/event/local_file_uploaded.go +++ b/manager/internal/jobmgr/event/local_file_uploaded.go @@ -18,3 +18,6 @@ func NewLocalFileUploaded(localPath string, err error, packageID cdssdk.PackageI PackageID: packageID, } } + +func (s *LocalFileUploaded) Noop() { +} diff --git a/manager/internal/jobmgr/event_set.go b/manager/internal/jobmgr/event_set.go index a26ec55..1792365 100644 --- a/manager/internal/jobmgr/event_set.go +++ b/manager/internal/jobmgr/event_set.go @@ -2,6 +2,7 @@ package jobmgr import ( "context" + "errors" "sync" "gitlink.org.cn/cloudream/common/pkgs/future" @@ -10,6 +11,12 @@ import ( type EventWaitCondition func(evt Event) bool +var ErrJobCancelled = errors.New("job cancelled") + +type Event interface { + Noop() +} + type EventWaiter struct { condition EventWaitCondition future *future.SetValueFuture[Event] @@ -52,12 +59,13 @@ func (s *EventSet) Post(evt Event) { func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) { s.lock.Lock() - defer s.lock.Unlock() + //defer s.lock.Unlock() // 一个等待者只能等待一个事件 for i, evt := range s.events { if cond(evt) { s.events = lo2.RemoveAt(s.events, i) + s.lock.Unlock() return evt, true } } @@ -67,9 +75,13 @@ func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bo condition: cond, future: fut, } - s.events = append(s.events, waiter) + //s.events = append(s.events, waiter) + s.waiters = append(s.waiters, waiter) + + s.lock.Unlock() val, err := fut.WaitValue(ctx) + if err != nil { return nil, false } diff --git a/manager/internal/jobmgr/job/state/adjusting.go b/manager/internal/jobmgr/job/state/adjusting.go index 19d1ac1..8ff8462 100644 --- a/manager/internal/jobmgr/job/state/adjusting.go +++ b/manager/internal/jobmgr/job/state/adjusting.go @@ -49,21 +49,20 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { //norJob := jo.Body.(*job.NormalJob) var jobFilesInfo schsdk.JobFilesInfo var jobFiles *jobmod.JobFiles - var targetCCID schsdk.CCID switch runningJob := jo.Body.(type) { case *job.NormalJob: jobFilesInfo = runningJob.Info.Files jobFiles = &runningJob.Files - targetCCID = runningJob.TargetCCID + runningJob.TargetCCID = s.scheme.TargetCCID case *job.MultiInstanceJob: jobFilesInfo = runningJob.Info.Files jobFiles = &runningJob.Files - targetCCID = runningJob.TargetCCID + runningJob.TargetCCID = s.scheme.TargetCCID case *job.InstanceJob: jobFilesInfo = runningJob.Info.Files jobFiles = &runningJob.Files - targetCCID = runningJob.TargetCCID + runningJob.TargetCCID = s.scheme.TargetCCID } ctx, cancel := context.WithCancel(context.Background()) @@ -71,7 +70,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { // 监听取消事件 go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() @@ -121,19 +120,21 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { go func() { defer wg.Done() - e3 = s.doImageScheduling(ctx, rtx, targetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image) + e3 = s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image) if e3 != nil { cancel() } }() + wg.Wait() + return errors.Join(e1, e2, e3) } func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: - evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool { return e.LocalPath == info.LocalPath }) if !ok { @@ -195,7 +196,7 @@ func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobState func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: - evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool { return e.LocalPath == info.LocalPath }) if !ok { diff --git a/manager/internal/jobmgr/job/state/executing.go b/manager/internal/jobmgr/job/state/executing.go index f77a02c..218dca4 100644 --- a/manager/internal/jobmgr/job/state/executing.go +++ b/manager/internal/jobmgr/job/state/executing.go @@ -145,7 +145,7 @@ func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Jo // 监听取消事件 go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() @@ -154,6 +154,8 @@ func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Jo return fmt.Errorf("getting computing center info: %w", err) } + logger.Infof("submited computer center name: %s, id: %s", ccInfo.Name, ccInfo.CCID) + wt := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage( 1, // TOOD 用户ID ccInfo.CDSStorageID, diff --git a/manager/internal/jobmgr/job/state/making_adjust_scheme.go b/manager/internal/jobmgr/job/state/making_adjust_scheme.go index 19ffada..c857feb 100644 --- a/manager/internal/jobmgr/job/state/making_adjust_scheme.go +++ b/manager/internal/jobmgr/job/state/making_adjust_scheme.go @@ -32,7 +32,7 @@ func (s *MakingAdjustScheme) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) ( // 监听取消事件 go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() diff --git a/manager/internal/jobmgr/job/state/multiInstance_init.go b/manager/internal/jobmgr/job/state/multiInstance_init.go index 99c8785..7bdee80 100644 --- a/manager/internal/jobmgr/job/state/multiInstance_init.go +++ b/manager/internal/jobmgr/job/state/multiInstance_init.go @@ -5,6 +5,7 @@ import ( "gitlink.org.cn/cloudream/common/pkgs/logger" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" @@ -13,6 +14,10 @@ import ( type MultiInstanceInit struct { } +func NewMultiInstanceInit() *MultiInstanceInit { + return &MultiInstanceInit{} +} + func (s *MultiInstanceInit) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { s.do(rtx, job) } @@ -24,11 +29,12 @@ func (s *MultiInstanceInit) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { defer cancel() go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() instJobInfo := &schsdk.InstanceJobInfo{ + Type: schsdk.JobTypeInstance, LocalJobID: multInstJob.Info.LocalJobID, Files: multInstJob.Info.Files, Runtime: multInstJob.Info.Runtime, @@ -49,5 +55,16 @@ func (s *MultiInstanceInit) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { // 在多实例任务中新增这个实例的任务ID multInstJob.SubJobs = append(multInstJob.SubJobs, jobID) - rtx.Mgr.ChangeState(jo, NewMultiInstanceRunning()) + job := &jobmgr.Job{ + JobSetID: jo.JobSetID, + JobID: jo.JobID, + Body: multInstJob, + } + + rtx.Mgr.ChangeState(job, NewMultiInstanceRunning(prescheduler.NewDefaultPreScheduler())) + logger.Info("Create multiInstance job success, jobID: " + job.JobID) +} + +func (s *MultiInstanceInit) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump { + return &jobmod.MultiInstCreateInitDump{} } diff --git a/manager/internal/jobmgr/job/state/multiInstance_running.go b/manager/internal/jobmgr/job/state/multiInstance_running.go index 5786cac..dfad01e 100644 --- a/manager/internal/jobmgr/job/state/multiInstance_running.go +++ b/manager/internal/jobmgr/job/state/multiInstance_running.go @@ -3,9 +3,9 @@ package state import ( "context" "gitlink.org.cn/cloudream/common/pkgs/logger" - "gitlink.org.cn/cloudream/common/pkgs/prescheduler" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" + "gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" @@ -15,31 +15,28 @@ type MultiInstanceRunning struct { preScheduler prescheduler.PreScheduler } -func (s *MultiInstanceRunning) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump { - //TODO implement me - panic("implement me") +func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInstanceRunning { + return &MultiInstanceRunning{ + preScheduler: preScheduler, + } } -//func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInstanceRunning { -// return &MultiInstanceRunning{ -// preScheduler: preScheduler, -// } +//func NewMultiInstanceRunning() *MultiInstanceRunning { +// return &MultiInstanceRunning{} //} -func NewMultiInstanceRunning() *MultiInstanceRunning { - return &MultiInstanceRunning{} -} - func (s *MultiInstanceRunning) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { s.do(rtx, job) } func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + logger.Info("start run multiInstanceRunning, jobID: " + jo.JobID) + ctx, cancel := context.WithCancel(context.Background()) defer cancel() go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() @@ -47,10 +44,12 @@ func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) for { // 监听创建实例事件 - ic, ok := event.WaitType[event.InstanceCreate](ctx, rtx.EventSet) + ic, ok := event.WaitType[*event.InstanceCreate](ctx, rtx.EventSet) if !ok { + logger.Info("MultiInstanceRunning canceled") break } + logger.Info("wait a event happened") // 构建InstanceJobInfo infoFiles := schsdk.JobFilesInfo{ @@ -60,6 +59,7 @@ func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) } instJobInfo := &schsdk.InstanceJobInfo{ + Type: schsdk.JobTypeInstance, LocalJobID: multInstJob.Info.LocalJobID, Files: infoFiles, Runtime: multInstJob.Info.Runtime, @@ -94,3 +94,7 @@ func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) } } + +func (s *MultiInstanceRunning) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump { + return &jobmod.MultiInstCreateRunningDump{} +} diff --git a/manager/internal/jobmgr/job/state/prescheduling.go b/manager/internal/jobmgr/job/state/prescheduling.go index efb246c..647c834 100644 --- a/manager/internal/jobmgr/job/state/prescheduling.go +++ b/manager/internal/jobmgr/job/state/prescheduling.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "gitlink.org.cn/cloudream/common/pkgs/logger" "sync" "time" @@ -30,23 +31,24 @@ func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling { } func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { + logger.Info("start run preScheduling, jobID: " + jo.JobID) + var jobFilesInfo schsdk.JobFilesInfo var jobFiles *jobmod.JobFiles - var targetCCID schsdk.CCID switch runningJob := jo.Body.(type) { case *job.NormalJob: jobFilesInfo = runningJob.Info.Files jobFiles = &runningJob.Files - targetCCID = runningJob.TargetCCID + runningJob.TargetCCID = s.scheme.TargetCCID case *job.MultiInstanceJob: jobFilesInfo = runningJob.Info.Files jobFiles = &runningJob.Files - targetCCID = runningJob.TargetCCID + runningJob.TargetCCID = s.scheme.TargetCCID case *job.InstanceJob: jobFilesInfo = runningJob.Info.Files jobFiles = &runningJob.Files - targetCCID = runningJob.TargetCCID + runningJob.TargetCCID = s.scheme.TargetCCID } ctx, cancel := context.WithCancel(context.Background()) @@ -54,7 +56,7 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { // 监听取消事件 go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() @@ -88,12 +90,14 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { go func() { defer wg.Done() - e3 = s.doImageScheduling(ctx, rtx, targetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image) + e3 = s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image) if e3 != nil { cancel() } }() + wg.Wait() + allErr := errors.Join(e1, e2, e3) if allErr != nil { rtx.Mgr.ChangeState(jo, FailureComplete(err)) @@ -111,7 +115,7 @@ func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobm func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: - evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool { return e.LocalPath == info.LocalPath }) if !ok { @@ -173,7 +177,7 @@ func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobS func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error { switch info := fileInfo.(type) { case *schsdk.LocalJobFileInfo: - evt, ok := event.WaitTypeAnd[event.LocalFileUploaded](ctx, rtx.EventSet, func(e event.LocalFileUploaded) bool { + evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool { return e.LocalPath == info.LocalPath }) if !ok { diff --git a/manager/internal/jobmgr/job/state/ready_to_adjust.go b/manager/internal/jobmgr/job/state/ready_to_adjust.go index d2914a1..25b6dc9 100644 --- a/manager/internal/jobmgr/job/state/ready_to_adjust.go +++ b/manager/internal/jobmgr/job/state/ready_to_adjust.go @@ -45,12 +45,12 @@ func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error // 监听取消事件 go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() if rt, ok := jobFilesInfo.Dataset.(*schsdk.DataReturnJobFileInfo); ok { - evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool { + evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool { return val.Job.GetInfo().GetLocalJobID() == rt.DataReturnLocalJobID }) if !ok { diff --git a/manager/internal/jobmgr/job/state/wait_target_complete.go b/manager/internal/jobmgr/job/state/wait_target_complete.go index c0f4d5c..aeb826e 100644 --- a/manager/internal/jobmgr/job/state/wait_target_complete.go +++ b/manager/internal/jobmgr/job/state/wait_target_complete.go @@ -34,11 +34,11 @@ func (s *WaitTargetComplete) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) e // 监听取消事件 go func() { - event.WaitType[event.Cancel](ctx, rtx.EventSet) + event.WaitType[*event.Cancel](ctx, rtx.EventSet) cancel() }() - evt, ok := event.WaitTypeAnd[event.JobCompleted](ctx, rtx.EventSet, func(val event.JobCompleted) bool { + evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool { return val.Job.GetInfo().GetLocalJobID() == reJob.Info.TargetLocalJobID }) if !ok { diff --git a/manager/internal/jobmgr/jobmgr.go b/manager/internal/jobmgr/jobmgr.go index 2a9f810..0370119 100644 --- a/manager/internal/jobmgr/jobmgr.go +++ b/manager/internal/jobmgr/jobmgr.go @@ -158,8 +158,10 @@ func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { Body: subJob.Body, }, eventSet: NewEventSet(), + state: subJob.InitState, } jobSet.jobs[jobID] = job + m.jobs[jobID] = job // 更改作业的初始状态 //m.ChangeState(&job.job, subJob.InitState) @@ -167,7 +169,7 @@ func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { subJob.InitState.Run(JobStateRunContext{ Mgr: m, EventSet: &job.eventSet, - LastState: job.state, + LastState: nil, }, &job.job) }() } @@ -205,16 +207,34 @@ type PreSchedulerInstJob struct { } // AddJob 添加一个作业到指定的作业集。 -func (m *Manager) AddJob(jobSetID schsdk.JobSetID, jobBody JobBody, State JobState) schsdk.JobID { - jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+1)) +func (m *Manager) AddJob(jobSetID schsdk.JobSetID, jobBody JobBody, jobState JobState) schsdk.JobID { + m.pubLock.Lock() + defer m.pubLock.Unlock() - job := Job{ - JobSetID: jobSetID, - JobID: jobID, - Body: jobBody, + jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+1)) + m.jobIDIndex += 1 + + job := &mgrJob{ + job: Job{ + JobSetID: jobSetID, + JobID: jobID, + Body: jobBody, + }, + state: jobState, + eventSet: NewEventSet(), } - m.ChangeState(&job, State) + m.jobs[jobID] = job + jobSet := m.jobSets[jobSetID] + jobSet.jobs[jobID] = job + + go func() { + jobState.Run(JobStateRunContext{ + Mgr: m, + EventSet: &job.eventSet, + LastState: nil, + }, &job.job) + }() return jobID } diff --git a/manager/internal/mq/job.go b/manager/internal/mq/job.go index bd45610..38835a2 100644 --- a/manager/internal/mq/job.go +++ b/manager/internal/mq/job.go @@ -58,7 +58,7 @@ func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetRe jobs = append(jobs, jobmgr.SubmittingJob{ Body: job, - InitState: state.NewPreSchuduling(preSch), + InitState: state.NewMultiInstanceInit(), }) } @@ -71,7 +71,7 @@ func (svc *Service) CreateInstance(instInfo *mgrmq.CreateInstance) (*mgrmq.Creat logger.Debugf("start create instance") fut := future.NewSetValue[event.CreateInstanceResult]() - svc.jobMgr.PostEvent(schsdk.JobID(instInfo.LocalJobID), event.NewInstanceCreate(instInfo.LocalPath, fut)) + svc.jobMgr.PostEvent(instInfo.JobID, event.NewInstanceCreate(instInfo.LocalPath, fut)) result, err := fut.WaitValue(context.TODO()) From d22bd11e1805ef241738606b591d75494c8b26b4 Mon Sep 17 00:00:00 2001 From: JeshuaRen <270813223@qq.com> Date: Wed, 15 May 2024 10:07:58 +0800 Subject: [PATCH 05/10] 0515 --- common/assets/confs/advisor.config.json | 2 +- common/assets/confs/collector.config.json | 4 ++-- common/assets/confs/executor.config.json | 4 ++-- common/assets/confs/manager.config.json | 2 +- manager/internal/jobmgr/job/job_test.go | 14 ++++++++++++++ 5 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 manager/internal/jobmgr/job/job_test.go diff --git a/common/assets/confs/advisor.config.json b/common/assets/confs/advisor.config.json index e699e44..7489d0b 100644 --- a/common/assets/confs/advisor.config.json +++ b/common/assets/confs/advisor.config.json @@ -1,6 +1,6 @@ { "logger": { - "output": "file", + "output": "stdout", "outputFileName": "advisor", "outputDirectory": "log", "level": "debug" diff --git a/common/assets/confs/collector.config.json b/common/assets/confs/collector.config.json index e7594b5..fc0e5ef 100644 --- a/common/assets/confs/collector.config.json +++ b/common/assets/confs/collector.config.json @@ -1,6 +1,6 @@ { "logger": { - "output": "file", + "output": "stdout", "outputFileName": "collector", "outputDirectory": "log", "level": "debug" @@ -15,7 +15,7 @@ "url": "http://localhost:7890" }, "unifyOps": { - "url": "http://localhost:7892" + "url": "http://localhost:7891" }, "slwNodes": [ { diff --git a/common/assets/confs/executor.config.json b/common/assets/confs/executor.config.json index 1439ccc..83f9aa5 100644 --- a/common/assets/confs/executor.config.json +++ b/common/assets/confs/executor.config.json @@ -1,6 +1,6 @@ { "logger": { - "output": "file", + "output": "stdout", "outputFileName": "executor", "outputDirectory": "log", "level": "debug" @@ -15,7 +15,7 @@ "url": "http://localhost:7890" }, "pcm": { - "url": "http://localhost:7892" + "url": "http://localhost:7070" }, "reportIntervalSec": 10 } \ No newline at end of file diff --git a/common/assets/confs/manager.config.json b/common/assets/confs/manager.config.json index 6f619a4..aa59370 100644 --- a/common/assets/confs/manager.config.json +++ b/common/assets/confs/manager.config.json @@ -1,6 +1,6 @@ { "logger": { - "output": "file", + "output": "stdout", "outputFileName": "manager", "outputDirectory": "log", "level": "debug" diff --git a/manager/internal/jobmgr/job/job_test.go b/manager/internal/jobmgr/job/job_test.go new file mode 100644 index 0000000..fc02658 --- /dev/null +++ b/manager/internal/jobmgr/job/job_test.go @@ -0,0 +1,14 @@ +package job + +import ( + "fmt" + "testing" +) + +func TestFunc(t *testing.T) { + a := 1 + switch a { + case 1, 2: + fmt.Println("aaa11111") + } +} From 1fc424a494e088d8a643d9435e8809ceffdc7e27 Mon Sep 17 00:00:00 2001 From: JeshuaRen <270813223@qq.com> Date: Wed, 15 May 2024 16:10:06 +0800 Subject: [PATCH 06/10] =?UTF-8?q?1=E3=80=81=E5=A4=9A=E5=AE=9E=E4=BE=8B?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1LocalJobID=E6=94=B9=E6=88=90=E6=8B=BC?= =?UTF-8?q?=E6=8E=A5=E4=B8=80=E4=B8=AA=E9=9A=8F=E6=9C=BA=E5=80=BC=202?= =?UTF-8?q?=E3=80=81=E6=99=AE=E9=80=9A=E4=BB=BB=E5=8A=A1OutputFullPath?= =?UTF-8?q?=E8=B5=8B=E5=80=BC=203=E3=80=81=E5=85=B6=E4=BB=96=E4=BC=98?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- client/internal/http/job.go | 13 ++---- client/internal/http/server.go | 2 +- client/internal/services/job.go | 4 +- common/models/job/state.go | 10 ----- common/pkgs/mq/manager/job.go | 10 ++--- .../pkgs/prescheduler/default_prescheduler.go | 4 -- common/utils/utils.go | 15 +++++++ .../internal/jobmgr/event/instance_create.go | 10 ++--- manager/internal/jobmgr/event_set.go | 8 ---- manager/internal/jobmgr/job/job_test.go | 14 ------- .../internal/jobmgr/job/state/adjusting.go | 41 ++++++++++--------- .../jobmgr/job/state/multiInstance_init.go | 16 ++++---- .../jobmgr/job/state/multiInstance_running.go | 12 +++--- manager/internal/jobmgr/jobmgr.go | 14 +------ manager/internal/mq/job.go | 2 +- 15 files changed, 67 insertions(+), 108 deletions(-) delete mode 100644 manager/internal/jobmgr/job/job_test.go diff --git a/client/internal/http/job.go b/client/internal/http/job.go index 489f9cc..b4052f7 100644 --- a/client/internal/http/job.go +++ b/client/internal/http/job.go @@ -20,8 +20,8 @@ type CreateInstanceResp struct { } type CreateInstanceReq struct { - JobID schsdk.JobID `json:"jobID" binding:"required"` - LocalPath schsdk.JobFileInfo `json:"localPath" binding:"required"` + JobID schsdk.JobID `json:"jobID" binding:"required"` + DataSet schsdk.JobFileInfo `json:"localPath" binding:"required"` } func (s *Server) JobSvc() *JobService { @@ -33,13 +33,6 @@ func (s *Server) JobSvc() *JobService { func (s *JobService) CreateInstance(ctx *gin.Context) { log := logger.WithField("HTTP", "JobSet.HTTP") - //var req CreateInstanceReq - //if err := ctx.ShouldBindJSON(&req); err != nil { - // log.Warnf("binding body: %s", err.Error()) - // ctx.JSON(http.StatusBadRequest, Failed(errorcode.BadArgument, "missing argument or invalid argument")) - // return - //} - bodyData, err := io.ReadAll(ctx.Request.Body) if err != nil { log.Warnf("reading request body: %s", err.Error()) @@ -54,7 +47,7 @@ func (s *JobService) CreateInstance(ctx *gin.Context) { return } - jobID, filesUploadScheme, err := s.svc.JobSetSvc().CreateInstance(req.JobID, req.LocalPath) + jobID, filesUploadScheme, err := s.svc.JobSetSvc().CreateInstance(req.JobID, req.DataSet) if err != nil { log.Warnf("create job instance: %s", err.Error()) ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "create job instance failed")) diff --git a/client/internal/http/server.go b/client/internal/http/server.go index 38eba6a..932977c 100644 --- a/client/internal/http/server.go +++ b/client/internal/http/server.go @@ -39,7 +39,7 @@ func (s *Server) Serve() error { func (s *Server) initRouters() { s.engine.POST("/jobSet/submit", s.JobSetSvc().Submit) - s.engine.POST("/job/CreateInstance", s.JobSvc().CreateInstance) + s.engine.POST("/job/createInstance", s.JobSvc().CreateInstance) s.engine.POST("/jobSet/localFileUploaded", s.JobSetSvc().LocalFileUploaded) s.engine.GET("/jobSet/getServiceList", s.JobSetSvc().GetServiceList) } diff --git a/client/internal/services/job.go b/client/internal/services/job.go index c164b72..f670bc6 100644 --- a/client/internal/services/job.go +++ b/client/internal/services/job.go @@ -8,7 +8,7 @@ import ( ) // Create 创建多实例任务中的实例任务 -func (svc *JobSetService) CreateInstance(jobID schsdk.JobID, LocalPath schsdk.JobFileInfo) (schsdk.JobID, schsdk.JobFilesUploadScheme, error) { +func (svc *JobSetService) CreateInstance(jobID schsdk.JobID, dataSet schsdk.JobFileInfo) (schsdk.JobID, schsdk.JobFilesUploadScheme, error) { scheme := new(schsdk.JobFilesUploadScheme) @@ -18,7 +18,7 @@ func (svc *JobSetService) CreateInstance(jobID schsdk.JobID, LocalPath schsdk.Jo } defer schglb.ManagerMQPool.Release(mgrCli) - resp, err := mgrCli.CreateInstance(mgrmq.NewCreateInstance(jobID, LocalPath)) + resp, err := mgrCli.CreateInstance(mgrmq.NewCreateInstance(jobID, dataSet)) if err != nil { return "", *scheme, fmt.Errorf("submitting job set to manager: %w", err) } diff --git a/common/models/job/state.go b/common/models/job/state.go index 70f9038..a3b884a 100644 --- a/common/models/job/state.go +++ b/common/models/job/state.go @@ -27,7 +27,6 @@ var _ = serder.UseTypeUnionExternallyTagged(types.Ref(types.NewTypeUnion[JobStat (*WaitTargetCompleteDump)(nil), ))) -// 调整中 type AdjustingDump struct { serder.Metadata `union:"Adjusting"` Type JobStateDumpType `json:"type"` @@ -38,7 +37,6 @@ func (dump *AdjustingDump) getType() JobStateDumpType { return dump.Type } -// 任务结束 type CompletedDump struct { serder.Metadata `union:"Completed"` Type JobStateDumpType `json:"type"` @@ -67,7 +65,6 @@ func (dump *MultiInstCreateRunningDump) getType() JobStateDumpType { return dump.Type } -// 普通任务执行中 type NormalJobExecutingDump struct { serder.Metadata `union:"NormalJobExecuting"` Type JobStateDumpType `json:"type"` @@ -78,7 +75,6 @@ func (dump *NormalJobExecutingDump) getType() JobStateDumpType { return dump.Type } -// 回源任务执行中 type DataReturnExecutingDump struct { serder.Metadata `union:"DataReturnExecuting"` Type JobStateDumpType `json:"type"` @@ -88,7 +84,6 @@ func (dump *DataReturnExecutingDump) getType() JobStateDumpType { return dump.Type } -// 制作调整方案中 type MakeingAdjustSchemeDump struct { serder.Metadata `union:"MakeingAdjustScheme"` Type JobStateDumpType `json:"type"` @@ -98,7 +93,6 @@ func (dump *MakeingAdjustSchemeDump) getType() JobStateDumpType { return dump.Type } -// 预调度中 type PreSchedulingDump struct { serder.Metadata `union:"PreScheduling"` Type JobStateDumpType `json:"type"` @@ -109,7 +103,6 @@ func (dump *PreSchedulingDump) getType() JobStateDumpType { return dump.Type } -// 准备调整中 type ReadyToAdjustDump struct { serder.Metadata `union:"ReadyToAdjust"` Type JobStateDumpType `json:"type"` @@ -119,7 +112,6 @@ func (dump *ReadyToAdjustDump) getType() JobStateDumpType { return dump.Type } -// 普通任务准备执行中 type NormalJobReadyToExecuteDump struct { serder.Metadata `union:"NormalJobReadyToExecute"` Type JobStateDumpType `json:"type"` @@ -129,7 +121,6 @@ func (dump *NormalJobReadyToExecuteDump) getType() JobStateDumpType { return dump.Type } -// 回源任务准备执行中 type DataReturnReadyToExecuteDump struct { serder.Metadata `union:"DataReturnReadyToExecute"` Type JobStateDumpType `json:"type"` @@ -139,7 +130,6 @@ func (dump *DataReturnReadyToExecuteDump) getType() JobStateDumpType { return dump.Type } -// 等待回源目标完成中 type WaitTargetCompleteDump struct { serder.Metadata `union:"WaitTargetComplete"` Type JobStateDumpType `json:"type"` diff --git a/common/pkgs/mq/manager/job.go b/common/pkgs/mq/manager/job.go index dd60450..9a7ab29 100644 --- a/common/pkgs/mq/manager/job.go +++ b/common/pkgs/mq/manager/job.go @@ -51,8 +51,8 @@ func (c *Client) SubmitJobSet(msg *SubmitJobSet, opts ...mq.RequestOption) (*Sub type CreateInstance struct { mq.MessageBodyBase - JobID schsdk.JobID - LocalPath schsdk.JobFileInfo + JobID schsdk.JobID + DataSet schsdk.JobFileInfo } type CreateInstanceResp struct { @@ -61,10 +61,10 @@ type CreateInstanceResp struct { UploadScheme schsdk.JobFilesUploadScheme `json:"uploadScheme"` } -func NewCreateInstance(jobID schsdk.JobID, LocalPath schsdk.JobFileInfo) *CreateInstance { +func NewCreateInstance(jobID schsdk.JobID, dataSet schsdk.JobFileInfo) *CreateInstance { return &CreateInstance{ - JobID: jobID, - LocalPath: LocalPath, + JobID: jobID, + DataSet: dataSet, } } diff --git a/common/pkgs/prescheduler/default_prescheduler.go b/common/pkgs/prescheduler/default_prescheduler.go index 98f6e03..4a9471f 100644 --- a/common/pkgs/prescheduler/default_prescheduler.go +++ b/common/pkgs/prescheduler/default_prescheduler.go @@ -346,8 +346,6 @@ func (s *DefaultPreScheduler) scheduleForNormalOrMultiJob(jobSet *schsdk.JobSetI allCCs[cc.CCID] = caNode } - //norJob := job.Job.(*schsdk.NormalJobInfo) - var jobFiles *schsdk.JobFilesInfo var jobResource *schsdk.JobResourcesInfo @@ -396,8 +394,6 @@ func (s *DefaultPreScheduler) scheduleForSingleJob(job *schedulingJob, ccs map[s allCCs[cc.CCID] = caNode } - //norJob := job.Job.(*schsdk.NormalJobInfo) - var jobFiles *schsdk.JobFilesInfo var jobResource *schsdk.JobResourcesInfo diff --git a/common/utils/utils.go b/common/utils/utils.go index b302c83..bc4264a 100644 --- a/common/utils/utils.go +++ b/common/utils/utils.go @@ -1,7 +1,10 @@ package utils import ( + "crypto/sha256" + "encoding/hex" "fmt" + "math/rand" "path/filepath" "strconv" "time" @@ -17,3 +20,15 @@ func MakeJobOutputFullPath(stgDir string, userID cdssdk.UserID, jobID schsdk.Job func MakeResourcePackageName(jobID schsdk.JobID) string { return fmt.Sprintf("%s@%s", string(jobID), time.Now().Format("2006-01-02 15:04:05")) } + +func GenerateRandomID() string { + currentTime := time.Now().UnixNano() / int64(time.Millisecond) + rand.Seed(currentTime) + randomNum := rand.Intn(1000) // 0 到 999 之间的随机整数 + idBase := fmt.Sprintf("%d%03d", currentTime, randomNum) + hasher := sha256.New() + hasher.Write([]byte(idBase)) + hashBytes := hasher.Sum(nil) + hashedID := hex.EncodeToString(hashBytes) + return hashedID +} diff --git a/manager/internal/jobmgr/event/instance_create.go b/manager/internal/jobmgr/event/instance_create.go index 8242b91..c7e05e3 100644 --- a/manager/internal/jobmgr/event/instance_create.go +++ b/manager/internal/jobmgr/event/instance_create.go @@ -8,8 +8,8 @@ import ( type CreateInstanceFuture = *future.SetValueFuture[CreateInstanceResult] type InstanceCreate struct { - LocalPath schsdk.JobFileInfo - Result CreateInstanceFuture + DataSet schsdk.JobFileInfo + Result CreateInstanceFuture } type CreateInstanceResult struct { @@ -17,10 +17,10 @@ type CreateInstanceResult struct { FilesUploadScheme schsdk.JobFilesUploadScheme } -func NewInstanceCreate(LocalPath schsdk.JobFileInfo, future CreateInstanceFuture) *InstanceCreate { +func NewInstanceCreate(dataSet schsdk.JobFileInfo, future CreateInstanceFuture) *InstanceCreate { return &InstanceCreate{ - LocalPath: LocalPath, - Result: future, + DataSet: dataSet, + Result: future, } } diff --git a/manager/internal/jobmgr/event_set.go b/manager/internal/jobmgr/event_set.go index 1792365..346dcbe 100644 --- a/manager/internal/jobmgr/event_set.go +++ b/manager/internal/jobmgr/event_set.go @@ -32,11 +32,6 @@ func NewEventSet() EventSet { return EventSet{} } -// Post 函数用于向事件集合中发布一个事件。 -// 如果有等待该事件的协程,会唤醒它们并将事件传递给它们。 -// 参数: -// -// evt Event - 需要发布的事件对象。 func (s *EventSet) Post(evt Event) { s.lock.Lock() // 加锁保护事件集合 defer s.lock.Unlock() // 确保在函数结束时释放锁 @@ -59,9 +54,7 @@ func (s *EventSet) Post(evt Event) { func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) { s.lock.Lock() - //defer s.lock.Unlock() - // 一个等待者只能等待一个事件 for i, evt := range s.events { if cond(evt) { s.events = lo2.RemoveAt(s.events, i) @@ -75,7 +68,6 @@ func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bo condition: cond, future: fut, } - //s.events = append(s.events, waiter) s.waiters = append(s.waiters, waiter) s.lock.Unlock() diff --git a/manager/internal/jobmgr/job/job_test.go b/manager/internal/jobmgr/job/job_test.go deleted file mode 100644 index fc02658..0000000 --- a/manager/internal/jobmgr/job/job_test.go +++ /dev/null @@ -1,14 +0,0 @@ -package job - -import ( - "fmt" - "testing" -) - -func TestFunc(t *testing.T) { - a := 1 - switch a { - case 1, 2: - fmt.Println("aaa11111") - } -} diff --git a/manager/internal/jobmgr/job/state/adjusting.go b/manager/internal/jobmgr/job/state/adjusting.go index 8ff8462..5ee40d5 100644 --- a/manager/internal/jobmgr/job/state/adjusting.go +++ b/manager/internal/jobmgr/job/state/adjusting.go @@ -46,24 +46,6 @@ func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.J } func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { - //norJob := jo.Body.(*job.NormalJob) - var jobFilesInfo schsdk.JobFilesInfo - var jobFiles *jobmod.JobFiles - - switch runningJob := jo.Body.(type) { - case *job.NormalJob: - jobFilesInfo = runningJob.Info.Files - jobFiles = &runningJob.Files - runningJob.TargetCCID = s.scheme.TargetCCID - case *job.MultiInstanceJob: - jobFilesInfo = runningJob.Info.Files - jobFiles = &runningJob.Files - runningJob.TargetCCID = s.scheme.TargetCCID - case *job.InstanceJob: - jobFilesInfo = runningJob.Info.Files - jobFiles = &runningJob.Files - runningJob.TargetCCID = s.scheme.TargetCCID - } ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -94,8 +76,27 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { return fmt.Errorf("getting cds storage info: %w", err) } // TODO UserID - //norJob.OutputFullPath = utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID) - utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID) + outputFullPath := utils.MakeJobOutputFullPath(stgInfo.Directory, 1, jo.JobID) + + var jobFilesInfo schsdk.JobFilesInfo + var jobFiles *jobmod.JobFiles + + switch runningJob := jo.Body.(type) { + case *job.NormalJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + runningJob.TargetCCID = s.scheme.TargetCCID + runningJob.OutputFullPath = outputFullPath + case *job.MultiInstanceJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + runningJob.TargetCCID = s.scheme.TargetCCID + case *job.InstanceJob: + jobFilesInfo = runningJob.Info.Files + jobFiles = &runningJob.Files + runningJob.TargetCCID = s.scheme.TargetCCID + runningJob.OutputFullPath = outputFullPath + } wg := sync.WaitGroup{} wg.Add(3) diff --git a/manager/internal/jobmgr/job/state/multiInstance_init.go b/manager/internal/jobmgr/job/state/multiInstance_init.go index 7bdee80..3323665 100644 --- a/manager/internal/jobmgr/job/state/multiInstance_init.go +++ b/manager/internal/jobmgr/job/state/multiInstance_init.go @@ -2,10 +2,12 @@ package state import ( "context" + "fmt" "gitlink.org.cn/cloudream/common/pkgs/logger" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" "gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler" + "gitlink.org.cn/cloudream/scheduler/common/utils" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" @@ -33,9 +35,11 @@ func (s *MultiInstanceInit) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { cancel() }() + newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID()) + instJobInfo := &schsdk.InstanceJobInfo{ Type: schsdk.JobTypeInstance, - LocalJobID: multInstJob.Info.LocalJobID, + LocalJobID: newLocalJobID, Files: multInstJob.Info.Files, Runtime: multInstJob.Info.Runtime, Resources: multInstJob.Info.Resources, @@ -55,14 +59,8 @@ func (s *MultiInstanceInit) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { // 在多实例任务中新增这个实例的任务ID multInstJob.SubJobs = append(multInstJob.SubJobs, jobID) - job := &jobmgr.Job{ - JobSetID: jo.JobSetID, - JobID: jo.JobID, - Body: multInstJob, - } - - rtx.Mgr.ChangeState(job, NewMultiInstanceRunning(prescheduler.NewDefaultPreScheduler())) - logger.Info("Create multiInstance job success, jobID: " + job.JobID) + rtx.Mgr.ChangeState(jo, NewMultiInstanceRunning(prescheduler.NewDefaultPreScheduler())) + logger.Info("Create multiInstance job success, jobID: " + jo.JobID) } func (s *MultiInstanceInit) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump { diff --git a/manager/internal/jobmgr/job/state/multiInstance_running.go b/manager/internal/jobmgr/job/state/multiInstance_running.go index dfad01e..d21d635 100644 --- a/manager/internal/jobmgr/job/state/multiInstance_running.go +++ b/manager/internal/jobmgr/job/state/multiInstance_running.go @@ -2,10 +2,12 @@ package state import ( "context" + "fmt" "gitlink.org.cn/cloudream/common/pkgs/logger" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job" "gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler" + "gitlink.org.cn/cloudream/scheduler/common/utils" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event" "gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/job" @@ -21,10 +23,6 @@ func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInsta } } -//func NewMultiInstanceRunning() *MultiInstanceRunning { -// return &MultiInstanceRunning{} -//} - func (s *MultiInstanceRunning) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) { s.do(rtx, job) } @@ -53,14 +51,16 @@ func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) // 构建InstanceJobInfo infoFiles := schsdk.JobFilesInfo{ - Dataset: ic.LocalPath, + Dataset: ic.DataSet, Code: multInstJob.Info.Files.Code, Image: multInstJob.Info.Files.Image, } + newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID()) + instJobInfo := &schsdk.InstanceJobInfo{ Type: schsdk.JobTypeInstance, - LocalJobID: multInstJob.Info.LocalJobID, + LocalJobID: newLocalJobID, Files: infoFiles, Runtime: multInstJob.Info.Runtime, Resources: multInstJob.Info.Resources, diff --git a/manager/internal/jobmgr/jobmgr.go b/manager/internal/jobmgr/jobmgr.go index ca2259b..befb904 100644 --- a/manager/internal/jobmgr/jobmgr.go +++ b/manager/internal/jobmgr/jobmgr.go @@ -129,21 +129,17 @@ func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) { // 向某个任务集中的所有任务投递事件 func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) { - // 加锁以确保发布事件时的线程安全 m.pubLock.Lock() - defer m.pubLock.Unlock() // 确保函数退出时释放锁 + defer m.pubLock.Unlock() - // 尝试从管理器的作业集中获取指定的作业集 jobSet, ok := m.jobSets[jobSetID] if !ok { // 如果作业集不存在,则直接返回 return } - // 遍历作业集中的所有任务,并为每个任务发布事件 for _, mjob := range jobSet.jobs { go func(j *mgrJob) { - // 使用 goroutine 为每个任务发布事件,以异步方式处理,避免阻塞 j.eventSet.Post(evt) }(mjob) } @@ -156,21 +152,17 @@ type SubmittingJob struct { // 提交一个任务集 func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { - // 加锁以保护对作业集ID和作业ID索引的修改 m.pubLock.Lock() defer m.pubLock.Unlock() - // 生成一个新的作业集ID,并递增作业集ID索引 jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex)) m.jobSetIDIndex += 1 - // 创建一个新的作业集实例,并初始化其作业映射 jobSet := &mgrJobSet{ jobs: make(map[schsdk.JobID]*mgrJob), } m.jobSets[jobSetID] = jobSet - // 遍历提交的作业,为每个作业创建一个唯一的作业ID,初始化作业状态,并将其添加到作业集中 for i, subJob := range jobs { jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i)) job := &mgrJob{ @@ -185,8 +177,6 @@ func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { jobSet.jobs[jobID] = job m.jobs[jobID] = job - // 更改作业的初始状态 - //m.ChangeState(&job.job, subJob.InitState) go func() { subJob.InitState.Run(JobStateRunContext{ Mgr: m, @@ -195,10 +185,8 @@ func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID { }, &job.job) }() } - // 更新作业ID索引,基于提交的作业数量 m.jobIDIndex += len(jobs) - // 返回生成的作业集ID return jobSetID } diff --git a/manager/internal/mq/job.go b/manager/internal/mq/job.go index 38835a2..7435b08 100644 --- a/manager/internal/mq/job.go +++ b/manager/internal/mq/job.go @@ -71,7 +71,7 @@ func (svc *Service) CreateInstance(instInfo *mgrmq.CreateInstance) (*mgrmq.Creat logger.Debugf("start create instance") fut := future.NewSetValue[event.CreateInstanceResult]() - svc.jobMgr.PostEvent(instInfo.JobID, event.NewInstanceCreate(instInfo.LocalPath, fut)) + svc.jobMgr.PostEvent(instInfo.JobID, event.NewInstanceCreate(instInfo.DataSet, fut)) result, err := fut.WaitValue(context.TODO()) From e345da553519556080149ed5a98486022cc925c7 Mon Sep 17 00:00:00 2001 From: Sydonian <794346190@qq.com> Date: Fri, 17 May 2024 11:02:59 +0800 Subject: [PATCH 07/10] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- manager/internal/jobmgr/job/state/adjusting.go | 2 +- manager/internal/jobmgr/job/state/prescheduling.go | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/manager/internal/jobmgr/job/state/adjusting.go b/manager/internal/jobmgr/job/state/adjusting.go index 5ee40d5..2b1fb09 100644 --- a/manager/internal/jobmgr/job/state/adjusting.go +++ b/manager/internal/jobmgr/job/state/adjusting.go @@ -183,7 +183,7 @@ func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobState return fmt.Errorf("moving package: %w", err) } - moveStatus := status.(*exectsk.CacheMovePackageStatus) + moveStatus := status.(*exectsk.StorageLoadPackageStatus) if moveStatus.Error != "" { return fmt.Errorf("moving package: %s", moveStatus.Error) } diff --git a/manager/internal/jobmgr/job/state/prescheduling.go b/manager/internal/jobmgr/job/state/prescheduling.go index 647c834..6fcb142 100644 --- a/manager/internal/jobmgr/job/state/prescheduling.go +++ b/manager/internal/jobmgr/job/state/prescheduling.go @@ -4,10 +4,11 @@ import ( "context" "errors" "fmt" - "gitlink.org.cn/cloudream/common/pkgs/logger" "sync" "time" + "gitlink.org.cn/cloudream/common/pkgs/logger" + schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" schglb "gitlink.org.cn/cloudream/scheduler/common/globals" @@ -163,7 +164,7 @@ func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobS return fmt.Errorf("moving package: %w", err) } - moveStatus := status.(*exectsk.CacheMovePackageStatus) + moveStatus := status.(*exectsk.StorageLoadPackageStatus) if moveStatus.Error != "" { return fmt.Errorf("moving package: %s", moveStatus.Error) } From 10eace2fdc251e25bbe02b5ec8379b5417367cd6 Mon Sep 17 00:00:00 2001 From: Sydonian <794346190@qq.com> Date: Fri, 17 May 2024 11:11:38 +0800 Subject: [PATCH 08/10] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- manager/internal/jobmgr/job/state/prescheduling.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manager/internal/jobmgr/job/state/prescheduling.go b/manager/internal/jobmgr/job/state/prescheduling.go index 6fcb142..6b24a09 100644 --- a/manager/internal/jobmgr/job/state/prescheduling.go +++ b/manager/internal/jobmgr/job/state/prescheduling.go @@ -101,7 +101,7 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) { allErr := errors.Join(e1, e2, e3) if allErr != nil { - rtx.Mgr.ChangeState(jo, FailureComplete(err)) + rtx.Mgr.ChangeState(jo, FailureComplete(allErr)) } else { rtx.Mgr.ChangeState(jo, NewReadyToAdjust()) } From 48a11a4300055fc920d8a4a9cc383540d1f28725 Mon Sep 17 00:00:00 2001 From: Sydonian <794346190@qq.com> Date: Fri, 17 May 2024 11:39:45 +0800 Subject: [PATCH 09/10] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E9=81=97=E6=BC=8F?= =?UTF-8?q?=E7=9A=84=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- manager/internal/jobmgr/job/state/adjusting.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/manager/internal/jobmgr/job/state/adjusting.go b/manager/internal/jobmgr/job/state/adjusting.go index 2b1fb09..fd0fd0f 100644 --- a/manager/internal/jobmgr/job/state/adjusting.go +++ b/manager/internal/jobmgr/job/state/adjusting.go @@ -46,6 +46,7 @@ func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.J } func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { + userID := cdssdk.UserID(1) ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -70,6 +71,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { // 已经确定最终执行的目标计算中心,则可以生成结果输出路径了 stgInfo, err := stgCli.StorageGetInfo(cdssdk.StorageGetInfoReq{ + UserID: userID, StorageID: ccInfo.CDSStorageID, }) if err != nil { From 7bf04850d9e03e216806461122b7e1bf4997b9e9 Mon Sep 17 00:00:00 2001 From: Sydonian <794346190@qq.com> Date: Fri, 17 May 2024 15:15:28 +0800 Subject: [PATCH 10/10] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- manager/internal/mq/job.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/manager/internal/mq/job.go b/manager/internal/mq/job.go index 7435b08..9d876fc 100644 --- a/manager/internal/mq/job.go +++ b/manager/internal/mq/job.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "gitlink.org.cn/cloudream/common/pkgs/future" "gitlink.org.cn/cloudream/common/consts/errorcode" @@ -88,7 +89,12 @@ func (svc *Service) JobSetLocalFileUploaded(msg *mgrmq.JobSetLocalFileUploaded) WithField("PackageID", msg.PackageID). Debugf("local file uploaded") - svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, errors.New(msg.Error), msg.PackageID)) + var err error + if msg.Error != "" { + err = errors.New(msg.Error) + } + + svc.jobMgr.BroadcastEvent(msg.JobSetID, event.NewLocalFileUploaded(msg.LocalPath, err, msg.PackageID)) return mq.ReplyOK(mgrmq.NewJobSetLocalFileUploadedResp()) }