增加mq重试的参数

This commit is contained in:
JeshuaRen 2024-11-19 15:44:49 +08:00
parent f5cb48eede
commit 294b659f02
8 changed files with 44 additions and 23 deletions

View File

@ -15,7 +15,11 @@
"address": "localhost:5672", "address": "localhost:5672",
"account": "cloudream", "account": "cloudream",
"password": "123456", "password": "123456",
"vhost": "/" "vhost": "/",
"param": {
"retryNum": 5,
"retryInterval": 5000
}
}, },
"cloudreamStorage": { "cloudreamStorage": {
"url": "http://121.36.5.116:7890" "url": "http://121.36.5.116:7890"

View File

@ -15,7 +15,11 @@
"address": "localhost:5672", "address": "localhost:5672",
"account": "cloudream", "account": "cloudream",
"password": "123456", "password": "123456",
"vhost": "/" "vhost": "/",
"param": {
"retryNum": 5,
"retryInterval": 5000
}
}, },
"cloudreamStorage": { "cloudreamStorage": {
"url": "http://121.36.5.116:7890" "url": "http://121.36.5.116:7890"

View File

@ -15,7 +15,11 @@
"address": "localhost:5672", "address": "localhost:5672",
"account": "cloudream", "account": "cloudream",
"password": "123456", "password": "123456",
"vhost": "/" "vhost": "/",
"param": {
"retryNum": 5,
"retryInterval": 5000
}
}, },
"cloudreamStorage": { "cloudreamStorage": {
"url": "http://121.36.5.116:7890" "url": "http://121.36.5.116:7890"

View File

@ -13,7 +13,11 @@
"address": "101.201.215.196:5672", "address": "101.201.215.196:5672",
"account": "cloudream", "account": "cloudream",
"password": "123456", "password": "123456",
"vhost": "/" "vhost": "/",
"param": {
"retryNum": 5,
"retryInterval": 5000
}
}, },
"cloudreamStorage": { "cloudreamStorage": {
"url": "http://121.36.5.116:7890" "url": "http://121.36.5.116:7890"

View File

@ -15,7 +15,11 @@
"address": "localhost:5672", "address": "localhost:5672",
"account": "cloudream", "account": "cloudream",
"password": "123456", "password": "123456",
"vhost": "/" "vhost": "/",
"param": {
"retryNum": 5,
"retryInterval": 5000
}
}, },
"db": { "db": {
"address": "101.201.215.196:3306", "address": "101.201.215.196:3306",

View File

@ -3,7 +3,7 @@ package schglb
import ( import (
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm" pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" "gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops" uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
scmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq" scmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq"
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor" advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
@ -27,12 +27,12 @@ func InitMQPool(cfg *scmq.Config) {
ManagerMQPool = mgrmq.NewPool(cfg) ManagerMQPool = mgrmq.NewPool(cfg)
} }
var CloudreamStoragePool cdssdk.Pool var CloudreamStoragePool cdsapi.Pool
var CloudreamStorageConfig *cdssdk.Config var CloudreamStorageConfig *cdsapi.Config
func InitCloudreamStoragePool(cfg *cdssdk.Config) { func InitCloudreamStoragePool(cfg *cdsapi.Config) {
CloudreamStorageConfig = cfg CloudreamStorageConfig = cfg
CloudreamStoragePool = cdssdk.NewPool(cfg) CloudreamStoragePool = cdsapi.NewPool(cfg)
} }
var UnifyOpsPool uopsdk.Pool var UnifyOpsPool uopsdk.Pool

View File

@ -2,6 +2,7 @@ package executor
import ( import (
"gitlink.org.cn/cloudream/common/pkgs/mq" "gitlink.org.cn/cloudream/common/pkgs/mq"
"gitlink.org.cn/cloudream/common/utils/sync2"
mymq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq" mymq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq"
) )
@ -46,7 +47,7 @@ func (s *Server) Stop() {
s.rabbitSvr.Close() s.rabbitSvr.Close()
} }
func (s *Server) Serve() error { func (s *Server) Serve() *sync2.UnboundChannel[mq.RabbitMQLogEvent] {
return s.rabbitSvr.Start() return s.rabbitSvr.Start()
} }

View File

@ -6,7 +6,7 @@ import (
"github.com/samber/lo" "github.com/samber/lo"
"gitlink.org.cn/cloudream/common/pkgs/future" "gitlink.org.cn/cloudream/common/pkgs/future"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler" schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage" cdsapi "gitlink.org.cn/cloudream/common/sdks/storage"
schmod "gitlink.org.cn/cloudream/scheduler/common/models" schmod "gitlink.org.cn/cloudream/scheduler/common/models"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor" "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
"gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr" "gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr"
@ -51,7 +51,7 @@ func (s *JobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmo
func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
// TODO UserID // TODO UserID
userID := cdssdk.UserID(1) userID := cdsapi.UserID(1)
err := error(nil) err := error(nil)
switch runningJob := jo.Body.(type) { switch runningJob := jo.Body.(type) {
@ -133,20 +133,20 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
return err return err
} }
func getDataSetPathByID(packageID cdssdk.PackageID) string { func getDataSetPathByID(packageID cdsapi.PackageID) string {
// TODO 临时使用这个路径应该来自于CDS // TODO 临时使用这个路径应该来自于CDS
dataSetPath := filepath.Join("packages", "1", fmt.Sprintf("%v", packageID)) dataSetPath := filepath.Join("packages", "1", fmt.Sprintf("%v", packageID))
return dataSetPath return dataSetPath
} }
func loadDatasetPackage(userID cdssdk.UserID, packageID cdssdk.PackageID, storageID cdssdk.StorageID) (string, error) { func loadDatasetPackage(userID cdsapi.UserID, packageID cdsapi.PackageID, storageID cdsapi.StorageID) (string, error) {
stgCli, err := schglb.CloudreamStoragePool.Acquire() stgCli, err := schglb.CloudreamStoragePool.Acquire()
if err != nil { if err != nil {
return "", err return "", err
} }
defer schglb.CloudreamStoragePool.Release(stgCli) defer schglb.CloudreamStoragePool.Release(stgCli)
loadPackageResp, err := stgCli.StorageLoadPackage(cdssdk.StorageLoadPackageReq{ loadPackageResp, err := stgCli.StorageLoadPackage(cdsapi.StorageLoadPackageReq{
UserID: userID, UserID: userID,
PackageID: packageID, PackageID: packageID,
StorageID: storageID, StorageID: storageID,
@ -200,7 +200,7 @@ func (s *JobExecuting) submitNormalTask(rtx jobmgr.JobStateRunContext, cmd strin
} }
} }
func (s *JobExecuting) submitDataPreprocessTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, userID cdssdk.UserID) (string, error) { func (s *JobExecuting) submitDataPreprocessTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdsapi.StorageID, userID cdsapi.UserID) (string, error) {
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID) objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID)
if err != nil { if err != nil {
logger.Error(err.Error()) logger.Error(err.Error())
@ -231,7 +231,7 @@ func (s *JobExecuting) submitDataPreprocessTask(rtx jobmgr.JobStateRunContext, c
return tskStatus.InstanceID, nil return tskStatus.InstanceID, nil
} }
func (s *JobExecuting) submitFinetuningTask(userID cdssdk.UserID, rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, runningJob *job.NormalJob) error { func (s *JobExecuting) submitFinetuningTask(userID cdsapi.UserID, rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdsapi.StorageID, runningJob *job.NormalJob) error {
objectStorage, modelInfo, err := getModelInfoAndObjectStorage(rtx, runningJob.Info.ModelJobInfo.ModelID, storageID) objectStorage, modelInfo, err := getModelInfoAndObjectStorage(rtx, runningJob.Info.ModelJobInfo.ModelID, storageID)
if err != nil { if err != nil {
@ -265,7 +265,7 @@ func (s *JobExecuting) submitFinetuningTask(userID cdssdk.UserID, rtx jobmgr.Job
} }
func (s *JobExecuting) submitInstanceTask(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runningJob *job.InstanceJob, ccInfo schmod.ComputingCenter, func (s *JobExecuting) submitInstanceTask(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runningJob *job.InstanceJob, ccInfo schmod.ComputingCenter,
storageID cdssdk.StorageID, userID cdssdk.UserID, envs []schsdk.KVPair) error { storageID cdsapi.StorageID, userID cdsapi.UserID, envs []schsdk.KVPair) error {
modelJobInfo := runningJob.Info.ModelJobInfo modelJobInfo := runningJob.Info.ModelJobInfo
@ -370,7 +370,7 @@ func (s *JobExecuting) submitInstanceTask(rtx jobmgr.JobStateRunContext, jo *job
} }
} }
func getModelInfoAndObjectStorage(rtx jobmgr.JobStateRunContext, modelID schsdk.ModelID, storageID cdssdk.StorageID) (*schmod.ObjectStorage, *schmod.ModelResource, error) { func getModelInfoAndObjectStorage(rtx jobmgr.JobStateRunContext, modelID schsdk.ModelID, storageID cdsapi.StorageID) (*schmod.ObjectStorage, *schmod.ModelResource, error) {
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID) objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID)
if err != nil { if err != nil {
logger.Error(err.Error()) logger.Error(err.Error())
@ -426,7 +426,7 @@ func getRuntimeCommand(runtime schsdk.JobRuntimeInfo, dataSetPath string, output
return cmd, envs return cmd, envs
} }
func getCCInfoAndStgInfo(rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, userID cdssdk.UserID) (*schmod.ComputingCenter, *cdssdk.StorageGetResp, error) { func getCCInfoAndStgInfo(rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, userID cdsapi.UserID) (*schmod.ComputingCenter, *cdsapi.StorageGetResp, error) {
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), targetCCID) ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), targetCCID)
if err != nil { if err != nil {
return nil, nil, fmt.Errorf("getting computing center info: %w", err) return nil, nil, fmt.Errorf("getting computing center info: %w", err)
@ -437,7 +437,7 @@ func getCCInfoAndStgInfo(rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID,
return nil, nil, fmt.Errorf("new cds client: %w", err) return nil, nil, fmt.Errorf("new cds client: %w", err)
} }
defer schglb.CloudreamStoragePool.Release(stgCli) defer schglb.CloudreamStoragePool.Release(stgCli)
getStg, err := stgCli.StorageGet(cdssdk.StorageGet{ getStg, err := stgCli.StorageGet(cdsapi.StorageGet{
UserID: userID, UserID: userID,
StorageID: ccInfo.CDSStorageID, StorageID: ccInfo.CDSStorageID,
}) })
@ -470,7 +470,7 @@ func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.
func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error { func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
reJob := jo.Body.(*job.DataReturnJob) reJob := jo.Body.(*job.DataReturnJob)
userID := cdssdk.UserID(1) userID := cdsapi.UserID(1)
log := logger.WithType[JobExecuting]("State").WithField("JobID", jo.JobID) log := logger.WithType[JobExecuting]("State").WithField("JobID", jo.JobID)