Add scheduler middleware module
parent 017ef22216 · commit 8d49c8684b
@@ -0,0 +1,36 @@
{
    "logger": {
        "output": "file",
        "outputFileName": "schedulerclient",
        "outputDirectory": "log",
        "level": "debug"
    },
    "db2": {
        "address": "101.201.215.196:3306",
        "account": "pcm",
        "password": "123456@Asd",
        "databaseName": "scheduler"
    },
    "db": {
        "address": "localhost:3306",
        "account": "root",
        "password": "123456",
        "databaseName": "scheduler"
    },
    "pcmScheduler": {
        "url": "https://comnet.jointcloud.net/pcm/v1/schedule"
    },
    "uploader": {
        "url": "https://kbguhfxfanfp.test.jointcloud.net:443/v1/storage"
    },
    "blockChain": {
        "url": "https://ai4m.jointcloud.net/blockChain",
        "contractAddress": "0xc860ab27901b3c2b810165a6096c64d88763617f",
        "functionName": "storeEvidence",
        "memberName": "pcm",
        "type": "6"
    },
    "cloudreamStorage": {
        "url": "http://localhost:32010"
    }
}
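For orientation, a minimal sketch of how a client config file with this shape could be loaded, assuming plain encoding/json and hypothetical struct names (the repo's own config loader is not part of this commit, and only a subset of the sections is mirrored here; json.Unmarshal ignores the rest):

package main

import (
	"encoding/json"
	"os"
)

type LoggerConfig struct {
	Output          string `json:"output"`
	OutputFileName  string `json:"outputFileName"`
	OutputDirectory string `json:"outputDirectory"`
	Level           string `json:"level"`
}

type DBConfig struct {
	Address      string `json:"address"`
	Account      string `json:"account"`
	Password     string `json:"password"`
	DatabaseName string `json:"databaseName"`
}

type URLConfig struct {
	URL string `json:"url"`
}

// ClientConfig mirrors the top-level sections above (blockChain etc. omitted).
type ClientConfig struct {
	Logger       LoggerConfig `json:"logger"`
	DB           DBConfig     `json:"db"`
	PCMScheduler URLConfig    `json:"pcmScheduler"`
	Uploader     URLConfig    `json:"uploader"`
}

func loadConfig(path string) (*ClientConfig, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var cfg ClientConfig
	if err := json.Unmarshal(data, &cfg); err != nil {
		return nil, err
	}
	return &cfg, nil
}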
@@ -0,0 +1,34 @@
{
    "uploadParams": {
        "dataType": "dataset",
        "uploadInfo": {
            "type": "local",
            "localPath": "yuque_mind.jpeg"
        },
        "dataName": "yuque_mind.jpeg",
        "uploadPriority": {
            "type": "preference",
            "priorities": [
                {
                    "type": "region",
                    "options": [
                        "华东区域",
                        "华北区域"
                    ]
                },
                {
                    "type": "chip",
                    "options": [
                        "DCU"
                    ]
                },
                {
                    "type": "bias",
                    "options": [
                        "网络优先"
                    ]
                }
            ]
        }
    }
}
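The priorities array above is what feeds the region/chip/bias matching in DefaultPreScheduler.ScheduleJob later in this commit. As a shape-only sketch of one entry (not the actual pcmscheduler SDK declaration, which models these as distinct ResourcePriority types such as RegionPriority, ChipPriority and BiasPriority that ScheduleJob switches on):

// Hypothetical illustration of one "priorities" entry.
type priorityEntry struct {
	Type    string   `json:"type"`    // "region", "chip" or "bias"
	Options []string `json:"options"` // e.g. ["DCU"] for chip, ["网络优先"] for bias
}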
@@ -1,10 +1,13 @@
 package schglb

 import (
+	"gitlink.org.cn/cloudream/common/sdks/blockchain"
 	pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
+	pcmsch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
 	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
 	"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
 	uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
+	uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
 	scmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq"
 	advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
 	cltmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector"
@@ -20,6 +23,27 @@ var CollectorMQPool cltmq.Pool

 var ManagerMQPool mgrmq.Pool

+var PCMSchePool pcmsch.Pool
+
+var UploaderPool uploadersdk.Pool
+
+var BlockChainPool blockchain.Pool
+
+var BlockChainConfig *blockchain.Config
+
+func InitBlockChainPool(cfg *blockchain.Config) {
+	BlockChainPool = blockchain.NewPool(cfg)
+	BlockChainConfig = cfg
+}
+
+func InitUploaderPool(cfg *uploadersdk.Config) {
+	UploaderPool = uploadersdk.NewPool(cfg)
+}
+
+func InitPCMSchePool(cfg *pcmsch.Config) {
+	PCMSchePool = pcmsch.NewPool(cfg)
+}
+
 func InitMQPool(cfg *scmq.Config) {
 	ExecutorMQPool = exemq.NewPool(cfg)
 	AdvisorMQPool = advmq.NewPool(cfg)
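The new pools follow the same Acquire/Release discipline as the existing MQ pools. A minimal usage sketch, mirroring the call sites that appear later in this commit (calcResourceScore, ScheduleJob):

func usePCMScheduler() error {
	// Borrow a client from the pool and always return it.
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	// ... call methods on schCli, e.g. GetClusterInfo ...
	return nil
}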
@@ -18,56 +18,59 @@ type AdvisorID string

 type ComputingCenter struct {
 	// Computing center ID
-	CCID schsdk.CCID `json:"CCID" db:"CCID"`
+	CCID schsdk.CCID `json:"CCID" gorm:"column:CCID"`
 	// ID of this computing center in the ops-control (UOP) system
-	UOPSlwNodeID uopsdk.SlwNodeID `json:"uopSlwNodeID" db:"UOPSlwNodeID"`
+	UOPSlwNodeID uopsdk.SlwNodeID `json:"uopSlwNodeID" gorm:"column:UOPSlwNodeID"`
 	// ID of this computing center in the PCM system
-	PCMParticipantID pcmsdk.ParticipantID `json:"pcmParticipantID" db:"PCMParticipantID"`
+	PCMParticipantID pcmsdk.ParticipantID `json:"pcmParticipantID" gorm:"column:PCMParticipantID"`
 	// ID of this center's storage service in the storage system
-	CDSStorageID cdssdk.StorageID `json:"cdsStorageID" db:"CDSStorageID"`
+	CDSStorageID cdssdk.StorageID `json:"cdsStorageID" gorm:"column:CDSStorageID"`
 	// Computing center name
-	Name string `json:"name" db:"Name"`
+	Name string `json:"name" gorm:"column:Name"`
 	// How tasks are launched
-	Bootstrap schsdk.Bootstrap `json:"bootstrap" db:"Bootstrap"`
+	//Bootstrap schsdk.Bootstrap `json:"bootstrap" gorm:"column:Bootstrap"`
+	Bootstrap schsdk.Bootstrap `json:"bootstrap" gorm:"-"`
 	// Executor ID
-	ExecutorID string `json:"executorID" db:"executorID"`
+	ExecutorID string `json:"executorID" gorm:"column:executorID"`
 	// Executor URL
-	ExecutorURL string `json:"executorURL" db:"executorURL"`
+	ExecutorURL string `json:"executorURL" gorm:"column:executorURL"`
+
+	//ClusterID schsdk.ClusterID `json:"clusterID" gorm:"column:ClusterID"`
 }

 type Image struct {
 	// Image ID within the scheduling system
-	ImageID schsdk.ImageID `json:"imageID" db:"ImageID"`
+	ImageID schsdk.ImageID `json:"imageID" gorm:"column:ImageID"`
 	// PackageID of the image file in the storage system; may be nil, in which case the image cannot be auto-imported into a computing center (e.g. a prebuilt image)
-	CDSPackageID *cdssdk.PackageID `json:"cdsPackageID" db:"CDSPackageID"`
+	CDSPackageID *cdssdk.PackageID `json:"cdsPackageID" gorm:"column:CDSPackageID"`
 	// Image name, as set in the scheduling system
-	Name string `json:"name" db:"Name"`
+	Name string `json:"name" gorm:"column:Name"`
 	// Image creation time
-	CreateTime time.Time `json:"createTime" db:"CreateTime"`
+	CreateTime time.Time `json:"createTime" gorm:"column:CreateTime"`
 }

 type PCMImage struct {
 	// Image ID within the scheduling system
-	ImageID schsdk.ImageID `json:"imageID" db:"ImageID"`
+	ImageID schsdk.ImageID `json:"imageID" gorm:"column:ImageID"`
 	// ID of the computing center the image was imported into
-	CCID schsdk.CCID `json:"ccID" db:"CCID"`
+	CCID schsdk.CCID `json:"ccID" gorm:"column:CCID"`
 	// ID obtained after importing into each computing center via the PCM system
-	PCMImageID pcmsdk.ImageID `json:"pcmImageID" db:"PCMImageID"`
+	PCMImageID pcmsdk.ImageID `json:"pcmImageID" gorm:"column:PCMImageID"`
 	// Image name, obtained from the PCM import
-	Name string `json:"name" db:"Name"`
+	Name string `json:"name" gorm:"column:Name"`
 	// Image import time
-	UploadTime time.Time `json:"uploadTime" db:"UploadTime"`
+	UploadTime time.Time `json:"uploadTime" gorm:"column:UploadTime"`
 }

 type CCResource struct {
 	// Computing center ID
-	CCID schsdk.CCID `json:"ccID" db:"CCID"`
+	CCID schsdk.CCID `json:"ccID" gorm:"column:CCID"`
 	// Resource spec ID returned by the PCM system
-	PCMResourceID pcmsdk.ResourceID `json:"pcmResourceID" db:"PCMResourceID"`
+	PCMResourceID pcmsdk.ResourceID `json:"pcmResourceID" gorm:"column:PCMResourceID"`
 	// Resource spec name returned by the PCM system
-	PCMResourceName string `json:"pcmResourceName" db:"PCMResourceName"`
+	PCMResourceName string `json:"pcmResourceName" gorm:"column:PCMResourceName"`
 	// Concrete resource information contained in this spec
-	Resource CCResourceInfo `json:"resource" db:"Resource"`
+	Resource CCResourceInfo `json:"resource" gorm:"column:Resource"`
 }

 type CCResourceInfo struct {
@@ -80,32 +83,52 @@ type CCResourceInfo struct {
 }

 type Models struct {
-	ModelID   schsdk.ModelID   `json:"modelID" db:"modelID"`
-	ModelName schsdk.ModelName `json:"modelName" db:"modelName"`
+	// Model ID
+	ModelID schsdk.ModelID `json:"modelID" gorm:"column:modelID"`
+	// Model name
+	ModelName schsdk.ModelName `json:"modelName" gorm:"column:modelName"`
 }

 type ModelResource struct {
-	ModelID             int64  `json:"modelID" db:"modelID"`
-	OjbStgID            int64  `json:"OjbStgID" db:"OjbStgID"`
-	ModelPath           string `json:"modelPath" db:"modelPath"`
-	StartShellPath      string `json:"startShellPath" db:"startShellPath"`
-	ServerPort          int64  `json:"serverPort" db:"serverPort"`
-	ServerUrlPath       string `json:"serverUrlPath" db:"serverUrlPath"`
-	StopShellPath       string `json:"stopShellPath" db:"stopShellPath"`
-	FinetuningShellPath string `json:"finetuningShellPath" db:"finetuningShellPath"`
+	// Model ID
+	ModelID int64 `json:"modelID" gorm:"column:modelID"`
+	// Object storage ID
+	OjbStgID int64 `json:"OjbStgID" gorm:"column:OjbStgID"`
+	// Model path
+	ModelPath string `json:"modelPath" gorm:"column:modelPath"`
+	// Startup script path
+	StartShellPath string `json:"startShellPath" gorm:"column:startShellPath"`
+	// Server port
+	ServerPort int64 `json:"serverPort" gorm:"column:serverPort"`
+	// Server URL path
+	ServerUrlPath string `json:"serverUrlPath" gorm:"column:serverUrlPath"`
+	// Stop script path
+	StopShellPath string `json:"stopShellPath" gorm:"column:stopShellPath"`
+	// Fine-tuning script path
+	FinetuningShellPath string `json:"finetuningShellPath" gorm:"column:finetuningShellPath"`
 }

 type ObjectStorage struct {
-	ID           int64            `json:"ID" db:"ID"`
-	Name         string           `json:"name" db:"name"`
-	Manufacturer string           `json:"manufacturer" db:"manufacturer"`
-	Region       string           `json:"region" db:"region"`
-	AK           string           `json:"access_key_id" db:"access_key_id"`
-	SK           string           `json:"secret_access_key" db:"secret_access_key"`
-	Endpoint     string           `json:"endpoint" db:"endpoint"`
-	Bucket       string           `json:"bucket" db:"bucket"`
-	CDSStorageID cdssdk.StorageID `json:"CDSStorageID" db:"CDSStorageID"`
-	MountType    string           `json:"mountType" db:"mountType"`
+	// Storage ID
+	ID int64 `json:"ID" gorm:"column:ID"`
+	// Storage name
+	Name string `json:"name" gorm:"column:name"`
+	// Vendor
+	Manufacturer string `json:"manufacturer" gorm:"column:manufacturer"`
+	// Region
+	Region string `json:"region" gorm:"column:region"`
+	// Access key ID
+	AK string `json:"access_key_id" gorm:"column:access_key_id"`
+	// Secret access key
+	SK string `json:"secret_access_key" gorm:"column:secret_access_key"`
+	// Storage endpoint
+	Endpoint string `json:"endpoint" gorm:"column:endpoint"`
+	// Bucket
+	Bucket string `json:"bucket" gorm:"column:bucket"`
+	// CDS storage ID
+	CDSStorageID cdssdk.StorageID `json:"CDSStorageID" gorm:"column:CDSStorageID"`
+	// Mount type
+	MountType string `json:"mountType" gorm:"column:mountType"`
 }

 func (i *CCResourceInfo) Scan(src interface{}) error {
@@ -122,3 +145,9 @@ const (
 	AliCloud   = "AliCloud"
 	SugonCloud = "SugonCloud"
 )
+
+//type FileUploadedInfo struct {
+//	FileName string `json:"fileName"`
+//	FileHash string `json:"fileHash"`
+//	FileSize int64  `json:"fileSize"`
+//}
@@ -1,7 +1,6 @@
 package db

 import (
-	"github.com/jmoiron/sqlx"
 	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
 	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
 )
@@ -16,6 +15,6 @@ func (db *DB) CCResource() *CCResourceDB {

 func (*CCResourceDB) GetByCCID(ctx SQLContext, id schsdk.CCID) ([]schmod.CCResource, error) {
 	var ret []schmod.CCResource
-	err := sqlx.Select(ctx, &ret, "select * from CCResource where CCID = ?", id)
+	err := ctx.Where("CCID = ?", id).Find(&ret).Error
 	return ret, err
 }
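This file is the smallest instance of the sqlx-to-GORM translation pattern applied throughout the db package; spelled out once for reference (a sketch, not code from this commit):

// Single row: sqlx.Get becomes First; multiple rows: sqlx.Select becomes Find.
// The error moves into the .Error field, and Table(...) names the table
// explicitly wherever the Go type name does not match the table name.
var one schmod.CCResource
err := ctx.Table("CCResource").Where("CCID = ?", id).First(&one).Error // was sqlx.Get
var many []schmod.CCResource
err = ctx.Table("CCResource").Where("CCID = ?", id).Find(&many).Error // was sqlx.Select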
@@ -1,7 +1,6 @@
 package db

 import (
-	"github.com/jmoiron/sqlx"
 	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
 	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
 )
@@ -14,20 +13,52 @@ func (db *DB) ComputingCenter() *ComputingCenterDB {
 	return &ComputingCenterDB{DB: db}
 }

+//func (*ComputingCenterDB) GetByPackageID(ctx SQLContext, id schsdk.CCID) (schmod.ComputingCenter, error) {
+//	var ret TempComputingCenter
+//	err := sqlx.Get(ctx, &ret, "select cc.*, ei.executorURL from (select * from ComputingCenter where CCID = ?) as cc left join (select * from ExecutorInfo) as ei on cc.executorID = ei.executorID", id)
+//	return ret.ToComputingCenter(), err
+//}
+
 func (*ComputingCenterDB) GetByID(ctx SQLContext, id schsdk.CCID) (schmod.ComputingCenter, error) {
 	var ret TempComputingCenter
-	err := sqlx.Get(ctx, &ret, "select cc.*, ei.executorURL from (select * from ComputingCenter where CCID = ?) as cc left join (select * from ExecutorInfo) as ei on cc.executorID = ei.executorID", id)
+	// Run the join as a raw SQL query through GORM
+	err := ctx.Table("ComputingCenter").Raw(`
+		SELECT cc.*, ei.executorURL
+		FROM (SELECT * FROM ComputingCenter WHERE CCID = ?) AS cc
+		LEFT JOIN (SELECT * FROM ExecutorInfo) AS ei
+		ON cc.executorID = ei.executorID`, id).Scan(&ret).Error
+
+	// Convert the TempComputingCenter into a ComputingCenter
 	return ret.ToComputingCenter(), err
 }

-func (*ComputingCenterDB) GetAll(ctx SQLContext) ([]schmod.ComputingCenter, error) {
-	var tmp []TempComputingCenter
-	err := sqlx.Select(ctx, &tmp, "select * from ComputingCenter")
+//func (*ComputingCenterDB) GetAll(ctx SQLContext) ([]schmod.ComputingCenter, error) {
+//	var tmp []TempComputingCenter
+//	err := sqlx.Select(ctx, &tmp, "select * from ComputingCenter")
+//
+//	var ret []schmod.ComputingCenter
+//	for _, t := range tmp {
+//		ret = append(ret, t.ToComputingCenter())
+//	}
+//
+//	return ret, err
+//}

-	var ret []schmod.ComputingCenter
-	for _, t := range tmp {
-		ret = append(ret, t.ToComputingCenter())
-	}
+func (*ComputingCenterDB) GetAll(ctx SQLContext) ([]schmod.ComputingCenter, error) {
+	//var tmp []TempComputingCenter
+	var tmp []schmod.ComputingCenter
+	// Fetch all ComputingCenter rows with GORM's Find
+	err := ctx.Table("ComputingCenter").Find(&tmp).Error

-	return ret, err
+	if err != nil {
+		return nil, err
+	}
+
+	// Conversion from TempComputingCenter is no longer needed here
+	//var ret []schmod.ComputingCenter
+	//for _, t := range tmp {
+	//	ret = append(ret, t.ToComputingCenter())
+	//}
+
+	return tmp, nil
 }
@@ -1,64 +1,91 @@
 package db

 import (
-	"context"
-	"database/sql"
 	"fmt"

-	_ "github.com/go-sql-driver/mysql"
-	"github.com/jmoiron/sqlx"
+	"github.com/sirupsen/logrus"
 	"gitlink.org.cn/cloudream/scheduler/common/pkgs/db/config"
+	"gorm.io/driver/mysql"
+	"gorm.io/gorm"
 )

 // TODO Migrate to GORM
 // TODO ComputingCenter dropped the CDSNodeID field; the DB schema needs updating accordingly

-type DB struct {
-	d *sqlx.DB
-}
+//type DB struct {
+//	d *sqlx.DB
+//}
+//
+//type SQLContext interface {
+//	sqlx.Queryer
+//	sqlx.Execer
+//	sqlx.Ext
+//}
+//
+//func NewDB(cfg *config.Config) (*DB, error) {
+//	db, err := sqlx.Open("mysql", cfg.MakeSourceString())
+//	if err != nil {
+//		return nil, fmt.Errorf("open database connection failed, err: %w", err)
+//	}
+//
+//	// Try connecting once so that configuration errors surface here
+//	err = db.Ping()
+//	if err != nil {
+//		return nil, err
+//	}
+//
+//	return &DB{
+//		d: db,
+//	}, nil
+//}
+//
+//func (db *DB) DoTx(isolation sql.IsolationLevel, fn func(tx *sqlx.Tx) error) error {
+//	tx, err := db.d.BeginTxx(context.Background(), &sql.TxOptions{Isolation: isolation})
+//	if err != nil {
+//		return err
+//	}
+//
+//	if err := fn(tx); err != nil {
+//		tx.Rollback()
+//		return err
+//	}
+//
+//	if err := tx.Commit(); err != nil {
+//		tx.Rollback()
+//		return err
+//	}
+//
+//	return nil
+//}
+//
+//func (db *DB) SQLCtx() SQLContext {
+//	return db.d
+//}

-type SQLContext interface {
-	sqlx.Queryer
-	sqlx.Execer
-	sqlx.Ext
+type DB struct {
+	db *gorm.DB
 }

 func NewDB(cfg *config.Config) (*DB, error) {
-	db, err := sqlx.Open("mysql", cfg.MakeSourceString())
+	mydb, err := gorm.Open(mysql.Open(cfg.MakeSourceString()), &gorm.Config{})
 	if err != nil {
 		return nil, fmt.Errorf("open database connection failed, err: %w", err)
 	}

-	// Try connecting once so that configuration errors surface here
-	err = db.Ping()
 	if err != nil {
-		return nil, err
+		logrus.Fatalf("failed to connect to database: %v", err)
 	}

 	return &DB{
-		d: db,
+		db: mydb,
 	}, nil
 }

-func (db *DB) DoTx(isolation sql.IsolationLevel, fn func(tx *sqlx.Tx) error) error {
-	tx, err := db.d.BeginTxx(context.Background(), &sql.TxOptions{Isolation: isolation})
-	if err != nil {
-		return err
-	}
-
-	if err := fn(tx); err != nil {
-		tx.Rollback()
-		return err
-	}
-
-	if err := tx.Commit(); err != nil {
-		tx.Rollback()
-		return err
-	}
-
-	return nil
+func (db *DB) DoTx(do func(tx SQLContext) error) error {
+	return db.db.Transaction(func(tx *gorm.DB) error {
+		return do(SQLContext{tx})
+	})
 }

-func (db *DB) SQLCtx() SQLContext {
-	return db.d
+type SQLContext struct {
+	*gorm.DB
 }
+
+func (db *DB) DefCtx() SQLContext {
+	return SQLContext{db.db}
+}
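A minimal sketch of how callers would drive the reworked DB type: plain reads go through DefCtx(), and multi-statement work goes through DoTx(), which now wraps gorm's Transaction (rollback on error, commit otherwise); the isolation-level parameter of the old sqlx version is gone. Assumes CCID is a string-backed ID type:

func exampleCallers(db *DB) error {
	id := schsdk.CCID("cc-1") // hypothetical ID for illustration

	// Single read outside a transaction
	if _, err := db.ComputingCenter().GetByID(db.DefCtx(), id); err != nil {
		return err
	}

	// Several statements in one transaction; returning an error rolls back
	return db.DoTx(func(tx SQLContext) error {
		_, err := db.CCResource().GetByCCID(tx, id)
		return err
	})
}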
@@ -3,7 +3,6 @@ package db
 import (
 	"time"

-	"github.com/jmoiron/sqlx"
 	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
 	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
 	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
@@ -17,22 +16,42 @@ func (db *DB) Image() *ImageDB {
 	return &ImageDB{DB: db}
 }

+//func (*ImageDB) GetByPackageID(ctx SQLContext, id schsdk.ImageID) (schmod.Image, error) {
+//	var ret schmod.Image
+//	err := sqlx.Get(ctx, &ret, "select * from Image where ImageID = ?", id)
+//	return ret, err
+//}
+
 func (*ImageDB) GetByID(ctx SQLContext, id schsdk.ImageID) (schmod.Image, error) {
 	var ret schmod.Image
-	err := sqlx.Get(ctx, &ret, "select * from Image where ImageID = ?", id)
+	err := ctx.Table("Image").Where("ImageID = ?", id).First(&ret).Error
 	return ret, err
 }

+//func (*ImageDB) Create(ctx SQLContext, cdsPackageID *cdssdk.PackageID, name string, createTime time.Time) (schsdk.ImageID, error) {
+//	ret, err := ctx.Exec("insert into Image(CDSPackageID, Name, CreateTime) values(?, ?, ?)", cdsPackageID, name, createTime)
+//	if err != nil {
+//		return 0, err
+//	}
+//
+//	id, err := ret.LastInsertId()
+//	if err != nil {
+//		return 0, err
+//	}
+//
+//	return schsdk.ImageID(id), nil
+//}
+
 func (*ImageDB) Create(ctx SQLContext, cdsPackageID *cdssdk.PackageID, name string, createTime time.Time) (schsdk.ImageID, error) {
-	ret, err := ctx.Exec("insert into Image(CDSPackageID, Name, CreateTime) values(?, ?, ?)", cdsPackageID, name, createTime)
-	if err != nil {
+	image := schmod.Image{
+		CDSPackageID: cdsPackageID,
+		Name:         name,
+		CreateTime:   createTime,
+	}
+
+	if err := ctx.Table("Image").Create(&image).Error; err != nil {
 		return 0, err
 	}

-	id, err := ret.LastInsertId()
-	if err != nil {
-		return 0, err
-	}
-
-	return schsdk.ImageID(id), nil
+	return image.ImageID, nil
 }
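Note that the rewritten Create depends on GORM back-filling the auto-increment key into image.ImageID, replacing the old LastInsertId() call. GORM only back-fills the field it recognizes as the primary key, so the Image model would need a tag along these lines (a sketch; such a tag is not visible in the model change earlier in this commit):

// Without a primaryKey marker, GORM treats ImageID as an ordinary column
// and image.ImageID would stay zero after Create.
ImageID schsdk.ImageID `json:"imageID" gorm:"column:ImageID;primaryKey"`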
@@ -11,7 +11,7 @@ import (

 type TempComputingCenter struct {
 	schmod.ComputingCenter
-	Bootstrap BootstrapWarpper `db:"Bootstrap"`
+	Bootstrap BootstrapWrapper `gorm:"column:Bootstrap"`
 }

 func (c *TempComputingCenter) ToComputingCenter() schmod.ComputingCenter {
@@ -20,11 +20,11 @@ func (c *TempComputingCenter) ToComputingCenter() schmod.ComputingCenter {
 	return cc
 }

-type BootstrapWarpper struct {
+type BootstrapWrapper struct {
 	Value schsdk.Bootstrap
 }

-func (o *BootstrapWarpper) Scan(src interface{}) error {
+func (o *BootstrapWrapper) Scan(src interface{}) error {
 	data, ok := src.([]uint8)
 	if !ok {
 		return fmt.Errorf("unknow src type: %v", reflect.TypeOf(data))
@@ -1,7 +1,6 @@
 package db

 import (
-	"github.com/jmoiron/sqlx"
 	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
 	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
 )
@@ -14,15 +13,27 @@ func (db *DB) Models() *ModelsDB {
 	return &ModelsDB{DB: db}
 }

+//func (*ModelsDB) GetAll(ctx SQLContext) ([]schmod.Models, error) {
+//	var ret []schmod.Models
+//	err := sqlx.Select(ctx, &ret, "select * from Models")
+//
+//	return ret, err
+//}
+
 func (*ModelsDB) GetAll(ctx SQLContext) ([]schmod.Models, error) {
 	var ret []schmod.Models
-	err := sqlx.Select(ctx, &ret, "select * from Models")
-
+	err := ctx.Table("Models").Find(&ret).Error
 	return ret, err
 }

+//func (*ModelsDB) GetModelByID(ctx SQLContext, modelID schsdk.ModelID, OjbStgID int64) (schmod.ModelResource, error) {
+//	var ret schmod.ModelResource
+//	err := sqlx.Get(ctx, &ret, "select * from ModelResource where modelID = ? and OjbStgID = ?", modelID, OjbStgID)
+//	return ret, err
+//}
+
 func (*ModelsDB) GetModelByID(ctx SQLContext, modelID schsdk.ModelID, OjbStgID int64) (schmod.ModelResource, error) {
 	var ret schmod.ModelResource
-	err := sqlx.Get(ctx, &ret, "select * from ModelResource where modelID = ? and OjbStgID = ?", modelID, OjbStgID)
+	err := ctx.Table("ModelResource").Where("modelID = ? AND OjbStgID = ?", modelID, OjbStgID).First(&ret).Error
 	return ret, err
 }
@@ -1,7 +1,6 @@
 package db

 import (
-	"github.com/jmoiron/sqlx"
 	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
 	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
 )
@@ -14,8 +13,14 @@ func (db *DB) ObjectStorage() *ObjectStorageDB {
 	return &ObjectStorageDB{DB: db}
 }

+//func (*ObjectStorageDB) GetObjectStorageByStorageID(ctx SQLContext, CDSStorageID cdssdk.StorageID) (schmod.ObjectStorage, error) {
+//	var ret schmod.ObjectStorage
+//	err := sqlx.Get(ctx, &ret, "select * from ObjectStorage where CDSStorageID = ?", CDSStorageID)
+//	return ret, err
+//}
+
 func (*ObjectStorageDB) GetObjectStorageByStorageID(ctx SQLContext, CDSStorageID cdssdk.StorageID) (schmod.ObjectStorage, error) {
 	var ret schmod.ObjectStorage
-	err := sqlx.Get(ctx, &ret, "select * from ObjectStorage where CDSStorageID = ?", CDSStorageID)
+	err := ctx.Table("ObjectStorage").Where("CDSStorageID = ?", CDSStorageID).First(&ret).Error
 	return ret, err
 }
@@ -1,12 +1,10 @@
 package db

 import (
-	"time"
-
-	"github.com/jmoiron/sqlx"
 	pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
 	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
 	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
+	"time"
 )
@@ -17,19 +15,43 @@ func (db *DB) PCMImage() *PCMImageDB {
 	return &PCMImageDB{DB: db}
 }

+//func (*PCMImageDB) GetByImageID(ctx SQLContext, id schsdk.ImageID) ([]schmod.PCMImage, error) {
+//	var ret []schmod.PCMImage
+//	err := sqlx.Select(ctx, &ret, "select * from PCMImage where ImageID = ?", id)
+//	return ret, err
+//}
+//
+//func (*PCMImageDB) GetByImageIDAndCCID(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID) (schmod.PCMImage, error) {
+//	var ret schmod.PCMImage
+//	err := sqlx.Get(ctx, &ret, "select * from PCMImage where ImageID = ? and CCID = ?", imageID, ccID)
+//	return ret, err
+//}
+//
+//func (*PCMImageDB) Create(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID, pcmImageID pcmsdk.ImageID, name string, uploadTime time.Time) error {
+//	_, err := ctx.Exec("insert into PCMImage values(?, ?, ?, ?, ?)", imageID, ccID, pcmImageID, name, uploadTime)
+//	return err
+//}
+
 func (*PCMImageDB) GetByImageID(ctx SQLContext, id schsdk.ImageID) ([]schmod.PCMImage, error) {
 	var ret []schmod.PCMImage
-	err := sqlx.Select(ctx, &ret, "select * from PCMImage where ImageID = ?", id)
+	err := ctx.Table("PCMImage").Where("ImageID = ?", id).Find(&ret).Error
 	return ret, err
 }

 func (*PCMImageDB) GetByImageIDAndCCID(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID) (schmod.PCMImage, error) {
 	var ret schmod.PCMImage
-	err := sqlx.Get(ctx, &ret, "select * from PCMImage where ImageID = ? and CCID = ?", imageID, ccID)
+	err := ctx.Table("PCMImage").Where("ImageID = ? AND CCID = ?", imageID, ccID).First(&ret).Error
 	return ret, err
 }

 func (*PCMImageDB) Create(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID, pcmImageID pcmsdk.ImageID, name string, uploadTime time.Time) error {
-	_, err := ctx.Exec("insert into PCMImage values(?, ?, ?, ?, ?)", imageID, ccID, pcmImageID, name, uploadTime)
-	return err
+	pcmImage := schmod.PCMImage{
+		ImageID:    imageID,
+		CCID:       ccID,
+		PCMImageID: pcmImageID,
+		Name:       name,
+		UploadTime: uploadTime,
+	}
+
+	return ctx.Table("PCMImage").Create(&pcmImage).Error
 }
@@ -0,0 +1,198 @@
package db

import (
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
	"gorm.io/gorm/clause"
	"strings"
	"time"
)

type UploadDataDB struct {
	*DB
}

func (db *DB) UploadData() *UploadDataDB {
	return &UploadDataDB{DB: db}
}

func (db *UploadDataDB) GetByPackageID(ctx SQLContext, packageIDs []cdssdk.PackageID, bindingIDs []int64) ([]uploadersdk.Package, error) {
	var ret []uploadersdk.Package
	err := ctx.Table("UploadData").Where("packageID IN ? or bindingID IN ?", packageIDs, bindingIDs).Find(&ret).Error
	return ret, err
}

func (db *UploadDataDB) GetByID(ctx SQLContext, ids []uploadersdk.DataID) ([]uploadersdk.Package, error) {
	var ret []uploadersdk.Package
	err := ctx.Table("UploadData").Where("ID IN ?", ids).Find(&ret).Error
	return ret, err
}

func (db *UploadDataDB) QueryFolder(ctx SQLContext, queryParams sch.QueryData) ([]uploadersdk.Folder, error) {
	var ret []uploadersdk.Folder
	err := ctx.Table("folders").Where("package_id = ? and path_name like ?", queryParams.PackageID, queryParams.Path+"%").Find(&ret).Error

	// Use a map to de-duplicate paths
	pathSet := make(map[string]struct{})
	for _, folder := range ret {
		// Only consider paths that are children of the input path
		if strings.HasPrefix(folder.Path, queryParams.Path+"/") {
			path := folder.Path[len(queryParams.Path)+1:]
			pathArr := strings.Split(path, "/")
			if len(pathArr) == 0 {
				continue
			}
			path = queryParams.Path + "/" + pathArr[0]
			// Keep only the next level below the input path, de-duplicated
			pathSet[path] = struct{}{}
		}
	}

	// Convert the de-duplicated paths back into a slice
	var result []uploadersdk.Folder
	for path := range pathSet {
		result = append(result, uploadersdk.Folder{Path: path})
	}

	return result, err
}

func (db *UploadDataDB) InsertFolder(ctx SQLContext, packageID cdssdk.PackageID, path string) error {
	folder := uploadersdk.Folder{
		PackageID:  packageID,
		Path:       path,
		CreateTime: time.Now(),
	}

	if err := ctx.Table("folders").Create(&folder).Error; err != nil {
		return err
	}
	return nil
}

func (db *UploadDataDB) DeleteFolder(ctx SQLContext, packageID cdssdk.PackageID, path string) error {
	err := ctx.Table("folders").Where("package_id = ? and path_name like ?", packageID, path+"%").Delete(&uploadersdk.Folder{}).Error
	return err
}

func (db *UploadDataDB) QueryPackage(ctx SQLContext, queryParams sch.QueryData) ([]uploadersdk.Package, error) {
	var ret []uploadersdk.PackageDAO
	err := ctx.Table("package").Where("user_id = ? and data_type = ?", queryParams.UserID, queryParams.DataType).Find(&ret).Error

	// Convert the rows into uploadersdk.Package values
	var res []uploadersdk.Package
	for _, dao := range ret {
		pkg := uploadersdk.Package{
			UserID:          dao.UserID,
			PackageID:       dao.PackageID,
			PackageName:     dao.PackageName,
			DataType:        dao.DataType,
			JsonData:        dao.JsonData,
			BindingID:       dao.BindingID,
			UploadedCluster: dao.UploadedCluster,
		}
		res = append(res, pkg)
	}

	return res, err
}

func (db *UploadDataDB) InsertPackage(ctx SQLContext, newPackage uploadersdk.Package) error {

	// Skip the insert if the package already exists
	if err := ctx.Table("package").Where("package_id = ?", newPackage.PackageID).First(&uploadersdk.PackageDAO{}).Error; err == nil {
		return nil
	}

	dao := uploadersdk.PackageDAO{
		PackageID:   newPackage.PackageID,
		PackageName: newPackage.PackageName,
		DataType:    newPackage.DataType,
		UserID:      newPackage.UserID,
		BindingID:   -1,
	}

	// Insert the new package
	if err := ctx.Table("package").Create(&dao).Error; err != nil {
		return err
	}
	return nil
}

func (db *UploadDataDB) DeletePackage(ctx SQLContext, userID cdssdk.UserID, packageID cdssdk.PackageID) error {
	err := ctx.Table("package").
		Where("package_id = ? and user_id = ?", packageID, userID).
		Delete(&uploadersdk.PackageDAO{}).Error
	return err
}

func (db *UploadDataDB) QueryPackageByID(ctx SQLContext, ID cdssdk.PackageID) (uploadersdk.PackageDAO, error) {
	var ret uploadersdk.PackageDAO
	err := ctx.Table("package").Where("package_id = ?", ID).Omit("Objects").Preload("UploadedCluster").Find(&ret).Error
	return ret, err
}

type ClusterMappingRow struct {
	ClusterID schsdk.ClusterID `gorm:"column:clusterID" json:"clusterID"`
	StorageID cdssdk.StorageID `gorm:"column:storageID" json:"storageID"`
}

func (db *UploadDataDB) GetClusterMapping(ctx SQLContext) (map[schsdk.ClusterID]cdssdk.StorageID, error) {
	var rows []ClusterMappingRow
	err := ctx.Table("ClusterMapping").Find(&rows).Error
	if err != nil {
		return nil, err
	}

	// Manually convert the rows into a map
	ret := make(map[schsdk.ClusterID]cdssdk.StorageID)
	for _, row := range rows {
		ret[row.ClusterID] = row.StorageID
	}

	return ret, nil
}

func (db *UploadDataDB) UpdatePackage(ctx SQLContext, packageID cdssdk.PackageID, jsonData string, bindingID uploadersdk.DataID) error {

	if jsonData != "" {
		if err := ctx.Table("package").Where("package_id = ?", packageID).Update("json_data", jsonData).Error; err != nil {
			return err
		}
	}

	if bindingID != -1 {
		if err := ctx.Table("package").Where("package_id = ?", packageID).Update("binding_id", bindingID).Error; err != nil {
			return err
		}
	}

	return nil
}

func (db *UploadDataDB) InsertBlockchains(ctx SQLContext, blockchains []*uploadersdk.BlockChain) error {

	if err := ctx.Table("BlockChain").Create(&blockchains).Error; err != nil {
		return err
	}

	return nil
}

func (db *UploadDataDB) InsertOrUpdateBinding(ctx SQLContext, data uploadersdk.BindingData) error {
	err := ctx.Table("BindingData").Clauses(clause.OnConflict{
		Columns: []clause.Column{{Name: "ID"}}, // the conflict column
		DoUpdates: clause.Assignments(map[string]interface{}{
			"bindingName": data.BindingName,
			//"bindingType": data.BindingType,
		}),
	}).Create(&data).Error

	if err != nil {
		return err
	}

	return nil
}
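For readers less familiar with GORM's clause API: on MySQL the OnConflict clause in InsertOrUpdateBinding compiles to an upsert roughly like the following (a sketch of the generated SQL, not captured output; MySQL ignores the conflict-target column list and reacts to any unique-key collision):

// INSERT INTO BindingData (ID, bindingName, ...) VALUES (?, ?, ...)
//     ON DUPLICATE KEY UPDATE bindingName = VALUES(bindingName)
//
// i.e. create the row when ID is unused, otherwise refresh only bindingName.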
@@ -403,13 +403,6 @@ func (s *DefaultPreScheduler) scheduleForNormalOrMultiJob(jobSet *schsdk.JobSetI
 		return nil, ErrNoAvailableScheme
 	}

-	// This logic is for testing only and must be removed in production!
-	for i := 0; i < len(allCCsArr); i++ {
-		if allCCsArr[i].CC.CCID == schsdk.CCID(jobResource.Storage) {
-			targetNode = allCCsArr[i]
-		}
-	}
-
 	scheme := s.makeSchemeForNode(jobFiles, targetNode)
 	return &scheme, nil
 }
@@ -0,0 +1,349 @@
package prescheduler2

import (
	"fmt"
	pcmsch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"

	"github.com/inhies/go-bytesize"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	"gitlink.org.cn/cloudream/common/utils/math2"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector"
	mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)

func (s *DefaultPreScheduler) calcResourceScore(jobResource schsdk.JobResourcesInfo, allCCs map[schsdk.CCID]*candidate) error {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	clusterIDs := make([]schsdk.ClusterID, 0, len(allCCs))
	for _, cc := range allCCs {
		clusterIDs = append(clusterIDs, schsdk.ClusterID(cc.CC.CCID))
	}

	clusterDetails, err := schCli.GetClusterInfo(pcmsch.GetClusterInfoReq{
		IDs: clusterIDs,
	})
	if err != nil {
		return fmt.Errorf("get cluster info: %w", err)
	}

	for _, cluster := range clusterDetails {
		res, err := s.calcOneResourceScore(jobResource, cluster.Resources2)
		if err != nil {
			return err
		}
		for _, cc := range allCCs {
			if schsdk.ClusterID(cc.CC.CCID) == cluster.ClusterId {
				cc.Resource = *res
				break
			}
		}
	}

	return nil
}

// Classify the node's resource levels and compute its resource score
func (s *DefaultPreScheduler) calcOneResourceScore(requires schsdk.JobResourcesInfo, resourceData []pcmsch.ResourceData) (*resourcesDetail, error) {

	var resDetail resourcesDetail

	// Compute the resource scores
	totalScore := 0.0
	maxLevel := 0
	resKinds := 0

	if requires.CPU > 0 {
		res := findResuorce[*pcmsch.CPUResourceData](resourceData)
		if res == nil {
			resDetail.CPU.Level = ResourceLevel3
			resDetail.CPU.Score = 0
		} else {
			resDetail.CPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.CPU)
			resDetail.CPU.Score = (float64(res.Available.Value) / requires.CPU) * CpuResourceWeight
		}

		maxLevel = math2.Max(maxLevel, resDetail.CPU.Level)
		totalScore += resDetail.CPU.Score
		resKinds++
	}

	if requires.GPU > 0 {
		res := findResuorce[*pcmsch.GPUResourceData](resourceData)
		if res == nil {
			resDetail.GPU.Level = ResourceLevel3
			resDetail.GPU.Score = 0
		} else {
			resDetail.GPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.GPU)
			resDetail.GPU.Score = (float64(res.Available.Value) / requires.GPU) * CpuResourceWeight
		}

		maxLevel = math2.Max(maxLevel, resDetail.GPU.Level)
		totalScore += resDetail.GPU.Score
		resKinds++
	}

	if requires.NPU > 0 {
		res := findResuorce[*pcmsch.NPUResourceData](resourceData)
		if res == nil {
			resDetail.NPU.Level = ResourceLevel3
			resDetail.NPU.Score = 0
		} else {
			resDetail.NPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.NPU)
			resDetail.NPU.Score = (float64(res.Available.Value) / requires.NPU) * CpuResourceWeight
		}

		maxLevel = math2.Max(maxLevel, resDetail.NPU.Level)
		totalScore += resDetail.NPU.Score
		resKinds++
	}

	if requires.MLU > 0 {
		res := findResuorce[*pcmsch.MLUResourceData](resourceData)
		if res == nil {
			resDetail.MLU.Level = ResourceLevel3
			resDetail.MLU.Score = 0
		} else {
			resDetail.MLU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.MLU)
			resDetail.MLU.Score = (float64(res.Available.Value) / requires.MLU) * CpuResourceWeight
		}

		maxLevel = math2.Max(maxLevel, resDetail.MLU.Level)
		totalScore += resDetail.MLU.Score
		resKinds++
	}

	if requires.Storage > 0 {
		res := findResuorce[*pcmsch.StorageResourceData](resourceData)
		if res == nil {
			resDetail.Storage.Level = ResourceLevel3
			resDetail.Storage.Score = 0
		} else {
			bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit))
			if err != nil {
				return nil, err
			}

			resDetail.Storage.Level = s.calcResourceLevel(float64(bytes), float64(requires.Storage))
			resDetail.Storage.Score = (float64(bytes) / float64(requires.Storage)) * StgResourceWeight
		}

		maxLevel = math2.Max(maxLevel, resDetail.Storage.Level)
		totalScore += resDetail.Storage.Score
		resKinds++
	}

	if requires.Memory > 0 {
		res := findResuorce[*pcmsch.MemoryResourceData](resourceData)
		if res == nil {
			resDetail.Memory.Level = ResourceLevel3
			resDetail.Memory.Score = 0
		} else {
			bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit))
			if err != nil {
				return nil, err
			}

			resDetail.Memory.Level = s.calcResourceLevel(float64(bytes), float64(requires.Memory))
			resDetail.Memory.Score = (float64(bytes) / float64(requires.Memory)) * StgResourceWeight
		}

		maxLevel = math2.Max(maxLevel, resDetail.Memory.Level)
		totalScore += resDetail.Memory.Score
		resKinds++
	}

	if resKinds == 0 {
		return &resDetail, nil
	}

	resDetail.TotalScore = totalScore
	resDetail.AvgScore = totalScore / float64(resKinds)
	resDetail.MaxLevel = maxLevel

	return &resDetail, nil
}

func (s *DefaultPreScheduler) calcResourceLevel(avai float64, need float64) int {
	if avai >= 1.5*need {
		return ResourceLevel1
	}

	if avai >= need {
		return ResourceLevel2
	}

	return ResourceLevel3
}

// Compute per-node file scores
func (s *DefaultPreScheduler) calcFileScore(files schsdk.JobFilesInfo, allCCs map[schsdk.CCID]*candidate) error {
	// Only weigh data held by storage services of the computing centers reported available by the ops system
	cdsStgToCC := make(map[cdssdk.StorageID]*candidate)
	for _, cc := range allCCs {
		cdsStgToCC[cc.CC.CDSStorageID] = cc
	}

	// Score for the code files
	if pkgFile, ok := files.Code.(*schsdk.PackageJobFileInfo); ok {
		codeFileScores, err := s.calcPackageFileScore(pkgFile.PackageID, cdsStgToCC)
		if err != nil {
			return fmt.Errorf("calc code file score: %w", err)
		}
		for id, score := range codeFileScores {
			allCCs[id].Files.Code = *score
		}
	}

	// Score for the dataset files
	if pkgFile, ok := files.Dataset.(*schsdk.PackageJobFileInfo); ok {
		datasetFileScores, err := s.calcPackageFileScore(pkgFile.PackageID, cdsStgToCC)
		if err != nil {
			return fmt.Errorf("calc dataset file score: %w", err)
		}
		for id, score := range datasetFileScores {
			allCCs[id].Files.Dataset = *score
		}
	}

	// Score for the image files
	if imgFile, ok := files.Image.(*schsdk.ImageJobFileInfo); ok {
		imageFileScores, err := s.calcImageFileScore(imgFile.ImageID, allCCs, cdsStgToCC)
		if err != nil {
			return fmt.Errorf("calc image file score: %w", err)
		}
		for id, score := range imageFileScores {
			allCCs[id].Files.Image = *score
		}
	}

	for _, cc := range allCCs {
		cc.Files.TotalScore = cc.Files.Code.CachingScore +
			cc.Files.Code.LoadingScore +
			cc.Files.Dataset.CachingScore +
			cc.Files.Dataset.LoadingScore +
			cc.Files.Image.CachingScore +
			cc.Files.Image.LoadingScore
	}

	return nil
}

// Compute the package's score on each node
func (s *DefaultPreScheduler) calcPackageFileScore(packageID cdssdk.PackageID, cdsStgToCC map[cdssdk.StorageID]*candidate) (map[schsdk.CCID]*fileDetail, error) {
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)

	ccFileScores := make(map[schsdk.CCID]*fileDetail)

	// TODO UserID
	cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, packageID))
	if err != nil {
		return nil, err
	}

	for _, cdsNodeCacheInfo := range cachedResp.StorageInfos {
		cc, ok := cdsStgToCC[cdsNodeCacheInfo.StorageID]
		if !ok {
			continue
		}

		ccFileScores[cc.CC.CCID] = &fileDetail{
			//TODO The calculation may differ depending on the caching method
			CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight,
		}
	}

	// TODO UserID
	loadedResp, err := colCli.PackageGetLoadedStgs(collector.NewPackageGetLoadedStgs(1, packageID))
	if err != nil {
		return nil, err
	}

	for _, cdsStgID := range loadedResp.StgIDs {
		cc, ok := cdsStgToCC[cdsStgID]
		if !ok {
			continue
		}

		sfc, ok := ccFileScores[cc.CC.CCID]
		if !ok {
			sfc = &fileDetail{}
			ccFileScores[cc.CC.CCID] = sfc
		}

		sfc.LoadingScore = 1 * LoadedWeight
		sfc.IsLoaded = true
	}

	return ccFileScores, nil
}

// Compute the image's score on each node
func (s *DefaultPreScheduler) calcImageFileScore(imageID schsdk.ImageID, allCCs map[schsdk.CCID]*candidate, cdsStgToCC map[cdssdk.StorageID]*candidate) (map[schsdk.CCID]*fileDetail, error) {
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)

	magCli, err := schglb.ManagerMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new manager client: %w", err)
	}
	defer schglb.ManagerMQPool.Release(magCli)

	imageInfoResp, err := magCli.GetImageInfo(mgrmq.NewGetImageInfo(imageID))
	if err != nil {
		return nil, fmt.Errorf("getting image info: %w", err)
	}

	ccFileScores := make(map[schsdk.CCID]*fileDetail)

	if imageInfoResp.Image.CDSPackageID != nil {
		cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, *imageInfoResp.Image.CDSPackageID))
		if err != nil {
			return nil, err
		}

		for _, cdsNodeCacheInfo := range cachedResp.StorageInfos {
			cc, ok := cdsStgToCC[cdsNodeCacheInfo.StorageID]
			if !ok {
				continue
			}

			ccFileScores[cc.CC.CCID] = &fileDetail{
				//TODO The calculation may differ depending on the caching method
				CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight,
			}
		}
	}

	// For images, LoadingScore reflects whether the image has been imported into the computing center
	for _, pcmImg := range imageInfoResp.PCMImages {
		_, ok := allCCs[pcmImg.CCID]
		if !ok {
			continue
		}

		fsc, ok := ccFileScores[pcmImg.CCID]
		if !ok {
			fsc = &fileDetail{}
			ccFileScores[pcmImg.CCID] = fsc
		}

		fsc.LoadingScore = 1 * LoadedWeight
		fsc.IsLoaded = true
	}

	return ccFileScores, nil
}
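A worked example of the level/score rules in calcOneResourceScore, for a job requesting 8 GPUs against a cluster reporting 10 available:

// level: 10 >= 1.5*8 (12)? no; 10 >= 8? yes      -> ResourceLevel2
// score: (10 / 8) * CpuResourceWeight (1) = 1.25
//
// A resource type that is requested but absent always yields ResourceLevel3
// with score 0, and MaxLevel takes the worst level across all requested types.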
@ -0,0 +1,636 @@
|
|||
package prescheduler2
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
pcmsch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
"sort"
|
||||
|
||||
"github.com/samber/lo"
|
||||
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
)
|
||||
|
||||
const (
|
||||
//每个节点划分的资源等级:
|
||||
// ResourceLevel1:表示所有资源类型均满足 大于等于1.5倍
|
||||
ResourceLevel1 = 1
|
||||
// ResourceLevel2:表示不满足Level1,但所有资源类型均满足 大于等于1倍
|
||||
ResourceLevel2 = 2
|
||||
// ResourceLevel3: 表示某些资源类型 小于一倍
|
||||
ResourceLevel3 = 3
|
||||
|
||||
CpuResourceWeight float64 = 1
|
||||
StgResourceWeight float64 = 1.2
|
||||
|
||||
CachingWeight float64 = 1
|
||||
LoadedWeight float64 = 2
|
||||
)
|
||||
|
||||
var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait")
|
||||
|
||||
type candidate struct {
|
||||
CC schmod.ComputingCenter
|
||||
IsReferencedJobTarget bool // 这个节点是否是所依赖的任务所选择的节点
|
||||
Resource resourcesDetail
|
||||
Files filesDetail
|
||||
}
|
||||
|
||||
type resourcesDetail struct {
|
||||
CPU resourceDetail
|
||||
GPU resourceDetail
|
||||
NPU resourceDetail
|
||||
MLU resourceDetail
|
||||
Storage resourceDetail
|
||||
Memory resourceDetail
|
||||
|
||||
TotalScore float64
|
||||
AvgScore float64
|
||||
MaxLevel int
|
||||
}
|
||||
type resourceDetail struct {
|
||||
Level int
|
||||
Score float64
|
||||
}
|
||||
|
||||
type filesDetail struct {
|
||||
Dataset fileDetail
|
||||
Code fileDetail
|
||||
Image fileDetail
|
||||
|
||||
TotalScore float64
|
||||
}
|
||||
type fileDetail struct {
|
||||
CachingScore float64
|
||||
LoadingScore float64
|
||||
IsLoaded bool //表示storage是否已经调度到该节点, image表示镜像是否已经加载到该算力中心
|
||||
}
|
||||
|
||||
type schedulingJob struct {
|
||||
Job schsdk.JobInfo
|
||||
Afters []string
|
||||
}
|
||||
|
||||
type CandidateArr []*candidate
|
||||
|
||||
func (a CandidateArr) Len() int { return len(a) }
|
||||
func (a CandidateArr) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a CandidateArr) Less(i, j int) bool {
|
||||
n1 := a[i]
|
||||
n2 := a[j]
|
||||
|
||||
// 优先与所依赖的任务放到一起,但要求那个节点的资源足够
|
||||
if n1.IsReferencedJobTarget && n1.Resource.MaxLevel < ResourceLevel3 {
|
||||
return true
|
||||
}
|
||||
if n2.IsReferencedJobTarget && n2.Resource.MaxLevel < ResourceLevel3 {
|
||||
return true
|
||||
}
|
||||
|
||||
// 优先判断资源等级,资源等级越低,代表越满足需求
|
||||
if n1.Resource.MaxLevel < n2.Resource.MaxLevel {
|
||||
return true
|
||||
}
|
||||
if n1.Resource.MaxLevel > n2.Resource.MaxLevel {
|
||||
return false
|
||||
}
|
||||
|
||||
// 等级相同时,根据单项分值比较
|
||||
switch n1.Resource.MaxLevel {
|
||||
case ResourceLevel1:
|
||||
// 数据文件总分越高,代表此节点上拥有的数据文件越完整,则越优先考虑
|
||||
return n1.Files.TotalScore > n2.Files.TotalScore
|
||||
|
||||
case ResourceLevel2:
|
||||
// 资源分的平均值越高,代表资源越空余,则越优先考虑
|
||||
return n1.Resource.AvgScore > n2.Resource.AvgScore
|
||||
|
||||
case ResourceLevel3:
|
||||
// 资源分的平均值越高,代表资源越空余,则越优先考虑
|
||||
return n1.Resource.AvgScore > n2.Resource.AvgScore
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
type DefaultPreScheduler struct {
|
||||
}
|
||||
|
||||
func NewDefaultPreScheduler() *DefaultPreScheduler {
|
||||
return &DefaultPreScheduler{}
|
||||
}
|
||||
|
||||
// ScheduleJobSet 任务集预调度
|
||||
func (s *DefaultPreScheduler) ScheduleJobSet(info *schsdk.JobSetInfo, allCC []schmod.ComputingCenter) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) {
|
||||
jobSetScheme := &jobmod.JobSetPreScheduleScheme{
|
||||
JobSchemes: make(map[string]jobmod.JobScheduleScheme),
|
||||
}
|
||||
filesUploadSchemes := make(map[string]schsdk.LocalFileUploadScheme)
|
||||
|
||||
ccs := make(map[schsdk.CCID]schmod.ComputingCenter)
|
||||
for _, node := range allCC {
|
||||
ccs[node.CCID] = node
|
||||
}
|
||||
|
||||
if len(ccs) == 0 {
|
||||
return nil, nil, ErrNoAvailableScheme
|
||||
}
|
||||
|
||||
// 先根据任务配置,收集它们依赖的任务的LocalID
|
||||
var schJobs []*schedulingJob
|
||||
for _, job := range info.Jobs {
|
||||
j := &schedulingJob{
|
||||
Job: job,
|
||||
}
|
||||
|
||||
if norJob, ok := job.(*schsdk.NormalJobInfo); ok {
|
||||
if resFile, ok := norJob.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok {
|
||||
j.Afters = append(j.Afters, resFile.DataReturnLocalJobID)
|
||||
}
|
||||
|
||||
if resFile, ok := norJob.Files.Code.(*schsdk.DataReturnJobFileInfo); ok {
|
||||
j.Afters = append(j.Afters, resFile.DataReturnLocalJobID)
|
||||
}
|
||||
} else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok {
|
||||
j.Afters = append(j.Afters, resJob.TargetLocalJobID)
|
||||
}
|
||||
|
||||
schJobs = append(schJobs, j)
|
||||
}
|
||||
|
||||
// 然后根据依赖进行排序
|
||||
schJobs, ok := s.orderByAfters(schJobs)
|
||||
if !ok {
|
||||
return nil, nil, fmt.Errorf("circular reference detected between jobs in the job set")
|
||||
}
|
||||
|
||||
// 经过排序后,按顺序生成调度方案
|
||||
for _, job := range schJobs {
|
||||
|
||||
var fileInfo schsdk.JobFilesInfo
|
||||
isNormalType := false
|
||||
norJob, ok := job.Job.(*schsdk.NormalJobInfo)
|
||||
if ok {
|
||||
fileInfo = norJob.Files
|
||||
isNormalType = true
|
||||
}
|
||||
dpJob, ok := job.Job.(*schsdk.DataPreprocessJobInfo)
|
||||
if ok {
|
||||
fileInfo = dpJob.Files
|
||||
isNormalType = true
|
||||
}
|
||||
ftJob, ok := job.Job.(*schsdk.FinetuningJobInfo)
|
||||
if ok {
|
||||
fileInfo = ftJob.Files
|
||||
isNormalType = true
|
||||
}
|
||||
|
||||
if isNormalType {
|
||||
scheme, err := s.scheduleForNormalOrMultiJob(info, job, ccs, jobSetScheme.JobSchemes)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme
|
||||
|
||||
// 检查数据文件的配置项,生成上传文件方案
|
||||
s.fillNormarlJobLocalUploadScheme(fileInfo, scheme.TargetCCID, filesUploadSchemes, ccs)
|
||||
}
|
||||
|
||||
if mulJob, ok := job.Job.(*schsdk.MultiInstanceJobInfo); ok {
|
||||
scheme, err := s.scheduleForNormalOrMultiJob(info, job, ccs, jobSetScheme.JobSchemes)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme
|
||||
|
||||
// 检查数据文件的配置项,生成上传文件方案
|
||||
s.fillNormarlJobLocalUploadScheme(mulJob.Files, scheme.TargetCCID, filesUploadSchemes, ccs)
|
||||
}
|
||||
|
||||
// 回源任务目前不需要生成调度方案
|
||||
}
|
||||
|
||||
return jobSetScheme, &schsdk.JobSetFilesUploadScheme{
|
||||
LocalFileSchemes: lo.Values(filesUploadSchemes),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ScheduleJob 单个任务预调度
|
||||
func (s *DefaultPreScheduler) ScheduleJob(priorities []pcmsch.ResourcePriority, clusterMapping map[schsdk.ClusterID]cdssdk.StorageID) (*schsdk.ClusterID, error) {
|
||||
schCli, err := schglb.PCMSchePool.Acquire()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("new scheduler client: %w", err)
|
||||
}
|
||||
defer schglb.PCMSchePool.Release(schCli)
|
||||
|
||||
// 查询指定算力中心
|
||||
clusterIDs := make([]schsdk.ClusterID, 0, len(clusterMapping))
|
||||
|
||||
for id, _ := range clusterMapping {
|
||||
clusterIDs = append(clusterIDs, id)
|
||||
}
|
||||
|
||||
clusterDetails, err := schCli.GetClusterInfo(pcmsch.GetClusterInfoReq{
|
||||
IDs: clusterIDs,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("get cluster info: %w", err)
|
||||
}
|
||||
|
||||
//var regionPriority *pcmsch.RegionPriority
|
||||
var chipPriority *pcmsch.ChipPriority
|
||||
var biasPriority *pcmsch.BiasPriority
|
||||
for _, priority := range priorities {
|
||||
switch pr := priority.(type) {
|
||||
case *pcmsch.ChipPriority:
|
||||
chipPriority = pr
|
||||
//case *pcmsch.RegionPriority:
|
||||
// regionPriority = pr
|
||||
case *pcmsch.BiasPriority:
|
||||
biasPriority = pr
|
||||
}
|
||||
}
|
||||
|
||||
var eligibleClusters []pcmsch.ClusterDetail
|
||||
|
||||
// 遍历所有集群,检查是否符合选择条件
|
||||
for _, cluster := range clusterDetails {
|
||||
// 匹配地域选择
|
||||
//if !matchRegion(cluster.Region, regionPriority) {
|
||||
// continue
|
||||
//}
|
||||
|
||||
// 匹配卡类型选择(例如:CPU、GPU等)
|
||||
if !matchChipType(cluster.Resources, chipPriority) {
|
||||
continue
|
||||
}
|
||||
|
||||
// 匹配功能选择(云算,智算,超算等)
|
||||
if !matchFunction(cluster.ClusterType, biasPriority) {
|
||||
continue
|
||||
}
|
||||
|
||||
// 将符合条件的集群加入候选列表
|
||||
eligibleClusters = append(eligibleClusters, cluster)
|
||||
}
|
||||
|
||||
// 根据剩余资源量对符合条件的集群进行排序,优先选择剩余资源最多的
|
||||
if len(eligibleClusters) > 1 {
|
||||
sort.Slice(eligibleClusters, func(i, j int) bool {
|
||||
return getRemainingResources(eligibleClusters[i].Resources) > getRemainingResources(eligibleClusters[j].Resources)
|
||||
})
|
||||
// 返回剩余资源最多的算力中心
|
||||
return &eligibleClusters[0].ClusterId, nil
|
||||
} else if len(eligibleClusters) == 1 {
|
||||
// 如果只有一个符合条件的算力中心,直接选择
|
||||
return &eligibleClusters[0].ClusterId, nil
|
||||
}
|
||||
|
||||
// 如果没有符合条件的算力中心,返回 nil
|
||||
return nil, fmt.Errorf("no eligible clusters found")
|
||||
}
// matchRegion reports whether a cluster's region satisfies the region priority.
// A nil or empty priority matches any region.
func matchRegion(region string, regionPriority *pcmsch.RegionPriority) bool {
	if regionPriority == nil || len(regionPriority.Options) == 0 {
		return true
	}

	for _, option := range regionPriority.Options {
		if option == region {
			return true
		}
	}
	return false
}

// matchChipType reports whether any of the cluster's resources matches the chip
// (card type) priority. A nil or empty priority matches any chip type.
func matchChipType(resources []pcmsch.TmpResourceData, chipPriority *pcmsch.ChipPriority) bool {
	if chipPriority == nil || len(chipPriority.Options) == 0 {
		return true
	}

	for _, resource := range resources {
		if contains(chipPriority.Options, string(resource.Type)) {
			return true
		}
	}
	return false
}

// matchFunction reports whether the cluster's function type satisfies the bias
// priority. A nil or empty priority matches any function type.
func matchFunction(functionType string, biasPriority *pcmsch.BiasPriority) bool {
	if biasPriority == nil || len(biasPriority.Options) == 0 {
		return true
	}

	for _, option := range biasPriority.Options {
		if option == functionType {
			return true
		}
	}
	return false
}

// getRemainingResources sums the available amount of each resource, weighted by
// resource class (this can be extended per concrete resource type). Compute-class
// resources (CPU/NPU/GPU/MLU) share CpuResourceWeight; storage-class resources
// (storage/memory) share StgResourceWeight.
func getRemainingResources(resources []pcmsch.TmpResourceData) float64 {
	var totalAvailable float64

	for _, resource := range resources {
		switch resource.Type {
		case pcmsch.ResourceTypeCPU, pcmsch.ResourceTypeNPU, pcmsch.ResourceTypeGPU, pcmsch.ResourceTypeMLU:
			totalAvailable += resource.Available.Value * CpuResourceWeight
		case pcmsch.ResourceTypeStorage, pcmsch.ResourceTypeMemory:
			totalAvailable += resource.Available.Value * StgResourceWeight
		}
	}
	return totalAvailable
}
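// Illustrative arithmetic (the weight values here are assumptions; the real
// constants are defined elsewhere in this package): with CpuResourceWeight = 1.0
// and StgResourceWeight = 0.1, a cluster with 8 available GPUs and 500 available
// storage units scores 8*1.0 + 500*0.1 = 58.0.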
// contains reports whether slice contains str.
func contains(slice []string, str string) bool {
	for _, item := range slice {
		if item == str {
			return true
		}
	}
	return false
}

func (s *DefaultPreScheduler) orderByAfters(jobs []*schedulingJob) ([]*schedulingJob, bool) {
	type jobOrder struct {
		Job    *schedulingJob
		Afters []string
	}

	var jobOrders []*jobOrder
	for _, job := range jobs {
		od := &jobOrder{
			Job:    job,
			Afters: make([]string, len(job.Afters)),
		}

		copy(od.Afters, job.Afters)

		jobOrders = append(jobOrders, od)
	}

	// Topologically sort the jobs by their dependencies.
	var orderedJob []*schedulingJob
	for {
		rm := 0
		for i, jo := range jobOrders {
			// Take out any job that has no remaining dependencies.
			if len(jo.Afters) == 0 {
				orderedJob = append(orderedJob, jo.Job)

				// Remove references to it from every other job.
				for _, job2 := range jobOrders {
					job2.Afters = lo.Reject(job2.Afters, func(item string, idx int) bool { return item == jo.Job.Job.GetLocalJobID() })
				}

				rm++
				continue
			}

			jobOrders[i-rm] = jobOrders[i]
		}

		jobOrders = jobOrders[:len(jobOrders)-rm]
		if len(jobOrders) == 0 {
			break
		}

		// A full pass found no dependency-free job, so there is a reference
		// cycle and ordering fails.
		if rm == 0 {
			return nil, false
		}
	}

	return orderedJob, true
}
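// Illustrative example: given jobs A, B and C where B.Afters = ["A"] and
// C.Afters = ["A", "B"], orderByAfters yields [A, B, C]. If instead
// A.Afters = ["C"], the three jobs form a cycle and the second return value
// is false.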
func (s *DefaultPreScheduler) scheduleForNormalOrMultiJob(jobSet *schsdk.JobSetInfo, job *schedulingJob, ccs map[schsdk.CCID]schmod.ComputingCenter, jobSchemes map[string]jobmod.JobScheduleScheme) (*jobmod.JobScheduleScheme, error) {
	allCCs := make(map[schsdk.CCID]*candidate)

	// Initialize the candidate node info.
	for _, cc := range ccs {
		caNode := &candidate{
			CC: cc,
		}

		// Check whether this node is the one chosen by any job this job depends on.
		for _, af := range job.Afters {
			resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af)
			if resJob == nil {
				return nil, fmt.Errorf("resource job %s not found in the job set", af)
			}

			// Since jobs are already ordered by their references, the scheme should
			// normally be present here.
			scheme, ok := jobSchemes[resJob.TargetLocalJobID]
			if !ok {
				continue
			}

			if scheme.TargetCCID == cc.CCID {
				caNode.IsReferencedJobTarget = true
				break
			}
		}

		allCCs[cc.CCID] = caNode
	}

	var jobFiles *schsdk.JobFilesInfo
	var jobResource *schsdk.JobResourcesInfo

	switch runningJob := job.Job.(type) {
	case *schsdk.NormalJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	case *schsdk.DataPreprocessJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	case *schsdk.FinetuningJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	case *schsdk.MultiInstanceJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	}

	// Guard against unexpected job types so the dereferences below cannot panic.
	if jobFiles == nil || jobResource == nil {
		return nil, fmt.Errorf("unsupported job type %T", job.Job)
	}

	// Score candidates by how much of the job's files they already hold.
	err := s.calcFileScore(*jobFiles, allCCs)
	if err != nil {
		return nil, err
	}

	// Score candidates by their remaining resources.
	err = s.calcResourceScore(*jobResource, allCCs)
	if err != nil {
		return nil, err
	}

	allCCsArr := lo.Values(allCCs)
	if len(allCCsArr) == 0 {
		return nil, ErrNoAvailableScheme
	}
	sort.Sort(CandidateArr(allCCsArr))

	targetNode := allCCsArr[0]
	if targetNode.Resource.MaxLevel == ResourceLevel3 {
		return nil, ErrNoAvailableScheme
	}

	scheme := s.makeSchemeForNode(jobFiles, targetNode)
	return &scheme, nil
}
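// Illustrative flow: with two candidate centers, the one already holding the
// job's dataset earns a higher file score; the resource score then refines the
// ranking, and the front candidate after sorting becomes TargetCCID in the
// returned scheme.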
// fillNormarlJobLocalUploadScheme fills an upload scheme for each local file
// (dataset, code, image) that does not have one yet, targeting the CDS storage
// of the chosen computing center.
func (s *DefaultPreScheduler) fillNormarlJobLocalUploadScheme(files schsdk.JobFilesInfo, targetCCID schsdk.CCID, schemes map[string]schsdk.LocalFileUploadScheme, ccs map[schsdk.CCID]schmod.ComputingCenter) {
	if localFile, ok := files.Dataset.(*schsdk.LocalJobFileInfo); ok {
		if _, ok := schemes[localFile.LocalPath]; !ok {
			cdsNodeID := ccs[targetCCID].CDSStorageID
			schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{
				LocalPath:           localFile.LocalPath,
				UploadToCDStorageID: cdsNodeID,
			}
		}
	}

	if localFile, ok := files.Code.(*schsdk.LocalJobFileInfo); ok {
		if _, ok := schemes[localFile.LocalPath]; !ok {
			cdsNodeID := ccs[targetCCID].CDSStorageID
			schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{
				LocalPath:           localFile.LocalPath,
				UploadToCDStorageID: cdsNodeID,
			}
		}
	}

	if localFile, ok := files.Image.(*schsdk.LocalJobFileInfo); ok {
		if _, ok := schemes[localFile.LocalPath]; !ok {
			cdsNodeID := ccs[targetCCID].CDSStorageID
			schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{
				LocalPath:           localFile.LocalPath,
				UploadToCDStorageID: cdsNodeID,
			}
		}
	}
}

func (s *DefaultPreScheduler) makeSchemeForNode(jobFiles *schsdk.JobFilesInfo, targetCC *candidate) jobmod.JobScheduleScheme {
	scheme := jobmod.JobScheduleScheme{
		TargetCCID: targetCC.CC.CCID,
	}

	// TODO choose between Move and Load based on the actual situation.

	if _, ok := jobFiles.Dataset.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Dataset.IsLoaded {
		scheme.Dataset.Action = jobmod.ActionLoad
	} else {
		scheme.Dataset.Action = jobmod.ActionNo
	}

	if _, ok := jobFiles.Code.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Code.IsLoaded {
		scheme.Code.Action = jobmod.ActionLoad
	} else {
		scheme.Code.Action = jobmod.ActionNo
	}

	if _, ok := jobFiles.Image.(*schsdk.PackageJobFileInfo); ok && !targetCC.Files.Image.IsLoaded {
		scheme.Image.Action = jobmod.ActionImportImage
	} else {
		scheme.Image.Action = jobmod.ActionNo
	}

	return scheme
}

// findResuorce returns the first resource in all whose concrete type is T, or
// T's zero value if none matches.
func findResuorce[T uopsdk.ResourceData](all []pcmsch.ResourceData) T {
	for _, data := range all {
		if ret, ok := data.(T); ok {
			return ret
		}
	}

	var def T
	return def
}

// findJobInfo returns the job with the given local job ID if its concrete type
// is T, or T's zero value if none matches.
func findJobInfo[T schsdk.JobInfo](jobs []schsdk.JobInfo, localJobID string) T {
	for _, job := range jobs {
		if ret, ok := job.(T); ok && job.GetLocalJobID() == localJobID {
			return ret
		}
	}

	var def T
	return def
}
@ -0,0 +1,14 @@
package prescheduler2

import (
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type PreScheduler interface {
	ScheduleJobSet(info *schsdk.JobSetInfo, allCC []schmod.ComputingCenter) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error)
	ScheduleJob(priority []sch.ResourcePriority, clusterMapping map[schsdk.ClusterID]cdssdk.StorageID) (*schsdk.ClusterID, error)
}
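A minimal, self-contained sketch of how the priority matching and weighted-resource sorting above compose into a cluster choice. The types and weight constants here are simplified stand-ins mirroring the code above, not the real pcmsch SDK API:

package main

import (
	"fmt"
	"sort"
)

// Simplified stand-ins for the pcmsch types used by the pre-scheduler.
type Priority struct{ Options []string }

type Resource struct {
	Type      string  // e.g. "CPU", "GPU", "DCU"
	Available float64 // remaining amount
}

type Cluster struct {
	ID        string
	Region    string
	Resources []Resource
}

// Assumed weights; the real constants are defined elsewhere in the package.
const (
	cpuWeight = 1.0
	stgWeight = 0.1
)

// remaining mirrors getRemainingResources: a weighted sum over resource classes.
func remaining(rs []Resource) float64 {
	var total float64
	for _, r := range rs {
		switch r.Type {
		case "Storage", "Memory":
			total += r.Available * stgWeight
		default:
			total += r.Available * cpuWeight
		}
	}
	return total
}

// match mirrors matchRegion/matchFunction: an empty priority matches everything.
func match(value string, p *Priority) bool {
	if p == nil || len(p.Options) == 0 {
		return true
	}
	for _, opt := range p.Options {
		if opt == value {
			return true
		}
	}
	return false
}

// pick mirrors the selection flow: filter by region, then sort by weighted
// remaining resources and take the front cluster.
func pick(clusters []Cluster, region *Priority) (*Cluster, error) {
	var eligible []Cluster
	for _, c := range clusters {
		if match(c.Region, region) {
			eligible = append(eligible, c)
		}
	}
	if len(eligible) == 0 {
		return nil, fmt.Errorf("no eligible clusters found")
	}
	sort.Slice(eligible, func(i, j int) bool {
		return remaining(eligible[i].Resources) > remaining(eligible[j].Resources)
	})
	return &eligible[0], nil
}

func main() {
	clusters := []Cluster{
		{ID: "c1", Region: "华东区域", Resources: []Resource{{Type: "GPU", Available: 8}}},
		{ID: "c2", Region: "华北区域", Resources: []Resource{{Type: "GPU", Available: 2}}},
	}
	got, err := pick(clusters, &Priority{Options: []string{"华东区域", "华北区域"}})
	if err != nil {
		panic(err)
	}
	fmt.Println(got.ID) // c1: it has more weighted remaining resources
}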
@ -192,3 +192,753 @@
2024-09-26 15:21:41 [WARN] [HTTP:JobSet.Submit] submitting jobset: submitting job set to manager: code: OperationFailed, message: job set 0 is not found
2024-09-26 16:15:22 [WARN] [HTTP:JobSet.Submit] submitting jobset: submitting job set to manager: code: OperationFailed, message: job set 0 is not found
2024-09-26 16:30:52 [WARN] [HTTP:JobSet.Submit] submitting jobset: submitting job set to manager: code: OperationFailed, message: job set 0 is not found
2024-12-05 16:38:44 [FATA] failed to connect to database: dial tcp 101.201.215.196:3306: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-05 16:41:25 [INFO] start serving http at: :7891
2024-12-05 16:48:18 [DEBU] uploading job
2024-12-05 16:51:36 [INFO] start serving http at: :7891
2024-12-05 16:51:44 [DEBU] uploading job
2024-12-05 16:54:57 [WARN] [HTTP:JobSet.LocalFileUploaded] binding body: Key: 'JobSetLocalFileUploadedReq.JobSetID' Error:Field validation for 'JobSetID' failed on the 'required' tag
Key: 'JobSetLocalFileUploadedReq.LocalPath' Error:Field validation for 'LocalPath' failed on the 'required' tag
2024-12-05 16:55:44 [DEBU] uploading job
2024-12-05 16:56:17 [WARN] [HTTP:JobSet.LocalFileUploaded] binding body: Key: 'JobSetLocalFileUploadedReq.JobSetID' Error:Field validation for 'JobSetID' failed on the 'required' tag
Key: 'JobSetLocalFileUploadedReq.LocalPath' Error:Field validation for 'LocalPath' failed on the 'required' tag
2024-12-05 16:58:38 [ERRO]
2024-12-05 16:58:38 [INFO] jobID: %s change state from %s to %s1&{0xc000780db0 test_image.png image [111]} &{0xc000780bc0}
2024-12-05 16:58:38 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 16:58:38 [INFO] [JobID:1] [LastState:*state2.DataUpload] job failed with:
2024-12-05 16:58:38 [INFO] job set 1 completed
2024-12-05 16:58:52 [DEBU] uploading job
2024-12-05 16:59:30 [ERRO]
2024-12-05 16:59:30 [INFO] jobID: %s change state from %s to %s2&{0xc0005aa0c0 test_image.png image [111]} &{0xc000780c30}
2024-12-05 16:59:30 [INFO] [JobID:2] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 16:59:30 [INFO] [JobID:2] [LastState:*state2.DataUpload] job failed with:
2024-12-05 16:59:30 [INFO] job set 2 completed
2024-12-05 16:59:53 [DEBU] uploading job
2024-12-05 17:00:59 [ERRO]
2024-12-05 17:00:59 [INFO] jobID: %s change state from %s to %s3&{0xc0005aa510 test_image.png image [111]} &{0xc0005aa750}
2024-12-05 17:00:59 [INFO] [JobID:3] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 17:00:59 [INFO] [JobID:3] [LastState:*state2.DataUpload] job failed with:
2024-12-05 17:00:59 [INFO] job set 3 completed
2024-12-05 17:01:02 [INFO] start serving http at: :7891
2024-12-05 17:01:12 [DEBU] uploading job
2024-12-05 17:10:17 [ERRO] insert upload data fail: Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-05 17:10:17 [INFO] jobID: %s change state from %s to %s0&{0xc0004212c0 test_image.png image [111]} &{0xc00045e0e0}
2024-12-05 17:10:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 17:10:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-05 17:10:17 [INFO] job set 0 completed
2024-12-10 09:18:52 [INFO] start serving http at: :7891
2024-12-10 14:54:34 [INFO] start serving http at: :7891
2024-12-10 15:05:05 [INFO] start serving http at: :7891
2024-12-10 15:06:50 [DEBU] submitting job
2024-12-10 15:06:50 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:06:53 [INFO] start serving http at: :7891
2024-12-10 15:07:54 [DEBU] submitting job
2024-12-10 15:07:54 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:07:58 [INFO] start serving http at: :7891
2024-12-10 15:10:34 [DEBU] submitting job
2024-12-10 15:10:34 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:10:42 [INFO] start serving http at: :7891
2024-12-10 15:12:31 [DEBU] submitting job
2024-12-10 15:12:31 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:12:33 [INFO] start serving http at: :7891
2024-12-10 15:13:32 [DEBU] submitting job
2024-12-10 15:13:32 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:13:35 [INFO] start serving http at: :7891
2024-12-10 15:14:39 [DEBU] submitting job
2024-12-10 15:14:39 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:14:41 [INFO] start serving http at: :7891
2024-12-10 15:19:47 [DEBU] submitting job
2024-12-10 15:19:47 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:19:51 [INFO] start serving http at: :7891
2024-12-10 15:20:31 [INFO] start serving http at: :7891
2024-12-10 15:32:43 [INFO] start serving http at: :7891
2024-12-10 15:33:00 [DEBU] submitting job
2024-12-10 15:35:12 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:35:13 [INFO] start serving http at: :7891
2024-12-10 15:35:36 [DEBU] submitting job
2024-12-10 15:36:33 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:36:35 [INFO] start serving http at: :7891
2024-12-10 15:36:44 [DEBU] submitting job
2024-12-10 15:37:11 [ERRO] no upload data
2024-12-10 15:40:24 [INFO] start serving http at: :7891
2024-12-10 15:40:55 [DEBU] uploading job
2024-12-10 15:40:55 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: Error 1146 (42S02): Table 'scheduler.clustermapping' doesn't exist
2024-12-10 15:42:19 [INFO] start serving http at: :7891
2024-12-10 15:42:50 [DEBU] uploading job
2024-12-10 15:42:50 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 15:45:43 [INFO] start serving http at: :7891
2024-12-10 15:45:52 [DEBU] uploading job
2024-12-10 15:45:52 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 15:46:26 [INFO] start serving http at: :7891
2024-12-10 15:46:33 [DEBU] uploading job
2024-12-10 15:46:33 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.executorID' in 'field list'
2024-12-10 15:46:33 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.executorID' in 'field list'
2024-12-10 15:50:07 [INFO] start serving http at: :7891
2024-12-10 15:50:15 [DEBU] uploading job
2024-12-10 15:50:15 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.executorURL' in 'field list'
2024-12-10 15:50:15 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.executorURL' in 'field list'
2024-12-10 15:52:31 [INFO] start serving http at: :7891
2024-12-10 15:52:36 [DEBU] uploading job
2024-12-10 15:52:36 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-10 15:52:36 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-10 16:02:11 [INFO] start serving http at: :7891
2024-12-10 16:02:15 [DEBU] uploading job
2024-12-10 16:02:15 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:05:46 [INFO] start serving http at: :7891
2024-12-10 16:05:59 [DEBU] uploading job
2024-12-10 16:05:59 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:06:09 [DEBU] uploading job
2024-12-10 16:06:09 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:07:13 [DEBU] uploading job
2024-12-10 16:10:39 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:10:41 [INFO] start serving http at: :7891
2024-12-10 16:10:46 [DEBU] uploading job
2024-12-10 16:11:22 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:14:01 [INFO] start serving http at: :7891
2024-12-10 16:14:06 [DEBU] uploading job
2024-12-10 16:20:51 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: Get "http://121.36.5.116:7890/queryResources": dial tcp 121.36.5.116:7890: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-10 16:20:55 [INFO] start serving http at: :7891
2024-12-10 16:21:00 [DEBU] uploading job
2024-12-10 16:23:03 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:23:27 [INFO] start serving http at: :7891
2024-12-10 16:23:32 [DEBU] uploading job
2024-12-10 16:26:44 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:26:47 [INFO] start serving http at: :7891
2024-12-10 16:26:53 [DEBU] uploading job
2024-12-10 16:37:01 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:38:15 [INFO] start serving http at: :7891
2024-12-10 16:38:28 [DEBU] uploading job
2024-12-10 16:50:06 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:50:08 [INFO] start serving http at: :7891
2024-12-10 16:50:14 [DEBU] uploading job
2024-12-10 16:54:56 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:54:58 [INFO] start serving http at: :7891
2024-12-10 16:55:08 [DEBU] uploading job
2024-12-10 16:57:27 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:57:29 [INFO] start serving http at: :7891
2024-12-10 16:57:36 [DEBU] uploading job
2024-12-10 17:03:50 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 17:03:53 [INFO] start serving http at: :7891
2024-12-10 17:03:57 [DEBU] uploading job
2024-12-10 17:04:12 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field ClusterDetail.data.data.ClusterId of type schsdk.ClusterID
2024-12-10 17:04:23 [INFO] start serving http at: :7891
2024-12-10 17:04:26 [DEBU] uploading job
2024-12-10 17:05:08 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field ClusterDetail.data.data.ClusterId of type schsdk.ClusterID
2024-12-10 17:05:10 [INFO] start serving http at: :7891
2024-12-10 17:05:15 [DEBU] uploading job
2024-12-10 17:11:05 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field response[[]gitlink.org.cn/cloudream/common/sdks/pcmscheduler.ClusterDetail].data of type []sch.ClusterDetail
2024-12-10 17:11:08 [INFO] start serving http at: :7891
2024-12-10 17:11:21 [DEBU] uploading job
2024-12-10 17:13:01 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field ClusterDetail.data.data.ClusterId of type schsdk.ClusterID
2024-12-10 17:13:15 [INFO] start serving http at: :7891
2024-12-10 17:13:22 [DEBU] uploading job
2024-12-10 17:13:28 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.data.Resource of type sch.ResourceData
2024-12-10 17:13:37 [DEBU] uploading job
2024-12-10 17:42:29 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.data.Resource of type sch.ResourceData
2024-12-11 15:04:03 [INFO] start serving http at: :7891
2024-12-11 15:07:00 [INFO] start serving http at: :7891
2024-12-11 15:09:01 [DEBU] submitting job
2024-12-11 15:09:40 [ERRO] no upload data
2024-12-11 15:09:40 [INFO] jobID: %s change state from %s to %s0&{0xc0001d0000} &{0xc00023a150}
2024-12-11 15:09:40 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-11 15:09:40 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: no upload data
2024-12-11 15:09:40 [INFO] job set 0 completed
2024-12-11 15:10:07 [DEBU] uploading job
2024-12-11 15:10:24 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.data.Resource of type sch.ResourceData
2024-12-11 15:12:21 [DEBU] uploading job
2024-12-11 15:12:21 [WARN] cluster 2 not found
2024-12-11 15:12:21 [WARN] cluster 3 not found
2024-12-11 15:13:30 [DEBU] uploading job
2024-12-11 15:13:30 [WARN] cluster 2 not found
2024-12-11 15:13:30 [WARN] cluster 3 not found
2024-12-11 15:14:45 [DEBU] uploading job
2024-12-11 15:16:41 [ERRO] insert upload data fail: sql: Scan called without calling Next
2024-12-11 15:16:41 [INFO] jobID: %s change state from %s to %s3&{0xc00059e290 test_image.png image [1 2]} &{0xc00022c1e0}
2024-12-11 15:16:41 [INFO] [JobID:3] state changed: *state2.DataUpload -> *state.Completed
2024-12-11 15:16:41 [INFO] [JobID:3] [LastState:*state2.DataUpload] job failed with: insert upload data fail: sql: Scan called without calling Next
2024-12-11 15:16:41 [INFO] job set 3 completed
2024-12-11 15:21:10 [INFO] start serving http at: :7891
2024-12-11 15:21:20 [DEBU] uploading job
2024-12-11 15:22:11 [INFO] jobID: %s change state from %s to %s0&{0xc0006a88c0 test_image.png image [1 2]} &{<nil>}
2024-12-11 15:22:11 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-11 15:22:11 [INFO] [JobID:0] job completed successfuly
2024-12-11 15:22:11 [INFO] job set 0 completed
2024-12-11 15:24:59 [DEBU] uploading job
2024-12-11 15:28:32 [INFO] jobID: %s change state from %s to %s1&{0xc00053a0d0 test_image.png image [1 2]} &{<nil>}
2024-12-11 15:28:32 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-11 15:28:32 [INFO] [JobID:1] job completed successfuly
2024-12-11 15:28:32 [INFO] job set 1 completed
2024-12-11 15:38:34 [DEBU] uploading job
2024-12-11 15:45:34 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-11 15:47:21 [DEBU] submitting job
2024-12-11 16:36:15 [INFO] start serving http at: :7891
2024-12-11 16:37:38 [DEBU] submitting job
2024-12-11 16:40:34 [ERRO] create task: unknow response content type: text/plain; charset=utf-8
2024-12-11 16:40:34 [INFO] jobID: %s change state from %s to %s0&{0xc00018e0e0} &{0xc0001da3c0}
2024-12-11 16:40:34 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-11 16:40:34 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: create task: unknow response content type: text/plain; charset=utf-8
2024-12-11 16:40:34 [INFO] job set 0 completed
2024-12-11 16:58:56 [WARN] [HTTP:JobSet.GetServiceList] binding body: strconv.ParseInt: parsing "[1,5]": invalid syntax
2024-12-11 16:59:28 [INFO] start serving http at: :7891
2024-12-11 17:00:35 [WARN] [HTTP:JobSet.GetServiceList] binding body: Key: 'QueryUploadedReq.DataType' Error:Field validation for 'DataType' failed on the 'required' tag
Key: 'QueryUploadedReq.UserID' Error:Field validation for 'UserID' failed on the 'required' tag
2024-12-11 17:01:19 [INFO] start serving http at: :7891
2024-12-11 17:01:29 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:02:19 [INFO] start serving http at: :7891
2024-12-11 17:02:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:08:14 [INFO] start serving http at: :7891
2024-12-11 17:08:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:15:22 [INFO] start serving http at: :7891
2024-12-11 17:15:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:17:08 [INFO] start serving http at: :7891
2024-12-11 17:17:11 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:20:27 [INFO] start serving http at: :7891
2024-12-11 17:20:32 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:21:05 [INFO] start serving http at: :7891
2024-12-11 17:21:08 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:21:49 [INFO] start serving http at: :7891
2024-12-11 17:21:51 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.uploaded_clusters' doesn't exist
2024-12-11 17:22:21 [INFO] start serving http at: :7891
2024-12-11 17:22:23 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:23:02 [INFO] start serving http at: :7891
2024-12-11 17:23:05 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:26:19 [INFO] start serving http at: :7891
2024-12-11 17:26:21 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:26:53 [INFO] start serving http at: :7891
2024-12-11 17:26:57 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:28:05 [INFO] start serving http at: :7891
2024-12-11 17:28:06 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:29:20 [INFO] start serving http at: :7891
2024-12-11 17:29:34 [WARN] [HTTP:JobSet.GetServiceList] getting service list: clusters: unsupported relations for schema UploadedData
2024-12-11 17:30:25 [INFO] start serving http at: :7891
2024-12-11 17:31:05 [INFO] start serving http at: :7891
2024-12-11 17:31:07 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:31:39 [INFO] start serving http at: :7891
2024-12-11 17:31:41 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:31:55 [INFO] start serving http at: :7891
2024-12-11 17:33:56 [INFO] start serving http at: :7891
2024-12-12 09:39:30 [DEBU] uploading job
2024-12-12 09:39:30 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 09:41:57 [INFO] start serving http at: :7891
2024-12-12 09:42:01 [DEBU] uploading job
2024-12-12 09:42:01 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 09:53:16 [DEBU] uploading job
2024-12-12 09:54:11 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 09:54:26 [DEBU] uploading job
2024-12-12 10:00:14 [INFO] start serving http at: :7891
2024-12-12 10:00:21 [DEBU] uploading job
2024-12-12 10:00:22 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:00:43 [INFO] start serving http at: :7891
2024-12-12 10:00:47 [DEBU] uploading job
2024-12-12 10:00:48 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:00:54 [DEBU] uploading job
2024-12-12 10:00:55 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:01:04 [INFO] start serving http at: :7891
2024-12-12 10:01:51 [INFO] start serving http at: :7891
2024-12-12 10:02:06 [DEBU] uploading job
2024-12-12 10:02:06 [WARN] getting all computing center: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:02:06 [WARN] [HTTP:JobSet.Upload] uploading file: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:04:35 [INFO] start serving http at: :7891
2024-12-12 10:04:41 [DEBU] uploading job
2024-12-12 10:04:41 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:06:41 [DEBU] uploading job
2024-12-12 10:07:47 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:09:29 [DEBU] uploading job
2024-12-12 10:12:19 [INFO] start serving http at: :7891
2024-12-12 10:12:30 [DEBU] uploading job
2024-12-12 10:12:30 [WARN] getting all computing center: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:12:30 [WARN] [HTTP:JobSet.Upload] uploading file: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:15:08 [INFO] start serving http at: :7891
2024-12-12 10:15:10 [DEBU] uploading job
2024-12-12 10:15:13 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-12 10:15:13 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-12 10:16:19 [INFO] start serving http at: :7891
2024-12-12 10:16:21 [DEBU] uploading job
2024-12-12 10:18:07 [INFO] start serving http at: :7891
2024-12-12 10:18:12 [DEBU] uploading job
2024-12-12 10:18:35 [DEBU] uploading job
2024-12-12 10:18:36 [DEBU] uploading job
2024-12-12 10:18:37 [DEBU] uploading job
2024-12-12 10:18:45 [DEBU] uploading job
2024-12-12 10:24:51 [INFO] start serving http at: :7891
2024-12-12 10:25:03 [DEBU] submitting job
2024-12-12 10:25:03 [ERRO] create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 10:25:03 [INFO] jobID: %s change state from %s to %s0&{0xc0002204d0} &{0xc0001da000}
2024-12-12 10:25:03 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-12 10:25:03 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 10:25:03 [INFO] job set 0 completed
2024-12-12 15:02:30 [INFO] start serving http at: :7891
2024-12-12 15:10:51 [DEBU] submitting job
2024-12-12 15:10:55 [ERRO] create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 15:10:55 [INFO] jobID: %s change state from %s to %s0&{0xc00018c2d0} &{0xc00007e560}
2024-12-12 15:10:55 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-12 15:10:55 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 15:10:55 [INFO] job set 0 completed
2024-12-12 15:11:42 [DEBU] uploading job
2024-12-12 15:15:39 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-12 15:16:10 [INFO] start serving http at: :7891
2024-12-12 15:16:17 [DEBU] uploading job
2024-12-12 15:24:49 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-12 15:24:55 [INFO] start serving http at: :7891
2024-12-12 15:25:02 [DEBU] uploading job
2024-12-12 15:27:38 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-12 15:27:43 [INFO] start serving http at: :7891
2024-12-12 15:27:53 [DEBU] uploading job
2024-12-12 15:31:21 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-12 15:31:27 [INFO] start serving http at: :7891
2024-12-12 15:31:35 [DEBU] uploading job
2024-12-12 15:31:52 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-12 15:31:55 [DEBU] uploading job
2024-12-12 15:39:59 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-12 15:42:02 [INFO] start serving http at: :7891
2024-12-12 15:42:14 [DEBU] uploading job
2024-12-12 15:43:39 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal number into Go struct field TmpResourceData.data.resources.total of type sch.UnitValue[int64]
2024-12-12 15:43:45 [INFO] start serving http at: :7891
2024-12-12 15:43:57 [DEBU] uploading job
2024-12-12 15:44:42 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal number 362398603018.24 into Go struct field TmpResourceData.data.resources.available of type int64
2024-12-12 15:44:48 [INFO] start serving http at: :7891
2024-12-12 15:44:57 [DEBU] uploading job
2024-12-13 09:28:04 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-13 09:28:50 [INFO] start serving http at: :7891
2024-12-13 09:29:15 [DEBU] uploading job
2024-12-13 09:40:16 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 09:40:22 [INFO] start serving http at: :7891
2024-12-13 09:40:27 [DEBU] uploading job
2024-12-13 09:43:16 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 09:43:22 [INFO] start serving http at: :7891
2024-12-13 09:43:26 [DEBU] uploading job
2024-12-13 09:49:21 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field UnitValue[float64].data.resources.total.value of type float64
2024-12-13 09:49:28 [INFO] start serving http at: :7891
2024-12-13 09:49:31 [DEBU] uploading job
2024-12-13 09:50:25 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field UnitValue[float64].data.resources.total.value of type float64
2024-12-13 09:50:32 [INFO] start serving http at: :7891
2024-12-13 09:50:47 [DEBU] uploading job
2024-12-13 09:52:39 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal number 2886.65 into Go struct field UnitValue[int64].data.resources.total.value of type int64
2024-12-13 09:52:46 [INFO] start serving http at: :7891
2024-12-13 09:52:49 [DEBU] uploading job
2024-12-13 09:53:15 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-13 09:53:32 [INFO] start serving http at: :7891
2024-12-13 09:53:39 [DEBU] uploading job
2024-12-13 10:04:48 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 10:04:51 [INFO] start serving http at: :7891
2024-12-13 10:04:54 [DEBU] uploading job
2024-12-13 10:18:24 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 10:24:48 [INFO] start serving http at: :7891
2024-12-13 10:24:56 [DEBU] uploading job
2024-12-13 10:29:33 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-13 10:30:34 [INFO] start serving http at: :7891
2024-12-13 10:30:36 [DEBU] uploading job
2024-12-13 10:38:20 [ERRO] insert upload data fail: Error 1264 (22003): Out of range value for column 'ClusterID' at row 1
2024-12-13 10:38:20 [INFO] jobID: %s change state from %s to %s0&{0xc0001da420 test_image.png image [2]} &{0xc00022d320}
2024-12-13 10:38:20 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 10:38:20 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1264 (22003): Out of range value for column 'ClusterID' at row 1
2024-12-13 10:38:20 [INFO] job set 0 completed
2024-12-13 16:25:25 [DEBU] uploading job
2024-12-13 16:25:25 [WARN] cluster 0 not found
2024-12-13 16:25:25 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:25:31 [DEBU] uploading job
2024-12-13 16:25:31 [WARN] cluster 0 not found
2024-12-13 16:25:31 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:35:36 [DEBU] uploading job
2024-12-13 16:35:36 [WARN] cluster 0 not found
2024-12-13 16:35:36 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:36:09 [INFO] start serving http at: :7891
2024-12-13 16:36:13 [DEBU] uploading job
2024-12-13 16:36:13 [WARN] cluster 0 not found
2024-12-13 16:36:13 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:36:43 [DEBU] uploading job
2024-12-13 16:37:15 [WARN] cluster 0 not found
2024-12-13 16:37:26 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:37:45 [INFO] start serving http at: :7891
2024-12-13 16:37:52 [DEBU] uploading job
2024-12-13 16:37:52 [WARN] cluster 0 not found
2024-12-13 16:37:52 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:50:35 [INFO] start serving http at: :7891
2024-12-13 16:57:17 [DEBU] uploading job
2024-12-13 16:57:17 [ERRO] upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 16:57:17 [INFO] jobID: %s change state from %s to %s0&{0xc000124380 webgl_lines_dashed dataset [2]} &{0xc00007e700}
2024-12-13 16:57:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 16:57:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 16:57:17 [INFO] job set 0 completed
2024-12-13 16:58:20 [DEBU] uploading job
2024-12-13 17:05:42 [ERRO] upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 17:05:43 [INFO] jobID: %s change state from %s to %s1&{0xc000124740 webgl_lines_dashed dataset [2]} &{0xc0001da040}
2024-12-13 17:05:46 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 17:05:46 [INFO] [JobID:1] [LastState:*state2.DataUpload] job failed with: upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 17:05:46 [INFO] job set 1 completed
2024-12-13 17:07:37 [INFO] start serving http at: :7891
2024-12-13 17:07:47 [DEBU] uploading job
2024-12-13 17:07:47 [ERRO] upload data: code: 400, message:
2024-12-13 17:07:47 [INFO] jobID: %s change state from %s to %s0&{0xc000088740 webgl_lines_dashed dataset [2]} &{0xc00007e740}
2024-12-13 17:07:47 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 17:07:47 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: upload data: code: 400, message:
2024-12-13 17:07:47 [INFO] job set 0 completed
2024-12-13 17:33:55 [DEBU] uploading job
2024-12-13 17:33:55 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-16 09:26:34 [INFO] start serving http at: :7891
2024-12-16 09:26:51 [DEBU] uploading job
2024-12-16 09:26:52 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:47:49 [INFO] start serving http at: :7891
2024-12-17 09:48:40 [DEBU] uploading job
2024-12-17 09:52:27 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:52:30 [INFO] start serving http at: :7891
2024-12-17 09:52:41 [DEBU] uploading job
2024-12-17 09:52:43 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:53:03 [DEBU] uploading job
2024-12-17 09:55:15 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:55:17 [INFO] start serving http at: :7891
2024-12-17 09:55:21 [DEBU] uploading job
2024-12-17 09:57:53 [ERRO] insert upload data fail: Error 1364 (HY000): Field 'userID' doesn't have a default value
2024-12-17 09:57:53 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e920 yuque_mind.jpeg dataset [2] {1 0}} &{0xc00022d100}
2024-12-17 09:57:53 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 09:57:53 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1364 (HY000): Field 'userID' doesn't have a default value
2024-12-17 09:57:53 [INFO] job set 0 completed
2024-12-17 09:59:15 [INFO] start serving http at: :7891
2024-12-17 09:59:22 [DEBU] uploading job
2024-12-17 10:03:17 [ERRO] insert blockchains: empty slice found
2024-12-17 10:03:17 [INFO] jobID: %s change state from %s to %s0&{1 0xc00045f740 yuque_mind.jpeg dataset [1] {1 0}} &{0xc00022c1a0}
2024-12-17 10:03:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:03:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: empty slice found
2024-12-17 10:03:17 [INFO] job set 0 completed
2024-12-17 10:03:21 [INFO] start serving http at: :7891
2024-12-17 10:03:36 [DEBU] uploading job
2024-12-17 10:05:04 [ERRO] insert blockchains: empty slice found
2024-12-17 10:05:04 [INFO] jobID: %s change state from %s to %s0&{1 0xc0003513a0 yuque_mind.jpeg dataset [2] {1 0}} &{0xc000351b80}
2024-12-17 10:05:04 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:05:04 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: empty slice found
2024-12-17 10:05:04 [INFO] job set 0 completed
2024-12-17 10:05:18 [INFO] start serving http at: :7891
2024-12-17 10:05:24 [DEBU] uploading job
2024-12-17 10:09:36 [ERRO] insert blockchains: empty slice found
2024-12-17 10:09:36 [INFO] jobID: %s change state from %s to %s0&{1 0xc00019d920 yuque_mind.jpeg dataset [1] {1 0}} &{0xc00027a0c0}
2024-12-17 10:09:36 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:09:36 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: empty slice found
2024-12-17 10:09:36 [INFO] job set 0 completed
2024-12-17 10:09:40 [INFO] start serving http at: :7891
2024-12-17 10:09:50 [DEBU] uploading job
2024-12-17 10:12:11 [INFO] start serving http at: :7891
2024-12-17 10:12:14 [DEBU] uploading job
2024-12-17 10:21:56 [ERRO] blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:21:56 [INFO] jobID: %s change state from %s to %s0&{1 0xc00088ab80 yuque_mind.jpeg dataset [2] {1 0}} &{0xc000138020}
2024-12-17 10:21:56 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:21:56 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:21:56 [INFO] job set 0 completed
2024-12-17 10:22:03 [INFO] start serving http at: :7891
2024-12-17 10:22:12 [DEBU] uploading job
2024-12-17 10:24:36 [ERRO] blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:24:36 [INFO] jobID: %s change state from %s to %s0&{1 0xc0004612e0 yuque_mind.jpeg dataset [2] {1 0}} &{0xc00022c020}
2024-12-17 10:24:36 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:24:36 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:24:36 [INFO] job set 0 completed
2024-12-17 10:24:38 [INFO] start serving http at: :7891
2024-12-17 10:24:44 [DEBU] uploading job
2024-12-17 10:26:17 [ERRO] blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:26:17 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001da460 yuque_mind.jpeg dataset [2] {1 0}} &{0xc000120020}
2024-12-17 10:26:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:26:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:26:17 [INFO] job set 0 completed
2024-12-17 10:26:40 [INFO] start serving http at: :7891
2024-12-17 10:26:49 [DEBU] uploading job
2024-12-17 10:29:39 [ERRO] insert blockchains: Error 1054 (42S22): Unknown column 'data_id' in 'field list'
2024-12-17 10:29:39 [INFO] jobID: %s change state from %s to %s0&{1 0xc0003242c0 yuque_mind.jpeg dataset [1] {1 0}} &{0xc000324520}
2024-12-17 10:29:39 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:29:39 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: Error 1054 (42S22): Unknown column 'data_id' in 'field list'
2024-12-17 10:29:39 [INFO] job set 0 completed
2024-12-17 10:29:43 [INFO] start serving http at: :7891
2024-12-17 10:30:02 [DEBU] uploading job
2024-12-17 10:30:28 [INFO] jobID: %s change state from %s to %s0&{1 0xc000128480 yuque_mind.jpeg dataset [1] {1 0}} &{<nil>}
2024-12-17 10:30:28 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:30:28 [INFO] [JobID:0] job completed successfuly
2024-12-17 10:30:28 [INFO] job set 0 completed
2024-12-17 10:37:45 [INFO] start serving http at: :7891
2024-12-17 10:38:51 [WARN] [HTTP:JobSet.GetServiceList] parsing request body: http.QueryUploadedReq.ClusterIDs: []schsdk.ClusterID: ReadString: expects " or n, but found 1, error found in #10 byte of ...|erIDs": [1, 5]
}|..., bigger context ...|"dataset",
"userID": 1,
"clusterIDs": [1, 5]
}|...
2024-12-17 10:42:06 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:42:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
|
||||
2024-12-17 10:42:31 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
|
||||
2024-12-17 10:46:45 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
|
||||
2024-12-17 10:46:48 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:47:23 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:48:10 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:48:58 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:50:11 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:54:10 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:54:33 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:54:44 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:55:22 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:55:27 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
|
||||
2024-12-17 10:57:12 [INFO] start serving http at: :7891
|
||||
2024-12-17 10:57:17 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
|
||||
2024-12-17 10:58:50 [INFO] start serving http at: :7891
|
||||
2024-12-19 11:06:46 [INFO] start serving http at: :7891
|
||||
2024-12-19 15:38:10 [INFO] start serving http at: :7891
|
||||
2024-12-24 09:49:07 [INFO] start serving http at: :7891
|
||||
2024-12-24 09:56:04 [DEBU] uploading job
|
||||
2024-12-24 09:57:17 [ERRO] insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
|
||||
2024-12-24 09:57:17 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007ea60 yuque_mind.jpeg dataset [3] {1 0}} &{0xc00007eba0}
|
||||
2024-12-24 09:57:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 09:57:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
2024-12-24 09:57:17 [INFO] job set 0 completed
2024-12-24 09:59:07 [INFO] start serving http at: :7891
2024-12-24 09:59:14 [DEBU] uploading job
2024-12-24 10:04:42 [ERRO] insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
2024-12-24 10:04:42 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e8a0 yuque_mind.jpeg dataset [2] {1 0}} &{0xc0004615c0}
2024-12-24 10:04:42 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:04:42 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
2024-12-24 10:04:42 [INFO] job set 0 completed
2024-12-24 10:04:44 [INFO] start serving http at: :7891
2024-12-24 10:06:04 [DEBU] uploading job
2024-12-24 10:09:21 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:09:21 [INFO] jobID: %s change state from %s to %s0&{1 0xc0004603a0 yuque_mind.jpeg dataset [3] {1 0}} &{0xc00007eb40}
2024-12-24 10:09:21 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:09:21 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:09:21 [INFO] job set 0 completed
2024-12-24 10:10:08 [INFO] start serving http at: :7891
2024-12-24 10:11:28 [DEBU] uploading job
2024-12-24 10:12:31 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:12:31 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001fa440 yuque_mind.jpeg dataset [3] {1 0}} &{0xc0001fa000}
2024-12-24 10:12:31 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:12:31 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:12:31 [INFO] job set 0 completed
2024-12-24 10:12:35 [INFO] start serving http at: :7891
2024-12-24 10:12:41 [DEBU] uploading job
2024-12-24 10:14:25 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:14:25 [INFO] jobID: %s change state from %s to %s0&{1 0xc0003534a0 yuque_mind.jpeg dataset [3] {1 0}} &{0xc000120a60}
2024-12-24 10:14:25 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:14:25 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:14:25 [INFO] job set 0 completed
2024-12-24 10:14:27 [INFO] start serving http at: :7891
2024-12-24 10:14:39 [DEBU] uploading job
2024-12-24 10:17:21 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:17:21 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001fa480 yuque_mind.jpeg dataset [3] {1 0}} &{0xc000248180}
2024-12-24 10:17:21 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:17:21 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:17:21 [INFO] job set 0 completed
2024-12-24 10:17:26 [INFO] start serving http at: :7891
2024-12-24 10:17:32 [DEBU] uploading job
2024-12-24 10:25:30 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:25:30 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001fa480 yuque_mind.jpeg dataset [3] {1 0}} &{0xc000249160}
2024-12-24 10:25:30 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:25:30 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:25:30 [INFO] job set 0 completed
2024-12-24 10:25:33 [INFO] start serving http at: :7891
2024-12-24 10:25:40 [DEBU] uploading job
2024-12-24 10:29:18 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:29:18 [INFO] jobID: %s change state from %s to %s0&{1 0xc00019c800 dataset [3] {1 0}} &{0xc00019c3a0}
2024-12-24 10:29:18 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:29:18 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:29:18 [INFO] job set 0 completed
2024-12-24 10:29:50 [INFO] start serving http at: :7891
2024-12-24 10:34:07 [DEBU] uploading job
2024-12-24 10:40:14 [ERRO] insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`uploaddata`, CONSTRAINT `uploaddata_ibfk_1` FOREIGN KEY (`folderID`) REFERENCES `folders` (`id`))
2024-12-24 10:40:14 [INFO] jobID: %s change state from %s to %s0&{1 0xc00050e200 dataset [3] {1 0}} &{0xc000352260}
2024-12-24 10:40:14 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:40:14 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`uploaddata`, CONSTRAINT `uploaddata_ibfk_1` FOREIGN KEY (`folderID`) REFERENCES `folders` (`id`))
2024-12-24 10:40:14 [INFO] job set 0 completed
2024-12-24 10:40:17 [INFO] start serving http at: :7891
2024-12-24 10:40:48 [INFO] start serving http at: :7891
2024-12-24 10:40:55 [DEBU] uploading job
2024-12-24 10:43:04 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001da380 dataset [3] {1 0}} &{<nil>}
2024-12-24 10:43:04 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:43:04 [INFO] [JobID:0] job completed successfuly
2024-12-24 10:43:04 [INFO] job set 0 completed
2024-12-24 10:44:57 [DEBU] uploading job
2024-12-24 10:49:43 [INFO] start serving http at: :7891
2024-12-24 10:49:51 [DEBU] uploading job
2024-12-24 10:50:48 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e920 dataset [3] {1 0}} &{<nil>}
2024-12-24 10:50:48 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:50:48 [INFO] [JobID:0] job completed successfuly
2024-12-24 10:50:48 [INFO] job set 0 completed
2024-12-24 10:52:43 [DEBU] uploading job
2024-12-24 10:52:50 [DEBU] uploading job
2024-12-24 10:52:53 [DEBU] uploading job
2024-12-24 10:53:05 [INFO] jobID: %s change state from %s to %s1&{1 0xc00007e720 dataset [3] {1 0}} &{<nil>}
2024-12-24 10:53:05 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:53:05 [INFO] [JobID:1] job completed successfuly
2024-12-24 10:53:05 [INFO] job set 1 completed
2024-12-24 11:00:58 [INFO] start serving http at: :7891
2024-12-24 11:01:03 [DEBU] uploading job
2024-12-24 11:01:06 [DEBU] uploading job
2024-12-24 11:01:10 [DEBU] uploading job
2024-12-24 11:01:32 [INFO] jobID: %s change state from %s to %s0&{1 0xc00022d220 dataset [3] {1 0}} &{<nil>}
2024-12-24 11:01:32 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 11:01:32 [INFO] [JobID:0] job completed successfuly
2024-12-24 11:01:32 [INFO] job set 0 completed
2024-12-24 15:37:05 [INFO] start serving http at: :7891
2024-12-24 15:37:36 [DEBU] uploading job
2024-12-24 15:37:42 [DEBU] uploading job
2024-12-24 15:37:49 [DEBU] uploading job
2024-12-24 15:38:00 [INFO] jobID: %s change state from %s to %s0&{1 0xc000531d80 dataset [3] {1 0}} &{<nil>}
2024-12-24 15:38:00 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:00 [INFO] [JobID:0] job completed successfuly
2024-12-24 15:38:00 [INFO] job set 0 completed
2024-12-24 15:38:19 [INFO] jobID: %s change state from %s to %s1&{1 0xc0001da0a0 dataset [3] {1 0}} &{<nil>}
2024-12-24 15:38:19 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:19 [INFO] [JobID:1] job completed successfuly
2024-12-24 15:38:19 [INFO] job set 1 completed
2024-12-24 15:38:23 [INFO] jobID: %s change state from %s to %s2&{1 0xc00070fa80 dataset [3] {1 0}} &{<nil>}
2024-12-24 15:38:23 [INFO] [JobID:2] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:23 [INFO] [JobID:2] job completed successfuly
2024-12-24 15:38:23 [INFO] job set 2 completed
2024-12-24 15:38:45 [DEBU] uploading job
2024-12-24 15:38:53 [ERRO] insert upload data fail: data already exists
2024-12-24 15:38:53 [INFO] jobID: %s change state from %s to %s3&{1 0xc000460580 dataset [3] {1 0}} &{0xc00007ed00}
2024-12-24 15:38:53 [INFO] [JobID:3] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:53 [INFO] [JobID:3] [LastState:*state2.DataUpload] job failed with: insert upload data fail: data already exists
2024-12-24 15:38:53 [INFO] job set 3 completed
2024-12-24 15:48:42 [INFO] start serving http at: :7891
2024-12-24 16:06:49 [INFO] start serving http at: :7891
2024-12-24 16:07:38 [INFO] start serving http at: :7891
2024-12-24 16:09:14 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: Error 1054 (42S22): Unknown column 'queryType' in 'field list'
2024-12-24 16:09:17 [INFO] start serving http at: :7891
2024-12-24 16:13:11 [INFO] start serving http at: :7891
2024-12-24 16:13:16 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:15:11 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:15:14 [INFO] start serving http at: :7891
2024-12-24 16:15:25 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:22:14 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:22:18 [INFO] start serving http at: :7891
2024-12-24 16:28:26 [INFO] start serving http at: :7891
2024-12-24 17:09:10 [INFO] start serving http at: :7891
2024-12-25 17:34:34 [INFO] start serving http at: :7891
2024-12-25 17:35:57 [WARN] [HTTP:JobSet.CreateFolder] creating folder: folder already exists
2024-12-25 17:36:22 [INFO] start serving http at: :7891
2024-12-25 17:36:26 [WARN] [HTTP:JobSet.CreateFolder] creating folder: folder already exists
2024-12-26 16:52:34 [INFO] start serving http at: :7891
2024-12-26 16:53:03 [DEBU] uploading job
2024-12-26 16:53:23 [ERRO] insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:53:23 [INFO] jobID: %s change state from %s to %s0&{1 0xc00080b680 dataset [3] {1 0}} &{0xc0001da020}
2024-12-26 16:53:23 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-26 16:53:23 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:53:23 [INFO] job set 0 completed
2024-12-26 16:54:36 [DEBU] uploading job
2024-12-26 16:59:19 [ERRO] insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:59:19 [INFO] jobID: %s change state from %s to %s1&{1 0xc00007e680 dataset [3] {1 0}} &{0xc00080a020}
2024-12-26 16:59:19 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-26 16:59:19 [INFO] [JobID:1] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:59:19 [INFO] job set 1 completed
2024-12-26 16:59:22 [INFO] start serving http at: :7891
2024-12-26 16:59:31 [DEBU] uploading job
2024-12-26 17:00:20 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e800 dataset [3] {1 0}} &{<nil>}
2024-12-26 17:00:20 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-26 17:00:20 [INFO] [JobID:0] job completed successfuly
2024-12-26 17:00:20 [INFO] job set 0 completed
2024-12-26 17:02:04 [INFO] start serving http at: :7891
2024-12-30 11:06:44 [INFO] start serving http at: :7891
2024-12-30 11:12:35 [INFO] start serving http at: :7891
2024-12-30 11:14:53 [INFO] start serving http at: :7891
2024-12-30 11:19:30 [INFO] start serving http at: :7891
2024-12-30 11:21:17 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://120.46.183.86:32010/object/list?isPrefix=true&packageID=1&path=objects&userID=1": dial tcp 120.46.183.86:32010: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-30 11:27:07 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://120.46.183.86:32010/object/list?isPrefix=true&packageID=1&path=objects&userID=1": dial tcp 120.46.183.86:32010: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-30 11:27:09 [INFO] start serving http at: :7891
2024-12-30 11:28:58 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://localhost:32010/object/list?isPrefix=true&packageID=1&path=objects&userID=1": read tcp [::1]:19147->[::1]:32010: wsarecv: An existing connection was forcibly closed by the remote host.
2024-12-30 11:36:50 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: code: OperationFailed, message: listing objects: requsting to coodinator: code: OperationFailed, message: get objects with prefix failed
2024-12-30 11:39:18 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Error 1054 (42S22): Unknown column 'package_id' in 'where clause'
2024-12-30 14:37:12 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Error 1054 (42S22): Unknown column 'package_id' in 'where clause'
2024-12-30 14:37:15 [INFO] start serving http at: :7891
2024-12-30 14:37:43 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://localhost:32010/object/list?isPrefix=true&packageID=916&path=objects&userID=1": dial tcp [::1]:32010: connectex: No connection could be made because the target machine actively refused it.
2024-12-30 14:39:50 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: sql: expected 1 arguments, got 2
2024-12-30 14:39:53 [INFO] start serving http at: :7891
2024-12-30 14:40:40 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: sql: expected 1 arguments, got 2
2024-12-30 14:40:43 [INFO] start serving http at: :7891
2024-12-30 14:42:13 [INFO] start serving http at: :7891
2024-12-30 14:44:12 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:44:16 [INFO] start serving http at: :7891
2024-12-30 14:45:40 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:45:42 [INFO] start serving http at: :7891
2024-12-30 14:46:35 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:46:39 [INFO] start serving http at: :7891
2024-12-30 14:47:24 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:47:26 [INFO] start serving http at: :7891
2024-12-30 14:52:55 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:52:58 [INFO] start serving http at: :7891
2024-12-30 14:54:02 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:54:04 [INFO] start serving http at: :7891
2024-12-30 14:54:46 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:54:48 [INFO] start serving http at: :7891
2024-12-30 14:56:02 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:56:05 [INFO] start serving http at: :7891
2024-12-30 14:58:39 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:58:42 [INFO] start serving http at: :7891
2024-12-30 14:59:05 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:59:08 [INFO] start serving http at: :7891
2024-12-30 15:01:28 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:01:31 [INFO] start serving http at: :7891
2024-12-30 15:07:59 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:08:03 [INFO] start serving http at: :7891
2024-12-30 15:09:46 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:09:48 [INFO] start serving http at: :7891
2024-12-30 15:10:54 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:15:07 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:15:10 [INFO] start serving http at: :7891
2024-12-30 15:18:20 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:18:24 [INFO] start serving http at: :7891
2024-12-30 15:18:41 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:18:43 [INFO] start serving http at: :7891
2024-12-30 15:18:51 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:20:34 [INFO] start serving http at: :7891
2024-12-30 15:20:48 [INFO] start serving http at: :7891
2024-12-30 15:24:58 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:25:01 [INFO] start serving http at: :7891
2024-12-30 15:27:06 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:27:29 [INFO] start serving http at: :7891
2024-12-30 15:28:34 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:29:00 [INFO] start serving http at: :7891
2024-12-30 15:29:54 [INFO] start serving http at: :7891
2024-12-30 15:30:02 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:30:43 [INFO] start serving http at: :7891
2024-12-30 15:30:58 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: uploadedCluster: unsupported relations for schema Package2
2024-12-30 15:31:01 [INFO] start serving http at: :7891
2024-12-30 15:32:36 [INFO] start serving http at: :7891
2024-12-30 15:37:24 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:37:27 [INFO] start serving http at: :7891
2024-12-30 15:38:49 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:38:52 [INFO] start serving http at: :7891
2024-12-30 15:40:15 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:40:18 [INFO] start serving http at: :7891
2024-12-30 15:51:15 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:51:24 [INFO] start serving http at: :7891
2024-12-30 15:57:38 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:58:12 [INFO] start serving http at: :7891
2024-12-30 16:18:58 [INFO] start serving http at: :7891
2024-12-30 16:44:22 [INFO] start serving http at: :7891
2024-12-30 16:46:14 [WARN] [HTTP:JobSet.CreateFolder] creating folder: invalid serializer type union
2024-12-30 16:47:07 [INFO] start serving http at: :7891
2024-12-30 16:47:34 [INFO] start serving http at: :7891
2024-12-30 16:47:37 [WARN] [HTTP:JobSet.CreateFolder] creating folder: failed to create package: code: DataExists, message: package already exists
2024-12-30 16:47:46 [WARN] [HTTP:JobSet.CreateFolder] creating folder: invalid serializer type union
2024-12-30 16:49:00 [INFO] start serving http at: :7891
2024-12-30 16:50:56 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: invalid serializer type union
2024-12-30 16:53:38 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: invalid serializer type union
2024-12-30 16:53:48 [INFO] start serving http at: :7891
2024-12-30 16:54:49 [INFO] start serving http at: :7891
2024-12-30 17:00:28 [WARN] [HTTP:JobSet.CreateFolder] creating folder: failed to create package: code: DataExists, message: package already exists
2024-12-30 17:01:06 [WARN] [HTTP:JobSet.CreateFolder] creating folder: failed to create package: code: DataExists, message: package already exists
2024-12-30 17:02:00 [INFO] start serving http at: :7891
2024-12-30 17:03:55 [WARN] [HTTP:JobSet.CreateFolder] parsing request body: http.PackageDelete.ReadString: expects " or n, but found }, error found in #10 byte of ...|: 1042,
}|..., bigger context ...|{
"userID": 1,
"packageID": 1042,
}|...
2024-12-30 17:04:00 [WARN] [HTTP:JobSet.CreateFolder] creating folder: invalid serializer type union
2024-12-30 17:04:35 [INFO] start serving http at: :7891
2024-12-30 17:13:24 [INFO] start serving http at: :7891
2024-12-30 17:14:58 [INFO] start serving http at: :7891
2024-12-30 17:21:17 [INFO] start serving http at: :7891
2024-12-30 17:22:16 [INFO] start serving http at: :7891
2024-12-30 17:26:49 [INFO] start serving http at: :7891
2024-12-30 17:29:04 [INFO] start serving http at: :7891
2024-12-30 17:30:27 [INFO] start serving http at: :7891
2024-12-30 17:34:22 [INFO] start serving http at: :7891
2024-12-30 17:35:43 [INFO] start serving http at: :7891
2024-12-31 09:00:26 [INFO] start serving http at: :7891
2024-12-31 09:00:33 [WARN] [HTTP:JobSet.DeleteFolder] creating folder: failed to delete object: Get "http://localhost:32010/object/list?isPrefix=true&packageID=1045&path=%2Fpath%2Ftest%2F3&userID=1": dial tcp [::1]:32010: connectex: No connection could be made because the target machine actively refused it.
2024-12-31 09:01:11 [WARN] [HTTP:JobSet.DeleteFolder] creating folder: failed to delete object: Get "http://localhost:32010/object/list?isPrefix=true&packageID=1045&path=%2Fpath%2Ftest%2F3&userID=1": dial tcp [::1]:32010: connectex: No connection could be made because the target machine actively refused it.
2024-12-31 09:03:02 [WARN] [HTTP:JobSet.DeleteFolder] creating folder: failed to delete object: code: BadArgument, message: missing argument or invalid argument
2024-12-31 09:03:11 [INFO] start serving http at: :7891
2024-12-31 09:05:52 [DEBU] uploading job
2024-12-31 09:08:16 [INFO] start serving http at: :7891
2024-12-31 09:08:32 [DEBU] uploading job
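The Error 1452 and Error 1292 entries above pin down the shape of the two tables involved. A reconstruction in the commit's own gorm style follows; only the table names, constraint names, and columns named in the error messages come from the log, every other field is an assumption:

package model

import "time"

// Folder is a sketch of scheduler.folders as implied by the log errors.
type Folder struct {
	ID int64 `gorm:"column:id;primaryKey"`
	// folders_ibfk_1: parentID must reference an existing folders.id
	// (or be NULL), which is what the Error 1452 inserts violated.
	ParentID *int64 `gorm:"column:parentID"`
	// Error 1292 rejected both '0000-00-00' and '' here, so the inserter
	// must supply a real timestamp instead of a zero value. Which table
	// actually owns createTime is not stated in the errors.
	CreateTime time.Time `gorm:"column:createTime"`
}

// UploadData is a sketch of scheduler.uploaddata.
type UploadData struct {
	ID int64 `gorm:"column:id;primaryKey"`
	// uploaddata_ibfk_1: folderID references folders.id.
	FolderID int64 `gorm:"column:folderID"`
}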
@ -64,6 +64,9 @@ func Bin() error {
	if err := Manager(); err != nil {
		return err
	}
	if err := ScheduleMiddleware(); err != nil {
		return err
	}

	return nil
}
@ -95,7 +98,7 @@ func Confs() error {
	confDir := "./common/assets/confs"

	info, err := os.Stat(confDir)
	if errors.Is(err, os.ErrNotExist) {
		fmt.Printf("no confs.\n")
		return nil
	}
@ -158,3 +161,12 @@ func Manager() error {
		EntryFile: "manager/main.go",
	})
}

func ScheduleMiddleware() error {
	return magefiles.Build(magefiles.BuildArgs{
		OutputName: "schedulerMiddleware",
		OutputDir:  "schedulerMiddleware",
		AssetsDir:  "assets",
		EntryFile:  "schedulerMiddleware/main.go",
	})
}
@ -44,7 +44,7 @@ type InstanceCreateInfo struct {
}

type InstanceUpdateInfo struct {
	serder.Metadata `union:"Update"`
	serder.Metadata `union:"UpdatePackage"`
	InstanceInfoBase
	Type string                            `json:"type"`
	Info schsdk.UpdateMultiInstanceJobInfo `json:"info"`
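The union tag rename above is consistent with the repeated "invalid serializer type union" warnings in the log: when two variants register under the same union key, a tag-discriminated decoder cannot tell them apart. A minimal, self-contained sketch of that general pattern, written against plain encoding/json with hypothetical type names (this is not the serder API):

package main

import (
	"encoding/json"
	"fmt"
)

// envelope carries the discriminator a union decoder dispatches on.
type envelope struct {
	Type string          `json:"type"`
	Info json.RawMessage `json:"info"`
}

type createInfo struct {
	Name string `json:"name"`
}

type updateInfo struct {
	Replicas int `json:"replicas"`
}

// decode picks the concrete variant by the discriminator value. If two
// variants had registered the same key (e.g. both "Update"), dispatch here
// would be ambiguous -- hence renaming one to a unique "UpdatePackage".
func decode(data []byte) (any, error) {
	var env envelope
	if err := json.Unmarshal(data, &env); err != nil {
		return nil, err
	}
	switch env.Type {
	case "Create":
		var v createInfo
		return &v, json.Unmarshal(env.Info, &v)
	case "UpdatePackage":
		var v updateInfo
		return &v, json.Unmarshal(env.Info, &v)
	default:
		return nil, fmt.Errorf("invalid serializer type union: %s", env.Type)
	}
}

func main() {
	v, err := decode([]byte(`{"type":"UpdatePackage","info":{"replicas":2}}`))
	fmt.Println(v, err)
}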
@ -58,7 +58,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}
@ -227,7 +227,7 @@ func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRu
	// }

	// // TODO image name
	// err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
	// err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.DefCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
	// if err != nil {
	// 	return fmt.Errorf("creating image info: %w", err)
	// }
@ -6,7 +6,6 @@ import (
	"path/filepath"
	"time"

	"github.com/samber/lo"
	"gitlink.org.cn/cloudream/common/pkgs/future"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
@ -60,11 +59,11 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	case *job.NormalJob:
		switch runningJob.SubType {
		case schsdk.JobTypeNormal: // normal job
			pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), runningJob.Files.Image.ImageID, runningJob.TargetCCID)
			pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.DefCtx(), runningJob.Files.Image.ImageID, runningJob.TargetCCID)
			if err != nil {
				return fmt.Errorf("getting pcm image info: %w", err)
			}
			ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), runningJob.TargetCCID)
			ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.DefCtx(), runningJob.TargetCCID)
			if err != nil {
				return fmt.Errorf("getting computing center resource: %w", err)
			}
@ -72,12 +71,12 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
				return fmt.Errorf("no resource found at computing center %v", runningJob.TargetCCID)
			}

			ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
			ccInfo, _, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
			if err != nil {
				return fmt.Errorf("getting storage info: %w", err)
			}
			dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
			err = s.submitNormalTask(rtx, cmd, envs, *ccInfo, pcmImgInfo, ress[0].PCMResourceID)
			if err != nil {
				logger.Error(err.Error())
@ -89,7 +88,7 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
				return fmt.Errorf("getting storage info: %w", err)
			}
			dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
			instID, err := s.submitDataPreprocessTask(rtx, cmd, envs, *ccInfo, getStg.StorageID, userID)
			if err != nil {
				logger.Error(err.Error())
@ -110,7 +109,7 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
					return fmt.Errorf("loading dataset package: %w", err)
				}
			}
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
			err = s.submitFinetuningTask(userID, rtx, cmd, envs, *ccInfo, getStg.StorageID, runningJob)
			if err != nil {
				logger.Error(err.Error())
@ -124,7 +123,7 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
				return fmt.Errorf("getting storage info: %w", err)
			}
			dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
			_, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
			_, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
			err = s.submitInstanceTask(rtx, jo, runningJob, *ccInfo, getStg.StorageID, userID, envs)
			if err != nil {
				logger.Error(err.Error())
@ -203,7 +202,7 @@ func (s *JobExecuting) submitNormalTask(rtx jobmgr.JobStateRunContext, cmd strin
}

func (s *JobExecuting) submitDataPreprocessTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, userID cdssdk.UserID) (string, error) {
	objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID)
	objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
	if err != nil {
		logger.Error(err.Error())
		return "", fmt.Errorf("getting object storage info: %w", err)
@ -373,14 +372,14 @@ func (s *JobExecuting) submitInstanceTask(rtx jobmgr.JobStateRunContext, jo *job
}

func getModelInfoAndObjectStorage(rtx jobmgr.JobStateRunContext, modelID schsdk.ModelID, storageID cdssdk.StorageID) (*schmod.ObjectStorage, *schmod.ModelResource, error) {
	objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID)
	objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
	if err != nil {
		logger.Error(err.Error())
		return nil, nil, fmt.Errorf("getting object storage info: %w", err)
	}

	// First check in the database whether the model has already been preloaded
	modelInfo, err := rtx.Mgr.DB.Models().GetModelByID(rtx.Mgr.DB.SQLCtx(), modelID, objectStorage.ID)
	modelInfo, err := rtx.Mgr.DB.Models().GetModelByID(rtx.Mgr.DB.DefCtx(), modelID, objectStorage.ID)
	// NOTE: the original check was `if &modelInfo == nil`, which can never be
	// true in Go (the address of a local is never nil); checking err instead.
	if err != nil {
		logger.Error(err.Error())
		return nil, nil, fmt.Errorf("the model does not exist: %w", err)
@ -405,31 +404,31 @@ func postDeleteInstanceEvent(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runn
// Check whether the computing center supports configuring environment variables;
// if not, read the script content and append it after the Command parameter.
func getRuntimeCommand(runtime schsdk.JobRuntimeInfo, dataSetPath string, outputPath string, remoteBase string, ccInfo schmod.ComputingCenter) (string, []schsdk.KVPair) {
	var envs []schsdk.KVPair
	var params []string
	//var params []string
	var cmd string

	envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataInEnv, Value: filepath.Join(remoteBase, dataSetPath)})
	envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataOutEnv, Value: filepath.Join(remoteBase, outputPath)})
	envs = append(envs, runtime.Envs...)
	switch boot := ccInfo.Bootstrap.(type) {
	case *schsdk.DirectBootstrap:
		cmd = runtime.Command
	case *schsdk.NoEnvBootstrap:
		cmd = boot.ScriptFileName
		params = append(params, runtime.Command)
		envMap := lo.Map(envs, func(env schsdk.KVPair, _ int) string {
			return fmt.Sprintf("%s=%s", env.Key, env.Value)
		})
		params = append(params, envMap...)
	default:
		cmd = runtime.Command
	}
	//switch boot := ccInfo.Bootstrap.(type) {
	//case *schsdk.DirectBootstrap:
	//	cmd = runtime.Command
	//case *schsdk.NoEnvBootstrap:
	//	cmd = boot.ScriptFileName
	//	params = append(params, runtime.Command)
	//	envMap := lo.Map(envs, func(env schsdk.KVPair, _ int) string {
	//		return fmt.Sprintf("%s=%s", env.Key, env.Value)
	//	})
	//	params = append(params, envMap...)
	//default:
	//	cmd = runtime.Command
	//}

	return cmd, envs
}

func getCCInfoAndStgInfo(rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, userID cdssdk.UserID) (*schmod.ComputingCenter, *cdsapi.StorageGetResp, error) {
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), targetCCID)
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), targetCCID)
	if err != nil {
		return nil, nil, fmt.Errorf("getting computing center info: %w", err)
	}
@ -485,7 +484,7 @@ func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Jo
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), reJob.TargetJobCCID)
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), reJob.TargetJobCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}
@ -73,7 +73,7 @@ func (s *MultiInstanceUpdate) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job)
	}
	defer schglb.CloudreamStoragePool.Release(stgCli)

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), dtrJob.TargetJobCCID)
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), dtrJob.TargetJobCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}
@ -10,8 +10,6 @@ import (
	"gitlink.org.cn/cloudream/common/pkgs/logger"

	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
@ -61,7 +59,7 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
		return
@ -203,7 +201,7 @@ func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobSta

	// Once the upload is finished, a new, empty image record can be created
	// TODO image name
	imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
	imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.DefCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
	if err != nil {
		return fmt.Errorf("creating image info: %w", err)
	}
@ -213,7 +211,7 @@ func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobSta
			file.PackageID = &evt.PackageID

		case *schsdk.ImageJobFileInfo:
			imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
			imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.DefCtx(), info.ImageID)
			if err != nil {
				return fmt.Errorf("getting image info: %w", err)
			}
@ -226,64 +224,64 @@ func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobSta
		// TODO the image import process needs to be redesigned
		return fmt.Errorf("not implemented")

		if file.PackageID == nil {
			return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		}

		// TODO UserID
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		fut := taskStatus.Receive()
		status := <-fut.Chan()

		moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		// TODO UserID
		pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}

		if len(pkgObjs.Objects) == 0 {
			return fmt.Errorf("no object in the package which will be imported")
		}

		if len(pkgObjs.Objects) > 1 {
			return fmt.Errorf("there must be only 1 object in the package which will be imported")
		}

		taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		fut2 := taskStatus2.Receive()
		status2 := <-fut2.Chan()

		uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
		if uploadStatus.Error != "" {
			return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		}

		// TODO image name
		err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		return nil
		//if file.PackageID == nil {
		//	return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		//}
		//
		//// TODO UserID
		//taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		//if err != nil {
		//	return fmt.Errorf("moving package: %w", err)
		//}
		//
		//fut := taskStatus.Receive()
		//status := <-fut.Chan()
		//
		//moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		//if moveStatus.Error != "" {
		//	return fmt.Errorf("moving package: %s", moveStatus.Error)
		//}
		//
		//stgCli, err := schglb.CloudreamStoragePool.Acquire()
		//if err != nil {
		//	return fmt.Errorf("new cloudream storage client: %w", err)
		//}
		//defer schglb.CloudreamStoragePool.Release(stgCli)
		//
		//// TODO UserID
		//pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		//if err != nil {
		//	return fmt.Errorf("getting package objects: %w", err)
		//}
		//
		//if len(pkgObjs.Objects) == 0 {
		//	return fmt.Errorf("no object in the package which will be imported")
		//}
		//
		//if len(pkgObjs.Objects) > 1 {
		//	return fmt.Errorf("there must be only 1 object in the package which will be imported")
		//}
		//
		//taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
		//if err != nil {
		//	return fmt.Errorf("moving package: %w", err)
		//}
		//
		//fut2 := taskStatus2.Receive()
		//status2 := <-fut2.Chan()
		//
		//uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
		//if uploadStatus.Error != "" {
		//	return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		//}
		//
		//// TODO image name
		//err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.DefCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		//if err != nil {
		//	return fmt.Errorf("creating image info: %w", err)
		//}
		//
		//return nil
	}

	return nil
@ -8,7 +8,7 @@ import (
)

func (svc *Service) GetAllComputingCenter(msg *mgrmq.GetAllComputingCenter) (*mgrmq.GetAllComputingCenterResp, *mq.CodeMessage) {
	ccs, err := svc.db.ComputingCenter().GetAll(svc.db.SQLCtx())
	ccs, err := svc.db.ComputingCenter().GetAll(svc.db.DefCtx())
	if err != nil {
		logger.Warnf("getting all computing center: %s", err.Error())
		return nil, mq.Failed(errorcode.OperationFailed, "get all computing center failed")
@ -8,13 +8,13 @@ import (
)

func (svc *Service) GetImageInfo(msg *mgrmq.GetImageInfo) (*mgrmq.GetImageInfoResp, *mq.CodeMessage) {
	image, err := svc.db.Image().GetByID(svc.db.SQLCtx(), msg.ImageID)
	image, err := svc.db.Image().GetByID(svc.db.DefCtx(), msg.ImageID)
	if err != nil {
		logger.WithField("ImageID", msg.ImageID).Warnf("getting image by id: %s", err.Error())
		return nil, mq.Failed(errorcode.OperationFailed, "get image failed")
	}

	pcmImages, err := svc.db.PCMImage().GetByImageID(svc.db.SQLCtx(), msg.ImageID)
	pcmImages, err := svc.db.PCMImage().GetByImageID(svc.db.DefCtx(), msg.ImageID)
	if err != nil {
		logger.WithField("ImageID", msg.ImageID).Warnf("getting pcm image by image id: %s", err.Error())
		return nil, mq.Failed(errorcode.OperationFailed, "get pcm images failed")
@ -207,7 +207,7 @@ func (svc *Service) GetServiceList(msg *mgrmq.GetServiceList) (*mgrmq.GetService

	_, ok = jo.State.(*jobmod.NormalJobExecutingDump)
	if ok {
		computingCenter, err := svc.db.ComputingCenter().GetByID(svc.db.SQLCtx(), norJob.TargetCCID)
		computingCenter, err := svc.db.ComputingCenter().GetByID(svc.db.DefCtx(), norJob.TargetCCID)
		if err != nil {
			return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("get cdsNodeID failed by CCID: %s", err.Error()))
		}
@ -19,7 +19,7 @@ func (svc *Service) ECSNodeRunningInfo(req *schsdk.ECSNodeRunningInfoReq) (*schs
}

func (svc *Service) GetAllModels(msg *mgrmq.GetAllModels) (*mgrmq.GetAllModelsResp, *mq.CodeMessage) {
	models, err := svc.db.Models().GetAll(svc.db.SQLCtx())
	models, err := svc.db.Models().GetAll(svc.db.DefCtx())
	if err != nil {
		logger.Warnf("getting all models: %s", err.Error())
		return nil, mq.Failed(errorcode.OperationFailed, "get all models failed")
@ -0,0 +1,45 @@
package cmdline

import (
	"fmt"
	"os"

	"gitlink.org.cn/cloudream/common/pkgs/cmdtrie"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
)

type CommandContext struct {
	Cmdline *Commandline
}

var commands cmdtrie.CommandTrie[CommandContext, error] = cmdtrie.NewCommandTrie[CommandContext, error]()

type Commandline struct {
	Svc *services.Service
}

func NewCommandline(svc *services.Service) (*Commandline, error) {
	return &Commandline{
		Svc: svc,
	}, nil
}

func (c *Commandline) DispatchCommand(allArgs []string) {
	cmdCtx := CommandContext{
		Cmdline: c,
	}
	cmdErr, err := commands.Execute(cmdCtx, allArgs, cmdtrie.ExecuteOption{ReplaceEmptyArrayWithNil: true})
	if err != nil {
		fmt.Printf("execute command failed, err: %s", err.Error())
		os.Exit(1)
	}
	if cmdErr != nil {
		fmt.Printf("execute command failed, err: %s", cmdErr.Error())
		os.Exit(1)
	}
}

func MustAddCmd(fn any, prefixWords ...string) any {
	commands.MustAdd(fn, prefixWords...)
	return nil
}
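For orientation, the trie above is meant to be driven from the middleware's entry point. A minimal sketch, assuming the real main constructs *services.Service somewhere else (buildService below is a stand-in, not part of this commit):

package main

import (
	"os"

	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/cmdline"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
)

// buildService is a placeholder: how the real *services.Service is
// constructed is outside the scope of this sketch.
func buildService() *services.Service { return nil }

func main() {
	cl, err := cmdline.NewCommandline(buildService())
	if err != nil {
		os.Exit(1)
	}

	// e.g. os.Args[1:] == ["serve", "http", ":7891"] starts the HTTP server
	cl.DispatchCommand(os.Args[1:])
}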
@ -0,0 +1,5 @@
package cmdline

// var _ = MustAddCmd(func(ctx CommandContext, infoFilePath string) error {
//
// }, "jobset", "new")
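The stub above marks where a "jobset new" command would hook in. A hypothetical, filled-in registration in the same style follows; only the signature and prefix words come from the stub, the file-reading body is invented for illustration:

package cmdline

import (
	"fmt"
	"os"
)

// Hypothetical body: read a job-set description file and report its size.
var _ = MustAddCmd(func(ctx CommandContext, infoFilePath string) error {
	data, err := os.ReadFile(infoFilePath)
	if err != nil {
		return fmt.Errorf("reading jobset info: %w", err)
	}

	fmt.Printf("read %d bytes of jobset info\n", len(data))
	return nil
}, "jobset", "new")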
@ -0,0 +1,25 @@
package cmdline

import (
	"fmt"

	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/http"
)

var _ = MustAddCmd(func(ctx CommandContext, args []string) error {
	listenAddr := ":7891"
	if len(args) > 0 {
		listenAddr = args[0]
	}

	httpSvr, err := http.NewServer(listenAddr, ctx.Cmdline.Svc)
	if err != nil {
		return fmt.Errorf("new http server: %w", err)
	}

	err = httpSvr.Serve()
	if err != nil {
		return fmt.Errorf("serving http: %w", err)
	}

	return nil
}, "serve", "http")
@ -0,0 +1,30 @@
package config

import (
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	"gitlink.org.cn/cloudream/common/sdks/blockchain"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
	uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
	"gitlink.org.cn/cloudream/common/utils/config"
	db "gitlink.org.cn/cloudream/scheduler/common/pkgs/db/config"
)

type Config struct {
	Logger           logger.Config      `json:"logger"`
	DB               db.Config          `json:"db"`
	PCMScheduler     sch.Config         `json:"pcmScheduler"`
	Uploader         uploadersdk.Config `json:"uploader"`
	BlockChain       blockchain.Config  `json:"blockChain"`
	CloudreamStorage cdsapi.Config      `json:"cloudreamStorage"`
}

var cfg Config

func Init() error {
	return config.DefaultLoad("middleware", &cfg)
}

func Cfg() *Config {
	return &cfg
}
@ -0,0 +1,24 @@
package http

import "gitlink.org.cn/cloudream/common/consts/errorcode"

type Response struct {
	Code    string `json:"code"`
	Message string `json:"message"`
	Data    any    `json:"data"`
}

func OK(data any) Response {
	return Response{
		Code:    errorcode.OK,
		Message: "",
		Data:    data,
	}
}

func Failed(code string, msg string) Response {
	return Response{
		Code:    code,
		Message: msg,
	}
}
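On the wire, every handler below returns this envelope. A success with no payload would serialize roughly as follows, assuming errorcode.OK renders as the string "OK" (not confirmed by this diff):

{
    "code": "OK",
    "message": "",
    "data": null
}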
@ -0,0 +1,417 @@
package http

import (
	"io"
	"net/http"

	"github.com/gin-gonic/gin"
	"gitlink.org.cn/cloudream/common/consts/errorcode"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
	"gitlink.org.cn/cloudream/common/utils/serder"
)

type JobSetService struct {
	*Server
}

func (s *Server) JobSetSvc() *JobSetService {
	return &JobSetService{
		Server: s,
	}
}

type JobSetSubmitResp struct {
	JobSetID          schsdk.JobSetID                `json:"jobSetID"`
	FilesUploadScheme schsdk.JobSetFilesUploadScheme `json:"filesUploadScheme"`
}

func (s *JobSetService) Submit(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.Submit")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	jobSetInfo, err := serder.JSONToObjectEx[schsdk.JobSetInfo](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	schScheme, uploadScheme, err := s.svc.JobSetSvc().PreScheduler(jobSetInfo)
	if err != nil {
		log.Warnf("pre-scheduling jobset: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "pre-scheduling jobset failed"))
		return
	}

	jobsetID, err := s.svc.JobSetSvc().Submit(jobSetInfo, schScheme)
	if err != nil {
		log.Warnf("submitting jobset: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "submit jobset failed"))
		return
	}

	ctx.JSON(http.StatusOK, OK(JobSetSubmitResp{
		JobSetID:          *jobsetID,
		FilesUploadScheme: *uploadScheme,
	}))
}

type JobSetLocalFileUploadedReq struct {
	JobSetID  schsdk.JobSetID   `json:"jobSetID" binding:"required"`
	LocalPath string            `json:"localPath" binding:"required"`
	Error     string            `json:"error"`
	PackageID cdssdk.PackageID  `json:"packageID"`
	ObjectIDs []cdssdk.ObjectID `json:"objectIDs"`
	//FolderID uploadersdk.FolderID `json:"folderID"`
	//UploadedInfo []schmod.FileUploadedInfo `json:"uploadedInfo"`
}

func (s *JobSetService) LocalFileUploaded(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.LocalFileUploaded")

	var req JobSetLocalFileUploadedReq
	if err := ctx.ShouldBindJSON(&req); err != nil {
		log.Warnf("binding body: %s", err.Error())
		ctx.JSON(http.StatusBadRequest, Failed(errorcode.BadArgument, "missing argument or invalid argument"))
		return
	}

	s.svc.JobSetSvc().LocalFileUploaded(req.JobSetID, req.LocalPath, req.Error, req.PackageID, req.ObjectIDs)

	ctx.JSON(http.StatusOK, OK(nil))
}

type UploadReq struct {
	UserID       cdssdk.UserID    `json:"userID"`
	UploadParams sch.UploadParams `json:"uploadParams"`
}

type UploadResp struct {
	JobSetID   schsdk.JobSetID    `json:"jobSetID"`
	LocalPath  string             `json:"localPath"`
	StorageIDs []cdssdk.StorageID `json:"storageIDs"`
	BucketID   cdssdk.BucketID    `json:"bucketID"`
}

func (s *JobSetService) Upload(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.Upload")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}
	req, err := serder.JSONToObjectEx[UploadReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	jobsetID, storages, err := s.svc.JobSetSvc().Upload(req.UserID, req.UploadParams)
	if err != nil {
		log.Warnf("uploading file: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "upload file failed, error: "+err.Error()))
		return
	}

	switch info := req.UploadParams.UploadInfo.(type) {
	case *sch.LocalUploadInfo:
		ctx.JSON(http.StatusOK, OK(UploadResp{
			JobSetID:   *jobsetID,
			LocalPath:  info.LocalPath,
			StorageIDs: *storages,
			BucketID:   1,
		}))

	case *sch.RemoteUploadInfo:
		ctx.JSON(http.StatusOK, OK("success"))
	}
}

type CreateFolderReq struct {
	PackageID cdssdk.PackageID `json:"packageID"`
	Path      string           `json:"path"`
}

func (s *JobSetService) CreateFolder(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.CreateFolder")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}
	req, err := serder.JSONToObjectEx[CreateFolderReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().CreateFolder(req.PackageID, req.Path)

	if err != nil {
		log.Warnf("creating folder: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}

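For reference, a request to the folder-creation endpoint above carries just the two fields of CreateFolderReq. The values here are illustrative (the packageID and path happen to match ones seen in the log):

{
    "packageID": 1042,
    "path": "/path/test/3"
}
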
type DeleteFileReq struct {
	UserID    cdssdk.UserID     `json:"userID" binding:"required"`
	ObjectIDs []cdssdk.ObjectID `json:"objectIDs" binding:"required"`
}

func (s *JobSetService) DeleteFile(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.DeleteFile")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}
	req, err := serder.JSONToObjectEx[DeleteFileReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().DeleteFile(req.UserID, req.ObjectIDs)

	if err != nil {
		log.Warnf("deleting file: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "delete file failed"))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}

type DeleteFolderReq struct {
	UserID    cdssdk.UserID    `json:"userID" binding:"required"`
	PackageID cdssdk.PackageID `json:"packageID" binding:"required"`
	Path      string           `json:"path" binding:"required"`
}

func (s *JobSetService) DeleteFolder(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.DeleteFolder")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}
	req, err := serder.JSONToObjectEx[DeleteFolderReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().DeleteFolder(req.UserID, req.PackageID, req.Path)

	if err != nil {
		log.Warnf("deleting folder: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "delete folder failed"))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}

type QueryUploadedReq struct {
	QueryParams sch.QueryData `json:"queryParams" binding:"required"`
}

type QueryUploadedResp struct {
	TotalPages    int                   `json:"totalPages"`
	TotalCount    int                   `json:"totalCount"`
	CurrentPage   int                   `json:"currentPage"`
	PageSize      int                   `json:"pageSize"`
	UploadedDatas []uploadersdk.Package `json:"uploadedDatas"`
}

func (s *JobSetService) QueryUploaded(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.QueryUploaded")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}
	req, err := serder.JSONToObjectEx[QueryUploadedReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	uploadedDatas, totalPages, totalCount, err := s.svc.JobSetSvc().QueryUploaded(req.QueryParams)
	if err != nil {
		log.Warnf("querying uploaded data: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "get upload data list failed, error: "+err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK(QueryUploadedResp{
		TotalPages:    totalPages,
		TotalCount:    totalCount,
		CurrentPage:   req.QueryParams.CurrentPage,
		PageSize:      req.QueryParams.PageSize,
		UploadedDatas: uploadedDatas,
	}))
}

type BindingReq struct {
	ID          uploadersdk.DataID `json:"ID"`
	UserID      cdssdk.UserID      `json:"userID" binding:"required"`
	BindingName string             `json:"bindingName" binding:"required"`
	BindingType string             `json:"bindingType" binding:"required"`
	PackageIDs  []cdssdk.PackageID `json:"packageIDs" binding:"required"`
}

func (s *JobSetService) Binding(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.Binding")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}
	req, err := serder.JSONToObjectEx[BindingReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	params := uploadersdk.BindingData{
		ID:          req.ID,
		UserID:      req.UserID,
		BindingName: req.BindingName,
		BindingType: req.BindingType,
	}
	err = s.svc.JobSetSvc().DataBinding(params, req.PackageIDs)
	if err != nil {
		log.Warnf("binding data: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "binding data failed, error: "+err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}

type RemoveBindingReq struct {
|
||||
UploadDatas []uploadersdk.DataID `json:"uploadDatas"`
|
||||
}
|
||||
|
||||
func (s *JobSetService) RemoveBinding(ctx *gin.Context) {
|
||||
log := logger.WithField("HTTP", "JobSet.Binding")
|
||||
|
||||
bodyData, err := io.ReadAll(ctx.Request.Body)
|
||||
if err != nil {
|
||||
log.Warnf("reading request body: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
|
||||
return
|
||||
}
|
||||
req, err := serder.JSONToObjectEx[BindingReq](bodyData)
|
||||
if err != nil {
|
||||
log.Warnf("parsing request body: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
|
||||
return
|
||||
}
|
||||
|
||||
err = s.svc.JobSetSvc().RemoveBinding(req.PacakgeIDs)
|
||||
if err != nil {
|
||||
log.Warnf("getting service list: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "remove binding failed, error: "+err.Error()))
|
||||
return
|
||||
}
|
||||
|
||||
ctx.JSON(http.StatusOK, OK("success"))
|
||||
}
|
||||
|
||||
type PackageCreate struct {
|
||||
UserID cdssdk.UserID `json:"userID"`
|
||||
Name string `json:"name"`
|
||||
DataType string `json:"dataType"`
|
||||
}
|
||||
|
||||
func (s *JobSetService) CreatePackage(ctx *gin.Context) {
|
||||
log := logger.WithField("HTTP", "JobSet.CreateFolder")
|
||||
|
||||
bodyData, err := io.ReadAll(ctx.Request.Body)
|
||||
if err != nil {
|
||||
log.Warnf("reading request body: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
|
||||
return
|
||||
}
|
||||
req, err := serder.JSONToObjectEx[PackageCreate](bodyData)
|
||||
if err != nil {
|
||||
log.Warnf("parsing request body: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
|
||||
return
|
||||
}
|
||||
|
||||
err = s.svc.JobSetSvc().CreatePackage(req.UserID, req.Name, req.DataType)
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("creating folder: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, err.Error()))
|
||||
return
|
||||
}
|
||||
|
||||
ctx.JSON(http.StatusOK, OK("success"))
|
||||
}
|
||||
|
||||
type PackageDelete struct {
|
||||
UserID cdssdk.UserID `json:"userID" binding:"required"`
|
||||
PackageID cdssdk.PackageID `json:"packageID" binding:"required"`
|
||||
}
|
||||
|
||||
func (s *JobSetService) DeletePackage(ctx *gin.Context) {
|
||||
log := logger.WithField("HTTP", "JobSet.CreateFolder")
|
||||
|
||||
bodyData, err := io.ReadAll(ctx.Request.Body)
|
||||
if err != nil {
|
||||
log.Warnf("reading request body: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
|
||||
return
|
||||
}
|
||||
req, err := serder.JSONToObjectEx[PackageDelete](bodyData)
|
||||
if err != nil {
|
||||
log.Warnf("parsing request body: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
|
||||
return
|
||||
}
|
||||
|
||||
err = s.svc.JobSetSvc().DeletePackage(req.UserID, req.PackageID)
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("creating folder: %s", err.Error())
|
||||
ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, err.Error()))
|
||||
return
|
||||
}
|
||||
|
||||
ctx.JSON(http.StatusOK, OK("success"))
|
||||
}
|
|
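For quick manual testing, a minimal client sketch for the DeleteFile handler above; the listen address and the numeric ID values are assumptions, and the field names follow DeleteFileReq:

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Hypothetical address; use the middleware's actual listen address.
	body := bytes.NewBufferString(`{"userID": 1, "objectIDs": [1001, 1002]}`)
	resp, err := http.Post("http://localhost:7891/jobSet/deleteFile", "application/json", body)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // the handler always replies 200; check the error code in the body
}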
@@ -0,0 +1,55 @@

package http

import (
	"github.com/gin-gonic/gin"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
)

type Server struct {
	engine     *gin.Engine
	listenAddr string
	svc        *services.Service
}

func NewServer(listenAddr string, svc *services.Service) (*Server, error) {
	engine := gin.New()

	return &Server{
		engine:     engine,
		listenAddr: listenAddr,
		svc:        svc,
	}, nil
}

func (s *Server) Serve() error {
	s.initRouters()

	logger.Infof("start serving http at: %s", s.listenAddr)
	err := s.engine.Run(s.listenAddr)
	if err != nil {
		logger.Infof("http stopped with error: %s", err.Error())
		return err
	}

	logger.Infof("http stopped")
	return nil
}

func (s *Server) initRouters() {
	s.engine.POST("/jobSet/upload", s.JobSetSvc().Upload)
	s.engine.POST("/jobSet/submit", s.JobSetSvc().Submit)
	s.engine.POST("/jobSet/localFileUploaded", s.JobSetSvc().LocalFileUploaded)
	s.engine.POST("/jobSet/queryUploaded", s.JobSetSvc().QueryUploaded)

	s.engine.POST("/jobSet/createPackage", s.JobSetSvc().CreatePackage)
	s.engine.POST("/jobSet/deletePackage", s.JobSetSvc().DeletePackage)

	s.engine.POST("/jobSet/createFolder", s.JobSetSvc().CreateFolder)
	s.engine.POST("/jobSet/deleteFolder", s.JobSetSvc().DeleteFolder)
	s.engine.POST("/jobSet/deleteFile", s.JobSetSvc().DeleteFile)

	s.engine.POST("/jobSet/binding", s.JobSetSvc().Binding)
	s.engine.POST("/jobSet/removeBinding", s.JobSetSvc().RemoveBinding)
}
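A sketch of wiring the server into a main function; the import path and listen address are assumptions based on the package layout in this commit, and svc is expected to come from the module's own initialization code:

package main

import (
	stdlog "log"

	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/http"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
)

func main() {
	var svc *services.Service // assumed to be built by the module's own init code
	server, err := http.NewServer(":7891", svc)
	if err != nil {
		stdlog.Fatalf("new http server: %s", err)
	}
	if err := server.Serve(); err != nil { // blocks until the engine stops
		stdlog.Fatalf("serving http: %s", err)
	}
}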
@@ -0,0 +1,194 @@

package executormgr

import (
	"bufio"
	"fmt"
	"io"
	"strings"
	"sync"
	"time"

	"gitlink.org.cn/cloudream/common/pkgs/async"
	log "gitlink.org.cn/cloudream/common/pkgs/logger"
	"gitlink.org.cn/cloudream/common/utils/serder"
	jobTask "gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/task"

	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
	exemq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
	exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
	mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)

type task struct {
	statusChan *async.UnboundChannel[mgrmq.ExecutorTaskStatus]
}

type ExecutorStatus struct {
	executorID schmod.ExecutorID
	tasks      map[string]task // keyed by TaskID
}

var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")

var ExecutorPool exemq.HttpPool

func InitExecutorPool() {
	ExecutorPool = exemq.NewHttpPool(&exemq.Config{})
}

type Manager struct {
	executors map[schmod.ExecutorID]*ExecutorStatus
	lock      sync.Mutex
	exeCli    *exemq.Client

	reportTimeout time.Duration
}

func NewManager(reportTimeout time.Duration) (*Manager, error) {
	exeCli, err := schglb.ExecutorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new executor client: %w", err)
	}

	return &Manager{
		executors:     make(map[schmod.ExecutorID]*ExecutorStatus),
		exeCli:        exeCli,
		reportTimeout: reportTimeout,
	}, nil
}

func (m *Manager) ReceiveExecutorTaskStatus(url string) (*mgrmq.ExecutorTaskStatus, error) {
	client, err := ExecutorPool.AcquireByUrl(url)
	if err != nil {
		log.Error(err)
		return &mgrmq.ExecutorTaskStatus{}, err
	}
	resp, err := client.GetReportInfo()
	if err != nil {
		log.Error(err)
		return &mgrmq.ExecutorTaskStatus{}, err
	}

	reader := bufio.NewReader(resp.Body)

	line, err := reader.ReadString('\n')
	if err != nil && err != io.EOF {
		log.Error("Error reading from response body:", err)
		return &mgrmq.ExecutorTaskStatus{}, err
	}
	// TODO The first report contains all tasks on the executor, to be used for retrying failures
	executorInfo := convertLine(line)
	if executorInfo == nil {
		// guard against an empty or unparsable first report, which would otherwise panic below
		return &mgrmq.ExecutorTaskStatus{}, fmt.Errorf("empty first report from executor")
	}
	// Put the executor from the first report into the pool
	exec := &ExecutorStatus{
		executorID: executorInfo.ExecutorID,
		tasks:      make(map[string]task),
	}

	m.executors[executorInfo.ExecutorID] = exec

	go func() {
		for {
			line, err = reader.ReadString('\n')
			if err != nil {
				if err != io.EOF {
					log.Error("Error reading from response body:", err)
				}
				return
			}

			status := convertLine(line)
			if status == nil {
				continue
			}

			m.Report(*status)
		}
	}()

	return executorInfo, nil
}

func convertLine(line string) *mgrmq.ExecutorTaskStatus {
	if line == "" {
		return nil
	}

	line = strings.TrimPrefix(line, "data: ")
	line = strings.TrimSpace(line)
	if len(line) == 0 {
		return nil
	}

	readResp, err := serder.JSONToObjectEx[mgrmq.ExecutorTaskStatus]([]byte(line))
	if err != nil {
		log.Error(err)
		return nil
	}

	return &readResp
}

func (m *Manager) Report(status mgrmq.ExecutorTaskStatus) {
	m.lock.Lock()
	defer m.lock.Unlock()

	exec := m.executors[status.ExecutorID]
	if exec == nil {
		log.Error("Executor not found: ", status.ExecutorID)
		return
	}
	// The task channel is put into the pool before the task starts running, so the task must exist here
	tsk := exec.tasks[status.TaskID]

	// TODO Consider proactively detecting whether the channel is closed, then cancelling the task
	if tsk.statusChan.Send(status) != nil {
		delete(exec.tasks, status.TaskID)

		if len(exec.tasks) == 0 {
			delete(m.executors, exec.executorID)
		}
	}
}

// StartTask starts a task on the executor of the given computing center.
func (m *Manager) StartTask(info exetsk.TaskInfo, ccInfo schmod.ComputingCenter) (*jobTask.JobTask[mgrmq.ExecutorTaskStatus], error) {
	m.lock.Lock()
	defer m.lock.Unlock()
	newJobTask := jobTask.NewJobTask[mgrmq.ExecutorTaskStatus]()
	ch := newJobTask.Chan()

	client, err := ExecutorPool.AcquireByUrl(ccInfo.ExecutorURL)
	if err != nil {
		ch.CloseWithError(fmt.Errorf("start task: %w", err))
		return newJobTask, err
	}

	executorID := schmod.ExecutorID(ccInfo.ExecutorID)
	// Check whether this executor has been connected before; on the first connection, send a request to listen for status reports
	_, ok := m.executors[executorID]
	if !ok {
		_, err = m.ReceiveExecutorTaskStatus(ccInfo.ExecutorURL)
		if err != nil {
			ch.CloseWithError(fmt.Errorf("start task: %w", err))
			return newJobTask, err
		}
	}

	// The executor was put into the pool above, so it must exist here
	exeInfo := m.executors[executorID]
	exeInfo.tasks[newJobTask.ID()] = task{
		statusChan: ch,
	}

	_, err = client.SubmitTask(exemq.NewStartTask(newJobTask.ID(), info))
	if err != nil {
		ch.CloseWithError(fmt.Errorf("start task: %w", err))
		return newJobTask, err
	}

	return newJobTask, nil
}

func (m *Manager) Serve() {
	InitExecutorPool()
}
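A sketch of driving the Manager from calling code inside this module; the TaskInfo and ComputingCenter values are placeholders that real callers (the job states later in this commit) obtain from the task constructors and the database:

package main

import (
	"log"
	"time"

	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
	exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/executormgr"
)

func main() {
	mgr, err := executormgr.NewManager(30 * time.Second)
	if err != nil {
		log.Fatal(err)
	}
	mgr.Serve() // initializes the executor HTTP pool

	var info exetsk.TaskInfo          // e.g. a value from exetsk.NewCacheMovePackage(...), as in the states below
	var ccInfo schmod.ComputingCenter // looked up from the database in real code

	tsk, err := mgr.StartTask(info, ccInfo)
	if err != nil {
		log.Fatal(err)
	}

	fut := tsk.Receive()
	msg := <-fut.Chan() // one status report per Receive call
	log.Printf("%+v", msg.Value)
}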
@@ -0,0 +1,7 @@

package event

type Cancel struct {
}

func (s *Cancel) Noop() {
}
@@ -0,0 +1,69 @@

package event

import (
	"gitlink.org.cn/cloudream/common/pkgs/future"
	"gitlink.org.cn/cloudream/common/pkgs/types"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	"gitlink.org.cn/cloudream/common/utils/serder"
)

type OperateInstanceFuture = *future.SetValueFuture[OperateInstanceResult]

type InstanceOperate struct {
	Info   InstanceOperateInfo
	Result OperateInstanceFuture
}

type OperateInstanceResult struct {
	OperateResult     string
	Err               error
	JobID             schsdk.JobID
	FilesUploadScheme schsdk.JobFilesUploadScheme
}

type InstanceOperateInfo interface {
	Instance()
}

type InstanceInfoBase struct{}

func (i *InstanceInfoBase) Instance() {}

var InstanceOperateInfoTypeUnion = types.NewTypeUnion[InstanceOperateInfo](
	(*InstanceCreateInfo)(nil),
	(*InstanceUpdateInfo)(nil),
	(*InstanceDeleteInfo)(nil),
)

var _ = serder.UseTypeUnionInternallyTagged(&InstanceOperateInfoTypeUnion, "type")

type InstanceCreateInfo struct {
	serder.Metadata `union:"Create"`
	InstanceInfoBase
	DataSet schsdk.JobFileInfo
}

type InstanceUpdateInfo struct {
	serder.Metadata `union:"UpdatePackage"`
	InstanceInfoBase
	Type string                            `json:"type"`
	Info schsdk.UpdateMultiInstanceJobInfo `json:"info"`
	//PackageID cdssdk.PackageID `json:"packageID"`
	//LoRAPackage string `json:"loraPackage"`
}

type InstanceDeleteInfo struct {
	serder.Metadata `union:"Delete"`
	InstanceInfoBase
	InstanceID schsdk.JobID `json:"instanceID"`
}

func NewInstanceOperate(info InstanceOperateInfo, future OperateInstanceFuture) *InstanceOperate {
	return &InstanceOperate{
		Info:   info,
		Result: future,
	}
}

func (s *InstanceOperate) Noop() {
}
@@ -0,0 +1,21 @@

package event

import (
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)

// JobCompleted signals that a job has finished, whether successfully or with an error.
type JobCompleted struct {
	Job *jobmgr.Job
	Err error
}

func NewJobCompleted(job *jobmgr.Job, err error) *JobCompleted {
	return &JobCompleted{
		Job: job,
		Err: err,
	}
}

func (s *JobCompleted) Noop() {
}
@@ -0,0 +1,27 @@

package event

import (
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
)

// LocalFileUploaded signals that a local file upload has finished.
type LocalFileUploaded struct {
	LocalPath string
	Error     error
	PackageID cdssdk.PackageID
	ObjectIDs []cdssdk.ObjectID
	//FolderID uploadersdk.FolderID
	//UploadedInfo []schmod.FileUploadedInfo
}

func NewLocalFileUploaded(localPath string, err error, packageID cdssdk.PackageID, objectIDs []cdssdk.ObjectID) *LocalFileUploaded {
	return &LocalFileUploaded{
		LocalPath: localPath,
		Error:     err,
		PackageID: packageID,
		ObjectIDs: objectIDs,
	}
}

func (s *LocalFileUploaded) Noop() {
}
@@ -0,0 +1,28 @@

package event

import (
	"gitlink.org.cn/cloudream/common/pkgs/future"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
)

type JobUpdateFuture = *future.SetValueFuture[UpdateResult]

type Update struct {
	Runtime schsdk.JobRuntimeInfo
	Operate string
	Result  JobUpdateFuture
}

func (s *Update) Noop() {}

type UpdateResult struct {
	Err error
}

func NewUpdate(runTime schsdk.JobRuntimeInfo, operate string, jobUpdateFuture JobUpdateFuture) *Update {
	return &Update{
		Runtime: runTime,
		Operate: operate,
		Result:  jobUpdateFuture,
	}
}
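A sketch of posting an Update event and awaiting its result; the manager type and the "pause" operate value are assumptions, but PostEvent and the future handshake mirror how the job states later in this commit use this event:

// pauseJob is a sketch, assuming mgr is the jobmgr manager exposed to states as rtx.Mgr.
func pauseJob(ctx context.Context, mgr *jobmgr.Manager, jobID schsdk.JobID, runtime schsdk.JobRuntimeInfo) error {
	fut := future.NewSetValue[event.UpdateResult]()
	mgr.PostEvent(jobID, event.NewUpdate(runtime, "pause", fut)) // "pause" is a hypothetical operate value

	res, err := fut.Wait(ctx) // aborts if ctx is cancelled
	if err != nil {
		return err
	}
	return res.Err // error reported by the executor, if any
}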
@@ -0,0 +1,75 @@

package event

import (
	"context"

	"gitlink.org.cn/cloudream/common/pkgs/future"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)

// WaitType waits for an event of a specific type.
// Given a context and an event set, it blocks until an event matching the specified type occurs.
// ctx: controls the wait; if the context is cancelled or expires, the wait is aborted.
// set: the event set to wait on for an event of the given type.
// Return value T: the received event, converted to the type parameter T.
// Return value bool: whether the wait succeeded; true if an event was received, false if the
// wait was aborted because the context was cancelled or expired.
func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, bool) {
	// Use set.Wait to wait for an event satisfying the given condition.
	// The condition function checks whether the event can be converted to type T.
	ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
		_, ok := evt.(T)
		return ok
	})
	if ret == nil {
		var r T
		return r, false // if the event is nil, return false
	}
	// set.Wait returns a jobmgr.Event, so convert it to T and return it with the success flag.
	return ret.(T), ok
}

// WaitTypeAnd waits for an event of a specific type that also satisfies the given condition.
// ctx: the context controlling cancellation or timeout of the wait.
// set: the event set to wait on.
// cond: a function that checks whether the received event satisfies a specific condition.
// Returns the matching event and a boolean indicating whether one was received.
func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond func(val T) bool) (T, bool) {
	// Wait for an event that matches both the type and the condition.
	ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
		// Try to assert the event to type T and check whether the assertion succeeds.
		e, ok := evt.(T)
		if !ok {
			return false // the event is not of the expected type T
		}

		// The event is of type T; accept it only if it satisfies the condition.
		return cond(e)
	})
	if ret == nil {
		var r T
		return r, false // if the event is nil, return false
	}
	// Assert the returned event to type T and return it with the success flag.
	return ret.(T), ok
}

func BeginWaitType[T jobmgr.Event](set *jobmgr.EventSet) future.Future1[jobmgr.Event] {
	// Begin waiting for an event of the given type.
	return set.BeginWait(func(evt jobmgr.Event) bool {
		_, ok := evt.(T)
		return ok
	})
}

func BeginWaitTypeAnd[T jobmgr.Event](set *jobmgr.EventSet, cond func(val T) bool) future.Future1[jobmgr.Event] {
	// Begin waiting for an event that matches both the type and the condition.
	return set.BeginWait(func(evt jobmgr.Event) bool {
		// Try to assert the event to type T and check whether the assertion succeeds.
		e, ok := evt.(T)
		if !ok {
			return false // the event is not of the expected type T
		}

		// The event is of type T; accept it only if it satisfies the condition.
		return cond(e)
	})
}
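As a usage sketch, a job state could wait for a Cancel event with a deadline like this (the one-minute timeout is illustrative, and the function assumes it lives in a package that already imports context, time, event, and jobmgr):

func waitCancel(rtx jobmgr.JobStateRunContext) bool {
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()

	_, ok := event.WaitType[*event.Cancel](ctx, rtx.EventSet)
	return ok // true if a Cancel event arrived before the timeout
}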
@@ -0,0 +1,105 @@

package jobmgr

import (
	"context"
	"errors"
	"sync"

	"gitlink.org.cn/cloudream/common/pkgs/future"
	"gitlink.org.cn/cloudream/common/utils/lo2"
)

type EventWaitCondition func(evt Event) bool

var ErrJobCancelled = errors.New("job cancelled")

type Event interface {
	Noop()
}

type EventWaiter struct {
	condition EventWaitCondition
	future    *future.SetValueFuture[Event]
}

type EventSet struct {
	events  []Event
	waiters []EventWaiter
	lock    sync.Mutex
}

func NewEventSet() EventSet {
	return EventSet{}
}

func (s *EventSet) Post(evt Event) {
	s.lock.Lock()         // protect the event set
	defer s.lock.Unlock() // release the lock when the function returns

	// Scan the waiter list for a match; a matching waiter is removed from the list
	// and its future is set to the event.
	used := false // whether the event has woken a waiter
	for i, waiter := range s.waiters {
		if waiter.condition(evt) { // does the event satisfy this waiter's condition?
			s.waiters = lo2.RemoveAt(s.waiters, i) // remove the waiter from the list
			waiter.future.SetValue(evt)            // hand the event to the waiter
			used = true                            // the event has been consumed
			break                                  // deliver each event to at most one waiter; continuing would use stale indexes after the removal
		}
	}

	// If no waiter matched, queue the event in the event list.
	if !used {
		s.events = append(s.events, evt)
	}
}

func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) {
	s.lock.Lock()

	for i, evt := range s.events {
		if cond(evt) {
			s.events = lo2.RemoveAt(s.events, i)
			s.lock.Unlock()
			return evt, true
		}
	}

	fut := future.NewSetValue[Event]()
	waiter := EventWaiter{
		condition: cond,
		future:    fut,
	}
	s.waiters = append(s.waiters, waiter)

	s.lock.Unlock()

	val, err := fut.Wait(ctx)
	if err != nil {
		return nil, false
	}

	return val, true
}

func (s *EventSet) BeginWait(cond EventWaitCondition) future.Future1[Event] {
	s.lock.Lock()

	for i, evt := range s.events {
		if cond(evt) {
			s.events = lo2.RemoveAt(s.events, i)
			s.lock.Unlock()
			return future.NewReadyValue1(evt)
		}
	}

	fut := future.NewSetValue[Event]()
	waiter := EventWaiter{
		condition: cond,
		future:    fut,
	}
	s.waiters = append(s.waiters, waiter)

	s.lock.Unlock()

	return fut
}
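A small self-contained sketch of the Post/Wait handshake, assuming it is compiled inside this module so the internal package is importable; ping is a hypothetical event type:

package jobmgr_test

import (
	"context"
	"fmt"
	"time"

	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)

// ping is a hypothetical event type; any type with a Noop method qualifies.
type ping struct{}

func (p *ping) Noop() {}

func Example() {
	set := jobmgr.NewEventSet()

	go set.Post(&ping{}) // wakes the waiter below, or is queued until Wait runs

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()

	evt, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
		_, isPing := evt.(*ping)
		return isPing
	})
	fmt.Println(evt != nil, ok) // true true, unless the timeout fired first
}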
@@ -0,0 +1,88 @@

package jobmgr

import (
	"github.com/samber/lo"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type FileScheduleAction string

// File scheduling actions
const (
	ActionNo          FileScheduleAction = "No"          // no action required
	ActionMove        FileScheduleAction = "Move"        // a cache must be created on the specified node
	ActionLoad        FileScheduleAction = "Load"        // the file must be loaded into Storage
	ActionImportImage FileScheduleAction = "ImportImage" // the image must be imported
)

type FileScheduleScheme struct {
	Action FileScheduleAction `json:"action"`
}

// Scheduling scheme of a job
type JobScheduleScheme struct {
	TargetCCID schsdk.CCID        `json:"targetCCID"`
	Dataset    FileScheduleScheme `json:"dataset"`
	Code       FileScheduleScheme `json:"code"`
	Image      FileScheduleScheme `json:"image"`
}

// Pre-scheduling scheme of a job set
type JobSetPreScheduleScheme struct {
	JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // pre-scheduling schemes of the jobs, keyed by LocalJobID
}

// A job set
type JobSet struct {
	JobSetID          schsdk.JobSetID         `json:"jobSetID"` // globally unique job set ID
	JobRefs           []JobSetJobRef          `json:"jobRefs"`  // jobs contained in the job set, as references only
	PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
}

type JobSetJobRef struct {
	JobID      schsdk.JobID `json:"jobID"`      // job ID
	LocalJobID string       `json:"localJobID"` // job ID local to the current job set
}

func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet {
	return &JobSet{
		JobSetID:          jobSetID,
		JobRefs:           jobRefs,
		PreScheduleScheme: preScheduleScheme,
	}
}

func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef {
	ref, ok := lo.Find(j.JobRefs, func(item JobSetJobRef) bool { return item.LocalJobID == localJobID })
	if !ok {
		return nil
	}

	return &ref
}

// A job
type Job struct {
	JobSetID schsdk.JobSetID // job set ID
	JobID    schsdk.JobID    // globally unique job ID
	Body     JobBody         // the concrete job
}

func (j *Job) GetInfo() schsdk.JobInfo {
	return j.Body.GetInfo()
}

func (j *Job) Dump(ctx JobStateRunContext, job *Job, curState JobState) jobmod.JobDump {
	return jobmod.JobDump{
		JobID:    j.JobID,
		JobSetID: j.JobSetID,
		Info:     j.GetInfo(),
		Body:     job.Body.Dump(),
		State:    curState.Dump(ctx, job),
	}
}

type JobBody interface {
	GetInfo() schsdk.JobInfo
	Dump() jobmod.JobBodyDump
}
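A short sketch of the local-to-global ID mapping; the IDs are illustrative, and the function assumes a package that imports fmt and jobmgr:

func demoJobSet() {
	set := jobmgr.NewJobSet("set-1", []jobmgr.JobSetJobRef{
		{JobID: "job-42", LocalJobID: "train"},
	}, jobmgr.JobSetPreScheduleScheme{})

	if ref := set.FindRefByLocalJobID("train"); ref != nil {
		fmt.Println(ref.JobID) // job-42
	}
}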
@@ -0,0 +1,32 @@

package job

import (
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type DataReturnJob struct {
	Info                schsdk.DataReturnJobInfo
	TargetJobID         schsdk.JobID         // ID of the target job
	TargetJobCCID       schsdk.CCID          // ID of the computing center where the target job runs
	TargetJobOutputPath string               // output path of the target job's results, a relative path
	DataReturnPackageID cdssdk.PackageID     // PackageID obtained after the data return
	ECSInstanceID       schsdk.ECSInstanceID // ECS instance ID, used when data preprocessing and model fine-tuning must reuse the same machine
}

func NewDataReturnJob(info schsdk.DataReturnJobInfo) *DataReturnJob {
	return &DataReturnJob{
		Info: info,
	}
}

func (j *DataReturnJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

func (j *DataReturnJob) Dump() jobmod.JobBodyDump {
	return &jobmod.DataReturnJobDump{
		DataReturnPackageID: j.DataReturnPackageID,
	}
}
@@ -0,0 +1,33 @@

package job

import (
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type InstanceJob struct {
	Info        schsdk.InstanceJobInfo // job description provided at submission
	Files       jobmod.JobFiles        // files the job needs
	TargetCCID  schsdk.CCID            // ID of the computing center that will run this job
	OutputPath  string                 // output path of the program results; a relative path that must be prefixed with the RemoteBase recorded in the CDS database to form the full path
	ParentJobID schsdk.JobID
}

func NewInstanceJob(info schsdk.InstanceJobInfo, files jobmod.JobFiles, parentJobID schsdk.JobID) *InstanceJob {
	return &InstanceJob{
		Info:        info,
		Files:       files,
		ParentJobID: parentJobID,
	}
}

func (j *InstanceJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

func (j *InstanceJob) Dump() jobmod.JobBodyDump {
	return &jobmod.InstanceJobDump{
		Files:      j.Files,
		TargetCCID: j.TargetCCID,
	}
}
@@ -0,0 +1,32 @@

package job

import (
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type MultiInstanceJob struct {
	Info         schsdk.MultiInstanceJobInfo
	Files        jobmod.JobFiles
	TargetCCID   schsdk.CCID
	SubJobs      []schsdk.JobID
	PreScheduler jobmod.JobScheduleScheme
}

func NewMultiInstanceJob(info schsdk.MultiInstanceJobInfo, preScheduler jobmod.JobScheduleScheme) *MultiInstanceJob {
	return &MultiInstanceJob{
		Info:         info,
		PreScheduler: preScheduler,
	}
}

func (j *MultiInstanceJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

func (j *MultiInstanceJob) Dump() jobmod.JobBodyDump {
	return &jobmod.MultiInstanceJobDump{
		Files:      j.Files,
		TargetCCID: j.TargetCCID,
	}
}
@@ -0,0 +1,30 @@

package job

import (
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type UpdateMultiInstanceJob struct {
	Info  schsdk.UpdateMultiInstanceJobInfo
	Files jobmod.JobFiles

	//InstanceIDs []schsdk.JobID
	//UpdateStrategy string
}

func NewUpdateMultiInstanceJob(info schsdk.UpdateMultiInstanceJobInfo) *UpdateMultiInstanceJob {
	return &UpdateMultiInstanceJob{
		Info: info,
	}
}

func (j *UpdateMultiInstanceJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

func (j *UpdateMultiInstanceJob) Dump() jobmod.JobBodyDump {
	return &jobmod.UpdateMultiInstanceJobDump{
		Files: j.Files,
	}
}
@@ -0,0 +1,32 @@

package job

import (
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type NormalJob struct {
	Info          schsdk.NormalJobInfo // job description provided at submission
	Files         jobmod.JobFiles      // files the job needs
	TargetCCID    schsdk.CCID          // ID of the computing center that will run this job
	OutputPath    string               // output path of the program results; a relative path that must be prefixed with the RemoteBase recorded in the CDS database to form the full path
	SubType       string               // distinguishes sub-types of normal jobs
	ECSInstanceID schsdk.ECSInstanceID // ECS instance ID, used when data preprocessing and model fine-tuning must reuse the same machine
}

func NewNormalJob(info schsdk.NormalJobInfo) *NormalJob {
	return &NormalJob{
		Info: info,
	}
}

func (j *NormalJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

func (j *NormalJob) Dump() jobmod.JobBodyDump {
	return &jobmod.NormalJobDump{
		Files:      j.Files,
		TargetCCID: j.TargetCCID,
	}
}
@@ -0,0 +1,27 @@

package job

import (
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type PCMJob struct {
	Info  schsdk.PCMJobInfo // job description provided at submission
	Files jobmod.JobFiles   // files the job needs
}

func NewPCMJob(info schsdk.PCMJobInfo) *PCMJob {
	return &PCMJob{
		Info: info,
	}
}

func (j *PCMJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

func (j *PCMJob) Dump() jobmod.JobBodyDump {
	return &jobmod.NormalJobDump{
		Files: j.Files,
	}
}
@@ -0,0 +1,239 @@

package state

import (
	"context"
	"errors"
	"fmt"
	"sync"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
	"gitlink.org.cn/cloudream/scheduler/common/utils"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
)

type Adjusting struct {
	scheme       jobmod.JobScheduleScheme
	targetCCInfo schmod.ComputingCenter
}

func NewAdjusting(scheme jobmod.JobScheduleScheme) *Adjusting {
	return &Adjusting{
		scheme: scheme,
	}
}

func (s *Adjusting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewNormalJobReadyToExecute())
	}
}

func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.AdjustingDump{
		Scheme: s.scheme,
	}
}

func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	userID := cdssdk.UserID(1)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for cancel events
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}
	s.targetCCInfo = ccInfo

	logger.WithField("JobID", jo.JobID).Infof("job is scheduled to %v(%v)", ccInfo.Name, ccInfo.CCID)

	// The final target computing center is now fixed, so the output path can be generated
	// TODO UserID
	outputPath := utils.MakeJobOutputPath(userID, jo.JobID)

	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles

	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
		runningJob.OutputPath = outputPath
	case *job.MultiInstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
		runningJob.OutputPath = outputPath
	}

	wg := sync.WaitGroup{}
	wg.Add(3)

	var e1, e2, e3 error

	go func() {
		defer wg.Done()
		e1 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset)
		if e1 != nil {
			cancel()
		}
	}()

	go func() {
		defer wg.Done()
		e2 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code)
		if e2 != nil {
			cancel()
		}
	}()

	go func() {
		defer wg.Done()
		e3 = s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image)
		if e3 != nil {
			cancel()
		}
	}()

	wg.Wait()

	return errors.Join(e1, e2, e3)
}

func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	if scheme.Action == jobmod.ActionMove {
		logger.Debugf("begin move package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)

		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		fut := taskStatus.Receive()
		status := <-fut.Chan()
		moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		return nil
	}

	if scheme.Action == jobmod.ActionLoad {
		logger.Debugf("begin load package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)

		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("loading package: %w", err)
		}

		fut := taskStatus.Receive()
		status := <-fut.Chan()

		loadStatus := status.Value.Status.(*exectsk.StorageLoadPackageStatus)
		if loadStatus.Error != "" {
			return fmt.Errorf("loading package: %s", loadStatus.Error)
		}

		// file.PackagePath = loadStatus.PackagePath // TODO decide which side produces the path

		return nil
	}

	return nil
}

func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	if scheme.Action == jobmod.ActionImportImage {
		// TODO the image file location needs to be redesigned
		return fmt.Errorf("not implemented yet")

		if file.PackageID == nil {
			return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		}

		// TODO UserID
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}

		fut := taskStatus.Receive()
		status := <-fut.Chan()

		moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		// TODO UserID
		pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}

		if len(pkgObjs.Objects) == 0 {
			return fmt.Errorf("no object in the package which will be imported")
		}

		if len(pkgObjs.Objects) > 1 {
			return fmt.Errorf("there must be only 1 object in the package which will be imported")
		}

		// taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
		// if err != nil {
		// 	return fmt.Errorf("moving package: %w", err)
		// }

		// fut2 := taskStatus2.Receive()
		// status2 := <-fut2.Chan()
		// if err != nil {
		// 	return fmt.Errorf("uploading image: %w", err)
		// }

		// uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
		// if uploadStatus.Error != "" {
		// 	return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		// }

		// // TODO image name
		// err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		// if err != nil {
		// 	return fmt.Errorf("creating image info: %w", err)
		// }

		return nil
	}

	return nil
}
@@ -0,0 +1,55 @@

package state

import (
	"reflect"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
)

type Completed struct {
	err error
}

func SuccessComplete() *Completed {
	return &Completed{}
}

func FailureComplete(err error) *Completed {
	return &Completed{err: err}
}

func (c *Completed) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO Consider persisting the execution record to the database
	if c.err == nil {
		c.handleSuccess(rtx, jo)
	} else {
		c.handleFailed(rtx, jo)
	}
}

func (s *Completed) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	err := ""
	if s.err != nil {
		err = s.err.Error()
	}
	return &jobmod.CompletedDump{
		Error: err,
	}
}

func (c *Completed) handleSuccess(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	logger.WithField("JobID", job.JobID).Infof("job completed successfully")
	rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
	rtx.Mgr.JobCompleted(job)
}

func (c *Completed) handleFailed(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	logger.
		WithField("JobID", job.JobID).
		WithField("LastState", reflect.TypeOf(rtx.LastState).String()).
		Infof("job failed with: %v", c.err)
	rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
	rtx.Mgr.JobCompleted(job)
}
@ -0,0 +1,522 @@
|
|||
package state
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/executormgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
|
||||
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
|
||||
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/samber/lo"
|
||||
"gitlink.org.cn/cloudream/common/pkgs/future"
|
||||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||||
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
|
||||
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
|
||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
|
||||
|
||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
|
||||
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
|
||||
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||||
)
|
||||
|
||||
type JobExecuting struct {
|
||||
lastStatus pcmsdk.TaskStatus
|
||||
}
|
||||
|
||||
func NewNormalJobExecuting() *JobExecuting {
|
||||
return &JobExecuting{
|
||||
lastStatus: "Begin",
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
|
||||
err := s.do(rtx, jo)
|
||||
if err != nil {
|
||||
rtx.Mgr.ChangeState(jo, FailureComplete(err))
|
||||
} else {
|
||||
rtx.Mgr.ChangeState(jo, SuccessComplete())
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
|
||||
return &jobmod.NormalJobExecutingDump{
|
||||
TaskStatus: s.lastStatus,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
|
||||
// TODO UserID
|
||||
userID := cdssdk.UserID(1)
|
||||
err := error(nil)
|
||||
|
||||
switch runningJob := jo.Body.(type) {
|
||||
case *job.NormalJob:
|
||||
switch runningJob.SubType {
|
||||
case schsdk.JobTypeNormal: // 普通任务
|
||||
pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.DefCtx(), runningJob.Files.Image.ImageID, runningJob.TargetCCID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting pcm image info: %w", err)
|
||||
}
|
||||
ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.DefCtx(), runningJob.TargetCCID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting computing center resource: %w", err)
|
||||
}
|
||||
if len(ress) == 0 {
|
||||
return fmt.Errorf("no resource found at computing center %v", runningJob.TargetCCID)
|
||||
}
|
||||
|
||||
ccInfo, _, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting storage info: %w", err)
|
||||
}
|
||||
dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
|
||||
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
|
||||
err = s.submitNormalTask(rtx, cmd, envs, *ccInfo, pcmImgInfo, ress[0].PCMResourceID)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
}
|
||||
|
||||
case schsdk.JobTypeDataPreprocess: // 数据预处理
|
||||
ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting storage info: %w", err)
|
||||
}
|
||||
dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
|
||||
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
|
||||
instID, err := s.submitDataPreprocessTask(rtx, cmd, envs, *ccInfo, getStg.StorageID, userID)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
}
|
||||
runningJob.ECSInstanceID = schsdk.ECSInstanceID(instID)
|
||||
|
||||
case schsdk.JobTypeFinetuning: // 模型微调
|
||||
ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting storage info: %w", err)
|
||||
}
|
||||
dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
|
||||
// 将整理的数据集提交到OSS
|
||||
if runningJob.Files.Dataset.ECSInstanceID != "" {
|
||||
logger.Infof("instance id: %v", runningJob.ECSInstanceID)
|
||||
dataSetPath, err = loadDatasetPackage(userID, runningJob.Files.Dataset.PackageID, getStg.StorageID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("loading dataset package: %w", err)
|
||||
}
|
||||
}
|
||||
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
|
||||
err = s.submitFinetuningTask(userID, rtx, cmd, envs, *ccInfo, getStg.StorageID, runningJob)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
case *job.InstanceJob: // 推理任务
|
||||
ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting storage info: %w", err)
|
||||
}
|
||||
dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
|
||||
_, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
|
||||
err = s.submitInstanceTask(rtx, jo, runningJob, *ccInfo, getStg.StorageID, userID, envs)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
// 创建失败,从多实例任务中删除
|
||||
postDeleteInstanceEvent(rtx, jo, runningJob)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func getDataSetPathByID(packageID cdssdk.PackageID) string {
|
||||
// TODO 临时使用,这个路径应该来自于CDS
|
||||
dataSetPath := filepath.Join("packages", "1", fmt.Sprintf("%v", packageID))
|
||||
return dataSetPath
|
||||
}
|
||||
|
||||
func loadDatasetPackage(userID cdssdk.UserID, packageID cdssdk.PackageID, storageID cdssdk.StorageID) (string, error) {
|
||||
stgCli, err := schglb.CloudreamStoragePool.Acquire()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer schglb.CloudreamStoragePool.Release(stgCli)
|
||||
|
||||
loadPackageResp, err := stgCli.StorageLoadPackage(cdsapi.StorageLoadPackageReq{
|
||||
UserID: userID,
|
||||
PackageID: packageID,
|
||||
StorageID: storageID,
|
||||
})
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
logger.Info("load pacakge path: " + loadPackageResp.FullPath)
|
||||
return loadPackageResp.FullPath, nil
|
||||
}
|
||||
|
||||
func (s *JobExecuting) submitNormalTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, pcmImgInfo schmod.PCMImage, resourceID pcmsdk.ResourceID) error {
|
||||
task, err := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask(
|
||||
ccInfo.PCMParticipantID,
|
||||
pcmImgInfo.PCMImageID,
|
||||
// TODO 选择资源的算法
|
||||
resourceID,
|
||||
cmd,
|
||||
envs,
|
||||
// params, TODO params不应该是kv数组,而应该是字符串数组
|
||||
[]schsdk.KVPair{},
|
||||
), ccInfo)
|
||||
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
taskFut := task.Receive()
|
||||
for {
|
||||
msg := <-taskFut.Chan()
|
||||
tskStatus := msg.Value.Status.(*exetsk.SubmitTaskStatus)
|
||||
|
||||
if tskStatus.Status != s.lastStatus {
|
||||
logger.Infof("task %s -> %s", s.lastStatus, tskStatus.Status)
|
||||
}
|
||||
s.lastStatus = tskStatus.Status
|
||||
|
||||
switch tskStatus.Status {
|
||||
case pcmsdk.TaskStatusSuccess:
|
||||
return nil
|
||||
|
||||
case "Completed":
|
||||
return nil
|
||||
|
||||
case pcmsdk.TaskStatusFailed:
|
||||
return fmt.Errorf("task failed")
|
||||
}
|
||||
|
||||
taskFut = task.Receive()
|
||||
}
|
||||
}
|
||||
|
||||
func (s *JobExecuting) submitDataPreprocessTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, userID cdssdk.UserID) (string, error) {
|
||||
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
return "", fmt.Errorf("getting object storage info: %w", err)
|
||||
}
|
||||
|
||||
task, err := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSchedulerDataPreprocess(
|
||||
userID,
|
||||
cmd,
|
||||
envs,
|
||||
objectStorage,
|
||||
), ccInfo)
|
||||
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
return "", err
|
||||
}
|
||||
|
||||
taskFut := task.Receive()
|
||||
msg := <-taskFut.Chan()
|
||||
tskStatus := msg.Value.Status.(*exetsk.SchedulerDataPreprocessStatus)
|
||||
|
||||
if tskStatus.Error != nil {
|
||||
logger.Error(tskStatus.Error.Error())
|
||||
return "", tskStatus.Error
|
||||
}
|
||||
|
||||
return tskStatus.InstanceID, nil
|
||||
}
|
||||
|
||||
func (s *JobExecuting) submitFinetuningTask(userID cdssdk.UserID, rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, runningJob *job.NormalJob) error {
|
||||
|
||||
objectStorage, modelInfo, err := getModelInfoAndObjectStorage(rtx, runningJob.Info.ModelJobInfo.ModelID, storageID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting model info and object storage: %w", err)
|
||||
}
|
||||
|
||||
task, err := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSchedulerModelFinetuning(
|
||||
userID,
|
||||
cmd,
|
||||
*objectStorage,
|
||||
*modelInfo,
|
||||
envs,
|
||||
string(runningJob.Files.Dataset.ECSInstanceID),
|
||||
), ccInfo)
|
||||
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
taskFut := task.Receive()
|
||||
msg := <-taskFut.Chan()
|
||||
tskStatus := msg.Value.Status.(*exetsk.SchedulerModelFinetuningStatus)
|
||||
|
||||
if tskStatus.Error != nil {
|
||||
logger.Error(tskStatus.Error.Error())
|
||||
return tskStatus.Error
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *JobExecuting) submitInstanceTask(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runningJob *job.InstanceJob, ccInfo schmod.ComputingCenter,
|
||||
storageID cdssdk.StorageID, userID cdssdk.UserID, envs []schsdk.KVPair) error {
|
||||
|
||||
modelJobInfo := runningJob.Info.ModelJobInfo
|
||||
|
||||
objectStorage, modelInfo, err := getModelInfoAndObjectStorage(rtx, modelJobInfo.ModelID, storageID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting model info and object storage: %w", err)
|
||||
}
|
||||
|
||||
// 发送扩容任务
|
||||
ecs := exetsk.NewScheduleCreateECS(
|
||||
userID,
|
||||
runningJob.Info.Runtime.Command+"\\n"+modelJobInfo.Command,
|
||||
*objectStorage,
|
||||
*modelInfo,
|
||||
envs,
|
||||
)
|
||||
task, err := rtx.Mgr.ExecMgr.StartTask(ecs, ccInfo)
|
||||
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
waitFut := event.BeginWaitType[*event.Update](rtx.EventSet)
|
||||
taskFut := task.Receive()
|
||||
|
||||
for {
|
||||
select {
|
||||
case v1 := <-waitFut.Chan():
|
||||
// 对任务进行更新操作
|
||||
client, err := executormgr.ExecutorPool.AcquireByUrl(ccInfo.ExecutorURL)
|
||||
if err != nil {
|
||||
return fmt.Errorf("getting executor client: %w", err)
|
||||
}
|
||||
evt := v1.Value.(*event.Update)
|
||||
operateResp, err := client.OperateTask(executor.NewTaskOperateInfo(task.ID(), evt.Operate, evt.Runtime))
|
||||
if err != nil {
|
||||
return fmt.Errorf("operate task: %w", err)
|
||||
}
|
||||
|
||||
evt.Result.SetValue(event.UpdateResult{
|
||||
Err: operateResp.Err,
|
||||
})
|
||||
|
||||
if operateResp.Err != nil {
|
||||
return fmt.Errorf("operate task: %w", operateResp.Err)
|
||||
}
|
||||
|
||||
// 持续等待
|
||||
waitFut = event.BeginWaitType[*event.Update](rtx.EventSet)
|
||||
case msg := <-taskFut.Chan():
|
||||
switch v2 := msg.Value.Status.(type) {
|
||||
case *exetsk.ScheduleCreateECSStatus:
|
||||
if v2.Error != "" {
|
||||
logger.Error("update task fail, error: " + v2.Error)
|
||||
if v2.Operate == schsdk.CreateECS || v2.Operate == schsdk.Invalid {
|
||||
// 创建失败或者检测不可用,从多实例任务中删除
|
||||
v2.Operate = schsdk.DestroyECS
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
switch v2.Operate {
|
||||
case schsdk.CreateECS:
|
||||
// 扩容任务,将结果放到池子中
|
||||
node := schsdk.NodeInfo{
|
||||
InstanceID: jo.JobID,
|
||||
Address: schsdk.Address(v2.Result),
|
||||
Status: schsdk.RunECS,
|
||||
}
|
||||
|
||||
rtx.Mgr.NodeSvc.SetNodeData(jo.JobSetID, modelJobInfo, node)
|
||||
logger.Infof("node expansion: %v", v2.Result)
|
||||
case schsdk.DestroyECS:
|
||||
// 缩容任务,从节点列表中移除
|
||||
rtx.Mgr.NodeSvc.RemoveNodeFromRunningModels(modelJobInfo, jo.JobID)
|
||||
// 从多实例任务中删除
|
||||
postDeleteInstanceEvent(rtx, jo, runningJob)
|
||||
case schsdk.PauseECS:
|
||||
// 更新节点状态
|
||||
rtx.Mgr.NodeSvc.UpdateNodeFromRunningModels(modelJobInfo, jo.JobID, schsdk.PauseECS)
|
||||
case schsdk.RunECS:
|
||||
// 更新节点状态
|
||||
rtx.Mgr.NodeSvc.UpdateNodeFromRunningModels(modelJobInfo, jo.JobID, schsdk.RunECS)
|
||||
case schsdk.OperateServer:
|
||||
println()
|
||||
case schsdk.GPUMonitor:
|
||||
rtx.Mgr.NodeSvc.SetNodeUsageRateInfo(jo.JobID, v2.Result)
|
||||
}
|
||||
|
||||
case error:
|
||||
fmt.Println("Received error:", v2.Error())
|
||||
default:
|
||||
fmt.Println("Received unexpected type")
|
||||
}
|
||||
|
||||
// 持续接收
|
||||
taskFut = task.Receive()
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getModelInfoAndObjectStorage(rtx jobmgr.JobStateRunContext, modelID schsdk.ModelID, storageID cdssdk.StorageID) (*schmod.ObjectStorage, *schmod.ModelResource, error) {
|
||||
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
return nil, nil, fmt.Errorf("getting object storage info: %w", err)
|
||||
}
|
||||
|
||||
// 先从数据库中查询是否已经预置了模型
|
||||
modelInfo, err := rtx.Mgr.DB.Models().GetModelByID(rtx.Mgr.DB.DefCtx(), modelID, objectStorage.ID)
|
||||
if &modelInfo == nil {
|
||||
logger.Error(err.Error())
|
||||
return nil, nil, fmt.Errorf("the model is not exists: %w", err)
|
||||
}
|
||||
if err != nil {
|
||||
logger.Error(err.Error())
|
||||
return nil, nil, fmt.Errorf("getting model info info: %w", err)
|
||||
}
|
||||
|
||||
return &objectStorage, &modelInfo, nil
|
||||
}
|
||||
|
||||
func postDeleteInstanceEvent(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runningJob *job.InstanceJob) {
|
||||
deleteInfo := event.InstanceDeleteInfo{
|
||||
InstanceID: jo.JobID,
|
||||
}
|
||||
fut := future.NewSetValue[event.OperateInstanceResult]()
|
||||
rtx.Mgr.PostEvent(runningJob.ParentJobID, event.NewInstanceOperate(&deleteInfo, fut))
|
||||
_, _ = fut.Wait(context.TODO())
|
||||
}
|
||||
|
||||
// Determine whether the computing center supports configuring environment
// variables; if it does not, the command and the KEY=VALUE pairs are collected
// as arguments for the bootstrap script instead.
func getRuntimeCommand(runtime schsdk.JobRuntimeInfo, dataSetPath string, outputPath string, remoteBase string, ccInfo schmod.ComputingCenter) (string, []schsdk.KVPair) {
	var envs []schsdk.KVPair
	var params []string
	var cmd string

	envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataInEnv, Value: filepath.Join(remoteBase, dataSetPath)})
	envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataOutEnv, Value: filepath.Join(remoteBase, outputPath)})
	envs = append(envs, runtime.Envs...)
	switch boot := ccInfo.Bootstrap.(type) {
	case *schsdk.DirectBootstrap:
		cmd = runtime.Command
	case *schsdk.NoEnvBootstrap:
		cmd = boot.ScriptFileName
		params = append(params, runtime.Command)
		envMap := lo.Map(envs, func(env schsdk.KVPair, _ int) string {
			return fmt.Sprintf("%s=%s", env.Key, env.Value)
		})
		params = append(params, envMap...)
		// TODO: params is built here but is not yet returned or appended to cmd
	default:
		cmd = runtime.Command
	}

	return cmd, envs
}

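// Illustrative sketch (not part of this commit; env key names are assumptions):
// for a center whose Bootstrap is *schsdk.NoEnvBootstrap with ScriptFileName
// "run.sh", a runtime whose Command is "python train.py" would yield
//
//	cmd, envs := getRuntimeCommand(runtime, "data/in", "data/out", "/mnt/job", ccInfo)
//	// cmd  == "run.sh"
//	// envs == [{JOB_DATA_IN /mnt/job/data/in} {JOB_DATA_OUT /mnt/job/data/out} ...]
//
// i.e. the script receives the real command plus each KEY=VALUE pair as plain
// arguments, since the center cannot inject environment variables itself.
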
func getCCInfoAndStgInfo(rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, userID cdssdk.UserID) (*schmod.ComputingCenter, *cdsapi.StorageGetResp, error) {
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), targetCCID)
	if err != nil {
		return nil, nil, fmt.Errorf("getting computing center info: %w", err)
	}

	stgCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return nil, nil, fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(stgCli)
	getStg, err := stgCli.StorageGet(cdsapi.StorageGet{
		UserID:    userID,
		StorageID: ccInfo.CDSStorageID,
	})
	if err != nil {
		return nil, nil, fmt.Errorf("request to cds: %w", err)
	}

	return &ccInfo, getStg, nil
}

type DataReturnJobExecuting struct {
}

func NewDataReturnJobExecuting() *DataReturnJobExecuting {
	return &DataReturnJobExecuting{}
}

func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, SuccessComplete())
	}
}

func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.DataReturnExecutingDump{}
}

func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	reJob := jo.Body.(*job.DataReturnJob)
	userID := cdssdk.UserID(1) // TODO: user ID

	log := logger.WithType[JobExecuting]("State").WithField("JobID", jo.JobID)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for cancel events
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), reJob.TargetJobCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	packageName := utils.MakeResourcePackageName(reJob.TargetJobID)
	logger.Info("TargetJobOutputPath: " + reJob.TargetJobOutputPath + ", and packageName: " + packageName)
	// Fixed delay to let the target job's output files settle before packaging
	time.Sleep(30 * time.Second)
	task, err := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage(
		userID,
		ccInfo.CDSStorageID,
		reJob.TargetJobOutputPath,
		reJob.Info.BucketID,
		packageName,
	), ccInfo)
	if err != nil {
		log.Error(err.Error())
		return err
	}

	fut := task.Receive()
	status := <-fut.Chan()
	tskStatus := status.Value.Status.(*exetsk.StorageCreatePackageStatus)
	if tskStatus.Error != "" {
		return fmt.Errorf("creating package: %s", tskStatus.Error)
	}

	log.Infof("the outputs of job %v have been packaged as %v(%v)", reJob.TargetJobID, packageName, tskStatus.PackageID)

	reJob.DataReturnPackageID = tskStatus.PackageID
	return nil
}

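// Illustrative sketch (an assumption drawn from the usage above, not an API
// reference): executor tasks follow a receive-a-future pattern, and each task
// type carries its own status struct with an Error string:
//
//	task, err := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage(/* ... */), ccInfo)
//	fut := task.Receive()
//	status := <-fut.Chan()
//	st := status.Value.Status.(*exetsk.StorageCreatePackageStatus)
//	if st.Error != "" { /* the task failed remotely */ }
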
@ -0,0 +1,54 @@
package state

import (
	"fmt"

	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)

type MakingAdjustScheme struct {
}

func NewMakingAdjustScheme() *MakingAdjustScheme {
	return &MakingAdjustScheme{}
}

func (s *MakingAdjustScheme) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	scheme, err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
		return
	}
	if scheme == nil {
		// do currently produces no scheme (the advisor flow below is disabled)
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("no adjust scheme produced")))
		return
	}
	rtx.Mgr.ChangeState(jo, NewAdjusting(*scheme))
}

func (s *MakingAdjustScheme) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) (*jobmod.JobScheduleScheme, error) {
	// The advisor-based flow is kept for reference until the adjust-scheme task is wired up:
	//ctx, cancel := context.WithCancel(context.Background())
	//defer cancel()
	//
	//// Listen for cancel events
	//go func() {
	//	event.WaitType[*event.Cancel](ctx, rtx.EventSet)
	//	cancel()
	//}()
	//
	//wt := rtx.Mgr.AdvMgr.StartTask(advtsk.NewMakeAdjustScheme(jo.Dump(rtx, jo, s)))
	//defer wt.Close()
	//
	//status, err := wt.Receive(ctx)
	//if err != nil {
	//	return nil, fmt.Errorf("making adjust scheme: %w", err)
	//}
	//
	//mkStatus := status.(*advtsk.MakeAdjustSchemeStatus)
	//if mkStatus.Error != "" {
	//	return nil, fmt.Errorf("making adjust scheme: %s", mkStatus.Error)
	//}
	//
	//return &mkStatus.Scheme, nil

	return nil, nil
}

func (s *MakingAdjustScheme) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MakeingAdjustSchemeDump{}
}

@ -0,0 +1,66 @@
package state

import (
	"context"
	"fmt"

	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler"
	"gitlink.org.cn/cloudream/scheduler/common/utils"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
)

type MultiInstanceInit struct {
}

func NewMultiInstanceInit() *MultiInstanceInit {
	return &MultiInstanceInit{}
}

func (s *MultiInstanceInit) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	s.do(rtx, job)
}

func (s *MultiInstanceInit) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	multInstJob := jo.Body.(*job.MultiInstanceJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID())

	instJobInfo := &schsdk.InstanceJobInfo{
		Type:         schsdk.JobTypeInstance,
		LocalJobID:   newLocalJobID,
		Files:        multInstJob.Info.Files,
		Runtime:      multInstJob.Info.Runtime,
		Resources:    multInstJob.Info.Resources,
		ModelJobInfo: multInstJob.Info.ModelJobInfo,
	}

	files := jobmod.JobFiles{
		Dataset: multInstJob.Files.Dataset,
		Code:    multInstJob.Files.Code,
		Image:   multInstJob.Files.Image,
	}

	// Generate the pre-scheduling scheme for the first instance, as createInstance does
	jobSchedule, _, err := multInstJob.PreScheduler.ScheduleJob(instJobInfo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("pre-scheduling instance: %w", err)))
		return
	}

	// Create the instance and run it
	instanceJob := job.NewInstanceJob(*instJobInfo, files, jo.JobID)
	jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreScheduling(*jobSchedule))

	// Record the new instance's job ID in the multi-instance job
	multInstJob.SubJobs = append(multInstJob.SubJobs, jobID)

	rtx.Mgr.ChangeState(jo, NewMultiInstanceRunning(prescheduler.NewDefaultPreScheduler()))
}

func (s *MultiInstanceInit) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MultiInstCreateInitDump{}
}

@ -0,0 +1,206 @@
package state

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"gitlink.org.cn/cloudream/common/pkgs/future"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler"
	"gitlink.org.cn/cloudream/scheduler/common/utils"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
)

type MultiInstanceRunning struct {
	preScheduler prescheduler.PreScheduler
}

func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInstanceRunning {
	return &MultiInstanceRunning{
		preScheduler: preScheduler,
	}
}

func (s *MultiInstanceRunning) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	s.do(rtx, job)
}

func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	multInstJob := jo.Body.(*job.MultiInstanceJob)

	go pollingInstance(rtx, multInstJob)

	waitFut := event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
	for {
		chanValue := <-waitFut.Chan()
		instanceInfo := chanValue.Value.(*event.InstanceOperate)
		instanceFuture := instanceInfo.Result
		logger.Info("an instance operate event arrived")
		waitFut = event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)

		switch info := instanceInfo.Info.(type) {
		case *event.InstanceCreateInfo:
			createInstance(rtx, info, s.preScheduler, jo, multInstJob, instanceFuture)
		case *event.InstanceUpdateInfo:
			subJobs := info.Info.SubJobs
			// Fine-tuning updates need special handling
			if info.Info.UpdateType == schsdk.FineTuning {
				multInstJob.Info.ModelJobInfo.Command = info.Info.Runtime.Command
				// Drop any existing fine-tuning output path from the stored config to avoid conflicts
				for i := 0; i < len(multInstJob.Info.Runtime.Envs); i++ {
					if multInstJob.Info.Runtime.Envs[i].Key == schsdk.FinetuningOutEnv {
						multInstJob.Info.Runtime.Envs = append(multInstJob.Info.Runtime.Envs[:i], multInstJob.Info.Runtime.Envs[i+1:]...)
						i--
					}
				}
				multInstJob.Info.Runtime.Envs = append(multInstJob.Info.Runtime.Envs, info.Info.Runtime.Envs...)
				subJobs = multInstJob.SubJobs
			}
			updateInstance(rtx, info, subJobs, instanceFuture)
		case *event.InstanceDeleteInfo:
			deleteInstance(multInstJob, info.InstanceID)
			// Resolve the future so postDeleteInstanceEvent does not block forever
			instanceFuture.SetValue(event.OperateInstanceResult{})
		}
	}
}

func deleteInstance(multiJob *job.MultiInstanceJob, instanceID schsdk.JobID) {
	for i := 0; i < len(multiJob.SubJobs); i++ {
		// Once the instanceID is found, remove it from the list
		if multiJob.SubJobs[i] == instanceID {
			multiJob.SubJobs = append(multiJob.SubJobs[:i], multiJob.SubJobs[i+1:]...)
			break
		}
	}
}

// Poll every instance periodically, e.g. to collect GPU usage information
func pollingInstance(rtx jobmgr.JobStateRunContext, multiJob *job.MultiInstanceJob) {
	for {
		time.Sleep(time.Second * 30)
		for i := 0; i < len(multiJob.SubJobs); i++ {
			instanceID := multiJob.SubJobs[i]
			logger.Info("polling instanceID: " + string(instanceID))
			go func() {
				fut := future.NewSetValue[event.UpdateResult]()
				rtx.Mgr.PostEvent(instanceID, event.NewUpdate(schsdk.JobRuntimeInfo{}, schsdk.GPUMonitor, fut))
				if _, err := fut.Wait(context.TODO()); err != nil {
					logger.Error(err.Error())
				}
			}()
		}
	}
}

func updateInstance(rtx jobmgr.JobStateRunContext, updateInfo *event.InstanceUpdateInfo, subJobs []schsdk.JobID, updateInstanceFuture event.OperateInstanceFuture) {
	var failJobs []string
	var failLock sync.Mutex
	var wg sync.WaitGroup

	for i := 0; i < len(subJobs); i++ {
		// Ask each instance to apply the update
		instanceID := subJobs[i]
		wg.Add(1)
		go func() {
			defer wg.Done()
			fut := future.NewSetValue[event.UpdateResult]()
			rtx.Mgr.PostEvent(instanceID, event.NewUpdate(updateInfo.Info.Runtime, schsdk.RestartServer, fut))
			if _, err := fut.Wait(context.TODO()); err != nil {
				logger.Error(err.Error())
				failLock.Lock()
				failJobs = append(failJobs, string(instanceID))
				failLock.Unlock()
			}
		}()
	}

	wg.Wait()

	if len(failJobs) == 0 {
		updateInstanceFuture.SetValue(event.OperateInstanceResult{
			Err: nil,
		})
		return
	}

	// Report the instances that failed to update
	result := strings.Join(failJobs, ",")
	updateInstanceFuture.SetValue(event.OperateInstanceResult{
		OperateResult: result,
		Err:           fmt.Errorf("failed to update instances: %s", result),
	})
}

func createInstance(rtx jobmgr.JobStateRunContext, info *event.InstanceCreateInfo, preScheduler prescheduler.PreScheduler, jo *jobmgr.Job, multInstJob *job.MultiInstanceJob, opFut event.OperateInstanceFuture) {
	dataSet := info.DataSet

	// For model scale-out tasks, reuse the parent job's dataset directly.
	// TODO: the original guard here (&multInstJob.Info.ModelJobInfo != nil) is
	// always true, so the parent dataset is always used; a real emptiness check
	// for ModelJobInfo is still needed.
	dataSet = multInstJob.Info.Files.Dataset

	// Build the InstanceJobInfo
	infoFiles := schsdk.JobFilesInfo{
		Dataset: dataSet,
		Code:    multInstJob.Info.Files.Code,
		Image:   multInstJob.Info.Files.Image,
	}

	newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID())

	instJobInfo := &schsdk.InstanceJobInfo{
		Type:         schsdk.JobTypeInstance,
		LocalJobID:   newLocalJobID,
		Files:        infoFiles,
		Runtime:      multInstJob.Info.Runtime,
		Resources:    multInstJob.Info.Resources,
		ModelJobInfo: multInstJob.Info.ModelJobInfo,
	}

	files := jobmod.JobFiles{
		Code:  multInstJob.Files.Code,
		Image: multInstJob.Files.Image,
	}

	// Generate the pre-scheduling scheme and the file upload scheme
	jobSchedule, filesUploadScheme, err := preScheduler.ScheduleJob(instJobInfo)
	if err != nil {
		opFut.SetError(err)
		return
	}

	// Create the instance and run it
	instanceJob := job.NewInstanceJob(*instJobInfo, files, jo.JobID)
	jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreScheduling(*jobSchedule))

	// Record the new instance's job ID in the multi-instance job
	multInstJob.SubJobs = append(multInstJob.SubJobs, jobID)

	// Return the instance ID and the file upload scheme to the caller
	opFut.SetValue(event.OperateInstanceResult{
		JobID:             jobID,
		FilesUploadScheme: *filesUploadScheme,
	})
}

func (s *MultiInstanceRunning) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MultiInstCreateRunningDump{}
}

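// Illustrative sketch (an assumption mirroring how postDeleteInstanceEvent and
// MultiInstanceUpdate drive this loop): a caller scales out by posting an
// InstanceCreateInfo to the multi-instance job and waiting on the future:
//
//	createInfo := event.InstanceCreateInfo{DataSet: dataSet}
//	fut := future.NewSetValue[event.OperateInstanceResult]()
//	rtx.Mgr.PostEvent(multiInstanceJobID, event.NewInstanceOperate(&createInfo, fut))
//	result, err := fut.Wait(context.TODO())
//	// result.JobID is the new instance's job ID; result.FilesUploadScheme tells
//	// the caller where to upload local files.
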
@ -0,0 +1,124 @@
package state

import (
	"context"
	"fmt"

	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"

	"gitlink.org.cn/cloudream/common/pkgs/future"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type MultiInstanceUpdate struct {
	originalJob jobmod.JobDump
}

func NewMultiInstanceUpdate(originalJob jobmod.JobDump) *MultiInstanceUpdate {
	return &MultiInstanceUpdate{
		originalJob: originalJob,
	}
}

func (s *MultiInstanceUpdate) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	err := s.do(rtx, job)
	if err != nil {
		logger.Error("update multi instance failed: " + err.Error())
		return
	}
}

func (s *MultiInstanceUpdate) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	updateJob := jo.Body.(*job.UpdateMultiInstanceJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for cancel events
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	var fullPath string
	if updateJob.Info.UpdateType == schsdk.FineTuning {
		var dtrJob *job.DataReturnJob
		// Wait for the data-return job to complete
		if rt, ok := updateJob.Info.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok {
			evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool {
				return val.Job.GetInfo().GetLocalJobID() == rt.DataReturnLocalJobID
			})
			if !ok {
				return jobmgr.ErrJobCancelled
			}
			if evt.Err != nil {
				return fmt.Errorf("dependent job %s failed", evt.Job.JobID)
			}
			dtrJob, ok = evt.Job.Body.(*job.DataReturnJob)
			if !ok {
				return fmt.Errorf("job %s is not a DataReturn job (it is %T)", evt.Job.JobID, evt.Job)
			}
		}
		if dtrJob == nil {
			return fmt.Errorf("fine-tuning update requires a data-return dataset")
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), dtrJob.TargetJobCCID)
		if err != nil {
			return fmt.Errorf("getting computing center info: %w", err)
		}

		userID := cdssdk.UserID(1)
		getStg, err := stgCli.StorageGet(cdsapi.StorageGet{
			UserID:    userID,
			StorageID: ccInfo.CDSStorageID,
		})
		if err != nil {
			return fmt.Errorf("getting storage info: %w", err)
		}

		loadPackageResp, err := stgCli.StorageLoadPackage(cdsapi.StorageLoadPackageReq{
			UserID:    userID,
			PackageID: dtrJob.DataReturnPackageID,
			StorageID: getStg.StorageID,
		})
		if err != nil {
			return fmt.Errorf("loading package: %w", err)
		}
		logger.Info("load package path: " + loadPackageResp.FullPath)
		fullPath = loadPackageResp.FullPath
	}

	// Post an event to update every instance
	updateJob.Info.Runtime.Envs = append(updateJob.Info.Runtime.Envs, schsdk.KVPair{Key: schsdk.FinetuningOutEnv, Value: fullPath})
	updateInfo := event.InstanceUpdateInfo{
		Info: updateJob.Info,
	}
	fut := future.NewSetValue[event.OperateInstanceResult]()
	rtx.Mgr.PostEvent(s.originalJob.JobID, event.NewInstanceOperate(&updateInfo, fut))

	result, err := fut.Wait(context.TODO())
	if err != nil {
		return err
	}

	if result.Err != nil {
		return fmt.Errorf("update instance failed: %s", result.OperateResult)
	}

	logger.Info("update instance success")
	return nil
}

func (s *MultiInstanceUpdate) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MultiInstanceUpdateDump{}
}

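// Illustrative sketch (the path is hypothetical): a fine-tuning update waits for
// the data-return package, loads it onto the target center's storage, and then
// broadcasts the restart with the loaded path injected as the output env:
//
//	updateJob.Info.Runtime.Envs == [..., {Key: schsdk.FinetuningOutEnv, Value: "/mnt/cds/pkg_42"}]
//
// MultiInstanceRunning strips any stale FinetuningOutEnv entry before appending
// these envs, so the newest path always wins.
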
@ -0,0 +1,287 @@
package state

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"

	"gitlink.org.cn/cloudream/common/pkgs/logger"

	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	schmod "gitlink.org.cn/cloudream/scheduler/common/models"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
)

type PreScheduling struct {
	scheme       jobmod.JobScheduleScheme
	targetCCInfo schmod.ComputingCenter
}

func NewPreScheduling(scheme jobmod.JobScheduleScheme) *PreScheduling {
	return &PreScheduling{
		scheme: scheme,
	}
}

func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	logger.Info("start running PreScheduling, jobID: " + string(jo.JobID))

	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles

	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	case *job.MultiInstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for cancel events
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
		return
	}
	s.targetCCInfo = ccInfo

	// Schedule the dataset, code, and image concurrently; the first failure cancels the rest
	wg := sync.WaitGroup{}
	wg.Add(3)

	var e1, e2, e3 error

	go func() {
		defer wg.Done()
		e1 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset)
		if e1 != nil {
			cancel()
			logger.Debugf("dataset scheduling done, err: %v", e1)
		} else {
			logger.Debug("dataset scheduling done")
		}
	}()

	go func() {
		defer wg.Done()
		e2 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code)
		if e2 != nil {
			cancel()
			logger.Debugf("code scheduling done, err: %v", e2)
		} else {
			logger.Debug("code scheduling done")
		}
	}()

	go func() {
		defer wg.Done()
		e3 = s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image)
		if e3 != nil {
			cancel()
			logger.Debugf("image scheduling done, err: %v", e3)
		} else {
			logger.Debug("image scheduling done")
		}
	}()

	wg.Wait()

	allErr := errors.Join(e1, e2, e3)
	if allErr != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(allErr))
	} else {
		rtx.Mgr.ChangeState(jo, NewReadyToAdjust())
	}
}

func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.PreSchedulingDump{
		Scheme: s.scheme,
	}
}

func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		file.PackageID = evt.PackageID

	case *schsdk.PackageJobFileInfo:
		file.PackageID = info.PackageID

	case *schsdk.DataReturnJobFileInfo:
		return nil

	default:
		return fmt.Errorf("unknown dataset type: %T", info)
	}

	// The move/load actions are disabled for now; the original logic is kept for reference:
	//if scheme.Action == jobmod.ActionMove {
	//	logger.Debugf("begin move package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)
	//
	//	taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
	//	if err != nil {
	//		return fmt.Errorf("moving package: %w", err)
	//	}
	//
	//	fut := taskStatus.Receive()
	//	status := <-fut.Chan()
	//
	//	moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
	//	if moveStatus.Error != "" {
	//		return fmt.Errorf("moving package: %s", moveStatus.Error)
	//	}
	//
	//	return nil
	//}
	//
	//if scheme.Action == jobmod.ActionLoad {
	//	logger.Debugf("begin load package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)
	//
	//	taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
	//	if err != nil {
	//		return fmt.Errorf("loading package: %w", err)
	//	}
	//
	//	fut := taskStatus.Receive()
	//	status := <-fut.Chan()
	//
	//	loadStatus := status.Value.Status.(*exectsk.StorageLoadPackageStatus)
	//	if loadStatus.Error != "" {
	//		return fmt.Errorf("loading package: %s", loadStatus.Error)
	//	}
	//
	//	return nil
	//}

	return nil
}

func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		// The upload finished, so create an empty image record for it
		// TODO: image name
		imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.DefCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%d", time.Now().Unix()), time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}

		// Fill in the ImageID and PackageID
		file.ImageID = imgID
		file.PackageID = &evt.PackageID

	case *schsdk.ImageJobFileInfo:
		imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.DefCtx(), info.ImageID)
		if err != nil {
			return fmt.Errorf("getting image info: %w", err)
		}

		file.ImageID = imageInfo.ImageID
		file.PackageID = imageInfo.CDSPackageID
	}

	if scheme.Action == jobmod.ActionImportImage {
		// TODO: the image import flow needs to be redesigned
		return fmt.Errorf("not implemented")

		//if file.PackageID == nil {
		//	return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		//}
		//
		//// TODO: user ID
		//taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		//if err != nil {
		//	return fmt.Errorf("moving package: %w", err)
		//}
		//
		//fut := taskStatus.Receive()
		//status := <-fut.Chan()
		//
		//moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		//if moveStatus.Error != "" {
		//	return fmt.Errorf("moving package: %s", moveStatus.Error)
		//}
		//
		//stgCli, err := schglb.CloudreamStoragePool.Acquire()
		//if err != nil {
		//	return fmt.Errorf("new cloudream storage client: %w", err)
		//}
		//defer schglb.CloudreamStoragePool.Release(stgCli)
		//
		//// TODO: user ID
		//pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		//if err != nil {
		//	return fmt.Errorf("getting package objects: %w", err)
		//}
		//
		//if len(pkgObjs.Objects) == 0 {
		//	return fmt.Errorf("no object in the package which will be imported")
		//}
		//
		//if len(pkgObjs.Objects) > 1 {
		//	return fmt.Errorf("there must be only 1 object in the package which will be imported")
		//}
		//
		//taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
		//if err != nil {
		//	return fmt.Errorf("uploading image: %w", err)
		//}
		//
		//fut2 := taskStatus2.Receive()
		//status2 := <-fut2.Chan()
		//
		//uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
		//if uploadStatus.Error != "" {
		//	return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		//}
		//
		//// TODO: image name
		//err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.DefCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		//if err != nil {
		//	return fmt.Errorf("creating image info: %w", err)
		//}
		//
		//return nil
	}

	return nil
}

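// Illustrative sketch (an assumption: the real event is likely built by the
// upload pipeline, not constructed literally like this): the upload side is
// expected to post a LocalFileUploaded event once the user's file lands in
// storage, which is what unblocks doPackageScheduling/doImageScheduling above:
//
//	rtx.Mgr.PostEvent(jobID, &event.LocalFileUploaded{
//		LocalPath: "yuque_mind.jpeg", // must equal the LocalJobFileInfo.LocalPath
//		PackageID: pkgID,
//	})
//
// Per the uses above and in DataUpload, the event also carries Error and ObjectIDs.
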
@ -0,0 +1,75 @@
package state

import (
	"context"
	"fmt"

	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
)

type ReadyToAdjust struct {
}

func NewReadyToAdjust() *ReadyToAdjust {
	return &ReadyToAdjust{}
}

func (s *ReadyToAdjust) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewMakingAdjustScheme())
	}
}

func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles

	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for cancel events
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	// If the dataset comes from a data-return job, wait for that job to complete
	if rt, ok := jobFilesInfo.Dataset.(*schsdk.DataReturnJobFileInfo); ok {
		evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool {
			return val.Job.GetInfo().GetLocalJobID() == rt.DataReturnLocalJobID
		})
		if !ok {
			return jobmgr.ErrJobCancelled
		}
		if evt.Err != nil {
			return fmt.Errorf("dependent job %s failed", evt.Job.JobID)
		}
		rtJob, ok := evt.Job.Body.(*job.DataReturnJob)
		if !ok {
			return fmt.Errorf("job %s is not a DataReturn job (it is %T)", evt.Job.JobID, evt.Job)
		}

		jobFiles.Dataset.PackageID = rtJob.DataReturnPackageID
		jobFiles.Dataset.ECSInstanceID = rtJob.ECSInstanceID
	}

	return nil
}

func (s *ReadyToAdjust) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.ReadyToAdjustDump{}
}

@ -0,0 +1,38 @@
package state

import (
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)

type NormalJobReadyToExecute struct {
}

func NewNormalJobReadyToExecute() *NormalJobReadyToExecute {
	return &NormalJobReadyToExecute{}
}

func (s *NormalJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO: execution currently starts immediately
	rtx.Mgr.ChangeState(jo, NewNormalJobExecuting())
}

func (s *NormalJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobReadyToExecuteDump{}
}

type DataReturnJobReadyToExecute struct {
}

func NewDataReturnJobReadyToExecute() *DataReturnJobReadyToExecute {
	return &DataReturnJobReadyToExecute{}
}

func (s *DataReturnJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO: execution currently starts immediately
	rtx.Mgr.ChangeState(jo, NewDataReturnJobExecuting())
}

func (s *DataReturnJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.DataReturnReadyToExecuteDump{}
}

@ -0,0 +1,64 @@
package state

import (
	"context"
	"fmt"

	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"

	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)

type WaitTargetComplete struct {
}

func NewWaitTargetComplete() *WaitTargetComplete {
	return &WaitTargetComplete{}
}

func (s *WaitTargetComplete) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewDataReturnJobReadyToExecute())
	}
}

func (s *WaitTargetComplete) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	reJob := jo.Body.(*job.DataReturnJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Listen for cancel events
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	// Wait for the target job to complete
	evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool {
		return val.Job.GetInfo().GetLocalJobID() == reJob.Info.TargetLocalJobID
	})
	if !ok {
		return jobmgr.ErrJobCancelled
	}
	if evt.Err != nil {
		return fmt.Errorf("dependent job %s failed", evt.Job.JobID)
	}
	norJob, ok := evt.Job.Body.(*job.NormalJob)
	if !ok {
		return fmt.Errorf("job %s is not a Normal job (it is %T)", evt.Job.JobID, evt.Job)
	}

	reJob.TargetJobID = evt.Job.JobID
	reJob.TargetJobCCID = norJob.TargetCCID
	reJob.TargetJobOutputPath = norJob.OutputPath
	reJob.ECSInstanceID = norJob.ECSInstanceID
	return nil
}

func (s *WaitTargetComplete) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.WaitTargetCompleteDump{}
}

@ -0,0 +1,97 @@
package state2

import (
	"fmt"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)

type DataSchedule struct {
	taskID       sch.TaskID
	scheduleData []sch.ScheduleData
}

func NewDataSchedule(taskID sch.TaskID, scheduleData []sch.ScheduleData) *DataSchedule {
	return &DataSchedule{
		taskID:       taskID,
		scheduleData: scheduleData,
	}
}

func (s *DataSchedule) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	results, err := s.do(rtx)
	if err != nil {
		logger.Error(err.Error())
		rtx.Mgr.ChangeState(jo, NewPCMJobCancel(s.taskID, err.Error()))
	} else {
		rtx.Mgr.ChangeState(jo, NewPCMJobStartup(s.taskID, results))
	}
}

func (s *DataSchedule) do(rtx jobmgr.JobStateRunContext) ([]sch.DataScheduleResults, error) {
	uploaderCli, err := schglb.UploaderPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new uploader client: %w", err)
	}
	defer schglb.UploaderPool.Release(uploaderCli)

	// Load the cluster mapping from the database
	clusterMapping, err := rtx.Mgr.DB.UploadData().GetClusterMapping(rtx.Mgr.DB.DefCtx())
	if err != nil {
		return nil, err
	}

	var results []sch.DataScheduleResults
	for _, data := range s.scheduleData {
		var clusters []uploadersdk.Cluster
		var errResults []sch.DataScheduleResult
		// Map each cluster ID to its JCS storage ID
		for _, id := range data.ClusterIDs {
			storageID, ok := clusterMapping[id]
			if !ok {
				errResults = append(errResults, sch.DataScheduleResult{
					Clusters: sch.DataDetail{
						ClusterID: id,
					},
					Msg:    "cluster not found",
					Status: false,
				})
				logger.Error(fmt.Sprintf("cluster %v not found", id))
				continue
			}
			clusters = append(clusters, uploadersdk.Cluster{
				ClusterID: id,
				StorageID: storageID,
			})
		}

		// Send the schedule request
		req := uploadersdk.DataScheduleReq{
			Clusters:  clusters,
			PackageID: data.PackageID,
			//StorageType: data.StorageType,
		}
		scheduleResult, err := uploaderCli.DataSchedule(req)
		if err != nil {
			return nil, fmt.Errorf("schedule data: %w", err)
		}
		if len(errResults) > 0 {
			scheduleResult.Results = append(scheduleResult.Results, errResults...)
		}
		results = append(results, sch.DataScheduleResults{
			DataType: data.DataType,
			Results:  scheduleResult.Results,
		})
	}
	return results, nil
}

func (s *DataSchedule) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	// TODO: return a DataSchedule-specific dump type
	return &jobmod.NormalJobReadyToExecuteDump{}
}

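// Illustrative sketch (types and IDs are assumptions based on the usage above):
// with a mapping like
//
//	clusterMapping == map[ClusterID]StorageID{1: 101, 2: 102}
//
// a ScheduleData naming clusters [1, 3] produces a DataScheduleReq for cluster 1
// (storage 101) and a failed DataScheduleResult{Msg: "cluster not found"} for
// cluster 3, so one unknown cluster does not abort the whole schedule.
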
@ -0,0 +1,218 @@
package state2

import (
	"context"
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
	"sync"
	"time"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	"gitlink.org.cn/cloudream/common/sdks/blockchain"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
	uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
)

type DataUpload struct {
	userID     cdssdk.UserID
	uploadInfo sch.UploadInfo
	dataType   string
	storages   []cdssdk.StorageID
	lock       sync.Mutex
}

func NewDataUpload(userID cdssdk.UserID, uploadInfo sch.UploadInfo, dataType string, storages []cdssdk.StorageID) *DataUpload {
	return &DataUpload{
		userID:     userID,
		uploadInfo: uploadInfo,
		dataType:   dataType,
		storages:   storages,
	}
}

func (s *DataUpload) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	s.lock.Lock()
	defer s.lock.Unlock()

	err := s.do(rtx)
	if err != nil {
		logger.Error(err)
		rtx.Mgr.ChangeState(jo, state.FailureComplete(err))
		return
	}
	rtx.Mgr.ChangeState(jo, state.SuccessComplete())
}

func (s *DataUpload) do(rtx jobmgr.JobStateRunContext) error {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Collect the target cluster info (currently only consumed by the
	// commented-out InsertPackage call below)
	var clusters []*uploadersdk.Cluster
	for _, id := range s.storages {
		clusters = append(clusters, &uploadersdk.Cluster{
			StorageID: id,
		})
	}

	var objectIDs []cdssdk.ObjectID

	// Evidence info
	//var fileInfos []schmod.FileUploadedInfo
	//var folderID uploadersdk.FolderID

	switch info := s.uploadInfo.(type) {
	// Upload from a local file
	case *sch.LocalUploadInfo:
		// Wait for the upload to finish
		// TODO: a timeout mechanism is needed here
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}

		//packageData.PackageID = evt.PackageID
		objectIDs = evt.ObjectIDs
		//packageData.Name = info.LocalPath
		//folderID = evt.FolderID
		//fileInfos = evt.UploadedInfo

	// Upload from a URL
	case *sch.RemoteUploadInfo:
		uploaderCli, err := schglb.UploaderPool.Acquire()
		if err != nil {
			return fmt.Errorf("new uploader client: %w", err)
		}
		defer schglb.UploaderPool.Release(uploaderCli)

		var targetClusters []uploadersdk.ClusterID
		for _, id := range info.TargetClusters {
			targetClusters = append(targetClusters, uploadersdk.ClusterID(id))
		}
		req := uploadersdk.UploadReq{
			Type: s.dataType,
			Source: &uploadersdk.UrlSource{
				Url: info.Url,
			},
			Target: &uploadersdk.UrlTarget{
				Clusters: targetClusters,
			},
		}
		uploadResp, err := uploaderCli.Upload(req)
		if err != nil {
			return fmt.Errorf("upload data: %w", err)
		}

		if uploadResp.JsonData != "" {
			err = rtx.Mgr.DB.UploadData().UpdatePackage(rtx.Mgr.DB.DefCtx(), uploadResp.PackageID, uploadResp.JsonData, -1)
			if err != nil {
				return fmt.Errorf("update package: %w", err)
			}
		}

		objectIDs = uploadResp.ObjectIDs
	}

	// Persisting the upload result is disabled for now:
	//packageData.UserID = 1
	//dataID, err := rtx.Mgr.DB.UploadData().InsertPackage(rtx.Mgr.DB.DefCtx(), packageData, clusters, folderID)
	//if err != nil {
	//	return fmt.Errorf("insert upload data fail: %w", err)
	//}

	// Record the evidence on the blockchain
	blockChains, err := s.blockChain(objectIDs)
	if err != nil {
		return fmt.Errorf("blockchain: %w", err)
	}

	err = rtx.Mgr.DB.UploadData().InsertBlockchains(rtx.Mgr.DB.DefCtx(), blockChains)
	if err != nil {
		return fmt.Errorf("insert blockchains: %w", err)
	}

	return nil
}

func (s *DataUpload) blockChain(objectIDs []cdssdk.ObjectID) ([]*uploadersdk.BlockChain, error) {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new storage client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	// TODO: the request is empty and objectIDs is not used to filter it, so the
	// evidence below covers every object this call returns
	objects, err := cdsCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{})
	if err != nil {
		return nil, fmt.Errorf("getting package objects: %w", err)
	}

	bcCli, err := schglb.BlockChainPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new blockchain client: %w", err)
	}
	defer schglb.BlockChainPool.Release(bcCli)

	var blockChains []*uploadersdk.BlockChain

	for _, obj := range objects.Objects {
		now := time.Now()
		timestamp := now.UnixNano() / int64(time.Millisecond)
		fileNo := strconv.FormatInt(int64(obj.ObjectID), 10) + "_" + strconv.FormatInt(timestamp, 10)
		formattedTime := now.Format("2006-01-02 15:04:05")
		paths := strings.Split(obj.Path, "/")
		fileName := paths[len(paths)-1]
		// Strip the first four characters of the hash
		fileHash := obj.FileHash[4:]

		var args = make(map[string]string)
		args["userID"] = strconv.FormatInt(int64(s.userID), 10)
		args["type"] = s.dataType
		args["fileName"] = fileName
		args["fileHash"] = string(fileHash)
		args["fileSize"] = strconv.FormatInt(obj.Size, 10)
		args["fileNo"] = fileNo
		args["createTime"] = formattedTime

		// Serialize the args map to a JSON string
		argsJson, _ := json.Marshal(args)

		argsArr := []string{fileNo, string(argsJson)}

		req := blockchain.InvokeReq{
			ContractAddress: schglb.BlockChainConfig.ContractAddress,
			FunctionName:    schglb.BlockChainConfig.FunctionName,
			MemberName:      schglb.BlockChainConfig.MemberName,
			Type:            schglb.BlockChainConfig.Type,
			Args:            argsArr,
		}
		err = bcCli.BlockChainInvoke(req)
		if err != nil {
			return nil, fmt.Errorf("invoke blockchain: %w", err)
		}
		blockChains = append(blockChains, &uploadersdk.BlockChain{
			ObjectID:     obj.ObjectID,
			BlockChainID: fileNo,
			//FileHash: string(fileHash),
			//FileName: fileName,
			//FileSize: obj.Size,
		})
	}

	return blockChains, nil
}

func (s *DataUpload) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	// TODO: return a DataUpload-specific dump type
	return &jobmod.NormalJobReadyToExecuteDump{}
}

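// Illustrative sketch of the on-chain evidence record built above (all values
// are hypothetical): for object 7 uploaded by user 1, fileNo would look like
// "7_1718000000000" and the invoke args would be
//
//	["7_1718000000000", `{"userID":"1","type":"dataset","fileName":"yuque_mind.jpeg",
//	  "fileHash":"<hash minus its 4-char prefix>","fileSize":"12345",
//	  "fileNo":"7_1718000000000","createTime":"2024-06-10 12:00:00"}`]
//
// i.e. the evidence key is "<objectID>_<millisecond timestamp>" and the value is
// the JSON-serialized metadata map.
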
@ -0,0 +1,46 @@
package state2

import (
	"fmt"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
)

type PCMJobCancel struct {
	taskID sch.TaskID
	msg    string
}

func NewPCMJobCancel(taskID sch.TaskID, msg string) *PCMJobCancel {
	return &PCMJobCancel{
		taskID: taskID,
		msg:    msg,
	}
}

func (s *PCMJobCancel) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		logger.Error(fmt.Sprintf("new scheduler client: %v", err))
		return
	}
	defer schglb.PCMSchePool.Release(schCli)

	req := sch.CancelJobReq{
		TaskID: s.taskID,
		Msg:    s.msg,
	}
	err = schCli.CancelJob(req)
	if err != nil {
		logger.Error(err.Error())
	}
	// The job ends as failed, carrying the original cancel reason
	rtx.Mgr.ChangeState(jo, state.FailureComplete(fmt.Errorf("job cancelled: %s", s.msg)))
}

func (s *PCMJobCancel) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	// TODO: return a PCMJobCancel-specific dump type
	return &jobmod.NormalJobReadyToExecuteDump{}
}

@ -0,0 +1,129 @@
package state2

import (
	"fmt"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
)

type PCMJobCreate struct {
	jobInfo *schsdk.PCMJobInfo
}

func NewPCMJobCreate(info *schsdk.PCMJobInfo) *PCMJobCreate {
	return &PCMJobCreate{
		jobInfo: info,
	}
}

type UploadedData struct {
}

type CodeDistribute struct {
}

func (s *PCMJobCreate) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	scheduleData, err := s.do(rtx)
	if err != nil {
		logger.Error(err.Error())
		rtx.Mgr.ChangeState(jo, state.FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewDataSchedule(scheduleData.TaskID, scheduleData.ScheduleDatas))
	}
}

// Create the task from the current data distribution and the resource requirements
func (s *PCMJobCreate) do(rtx jobmgr.JobStateRunContext) (*sch.CreateJobResp, error) {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	// Collect every package ID and binding ID
	var packages []cdssdk.PackageID
	var bindingIDs []int64
	packages, bindingIDs = collectDataID(s.jobInfo.Files.Code, packages, bindingIDs)
	packages, bindingIDs = collectDataID(s.jobInfo.Files.Dataset, packages, bindingIDs)
	packages, bindingIDs = collectDataID(s.jobInfo.Files.Image, packages, bindingIDs)
	packages, bindingIDs = collectDataID(s.jobInfo.Files.Model, packages, bindingIDs)

	if len(packages) == 0 && len(bindingIDs) == 0 {
		return nil, fmt.Errorf("no packageID")
	}

	// Read the data records from the database
	uploadDatas, err := rtx.Mgr.DB.UploadData().GetByPackageID(rtx.Mgr.DB.DefCtx(), packages, bindingIDs)
	if err != nil {
		return nil, err
	}
	if len(uploadDatas) == 0 {
		return nil, fmt.Errorf("no upload data")
	}

	// Build the data distribution info
	var dataDistribute sch.DataDistribute
	for _, data := range uploadDatas {
		var clusters []sch.DataDetail
		for _, cluster := range data.UploadedCluster {
			clusters = append(clusters, sch.DataDetail{
				ClusterID: cluster.ClusterID,
				JsonData:  data.JsonData,
			})
		}

		switch data.DataType {
		case sch.CODE:
			dataDistribute.Code = append(dataDistribute.Code, sch.CodeDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		case sch.DATASET:
			dataDistribute.Dataset = append(dataDistribute.Dataset, sch.DatasetDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		case sch.MODEL:
			dataDistribute.Model = append(dataDistribute.Model, sch.ModelDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		case sch.IMAGE:
			dataDistribute.Image = append(dataDistribute.Image, sch.ImageDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		}
	}

	req := sch.CreateJobReq{
		DataDistribute: dataDistribute,
		JobResources:   s.jobInfo.JobResources,
	}

	resp, err := schCli.CreateJob(req)
	if err != nil {
		return nil, fmt.Errorf("create task: %w", err)
	}

	return resp, nil
}

// collectDataID appends the package or binding ID carried by fileInfo and
// returns the updated slices; append may reallocate, so the caller must reassign.
func collectDataID(fileInfo schsdk.JobFileInfo, packageIDs []cdssdk.PackageID, bindingIDs []int64) ([]cdssdk.PackageID, []int64) {
	switch info := fileInfo.(type) {
	case *schsdk.PackageJobFileInfo:
		packageIDs = append(packageIDs, info.PackageID)
	case *schsdk.BindingJobFileInfo:
		bindingIDs = append(bindingIDs, info.BindingID)
	}
	return packageIDs, bindingIDs
}

func (s *PCMJobCreate) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	// TODO: return a PCMJobCreate-specific dump type
	return &jobmod.NormalJobReadyToExecuteDump{}
}

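// Why collectDataID returns its slices (a minimal sketch of the pitfall the
// signature avoids; bad/good are illustrative names):
//
//	func bad(ids []int)        { ids = append(ids, 1) } // caller never sees the 1
//	func good(ids []int) []int { return append(ids, 1) }
//
// append may allocate a new backing array, so the caller must reassign the result.
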
@ -0,0 +1,55 @@
package state2

import (
	"fmt"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
)

type PCMJobStartup struct {
	taskID        sch.TaskID
	scheduledData []sch.DataScheduleResults
}

func NewPCMJobStartup(taskID sch.TaskID, scheduledData []sch.DataScheduleResults) *PCMJobStartup {
	return &PCMJobStartup{
		taskID:        taskID,
		scheduledData: scheduledData,
	}
}

func (s *PCMJobStartup) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx)
	if err != nil {
		logger.Error(err.Error())
		rtx.Mgr.ChangeState(jo, NewPCMJobCancel(s.taskID, err.Error()))
	} else {
		rtx.Mgr.ChangeState(jo, state.SuccessComplete())
	}
}

func (s *PCMJobStartup) do(rtx jobmgr.JobStateRunContext) error {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	req := sch.RunJobReq{
		TaskID:         s.taskID,
		ScheduledDatas: s.scheduledData,
	}
	err = schCli.RunJob(req)
	if err != nil {
		return fmt.Errorf("run job: %w", err)
	}
	return nil
}

func (s *PCMJobStartup) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	// TODO: return a PCMJobStartup-specific dump type
	return &jobmod.NormalJobReadyToExecuteDump{}
}

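// Illustrative summary of the state2 chain driving a PCM job (drawn from the
// transitions in these files):
//
//	PCMJobCreate --> DataSchedule(taskID, datas) --> PCMJobStartup(taskID, results) --> SuccessComplete
//	      \----------------- on any error -----------------> PCMJobCancel(taskID, msg) --> FailureComplete
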
@ -0,0 +1,14 @@
package jobmgr

import jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"

type JobStateRunContext struct {
	Mgr       *Manager
	EventSet  *EventSet
	LastState JobState
}

type JobState interface {
	Run(ctx JobStateRunContext, job *Job)
	Dump(ctx JobStateRunContext, job *Job) jobmod.JobStateDump
}

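// Illustrative sketch of implementing JobState (LoggingState is hypothetical):
//
//	type LoggingState struct{ next JobState }
//
//	func (s *LoggingState) Run(ctx JobStateRunContext, job *Job) {
//		// do this state's work, then hand the job to the next state
//		ctx.Mgr.ChangeState(job, s.next)
//	}
//
//	func (s *LoggingState) Dump(ctx JobStateRunContext, job *Job) jobmod.JobStateDump {
//		return nil // a real state returns its jobmod dump struct
//	}
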
@ -0,0 +1,259 @@
package jobmgr

import (
	"fmt"
	"sync"
	"time"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/executormgr"
)

type mgrJob struct {
	job      Job
	eventSet EventSet
	state    JobState
	// Whether the job has finished. Note: while the job state is Completed this
	// field is not necessarily true yet, because there is still work to do in
	// the Completed state.
	isCompleted bool
}

type mgrJobSet struct {
	jobs map[schsdk.JobID]*mgrJob
}

type Manager struct {
	// Any operation that modifies jobs or job sets must hold this lock
	pubLock sync.Mutex

	ExecMgr *executormgr.Manager
	DB      *db.DB
	NodeSvc *NodeService

	jobSetIDIndex int
	jobSets       map[schsdk.JobSetID]*mgrJobSet
	jobIDIndex    int
	jobs          map[schsdk.JobID]*mgrJob
}

func NewManager(db *db.DB, nodeSvc *NodeService) (*Manager, error) {
	mgr := &Manager{
		DB:      db,
		NodeSvc: nodeSvc,
		jobSets: make(map[schsdk.JobSetID]*mgrJobSet),
		jobs:    make(map[schsdk.JobID]*mgrJob),
	}

	return mgr, nil
}

func (m *Manager) Serve() error {
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	// TODO: this should block here

	return nil
}

func (m *Manager) Stop() {

}

// Change the job's state. Note: changing a job to the Completed state does not
// set mgrJob.isCompleted to true.
func (m *Manager) ChangeState(job *Job, state JobState) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	mgrJob, ok := m.jobs[job.JobID]
	if !ok {
		return
	}

	lastState := mgrJob.state
	mgrJob.state = state

	go func() {
		logger.WithField("JobID", job.JobID).Infof("state changed: %T -> %T", lastState, state)

		state.Run(JobStateRunContext{
			Mgr:       m,
			EventSet:  &mgrJob.eventSet,
			LastState: lastState,
		}, job)
	}()
}

// Mark the job as finished
func (m *Manager) JobCompleted(job *Job) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	mgrJob, ok := m.jobs[job.JobID]
	if !ok {
		return
	}

	mgrJob.isCompleted = true

	// If every job in the job set has completed, delete the job set
	jobSet := m.jobSets[job.JobSetID]
	for _, mjob := range jobSet.jobs {
		if !mjob.isCompleted {
			return
		}
	}

	// TODO: consider adding a completion callback
	delete(m.jobSets, job.JobSetID)

	logger.Infof("job set %s completed", job.JobSetID)
}

// Post an event to a specific job
func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	mgrJob, ok := m.jobs[jobID]
	if !ok {
		return
	}

	go func() {
		mgrJob.eventSet.Post(evt)
	}()
}

// Broadcast an event to every job in a job set
func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	jobSet, ok := m.jobSets[jobSetID]
	if !ok {
		// The job set does not exist; nothing to do
		return
	}

	for _, mjob := range jobSet.jobs {
		go func(j *mgrJob) {
			j.eventSet.Post(evt)
		}(mjob)
	}
}

type SubmittingJob struct {
	Body      JobBody
	InitState JobState
}

// Submit a job set
func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))
	m.jobSetIDIndex += 1

	jobSet := &mgrJobSet{
		jobs: make(map[schsdk.JobID]*mgrJob),
	}
	m.jobSets[jobSetID] = jobSet

	var addedJobs []*mgrJob
	for i, subJob := range jobs {
		jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
		job := &mgrJob{
			job: Job{
				JobSetID: jobSetID,
				JobID:    jobID,
				Body:     subJob.Body,
			},
			eventSet: NewEventSet(),
			state:    subJob.InitState,
		}
		jobSet.jobs[jobID] = job
		m.jobs[jobID] = job
		addedJobs = append(addedJobs, job)
	}
	m.jobIDIndex += len(jobs)

	// Add all the jobs first, then start them
	for _, job := range addedJobs {
		go func(j *mgrJob) {
			j.state.Run(JobStateRunContext{
				Mgr:       m,
				EventSet:  &j.eventSet,
				LastState: nil,
			}, &j.job)
		}(job)
	}

	return jobSetID
}
|
||||
|
||||
// 导出任务集中所有任务的状态
|
||||
func (m *Manager) DumpJobSet(jobSetID schsdk.JobSetID) []jobmod.JobDump {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
jobSet, ok := m.jobSets[jobSetID]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
var jobDumps []jobmod.JobDump
|
||||
for _, mgrJob := range jobSet.jobs {
|
||||
jobDumps = append(jobDumps, mgrJob.job.Dump(JobStateRunContext{
|
||||
Mgr: m,
|
||||
EventSet: &mgrJob.eventSet,
|
||||
LastState: mgrJob.state,
|
||||
}, &mgrJob.job, mgrJob.state))
|
||||
}
|
||||
|
||||
return jobDumps
|
||||
}
|
||||
|
||||
type PreSchedulerInstJob struct {
|
||||
Body JobBody
|
||||
InitState JobState
|
||||
}
|
||||
|
||||
// AddJob 添加一个作业到指定的作业集。
|
||||
func (m *Manager) AddJob(jobSetID schsdk.JobSetID, jobBody JobBody, jobState JobState) schsdk.JobID {
|
||||
m.pubLock.Lock()
|
||||
defer m.pubLock.Unlock()
|
||||
|
||||
jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex))
|
||||
m.jobIDIndex += 1
|
||||
|
||||
job := &mgrJob{
|
||||
job: Job{
|
||||
JobSetID: jobSetID,
|
||||
JobID: jobID,
|
||||
Body: jobBody,
|
||||
},
|
||||
state: jobState,
|
||||
eventSet: NewEventSet(),
|
||||
}
|
||||
|
||||
m.jobs[jobID] = job
|
||||
jobSet := m.jobSets[jobSetID]
|
||||
jobSet.jobs[jobID] = job
|
||||
|
||||
go func() {
|
||||
jobState.Run(JobStateRunContext{
|
||||
Mgr: m,
|
||||
EventSet: &job.eventSet,
|
||||
LastState: nil,
|
||||
}, &job.job)
|
||||
}()
|
||||
|
||||
return jobID
|
||||
}
|
|
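A caller-side sketch of the Manager lifecycle; someBody, someInitState, and someEvent are placeholders, not values defined in this commit.

// Illustrative only: submit a one-job set, then interact with it.
jobs := []jobmgr.SubmittingJob{
	{Body: someBody, InitState: someInitState}, // placeholder values
}
jobSetID := mgr.SubmitJobSet(jobs)      // each InitState.Run starts on its own goroutine
mgr.BroadcastEvent(jobSetID, someEvent) // placeholder event; fans out to every job in the set
dumps := mgr.DumpJobSet(jobSetID)       // nil once the set has completed and been deleted
_ = dumps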
@ -0,0 +1,187 @@
package jobmgr

import (
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/patrickmn/go-cache"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
)

type NodeService struct {
	RunningModels  map[string]schsdk.RunningModelInfo
	NodeUsageCache map[schsdk.JobID]*cache.Cache
	// Lock guards RunningModels and NodeUsageCache. Note: the methods below do
	// not acquire it themselves; callers are expected to.
	Lock sync.Mutex
}

func NewNodeService() *NodeService {
	return &NodeService{
		NodeUsageCache: make(map[schsdk.JobID]*cache.Cache),
		RunningModels:  make(map[string]schsdk.RunningModelInfo),
	}
}

// SetNodeData adds a node.
func (s *NodeService) SetNodeData(jobSetID schsdk.JobSetID, modelJobInfo schsdk.ModelJobInfo, node schsdk.NodeInfo) {
	key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
	value, ok := s.RunningModels[key]
	// The key does not exist yet.
	if !ok {
		nodes := []schsdk.NodeInfo{node}
		value = schsdk.RunningModelInfo{
			JobSetID: jobSetID,
			Nodes:    nodes,
			ModelID:  modelJobInfo.ModelID,
			// The model name should be looked up from the database here.
			ModelName:       "",
			CustomModelName: modelJobInfo.CustomModelName,
		}
		s.RunningModels[key] = value
		return
	}
	// The key already exists.
	value.Nodes = append(value.Nodes, node)
	s.RunningModels[key] = value
}

// RemoveNodeFromRunningModels removes a node.
func (s *NodeService) RemoveNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID) {
	key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
	value, ok := s.RunningModels[key]
	if !ok {
		return
	}

	for i := 0; i < len(value.Nodes); i++ {
		node := value.Nodes[i]
		if node.InstanceID == instanceID {
			value.Nodes = append(value.Nodes[:i], value.Nodes[i+1:]...)
			s.RunningModels[key] = value
			logger.Infof("removed node from running models, job id: %s", instanceID)
			break
		}
	}
}

func (s *NodeService) UpdateNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID, status string) {
	key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
	value, ok := s.RunningModels[key]
	if !ok {
		return
	}

	for i := 0; i < len(value.Nodes); i++ {
		node := value.Nodes[i]
		if node.InstanceID == instanceID {
			node.Status = status
			logger.Infof("updated node in running models, job id: %s", instanceID)
			value.Nodes[i] = node
			s.RunningModels[key] = value
			break
		}
	}
}

func (s *NodeService) GetAvailableNodes() map[string]schsdk.RunningModelInfo {
	return s.RunningModels
}

func (s *NodeService) GetNodeUsageRateInfo(customModelName schsdk.ModelName, modelID schsdk.ModelID) []schsdk.NodeUsageRateInfo {
	var rateInfos []schsdk.NodeUsageRateInfo

	key := string(customModelName) + "_" + string(modelID)
	value, ok := s.RunningModels[key]
	if !ok {
		return nil
	}
	for i := 0; i < len(value.Nodes); i++ {
		node := value.Nodes[i]
		c, ok := s.NodeUsageCache[node.InstanceID]
		if !ok {
			continue
		}

		rateInfo := getCacheData(c)
		rateInfo.InstanceID = node.InstanceID
		rateInfo.Address = node.Address

		rateInfos = append(rateInfos, rateInfo)
	}

	return rateInfos
}

func (s *NodeService) SetNodeUsageRateInfo(key schsdk.JobID, value string) {
	// Cache entries are keyed by the current unix timestamp (as a string).
	timeStamp := strconv.FormatInt(time.Now().Unix(), 10)
	ch, ok := s.NodeUsageCache[key]
	if !ok {
		ch = cache.New(time.Minute*60, time.Minute*60)
		ch.Set(timeStamp, value, cache.DefaultExpiration)
		s.NodeUsageCache[key] = ch
		return
	}
	ch.Set(timeStamp, value, cache.DefaultExpiration)
}

func getCacheData(c *cache.Cache) schsdk.NodeUsageRateInfo {
	var nodeUsageRateInfo schsdk.NodeUsageRateInfo

	infoMap := make(map[string][]schsdk.UsageRate)

	// Fetch every item currently in the cache.
	items := c.Items()

	// Walk the cached items and collect them into the map.
	for tmstamp, item := range items {
		msg := item.Object.(string)
		arr1 := strings.Split(msg, "\n")
		// Extract every key-value pair.
		for i := 0; i < len(arr1); i++ {
			arr2 := strings.Split(arr1[i], ":")
			if len(arr2) != 2 {
				continue
			}
			key := strings.TrimSpace(arr2[0])
			value := strings.TrimSpace(arr2[1])
			rate, ok := infoMap[key]
			if !ok {
				infoMap[key] = []schsdk.UsageRate{
					{
						Timestamp: tmstamp,
						Number:    value,
					},
				}
				continue
			}

			rate = append(rate, schsdk.UsageRate{
				Timestamp: tmstamp,
				Number:    value,
			})
			infoMap[key] = rate
		}
	}

	for k, v := range infoMap {
		// Sort each series by timestamp.
		sort.Slice(v, func(i, j int) bool {
			return v[i].Timestamp < v[j].Timestamp
		})
		switch k {
		case schsdk.MemoryUtilization:
			nodeUsageRateInfo.MemoryUtilization = v
		case schsdk.GPUUtilization:
			nodeUsageRateInfo.GPUUtilization = v
		case schsdk.CPUUtilization:
			nodeUsageRateInfo.CPUUtilization = v
		}
	}

	return nodeUsageRateInfo
}
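getCacheData expects each cached value to be a newline-separated report of `key: value` lines whose keys match the schsdk utilization constants. A sketch of feeding one in; the payload and the literal key spellings below are assumptions, not values confirmed by this commit.

// Hypothetical usage: record one usage report and read the series back.
payload := "CPUUtilization: 37\nMemoryUtilization: 52\nGPUUtilization: 81"
nodeSvc.SetNodeUsageRateInfo(schsdk.JobID("job-1"), payload) // stored under the current unix timestamp
rates := nodeSvc.GetNodeUsageRateInfo(schsdk.ModelName("demo"), schsdk.ModelID("m-1"))
_ = rates // non-empty only for instances already registered via SetNodeData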
@ -0,0 +1,51 @@
package jobTask

import (
	"fmt"
	"math/rand"
	"time"

	"gitlink.org.cn/cloudream/common/pkgs/async"
	"gitlink.org.cn/cloudream/common/pkgs/future"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
)

type JobTask[T any] struct {
	id       string
	taskChan *async.UnboundChannel[T]
}

func NewJobTask[T any]() *JobTask[T] {
	return &JobTask[T]{
		id:       getTaskID(),
		taskChan: async.NewUnboundChannel[T](),
	}
}

func getTaskID() string {
	now := time.Now()
	nano := now.UnixNano()
	// Random number in [1000, 9999]. math/rand self-seeds since Go 1.20, so no
	// explicit rand.Seed call is needed.
	randomNumber := rand.Intn(9000) + 1000

	taskID := fmt.Sprintf("id_%d_%d", nano, randomNumber)

	return taskID
}

func (c *JobTask[T]) Receive() future.Future1[T] {
	return c.taskChan.Receive()
}

// Send is currently a stub: it only logs and does not forward info to taskChan.
func (c *JobTask[T]) Send(info any) {
	logger.Info("send http")
}

func (c *JobTask[T]) Chan() *async.UnboundChannel[T] {
	return c.taskChan
}

func (c *JobTask[T]) ID() string {
	return c.id
}
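A usage sketch for JobTask; the exact async/future APIs referenced in the comments are assumptions to verify against common/pkgs/async and common/pkgs/future.

// Illustrative only.
task := jobTask.NewJobTask[string]()
fut := task.Receive() // future that completes when a value arrives on the channel
ch := task.Chan()     // producers push results through the underlying UnboundChannel
_ = ch                // e.g. ch.Send("done"), if UnboundChannel exposes Send
_ = fut               // consumers wait on the future for the value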
@ -0,0 +1,467 @@
package services

import (
	"errors"
	"fmt"
	"sort"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
	uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state2"
)

type JobSetService struct {
	*Service
}

func (svc *Service) JobSetSvc() *JobSetService {
	return &JobSetService{Service: svc}
}

func (svc *JobSetService) PreScheduler(jobSet schsdk.JobSetInfo) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) {
	ccs, err := svc.db.ComputingCenter().GetAll(svc.db.DefCtx())
	if err != nil {
		logger.Warnf("getting all computing center: %s", err.Error())
		return nil, nil, err
	}

	schScheme, uploadScheme, err := svc.preScheduler.ScheduleJobSet(&jobSet, ccs)
	if err != nil {
		return nil, nil, fmt.Errorf("pre scheduling: %w", err)
	}

	return schScheme, uploadScheme, nil
}

func (svc *JobSetService) Upload(userID cdssdk.UserID, params sch.UploadParams) (*schsdk.JobSetID, *[]cdssdk.StorageID, error) {
	logger.Debugf("uploading job")

	// Query the clusters maintained in the database.
	//ccs, err := svc.db.ComputingCenter().GetAll(svc.db.DefCtx())
	//if err != nil {
	//	logger.Warnf("getting all computing center: %s", err.Error())
	//	return nil, nil, err
	//}

	// Fetch the cluster-to-storage mapping.
	clusterMapping, err := svc.db.UploadData().GetClusterMapping(svc.db.DefCtx())
	if err != nil {
		return nil, nil, fmt.Errorf("query cluster mapping error: %w", err)
	}

	var storages []cdssdk.StorageID
	switch uploadPriority := params.UploadPriority.(type) {
	case *sch.Preferences:
		// Run pre-scheduling.
		clusterID, err := svc.preScheduler.ScheduleJob(uploadPriority.ResourcePriorities, clusterMapping)
		if err != nil {
			return nil, nil, fmt.Errorf("pre scheduling: %w", err)
		}

		storageID, ok := clusterMapping[*clusterID]
		if !ok {
			return nil, nil, fmt.Errorf("cluster %d not found", *clusterID)
		}

		storages = append(storages, storageID)
	case *sch.SpecifyCluster:
		// Clusters are specified explicitly.
		for _, clusterID := range uploadPriority.Clusters {
			storageID, ok := clusterMapping[clusterID]
			if !ok {
				logger.Warnf("cluster %d not found", clusterID)
				continue
			}
			storages = append(storages, storageID)
		}
	}

	if len(storages) == 0 {
		return nil, nil, errors.New("no storage is available")
	}

	var jobs []jobmgr.SubmittingJob
	jo := job.NewNormalJob(schsdk.NormalJobInfo{})
	jobs = append(jobs, jobmgr.SubmittingJob{
		Body:      jo,
		InitState: state2.NewDataUpload(userID, params.UploadInfo, params.DataType, storages),
	})

	jobSetID := svc.jobMgr.SubmitJobSet(jobs)

	return &jobSetID, &storages, nil
}

// Submit submits a job set.
func (svc *JobSetService) Submit(jobSet schsdk.JobSetInfo, schScheme *jobmod.JobSetPreScheduleScheme) (*schsdk.JobSetID, error) {
	logger.Debugf("submitting job")

	var jobs []jobmgr.SubmittingJob
	for _, jobInfo := range jobSet.Jobs {
		switch info := jobInfo.(type) {
		case *schsdk.PCMJobInfo:
			jo := job.NewPCMJob(*info)

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body: jo,
				//InitState: state.NewPreSchuduling(preSch),
				InitState: state2.NewPCMJobCreate(info),
			})

		case *schsdk.NormalJobInfo:
			jo := job.NewNormalJob(*info)
			jo.SubType = schsdk.JobTypeNormal

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body: jo,
				//InitState: state.NewPreSchuduling(preSch),
			})

		case *schsdk.DataReturnJobInfo:
			jo := job.NewDataReturnJob(*info)
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewWaitTargetComplete(),
			})

		case *schsdk.MultiInstanceJobInfo:
			preSch, ok := schScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}

			jo := job.NewMultiInstanceJob(*info, preSch)

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewMultiInstanceInit(),
			})

		case *schsdk.UpdateMultiInstanceJobInfo:
			modelJob := job.NewUpdateMultiInstanceJob(*info)
			instanceJobSets := svc.jobMgr.DumpJobSet(modelJob.Info.MultiInstanceJobSetID)
			if len(instanceJobSets) == 0 {
				return nil, fmt.Errorf("job set %s is not found", modelJob.Info.MultiInstanceJobSetID)
			}

			// Find the multi-instance job itself.
			var multiInstanceJobDump jobmod.JobDump
			for i := 0; i < len(instanceJobSets); i++ {
				jobDump := instanceJobSets[i]
				if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
					multiInstanceJobDump = jobDump
					break
				}
			}

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      modelJob,
				InitState: state.NewMultiInstanceUpdate(multiInstanceJobDump),
			})

		case *schsdk.DataPreprocessJobInfo:
			// The subsequent scheduling flow is identical to a NormalJob.
			normalJobInfo := &schsdk.NormalJobInfo{
				Type:        schsdk.JobTypeNormal,
				JobInfoBase: info.JobInfoBase,
				Files:       info.Files,
				Runtime:     info.Runtime,
				Services:    info.Services,
				Resources:   info.Resources,
			}
			jo := job.NewNormalJob(*normalJobInfo)
			jo.SubType = schsdk.JobTypeDataPreprocess

			preSch, ok := schScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewPreSchuduling(preSch),
			})

		case *schsdk.FinetuningJobInfo:
			// The subsequent scheduling flow is identical to a NormalJob.
			normalJobInfo := &schsdk.NormalJobInfo{
				Type:         schsdk.JobTypeNormal,
				Files:        info.Files,
				JobInfoBase:  info.JobInfoBase,
				Runtime:      info.Runtime,
				Services:     info.Services,
				Resources:    info.Resources,
				ModelJobInfo: info.ModelJobInfo,
			}
			jo := job.NewNormalJob(*normalJobInfo)
			jo.SubType = schsdk.JobTypeFinetuning

			preSch, ok := schScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}

			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewPreSchuduling(preSch),
			})
		}
	}

	jobSetID := svc.jobMgr.SubmitJobSet(jobs)
	return &jobSetID, nil
}

// LocalFileUploaded signals that one file in the job set finished uploading.
func (svc *JobSetService) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, errMsg string, packageID cdssdk.PackageID, objectIDs []cdssdk.ObjectID) {
	// Only construct an error when an error message was actually reported.
	var err error
	if errMsg != "" {
		err = errors.New(errMsg)
	}

	svc.jobMgr.BroadcastEvent(jobSetID, event.NewLocalFileUploaded(localPath, err, packageID, objectIDs))
}

func (svc *JobSetService) CreateFolder(packageID cdssdk.PackageID, path string) error {
	err := svc.JobSetSvc().db.UploadData().InsertFolder(svc.db.DefCtx(), packageID, path)
	if err != nil {
		return err
	}
	return nil
}

// DeleteFile deletes files.
func (svc *JobSetService) DeleteFile(userID cdssdk.UserID, objectIDs []cdssdk.ObjectID) error {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	err = cdsCli.Object().Delete(cdsapi.ObjectDelete{
		ObjectIDs: objectIDs,
		UserID:    userID,
	})
	if err != nil {
		return fmt.Errorf("failed to delete object: %w", err)
	}

	return nil
}

// DeleteFolder deletes a folder and the objects under it.
func (svc *JobSetService) DeleteFolder(userID cdssdk.UserID, packageID cdssdk.PackageID, path string) error {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	list, err := cdsCli.Object().List(cdsapi.ObjectList{
		UserID:    userID,
		PackageID: packageID,
		Path:      path,
		IsPrefix:  true,
	})
	if err != nil {
		return fmt.Errorf("failed to list objects: %w", err)
	}

	if len(list.Objects) > 0 {
		var objectIDs []cdssdk.ObjectID
		for _, obj := range list.Objects {
			objectIDs = append(objectIDs, obj.ObjectID)
		}

		err = cdsCli.Object().Delete(cdsapi.ObjectDelete{
			ObjectIDs: objectIDs,
			UserID:    userID,
		})
		if err != nil {
			return fmt.Errorf("failed to delete object: %w", err)
		}
	}

	err = svc.JobSetSvc().db.UploadData().DeleteFolder(svc.db.DefCtx(), packageID, path)
	if err != nil {
		return fmt.Errorf("failed to delete folder record: %w", err)
	}

	return nil
}

func (svc *JobSetService) QueryUploaded(queryParams sch.QueryData) ([]uploadersdk.Package, int, int, error) {
	// Querying the root directory.
	if queryParams.PackageID == -1 {
		packages, err := svc.JobSetSvc().db.UploadData().QueryPackage(svc.db.DefCtx(), queryParams)
		if err != nil {
			return nil, 0, 0, fmt.Errorf("failed to query uploaded data: %w", err)
		}
		return packages, 0, 0, nil
	}

	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return nil, 0, 0, fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	queryListReq := cdsapi.ObjectList{
		UserID:    queryParams.UserID,
		PackageID: queryParams.PackageID,
		Path:      queryParams.Path,
		IsPrefix:  true,
	}
	objList, err := cdsCli.Object().List(queryListReq)
	if err != nil {
		return nil, 0, 0, fmt.Errorf("failed to query uploaded data: %w", err)
	}

	folders, err := svc.db.UploadData().QueryFolder(svc.db.DefCtx(), queryParams)
	if err != nil {
		return nil, 0, 0, fmt.Errorf("failed to query uploaded data: %w", err)
	}
	for _, folder := range folders {
		f := cdssdk.Object{
			ObjectID:   -1,
			PackageID:  folder.PackageID,
			Path:       folder.Path,
			Size:       0,
			CreateTime: folder.CreateTime,
		}
		objList.Objects = append(objList.Objects, f)
	}
	// Sort by the orderBy field.
	sort.Slice(objList.Objects, func(i, j int) bool {
		if queryParams.OrderBy == sch.OrderByName {
			return objList.Objects[i].Path < objList.Objects[j].Path
		} else if queryParams.OrderBy == sch.OrderBySize {
			return objList.Objects[i].Size < objList.Objects[j].Size
		} else if queryParams.OrderBy == sch.OrderByTime {
			return objList.Objects[i].CreateTime.Unix() < objList.Objects[j].CreateTime.Unix()
		}
		return false
	})

	totalNum := len(objList.Objects)

	// Paginate the results. Computing totalPages inside the guard also avoids
	// a division by zero when PageSize is 0.
	totalPages := 0
	if queryParams.PageSize > 0 {
		start := (queryParams.CurrentPage - 1) * queryParams.PageSize
		end := start + queryParams.PageSize
		if start >= totalNum {
			return nil, 0, 0, nil
		}
		if end > totalNum {
			end = totalNum
		}
		objList.Objects = objList.Objects[start:end]
		totalPages = (totalNum + queryParams.PageSize - 1) / queryParams.PageSize
	}

	var datas []uploadersdk.Package
	data, err := svc.db.UploadData().QueryPackageByID(svc.db.DefCtx(), queryParams.PackageID)
	if err != nil {
		return nil, 0, 0, err
	}
	pkg := uploadersdk.Package{
		PackageID:       data.PackageID,
		PackageName:     data.PackageName,
		JsonData:        data.JsonData,
		BindingID:       data.BindingID,
		UserID:          data.UserID,
		Objects:         objList.Objects,
		UploadedCluster: data.UploadedCluster,
	}
	datas = append(datas, pkg)

	return datas, totalPages, totalNum, nil
}

func (svc *JobSetService) DataBinding(bindingData uploadersdk.BindingData, packageIDs []cdssdk.PackageID) error {
	err := svc.db.UploadData().InsertOrUpdateBinding(svc.db.DefCtx(), bindingData)
	if err != nil {
		return err
	}

	for _, id := range packageIDs {
		err = svc.db.UploadData().UpdatePackage(svc.db.DefCtx(), id, "", bindingData.ID)
		if err != nil {
			return err
		}
	}

	return nil
}

func (svc *JobSetService) RemoveBinding(packageIDs []cdssdk.PackageID) error {
	for _, id := range packageIDs {
		err := svc.db.UploadData().UpdatePackage(svc.db.DefCtx(), id, "", uploadersdk.DataID(-1))
		if err != nil {
			return err
		}
	}

	return nil
}

func (svc *JobSetService) CreatePackage(userID cdssdk.UserID, name string, dataType string) error {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	// Create the package.
	newPackage, err := cdsCli.Package().Create(cdsapi.PackageCreate{
		UserID:   userID,
		BucketID: 1,
		Name:     name,
	})
	if err != nil {
		return fmt.Errorf("failed to create package: %w", err)
	}

	pkg := uploadersdk.Package{
		UserID:      userID,
		PackageID:   newPackage.Package.PackageID,
		PackageName: name,
		DataType:    dataType,
	}

	// Archive it in the database.
	err = svc.JobSetSvc().db.UploadData().InsertPackage(svc.db.DefCtx(), pkg)
	if err != nil {
		return err
	}
	return nil
}

func (svc *JobSetService) DeletePackage(userID cdssdk.UserID, packageID cdssdk.PackageID) error {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	err = cdsCli.Package().Delete(cdsapi.PackageDelete{
		UserID:    userID,
		PackageID: packageID,
	})
	if err != nil {
		return fmt.Errorf("failed to delete package: %w", err)
	}

	err = svc.JobSetSvc().db.UploadData().DeletePackage(svc.db.DefCtx(), userID, packageID)
	if err != nil {
		return err
	}
	return nil
}
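The paging in QueryUploaded is plain offset math; a worked example with arbitrary numbers:

// totalNum=23, PageSize=10, CurrentPage=3
totalNum, pageSize, currentPage := 23, 10, 3
totalPages := (totalNum + pageSize - 1) / pageSize // 3 (ceiling division)
start := (currentPage - 1) * pageSize              // 20
end := start + pageSize                            // 30
if end > totalNum {
	end = totalNum // 23: the final page is short, objects [20,23) are returned
}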
@ -0,0 +1,21 @@
package services

import (
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler2"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)

type Service struct {
	preScheduler prescheduler2.PreScheduler
	jobMgr       *jobmgr.Manager
	db           *db.DB
}

func NewService(preScheduler prescheduler2.PreScheduler, jobMgr *jobmgr.Manager, db *db.DB) (*Service, error) {
	return &Service{
		preScheduler: preScheduler,
		jobMgr:       jobMgr,
		db:           db,
	}, nil
}
@ -0,0 +1,57 @@
package main

import (
	"fmt"
	"os"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler2"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/cmdline"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/config"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
	"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
)

func main() {
	err := config.Init()
	if err != nil {
		fmt.Printf("init config failed, err: %s", err.Error())
		os.Exit(1)
	}

	err = logger.Init(&config.Cfg().Logger)
	if err != nil {
		fmt.Printf("init logger failed, err: %s", err.Error())
		os.Exit(1)
	}

	schglb.InitPCMSchePool(&config.Cfg().PCMScheduler)
	schglb.InitUploaderPool(&config.Cfg().Uploader)
	schglb.InitBlockChainPool(&config.Cfg().BlockChain)
	schglb.InitCloudreamStoragePool(&config.Cfg().CloudreamStorage)

	dbSvc, err := db.NewDB(&config.Cfg().DB)
	if err != nil {
		logger.Fatalf("new db failed, err: %s", err.Error())
	}

	preSchr := prescheduler2.NewDefaultPreScheduler()
	nodeSvc := jobmgr.NewNodeService()
	jobMgr, err := jobmgr.NewManager(dbSvc, nodeSvc)
	if err != nil {
		logger.Fatalf("new job manager failed, err: %s", err.Error())
	}
	svc, err := services.NewService(preSchr, jobMgr, dbSvc)
	if err != nil {
		logger.Fatalf("new service failed, err: %s", err.Error())
	}

	cmds, err := cmdline.NewCommandline(svc)
	if err != nil {
		logger.Warnf("new command line failed, err: %s", err.Error())
		os.Exit(1)
	}

	cmds.DispatchCommand(os.Args[1:])
}