新增调度中间件模块

This commit is contained in:
JeshuaRen 2024-12-31 09:13:12 +08:00
parent 017ef22216
commit 8d49c8684b
73 changed files with 7143 additions and 225 deletions

View File

@ -0,0 +1,36 @@
{
"logger": {
"output": "file",
"outputFileName": "schedulerclient",
"outputDirectory": "log",
"level": "debug"
},
"db2": {
"address": "101.201.215.196:3306",
"account": "pcm",
"password": "123456@Asd",
"databaseName": "scheduler"
},
"db": {
"address": "localhost:3306",
"account": "root",
"password": "123456",
"databaseName": "scheduler"
},
"pcmScheduler": {
"url": "https://comnet.jointcloud.net/pcm/v1/schedule"
},
"uploader": {
"url": "https://kbguhfxfanfp.test.jointcloud.net:443/v1/storage"
},
"blockChain": {
"url": "https://ai4m.jointcloud.net/blockChain",
"contractAddress": "0xc860ab27901b3c2b810165a6096c64d88763617f",
"functionName": "storeEvidence",
"memberName": "pcm",
"type": "6"
},
"cloudreamStorage": {
"url": "http://localhost:32010"
}
}

View File

@ -0,0 +1,34 @@
{
"uploadParams": {
"dataType": "dataset",
"uploadInfo": {
"type": "local",
"localPath": "yuque_mind.jpeg"
},
"dataName": "yuque_mind.jpeg",
"uploadPriority": {
"type": "preference",
"priorities": [
{
"type": "region",
"options": [
"华东区域",
"华北区域"
]
},
{
"type": "chip",
"options": [
"DCU"
]
},
{
"type": "bias",
"options": [
"网络优先"
]
}
]
}
}
}

View File

@ -1,10 +1,13 @@
package schglb
import (
"gitlink.org.cn/cloudream/common/sdks/blockchain"
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
pcmsch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
scmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq"
advmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor"
cltmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector"
@ -20,6 +23,27 @@ var CollectorMQPool cltmq.Pool
var ManagerMQPool mgrmq.Pool
var PCMSchePool pcmsch.Pool
var UploaderPool uploadersdk.Pool
var BlockChainPool blockchain.Pool
var BlockChainConfig *blockchain.Config
func InitBlockChainPool(cfg *blockchain.Config) {
BlockChainPool = blockchain.NewPool(cfg)
BlockChainConfig = cfg
}
func InitUploaderPool(cfg *uploadersdk.Config) {
UploaderPool = uploadersdk.NewPool(cfg)
}
func InitPCMSchePool(cfg *pcmsch.Config) {
PCMSchePool = pcmsch.NewPool(cfg)
}
func InitMQPool(cfg *scmq.Config) {
ExecutorMQPool = exemq.NewPool(cfg)
AdvisorMQPool = advmq.NewPool(cfg)

View File

@ -18,56 +18,59 @@ type AdvisorID string
type ComputingCenter struct {
// 计算中心名称
CCID schsdk.CCID `json:"CCID" db:"CCID"`
CCID schsdk.CCID `json:"CCID" gorm:"column:CCID"`
// 计算中心在运控系统的ID
UOPSlwNodeID uopsdk.SlwNodeID `json:"uopSlwNodeID" db:"UOPSlwNodeID"`
UOPSlwNodeID uopsdk.SlwNodeID `json:"uopSlwNodeID" gorm:"column:UOPSlwNodeID"`
// 计算中心在PCM系统的ID
PCMParticipantID pcmsdk.ParticipantID `json:"pcmParticipantID" db:"PCMParticipantID"`
PCMParticipantID pcmsdk.ParticipantID `json:"pcmParticipantID" gorm:"column:PCMParticipantID"`
// 此算力中心的存储服务对应在存储系统中的ID
CDSStorageID cdssdk.StorageID `json:"cdsStorageID" db:"CDSStorageID"`
CDSStorageID cdssdk.StorageID `json:"cdsStorageID" gorm:"column:CDSStorageID"`
// 计算中心名称
Name string `json:"name" db:"Name"`
Name string `json:"name" gorm:"column:Name"`
// 任务启动方式
Bootstrap schsdk.Bootstrap `json:"bootstrap" db:"Bootstrap"`
//Bootstrap schsdk.Bootstrap `json:"bootstrap" gorm:"column:Bootstrap"`
Bootstrap schsdk.Bootstrap `json:"bootstrap" gorm:"-"`
// 执行器ID
ExecutorID string `json:"executorID" db:"executorID"`
ExecutorID string `json:"executorID" gorm:"column:executorID"`
// 执行器URL
ExecutorURL string `json:"executorURL" db:"executorURL"`
ExecutorURL string `json:"executorURL" gorm:"column:executorURL"`
//ClusterID schsdk.ClusterID `json:"clusterID" gorm:"column:ClusterID"`
}
type Image struct {
// 调度系统内的镜像ID
ImageID schsdk.ImageID `json:"imageID" db:"ImageID"`
ImageID schsdk.ImageID `json:"imageID" gorm:"column:ImageID"`
// 镜像文件对应的存储系统PackageID可以为空为空则代表此镜像不可被自动导入到算力中心比如是预制镜像
CDSPackageID *cdssdk.PackageID `json:"cdsPackageID" db:"CDSPackageID"`
CDSPackageID *cdssdk.PackageID `json:"cdsPackageID" gorm:"column:CDSPackageID"`
// 镜像名称,在调度系统上设置的
Name string `json:"name" db:"Name"`
Name string `json:"name" gorm:"column:Name"`
// 镜像创建时间
CreateTime time.Time `json:"createTime" db:"CreateTime"`
CreateTime time.Time `json:"createTime" gorm:"column:CreateTime"`
}
type PCMImage struct {
// 调度系统内的镜像ID
ImageID schsdk.ImageID `json:"imageID" db:"ImageID"`
ImageID schsdk.ImageID `json:"imageID" gorm:"column:ImageID"`
// 导入到的计算中心的ID
CCID schsdk.CCID `json:"ccID" db:"CCID"`
CCID schsdk.CCID `json:"ccID" gorm:"column:CCID"`
// 通过PCM系统导入到各计算中心后得到的ID
PCMImageID pcmsdk.ImageID `json:"pcmImageID" db:"PCMImageID"`
PCMImageID pcmsdk.ImageID `json:"pcmImageID" gorm:"column:PCMImageID"`
// 镜像名称通过PCM导入后获得
Name string `json:"name" db:"Name"`
Name string `json:"name" gorm:"column:Name"`
// 镜像导入时间
UploadTime time.Time `json:"uploadTime" db:"UploadTime"`
UploadTime time.Time `json:"uploadTime" gorm:"column:UploadTime"`
}
type CCResource struct {
// 计算中心ID
CCID schsdk.CCID `json:"ccID" db:"CCID"`
CCID schsdk.CCID `json:"ccID" gorm:"column:CCID"`
// PCM系统返回的资源规格ID
PCMResourceID pcmsdk.ResourceID `json:"pcmResourceID" db:"PCMResourceID"`
PCMResourceID pcmsdk.ResourceID `json:"pcmResourceID" gorm:"column:PCMResourceID"`
// PCM系统返回的资源规格名称
PCMResourceName string `json:"pcmResourceName" db:"PCMResourceName"`
PCMResourceName string `json:"pcmResourceName" gorm:"column:PCMResourceName"`
// 此种规格具体包含的资源信息
Resource CCResourceInfo `json:"resource" db:"Resource"`
Resource CCResourceInfo `json:"resource" gorm:"column:Resource"`
}
type CCResourceInfo struct {
@ -80,32 +83,52 @@ type CCResourceInfo struct {
}
type Models struct {
ModelID schsdk.ModelID `json:"modelID" db:"modelID"`
ModelName schsdk.ModelName `json:"modelName" db:"modelName"`
// 模型ID
ModelID schsdk.ModelID `json:"modelID" gorm:"column:modelID"`
// 模型名称
ModelName schsdk.ModelName `json:"modelName" gorm:"column:modelName"`
}
type ModelResource struct {
ModelID int64 `json:"modelID" db:"modelID"`
OjbStgID int64 `json:"OjbStgID" db:"OjbStgID"`
ModelPath string `json:"modelPath" db:"modelPath"`
StartShellPath string `json:"startShellPath" db:"startShellPath"`
ServerPort int64 `json:"serverPort" db:"serverPort"`
ServerUrlPath string `json:"serverUrlPath" db:"serverUrlPath"`
StopShellPath string `json:"stopShellPath" db:"stopShellPath"`
FinetuningShellPath string `json:"finetuningShellPath" db:"finetuningShellPath"`
// 模型ID
ModelID int64 `json:"modelID" gorm:"column:modelID"`
// 存储ID
OjbStgID int64 `json:"OjbStgID" gorm:"column:OjbStgID"`
// 模型路径
ModelPath string `json:"modelPath" gorm:"column:modelPath"`
// 启动脚本路径
StartShellPath string `json:"startShellPath" gorm:"column:startShellPath"`
// 服务器端口
ServerPort int64 `json:"serverPort" gorm:"column:serverPort"`
// 服务器URL路径
ServerUrlPath string `json:"serverUrlPath" gorm:"column:serverUrlPath"`
// 停止脚本路径
StopShellPath string `json:"stopShellPath" gorm:"column:stopShellPath"`
// 微调脚本路径
FinetuningShellPath string `json:"finetuningShellPath" gorm:"column:finetuningShellPath"`
}
type ObjectStorage struct {
ID int64 `json:"ID" db:"ID"`
Name string `json:"name" db:"name"`
Manufacturer string `json:"manufacturer" db:"manufacturer"`
Region string `json:"region" db:"region"`
AK string `json:"access_key_id" db:"access_key_id"`
SK string `json:"secret_access_key" db:"secret_access_key"`
Endpoint string `json:"endpoint" db:"endpoint"`
Bucket string `json:"bucket" db:"bucket"`
CDSStorageID cdssdk.StorageID `json:"CDSStorageID" db:"CDSStorageID"`
MountType string `json:"mountType" db:"mountType"`
// 存储ID
ID int64 `json:"ID" gorm:"column:ID"`
// 存储名称
Name string `json:"name" gorm:"column:name"`
// 厂商
Manufacturer string `json:"manufacturer" gorm:"column:manufacturer"`
// 区域
Region string `json:"region" gorm:"column:region"`
// 访问密钥ID
AK string `json:"access_key_id" gorm:"column:access_key_id"`
// 秘密访问密钥
SK string `json:"secret_access_key" gorm:"column:secret_access_key"`
// 存储端点
Endpoint string `json:"endpoint" gorm:"column:endpoint"`
// 存储桶
Bucket string `json:"bucket" gorm:"column:bucket"`
// CDS存储ID
CDSStorageID cdssdk.StorageID `json:"CDSStorageID" gorm:"column:CDSStorageID"`
// 挂载类型
MountType string `json:"mountType" gorm:"column:mountType"`
}
func (i *CCResourceInfo) Scan(src interface{}) error {
@ -122,3 +145,9 @@ const (
AliCloud = "AliCloud"
SugonCloud = "SugonCloud"
)
//type FileUploadedInfo struct {
// FileName string `json:"fileName"`
// FileHash string `json:"fileHash"`
// FileSize int64 `json:"fileSize"`
//}

View File

@ -1,7 +1,6 @@
package db
import (
"github.com/jmoiron/sqlx"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
)
@ -16,6 +15,6 @@ func (db *DB) CCResource() *CCResourceDB {
func (*CCResourceDB) GetByCCID(ctx SQLContext, id schsdk.CCID) ([]schmod.CCResource, error) {
var ret []schmod.CCResource
err := sqlx.Select(ctx, &ret, "select * from CCResource where CCID = ?", id)
err := ctx.Where("CCID = ?", id).Find(&ret).Error
return ret, err
}

View File

@ -1,7 +1,6 @@
package db
import (
"github.com/jmoiron/sqlx"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
)
@ -14,20 +13,52 @@ func (db *DB) ComputingCenter() *ComputingCenterDB {
return &ComputingCenterDB{DB: db}
}
//func (*ComputingCenterDB) GetByPackageID(ctx SQLContext, id schsdk.CCID) (schmod.ComputingCenter, error) {
// var ret TempComputingCenter
// err := sqlx.Get(ctx, &ret, "select cc.*, ei.executorURL from (select * from ComputingCenter where CCID = ?) as cc left join (select * from ExecutorInfo) as ei on cc.executorID = ei.executorID", id)
// return ret.ToComputingCenter(), err
//}
func (*ComputingCenterDB) GetByID(ctx SQLContext, id schsdk.CCID) (schmod.ComputingCenter, error) {
var ret TempComputingCenter
err := sqlx.Get(ctx, &ret, "select cc.*, ei.executorURL from (select * from ComputingCenter where CCID = ?) as cc left join (select * from ExecutorInfo) as ei on cc.executorID = ei.executorID", id)
// 使用 GORM 的原始 SQL 查询
err := ctx.Table("ComputingCenter").Raw(`
SELECT cc.*, ei.executorURL
FROM (SELECT * FROM ComputingCenter WHERE CCID = ?) AS cc
LEFT JOIN (SELECT * FROM ExecutorInfo) AS ei
ON cc.executorID = ei.executorID`, id).Scan(&ret).Error
// 将 TempComputingCenter 转换为 ComputingCenter
return ret.ToComputingCenter(), err
}
func (*ComputingCenterDB) GetAll(ctx SQLContext) ([]schmod.ComputingCenter, error) {
var tmp []TempComputingCenter
err := sqlx.Select(ctx, &tmp, "select * from ComputingCenter")
//func (*ComputingCenterDB) GetAll(ctx SQLContext) ([]schmod.ComputingCenter, error) {
// var tmp []TempComputingCenter
// err := sqlx.Select(ctx, &tmp, "select * from ComputingCenter")
//
// var ret []schmod.ComputingCenter
// for _, t := range tmp {
// ret = append(ret, t.ToComputingCenter())
// }
//
// return ret, err
//}
var ret []schmod.ComputingCenter
for _, t := range tmp {
ret = append(ret, t.ToComputingCenter())
func (*ComputingCenterDB) GetAll(ctx SQLContext) ([]schmod.ComputingCenter, error) {
//var tmp []TempComputingCenter
var tmp []schmod.ComputingCenter
// 使用 GORM 的 Find 查询所有 ComputingCenter 数据
err := ctx.Table("ComputingCenter").Find(&tmp).Error
if err != nil {
return nil, err
}
return ret, err
// 将查询结果从 TempComputingCenter 转换为 ComputingCenter
//var ret []schmod.ComputingCenter
//for _, t := range tmp {
// ret = append(ret, t.ToComputingCenter())
//}
return tmp, nil
}

View File

@ -1,64 +1,91 @@
package db
import (
"context"
"database/sql"
"fmt"
_ "github.com/go-sql-driver/mysql"
"github.com/jmoiron/sqlx"
"github.com/sirupsen/logrus"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db/config"
"gorm.io/driver/mysql"
"gorm.io/gorm"
)
// TODO 迁移到Gorm
// TODO ComputingCenter去掉了CDSNodeID字段需要修改DB的结构
type DB struct {
d *sqlx.DB
}
//type DB struct {
// d *sqlx.DB
//}
//
//type SQLContext interface {
// sqlx.Queryer
// sqlx.Execer
// sqlx.Ext
//}
//
//func NewDB(cfg *config.Config) (*DB, error) {
// db, err := sqlx.Open("mysql", cfg.MakeSourceString())
// if err != nil {
// return nil, fmt.Errorf("open database connection failed, err: %w", err)
// }
//
// // 尝试连接一下数据库,如果数据库配置有错误在这里就能报出来
// err = db.Ping()
// if err != nil {
// return nil, err
// }
//
// return &DB{
// d: db,
// }, nil
//}
//
//func (db *DB) DoTx(isolation sql.IsolationLevel, fn func(tx *sqlx.Tx) error) error {
// tx, err := db.d.BeginTxx(context.Background(), &sql.TxOptions{Isolation: isolation})
// if err != nil {
// return err
// }
//
// if err := fn(tx); err != nil {
// tx.Rollback()
// return err
// }
//
// if err := tx.Commit(); err != nil {
// tx.Rollback()
// return err
// }
//
// return nil
//}
//
//func (db *DB) SQLCtx() SQLContext {
// return db.d
//}
type SQLContext interface {
sqlx.Queryer
sqlx.Execer
sqlx.Ext
type DB struct {
db *gorm.DB
}
func NewDB(cfg *config.Config) (*DB, error) {
db, err := sqlx.Open("mysql", cfg.MakeSourceString())
mydb, err := gorm.Open(mysql.Open(cfg.MakeSourceString()), &gorm.Config{})
if err != nil {
return nil, fmt.Errorf("open database connection failed, err: %w", err)
}
// 尝试连接一下数据库,如果数据库配置有错误在这里就能报出来
err = db.Ping()
if err != nil {
return nil, err
logrus.Fatalf("failed to connect to database: %v", err)
}
return &DB{
d: db,
db: mydb,
}, nil
}
func (db *DB) DoTx(isolation sql.IsolationLevel, fn func(tx *sqlx.Tx) error) error {
tx, err := db.d.BeginTxx(context.Background(), &sql.TxOptions{Isolation: isolation})
if err != nil {
return err
}
if err := fn(tx); err != nil {
tx.Rollback()
return err
}
if err := tx.Commit(); err != nil {
tx.Rollback()
return err
}
return nil
func (db *DB) DoTx(do func(tx SQLContext) error) error {
return db.db.Transaction(func(tx *gorm.DB) error {
return do(SQLContext{tx})
})
}
func (db *DB) SQLCtx() SQLContext {
return db.d
type SQLContext struct {
*gorm.DB
}
func (db *DB) DefCtx() SQLContext {
return SQLContext{db.db}
}

View File

@ -3,7 +3,6 @@ package db
import (
"time"
"github.com/jmoiron/sqlx"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
@ -17,22 +16,42 @@ func (db *DB) Image() *ImageDB {
return &ImageDB{DB: db}
}
//func (*ImageDB) GetByPackageID(ctx SQLContext, id schsdk.ImageID) (schmod.Image, error) {
// var ret schmod.Image
// err := sqlx.Get(ctx, &ret, "select * from Image where ImageID = ?", id)
// return ret, err
//}
func (*ImageDB) GetByID(ctx SQLContext, id schsdk.ImageID) (schmod.Image, error) {
var ret schmod.Image
err := sqlx.Get(ctx, &ret, "select * from Image where ImageID = ?", id)
err := ctx.Table("Image").Where("ImageID = ?", id).First(&ret).Error
return ret, err
}
//func (*ImageDB) Create(ctx SQLContext, cdsPackageID *cdssdk.PackageID, name string, createTime time.Time) (schsdk.ImageID, error) {
// ret, err := ctx.Exec("insert into Image(CDSPackageID, Name, CreateTime) values(?, ?, ?)", cdsPackageID, name, createTime)
// if err != nil {
// return 0, err
// }
//
// id, err := ret.LastInsertId()
// if err != nil {
// return 0, err
// }
//
// return schsdk.ImageID(id), nil
//}
func (*ImageDB) Create(ctx SQLContext, cdsPackageID *cdssdk.PackageID, name string, createTime time.Time) (schsdk.ImageID, error) {
ret, err := ctx.Exec("insert into Image(CDSPackageID, Name, CreateTime) values(?, ?, ?)", cdsPackageID, name, createTime)
if err != nil {
image := schmod.Image{
CDSPackageID: cdsPackageID,
Name: name,
CreateTime: createTime,
}
if err := ctx.Table("Image").Create(&image).Error; err != nil {
return 0, err
}
id, err := ret.LastInsertId()
if err != nil {
return 0, err
}
return schsdk.ImageID(id), nil
return image.ImageID, nil
}

View File

@ -11,7 +11,7 @@ import (
type TempComputingCenter struct {
schmod.ComputingCenter
Bootstrap BootstrapWarpper `db:"Bootstrap"`
Bootstrap BootstrapWrapper `gorm:"column:Bootstrap"`
}
func (c *TempComputingCenter) ToComputingCenter() schmod.ComputingCenter {
@ -20,11 +20,11 @@ func (c *TempComputingCenter) ToComputingCenter() schmod.ComputingCenter {
return cc
}
type BootstrapWarpper struct {
type BootstrapWrapper struct {
Value schsdk.Bootstrap
}
func (o *BootstrapWarpper) Scan(src interface{}) error {
func (o *BootstrapWrapper) Scan(src interface{}) error {
data, ok := src.([]uint8)
if !ok {
return fmt.Errorf("unknow src type: %v", reflect.TypeOf(data))

View File

@ -1,7 +1,6 @@
package db
import (
"github.com/jmoiron/sqlx"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
)
@ -14,15 +13,27 @@ func (db *DB) Models() *ModelsDB {
return &ModelsDB{DB: db}
}
//func (*ModelsDB) GetAll(ctx SQLContext) ([]schmod.Models, error) {
// var ret []schmod.Models
// err := sqlx.Select(ctx, &ret, "select * from Models")
//
// return ret, err
//}
func (*ModelsDB) GetAll(ctx SQLContext) ([]schmod.Models, error) {
var ret []schmod.Models
err := sqlx.Select(ctx, &ret, "select * from Models")
err := ctx.Table("Models").Find(&ret).Error
return ret, err
}
//func (*ModelsDB) GetModelByID(ctx SQLContext, modelID schsdk.ModelID, OjbStgID int64) (schmod.ModelResource, error) {
// var ret schmod.ModelResource
// err := sqlx.Get(ctx, &ret, "select * from ModelResource where modelID = ? and OjbStgID = ?", modelID, OjbStgID)
// return ret, err
//}
func (*ModelsDB) GetModelByID(ctx SQLContext, modelID schsdk.ModelID, OjbStgID int64) (schmod.ModelResource, error) {
var ret schmod.ModelResource
err := sqlx.Get(ctx, &ret, "select * from ModelResource where modelID = ? and OjbStgID = ?", modelID, OjbStgID)
err := ctx.Table("ModelResource").Where("modelID = ? AND OjbStgID = ?", modelID, OjbStgID).First(&ret).Error
return ret, err
}

View File

@ -1,7 +1,6 @@
package db
import (
"github.com/jmoiron/sqlx"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
)
@ -14,8 +13,14 @@ func (db *DB) ObjectStorage() *ObjectStorageDB {
return &ObjectStorageDB{DB: db}
}
//func (*ObjectStorageDB) GetObjectStorageByStorageID(ctx SQLContext, CDSStorageID cdssdk.StorageID) (schmod.ObjectStorage, error) {
// var ret schmod.ObjectStorage
// err := sqlx.Get(ctx, &ret, "select * from ObjectStorage where CDSStorageID = ?", CDSStorageID)
// return ret, err
//}
func (*ObjectStorageDB) GetObjectStorageByStorageID(ctx SQLContext, CDSStorageID cdssdk.StorageID) (schmod.ObjectStorage, error) {
var ret schmod.ObjectStorage
err := sqlx.Get(ctx, &ret, "select * from ObjectStorage where CDSStorageID = ?", CDSStorageID)
err := ctx.Table("ObjectStorage").Where("CDSStorageID = ?", CDSStorageID).First(&ret).Error
return ret, err
}

View File

@ -1,12 +1,10 @@
package db
import (
"time"
"github.com/jmoiron/sqlx"
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
"time"
)
type PCMImageDB struct {
@ -17,19 +15,43 @@ func (db *DB) PCMImage() *PCMImageDB {
return &PCMImageDB{DB: db}
}
//func (*PCMImageDB) GetByImageID(ctx SQLContext, id schsdk.ImageID) ([]schmod.PCMImage, error) {
// var ret []schmod.PCMImage
// err := sqlx.Select(ctx, &ret, "select * from PCMImage where ImageID = ?", id)
// return ret, err
//}
//
//func (*PCMImageDB) GetByImageIDAndCCID(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID) (schmod.PCMImage, error) {
// var ret schmod.PCMImage
// err := sqlx.Get(ctx, &ret, "select * from PCMImage where ImageID = ? and CCID = ?", imageID, ccID)
// return ret, err
//}
//
//func (*PCMImageDB) Create(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID, pcmImageID pcmsdk.ImageID, name string, uploadTime time.Time) error {
// _, err := ctx.Exec("insert into PCMImage values(?, ?, ?, ?, ?)", imageID, ccID, pcmImageID, name, uploadTime)
// return err
//}
func (*PCMImageDB) GetByImageID(ctx SQLContext, id schsdk.ImageID) ([]schmod.PCMImage, error) {
var ret []schmod.PCMImage
err := sqlx.Select(ctx, &ret, "select * from PCMImage where ImageID = ?", id)
err := ctx.Table("PCMImage").Where("ImageID = ?", id).Find(&ret).Error
return ret, err
}
func (*PCMImageDB) GetByImageIDAndCCID(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID) (schmod.PCMImage, error) {
var ret schmod.PCMImage
err := sqlx.Get(ctx, &ret, "select * from PCMImage where ImageID = ? and CCID = ?", imageID, ccID)
err := ctx.Table("PCMImage").Where("ImageID = ? AND CCID = ?", imageID, ccID).First(&ret).Error
return ret, err
}
func (*PCMImageDB) Create(ctx SQLContext, imageID schsdk.ImageID, ccID schsdk.CCID, pcmImageID pcmsdk.ImageID, name string, uploadTime time.Time) error {
_, err := ctx.Exec("insert into PCMImage values(?, ?, ?, ?, ?)", imageID, ccID, pcmImageID, name, uploadTime)
return err
pcmImage := schmod.PCMImage{
ImageID: imageID,
CCID: ccID,
PCMImageID: pcmImageID,
Name: name,
UploadTime: uploadTime,
}
return ctx.Table("PCMImage").Create(&pcmImage).Error
}

View File

@ -0,0 +1,198 @@
package db
import (
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
"gorm.io/gorm/clause"
"strings"
"time"
)
// UploadDataDB groups the upload-data related queries. It embeds *DB so
// every accessor below runs against the same underlying GORM connection.
type UploadDataDB struct {
*DB
}

// UploadData returns the upload-data query accessor bound to this DB.
func (db *DB) UploadData() *UploadDataDB {
return &UploadDataDB{DB: db}
}
// GetByPackageID loads the UploadData rows whose packageID is contained in
// packageIDs or whose bindingID is contained in bindingIDs.
func (db *UploadDataDB) GetByPackageID(ctx SQLContext, packageIDs []cdssdk.PackageID, bindingIDs []int64) ([]uploadersdk.Package, error) {
	var packages []uploadersdk.Package
	query := ctx.Table("UploadData").Where("packageID IN ? or bindingID IN ?", packageIDs, bindingIDs)
	err := query.Find(&packages).Error
	return packages, err
}
// GetByID loads the UploadData rows whose ID is contained in ids.
func (db *UploadDataDB) GetByID(ctx SQLContext, ids []uploadersdk.DataID) ([]uploadersdk.Package, error) {
	var packages []uploadersdk.Package
	err := ctx.Table("UploadData").Where("ID IN ?", ids).Find(&packages).Error
	return packages, err
}
// QueryFolder lists the immediate child folders of queryParams.Path inside
// the given package. It fetches every folder row whose path has the query
// path as a prefix, then keeps only the first path segment directly below
// it, deduplicated.
func (db *UploadDataDB) QueryFolder(ctx SQLContext, queryParams sch.QueryData) ([]uploadersdk.Folder, error) {
	var rows []uploadersdk.Folder
	err := ctx.Table("folders").Where("package_id = ? and path_name like ?", queryParams.PackageID, queryParams.Path+"%").Find(&rows).Error
	if err != nil {
		// Bug fix: the previous version kept processing and returned a
		// partial result together with the query error.
		return nil, err
	}

	// Deduplicate next-level path segments via a set.
	prefix := queryParams.Path + "/"
	pathSet := make(map[string]struct{})
	for _, folder := range rows {
		// Only rows strictly below the queried path contribute a child.
		if !strings.HasPrefix(folder.Path, prefix) {
			continue
		}
		rest := folder.Path[len(prefix):]
		// Keep just the first segment after the queried path.
		seg := strings.Split(rest, "/")[0]
		pathSet[prefix+seg] = struct{}{}
	}

	result := make([]uploadersdk.Folder, 0, len(pathSet))
	for path := range pathSet {
		result = append(result, uploadersdk.Folder{Path: path})
	}
	return result, nil
}
// InsertFolder records a new folder row for the package at the given path,
// stamped with the current time.
func (db *UploadDataDB) InsertFolder(ctx SQLContext, packageID cdssdk.PackageID, path string) error {
	newFolder := uploadersdk.Folder{
		PackageID:  packageID,
		Path:       path,
		CreateTime: time.Now(),
	}
	return ctx.Table("folders").Create(&newFolder).Error
}
// DeleteFolder removes every folder row of the package whose path starts
// with the given path — the folder itself and its whole subtree.
func (db *UploadDataDB) DeleteFolder(ctx SQLContext, packageID cdssdk.PackageID, path string) error {
	return ctx.Table("folders").
		Where("package_id = ? and path_name like ?", packageID, path+"%").
		Delete(&uploadersdk.Folder{}).Error
}
// QueryPackage returns all packages owned by queryParams.UserID that have
// the requested data type, converted from their DAO representation.
func (db *UploadDataDB) QueryPackage(ctx SQLContext, queryParams sch.QueryData) ([]uploadersdk.Package, error) {
	var daos []uploadersdk.PackageDAO
	err := ctx.Table("package").Where("user_id = ? and data_type = ?", queryParams.UserID, queryParams.DataType).Find(&daos).Error
	if err != nil {
		// Bug fix: previously the conversion loop still ran on a failed
		// query and a partial result was returned with the error.
		return nil, err
	}

	// Convert each DAO row to the public uploadersdk.Package shape.
	res := make([]uploadersdk.Package, 0, len(daos))
	for _, dao := range daos {
		res = append(res, uploadersdk.Package{
			UserID:          dao.UserID,
			PackageID:       dao.PackageID,
			PackageName:     dao.PackageName,
			DataType:        dao.DataType,
			JsonData:        dao.JsonData,
			BindingID:       dao.BindingID,
			UploadedCluster: dao.UploadedCluster,
		})
	}
	return res, nil
}
// InsertPackage creates a package row for newPackage unless a row with the
// same package_id already exists, in which case the call is a no-op.
// BindingID is initialized to -1, meaning "not bound yet".
func (db *UploadDataDB) InsertPackage(ctx SQLContext, newPackage uploadersdk.Package) error {
	// Probe for an existing row. Bug fix: the previous version treated ANY
	// non-nil error (including connection failures) as "record absent" and
	// went on to insert; now only a genuine record-not-found proceeds, and
	// other DB errors are reported to the caller.
	err := ctx.Table("package").Where("package_id = ?", newPackage.PackageID).First(&uploadersdk.PackageDAO{}).Error
	if err == nil {
		return nil // already present, nothing to do
	}
	if !errors.Is(err, gorm.ErrRecordNotFound) {
		return err
	}

	dao := uploadersdk.PackageDAO{
		PackageID:   newPackage.PackageID,
		PackageName: newPackage.PackageName,
		DataType:    newPackage.DataType,
		UserID:      newPackage.UserID,
		BindingID:   -1, // -1 marks "no binding yet"
	}
	return ctx.Table("package").Create(&dao).Error
}
// DeletePackage removes the package row matching both the package ID and
// its owning user ID.
func (db *UploadDataDB) DeletePackage(ctx SQLContext, userID cdssdk.UserID, packageID cdssdk.PackageID) error {
	return ctx.Table("package").
		Where("package_id = ? and user_id = ?", packageID, userID).
		Delete(&uploadersdk.PackageDAO{}).Error
}
// QueryPackageByID loads one package row by its package_id, skipping the
// "Objects" column and eagerly loading the UploadedCluster association.
// NOTE(review): Find (not First) is used with a single-row destination, so
// a missing row yields a zero-valued DAO with a nil error — confirm callers
// expect that instead of a record-not-found error.
func (db *UploadDataDB) QueryPackageByID(ctx SQLContext, ID cdssdk.PackageID) (uploadersdk.PackageDAO, error) {
var ret uploadersdk.PackageDAO
err := ctx.Table("package").Where("package_id = ?", ID).Omit("Objects").Preload("UploadedCluster").Find(&ret).Error
return ret, err
}
// ClusterMappingRow mirrors one row of the ClusterMapping table, pairing a
// scheduler cluster ID with its storage-system storage ID.
type ClusterMappingRow struct {
ClusterID schsdk.ClusterID `gorm:"column:clusterID" json:"clusterID"`
StorageID cdssdk.StorageID `gorm:"column:storageID" json:"storageID"`
}
// GetClusterMapping reads the whole ClusterMapping table and returns it as
// a clusterID -> storageID lookup map.
func (db *UploadDataDB) GetClusterMapping(ctx SQLContext) (map[schsdk.ClusterID]cdssdk.StorageID, error) {
	var mappings []ClusterMappingRow
	if err := ctx.Table("ClusterMapping").Find(&mappings).Error; err != nil {
		return nil, err
	}

	// Fold the rows into a map for O(1) lookups by cluster ID.
	result := make(map[schsdk.ClusterID]cdssdk.StorageID, len(mappings))
	for _, m := range mappings {
		result[m.ClusterID] = m.StorageID
	}
	return result, nil
}
// UpdatePackage updates the json_data and/or binding_id columns of the
// package row. An empty jsonData leaves json_data untouched; a bindingID
// of -1 leaves binding_id untouched.
func (db *UploadDataDB) UpdatePackage(ctx SQLContext, packageID cdssdk.PackageID, jsonData string, bindingID uploadersdk.DataID) error {
	if jsonData != "" {
		err := ctx.Table("package").Where("package_id = ?", packageID).Update("json_data", jsonData).Error
		if err != nil {
			return err
		}
	}
	if bindingID == -1 {
		return nil
	}
	return ctx.Table("package").Where("package_id = ?", packageID).Update("binding_id", bindingID).Error
}
// InsertBlockchains bulk-inserts the given blockchain evidence rows.
func (db *UploadDataDB) InsertBlockchains(ctx SQLContext, blockchains []*uploadersdk.BlockChain) error {
	return ctx.Table("BlockChain").Create(&blockchains).Error
}
// InsertOrUpdateBinding upserts a BindingData row: on an ID conflict only
// bindingName is refreshed; otherwise a new row is created.
func (db *UploadDataDB) InsertOrUpdateBinding(ctx SQLContext, data uploadersdk.BindingData) error {
	onConflict := clause.OnConflict{
		Columns: []clause.Column{{Name: "ID"}}, // conflict target column
		DoUpdates: clause.Assignments(map[string]interface{}{
			"bindingName": data.BindingName,
		}),
	}
	return ctx.Table("BindingData").Clauses(onConflict).Create(&data).Error
}

View File

@ -403,13 +403,6 @@ func (s *DefaultPreScheduler) scheduleForNormalOrMultiJob(jobSet *schsdk.JobSetI
return nil, ErrNoAvailableScheme
}
// 此逻辑用于测试,生产环境必须删除!
for i := 0; i < len(allCCsArr); i++ {
if allCCsArr[i].CC.CCID == schsdk.CCID(jobResource.Storage) {
targetNode = allCCsArr[i]
}
}
scheme := s.makeSchemeForNode(jobFiles, targetNode)
return &scheme, nil
}

View File

@ -0,0 +1,349 @@
package prescheduler2
import (
"fmt"
pcmsch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"github.com/inhies/go-bytesize"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
"gitlink.org.cn/cloudream/common/utils/math2"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)
// calcResourceScore asks the PCM scheduler for the resource status of every
// candidate cluster and stores the computed per-cluster resource score on
// the matching candidate in allCCs.
func (s *DefaultPreScheduler) calcResourceScore(jobResource schsdk.JobResourcesInfo, allCCs map[schsdk.CCID]*candidate) error {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	// Query every candidate's cluster in one request.
	clusterIDs := make([]schsdk.ClusterID, 0, len(allCCs))
	for _, cand := range allCCs {
		clusterIDs = append(clusterIDs, schsdk.ClusterID(cand.CC.CCID))
	}

	clusterDetails, err := schCli.GetClusterInfo(pcmsch.GetClusterInfoReq{IDs: clusterIDs})
	if err != nil {
		return fmt.Errorf("get cluster info: %w", err)
	}

	// Attach each cluster's score to the candidate with the same ID.
	for _, detail := range clusterDetails {
		score, err := s.calcOneResourceScore(jobResource, detail.Resources2)
		if err != nil {
			return err
		}
		for _, cand := range allCCs {
			if schsdk.ClusterID(cand.CC.CCID) == detail.ClusterId {
				cand.Resource = *score
				break
			}
		}
	}
	return nil
}
// 划分节点资源等级,并计算资源得分
func (s *DefaultPreScheduler) calcOneResourceScore(requires schsdk.JobResourcesInfo, resourceData []pcmsch.ResourceData) (*resourcesDetail, error) {
var resDetail resourcesDetail
//计算资源得分
totalScore := 0.0
maxLevel := 0
resKinds := 0
if requires.CPU > 0 {
res := findResuorce[*pcmsch.CPUResourceData](resourceData)
if res == nil {
resDetail.CPU.Level = ResourceLevel3
resDetail.CPU.Score = 0
} else {
resDetail.CPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.CPU)
resDetail.CPU.Score = (float64(res.Available.Value) / requires.CPU) * CpuResourceWeight
}
maxLevel = math2.Max(maxLevel, resDetail.CPU.Level)
totalScore += resDetail.CPU.Score
resKinds++
}
if requires.GPU > 0 {
res := findResuorce[*pcmsch.GPUResourceData](resourceData)
if res == nil {
resDetail.GPU.Level = ResourceLevel3
resDetail.GPU.Score = 0
} else {
resDetail.GPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.GPU)
resDetail.GPU.Score = (float64(res.Available.Value) / requires.GPU) * CpuResourceWeight
}
maxLevel = math2.Max(maxLevel, resDetail.GPU.Level)
totalScore += resDetail.GPU.Score
resKinds++
}
if requires.NPU > 0 {
res := findResuorce[*pcmsch.NPUResourceData](resourceData)
if res == nil {
resDetail.NPU.Level = ResourceLevel3
resDetail.NPU.Score = 0
} else {
resDetail.NPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.NPU)
resDetail.NPU.Score = (float64(res.Available.Value) / requires.NPU) * CpuResourceWeight
}
maxLevel = math2.Max(maxLevel, resDetail.NPU.Level)
totalScore += resDetail.NPU.Score
resKinds++
}
if requires.MLU > 0 {
res := findResuorce[*pcmsch.MLUResourceData](resourceData)
if res == nil {
resDetail.MLU.Level = ResourceLevel3
resDetail.MLU.Score = 0
} else {
resDetail.MLU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.MLU)
resDetail.MLU.Score = (float64(res.Available.Value) / requires.MLU) * CpuResourceWeight
}
maxLevel = math2.Max(maxLevel, resDetail.MLU.Level)
totalScore += resDetail.MLU.Score
resKinds++
}
if requires.Storage > 0 {
res := findResuorce[*pcmsch.StorageResourceData](resourceData)
if res == nil {
resDetail.Storage.Level = ResourceLevel3
resDetail.Storage.Score = 0
} else {
bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit))
if err != nil {
return nil, err
}
resDetail.Storage.Level = s.calcResourceLevel(float64(bytes), float64(requires.Storage))
resDetail.Storage.Score = (float64(bytes) / float64(requires.Storage)) * StgResourceWeight
}
maxLevel = math2.Max(maxLevel, resDetail.Storage.Level)
totalScore += resDetail.Storage.Score
resKinds++
}
if requires.Memory > 0 {
res := findResuorce[*pcmsch.MemoryResourceData](resourceData)
if res == nil {
resDetail.Memory.Level = ResourceLevel3
resDetail.Memory.Score = 0
} else {
bytes, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit))
if err != nil {
return nil, err
}
resDetail.Memory.Level = s.calcResourceLevel(float64(bytes), float64(requires.Memory))
resDetail.Memory.Score = (float64(bytes) / float64(requires.Memory)) * StgResourceWeight
}
maxLevel = math2.Max(maxLevel, resDetail.Memory.Level)
totalScore += resDetail.Memory.Score
resKinds++
}
if resKinds == 0 {
return &resDetail, nil
}
resDetail.TotalScore = totalScore
resDetail.AvgScore = resDetail.AvgScore / float64(resKinds)
resDetail.MaxLevel = maxLevel
return &resDetail, nil
}
// calcResourceLevel buckets availability against the requirement:
// level 1 when at least 1.5x the need is free, level 2 when the need is
// met, level 3 otherwise.
func (s *DefaultPreScheduler) calcResourceLevel(avai float64, need float64) int {
	switch {
	case avai >= 1.5*need:
		return ResourceLevel1
	case avai >= need:
		return ResourceLevel2
	default:
		return ResourceLevel3
	}
}
// 计算节点得分情况
// calcFileScore computes, for every candidate computing center, the data
// locality scores of the job's code, dataset and image files, then sums
// them into each candidate's total file score.
func (s *DefaultPreScheduler) calcFileScore(files schsdk.JobFilesInfo, allCCs map[schsdk.CCID]*candidate) error {
	// Only weigh data held on storages belonging to an available center.
	stgToCand := make(map[cdssdk.StorageID]*candidate)
	for _, cand := range allCCs {
		stgToCand[cand.CC.CDSStorageID] = cand
	}

	// Code file scores.
	if pkg, ok := files.Code.(*schsdk.PackageJobFileInfo); ok {
		scores, err := s.calcPackageFileScore(pkg.PackageID, stgToCand)
		if err != nil {
			return fmt.Errorf("calc code file score: %w", err)
		}
		for ccID, sc := range scores {
			allCCs[ccID].Files.Code = *sc
		}
	}

	// Dataset file scores.
	if pkg, ok := files.Dataset.(*schsdk.PackageJobFileInfo); ok {
		scores, err := s.calcPackageFileScore(pkg.PackageID, stgToCand)
		if err != nil {
			return fmt.Errorf("calc dataset file score: %w", err)
		}
		for ccID, sc := range scores {
			allCCs[ccID].Files.Dataset = *sc
		}
	}

	// Image file scores.
	if img, ok := files.Image.(*schsdk.ImageJobFileInfo); ok {
		scores, err := s.calcImageFileScore(img.ImageID, allCCs, stgToCand)
		if err != nil {
			return fmt.Errorf("calc image file score: %w", err)
		}
		for ccID, sc := range scores {
			allCCs[ccID].Files.Image = *sc
		}
	}

	// Total = caching + loading score across all three file kinds.
	for _, cand := range allCCs {
		cand.Files.TotalScore = cand.Files.Code.CachingScore +
			cand.Files.Code.LoadingScore +
			cand.Files.Dataset.CachingScore +
			cand.Files.Dataset.LoadingScore +
			cand.Files.Image.CachingScore +
			cand.Files.Image.LoadingScore
	}
	return nil
}
// calcPackageFileScore computes the per-computing-center score of a package:
// how much of it is already cached on each center's storage, and whether it
// has already been loaded there. Only centers present in cdsStgToCC are scored.
func (s *DefaultPreScheduler) calcPackageFileScore(packageID cdssdk.PackageID, cdsStgToCC map[cdssdk.StorageID]*candidate) (map[schsdk.CCID]*fileDetail, error) {
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)
	ccFileScores := make(map[schsdk.CCID]*fileDetail)
	// TODO UserID: the user ID is currently hard-coded to 1.
	cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, packageID))
	if err != nil {
		return nil, err
	}
	for _, cdsNodeCacheInfo := range cachedResp.StorageInfos {
		// Skip storages that do not belong to any available computing center.
		cc, ok := cdsStgToCC[cdsNodeCacheInfo.StorageID]
		if !ok {
			continue
		}
		ccFileScores[cc.CC.CCID] = &fileDetail{
			// TODO the caching score may need a different formula depending on the caching mode.
			CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight,
		}
	}
	// TODO UserID: the user ID is currently hard-coded to 1.
	loadedResp, err := colCli.PackageGetLoadedStgs(collector.NewPackageGetLoadedStgs(1, packageID))
	if err != nil {
		return nil, err
	}
	for _, cdsStgID := range loadedResp.StgIDs {
		cc, ok := cdsStgToCC[cdsStgID]
		if !ok {
			continue
		}
		// A center may appear in the loaded list without having cache info;
		// create its entry on demand.
		sfc, ok := ccFileScores[cc.CC.CCID]
		if !ok {
			sfc = &fileDetail{}
			ccFileScores[cc.CC.CCID] = sfc
		}
		sfc.LoadingScore = 1 * LoadedWeight
		sfc.IsLoaded = true
	}
	return ccFileScores, nil
}
// calcImageFileScore computes the per-computing-center score of an image:
// how much of its backing CDS package is cached on each center's storage, and
// whether the image has already been imported into the computing center.
// (The original header comment said "package" — copy-paste; this scores an image.)
func (s *DefaultPreScheduler) calcImageFileScore(imageID schsdk.ImageID, allCCs map[schsdk.CCID]*candidate, cdsStgToCC map[cdssdk.StorageID]*candidate) (map[schsdk.CCID]*fileDetail, error) {
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)
	magCli, err := schglb.ManagerMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new manager client: %w", err)
	}
	defer schglb.ManagerMQPool.Release(magCli)
	imageInfoResp, err := magCli.GetImageInfo(mgrmq.NewGetImageInfo(imageID))
	if err != nil {
		return nil, fmt.Errorf("getting image info: %w", err)
	}
	ccFileScores := make(map[schsdk.CCID]*fileDetail)
	// The caching score is only computable when the image is backed by a CDS package.
	if imageInfoResp.Image.CDSPackageID != nil {
		// NOTE(review): user ID is hard-coded to 1 here as well — confirm.
		cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(1, *imageInfoResp.Image.CDSPackageID))
		if err != nil {
			return nil, err
		}
		for _, cdsNodeCacheInfo := range cachedResp.StorageInfos {
			// Skip storages that do not belong to any available computing center.
			cc, ok := cdsStgToCC[cdsNodeCacheInfo.StorageID]
			if !ok {
				continue
			}
			ccFileScores[cc.CC.CCID] = &fileDetail{
				// TODO the caching score may need a different formula depending on the caching mode.
				CachingScore: float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight,
			}
		}
	}
	// The image's LoadingScore reflects whether it has already been imported
	// into the computing center.
	for _, pcmImg := range imageInfoResp.PCMImages {
		_, ok := allCCs[pcmImg.CCID]
		if !ok {
			continue
		}
		fsc, ok := ccFileScores[pcmImg.CCID]
		if !ok {
			fsc = &fileDetail{}
			ccFileScores[pcmImg.CCID] = fsc
		}
		fsc.LoadingScore = 1 * LoadedWeight
		fsc.IsLoaded = true
	}
	return ccFileScores, nil
}

View File

@ -0,0 +1,636 @@
package prescheduler2
import (
"fmt"
pcmsch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
"sort"
"github.com/samber/lo"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
const (
	// Resource levels assigned to each node:
	// ResourceLevel1 means every resource kind is available at >= 1.5x the requirement.
	ResourceLevel1 = 1
	// ResourceLevel2 means Level1 is not met, but every resource kind is available at >= 1x the requirement.
	ResourceLevel2 = 2
	// ResourceLevel3 means at least one resource kind is available at < 1x the requirement.
	ResourceLevel3 = 3

	// Weights applied when aggregating resource and file scores.
	CpuResourceWeight float64 = 1
	StgResourceWeight float64 = 1.2
	CachingWeight     float64 = 1
	LoadedWeight      float64 = 2
)
// ErrNoAvailableScheme is returned when no computing center can satisfy the
// job's requirements.
var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait")
// candidate is a computing center being considered as a scheduling target,
// together with the scores computed for it.
type candidate struct {
	CC schmod.ComputingCenter

	IsReferencedJobTarget bool // whether this node is the one chosen by the job this job depends on

	Resource resourcesDetail
	Files    filesDetail
}

// resourcesDetail aggregates per-resource-kind scheduling details.
type resourcesDetail struct {
	CPU     resourceDetail
	GPU     resourceDetail
	NPU     resourceDetail
	MLU     resourceDetail
	Storage resourceDetail
	Memory  resourceDetail

	TotalScore float64
	AvgScore   float64
	MaxLevel   int
}

// resourceDetail is the grading of a single resource kind on a candidate.
type resourceDetail struct {
	Level int     // one of ResourceLevel1/2/3
	Score float64 // weighted availability score
}

// filesDetail aggregates the data-file scores of one candidate.
type filesDetail struct {
	Dataset fileDetail
	Code    fileDetail
	Image   fileDetail

	TotalScore float64
}

// fileDetail is the locality score of a single data file on a candidate.
type fileDetail struct {
	CachingScore float64
	LoadingScore float64
	// IsLoaded: for storage data, whether it has already been loaded onto this
	// node; for an image, whether it has already been imported into this
	// computing center.
	IsLoaded bool
}

// schedulingJob is a job together with the local IDs of the jobs it depends on.
type schedulingJob struct {
	Job    schsdk.JobInfo
	Afters []string
}
// CandidateArr orders candidate computing centers from most to least
// preferable for scheduling; it implements sort.Interface.
type CandidateArr []*candidate

func (a CandidateArr) Len() int      { return len(a) }
func (a CandidateArr) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less reports whether candidate i should rank before candidate j.
func (a CandidateArr) Less(i, j int) bool {
	n1 := a[i]
	n2 := a[j]

	// Prefer placing the job together with the job it depends on, as long as
	// that node's resources are sufficient.
	if n1.IsReferencedJobTarget && n1.Resource.MaxLevel < ResourceLevel3 {
		return true
	}
	if n2.IsReferencedJobTarget && n2.Resource.MaxLevel < ResourceLevel3 {
		// Fix: the original returned true here as well, which wrongly ranked
		// n1 ahead of n2 when n2 (not n1) was the referenced job's target.
		return false
	}

	// A lower resource level means the requirements are better satisfied.
	if n1.Resource.MaxLevel != n2.Resource.MaxLevel {
		return n1.Resource.MaxLevel < n2.Resource.MaxLevel
	}

	// Same level: break the tie with category scores.
	switch n1.Resource.MaxLevel {
	case ResourceLevel1:
		// A higher total file score means more of the job's data already
		// resides on this node, so it is preferred.
		return n1.Files.TotalScore > n2.Files.TotalScore
	case ResourceLevel2, ResourceLevel3:
		// A higher average resource score means more spare capacity.
		return n1.Resource.AvgScore > n2.Resource.AvgScore
	}
	return false
}
// DefaultPreScheduler is the default PreScheduler implementation. It is
// stateless; all inputs are passed per call.
type DefaultPreScheduler struct {
}

// NewDefaultPreScheduler creates a new DefaultPreScheduler.
func NewDefaultPreScheduler() *DefaultPreScheduler {
	return &DefaultPreScheduler{}
}
// ScheduleJobSet pre-schedules a whole job set: it produces a scheduling
// scheme for every job and an upload scheme for the local files the jobs
// reference. Jobs are processed in dependency order so a job can be
// co-located with the job it depends on.
func (s *DefaultPreScheduler) ScheduleJobSet(info *schsdk.JobSetInfo, allCC []schmod.ComputingCenter) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) {
	jobSetScheme := &jobmod.JobSetPreScheduleScheme{
		JobSchemes: make(map[string]jobmod.JobScheduleScheme),
	}
	// Keyed by local path so the same file is only uploaded once.
	filesUploadSchemes := make(map[string]schsdk.LocalFileUploadScheme)
	ccs := make(map[schsdk.CCID]schmod.ComputingCenter)
	for _, node := range allCC {
		ccs[node.CCID] = node
	}
	if len(ccs) == 0 {
		return nil, nil, ErrNoAvailableScheme
	}
	// First collect, from each job's configuration, the local IDs of the jobs
	// it depends on.
	var schJobs []*schedulingJob
	for _, job := range info.Jobs {
		j := &schedulingJob{
			Job: job,
		}
		if norJob, ok := job.(*schsdk.NormalJobInfo); ok {
			// A normal job depends on the jobs whose returned data it consumes.
			if resFile, ok := norJob.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok {
				j.Afters = append(j.Afters, resFile.DataReturnLocalJobID)
			}
			if resFile, ok := norJob.Files.Code.(*schsdk.DataReturnJobFileInfo); ok {
				j.Afters = append(j.Afters, resFile.DataReturnLocalJobID)
			}
		} else if resJob, ok := job.(*schsdk.DataReturnJobInfo); ok {
			j.Afters = append(j.Afters, resJob.TargetLocalJobID)
		}
		schJobs = append(schJobs, j)
	}
	// Then order the jobs by their dependencies.
	schJobs, ok := s.orderByAfters(schJobs)
	if !ok {
		return nil, nil, fmt.Errorf("circular reference detected between jobs in the job set")
	}
	// After sorting, generate a scheduling scheme for each job in order.
	for _, job := range schJobs {
		var fileInfo schsdk.JobFilesInfo
		isNormalType := false
		norJob, ok := job.Job.(*schsdk.NormalJobInfo)
		if ok {
			fileInfo = norJob.Files
			isNormalType = true
		}
		dpJob, ok := job.Job.(*schsdk.DataPreprocessJobInfo)
		if ok {
			fileInfo = dpJob.Files
			isNormalType = true
		}
		ftJob, ok := job.Job.(*schsdk.FinetuningJobInfo)
		if ok {
			fileInfo = ftJob.Files
			isNormalType = true
		}
		if isNormalType {
			scheme, err := s.scheduleForNormalOrMultiJob(info, job, ccs, jobSetScheme.JobSchemes)
			if err != nil {
				return nil, nil, err
			}
			jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme
			// Inspect the data file configuration and generate local-file
			// upload schemes.
			s.fillNormarlJobLocalUploadScheme(fileInfo, scheme.TargetCCID, filesUploadSchemes, ccs)
		}
		if mulJob, ok := job.Job.(*schsdk.MultiInstanceJobInfo); ok {
			scheme, err := s.scheduleForNormalOrMultiJob(info, job, ccs, jobSetScheme.JobSchemes)
			if err != nil {
				return nil, nil, err
			}
			jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme
			// Inspect the data file configuration and generate local-file
			// upload schemes.
			s.fillNormarlJobLocalUploadScheme(mulJob.Files, scheme.TargetCCID, filesUploadSchemes, ccs)
		}
		// Data-return jobs currently do not need a scheduling scheme.
	}
	return jobSetScheme, &schsdk.JobSetFilesUploadScheme{
		LocalFileSchemes: lo.Values(filesUploadSchemes),
	}, nil
}
// ScheduleJob pre-schedules a single job: it fetches the details of the given
// clusters, filters them by the requested priorities (chip type and function
// bias; region matching is currently disabled), and returns the eligible
// cluster with the most remaining resources.
func (s *DefaultPreScheduler) ScheduleJob(priorities []pcmsch.ResourcePriority, clusterMapping map[schsdk.ClusterID]cdssdk.StorageID) (*schsdk.ClusterID, error) {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	// Query the details of the specified computing clusters.
	clusterIDs := make([]schsdk.ClusterID, 0, len(clusterMapping))
	for id := range clusterMapping {
		clusterIDs = append(clusterIDs, id)
	}
	clusterDetails, err := schCli.GetClusterInfo(pcmsch.GetClusterInfoReq{
		IDs: clusterIDs,
	})
	if err != nil {
		return nil, fmt.Errorf("get cluster info: %w", err)
	}

	// Pick out the priority kinds this scheduler knows how to match.
	var chipPriority *pcmsch.ChipPriority
	var biasPriority *pcmsch.BiasPriority
	for _, priority := range priorities {
		switch pr := priority.(type) {
		case *pcmsch.ChipPriority:
			chipPriority = pr
		case *pcmsch.BiasPriority:
			biasPriority = pr
		}
	}

	// Collect the clusters that satisfy every requested priority.
	var eligibleClusters []pcmsch.ClusterDetail
	for _, cluster := range clusterDetails {
		// Match chip type (CPU, GPU, ...).
		if !matchChipType(cluster.Resources, chipPriority) {
			continue
		}
		// Match function bias (cloud / AI / HPC computing, ...).
		if !matchFunction(cluster.ClusterType, biasPriority) {
			continue
		}
		eligibleClusters = append(eligibleClusters, cluster)
	}
	if len(eligibleClusters) == 0 {
		return nil, fmt.Errorf("no eligible clusters found")
	}

	// Prefer the cluster with the most remaining resources. Sorting a
	// single-element slice is a no-op, so the original len==1 special case
	// is unnecessary.
	sort.Slice(eligibleClusters, func(i, j int) bool {
		return getRemainingResources(eligibleClusters[i].Resources) > getRemainingResources(eligibleClusters[j].Resources)
	})
	return &eligibleClusters[0].ClusterId, nil
}
// matchRegion reports whether the given region satisfies the region priority.
// A nil or empty priority matches every region.
func matchRegion(region string, regionPriority *pcmsch.RegionPriority) bool {
	if regionPriority == nil || len(regionPriority.Options) == 0 {
		return true
	}
	for i := range regionPriority.Options {
		if regionPriority.Options[i] == region {
			return true
		}
	}
	return false
}
// matchChipType reports whether any of the cluster's resources matches one of
// the requested chip types. A nil or empty priority matches every cluster.
func matchChipType(resources []pcmsch.TmpResourceData, chipPriority *pcmsch.ChipPriority) bool {
	if chipPriority == nil || len(chipPriority.Options) == 0 {
		return true
	}
	for i := range resources {
		if contains(chipPriority.Options, string(resources[i].Type)) {
			return true
		}
	}
	return false
}
// matchFunction reports whether the cluster's function type (cloud / AI / HPC
// computing, ...) satisfies the bias priority. A nil or empty priority
// matches every cluster.
func matchFunction(functionType string, biasPriority *pcmsch.BiasPriority) bool {
	if biasPriority == nil || len(biasPriority.Options) == 0 {
		return true
	}
	for i := range biasPriority.Options {
		if biasPriority.Options[i] == functionType {
			return true
		}
	}
	return false
}
// getRemainingResources sums the weighted available amounts of a cluster's
// resources. Compute kinds (CPU/NPU/GPU/MLU) share one weight and
// storage-like kinds (Storage/Memory) share another, so the arms are grouped.
// Unknown resource types contribute nothing.
func getRemainingResources(resources []pcmsch.TmpResourceData) float64 {
	var total float64
	for i := range resources {
		res := &resources[i]
		switch res.Type {
		case pcmsch.ResourceTypeCPU, pcmsch.ResourceTypeNPU, pcmsch.ResourceTypeGPU, pcmsch.ResourceTypeMLU:
			total += res.Available.Value * CpuResourceWeight
		case pcmsch.ResourceTypeStorage, pcmsch.ResourceTypeMemory:
			total += res.Available.Value * StgResourceWeight
		}
	}
	return total
}
// contains reports whether str is present in slice.
func contains(slice []string, str string) bool {
	for i := range slice {
		if slice[i] == str {
			return true
		}
	}
	return false
}
// orderByAfters topologically sorts the jobs by their dependencies (Afters),
// so that every job appears after all jobs it depends on. The second return
// value is false when a dependency cycle is detected.
func (s *DefaultPreScheduler) orderByAfters(jobs []*schedulingJob) ([]*schedulingJob, bool) {
	// Work on copies of the Afters lists, since entries are removed while sorting.
	type jobOrder struct {
		Job    *schedulingJob
		Afters []string
	}
	var jobOrders []*jobOrder
	for _, job := range jobs {
		od := &jobOrder{
			Job:    job,
			Afters: make([]string, len(job.Afters)),
		}
		copy(od.Afters, job.Afters)
		jobOrders = append(jobOrders, od)
	}
	// Then sort: repeatedly extract jobs with no remaining dependencies.
	var orderedJob []*schedulingJob
	for {
		// rm counts the jobs extracted in this pass; the surviving entries are
		// compacted in place (shifted left by rm) and the slice truncated below.
		rm := 0
		for i, jo := range jobOrders {
			// Extract a job that has no outstanding dependencies.
			if len(jo.Afters) == 0 {
				orderedJob = append(orderedJob, jo.Job)
				// Remove references to it from every other job's Afters.
				for _, job2 := range jobOrders {
					job2.Afters = lo.Reject(job2.Afters, func(item string, idx int) bool { return item == jo.Job.Job.GetLocalJobID() })
				}
				rm++
				continue
			}
			jobOrders[i-rm] = jobOrders[i]
		}
		jobOrders = jobOrders[:len(jobOrders)-rm]
		if len(jobOrders) == 0 {
			break
		}
		// A full pass that extracted nothing means the remaining jobs form a
		// cycle: sorting fails.
		if rm == 0 {
			return nil, false
		}
	}
	return orderedJob, true
}
// scheduleForNormalOrMultiJob generates a scheduling scheme for a normal,
// data-preprocess, finetuning or multi-instance job: it scores every available
// computing center by file locality and resource headroom, then picks the
// best-ranked one that still meets the resource requirements.
func (s *DefaultPreScheduler) scheduleForNormalOrMultiJob(jobSet *schsdk.JobSetInfo, job *schedulingJob, ccs map[schsdk.CCID]schmod.ComputingCenter, jobSchemes map[string]jobmod.JobScheduleScheme) (*jobmod.JobScheduleScheme, error) {
	allCCs := make(map[schsdk.CCID]*candidate)
	// Initialize the candidate info for every computing center.
	for _, cc := range ccs {
		caNode := &candidate{
			CC: cc,
		}
		// Check whether this node is the one chosen by a job this job depends on.
		for _, af := range job.Afters {
			resJob := findJobInfo[*schsdk.DataReturnJobInfo](jobSet.Jobs, af)
			if resJob == nil {
				return nil, fmt.Errorf("resource job %s not found in the job set", af)
			}
			// Jobs are processed in dependency order, so the referenced scheme
			// should normally already exist.
			scheme, ok := jobSchemes[resJob.TargetLocalJobID]
			if !ok {
				continue
			}
			if scheme.TargetCCID == cc.CCID {
				caNode.IsReferencedJobTarget = true
				break
			}
		}
		allCCs[cc.CCID] = caNode
	}
	var jobFiles *schsdk.JobFilesInfo
	var jobResource *schsdk.JobResourcesInfo
	switch runningJob := job.Job.(type) {
	case *schsdk.NormalJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	case *schsdk.DataPreprocessJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	case *schsdk.FinetuningJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	case *schsdk.MultiInstanceJobInfo:
		jobFiles = &runningJob.Files
		jobResource = &runningJob.Resources
	}
	// Fix: guard against job types outside the four cases above — the original
	// would dereference the nil pointers below and panic.
	if jobFiles == nil || jobResource == nil {
		return nil, fmt.Errorf("unsupported job type %T for pre-scheduling", job.Job)
	}
	// Score file locality.
	err := s.calcFileScore(*jobFiles, allCCs)
	if err != nil {
		return nil, err
	}
	// Score resource headroom.
	err = s.calcResourceScore(*jobResource, allCCs)
	if err != nil {
		return nil, err
	}
	allCCsArr := lo.Values(allCCs)
	// Defensive: without candidates there is nothing to index below.
	if len(allCCsArr) == 0 {
		return nil, ErrNoAvailableScheme
	}
	sort.Sort(CandidateArr(allCCsArr))
	// The best candidate must at least meet the resource requirements.
	targetNode := allCCsArr[0]
	if targetNode.Resource.MaxLevel == ResourceLevel3 {
		return nil, ErrNoAvailableScheme
	}
	scheme := s.makeSchemeForNode(jobFiles, targetNode)
	return &scheme, nil
}
// fillNormarlJobLocalUploadScheme records, for every local file the job
// references (dataset, code, image), an upload scheme targeting the CDS
// storage of the chosen computing center. A path already present in schemes
// is left untouched so each file is uploaded only once.
// (Name keeps the historical "Normarl" spelling; callers depend on it.)
func (s *DefaultPreScheduler) fillNormarlJobLocalUploadScheme(files schsdk.JobFilesInfo, targetCCID schsdk.CCID, schemes map[string]schsdk.LocalFileUploadScheme, ccs map[schsdk.CCID]schmod.ComputingCenter) {
	// addScheme deduplicates the three identical branches of the original code.
	addScheme := func(fileInfo interface{}) {
		localFile, ok := fileInfo.(*schsdk.LocalJobFileInfo)
		if !ok {
			return
		}
		if _, ok := schemes[localFile.LocalPath]; ok {
			return
		}
		schemes[localFile.LocalPath] = schsdk.LocalFileUploadScheme{
			LocalPath:           localFile.LocalPath,
			UploadToCDStorageID: ccs[targetCCID].CDSStorageID,
		}
	}
	addScheme(files.Dataset)
	addScheme(files.Code)
	addScheme(files.Image)
}
// makeSchemeForNode builds the scheduling scheme for the chosen computing
// center, deciding per file kind whether it must be staged there first.
func (s *DefaultPreScheduler) makeSchemeForNode(jobFiles *schsdk.JobFilesInfo, targetCC *candidate) jobmod.JobScheduleScheme {
	scheme := jobmod.JobScheduleScheme{
		TargetCCID: targetCC.CC.CCID,
	}

	// TODO choose between Move and Load depending on the actual situation.
	scheme.Dataset.Action = jobmod.ActionNo
	if _, isPkg := jobFiles.Dataset.(*schsdk.PackageJobFileInfo); isPkg && !targetCC.Files.Dataset.IsLoaded {
		scheme.Dataset.Action = jobmod.ActionLoad
	}

	scheme.Code.Action = jobmod.ActionNo
	if _, isPkg := jobFiles.Code.(*schsdk.PackageJobFileInfo); isPkg && !targetCC.Files.Code.IsLoaded {
		scheme.Code.Action = jobmod.ActionLoad
	}

	// An image that is not yet imported into the center must be imported.
	scheme.Image.Action = jobmod.ActionNo
	if _, isPkg := jobFiles.Image.(*schsdk.PackageJobFileInfo); isPkg && !targetCC.Files.Image.IsLoaded {
		scheme.Image.Action = jobmod.ActionImportImage
	}

	return scheme
}
// findResuorce returns the first element of all whose concrete type is T, or
// the zero value of T when none matches.
// (Name keeps the historical "Resuorce" spelling; callers depend on it.)
func findResuorce[T uopsdk.ResourceData](all []pcmsch.ResourceData) T {
	var zero T
	for _, res := range all {
		if typed, ok := res.(T); ok {
			return typed
		}
	}
	return zero
}
// findJobInfo returns the job whose local ID equals localJobID and whose
// concrete type is T, or the zero value of T when no such job exists.
func findJobInfo[T schsdk.JobInfo](jobs []schsdk.JobInfo, localJobID string) T {
	var zero T
	for _, j := range jobs {
		if j.GetLocalJobID() != localJobID {
			continue
		}
		if typed, ok := j.(T); ok {
			return typed
		}
	}
	return zero
}

View File

@ -0,0 +1,14 @@
package prescheduler2
import (
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// PreScheduler generates scheduling schemes for jobs before they are actually
// submitted for execution.
type PreScheduler interface {
	// ScheduleJobSet pre-schedules a whole job set against the given computing
	// centers, returning per-job schemes plus upload schemes for local files.
	ScheduleJobSet(info *schsdk.JobSetInfo, allCC []schmod.ComputingCenter) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error)
	// ScheduleJob pre-schedules a single job by resource priorities over the
	// clusters in clusterMapping, returning the chosen cluster's ID.
	ScheduleJob(priority []sch.ResourcePriority, clusterMapping map[schsdk.ClusterID]cdssdk.StorageID) (*schsdk.ClusterID, error)
}

View File

@ -192,3 +192,753 @@
2024-09-26 15:21:41 [WARN] [HTTP:JobSet.Submit] submitting jobset: submitting job set to manager: code: OperationFailed, message: job set 0 is not found
2024-09-26 16:15:22 [WARN] [HTTP:JobSet.Submit] submitting jobset: submitting job set to manager: code: OperationFailed, message: job set 0 is not found
2024-09-26 16:30:52 [WARN] [HTTP:JobSet.Submit] submitting jobset: submitting job set to manager: code: OperationFailed, message: job set 0 is not found
2024-12-05 16:38:44 [FATA] failed to connect to database: dial tcp 101.201.215.196:3306: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-05 16:41:25 [INFO] start serving http at: :7891
2024-12-05 16:48:18 [DEBU] uploading job
2024-12-05 16:51:36 [INFO] start serving http at: :7891
2024-12-05 16:51:44 [DEBU] uploading job
2024-12-05 16:54:57 [WARN] [HTTP:JobSet.LocalFileUploaded] binding body: Key: 'JobSetLocalFileUploadedReq.JobSetID' Error:Field validation for 'JobSetID' failed on the 'required' tag
Key: 'JobSetLocalFileUploadedReq.LocalPath' Error:Field validation for 'LocalPath' failed on the 'required' tag
2024-12-05 16:55:44 [DEBU] uploading job
2024-12-05 16:56:17 [WARN] [HTTP:JobSet.LocalFileUploaded] binding body: Key: 'JobSetLocalFileUploadedReq.JobSetID' Error:Field validation for 'JobSetID' failed on the 'required' tag
Key: 'JobSetLocalFileUploadedReq.LocalPath' Error:Field validation for 'LocalPath' failed on the 'required' tag
2024-12-05 16:58:38 [ERRO]
2024-12-05 16:58:38 [INFO] jobID: %s change state from %s to %s1&{0xc000780db0 test_image.png image [111]} &{0xc000780bc0}
2024-12-05 16:58:38 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 16:58:38 [INFO] [JobID:1] [LastState:*state2.DataUpload] job failed with:
2024-12-05 16:58:38 [INFO] job set 1 completed
2024-12-05 16:58:52 [DEBU] uploading job
2024-12-05 16:59:30 [ERRO]
2024-12-05 16:59:30 [INFO] jobID: %s change state from %s to %s2&{0xc0005aa0c0 test_image.png image [111]} &{0xc000780c30}
2024-12-05 16:59:30 [INFO] [JobID:2] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 16:59:30 [INFO] [JobID:2] [LastState:*state2.DataUpload] job failed with:
2024-12-05 16:59:30 [INFO] job set 2 completed
2024-12-05 16:59:53 [DEBU] uploading job
2024-12-05 17:00:59 [ERRO]
2024-12-05 17:00:59 [INFO] jobID: %s change state from %s to %s3&{0xc0005aa510 test_image.png image [111]} &{0xc0005aa750}
2024-12-05 17:00:59 [INFO] [JobID:3] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 17:00:59 [INFO] [JobID:3] [LastState:*state2.DataUpload] job failed with:
2024-12-05 17:00:59 [INFO] job set 3 completed
2024-12-05 17:01:02 [INFO] start serving http at: :7891
2024-12-05 17:01:12 [DEBU] uploading job
2024-12-05 17:10:17 [ERRO] insert upload data fail: Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-05 17:10:17 [INFO] jobID: %s change state from %s to %s0&{0xc0004212c0 test_image.png image [111]} &{0xc00045e0e0}
2024-12-05 17:10:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-05 17:10:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-05 17:10:17 [INFO] job set 0 completed
2024-12-10 09:18:52 [INFO] start serving http at: :7891
2024-12-10 14:54:34 [INFO] start serving http at: :7891
2024-12-10 15:05:05 [INFO] start serving http at: :7891
2024-12-10 15:06:50 [DEBU] submitting job
2024-12-10 15:06:50 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:06:53 [INFO] start serving http at: :7891
2024-12-10 15:07:54 [DEBU] submitting job
2024-12-10 15:07:54 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:07:58 [INFO] start serving http at: :7891
2024-12-10 15:10:34 [DEBU] submitting job
2024-12-10 15:10:34 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:10:42 [INFO] start serving http at: :7891
2024-12-10 15:12:31 [DEBU] submitting job
2024-12-10 15:12:31 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:12:33 [INFO] start serving http at: :7891
2024-12-10 15:13:32 [DEBU] submitting job
2024-12-10 15:13:32 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:13:35 [INFO] start serving http at: :7891
2024-12-10 15:14:39 [DEBU] submitting job
2024-12-10 15:14:39 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:14:41 [INFO] start serving http at: :7891
2024-12-10 15:19:47 [DEBU] submitting job
2024-12-10 15:19:47 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:19:51 [INFO] start serving http at: :7891
2024-12-10 15:20:31 [INFO] start serving http at: :7891
2024-12-10 15:32:43 [INFO] start serving http at: :7891
2024-12-10 15:33:00 [DEBU] submitting job
2024-12-10 15:35:12 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:35:13 [INFO] start serving http at: :7891
2024-12-10 15:35:36 [DEBU] submitting job
2024-12-10 15:36:33 [ERRO] Error 1146 (42S02): Table 'scheduler.uploaddata' doesn't exist
2024-12-10 15:36:35 [INFO] start serving http at: :7891
2024-12-10 15:36:44 [DEBU] submitting job
2024-12-10 15:37:11 [ERRO] no upload data
2024-12-10 15:40:24 [INFO] start serving http at: :7891
2024-12-10 15:40:55 [DEBU] uploading job
2024-12-10 15:40:55 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: Error 1146 (42S02): Table 'scheduler.clustermapping' doesn't exist
2024-12-10 15:42:19 [INFO] start serving http at: :7891
2024-12-10 15:42:50 [DEBU] uploading job
2024-12-10 15:42:50 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 15:45:43 [INFO] start serving http at: :7891
2024-12-10 15:45:52 [DEBU] uploading job
2024-12-10 15:45:52 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 15:46:26 [INFO] start serving http at: :7891
2024-12-10 15:46:33 [DEBU] uploading job
2024-12-10 15:46:33 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.executorID' in 'field list'
2024-12-10 15:46:33 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.executorID' in 'field list'
2024-12-10 15:50:07 [INFO] start serving http at: :7891
2024-12-10 15:50:15 [DEBU] uploading job
2024-12-10 15:50:15 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.executorURL' in 'field list'
2024-12-10 15:50:15 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.executorURL' in 'field list'
2024-12-10 15:52:31 [INFO] start serving http at: :7891
2024-12-10 15:52:36 [DEBU] uploading job
2024-12-10 15:52:36 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-10 15:52:36 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-10 16:02:11 [INFO] start serving http at: :7891
2024-12-10 16:02:15 [DEBU] uploading job
2024-12-10 16:02:15 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:05:46 [INFO] start serving http at: :7891
2024-12-10 16:05:59 [DEBU] uploading job
2024-12-10 16:05:59 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:06:09 [DEBU] uploading job
2024-12-10 16:06:09 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:07:13 [DEBU] uploading job
2024-12-10 16:10:39 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:10:41 [INFO] start serving http at: :7891
2024-12-10 16:10:46 [DEBU] uploading job
2024-12-10 16:11:22 [WARN] [HTTP:JobSet.Upload] uploading file: query cluster mapping error: sql: Scan called without calling Next
2024-12-10 16:14:01 [INFO] start serving http at: :7891
2024-12-10 16:14:06 [DEBU] uploading job
2024-12-10 16:20:51 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: Get "http://121.36.5.116:7890/queryResources": dial tcp 121.36.5.116:7890: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-10 16:20:55 [INFO] start serving http at: :7891
2024-12-10 16:21:00 [DEBU] uploading job
2024-12-10 16:23:03 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:23:27 [INFO] start serving http at: :7891
2024-12-10 16:23:32 [DEBU] uploading job
2024-12-10 16:26:44 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:26:47 [INFO] start serving http at: :7891
2024-12-10 16:26:53 [DEBU] uploading job
2024-12-10 16:37:01 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:38:15 [INFO] start serving http at: :7891
2024-12-10 16:38:28 [DEBU] uploading job
2024-12-10 16:50:06 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:50:08 [INFO] start serving http at: :7891
2024-12-10 16:50:14 [DEBU] uploading job
2024-12-10 16:54:56 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:54:58 [INFO] start serving http at: :7891
2024-12-10 16:55:08 [DEBU] uploading job
2024-12-10 16:57:27 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 16:57:29 [INFO] start serving http at: :7891
2024-12-10 16:57:36 [DEBU] uploading job
2024-12-10 17:03:50 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-10 17:03:53 [INFO] start serving http at: :7891
2024-12-10 17:03:57 [DEBU] uploading job
2024-12-10 17:04:12 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field ClusterDetail.data.data.ClusterId of type schsdk.ClusterID
2024-12-10 17:04:23 [INFO] start serving http at: :7891
2024-12-10 17:04:26 [DEBU] uploading job
2024-12-10 17:05:08 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field ClusterDetail.data.data.ClusterId of type schsdk.ClusterID
2024-12-10 17:05:10 [INFO] start serving http at: :7891
2024-12-10 17:05:15 [DEBU] uploading job
2024-12-10 17:11:05 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field response[[]gitlink.org.cn/cloudream/common/sdks/pcmscheduler.ClusterDetail].data of type []sch.ClusterDetail
2024-12-10 17:11:08 [INFO] start serving http at: :7891
2024-12-10 17:11:21 [DEBU] uploading job
2024-12-10 17:13:01 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field ClusterDetail.data.data.ClusterId of type schsdk.ClusterID
2024-12-10 17:13:15 [INFO] start serving http at: :7891
2024-12-10 17:13:22 [DEBU] uploading job
2024-12-10 17:13:28 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.data.Resource of type sch.ResourceData
2024-12-10 17:13:37 [DEBU] uploading job
2024-12-10 17:42:29 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.data.Resource of type sch.ResourceData
2024-12-11 15:04:03 [INFO] start serving http at: :7891
2024-12-11 15:07:00 [INFO] start serving http at: :7891
2024-12-11 15:09:01 [DEBU] submitting job
2024-12-11 15:09:40 [ERRO] no upload data
2024-12-11 15:09:40 [INFO] jobID: %s change state from %s to %s0&{0xc0001d0000} &{0xc00023a150}
2024-12-11 15:09:40 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-11 15:09:40 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: no upload data
2024-12-11 15:09:40 [INFO] job set 0 completed
2024-12-11 15:10:07 [DEBU] uploading job
2024-12-11 15:10:24 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.data.Resource of type sch.ResourceData
2024-12-11 15:12:21 [DEBU] uploading job
2024-12-11 15:12:21 [WARN] cluster 2 not found
2024-12-11 15:12:21 [WARN] cluster 3 not found
2024-12-11 15:13:30 [DEBU] uploading job
2024-12-11 15:13:30 [WARN] cluster 2 not found
2024-12-11 15:13:30 [WARN] cluster 3 not found
2024-12-11 15:14:45 [DEBU] uploading job
2024-12-11 15:16:41 [ERRO] insert upload data fail: sql: Scan called without calling Next
2024-12-11 15:16:41 [INFO] jobID: %s change state from %s to %s3&{0xc00059e290 test_image.png image [1 2]} &{0xc00022c1e0}
2024-12-11 15:16:41 [INFO] [JobID:3] state changed: *state2.DataUpload -> *state.Completed
2024-12-11 15:16:41 [INFO] [JobID:3] [LastState:*state2.DataUpload] job failed with: insert upload data fail: sql: Scan called without calling Next
2024-12-11 15:16:41 [INFO] job set 3 completed
2024-12-11 15:21:10 [INFO] start serving http at: :7891
2024-12-11 15:21:20 [DEBU] uploading job
2024-12-11 15:22:11 [INFO] jobID: %s change state from %s to %s0&{0xc0006a88c0 test_image.png image [1 2]} &{<nil>}
2024-12-11 15:22:11 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-11 15:22:11 [INFO] [JobID:0] job completed successfuly
2024-12-11 15:22:11 [INFO] job set 0 completed
2024-12-11 15:24:59 [DEBU] uploading job
2024-12-11 15:28:32 [INFO] jobID: %s change state from %s to %s1&{0xc00053a0d0 test_image.png image [1 2]} &{<nil>}
2024-12-11 15:28:32 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-11 15:28:32 [INFO] [JobID:1] job completed successfuly
2024-12-11 15:28:32 [INFO] job set 1 completed
2024-12-11 15:38:34 [DEBU] uploading job
2024-12-11 15:45:34 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-11 15:47:21 [DEBU] submitting job
2024-12-11 16:36:15 [INFO] start serving http at: :7891
2024-12-11 16:37:38 [DEBU] submitting job
2024-12-11 16:40:34 [ERRO] create task: unknow response content type: text/plain; charset=utf-8
2024-12-11 16:40:34 [INFO] jobID: %s change state from %s to %s0&{0xc00018e0e0} &{0xc0001da3c0}
2024-12-11 16:40:34 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-11 16:40:34 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: create task: unknow response content type: text/plain; charset=utf-8
2024-12-11 16:40:34 [INFO] job set 0 completed
2024-12-11 16:58:56 [WARN] [HTTP:JobSet.GetServiceList] binding body: strconv.ParseInt: parsing "[1,5]": invalid syntax
2024-12-11 16:59:28 [INFO] start serving http at: :7891
2024-12-11 17:00:35 [WARN] [HTTP:JobSet.GetServiceList] binding body: Key: 'QueryUploadedReq.DataType' Error:Field validation for 'DataType' failed on the 'required' tag
Key: 'QueryUploadedReq.UserID' Error:Field validation for 'UserID' failed on the 'required' tag
2024-12-11 17:01:19 [INFO] start serving http at: :7891
2024-12-11 17:01:29 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:02:19 [INFO] start serving http at: :7891
2024-12-11 17:02:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:08:14 [INFO] start serving http at: :7891
2024-12-11 17:08:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:15:22 [INFO] start serving http at: :7891
2024-12-11 17:15:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:17:08 [INFO] start serving http at: :7891
2024-12-11 17:17:11 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:20:27 [INFO] start serving http at: :7891
2024-12-11 17:20:32 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:21:05 [INFO] start serving http at: :7891
2024-12-11 17:21:08 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.clusters' doesn't exist
2024-12-11 17:21:49 [INFO] start serving http at: :7891
2024-12-11 17:21:51 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.uploaded_clusters' doesn't exist
2024-12-11 17:22:21 [INFO] start serving http at: :7891
2024-12-11 17:22:23 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:23:02 [INFO] start serving http at: :7891
2024-12-11 17:23:05 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:26:19 [INFO] start serving http at: :7891
2024-12-11 17:26:21 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:26:53 [INFO] start serving http at: :7891
2024-12-11 17:26:57 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:28:05 [INFO] start serving http at: :7891
2024-12-11 17:28:06 [WARN] [HTTP:JobSet.GetServiceList] getting service list: uploadedcluster: unsupported relations for schema UploadedData
2024-12-11 17:29:20 [INFO] start serving http at: :7891
2024-12-11 17:29:34 [WARN] [HTTP:JobSet.GetServiceList] getting service list: clusters: unsupported relations for schema UploadedData
2024-12-11 17:30:25 [INFO] start serving http at: :7891
2024-12-11 17:31:05 [INFO] start serving http at: :7891
2024-12-11 17:31:07 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:31:39 [INFO] start serving http at: :7891
2024-12-11 17:31:41 [WARN] [HTTP:JobSet.GetServiceList] getting service list: UploadedCluster: unsupported relations for schema UploadedData
2024-12-11 17:31:55 [INFO] start serving http at: :7891
2024-12-11 17:33:56 [INFO] start serving http at: :7891
2024-12-12 09:39:30 [DEBU] uploading job
2024-12-12 09:39:30 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 09:41:57 [INFO] start serving http at: :7891
2024-12-12 09:42:01 [DEBU] uploading job
2024-12-12 09:42:01 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 09:53:16 [DEBU] uploading job
2024-12-12 09:54:11 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 09:54:26 [DEBU] uploading job
2024-12-12 10:00:14 [INFO] start serving http at: :7891
2024-12-12 10:00:21 [DEBU] uploading job
2024-12-12 10:00:22 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:00:43 [INFO] start serving http at: :7891
2024-12-12 10:00:47 [DEBU] uploading job
2024-12-12 10:00:48 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:00:54 [DEBU] uploading job
2024-12-12 10:00:55 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:01:04 [INFO] start serving http at: :7891
2024-12-12 10:01:51 [INFO] start serving http at: :7891
2024-12-12 10:02:06 [DEBU] uploading job
2024-12-12 10:02:06 [WARN] getting all computing center: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:02:06 [WARN] [HTTP:JobSet.Upload] uploading file: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:04:35 [INFO] start serving http at: :7891
2024-12-12 10:04:41 [DEBU] uploading job
2024-12-12 10:04:41 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:06:41 [DEBU] uploading job
2024-12-12 10:07:47 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: unknow response content type:
2024-12-12 10:09:29 [DEBU] uploading job
2024-12-12 10:12:19 [INFO] start serving http at: :7891
2024-12-12 10:12:30 [DEBU] uploading job
2024-12-12 10:12:30 [WARN] getting all computing center: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:12:30 [WARN] [HTTP:JobSet.Upload] uploading file: sql: Scan error on column index 6, name "Bootstrap": unsupported Scan, storing driver.Value type []uint8 into type *schsdk.Bootstrap
2024-12-12 10:15:08 [INFO] start serving http at: :7891
2024-12-12 10:15:10 [DEBU] uploading job
2024-12-12 10:15:13 [WARN] getting all computing center: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-12 10:15:13 [WARN] [HTTP:JobSet.Upload] uploading file: Error 1054 (42S22): Unknown column 'ComputingCenter.ClusterID' in 'field list'
2024-12-12 10:16:19 [INFO] start serving http at: :7891
2024-12-12 10:16:21 [DEBU] uploading job
2024-12-12 10:18:07 [INFO] start serving http at: :7891
2024-12-12 10:18:12 [DEBU] uploading job
2024-12-12 10:18:35 [DEBU] uploading job
2024-12-12 10:18:36 [DEBU] uploading job
2024-12-12 10:18:37 [DEBU] uploading job
2024-12-12 10:18:45 [DEBU] uploading job
2024-12-12 10:24:51 [INFO] start serving http at: :7891
2024-12-12 10:25:03 [DEBU] submitting job
2024-12-12 10:25:03 [ERRO] create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 10:25:03 [INFO] jobID: %s change state from %s to %s0&{0xc0002204d0} &{0xc0001da000}
2024-12-12 10:25:03 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-12 10:25:03 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 10:25:03 [INFO] job set 0 completed
2024-12-12 15:02:30 [INFO] start serving http at: :7891
2024-12-12 15:10:51 [DEBU] submitting job
2024-12-12 15:10:55 [ERRO] create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 15:10:55 [INFO] jobID: %s change state from %s to %s0&{0xc00018c2d0} &{0xc00007e560}
2024-12-12 15:10:55 [INFO] [JobID:0] state changed: *state2.PCMJobCreate -> *state.Completed
2024-12-12 15:10:55 [INFO] [JobID:0] [LastState:*state2.PCMJobCreate] job failed with: create task: unknow response content type: text/plain; charset=utf-8
2024-12-12 15:10:55 [INFO] job set 0 completed
2024-12-12 15:11:42 [DEBU] uploading job
2024-12-12 15:15:39 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-12 15:16:10 [INFO] start serving http at: :7891
2024-12-12 15:16:17 [DEBU] uploading job
2024-12-12 15:24:49 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-12 15:24:55 [INFO] start serving http at: :7891
2024-12-12 15:25:02 [DEBU] uploading job
2024-12-12 15:27:38 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal array into Go struct field response[gitlink.org.cn/cloudream/common/sdks/pcmscheduler.GetClusterInfoResp].data of type sch.GetClusterInfoResp
2024-12-12 15:27:43 [INFO] start serving http at: :7891
2024-12-12 15:27:53 [DEBU] uploading job
2024-12-12 15:31:21 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-12 15:31:27 [INFO] start serving http at: :7891
2024-12-12 15:31:35 [DEBU] uploading job
2024-12-12 15:31:52 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-12 15:31:55 [DEBU] uploading job
2024-12-12 15:39:59 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-12 15:42:02 [INFO] start serving http at: :7891
2024-12-12 15:42:14 [DEBU] uploading job
2024-12-12 15:43:39 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal number into Go struct field TmpResourceData.data.resources.total of type sch.UnitValue[int64]
2024-12-12 15:43:45 [INFO] start serving http at: :7891
2024-12-12 15:43:57 [DEBU] uploading job
2024-12-12 15:44:42 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal number 362398603018.24 into Go struct field TmpResourceData.data.resources.available of type int64
2024-12-12 15:44:48 [INFO] start serving http at: :7891
2024-12-12 15:44:57 [DEBU] uploading job
2024-12-13 09:28:04 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-13 09:28:50 [INFO] start serving http at: :7891
2024-12-13 09:29:15 [DEBU] uploading job
2024-12-13 09:40:16 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 09:40:22 [INFO] start serving http at: :7891
2024-12-13 09:40:27 [DEBU] uploading job
2024-12-13 09:43:16 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 09:43:22 [INFO] start serving http at: :7891
2024-12-13 09:43:26 [DEBU] uploading job
2024-12-13 09:49:21 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field UnitValue[float64].data.resources.total.value of type float64
2024-12-13 09:49:28 [INFO] start serving http at: :7891
2024-12-13 09:49:31 [DEBU] uploading job
2024-12-13 09:50:25 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal string into Go struct field UnitValue[float64].data.resources.total.value of type float64
2024-12-13 09:50:32 [INFO] start serving http at: :7891
2024-12-13 09:50:47 [DEBU] uploading job
2024-12-13 09:52:39 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal number 2886.65 into Go struct field UnitValue[int64].data.resources.total.value of type int64
2024-12-13 09:52:46 [INFO] start serving http at: :7891
2024-12-13 09:52:49 [DEBU] uploading job
2024-12-13 09:53:15 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-13 09:53:32 [INFO] start serving http at: :7891
2024-12-13 09:53:39 [DEBU] uploading job
2024-12-13 10:04:48 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 10:04:51 [INFO] start serving http at: :7891
2024-12-13 10:04:54 [DEBU] uploading job
2024-12-13 10:18:24 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: parsing response: json: cannot unmarshal object into Go struct field ClusterDetail.data.resources of type sch.ResourceData
2024-12-13 10:24:48 [INFO] start serving http at: :7891
2024-12-13 10:24:56 [DEBU] uploading job
2024-12-13 10:29:33 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-13 10:30:34 [INFO] start serving http at: :7891
2024-12-13 10:30:36 [DEBU] uploading job
2024-12-13 10:38:20 [ERRO] insert upload data fail: Error 1264 (22003): Out of range value for column 'ClusterID' at row 1
2024-12-13 10:38:20 [INFO] jobID: %s change state from %s to %s0&{0xc0001da420 test_image.png image [2]} &{0xc00022d320}
2024-12-13 10:38:20 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 10:38:20 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1264 (22003): Out of range value for column 'ClusterID' at row 1
2024-12-13 10:38:20 [INFO] job set 0 completed
2024-12-13 16:25:25 [DEBU] uploading job
2024-12-13 16:25:25 [WARN] cluster 0 not found
2024-12-13 16:25:25 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:25:31 [DEBU] uploading job
2024-12-13 16:25:31 [WARN] cluster 0 not found
2024-12-13 16:25:31 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:35:36 [DEBU] uploading job
2024-12-13 16:35:36 [WARN] cluster 0 not found
2024-12-13 16:35:36 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:36:09 [INFO] start serving http at: :7891
2024-12-13 16:36:13 [DEBU] uploading job
2024-12-13 16:36:13 [WARN] cluster 0 not found
2024-12-13 16:36:13 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:36:43 [DEBU] uploading job
2024-12-13 16:37:15 [WARN] cluster 0 not found
2024-12-13 16:37:26 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:37:45 [INFO] start serving http at: :7891
2024-12-13 16:37:52 [DEBU] uploading job
2024-12-13 16:37:52 [WARN] cluster 0 not found
2024-12-13 16:37:52 [WARN] [HTTP:JobSet.Upload] uploading file: no storage is available
2024-12-13 16:50:35 [INFO] start serving http at: :7891
2024-12-13 16:57:17 [DEBU] uploading job
2024-12-13 16:57:17 [ERRO] upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 16:57:17 [INFO] jobID: %s change state from %s to %s0&{0xc000124380 webgl_lines_dashed dataset [2]} &{0xc00007e700}
2024-12-13 16:57:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 16:57:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 16:57:17 [INFO] job set 0 completed
2024-12-13 16:58:20 [DEBU] uploading job
2024-12-13 17:05:42 [ERRO] upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 17:05:43 [INFO] jobID: %s change state from %s to %s1&{0xc000124740 webgl_lines_dashed dataset [2]} &{0xc0001da040}
2024-12-13 17:05:46 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 17:05:46 [INFO] [JobID:1] [LastState:*state2.DataUpload] job failed with: upload data: Post "jobSet/upload": unsupported protocol scheme ""
2024-12-13 17:05:46 [INFO] job set 1 completed
2024-12-13 17:07:37 [INFO] start serving http at: :7891
2024-12-13 17:07:47 [DEBU] uploading job
2024-12-13 17:07:47 [ERRO] upload data: code: 400, message:
2024-12-13 17:07:47 [INFO] jobID: %s change state from %s to %s0&{0xc000088740 webgl_lines_dashed dataset [2]} &{0xc00007e740}
2024-12-13 17:07:47 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-13 17:07:47 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: upload data: code: 400, message:
2024-12-13 17:07:47 [INFO] job set 0 completed
2024-12-13 17:33:55 [DEBU] uploading job
2024-12-13 17:33:55 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: get cluster info: code: 100002, message:
2024-12-16 09:26:34 [INFO] start serving http at: :7891
2024-12-16 09:26:51 [DEBU] uploading job
2024-12-16 09:26:52 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:47:49 [INFO] start serving http at: :7891
2024-12-17 09:48:40 [DEBU] uploading job
2024-12-17 09:52:27 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:52:30 [INFO] start serving http at: :7891
2024-12-17 09:52:41 [DEBU] uploading job
2024-12-17 09:52:43 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:53:03 [DEBU] uploading job
2024-12-17 09:55:15 [WARN] [HTTP:JobSet.Upload] uploading file: pre scheduling: no eligible clusters found
2024-12-17 09:55:17 [INFO] start serving http at: :7891
2024-12-17 09:55:21 [DEBU] uploading job
2024-12-17 09:57:53 [ERRO] insert upload data fail: Error 1364 (HY000): Field 'userID' doesn't have a default value
2024-12-17 09:57:53 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e920 yuque_mind.jpeg dataset [2] {1 0}} &{0xc00022d100}
2024-12-17 09:57:53 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 09:57:53 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1364 (HY000): Field 'userID' doesn't have a default value
2024-12-17 09:57:53 [INFO] job set 0 completed
2024-12-17 09:59:15 [INFO] start serving http at: :7891
2024-12-17 09:59:22 [DEBU] uploading job
2024-12-17 10:03:17 [ERRO] insert blockchains: empty slice found
2024-12-17 10:03:17 [INFO] jobID: %s change state from %s to %s0&{1 0xc00045f740 yuque_mind.jpeg dataset [1] {1 0}} &{0xc00022c1a0}
2024-12-17 10:03:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:03:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: empty slice found
2024-12-17 10:03:17 [INFO] job set 0 completed
2024-12-17 10:03:21 [INFO] start serving http at: :7891
2024-12-17 10:03:36 [DEBU] uploading job
2024-12-17 10:05:04 [ERRO] insert blockchains: empty slice found
2024-12-17 10:05:04 [INFO] jobID: %s change state from %s to %s0&{1 0xc0003513a0 yuque_mind.jpeg dataset [2] {1 0}} &{0xc000351b80}
2024-12-17 10:05:04 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:05:04 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: empty slice found
2024-12-17 10:05:04 [INFO] job set 0 completed
2024-12-17 10:05:18 [INFO] start serving http at: :7891
2024-12-17 10:05:24 [DEBU] uploading job
2024-12-17 10:09:36 [ERRO] insert blockchains: empty slice found
2024-12-17 10:09:36 [INFO] jobID: %s change state from %s to %s0&{1 0xc00019d920 yuque_mind.jpeg dataset [1] {1 0}} &{0xc00027a0c0}
2024-12-17 10:09:36 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:09:36 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: empty slice found
2024-12-17 10:09:36 [INFO] job set 0 completed
2024-12-17 10:09:40 [INFO] start serving http at: :7891
2024-12-17 10:09:50 [DEBU] uploading job
2024-12-17 10:12:11 [INFO] start serving http at: :7891
2024-12-17 10:12:14 [DEBU] uploading job
2024-12-17 10:21:56 [ERRO] blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:21:56 [INFO] jobID: %s change state from %s to %s0&{1 0xc00088ab80 yuque_mind.jpeg dataset [2] {1 0}} &{0xc000138020}
2024-12-17 10:21:56 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:21:56 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:21:56 [INFO] job set 0 completed
2024-12-17 10:22:03 [INFO] start serving http at: :7891
2024-12-17 10:22:12 [DEBU] uploading job
2024-12-17 10:24:36 [ERRO] blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:24:36 [INFO] jobID: %s change state from %s to %s0&{1 0xc0004612e0 yuque_mind.jpeg dataset [2] {1 0}} &{0xc00022c020}
2024-12-17 10:24:36 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:24:36 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:24:36 [INFO] job set 0 completed
2024-12-17 10:24:38 [INFO] start serving http at: :7891
2024-12-17 10:24:44 [DEBU] uploading job
2024-12-17 10:26:17 [ERRO] blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:26:17 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001da460 yuque_mind.jpeg dataset [2] {1 0}} &{0xc000120020}
2024-12-17 10:26:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:26:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: blockchain: invoke blockchain: unknow response content type: text/html
2024-12-17 10:26:17 [INFO] job set 0 completed
2024-12-17 10:26:40 [INFO] start serving http at: :7891
2024-12-17 10:26:49 [DEBU] uploading job
2024-12-17 10:29:39 [ERRO] insert blockchains: Error 1054 (42S22): Unknown column 'data_id' in 'field list'
2024-12-17 10:29:39 [INFO] jobID: %s change state from %s to %s0&{1 0xc0003242c0 yuque_mind.jpeg dataset [1] {1 0}} &{0xc000324520}
2024-12-17 10:29:39 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:29:39 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert blockchains: Error 1054 (42S22): Unknown column 'data_id' in 'field list'
2024-12-17 10:29:39 [INFO] job set 0 completed
2024-12-17 10:29:43 [INFO] start serving http at: :7891
2024-12-17 10:30:02 [DEBU] uploading job
2024-12-17 10:30:28 [INFO] jobID: %s change state from %s to %s0&{1 0xc000128480 yuque_mind.jpeg dataset [1] {1 0}} &{<nil>}
2024-12-17 10:30:28 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-17 10:30:28 [INFO] [JobID:0] job completed successfuly
2024-12-17 10:30:28 [INFO] job set 0 completed
2024-12-17 10:37:45 [INFO] start serving http at: :7891
2024-12-17 10:38:51 [WARN] [HTTP:JobSet.GetServiceList] parsing request body: http.QueryUploadedReq.ClusterIDs: []schsdk.ClusterID: ReadString: expects " or n, but found 1, error found in #10 byte of ...|erIDs": [1, 5]
}|..., bigger context ...|"dataset",
"userID": 1,
"clusterIDs": [1, 5]
}|...
2024-12-17 10:42:06 [INFO] start serving http at: :7891
2024-12-17 10:42:26 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
2024-12-17 10:42:31 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
2024-12-17 10:46:45 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
2024-12-17 10:46:48 [INFO] start serving http at: :7891
2024-12-17 10:47:23 [INFO] start serving http at: :7891
2024-12-17 10:48:10 [INFO] start serving http at: :7891
2024-12-17 10:48:58 [INFO] start serving http at: :7891
2024-12-17 10:50:11 [INFO] start serving http at: :7891
2024-12-17 10:54:10 [INFO] start serving http at: :7891
2024-12-17 10:54:33 [INFO] start serving http at: :7891
2024-12-17 10:54:44 [INFO] start serving http at: :7891
2024-12-17 10:55:22 [INFO] start serving http at: :7891
2024-12-17 10:55:27 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
2024-12-17 10:57:12 [INFO] start serving http at: :7891
2024-12-17 10:57:17 [WARN] [HTTP:JobSet.GetServiceList] getting service list: Error 1146 (42S02): Table 'scheduler.block_chains' doesn't exist
2024-12-17 10:58:50 [INFO] start serving http at: :7891
2024-12-19 11:06:46 [INFO] start serving http at: :7891
2024-12-19 15:38:10 [INFO] start serving http at: :7891
2024-12-24 09:49:07 [INFO] start serving http at: :7891
2024-12-24 09:56:04 [DEBU] uploading job
2024-12-24 09:57:17 [ERRO] insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
2024-12-24 09:57:17 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007ea60 yuque_mind.jpeg dataset [3] {1 0}} &{0xc00007eba0}
2024-12-24 09:57:17 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 09:57:17 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
2024-12-24 09:57:17 [INFO] job set 0 completed
2024-12-24 09:59:07 [INFO] start serving http at: :7891
2024-12-24 09:59:14 [DEBU] uploading job
2024-12-24 10:04:42 [ERRO] insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
2024-12-24 10:04:42 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e8a0 yuque_mind.jpeg dataset [2] {1 0}} &{0xc0004615c0}
2024-12-24 10:04:42 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:04:42 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`folders`, CONSTRAINT `folders_ibfk_1` FOREIGN KEY (`parentID`) REFERENCES `folders` (`id`))
2024-12-24 10:04:42 [INFO] job set 0 completed
2024-12-24 10:04:44 [INFO] start serving http at: :7891
2024-12-24 10:06:04 [DEBU] uploading job
2024-12-24 10:09:21 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:09:21 [INFO] jobID: %s change state from %s to %s0&{1 0xc0004603a0 yuque_mind.jpeg dataset [3] {1 0}} &{0xc00007eb40}
2024-12-24 10:09:21 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:09:21 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:09:21 [INFO] job set 0 completed
2024-12-24 10:10:08 [INFO] start serving http at: :7891
2024-12-24 10:11:28 [DEBU] uploading job
2024-12-24 10:12:31 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:12:31 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001fa440 yuque_mind.jpeg dataset [3] {1 0}} &{0xc0001fa000}
2024-12-24 10:12:31 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:12:31 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '0000-00-00' for column 'createTime' at row 1
2024-12-24 10:12:31 [INFO] job set 0 completed
2024-12-24 10:12:35 [INFO] start serving http at: :7891
2024-12-24 10:12:41 [DEBU] uploading job
2024-12-24 10:14:25 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:14:25 [INFO] jobID: %s change state from %s to %s0&{1 0xc0003534a0 yuque_mind.jpeg dataset [3] {1 0}} &{0xc000120a60}
2024-12-24 10:14:25 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:14:25 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:14:25 [INFO] job set 0 completed
2024-12-24 10:14:27 [INFO] start serving http at: :7891
2024-12-24 10:14:39 [DEBU] uploading job
2024-12-24 10:17:21 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:17:21 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001fa480 yuque_mind.jpeg dataset [3] {1 0}} &{0xc000248180}
2024-12-24 10:17:21 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:17:21 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:17:21 [INFO] job set 0 completed
2024-12-24 10:17:26 [INFO] start serving http at: :7891
2024-12-24 10:17:32 [DEBU] uploading job
2024-12-24 10:25:30 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:25:30 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001fa480 yuque_mind.jpeg dataset [3] {1 0}} &{0xc000249160}
2024-12-24 10:25:30 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:25:30 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:25:30 [INFO] job set 0 completed
2024-12-24 10:25:33 [INFO] start serving http at: :7891
2024-12-24 10:25:40 [DEBU] uploading job
2024-12-24 10:29:18 [ERRO] insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:29:18 [INFO] jobID: %s change state from %s to %s0&{1 0xc00019c800 dataset [3] {1 0}} &{0xc00019c3a0}
2024-12-24 10:29:18 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:29:18 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1292 (22007): Incorrect datetime value: '' for column 'createTime' at row 1
2024-12-24 10:29:18 [INFO] job set 0 completed
2024-12-24 10:29:50 [INFO] start serving http at: :7891
2024-12-24 10:34:07 [DEBU] uploading job
2024-12-24 10:40:14 [ERRO] insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`uploaddata`, CONSTRAINT `uploaddata_ibfk_1` FOREIGN KEY (`folderID`) REFERENCES `folders` (`id`))
2024-12-24 10:40:14 [INFO] jobID: %s change state from %s to %s0&{1 0xc00050e200 dataset [3] {1 0}} &{0xc000352260}
2024-12-24 10:40:14 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:40:14 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1452 (23000): Cannot add or update a child row: a foreign key constraint fails (`scheduler`.`uploaddata`, CONSTRAINT `uploaddata_ibfk_1` FOREIGN KEY (`folderID`) REFERENCES `folders` (`id`))
2024-12-24 10:40:14 [INFO] job set 0 completed
2024-12-24 10:40:17 [INFO] start serving http at: :7891
2024-12-24 10:40:48 [INFO] start serving http at: :7891
2024-12-24 10:40:55 [DEBU] uploading job
2024-12-24 10:43:04 [INFO] jobID: %s change state from %s to %s0&{1 0xc0001da380 dataset [3] {1 0}} &{<nil>}
2024-12-24 10:43:04 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:43:04 [INFO] [JobID:0] job completed successfuly
2024-12-24 10:43:04 [INFO] job set 0 completed
2024-12-24 10:44:57 [DEBU] uploading job
2024-12-24 10:49:43 [INFO] start serving http at: :7891
2024-12-24 10:49:51 [DEBU] uploading job
2024-12-24 10:50:48 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e920 dataset [3] {1 0}} &{<nil>}
2024-12-24 10:50:48 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:50:48 [INFO] [JobID:0] job completed successfuly
2024-12-24 10:50:48 [INFO] job set 0 completed
2024-12-24 10:52:43 [DEBU] uploading job
2024-12-24 10:52:50 [DEBU] uploading job
2024-12-24 10:52:53 [DEBU] uploading job
2024-12-24 10:53:05 [INFO] jobID: %s change state from %s to %s1&{1 0xc00007e720 dataset [3] {1 0}} &{<nil>}
2024-12-24 10:53:05 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 10:53:05 [INFO] [JobID:1] job completed successfuly
2024-12-24 10:53:05 [INFO] job set 1 completed
2024-12-24 11:00:58 [INFO] start serving http at: :7891
2024-12-24 11:01:03 [DEBU] uploading job
2024-12-24 11:01:06 [DEBU] uploading job
2024-12-24 11:01:10 [DEBU] uploading job
2024-12-24 11:01:32 [INFO] jobID: %s change state from %s to %s0&{1 0xc00022d220 dataset [3] {1 0}} &{<nil>}
2024-12-24 11:01:32 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 11:01:32 [INFO] [JobID:0] job completed successfuly
2024-12-24 11:01:32 [INFO] job set 0 completed
2024-12-24 15:37:05 [INFO] start serving http at: :7891
2024-12-24 15:37:36 [DEBU] uploading job
2024-12-24 15:37:42 [DEBU] uploading job
2024-12-24 15:37:49 [DEBU] uploading job
2024-12-24 15:38:00 [INFO] jobID: %s change state from %s to %s0&{1 0xc000531d80 dataset [3] {1 0}} &{<nil>}
2024-12-24 15:38:00 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:00 [INFO] [JobID:0] job completed successfuly
2024-12-24 15:38:00 [INFO] job set 0 completed
2024-12-24 15:38:19 [INFO] jobID: %s change state from %s to %s1&{1 0xc0001da0a0 dataset [3] {1 0}} &{<nil>}
2024-12-24 15:38:19 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:19 [INFO] [JobID:1] job completed successfuly
2024-12-24 15:38:19 [INFO] job set 1 completed
2024-12-24 15:38:23 [INFO] jobID: %s change state from %s to %s2&{1 0xc00070fa80 dataset [3] {1 0}} &{<nil>}
2024-12-24 15:38:23 [INFO] [JobID:2] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:23 [INFO] [JobID:2] job completed successfuly
2024-12-24 15:38:23 [INFO] job set 2 completed
2024-12-24 15:38:45 [DEBU] uploading job
2024-12-24 15:38:53 [ERRO] insert upload data fail: data already exists
2024-12-24 15:38:53 [INFO] jobID: %s change state from %s to %s3&{1 0xc000460580 dataset [3] {1 0}} &{0xc00007ed00}
2024-12-24 15:38:53 [INFO] [JobID:3] state changed: *state2.DataUpload -> *state.Completed
2024-12-24 15:38:53 [INFO] [JobID:3] [LastState:*state2.DataUpload] job failed with: insert upload data fail: data already exists
2024-12-24 15:38:53 [INFO] job set 3 completed
2024-12-24 15:48:42 [INFO] start serving http at: :7891
2024-12-24 16:06:49 [INFO] start serving http at: :7891
2024-12-24 16:07:38 [INFO] start serving http at: :7891
2024-12-24 16:09:14 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: Error 1054 (42S22): Unknown column 'queryType' in 'field list'
2024-12-24 16:09:17 [INFO] start serving http at: :7891
2024-12-24 16:13:11 [INFO] start serving http at: :7891
2024-12-24 16:13:16 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:15:11 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:15:14 [INFO] start serving http at: :7891
2024-12-24 16:15:25 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:22:14 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1; sql: expected 10 destination arguments in Scan, not 1
2024-12-24 16:22:18 [INFO] start serving http at: :7891
2024-12-24 16:28:26 [INFO] start serving http at: :7891
2024-12-24 17:09:10 [INFO] start serving http at: :7891
2024-12-25 17:34:34 [INFO] start serving http at: :7891
2024-12-25 17:35:57 [WARN] [HTTP:JobSet.CreateFolder] creating folder: folder already exists
2024-12-25 17:36:22 [INFO] start serving http at: :7891
2024-12-25 17:36:26 [WARN] [HTTP:JobSet.CreateFolder] creating folder: folder already exists
2024-12-26 16:52:34 [INFO] start serving http at: :7891
2024-12-26 16:53:03 [DEBU] uploading job
2024-12-26 16:53:23 [ERRO] insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:53:23 [INFO] jobID: %s change state from %s to %s0&{1 0xc00080b680 dataset [3] {1 0}} &{0xc0001da020}
2024-12-26 16:53:23 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-26 16:53:23 [INFO] [JobID:0] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:53:23 [INFO] job set 0 completed
2024-12-26 16:54:36 [DEBU] uploading job
2024-12-26 16:59:19 [ERRO] insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:59:19 [INFO] jobID: %s change state from %s to %s1&{1 0xc00007e680 dataset [3] {1 0}} &{0xc00080a020}
2024-12-26 16:59:19 [INFO] [JobID:1] state changed: *state2.DataUpload -> *state.Completed
2024-12-26 16:59:19 [INFO] [JobID:1] [LastState:*state2.DataUpload] job failed with: insert upload data fail: Error 1054 (42S22): Unknown column 'fileType' in 'field list'
2024-12-26 16:59:19 [INFO] job set 1 completed
2024-12-26 16:59:22 [INFO] start serving http at: :7891
2024-12-26 16:59:31 [DEBU] uploading job
2024-12-26 17:00:20 [INFO] jobID: %s change state from %s to %s0&{1 0xc00007e800 dataset [3] {1 0}} &{<nil>}
2024-12-26 17:00:20 [INFO] [JobID:0] state changed: *state2.DataUpload -> *state.Completed
2024-12-26 17:00:20 [INFO] [JobID:0] job completed successfuly
2024-12-26 17:00:20 [INFO] job set 0 completed
2024-12-26 17:02:04 [INFO] start serving http at: :7891
2024-12-30 11:06:44 [INFO] start serving http at: :7891
2024-12-30 11:12:35 [INFO] start serving http at: :7891
2024-12-30 11:14:53 [INFO] start serving http at: :7891
2024-12-30 11:19:30 [INFO] start serving http at: :7891
2024-12-30 11:21:17 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://120.46.183.86:32010/object/list?isPrefix=true&packageID=1&path=objects&userID=1": dial tcp 120.46.183.86:32010: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-30 11:27:07 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://120.46.183.86:32010/object/list?isPrefix=true&packageID=1&path=objects&userID=1": dial tcp 120.46.183.86:32010: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.
2024-12-30 11:27:09 [INFO] start serving http at: :7891
2024-12-30 11:28:58 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://localhost:32010/object/list?isPrefix=true&packageID=1&path=objects&userID=1": read tcp [::1]:19147->[::1]:32010: wsarecv: An existing connection was forcibly closed by the remote host.
2024-12-30 11:36:50 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: code: OperationFailed, message: listing objects: requsting to coodinator: code: OperationFailed, message: get objects with prefix failed
2024-12-30 11:39:18 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Error 1054 (42S22): Unknown column 'package_id' in 'where clause'
2024-12-30 14:37:12 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Error 1054 (42S22): Unknown column 'package_id' in 'where clause'
2024-12-30 14:37:15 [INFO] start serving http at: :7891
2024-12-30 14:37:43 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: Get "http://localhost:32010/object/list?isPrefix=true&packageID=916&path=objects&userID=1": dial tcp [::1]:32010: connectex: No connection could be made because the target machine actively refused it.
2024-12-30 14:39:50 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: sql: expected 1 arguments, got 2
2024-12-30 14:39:53 [INFO] start serving http at: :7891
2024-12-30 14:40:40 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: sql: expected 1 arguments, got 2
2024-12-30 14:40:43 [INFO] start serving http at: :7891
2024-12-30 14:42:13 [INFO] start serving http at: :7891
2024-12-30 14:44:12 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:44:16 [INFO] start serving http at: :7891
2024-12-30 14:45:40 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:45:42 [INFO] start serving http at: :7891
2024-12-30 14:46:35 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:46:39 [INFO] start serving http at: :7891
2024-12-30 14:47:24 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:47:26 [INFO] start serving http at: :7891
2024-12-30 14:52:55 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:52:58 [INFO] start serving http at: :7891
2024-12-30 14:54:02 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:54:04 [INFO] start serving http at: :7891
2024-12-30 14:54:46 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:54:48 [INFO] start serving http at: :7891
2024-12-30 14:56:02 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:56:05 [INFO] start serving http at: :7891
2024-12-30 14:58:39 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:58:42 [INFO] start serving http at: :7891
2024-12-30 14:59:05 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 14:59:08 [INFO] start serving http at: :7891
2024-12-30 15:01:28 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:01:31 [INFO] start serving http at: :7891
2024-12-30 15:07:59 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:08:03 [INFO] start serving http at: :7891
2024-12-30 15:09:46 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:09:48 [INFO] start serving http at: :7891
2024-12-30 15:10:54 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:15:07 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:15:10 [INFO] start serving http at: :7891
2024-12-30 15:18:20 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:18:24 [INFO] start serving http at: :7891
2024-12-30 15:18:41 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:18:43 [INFO] start serving http at: :7891
2024-12-30 15:18:51 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:20:34 [INFO] start serving http at: :7891
2024-12-30 15:20:48 [INFO] start serving http at: :7891
2024-12-30 15:24:58 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:25:01 [INFO] start serving http at: :7891
2024-12-30 15:27:06 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:27:29 [INFO] start serving http at: :7891
2024-12-30 15:28:34 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:29:00 [INFO] start serving http at: :7891
2024-12-30 15:29:54 [INFO] start serving http at: :7891
2024-12-30 15:30:02 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:30:43 [INFO] start serving http at: :7891
2024-12-30 15:30:58 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: uploadedCluster: unsupported relations for schema Package2
2024-12-30 15:31:01 [INFO] start serving http at: :7891
2024-12-30 15:32:36 [INFO] start serving http at: :7891
2024-12-30 15:37:24 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:37:27 [INFO] start serving http at: :7891
2024-12-30 15:38:49 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:38:52 [INFO] start serving http at: :7891
2024-12-30 15:40:15 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:40:18 [INFO] start serving http at: :7891
2024-12-30 15:51:15 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:51:24 [INFO] start serving http at: :7891
2024-12-30 15:57:38 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: invalid serializer type union
2024-12-30 15:58:12 [INFO] start serving http at: :7891
2024-12-30 16:18:58 [INFO] start serving http at: :7891
2024-12-30 16:44:22 [INFO] start serving http at: :7891
2024-12-30 16:46:14 [WARN] [HTTP:JobSet.CreateFolder] creating folder: invalid serializer type union
2024-12-30 16:47:07 [INFO] start serving http at: :7891
2024-12-30 16:47:34 [INFO] start serving http at: :7891
2024-12-30 16:47:37 [WARN] [HTTP:JobSet.CreateFolder] creating folder: failed to create package: code: DataExists, message: package already exists
2024-12-30 16:47:46 [WARN] [HTTP:JobSet.CreateFolder] creating folder: invalid serializer type union
2024-12-30 16:49:00 [INFO] start serving http at: :7891
2024-12-30 16:50:56 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: invalid serializer type union
2024-12-30 16:53:38 [WARN] [HTTP:JobSet.QueryUploaded] getting service list: failed to query uploaded data: invalid serializer type union
2024-12-30 16:53:48 [INFO] start serving http at: :7891
2024-12-30 16:54:49 [INFO] start serving http at: :7891
2024-12-30 17:00:28 [WARN] [HTTP:JobSet.CreateFolder] creating folder: failed to create package: code: DataExists, message: package already exists
2024-12-30 17:01:06 [WARN] [HTTP:JobSet.CreateFolder] creating folder: failed to create package: code: DataExists, message: package already exists
2024-12-30 17:02:00 [INFO] start serving http at: :7891
2024-12-30 17:03:55 [WARN] [HTTP:JobSet.CreateFolder] parsing request body: http.PackageDelete.ReadString: expects " or n, but found }, error found in #10 byte of ...|: 1042,
}|..., bigger context ...|{
"userID": 1,
"packageID": 1042,
}|...
2024-12-30 17:04:00 [WARN] [HTTP:JobSet.CreateFolder] creating folder: invalid serializer type union
2024-12-30 17:04:35 [INFO] start serving http at: :7891
2024-12-30 17:13:24 [INFO] start serving http at: :7891
2024-12-30 17:14:58 [INFO] start serving http at: :7891
2024-12-30 17:21:17 [INFO] start serving http at: :7891
2024-12-30 17:22:16 [INFO] start serving http at: :7891
2024-12-30 17:26:49 [INFO] start serving http at: :7891
2024-12-30 17:29:04 [INFO] start serving http at: :7891
2024-12-30 17:30:27 [INFO] start serving http at: :7891
2024-12-30 17:34:22 [INFO] start serving http at: :7891
2024-12-30 17:35:43 [INFO] start serving http at: :7891
2024-12-31 09:00:26 [INFO] start serving http at: :7891
2024-12-31 09:00:33 [WARN] [HTTP:JobSet.DeleteFolder] creating folder: failed to delete object: Get "http://localhost:32010/object/list?isPrefix=true&packageID=1045&path=%2Fpath%2Ftest%2F3&userID=1": dial tcp [::1]:32010: connectex: No connection could be made because the target machine actively refused it.
2024-12-31 09:01:11 [WARN] [HTTP:JobSet.DeleteFolder] creating folder: failed to delete object: Get "http://localhost:32010/object/list?isPrefix=true&packageID=1045&path=%2Fpath%2Ftest%2F3&userID=1": dial tcp [::1]:32010: connectex: No connection could be made because the target machine actively refused it.
2024-12-31 09:03:02 [WARN] [HTTP:JobSet.DeleteFolder] creating folder: failed to delete object: code: BadArgument, message: missing argument or invalid argument
2024-12-31 09:03:11 [INFO] start serving http at: :7891
2024-12-31 09:05:52 [DEBU] uploading job
2024-12-31 09:08:16 [INFO] start serving http at: :7891
2024-12-31 09:08:32 [DEBU] uploading job

View File

@ -64,6 +64,9 @@ func Bin() error {
if err := Manager(); err != nil {
return err
}
if err := ScheduleMiddleware(); err != nil {
return err
}
return nil
}
@ -95,7 +98,7 @@ func Confs() error {
confDir := "./common/assets/confs"
info, err := os.Stat(confDir)
if errors.Is(err, os.ErrNotExist) {
if errors.Is(err, os.ErrNotExist) {
fmt.Printf("no confs.\n")
return nil
}
@ -158,3 +161,12 @@ func Manager() error {
EntryFile: "manager/main.go",
})
}
// ScheduleMiddleware builds the schedulerMiddleware binary, bundling its
// assets directory next to the produced executable.
func ScheduleMiddleware() error {
	args := magefiles.BuildArgs{
		OutputName: "schedulerMiddleware",
		OutputDir:  "schedulerMiddleware",
		AssetsDir:  "assets",
		EntryFile:  "schedulerMiddleware/main.go",
	}
	return magefiles.Build(args)
}

View File

@ -44,7 +44,7 @@ type InstanceCreateInfo struct {
}
type InstanceUpdateInfo struct {
serder.Metadata `union:"Update"`
serder.Metadata `union:"UpdatePackage"`
InstanceInfoBase
Type string `json:"type"`
Info schsdk.UpdateMultiInstanceJobInfo `json:"info"`

View File

@ -58,7 +58,7 @@ func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
cancel()
}()
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
if err != nil {
return fmt.Errorf("getting computing center info: %w", err)
}
@ -227,7 +227,7 @@ func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRu
// }
// // TODO 镜像名称
// err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
// err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.DefCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
// if err != nil {
// return fmt.Errorf("creating image info: %w", err)
// }

View File

@ -6,7 +6,6 @@ import (
"path/filepath"
"time"
"github.com/samber/lo"
"gitlink.org.cn/cloudream/common/pkgs/future"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
@ -60,11 +59,11 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
case *job.NormalJob:
switch runningJob.SubType {
case schsdk.JobTypeNormal: // 普通任务
pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.SQLCtx(), runningJob.Files.Image.ImageID, runningJob.TargetCCID)
pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.DefCtx(), runningJob.Files.Image.ImageID, runningJob.TargetCCID)
if err != nil {
return fmt.Errorf("getting pcm image info: %w", err)
}
ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.SQLCtx(), runningJob.TargetCCID)
ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.DefCtx(), runningJob.TargetCCID)
if err != nil {
return fmt.Errorf("getting computing center resource: %w", err)
}
@ -72,12 +71,12 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
return fmt.Errorf("no resource found at computing center %v", runningJob.TargetCCID)
}
ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
ccInfo, _, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
if err != nil {
return fmt.Errorf("getting storage info: %w", err)
}
dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
err = s.submitNormalTask(rtx, cmd, envs, *ccInfo, pcmImgInfo, ress[0].PCMResourceID)
if err != nil {
logger.Error(err.Error())
@ -89,7 +88,7 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
return fmt.Errorf("getting storage info: %w", err)
}
dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
instID, err := s.submitDataPreprocessTask(rtx, cmd, envs, *ccInfo, getStg.StorageID, userID)
if err != nil {
logger.Error(err.Error())
@ -110,7 +109,7 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
return fmt.Errorf("loading dataset package: %w", err)
}
}
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
err = s.submitFinetuningTask(userID, rtx, cmd, envs, *ccInfo, getStg.StorageID, runningJob)
if err != nil {
logger.Error(err.Error())
@ -124,7 +123,7 @@ func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
return fmt.Errorf("getting storage info: %w", err)
}
dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
_, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, getStg.RemoteBase, *ccInfo)
_, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
err = s.submitInstanceTask(rtx, jo, runningJob, *ccInfo, getStg.StorageID, userID, envs)
if err != nil {
logger.Error(err.Error())
@ -203,7 +202,7 @@ func (s *JobExecuting) submitNormalTask(rtx jobmgr.JobStateRunContext, cmd strin
}
func (s *JobExecuting) submitDataPreprocessTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, userID cdssdk.UserID) (string, error) {
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID)
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
if err != nil {
logger.Error(err.Error())
return "", fmt.Errorf("getting object storage info: %w", err)
@ -373,14 +372,14 @@ func (s *JobExecuting) submitInstanceTask(rtx jobmgr.JobStateRunContext, jo *job
}
func getModelInfoAndObjectStorage(rtx jobmgr.JobStateRunContext, modelID schsdk.ModelID, storageID cdssdk.StorageID) (*schmod.ObjectStorage, *schmod.ModelResource, error) {
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.SQLCtx(), storageID)
objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
if err != nil {
logger.Error(err.Error())
return nil, nil, fmt.Errorf("getting object storage info: %w", err)
}
// 先从数据库中查询是否已经预置了模型
modelInfo, err := rtx.Mgr.DB.Models().GetModelByID(rtx.Mgr.DB.SQLCtx(), modelID, objectStorage.ID)
modelInfo, err := rtx.Mgr.DB.Models().GetModelByID(rtx.Mgr.DB.DefCtx(), modelID, objectStorage.ID)
if &modelInfo == nil {
logger.Error(err.Error())
return nil, nil, fmt.Errorf("the model is not exists: %w", err)
@ -405,31 +404,31 @@ func postDeleteInstanceEvent(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runn
// 判断算力中心是否支持环境变量配置如果不支持则读取脚本内容并拼接在Command参数后面
func getRuntimeCommand(runtime schsdk.JobRuntimeInfo, dataSetPath string, outputPath string, remoteBase string, ccInfo schmod.ComputingCenter) (string, []schsdk.KVPair) {
var envs []schsdk.KVPair
var params []string
//var params []string
var cmd string
envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataInEnv, Value: filepath.Join(remoteBase, dataSetPath)})
envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataOutEnv, Value: filepath.Join(remoteBase, outputPath)})
envs = append(envs, runtime.Envs...)
switch boot := ccInfo.Bootstrap.(type) {
case *schsdk.DirectBootstrap:
cmd = runtime.Command
case *schsdk.NoEnvBootstrap:
cmd = boot.ScriptFileName
params = append(params, runtime.Command)
envMap := lo.Map(envs, func(env schsdk.KVPair, _ int) string {
return fmt.Sprintf("%s=%s", env.Key, env.Value)
})
params = append(params, envMap...)
default:
cmd = runtime.Command
}
//switch boot := ccInfo.Bootstrap.(type) {
//case *schsdk.DirectBootstrap:
// cmd = runtime.Command
//case *schsdk.NoEnvBootstrap:
// cmd = boot.ScriptFileName
// params = append(params, runtime.Command)
// envMap := lo.Map(envs, func(env schsdk.KVPair, _ int) string {
// return fmt.Sprintf("%s=%s", env.Key, env.Value)
// })
// params = append(params, envMap...)
//default:
// cmd = runtime.Command
//}
return cmd, envs
}
func getCCInfoAndStgInfo(rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, userID cdssdk.UserID) (*schmod.ComputingCenter, *cdsapi.StorageGetResp, error) {
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), targetCCID)
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), targetCCID)
if err != nil {
return nil, nil, fmt.Errorf("getting computing center info: %w", err)
}
@ -485,7 +484,7 @@ func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Jo
cancel()
}()
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), reJob.TargetJobCCID)
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), reJob.TargetJobCCID)
if err != nil {
return fmt.Errorf("getting computing center info: %w", err)
}

View File

@ -73,7 +73,7 @@ func (s *MultiInstanceUpdate) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job)
}
defer schglb.CloudreamStoragePool.Release(stgCli)
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), dtrJob.TargetJobCCID)
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), dtrJob.TargetJobCCID)
if err != nil {
return fmt.Errorf("getting computing center info: %w", err)
}

View File

@ -10,8 +10,6 @@ import (
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
@ -61,7 +59,7 @@ func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
cancel()
}()
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.SQLCtx(), s.scheme.TargetCCID)
ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
if err != nil {
rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
return
@ -203,7 +201,7 @@ func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobSta
// 上传完毕,则可以新建一个空的镜像的记录
// TODO 镜像名称
imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.SQLCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.DefCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%s", time.Now().Unix()), time.Now())
if err != nil {
return fmt.Errorf("creating image info: %w", err)
}
@ -213,7 +211,7 @@ func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobSta
file.PackageID = &evt.PackageID
case *schsdk.ImageJobFileInfo:
imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.SQLCtx(), info.ImageID)
imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.DefCtx(), info.ImageID)
if err != nil {
return fmt.Errorf("getting image info: %w", err)
}
@ -226,64 +224,64 @@ func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobSta
// TODO 需要重新设计镜像导入流程
return fmt.Errorf("not implemented")
if file.PackageID == nil {
return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
}
// TODO UserID
taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
if err != nil {
return fmt.Errorf("moving package: %w", err)
}
fut := taskStatus.Receive()
status := <-fut.Chan()
moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
if moveStatus.Error != "" {
return fmt.Errorf("moving package: %s", moveStatus.Error)
}
stgCli, err := schglb.CloudreamStoragePool.Acquire()
if err != nil {
return fmt.Errorf("new cloudream storage client: %w", err)
}
defer schglb.CloudreamStoragePool.Release(stgCli)
// TODO UserID
pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
if err != nil {
return fmt.Errorf("getting package objects: %w", err)
}
if len(pkgObjs.Objects) == 0 {
return fmt.Errorf("no object in the package which will be imported")
}
if len(pkgObjs.Objects) > 1 {
return fmt.Errorf("there must be only 1 object in the package which will be imported")
}
taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
if err != nil {
return fmt.Errorf("moving package: %w", err)
}
fut2 := taskStatus2.Receive()
status2 := <-fut2.Chan()
uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
if uploadStatus.Error != "" {
return fmt.Errorf("uploading image: %s", uploadStatus.Error)
}
// TODO 镜像名称
err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
if err != nil {
return fmt.Errorf("creating image info: %w", err)
}
return nil
//if file.PackageID == nil {
// return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
//}
//
//// TODO UserID
//taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
//if err != nil {
// return fmt.Errorf("moving package: %w", err)
//}
//
//fut := taskStatus.Receive()
//status := <-fut.Chan()
//
//moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
//if moveStatus.Error != "" {
// return fmt.Errorf("moving package: %s", moveStatus.Error)
//}
//
//stgCli, err := schglb.CloudreamStoragePool.Acquire()
//if err != nil {
// return fmt.Errorf("new cloudream storage client: %w", err)
//}
//defer schglb.CloudreamStoragePool.Release(stgCli)
//
//// TODO UserID
//pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
//if err != nil {
// return fmt.Errorf("getting package objects: %w", err)
//}
//
//if len(pkgObjs.Objects) == 0 {
// return fmt.Errorf("no object in the package which will be imported")
//}
//
//if len(pkgObjs.Objects) > 1 {
// return fmt.Errorf("there must be only 1 object in the package which will be imported")
//}
//
//taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
//if err != nil {
// return fmt.Errorf("moving package: %w", err)
//}
//
//fut2 := taskStatus2.Receive()
//status2 := <-fut2.Chan()
//
//uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
//if uploadStatus.Error != "" {
// return fmt.Errorf("uploading image: %s", uploadStatus.Error)
//}
//
//// TODO 镜像名称
//err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.DefCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
//if err != nil {
// return fmt.Errorf("creating image info: %w", err)
//}
//
//return nil
}
return nil

View File

@ -8,7 +8,7 @@ import (
)
func (svc *Service) GetAllComputingCenter(msg *mgrmq.GetAllComputingCenter) (*mgrmq.GetAllComputingCenterResp, *mq.CodeMessage) {
ccs, err := svc.db.ComputingCenter().GetAll(svc.db.SQLCtx())
ccs, err := svc.db.ComputingCenter().GetAll(svc.db.DefCtx())
if err != nil {
logger.Warnf("getting all computing center: %s", err.Error())
return nil, mq.Failed(errorcode.OperationFailed, "get all computing center failed")

View File

@ -8,13 +8,13 @@ import (
)
func (svc *Service) GetImageInfo(msg *mgrmq.GetImageInfo) (*mgrmq.GetImageInfoResp, *mq.CodeMessage) {
image, err := svc.db.Image().GetByID(svc.db.SQLCtx(), msg.ImageID)
image, err := svc.db.Image().GetByID(svc.db.DefCtx(), msg.ImageID)
if err != nil {
logger.WithField("ImageID", msg.ImageID).Warnf("getting image by id: %s", err.Error())
return nil, mq.Failed(errorcode.OperationFailed, "get image failed")
}
pcmImages, err := svc.db.PCMImage().GetByImageID(svc.db.SQLCtx(), msg.ImageID)
pcmImages, err := svc.db.PCMImage().GetByImageID(svc.db.DefCtx(), msg.ImageID)
if err != nil {
logger.WithField("ImageID", msg.ImageID).Warnf("getting pcm image by image id: %s", err.Error())
return nil, mq.Failed(errorcode.OperationFailed, "get pcm images failed")

View File

@ -207,7 +207,7 @@ func (svc *Service) GetServiceList(msg *mgrmq.GetServiceList) (*mgrmq.GetService
_, ok = jo.State.(*jobmod.NormalJobExecutingDump)
if ok {
computingCenter, err := svc.db.ComputingCenter().GetByID(svc.db.SQLCtx(), norJob.TargetCCID)
computingCenter, err := svc.db.ComputingCenter().GetByID(svc.db.DefCtx(), norJob.TargetCCID)
if err != nil {
return nil, mq.Failed(errorcode.OperationFailed, fmt.Sprintf("get cdsNodeID failed by CCID: %s", err.Error()))
}

View File

@ -19,7 +19,7 @@ func (svc *Service) ECSNodeRunningInfo(req *schsdk.ECSNodeRunningInfoReq) (*schs
}
func (svc *Service) GetAllModels(msg *mgrmq.GetAllModels) (*mgrmq.GetAllModelsResp, *mq.CodeMessage) {
models, err := svc.db.Models().GetAll(svc.db.SQLCtx())
models, err := svc.db.Models().GetAll(svc.db.DefCtx())
if err != nil {
logger.Warnf("getting all models: %s", err.Error())
return nil, mq.Failed(errorcode.OperationFailed, "get all models failed")

View File

@ -0,0 +1,45 @@
package cmdline
import (
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
"os"
"gitlink.org.cn/cloudream/common/pkgs/cmdtrie"
)
// CommandContext is handed to every registered command handler and gives it
// access to the Commandline (and through it, the service layer).
type CommandContext struct {
	Cmdline *Commandline
}

// commands is the package-global registry of CLI commands; handlers register
// themselves at package init time via MustAddCmd.
var commands cmdtrie.CommandTrie[CommandContext, error] = cmdtrie.NewCommandTrie[CommandContext, error]()

// Commandline dispatches parsed command-line arguments to the matching
// registered command.
type Commandline struct {
	Svc *services.Service
}
// NewCommandline constructs a Commandline bound to the given service layer.
// It currently cannot fail; the error return exists for interface symmetry
// with the other constructors in this project.
func NewCommandline(svc *services.Service) (*Commandline, error) {
	cmdline := &Commandline{Svc: svc}
	return cmdline, nil
}
// DispatchCommand matches allArgs against the registered command trie and
// runs the matched command. Any failure — an unresolvable command line or an
// error returned by the command itself — is printed and terminates the
// process with exit code 1.
func (c *Commandline) DispatchCommand(allArgs []string) {
	cmdCtx := CommandContext{
		Cmdline: c,
	}

	// Execute returns two errors: err is a dispatch failure (e.g. unknown
	// command), cmdErr is the error returned by the command handler. Both
	// were reported with identical duplicated branches before; fold them.
	cmdErr, err := commands.Execute(cmdCtx, allArgs, cmdtrie.ExecuteOption{ReplaceEmptyArrayWithNil: true})
	if err == nil {
		err = cmdErr
	}
	if err != nil {
		// Original message lacked a trailing newline, gluing it to the
		// shell prompt.
		fmt.Printf("execute command failed, err: %s\n", err.Error())
		os.Exit(1)
	}
}
// MustAddCmd registers fn under the given command prefix words, panicking on
// registration failure (see cmdtrie.CommandTrie.MustAdd). The return value is
// always nil; it exists only so callers can use the package-level
// `var _ = MustAddCmd(...)` registration idiom.
func MustAddCmd(fn any, prefixWords ...string) any {
	commands.MustAdd(fn, prefixWords...)
	return nil
}

View File

@ -0,0 +1,5 @@
package cmdline
// var _ = MustAddCmd(func(ctx CommandContext, infoFilePath string) error {
//
// }, "jobset", "new")

View File

@ -0,0 +1,25 @@
package cmdline
import (
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/http"
)
// Registers the "serve http" command: starts the HTTP API server, listening
// on args[0] when given, otherwise on the default address :7891.
var _ = MustAddCmd(func(ctx CommandContext, args []string) error {
	addr := ":7891"
	if len(args) > 0 {
		addr = args[0]
	}

	svr, err := http.NewServer(addr, ctx.Cmdline.Svc)
	if err != nil {
		return fmt.Errorf("new http server: %w", err)
	}

	if err := svr.Serve(); err != nil {
		return fmt.Errorf("serving http: %w", err)
	}
	return nil
}, "serve", "http")

View File

@ -0,0 +1,30 @@
package config
import (
"gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/sdks/blockchain"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
"gitlink.org.cn/cloudream/common/utils/config"
db "gitlink.org.cn/cloudream/scheduler/common/pkgs/db/config"
)
// Config aggregates every configuration section used by the scheduler
// middleware process. It is populated once by Init and read through Cfg.
type Config struct {
	Logger           logger.Config      `json:"logger"`
	DB               db.Config          `json:"db"`
	PCMScheduler     sch.Config         `json:"pcmScheduler"`
	Uploader         uploadersdk.Config `json:"uploader"`
	BlockChain       blockchain.Config  `json:"blockChain"`
	CloudreamStorage cdsapi.Config      `json:"cloudreamStorage"`
}

// cfg is the single process-wide configuration instance.
var cfg Config

// Init loads the configuration into the package-level cfg using the common
// loader; "middleware" selects the config file name/section to read.
func Init() error {
	return config.DefaultLoad("middleware", &cfg)
}

// Cfg returns the process-wide configuration. Only valid after Init has
// succeeded.
func Cfg() *Config {
	return &cfg
}

View File

@ -0,0 +1,24 @@
package http
import "gitlink.org.cn/cloudream/common/consts/errorcode"
// Response is the uniform JSON envelope returned by every HTTP endpoint:
// a machine-readable code, a human-readable message, and an optional payload.
type Response struct {
	Code    string `json:"code"`
	Message string `json:"message"`
	Data    any    `json:"data"`
}
// OK builds a success envelope carrying the given payload. The code is
// errorcode.OK and the message is left empty.
func OK(data any) Response {
	resp := Response{Code: errorcode.OK}
	resp.Data = data
	return resp
}
// Failed builds an error envelope with the given error code and
// human-readable message; Data is left nil.
func Failed(code string, msg string) Response {
	return Response{Code: code, Message: msg}
}

View File

@ -0,0 +1,417 @@
package http
import (
"github.com/gin-gonic/gin"
"gitlink.org.cn/cloudream/common/consts/errorcode"
"gitlink.org.cn/cloudream/common/pkgs/logger"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
"gitlink.org.cn/cloudream/common/utils/serder"
"io"
"net/http"
)
// JobSetService groups the job-set related HTTP handlers; it embeds the
// Server so handlers can reach the service layer via s.svc.
type JobSetService struct {
	*Server
}

// JobSetSvc returns a JobSetService view of the server, used when wiring
// routes in initRouters.
func (s *Server) JobSetSvc() *JobSetService {
	return &JobSetService{
		Server: s,
	}
}
// JobSetSubmitResp is the payload returned by Submit: the new job set's ID
// plus the scheme telling the client where/how to upload its files.
type JobSetSubmitResp struct {
	JobSetID          schsdk.JobSetID                `json:"jobSetID"`
	FilesUploadScheme schsdk.JobSetFilesUploadScheme `json:"filesUploadScheme"`
}
// Submit handles POST /jobSet/submit: parses a JobSetInfo from the request
// body, runs pre-scheduling to obtain a scheduling scheme plus a files
// upload scheme, then submits the job set. Failures are reported with HTTP
// 200 and an error envelope, matching the other handlers in this file.
func (s *JobSetService) Submit(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.Submit")

	body, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	info, err := serder.JSONToObjectEx[schsdk.JobSetInfo](body)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	// Pre-scheduling decides the target of each job and how its files must
	// be uploaded before the job set is actually submitted.
	schScheme, uploadScheme, err := s.svc.JobSetSvc().PreScheduler(info)
	if err != nil {
		log.Warnf("pre-scheduling jobset: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "pre-scheduling jobset failed"))
		return
	}

	jobsetID, err := s.svc.JobSetSvc().Submit(info, schScheme)
	if err != nil {
		log.Warnf("submitting jobset: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "submit jobset failed"))
		return
	}

	resp := JobSetSubmitResp{
		JobSetID:          *jobsetID,
		FilesUploadScheme: *uploadScheme,
	}
	ctx.JSON(http.StatusOK, OK(resp))
}
// JobSetLocalFileUploadedReq is the callback body the client posts after it
// has finished (or failed) uploading a local file for a job set.
type JobSetLocalFileUploadedReq struct {
	JobSetID  schsdk.JobSetID `json:"jobSetID" binding:"required"`
	LocalPath string          `json:"localPath" binding:"required"`
	// Error is non-empty when the client-side upload failed.
	Error     string            `json:"error"`
	PackageID cdssdk.PackageID  `json:"packageID"`
	ObjectIDs []cdssdk.ObjectID `json:"objectIDs"`
	//FolderID uploadersdk.FolderID `json:"folderID"`
	//UploadedInfo []schmod.FileUploadedInfo `json:"uploadedInfo"`
}
// LocalFileUploaded handles POST /jobSet/localFileUploaded: receives the
// client's upload-completion callback and forwards it to the service layer.
func (s *JobSetService) LocalFileUploaded(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.LocalFileUploaded")

	req := JobSetLocalFileUploadedReq{}
	if err := ctx.ShouldBindJSON(&req); err != nil {
		log.Warnf("binding body: %s", err.Error())
		ctx.JSON(http.StatusBadRequest, Failed(errorcode.BadArgument, "missing argument or invalid argument"))
		return
	}

	// The service call reports nothing back; acknowledge with an empty OK.
	s.svc.JobSetSvc().LocalFileUploaded(req.JobSetID, req.LocalPath, req.Error, req.PackageID, req.ObjectIDs)
	ctx.JSON(http.StatusOK, OK(nil))
}
// UploadReq is the request body for Upload.
type UploadReq struct {
	UserID       cdssdk.UserID    `json:"userID"`
	UploadParams sch.UploadParams `json:"uploadParams"`
}

// UploadResp is returned for local uploads: the job set created for the
// upload, the echoed local path, and the candidate storages.
type UploadResp struct {
	JobSetID   schsdk.JobSetID    `json:"jobSetID"`
	LocalPath  string             `json:"localPath"`
	StorageIDs []cdssdk.StorageID `json:"storageIDs"`
	BucketID   cdssdk.BucketID    `json:"bucketID"`
}
// Upload handles POST /jobSet/upload: parses an UploadReq and asks the
// service layer to start an upload job set.
//
// For a local upload the client still has to push the file itself, so the
// response carries the job set ID, the echoed local path and the candidate
// storages; for a remote upload the transfer is handled server-side and a
// bare "success" is returned.
func (s *JobSetService) Upload(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.Upload")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	// serder (not ShouldBindJSON) is used because UploadParams.UploadInfo is
	// a union type that needs the custom deserializer.
	req, err := serder.JSONToObjectEx[UploadReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	jobsetID, storages, err := s.svc.JobSetSvc().Upload(req.UserID, req.UploadParams)
	if err != nil {
		log.Warnf("uploading file: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "upload file failed, error: "+err.Error()))
		return
	}

	// NOTE(review): if UploadInfo is neither Local nor Remote, no response is
	// written at all — confirm the union has exactly these two variants.
	switch info := req.UploadParams.UploadInfo.(type) {
	case *sch.LocalUploadInfo:
		// NOTE(review): BucketID is hard-coded to 1 — confirm this is a
		// deliberate default and not a placeholder.
		ctx.JSON(http.StatusOK, OK(UploadResp{
			JobSetID:   *jobsetID,
			LocalPath:  info.LocalPath,
			StorageIDs: *storages,
			BucketID:   1,
		}))
	case *sch.RemoteUploadInfo:
		ctx.JSON(http.StatusOK, OK("success"))
	}
}
// CreateFolderReq is the request body for CreateFolder: the package to
// create the folder in and the folder's path within it.
type CreateFolderReq struct {
	PackageID cdssdk.PackageID `json:"packageID"`
	Path      string           `json:"path"`
}
// CreateFolder handles POST /jobSet/createFolder: creates a folder at the
// given path inside the given package.
func (s *JobSetService) CreateFolder(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.CreateFolder")

	body, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[CreateFolderReq](body)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	if err := s.svc.JobSetSvc().CreateFolder(req.PackageID, req.Path); err != nil {
		log.Warnf("creating folder: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}
// DeleteFileReq is the request body for DeleteFile: the objects to delete on
// behalf of the user.
type DeleteFileReq struct {
	UserID    cdssdk.UserID     `json:"userID" binding:"required"`
	ObjectIDs []cdssdk.ObjectID `json:"objectIDs" binding:"required"`
}
// DeleteFile handles POST /jobSet/deleteFile: deletes the given objects on
// behalf of the user.
func (s *JobSetService) DeleteFile(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.DeleteFile")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[DeleteFileReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().DeleteFile(req.UserID, req.ObjectIDs)
	if err != nil {
		// Previously logged "creating folder" / "create folder failed" — a
		// copy-paste leftover from CreateFolder that made the logs misleading.
		log.Warnf("deleting file: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "delete file failed"))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}
// DeleteFolderReq is the request body for DeleteFolder: the folder path to
// delete within the given package, on behalf of the user.
type DeleteFolderReq struct {
	UserID    cdssdk.UserID    `json:"userID" binding:"required"`
	PackageID cdssdk.PackageID `json:"packageID" binding:"required"`
	Path      string           `json:"path" binding:"required"`
}
// DeleteFolder handles POST /jobSet/deleteFolder: deletes the folder at the
// given path inside the given package, on behalf of the user.
func (s *JobSetService) DeleteFolder(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.DeleteFolder")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[DeleteFolderReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().DeleteFolder(req.UserID, req.PackageID, req.Path)
	if err != nil {
		// Previously logged "creating folder" — a copy-paste leftover that
		// is visibly wrong in the service logs for this endpoint.
		log.Warnf("deleting folder: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "delete folder failed"))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}
// QueryUploadedReq is the request body for QueryUploaded.
type QueryUploadedReq struct {
	QueryParams sch.QueryData `json:"queryParams" binding:"required"`
}

// QueryUploadedResp is a page of uploaded data packages plus paging totals.
type QueryUploadedResp struct {
	TotalPages    int                   `json:"totalPages"`
	TotalCount    int                   `json:"totalCount"`
	CurrentPage   int                   `json:"currentPage"`
	PageSize      int                   `json:"pageSize"`
	UploadedDatas []uploadersdk.Package `json:"uploadedDatas"`
}
// QueryUploaded handles POST /jobSet/queryUploaded: returns one page of
// uploaded data packages matching the query parameters, plus paging totals.
func (s *JobSetService) QueryUploaded(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.QueryUploaded")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[QueryUploadedReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	uploadedDatas, totalPages, totalCount, err := s.svc.JobSetSvc().QueryUploaded(req.QueryParams)
	if err != nil {
		// Previously logged "getting service list" — a copy-paste leftover
		// from another handler.
		log.Warnf("querying uploaded data: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "get upload data list failed, error: "+err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK(QueryUploadedResp{
		TotalPages:    totalPages,
		TotalCount:    totalCount,
		CurrentPage:   req.QueryParams.CurrentPage,
		PageSize:      req.QueryParams.PageSize,
		UploadedDatas: uploadedDatas,
	}))
}
// BindingReq is the request body for Binding (and is also what
// RemoveBinding actually parses).
type BindingReq struct {
	// ID is not marked required — presumably zero means a new binding is
	// being created; TODO confirm against the service layer.
	ID          uploadersdk.DataID `json:"ID"`
	UserID      cdssdk.UserID      `json:"userID" binding:"required"`
	BindingName string             `json:"bindingName" binding:"required"`
	BindingType string             `json:"bindingType" binding:"required"`
	// NOTE(review): "Pacakge" is a typo for "Package", but the JSON tag is
	// part of the wire contract — renaming would break existing clients.
	PacakgeIDs []cdssdk.PackageID `json:"pacakgeIDs" binding:"required"`
}
// Binding handles POST /jobSet/binding: binds the given packages under a
// named binding for the user.
func (s *JobSetService) Binding(ctx *gin.Context) {
	log := logger.WithField("HTTP", "JobSet.Binding")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[BindingReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	params := uploadersdk.BindingData{
		ID:          req.ID,
		UserID:      req.UserID,
		BindingName: req.BindingName,
		BindingType: req.BindingType,
	}
	err = s.svc.JobSetSvc().DataBinding(params, req.PacakgeIDs)
	if err != nil {
		// Previously logged "getting service list" — a copy-paste leftover
		// from another handler.
		log.Warnf("binding data: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "binding data failed, error: "+err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}
// RemoveBindingReq is the intended request body for RemoveBinding.
// NOTE(review): currently unused — the RemoveBinding handler parses
// BindingReq instead; confirm which shape clients actually send.
type RemoveBindingReq struct {
	UploadDatas []uploadersdk.DataID `json:"uploadDatas"`
}
// RemoveBinding handles POST /jobSet/removeBinding: removes the data binding
// for the given packages.
//
// NOTE(review): this handler parses BindingReq (using its PacakgeIDs field)
// rather than the RemoveBindingReq type declared above, which is left
// unused — confirm which request shape clients actually send before
// changing the parse target.
func (s *JobSetService) RemoveBinding(ctx *gin.Context) {
	// Previously tagged "JobSet.Binding" — a copy-paste leftover that made
	// this endpoint indistinguishable from Binding in the logs.
	log := logger.WithField("HTTP", "JobSet.RemoveBinding")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[BindingReq](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().RemoveBinding(req.PacakgeIDs)
	if err != nil {
		// Previously logged "getting service list" — copy-paste leftover.
		log.Warnf("removing binding: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "remove binding failed, error: "+err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}
// PackageCreate is the request body for CreatePackage.
type PackageCreate struct {
	UserID cdssdk.UserID `json:"userID"`
	Name   string        `json:"name"`
	// DataType distinguishes what the package holds (e.g. the request sample
	// in this commit uses "dataset" — TODO confirm the full value set).
	DataType string `json:"dataType"`
}
// CreatePackage handles POST /jobSet/createPackage: creates a new package of
// the given data type for the user.
func (s *JobSetService) CreatePackage(ctx *gin.Context) {
	// Previously tagged "JobSet.CreateFolder" — a copy-paste leftover that
	// misattributed this endpoint's log lines.
	log := logger.WithField("HTTP", "JobSet.CreatePackage")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[PackageCreate](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().CreatePackage(req.UserID, req.Name, req.DataType)
	if err != nil {
		// Previously logged "creating folder" — copy-paste leftover.
		log.Warnf("creating package: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}
// PackageDelete is the request body for DeletePackage.
type PackageDelete struct {
	UserID    cdssdk.UserID    `json:"userID" binding:"required"`
	PackageID cdssdk.PackageID `json:"packageID" binding:"required"`
}
// DeletePackage deletes the given package on behalf of the given user.
// It always responds with HTTP 200; failures are reported via the business error code.
func (s *JobSetService) DeletePackage(ctx *gin.Context) {
	// fixed: log field said "JobSet.CreateFolder", copied from the folder handler
	log := logger.WithField("HTTP", "JobSet.DeletePackage")

	bodyData, err := io.ReadAll(ctx.Request.Body)
	if err != nil {
		log.Warnf("reading request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "read request body failed"))
		return
	}

	req, err := serder.JSONToObjectEx[PackageDelete](bodyData)
	if err != nil {
		log.Warnf("parsing request body: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, "parse request body failed"))
		return
	}

	err = s.svc.JobSetSvc().DeletePackage(req.UserID, req.PackageID)
	if err != nil {
		// fixed: message said "creating folder"
		log.Warnf("deleting package: %s", err.Error())
		ctx.JSON(http.StatusOK, Failed(errorcode.OperationFailed, err.Error()))
		return
	}

	ctx.JSON(http.StatusOK, OK("success"))
}

View File

@ -0,0 +1,55 @@
package http
import (
"github.com/gin-gonic/gin"
"gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
)
// Server is the HTTP front end of the scheduler middleware: a gin engine, the
// address it listens on, and the service layer requests are dispatched to.
type Server struct {
	engine     *gin.Engine
	listenAddr string
	svc        *services.Service
}
// NewServer builds a Server that will listen on listenAddr and dispatch
// requests to the given service layer. It never fails; the error return is
// kept for interface stability.
func NewServer(listenAddr string, svc *services.Service) (*Server, error) {
	srv := &Server{
		engine:     gin.New(),
		listenAddr: listenAddr,
		svc:        svc,
	}
	return srv, nil
}
// Serve registers all routes and then blocks serving HTTP on the configured
// address. It returns the error that stopped the engine, or nil on a clean stop.
func (s *Server) Serve() error {
	s.initRouters()

	logger.Infof("start serving http at: %s", s.listenAddr)
	if err := s.engine.Run(s.listenAddr); err != nil {
		logger.Infof("http stopped with error: %s", err.Error())
		return err
	}

	logger.Infof("http stopped")
	return nil
}
// initRouters registers every HTTP endpoint exposed by the job-set service.
// All endpoints are POST and are handled by JobSetService methods.
func (s *Server) initRouters() {
	s.engine.POST("/jobSet/upload", s.JobSetSvc().Upload)
	s.engine.POST("/jobSet/submit", s.JobSetSvc().Submit)
	s.engine.POST("/jobSet/localFileUploaded", s.JobSetSvc().LocalFileUploaded)
	s.engine.POST("/jobSet/queryUploaded", s.JobSetSvc().QueryUploaded)
	s.engine.POST("/jobSet/createPackage", s.JobSetSvc().CreatePackage)
	s.engine.POST("/jobSet/deletePackage", s.JobSetSvc().DeletePackage)
	s.engine.POST("/jobSet/createFolder", s.JobSetSvc().CreateFolder)
	s.engine.POST("/jobSet/deleteFolder", s.JobSetSvc().DeleteFolder)
	s.engine.POST("/jobSet/deleteFile", s.JobSetSvc().DeleteFile)
	s.engine.POST("/jobSet/binding", s.JobSetSvc().Binding)
	s.engine.POST("/jobSet/removeBinding", s.JobSetSvc().RemoveBinding)
}

View File

@ -0,0 +1,194 @@
package executormgr
import (
"bufio"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/async"
log "gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/utils/serder"
jobTask "gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/task"
"io"
"strings"
"sync"
"time"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
exemq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)
// task tracks one running executor task: the channel its status reports are
// delivered on.
type task struct {
	statusChan *async.UnboundChannel[mgrmq.ExecutorTaskStatus]
}

// ExecutorStatus is the in-memory record of a connected executor and the
// tasks it is currently running.
type ExecutorStatus struct {
	executorID schmod.ExecutorID
	tasks      map[string]task // keyed by TaskID
}

// ErrWaitReportTimeout is returned when an executor status report does not
// arrive within the configured timeout.
var ErrWaitReportTimeout = fmt.Errorf("wait report timeout")

// ExecutorPool caches HTTP clients for executors, keyed by URL.
var ExecutorPool exemq.HttpPool
// InitExecutorPool initializes the global executor HTTP client pool with a
// default configuration. It must be called before ExecutorPool is used.
func InitExecutorPool() {
	ExecutorPool = exemq.NewHttpPool(&exemq.Config{})
}
// Manager owns the set of connected executors and routes their task status
// reports to the tasks waiting on them.
type Manager struct {
	executors     map[schmod.ExecutorID]*ExecutorStatus // guarded by lock
	lock          sync.Mutex
	exeCli        *exemq.Client
	reportTimeout time.Duration // maximum time to wait for a status report
}
// NewManager creates a Manager that tracks executors and their running tasks.
// reportTimeout bounds how long to wait for an executor status report.
func NewManager(reportTimeout time.Duration) (*Manager, error) {
	cli, err := schglb.ExecutorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new executor client: %w", err)
	}

	mgr := &Manager{
		executors:     make(map[schmod.ExecutorID]*ExecutorStatus),
		exeCli:        cli,
		reportTimeout: reportTimeout,
	}
	return mgr, nil
}
// ReceiveExecutorTaskStatus connects to the executor at url and starts
// consuming its line-oriented status report stream. The first line identifies
// the executor (and carries all of its tasks, for failure retry); it is
// registered in m.executors and returned. Subsequent lines are consumed by a
// background goroutine and dispatched through m.Report until the stream ends.
//
// NOTE(review): m.executors is written here without holding m.lock; the only
// in-view caller (StartTask) already holds the lock — confirm before adding
// other call sites.
func (m *Manager) ReceiveExecutorTaskStatus(url string) (*mgrmq.ExecutorTaskStatus, error) {
	client, err := ExecutorPool.AcquireByUrl(url)
	if err != nil {
		log.Error(err)
		return &mgrmq.ExecutorTaskStatus{}, err
	}

	resp, err := client.GetReportInfo()
	if err != nil {
		log.Error(err)
		return &mgrmq.ExecutorTaskStatus{}, err
	}

	reader := bufio.NewReader(resp.Body)

	line, err := reader.ReadString('\n')
	if err != nil && err != io.EOF {
		// fixed: close the response body on the early-error path
		resp.Body.Close()
		log.Error("Error reading from response body:", err)
		return &mgrmq.ExecutorTaskStatus{}, err
	}

	// TODO the first line carries all of the executor's tasks, used for failure retry
	executorInfo := convertLine(line)
	if executorInfo == nil {
		// fixed: a blank or unparsable first line previously caused a nil
		// dereference on executorInfo.ExecutorID below
		resp.Body.Close()
		err := fmt.Errorf("invalid first report line from executor at %s", url)
		log.Error(err)
		return &mgrmq.ExecutorTaskStatus{}, err
	}

	// Register the executor on first contact.
	exec := &ExecutorStatus{
		executorID: executorInfo.ExecutorID,
		tasks:      make(map[string]task),
	}
	m.executors[executorInfo.ExecutorID] = exec

	go func() {
		// fixed: the response body was never closed; release it when the
		// stream ends or errors out
		defer resp.Body.Close()
		for {
			line, err := reader.ReadString('\n')
			if err != nil {
				if err != io.EOF {
					log.Error("Error reading from response body:", err)
				}
				return
			}

			status := convertLine(line)
			if status == nil {
				continue
			}
			m.Report(*status)
		}
	}()

	return executorInfo, nil
}
// convertLine parses one line of the executor report stream. It strips the
// SSE-style "data: " prefix and surrounding whitespace, then decodes the JSON
// payload. It returns nil for blank lines or lines that fail to decode.
func convertLine(line string) *mgrmq.ExecutorTaskStatus {
	if line == "" {
		return nil
	}

	payload := strings.TrimSpace(strings.TrimPrefix(line, "data: "))
	if len(payload) == 0 {
		return nil
	}

	status, err := serder.JSONToObjectEx[mgrmq.ExecutorTaskStatus]([]byte(payload))
	if err != nil {
		log.Error(err)
		return nil
	}
	return &status
}
// Report dispatches one task status update to the channel of the matching
// task. If the channel has been closed (the receiver is gone), the task is
// dropped, and the executor entry is removed once it has no tasks left.
func (m *Manager) Report(status mgrmq.ExecutorTaskStatus) {
	m.lock.Lock()
	defer m.lock.Unlock()

	exec := m.executors[status.ExecutorID]
	if exec == nil {
		log.Error("Executor not found: ", status.ExecutorID)
		return
	}

	// fixed: guard against an unknown TaskID — the previous zero-value lookup
	// produced a nil statusChan, and Send on it would panic. (Tasks are
	// normally registered before they run, but the stream may carry statuses
	// for tasks this manager never started.)
	tsk, ok := exec.tasks[status.TaskID]
	if !ok {
		log.Error("Task not found: ", status.TaskID)
		return
	}

	// TODO consider proactively detecting a closed channel and cancelling the task
	if tsk.statusChan.Send(status) != nil {
		delete(exec.tasks, status.TaskID)
		if len(exec.tasks) == 0 {
			delete(m.executors, exec.executorID)
		}
	}
}
// StartTask submits a task to the executor of the given computing center and
// returns a JobTask whose channel receives the task's status reports.
// On any failure the returned task's channel is closed with the error.
func (m *Manager) StartTask(info exetsk.TaskInfo, ccInfo schmod.ComputingCenter) (*jobTask.JobTask[mgrmq.ExecutorTaskStatus], error) {
	m.lock.Lock()
	defer m.lock.Unlock()

	newJobTask := jobTask.NewJobTask[mgrmq.ExecutorTaskStatus]()
	ch := newJobTask.Chan()

	client, err := ExecutorPool.AcquireByUrl(ccInfo.ExecutorURL)
	if err != nil {
		ch.CloseWithError(fmt.Errorf("start task: %w", err))
		return newJobTask, err
	}

	executorID := schmod.ExecutorID(ccInfo.ExecutorID)

	// First contact with this executor: open its report stream before
	// submitting, so no status update is missed.
	_, ok := m.executors[executorID]
	if !ok {
		_, err = m.ReceiveExecutorTaskStatus(ccInfo.ExecutorURL)
		if err != nil {
			ch.CloseWithError(fmt.Errorf("start task: %w", err))
			return newJobTask, err
		}
	}

	// The executor was registered above, so this lookup cannot miss.
	exeInfo := m.executors[executorID]
	// Register the task's channel before submitting, so the first report
	// always finds it.
	exeInfo.tasks[newJobTask.ID()] = task{
		statusChan: ch,
	}

	_, err = client.SubmitTask(exemq.NewStartTask(newJobTask.ID(), info))
	if err != nil {
		ch.CloseWithError(fmt.Errorf("start task: %w", err))
		return newJobTask, err
	}

	return newJobTask, nil
}
// Serve starts the manager's background services; currently it only
// initializes the global executor client pool.
func (m *Manager) Serve() {
	InitExecutorPool()
}

View File

@ -0,0 +1,7 @@
package event
// Cancel is the event posted to ask a running job to stop.
type Cancel struct {
}

// Noop marks Cancel as an event.
func (s *Cancel) Noop() {
}

View File

@ -0,0 +1,69 @@
package event
import (
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/pkgs/types"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"gitlink.org.cn/cloudream/common/utils/serder"
)
// OperateInstanceFuture receives the result of an instance operation.
type OperateInstanceFuture = *future.SetValueFuture[OperateInstanceResult]

// InstanceOperate is the event carrying an instance operation request and the
// future its result is delivered through.
type InstanceOperate struct {
	Info   InstanceOperateInfo
	Result OperateInstanceFuture
}

// OperateInstanceResult is the outcome of an instance operation.
type OperateInstanceResult struct {
	OperateResult     string
	Err               error
	JobID             schsdk.JobID
	FilesUploadScheme schsdk.JobFilesUploadScheme
}

// InstanceOperateInfo is the marker interface implemented by every concrete
// instance operation description.
type InstanceOperateInfo interface {
	Instance()
}

// InstanceInfoBase provides the marker method for operation descriptions.
type InstanceInfoBase struct{}

func (i *InstanceInfoBase) Instance() {}

// InstanceOperateInfoTypeUnion registers the concrete operation types so they
// can be (de)serialized through the internally-tagged "type" field.
var InstanceOperateInfoTypeUnion = types.NewTypeUnion[InstanceOperateInfo](
	(*InstanceCreateInfo)(nil),
	(*InstanceUpdateInfo)(nil),
	(*InstanceDeleteInfo)(nil),
)
var _ = serder.UseTypeUnionInternallyTagged(&InstanceOperateInfoTypeUnion, "type")
// InstanceCreateInfo describes creating a new instance from a dataset.
type InstanceCreateInfo struct {
	serder.Metadata `union:"Create"`
	InstanceInfoBase
	DataSet schsdk.JobFileInfo
}

// InstanceUpdateInfo describes updating a running multi-instance job.
type InstanceUpdateInfo struct {
	serder.Metadata `union:"UpdatePackage"`
	InstanceInfoBase
	Type string                            `json:"type"`
	Info schsdk.UpdateMultiInstanceJobInfo `json:"info"`
	//PackageID cdssdk.PackageID `json:"packageID"`
	//LoRAPackage string `json:"loraPackage"`
}

// InstanceDeleteInfo describes deleting an existing instance.
type InstanceDeleteInfo struct {
	serder.Metadata `union:"Delete"`
	InstanceInfoBase
	InstanceID schsdk.JobID `json:"instanceID"`
}
// NewInstanceOperate bundles an instance operation description with the future
// that will receive its result.
func NewInstanceOperate(info InstanceOperateInfo, result OperateInstanceFuture) *InstanceOperate {
	op := InstanceOperate{Info: info, Result: result}
	return &op
}

// Noop marks InstanceOperate as an event.
func (s *InstanceOperate) Noop() {}

View File

@ -0,0 +1,21 @@
package event
import (
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)
// JobCompleted is broadcast when a job finishes, successfully or not.
// Err is nil on success and carries the failure cause otherwise.
type JobCompleted struct {
	Job *jobmgr.Job
	Err error
}

// NewJobCompleted builds a JobCompleted event for the given job and outcome.
func NewJobCompleted(job *jobmgr.Job, err error) *JobCompleted {
	return &JobCompleted{
		Job: job,
		Err: err,
	}
}

// Noop marks JobCompleted as an event.
func (s *JobCompleted) Noop() {
}

View File

@ -0,0 +1,27 @@
package event
import (
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
)
// LocalFileUploaded is posted when the upload of a local file finishes.
// Error is nil on success; PackageID and ObjectIDs identify the uploaded data.
type LocalFileUploaded struct {
	LocalPath string
	Error     error
	PackageID cdssdk.PackageID
	ObjectIDs []cdssdk.ObjectID
	//FolderID uploadersdk.FolderID
	//UploadedInfo []schmod.FileUploadedInfo
}

// NewLocalFileUploaded builds a LocalFileUploaded event.
func NewLocalFileUploaded(localPath string, err error, packageID cdssdk.PackageID, objectIDs []cdssdk.ObjectID) *LocalFileUploaded {
	return &LocalFileUploaded{
		LocalPath: localPath,
		Error:     err,
		PackageID: packageID,
		ObjectIDs: objectIDs,
	}
}

// Noop marks LocalFileUploaded as an event.
func (s *LocalFileUploaded) Noop() {
}

View File

@ -0,0 +1,28 @@
package event
import (
"gitlink.org.cn/cloudream/common/pkgs/future"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
)
// JobUpdateFuture receives the result of a job update request.
type JobUpdateFuture = *future.SetValueFuture[UpdateResult]

// Update is the event asking a job to apply a runtime change.
type Update struct {
	Runtime schsdk.JobRuntimeInfo
	Operate string // name of the operation to perform
	Result  JobUpdateFuture
}

// Noop marks Update as an event.
func (s *Update) Noop() {}

// UpdateResult is the outcome of an Update event.
type UpdateResult struct {
	Err error
}

// NewUpdate builds an Update event carrying the new runtime info, the
// operation name, and the future its result is delivered through.
func NewUpdate(runTime schsdk.JobRuntimeInfo, operate string, jobUpdateFuture JobUpdateFuture) *Update {
	return &Update{
		Runtime: runTime,
		Operate: operate,
		Result:  jobUpdateFuture,
	}
}

View File

@ -0,0 +1,75 @@
package event
import (
"context"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)
// WaitType blocks until an event of type T occurs in the set.
// ctx controls the wait: if it is cancelled or expires, the wait is aborted.
// set is the event set to wait on.
// Returns the event converted to T and a bool reporting whether an event was
// actually received (false when the context ended first).
func WaitType[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet) (T, bool) {
	// Wait for any event that can be asserted to type T.
	ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
		_, ok := evt.(T)
		return ok
	})
	if ret == nil {
		var r T
		return r, false // no event received: return the zero value and false
	}
	// set.Wait returns jobmgr.Event; convert to T before returning.
	return ret.(T), ok
}
// WaitTypeAnd blocks until an event of type T that also satisfies cond occurs.
// ctx controls cancellation/timeout; set is the event set to wait on.
// Returns the matching event and whether one was actually received.
func WaitTypeAnd[T jobmgr.Event](ctx context.Context, set *jobmgr.EventSet, cond func(val T) bool) (T, bool) {
	// Wait for an event that is of type T and satisfies cond.
	ret, ok := set.Wait(ctx, func(evt jobmgr.Event) bool {
		// Assert the event to type T; non-T events never match.
		e, ok := evt.(T)
		if !ok {
			return false
		}
		// T-typed events match only when cond holds.
		return cond(e)
	})
	if ret == nil {
		var r T
		return r, false // no event received: return the zero value and false
	}
	// Convert the received jobmgr.Event back to T.
	return ret.(T), ok
}
// BeginWaitType starts a non-blocking wait for an event of type T and returns
// the future that will receive it.
func BeginWaitType[T jobmgr.Event](set *jobmgr.EventSet) future.Future1[jobmgr.Event] {
	// Register a waiter matching any event assertable to T.
	return set.BeginWait(func(evt jobmgr.Event) bool {
		_, ok := evt.(T)
		return ok
	})
}
// BeginWaitTypeAnd starts a non-blocking wait for an event of type T that also
// satisfies cond, returning the future that will receive it.
func BeginWaitTypeAnd[T jobmgr.Event](set *jobmgr.EventSet, cond func(val T) bool) future.Future1[jobmgr.Event] {
	// Register a waiter matching T-typed events that satisfy cond.
	return set.BeginWait(func(evt jobmgr.Event) bool {
		// Assert the event to type T; non-T events never match.
		e, ok := evt.(T)
		if !ok {
			return false
		}
		// T-typed events match only when cond holds.
		return cond(e)
	})
}

View File

@ -0,0 +1,105 @@
package jobmgr
import (
"context"
"errors"
"sync"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/utils/lo2"
)
// EventWaitCondition reports whether a posted event matches a waiter.
type EventWaitCondition func(evt Event) bool

// ErrJobCancelled signals that a job was cancelled.
var ErrJobCancelled = errors.New("job cancelled")

// Event is the marker interface for everything posted to an EventSet.
type Event interface {
	Noop()
}

// EventWaiter pairs a match condition with the future that receives the event.
type EventWaiter struct {
	condition EventWaitCondition
	future    *future.SetValueFuture[Event]
}

// EventSet is a thread-safe rendezvous between posted events and waiters:
// events with no matching waiter are queued, and waiters with no matching
// queued event block until one is posted.
type EventSet struct {
	events  []Event
	waiters []EventWaiter
	lock    sync.Mutex
}

// NewEventSet returns an empty, ready-to-use EventSet.
func NewEventSet() EventSet {
	return EventSet{}
}
// Post publishes an event. Every waiter whose condition matches the event is
// woken (its future receives the event) and removed from the waiter list. If
// no waiter matched, the event is queued for a future Wait/BeginWait call.
func (s *EventSet) Post(evt Event) {
	s.lock.Lock()         // protect events/waiters
	defer s.lock.Unlock() // released when the function returns

	// fixed: the previous version called lo2.RemoveAt(s.waiters, i) while
	// ranging over s.waiters, which shifts the backing array mid-iteration and
	// can skip a waiter or wake the same waiter twice. Rebuild the list of
	// remaining waiters instead.
	used := false // whether the event woke at least one waiter
	remaining := s.waiters[:0]
	for _, waiter := range s.waiters {
		if waiter.condition(evt) {
			waiter.future.SetValue(evt)
			used = true
		} else {
			remaining = append(remaining, waiter)
		}
	}
	s.waiters = remaining

	// No waiter consumed the event: queue it for later.
	if !used {
		s.events = append(s.events, evt)
	}
}
// Wait blocks until an event satisfying cond is available or ctx ends.
// A queued event that matches is consumed immediately; otherwise the caller is
// registered as a waiter and blocks. Returns the event and true, or nil and
// false when the context was cancelled or expired first.
func (s *EventSet) Wait(ctx context.Context, cond EventWaitCondition) (Event, bool) {
	s.lock.Lock()
	// Consume a matching queued event, if one exists.
	for i, evt := range s.events {
		if cond(evt) {
			s.events = lo2.RemoveAt(s.events, i)
			s.lock.Unlock()
			return evt, true
		}
	}
	// No queued match: register a waiter and block on its future.
	fut := future.NewSetValue[Event]()
	waiter := EventWaiter{
		condition: cond,
		future:    fut,
	}
	s.waiters = append(s.waiters, waiter)
	s.lock.Unlock()
	val, err := fut.Wait(ctx)
	if err != nil {
		return nil, false
	}
	return val, true
}
// BeginWait is the non-blocking counterpart of Wait: it returns a future that
// is already resolved with a matching queued event, or one that will resolve
// when a matching event is posted.
func (s *EventSet) BeginWait(cond EventWaitCondition) future.Future1[Event] {
	s.lock.Lock()
	// Resolve immediately with a matching queued event, if one exists.
	for i, evt := range s.events {
		if cond(evt) {
			s.events = lo2.RemoveAt(s.events, i)
			s.lock.Unlock()
			return future.NewReadyValue1(evt)
		}
	}
	// No queued match: register a waiter whose future resolves later.
	fut := future.NewSetValue[Event]()
	waiter := EventWaiter{
		condition: cond,
		future:    fut,
	}
	s.waiters = append(s.waiters, waiter)
	s.lock.Unlock()
	return fut
}

View File

@ -0,0 +1,88 @@
package jobmgr
import (
"github.com/samber/lo"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// FileScheduleAction names the operation needed to make a file available at
// the target computing center.
type FileScheduleAction string

// File scheduling actions.
const (
	ActionNo          FileScheduleAction = "No"          // nothing to do
	ActionMove        FileScheduleAction = "Move"        // build a cache on the target node
	ActionLoad        FileScheduleAction = "Load"        // load into Storage
	ActionImportImage FileScheduleAction = "ImportImage" // import as an image
)

// FileScheduleScheme is the scheduling decision for a single file.
type FileScheduleScheme struct {
	Action FileScheduleAction `json:"action"`
}

// JobScheduleScheme is the scheduling decision for one job: the target
// computing center plus the action for each of its files.
type JobScheduleScheme struct {
	TargetCCID schsdk.CCID        `json:"targetCCID"`
	Dataset    FileScheduleScheme `json:"dataset"`
	Code       FileScheduleScheme `json:"code"`
	Image      FileScheduleScheme `json:"image"`
}

// JobSetPreScheduleScheme is the pre-scheduling plan of a whole job set.
type JobSetPreScheduleScheme struct {
	JobSchemes map[string]JobScheduleScheme `json:"jobSchemes"` // per-job schemes, keyed by LocalJobID
}

// JobSet groups the jobs submitted together with their pre-scheduling plan.
type JobSet struct {
	JobSetID          schsdk.JobSetID         `json:"jobSetID"` // globally unique job set ID
	JobRefs           []JobSetJobRef          `json:"jobRefs"`  // references to the jobs in this set
	PreScheduleScheme JobSetPreScheduleScheme `json:"preScheduleScheme"`
}

// JobSetJobRef links a job's global ID with its ID local to the job set.
type JobSetJobRef struct {
	JobID      schsdk.JobID `json:"jobID"`      // global job ID
	LocalJobID string       `json:"localJobID"` // job ID within this job set
}
// NewJobSet builds a JobSet from its ID, the references to its jobs, and the
// pre-scheduling scheme computed for it.
func NewJobSet(jobSetID schsdk.JobSetID, jobRefs []JobSetJobRef, preScheduleScheme JobSetPreScheduleScheme) *JobSet {
	set := JobSet{
		JobSetID:          jobSetID,
		JobRefs:           jobRefs,
		PreScheduleScheme: preScheduleScheme,
	}
	return &set
}
// FindRefByLocalJobID returns a copy of the job reference whose LocalJobID
// matches localJobID, or nil when the job set contains no such job.
func (j *JobSet) FindRefByLocalJobID(localJobID string) *JobSetJobRef {
	for _, ref := range j.JobRefs {
		if ref.LocalJobID == localJobID {
			// ref is a per-iteration copy, so returning its address does not
			// alias the slice — same semantics as the original lo.Find version.
			return &ref
		}
	}
	return nil
}
// Job is one schedulable unit: its identity plus a type-specific body.
type Job struct {
	JobSetID schsdk.JobSetID // owning job set
	JobID    schsdk.JobID    // globally unique job ID
	Body     JobBody         // concrete job payload
}

// GetInfo returns the submission-time description of the job.
func (j *Job) GetInfo() schsdk.JobInfo {
	return j.Body.GetInfo()
}

// Dump serializes the job together with its current state.
// NOTE(review): this mixes the receiver j (JobID, JobSetID, Info) with the job
// parameter (Body, State) — they appear to be expected to refer to the same
// job; confirm with the callers.
func (j *Job) Dump(ctx JobStateRunContext, job *Job, curState JobState) jobmod.JobDump {
	return jobmod.JobDump{
		JobID:    j.JobID,
		JobSetID: j.JobSetID,
		Info:     j.GetInfo(),
		Body:     job.Body.Dump(),
		State:    curState.Dump(ctx, job),
	}
}

// JobBody is the interface implemented by each concrete job type.
type JobBody interface {
	GetInfo() schsdk.JobInfo
	Dump() jobmod.JobBodyDump
}

View File

@ -0,0 +1,32 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// DataReturnJob pulls the output of another job back into storage.
type DataReturnJob struct {
	Info                schsdk.DataReturnJobInfo
	TargetJobID         schsdk.JobID         // ID of the job whose output is returned
	TargetJobCCID       schsdk.CCID          // computing center the target job runs in
	TargetJobOutputPath string               // relative output path of the target job
	DataReturnPackageID cdssdk.PackageID     // package produced by the data return
	ECSInstanceID       schsdk.ECSInstanceID // ECS instance reused between data preprocessing and fine-tuning
}

// NewDataReturnJob builds a DataReturnJob from its submission info; the other
// fields are filled in as the job progresses.
func NewDataReturnJob(info schsdk.DataReturnJobInfo) *DataReturnJob {
	return &DataReturnJob{
		Info: info,
	}
}

// GetInfo returns the submission-time description of the job.
func (j *DataReturnJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

// Dump serializes the state needed to persist/restore this job body.
func (j *DataReturnJob) Dump() jobmod.JobBodyDump {
	return &jobmod.DataReturnJobDump{
		DataReturnPackageID: j.DataReturnPackageID,
	}
}

View File

@ -0,0 +1,33 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// InstanceJob is one inference instance of a multi-instance job.
type InstanceJob struct {
	Info        schsdk.InstanceJobInfo // submission-time description of the job
	Files       jobmod.JobFiles        // files the job needs
	TargetCCID  schsdk.CCID            // computing center that will run the job
	OutputPath  string                 // relative output path; the CDS RemoteBase must be prepended for the full path
	ParentJobID schsdk.JobID           // the multi-instance job this instance belongs to
}

// NewInstanceJob builds an InstanceJob; TargetCCID and OutputPath are filled
// in during scheduling.
func NewInstanceJob(info schsdk.InstanceJobInfo, files jobmod.JobFiles, parentJobID schsdk.JobID) *InstanceJob {
	return &InstanceJob{
		Info:        info,
		Files:       files,
		ParentJobID: parentJobID,
	}
}

// GetInfo returns the submission-time description of the job.
func (j *InstanceJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

// Dump serializes the state needed to persist/restore this job body.
func (j *InstanceJob) Dump() jobmod.JobBodyDump {
	return &jobmod.InstanceJobDump{
		Files:      j.Files,
		TargetCCID: j.TargetCCID,
	}
}

View File

@ -0,0 +1,32 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// MultiInstanceJob is a job that fans out into multiple instance sub-jobs.
type MultiInstanceJob struct {
	Info         schsdk.MultiInstanceJobInfo // submission-time description
	Files        jobmod.JobFiles             // files the job needs
	TargetCCID   schsdk.CCID                 // computing center that will run the job
	SubJobs      []schsdk.JobID              // IDs of the spawned instance jobs
	PreScheduler jobmod.JobScheduleScheme    // pre-scheduling scheme for the instances
}

// NewMultiInstanceJob builds a MultiInstanceJob; Files, TargetCCID and SubJobs
// are filled in as the job progresses.
func NewMultiInstanceJob(info schsdk.MultiInstanceJobInfo, preScheduler jobmod.JobScheduleScheme) *MultiInstanceJob {
	return &MultiInstanceJob{
		Info:         info,
		PreScheduler: preScheduler,
	}
}

// GetInfo returns the submission-time description of the job.
func (j *MultiInstanceJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

// Dump serializes the state needed to persist/restore this job body.
func (j *MultiInstanceJob) Dump() jobmod.JobBodyDump {
	return &jobmod.MultiInstanceJobDump{
		Files:      j.Files,
		TargetCCID: j.TargetCCID,
	}
}

View File

@ -0,0 +1,30 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// UpdateMultiInstanceJob applies an update to a running multi-instance job.
type UpdateMultiInstanceJob struct {
	Info  schsdk.UpdateMultiInstanceJobInfo // submission-time description
	Files jobmod.JobFiles                   // files the update needs
	//InstanceIDs []schsdk.JobID
	//UpdateStrategy string
}

// NewUpdateMultiInstanceJob builds an UpdateMultiInstanceJob; Files is filled
// in as the job progresses.
func NewUpdateMultiInstanceJob(info schsdk.UpdateMultiInstanceJobInfo) *UpdateMultiInstanceJob {
	return &UpdateMultiInstanceJob{
		Info: info,
	}
}

// GetInfo returns the submission-time description of the job.
func (j *UpdateMultiInstanceJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

// Dump serializes the state needed to persist/restore this job body.
func (j *UpdateMultiInstanceJob) Dump() jobmod.JobBodyDump {
	return &jobmod.UpdateMultiInstanceJobDump{
		Files: j.Files,
	}
}

View File

@ -0,0 +1,32 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// NormalJob is a plain batch job (including data preprocessing and
// fine-tuning subtypes).
type NormalJob struct {
	Info          schsdk.NormalJobInfo // submission-time description of the job
	Files         jobmod.JobFiles      // files the job needs
	TargetCCID    schsdk.CCID          // computing center that will run the job
	OutputPath    string               // relative output path; the CDS RemoteBase must be prepended for the full path
	SubType       string               // distinguishes the normal-job subtypes
	ECSInstanceID schsdk.ECSInstanceID // ECS instance reused between data preprocessing and fine-tuning
}

// NewNormalJob builds a NormalJob; the remaining fields are filled in during
// scheduling and execution.
func NewNormalJob(info schsdk.NormalJobInfo) *NormalJob {
	return &NormalJob{
		Info: info,
	}
}

// GetInfo returns the submission-time description of the job.
func (j *NormalJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

// Dump serializes the state needed to persist/restore this job body.
func (j *NormalJob) Dump() jobmod.JobBodyDump {
	return &jobmod.NormalJobDump{
		Files:      j.Files,
		TargetCCID: j.TargetCCID,
	}
}

View File

@ -0,0 +1,27 @@
package job
import (
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// PCMJob is a job submitted through the PCM platform.
type PCMJob struct {
	Info  schsdk.PCMJobInfo // submission-time description of the job
	Files jobmod.JobFiles   // files the job needs
}

// NewPCMJob builds a PCMJob from its submission info.
func NewPCMJob(info schsdk.PCMJobInfo) *PCMJob {
	return &PCMJob{
		Info: info,
	}
}

// GetInfo returns the submission-time description of the job.
func (j *PCMJob) GetInfo() schsdk.JobInfo {
	return &j.Info
}

// Dump serializes the state needed to persist/restore this job body.
// NOTE(review): this reuses NormalJobDump rather than a PCM-specific dump
// type — confirm that is intentional.
func (j *PCMJob) Dump() jobmod.JobBodyDump {
	return &jobmod.NormalJobDump{
		Files: j.Files,
	}
}

View File

@ -0,0 +1,239 @@
package state
import (
"context"
"errors"
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
"sync"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
)
// Adjusting is the job state in which files are moved/loaded/imported so the
// job can run at its scheduled computing center.
type Adjusting struct {
	scheme       jobmod.JobScheduleScheme // the schedule to realize
	targetCCInfo schmod.ComputingCenter   // resolved from scheme.TargetCCID in do()
}

// NewAdjusting creates the Adjusting state for the given schedule.
func NewAdjusting(scheme jobmod.JobScheduleScheme) *Adjusting {
	return &Adjusting{
		scheme: scheme,
	}
}

// Run performs the adjustment, then transitions the job to ReadyToExecute on
// success or to the failure-completed state on error.
func (s *Adjusting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewNormalJobReadyToExecute())
	}
}

// Dump serializes this state for persistence.
func (s *Adjusting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.AdjustingDump{
		Scheme: s.scheme,
	}
}
// do performs the adjustment: it resolves the target computing center, fills
// in the job body's target CC and output path, then schedules the dataset,
// code and image files concurrently. The first failure cancels the other two;
// all errors are joined into the result.
func (s *Adjusting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	// TODO UserID is hard-coded to 1 for now
	userID := cdssdk.UserID(1)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Listen for a Cancel event and propagate it to the context.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}
	s.targetCCInfo = ccInfo
	logger.WithField("JobID", jo.JobID).Infof("job is scheduled to %v(%v)", ccInfo.Name, ccInfo.CCID)
	// The final target computing center is now known, so the output path can
	// be generated.
	// TODO UserID
	outputPath := utils.MakeJobOutputPath(userID, jo.JobID)
	// Extract the file info/holders for the concrete job body and record the
	// scheduling target on it. (MultiInstanceJob gets no output path here.)
	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles
	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
		runningJob.OutputPath = outputPath
	case *job.MultiInstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
		runningJob.OutputPath = outputPath
	}
	// Schedule dataset, code and image in parallel; any failure cancels the
	// other two via the shared context.
	wg := sync.WaitGroup{}
	wg.Add(3)
	var e1, e2, e3 error
	go func() {
		defer wg.Done()
		e1 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset)
		if e1 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e2 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code)
		if e2 != nil {
			cancel()
		}
	}()
	go func() {
		defer wg.Done()
		e3 = s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image)
		if e3 != nil {
			cancel()
		}
	}()
	wg.Wait()
	return errors.Join(e1, e2, e3)
}
// doPackageScheduling carries out the scheduled action for one package file:
// ActionMove builds a cache of the package at the target center, ActionLoad
// loads it into the target Storage, and any other action is a no-op.
// Each action blocks until the executor task reports completion.
// NOTE(review): ctx is currently unused — the blocking receives below do not
// observe cancellation; confirm whether cancel support is required here.
func (s *Adjusting) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	if scheme.Action == jobmod.ActionMove {
		// fixed: log typo "pacakge" -> "package"
		logger.Debugf("begin move package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)
		// TODO UserID is hard-coded to 1
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}
		fut := taskStatus.Receive()
		status := <-fut.Chan()
		moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}
		return nil
	}
	if scheme.Action == jobmod.ActionLoad {
		// fixed: log typo "pacakge" -> "package"
		logger.Debugf("begin load package %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			// fixed: this error previously said "moving package" (copy-paste)
			return fmt.Errorf("loading package: %w", err)
		}
		fut := taskStatus.Receive()
		status := <-fut.Chan()
		loadStatus := status.Value.Status.(*exectsk.StorageLoadPackageStatus)
		if loadStatus.Error != "" {
			return fmt.Errorf("loading package: %s", loadStatus.Error)
		}
		// file.PackagePath = loadStatus.PackagePath TODO decide who produces the path
		return nil
	}
	return nil
}
// doImageScheduling carries out the scheduled action for the job's image file.
// ActionImportImage is not implemented yet and always fails; any other action
// is a no-op.
// NOTE(review): everything after the early "not implemented yet" return is
// unreachable — kept as a sketch of the intended import flow.
func (s *Adjusting) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	if scheme.Action == jobmod.ActionImportImage {
		// TODO the image file location needs to be redesigned
		return fmt.Errorf("not implemented yet")
		if file.PackageID == nil {
			return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		}
		// TODO UserID
		taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		if err != nil {
			return fmt.Errorf("moving package: %w", err)
		}
		fut := taskStatus.Receive()
		status := <-fut.Chan()
		moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		if moveStatus.Error != "" {
			return fmt.Errorf("moving package: %s", moveStatus.Error)
		}
		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)
		// TODO UserID
		pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		if err != nil {
			return fmt.Errorf("getting package objects: %w", err)
		}
		if len(pkgObjs.Objects) == 0 {
			return fmt.Errorf("no object in the package which will be imported")
		}
		if len(pkgObjs.Objects) > 1 {
			return fmt.Errorf("there must be only 1 object in the package which will be imported")
		}
		// taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
		// if err != nil {
		// return fmt.Errorf("moving package: %w", err)
		// }
		// fut2 := taskStatus2.Receive()
		// status2 := <-fut2.Chan()
		// if err != nil {
		// return fmt.Errorf("uploading image: %w", err)
		// }
		// uploadStatus := status2.Value.Status.(*exetsk.UploadImageStatus)
		// if uploadStatus.Error != "" {
		// return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		// }
		// // TODO image name
		// err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.SQLCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		// if err != nil {
		// return fmt.Errorf("creating image info: %w", err)
		// }
		return nil
	}
	return nil
}

View File

@ -0,0 +1,55 @@
package state
import (
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/logger"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// Completed is the terminal job state; err is nil on success and carries the
// failure cause otherwise.
type Completed struct {
	err error
}

// SuccessComplete builds the successful terminal state.
func SuccessComplete() *Completed {
	return &Completed{}
}

// FailureComplete builds the failed terminal state with its cause.
func FailureComplete(err error) *Completed {
	return &Completed{err: err}
}

// Run finalizes the job, dispatching on success vs failure.
func (c *Completed) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	// TODO consider persisting the execution record
	if c.err == nil {
		c.handleSuccess(rtx, jo)
	} else {
		c.handleFailed(rtx, jo)
	}
}

// Dump serializes this state; the error is rendered as a string ("" on success).
func (s *Completed) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	err := ""
	if s.err != nil {
		err = s.err.Error()
	}
	return &jobmod.CompletedDump{
		Error: err,
	}
}
// handleSuccess logs the success, broadcasts the completion event to the job
// set, and finalizes the job in the manager.
func (c *Completed) handleSuccess(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	// fixed: log typo "successfuly" -> "successfully"
	logger.WithField("JobID", job.JobID).Infof("job completed successfully")
	rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
	rtx.Mgr.JobCompleted(job)
}

// handleFailed logs the failure together with the state the job failed in,
// then broadcasts the completion event and finalizes the job.
func (c *Completed) handleFailed(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	logger.
		WithField("JobID", job.JobID).
		WithField("LastState", reflect.TypeOf(rtx.LastState).String()).
		Infof("job failed with: %v", c.err)
	rtx.Mgr.BroadcastEvent(job.JobSetID, event.NewJobCompleted(job, c.err))
	rtx.Mgr.JobCompleted(job)
}

View File

@ -0,0 +1,522 @@
package state
import (
"context"
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/executormgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
"path/filepath"
"time"
"github.com/samber/lo"
"gitlink.org.cn/cloudream/common/pkgs/future"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
"gitlink.org.cn/cloudream/common/pkgs/logger"
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
)
// JobExecuting is the job state in which the job's task is actually running
// at the target computing center.
type JobExecuting struct {
	lastStatus pcmsdk.TaskStatus // most recently observed task status
}

// NewNormalJobExecuting creates the executing state with an initial "Begin" status.
func NewNormalJobExecuting() *JobExecuting {
	return &JobExecuting{
		lastStatus: "Begin",
	}
}
// Run executes the job's task and transitions to the success or failure
// terminal state depending on the outcome.
func (s *JobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, SuccessComplete())
	}
}

// Dump serializes this state, recording the last observed task status.
func (s *JobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobExecutingDump{
		TaskStatus: s.lastStatus,
	}
}
// do submits the job's task according to its concrete body and subtype:
// normal jobs, data preprocessing, model fine-tuning, and inference instances.
// NOTE(review): inside each case, ":=" redeclares err, shadowing the outer
// err declared below — so "return err" at the end returns nil even when a
// submit call inside a case failed (those failures are only logged). Confirm
// whether submit errors are meant to propagate.
func (s *JobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	// TODO UserID is hard-coded to 1
	userID := cdssdk.UserID(1)
	err := error(nil)
	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		switch runningJob.SubType {
		case schsdk.JobTypeNormal: // plain batch job
			pcmImgInfo, err := rtx.Mgr.DB.PCMImage().GetByImageIDAndCCID(rtx.Mgr.DB.DefCtx(), runningJob.Files.Image.ImageID, runningJob.TargetCCID)
			if err != nil {
				return fmt.Errorf("getting pcm image info: %w", err)
			}
			ress, err := rtx.Mgr.DB.CCResource().GetByCCID(rtx.Mgr.DB.DefCtx(), runningJob.TargetCCID)
			if err != nil {
				return fmt.Errorf("getting computing center resource: %w", err)
			}
			if len(ress) == 0 {
				return fmt.Errorf("no resource found at computing center %v", runningJob.TargetCCID)
			}
			ccInfo, _, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
			if err != nil {
				return fmt.Errorf("getting storage info: %w", err)
			}
			dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
			err = s.submitNormalTask(rtx, cmd, envs, *ccInfo, pcmImgInfo, ress[0].PCMResourceID)
			if err != nil {
				logger.Error(err.Error())
			}
		case schsdk.JobTypeDataPreprocess: // data preprocessing
			ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
			if err != nil {
				return fmt.Errorf("getting storage info: %w", err)
			}
			dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
			instID, err := s.submitDataPreprocessTask(rtx, cmd, envs, *ccInfo, getStg.StorageID, userID)
			if err != nil {
				logger.Error(err.Error())
			}
			// Remember the ECS instance so fine-tuning can reuse the machine.
			runningJob.ECSInstanceID = schsdk.ECSInstanceID(instID)
		case schsdk.JobTypeFinetuning: // model fine-tuning
			ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
			if err != nil {
				return fmt.Errorf("getting storage info: %w", err)
			}
			dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
			// Load the prepared dataset into storage (OSS) when it was produced
			// on an ECS instance by a preprocessing step.
			if runningJob.Files.Dataset.ECSInstanceID != "" {
				logger.Infof("instance id: %v", runningJob.ECSInstanceID)
				dataSetPath, err = loadDatasetPackage(userID, runningJob.Files.Dataset.PackageID, getStg.StorageID)
				if err != nil {
					return fmt.Errorf("loading dataset package: %w", err)
				}
			}
			cmd, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
			err = s.submitFinetuningTask(userID, rtx, cmd, envs, *ccInfo, getStg.StorageID, runningJob)
			if err != nil {
				logger.Error(err.Error())
			}
		}
	case *job.InstanceJob: // inference instance
		ccInfo, getStg, err := getCCInfoAndStgInfo(rtx, runningJob.TargetCCID, userID)
		if err != nil {
			return fmt.Errorf("getting storage info: %w", err)
		}
		dataSetPath := getDataSetPathByID(runningJob.Files.Dataset.PackageID)
		_, envs := getRuntimeCommand(runningJob.Info.Runtime, dataSetPath, runningJob.OutputPath, "getStg.RemoteBase", *ccInfo)
		err = s.submitInstanceTask(rtx, jo, runningJob, *ccInfo, getStg.StorageID, userID, envs)
		if err != nil {
			logger.Error(err.Error())
			// Creation failed: remove this instance from the multi-instance job.
			postDeleteInstanceEvent(rtx, jo, runningJob)
		}
	}
	return err
}
// getDataSetPathByID builds the relative storage path of a dataset package.
// TODO temporary: this path should come from CDS instead of being assembled here.
func getDataSetPathByID(packageID cdssdk.PackageID) string {
	return filepath.Join("packages", "1", fmt.Sprintf("%v", packageID))
}
// loadDatasetPackage asks the cloudream storage service to load the given
// package onto the given storage and returns the full local path of the
// loaded package.
func loadDatasetPackage(userID cdssdk.UserID, packageID cdssdk.PackageID, storageID cdssdk.StorageID) (string, error) {
	cli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return "", err
	}
	defer schglb.CloudreamStoragePool.Release(cli)

	resp, err := cli.StorageLoadPackage(cdsapi.StorageLoadPackageReq{
		UserID:    userID,
		PackageID: packageID,
		StorageID: storageID,
	})
	if err != nil {
		return "", err
	}

	logger.Info("load pacakge path: " + resp.FullPath)
	return resp.FullPath, nil
}
// submitNormalTask starts a SubmitTask on the executor manager for the given
// computing center and blocks, polling the task's status stream until the
// task reports success or failure.
func (s *JobExecuting) submitNormalTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, pcmImgInfo schmod.PCMImage, resourceID pcmsdk.ResourceID) error {
	task, err := rtx.Mgr.ExecMgr.StartTask(exetsk.NewSubmitTask(
		ccInfo.PCMParticipantID,
		pcmImgInfo.PCMImageID,
		// TODO algorithm for choosing a resource
		resourceID,
		cmd,
		envs,
		// params: TODO params should be a string array rather than a KV array
		[]schsdk.KVPair{},
	), ccInfo)
	if err != nil {
		logger.Error(err.Error())
		return err
	}
	taskFut := task.Receive()
	for {
		msg := <-taskFut.Chan()
		tskStatus := msg.Value.Status.(*exetsk.SubmitTaskStatus)
		// Log only on status transitions to avoid flooding the log.
		if tskStatus.Status != s.lastStatus {
			logger.Infof("task %s -> %s", s.lastStatus, tskStatus.Status)
		}
		s.lastStatus = tskStatus.Status
		switch tskStatus.Status {
		case pcmsdk.TaskStatusSuccess:
			return nil
		case "Completed":
			// NOTE(review): magic string — presumably a success status reported
			// by some participants; confirm against the pcmsdk status constants.
			return nil
		case pcmsdk.TaskStatusFailed:
			return fmt.Errorf("task failed")
		}
		// Re-arm the future to receive the next status update.
		taskFut = task.Receive()
	}
}
// submitDataPreprocessTask starts a data-preprocess task on the executor and
// waits for its first status report, returning the created ECS instance ID.
func (s *JobExecuting) submitDataPreprocessTask(rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, userID cdssdk.UserID) (string, error) {
	objStg, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
	if err != nil {
		logger.Error(err.Error())
		return "", fmt.Errorf("getting object storage info: %w", err)
	}

	tsk := exetsk.NewSchedulerDataPreprocess(userID, cmd, envs, objStg)
	started, err := rtx.Mgr.ExecMgr.StartTask(tsk, ccInfo)
	if err != nil {
		logger.Error(err.Error())
		return "", err
	}

	// Block until the task posts its first status.
	msg := <-started.Receive().Chan()
	status := msg.Value.Status.(*exetsk.SchedulerDataPreprocessStatus)
	if status.Error != nil {
		logger.Error(status.Error.Error())
		return "", status.Error
	}
	return status.InstanceID, nil
}
// submitFinetuningTask starts a model-finetuning task on the executor and
// waits for its first status report.
func (s *JobExecuting) submitFinetuningTask(userID cdssdk.UserID, rtx jobmgr.JobStateRunContext, cmd string, envs []schsdk.KVPair, ccInfo schmod.ComputingCenter, storageID cdssdk.StorageID, runningJob *job.NormalJob) error {
	objStg, modelInfo, err := getModelInfoAndObjectStorage(rtx, runningJob.Info.ModelJobInfo.ModelID, storageID)
	if err != nil {
		return fmt.Errorf("getting model info and object storage: %w", err)
	}

	tsk := exetsk.NewSchedulerModelFinetuning(
		userID,
		cmd,
		*objStg,
		*modelInfo,
		envs,
		string(runningJob.Files.Dataset.ECSInstanceID),
	)
	started, err := rtx.Mgr.ExecMgr.StartTask(tsk, ccInfo)
	if err != nil {
		logger.Error(err.Error())
		return err
	}

	// Block until the task posts its first status.
	msg := <-started.Receive().Chan()
	status := msg.Value.Status.(*exetsk.SchedulerModelFinetuningStatus)
	if status.Error != nil {
		logger.Error(status.Error.Error())
		return status.Error
	}
	return nil
}
// submitInstanceTask starts a create-ECS task for an inference instance and
// then loops, servicing two streams concurrently:
//   - Update events posted to this job, forwarded to the executor as task
//     operations;
//   - status messages from the task itself, which drive node bookkeeping
//     (expand / shrink / pause / run / GPU monitoring).
//
// NOTE(review): the loop has no exit path other than returning on error —
// confirm that terminal task states always surface as errors here.
func (s *JobExecuting) submitInstanceTask(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runningJob *job.InstanceJob, ccInfo schmod.ComputingCenter,
	storageID cdssdk.StorageID, userID cdssdk.UserID, envs []schsdk.KVPair) error {
	modelJobInfo := runningJob.Info.ModelJobInfo
	objectStorage, modelInfo, err := getModelInfoAndObjectStorage(rtx, modelJobInfo.ModelID, storageID)
	if err != nil {
		return fmt.Errorf("getting model info and object storage: %w", err)
	}
	// Submit the scale-out (create ECS) task.
	ecs := exetsk.NewScheduleCreateECS(
		userID,
		runningJob.Info.Runtime.Command+"\\n"+modelJobInfo.Command,
		*objectStorage,
		*modelInfo,
		envs,
	)
	task, err := rtx.Mgr.ExecMgr.StartTask(ecs, ccInfo)
	if err != nil {
		logger.Error(err.Error())
		return err
	}
	waitFut := event.BeginWaitType[*event.Update](rtx.EventSet)
	taskFut := task.Receive()
	for {
		select {
		case v1 := <-waitFut.Chan():
			// An Update event arrived: forward the operation to the executor.
			client, err := executormgr.ExecutorPool.AcquireByUrl(ccInfo.ExecutorURL)
			if err != nil {
				return fmt.Errorf("getting executor client: %w", err)
			}
			evt := v1.Value.(*event.Update)
			operateResp, err := client.OperateTask(executor.NewTaskOperateInfo(task.ID(), evt.Operate, evt.Runtime))
			if err != nil {
				return fmt.Errorf("operate task: %w", err)
			}
			// Report the outcome back to whoever posted the event.
			evt.Result.SetValue(event.UpdateResult{
				Err: operateResp.Err,
			})
			if operateResp.Err != nil {
				return fmt.Errorf("operate task: %w", operateResp.Err)
			}
			// Keep waiting for further Update events.
			waitFut = event.BeginWaitType[*event.Update](rtx.EventSet)
		case msg := <-taskFut.Chan():
			switch v2 := msg.Value.Status.(type) {
			case *exetsk.ScheduleCreateECSStatus:
				if v2.Error != "" {
					logger.Error("update task fail, error: " + v2.Error)
					if v2.Operate == schsdk.CreateECS || v2.Operate == schsdk.Invalid {
						// Creation failed or instance unusable: treat it as a
						// destroy so the removal branch below runs.
						v2.Operate = schsdk.DestroyECS
					} else {
						continue
					}
				}
				switch v2.Operate {
				case schsdk.CreateECS:
					// Scale-out succeeded: register the new node in the pool.
					node := schsdk.NodeInfo{
						InstanceID: jo.JobID,
						Address:    schsdk.Address(v2.Result),
						Status:     schsdk.RunECS,
					}
					rtx.Mgr.NodeSvc.SetNodeData(jo.JobSetID, modelJobInfo, node)
					logger.Infof("node expansion: %v", v2.Result)
				case schsdk.DestroyECS:
					// Scale-in: remove the node from the node list ...
					rtx.Mgr.NodeSvc.RemoveNodeFromRunningModels(modelJobInfo, jo.JobID)
					// ... and delete this instance from the multi-instance job.
					postDeleteInstanceEvent(rtx, jo, runningJob)
				case schsdk.PauseECS:
					// Update the node status to paused.
					rtx.Mgr.NodeSvc.UpdateNodeFromRunningModels(modelJobInfo, jo.JobID, schsdk.PauseECS)
				case schsdk.RunECS:
					// Update the node status to running.
					rtx.Mgr.NodeSvc.UpdateNodeFromRunningModels(modelJobInfo, jo.JobID, schsdk.RunECS)
				case schsdk.OperateServer:
					println()
				case schsdk.GPUMonitor:
					rtx.Mgr.NodeSvc.SetNodeUsageRateInfo(jo.JobID, v2.Result)
				}
			case error:
				fmt.Println("Received error:", v2.Error())
			default:
				fmt.Println("Received unexpected type")
			}
			// Re-arm the future to receive the next status message.
			taskFut = task.Receive()
		}
	}
}
// getModelInfoAndObjectStorage loads the object storage bound to storageID
// and the model record for modelID on that storage.
//
// BUG fix: the original checked `&modelInfo == nil`, which is always false
// (the address of a local variable is never nil), and it called err.Error()
// inside that branch before err had been checked, risking a nil-pointer
// panic. The error from GetModelByID is now checked first.
func getModelInfoAndObjectStorage(rtx jobmgr.JobStateRunContext, modelID schsdk.ModelID, storageID cdssdk.StorageID) (*schmod.ObjectStorage, *schmod.ModelResource, error) {
	objectStorage, err := rtx.Mgr.DB.ObjectStorage().GetObjectStorageByStorageID(rtx.Mgr.DB.DefCtx(), storageID)
	if err != nil {
		logger.Error(err.Error())
		return nil, nil, fmt.Errorf("getting object storage info: %w", err)
	}

	// Check whether the model has been preloaded into the database.
	modelInfo, err := rtx.Mgr.DB.Models().GetModelByID(rtx.Mgr.DB.DefCtx(), modelID, objectStorage.ID)
	if err != nil {
		logger.Error(err.Error())
		return nil, nil, fmt.Errorf("getting model info: %w", err)
	}

	return &objectStorage, &modelInfo, nil
}
// postDeleteInstanceEvent asks the parent multi-instance job to remove this
// instance, waiting (best-effort) for the operation to be acknowledged.
func postDeleteInstanceEvent(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job, runningJob *job.InstanceJob) {
	fut := future.NewSetValue[event.OperateInstanceResult]()
	info := event.InstanceDeleteInfo{
		InstanceID: jo.JobID,
	}
	rtx.Mgr.PostEvent(runningJob.ParentJobID, event.NewInstanceOperate(&info, fut))
	// Deliberately ignore the result: deletion is best-effort here.
	_, _ = fut.Wait(context.TODO())
}
// getRuntimeCommand builds the command and environment variables for a job
// run. If the computing center does not support configuring environment
// variables (NoEnvBootstrap), the bootstrap script is used as the command and
// the real command plus "K=V" env pairs are collected as script parameters.
//
// NOTE(review): `params` is populated in the NoEnvBootstrap branch but never
// returned or otherwise used — this looks like an unfinished feature or a
// bug; confirm whether params should be part of the return value.
func getRuntimeCommand(runtime schsdk.JobRuntimeInfo, dataSetPath string, outputPath string, remoteBase string, ccInfo schmod.ComputingCenter) (string, []schsdk.KVPair) {
	var envs []schsdk.KVPair
	var params []string
	var cmd string
	// Standard data-in/data-out paths are always exported to the job.
	envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataInEnv, Value: filepath.Join(remoteBase, dataSetPath)})
	envs = append(envs, schsdk.KVPair{Key: schsdk.JobDataOutEnv, Value: filepath.Join(remoteBase, outputPath)})
	envs = append(envs, runtime.Envs...)
	switch boot := ccInfo.Bootstrap.(type) {
	case *schsdk.DirectBootstrap:
		cmd = runtime.Command
	case *schsdk.NoEnvBootstrap:
		// No env support: run the bootstrap script, passing command and envs as arguments.
		cmd = boot.ScriptFileName
		params = append(params, runtime.Command)
		envMap := lo.Map(envs, func(env schsdk.KVPair, _ int) string {
			return fmt.Sprintf("%s=%s", env.Key, env.Value)
		})
		params = append(params, envMap...)
	default:
		cmd = runtime.Command
	}
	return cmd, envs
}
// getCCInfoAndStgInfo looks up the computing center record and then queries
// CDS for the storage bound to that center.
func getCCInfoAndStgInfo(rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, userID cdssdk.UserID) (*schmod.ComputingCenter, *cdsapi.StorageGetResp, error) {
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), targetCCID)
	if err != nil {
		return nil, nil, fmt.Errorf("getting computing center info: %w", err)
	}

	stgCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return nil, nil, fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(stgCli)

	stgResp, err := stgCli.StorageGet(cdsapi.StorageGet{
		UserID:    userID,
		StorageID: ccInfo.CDSStorageID,
	})
	if err != nil {
		return nil, nil, fmt.Errorf("request to cds: %w", err)
	}
	return &ccInfo, stgResp, nil
}
// DataReturnJobExecuting is the job state that uploads a finished job's
// output back to storage as a new package.
type DataReturnJobExecuting struct {
}

// NewDataReturnJobExecuting creates a DataReturnJobExecuting state.
func NewDataReturnJobExecuting() *DataReturnJobExecuting {
	return &DataReturnJobExecuting{}
}
// Run executes the data-return work and moves the job to a terminal state.
func (s *DataReturnJobExecuting) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	if err := s.do(rtx, jo); err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
		return
	}
	rtx.Mgr.ChangeState(jo, SuccessComplete())
}
// Dump serializes this state for persistence/inspection.
func (s *DataReturnJobExecuting) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.DataReturnExecutingDump{}
}
// do packages the target job's output directory into a new storage package
// and records the resulting package ID on the job body.
//
// Fixes: the typed logger was created with the wrong type parameter
// (JobExecuting instead of DataReturnJobExecuting), and a dead duplicate
// `if err != nil` check after task.Receive() has been removed.
func (s *DataReturnJobExecuting) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	reJob := jo.Body.(*job.DataReturnJob)
	// TODO UserID is hard-coded for now.
	userID := cdssdk.UserID(1)
	log := logger.WithType[DataReturnJobExecuting]("State").WithField("JobID", jo.JobID)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Cancel the context when a Cancel event arrives.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), reJob.TargetJobCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	packageName := utils.MakeResourcePackageName(reJob.TargetJobID)
	logger.Info("TargetJobOutputPath: " + reJob.TargetJobOutputPath + ", and packageName: " + packageName)

	// TODO replace this fixed delay with an explicit signal that the target
	// job's output files are fully written.
	time.Sleep(30 * time.Second)

	task, err := rtx.Mgr.ExecMgr.StartTask(exetsk.NewStorageCreatePackage(
		userID, // TODO user ID
		ccInfo.CDSStorageID,
		reJob.TargetJobOutputPath,
		reJob.Info.BucketID,
		packageName,
	), ccInfo)
	if err != nil {
		log.Error(err.Error())
		return err
	}

	// Block until the task posts its first (terminal) status.
	status := <-task.Receive().Chan()
	tskStatus := status.Value.Status.(*exetsk.StorageCreatePackageStatus)
	if tskStatus.Error != "" {
		return fmt.Errorf("creating package: %s", tskStatus.Error)
	}

	log.Infof("the outputs of job %v has been updated as a package %v(%v)", reJob.TargetJobID, packageName, tskStatus.PackageID)
	reJob.DataReturnPackageID = tskStatus.PackageID
	return nil
}

View File

@ -0,0 +1,54 @@
package state
import (
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)
// MakingAdjustScheme is the job state in which an adjustment (re-scheduling)
// scheme is produced for the job.
type MakingAdjustScheme struct {
}

// NewMakeingAdjustScheme creates a MakingAdjustScheme state.
// NOTE(review): "Makeing" is a typo, but the name is exported and referenced
// elsewhere in this package, so it is kept for compatibility.
func NewMakeingAdjustScheme() *MakingAdjustScheme {
	return &MakingAdjustScheme{}
}
// Run computes the adjustment scheme and advances the job accordingly.
func (s *MakingAdjustScheme) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	scheme, err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
		return
	}
	rtx.Mgr.ChangeState(jo, NewAdjusting(*scheme))
}
// do produces the adjustment scheme for the job. The advisor-based
// implementation is currently disabled (kept below for reference).
//
// BUG fix: the stub used to return (nil, nil); Run then dereferenced the nil
// scheme (`*scheme`) and panicked. It now returns an empty scheme so the
// state machine can proceed safely until the advisor flow is restored.
func (s *MakingAdjustScheme) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) (*jobmod.JobScheduleScheme, error) {
	//ctx, cancel := context.WithCancel(context.Background())
	//defer cancel()
	//
	//// 监听取消事件
	//go func() {
	//	event.WaitType[*event.Cancel](ctx, rtx.EventSet)
	//	cancel()
	//}()
	//
	//wt := rtx.Mgr.AdvMgr.StartTask(advtsk.NewMakeAdjustScheme(jo.Dump(rtx, jo, s)))
	//defer wt.Close()
	//
	//status, err := wt.Receive(ctx)
	//if err != nil {
	//	return nil, fmt.Errorf("making adjust scheme: %w", err)
	//}
	//
	//mkStatus := status.(*advtsk.MakeAdjustSchemeStatus)
	//if mkStatus.Error != "" {
	//	return nil, fmt.Errorf("making adjust scheme: %s", mkStatus.Error)
	//}
	//
	//return &mkStatus.Scheme, nil
	return &jobmod.JobScheduleScheme{}, nil
}
// Dump serializes this state for persistence/inspection.
func (s *MakingAdjustScheme) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MakeingAdjustSchemeDump{}
}

View File

@ -0,0 +1,66 @@
package state
import (
"context"
"fmt"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
)
// MultiInstanceInit is the initial state of a multi-instance job: it creates
// the first instance before the job enters its running loop.
type MultiInstanceInit struct {
}

// NewMultiInstanceInit creates a MultiInstanceInit state.
func NewMultiInstanceInit() *MultiInstanceInit {
	return &MultiInstanceInit{}
}
// Run executes the initialization logic for the multi-instance job.
func (s *MultiInstanceInit) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	s.do(rtx, job)
}
// do creates the first instance of the multi-instance job, registers its job
// ID as a sub-job, and advances the parent to the MultiInstanceRunning state.
func (s *MultiInstanceInit) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	multInstJob := jo.Body.(*job.MultiInstanceJob)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Cancel the context when a Cancel event arrives.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()
	// Each instance gets a unique local job ID derived from the parent's.
	newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID())
	instJobInfo := &schsdk.InstanceJobInfo{
		Type:         schsdk.JobTypeInstance,
		LocalJobID:   newLocalJobID,
		Files:        multInstJob.Info.Files,
		Runtime:      multInstJob.Info.Runtime,
		Resources:    multInstJob.Info.Resources,
		ModelJobInfo: multInstJob.Info.ModelJobInfo,
	}
	files := jobmod.JobFiles{
		Dataset: multInstJob.Files.Dataset,
		Code:    multInstJob.Files.Code,
		Image:   multInstJob.Files.Image,
	}
	// Create the instance job and start running it.
	instanceJob := job.NewInstanceJob(*instJobInfo, files, jo.JobID)
	jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreSchuduling(multInstJob.PreScheduler))
	// Record the new instance's job ID on the multi-instance job.
	multInstJob.SubJobs = append(multInstJob.SubJobs, jobID)
	rtx.Mgr.ChangeState(jo, NewMultiInstanceRunning(prescheduler.NewDefaultPreScheduler()))
}
// Dump serializes this state for persistence/inspection.
func (s *MultiInstanceInit) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MultiInstCreateInitDump{}
}

View File

@ -0,0 +1,206 @@
package state
import (
"context"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
"strings"
"sync"
"time"
)
// MultiInstanceRunning is the steady state of a multi-instance job: it
// services instance create/update/delete requests while instances run.
type MultiInstanceRunning struct {
	// preScheduler produces the schedule scheme for newly created instances.
	preScheduler prescheduler.PreScheduler
}

// NewMultiInstanceRunning creates a MultiInstanceRunning state using the
// given pre-scheduler.
func NewMultiInstanceRunning(preScheduler prescheduler.PreScheduler) *MultiInstanceRunning {
	return &MultiInstanceRunning{
		preScheduler: preScheduler,
	}
}
// Run enters the multi-instance event loop.
func (s *MultiInstanceRunning) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	s.do(rtx, job)
}
// do is the long-running event loop of a multi-instance job: it launches a
// background poller for instance metrics and then services instance
// create/update/delete requests posted as InstanceOperate events.
//
// NOTE(review): the pollingInstance goroutine is started without the cancel
// context and never stops — confirm whether it should observe ctx.
func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()
	multInstJob := jo.Body.(*job.MultiInstanceJob)
	go pollingInstance(rtx, multInstJob)
	waitFut := event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
	for {
		chanValue := <-waitFut.Chan()
		instanceInfo := chanValue.Value.(*event.InstanceOperate)
		instanceFuture := instanceInfo.Result
		logger.Info("wait a event happened")
		// Re-arm immediately so no event is lost while handling this one.
		waitFut = event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
		switch info := instanceInfo.Info.(type) {
		case *event.InstanceCreateInfo:
			createInstance(rtx, info, s.preScheduler, jo, multInstJob, instanceFuture)
		case *event.InstanceUpdateInfo:
			subJobs := info.Info.SubJobs
			// Finetuning updates are special-cased: apply them to the parent
			// job's template and fan out to all current sub-jobs.
			if info.Info.UpdateType == schsdk.FineTuning {
				multInstJob.Info.ModelJobInfo.Command = info.Info.Runtime.Command
				// Drop any existing finetuning output path to avoid conflicts.
				for i := 0; i < len(multInstJob.Info.Runtime.Envs); i++ {
					if multInstJob.Info.Runtime.Envs[i].Key == schsdk.FinetuningOutEnv {
						multInstJob.Info.Runtime.Envs = append(multInstJob.Info.Runtime.Envs[:i], multInstJob.Info.Runtime.Envs[i+1:]...)
					}
				}
				multInstJob.Info.Runtime.Envs = append(multInstJob.Info.Runtime.Envs, info.Info.Runtime.Envs...)
				subJobs = multInstJob.SubJobs
			}
			updateInstance(rtx, info, subJobs, instanceFuture)
		case *event.InstanceDeleteInfo:
			deleteInstance(multInstJob, info.InstanceID)
		}
	}
}
// deleteInstance removes the first occurrence of instanceID from the
// multi-instance job's sub-job list; it is a no-op when the ID is absent.
func deleteInstance(multiJob *job.MultiInstanceJob, instanceID schsdk.JobID) {
	for idx, id := range multiJob.SubJobs {
		if id == instanceID {
			multiJob.SubJobs = append(multiJob.SubJobs[:idx], multiJob.SubJobs[idx+1:]...)
			return
		}
	}
}
// pollingInstance periodically (every 30s) posts a GPUMonitor update event to
// every sub-job instance so their usage metrics (e.g. GPU) are refreshed.
//
// NOTE(review): this loop never terminates, and it reads multiJob.SubJobs
// while the event loop goroutine mutates it — a potential data race; confirm
// whether it should take a context and snapshot SubJobs under a lock.
func pollingInstance(rtx jobmgr.JobStateRunContext, multiJob *job.MultiInstanceJob) {
	for {
		time.Sleep(time.Second * 30)
		for i := 0; i < len(multiJob.SubJobs); i++ {
			instanceID := multiJob.SubJobs[i]
			logger.Info("polling instanceID: " + instanceID)
			go func() {
				fut := future.NewSetValue[event.UpdateResult]()
				rtx.Mgr.PostEvent(instanceID, event.NewUpdate(schsdk.JobRuntimeInfo{}, schsdk.GPUMonitor, fut))
				_, err := fut.Wait(context.TODO())
				if err != nil {
					logger.Error(err.Error())
				}
				println()
			}()
		}
	}
}
// updateInstance fans an update request out to every given sub-job instance
// in parallel, waits for all of them, and reports the IDs of the instances
// that failed through the supplied future.
//
// BUG fix: failJobs was appended to from multiple goroutines without
// synchronization (a data race); appends are now guarded by a mutex. A stray
// debug println() was also removed.
func updateInstance(rtx jobmgr.JobStateRunContext, updateInfo *event.InstanceUpdateInfo, subJobs []schsdk.JobID, updateInstanceFuture event.OperateInstanceFuture) {
	var (
		mu       sync.Mutex
		failJobs []string
		wg       sync.WaitGroup
	)
	for i := 0; i < len(subJobs); i++ {
		// Post the update to each instance concurrently.
		instanceID := subJobs[i]
		wg.Add(1)
		go func() {
			defer wg.Done()
			fut := future.NewSetValue[event.UpdateResult]()
			rtx.Mgr.PostEvent(instanceID, event.NewUpdate(updateInfo.Info.Runtime, schsdk.RestartServer, fut))
			if _, err := fut.Wait(context.TODO()); err != nil {
				logger.Error(err.Error())
				mu.Lock()
				failJobs = append(failJobs, string(instanceID))
				mu.Unlock()
			}
		}()
	}
	wg.Wait()

	if len(failJobs) == 0 {
		updateInstanceFuture.SetValue(event.OperateInstanceResult{
			Err: nil,
		})
		return
	}

	// Report the instances that failed to update.
	result := strings.Join(failJobs, ",")
	updateInstanceFuture.SetValue(event.OperateInstanceResult{
		OperateResult: result,
		Err:           fmt.Errorf("error"),
	})
}
// createInstance builds a new instance job from the multi-instance job's
// template, pre-schedules it, registers it as a sub-job, and returns the new
// job ID plus the file upload scheme through the supplied future.
func createInstance(rtx jobmgr.JobStateRunContext, info *event.InstanceCreateInfo, preScheduler prescheduler.PreScheduler, jo *jobmgr.Job, multInstJob *job.MultiInstanceJob, future event.OperateInstanceFuture) {
	dataSet := info.DataSet
	// For model scale-out, reuse the parent job's dataset directly.
	// NOTE(review): `&multInstJob.Info.ModelJobInfo != nil` is always true
	// (the address of a struct field is never nil), so info.DataSet is always
	// overridden — confirm whether a zero-value check was intended here.
	if &multInstJob.Info.ModelJobInfo != nil {
		dataSet = multInstJob.Info.Files.Dataset
	}
	// Build the InstanceJobInfo from the parent's template.
	infoFiles := schsdk.JobFilesInfo{
		Dataset: dataSet,
		Code:    multInstJob.Info.Files.Code,
		Image:   multInstJob.Info.Files.Image,
	}
	// Each instance gets a unique local job ID derived from the parent's.
	newLocalJobID := fmt.Sprintf("%s_%s", multInstJob.Info.LocalJobID, utils.GenerateRandomID())
	instJobInfo := &schsdk.InstanceJobInfo{
		Type:         schsdk.JobTypeInstance,
		LocalJobID:   newLocalJobID,
		Files:        infoFiles,
		Runtime:      multInstJob.Info.Runtime,
		Resources:    multInstJob.Info.Resources,
		ModelJobInfo: multInstJob.Info.ModelJobInfo,
	}
	files := jobmod.JobFiles{
		Code:  multInstJob.Files.Code,
		Image: multInstJob.Files.Image,
	}
	// Produce the pre-schedule scheme and the file upload scheme.
	jobSchedule, filesUploadScheme, err := preScheduler.ScheduleJob(instJobInfo)
	if err != nil {
		future.SetError(err)
		return
	}
	// Create the instance job and start running it.
	instanceJob := job.NewInstanceJob(*instJobInfo, files, jo.JobID)
	jobID := rtx.Mgr.AddJob(jo.JobSetID, instanceJob, NewPreSchuduling(*jobSchedule))
	// Record the new instance's job ID on the multi-instance job.
	multInstJob.SubJobs = append(multInstJob.SubJobs, jobID)
	// Return the instance ID and the upload scheme to the caller.
	future.SetValue(event.OperateInstanceResult{
		JobID:             jobID,
		FilesUploadScheme: *filesUploadScheme,
	})
}
// Dump serializes this state for persistence/inspection.
func (s *MultiInstanceRunning) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MultiInstCreateRunningDump{}
}

View File

@ -0,0 +1,124 @@
package state
import (
"context"
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// MultiInstanceUpdate is the job state that pushes an update (e.g. a
// finetuned model) to every instance of an existing multi-instance job.
type MultiInstanceUpdate struct {
	// originalJob is the dump of the multi-instance job being updated.
	originalJob jobmod.JobDump
}

// NewMultiInstanceUpdate creates a MultiInstanceUpdate state targeting the
// given original job.
func NewMultiInstanceUpdate(originalJob jobmod.JobDump) *MultiInstanceUpdate {
	return &MultiInstanceUpdate{
		originalJob: originalJob,
	}
}
// Run executes the update and logs a failure if it occurs.
func (s *MultiInstanceUpdate) Run(rtx jobmgr.JobStateRunContext, job *jobmgr.Job) {
	if err := s.do(rtx, job); err != nil {
		logger.Error("update multi instance failed: %s", err)
	}
}
// do applies an update to every instance of the original multi-instance job.
// For finetuning updates it first waits for the data-return job carrying the
// finetuned output, loads that package onto the target storage, and exports
// its path via the FinetuningOutEnv environment variable.
//
// Fixes: the error returned by StorageGet was silently ignored; dtrJob was
// dereferenced unconditionally and would panic when the dataset was not a
// DataReturnJobFileInfo; a redundant second type assertion of jo.Body and a
// stray debug println were removed.
func (s *MultiInstanceUpdate) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	updateJob := jo.Body.(*job.UpdateMultiInstanceJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Listen for cancel events.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	var fullPath string
	if updateJob.Info.UpdateType == schsdk.FineTuning {
		var dtrJob *job.DataReturnJob
		// Wait for the data-return job to complete.
		if rt, ok := updateJob.Info.Files.Dataset.(*schsdk.DataReturnJobFileInfo); ok {
			evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool {
				return val.Job.GetInfo().GetLocalJobID() == rt.DataReturnLocalJobID
			})
			if !ok {
				return jobmgr.ErrJobCancelled
			}
			if evt.Err != nil {
				return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
			}
			dtrJob, ok = evt.Job.Body.(*job.DataReturnJob)
			if !ok {
				return fmt.Errorf("job %s is not a DataReturn job(which is %T)", evt.Job.JobID, evt.Job)
			}
		}
		// Guard against a nil dereference when the dataset was not a
		// data-return file.
		if dtrJob == nil {
			return fmt.Errorf("finetuning update requires a data-return dataset")
		}

		stgCli, err := schglb.CloudreamStoragePool.Acquire()
		if err != nil {
			return fmt.Errorf("new cloudream storage client: %w", err)
		}
		defer schglb.CloudreamStoragePool.Release(stgCli)

		ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), dtrJob.TargetJobCCID)
		if err != nil {
			return fmt.Errorf("getting computing center info: %w", err)
		}

		// TODO UserID is hard-coded for now.
		userID := cdssdk.UserID(1)
		getStg, err := stgCli.StorageGet(cdsapi.StorageGet{
			UserID:    userID,
			StorageID: ccInfo.CDSStorageID,
		})
		if err != nil {
			return fmt.Errorf("getting storage info: %w", err)
		}

		loadPackageResp, err := stgCli.StorageLoadPackage(cdsapi.StorageLoadPackageReq{
			UserID:    userID,
			PackageID: dtrJob.DataReturnPackageID,
			StorageID: getStg.StorageID,
		})
		if err != nil {
			return fmt.Errorf("loading package: %w", err)
		}
		logger.Info("load pacakge path: " + loadPackageResp.FullPath)
		fullPath = loadPackageResp.FullPath
	}

	// Post the update event so every instance picks up the new runtime.
	updateJob.Info.Runtime.Envs = append(updateJob.Info.Runtime.Envs, schsdk.KVPair{Key: schsdk.FinetuningOutEnv, Value: fullPath})
	updateInfo := event.InstanceUpdateInfo{
		Info: updateJob.Info,
	}
	fut := future.NewSetValue[event.OperateInstanceResult]()
	rtx.Mgr.PostEvent(s.originalJob.JobID, event.NewInstanceOperate(&updateInfo, fut))

	result, err := fut.Wait(context.TODO())
	if err != nil {
		return err
	}
	if result.Err != nil {
		return fmt.Errorf("update instance failed: %s", result.OperateResult)
	}
	logger.Info("update instance success!")
	return nil
}
// Dump serializes this state for persistence/inspection.
func (s *MultiInstanceUpdate) Dump(ctx jobmgr.JobStateRunContext, job *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.MultiInstanceUpdateDump{}
}

View File

@ -0,0 +1,287 @@
package state
import (
"context"
"errors"
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
"sync"
"time"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// PreScheduling is the job state that stages a job's files (dataset, code,
// image) onto the target computing center chosen by the schedule scheme.
type PreScheduling struct {
	// scheme is the schedule scheme produced by the pre-scheduler.
	scheme jobmod.JobScheduleScheme
	// targetCCInfo caches the target computing center record; set in Run.
	targetCCInfo schmod.ComputingCenter
}

// NewPreSchuduling creates a PreScheduling state for the given scheme.
// NOTE(review): "Schuduling" is a typo, but the name is exported and
// referenced elsewhere in this package, so it is kept for compatibility.
func NewPreSchuduling(scheme jobmod.JobScheduleScheme) *PreScheduling {
	return &PreScheduling{
		scheme: scheme,
	}
}
// Run records the target computing center on the job body, then schedules the
// dataset, code and image files concurrently; the first failure cancels the
// other two. On success the job advances to ReadyToAdjust.
func (s *PreScheduling) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	logger.Info("start run preScheduling, jobID: " + jo.JobID)
	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles
	// Extract file info/targets from whichever body type this job carries.
	// NOTE(review): an unhandled body type leaves jobFiles nil and would
	// panic in the goroutines below — confirm all body types are covered.
	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	case *job.MultiInstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
		runningJob.TargetCCID = s.scheme.TargetCCID
	}
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Listen for cancel events.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()
	ccInfo, err := rtx.Mgr.DB.ComputingCenter().GetByID(rtx.Mgr.DB.DefCtx(), s.scheme.TargetCCID)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(fmt.Errorf("getting computing center info: %w", err)))
		return
	}
	s.targetCCInfo = ccInfo
	// Stage dataset, code and image concurrently.
	wg := sync.WaitGroup{}
	wg.Add(3)
	var e1, e2, e3 error
	go func() {
		defer wg.Done()
		e1 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Dataset, &jobFiles.Dataset, &s.scheme.Dataset)
		if e1 != nil {
			cancel()
			logger.Debugf("dataset scheduling done, err: %v", e1)
		} else {
			logger.Debug("dataset scheduling done")
		}
	}()
	go func() {
		defer wg.Done()
		e2 = s.doPackageScheduling(ctx, rtx, jobFilesInfo.Code, &jobFiles.Code, &s.scheme.Code)
		if e2 != nil {
			cancel()
			logger.Debugf("code scheduling done, err: %v", e2)
		} else {
			logger.Debug("code scheduling done")
		}
	}()
	go func() {
		defer wg.Done()
		e3 = s.doImageScheduling(ctx, rtx, s.scheme.TargetCCID, jobFilesInfo.Image, &jobFiles.Image, &s.scheme.Image)
		if e3 != nil {
			cancel()
			logger.Debugf("iamge scheduling done, err: %v", e3)
		} else {
			logger.Debug("image scheduling done")
		}
	}()
	wg.Wait()
	allErr := errors.Join(e1, e2, e3)
	if allErr != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(allErr))
	} else {
		rtx.Mgr.ChangeState(jo, NewReadyToAdjust())
	}
}
// Dump serializes this state, including its schedule scheme.
func (s *PreScheduling) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.PreSchedulingDump{
		Scheme: s.scheme,
	}
}
// doPackageScheduling resolves the package ID for one job file (dataset or
// code): local files wait for the matching LocalFileUploaded event, package
// references are used as-is, and data-return files are resolved later by the
// data-return job. The move/load actions of the scheme are currently
// disabled (see the commented-out code below).
func (s *PreScheduling) doPackageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, fileInfo schsdk.JobFileInfo, file *jobmod.PackageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		// Wait until the client reports this local file finished uploading.
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}
		file.PackageID = evt.PackageID
	case *schsdk.PackageJobFileInfo:
		file.PackageID = info.PackageID
	case *schsdk.DataReturnJobFileInfo:
		// Produced by another job; nothing to resolve here.
		return nil
	default:
		return fmt.Errorf("unknown dataset type: %T", info)
	}
	//if scheme.Action == jobmod.ActionMove {
	//	logger.Debugf("begin move pacakge %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)
	//
	//	taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
	//	if err != nil {
	//		return fmt.Errorf("moving package: %w", err)
	//	}
	//
	//	fut := taskStatus.Receive()
	//	status := <-fut.Chan()
	//
	//	moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
	//	if moveStatus.Error != "" {
	//		return fmt.Errorf("moving package: %s", moveStatus.Error)
	//	}
	//
	//	return nil
	//}
	//
	//if scheme.Action == jobmod.ActionLoad {
	//	logger.Debugf("begin load pacakge %v to %v", file.PackageID, s.targetCCInfo.CDSStorageID)
	//
	//	taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewStorageLoadPackage(1, file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
	//	if err != nil {
	//		return fmt.Errorf("moving package: %w", err)
	//	}
	//
	//	fut := taskStatus.Receive()
	//	status := <-fut.Chan()
	//
	//	loadStatus := status.Value.Status.(*exectsk.StorageLoadPackageStatus)
	//	if loadStatus.Error != "" {
	//		return fmt.Errorf("moving package: %s", loadStatus.Error)
	//	}
	//
	//	return nil
	//}
	return nil
}
// doImageScheduling resolves the image for the job: a freshly uploaded local
// file becomes a new image record, while an existing image reference is
// looked up in the database. The import-to-center action is not implemented
// yet (the old flow is kept commented out below).
//
// BUG fix: the generated image name used fmt.Sprintf("UPLOAD@%s", ...) with
// an int64 (time.Now().Unix()), which renders as "%!s(int64=...)"; the verb
// is now %d.
func (s *PreScheduling) doImageScheduling(ctx context.Context, rtx jobmgr.JobStateRunContext, targetCCID schsdk.CCID, fileInfo schsdk.JobFileInfo, file *jobmod.ImageJobFile, scheme *jobmod.FileScheduleScheme) error {
	switch info := fileInfo.(type) {
	case *schsdk.LocalJobFileInfo:
		// Wait until the client reports this local file finished uploading.
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		if evt.Error != nil {
			return evt.Error
		}
		// Upload finished: create an empty image record for the package.
		// TODO image name
		imgID, err := rtx.Mgr.DB.Image().Create(rtx.Mgr.DB.DefCtx(), &evt.PackageID, fmt.Sprintf("UPLOAD@%d", time.Now().Unix()), time.Now())
		if err != nil {
			return fmt.Errorf("creating image info: %w", err)
		}
		// Record the ImageID and PackageID on the job file.
		file.ImageID = imgID
		file.PackageID = &evt.PackageID
	case *schsdk.ImageJobFileInfo:
		imageInfo, err := rtx.Mgr.DB.Image().GetByID(rtx.Mgr.DB.DefCtx(), info.ImageID)
		if err != nil {
			return fmt.Errorf("getting image info: %w", err)
		}
		file.ImageID = imageInfo.ImageID
		file.PackageID = imageInfo.CDSPackageID
	}
	if scheme.Action == jobmod.ActionImportImage {
		// TODO the image import flow needs to be redesigned.
		return fmt.Errorf("not implemented")
		//if file.PackageID == nil {
		//	return fmt.Errorf("image %v has no associated package, which cannot be uploaded to %v", file.ImageID, s.targetCCInfo.CCID)
		//}
		//
		//// TODO UserID
		//taskStatus, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewCacheMovePackage(1, *file.PackageID, s.targetCCInfo.CDSStorageID), s.targetCCInfo)
		//if err != nil {
		//	return fmt.Errorf("moving package: %w", err)
		//}
		//
		//fut := taskStatus.Receive()
		//status := <-fut.Chan()
		//
		//moveStatus := status.Value.Status.(*exectsk.CacheMovePackageStatus)
		//if moveStatus.Error != "" {
		//	return fmt.Errorf("moving package: %s", moveStatus.Error)
		//}
		//
		//stgCli, err := schglb.CloudreamStoragePool.Acquire()
		//if err != nil {
		//	return fmt.Errorf("new cloudream storage client: %w", err)
		//}
		//defer schglb.CloudreamStoragePool.Release(stgCli)
		//
		//// TODO UserID
		//pkgObjs, err := stgCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{UserID: 1, PackageID: *file.PackageID})
		//if err != nil {
		//	return fmt.Errorf("getting package objects: %w", err)
		//}
		//
		//if len(pkgObjs.Objects) == 0 {
		//	return fmt.Errorf("no object in the package which will be imported")
		//}
		//
		//if len(pkgObjs.Objects) > 1 {
		//	return fmt.Errorf("there must be only 1 object in the package which will be imported")
		//}
		//
		//taskStatus2, err := rtx.Mgr.ExecMgr.StartTask(exectsk.NewUploadImage(s.targetCCInfo.PCMParticipantID, cdsapi.MakeIPFSFilePath(pkgObjs.Objects[0].FileHash)), s.targetCCInfo)
		//if err != nil {
		//	return fmt.Errorf("moving package: %w", err)
		//}
		//
		//fut2 := taskStatus2.Receive()
		//status2 := <-fut2.Chan()
		//
		//uploadStatus := status2.Value.Status.(*exectsk.UploadImageStatus)
		//if uploadStatus.Error != "" {
		//	return fmt.Errorf("uploading image: %s", uploadStatus.Error)
		//}
		//
		//// TODO 镜像名称
		//err = rtx.Mgr.DB.PCMImage().Create(rtx.Mgr.DB.DefCtx(), file.ImageID, targetCCID, uploadStatus.PCMImageID, uploadStatus.Name, time.Now())
		//if err != nil {
		//	return fmt.Errorf("creating image info: %w", err)
		//}
		//
		//return nil
	}
	return nil
}

View File

@ -0,0 +1,75 @@
package state
import (
"context"
"fmt"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
)
// ReadyToAdjust is the job state that waits for the job's data-return
// dependency (if any) to finish before an adjustment scheme is made.
type ReadyToAdjust struct {
}

// NewReadyToAdjust creates a ReadyToAdjust state.
func NewReadyToAdjust() *ReadyToAdjust {
	return &ReadyToAdjust{}
}

// Run executes the state: on success the job moves on to making the
// adjustment scheme, otherwise it completes with failure.
func (s *ReadyToAdjust) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewMakeingAdjustScheme())
	}
}
// do waits for the data-return job this job depends on (if any) to
// complete, then copies the returned package identifiers into this job's
// dataset file entry.
func (s *ReadyToAdjust) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	var jobFilesInfo schsdk.JobFilesInfo
	var jobFiles *jobmod.JobFiles
	// Only Normal and Instance jobs carry adjustable files; for any other
	// body type both variables stay zero and the wait below is skipped.
	switch runningJob := jo.Body.(type) {
	case *job.NormalJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
	case *job.InstanceJob:
		jobFilesInfo = runningJob.Info.Files
		jobFiles = &runningJob.Files
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Listen for a Cancel event; it aborts the wait below by cancelling ctx.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	if rt, ok := jobFilesInfo.Dataset.(*schsdk.DataReturnJobFileInfo); ok {
		// Block until the depended-on data-return job reports completion.
		evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool {
			return val.Job.GetInfo().GetLocalJobID() == rt.DataReturnLocalJobID
		})
		if !ok {
			return jobmgr.ErrJobCancelled
		}
		if evt.Err != nil {
			return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
		}

		rtJob, ok := evt.Job.Body.(*job.DataReturnJob)
		if !ok {
			return fmt.Errorf("job %s is not a DataReturn job(which is %T)", evt.Job.JobID, evt.Job)
		}

		// Propagate the returned package into this job's dataset entry.
		jobFiles.Dataset.PackageID = rtJob.DataReturnPackageID
		jobFiles.Dataset.ECSInstanceID = rtJob.ECSInstanceID
	}
	return nil
}
// Dump exports this state's serializable snapshot.
func (s *ReadyToAdjust) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.ReadyToAdjustDump{}
}

View File

@ -0,0 +1,38 @@
package state
import (
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)
// NormalJobReadyToExecute is the state of a normal job that has finished
// preparation and may begin execution.
type NormalJobReadyToExecute struct {
}

// NewNormalJobReadyToExecute creates the state.
func NewNormalJobReadyToExecute() *NormalJobReadyToExecute {
	return &NormalJobReadyToExecute{}
}

// Run immediately transitions to the executing state.
// TODO execution is currently started right away, without any gating.
func (s *NormalJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	rtx.Mgr.ChangeState(jo, NewNormalJobExecuting())
}

// Dump exports this state's serializable snapshot.
func (s *NormalJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobReadyToExecuteDump{}
}

// DataReturnJobReadyToExecute is the state of a data-return job that may
// begin execution.
type DataReturnJobReadyToExecute struct {
}

// NewDataReturnJobReadyToExecute creates the state.
func NewDataReturnJobReadyToExecute() *DataReturnJobReadyToExecute {
	return &DataReturnJobReadyToExecute{}
}

// Run immediately transitions to the executing state.
// TODO execution is currently started right away, without any gating.
func (s *DataReturnJobReadyToExecute) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	rtx.Mgr.ChangeState(jo, NewDataReturnJobExecuting())
}

// Dump exports this state's serializable snapshot.
func (s *DataReturnJobReadyToExecute) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.DataReturnReadyToExecuteDump{}
}

View File

@ -0,0 +1,64 @@
package state
import (
"context"
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
)
// WaitTargetComplete is the data-return job state that waits for the
// target (normal) job to finish before the data return can execute.
type WaitTargetComplete struct {
}

// NewWaitTargetComplete creates the state.
func NewWaitTargetComplete() *WaitTargetComplete {
	return &WaitTargetComplete{}
}

// Run waits for the target job; on success the data-return job becomes
// ready to execute, otherwise it completes with failure.
func (s *WaitTargetComplete) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx, jo)
	if err != nil {
		rtx.Mgr.ChangeState(jo, FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewDataReturnJobReadyToExecute())
	}
}
// do blocks until the target job identified by TargetLocalJobID completes,
// then copies its identifiers and output location into the data-return job.
func (s *WaitTargetComplete) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) error {
	reJob := jo.Body.(*job.DataReturnJob)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	// Listen for a Cancel event; it aborts the wait below by cancelling ctx.
	go func() {
		event.WaitType[*event.Cancel](ctx, rtx.EventSet)
		cancel()
	}()

	evt, ok := event.WaitTypeAnd[*event.JobCompleted](ctx, rtx.EventSet, func(val *event.JobCompleted) bool {
		return val.Job.GetInfo().GetLocalJobID() == reJob.Info.TargetLocalJobID
	})
	if !ok {
		return jobmgr.ErrJobCancelled
	}
	if evt.Err != nil {
		return fmt.Errorf("depended job %s was failed", evt.Job.JobID)
	}

	norJob, ok := evt.Job.Body.(*job.NormalJob)
	if !ok {
		return fmt.Errorf("job %s is not a Normal job(which is %T)", evt.Job.JobID, evt.Job)
	}

	// Record where the target job ran and what it produced.
	reJob.TargetJobID = evt.Job.JobID
	reJob.TargetJobCCID = norJob.TargetCCID
	reJob.TargetJobOutputPath = norJob.OutputPath
	reJob.ECSInstanceID = norJob.ECSInstanceID
	return nil
}
// Dump exports this state's serializable snapshot.
func (s *WaitTargetComplete) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.WaitTargetCompleteDump{}
}

View File

@ -0,0 +1,97 @@
package state2
import (
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)
// DataSchedule is the state that distributes a PCM task's data to the
// clusters chosen by the scheduler.
type DataSchedule struct {
	taskID       sch.TaskID
	scheduleData []sch.ScheduleData
}

// NewDataSchedule creates the state for the given task and schedule data.
func NewDataSchedule(taskID sch.TaskID, scheduleData []sch.ScheduleData) *DataSchedule {
	return &DataSchedule{
		taskID:       taskID,
		scheduleData: scheduleData,
	}
}

// Run performs the data scheduling; on success the PCM job starts up, on
// failure the PCM task is cancelled with the error message.
func (s *DataSchedule) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	results, err := s.do(rtx)
	if err != nil {
		logger.Error(err.Error())
		rtx.Mgr.ChangeState(jo, NewPCMJobCancel(s.taskID, err.Error()))
	} else {
		rtx.Mgr.ChangeState(jo, NewPCMJobStartup(s.taskID, results))
	}
}
// do sends one data-schedule request per piece of schedule data to the
// uploader service and collects the per-data results. Unknown clusters are
// reported as failed results instead of aborting the whole batch.
func (s *DataSchedule) do(rtx jobmgr.JobStateRunContext) ([]sch.DataScheduleResults, error) {
	uploaderCli, err := schglb.UploaderPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.UploaderPool.Release(uploaderCli)

	// Fetch the cluster -> storage mapping maintained in the database.
	clusterMapping, err := rtx.Mgr.DB.UploadData().GetClusterMapping(rtx.Mgr.DB.DefCtx())
	if err != nil {
		return nil, err
	}

	var results []sch.DataScheduleResults
	for _, data := range s.scheduleData {
		var clusters []uploadersdk.Cluster
		var errResults []sch.DataScheduleResult
		// Resolve each clusterID to its JCS storageID.
		for _, id := range data.ClusterIDs {
			storageID, ok := clusterMapping[id]
			if !ok {
				errResults = append(errResults, sch.DataScheduleResult{
					Clusters: sch.DataDetail{
						ClusterID: id,
					},
					Msg:    "cluster not found",
					Status: false,
				})
				logger.Error(fmt.Errorf("cluster %d not found", id))
				continue
			}
			clusters = append(clusters, uploadersdk.Cluster{
				ClusterID: id,
				StorageID: storageID,
			})
		}

		// Send the schedule request for this piece of data.
		req := uploadersdk.DataScheduleReq{
			Clusters:  clusters,
			PackageID: data.PackageID,
			//StorageType: data.StorageType,
		}
		scheduleResult, err := uploaderCli.DataSchedule(req)
		if err != nil {
			return nil, fmt.Errorf("schedule data: %w", err)
		}
		// Merge the locally detected failures into the service's results.
		if len(errResults) > 0 {
			scheduleResult.Results = append(scheduleResult.Results, errResults...)
		}
		results = append(results, sch.DataScheduleResults{
			DataType: data.DataType,
			Results:  scheduleResult.Results,
		})
	}

	return results, nil
}
// Dump exports this state's serializable snapshot.
// NOTE(review): reuses NormalJobReadyToExecuteDump — looks like a
// copy-paste placeholder; confirm a dedicated dump type is not needed.
func (s *DataSchedule) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobReadyToExecuteDump{}
}

View File

@ -0,0 +1,218 @@
package state2
import (
"context"
"encoding/json"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
"gitlink.org.cn/cloudream/common/sdks/blockchain"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
"strconv"
"strings"
"sync"
"time"
)
// DataUpload is the state that uploads data (from a local file or a remote
// URL) to the chosen storages and records blockchain evidence for it.
type DataUpload struct {
	userID     cdssdk.UserID
	uploadInfo sch.UploadInfo
	dataType   string
	storages   []cdssdk.StorageID
	// lock serializes Run — presumably because the state may be driven
	// concurrently; TODO confirm with callers.
	lock sync.Mutex
}

// NewDataUpload creates the state for one upload request.
func NewDataUpload(userID cdssdk.UserID, uploadInfo sch.UploadInfo, dataType string, storages []cdssdk.StorageID) *DataUpload {
	return &DataUpload{
		userID:     userID,
		uploadInfo: uploadInfo,
		dataType:   dataType,
		storages:   storages,
	}
}
// Run performs the upload and completes the job with success or failure.
func (s *DataUpload) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	s.lock.Lock()
	defer s.lock.Unlock()

	err := s.do(rtx)
	if err != nil {
		logger.Error(err)
		rtx.Mgr.ChangeState(jo, state.FailureComplete(err))
		return
	}
	rtx.Mgr.ChangeState(jo, state.SuccessComplete())
}
// do performs the upload according to the upload-info type (local file or
// remote URL), then records blockchain evidence for the uploaded objects.
func (s *DataUpload) do(rtx jobmgr.JobStateRunContext) error {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	var objectIDs []cdssdk.ObjectID

	switch info := s.uploadInfo.(type) {
	// Upload of a file that the client pushes up locally.
	case *sch.LocalUploadInfo:
		// Wait for the client to report the file uploaded.
		// TODO a timeout mechanism is still needed here.
		evt, ok := event.WaitTypeAnd[*event.LocalFileUploaded](ctx, rtx.EventSet, func(e *event.LocalFileUploaded) bool {
			return e.LocalPath == info.LocalPath
		})
		if !ok {
			return fmt.Errorf("local file %s not uploaded", info.LocalPath)
		}
		// Fix: evt.Error is nil on success; the previous code called
		// evt.Error.Error() unconditionally and panicked on every
		// successful upload.
		if evt.Error != nil {
			return evt.Error
		}
		objectIDs = evt.ObjectIDs

	// Upload from a remote URL via the uploader service.
	case *sch.RemoteUploadInfo:
		uploaderCli, err := schglb.UploaderPool.Acquire()
		if err != nil {
			return fmt.Errorf("new scheduler client: %w", err)
		}
		defer schglb.UploaderPool.Release(uploaderCli)

		var targetClusters []uploadersdk.ClusterID
		for _, id := range info.TargetClusters {
			targetClusters = append(targetClusters, uploadersdk.ClusterID(id))
		}

		req := uploadersdk.UploadReq{
			Type: s.dataType,
			Source: &uploadersdk.UrlSource{
				Url: info.Url,
			},
			Target: &uploadersdk.UrlTarget{
				Clusters: targetClusters,
			},
		}
		uploadResp, err := uploaderCli.Upload(req)
		if err != nil {
			return fmt.Errorf("upload data: %w", err)
		}

		// Persist any extra metadata the uploader returned.
		if uploadResp.JsonData != "" {
			err = rtx.Mgr.DB.UploadData().UpdatePackage(rtx.Mgr.DB.DefCtx(), uploadResp.PackageID, uploadResp.JsonData, -1)
			if err != nil {
				return fmt.Errorf("update package: %w", err)
			}
		}
		objectIDs = uploadResp.ObjectIDs
	}

	// Record evidence for the uploaded objects on the blockchain and store
	// the evidence IDs.
	blockChains, err := s.blockChain(objectIDs)
	if err != nil {
		return fmt.Errorf("blockchain: %w", err)
	}
	err = rtx.Mgr.DB.UploadData().InsertBlockchains(rtx.Mgr.DB.DefCtx(), blockChains)
	if err != nil {
		return fmt.Errorf("insert blockchains: %w", err)
	}

	return nil
}
// blockChain stores evidence for each object of the uploaded package on
// the blockchain and returns the per-object evidence records.
// NOTE(review): the objectIDs parameter is currently unused and the
// GetPackageObjects request is empty — confirm whether the query should be
// filtered by package/object IDs.
func (s *DataUpload) blockChain(objectIDs []cdssdk.ObjectID) ([]*uploadersdk.BlockChain, error) {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	objects, err := cdsCli.Object().GetPackageObjects(cdsapi.ObjectGetPackageObjects{})
	// Fix: this error used to be silently overwritten by the next Acquire.
	if err != nil {
		return nil, fmt.Errorf("getting package objects: %w", err)
	}

	bcCli, err := schglb.BlockChainPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new blockchain client: %w", err)
	}
	defer schglb.BlockChainPool.Release(bcCli)

	var blockChains []*uploadersdk.BlockChain
	for _, obj := range objects.Objects {
		now := time.Now()
		timestamp := now.UnixNano() / int64(time.Millisecond)
		// fileNo is the unique evidence ID: "<objectID>_<millis>".
		fileNo := strconv.FormatInt(int64(obj.ObjectID), 10) + "_" + strconv.FormatInt(timestamp, 10)
		formattedTime := now.Format("2006-01-02 15:04:05")

		paths := strings.Split(obj.Path, "/")
		fileName := paths[len(paths)-1]

		// Strip the 4-character hash prefix.
		// Fix: guard against hashes shorter than 4 chars to avoid a panic.
		fileHash := obj.FileHash
		if len(fileHash) >= 4 {
			fileHash = fileHash[4:]
		}

		var args = make(map[string]string)
		args["userID"] = strconv.FormatInt(int64(s.userID), 10)
		args["type"] = s.dataType
		args["fileName"] = fileName
		args["fileHash"] = string(fileHash)
		args["fileSize"] = strconv.FormatInt(obj.Size, 10)
		args["fileNo"] = fileNo
		args["createTime"] = formattedTime

		// The contract takes (fileNo, argsAsJSON).
		// Fix: the marshal error was previously discarded with "_".
		argsJson, err := json.Marshal(args)
		if err != nil {
			return nil, fmt.Errorf("marshaling evidence args: %w", err)
		}
		argsArr := []string{fileNo, string(argsJson)}

		req := blockchain.InvokeReq{
			ContractAddress: schglb.BlockChainConfig.ContractAddress,
			FunctionName:    schglb.BlockChainConfig.FunctionName,
			MemberName:      schglb.BlockChainConfig.MemberName,
			Type:            schglb.BlockChainConfig.Type,
			Args:            argsArr,
		}
		err = bcCli.BlockChainInvoke(req)
		if err != nil {
			return nil, fmt.Errorf("invoke blockchain: %w", err)
		}

		blockChains = append(blockChains, &uploadersdk.BlockChain{
			ObjectID:     obj.ObjectID,
			BlockChainID: fileNo,
		})
	}
	return blockChains, nil
}
// Dump exports this state's serializable snapshot.
// NOTE(review): reuses NormalJobReadyToExecuteDump — looks like a
// copy-paste placeholder; confirm a dedicated dump type is not needed.
func (s *DataUpload) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobReadyToExecuteDump{}
}

View File

@ -0,0 +1,46 @@
package state2
import (
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
)
// PCMJobCancel cancels a PCM task and then completes the job as failed.
type PCMJobCancel struct {
	taskID sch.TaskID
	msg    string
}

// NewPCMJobCancel creates the state with the task to cancel and the reason.
func NewPCMJobCancel(taskID sch.TaskID, msg string) *PCMJobCancel {
	return &PCMJobCancel{
		taskID: taskID,
		msg:    msg,
	}
}

// Run sends a cancel request for the task and completes the job as failed.
// NOTE(review): if acquiring the scheduler client fails we return without
// changing the job state, leaving the job stuck in this state — confirm
// whether that is intended.
func (s *PCMJobCancel) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		logger.Error(fmt.Sprintf("new scheduler client: %v", err))
		return
	}
	defer schglb.PCMSchePool.Release(schCli)

	req := sch.CancelJobReq{
		TaskID: s.taskID,
		Msg:    s.msg,
	}
	err = schCli.CancelJob(req)
	if err != nil {
		logger.Error(err.Error())
	}
	// NOTE(review): err is nil when cancellation succeeded, so the job
	// completes with FailureComplete(nil) rather than the original s.msg —
	// confirm this is the intended failure payload.
	rtx.Mgr.ChangeState(jo, state.FailureComplete(err))
}

// Dump exports this state's serializable snapshot.
// NOTE(review): reuses NormalJobReadyToExecuteDump — looks like a
// copy-paste placeholder; confirm a dedicated dump type is not needed.
func (s *PCMJobCancel) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobReadyToExecuteDump{}
}

View File

@ -0,0 +1,129 @@
package state2
import (
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
)
// PCMJobCreate is the initial state of a PCM job: it creates the task on
// the PCM scheduler from the job's data distribution and resources.
type PCMJobCreate struct {
	jobInfo *schsdk.PCMJobInfo
}

// NewPCMJobCreate creates the state for the given PCM job description.
func NewPCMJobCreate(info *schsdk.PCMJobInfo) *PCMJobCreate {
	return &PCMJobCreate{
		jobInfo: info,
	}
}

// UploadedData appears unused here. NOTE(review): confirm it is a
// placeholder and can be removed.
type UploadedData struct {
}

// CodeDistribute appears unused here. NOTE(review): confirm it is a
// placeholder and can be removed (a type of the same name also exists in
// the pcmscheduler SDK).
type CodeDistribute struct {
}

// Run creates the PCM task; on success the job proceeds to data
// scheduling, otherwise it completes with failure.
func (s *PCMJobCreate) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	scheduleData, err := s.do(rtx)
	if err != nil {
		logger.Error(err.Error())
		rtx.Mgr.ChangeState(jo, state.FailureComplete(err))
	} else {
		rtx.Mgr.ChangeState(jo, NewDataSchedule(scheduleData.TaskID, scheduleData.ScheduleDatas))
	}
}
// do creates a PCM task from the job's data distribution and resource
// requirements and returns the scheduler's response.
func (s *PCMJobCreate) do(rtx jobmgr.JobStateRunContext) (*sch.CreateJobResp, error) {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	// Collect every packageID / bindingID referenced by the job files.
	// Fix: the old code passed the slices by value to a helper, so all
	// appends were lost and the collected IDs were always empty.
	var packages []cdssdk.PackageID
	var bindingIDs []int64
	collect := func(fileInfo schsdk.JobFileInfo) {
		switch info := fileInfo.(type) {
		case *schsdk.PackageJobFileInfo:
			packages = append(packages, info.PackageID)
		case *schsdk.BindingJobFileInfo:
			bindingIDs = append(bindingIDs, info.BindingID)
		}
	}
	collect(s.jobInfo.Files.Code)
	collect(s.jobInfo.Files.Dataset)
	collect(s.jobInfo.Files.Image)
	collect(s.jobInfo.Files.Model)

	// Fix: was a bitwise AND of the two lengths, which falsely reported
	// "no packageID" for many non-empty combinations (e.g. 1 & 2 == 0).
	if len(packages) == 0 && len(bindingIDs) == 0 {
		return nil, fmt.Errorf("no packageID")
	}

	// Load the stored upload-data records for the collected IDs.
	uploadDatas, err := rtx.Mgr.DB.UploadData().GetByPackageID(rtx.Mgr.DB.DefCtx(), packages, bindingIDs)
	if err != nil {
		return nil, err
	}
	if len(uploadDatas) == 0 {
		return nil, fmt.Errorf("no upload data")
	}

	// Build the per-type data-distribution description.
	var dataDistribute sch.DataDistribute
	for _, data := range uploadDatas {
		var clusters []sch.DataDetail
		for _, cluster := range data.UploadedCluster {
			clusters = append(clusters, sch.DataDetail{
				ClusterID: cluster.ClusterID,
				JsonData:  data.JsonData,
			})
		}
		switch data.DataType {
		case sch.CODE:
			dataDistribute.Code = append(dataDistribute.Code, sch.CodeDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		case sch.DATASET:
			dataDistribute.Dataset = append(dataDistribute.Dataset, sch.DatasetDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		case sch.MODEL:
			dataDistribute.Model = append(dataDistribute.Model, sch.ModelDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		case sch.IMAGE:
			dataDistribute.Image = append(dataDistribute.Image, sch.ImageDistribute{
				Clusters:  clusters,
				PackageID: data.PackageID,
			})
		}
	}

	req := sch.CreateJobReq{
		DataDistribute: dataDistribute,
		JobResources:   s.jobInfo.JobResources,
	}
	resp, err := schCli.CreateJob(req)
	if err != nil {
		return nil, fmt.Errorf("create task: %w", err)
	}
	return resp, nil
}
// collectDataID appends the package or binding ID referenced by fileInfo
// to the given slices and returns the updated slices.
// Fix: the original appended to slice parameters passed by value, so the
// caller never observed the appends; the updated slices are now returned
// (existing call statements still compile and may ignore the results).
func collectDataID(fileInfo schsdk.JobFileInfo, packageIDs []cdssdk.PackageID, bindingIDs []int64) ([]cdssdk.PackageID, []int64) {
	switch info := fileInfo.(type) {
	case *schsdk.PackageJobFileInfo:
		packageIDs = append(packageIDs, info.PackageID)
	case *schsdk.BindingJobFileInfo:
		bindingIDs = append(bindingIDs, info.BindingID)
	}
	return packageIDs, bindingIDs
}
// Dump exports this state's serializable snapshot.
// NOTE(review): reuses NormalJobReadyToExecuteDump — looks like a
// copy-paste placeholder; confirm a dedicated dump type is not needed.
func (s *PCMJobCreate) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobReadyToExecuteDump{}
}

View File

@ -0,0 +1,55 @@
package state2
import (
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
)
// PCMJobStartup runs a created PCM task, feeding it the results of data
// scheduling.
type PCMJobStartup struct {
	taskID        sch.TaskID
	scheduledData []sch.DataScheduleResults
}

// NewPCMJobStartup creates the state for the given task and its scheduled
// data.
func NewPCMJobStartup(taskID sch.TaskID, scheduledData []sch.DataScheduleResults) *PCMJobStartup {
	return &PCMJobStartup{
		taskID:        taskID,
		scheduledData: scheduledData,
	}
}

// Run starts the PCM task; on success the job completes successfully, on
// failure the task is cancelled with the error message.
func (s *PCMJobStartup) Run(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) {
	err := s.do(rtx)
	if err != nil {
		logger.Error(err.Error())
		rtx.Mgr.ChangeState(jo, NewPCMJobCancel(s.taskID, err.Error()))
	} else {
		rtx.Mgr.ChangeState(jo, state.SuccessComplete())
	}
}

// do sends the RunJob request to the PCM scheduler.
func (s *PCMJobStartup) do(rtx jobmgr.JobStateRunContext) error {
	schCli, err := schglb.PCMSchePool.Acquire()
	if err != nil {
		return fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.PCMSchePool.Release(schCli)

	req := sch.RunJobReq{
		TaskID:         s.taskID,
		ScheduledDatas: s.scheduledData,
	}
	err = schCli.RunJob(req)
	if err != nil {
		return fmt.Errorf("run job: %w", err)
	}
	return nil
}

// Dump exports this state's serializable snapshot.
// NOTE(review): reuses NormalJobReadyToExecuteDump — looks like a
// copy-paste placeholder; confirm a dedicated dump type is not needed.
func (s *PCMJobStartup) Dump(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job) jobmod.JobStateDump {
	return &jobmod.NormalJobReadyToExecuteDump{}
}

View File

@ -0,0 +1,14 @@
package jobmgr
import jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
// JobStateRunContext carries the dependencies a JobState needs while
// running: the owning manager, the job's event set, and the state the job
// was in before this one (nil for the initial state).
type JobStateRunContext struct {
	Mgr       *Manager
	EventSet  *EventSet
	LastState JobState
}

// JobState is one state of a job's lifecycle state machine.
type JobState interface {
	// Run executes this state; it is invoked on its own goroutine and is
	// expected to eventually move the job to another state.
	Run(ctx JobStateRunContext, job *Job)
	// Dump exports a serializable snapshot of this state.
	Dump(ctx JobStateRunContext, job *Job) jobmod.JobStateDump
}

View File

@ -0,0 +1,259 @@
package jobmgr
import (
"fmt"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/executormgr"
"sync"
"time"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
)
// mgrJob is the manager-internal bookkeeping record for one job.
type mgrJob struct {
	job      Job
	eventSet EventSet
	state    JobState
	// isCompleted reports whether the job has finished. Note: a job in the
	// Completed state may still have work to do, so this field is not
	// necessarily true just because the state is Completed.
	isCompleted bool
}

// mgrJobSet groups the jobs submitted together as one job set.
type mgrJobSet struct {
	jobs map[schsdk.JobID]*mgrJob
}

// Manager owns all job sets and drives job state transitions.
type Manager struct {
	// pubLock must be held for any modification of jobs or job sets.
	pubLock sync.Mutex

	ExecMgr *executormgr.Manager
	DB      *db.DB
	NodeSvc *NodeService

	jobSetIDIndex int
	jobSets       map[schsdk.JobSetID]*mgrJobSet

	jobIDIndex int
	jobs       map[schsdk.JobID]*mgrJob
}
// NewManager constructs a Manager backed by the given database handle and
// node service, with empty job and job-set tables.
func NewManager(db *db.DB, nodeSvc *NodeService) (*Manager, error) {
	return &Manager{
		DB:      db,
		NodeSvc: nodeSvc,
		jobSets: make(map[schsdk.JobSetID]*mgrJobSet),
		jobs:    make(map[schsdk.JobID]*mgrJob),
	}, nil
}
// Serve runs the manager's background loop.
// TODO this should block until Stop is called; it currently returns at
// once and the ticker is unused.
func (m *Manager) Serve() error {
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()
	return nil
}

// Stop terminates Serve. Currently a no-op.
func (m *Manager) Stop() {
}
// ChangeState switches a job to the given state and runs the new state on
// a fresh goroutine. Note: changing to a Completed state does NOT set
// mgrJob.isCompleted; use JobCompleted for that.
func (m *Manager) ChangeState(job *Job, state JobState) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	mgrJob, ok := m.jobs[job.JobID]
	if !ok {
		return
	}
	lastState := mgrJob.state
	mgrJob.state = state

	go func() {
		// Fix: the old code additionally called the non-formatting
		// logger.Info with printf-style "%s" verbs, which printed the verbs
		// verbatim; the structured Infof below is the single correct log.
		logger.WithField("JobID", job.JobID).Infof("state changed: %T -> %T", lastState, state)
		state.Run(JobStateRunContext{
			Mgr:       m,
			EventSet:  &mgrJob.eventSet,
			LastState: lastState,
		}, job)
	}()
}
// JobCompleted marks a job as finished; when every job in its job set has
// finished, the whole set is removed.
func (m *Manager) JobCompleted(job *Job) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	mgrJob, ok := m.jobs[job.JobID]
	if !ok {
		return
	}
	mgrJob.isCompleted = true

	// Fix: the set may already have been deleted (the entries in m.jobs
	// outlive the set); the old code dereferenced a nil *mgrJobSet and
	// panicked in that case.
	jobSet, ok := m.jobSets[job.JobSetID]
	if !ok {
		return
	}
	for _, mjob := range jobSet.jobs {
		if !mjob.isCompleted {
			return
		}
	}
	// TODO a completion callback could be added here.
	// NOTE(review): entries in m.jobs are never removed — confirm whether
	// completed jobs should also be dropped from m.jobs to avoid growth.
	delete(m.jobSets, job.JobSetID)
	go func() {
		logger.Infof("job set %s completed", job.JobSetID)
	}()
}
// PostEvent delivers evt to the job identified by jobID, if it is still
// registered. Delivery happens asynchronously.
func (m *Manager) PostEvent(jobID schsdk.JobID, evt Event) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	target, ok := m.jobs[jobID]
	if !ok {
		return
	}
	go func() {
		target.eventSet.Post(evt)
	}()
}
// BroadcastEvent delivers evt to every job in the given job set; it is a
// no-op when the set does not exist. Each delivery runs asynchronously.
func (m *Manager) BroadcastEvent(jobSetID schsdk.JobSetID, evt Event) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	set, ok := m.jobSets[jobSetID]
	if !ok {
		// Unknown job set: nothing to deliver.
		return
	}
	for _, member := range set.jobs {
		go func(j *mgrJob) {
			j.eventSet.Post(evt)
		}(member)
	}
}
// SubmittingJob is one job to be submitted as part of a job set: its body
// plus the initial state it starts in.
type SubmittingJob struct {
	Body      JobBody
	InitState JobState
}
// SubmitJobSet registers a batch of jobs as a new job set, assigns IDs,
// and starts every job's initial state on its own goroutine.
func (m *Manager) SubmitJobSet(jobs []SubmittingJob) schsdk.JobSetID {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))
	m.jobSetIDIndex += 1

	jobSet := &mgrJobSet{
		jobs: make(map[schsdk.JobID]*mgrJob),
	}
	m.jobSets[jobSetID] = jobSet

	var addedJobs []*mgrJob
	for i, subJob := range jobs {
		jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))
		job := &mgrJob{
			job: Job{
				JobSetID: jobSetID,
				JobID:    jobID,
				Body:     subJob.Body,
			},
			eventSet: NewEventSet(),
			state:    subJob.InitState,
		}
		jobSet.jobs[jobID] = job
		m.jobs[jobID] = job
		addedJobs = append(addedJobs, job)
	}
	m.jobIDIndex += len(jobs)

	// Register all jobs before starting any, so early-started states can
	// already see their siblings.
	for _, job := range addedJobs {
		go func(j *mgrJob) {
			j.state.Run(JobStateRunContext{
				Mgr:       m,
				EventSet:  &j.eventSet,
				LastState: nil,
			}, &j.job)
		}(job)
	}

	return jobSetID
}
// DumpJobSet exports the state dump of every job in a job set; it returns
// nil when the set does not exist.
func (m *Manager) DumpJobSet(jobSetID schsdk.JobSetID) []jobmod.JobDump {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	set, ok := m.jobSets[jobSetID]
	if !ok {
		return nil
	}

	var dumps []jobmod.JobDump
	// Loop variable renamed from "mgrJob", which shadowed the type name.
	for _, mj := range set.jobs {
		dumps = append(dumps, mj.job.Dump(JobStateRunContext{
			Mgr:       m,
			EventSet:  &mj.eventSet,
			LastState: mj.state,
		}, &mj.job, mj.state))
	}
	return dumps
}
// PreSchedulerInstJob pairs a job body with its initial state.
// NOTE(review): structurally identical to SubmittingJob and appears unused
// here — confirm whether it can be removed or merged.
type PreSchedulerInstJob struct {
	Body      JobBody
	InitState JobState
}
// AddJob adds a single job to the given job set, registers it, and starts
// its initial state on its own goroutine. It returns the new job's ID.
func (m *Manager) AddJob(jobSetID schsdk.JobSetID, jobBody JobBody, jobState JobState) schsdk.JobID {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex))
	m.jobIDIndex += 1

	job := &mgrJob{
		job: Job{
			JobSetID: jobSetID,
			JobID:    jobID,
			Body:     jobBody,
		},
		state:    jobState,
		eventSet: NewEventSet(),
	}
	m.jobs[jobID] = job

	// Fix: the old code indexed m.jobSets without checking existence and
	// panicked on a nil *mgrJobSet; create the set on demand instead,
	// mirroring SubmitJobSet's bookkeeping.
	jobSet, ok := m.jobSets[jobSetID]
	if !ok {
		jobSet = &mgrJobSet{jobs: make(map[schsdk.JobID]*mgrJob)}
		m.jobSets[jobSetID] = jobSet
	}
	jobSet.jobs[jobID] = job

	go func() {
		jobState.Run(JobStateRunContext{
			Mgr:       m,
			EventSet:  &job.eventSet,
			LastState: nil,
		}, &job.job)
	}()
	return jobID
}

View File

@ -0,0 +1,187 @@
package jobmgr
import (
"github.com/patrickmn/go-cache"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
"sort"
"strconv"
"strings"
"sync"
"time"
)
// NodeService tracks the nodes currently serving each model and caches
// per-instance usage-rate samples.
type NodeService struct {
	// RunningModels maps "<customModelName>_<modelID>" to the nodes running
	// that model.
	RunningModels map[string]schsdk.RunningModelInfo
	// NodeUsageCache holds, per instance, a TTL cache of usage samples
	// keyed by unix-seconds timestamp strings.
	NodeUsageCache map[schsdk.JobID]*cache.Cache
	// Lock is intended to guard the two maps above.
	Lock sync.Mutex
}

// NewNodeService creates an empty NodeService.
func NewNodeService() *NodeService {
	return &NodeService{
		NodeUsageCache: make(map[schsdk.JobID]*cache.Cache),
		RunningModels:  make(map[string]schsdk.RunningModelInfo),
	}
}
// SetNodeData registers a running-model node under the key
// "<customModelName>_<modelID>", creating the entry on first use.
// Fix: RunningModels was mutated without taking the declared Lock mutex,
// racing with the other NodeService methods; lock around the update.
func (s *NodeService) SetNodeData(jobSetID schsdk.JobSetID, modelJobInfo schsdk.ModelJobInfo, node schsdk.NodeInfo) {
	s.Lock.Lock()
	defer s.Lock.Unlock()

	key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
	value, ok := s.RunningModels[key]
	// First node for this model: create the record.
	if !ok {
		nodes := []schsdk.NodeInfo{node}
		value = schsdk.RunningModelInfo{
			JobSetID: jobSetID,
			Nodes:    nodes,
			ModelID:  modelJobInfo.ModelID,
			// The model name should be looked up from the database here.
			ModelName:       "",
			CustomModelName: modelJobInfo.CustomModelName,
		}
		s.RunningModels[key] = value
		return
	}
	// Existing record: append the node.
	value.Nodes = append(value.Nodes, node)
	s.RunningModels[key] = value
}
// RemoveNodeFromRunningModels removes the node whose InstanceID matches
// instanceID from the model's node list, if present.
// Fix: take the declared Lock mutex before mutating RunningModels.
func (s *NodeService) RemoveNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID) {
	s.Lock.Lock()
	defer s.Lock.Unlock()

	key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
	value, ok := s.RunningModels[key]
	if !ok {
		return
	}
	for i := 0; i < len(value.Nodes); i++ {
		node := value.Nodes[i]
		if node.InstanceID == instanceID {
			value.Nodes = append(value.Nodes[:i], value.Nodes[i+1:]...)
			s.RunningModels[key] = value
			logger.Info("remove node success from running models, job id: " + instanceID)
			break
		}
	}
}
// UpdateNodeFromRunningModels updates the status of the node identified by
// instanceID within the model's node list, if present.
// Fix: take the declared Lock mutex before mutating RunningModels.
func (s *NodeService) UpdateNodeFromRunningModels(modelJobInfo schsdk.ModelJobInfo, instanceID schsdk.JobID, status string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()

	key := string(modelJobInfo.CustomModelName) + "_" + string(modelJobInfo.ModelID)
	value, ok := s.RunningModels[key]
	if !ok {
		return
	}
	for i := 0; i < len(value.Nodes); i++ {
		node := value.Nodes[i]
		if node.InstanceID == instanceID {
			node.Status = status
			logger.Info("update node success from running models, job id: " + instanceID)
			value.Nodes[i] = node
			s.RunningModels[key] = value
			break
		}
	}
}
// GetAvailableNodes returns the live running-models map.
// NOTE(review): this hands out the internal map without copying or
// locking; callers must not mutate it or read it concurrently with the
// setter methods — confirm whether a locked copy is needed.
func (s *NodeService) GetAvailableNodes() map[string]schsdk.RunningModelInfo {
	return s.RunningModels
}
// GetNodeUsageRateInfo collects usage-rate time series for every node
// currently running the given model; nodes without cached samples are
// skipped. Returns nil when the model is unknown.
func (s *NodeService) GetNodeUsageRateInfo(customModelName schsdk.ModelName, modelID schsdk.ModelID) []schsdk.NodeUsageRateInfo {
	var rateInfos []schsdk.NodeUsageRateInfo
	key := string(customModelName) + "_" + string(modelID)
	value, ok := s.RunningModels[key]
	if !ok {
		return nil
	}
	for i := 0; i < len(value.Nodes); i++ {
		node := value.Nodes[i]
		c, ok := s.NodeUsageCache[node.InstanceID]
		if !ok {
			// No samples cached for this instance yet.
			continue
		}
		rateInfo := getCacheData(c)
		rateInfo.InstanceID = node.InstanceID
		rateInfo.Address = node.Address
		rateInfos = append(rateInfos, rateInfo)
	}
	return rateInfos
}
// SetNodeUsageRateInfo appends a usage-rate sample (keyed by the current
// unix-seconds timestamp) to the per-instance cache, creating the cache on
// demand with a 60-minute TTL.
// Fix: take the declared Lock mutex before mutating NodeUsageCache; also
// deduplicated the two identical Set calls.
func (s *NodeService) SetNodeUsageRateInfo(key schsdk.JobID, value string) {
	s.Lock.Lock()
	defer s.Lock.Unlock()

	timeStamp := strconv.FormatInt(time.Now().Unix(), 10)
	ch, ok := s.NodeUsageCache[key]
	if !ok {
		ch = cache.New(time.Minute*60, time.Minute*60)
		s.NodeUsageCache[key] = ch
	}
	ch.Set(timeStamp, value, cache.DefaultExpiration)
}
// getCacheData aggregates the raw usage samples stored in a node's cache
// into per-metric, time-ordered series.
//
// Each cache entry is keyed by a unix-seconds timestamp string and holds a
// multi-line "name: value" payload; every well-formed line contributes one
// point to the series of that metric name.
func getCacheData(c *cache.Cache) schsdk.NodeUsageRateInfo {
	var nodeUsageRateInfo schsdk.NodeUsageRateInfo
	infoMap := make(map[string][]schsdk.UsageRate)
	// Snapshot all cached samples.
	items := c.Items()
	// Walk every sample and bucket its "key: value" lines by metric name.
	for tmstamp, item := range items {
		msg := item.Object.(string)
		arr1 := strings.Split(msg, "\n")
		// Extract every key/value pair; lines that do not split into
		// exactly two ":"-separated parts are ignored.
		for i := 0; i < len(arr1); i++ {
			arr2 := strings.Split(arr1[i], ":")
			if len(arr2) != 2 {
				continue
			}
			key := strings.TrimSpace(arr2[0])
			value := strings.TrimSpace(arr2[1])
			rate, ok := infoMap[key]
			if !ok {
				infoMap[key] = []schsdk.UsageRate{
					{
						Timestamp: tmstamp,
						Number:    value,
					},
				}
				continue
			}
			rate = append(rate, schsdk.UsageRate{
				Timestamp: tmstamp,
				Number:    value,
			})
			infoMap[key] = rate
		}
	}
	for k, v := range infoMap {
		// Sort each series by timestamp. NOTE(review): this compares the
		// timestamps as strings; it orders correctly only while all
		// timestamps have the same digit count (true for unix seconds for
		// the foreseeable future).
		sort.Slice(v, func(i, j int) bool {
			return v[i].Timestamp < v[j].Timestamp
		})
		switch k {
		case schsdk.MemoryUtilization:
			nodeUsageRateInfo.MemoryUtilization = v
		case schsdk.GPUUtilization:
			nodeUsageRateInfo.GPUUtilization = v
		case schsdk.CPUUtilization:
			nodeUsageRateInfo.CPUUtilization = v
		}
	}
	return nodeUsageRateInfo
}

View File

@ -0,0 +1,51 @@
package jobTask
import (
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/async"
"gitlink.org.cn/cloudream/common/pkgs/future"
"gitlink.org.cn/cloudream/common/pkgs/logger"
"math/rand"
"time"
)
// JobTask is a generic in-memory task handle: a unique ID plus an
// unbounded channel used to deliver results of type T.
type JobTask[T any] struct {
	id       string
	taskChan *async.UnboundChannel[T]
}

// NewJobTask creates a task with a fresh ID and an empty channel.
func NewJobTask[T any]() *JobTask[T] {
	return &JobTask[T]{
		id:       getTaskID(),
		taskChan: async.NewUnboundChannel[T](),
	}
}
// getTaskID returns a unique-ish task ID of the form
// "id_<unixNano>_<4-digit random>".
// Fix: the old code called rand.Seed on every invocation, which is
// deprecated (Go 1.20), races on the global source, and makes calls within
// the same clock tick draw identical "random" numbers; the auto-seeded
// global source is now used directly.
func getTaskID() string {
	nano := time.Now().UnixNano()
	randomNumber := rand.Intn(9000) + 1000 // random number in [1000, 9999]
	return fmt.Sprintf("id_%d_%d", nano, randomNumber)
}
// Receive returns a future resolving to the next value sent on the task
// channel.
func (c *JobTask[T]) Receive() future.Future1[T] {
	return c.taskChan.Receive()
}

// Send is a stub. NOTE(review): it only logs and ignores info — confirm
// whether the HTTP delivery it hints at is still to be implemented.
func (c *JobTask[T]) Send(info any) {
	logger.Info("send http")
}

// Chan exposes the task's underlying unbounded channel.
func (c *JobTask[T]) Chan() *async.UnboundChannel[T] {
	return c.taskChan
}

// ID returns the task's unique identifier.
func (c *JobTask[T]) ID() string {
	return c.id
}

View File

@ -0,0 +1,467 @@
package services
import (
"errors"
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
sch "gitlink.org.cn/cloudream/common/sdks/pcmscheduler"
"gitlink.org.cn/cloudream/common/sdks/storage/cdsapi"
uploadersdk "gitlink.org.cn/cloudream/common/sdks/uploader"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/event"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr/job/state2"
"sort"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
)
// JobSetService exposes the job-set level operations of the scheduler
// service.
type JobSetService struct {
	*Service
}

// JobSetSvc returns the job-set view of this service.
func (svc *Service) JobSetSvc() *JobSetService {
	return &JobSetService{Service: svc}
}
// PreScheduler pre-schedules a whole job set against every known computing
// center, producing the schedule scheme and the files-upload scheme.
func (svc *JobSetService) PreScheduler(jobSet schsdk.JobSetInfo) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) {
	centers, err := svc.db.ComputingCenter().GetAll(svc.db.DefCtx())
	if err != nil {
		logger.Warnf("getting all computing center: %s", err.Error())
		return nil, nil, err
	}

	scheme, uploadScheme, err := svc.preScheduler.ScheduleJobSet(&jobSet, centers)
	if err != nil {
		return nil, nil, fmt.Errorf("pre scheduling: %w", err)
	}
	return scheme, uploadScheme, nil
}
// Upload creates an upload job for the given user.
// Target storages are resolved either by pre-scheduling (preference
// priorities) or from an explicitly specified cluster list; a job whose
// initial state performs the actual data upload is then submitted.
// Returns the new job set ID together with the chosen storage IDs.
func (svc *JobSetService) Upload(userID cdssdk.UserID, params sch.UploadParams) (*schsdk.JobSetID, *[]cdssdk.StorageID, error) {
	logger.Debugf("uploading job")

	// Mapping from cluster ID to the storage backing that cluster.
	clusterMapping, err := svc.db.UploadData().GetClusterMapping(svc.db.DefCtx())
	if err != nil {
		return nil, nil, fmt.Errorf("query cluster mapping error: %w", err)
	}

	var storages []cdssdk.StorageID
	switch uploadPriority := params.UploadPriority.(type) {
	case *sch.Preferences:
		// Run pre-scheduling to pick a single cluster.
		clusterID, err := svc.preScheduler.ScheduleJob(uploadPriority.ResourcePriorities, clusterMapping)
		if err != nil {
			return nil, nil, fmt.Errorf("pre scheduling: %w", err)
		}
		storageID, ok := clusterMapping[*clusterID]
		if !ok {
			// Fix: dereference the pointer so %d prints the cluster ID,
			// not the pointer value.
			return nil, nil, fmt.Errorf("cluster %d not found", *clusterID)
		}
		storages = append(storages, storageID)
	case *sch.SpecifyCluster:
		// Explicitly specified clusters; unknown ones are skipped with a warning.
		for _, clusterID := range uploadPriority.Clusters {
			storageID, ok := clusterMapping[clusterID]
			if !ok {
				logger.Warnf("cluster %d not found", clusterID)
				continue
			}
			storages = append(storages, storageID)
		}
	}
	if len(storages) == 0 {
		return nil, nil, errors.New("no storage is available")
	}

	var jobs []jobmgr.SubmittingJob
	jo := job.NewNormalJob(schsdk.NormalJobInfo{})
	jobs = append(jobs, jobmgr.SubmittingJob{
		Body:      jo,
		InitState: state2.NewDataUpload(userID, params.UploadInfo, params.DataType, storages),
	})

	jobSetID := svc.jobMgr.SubmitJobSet(jobs)
	return &jobSetID, &storages, nil
}
// Submit submits a job set. Each job description is translated into a job
// body plus its initial state-machine state; job kinds that need a
// pre-schedule scheme look it up in schScheme by their local job ID and
// fail fast when it is missing.
func (svc *JobSetService) Submit(jobSet schsdk.JobSetInfo, schScheme *jobmod.JobSetPreScheduleScheme) (*schsdk.JobSetID, error) {
	logger.Debugf("submitting job")

	var jobs []jobmgr.SubmittingJob
	for _, jobInfo := range jobSet.Jobs {
		switch info := jobInfo.(type) {
		case *schsdk.PCMJobInfo:
			jo := job.NewPCMJob(*info)
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state2.NewPCMJobCreate(info),
			})

		case *schsdk.NormalJobInfo:
			jo := job.NewNormalJob(*info)
			jo.SubType = schsdk.JobTypeNormal
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body: jo,
			})

		case *schsdk.DataReturnJobInfo:
			jo := job.NewDataReturnJob(*info)
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewWaitTargetComplete(),
			})

		case *schsdk.MultiInstanceJobInfo:
			// Fix: validate the scheme exists BEFORE using it to build the
			// job (previously NewMultiInstanceJob consumed the zero value).
			preSch, ok := schScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}
			jo := job.NewMultiInstanceJob(*info, preSch)
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewMultiInstanceInit(),
			})

		case *schsdk.UpdateMultiInstanceJobInfo:
			modelJob := job.NewUpdateMultiInstanceJob(*info)
			instanceJobSets := svc.jobMgr.DumpJobSet(modelJob.Info.MultiInstanceJobSetID)
			if len(instanceJobSets) == 0 {
				return nil, fmt.Errorf("job set %s is not found", modelJob.Info.MultiInstanceJobSetID)
			}

			// Locate the multi-instance job itself among the dumped jobs.
			var multiInstanceJobDump jobmod.JobDump
			for i := 0; i < len(instanceJobSets); i++ {
				jobDump := instanceJobSets[i]
				if _, ok := jobDump.Body.(*jobmod.MultiInstanceJobDump); ok {
					multiInstanceJobDump = jobDump
					break
				}
			}
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      modelJob,
				InitState: state.NewMultiInstanceUpdate(multiInstanceJobDump),
			})

		case *schsdk.DataPreprocessJobInfo:
			// The scheduling flow afterwards is identical to a NormalJob.
			normalJobInfo := &schsdk.NormalJobInfo{
				Type:        schsdk.JobTypeNormal,
				JobInfoBase: info.JobInfoBase,
				Files:       info.Files,
				Runtime:     info.Runtime,
				Services:    info.Services,
				Resources:   info.Resources,
			}
			jo := job.NewNormalJob(*normalJobInfo)
			jo.SubType = schsdk.JobTypeDataPreprocess

			preSch, ok := schScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewPreSchuduling(preSch),
			})

		case *schsdk.FinetuningJobInfo:
			// The scheduling flow afterwards is identical to a NormalJob.
			normalJobInfo := &schsdk.NormalJobInfo{
				Type:         schsdk.JobTypeNormal,
				Files:        info.Files,
				JobInfoBase:  info.JobInfoBase,
				Runtime:      info.Runtime,
				Services:     info.Services,
				Resources:    info.Resources,
				ModelJobInfo: info.ModelJobInfo,
			}
			jo := job.NewNormalJob(*normalJobInfo)
			jo.SubType = schsdk.JobTypeFinetuning

			preSch, ok := schScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}
			jobs = append(jobs, jobmgr.SubmittingJob{
				Body:      jo,
				InitState: state.NewPreSchuduling(preSch),
			})
		}
	}

	jobSetID := svc.jobMgr.SubmitJobSet(jobs)
	return &jobSetID, nil
}
// LocalFileUploaded broadcasts to every job in the set that one local file
// finished uploading (or failed, when errMsg is non-empty).
// Fix: only construct an error when errMsg is non-empty — previously
// errors.New("") produced a non-nil error on the success path, so listeners
// checking err != nil would treat every upload as failed.
func (svc *JobSetService) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, errMsg string, packageID cdssdk.PackageID, objectIDs []cdssdk.ObjectID) {
	var err error
	if errMsg != "" {
		err = errors.New(errMsg)
	}
	svc.jobMgr.BroadcastEvent(jobSetID, event.NewLocalFileUploaded(localPath, err, packageID, objectIDs))
}
// CreateFolder records a (virtual) folder entry for the package in the
// scheduler database. Folders exist only as DB records; no object is
// created in storage.
func (svc *JobSetService) CreateFolder(packageID cdssdk.PackageID, path string) error {
	// svc already embeds *Service — no need to round-trip through JobSetSvc().
	return svc.db.UploadData().InsertFolder(svc.db.DefCtx(), packageID, path)
}
// DeleteFile removes the given objects from cloudream storage on behalf of
// the user.
func (svc *JobSetService) DeleteFile(userID cdssdk.UserID, objectIDs []cdssdk.ObjectID) error {
	client, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(client)

	req := cdsapi.ObjectDelete{
		ObjectIDs: objectIDs,
		UserID:    userID,
	}
	if err := client.Object().Delete(req); err != nil {
		return fmt.Errorf("failed to delete object: %w", err)
	}
	return nil
}
// DeleteFolder deletes every object under the given path prefix of the
// package from cloudream storage, then removes the folder record from the
// scheduler database.
func (svc *JobSetService) DeleteFolder(userID cdssdk.UserID, packageID cdssdk.PackageID, path string) error {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	list, err := cdsCli.Object().List(cdsapi.ObjectList{
		UserID:    userID,
		PackageID: packageID,
		Path:      path,
		IsPrefix:  true,
	})
	if err != nil {
		// Fix: this failure was previously wrapped as "failed to delete object".
		return fmt.Errorf("failed to list objects: %w", err)
	}

	if len(list.Objects) > 0 {
		objectIDs := make([]cdssdk.ObjectID, 0, len(list.Objects))
		for _, obj := range list.Objects {
			objectIDs = append(objectIDs, obj.ObjectID)
		}
		err = cdsCli.Object().Delete(cdsapi.ObjectDelete{
			ObjectIDs: objectIDs,
			UserID:    userID,
		})
		if err != nil {
			return fmt.Errorf("failed to delete object: %w", err)
		}
	}

	// Remove the folder record itself from the database.
	if err := svc.db.UploadData().DeleteFolder(svc.db.DefCtx(), packageID, path); err != nil {
		return fmt.Errorf("failed to delete folder record: %w", err)
	}
	return nil
}
// QueryUploaded lists the uploaded content of a package.
// A PackageID of -1 means "list the root": only the package records
// themselves are returned. Otherwise the object list from cloudream storage
// is merged with folder records from the database, sorted by the requested
// field, paginated, and returned inside the enclosing package record.
// Returns (packages, totalPages, totalNum, error).
func (svc *JobSetService) QueryUploaded(queryParams sch.QueryData) ([]uploadersdk.Package, int, int, error) {
	// Root-directory query: package-level records only.
	if queryParams.PackageID == -1 {
		packages, err := svc.db.UploadData().QueryPackage(svc.db.DefCtx(), queryParams)
		if err != nil {
			return nil, 0, 0, fmt.Errorf("failed to query uploaded data: %w", err)
		}
		return packages, 0, 0, nil
	}

	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return nil, 0, 0, fmt.Errorf("new scheduler client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	queryListReq := cdsapi.ObjectList{
		UserID:    queryParams.UserID,
		PackageID: queryParams.PackageID,
		Path:      queryParams.Path,
		IsPrefix:  true,
	}
	objList, err := cdsCli.Object().List(queryListReq)
	if err != nil {
		return nil, 0, 0, fmt.Errorf("failed to query uploaded data: %w", err)
	}

	folders, err := svc.db.UploadData().QueryFolder(svc.db.DefCtx(), queryParams)
	if err != nil {
		return nil, 0, 0, fmt.Errorf("failed to query uploaded data: %w", err)
	}
	// Folders live only in the DB; surface them as pseudo-objects with
	// ObjectID -1 and size 0 so they sort and paginate together with files.
	for _, folder := range folders {
		objList.Objects = append(objList.Objects, cdssdk.Object{
			ObjectID:   -1,
			PackageID:  folder.PackageID,
			Path:       folder.Path,
			Size:       0,
			CreateTime: folder.CreateTime,
		})
	}

	// Sort according to the requested order-by field.
	sort.Slice(objList.Objects, func(i, j int) bool {
		switch queryParams.OrderBy {
		case sch.OrderByName:
			return objList.Objects[i].Path < objList.Objects[j].Path
		case sch.OrderBySize:
			return objList.Objects[i].Size < objList.Objects[j].Size
		case sch.OrderByTime:
			return objList.Objects[i].CreateTime.Unix() < objList.Objects[j].CreateTime.Unix()
		}
		return false
	})

	totalNum := len(objList.Objects)

	// Fix: the page count was computed outside the PageSize guard, so a
	// PageSize of 0 caused a divide-by-zero panic. Also round up so a
	// partial final page is counted, and clamp a negative start offset.
	totalPages := 0
	if queryParams.PageSize > 0 {
		totalPages = (totalNum + queryParams.PageSize - 1) / queryParams.PageSize

		start := (queryParams.CurrentPage - 1) * queryParams.PageSize
		if start < 0 {
			start = 0
		}
		if start >= totalNum {
			return nil, 0, 0, nil
		}
		end := start + queryParams.PageSize
		if end > totalNum {
			end = totalNum
		}
		objList.Objects = objList.Objects[start:end]
	}

	data, err := svc.db.UploadData().QueryPackageByID(svc.db.DefCtx(), queryParams.PackageID)
	if err != nil {
		return nil, 0, 0, err
	}
	pkg := uploadersdk.Package{
		PackageID:       data.PackageID,
		PackageName:     data.PackageName,
		JsonData:        data.JsonData,
		BindingID:       data.BindingID,
		UserID:          data.UserID,
		Objects:         objList.Objects,
		UploadedCluster: data.UploadedCluster,
	}
	return []uploadersdk.Package{pkg}, totalPages, totalNum, nil
}
// DataBinding stores (or updates) the binding record and attaches it to
// every listed package.
func (svc *JobSetService) DataBinding(bindingData uploadersdk.BindingData, packageIDs []cdssdk.PackageID) error {
	if err := svc.db.UploadData().InsertOrUpdateBinding(svc.db.DefCtx(), bindingData); err != nil {
		return err
	}
	for _, packageID := range packageIDs {
		if err := svc.db.UploadData().UpdatePackage(svc.db.DefCtx(), packageID, "", bindingData.ID); err != nil {
			return err
		}
	}
	return nil
}
// RemoveBinding detaches the binding from every listed package by resetting
// each package's binding ID to -1.
func (svc *JobSetService) RemoveBinding(packageIDs []cdssdk.PackageID) error {
	for _, packageID := range packageIDs {
		if err := svc.db.UploadData().UpdatePackage(svc.db.DefCtx(), packageID, "", uploadersdk.DataID(-1)); err != nil {
			return err
		}
	}
	return nil
}
// CreatePackage creates a new package in cloudream storage and archives the
// corresponding record in the scheduler database.
func (svc *JobSetService) CreatePackage(userID cdssdk.UserID, name string, dataType string) error {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	// Create the package in storage.
	// NOTE(review): BucketID is hard-coded to 1 — confirm this is intended.
	created, err := cdsCli.Package().Create(cdsapi.PackageCreate{
		UserID:   userID,
		BucketID: 1,
		Name:     name,
	})
	if err != nil {
		return fmt.Errorf("failed to create package: %w", err)
	}

	// Archive the package record in the database.
	record := uploadersdk.Package{
		UserID:      userID,
		PackageID:   created.Package.PackageID,
		PackageName: name,
		DataType:    dataType,
	}
	if err := svc.db.UploadData().InsertPackage(svc.db.DefCtx(), record); err != nil {
		return err
	}
	return nil
}
// DeletePackage removes the package from cloudream storage and then from
// the scheduler database.
func (svc *JobSetService) DeletePackage(userID cdssdk.UserID, packageID cdssdk.PackageID) error {
	cdsCli, err := schglb.CloudreamStoragePool.Acquire()
	if err != nil {
		return fmt.Errorf("new cds client: %w", err)
	}
	defer schglb.CloudreamStoragePool.Release(cdsCli)

	// Fix: the delete error was previously assigned but never checked, so
	// a storage-side failure was silently ignored.
	err = cdsCli.Package().Delete(cdsapi.PackageDelete{
		UserID:    userID,
		PackageID: packageID,
	})
	if err != nil {
		return fmt.Errorf("failed to delete package: %w", err)
	}

	return svc.db.UploadData().DeletePackage(svc.db.DefCtx(), userID, packageID)
}

View File

@ -0,0 +1,21 @@
package services
import (
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler2"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
)
// Service bundles the shared dependencies used by the service layer:
// the pre-scheduler, the job manager and the database handle.
type Service struct {
preScheduler prescheduler2.PreScheduler
jobMgr *jobmgr.Manager
db *db.DB
}
// NewService wires the pre-scheduler, the job manager and the database
// handle into a Service. The error return is always nil today but is kept
// for signature stability.
func NewService(preScheduler prescheduler2.PreScheduler, jobMgr *jobmgr.Manager, db *db.DB) (*Service, error) {
	svc := &Service{
		preScheduler: preScheduler,
		jobMgr:       jobMgr,
		db:           db,
	}
	return svc, nil
}

View File

@ -0,0 +1,57 @@
package main
import (
"fmt"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/prescheduler2"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/cmdline"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/config"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/manager/jobmgr"
"gitlink.org.cn/cloudream/scheduler/schedulerMiddleware/internal/services"
"os"
)
// main wires together configuration, logging, the global SDK client pools,
// the database, the pre-scheduler and the job manager, then dispatches the
// command given on the command line.
func main() {
	err := config.Init()
	if err != nil {
		fmt.Printf("init config failed, err: %s", err.Error())
		os.Exit(1)
	}

	err = logger.Init(&config.Cfg().Logger)
	if err != nil {
		fmt.Printf("init logger failed, err: %s", err.Error())
		os.Exit(1)
	}

	// Global client pools shared across the scheduler middleware.
	schglb.InitPCMSchePool(&config.Cfg().PCMScheduler)
	schglb.InitUploaderPool(&config.Cfg().Uploader)
	schglb.InitBlockChainPool(&config.Cfg().BlockChain)
	schglb.InitCloudreamStoragePool(&config.Cfg().CloudreamStorage)

	dbSvc, err := db.NewDB(&config.Cfg().DB)
	if err != nil {
		// Fix: the old message said "new db2 failed" although this opens
		// the primary DB config, which was misleading in logs.
		logger.Fatalf("new db failed, err: %s", err.Error())
	}

	preSchr := prescheduler2.NewDefaultPreScheduler()
	nodeSvc := jobmgr.NewNodeService()

	jobMgr, err := jobmgr.NewManager(dbSvc, nodeSvc)
	if err != nil {
		logger.Fatalf("new job manager failed, err: %s", err.Error())
	}

	svc, err := services.NewService(preSchr, jobMgr, dbSvc)
	if err != nil {
		logger.Fatalf("new service failed, err: %s", err.Error())
	}

	cmds, err := cmdline.NewCommandline(svc)
	if err != nil {
		logger.Warnf("new command line failed, err: %s", err.Error())
		os.Exit(1)
	}

	cmds.DispatchCommand(os.Args[1:])
}