优化代码
This commit is contained in:
parent
d6b47ff5fc
commit
cca6452879
|
@ -28,8 +28,8 @@
|
||||||
"createECS": {
|
"createECS": {
|
||||||
"cloud": "AliCloud",
|
"cloud": "AliCloud",
|
||||||
"auth_config": {
|
"auth_config": {
|
||||||
"AccessKeyId": "LTAI5tJBqN3uRnzXeiiXTxkT",
|
"AccessKeyId": "xxx",
|
||||||
"AccessKeySecret": "dilS4SJ0I3SMWtY7h1ByHe3MOULuGA"
|
"AccessKeySecret": "xxx"
|
||||||
},
|
},
|
||||||
"ecs_config": {
|
"ecs_config": {
|
||||||
"DryRun": false,
|
"DryRun": false,
|
||||||
|
@ -52,8 +52,8 @@
|
||||||
"createECS-hw": {
|
"createECS-hw": {
|
||||||
"cloud": "HuaweiCloud",
|
"cloud": "HuaweiCloud",
|
||||||
"auth_config": {
|
"auth_config": {
|
||||||
"AccessKeyId": "LTAI5tMraAgfzhrF4PF79Js4",
|
"AccessKeyId": "xxx",
|
||||||
"AccessKeySecret": "aWTkvrBWZt58kvpop7MNwTDMinJFtj"
|
"AccessKeySecret": "xxx"
|
||||||
},
|
},
|
||||||
"ecs_config": {
|
"ecs_config": {
|
||||||
"Region": "cn-hangzhou"
|
"Region": "cn-hangzhou"
|
||||||
|
@ -62,9 +62,9 @@
|
||||||
"createECS-sugon": {
|
"createECS-sugon": {
|
||||||
"cloud": "SugonCloud",
|
"cloud": "SugonCloud",
|
||||||
"auth_config": {
|
"auth_config": {
|
||||||
"user": "acgnnmfbwo",
|
"user": "xxx",
|
||||||
"password": "Pcl@2020",
|
"password": "xxx",
|
||||||
"orgid": "c8befbc1301665ba2dc5b2826f8dca1e",
|
"orgid": "xxx",
|
||||||
"clusterName": "华东一区【昆山】"
|
"clusterName": "华东一区【昆山】"
|
||||||
},
|
},
|
||||||
"ecs_config": {
|
"ecs_config": {
|
||||||
|
|
|
@ -39,9 +39,9 @@ type JobFiles struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type PackageJobFile struct {
|
type PackageJobFile struct {
|
||||||
PackageID cdssdk.PackageID `json:"packageID"`
|
PackageID cdssdk.PackageID `json:"packageID"`
|
||||||
PackagePath string `json:"packagePath"` // Load之后的文件路径,一个相对路径,需要加上CDS数据库中的RemoteBase才是完整路径
|
PackagePath string `json:"packagePath"` // Load之后的文件路径,一个相对路径,需要加上CDS数据库中的RemoteBase才是完整路径
|
||||||
ECSInstanceID schsdk.ECSInstanceID
|
ECSInstanceID schsdk.ECSInstanceID // TODO 这个实例ID暂时放在这里,后续会修改
|
||||||
}
|
}
|
||||||
|
|
||||||
type ImageJobFile struct {
|
type ImageJobFile struct {
|
||||||
|
|
|
@ -5,6 +5,7 @@ import (
|
||||||
"encoding/hex"
|
"encoding/hex"
|
||||||
"fmt"
|
"fmt"
|
||||||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||||||
|
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
|
||||||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
@ -72,35 +73,35 @@ func GetRcloneCommands(storage schmod.ObjectStorage, userID cdssdk.UserID, mount
|
||||||
var commands []string
|
var commands []string
|
||||||
|
|
||||||
// 下载Rclone
|
// 下载Rclone
|
||||||
//commandContent := "yum install -y fuse3"
|
commandContent := "yum install -y fuse3"
|
||||||
//commands = append(commands, commandContent)
|
commands = append(commands, commandContent)
|
||||||
//commandContent = "cd /opt && downloadCode='import requests;response=requests.get(\"@url@\",stream=True);response.raise_for_status();boundary=response.headers.get(\"Content-Type\").split(\"boundary=\")[-1].encode();content=response.content;body=[part.split(b\"\\r\\n\\r\\n\",1)[1].rsplit(b\"\\r\\n--\",1)[0] for part in content.split(b\"--\"+boundary+b\"\\r\\n\") if b\"filename=\" in part][0];open(\"@filename@\",\"wb\").write(body);print(\"success\")' && rclone=\"$cds_url/object/download?userID=$userID&objectID=$rcloneID\" && python3 -c \"$(echo \"$downloadCode\" | sed -e \"s|@url@|$(printf '%s' \"$rclone\" | sed 's/[&/\\]/\\\\&/g')|\" -e \"s|@filename@|rclone|\")\" && chmod +x rclone"
|
commandContent = "cd /opt && downloadCode='import requests;response=requests.get(\"@url@\",stream=True);response.raise_for_status();boundary=response.headers.get(\"Content-Type\").split(\"boundary=\")[-1].encode();content=response.content;body=[part.split(b\"\\r\\n\\r\\n\",1)[1].rsplit(b\"\\r\\n--\",1)[0] for part in content.split(b\"--\"+boundary+b\"\\r\\n\") if b\"filename=\" in part][0];open(\"@filename@\",\"wb\").write(body);print(\"success\")' && rclone=\"$cds_url/object/download?userID=$userID&objectID=$rcloneID\" && python3 -c \"$(echo \"$downloadCode\" | sed -e \"s|@url@|$(printf '%s' \"$rclone\" | sed 's/[&/\\]/\\\\&/g')|\" -e \"s|@filename@|rclone|\")\" && chmod +x rclone"
|
||||||
//commandContent = strings.Replace(commandContent, "$cds_url", schglb.CloudreamStorageConfig.URL, -1)
|
commandContent = strings.Replace(commandContent, "$cds_url", schglb.CloudreamStorageConfig.URL, -1)
|
||||||
//commandContent = strings.Replace(commandContent, "$rcloneID", schglb.CDSRclone.CDSRcloneID, -1)
|
commandContent = strings.Replace(commandContent, "$rcloneID", schglb.CDSRclone.CDSRcloneID, -1)
|
||||||
//commandContent = strings.Replace(commandContent, "$userID", strconv.FormatInt(int64(userID), 10), -1)
|
commandContent = strings.Replace(commandContent, "$userID", strconv.FormatInt(int64(userID), 10), -1)
|
||||||
//commands = append(commands, commandContent)
|
commands = append(commands, commandContent)
|
||||||
//
|
|
||||||
//// 生成Rclone配置文件
|
|
||||||
//commandContent = "echo -e '[@tagName@] \n type = s3 \n provider = @provider@ \n access_key_id = @ak@ \n secret_access_key = @sk@ \n endpoint = @endpoint@ \n storage_class = STANDARD' > /opt/rclone.conf"
|
|
||||||
tagName := storage.Bucket + "_" + storage.AK
|
|
||||||
//commandContent = strings.Replace(commandContent, "@tagName@", tagName, -1)
|
|
||||||
//commandContent = strings.Replace(commandContent, "@provider@", storage.Manufacturer, -1)
|
|
||||||
//commandContent = strings.Replace(commandContent, "@ak@", storage.AK, -1)
|
|
||||||
//commandContent = strings.Replace(commandContent, "@sk@", storage.SK, -1)
|
|
||||||
//commandContent = strings.Replace(commandContent, "@endpoint@", storage.Endpoint, -1)
|
|
||||||
//commands = append(commands, commandContent)
|
|
||||||
|
|
||||||
umountCommand := "umount -l /mnt/oss"
|
// 生成Rclone配置文件
|
||||||
commands = append(commands, umountCommand)
|
commandContent = "echo -e '[@tagName@] \n type = s3 \n provider = @provider@ \n access_key_id = @ak@ \n secret_access_key = @sk@ \n endpoint = @endpoint@ \n storage_class = STANDARD' > /opt/rclone.conf"
|
||||||
|
tagName := storage.Bucket + "_" + storage.AK
|
||||||
|
commandContent = strings.Replace(commandContent, "@tagName@", tagName, -1)
|
||||||
|
commandContent = strings.Replace(commandContent, "@provider@", storage.Manufacturer, -1)
|
||||||
|
commandContent = strings.Replace(commandContent, "@ak@", storage.AK, -1)
|
||||||
|
commandContent = strings.Replace(commandContent, "@sk@", storage.SK, -1)
|
||||||
|
commandContent = strings.Replace(commandContent, "@endpoint@", storage.Endpoint, -1)
|
||||||
|
commands = append(commands, commandContent)
|
||||||
|
|
||||||
|
//umountCommand := "umount -l /mnt/oss"
|
||||||
|
//commands = append(commands, umountCommand)
|
||||||
// 挂载Rclone
|
// 挂载Rclone
|
||||||
commandContent := "mkdir -p @mountDir@ && cd /opt && nohup ./rclone mount @tagName@:@bucket@ @mountDir@ --vfs-cache-mode full --vfs-read-wait 0 --vfs-read-chunk-size 128M --cache-db-purge -vv > rcloneMount.log 2>&1 &"
|
commandContent = "mkdir -p @mountDir@ && cd /opt && nohup ./rclone mount @tagName@:@bucket@ @mountDir@ --vfs-cache-mode full --vfs-read-wait 0 --vfs-read-chunk-size 128M --cache-db-purge -vv > rcloneMount.log 2>&1 &"
|
||||||
commandContent = strings.Replace(commandContent, "@tagName@", tagName, -1)
|
commandContent = strings.Replace(commandContent, "@tagName@", tagName, -1)
|
||||||
commandContent = strings.Replace(commandContent, "@bucket@", storage.Bucket, -1)
|
commandContent = strings.Replace(commandContent, "@bucket@", storage.Bucket, -1)
|
||||||
commandContent = strings.Replace(commandContent, "@mountDir@", mountDir, -1)
|
commandContent = strings.Replace(commandContent, "@mountDir@", mountDir, -1)
|
||||||
commands = append(commands, commandContent)
|
commands = append(commands, commandContent)
|
||||||
|
|
||||||
//commandContent = "cd /opt && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda3 && eval \"$($HOME/miniconda3/bin/conda shell.bash hook)\" && conda create -n myenv python=3.10 -y"
|
commandContent = "cd /opt && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b -p $HOME/miniconda3 && eval \"$($HOME/miniconda3/bin/conda shell.bash hook)\" && conda create -n myenv python=3.10 -y"
|
||||||
//commands = append(commands, commandContent)
|
commands = append(commands, commandContent)
|
||||||
|
|
||||||
return commands
|
return commands
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,7 +12,6 @@ import (
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// AliCloud实现了CloudProvider接口
|
|
||||||
type AliCloud struct{}
|
type AliCloud struct{}
|
||||||
|
|
||||||
var aliclient = &ecs.Client{}
|
var aliclient = &ecs.Client{}
|
||||||
|
@ -36,11 +35,8 @@ func AliConfig(authConfigs map[string]interface{}, ecsConfigs map[string]interfa
|
||||||
}
|
}
|
||||||
|
|
||||||
config := &openapi.Config{}
|
config := &openapi.Config{}
|
||||||
// 您的AccessKey ID
|
|
||||||
config.AccessKeyId = tea.String(authConfigs["AccessKeyId"].(string))
|
config.AccessKeyId = tea.String(authConfigs["AccessKeyId"].(string))
|
||||||
// 您的AccessKey Secret
|
|
||||||
config.AccessKeySecret = tea.String(authConfigs["AccessKeySecret"].(string))
|
config.AccessKeySecret = tea.String(authConfigs["AccessKeySecret"].(string))
|
||||||
// 您的可用区ID
|
|
||||||
config.RegionId = requestParam.RegionId
|
config.RegionId = requestParam.RegionId
|
||||||
aliclient, _ = ecs.NewClient(config)
|
aliclient, _ = ecs.NewClient(config)
|
||||||
}
|
}
|
||||||
|
@ -159,7 +155,6 @@ func (a *AliCloud) StartInstances(instanceID string) (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func runShellCommand(commandContent string, instanceID string, regionId string) (*string, error) {
|
func runShellCommand(commandContent string, instanceID string, regionId string) (*string, error) {
|
||||||
// 从CDS下载文件
|
|
||||||
commandRequest := ecs.RunCommandRequest{
|
commandRequest := ecs.RunCommandRequest{
|
||||||
InstanceId: []*string{&instanceID},
|
InstanceId: []*string{&instanceID},
|
||||||
CommandContent: tea.String(commandContent),
|
CommandContent: tea.String(commandContent),
|
||||||
|
|
|
@ -2,7 +2,6 @@ package create_ecs
|
||||||
|
|
||||||
import schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
import schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||||||
|
|
||||||
// CloudProvider 是一个接口,定义了创建服务器的方法
|
|
||||||
type CloudProvider interface {
|
type CloudProvider interface {
|
||||||
CreateServer() (string, string, error)
|
CreateServer() (string, string, error)
|
||||||
RunCommand(commands []string, instanceID string, timeout int) (string, error)
|
RunCommand(commands []string, instanceID string, timeout int) (string, error)
|
||||||
|
@ -17,14 +16,12 @@ type CloudFactory interface {
|
||||||
CreateProvider() CloudProvider
|
CreateProvider() CloudProvider
|
||||||
}
|
}
|
||||||
|
|
||||||
// HuaweiCloudFactory 实现了CloudFactory接口
|
|
||||||
type HuaweiCloudFactory struct{}
|
type HuaweiCloudFactory struct{}
|
||||||
|
|
||||||
func (f *HuaweiCloudFactory) CreateProvider() CloudProvider {
|
func (f *HuaweiCloudFactory) CreateProvider() CloudProvider {
|
||||||
return &HuaweiCloud{}
|
return &HuaweiCloud{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// AliCloudFactory 实现了CloudFactory接口
|
|
||||||
type AliCloudFactory struct{}
|
type AliCloudFactory struct{}
|
||||||
|
|
||||||
func (f *AliCloudFactory) CreateProvider() CloudProvider {
|
func (f *AliCloudFactory) CreateProvider() CloudProvider {
|
||||||
|
|
|
@ -42,8 +42,8 @@ func (t *ScheduleCreateECS) do(task *Task, ctx TaskContext) error {
|
||||||
// 创建云主机
|
// 创建云主机
|
||||||
factory := create_ecs.GetFactory(config.CloudName)
|
factory := create_ecs.GetFactory(config.CloudName)
|
||||||
provider := factory.CreateProvider()
|
provider := factory.CreateProvider()
|
||||||
//instanceID, ecsIP, err := provider.CreateServer()
|
instanceID, ecsIP, err := provider.CreateServer()
|
||||||
instanceID, ecsIP, err := "i-bp16imo8en907iy1oixd", "120.55.45.90", error(nil)
|
//instanceID, ecsIP, err := "i-bp16imo8en907iy1oixd", "120.55.45.90", error(nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.CreateECS, err.Error()))
|
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.CreateECS, err.Error()))
|
||||||
return err
|
return err
|
||||||
|
@ -74,18 +74,18 @@ func (t *ScheduleCreateECS) do(task *Task, ctx TaskContext) error {
|
||||||
commands = append(commands, utils.HandleCommand(startScript))
|
commands = append(commands, utils.HandleCommand(startScript))
|
||||||
|
|
||||||
// 安装依赖包,用于获取GPU信息
|
// 安装依赖包,用于获取GPU信息
|
||||||
//commandContent := getPipCommand()
|
commandContent := getPipCommand()
|
||||||
//commands = append(commands, commandContent)
|
commands = append(commands, commandContent)
|
||||||
|
|
||||||
// 获取用户输入的命令
|
// 获取用户输入的命令
|
||||||
arr := utils.SplitCommands(t.Command)
|
arr := utils.SplitCommands(t.Command)
|
||||||
commands = append(commands, arr...)
|
commands = append(commands, arr...)
|
||||||
|
|
||||||
// 执行命令
|
// 执行命令
|
||||||
//_, err = provider.RunCommand(commands, instanceID, 2000)
|
_, err = provider.RunCommand(commands, instanceID, 2000)
|
||||||
//if err != nil {
|
if err != nil {
|
||||||
// logger.Error("run command error: " + err.Error())
|
logger.Error("run command error: " + err.Error())
|
||||||
//}
|
}
|
||||||
|
|
||||||
address := "http://" + ecsIP + ":" + strconv.FormatInt(t.ModelResource.ServerPort, 10) + "/" + t.ModelResource.ServerUrlPath
|
address := "http://" + ecsIP + ":" + strconv.FormatInt(t.ModelResource.ServerPort, 10) + "/" + t.ModelResource.ServerUrlPath
|
||||||
if config.CloudName == schmod.SugonCloud {
|
if config.CloudName == schmod.SugonCloud {
|
||||||
|
@ -128,15 +128,15 @@ func (t *ScheduleCreateECS) do(task *Task, ctx TaskContext) error {
|
||||||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.PauseECS, ""))
|
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.PauseECS, ""))
|
||||||
case schsdk.DestroyECS:
|
case schsdk.DestroyECS:
|
||||||
logger.Info("destroy ecs")
|
logger.Info("destroy ecs")
|
||||||
//_, err := provider.DeleteInstance(instanceID)
|
_, err := provider.DeleteInstance(instanceID)
|
||||||
//if err != nil {
|
if err != nil {
|
||||||
// task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
||||||
// continue
|
continue
|
||||||
//}
|
}
|
||||||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.DestroyECS, ""))
|
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.DestroyECS, ""))
|
||||||
break
|
break
|
||||||
case schsdk.RestartServer:
|
case schsdk.RestartServer:
|
||||||
commandContent := utils.RemountRclone(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
commandContent = utils.RemountRclone(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
||||||
info.Runtime.Command = info.Runtime.Command + "\n" + commandContent
|
info.Runtime.Command = info.Runtime.Command + "\n" + commandContent
|
||||||
commandContent = schsdk.MountDir + "/" + t.ModelResource.StopShellPath
|
commandContent = schsdk.MountDir + "/" + t.ModelResource.StopShellPath
|
||||||
info.Runtime.Command = info.Runtime.Command + "\n" + utils.HandleCommand(commandContent)
|
info.Runtime.Command = info.Runtime.Command + "\n" + utils.HandleCommand(commandContent)
|
||||||
|
|
|
@ -62,8 +62,8 @@ func (t *SchedulerDataPreprocess) do(task *Task, ctx TaskContext) error {
|
||||||
provider := factory.CreateProvider()
|
provider := factory.CreateProvider()
|
||||||
|
|
||||||
// 创建服务器
|
// 创建服务器
|
||||||
//instanceID, ecsIP, err := provider.CreateServer()
|
instanceID, ecsIP, err := provider.CreateServer()
|
||||||
instanceID, ecsIP, err := "i-bp16imo8en907iy1oixd", "120.55.45.90", error(nil)
|
//instanceID, ecsIP, err := "i-bp16imo8en907iy1oixd", "120.55.45.90", error(nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
task.SendStatus(exectsk.NewSchedulerDataPreprocessStatus("", err))
|
task.SendStatus(exectsk.NewSchedulerDataPreprocessStatus("", err))
|
||||||
return err
|
return err
|
||||||
|
@ -89,39 +89,20 @@ func getDataPreprocessCommands(envs []schsdk.KVPair, inferencePlatform schsdk.In
|
||||||
|
|
||||||
var commands []string
|
var commands []string
|
||||||
|
|
||||||
// 获取当前工作目录
|
// 读取预置的脚本
|
||||||
currentDir, err := filepath.Abs(".")
|
currentDir, err := filepath.Abs(".")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println("Error getting current directory:", err)
|
fmt.Println("Error getting current directory:", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
parentDir := filepath.Dir(currentDir)
|
parentDir := filepath.Dir(currentDir)
|
||||||
|
fileName := "./scripts/data_preprocess.py"
|
||||||
// 指定要读取的文件名
|
|
||||||
fileName := "example.txt" // 替换为你要读取的文件名
|
|
||||||
|
|
||||||
// 构造完整路径
|
|
||||||
filePath := filepath.Join(parentDir, fileName)
|
filePath := filepath.Join(parentDir, fileName)
|
||||||
|
|
||||||
// 读取文件
|
|
||||||
data, err := ioutil.ReadFile(filePath)
|
data, err := ioutil.ReadFile(filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Println("Error reading file:", err)
|
fmt.Println("Error reading file:", err)
|
||||||
}
|
}
|
||||||
|
fileContent := string(data)
|
||||||
|
|
||||||
// 输出文件内容
|
|
||||||
fmt.Println("File content:")
|
|
||||||
fmt.Println(string(data))
|
|
||||||
content := string(data)
|
|
||||||
|
|
||||||
// 读取文件
|
|
||||||
//content, err := ioutil.ReadFile("D:\\Work\\Codes\\new\\workspace\\workspace\\scheduler\\common\\assets\\scripts\\data_preprocess.py")
|
|
||||||
//if err != nil {
|
|
||||||
// logger.Error(err)
|
|
||||||
// return nil, err
|
|
||||||
//}
|
|
||||||
|
|
||||||
fileContent := string(content)
|
|
||||||
fileContent = strings.ReplaceAll(fileContent, "@base_url@", inferencePlatform.ApiBaseUrl)
|
fileContent = strings.ReplaceAll(fileContent, "@base_url@", inferencePlatform.ApiBaseUrl)
|
||||||
fileContent = strings.ReplaceAll(fileContent, "@api_key@", inferencePlatform.ApiKey)
|
fileContent = strings.ReplaceAll(fileContent, "@api_key@", inferencePlatform.ApiKey)
|
||||||
inputPath := schsdk.MountDir + "/" + envs[0].Value
|
inputPath := schsdk.MountDir + "/" + envs[0].Value
|
||||||
|
|
|
@ -45,23 +45,23 @@ func (t *SchedulerModelFinetuning) do(task *Task, ctx TaskContext) error {
|
||||||
provider := factory.CreateProvider()
|
provider := factory.CreateProvider()
|
||||||
|
|
||||||
instanceID := t.InstanceID
|
instanceID := t.InstanceID
|
||||||
// 如果没有指定实例ID,则创建一个
|
// 如果没有指定实例ID,则创建一个(即预处理服务器与微调服务器分块)
|
||||||
//if t.InstanceID == "" {
|
if t.InstanceID == "" {
|
||||||
// // 创建服务器
|
// 创建服务器
|
||||||
// instID, ecsIP, err := provider.CreateServer()
|
instID, ecsIP, err := provider.CreateServer()
|
||||||
// if err != nil {
|
if err != nil {
|
||||||
// task.SendStatus(exectsk.NewSchedulerModelFinetuningStatus(err))
|
task.SendStatus(exectsk.NewSchedulerModelFinetuningStatus(err))
|
||||||
// return err
|
return err
|
||||||
// }
|
}
|
||||||
// instanceID = instID
|
instanceID = instID
|
||||||
// logger.Info("create ECS success, instance id: " + instanceID + ", ip: " + ecsIP)
|
logger.Info("create ECS success, instance id: " + instanceID + ", ip: " + ecsIP)
|
||||||
//
|
|
||||||
// if t.ObjectStorage.MountType == schsdk.RcloneMount {
|
if t.ObjectStorage.MountType == schsdk.RcloneMount {
|
||||||
// // 获取Rclone挂载命令
|
// 获取Rclone挂载命令
|
||||||
// mountCommands := utils.GetRcloneCommands(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
mountCommands := utils.GetRcloneCommands(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
||||||
// commands = append(commands, mountCommands...)
|
commands = append(commands, mountCommands...)
|
||||||
// }
|
}
|
||||||
//}
|
}
|
||||||
|
|
||||||
mountCommands := utils.GetRcloneCommands(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
mountCommands := utils.GetRcloneCommands(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
||||||
commands = append(commands, mountCommands...)
|
commands = append(commands, mountCommands...)
|
||||||
|
@ -76,11 +76,11 @@ func (t *SchedulerModelFinetuning) do(task *Task, ctx TaskContext) error {
|
||||||
// 执行微调任务
|
// 执行微调任务
|
||||||
_, err := provider.RunCommand(commands, instanceID, 2000)
|
_, err := provider.RunCommand(commands, instanceID, 2000)
|
||||||
// 执行结束后销毁服务器
|
// 执行结束后销毁服务器
|
||||||
//_, err2 := provider.DeleteInstance(instanceID)
|
_, err2 := provider.DeleteInstance(instanceID)
|
||||||
//if err2 != nil {
|
if err2 != nil {
|
||||||
// task.SendStatus(exectsk.NewSchedulerModelFinetuningStatus(err))
|
task.SendStatus(exectsk.NewSchedulerModelFinetuningStatus(err))
|
||||||
// return err2
|
return err2
|
||||||
//}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
task.SendStatus(exectsk.NewSchedulerModelFinetuningStatus(err))
|
task.SendStatus(exectsk.NewSchedulerModelFinetuningStatus(err))
|
||||||
return err
|
return err
|
||||||
|
|
|
@ -43,7 +43,7 @@ func (s *MultiInstanceRunning) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job)
|
||||||
|
|
||||||
multInstJob := jo.Body.(*job.MultiInstanceJob)
|
multInstJob := jo.Body.(*job.MultiInstanceJob)
|
||||||
|
|
||||||
//go pollingInstance(rtx, multInstJob)
|
go pollingInstance(rtx, multInstJob)
|
||||||
|
|
||||||
waitFut := event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
|
waitFut := event.BeginWaitType[*event.InstanceOperate](rtx.EventSet)
|
||||||
for {
|
for {
|
||||||
|
|
|
@ -95,7 +95,6 @@ func (s *MultiInstanceUpdate) do(rtx jobmgr.JobStateRunContext, jo *jobmgr.Job)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 发送事件,更新各个instance
|
// 发送事件,更新各个instance
|
||||||
//updateJob.Info.Runtime.Command = strings.Replace(updateJob.Info.Runtime.Command, "$1", fullPath, -1)
|
|
||||||
updateJob.Info.Runtime.Envs = append(updateJob.Info.Runtime.Envs, schsdk.KVPair{Key: schsdk.FinetuningOutEnv, Value: fullPath})
|
updateJob.Info.Runtime.Envs = append(updateJob.Info.Runtime.Envs, schsdk.KVPair{Key: schsdk.FinetuningOutEnv, Value: fullPath})
|
||||||
updateInfo := event.InstanceUpdateInfo{
|
updateInfo := event.InstanceUpdateInfo{
|
||||||
Info: updateJob.Info,
|
Info: updateJob.Info,
|
||||||
|
|
|
@ -168,21 +168,19 @@ func getCacheData(c *cache.Cache) schsdk.NodeUsageRateInfo {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, v := range infoMap {
|
for k, v := range infoMap {
|
||||||
// 对v 进行排序
|
// 对v 进行排序
|
||||||
sort.Slice(v, func(i, j int) bool {
|
sort.Slice(v, func(i, j int) bool {
|
||||||
return v[i].Timestamp < v[j].Timestamp
|
return v[i].Timestamp < v[j].Timestamp
|
||||||
})
|
})
|
||||||
//switch k {
|
switch k {
|
||||||
//case "MemoryUtilization":
|
case schsdk.MemoryUtilization:
|
||||||
// nodeUsageRateInfo.MemoryUtilization = v
|
nodeUsageRateInfo.MemoryUtilization = v
|
||||||
//case "GPUUtilization":
|
case schsdk.GPUUtilization:
|
||||||
// nodeUsageRateInfo.GPUUtilization = v
|
nodeUsageRateInfo.GPUUtilization = v
|
||||||
//case "CPUUtilization":
|
case schsdk.CPUUtilization:
|
||||||
// nodeUsageRateInfo.CPUUtilization = v
|
nodeUsageRateInfo.CPUUtilization = v
|
||||||
//}
|
}
|
||||||
nodeUsageRateInfo.MemoryUtilization = v
|
|
||||||
nodeUsageRateInfo.GPUUtilization = v
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return nodeUsageRateInfo
|
return nodeUsageRateInfo
|
||||||
|
|
|
@ -85,6 +85,7 @@ func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetRe
|
||||||
})
|
})
|
||||||
|
|
||||||
case *schsdk.DataPreprocessJobInfo:
|
case *schsdk.DataPreprocessJobInfo:
|
||||||
|
// 后续的调度流程跟NormalJob是一致的
|
||||||
normalJobInfo := &schsdk.NormalJobInfo{
|
normalJobInfo := &schsdk.NormalJobInfo{
|
||||||
Type: schsdk.JobTypeNormal,
|
Type: schsdk.JobTypeNormal,
|
||||||
JobInfoBase: info.JobInfoBase,
|
JobInfoBase: info.JobInfoBase,
|
||||||
|
@ -107,6 +108,7 @@ func (svc *Service) SubmitJobSet(msg *mgrmq.SubmitJobSet) (*mgrmq.SubmitJobSetRe
|
||||||
})
|
})
|
||||||
|
|
||||||
case *schsdk.FinetuningJobInfo:
|
case *schsdk.FinetuningJobInfo:
|
||||||
|
// 后续的调度流程跟NormalJob是一致的
|
||||||
normalJobInfo := &schsdk.NormalJobInfo{
|
normalJobInfo := &schsdk.NormalJobInfo{
|
||||||
Type: schsdk.JobTypeNormal,
|
Type: schsdk.JobTypeNormal,
|
||||||
Files: info.Files,
|
Files: info.Files,
|
||||||
|
|
Loading…
Reference in New Issue