JCC-CSScheduler/executor/internal/task/scheduler_create_ecs.go

212 lines
8.7 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package task
import (
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/executor/internal/config"
"strconv"
"strings"
//"gitlink.org.cn/cloudream/scheduler/executor/internal/config"
"gitlink.org.cn/cloudream/scheduler/executor/internal/task/create_ecs"
)
// ScheduleCreateECS is the executor-side task type for the
// exectsk.ScheduleCreateECS message. The message is embedded by pointer, so its
// fields (for example Envs, Command, ModelResource and ObjectStorage) are read
// and written directly through the task value.
type ScheduleCreateECS struct {
*exectsk.ScheduleCreateECS
}
// NewScheduleCreateECS wraps the given message in a ScheduleCreateECS task.
// The message pointer is stored as-is (not copied).
func NewScheduleCreateECS(info *exectsk.ScheduleCreateECS) *ScheduleCreateECS {
	wrapped := new(ScheduleCreateECS)
	wrapped.ScheduleCreateECS = info
	return wrapped
}
// Execute runs the task via do, logging "begin"/"end" at debug level around
// it. An error returned by do is logged at error level; otherwise an info line
// is logged once do returns.
func (t *ScheduleCreateECS) Execute(task *Task, ctx TaskContext) {
	log := logger.WithType[ScheduleCreateECS]("Task")
	log.Debugf("begin")
	defer log.Debugf("end")

	if err := t.do(task, ctx); err != nil {
		log.Error(err)
		return
	}

	log.Info("ScheduleCreateECS...")
}
// do creates a cloud server through the configured provider, runs the startup
// command sequence on it, reports the resulting service address, and then
// serves operate requests received on the task channel.
//
// It returns the provider's error if the server cannot be created (after
// reporting it through task.SendStatus) and nil once a DestroyECS request has
// successfully deleted the instance. Failures of individual operations are
// reported through task.SendStatus and do not end the loop. ctx is currently
// unused.
func (t *ScheduleCreateECS) do(task *Task, ctx TaskContext) error {
	// Create the cloud server.
	factory := create_ecs.GetFactory(config.CloudName)
	provider := factory.CreateProvider()
	instanceID, ecsIP, err := provider.CreateServer()
	if err != nil {
		task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.CreateECS, err.Error()))
		return err
	}
	logger.Info("create ECS success, instance id: " + instanceID + ", address: " + ecsIP)

	// Sugon servers need the access path exposed as an environment variable.
	// The access path is the last "/"-separated segment of the returned
	// address, ignoring leading and trailing slashes.
	if config.CloudName == schmod.SugonCloud {
		trimmed := strings.Trim(ecsIP, "/")
		accessPath := trimmed[strings.LastIndex(trimmed, "/")+1:]
		t.Envs = append(t.Envs, schsdk.KVPair{Key: schsdk.AccessPath, Value: accessPath})
	}

	// Environment variables come first in the command sequence.
	t.Envs = append(t.Envs, schsdk.KVPair{Key: "MountDir", Value: schsdk.MountDir})
	commands := utils.ConvertEnvsToCommand(t.Envs)

	// With an rclone mount the start script lives under the mount directory,
	// so the mount commands must run before it.
	startScript := t.ModelResource.StartShellPath
	if t.ObjectStorage.MountType == schsdk.RcloneMount {
		startScript = schsdk.MountDir + "/" + t.ModelResource.StartShellPath
		commands = append(commands, utils.GetRcloneCommands(t.ObjectStorage, t.UserID, schsdk.MountDir)...)
	}

	// Start command, then the packages the GPU monitor script imports, then
	// the user-supplied commands.
	commands = append(commands, utils.HandleCommand(startScript))
	commands = append(commands, getPipCommand())
	commands = append(commands, utils.SplitCommands(t.Command)...)

	// Run the whole sequence. A failure is only logged: the address is still
	// reported below.
	// NOTE(review): confirm that reporting CreateECS success after a failed
	// startup command is intended.
	if _, err = provider.RunCommand(commands, instanceID, 2000); err != nil {
		logger.Error("run command error: " + err.Error())
	}

	// Build the service address reported to the caller.
	var address string
	if config.CloudName == schmod.SugonCloud {
		// NOTE(review): collapsing every "//" would also rewrite a scheme
		// separator such as "http://" if ecsIP ever carries one — confirm the
		// Sugon address format.
		address = strings.ReplaceAll(ecsIP+"/"+t.ModelResource.ServerUrlPath, "//", "/")
	} else {
		address = "http://" + ecsIP + ":" + strconv.FormatInt(t.ModelResource.ServerPort, 10) + "/" + t.ModelResource.ServerUrlPath
	}

	// Report the result of the creation step.
	task.SendStatus(exectsk.NewScheduleCreateECSStatus(address, schsdk.CreateECS, ""))
	logger.Info("run all commands complete")

	// Serve operate requests until the instance is destroyed.
	for {
		taskOperate, err := task.taskChan.Chan.Receive()
		if err != nil {
			// NOTE(review): if Receive keeps failing (for example on a closed
			// channel) this becomes a busy loop — confirm the channel's
			// semantics and consider returning err here.
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
			continue
		}

		info, ok := taskOperate.(executor.TaskOperateInfo)
		if !ok {
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", "invalid task operate info"))
			continue
		}

		switch info.Operate {
		case schsdk.RunECS:
			if _, err := provider.StartInstances(instanceID); err != nil {
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.RunECS, ""))

		case schsdk.PauseECS:
			if _, err := provider.StopInstance(instanceID); err != nil {
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.PauseECS, ""))

		case schsdk.DestroyECS:
			logger.Info("destroy ecs")
			if _, err := provider.DeleteInstance(instanceID); err != nil {
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.DestroyECS, ""))
			// The instance is gone, so stop serving requests. A bare break
			// here would only leave the switch and keep the loop running
			// forever.
			return nil

		case schsdk.RestartServer:
			// Append remount, stop-script and start-script commands to the
			// requested command. info is a local copy, so the received
			// message is not modified.
			remount := utils.RemountRclone(t.ObjectStorage, t.UserID, schsdk.MountDir)
			stopScript := schsdk.MountDir + "/" + t.ModelResource.StopShellPath
			restartScript := schsdk.MountDir + "/" + t.ModelResource.StartShellPath
			info.Runtime.Command = info.Runtime.Command + "\n" + remount
			info.Runtime.Command = info.Runtime.Command + "\n" + utils.HandleCommand(stopScript)
			info.Runtime.Command = info.Runtime.Command + "\n" + utils.HandleCommand(restartScript)
			executeCommands(provider, instanceID, task, info.Runtime)

		case schsdk.OperateServer:
			executeCommands(provider, instanceID, task, info.Runtime)

		case schsdk.GPUMonitor:
			gpuCommands, logFile := getGPUCommand(instanceID)
			var res string
			// Command execution on Sugon servers cannot return output, so the
			// log file path is appended and a different timeout value is used.
			if config.CloudName == schmod.SugonCloud {
				gpuCommands = append(gpuCommands, logFile)
				res, err = provider.RunCommand(gpuCommands, instanceID, -1)
			} else {
				res, err = provider.RunCommand(gpuCommands, instanceID, 2000)
			}
			if err != nil {
				// On failure, check whether the machine is still usable and
				// report Invalid if it is not. Either way keep listening.
				if !provider.AvailableCheck(instanceID) {
					task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.Invalid, err.Error()))
					continue
				}
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus(res, schsdk.GPUMonitor, ""))

		default:
			executeCommands(provider, instanceID, task, info.Runtime)
		}
	}
}
// getPipCommand returns the multi-line shell snippet that upgrades pip, points
// it at the Tsinghua PyPI mirror, and installs torch, gputil and psutil.
func getPipCommand() string {
	const pipSetup = "python -m pip install --upgrade pip \n" +
		" pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \n" +
		" pip install torch \n" +
		" pip install gputil \n" +
		" pip install psutil"
	return pipSetup
}
// getGPUCommand returns the three shell commands that collect GPU, memory and
// CPU utilization for the given instance, together with the path of the log
// file the last command writes to.
//
// The commands are, in order: create the monitor directory and remove any
// previous log file; write the Python monitor script; run the script with its
// output redirected to the log file. The log directory is an absolute path,
// while the script is written and run relative to the command's working
// directory.
func getGPUCommand(instanceID string) ([]string, string) {
	const monitorDir = "/public/home/acgnnmfbwo/modeltest/gpu_monitor/"

	// NOTE(review): as written here the embedded Python uses a single space of
	// indentation at every nesting level — confirm the script parses on the
	// target host.
	const writeScript = "echo -e 'import torch\nimport GPUtil\nimport psutil\n\ndef get_memory_usage():\n if torch.cuda.is_available():\n allocated_memory = torch.cuda.memory_allocated()\n allocated_memory_mb = allocated_memory / (1024 * 1024)\n gpus = GPUtil.getGPUs()\n if len(gpus) > 0:\n total_memory_mb = gpus[0].memoryTotal\n memory_utilization = (allocated_memory_mb / total_memory_mb) * 100\n print(f\"MemoryUtilization: {memory_utilization:.2f}\")\n else:\n print(\"No GPU found with GPUtil.\")\n else:\n print(\"CUDA is not available. Please check your GPU setup.\")\n\ndef get_gpu_utilization_with_gputil():\n try:\n gpus = GPUtil.getGPUs()\n for gpu in gpus:\n print(f\"GPU_Name: {gpu.name}\")\n print(f\"GPUUtilization: {gpu.load * 100:.2f}\")\n print(f\"Memory_Used: {gpu.memoryUsed} MB\")\n print(f\"Memory_Free: {gpu.memoryFree} MB\")\n print(f\"Memory_Total: {gpu.memoryTotal} MB\")\n except Exception as e:\n print(f\"Error getting GPU utilization with GPUtil: {e}\")\n\ndef get_cpu_usage():\n cpu_percent = psutil.cpu_percent(interval=1)\n print(f\"CPUUtilization: {cpu_percent:.2f}\")\n\nif __name__ == \"__main__\":\n get_memory_usage()\n get_gpu_utilization_with_gputil()\n get_cpu_usage()\n' > ./modeltest/gpu_monitor/gpuMonitor.py"

	logFile := monitorDir + "gpuMonitor_" + instanceID + ".log"

	return []string{
		"mkdir -p " + monitorDir + " && rm -rf " + logFile,
		writeScript,
		"python ./modeltest/gpu_monitor/gpuMonitor.py > " + logFile,
	}, logFile
}
// executeCommands runs the runtime's environment-variable commands followed by
// its split command lines on the given instance and reports the outcome through
// task.SendStatus: the command output tagged schsdk.OperateServer on success,
// or the error text with an empty operate tag on failure.
func executeCommands(provider create_ecs.CloudProvider, instanceID string, task *Task, runtime schsdk.JobRuntimeInfo) {
	script := append(utils.ConvertEnvsToCommand(runtime.Envs), utils.SplitCommands(runtime.Command)...)

	output, runErr := provider.RunCommand(script, instanceID, 2000)
	if runErr == nil {
		task.SendStatus(exectsk.NewScheduleCreateECSStatus(output, schsdk.OperateServer, ""))
		return
	}
	task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", runErr.Error()))
}
// init passes the NewScheduleCreateECS constructor to the package's Register
// function when the package is initialized.
func init() {
Register(NewScheduleCreateECS)
}