212 lines
8.7 KiB
Go
212 lines
8.7 KiB
Go
package task
|
||
|
||
import (
|
||
"gitlink.org.cn/cloudream/common/pkgs/logger"
|
||
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
|
||
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
|
||
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
|
||
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
|
||
"gitlink.org.cn/cloudream/scheduler/common/utils"
|
||
"gitlink.org.cn/cloudream/scheduler/executor/internal/config"
|
||
"strconv"
|
||
"strings"
|
||
//"gitlink.org.cn/cloudream/scheduler/executor/internal/config"
|
||
"gitlink.org.cn/cloudream/scheduler/executor/internal/task/create_ecs"
|
||
)
|
||
|
||
type ScheduleCreateECS struct {
|
||
*exectsk.ScheduleCreateECS
|
||
}
|
||
|
||
func NewScheduleCreateECS(info *exectsk.ScheduleCreateECS) *ScheduleCreateECS {
|
||
return &ScheduleCreateECS{
|
||
ScheduleCreateECS: info,
|
||
}
|
||
}
|
||
|
||
func (t *ScheduleCreateECS) Execute(task *Task, ctx TaskContext) {
|
||
log := logger.WithType[ScheduleCreateECS]("Task")
|
||
log.Debugf("begin")
|
||
defer log.Debugf("end")
|
||
|
||
err := t.do(task, ctx)
|
||
if err != nil {
|
||
log.Error(err)
|
||
return
|
||
}
|
||
|
||
log.Info("ScheduleCreateECS...")
|
||
}
|
||
|
||
func (t *ScheduleCreateECS) do(task *Task, ctx TaskContext) error {
|
||
// 创建云主机
|
||
factory := create_ecs.GetFactory(config.CloudName)
|
||
provider := factory.CreateProvider()
|
||
instanceID, ecsIP, err := provider.CreateServer()
|
||
//instanceID, ecsIP, err := "i-bp16imo8en907iy1oixd", "120.55.45.90", error(nil)
|
||
if err != nil {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.CreateECS, err.Error()))
|
||
return err
|
||
}
|
||
|
||
logger.Info("create ECS success, instance id: " + instanceID + ", address: " + ecsIP)
|
||
|
||
// 曙光服务器需要将访问路径放入到环境变量中
|
||
if config.CloudName == schmod.SugonCloud {
|
||
segments := strings.Split(strings.Trim(ecsIP, "/"), "/")
|
||
if len(segments) > 0 {
|
||
t.Envs = append(t.Envs, schsdk.KVPair{Key: schsdk.AccessPath, Value: segments[len(segments)-1]})
|
||
}
|
||
}
|
||
|
||
// 设置环境变量
|
||
t.Envs = append(t.Envs, schsdk.KVPair{Key: "MountDir", Value: schsdk.MountDir})
|
||
commands := utils.ConvertEnvsToCommand(t.Envs)
|
||
|
||
startScript := t.ModelResource.StartShellPath
|
||
if t.ObjectStorage.MountType == schsdk.RcloneMount {
|
||
startScript = schsdk.MountDir + "/" + t.ModelResource.StartShellPath
|
||
// 获取Rclone挂载命令
|
||
mountCommands := utils.GetRcloneCommands(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
||
commands = append(commands, mountCommands...)
|
||
}
|
||
// 获取启动命令
|
||
commands = append(commands, utils.HandleCommand(startScript))
|
||
|
||
// 安装依赖包,用于获取GPU信息
|
||
commandContent := getPipCommand()
|
||
commands = append(commands, commandContent)
|
||
|
||
// 获取用户输入的命令
|
||
arr := utils.SplitCommands(t.Command)
|
||
commands = append(commands, arr...)
|
||
|
||
// 执行命令
|
||
_, err = provider.RunCommand(commands, instanceID, 2000)
|
||
if err != nil {
|
||
logger.Error("run command error: " + err.Error())
|
||
}
|
||
|
||
address := "http://" + ecsIP + ":" + strconv.FormatInt(t.ModelResource.ServerPort, 10) + "/" + t.ModelResource.ServerUrlPath
|
||
if config.CloudName == schmod.SugonCloud {
|
||
address = ecsIP + "/" + t.ModelResource.ServerUrlPath
|
||
address = strings.Replace(address, "//", "/", -1)
|
||
}
|
||
|
||
// 返回执行结果
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus(address, schsdk.CreateECS, ""))
|
||
logger.Info("run all commands complete")
|
||
|
||
// 监听更新操作
|
||
for {
|
||
taskOperate, err := task.taskChan.Chan.Receive()
|
||
if err != nil {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
||
continue
|
||
}
|
||
|
||
info, ok := taskOperate.(executor.TaskOperateInfo)
|
||
if !ok {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", "invalid task operate info"))
|
||
continue
|
||
}
|
||
|
||
switch info.Operate {
|
||
case schsdk.RunECS:
|
||
_, err := provider.StartInstances(instanceID)
|
||
if err != nil {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
||
continue
|
||
}
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.RunECS, ""))
|
||
case schsdk.PauseECS:
|
||
_, err := provider.StopInstance(instanceID)
|
||
if err != nil {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
||
continue
|
||
}
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.PauseECS, ""))
|
||
case schsdk.DestroyECS:
|
||
logger.Info("destroy ecs")
|
||
_, err := provider.DeleteInstance(instanceID)
|
||
if err != nil {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
||
continue
|
||
}
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.DestroyECS, ""))
|
||
break
|
||
case schsdk.RestartServer:
|
||
commandContent = utils.RemountRclone(t.ObjectStorage, t.UserID, schsdk.MountDir)
|
||
info.Runtime.Command = info.Runtime.Command + "\n" + commandContent
|
||
commandContent = schsdk.MountDir + "/" + t.ModelResource.StopShellPath
|
||
info.Runtime.Command = info.Runtime.Command + "\n" + utils.HandleCommand(commandContent)
|
||
commandContent = schsdk.MountDir + "/" + t.ModelResource.StartShellPath
|
||
info.Runtime.Command = info.Runtime.Command + "\n" + utils.HandleCommand(commandContent)
|
||
executeCommands(provider, instanceID, task, info.Runtime)
|
||
case schsdk.OperateServer:
|
||
executeCommands(provider, instanceID, task, info.Runtime)
|
||
case schsdk.GPUMonitor:
|
||
commands, logFile := getGPUCommand(instanceID)
|
||
var res string
|
||
// 曙光服务器执行命令无法获取返回值,所以需要特殊处理
|
||
if config.CloudName == schmod.SugonCloud {
|
||
commands = append(commands, logFile)
|
||
res, err = provider.RunCommand(commands, instanceID, -1)
|
||
} else {
|
||
res, err = provider.RunCommand(commands, instanceID, 2000)
|
||
}
|
||
if err != nil {
|
||
// 如果命令执行失败,判断机器是否可用
|
||
check := provider.AvailableCheck(instanceID)
|
||
if !check {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.Invalid, err.Error()))
|
||
break
|
||
}
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
||
continue
|
||
}
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus(res, schsdk.GPUMonitor, ""))
|
||
default:
|
||
executeCommands(provider, instanceID, task, info.Runtime)
|
||
}
|
||
|
||
}
|
||
}
|
||
|
||
// getPipCommand returns a newline-separated command string that upgrades pip,
// points pip at the Tsinghua PyPI mirror, and installs torch, gputil and
// psutil.
func getPipCommand() string {
	return "python -m pip install --upgrade pip \n" +
		" pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \n" +
		" pip install torch \n" +
		" pip install gputil \n" +
		" pip install psutil"
}
|
||
|
||
// getGPUCommand builds the shell commands that collect GPU, memory and CPU
// utilization for the given instance, and returns them together with the path
// of the log file that receives the monitor output.
//
// The returned commands, in order:
//  1. create the monitor directory and remove any previous log file;
//  2. write the Python monitor script into that directory;
//  3. run the script with its output redirected to the log file.
//
// The script and the log file use the same absolute directory, so the commands
// do not depend on the working directory they are executed in.
func getGPUCommand(instanceID string) ([]string, string) {
	// Python monitor script. It contains no single quotes or backslashes, so
	// it can be embedded in a single-quoted `echo -e` argument unchanged.
	const monitorScript = `import torch
import GPUtil
import psutil

def get_memory_usage():
    if torch.cuda.is_available():
        allocated_memory = torch.cuda.memory_allocated()
        allocated_memory_mb = allocated_memory / (1024 * 1024)
        gpus = GPUtil.getGPUs()
        if len(gpus) > 0:
            total_memory_mb = gpus[0].memoryTotal
            memory_utilization = (allocated_memory_mb / total_memory_mb) * 100
            print(f"MemoryUtilization: {memory_utilization:.2f}")
        else:
            print("No GPU found with GPUtil.")
    else:
        print("CUDA is not available. Please check your GPU setup.")

def get_gpu_utilization_with_gputil():
    try:
        gpus = GPUtil.getGPUs()
        for gpu in gpus:
            print(f"GPU_Name: {gpu.name}")
            print(f"GPUUtilization: {gpu.load * 100:.2f}")
            print(f"Memory_Used: {gpu.memoryUsed} MB")
            print(f"Memory_Free: {gpu.memoryFree} MB")
            print(f"Memory_Total: {gpu.memoryTotal} MB")
    except Exception as e:
        print(f"Error getting GPU utilization with GPUtil: {e}")

def get_cpu_usage():
    cpu_percent = psutil.cpu_percent(interval=1)
    print(f"CPUUtilization: {cpu_percent:.2f}")

if __name__ == "__main__":
    get_memory_usage()
    get_gpu_utilization_with_gputil()
    get_cpu_usage()
`

	// NOTE(review): the directory is hard-coded to one user's home — confirm
	// it is valid on every target cloud.
	dir := "/public/home/acgnnmfbwo/modeltest/gpu_monitor/"
	scriptPath := dir + "gpuMonitor.py"
	logFileName := dir + "gpuMonitor_" + instanceID + ".log"

	commands := []string{
		"mkdir -p " + dir + " && rm -rf " + logFileName,
		"echo -e '" + monitorScript + "' > " + scriptPath,
		"python " + scriptPath + " > " + logFileName,
	}

	return commands, logFileName
}
|
||
|
||
func executeCommands(provider create_ecs.CloudProvider, instanceID string, task *Task, runtime schsdk.JobRuntimeInfo) {
|
||
commands := utils.ConvertEnvsToCommand(runtime.Envs)
|
||
commands = append(commands, utils.SplitCommands(runtime.Command)...)
|
||
|
||
res, err := provider.RunCommand(commands, instanceID, 2000)
|
||
if err != nil {
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
|
||
return
|
||
}
|
||
task.SendStatus(exectsk.NewScheduleCreateECSStatus(res, schsdk.OperateServer, ""))
|
||
}
|
||
|
||
func init() {
|
||
Register(NewScheduleCreateECS)
|
||
}
|