JCC-CSScheduler/executor/internal/task/scheduler_create_ecs.go

253 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package task
import (
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/executor/internal/config"
"math/rand"
"strconv"
"strings"
"time"
//"gitlink.org.cn/cloudream/scheduler/executor/internal/config"
"gitlink.org.cn/cloudream/scheduler/executor/internal/task/create_ecs"
)
// ScheduleCreateECS is the executor-side task built around an
// exectsk.ScheduleCreateECS message. The message is embedded by pointer, so
// the fields the task reads (Envs, Command, ObjectStorage, ModelResource,
// UserID) are accessed directly on the task value.
type ScheduleCreateECS struct {
*exectsk.ScheduleCreateECS
}
// NewScheduleCreateECS wraps the given message in a ScheduleCreateECS task.
// The message pointer is stored as-is (not copied).
func NewScheduleCreateECS(info *exectsk.ScheduleCreateECS) *ScheduleCreateECS {
	wrapped := new(ScheduleCreateECS)
	wrapped.ScheduleCreateECS = info
	return wrapped
}
// Execute runs the task: it logs "begin"/"end" at debug level around the
// call to do, logs the error if do fails, and logs an info line otherwise.
func (t *ScheduleCreateECS) Execute(task *Task, ctx TaskContext) {
	taskLog := logger.WithType[ScheduleCreateECS]("Task")
	taskLog.Debugf("begin")
	defer taskLog.Debugf("end")

	if execErr := t.do(task, ctx); execErr != nil {
		taskLog.Error(execErr)
		return
	}

	taskLog.Info("ScheduleCreateECS...")
}
// do creates a cloud server through the provider selected by
// config.CloudName, reports the resulting service address via
// task.SendStatus, and then serves operate requests received from
// task.taskChan for that instance.
//
// It returns the creation error if the server cannot be created, and nil once
// the instance has been destroyed successfully through a DestroyECS request.
// Every other request outcome is reported through task.SendStatus and the
// loop keeps running.
func (t *ScheduleCreateECS) do(task *Task, ctx TaskContext) error {
	// Create the cloud server.
	factory := create_ecs.GetFactory(config.CloudName)
	provider := factory.CreateProvider()
	instanceID, ecsIP, err := provider.CreateServer()
	//instanceID, ecsIP, err := "i-bp18see6gypratlt3nhp", "47.96.28.209", error(nil)
	if err != nil {
		task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.CreateECS, err.Error()))
		return err
	}
	logger.Info("create ECS success, instance id: " + instanceID + ", address: " + ecsIP)

	// Sugon servers need the access path (the last path segment of the
	// returned address) placed into the environment variables.
	if config.CloudName == schmod.SugonCloud {
		segments := strings.Split(strings.Trim(ecsIP, "/"), "/")
		if len(segments) > 0 {
			t.Envs = append(t.Envs, schsdk.KVPair{Key: schsdk.AccessPath, Value: segments[len(segments)-1]})
		}
	}

	// Commands that set the environment variables.
	commands := utils.ConvertEnvsToCommand(t.Envs)

	// Commands that mount the object storage and launch the start script.
	switch t.ObjectStorage.MountType {
	case schsdk.RcloneMount:
		rcloneCommands := getRcloneCommands(t.ModelResource, t.ObjectStorage, t.UserID)
		commands = append(commands, rcloneCommands...)
	case schsdk.Mounted:
		commandContent := "sudo sh @startScript@ > /opt/startup.log"
		commandContent = strings.Replace(commandContent, "@startScript@", t.ModelResource.StartShellPath, -1)
		commands = append(commands, commandContent)
	}

	// Install the packages used to collect GPU information.
	commandContent := getPipCommand()
	commands = append(commands, commandContent)

	// Commands supplied by the user.
	arr := utils.SplitCommands(t.Command)
	commands = append(commands, arr...)

	// NOTE(review): the assembled commands are currently not executed; the
	// RunCommand call below is commented out in the original code.
	//_, err = provider.RunCommand(commands, instanceID, 2000)
	//if err != nil {
	//	logger.Error("run command error: " + err.Error())
	//}

	address := "http://" + ecsIP + ":" + strconv.FormatInt(t.ModelResource.ServerPort, 10) + "/" + t.ModelResource.ServerUrlPath
	if config.CloudName == schmod.SugonCloud {
		address = ecsIP + "/" + t.ModelResource.ServerUrlPath
		address = strings.Replace(address, "//", "/", -1)
	}

	// Report the result of the creation.
	task.SendStatus(exectsk.NewScheduleCreateECSStatus(address, schsdk.CreateECS, ""))
	logger.Info("run all commands complete")

	// Serve operate requests for the created instance.
	for {
		// NOTE(review): a receive error is reported and retried; if Receive
		// keeps failing (e.g. on a closed channel) this loop will spin —
		// confirm the channel's semantics.
		taskOperate, err := task.taskChan.Chan.Receive()
		if err != nil {
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
			continue
		}

		info, ok := taskOperate.(executor.TaskOperateInfo)
		if !ok {
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", "invalid task operate info"))
			continue
		}

		switch info.Operate {
		case schsdk.RunECS:
			_, err := provider.StartInstances(instanceID)
			if err != nil {
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.RunECS, ""))

		case schsdk.PauseECS:
			_, err := provider.StopInstance(instanceID)
			if err != nil {
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.PauseECS, ""))

		case schsdk.DestroyECS:
			_, err := provider.DeleteInstance(instanceID)
			if err != nil {
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.DestroyECS, ""))
			// The instance no longer exists, so stop serving requests. A bare
			// `break` here would only leave the switch, not the for loop.
			return nil

		case schsdk.OperateServer:
			executeCommands(provider, instanceID, task, info.Runtime)

		case schsdk.GPUMonitor:
			commands, logFile := getGPUCommand(instanceID)
			var res string
			// Commands run on Sugon servers cannot return their output, so the
			// log file is passed along and a different timeout value is used.
			if config.CloudName == schmod.SugonCloud {
				commands = append(commands, logFile)
				res, err = provider.RunCommand(commands, instanceID, -1)
			} else {
				res, err = provider.RunCommand(commands, instanceID, 2000)
			}
			if err != nil {
				// The command failed: check whether the machine is still usable.
				check := provider.AvailableCheck(instanceID)
				if !check {
					task.SendStatus(exectsk.NewScheduleCreateECSStatus("", schsdk.Invalid, err.Error()))
					// Only the Invalid status is sent; keep waiting for further
					// requests (this is what the original switch-level break did).
					continue
				}
				task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", err.Error()))
				continue
			}
			task.SendStatus(exectsk.NewScheduleCreateECSStatus(res, schsdk.GPUMonitor, ""))

		default:
			executeCommands(provider, instanceID, task, info.Runtime)
		}
	}
}
// getRandomNum returns a pseudo-random number drawn from [0, 20), formatted
// as a decimal string with exactly two fractional digits (rounding may yield
// "20.00").
//
// A generator local to the call is used instead of rand.Seed: rand.Seed is
// deprecated and reseeds the process-wide generator shared with every other
// user of math/rand.
func getRandomNum() string {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	return strconv.FormatFloat(rng.Float64()*20, 'f', 2, 64)
}
// getPipCommand returns the multi-line shell snippet that upgrades pip,
// points it at the Tsinghua PyPI mirror and installs torch, gputil and psutil.
func getPipCommand() string {
	const installSteps = "python -m pip install --upgrade pip \n" +
		" pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple \n" +
		" pip install torch \n" +
		" pip install gputil \n" +
		" pip install psutil"
	return installSteps
}
// getGPUCommand builds the shell commands that collect GPU/CPU usage for the
// given instance and returns them together with the path of the log file the
// monitor output is redirected to.
//
// The three commands, in order:
//  1. create the monitor directory and remove any previous log for instanceID;
//  2. write the Python monitor script into that directory;
//  3. run the script, redirecting its output to the log file.
//
// The script is written to and executed from the same absolute directory that
// step 1 creates, so the commands do not depend on the shell's working
// directory.
func getGPUCommand(instanceID string) ([]string, string) {
	// NOTE(review): this directory is hard-coded to one specific home
	// directory — confirm it is valid for every cloud provider in use.
	const monitorDir = "/public/home/acgnnmfbwo/modeltest/gpu_monitor/"

	// Python source of the monitor. It contains no single quotes, so it can be
	// embedded inside the single-quoted echo argument below.
	const monitorScript = `import torch
import GPUtil
import psutil

def get_memory_usage():
    if torch.cuda.is_available():
        allocated_memory = torch.cuda.memory_allocated()
        allocated_memory_mb = allocated_memory / (1024 * 1024)
        gpus = GPUtil.getGPUs()
        if len(gpus) > 0:
            total_memory_mb = gpus[0].memoryTotal
            memory_utilization = (allocated_memory_mb / total_memory_mb) * 100
            print(f"MemoryUtilization: {memory_utilization:.2f}")
        else:
            print("No GPU found with GPUtil.")
    else:
        print("CUDA is not available. Please check your GPU setup.")

def get_gpu_utilization_with_gputil():
    try:
        gpus = GPUtil.getGPUs()
        for gpu in gpus:
            print(f"GPU_Name: {gpu.name}")
            print(f"GPUUtilization: {gpu.load * 100:.2f}")
            print(f"Memory_Used: {gpu.memoryUsed} MB")
            print(f"Memory_Free: {gpu.memoryFree} MB")
            print(f"Memory_Total: {gpu.memoryTotal} MB")
    except Exception as e:
        print(f"Error getting GPU utilization with GPUtil: {e}")

def get_cpu_usage():
    cpu_percent = psutil.cpu_percent(interval=1)
    print(f"CPUUtilization: {cpu_percent:.2f}")

if __name__ == "__main__":
    get_memory_usage()
    get_gpu_utilization_with_gputil()
    get_cpu_usage()
`

	logFileName := monitorDir + "gpuMonitor_" + instanceID + ".log"
	scriptFile := monitorDir + "gpuMonitor.py"

	commands := []string{
		"mkdir -p " + monitorDir + " && rm -rf " + logFileName,
		"echo -e '" + monitorScript + "' > " + scriptFile,
		"python " + scriptFile + " > " + logFileName,
	}
	//logFileName = strings.Replace(logFileName, "./", "", -1)
	return commands, logFileName
}
// getRcloneCommands returns the shell commands that prepare an rclone mount
// of the given object storage and then launch the model's start script.
//
// The commands, in order: install fuse3; download the rclone binary into /opt
// from the storage service URL; write /opt/rclone.conf for the bucket; mount
// the bucket at /mnt/oss in the background; run the start script found under
// the mount point, redirecting its output to /opt/startup.log.
func getRcloneCommands(resource schmod.ModelResource, storage schmod.ObjectStorage, userID cdssdk.UserID) []string {
	// fill substitutes placeholder/value pairs one after another, in the
	// order given, replacing every occurrence of each placeholder.
	fill := func(template string, pairs ...string) string {
		for i := 0; i+1 < len(pairs); i += 2 {
			template = strings.ReplaceAll(template, pairs[i], pairs[i+1])
		}
		return template
	}

	const mountDir = "/mnt/oss"
	// The rclone remote name is derived from the bucket and the access key.
	tagName := storage.Bucket + "_" + storage.AK
	// Collapse the double slash produced when StartShellPath begins with "/".
	startScript := strings.ReplaceAll(mountDir+"/"+resource.StartShellPath, "//", "/")

	// Download rclone.
	downloadCmd := fill(
		"cd /opt && downloadCode='import requests;response=requests.get(\"@url@\",stream=True);response.raise_for_status();boundary=response.headers.get(\"Content-Type\").split(\"boundary=\")[-1].encode();content=response.content;body=[part.split(b\"\\r\\n\\r\\n\",1)[1].rsplit(b\"\\r\\n--\",1)[0] for part in content.split(b\"--\"+boundary+b\"\\r\\n\") if b\"filename=\" in part][0];open(\"@filename@\",\"wb\").write(body);print(\"success\")' && rclone=\"$cds_url/object/download?userID=$userID&objectID=$rcloneID\" && python3 -c \"$(echo \"$downloadCode\" | sed -e \"s|@url@|$(printf '%s' \"$rclone\" | sed 's/[&/\\]/\\\\&/g')|\" -e \"s|@filename@|rclone|\")\" && chmod +x rclone",
		"$cds_url", schglb.CloudreamStorageConfig.URL,
		"$rcloneID", schglb.CDSRclone.CDSRcloneID,
		"$userID", strconv.FormatInt(int64(userID), 10),
	)

	// Generate the rclone configuration file.
	configCmd := fill(
		"echo -e '[@tagName@] \n type = s3 \n provider = @provider@ \n access_key_id = @ak@ \n secret_access_key = @sk@ \n endpoint = @endpoint@ \n storage_class = STANDARD' > /opt/rclone.conf",
		"@tagName@", tagName,
		"@provider@", storage.Manufacturer,
		"@ak@", storage.AK,
		"@sk@", storage.SK,
		"@endpoint@", storage.Endpoint,
	)

	// Mount the bucket with rclone.
	mountCmd := fill(
		"mkdir -p @mountDir@ && cd /opt && nohup ./rclone mount @tagName@:@bucket@ @mountDir@ --vfs-cache-mode full --vfs-read-wait 0 --vfs-read-chunk-size 128M --cache-db-purge -vv > rcloneMount.log 2>&1 &",
		"@tagName@", tagName,
		"@bucket@", storage.Bucket,
		"@mountDir@", mountDir,
	)

	// Run the start script.
	startCmd := fill("sudo sh @startScript@ > /opt/startup.log", "@startScript@", startScript)

	return []string{
		"yum install -y fuse3",
		downloadCmd,
		configCmd,
		mountCmd,
		startCmd,
	}
}
// executeCommands runs the runtime's environment-variable commands followed
// by its own commands on the given instance and reports the outcome through
// task.SendStatus: the command output tagged OperateServer on success, or the
// error text on failure.
func executeCommands(provider create_ecs.CloudProvider, instanceID string, task *Task, runtime schsdk.JobRuntimeInfo) {
	envCommands := utils.ConvertEnvsToCommand(runtime.Envs)
	userCommands := utils.SplitCommands(runtime.Command)

	output, runErr := provider.RunCommand(append(envCommands, userCommands...), instanceID, 2000)
	if runErr == nil {
		task.SendStatus(exectsk.NewScheduleCreateECSStatus(output, schsdk.OperateServer, ""))
		return
	}

	task.SendStatus(exectsk.NewScheduleCreateECSStatus("", "", runErr.Error()))
}
// init passes the NewScheduleCreateECS constructor to Register when the
// package is initialized.
// NOTE(review): Register is defined outside this file; presumably it makes
// the task type available to the executor's dispatcher — confirm.
func init() {
Register(NewScheduleCreateECS)
}