forked from JointCloud/pcm-coordinator
1120 lines
32 KiB
Go
1120 lines
32 KiB
Go
/*
|
||
|
||
Copyright (c) [2023] [pcm]
|
||
[pcm-coordinator] is licensed under Mulan PSL v2.
|
||
You can use this software according to the terms and conditions of the Mulan PSL v2.
|
||
You may obtain a copy of Mulan PSL v2 at:
|
||
http://license.coscl.org.cn/MulanPSL2
|
||
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
||
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
||
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
||
See the Mulan PSL v2 for more details.
|
||
|
||
*/
|
||
|
||
package storeLink
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"github.com/pkg/errors"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
|
||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
|
||
"gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
|
||
"gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
|
||
"gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
|
||
modelartsclient "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
|
||
"k8s.io/apimachinery/pkg/util/json"
|
||
"log"
|
||
"mime/multipart"
|
||
"regexp"
|
||
"strconv"
|
||
"strings"
|
||
"sync"
|
||
"time"
|
||
)
|
||
|
||
const (
	// Ascend is the compute card family reported by GetComputeCards.
	Ascend = "Ascend"
	// Npu is the lowercase card name used in resource stats and service names.
	Npu = "npu"
	// ImageNetResnet50Cmd is the container command used when creating the
	// imagenet_resnet50 model.
	// NOTE(review): uses a single '&' (shell background) where the ChatGLM
	// command uses '&&' — confirm this is intentional.
	ImageNetResnet50Cmd = "cd /home/ma-user & python ./inference_ascend.py"
	// ChatGLM6BCmd is the container command used when creating the ChatGLM-6B model.
	ChatGLM6BCmd = "cd /home/ma-user && python ./download_model.py && python ./inference_chatGLM.py"
	// ASCEND is the NPU hardware name used in usage reporting (GetResourceSpecs).
	ASCEND = "ASCEND910"
)
|
||
|
||
// ModelArtsLink adapts the Huawei ModelArts RPC services to the scheduler's
// storeLink interface: training jobs, image queries, model (AI application)
// management, and inference service deployment.
type ModelArtsLink struct {
	modelArtsRpc    modelartsservice.ModelArtsService // training / model / service RPCs
	modelArtsImgRpc imagesservice.ImagesService       // image repository RPCs
	platform        string                            // platform name sent on most requests
	participantId   int64                             // cluster/participant identifier
	pageIndex       int32                             // default list offset (0)
	pageSize        int32                             // default list page size (50)
	SourceLocation  string                            // SWR path of the image backing the current model
	Version         string                            // last seen model version string ("x.y.z")
	ModelId         string                            // NOTE(review): not written in this file — confirm external use
	ModelType       string                            // algorithm type of the last inspected model (set in GetModel)
}
|
||
|
||
// MoUsage aggregates ModelArts resource totals and availability for
// GetResourceSpecs. The *Size fields hold totals taken from the largest
// flavor; the *Available fields are totals minus what running jobs consume.
type MoUsage struct {
	CpuSize          int64 // total CPU cores
	NpuSize          int64 // total NPU units
	MemorySize       int64 // total memory (parsed number, unit stripped)
	VMemorySize      int64 // NPU memory per card
	VMemoryNumber    int64 // NOTE(review): never assigned in this file — confirm still needed
	CpuAvailable     int64
	NpuAvailable     int64
	MemoryAvailable  int64
	VMemoryAvailable int64
}
|
||
|
||
// Version represents a three-component version number.
type Version struct {
	Major, Minor, Patch int
}

// ParseVersion parses a "major.minor.patch" string into a Version.
// It returns an error when the string does not consist of exactly three
// dot-separated integers. Parse failures are wrapped with the offending
// component for easier diagnosis.
func ParseVersion(versionStr string) (*Version, error) {
	parts := strings.Split(versionStr, ".")
	if len(parts) != 3 {
		return nil, fmt.Errorf("invalid version format: %s", versionStr)
	}

	major, err := strconv.Atoi(parts[0])
	if err != nil {
		return nil, fmt.Errorf("invalid major version %q: %w", parts[0], err)
	}

	minor, err := strconv.Atoi(parts[1])
	if err != nil {
		return nil, fmt.Errorf("invalid minor version %q: %w", parts[1], err)
	}

	patch, err := strconv.Atoi(parts[2])
	if err != nil {
		return nil, fmt.Errorf("invalid patch version %q: %w", parts[2], err)
	}

	return &Version{Major: major, Minor: minor, Patch: patch}, nil
}

// Increment bumps the version one step: each component counts 0-9 and rolls
// over into the next (…0.0.9 -> 0.1.0, 0.9.9 -> 1.0.0).
func (v *Version) Increment() {
	if v.Patch < 9 {
		v.Patch++
		return
	}
	v.Patch = 0
	if v.Minor < 9 {
		v.Minor++
		return
	}
	v.Minor = 0
	v.Major++
}

// String renders the version back to "major.minor.patch" form.
func (v *Version) String() string {
	return fmt.Sprintf("%d.%d.%d", v.Major, v.Minor, v.Patch)
}
|
||
|
||
func NewModelArtsLink(modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, name string, id int64, nickname string) *ModelArtsLink {
|
||
return &ModelArtsLink{modelArtsRpc: modelArtsRpc, modelArtsImgRpc: modelArtsImgRpc, platform: nickname, participantId: id, pageIndex: 0, pageSize: 50}
|
||
}
|
||
|
||
// UploadImage is not yet implemented for ModelArts; it returns (nil, nil).
func (m *ModelArtsLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
	// TODO: upload an image to the ModelArts image registry.
	return nil, nil
}
|
||
|
||
// DeleteImage is not yet implemented for ModelArts; it returns (nil, nil).
func (m *ModelArtsLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
	// TODO: delete an image from the ModelArts image registry.
	return nil, nil
}
|
||
|
||
func (m *ModelArtsLink) QueryImageList(ctx context.Context) (interface{}, error) {
|
||
// modelArts获取镜像列表
|
||
req := &modelarts.ListRepoReq{
|
||
Offset: "0",
|
||
Limit: strconv.Itoa(int(m.pageSize)),
|
||
Platform: m.platform,
|
||
}
|
||
resp, err := m.modelArtsImgRpc.ListReposDetails(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
return resp, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
|
||
// modelArts提交任务
|
||
environments := make(map[string]string)
|
||
parameters := make([]*modelarts.ParametersTrainJob, 0)
|
||
/* inputs := make([]*modelarts.InputTraining, 0)
|
||
outputs := make([]*modelarts.OutputTraining, 0)*/
|
||
for _, env := range envs {
|
||
s := strings.Split(env, COMMA)
|
||
environments[s[0]] = s[1]
|
||
}
|
||
for _, param := range params {
|
||
s := strings.Split(param, COMMA)
|
||
parameters = append(parameters, &modelarts.ParametersTrainJob{
|
||
Name: s[0],
|
||
Value: s[1],
|
||
})
|
||
}
|
||
/* inputs = append(inputs, &modelarts.InputTraining{
|
||
Name: "data_url",
|
||
Remote: &modelarts.RemoteTra{
|
||
Obs: &modelarts.Obs1{
|
||
ObsUrl: "/test-wq/data/mnist.npz",
|
||
},
|
||
}})
|
||
|
||
outputs = append(outputs, &modelarts.OutputTraining{
|
||
Name: "train_url",
|
||
Remote: &modelarts.RemoteOut{
|
||
Obs: &modelarts.ObsTra{
|
||
ObsUrl: "/test-wq/model/",
|
||
},
|
||
},
|
||
})*/
|
||
req := &modelarts.CreateTrainingJobReq{
|
||
Kind: "job",
|
||
Metadata: &modelarts.MetadataS{
|
||
Name: TASK_NAME_PREFIX + utils.RandomString(10),
|
||
WorkspaceId: "0",
|
||
},
|
||
Algorithm: &modelarts.Algorithms{
|
||
Id: algorithmId,
|
||
Engine: &modelarts.EngineCreateTraining{
|
||
ImageUrl: imageId,
|
||
},
|
||
Command: cmd,
|
||
Environments: environments,
|
||
Parameters: parameters,
|
||
//Inputs: inputs,
|
||
//Outputs: outputs,
|
||
},
|
||
Spec: &modelarts.SpecsC{
|
||
Resource: &modelarts.ResourceCreateTraining{
|
||
FlavorId: resourceId,
|
||
NodeCount: 1,
|
||
},
|
||
},
|
||
Platform: m.platform,
|
||
}
|
||
marshal, err2 := json.Marshal(req)
|
||
if err2 != nil {
|
||
|
||
}
|
||
println(string(marshal))
|
||
resp, err := m.modelArtsRpc.CreateTrainingJob(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
if resp.ErrorMsg != "" {
|
||
return nil, errors.New(resp.ErrorMsg)
|
||
}
|
||
|
||
return resp, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
|
||
// 获取任务
|
||
req := &modelarts.DetailTrainingJobsReq{
|
||
TrainingJobId: taskId,
|
||
Platform: m.platform,
|
||
}
|
||
resp, err := m.modelArtsRpc.GetTrainingJobs(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
return resp, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
|
||
// 删除任务
|
||
req := &modelarts.DeleteTrainingJobReq{
|
||
TrainingJobId: taskId,
|
||
Platform: m.platform,
|
||
}
|
||
resp, err := m.modelArtsRpc.DeleteTrainingJob(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
return resp, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) QuerySpecs(ctx context.Context) (interface{}, error) {
|
||
// octopus查询资源规格
|
||
req := &modelarts.TrainingJobFlavorsReq{
|
||
Platform: m.platform,
|
||
}
|
||
resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
return resp, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
|
||
req := &modelarts.GetPoolsRuntimeMetricsReq{}
|
||
resp, err := m.modelArtsRpc.GetPoolsRuntimeMetrics(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if resp.ErrorMsg != "" {
|
||
return nil, errors.New("failed to get algorithms")
|
||
}
|
||
resourceStats := &collector.ResourceStats{}
|
||
CpuCoreTotalSum := int64(0)
|
||
CpuCoreAvailSum := int64(0)
|
||
MemTotalSum := float64(0)
|
||
MemAvailSum := float64(0)
|
||
var CpuCoreTotal int64
|
||
var CpuCoreAvail int64
|
||
var MemTotal float64
|
||
var MemAvail float64
|
||
for _, items := range resp.Items {
|
||
//TODO The value of taskType is temporarily fixed to "pytorch"
|
||
CpuCoreTotal, err = strconv.ParseInt(items.Table.Capacity.Value.Cpu, 10, 64)
|
||
CpuCoreTotalSum += CpuCoreTotal
|
||
CpuCoreAvail, err = strconv.ParseInt(items.Table.Allocated.Value.Cpu, 10, 64)
|
||
CpuCoreAvailSum += CpuCoreAvail
|
||
MemTotal, err = strconv.ParseFloat(items.Table.Capacity.Value.Memory, 64)
|
||
MemTotalSum += MemTotal
|
||
MemAvail, err = strconv.ParseFloat(items.Table.Allocated.Value.Memory, 64)
|
||
MemAvailSum += MemAvail
|
||
}
|
||
resourceStats.CpuCoreTotal = CpuCoreTotalSum
|
||
resourceStats.CpuCoreAvail = CpuCoreAvailSum
|
||
resourceStats.MemTotal = MemTotalSum
|
||
resourceStats.MemAvail = MemAvailSum
|
||
req1 := &modelarts.GetResourceFlavorsReq{}
|
||
resp1, err := m.modelArtsRpc.GetResourceFlavors(ctx, req1)
|
||
|
||
num32, _ := strconv.Atoi(resp1.Items[0].Spec.Npu.Size)
|
||
var cards []*collector.Card
|
||
card := &collector.Card{
|
||
Platform: MODELARTS,
|
||
Type: CARD,
|
||
Name: Npu,
|
||
CardNum: int32(num32),
|
||
TOpsAtFp16: float64(num32 * 320),
|
||
}
|
||
cards = append(cards, card)
|
||
resourceStats.CardsAvail = cards
|
||
|
||
return resourceStats, nil
|
||
}
|
||
|
||
// GetDatasetsSpecs is not supported on ModelArts; it always returns (nil, nil).
func (m *ModelArtsLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
	return nil, nil
}
|
||
|
||
func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
|
||
var algorithms []*collector.Algorithm
|
||
|
||
req := &modelarts.ListAlgorithmsReq{
|
||
Platform: m.platform,
|
||
Offset: m.pageIndex,
|
||
Limit: m.pageSize,
|
||
}
|
||
resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if resp.ErrorMsg != "" {
|
||
return nil, errors.New("failed to get algorithms")
|
||
}
|
||
|
||
for _, a := range resp.Items {
|
||
//TODO The value of taskType is temporarily fixed to "pytorch"
|
||
algorithm := &collector.Algorithm{Name: a.Metadata.Name, Platform: MODELARTS, TaskType: "pytorch"}
|
||
algorithms = append(algorithms, algorithm)
|
||
}
|
||
return algorithms, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) {
|
||
var cards []string
|
||
cards = append(cards, Ascend)
|
||
return cards, nil
|
||
}
|
||
|
||
// GetUserBalance is not supported on ModelArts; it always reports a zero
// balance with no error.
func (m *ModelArtsLink) GetUserBalance(ctx context.Context) (float64, error) {
	return 0, nil
}
|
||
|
||
func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
|
||
algoName := dataset + DASH + algorithm
|
||
req := &modelarts.GetFileReq{
|
||
Path: algoName + FORWARD_SLASH + TRAIN_FILE,
|
||
}
|
||
resp, err := m.modelArtsRpc.GetFile(ctx, req)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
return string(resp.Content), nil
|
||
}
|
||
|
||
// UploadAlgorithmCode is a no-op for ModelArts; algorithm code upload is not
// supported through this link.
func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
	return nil
}
|
||
|
||
// Determine whether there is a necessary image in image management and query the image name based on the image name
|
||
func (m *ModelArtsLink) getSourceLocationFromImages(ctx context.Context, option *option.InferOption) error {
|
||
req := &modelarts.ListImagesReq{
|
||
//Platform: m.platform,
|
||
Limit: 50,
|
||
Offset: 0,
|
||
}
|
||
|
||
ListImagesResp, err := m.modelArtsRpc.ListImages(ctx, req)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if ListImagesResp.Code != 200 {
|
||
return errors.New("failed to get ListImages")
|
||
}
|
||
|
||
for _, ListImages := range ListImagesResp.Data {
|
||
if option.ModelName == "ChatGLM-6B" {
|
||
if ListImages.Name == "chatglm-6b" {
|
||
m.SourceLocation = ListImages.SwrPath
|
||
return nil
|
||
}
|
||
} else {
|
||
if ListImages.Name == option.ModelName {
|
||
m.SourceLocation = ListImages.SwrPath
|
||
return nil
|
||
}
|
||
}
|
||
}
|
||
return errors.New("SourceLocation not set")
|
||
}
|
||
|
||
// Get AI Application List
|
||
func (m *ModelArtsLink) GetModelId(ctx context.Context, option *option.InferOption) error {
|
||
req := &modelarts.ListModelReq{
|
||
Platform: m.platform,
|
||
ModelName: option.ModelName,
|
||
//ModelType: "Image",
|
||
Limit: int64(m.pageIndex),
|
||
Offset: int64(m.pageSize),
|
||
}
|
||
ListModelResp, err := m.modelArtsRpc.ListModels(ctx, req)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if ListModelResp.Code == 200 {
|
||
//return errors.New("failed to get ModelId")
|
||
for _, ListModel := range ListModelResp.Models {
|
||
if ListModel.ModelName == option.ModelName {
|
||
option.ModelId = ListModel.ModelId
|
||
m.Version = ListModel.ModelVersion
|
||
return nil
|
||
}
|
||
}
|
||
|
||
}
|
||
err = m.CreateModel(ctx, option)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetModel(ctx context.Context, option *option.InferOption) string {
|
||
req := &modelarts.ShowModelReq{
|
||
Platform: m.platform,
|
||
ModelId: option.ModelId,
|
||
}
|
||
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Second)
|
||
defer cancel()
|
||
ShowModelsResp, err := m.modelArtsRpc.ShowModels(ctx, req)
|
||
if err != nil {
|
||
if err == context.DeadlineExceeded {
|
||
log.Println("Request timed out")
|
||
// 重试请求或其他处理
|
||
} else {
|
||
log.Fatalf("could not call method: %v", err)
|
||
}
|
||
}
|
||
if ShowModelsResp.Code != 200 {
|
||
errors.New("failed to get findModelsStatus")
|
||
}
|
||
m.ModelType = ShowModelsResp.ShowModelDetail.ModelAlgorithm
|
||
return ShowModelsResp.ShowModelDetail.ModelStatus
|
||
}
|
||
|
||
// Get AI Application List
|
||
func (m *ModelArtsLink) GetModelStatus(ctx context.Context, option *option.InferOption) error {
|
||
var wg sync.WaitGroup
|
||
wg.Add(1)
|
||
// 使用goroutine进行轮询
|
||
//defer wg.Done()
|
||
for {
|
||
status := m.GetModel(ctx, option)
|
||
if status == "published" {
|
||
fmt.Println("Model is now published.")
|
||
break // 一旦状态变为published,就退出循环
|
||
}
|
||
fmt.Println("Waiting for model to be published...")
|
||
time.Sleep(5 * time.Second) // 等待一段时间后再次检查
|
||
}
|
||
// 在这里执行模型状态为published后需要进行的操作
|
||
fmt.Println("Continuing with the program...")
|
||
return nil
|
||
}
|
||
|
||
// Create an AI application
|
||
func (m *ModelArtsLink) CreateModel(ctx context.Context, option *option.InferOption) error {
|
||
//Before creating an AI application, check if there are any images that can be created
|
||
err := m.getSourceLocationFromImages(ctx, option)
|
||
if err != nil { //
|
||
return errors.New("No image available for creationd")
|
||
}
|
||
|
||
//
|
||
var CMD string
|
||
if option.ModelName == "imagenet_resnet50" {
|
||
CMD = ImageNetResnet50Cmd
|
||
} else if option.ModelName == "ChatGLM-6B" {
|
||
CMD = ChatGLM6BCmd
|
||
}
|
||
|
||
if m.Version == "" {
|
||
m.Version = "0.0.1"
|
||
}
|
||
version, err := ParseVersion(m.Version)
|
||
version.Increment()
|
||
req := &modelarts.CreateModelReq{
|
||
Platform: m.platform,
|
||
ModelName: option.ModelName,
|
||
ModelType: "Image",
|
||
ModelVersion: version.String(),
|
||
SourceLocation: m.SourceLocation,
|
||
InstallType: []string{"real-time"},
|
||
Cmd: CMD,
|
||
ModelAlgorithm: option.ModelType,
|
||
}
|
||
ModelResp, err := m.modelArtsRpc.CreateModel(ctx, req)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if ModelResp.Code != 200 {
|
||
return errors.New("failed to get ModelId")
|
||
}
|
||
option.ModelId = ModelResp.ModelId
|
||
|
||
return nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetSpecifications(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
|
||
req := &modelarts.ListSpecificationsReq{
|
||
//Platform: m.platform,
|
||
IsPersonalCluster: false,
|
||
InferType: "real-time",
|
||
Limit: m.pageIndex,
|
||
OffSet: m.pageSize,
|
||
}
|
||
ListSpecificationsResp, err := m.modelArtsRpc.ListSpecifications(ctx, req)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
for _, ListSpecifications := range ListSpecificationsResp.Specifications {
|
||
if ListSpecifications.Specification == "modelarts.kat1.xlarge" {
|
||
ifoption.Specification = ListSpecifications.Specification
|
||
return nil
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
|
||
req := &modelartsservice.GetTrainingJobLogsPreviewReq{
|
||
Platform: m.platform,
|
||
TaskId: "worker-0",
|
||
TrainingJobId: taskId,
|
||
}
|
||
resp, err := m.modelArtsRpc.GetTrainingJobLogsPreview(ctx, req)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
if strings.Contains(resp.Content, "404 Not Found") {
|
||
resp.Content = "waiting for logs..."
|
||
}
|
||
return resp.Content, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
|
||
resp, err := m.QueryTask(ctx, taskId)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
jobresp, ok := (resp).(*modelartsservice.JobResponse)
|
||
if jobresp.ErrorMsg != "" || !ok {
|
||
if jobresp.ErrorMsg != "" {
|
||
return nil, errors.New(jobresp.ErrorMsg)
|
||
} else {
|
||
return nil, errors.New("get training task failed, empty error returned")
|
||
}
|
||
}
|
||
var task collector.Task
|
||
task.Id = jobresp.Metadata.Id
|
||
|
||
switch strings.ToLower(jobresp.Status.Phase) {
|
||
case "completed":
|
||
milliTimestamp := int64(jobresp.Status.StartTime)
|
||
task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
|
||
duration := int64(jobresp.Status.Duration)
|
||
task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
|
||
task.Status = constants.Completed
|
||
case "failed":
|
||
milliTimestamp := int64(jobresp.Status.StartTime)
|
||
task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
|
||
duration := int64(jobresp.Status.Duration)
|
||
task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
|
||
task.Status = constants.Failed
|
||
case "running":
|
||
milliTimestamp := int64(jobresp.Status.StartTime)
|
||
task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
|
||
task.Status = constants.Running
|
||
case "stopped":
|
||
task.Status = constants.Stopped
|
||
case "pending":
|
||
task.Status = constants.Pending
|
||
case "terminated":
|
||
//TODO Failed
|
||
task.Status = constants.Failed
|
||
default:
|
||
task.Status = "undefined"
|
||
}
|
||
|
||
return &task, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
|
||
switch mode {
|
||
case executor.SUBMIT_MODE_JOINT_CLOUD:
|
||
err := m.GenerateSubmitParams(ctx, option)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
|
||
var ascendNum int32
|
||
for _, res := range option.ResourcesRequired {
|
||
typeName, ok := res["type"]
|
||
if !ok {
|
||
continue
|
||
}
|
||
switch typeName {
|
||
case "NPU":
|
||
num, ok := res["number"]
|
||
if !ok {
|
||
continue
|
||
}
|
||
n := common.ConvertTypeToString(num)
|
||
val, err := strconv.ParseInt(n, 10, 32)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
ascendNum = int32(val)
|
||
}
|
||
}
|
||
|
||
req := &modelarts.TrainingJobFlavorsReq{
|
||
Platform: "modelarts-CloudBrain2",
|
||
FlavorType: "",
|
||
}
|
||
resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
|
||
for _, v := range resp.Flavors {
|
||
if ascendNum == v.FlavorInfo.Npu.UnitNum {
|
||
option.ResourceId = v.FlavorId
|
||
break
|
||
} else if ascendNum <= 1 {
|
||
option.ResourceId = "modelarts.kat1.xlarge"
|
||
break
|
||
} else if ascendNum == 2 {
|
||
option.ResourceId = "modelarts.kat1.2xlarge"
|
||
break
|
||
} else if ascendNum > 2 && ascendNum <= 4 {
|
||
option.ResourceId = "modelarts.kat1.4xlarge"
|
||
break
|
||
} else if ascendNum >= 5 && ascendNum <= 8 {
|
||
option.ResourceId = "modelarts.kat1.8xlarge"
|
||
break
|
||
} else if ascendNum > 8 {
|
||
option.ResourceId = "modelarts.kat1.8xlarge"
|
||
break
|
||
}
|
||
}
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
option.ComputeCard = NPU
|
||
default:
|
||
return nil, errors.New("failed to choose submit mode")
|
||
}
|
||
task, err := m.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return task, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
|
||
err := m.generateResourceId(ctx, option, nil)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
err = m.generateAlgorithmId(ctx, option)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
err = m.generateImageId(option)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
err = m.generateCmd(option)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
err = m.generateEnv(option)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
err = m.generateParams(option)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// generateResourceId selects the compute flavor for a submission. It is
// currently hard-coded to the single-card Ascend flavor; ifoption is unused.
func (m *ModelArtsLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
	option.ResourceId = "modelarts.kat1.xlarge"
	return nil
}
|
||
|
||
// generateImageId is a no-op; the image id is presumably supplied by the
// caller in option.ImageId — TODO confirm.
func (m *ModelArtsLink) generateImageId(option *option.AiOption) error {

	return nil
}
|
||
|
||
// generateCmd is a no-op; the command is presumably supplied by the caller
// in option.Cmd — TODO confirm.
func (m *ModelArtsLink) generateCmd(option *option.AiOption) error {

	return nil
}
|
||
|
||
// generateEnv is a no-op; environment variables are presumably supplied by
// the caller in option.Envs — TODO confirm.
func (m *ModelArtsLink) generateEnv(option *option.AiOption) error {

	return nil
}
|
||
|
||
// generateParams is a no-op; parameters are presumably supplied by the
// caller in option.Params — TODO confirm.
func (m *ModelArtsLink) generateParams(option *option.AiOption) error {

	return nil
}
|
||
|
||
func (m *ModelArtsLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
|
||
req := &modelarts.ListAlgorithmsReq{
|
||
Platform: m.platform,
|
||
Offset: m.pageIndex,
|
||
Limit: m.pageSize,
|
||
}
|
||
resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if resp.ErrorMsg != "" {
|
||
return errors.New("failed to get algorithmId")
|
||
}
|
||
|
||
for _, algorithm := range resp.Items {
|
||
engVersion := algorithm.JobConfig.Engine.EngineVersion
|
||
if strings.Contains(engVersion, option.TaskType) {
|
||
ns := strings.Split(algorithm.Metadata.Name, DASH)
|
||
if ns[0] != option.TaskType {
|
||
continue
|
||
}
|
||
if ns[1] != option.DatasetsName {
|
||
continue
|
||
}
|
||
if ns[2] != option.AlgorithmName {
|
||
continue
|
||
}
|
||
option.AlgorithmId = algorithm.Metadata.Id
|
||
return nil
|
||
}
|
||
}
|
||
|
||
if option.AlgorithmId == "" {
|
||
return errors.New("Algorithm does not exist")
|
||
}
|
||
|
||
return errors.New("failed to get AlgorithmId")
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
|
||
var imageUrls []*inference.InferUrl
|
||
urlReq := &modelartsclient.ImageReasoningUrlReq{
|
||
ServiceName: option.ModelName,
|
||
Type: option.ModelType,
|
||
Card: "npu",
|
||
}
|
||
urlResp, err := m.modelArtsRpc.ImageReasoningUrl(ctx, urlReq)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
imageUrl := &inference.InferUrl{
|
||
Url: urlResp.Url,
|
||
Card: "npu",
|
||
}
|
||
imageUrls = append(imageUrls, imageUrl)
|
||
|
||
clusterWithUrl := &inference.ClusterInferUrl{
|
||
ClusterName: m.platform,
|
||
ClusterType: TYPE_MODELARTS,
|
||
InferUrls: imageUrls,
|
||
}
|
||
return clusterWithUrl, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
|
||
var insList []*inference.DeployInstance
|
||
req := &modelarts.ListServicesReq{
|
||
Platform: m.platform,
|
||
OffSet: m.pageIndex,
|
||
Limit: m.pageSize,
|
||
}
|
||
//list, err := m.modelArtsRpc.ListServices(ctx, req)
|
||
resp, err := m.modelArtsRpc.ListServices(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if resp.ErrorMsg != "" {
|
||
return nil, errors.New(resp.Msg)
|
||
}
|
||
|
||
for _, services := range resp.Services {
|
||
ins := &inference.DeployInstance{}
|
||
ins.InstanceName = services.ServiceName
|
||
ins.InstanceId = services.ServiceId
|
||
ins.Status = services.Status
|
||
ins.InferCard = "NPU"
|
||
ins.ClusterName = m.platform
|
||
ins.CreatedTime = string(services.StartTime)
|
||
ins.ClusterType = TYPE_MODELARTS
|
||
insList = append(insList, ins)
|
||
}
|
||
|
||
return insList, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool {
|
||
req := &modelartsclient.UpdateServiceReq{
|
||
ServiceId: id,
|
||
Status: "running",
|
||
}
|
||
resp, err := m.modelArtsRpc.UpdateService(ctx, req)
|
||
if err != nil || resp.Code != 0 {
|
||
return false
|
||
}
|
||
if resp.Code == 0 {
|
||
return true
|
||
}
|
||
return false
|
||
}
|
||
|
||
func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool {
|
||
req := &modelartsclient.UpdateServiceReq{
|
||
ServiceId: id,
|
||
Status: "stopped",
|
||
}
|
||
resp, err := m.modelArtsRpc.UpdateService(ctx, req)
|
||
if err != nil || resp.Code != 0 {
|
||
return false
|
||
}
|
||
if resp.Code == 0 {
|
||
return true
|
||
}
|
||
return false
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
|
||
req := &modelarts.ShowServiceReq{
|
||
ServiceId: id,
|
||
}
|
||
resp, err := m.modelArtsRpc.ShowService(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if resp.ErrorMsg != "" {
|
||
return nil, errors.New(resp.Msg)
|
||
}
|
||
ins := &inference.DeployInstance{}
|
||
ins.InstanceName = resp.ServiceName
|
||
ins.InstanceId = resp.ServiceId
|
||
ins.Status = resp.Status
|
||
ins.InferCard = "NPU"
|
||
ins.ClusterName = m.platform
|
||
ins.CreatedTime = string(resp.StartTime)
|
||
ins.ClusterType = TYPE_MODELARTS
|
||
ins.ModelName = resp.Config[0].ModelName
|
||
ins.ModelType = m.ModelType
|
||
ins.InferUrl = resp.AccessAddress
|
||
return ins, nil
|
||
}
|
||
|
||
// GetImageInferResult is not implemented for ModelArts; it returns an empty
// result. url, file, and fileName are ignored.
func (m *ModelArtsLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
	return "", nil
}
|
||
|
||
func (m *ModelArtsLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
|
||
|
||
err := m.GetModelId(ctx, option)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
err = m.GetModelStatus(ctx, option)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
configParam := &modelarts.ServiceConfig{
|
||
Specification: "modelarts.kat1.xlarge",
|
||
Weight: 100,
|
||
ModelId: option.ModelId,
|
||
InstanceCount: 1,
|
||
}
|
||
var configItems []*modelarts.ServiceConfig
|
||
configItems = append(configItems, configParam)
|
||
now := time.Now()
|
||
timestampSec := now.Unix()
|
||
str := strconv.FormatInt(timestampSec, 10)
|
||
req := &modelarts.CreateServiceReq{
|
||
Platform: m.platform,
|
||
Config: configItems,
|
||
InferType: "real-time",
|
||
ServiceName: option.ModelName + "_" + option.ModelType + "_" + Npu + "_" + str,
|
||
}
|
||
ctx, cancel := context.WithTimeout(context.Background(), 150*time.Second)
|
||
defer cancel()
|
||
resp, err := m.modelArtsRpc.CreateService(ctx, req)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
|
||
return resp.ServiceId, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
|
||
ifoption := &option.InferOption{
|
||
ModelName: name,
|
||
ModelType: mtype,
|
||
}
|
||
err := m.CheckImageExist(ctx, ifoption)
|
||
if err != nil {
|
||
return false
|
||
}
|
||
|
||
return true
|
||
}
|
||
|
||
func (m *ModelArtsLink) CheckImageExist(ctx context.Context, option *option.InferOption) error {
|
||
req := &modelarts.ListImagesReq{
|
||
Limit: m.pageSize,
|
||
Offset: m.pageIndex,
|
||
}
|
||
ListImageResp, err := m.modelArtsRpc.ListImages(ctx, req)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
var modelName string
|
||
if ListImageResp.Code == 200 {
|
||
//return errors.New("failed to get ModelId")
|
||
for _, ListImage := range ListImageResp.Data {
|
||
if option.ModelName == "ChatGLM-6B" {
|
||
modelName = "chatglm-6b"
|
||
} else {
|
||
modelName = option.ModelName
|
||
}
|
||
|
||
if ListImage.Name == modelName {
|
||
return nil
|
||
}
|
||
}
|
||
}
|
||
return errors.New("failed to find Image ")
|
||
}
|
||
|
||
func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context) (*collector.ResourceSpec, error) {
|
||
var wg sync.WaitGroup
|
||
//查询modelarts资源规格
|
||
req := &modelarts.GetResourceFlavorsReq{}
|
||
resp, err := m.modelArtsRpc.GetResourceFlavors(ctx, req)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if resp.Msg != "" {
|
||
return nil, errors.New(resp.Msg)
|
||
}
|
||
MoUsage := MoUsage{}
|
||
var cpusum int64 = 0
|
||
var npusum int64 = 0
|
||
var memorysum int64 = 0
|
||
var VMemorysum int64 = 0
|
||
var RunningTaskNum int64 = 0
|
||
var BalanceValue float64 = -1
|
||
var RateValue float64 = 0.930000
|
||
var StorageValue int64 = 1024
|
||
var AvailableValue int64 = 886
|
||
for _, Flavors := range resp.Items {
|
||
if Flavors.Metadata.Name == "modelarts.kat1.8xlarge" {
|
||
MoUsage.CpuSize, err = strconv.ParseInt(Flavors.Spec.Cpu, 10, 64) //CPU的值
|
||
if err != nil {
|
||
// 如果转换失败,处理错误
|
||
fmt.Println("转换错误:", err)
|
||
return nil, err
|
||
}
|
||
cpusum = MoUsage.CpuSize
|
||
MoUsage.NpuSize, err = strconv.ParseInt(Flavors.Spec.Npu.Size, 10, 64) //NPU的值
|
||
if err != nil {
|
||
// 如果转换失败,处理错误
|
||
fmt.Println("转换错误:", err)
|
||
return nil, err
|
||
}
|
||
npusum = MoUsage.NpuSize
|
||
re := regexp.MustCompile(`\d+`)
|
||
numberStr := re.FindString(Flavors.Spec.Memory) //正则表达式去单位
|
||
MoUsage.MemorySize, err = strconv.ParseInt(numberStr, 10, 64) //内存的值
|
||
if err != nil {
|
||
// 如果转换失败,处理错误
|
||
fmt.Println("转换错误:", err)
|
||
return nil, err
|
||
}
|
||
memorysum = MoUsage.MemorySize * 1024
|
||
}
|
||
}
|
||
//查询获取训练作业支持的公共规格(包括1,2,4,8卡的选择和显存的数值)
|
||
reqJobFlavors := &modelarts.TrainingJobFlavorsReq{
|
||
Platform: m.platform,
|
||
}
|
||
respJobFlavors, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, reqJobFlavors)
|
||
if err != nil {
|
||
wg.Done()
|
||
return nil, err
|
||
}
|
||
|
||
for _, TrainLists := range respJobFlavors.Flavors {
|
||
if TrainLists.FlavorId == "modelarts.kat1.8xlarge" {
|
||
re := regexp.MustCompile(`\d+`)
|
||
numberStr := re.FindString(string(TrainLists.FlavorInfo.Npu.Memory)) //正则表达式去单位
|
||
MoUsage.VMemorySize, err = strconv.ParseInt(numberStr, 10, 64) //显存的值
|
||
VMemorysum = MoUsage.VMemorySize * int64(TrainLists.FlavorInfo.Npu.UnitNum)
|
||
}
|
||
}
|
||
|
||
reqTraining := &modelarts.ListTrainingJobsreq{
|
||
Platform: m.platform,
|
||
}
|
||
//查询作业列表
|
||
|
||
respList, err := m.modelArtsRpc.GetListTrainingJobs(ctx, reqTraining)
|
||
if err != nil {
|
||
wg.Done()
|
||
return nil, err
|
||
}
|
||
var CoreNum int32 = 0
|
||
var NpuNum int32 = 0
|
||
var MemoryNum int32 = 0
|
||
var VMemoryNum int64 = 0
|
||
for _, TrainLists := range respList.Items {
|
||
if len(respList.Items) == 0 {
|
||
wg.Done()
|
||
}
|
||
if TrainLists.Status.Phase == "Running" {
|
||
CoreNum += TrainLists.Spec.Resource.FlavorDetail.FlavorInfo.Cpu.CoreNum
|
||
NpuNum += TrainLists.Spec.Resource.FlavorDetail.FlavorInfo.Npu.UnitNum
|
||
MemoryNum += TrainLists.Spec.Resource.FlavorDetail.FlavorInfo.Memory.Size
|
||
VMemoryNum, _ = strconv.ParseInt(TrainLists.Spec.Resource.FlavorDetail.FlavorInfo.Npu.Memory, 10, 64)
|
||
VMemoryNum += VMemoryNum
|
||
RunningTaskNum += 1
|
||
}
|
||
}
|
||
MoUsage.CpuAvailable = cpusum - int64(CoreNum)
|
||
MoUsage.NpuAvailable = npusum - int64(NpuNum)
|
||
MoUsage.MemoryAvailable = memorysum - int64(MemoryNum)
|
||
MoUsage.VMemoryAvailable = VMemorysum - VMemoryNum
|
||
|
||
UsageCPU := &collector.Usage{Type: strings.ToUpper(CPU), Name: strings.ToUpper("ARM"), Total: &collector.UnitValue{Unit: CPUCORE, Value: cpusum}, Available: &collector.UnitValue{Unit: CPUCORE, Value: MoUsage.CpuAvailable}}
|
||
UsageNPU := &collector.Usage{Type: strings.ToUpper(NPU), Name: ASCEND, Total: &collector.UnitValue{Unit: NUMBER, Value: npusum}, Available: &collector.UnitValue{Unit: NUMBER, Value: MoUsage.NpuAvailable}}
|
||
UsageMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(RAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: memorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.MemoryAvailable}}
|
||
UsageVMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(VRAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: VMemorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.VMemoryAvailable}}
|
||
RunningTask := &collector.Usage{Type: strings.ToUpper(RUNNINGTASK), Total: &collector.UnitValue{Unit: NUMBER, Value: RunningTaskNum}}
|
||
Balance := &collector.Usage{Type: strings.ToUpper(BALANCE), Total: &collector.UnitValue{Unit: RMB, Value: BalanceValue}}
|
||
Rate := &collector.Usage{Type: strings.ToUpper(RATE), Total: &collector.UnitValue{Unit: PERHOUR, Value: RateValue}}
|
||
Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}, Name: strings.ToUpper("disk"), Available: &collector.UnitValue{Unit: GIGABYTE, Value: AvailableValue}}
|
||
|
||
resUsage := &collector.ResourceSpec{
|
||
ClusterId: strconv.FormatInt(m.participantId, 10),
|
||
}
|
||
|
||
cres := &collector.ClusterResource{}
|
||
cres.Resource = UsageNPU
|
||
cres.BaseResources = append(cres.BaseResources, UsageCPU)
|
||
cres.BaseResources = append(cres.BaseResources, UsageMEMORY)
|
||
cres.BaseResources = append(cres.BaseResources, UsageVMEMORY)
|
||
cres.BaseResources = append(cres.BaseResources, Storage)
|
||
|
||
RunningTaskRes := &collector.ClusterResource{}
|
||
RunningTaskRes.Resource = RunningTask
|
||
|
||
BalanceRes := &collector.ClusterResource{}
|
||
BalanceRes.Resource = Balance
|
||
|
||
RateRes := &collector.ClusterResource{}
|
||
RateRes.Resource = Rate
|
||
|
||
/* StorageRes := &collector.ClusterResource{}
|
||
StorageRes.Resource = Storage*/
|
||
|
||
resUsage.Resources = append(resUsage.Resources, cres)
|
||
resUsage.Resources = append(resUsage.Resources, RunningTaskRes)
|
||
resUsage.Resources = append(resUsage.Resources, BalanceRes)
|
||
resUsage.Resources = append(resUsage.Resources, RateRes)
|
||
//resUsage.Resources = append(resUsage.Resources, StorageRes)
|
||
return resUsage, nil
|
||
}
|
||
|
||
func (m *ModelArtsLink) Stop(ctx context.Context, id string) error {
|
||
req := &modelarts.StopTrainingJobReq{
|
||
TrainingJobId: id,
|
||
ActionType: "terminate",
|
||
}
|
||
resp, err := m.modelArtsRpc.StopTrainingJob(ctx, req)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if resp.Code != 0 {
|
||
return errors.New(resp.ErrorMsg)
|
||
}
|
||
return nil
|
||
}
|