forked from JointCloud/pcm-coordinator
1293 lines
32 KiB
Go
1293 lines
32 KiB
Go
/*
|
|
|
|
Copyright (c) [2023] [pcm]
|
|
[pcm-coordinator] is licensed under Mulan PSL v2.
|
|
You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
You may obtain a copy of Mulan PSL v2 at:
|
|
http://license.coscl.org.cn/MulanPSL2
|
|
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
See the Mulan PSL v2 for more details.
|
|
|
|
*/
|
|
|
|
package storeLink
|
|
|
|
import (
|
|
"bufio"
|
|
"context"
|
|
"errors"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
|
|
"gitlink.org.cn/JointCloud/pcm-octopus/octopus"
|
|
"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
|
|
"io"
|
|
"math"
|
|
"mime/multipart"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type OctopusLink struct {
|
|
octopusRpc octopusclient.Octopus
|
|
pageIndex int32
|
|
pageSize int32
|
|
platform string
|
|
participantId int64
|
|
}
|
|
|
|
// Constants used by the Octopus adapter. Names and values are unchanged;
// they are only grouped by purpose.
const (
	// Name prefixes for generated images and training jobs.
	IMG_NAME_PREFIX    = "oct_"
	IMG_VERSION_PREFIX = "version_"
	TASK_NAME_PREFIX   = "trainJob"

	// Resource pool and version sent with job, notebook and algorithm requests.
	RESOURCE_POOL = "common-pool"
	VERSION       = "V1"

	// Default training command and the base address used for download and
	// inference URLs.
	TRAIN_CMD = "cd /code; python train.py"
	DOMAIN    = "http://192.168.242.41:8001/"

	// Compute card identifiers.
	MLU    = "MLU"
	GCU    = "GCU"
	BIV100 = "BI-V100"

	// Vendor aliases, matched against preset image names.
	CAMBRICON = "cambricon"
	ENFLAME   = "enflame"
	ILUVATAR  = "iluvatar"

	// Chinese card names, matched against resource spec names.
	CAMBRICON_CN = "寒武纪290"
	ENFLAME_CN   = "燧原T20"
	ILUVATAR_CN  = "天数BI-V100"

	// Per-card compute figures and the base unit used to derive a card count
	// from a requested amount of compute.
	CAMBRICONMLU290 = 256
	EnflameT20      = 128
	BASE_TOPS       = 128

	// Additional vendor identifiers.
	HANWUJI   = "hanwuji"
	SUIYUAN   = "suiyuan"
	SAILINGSI = "sailingsi"
)
|
|
|
|
var (
|
|
cardAliasMap = map[string]string{
|
|
MLU: CAMBRICON,
|
|
GCU: ENFLAME,
|
|
BIV100: ILUVATAR,
|
|
}
|
|
cardCnMap = map[string]string{
|
|
MLU: CAMBRICON_CN,
|
|
GCU: ENFLAME_CN,
|
|
BIV100: ILUVATAR_CN,
|
|
}
|
|
cardTopsMap = map[string]float64{
|
|
MLU: CAMBRICONMLU290,
|
|
GCU: EnflameT20,
|
|
}
|
|
CardModelNameCmdMap = map[string]map[string]string{
|
|
BIV100: {"blip-image-captioning-base": "pip install -U transformers; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code; python infer_biv100.py",
|
|
"imagenet_resnet50": "pip install -U transformers; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code/infer; python infer_biv100.py",
|
|
"ChatGLM-6B": "su root; pip install transformers==4.33.2; pip install fastapi uvicorn[standard]; cd /code; python infer_biv100.py"},
|
|
MLU: {"blip-image-captioning-base": "",
|
|
"imagenet_resnet50": "su root; . /torch/venv3/pytorch/bin/activate; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code/infer; python infer_mlu.py"},
|
|
}
|
|
)
|
|
|
|
func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink {
|
|
return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100}
|
|
}
|
|
|
|
func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
|
|
// octopus创建镜像
|
|
createReq := &octopus.CreateImageReq{
|
|
Platform: o.platform,
|
|
CreateImage: &octopus.CreateImage{
|
|
SourceType: 1,
|
|
ImageName: IMG_NAME_PREFIX + utils.RandomString(7),
|
|
ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7),
|
|
},
|
|
}
|
|
createResp, err := o.octopusRpc.CreateImage(ctx, createReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// octopus上传镜像
|
|
uploadReq := &octopus.UploadImageReq{
|
|
Platform: o.platform,
|
|
ImageId: createResp.Payload.ImageId,
|
|
Params: &octopus.UploadImageParam{
|
|
Domain: "",
|
|
FileName: "",
|
|
},
|
|
}
|
|
uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Todo 实际上传
|
|
|
|
return uploadResp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
|
|
// octopus删除镜像
|
|
req := &octopus.DeleteImageReq{
|
|
Platform: o.platform,
|
|
ImageId: imageId,
|
|
}
|
|
resp, err := o.octopusRpc.DeleteImage(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) {
|
|
// octopus获取镜像列表
|
|
req := &octopus.GetUserImageListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetUserImageList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
|
|
// octopus提交任务
|
|
|
|
// python参数
|
|
var prms []*octopus.Parameters
|
|
for _, param := range params {
|
|
var p octopus.Parameters
|
|
s := strings.Split(param, COMMA)
|
|
p.Key = s[0]
|
|
p.Value = s[1]
|
|
prms = append(prms, &p)
|
|
}
|
|
|
|
//环境变量
|
|
envMap := make(map[string]string)
|
|
for _, env := range envs {
|
|
s := strings.Split(env, COMMA)
|
|
envMap[s[0]] = s[1]
|
|
}
|
|
|
|
req := &octopus.CreateTrainJobReq{
|
|
Platform: o.platform,
|
|
Params: &octopus.CreateTrainJobParam{
|
|
ImageId: imageId,
|
|
Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
|
|
ResourcePool: RESOURCE_POOL,
|
|
Config: []*octopus.Config{
|
|
{
|
|
Command: cmd,
|
|
ResourceSpecId: resourceId,
|
|
MinFailedTaskCount: 1,
|
|
MinSucceededTaskCount: 1,
|
|
TaskNumber: 1,
|
|
Parameters: prms,
|
|
Envs: envMap,
|
|
},
|
|
},
|
|
DataSetId: datasetsId,
|
|
DataSetVersion: VERSION,
|
|
AlgorithmId: algorithmId,
|
|
AlgorithmVersion: VERSION,
|
|
},
|
|
}
|
|
resp, err := o.octopusRpc.CreateTrainJob(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
|
|
// octopus获取任务
|
|
req := &octopus.GetTrainJobReq{
|
|
Platform: o.platform,
|
|
Id: taskId,
|
|
}
|
|
resp, err := o.octopusRpc.GetTrainJob(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
|
|
// octopus删除任务
|
|
req := &octopus.DeleteTrainJobReq{
|
|
Platform: o.platform,
|
|
JobIds: []string{taskId},
|
|
}
|
|
resp, err := o.octopusRpc.DeleteTrainJob(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) {
|
|
// octopus查询资源规格
|
|
req := &octopus.GetResourceSpecsReq{
|
|
Platform: o.platform,
|
|
ResourcePool: RESOURCE_POOL,
|
|
}
|
|
resp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
|
|
req := &octopus.GetResourceSpecsReq{
|
|
Platform: o.platform,
|
|
ResourcePool: RESOURCE_POOL,
|
|
}
|
|
specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !specResp.Success {
|
|
return nil, errors.New(specResp.Error.Message)
|
|
}
|
|
balanceReq := &octopus.GetUserBalanceReq{
|
|
Platform: o.platform,
|
|
}
|
|
balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !balanceResp.Success {
|
|
return nil, errors.New(balanceResp.Error.Message)
|
|
}
|
|
|
|
var cards []*collector.Card
|
|
balance := float64(balanceResp.Payload.BillingUser.Amount)
|
|
var cpuHours float64
|
|
for _, spec := range specResp.TrainResourceSpecs {
|
|
if spec.Price == 0 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
if len(ns) == 2 {
|
|
nss := strings.Split(ns[0], COLON)
|
|
if nss[0] == CPU {
|
|
cpuHours = -1
|
|
}
|
|
}
|
|
}
|
|
|
|
if spec.Price == 1 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
|
|
cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]]
|
|
if !isMapContainsKey {
|
|
continue
|
|
}
|
|
|
|
card := &collector.Card{
|
|
Platform: OCTOPUS,
|
|
Type: CARD,
|
|
Name: cardSpecs[1],
|
|
TOpsAtFp16: cardTops,
|
|
CardHours: balance / spec.Price,
|
|
}
|
|
cards = append(cards, card)
|
|
}
|
|
}
|
|
|
|
resourceStats := &collector.ResourceStats{
|
|
ClusterId: strconv.FormatInt(o.participantId, 10),
|
|
Name: o.platform,
|
|
Balance: balance,
|
|
CardsAvail: cards,
|
|
CpuCoreHours: cpuHours,
|
|
}
|
|
|
|
return resourceStats, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
|
|
req := &octopus.GetMyDatasetListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !resp.Success {
|
|
return nil, errors.New(resp.Error.Message)
|
|
}
|
|
specs := []*collector.DatasetsSpecs{}
|
|
for _, dataset := range resp.Payload.Datasets {
|
|
spec := &collector.DatasetsSpecs{Name: dataset.Name}
|
|
specs = append(specs, spec)
|
|
}
|
|
return specs, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
|
|
var algorithms []*collector.Algorithm
|
|
|
|
req := &octopus.GetMyAlgorithmListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !resp.Success {
|
|
return nil, errors.New("failed to get algorithms")
|
|
}
|
|
|
|
for _, a := range resp.Payload.Algorithms {
|
|
algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)}
|
|
algorithms = append(algorithms, algorithm)
|
|
}
|
|
return algorithms, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) {
|
|
var cards []string
|
|
for s, _ := range cardAliasMap {
|
|
cards = append(cards, s)
|
|
}
|
|
return cards, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetUserBalance(ctx context.Context) (float64, error) {
|
|
balanceReq := &octopus.GetUserBalanceReq{
|
|
Platform: o.platform,
|
|
}
|
|
balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if !balanceResp.Success {
|
|
if balanceResp.Error != nil {
|
|
return 0, errors.New(balanceResp.Error.Message)
|
|
} else {
|
|
return 0, errors.New("failed to get user balance")
|
|
}
|
|
}
|
|
balance := float64(balanceResp.Payload.BillingUser.Amount)
|
|
return balance, nil
|
|
}
|
|
|
|
func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
|
|
var name string
|
|
if resourceType == CARD {
|
|
name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
|
|
} else {
|
|
name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
|
|
}
|
|
|
|
req := &octopus.GetMyAlgorithmListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if !resp.Success {
|
|
return "", errors.New("failed to get algorithmList")
|
|
}
|
|
|
|
var algorithmId string
|
|
var algorithms []*octopus.Algorithms
|
|
for _, a := range resp.Payload.Algorithms {
|
|
if strings.ToLower(a.FrameworkName) != taskType {
|
|
continue
|
|
}
|
|
|
|
if a.AlgorithmDescript == name {
|
|
algorithms = append(algorithms, a)
|
|
}
|
|
}
|
|
|
|
if len(algorithms) == 0 {
|
|
return "", errors.New("algorithmId not found")
|
|
}
|
|
|
|
if len(algorithms) == 1 {
|
|
algorithmId = algorithms[0].AlgorithmId
|
|
}
|
|
|
|
aLatest := &octopus.Algorithms{}
|
|
for i, _ := range algorithms {
|
|
if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) {
|
|
aLatest = algorithms[i]
|
|
}
|
|
}
|
|
if aLatest.AlgorithmId == "" {
|
|
return "", errors.New("algorithmId not found")
|
|
}
|
|
|
|
algorithmId = aLatest.AlgorithmId
|
|
|
|
dcReq := &octopus.DownloadCompressReq{
|
|
Platform: o.platform,
|
|
Version: VERSION,
|
|
AlgorithmId: algorithmId,
|
|
}
|
|
dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if !dcResp.Success {
|
|
return "", errors.New(dcResp.Error.Message)
|
|
}
|
|
|
|
daReq := &octopus.DownloadAlgorithmReq{
|
|
Platform: o.platform,
|
|
Version: VERSION,
|
|
AlgorithmId: algorithmId,
|
|
CompressAt: dcResp.Payload.CompressAt,
|
|
Domain: DOMAIN,
|
|
}
|
|
daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if !daResp.Success {
|
|
return "", errors.New(dcResp.Error.Message)
|
|
}
|
|
urlReq := &octopus.AlgorithmUrlReq{
|
|
Platform: o.platform,
|
|
Url: daResp.Payload.DownloadUrl,
|
|
}
|
|
urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return urlResp.Algorithm, nil
|
|
}
|
|
|
|
func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
|
|
//var name string
|
|
//if resourceType == CARD {
|
|
// name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
|
|
//} else {
|
|
// name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
|
|
//}
|
|
//uploadReq := &octopus.UploadAlgorithmReq{}
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
|
|
instance, err := strconv.ParseInt(instanceNum, 10, 32)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
req := &octopus.GetTrainJobLogReq{
|
|
Platform: o.platform,
|
|
TaskId: taskId,
|
|
TaskNum: "task0",
|
|
Num: int32(instance),
|
|
}
|
|
resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if strings.Contains(resp.Content, "404 Not Found") {
|
|
resp.Content = "waiting for logs..."
|
|
}
|
|
|
|
return resp.Content, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
|
|
resp, err := o.QueryTask(ctx, taskId)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
jobresp, ok := (resp).(*octopus.GetTrainJobResp)
|
|
if !jobresp.Success || !ok {
|
|
if jobresp.Error != nil {
|
|
return nil, errors.New(jobresp.Error.Message)
|
|
} else {
|
|
return nil, errors.New("get training task failed, empty error returned")
|
|
}
|
|
}
|
|
var task collector.Task
|
|
task.Id = jobresp.Payload.TrainJob.Id
|
|
if jobresp.Payload.TrainJob.StartedAt != 0 {
|
|
task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout)
|
|
}
|
|
if jobresp.Payload.TrainJob.CompletedAt != 0 {
|
|
task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout)
|
|
}
|
|
switch jobresp.Payload.TrainJob.Status {
|
|
case "succeeded":
|
|
task.Status = constants.Completed
|
|
case "failed":
|
|
task.Status = constants.Failed
|
|
case "running":
|
|
task.Status = constants.Running
|
|
case "stopped":
|
|
task.Status = constants.Stopped
|
|
case "pending":
|
|
task.Status = constants.Pending
|
|
default:
|
|
task.Status = "undefined"
|
|
}
|
|
|
|
return &task, nil
|
|
}
|
|
|
|
func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
|
|
err := o.GenerateSubmitParams(ctx, option)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return task, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
|
|
err := o.generateResourceId(ctx, option, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateDatasetsId(ctx, option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateImageId(ctx, option, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateAlgorithmId(ctx, option, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateCmd(option, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateEnv(option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateParams(option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
|
|
req := &octopus.GetResourceSpecsReq{
|
|
Platform: o.platform,
|
|
ResourcePool: RESOURCE_POOL,
|
|
}
|
|
specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !specResp.Success {
|
|
return errors.New(specResp.Error.Message)
|
|
}
|
|
|
|
if option != nil {
|
|
err = generateResourceIdForTraining(option, specResp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if ifoption != nil {
|
|
err = generateResourceIdForInferDeployInstance(ifoption, specResp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return errors.New("failed to set ResourceId")
|
|
}
|
|
|
|
func generateResourceIdForTraining(option *option.AiOption, specResp *octopus.GetResourceSpecsResp) error {
|
|
if option.ResourceType == "" {
|
|
return errors.New("ResourceType not set")
|
|
}
|
|
|
|
if option.ResourceType == CPU {
|
|
for _, spec := range specResp.TrainResourceSpecs {
|
|
if spec.Price == 0 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
|
|
if option.ResourceType == CARD {
|
|
if option.ComputeCard == "" {
|
|
option.ComputeCard = GCU
|
|
}
|
|
err := setResourceIdByCard(option, specResp, option.ComputeCard)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return errors.New("ResourceType not set")
|
|
}
|
|
|
|
func generateResourceIdForInferDeployInstance(option *option.InferOption, specResp *octopus.GetResourceSpecsResp) error {
|
|
// temporarily use bi-v100
|
|
cardName, ok := cardCnMap[BIV100]
|
|
if !ok {
|
|
errors.New("computeCard not set")
|
|
}
|
|
|
|
// set computeCard
|
|
option.ComputeCard = BIV100
|
|
|
|
for _, spec := range specResp.TrainResourceSpecs {
|
|
names := strings.Split(spec.Name, COMMA)
|
|
if len(names) != 4 {
|
|
continue
|
|
}
|
|
|
|
ns := strings.Split(names[0], STAR)
|
|
if len(ns) != 2 {
|
|
continue
|
|
}
|
|
|
|
if ns[0] == "1" && ns[1] == cardName {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return errors.New("failed to set ResourceId")
|
|
}
|
|
|
|
func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error {
|
|
if option.DatasetsName == "" {
|
|
return errors.New("DatasetsName not set")
|
|
}
|
|
req := &octopus.GetMyDatasetListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.Success {
|
|
return errors.New("failed to get DatasetsId")
|
|
}
|
|
for _, dataset := range resp.Payload.Datasets {
|
|
if dataset.Name == option.DatasetsName {
|
|
option.DatasetsId = dataset.Id
|
|
return nil
|
|
}
|
|
}
|
|
return errors.New("failed to get DatasetsId")
|
|
}
|
|
|
|
func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
|
|
preImgReq := &octopus.GetPresetImageListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !preImgResp.Success {
|
|
return errors.New("failed to get PresetImages")
|
|
}
|
|
|
|
if option != nil {
|
|
if option.TaskType == "" {
|
|
return errors.New("TaskType not set")
|
|
}
|
|
|
|
req := &octopus.GetUserImageListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetUserImageList(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.Success {
|
|
return errors.New("failed to get imageId")
|
|
}
|
|
|
|
if option.ResourceType == CPU {
|
|
for _, img := range resp.Payload.Images {
|
|
if img.Image.ImageName == "test-image" {
|
|
option.ImageId = img.Image.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
|
|
err = generateImageIdForTraining(option, preImgResp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if ifoption != nil {
|
|
err = generateImageIdForInferDeployInstance(ifoption, preImgResp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return errors.New("failed to get ImageId")
|
|
}
|
|
|
|
func generateImageIdForTraining(option *option.AiOption, preImgResp *octopus.GetPresetImageListResp) error {
|
|
if option.ResourceType == CARD {
|
|
for _, image := range preImgResp.Payload.Images {
|
|
if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) {
|
|
switch strings.ToUpper(option.ComputeCard) {
|
|
case GCU:
|
|
if strings.HasPrefix(image.ImageVersion, "t20_") {
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
case BIV100:
|
|
if strings.HasPrefix(image.ImageVersion, "bi_") {
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
case MLU:
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return errors.New("failed to set ImageId")
|
|
}
|
|
|
|
func generateImageIdForInferDeployInstance(option *option.InferOption, preImgResp *octopus.GetPresetImageListResp) error {
|
|
for _, image := range preImgResp.Payload.Images {
|
|
// temporarily use bi-v100
|
|
if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(BIV100)]) {
|
|
switch strings.ToUpper(BIV100) {
|
|
case GCU:
|
|
if strings.HasPrefix(image.ImageVersion, "t20_") {
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
case BIV100:
|
|
if strings.HasPrefix(image.ImageVersion, "bi_") {
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
case MLU:
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return errors.New("failed to set ImageId")
|
|
}
|
|
|
|
func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
|
|
req := &octopus.GetMyAlgorithmListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.Success {
|
|
return errors.New("failed to get algorithmId")
|
|
}
|
|
|
|
if option != nil {
|
|
err = generateAlgorithmIdForTraining(option, resp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if ifoption != nil {
|
|
err = generateAlgorithmIdForInferDeployInstance(ifoption, resp)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return errors.New("failed to set AlgorithmId")
|
|
}
|
|
|
|
func generateAlgorithmIdForTraining(option *option.AiOption, resp *octopus.GetMyAlgorithmListResp) error {
|
|
for _, algorithm := range resp.Payload.Algorithms {
|
|
if algorithm.FrameworkName == strings.Title(option.TaskType) {
|
|
ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
|
|
if ns[0] != option.DatasetsName {
|
|
continue
|
|
}
|
|
if ns[1] != option.AlgorithmName {
|
|
continue
|
|
}
|
|
switch option.ResourceType {
|
|
case CPU:
|
|
if ns[2] != CPU {
|
|
continue
|
|
}
|
|
case CARD:
|
|
if ns[2] != strings.ToLower(option.ComputeCard) {
|
|
continue
|
|
}
|
|
}
|
|
|
|
option.AlgorithmId = algorithm.AlgorithmId
|
|
return nil
|
|
}
|
|
}
|
|
|
|
return errors.New("Algorithm does not exist")
|
|
}
|
|
|
|
func generateAlgorithmIdForInferDeployInstance(option *option.InferOption, resp *octopus.GetMyAlgorithmListResp) error {
|
|
if option.ModelType == "" {
|
|
return errors.New("ModelType not set")
|
|
}
|
|
|
|
if option.ModelName == "" {
|
|
return errors.New("ModelName not set")
|
|
}
|
|
|
|
for _, algorithm := range resp.Payload.Algorithms {
|
|
if strings.Contains(algorithm.AlgorithmName, option.ModelName) {
|
|
option.AlgorithmId = algorithm.AlgorithmId
|
|
return nil
|
|
}
|
|
}
|
|
return errors.New("ModelName does not exist")
|
|
}
|
|
|
|
func (o *OctopusLink) generateCmd(option *option.AiOption, ifoption *option.InferOption) error {
|
|
if option != nil {
|
|
err := generateCmdForTraining(option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
if ifoption != nil {
|
|
err := generateCmdForInferDeployInstance(ifoption)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return errors.New("failed to set cmd")
|
|
}
|
|
|
|
func generateCmdForTraining(option *option.AiOption) error {
|
|
if option.Cmd == "" {
|
|
switch option.ComputeCard {
|
|
case GCU:
|
|
option.Cmd = "cd /code; python3 train.py"
|
|
case MLU:
|
|
option.Cmd = ". /torch/venv3/pytorch/bin/activate; cd /code; python train.py"
|
|
default:
|
|
option.Cmd = TRAIN_CMD
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func generateCmdForInferDeployInstance(option *option.InferOption) error {
|
|
if option.Cmd == "" {
|
|
nameCmd, ok := CardModelNameCmdMap[option.ComputeCard]
|
|
if !ok {
|
|
return errors.New("failed to set cmd, ComputeCard not exist")
|
|
}
|
|
cmd, ok := nameCmd[option.ModelName]
|
|
if !ok {
|
|
return errors.New("failed to set cmd, ModelName not exist")
|
|
}
|
|
option.Cmd = cmd
|
|
return nil
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) generateEnv(option *option.AiOption) error {
|
|
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) generateParams(option *option.AiOption) error {
|
|
if len(option.Params) == 0 {
|
|
epoch := "epoch" + COMMA + "1"
|
|
option.Params = append(option.Params, epoch)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
|
|
if option.Tops == 0 {
|
|
for _, spec := range specs.TrainResourceSpecs {
|
|
if spec.Price == 1 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
|
|
option.ResourceId = spec.Id
|
|
option.ComputeCard = computeCard
|
|
return nil
|
|
}
|
|
} else {
|
|
continue
|
|
}
|
|
}
|
|
} else {
|
|
cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
|
|
for _, spec := range specs.TrainResourceSpecs {
|
|
if option.Tops < BASE_TOPS {
|
|
if spec.Price == 1 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
|
|
option.ResourceId = spec.Id
|
|
option.ComputeCard = computeCard
|
|
return nil
|
|
}
|
|
} else {
|
|
continue
|
|
}
|
|
} else {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
if len(ns) != 4 {
|
|
continue
|
|
}
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
if cardSpecs[1] != cardCnMap[strings.ToUpper(computeCard)] {
|
|
continue
|
|
}
|
|
s, err := strconv.ParseFloat(cardSpecs[0], 64)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
switch computeCard {
|
|
case GCU:
|
|
option.ComputeCard = computeCard
|
|
if cardNum == s { // 1, 4, 8
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 1 < cardNum && cardNum <= 4 && s == 4 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 4 < cardNum && s == 8 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
|
|
case MLU: // 1, 2, 4
|
|
option.ComputeCard = computeCard
|
|
if cardNum/2 == s {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 2 < cardNum/2 && s == 4 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return errors.New("set ResourceId error")
|
|
}
|
|
|
|
func (o *OctopusLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
|
|
req := &octopus.GetNotebookListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
SearchKey: DEPLOY_INSTANCE_PREFIEX,
|
|
}
|
|
list, err := o.octopusRpc.GetNotebookList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var imageUrls []*inference.InferUrl
|
|
for _, notebook := range list.Payload.GetNotebooks() {
|
|
if strings.Contains(notebook.Desc, option.ModelName) && notebook.Status == "running" {
|
|
url := strings.Replace(notebook.Tasks[0].Url, FORWARD_SLASH, "", -1)
|
|
names := strings.Split(notebook.Desc, FORWARD_SLASH)
|
|
imageUrl := &inference.InferUrl{
|
|
Url: DOMAIN + url,
|
|
Card: names[2],
|
|
}
|
|
imageUrls = append(imageUrls, imageUrl)
|
|
} else {
|
|
continue
|
|
}
|
|
}
|
|
|
|
if len(imageUrls) == 0 {
|
|
return nil, errors.New("no infer url available")
|
|
}
|
|
|
|
clusterWithUrl := &inference.ClusterInferUrl{
|
|
ClusterName: o.platform,
|
|
ClusterType: TYPE_OCTOPUS,
|
|
InferUrls: imageUrls,
|
|
}
|
|
return clusterWithUrl, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
|
|
var insList []*inference.DeployInstance
|
|
req := &octopus.GetNotebookListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
SearchKey: DEPLOY_INSTANCE_PREFIEX,
|
|
}
|
|
list, err := o.octopusRpc.GetNotebookList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if list.Error != nil {
|
|
return nil, errors.New(list.Error.Message)
|
|
}
|
|
for _, notebook := range list.Payload.Notebooks {
|
|
ins := &inference.DeployInstance{}
|
|
ins.InstanceName = notebook.Name
|
|
ins.InstanceId = notebook.Id
|
|
ins.ClusterName = o.platform
|
|
ins.Status = notebook.Status
|
|
ins.ClusterType = TYPE_OCTOPUS
|
|
insList = append(insList, ins)
|
|
}
|
|
return insList, nil
|
|
}
|
|
|
|
func (o *OctopusLink) StartInferDeployInstance(ctx context.Context, id string) bool {
|
|
req := &octopus.StartNotebookReq{
|
|
Platform: o.platform,
|
|
Id: id,
|
|
}
|
|
resp, err := o.octopusRpc.StartNotebook(ctx, req)
|
|
if err != nil || !resp.Success {
|
|
return false
|
|
}
|
|
return resp.Success
|
|
}
|
|
|
|
func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bool {
|
|
req := &octopus.StopNotebookReq{
|
|
Platform: o.platform,
|
|
Id: id,
|
|
}
|
|
resp, err := o.octopusRpc.StopNotebook(ctx, req)
|
|
if err != nil || !resp.Success {
|
|
return false
|
|
}
|
|
return resp.Success
|
|
}
|
|
|
|
func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
|
|
ins := &inference.DeployInstance{}
|
|
|
|
req := &octopus.GetNotebookReq{
|
|
Platform: o.platform,
|
|
Id: id,
|
|
}
|
|
|
|
resp, err := o.octopusRpc.GetNotebook(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if resp.Payload == nil {
|
|
return nil, errors.New("instance does not exist")
|
|
}
|
|
|
|
url := strings.Replace(resp.Payload.Notebook.Tasks[0].Url, FORWARD_SLASH, "", -1)
|
|
inferUrl := DOMAIN + url
|
|
|
|
var modelType string
|
|
var modelName string
|
|
var card string
|
|
if resp.Payload.Notebook.Desc != "" {
|
|
str := strings.Split(resp.Payload.Notebook.Desc, FORWARD_SLASH)
|
|
if len(str) == 3 {
|
|
modelType = str[0]
|
|
modelName = str[1]
|
|
card = str[2]
|
|
}
|
|
}
|
|
|
|
ins.InstanceName = resp.Payload.Notebook.Name
|
|
ins.InstanceId = resp.Payload.Notebook.Id
|
|
ins.ClusterName = o.platform
|
|
ins.Status = resp.Payload.Notebook.Status
|
|
ins.ClusterType = TYPE_OCTOPUS
|
|
ins.ModelType = modelType
|
|
ins.ModelName = modelName
|
|
ins.InferUrl = inferUrl
|
|
ins.InferCard = card
|
|
|
|
return ins, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
|
|
stream, err := o.octopusRpc.GetInferResult(ctx)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
buffer := make([]byte, 2048)
|
|
bufferedReader := bufio.NewReader(file)
|
|
for {
|
|
_, err = bufferedReader.Read(buffer)
|
|
if err != nil {
|
|
if err != io.EOF {
|
|
return "", err
|
|
}
|
|
break
|
|
}
|
|
err = stream.Send(&octopus.InferResultReq{
|
|
Platform: o.platform,
|
|
InferUrl: url,
|
|
FileName: fileName,
|
|
FileBytes: buffer,
|
|
})
|
|
}
|
|
|
|
recv, err := stream.CloseAndRecv()
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return recv.Result, nil
|
|
}
|
|
|
|
func (o *OctopusLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
|
|
err := o.generateResourceId(ctx, nil, option)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
err = o.generateAlgorithmId(ctx, nil, option)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
err = o.generateImageId(ctx, nil, option)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
err = o.generateCmd(nil, option)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
desc := option.ModelType + FORWARD_SLASH + option.ModelName + FORWARD_SLASH + strings.ToLower(BIV100)
|
|
param := &octopus.CreateNotebookParam{
|
|
Name: DEPLOY_INSTANCE_PREFIEX + DASH + utils.TimeString(),
|
|
ResourcePool: RESOURCE_POOL,
|
|
ResourceSpecId: option.ResourceId,
|
|
AlgorithmId: option.AlgorithmId,
|
|
AlgorithmVersion: VERSION,
|
|
ImageId: option.ImageId,
|
|
DatasetId: "",
|
|
DatasetVersion: "",
|
|
Command: option.Cmd,
|
|
Desc: desc,
|
|
TaskNumber: 1,
|
|
}
|
|
req := &octopus.CreateNotebookReq{
|
|
Platform: o.platform,
|
|
Params: param,
|
|
}
|
|
|
|
resp, err := o.octopusRpc.CreateNotebook(ctx, req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if !resp.Success {
|
|
return "", errors.New(resp.Error.Message)
|
|
}
|
|
|
|
return resp.Payload.Id, nil
|
|
}
|
|
|
|
func (o *OctopusLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
|
|
ifoption := &option.InferOption{
|
|
ModelName: name,
|
|
ModelType: mtype,
|
|
}
|
|
err := o.generateAlgorithmId(ctx, nil, ifoption)
|
|
if err != nil {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func (o *OctopusLink) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
|
|
_ = &collector.ResourceSpec{
|
|
ClusterId: strconv.FormatInt(o.participantId, 10),
|
|
Resources: make([]interface{}, 0),
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
func (o *OctopusLink) Stop(ctx context.Context, id string) error {
|
|
return errors.New("failed to implement")
|
|
}
|