forked from JointCloud/pcm-coordinator
906 lines
22 KiB
Go
906 lines
22 KiB
Go
/*
|
|
|
|
Copyright (c) [2023] [pcm]
|
|
[pcm-coordinator] is licensed under Mulan PSL v2.
|
|
You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
You may obtain a copy of Mulan PSL v2 at:
|
|
http://license.coscl.org.cn/MulanPSL2
|
|
THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
See the Mulan PSL v2 for more details.
|
|
|
|
*/
|
|
|
|
package storeLink
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
|
|
"gitlink.org.cn/JointCloud/pcm-octopus/octopus"
|
|
"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
|
|
"math"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
type OctopusLink struct {
|
|
octopusRpc octopusclient.Octopus
|
|
pageIndex int32
|
|
pageSize int32
|
|
platform string
|
|
participantId int64
|
|
}
|
|
|
|
const (
	// Name prefixes used when creating images and train jobs.
	IMG_NAME_PREFIX    = "oct_"
	IMG_VERSION_PREFIX = "version_"
	TASK_NAME_PREFIX   = "trainJob"

	// Defaults sent with train-job and algorithm requests.
	RESOURCE_POOL = "common-pool"
	TRAIN_CMD     = "cd /code; python train.py"
	VERSION       = "V1"

	// DOMAIN is the base URL used for algorithm downloads and inference URLs.
	// NOTE(review): hard-coded address; consider making it configurable.
	DOMAIN = "http://192.168.242.41:8001/"

	// Compute-card codes; these are the keys of the card maps below.
	MLU    = "MLU"
	GCU    = "GCU"
	BIV100 = "BI-V100"

	// Vendor aliases (values of cardAliasMap).
	CAMBRICON = "cambricon"
	ENFLAME   = "enflame"
	ILUVATAR  = "iluvatar"

	// Chinese card names (values of cardCnMap).
	CAMBRICON_CN = "寒武纪290"
	ENFLAME_CN   = "燧原T20"
	ILUVATAR_CN  = "天数BI-V100"

	// Other identifier strings.
	HANWUJI   = "hanwuji"
	SUIYUAN   = "suiyuan"
	SAILINGSI = "sailingsi"

	// TOPS figures: per-card values stored in cardTopsMap, and the base unit
	// setResourceIdByCard divides the requested TOPS by.
	CAMBRICONMLU290 = 256
	EnflameT20      = 128
	BASE_TOPS       = 128
)
|
|
|
|
var (
|
|
cardAliasMap = map[string]string{
|
|
MLU: CAMBRICON,
|
|
GCU: ENFLAME,
|
|
BIV100: ILUVATAR,
|
|
}
|
|
cardCnMap = map[string]string{
|
|
MLU: CAMBRICON_CN,
|
|
GCU: ENFLAME_CN,
|
|
BIV100: ILUVATAR_CN,
|
|
}
|
|
cardTopsMap = map[string]float64{
|
|
MLU: CAMBRICONMLU290,
|
|
GCU: EnflameT20,
|
|
}
|
|
)
|
|
|
|
func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink {
|
|
return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100}
|
|
}
|
|
|
|
func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
|
|
// octopus创建镜像
|
|
createReq := &octopus.CreateImageReq{
|
|
Platform: o.platform,
|
|
CreateImage: &octopus.CreateImage{
|
|
SourceType: 1,
|
|
ImageName: IMG_NAME_PREFIX + utils.RandomString(7),
|
|
ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7),
|
|
},
|
|
}
|
|
createResp, err := o.octopusRpc.CreateImage(ctx, createReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// octopus上传镜像
|
|
uploadReq := &octopus.UploadImageReq{
|
|
Platform: o.platform,
|
|
ImageId: createResp.Payload.ImageId,
|
|
Params: &octopus.UploadImageParam{
|
|
Domain: "",
|
|
FileName: "",
|
|
},
|
|
}
|
|
uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Todo 实际上传
|
|
|
|
return uploadResp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
|
|
// octopus删除镜像
|
|
req := &octopus.DeleteImageReq{
|
|
Platform: o.platform,
|
|
ImageId: imageId,
|
|
}
|
|
resp, err := o.octopusRpc.DeleteImage(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) {
|
|
// octopus获取镜像列表
|
|
req := &octopus.GetUserImageListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetUserImageList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
|
|
// octopus提交任务
|
|
|
|
// python参数
|
|
var prms []*octopus.Parameters
|
|
for _, param := range params {
|
|
var p octopus.Parameters
|
|
s := strings.Split(param, COMMA)
|
|
p.Key = s[0]
|
|
p.Value = s[1]
|
|
prms = append(prms, &p)
|
|
}
|
|
|
|
//环境变量
|
|
envMap := make(map[string]string)
|
|
for _, env := range envs {
|
|
s := strings.Split(env, COMMA)
|
|
envMap[s[0]] = s[1]
|
|
}
|
|
|
|
req := &octopus.CreateTrainJobReq{
|
|
Platform: o.platform,
|
|
Params: &octopus.CreateTrainJobParam{
|
|
ImageId: imageId,
|
|
Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
|
|
ResourcePool: RESOURCE_POOL,
|
|
Config: []*octopus.Config{
|
|
{
|
|
Command: cmd,
|
|
ResourceSpecId: resourceId,
|
|
MinFailedTaskCount: 1,
|
|
MinSucceededTaskCount: 1,
|
|
TaskNumber: 1,
|
|
Parameters: prms,
|
|
Envs: envMap,
|
|
},
|
|
},
|
|
DataSetId: datasetsId,
|
|
DataSetVersion: VERSION,
|
|
AlgorithmId: algorithmId,
|
|
AlgorithmVersion: VERSION,
|
|
},
|
|
}
|
|
resp, err := o.octopusRpc.CreateTrainJob(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
|
|
// octopus获取任务
|
|
req := &octopus.GetTrainJobReq{
|
|
Platform: o.platform,
|
|
Id: taskId,
|
|
}
|
|
resp, err := o.octopusRpc.GetTrainJob(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
|
|
// octopus删除任务
|
|
req := &octopus.DeleteTrainJobReq{
|
|
Platform: o.platform,
|
|
JobIds: []string{taskId},
|
|
}
|
|
resp, err := o.octopusRpc.DeleteTrainJob(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) {
|
|
// octopus查询资源规格
|
|
req := &octopus.GetResourceSpecsReq{
|
|
Platform: o.platform,
|
|
ResourcePool: RESOURCE_POOL,
|
|
}
|
|
resp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return resp, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
|
|
req := &octopus.GetResourceSpecsReq{
|
|
Platform: o.platform,
|
|
ResourcePool: RESOURCE_POOL,
|
|
}
|
|
specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !specResp.Success {
|
|
return nil, errors.New(specResp.Error.Message)
|
|
}
|
|
balanceReq := &octopus.GetUserBalanceReq{
|
|
Platform: o.platform,
|
|
}
|
|
balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !balanceResp.Success {
|
|
return nil, errors.New(balanceResp.Error.Message)
|
|
}
|
|
|
|
var cards []*collector.Card
|
|
balance := float64(balanceResp.Payload.BillingUser.Amount)
|
|
var cpuHours float64
|
|
for _, spec := range specResp.TrainResourceSpecs {
|
|
if spec.Price == 0 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
if len(ns) == 2 {
|
|
nss := strings.Split(ns[0], COLON)
|
|
if nss[0] == CPU {
|
|
cpuHours = -1
|
|
}
|
|
}
|
|
}
|
|
|
|
if spec.Price == 1 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
|
|
cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]]
|
|
if !isMapContainsKey {
|
|
continue
|
|
}
|
|
|
|
card := &collector.Card{
|
|
Platform: OCTOPUS,
|
|
Type: CARD,
|
|
Name: cardSpecs[1],
|
|
TOpsAtFp16: cardTops,
|
|
CardHours: balance / spec.Price,
|
|
}
|
|
cards = append(cards, card)
|
|
}
|
|
}
|
|
|
|
resourceStats := &collector.ResourceStats{
|
|
ClusterId: strconv.FormatInt(o.participantId, 10),
|
|
Name: o.platform,
|
|
Balance: balance,
|
|
CardsAvail: cards,
|
|
CpuCoreHours: cpuHours,
|
|
}
|
|
|
|
return resourceStats, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
|
|
req := &octopus.GetMyDatasetListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !resp.Success {
|
|
return nil, errors.New(resp.Error.Message)
|
|
}
|
|
specs := []*collector.DatasetsSpecs{}
|
|
for _, dataset := range resp.Payload.Datasets {
|
|
spec := &collector.DatasetsSpecs{Name: dataset.Name}
|
|
specs = append(specs, spec)
|
|
}
|
|
return specs, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
|
|
var algorithms []*collector.Algorithm
|
|
|
|
req := &octopus.GetMyAlgorithmListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if !resp.Success {
|
|
return nil, errors.New("failed to get algorithms")
|
|
}
|
|
|
|
for _, a := range resp.Payload.Algorithms {
|
|
algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)}
|
|
algorithms = append(algorithms, algorithm)
|
|
}
|
|
return algorithms, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) {
|
|
var cards []string
|
|
for s, _ := range cardAliasMap {
|
|
cards = append(cards, s)
|
|
}
|
|
return cards, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetUserBalance(ctx context.Context) (float64, error) {
|
|
balanceReq := &octopus.GetUserBalanceReq{
|
|
Platform: o.platform,
|
|
}
|
|
balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if !balanceResp.Success {
|
|
if balanceResp.Error != nil {
|
|
return 0, errors.New(balanceResp.Error.Message)
|
|
} else {
|
|
return 0, errors.New("failed to get user balance")
|
|
}
|
|
}
|
|
balance := float64(balanceResp.Payload.BillingUser.Amount)
|
|
return balance, nil
|
|
}
|
|
|
|
func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
|
|
var name string
|
|
if resourceType == CARD {
|
|
name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
|
|
} else {
|
|
name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
|
|
}
|
|
|
|
req := &octopus.GetMyAlgorithmListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if !resp.Success {
|
|
return "", errors.New("failed to get algorithmList")
|
|
}
|
|
|
|
var algorithmId string
|
|
var algorithms []*octopus.Algorithms
|
|
for _, a := range resp.Payload.Algorithms {
|
|
if strings.ToLower(a.FrameworkName) != taskType {
|
|
continue
|
|
}
|
|
|
|
if a.AlgorithmDescript == name {
|
|
algorithms = append(algorithms, a)
|
|
}
|
|
}
|
|
|
|
if len(algorithms) == 0 {
|
|
return "", errors.New("algorithmId not found")
|
|
}
|
|
|
|
if len(algorithms) == 1 {
|
|
algorithmId = algorithms[0].AlgorithmId
|
|
}
|
|
|
|
aLatest := &octopus.Algorithms{}
|
|
for i, _ := range algorithms {
|
|
if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) {
|
|
aLatest = algorithms[i]
|
|
}
|
|
}
|
|
if aLatest.AlgorithmId == "" {
|
|
return "", errors.New("algorithmId not found")
|
|
}
|
|
|
|
algorithmId = aLatest.AlgorithmId
|
|
|
|
dcReq := &octopus.DownloadCompressReq{
|
|
Platform: o.platform,
|
|
Version: VERSION,
|
|
AlgorithmId: algorithmId,
|
|
}
|
|
dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if !dcResp.Success {
|
|
return "", errors.New(dcResp.Error.Message)
|
|
}
|
|
|
|
daReq := &octopus.DownloadAlgorithmReq{
|
|
Platform: o.platform,
|
|
Version: VERSION,
|
|
AlgorithmId: algorithmId,
|
|
CompressAt: dcResp.Payload.CompressAt,
|
|
Domain: DOMAIN,
|
|
}
|
|
daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if !daResp.Success {
|
|
return "", errors.New(dcResp.Error.Message)
|
|
}
|
|
urlReq := &octopus.AlgorithmUrlReq{
|
|
Platform: o.platform,
|
|
Url: daResp.Payload.DownloadUrl,
|
|
}
|
|
urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return urlResp.Algorithm, nil
|
|
}
|
|
|
|
func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
|
|
//var name string
|
|
//if resourceType == CARD {
|
|
// name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
|
|
//} else {
|
|
// name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
|
|
//}
|
|
//uploadReq := &octopus.UploadAlgorithmReq{}
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
|
|
instance, err := strconv.ParseInt(instanceNum, 10, 32)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
req := &octopus.GetTrainJobLogReq{
|
|
Platform: o.platform,
|
|
TaskId: taskId,
|
|
TaskNum: "task0",
|
|
Num: int32(instance),
|
|
}
|
|
resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
if strings.Contains(resp.Content, "404 Not Found") {
|
|
resp.Content = "waiting for logs..."
|
|
}
|
|
|
|
return resp.Content, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
|
|
resp, err := o.QueryTask(ctx, taskId)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
jobresp, ok := (resp).(*octopus.GetTrainJobResp)
|
|
if !jobresp.Success || !ok {
|
|
if jobresp.Error != nil {
|
|
return nil, errors.New(jobresp.Error.Message)
|
|
} else {
|
|
return nil, errors.New("get training task failed, empty error returned")
|
|
}
|
|
}
|
|
var task collector.Task
|
|
task.Id = jobresp.Payload.TrainJob.Id
|
|
if jobresp.Payload.TrainJob.StartedAt != 0 {
|
|
task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout)
|
|
}
|
|
if jobresp.Payload.TrainJob.CompletedAt != 0 {
|
|
task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout)
|
|
}
|
|
switch jobresp.Payload.TrainJob.Status {
|
|
case "succeeded":
|
|
task.Status = constants.Completed
|
|
case "failed":
|
|
task.Status = constants.Failed
|
|
case "running":
|
|
task.Status = constants.Running
|
|
case "stopped":
|
|
task.Status = constants.Stopped
|
|
case "pending":
|
|
task.Status = constants.Pending
|
|
default:
|
|
task.Status = "undefined"
|
|
}
|
|
|
|
return &task, nil
|
|
}
|
|
|
|
func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
|
|
err := o.GenerateSubmitParams(ctx, option)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return task, nil
|
|
}
|
|
|
|
func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
|
|
err := o.generateResourceId(ctx, option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateDatasetsId(ctx, option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateImageId(ctx, option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateAlgorithmId(ctx, option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateCmd(option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateEnv(option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
err = o.generateParams(option)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption) error {
|
|
if option.ResourceType == "" {
|
|
return errors.New("ResourceType not set")
|
|
}
|
|
req := &octopus.GetResourceSpecsReq{
|
|
Platform: o.platform,
|
|
ResourcePool: RESOURCE_POOL,
|
|
}
|
|
specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !specResp.Success {
|
|
return errors.New(specResp.Error.Message)
|
|
}
|
|
|
|
if option.ResourceType == CPU {
|
|
for _, spec := range specResp.TrainResourceSpecs {
|
|
if spec.Price == 0 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
|
|
if option.ResourceType == CARD {
|
|
if option.ComputeCard == "" {
|
|
option.ComputeCard = GCU
|
|
}
|
|
err = setResourceIdByCard(option, specResp, option.ComputeCard)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
return errors.New("failed to get ResourceId")
|
|
}
|
|
|
|
func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error {
|
|
if option.DatasetsName == "" {
|
|
return errors.New("DatasetsName not set")
|
|
}
|
|
req := &octopus.GetMyDatasetListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.Success {
|
|
return errors.New("failed to get DatasetsId")
|
|
}
|
|
for _, dataset := range resp.Payload.Datasets {
|
|
if dataset.Name == option.DatasetsName {
|
|
option.DatasetsId = dataset.Id
|
|
return nil
|
|
}
|
|
}
|
|
return errors.New("failed to get DatasetsId")
|
|
}
|
|
|
|
func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption) error {
|
|
if option.TaskType == "" {
|
|
return errors.New("TaskType not set")
|
|
}
|
|
|
|
req := &octopus.GetUserImageListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetUserImageList(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.Success {
|
|
return errors.New("failed to get imageId")
|
|
}
|
|
|
|
if option.ResourceType == CPU {
|
|
for _, img := range resp.Payload.Images {
|
|
if img.Image.ImageName == "test-image" {
|
|
option.ImageId = img.Image.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
|
|
preImgReq := &octopus.GetPresetImageListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !preImgResp.Success {
|
|
return errors.New("failed to get PresetImages")
|
|
}
|
|
|
|
if option.ResourceType == CARD {
|
|
for _, image := range preImgResp.Payload.Images {
|
|
if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) {
|
|
switch strings.ToUpper(option.ComputeCard) {
|
|
case GCU:
|
|
if strings.HasPrefix(image.ImageVersion, "t20_") {
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
case BIV100:
|
|
if strings.HasPrefix(image.ImageVersion, "bi_") {
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
case MLU:
|
|
option.ImageId = image.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return errors.New("failed to get ImageId")
|
|
}
|
|
|
|
func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
|
|
req := &octopus.GetMyAlgorithmListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.Success {
|
|
return errors.New("failed to get algorithmId")
|
|
}
|
|
|
|
for _, algorithm := range resp.Payload.Algorithms {
|
|
if algorithm.FrameworkName == strings.Title(option.TaskType) {
|
|
ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
|
|
if ns[0] != option.DatasetsName {
|
|
continue
|
|
}
|
|
if ns[1] != option.AlgorithmName {
|
|
continue
|
|
}
|
|
switch option.ResourceType {
|
|
case CPU:
|
|
if ns[2] != CPU {
|
|
continue
|
|
}
|
|
case CARD:
|
|
if ns[2] != strings.ToLower(option.ComputeCard) {
|
|
continue
|
|
}
|
|
}
|
|
|
|
option.AlgorithmId = algorithm.AlgorithmId
|
|
return nil
|
|
}
|
|
}
|
|
|
|
if option.AlgorithmId == "" {
|
|
return errors.New("Algorithm does not exist")
|
|
}
|
|
|
|
return errors.New("failed to get AlgorithmId")
|
|
}
|
|
|
|
func (o *OctopusLink) generateCmd(option *option.AiOption) error {
|
|
if option.Cmd == "" {
|
|
switch option.ComputeCard {
|
|
case GCU:
|
|
option.Cmd = "cd /code; python3 train.py"
|
|
case MLU:
|
|
option.Cmd = ". /torch/venv3/pytorch/bin/activate; cd /code; python train.py"
|
|
default:
|
|
option.Cmd = TRAIN_CMD
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) generateEnv(option *option.AiOption) error {
|
|
|
|
return nil
|
|
}
|
|
|
|
func (o *OctopusLink) generateParams(option *option.AiOption) error {
|
|
if len(option.Params) == 0 {
|
|
epoch := "epoch" + COMMA + "1"
|
|
option.Params = append(option.Params, epoch)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
|
|
if option.Tops == 0 {
|
|
for _, spec := range specs.TrainResourceSpecs {
|
|
if spec.Price == 1 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
|
|
option.ResourceId = spec.Id
|
|
option.ComputeCard = computeCard
|
|
return nil
|
|
}
|
|
} else {
|
|
continue
|
|
}
|
|
}
|
|
} else {
|
|
cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
|
|
for _, spec := range specs.TrainResourceSpecs {
|
|
if option.Tops < BASE_TOPS {
|
|
if spec.Price == 1 {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
|
|
option.ResourceId = spec.Id
|
|
option.ComputeCard = computeCard
|
|
return nil
|
|
}
|
|
} else {
|
|
continue
|
|
}
|
|
} else {
|
|
ns := strings.Split(spec.Name, COMMA)
|
|
if len(ns) != 4 {
|
|
continue
|
|
}
|
|
cardSpecs := strings.Split(ns[0], STAR)
|
|
if cardSpecs[1] != cardCnMap[strings.ToUpper(computeCard)] {
|
|
continue
|
|
}
|
|
s, err := strconv.ParseFloat(cardSpecs[0], 64)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
switch computeCard {
|
|
case GCU:
|
|
option.ComputeCard = computeCard
|
|
if cardNum == s { // 1, 4, 8
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 1 < cardNum && cardNum <= 4 && s == 4 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 4 < cardNum && s == 8 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
|
|
case MLU: // 1, 2, 4
|
|
option.ComputeCard = computeCard
|
|
if cardNum/2 == s {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
if 2 < cardNum/2 && s == 4 {
|
|
option.ResourceId = spec.Id
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return errors.New("set ResourceId error")
|
|
}
|
|
|
|
func (o *OctopusLink) GetInferUrl(ctx context.Context, option *option.InferOption) ([]*inference.InferUrl, error) {
|
|
req := &octopus.GetNotebookListReq{
|
|
Platform: o.platform,
|
|
PageIndex: o.pageIndex,
|
|
PageSize: o.pageSize,
|
|
}
|
|
list, err := o.octopusRpc.GetNotebookList(ctx, req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var imageUrls []*inference.InferUrl
|
|
for _, notebook := range list.Payload.GetNotebooks() {
|
|
if strings.Contains(notebook.AlgorithmName, option.ModelName) && notebook.Status == "running" {
|
|
url := strings.Replace(notebook.Tasks[0].Url, FORWARD_SLASH, "", -1)
|
|
names := strings.Split(notebook.AlgorithmName, UNDERSCORE)
|
|
imageUrl := &inference.InferUrl{
|
|
Url: DOMAIN + url,
|
|
Card: names[2],
|
|
}
|
|
imageUrls = append(imageUrls, imageUrl)
|
|
} else {
|
|
continue
|
|
}
|
|
}
|
|
|
|
if len(imageUrls) == 0 {
|
|
return nil, errors.New("no infer url available")
|
|
}
|
|
return imageUrls, nil
|
|
}
|