// pcm-coordinator/internal/logic/inference/createinferencetasklogic.go
//
// 355 lines
// 9.0 KiB
// Go

package inference
import (
"context"
"encoding/json"
"errors"
"fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/status"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
"strconv"
"strings"
"sync"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"github.com/zeromicro/go-zero/core/logx"
)
// CreateInferenceTaskLogic holds the per-request state used by the
// inference-task creation methods in this file.
type CreateInferenceTaskLogic struct {
// Embedded logger; NewCreateInferenceTaskLogic derives it from ctx.
logx.Logger
// Request context, forwarded to the cluster calls made while deploying.
ctx context.Context
// Service context through which the scheduler's AiService and AiStorages
// are reached.
svcCtx *svc.ServiceContext
}
// NewCreateInferenceTaskLogic builds a CreateInferenceTaskLogic for one
// request. The embedded logger is obtained from logx.WithContext(ctx); ctx and
// svcCtx are stored unchanged on the returned value.
func NewCreateInferenceTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CreateInferenceTaskLogic {
	logic := new(CreateInferenceTaskLogic)
	logic.Logger = logx.WithContext(ctx)
	logic.ctx = ctx
	logic.svcCtx = svcCtx
	return logic
}
// AdapterId is the adapter key under which CreateInferenceTask groups every
// assigned cluster before dispatching the deployments.
// NOTE(review): the value is hard-coded; presumably it identifies one specific
// deployed adapter — confirm whether it should come from configuration.
const AdapterId = "1777144940459986944"
// CreateInferenceTask creates an inference task and deploys one instance per
// assigned cluster.
//
// Steps, in order:
//  1. validate req.JobResources for the "inference" task type;
//  2. build the assigned clusters from req.DataDistributes and derive the
//     model name from them;
//  3. obtain the task name from HandleDuplicateTaskName and persist the task
//     with SaveInferDeployTask;
//  4. deploy to every assigned cluster, grouped under the fixed AdapterId.
//
// On success the response carries the stored task id (as a decimal string) and
// the task name. Any failing step returns a nil response together with the
// error. When deployment fails and at least one cluster was assigned, a status
// message for the first assigned cluster is reported; a failure of that report
// is logged but does not replace the deployment error returned to the caller.
func (l *CreateInferenceTaskLogic) CreateInferenceTask(req *types.CreateInferenceTaskReq) (resp *types.CreateInferenceTaskResp, err error) {
	if err = task.ValidateJobResources(req.JobResources, "inference"); err != nil {
		return nil, err
	}

	clusters, err := generateClustersForTaskCreation(req.DataDistributes, req.Name)
	if err != nil {
		return nil, err
	}

	modelName, err := generateModelName(clusters)
	if err != nil {
		return nil, err
	}

	taskName, err := l.svcCtx.Scheduler.AiService.HandleDuplicateTaskName(req.Name, "inference")
	if err != nil {
		return nil, err
	}

	assignedClusters := task.CopyParams(clusters, req.JobResources.Clusters, "inference")

	opt := &option.InferOption{
		TaskName:  taskName,
		TaskDesc:  req.Description,
		ModelType: "",
		ModelName: modelName,
		Cmd:       "",
	}

	taskId, err := l.svcCtx.Scheduler.AiStorages.SaveInferDeployTask(taskName, req.UserId, modelName, "", req.Description)
	if err != nil {
		return nil, err
	}

	adapterClusterMap := make(map[string][]*strategy.AssignedCluster)
	adapterClusterMap[AdapterId] = assignedClusters

	if err = l.createInferenceTask(taskId, adapterClusterMap, opt); err != nil {
		// Log the deployment failure under its own label so it is not
		// mistaken for a status-report failure.
		logx.Errorf("create inference task %s failed: %s", taskName, err.Error())

		if len(assignedClusters) != 0 {
			// Best-effort status report: its failure is logged, never returned,
			// so the caller always sees the original deployment error.
			reportErr := status.ReportInferenceStatusMessages(l.svcCtx, nil, taskName, strconv.FormatInt(taskId, 10), assignedClusters[0].ClusterId, "", false, "")
			if reportErr != nil {
				logx.Errorf("############ Report Infer Task Status Message Error %s", reportErr.Error())
			}
		}
		return nil, err
	}

	return &types.CreateInferenceTaskResp{
		TaskId:   strconv.FormatInt(taskId, 10),
		TaskName: taskName,
	}, nil
}
// createInferenceTask deploys one inference instance for every cluster in
// adapterClusterMap (adapter id -> assigned clusters) on behalf of taskId.
//
// Each cluster is handled in its own goroutine with a private clone of option
// that is overlaid with the cluster's settings; at most two deployments run at
// the same time. The function waits for all of them to finish. If any failed,
// it returns a single error whose message has one line per failed cluster
// (cluster name when it can be looked up, cluster id otherwise); it returns
// nil when every deployment succeeded.
func (l *CreateInferenceTaskLogic) createInferenceTask(taskId int64, adapterClusterMap map[string][]*strategy.AssignedCluster, option *option.InferOption) error {
	// deployFailure pairs a failed cluster with the error it produced.
	type deployFailure struct {
		clusterId string
		err       error
	}

	var total int
	for _, clusters := range adapterClusterMap {
		total += len(clusters)
	}

	// Sized to the number of clusters so a worker never blocks when reporting.
	failures := make(chan deployFailure, total)
	// Semaphore limiting the number of in-flight deployments to two.
	slots := make(chan struct{}, 2)

	var wg sync.WaitGroup
	for adapterId, clusters := range adapterClusterMap {
		for _, cluster := range clusters {
			wg.Add(1)
			slots <- struct{}{}
			// The loop values are passed as arguments so each goroutine works
			// on its own copy even with pre-Go 1.22 loop-variable semantics.
			go func(adapterId string, cluster *strategy.AssignedCluster) {
				defer wg.Done()
				defer func() { <-slots }()

				opt, err := cloneOption(option)
				if err != nil {
					// Without a clone there is nothing safe to mutate; record
					// the failure instead of dereferencing a nil option.
					failures <- deployFailure{clusterId: cluster.ClusterId, err: err}
					return
				}
				updateInferOption(cluster, opt)

				if err := l.createDeployInstance(taskId, adapterId, cluster.ClusterId, opt); err != nil {
					failures <- deployFailure{clusterId: cluster.ClusterId, err: err}
				}
			}(adapterId, cluster)
		}
	}
	wg.Wait()
	close(failures)

	// All senders have finished, so len reports the final number of failures.
	if len(failures) == 0 {
		return nil
	}

	var msg strings.Builder
	for failure := range failures {
		clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(failure.clusterId)
		if err != nil {
			// Fall back to the id when the name cannot be resolved.
			clusterName = failure.clusterId
		}
		fmt.Fprintf(&msg, "CreateInstance Failed # clusterName: %v, error: %v \n", clusterName, failure.err.Error())
	}
	return errors.New(msg.String())
}
// updateInferOption copies the per-cluster settings from cluster onto opt,
// overwriting whatever opt held in those fields. Values are assigned as-is
// (no deep copy is made here).
func updateInferOption(cluster *strategy.AssignedCluster, opt *option.InferOption) {
	// Identifiers of the model, code and image selected for this cluster.
	opt.ModelID = cluster.ModelId
	opt.AlgorithmId = cluster.CodeId
	opt.ImageId = cluster.ImageId

	// Command, environment, parameters, resources and output settings.
	opt.Cmd = cluster.Cmd
	opt.Envs = cluster.Envs
	opt.Params = cluster.Params
	opt.ResourcesRequired = cluster.ResourcesRequired
	opt.Output = cluster.Output
}
// generateClustersForTaskCreation turns the Code, Model and Image distributes
// of a request into one AssignedCluster per cluster id.
//
// A cluster is created only by a Code entry whose JsonData carries a non-empty
// id; Model and Image entries then fill in ModelId/ModelName and ImageId for
// clusters that already exist and are ignored for any other cluster id.
// Entries whose JsonData id is empty are skipped. taskName is used only in
// error messages.
//
// An error is returned when a distribute lists no clusters, a cluster id is
// empty, a JsonData payload is not valid JSON, a resulting cluster lacks a
// model, code or image id, or no cluster was produced at all.
//
// The returned slice follows the order in which cluster ids first appear in
// the Code distributes, so the result is stable across calls.
func generateClustersForTaskCreation(distributes types.DataDistribute, taskName string) ([]*strategy.AssignedCluster, error) {
	// decode parses one JsonData payload, attaching task, cluster and data
	// type to the error so the offending entry can be identified.
	decode := func(raw []byte, clusterID, dataType string) (entity.JsonData, error) {
		var data entity.JsonData
		if err := json.Unmarshal(raw, &data); err != nil {
			return data, fmt.Errorf("jsonData convert failed, task %s, cluster %s, datatype: %s: %w", taskName, clusterID, dataType, err)
		}
		return data, nil
	}

	clusterMap := make(map[string]*strategy.AssignedCluster)
	// order remembers first-seen cluster ids; map iteration alone is random.
	var order []string

	for _, distribute := range distributes.Code {
		if len(distribute.Clusters) == 0 {
			return nil, fmt.Errorf("Code distribute: must specify at least one cluster")
		}
		for _, c := range distribute.Clusters {
			if c.ClusterID == "" {
				return nil, fmt.Errorf("Code distribute: clusterId can not be empty")
			}
			data, err := decode([]byte(c.JsonData), c.ClusterID, "Code")
			if err != nil {
				return nil, err
			}
			if data.Id == "" {
				continue
			}
			if _, seen := clusterMap[c.ClusterID]; !seen {
				order = append(order, c.ClusterID)
			}
			// A later Code entry for the same cluster replaces the earlier one.
			clusterMap[c.ClusterID] = &strategy.AssignedCluster{
				ClusterId: c.ClusterID,
				CodeId:    data.Id,
			}
		}
	}

	for _, distribute := range distributes.Model {
		if len(distribute.Clusters) == 0 {
			return nil, fmt.Errorf("Model distribute: must specify at least one cluster")
		}
		for _, c := range distribute.Clusters {
			if c.ClusterID == "" {
				return nil, fmt.Errorf("Model distribute: clusterId can not be empty")
			}
			data, err := decode([]byte(c.JsonData), c.ClusterID, "Model")
			if err != nil {
				return nil, err
			}
			if data.Id == "" {
				continue
			}
			if cluster, ok := clusterMap[c.ClusterID]; ok {
				cluster.ModelId = data.Id
				cluster.ModelName = data.Name
			}
		}
	}

	for _, distribute := range distributes.Image {
		if len(distribute.Clusters) == 0 {
			return nil, fmt.Errorf("Image distribute: must specify at least one cluster")
		}
		for _, c := range distribute.Clusters {
			if c.ClusterID == "" {
				return nil, fmt.Errorf("Image distribute: clusterId can not be empty")
			}
			data, err := decode([]byte(c.JsonData), c.ClusterID, "Image")
			if err != nil {
				return nil, err
			}
			if data.Id == "" {
				continue
			}
			if cluster, ok := clusterMap[c.ClusterID]; ok {
				cluster.ImageId = data.Id
			}
		}
	}

	assignedClusters := make([]*strategy.AssignedCluster, 0, len(order))
	for _, id := range order {
		c := clusterMap[id]
		if c.ModelId == "" {
			return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ModelId")
		}
		if c.CodeId == "" {
			return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "CodeId")
		}
		if c.ImageId == "" {
			return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ImageId")
		}
		assignedClusters = append(assignedClusters, c)
	}

	if len(assignedClusters) == 0 {
		return nil, fmt.Errorf("no code provided")
	}
	return assignedClusters, nil
}
// generateModelName returns the ModelName of every cluster joined with commas,
// in slice order. A single cluster yields just its model name and an empty
// slice yields the empty string. The returned error is always nil.
func generateModelName(clusters []*strategy.AssignedCluster) (string, error) {
	names := make([]string, 0, len(clusters))
	for _, cluster := range clusters {
		names = append(names, cluster.ModelName)
	}
	return strings.Join(names, ","), nil
}
// createDeployInstance creates an inference deploy instance on the cluster
// identified by adapterId/clusterId and stores it under taskId.
//
// The cluster handle is looked up in the scheduler's InferenceAdapterMap; an
// unknown adapter or cluster id is an error. Everything that can fail without
// touching the cluster (parsing both ids as base-10 int64 and resolving the
// adapter and cluster names) runs before the instance is created, so a failure
// there cannot leave behind an instance that was created but never saved.
// After creation the instance is fetched back and saved with
// SaveInferDeployInstance. The first error encountered is returned as-is.
func (l *CreateInferenceTaskLogic) createDeployInstance(taskId int64, adapterId string, clusterId string, opt *option.InferOption) error {
	cmap, found := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[adapterId]
	if !found {
		return errors.New("adapterId not exist: " + adapterId)
	}
	iCluster, found := cmap[clusterId]
	if !found {
		return errors.New("clusterId not exist: " + clusterId)
	}

	// Local validation and lookups first: none of these touch the cluster.
	aid, err := strconv.ParseInt(adapterId, 10, 64)
	if err != nil {
		return err
	}
	cid, err := strconv.ParseInt(clusterId, 10, 64)
	if err != nil {
		return err
	}
	adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(adapterId)
	if err != nil {
		return err
	}
	clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(clusterId)
	if err != nil {
		return err
	}

	// Only now create the instance on the cluster and read it back.
	insId, err := iCluster.CreateInferDeployInstance(l.ctx, opt)
	if err != nil {
		return err
	}
	ins, err := iCluster.GetInferDeployInstance(l.ctx, insId)
	if err != nil {
		return err
	}

	_, err = l.svcCtx.Scheduler.AiStorages.SaveInferDeployInstance(taskId, ins.InstanceId, ins.InstanceName, aid, adapterName, cid, clusterName, ins.ModelName, ins.ModelType, ins.InferCard, ins.ClusterType)
	return err
}
// cloneOption returns an independent copy of opt produced by marshalling it to
// JSON and unmarshalling the bytes into a fresh InferOption. Only what
// survives encoding/json is carried over. A marshal or unmarshal failure is
// returned with a nil result.
func cloneOption(opt *option.InferOption) (*option.InferOption, error) {
	encoded, err := json.Marshal(opt)
	if err != nil {
		return nil, err
	}

	copied := new(option.InferOption)
	if err := json.Unmarshal(encoded, copied); err != nil {
		return nil, err
	}
	return copied, nil
}