pcm-coordinator/internal/scheduler/service/aiService.go

167 lines
6.3 KiB
Go

package service
import (
"github.com/zeromicro/go-zero/zrpc"
"gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task/tasksync"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink/octopusHttp"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
"gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
"gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
"strconv"
"sync"
"time"
)
// Platform identifiers used in this file to choose which cluster client to
// build. InitAiClusterMap compares them against a cluster's Driver field and
// UpdateClusterMaps compares them against its Name field.
const (
	// OCTOPUS identifies an Octopus cluster.
	OCTOPUS = "octopus"

	// MODELARTS identifies a ModelArts cluster.
	MODELARTS = "modelarts"

	// SHUGUANGAI identifies a Shuguang AI cluster.
	SHUGUANGAI = "shuguangAi"

	// OPENI identifies an OpenI cluster.
	OPENI = "openI"
)
// AiService groups the per-adapter AI cluster clients with the storage
// handle, configuration and task synchronisers that NewAiService wires up.
//
// The struct embeds a sync.Mutex by value, so it must be shared by pointer
// and never copied.
type AiService struct {
	// Cluster clients, keyed first by adapter ID and then by cluster ID.
	AiExecutorAdapterMap  map[string]map[string]executor.AiExecutor
	AiCollectorAdapterMap map[string]map[string]collector.AiCollector
	InferenceAdapterMap   map[string]map[string]inference.ICluster

	// Storage is the database handle used for adapter, cluster and task lookups.
	Storage *database.AiStorage

	// LocalCache is the cache map handed to NewAiService; it is stored as-is
	// and not read anywhere in this file.
	LocalCache map[string]interface{}

	// Conf is the configuration the service was constructed with.
	Conf *config.Config

	// TaskSyncLock is not locked anywhere in this file.
	// NOTE(review): presumably it serialises task synchronisation — confirm with callers.
	TaskSyncLock sync.Mutex

	// St and Si are the training and inference task synchronisers created in
	// NewAiService from the collector and inference maps respectively.
	St *tasksync.SyncTrain
	Si *tasksync.SyncInfer
}
// NewAiService builds an AiService from the adapters and clusters recorded in
// storages.
//
// It looks up the IDs of all adapters of type "1", loads each adapter's
// clusters and, for every adapter that has at least one cluster, stores the
// client maps produced by InitAiClusterMap under that adapter's ID. Adapters
// without clusters get no entry in any of the three maps. Finally the train
// and inference task synchronisers are created and attached.
//
// Any error returned by the storage lookups is returned unchanged together
// with a nil service.
func NewAiService(conf *config.Config, storages *database.AiStorage, localCache map[string]interface{}) (*AiService, error) {
	// NOTE(review): "1" is presumably the adapter type for AI adapters — confirm against the adapter table.
	const aiAdapterType = "1"

	ids, err := storages.GetAdapterIdsByType(aiAdapterType)
	if err != nil {
		return nil, err
	}

	svc := &AiService{
		AiExecutorAdapterMap:  make(map[string]map[string]executor.AiExecutor),
		AiCollectorAdapterMap: make(map[string]map[string]collector.AiCollector),
		InferenceAdapterMap:   make(map[string]map[string]inference.ICluster),
		Storage:               storages,
		LocalCache:            localCache,
		Conf:                  conf,
	}

	for _, adapterID := range ids {
		resp, err := storages.GetClustersByAdapterId(adapterID)
		if err != nil {
			return nil, err
		}
		if len(resp.List) > 0 {
			svc.AiExecutorAdapterMap[adapterID],
				svc.AiCollectorAdapterMap[adapterID],
				svc.InferenceAdapterMap[adapterID] = InitAiClusterMap(conf, resp.List)
		}
	}

	// The synchronisers receive the same map values the service holds.
	svc.St = tasksync.NewTrainTask(storages, svc.AiCollectorAdapterMap, conf)
	svc.Si = tasksync.NewInferTask(storages, svc.InferenceAdapterMap, conf)

	return svc, nil
}
// InitAiClusterMap creates one platform client per cluster and returns three
// maps (executors, collectors, inference clusters), each keyed by cluster ID.
// A single client instance is stored under the same key in all three maps.
//
// The client type is chosen from the cluster's Driver field; clusters whose
// driver matches none of the known identifiers are left out of the result.
// The ModelArts and Shuguang AI branches create their RPC clients with
// zrpc.MustNewClient for every matching cluster.
func InitAiClusterMap(conf *config.Config, clusters []types.ClusterInfo) (map[string]executor.AiExecutor, map[string]collector.AiCollector, map[string]inference.ICluster) {
	var (
		executors  = map[string]executor.AiExecutor{}
		collectors = map[string]collector.AiCollector{}
		inferences = map[string]inference.ICluster{}
	)

	for i := range clusters {
		cluster := clusters[i]
		key := cluster.Id

		// The parse error is discarded, so an Id with invalid digits yields a
		// numeric ID of 0.
		numericID, _ := strconv.ParseInt(key, 10, 64)

		switch cluster.Driver {
		case OCTOPUS:
			link := octopusHttp.NewOctopusHttp(numericID, cluster.Nickname, cluster.Server, cluster.Address, cluster.Username, cluster.Password)
			collectors[key], executors[key], inferences[key] = link, link, link

		case MODELARTS:
			rpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
			imgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
			link := storeLink.NewModelArtsLink(rpc, imgRpc, cluster.Name, numericID, cluster.Nickname)
			collectors[key], executors[key], inferences[key] = link, link, link

		case SHUGUANGAI:
			rpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
			link := storeLink.NewShuguangAi(rpc, cluster.Nickname, numericID)
			collectors[key], executors[key], inferences[key] = link, link, link

		case OPENI:
			link := storeLink.NewOpenI(cluster.Server, numericID, cluster.Username, cluster.Token, cluster.Nickname)
			collectors[key], executors[key], inferences[key] = link, link, link
		}
	}

	return executors, collectors, inferences
}
// UpdateClusterMaps registers clients for any of the given clusters that are
// not yet known under adapterId.
//
// A cluster is skipped when its ID is already present in at least one of the
// three per-adapter maps, or when its Name matches none of the handled
// platform identifiers. For every other cluster a client is built and stored
// under the cluster ID in all three maps.
//
// The per-adapter inner maps are created on demand. NewAiService does not
// create them for adapters that had no clusters at start-up, and writing
// through a missing (nil) inner map would panic. The outer maps are expected
// to be initialised already, as NewAiService does.
//
// NOTE(review): this method selects the platform by c.Name, builds the Octopus
// client through octopusclient/NewOctopusLink and has no OPENI case, whereas
// InitAiClusterMap selects by c.Driver, uses octopusHttp and handles OPENI.
// Confirm which behaviour is intended before aligning the two.
func (as *AiService) UpdateClusterMaps(conf *config.Config, adapterId string, clusters []types.ClusterInfo) {
	for _, c := range clusters {
		// Indexing a missing inner map is safe for reads and reports "absent".
		_, inExecutors := as.AiExecutorAdapterMap[adapterId][c.Id]
		_, inCollectors := as.AiCollectorAdapterMap[adapterId][c.Id]
		_, inInference := as.InferenceAdapterMap[adapterId][c.Id]
		if inExecutors || inCollectors || inInference {
			continue
		}

		// The parse error is deliberately kept ignored to preserve existing
		// behaviour: an Id with invalid digits yields a numeric ID of 0.
		id, _ := strconv.ParseInt(c.Id, 10, 64)

		var (
			exec  executor.AiExecutor
			col   collector.AiCollector
			infer inference.ICluster
		)
		switch c.Name {
		case OCTOPUS:
			octopusRpc := octopusclient.NewOctopus(zrpc.MustNewClient(conf.OctopusRpcConf))
			octopus := storeLink.NewOctopusLink(octopusRpc, c.Nickname, id)
			exec, col, infer = octopus, octopus, octopus
		case MODELARTS:
			modelArtsRpc := modelartsservice.NewModelArtsService(zrpc.MustNewClient(conf.ModelArtsRpcConf))
			modelArtsImgRpc := imagesservice.NewImagesService(zrpc.MustNewClient(conf.ModelArtsImgRpcConf))
			modelarts := storeLink.NewModelArtsLink(modelArtsRpc, modelArtsImgRpc, c.Name, id, c.Nickname)
			exec, col, infer = modelarts, modelarts, modelarts
		case SHUGUANGAI:
			aCRpc := hpcacclient.NewHpcAC(zrpc.MustNewClient(conf.ACRpcConf))
			sgai := storeLink.NewShuguangAi(aCRpc, c.Nickname, id)
			exec, col, infer = sgai, sgai, sgai
		default:
			// Unhandled platform: nothing to register, and no inner maps are
			// created for it.
			continue
		}

		// Create the per-adapter maps lazily so that an adapter first seen
		// here does not trigger "assignment to entry in nil map".
		if as.AiExecutorAdapterMap[adapterId] == nil {
			as.AiExecutorAdapterMap[adapterId] = make(map[string]executor.AiExecutor)
		}
		if as.AiCollectorAdapterMap[adapterId] == nil {
			as.AiCollectorAdapterMap[adapterId] = make(map[string]collector.AiCollector)
		}
		if as.InferenceAdapterMap[adapterId] == nil {
			as.InferenceAdapterMap[adapterId] = make(map[string]inference.ICluster)
		}

		as.AiExecutorAdapterMap[adapterId][c.Id] = exec
		as.AiCollectorAdapterMap[adapterId][c.Id] = col
		as.InferenceAdapterMap[adapterId][c.Id] = infer
	}
}
// HandleDuplicateTaskName returns a task name to use for a task of the given
// type.
//
// It asks storage whether name already exists for taskType. If it does not,
// name is returned unchanged. If it does, the result is name followed by "_"
// and the current time formatted with constants.Layout_Time_Suffix; the
// suffixed name is not checked against storage again. A storage error is
// returned unchanged together with an empty string.
func (as *AiService) HandleDuplicateTaskName(name string, taskType string) (string, error) {
	taken, err := as.Storage.DoesTaskNameExist(name, taskType)
	switch {
	case err != nil:
		return "", err
	case !taken:
		return name, nil
	}

	suffix := time.Now().Format(constants.Layout_Time_Suffix)
	return name + "_" + suffix, nil
}