updated imageinfer api

Former-commit-id: 56ea124571
This commit is contained in:
tzwang 2024-06-21 19:27:33 +08:00
parent 928ba4f4dd
commit 1bfd8e052a
22 changed files with 194 additions and 51 deletions

View File

@ -1,6 +1,7 @@
Name: pcm.core.api
Host: 0.0.0.0
Port: 8999
MaxBytes: 524288000
Timeout: 50000

View File

@ -90,7 +90,7 @@ func UpdateAiTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
var wg sync.WaitGroup
for _, aitask := range aiTaskList {
t := aitask
if t.Status == constants.Completed || t.Status == constants.Failed {
if t.Status == constants.Completed || t.Status == constants.Failed || t.JobId == "" {
continue
}
wg.Add(1)

View File

@ -7,22 +7,19 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
)
// ModelNamesByTypeHandler returns an http.HandlerFunc that parses the request
// into a types.ModelNamesReq and passes it to the ModelNamesByType logic
// created from the request context and svcCtx.
//
// NOTE(review): this listing contains both httpx.* and result.* response
// calls on the same paths. It looks like pre- and post-change lines are shown
// together — confirm that only one set is meant to write the response.
func ModelNamesByTypeHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
var req types.ModelNamesReq
// Reject the request early when it cannot be parsed into req.
if err := httpx.Parse(r, &req); err != nil {
httpx.ErrorCtx(r.Context(), w, err)
result.ParamErrorResult(r, w, err)
return
}
l := inference.NewModelNamesByTypeLogic(r.Context(), svcCtx)
resp, err := l.ModelNamesByType(&req)
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
// Hands both resp and err to result.HttpResult for the reply.
result.HttpResult(r, w, resp, err)
}
}

View File

@ -1,21 +1,16 @@
package inference
import (
"net/http"
"github.com/zeromicro/go-zero/rest/httpx"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/logic/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
"net/http"
)
// ModelTypesHandler returns an http.HandlerFunc that invokes the ModelTypes
// logic (built from the request context and svcCtx) and reports its result.
// The endpoint takes no request body; nothing is parsed from r.
//
// NOTE(review): this listing contains both the httpx.ErrorCtx/OkJsonCtx
// branch and result.HttpResult. It looks like pre- and post-change lines are
// shown together — confirm that only one of them is meant to write the
// response.
func ModelTypesHandler(svcCtx *svc.ServiceContext) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
l := inference.NewModelTypesLogic(r.Context(), svcCtx)
resp, err := l.ModelTypes()
if err != nil {
httpx.ErrorCtx(r.Context(), w, err)
} else {
httpx.OkJsonCtx(r.Context(), w, resp)
}
// Hands both resp and err to result.HttpResult for the reply.
result.HttpResult(r, w, resp, err)
}
}

View File

@ -94,7 +94,7 @@ func (l *CommitGeneralTaskLogic) CommitGeneralTask(req *types.GeneralTaskReq) er
Name: req.Name,
CommitTime: time.Now(),
YamlString: strings.Join(req.ReqBody, "\n---\n"),
AdapterTypeDict: 0,
AdapterTypeDict: "0",
SynergyStatus: synergyStatus,
Strategy: strategy,
}

View File

@ -86,7 +86,7 @@ func (l *CommitVmTaskLogic) CommitVmTask(req *types.CommitVmTaskReq) (resp *type
Name: req.Name,
CommitTime: time.Now(),
Description: "vm task",
AdapterTypeDict: 0,
AdapterTypeDict: "0",
SynergyStatus: synergyStatus,
Strategy: strategy,
}

View File

@ -263,7 +263,7 @@ func (l *PageListTaskLogic) updateAiTaskStatus(tasklist []*types.TaskModel, ch c
var wg sync.WaitGroup
for _, aitask := range aiTaskList {
t := aitask
if t.Status == constants.Completed || t.Status == constants.Failed {
if t.Status == constants.Completed || t.Status == constants.Failed || t.JobId == "" {
continue
}
wg.Add(1)

View File

@ -36,14 +36,14 @@ func (l *TaskDetailsLogic) TaskDetails(req *types.FId) (resp *types.TaskDetailsR
var cList []*types.ClusterInfo
var subList []*types.SubTaskInfo
switch task.AdapterTypeDict {
case 0:
case "0":
l.svcCtx.DbEngin.Table("task_cloud").Where("task_id", task.Id).Scan(&subList)
if len(subList) <= 0 {
l.svcCtx.DbEngin.Table("task_vm").Where("task_id", task.Id).Find(&subList)
}
case 1:
case "1":
l.svcCtx.DbEngin.Table("task_ai").Where("task_id", task.Id).Scan(&subList)
case 2:
case "2":
l.svcCtx.DbEngin.Table("task_hpc").Where("task_id", task.Id).Scan(&subList)
}
for _, sub := range subList {

View File

@ -122,7 +122,7 @@ func (l *TaskListLogic) TaskList(req *types.TaskListReq) (resp *types.TaskListRe
func (l *TaskListLogic) updateAitaskStatus(tasks []models.Task, ch chan<- struct{}) {
for _, task := range tasks {
if task.AdapterTypeDict != 1 {
if task.AdapterTypeDict != "1" {
continue
}
if task.Status == constants.Succeeded {

View File

@ -40,7 +40,7 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t
Strategy: 0,
SynergyStatus: 0,
CommitTime: time.Now(),
AdapterTypeDict: 2,
AdapterTypeDict: "2",
}
// 保存任务数据到数据库

View File

@ -10,10 +10,13 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"math/rand"
"mime/multipart"
"net/http"
"sort"
"strconv"
"sync"
"time"
)
@ -129,17 +132,50 @@ func infer(opt *option.InferOption, clusters []*strategy.AssignedCluster, ts []s
var wg sync.WaitGroup
var cluster_ch = make(chan struct {
urls []*collector.ImageInferUrl
clusterId string
clusterName string
imageNum int32
}, len(clusters))
var cs []struct {
urls []*collector.ImageInferUrl
clusterId string
clusterName string
imageNum int32
}
collectorMap := svcCtx.Scheduler.AiService.AiCollectorAdapterMap[opt.AdapterId]
//save task
var synergystatus int64
if len(clusters) > 1 {
synergystatus = 1
}
strategyCode, err := svcCtx.Scheduler.AiStorages.GetStrategyCode(opt.Strategy)
if err != nil {
return nil, err
}
adapterName, err := svcCtx.Scheduler.AiStorages.GetAdapterNameById(opt.AdapterId)
if err != nil {
return nil, err
}
id, err := svcCtx.Scheduler.AiStorages.SaveTask(opt.TaskName, strategyCode, synergystatus, "11")
if err != nil {
return nil, err
}
svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "create", "任务创建中")
//save taskai
for _, c := range clusters {
clusterName, _ := svcCtx.Scheduler.AiStorages.GetClusterNameById(c.ClusterId)
opt.Replica = c.Replicas
err := svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, adapterName, c.ClusterId, clusterName, "", constants.Saved, "")
if err != nil {
return nil, err
}
}
for _, cluster := range clusters {
wg.Add(1)
c := cluster
@ -153,10 +189,12 @@ func infer(opt *option.InferOption, clusters []*strategy.AssignedCluster, ts []s
s := struct {
urls []*collector.ImageInferUrl
clusterId string
clusterName string
imageNum int32
}{
urls: imageUrls,
clusterId: c.ClusterId,
clusterName: clusterName,
imageNum: c.Replicas,
}
@ -173,11 +211,42 @@ func infer(opt *option.InferOption, clusters []*strategy.AssignedCluster, ts []s
cs = append(cs, s)
}
var aiTaskList []*models.TaskAi
tx := svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", id).Scan(&aiTaskList)
if tx.Error != nil {
return nil, tx.Error
}
//change cluster status
if len(clusters) != len(cs) {
var acs []*strategy.AssignedCluster
for _, cluster := range clusters {
if contains(cs, cluster.ClusterId) {
continue
} else {
var ac *strategy.AssignedCluster
ac = cluster
acs = append(acs, ac)
}
}
// update failed cluster status
for _, ac := range acs {
for _, t := range aiTaskList {
if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
t.Status = constants.Failed
err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
if err != nil {
logx.Errorf(tx.Error.Error())
}
}
}
}
}
var result_ch = make(chan *types.ImageResult, len(ts))
var results []*types.ImageResult
wg.Add(len(ts))
var imageNumIdx int32 = 0
var imageNumIdxEnd int32 = 0
for _, c := range cs {
@ -191,10 +260,10 @@ func infer(opt *option.InferOption, clusters []*strategy.AssignedCluster, ts []s
new_images = new_images[imageNumIdx:imageNumIdxEnd]
imageNumIdx = imageNumIdx + c.imageNum
wg.Add(len(new_images))
go sendInferReq(new_images, c, &wg, result_ch)
}
wg.Wait()
close(result_ch)
for s := range result_ch {
@ -205,6 +274,18 @@ func infer(opt *option.InferOption, clusters []*strategy.AssignedCluster, ts []s
return results[p].ClusterName < results[q].ClusterName
})
// update succeeded cluster status
for _, c := range cs {
for _, t := range aiTaskList {
if c.clusterId == strconv.Itoa(int(t.ClusterId)) {
t.Status = constants.Completed
err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
if err != nil {
logx.Errorf(tx.Error.Error())
}
}
}
}
return results, nil
}
@ -213,6 +294,7 @@ func sendInferReq(images []struct {
file multipart.File
}, cluster struct {
urls []*collector.ImageInferUrl
clusterId string
clusterName string
imageNum int32
}, wg *sync.WaitGroup, ch chan<- *types.ImageResult) {
@ -222,6 +304,7 @@ func sendInferReq(images []struct {
file multipart.File
}, c struct {
urls []*collector.ImageInferUrl
clusterId string
clusterName string
imageNum int32
}) {
@ -288,3 +371,17 @@ func GetRestyRequest(timeoutSeconds int64) *resty.Request {
type Res struct {
Result string `json:"result"`
}
// contains reports whether any element of cs has a clusterId equal to e.
// It returns false for an empty or nil cs.
func contains(cs []struct {
	urls        []*collector.ImageInferUrl
	clusterId   string
	clusterName string
	imageNum    int32
}, e string) bool {
	for i := range cs {
		// Skip entries belonging to other clusters.
		if cs[i].clusterId != e {
			continue
		}
		return true
	}
	return false
}

View File

@ -2,6 +2,7 @@ package inference
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
@ -24,7 +25,12 @@ func NewModelNamesByTypeLogic(ctx context.Context, svcCtx *svc.ServiceContext) *
}
// ModelNamesByType looks up the model names for req.Type through
// storeLink.GetModelNamesByType and returns them in resp.ModelNames.
// When the lookup fails, the error is logged and returned with a nil response.
//
// NOTE(review): the scaffold "todo" comment and bare return below precede the
// lookup logic in this listing. They look like leftover pre-change lines —
// confirm they are not present in the real file, since a bare return here
// would exit before the lookup runs.
func (l *ModelNamesByTypeLogic) ModelNamesByType(req *types.ModelNamesReq) (resp *types.ModelNamesResp, err error) {
// todo: add your logic here and delete this line
return
resp = &types.ModelNamesResp{}
models, err := storeLink.GetModelNamesByType(req.Type)
if err != nil {
logx.Errorf("ModelNamesByType err: %v", err)
return nil, err
}
resp.ModelNames = models
return resp, nil
}

View File

@ -2,6 +2,7 @@ package inference
import (
"context"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
@ -24,7 +25,8 @@ func NewModelTypesLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ModelT
}
// ModelTypes returns the model types reported by storeLink.GetModelTypes,
// placed in resp.ModelTypes. The logic after the bare return always returns
// a nil error.
//
// NOTE(review): the scaffold "todo" comment and bare return below precede the
// new logic in this listing. They look like leftover pre-change lines —
// confirm they are not present in the real file, since a bare return here
// would exit before resp is populated.
func (l *ModelTypesLogic) ModelTypes() (resp *types.ModelTypesResp, err error) {
// todo: add your logic here and delete this line
return
resp = &types.ModelTypesResp{}
mTypes := storeLink.GetModelTypes()
resp.ModelTypes = mTypes
return resp, nil
}

View File

@ -71,7 +71,7 @@ func (l *ScheduleSubmitLogic) ScheduleSubmit(req *types.ScheduleReq) (resp *type
if err != nil {
return nil, err
}
id, err := l.svcCtx.Scheduler.AiStorages.SaveTask(req.AiOption.TaskName, strategyCode, synergystatus)
id, err := l.svcCtx.Scheduler.AiStorages.SaveTask(req.AiOption.TaskName, strategyCode, synergystatus, "10")
if err != nil {
return nil, err
}

View File

@ -94,7 +94,7 @@ func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, e
return resp, nil
}
func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int64) (int64, error) {
func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int64, aiType string) (int64, error) {
// 构建主任务结构体
taskModel := models.Task{
Status: constants.Saved,
@ -102,7 +102,8 @@ func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int6
Name: name,
SynergyStatus: synergyStatus,
Strategy: strategyCode,
AdapterTypeDict: 1,
AdapterTypeDict: "1",
TaskTypeDict: aiType,
CommitTime: time.Now(),
}
// 保存任务数据到数据库
@ -113,9 +114,22 @@ func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int6
return taskModel.Id, nil
}
func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, adapterName string, clusterId string, clusterName string, jobId string, status string, msg string) error {
func (s *AiStorage) SaveAiTask(taskId int64, opt option.Option, adapterName string, clusterId string, clusterName string, jobId string, status string, msg string) error {
var aiOpt *option.AiOption
switch (opt).(type) {
case *option.AiOption:
aiOpt = (opt).(*option.AiOption)
case *option.InferOption:
inferOpt := (opt).(*option.InferOption)
aiOpt = &option.AiOption{}
aiOpt.TaskName = inferOpt.TaskName
aiOpt.Replica = inferOpt.Replica
aiOpt.AdapterId = inferOpt.AdapterId
aiOpt.TaskType = inferOpt.ModelType
aiOpt.StrategyName = inferOpt.Strategy
}
// 构建主任务结构体
aId, err := strconv.ParseInt(option.AdapterId, 10, 64)
aId, err := strconv.ParseInt(aiOpt.AdapterId, 10, 64)
if err != nil {
return err
}
@ -130,14 +144,14 @@ func (s *AiStorage) SaveAiTask(taskId int64, option *option.AiOption, adapterNam
AdapterName: adapterName,
ClusterId: cId,
ClusterName: clusterName,
Name: option.TaskName,
Replica: int64(option.Replica),
Name: aiOpt.TaskName,
Replica: int64(aiOpt.Replica),
JobId: jobId,
TaskType: option.TaskType,
Strategy: option.StrategyName,
TaskType: aiOpt.TaskType,
Strategy: aiOpt.StrategyName,
Status: status,
Msg: msg,
Card: option.ComputeCard,
Card: aiOpt.ComputeCard,
CommitTime: time.Now(),
}
// 保存任务数据到数据库

View File

@ -222,7 +222,7 @@ func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster) (interfa
synergystatus = 1
}
strategyCode, err := as.AiStorages.GetStrategyCode(as.option.StrategyName)
taskId, err := as.AiStorages.SaveTask(as.option.TaskName, strategyCode, synergystatus)
taskId, err := as.AiStorages.SaveTask(as.option.TaskName, strategyCode, synergystatus, "10")
if err != nil {
return nil, errors.New("database add failed: " + err.Error())
}

View File

@ -16,3 +16,7 @@ type InferOption struct {
Cmd string `json:"cmd,optional"`
Replica int32 `json:"replicas,optional"`
}
// GetOptionType reports the option kind of an InferOption, which is always
// the AI_INFER constant.
func (o InferOption) GetOptionType() string {
	kind := AI_INFER
	return kind
}

View File

@ -1,10 +1,11 @@
package option
const (
AI = "ai"
CLOUD = "cloud"
HPC = "hpc"
VM = "vm"
AI_INFER = "ai_infer"
AI = "ai"
CLOUD = "cloud"
HPC = "hpc"
VM = "vm"
)
type Option interface {

View File

@ -385,10 +385,13 @@ func (m *ModelArtsLink) GetImageInferUrl(ctx context.Context, option *option.Inf
Type: option.ModelType,
Card: "npu",
}
urlResp, _ := m.modelArtsRpc.ImageReasoningUrl(ctx, urlReq)
urlResp, err := m.modelArtsRpc.ImageReasoningUrl(ctx, urlReq)
if err != nil {
return nil, err
}
imageUrl := &collector.ImageInferUrl{
Url: urlResp.Url,
Card: option.ComputeCard,
Card: "npu",
}
imageUrls = append(imageUrls, imageUrl)

View File

@ -739,7 +739,10 @@ func (s *ShuguangAi) GetImageInferUrl(ctx context.Context, option *option.InferO
Card: "dcu",
}
urlResp, _ := s.aCRpc.GetInferUrl(ctx, urlReq)
urlResp, err := s.aCRpc.GetInferUrl(ctx, urlReq)
if err != nil {
return nil, err
}
imageUrl := &collector.ImageInferUrl{
Url: urlResp.Url,
Card: option.ComputeCard,

View File

@ -76,6 +76,9 @@ var (
3: "制作完成",
4: "制作失败",
}
ModelTypeMap = map[string][]string{
"image_recognition": {"imagenet_resnet50"},
}
AITYPE = map[string]string{
"1": OCTOPUS,
"2": MODELARTS,
@ -128,6 +131,22 @@ func GetResourceTypes() []string {
return resourceTypes
}
// GetModelTypes returns every key of ModelTypeMap. The keys are collected by
// ranging over the map, so their order is not fixed between calls. The result
// is nil when ModelTypeMap has no entries.
func GetModelTypes() []string {
	var names []string
	for modelType := range ModelTypeMap {
		names = append(names, modelType)
	}
	return names
}
// GetModelNamesByType returns the model names stored in ModelTypeMap under
// the model type t.
//
// It returns a nil slice and an error when t is not a key of ModelTypeMap.
// The returned slice is the map's own value, not a copy, so callers should
// treat it as read-only.
func GetModelNamesByType(t string) ([]string, error) {
	// Fetch the value and the presence flag in one lookup instead of
	// indexing the map a second time for the return value.
	names, ok := ModelTypeMap[t]
	if !ok {
		return nil, errors.New("model type does not exist")
	}
	return names, nil
}
func GetDatasetsNames(ctx context.Context, collectorMap map[string]collector.AiCollector) ([]string, error) {
var wg sync.WaitGroup
var errCh = make(chan interface{}, len(collectorMap))

View File

@ -49,7 +49,8 @@ type (
Result string `db:"result"` // 作业结果
DeletedAt gorm.DeletedAt `gorm:"index"`
NsID string `db:"ns_id"`
AdapterTypeDict int `db:"adapter_type_dict"` //任务类型(对应字典表的值)
AdapterTypeDict string `db:"adapter_type_dict"` //任务类型(对应字典表的值)
TaskTypeDict string `db:"task_type_dict"`
}
)