fix statusSync

This commit is contained in:
tzwang 2025-05-14 10:49:24 +08:00
parent 42e936f5fc
commit c0b5cb6607
7 changed files with 121 additions and 51 deletions

View File

@ -7,6 +7,7 @@ import (
"fmt" "fmt"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/status"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
"strconv" "strconv"
@ -78,6 +79,10 @@ func (l *CreateInferenceTaskLogic) CreateInferenceTask(req *types.CreateInferenc
err = l.createInferenceTask(taskId, adapterClusterMap, opt) err = l.createInferenceTask(taskId, adapterClusterMap, opt)
if err != nil { if err != nil {
if len(assignedClusters) != 0 {
_ = status.ReportStatus(l.svcCtx, taskName, strconv.FormatInt(taskId, 10), assignedClusters[0].ClusterId, "", false, "")
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
}
return nil, err return nil, err
} }

View File

@ -261,13 +261,6 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
return err return err
} }
//report msg
report := &jcs.JobStatusReportReq{
TaskName: "",
TaskID: strconv.FormatInt(taskId, 10),
Messages: make([]*jcs.ReportMessage, 0),
}
var errmsg string var errmsg string
for _, err := range errs { for _, err := range errs {
e := (err).(struct { e := (err).(struct {
@ -284,14 +277,23 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
return errors.New("database add failed: " + err.Error()) return errors.New("database add failed: " + err.Error())
} }
//add report msg //report msg
jobMsg := &jcs.ReportMessage{ report := &jcs.TrainReportMessage{
Type: "Train",
TaskName: "",
TaskID: strconv.FormatInt(taskId, 10),
Status: false, Status: false,
Message: msg, Message: msg,
ClusterID: e.clusterId, ClusterID: e.clusterId,
Output: "", Output: "",
} }
report.Messages = append(report.Messages, jobMsg)
//report status
_ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
logx.Errorf(errors.New(errmsg).Error())
return errors.New(errmsg)
} }
for _, s := range results { for _, s := range results {
as.option.ComputeCard = s.Card //execute card as.option.ComputeCard = s.Card //execute card
@ -313,18 +315,19 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
} }
} }
//add report msg //add report msg
jobMsg := &jcs.ReportMessage{ report := &jcs.TrainReportMessage{
Type: "Train",
TaskName: "",
TaskID: strconv.FormatInt(taskId, 10),
Status: false, Status: false,
Message: s.Msg, Message: s.Msg,
ClusterID: s.ClusterId, ClusterID: s.ClusterId,
Output: "", Output: "",
} }
report.Messages = append(report.Messages, jobMsg) //report status
_ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
} }
//report status
_ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
logx.Errorf(errors.New(errmsg).Error()) logx.Errorf(errors.New(errmsg).Error())
return errors.New(errmsg) return errors.New(errmsg)
} }

View File

@ -10,18 +10,30 @@ import (
) )
type JobStatusReportReq struct { type JobStatusReportReq struct {
TaskName string `json:"taskName"` TaskName string `json:"taskName"`
TaskID string `json:"taskID"` TaskID string `json:"taskID"`
Messages []*ReportMessage `json:"messages"` Messages []interface{} `json:"messages"`
} }
type ReportMessage struct { type TrainReportMessage struct {
Type string `json:"type"`
TaskName string `json:"taskName"`
TaskID string `json:"taskID"`
Status bool `json:"status"` Status bool `json:"status"`
Message string `json:"message"` Message string `json:"message"`
ClusterID string `json:"clusterID"` ClusterID string `json:"clusterID"`
Output string `json:"output"` Output string `json:"output"`
} }
// InferReportMessage is the JSON payload describing the status of one
// inference task on one cluster. It shares its leading fields with
// TrainReportMessage but carries Url where that type carries Output.
type InferReportMessage struct {
// Type is the message discriminator, serialized as "type".
Type string `json:"type"`
// TaskName is the task's name, serialized as "taskName".
TaskName string `json:"taskName"`
// TaskID is the task identifier as a string, serialized as "taskID".
TaskID string `json:"taskID"`
// Status is the reported outcome flag, serialized as "status".
// NOTE(review): presumably true means success/running and false means failure — confirm with the receiver.
Status bool `json:"status"`
// Message is free-form detail text, serialized as "message".
Message string `json:"message"`
// ClusterID is the cluster identifier as a string, serialized as "clusterID".
ClusterID string `json:"clusterID"`
// Url is serialized as "url".
// NOTE(review): presumably the inference endpoint of the deployed instance, empty when unavailable — confirm.
Url string `json:"url"`
}
func StatusReport(url string, report *JobStatusReportReq) error { func StatusReport(url string, report interface{}) error {
resp := struct { resp := struct {
Code string `json:"code"` Code string `json:"code"`
Msg string `json:"message"` Msg string `json:"message"`
@ -49,7 +61,7 @@ func StatusReport(url string, report *JobStatusReportReq) error {
return nil return nil
} }
func TempSaveReportToTask(store *database.AiStorage, task *types.TaskModel, report *JobStatusReportReq) error { func TempSaveReportToTask(store *database.AiStorage, task *types.TaskModel, report interface{}) error {
jsonBytes, err := json.Marshal(report) jsonBytes, err := json.Marshal(report)
task.Result = string(jsonBytes) task.Result = string(jsonBytes)

View File

@ -1,6 +1,7 @@
package status package status
import ( import (
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
@ -97,6 +98,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
} }
return return
} }
url := ins.InferUrl
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
if err != nil {
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
}
instance.Status = constants.Running instance.Status = constants.Running
case "stopped": case "stopped":
if instance.Status == constants.Stopped { if instance.Status == constants.Stopped {
@ -120,6 +126,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
} }
return return
} }
url := ins.InferUrl
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
if err != nil {
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
}
instance.Status = constants.Running instance.Status = constants.Running
case "stopped": case "stopped":
if instance.Status == constants.Stopped { if instance.Status == constants.Stopped {
@ -130,6 +141,19 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
return return
} }
instance.Status = constants.Stopped instance.Status = constants.Stopped
case "failed":
if instance.Status == constants.Failed {
if ch != nil {
<-ch
return
}
return
}
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
if err != nil {
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
}
instance.Status = constants.Failed
default: default:
instance.Status = ins.Status instance.Status = ins.Status
} }
@ -166,6 +190,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
} }
return return
} }
url := ins.InferUrl
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
if err != nil {
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
}
instance.Status = constants.Running instance.Status = constants.Running
case "STOPPED": case "STOPPED":
if instance.Status == constants.Stopped { if instance.Status == constants.Stopped {
@ -184,6 +213,10 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
} }
return return
} }
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
if err != nil {
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
}
instance.Status = constants.Failed instance.Status = constants.Failed
case "FAILED": case "FAILED":
if instance.Status == constants.Failed { if instance.Status == constants.Failed {
@ -193,6 +226,10 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
} }
return return
} }
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
if err != nil {
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
}
instance.Status = constants.Failed instance.Status = constants.Failed
default: default:
instance.Status = ins.Status instance.Status = ins.Status

View File

@ -14,19 +14,16 @@ import (
) )
func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error { func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error {
report := &jcs.JobStatusReportReq{ report := &jcs.TrainReportMessage{
TaskName: task.Name, Type: "Train",
TaskID: strconv.FormatInt(task.Id, 10), TaskName: task.Name,
Messages: make([]*jcs.ReportMessage, 0), TaskID: strconv.FormatInt(task.Id, 10),
}
jobMsg := &jcs.ReportMessage{
Status: status, Status: status,
Message: message, Message: message,
ClusterID: strconv.FormatInt(hpcTask.ClusterId, 10), ClusterID: strconv.FormatInt(hpcTask.ClusterId, 10),
Output: hpcTask.WorkDir, Output: hpcTask.WorkDir,
} }
report.Messages = append(report.Messages, jobMsg)
marshal, _ := jsoniter.MarshalToString(report) marshal, _ := jsoniter.MarshalToString(report)
log.Debug().Msgf("通知中间件任务状态参数: [%v]", marshal) log.Debug().Msgf("通知中间件任务状态参数: [%v]", marshal)
err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)

View File

@ -8,6 +8,7 @@ import (
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
@ -191,12 +192,12 @@ func (s *TaskStatus) updateAiTask(aiTaskList []*models.TaskAi) {
} }
func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.TaskAi) error { func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.TaskAi) error {
report := &jcs.JobStatusReportReq{ report := &jcs.TrainReportMessage{
Type: "Train",
TaskName: task.Name, TaskName: task.Name,
TaskID: strconv.FormatInt(task.Id, 10), TaskID: strconv.FormatInt(task.Id, 10),
Messages: make([]*jcs.ReportMessage, 0),
} }
//add report msg
var output string var output string
switch aiTask.ClusterName { switch aiTask.ClusterName {
case "openI": case "openI":
@ -205,13 +206,10 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.
output = aiTask.Output output = aiTask.Output
} }
jobMsg := &jcs.ReportMessage{ report.Status = true
Status: true, report.Message = ""
Message: "", report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
ClusterID: strconv.FormatInt(aiTask.ClusterId, 10), report.Output = output
Output: output,
}
report.Messages = append(report.Messages, jobMsg)
err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report) err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report)
if err != nil { if err != nil {
@ -224,3 +222,21 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.
} }
return nil return nil
} }
// ReportStatus builds an inference status message (Type "Inference") from the
// given task name, task ID, cluster ID, URL, status flag and message text, and
// posts it through jcs.StatusReport to the job-status report URL configured at
// svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl.
//
// It returns whatever error jcs.StatusReport returns, or nil on success.
func ReportStatus(svc *svc.ServiceContext, taskName string, taskId string, clusterId string, url string, status bool, msg string) error {
	inferMsg := jcs.InferReportMessage{
		Type:      "Inference",
		TaskName:  taskName,
		TaskID:    taskId,
		Status:    status,
		Message:   msg,
		ClusterID: clusterId,
		Url:       url,
	}

	// The report endpoint comes from the middleware section of the scheduler config.
	endpoint := svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl
	return jcs.StatusReport(endpoint, &inferMsg)
}

View File

@ -166,12 +166,12 @@ func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
} }
func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error { func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error {
report := &jcs.JobStatusReportReq{ report := &jcs.TrainReportMessage{
Type: "Train",
TaskName: task.Name, TaskName: task.Name,
TaskID: strconv.FormatInt(task.Id, 10), TaskID: strconv.FormatInt(task.Id, 10),
Messages: make([]*jcs.ReportMessage, 0),
} }
//add report msg
var output string var output string
switch aiTask.ClusterName { switch aiTask.ClusterName {
case "openI": case "openI":
@ -180,17 +180,17 @@ func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask
output = aiTask.Output output = aiTask.Output
} }
jobMsg := &jcs.ReportMessage{ report.Status = true
Status: true, report.Message = ""
Message: "", report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
ClusterID: strconv.FormatInt(aiTask.ClusterId, 10), report.Output = output
Output: output,
err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
if err != nil {
return err
} }
report.Messages = append(report.Messages, jobMsg)
_ = jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report) err = jcs.TempSaveReportToTask(svc.Scheduler.AiStorages, task, report)
err := jcs.TempSaveReportToTask(svc.Scheduler.AiStorages, task, report)
if err != nil { if err != nil {
return err return err
} }