forked from JointCloud/pcm-coordinator
fix statusSync
This commit is contained in:
parent
42e936f5fc
commit
c0b5cb6607
|
@ -7,6 +7,7 @@ import (
|
|||
"fmt"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/status"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
|
||||
"strconv"
|
||||
|
@ -78,6 +79,10 @@ func (l *CreateInferenceTaskLogic) CreateInferenceTask(req *types.CreateInferenc
|
|||
|
||||
err = l.createInferenceTask(taskId, adapterClusterMap, opt)
|
||||
if err != nil {
|
||||
if len(assignedClusters) != 0 {
|
||||
_ = status.ReportStatus(l.svcCtx, taskName, strconv.FormatInt(taskId, 10), assignedClusters[0].ClusterId, "", false, "")
|
||||
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
|
|
|
@ -261,13 +261,6 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
|
|||
return err
|
||||
}
|
||||
|
||||
//report msg
|
||||
report := &jcs.JobStatusReportReq{
|
||||
TaskName: "",
|
||||
TaskID: strconv.FormatInt(taskId, 10),
|
||||
Messages: make([]*jcs.ReportMessage, 0),
|
||||
}
|
||||
|
||||
var errmsg string
|
||||
for _, err := range errs {
|
||||
e := (err).(struct {
|
||||
|
@ -284,14 +277,23 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
|
|||
return errors.New("database add failed: " + err.Error())
|
||||
}
|
||||
|
||||
//add report msg
|
||||
jobMsg := &jcs.ReportMessage{
|
||||
//report msg
|
||||
report := &jcs.TrainReportMessage{
|
||||
Type: "Train",
|
||||
TaskName: "",
|
||||
TaskID: strconv.FormatInt(taskId, 10),
|
||||
Status: false,
|
||||
Message: msg,
|
||||
ClusterID: e.clusterId,
|
||||
Output: "",
|
||||
}
|
||||
report.Messages = append(report.Messages, jobMsg)
|
||||
|
||||
//report status
|
||||
_ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
|
||||
|
||||
logx.Errorf(errors.New(errmsg).Error())
|
||||
return errors.New(errmsg)
|
||||
|
||||
}
|
||||
for _, s := range results {
|
||||
as.option.ComputeCard = s.Card //execute card
|
||||
|
@ -313,17 +315,18 @@ func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.Ass
|
|||
}
|
||||
}
|
||||
//add report msg
|
||||
jobMsg := &jcs.ReportMessage{
|
||||
report := &jcs.TrainReportMessage{
|
||||
Type: "Train",
|
||||
TaskName: "",
|
||||
TaskID: strconv.FormatInt(taskId, 10),
|
||||
Status: false,
|
||||
Message: s.Msg,
|
||||
ClusterID: s.ClusterId,
|
||||
Output: "",
|
||||
}
|
||||
report.Messages = append(report.Messages, jobMsg)
|
||||
}
|
||||
|
||||
//report status
|
||||
_ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
|
||||
}
|
||||
|
||||
logx.Errorf(errors.New(errmsg).Error())
|
||||
return errors.New(errmsg)
|
||||
|
|
|
@ -12,16 +12,28 @@ import (
|
|||
type JobStatusReportReq struct {
|
||||
TaskName string `json:"taskName"`
|
||||
TaskID string `json:"taskID"`
|
||||
Messages []*ReportMessage `json:"messages"`
|
||||
Messages []interface{} `json:"messages"`
|
||||
}
|
||||
type ReportMessage struct {
|
||||
// TrainReportMessage is the flat status report sent for a training task.
// It is encoded to JSON using the keys given in the struct tags.
// The constructors visible in this change set Type to "Train".
type TrainReportMessage struct {
|
||||
Type string `json:"type"` // report category; "Train" at the visible call sites
|
||||
TaskName string `json:"taskName"`
|
||||
TaskID string `json:"taskID"` // task id formatted as a base-10 string by the callers
|
||||
Status bool `json:"status"` // callers pass false when reporting a failure
|
||||
Message string `json:"message"`
|
||||
ClusterID string `json:"clusterID"`
|
||||
Output string `json:"output"` // NOTE(review): appears to be the task output location — confirm with callers
|
||||
}
|
||||
// InferReportMessage is the flat status report sent for an inference task.
// It is encoded to JSON using the keys given in the struct tags.
// ReportStatus builds it with Type set to "Inference".
type InferReportMessage struct {
|
||||
Type string `json:"type"` // report category; "Inference" when built by ReportStatus
|
||||
TaskName string `json:"taskName"`
|
||||
TaskID string `json:"taskID"`
|
||||
Status bool `json:"status"` // callers pass true for a running instance, false for a failure
|
||||
Message string `json:"message"`
|
||||
ClusterID string `json:"clusterID"`
|
||||
Url string `json:"url"` // inference URL of the instance; callers pass "" when reporting a failure
|
||||
}
|
||||
|
||||
func StatusReport(url string, report *JobStatusReportReq) error {
|
||||
func StatusReport(url string, report interface{}) error {
|
||||
resp := struct {
|
||||
Code string `json:"code"`
|
||||
Msg string `json:"message"`
|
||||
|
@ -49,7 +61,7 @@ func StatusReport(url string, report *JobStatusReportReq) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func TempSaveReportToTask(store *database.AiStorage, task *types.TaskModel, report *JobStatusReportReq) error {
|
||||
func TempSaveReportToTask(store *database.AiStorage, task *types.TaskModel, report interface{}) error {
|
||||
jsonBytes, err := json.Marshal(report)
|
||||
|
||||
task.Result = string(jsonBytes)
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package status
|
||||
|
||||
import (
|
||||
"github.com/zeromicro/go-zero/core/logx"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||
|
@ -97,6 +98,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
|
|||
}
|
||||
return
|
||||
}
|
||||
url := ins.InferUrl
|
||||
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
|
||||
if err != nil {
|
||||
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
||||
}
|
||||
instance.Status = constants.Running
|
||||
case "stopped":
|
||||
if instance.Status == constants.Stopped {
|
||||
|
@ -120,6 +126,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
|
|||
}
|
||||
return
|
||||
}
|
||||
url := ins.InferUrl
|
||||
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
|
||||
if err != nil {
|
||||
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
||||
}
|
||||
instance.Status = constants.Running
|
||||
case "stopped":
|
||||
if instance.Status == constants.Stopped {
|
||||
|
@ -130,6 +141,19 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
|
|||
return
|
||||
}
|
||||
instance.Status = constants.Stopped
|
||||
case "failed":
|
||||
if instance.Status == constants.Failed {
|
||||
if ch != nil {
|
||||
<-ch
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
|
||||
if err != nil {
|
||||
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
||||
}
|
||||
instance.Status = constants.Failed
|
||||
default:
|
||||
instance.Status = ins.Status
|
||||
}
|
||||
|
@ -166,6 +190,11 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
|
|||
}
|
||||
return
|
||||
}
|
||||
url := ins.InferUrl
|
||||
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
|
||||
if err != nil {
|
||||
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
||||
}
|
||||
instance.Status = constants.Running
|
||||
case "STOPPED":
|
||||
if instance.Status == constants.Stopped {
|
||||
|
@ -184,6 +213,10 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
|
|||
}
|
||||
return
|
||||
}
|
||||
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
|
||||
if err != nil {
|
||||
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
||||
}
|
||||
instance.Status = constants.Failed
|
||||
case "FAILED":
|
||||
if instance.Status == constants.Failed {
|
||||
|
@ -193,6 +226,10 @@ func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInfe
|
|||
}
|
||||
return
|
||||
}
|
||||
err := ReportStatus(svc, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
|
||||
if err != nil {
|
||||
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
||||
}
|
||||
instance.Status = constants.Failed
|
||||
default:
|
||||
instance.Status = ins.Status
|
||||
|
|
|
@ -14,19 +14,16 @@ import (
|
|||
)
|
||||
|
||||
func reportHpcStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, hpcTask *models.TaskHpc, status bool, message string) error {
|
||||
report := &jcs.JobStatusReportReq{
|
||||
report := &jcs.TrainReportMessage{
|
||||
Type: "Train",
|
||||
TaskName: task.Name,
|
||||
TaskID: strconv.FormatInt(task.Id, 10),
|
||||
Messages: make([]*jcs.ReportMessage, 0),
|
||||
}
|
||||
|
||||
jobMsg := &jcs.ReportMessage{
|
||||
Status: status,
|
||||
Message: message,
|
||||
ClusterID: strconv.FormatInt(hpcTask.ClusterId, 10),
|
||||
Output: hpcTask.WorkDir,
|
||||
}
|
||||
report.Messages = append(report.Messages, jobMsg)
|
||||
|
||||
marshal, _ := jsoniter.MarshalToString(report)
|
||||
log.Debug().Msgf("通知中间件任务状态参数: [%v]", marshal)
|
||||
err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
||||
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
||||
|
@ -191,12 +192,12 @@ func (s *TaskStatus) updateAiTask(aiTaskList []*models.TaskAi) {
|
|||
}
|
||||
|
||||
func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.TaskAi) error {
|
||||
report := &jcs.JobStatusReportReq{
|
||||
report := &jcs.TrainReportMessage{
|
||||
Type: "Train",
|
||||
TaskName: task.Name,
|
||||
TaskID: strconv.FormatInt(task.Id, 10),
|
||||
Messages: make([]*jcs.ReportMessage, 0),
|
||||
}
|
||||
//add report msg
|
||||
|
||||
var output string
|
||||
switch aiTask.ClusterName {
|
||||
case "openI":
|
||||
|
@ -205,13 +206,10 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.
|
|||
output = aiTask.Output
|
||||
}
|
||||
|
||||
jobMsg := &jcs.ReportMessage{
|
||||
Status: true,
|
||||
Message: "",
|
||||
ClusterID: strconv.FormatInt(aiTask.ClusterId, 10),
|
||||
Output: output,
|
||||
}
|
||||
report.Messages = append(report.Messages, jobMsg)
|
||||
report.Status = true
|
||||
report.Message = ""
|
||||
report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
|
||||
report.Output = output
|
||||
|
||||
err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report)
|
||||
if err != nil {
|
||||
|
@ -224,3 +222,21 @@ func (s *TaskStatus) reportStatusMessages(task *types.TaskModel, aiTask *models.
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func ReportStatus(svc *svc.ServiceContext, taskName string, taskId string, clusterId string, url string, status bool, msg string) error {
|
||||
report := &jcs.InferReportMessage{
|
||||
Type: "Inference",
|
||||
TaskName: taskName,
|
||||
TaskID: taskId,
|
||||
Status: status,
|
||||
Message: msg,
|
||||
ClusterID: clusterId,
|
||||
Url: url,
|
||||
}
|
||||
|
||||
err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -166,12 +166,12 @@ func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
|
|||
}
|
||||
|
||||
func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error {
|
||||
report := &jcs.JobStatusReportReq{
|
||||
report := &jcs.TrainReportMessage{
|
||||
Type: "Train",
|
||||
TaskName: task.Name,
|
||||
TaskID: strconv.FormatInt(task.Id, 10),
|
||||
Messages: make([]*jcs.ReportMessage, 0),
|
||||
}
|
||||
//add report msg
|
||||
|
||||
var output string
|
||||
switch aiTask.ClusterName {
|
||||
case "openI":
|
||||
|
@ -180,17 +180,17 @@ func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask
|
|||
output = aiTask.Output
|
||||
}
|
||||
|
||||
jobMsg := &jcs.ReportMessage{
|
||||
Status: true,
|
||||
Message: "",
|
||||
ClusterID: strconv.FormatInt(aiTask.ClusterId, 10),
|
||||
Output: output,
|
||||
report.Status = true
|
||||
report.Message = ""
|
||||
report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
|
||||
report.Output = output
|
||||
|
||||
err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
report.Messages = append(report.Messages, jobMsg)
|
||||
|
||||
_ = jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
|
||||
|
||||
err := jcs.TempSaveReportToTask(svc.Scheduler.AiStorages, task, report)
|
||||
err = jcs.TempSaveReportToTask(svc.Scheduler.AiStorages, task, report)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue