forked from JointCloud/pcm-coordinator
449 lines
12 KiB
Go
449 lines
12 KiB
Go
package status
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"github.com/zeromicro/go-zero/core/logx"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
|
"google.golang.org/grpc/codes"
|
|
"google.golang.org/grpc/status"
|
|
"net/http"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
func UpdateTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
|
|
svc.Scheduler.AiService.TaskSyncLock.Lock()
|
|
defer svc.Scheduler.AiService.TaskSyncLock.Unlock()
|
|
|
|
list := make([]*types.TaskModel, len(tasklist))
|
|
copy(list, tasklist)
|
|
for i := len(list) - 1; i >= 0; i-- {
|
|
if list[i].AdapterTypeDict != "1" || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed || list[i].Status == constants.Cancelled {
|
|
list = append(list[:i], list[i+1:]...)
|
|
}
|
|
}
|
|
|
|
if len(list) == 0 {
|
|
return
|
|
}
|
|
|
|
task := list[0]
|
|
for i := range list {
|
|
earliest, _ := time.Parse(time.RFC3339, task.UpdatedTime)
|
|
latest, _ := time.Parse(time.RFC3339, list[i].UpdatedTime)
|
|
if latest.Before(earliest) {
|
|
task = list[i]
|
|
}
|
|
}
|
|
|
|
// Update Infer Task Status
|
|
//if task.TaskTypeDict == "11" || task.TaskTypeDict == "12" {
|
|
// updateInferTaskStatus(svc, *task)
|
|
// return
|
|
//}
|
|
|
|
aiTask, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
|
|
if err != nil {
|
|
logx.Errorf(err.Error())
|
|
return
|
|
}
|
|
|
|
if len(aiTask) == 0 {
|
|
err := svc.Scheduler.AiStorages.UpdateTask(task)
|
|
if err != nil {
|
|
return
|
|
}
|
|
return
|
|
}
|
|
|
|
logx.Errorf("############ Report Status Message Before switch %s", task.Status)
|
|
if len(aiTask) == 1 {
|
|
logx.Errorf("############ Report Status Message Switch %s", aiTask[0].Status)
|
|
switch aiTask[0].Status {
|
|
|
|
case constants.Completed:
|
|
task.Status = constants.Succeeded
|
|
logx.Errorf("############ Report Status Message Before Sending %s", task.Status)
|
|
|
|
_ = reportStatusMessages(svc, task, aiTask[0])
|
|
//case constants.Running:
|
|
// task.Status = constants.Succeeded
|
|
// logx.Errorf("############ Report Status Message Before Sending %s", task.Status)
|
|
//
|
|
// _ = reportStatusMessages(svc, task, aiTask[0])
|
|
case constants.Failed:
|
|
task.Status = constants.Failed
|
|
logx.Errorf("############ Report Status Message Before Sending %s", task.Status)
|
|
|
|
_ = reportStatusMessages(svc, task, aiTask[0])
|
|
default:
|
|
task.Status = aiTask[0].Status
|
|
}
|
|
|
|
task.StartTime = aiTask[0].StartTime
|
|
task.EndTime = aiTask[0].EndTime
|
|
err := svc.Scheduler.AiStorages.UpdateTask(task)
|
|
if err != nil {
|
|
return
|
|
}
|
|
return
|
|
}
|
|
logx.Errorf("############ Report Status Message After switch %s", task.Status)
|
|
|
|
for i := len(aiTask) - 1; i >= 0; i-- {
|
|
if aiTask[i].StartTime == "" {
|
|
task.Status = aiTask[i].Status
|
|
aiTask = append(aiTask[:i], aiTask[i+1:]...)
|
|
}
|
|
}
|
|
|
|
if len(aiTask) == 0 {
|
|
err := svc.Scheduler.AiStorages.UpdateTask(task)
|
|
if err != nil {
|
|
return
|
|
}
|
|
return
|
|
}
|
|
|
|
//start, _ := time.ParseInLocation(constants.Layout, aiTask[0].StartTime, time.Local)
|
|
//end, _ := time.ParseInLocation(constants.Layout, aiTask[0].EndTime, time.Local)
|
|
//
|
|
//var status string
|
|
//var count int
|
|
//for _, a := range aiTask {
|
|
// s, _ := time.ParseInLocation(constants.Layout, a.StartTime, time.Local)
|
|
// e, _ := time.ParseInLocation(constants.Layout, a.EndTime, time.Local)
|
|
//
|
|
// if s.Before(start) {
|
|
// start = s
|
|
// }
|
|
//
|
|
// if e.After(end) {
|
|
// end = e
|
|
// }
|
|
//
|
|
// if a.Status == constants.Failed {
|
|
// status = a.Status
|
|
// break
|
|
// }
|
|
//
|
|
// if a.Status == constants.Pending {
|
|
// status = a.Status
|
|
// continue
|
|
// }
|
|
//
|
|
// if a.Status == constants.Running {
|
|
// status = a.Status
|
|
// continue
|
|
// }
|
|
//
|
|
// if a.Status == constants.Completed {
|
|
// count++
|
|
// continue
|
|
// }
|
|
//}
|
|
|
|
//if count == len(aiTask) {
|
|
// status = constants.Succeeded
|
|
//}
|
|
|
|
//if status != "" {
|
|
// task.Status = status
|
|
// task.StartTime = start.Format(constants.Layout)
|
|
// task.EndTime = end.Format(constants.Layout)
|
|
//}
|
|
//
|
|
//err = svc.Scheduler.AiStorages.UpdateTask(task)
|
|
//if err != nil {
|
|
// return
|
|
//}
|
|
}
|
|
|
|
func reportStatusMessages(svc *svc.ServiceContext, task *types.TaskModel, aiTask *models.TaskAi) error {
|
|
report := &jcs.TrainReportMessage{
|
|
Type: "Train",
|
|
TaskName: task.Name,
|
|
TaskID: strconv.FormatInt(task.Id, 10),
|
|
}
|
|
|
|
var output string
|
|
switch aiTask.ClusterName {
|
|
case "openI":
|
|
output = aiTask.JobId
|
|
case "鹏城云脑II-modelarts":
|
|
output = aiTask.Output
|
|
}
|
|
|
|
report.Status = true
|
|
report.Message = ""
|
|
report.ClusterID = strconv.FormatInt(aiTask.ClusterId, 10)
|
|
report.Output = output
|
|
|
|
err := jcs.StatusReport(svc.Scheduler.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
err = jcs.TempSaveReportToTask(svc.Scheduler.AiStorages, task, report)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func updateInferTaskStatus(svc *svc.ServiceContext, task types.TaskModel) {
|
|
svc.Scheduler.AiService.TaskSyncLock.Lock()
|
|
defer svc.Scheduler.AiService.TaskSyncLock.Unlock()
|
|
aiTask, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
|
|
if err != nil {
|
|
logx.Errorf(err.Error())
|
|
return
|
|
}
|
|
|
|
if len(aiTask) == 0 {
|
|
task.Status = constants.Failed
|
|
err = svc.Scheduler.AiStorages.UpdateTask(&task)
|
|
if err != nil {
|
|
return
|
|
}
|
|
return
|
|
}
|
|
|
|
if len(aiTask) == 1 {
|
|
if aiTask[0].Status == constants.Completed {
|
|
task.StartTime = aiTask[0].StartTime
|
|
task.EndTime = aiTask[0].EndTime
|
|
task.Status = constants.Succeeded
|
|
} else {
|
|
task.StartTime = aiTask[0].StartTime
|
|
task.Status = aiTask[0].Status
|
|
}
|
|
|
|
err = svc.Scheduler.AiStorages.UpdateTask(&task)
|
|
if err != nil {
|
|
return
|
|
}
|
|
return
|
|
}
|
|
|
|
//for i := len(aiTask) - 1; i >= 0; i-- {
|
|
// if aiTask[i].StartTime == "" {
|
|
// task.Status = aiTask[i].Status
|
|
// aiTask = append(aiTask[:i], aiTask[i+1:]...)
|
|
// }
|
|
//}
|
|
//
|
|
//if len(aiTask) == 0 {
|
|
// task.UpdatedTime = time.Now().Format(constants.Layout)
|
|
// tx = svc.DbEngin.Table("task").Model(task).Updates(task)
|
|
// if tx.Error != nil {
|
|
// logx.Errorf(tx.Error.Error())
|
|
// return
|
|
// }
|
|
// return
|
|
//}
|
|
|
|
if aiTask[0].StartTime == "" {
|
|
return
|
|
}
|
|
|
|
start, _ := time.ParseInLocation(time.RFC3339, aiTask[0].StartTime, time.Local)
|
|
end, _ := time.ParseInLocation(time.RFC3339, aiTask[0].EndTime, time.Local)
|
|
var status string
|
|
var count int
|
|
for _, a := range aiTask {
|
|
if a.Status == constants.Failed {
|
|
status = a.Status
|
|
break
|
|
}
|
|
|
|
if a.Status == constants.Pending {
|
|
status = a.Status
|
|
continue
|
|
}
|
|
|
|
if a.Status == constants.Running {
|
|
status = a.Status
|
|
continue
|
|
}
|
|
|
|
if a.Status == constants.Completed {
|
|
count++
|
|
continue
|
|
}
|
|
}
|
|
|
|
if count == len(aiTask) {
|
|
status = constants.Succeeded
|
|
}
|
|
|
|
if status == constants.Succeeded {
|
|
task.Status = status
|
|
task.StartTime = start.Format(time.RFC3339)
|
|
task.EndTime = end.Format(time.RFC3339)
|
|
} else {
|
|
task.Status = status
|
|
task.StartTime = start.Format(time.RFC3339)
|
|
}
|
|
|
|
err = svc.Scheduler.AiStorages.UpdateTask(&task)
|
|
if err != nil {
|
|
return
|
|
}
|
|
}
|
|
|
|
func UpdateAiTask(svc *svc.ServiceContext, aiTaskList ...*models.TaskAi) {
|
|
var wg sync.WaitGroup
|
|
for _, aitask := range aiTaskList {
|
|
t := aitask
|
|
if t.Status == constants.Completed || t.Status == constants.Failed || t.JobId == "" || t.Status == constants.Cancelled {
|
|
continue
|
|
}
|
|
wg.Add(1)
|
|
go func() {
|
|
h := http.Request{}
|
|
trainingTask, err := svc.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(t.AdapterId, 10)][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(h.Context(), t.JobId)
|
|
if err != nil {
|
|
if status.Code(err) == codes.DeadlineExceeded {
|
|
msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
|
|
logx.Errorf(errors.New(msg).Error())
|
|
wg.Done()
|
|
return
|
|
}
|
|
|
|
msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
|
|
logx.Errorf(errors.New(msg).Error())
|
|
wg.Done()
|
|
return
|
|
}
|
|
if trainingTask == nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
switch trainingTask.Status {
|
|
case constants.Running:
|
|
if t.Status != trainingTask.Status {
|
|
svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "running", "任务运行中")
|
|
t.Status = trainingTask.Status
|
|
}
|
|
case constants.Failed:
|
|
if t.Status != trainingTask.Status {
|
|
svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "failed", "任务失败")
|
|
t.Status = trainingTask.Status
|
|
}
|
|
case constants.Completed:
|
|
if t.Status != trainingTask.Status {
|
|
svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "completed", "任务完成")
|
|
t.Status = trainingTask.Status
|
|
}
|
|
default:
|
|
if t.Status != trainingTask.Status {
|
|
svc.Scheduler.AiStorages.AddNoticeInfo(strconv.FormatInt(t.AdapterId, 10), t.AdapterName, strconv.FormatInt(t.ClusterId, 10), t.ClusterName, t.Name, "pending", "任务pending")
|
|
t.Status = trainingTask.Status
|
|
}
|
|
}
|
|
t.StartTime = trainingTask.Start
|
|
t.EndTime = trainingTask.End
|
|
err = svc.Scheduler.AiStorages.UpdateAiTask(t)
|
|
if err != nil {
|
|
msg := fmt.Sprintf("###UpdateAiTaskStatus###, AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
|
|
logx.Errorf(errors.New(msg).Error())
|
|
wg.Done()
|
|
return
|
|
}
|
|
wg.Done()
|
|
}()
|
|
}
|
|
wg.Wait()
|
|
}
|
|
|
|
func UpdateAiTaskStatus(svc *svc.ServiceContext, tasklist []*types.TaskModel) {
|
|
svc.Scheduler.AiService.TaskSyncLock.Lock()
|
|
defer svc.Scheduler.AiService.TaskSyncLock.Unlock()
|
|
list := make([]*types.TaskModel, len(tasklist))
|
|
copy(list, tasklist)
|
|
for i := len(list) - 1; i >= 0; i-- {
|
|
if list[i].AdapterTypeDict != "1" || list[i].Status == constants.Succeeded || list[i].Status == constants.Failed {
|
|
list = append(list[:i], list[i+1:]...)
|
|
}
|
|
}
|
|
|
|
if len(list) == 0 {
|
|
return
|
|
}
|
|
|
|
task := list[0]
|
|
for i := range list {
|
|
earliest, _ := time.Parse(constants.Layout, task.UpdatedTime)
|
|
latest, _ := time.Parse(constants.Layout, list[i].UpdatedTime)
|
|
if latest.Before(earliest) {
|
|
task = list[i]
|
|
}
|
|
}
|
|
|
|
aiTaskList, err := svc.Scheduler.AiStorages.GetAiTaskListById(task.Id)
|
|
if err != nil {
|
|
logx.Errorf(err.Error())
|
|
return
|
|
}
|
|
|
|
if len(aiTaskList) == 0 {
|
|
return
|
|
}
|
|
|
|
UpdateAiTask(svc, aiTaskList...)
|
|
}
|
|
|
|
func UpdateTrainingTaskStatus(svc *svc.ServiceContext, list []*types.AdapterInfo) {
|
|
svc.Scheduler.AiService.TaskSyncLock.Lock()
|
|
defer svc.Scheduler.AiService.TaskSyncLock.Unlock()
|
|
var wg sync.WaitGroup
|
|
for _, adapter := range list {
|
|
taskList, err := svc.Scheduler.AiStorages.GetAiTasksByAdapterId(adapter.Id)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if len(taskList) == 0 {
|
|
continue
|
|
}
|
|
for _, task := range taskList {
|
|
t := task
|
|
if t.Status == constants.Completed || task.Status == constants.Failed || task.Status == constants.Stopped || task.TaskType != "pytorch" {
|
|
continue
|
|
}
|
|
wg.Add(1)
|
|
go func() {
|
|
h := http.Request{}
|
|
trainingTask, err := svc.Scheduler.AiService.AiCollectorAdapterMap[adapter.Id][strconv.FormatInt(t.ClusterId, 10)].GetTrainingTask(h.Context(), t.JobId)
|
|
if err != nil {
|
|
msg := fmt.Sprintf("AiTaskId: %v, clusterId: %v , JobId: %v, error: %v \n", t.Id, t.ClusterId, t.JobId, err.Error())
|
|
logx.Errorf(errors.New(msg).Error())
|
|
wg.Done()
|
|
return
|
|
}
|
|
if trainingTask == nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
t.Status = trainingTask.Status
|
|
t.StartTime = trainingTask.Start
|
|
t.EndTime = trainingTask.End
|
|
err = svc.Scheduler.AiStorages.UpdateAiTask(t)
|
|
if err != nil {
|
|
wg.Done()
|
|
return
|
|
}
|
|
wg.Done()
|
|
}()
|
|
}
|
|
}
|
|
wg.Wait()
|
|
return
|
|
}
|