forked from JointCloud/pcm-coordinator
391 lines
9.3 KiB
Go
391 lines
9.3 KiB
Go
package tasksync
|
|
|
|
import (
|
|
"github.com/zeromicro/go-zero/core/logx"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/config"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
|
|
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
|
|
"net/http"
|
|
"strconv"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
type SyncInfer struct {
|
|
mu sync.Mutex
|
|
aiStorages *database.AiStorage
|
|
inferenceAdapterMap map[string]map[string]inference.ICluster
|
|
config *config.Config
|
|
}
|
|
|
|
func NewInferTask(storage *database.AiStorage, inferenceAdapterMap map[string]map[string]inference.ICluster, config *config.Config) *SyncInfer {
|
|
return &SyncInfer{
|
|
aiStorages: storage,
|
|
inferenceAdapterMap: inferenceAdapterMap,
|
|
config: config,
|
|
}
|
|
}
|
|
|
|
func (s *SyncInfer) UpdateDeployInstanceStatusBatch(insList []*models.AiInferDeployInstance, needfilter bool) {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
list := make([]*models.AiInferDeployInstance, len(insList))
|
|
copy(list, insList)
|
|
|
|
if needfilter {
|
|
for i := len(list) - 1; i >= 0; i-- {
|
|
if list[i].Status == constants.Running || list[i].Status == constants.Stopped || list[i].Status == constants.Failed {
|
|
list = append(list[:i], list[i+1:]...)
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(list) == 0 {
|
|
return
|
|
}
|
|
|
|
buffer := make(chan bool, 3)
|
|
for _, instance := range list {
|
|
buffer <- true
|
|
go s.UpdateDeployInstanceStatus(instance, false, buffer)
|
|
}
|
|
}
|
|
|
|
func (s *SyncInfer) UpdateDeployTaskStatus() {
|
|
list, err := s.aiStorages.GetAllDeployTasks()
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
ins := list[0]
|
|
for i := range list {
|
|
uTime, _ := time.Parse(time.RFC3339, ins.UpdateTime)
|
|
latest, _ := time.Parse(time.RFC3339, list[i].UpdateTime)
|
|
if latest.After(uTime) {
|
|
ins = list[i]
|
|
}
|
|
}
|
|
inslist, err := s.aiStorages.GetInstanceListByDeployTaskId(ins.Id)
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
buffer := make(chan bool, 2)
|
|
for _, instance := range inslist {
|
|
buffer <- true
|
|
go s.UpdateDeployInstanceStatus(instance, false, buffer)
|
|
}
|
|
}
|
|
|
|
func (s *SyncInfer) UpdateDeployInstanceStatus(instance *models.AiInferDeployInstance, updatetime bool, ch chan bool) {
|
|
amap, found := s.inferenceAdapterMap[strconv.FormatInt(instance.AdapterId, 10)]
|
|
if !found {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
cmap, found := amap[strconv.FormatInt(instance.ClusterId, 10)]
|
|
if !found {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
h := http.Request{}
|
|
ins, err := cmap.GetInferDeployInstance(h.Context(), instance.InstanceId)
|
|
if err != nil {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
switch instance.ClusterType {
|
|
case storeLink.TYPE_OCTOPUS:
|
|
switch ins.Status {
|
|
case "running":
|
|
if instance.Status == constants.Running {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
url := ins.InferUrl
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
|
|
if err != nil {
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
|
}
|
|
instance.Status = constants.Running
|
|
case "stopped":
|
|
if instance.Status == constants.Stopped {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
instance.Status = constants.Stopped
|
|
default:
|
|
instance.Status = ins.Status
|
|
}
|
|
case storeLink.TYPE_MODELARTS:
|
|
switch ins.Status {
|
|
case "running":
|
|
if instance.Status == constants.Running {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
url := ins.InferUrl
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
|
|
if err != nil {
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
|
}
|
|
instance.Status = constants.Running
|
|
case "stopped":
|
|
if instance.Status == constants.Stopped {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
instance.Status = constants.Stopped
|
|
case "failed":
|
|
if instance.Status == constants.Failed {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
|
|
if err != nil {
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
|
}
|
|
instance.Status = constants.Failed
|
|
default:
|
|
instance.Status = ins.Status
|
|
}
|
|
case storeLink.TYPE_SHUGUANGAI:
|
|
switch ins.Status {
|
|
case "Running":
|
|
if instance.Status == constants.Running {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
instance.Status = constants.Running
|
|
case "Terminated":
|
|
if instance.Status == constants.Stopped {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
instance.Status = constants.Stopped
|
|
default:
|
|
instance.Status = ins.Status
|
|
}
|
|
case storeLink.TYPE_OPENI:
|
|
switch ins.Status {
|
|
case "RUNNING":
|
|
if instance.Status == constants.Running {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
url := ins.InferUrl
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), url, true, "")
|
|
if err != nil {
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
|
}
|
|
instance.Status = constants.Running
|
|
case "STOPPED":
|
|
if instance.Status == constants.Stopped {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
instance.Status = constants.Stopped
|
|
case "CREATED_FAILED":
|
|
if instance.Status == constants.Failed {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
|
|
if err != nil {
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
|
}
|
|
instance.Status = constants.Failed
|
|
case "FAILED":
|
|
if instance.Status == constants.Failed {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
err := s.ReportInferenceStatusMessages(instance, instance.InstanceName, strconv.FormatInt(instance.DeployInstanceTaskId, 10), strconv.FormatInt(instance.ClusterId, 10), "", false, ins.Status)
|
|
if err != nil {
|
|
logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
|
|
}
|
|
instance.Status = constants.Failed
|
|
default:
|
|
instance.Status = ins.Status
|
|
}
|
|
}
|
|
|
|
err = s.aiStorages.UpdateInferDeployInstance(instance, updatetime)
|
|
if err != nil {
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
return
|
|
}
|
|
|
|
if ch != nil {
|
|
<-ch
|
|
return
|
|
}
|
|
}
|
|
|
|
func (s *SyncInfer) UpdateAutoStoppedInstance() {
|
|
list, err := s.aiStorages.GetInferDeployInstanceListLastMonth()
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
if len(list) == 0 {
|
|
return
|
|
}
|
|
|
|
s.UpdateDeployInstanceStatusBatch(list, false)
|
|
}
|
|
|
|
func (s *SyncInfer) CheckStopStatus(in *inference.DeployInstance) bool {
|
|
switch in.ClusterType {
|
|
case storeLink.TYPE_OCTOPUS:
|
|
switch in.Status {
|
|
case "stopped":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
case storeLink.TYPE_MODELARTS:
|
|
switch in.Status {
|
|
case "stopped":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
case storeLink.TYPE_SHUGUANGAI:
|
|
switch in.Status {
|
|
case "Terminated":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
case storeLink.TYPE_OPENI:
|
|
switch in.Status {
|
|
case "STOPPED":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func (s *SyncInfer) CheckRunningStatus(in *inference.DeployInstance) bool {
|
|
switch in.ClusterType {
|
|
case storeLink.TYPE_OCTOPUS:
|
|
switch in.Status {
|
|
case "running":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
case storeLink.TYPE_MODELARTS:
|
|
switch in.Status {
|
|
case "running":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
case storeLink.TYPE_SHUGUANGAI:
|
|
switch in.Status {
|
|
case "Running":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
case storeLink.TYPE_OPENI:
|
|
switch in.Status {
|
|
case "RUNNING":
|
|
return true
|
|
case "WAITING":
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
default:
|
|
return false
|
|
}
|
|
}
|
|
|
|
func (s *SyncInfer) ReportInferenceStatusMessages(ins *models.AiInferDeployInstance, taskName string, taskId string, clusterId string, url string, status bool, msg string) error {
|
|
var id string
|
|
var adapterID string
|
|
var clusterID string
|
|
var instanceID string
|
|
if ins != nil {
|
|
id = strconv.FormatInt(ins.Id, 10)
|
|
adapterID = strconv.FormatInt(ins.AdapterId, 10)
|
|
clusterID = strconv.FormatInt(ins.ClusterId, 10)
|
|
instanceID = ins.InstanceId
|
|
}
|
|
report := &jcs.JobStatusReportReq{}
|
|
reportMsg := &jcs.InferReportMessage{
|
|
Type: "Inference",
|
|
TaskName: taskName,
|
|
TaskID: taskId,
|
|
Status: status,
|
|
Message: msg,
|
|
Url: url,
|
|
ID: id,
|
|
AdapterID: adapterID,
|
|
ClusterID: clusterID,
|
|
InstanceID: instanceID,
|
|
}
|
|
report.Report = reportMsg
|
|
|
|
err := jcs.StatusReport(s.config.JcsMiddleware.JobStatusReportUrl, report)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|