pcm-coordinator/internal/scheduler/service/utils/status/deployInstance.go

322 lines
7.4 KiB
Go

package status
import (
"github.com/zeromicro/go-zero/core/logx"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
"net/http"
"strconv"
"time"
)
// UpdateDeployInstanceStatusBatch starts an asynchronous status refresh for
// every instance in insList.
//
// When needfilter is true, instances whose stored status is already
// constants.Running, constants.Stopped or constants.Failed are left out.
// The caller's slice is never modified; the work list is a private copy.
//
// At most three refreshes run at the same time: a buffered channel is used as
// a counting semaphore, and each UpdateDeployInstanceStatus call releases its
// slot when it finishes. The function returns once the last goroutine has
// been started; it does not wait for the refreshes to complete.
func UpdateDeployInstanceStatusBatch(svc *svc.ServiceContext, insList []*models.AiInferDeployInstance, needfilter bool) {
	// settled reports whether an instance already has one of the statuses
	// that the filter excludes.
	settled := func(candidate *models.AiInferDeployInstance) bool {
		return candidate.Status == constants.Running ||
			candidate.Status == constants.Stopped ||
			candidate.Status == constants.Failed
	}

	pending := make([]*models.AiInferDeployInstance, 0, len(insList))
	for _, candidate := range insList {
		if needfilter && settled(candidate) {
			continue
		}
		pending = append(pending, candidate)
	}
	if len(pending) == 0 {
		return
	}

	// Sending blocks while three refreshes are in flight.
	slots := make(chan bool, 3)
	for _, target := range pending {
		slots <- true
		go UpdateDeployInstanceStatus(svc, target, false, slots)
	}
}
// UpdateDeployTaskStatus refreshes the instances of the most recently updated
// deploy task.
//
// It loads all deploy tasks, picks the one whose UpdateTime (parsed as
// RFC 3339) is the latest, loads that task's instances and starts an
// asynchronous UpdateDeployInstanceStatus for each of them, with at most two
// running at the same time. The function returns after the last goroutine has
// been started; it does not wait for the refreshes to complete.
//
// Nothing is done when there are no deploy tasks or when a storage call fails;
// storage failures are logged.
func UpdateDeployTaskStatus(svc *svc.ServiceContext) {
	list, err := svc.Scheduler.AiStorages.GetAllDeployTasks()
	if err != nil {
		logx.Errorf("get all deploy tasks: %s", err.Error())
		return
	}
	// Indexing list[0] on an empty result would panic.
	if len(list) == 0 {
		return
	}

	// A timestamp that fails to parse is deliberately treated as the zero
	// time, so such a task never wins over one with a valid timestamp.
	newest := list[0]
	newestTime, _ := time.Parse(time.RFC3339, newest.UpdateTime)
	for i := 1; i < len(list); i++ {
		candidateTime, _ := time.Parse(time.RFC3339, list[i].UpdateTime)
		if candidateTime.After(newestTime) {
			newest = list[i]
			newestTime = candidateTime
		}
	}

	inslist, err := svc.Scheduler.AiStorages.GetInstanceListByDeployTaskId(newest.Id)
	if err != nil {
		logx.Errorf("get instance list of deploy task %v: %s", newest.Id, err.Error())
		return
	}

	// Buffered channel used as a counting semaphore: sending blocks while two
	// refreshes are in flight, and each refresh releases its slot on exit.
	buffer := make(chan bool, 2)
	for _, instance := range inslist {
		buffer <- true
		go UpdateDeployInstanceStatus(svc, instance, false, buffer)
	}
}
// UpdateDeployInstanceStatus refreshes the stored status of one deploy
// instance from the cluster that hosts it.
//
// The inference client is looked up in InferenceAdapterMap by the instance's
// adapter id and cluster id. The remote instance is then fetched and its
// cluster-specific status string is translated:
//
//   - Octopus:    "running" -> Running, "stopped" -> Stopped
//   - ModelArts:  "running" -> Running, "stopped" -> Stopped, "failed" -> Failed
//   - ShuguangAi: "Running" -> Running, "Terminated" -> Stopped
//   - OpenI:      "RUNNING" -> Running, "STOPPED" -> Stopped,
//     "CREATED_FAILED"/"FAILED" -> Failed
//
// Any other remote status is copied onto the instance verbatim. When the
// translated status equals the one already stored, the function returns
// without writing anything. Transitions to Running (except on ShuguangAi) and
// to Failed are additionally announced through ReportStatus; a reporting
// failure is logged and does not prevent the status from being saved.
//
// Parameters:
//   - svc: service context providing the adapter map and the storage layer.
//   - instance: the record to refresh; its Status field is modified in place.
//   - updatetime: forwarded unchanged to UpdateInferDeployInstance
//     (presumably controls whether the record's update time is touched —
//     TODO confirm against the storage layer).
//   - ch: optional semaphore channel. When non-nil, exactly one value is
//     received from it when the function exits, on every path, which releases
//     the slot the caller acquired before starting this call.
func UpdateDeployInstanceStatus(svc *svc.ServiceContext, instance *models.AiInferDeployInstance, updatetime bool, ch chan bool) {
	if ch != nil {
		// Release the caller's slot no matter which return path is taken.
		defer func() { <-ch }()
	}

	adapterID := strconv.FormatInt(instance.AdapterId, 10)
	clusterID := strconv.FormatInt(instance.ClusterId, 10)
	taskID := strconv.FormatInt(instance.DeployInstanceTaskId, 10)

	amap, found := svc.Scheduler.AiService.InferenceAdapterMap[adapterID]
	if !found {
		return
	}
	cmap, found := amap[clusterID]
	if !found {
		return
	}

	// A zero-value request yields the background context.
	h := http.Request{}
	ins, err := cmap.GetInferDeployInstance(h.Context(), instance.InstanceId)
	if err != nil {
		logx.Errorf("get infer deploy instance %v: %s", instance.InstanceId, err.Error())
		return
	}

	// reportRunning announces that the instance is serving at its inference URL.
	reportRunning := func() {
		if err := ReportStatus(svc, instance.InstanceName, taskID, clusterID, ins.InferUrl, true, ""); err != nil {
			logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
		}
	}
	// reportFailed announces a failure, passing along the raw remote status.
	reportFailed := func() {
		if err := ReportStatus(svc, instance.InstanceName, taskID, clusterID, "", false, ins.Status); err != nil {
			logx.Errorf("############ Report Infer Task Status Message Error %s", err.Error())
		}
	}

	switch instance.ClusterType {
	case storeLink.TYPE_OCTOPUS:
		switch ins.Status {
		case "running":
			if instance.Status == constants.Running {
				return
			}
			reportRunning()
			instance.Status = constants.Running
		case "stopped":
			if instance.Status == constants.Stopped {
				return
			}
			instance.Status = constants.Stopped
		default:
			instance.Status = ins.Status
		}
	case storeLink.TYPE_MODELARTS:
		switch ins.Status {
		case "running":
			if instance.Status == constants.Running {
				return
			}
			reportRunning()
			instance.Status = constants.Running
		case "stopped":
			if instance.Status == constants.Stopped {
				return
			}
			instance.Status = constants.Stopped
		case "failed":
			if instance.Status == constants.Failed {
				return
			}
			reportFailed()
			instance.Status = constants.Failed
		default:
			instance.Status = ins.Status
		}
	case storeLink.TYPE_SHUGUANGAI:
		// No ReportStatus call is made for this cluster type.
		switch ins.Status {
		case "Running":
			if instance.Status == constants.Running {
				return
			}
			instance.Status = constants.Running
		case "Terminated":
			if instance.Status == constants.Stopped {
				return
			}
			instance.Status = constants.Stopped
		default:
			instance.Status = ins.Status
		}
	case storeLink.TYPE_OPENI:
		switch ins.Status {
		case "RUNNING":
			if instance.Status == constants.Running {
				return
			}
			reportRunning()
			instance.Status = constants.Running
		case "STOPPED":
			if instance.Status == constants.Stopped {
				return
			}
			instance.Status = constants.Stopped
		case "CREATED_FAILED", "FAILED":
			if instance.Status == constants.Failed {
				return
			}
			reportFailed()
			instance.Status = constants.Failed
		default:
			instance.Status = ins.Status
		}
	}

	// For an unrecognised cluster type the status is unchanged, but the
	// record is still written.
	if err := svc.Scheduler.AiStorages.UpdateInferDeployInstance(instance, updatetime); err != nil {
		logx.Errorf("update infer deploy instance %v: %s", instance.InstanceId, err.Error())
	}
}
// UpdateAutoStoppedInstance loads every inference deploy instance from storage
// and starts an unfiltered batch status refresh for all of them.
//
// It returns without doing anything when the storage call fails or when the
// list is empty.
func UpdateAutoStoppedInstance(svc *svc.ServiceContext) {
	instances, err := svc.Scheduler.AiStorages.GetInferDeployInstanceList()
	if err != nil || len(instances) == 0 {
		return
	}
	// needfilter is false, so instances in every status are refreshed.
	UpdateDeployInstanceStatusBatch(svc, instances, false)
}
// CheckStopStatus reports whether the cluster-specific status string of in
// means "stopped".
//
// Octopus and ModelArts use "stopped"; ShuguangAi uses "Terminated". Every
// other cluster type yields false.
//
// NOTE(review): storeLink.TYPE_OPENI is not handled here, although
// UpdateDeployInstanceStatus maps its "STOPPED" status — confirm whether that
// omission is intentional.
func CheckStopStatus(in *inference.DeployInstance) bool {
	switch in.ClusterType {
	case storeLink.TYPE_OCTOPUS, storeLink.TYPE_MODELARTS:
		return in.Status == "stopped"
	case storeLink.TYPE_SHUGUANGAI:
		return in.Status == "Terminated"
	}
	return false
}
// CheckRunningStatus reports whether the cluster-specific status string of in
// means "running".
//
// Octopus and ModelArts use "running"; ShuguangAi uses "Running". Every other
// cluster type yields false.
//
// NOTE(review): storeLink.TYPE_OPENI is not handled here, although
// UpdateDeployInstanceStatus maps its "RUNNING" status — confirm whether that
// omission is intentional.
func CheckRunningStatus(in *inference.DeployInstance) bool {
	switch in.ClusterType {
	case storeLink.TYPE_OCTOPUS, storeLink.TYPE_MODELARTS:
		return in.Status == "running"
	case storeLink.TYPE_SHUGUANGAI:
		return in.Status == "Running"
	}
	return false
}