pcm-coordinator/internal/logic/schedule/schedulecreatetasklogic.go

489 lines
13 KiB
Go

package schedule
import (
"context"
"fmt"
"github.com/pkg/errors"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
"gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
"gopkg.in/yaml.v3"
"slices"
"strings"
"time"
"github.com/zeromicro/go-zero/core/logx"
)
// Scheduling constants.
// NOTE(review): "TRAINNING" is a misspelling of "TRAINING", kept because the
// identifiers are referenced elsewhere in the package.
const (
	// TRAINNING_TASK_REPLICA is the replica count requested per training task.
	TRAINNING_TASK_REPLICA = 1
	// TRAINNING_TASK_SUFFIX_LEN — suffix length for training task names.
	// Not used in this file; presumably consumed elsewhere — TODO confirm.
	TRAINNING_TASK_SUFFIX_LEN = 10
	// QUERY_RESOURCE_RETRY is how many times cluster resources are queried
	// before scheduling gives up (see getAssignedClustersByStrategy).
	QUERY_RESOURCE_RETRY = 3
)
// ClustersWithDataDistributes pairs the clusters chosen by scheduling with the
// data-distribution records filtered down to just those clusters. It is
// serialized to YAML and stored with the task (see createTask).
type ClustersWithDataDistributes struct {
	Clusters        []*strategy.AssignedCluster // clusters the task was assigned to
	DataDistributes *types.DataDistribute       // dataset/image/model/code distribution restricted to Clusters
}
// ScheduleCreateTaskLogic implements the schedule/create-task endpoint:
// it validates the request, picks clusters via the configured strategy,
// and persists the resulting task.
type ScheduleCreateTaskLogic struct {
	logx.Logger
	ctx           context.Context
	svcCtx        *svc.ServiceContext
	queryResource *QueryResourcesLogic // used to poll cluster resources for the least-load strategy
}
// NewScheduleCreateTaskLogic builds a ScheduleCreateTaskLogic wired to the
// given request context and service context, including its resource-query
// sub-logic.
func NewScheduleCreateTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleCreateTaskLogic {
	logic := &ScheduleCreateTaskLogic{
		Logger:        logx.WithContext(ctx),
		ctx:           ctx,
		svcCtx:        svcCtx,
		queryResource: NewQueryResourcesLogic(ctx, svcCtx),
	}
	return logic
}
func generateFilteredDataDistributes(clusters []*strategy.AssignedCluster, distribute types.DataDistribute) *ClustersWithDataDistributes {
var clusterIds []string
for _, c := range clusters {
clusterIds = append(clusterIds, c.ClusterId)
}
clustersWithDataDistributes := &ClustersWithDataDistributes{
Clusters: clusters,
DataDistributes: &types.DataDistribute{
Dataset: make([]*types.DatasetDistribute, 0),
Image: make([]*types.ImageDistribute, 0),
Model: make([]*types.ModelDistribute, 0),
Code: make([]*types.CodeDistribute, 0),
},
}
for _, datasetDistribute := range distribute.Dataset {
dataset := &types.DatasetDistribute{}
dataset.DataName = datasetDistribute.DataName
dataset.PackageID = datasetDistribute.PackageID
clusterScheduledList := make([]*types.ClusterScheduled, 0)
if len(datasetDistribute.Clusters) != 0 {
for _, cluster := range datasetDistribute.Clusters {
if slices.Contains(clusterIds, cluster.ClusterID) {
clusterScheduledList = append(clusterScheduledList, cluster)
}
}
}
dataset.Clusters = clusterScheduledList
clustersWithDataDistributes.DataDistributes.Dataset = append(clustersWithDataDistributes.DataDistributes.Dataset, dataset)
}
for _, imageDistribute := range distribute.Image {
image := &types.ImageDistribute{}
image.DataName = imageDistribute.DataName
image.PackageID = imageDistribute.PackageID
clusterScheduledList := make([]*types.ClusterScheduled, 0)
if len(imageDistribute.Clusters) != 0 {
for _, cluster := range imageDistribute.Clusters {
if slices.Contains(clusterIds, cluster.ClusterID) {
clusterScheduledList = append(clusterScheduledList, cluster)
}
}
}
image.Clusters = clusterScheduledList
clustersWithDataDistributes.DataDistributes.Image = append(clustersWithDataDistributes.DataDistributes.Image, image)
}
for _, codeDistribute := range distribute.Code {
code := &types.CodeDistribute{}
code.DataName = codeDistribute.DataName
code.PackageID = codeDistribute.PackageID
code.Output = codeDistribute.Output
clusterScheduledList := make([]*types.ClusterScheduled, 0)
if len(codeDistribute.Clusters) != 0 {
for _, cluster := range codeDistribute.Clusters {
if slices.Contains(clusterIds, cluster.ClusterID) {
clusterScheduledList = append(clusterScheduledList, cluster)
}
}
}
code.Clusters = clusterScheduledList
clustersWithDataDistributes.DataDistributes.Code = append(clustersWithDataDistributes.DataDistributes.Code, code)
}
for _, modelDistribute := range distribute.Model {
model := &types.ModelDistribute{}
model.DataName = modelDistribute.DataName
model.PackageID = modelDistribute.PackageID
clusterScheduledList := make([]*types.ClusterScheduled, 0)
if len(modelDistribute.Clusters) != 0 {
for _, cluster := range modelDistribute.Clusters {
if slices.Contains(clusterIds, cluster.ClusterID) {
clusterScheduledList = append(clusterScheduledList, cluster)
}
}
}
model.Clusters = clusterScheduledList
clustersWithDataDistributes.DataDistributes.Model = append(clustersWithDataDistributes.DataDistributes.Model, model)
}
return clustersWithDataDistributes
}
// ScheduleCreateTask validates the request, determines the clusters to run on
// (directly when exactly one cluster is requested, otherwise via the chosen
// scheduling strategy), computes which data still needs distributing to those
// clusters, and persists the task.
//
// The previous version duplicated the entire tail (schedule result, filtering,
// task creation, response assembly) in both branches; only the way
// assignedClusters is obtained actually differs, so the tail is now shared.
func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) (resp *types.CreateTaskResp, err error) {
	resp = &types.CreateTaskResp{}
	if err = validateJobResources(req.JobResources); err != nil {
		return nil, err
	}
	taskName, err := l.handleDuplicateTaskName(req.Name)
	if err != nil {
		return nil, err
	}
	var assignedClusters []*strategy.AssignedCluster
	if len(req.JobResources.Clusters) == 1 {
		// Exactly one cluster requested: no strategy needed, assign it directly.
		assignedClusters = copyParams([]*strategy.AssignedCluster{{
			ClusterId: req.JobResources.Clusters[0].ClusterID, Replicas: TRAINNING_TASK_REPLICA,
		}}, req.JobResources.Clusters)
	} else {
		assignedClusters, err = l.getAssignedClustersByStrategy(&req.JobResources, &req.DataDistributes)
		if err != nil {
			return nil, err
		}
		if len(assignedClusters) == 0 {
			return nil, fmt.Errorf("failed to create task, no scheduled cluster found")
		}
	}
	clusters := make([]string, 0, len(assignedClusters))
	for _, c := range assignedClusters {
		clusters = append(clusters, c.ClusterId)
	}
	// Work out which data packages still need distributing to the chosen clusters.
	schedatas, err := l.generateScheduleResult(req.DataDistributes, clusters)
	if err != nil {
		return nil, err
	}
	// Restrict the data-distribution records to the assigned clusters before persisting.
	clustersWithDataDistributes := generateFilteredDataDistributes(assignedClusters, req.DataDistributes)
	taskId, err := l.createTask(taskName, req.Description, req.JobResources.ScheduleStrategy, clustersWithDataDistributes, req.Token, req.UserIp)
	if err != nil {
		return nil, err
	}
	resp.ScheduleDatas = schedatas
	resp.TaskID = taskId
	return resp, nil
}
// validateJobResources checks that the request names a scheduling strategy and
// that every requested cluster has an ID and at least one compute resource.
// Returns nil when the resources are acceptable.
func validateJobResources(resources types.JobResources) error {
	switch {
	case resources.ScheduleStrategy == "":
		return fmt.Errorf("must specify ScheduleStrategy")
	case len(resources.Clusters) == 0:
		return fmt.Errorf("must specify at least one cluster")
	}
	for _, c := range resources.Clusters {
		switch {
		case c.ClusterID == "":
			return fmt.Errorf("must specify clusterID")
		case len(c.Resources) == 0:
			//return errors.Wrapf(xerr.NewErrCodeMsg(1234, fmt.Sprintf("cluster: %s must specify at least one compute resource", c.ClusterID)), "")
			return fmt.Errorf("cluster: %s must specify at least one compute resource", c.ClusterID)
		}
	}
	return nil
}
// handleDuplicateTaskName returns name unchanged when no task with that name
// exists yet; otherwise it appends a timestamp suffix to disambiguate.
func (l *ScheduleCreateTaskLogic) handleDuplicateTaskName(name string) (string, error) {
	taken, err := l.svcCtx.Scheduler.AiStorages.DoesTaskNameExist(name)
	if err != nil {
		return "", err
	}
	if !taken {
		return name, nil
	}
	return name + "_" + time.Now().Format(constants.Layout_Time_Suffix), nil
}
// getAssignedClustersByStrategy runs the strategy named in resources and
// returns the clusters it assigns, enriched with the per-cluster runtime
// parameters from the request (via copyParams).
//
// For LEASTLOADFIRST the cluster resources are polled up to
// QUERY_RESOURCE_RETRY times, sleeping one second between attempts.
func (l *ScheduleCreateTaskLogic) getAssignedClustersByStrategy(resources *types.JobResources, dataDistribute *types.DataDistribute) ([]*strategy.AssignedCluster, error) {
	var assignedClusters []*strategy.AssignedCluster
	switch resources.ScheduleStrategy {
	case strategy.LEASTLOADFIRST:
		var resSpecs []*collector.ResourceSpec
		var resCount int
		for i := 0; i < QUERY_RESOURCE_RETRY; i++ {
			// BUG FIX: this used to be `defer time.Sleep(time.Second)` inside
			// the loop, which stacked the sleeps to run at function return
			// instead of pausing between retries. Sleep before each retry.
			if i > 0 {
				time.Sleep(time.Second)
			}
			qResources, err := l.queryResource.QueryResourcesByClusterId(nil)
			if err != nil {
				continue
			}
			resCount = 0 // reset so a failed earlier attempt cannot leak its count
			for _, resource := range qResources {
				if resource.Resources != nil {
					resCount++
				}
			}
			if resCount >= 1 {
				resSpecs = qResources
				break
			}
		}
		if resCount == 0 {
			return nil, fmt.Errorf("failed to create task, resources counting fails")
		}
		strtg := strategy.NewLeastLoadFirst(TRAINNING_TASK_REPLICA, resSpecs)
		clusters, err := strtg.Schedule()
		if err != nil {
			return nil, err
		}
		assignedClusters = copyParams(clusters, resources.Clusters)
	case strategy.DATA_LOCALITY:
		strtg := strategy.NewDataLocality(TRAINNING_TASK_REPLICA, dataDistribute)
		clusters, err := strtg.Schedule()
		if err != nil {
			return nil, err
		}
		assignedClusters = copyParams(clusters, resources.Clusters)
	default:
		return nil, errors.New("no strategy has been chosen")
	}
	return assignedClusters, nil
}
// copyParams merges the strategy's assignment (cluster id/name, replicas) with
// the per-cluster request info (resources, command, envs, params), producing a
// fully populated AssignedCluster for each assignment that matches a request
// cluster. Env and param maps are flattened to "key<COMMA>value" strings,
// skipping entries whose value converts to the empty string.
func copyParams(clusters []*strategy.AssignedCluster, clusterInfos []*types.JobClusterInfo) []*strategy.AssignedCluster {
	var result []*strategy.AssignedCluster
	for _, assigned := range clusters {
		for _, info := range clusterInfos {
			if assigned.ClusterId != info.ClusterID {
				continue
			}
			var envList []string
			for key, raw := range info.Runtime.Envs {
				if s := common.ConvertTypeToString(raw); s != "" {
					envList = append(envList, key+storeLink.COMMA+s)
				}
			}
			var paramList []string
			for key, raw := range info.Runtime.Params {
				if s := common.ConvertTypeToString(raw); s != "" {
					paramList = append(paramList, key+storeLink.COMMA+s)
				}
			}
			result = append(result, &strategy.AssignedCluster{
				ClusterId:         assigned.ClusterId,
				ClusterName:       assigned.ClusterName,
				Replicas:          assigned.Replicas,
				ResourcesRequired: info.Resources,
				Cmd:               info.Runtime.Command,
				Envs:              envList,
				Params:            paramList,
			})
		}
	}
	return result
}
// createTask serializes the cluster/data-distribution payload to YAML and
// persists the task through the scheduler, returning the new task id.
// synergyStatus is 1 when the task spans more than one cluster, else 0.
func (l *ScheduleCreateTaskLogic) createTask(taskName string, desc string, strategyName string, clustersWithDataDistributes *ClustersWithDataDistributes, token string, userIp string) (int64, error) {
	var synergyStatus int64
	if len(clustersWithDataDistributes.Clusters) > 1 {
		synergyStatus = 1
	}
	y, err := yaml.Marshal(clustersWithDataDistributes)
	if err != nil {
		// BUG FIX: previously the error was only fmt.Printf'd and the task was
		// persisted with an empty payload; a marshalling failure must abort.
		return 0, fmt.Errorf("marshalling task payload: %w", err)
	}
	taskId, err := l.svcCtx.Scheduler.CreateTask(taskName, desc, synergyStatus, strategyName, string(y), token, userIp, &l.svcCtx.Config)
	if err != nil {
		return 0, err
	}
	return taskId, nil
}
func (l *ScheduleCreateTaskLogic) generateScheduleResult(distribute types.DataDistribute, clusters []string) ([]*types.ScheduleData, error) {
var schedatas []*types.ScheduleData
for _, d := range distribute.Dataset {
data := &types.ScheduleData{
DataType: "dataset",
PackageID: d.PackageID,
ClusterIDs: make([]string, 0),
}
var cSlc []string
for _, cluster := range d.Clusters {
cSlc = append(cSlc, cluster.ClusterID)
}
for _, cluster := range clusters {
if !slices.Contains(cSlc, cluster) {
data.ClusterIDs = append(data.ClusterIDs, cluster)
} else {
continue
}
}
if len(data.ClusterIDs) != 0 {
schedatas = append(schedatas, data)
}
}
for _, d := range distribute.Code {
data := &types.ScheduleData{
DataType: "code",
PackageID: d.PackageID,
ClusterIDs: make([]string, 0),
}
var cSlc []string
for _, cluster := range d.Clusters {
cSlc = append(cSlc, cluster.ClusterID)
}
for _, cluster := range clusters {
if !slices.Contains(cSlc, cluster) {
data.ClusterIDs = append(data.ClusterIDs, cluster)
} else {
continue
}
}
if len(data.ClusterIDs) != 0 {
schedatas = append(schedatas, data)
}
}
for _, d := range distribute.Image {
data := &types.ScheduleData{
DataType: "image",
PackageID: d.PackageID,
ClusterIDs: make([]string, 0),
}
var cSlc []string
for _, cluster := range d.Clusters {
cSlc = append(cSlc, cluster.ClusterID)
}
for _, cluster := range clusters {
if !slices.Contains(cSlc, cluster) {
data.ClusterIDs = append(data.ClusterIDs, cluster)
} else {
continue
}
}
if len(data.ClusterIDs) != 0 {
schedatas = append(schedatas, data)
}
}
for _, d := range distribute.Model {
data := &types.ScheduleData{
DataType: "model",
PackageID: d.PackageID,
ClusterIDs: make([]string, 0),
}
var cSlc []string
for _, cluster := range d.Clusters {
cSlc = append(cSlc, cluster.ClusterID)
}
for _, cluster := range clusters {
if !slices.Contains(cSlc, cluster) {
data.ClusterIDs = append(data.ClusterIDs, cluster)
} else {
continue
}
}
if len(data.ClusterIDs) != 0 {
schedatas = append(schedatas, data)
}
}
if len(schedatas) != 0 {
err := l.updateStorageType(&schedatas)
if err != nil {
return nil, err
}
}
return schedatas, nil
}
// updateStorageType resolves each schedule entry's StorageType by looking up
// every target cluster, mapping its (title-cased) name through
// storeLink.StorageTypeMap, de-duplicating, and joining with COMMA.
// Entries whose cluster names are absent from the map get an empty StorageType.
func (l *ScheduleCreateTaskLogic) updateStorageType(schedatas *[]*types.ScheduleData) error {
	for _, s := range *schedatas {
		var sTypes []string
		for _, id := range s.ClusterIDs {
			cluster, err := l.svcCtx.Scheduler.AiStorages.GetClustersById(id)
			if err != nil {
				return err
			}
			// NOTE(review): strings.Title is deprecated (ASCII/word-boundary
			// quirks), but StorageTypeMap keys appear to be title-cased, so it
			// is kept to preserve lookups — confirm before replacing.
			if stype, ok := storeLink.StorageTypeMap[strings.Title(cluster.Name)]; ok {
				sTypes = append(sTypes, stype)
			}
		}
		// strings.Join replaces the previous "+= st + COMMA; TrimSuffix" loop
		// with identical output.
		s.StorageType = strings.Join(common.Unique(sTypes), storeLink.COMMA)
	}
	return nil
}