JCC-CSScheduler/client/internal/prescheduler/default_prescheduler.go

722 lines
20 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package prescheduler
import (
"fmt"
"sort"
"github.com/inhies/go-bytesize"
"github.com/samber/lo"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
uopsdk "gitlink.org.cn/cloudream/common/sdks/unifyops"
"gitlink.org.cn/cloudream/common/utils/math"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
schmod "gitlink.org.cn/cloudream/scheduler/common/models"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/collector"
mgrmq "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/manager"
)
const (
// Resource levels assigned to each node (a lower level means the node fits the request better):
// ResourceLevel1 means every required resource type has at least 1.5x the requested amount.
ResourceLevel1 = 1
// ResourceLevel2 means Level1 is not met, but every required resource type has at least 1x the requested amount.
ResourceLevel2 = 2
// ResourceLevel3 means at least one required resource type has less than 1x the requested amount.
ResourceLevel3 = 3
// CpuResourceWeight and StgResourceWeight scale the available/required ratio
// when a resource score is computed.
CpuResourceWeight float64 = 1
StgResourceWeight float64 = 1.2
// CachingWeight scales the score of a package cached on a node; LoadedWeight
// is the score given when a package/image is already loaded on a node.
CachingWeight float64 = 1
LoadedWeight float64 = 2
)
// ErrNoAvailableScheme is returned when no computing center is available or
// when the best candidate still lacks some required resource.
var ErrNoAvailableScheme = fmt.Errorf("no appropriate scheduling node found, please wait")
// candidate bundles a computing center with the resource and file scores
// computed for it while scheduling one job.
type candidate struct {
CC schmod.ComputingCenter
IsReferencedJobTarget bool // whether this node is the one chosen for the job this job depends on
Resource resourcesDetail
Files filesDetail
}
// resourcesDetail holds the per-resource-kind evaluation of one computing
// center together with the aggregated figures used for ranking.
type resourcesDetail struct {
CPU resourceDetail
GPU resourceDetail
NPU resourceDetail
MLU resourceDetail
Storage resourceDetail
Memory resourceDetail
// TotalScore is the sum of the scores of all evaluated resource kinds.
TotalScore float64
// AvgScore is meant to hold the average score over the evaluated resource kinds.
AvgScore float64
// MaxLevel is the highest (i.e. worst) level among the evaluated resource kinds.
MaxLevel int
}
// resourceDetail is the evaluation of a single resource kind: its level
// (one of the ResourceLevel* constants) and its weighted availability score.
type resourceDetail struct {
Level int
Score float64
}
// filesDetail holds the file scores of one computing center for the dataset,
// code and image of a job.
type filesDetail struct {
Dataset fileDetail
Code fileDetail
Image fileDetail
// TotalScore is the sum of the caching and loading scores of all three files.
TotalScore float64
}
// fileDetail describes how much of one file is already present on a node.
type fileDetail struct {
CachingScore float64
LoadingScore float64
IsLoaded bool // whether the package has already been loaded onto this node; for an image, whether it has been imported into this computing center
}
// schedulingJob pairs a job with the local IDs of the jobs it references,
// which must be ordered before it.
type schedulingJob struct {
Job schsdk.JobInfo
Afters []string
}
// CandidateArr is a list of candidates implementing sort.Interface. After
// sorting, the most suitable candidate is at index 0.
type CandidateArr []*candidate

// Len returns the number of candidates.
func (a CandidateArr) Len() int { return len(a) }

// Swap exchanges the candidates at positions i and j.
func (a CandidateArr) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less reports whether the candidate at i must be ordered before the one at j.
//
// Ordering rules, in priority order:
//  1. A node that was chosen for a referenced job and still has enough
//     resources (level below ResourceLevel3) comes before one that is not.
//  2. A lower resource level (better fit) comes first.
//  3. Within ResourceLevel1 the higher file score wins; within ResourceLevel2
//     and ResourceLevel3 the higher average resource score wins.
func (a CandidateArr) Less(i, j int) bool {
	n1, n2 := a[i], a[j]

	// Prefer co-locating with the referenced job, but only if that node has
	// enough resources. The comparison must be asymmetric: when only n2 is
	// preferred, n1 is NOT less than n2. When both (or neither) are preferred,
	// fall through to the regular criteria so the ordering stays consistent.
	n1Preferred := n1.IsReferencedJobTarget && n1.Resource.MaxLevel < ResourceLevel3
	n2Preferred := n2.IsReferencedJobTarget && n2.Resource.MaxLevel < ResourceLevel3
	if n1Preferred != n2Preferred {
		return n1Preferred
	}

	// Compare resource levels first: the lower the level, the better the fit.
	if n1.Resource.MaxLevel != n2.Resource.MaxLevel {
		return n1.Resource.MaxLevel < n2.Resource.MaxLevel
	}

	// Same level: compare by the score relevant to that level.
	switch n1.Resource.MaxLevel {
	case ResourceLevel1:
		// A higher file score means more of the job's files are already on
		// the node, so it is preferred.
		return n1.Files.TotalScore > n2.Files.TotalScore
	case ResourceLevel2, ResourceLevel3:
		// A higher average resource score means more spare resources.
		return n1.Resource.AvgScore > n2.Resource.AvgScore
	}
	return false
}
// DefaultPreScheduler is the default pre-scheduling strategy. It carries no
// state of its own.
type DefaultPreScheduler struct{}

// NewDefaultPreScheduler creates a ready-to-use DefaultPreScheduler.
func NewDefaultPreScheduler() *DefaultPreScheduler {
	return new(DefaultPreScheduler)
}
// Schedule produces a pre-schedule scheme for every normal job of the job set,
// plus an upload scheme for every local file those jobs reference.
//
// It fetches all computing centers from the manager, orders the jobs so that
// referenced jobs come first, and then schedules each normal job in that
// order. It returns ErrNoAvailableScheme when no computing center exists and
// an error when the jobs reference each other circularly.
func (s *DefaultPreScheduler) Schedule(info *schsdk.JobSetInfo) (*jobmod.JobSetPreScheduleScheme, *schsdk.JobSetFilesUploadScheme, error) {
	jobSetScheme := &jobmod.JobSetPreScheduleScheme{
		JobSchemes: make(map[string]jobmod.JobScheduleScheme),
	}
	filesUploadSchemes := make(map[string]schsdk.LocalFileUploadScheme)

	mgrCli, err := schglb.ManagerMQPool.Acquire()
	if err != nil {
		// This client comes from the manager pool, so name it accordingly.
		return nil, nil, fmt.Errorf("new manager client: %w", err)
	}
	defer schglb.ManagerMQPool.Release(mgrCli)

	// Query which computing centers are available.
	allCC, err := mgrCli.GetAllComputingCenter(mgrmq.NewGetAllComputingCenter())
	if err != nil {
		return nil, nil, fmt.Errorf("getting all computing center info: %w", err)
	}

	ccs := make(map[schsdk.CCID]schmod.ComputingCenter)
	for _, node := range allCC.ComputingCenters {
		ccs[node.CCID] = node
	}
	if len(ccs) == 0 {
		return nil, nil, ErrNoAvailableScheme
	}

	// Collect, from each job's configuration, the local IDs of the jobs it depends on.
	var schJobs []*schedulingJob
	for _, job := range info.Jobs {
		j := &schedulingJob{
			Job: job,
		}

		if norJob, ok := job.(*schsdk.NormalJobInfo); ok {
			if resFile, ok := norJob.Files.Dataset.(*schsdk.ResourceJobFileInfo); ok {
				j.Afters = append(j.Afters, resFile.ResourceLocalJobID)
			}
			if resFile, ok := norJob.Files.Code.(*schsdk.ResourceJobFileInfo); ok {
				j.Afters = append(j.Afters, resFile.ResourceLocalJobID)
			}
		} else if resJob, ok := job.(*schsdk.ResourceJobInfo); ok {
			j.Afters = append(j.Afters, resJob.TargetLocalJobID)
		}

		schJobs = append(schJobs, j)
	}

	// Order the jobs by their dependencies.
	schJobs, ok := s.orderByAfters(schJobs)
	if !ok {
		return nil, nil, fmt.Errorf("circular reference detected between jobs in the job set")
	}

	// Generate the schemes following the sorted order.
	for _, job := range schJobs {
		if norJob, ok := job.Job.(*schsdk.NormalJobInfo); ok {
			scheme, err := s.scheduleForNormalJob(info, job, ccs, jobSetScheme.JobSchemes)
			if err != nil {
				return nil, nil, err
			}
			jobSetScheme.JobSchemes[job.Job.GetLocalJobID()] = *scheme

			// Inspect the file configuration and generate the upload schemes.
			s.fillNormarlJobLocalUploadScheme(norJob, scheme.TargetCCID, filesUploadSchemes, ccs)
		}
		// Resource jobs currently do not need a schedule scheme.
	}

	return jobSetScheme, &schsdk.JobSetFilesUploadScheme{
		LocalFileSchemes: lo.Values(filesUploadSchemes),
	}, nil
}
// orderByAfters sorts jobs so that every job appears after the jobs listed in
// its Afters. The input jobs are not modified; dependency lists are copied.
// It returns false when a full pass finds no job without pending dependencies,
// which is how a circular reference shows up.
func (s *DefaultPreScheduler) orderByAfters(jobs []*schedulingJob) ([]*schedulingJob, bool) {
	type pendingJob struct {
		job  *schedulingJob
		deps []string
	}

	// Work on private copies of the dependency lists.
	pending := make([]*pendingJob, 0, len(jobs))
	for _, j := range jobs {
		deps := make([]string, len(j.Afters))
		copy(deps, j.Afters)
		pending = append(pending, &pendingJob{job: j, deps: deps})
	}

	var sorted []*schedulingJob
	for len(pending) > 0 {
		blocked := make([]*pendingJob, 0, len(pending))

		for _, cur := range pending {
			if len(cur.deps) != 0 {
				// Still waiting on something; keep it for the next pass.
				blocked = append(blocked, cur)
				continue
			}

			// No dependency left: emit the job and drop every reference to it.
			sorted = append(sorted, cur.job)
			for _, other := range pending {
				other.deps = lo.Reject(other.deps, func(dep string, _ int) bool {
					return dep == cur.job.Job.GetLocalJobID()
				})
			}
		}

		// A pass that released nothing means the remaining jobs can never be released.
		if len(blocked) == len(pending) {
			return nil, false
		}
		pending = blocked
	}

	return sorted, true
}
// scheduleForNormalJob picks the computing center for one normal job.
//
// jobSet is the whole job set (used to resolve referenced resource jobs), job
// is the job to schedule, ccs are the available computing centers and
// jobSchemes holds the schemes already produced for earlier jobs.
//
// It returns ErrNoAvailableScheme when there is no candidate or when the best
// candidate still lacks some required resource, and an error when job is not
// a normal job or references a resource job missing from the job set.
func (s *DefaultPreScheduler) scheduleForNormalJob(jobSet *schsdk.JobSetInfo, job *schedulingJob, ccs map[schsdk.CCID]schmod.ComputingCenter, jobSchemes map[string]jobmod.JobScheduleScheme) (*jobmod.JobScheduleScheme, error) {
	norJob, ok := job.Job.(*schsdk.NormalJobInfo)
	if !ok {
		return nil, fmt.Errorf("job %s is not a normal job", job.Job.GetLocalJobID())
	}

	// Without any candidate there is nothing to sort or pick from.
	if len(ccs) == 0 {
		return nil, ErrNoAvailableScheme
	}

	// Resolve once which computing centers were chosen for the jobs this job
	// references; this does not depend on the candidate being inspected.
	referencedTargets := make(map[schsdk.CCID]bool)
	for _, af := range job.Afters {
		resJob := findJobInfo[*schsdk.ResourceJobInfo](jobSet.Jobs, af)
		if resJob == nil {
			return nil, fmt.Errorf("resource job %s not found in the job set", af)
		}

		// Jobs are ordered by reference, so normally the scheme is already there.
		scheme, ok := jobSchemes[resJob.TargetLocalJobID]
		if !ok {
			continue
		}
		referencedTargets[scheme.TargetCCID] = true
	}

	// Initialize the candidate nodes.
	allCCs := make(map[schsdk.CCID]*candidate, len(ccs))
	for _, cc := range ccs {
		allCCs[cc.CCID] = &candidate{
			CC:                    cc,
			IsReferencedJobTarget: referencedTargets[cc.CCID],
		}
	}

	// Score how much of the job's files each node already holds.
	if err := s.calcFileScore(norJob.Files, allCCs); err != nil {
		return nil, err
	}

	// Score how much spare resource each node has.
	if err := s.calcResourceScore(norJob, allCCs); err != nil {
		return nil, err
	}

	allCCsArr := lo.Values(allCCs)
	sort.Sort(CandidateArr(allCCsArr))

	// The best candidate is first; if even that one is short on some
	// resource, no node can take the job right now.
	targetNode := allCCsArr[0]
	if targetNode.Resource.MaxLevel == ResourceLevel3 {
		return nil, ErrNoAvailableScheme
	}

	scheme := s.makeSchemeForNode(norJob, targetNode)
	return &scheme, nil
}
// fillNormarlJobLocalUploadScheme records, for each of the job's dataset, code
// and image that is a local file, an upload scheme targeting the CDS node of
// the chosen computing center. A local path that already has a scheme is left
// untouched.
func (s *DefaultPreScheduler) fillNormarlJobLocalUploadScheme(norJob *schsdk.NormalJobInfo, targetCCID schsdk.CCID, schemes map[string]schsdk.LocalFileUploadScheme, ccs map[schsdk.CCID]schmod.ComputingCenter) {
	// register adds the upload scheme for one local path unless it exists.
	register := func(localPath string) {
		if _, exists := schemes[localPath]; exists {
			return
		}
		// Each scheme gets its own copy of the node ID to point at.
		nodeID := ccs[targetCCID].CDSNodeID
		schemes[localPath] = schsdk.LocalFileUploadScheme{
			LocalPath:         localPath,
			UploadToCDSNodeID: &nodeID,
		}
	}

	if dataset, isLocal := norJob.Files.Dataset.(*schsdk.LocalJobFileInfo); isLocal {
		register(dataset.LocalPath)
	}
	if code, isLocal := norJob.Files.Code.(*schsdk.LocalJobFileInfo); isLocal {
		register(code.LocalPath)
	}
	if image, isLocal := norJob.Files.Image.(*schsdk.LocalJobFileInfo); isLocal {
		register(image.LocalPath)
	}
}
// makeSchemeForNode builds the schedule scheme that places job on targetCC.
// Each file defaults to no action; a file given as a package that is not yet
// loaded on the target gets a load action (import action for the image).
func (s *DefaultPreScheduler) makeSchemeForNode(job *schsdk.NormalJobInfo, targetCC *candidate) jobmod.JobScheduleScheme {
	scheme := jobmod.JobScheduleScheme{
		TargetCCID: targetCC.CC.CCID,
	}
	scheme.Dataset.Action = jobmod.ActionNo
	scheme.Code.Action = jobmod.ActionNo
	scheme.Image.Action = jobmod.ActionNo

	// TODO: choose between Move and Load according to the actual situation.
	if _, isPkg := job.Files.Dataset.(*schsdk.PackageJobFileInfo); isPkg && !targetCC.Files.Dataset.IsLoaded {
		scheme.Dataset.Action = jobmod.ActionLoad
	}
	if _, isPkg := job.Files.Code.(*schsdk.PackageJobFileInfo); isPkg && !targetCC.Files.Code.IsLoaded {
		scheme.Code.Action = jobmod.ActionLoad
	}
	// NOTE(review): the image is matched against PackageJobFileInfo here, while
	// calcFileScore matches it against ImageJobFileInfo; confirm this is intended.
	if _, isPkg := job.Files.Image.(*schsdk.PackageJobFileInfo); isPkg && !targetCC.Files.Image.IsLoaded {
		scheme.Image.Action = jobmod.ActionImportImage
	}

	return scheme
}
// calcResourceScore evaluates the job's resource requirements against every
// candidate and stores the result in the candidate's Resource field. It stops
// at the first evaluation error.
func (s *DefaultPreScheduler) calcResourceScore(job *schsdk.NormalJobInfo, allCCs map[schsdk.CCID]*candidate) error {
	for ccID := range allCCs {
		cand := allCCs[ccID]

		detail, err := s.calcOneResourceScore(job.Resources, &cand.CC)
		if err != nil {
			return err
		}
		cand.Resource = *detail
	}
	return nil
}
// calcOneResourceScore classifies the resource level of one computing center
// and computes its resource scores for the given requirements.
//
// Only resource kinds with a positive requirement are evaluated. For each of
// them the level comes from calcResourceLevel and the score is the
// available/required ratio multiplied by a weight; a kind the computing center
// does not report at all gets ResourceLevel3 and a score of 0. When nothing is
// required, a zero-valued detail is returned.
//
// It returns an error when no collector client can be acquired, when the
// resource query fails, or when an available storage/memory size cannot be
// parsed.
func (s *DefaultPreScheduler) calcOneResourceScore(requires schsdk.JobResourcesInfo, cc *schmod.ComputingCenter) (*resourcesDetail, error) {
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)

	getResDataResp, err := colCli.GetAllResourceData(collector.NewGetAllResourceData(cc.UOPSlwNodeID))
	if err != nil {
		return nil, err
	}

	var resDetail resourcesDetail

	// Aggregates over every evaluated resource kind.
	totalScore := 0.0
	maxLevel := 0
	resKinds := 0

	// tally folds one evaluated resource kind into the aggregates.
	tally := func(d resourceDetail) {
		maxLevel = math.Max(maxLevel, d.Level)
		totalScore += d.Score
		resKinds++
	}

	// A resource kind that is required but not reported is treated as insufficient.
	missing := resourceDetail{Level: ResourceLevel3, Score: 0}

	if requires.CPU > 0 {
		resDetail.CPU = missing
		if res := findResuorce[*uopsdk.CPUResourceData](getResDataResp.Datas); res != nil {
			resDetail.CPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.CPU)
			resDetail.CPU.Score = (float64(res.Available.Value) / requires.CPU) * CpuResourceWeight
		}
		tally(resDetail.CPU)
	}

	if requires.GPU > 0 {
		resDetail.GPU = missing
		if res := findResuorce[*uopsdk.GPUResourceData](getResDataResp.Datas); res != nil {
			resDetail.GPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.GPU)
			resDetail.GPU.Score = (float64(res.Available.Value) / requires.GPU) * CpuResourceWeight
		}
		tally(resDetail.GPU)
	}

	if requires.NPU > 0 {
		resDetail.NPU = missing
		if res := findResuorce[*uopsdk.NPUResourceData](getResDataResp.Datas); res != nil {
			resDetail.NPU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.NPU)
			resDetail.NPU.Score = (float64(res.Available.Value) / requires.NPU) * CpuResourceWeight
		}
		tally(resDetail.NPU)
	}

	if requires.MLU > 0 {
		resDetail.MLU = missing
		if res := findResuorce[*uopsdk.MLUResourceData](getResDataResp.Datas); res != nil {
			resDetail.MLU.Level = s.calcResourceLevel(float64(res.Available.Value), requires.MLU)
			resDetail.MLU.Score = (float64(res.Available.Value) / requires.MLU) * CpuResourceWeight
		}
		tally(resDetail.MLU)
	}

	if requires.Storage > 0 {
		resDetail.Storage = missing
		if res := findResuorce[*uopsdk.StorageResourceData](getResDataResp.Datas); res != nil {
			// The available amount comes as value + unit; normalize it to bytes.
			avail, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit))
			if err != nil {
				return nil, err
			}
			resDetail.Storage.Level = s.calcResourceLevel(float64(avail), float64(requires.Storage))
			resDetail.Storage.Score = (float64(avail) / float64(requires.Storage)) * StgResourceWeight
		}
		tally(resDetail.Storage)
	}

	if requires.Memory > 0 {
		resDetail.Memory = missing
		if res := findResuorce[*uopsdk.MemoryResourceData](getResDataResp.Datas); res != nil {
			// The available amount comes as value + unit; normalize it to bytes.
			avail, err := bytesize.Parse(fmt.Sprintf("%f%s", res.Available.Value, res.Available.Unit))
			if err != nil {
				return nil, err
			}
			resDetail.Memory.Level = s.calcResourceLevel(float64(avail), float64(requires.Memory))
			resDetail.Memory.Score = (float64(avail) / float64(requires.Memory)) * StgResourceWeight
		}
		tally(resDetail.Memory)
	}

	// Nothing was required: leave every aggregate at its zero value.
	if resKinds == 0 {
		return &resDetail, nil
	}

	resDetail.TotalScore = totalScore
	// The average must be derived from the accumulated total; dividing the
	// (still zero) AvgScore field by the count would always yield 0.
	resDetail.AvgScore = totalScore / float64(resKinds)
	resDetail.MaxLevel = maxLevel

	return &resDetail, nil
}
// calcResourceLevel maps an available amount against a required amount to a
// resource level: ResourceLevel1 for at least 1.5x the need, ResourceLevel2
// for at least 1x, ResourceLevel3 otherwise.
func (s *DefaultPreScheduler) calcResourceLevel(avai float64, need float64) int {
	switch {
	case avai >= 1.5*need:
		return ResourceLevel1
	case avai >= need:
		return ResourceLevel2
	default:
		return ResourceLevel3
	}
}
// calcFileScore computes, for every candidate, how much of the job's code,
// dataset and image is already present on it, and stores the per-file details
// and their sum in the candidate's Files field. Code and dataset are scored
// only when given as packages, the image only when given as an image
// reference.
func (s *DefaultPreScheduler) calcFileScore(files schsdk.JobFilesInfo, allCCs map[schsdk.CCID]*candidate) error {
	// Only storage nodes belonging to the available computing centers are
	// taken into account, so index the candidates by their CDS node ID.
	cdsNodeToCC := make(map[cdssdk.NodeID]*candidate)
	for _, cand := range allCCs {
		cdsNodeToCC[cand.CC.CDSNodeID] = cand
	}

	// Code scores.
	if codePkg, isPkg := files.Code.(*schsdk.PackageJobFileInfo); isPkg {
		scores, err := s.calcPackageFileScore(codePkg.PackageID, cdsNodeToCC)
		if err != nil {
			return fmt.Errorf("calc code file score: %w", err)
		}
		for ccID, detail := range scores {
			allCCs[ccID].Files.Code = *detail
		}
	}

	// Dataset scores.
	if datasetPkg, isPkg := files.Dataset.(*schsdk.PackageJobFileInfo); isPkg {
		scores, err := s.calcPackageFileScore(datasetPkg.PackageID, cdsNodeToCC)
		if err != nil {
			return fmt.Errorf("calc dataset file score: %w", err)
		}
		for ccID, detail := range scores {
			allCCs[ccID].Files.Dataset = *detail
		}
	}

	// Image scores.
	if image, isImage := files.Image.(*schsdk.ImageJobFileInfo); isImage {
		scores, err := s.calcImageFileScore(image.ImageID, allCCs, cdsNodeToCC)
		if err != nil {
			return fmt.Errorf("calc image file score: %w", err)
		}
		for ccID, detail := range scores {
			allCCs[ccID].Files.Image = *detail
		}
	}

	// Sum up the caching and loading scores of the three files.
	for _, cand := range allCCs {
		f := &cand.Files
		f.TotalScore = f.Code.CachingScore +
			f.Code.LoadingScore +
			f.Dataset.CachingScore +
			f.Dataset.LoadingScore +
			f.Image.CachingScore +
			f.Image.LoadingScore
	}

	return nil
}
// calcPackageFileScore computes the score of a package on each candidate node.
//
// The caching score of a node is the cached fraction of the package
// (FileSize / PackageSize) times CachingWeight; a node the package is loaded
// on additionally gets a loading score of LoadedWeight and IsLoaded set. Nodes
// that are not in cdsNodeToCC are ignored. The result is keyed by computing
// center ID and only contains centers with at least one of the two scores.
func (s *DefaultPreScheduler) calcPackageFileScore(packageID cdssdk.PackageID, cdsNodeToCC map[cdssdk.NodeID]*candidate) (map[schsdk.CCID]*fileDetail, error) {
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)

	ccFileScores := make(map[schsdk.CCID]*fileDetail)

	// TODO UserID
	cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(0, packageID))
	if err != nil {
		return nil, err
	}

	for _, cdsNodeCacheInfo := range cachedResp.NodeInfos {
		cc, ok := cdsNodeToCC[cdsNodeCacheInfo.NodeID]
		if !ok {
			continue
		}

		// TODO: different caching modes may need different formulas.
		// Guard against an empty package: dividing by a zero size would give
		// NaN/Inf, which would then corrupt the total score and the ranking.
		var cachingScore float64
		if cachedResp.PackageSize > 0 {
			cachingScore = float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight
		}

		ccFileScores[cc.CC.CCID] = &fileDetail{
			CachingScore: cachingScore,
		}
	}

	// TODO UserID
	loadedResp, err := colCli.PackageGetLoadedStgNodes(collector.NewPackageGetLoadedStgNodes(0, packageID))
	if err != nil {
		return nil, err
	}

	for _, cdsNodeID := range loadedResp.StgNodeIDs {
		cc, ok := cdsNodeToCC[cdsNodeID]
		if !ok {
			continue
		}

		// Reuse the entry created for the caching score, if any.
		sfc, ok := ccFileScores[cc.CC.CCID]
		if !ok {
			sfc = &fileDetail{}
			ccFileScores[cc.CC.CCID] = sfc
		}

		sfc.LoadingScore = 1 * LoadedWeight
		sfc.IsLoaded = true
	}

	return ccFileScores, nil
}
// calcImageFileScore computes the score of an image on each candidate node.
//
// If the image is backed by a CDS package, the caching score of a node is the
// cached fraction of that package (FileSize / PackageSize) times
// CachingWeight. The loading score reflects whether the image has already been
// imported into the computing center: such a center gets LoadedWeight and
// IsLoaded set. Nodes and centers outside the candidate set are ignored. The
// result is keyed by computing center ID.
func (s *DefaultPreScheduler) calcImageFileScore(imageID schsdk.ImageID, allCCs map[schsdk.CCID]*candidate, cdsNodeToCC map[cdssdk.NodeID]*candidate) (map[schsdk.CCID]*fileDetail, error) {
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)

	magCli, err := schglb.ManagerMQPool.Acquire()
	if err != nil {
		return nil, fmt.Errorf("new manager client: %w", err)
	}
	defer schglb.ManagerMQPool.Release(magCli)

	imageInfoResp, err := magCli.GetImageInfo(mgrmq.NewGetImageInfo(imageID))
	if err != nil {
		return nil, fmt.Errorf("getting image info: %w", err)
	}

	ccFileScores := make(map[schsdk.CCID]*fileDetail)

	if imageInfoResp.Image.CDSPackageID != nil {
		cachedResp, err := colCli.PackageGetCachedStgNodes(collector.NewPackageGetCachedStgNodes(0, *imageInfoResp.Image.CDSPackageID))
		if err != nil {
			return nil, err
		}

		for _, cdsNodeCacheInfo := range cachedResp.NodeInfos {
			cc, ok := cdsNodeToCC[cdsNodeCacheInfo.NodeID]
			if !ok {
				continue
			}

			// TODO: different caching modes may need different formulas.
			// Guard against an empty package: dividing by a zero size would
			// give NaN/Inf, which would corrupt the total score and ranking.
			var cachingScore float64
			if cachedResp.PackageSize > 0 {
				cachingScore = float64(cdsNodeCacheInfo.FileSize) / float64(cachedResp.PackageSize) * CachingWeight
			}

			ccFileScores[cc.CC.CCID] = &fileDetail{
				CachingScore: cachingScore,
			}
		}
	}

	// For an image, the loading score tells whether it has been imported into
	// the computing center.
	for _, pcmImg := range imageInfoResp.PCMImages {
		if _, ok := allCCs[pcmImg.CCID]; !ok {
			continue
		}

		// Reuse the entry created for the caching score, if any.
		fsc, ok := ccFileScores[pcmImg.CCID]
		if !ok {
			fsc = &fileDetail{}
			ccFileScores[pcmImg.CCID] = fsc
		}

		fsc.LoadingScore = 1 * LoadedWeight
		fsc.IsLoaded = true
	}

	return ccFileScores, nil
}
// findResuorce returns the first element of all whose dynamic type is T, or
// the zero value of T when there is none.
func findResuorce[T uopsdk.ResourceData](all []uopsdk.ResourceData) T {
	var zero T
	for i := range all {
		if typed, matches := all[i].(T); matches {
			return typed
		}
	}
	return zero
}
// findJobInfo returns the first job in jobs whose dynamic type is T and whose
// local job ID equals localJobID, or the zero value of T when there is none.
func findJobInfo[T schsdk.JobInfo](jobs []schsdk.JobInfo, localJobID string) T {
	var zero T
	for i := range jobs {
		typed, matches := jobs[i].(T)
		// Check the type first so the ID is only read from jobs of type T.
		if !matches || jobs[i].GetLocalJobID() != localJobID {
			continue
		}
		return typed
	}
	return zero
}