package jobmgr

import (
	"context"
	"fmt"
	"reflect"
	"sync"
	"time"

	"gitlink.org.cn/cloudream/common/pkgs/logger"
	schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
	cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
	myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
	jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
	"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
	advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
	exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
	"gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr"
	"gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr"
	"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)

// mgrJob pairs a job with the state handler currently responsible for it.
type mgrJob struct {
	Job     jobmod.Job
	Handler StateHandler
}

// Manager owns the in-memory job and job-set state and dispatches each job to
// the handler that matches its current state.
type Manager struct {
	// Any operation that modifies a job or a job set must acquire this lock.
	pubLock sync.Mutex

	execMgr *executormgr.Manager
	advMgr  *advisormgr.Manager
	db      *db.DB

	handlers       map[reflect.Type]StateHandler
	defaultHandler StateHandler

	jobSetIDIndex int
	jobSets       map[schsdk.JobSetID]*jobmod.JobSet
	jobIDIndex    int
	jobs          map[schsdk.JobID]*mgrJob
}

// NewManager creates a job Manager, registers callbacks on the executor and
// advisor managers, and wires a StateHandler for every known job state.
func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) {
	mgr := &Manager{
		execMgr: execMgr,
		advMgr:  advMgr,
		db:      db,

		handlers: make(map[reflect.Type]StateHandler),
		jobSets:  make(map[schsdk.JobSetID]*jobmod.JobSet),
		jobs:     make(map[schsdk.JobID]*mgrJob),
	}

	execMgr.OnTaskUpdated(mgr.executorTaskUpdated)
	execMgr.OnTaskTimeout(mgr.executorTaskTimeout)

	advMgr.OnTaskUpdated(mgr.advisorTaskUpdated)
	advMgr.OnTaskTimeout(mgr.advisorTaskTimeout)

	// TODO: consider optimizing this registration logic.

	mgr.handlers[myreflect.TypeOf[*jobmod.StatePreScheduling]()] = NewPreSchedulingHandler(mgr)
	mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToAdjust]()] = NewReadyToAdjustHandler(mgr)
	mgr.handlers[myreflect.TypeOf[*jobmod.StateMakingAdjustScheme]()] = NewMakingAdjustSchemeHandler(mgr)
	mgr.handlers[myreflect.TypeOf[*jobmod.StateAdjusting]()] = NewAdjustingHandler(mgr)
	mgr.handlers[myreflect.TypeOf[*jobmod.StateReadyToExecute]()] = NewReadyToExecuteHandler(mgr)
	mgr.handlers[myreflect.TypeOf[*jobmod.StateExecuting]()] = NewExecutingHandler(mgr)

	compHder := NewCompleteHandler(mgr)
	mgr.handlers[myreflect.TypeOf[*jobmod.StateFailed]()] = compHder
	mgr.handlers[myreflect.TypeOf[*jobmod.StateSuccess]()] = compHder

	mgr.defaultHandler = NewDefaultHandler(mgr)

	return mgr, nil
}
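
// exampleRunManager is an illustrative sketch added for documentation purposes and is not
// part of the original source: it shows the expected wiring of a Manager, assuming the
// executor manager, advisor manager and DB handle have already been constructed by their
// own packages. Serve blocks on its ticker loop, so it is started in a separate goroutine;
// Stop would be called during shutdown.
func exampleRunManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, database *db.DB) (*Manager, error) {
	mgr, err := NewManager(execMgr, advMgr, database)
	if err != nil {
		return nil, fmt.Errorf("creating job manager: %w", err)
	}

	go func() {
		// Serve runs the state handlers and the periodic empty-event ticker; with the
		// current implementation it only returns when the process exits.
		if err := mgr.Serve(); err != nil {
			logger.WithField("Component", "JobManager").Debugf("serve stopped: %s", err.Error())
		}
	}()

	return mgr, nil
}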

// Serve starts every registered state handler (plus the default handler) in its
// own goroutine, then blocks, broadcasting an empty event once per minute.
func (m *Manager) Serve() error {
	for _, h := range m.handlers {
		go h.Serve()
	}

	go m.defaultHandler.Serve()

	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			// Emit an empty event every minute to prevent handlers from waiting forever.
			m.pubLock.Lock()
			m.onEvent(event.ToAll(), nil)
			m.pubLock.Unlock()
		}
	}

	return nil
}

// Stop stops every registered state handler as well as the default handler.
func (m *Manager) Stop() {
	for _, h := range m.handlers {
		h.Stop()
	}

	m.defaultHandler.Stop()
}

// SubmitJobSet registers the jobs described by jobSetInfo, assigns job-set and
// job IDs, puts each job into its initial state, and hands it to the matching
// state handler. Every NormalJobInfo must have a corresponding entry in
// preScheduleScheme.JobSchemes.
func (m *Manager) SubmitJobSet(jobSetInfo schsdk.JobSetInfo, preScheduleScheme jobmod.JobSetPreScheduleScheme) (*jobmod.JobSet, error) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))

	var jobs []jobmod.Job
	var normalJobs []*jobmod.NormalJob
	var resJobs []*jobmod.ResourceJob
	var jobRefs []jobmod.JobSetJobRef
	for i, jobInfo := range jobSetInfo.Jobs {
		jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))

		switch info := jobInfo.(type) {
		case *schsdk.NormalJobInfo:
			job := jobmod.NewNormalJob(jobSetID, jobID, *info)
			jobs = append(jobs, job)
			normalJobs = append(normalJobs, job)
			jobRefs = append(jobRefs, jobmod.JobSetJobRef{
				LocalJobID: info.LocalJobID,
				JobID:      jobID,
			})

			preSch, ok := preScheduleScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}

			job.State = jobmod.NewStatePreScheduling(preSch)
			job.TargetCCID = preSch.TargetCCID

		case *schsdk.ResourceJobInfo:
			job := jobmod.NewResourceJob(jobSetID, jobID, *info)
			jobs = append(jobs, job)
			resJobs = append(resJobs, job)
			jobRefs = append(jobRefs, jobmod.JobSetJobRef{
				LocalJobID: info.LocalJobID,
				JobID:      jobID,
			})

			// Resource (data pull-back) jobs need no pre-scheduling, so they go
			// straight into the ready-to-adjust state.
			job.State = jobmod.NewStateReadyToAdjust()
		}
	}

	// TODO: consider validating that the LocalJobIDs referenced as dependencies
	// in the job infos are actually valid.

	jobSet := jobmod.NewJobSet(jobSetID, jobRefs, preScheduleScheme)
	m.jobSets[jobSetID] = jobSet
	for _, job := range jobs {
		m.jobs[job.GetJobID()] = &mgrJob{
			Job: job,
		}

		m.handleState(job)
	}

	m.jobSetIDIndex += 1
	m.jobIDIndex += len(jobSetInfo.Jobs)

	return jobSet, nil
}
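
// exampleSubmitJobSet is an illustrative sketch and not part of the original source: it
// shows the expected calling pattern for SubmitJobSet, assuming the caller has already
// built the JobSetInfo and a pre-schedule scheme containing one entry in JobSchemes for
// each normal job's LocalJobID (resource jobs need none, as they skip pre-scheduling).
func exampleSubmitJobSet(mgr *Manager, info schsdk.JobSetInfo, scheme jobmod.JobSetPreScheduleScheme) (*jobmod.JobSet, error) {
	// SubmitJobSet takes the manager's lock itself, so the caller does not lock anything.
	jobSet, err := mgr.SubmitJobSet(info, scheme)
	if err != nil {
		return nil, fmt.Errorf("submitting job set: %w", err)
	}

	return jobSet, nil
}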

// LocalFileUploaded notifies every handler of the given job set that a local
// file has finished uploading (the err string carries any upload error message).
func (m *Manager) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) error {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	for _, h := range m.handlers {
		h.OnEvent(event.ToJobSet(jobSetID), event.NewLocalFileUploaded(jobSetID, localPath, err, packageID))
	}

	return nil
}

// executorTaskUpdated, executorTaskTimeout, advisorTaskUpdated and
// advisorTaskTimeout are the callbacks registered in NewManager: they forward
// task events to the handler currently owning the affected job.
func (m *Manager) executorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus exectsk.TaskStatus) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	job, ok := m.jobs[jobID]
	if !ok {
		return
	}

	job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskUpdated(fullTaskID, taskStatus))
}

func (m *Manager) executorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	job, ok := m.jobs[jobID]
	if !ok {
		return
	}

	job.Handler.OnEvent(event.ToJob(jobID), event.NewExecutorTaskTimeout(fullTaskID))
}

func (m *Manager) advisorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	job, ok := m.jobs[jobID]
	if !ok {
		return
	}

	job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskUpdated(fullTaskID, taskStatus))
}

func (m *Manager) advisorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	job, ok := m.jobs[jobID]
	if !ok {
		return
	}

	job.Handler.OnEvent(event.ToJob(jobID), event.NewAdvisorTaskTimeout(fullTaskID))
}

// CloneJob asks the handler that currently owns the job to produce a copy of
// it. The call blocks until the handler fulfills the event's callback.
func (m *Manager) CloneJob(jobID schsdk.JobID) (jobmod.Job, error) {
	m.pubLock.Lock()

	job, ok := m.jobs[jobID]
	if !ok {
		m.pubLock.Unlock()
		return nil, fmt.Errorf("job not found")
	}

	evt := event.NewCloneJob()
	job.Handler.OnEvent(event.ToJob(jobID), evt)
	m.pubLock.Unlock()

	return evt.Callback.WaitValue(context.Background())
}
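
// exampleInspectJob is an illustrative sketch and not part of the original source: CloneJob
// blocks until the job's current handler fulfills the event callback with a copy of the job,
// which can then be inspected without holding the manager's lock. Only accessors visible in
// this file (GetJobID, GetState) are used.
func exampleInspectJob(mgr *Manager, jobID schsdk.JobID) error {
	cloned, err := mgr.CloneJob(jobID)
	if err != nil {
		return fmt.Errorf("cloning job %s: %w", jobID, err)
	}

	if state := cloned.GetState(); state != nil {
		logger.WithField("JobID", cloned.GetJobID()).
			WithField("State", reflect.TypeOf(state).String()).
			Debugf("job snapshot")
	}

	return nil
}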

// handleState picks the handler matching the job's current state (falling back
// to the default handler) and gives it the job. The caller must hold pubLock.
func (m *Manager) handleState(job jobmod.Job) {
	logger.WithField("JobID", job.GetJobID()).
		WithField("State", reflect.TypeOf(job.GetState()).String()).
		Debugf("job state changed")

	runtime, ok := m.jobs[job.GetJobID()]
	if !ok {
		return
	}

	state := job.GetState()
	if state == nil {
		runtime.Handler = m.defaultHandler
		m.defaultHandler.Handle(job)
		return
	}

	stateType := reflect.TypeOf(state)
	handler, ok := m.handlers[stateType]
	if !ok {
		runtime.Handler = m.defaultHandler
		m.defaultHandler.Handle(job)
		return
	}

	runtime.Handler = handler
	handler.Handle(job)
}

// onEvent broadcasts an event to every registered state handler.
func (m *Manager) onEvent(broadcast event.Broadcast, evt event.Event) {
	for _, h := range m.handlers {
		h.OnEvent(broadcast, evt)
	}
}