JCC-CSScheduler/manager/internal/jobmgr/jobmgr.go

284 lines
7.3 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package jobmgr
import (
"context"
"fmt"
"reflect"
"sync"
"time"
"gitlink.org.cn/cloudream/common/pkgs/logger"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
cdssdk "gitlink.org.cn/cloudream/common/sdks/storage"
myreflect "gitlink.org.cn/cloudream/common/utils/reflect"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
"gitlink.org.cn/cloudream/scheduler/common/pkgs/db"
advtsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/advisor/task"
exectsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/manager/internal/advisormgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/executormgr"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// mgrJob is the manager's bookkeeping entry for a single job: the job itself
// plus the state handler currently responsible for it.
type mgrJob struct {
// Job is the managed job.
Job jobmod.Job
// Handler is the handler chosen for the job's current state. It is assigned
// by Manager.handleState and receives the job-targeted events.
Handler StateHandler
}
// Manager tracks submitted job sets and jobs and routes each job, and the
// events addressed to it, to the handler registered for the job's state type.
type Manager struct {
// Any operation that modifies a job or a job set must hold this lock.
pubLock sync.Mutex
execMgr *executormgr.Manager
advMgr *advisormgr.Manager
db *db.DB
// handlers maps the dynamic type of a job state to the handler for it.
handlers map[reflect.Type]StateHandler
// defaultHandler is used when a job has no state or no handler is
// registered for its state type.
defaultHandler StateHandler
// jobSetIDIndex is the number used to form the next job set ID.
jobSetIDIndex int
jobSets map[schsdk.JobSetID]*jobmod.JobSet
// jobIDIndex is the base number used to form the IDs of the next submitted jobs.
jobIDIndex int
jobs map[schsdk.JobID]*mgrJob
}
// NewManager builds a Manager wired to the given executor manager, advisor
// manager and database. It subscribes to task-updated and task-timeout
// callbacks of both managers and registers one handler per known job state
// type. The returned error is always nil.
func NewManager(execMgr *executormgr.Manager, advMgr *advisormgr.Manager, db *db.DB) (*Manager, error) {
	mgr := &Manager{
		execMgr:  execMgr,
		advMgr:   advMgr,
		db:       db,
		handlers: make(map[reflect.Type]StateHandler),
		jobSets:  make(map[schsdk.JobSetID]*jobmod.JobSet),
		jobs:     make(map[schsdk.JobID]*mgrJob),
	}

	// Route executor/advisor task notifications into the manager.
	execMgr.OnTaskUpdated(mgr.executorTaskUpdated)
	execMgr.OnTaskTimeout(mgr.executorTaskTimeout)
	advMgr.OnTaskUpdated(mgr.advisorTaskUpdated)
	advMgr.OnTaskTimeout(mgr.advisorTaskTimeout)

	// TODO: consider optimizing this part of the logic.
	register := func(stateType reflect.Type, h StateHandler) {
		mgr.handlers[stateType] = h
	}
	register(myreflect.TypeOf[*jobmod.StatePreScheduling](), NewPreSchedulingHandler(mgr))
	register(myreflect.TypeOf[*jobmod.StateReadyToAdjust](), NewReadyToAdjustHandler(mgr))
	register(myreflect.TypeOf[*jobmod.StateMakingAdjustScheme](), NewMakingAdjustSchemeHandler(mgr))
	register(myreflect.TypeOf[*jobmod.StateAdjusting](), NewAdjustingHandler(mgr))
	register(myreflect.TypeOf[*jobmod.StateReadyToExecute](), NewReadyToExecuteHandler(mgr))
	register(myreflect.TypeOf[*jobmod.StateExecuting](), NewExecutingHandler(mgr))

	// Both terminal states share a single handler instance, so loops over
	// mgr.handlers visit that instance once per registered key.
	completeHandler := NewCompleteHandler(mgr)
	register(myreflect.TypeOf[*jobmod.StateFailed](), completeHandler)
	register(myreflect.TypeOf[*jobmod.StateSuccess](), completeHandler)

	mgr.defaultHandler = NewDefaultHandler(mgr)

	return mgr, nil
}
// Serve starts every registered handler and the default handler in their own
// goroutines, then blocks, broadcasting an empty (nil) event to all handlers
// once a minute.
//
// The loop only ends if the ticker channel is closed, which this function
// never does, so in practice Serve does not return.
func (m *Manager) Serve() error {
	for _, h := range m.handlers {
		go h.Serve()
	}
	go m.defaultHandler.Serve()

	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	// Emit an empty event every minute so handlers never wait indefinitely.
	for range ticker.C {
		m.pubLock.Lock()
		m.onEvent(event.ToAll(), nil)
		m.pubLock.Unlock()
	}

	return nil
}
// Stop calls Stop on every entry of the handler table and then on the
// default handler.
func (m *Manager) Stop() {
	for _, registered := range m.handlers {
		registered.Stop()
	}

	m.defaultHandler.Stop()
}
// SubmitJobSet creates a job set from jobSetInfo, registers all of its jobs
// with the manager and hands each job to the handler for its initial state.
//
// Job set and job IDs are decimal strings derived from the manager's running
// counters. Normal jobs start in the pre-scheduling state using their entry
// in preScheduleScheme.JobSchemes; resource jobs start in the ready-to-adjust
// state. Job infos of any other type are skipped, although their position
// still consumes a job ID.
//
// An error is returned, and nothing is registered, when a normal job has no
// pre-schedule scheme.
func (m *Manager) SubmitJobSet(jobSetInfo schsdk.JobSetInfo, preScheduleScheme jobmod.JobSetPreScheduleScheme) (*jobmod.JobSet, error) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	jobSetID := schsdk.JobSetID(fmt.Sprintf("%d", m.jobSetIDIndex))

	var jobs []jobmod.Job
	var jobRefs []jobmod.JobSetJobRef

	for i, jobInfo := range jobSetInfo.Jobs {
		jobID := schsdk.JobID(fmt.Sprintf("%d", m.jobIDIndex+i))

		switch info := jobInfo.(type) {
		case *schsdk.NormalJobInfo:
			job := jobmod.NewNormalJob(jobSetID, jobID, *info)
			jobs = append(jobs, job)
			jobRefs = append(jobRefs, jobmod.JobSetJobRef{
				LocalJobID: info.LocalJobID,
				JobID:      jobID,
			})

			preSch, ok := preScheduleScheme.JobSchemes[info.LocalJobID]
			if !ok {
				return nil, fmt.Errorf("pre schedule scheme for job %s is not found", info.LocalJobID)
			}

			job.State = jobmod.NewStatePreScheduling(preSch)
			job.TargetCCID = preSch.TargetCCID

		case *schsdk.ResourceJobInfo:
			job := jobmod.NewResourceJob(jobSetID, jobID, *info)
			jobs = append(jobs, job)
			jobRefs = append(jobRefs, jobmod.JobSetJobRef{
				LocalJobID: info.LocalJobID,
				JobID:      jobID,
			})

			// Resource jobs need no pre-scheduling, so they go straight to
			// the ready-to-adjust state.
			job.State = jobmod.NewStateReadyToAdjust()
		}
	}

	// TODO: consider validating that the LocalJobIDs referenced by job
	// dependencies actually exist in this job set.

	jobSet := jobmod.NewJobSet(jobSetID, jobRefs, preScheduleScheme)
	m.jobSets[jobSetID] = jobSet

	// Shared state is only touched once every job has been built, so the
	// early error return above leaves the manager unchanged.
	for _, job := range jobs {
		m.jobs[job.GetJobID()] = &mgrJob{
			Job: job,
		}
		m.handleState(job)
	}

	m.jobSetIDIndex += 1
	m.jobIDIndex += len(jobSetInfo.Jobs)

	return jobSet, nil
}
// LocalFileUploaded notifies every registered handler, addressed to the given
// job set, that the upload of localPath has finished. err carries the upload
// error text and packageID the resulting package. It always returns nil.
func (m *Manager) LocalFileUploaded(jobSetID schsdk.JobSetID, localPath string, err string, packageID cdssdk.PackageID) error {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	for _, handler := range m.handlers {
		// A fresh broadcast target and event are built for each handler.
		handler.OnEvent(
			event.ToJobSet(jobSetID),
			event.NewLocalFileUploaded(jobSetID, localPath, err, packageID),
		)
	}

	return nil
}
// executorTaskUpdated forwards an executor task status update to the handler
// currently assigned to the job. Updates for unknown jobs are dropped.
func (m *Manager) executorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus exectsk.TaskStatus) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	if entry, found := m.jobs[jobID]; found {
		entry.Handler.OnEvent(
			event.ToJob(jobID),
			event.NewExecutorTaskUpdated(fullTaskID, taskStatus),
		)
	}
}
// executorTaskTimeout forwards an executor task timeout to the handler
// currently assigned to the job. Timeouts for unknown jobs are dropped.
func (m *Manager) executorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	if entry, found := m.jobs[jobID]; found {
		entry.Handler.OnEvent(
			event.ToJob(jobID),
			event.NewExecutorTaskTimeout(fullTaskID),
		)
	}
}
// advisorTaskUpdated forwards an advisor task status update to the handler
// currently assigned to the job. Updates for unknown jobs are dropped.
func (m *Manager) advisorTaskUpdated(jobID schsdk.JobID, fullTaskID string, taskStatus advtsk.TaskStatus) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	if entry, found := m.jobs[jobID]; found {
		entry.Handler.OnEvent(
			event.ToJob(jobID),
			event.NewAdvisorTaskUpdated(fullTaskID, taskStatus),
		)
	}
}
// advisorTaskTimeout forwards an advisor task timeout to the handler
// currently assigned to the job. Timeouts for unknown jobs are dropped.
func (m *Manager) advisorTaskTimeout(jobID schsdk.JobID, fullTaskID string) {
	m.pubLock.Lock()
	defer m.pubLock.Unlock()

	if entry, found := m.jobs[jobID]; found {
		entry.Handler.OnEvent(
			event.ToJob(jobID),
			event.NewAdvisorTaskTimeout(fullTaskID),
		)
	}
}
// CloneJob asks the handler currently assigned to jobID for a copy of the job
// and blocks until the handler answers through the event's callback.
//
// It returns an error if the job is not known to the manager, or whatever
// error the callback wait reports.
func (m *Manager) CloneJob(jobID schsdk.JobID) (jobmod.Job, error) {
	evt := event.NewCloneJob()

	// Deliver the event under the lock, but release the lock before waiting
	// for the result. The deferred unlock guarantees the lock is released
	// even if the handler panics.
	delivered := func() bool {
		m.pubLock.Lock()
		defer m.pubLock.Unlock()

		job, ok := m.jobs[jobID]
		if !ok {
			return false
		}

		job.Handler.OnEvent(event.ToJob(jobID), evt)
		return true
	}()
	if !delivered {
		return nil, fmt.Errorf("job not found")
	}

	return evt.Callback.WaitValue(context.Background())
}
// handleState picks the handler matching the job's current state, records it
// as the job's handler and passes the job to it. Jobs with no state, or with a
// state type that has no registered handler, go to the default handler. Jobs
// unknown to the manager are ignored.
//
// The caller must hold pubLock.
func (m *Manager) handleState(job jobmod.Job) {
	state := job.GetState()

	// reflect.TypeOf returns a nil Type for a nil interface value, and calling
	// String on it panics, so guard the nil state before logging.
	stateName := "<nil>"
	if state != nil {
		stateName = reflect.TypeOf(state).String()
	}

	logger.WithField("JobID", job.GetJobID()).
		WithField("State", stateName).
		Debugf("job state changed")

	runtime, ok := m.jobs[job.GetJobID()]
	if !ok {
		return
	}

	handler := m.defaultHandler
	if state != nil {
		if registered, found := m.handlers[reflect.TypeOf(state)]; found {
			handler = registered
		}
	}

	runtime.Handler = handler
	handler.Handle(job)
}
// onEvent delivers evt, with the given broadcast target, to every entry of
// the handler table. The default handler is not included.
func (m *Manager) onEvent(broadcast event.Broadcast, evt event.Event) {
	for _, registered := range m.handlers {
		registered.OnEvent(broadcast, evt)
	}
}