JCC-CSScheduler/manager/internal/jobmgr/executing_handler.go

265 lines
7.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package jobmgr
import (
"fmt"
"reflect"
"gitlink.org.cn/cloudream/common/pkgs/actor"
"gitlink.org.cn/cloudream/common/pkgs/logger"
pcmsdk "gitlink.org.cn/cloudream/common/sdks/pcm"
schsdk "gitlink.org.cn/cloudream/common/sdks/scheduler"
schglb "gitlink.org.cn/cloudream/scheduler/common/globals"
jobmod "gitlink.org.cn/cloudream/scheduler/common/models/job"
exetsk "gitlink.org.cn/cloudream/scheduler/common/pkgs/mq/executor/task"
"gitlink.org.cn/cloudream/scheduler/common/utils"
"gitlink.org.cn/cloudream/scheduler/manager/internal/jobmgr/event"
)
// executingJob pairs a job handled by ExecutingHandler with its executing
// state, so the state does not need to be type-asserted again on every event.
type executingJob struct {
	// job is the job being executed.
	job jobmod.Job
	// state is the job's state, already asserted to *jobmod.StateExecuting.
	state *jobmod.StateExecuting
}
// ExecutingHandler drives jobs that are in the executing state: it starts the
// executor task for each job and reacts to the events reported for it.
type ExecutingHandler struct {
	// mgr is the owning job manager.
	mgr *Manager
	// jobs holds the jobs currently tracked by this handler, keyed by job ID.
	jobs map[schsdk.JobID]*executingJob
	// cmdChan carries the closures submitted by Handle and OnEvent; Serve
	// receives from it and runs them.
	cmdChan actor.CommandChannel
}
// NewExecutingHandler creates an ExecutingHandler bound to mgr, with an empty
// job table and a fresh command channel.
func NewExecutingHandler(mgr *Manager) *ExecutingHandler {
	handler := &ExecutingHandler{mgr: mgr}
	handler.jobs = make(map[schsdk.JobID]*executingJob)
	handler.cmdChan = *actor.NewCommandChannel()
	return handler
}
// Handle registers job with this handler. The work is submitted through the
// command channel rather than done inline.
//
// A job whose state is not *jobmod.StateExecuting is moved to the failed
// state; otherwise it is stored in h.jobs and processed once with a nil event.
func (h *ExecutingHandler) Handle(job jobmod.Job) {
	h.cmdChan.Send(func() {
		current := job.GetState()

		execState, isExecuting := current.(*jobmod.StateExecuting)
		if !isExecuting {
			reason := fmt.Sprintf("unknow state: %v", reflect.TypeOf(current))
			h.changeJobState(job, jobmod.NewStateFailed(reason, current))
			return
		}

		tracked := &executingJob{job: job, state: execState}
		h.jobs[job.GetJobID()] = tracked

		// A nil event means "no external event yet".
		h.onJobEvent(nil, tracked)
	})
}
// onJobEvent dispatches evt for job.
//
// A *event.CloneJob event is answered with a clone of the job and nothing
// else happens. Any other event (including nil) is forwarded to the handler
// matching the job's concrete type; jobs of other types are ignored.
func (h *ExecutingHandler) onJobEvent(evt event.Event, job *executingJob) {
	if clone, isClone := evt.(*event.CloneJob); isClone {
		clone.Callback.SetValue(job.job.Clone())
		return
	}

	switch concrete := job.job.(type) {
	case *jobmod.NormalJob:
		h.onNormalJobEvent(evt, job, concrete)
	case *jobmod.ResourceJob:
		h.onResourceJobEvent(evt, job, concrete)
	}
}
// onNormalJobEvent advances a normal job.
//
// If no executor task has been started yet (FullTaskID is empty) it submits
// one. It then inspects evt: events that do not concern the job's task are
// ignored, a timeout or any other error fails the job, and a reported PCM
// status of success/failed moves the job to the matching final state. Other
// statuses are only logged.
func (h *ExecutingHandler) onNormalJobEvent(evt event.Event, job *executingJob, norJob *jobmod.NormalJob) {
	if job.state.FullTaskID == "" {
		if err := h.startNormalJobTask(job, norJob); err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
			return
		}
	}

	execRet, err := event.AssertExecutorTaskStatus[*exetsk.SubmitTaskStatus](evt, job.state.FullTaskID)
	switch {
	case err == event.ErrUnconcernedTask:
		return
	case err == event.ErrTaskTimeout:
		h.changeJobState(job.job, jobmod.NewStateFailed("schedule task timeout", job.state))
		return
	case err != nil:
		// Any other error means no usable status was returned, so execRet
		// must not be dereferenced. The job leaves this handler, so the task
		// is forgotten as well.
		h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}

	logger.WithField("JobID", job.job.GetJobID()).
		WithField("State", reflect.TypeOf(job.state).String()).
		Infof("pcm task state change to: %s", execRet.Status)

	switch execRet.Status {
	case pcmsdk.TaskStatusSuccess:
		h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
		h.changeJobState(job.job, jobmod.NewStateSuccess())
	case pcmsdk.TaskStatuFailed:
		h.mgr.execMgr.ForgetTask(job.state.FullTaskID)
		h.changeJobState(job.job, jobmod.NewStateFailed(execRet.Error, job.state))
	}
}

// startNormalJobTask looks up the image, computing center and resource info
// for norJob, submits the executor task and records its ID in
// job.state.FullTaskID. The returned error's message is the failure reason to
// store on the job.
func (h *ExecutingHandler) startNormalJobTask(job *executingJob, norJob *jobmod.NormalJob) error {
	pcmImgInfo, err := h.mgr.db.PCMImage().GetByImageIDAndCCID(h.mgr.db.SQLCtx(), norJob.Files.Image.ImageID, norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting pcm image info: %w", err)
	}

	ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	// TODO: environment variables such as DATA_IN and DATA_OUT need to be
	// added; their values should come from the job's information.
	ress, err := h.mgr.db.CCResource().GetByCCID(h.mgr.db.SQLCtx(), norJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center resource info: %w", err)
	}
	if len(ress) == 0 {
		return fmt.Errorf("there is no resource at computing center %v", norJob.TargetCCID)
	}

	fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(),
		exetsk.NewSubmitTask(
			ccInfo.PCMParticipantID,
			pcmImgInfo.PCMImageID,
			// TODO: resource selection algorithm; the first resource is used for now.
			ress[0].PCMResourceID,
			norJob.Info.Runtime.Command,
			norJob.Info.Runtime.Envs,
		))
	if err != nil {
		return err
	}

	job.state.FullTaskID = fullTaskID
	return nil
}
// onResourceJobEvent advances a resource job.
//
// If no executor task has been started yet (FullTaskID is empty) it starts a
// storage-create-package task for the output of the job's target normal job.
// It then inspects evt: events that do not concern the job's task are
// ignored, a timeout or any other error fails the job, and a reported result
// either fails the job (non-empty Error) or stores the created package ID on
// resJob and marks the job successful.
func (h *ExecutingHandler) onResourceJobEvent(evt event.Event, job *executingJob, resJob *jobmod.ResourceJob) {
	if job.state.FullTaskID == "" {
		if err := h.startResourceJobTask(job, resJob); err != nil {
			h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
			return
		}
	}

	createRet, err := event.AssertExecutorTaskStatus[*exetsk.StorageCreatePackageStatus](evt, job.state.FullTaskID)
	if err == event.ErrUnconcernedTask {
		return
	}
	if err == event.ErrTaskTimeout {
		h.changeJobState(job.job, jobmod.NewStateFailed("storage create package timeout", job.state))
		return
	}

	h.mgr.execMgr.ForgetTask(job.state.FullTaskID)

	if err != nil {
		// Any other error means no usable status was returned, so createRet
		// must not be dereferenced.
		h.changeJobState(job.job, jobmod.NewStateFailed(err.Error(), job.state))
		return
	}

	if createRet.Error != "" {
		h.changeJobState(job.job, jobmod.NewStateFailed(createRet.Error, job.state))
		return
	}

	resJob.ResourcePackageID = createRet.PackageID
	h.changeJobState(job.job, jobmod.NewStateSuccess())
}

// startResourceJobTask resolves the target normal job of resJob, starts the
// storage-create-package task and records its ID in job.state.FullTaskID. The
// returned error's message is the failure reason to store on the job.
func (h *ExecutingHandler) startResourceJobTask(job *executingJob, resJob *jobmod.ResourceJob) error {
	tarNorJob, err := h.findTargetNormalJob(resJob)
	if err != nil {
		return err
	}

	// NOTE(review): the collector client is acquired but never used here. The
	// acquisition is kept so a pool failure still fails the job as before;
	// confirm whether it can be dropped.
	colCli, err := schglb.CollectorMQPool.Acquire()
	if err != nil {
		return fmt.Errorf("new collector client: %w", err)
	}
	defer schglb.CollectorMQPool.Release(colCli)

	ccInfo, err := h.mgr.db.ComputingCenter().GetByID(h.mgr.db.SQLCtx(), tarNorJob.TargetCCID)
	if err != nil {
		return fmt.Errorf("getting computing center info: %w", err)
	}

	fullTaskID, err := h.mgr.execMgr.StartTask(job.job.GetJobID(), exetsk.NewStorageCreatePackage(
		0, // TODO: user ID
		ccInfo.CDSStorageID,
		tarNorJob.OutputFullPath,
		resJob.Info.BucketID,
		utils.MakeResourcePackageName(resJob.JobID),
	))
	if err != nil {
		return err
	}

	job.state.FullTaskID = fullTaskID
	return nil
}

// findTargetNormalJob looks up, under the manager's pubLock, the job that
// resJob refers to through its job set and TargetLocalJobID, and returns it as
// a *jobmod.NormalJob. The lock is released before returning, so callers may
// call changeJobState (which takes the same lock) afterwards.
func (h *ExecutingHandler) findTargetNormalJob(resJob *jobmod.ResourceJob) (*jobmod.NormalJob, error) {
	h.mgr.pubLock.Lock()
	defer h.mgr.pubLock.Unlock()

	jobSet, ok := h.mgr.jobSets[resJob.GetJobSetID()]
	if !ok {
		return nil, fmt.Errorf("job set %s not found", resJob.GetJobSetID())
	}

	ref := jobSet.FindRefByLocalJobID(resJob.Info.TargetLocalJobID)
	if ref == nil {
		return nil, fmt.Errorf("job %s not found in job set %s",
			resJob.Info.TargetLocalJobID,
			resJob.GetJobSetID())
	}

	targetJob, ok := h.mgr.jobs[ref.JobID]
	if !ok {
		return nil, fmt.Errorf("job %s not found", ref.JobID)
	}

	tarNorJob, ok := targetJob.Job.(*jobmod.NormalJob)
	if !ok {
		// Report the type of the wrapped job, not of the manager's wrapper.
		return nil, fmt.Errorf("job(%v) %s is not a Normal job", reflect.TypeOf(targetJob.Job), ref.JobID)
	}

	return tarNorJob, nil
}
// changeJobState sets state on job, stops tracking the job in this handler
// and passes it to the manager's handleState while holding pubLock.
//
// It acquires h.mgr.pubLock itself, so callers must not hold that lock.
func (h *ExecutingHandler) changeJobState(job jobmod.Job, state jobmod.JobState) {
	job.SetState(state)
	delete(h.jobs, job.GetJobID())

	manager := h.mgr
	manager.pubLock.Lock()
	manager.handleState(job)
	manager.pubLock.Unlock()
}
// OnEvent delivers evt to the tracked jobs selected by broadcast: every job,
// the jobs of one job set, or a single job. The delivery is submitted through
// the command channel rather than done inline.
func (h *ExecutingHandler) OnEvent(broadcast event.Broadcast, evt event.Event) {
	h.cmdChan.Send(func() {
		switch {
		case broadcast.ToAll():
			// onJobEvent may drop the job from h.jobs via changeJobState;
			// deleting map entries while ranging is allowed in Go.
			for _, tracked := range h.jobs {
				h.onJobEvent(evt, tracked)
			}

		case broadcast.ToJobSet():
			for _, tracked := range h.jobs {
				if tracked.job.GetJobSetID() == broadcast.JobSetID {
					h.onJobEvent(evt, tracked)
				}
			}

		case broadcast.ToJob():
			tracked, found := h.jobs[broadcast.JobID]
			if found {
				h.onJobEvent(evt, tracked)
			}
		}
	})
}
// Serve receives commands from the handler's command channel and runs them one
// at a time, in the calling goroutine. The loop has no exit condition, so Serve
// does not return normally.
func (h *ExecutingHandler) Serve() {
	commands := h.cmdChan.BeginChanReceive()
	defer h.cmdChan.CloseChanReceive()

	for {
		run := <-commands
		run()
	}
}
// Stop is a placeholder: it currently does nothing.
func (h *ExecutingHandler) Stop() {
	// TODO: support stopping the handler.
}