// Copyright Project Harbor Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package runner

import (
	"context"
	"fmt"
	"runtime"
	"time"

	"github.com/gocraft/work"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"

	"github.com/goharbor/harbor/src/jobservice/env"
	"github.com/goharbor/harbor/src/jobservice/errs"
	"github.com/goharbor/harbor/src/jobservice/job"
	"github.com/goharbor/harbor/src/jobservice/lcm"
	"github.com/goharbor/harbor/src/jobservice/logger"
	"github.com/goharbor/harbor/src/jobservice/period"
	"github.com/goharbor/harbor/src/lib/errors"
	"github.com/goharbor/harbor/src/lib/metric"
	tracelib "github.com/goharbor/harbor/src/lib/trace"
)

const (
	maxTrackRetries = 6
	tracerName      = "goharbor/harbor/src/jobservice/runner/redis"
)

// RedisJob is a job wrapper that adapts job.Interface to the style recognized by the redis worker.
type RedisJob struct {
	job     interface{}    // the real job implementation
	context *env.Context   // context
	ctl     lcm.Controller // lifecycle controller
}

// NewRedisJob is the constructor of RedisJob.
func NewRedisJob(job interface{}, ctx *env.Context, ctl lcm.Controller) *RedisJob {
	return &RedisJob{
		job:     job,
		context: ctx,
		ctl:     ctl,
	}
}

// Run the job
func (rj *RedisJob) Run(j *work.Job) (err error) {
	_, span := tracelib.StartTrace(context.Background(), tracerName, "run-job")
	defer span.End()

	var (
		runningJob  job.Interface
		execContext job.Context
		tracker     job.Tracker
	)

	// Track the running job now.
	jID := j.ID

	// Used to instrument the processing time.
	now := time.Now()

	// Check if the job is a periodic one, as a periodic job has its own ID format.
	if eID, yes := isPeriodicJobExecution(j); yes {
		jID = eID
		span.SetAttributes(attribute.Key("periodicJob").Bool(true))
		logger.Infof("Start to run periodic job execution: %s", eID)
	}
	span.SetAttributes(attribute.Key("jobID").String(jID), attribute.Key("jobName").String(j.Name))

	// As the job stats may not be ready yet when the job starts executing (corner case),
	// the track call here may get a NOT_FOUND error. For that case, retry to recover.
	for retried := 0; retried <= maxTrackRetries; retried++ {
		tracker, err = rj.ctl.Track(jID)
		if err == nil {
			break
		}

		if errs.IsObjectNotFoundError(err) {
			if retried < maxTrackRetries {
				// Still have a chance to re-track the given job.
				// Hold for a while and retry.
				b := backoff(retried)
				logger.Errorf("Track job %s: stats may not have been ready yet, hold for %d ms and retry again", jID, b)
				<-time.After(time.Duration(b) * time.Millisecond)
				span.AddEvent("retrying to get job stat")
				continue
			}
			// All retries are used up. Return directly without retrying again,
			// as there is no way to restore the stats.
			j.Fails = 10000000000 // never retry
		}

		// Log the error and exit.
		logger.Errorf("Job '%s:%s' exit with error: failed to get job tracker: %s", j.Name, j.ID, err)
		tracelib.RecordError(span, err, "failed to get job tracker")
		// As tracker creation failed, there is no way to mark the job status change.
		// A non-nil error return also consumes a fail.
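		// (In gocraft/work, each non-nil return from Run counts toward the job's
		// configured max-fail budget before the job is moved to the dead queue.)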
		// If all retries fail here, the job becomes a zombie (pending forever).
		// Zombie jobs will be reaped by the reaper later.
		return
	}

	// Defer to switch the job status.
	defer func() {
		// Switch the job status based on the returned error.
		// An error happening here should not override the job run error, just log it.
		if err != nil {
			// Log the error.
			logger.Errorf("Job '%s:%s' exit with error: %s", j.Name, j.ID, err)
			metric.JobserviceTotalTask.WithLabelValues(j.Name, "fail").Inc()
			metric.JobservieTaskProcessTimeSummary.WithLabelValues(j.Name, "fail").Observe(time.Since(now).Seconds())
			tracelib.RecordError(span, err, "job failed with err")

			if er := tracker.Fail(); er != nil {
				logger.Errorf("Error occurred when marking the status of job %s:%s to failure: %s", j.Name, j.ID, er)
				span.RecordError(er)
			}

			return
		}

		// A nil error might be returned by a stopped job. Check the latest status here.
		// If refreshing the latest status fails, let the process go on to avoid missing the status update.
		if latest, er := tracker.Status(); er != nil {
			logger.Errorf("Error occurred when getting the status of job %s:%s: %s", j.Name, j.ID, er)
		} else {
			if latest == job.StoppedStatus {
				// Logged
				metric.JobserviceTotalTask.WithLabelValues(j.Name, "stop").Inc()
				metric.JobservieTaskProcessTimeSummary.WithLabelValues(j.Name, "stop").Observe(time.Since(now).Seconds())
				span.AddEvent("job stopped")
				logger.Infof("Job %s:%s is stopped", j.Name, j.ID)
				return
			}
		}

		metric.JobserviceTotalTask.WithLabelValues(j.Name, "success").Inc()
		metric.JobservieTaskProcessTimeSummary.WithLabelValues(j.Name, "success").Observe(time.Since(now).Seconds())
		// Mark the job status to success.
		logger.Infof("Job '%s:%s' exit with success", j.Name, j.ID)
		span.SetStatus(codes.Ok, "job exit with success")
		if er := tracker.Succeed(); er != nil {
			tracelib.RecordError(span, er, "failed to mark job success")
			logger.Errorf("Error occurred when marking the status of job %s:%s to success: %s", j.Name, j.ID, er)
		}
	}()

	// Defer to handle runtime errors.
	defer func() {
		if r := recover(); r != nil {
			// Log the stack trace.
			buf := make([]byte, 1<<20)
			size := runtime.Stack(buf, false)
			err = errors.Errorf("runtime error: %s; stack: %s", r, buf[0:size])
			logger.Errorf("Run job %s:%s error: %s", j.Name, j.ID, err)
		}
	}()

	// Do the operation based on the current job status.
	jStatus := job.Status(tracker.Job().Info.Status)
	switch jStatus {
	case job.PendingStatus, job.ScheduledStatus:
		// Do nothing now.
	case job.StoppedStatus:
		// The job has probably been stopped by directly marking its status as stopped.
		// Exit directly with no retry.
		return nil
	case job.RunningStatus, job.ErrorStatus:
		// Failed jobs can be put into the retry queue, and in-progress jobs may be
		// interrupted by a sudden service crash; all those jobs can be rescheduled.
		// Reset the job info.
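		// (Reset is expected to roll the tracked stats back to a runnable state and
		// refresh the revision; the new revision is logged once the reset succeeds.)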
		if err = tracker.Reset(); err != nil {
			// Log the error and return the original error if it exists.
			tracelib.RecordError(span, err, "reset job failed")
			err = errors.Wrap(err, fmt.Sprintf("retrying %s job %s:%s failed", jStatus.String(), j.Name, j.ID))
			if len(j.LastErr) > 0 {
				err = errors.Wrap(err, j.LastErr)
			}

			return
		}

		logger.Infof("Retrying job %s:%s, revision: %d", j.Name, j.ID, tracker.Job().Info.Revision)
	case job.SuccessStatus:
		// Do nothing.
		return nil
	default:
		err = errors.Errorf("mismatch status for running job: expected %s/%s but got %s", job.PendingStatus, job.ScheduledStatus, jStatus.String())
		tracelib.RecordError(span, err, "status mismatch")
		return err
	}

	// Build the job context.
	if execContext, err = rj.context.JobContext.Build(tracker); err != nil {
		return
	}

	// Defer to close the logger stream.
	defer func() {
		// Close the open io stream first.
		if closer, ok := execContext.GetLogger().(logger.Closer); ok {
			if er := closer.Close(); er != nil {
				tracelib.RecordError(span, er, "close job logger failed")
				logger.Errorf("Close job logger failed: %s", er)
			}
		}
	}()

	// Wrap the job.
	runningJob = Wrap(rj.job)
	// Set the status to running.
	if err = tracker.Run(); err != nil {
		tracelib.RecordError(span, err, "failed to set status to run")
		return
	}
	// Run the job.
	err = runningJob.Run(execContext, j.Args)
	// Add error context.
	if err != nil {
		err = errors.Wrap(err, "run error")
	}
	// Handle retry.
	rj.retry(runningJob, j)
	// Handle periodic job execution.
	if _, yes := isPeriodicJobExecution(j); yes {
		if er := tracker.PeriodicExecutionDone(); er != nil {
			// Just log it.
			logger.Error(er)
		}
	}

	return
}

// retry cancels the default requeue behavior of the redis worker when the
// wrapped job reports that it should not be retried.
func (rj *RedisJob) retry(j job.Interface, wj *work.Job) {
	if !j.ShouldRetry() {
		// Cancel the retry immediately.
		// Make the fail count big enough to prevent any further retry.
		wj.Fails = 10000000000
	}
}

// isPeriodicJobExecution checks whether the job carries the periodic execution mark.
// If it does, the execution ID in the format "<jobID>@<epoch>" is returned.
func isPeriodicJobExecution(j *work.Job) (string, bool) {
	epoch, ok := j.Args[period.PeriodicExecutionMark]
	return fmt.Sprintf("%s@%s", j.ID, epoch), ok
}

// backoff calculates the retry wait time in milliseconds with the
// quadratic curve y=ax^2+bx+c.
func backoff(x int) int {
	// y=ax^2+bx+c
	var a, b, c = -111, 666, 500
	y := a*x*x + b*x + c
	if y < 0 {
		y = 0 - y
	}

	return y
}
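
// Note: with a=-111, b=666, c=500 and retried capped below maxTrackRetries,
// backoff yields roughly 500, 1055, 1388, 1499, 1388 and 1055 ms for attempts
// 0 through 5, so the track loop waits about 6.9s in total before giving up.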