harbor/src/jobservice/runner/redis.go

// Copyright Project Harbor Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package runner

import (
	"context"
	"fmt"
	"runtime"
	"time"

	"github.com/gocraft/work"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"

	"github.com/goharbor/harbor/src/jobservice/env"
	"github.com/goharbor/harbor/src/jobservice/errs"
	"github.com/goharbor/harbor/src/jobservice/job"
	"github.com/goharbor/harbor/src/jobservice/lcm"
	"github.com/goharbor/harbor/src/jobservice/logger"
	"github.com/goharbor/harbor/src/jobservice/period"
	"github.com/goharbor/harbor/src/lib/errors"
	"github.com/goharbor/harbor/src/lib/metric"
	tracelib "github.com/goharbor/harbor/src/lib/trace"
)
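
// maxTrackRetries bounds how many times Run re-queries the job tracker when the
// job stats are not ready yet; tracerName names the tracer used for the spans
// emitted by this runner.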
const (
	maxTrackRetries = 6
	tracerName      = "goharbor/harbor/src/jobservice/runner/redis"
)

// RedisJob wraps a job.Interface implementation in the handler style that the
// redis worker can recognize and invoke.
type RedisJob struct {
	job     interface{}    // the real job implementation
	context *env.Context   // context
	ctl     lcm.Controller // life cycle controller
}

// NewRedisJob is the constructor of RedisJob.
func NewRedisJob(job interface{}, ctx *env.Context, ctl lcm.Controller) *RedisJob {
	return &RedisJob{
		job:     job,
		context: ctx,
		ctl:     ctl,
	}
}
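
// A RedisJob is typically wired into the gocraft/work worker pool by registering
// its Run method as the job handler. A minimal sketch (the pool, job name and
// option values below are illustrative only, not taken from this file):
//
//	rj := NewRedisJob(myJob, envCtx, lcmCtl)
//	pool.JobWithOptions(jobName, work.JobOptions{MaxFails: 3}, rj.Run)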

// Run the job
func (rj *RedisJob) Run(j *work.Job) (err error) {
	_, span := tracelib.StartTrace(context.Background(), tracerName, "run-job")
	defer span.End()

	var (
		runningJob  job.Interface
		execContext job.Context
		tracker     job.Tracker
	)

	// Track the running job now
	jID := j.ID

	// Used to instrument the processing time
	now := time.Now()

	// Check if the job is a periodic one, as periodic job executions have their own ID format.
	if eID, yes := isPeriodicJobExecution(j); yes {
		jID = eID
		span.SetAttributes(attribute.Key("periodicJob").Bool(true))
		logger.Infof("Start to run periodic job execution: %s", eID)
	}
	span.SetAttributes(attribute.Key("jobID").String(jID), attribute.Key("jobName").String(j.Name))

	// The job stats may not be ready yet when the job starts executing (corner case),
	// so the track call here may get a NOT_FOUND error. In that case, retry to recover.
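	// With maxTrackRetries = 6 and the quadratic backoff below, the loop sleeps for
	// 500, 1055, 1388, 1499, 1388 and 1055 ms between attempts, i.e. roughly 6.9s
	// in total, before giving up.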
	for retried := 0; retried <= maxTrackRetries; retried++ {
		tracker, err = rj.ctl.Track(jID)
		if err == nil {
			break
		}

		if errs.IsObjectNotFoundError(err) {
			if retried < maxTrackRetries {
				// There is still a chance to re-track the given job.
				// Hold on for a while and retry.
				b := backoff(retried)
				logger.Errorf("Track job %s: stats may not have been ready yet, hold for %d ms and retry again", jID, b)
				<-time.After(time.Duration(b) * time.Millisecond)
				span.AddEvent("retrying to get job stat")
				continue
			}

			// All the retries are used up and there is no way to restore the job stats,
			// so give up and make sure the worker never retries this job.
			j.Fails = 10000000000 // never retry
		}

		// Log the error and exit.
		logger.Errorf("Job '%s:%s' exit with error: failed to get job tracker: %s", j.Name, j.ID, err)
		tracelib.RecordError(span, err, "failed to get job tracker")
		// As the tracker creation failed, there is no way to mark the job status change.
		// A non-nil error return also consumes a fail; if all the fails are consumed here,
		// the job becomes a zombie (pending forever). Zombies are reaped by the reaper later.
		return
	}
	// Defer to switch the status
	defer func() {
		// Switch the job status based on the returned error.
		// Any error happening here should not override the job run error, just log it.
		if err != nil {
			// Log the error
			logger.Errorf("Job '%s:%s' exit with error: %s", j.Name, j.ID, err)
			metric.JobserviceTotalTask.WithLabelValues(j.Name, "fail").Inc()
			metric.JobservieTaskProcessTimeSummary.WithLabelValues(j.Name, "fail").Observe(time.Since(now).Seconds())
			tracelib.RecordError(span, err, "job failed with err")
			if er := tracker.Fail(); er != nil {
				logger.Errorf("Error occurred when marking the status of job %s:%s to failure: %s", j.Name, j.ID, er)
				span.RecordError(err)
			}

			return
		}

		// A nil error might be returned by a stopped job, so check the latest status here.
		// If refreshing the latest status fails, let the process go on to avoid missing the status update.
		if latest, er := tracker.Status(); er != nil {
			logger.Errorf("Error occurred when getting the status of job %s:%s: %s", j.Name, j.ID, er)
		} else {
			if latest == job.StoppedStatus {
				// Logged
				metric.JobserviceTotalTask.WithLabelValues(j.Name, "stop").Inc()
				metric.JobservieTaskProcessTimeSummary.WithLabelValues(j.Name, "stop").Observe(time.Since(now).Seconds())
				tracelib.RecordError(span, err, "job stopped")
				logger.Infof("Job %s:%s is stopped", j.Name, j.ID)

				return
			}
		}

		metric.JobserviceTotalTask.WithLabelValues(j.Name, "success").Inc()
		metric.JobservieTaskProcessTimeSummary.WithLabelValues(j.Name, "success").Observe(time.Since(now).Seconds())
		// Mark the job status as success.
		logger.Infof("Job '%s:%s' exit with success", j.Name, j.ID)
		span.SetStatus(codes.Ok, "job exit with success")
		if er := tracker.Succeed(); er != nil {
			tracelib.RecordError(span, er, "failed to mark job success")
			logger.Errorf("Error occurred when marking the status of job %s:%s to success: %s", j.Name, j.ID, er)
		}
	}()
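
	// NOTE: deferred functions run in LIFO order, so the runtime-error handler
	// registered below fires first on a panic and converts it into a non-nil err,
	// which the status-switching defer above then records as a job failure.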
	// Defer to handle runtime error
	defer func() {
		if r := recover(); r != nil {
			// Log the stack
			buf := make([]byte, 1<<20)
			size := runtime.Stack(buf, false)
			err = errors.Errorf("runtime error: %s; stack: %s", r, buf[0:size])
			logger.Errorf("Run job %s:%s error: %s", j.Name, j.ID, err)
		}
	}()
	// Do the operation based on the tracked job status: pending/scheduled jobs proceed
	// to run, stopped or already-successful ones exit immediately, and running/error
	// ones are reset so they can be re-run.
	jStatus := job.Status(tracker.Job().Info.Status)
	switch jStatus {
	case job.PendingStatus, job.ScheduledStatus:
		// Do nothing now
		break
	case job.StoppedStatus:
		// The job has probably been stopped by directly marking its status as stopped.
		// Exit directly with no retry.
		return nil
	case job.RunningStatus, job.ErrorStatus:
		// Failed jobs can be put into the retry queue, and in-progress jobs may be
		// interrupted by a sudden service crash; all of those jobs can be rescheduled.
		// Reset the job info first.
		if err = tracker.Reset(); err != nil {
			// Log the error and return the original error if it exists.
			tracelib.RecordError(span, err, "reset job failed")
			err = errors.Wrap(err, fmt.Sprintf("retrying %s job %s:%s failed", jStatus.String(), j.Name, j.ID))
			if len(j.LastErr) > 0 {
				err = errors.Wrap(err, j.LastErr)
			}

			return
		}

		logger.Infof("Retrying job %s:%s, revision: %d", j.Name, j.ID, tracker.Job().Info.Revision)
	case job.SuccessStatus:
		// Do nothing
		return nil
	default:
		err = errors.Errorf("mismatch status for running job: expected %s/%s but got %s", job.PendingStatus, job.ScheduledStatus, jStatus.String())
		tracelib.RecordError(span, err, "status mismatch")

		return err
	}
	// Build the job context
	if execContext, err = rj.context.JobContext.Build(tracker); err != nil {
		return
	}

	// Defer to close the logger stream
	defer func() {
		// Close the open io stream first
		if closer, ok := execContext.GetLogger().(logger.Closer); ok {
			if er := closer.Close(); er != nil {
				tracelib.RecordError(span, er, "close job logger failed")
				logger.Errorf("Close job logger failed: %s", er)
			}
		}
	}()

	// Wrap the job
	runningJob = Wrap(rj.job)

	// Set the status to running
	if err = tracker.Run(); err != nil {
		tracelib.RecordError(span, err, "failed to set status to run")
		return
	}

	// Run the job
	err = runningJob.Run(execContext, j.Args)
	// Add error context
	if err != nil {
		err = errors.Wrap(err, "run error")
	}

	// Handle retry
	rj.retry(runningJob, j)

	// Handle the periodic job execution: mark this execution of the periodic policy as done.
	if _, yes := isPeriodicJobExecution(j); yes {
		if er := tracker.PeriodicExecutionDone(); er != nil {
			// Just log it
			logger.Error(er)
		}
	}

	return
}
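
// retry cancels any further retries by the redis worker when the wrapped job
// reports that it should not be retried. gocraft/work re-enqueues a failed job
// only while its accumulated Fails count stays below the configured MaxFails,
// so bumping Fails to a very large value effectively disables retrying (the
// same trick used in Run when the job stats cannot be recovered).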
func (rj *RedisJob) retry(j job.Interface, wj *work.Job) {
	if !j.ShouldRetry() {
		// Cancel the retry immediately: make the fail count big enough
		// that the worker never retries this job.
		wj.Fails = 10000000000
		return
	}
}
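
// isPeriodicJobExecution checks whether the job carries the periodic execution
// mark in its arguments. If it does, the returned ID in the form "<jobID>@<epoch>"
// is the execution ID that Run tracks via the lifecycle controller.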
func isPeriodicJobExecution(j *work.Job) (string, bool) {
	epoch, ok := j.Args[period.PeriodicExecutionMark]
	return fmt.Sprintf("%s@%s", j.ID, epoch), ok
}
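
// backoff maps the current retry index x to a hold time in milliseconds using
// the parabola y = -111x^2 + 666x + 500, which yields 500, 1055, 1388, 1499,
// 1388 and 1055 ms for x = 0..5, i.e. longer holds in the middle of the retry
// window. A negative value would be flipped positive, though that cannot happen
// while x <= maxTrackRetries.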
func backoff(x int) int {
	// y = ax^2 + bx + c
	var a, b, c = -111, 666, 500
	y := a*x*x + b*x + c
	if y < 0 {
		y = 0 - y
	}

	return y
}