Github Action job 分配到集群
背景
job 是 Github Action 的基本單位,每個 job 單獨分配一個 runner。workflow 由一個或者多個 job 組成。如果用戶觸發runs-on
字段為arc-runner-set
的 job,那麼 Github Action 服務器將 job 分配給 listener pod。
源碼
handleMessage
函數主要處理 2 類服務器消息。第一類是狀態為started
的 job: job 已經由服務器分配給 runner 執行。HandleJobStarted
函數將 job 信息局部更新給EphemeralRunner
資源。
// handleMessage processes a single message received from the Actions service.
//
// It parses msg, records its ID as the last handled message and asks the
// service to delete it, forwards every started job to handler.HandleJobStarted,
// and finally passes the number of assigned jobs and the number of completed
// jobs to handler.HandleDesiredRunnerCount. The first error encountered is
// wrapped and returned; nil is returned when every step succeeds.
func (l *Listener) handleMessage(ctx context.Context, handler Handler, msg *actions.RunnerScaleSetMessage) error {
	// Parse the raw message.
	parsedMsg, err := l.parseMessage(ctx, msg)
	if err != nil {
		return fmt.Errorf("failed to parse message: %w", err)
	}

	l.lastMessageID = msg.MessageId

	// Ask the server to delete the message.
	if err := l.deleteLastMessage(ctx); err != nil {
		return fmt.Errorf("failed to delete message: %w", err)
	}

	for _, jobStarted := range parsedMsg.jobsStarted {
		if err := handler.HandleJobStarted(ctx, jobStarted); err != nil {
			return fmt.Errorf("failed to handle job started: %w", err)
		}
		l.metrics.PublishJobStarted(jobStarted)
	}

	desiredRunners, err := handler.HandleDesiredRunnerCount(ctx, parsedMsg.statistics.TotalAssignedJobs, len(parsedMsg.jobsCompleted))
	if err != nil {
		return fmt.Errorf("failed to handle desired runner count: %w", err)
	}
	l.metrics.PublishDesiredRunners(desiredRunners)

	return nil
}
第二類是狀態為Assigned
和Completed
的 job。前者是還未結束的任務,後者是已經結束的任務。
HandleDesiredRunnerCount
函數首先調用setDesiredWorkerState
函數計算集群的 runner 數量。
// setDesiredWorkerState computes the runner count to request for the current
// batch, stores it in w.lastPatch, and returns the patch ID to send with it.
//
// count is parsedMsg.statistics.TotalAssignedJobs: the number of jobs that have
// not finished yet.
// jobsCompleted is len(parsedMsg.jobsCompleted): the number of jobs that have
// already finished running.
//
// The returned patch ID is the incremented batch sequence number, or 0 when the
// batch is empty and the target equals the configured minimum.
func (w *Worker) setDesiredWorkerState(count, jobsCompleted int) int {
	// Combine minRunners and maxRunners (configured by the user in the
	// values.yaml of the runner scale set chart) with the number of assigned
	// jobs to decide how many runners the cluster should have.
	targetRunnerCount := min(w.config.MinRunners+count, w.config.MaxRunners)

	w.patchSeq++ // advance the batch sequence number
	desiredPatchID := w.patchSeq

	// This batch has neither running jobs nor finished jobs.
	if count == 0 && jobsCompleted == 0 {
		targetRunnerCount = max(w.lastPatch, targetRunnerCount)
		if targetRunnerCount == w.config.MinRunners {
			// Neither this batch nor the previous one has active jobs, so the
			// cluster is idle. A patch ID of 0 is used to trigger scale-down.
			desiredPatchID = 0
		}
	}

	w.lastPatch = targetRunnerCount
	return desiredPatchID
}
之後HandleDesiredRunnerCount
函數將批次序號和 runner 數局部更新給EphemeralRunnerSet
資源。
patch, err := json.Marshal(&v1alpha1.EphemeralRunnerSet{Spec: v1alpha1.EphemeralRunnerSetSpec{Replicas: w.lastPatch, // targetRunnerCountPatchID: patchID, // desiredPatchID},},)
EphemeralRunnerSet
控制器根據批次序號和 runner 數更改 runner 資源。主要邏輯如下:
// total 是集群中運行的 runner pod 的數量,與 Github Action 服務器無關total := ephemeralRunnerState.scaleTotal()if ephemeralRunnerSet.Spec.PatchID == 0 || ephemeralRunnerSet.Spec.PatchID != ephemeralRunnerState.latestPatchID {defer func() {// 主動刪除已經結束的`EphemeralRunner`資源if err := r.cleanupFinishedEphemeralRunners(ctx, ephemeralRunnerState.finished, log); err != nil {log.Error(err, "failed to cleanup finished ephemeral runners")}}()log.Info("Scaling comparison", "current", total, "desired", ephemeralRunnerSet.Spec.Replicas)switch {// 集群中 runner 數量小于 job 數量,擴容case total < ephemeralRunnerSet.Spec.Replicas: // Handle scale upcount := ephemeralRunnerSet.Spec.Replicas - totallog.Info("Creating new ephemeral runners (scale up)", "count", count)if err := r.createEphemeralRunners(ctx, ephemeralRunnerSet, count, log); err != nil {log.Error(err, "failed to make ephemeral runner")return ctrl.Result{}, err}// 集群中 runner 數量大于 job 數量,說明 Github Action 服務器已經認為某些 job 結束,但是 job 對應的 runner pod 還未到結束狀態。等待 runner pod 自行結束。case ephemeralRunnerSet.Spec.PatchID > 0 && total >= ephemeralRunnerSet.Spec.Replicas:// PatchID == 0 是 setDesiredWorkerState 函數的縮容標記,避免集群空閑時仍然有 runner pod 在運行case ephemeralRunnerSet.Spec.PatchID == 0 && total > ephemeralRunnerSet.Spec.Replicas:count := total - ephemeralRunnerSet.Spec.Replicasif err := r.deleteIdleEphemeralRunners(ctx,ephemeralRunnerSet,ephemeralRunnerState.pending,ephemeralRunnerState.running,count,log,); err != nil {log.Error(err, "failed to delete idle runners")return ctrl.Result{}, err}}}
createEphemeralRunners
函數創建EphemeralRunner
資源。資源結構如下:
return &v1alpha1.EphemeralRunner{TypeMeta: metav1.TypeMeta{},ObjectMeta: metav1.ObjectMeta{GenerateName: ephemeralRunnerSet.Name + "-runner-", // k8s 自動為EphemeralRunner 資源名稱創建隨機后綴,因為一個 EphemeralRunnerSet 對應多個 EphemeralRunner 資源Namespace: ephemeralRunnerSet.Namespace,Labels: labels,Annotations: annotations,OwnerReferences: []metav1.OwnerReference{{APIVersion: ephemeralRunnerSet.GetObjectKind().GroupVersionKind().GroupVersion().String(),Kind: ephemeralRunnerSet.GetObjectKind().GroupVersionKind().Kind,UID: ephemeralRunnerSet.GetUID(),Name: ephemeralRunnerSet.GetName(),Controller: boolPtr(true),BlockOwnerDeletion: boolPtr(true),},},},Spec: ephemeralRunnerSet.Spec.EphemeralRunnerSpec,}
EphemeralRunner
控制器根據EphemeralRunner
資源創建 runner pod。步驟如下:
- 添加 finalizer 字段。
finalizer
表示在集群中注銷本資源。runner-registration-finalizer
表示在 Github Action 服務器注銷本資源。
// Finalizer names added to an EphemeralRunner resource.
const (
	// ephemeralRunnerFinalizerName marks removal of the resource from the cluster.
	ephemeralRunnerFinalizerName = "ephemeralrunner.actions.github.com/finalizer"
	// ephemeralRunnerActionsFinalizerName marks deregistration of the runner
	// from the GitHub Actions service.
	ephemeralRunnerActionsFinalizerName = "ephemeralrunner.actions.github.com/runner-registration-finalizer"
)
- 向 Github Action 服務器注冊自身。返回 RunnerId 表示注冊成功。
// RunnerScaleSetJitRunnerSetting carries the settings sent when requesting a
// just-in-time (JIT) runner configuration: the runner name and its work folder.
type RunnerScaleSetJitRunnerSetting struct {
	// Name is the runner name, serialized as "name".
	Name string `json:"name"`
	// WorkFolder is the runner's working directory, serialized as "workFolder".
	WorkFolder string `json:"workFolder"`
}
// jit(Just-in-Time) config:服務器返回的注冊信息
// jitSettings 的類型是 RunnerScaleSetJitRunnerSetting。包含 runner 名稱以及工作目錄。
// RunnerScaleSetId: runner 所屬 runner scale set 的注冊Id
jitConfig, err := actionsClient.GenerateJitRunnerConfig(ctx, jitSettings, ephemeralRunner.Spec.RunnerScaleSetId)// 更新 EphemeralRunner 資源
err = patchSubResource(ctx, r.Status(), ephemeralRunner, func(obj *v1alpha1.EphemeralRunner) {obj.Status.RunnerId = jitConfig.Runner.Id // runner idobj.Status.RunnerName = jitConfig.Runner.Name // runner 名稱obj.Status.RunnerJITConfig = jitConfig.EncodedJITConfig // 配置 secret})
- 將 RunnerJITConfig 注冊為 secret, runner pod 將通過 config 與 Github Action 服務器連接。
return &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: ephemeralRunner.Name,Namespace: ephemeralRunner.Namespace,},Data: map[string][]byte{jitTokenKey: []byte(ephemeralRunner.Status.RunnerJITConfig),},}
- 創建 runner pod。runner pod 的鏡像是
ghcr.io/actions/actions-runner
。它將與 Github Action 服務器建立連接。服務器分配 job 給 runner pod,服務器將 job 的任務內容交給 runner pod 執行。執行完成后,runner pod 結束。
c.Env = append(c.Env,corev1.EnvVar{Name: EnvVarRunnerJITConfig,ValueFrom: &corev1.EnvVarSource{SecretKeyRef: &corev1.SecretKeySelector{LocalObjectReference: corev1.LocalObjectReference{Name: secret.Name,},Key: jitTokenKey, // 將 RunnerJITConfig secret 掛載給 runner container。},},},
)
- 監控 runner container 的狀態。
總結
本文講了 Github Action job 到 runner pod 的執行流程。