这篇文章将单独分析调度过程中关于volumeBinding插件的内容,这里会涉及到Filter的逻辑,并且这部分结果在调度的assume阶段也会用到。接下来直接进入主题。

先看FindPodVolumes函数,它会结合pod和node的信息,检查pod所需的volume能否在该node上满足绑定关系。先直接上代码:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// FindPodVolumes checks whether the volumes requested by pod can be satisfied
// on node, and records the resulting decisions in b.podBindingCache for that
// pod/node pair.
//
// When err is nil, reasons explains why the node does not fit:
// ErrReasonNodeConflict is added when checkBoundClaims reports the bound
// claims as not satisfied, and ErrReasonBindConflict is added when the unbound
// claims cannot be satisfied. No reasons and a nil error means no conflict was
// found.
//
// A non-nil err is returned when getPodVolumes, checkBoundClaims,
// findMatchingVolumes or checkVolumeProvisions fails, or when the pod still
// has unbound claims that require immediate binding. In that case reasons is
// nil.
//
// The three deferred closures run in reverse order of registration on every
// return path: first the cache update, then the metrics, then the translation
// of the two booleans into reasons.
func (b *volumeBinder) FindPodVolumes(pod *v1.Pod, node *v1.Node) (reasons ConflictReasons, err error) {
	podName := getPodName(pod)

	// Warning: Below log needs high verbosity as it can be printed several times (#60933).
	klog.V(5).Infof("FindPodVolumes for pod %q, node %q", podName, node.Name)

	// Initialize to true for pods that don't have volumes. These
	// booleans get translated into reason strings when the function
	// returns without an error.
	unboundVolumesSatisfied := true
	boundVolumesSatisfied := true
	defer func() {
		// On error the caller only gets err; no reasons are reported.
		if err != nil {
			return
		}
		if !boundVolumesSatisfied {
			reasons = append(reasons, ErrReasonNodeConflict)
		}
		if !unboundVolumesSatisfied {
			reasons = append(reasons, ErrReasonBindConflict)
		}
	}()

	// Record how long this call took and, if it failed, count the failure.
	// Both metrics use the "predicate" label value.
	start := time.Now()
	defer func() {
		metrics.VolumeSchedulingStageLatency.WithLabelValues("predicate").Observe(time.Since(start).Seconds())
		if err != nil {
			metrics.VolumeSchedulingStageFailed.WithLabelValues("predicate").Inc()
		}
	}()

	var (
		matchedBindings   []*bindingInfo
		provisionedClaims []*v1.PersistentVolumeClaim
	)
	defer func() {
		// We recreate bindings for each new schedule loop.
		if len(matchedBindings) == 0 && len(provisionedClaims) == 0 {
			// Clear cache if no claims to bind or provision for this node.
			b.podBindingCache.ClearBindings(pod, node.Name)
			return
		}
		// Although we do not distinguish nil from empty in this function, for
		// easier testing, we normalize empty to nil.
		if len(matchedBindings) == 0 {
			matchedBindings = nil
		}
		if len(provisionedClaims) == 0 {
			provisionedClaims = nil
		}
		// Mark cache with all matched and provisioned claims for this node
		// The podBindingCache is updated here; the scheduler's Assume phase
		// uses it later.
		b.podBindingCache.UpdateBindings(pod, node.Name, matchedBindings, provisionedClaims)
	}()

	// The pod's volumes need to be processed in one call to avoid the race condition where
	// volumes can get bound/provisioned in between calls.
	// The three claim lists returned are: PVCs that are already bound, unbound
	// PVCs that use delayed binding, and unbound PVCs that require immediate
	// binding.
	boundClaims, claimsToBind, unboundClaimsImmediate, err := b.getPodVolumes(pod)
	if err != nil {
		return nil, err
	}

	// Immediate claims should be bound
	if len(unboundClaimsImmediate) > 0 {
		return nil, fmt.Errorf("pod has unbound immediate PersistentVolumeClaims")
	}

	// Check PV node affinity on bound volumes
	if len(boundClaims) > 0 {
		boundVolumesSatisfied, err = b.checkBoundClaims(boundClaims, node, podName)
		if err != nil {
			return nil, err
		}
	}

	// Find matching volumes and node for unbound claims
	if len(claimsToBind) > 0 {
		var (
			claimsToFindMatching []*v1.PersistentVolumeClaim
			claimsToProvision    []*v1.PersistentVolumeClaim
		)

		// Filter out claims to provision
		for _, claim := range claimsToBind {
			if selectedNode, ok := claim.Annotations[pvutil.AnnSelectedNode]; ok {
				if selectedNode != node.Name {
					// Fast path, skip unmatched node.
					// err is nil at this point, so the naked return lets the
					// deferred closure report ErrReasonBindConflict.
					unboundVolumesSatisfied = false
					return
				}
				claimsToProvision = append(claimsToProvision, claim)
			} else {
				claimsToFindMatching = append(claimsToFindMatching, claim)
			}
		}

		// Find matching volumes
		if len(claimsToFindMatching) > 0 {
			var unboundClaims []*v1.PersistentVolumeClaim
			unboundVolumesSatisfied, matchedBindings, unboundClaims, err = b.findMatchingVolumes(pod, claimsToFindMatching, node)
			if err != nil {
				return nil, err
			}
			// Claims that found no matching volume fall through to the
			// provisioning check below.
			claimsToProvision = append(claimsToProvision, unboundClaims...)
		}

		// Check for claims to provision
		if len(claimsToProvision) > 0 {
			// This overwrites the result from findMatchingVolumes: once there
			// are claims to provision, the provisioning check decides.
			unboundVolumesSatisfied, provisionedClaims, err = b.checkVolumeProvisions(pod, claimsToProvision, node)
			if err != nil {
				return nil, err
			}
		}
	}

	return
}

在上面的函数中先会调用getPodVolumes分析处于各个阶段的volume,返回的结果分别是已经完成绑定的pvc,需要延迟绑定还没有绑定的pvc,不是延迟绑定的还没有绑定的pvc。

  1. 针对第一种情况会再通过checkBoundClaims检测对应的pv是否符合调度策略,主要是通过checkVolumeNodeAffinity检测pv的NodeAffinity.Required.NodeSelectorTerms是否符合节点的标签,如果不符合,则boundVolumesSatisfied为false,FindPodVolumes最终会返回ErrReasonNodeConflict这一冲突原因。
  2. 第二种延迟绑定,这里会先检查pvc.Annotations中是否有key为"volume.kubernetes.io/selected-node"的值:如果有并且和当前节点相同,则会添加到claimsToProvision的slice;如果有但和当前节点不同,则直接把unboundVolumesSatisfied置为false并返回;如果不存在这个key,则把这个pvc添加到claimsToFindMatching的slice。
  3. 第三种不是延迟绑定,则直接返回错误。

接下来会针对需要绑定的pvc去查找对应的pv,FindMatchingVolume会检查pvc对应的StorageClass下所有的volume是否有符合要求的pv,检查条件是:

  1. pv是否有对应pvc,如果有并不是当前的pvc则继续;
  2. 检查pv容量是否小于pvc请求的容量,pv的容量不小于pvc的容量(pv>=pvc)才行;
  3. 检查pv和pvc的volumeMode是否相等,例如都是Filesystem类型;
  4. 检查pv的DeletionTimestamp是否为空;
  5. 检查pv的nodeAffinity是否符合node的labels;
  6. 如果pv和pvc已经预先绑定好了,则还要检查该pv是否符合nodeAffinity;
  7. 检查pv的状态,是否和pvc的selector匹配;
  8. 检查pv和pvc的访问模式(AccessModes)是否一致,然后从所有符合条件的volume(pv)中选择容量最小的volume。

检查完pv和pvc的关联关系后,会把pvc分为两类:找不到对应pv的unboundClaims,以及能正常绑定pv的bindings;只有全部的pvc都能找到pv,foundMatches才为true。其中没有匹配到pv的unboundClaims会被追加到claimsToProvision中。到这里延迟绑定中查找pv的问题也解决了,接下来还需要分析带有volume.kubernetes.io/selected-node字段的pvc。
如果绑定了具体的node,则会通过checkVolumeProvisions函数检测node是否符合sc的拓扑需求。 最后会通过UpdateBindings更新podBindingCache字段。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
// UpdateBindings stores the binding decision for the given pod on the given
// node, replacing whatever was stored for that pod/node pair before.
//
// bindings and pvcs become the entry's bindings and provisionings
// respectively. The write lock on c.rwMutex is held for the whole call. The
// "add" label of VolumeBindingRequestSchedulerBinderCache is incremented only
// when the pod/node pair had no entry yet.
func (c *podBindingCache) UpdateBindings(pod *v1.Pod, node string, bindings []*bindingInfo, pvcs []*v1.PersistentVolumeClaim) {
	c.rwMutex.Lock()
	defer c.rwMutex.Unlock()

	key := getPodName(pod)

	// Lazily create the per-pod map the first time this pod is seen.
	perNode, known := c.bindingDecisions[key]
	if !known {
		perNode = nodeDecisions{}
		c.bindingDecisions[key] = perNode
	}

	// A missing node yields the zero-value entry, so new and existing entries
	// are filled in the same way; only a new entry counts as a cache "add".
	entry, exists := perNode[node]
	if !exists {
		metrics.VolumeBindingRequestSchedulerBinderCache.WithLabelValues("add").Inc()
	}
	entry.bindings = bindings
	entry.provisionings = pvcs
	perNode[node] = entry
}

更新完volumeBinder缓存,后面在调度的assume阶段会用到这些信息。到这里volumeBinding插件的Filter过程就分析完了。逻辑图大概如下图所示。

总结

到这里所有的调度基本都分析完了,这一篇主要是介绍pvc和pv之间的过滤流程,之前疑惑localpath类型的sc怎么做到定点调度的问题也能完全解释得通。针对调度的整个逻辑在这里做一个阅读记录,以便后续查阅。