Swift标准库源码阅读笔记 - Dictionary

1,222 阅读8分钟

Dictionary

Dictionary 内部只有一个成员变量 _variantBuffer,它的类型是 _VariantDictionaryBuffer

public struct Dictionary<Key: Hashable, Value> {

  internal typealias _VariantBuffer = _VariantDictionaryBuffer<Key, Value>

  
![](https://p1-jj.byteimg.com/tos-cn-i-t2oaga2asx/gold-user-assets/2018/7/6/1646ed8037e769fe~tplv-t2oaga2asx-image.image)

主要有两种初始化方法。

  public init() {
    self = Dictionary<Key, Value>(_nativeBuffer: _NativeBuffer())
  }
  
  public init(minimumCapacity: Int) {
    _variantBuffer = .native(_NativeBuffer(minimumCapacity: minimumCapacity))
  }

#if _runtime(_ObjC)
  public init(_immutableCocoaDictionary: _NSDictionary) {
    _sanityCheck(
      _isBridgedVerbatimToObjectiveC(Key.self) &&
      _isBridgedVerbatimToObjectiveC(Value.self),
      "Dictionary can be backed by NSDictionary buffer only when both key and value are bridged verbatim to Objective-C")
    _variantBuffer = .cocoa(
      _CocoaDictionaryBuffer(cocoaDictionary: _immutableCocoaDictionary))
  }
#endif
}

从两个初始化的情况来看, 一种是需要桥接到 Objective-C ,而另外一种是不需要桥接到 Objective-C

接下来具体看看 _VariantDictionaryBuffer

_VariantDictionaryBuffer

internal enum _VariantDictionaryBuffer<Key: Hashable, Value>: _HashBuffer {

  internal typealias NativeBuffer = _NativeDictionaryBuffer<Key, Value>
#if _runtime(_ObjC)
  internal typealias CocoaBuffer = _CocoaDictionaryBuffer
#endif

  case native(NativeBuffer)
#if _runtime(_ObjC)
  case cocoa(CocoaBuffer)
#endif

_VariantDictionaryBuffer 其实是个 enum ,只有两个值 nativecocoa,分别对应着 _NativeDictionaryBuffer_CocoaDictionaryBuffer

_NativeDictionaryBuffer

_NativeDictionaryBuffer 中也只有一个成员变量 _storage,它的类型是 _RawNativeDictionaryStorage

internal struct _NativeDictionaryBuffer<Key, Value> {

  internal typealias RawStorage = _RawNativeDictionaryStorage
  
  internal var _storage: RawStorage

再来看看 _NativeDictionaryBuffer 内部的初始化代码。

  internal init(minimumCapacity: Int) {
    let bucketCount = _NativeDictionaryBuffer.bucketCount(
      forCapacity: minimumCapacity,
      maxLoadFactorInverse: _hashContainerDefaultMaxLoadFactorInverse)
    self.init(bucketCount: bucketCount)
  }
  
  internal var _hashContainerDefaultMaxLoadFactorInverse: Double {
      return 1.0 / 0.75
  }
  
   internal static func bucketCount(
    forCapacity capacity: Int,
    maxLoadFactorInverse: Double
  ) -> Int {
    return max(Int((Double(capacity) * maxLoadFactorInverse).rounded(.up)),
               capacity + 1)
  }
  
  internal init(bucketCount: Int) {
    _sanityCheck(bucketCount <= (Int.max >> 1) + 1)
    let buckets = 1 &<< ((Swift.max(bucketCount, 2) - 1)._binaryLogarithm() + 1)
    self.init(_exactBucketCount: buckets)
  }

  internal init(_exactBucketCount bucketCount: Int) {
    let bitmapWordCount = _UnsafeBitMap.sizeInWords(forSizeInBits: bucketCount)
    let storage = Builtin.allocWithTailElems_3(HashTypedStorage.self,
        bitmapWordCount._builtinWordValue, UInt.self,
        bucketCount._builtinWordValue, Key.self,
        bucketCount._builtinWordValue, Value.self)
    self.init(_exactBucketCount: bucketCount, storage: storage)
  }

  internal init(_exactBucketCount bucketCount: Int, storage: RawStorage) {
    storage.bucketCount = bucketCount
    storage.count = 0

    self.init(_storage: storage)

    let initializedEntries = _UnsafeBitMap(
        storage: _initializedHashtableEntriesBitMapBuffer,
        bitCount: bucketCount)
    initializedEntries.initializeToZero()

    let bitmapAddr = Builtin.projectTailElems(_storage, UInt.self)
    let bitmapWordCount = _UnsafeBitMap.sizeInWords(forSizeInBits: bucketCount)
    let keysAddr = Builtin.getTailAddr_Word(bitmapAddr,
           bitmapWordCount._builtinWordValue, UInt.self, Key.self)

    _storage.initializedEntries = initializedEntries
    _storage.keys = UnsafeMutableRawPointer(keysAddr)
    let valuesAddr = Builtin.getTailAddr_Word(keysAddr,
        bucketCount._builtinWordValue, Key.self, Value.self)
    _storage.values = UnsafeMutableRawPointer(valuesAddr)
    
    let seed = _Hasher._seed
    let perturbation = bucketCount
    _storage.seed = (seed.0 ^ UInt64(truncatingIfNeeded: perturbation), seed.1)
  }
}

TypedStorage 在文档中的说明是,此类有两个作用,第一个作用是为了能够创建 _NativeDictionaryBuffer<AnyObject, AnyObject>,但是创建出来的 Dictionary 只能使用索引和迭代器的功能,第二个作用是继承于 _RawNativeDictionaryStorage,实现 deinit 方法,反初始化 key 和 value

Builtin.allocWithTailElems_3 初始化 _storage,分配几段连续内存

  • 为防止 keyAnyObject 需要为其添加类型约束,所以需要为 TypeStorage 分配一段约束。
  • 分配 bitmapWordCount 个连续的内存块用来存储 bitmap (ps:bitmap,最主要的作用是能够快速的对key值进行查重操作)
  • 分配 bucketCount 个连续的内存块用来存储 key
  • 分配 bucketCount 个连续的内存块用来存储 value

bucketCount 计算相当于是 minimumCapacity1.0 / 0.75 倍。

initializedEntries 用于哈希算法中解决冲突问题的辅助 bitmap 下面会具体提到它的使用。

Builtin.getTailAddr_Word 获取到 keyvalue的初始内存地址分别赋值给 _storage.keyvalue ,而 _RawNativeDictionaryStorage 内部定义了通用的方法和属性。

ps: 至于 seed 的作用,我一直没搞懂,希望有大牛能够指点一下。

追加空间申请

来看一下 _NativeDictionaryBuffer 中追加空间申请的实现思路。暂时会隐去 _CocoaDictionaryBuffer 的相关内容。

  public mutating func reserveCapacity(_ minimumCapacity: Int) {
    _variantBuffer.reserveCapacity(minimumCapacity)
  }
  
  //_VariantDictionaryBuffer
  internal mutating func reserveCapacity(_ capacity: Int) {
    _ = ensureUniqueNativeBuffer(withCapacity: capacity)
  }
  
  internal mutating func ensureUniqueNativeBuffer(
    withCapacity minimumCapacity: Int
  ) -> (reallocated: Bool, capacityChanged: Bool) {
    let bucketCount = NativeBuffer.bucketCount(
      forCapacity: minimumCapacity,
      maxLoadFactorInverse: _hashContainerDefaultMaxLoadFactorInverse)
    return ensureUniqueNativeBuffer(withBucketCount: bucketCount)
  }
  
  internal mutating func ensureUniqueNativeBuffer(
    withBucketCount desiredBucketCount: Int
  ) -> (reallocated: Bool, capacityChanged: Bool) {
    let n = _isNative
    if n {
      return ensureUniqueNativeBufferNative(withBucketCount: desiredBucketCount)
    }
  }
  
  internal mutating func ensureUniqueNativeBufferNative(
    withBucketCount desiredBucketCount: Int
  ) -> (reallocated: Bool, capacityChanged: Bool) {
    let oldBucketCount = asNative.bucketCount
    if oldBucketCount >= desiredBucketCount && isUniquelyReferenced() {
      return (reallocated: false, capacityChanged: false)
    }

    let oldNativeBuffer = asNative
    var newNativeBuffer = NativeBuffer(bucketCount: desiredBucketCount)
    let newBucketCount = newNativeBuffer.bucketCount
    for i in 0..<oldBucketCount {
      if oldNativeBuffer.isInitializedEntry(at: i) {
        if oldBucketCount == newBucketCount {
          let key = oldNativeBuffer.key(at: i)
          let value = oldNativeBuffer.value(at: i)
          newNativeBuffer.initializeKey(key, value: value , at: i)
        } else {
          let key = oldNativeBuffer.key(at: i)
          newNativeBuffer.unsafeAddNew(
            key: key,
            value: oldNativeBuffer.value(at: i))
        }
      }
    }
    newNativeBuffer.count = oldNativeBuffer.count

    self = .native(newNativeBuffer)
    return (reallocated: true,
      capacityChanged: oldBucketCount != newBucketCount)
  }
  
  internal func initializeKey(_ k: Key, value v: Value, at i: Int) {
    _sanityCheck(!isInitializedEntry(at: i))
    defer { _fixLifetime(self) }

    (keys + i).initialize(to: k)
    (values + i).initialize(to: v)
    _storage.initializedEntries[i] = true
  }

可以看到,capacity 不足时,Swift 将现存的 capacity 取其 1.0 / 0.75 倍,再去申请新的 buffer

ensureUniqueNativeBufferNative 确保 Dictionary 持有者的唯一性和追加空间申请。

  • 如果现有容量大于等于所需要的容量,并且 Dictionary 只有一个持有者,则直接返回不进行任何操作。
  • 如果持有者不唯一或者需要追加空间申请,则需要重新初始化一个 _NativeDictionaryBuffer 的实例 newNativeBuffer 并分配 desiredBucketCount 个连续的内存块。isInitializedEntry是用于确认当前偏移量为ikey 值是否已存在。
    • 如果不需要追加内存空间,也就是 Dictionary的持有者不唯一,则会执行 写时复制 ,则初始化偏移量为 i 的内存空间,并赋值 keyvalue
    • 如果需要追加内存空间,由于原本的存储空间发生了改变,所以需要重新计算每个 key 应该插入的位置,则会调用 unsafeAddNew 先找到一个合适的位置插入 key,之后再进行初始化的操作。

就上文中提到的 unsafeAddNew ,来看看 Swift 是如何解决哈希表 key 值冲突的问题。

internal func unsafeAddNew(key newKey: Key, value: Value) {
    let (i, found) = _find(newKey, startBucket: _bucket(newKey))
    initializeKey(newKey, value: value, at: i.offset)
  }
  
  internal func _find(_ key: Key, startBucket: Int)
    -> (pos: Index, found: Bool) {

    var bucket = startBucket

    while true {
      let isHole = !isInitializedEntry(at: bucket)
      if isHole {
        return (Index(offset: bucket), false)
      }
      if self.key(at: bucket) == key {
        return (Index(offset: bucket), true)
      }
      bucket = _index(after: bucket)
    }
  }

其实解决冲突主要的函数是 _find

  • 通过 _bucket(newKey) 找到当前 key 值对应的位置 bucket
  • bucket 开始遍历,如果找到 bucketkey 与当前的 key 相同则返回当前 key 的位置 bucket
  • 否则找到 第一个未被占用的位置。

由此可见 Swift 在解决哈希表冲突的时候,使用的是线性探测法。

从删除的情况来看,Dictionary 采用的是 链地址法,解决 key 值之间的冲突。

_CocoaDictionaryBuffer

_CocoaDictionaryBuffer 中只有一个成员变量 cocoaDictionary,它的类型是 _NSDictionary 。而 _NSDictionaryShadowProtocols.swift 内部定义为一个协议,继承与 _NSDictionaryCore 主要用于桥接到 Objective-C

internal struct _CocoaDictionaryBuffer: _HashBuffer {
  internal var cocoaDictionary: _NSDictionary
}

而在 _CocoaDictionaryBuffer 结构体内部,只是实现了 _HashBuffer 协议相关的函数和计算属性。

追加空间申请

在追加空间申请时,同 _NativeDictionaryBuffer 不同的地方就在于 ensureUniqueNativeBuffer 内部实现。

  internal mutating func ensureUniqueNativeBuffer(
    withBucketCount desiredBucketCount: Int
  ) -> (reallocated: Bool, capacityChanged: Bool) {
      let cocoaDictionary = cocoaBuffer.cocoaDictionary
      var newNativeBuffer = NativeBuffer(bucketCount: desiredBucketCount)
      let oldCocoaIterator = _CocoaDictionaryIterator(cocoaDictionary)

      while let (key, value) = oldCocoaIterator.next() {
        newNativeBuffer.unsafeAddNew(
          key: _forceBridgeFromObjectiveC(key, Key.self),
          value: _forceBridgeFromObjectiveC(value, Value.self))
      }

      newNativeBuffer.count = cocoaDictionary.count

      self = .native(newNativeBuffer)
      return (reallocated: true, capacityChanged: true)
  }
  
  
  public func _forceBridgeFromObjectiveC<T>(_ x: AnyObject, _: T.Type) -> T {
  if _fastPath(_isClassOrObjCExistential(T.self)) {
    return x as! T
  }

  var result: T?
  _bridgeNonVerbatimFromObjectiveC(x, T.self, &result)
  return result!
}

从源码中,可以看出,在追加空间申请时,会将 _CocoaDictionaryBuffer 转换成 _NativeDictionaryBuffer,接下来的操作就和 _NativeDictionaryBuffer 一样。
_forceBridgeFromObjectiveC 的功能就是将 xObjective-C 桥接到 Swift ,来看看具体实现。

  • 如果是 Tclass
    • x 的类型是 T 或者 T 子类,则函数会直接返回 `x。
  • 否则 ,如果 T 是 遵从 _ObjectiveCBridgeable协议。
    • 如果 x 的类型不是 T.ObjectiveType 或者其子类,则会crash。
    • 否则,会返回 T._forceBridgeFromObjectiveC(x)的结果。

操作

接下来再来看看,一些比较常用操作的具体实现。

remove

internal mutating func removeValue(forKey key: Key) -> Value? {
    if _fastPath(guaranteedNative) {
      return nativeRemoveObject(forKey: key)
    }

    switch self {
    case .native:
      return nativeRemoveObject(forKey: key)
#if _runtime(_ObjC)
    case .cocoa(let cocoaBuffer):
      let anyObjectKey: AnyObject = _bridgeAnythingToObjectiveC(key)
      if cocoaBuffer.maybeGet(anyObjectKey) == nil {
        return nil
      }
      migrateDataToNativeBuffer(cocoaBuffer)
      return nativeRemoveObject(forKey: key)
#endif
    }

_NativeDictionaryBuffer

internal mutating func nativeRemoveObject(forKey key: Key) -> Value? {
    var idealBucket = asNative._bucket(key)
    var (index, found) = asNative._find(key, startBucket: idealBucket)

    if !found {
      return nil
    }

    let bucketCount = asNative.bucketCount
    let (_, capacityChanged) = ensureUniqueNativeBuffer(
      withBucketCount: bucketCount)
    let nativeBuffer = asNative
    if capacityChanged {
      idealBucket = nativeBuffer._bucket(key)
      (index, found) = nativeBuffer._find(key, startBucket: idealBucket)
      _sanityCheck(found, "key was lost during buffer migration")
    }
    let oldValue = nativeBuffer.value(at: index.offset)
    nativeDelete(nativeBuffer, idealBucket: idealBucket,
      offset: index.offset)
    return oldValue
  }
  
  internal mutating func nativeDelete(
    _ nativeBuffer: NativeBuffer, idealBucket: Int, offset: Int
  ) {
    var nativeBuffer = nativeBuffer

    nativeBuffer.destroyEntry(at: offset)
    nativeBuffer.count -= 1

    var hole = offset
    
    var start = idealBucket
    while nativeBuffer.isInitializedEntry(at: nativeBuffer._prev(start)) {
      start = nativeBuffer._prev(start)
    }

    var lastInChain = hole
    var b = nativeBuffer._index(after: lastInChain)
    while nativeBuffer.isInitializedEntry(at: b) {
      lastInChain = b
      b = nativeBuffer._index(after: b)
    }

    while hole != lastInChain {
      var b = lastInChain
      while b != hole {
        let idealBucket = nativeBuffer._bucket(nativeBuffer.key(at: b))
        
        let c0 = idealBucket >= start
        let c1 = idealBucket <= hole
        if start <= hole ? (c0 && c1) : (c0 || c1) {
          break // Found it
        }
        b = nativeBuffer._prev(b)
      }

      if b == hole {
        break
      }

      nativeBuffer.moveInitializeEntry(
        from: nativeBuffer,
        at: b,
        toEntryAt: hole)
      hole = b
    }
  }
  • 检查 Dictionary 的持有者是否唯一,如果不唯一则进行写时复制。
  • 找到 key 对应的 index
  • destroyEntry 从内存中回收 offset 偏移量的 keyvalue,并且将 offset 偏移量的位置在 bitmap 中标记为未被占用。
  • offset 左边取第一个被占用的 bucket,记为 start 和 右边取最后一个被占用的 bucket,记为 lastInChain
  • 查找 [start, lastInChain] 不合理的元素,并进行调整。

解释下何为不合理的元素,举个简单例子:比如有一条哈希表 hashTable 为:

[1,2,3,4,5,6,7,8]

插入 3 ,采用线性探测法

[1,2,3,4,5,6,7,8,3]

删除 5

[1,2,3,4,nil,6,7,8,3]

但是发现 3 本应该是在 nil 之前的,此时 3 就是不合理的元素,所以需要交换 3nil 的位置。

[1,2,3,4,3,6,7,8,nil]

_CocoaDictionaryBuffer

  public func _bridgeAnythingToObjectiveC<T>(_ x: T) -> AnyObject {
    if _fastPath(_isClassOrObjCExistential(T.self)) {
      return unsafeBitCast(x, to: AnyObject.self)
    }
    return _bridgeAnythingNonVerbatimToObjectiveC(x)
  }
  
  internal func maybeGet(_ key: Key) -> Value? {
    return cocoaDictionary.objectFor(key)
  }
  
  internal mutating func migrateDataToNativeBuffer(
    _ cocoaBuffer: _CocoaDictionaryBuffer
  ) {
    let allocated = ensureUniqueNativeBuffer(
      withCapacity: cocoaBuffer.count).reallocated
    _sanityCheck(allocated, "failed to allocate native Dictionary buffer")
  }
  • _bridgeAnythingToObjectiveC 将任意值转换成 AnyObject
  • maybeGet 获取当前 key 值的 value
  • migrateDataToNativeBuffer ,和追加申请空间的时候操作相同,将 _CocoaDictionaryBuffer 转换成 _NativeDictionaryBuffer
  • 调用 nativeRemoveObject ,操作和 _NativeDictionaryBuffer 相同。

updateValue

internal mutating func updateValue(
    _ value: Value, forKey key: Key
  ) -> Value? {

    if _fastPath(guaranteedNative) {
      return nativeUpdateValue(value, forKey: key)
    }

    switch self {
    case .native:
      return nativeUpdateValue(value, forKey: key)
#if _runtime(_ObjC)
    case .cocoa(let cocoaBuffer):
      migrateDataToNativeBuffer(cocoaBuffer)
      return nativeUpdateValue(value, forKey: key)
#endif
    }
  }

_NativeDictionaryBuffer

  internal mutating func nativeUpdateValue(
    _ value: Value, forKey key: Key
  ) -> Value? {
    var (i, found) = asNative._find(key, startBucket: asNative._bucket(key))

    let minBuckets = found
      ? asNative.bucketCount
      : NativeBuffer.bucketCount(
          forCapacity: asNative.count + 1,
          maxLoadFactorInverse: _hashContainerDefaultMaxLoadFactorInverse)

    let (_, capacityChanged) = ensureUniqueNativeBuffer(
      withBucketCount: minBuckets)
    if capacityChanged {
      i = asNative._find(key, startBucket: asNative._bucket(key)).pos
    }

    let oldValue: Value? = found ? asNative.value(at: i.offset) : nil
    if found {
      asNative.setKey(key, value: value, at: i.offset)
    } else {
      asNative.initializeKey(key, value: value, at: i.offset)
      asNative.count += 1
    }

    return oldValue
  }
  • 如果当前 key 存在,调用setKey,通过偏移量改变 value
  • 如果不存在
    • 校验添加新的 key 之后是否超出原有的容量。
      • 如果有,则追加空间申请,重新计算当前的 i
  • 初始化偏移量为 i.offset 的内存块,并初始化 keyvalue

_CocoaDictionaryBuffer

  • _CocoaDictionaryBuffer 转换成 _NativeDictionaryBuffer,之后操作和 _NativeDictionaryBuffer 相同。

总结

  • 从操作内存的函数来看,Swift 在实现 keyvalue 的一一对应,采用的是内存偏移量的方式。
  • 字典中,哈希表解决冲突,采用了线性探测法。
  • 字典的完整结构为