狀態機
可以放大觀看。
HashJoinState
Hash Join運行期狀態結構體
typedef struct HashJoinState
{JoinState js; /* 基類;its first field is NodeTag */ExprState *hashclauses;//hash連接條件List *hj_OuterHashKeys; /* 外表條件鏈表;list of ExprState nodes */List *hj_InnerHashKeys; /* 內表連接條件;list of ExprState nodes */List *hj_HashOperators; /* 操作符OIDs鏈表;list of operator OIDs */HashJoinTable hj_HashTable;//Hash表uint32 hj_CurHashValue;//當前的Hash值int hj_CurBucketNo;//當前的bucket編號int hj_CurSkewBucketNo;//行傾斜bucket編號HashJoinTuple hj_CurTuple;//當前元組TupleTableSlot *hj_OuterTupleSlot;//outer relation slotTupleTableSlot *hj_HashTupleSlot;//Hash tuple slotTupleTableSlot *hj_NullOuterTupleSlot;//用于外連接的outer虛擬slotTupleTableSlot *hj_NullInnerTupleSlot;//用于外連接的inner虛擬slotTupleTableSlot *hj_FirstOuterTupleSlot;//int hj_JoinState;//JoinState狀態bool hj_MatchedOuter;//是否匹配bool hj_OuterNotEmpty;//outer relation是否為空
} HashJoinState;
HashJoinTable
typedef struct HashJoinTableData
{int nbuckets; /* 內存中的hash桶數;# buckets in the in-memory hash table */int log2_nbuckets; /* 2的對數(nbuckets必須是2的冪);its log2 (nbuckets must be a power of 2) */int nbuckets_original; /* 首次hash時的桶數;# buckets when starting the first hash */int nbuckets_optimal; /* 優化后的桶數(每個批次);optimal # buckets (per batch) */int log2_nbuckets_optimal; /* 2的對數;log2(nbuckets_optimal) *//* buckets[i] is head of list of tuples in i'th in-memory bucket *///bucket [i]是內存中第i個桶中的元組鏈表的head itemunion{/* unshared array is per-batch storage, as are all the tuples *///未共享數組是按批處理存儲的,所有元組均如此struct HashJoinTupleData **unshared;/* shared array is per-query DSA area, as are all the tuples *///共享數組是每個查詢的DSA區域,所有元組均如此dsa_pointer_atomic *shared;}buckets;bool keepNulls; /*如不匹配則存儲NULL元組,該值為T;true to store unmatchable NULL tuples *///關于skew優化的變量bool skewEnabled; /*是否使用傾斜優化?;are we using skew optimization? */HashSkewBucket **skewBucket; /* 傾斜的hash表桶數;hashtable of skew buckets */int skewBucketLen; /* skewBucket數組大小;size of skewBucket array (a power of 2!) */int nSkewBuckets; /* 活動的傾斜桶數;number of active skew buckets */int *skewBucketNums; /* 活動傾斜桶數組索引;array indexes of active skew buckets */int nbatch; /* 批次數;number of batches */int curbatch; /* 當前批次,第一輪為0;current batch #; 0 during 1st pass */int nbatch_original; /* 在開始inner掃描時的批次;nbatch when we started inner scan */int nbatch_outstart; /* 在開始outer掃描時的批次;nbatch when we started outer scan */bool growEnabled; /* 關閉nbatch增加的標記;flag to shut off nbatch increases */double totalTuples; /* 從inner plan獲得的元組數;# tuples obtained from inner plan */double partialTuples; /* 通過hashjoin獲得的inner元組數;# tuples obtained from inner plan by me */double skewTuples; /* 傾斜元組數;# tuples inserted into skew tuples *//** 這些數組在散列連接的生命周期內分配,但僅當nbatch > 1時分配。* 只有當第一次將元組寫入文件時,文件才會打開(否則它的指針將保持NULL)。* 注意,第0個數組元素永遠不會被使用,因為批次0的元組永遠不會轉儲.*/BufFile **innerBatchFile; /* 每個批次的inner虛擬臨時文件緩存;buffered virtual temp file per batch */BufFile **outerBatchFile; /* 每個批次的outer虛擬臨時文件緩存;buffered virtual temp file per batch *//** 有關正在散列的數據類型的特定于數據類型的散列函數的信息。* 這些數組的長度與散列連接子句(散列鍵)的數量相同。*/FmgrInfo *outer_hashfunctions; /* outer hash函數FmgrInfo結構體;lookup data for hash functions */FmgrInfo *inner_hashfunctions; /* inner hash函數FmgrInfo結構體;lookup data for hash functions */bool *hashStrict; /* 每個hash操作符是嚴格?is each hash join operator strict? */Size spaceUsed; /* 元組使用的當前內存空間大小;memory space currently used by tuples */Size spaceAllowed; /* 空間使用上限;upper limit for space used */Size spacePeak; /* 峰值的空間使用;peak space used */Size spaceUsedSkew; /* 傾斜哈希表的當前空間使用情況;skew hash table's current space usage */Size spaceAllowedSkew; /* 傾斜哈希表的使用上限;upper limit for skew hashtable */MemoryContext hashCxt; /* 整個散列連接存儲的上下文;context for whole-hash-join storage */MemoryContext batchCxt; /* 該批次存儲的上下文;context for this-batch-only storage *//* used for dense allocation of tuples (into linked chunks) *///用于密集分配元組(到鏈接塊中)HashMemoryChunk chunks; /* 整個批次使用一個鏈表;one list for the whole batch *//* Shared and private state for Parallel Hash. *///并行hash使用的共享和私有狀態HashMemoryChunk current_chunk; /* 后臺進程的當前chunk;this backend's current chunk */dsa_area *area; /* 用于分配內存的DSA區域;DSA area to allocate memory from */ParallelHashJoinState *parallel_state;//并行執行狀態ParallelHashJoinBatchAccessor *batches;//并行訪問器dsa_pointer current_chunk_shared;//當前chunk的開始指針
} HashJoinTableData;
其他
在inner join第一次掃描中,可以從執行器得到tuple。
如果batch數目 > 1,那么不屬于第一批的tuple將被保存在batch的inner的臨時文件中。
在outer join中同理,不過我們將tuple寫入batch的outer臨時文件中
完成第一次掃描后,堆每批剩余的tuple做如下操作:
1、從inner 批處理文件中讀取元組,加載到hash table中的bucket
2、從outer 批處理文件中讀取元組,匹配hash bucket,然后輸出結果。
參考
postgresql-13源碼