本节简单介绍了PostgreSQL执行聚合函数所使用的相关数据结构,包括AggState,AggStatePerAgg,AggStatePerTrans,AggStatePerGroup,AggStatePerPhase和AggStatePerHash.

一、数据结构

AggState
聚合函数执行时状态结构体,内含AggStatePerAgg等结构体


/* ---------------------
 *    AggState information
 *
 *    ss.ss_ScanTupleSlot refers to output of underlying plan.
 *    (Here ss is the embedded ScanState and ps its PlanState.)
 *
 *    Note: ss.ps.ps_ExprContext contains ecxt_aggvalues and
 *    ecxt_aggnulls arrays, which hold the computed agg values for the current
 *    input group during evaluation of an Agg node's output tuple(s).  We
 *    create a second ExprContext, tmpcontext, in which to evaluate input
 *    expressions and run the aggregate transition functions.
 *
 *    Each FIELDNO_AGGSTATE_* constant defined inside the struct is the
 *    zero-based position of the member declared immediately after it
 *    (counting ss as member 0).  Adding, removing or reordering members
 *    ahead of one of them requires renumbering the constant.
 *    NOTE(review): presumably these are consumed by code that addresses
 *    members by index rather than by name -- confirm against the users.
 * ---------------------
 */
/*
 * These structs are private in nodeAgg.c; only opaque pointer typedefs are
 * declared here so that AggState can refer to them.
 */
typedef struct AggStatePerAggData *AggStatePerAgg;
typedef struct AggStatePerTransData *AggStatePerTrans;
typedef struct AggStatePerGroupData *AggStatePerGroup;
typedef struct AggStatePerPhaseData *AggStatePerPhase;
typedef struct AggStatePerHashData *AggStatePerHash;
typedef struct AggState
{
    ScanState    ss;                /* its first field is NodeTag */
    List       *aggs;            /* all Aggref nodes in targetlist & quals */
    int            numaggs;        /* length of list (could be zero!) */
    int            numtrans;        /* number of pertrans items */
    AggStrategy aggstrategy;    /* strategy mode */
    AggSplit    aggsplit;        /* agg-splitting mode, see nodes.h */
    AggStatePerPhase phase;        /* pointer to current phase data */
    int            numphases;        /* number of phases (including phase 0) */
    int            current_phase;    /* current phase number */
    AggStatePerAgg peragg;        /* per-Aggref information */
    AggStatePerTrans pertrans;    /* per-Trans state information */
    ExprContext *hashcontext;    /* econtexts for long-lived data (hashtable) */
    ExprContext **aggcontexts;    /* econtexts for long-lived data (per GS) */
    ExprContext *tmpcontext;    /* econtext for input expressions */
#define FIELDNO_AGGSTATE_CURAGGCONTEXT 14
    ExprContext *curaggcontext; /* currently active aggcontext */
    AggStatePerAgg curperagg;    /* currently active aggregate, if any */
#define FIELDNO_AGGSTATE_CURPERTRANS 16
    AggStatePerTrans curpertrans;    /* currently active trans state, if any */
    bool        input_done;        /* indicates end of input */
    bool        agg_done;        /* indicates completion of Agg scan */
    int            projected_set;    /* The last projected grouping set */
#define FIELDNO_AGGSTATE_CURRENT_SET 20
    int            current_set;    /* The current grouping set being evaluated */
    Bitmapset  *grouped_cols;    /* grouped cols in current projection */
    List       *all_grouped_cols;    /* list of all grouped cols in DESC order */

    /* These fields are for grouping set phase data */
    int            maxsets;        /* The max number of sets in any phase */
    AggStatePerPhase phases;    /* array of all phases */
    Tuplesortstate *sort_in;    /* sorted input to phases > 1 */
    Tuplesortstate *sort_out;    /* input is copied here for next phase */
    TupleTableSlot *sort_slot;    /* slot for sort results */

    /* these fields are used in AGG_PLAIN and AGG_SORTED modes: */
    AggStatePerGroup *pergroups;    /* grouping set indexed array of per-group
                                     * pointers */
    HeapTuple    grp_firstTuple; /* copy of first tuple of current group */

    /* these fields are used in AGG_HASHED and AGG_MIXED modes: */
    bool        table_filled;    /* hash table filled yet? */
    /*
     * NOTE(review): undocumented upstream.  Presumably the number of hash
     * tables, i.e. the length of the perhash array below, rather than a
     * bucket count -- confirm against nodeAgg.c.
     */
    int            num_hashes;
    AggStatePerHash perhash;    /* array of per-hashtable data */
    AggStatePerGroup *hash_pergroup;    /* grouping set indexed array of
                                         * per-group pointers */

    /* support for evaluation of agg input expressions: */
#define FIELDNO_AGGSTATE_ALL_PERGROUPS 34
    AggStatePerGroup *all_pergroups;    /* array of first ->pergroups, then
                                         * ->hash_pergroup */
    ProjectionInfo *combinedproj;    /* projection machinery */
} AggState;
/*
 * Primitive options supported by nodeAgg.c.
 *
 * Every option occupies its own bit, so an AggSplit value is simply the
 * bitwise OR of the options it enables.
 */
#define AGGSPLITOP_COMBINE        (1 << 0)    /* substitute combinefn for transfn */
#define AGGSPLITOP_SKIPFINAL    (1 << 1)    /* skip finalfn, return state as-is */
#define AGGSPLITOP_SERIALIZE    (1 << 2)    /* apply serializefn to output */
#define AGGSPLITOP_DESERIALIZE    (1 << 3)    /* apply deserializefn to input */

/* Supported operating modes (i.e., useful combinations of these options): */
typedef enum AggSplit
{
    /* Basic, non-split aggregation: no option bits set */
    AGGSPLIT_SIMPLE = 0,

    /* Initial phase of partial aggregation, with serialization: */
    AGGSPLIT_INITIAL_SERIAL = AGGSPLITOP_SERIALIZE | AGGSPLITOP_SKIPFINAL,

    /* Final phase of partial aggregation, with deserialization: */
    AGGSPLIT_FINAL_DESERIAL = AGGSPLITOP_DESERIALIZE | AGGSPLITOP_COMBINE
} AggSplit;

/*
 * Test whether an AggSplit value selects each primitive option.
 * Each macro evaluates its argument once and yields 1 if the option's bit
 * is set in it, 0 otherwise.
 */
#define DO_AGGSPLIT_COMBINE(as)        (0 != (AGGSPLITOP_COMBINE & (as)))
#define DO_AGGSPLIT_SKIPFINAL(as)    (0 != (AGGSPLITOP_SKIPFINAL & (as)))
#define DO_AGGSPLIT_SERIALIZE(as)    (0 != (AGGSPLITOP_SERIALIZE & (as)))
#define DO_AGGSPLIT_DESERIALIZE(as) (0 != (AGGSPLITOP_DESERIALIZE & (as)))

AggStatePerAggData
per-aggregate信息,这个结构体包含了调用最终函数的信息,用以从状态值中产生一个最终的聚合结果.如果查询中有多个相同的Aggrefs,共享相同的per-agg数据.


/*
 * AggStatePerAggData - per-aggregate information
 *
 * This contains the information needed to call the final function, to produce
 * a final aggregate result from the state value. If there are multiple
 * identical Aggrefs in the query, they can all share the same per-agg data.
 *
 * These values are set up during ExecInitAgg() and do not change thereafter.
 */
typedef struct AggStatePerAggData
{
    /*
     * Link to an Aggref expr this state value is for.
     *
     * There can be multiple identical Aggref's sharing the same per-agg. This
     * points to the first one of them.
     */
    Aggref       *aggref;
    /*
     * index to the state value which this agg should use
     * NOTE(review): presumably an index into the AggState pertrans array --
     * confirm against nodeAgg.c.
     */
    int            transno;
    /* Optional Oid of final function (may be InvalidOid) */
    Oid            finalfn_oid;
    /*
     * fmgr lookup data for final function --- only valid when finalfn_oid is
     * not InvalidOid.
     */
    FmgrInfo    finalfn;
    /*
     * Number of arguments to pass to the finalfn.  This is always at least 1
     * (the transition state value) plus any ordered-set direct args. If the
     * finalfn wants extra args then we pass nulls corresponding to the
     * aggregated input columns.
     */
    int            numFinalArgs;
    /* ExprStates for any direct-argument expressions */
    List       *aggdirectargs;
    /*
     * We need the len and byval info for the agg's result data type in order
     * to know how to copy/delete values.
     */
    int16        resulttypeLen;
    bool        resulttypeByVal;
    /*
     * "shareable" is false if this agg cannot share state values with other
     * aggregates because the final function is read-write.
     */
    bool        shareable;
}            AggStatePerAggData;

AggStatePerTransData
聚合状态值信息(per aggregate state value information), 通过输入行调用转换函数更新聚合状态值的工作状态.该结构体不会存储从转换状态而来的用于产生最终聚合结果的相关信息,这些信息会存储在AggStatePerAggData中.


/*
 * AggStatePerTransData - per aggregate state value information
 *
 * Working state for updating the aggregate's state value, by calling the
 * transition function with an input row. This struct does not store the
 * information needed to produce the final aggregate result from the transition
 * state, that's stored in AggStatePerAggData instead. This separation allows
 * multiple aggregate results to be produced from a single state value.
 */
typedef struct AggStatePerTransData
{
    /*
     * These values are set up during ExecInitAgg() and do not change
     * thereafter:
     */

    /*
     * Link to an Aggref expr this state value is for.
     *
     * There can be multiple Aggref's sharing the same state value, so long as
     * the inputs and transition functions are identical and the final
     * functions are not read-write.  This points to the first one of them.
     */
    Aggref       *aggref;
    /*
     * Is this state value actually being shared by more than one Aggref?
     */
    bool        aggshared;
    /*
     * Number of aggregated input columns.  This includes ORDER BY expressions
     * in both the plain-agg and ordered-set cases.  Ordered-set direct args
     * are not counted, though.
     */
    int            numInputs;
    /*
     * Number of aggregated input columns to pass to the transfn.  This
     * includes the ORDER BY columns for ordered-set aggs, but not for plain
     * aggs.  (This doesn't count the transition state value!)
     */
    int            numTransInputs;
    /* Oid of the state transition or combine function */
    Oid            transfn_oid;
    /* Oid of the serialization function or InvalidOid */
    Oid            serialfn_oid;
    /* Oid of the deserialization function or InvalidOid */
    Oid            deserialfn_oid;
    /* Oid of state value's datatype */
    Oid            aggtranstype;
    /*
     * fmgr lookup data for transition function or combine function.  Note in
     * particular that the fn_strict flag is kept here.
     */
    FmgrInfo    transfn;
    /* fmgr lookup data for serialization function */
    FmgrInfo    serialfn;
    /* fmgr lookup data for deserialization function */
    FmgrInfo    deserialfn;
    /* Input collation derived for aggregate */
    Oid            aggCollation;
    /* number of sorting columns */
    int            numSortCols;
    /* number of sorting columns to consider in DISTINCT comparisons */
    /* (this is either zero or the same as numSortCols) */
    int            numDistinctCols;
    /*
     * deconstructed sorting information: four parallel arrays, each of
     * length numSortCols
     */
    AttrNumber *sortColIdx;
    Oid           *sortOperators;
    Oid           *sortCollations;
    bool       *sortNullsFirst;
    /*
     * Comparators for input columns --- only set/used when aggregate has
     * DISTINCT flag. equalfnOne version is used for single-column
     * comparisons, equalfnMulti for the case of multiple columns.
     */
    FmgrInfo    equalfnOne;
    ExprState  *equalfnMulti;
    /*
     * initial value from pg_aggregate entry
     */
    Datum        initValue;
    bool        initValueIsNull;
    /*
     * We need the len and byval info for the agg's input and transition data
     * types in order to know how to copy/delete values.
     *
     * Note that the info for the input type is used only when handling
     * DISTINCT aggs with just one argument, so there is only one input type.
     */
    int16        inputtypeLen,
                transtypeLen;
    bool        inputtypeByVal,
                transtypeByVal;
    /*
     * Slots for holding the evaluated input arguments.  These are set up
     * during ExecInitAgg() and then used for each input row requiring either
     * FILTER or ORDER BY/DISTINCT processing.
     */
    TupleTableSlot *sortslot;    /* current input tuple */
    TupleTableSlot *uniqslot;    /* used for multi-column DISTINCT */
    TupleDesc    sortdesc;        /* descriptor of input tuples */
    /*
     * These values are working state that is initialized at the start of an
     * input tuple group and updated for each input tuple.
     *
     * For a simple (non DISTINCT/ORDER BY) aggregate, we just feed the input
     * values straight to the transition function.  If it's DISTINCT or
     * requires ORDER BY, we pass the input values into a Tuplesort object;
     * then at completion of the input tuple group, we scan the sorted values,
     * eliminate duplicates if needed, and run the transition function on the
     * rest.
     *
     * We need a separate tuplesort for each grouping set.
     */
    Tuplesortstate **sortstates;    /* sort objects, if DISTINCT or ORDER BY */
    /*
     * This field is a pre-initialized FunctionCallInfo struct used for
     * calling this aggregate's transfn.  We save a few cycles per row by not
     * re-initializing the unchanging fields; which isn't much, but it seems
     * worth the extra space consumption.
     */
    FunctionCallInfoData transfn_fcinfo;
    /* Likewise for serialization and deserialization functions */
    FunctionCallInfoData serialfn_fcinfo;
    FunctionCallInfoData deserialfn_fcinfo;
}            AggStatePerTransData;

AggStatePerGroupData
per-aggregate-per-group工作状态,这些工作状态值在每个输入tuple group开始时初始化,之后在处理该组的每个输入tuple时更新.


/*
 * AggStatePerGroupData - per-aggregate-per-group working state
 *
 * These values are working state that is initialized at the start of
 * an input tuple group and updated for each input tuple.
 *
 * In AGG_PLAIN and AGG_SORTED modes, we have a single array of these
 * structs (pointed to by aggstate->pergroup); we re-use the array for
 * each input group, if it's AGG_SORTED mode.  In AGG_HASHED mode, the
 * hash table contains an array of these structs for each tuple group.
 * NOTE(review): the AggState member declared in this file is named
 * "pergroups" (an array of AggStatePerGroup pointers), so the reference to
 * aggstate->pergroup above looks stale -- confirm against nodeAgg.c.
 *
 * Logically, the sortstate field belongs in this struct, but we do not
 * keep it here for space reasons: we don't support DISTINCT aggregates
 * in AGG_HASHED mode, so there's no reason to use up a pointer field
 * in every entry of the hashtable.
 *
 * Each FIELDNO_AGGSTATEPERGROUPDATA_* constant is the zero-based position
 * of the member declared immediately after it; keep them in sync with the
 * member order.
 */
typedef struct AggStatePerGroupData
{
#define FIELDNO_AGGSTATEPERGROUPDATA_TRANSVALUE 0
    Datum        transValue;        /* current transition value */
#define FIELDNO_AGGSTATEPERGROUPDATA_TRANSVALUEISNULL 1
    bool        transValueIsNull;
#define FIELDNO_AGGSTATEPERGROUPDATA_NOTRANSVALUE 2
    bool        noTransValue;    /* true if transValue not set yet */
    /*
     * Note: noTransValue initially has the same value as transValueIsNull,
     * and if true both are cleared to false at the same time.  They are not
     * the same though: if transfn later returns a NULL, we want to keep that
     * NULL and not auto-replace it with a later input value. Only the first
     * non-NULL input will be auto-substituted.
     */
}            AggStatePerGroupData;

AggStatePerPhaseData
per-grouping-set-phase状态.Grouping sets会被划分为多个“步骤”(phase),每一个步骤都可以在对输入的一次扫描中处理完成.


/*
 * AggStatePerPhaseData - per-grouping-set-phase state
 *
 * Grouping sets are divided into "phases", where a single phase can be
 * processed in one pass over the input. If there is more than one phase, then
 * at the end of input from the current phase, state is reset and another pass
 * taken over the data which has been re-sorted in the mean time.
 *
 * Accordingly, each phase specifies a list of grouping sets and group clause
 * information, plus each phase after the first also has a sort order.
 */
typedef struct AggStatePerPhaseData
{
    AggStrategy aggstrategy;    /* strategy for this phase */
    int            numsets;        /* number of grouping sets (or 0) */
    int           *gset_lengths;    /* lengths of grouping sets */
    Bitmapset **grouped_cols;    /* column groupings for rollup */
    ExprState **eqfunctions;    /* expression returning equality, indexed by
                                 * nr of cols to compare */
    Agg           *aggnode;        /* Agg node for phase data */
    Sort       *sortnode;        /* Sort node for input ordering for phase */
    ExprState  *evaltrans;        /* evaluation of transition functions  */
}            AggStatePerPhaseData;

AggStatePerHashData
per-hashtable状态.使用哈希方式处理grouping sets时,每一个grouping set都会有一个这样的结构体(不带grouping sets的哈希聚合则只有一个).


/*
 * AggStatePerHashData - per-hashtable state
 *
 * When doing grouping sets with hashing, we have one of these for each
 * grouping set. (When doing hashing without grouping sets, we have just one of
 * them.)
 */
typedef struct AggStatePerHashData
{
    TupleHashTable hashtable;    /* hash table with one entry per group */
    TupleHashIterator hashiter; /* for iterating through hash table */
    TupleTableSlot *hashslot;    /* slot for loading hash table */
    FmgrInfo   *hashfunctions;    /* per-grouping-field hash fns */
    Oid           *eqfuncoids;        /* per-grouping-field equality fns */
    int            numCols;        /* number of hash key columns */
    int            numhashGrpCols; /* number of columns in hash table */
    int            largestGrpColIdx;    /* largest col required for hashing */
    AttrNumber *hashGrpColIdxInput; /* hash col indices in input slot */
    AttrNumber *hashGrpColIdxHash;    /* indices in hashtbl tuples */
    Agg           *aggnode;        /* original Agg node, for numGroups etc. */
}            AggStatePerHashData;

二、参考资料

N/A

PostgreSQL 源码解读（178）- 查询#95(聚合函数)#1相关数据结构

一、数据结构

二、参考资料